Initial commit.
This commit is contained in:
commit
1fa1606f0d
118
nevernote.sh
Executable file
118
nevernote.sh
Executable file
@ -0,0 +1,118 @@
|
||||
#!/bin/bash
|
||||
|
||||
if [ -z "$1" ]; then
|
||||
exit
|
||||
fi
|
||||
|
||||
URL=$1
|
||||
TMP_DIR=`mktemp -d /tmp/nevernote.XXXXXX`
|
||||
NEVERNOTE_DIR="/mnt/tabula/nevernote/read"
|
||||
|
||||
## Take page title
|
||||
echo
|
||||
echo `date '+%H:%M:%S'`
|
||||
echo $URL
|
||||
|
||||
## Check if URL is forbidden to download
|
||||
grep -x "$URL" /tmp/nevernote-list-excluded > /dev/null 2>&1
|
||||
if [ "$?" -eq 0 ]; then
|
||||
echo "exclude"
|
||||
echo $URL >> /tmp/nevernote-error-excluded
|
||||
rm -r $TMP_DIR
|
||||
exit
|
||||
fi
|
||||
|
||||
## Check if it is downloading now
|
||||
ps ax | grep "./scripts/nevernote.sh" | awk '{print($7)}' | grep -x "$URL"
|
||||
if [ "$?" -eq 0 ]; then
|
||||
echo "downloading now"
|
||||
rm -r $TMP_DIR
|
||||
exit
|
||||
fi
|
||||
|
||||
## Check downloaded urls for duplicates
|
||||
#head -qn 1 ${NEVERNOTE_DIR}/*/wget.log | awk '{print($3)}' | grep -x "$URL" > /dev/null 2>&1
|
||||
grep -x "$URL" /tmp/nevernote-list-downloaded > /dev/null 2>&1
|
||||
if [ "$?" -eq 0 ]; then
|
||||
echo "dublicate"
|
||||
echo $URL >> /tmp/nevernote-error-dups
|
||||
rm -r $TMP_DIR
|
||||
exit
|
||||
fi
|
||||
|
||||
wget -T 15 -t 5 --user-agent="" -P $TMP_DIR "$URL" > /dev/null 2>&1
|
||||
INDEX_PAGE=`ls $TMP_DIR`
|
||||
if [ "$INDEX_PAGE" = '' ]; then
|
||||
echo "download error"
|
||||
echo $URL >> /tmp/nevernote-error-404
|
||||
rm -r $TMP_DIR
|
||||
exit
|
||||
fi
|
||||
|
||||
## Convert page to system's charset
|
||||
enconv "$TMP_DIR/$INDEX_PAGE" > /dev/null 2>&1
|
||||
|
||||
## Remove RC and LF symbols
|
||||
#tr -d '\n' < "$TMP_DIR/$INDEX_PAGE" | tr -d '\r' > "$TMP_DIR/${INDEX_PAGE}.plain"
|
||||
#mv "$TMP_DIR/${INDEX_PAGE}.plain" "$TMP_DIR/$INDEX_PAGE"
|
||||
|
||||
## Extract title and leave non-destruct chars
|
||||
PAGE_DIR=$(sed -n -e 's/.*<title>\(.*\)<\/title>.*/\1/p' "$TMP_DIR/$INDEX_PAGE" | sed 's+[\+\{\;\"\\\=\?~\(\)\<\>\&\*\|\$\/\#:]+_+g')
|
||||
PAGE_DIR=$(echo $PAGE_DIR | sed 's+\.*$++g')
|
||||
|
||||
## Remove first and last whitespaces
|
||||
PAGE_DIR=$(echo $PAGE_DIR | sed 's+^ *++g' | sed 's+ *$++g')
|
||||
|
||||
## Trunc too long titles
|
||||
if [ "${#PAGE_DIR}" -gt 100 ]; then
|
||||
PAGE_DIR=${PAGE_DIR:0:100}
|
||||
fi
|
||||
|
||||
## If title wasn't parsed, leave random name
|
||||
if [ "$PAGE_DIR" = '' ]; then
|
||||
PAGE_DIR=`basename $TMP_DIR`
|
||||
fi
|
||||
rm "$TMP_DIR/$INDEX_PAGE"
|
||||
|
||||
## Check local storage folder
|
||||
## If duplicate - rename (add "_dup.X" to the end)
|
||||
while true; do
|
||||
ls "$NEVERNOTE_DIR/$PAGE_DIR" > /dev/null 2>&1
|
||||
if [ "$?" -eq 0 ]; then
|
||||
DUP=${PAGE_DIR#*_dup.}
|
||||
if [ "$DUP" = "$PAGE_DIR" ]; then
|
||||
DUP=1
|
||||
else
|
||||
let "DUP += 1"
|
||||
fi
|
||||
PAGE_DIR=${PAGE_DIR%_dup.*}"_dup."$DUP
|
||||
else
|
||||
break
|
||||
fi
|
||||
done
|
||||
|
||||
## Download full page
|
||||
wget -E -H -k -K -p -e robots=off --user-agent="" -T 15 -t 5 -o $TMP_DIR/wget.log -P $TMP_DIR "$URL"
|
||||
WGET_EXIT_CODE=$?
|
||||
if [ "$WGET_EXIT_CODE" -ne 0 ]; then
|
||||
echo $WGET_EXIT_CODE"|"$URL >> /tmp/nevernote-error-wget
|
||||
fi
|
||||
|
||||
## Make link for index.html
|
||||
#pushd $TMP_DIR > /dev/null 2>&1
|
||||
#INDEX_PATH=`find ./ -name "${INDEX_PAGE}.orig"`
|
||||
#ln -s "${INDEX_PATH%.orig}" "$INDEX_PAGE" > /dev/null 2>&1
|
||||
#if [ "$?" -ne 0 ]; then
|
||||
# echo Cant\'t link $URL
|
||||
# echo $URL >> /tmp/nevernote-errors
|
||||
# rm -r $TMP_DIR
|
||||
# exit
|
||||
#fi
|
||||
#popd > /dev/null 2>&1
|
||||
|
||||
## Save page url
|
||||
echo $URL > $TMP_DIR/URL
|
||||
|
||||
mv $TMP_DIR "$NEVERNOTE_DIR/$PAGE_DIR"
|
||||
echo "saved in $NEVERNOTE_DIR/$PAGE_DIR"
|
||||
echo $URL >> /tmp/nevernote-list-downloaded
|
||||
Loading…
Reference in New Issue
Block a user