nevernote/nevernote.sh

119 lines
2.9 KiB
Bash
Raw Normal View History

2012-08-18 22:46:46 +00:00
#!/bin/bash
## Usage: nevernote.sh URL
##
## URL is fetched into a scratch directory under /tmp; the rest of the
## script decides whether to keep it and where to store it.
if [ -z "$1" ]; then
  ## Nothing to do without a URL. The exit status stays 0, as before, so
  ## existing callers are unaffected; only the hint on stderr is new.
  echo "usage: ${0##*/} URL" >&2
  exit 0
fi
URL=$1
## Private scratch directory for this run. Stop right away if it cannot be
## created: with an empty TMP_DIR the later wget/ls/rm/mv calls would act on
## the wrong paths.
TMP_DIR=$(mktemp -d /tmp/nevernote.XXXXXX) || {
  echo "cannot create temporary directory" >&2
  exit 1
}
## Directory that receives one sub-directory per saved page.
NEVERNOTE_DIR="/mnt/tabula/nevernote/read"
## Log header for this run: blank separator line, current time, requested URL
echo
date '+%H:%M:%S'
## Quoted so the URL is printed verbatim ('?', '*' and '[' are glob
## characters in an unquoted expansion).
printf '%s\n' "$URL"
## Check if URL is forbidden to download
##
## is_excluded URL [LIST_FILE]
##   Succeeds when LIST_FILE (default: /tmp/nevernote-list-excluded) contains
##   a line exactly equal to URL. -F compares literally: URLs are full of
##   regex metacharacters ('.', '?', '[', '*') that would otherwise cause
##   false matches or misses. "--" protects a URL that starts with '-'.
##   A missing or unreadable list counts as "not excluded".
is_excluded() {
  grep -Fxq -- "$1" "${2:-/tmp/nevernote-list-excluded}" 2> /dev/null
}
if is_excluded "$URL"; then
  echo "exclude"
  printf '%s\n' "$URL" >> /tmp/nevernote-error-excluded
  rm -r -- "$TMP_DIR"
  exit
fi
## Check if it is downloading now
##
## other_instance_has_url SELF_PID URL
##   Reads process lines of the form "PID PPID COMMAND..." on stdin and
##   succeeds when a process that is neither SELF_PID nor one of its direct
##   children has "/scripts/nevernote.sh" in its command line and URL as the
##   third word of that command line (i.e. "<interpreter> <script> URL").
##   Excluding SELF_PID and its children matters: this very script, and the
##   sub-shells it forks, carry the same URL argument and would otherwise
##   always be reported as "another" download.
other_instance_has_url() {
  NN_SELF_PID="$1" NN_URL="$2" awk '
    $1 == ENVIRON["NN_SELF_PID"] || $2 == ENVIRON["NN_SELF_PID"] { next }
    # The leading "." is a wildcard on purpose: it matches both
    # "./scripts/nevernote.sh" and "<dir>/scripts/nevernote.sh".
    /.\/scripts\/nevernote\.sh/ && $5 == ENVIRON["NN_URL"] { found = 1; exit }
    END { exit !found }
  '
}
## Take the snapshot first and filter it afterwards, so the filter's own
## helper processes cannot show up in the listing.
ps_snapshot=$(ps -A -o pid= -o ppid= -o args=)
if other_instance_has_url "$$" "$URL" <<< "$ps_snapshot"; then
  echo "downloading now"
  rm -r -- "$TMP_DIR"
  exit
fi
## Check downloaded urls for duplicates
#head -qn 1 ${NEVERNOTE_DIR}/*/wget.log | awk '{print($3)}' | grep -x "$URL" > /dev/null 2>&1
##
## already_downloaded URL [LIST_FILE]
##   Succeeds when LIST_FILE (default: /tmp/nevernote-list-downloaded)
##   contains a line exactly equal to URL. The comparison is literal (-F):
##   as a regex, the '.', '?', '[' and '*' found in URLs would match the
##   wrong lines or fail to match the right one. A missing list counts as
##   "not downloaded yet".
already_downloaded() {
  grep -Fxq -- "$1" "${2:-/tmp/nevernote-list-downloaded}" 2> /dev/null
}
if already_downloaded "$URL"; then
  echo "dublicate"
  printf '%s\n' "$URL" >> /tmp/nevernote-error-dups
  rm -r -- "$TMP_DIR"
  exit
fi
## Fetch only the page itself into the scratch directory; its <title> is
## read from this copy further down. "--" keeps a URL that starts with '-'
## from being parsed as a wget option.
wget -T 15 -t 5 --user-agent="" -P "$TMP_DIR" -- "$URL" > /dev/null 2>&1
## wget's exit status is not consulted: the fetch counts as successful when
## a file showed up in the scratch directory. Its name is taken with a glob
## instead of parsing `ls` output.
INDEX_PAGE=
for fetched in "$TMP_DIR"/*; do
  ## An unmatched glob stays literal, hence the existence test.
  if [ -e "$fetched" ]; then
    INDEX_PAGE=${fetched##*/}
  fi
  break
done
if [ -z "$INDEX_PAGE" ]; then
  echo "download error"
  printf '%s\n' "$URL" >> /tmp/nevernote-error-404
  rm -r -- "$TMP_DIR"
  exit
fi
## Convert page to system's charset (best effort, output and errors ignored)
enconv "$TMP_DIR/$INDEX_PAGE" > /dev/null 2>&1
## Remove CR and LF symbols (disabled)
#tr -d '\n' < "$TMP_DIR/$INDEX_PAGE" | tr -d '\r' > "$TMP_DIR/${INDEX_PAGE}.plain"
#mv "$TMP_DIR/${INDEX_PAGE}.plain" "$TMP_DIR/$INDEX_PAGE"

## title_to_dirname RAW_TITLE
##   Prints a directory name derived from RAW_TITLE:
##     1. characters that are awkward in file names become "_";
##     2. every run of blanks, tabs and newlines becomes one space and
##        leading/trailing whitespace is dropped;
##     3. the result is cut to at most 100 characters;
##     4. trailing dots and spaces are removed.
##   Prints an empty line when nothing usable is left.
title_to_dirname() {
  local name
  local -a words=()
  local IFS=$' \t\n'
  name=$(printf '%s\n' "$1" \
    | sed 's+[\+\{\;\"\\\=\?~\(\)\<\>\&\*\|\$\/\#:]+_+g')
  ## read -a splits on whitespace like an unquoted expansion would, but
  ## without pathname expansion, so a title such as "[x]" is never replaced
  ## by a matching file name. Its non-zero status at end of input is expected.
  read -r -d '' -a words <<< "$name"
  name="${words[*]}"
  ## Trunc too long titles
  if [ "${#name}" -gt 100 ]; then
    name=${name:0:100}
  fi
  ## Done after truncation so the cut itself cannot leave a trailing space
  ## or dot behind. The inner expansion yields the trailing run of dots and
  ## spaces, the outer one removes it.
  name=${name%"${name##*[!. ]}"}
  printf '%s\n' "$name"
}

## Extract the title (text between <title> and </title> on one line) and
## turn it into the name of the destination directory
PAGE_DIR=$(title_to_dirname "$(sed -n -e 's/.*<title>\(.*\)<\/title>.*/\1/p' "$TMP_DIR/$INDEX_PAGE")")
## If title wasn't parsed, fall back to the random name of the scratch directory
if [ -z "$PAGE_DIR" ]; then
  PAGE_DIR=${TMP_DIR##*/}
fi
## This copy was only needed for its title
rm "$TMP_DIR/$INDEX_PAGE"
## Check local storage folder
## If duplicate - rename (add "_dup.X" to the end)
##
## next_dup_name NAME
##   Prints the next candidate after NAME: "foo" -> "foo_dup.1",
##   "foo_dup.1" -> "foo_dup.2". Only a purely numeric "_dup.<digits>"
##   ending is treated as a counter; any other text after "_dup." is part
##   of the title and simply gets "_dup.1" appended. The counter is read as
##   base 10, so "08" or "09" are not rejected as invalid octal numbers.
##   Every call therefore returns a name different from NAME, which is what
##   guarantees that the loop below makes progress.
next_dup_name() {
  local name=$1
  local counter_re='^(.*)_dup\.([0-9]+)$'
  if [[ $name =~ $counter_re ]]; then
    printf '%s_dup.%d\n' "${BASH_REMATCH[1]}" "$((10#${BASH_REMATCH[2]} + 1))"
  else
    printf '%s_dup.1\n' "$name"
  fi
}
## -L also catches a dangling symlink, which -e alone reports as missing.
while [ -e "$NEVERNOTE_DIR/$PAGE_DIR" ] || [ -L "$NEVERNOTE_DIR/$PAGE_DIR" ]; do
  PAGE_DIR=$(next_dup_name "$PAGE_DIR")
done
## Download full page into the scratch directory.
## wget options: -p also fetches what the page needs to render, -H allows
## other hosts for that, -k rewrites links for local viewing, -K keeps the
## unconverted originals as *.orig, -E adjusts file extensions,
## -e robots=off ignores robots.txt. The transfer log goes to wget.log in
## the scratch directory. "--" keeps a URL starting with '-' from being
## parsed as an option.
wget -E -H -k -K -p -e robots=off --user-agent="" -T 15 -t 5 \
  -o "$TMP_DIR/wget.log" -P "$TMP_DIR" -- "$URL"
WGET_EXIT_CODE=$?
## A non-zero status is only recorded ("<status>|<url>"); the script carries
## on with whatever was downloaded.
if [ "$WGET_EXIT_CODE" -ne 0 ]; then
  printf '%s|%s\n' "$WGET_EXIT_CODE" "$URL" >> /tmp/nevernote-error-wget
fi
## Make link for index.html
#pushd $TMP_DIR > /dev/null 2>&1
#INDEX_PATH=`find ./ -name "${INDEX_PAGE}.orig"`
#ln -s "${INDEX_PATH%.orig}" "$INDEX_PAGE" > /dev/null 2>&1
#if [ "$?" -ne 0 ]; then
# echo Cant\'t link $URL
# echo $URL >> /tmp/nevernote-errors
# rm -r $TMP_DIR
# exit
#fi
#popd > /dev/null 2>&1
## Save page url next to the downloaded files
printf '%s\n' "$URL" > "$TMP_DIR/URL"
## Move the scratch directory to its final place. Only a successful move
## may be reported and recorded: a URL in the downloaded list is skipped as
## a duplicate on every later run, so recording a failed save would lose
## the page for good. On failure the scratch directory is left where it is.
if ! mv -- "$TMP_DIR" "$NEVERNOTE_DIR/$PAGE_DIR"; then
  echo "cannot move $TMP_DIR to $NEVERNOTE_DIR/$PAGE_DIR" >&2
  exit 1
fi
echo "saved in $NEVERNOTE_DIR/$PAGE_DIR"
printf '%s\n' "$URL" >> /tmp/nevernote-list-downloaded