From 1fa1606f0d5c43193c794b8618af95165a28baee Mon Sep 17 00:00:00 2001 From: Maks Snegov Date: Sun, 19 Aug 2012 02:46:46 +0400 Subject: [PATCH] Initial commit. --- nevernote.sh | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100755 nevernote.sh diff --git a/nevernote.sh b/nevernote.sh new file mode 100755 index 0000000..d324ff5 --- /dev/null +++ b/nevernote.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +if [ -z "$1" ]; then + exit +fi + +URL=$1 +TMP_DIR=`mktemp -d /tmp/nevernote.XXXXXX` +NEVERNOTE_DIR="/mnt/tabula/nevernote/read" + +## Take page title +echo +echo `date '+%H:%M:%S'` +echo $URL + +## Check if URL is forbidden to download +grep -x "$URL" /tmp/nevernote-list-excluded > /dev/null 2>&1 +if [ "$?" -eq 0 ]; then + echo "exclude" + echo $URL >> /tmp/nevernote-error-excluded + rm -r $TMP_DIR + exit +fi + +## Check if it is downloading now +ps ax | grep "./scripts/nevernote.sh" | awk '{print($7)}' | grep -x "$URL" +if [ "$?" -eq 0 ]; then + echo "downloading now" + rm -r $TMP_DIR + exit +fi + +## Check downloaded urls for duplicates +#head -qn 1 ${NEVERNOTE_DIR}/*/wget.log | awk '{print($3)}' | grep -x "$URL" > /dev/null 2>&1 +grep -x "$URL" /tmp/nevernote-list-downloaded > /dev/null 2>&1 +if [ "$?" -eq 0 ]; then + echo "dublicate" + echo $URL >> /tmp/nevernote-error-dups + rm -r $TMP_DIR + exit +fi + +wget -T 15 -t 5 --user-agent="" -P $TMP_DIR "$URL" > /dev/null 2>&1 +INDEX_PAGE=`ls $TMP_DIR` +if [ "$INDEX_PAGE" = '' ]; then + echo "download error" + echo $URL >> /tmp/nevernote-error-404 + rm -r $TMP_DIR + exit +fi + +## Convert page to system's charset +enconv "$TMP_DIR/$INDEX_PAGE" > /dev/null 2>&1 + +## Remove RC and LF symbols +#tr -d '\n' < "$TMP_DIR/$INDEX_PAGE" | tr -d '\r' > "$TMP_DIR/${INDEX_PAGE}.plain" +#mv "$TMP_DIR/${INDEX_PAGE}.plain" "$TMP_DIR/$INDEX_PAGE" + +## Extract title and leave non-destruct chars +PAGE_DIR=$(sed -n -e 's/.*\(.*\)<\/title>.*/\1/p' "$TMP_DIR/$INDEX_PAGE" | sed 's+[\+\{\;\"\\\=\?~\(\)\<\>\&\*\|\$\/\#:]+_+g') +PAGE_DIR=$(echo $PAGE_DIR | sed 's+\.*$++g') + +## Remove first and last whitespaces +PAGE_DIR=$(echo $PAGE_DIR | sed 's+^ *++g' | sed 's+ *$++g') + +## Trunc too long titles +if [ "${#PAGE_DIR}" -gt 100 ]; then + PAGE_DIR=${PAGE_DIR:0:100} +fi + +## If title wasn't parsed, leave random name +if [ "$PAGE_DIR" = '' ]; then + PAGE_DIR=`basename $TMP_DIR` +fi +rm "$TMP_DIR/$INDEX_PAGE" + +## Check local storage folder +## If duplicate - rename (add "_dup.X" to the end) +while true; do + ls "$NEVERNOTE_DIR/$PAGE_DIR" > /dev/null 2>&1 + if [ "$?" -eq 0 ]; then + DUP=${PAGE_DIR#*_dup.} + if [ "$DUP" = "$PAGE_DIR" ]; then + DUP=1 + else + let "DUP += 1" + fi + PAGE_DIR=${PAGE_DIR%_dup.*}"_dup."$DUP + else + break + fi +done + +## Download full page +wget -E -H -k -K -p -e robots=off --user-agent="" -T 15 -t 5 -o $TMP_DIR/wget.log -P $TMP_DIR "$URL" +WGET_EXIT_CODE=$? +if [ "$WGET_EXIT_CODE" -ne 0 ]; then + echo $WGET_EXIT_CODE"|"$URL >> /tmp/nevernote-error-wget +fi + +## Make link for index.html +#pushd $TMP_DIR > /dev/null 2>&1 +#INDEX_PATH=`find ./ -name "${INDEX_PAGE}.orig"` +#ln -s "${INDEX_PATH%.orig}" "$INDEX_PAGE" > /dev/null 2>&1 +#if [ "$?" -ne 0 ]; then +# echo Cant\'t link $URL +# echo $URL >> /tmp/nevernote-errors +# rm -r $TMP_DIR +# exit +#fi +#popd > /dev/null 2>&1 + +## Save page url +echo $URL > $TMP_DIR/URL + +mv $TMP_DIR "$NEVERNOTE_DIR/$PAGE_DIR" +echo "saved in $NEVERNOTE_DIR/$PAGE_DIR" +echo $URL >> /tmp/nevernote-list-downloaded