From 1fa1606f0d5c43193c794b8618af95165a28baee Mon Sep 17 00:00:00 2001
From: Maks Snegov <snegovik@gmail.com>
Date: Sun, 19 Aug 2012 02:46:46 +0400
Subject: [PATCH] Initial commit.

---
 nevernote.sh | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 118 insertions(+)
 create mode 100755 nevernote.sh
diff --git a/nevernote.sh b/nevernote.sh
new file mode 100755
index 0000000..d324ff5
--- /dev/null
+++ b/nevernote.sh
@@ -0,0 +1,118 @@
+#!/bin/bash
+## Usage: nevernote.sh URL
+## Saves the page at URL into its own directory under NEVERNOTE_DIR.
+[ -n "$1" ] || exit 0	## no URL given: nothing to do, exit silently with status 0
+URL=$1
+NEVERNOTE_DIR="/mnt/tabula/nevernote/read"
+## Scratch dir for the download; stop here if it cannot be created, because
+## every later step (ls, rm -r, mv) would otherwise run with an empty path.
+TMP_DIR=$(mktemp -d /tmp/nevernote.XXXXXX) || { echo "mktemp failed" >&2; exit 1; }
+
+## Log header: blank separator line, current time, then the requested URL
+echo
+date '+%H:%M:%S'
+printf '%s\n' "$URL"
+
+## Skip URLs listed in the exclusion file. -F/-x compare the URL as a literal
+## whole line (URLs are full of regex metacharacters such as '.' and '[').
+if grep -Fxq -- "$URL" /tmp/nevernote-list-excluded 2> /dev/null; then
+	echo "exclude"
+	printf '%s\n' "$URL" >> /tmp/nevernote-error-excluded
+	rm -r -- "$TMP_DIR"
+	exit
+fi
+
+## Skip when another instance is already fetching this URL ($5 = third word of its
+## command line). This process and its children are excluded so it cannot match itself.
+if ps ax -o pid,ppid,args | URL="$URL" awk -v me="$$" '$1 != me && $2 != me &&
+	/.\/scripts\/nevernote.sh/ && $5 == ENVIRON["URL"] { print $5; hit = 1 } END { exit !hit }'; then
+	echo "downloading now"
+	rm -r -- "$TMP_DIR"; exit
+fi
+
+## Skip URLs already recorded in the downloaded list. -F/-x compare the URL as
+## a literal whole line, so regex metacharacters in it cannot mis-match.
+#head -qn 1 ${NEVERNOTE_DIR}/*/wget.log | awk '{print($3)}' | grep -x "$URL" > /dev/null 2>&1
+if grep -Fxq -- "$URL" /tmp/nevernote-list-downloaded 2> /dev/null; then
+	echo "dublicate"
+	printf '%s\n' "$URL" >> /tmp/nevernote-error-dups
+	rm -r -- "$TMP_DIR"
+	exit
+fi
+
+## Probe fetch: grab just the page itself so its <title> can be read later.
+wget -T 15 -t 5 --user-agent="" -P "$TMP_DIR" "$URL" > /dev/null 2>&1
+INDEX_PAGE=$(ls "$TMP_DIR")  ## name of whatever wget saved; empty if nothing was saved
+if [ -z "$INDEX_PAGE" ]; then
+	echo "download error"
+	printf '%s\n' "$URL" >> /tmp/nevernote-error-404
+	rm -r -- "$TMP_DIR"; exit
+fi
+
+## Convert page to system's charset; enconv's output and any failure are ignored
+enconv "$TMP_DIR/$INDEX_PAGE" > /dev/null 2>&1
+
+## Remove CR and LF symbols (disabled)
+#tr -d '\n' < "$TMP_DIR/$INDEX_PAGE" | tr -d '\r' > "$TMP_DIR/${INDEX_PAGE}.plain"
+#mv "$TMP_DIR/${INDEX_PAGE}.plain" "$TMP_DIR/$INDEX_PAGE"
+
+## Extract the <title> text (only matched when both tags sit on one line) and replace every character from the listed set (quotes, slashes, shell metacharacters) with "_"
+PAGE_DIR=$(sed -n -e 's/.*<title>\(.*\)<\/title>.*/\1/p' "$TMP_DIR/$INDEX_PAGE" | sed 's+[\+\{\;\"\\\=\?~\(\)\<\>\&\*\|\$\/\#:]+_+g')
+PAGE_DIR=$(echo $PAGE_DIR | sed 's+\.*$++g')  ## strip trailing dots (the unquoted echo also collapses runs of whitespace)
+
+## Remove leading and trailing spaces
+PAGE_DIR=$(echo $PAGE_DIR | sed 's+^ *++g' | sed 's+ *$++g')
+
+## Truncate titles longer than 100 characters
+if [ "${#PAGE_DIR}" -gt 100 ]; then
+	PAGE_DIR=${PAGE_DIR:0:100}
+fi
+
+## If no title was parsed, fall back to the random temp-dir name
+if [ "$PAGE_DIR" = '' ]; then
+	PAGE_DIR=`basename $TMP_DIR`
+fi
+rm "$TMP_DIR/$INDEX_PAGE"  ## discard the downloaded page now that the title has been read
+
+## Check local storage folder:
+## pick a name under NEVERNOTE_DIR that is not taken yet.
+## On a clash append "_dup.1"; if the name already ends in "_dup.N", use N+1.
+## PAGE_DIR comes from the page title (untrusted text), so the suffix is
+## recognised with a digits-only regex and never handed to arithmetic evaluation
+## as raw text; anything else after "_dup." is treated as part of the title.
+while [ -e "$NEVERNOTE_DIR/$PAGE_DIR" ]; do
+	if [[ $PAGE_DIR =~ ^(.*)_dup\.([0-9]+)$ ]]; then
+		## 10# forces base 10 so a suffix such as "08" is not read as octal
+		DUP=$(( 10#${BASH_REMATCH[2]} + 1 ))
+		PAGE_DIR="${BASH_REMATCH[1]}_dup.$DUP"
+	else
+		DUP=1
+		PAGE_DIR="${PAGE_DIR}_dup.$DUP"
+	fi
+done
+
+## Download the full page with its requisites (-p), spanning other hosts (-H) and
+## converting links for local viewing (-k/-K); a failure is logged, not fatal.
+wget -E -H -k -K -p -e robots=off --user-agent="" -T 15 -t 5 \
+	-o "$TMP_DIR/wget.log" -P "$TMP_DIR" "$URL"
+WGET_EXIT_CODE=$?
+[ "$WGET_EXIT_CODE" -eq 0 ] || printf '%s|%s\n' "$WGET_EXIT_CODE" "$URL" >> /tmp/nevernote-error-wget
+
+## Make link for index.html
+#pushd $TMP_DIR > /dev/null 2>&1
+#INDEX_PATH=`find ./ -name "${INDEX_PAGE}.orig"`
+#ln -s "${INDEX_PATH%.orig}" "$INDEX_PAGE" > /dev/null 2>&1
+#if [ "$?" -ne 0 ]; then
+#	echo Cant\'t link $URL
+#	echo $URL >> /tmp/nevernote-errors
+#	rm -r $TMP_DIR
+#	exit
+#fi
+#popd > /dev/null 2>&1
+
+## Save page url inside the page directory, then move the directory into storage.
+printf '%s\n' "$URL" > "$TMP_DIR/URL"
+## Record the URL as downloaded only if the move worked; on failure TMP_DIR is kept and the URL can be retried.
+mv -- "$TMP_DIR" "$NEVERNOTE_DIR/$PAGE_DIR" || { echo "can't save to $NEVERNOTE_DIR/$PAGE_DIR" >&2; exit 1; }
+echo "saved in $NEVERNOTE_DIR/$PAGE_DIR"
+printf '%s\n' "$URL" >> /tmp/nevernote-list-downloaded