#!/bin/bash
#
# Save a local copy of a web page.
#
# Usage: <this script> URL [NOTEBOOK] [FLAGS]
#   URL       page to download (required)
#   NOTEBOOK  sub-directory of /mnt/tabula/nevernote that receives the page
#             (defaults to "read")
#   FLAGS     when it contains the word "todo", the saved page is also
#             symlinked into /mnt/tabula/nevernote/todo
#
# Bookkeeping files (URL lists and error logs) are kept in $HOME/.nevernote.

## The URL is mandatory. The exit status deliberately stays 0, as it has
## always been, so existing batch callers keep working; the usage hint is
## written to stderr only.
if [ -z "$1" ]; then
  printf 'Usage: %s URL [NOTEBOOK] [todo]\n' "${0##*/}" >&2
  exit 0
fi

URL=$1
|
2012-10-07 11:26:44 +00:00
|
|
|
|
|
|
|
|
## nevernote_parse_options NOTEBOOK_ARG FLAGS_ARG
## Sets the globals NOTEBOOK and TODO from the optional script arguments.
##   NOTEBOOK_ARG - notebook name; an empty or missing value selects "read".
##   FLAGS_ARG    - free-form flags; TODO becomes "yes" when the value
##                  contains the substring "todo", otherwise "no".
##
## The flag test is a plain string match. Piping an unquoted `echo $3` into
## grep would let the shell glob the value first, so an argument such as "*"
## could match a file in the current directory that has "todo" in its name.
nevernote_parse_options() {
  NOTEBOOK=${1:-read}
  case "${2-}" in
    *todo*) TODO='yes' ;;
    *) TODO='no' ;;
  esac
}

nevernote_parse_options "${2-}" "${3-}"
|
|
|
|
|
|
2012-10-07 13:31:29 +00:00
|
|
|
## Working locations.
##   $HOME/.nevernote - bookkeeping files and the per-run scratch directory
##   TMP_DIR          - fresh scratch directory for this run
##   NEVERNOTE_DIR    - final storage directory of the selected notebook
##   TODO_DIR         - directory that collects symlinks to "todo" pages
NEVERNOTE_DIR="/mnt/tabula/nevernote/$NOTEBOOK"
TODO_DIR="/mnt/tabula/nevernote/todo"

mkdir -p -- "$HOME/.nevernote"

## Stop right away when no scratch directory can be created: with an empty
## TMP_DIR the later wget/ls/rm/mv calls would operate on the wrong paths.
if ! TMP_DIR=$(mktemp -d "$HOME/.nevernote/nevernote.XXXXXX"); then
  echo "cannot create a temporary directory in $HOME/.nevernote" >&2
  exit 1
fi
|
2012-08-18 22:46:46 +00:00
|
|
|
|
|
|
|
|
## Log header for this run: a blank separator line, the current time
## (HH:MM:SS) and the URL being processed. The URL is printed verbatim;
## it is quoted so the shell neither globs nor re-splits it.
echo
date '+%H:%M:%S'
printf '%s\n' "$URL"
|
|
|
|
|
|
|
|
|
|
## url_in_list URL FILE
## Succeeds when FILE contains a line that is exactly URL.
## The URL is compared as a fixed string (-F), not as a regular expression,
## so characters like "." or "[" in a URL are taken literally. A missing
## FILE (e.g. on the first run) simply counts as "not listed", which is why
## grep's error output is discarded.
url_in_list() {
  grep -qxF -- "$1" "$2" 2> /dev/null
}

## Check if URL is forbidden to download
if url_in_list "$URL" "$HOME/.nevernote/nevernote-list-excluded"; then
  echo "exclude"
  printf '%s\n' "$URL" >> "$HOME/.nevernote/nevernote-error-excluded"
  rm -r -- "$TMP_DIR"
  exit
fi

## Check if it is downloading now (disabled)
#ps ax | grep "./scripts/nevernote.sh" | awk '{print($7)}' | grep -x "$URL"
#if [ "$?" -eq 0 ]; then
#  echo "downloading now"
#  rm -r $TMP_DIR
#  exit
#fi

## Check downloaded urls for duplicates
#head -qn 1 ${NEVERNOTE_DIR}/*/wget.log | awk '{print($3)}' | grep -x "$URL" > /dev/null 2>&1
if url_in_list "$URL" "$HOME/.nevernote/nevernote-list-downloaded"; then
  echo "dublicate"
  printf '%s\n' "$URL" >> "$HOME/.nevernote/nevernote-error-dups"
  rm -r -- "$TMP_DIR"
  exit
fi
|
|
|
|
|
|
|
|
|
|
## Probe download: fetch only the page itself into the scratch directory.
## The result shows whether the URL can be fetched at all and is read later
## to obtain the page title. wget output is discarded on purpose; failure is
## detected from the empty directory instead.
wget -T 15 -t 5 --user-agent="" -P "$TMP_DIR" -- "$URL" > /dev/null 2>&1

## The scratch directory was empty before, so its listing is the file that
## wget just stored (its name depends on the URL).
INDEX_PAGE=$(ls -- "$TMP_DIR")

if [ -z "$INDEX_PAGE" ]; then
  echo "download error"
  printf '%s\n' "$URL" >> "$HOME/.nevernote/nevernote-error-404"
  rm -r -- "$TMP_DIR"
  exit
fi
|
|
|
|
|
|
|
|
|
|
## Convert page to system's charset
enconv "$TMP_DIR/$INDEX_PAGE" > /dev/null 2>&1

## Remove CR and LF symbols (disabled)
#tr -d '\n' < "$TMP_DIR/$INDEX_PAGE" | tr -d '\r' > "$TMP_DIR/${INDEX_PAGE}.plain"
#mv "$TMP_DIR/${INDEX_PAGE}.plain" "$TMP_DIR/$INDEX_PAGE"

## page_title_to_dirname FILE
## Prints a directory name derived from the <title> element(s) of FILE, or
## an empty line when no title is found on a single line of FILE.
## Steps:
##   1. take the text between <title> and </title>;
##   2. replace characters that are awkward in file names with "_";
##   3. collapse runs of spaces/tabs/newlines into one space and trim;
##   4. drop trailing dots (and any space they leave behind);
##   5. cut the result to at most 100 characters.
## Everything is done inside the pipeline so the title is never expanded
## unquoted: a title such as "[ab]" must not be globbed against the current
## directory, and a title such as "-n" must not be read as an echo option.
page_title_to_dirname() {
  local title
  title=$(
    sed -n -e 's/.*<title>\(.*\)<\/title>.*/\1/p' "$1" \
      | sed 's+[\+\{\;\"\\\=\?~\(\)\<\>\&\*\|\$\/\#:]+_+g' \
      | tr '\t\n' '  ' \
      | tr -s ' ' \
      | sed -e 's/^ *//' -e 's/ *$//' -e 's/\.*$//' -e 's/ *$//'
  )
  printf '%s\n' "${title:0:100}"
}

## Extract title and leave non-destruct chars
PAGE_DIR=$(page_title_to_dirname "$TMP_DIR/$INDEX_PAGE")

## If title wasn't parsed, leave random name
if [ -z "$PAGE_DIR" ]; then
  PAGE_DIR=$(basename -- "$TMP_DIR")
fi

## The probe copy is no longer needed; the full download fetches it again.
rm -- "$TMP_DIR/$INDEX_PAGE"
|
|
|
|
|
|
|
|
|
|
## Check local storage folder
## If duplicate - rename (add "_dup.X" to the end)
##
## free_page_dir PARENT NAME
## Prints NAME if PARENT/NAME does not exist yet; otherwise prints the first
## free name of the form "NAME_dup.N". When NAME itself already ends in
## "_dup.<digits>", counting continues from that number. Only a purely
## numeric suffix is treated as a counter, so a title that merely contains
## "_dup." keeps its text. Existence is tested directly (dangling symlinks
## included) instead of relying on the exit status of `ls`.
free_page_dir() {
  local parent=$1 name=$2 stem n
  while [ -e "$parent/$name" ] || [ -L "$parent/$name" ]; do
    if [[ $name =~ ^(.*)_dup\.([0-9]+)$ ]]; then
      stem=${BASH_REMATCH[1]}
      # 10# forces base 10 so a counter like "08" is not parsed as octal.
      n=$((10#${BASH_REMATCH[2]} + 1))
    else
      stem=$name
      n=1
    fi
    name="${stem}_dup.${n}"
  done
  printf '%s\n' "$name"
}

PAGE_DIR=$(free_page_dir "$NEVERNOTE_DIR" "$PAGE_DIR")
|
|
|
|
|
|
|
|
|
|
## Download full page
## Fetches the page with its requisites (-p), across hosts (-H), into the
## flat scratch directory (-nd -P). Links are converted for local viewing
## (-k) with the unconverted originals kept as backups (-K), and an html
## suffix is added where needed (-E). robots.txt is ignored and the
## User-Agent is empty. wget's own log goes to wget.log next to the files.
wget -E -H -k -K -p -e robots=off --user-agent="" -T 15 -t 5 -nd \
  -o "$TMP_DIR/wget.log" -P "$TMP_DIR" -- "$URL"
WGET_EXIT_CODE=$?

## A non-zero wget status is only recorded, as "<status>|<url>"; the script
## carries on with whatever was downloaded.
if [ "$WGET_EXIT_CODE" -ne 0 ]; then
  printf '%s|%s\n' "$WGET_EXIT_CODE" "$URL" >> "$HOME/.nevernote/nevernote-error-wget"
fi
|
|
|
|
|
|
|
|
|
|
## Make link for index.html (disabled)
#pushd $TMP_DIR > /dev/null 2>&1
#INDEX_PATH=`find ./ -name "${INDEX_PAGE}.orig"`
#ln -s "${INDEX_PATH%.orig}" "$INDEX_PAGE" > /dev/null 2>&1
#if [ "$?" -ne 0 ]; then
#  echo Can\'t link $URL
#  echo $URL >> $HOME/.nevernote/nevernote-errors
#  rm -r $TMP_DIR
#  exit
#fi
#popd > /dev/null 2>&1
|
|
|
|
|
|
|
|
|
|
## Save page url
printf '%s\n' "$URL" > "$TMP_DIR/URL"

## Move the finished download into the notebook directory.
## If the target cannot be created or the move fails (for example because
## the storage is not mounted), stop here: the download is left in TMP_DIR
## and the URL is NOT recorded as downloaded, so it can be retried later.
if ! mkdir -p -- "$NEVERNOTE_DIR" \
  || ! mv -- "$TMP_DIR" "$NEVERNOTE_DIR/$PAGE_DIR"; then
  echo "cannot store page in $NEVERNOTE_DIR/$PAGE_DIR (download left in $TMP_DIR)" >&2
  exit 1
fi
echo "saved in $NEVERNOTE_DIR/$PAGE_DIR"

## Pages flagged "todo" additionally get a symlink in the todo directory.
if [ "$TODO" = "yes" ]; then
  mkdir -p -- "$TODO_DIR"
  ln -s -- "$NEVERNOTE_DIR/$PAGE_DIR" "$TODO_DIR"
fi

## Remember the URL so that later runs report it as a duplicate.
printf '%s\n' "$URL" >> "$HOME/.nevernote/nevernote-list-downloaded"
|