diff --git a/www/googlebook_dl/Makefile b/www/googlebook_dl/Makefile
index 32f345617269..d1034bd8fe40 100644
--- a/www/googlebook_dl/Makefile
+++ b/www/googlebook_dl/Makefile
@@ -1,14 +1,15 @@
-# Created by:
 # $FreeBSD$
 
 PORTNAME=	googlebook_dl
-PORTVERSION=	20100502
+PORTVERSION=	20120817
 CATEGORIES=	www
 MASTER_SITES=	# none
 DISTFILES=	# none
 
 MAINTAINER=	ak@FreeBSD.org
-COMMENT=	A command-line utility for downloading books from Google Books
+COMMENT=	Command-line utility for downloading books from Google Books
+
+LICENSE=	BSD
 
 RUN_DEPENDS=	wget:${PORTSDIR}/ftp/wget
 
@@ -16,8 +17,8 @@ NO_BUILD=	yes
 
 PLIST_FILES=	bin/${PORTNAME}
 
-NO_STAGE=	yes
 do-install:
-	${INSTALL_SCRIPT} ${FILESDIR}/${PORTNAME}.sh ${PREFIX}/bin/${PORTNAME}
+	${INSTALL_SCRIPT} ${FILESDIR}/${PORTNAME}.sh \
+		${STAGEDIR}${PREFIX}/bin/${PORTNAME}
 
 .include <bsd.port.mk>
diff --git a/www/googlebook_dl/files/googlebook_dl.sh b/www/googlebook_dl/files/googlebook_dl.sh
index e8afa512c327..cfc5f5bf9679 100644
--- a/www/googlebook_dl/files/googlebook_dl.sh
+++ b/www/googlebook_dl/files/googlebook_dl.sh
@@ -1,8 +1,12 @@
 #!/bin/sh
 
+#
+# SUBS
+#
+
 parse_options()
 {
-	local OPT OPTARG OPTIND
+	local _proxylist
 
 	while getopts ap:P:vw: OPT; do
 		# escape meta
@@ -10,7 +14,13 @@
 
 		case ${OPT} in
 		a) all=yes ;;
-		p) proxylist="${OPTARG}" ;;
+		p) _proxylist="${OPTARG}"
+			if [ -r "${_proxylist}" ]; then	# file
+				proxylist=$(cat "${_proxylist}")
+			else				# list
+				proxylist=$(echo "${_proxylist}" | sed -e 's/,/ /g')
+			fi
+			;;
 		P) pageprefix="${OPTARG}" ;;
 		v) verbose=yes ;;
 		w) pagewidth="${OPTARG}" ;;
@@ -21,97 +31,204 @@
 	OPTC=$((${OPTIND} - 1))
 }
 
+#
+# returns true if argument is a positive/negative whole integer.
+# stolen from bsdinstall
+#
+isinteger()
+{
+	local arg="$1"
+
+	# prevent division-by-zero
+	[ "${arg}" = "0" ] && return
+
+	# attempt to perform arithmetic division (an operation which will exit
+	# with error unless arg is a valid positive/negative whole integer).
+	( : $((0/$arg)) ) > /dev/null 2>&1
+}
+
+err()
+{
+	local exitval
+
+	exitval=$1
+	shift
+	echo 1>&2 "${0##*/}: $*"
+	exit ${exitval}
+}
+
 usage()
 {
-	echo "usage: ${0##*/} [-ahPpw] totpages bookid"
+	echo "usage: ${0##*/} [-ahPpvw] totpages bookid"
 	echo ' -h display this help'
-	echo ' -a all mode (try to get sigs from all pages, including already downloaded)'
+	echo ' -a all mode (try to get links from all pages, including already downloaded)'
 	echo ' -P pageprefix (*PA, PP, PR, PT)'
-	echo ' -p proxylist'
+	echo ' -p http://proxy.tld:port,proxy.tld,ip:port | proxylist.txt'
 	echo ' -v verbose'
 	echo ' -w pagewidth (800, *1024, 1280, 1440, 1680, ...)'
 	echo
 	exit 1
 }
 
-get_pages()
+#
+# shows progress in dots/got_page numbers
+# stolen from portsnap
+#
+progress()
 {
-	local ua page url _return
+	local page
 
-	# with wrong ua we will get 401 Unauthorized
-	# ua='Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Firefox/3.0'
-	ua='Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)'
+	page=$1
+	if [ $((${page} % 10)) -eq 0 -a "${lastchar}" = '.' ]; then
+		echo -n ${page}
+	elif [ $((${page} % 2)) -eq 0 ]; then
+		echo -n .
+	fi
+}
+
+#
+# out $msg $verbose_msg
+#
+out()
+{
+	[ -z "$1" -a -z "$2" ] && err 3 'out(): bad syntax'
+
+	if [ -n "${verbose}" -a -n "$2" ]; then
+		echo $2
+	elif [ -z "${verbose}" -a ! -z "$1" ]; then
+		[ "$1" = '.' ] && lastchar=.
+		case ${lastchar} in
+		[.ce]) printf "$1" ;;
+		*) printf " $1" ;;
+		esac
+		lastchar=${1#${1%?}}
+	fi
+}
+
+get_cookie()
+{
+	local cookie_str _return
+
+	# remove old cookie
+	rm "${cookie}" 2>/dev/null
 
 	# get cookie
 	wget -T5 -t2 -q -U"${ua}" --keep-session-cookies \
-	    --save-cookies "${DIR}/cookies.txt" -O/dev/null \
+	    --save-cookies "${cookie}" -O/dev/null \
 	    "http://books.google.com/books?id=${bookid}&pg=PA1&jscmd=click3"
 
-	# bail if wget returned non zero exit code or cookies.txt is empty
+	# fail if wget returned a non-zero exit code or cookies.txt is empty
 	_return=$?
-	cookie="$(grep '^.google.com' "${DIR}/cookies.txt" 2>/dev/null | \
-	    sed 's/^.*\(ID.*\)$/\1/')"
-	[ ${_return} -ne 0 -o -z "${cookie}" ] && \
-	    { rm "${DIR}/cookies.txt"; return 1; }
+	cookie_str="$(grep '^.google.com[[:space:]]' "${cookie}" 2>/dev/null | \
+	    sed -ne 's/^.*\(ID=.*\)$/\1/p')"
+	if [ ${_return} -ne 0 -o -z "${cookie_str}" ]; then
+		rm "${cookie}" 2>/dev/null
+		out 'E\n' "cannot get cookie: ${cookie_str}"
+		return 1
+	fi
 
 	# show cookie
-	[ -n "${verbose}" ] && echo "cookie: ${cookie}"
+	out 'c' "cookie: ${cookie_str}"
+}
 
-	# if downloaded less that half of total pages, use all mode
-	[ $(ls "${bookid}/" | wc -l) -le $((${totpages} / 2)) ] && all=yes
+get_page()
+{
+	local page url urls _return
 
-	# pull sigs only from missing pages unless in all mode
-	page=1
-	while [ ${page} -le ${totpages} ]; do
-		[ -f "${bookid}/${pageprefix}${page}" -a -z "${all}" ] || \
-		    echo "http://books.google.com/books?id=${bookid}&pg=${pageprefix}${page}&jscmd=click3" \
-		    >> "${DIR}/urls"
-		page=$(( ${page} + 1))
-	done
+	[ -z $1 ] && err 3 'get_page(): bad syntax'
+	page=$1
 
-	# get all sigs at once
-	# NB! sigs tied to cookie and ip
-	wget -T5 -t2 -q -U"${ua}" --no-cache --load-cookies "${DIR}/cookies.txt" \
-	    -O- -i "${DIR}/urls" | tr '}' '\n' | grep "{\"pid\":\"P.*\",\"src\":" | \
-	    sed 's/^.*"src":"\(http:\/\/[^"]*\)".*$/\1/;s/\\u0026/\&/g' | sort -u | \
-	    while read -r url; do
+	# pull signatures only from missing pages unless in all mode
+	[ -f "${bookid}/${pageprefix}${page}.png" -a -z "${all}" ] && return
+
+	# change cookie every 100 pages
+	if [ $((${got_pages} % 100)) -eq 0 ]; then
+		get_cookie || return 1
+	fi
+	got_pages=$((${got_pages} + 1))
+
+	url="http://books.google.com/books?id=${bookid}&pg=${pageprefix}${page}&jscmd=click3"
+	out "$(progress ${got_pages})" "${pageprefix}${page}: ${url}&w=${pagewidth} TRY"
+
+	# NB! signatures tied to cookie and ip
+	urls=$(wget -T5 -t2 -q -U"${ua}" --no-cache \
+	    --load-cookies "${cookie}" -O- \
+	    "${url}" | tr '}' '\n' | grep "{\"pid\":\"P.*\",\"src\":" | \
+	    sed 's/^.*"src":"\(http:\/\/[^"]*\)".*$/\1/;s/\\u0026/\&/g' | sort -u)
+
+	for url in ${urls}; do
 		page=$(echo "${url}" | sed 's/^.*&pg=\([^&]*\)&.*$/\1/')
-		[ -n "${verbose}" ] && verbose="${page}: ${url}&w=${pagewidth}"
+		# check again if page already downloaded, we usually get a few
+		# urls from a single request
+		if [ ! -f "${bookid}/${page}.png" ]; then
+			got_pages=$((${got_pages} + 1))
 
-		# skip already downloaded pages
-		[ -f "${bookid}/${page}" ] || \
-		    { wget -T5 -t3 -q -U"${ua}" --no-cache \
-			--load-cookies "${DIR}/cookies.txt" \
-			-O"${bookid}/${page}" "${url}&w=${pagewidth}"
+			wget -T5 -t3 -q -U"${ua}" --no-cache \
+			    --load-cookies "${cookie}" \
+			    -O"${bookid}/${page}.png" "${url}&w=${pagewidth}"
 			_return=$?
 			if [ ${_return} -ne 0 ]; then
-				# sometimes google books returns 404
-				rm "${bookid}/${page}"
-				[ -n "${verbose}" ] && verbose="${verbose} ERROR"
+				# sometimes google books just returns 404
+				rm "${bookid}/${page}.png"
+				out 'e' "${page}: ${url}&w=${pagewidth} ERROR"
 			else
-				if [ -n "${verbose}" ]; then
-					verbose="${verbose} DOWNLOADED"
-				else
-					echo -n "${page} "
-				fi
+				out "${page}" "${page}: ${url}&w=${pagewidth} DOWNLOADED"
 			fi
-		    }
-
-		[ -n "${verbose}" ] && echo "${verbose}"
+		else
+			out '' "${page}: ${url}&w=${pagewidth} ALREADY"
+		fi
 	done
-
-	# clean temp files
-	rm "${DIR}/cookies.txt" "${DIR}/urls"
+}
+
+get_pages()
+{
+	local page got_pages
+
+	# for out(), progress()
+	local lastchar=.
+
+	got_pages=1
+
+	# randomize page requests - google books only shows 200 - 300 urls in one
+	# session
+	#
+	# if start on odd second count from 1 to totpages, on even - from totpages to 1
+	# [ $((`date -j "+%s"` % 2)) -eq 0 ] && descending_order=yes
+	# XXX not portable
+	if [ $(jot -r 1 0 1) -ne 0 ]; then
+		echo "fetching pages in ascending order"
+
+		get_cookie || return 1
+		page=1
+		while [ ${page} -le ${totpages} ]; do
+			get_page ${page} || return 1
+			page=$((${page} + 1))
+		done
+	else
+		echo "fetching pages in descending order"
+
+		get_cookie || return 1
+		page=${totpages}
+		while [ ${page} -ge 1 ]; do
+			get_page ${page} || return 1
+			page=$((${page} - 1))
+		done
+	fi
 
 	echo
 }
 
+
 #
 # MAIN
 #
 
+# with wrong UserAgent we will get 401 Unauthorized
+# ua='Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Firefox/3.0'
+ua='Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)'
+
 # default page width
 pagewidth=1024
 
@@ -124,28 +241,31 @@ pageprefix=PA
 
 parse_options ${1+"$@"}
 shift ${OPTC}
+isinteger "${pagewidth}" || err 4 "pagewidth must be integer: ${pagewidth}"
+
 [ -z $1 ] && usage
 totpages=$1
+isinteger "${totpages}" || err 4 "totpages must be integer: ${totpages}"
 
 [ -z $2 ] && usage
 bookid=$2
 
-# if bookid dir already exists, continue from previous try
-[ -d "${bookid}" ] || \
-{
-	mkdir "${bookid}" || { echo "cannot create dir ${bookid}"; exit 2; }
-}
+# if bookid dir already exists, continue from previous try
+if [ ! -d "${bookid}" ]; then
+	mkdir -- "${bookid}" || err 2 "cannot create dir ${bookid}"
+fi
 
-DIR=`mktemp -d googlebook_dl.XXXXXXXXXX` || exit 2
-trap "rm -rf ${DIR}; exit 1" 1 2 3 10 13 15
+cookie=`mktemp -t cookie` || err 2 'mktemp error'
+
+trap "rm ${cookie} 2>/dev/null; exit 1" 1 2 3 10 13 15
 
 if [ -z "${proxylist}" ]; then
 	get_pages
 else
-	for http_proxy in `cat "${proxylist}"`; do
+	for http_proxy in ${proxylist}; do
 		echo "using proxy ${http_proxy}"
 		get_pages
 	done
fi
 
-rmdir "${DIR}"
+rm "${cookie}" 2>/dev/null
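
Usage sketch (not part of the patch; the book id, page count, and proxy
hosts below are placeholders): with the reworked parse_options, -p accepts
either a comma-separated list of proxies or a readable proxylist file, and
the positional arguments remain totpages and bookid.

	# fetch a 250-page book at the default 1024 px page width, verbose output
	googlebook_dl -v 250 SOMEBOOKID

	# the same fetch routed through two proxies given inline; passing a
	# file name such as proxylist.txt works as well
	googlebook_dl -p http://proxy1.example.org:3128,proxy2.example.org 250 SOMEBOOKID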