- Update to 20120817
- Support STAGEDIR
- Remove indefinite article from COMMENT
This commit is contained in:
parent
6ee9a8e343
commit
e4f71f27d5
Notes:
svn2git
2021-03-31 03:12:20 +00:00
svn path=/head/; revision=331124
@ -1,14 +1,15 @@
|
||||
# Created by: <spam@rm-rf.kiev.ua>
|
||||
# $FreeBSD$
|
||||
|
||||
PORTNAME= googlebook_dl
|
||||
PORTVERSION= 20100502
|
||||
PORTVERSION= 20120817
|
||||
CATEGORIES= www
|
||||
MASTER_SITES= # none
|
||||
DISTFILES= # none
|
||||
|
||||
MAINTAINER= ak@FreeBSD.org
|
||||
COMMENT= A command-line utility for downloading books from Google Books
|
||||
COMMENT= Command-line utility for downloading books from Google Books
|
||||
|
||||
LICENSE= BSD
|
||||
|
||||
RUN_DEPENDS= wget:${PORTSDIR}/ftp/wget
|
||||
|
||||
@ -16,8 +17,8 @@ NO_BUILD= yes
|
||||
|
||||
PLIST_FILES= bin/${PORTNAME}
|
||||
|
||||
NO_STAGE= yes
|
||||
do-install:
|
||||
${INSTALL_SCRIPT} ${FILESDIR}/${PORTNAME}.sh ${PREFIX}/bin/${PORTNAME}
|
||||
${INSTALL_SCRIPT} ${FILESDIR}/${PORTNAME}.sh \
|
||||
${STAGEDIR}${PREFIX}/bin/${PORTNAME}
|
||||
|
||||
.include <bsd.port.mk>
|
||||
|
@ -1,8 +1,12 @@
|
||||
#!/bin/sh
|
||||
|
||||
#
|
||||
# SUBS
|
||||
#
|
||||
|
||||
parse_options()
|
||||
{
|
||||
local OPT OPTARG OPTIND
|
||||
local _proxylist
|
||||
|
||||
while getopts ap:P:vw: OPT; do
|
||||
# escape meta
|
||||
@ -10,7 +14,13 @@ parse_options()
|
||||
|
||||
case ${OPT} in
|
||||
a) all=yes ;;
|
||||
p) proxylist="${OPTARG}" ;;
|
||||
p) _proxylist="${OPTARG}"
|
||||
if [ -r "${_proxylist}" ]; then # file
|
||||
proxylist=$(cat "${_proxylist}")
|
||||
else # list
|
||||
proxylist=$(echo "${_proxylist}" | sed -e 's/,/ /g')
|
||||
fi
|
||||
;;
|
||||
P) pageprefix="${OPTARG}" ;;
|
||||
v) verbose=yes ;;
|
||||
w) pagewidth="${OPTARG}" ;;
|
||||
@ -21,97 +31,204 @@ parse_options()
|
||||
OPTC=$((${OPTIND} - 1))
|
||||
}
|
||||
|
||||
#
# isinteger arg
#
# Returns true (0) if arg is a positive/negative whole integer written as
# plain decimal digits with an optional leading sign, false (non-zero)
# otherwise.
# Stolen from bsdinstall, with an additional syntax check: the arithmetic
# probe alone also accepts expressions ("1+1"), hex literals ("0x10") and
# names of set variables, none of which are integers.
#
isinteger()
{
	local arg="$1"

	# after dropping one optional sign only decimal digits may remain;
	# this also rejects the empty string and a lone sign
	case "${arg#[-+]}" in
	''|*[!0-9]*)	return 1 ;;
	esac

	# prevent division-by-zero
	[ "${arg}" = "0" ] && return 0

	# attempt to perform arithmetic division (an operation which will exit
	# with error unless arg is a valid positive/negative whole integer).
	# Runs in a subshell so that a failed expansion cannot abort the script.
	( : $((0/$arg)) ) > /dev/null 2>&1
}
|
||||
|
||||
#
# err exitval message ...
#
# Writes "<script name>: <message>" to stderr and terminates the script
# with the exit status given as the first argument.
#
err()
{
	local exitval="$1"

	# drop the exit status so that $* holds only the message words
	shift

	echo "${0##*/}: $*" >&2
	exit ${exitval}
}
|
||||
|
||||
usage()
|
||||
{
|
||||
echo "usage: ${0##*/} [-ahPpw] totpages bookid"
|
||||
echo "usage: ${0##*/} [-ahPpvw] totpages bookid"
|
||||
echo ' -h display this help'
|
||||
echo ' -a all mode (try to get sigs from all pages, including already downloaded)'
|
||||
echo ' -a all mode (try to get links from all pages, including already downloaded)'
|
||||
echo ' -P pageprefix (*PA, PP, PR, PT)'
|
||||
echo ' -p proxylist'
|
||||
echo ' -p http://proxy.tld:port,proxy.tld,ip:port | proxylist.txt'
|
||||
echo ' -v verbose'
|
||||
echo ' -w pagewidth (800, *1024, 1280, 1440, 1680, ...)'
|
||||
echo
|
||||
exit 1
|
||||
}
|
||||
|
||||
get_pages()
|
||||
#
|
||||
# shows progress in dots/got_page numbers
|
||||
# stolen from portsnap
|
||||
#
|
||||
progress()
|
||||
{
|
||||
local ua page url _return
|
||||
local page
|
||||
|
||||
# with wrong ua we will get 401 Unauthorized
|
||||
# ua='Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Firefox/3.0'
|
||||
ua='Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)'
|
||||
page=$1
|
||||
if [ $((${page} % 10)) -eq 0 -a "${lastchar}" = '.' ]; then
|
||||
echo -n ${page}
|
||||
elif [ $((${page} % 2)) -eq 0 ]; then
|
||||
echo -n .
|
||||
fi
|
||||
}
|
||||
|
||||
#
# out $msg $verbose_msg
#
# In verbose mode (${verbose} non-empty) prints $verbose_msg on a line of
# its own; otherwise prints the short progress token $msg without a
# trailing newline.  A token is separated from the previous one by a space
# unless the previous output ended in '.', 'c' or 'e'.  The last character
# written is remembered in ${lastchar}, which the caller must provide.
# Either argument may be empty to print nothing in the corresponding mode;
# passing both empty is a usage error.
#
out()
{
	[ -z "$1" ] && [ -z "$2" ] && err 3 'out(): bad syntax'

	if [ -n "${verbose}" ] && [ -n "$2" ]; then
		# quoted: urls contain '?' and '&' and must not be subject to
		# word splitting or pathname expansion
		printf '%s\n' "$2"
	elif [ -z "${verbose}" ] && [ -n "$1" ]; then
		[ "$1" = '.' ] && lastchar=.
		# %b keeps escapes such as '\n' in $msg working without
		# treating $msg itself as a format string
		case ${lastchar} in
		[.ce])	printf '%b' "$1" ;;
		*)	printf ' %b' "$1" ;;
		esac
		# keep only the final character of $msg; the prefix is quoted
		# so it is matched literally rather than as a pattern
		lastchar=${1#"${1%?}"}
	fi
}
|
||||
|
||||
get_cookie()
|
||||
{
|
||||
local cookie_str _return
|
||||
|
||||
# remove old cookie
|
||||
rm "${cookie}" 2>/dev/null
|
||||
|
||||
# get cookie
|
||||
wget -T5 -t2 -q -U"${ua}" --keep-session-cookies \
|
||||
--save-cookies "${DIR}/cookies.txt" -O/dev/null \
|
||||
--save-cookies "${cookie}" -O/dev/null \
|
||||
"http://books.google.com/books?id=${bookid}&pg=PA1&jscmd=click3"
|
||||
|
||||
# bail if wget returned non zero exit code or cookies.txt is empty
|
||||
# fail if wget returned non-zero exitcode or cookies.txt is empty
|
||||
_return=$?
|
||||
cookie="$(grep '^.google.com' "${DIR}/cookies.txt" 2>/dev/null | \
|
||||
sed 's/^.*\(ID.*\)$/\1/')"
|
||||
[ ${_return} -ne 0 -o -z "${cookie}" ] && \
|
||||
{ rm "${DIR}/cookies.txt"; return 1; }
|
||||
cookie_str="$(grep '^.google.com[[:space:]]' "${cookie}" 2>/dev/null | \
|
||||
sed -ne 's/^.*\(ID=.*\)$/\1/p')"
|
||||
if [ ${_return} -ne 0 -o -z "${cookie_str}" ]; then
|
||||
rm "${cookie}" 2>/dev/null
|
||||
out 'E\n' "cannot get cookie: ${cookie_str}"
|
||||
return 1
|
||||
fi
|
||||
|
||||
# show cookie
|
||||
[ -n "${verbose}" ] && echo "cookie: ${cookie}"
|
||||
out 'c' "cookie: ${cookie_str}"
|
||||
}
|
||||
|
||||
# if less than half of the total pages have been downloaded, use all mode
|
||||
[ $(ls "${bookid}/" | wc -l) -le $((${totpages} / 2)) ] && all=yes
|
||||
get_page()
|
||||
{
|
||||
local page url urls _return
|
||||
|
||||
# pull sigs only from missing pages unless in all mode
|
||||
page=1
|
||||
while [ ${page} -le ${totpages} ]; do
|
||||
[ -f "${bookid}/${pageprefix}${page}" -a -z "${all}" ] || \
|
||||
echo "http://books.google.com/books?id=${bookid}&pg=${pageprefix}${page}&jscmd=click3" \
|
||||
>> "${DIR}/urls"
|
||||
page=$(( ${page} + 1))
|
||||
done
|
||||
[ -z $1 ] && err 3 'get_page(): bad syntax'
|
||||
page=$1
|
||||
|
||||
# get all sigs at once
|
||||
# NB! sigs tied to cookie and ip
|
||||
wget -T5 -t2 -q -U"${ua}" --no-cache --load-cookies "${DIR}/cookies.txt" \
|
||||
-O- -i "${DIR}/urls" | tr '}' '\n' | grep "{\"pid\":\"P.*\",\"src\":" | \
|
||||
sed 's/^.*"src":"\(http:\/\/[^"]*\)".*$/\1/;s/\\u0026/\&/g' | sort -u | \
|
||||
while read -r url; do
|
||||
# pull signatures only from missing pages unless in all mode
|
||||
[ -f "${bookid}/${pageprefix}${page}.png" -a -z "${all}" ] && return
|
||||
|
||||
# change cookie every 100 pages
|
||||
if [ $((${got_pages} % 100)) -eq 0 ]; then
|
||||
get_cookie || return 1
|
||||
fi
|
||||
got_pages=$((${got_pages} + 1))
|
||||
|
||||
url="http://books.google.com/books?id=${bookid}&pg=${pageprefix}${page}&jscmd=click3"
|
||||
out "$(progress ${got_pages})" "${pageprefix}${page}: ${url}&w=${pagewidth} TRY"
|
||||
|
||||
# NB! signatures tied to cookie and ip
|
||||
urls=$(wget -T5 -t2 -q -U"${ua}" --no-cache \
|
||||
--load-cookies "${cookie}" -O- \
|
||||
"${url}" | tr '}' '\n' | grep "{\"pid\":\"P.*\",\"src\":" | \
|
||||
sed 's/^.*"src":"\(http:\/\/[^"]*\)".*$/\1/;s/\\u0026/\&/g' | sort -u)
|
||||
|
||||
for url in ${urls}; do
|
||||
page=$(echo "${url}" | sed 's/^.*&pg=\([^&]*\)&.*$/\1/')
|
||||
|
||||
[ -n "${verbose}" ] && verbose="${page}: ${url}&w=${pagewidth}"
|
||||
# check again if page already downloaded, we usually get a few
|
||||
# urls from a single request
|
||||
if [ ! -f "${bookid}/${page}.png" ]; then
|
||||
got_pages=$((${got_pages} + 1))
|
||||
|
||||
# skip already downloaded pages
|
||||
[ -f "${bookid}/${page}" ] || \
|
||||
{
|
||||
wget -T5 -t3 -q -U"${ua}" --no-cache \
|
||||
--load-cookies "${DIR}/cookies.txt" \
|
||||
-O"${bookid}/${page}" "${url}&w=${pagewidth}"
|
||||
--load-cookies "${cookie}" \
|
||||
-O"${bookid}/${page}.png" "${url}&w=${pagewidth}"
|
||||
|
||||
_return=$?
|
||||
if [ ${_return} -ne 0 ]; then
|
||||
# sometimes google books returns 404
|
||||
rm "${bookid}/${page}"
|
||||
[ -n "${verbose}" ] && verbose="${verbose} ERROR"
|
||||
# sometimes google books just returns 404
|
||||
rm "${bookid}/${page}.png"
|
||||
out 'e' "${page}: ${url}&w=${pagewidth} ERROR"
|
||||
else
|
||||
if [ -n "${verbose}" ]; then
|
||||
verbose="${verbose} DOWNLOADED"
|
||||
else
|
||||
echo -n "${page} "
|
||||
fi
|
||||
out "${page}" "${page}: ${url}&w=${pagewidth} DOWNLOADED"
|
||||
fi
|
||||
}
|
||||
|
||||
[ -n "${verbose}" ] && echo "${verbose}"
|
||||
else
|
||||
out '' "${page}: ${url}&w=${pagewidth} ALREADY"
|
||||
fi
|
||||
done
|
||||
# clean temp files
|
||||
rm "${DIR}/cookies.txt" "${DIR}/urls"
|
||||
}
|
||||
|
||||
#
# get_pages
#
# Fetches pages 1..${totpages} one by one through get_page(), walking
# either upwards or downwards; the direction is picked at random on every
# call.  A cookie is obtained first via get_cookie().  Returns 1 as soon as
# get_cookie() or get_page() fails.
#
get_pages()
{
	local page got_pages order step left

	# for out(), progress()
	local lastchar=.

	got_pages=1

	# randomize page requests - google books only shows 200 - 300 urls in
	# one session
	# XXX jot(1) is not portable
	if [ $(jot -r 1 0 1) -ne 0 ]; then
		order=ascending
		page=1
		step=1
	else
		order=descending
		page=${totpages}
		step=-1
	fi
	echo "fetching pages in ${order} order"

	get_cookie || return 1

	# both directions visit exactly ${totpages} pages, so count the
	# remaining requests instead of testing the page number itself
	left=${totpages}
	while [ ${left} -ge 1 ]; do
		get_page ${page} || return 1
		page=$((${page} + ${step}))
		left=$((${left} - 1))
	done

	# finish the progress line
	echo
}
|
||||
|
||||
|
||||
#
|
||||
# MAIN
|
||||
#
|
||||
|
||||
# with wrong UserAgent we will get 401 Unauthorized
|
||||
# ua='Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Firefox/3.0'
|
||||
ua='Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1)'
|
||||
|
||||
# default page width
|
||||
pagewidth=1024
|
||||
|
||||
@ -124,28 +241,31 @@ pageprefix=PA
|
||||
parse_options ${1+"$@"}
|
||||
shift ${OPTC}
|
||||
|
||||
isinteger "${pagewidth}" || err 4 "pagewidth must be integer: ${pagewidth}"
|
||||
|
||||
[ -z $1 ] && usage
|
||||
totpages=$1
|
||||
isinteger "${totpages}" || err 4 "totpages must be integer: ${totpages}"
|
||||
|
||||
[ -z $2 ] && usage
|
||||
bookid=$2
|
||||
|
||||
# if bookid dir already exists, continue from previous try
|
||||
[ -d "${bookid}" ] || \
|
||||
{
|
||||
mkdir "${bookid}" || { echo "cannot create dir ${bookid}"; exit 2; }
|
||||
}
|
||||
# if bookid dir already exists, continue from previous try
|
||||
if [ ! -d "${bookid}" ]; then
|
||||
mkdir -- "${bookid}" || err 2 "cannot create dir ${bookid}"
|
||||
fi
|
||||
|
||||
DIR=`mktemp -d googlebook_dl.XXXXXXXXXX` || exit 2
|
||||
trap "rm -rf ${DIR}; exit 1" 1 2 3 10 13 15
|
||||
cookie=`mktemp -t cookie` || err 2 'mktemp error'
|
||||
|
||||
trap "rm ${cookie} 2>/dev/null; exit 1" 1 2 3 10 13 15
|
||||
|
||||
if [ -z "${proxylist}" ]; then
|
||||
get_pages
|
||||
else
|
||||
for http_proxy in `cat "${proxylist}"`; do
|
||||
for http_proxy in ${proxylist}; do
|
||||
echo "using proxy ${http_proxy}"
|
||||
get_pages
|
||||
done
|
||||
fi
|
||||
|
||||
rmdir "${DIR}"
|
||||
rm "${cookie}" 2>/dev/null
|
||||
|
Loading…
Reference in New Issue
Block a user