# frmtget.py
#
# download
#
# frmtget.py
#
#!/usr/bin/env python2.4
# :set ts=4
 
"""
Author:  demod at foosel dot net
Licence: gpl
Version: 0.3
 
 
Requirements
 - python >= 2.4
 - wget
 
 
The Idea
 frmtget's purpose is to fetch files over HTTP that are laid out in a
 tiered structure, without resorting to brute force or a simple traversal
 of directory structures.
 
 - Example
   A set of articles, spread over multiple directories, like:
   http://foo/author[0-5]/article[0-10]/chapter[0-20]-subchapter[0-5].pdf
 
 
Quick Start
 - simple patterns
   * convert the url of the files you want into a format-string
       url = "http://foo/article001/chapter01-subchapter1.pdf"
     => fs = "http://foo/article%03i/chapter%02i-subchapter%i.pdf"
   * run frmtget
     % frmtget.py $fs
 
 - complex patterns
   * complex patterns contain a single counter (e.g. a chapter number) more
     than once, so you have to specify with -c how the counters are to be
     used to generate the URLs.
     -c specifies the indices of the internal counter array (see below).
   * Examples
     - with reoccurring chapter numbers
       *   url = "http://foo/chapter01/chapter01-subchapter1.pdf"
         => fs = "http://foo/chapter%02i/chapter%02i-subchapter%i.pdf"
       * Note: the chapter id got the index 0 in the internal counter array
         because it is the first to occur, the sub-chapter id got the index 1
         => -c 0,0,1
       * run frmtget
         frmtget.py -c 0,0,1 $fs
     - with other numbers
       *   url = "http://foo/year00/author000/year00/author000-num0.pdf"
         => fs = "http://foo/year%02i/author%03i/year%02i-author%03i-num%i.pdf"
       * run frmtget
         frmtget.py -c 0,1,0,1,2 $fs
"""
 
import os
import sys
import re
import urllib
from optparse import OptionParser
 
# module-level configuration, filled in by setup(): the parsed optparse
# values and the format string
options = None
fs = None
 
def main():
	"""Fetch every URL generated from the format string with wget.

	Calls setup() to parse the command line, then repeatedly builds a URL,
	runs wget on it and advances the counters via getNext() until getNext()
	reports that there is nothing left to try.

	The global ``abortCount`` is updated after every fetch: it is 0 after a
	success and ``consecutive failures / tries`` otherwise. incState() reads
	it to decide which counter to advance.

	A KeyboardInterrupt ends the loop with a message instead of a traceback.
	"""
	global abortCount

	setup()

	# guard against a zero or negative --tries (would divide by zero below)
	tries = max(options.tries, 1)
	# consecutive failed fetches since the last successful one
	failures = 0

	try:
		url = genUrl()

		while url:
			# wget
			dePrint("# url: %s" % url, level=0)

			# The argument list handed to os.spawnvp must start with the
			# program name (argv[0]); without it the first real option
			# ("-c") is taken as the name and never reaches wget.
			# Empty entries are dropped so wget does not see "" as a URL.
			argv = ["wget"]
			argv.extend([opt for opt in options.wgetOptions if opt])
			argv.append(url)
			ret = os.spawnvp(os.P_WAIT, "wget", argv)

			if ret == 0:
				failures = 0
			else:
				failures += 1
				dePrint("# fetch error: %s" % url, level=0)

			# Derived from an integer count so that e.g. 10 failures with
			# --tries 10 give exactly 1.0; summing 0.1 ten times yields
			# 0.9999999999999999 and would delay the escalation by one fetch.
			abortCount = failures / float(tries)

			url = getNext()

	except KeyboardInterrupt:
		dePrint("# KeyboardInterrupt caught", level=0)
 
 
def setup():
	global options, fs, state, abortCount
 
	## check the interpreter version
	#
	minVer = [2, 4]
	ver = sys.version_info[0:2]
	if not (ver[0] > minVer[0] or (ver[0] == minVer[0] and ver[1] >= minVer[1])):
		print >>sys.stderr, "this program requires python >=%s, aborting." % ",".join(map(str, minVer))
		sys.exit(-1)
 
	## argument parsing setup
	#
	parser = OptionParser(version = "%prog 0.1")
	parser.set_usage("%prog [options] formatstring")
 
	parser.add_option("--doc",
		action = "store_true", dest = "doc",
		help = "show the so called documentation"
	)
	parser.add_option("-v", "--verbose",
		action = "count", dest = "verbose", default = 0,
		help = "verbose output, use multiple times for spam"
	)
	parser.add_option("-q", "--quite",
		action = "count", dest = "quite", default = 0,
		help = "suppress output, use multiple times for silence"
	)
 
	parser.add_option("-d", "--dest",
		dest = "dest", default = ".",
		help = "specifies the destination directory [default: '%default']"
	)
	parser.add_option("-c", "--complex",
		type = "string", dest = "complex", default = False,
		help = "specifies the use of complex format strings, requires "
		     + "a comma separated list of state indices as argument "
			 + "e.g.: 0,0,1,2 (see: --doc)"
	)
 
	parser.add_option("-s", "--startvalue",
		type = "int", dest = "startValue", default = 1,
		help = "specifies where to start counting "
		     +  "[default: '%default']"
	)
	parser.add_option("-t", "--tries",
		type = "int", dest = "tries", default = 1,
		help = "specifies how many requests must fail before giving up "
		     +  "[default: '%default']"
	)
	parser.add_option("--dry-run",
		action = "store_true", dest = "dryrun",
		help = "won't download anything, but check if documents exist"
	)
	parser.add_option("-r", "--resume",
		dest = "resume",
		help = "specifies a start state, useful to resume downloads. "
		     + "arguments must be comma seperated, without spaces. "
		     + "e.g.: 1,2,3"
	)
 
 
	## argument parsing
	#
	(options, fs) = parser.parse_args()
 
	# show doc
	if options.doc:
		print __doc__
		sys.exit(0)
 
	# argument validation
	if len(fs) == 0:
		parser.error("no arguments given\n"
		           + "for help type: %s --help" % parser.get_prog_name())
	elif len(fs) != 1:
		parser.error("incorrect number of arguments given")
	else:
		fs = fs[0]
 
 
	## decoding encoded characters in the format-string, to prevent mixups
	#  with the real format-string placeholders
	#
	l = []
	sp = fs.split('%')
	l.append(sp[0])
	map(lambda x: l.append('%'+x), sp[1:])
 
	fsList = []
	# regex matching encoded chars, not format-string parts
	pattern = re.compile("%[A-Fa-f0-9]{2}[^i]")
 
	for i in l:
		if pattern.match(i):
			i = urllib.unquote(i)
		fsList.append(i)
 
	# put together the 'new' format-string
	fs = "".join(fsList)
 
 
	## misc configuration
	#
 
	# verbosity
	if options.quite:
		options.verbose = -options.quite
 
	# wget options
	options.wgetOptions = ["-c", "-P%s" % options.dest]
 
	#verbOpts = {-1:"-q", 0:"-nv", 1:"", 2:"--verbose"}
	verbOpts = {0:"-q", 1:"-nv", 2:"", 3:"--verbose"}
 
	if options.dryrun:
		options.wgetOptions.append("--spider")
 
	if options.verbose in verbOpts:
		options.wgetOptions.append(verbOpts[options.verbose])
	elif options.verbose < min(verbOpts):
		options.wgetOptions.append(verbOpts[min(verbOpts)])
	else:
		options.wgetOptions.append(verbOpts[max(verbOpts)])
 
 
	## internal variables
	#
 
	# options.num, determines the number of states/counters
	if options.complex:
		options.complexStateOrder = map(int, options.complex.split(","))
 
		unique = set()
		map(unique.add, options.complexStateOrder)
 
		options.num = len(unique)
		options.complexNum = len(options.complexStateOrder)
	else:
		options.num = fs.count('%')
 
	# abortCount
	abortCount = 0
 
	# state: init or resume
	if options.resume:
		# resume
		state = options.resume.split(',')
		state = map(int, state)
 
		if not options.complex and (len(state) != options.num) \
		  or options.complex and (len(state) == options.complexNum):
			parser.error("incorrect state given")
		else:
			dePrint("# restored state: %s" % state, level=0)
	else:
		# init
		state = [options.startValue] * options.num
 
 
def getNext():
	"""Advance the counters and return the URL for the new state.

	Returns False when incState() reports that no further state is left.
	"""
	if not incState():
		return False

	return genUrl()
 
 
def incState():
	"""Advance the global counter list ``state`` to the next combination.

	The integer part of ``abortCount`` decides which counter is bumped:
	that many trailing counters are reset to ``options.startValue`` and
	the counter directly left of them is incremented by one.

	Returns False, leaving ``state`` untouched, once ``abortCount`` has
	reached ``options.num``; returns True otherwise.
	"""
	if abortCount >= options.num:
		dePrint("\ndone", level=0)
		return False

	# abortCount may be fractional (it grows in steps of 1/tries)
	depth = int(abortCount)

	for offset in range(depth):
		# reported through dePrint so that -q/-v are honoured
		dePrint("# resetting counter %d" % (len(state) - 1 - offset), level=1)
		state[-offset - 1] = options.startValue

	state[-depth - 1] += 1

	return True
 
 
def genUrl():
	"""Build the URL that corresponds to the current counter state.

	In simple mode the counters are inserted into the format string in
	order; in complex mode (-c) they are first arranged by
	genComplexIndices().
	"""
	if not options.complex:
		return fs % tuple(state)

	return fs % genComplexIndices()
 
 
def genComplexIndices():
	"""Return the counter values arranged as requested with -c.

	For every index in ``options.complexStateOrder`` the matching entry of
	``state`` is picked; the result is a tuple ready for %-formatting.
	"""
	return tuple([state[idx] for idx in options.complexStateOrder])
 
 
def dePrint(str, level=1):
	"""Print a (debug) message if the verbosity allows it.

	str   - the message to print
	level - minimum value of ``options.verbose`` at which the message is
	        shown (default 1, i.e. only with -v)
	"""
	if options.verbose < level:
		return

	# single-argument form, valid both as a statement and as a function call
	print(str)
	# Flush right away: the wget child writes to the terminal on its own,
	# and with a redirected (block-buffered) stdout our lines would
	# otherwise show up long after the wget output they belong to.
	sys.stdout.flush()
 
 
# start the download loop only when run as a script, not when imported
if __name__ == "__main__":
	main()