frmtget.py
frmtget.py
#!/usr/bin/env python2.4 # :set ts=4 """ Author: demod at foosel dot net Licence: gpl Version: 0.3 Requirements - python >= 2.4 - wget The Idea frmtgets purpose is to fetch files over http which are in a tiered structure without resorting to brute force or simple traversal of directory structures. - Example A set of articles, spread over multiple directories, like: http://foo/author[0-5]/article[0-10]/chapter[0-20]-subchapter[0-5].pdf Quick Start - simple patterns * convert the url of the files you want into a format-string url = "http://foo/article001/chapter01-subchapter1.pdf" => fs = "http://foo/article%03i/chapter%02i-subchapter%i.pdf" * run frmtget % frmtget.py $fs - complex patterns * complex patterns contain single counters (e.g. a chapter number) more than once. so you've got to specify how they are to be used to generate the URLs with -c. -c specifies the indices of the internal counter array (see below). * Examples - with reoccurring chapter numbers * url = "http://foo/chapter01/chapter01-subchapter1.pdf" => fs = "http://foo/chapter%02i/chapter%02i-subchapter%i.pdf" * Note: the chapter id got the index 0 in the internal counter array because it is the first to occur, the sub-chapter id got the index 1 => -c 0,0,1 * run frmtget frmtget.py -c 0,0,1 $fs - with other numbers * url = "http://foo/year00/author000/year00/author000-num0.pdf" => fs = "http://foo/year%02i/author%03i/year%02i-author%03i-num%i.pdf" * run frmtget frmtget.py -c 0,1,0,1,2 $fs """ import os import sys import re import urllib from optparse import OptionParser options = fs = None def main(): global abortCount setup() try: url = genUrl() while(url): # wget dePrint("# url: %s" % url, level=0) opts = list(options.wgetOptions) opts.append(url) ret = os.spawnvp(os.P_WAIT, "wget", opts) if ret == 0: abortCount = 0 else: abortCount += 1.0 / options.tries dePrint("# fetch error: %s" % url, level=0) url = getNext() except KeyboardInterrupt: dePrint("# KeyboardInterrupt caught", level=0) def setup(): global options, fs, state, abortCount ## check the interpreter version # minVer = [2, 4] ver = sys.version_info[0:2] if not (ver[0] > minVer[0] or (ver[0] == minVer[0] and ver[1] >= minVer[1])): print >>sys.stderr, "this program requires python >=%s, aborting." % ",".join(map(str, minVer)) sys.exit(-1) ## argument parsing setup # parser = OptionParser(version = "%prog 0.1") parser.set_usage("%prog [options] formatstring") parser.add_option("--doc", action = "store_true", dest = "doc", help = "show the so called documentation" ) parser.add_option("-v", "--verbose", action = "count", dest = "verbose", default = 0, help = "verbose output, use multiple times for spam" ) parser.add_option("-q", "--quite", action = "count", dest = "quite", default = 0, help = "suppress output, use multiple times for silence" ) parser.add_option("-d", "--dest", dest = "dest", default = ".", help = "specifies the destination directory [default: '%default']" ) parser.add_option("-c", "--complex", type = "string", dest = "complex", default = False, help = "specifies the use of complex format strings, requires " + "a comma separated list of state indices as argument " + "e.g.: 0,0,1,2 (see: --doc)" ) parser.add_option("-s", "--startvalue", type = "int", dest = "startValue", default = 1, help = "specifies where to start counting " + "[default: '%default']" ) parser.add_option("-t", "--tries", type = "int", dest = "tries", default = 1, help = "specifies how many requests must fail before giving up " + "[default: '%default']" ) parser.add_option("--dry-run", action = "store_true", dest = "dryrun", help = "won't download anything, but check if documents exist" ) parser.add_option("-r", "--resume", dest = "resume", help = "specifies a start state, useful to resume downloads. " + "arguments must be comma seperated, without spaces. " + "e.g.: 1,2,3" ) ## argument parsing # (options, fs) = parser.parse_args() # show doc if options.doc: print __doc__ sys.exit(0) # argument validation if len(fs) == 0: parser.error("no arguments given\n" + "for help type: %s --help" % parser.get_prog_name()) elif len(fs) != 1: parser.error("incorrect number of arguments given") else: fs = fs[0] ## decoding encoded characters in the format-string, to prevent mixups # with the real format-string placeholders # l = [] sp = fs.split('%') l.append(sp[0]) map(lambda x: l.append('%'+x), sp[1:]) fsList = [] # regex matching encoded chars, not format-string parts pattern = re.compile("%[A-Fa-f0-9]{2}[^i]") for i in l: if pattern.match(i): i = urllib.unquote(i) fsList.append(i) # put together the 'new' format-string fs = "".join(fsList) ## misc configuration # # verbosity if options.quite: options.verbose = -options.quite # wget options options.wgetOptions = ["-c", "-P%s" % options.dest] #verbOpts = {-1:"-q", 0:"-nv", 1:"", 2:"--verbose"} verbOpts = {0:"-q", 1:"-nv", 2:"", 3:"--verbose"} if options.dryrun: options.wgetOptions.append("--spider") if options.verbose in verbOpts: options.wgetOptions.append(verbOpts[options.verbose]) elif options.verbose < min(verbOpts): options.wgetOptions.append(verbOpts[min(verbOpts)]) else: options.wgetOptions.append(verbOpts[max(verbOpts)]) ## internal variables # # options.num, determines the number of states/counters if options.complex: options.complexStateOrder = map(int, options.complex.split(",")) unique = set() map(unique.add, options.complexStateOrder) options.num = len(unique) options.complexNum = len(options.complexStateOrder) else: options.num = fs.count('%') # abortCount abortCount = 0 # state: init or resume if options.resume: # resume state = options.resume.split(',') state = map(int, state) if not options.complex and (len(state) != options.num) \ or options.complex and (len(state) == options.complexNum): parser.error("incorrect state given") else: dePrint("# restored state: %s" % state, level=0) else: # init state = [options.startValue] * options.num def getNext(): if incState(): return genUrl() else: return False def incState(): """increments the state list entries the abortCount determines which entry we got to increment and which to reset """ global state if abortCount >= options.num: dePrint("\ndone", level=0) return False #for i in range(abortCount): for i in range(int(abortCount)): print "RESETING" state[-i -1] = options.startValue state[-int(abortCount) -1] += 1 return True def genUrl(): """generates the next URL to fetch""" ## building the string # if options.complex: url = fs % genComplexIndices() else: url = fs % tuple(state) return url def genComplexIndices(): ret = [] for i in options.complexStateOrder: ret.append(state[i]) return tuple(ret) def dePrint(str, level=1): """prints (debug) messages""" if (options.verbose - level) >= 0: print str if __name__ == "__main__": main()
qh/python.txt · Last modified: 2008/05/22 12:32 (external edit)

