' % div_class)
if i < 0:
raise_error('Unable to find %s link' % div_class, 'HTML changed')
# look for the href inside the
#!/usr/bin/env python
######################################################################
#
# getfotolog.py 1.9
#
# Python script to crawl your fotolog pages.  Tested with Python 2.3
# and 2.4.
#
# author: will luo, photo (a t) wluo dot org
#
# This software is released under the MFJ Software License:
#
# http://www.motherfuckingjackson.com/license.html
#
######################################################################

try:
    import httplib, urlparse
except ImportError:
    # Newer interpreters ship the same modules under different names.
    import http.client as httplib
    import urllib.parse as urlparse
import os, sys, time

# define some constants
HOST = 'www.fotolog.com'   # where to get the pages from
IMGHOST = None             # image host
GBHOST = None              # guestbook host

# globals
gconn = None       # page HTTPConnection
ggbconn = None     # guest book HTTPConnection
gimgconn = None    # image HTTPConnection

# their cookie
# typically we get something like this:
#
# Set-Cookie: FCED=iF0NJuzEtixYiDg0lyXVX2IyV%2Bjt8d3xRMDJ1JJl1woDIh14JbysdJ%2BNNXHF8ZY2Y0XmVaYMiS8Kkq%2F%2Ba2bjpyqg5SDcAMA%2Fxi0gLrMyHofL; expires=Wednesday, 15-Jan-20 05:00:00 GMT; path=/; domain=.fotolog.net
#
fced = None

STYLE_BLOCK = None   # style from the main page


class FatalError(Exception):
    ''' error raised by raise_error when it is handed a plain string '''
    pass


def _emit(msg, stream=None):
    ''' write msg followed by a newline to stream (default: sys.stdout) '''
    if stream is None:
        stream = sys.stdout
    stream.write('%s\n' % (msg,))


# print an error message and raise an error and exit
def raise_error(msg, ex):
    '''
    print msg on stderr, then raise ex.

    ex may be an exception instance/class, or a plain string; a string is
    wrapped in FatalError because string exceptions are not portable.
    '''
    _emit(msg, sys.stderr)
    if isinstance(ex, str):
        ex = FatalError(ex)
    raise ex


def re_replace(pat, data, replacement=None):
    '''
    repeatedly replace the first match of the compiled pattern pat in data
    until pat no longer matches, and return the result.

    each match is replaced by replacement, or simply removed when
    replacement is None.  the search restarts from the beginning after
    every substitution, so matches created by an earlier substitution are
    processed too.
    '''
    while True:
        m = pat.search(data)
        if m is None:
            break
        if replacement is not None:
            updated = data[:m.start()] + replacement + data[m.end():]
        else:
            updated = data[:m.start()] + data[m.end():]
        if updated == data:
            # no progress (e.g. a zero-width match): stop instead of
            # spinning forever on the same match
            break
        data = updated
    return data


def get_style_block(html):
    '''
    return the first style element of html followed by a newline, or ''
    when html has no complete style element.
    '''
    # NOTE(review): the two search strings were missing from this copy of
    # the file.  '<style' and '</style>' are reconstructed from the function
    # name and the +8 offset (the length of '</style>') -- confirm against
    # the original script.
    s = html.find('<style')
    if s < 0:
        return ''
    e = html.find('</style>', s + 1)
    if e < 0:
        return ''
    return html[s:e+8] + '\n'


def reset_HTTPConnection(conn):
    ''' reopen HTTPConnection '''
    conn.close()
    conn.connect()


def get_response(conn, path):
    '''
    patiently wait for our response from the HTTPConnection conn.
    path is the URL minus the protocol and host name

    sends the stored FCED cookie (if any) with the request and remembers a
    new one when the response sets it.  returns the response object.
    retries after ResponseNotReady / CannotSendRequest and after errors
    whose first argument is 10060 (timed out); any other error triggers one
    reconnect and is re-raised if it happens again.
    '''
    global fced
    conn_reset = False # did we have to reset our HTTPConnection?
    while True:
        try:
            headers = {"USER-AGENT" : "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.10) Gecko/20050716 Firefox/1.0.6"}
            if fced is not None:
                headers['Cookie'] = fced
            _emit('fetching http://%s/%s' % (conn.host, path))
            conn.request('GET', path, '', headers)
            r = conn.getresponse()
            cookie = r.getheader('set-cookie', None)
            if cookie is not None:
                idx = cookie.find('FCED')
                # only touch the stored cookie when this response really
                # carries an FCED value; otherwise keep the one we have
                if idx >= 0:
                    idx1 = cookie.find(';', idx + 1)
                    if idx1 < 0:
                        idx1 = len(cookie)   # FCED is the last attribute
                    fced = cookie[idx:idx1]
            return r
        except httplib.ResponseNotReady:
            _emit('httplib.ResponseNotReady.  retrying...')
            reset_HTTPConnection(conn)
            conn_reset = True
            continue
        except httplib.CannotSendRequest:
            _emit('httplib.CannotSendRequest.  retrying...')
            reset_HTTPConnection(conn)
            conn_reset = True
            continue
        except Exception:
            ex = sys.exc_info()[1]
            _emit('ex:' + str(ex))   # keep going
            _emit('ex.args:' + str(ex.args) + ', len=' + str(len(ex.args)))
            if (len(ex.args) > 0) and (ex.args[0] == 10060):
                continue   # timed out
            elif not conn_reset:
                _emit('establishing a new connection')
                reset_HTTPConnection(conn)
                conn_reset = True
                continue
            else:
                raise   # re-raise with the original traceback


#
# save data in file fname
#
def save_content(fname, data):
    ''' write data to the file fname in binary mode, replacing its contents '''
    f = open(fname, 'wb')
    try:
        f.write(data)
    finally:
        # close the handle even when the write fails
        f.close()


#
# retrieve the content at imgurl and save it in the local file fname.
#
def save_image(fname, imgurl):
    '''
    download imgurl and store the bytes in fname.

    the image connection (and IMGHOST) is created from the first imgurl
    seen and reused afterwards.  when the response announces a
    content-length and fewer bytes arrive, the download is retried after a
    3 second pause, at most max_attempts times; whatever the last attempt
    returned is then saved.
    '''
    global IMGHOST, gimgconn
    if IMGHOST is None:
        u = urlparse.urlparse(imgurl)
        IMGHOST = u[1]
    if gimgconn is None:
        gimgconn = httplib.HTTPConnection(IMGHOST)

    max_attempts = 5
    attempt = 0
    while True:   # loop until we really get it
        attempt += 1
        r = get_response(gimgconn, imgurl)
        clenhdr = r.getheader('content-length')
        clen = 0
        if clenhdr is not None:
            _emit('...trying to read ' + clenhdr + ' bytes of image')
            try:
                clen = int(clenhdr)
            except ValueError:
                clen = 0   # unusable header: accept whatever arrives
        data = r.read()
        if len(data) >= clen:
            break
        if attempt >= max_attempts:
            _emit('...giving up after %d attempts. saving what we got.' % attempt)
            break
        _emit('...did not get the whole content.  trying again.')
        time.sleep(3.0)
    save_content(fname, data)


# NOTE(review): the definition of fetch_image starts here, but the rest of
# it is truncated/garbled in this copy of the file.  The header is kept as a
# comment until the original text can be restored:
#
# def fetch_image(data, pid):
#     ''' parse out the main image src url and fetch it.
replace its reference in the page data with the local version ''' i = data.find('
' % div_class)
if i < 0:
raise_error('Unable to find %s link' % div_class, 'HTML changed')
# look for the href inside the