diff --git a/bookmarkstokml.py b/bookmarkstokml.py
index cf603f1..f45db3d 100644
--- a/bookmarkstokml.py
+++ b/bookmarkstokml.py
@@ -11,9 +11,10 @@ After downloading the html file, run this script on it to generate a KML.
 
 from lxml.html import document_fromstring
 import simplekml
-from urllib2 import urlopen
+from urllib import FancyURLopener
 
 import os
+import random
 import re
 import sys
 import time
@@ -33,6 +34,18 @@ coords_in_url = re.compile('\?q=(-?\d{,3}\.\d*),\s*(-?\d{,3}\.\d*)')
 
 doc = document_fromstring(data)
 
+class Browser(FancyURLopener):
+    user_agents = [
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
+        'Opera/9.25 (Windows NT 5.1; U; en)',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
+        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
+        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
+        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
+    ]
+
+    version = random.choice(user_agents)
+
 for label in doc.body.iterfind('dl/dl/h3'):
     labelName = label.text_content()
     #print labelName
@@ -46,23 +59,28 @@ for label in doc.body.iterfind('dl/dl/h3'):
             print description.encode('UTF8')
             print "URL: {0}".format(url)
 
+            browser = Browser()
+
             if coords_in_url.search(url):
                 # Coordinates are in URL itself
                 latitude = coords_in_url.search(url).groups()[0]
                 longitude = coords_in_url.search(url).groups()[1]
             else:
                 # Load map and find coordinates in source of page
-                try:
-                    sock = urlopen(url.replace(' ','+'))
-                except Exception, e:
-                    print 'Connection problem:'
-                    print repr(e)
-                    print 'Waiting 2 minutes and trying again'
-                    time.sleep(120)
-                    sock = urlopen(url.replace(' ','+'))
+                sock = False
+
+                while not sock:
+                    try:
+                        sock = browser.open(url.replace(' ','+'))
+                    except Exception, e:
+                        print 'Connection problem:'
+                        print repr(e)
+                        print 'Retrying randomly between 15 and 60 seconds.'
+                        time.sleep(random.randint(15, 60))
+
                 content = sock.read()
                 sock.close()
-                time.sleep(3) # Don't annoy server
+                time.sleep(random.randint(15, 60)) # Don't annoy server
                 try:
                     latitude = lat_re.findall(content)[0]
                     longitude = lon_re.findall(content)[0]