ADDED: Mock other browser useragents and randomize behaviour to hide from Google's antispam protection.

This commit is contained in:
Khalil Fazal 2013-06-05 07:11:01 -04:00
commit dca9c77233

View file

@ -11,9 +11,10 @@ After downloading the html file, run this script on it to generate a KML.
from lxml.html import document_fromstring from lxml.html import document_fromstring
import simplekml import simplekml
from urllib2 import urlopen from urllib import FancyURLopener
import os import os
import random
import re import re
import sys import sys
import time import time
@ -33,6 +34,18 @@ coords_in_url = re.compile('\?q=(-?\d{,3}\.\d*),\s*(-?\d{,3}\.\d*)')
doc = document_fromstring(data) doc = document_fromstring(data)
class Browser(FancyURLopener):
    """URL opener that masquerades as an ordinary desktop browser.

    FancyURLopener sends its ``version`` attribute as the User-Agent
    header (the parent ``__init__`` copies it into ``addheaders``).
    Picking the agent only once at class-creation time would make every
    request in the run share a single User-Agent; to actually randomize
    behaviour, a fresh agent is also chosen per instance in ``__init__``.
    """

    # Plausible real-world User-Agent strings to rotate through.
    user_agents = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
    ]

    # Class-level default, kept for backward compatibility with code
    # that reads Browser.version directly.
    version = random.choice(user_agents)

    def __init__(self, *args, **kwargs):
        # Choose a fresh agent per instance so repeated requests vary.
        # Must be set BEFORE the parent __init__, which snapshots
        # self.version into the default request headers.
        self.version = random.choice(self.user_agents)
        FancyURLopener.__init__(self, *args, **kwargs)
for label in doc.body.iterfind('dl/dl/h3'): for label in doc.body.iterfind('dl/dl/h3'):
labelName = label.text_content() labelName = label.text_content()
#print labelName #print labelName
@ -46,23 +59,28 @@ for label in doc.body.iterfind('dl/dl/h3'):
print description.encode('UTF8') print description.encode('UTF8')
print "URL: {0}".format(url) print "URL: {0}".format(url)
browser = Browser()
if coords_in_url.search(url): if coords_in_url.search(url):
# Coordinates are in URL itself # Coordinates are in URL itself
latitude = coords_in_url.search(url).groups()[0] latitude = coords_in_url.search(url).groups()[0]
longitude = coords_in_url.search(url).groups()[1] longitude = coords_in_url.search(url).groups()[1]
else: else:
# Load map and find coordinates in source of page # Load map and find coordinates in source of page
try: sock = False
sock = urlopen(url.replace(' ','+'))
except Exception, e: while not sock:
print 'Connection problem:' try:
print repr(e) sock = browser.open(url.replace(' ','+'))
print 'Waiting 2 minutes and trying again' except Exception, e:
time.sleep(120) print 'Connection problem:'
sock = urlopen(url.replace(' ','+')) print repr(e)
print 'Retrying randomly between 15 and 60 seconds.'
time.sleep(random.randint(15, 60))
content = sock.read() content = sock.read()
sock.close() sock.close()
time.sleep(3) # Don't annoy server time.sleep(random.randint(15, 60)) # Don't annoy server
try: try:
latitude = lat_re.findall(content)[0] latitude = lat_re.findall(content)[0]
longitude = lon_re.findall(content)[0] longitude = lon_re.findall(content)[0]