ADDED: Mock other browser useragents and randomize behaviour to hide from Google's antispam protection.

This commit is contained in:
Khalil Fazal 2013-06-05 07:11:01 -04:00
commit dca9c77233

View file

@@ -11,9 +11,10 @@ After downloading the html file, run this script on it to generate a KML.
from lxml.html import document_fromstring from lxml.html import document_fromstring
import simplekml import simplekml
from urllib2 import urlopen from urllib import FancyURLopener
import os import os
import random
import re import re
import sys import sys
import time import time
@@ -33,6 +34,18 @@ coords_in_url = re.compile('\?q=(-?\d{,3}\.\d*),\s*(-?\d{,3}\.\d*)')
doc = document_fromstring(data) doc = document_fromstring(data)
class Browser(FancyURLopener):
    """URL opener that masquerades as a common desktop browser.

    FancyURLopener sends ``self.version`` as the ``User-Agent`` header,
    so presenting a real-browser string here makes the scraper's
    requests look like ordinary traffic to Google's anti-spam checks.
    """

    # Pool of plausible User-Agent strings (assorted browsers/platforms).
    user_agents = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
    ]

    # Backward-compatible class-level default: one agent picked at import
    # time, as the original code did.
    version = random.choice(user_agents)

    def __init__(self, *args, **kwargs):
        # Bug fix: the class-level random.choice() runs ONCE per process,
        # so every Browser() instance previously sent the identical
        # User-Agent — no randomization at all across requests.  Re-roll
        # per instance so each Browser() created in the scraping loop can
        # present a different identity.
        self.version = random.choice(self.user_agents)
        # URLopener is an old-style class in Python 2, so invoke the base
        # initializer explicitly rather than via super().
        FancyURLopener.__init__(self, *args, **kwargs)
for label in doc.body.iterfind('dl/dl/h3'): for label in doc.body.iterfind('dl/dl/h3'):
labelName = label.text_content() labelName = label.text_content()
#print labelName #print labelName
@@ -46,23 +59,28 @@ for label in doc.body.iterfind('dl/dl/h3'):
print description.encode('UTF8') print description.encode('UTF8')
print "URL: {0}".format(url) print "URL: {0}".format(url)
browser = Browser()
if coords_in_url.search(url): if coords_in_url.search(url):
# Coordinates are in URL itself # Coordinates are in URL itself
latitude = coords_in_url.search(url).groups()[0] latitude = coords_in_url.search(url).groups()[0]
longitude = coords_in_url.search(url).groups()[1] longitude = coords_in_url.search(url).groups()[1]
else: else:
# Load map and find coordinates in source of page # Load map and find coordinates in source of page
sock = False
while not sock:
try: try:
sock = urlopen(url.replace(' ','+')) sock = browser.open(url.replace(' ','+'))
except Exception, e: except Exception, e:
print 'Connection problem:' print 'Connection problem:'
print repr(e) print repr(e)
print 'Waiting 2 minutes and trying again' print 'Retrying randomly between 15 and 60 seconds.'
time.sleep(120) time.sleep(random.randint(15, 60))
sock = urlopen(url.replace(' ','+'))
content = sock.read() content = sock.read()
sock.close() sock.close()
time.sleep(3) # Don't annoy server time.sleep(random.randint(15, 60)) # Don't annoy server
try: try:
latitude = lat_re.findall(content)[0] latitude = lat_re.findall(content)[0]
longitude = lon_re.findall(content)[0] longitude = lon_re.findall(content)[0]