ADDED: Mock other browser useragents and randomize behaviour to hide from Google's antispam protection.

This commit is contained in:
Khalil Fazal 2013-06-05 07:11:01 -04:00
commit dca9c77233

View file

@ -11,9 +11,10 @@ After downloading the html file, run this script on it to generate a KML.
from lxml.html import document_fromstring from lxml.html import document_fromstring
import simplekml import simplekml
from urllib2 import urlopen from urllib import FancyURLopener
import os import os
import random
import re import re
import sys import sys
import time import time
@ -33,6 +34,18 @@ coords_in_url = re.compile('\?q=(-?\d{,3}\.\d*),\s*(-?\d{,3}\.\d*)')
doc = document_fromstring(data) doc = document_fromstring(data)
class Browser(FancyURLopener):
    """URL opener that masquerades as an ordinary desktop browser.

    FancyURLopener sends its ``version`` attribute as the User-Agent
    header (the parent ``__init__`` copies it into ``addheaders``).
    Picking the agent only once at class-creation time would make every
    request in the run share a single User-Agent; to actually randomize
    behaviour, a fresh agent is also chosen per instance in ``__init__``.
    """

    # Plausible real-world User-Agent strings to rotate through.
    user_agents = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9'
    ]

    # Class-level default, kept for backward compatibility with code
    # that reads Browser.version directly.
    version = random.choice(user_agents)

    def __init__(self, *args, **kwargs):
        # Choose a fresh agent per instance so repeated requests vary.
        # Must be set BEFORE the parent __init__, which snapshots
        # self.version into the default request headers.
        self.version = random.choice(self.user_agents)
        FancyURLopener.__init__(self, *args, **kwargs)
for label in doc.body.iterfind('dl/dl/h3'): for label in doc.body.iterfind('dl/dl/h3'):
labelName = label.text_content() labelName = label.text_content()
#print labelName #print labelName
@ -46,23 +59,28 @@ for label in doc.body.iterfind('dl/dl/h3'):
print description.encode('UTF8') print description.encode('UTF8')
print "URL: {0}".format(url) print "URL: {0}".format(url)
browser = Browser()
if coords_in_url.search(url): if coords_in_url.search(url):
# Coordinates are in URL itself # Coordinates are in URL itself
latitude = coords_in_url.search(url).groups()[0] latitude = coords_in_url.search(url).groups()[0]
longitude = coords_in_url.search(url).groups()[1] longitude = coords_in_url.search(url).groups()[1]
else: else:
# Load map and find coordinates in source of page # Load map and find coordinates in source of page
try: sock = False
sock = urlopen(url.replace(' ','+'))
except Exception, e: while not sock:
print 'Connection problem:' try:
print repr(e) sock = browser.open(url.replace(' ','+'))
print 'Waiting 2 minutes and trying again' except Exception, e:
time.sleep(120) print 'Connection problem:'
sock = urlopen(url.replace(' ','+')) print repr(e)
print 'Retrying randomly between 15 and 60 seconds.'
time.sleep(random.randint(15, 60))
content = sock.read() content = sock.read()
sock.close() sock.close()
time.sleep(3) # Don't annoy server time.sleep(random.randint(15, 60)) # Don't annoy server
try: try:
latitude = lat_re.findall(content)[0] latitude = lat_re.findall(content)[0]
longitude = lon_re.findall(content)[0] longitude = lon_re.findall(content)[0]