mirror of
https://github.com/heyarne/bookmarks-to-kml.git
synced 2026-05-06 18:43:40 +02:00
ADDED: Mock other browsers' user agents and randomize behaviour to hide from Google's anti-spam protection.
This commit is contained in:
parent
59def025e4
commit
dca9c77233
1 changed files with 28 additions and 10 deletions
|
|
@ -11,9 +11,10 @@ After downloading the html file, run this script on it to generate a KML.
|
||||||
from lxml.html import document_fromstring
|
from lxml.html import document_fromstring
|
||||||
import simplekml
|
import simplekml
|
||||||
|
|
||||||
from urllib2 import urlopen
|
from urllib import FancyURLopener
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
|
|
@ -33,6 +34,18 @@ coords_in_url = re.compile('\?q=(-?\d{,3}\.\d*),\s*(-?\d{,3}\.\d*)')
|
||||||
|
|
||||||
doc = document_fromstring(data)
|
doc = document_fromstring(data)
|
||||||
|
|
||||||
|
class Browser(FancyURLopener):
    """URL opener that identifies itself as a randomly chosen browser.

    The base opener sends ``self.version`` as the ``User-Agent`` request
    header. Each instance draws its own entry from ``user_agents``, so
    successive ``Browser()`` objects do not all present the same identity.
    """

    # Pool of User-Agent strings an instance may present.
    user_agents = [
        'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
        'Opera/9.25 (Windows NT 5.1; U; en)',
        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
        'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
        'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
        'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    ]

    # Class-level default, evaluated once at class creation. Kept so that
    # ``Browser.version`` stays a valid user agent for code that reads it
    # without creating an instance.
    version = random.choice(user_agents)

    def __init__(self, *args, **kwargs):
        """Pick a fresh random user agent, then initialise the opener.

        All arguments are forwarded unchanged to ``FancyURLopener``.
        """
        # Must be assigned before the base initializer runs: it copies
        # ``self.version`` into the User-Agent entry of ``self.addheaders``.
        self.version = random.choice(self.user_agents)
        # Explicit base call instead of super(): the base is an old-style
        # class under Python 2, where super() does not work.
        FancyURLopener.__init__(self, *args, **kwargs)
|
||||||
|
|
||||||
for label in doc.body.iterfind('dl/dl/h3'):
|
for label in doc.body.iterfind('dl/dl/h3'):
|
||||||
labelName = label.text_content()
|
labelName = label.text_content()
|
||||||
#print labelName
|
#print labelName
|
||||||
|
|
@ -46,23 +59,28 @@ for label in doc.body.iterfind('dl/dl/h3'):
|
||||||
print description.encode('UTF8')
|
print description.encode('UTF8')
|
||||||
print "URL: {0}".format(url)
|
print "URL: {0}".format(url)
|
||||||
|
|
||||||
|
browser = Browser()
|
||||||
|
|
||||||
if coords_in_url.search(url):
|
if coords_in_url.search(url):
|
||||||
# Coordinates are in URL itself
|
# Coordinates are in URL itself
|
||||||
latitude = coords_in_url.search(url).groups()[0]
|
latitude = coords_in_url.search(url).groups()[0]
|
||||||
longitude = coords_in_url.search(url).groups()[1]
|
longitude = coords_in_url.search(url).groups()[1]
|
||||||
else:
|
else:
|
||||||
# Load map and find coordinates in source of page
|
# Load map and find coordinates in source of page
|
||||||
|
sock = False
|
||||||
|
|
||||||
|
while not sock:
|
||||||
try:
|
try:
|
||||||
sock = urlopen(url.replace(' ','+'))
|
sock = browser.open(url.replace(' ','+'))
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
print 'Connection problem:'
|
print 'Connection problem:'
|
||||||
print repr(e)
|
print repr(e)
|
||||||
print 'Waiting 2 minutes and trying again'
|
print 'Retrying randomly between 15 and 60 seconds.'
|
||||||
time.sleep(120)
|
time.sleep(random.randint(15, 60))
|
||||||
sock = urlopen(url.replace(' ','+'))
|
|
||||||
content = sock.read()
|
content = sock.read()
|
||||||
sock.close()
|
sock.close()
|
||||||
time.sleep(3) # Don't annoy server
|
time.sleep(random.randint(15, 60)) # Don't annoy server
|
||||||
try:
|
try:
|
||||||
latitude = lat_re.findall(content)[0]
|
latitude = lat_re.findall(content)[0]
|
||||||
longitude = lon_re.findall(content)[0]
|
longitude = lon_re.findall(content)[0]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue