Escape urls to fix unicode bug

This commit is contained in:
Arne Schlüter 2017-01-29 14:58:28 +01:00
commit d167158a6e

View file

@ -8,11 +8,13 @@ from lxml.html import document_fromstring
import simplekml import simplekml
from urllib.request import FancyURLopener from urllib.request import FancyURLopener
from urllib.parse import quote_plus, urlparse, parse_qsl
import os import os
import re import re
import sys import sys
import time import time
import random
coords_in_content = re.compile('\/@(-?\d+\.\d+),(-?\d+\.\d+),') coords_in_content = re.compile('\/@(-?\d+\.\d+),(-?\d+\.\d+),')
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36' user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36'
@ -27,6 +29,7 @@ with open(filename) as bookmarks_file:
doc = document_fromstring(data) doc = document_fromstring(data)
class Browser(FancyURLopener): class Browser(FancyURLopener):
version = user_agent version = user_agent
@ -36,10 +39,13 @@ for label in doc.body.iterfind('dl/dt/h3'):
kml = simplekml.Kml() kml = simplekml.Kml()
kml.document.name = labelName kml.document.name = labelName
for element, attribute, url, pos in label.getparent().getnext().iterlinks(): for element, _, url, _ in label.getparent().getnext().iterlinks():
if 'maps.google' in url: if 'maps.google' in url:
print
description = element.text or '' description = element.text or ''
safe_query = ['{0}={1}'.format(k, quote_plus(v))
for (k, v) in parse_qsl(urlparse(url).query)]
url = '{0}/?{1}'.format(url.split('/?')[0], '&'.join(safe_query))
print('GET {0} {1}'.format(url, description.encode('UTF8'))) print('GET {0} {1}'.format(url, description.encode('UTF8')))
browser = Browser() browser = Browser()
@ -47,7 +53,7 @@ for label in doc.body.iterfind('dl/dt/h3'):
sock = False sock = False
while not sock: while not sock:
try: try:
sock = browser.open(url.replace(' ','+')) sock = browser.open(url)
except Exception as e: except Exception as e:
print('Connection problem:' + repr(e)) print('Connection problem:' + repr(e))
print('Retrying randomly between 15 and 60 seconds.') print('Retrying randomly between 15 and 60 seconds.')
@ -62,7 +68,8 @@ for label in doc.body.iterfind('dl/dt/h3'):
longitude = coords[1] longitude = coords[1]
except (AttributeError, IndexError): except (AttributeError, IndexError):
print('[Coordinates not found: ' + str(coords) + '. Try to update "user_agent"]') print('[Coordinates not found: ' + str(coords) +
'. Try to update "user_agent"]')
continue continue
print(latitude, longitude) print(latitude, longitude)