From d167158a6e37c9c3664daf6d59e64e3f7603b401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arne=20Schl=C3=BCter?= Date: Sun, 29 Jan 2017 14:58:28 +0100 Subject: [PATCH] Escape urls to fix unicode bug --- bookmarkstokml.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/bookmarkstokml.py b/bookmarkstokml.py index a20066b..272111f 100644 --- a/bookmarkstokml.py +++ b/bookmarkstokml.py @@ -8,11 +8,13 @@ from lxml.html import document_fromstring import simplekml from urllib.request import FancyURLopener +from urllib.parse import quote_plus, urlparse, parse_qsl import os import re import sys import time +import random coords_in_content = re.compile('\/@(-?\d+\.\d+),(-?\d+\.\d+),') user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36' @@ -27,6 +29,7 @@ with open(filename) as bookmarks_file: doc = document_fromstring(data) + class Browser(FancyURLopener): version = user_agent @@ -36,10 +39,13 @@ for label in doc.body.iterfind('dl/dt/h3'): kml = simplekml.Kml() kml.document.name = labelName - for element, attribute, url, pos in label.getparent().getnext().iterlinks(): + for element, _, url, _ in label.getparent().getnext().iterlinks(): if 'maps.google' in url: - print description = element.text or '' + safe_query = ['{0}={1}'.format(k, quote_plus(v)) + for (k, v) in parse_qsl(urlparse(url).query)] + url = '{0}/?{1}'.format(url.split('/?')[0], '&'.join(safe_query)) + print('GET {0} {1}'.format(url, description.encode('UTF8'))) browser = Browser() @@ -47,7 +53,7 @@ for label in doc.body.iterfind('dl/dt/h3'): sock = False while not sock: try: - sock = browser.open(url.replace(' ','+')) + sock = browser.open(url) except Exception as e: print('Connection problem:' + repr(e)) print('Retrying randomly between 15 and 60 seconds.') @@ -62,7 +68,8 @@ for label in doc.body.iterfind('dl/dt/h3'): longitude = coords[1] except (AttributeError, IndexError): - print('[Coordinates not found: ' + str(coords) + '. Try to update "user_agent"]') + print('[Coordinates not found: ' + str(coords) + + '. Try to update "user_agent"]') continue print(latitude, longitude)