From 66b68b269d08f05344fcbfbd43fb1ccd390cd18c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arne=20Schl=C3=BCter?= Date: Mon, 8 Dec 2014 14:40:05 +0100 Subject: [PATCH] Normalize \r\n to \n and use date objects instead of the original date string --- scraper/scraper.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scraper/scraper.py b/scraper/scraper.py index ae49d40..57d0cc4 100644 --- a/scraper/scraper.py +++ b/scraper/scraper.py @@ -1,5 +1,6 @@ from bs4 import BeautifulSoup from urllib import parse, request +from datetime import date class Scraper(): @@ -24,7 +25,7 @@ class Scraper(): # headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info) headline = table.select('tr:first-child')[0].get_text() - date = headline[:headline.find(' ')] + year, month, day = headline[:headline.find(' ')].strip().split('-') places = headline[headline.find(' ') + 1:] if places.find(' ') == -1: @@ -32,15 +33,15 @@ class Scraper(): additional = None else: district = places[:places.find(' ')] - additional = places[places.find(' ') + 1:] + additional = places[places.find(' ') + 1:].strip() text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text() article = { - 'date': date.strip(), - 'place': district, + 'date': date(int(year), int(month), int(day)), + 'place': district.strip(), 'additional': additional, - 'text': text.strip() + 'text': text.strip().replace('\r\n', '\n') } articles.append(article)