diff --git a/scraper/scraper.py b/scraper/scraper.py index ae49d40..57d0cc4 100644 --- a/scraper/scraper.py +++ b/scraper/scraper.py @@ -1,5 +1,6 @@ from bs4 import BeautifulSoup from urllib import parse, request +from datetime import date class Scraper(): @@ -24,7 +25,7 @@ class Scraper(): # headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info) headline = table.select('tr:first-child')[0].get_text() - date = headline[:headline.find(' ')] + year, month, day = headline[:headline.find(' ')].strip().split('-') places = headline[headline.find(' ') + 1:] if places.find(' ') == -1: @@ -32,15 +33,15 @@ class Scraper(): additional = None else: district = places[:places.find(' ')] - additional = places[places.find(' ') + 1:] + additional = places[places.find(' ') + 1:].strip() text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text() article = { - 'date': date.strip(), - 'place': district, + 'date': date(int(year), int(month), int(day)), + 'place': district.strip(), 'additional': additional, - 'text': text.strip() + 'text': text.strip().replace('\r\n', '\n') } articles.append(article)