Normalize \r\n line endings to \n in the article text and store dates as datetime.date objects instead of the original date string

This commit is contained in:
Arne Schlüter 2014-12-08 14:40:05 +01:00
commit 66b68b269d

View file

@ -1,5 +1,6 @@
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from urllib import parse, request from urllib import parse, request
from datetime import date
class Scraper(): class Scraper():
@ -24,7 +25,7 @@ class Scraper():
# headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info) # headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info)
headline = table.select('tr:first-child')[0].get_text() headline = table.select('tr:first-child')[0].get_text()
date = headline[:headline.find(' ')] year, month, day = headline[:headline.find(' ')].strip().split('-')
places = headline[headline.find(' ') + 1:] places = headline[headline.find(' ') + 1:]
if places.find(' ') == -1: if places.find(' ') == -1:
@ -32,15 +33,15 @@ class Scraper():
additional = None additional = None
else: else:
district = places[:places.find(' ')] district = places[:places.find(' ')]
additional = places[places.find(' ') + 1:] additional = places[places.find(' ') + 1:].strip()
text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text() text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text()
article = { article = {
'date': date.strip(), 'date': date(int(year), int(month), int(day)),
'place': district, 'place': district.strip(),
'additional': additional, 'additional': additional,
'text': text.strip() 'text': text.strip().replace('\r\n', '\n')
} }
articles.append(article) articles.append(article)