mirror of
https://github.com/heyarne/berliner-winter.git
synced 2026-05-06 19:23:39 +02:00
Normalize \r\n to \n and use date objects instead of the original date string
This commit is contained in:
parent
7fe08227bb
commit
66b68b269d
1 changed file with 6 additions and 5 deletions
|
|
@ -1,5 +1,6 @@
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from urllib import parse, request
|
from urllib import parse, request
|
||||||
|
from datetime import date
|
||||||
|
|
||||||
class Scraper():
|
class Scraper():
|
||||||
|
|
||||||
|
|
@ -24,7 +25,7 @@ class Scraper():
|
||||||
# headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info)
|
# headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info)
|
||||||
headline = table.select('tr:first-child')[0].get_text()
|
headline = table.select('tr:first-child')[0].get_text()
|
||||||
|
|
||||||
date = headline[:headline.find(' ')]
|
year, month, day = headline[:headline.find(' ')].strip().split('-')
|
||||||
places = headline[headline.find(' ') + 1:]
|
places = headline[headline.find(' ') + 1:]
|
||||||
|
|
||||||
if places.find(' ') == -1:
|
if places.find(' ') == -1:
|
||||||
|
|
@ -32,15 +33,15 @@ class Scraper():
|
||||||
additional = None
|
additional = None
|
||||||
else:
|
else:
|
||||||
district = places[:places.find(' ')]
|
district = places[:places.find(' ')]
|
||||||
additional = places[places.find(' ') + 1:]
|
additional = places[places.find(' ') + 1:].strip()
|
||||||
|
|
||||||
text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text()
|
text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text()
|
||||||
|
|
||||||
article = {
|
article = {
|
||||||
'date': date.strip(),
|
'date': date(int(year), int(month), int(day)),
|
||||||
'place': district,
|
'place': district.strip(),
|
||||||
'additional': additional,
|
'additional': additional,
|
||||||
'text': text.strip()
|
'text': text.strip().replace('\r\n', '\n')
|
||||||
}
|
}
|
||||||
articles.append(article)
|
articles.append(article)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue