mirror of
https://github.com/heyarne/berliner-winter.git
synced 2026-05-06 19:23:39 +02:00
Remove field 'additional_place' because it can't be reliably parsed
This commit is contained in:
parent
c1ac5e5ed4
commit
98d1e21a90
3 changed files with 4 additions and 15 deletions
|
|
@ -29,7 +29,6 @@ for article in articles:
|
||||||
h = hashlib.sha256()
|
h = hashlib.sha256()
|
||||||
h.update(str(article['date']).encode(encoding))
|
h.update(str(article['date']).encode(encoding))
|
||||||
h.update(article['place'].encode(encoding))
|
h.update(article['place'].encode(encoding))
|
||||||
h.update((article['additional_place'] or '').encode(encoding))
|
|
||||||
h.update(article['description'].encode(encoding))
|
h.update(article['description'].encode(encoding))
|
||||||
digest = h.digest()
|
digest = h.digest()
|
||||||
|
|
||||||
|
|
@ -41,7 +40,6 @@ for article in articles:
|
||||||
date = article['date'],
|
date = article['date'],
|
||||||
month_only = article['month_only'],
|
month_only = article['month_only'],
|
||||||
place = article['place'],
|
place = article['place'],
|
||||||
additional_place = article['additional_place'],
|
|
||||||
description = article['description'],
|
description = article['description'],
|
||||||
hash = digest
|
hash = digest
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -13,11 +13,10 @@ class Article(BaseModel):
|
||||||
date = DateField(index=True)
|
date = DateField(index=True)
|
||||||
month_only = BooleanField(default=False)
|
month_only = BooleanField(default=False)
|
||||||
place = CharField()
|
place = CharField()
|
||||||
additional_place = CharField(null=True)
|
|
||||||
description = TextField()
|
description = TextField()
|
||||||
hash = BlobField(index=True)
|
hash = BlobField(index=True)
|
||||||
|
|
||||||
# Set up the tables
|
# Set up the tables
|
||||||
def create_tables():
|
def create_tables():
|
||||||
database.connect()
|
db.connect()
|
||||||
database.create_tables([Article])
|
db.create_tables([Article])
|
||||||
|
|
|
||||||
|
|
@ -42,22 +42,14 @@ class Scraper():
|
||||||
print('Failed for headline ' + headline)
|
print('Failed for headline ' + headline)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
places = headline[headline.find(' ') + 1:]
|
place = headline[headline.find(' ') + 1:]
|
||||||
|
|
||||||
if places.find(' ') == -1:
|
|
||||||
district = places
|
|
||||||
additional = None
|
|
||||||
else:
|
|
||||||
district = places[:places.find(' ')]
|
|
||||||
additional = places[places.find(' ') + 1:].strip()
|
|
||||||
|
|
||||||
text = table.select('tr')[2].select('td')[1].get_text()
|
text = table.select('tr')[2].select('td')[1].get_text()
|
||||||
|
|
||||||
article = {
|
article = {
|
||||||
'date': date(int(year), int(month), int(day) if day else 1),
|
'date': date(int(year), int(month), int(day) if day else 1),
|
||||||
'month_only': day is None,
|
'month_only': day is None,
|
||||||
'place': district.strip(),
|
'place': place.strip(),
|
||||||
'additional_place': additional,
|
|
||||||
'description': text.strip()
|
'description': text.strip()
|
||||||
}
|
}
|
||||||
articles.append(article)
|
articles.append(article)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue