Remove field 'additional_place' because it can't be reliably parsed

This commit is contained in:
Arne Schlüter 2014-12-11 00:07:48 +01:00
commit 98d1e21a90
3 changed files with 4 additions and 15 deletions

View file

@ -29,7 +29,6 @@ for article in articles:
h = hashlib.sha256()
h.update(str(article['date']).encode(encoding))
h.update(article['place'].encode(encoding))
h.update((article['additional_place'] or '').encode(encoding))
h.update(article['description'].encode(encoding))
digest = h.digest()
@ -41,7 +40,6 @@ for article in articles:
date = article['date'],
month_only = article['month_only'],
place = article['place'],
additional_place = article['additional_place'],
description = article['description'],
hash = digest
)

View file

@ -13,11 +13,10 @@ class Article(BaseModel):
date = DateField(index=True)
month_only = BooleanField(default=False)
place = CharField()
additional_place = CharField(null=True)
description = TextField()
hash = BlobField(index=True)
# Set up the tables
def create_tables():
database.connect()
database.create_tables([Article])
db.connect()
db.create_tables([Article])

View file

@ -42,22 +42,14 @@ class Scraper():
print('Failed for headline ' + headline)
raise
places = headline[headline.find(' ') + 1:]
if places.find(' ') == -1:
district = places
additional = None
else:
district = places[:places.find(' ')]
additional = places[places.find(' ') + 1:].strip()
place = headline[headline.find(' ') + 1:]
text = table.select('tr')[2].select('td')[1].get_text()
article = {
'date': date(int(year), int(month), int(day) if day else 1),
'month_only': day is None,
'place': district.strip(),
'additional_place': additional,
'place': place.strip(),
'description': text.strip()
}
articles.append(article)