Remove field 'additional_place' because it can't be reliably parsed

This commit is contained in:
Arne Schlüter 2014-12-11 00:07:48 +01:00
commit 98d1e21a90
3 changed files with 4 additions and 15 deletions

View file

@ -29,7 +29,6 @@ for article in articles:
h = hashlib.sha256() h = hashlib.sha256()
h.update(str(article['date']).encode(encoding)) h.update(str(article['date']).encode(encoding))
h.update(article['place'].encode(encoding)) h.update(article['place'].encode(encoding))
h.update((article['additional_place'] or '').encode(encoding))
h.update(article['description'].encode(encoding)) h.update(article['description'].encode(encoding))
digest = h.digest() digest = h.digest()
@ -41,7 +40,6 @@ for article in articles:
date = article['date'], date = article['date'],
month_only = article['month_only'], month_only = article['month_only'],
place = article['place'], place = article['place'],
additional_place = article['additional_place'],
description = article['description'], description = article['description'],
hash = digest hash = digest
) )

View file

@ -13,11 +13,10 @@ class Article(BaseModel):
date = DateField(index=True) date = DateField(index=True)
month_only = BooleanField(default=False) month_only = BooleanField(default=False)
place = CharField() place = CharField()
additional_place = CharField(null=True)
description = TextField() description = TextField()
hash = BlobField(index=True) hash = BlobField(index=True)
# Set up the tables # Set up the tables
def create_tables(): def create_tables():
database.connect() db.connect()
database.create_tables([Article]) db.create_tables([Article])

View file

@ -42,22 +42,14 @@ class Scraper():
print('Failed for headline ' + headline) print('Failed for headline ' + headline)
raise raise
places = headline[headline.find(' ') + 1:] place = headline[headline.find(' ') + 1:]
if places.find(' ') == -1:
district = places
additional = None
else:
district = places[:places.find(' ')]
additional = places[places.find(' ') + 1:].strip()
text = table.select('tr')[2].select('td')[1].get_text() text = table.select('tr')[2].select('td')[1].get_text()
article = { article = {
'date': date(int(year), int(month), int(day) if day else 1), 'date': date(int(year), int(month), int(day) if day else 1),
'month_only': day is None, 'month_only': day is None,
'place': district.strip(), 'place': place.strip(),
'additional_place': additional,
'description': text.strip() 'description': text.strip()
} }
articles.append(article) articles.append(article)