Start with improving the place results

This commit is contained in:
Arne Schlüter 2015-01-17 15:57:34 +01:00
commit b85c84139b
2 changed files with 79 additions and 21 deletions

View file

@ -1,33 +1,17 @@
from nltk.tag.stanford import POSTagger
from models import Article
from analyze import *
# NOTE(review): this listing looks like a flattened diff view (no indentation, no
# +/- markers), so removed and added lines may be interleaved here -- confirm the
# real nesting and the surviving lines against the repository before relying on it.
#
# Build the Stanford POS tagger from the German "fast" model file and the tagger
# jar; the third argument is presumably the text encoding -- TODO confirm.
tagger = POSTagger('./stanford-postagger-full-2014-10-26/models/german-fast.tagger',
'./stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar',
'UTF-8')
# Process articles one by one; the query is presumably capped at 100 rows by
# .limit(100) -- TODO confirm against the Article model's ORM.
for article in Article.select().limit(100):
# Tag the whitespace-separated tokens of "<place> <description>". The loop below
# indexes each result item with [1] and compares it against tag names.
pos = tagger.tag((article.place + " " + article.description).split())
# extract the places
# Phrases collected so far for this article.
places = []
# True while a phrase that started after a preposition is still being collected.
is_matching = False
# Tagged items of the phrase currently being collected.
current_match = []
# NOTE(review): the loop variable `tuple` shadows the builtin of the same name.
for tuple in pos:
if is_matching:
# when we're matching, the phrases we're looking for look like
# "Im S-Bahnhof Wedding"... the tags below presumably follow the STTS tagset:
# ART = article, ADJA = attributive adjective, NN = common noun,
# NE = proper noun, CARD = cardinal number -- TODO confirm
if tuple[1] in ("ART", "ADJA", "NN", "NE", "CARD"):
current_match.append(tuple)
else:
# Any other tag ends the phrase: store what was collected (possibly an empty
# list) and reset the matcher. The item that ended the phrase is not itself
# re-checked as a possible preposition.
places.append(current_match)
current_match = []
is_matching = False
else:
# start matching when we have a preposition
# (tags APPR / APPRART; the preposition item itself is not added to the match)
if tuple[1] in ("APPR", "APPRART"):
is_matching = True
# NOTE(review): nothing after the loop appends a phrase that is still open when
# the tagged items run out -- confirm whether dropping it is intended.
# The two helpers below are not defined in this listing; presumably they come
# from `from analyze import *` -- TODO confirm.
potential = get_potential_places(article.place, article.description)
# Rebinds `places`, replacing the list built by the loop above.
places = improve_potential_places(potential)
print(article.place)
print(article.description)
print()
# NOTE(review): as written, `places` already holds the improve_potential_places()
# result here, so the "Relevant" and "Improved" lines print the same value.
print("Relevant: " + str(places))
print("Potential: " + str(potential))
print("Improved: " + str(places))