# berliner-winter/tagger.py

from nltk.tag.stanford import POSTagger
from models import Article

# Locations of the Stanford POS tagger files, given relative to the current
# working directory: the German "fast" model and the tagger jar.
_MODEL_FILE = './stanford-postagger-full-2014-10-26/models/german-fast.tagger'
_JAR_FILE = './stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar'

# Module-level tagger used by the script below. The third positional argument
# is presumably the text encoding -- confirm against the installed NLTK.
tagger = POSTagger(_MODEL_FILE, _JAR_FILE, 'UTF-8')

# Tags that open a place phrase. The names appear to follow the STTS tagset
# used by the German Stanford models: APPR = preposition, APPRART =
# preposition fused with an article (e.g. "im", "zum").
_PHRASE_START_TAGS = ("APPR", "APPRART")
# Tags allowed inside a place phrase: ART = article, ADJA = attributive
# adjective, NN = common noun, NE = proper noun, CARD = cardinal number.
_PHRASE_BODY_TAGS = ("ART", "ADJA", "NN", "NE", "CARD")


def _extract_places(tagged_tokens):
    """Return the token runs that directly follow a preposition.

    The phrases looked for have the shape "Im S-Bahnhof Wedding": a
    preposition followed by any number of article/adjective/noun/number
    tokens. The preposition itself is not part of the returned phrase.

    Args:
        tagged_tokens: iterable of (word, tag) pairs as produced by the
            tagger.

    Returns:
        A list of phrases, each a non-empty list of the (word, tag) pairs
        that made up the phrase, in input order.
    """
    places = []
    current_match = []
    is_matching = False
    for token in tagged_tokens:
        tag = token[1]
        if is_matching and tag in _PHRASE_BODY_TAGS:
            current_match.append(token)
            continue
        # Any other token closes the running phrase. Only non-empty phrases
        # are kept, so a preposition followed by e.g. a verb yields nothing.
        if current_match:
            places.append(current_match)
            current_match = []
        # The closing token may itself be a preposition that opens the next
        # phrase ("im Bahnhof an der Strasse"), so re-evaluate it here.
        is_matching = tag in _PHRASE_START_TAGS
    # Flush a phrase that runs up to the very last token.
    if current_match:
        places.append(current_match)
    return places


for article in Article.select().limit(100):
    # Tag the whitespace-separated words of place and description together.
    pos = tagger.tag((article.place + " " + article.description).split())

    # extract the places
    places = _extract_places(pos)

    print(article.place)
    print(article.description)
    print()
    print("Relevant: " + str(places))