mirror of
https://github.com/heyarne/berliner-winter.git
synced 2026-05-06 19:23:39 +02:00
Start with improving the place results
This commit is contained in:
parent
8e6032be3a
commit
b85c84139b
2 changed files with 79 additions and 21 deletions
74
analyze.py
74
analyze.py
|
|
@ -1,3 +1,77 @@
|
||||||
|
import re
import string

from nltk.tag.stanford import POSTagger

# Shared Stanford POS tagger for German text, used to tag article text
# before extracting candidate place phrases.  Paths point at the unpacked
# stanford-postagger distribution expected next to this file.
tagger = POSTagger('./stanford-postagger-full-2014-10-26/models/german-fast.tagger',
                   './stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar',
                   'UTF-8')

# Matches any single punctuation character; used to strip punctuation from
# article text before splitting it into tokens for the tagger.
punctuation_regex = re.compile("[%s]" % re.escape(string.punctuation))
|
||||||
|
|
||||||
|
def get_potential_places(article_place, article_body, pos_tagger=None):
    """Return candidate place phrases as lists of (token, POS-tag) tuples.

    The combined place + body text is stripped of punctuation, tokenized
    and POS-tagged.  Any run of tokens that starts right after a preposition
    (tags APPR / APPRART) and consists of articles, adjectives, common or
    proper nouns, or cardinals is collected as one candidate phrase for
    later filtering (e.g. "Im S-Bahnhof Wedding").

    Parameters:
        article_place: short place string attached to the article.
        article_body: full article text.
        pos_tagger: optional object with a ``tag(tokens)`` method returning
            (token, tag) tuples; defaults to the module-level Stanford
            tagger.  Injecting one keeps this function testable without a
            Stanford/Java installation.

    Returns:
        A list of candidate phrases, each a non-empty list of
        (token, tag) tuples.
    """
    # Strip punctuation so the tagger only sees word tokens.  The pattern
    # is cached by the ``re`` module, so rebuilding it per call is cheap.
    punctuation_pattern = "[%s]" % re.escape(string.punctuation)
    full_text = re.sub(punctuation_pattern, " ", article_place + " " + article_body)

    # Evaluate the module-level tagger only when no override is supplied.
    active_tagger = tagger if pos_tagger is None else pos_tagger
    pos = active_tagger.tag(full_text.split())

    # Extract the places.
    places = []
    is_matching = False
    current_match = []
    for tagged in pos:
        if is_matching:
            # While matching, phrases look like "Im S-Bahnhof Wedding":
            # articles, adjectives, nouns, proper nouns and cardinals
            # extend the current phrase.
            if tagged[1] in ("ART", "ADJA", "NN", "NE", "CARD"):
                current_match.append(tagged)
            else:
                # Phrase ended — keep it only if it actually collected
                # tokens.  (Bug fix: the old code appended empty matches
                # whenever a preposition was immediately followed by a
                # non-matching tag.)
                if current_match:
                    places.append(current_match)
                current_match = []
                is_matching = False
        elif tagged[1] in ("APPR", "APPRART"):
            # A preposition opens a potential place phrase.
            is_matching = True

    # Bug fix: a phrase running to the very end of the text was dropped
    # because it was never appended after the loop.
    if current_match:
        places.append(current_match)

    return places
|
||||||
|
|
||||||
|
def improve_potential_places(pos_tuples):
    """Filter candidate place phrases to cut down on geocoder mismatches.

    Improves the matches' quality so we don't have to look up the lat-lng
    of so many mismatches.

    Parameters:
        pos_tuples: list of phrases, each a list of (token, POS-tag)
            tuples as produced by ``get_potential_places``.

    Returns:
        A new list containing only the non-empty, cleaned phrases:
        * articles (tag "ART") are dropped — they only introduce noise;
        * phrases with a number (tag "CARD") anywhere but the last
          position are discarded entirely, since a mid-phrase number is
          probably not a place (as opposed to a trailing house number
          like "Krügerstr. 22").
    """
    better_tuples = []
    for tuple_list in pos_tuples:
        # First, exclude empty candidate lists.
        if not tuple_list:
            continue

        cleaner = []
        last_index = len(tuple_list) - 1
        for index, tagged in enumerate(tuple_list):
            # Exclude articles ("the", "a"); they only introduce noise.
            # (Bug fix: tags were compared with ``is`` instead of ``==``,
            # which only worked by accident of string interning.)
            if tagged[1] == "ART":
                continue

            # A number in the middle of the phrase means it is probably
            # not useful — discard the whole phrase.
            # (Bug fix: ``index < len(tuple_list)`` was always true, and
            # the discard assigned to a dead ``cleaner_tuple`` variable
            # instead of clearing ``cleaner``.)
            if tagged[1] == "CARD" and index < last_index:
                cleaner = []
                break

            cleaner.append(tagged)

        # Only keep phrases that still contain something after cleaning.
        if cleaner:
            better_tuples.append(cleaner)

    return better_tuples
|
||||||
|
|
||||||
def get_district(article_headline):
|
def get_district(article_headline):
|
||||||
"""
|
"""
|
||||||
Returns a geo-coded version of a district an article is about, based on its
|
Returns a geo-coded version of a district an article is about, based on its
|
||||||
|
|
|
||||||
26
tagger.py
26
tagger.py
|
|
@ -1,33 +1,17 @@
|
||||||
from models import Article

# Import the extraction helpers explicitly instead of via ``import *``,
# so it is clear where each name comes from.
from analyze import get_potential_places, improve_potential_places

# Dead code removed: this script previously built its own POSTagger, but
# tagging now happens inside get_potential_places (analyze.py owns the
# shared tagger), so constructing a second one here only wasted startup time.

# Tag the first 100 articles and print the raw and cleaned candidate
# places so the extraction quality can be inspected by hand.
for article in Article.select().limit(100):
    # Raw preposition-led phrases straight from the POS tagger...
    potential = get_potential_places(article.place, article.description)
    # ...and the cleaned-up version with noise (articles, numbers) removed.
    places = improve_potential_places(potential)

    print(article.place)
    print(article.description)
    print()
    print("Potential: " + str(potential))
    print("Improved: " + str(places))
|
||||||
Loading…
Add table
Add a link
Reference in a new issue