From b85c84139b1da99db984d219bd5abd37cb2134b2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arne=20Schl=C3=BCter?=
Date: Sat, 17 Jan 2015 15:57:34 +0100
Subject: [PATCH] Start with improving the place results

---
 analyze.py | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 tagger.py  | 26 ++++-------------
 2 files changed, 90 insertions(+), 21 deletions(-)

diff --git a/analyze.py b/analyze.py
index 04e45aa..9b23678 100644
--- a/analyze.py
+++ b/analyze.py
@@ -1,3 +1,88 @@
+import re
+import string
+from nltk.tag.stanford import POSTagger
+
+tagger = POSTagger('./stanford-postagger-full-2014-10-26/models/german-fast.tagger',
+                   './stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar',
+                   'UTF-8')
+
+punctuation_regex = re.compile("[%s]" % re.escape(string.punctuation))
+
+def get_potential_places(article_place, article_body):
+    """
+    Returns a list of potential places for later filtering. Every potential
+    place is the list of (word, part-of-speech tag) tuples that directly follow
+    a preposition; the list is empty when nothing usable follows it
+    """
+    # remove punctuation
+    full_text = punctuation_regex.sub(" ", article_place + " " + article_body)
+
+    pos = tagger.tag(full_text.split())
+
+    # extract the places
+    places = []
+    is_matching = False
+    current_match = []
+    for token in pos:
+        if token[1] in ("APPR", "APPRART"):
+            # a preposition ("in", or fused with its article: "im") always
+            # starts a new phrase, even when it also ends the previous one
+            # like the second one in "am Leopoldplatz in Wedding"
+            if is_matching:
+                places.append(current_match)
+                current_match = []
+            is_matching = True
+        elif is_matching:
+            # when we're matching, the phrases we're looking for look like
+            # "Im S-Bahnhof Wedding"... the tags below mean article,
+            # attributive adjective, noun, proper noun and cardinal number
+            if token[1] in ("ART", "ADJA", "NN", "NE", "CARD"):
+                current_match.append(token)
+            else:
+                # we stop the match, so append the current match
+                places.append(current_match)
+                current_match = []
+                is_matching = False
+
+    # the text can end in the middle of a phrase, don't lose that last match
+    if is_matching:
+        places.append(current_match)
+
+    return places
+
+def improve_potential_places(pos_tuples):
+    """
+    Improves the matches' quality so we don't have to look up the lat-lng of so
+    many mismatches
+
+    Takes the lists of (word, tag) tuples from get_potential_places and returns
+    them without articles. Empty lists and phrases with a number anywhere but
+    at their end are dropped
+    """
+    better_tuples = []
+    for tuple_list in pos_tuples:
+        cleaner = []
+        last_index = len(tuple_list) - 1
+        for index, token in enumerate(tuple_list):
+            # exclude articles ("the", "a"), they only introduce noise, but
+            # keep the words around them
+            if token[1] == "ART":
+                continue
+
+            # if we have numbers in the middle of our phrase, it's probably
+            # also not useful (as opposed to Krügerstr. 22)
+            if token[1] == "CARD" and index < last_index:
+                cleaner = []
+                break
+
+            cleaner.append(token)
+
+        # exclude lists that were empty from the start or ended up empty
+        if cleaner:
+            better_tuples.append(cleaner)
+
+    return better_tuples
+
 def get_district(article_headline):
     """
     Returns a geo-coded version of a district an article is about, based on its
diff --git a/tagger.py b/tagger.py
index 6149fcd..d62efac 100644
--- a/tagger.py
+++ b/tagger.py
@@ -1,33 +1,17 @@
 from nltk.tag.stanford import POSTagger
 from models import Article
+from analyze import get_potential_places, improve_potential_places
 
 tagger = POSTagger('./stanford-postagger-full-2014-10-26/models/german-fast.tagger',
                    './stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar',
                    'UTF-8')
 
 for article in Article.select().limit(100):
-    pos = tagger.tag((article.place + " " + article.description).split())
-
-    # extract the places
-    places = []
-    is_matching = False
-    current_match = []
-    for tuple in pos:
-        if is_matching:
-            # when we're matching, the phrases we're looking for look like
-            # "Im S-Bahnhof Wedding"... the tags below mean
-            if tuple[1] in ("ART", "ADJA", "NN", "NE", "CARD"):
-                current_match.append(tuple)
-            else:
-                places.append(current_match)
-                current_match = []
-                is_matching = False
-        else:
-            # start matching when we have a preposition
-            if tuple[1] in ("APPR", "APPRART"):
-                is_matching = True
+    potential = get_potential_places(article.place, article.description)
+    places = improve_potential_places(potential)
 
     print(article.place)
     print(article.description)
     print()
-    print("Relevant: " + str(places))
+    print("Potential: " + str(potential))
+    print("Improved: " + str(places))