From 73dee66d16b0ad14d9f16d17bec008ab5b2ab9f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arne=20Schl=C3=BCter?= Date: Sat, 17 Jan 2015 16:27:53 +0100 Subject: [PATCH] Look at title and text separately --- analyze.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/analyze.py b/analyze.py index 9b23678..4103cc8 100644 --- a/analyze.py +++ b/analyze.py @@ -13,16 +13,14 @@ def get_potential_places(article_place, article_body): Returns a list of potential places as tuples with their part-of-speech tags for later filtering """ - # remove punctuation - full_text = punctuation_regex.sub(" ", article_place + " " + article_body) + place_pos = tagger.tag(punctuation_regex.sub(" ", article_place).split()) + text_pos = tagger.tag(punctuation_regex.sub(" ", article_body).split()) - pos = tagger.tag(full_text.split()) - - # extract the places - places = [] + # extract the places out of the full text + places = [place_pos] is_matching = False current_match = [] - for tuple in pos: + for tuple in text_pos: if is_matching: # when we're matching, the phrases we're looking for look like # "Im S-Bahnhof Wedding"... the tags below mean