Merge branch 'place_extraction'

Conflicts: analyze.py
2026-05-06 19:23:39 +02:00 · 2015-01-17 17:38:41 +01:00 · 2015-01-17 17:38:41 +01:00 · 41f2afcc12
commit 41f2afcc12
parent 309cd191af 492ea184bc
21 changed files with 54650 additions and 0 deletions
--- a/analyze.py
+++ b/analyze.py
@ -1,4 +1,82 @@
+<<<<<<< HEAD
 import requests, json
+=======
+import re
+import string
+from nltk.tag.stanford import POSTagger
+
+tagger = POSTagger('./stanford-postagger-full-2014-10-26/models/german-fast.tagger',
+                   './stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar',
+                    'UTF-8')
+
+punctuation_regex = re.compile("[%s]" % re.escape(string.punctuation))
+
+def get_potential_places(article_place, article_body):
+    """
+    Returns a list of potential places as tuples with their part-of-speech tags
+    for later filtering
+    """
+    place_pos = tagger.tag(punctuation_regex.sub(" ", article_place).split())
+    text_pos = tagger.tag(punctuation_regex.sub(" ", article_body).split())
+
+    # extract the places out of the full text
+    places = [place_pos]
+    is_matching = False
+    current_match = []
+    for tuple in text_pos:
+        if is_matching:
+            # when we're matching, the phrases we're looking for look like
+            # "Im S-Bahnhof Wedding"; allowed tags: article, adjective, nouns, number
+            if tuple[1] in ("ART", "ADJA", "NN", "NE", "CARD"):
+                current_match.append(tuple)
+            else:
+                # we stop the match, so append the current match
+                places.append(current_match)
+                current_match = []
+
+                # when we're looking at a preposition again, just start new match
+                if tuple[1] not in ("APPR", "APPRART"):
+                    is_matching = False
+        else:
+            # start matching when we have a preposition
+            if tuple[1] in ("APPR", "APPRART"):
+                is_matching = True
+
+    return places
+
+def improve_potential_places(pos_tuples):
+    """
+    Improves the matches' quality so we don't have to look up the lat-lng of so
+    many mismatches
+    """
+    better_tuples = []
+    for tuple_list in pos_tuples:
+        # first, exclude empty lists
+        if tuple_list:
+            cleaner_list = []
+
+            index = -1
+            for tuple in tuple_list:
+                index += 1
+
+                # exclude articles ("the", "a"), they only introduce noise, but
+                # keep the list as a whole
+                if tuple[1] == "ART":
+                    continue
+
+                # if we have numbers in the middle of our phrase, probably the
+                # whole list is not useful (as opposed to e.g. Krügerstr. 22)
+                if tuple[1] == "CARD" and index < len(tuple_list):
+                    cleaner_list = []
+                    break
+
+                cleaner_list.append(tuple)
+
+            if cleaner_list:
+                better_tuples.append(cleaner_list)
+
+    return better_tuples
+>>>>>>> place_extraction

 def get_district(article_headline):
    """