mirror of
https://github.com/heyarne/berliner-winter.git
synced 2026-05-06 19:23:39 +02:00
Start with improving the place results
This commit is contained in:
parent
8e6032be3a
commit
b85c84139b
2 changed files with 79 additions and 21 deletions
74
analyze.py
74
analyze.py
|
|
@ -1,3 +1,77 @@
|
||||||
|
import re
import string

from nltk.tag.stanford import POSTagger

# Shared Stanford POS tagger for German text, used to tag article text
# before extracting candidate place phrases.  Paths point at the unpacked
# stanford-postagger distribution expected next to this file.
tagger = POSTagger('./stanford-postagger-full-2014-10-26/models/german-fast.tagger',
                   './stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar',
                   'UTF-8')

# Matches any single punctuation character; used to strip punctuation from
# article text before splitting it into tokens for the tagger.
punctuation_regex = re.compile("[%s]" % re.escape(string.punctuation))
|
||||||
|
|
||||||
|
def get_potential_places(article_place, article_body, pos_tagger=None):
    """Return candidate place phrases as lists of (token, POS-tag) tuples.

    The combined place + body text is stripped of punctuation, tokenized
    and POS-tagged.  Any run of tokens that starts right after a preposition
    (tags APPR / APPRART) and consists of articles, adjectives, common or
    proper nouns, or cardinals is collected as one candidate phrase for
    later filtering (e.g. "Im S-Bahnhof Wedding").

    Parameters:
        article_place: short place string attached to the article.
        article_body: full article text.
        pos_tagger: optional object with a ``tag(tokens)`` method returning
            (token, tag) tuples; defaults to the module-level Stanford
            tagger.  Injecting one keeps this function testable without a
            Stanford/Java installation.

    Returns:
        A list of candidate phrases, each a non-empty list of
        (token, tag) tuples.
    """
    # Strip punctuation so the tagger only sees word tokens.  The pattern
    # is cached by the ``re`` module, so rebuilding it per call is cheap.
    punctuation_pattern = "[%s]" % re.escape(string.punctuation)
    full_text = re.sub(punctuation_pattern, " ", article_place + " " + article_body)

    # Evaluate the module-level tagger only when no override is supplied.
    active_tagger = tagger if pos_tagger is None else pos_tagger
    pos = active_tagger.tag(full_text.split())

    # Extract the places.
    places = []
    is_matching = False
    current_match = []
    for tagged in pos:
        if is_matching:
            # While matching, phrases look like "Im S-Bahnhof Wedding":
            # articles, adjectives, nouns, proper nouns and cardinals
            # extend the current phrase.
            if tagged[1] in ("ART", "ADJA", "NN", "NE", "CARD"):
                current_match.append(tagged)
            else:
                # Phrase ended — keep it only if it actually collected
                # tokens.  (Bug fix: the old code appended empty matches
                # whenever a preposition was immediately followed by a
                # non-matching tag.)
                if current_match:
                    places.append(current_match)
                current_match = []
                is_matching = False
        elif tagged[1] in ("APPR", "APPRART"):
            # A preposition opens a potential place phrase.
            is_matching = True

    # Bug fix: a phrase running to the very end of the text was dropped
    # because it was never appended after the loop.
    if current_match:
        places.append(current_match)

    return places
|
||||||
|
|
||||||
|
def improve_potential_places(pos_tuples):
    """Filter candidate place phrases to cut down on geocoder mismatches.

    Improves the matches' quality so we don't have to look up the lat-lng
    of so many mismatches.

    Parameters:
        pos_tuples: list of phrases, each a list of (token, POS-tag)
            tuples as produced by ``get_potential_places``.

    Returns:
        A new list containing only the non-empty, cleaned phrases:
        * articles (tag "ART") are dropped — they only introduce noise;
        * phrases with a number (tag "CARD") anywhere but the last
          position are discarded entirely, since a mid-phrase number is
          probably not a place (as opposed to a trailing house number
          like "Krügerstr. 22").
    """
    better_tuples = []
    for tuple_list in pos_tuples:
        # First, exclude empty candidate lists.
        if not tuple_list:
            continue

        cleaner = []
        last_index = len(tuple_list) - 1
        for index, tagged in enumerate(tuple_list):
            # Exclude articles ("the", "a"); they only introduce noise.
            # (Bug fix: tags were compared with ``is`` instead of ``==``,
            # which only worked by accident of string interning.)
            if tagged[1] == "ART":
                continue

            # A number in the middle of the phrase means it is probably
            # not useful — discard the whole phrase.
            # (Bug fix: ``index < len(tuple_list)`` was always true, and
            # the discard assigned to a dead ``cleaner_tuple`` variable
            # instead of clearing ``cleaner``.)
            if tagged[1] == "CARD" and index < last_index:
                cleaner = []
                break

            cleaner.append(tagged)

        # Only keep phrases that still contain something after cleaning.
        if cleaner:
            better_tuples.append(cleaner)

    return better_tuples
|
||||||
|
|
||||||
def get_district(article_headline):
|
def get_district(article_headline):
|
||||||
"""
|
"""
|
||||||
Returns a geo-coded version of a district an article is about, based on its
|
Returns a geo-coded version of a district an article is about, based on its
|
||||||
|
|
|
||||||
26
tagger.py
26
tagger.py
|
|
@ -1,33 +1,17 @@
|
||||||
from models import Article

# Import the extraction helpers explicitly instead of via ``import *``,
# so it is clear where each name comes from.
from analyze import get_potential_places, improve_potential_places

# Dead code removed: this script previously built its own POSTagger, but
# tagging now happens inside get_potential_places (analyze.py owns the
# shared tagger), so constructing a second one here only wasted startup time.

# Tag the first 100 articles and print the raw and cleaned candidate
# places so the extraction quality can be inspected by hand.
for article in Article.select().limit(100):
    # Raw preposition-led phrases straight from the POS tagger...
    potential = get_potential_places(article.place, article.description)
    # ...and the cleaned-up version with noise (articles, numbers) removed.
    places = improve_potential_places(potential)

    print(article.place)
    print(article.description)
    print()
    print("Potential: " + str(potential))
    print("Improved: " + str(places))
|
||||||
Loading…
Add table
Add a link
Reference in a new issue