berliner-winter/analyze.py

import requests, json
from urllib.parse import urlencode

import re, string
from nltk.tag.stanford import POSTagger

tagger = POSTagger('./stanford-postagger-full-2014-10-26/models/german-fast.tagger',
                   './stanford-postagger-full-2014-10-26/stanford-postagger-3.5.0.jar',
                    'UTF-8')

punctuation_regex = re.compile("[%s]" % re.escape(string.punctuation))

def get_potential_places(article_place, article_body):
    """
    Returns a list of potential places as tuples with their part-of-speech tags
    for later filtering
    """
    place_pos = tagger.tag(punctuation_regex.sub(" ", article_place).split())
    text_pos = tagger.tag(punctuation_regex.sub(" ", article_body).split())

    # extract the places out of the full text
    places = [place_pos]
    is_matching = False
    current_match = []
    for tuple in text_pos:
        if is_matching:
            # when we're matching, the phrases we're looking for look like
            # "Im S-Bahnhof Wedding"... the tags below mean
            if tuple[1] in ("ART", "ADJA", "NN", "NE", "CARD"):
                current_match.append(tuple)
            else:
                # we stop the match, so append the current match
                places.append(current_match)
                current_match = []

                # whe we're looking at a preposition again, just start new match
                if tuple[1] not in ("APPR", "APPRART"):
                    is_matching = False
        else:
            # start matching when we have a preposition
            if tuple[1] in ("APPR", "APPRART"):
                is_matching = True

    return places

def improve_potential_places(pos_tuples):
    """
    Improves the matches' quality so we don't have to look up the lat-lng of so
    many mismatches
    """
    better_tuples = []
    for tuple_list in pos_tuples:
        # first, exluce empty lists
        if tuple_list:
            cleaner_list = []

            index = -1
            for tuple in tuple_list:
                index += 1

                # exclude articles ("the", "a") beginning the phrase, they only
                # introduce noise, but keep the list as a whole
                if tuple[1] == "ART" and index == 0:
                    continue

                # if we have numbers in the middle of our phrase, probably the
                # whole list is not useful (as opposed to e.g. Krügerstr. 22)
                if tuple[1] == "CARD" and index < len(tuple_list):
                    cleaner_list = []
                    break

                cleaner_list.append(tuple)

            if cleaner_list:
                better_tuples.append(cleaner_list)

    return better_tuples

def get_categories(article_body):
    """
    Gives a list of categories an article falls into, which is empty if none of
    the following are matched:
    - sexism
    - antisemitism
    - homophobia
    - racism
    """
    bad_words = {
        'antisemit': 'antisemitism',
        'jud': 'antisemitism',
        'jüd': 'antisemitism',
        'homo': 'homophobia',
        'schwul': 'homophobia',
        'lesb': 'homophobia',
        'trans': 'homophobia',
        'sexis': 'sexism',
        'frauenfeind': 'sexism',
        'rassis': 'racism',
        'fremdenfeind': 'racism',
        'flüchtling': 'racism',
        'migrant': 'racism'
    }
    found_categories = set([bad_words[key] for key in bad_words
                                            if key in article_body.lower()])
    return found_categories

def get_geoloc(query):
    confidence_map = {
        "ROOFTOP": 10,
        "RANGE_INTERPOLATED": 7,
        "GEOMETRIC_CENTER": 4,
        "APPROXIMATE": 1
    }

    params = {
        "address": query + ", Berlin",
        "bounds": "52.6754542,13.7611176|52.33962959999999,13.0891553",
        "components": "country:DE",
        "sensor": False
    }

    url = "http://maps.googleapis.com/maps/api/geocode/json?" + urlencode(params)
    r = requests.get(url).json()["results"]

    locations = []
    for location in r:
        print(location)
        locations.append({
            "lat": location["geometry"]["location"]["lat"],
            "lng": location["geometry"]["location"]["lng"],
            "confidence": confidence_map[location["geometry"]["location_type"]],
            "returned_place": location["formatted_address"]
        })

    return locations