# analyze.py -- module body taken from the patch for commit 06814ec
# (2015-01-17), which creates this file.

# Lower-case substring stems and the category each one signals. Kept as an
# ordered tuple of pairs (not a dict) so the order of the returned categories
# is the same on every interpreter version.
# NOTE(review): 'rassis' looks like a German stem, so the articles are
# presumably German -- confirm before adding English stems.
_CATEGORY_STEMS = (
    ('antisemit', 'antisemitism'),
    ('homophob', 'homophobia'),
    ('sexis', 'sexism'),
    ('rassis', 'racism'),
)


def get_district(article_headline):
    """
    Returns a geo-coded version of a district an article is about, based on
    its headline.

    Not implemented yet: the headline is ignored and None is returned.
    """
    # TODO: implement the district lookup / geo-coding.
    return None


def get_categories(article_body):
    """
    Gives the list of categories an article falls into, based on its body.

    The body is lower-cased and searched for a substring stem per category:
        - antisemitism ('antisemit')
        - homophobia   ('homophob')
        - sexism       ('sexis')
        - racism       ('rassis')

    Returns the matched category names in the order listed above, or
    ['other'] if none of the stems occurs in the body. A new list is
    returned on every call.
    """
    # Lower-case once rather than once per stem.
    body = article_body.lower()
    found_categories = [category for stem, category in _CATEGORY_STEMS
                        if stem in body]
    # An empty match list is falsy, so unmatched articles become 'other'.
    return found_categories or ['other']