From 66b68b269d08f05344fcbfbd43fb1ccd390cd18c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arne=20Schl=C3=BCter?= Date: Mon, 8 Dec 2014 14:40:05 +0100 Subject: [PATCH 1/4] Normalize \r\n to \n and use date objects instead of the original date string --- scraper/scraper.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scraper/scraper.py b/scraper/scraper.py index ae49d40..57d0cc4 100644 --- a/scraper/scraper.py +++ b/scraper/scraper.py @@ -1,5 +1,6 @@ from bs4 import BeautifulSoup from urllib import parse, request +from datetime import date class Scraper(): @@ -24,7 +25,7 @@ class Scraper(): # headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info) headline = table.select('tr:first-child')[0].get_text() - date = headline[:headline.find(' ')] + year, month, day = headline[:headline.find(' ')].strip().split('-') places = headline[headline.find(' ') + 1:] if places.find(' ') == -1: @@ -32,15 +33,15 @@ class Scraper(): additional = None else: district = places[:places.find(' ')] - additional = places[places.find(' ') + 1:] + additional = places[places.find(' ') + 1:].strip() text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text() article = { - 'date': date.strip(), - 'place': district, + 'date': date(int(year), int(month), int(day)), + 'place': district.strip(), 'additional': additional, - 'text': text.strip() + 'text': text.strip().replace('\r\n', '\n') } articles.append(article) From 58f8e6566973d222e77614ce2a5439dafbaa90da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arne=20Schl=C3=BCter?= Date: Mon, 8 Dec 2014 14:46:26 +0100 Subject: [PATCH 2/4] Simplify code by removing :nth-of-type and accessing the list directly --- scraper/scraper.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scraper/scraper.py b/scraper/scraper.py index 57d0cc4..8841b21 100644 --- a/scraper/scraper.py +++ b/scraper/scraper.py @@ -35,7 +35,7 @@ class Scraper(): district = places[:places.find(' 
')] additional = places[places.find(' ') + 1:].strip() - text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text() + text = table.select('tr')[2].select('td')[1].get_text() article = { 'date': date(int(year), int(month), int(day)), @@ -67,5 +67,6 @@ class Scraper(): # for url in overview_urls: - # while self.has_more_pages(): + # while document: # self.visit_next_page() + # document = self.get_next_page() From e018aead0e950a457d1ae335f047a2e516013f37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arne=20Schl=C3=BCter?= Date: Mon, 8 Dec 2014 15:40:31 +0100 Subject: [PATCH 3/4] Add database setup code --- get_incidents.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 get_incidents.py diff --git a/get_incidents.py b/get_incidents.py new file mode 100644 index 0000000..6dcf0a6 --- /dev/null +++ b/get_incidents.py @@ -0,0 +1,32 @@ +# This file contains the logic that periodically fetches all pages on the +# Reachout Berlin homepage, checks if they're already in the database and inserts +# them if needed. 
+
+import sqlite3
+from scraper.scraper import Scraper
+
+# scraper = Scraper()
+# articles = scraper.scrape()
+
+conn = sqlite3.connect('violence.db')
+c = conn.cursor()
+
+# setup database schema
+c.execute('''
+    CREATE TABLE IF NOT EXISTS incidents (
+        incident_id INTEGER PRIMARY KEY,
+        date TEXT,
+        place TEXT,
+        additional_place TEXT,
+        description TEXT
+    );
+''')
+
+c.execute('''
+    CREATE INDEX IF NOT EXISTS incidents_date
+    ON incidents (date);
+''')
+
+# insert articles
+# for article in articles:
+#     pass

From 55a599a47b69d1289540cd618a6aa61556a00a3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arne=20Schl=C3=BCter?=
Date: Mon, 8 Dec 2014 16:19:18 +0100
Subject: [PATCH 4/4] Write insertion logic for articles

---
 get_incidents.py | 48 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/get_incidents.py b/get_incidents.py
index 6dcf0a6..f47936e 100644
--- a/get_incidents.py
+++ b/get_incidents.py
@@ -3,10 +3,13 @@
 # them if needed.
 
 import sqlite3
+import hashlib
 from scraper.scraper import Scraper
 
-# scraper = Scraper()
-# articles = scraper.scrape()
+encoding = 'utf-8'
+
+scraper = Scraper()
+articles = scraper.scrape()
 
 conn = sqlite3.connect('violence.db')
 c = conn.cursor()
@@ -18,7 +21,8 @@ c.execute('''
         date TEXT,
         place TEXT,
         additional_place TEXT,
-        description TEXT
+        description TEXT,
+        hash
     );
 ''')
 
@@ -27,6 +31,38 @@ c.execute('''
     ON incidents (date);
 ''')
 
-# insert articles
-# for article in articles:
-#     pass
+c.execute('''
+    CREATE INDEX IF NOT EXISTS incidents_hash
+    ON incidents (hash);
+''')
+
+# insert articles if necessary
+select_query = 'SELECT * FROM incidents WHERE hash=?'
+insert_query = '''
+    INSERT INTO incidents (
+        date, place, additional_place, description, hash
+    ) VALUES (?, ?, ?, ?, ?)
+'''
+for article in articles:
+    # build a hash so we can more easily find out if we have an article already
+    # NOTE: articles are dicts ('date', 'place', 'additional', 'text') and
+    # article['date'] is a datetime.date; 'additional' may be None.
+    h = hashlib.sha256()
+    h.update(article['date'].isoformat().encode(encoding))
+    h.update(article['place'].encode(encoding))
+    h.update((article['additional'] or '').encode(encoding))
+    h.update(article['text'].encode(encoding))
+    digest = h.digest()
+
+    c.execute(select_query, (digest,))
+
+    if (not c.fetchone()):
+        article_tuple = (
+            article['date'].isoformat(),
+            article['place'],
+            article['additional'],
+            article['text'],
+            digest
+        )
+        c.execute(insert_query, article_tuple)
+
+conn.commit()
+conn.close()