mirror of https://github.com/heyarne/berliner-winter.git
synced 2026-05-06 19:23:39 +02:00

commit 5306e6dab4

Merge branch 'dev-arne'

Conflicts:
	scraper/scraper.py

2 changed files with 69 additions and 3 deletions
get_incidents.py (new file, 65 lines)

@@ -0,0 +1,65 @@
# This file contains the logic that periodically fetches all pages on the
# Reachout Berlin homepage, checks if they're already in the database and
# inserts them if needed.

import sqlite3
import hashlib

from scraper.scraper import Scraper

encoding = 'utf-8'

scraper = Scraper()
articles = scraper.scrape()

conn = sqlite3.connect('violence.db')
c = conn.cursor()

# set up the database schema
c.execute('''
CREATE TABLE IF NOT EXISTS incidents (
    incident_id INTEGER PRIMARY KEY,
    date TEXT,
    place TEXT,
    additional_place TEXT,
    description TEXT,
    hash BLOB
);
''')

c.execute('''
CREATE INDEX IF NOT EXISTS incidents_date
ON incidents (date);
''')

c.execute('''
CREATE INDEX IF NOT EXISTS incidents_hash
ON incidents (hash);
''')

# insert articles if necessary
select_query = 'SELECT * FROM incidents WHERE hash=?'
insert_query = '''
INSERT INTO incidents (
    date, place, additional_place, description, hash
) VALUES (?, ?, ?, ?, ?)
'''

for article in articles:
    # build a hash so we can more easily find out if we have an article already
    h = hashlib.sha256()
    h.update(article.date.encode(encoding))
    h.update(article.place.encode(encoding))
    h.update(article.additional_place.encode(encoding))
    h.update(article.description.encode(encoding))
    digest = h.digest()

    # only insert the article if no row with this hash exists yet
    c.execute(select_query, (digest,))
    if not c.fetchone():
        article_tuple = (
            article.date,
            article.place,
            article.additional_place,
            article.description,
            digest
        )
        c.execute(insert_query, article_tuple)

# persist the inserts; without an explicit commit sqlite3 discards them on exit
conn.commit()
conn.close()
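The dedup scheme above relies on the digest being stable across runs: scraping the same incident twice must produce the same hash, so the SELECT-by-hash check skips the re-insert. A minimal sketch of that property, with invented field values standing in for real scraped data:

import hashlib

def content_hash(fields, encoding='utf-8'):
    # concatenate the encoded fields into one SHA-256, as the loop above does
    h = hashlib.sha256()
    for field in fields:
        h.update(field.encode(encoding))
    return h.digest()

fields = ('2016-01-05', 'Berlin-Neukölln', 'U-Bahnhof', 'some description')
assert content_hash(fields) == content_hash(fields)  # same input, same digest

One caveat of hashing a plain concatenation: without a separator between fields, ('ab', 'c') and ('a', 'bc') produce the same digest. The date and place formats make such collisions unlikely here, but joining with a delimiter byte would rule them out entirely.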
scraper/scraper.py

@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 from urllib import parse, request
+from datetime import date

 class Scraper():
@@ -27,7 +28,7 @@ class Scraper():
 # headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info)
 headline = table.select('tr:first-child')[0].get_text()

-date = headline[:headline.find(' ')]
+year, month, day = headline[:headline.find(' ')].strip().split('-')
 places = headline[headline.find(' ') + 1:]

 if places.find(' ') == -1:
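The split into year, month and day, together with the datetime import added at the top of the file, suggests the parts are recombined into a date object somewhere outside this hunk; the removed date = ... assignment would also have shadowed that import. A sketch of that reading, on an invented headline:

from datetime import date

headline = '2016-01-05 Berlin-Neukölln U-Bahnhof'  # invented example
year, month, day = headline[:headline.find(' ')].strip().split('-')
parsed = date(int(year), int(month), int(day))  # assumed recombination, not shown in this diff
print(parsed.isoformat())  # 2016-01-05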
@@ -35,9 +36,9 @@ class Scraper():
     additional = None
 else:
     district = places[:places.find(' ')]
-    additional = places[places.find(' ') + 1:]
+    additional = places[places.find(' ') + 1:].strip()

-text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text()
+text = table.select('tr')[2].select('td')[1].get_text()

 article = {
     'date': date.strip(),
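Both the old and the new selector chain target the third row's second cell; the change swaps the 1-based CSS :nth-of-type pseudo-class for 0-based list indexing. The two forms only stay interchangeable while every tr is a direct sibling (no nested tables), since select('tr') returns all descendant rows in document order. A self-contained check on an invented flat table:

from bs4 import BeautifulSoup

# stand-in for one scraped incident table
html = '''<table>
<tr><td>a</td><td>b</td></tr>
<tr><td>c</td><td>d</td></tr>
<tr><td>e</td><td>description text</td></tr>
</table>'''
table = BeautifulSoup(html, 'html.parser')

old_style = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text()
new_style = table.select('tr')[2].select('td')[1].get_text()
assert old_style == new_style == 'description text'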