diff --git a/get_incidents.py b/get_incidents.py
new file mode 100644
index 0000000..f47936e
--- /dev/null
+++ b/get_incidents.py
@@ -0,0 +1,71 @@
+# This file contains the logic that periodically fetches all pages on the
+# Reachout Berlin homepage, checks if they're already in the database and inserts
+# them if needed.
+
+import sqlite3
+import hashlib
+from scraper.scraper import Scraper
+
+encoding = 'utf-8'
+
+scraper = Scraper()
+articles = scraper.scrape()
+
+conn = sqlite3.connect('violence.db')
+c = conn.cursor()
+
+# setup database schema
+c.execute('''
+    CREATE TABLE IF NOT EXISTS incidents (
+        incident_id INTEGER PRIMARY KEY,
+        date TEXT,
+        place TEXT,
+        additional_place TEXT,
+        description TEXT,
+        hash
+    );
+''')
+
+c.execute('''
+    CREATE INDEX IF NOT EXISTS incidents_date
+    ON incidents (date);
+''')
+
+c.execute('''
+    CREATE INDEX IF NOT EXISTS incidents_hash
+    ON incidents (hash);
+''')
+
+# insert articles if necessary
+select_query = 'SELECT 1 FROM incidents WHERE hash=? LIMIT 1'
+insert_query = '''
+    INSERT INTO incidents (
+        date, place, additional_place, description, hash
+    ) VALUES (?, ?, ?, ?, ?)
+'''
+# NOTE(review): scraper.py appears to build each article as a dict literal and,
+# after dropping its local `date` string, still calls `date.strip()`; confirm
+# that scrape() runs and yields objects exposing the attributes read below.
+for article in articles:
+    fields = (
+        article.date,
+        article.place,
+        article.additional_place,
+        article.description,
+    )
+
+    # build a hash so we can more easily find out if we have an article already;
+    # a missing field (None) is hashed as the empty string
+    h = hashlib.sha256()
+    for field in fields:
+        h.update((field or '').encode(encoding))
+    digest = h.digest()
+
+    c.execute(select_query, (digest,))
+    if c.fetchone() is None:
+        c.execute(insert_query, fields + (digest,))
+
+# sqlite3 opens a transaction implicitly on INSERT; without an explicit commit
+# the new rows are discarded when the connection goes away
+conn.commit()
+conn.close()
diff --git a/scraper/scraper.py b/scraper/scraper.py
index 72ebf83..2a2da71 100644
--- a/scraper/scraper.py
+++ b/scraper/scraper.py
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 from urllib import parse, request
+from datetime import date
 
 
 class Scraper():
@@ -27,7 +28,7 @@ class Scraper():
 
             # headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info)
             headline = table.select('tr:first-child')[0].get_text()
-            date = headline[:headline.find(' ')]
+            year, month, day = headline[:headline.find(' ')].strip().split('-')
             places = headline[headline.find(' ') + 1:]
 
             if places.find(' ') == -1:
@@ -35,9 +36,9 @@ class Scraper():
                 additional = None
             else:
                 district = places[:places.find(' ')]
-                additional = places[places.find(' ') + 1:]
+                additional = places[places.find(' ') + 1:].strip()
 
-            text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text()
+            text = table.select('tr')[2].select('td')[1].get_text()
 
             article = {
                 'date': date.strip(),