From 66b68b269d08f05344fcbfbd43fb1ccd390cd18c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arne=20Schl=C3=BCter?= Date: Mon, 8 Dec 2014 14:40:05 +0100 Subject: [PATCH 1/4] Normalize \r\n to \n and use date objects instead of the original date string --- scraper/scraper.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/scraper/scraper.py b/scraper/scraper.py index ae49d40..57d0cc4 100644 --- a/scraper/scraper.py +++ b/scraper/scraper.py @@ -1,5 +1,6 @@ from bs4 import BeautifulSoup from urllib import parse, request +from datetime import date class Scraper(): @@ -24,7 +25,7 @@ class Scraper(): # headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info) headline = table.select('tr:first-child')[0].get_text() - date = headline[:headline.find(' ')] + year, month, day = headline[:headline.find(' ')].strip().split('-') places = headline[headline.find(' ') + 1:] if places.find(' ') == -1: @@ -32,15 +33,15 @@ class Scraper(): additional = None else: district = places[:places.find(' ')] - additional = places[places.find(' ') + 1:] + additional = places[places.find(' ') + 1:].strip() text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text() article = { - 'date': date.strip(), - 'place': district, + 'date': date(int(year), int(month), int(day)), + 'place': district.strip(), 'additional': additional, - 'text': text.strip() + 'text': text.strip().replace('\r\n', '\n') } articles.append(article) From 58f8e6566973d222e77614ce2a5439dafbaa90da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arne=20Schl=C3=BCter?= Date: Mon, 8 Dec 2014 14:46:26 +0100 Subject: [PATCH 2/4] Simplify code by removing :nth-of-type and accessing the list directly --- scraper/scraper.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scraper/scraper.py b/scraper/scraper.py index 57d0cc4..8841b21 100644 --- a/scraper/scraper.py +++ b/scraper/scraper.py @@ -35,7 +35,7 @@ class Scraper(): district = places[:places.find(' 
')] additional = places[places.find(' ') + 1:].strip() - text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text() + text = table.select('tr')[2].select('td')[1].get_text() article = { 'date': date(int(year), int(month), int(day)), @@ -67,5 +67,6 @@ class Scraper(): # for url in overview_urls: - # while self.has_more_pages(): + # while document: # self.visit_next_page() + # document = self.get_next_page() From e018aead0e950a457d1ae335f047a2e516013f37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arne=20Schl=C3=BCter?= Date: Mon, 8 Dec 2014 15:40:31 +0100 Subject: [PATCH 3/4] Add database setup code --- get_incidents.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 get_incidents.py diff --git a/get_incidents.py b/get_incidents.py new file mode 100644 index 0000000..6dcf0a6 --- /dev/null +++ b/get_incidents.py @@ -0,0 +1,32 @@ +# This file contains the logic that periodically fetches all pages on the +# Reachout Berlin homepage, checks if they're already in the database and inserts +# them if needed. 
+
+import sqlite3
+from scraper.scraper import Scraper
+
+# scraper = Scraper()
+# articles = scraper.scrape()
+
+conn = sqlite3.connect('violence.db')
+c = conn.cursor()
+
+# setup database schema
+c.execute('''
+    CREATE TABLE IF NOT EXISTS incidents (
+        incident_id INTEGER PRIMARY KEY,
+        date TEXT,
+        place TEXT,
+        additional_place TEXT,
+        description TEXT
+    );
+''')
+
+c.execute('''
+    CREATE INDEX IF NOT EXISTS incidents_date
+    ON incidents (date);
+''')
+
+# insert articles
+# for article in articles:
+#     pass

From 55a599a47b69d1289540cd618a6aa61556a00a3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arne=20Schl=C3=BCter?=
Date: Mon, 8 Dec 2014 16:19:18 +0100
Subject: [PATCH 4/4] Write insertion logic for articles

---
 get_incidents.py | 48 ++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/get_incidents.py b/get_incidents.py
index 6dcf0a6..f47936e 100644
--- a/get_incidents.py
+++ b/get_incidents.py
@@ -3,10 +3,13 @@
 # them if needed.
 
 import sqlite3
+import hashlib
 from scraper.scraper import Scraper
 
-# scraper = Scraper()
-# articles = scraper.scrape()
+encoding = 'utf-8'
+
+scraper = Scraper()
+articles = scraper.scrape()
 
 conn = sqlite3.connect('violence.db')
 c = conn.cursor()
@@ -18,7 +21,8 @@ c.execute('''
         date TEXT,
         place TEXT,
         additional_place TEXT,
-        description TEXT
+        description TEXT,
+        hash
     );
 ''')
 
@@ -27,6 +31,38 @@ c.execute('''
     ON incidents (date);
 ''')
 
-# insert articles
-# for article in articles:
-#     pass
+c.execute('''
+    CREATE INDEX IF NOT EXISTS incidents_hash
+    ON incidents (hash);
+''')
+
+# insert articles if necessary
+select_query = 'SELECT * FROM incidents WHERE hash=?'
+insert_query = '''
+    INSERT INTO incidents (
+        date, place, additional_place, description, hash
+    ) VALUES (?, ?, ?, ?, ?)
+'''
+for article in articles:
+    # build a hash so we can more easily find out if we have an article already
+    # NOTE: articles are dicts ('date', 'place', 'additional', 'text') and
+    # article['date'] is a datetime.date; 'additional' may be None.
+    h = hashlib.sha256()
+    h.update(article['date'].isoformat().encode(encoding))
+    h.update(article['place'].encode(encoding))
+    h.update((article['additional'] or '').encode(encoding))
+    h.update(article['text'].encode(encoding))
+    digest = h.digest()
+
+    c.execute(select_query, (digest,))
+
+    if (not c.fetchone()):
+        article_tuple = (
+            article['date'].isoformat(),
+            article['place'],
+            article['additional'],
+            article['text'],
+            digest
+        )
+        c.execute(insert_query, article_tuple)
+
+conn.commit()
+conn.close()