diff --git a/.gitignore b/.gitignore
index 77f5f19..c32cfb5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,9 @@
 www/bower_components
 !www/bower_components/.gitkeep
 
+# Ignore the SQLite database
+violence.db
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/get_incidents.py b/get_incidents.py
index f47936e..372669d 100644
--- a/get_incidents.py
+++ b/get_incidents.py
@@ -4,21 +4,36 @@
 import sqlite3
 import hashlib
+import time
 
 from scraper.scraper import Scraper
 
-encoding = 'utf-8'
+encoding = 'UTF-8'
+
+# helper function for benchmarking
+current_milli_time = lambda: int(round(time.time() * 1000))
+
+#
+# this is where the logic starts:
+#
+
+print('Start crawling…')
+start_time = current_milli_time()
 scraper = Scraper()
 articles = scraper.scrape()
+time_taken = current_milli_time() - start_time
+print('Found {} articles in {} ms'.format(len(articles), time_taken))
 
 conn = sqlite3.connect('violence.db')
 c = conn.cursor()
 
 # setup database schema
+c.execute('PRAGMA encoding = "{}"'.format(encoding))
+
 c.execute('''
     CREATE TABLE IF NOT EXISTS incidents (
         incident_id INTEGER PRIMARY KEY,
         date TEXT,
+        month_only INTEGER,
         place TEXT,
         additional_place TEXT,
         description TEXT,
@@ -40,26 +55,33 @@
 select_query = 'SELECT * FROM incidents WHERE hash=?'
 insert_query = '''
     INSERT INTO incidents (
-        date, place, additional_place, description, hash
-    ) VALUES (?)
+        date, month_only, place, additional_place, description, hash
+    ) VALUES (?,?,?,?,?,?)
 '''
+
+print('Starting database work')
 for article in articles:
     # build a hash so we can more easily find out if we have an article already
-    h = h.sha256()
-    h.update(article.date.encode(encoding))
-    h.update(article.place.encode(encoding))
-    h.update(article.additional_place.encode(encoding))
-    h.update(article.description.encode(encoding))
+    h = hashlib.sha256()
+    h.update(str(article['date']).encode(encoding))
+    h.update(article['place'].encode(encoding))
+    h.update((article['additional_place'] or '').encode(encoding))
+    h.update(article['description'].encode(encoding))
     digest = h.digest()
 
-    c.execute(select_query)
+    c.execute(select_query, (digest,))
+
     # now if it's not in the database insert it
     if (not c.fetchone()):
         article_tuple = (
-            article.date,
-            article.place,
-            article.additional_place,
-            article.description,
+            article['date'],
+            article['month_only'],
+            article['place'],
+            article['additional_place'],
+            article['description'],
            digest
         )
         c.execute(insert_query, article_tuple)
+
+final_time = current_milli_time() - start_time
+print('All done in {} ms'.format(final_time))
diff --git a/scraper/scraper.py b/scraper/scraper.py
index 2a2da71..24485b7 100644
--- a/scraper/scraper.py
+++ b/scraper/scraper.py
@@ -1,6 +1,7 @@
 from bs4 import BeautifulSoup
 from urllib import parse, request
 from datetime import date
+import re
 
 
 class Scraper():
@@ -10,6 +11,11 @@
         self.start = request.urlopen(index)
         self.base_url = parsed_url.scheme + "://" + parsed_url.netloc
 
+        # dates are a bit difficult; usually they're formatted like YYYY-MM-DD,
+        # followed by a space character, but sometimes the day is missing or it's
+        # followed by another character…
+        self.date_matcher = re.compile(r'^(\d{4})-(\d{,2})(-(\d{,2}))?')
+
     def get_next_page(self, document):
         nav_elem = document.select('.nav')[1]
@@ -25,10 +31,17 @@
         articles = []
 
         for table in article_tables:
-            # headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info)
+            # headlines are always YYYY-MM-DD? Berlin-DISTRICT (+ sometimes additional info)
             headline = table.select('tr:first-child')[0].get_text()
-            year, month, day = headline[:headline.find(' ')].strip().split('-')
+            date_match = self.date_matcher.match(headline.strip())
+
+            try:
+                year, month, day = date_match.group(1,2,4)
+            except:
+                print('Failed for headline ' + headline)
+                raise
+
             places = headline[headline.find(' ') + 1:]
 
             if places.find(' ') == -1:
@@ -41,8 +54,9 @@
             text = table.select('tr')[2].select('td')[1].get_text()
 
             article = {
-                'date': date.strip(),
-                'place': district,
+                'date': date(int(year), int(month), int(day) if day else 1),
+                'month_only': day is None,
+                'place': district.strip(),
                 'additional_place': additional,
                 'description': text.strip()
             }
@@ -67,11 +81,11 @@
         articles = []
 
         for url in overview_urls:
-            currentDoc = BeautifulSoup(request.urlopen(url))
+            current_doc = BeautifulSoup(request.urlopen(url))
 
-            while currentDoc:
-                new_articles = self.get_articles_on_page(currentDoc)
+            while current_doc:
+                new_articles = self.get_articles_on_page(current_doc)
                 articles.extend(new_articles)
-                currentDoc = self.get_next_page(currentDoc)
+                current_doc = self.get_next_page(current_doc)
 
         return articles
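A quick standalone sanity check on the new date handling (not part of the patch; the headline strings are invented examples): date_matcher captures the year, the month, and an optional day, and a missing day is exactly what sets month_only.

import re

# same pattern that Scraper.__init__ compiles
date_matcher = re.compile(r'^(\d{4})-(\d{,2})(-(\d{,2}))?')

# one full date, one month-only date (both headlines made up)
for headline in ('2015-07-13 Berlin-Marzahn', '2015-07 Berlin-Mitte'):
    year, month, day = date_matcher.match(headline).group(1, 2, 4)
    print(year, month, day, 'month_only:', day is None)

# prints:
#   2015 07 13 month_only: False
#   2015 07 None month_only: True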
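Similarly, a minimal sketch of the dedup scheme now used in get_incidents.py, with a simplified hypothetical table and invented field values: the sha256 digest over the article fields serves as the lookup key, so running the importer twice leaves only one row.

import hashlib
import sqlite3
from datetime import date

encoding = 'UTF-8'

# invented article in the dict shape the scraper now returns
article = {
    'date': date(2015, 7, 1),
    'place': 'Berlin-Mitte',
    'additional_place': None,  # the `or ''` below keeps None hashable
    'description': 'example text',
}

h = hashlib.sha256()
h.update(str(article['date']).encode(encoding))
h.update(article['place'].encode(encoding))
h.update((article['additional_place'] or '').encode(encoding))
h.update(article['description'].encode(encoding))
digest = h.digest()  # raw bytes, bound as a BLOB parameter

conn = sqlite3.connect(':memory:')
c = conn.cursor()
c.execute('CREATE TABLE incidents (description TEXT, hash BLOB)')

for _ in range(2):  # second pass finds the hash and inserts nothing
    c.execute('SELECT * FROM incidents WHERE hash=?', (digest,))
    if not c.fetchone():
        c.execute('INSERT INTO incidents (description, hash) VALUES (?,?)',
                  (article['description'], digest))

print(c.execute('SELECT COUNT(*) FROM incidents').fetchone()[0])  # -> 1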