diff --git a/.editorconfig b/.editorconfig index 52a01f2..8f754a1 100644 --- a/.editorconfig +++ b/.editorconfig @@ -14,3 +14,6 @@ trim_trailing_whitespace = true [*.py] indent_style = space indent_size = 4 + +[*.md] +insert_final_newline = false diff --git a/README.md b/README.md index 4660847..da3fc83 100644 --- a/README.md +++ b/README.md @@ -2,3 +2,10 @@ A visualization of hate crime in Berlin, starting 2005. The data is kindly provided by [ReachOut - Opferberatung und Bildung gegen Rechtsextremismus, Rassismus und Antisemitismus](http://www.reachoutberlin.de). It is scraped regularly from their webpage and visualized and analyzed by software written by [Joshua Widmann](https://github.com/jshwdmnn) and [Arne Schlüter](https://github.com/aesthaddicts). + +## How do I start? +In order to set up the tables you have to create them first. This is done quite easily using the `python` interpreter: +```python +from models import * +create_tables() +``` \ No newline at end of file diff --git a/get_incidents.py b/get_incidents.py index e46b81d..b3b4cb5 100644 --- a/get_incidents.py +++ b/get_incidents.py @@ -5,6 +5,7 @@ import sqlite3 import hashlib import time +from models import * from scraper.scraper import Scraper encoding = 'UTF-8' @@ -12,10 +13,7 @@ encoding = 'UTF-8' # helper function for benchmarking current_milli_time = lambda: int(round(time.time() * 1000)) -# -# this is where the logic starts: -# - +# First crawl through the whole index and get all articles we can find print('Start crawling…') start_time = current_milli_time() scraper = Scraper() @@ -23,43 +21,9 @@ articles = scraper.scrape() time_taken = current_milli_time() - start_time print('Found {} articles in {} ms'.format(len(articles), time_taken)) -conn = sqlite3.connect('violence.db') -c = conn.cursor() - -# setup database schema -c.execute('PRAGMA encoding = "{}"'.format(encoding)) - -c.execute(''' - CREATE TABLE IF NOT EXISTS incidents ( - incident_id INTEGER PRIMARY KEY, - date TEXT, - month_only 
INTEGER, - place TEXT, - additional_place TEXT, - description TEXT, - hash - ); -''') - -c.execute(''' - CREATE INDEX IF NOT EXISTS incidents_date - ON incidents (date); -''') - -c.execute(''' - CREATE INDEX IF NOT EXISTS incidents_hash - ON incidents (hash); -''') - -# insert articles if necessary -select_query = 'SELECT * FROM incidents WHERE hash=?' -insert_query = ''' - INSERT INTO incidents ( - date, month_only, place, additional_place, description, hash - ) VALUES (?,?,?,?,?,?) -''' - +# Now fill the database print('Starting database work') + for article in articles: # build a hash so we can more easily find out if we have an article already h = hashlib.sha256() @@ -69,20 +33,18 @@ for article in articles: h.update(article['description'].encode(encoding)) digest = h.digest() - c.execute(select_query, (digest,)) - - # now if it's not in the database insert it - if (not c.fetchone()): - article_tuple = ( - article['date'], - article['month_only'], - article['place'], - article['additional_place'], - article['description'], - digest + try: + Article.get(Article.hash == digest) + except Article.DoesNotExist: + # article not found + Article.create( + date = article['date'], + month_only = article['month_only'], + place = article['place'], + additional_place = article['additional_place'], + description = article['description'], + hash = digest ) - c.execute(insert_query, article_tuple) -c.close() final_time = current_milli_time() - start_time print('All done in {} ms'.format(final_time)) diff --git a/models.py b/models.py new file mode 100644 index 0000000..07de832 --- /dev/null +++ b/models.py @@ -0,0 +1,23 @@ +from peewee import * + +db = SqliteDatabase('violence.db') + +class BaseModel(Model): + class Meta: + database = db + +class Article(BaseModel): + """ + An article is a single incident as crawled from the reach-out webpage + """ + date = DateField(index=True) + month_only = BooleanField(default=False) + place = CharField() + additional_place = CharField(null=True) + description
= TextField() + hash = BlobField(index=True) + +# Set up the tables +def create_tables(): + db.connect() + db.create_tables([Article]) diff --git a/requirements.txt b/requirements.txt index 3e74a0b..4dcf412 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ beautifulsoup4==4.3.2 +peewee==2.4.4