mirror of
https://github.com/heyarne/berliner-winter.git
synced 2026-05-06 19:23:39 +02:00
Use peewee as model and rewrite the code
This commit is contained in:
parent
0e095dbb63
commit
c1ac5e5ed4
5 changed files with 49 additions and 53 deletions
|
|
@ -14,3 +14,6 @@ trim_trailing_whitespace = true
|
||||||
[*.py]
|
[*.py]
|
||||||
indent_style = space
|
indent_style = space
|
||||||
indent_size = 4
|
indent_size = 4
|
||||||
|
|
||||||
|
[*.md]
|
||||||
|
insert_final_newline = false
|
||||||
|
|
|
||||||
|
|
@ -2,3 +2,10 @@
|
||||||
A visualization of hate crime in Berlin, starting 2005.
|
A visualization of hate crime in Berlin, starting 2005.
|
||||||
The data is kindly provided by [ReachOut - Opferberatung und Bildung gegen Rechtsextremismus, Rassismus und Antisemitismus](http://www.reachoutberlin.de).
|
The data is kindly provided by [ReachOut - Opferberatung und Bildung gegen Rechtsextremismus, Rassismus und Antisemitismus](http://www.reachoutberlin.de).
|
||||||
It is scraped regularly from their webpage and visualized and analyzed by software written by [Joshua Widmann](https://github.com/jshwdmnn) and [Arne Schlüter](https://github.com/aesthaddicts).
|
It is scraped regularly from their webpage and visualized and analyzed by software written by [Joshua Widmann](https://github.com/jshwdmnn) and [Arne Schlüter](https://github.com/aesthaddicts).
|
||||||
|
|
||||||
|
## How do I start?
|
||||||
|
Before the first run you have to create the database tables. This is done quite easily from the `python` interpreter:
|
||||||
|
```python
|
||||||
|
from models import *
|
||||||
|
create_tables()
|
||||||
|
```
|
||||||
|
|
@ -5,6 +5,7 @@
|
||||||
import sqlite3
|
import sqlite3
|
||||||
import hashlib
|
import hashlib
|
||||||
import time
|
import time
|
||||||
|
from models import *
|
||||||
from scraper.scraper import Scraper
|
from scraper.scraper import Scraper
|
||||||
|
|
||||||
encoding = 'UTF-8'
|
encoding = 'UTF-8'
|
||||||
|
|
@ -12,10 +13,7 @@ encoding = 'UTF-8'
|
||||||
# helper function for benchmarking
|
# helper function for benchmarking
|
||||||
current_milli_time = lambda: int(round(time.time() * 1000))
|
current_milli_time = lambda: int(round(time.time() * 1000))
|
||||||
|
|
||||||
#
|
# First crawl through the whole index and get all articles we can find
|
||||||
# this is where the logic starts:
|
|
||||||
#
|
|
||||||
|
|
||||||
print('Start crawling…')
|
print('Start crawling…')
|
||||||
start_time = current_milli_time()
|
start_time = current_milli_time()
|
||||||
scraper = Scraper()
|
scraper = Scraper()
|
||||||
|
|
@ -23,43 +21,9 @@ articles = scraper.scrape()
|
||||||
time_taken = current_milli_time() - start_time
|
time_taken = current_milli_time() - start_time
|
||||||
print('Found {} articles in {} ms'.format(len(articles), time_taken))
|
print('Found {} articles in {} ms'.format(len(articles), time_taken))
|
||||||
|
|
||||||
conn = sqlite3.connect('violence.db')
|
# Now fill the database
|
||||||
c = conn.cursor()
|
|
||||||
|
|
||||||
# setup database schema
|
|
||||||
c.execute('PRAGMA encoding = "{}"'.format(encoding))
|
|
||||||
|
|
||||||
c.execute('''
|
|
||||||
CREATE TABLE IF NOT EXISTS incidents (
|
|
||||||
incident_id INTEGER PRIMARY KEY,
|
|
||||||
date TEXT,
|
|
||||||
month_only INTEGER,
|
|
||||||
place TEXT,
|
|
||||||
additional_place TEXT,
|
|
||||||
description TEXT,
|
|
||||||
hash
|
|
||||||
);
|
|
||||||
''')
|
|
||||||
|
|
||||||
c.execute('''
|
|
||||||
CREATE INDEX IF NOT EXISTS incidents_date
|
|
||||||
ON incidents (date);
|
|
||||||
''')
|
|
||||||
|
|
||||||
c.execute('''
|
|
||||||
CREATE INDEX IF NOT EXISTS incidents_hash
|
|
||||||
ON incidents (hash);
|
|
||||||
''')
|
|
||||||
|
|
||||||
# insert articles if necessary
|
|
||||||
select_query = 'SELECT * FROM incidents WHERE hash=?'
|
|
||||||
insert_query = '''
|
|
||||||
INSERT INTO incidents (
|
|
||||||
date, month_only, place, additional_place, description, hash
|
|
||||||
) VALUES (?,?,?,?,?,?)
|
|
||||||
'''
|
|
||||||
|
|
||||||
print('Starting database work')
|
print('Starting database work')
|
||||||
|
|
||||||
for article in articles:
|
for article in articles:
|
||||||
# build a hash so we can more easily find out if we have an article already
|
# build a hash so we can more easily find out if we have an article already
|
||||||
h = hashlib.sha256()
|
h = hashlib.sha256()
|
||||||
|
|
@ -69,20 +33,18 @@ for article in articles:
|
||||||
h.update(article['description'].encode(encoding))
|
h.update(article['description'].encode(encoding))
|
||||||
digest = h.digest()
|
digest = h.digest()
|
||||||
|
|
||||||
c.execute(select_query, (digest,))
|
try:
|
||||||
|
Article.get(Article.hash == digest)
|
||||||
# now if it's not in the database insert it
|
except:
|
||||||
if (not c.fetchone()):
|
# article not found
|
||||||
article_tuple = (
|
Article.create(
|
||||||
article['date'],
|
date = article['date'],
|
||||||
article['month_only'],
|
month_only = article['month_only'],
|
||||||
article['place'],
|
place = article['place'],
|
||||||
article['additional_place'],
|
additional_place = article['additional_place'],
|
||||||
article['description'],
|
description = article['description'],
|
||||||
digest
|
hash = digest
|
||||||
)
|
)
|
||||||
c.execute(insert_query, article_tuple)
|
|
||||||
c.close()
|
|
||||||
|
|
||||||
final_time = current_milli_time() - start_time
|
final_time = current_milli_time() - start_time
|
||||||
print('All done in {} ms'.format(final_time))
|
print('All done in {} ms'.format(final_time))
|
||||||
|
|
|
||||||
23
models.py
Normal file
23
models.py
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
from peewee import *
|
||||||
|
|
||||||
|
db = SqliteDatabase('violence.db')
|
||||||
|
|
||||||
|
class BaseModel(Model):
    """Common parent for the models in this module.

    Its only job is to point peewee at the module-level ``db`` handle so
    that subclasses do not have to repeat the ``Meta`` declaration.
    """

    class Meta:
        database = db
|
||||||
|
|
||||||
|
class Article(BaseModel):
    """A single incident as crawled from the reach-out webpage."""

    # Date of the incident; indexed.
    date = DateField(index=True)
    # NOTE(review): presumably set when only the month of the incident is
    # known -- confirm against the scraper.
    month_only = BooleanField(default=False)
    place = CharField()
    # Optional extra location detail; may be NULL.
    additional_place = CharField(null=True)
    description = TextField()
    # Digest computed by the crawler to detect articles that are already
    # stored; indexed because the crawler looks articles up by it.
    hash = BlobField(index=True)
|
||||||
|
|
||||||
|
# Set up the tables
|
||||||
|
def create_tables():
    """Connect to the database and create the table for every model.

    Meant to be run once from an interactive interpreter before the first
    crawl. Takes no arguments and returns nothing.
    """
    # The module-level handle is called `db`; `database` is only the
    # attribute name inside BaseModel.Meta and is not defined at module
    # scope, so referring to it here raised NameError.
    db.connect()
    db.create_tables([Article])
|
||||||
|
|
@ -1 +1,2 @@
|
||||||
beautifulsoup4==4.3.2
|
beautifulsoup4==4.3.2
|
||||||
|
peewee==2.4.4
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue