# Mirror of https://github.com/heyarne/berliner-winter.git
# (synced 2026-05-06 19:23:39 +02:00; 88 lines, 2.3 KiB, Python)

# This file contains the logic that periodically fetches all pages on the
# Reachout Berlin homepage, checks if they're already in the database and inserts
# them if needed.

import hashlib
import sqlite3
import time
from contextlib import closing

from scraper.scraper import Scraper

# Text encoding used for the SQLite "PRAGMA encoding" setting and for turning
# article fields into bytes before hashing them.
encoding = 'UTF-8'


def current_milli_time():
    """Return the current time in whole milliseconds since the epoch.

    Helper for benchmarking: callers subtract two readings to get an elapsed
    duration in milliseconds. The value is derived from ``time.time()``,
    rounded to the nearest millisecond and returned as an ``int``.
    """
    return int(round(time.time() * 1000))

#
# this is where the logic starts:
#

# Run the scraper once and measure how long the crawl takes. `start_time` is
# kept at module level because the final "All done" timing is measured from it.
print('Start crawling…')
start_time = current_milli_time()
scraper = Scraper()
articles = scraper.scrape()
time_taken = current_milli_time() - start_time
print('Found {} articles in {} ms'.format(len(articles), time_taken))

def _article_digest(article):
    """Return the SHA-256 digest (bytes) used to recognise *article*.

    The digest covers, in this order, the article's ``date`` (converted with
    ``str``), ``place``, ``additional_place`` and ``description``, each
    encoded with the module-level ``encoding``. A falsy ``additional_place``
    (e.g. ``None``) is hashed as the empty string.
    """
    h = hashlib.sha256()
    h.update(str(article['date']).encode(encoding))
    h.update(article['place'].encode(encoding))
    h.update((article['additional_place'] or '').encode(encoding))
    h.update(article['description'].encode(encoding))
    # NOTE(review): the fields are fed in back-to-back without a separator, so
    # two articles whose field boundaries differ but whose concatenation is
    # equal would share a digest. Left unchanged so that digests stay
    # compatible with rows already stored in the database.
    return h.digest()


# insert articles if necessary
select_query = 'SELECT * FROM incidents WHERE hash=?'
insert_query = '''
    INSERT INTO incidents (
        date, month_only, place, additional_place, description, hash
    ) VALUES (?,?,?,?,?,?)
'''

# closing() releases the connection even if a statement raises; using the
# connection itself as a context manager commits the pending transaction on
# success and rolls it back on error. Without that commit the inserted rows
# would be discarded when the process exits.
with closing(sqlite3.connect('violence.db')) as conn, conn:
    c = conn.cursor()

    # setup database schema
    c.execute('PRAGMA encoding = "{}"'.format(encoding))

    c.execute('''
        CREATE TABLE IF NOT EXISTS incidents (
            incident_id INTEGER PRIMARY KEY,
            date TEXT,
            month_only INTEGER,
            place TEXT,
            additional_place TEXT,
            description TEXT,
            hash
        );
    ''')

    c.execute('''
        CREATE INDEX IF NOT EXISTS incidents_date
        ON incidents (date);
    ''')

    c.execute('''
        CREATE INDEX IF NOT EXISTS incidents_hash
        ON incidents (hash);
    ''')

    print('Starting database work')
    for article in articles:
        # build a hash so we can more easily find out if we have an article
        # already
        digest = _article_digest(article)

        c.execute(select_query, (digest,))

        # now if it's not in the database insert it
        if c.fetchone() is None:
            article_tuple = (
                article['date'],
                article['month_only'],
                article['place'],
                article['additional_place'],
                article['description'],
                digest,
            )
            c.execute(insert_query, article_tuple)
    c.close()

# Total run time, measured from the start of the crawl and including the
# database commit above.
final_time = current_milli_time() - start_time
print('All done in {} ms'.format(final_time))