From 55a599a47b69d1289540cd618a6aa61556a00a3b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arne=20Schl=C3=BCter?=
Date: Mon, 8 Dec 2014 16:19:18 +0100
Subject: [PATCH] Write insertion logic for articles

---
Review notes (not part of the commit message):
- the hash object is now created with hashlib.sha256(); the previous
  `h = h.sha256()` read `h` before assignment and raised NameError
- the INSERT statement now has five placeholders, one per listed column
- the SELECT is now executed with the digest bound to its placeholder
- the transaction is committed and the connection closed after the loop,
  otherwise the inserted rows are discarded when the script exits
- the length of the last hunk and the diffstat were updated for the added
  lines; the post-image blob id on the index line was not recomputed

 get_incidents.py | 50 ++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/get_incidents.py b/get_incidents.py
index 6dcf0a6..f47936e 100644
--- a/get_incidents.py
+++ b/get_incidents.py
@@ -3,10 +3,13 @@
 # them if needed.
 
 import sqlite3
+import hashlib
 from scraper.scraper import Scraper
 
-# scraper = Scraper()
-# articles = scraper.scrape()
+encoding = 'utf-8'
+
+scraper = Scraper()
+articles = scraper.scrape()
 
 conn = sqlite3.connect('violence.db')
 c = conn.cursor()
@@ -18,7 +21,8 @@ c.execute('''
         date TEXT,
         place TEXT,
         additional_place TEXT,
-        description TEXT
+        description TEXT,
+        hash
     );
 ''')
 
@@ -27,6 +31,40 @@ c.execute('''
     ON incidents (date);
 ''')
 
-# insert articles
-# for article in articles:
-#     pass
+c.execute('''
+    CREATE INDEX IF NOT EXISTS incidents_hash
+    ON incidents (hash);
+''')
+
+# insert articles if necessary
+select_query = 'SELECT * FROM incidents WHERE hash=?'
+insert_query = '''
+    INSERT INTO incidents (
+        date, place, additional_place, description, hash
+    ) VALUES (?, ?, ?, ?, ?)
+'''
+for article in articles:
+    # build a hash so we can more easily find out if we have an article already
+    h = hashlib.sha256()
+    h.update(article.date.encode(encoding))
+    h.update(article.place.encode(encoding))
+    h.update(article.additional_place.encode(encoding))
+    h.update(article.description.encode(encoding))
+    digest = h.digest()
+
+    # look the article up by its digest; the query has one placeholder
+    c.execute(select_query, (digest,))
+
+    if (not c.fetchone()):
+        article_tuple = (
+            article.date,
+            article.place,
+            article.additional_place,
+            article.description,
+            digest
+        )
+        c.execute(insert_query, article_tuple)
+
+# sqlite3 opens a transaction implicitly; without a commit the inserts are lost
+conn.commit()
+conn.close()