mirror of https://github.com/heyarne/berliner-winter.git
synced 2026-05-06 19:23:39 +02:00

commit 5306e6dab4

Merge branch 'dev-arne'

Conflicts:
	scraper/scraper.py

2 changed files with 69 additions and 3 deletions
get_incidents.py (new file, 65 lines)

@@ -0,0 +1,65 @@
# This file contains the logic that periodically fetches all pages on the
# Reachout Berlin homepage, checks if they're already in the database and
# inserts them if needed.

import sqlite3
import hashlib

from scraper.scraper import Scraper

encoding = 'utf-8'

scraper = Scraper()
articles = scraper.scrape()

conn = sqlite3.connect('violence.db')
c = conn.cursor()

# set up the database schema
c.execute('''
CREATE TABLE IF NOT EXISTS incidents (
    incident_id INTEGER PRIMARY KEY,
    date TEXT,
    place TEXT,
    additional_place TEXT,
    description TEXT,
    hash BLOB
);
''')

c.execute('''
CREATE INDEX IF NOT EXISTS incidents_date
ON incidents (date);
''')

c.execute('''
CREATE INDEX IF NOT EXISTS incidents_hash
ON incidents (hash);
''')

# insert articles if necessary
select_query = 'SELECT * FROM incidents WHERE hash=?'
insert_query = '''
INSERT INTO incidents (
    date, place, additional_place, description, hash
) VALUES (?, ?, ?, ?, ?)
'''

for article in articles:
    # build a hash so we can more easily find out if we have an article already
    h = hashlib.sha256()
    h.update(article.date.encode(encoding))
    h.update(article.place.encode(encoding))
    h.update(article.additional_place.encode(encoding))
    h.update(article.description.encode(encoding))
    digest = h.digest()

    # only insert the article if no row with this hash exists yet
    c.execute(select_query, (digest,))
    if not c.fetchone():
        article_tuple = (
            article.date,
            article.place,
            article.additional_place,
            article.description,
            digest
        )
        c.execute(insert_query, article_tuple)

# persist the inserts; without an explicit commit sqlite3 discards them on exit
conn.commit()
conn.close()
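The dedup scheme above relies on the digest being stable across runs: scraping the same incident twice must produce the same hash, so the SELECT-by-hash check skips the re-insert. A minimal sketch of that property, with invented field values standing in for real scraped data:

import hashlib

def content_hash(fields, encoding='utf-8'):
    # concatenate the encoded fields into one SHA-256, as the loop above does
    h = hashlib.sha256()
    for field in fields:
        h.update(field.encode(encoding))
    return h.digest()

fields = ('2016-01-05', 'Berlin-Neukölln', 'U-Bahnhof', 'some description')
assert content_hash(fields) == content_hash(fields)  # same input, same digest

One caveat of hashing a plain concatenation: without a separator between fields, ('ab', 'c') and ('a', 'bc') produce the same digest. The date and place formats make such collisions unlikely here, but joining with a delimiter byte would rule them out entirely.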
scraper/scraper.py

@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 from urllib import parse, request
+from datetime import date

 class Scraper():
@@ -27,7 +28,7 @@ class Scraper():
 # headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info)
 headline = table.select('tr:first-child')[0].get_text()

-date = headline[:headline.find(' ')]
+year, month, day = headline[:headline.find(' ')].strip().split('-')
 places = headline[headline.find(' ') + 1:]

 if places.find(' ') == -1:
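The split into year, month and day, together with the datetime import added at the top of the file, suggests the parts are recombined into a date object somewhere outside this hunk; the removed date = ... assignment would also have shadowed that import. A sketch of that reading, on an invented headline:

from datetime import date

headline = '2016-01-05 Berlin-Neukölln U-Bahnhof'  # invented example
year, month, day = headline[:headline.find(' ')].strip().split('-')
parsed = date(int(year), int(month), int(day))  # assumed recombination, not shown in this diff
print(parsed.isoformat())  # 2016-01-05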
@@ -35,9 +36,9 @@ class Scraper():
     additional = None
 else:
     district = places[:places.find(' ')]
-    additional = places[places.find(' ') + 1:]
+    additional = places[places.find(' ') + 1:].strip()

-text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text()
+text = table.select('tr')[2].select('td')[1].get_text()

 article = {
     'date': date.strip(),
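Both the old and the new selector chain target the third row's second cell; the change swaps the 1-based CSS :nth-of-type pseudo-class for 0-based list indexing. The two forms only stay interchangeable while every tr is a direct sibling (no nested tables), since select('tr') returns all descendant rows in document order. A self-contained check on an invented flat table:

from bs4 import BeautifulSoup

# stand-in for one scraped incident table
html = '''<table>
<tr><td>a</td><td>b</td></tr>
<tr><td>c</td><td>d</td></tr>
<tr><td>e</td><td>description text</td></tr>
</table>'''
table = BeautifulSoup(html, 'html.parser')

old_style = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text()
new_style = table.select('tr')[2].select('td')[1].get_text()
assert old_style == new_style == 'description text'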