Mirror of https://github.com/heyarne/berliner-winter.git

Crawl all pages and insert them into the database

parent 5306e6dab4
commit 36df116ed0

3 changed files with 60 additions and 21 deletions

.gitignore (vendored)

@@ -1,6 +1,9 @@
 www/bower_components
 !www/bower_components/.gitkeep

+# Ignore the SQLite database
+violence.db
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

@@ -4,21 +4,36 @@
 import sqlite3
+import hashlib
+import time
 from scraper.scraper import Scraper

-encoding = 'utf-8'
+encoding = 'UTF-8'

+# helper function for benchmarking
+current_milli_time = lambda: int(round(time.time() * 1000))
+
+#
+# this is where the logic starts:
+#
+
+print('Start crawling…')
+start_time = current_milli_time()
 scraper = Scraper()
 articles = scraper.scrape()
+time_taken = current_milli_time() - start_time
+print('Found {} articles in {} ms'.format(len(articles), time_taken))

 conn = sqlite3.connect('violence.db')
 c = conn.cursor()

 # setup database schema
 c.execute('PRAGMA encoding = "{}"'.format(encoding))

 c.execute('''
 CREATE TABLE IF NOT EXISTS incidents (
     incident_id INTEGER PRIMARY KEY,
     date TEXT,
+    month_only INTEGER,
     place TEXT,
     additional_place TEXT,
     description TEXT,
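
The new month_only column flags incidents whose headline names only a month, so a reader of the table can tell a real 2015-03-14 from the placeholder day the scraper fills in later in this commit. A minimal sketch of how a consumer might use the flag, assuming dates end up stored as ISO 'YYYY-MM-DD' text (which is what the default sqlite3 date adapter produces); the format_incident helper and the sample rows are invented for illustration and are not part of this repository:

    import sqlite3

    def format_incident(date_text, month_only):
        # month-only rows carry a placeholder day of 1; show just year and month
        return date_text[:7] if month_only else date_text

    conn = sqlite3.connect(':memory:')
    c = conn.cursor()
    c.execute('CREATE TABLE incidents (date TEXT, month_only INTEGER)')
    c.executemany('INSERT INTO incidents VALUES (?,?)',
                  [('2015-03-14', 0), ('2015-03-01', 1)])

    for date_text, month_only in c.execute('SELECT date, month_only FROM incidents'):
        print(format_incident(date_text, month_only))
    # prints 2015-03-14, then 2015-03
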
@@ -40,26 +55,33 @@ c.execute('''
 select_query = 'SELECT * FROM incidents WHERE hash=?'
 insert_query = '''
 INSERT INTO incidents (
-    date, place, additional_place, description, hash
-) VALUES (?)
+    date, month_only, place, additional_place, description, hash
+) VALUES (?,?,?,?,?,?)
 '''

+print('Starting database work')
 for article in articles:
     # build a hash so we can more easily find out if we have an article already
-    h = h.sha256()
-    h.update(article.date.encode(encoding))
-    h.update(article.place.encode(encoding))
-    h.update(article.additional_place.encode(encoding))
-    h.update(article.description.encode(encoding))
+    h = hashlib.sha256()
+    h.update(str(article['date']).encode(encoding))
+    h.update(article['place'].encode(encoding))
+    h.update((article['additional_place'] or '').encode(encoding))
+    h.update(article['description'].encode(encoding))
+    digest = h.digest()

-    c.execute(select_query)
+    c.execute(select_query, (digest,))

     # now if it's not in the database insert it
     if (not c.fetchone()):
         article_tuple = (
-            article.date,
-            article.place,
-            article.additional_place,
-            article.description,
+            article['date'],
+            article['month_only'],
+            article['place'],
+            article['additional_place'],
+            article['description'],
+            digest
         )
         c.execute(insert_query, article_tuple)

+final_time = current_milli_time() - start_time
+print('All done in {} ms'.format(final_time))
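
The dedup in this hunk boils down to: hash the article's fields, look the digest up, and insert only when it is missing. A rough, self-contained sketch of that select-then-insert pattern (in-memory database, a single made-up field, and a hypothetical digest_of helper; none of it is code from this repository):

    import hashlib
    import sqlite3

    conn = sqlite3.connect(':memory:')
    c = conn.cursor()
    c.execute('CREATE TABLE incidents (description TEXT, hash BLOB)')

    def digest_of(article):
        # hash the fields that identify an incident; missing values become ''
        h = hashlib.sha256()
        h.update((article.get('description') or '').encode('UTF-8'))
        return h.digest()

    for article in [{'description': 'first report'},
                    {'description': 'first report'},   # duplicate, will be skipped
                    {'description': 'second report'}]:
        digest = digest_of(article)
        c.execute('SELECT 1 FROM incidents WHERE hash=?', (digest,))
        if not c.fetchone():
            c.execute('INSERT INTO incidents (description, hash) VALUES (?,?)',
                      (article['description'], digest))

    print(c.execute('SELECT COUNT(*) FROM incidents').fetchone()[0])  # 2

An alternative with the same effect would be a UNIQUE constraint on the hash column plus INSERT OR IGNORE, which saves the extra SELECT per article.
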
scraper/scraper.py

@@ -1,6 +1,7 @@
 from bs4 import BeautifulSoup
 from urllib import parse, request
+from datetime import date
 import re

 class Scraper():

@@ -10,6 +11,11 @@ class Scraper():
         self.start = request.urlopen(index)
         self.base_url = parsed_url.scheme + "://" + parsed_url.netloc

+        # dates are a bit difficult; usually they're formatted like YYYY-MM-DD,
+        # followed by a space character, but sometimes the day is missing or it's
+        # followed by another character…
+        self.date_matcher = re.compile('^(\d{4})-(\d{,2})(-(\d{,2}))?')
+
     def get_next_page(self, document):
         nav_elem = document.select('.nav')[1]

@@ -25,10 +31,17 @@ class Scraper():
         articles = []

         for table in article_tables:
-            # headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info)
+            # headlines are always YYYY-MM-DD? Berlin-DISTRICT (+ sometimes additional info)
             headline = table.select('tr:first-child')[0].get_text()

-            year, month, day = headline[:headline.find(' ')].strip().split('-')
+            date_match = self.date_matcher.match(headline.strip())
+
+            try:
+                year, month, day = date_match.group(1,2,4)
+            except:
+                print('Failed for headline ' + headline)
+                raise

             places = headline[headline.find(' ') + 1:]

             if places.find(' ') == -1:

@@ -41,8 +54,9 @@ class Scraper():
             text = table.select('tr')[2].select('td')[1].get_text()

             article = {
-                'date': date.strip(),
-                'place': district,
+                'date': date(int(year), int(month), int(day) if day else 1),
+                'month_only': day is None,
+                'place': district.strip(),
                 'additional_place': additional,
                 'description': text.strip()
             }

@@ -67,11 +81,11 @@ class Scraper():
         articles = []

         for url in overview_urls:
-            currentDoc = BeautifulSoup(request.urlopen(url))
+            current_doc = BeautifulSoup(request.urlopen(url))

-            while currentDoc:
-                new_articles = self.get_articles_on_page(currentDoc)
+            while current_doc:
+                new_articles = self.get_articles_on_page(current_doc)
                 articles.extend(new_articles)
-                currentDoc = self.get_next_page(currentDoc)
+                current_doc = self.get_next_page(current_doc)

         return articles
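
The headline parsing hinges on the new date_matcher regex: group 1 is the year, group 2 the month, and group 4 the optional day, so group(1, 2, 4) yields None for the day when a headline only names a month; the article then gets a placeholder day of 1 and month_only set. A quick standalone illustration of both headline shapes (the sample headlines are invented):

    import re
    from datetime import date

    date_matcher = re.compile(r'^(\d{4})-(\d{,2})(-(\d{,2}))?')

    for headline in ('2015-03-14 Berlin-Neukölln', '2015-03 Berlin-Wedding'):
        year, month, day = date_matcher.match(headline).group(1, 2, 4)
        incident_date = date(int(year), int(month), int(day) if day else 1)
        print(incident_date, 'month_only:', day is None)
    # 2015-03-14 month_only: False
    # 2015-03-01 month_only: True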