mirror of https://github.com/heyarne/berliner-winter.git
synced 2026-05-06 19:23:39 +02:00
Crawl all pages and insert them into the database

parent 5306e6dab4
commit 36df116ed0

3 changed files with 60 additions and 21 deletions
.gitignore (vendored)

@@ -1,6 +1,9 @@
 www/bower_components
 !www/bower_components/.gitkeep
 
+# Ignore the SQLite database
+violence.db
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

@@ -4,21 +4,36 @@
 import sqlite3
 import hashlib
+import time
 from scraper.scraper import Scraper
 
-encoding = 'utf-8'
+encoding = 'UTF-8'
 
+# helper function for benchmarking
+current_milli_time = lambda: int(round(time.time() * 1000))
+
+#
+# this is where the logic starts:
+#
+
+print('Start crawling…')
+start_time = current_milli_time()
 scraper = Scraper()
 articles = scraper.scrape()
+time_taken = current_milli_time() - start_time
+print('Found {} articles in {} ms'.format(len(articles), time_taken))
 
 conn = sqlite3.connect('violence.db')
 c = conn.cursor()
 
 # setup database schema
+c.execute('PRAGMA encoding = "{}"'.format(encoding))
 c.execute('''
 CREATE TABLE IF NOT EXISTS incidents (
     incident_id INTEGER PRIMARY KEY,
     date TEXT,
+    month_only INTEGER,
     place TEXT,
     additional_place TEXT,
     description TEXT,
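
For orientation, a minimal standalone sketch of the schema setup this hunk arrives at, not the project's actual script: the table and column names follow the diff, but the in-memory connection, the hash column's BLOB type, and the UNIQUE constraint are assumptions (the hunk cuts off before the end of the column list). One relevant detail: SQLite only honours PRAGMA encoding before the first table is created, which is presumably why the pragma is issued ahead of the CREATE TABLE.

import sqlite3

# Sketch only: an in-memory database instead of violence.db.
conn = sqlite3.connect(':memory:')
c = conn.cursor()

# Must run before any table exists, otherwise SQLite keeps its current encoding.
c.execute('PRAGMA encoding = "UTF-8"')
c.execute('''
CREATE TABLE IF NOT EXISTS incidents (
    incident_id INTEGER PRIMARY KEY,
    date TEXT,
    month_only INTEGER,        -- 1 when the source only gave year and month
    place TEXT,
    additional_place TEXT,
    description TEXT,
    hash BLOB UNIQUE           -- assumed type/constraint; the diff truncates before this line
)
''')
conn.commit()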

@@ -40,26 +55,33 @@ c.execute('''
 select_query = 'SELECT * FROM incidents WHERE hash=?'
 insert_query = '''
 INSERT INTO incidents (
-    date, place, additional_place, description, hash
-) VALUES (?)
+    date, month_only, place, additional_place, description, hash
+) VALUES (?,?,?,?,?,?)
 '''
 
+print('Starting database work')
 for article in articles:
     # build a hash so we can more easily find out if we have an article already
-    h = h.sha256()
-    h.update(article.date.encode(encoding))
-    h.update(article.place.encode(encoding))
-    h.update(article.additional_place.encode(encoding))
-    h.update(article.description.encode(encoding))
+    h = hashlib.sha256()
+    h.update(str(article['date']).encode(encoding))
+    h.update(article['place'].encode(encoding))
+    h.update((article['additional_place'] or '').encode(encoding))
+    h.update(article['description'].encode(encoding))
     digest = h.digest()
 
-    c.execute(select_query)
+    c.execute(select_query, (digest,))
 
+    # now if it's not in the database insert it
     if (not c.fetchone()):
         article_tuple = (
-            article.date,
-            article.place,
-            article.additional_place,
-            article.description,
+            article['date'],
+            article['month_only'],
+            article['place'],
+            article['additional_place'],
+            article['description'],
             digest
         )
         c.execute(insert_query, article_tuple)
 
+final_time = current_milli_time() - start_time
+print('All done in {} ms'.format(final_time))
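
The loop above amounts to a fingerprint-based insert-if-absent: hash the scraped fields with SHA-256, look the digest up, insert only when it is unseen, and let the ? placeholders handle quoting. A condensed sketch of that pattern, assuming the article dict shape the scraper returns and the incidents schema set up earlier; article_digest and insert_if_new are hypothetical helper names, and passing str(article['date']) instead of the date object is a simplification for the sketch.

import hashlib

encoding = 'UTF-8'

def article_digest(article):
    # Fingerprint the fields that identify an incident, so re-crawling the
    # same pages does not create duplicate rows.
    h = hashlib.sha256()
    h.update(str(article['date']).encode(encoding))
    h.update(article['place'].encode(encoding))
    h.update((article['additional_place'] or '').encode(encoding))
    h.update(article['description'].encode(encoding))
    return h.digest()

def insert_if_new(cursor, article):
    digest = article_digest(article)
    cursor.execute('SELECT * FROM incidents WHERE hash=?', (digest,))
    if cursor.fetchone() is None:
        cursor.execute('''
            INSERT INTO incidents (
                date, month_only, place, additional_place, description, hash
            ) VALUES (?,?,?,?,?,?)
        ''', (str(article['date']), article['month_only'], article['place'],
              article['additional_place'], article['description'], digest))

Storing h.digest() keeps the hash as a BLOB; h.hexdigest() would work just as well if a readable TEXT key is preferred.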

@@ -1,6 +1,7 @@
 from bs4 import BeautifulSoup
 from urllib import parse, request
 from datetime import date
+import re
 
 
 class Scraper():

@@ -10,6 +11,11 @@ class Scraper():
         self.start = request.urlopen(index)
         self.base_url = parsed_url.scheme + "://" + parsed_url.netloc
 
+        # dates are a bit difficult; usually they're formatted like YYYY-MM-DD,
+        # followed by a space character, but sometimes the day is missing or it's
+        # followed by another character…
+        self.date_matcher = re.compile('^(\d{4})-(\d{,2})(-(\d{,2}))?')
+
     def get_next_page(self, document):
         nav_elem = document.select('.nav')[1]
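
To see what the new date_matcher tolerates: the day group is optional, so group(4) comes back as None for month-only headlines. A quick illustration with invented headline prefixes (the raw-string literal below is equivalent to the pattern in the diff):

import re

date_matcher = re.compile(r'^(\d{4})-(\d{,2})(-(\d{,2}))?')

for headline in ['2013-11-23 Berlin-Neukölln ...', '2013-11 Berlin-Mitte ...']:
    m = date_matcher.match(headline)
    year, month, day = m.group(1, 2, 4)
    print(year, month, day)   # day is None for the month-only headline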

@@ -25,10 +31,17 @@ class Scraper():
         articles = []
 
         for table in article_tables:
-            # headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info)
+            # headlines are always YYYY-MM-DD? Berlin-DISTRICT (+ sometimes additional info)
             headline = table.select('tr:first-child')[0].get_text()
 
-            year, month, day = headline[:headline.find(' ')].strip().split('-')
+            date_match = self.date_matcher.match(headline.strip())
+
+            try:
+                year, month, day = date_match.group(1,2,4)
+            except:
+                print('Failed for headline ' + headline)
+                raise
 
             places = headline[headline.find(' ') + 1:]
 
             if places.find(' ') == -1:

@@ -41,8 +54,9 @@ class Scraper():
             text = table.select('tr')[2].select('td')[1].get_text()
 
             article = {
-                'date': date.strip(),
-                'place': district,
+                'date': date(int(year), int(month), int(day) if day else 1),
+                'month_only': day is None,
+                'place': district.strip(),
                 'additional_place': additional,
                 'description': text.strip()
             }
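
The new date construction and the month_only flag belong together: when the headline omits the day, the stored date falls back to the first of the month and the flag records that the day is only a placeholder. A tiny sketch of just that step, with build_date as a hypothetical helper name and the regex groups arriving as strings or None:

from datetime import date

def build_date(year, month, day):
    # year/month/day come from the matcher groups; day may be None.
    month_only = day is None
    return date(int(year), int(month), int(day) if day else 1), month_only

print(build_date('2013', '11', '23'))   # (datetime.date(2013, 11, 23), False)
print(build_date('2013', '11', None))   # (datetime.date(2013, 11, 1), True)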

@@ -67,11 +81,11 @@ class Scraper():
         articles = []
 
         for url in overview_urls:
-            currentDoc = BeautifulSoup(request.urlopen(url))
+            current_doc = BeautifulSoup(request.urlopen(url))
 
-            while currentDoc:
-                new_articles = self.get_articles_on_page(currentDoc)
+            while current_doc:
+                new_articles = self.get_articles_on_page(current_doc)
                 articles.extend(new_articles)
-                currentDoc = self.get_next_page(currentDoc)
+                current_doc = self.get_next_page(current_doc)
 
         return articles
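
The scrape loop in this last hunk is a straightforward follow-the-next-link crawl: fetch each overview page, collect its articles, and keep asking for the next page until there is none. A generic sketch of the same structure, with the page-parsing callbacks passed in rather than taken from the Scraper instance, and an explicit 'html.parser' argument that the diff itself does not pass:

from urllib import request
from bs4 import BeautifulSoup

def crawl(overview_urls, get_articles_on_page, get_next_page):
    # Walk every overview URL plus all of its follow-up pages.
    articles = []
    for url in overview_urls:
        current_doc = BeautifulSoup(request.urlopen(url), 'html.parser')
        while current_doc:
            articles.extend(get_articles_on_page(current_doc))
            # get_next_page is expected to return None once the last page is reached
            current_doc = get_next_page(current_doc)
    return articles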