"""Scrape the "Chronik" overview pages linked from a reachoutberlin.de index."""

import logging
import re
from datetime import date
from urllib import parse, request

from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)


class Scraper:
    """Collect articles from the yearly overview pages linked on an index page.

    The index page is requested when the scraper is constructed; everything
    else is fetched lazily by :meth:`scrape`.

    Attributes:
        start: The open HTTP response for the index page.  It is consumed and
            closed the first time :meth:`get_yearly_overviews` runs.
        base_url: ``scheme://netloc`` of the index URL, used to resolve
            relative links.
        parser: Name of the BeautifulSoup parser used for every document.
        date_matcher: Compiled pattern for the date prefix of a headline.
    """

    def __init__(self, index='http://www.reachoutberlin.de/modules.php?op=modload&name=topics&file=index&cm=9&cb=8', parser='html.parser'):
        """Open the index page and prepare the headline date pattern.

        Args:
            index: URL of the page that links to the yearly overviews.
            parser: BeautifulSoup parser name.  Passing it explicitly keeps
                the parse tree independent of which optional parsers happen
                to be installed.

        Raises:
            urllib.error.URLError: If the index page cannot be opened.
        """
        parsed_url = parse.urlparse(index)
        self.parser = parser
        self.start = request.urlopen(index)
        # Parsed index page; filled on first use so the one-shot response in
        # ``self.start`` only has to be read once.
        self._start_document = None
        self.base_url = parsed_url.scheme + "://" + parsed_url.netloc
        # Dates are a bit difficult: usually they are formatted like
        # YYYY-MM-DD followed by a space character, but sometimes the day is
        # missing or the date is followed by another character.
        # Group 1 = year, group 2 = month, group 4 = optional day.
        self.date_matcher = re.compile(r'^(\d{4})-(\d{1,2})(-(\d{1,2}))?')

    def _fetch_document(self, url):
        """Download *url* and return it parsed, closing the response."""
        with request.urlopen(url) as response:
            return BeautifulSoup(response, self.parser)

    def get_next_page(self, document):
        """Return the parsed follow-up page of *document*, or ``None``.

        The second element with class ``nav`` is treated as the "next" link
        when its text is ``>``.

        Args:
            document: A parsed overview page.

        Returns:
            The parsed next page, or ``None`` when there is no such link.
        """
        nav_elems = document.select('.nav')
        if len(nav_elems) < 2:
            # Pages without (enough) navigation links have no successor.
            return None

        nav_elem = nav_elems[1]
        if nav_elem.get_text().strip() != '>':
            return None

        href = nav_elem.get('href')
        if not href:
            return None

        # Links may be relative; absolute hrefs pass through urljoin unchanged.
        return self._fetch_document(parse.urljoin(self.base_url, href))

    def _parse_article(self, table):
        """Turn one article table into a dict, or ``None`` if it is unusable."""
        # Headlines are always YYYY-MM-DD? Berlin-DISTRICT.
        # Sometimes "Berlin" is followed by a space, usually by a dash;
        # additionally there may be some information such as a train or bus
        # station appended, but often there isn't.
        try:
            headline = table.select('tr:first-child')[0].get_text().strip()
            text = table.select('tr')[2].select('td')[1].get_text()
        except IndexError:
            logger.warning('Skipping table with unexpected layout')
            return None

        date_match = self.date_matcher.match(headline)
        if date_match is None:
            logger.warning('Skipping article, no date in headline: %r', headline)
            return None

        year, month, day = date_match.group(1, 2, 4)
        try:
            # A missing day is represented by the first of the month.
            article_date = date(int(year), int(month), int(day) if day else 1)
        except ValueError:
            logger.warning('Skipping article, invalid date in headline: %r', headline)
            return None

        # Everything after the first space of the (stripped) headline; if the
        # headline has no space the whole headline is kept.
        place = headline[headline.find(' ') + 1:]

        return {
            'date': article_date,
            'month_only': day is None,
            'place': place.strip(),
            'description': text.strip(),
        }

    def get_articles_on_page(self, document):
        """Extract all articles from one parsed overview page.

        Args:
            document: A parsed overview page.

        Returns:
            A list of dicts with the keys ``date`` (``datetime.date``),
            ``month_only`` (``True`` when the headline carried no day),
            ``place`` and ``description``.  Tables that cannot be parsed are
            skipped with a logged warning.
        """
        articles = []
        for table in document.select('table[width="98%"]'):
            article = self._parse_article(table)
            if article is not None:
                articles.append(article)
        return articles

    def get_yearly_overviews(self):
        """Return absolute URLs of all links whose text starts with "chronik".

        The index page is parsed on the first call and cached, so repeated
        calls return the same result.
        """
        if self._start_document is None:
            with self.start as response:
                self._start_document = BeautifulSoup(response, self.parser)

        overviews = []
        for link in self._start_document.find_all('a'):
            href = link.get('href')
            # Anchors without an href cannot point to an overview page.
            if href and link.get_text().lower().startswith('chronik'):
                overviews.append(parse.urljoin(self.base_url, href))
        return overviews

    def scrape(self):
        """Fetch every overview, follow its pagination and collect articles.

        Returns:
            A list of article dicts as produced by
            :meth:`get_articles_on_page`, in the order they were encountered.
        """
        articles = []
        for url in self.get_yearly_overviews():
            current_doc = self._fetch_document(url)
            while current_doc is not None:
                articles.extend(self.get_articles_on_page(current_doc))
                current_doc = self.get_next_page(current_doc)
        return articles