From 9ae38d3279ca8cd1c336caccfb061380c5b7c30b Mon Sep 17 00:00:00 2001
From: Joshua Widmann
Date: Mon, 8 Dec 2014 15:38:58 +0100
Subject: [PATCH] scraping all articles

---
 scraper/scraper.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/scraper/scraper.py b/scraper/scraper.py
index ae49d40..cdcd9e3 100644
--- a/scraper/scraper.py
+++ b/scraper/scraper.py
@@ -9,14 +9,17 @@ class Scraper():
         self.start = request.urlopen(index)
         self.base_url = parsed_url.scheme + "://" + parsed_url.netloc
 
-    def has_more_pages(self):
-        pass
+    def get_next_page(self, document):
+        nav_elem = document.select('.nav')[1]
 
-    def visit_next_page(self):
-        pass
+        if nav_elem.get_text().strip() == '>':
+            href = nav_elem.get('href')
+            return BeautifulSoup(request.urlopen(href))
+        else:
+            return None
 
-    def get_articles_on_page(self, url):
-        document = BeautifulSoup(request.urlopen(url))
+
+    def get_articles_on_page(self, document):
         article_tables = document.select('table[width="98%"]')
 
         articles = []
@@ -62,9 +65,12 @@ class Scraper():
         overview_urls = self.get_yearly_overviews()
         articles = []
 
-        return self.get_articles_on_page(overview_urls[0])
+        for url in overview_urls:
+            currentDoc = BeautifulSoup(request.urlopen(url))
 
-        # for url in overview_urls:
+            while currentDoc:
+                new_articles = self.get_articles_on_page(currentDoc)
+                articles.extend(new_articles)
+                currentDoc = self.get_next_page(currentDoc)
 
-        # while self.has_more_pages():
-        #     self.visit_next_page()
+        return articles