scraping all articles

This commit is contained in:
Joshua Widmann 2014-12-08 15:38:58 +01:00
commit 9ae38d3279

View file

@ -9,14 +9,17 @@ class Scraper():
self.start = request.urlopen(index)
self.base_url = parsed_url.scheme + "://" + parsed_url.netloc
def has_more_pages(self): def get_next_page(self, document):
pass nav_elem = document.select('.nav')[1]
def visit_next_page(self): if nav_elem.get_text().strip() == '>':
pass href = nav_elem.get('href')
return BeautifulSoup(request.urlopen(href))
else:
return None
def get_articles_on_page(self, url):
document = BeautifulSoup(request.urlopen(url)) def get_articles_on_page(self, document):
article_tables = document.select('table[width="98%"]')
articles = []
@ -62,9 +65,12 @@ class Scraper():
overview_urls = self.get_yearly_overviews()
articles = []
return self.get_articles_on_page(overview_urls[0]) for url in overview_urls:
currentDoc = BeautifulSoup(request.urlopen(url))
# for url in overview_urls: while currentDoc:
new_articles = self.get_articles_on_page(currentDoc)
articles.extend(new_articles)
currentDoc = self.get_next_page(currentDoc)
# while self.has_more_pages(): return articles
# self.visit_next_page()