scraping all articles

This commit is contained in:
Joshua Widmann 2014-12-08 15:38:58 +01:00
commit 9ae38d3279

View file

@ -9,14 +9,17 @@ class Scraper():
self.start = request.urlopen(index)
self.base_url = parsed_url.scheme + "://" + parsed_url.netloc
def has_more_pages(self): def get_next_page(self, document):
pass nav_elem = document.select('.nav')[1]
def visit_next_page(self): if nav_elem.get_text().strip() == '>':
pass href = nav_elem.get('href')
return BeautifulSoup(request.urlopen(href))
else:
return None
def get_articles_on_page(self, url):
document = BeautifulSoup(request.urlopen(url)) def get_articles_on_page(self, document):
article_tables = document.select('table[width="98%"]')
articles = []
@ -62,9 +65,12 @@ class Scraper():
overview_urls = self.get_yearly_overviews()
articles = []
return self.get_articles_on_page(overview_urls[0]) for url in overview_urls:
currentDoc = BeautifulSoup(request.urlopen(url))
# for url in overview_urls: while currentDoc:
new_articles = self.get_articles_on_page(currentDoc)
articles.extend(new_articles)
currentDoc = self.get_next_page(currentDoc)
# while self.has_more_pages(): return articles
# self.visit_next_page()