From 66b68b269d08f05344fcbfbd43fb1ccd390cd18c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arne=20Schl=C3=BCter?= <aesthaddicts@gmail.com>
Date: Mon, 8 Dec 2014 14:40:05 +0100
Subject: [PATCH] Normalize \r\n to \n and use date objects instead of the
 original date string

---
 scraper/scraper.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/scraper/scraper.py b/scraper/scraper.py
index ae49d40..57d0cc4 100644
--- a/scraper/scraper.py
+++ b/scraper/scraper.py
@@ -1,5 +1,6 @@
 from bs4 import BeautifulSoup
 from urllib import parse, request
+from datetime import date
 
 class Scraper():
 
@@ -24,7 +25,7 @@ class Scraper():
             # headlines are always YYYY-MM-DD Berlin-DISTRICT (+ sometimes additional info)
             headline = table.select('tr:first-child')[0].get_text()
 
-            date = headline[:headline.find(' ')]
+            year, month, day = headline[:headline.find(' ')].strip().split('-')
             places = headline[headline.find(' ') + 1:]
 
             if places.find(' ') == -1:
@@ -32,15 +33,15 @@ class Scraper():
                 additional = None
             else:
                 district = places[:places.find(' ')]
-                additional = places[places.find(' ') + 1:]
+                additional = places[places.find(' ') + 1:].strip()
 
             text = table.select('tr:nth-of-type(3)')[0].select('td:nth-of-type(2)')[0].get_text()
 
             article = {
-                'date': date.strip(),
-                'place': district,
+                'date': date(int(year), int(month), int(day)),
+                'place': district.strip(),
                 'additional': additional,
-                'text': text.strip()
+                'text': text.strip().replace('\r\n', '\n')
             }
             articles.append(article)