From ac6e5248dae6cbf78ba5147fdf6282cbbe48253a Mon Sep 17 00:00:00 2001
From: Hannes Kuchelmeister <hannes@kuchelmeister.org>
Date: Tue, 11 May 2021 14:05:23 +0200
Subject: [PATCH] Fix German scraping to handle XLS as well as XLSX files

---
 download_data/requirements.txt | 5 -----
 download_data/scrape_de.py     | 8 ++++----
 2 files changed, 4 insertions(+), 9 deletions(-)
 delete mode 100644 download_data/requirements.txt

diff --git a/download_data/requirements.txt b/download_data/requirements.txt
deleted file mode 100644
index aa4dad9..0000000
--- a/download_data/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-openpyxl==3.0.7
-pandas==1.2.4
-
-# Optional but needed for scraping
-selenium==3.141.0
\ No newline at end of file
diff --git a/download_data/scrape_de.py b/download_data/scrape_de.py
index 23e13b3..b9cc545 100644
--- a/download_data/scrape_de.py
+++ b/download_data/scrape_de.py
@@ -17,8 +17,8 @@ DRIVER_PATH = "./chromedriver"
 WAIT_TIME_SEC = 7
 
 # Filter
-DATE_FROM = "01.01.2019"
-DATE_TO = "20.04.2021"
+DATE_FROM = "17.10.2012" #dd.mm.yyyy
+DATE_TO = "11.05.2021" #dd.mm.yyyy
 
 # Output
 DOWNLOAD_FOLDER = "../de/input/"
@@ -55,6 +55,7 @@ def get_title_and_url():
     running = True
     while running:
         # as the side does not provide any loading indicators we need to wait after performing an action that requires loading
+        print('waiting to be sure the page has updated')
         time.sleep(WAIT_TIME_SEC)
 
         # element selector to get elements that contain title and link to excel file
@@ -64,8 +65,7 @@ def get_title_and_url():
         for element in elements:
             if element.is_displayed():
                 title_element = get_element_by_xpath_or_false(element, './p/strong')
-                link_element = get_element_by_xpath_or_false(element, './ul/li/a[starts-with(@title, "XLSX")]')
-                
+                link_element = get_element_by_xpath_or_false(element, './ul/li/a[starts-with(@title, "XLS")]')
                 if title_element and link_element:
                     title = title_element.text
                     link = link_element.get_attribute("href")