From ac6e5248dae6cbf78ba5147fdf6282cbbe48253a Mon Sep 17 00:00:00 2001 From: Hannes Kuchelmeister Date: Tue, 11 May 2021 14:05:23 +0200 Subject: [PATCH] Fix German scraping to handle XLS files as well as XLSX --- download_data/requirements.txt | 5 ----- download_data/scrape_de.py | 8 ++++---- 2 files changed, 4 insertions(+), 9 deletions(-) delete mode 100644 download_data/requirements.txt diff --git a/download_data/requirements.txt b/download_data/requirements.txt deleted file mode 100644 index aa4dad9..0000000 --- a/download_data/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -openpyxl==3.0.7 -pandas==1.2.4 - -# Optional but needed for scraping -selenium==3.141.0 \ No newline at end of file diff --git a/download_data/scrape_de.py b/download_data/scrape_de.py index 23e13b3..b9cc545 100644 --- a/download_data/scrape_de.py +++ b/download_data/scrape_de.py @@ -17,8 +17,8 @@ DRIVER_PATH = "./chromedriver" WAIT_TIME_SEC = 7 # Filter -DATE_FROM = "01.01.2019" -DATE_TO = "20.04.2021" +DATE_FROM = "17.10.2012" #dd.mm.yyyy +DATE_TO = "11.05.2021" #dd.mm.yyyy # Output DOWNLOAD_FOLDER = "../de/input/" @@ -55,6 +55,7 @@ def get_title_and_url(): running = True while running: # as the side does not provide any loading indicators we need to wait after performing an action that requires loading + print('waiting to be sure the page has updated') time.sleep(WAIT_TIME_SEC) # element selector to get elements that contain title and link to excel file @@ -64,8 +65,7 @@ def get_title_and_url(): for element in elements: if element.is_displayed(): title_element = get_element_by_xpath_or_false(element, './p/strong') - link_element = get_element_by_xpath_or_false(element, './ul/li/a[starts-with(@title, "XLSX")]') - + link_element = get_element_by_xpath_or_false(element, './ul/li/a[starts-with(@title, "XLS")]') if title_element and link_element: title = title_element.text link = link_element.get_attribute("href")