From 3283bcd71704cf96f138af6df5918e96c6fec76c Mon Sep 17 00:00:00 2001
From: Hannes Kuchelmeister
Date: Thu, 22 Apr 2021 17:19:54 +0200
Subject: [PATCH] Implement scraper for DE without download #2

---
 .gitignore       |  1 +
 README.md        |  8 +++++-
 requirements.txt |  3 +++
 scraper.py       | 70 ++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 scraper.py

diff --git a/.gitignore b/.gitignore
index 6509fc7..769d945 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 de/csv
+chromedriver
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
diff --git a/README.md b/README.md
index 7542acd..10008e1 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,7 @@
-# UU_NCML_Project
\ No newline at end of file
+# UU_NCML_Project
+
+## Selenium
+
+
+Install Google Chrome, download the corresponding WebDriver, and place it in this folder (filename: `chromedriver`):
+https://www.selenium.dev/documentation/en/webdriver/driver_requirements/
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 21f1e8d..aa4dad9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,5 @@
 openpyxl==3.0.7
 pandas==1.2.4
+
+# Optional: only needed for running the scraper
+selenium==3.141.0
\ No newline at end of file
diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..cab82a3
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,70 @@
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as ec
+from selenium import webdriver
+from selenium.common.exceptions import NoSuchElementException
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.action_chains import ActionChains
+
+
+import time
+
+def get_element_by_xpath_or_false(driver, xpath):
+    try:
+        element = driver.find_element_by_xpath(xpath)
+    except NoSuchElementException:
+        return False
+    return element
+
+WAIT_TIME_SEC = 7
+
+DATE_FROM = "01.01.2019"
+DATE_TO = "20.04.2021"
+
+DRIVER_PATH = "./chromedriver"
+url = r'https://www.bundestag.de/parlament/plenum/abstimmung/liste'
+
+options = Options()
+# options.add_argument("--headless")  # uncomment to run without a browser window
+options.add_argument("window-size=800,600")
+
+driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
+driver.get(url)
+
+WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, "//table")))
+
+driver.find_element_by_xpath('//input[starts-with(@id, "from_")]').send_keys(DATE_FROM)
+driver.find_element_by_xpath('//input[starts-with(@id, "to_")]').send_keys(DATE_TO)
+driver.find_element_by_xpath('//div[@class= "bt-filterzeile-scroller"]').click()
+
+running = True
+while running:
+    # As the site does not provide any loading indicators, we need to wait after performing an action that triggers loading
+    time.sleep(WAIT_TIME_SEC)
+
+    # XPath selector for the elements that contain the title and the link to the Excel file
+    element_selector = '//div[contains(@class, "bt-standard-conten") and not(@aria-hidden="true")]/table//div[@class= "bt-documents-description"]'
+    elements = driver.find_elements_by_xpath(element_selector)
+
+    for element in elements:
+        if element.is_displayed():
+            title_element = get_element_by_xpath_or_false(element, './p/strong')
+            link_element = get_element_by_xpath_or_false(element, './ul/li/a[starts-with(@title, "XLSX")]')
+
+            if title_element and link_element:
+                title = title_element.text
+                link = link_element.get_attribute("href")
+
+                print(title)
+                print(link)
+
+    # Is there a next page?
+    element = get_element_by_xpath_or_false(driver, '//button[contains(@class, "slick-next") and not(contains(@class, "slick-disabled"))]')
+    if element:
+        # Move to the bottom of the page so the share button does not get in the way when clicking the element
+        ActionChains(driver).move_to_element(driver.find_element_by_xpath('//div[contains(@class, "bt-footer-service")]')).perform()
+        element.click()
+    else:
+        running = False
+
+# TODO: file downloader
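
Note (not part of the patch): a minimal sketch of how the TODO file downloader could look, assuming the `requests` library is available and that the scraper collects the scraped (title, link) pairs into a list instead of printing them. The `de/csv` output directory and the filename scheme derived from the vote title are assumptions, not part of the original code:

import os
import requests

def download_xlsx_files(results, out_dir="de/csv"):
    # Hypothetical downloader: fetch each collected XLSX link and save it to disk
    os.makedirs(out_dir, exist_ok=True)
    for title, link in results:
        # Derive a filesystem-safe filename from the vote title (assumed scheme)
        filename = "".join(c if c.isalnum() else "_" for c in title) + ".xlsx"
        response = requests.get(link, timeout=30)
        response.raise_for_status()
        with open(os.path.join(out_dir, filename), "wb") as f:
            f.write(response.content)

The pagination loop above could append `(title, link)` to a results list and call `download_xlsx_files(results)` once the last page has been processed.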