Mirror of https://github.com/13hannes11/UU_NCML_Project.git, synced 2024-09-03 20:50:59 +02:00
Implement scraper for DE without download #2

.gitignore (1 addition)

@@ -1,4 +1,5 @@
 de/csv
+chromedriver

 # Byte-compiled / optimized / DLL files
 __pycache__/

README.md

@@ -1 +1,7 @@
 # UU_NCML_Project
+
+## Selenium
+
+Install Google Chrome, download the corresponding WebDriver, and place it in this folder (filename: `chromedriver`):
+
+https://www.selenium.dev/documentation/en/webdriver/driver_requirements/
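
With Chrome installed and the matching driver binary saved as `chromedriver` in this folder, the setup can be smoke-tested with a few lines of Selenium. This is a minimal sketch (not part of the commit), assuming the Selenium 3 API pinned in requirements.txt and the same driver path that scraper.py uses:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")  # no visible window is needed for a setup check

# In Selenium 3, executable_path points at the downloaded driver binary.
driver = webdriver.Chrome(options=options, executable_path="./chromedriver")
try:
    driver.get("https://www.bundestag.de/parlament/plenum/abstimmung/liste")
    print(driver.title)  # a printed title means Chrome and chromedriver versions match
finally:
    driver.quit()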

requirements.txt

@@ -1,2 +1,5 @@
 openpyxl==3.0.7
 pandas==1.2.4
+
+# Optional but needed for scraping
+selenium==3.141.0
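
Since selenium is marked as optional, a quick check that the pinned version is actually importable before running scraper.py (a sketch, not part of this commit):

import selenium

print(selenium.__version__)  # should print 3.141.0, matching the pin above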

scraper.py (new file, 70 lines)

@@ -0,0 +1,70 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains


import time


def get_element_by_xpath_or_false(driver, xpath):
    try:
        element = driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return element


WAIT_TIME_SEC = 7

DATE_FROM = "01.01.2019"
DATE_TO = "20.04.2021"

DRIVER_PATH = "./chromedriver"
url = r'https://www.bundestag.de/parlament/plenum/abstimmung/liste'

options = Options()
#options.add_argument("--headless")
options.add_argument("window-size=800,600")

driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
driver.get(url)

WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, "//table")))

driver.find_element_by_xpath('//input[starts-with(@id, "from_")]').send_keys(DATE_FROM)
driver.find_element_by_xpath('//input[starts-with(@id, "to_")]').send_keys(DATE_TO)
driver.find_element_by_xpath('//div[@class= "bt-filterzeile-scroller"]').click()

running = True
while running:
    # as the site does not provide any loading indicators, we need to wait after performing an action that requires loading
    time.sleep(WAIT_TIME_SEC)

    # element selector to get elements that contain title and link to excel file
    element_selector = '//div[contains(@class, "bt-standard-conten") and not(@aria-hidden="true")]/table//div[@class= "bt-documents-description"]'
    elements = driver.find_elements_by_xpath(element_selector)

    for element in elements:
        if element.is_displayed():
            title_element = get_element_by_xpath_or_false(element, './p/strong')
            link_element = get_element_by_xpath_or_false(element, './ul/li/a[starts-with(@title, "XLSX")]')

            if title_element and link_element:
                title = title_element.text
                link = link_element.get_attribute("href")

                print(title)
                print(link)

    # Is there a next page?
    element = get_element_by_xpath_or_false(driver, '//button[contains(@class, "slick-next") and not(contains(@class, "slick-disabled"))]')
    if element:
        # Move to bottom of page to avoid share button when clicking on element
        ActionChains(driver).move_to_element(driver.find_element_by_xpath('//div[contains(@class, "bt-footer-service")]')).perform()
        element.click()
    else:
        running = False

# TODO: file downloader
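
The downloader itself is still a TODO in this commit. A possible sketch of that step, assuming the `requests` package (not yet listed in requirements.txt) and the gitignored de/csv directory as the output location; `title` and `link` would be the values printed inside the loop above:

import os
import re

import requests  # assumed extra dependency, not part of this commit


def download_xlsx(title, link, out_dir="de/csv"):
    # Save one roll-call vote spreadsheet under a filesystem-safe name.
    os.makedirs(out_dir, exist_ok=True)
    safe_name = re.sub(r"[^\w.-]+", "_", title).strip("_") + ".xlsx"
    response = requests.get(link, timeout=30)
    response.raise_for_status()
    with open(os.path.join(out_dir, safe_name), "wb") as f:
        f.write(response.content)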