Mirror of https://github.com/13hannes11/UU_NCML_Project.git, synced 2024-09-03 20:50:59 +02:00
Implement scraper for DE without download #2

.gitignore (1 addition)

@@ -1,4 +1,5 @@
 de/csv
+chromedriver

 # Byte-compiled / optimized / DLL files
 __pycache__/

README.md

@@ -1 +1,7 @@
 # UU_NCML_Project
+
+## Selenium
+
+Install Google Chrome, download the corresponding WebDriver, and place it in this folder (filename: `chromedriver`):
+
+https://www.selenium.dev/documentation/en/webdriver/driver_requirements/
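
With Chrome installed and the matching driver binary saved as `chromedriver` in this folder, the setup can be smoke-tested with a few lines of Selenium. This is a minimal sketch (not part of the commit), assuming the Selenium 3 API pinned in requirements.txt and the same driver path that scraper.py uses:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")  # no visible window is needed for a setup check

# In Selenium 3, executable_path points at the downloaded driver binary.
driver = webdriver.Chrome(options=options, executable_path="./chromedriver")
try:
    driver.get("https://www.bundestag.de/parlament/plenum/abstimmung/liste")
    print(driver.title)  # a printed title means Chrome and chromedriver versions match
finally:
    driver.quit()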

requirements.txt

@@ -1,2 +1,5 @@
 openpyxl==3.0.7
 pandas==1.2.4
+
+# Optional but needed for scraping
+selenium==3.141.0
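
Since selenium is marked as optional, a quick check that the pinned version is actually importable before running scraper.py (a sketch, not part of this commit):

import selenium

print(selenium.__version__)  # should print 3.141.0, matching the pin above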

scraper.py (new file, 70 lines)

@@ -0,0 +1,70 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains


import time


def get_element_by_xpath_or_false(driver, xpath):
    try:
        element = driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return element


WAIT_TIME_SEC = 7

DATE_FROM = "01.01.2019"
DATE_TO = "20.04.2021"

DRIVER_PATH = "./chromedriver"
url = r'https://www.bundestag.de/parlament/plenum/abstimmung/liste'

options = Options()
#options.add_argument("--headless")
options.add_argument("window-size=800,600")

driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
driver.get(url)

WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, "//table")))

driver.find_element_by_xpath('//input[starts-with(@id, "from_")]').send_keys(DATE_FROM)
driver.find_element_by_xpath('//input[starts-with(@id, "to_")]').send_keys(DATE_TO)
driver.find_element_by_xpath('//div[@class= "bt-filterzeile-scroller"]').click()

running = True
while running:
    # as the site does not provide any loading indicators, we need to wait after performing an action that requires loading
    time.sleep(WAIT_TIME_SEC)

    # element selector to get elements that contain title and link to excel file
    element_selector = '//div[contains(@class, "bt-standard-conten") and not(@aria-hidden="true")]/table//div[@class= "bt-documents-description"]'
    elements = driver.find_elements_by_xpath(element_selector)

    for element in elements:
        if element.is_displayed():
            title_element = get_element_by_xpath_or_false(element, './p/strong')
            link_element = get_element_by_xpath_or_false(element, './ul/li/a[starts-with(@title, "XLSX")]')

            if title_element and link_element:
                title = title_element.text
                link = link_element.get_attribute("href")

                print(title)
                print(link)

    # Is there a next page?
    element = get_element_by_xpath_or_false(driver, '//button[contains(@class, "slick-next") and not(contains(@class, "slick-disabled"))]')
    if element:
        # Move to bottom of page to avoid share button when clicking on element
        ActionChains(driver).move_to_element(driver.find_element_by_xpath('//div[contains(@class, "bt-footer-service")]')).perform()
        element.click()
    else:
        running = False

# TODO: file downloader
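
The downloader itself is still a TODO in this commit. A possible sketch of that step, assuming the `requests` package (not yet listed in requirements.txt) and the gitignored de/csv directory as the output location; `title` and `link` would be the values printed inside the loop above:

import os
import re

import requests  # assumed extra dependency, not part of this commit


def download_xlsx(title, link, out_dir="de/csv"):
    # Save one roll-call vote spreadsheet under a filesystem-safe name.
    os.makedirs(out_dir, exist_ok=True)
    safe_name = re.sub(r"[^\w.-]+", "_", title).strip("_") + ".xlsx"
    response = requests.get(link, timeout=30)
    response.raise_for_status()
    with open(os.path.join(out_dir, safe_name), "wb") as f:
        f.write(response.content)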