mirror of
https://github.com/13hannes11/UU_NCML_Project.git
synced 2024-09-03 20:50:59 +02:00
add downloader to scraper script
This commit is contained in:
71
scraper.py
71
scraper.py
@@ -6,9 +6,24 @@ from selenium.common.exceptions import NoSuchElementException
|
|||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.webdriver.common.action_chains import ActionChains
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
|
|
||||||
|
import multiprocessing
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
# Settings
|
||||||
|
DRIVER_PATH = "./chromedriver"
|
||||||
|
WAIT_TIME_SEC = 7
|
||||||
|
|
||||||
|
# Filter
|
||||||
|
DATE_FROM = "01.01.2019"
|
||||||
|
DATE_TO = "20.04.2021"
|
||||||
|
|
||||||
|
# Output
|
||||||
|
DOWNLOAD_FOLDER = "de/input/"
|
||||||
|
|
||||||
|
|
||||||
def get_element_by_xpath_or_false(driver, xpath):
|
def get_element_by_xpath_or_false(driver, xpath):
|
||||||
try:
|
try:
|
||||||
element = driver.find_element_by_xpath(xpath)
|
element = driver.find_element_by_xpath(xpath)
|
||||||
@@ -16,16 +31,16 @@ def get_element_by_xpath_or_false(driver, xpath):
|
|||||||
return False
|
return False
|
||||||
return element
|
return element
|
||||||
|
|
||||||
WAIT_TIME_SEC = 7
|
def get_title_and_url():
|
||||||
|
'''
|
||||||
|
function to get titles and URLS for dataset
|
||||||
|
'''
|
||||||
|
title_url_list = []
|
||||||
|
|
||||||
DATE_FROM = "01.01.2019"
|
|
||||||
DATE_TO = "20.04.2021"
|
|
||||||
|
|
||||||
DRIVER_PATH = "./chromedriver"
|
|
||||||
url = r'https://www.bundestag.de/parlament/plenum/abstimmung/liste'
|
url = r'https://www.bundestag.de/parlament/plenum/abstimmung/liste'
|
||||||
|
|
||||||
options = Options()
|
options = Options()
|
||||||
#options.add_argument("--headless")
|
options.add_argument("--headless")
|
||||||
options.add_argument("window-size=800,600")
|
options.add_argument("window-size=800,600")
|
||||||
|
|
||||||
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
|
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
|
||||||
@@ -54,7 +69,7 @@ while running:
|
|||||||
if title_element and link_element:
|
if title_element and link_element:
|
||||||
title = title_element.text
|
title = title_element.text
|
||||||
link = link_element.get_attribute("href")
|
link = link_element.get_attribute("href")
|
||||||
|
title_url_list.append((title, link))
|
||||||
print(title)
|
print(title)
|
||||||
print(link)
|
print(link)
|
||||||
|
|
||||||
@@ -66,5 +81,45 @@ while running:
|
|||||||
element.click()
|
element.click()
|
||||||
else:
|
else:
|
||||||
running = False
|
running = False
|
||||||
|
driver.quit()
|
||||||
|
return title_url_list
|
||||||
|
|
||||||
|
|
||||||
|
def save_to_file(file_url, folder):
    '''
    Download the file at *file_url* and save it into *folder*.

    The local file name is taken from the last path segment of the URL.

    Args:
        file_url (str): direct URL of the file to download
        folder (str): destination folder path; expected to end with a
            path separator (it is joined by plain concatenation)

    Returns:
        str: the file name the content was saved under

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status
    '''
    file_name = file_url.split("/")[-1]
    req = requests.get(file_url)
    # Fail loudly on error responses instead of silently writing an
    # HTML error page to disk as if it were the document.
    req.raise_for_status()
    with open(folder + file_name, 'wb') as output_file:
        output_file.write(req.content)
    return file_name
|
||||||
|
|
||||||
|
def save_titles(title_filename_list, folder):
    '''
    Save title/file-name mappings into *folder* as 'filename_to_titles.csv'.

    Each row has the form ``<file_name>;<title>``.

    Args:
        title_filename_list: iterable of (title, file_name) tuples
        folder (str): destination folder path; expected to end with a
            path separator (it is joined by plain concatenation)
    '''
    import csv
    # csv.writer correctly quotes titles that contain the ';' delimiter,
    # quotes, or newlines — a plain f-string write would silently corrupt
    # the file for such titles. UTF-8 is pinned explicitly so German
    # umlauts survive on platforms whose default encoding is not UTF-8.
    with open(folder + 'filename_to_titles.csv', 'wt',
              newline='', encoding='utf-8') as output_file:
        writer = csv.writer(output_file, delimiter=';', lineterminator='\n')
        for title, file_name in title_filename_list:
            writer.writerow([file_name, title])
|
||||||
|
|
||||||
|
def title_url_list_element_saver(x):
    """Download the file behind one (title, url) pair.

    Intended as a worker function for ``multiprocessing.Pool.map``.

    Args:
        x (tuple[str, str]): (title, url) of one document

    Returns:
        tuple[str, str]: (title, file_name), where file_name is the name
        the download was saved under inside ``DOWNLOAD_FOLDER``
    """
    # NOTE: the previous annotation ``x:(str, str)`` evaluated to a tuple
    # of classes, not a type — documented properly above instead.
    title, url = x
    print(f'Saving {title}')
    return title, save_to_file(url, DOWNLOAD_FOLDER)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: scrape the vote list, download every linked document
# in parallel, then persist the title<->file-name mapping.
#
# The __main__ guard is required with multiprocessing: worker processes
# re-import this module (under the spawn start method), and without the
# guard each worker would start scraping and spawning pools again.
if __name__ == "__main__":
    title_url_list = get_title_and_url()
    # Context manager guarantees the pool's workers are terminated even
    # if one of the downloads raises.
    with multiprocessing.Pool() as pool:
        title_filenames_map = pool.map(title_url_list_element_saver,
                                       title_url_list)
    save_titles(title_filenames_map, DOWNLOAD_FOLDER)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user