Restructured source files, closed #18

Deepthi Pathare
2021-05-10 14:55:31 +02:00
parent c5fa8fead7
commit db762c4061
3 changed files with 277 additions and 0 deletions


@@ -0,0 +1,29 @@
import pandas as pd
import os
base_dir = "../de/"
out_dir = "csv/"
in_dir = "input/"
title_file = "filename_to_titles.csv"
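# Paths are relative to the working directory: xlsx input comes from
# ../de/input/, converted csv output goes to ../de/csv/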
if not os.path.exists(os.path.join(base_dir, out_dir)):
    os.makedirs(os.path.join(base_dir, out_dir))

# Copy titles file and replace file endings
print('Copying title file')
with open(os.path.join(base_dir, in_dir, title_file), 'r') as file:
    file_content = file.read()
file_content = file_content.replace('.xlsx', '.csv')
with open(os.path.join(base_dir, out_dir, title_file), 'w') as file:
    file.write(file_content)

# Convert xlsx files to csv
for dirname, _, filenames in os.walk(os.path.join(base_dir, in_dir)):
    for filename in filenames:
        if filename != title_file:
            print(f'Reading {filename}')
            read_file = pd.read_excel(os.path.join(dirname, filename))
            print(f'Saving {filename}')
            read_file.to_csv(os.path.join(base_dir, out_dir, filename.split(".", 1)[0] + ".csv"), index=False, header=True)

download_data/scrape_de.py Normal file

@@ -0,0 +1,125 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import multiprocessing
import requests
import time
# Settings
DRIVER_PATH = "./chromedriver"
WAIT_TIME_SEC = 7
# Filter
DATE_FROM = "01.01.2019"
DATE_TO = "20.04.2021"
# Output
DOWNLOAD_FOLDER = "../de/input/"
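# Dates use the DD.MM.YYYY format of the bundestag.de filter form; the xlsx
# files saved here are the input the xlsx-to-csv converter reads from ../de/input/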
def get_element_by_xpath_or_false(driver, xpath):
    try:
        element = driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return element

def get_title_and_url():
    '''
    Collect (title, url) pairs for all roll-call votes in the filtered list.
    '''
    title_url_list = []
    url = r'https://www.bundestag.de/parlament/plenum/abstimmung/liste'
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=800,600")
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    driver.get(url)
    WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, "//table")))
    driver.find_element_by_xpath('//input[starts-with(@id, "from_")]').send_keys(DATE_FROM)
    driver.find_element_by_xpath('//input[starts-with(@id, "to_")]').send_keys(DATE_TO)
    driver.find_element_by_xpath('//div[@class= "bt-filterzeile-scroller"]').click()
    running = True
    while running:
        # As the site does not provide any loading indicators, we need to wait
        # after performing an action that requires loading
        time.sleep(WAIT_TIME_SEC)
        # Selector for the elements that contain the title and the link to the excel file
        element_selector = '//div[contains(@class, "bt-standard-conten") and not(@aria-hidden="true")]/table//div[@class= "bt-documents-description"]'
        elements = driver.find_elements_by_xpath(element_selector)
        for element in elements:
            if element.is_displayed():
                title_element = get_element_by_xpath_or_false(element, './p/strong')
                link_element = get_element_by_xpath_or_false(element, './ul/li/a[starts-with(@title, "XLSX")]')
                if title_element and link_element:
                    title = title_element.text
                    link = link_element.get_attribute("href")
                    title_url_list.append((title, link))
                    print(title)
                    print(link)
        # Is there a next page?
        element = get_element_by_xpath_or_false(driver, '//button[contains(@class, "slick-next") and not(contains(@class, "slick-disabled"))]')
        if element:
            # Move to the bottom of the page to avoid the share button when clicking the element
            ActionChains(driver).move_to_element(driver.find_element_by_xpath('//div[contains(@class, "bt-footer-service")]')).perform()
            element.click()
        else:
            running = False
    driver.quit()
    return title_url_list

def save_to_file(file_url, folder):
    '''
    Save the file from file_url into the specified folder.
    '''
    file_name = file_url.split("/")[-1]
    req = requests.get(file_url)
    with open(folder + file_name, 'wb') as output_file:
        output_file.write(req.content)
    return file_name

def save_titles(title_filename_list, folder):
    '''
    Save the filename-to-title mapping into folder as 'filename_to_titles.csv'.
    '''
    with open(folder + 'filename_to_titles.csv', 'wt') as output_file:
        for title, file_name in title_filename_list:
            output_file.write(f'{file_name};{title}\n')
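# filename_to_titles.csv is the mapping file the xlsx-to-csv converter copies
# over to ../de/csv/, rewriting the .xlsx endings it references to .csv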

def title_url_list_element_saver(x):
    """Download the file behind one (title, url) tuple.

    Args:
        x: tuple of title and url

    Returns:
        (str, str): tuple of title and filename
    """
    print(f'Saving {x[0]}')
    return x[0], save_to_file(x[1], DOWNLOAD_FOLDER)


if __name__ == '__main__':
    # The guard is required for multiprocessing on platforms that spawn workers
    title_url_list = get_title_and_url()
    pool = multiprocessing.Pool()
    title_filenames_map = pool.map(title_url_list_element_saver, title_url_list)
    save_titles(title_filenames_map, DOWNLOAD_FOLDER)
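# Assumes a chromedriver binary matching the installed Chrome version next to
# this script (see DRIVER_PATH); run from download_data/ so the relative paths resolve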

download_data/scrape_uk.py Normal file

@@ -0,0 +1,123 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import multiprocessing
import requests
import time
# Settings
DRIVER_PATH = "./chromedriver"
WAIT_TIME_SEC = 3
# Filter
DATE_FROM = "01/05/2020"
DATE_TO = "01/05/2021"
# Output
DOWNLOAD_FOLDER = "../uk/csv/"
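# Dates use the DD/MM/YYYY format of the UK filter form; downloads are already
# CSV, so they go straight into ../uk/csv/ with no conversion step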
def get_element_by_xpath_or_false(driver, xpath):
    try:
        element = driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return element

title_link_list = []
title_url_list = []
title_csv_list = []


# Collect the URL of every vote page in the filtered list
def get_all_link_urls():
    url = r'https://votes.parliament.uk/Votes/Commons'
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=800,600")
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    driver.get(url)
    WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, '//div[contains(@class, "card-list")]')))
    driver.find_element_by_xpath('//*[@id="FromDate"]').clear()
    driver.find_element_by_xpath('//*[@id="ToDate"]').clear()
    driver.find_element_by_xpath('//*[@id="FromDate"]').send_keys(DATE_FROM)
    driver.find_element_by_xpath('//*[@id="ToDate"]').send_keys(DATE_TO)
    driver.find_element_by_xpath('//button[@class="btn btn-primary"]').click()
    running = True
    while running:
        # As the site does not provide any loading indicators, we need to wait
        # after performing an action that requires loading
        time.sleep(WAIT_TIME_SEC)
        # Selector for the cards that link to the individual vote pages
        elem_selector = '//a[@class="card card-vote"]'
        elems = driver.find_elements_by_xpath(elem_selector)
        for elem in elems:
            if elem.is_displayed():
                title_url_list.append(elem.get_attribute("href"))
                print(f'Link to vote page: {elem.get_attribute("href")}')
        # Is there a next page? The "next" button is absent on the last page
        elem_x = get_element_by_xpath_or_false(driver, '//li[@class="next"]')
        if elem_x:
            elem_x.click()
        else:
            running = False
    driver.quit()
    return title_url_list

# Collect the CSV download URL from each vote page
def get_all_file_links():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=800,600")
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    for elm in title_url_list:
        driver.get(elm)
        element = get_element_by_xpath_or_false(driver, '//a[2][@class="dropdown-item"]')
        if not element:
            # Skip vote pages that do not expose a download link
            continue
        element_x = element.get_attribute("href")
        print(f'Download url: {element_x}')
        title_link_list.append((elm, element_x))
        title_csv_list.append(element_x)
    driver.quit()
    return title_link_list

def save_to_file(file_url, folder):
    '''
    Save the file from file_url into the specified folder.
    '''
    file_name = file_url.split("/")[-1] + '.csv'
    req = requests.get(file_url)
    with open(folder + file_name, 'wb') as output_file:
        output_file.write(req.content)
    return file_name

title_url_list = get_all_link_urls()
title_link_list = get_all_file_links()
for elem in title_link_list:
    print(elem)
for file_url in title_csv_list:
    print(f'Saving: {file_url}')
    save_to_file(file_url, DOWNLOAD_FOLDER)
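# Two-pass design: get_all_link_urls() pages through the vote list first, then
# get_all_file_links() visits each vote page to pick up its CSV download link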