mirror of
https://github.com/13hannes11/UU_NCML_Project.git
synced 2024-09-03 20:50:59 +02:00
Restructured source files, closed #18
This commit is contained in:
29
download_data/convert_to_csv_de.py
Normal file
29
download_data/convert_to_csv_de.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import pandas as pd
import os

# Source/destination layout for the German (Bundestag) vote data.
base_dir = "../de/"
out_dir = "csv/"
in_dir = "input/"
title_file = "filename_to_titles.csv"

# exist_ok avoids the check-then-create race of the exists()/makedirs() pair.
os.makedirs(os.path.join(base_dir, out_dir), exist_ok=True)

# Copy the title-mapping file, rewriting the extensions it references so the
# mapping points at the converted .csv files instead of the source .xlsx ones.
print('Copying Title File')
with open(os.path.join(base_dir, in_dir, title_file), 'r') as file:
    file_content = file.read()
file_content = file_content.replace('.xlsx', '.csv')
with open(os.path.join(base_dir, out_dir, title_file), 'w') as file:
    file.write(file_content)

# Convert every downloaded xlsx workbook to a csv with the same name stem.
for dirname, _, filenames in os.walk(os.path.join(base_dir, in_dir)):
    for filename in filenames:
        if filename != title_file:
            print(f'Reading {filename}')
            read_file = pd.read_excel(os.path.join(dirname, filename))
            print(f'Saving {filename}')
            read_file.to_csv(os.path.join(base_dir, out_dir, filename.split(".", 1)[0] + ".csv"), index=None, header=True)
|
||||
|
||||
125
download_data/scrape_de.py
Normal file
125
download_data/scrape_de.py
Normal file
@@ -0,0 +1,125 @@
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as ec
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import NoSuchElementException
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
|
||||
import multiprocessing
|
||||
|
||||
import requests
|
||||
|
||||
import time
|
||||
|
||||
# Settings
|
||||
DRIVER_PATH = "./chromedriver"
|
||||
WAIT_TIME_SEC = 7
|
||||
|
||||
# Filter
|
||||
DATE_FROM = "01.01.2019"
|
||||
DATE_TO = "20.04.2021"
|
||||
|
||||
# Output
|
||||
DOWNLOAD_FOLDER = "../de/input/"
|
||||
|
||||
|
||||
def get_element_by_xpath_or_false(driver, xpath):
    """Look up *xpath* on *driver* and return the element, or False if absent.

    Lets callers test presence with a plain truthiness check instead of
    wrapping every lookup in try/except themselves.
    """
    try:
        return driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
|
||||
|
||||
def get_title_and_url():
    '''
    Collect (title, xlsx-url) pairs for every roll-call vote in the
    configured date range from the Bundestag vote list page.
    '''
    collected = []

    list_url = r'https://www.bundestag.de/parlament/plenum/abstimmung/liste'

    browser_options = Options()
    browser_options.add_argument("--headless")
    browser_options.add_argument("window-size=800,600")

    driver = webdriver.Chrome(options=browser_options, executable_path=DRIVER_PATH)
    driver.get(list_url)

    # Block until the result table has rendered before touching the filter.
    WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, "//table")))

    # Fill in the date filter and apply it.
    driver.find_element_by_xpath('//input[starts-with(@id, "from_")]').send_keys(DATE_FROM)
    driver.find_element_by_xpath('//input[starts-with(@id, "to_")]').send_keys(DATE_TO)
    driver.find_element_by_xpath('//div[@class= "bt-filterzeile-scroller"]').click()

    while True:
        # The page exposes no loading indicator, so sleep after every
        # action that triggers a reload.
        time.sleep(WAIT_TIME_SEC)

        # Rows that hold both the vote title and the document links.
        row_xpath = '//div[contains(@class, "bt-standard-conten") and not(@aria-hidden="true")]/table//div[@class= "bt-documents-description"]'
        for row in driver.find_elements_by_xpath(row_xpath):
            if not row.is_displayed():
                continue
            title_node = get_element_by_xpath_or_false(row, './p/strong')
            xlsx_node = get_element_by_xpath_or_false(row, './ul/li/a[starts-with(@title, "XLSX")]')
            if title_node and xlsx_node:
                vote_title = title_node.text
                vote_link = xlsx_node.get_attribute("href")
                collected.append((vote_title, vote_link))
                print(vote_title)
                print(vote_link)

        # Stop once the pager's "next" button is missing or disabled.
        next_button = get_element_by_xpath_or_false(driver, '//button[contains(@class, "slick-next") and not(contains(@class, "slick-disabled"))]')
        if not next_button:
            break
        # Scroll to the footer first so the share button cannot intercept the click.
        ActionChains(driver).move_to_element(driver.find_element_by_xpath('//div[contains(@class, "bt-footer-service")]')).perform()
        next_button.click()
    driver.quit()
    return collected
|
||||
|
||||
|
||||
def save_to_file(file_url, folder):
    '''
    Download *file_url* and save it into *folder*.

    Returns the file name (last URL path segment) the content was saved
    under. Raises requests.HTTPError on a 4xx/5xx response so a server
    error page is never silently stored as a data file.
    '''

    file_name = file_url.split("/")[-1]
    req = requests.get(file_url)
    # Fail loudly instead of writing an HTML error body to disk as .xlsx.
    req.raise_for_status()
    with open(folder + file_name, 'wb') as output_file:
        output_file.write(req.content)
    return file_name
|
||||
|
||||
def save_titles(title_filename_list, folder):
    '''
    Write (title, file_name) pairs into '<folder>filename_to_titles.csv'.

    Each mapping becomes one "file_name;title" line.
    '''
    rows = [f'{file_name};{title}\n' for title, file_name in title_filename_list]
    with open(folder + 'filename_to_titles.csv', 'wt') as output_file:
        output_file.writelines(rows)
|
||||
|
||||
def title_url_list_element_saver(x):
    """ Function that downloads the file and returns a tuple of title and filename

    Note: the original annotation `x:(str, str)` was a tuple of classes,
    not a valid type hint, and has been removed (callers are unaffected).

    Args:
        x (tuple): (title, url) pair; the url is downloaded into
            DOWNLOAD_FOLDER via save_to_file.

    Returns:
        (str, str): returns a tuple of title and filename
    """
    title, url = x
    print(f'Saving {title}')
    return title, save_to_file(url, DOWNLOAD_FOLDER)
|
||||
|
||||
|
||||
# Guard the entry point: with the spawn start method (Windows, macOS)
# multiprocessing re-imports this module in every worker, so an unguarded
# module-level Pool() would recursively spawn workers.
if __name__ == "__main__":
    title_url_list = get_title_and_url()

    # Download all files in parallel; the context manager closes the pool.
    with multiprocessing.Pool() as pool:
        title_filenames_map = pool.map(title_url_list_element_saver, title_url_list)
    save_titles(title_filenames_map, DOWNLOAD_FOLDER)
|
||||
|
||||
123
download_data/scrape_uk.py
Normal file
123
download_data/scrape_uk.py
Normal file
@@ -0,0 +1,123 @@
|
||||
from selenium.webdriver.common.by import By
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from selenium.webdriver.support import expected_conditions as ec
|
||||
from selenium import webdriver
|
||||
from selenium.common.exceptions import NoSuchElementException
|
||||
from selenium.webdriver.chrome.options import Options
|
||||
from selenium.webdriver.common.action_chains import ActionChains
|
||||
|
||||
import multiprocessing
|
||||
|
||||
import requests
|
||||
|
||||
import time
|
||||
|
||||
# Settings
|
||||
DRIVER_PATH = "./chromedriver"
|
||||
WAIT_TIME_SEC = 3
|
||||
|
||||
# Filter
|
||||
DATE_FROM = "01/05/2020"
|
||||
DATE_TO = "01/05/2021"
|
||||
|
||||
# Output
|
||||
DOWNLOAD_FOLDER = "../uk/csv/"
|
||||
|
||||
|
||||
def get_element_by_xpath_or_false(driver, xpath):
    """Return the element matching *xpath* under *driver*, or False if none.

    The falsy return value lets callers branch on presence without their
    own try/except around each lookup.
    """
    try:
        found = driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    else:
        return found
|
||||
|
||||
# Module-level accumulators shared by the scraping functions below.
title_link_list = []  # (vote-page url, csv download url) pairs
title_url_list = []  # urls of the individual vote pages
title_csv_list = []  # csv download urls only
|
||||
#Function to get title url's
|
||||
def get_all_link_urls():
    """Scrape the Commons votes list and collect the url of every vote page.

    Filters the list to DATE_FROM..DATE_TO, pages through every result page,
    and appends each vote-page href to the module-level title_url_list,
    which is also returned (get_all_file_links later reads that global).
    """

    url = r'https://votes.parliament.uk/Votes/Commons'

    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=800,600")

    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    driver.get(url)

    # Wait until the vote cards are rendered before touching the filter form.
    WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, '//div[contains(@class, "card-list")]')))

    # Clear the pre-filled date fields, then type in our own range.
    driver.find_element_by_xpath('//*[@id="FromDate"]').clear()
    driver.find_element_by_xpath('//*[@id="ToDate"]').clear()
    driver.find_element_by_xpath('//*[@id="FromDate"]').send_keys(DATE_FROM)
    driver.find_element_by_xpath('//*[@id="ToDate"]').send_keys(DATE_TO)

    # Submit the filter form.
    driver.find_element_by_xpath('//button[@class="btn btn-primary"]').click()

    running = True
    while running:
        # as the site does not provide any loading indicators we need to wait after performing an action that requires loading
        time.sleep(WAIT_TIME_SEC)

        # element selector to get the cards that link to individual vote pages
        elem_selector = '//a[@class="card card-vote"]'
        elems = driver.find_elements_by_xpath(elem_selector)

        for elem in elems:
            if elem.is_displayed():
                title_url_list.append((elem.get_attribute("href")))
                print(f'Link to vote page: { elem.get_attribute("href") }')

        # Is there a next page? On the last page the "next" pager item is
        # absent, which ends the loop.
        elem_x = get_element_by_xpath_or_false(driver, '//li[@class="next"]')
        if elem_x:
            elem_x.click()
        else:
            running = False
    driver.quit()
    return title_url_list
|
||||
|
||||
|
||||
|
||||
#Function to get CSV file download url's
|
||||
def get_all_file_links():
    """Visit every url in title_url_list and collect its CSV download link.

    Appends (page url, download url) to title_link_list and the download url
    to title_csv_list; returns title_link_list.
    """
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=800,600")

    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    for elm in title_url_list:
        driver.get(elm)
        element = get_element_by_xpath_or_false(driver, '//a[2][@class="dropdown-item"]')
        # Bug fix: the helper returns False when the link is missing; skip
        # such pages instead of crashing on False.get_attribute(...).
        if not element:
            print(f'No download link found on: {elm}')
            continue
        element_x = element.get_attribute("href")
        print(f'Download url: {element_x}')
        title_link_list.append((elm, element_x))
        title_csv_list.append(element_x)
    driver.quit()
    return title_link_list
|
||||
|
||||
|
||||
def save_to_file(file_url, folder):
    '''
    Download *file_url* and save it into *folder* under a '.csv' name.

    Returns the name the file was saved under. Raises requests.HTTPError on
    a 4xx/5xx response so a server error page is never stored as data.
    '''

    file_name = file_url.split("/")[-1] + '.csv'
    req = requests.get(file_url)
    # Fail loudly instead of writing an HTML error body into a .csv file.
    req.raise_for_status()
    with open(folder + file_name, 'wb') as output_file:
        output_file.write(req.content)
    return file_name
|
||||
|
||||
# Entry-point guard so importing this module does not launch browsers.
# The assignments stay at module level inside the if, which keeps the
# rebinding of title_url_list global — get_all_file_links reads it.
if __name__ == "__main__":
    title_url_list = get_all_link_urls()
    title_link_list = get_all_file_links()
    for elem in title_link_list:
        print(elem)

    for file_url in title_csv_list:
        print(f'saving: {file_url}')
        save_to_file(file_url, DOWNLOAD_FOLDER)
|
||||
|
||||
Reference in New Issue
Block a user