Restructured source files, closed #18

Deepthi Pathare
2021-05-10 14:55:31 +02:00
parent c5fa8fead7
commit db762c4061
3 changed files with 277 additions and 0 deletions


@@ -0,0 +1,29 @@
import pandas as pd
import os
base_dir = "../de/"
out_dir = "csv/"
in_dir = "input/"
title_file = "filename_to_titles.csv"
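# Paths are relative to the working directory: xlsx input comes from
# ../de/input/, converted csv output goes to ../de/csv/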
if not os.path.exists(os.path.join(base_dir, out_dir)):
    os.makedirs(os.path.join(base_dir, out_dir))

# Copy titles file and replace file endings
print('Copying title file')
with open(os.path.join(base_dir, in_dir, title_file), 'r') as file:
    file_content = file.read()
file_content = file_content.replace('.xlsx', '.csv')
with open(os.path.join(base_dir, out_dir, title_file), 'w') as file:
    file.write(file_content)

# Convert xlsx files to csv
for dirname, _, filenames in os.walk(os.path.join(base_dir, in_dir)):
    for filename in filenames:
        if filename != title_file:
            print(f'Reading {filename}')
            read_file = pd.read_excel(os.path.join(dirname, filename))
            print(f'Saving {filename}')
            read_file.to_csv(os.path.join(base_dir, out_dir, filename.split(".", 1)[0] + ".csv"), index=False, header=True)

download_data/scrape_de.py Normal file

@@ -0,0 +1,125 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import multiprocessing
import requests
import time
# Settings
DRIVER_PATH = "./chromedriver"
WAIT_TIME_SEC = 7
# Filter
DATE_FROM = "01.01.2019"
DATE_TO = "20.04.2021"
# Output
DOWNLOAD_FOLDER = "../de/input/"
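# Dates use the DD.MM.YYYY format of the bundestag.de filter form; the xlsx
# files saved here are the input the xlsx-to-csv converter reads from ../de/input/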
def get_element_by_xpath_or_false(driver, xpath):
    try:
        element = driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return element

def get_title_and_url():
    '''
    Collect (title, url) pairs for all roll-call votes in the filtered list.
    '''
    title_url_list = []
    url = r'https://www.bundestag.de/parlament/plenum/abstimmung/liste'
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=800,600")
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    driver.get(url)
    WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, "//table")))
    driver.find_element_by_xpath('//input[starts-with(@id, "from_")]').send_keys(DATE_FROM)
    driver.find_element_by_xpath('//input[starts-with(@id, "to_")]').send_keys(DATE_TO)
    driver.find_element_by_xpath('//div[@class= "bt-filterzeile-scroller"]').click()
    running = True
    while running:
        # As the site does not provide any loading indicators, we need to wait
        # after performing an action that requires loading
        time.sleep(WAIT_TIME_SEC)
        # Selector for the elements that contain the title and the link to the excel file
        element_selector = '//div[contains(@class, "bt-standard-conten") and not(@aria-hidden="true")]/table//div[@class= "bt-documents-description"]'
        elements = driver.find_elements_by_xpath(element_selector)
        for element in elements:
            if element.is_displayed():
                title_element = get_element_by_xpath_or_false(element, './p/strong')
                link_element = get_element_by_xpath_or_false(element, './ul/li/a[starts-with(@title, "XLSX")]')
                if title_element and link_element:
                    title = title_element.text
                    link = link_element.get_attribute("href")
                    title_url_list.append((title, link))
                    print(title)
                    print(link)
        # Is there a next page?
        element = get_element_by_xpath_or_false(driver, '//button[contains(@class, "slick-next") and not(contains(@class, "slick-disabled"))]')
        if element:
            # Move to the bottom of the page to avoid the share button when clicking the element
            ActionChains(driver).move_to_element(driver.find_element_by_xpath('//div[contains(@class, "bt-footer-service")]')).perform()
            element.click()
        else:
            running = False
    driver.quit()
    return title_url_list

def save_to_file(file_url, folder):
    '''
    Save the file from file_url into the specified folder.
    '''
    file_name = file_url.split("/")[-1]
    req = requests.get(file_url)
    with open(folder + file_name, 'wb') as output_file:
        output_file.write(req.content)
    return file_name

def save_titles(title_filename_list, folder):
    '''
    Save the filename-to-title mapping into folder as 'filename_to_titles.csv'.
    '''
    with open(folder + 'filename_to_titles.csv', 'wt') as output_file:
        for title, file_name in title_filename_list:
            output_file.write(f'{file_name};{title}\n')
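# filename_to_titles.csv is the mapping file the xlsx-to-csv converter copies
# over to ../de/csv/, rewriting the .xlsx endings it references to .csv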

def title_url_list_element_saver(x):
    """Download the file behind one (title, url) tuple.

    Args:
        x: tuple of title and url

    Returns:
        (str, str): tuple of title and filename
    """
    print(f'Saving {x[0]}')
    return x[0], save_to_file(x[1], DOWNLOAD_FOLDER)


if __name__ == '__main__':
    # The guard is required for multiprocessing on platforms that spawn workers
    title_url_list = get_title_and_url()
    pool = multiprocessing.Pool()
    title_filenames_map = pool.map(title_url_list_element_saver, title_url_list)
    save_titles(title_filenames_map, DOWNLOAD_FOLDER)
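# Assumes a chromedriver binary matching the installed Chrome version next to
# this script (see DRIVER_PATH); run from download_data/ so the relative paths resolve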

download_data/scrape_uk.py Normal file

@@ -0,0 +1,123 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
import multiprocessing
import requests
import time
# Settings
DRIVER_PATH = "./chromedriver"
WAIT_TIME_SEC = 3
# Filter
DATE_FROM = "01/05/2020"
DATE_TO = "01/05/2021"
# Output
DOWNLOAD_FOLDER = "../uk/csv/"
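# Dates use the DD/MM/YYYY format of the UK filter form; downloads are already
# CSV, so they go straight into ../uk/csv/ with no conversion step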
def get_element_by_xpath_or_false(driver, xpath):
    try:
        element = driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return element

title_link_list = []
title_url_list = []
title_csv_list = []


# Collect the URL of every vote page in the filtered list
def get_all_link_urls():
    url = r'https://votes.parliament.uk/Votes/Commons'
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=800,600")
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    driver.get(url)
    WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, '//div[contains(@class, "card-list")]')))
    driver.find_element_by_xpath('//*[@id="FromDate"]').clear()
    driver.find_element_by_xpath('//*[@id="ToDate"]').clear()
    driver.find_element_by_xpath('//*[@id="FromDate"]').send_keys(DATE_FROM)
    driver.find_element_by_xpath('//*[@id="ToDate"]').send_keys(DATE_TO)
    driver.find_element_by_xpath('//button[@class="btn btn-primary"]').click()
    running = True
    while running:
        # As the site does not provide any loading indicators, we need to wait
        # after performing an action that requires loading
        time.sleep(WAIT_TIME_SEC)
        # Selector for the cards that link to the individual vote pages
        elem_selector = '//a[@class="card card-vote"]'
        elems = driver.find_elements_by_xpath(elem_selector)
        for elem in elems:
            if elem.is_displayed():
                title_url_list.append(elem.get_attribute("href"))
                print(f'Link to vote page: {elem.get_attribute("href")}')
        # Is there a next page? The "next" button is absent on the last page
        elem_x = get_element_by_xpath_or_false(driver, '//li[@class="next"]')
        if elem_x:
            elem_x.click()
        else:
            running = False
    driver.quit()
    return title_url_list

# Collect the CSV download URL from each vote page
def get_all_file_links():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=800,600")
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    for elm in title_url_list:
        driver.get(elm)
        element = get_element_by_xpath_or_false(driver, '//a[2][@class="dropdown-item"]')
        if not element:
            # Skip vote pages that do not expose a download link
            continue
        element_x = element.get_attribute("href")
        print(f'Download url: {element_x}')
        title_link_list.append((elm, element_x))
        title_csv_list.append(element_x)
    driver.quit()
    return title_link_list

def save_to_file(file_url, folder):
    '''
    Save the file from file_url into the specified folder.
    '''
    file_name = file_url.split("/")[-1] + '.csv'
    req = requests.get(file_url)
    with open(folder + file_name, 'wb') as output_file:
        output_file.write(req.content)
    return file_name

title_url_list = get_all_link_urls()
title_link_list = get_all_file_links()
for elem in title_link_list:
    print(elem)
for file_url in title_csv_list:
    print(f'Saving: {file_url}')
    save_to_file(file_url, DOWNLOAD_FOLDER)
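# Two-pass design: get_all_link_urls() pages through the vote list first, then
# get_all_file_links() visits each vote page to pick up its CSV download link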