from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
import multiprocessing
import requests
import os
import time

# Settings
DRIVER_PATH = "./chromedriver"
WAIT_TIME_SEC = 3

# Filter
DATE_FROM = "01/01/2019"
DATE_TO = "01/05/2021"

# Output
DOWNLOAD_FOLDER = "./uk/"


def get_element_by_xpath_or_false(driver, xpath):
    '''Return the first element matching xpath, or False if none exists.'''
    try:
        element = driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return element


title_link_list = []
title_url_list = []
title_csv_list = []


def get_all_link_urls():
    '''Collect the URL of every vote page that matches the date filter.'''
    url = 'https://votes.parliament.uk/Votes/Commons'
    options = Options()
    # options.add_argument("--headless")
    options.add_argument("window-size=800,600")
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    driver.get(url)
    WebDriverWait(driver, 10).until(
        ec.visibility_of_element_located((By.XPATH, '//div[contains(@class, "card-list")]')))

    # Apply the date filter and submit the search form.
    driver.find_element_by_xpath('//*[@id="FromDate"]').clear()
    driver.find_element_by_xpath('//*[@id="ToDate"]').clear()
    driver.find_element_by_xpath('//*[@id="FromDate"]').send_keys(DATE_FROM)
    driver.find_element_by_xpath('//*[@id="ToDate"]').send_keys(DATE_TO)
    driver.find_element_by_xpath('//button[@class="btn btn-primary"]').click()

    running = True
    while running:
        # The site does not provide a loading indicator, so wait after every
        # action that triggers a page load.
        time.sleep(WAIT_TIME_SEC)

        # Each vote card holds the title and links to that vote's detail page.
        elem_selector = '//a[@class="card card-vote"]'
        elems = driver.find_elements_by_xpath(elem_selector)
        for elem in elems:
            if elem.is_displayed():
                title_url_list.append(elem.get_attribute("href"))

        # Follow the "next" pagination link until we reach the last page.
        elem_x = get_element_by_xpath_or_false(driver, '//li[@class="next"]')
        if elem_x:
            elem_x.click()
        else:
            running = False

    driver.quit()
    return title_url_list


def get_all_file_links():
    '''Visit every vote page and collect the CSV download URL.'''
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=800,600")
    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    for elm in title_url_list:
        driver.get(elm)
        element = get_element_by_xpath_or_false(driver, '//a[2][@class="dropdown-item"]')
        if not element:
            # Skip votes that do not offer a download link.
            continue
        element_x = element.get_attribute("href")
        title_link_list.append((elm, element_x))
        title_csv_list.append(element_x)
    driver.quit()
    return title_link_list


def save_to_file(file_url, folder):
    '''Download file_url and save it in folder as <last URL segment>.csv.'''
    file_name = file_url.split("/")[-1] + '.csv'
    req = requests.get(file_url)
    with open(os.path.join(folder, file_name), 'wb') as output_file:
        output_file.write(req.content)
    return file_name


title_url_list = get_all_link_urls()
title_link_list = get_all_file_links()

for elem in title_link_list:
    print(elem)

os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
for file_url in title_csv_list:
    save_to_file(file_url, DOWNLOAD_FOLDER)
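

# Optional: the CSV downloads are independent of each other, so they could be
# fetched in parallel with the multiprocessing module imported above. This is a
# minimal sketch, not part of the original flow; save_all_in_parallel and
# _download_one are illustrative names. On Windows/macOS (spawn start method),
# move the script's top-level calls under an `if __name__ == "__main__":` guard
# before using it, or child processes will re-run the scrape.

def _download_one(file_url):
    '''Helper so the worker function is a picklable module-level callable.'''
    return save_to_file(file_url, DOWNLOAD_FOLDER)


def save_all_in_parallel(file_urls, workers=4):
    '''Download every CSV in file_urls using a small process pool.'''
    with multiprocessing.Pool(processes=workers) as pool:
        return pool.map(_download_one, file_urls)

# Example, replacing the serial download loop above:
# save_all_in_parallel(title_csv_list)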