"""Scrape UK House of Commons division (vote) results from votes.parliament.uk.

Walks the paginated vote list, collects each vote's detail-page URL, extracts
the CSV download link from every detail page, and downloads each CSV into
DOWNLOAD_FOLDER.
"""

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

import multiprocessing

import os

import requests

import time

# Settings
DRIVER_PATH = "./chromedriver"
WAIT_TIME_SEC = 3

# Filter (currently unused -- the date-filter interactions below are commented out)
DATE_FROM = "01/01/2019"
DATE_TO = "01/05/2021"

# Output
DOWNLOAD_FOLDER = "./uk/"


def get_element_by_xpath_or_false(driver, xpath):
    """Return the first element matching *xpath*, or False when none exists.

    Using a False sentinel (rather than raising) lets callers probe for
    optional elements such as the pagination "next" button.
    """
    try:
        # find_element(By.XPATH, ...) works on both Selenium 3.x and 4.x;
        # the *_by_xpath shortcuts were removed in Selenium 4.
        return driver.find_element(By.XPATH, xpath)
    except NoSuchElementException:
        return False


def _make_driver(headless):
    """Build a Chrome driver with the shared window options."""
    options = Options()
    if headless:
        options.add_argument("--headless")
    options.add_argument("window-size=800,600")
    return webdriver.Chrome(options=options, executable_path=DRIVER_PATH)


# Module-level accumulators kept for backward compatibility with the
# original script's globals-based flow.
title_link_list = []  # (detail_page_url, csv_url) pairs
title_url_list = []   # detail-page URLs gathered from the paginated list
title_csv_list = []   # bare CSV download URLs, consumed by the download loop


def get_all_link_urls():
    """Collect the detail-page URL of every vote card, following pagination.

    Appends into the module-level ``title_url_list`` and returns it.
    The driver is always quit, even on error (the original leaked it).
    """
    url = r'https://votes.parliament.uk/Votes/Commons'

    driver = _make_driver(headless=False)
    try:
        driver.get(url)

        WebDriverWait(driver, 10).until(
            ec.visibility_of_element_located(
                (By.XPATH, '//div[contains(@class, "card-list")]')
            )
        )

        # Optional date filter (disabled in the original):
        # driver.find_element(By.XPATH, '//*[@id="FromDate"]').send_keys(DATE_FROM)
        # driver.find_element(By.XPATH, '//*[@id="ToDate"]').send_keys(DATE_TO)
        # driver.find_element(By.XPATH, '//*[@id="voteSearch"]/div[2]/div/div/div/button').click()

        while True:
            # The site provides no loading indicator, so wait a fixed period
            # after every action that triggers a page load.
            time.sleep(WAIT_TIME_SEC)

            # Each vote is an <a class="card card-vote"> linking to its detail page.
            for card in driver.find_elements(By.XPATH, '//a[@class="card card-vote"]'):
                if card.is_displayed():
                    title_url_list.append(card.get_attribute("href"))

            # Follow the "next" pagination link; stop when it no longer exists.
            next_btn = get_element_by_xpath_or_false(driver, '//li[@class="next"]')
            if not next_btn:
                break
            next_btn.click()
    finally:
        driver.quit()
    return title_url_list


def get_all_file_links():
    """Visit each collected detail page and extract its CSV download link.

    Fills ``title_link_list`` with (detail_page_url, csv_url) pairs and
    ``title_csv_list`` with the bare CSV URLs; returns ``title_link_list``.
    """
    driver = _make_driver(headless=True)
    try:
        for page_url in title_url_list:
            driver.get(page_url)
            link = get_element_by_xpath_or_false(driver, '//a[2][@class="dropdown-item"]')
            # BUG FIX: the original called .get_attribute() unconditionally and
            # crashed with AttributeError whenever the link was missing.
            if not link:
                print("No CSV download link found on", page_url)
                continue
            csv_url = link.get_attribute("href")
            title_link_list.append((page_url, csv_url))
            title_csv_list.append(csv_url)
    finally:
        driver.quit()
    return title_link_list


def save_to_file(file_url, folder):
    '''
    Download *file_url* into *folder* as "<last path segment>.csv".

    Raises requests.HTTPError on a non-2xx response (the original silently
    saved error pages as CSV files). Returns the file name written.
    '''
    file_name = file_url.split("/")[-1] + '.csv'
    # Timeout prevents a single stalled download from hanging the whole run.
    req = requests.get(file_url, timeout=60)
    req.raise_for_status()
    with open(os.path.join(folder, file_name), 'wb') as output_file:
        output_file.write(req.content)
    return file_name


def main():
    """Run the full scrape: vote list -> detail pages -> CSV downloads."""
    # BUG FIX: the original crashed with FileNotFoundError if ./uk/ was absent.
    os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)

    get_all_link_urls()
    get_all_file_links()
    for elem in title_link_list:
        print(elem)

    for file_url in title_csv_list:
        save_to_file(file_url, DOWNLOAD_FOLDER)


if __name__ == "__main__":
    main()