mirror of
https://github.com/13hannes11/UU_NCML_Project.git
synced 2024-09-03 20:50:59 +02:00
Add files via upload
This commit is contained in:
118
uk_scraper.py
Normal file
@@ -0,0 +1,118 @@
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains

import multiprocessing

import requests

import time

# Settings
DRIVER_PATH = "./chromedriver"
WAIT_TIME_SEC = 3

# Filter
DATE_FROM = "01/01/2019"
DATE_TO = "01/05/2021"

# Output
DOWNLOAD_FOLDER = "./uk/"


def get_element_by_xpath_or_false(driver, xpath):
    """Return the element matching xpath, or False if it does not exist."""
    try:
        element = driver.find_element_by_xpath(xpath)
    except NoSuchElementException:
        return False
    return element


title_link_list = []
title_url_list = []
title_csv_list = []


# Function to get title URLs
def get_all_link_urls():
    url = r'https://votes.parliament.uk/Votes/Commons'

    options = Options()
    # options.add_argument("--headless")
    options.add_argument("window-size=800,600")

    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    driver.get(url)

    WebDriverWait(driver, 10).until(ec.visibility_of_element_located((By.XPATH, '//div[contains(@class, "card-list")]')))

    # driver.find_element_by_xpath('//*[@id="FromDate"]').send_keys(DATE_FROM)
    # driver.find_element_by_xpath('//*[@id="ToDate"]').send_keys(DATE_TO)
    # driver.find_element_by_xpath('//*[@id="voteSearch"]/div[2]/div/div/div/button').click()

    running = True
    while running:
        # The site does not provide any loading indicators, so we wait after
        # performing an action that triggers a page load.
        time.sleep(WAIT_TIME_SEC)

        # Selector for elements that contain the title and the link to the Excel file
        elem_selector = '//a[@class="card card-vote"]'
        elems = driver.find_elements_by_xpath(elem_selector)

        for elem in elems:
            if elem.is_displayed():
                title_url_list.append(elem.get_attribute("href"))
                # print(elem.get_attribute("href"))

        # Is there a next page? If not, we are on the last page.
        elem_x = get_element_by_xpath_or_false(driver, '//li[@class="next"]')
        if elem_x:
            elem_x.click()
        else:
            running = False
    driver.quit()
    return title_url_list


# Function to get CSV file download URLs
def get_all_file_links():
    options = Options()
    options.add_argument("--headless")
    options.add_argument("window-size=800,600")

    driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
    for elm in title_url_list:
        driver.get(elm)
        element = get_element_by_xpath_or_false(driver, '//a[2][@class="dropdown-item"]')
        if not element:
            # Skip vote pages that do not expose a CSV download link
            continue
        element_x = element.get_attribute("href")
        title_link_list.append((elm, element_x))
        title_csv_list.append(element_x)
    driver.quit()
    return title_link_list


def save_to_file(file_url, folder):
    '''
    Function to save the file from a URL into the specified folder.
    '''
    file_name = file_url.split("/")[-1] + '.csv'
    req = requests.get(file_url)
    with open(folder + file_name, 'wb') as output_file:
        output_file.write(req.content)
    return file_name

title_url_list = get_all_link_urls()
title_link_list = get_all_file_links()
for elem in title_link_list:
    print(elem)

for file_url in title_csv_list:
    save_to_file(file_url, DOWNLOAD_FOLDER)
    # print(file_url)