mirror of
https://github.com/13hannes11/UU_NCML_Project.git
synced 2024-09-03 20:50:59 +02:00
add downloader to scraper script
This commit is contained in:
71
scraper.py
71
scraper.py
@@ -6,9 +6,24 @@ from selenium.common.exceptions import NoSuchElementException
|
|||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
from selenium.webdriver.common.action_chains import ActionChains
|
from selenium.webdriver.common.action_chains import ActionChains
|
||||||
|
|
||||||
|
import multiprocessing
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
# Settings
|
||||||
|
DRIVER_PATH = "./chromedriver"
|
||||||
|
WAIT_TIME_SEC = 7
|
||||||
|
|
||||||
|
# Filter
|
||||||
|
DATE_FROM = "01.01.2019"
|
||||||
|
DATE_TO = "20.04.2021"
|
||||||
|
|
||||||
|
# Output
|
||||||
|
DOWNLOAD_FOLDER = "de/input/"
|
||||||
|
|
||||||
|
|
||||||
def get_element_by_xpath_or_false(driver, xpath):
|
def get_element_by_xpath_or_false(driver, xpath):
|
||||||
try:
|
try:
|
||||||
element = driver.find_element_by_xpath(xpath)
|
element = driver.find_element_by_xpath(xpath)
|
||||||
@@ -16,16 +31,16 @@ def get_element_by_xpath_or_false(driver, xpath):
|
|||||||
return False
|
return False
|
||||||
return element
|
return element
|
||||||
|
|
||||||
WAIT_TIME_SEC = 7
|
def get_title_and_url():
|
||||||
|
'''
|
||||||
|
function to get titles and URLS for dataset
|
||||||
|
'''
|
||||||
|
title_url_list = []
|
||||||
|
|
||||||
DATE_FROM = "01.01.2019"
|
|
||||||
DATE_TO = "20.04.2021"
|
|
||||||
|
|
||||||
DRIVER_PATH = "./chromedriver"
|
|
||||||
url = r'https://www.bundestag.de/parlament/plenum/abstimmung/liste'
|
url = r'https://www.bundestag.de/parlament/plenum/abstimmung/liste'
|
||||||
|
|
||||||
options = Options()
|
options = Options()
|
||||||
#options.add_argument("--headless")
|
options.add_argument("--headless")
|
||||||
options.add_argument("window-size=800,600")
|
options.add_argument("window-size=800,600")
|
||||||
|
|
||||||
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
|
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)
|
||||||
@@ -54,7 +69,7 @@ while running:
|
|||||||
if title_element and link_element:
|
if title_element and link_element:
|
||||||
title = title_element.text
|
title = title_element.text
|
||||||
link = link_element.get_attribute("href")
|
link = link_element.get_attribute("href")
|
||||||
|
title_url_list.append((title, link))
|
||||||
print(title)
|
print(title)
|
||||||
print(link)
|
print(link)
|
||||||
|
|
||||||
@@ -66,5 +81,45 @@ while running:
|
|||||||
element.click()
|
element.click()
|
||||||
else:
|
else:
|
||||||
running = False
|
running = False
|
||||||
|
driver.quit()
|
||||||
|
return title_url_list
|
||||||
|
|
||||||
|
|
||||||
|
def save_to_file(file_url, folder):
    '''
    Download the file at *file_url* and save it into *folder*.

    The local file name is taken from the last path segment of the URL.

    Args:
        file_url (str): direct URL of the file to download
        folder (str): destination folder path; expected to end with a
            path separator (it is joined by plain concatenation)

    Returns:
        str: the file name the content was saved under

    Raises:
        requests.HTTPError: if the server responds with a 4xx/5xx status
    '''
    file_name = file_url.split("/")[-1]
    req = requests.get(file_url)
    # Fail loudly on error responses instead of silently writing an
    # HTML error page to disk as if it were the document.
    req.raise_for_status()
    with open(folder + file_name, 'wb') as output_file:
        output_file.write(req.content)
    return file_name
|
||||||
|
|
||||||
|
def save_titles(title_filename_list, folder):
    '''
    Save title/file-name mappings into *folder* as 'filename_to_titles.csv'.

    Each row has the form ``<file_name>;<title>``.

    Args:
        title_filename_list: iterable of (title, file_name) tuples
        folder (str): destination folder path; expected to end with a
            path separator (it is joined by plain concatenation)
    '''
    import csv
    # csv.writer correctly quotes titles that contain the ';' delimiter,
    # quotes, or newlines — a plain f-string write would silently corrupt
    # the file for such titles. UTF-8 is pinned explicitly so German
    # umlauts survive on platforms whose default encoding is not UTF-8.
    with open(folder + 'filename_to_titles.csv', 'wt',
              newline='', encoding='utf-8') as output_file:
        writer = csv.writer(output_file, delimiter=';', lineterminator='\n')
        for title, file_name in title_filename_list:
            writer.writerow([file_name, title])
|
||||||
|
|
||||||
|
def title_url_list_element_saver(x):
    """Download the file behind one (title, url) pair.

    Intended as a worker function for ``multiprocessing.Pool.map``.

    Args:
        x (tuple[str, str]): (title, url) of one document

    Returns:
        tuple[str, str]: (title, file_name), where file_name is the name
        the download was saved under inside ``DOWNLOAD_FOLDER``
    """
    # NOTE: the previous annotation ``x:(str, str)`` evaluated to a tuple
    # of classes, not a type — documented properly above instead.
    title, url = x
    print(f'Saving {title}')
    return title, save_to_file(url, DOWNLOAD_FOLDER)
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: scrape the vote list, download every linked document
# in parallel, then persist the title<->file-name mapping.
#
# The __main__ guard is required with multiprocessing: worker processes
# re-import this module (under the spawn start method), and without the
# guard each worker would start scraping and spawning pools again.
if __name__ == "__main__":
    title_url_list = get_title_and_url()
    # Context manager guarantees the pool's workers are terminated even
    # if one of the downloads raises.
    with multiprocessing.Pool() as pool:
        title_filenames_map = pool.map(title_url_list_element_saver,
                                       title_url_list)
    save_titles(title_filenames_map, DOWNLOAD_FOLDER)
|
|
||||||
|
|||||||
Reference in New Issue
Block a user