Merge pull request #10 from 13hannes11/feature/scraper

Feature/scraper
This commit is contained in:
Hannes Kuchelmeister
2021-04-28 13:57:47 +02:00
committed by GitHub
111 changed files with 298 additions and 33 deletions

1
.gitignore vendored
View File

@@ -1,4 +1,5 @@
de/csv
chromedriver
# Byte-compiled / optimized / DLL files
__pycache__/

View File

@@ -1 +1,7 @@
# UU_NCML_Project
# UU_NCML_Project
## Selenium
Install Google Chrome, download the ChromeDriver (WebDriver) binary that matches your installed Chrome version, and place it in this folder (filename: `chromedriver`):
https://www.selenium.dev/documentation/en/webdriver/driver_requirements/

View File

@@ -1,15 +1,29 @@
import pandas as pd
import os
# Convert data to csv
base_dir = "./de/"
out_dir = "csv/"
in_dir = "input/"
title_file = "filename_to_titles.csv"
out_dir = "csv"
if not os.path.exists(base_dir + out_dir):
os.makedirs(base_dir + out_dir)
for dirname, _, filenames in os.walk(base_dir + 'input'):
if not os.path.exists(os.path.join(base_dir, out_dir)):
os.makedirs(os.path.join(base_dir, out_dir))
# Copy titles file and replace file endings
print(f'Copying Title File')
with open(os.path.join(base_dir, in_dir, title_file), 'r') as file:
file_content = file.read()
file_content = file_content.replace('.xlsx', '.csv')
with open(os.path.join(base_dir, out_dir, title_file), 'w') as file:
file.write(file_content)
# Convert xlsx files to csv
for dirname, _, filenames in os.walk(os.path.join(base_dir, in_dir)):
for filename in filenames:
read_file = pd.read_excel (os.path.join(dirname, filename))
read_file.to_csv (os.path.join(base_dir + out_dir, filename.split(".", 1)[0] + ".csv"), index = None, header=True)
if filename != title_file:
print(f'Reading {filename}')
read_file = pd.read_excel (os.path.join(dirname, filename))
print(f'Saving {filename}')
read_file.to_csv (os.path.join(base_dir, out_dir, filename.split(".", 1)[0] + ".csv"), index = None, header=True)

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Some files were not shown because too many files have changed in this diff Show More