From 3a15bd127958d63654760bcf50a6d126650fd288 Mon Sep 17 00:00:00 2001 From: Hannes Kuchelmeister Date: Wed, 19 May 2021 16:00:59 +0200 Subject: [PATCH] cleaned up uk code for data loading --- load_data.py | 109 -------------------------------- uk_analysis.py | 27 ++++++-- uk_analysis_new.py | 43 ------------- voting_lib/load_data.py | 6 +- voting_lib/political_compass.py | 1 - 5 files changed, 24 insertions(+), 162 deletions(-) delete mode 100644 load_data.py mode change 100755 => 100644 uk_analysis.py delete mode 100644 uk_analysis_new.py diff --git a/load_data.py b/load_data.py deleted file mode 100644 index ca93a45..0000000 --- a/load_data.py +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -import os -import pandas as pd -import numpy as np - -def load_german_data(): - """ - Load German Parliament data - return : Data with columns [Member, Party, vote_0, vote_1 etc] - """ - title_file = "filename_to_titles.csv" - vote_counter = -1 - #data = pd.DataFrame() - data = {} - - period_column_g = 'Wahlperiode' - name_column_g = 'Bezeichnung' - party_column_g = 'Fraktion/Gruppe' - name_column = 'Member' - party_column = 'Party' - - vote_column_to_title = {} - - voting_features = ['ja', 'nein', 'Enthaltung', 'ungültig'] - for dirname, _, filenames in os.walk('./de/csv'): - for filename in filenames: - if filename != title_file: - - print(filename) - - vote_counter += 1 - df = pd.read_csv(os.path.join(dirname, filename)) - - # Give each voting behaviour type an identifier from 0 to len(voting_features) - 1 - for i, feature in enumerate(voting_features): - df[feature] *= i - vote_column_name = f'vote_{vote_counter}' - - # Map column name of vote to filename -> allows retrieving what the vote was about - vote_column_to_title[vote_column_name] = filename - - # add feature for the vote - df[vote_column_name] = df[voting_features].sum(axis=1) - - df=df.rename(columns={name_column_g:name_column,party_column_g:party_column}) - - period = df.iloc[0][period_column_g] - - if period in data: - # merge data with already loaded data - data[period] = data[period].merge(df[[name_column, vote_column_name]], on=name_column) - else: - # if first file that is loaded set data equal to data from first file - data[period] = df[[name_column, party_column, vote_column_name]] - - print(data) - return data - - -def load_uk_data(path): - """ - Load German Parliament data - return : Data with columns [Member, Party, vote_0, vote_1 etc] - """ - #print directory path - print(path) - # Preprocess data - vote_counter = -1 - data = pd.DataFrame() - - name_column = 'Member' - party_column = 'Party' - vote_column = 'Vote' - - column_to_filename = {} - - voting_features = {'Aye':0, 'Teller - Ayes':0, 'No':1, 'Teller - Noes':1, 'No Vote Recorded':2} - for dirname, _, filenames in os.walk(path): - for filename in filenames: - vote_counter += 1 - - # Read title rows - # sep is set to new line so it never splits up the title cells - title_df = pd.read_csv(os.path.join(dirname, filename), sep='\n',nrows=(3),skip_blank_lines=True,header=None) - - # Read data rows - df = pd.read_csv(os.path.join(dirname, filename),skiprows=(10)) - - # Give each voting behaviour type an identifier from 0 to len(voting_features) - 1 - df[vote_column].replace(voting_features, inplace=True) - - #Replace the vote column name - vote_column_name = f'vote_{vote_counter}' - df=df.rename(columns={vote_column:vote_column_name}) - - # Map column name of vote to title -> allows retrieving what the vote was about - column_to_filename[vote_column_name] = title_df.iat[2,0] - - if data.empty: - # if first file that is loaded set data equal to data from first file - data = df[[name_column, party_column, vote_column_name]] - else: - # merge data with already loaded data - data = data.merge(df[[name_column, vote_column_name]], on=name_column) - - print(data) - return data \ No newline at end of file diff --git a/uk_analysis.py b/uk_analysis.py old mode 100755 new mode 100644 index 392aeb1..f7955bd --- a/uk_analysis.py +++ b/uk_analysis.py @@ -1,21 +1,34 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +#This code is modified to run in Kaggle + import voting_lib.load_data as ld import voting_lib.voting_analysis as va +import numpy as np +import pandas as pd +import os + -# Load data -data = ld.load_uk_data().to_numpy() -X = data[:,2:] # Train model grid_h = 30 # Grid height grid_w = 30 # Grid width radius = 3 # Neighbour radius step = 0.5 -ep = 100 # No of epochs +ep = 100 # No of epochs -model = va.train_model(X, grid_h, grid_w, radius, step, ep) -# Predict and visualize output -va.predict(model, data, grid_h, grid_w) \ No newline at end of file +main_directory = 'uk/csv' +for dirname, _, filenames in os.walk(main_directory): + if dirname == main_directory: #to skip main directory path + continue + else: + # Load data + data = ld.load_uk_data(dirname).to_numpy() + + X = data[:,2:] + + model = va.train_model(X, grid_h, grid_w, radius, step, ep) + # Predict and visualize output + va.predict(model, data, grid_h, grid_w) \ No newline at end of file diff --git a/uk_analysis_new.py b/uk_analysis_new.py deleted file mode 100644 index 21af450..0000000 --- a/uk_analysis_new.py +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -#This code is modified to run in Kaggle - -#import voting_lib.load_data as ld -#import voting_lib.voting_analysis as va -import numpy as np -import pandas as pd -import os - - - -# Train model -grid_h = 30 # Grid height -grid_w = 30 # Grid width -radius = 3 # Neighbour radius -step = 0.5 -ep = 100 # No of epochs - -#main directory path(should contain differnt dataset directory) can be changed -main_directory = '/kaggle/input' - -for dirname, _, filenames in os.walk(main_directory): - #print(os.path.join(dirname)) - if dirname == main_directory: #to skip main directory path - continue - else: - # Load data - #data = ld.load_uk_data().to_numpy() - - #modifiy load_data.py --> load_uk_data() to load_uk_data(path) - # --> Place path in directory -> for dirname, _, filenames in os.walk(path): - data = load_uk_data(dirname).to_numpy() - - X = data[:,2:] - - #model = va.train_model(X, grid_h, grid_w, radius, step, ep) - model = train_model(X, grid_h, grid_w, radius, step, ep) - - # Predict and visualize output - #va.predict(model, data, grid_h, grid_w) - predict(model, data, grid_h, grid_w) \ No newline at end of file diff --git a/voting_lib/load_data.py b/voting_lib/load_data.py index d410f0d..ca93a45 100755 --- a/voting_lib/load_data.py +++ b/voting_lib/load_data.py @@ -59,11 +59,13 @@ def load_german_data(): return data -def load_uk_data(): +def load_uk_data(path): """ Load German Parliament data return : Data with columns [Member, Party, vote_0, vote_1 etc] """ + #print directory path + print(path) # Preprocess data vote_counter = -1 data = pd.DataFrame() @@ -75,7 +77,7 @@ def load_uk_data(): column_to_filename = {} voting_features = {'Aye':0, 'Teller - Ayes':0, 'No':1, 'Teller - Noes':1, 'No Vote Recorded':2} - for dirname, _, filenames in os.walk('./uk/csv'): + for dirname, _, filenames in os.walk(path): for filename in filenames: vote_counter += 1 diff --git a/voting_lib/political_compass.py b/voting_lib/political_compass.py index 67b4c21..7832978 100644 --- a/voting_lib/political_compass.py +++ b/voting_lib/political_compass.py @@ -9,7 +9,6 @@ def get_compass_parties(year=2017, country='de'): data = [[-3.5, -4], [7, 6.5], [-7, -6.5], [1, 2]] index = ['BÜ90/GR', 'CDU/CSU', 'DIE LINKE.', 'SPD'] elif year == 2005: - # TODO: add data for 2011 data = [[-1.5, -1.5], [9.5, 8], [-6, -2], [3, 3.5]] index = [ 'BÜ90/GR', 'CDU/CSU', 'DIE LINKE.', 'SPD'] else: