From 3a15bd127958d63654760bcf50a6d126650fd288 Mon Sep 17 00:00:00 2001
From: Hannes Kuchelmeister <hannes@kuchelmeister.org>
Date: Wed, 19 May 2021 16:00:59 +0200
Subject: [PATCH] cleaned up uk code for data loading

---
 load_data.py                    | 109 --------------------------------
 uk_analysis.py                  |  27 ++++++--
 uk_analysis_new.py              |  43 -------------
 voting_lib/load_data.py         |   6 +-
 voting_lib/political_compass.py |   1 -
 5 files changed, 24 insertions(+), 162 deletions(-)
 delete mode 100644 load_data.py
 mode change 100755 => 100644 uk_analysis.py
 delete mode 100644 uk_analysis_new.py

diff --git a/load_data.py b/load_data.py
deleted file mode 100644
index ca93a45..0000000
--- a/load_data.py
+++ /dev/null
@@ -1,109 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-import os
-import pandas as pd
-import numpy as np
-
-def load_german_data():
-    """
-    Load German Parliament data
-    return : Data with columns [Member, Party, vote_0, vote_1 etc]
-    """  
-    title_file = "filename_to_titles.csv"
-    vote_counter = -1
-    #data = pd.DataFrame()
-    data = {}
-    
-    period_column_g = 'Wahlperiode'
-    name_column_g = 'Bezeichnung'
-    party_column_g = 'Fraktion/Gruppe'
-    name_column = 'Member'
-    party_column = 'Party'
-    
-    vote_column_to_title = {}
-    
-    voting_features = ['ja', 'nein', 'Enthaltung', 'ungültig']
-    for dirname, _, filenames in os.walk('./de/csv'):
-        for filename in filenames:
-            if filename != title_file:
-                
-                print(filename)
-                
-                vote_counter += 1
-                df = pd.read_csv(os.path.join(dirname, filename))
-                
-                # Give each voting behaviour type an identifier from 0 to len(voting_features) - 1
-                for i, feature in enumerate(voting_features):
-                    df[feature] *= i
-                vote_column_name = f'vote_{vote_counter}'
-                
-                # Map column name of vote to filename -> allows retrieving what the vote was about
-                vote_column_to_title[vote_column_name] = filename
-                
-                # add feature for the vote
-                df[vote_column_name] = df[voting_features].sum(axis=1)
-                
-                df=df.rename(columns={name_column_g:name_column,party_column_g:party_column})
-                
-                period = df.iloc[0][period_column_g]
-                                    
-                if period in data:
-                    # merge data with already loaded data 
-                    data[period] = data[period].merge(df[[name_column, vote_column_name]], on=name_column)                    
-                else:
-                    # if first file that is loaded set data equal to data from first file
-                    data[period] = df[[name_column, party_column, vote_column_name]]
-                    
-    print(data)
-    return data
-
-
-def load_uk_data(path):
-    """
-    Load German Parliament data
-    return : Data with columns [Member, Party, vote_0, vote_1 etc]
-    """  
-    #print directory path
-    print(path) 
-    # Preprocess data
-    vote_counter = -1
-    data = pd.DataFrame()
-    
-    name_column = 'Member'
-    party_column = 'Party'
-    vote_column = 'Vote'
-    
-    column_to_filename = {}
-    
-    voting_features = {'Aye':0, 'Teller - Ayes':0, 'No':1, 'Teller - Noes':1, 'No Vote Recorded':2}
-    for dirname, _, filenames in os.walk(path):
-        for filename in filenames:
-            vote_counter += 1
-            
-            # Read title rows
-            # sep is set to new line so it never splits up the title cells
-            title_df = pd.read_csv(os.path.join(dirname, filename), sep='\n',nrows=(3),skip_blank_lines=True,header=None)
-        
-            # Read data rows
-            df = pd.read_csv(os.path.join(dirname, filename),skiprows=(10))
-            
-            # Give each voting behaviour type an identifier from 0 to len(voting_features) - 1
-            df[vote_column].replace(voting_features, inplace=True)
-            
-            #Replace the vote column name
-            vote_column_name = f'vote_{vote_counter}'
-            df=df.rename(columns={vote_column:vote_column_name})
-             
-            # Map column name of vote to title -> allows retrieving what the vote was about
-            column_to_filename[vote_column_name] = title_df.iat[2,0]
-                    
-            if data.empty:
-                # if first file that is loaded set data equal to data from first file
-                data = df[[name_column, party_column, vote_column_name]]
-            else:
-                # merge data with already loaded data 
-                data = data.merge(df[[name_column, vote_column_name]], on=name_column)
-    
-    print(data)
-    return data
\ No newline at end of file
diff --git a/uk_analysis.py b/uk_analysis.py
old mode 100755
new mode 100644
index 392aeb1..f7955bd
--- a/uk_analysis.py
+++ b/uk_analysis.py
@@ -1,21 +1,34 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
+# Trains and visualizes a model for each dataset directory under uk/csv.
+
 import voting_lib.load_data as ld
 import voting_lib.voting_analysis as va
+import numpy as np
+import pandas as pd
+import os
+
 
-# Load data
-data = ld.load_uk_data().to_numpy()
-X = data[:,2:]
 
 # Train model
 grid_h = 30       # Grid height
 grid_w = 30       # Grid width
 radius = 3        # Neighbour radius
 step = 0.5
-ep = 100          # No of epochs
+ep = 100         # No of epochs
 
-model = va.train_model(X, grid_h, grid_w, radius, step, ep)
 
-# Predict and visualize output
-va.predict(model, data, grid_h, grid_w)
\ No newline at end of file
+main_directory = 'uk/csv'
+# Each direct subdirectory of main_directory holds one dataset. Only that
+# first level is listed: load_uk_data() walks its argument recursively, so
+# walking the whole tree here would load nested directories a second time.
+_, subdirs, _ = next(os.walk(main_directory), (main_directory, [], []))
+for subdir in sorted(subdirs):
+    dirname = os.path.join(main_directory, subdir)
+    # Columns 0 and 1 are Member and Party; the rest are the vote columns.
+    data = ld.load_uk_data(dirname).to_numpy()
+    X = data[:, 2:]
+    model = va.train_model(X, grid_h, grid_w, radius, step, ep)
+    # Predict and visualize output
+    va.predict(model, data, grid_h, grid_w)
\ No newline at end of file
diff --git a/uk_analysis_new.py b/uk_analysis_new.py
deleted file mode 100644
index 21af450..0000000
--- a/uk_analysis_new.py
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-#This code is modified to run in Kaggle
-
-#import voting_lib.load_data as ld
-#import voting_lib.voting_analysis as va
-import numpy as np
-import pandas as pd
-import os
-
-
-
-# Train model
-grid_h = 30       # Grid height
-grid_w = 30       # Grid width
-radius = 3        # Neighbour radius
-step = 0.5
-ep = 100          # No of epochs
-
-#main directory path(should contain differnt dataset directory) can be changed 
-main_directory = '/kaggle/input'
-
-for dirname, _, filenames in os.walk(main_directory):
-        #print(os.path.join(dirname))
-        if dirname == main_directory: #to skip main directory path 
-            continue
-        else:
-            # Load data
-            #data = ld.load_uk_data().to_numpy()
-
-            #modifiy load_data.py --> load_uk_data() to load_uk_data(path)
-                                # --> Place path in directory -> for dirname, _, filenames in os.walk(path):
-            data = load_uk_data(dirname).to_numpy() 
-
-            X = data[:,2:]
-
-            #model = va.train_model(X, grid_h, grid_w, radius, step, ep)
-            model = train_model(X, grid_h, grid_w, radius, step, ep)
-
-            # Predict and visualize output
-            #va.predict(model, data, grid_h, grid_w)
-            predict(model, data, grid_h, grid_w)
\ No newline at end of file
diff --git a/voting_lib/load_data.py b/voting_lib/load_data.py
index d410f0d..ca93a45 100755
--- a/voting_lib/load_data.py
+++ b/voting_lib/load_data.py
@@ -59,11 +59,13 @@ def load_german_data():
     return data
 
 
-def load_uk_data():
+def load_uk_data(path):
     """
     Load German Parliament data
     return : Data with columns [Member, Party, vote_0, vote_1 etc]
     """  
+    # Print the directory that is being loaded
+    print(path) 
     # Preprocess data
     vote_counter = -1
     data = pd.DataFrame()
@@ -75,7 +77,7 @@ def load_uk_data():
     column_to_filename = {}
     
     voting_features = {'Aye':0, 'Teller - Ayes':0, 'No':1, 'Teller - Noes':1, 'No Vote Recorded':2}
-    for dirname, _, filenames in os.walk('./uk/csv'):
+    for dirname, _, filenames in os.walk(path):
         for filename in filenames:
             vote_counter += 1
             
diff --git a/voting_lib/political_compass.py b/voting_lib/political_compass.py
index 67b4c21..7832978 100644
--- a/voting_lib/political_compass.py
+++ b/voting_lib/political_compass.py
@@ -9,7 +9,6 @@ def get_compass_parties(year=2017, country='de'):
             data  = [[-3.5, -4],  [7, 6.5],   [-7, -6.5], [1, 2]]
             index =  ['BÜ90/GR', 'CDU/CSU', 'DIE LINKE.', 'SPD']
         elif year == 2005:
-            # TODO: add data for 2011
             data  = [[-1.5, -1.5],  [9.5, 8],     [-6, -2], [3, 3.5]]
             index =  [  'BÜ90/GR', 'CDU/CSU', 'DIE LINKE.',   'SPD']
         else: