Cleaned up the UK data-loading code

2024-09-03 20:50:59 +02:00 · 2021-05-19 16:00:59 +02:00
parent a2fef60b27
commit 3a15bd1279
5 changed files with 24 additions and 162 deletions
@@ -1,109 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-import os
-import pandas as pd
-import numpy as np
-
-def load_german_data():
-    """
-    Load German Parliament data
-    return : Data with columns [Member, Party, vote_0, vote_1 etc]
-    """  
-    title_file = "filename_to_titles.csv"
-    vote_counter = -1
-    #data = pd.DataFrame()
-    data = {}
-    
-    period_column_g = 'Wahlperiode'
-    name_column_g = 'Bezeichnung'
-    party_column_g = 'Fraktion/Gruppe'
-    name_column = 'Member'
-    party_column = 'Party'
-    
-    vote_column_to_title = {}
-    
-    voting_features = ['ja', 'nein', 'Enthaltung', 'ungültig']
-    for dirname, _, filenames in os.walk('./de/csv'):
-        for filename in filenames:
-            if filename != title_file:
-                
-                print(filename)
-                
-                vote_counter += 1
-                df = pd.read_csv(os.path.join(dirname, filename))
-                
-                # Give each voting behaviour type an identifier from 0 to len(voting_features) - 1
-                for i, feature in enumerate(voting_features):
-                    df[feature] *= i
-                vote_column_name = f'vote_{vote_counter}'
-                
-                # Map column name of vote to filename -> allows retrieving what the vote was about
-                vote_column_to_title[vote_column_name] = filename
-                
-                # add feature for the vote
-                df[vote_column_name] = df[voting_features].sum(axis=1)
-                
-                df=df.rename(columns={name_column_g:name_column,party_column_g:party_column})
-                
-                period = df.iloc[0][period_column_g]
-                                    
-                if period in data:
-                    # merge data with already loaded data 
-                    data[period] = data[period].merge(df[[name_column, vote_column_name]], on=name_column)                    
-                else:
-                    # if first file that is loaded set data equal to data from first file
-                    data[period] = df[[name_column, party_column, vote_column_name]]
-                    
-    print(data)
-    return data
-
-
-def load_uk_data(path):
-    """
-    Load German Parliament data
-    return : Data with columns [Member, Party, vote_0, vote_1 etc]
-    """  
-    #print directory path
-    print(path) 
-    # Preprocess data
-    vote_counter = -1
-    data = pd.DataFrame()
-    
-    name_column = 'Member'
-    party_column = 'Party'
-    vote_column = 'Vote'
-    
-    column_to_filename = {}
-    
-    voting_features = {'Aye':0, 'Teller - Ayes':0, 'No':1, 'Teller - Noes':1, 'No Vote Recorded':2}
-    for dirname, _, filenames in os.walk(path):
-        for filename in filenames:
-            vote_counter += 1
-            
-            # Read title rows
-            # sep is set to new line so it never splits up the title cells
-            title_df = pd.read_csv(os.path.join(dirname, filename), sep='\n',nrows=(3),skip_blank_lines=True,header=None)
-        
-            # Read data rows
-            df = pd.read_csv(os.path.join(dirname, filename),skiprows=(10))
-            
-            # Give each voting behaviour type an identifier from 0 to len(voting_features) - 1
-            df[vote_column].replace(voting_features, inplace=True)
-            
-            #Replace the vote column name
-            vote_column_name = f'vote_{vote_counter}'
-            df=df.rename(columns={vote_column:vote_column_name})
-             
-            # Map column name of vote to title -> allows retrieving what the vote was about
-            column_to_filename[vote_column_name] = title_df.iat[2,0]
-                    
-            if data.empty:
-                # if first file that is loaded set data equal to data from first file
-                data = df[[name_column, party_column, vote_column_name]]
-            else:
-                # merge data with already loaded data 
-                data = data.merge(df[[name_column, vote_column_name]], on=name_column)
-    
-    print(data)
-    return data
@@ -1,12 +1,15 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-

+#This code is modified to run in Kaggle
+
 import voting_lib.load_data as ld
 import voting_lib.voting_analysis as va
+import numpy as np
+import pandas as pd
+import os
+

-# Load data
-data = ld.load_uk_data().to_numpy()
-X = data[:,2:]

 # Train model
 grid_h = 30       # Grid height
@@ -15,7 +18,17 @@ radius = 3        # Neighbour radius
 step = 0.5
 ep = 100         # No of epochs

-model = va.train_model(X, grid_h, grid_w, radius, step, ep)

-# Predict and visualize output
-va.predict(model, data, grid_h, grid_w)
+main_directory = 'uk/csv'
+for dirname, _, filenames in os.walk(main_directory):
+        if dirname == main_directory: #to skip main directory path 
+            continue
+        else:
+            # Load data
+            data = ld.load_uk_data(dirname).to_numpy() 
+
+            X = data[:,2:]
+
+            model = va.train_model(X, grid_h, grid_w, radius, step, ep)
+            # Predict and visualize output
+            va.predict(model, data, grid_h, grid_w)
@@ -1,43 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-
-#This code is modified to run in Kaggle
-
-#import voting_lib.load_data as ld
-#import voting_lib.voting_analysis as va
-import numpy as np
-import pandas as pd
-import os
-
-
-
-# Train model
-grid_h = 30       # Grid height
-grid_w = 30       # Grid width
-radius = 3        # Neighbour radius
-step = 0.5
-ep = 100          # No of epochs
-
-#main directory path (should contain the different dataset directories); can be changed
-main_directory = '/kaggle/input'
-
-for dirname, _, filenames in os.walk(main_directory):
-        #print(os.path.join(dirname))
-        if dirname == main_directory: #to skip main directory path 
-            continue
-        else:
-            # Load data
-            #data = ld.load_uk_data().to_numpy()
-
-            #modify load_data.py --> load_uk_data() to load_uk_data(path)
-                                # --> Place path in directory -> for dirname, _, filenames in os.walk(path):
-            data = load_uk_data(dirname).to_numpy() 
-
-            X = data[:,2:]
-
-            #model = va.train_model(X, grid_h, grid_w, radius, step, ep)
-            model = train_model(X, grid_h, grid_w, radius, step, ep)
-
-            # Predict and visualize output
-            #va.predict(model, data, grid_h, grid_w)
-            predict(model, data, grid_h, grid_w)
@@ -59,11 +59,13 @@ def load_german_data():
    return data


-def load_uk_data():
+def load_uk_data(path):
    """
    Load German Parliament data
    return : Data with columns [Member, Party, vote_0, vote_1 etc]
    """  
+    #print directory path
+    print(path) 
    # Preprocess data
    vote_counter = -1
    data = pd.DataFrame()
@@ -75,7 +77,7 @@ def load_uk_data():
    column_to_filename = {}
    
    voting_features = {'Aye':0, 'Teller - Ayes':0, 'No':1, 'Teller - Noes':1, 'No Vote Recorded':2}
-    for dirname, _, filenames in os.walk('./uk/csv'):
+    for dirname, _, filenames in os.walk(path):
        for filename in filenames:
            vote_counter += 1
            
@@ -9,7 +9,6 @@ def get_compass_parties(year=2017, country='de'):
            data  = [[-3.5, -4],  [7, 6.5],   [-7, -6.5], [1, 2]]
            index =  ['BÜ90/GR', 'CDU/CSU', 'DIE LINKE.', 'SPD']
        elif year == 2005:
-            # TODO: add data for 2011
            data  = [[-1.5, -1.5],  [9.5, 8],     [-6, -2], [3, 3.5]]
            index =  [  'BÜ90/GR', 'CDU/CSU', 'DIE LINKE.',   'SPD']
        else: