diff --git a/VotingAnalysis.py b/VotingAnalysis.py index b7130ee..55fa030 100644 --- a/VotingAnalysis.py +++ b/VotingAnalysis.py @@ -1 +1,210 @@ -# %% [markdown] # # **Loading German Parliament Votes** # %% [code] import os import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) from itertools import product import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D from neupy import algorithms, utils def load_german_data(): title_file = "filename_to_titles.csv" vote_counter = -1 data = pd.DataFrame() name_column = 'Bezeichnung' vote_column_to_title = {} voting_features = ['ja', 'nein', 'Enthaltung', 'ungültig'] for dirname, _, filenames in os.walk('./de/csv'): for filename in filenames: if filename != title_file: vote_counter += 1 df = pd.read_csv(os.path.join(dirname, filename)) # Give each voting behaviour type an identifier from 0 to len(voting_features) - 1 for i, feature in enumerate(voting_features): df[feature] *= i vote_column_name = f'vote_{vote_counter}' # Map column name of vote to filename -> allows retrieving what the vote was about vote_column_to_title[vote_column_name] = filename # add feature for the vote df[vote_column_name] = df[voting_features].sum(axis=1) if data.empty: # if first file that is loaded set data equal to data from first file data = df[[name_column, vote_column_name]] else: # merge data with already loaded data data = data.merge(df[[name_column, vote_column_name]], on=name_column) print(vote_column_to_title) print(data) return data # %% [markdown] # # **Loading UK Parliament Votes** # %% [code] def load_uk_data(): # Preprocess data vote_counter = -1 data = pd.DataFrame() name_column = 'Member' vote_column = 'Vote' column_to_filename = {} voting_features = {'Aye':0, 'Teller - Ayes':0, 'No':1, 'Teller - Noes':1, 'No Vote Recorded':2} for dirname, _, filenames in os.walk('./uk/csv'): for filename in filenames: vote_counter += 1 print(os.path.join(dirname, filename)) # Read title rows title_df = pd.read_csv(os.path.join(dirname, filename),nrows=(3),skip_blank_lines=True,header=None) # Read data rows df = pd.read_csv(os.path.join(dirname, filename),skiprows=(10)) # Give each voting behaviour type an identifier from 0 to len(voting_features) - 1 df[vote_column].replace(voting_features, inplace=True) #Replace the vote column name vote_column_name = f'vote_{vote_counter}' df=df.rename(columns={vote_column:vote_column_name}) # Map column name of vote to title -> allows retrieving what the vote was about column_to_filename[vote_column_name] = title_df.iat[2,0] if data.empty: # if first file that is loaded set data equal to data from first file data = df[[name_column, vote_column_name]] else: # merge data with already loaded data data = data.merge(df[[name_column, vote_column_name]], on=name_column) print(column_to_filename) print(data) return data # %% [markdown] # # **Heatmap** # %% [code] def iter_neighbours(weights, hexagon=False): _, grid_height, grid_width = weights.shape hexagon_even_actions = ((-1, 0), (0, -1), (1, 0), (0, 1), (1, 1), (-1, 1)) hexagon_odd_actions = ((-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (1, -1)) rectangle_actions = ((-1, 0), (0, -1), (1, 0), (0, 1)) for neuron_x, neuron_y in product(range(grid_height), range(grid_width)): neighbours = [] if hexagon and neuron_x % 2 == 1: actions = hexagon_even_actions elif hexagon: actions = hexagon_odd_actions else: actions = rectangle_actions for shift_x, shift_y in actions: neigbour_x = neuron_x + shift_x neigbour_y = neuron_y + shift_y if 0 <= neigbour_x < grid_height and 0 <= neigbour_y < grid_width: neighbours.append((neigbour_x, neigbour_y)) yield (neuron_x, neuron_y), neighbours def compute_heatmap(weight): heatmap = np.zeros((GRID_HEIGHT, GRID_WIDTH)) for (neuron_x, neuron_y), neighbours in iter_neighbours(weight): total_distance = 0 for (neigbour_x, neigbour_y) in neighbours: neuron_vec = weight[:, neuron_x, neuron_y] neigbour_vec = weight[:, neigbour_x, neigbour_y] distance = np.linalg.norm(neuron_vec - neigbour_vec) total_distance += distance avg_distance = total_distance / len(neighbours) heatmap[neuron_x, neuron_y] = avg_distance return heatmap def compute_heatmap_expanded(weight): heatmap = np.zeros((2 * GRID_HEIGHT - 1, 2 * GRID_WIDTH - 1)) for (neuron_x, neuron_y), neighbours in iter_neighbours(weight): for (neigbour_x, neigbour_y) in neighbours: neuron_vec = weight[:, neuron_x, neuron_y] neigbour_vec = weight[:, neigbour_x, neigbour_y] distance = np.linalg.norm(neuron_vec - neigbour_vec) if neuron_x == neigbour_x and (neigbour_y - neuron_y) == 1: heatmap[2 * neuron_x, 2 * neuron_y + 1] = distance elif (neigbour_x - neuron_x) == 1 and neigbour_y == neuron_y: heatmap[2 * neuron_x + 1, 2 * neuron_y] = distance return heatmap # %% [markdown] # # **Simple SOFM for German** # %% [markdown] # **Training SOFM** # %% [code] utils.reproducible() plt.style.use('ggplot') # Load data data = load_german_data().to_numpy() X = data[:,1:] print(X) inp = X.shape[1] # No of features (bill count) g = 150 # Grid size rad = 2 # Neighbour radius ep = 500 # No of epochs # Create SOFM sofmnet = algorithms.SOFM( n_inputs=inp, step=0.5, show_epoch=100, shuffle_data=True, verbose=True, learning_radius=rad, features_grid=(g,g), ) sofmnet.train(X, epochs=ep) # %% [markdown] # **Visualizing Output** # %% [code] GRID_HEIGHT = g GRID_WIDTH = g plt.figure() weight = sofmnet.weight.reshape((sofmnet.n_inputs, g, g)) heatmap = compute_heatmap(weight) plt.imshow(heatmap, cmap='Greys_r', interpolation='nearest') plt.axis('off') plt.colorbar() plt.show() # %% [markdown] # # **Simple SOFM for UK** # %% [markdown] # **Training SOFM** # %% [code] utils.reproducible() plt.style.use('ggplot') # Load data data = load_uk_data().to_numpy() X = data[:,1:] print(X) inp = X.shape[1] # No of features (bill count) g = 30 # Grid size rad = 3 # Neighbour radius ep = 100 # No of epochs # Create SOFM sofmnet = algorithms.SOFM( n_inputs=inp, step=0.5, show_epoch=20, shuffle_data=True, verbose=True, learning_radius=rad, features_grid=(g,g), ) sofmnet.train(X, epochs=ep) # %% [markdown] # **Visualizing Output** # %% [code] fig = plt.figure() ax = plt.axes(projection='3d') ax.scatter3D(*sofmnet.weight, label='SOFM Weights') ax.scatter3D(*X.T, label='Input'); ax.set_xlabel('vote_0') ax.set_ylabel('vote_1') ax.set_zlabel('vote_2') ax.legend() \ No newline at end of file +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import os +import numpy as np +import pandas as pd +from itertools import product +import matplotlib.pyplot as plt +from mpl_toolkits.mplot3d import Axes3D +from neupy import algorithms + +# Loading German Parliament Votes +def load_german_data(): + title_file = "filename_to_titles.csv" + vote_counter = -1 + data = pd.DataFrame() + + name_column = 'Bezeichnung' + + vote_column_to_title = {} + + voting_features = ['ja', 'nein', 'Enthaltung', 'ungültig'] + for dirname, _, filenames in os.walk('./de/csv'): + for filename in filenames: + if filename != title_file: + vote_counter += 1 + df = pd.read_csv(os.path.join(dirname, filename)) + + # Give each voting behaviour type an identifier from 0 to len(voting_features) - 1 + for i, feature in enumerate(voting_features): + df[feature] *= i + vote_column_name = f'vote_{vote_counter}' + # Map column name of vote to filename -> allows retrieving what the vote was about + vote_column_to_title[vote_column_name] = filename + + # add feature for the vote + df[vote_column_name] = df[voting_features].sum(axis=1) + + if data.empty: + # if first file that is loaded set data equal to data from first file + data = df[[name_column, vote_column_name]] + else: + # merge data with already loaded data + data = data.merge(df[[name_column, vote_column_name]], on=name_column) + + print(vote_column_to_title) + print(data) + return data + +# Loading UK Parliament Votes +def load_uk_data(): + # Preprocess data + vote_counter = -1 + data = pd.DataFrame() + + name_column = 'Member' + vote_column = 'Vote' + + column_to_filename = {} + + voting_features = {'Aye':0, 'Teller - Ayes':0, 'No':1, 'Teller - Noes':1, 'No Vote Recorded':2} + for dirname, _, filenames in os.walk('./uk/csv'): + for filename in filenames: + vote_counter += 1 + print(os.path.join(dirname, filename)) + + # Read title rows + title_df = pd.read_csv(os.path.join(dirname, filename),nrows=(3),skip_blank_lines=True,header=None) + + # Read data rows + df = pd.read_csv(os.path.join(dirname, filename),skiprows=(10)) + + # Give each voting behaviour type an identifier from 0 to len(voting_features) - 1 + df[vote_column].replace(voting_features, inplace=True) + + #Replace the vote column name + vote_column_name = f'vote_{vote_counter}' + df=df.rename(columns={vote_column:vote_column_name}) + + # Map column name of vote to title -> allows retrieving what the vote was about + column_to_filename[vote_column_name] = title_df.iat[2,0] + + if data.empty: + # if first file that is loaded set data equal to data from first file + data = df[[name_column, vote_column_name]] + else: + # merge data with already loaded data + data = data.merge(df[[name_column, vote_column_name]], on=name_column) + + print(column_to_filename) + print(data) + return data + +# Heatmap +def iter_neighbours(weights, hexagon=False): + _, grid_height, grid_width = weights.shape + + hexagon_even_actions = ((-1, 0), (0, -1), (1, 0), (0, 1), (1, 1), (-1, 1)) + hexagon_odd_actions = ((-1, 0), (0, -1), (1, 0), (0, 1), (-1, -1), (1, -1)) + rectangle_actions = ((-1, 0), (0, -1), (1, 0), (0, 1)) + + for neuron_x, neuron_y in product(range(grid_height), range(grid_width)): + neighbours = [] + + if hexagon and neuron_x % 2 == 1: + actions = hexagon_even_actions + elif hexagon: + actions = hexagon_odd_actions + else: + actions = rectangle_actions + + for shift_x, shift_y in actions: + neigbour_x = neuron_x + shift_x + neigbour_y = neuron_y + shift_y + + if 0 <= neigbour_x < grid_height and 0 <= neigbour_y < grid_width: + neighbours.append((neigbour_x, neigbour_y)) + + yield (neuron_x, neuron_y), neighbours + +def compute_heatmap(weight, grid_height, grid_width): + heatmap = np.zeros((grid_height, grid_width)) + for (neuron_x, neuron_y), neighbours in iter_neighbours(weight): + total_distance = 0 + + for (neigbour_x, neigbour_y) in neighbours: + neuron_vec = weight[:, neuron_x, neuron_y] + neigbour_vec = weight[:, neigbour_x, neigbour_y] + + distance = np.linalg.norm(neuron_vec - neigbour_vec) + total_distance += distance + + avg_distance = total_distance / len(neighbours) + heatmap[neuron_x, neuron_y] = avg_distance + + return heatmap + +#Simple SOFM for German +plt.style.use('ggplot') + +# Load data +data = load_german_data().to_numpy() +X = data[:,1:] +print(X) + +inp = X.shape[1] # No of features (bill count) +h = 150 # Grid height +w = 150 # Grid width +rad = 2 # Neighbour radius +ep = 50 # No of epochs + +# Create SOFM +sofmnet = algorithms.SOFM( + n_inputs=inp, + step=0.5, + show_epoch=100, + shuffle_data=True, + verbose=True, + learning_radius=rad, + features_grid=(h,w), +) + +sofmnet.train(X, epochs=ep) + +#Visualizing Output +plt.figure() +weight = sofmnet.weight.reshape((sofmnet.n_inputs, h, w)) +heatmap = compute_heatmap(weight, h, w) +plt.imshow(heatmap, cmap='Greys_r', interpolation='nearest') +plt.axis('off') +plt.colorbar() +plt.show() + +#Simple SOFM for UK +plt.style.use('ggplot') + +# Load data +data = load_uk_data().to_numpy() +X = data[:,1:] +print(X) + +inp = X.shape[1] # No of features (bill count) +h = 30 # Grid height +w = 30 # Grid width +rad = 3 # Neighbour radius +ep = 10 # No of epochs + +# Create SOFM +sofmnet = algorithms.SOFM( + n_inputs=inp, + step=0.5, + show_epoch=20, + shuffle_data=True, + verbose=True, + learning_radius=rad, + features_grid=(h,w), +) + +sofmnet.train(X, epochs=ep) + +#Visualizing Output +fig = plt.figure() +ax = plt.axes(projection='3d') +ax.scatter3D(*sofmnet.weight, label='SOFM Weights') +ax.scatter3D(*X.T, label='Input'); + +ax.set_xlabel('vote_0') +ax.set_ylabel('vote_1') +ax.set_zlabel('vote_2') +ax.legend() \ No newline at end of file