Principal Component Analysis

Overview

PCA is a technique that uses quantitative record data to determine which directions in that data carry the most variation. It transforms the records into new columns called principal components, which are uncorrelated with one another and capture the variance of the data in decreasing order of importance. The leading components can therefore be used to reduce the dimensionality of the data to fewer columns than it started with while preserving most of the important structure.
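The idea can be sketched on toy data: PCA amounts to an eigendecomposition of the covariance matrix, and projecting onto the eigenvectors yields uncorrelated columns whose variances are the eigenvalues. This is a minimal illustration with synthetic data, not the project's pipeline:

```python
import numpy as np

# Toy data: two correlated columns, centered before decomposition
rng = np.random.default_rng(0)
x = rng.normal(size=200)
X = np.column_stack([x, 0.9 * x + 0.1 * rng.normal(size=200)])
X -= X.mean(axis=0)

# PCA by hand: eigendecomposition of the covariance matrix
cov = np.cov(X, rowvar=False)
eigvals, eigvecs = np.linalg.eigh(cov)   # returned in ascending order
order = np.argsort(eigvals)[::-1]        # re-sort descending by variance
eigvals, eigvecs = eigvals[order], eigvecs[:, order]

# Project onto the principal components; the new columns are uncorrelated
X_pca = X @ eigvecs
print(np.round(np.cov(X_pca, rowvar=False), 6))  # off-diagonals ~ 0
print(eigvals / eigvals.sum())                   # variance-explained ratios
```

The variance-explained ratios always sum to 1 when every component is kept, which is why dropping the trailing components loses only a small, quantifiable share of the variance.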

Data

First, the stackoverflow data is prepared for PCA by keeping only the quantitative columns. Then sklearn's StandardScaler() is used to normalize each column to zero mean and unit variance, so that no column dominates the covariance simply because of its scale.
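For reference, StandardScaler applies z = (x − mean) / std per column, using the population standard deviation (ddof=0). A quick sketch on a made-up column confirms the equivalence:

```python
import numpy as np
from sklearn.preprocessing import StandardScaler

# A single made-up column standing in for e.g. YearsCode
col = np.array([[7.0], [12.0], [19.0], [2.0]])

# StandardScaler computes z = (x - mean) / std with ddof=0
scaled = StandardScaler().fit_transform(col)
manual = (col - col.mean()) / col.std(ddof=0)

print(scaled.ravel())
```

After scaling, each column has mean 0 and standard deviation 1, which is why the covariance matrix of the scaled data below is also its correlation matrix.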

code_quant.py

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler


def get_quant_cols(df):
    # Drop the categorical/label columns, keeping only quantitative ones
    df2 = df.drop(columns=["Age", "EdLevel", "Gender", "MentalHealth",
                           "MainBranch", "Country", "Employed"])
    return df2


def get_scaled_df(df, col_tail="S"):
    # Standardize each column to zero mean and unit variance
    scaler = StandardScaler()
    scaled = scaler.fit_transform(df)
    df_scaled = pd.DataFrame({df.columns[j] + col_tail: scaled[:, j]
                              for j in range(scaled.shape[1])})
    return df_scaled


if __name__ == "__main__":
    datafile = "../../dataprep/stackoverflow_clean.csv"
    df = pd.read_csv(datafile, index_col=0)
    quant_df = get_quant_cols(df)
    print(quant_df.describe())
    scaled_df = get_scaled_df(quant_df)
    print(scaled_df)
    scaled_df.to_csv("stackoverflow_quant.csv")
```

code_quant.py output

```
          YearsCode  YearsCodePro  PreviousSalary  ComputerSkills
count  67265.000000  67265.000000    67265.000000    67265.000000
mean      14.014004      8.965093    67570.164484       13.644362
std        9.126883      7.698038    49438.578398        7.012238
min        0.000000      0.000000        1.000000        0.000000
25%        7.000000      3.000000    28584.000000        9.000000
50%       12.000000      7.000000    57336.000000       13.000000
75%       19.000000     12.000000    95541.000000       17.000000
max       50.000000     42.000000   224000.000000      107.000000

       YearsCodeS  YearsCodeProS  PreviousSalaryS  ComputerSkillsS
0       -0.768505      -0.644986        -0.324004        -1.375372
1       -0.220669      -0.515082        -0.426556        -0.234501
2        0.108033      -0.385178         0.196606        -0.947545
3       -0.549371      -0.385178        -0.433575        -0.091892
4       -0.549371      -0.904795        -0.579616        -1.232763
...           ...            ...              ...              ...
67260   -0.768505      -0.904795        -0.536269        -0.091892
67261    0.765436       0.913864         0.959376        -0.377109
67262   -1.097207      -0.774891        -0.199242        -0.234501
67263   -0.987640      -1.034699         0.049149         0.193326
67264   -0.439803      -0.774891         0.158053        -1.660589

[67265 rows x 4 columns]
```

The prepared quantitative data can be found here: stackoverflow_quant.csv.

Code

code_pca.py

```python
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt


def get_pca_data(df, n=3):
    pca = PCA(n_components=n)
    X_pca = pca.fit_transform(df.to_numpy())
    eigenvalues = pca.explained_variance_
    eigenvectors = pca.components_
    explained_ratio = pca.explained_variance_ratio_
    for i, (eig, vec, ratio) in enumerate(
            zip(eigenvalues, eigenvectors, explained_ratio)):
        print(f"PC{i+1}:")
        print("  Eigenvalue: %.4f" % eig)
        print("  Eigenvector:", np.round(vec, 4))
        print("  Variance Explained: %.2f %%" % (ratio * 100))
    print("Total Variance Explained: %.2f %%" % sum(explained_ratio * 100))
    return pca, X_pca


def plot_feature_importance(feature_names, pca, output_png="output.png"):
    # Each component's loadings, normalized so their absolute values sum to 1
    n_vecs = len(pca.components_)
    relative_importance = []
    for pc in pca.components_:
        relative_importance.append(np.abs(pc) / np.sum(np.abs(pc)))
    relative_importance = np.array(relative_importance)
    plt.figure(figsize=(8, 6))
    bottom = np.zeros(n_vecs)
    for i, feature in enumerate(feature_names):
        plt.bar([f"PC{j+1}" for j in range(n_vecs)],
                relative_importance[:, i], bottom=bottom, label=feature)
        bottom += relative_importance[:, i]
    plt.ylabel("Relative Importance")
    plt.title("Stacked Relative Feature Importance by Principal Component")
    plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
    plt.tight_layout()
    plt.savefig(output_png)


def plot2d(X, output_png="output.png"):
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111)
    pc1, pc2 = X[:, 0], X[:, 1]
    # cmap is only meaningful with a color array, so it is omitted here
    ax.scatter(pc1, pc2, marker=".")
    plt.title("Transformed Data (PCA n_components=2)")
    ax.set_xlabel("pc1")
    ax.set_ylabel("pc2")
    ax.set_xlim(min(pc1), max(pc1))
    ax.set_ylim(min(pc2), max(pc2))
    plt.savefig(output_png)


def plot3d(X, output_png="output.png"):
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, projection="3d")
    pc1, pc2, pc3 = X[:, 0], X[:, 1], X[:, 2]
    ax.scatter(pc1, pc2, pc3, marker=".")
    plt.title("Transformed Data (PCA n_components=3)")
    ax.set_xlabel("pc1")
    ax.set_ylabel("pc2")
    ax.set_zlabel("pc3")
    ax.set_xlim(min(pc1), max(pc1))
    ax.set_ylim(min(pc2), max(pc2))
    ax.set_zlim(min(pc3), max(pc3))
    plt.savefig(output_png)


if __name__ == "__main__":
    datafile = "stackoverflow_quant.csv"
    df = pd.read_csv(datafile, index_col=0)
    print("covar:")
    print(df.cov())
    print("performing pca with n=2")
    pca, X_pca = get_pca_data(df, n=2)
    print()
    print("performing pca with n=3")
    pca, X_pca = get_pca_data(df, n=3)
    print("plotting feature importance")
    plot_feature_importance(df.columns, pca, "pca_feature_importance.png")
    plot2d(X_pca, "pca_2.png")
    plot3d(X_pca, "pca_3.png")
    X_pca_df = pd.DataFrame({"PC%d" % (j + 1): X_pca[:, j]
                             for j in range(X_pca.shape[1])})
    X_pca_df.to_csv("stackoverflow_pca.csv")
```

code_pca.py output

```
covar:
                 YearsCodeS  YearsCodeProS  PreviousSalaryS  ComputerSkillsS
YearsCodeS         1.000015       0.903143         0.392955        -0.019983
YearsCodeProS      0.903143       1.000015         0.397412        -0.014316
PreviousSalaryS    0.392955       0.397412         1.000015         0.025314
ComputerSkillsS   -0.019983      -0.014316         0.025314         1.000015

performing pca with n=2
PC1:
  Eigenvalue: 2.1702
  Eigenvector: [ 0.6376  0.6385  0.4308 -0.0094]
  Variance Explained: 54.25 %
PC2:
  Eigenvalue: 1.0040
  Eigenvector: [-0.0346 -0.0278  0.1141  0.9925]
  Variance Explained: 25.10 %
Total Variance Explained: 79.35 %

performing pca with n=3
PC1:
  Eigenvalue: 2.1702
  Eigenvector: [ 0.6376  0.6385  0.4308 -0.0094]
  Variance Explained: 54.25 %
PC2:
  Eigenvalue: 1.0040
  Eigenvector: [-0.0346 -0.0278  0.1141  0.9925]
  Variance Explained: 25.10 %
PC3:
  Eigenvalue: 0.7290
  Eigenvector: [-0.3057 -0.3005  0.8952 -0.122 ]
  Variance Explained: 18.22 %
Total Variance Explained: 97.58 %
plotting feature importance
```

The final transformed PCA data can be found here: stackoverflow_pca.csv.
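One way to see what the kept components preserve is to invert the projection and measure the reconstruction error, which equals the variance in the dropped component. This sketch uses synthetic four-column data with one highly correlated pair as a stand-in for the scaled quantitative data, not the actual CSV:

```python
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(1)
# Synthetic stand-in: four columns, two of them highly correlated
# (analogous to YearsCode / YearsCodePro)
base = rng.normal(size=1000)
X = np.column_stack([
    base,
    base + 0.2 * rng.normal(size=1000),
    rng.normal(size=1000),
    rng.normal(size=1000),
])

pca = PCA(n_components=3).fit(X)
X_back = pca.inverse_transform(pca.transform(X))

# Mean squared reconstruction error: the variance lost by dropping PC4
err = np.mean((X - X_back) ** 2)
print("explained:", pca.explained_variance_ratio_.sum())
print("mean squared reconstruction error:", err)
```

Because the correlated pair collapses into one component, three components reconstruct the four columns almost exactly, mirroring the 97.6% figure above.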

Results

Looking at PC1, which accounts for 54% of the variance in the data, the feature importance plot shows that it weighs YearsCode and YearsCodePro almost equally. In fact, the first two entries are approximately equal in every eigenvector. This makes sense because the two should be highly correlated: if someone has been coding professionally for a long time, they have been coding overall for even longer, so knowing both adds little information over knowing one. The data bears this out: the covariance between these standardized variables is about 0.90, which, since the columns are standardized, is also their correlation. This raises the question of whether the difference between when someone started coding in general and when they started coding professionally would be a useful derived feature.
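That derived feature could be sketched as follows. The column name YearsBeforePro is hypothetical, and a small inline frame stands in for the real cleaned CSV:

```python
import pandas as pd

# Hypothetical derived feature: years spent coding before going
# professional, using the dataset's actual column names on made-up rows
df = pd.DataFrame({
    "YearsCode":    [7, 12, 19, 4, 25],
    "YearsCodePro": [3, 7, 12, 1, 20],
})
df["YearsBeforePro"] = df["YearsCode"] - df["YearsCodePro"]

# The derived column is much less tied to YearsCodePro than YearsCode is
print(df.corr().round(3))
```

If the same pattern holds on the full data, replacing one of the original pair with this difference would feed PCA two less redundant inputs.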

Once the 2nd principal component is added, which mainly comprises of the variable of the number of listed ComputerSkills, then 79.3% of the variance is explained. Adding the 3rd component brings the total explained variance up to 97.6%. Only 3 dimensions is necessary to capture over 95% of the data which makes sense because the first 2 features are very highly correlated.