Clustering is an unsupervised method: the data must be quantitative and unlabeled to use any of these techniques. A distance metric must also be selected so that the distance between points within a group can be optimized. The best-known clustering method is k-means, which partitions the vectors into k groups using k centroids, each centroid being the mean of the points nearest to it. Hierarchical clustering uses pairwise distance comparisons to build up compositions of clusters, either merging them (agglomerative) or dividing them (divisive) to form dendrograms. DBSCAN is a density-based technique that divides a cluster into core samples and nearby samples; what the points in a cluster have in common is that each is relatively close to some other points in the cluster.
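To make the distance-metric choice concrete, here is a minimal sketch comparing two common metrics on a pair of hypothetical 3-D points (the values are illustrative, not from the survey data):

```python
import numpy as np

# Two illustrative 3-D points (hypothetical values, not from the dataset).
a = np.array([0.0, 1.0, 2.0])
b = np.array([3.0, 1.0, -2.0])

# Euclidean (L2) distance: square root of the sum of squared differences.
euclidean = np.sqrt(np.sum((a - b) ** 2))

# Manhattan (L1) distance: sum of absolute differences.
manhattan = np.sum(np.abs(a - b))

print(euclidean)  # 5.0
print(manhattan)  # 7.0
```

The clustering runs below all use the Euclidean metric, which is the scikit-learn default.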
After PCA was performed on the data, over 95% of the variance was retained in the first 3 components, so clustering can be performed on this reduced-dimensionality data. Clusters can then be transformed back into the original variables using the eigenvectors. The transformed PCA data can be found here: stackoverflow_pca.csv.
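The "transform back using the eigenvectors" step can be sketched as follows. This uses synthetic data as a stand-in for the standardized quantitative columns (the real eigenvectors come from the PCA fit on stackoverflow_quant.csv); the point is that multiplying PC-space coordinates by the eigenvector matrix and adding back the mean recovers the original variables:

```python
import numpy as np
from sklearn.decomposition import PCA

# Synthetic stand-in for the standardized quantitative data.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))

pca = PCA(n_components=3)
scores = pca.fit_transform(X)  # rows in PC space, like the PCA csv

# Map PC-space points (e.g. cluster centroids) back to the original
# variables: multiply by the eigenvector matrix and add back the mean.
restored = scores @ pca.components_ + pca.mean_

# scikit-learn's built-in equivalent:
restored2 = pca.inverse_transform(scores)
print(np.allclose(restored, restored2))  # True
```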
code_showcols.py
import pandas as pd

if __name__ == "__main__":
    datafile = "../pca/stackoverflow_quant.csv"
    quant_df = pd.read_csv(datafile, index_col=0)
    print(quant_df.head(5))
    print(quant_df.describe())
    print()
    datafile = "../pca/stackoverflow_pca.csv"
    pca_df = pd.read_csv(datafile, index_col=0)
    print(pca_df.head(5))
    print(pca_df.describe())
    pca_df.to_csv("stackoverflow_pca.csv")
code_showcols.py output
   YearsCodeS  YearsCodeProS  PreviousSalaryS  ComputerSkillsS
0   -0.768505      -0.644986        -0.324004        -1.375372
1   -0.220669      -0.515082        -0.426556        -0.234501
2    0.108033      -0.385178         0.196606        -0.947545
3   -0.549371      -0.385178        -0.433575        -0.091892
4   -0.549371      -0.904795        -0.579616        -1.232763

         YearsCodeS  YearsCodeProS  PreviousSalaryS  ComputerSkillsS
count  6.726500e+04   6.726500e+04     6.726500e+04     6.726500e+04
mean   4.901387e-17   7.690108e-17    -4.056321e-17    -8.450668e-18
std    1.000007e+00   1.000007e+00     1.000007e+00     1.000007e+00
min   -1.535476e+00  -1.164603e+00    -1.366740e+00    -1.945807e+00
25%   -7.685051e-01  -7.748907e-01    -7.885836e-01    -6.623273e-01
50%   -2.206689e-01  -2.552739e-01    -2.070092e-01    -9.189173e-02
75%    5.463018e-01   3.942471e-01     5.657736e-01     4.785438e-01
max    3.942886e+00   4.291373e+00     3.164148e+00     1.331334e+01

        PC1       PC2       PC3
0 -1.028562 -1.357438  0.306498
1 -0.651167 -0.259434 -0.131018
2 -0.083483 -0.910989  0.374335
3 -0.782174 -0.110941 -0.093256
4 -1.166182 -1.245421  0.071350

                PC1           PC2           PC3
count  6.726500e+04  6.726500e+04  67265.000000
mean   5.070401e-17 -1.014080e-17      0.000000
std    1.473160e+00  1.002010e+00      0.853811
min   -2.401924e+00 -2.104107e+00     -4.080911
25%   -1.116483e+00 -7.091379e-01     -0.544464
50%   -3.483036e-01 -1.297791e-01     -0.135902
75%    7.909193e-01  5.435465e-01      0.456801
max    6.260588e+00  1.342557e+01      3.466914
First, k-means was performed on the data, and the silhouette method was used to compare different values of k.
code_kmeans.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def plot_scatter(df, labels, title="scatter", output_png="output.png"):
    fig = plt.figure(figsize=(8, 6))
    mins = list(min(df["PC%d" % (j+1)]) for j in range(3))
    maxs = list(max(df["PC%d" % (j+1)]) for j in range(3))
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(df["PC1"], df["PC2"], df["PC3"], c=labels, marker=".", cmap="Paired")
    plt.title(title)
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_zlabel("PC3")
    ax.set_xlim(mins[0]-.1, maxs[0]+.1)
    ax.set_ylim(mins[1]-.1, maxs[1]+.1)
    ax.set_zlim(mins[2]-.1, maxs[2]+.1)
    plt.savefig(output_png)

def plot_silhouettes(silhouettes, output_png="output.png"):
    plt.figure(figsize=(8, 6))
    k = list(sorted(silhouettes.keys()))
    s = list(silhouettes[j] for j in k)
    plt.plot(k, s, marker="x")
    plt.xlabel("k")
    plt.ylabel("silhouette score")
    plt.title("K-means K vs Silhouette Score")
    plt.savefig(output_png)

if __name__ == "__main__":
    df = pd.read_csv("stackoverflow_pca.csv", index_col=0)
    X = df.to_numpy()
    silhouettes = dict()
    for k in range(2, 6):
        kmeaner = KMeans(n_clusters=k, random_state=10)
        labels = kmeaner.fit_predict(X)
        # score every 20th point to keep the silhouette computation fast
        silhouette = silhouette_score(X[::20], labels[::20])
        silhouettes[k] = silhouette
        print("k = %d" % k)
        print(" silhouette: %.3f" % silhouette)
        plot_scatter(df, labels, "kmeans k=%d" % k, "kmeans-k%d.png" % k)
        print()
    plot_silhouettes(silhouettes, "kmeans-silhouette.png")
code_kmeans.py output
k = 2
 silhouette: 0.399
k = 3
 silhouette: 0.303
k = 4
 silhouette: 0.309
k = 5
 silhouette: 0.304
Agglomerative clustering was used to perform a type of hierarchical clustering; the clusters and dendrograms can be viewed below. Running on all 67,000 rows would have required computing an n×n distance matrix taking roughly 16 GB of memory, so a random sample of 1/20 of the data was used instead.
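The memory figure can be checked with a back-of-the-envelope calculation, assuming 4-byte (float32) entries for the full pairwise distance matrix:

```python
# Estimate the memory for a full n x n pairwise distance matrix,
# assuming 4-byte float32 entries; 67265 is the full row count.
n = 67265
bytes_needed = n * n * 4
gib = bytes_needed / 2**30
print(round(gib, 1))  # roughly 17 GiB for the full dataset

# A 1/20 random sample shrinks this quadratically:
sample_mib = (n // 20) ** 2 * 4 / 2**20
print(round(sample_mib, 1))  # only ~43 MiB
```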
code_hierarchy.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import dendrogram

def plot_scatter(X, labels, title="scatter", output_png="output.png"):
    fig = plt.figure(figsize=(8, 6))
    mins = list(min(X[:, j]) for j in range(3))
    maxs = list(max(X[:, j]) for j in range(3))
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=labels, marker=".", cmap="Paired")
    plt.title(title)
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_zlabel("PC3")
    ax.set_xlim(mins[0]-.1, maxs[0]+.1)
    ax.set_ylim(mins[1]-.1, maxs[1]+.1)
    ax.set_zlim(mins[2]-.1, maxs[2]+.1)
    plt.savefig(output_png)

def plot_dendrogram(model, title, output_png="output.png"):
    # https://scikit-learn.org/stable/auto_examples/cluster/plot_agglomerative_dendrogram.html
    # count the number of leaf samples under each merge in the tree
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count
    linkage_matrix = np.column_stack([model.children_, model.distances_, counts]).astype(float)
    plt.figure(figsize=(8, 6))
    plt.title(title)
    dendrogram(linkage_matrix)
    plt.savefig(output_png)

def plot_silhouettes(silhouettes, output_png="output.png"):
    plt.figure(figsize=(8, 6))
    k = list(sorted(silhouettes.keys()))
    s = list(silhouettes[j] for j in k)
    plt.plot(k, s, marker="x")
    plt.xlabel("k")
    plt.ylabel("silhouette score")
    plt.title("Agglomerative K vs Silhouette Score")
    plt.savefig(output_png)

if __name__ == "__main__":
    df = pd.read_csv("stackoverflow_pca.csv", index_col=0)
    X = df.to_numpy()
    # random 1/20 sample: the full n x n distance matrix does not fit in memory
    X_sample = X[np.random.choice(a=np.arange(X.shape[0]), size=X.shape[0]//20, replace=False)]
    silhouettes = dict()
    for k in range(2, 6):
        agglomerator = AgglomerativeClustering(n_clusters=k, compute_distances=True)
        model = agglomerator.fit(X_sample)
        labels = model.labels_
        silhouette = silhouette_score(X_sample, labels)
        silhouettes[k] = silhouette
        print("k = %d" % k)
        print(" silhouette: %.3f" % silhouette)
        plot_scatter(X_sample, labels, "agglomerative k=%d" % k, "agglomerative-k%d.png" % k)
        plot_dendrogram(model, "agglomerative dendrogram k=%d" % k, "agglomerative-dend-k%d.png" % k)
        print()
    plot_silhouettes(silhouettes, "agglomerative-silhouette.png")
code_hierarchy.py output
k = 2
 silhouette: 0.364
k = 3
 silhouette: 0.267
k = 4
 silhouette: 0.246
k = 5
 silhouette: 0.193
DBSCAN was performed on the 3D PCA data using a random sample of 1/20 of the rows as well.
code_dbscan.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

def plot_scatter(X, labels, title="scatter", output_png="output.png"):
    fig = plt.figure(figsize=(8, 6))
    mins = list(min(X[:, j]) for j in range(3))
    maxs = list(max(X[:, j]) for j in range(3))
    ax = fig.add_subplot(111, projection="3d")
    ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=labels, marker=".", cmap="Paired")
    plt.title(title)
    ax.set_xlabel("PC1")
    ax.set_ylabel("PC2")
    ax.set_zlabel("PC3")
    ax.set_xlim(mins[0]-.1, maxs[0]+.1)
    ax.set_ylim(mins[1]-.1, maxs[1]+.1)
    ax.set_zlim(mins[2]-.1, maxs[2]+.1)
    plt.savefig(output_png)

def plot_silhouettes(silhouettes, output_png="output.png"):
    plt.figure(figsize=(8, 6))
    k = list(sorted(silhouettes.keys()))
    s = list(silhouettes[j] for j in k)
    plt.plot(k, s, marker="x")
    plt.xlabel("k")
    plt.ylabel("silhouette score")
    plt.title("DBSCAN K vs Silhouette Score")
    plt.savefig(output_png)

if __name__ == "__main__":
    df = pd.read_csv("stackoverflow_pca.csv", index_col=0)
    X = df.to_numpy()
    # random 1/20 sample, matching the agglomerative run
    X_sample = X[np.random.choice(a=np.arange(X.shape[0]), size=X.shape[0]//20, replace=False)]
    silhouettes = dict()
    for eps in [0.6, 0.5, 0.4, 0.3]:
        dbscanner = DBSCAN(eps=eps)
        model = dbscanner.fit(X_sample)
        labels = model.labels_
        # note: the noise label (-1) counts as one of the k label values here
        k = len(set(labels))
        silhouette = silhouette_score(X_sample, labels)
        silhouettes[k] = silhouette
        print("eps = %.3f -> k = %d" % (eps, k))
        print(" silhouette: %.3f" % silhouette)
        plot_scatter(X_sample, labels, "dbscan eps=%.3f k=%d" % (eps, k), "dbscan-eps%.3f.png" % eps)
        print()
    plot_silhouettes(silhouettes, "dbscan-silhouette.png")
code_dbscan.py output
eps = 0.600 -> k = 3
 silhouette: 0.341
eps = 0.500 -> k = 8
 silhouette: 0.134
eps = 0.400 -> k = 12
 silhouette: 0.076
eps = 0.300 -> k = 20
 silhouette: -0.153
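Rather than sweeping a fixed list of epsilon values, a common heuristic (not used in the run above) is the k-distance plot: sort every point's distance to its k-th nearest neighbor and look for an elbow in the curve. A minimal sketch on synthetic stand-in data (the real analysis would load stackoverflow_pca.csv instead):

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Synthetic stand-in for the 3-D PCA sample.
rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))

# Distance from each point to its k-th nearest neighbor, where
# k = min_samples - 1 (DBSCAN's default min_samples is 5).
k = 4
nn = NearestNeighbors(n_neighbors=k + 1).fit(X)
dists, _ = nn.kneighbors(X)          # column 0 is the point itself (distance 0)
kth = np.sort(dists[:, k])           # sorted k-th neighbor distances

# The elbow of this sorted curve is a common eps choice; here we just
# print a few quantiles as a numeric summary instead of plotting.
print(np.quantile(kth, [0.5, 0.9, 0.99]).round(2))
```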
K-means was ineffective: on repeated runs, the centroids appeared to divide the mass of points into arbitrary partitions inconsistently. Agglomerative clustering behaved similarly, with entirely different clusters forming for different random samples of points. The dendrogram, however, is generated deterministically once the algorithm has run, so any number of clusters can be chosen from it afterwards. Agglomerative clustering also had the limitation of only being runnable on a smaller subset rather than the entire dataset. The DBSCAN plots show that varying epsilon is ineffective at finding distinct clusters; the majority of points always end up with the same label due to the high concentration and relatively convex shape of the point cloud.
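The run-to-run inconsistency can be quantified with the adjusted Rand index, which measures agreement between two labelings (1.0 means identical partitions). This sketch fits k-means twice with different seeds on a synthetic single-blob dataset, a stand-in for a convex point cloud with no real cluster structure:

```python
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score

# One convex Gaussian blob: no true cluster structure to recover.
X, _ = make_blobs(n_samples=2000, centers=1, n_features=3, random_state=0)

# Fit k-means twice with different random seeds and compare the labelings.
labels_a = KMeans(n_clusters=3, n_init=10, random_state=1).fit_predict(X)
labels_b = KMeans(n_clusters=3, n_init=10, random_state=2).fit_predict(X)

# An adjusted Rand index near 1 means the two runs found the same
# partition; unstable clusterings score noticeably lower.
ari = adjusted_rand_score(labels_a, labels_b)
print(round(ari, 3))
```

The same comparison on the PCA data (or across random 1/20 samples for the agglomerative case) would put a number on the instability described above.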