import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from sklearn.cluster import KMeans
from matplotlib.animation import FuncAnimation
from IPython import display
# Generating random data for the illustration
np.random.seed(1)
points1 = np.random.randn(50, 2) + [4, 4]
points2 = np.random.randn(50, 2) + [-4, -4]
points3 = np.random.randn(50, 2) + [4, -4]
data = np.vstack([points1, points2, points3])

fig, ax = plt.subplots()
plt.title("Data")
scatter = ax.scatter(data[:, 0], data[:, 1], c='black')
centroids_plot, = ax.plot([], [], markeredgewidth=2, color='yellow', ls='', marker='*')
ax.grid(linestyle=':')
plt.show()
Clustering
Danya Merkulov based on the materials from Mikhail Belyaev and Maxim Panov.
Intuition with k-means
Problem
- In the scatter plot below, we can see three separate groups of data points and we would like to recover them using clustering.
- Think of "discovering" the class labels that we already take for granted in a classification task.
- Even if the groups are obvious in the data, it is hard to find them when the data lives in a high-dimensional space, which we can't visualize in a single histogram or scatterplot.
k-means algorithm
- Now we will use one of the simplest clustering algorithms, K-means.
- This is an iterative algorithm which searches for three cluster centers such that the distance from each point to its cluster center is minimized.
- The standard implementation of K-means uses the Euclidean distance.
Question: what would you expect the output to look like?
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython import display
from sklearn.metrics import accuracy_score
from itertools import permutations
# Generating random data for the illustration
np.random.seed(1)
points1 = np.random.randn(50, 2) + [4, 4]
points2 = np.random.randn(50, 2) + [-4, -4]
points3 = np.random.randn(50, 2) + [4, -4]
data = np.vstack([points1, points2, points3])
true_labels = np.array([0]*50 + [1]*50 + [2]*50)

fig, ax = plt.subplots()
ax.grid(linestyle=':')

def compute_accuracy(true_labels, kmeans_labels):
    perm = permutations([0, 1, 2])
    max_accuracy = -1
    for p in perm:
        temp_labels = np.array([p[i] for i in kmeans_labels])
        accuracy = accuracy_score(true_labels, temp_labels)
        if accuracy > max_accuracy:
            max_accuracy = accuracy
    return max_accuracy

# Initialize centroids
def initialize_centroids(points, k):
    return points[np.random.choice(points.shape[0], size=k, replace=False)]

# Closest centroid for each point
def closest_centroid(points, centroids):
    distances = np.sqrt(((points - centroids[:, np.newaxis]) ** 2).sum(axis=2))
    return np.argmin(distances, axis=0)

# Update the centroid locations
def update_centroids(points, closest, centroids):
    return np.array([points[closest == k].mean(axis=0) for k in range(centroids.shape[0])])

def voronoi_polygons(centroids, ax):
    xx, yy = np.meshgrid(np.linspace(ax.get_xlim()[0], ax.get_xlim()[1], 500),
                         np.linspace(ax.get_ylim()[0], ax.get_ylim()[1], 500))
    Z = np.argmin(((xx - centroids[:, 0][:, np.newaxis, np.newaxis]) ** 2 +
                   (yy - centroids[:, 1][:, np.newaxis, np.newaxis]) ** 2), axis=0)
    return Z

def animate(i):
    global data, centroids
    ax.clear()
    ax.grid(linestyle=':')

    if i == 0:
        scatter = ax.scatter(data[:, 0], data[:, 1], c='black')
        ax.set_title("Data")
    elif i == 1:
        centroids = initialize_centroids(data, 3)
        scatter = ax.scatter(data[:, 0], data[:, 1], c='black')
        ax.scatter(centroids[:, 0], centroids[:, 1], s=100, c=['red', 'green', 'blue'], marker='*')
        ax.set_title("Centroid initialization")
    elif i % 2 == 0:
        closest = closest_centroid(data, centroids)
        scatter = ax.scatter(data[:, 0], data[:, 1], c=closest, cmap='rainbow')
        Z = voronoi_polygons(centroids, ax)
        ax.imshow(Z, interpolation='nearest',
                  extent=(ax.get_xlim()[0], ax.get_xlim()[1], ax.get_ylim()[0], ax.get_ylim()[1]),
                  aspect='auto', origin='lower', alpha=0.3, cmap='rainbow')
        for idx, centroid in enumerate(centroids):
            ax.plot(centroid[0], centroid[1], markersize=10, color=plt.cm.rainbow(idx/2.5), markeredgecolor='black', ls='', marker='*')
        ax.set_title(f"Assigning points | Accuracy: {compute_accuracy(true_labels, closest)*100:.2f}%")
    else:
        closest = closest_centroid(data, centroids)
        centroids = update_centroids(data, closest, centroids)
        scatter = ax.scatter(data[:, 0], data[:, 1], c=closest, cmap='rainbow')
        Z = voronoi_polygons(centroids, ax)
        ax.imshow(Z, interpolation='nearest',
                  extent=(ax.get_xlim()[0], ax.get_xlim()[1], ax.get_ylim()[0], ax.get_ylim()[1]),
                  aspect='auto', origin='lower', alpha=0.3, cmap='rainbow')
        for idx, centroid in enumerate(centroids):
            ax.plot(centroid[0], centroid[1], markersize=10, color=plt.cm.rainbow(idx/2.5), markeredgecolor='black', ls='', marker='*')
        ax.set_title("Updating centroids")

    return scatter,

ani = FuncAnimation(fig, animate, frames=20, interval=2000, blit=True)

plt.close()
html = display.HTML(ani.to_html5_video())
display.display(html)
Algorithm
The \(k\)-means algorithm is a popular clustering method used to partition \(n\) data points into \(k\) clusters. Each data point belongs to the cluster with the closest mean.
Initialization: Randomly select \(k\) data points (or seed them in some other manner) to serve as the initial centroids.
Assignment: Assign each data point to the nearest centroid. The distance is typically computed using the Euclidean distance, though other metrics can be used. Mathematically, assign each data point \(x_i\) to the nearest centroid \(c_j\) using the formula: \[ s(i) = \text{argmin}_{j} \left\| x_i - c_j \right\|^2 \] where \(s(i)\) is the cluster to which data point \(x_i\) is assigned.
Update: Recompute the centroid of each cluster as the mean of all points currently assigned to that cluster. For each cluster \(j\): \[ c_j = \frac{1}{\left| S(j) \right|} \sum_{i \in S(j)} x_i \] where \(S(j)\) is the set of data points assigned to cluster \(j\).
Convergence: Repeat steps 2 and 3 until the centroids no longer change significantly or some other stopping criterion is met.
Objective Function:
The \(k\)-means algorithm aims to minimize the within-cluster sum of squares (WCSS), given by: \[ J = \sum_{j=1}^{k} \sum_{i \in S(j)} \left\| x_i - c_j \right\|^2 \] where:
- \(J\) is the objective function value (WCSS),
- \(k\) is the number of clusters,
- \(x_i\) is a data point,
- \(c_j\) is the centroid of cluster \(j\).
The goal of the \(k\)-means algorithm is to find the set of centroids \(\{c_1, c_2, ..., c_k\}\) that minimize \(J\).
Notes:
- The \(k\)-means algorithm does not guarantee a global optimum solution. The final result might depend on the initial centroids.
- To improve the chances of finding a global optimum, the algorithm can be run multiple times with different initializations and the best result (i.e., the one with the lowest WCSS) can be selected.
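To make the four steps and the restart advice concrete, here is a minimal NumPy sketch of k-means with random restarts (the helper kmeans_numpy and its parameters are illustrative, not the implementation used in the animations in this notebook); it follows the assignment/update/convergence loop and keeps the restart with the lowest WCSS.

import numpy as np

def kmeans_numpy(X, k, n_init=10, max_iter=100, tol=1e-6, seed=0):
    # Plain k-means with several random restarts; returns (J, labels, centroids) of the best run
    rng = np.random.default_rng(seed)
    best = None
    for _ in range(n_init):
        # Initialization: pick k distinct data points as starting centroids
        centroids = X[rng.choice(len(X), size=k, replace=False)]
        for _ in range(max_iter):
            # Assignment: nearest centroid for every point
            d2 = ((X[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2)
            labels = d2.argmin(axis=1)
            # Update: mean of the points assigned to each cluster (keep the old centroid if a cluster is empty)
            new_centroids = np.array([X[labels == j].mean(axis=0) if np.any(labels == j) else centroids[j]
                                      for j in range(k)])
            # Convergence: stop when the centroids barely move
            if np.linalg.norm(new_centroids - centroids) < tol:
                centroids = new_centroids
                break
            centroids = new_centroids
        wcss = ((X - centroids[labels]) ** 2).sum()  # objective J (WCSS)
        if best is None or wcss < best[0]:
            best = (wcss, labels, centroids)
    return best

# Usage on data similar to the toy example above
X_demo = np.vstack([np.random.randn(50, 2) + m for m in ([4, 4], [-4, -4], [4, -4])])
J, labels, centroids = kmeans_numpy(X_demo, k=3)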
What could possibly go wrong?
Randomness
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython import display
from sklearn.metrics import accuracy_score
from itertools import permutations
# Generating random data for the illustration
np.random.seed(1)
points1 = np.random.randn(50, 2) + [4, 4]
points2 = np.random.randn(50, 2) + [-4, -4]
points3 = np.random.randn(50, 2) + [4, -4]
data = np.vstack([points1, points2, points3])
true_labels = np.array([0]*50 + [1]*50 + [2]*50)

fig, ax = plt.subplots()
ax.grid(linestyle=':')

def compute_accuracy(true_labels, kmeans_labels):
    perm = permutations([0, 1, 2])
    max_accuracy = -1
    for p in perm:
        temp_labels = np.array([p[i] for i in kmeans_labels])
        accuracy = accuracy_score(true_labels, temp_labels)
        if accuracy > max_accuracy:
            max_accuracy = accuracy
    return max_accuracy

# Initialize centroids
def initialize_centroids(points, k):
    return points[np.random.choice(points.shape[0], size=k, replace=False)]

# Closest centroid for each point
def closest_centroid(points, centroids):
    distances = np.sqrt(((points - centroids[:, np.newaxis]) ** 2).sum(axis=2))
    return np.argmin(distances, axis=0)

# Update the centroid locations
def update_centroids(points, closest, centroids):
    return np.array([points[closest == k].mean(axis=0) for k in range(centroids.shape[0])])

def voronoi_polygons(centroids, ax):
    xx, yy = np.meshgrid(np.linspace(ax.get_xlim()[0], ax.get_xlim()[1], 500),
                         np.linspace(ax.get_ylim()[0], ax.get_ylim()[1], 500))
    Z = np.argmin(((xx - centroids[:, 0][:, np.newaxis, np.newaxis]) ** 2 +
                   (yy - centroids[:, 1][:, np.newaxis, np.newaxis]) ** 2), axis=0)
    return Z

def animate(i):
    global data, centroids
    ax.clear()
    ax.grid(linestyle=':')

    if i == 0:
        scatter = ax.scatter(data[:, 0], data[:, 1], c='black')
        ax.set_title("Data")
    elif i == 1:
        centroids = initialize_centroids(data, 3)
        scatter = ax.scatter(data[:, 0], data[:, 1], c='black')
        ax.scatter(centroids[:, 0], centroids[:, 1], s=100, c=['red', 'green', 'blue'], marker='*')
        ax.set_title("Centroid initialization")
    elif i % 2 == 0:
        closest = closest_centroid(data, centroids)
        scatter = ax.scatter(data[:, 0], data[:, 1], c=closest, cmap='rainbow')
        Z = voronoi_polygons(centroids, ax)
        ax.imshow(Z, interpolation='nearest',
                  extent=(ax.get_xlim()[0], ax.get_xlim()[1], ax.get_ylim()[0], ax.get_ylim()[1]),
                  aspect='auto', origin='lower', alpha=0.3, cmap='rainbow')
        for idx, centroid in enumerate(centroids):
            ax.plot(centroid[0], centroid[1], markersize=10, color=plt.cm.rainbow(idx/2.5), markeredgecolor='black', ls='', marker='*')
        ax.set_title(f"Assigning points | Accuracy: {compute_accuracy(true_labels, closest)*100:.2f}%")
    else:
        closest = closest_centroid(data, centroids)
        centroids = update_centroids(data, closest, centroids)
        scatter = ax.scatter(data[:, 0], data[:, 1], c=closest, cmap='rainbow')
        Z = voronoi_polygons(centroids, ax)
        ax.imshow(Z, interpolation='nearest',
                  extent=(ax.get_xlim()[0], ax.get_xlim()[1], ax.get_ylim()[0], ax.get_ylim()[1]),
                  aspect='auto', origin='lower', alpha=0.3, cmap='rainbow')
        for idx, centroid in enumerate(centroids):
            ax.plot(centroid[0], centroid[1], markersize=10, color=plt.cm.rainbow(idx/2.5), markeredgecolor='black', ls='', marker='*')
        ax.set_title("Updating centroids")

    return scatter,

ani = FuncAnimation(fig, animate, frames=20, interval=2000, blit=True)

plt.close()
html = display.HTML(ani.to_html5_video())
display.display(html)
Question: How do you think this could possibly be fixed?
The number of clusters
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython import display
from sklearn.metrics import accuracy_score
from itertools import permutations
K = 20

# Generating random data for the illustration
np.random.seed(1)
points1 = np.random.randn(50, 2) + [4, 4]
points2 = np.random.randn(50, 2) + [-4, -4]
points3 = np.random.randn(50, 2) + [4, -4]
data = np.vstack([points1, points2, points3])
true_labels = np.array([0]*50 + [1]*50 + [2]*50)

fig, ax = plt.subplots()
ax.grid(linestyle=':')

def compute_accuracy(true_labels, kmeans_labels):
    perm = permutations([0, 1, 2])
    max_accuracy = -1
    for p in perm:
        temp_labels = np.array([p[i] for i in kmeans_labels])
        accuracy = accuracy_score(true_labels, temp_labels)
        if accuracy > max_accuracy:
            max_accuracy = accuracy
    return max_accuracy

# Initialize centroids
def initialize_centroids(points, k):
    return points[np.random.choice(points.shape[0], size=k, replace=False)]

# Closest centroid for each point
def closest_centroid(points, centroids):
    distances = np.sqrt(((points - centroids[:, np.newaxis]) ** 2).sum(axis=2))
    return np.argmin(distances, axis=0)

# Update the centroid locations
def update_centroids(points, closest, centroids):
    return np.array([points[closest == k].mean(axis=0) for k in range(centroids.shape[0])])

def voronoi_polygons(centroids, ax):
    xx, yy = np.meshgrid(np.linspace(ax.get_xlim()[0], ax.get_xlim()[1], 500),
                         np.linspace(ax.get_ylim()[0], ax.get_ylim()[1], 500))
    Z = np.argmin(((xx - centroids[:, 0][:, np.newaxis, np.newaxis]) ** 2 +
                   (yy - centroids[:, 1][:, np.newaxis, np.newaxis]) ** 2), axis=0)
    return Z

def animate(i):
    global data, centroids
    ax.clear()
    ax.grid(linestyle=':')

    if i == 0:
        scatter = ax.scatter(data[:, 0], data[:, 1], c='black')
        ax.set_title("Data")
    elif i == 1:
        centroids = initialize_centroids(data, k=K)
        scatter = ax.scatter(data[:, 0], data[:, 1], c='black')
        ax.scatter(centroids[:, 0], centroids[:, 1], s=100, marker='*')
        ax.set_title("Centroid initialization")
    elif i % 2 == 0:
        closest = closest_centroid(data, centroids)

        scatter = ax.scatter(data[:, 0], data[:, 1], c=closest, cmap='rainbow')
        Z = voronoi_polygons(centroids, ax)
        ax.imshow(Z, interpolation='nearest',
                  extent=(ax.get_xlim()[0], ax.get_xlim()[1], ax.get_ylim()[0], ax.get_ylim()[1]),
                  aspect='auto', origin='lower', alpha=0.3, cmap='rainbow')
        for idx, centroid in enumerate(centroids):
            ax.plot(centroid[0], centroid[1], markersize=10, color=plt.cm.rainbow(idx/2.5), markeredgecolor='black', ls='', marker='*')
        ax.set_title("Assigning points")
    else:
        closest = closest_centroid(data, centroids)
        centroids = update_centroids(data, closest, centroids)
        scatter = ax.scatter(data[:, 0], data[:, 1], c=closest, cmap='rainbow')
        Z = voronoi_polygons(centroids, ax)
        ax.imshow(Z, interpolation='nearest',
                  extent=(ax.get_xlim()[0], ax.get_xlim()[1], ax.get_ylim()[0], ax.get_ylim()[1]),
                  aspect='auto', origin='lower', alpha=0.3, cmap='rainbow')
        for idx, centroid in enumerate(centroids):
            ax.plot(centroid[0], centroid[1], markersize=10, color=plt.cm.rainbow(idx/2.5), markeredgecolor='black', ls='', marker='*')
        ax.set_title("Updating centroids")

    return scatter,

ani = FuncAnimation(fig, animate, frames=20, interval=2000, blit=True)

plt.close()
html = display.HTML(ani.to_html5_video())
display.display(html)
Question: How do you think this could possibly be fixed?
Elbow method
- The Elbow method is a "rule-of-thumb" approach to finding the optimal number of clusters.
- Here, we look at the cluster variance for different values of k:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i,
                random_state=1,
                n_init=10)
    km.fit(data)
    distortions.append(km.inertia_)

# Plot the distortions
plt.plot(range(1, 11), distortions, "o-")
plt.xlabel('Number of clusters')
plt.ylabel('Total sum of distances')
plt.grid(linestyle=":")

# Annotate the point at k=3 with an arrow
k = 3
plt.annotate('Elbow', xy=(k, distortions[k-1]), xytext=(k+1.5, distortions[k-1]+(max(distortions)*0.2)),
             arrowprops=dict(facecolor='black', shrink=0.05), horizontalalignment='right')
plt.show()
Then, we pick the value that resembles the "pit of an elbow." As we can see, this would be k=3 in this case, which matches our earlier visual impression of the dataset.
Clustering comes with assumptions:
- A clustering algorithm finds clusters by making assumptions about which samples should be grouped together.
- Each algorithm makes different assumptions, and the quality and interpretability of your results will depend on whether the assumptions are satisfied for your goal.
- For K-means clustering, the model assumption is that all clusters have equal, spherical variance.
In general, there is no guarantee that structure found by a clustering algorithm has anything to do with what you were interested in.
Different variance
from sklearn.datasets import make_blobs
n_samples = 1500
random_state = 170

X_varied, y_varied = make_blobs(n_samples=n_samples,
                                cluster_std=[1.0, 2.5, 0.5],
                                random_state=random_state)

kmeans = KMeans(n_clusters=3, random_state=random_state, n_init=10)

labels = kmeans.fit_predict(X_varied)

plt.scatter(X_varied[:, 0], X_varied[:, 1], c=labels, alpha=.5)
plt.title("Unequal Variance")
plt.grid(linestyle=":")
plt.show()
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython import display
from sklearn.metrics import accuracy_score
from itertools import permutations
# Generating random data for the illustration
np.random.seed(1)
points1 = 1e-1*np.random.randn(50, 2) + [1.1, 1.1]
points2 = np.random.randn(50, 2) + [-1.1, -1.1]
points3 = 6e-1*np.random.randn(50, 2) + [1.1, -1.1]
data = np.vstack([points1, points2, points3])
true_labels = np.array([0]*50 + [1]*50 + [2]*50)

fig, ax = plt.subplots()
ax.grid(linestyle=':')

def compute_accuracy(true_labels, kmeans_labels):
    perm = permutations([0, 1, 2])
    max_accuracy = -1
    for p in perm:
        temp_labels = np.array([p[i] for i in kmeans_labels])
        accuracy = accuracy_score(true_labels, temp_labels)
        if accuracy > max_accuracy:
            max_accuracy = accuracy
    return max_accuracy

# Initialize centroids
def initialize_centroids(points, k):
    return points[np.random.choice(points.shape[0], size=k, replace=False)]

# Closest centroid for each point
def closest_centroid(points, centroids):
    distances = np.sqrt(((points - centroids[:, np.newaxis]) ** 2).sum(axis=2))
    return np.argmin(distances, axis=0)

# Update the centroid locations
def update_centroids(points, closest, centroids):
    return np.array([points[closest == k].mean(axis=0) for k in range(centroids.shape[0])])

def voronoi_polygons(centroids, ax):
    xx, yy = np.meshgrid(np.linspace(ax.get_xlim()[0], ax.get_xlim()[1], 500),
                         np.linspace(ax.get_ylim()[0], ax.get_ylim()[1], 500))
    Z = np.argmin(((xx - centroids[:, 0][:, np.newaxis, np.newaxis]) ** 2 +
                   (yy - centroids[:, 1][:, np.newaxis, np.newaxis]) ** 2), axis=0)
    return Z

def animate(i):
    global data, centroids
    ax.clear()
    ax.grid(linestyle=':')

    if i == 0:
        scatter = ax.scatter(data[:, 0], data[:, 1], c='black')
        ax.set_title("Data")
    elif i == 1:
        centroids = initialize_centroids(data, 3)
        scatter = ax.scatter(data[:, 0], data[:, 1], c='black')
        ax.scatter(centroids[:, 0], centroids[:, 1], s=100, c=['red', 'green', 'blue'], marker='*')
        ax.set_title("Centroid initialization")
    elif i % 2 == 0:
        closest = closest_centroid(data, centroids)
        scatter = ax.scatter(data[:, 0], data[:, 1], c=closest, cmap='rainbow')
        Z = voronoi_polygons(centroids, ax)
        ax.imshow(Z, interpolation='nearest',
                  extent=(ax.get_xlim()[0], ax.get_xlim()[1], ax.get_ylim()[0], ax.get_ylim()[1]),
                  aspect='auto', origin='lower', alpha=0.3, cmap='rainbow')
        for idx, centroid in enumerate(centroids):
            ax.plot(centroid[0], centroid[1], markersize=10, color=plt.cm.rainbow(idx/2.5), markeredgecolor='black', ls='', marker='*')
        ax.set_title(f"Assigning points | Accuracy: {compute_accuracy(true_labels, closest)*100:.2f}%")
    else:
        closest = closest_centroid(data, centroids)
        centroids = update_centroids(data, closest, centroids)
        scatter = ax.scatter(data[:, 0], data[:, 1], c=closest, cmap='rainbow')
        Z = voronoi_polygons(centroids, ax)
        ax.imshow(Z, interpolation='nearest',
                  extent=(ax.get_xlim()[0], ax.get_xlim()[1], ax.get_ylim()[0], ax.get_ylim()[1]),
                  aspect='auto', origin='lower', alpha=0.3, cmap='rainbow')
        for idx, centroid in enumerate(centroids):
            ax.plot(centroid[0], centroid[1], markersize=10, color=plt.cm.rainbow(idx/2.5), markeredgecolor='black', ls='', marker='*')
        ax.set_title("Updating centroids")

    return scatter,

ani = FuncAnimation(fig, animate, frames=20, interval=2000, blit=True)

plt.close()
html = display.HTML(ani.to_html5_video())
display.display(html)
Unevenly sized blobs
from sklearn.datasets import make_blobs
n_samples = 1500
random_state = 170

X, y = make_blobs(n_samples=n_samples,
                  cluster_std=[1.5, 1.5, 1.5],
                  random_state=random_state)
X_filtered = np.vstack((X[y == 0][:500], X[y == 1][:100], X[y == 2][:10]))

kmeans = KMeans(n_clusters=3, random_state=random_state,
                n_init=10)

labels = kmeans.fit_predict(X_filtered)

plt.scatter(X_filtered[:, 0], X_filtered[:, 1], c=labels, alpha=0.5)
plt.title("Unevenly Sized Blobs")
plt.grid(linestyle=":")
plt.show()
⬬ Anisotropic data
n_samples = 1500
random_state = 170

X, y = make_blobs(n_samples=n_samples, cluster_std=[1.0, 1.0, 1.0], random_state=random_state)

transformation = [[ 0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
X_aniso = np.dot(X, transformation)

plt.title("Anisotropically Distributed Blobs")
plt.grid(linestyle=":")
plt.scatter(X_aniso[:, 0], X_aniso[:, 1])
plt.show()

kmeans = KMeans(n_clusters=3, random_state=random_state, n_init=10)

labels = kmeans.fit_predict(X_aniso)

plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=labels)
plt.title("Anisotropically Distributed Blobs")
plt.grid(linestyle=":")
plt.show()
Gaussian Mixture could solve it
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=3, covariance_type='full',
                      random_state=random_state)
gmm.fit(X_aniso)
y_pred = gmm.predict(X_aniso)

plt.scatter(X_aniso[:, 0], X_aniso[:, 1], c=y_pred)
plt.title("Anisotropically Distributed Blobs")
plt.grid(linestyle=":")
plt.show()
Notable Clustering Routines
The following are several well-known clustering algorithms.
- sklearn.cluster.KMeans: The simplest, yet effective clustering algorithm. Needs to be provided with the number of clusters in advance, and assumes that the data is normalized as input (but you can use a PCA model as preprocessor).
- sklearn.cluster.MeanShift: Can find better-looking clusters than KMeans but is not scalable to a high number of samples.
- sklearn.cluster.DBSCAN: Can detect irregularly shaped clusters based on density, i.e. sparse regions in the input space are likely to become inter-cluster boundaries. Can also detect outliers (samples that are not part of a cluster).
- sklearn.cluster.AffinityPropagation: Clustering algorithm based on message passing between data points.
- sklearn.cluster.SpectralClustering: KMeans applied to a projection of the normalized graph Laplacian: finds normalized graph cuts if the affinity matrix is interpreted as an adjacency matrix of a graph.
- sklearn.cluster.Ward: Implements hierarchical clustering based on the Ward algorithm, a variance-minimizing approach. At each step, it minimizes the sum of squared differences within all clusters (inertia criterion).
Of these, Ward, SpectralClustering, DBSCAN and Affinity propagation can also work with precomputed similarity matrices.
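As a hedged illustration of that "precomputed" option (exact parameter names depend on your scikit-learn version; the matrices below are built only for demonstration), a distance matrix can be fed to DBSCAN and a similarity matrix to SpectralClustering instead of raw features:

import numpy as np
from sklearn.metrics import pairwise_distances
from sklearn.cluster import DBSCAN, SpectralClustering

X_toy = np.random.RandomState(0).randn(100, 2)

# DBSCAN accepts a precomputed *distance* matrix
D = pairwise_distances(X_toy, metric="euclidean")
db_labels = DBSCAN(eps=0.5, min_samples=5, metric="precomputed").fit_predict(D)

# SpectralClustering accepts a precomputed *similarity* (affinity) matrix
S = np.exp(-D ** 2 / 2.0)  # RBF-style similarity derived from the distances
sc_labels = SpectralClustering(n_clusters=3, affinity="precomputed", random_state=0).fit_predict(S)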
# Source: https://scikit-learn.org/stable/auto_examples/cluster/plot_cluster_comparison.html
import time
import warnings
from itertools import cycle, islice
import matplotlib.pyplot as plt
import numpy as np
from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler
# ============
# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
# ============
n_samples = 500
seed = 30
noisy_circles = datasets.make_circles(
    n_samples=n_samples, factor=0.5, noise=0.05, random_state=seed
)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05, random_state=seed)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=seed)
rng = np.random.RandomState(seed)
no_structure = rng.rand(n_samples, 2), None

# Anisotropicly distributed data
random_state = 170
X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# blobs with varied variances
varied = datasets.make_blobs(
    n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state
)

# ============
# Set up cluster parameters
# ============
plt.figure(figsize=(9 * 2 + 3, 13))
plt.subplots_adjust(
    left=0.02, right=0.98, bottom=0.001, top=0.95, wspace=0.05, hspace=0.01
)

plot_num = 1

default_base = {
    "quantile": 0.3,
    "eps": 0.3,
    "damping": 0.9,
    "preference": -200,
    "n_neighbors": 3,
    "n_clusters": 3,
    "min_samples": 7,
    "xi": 0.05,
    "min_cluster_size": 0.1,
    "allow_single_cluster": True,
    "hdbscan_min_cluster_size": 15,
    "hdbscan_min_samples": 3,
    "random_state": 42,
}

datasets = [
    (
        noisy_circles,
        {
            "damping": 0.77,
            "preference": -240,
            "quantile": 0.2,
            "n_clusters": 2,
            "min_samples": 7,
            "xi": 0.08,
        },
    ),
    (
        noisy_moons,
        {
            "damping": 0.75,
            "preference": -220,
            "n_clusters": 2,
            "min_samples": 7,
            "xi": 0.1,
        },
    ),
    (
        varied,
        {
            "eps": 0.18,
            "n_neighbors": 2,
            "min_samples": 7,
            "xi": 0.01,
            "min_cluster_size": 0.2,
        },
    ),
    (
        aniso,
        {
            "eps": 0.15,
            "n_neighbors": 2,
            "min_samples": 7,
            "xi": 0.1,
            "min_cluster_size": 0.2,
        },
    ),
    (blobs, {"min_samples": 7, "xi": 0.1, "min_cluster_size": 0.2}),
    (no_structure, {}),
]

for i_dataset, (dataset, algo_params) in enumerate(datasets):
    # update parameters with dataset-specific values
    params = default_base.copy()
    params.update(algo_params)

    X, y = dataset

    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=params["quantile"])

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(
        X, n_neighbors=params["n_neighbors"], include_self=False
    )
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Create cluster objects
    # ============
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(
        n_clusters=params["n_clusters"],
        n_init="auto",
        random_state=params["random_state"],
    )
    ward = cluster.AgglomerativeClustering(
        n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
    )
    spectral = cluster.SpectralClustering(
        n_clusters=params["n_clusters"],
        eigen_solver="arpack",
        affinity="nearest_neighbors",
        random_state=params["random_state"],
    )
    dbscan = cluster.DBSCAN(eps=params["eps"])
    hdbscan = cluster.HDBSCAN(
        min_samples=params["hdbscan_min_samples"],
        min_cluster_size=params["hdbscan_min_cluster_size"],
        allow_single_cluster=params["allow_single_cluster"],
    )
    optics = cluster.OPTICS(
        min_samples=params["min_samples"],
        xi=params["xi"],
        min_cluster_size=params["min_cluster_size"],
    )
    affinity_propagation = cluster.AffinityPropagation(
        damping=params["damping"],
        preference=params["preference"],
        random_state=params["random_state"],
    )
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        metric="cityblock",
        n_clusters=params["n_clusters"],
        connectivity=connectivity,
    )
    birch = cluster.Birch(n_clusters=params["n_clusters"])
    gmm = mixture.GaussianMixture(
        n_components=params["n_clusters"],
        covariance_type="full",
        random_state=params["random_state"],
    )

    clustering_algorithms = (
        ("MiniBatch\nKMeans", two_means),
        ("Affinity\nPropagation", affinity_propagation),
        ("MeanShift", ms),
        ("Spectral\nClustering", spectral),
        ("Ward", ward),
        ("Agglomerative\nClustering", average_linkage),
        ("DBSCAN", dbscan),
        ("HDBSCAN", hdbscan),
        ("OPTICS", optics),
        ("BIRCH", birch),
        ("Gaussian\nMixture", gmm),
    )

    for name, algorithm in clustering_algorithms:
        t0 = time.time()

        # catch warnings related to kneighbors_graph
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the "
                + "connectivity matrix is [0-9]{1,2}"
                + " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning,
            )
            warnings.filterwarnings(
                "ignore",
                message="Graph is not fully connected, spectral embedding"
                + " may not work as expected.",
                category=UserWarning,
            )
            algorithm.fit(X)

        t1 = time.time()
        if hasattr(algorithm, "labels_"):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)

        colors = np.array(
            list(
                islice(
                    cycle(
                        [
                            "#377eb8",
                            "#ff7f00",
                            "#4daf4a",
                            "#f781bf",
                            "#a65628",
                            "#984ea3",
                            "#999999",
                            "#e41a1c",
                            "#dede00",
                        ]
                    ),
                    int(max(y_pred) + 1),
                )
            )
        )
        # add black color for outliers (if any)
        colors = np.append(colors, ["#000000"])
        plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)
        plt.xticks(())
        plt.yticks(())
        plt.text(
            0.99,
            0.01,
            ("%.2fs" % (t1 - t0)).lstrip("0"),
            transform=plt.gca().transAxes,
            size=15,
            horizontalalignment="right",
        )
        plot_num += 1

plt.show()
Exercise: cluster digits
- Perform K-means clustering on the digits data, searching for ten clusters.
- Visualize the cluster centers as images (i.e. reshape each to 8x8 and use plt.imshow).
- Do the clusters seem to be correlated with particular digits?
- What is the adjusted_rand_score?
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_openml
from sklearn.metrics import adjusted_rand_score
# Load the MNIST dataset
mnist = fetch_openml('mnist_784',
                     parser="auto")
X = mnist.data / 255.0  # Normalize data to [0, 1]
y = mnist.target.astype(int)  # Convert target values to integers

# Apply K-means clustering
n_clusters = 10

### YOUR CODE HERE
kmeans = None

# Visualize the cluster centers as images
fig, axes = plt.subplots(2, 5, figsize=(8, 4))
axes = axes.ravel()

for i, center in enumerate(kmeans.cluster_centers_):
    axes[i].imshow(center.reshape(28, 28), cmap=plt.cm.gray)
    axes[i].axis('off')
    axes[i].set_title(f'Cluster {i}')

plt.tight_layout()
plt.show()

# Do the clusters seem to be correlated with particular digits?
# Let's compute the most common label in each cluster
labels = np.zeros_like(kmeans.labels_)
for i in range(n_clusters):
    mask = (kmeans.labels_ == i)
    labels[mask] = np.bincount(y[mask]).argmax()

# Calculate the adjusted_rand_score
### YOUR CODE HERE
score = None
print(f"Adjusted Rand Score: {score:.4f}")
Try the light version of MNIST (load_digits from sklearn)
from sklearn.datasets import load_digits
digits = load_digits(n_class=10)
digits.data.shape

### YOUR CODE HERE
kmeans = None
clusters = kmeans.fit_predict(digits.data)

# Visualize the cluster centers as images
fig, axes = plt.subplots(2, 5, figsize=(8, 4))
axes = axes.ravel()

for i, center in enumerate(kmeans.cluster_centers_):
    axes[i].imshow(center.reshape(8, 8), cmap=plt.cm.gray)
    axes[i].axis('off')
    axes[i].set_title(f'Cluster {i}')

plt.show()

### YOUR CODE HERE
score = None
print(f"Adjusted Rand Score: {score:.4f}")
0.6649258693926379
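For reference, one hedged way to fill in the placeholders above for the load_digits variant (the hyperparameters below are an illustrative choice, not the official solution):

from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.metrics import adjusted_rand_score

digits = load_digits(n_class=10)
kmeans = KMeans(n_clusters=10, n_init=10, random_state=0)  # one cluster per digit
clusters = kmeans.fit_predict(digits.data)
score = adjusted_rand_score(digits.target, clusters)
print(f"Adjusted Rand Score: {score:.4f}")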
Hierarchical Clustering
- One nice feature of hierarchical clustering is that we can visualize the results as a dendrogram, a hierarchical tree.
- Using the visualization, we can then decide how "deep" we want to cluster the dataset by setting a "depth" threshold.
- Or in other words, we don't need to make a decision about the number of clusters upfront.
Agglomerative and divisive hierarchical clustering
- Furthermore, we can distinguish between two main approaches to hierarchical clustering: divisive clustering and agglomerative clustering.
- In agglomerative clustering, we start with each sample as its own cluster and iteratively merge the most similar clusters into larger ones - a bottom-up approach to building the clustering dendrogram.
- In divisive clustering, however, we start with the whole dataset as one cluster, and we iteratively split it into smaller subclusters - a top-down approach.
In this notebook, we will use agglomerative clustering.
Single and complete linkage
- Now, the next question is how we measure the similarity between samples.
- One approach is the familiar Euclidean distance metric that we already used via the K-Means algorithm.
- As a refresher, the distance between 2 m-dimensional vectors \(\mathbf{p}\) and \(\mathbf{q}\) can be computed as:
\[\begin{align} \mathrm{d}(\mathbf{q},\mathbf{p}) & = \sqrt{(q_1-p_1)^2 + (q_2-p_2)^2 + \cdots + (q_m-p_m)^2} \\[8pt] & = \sqrt{\sum_{j=1}^m (q_j-p_j)^2} = \|\mathbf{q} - \mathbf{p}\|_2\end{align}\]
However, that's the distance between 2 samples.
Now, how do we compute the similarity between subclusters of samples?
I.e., our goal is to iteratively merge the most similar pairs of clusters until only one big cluster remains.
There are many different approaches to this, for example single and complete linkage.
In single linkage, we take the pair of the most similar samples (based on the Euclidean distance, for example) in each cluster, and merge the two clusters which have the most similar 2 members into one new, bigger cluster.
In complete linkage, we compare the pairs of the two most dissimilar members of each cluster with each other, and we merge the 2 clusters where the distance between its 2 most dissimilar members is smallest.
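As a small sketch of this difference (the toy points below are made up for illustration), SciPy's linkage can be run with method='single' and method='complete' on the same data, and the heights at which clusters merge differ:

import numpy as np
from scipy.cluster.hierarchy import linkage

rng = np.random.default_rng(0)
pts = np.vstack([rng.normal(0, 0.3, (10, 2)), rng.normal(3, 0.3, (10, 2))])

single = linkage(pts, method='single', metric='euclidean')      # merges by the closest pair of members
complete = linkage(pts, method='complete', metric='euclidean')  # merges by the farthest pair of members

# The last row of a linkage matrix is the final merge; column 2 holds its distance
print(single[-1, 2], complete[-1, 2])  # the complete-linkage merge height is larger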
from sklearn import datasets
iris = datasets.load_iris()
X = iris.data[:, [2, 3]]
y = iris.target
n_samples, n_features = X.shape

plt.scatter(X[:, 0], X[:, 1], c=y)
plt.grid(linestyle=":")
plt.show()
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
clusters = linkage(X, metric='euclidean', method='complete')
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
dendr = dendrogram(clusters, ax=ax)
plt.ylabel('Euclidean Distance')
plt.show()
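Since the dendrogram lets us choose how "deep" to cluster, here is a hedged sketch of cutting it at a distance threshold with SciPy's fcluster (the threshold value 3 is an arbitrary choice for this illustration):

from scipy.cluster.hierarchy import fcluster

# Flat cluster labels obtained by cutting the dendrogram above at height 3
flat_labels = fcluster(clusters, t=3, criterion='distance')
print(np.unique(flat_labels))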
from sklearn.cluster import AgglomerativeClustering
ac = AgglomerativeClustering(n_clusters=3,
                             affinity='euclidean',
                             linkage='complete')

prediction = ac.fit_predict(X)
print('Cluster labels: %s\n' % prediction)
Cluster labels: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1
1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0]
/Users/bratishka/.pyenv/versions/3.9.17/envs/benchmarx/lib/python3.9/site-packages/sklearn/cluster/_agglomerative.py:1006: FutureWarning: Attribute `affinity` was deprecated in version 1.2 and will be removed in 1.4. Use `metric` instead
warnings.warn(
plt.scatter(X[:, 0], X[:, 1], c=prediction)
plt.grid(linestyle=":")
plt.show()
Density-based Clustering - DBSCAN
- Another useful approach to clustering is Density-based Spatial Clustering of Applications with Noise (DBSCAN).
- In essence, we can think of DBSCAN as an algorithm that divides the dataset into subgroups based on dense regions of points.
In DBSCAN, we distinguish between 3 different kinds of points:
- Core points: A core point is a point that has at least a minimum number of other points (MinPts) within its radius epsilon.
- Border points: A border point is a point that is not a core point, since it doesn't have enough MinPts in its neighborhood, but lies within the radius epsilon of a core point.
- Noise points: All other points that are neither core points nor border points.
A nice feature of DBSCAN is that we don't have to specify the number of clusters upfront. However, it requires setting additional hyperparameters such as the value for MinPts and the radius epsilon (a short sketch after the example below shows how to tell these point types apart on a fitted model).
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=400, noise=0.1, random_state=1)
plt.scatter(X[:, 0], X[:, 1])
plt.grid(linestyle=":")
plt.show()
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=0.2, min_samples=10, metric='euclidean')
labels = db.fit_predict(X)
print(labels)

plt.scatter(X[:, 0], X[:, 1], c=labels)
plt.grid(linestyle=":")
plt.show()
[ 0 1 0 0 1 0 1 0 1 0 1 0 1 0 0 0 0 1 1 0 1 0 0 0
1 1 0 0 0 0 1 1 1 0 1 1 1 1 0 1 0 0 1 1 0 1 0 0
0 0 1 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 0 1
0 0 1 0 1 1 1 0 1 1 0 -1 1 1 0 0 0 1 1 0 0 0 1 0
0 0 0 1 1 0 1 1 1 0 1 0 0 0 1 0 1 0 1 1 0 0 1 0
1 1 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 1 1 1 1
1 1 1 1 1 1 0 1 1 1 0 0 1 1 1 0 1 0 0 1 0 1 1 0
1 0 0 0 0 0 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 0 0
0 1 0 1 1 0 1 1 0 1 0 0 0 1 1 1 0 1 0 1 0 0 0 1
0 1 0 1 1 1 1 0 1 0 0 0 0 0 0 1 0 1 0 1 1 -1 0 0
1 1 1 1 1 1 1 0 1 1 0 1 1 0 1 0 1 0 0 0 0 0 0 1
0 0 0 1 1 1 1 0 -1 0 1 1 1 1 1 0 1 0 1 0 1 1 1 1
1 0 1 0 1 0 1 0 0 1 0 1 0 1 0 1 0 1 0 0 1 0 1 1
0 0 0 1 1 0 0 1 1 0 0 1 0 1 0 0 0 1 1 1 0 1 0 1
1 0 0 1 1 0 0 1 1 1 1 0 1 0 0 1 1 1 0 1 0 1 0 1
1 1 0 0 0 0 0 1 0 1 0 0 1 1 1 0 0 0 1 1 1 0 0 0
0 1 0 0 1 1 1 0 1 0 0 0 1 0 0 1]
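Using the db object fitted above, a short sketch (relying on scikit-learn's core_sample_indices_ attribute) of how the three kinds of points can be told apart:

import numpy as np

core_mask = np.zeros(len(X), dtype=bool)
core_mask[db.core_sample_indices_] = True   # core points
noise_mask = (labels == -1)                 # noise points get the label -1
border_mask = ~core_mask & ~noise_mask      # border points: inside a cluster but not core

print(f"core: {core_mask.sum()}, border: {border_mask.sum()}, noise: {noise_mask.sum()}")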
Extremely good visualization of DBSCAN
Exercise
- Using the following toy dataset, two concentric circles, experiment with the three different clustering algorithms that we used so far: KMeans, AgglomerativeClustering, and DBSCAN.
- Which clustering algorithm reproduces or discovers the hidden structure (pretending we don't know y) best?
- Can you explain why this particular algorithm is a good choice while the other 2 "fail"?
from sklearn.datasets import make_circles
X, y = make_circles(n_samples=500, factor=.6, noise=.05)

plt.scatter(X[:, 0], X[:, 1], c=y)
plt.grid(linestyle=":")
plt.show()
### YOUR CODE HERE
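A possible sketch for the comparison (one of many reasonable parameter choices, not a reference solution):

from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN

models = {
    "KMeans": KMeans(n_clusters=2, n_init=10, random_state=0),
    "Agglomerative": AgglomerativeClustering(n_clusters=2),
    "DBSCAN": DBSCAN(eps=0.2, min_samples=5),
}

fig, axes = plt.subplots(1, 3, figsize=(12, 4))
for ax, (name, model) in zip(axes, models.items()):
    pred = model.fit_predict(X)
    ax.scatter(X[:, 0], X[:, 1], c=pred)
    ax.set_title(name)
    ax.grid(linestyle=":")
plt.show()

Only the density-based algorithm can follow the ring shape here, since k-means and Ward-linkage agglomerative clustering both favor compact, convex clusters.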
Community detection
Karate club
!pip install -q community node2vec python-louvain
import networkx as nx
import matplotlib.pyplot as plt
# Load the Karate Club graph
G = nx.karate_club_graph()

# Draw the graph
plt.figure(figsize=(8, 6))
nx.draw(G, with_labels=True, node_color="skyblue", node_size=1000)
plt.title("Karate Club Graph")
plt.show()
print(nx.adjacency_matrix(G))
(0, 1) 4
(0, 2) 5
(0, 3) 3
(0, 4) 3
(0, 5) 3
(0, 6) 3
(0, 7) 2
(0, 8) 2
(0, 10) 2
(0, 11) 3
(0, 12) 1
(0, 13) 3
(0, 17) 2
(0, 19) 2
(0, 21) 2
(0, 31) 2
(1, 0) 4
(1, 2) 6
(1, 3) 3
(1, 7) 4
(1, 13) 5
(1, 17) 1
(1, 19) 2
(1, 21) 2
(1, 30) 2
: :
(32, 18) 1
(32, 20) 3
(32, 22) 2
(32, 23) 5
(32, 29) 4
(32, 30) 3
(32, 31) 4
(32, 33) 5
(33, 8) 4
(33, 9) 2
(33, 13) 3
(33, 14) 2
(33, 15) 4
(33, 18) 2
(33, 19) 1
(33, 20) 1
(33, 22) 3
(33, 23) 4
(33, 26) 2
(33, 27) 4
(33, 28) 2
(33, 29) 2
(33, 30) 3
(33, 31) 4
(33, 32) 5
from node2vec import Node2Vec
# Generate embeddings using Node2Vec
node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)
model = node2vec.fit(window=10, min_count=1)

# Get embeddings for all nodes
embeddings = [model.wv[str(node)] for node in G.nodes()]
Generating walks (CPU: 4): 100%|██████████| 50/50 [00:00<00:00, 531.03it/s]
Generating walks (CPU: 2): 100%|██████████| 50/50 [00:00<00:00, 528.77it/s]
Generating walks (CPU: 1): 100%|██████████| 50/50 [00:00<00:00, 516.09it/s]
Generating walks (CPU: 3): 100%|██████████| 50/50 [00:00<00:00, 506.18it/s]
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5)
kmeans_clusters = kmeans.fit_predict(embeddings)

# Visualize K-Means clusters
plt.figure(figsize=(8, 6))
nx.draw(G, with_labels=True, node_color=kmeans_clusters, cmap="coolwarm", node_size=1000)
plt.title("K-Means Clustering on Karate Club Graph")
plt.show()
/Users/bratishka/.pyenv/versions/3.9.17/envs/benchmarx/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
super()._check_params_vs_input(X, default_n_init=10)
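The karate club graph also ships with a ground-truth split into two factions (the 'club' node attribute in networkx), so a quick, hedged sanity check of the embedding-based clusters is to compare them against that attribute:

from sklearn.metrics import adjusted_rand_score

# 'club' is either 'Mr. Hi' or 'Officer' for every node
true_club = [G.nodes[v]["club"] for v in G.nodes()]
print(adjusted_rand_score(true_club, kmeans_clusters))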
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_clusters = dbscan.fit_predict(embeddings)

# Visualize DBSCAN clusters
plt.figure(figsize=(8, 6))
nx.draw(G, with_labels=True, node_color=dbscan_clusters, cmap="coolwarm", node_size=1000)
plt.title("DBSCAN Clustering on Karate Club Graph")
plt.show()
Facebook ego networks
import networkx as nx
import requests
import gzip
import shutil
from matplotlib import pyplot as plt
# Download the dataset
= "https://snap.stanford.edu/data/facebook_combined.txt.gz"
url = requests.get(url, stream=True)
response with open("facebook_combined.txt.gz", "wb") as file:
for chunk in response.iter_content(chunk_size=128):
file.write(chunk)
# Unzip the dataset
with gzip.open("facebook_combined.txt.gz", 'rb') as f_in:
with open("facebook_combined.txt", 'wb') as f_out:
shutil.copyfileobj(f_in, f_out)
# Load the Facebook graph
= "facebook_combined.txt"
path_to_dataset = nx.read_edgelist(path_to_dataset)
G
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")
Number of nodes: 4039
Number of edges: 88234
# Create network layout for visualizations
spring_pos = nx.layout.spring_layout(G)

plt.axis("off")
nx.draw_networkx(G, pos=spring_pos, with_labels=False, node_size=15)
import community as community_louvain
parts = community_louvain.best_partition(G)
values = [parts.get(node) for node in G.nodes()]

# drawing
plt.axis("off")
nx.draw_networkx(G, pos=spring_pos, cmap=plt.get_cmap("Dark2"),
                 node_color=values, node_size=35, with_labels=False)
print(community_louvain.modularity(parts, G))
0.8348452461385243
from node2vec import Node2Vec
# Generate embeddings using Node2Vec
node2vec = Node2Vec(G, dimensions=128, walk_length=30, num_walks=50, workers=4)
model = node2vec.fit(window=10, min_count=1)

# Get embeddings for all nodes
embeddings = [model.wv[node] for node in G.nodes()]
Generating walks (CPU: 1): 100%|██████████| 13/13 [00:09<00:00, 1.35it/s]
Generating walks (CPU: 2): 100%|██████████| 13/13 [00:09<00:00, 1.35it/s]
Generating walks (CPU: 3): 100%|██████████| 12/12 [00:08<00:00, 1.36it/s]
Generating walks (CPU: 4): 100%|██████████| 12/12 [00:08<00:00, 1.38it/s]
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=10)  # assuming 10 clusters for demonstration
kmeans_clusters = kmeans.fit_predict(embeddings)
/Users/bratishka/.pyenv/versions/3.9.17/envs/benchmarx/lib/python3.9/site-packages/sklearn/cluster/_kmeans.py:1416: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning
super()._check_params_vs_input(X, default_n_init=10)
from sklearn.cluster import DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_clusters = dbscan.fit_predict(embeddings)
import matplotlib.pyplot as plt
import random
# Sample a subgraph
sampled_nodes = random.sample(G.nodes(), 1000)  # sample 1000 nodes
subG = G.subgraph(sampled_nodes)

# Get clusters for sampled nodes
sampled_kmeans_clusters = [kmeans_clusters[int(node)] for node in sampled_nodes]

# Visualize K-Means clusters on subgraph
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(subG)
nx.draw(subG, pos, node_color=sampled_kmeans_clusters, cmap="coolwarm", node_size=50)
plt.title("K-Means Clustering on Facebook Subgraph")
plt.show()
/var/folders/7m/3rbdnx5n5sz625f3l87m91cc0000gn/T/ipykernel_24364/271705878.py:5: DeprecationWarning: Sampling from a set deprecated
since Python 3.9 and will be removed in a subsequent version.
sampled_nodes = random.sample(G.nodes(), 1000) # sample 1000 nodes