import random
# Scratch demo: shuffle a small list in place and display the result.
x = [1, 2, 3, 4, 9, 10]
random.shuffle(x)  # reorders x in place and returns None
x  # bare expression: rendered as the cell output in a notebook
[3, 9, 2, 1, 10, 4]
import numpy as np
from matplotlib import pyplot as plt
from ipywidgets import interact

Overview:

  • Supervised vs unsupervised. Meta-picture still holds. What is "generalisation"?
  • Mind map. Clustering. EM. KMeans. Hierarchical. Normal mixture.

Question 1 & 2

def distance(x, y):
    """Return the squared Euclidean distance between ``x`` and ``y``.

    No square root is taken. That is sufficient for nearest-centre lookups,
    because squaring does not change which candidate is closest.

    Args:
        x: A scalar or array-like point (list, tuple or ndarray).
        y: A scalar or array-like point that broadcasts against ``x``.

    Returns:
        The sum of squared coordinate differences, as a NumPy scalar.
    """
    # Coerce first so that two plain lists/tuples can be subtracted.
    diff = np.asarray(x) - np.asarray(y)
    return np.sum(diff**2)

# 1-D k-means on the integers 1..10. Every round is drawn as one horizontal
# row of points (y = round index) so the evolution of the clusters is visible.
D = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
K = 2
# Alternative seed to try: init_seed = [1, 2]
init_seed = [2, 9]

# centers_history[r] holds the centres used by the assignment step of round r.
centers_history = [init_seed]
clusters = None

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
cmap = plt.get_cmap("tab10")

num_rounds = 10
for r in range(num_rounds):
    current_centers = centers_history[-1]
    clusters = [[] for _ in range(K)]

    # Assignment step: each point joins the cluster of its nearest centre.
    for point in D:
        best_cluster_index = np.argmin([distance(point, center) for center in current_centers])
        clusters[best_cluster_index].append(point)
    print(clusters)

    # Update step: move each centre to the mean of its members. An empty
    # cluster keeps its previous centre, because np.mean([]) is NaN and a NaN
    # centre would corrupt every later argmin and never compare equal.
    new_centers = [
        np.mean(members) if members else center
        for members, center in zip(clusters, current_centers)
    ]
    centers_history.append(new_centers)

    # Plotting: members in the cluster colour, updated centre as a red star.
    for i, members in enumerate(clusters):
        ax.scatter(members, np.full(len(members), r), color=cmap(i))
        ax.plot([new_centers[i]], [r], "r*", markersize=15, alpha=0.5)

    # Stopping condition: exact equality is enough here, since an unchanged
    # assignment reproduces exactly the same means.
    if new_centers == current_centers:
        print("Centers converged.")
        break
        
        
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
[[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]]
Centers converged.

2D version

# Scratch check of a single assignment step against the current `init_seed`.
# One bucket per seed centre: with a fixed bucket count, argmin can select a
# centre index that has no bucket and raise IndexError.
clusters = [[] for _ in init_seed]
for x in D:
    nearest = np.argmin([distance(x, center) for center in init_seed])
    clusters[nearest].append(x)
# Mean of the first cluster; axis=0 averages over the points in that cluster.
np.array(clusters[0]).mean(axis=0)
array([[ 2.99566573,  1.40041147],
       [ 2.07301107, -0.94884373],
       [ 2.24820435,  1.02345642],
       [ 1.09671259, -0.85771165],
       [ 2.54128794,  0.93508839]])
# 2-D k-means on synthetic data: 30 points drawn around each of four means
# with identity covariance.
true_means = [(1, 1), (1, 8), (8, 8), (2, 3)]
num_data_per_cluster = 30
D = np.concatenate(
    [np.random.multivariate_normal(mean=m, cov=np.eye(2), size=num_data_per_cluster) for m in true_means]
)

K = 4
# Seed the centres with K distinct data points (sampling without replacement).
rand_indices = np.random.choice(np.arange(len(D)), size=K, replace=False)
init_seed = D[rand_indices, :]
# centers_history[r] holds the centres used by the assignment step of round r;
# cluster_history[r] holds the resulting assignment.
centers_history = [init_seed]
cluster_history = []

num_rounds = 10
for r in range(num_rounds):
    current_centers = centers_history[-1]
    clusters = [[] for _ in range(K)]

    # Assignment step: each point joins the cluster of its nearest centre.
    for x in D:
        best_cluster_index = np.argmin([distance(x, center) for center in current_centers])
        clusters[best_cluster_index].append(x)
    cluster_history.append(clusters)

    # Update step: move each centre to the mean of its members. An empty
    # cluster keeps its previous centre: np.array([]).mean(axis=0) is a NaN
    # scalar, which cannot be unpacked as (x, y) and would corrupt every
    # later nearest-centre lookup.
    new_centers = [
        np.array(members).mean(axis=0) if members else center
        for members, center in zip(clusters, current_centers)
    ]
    centers_history.append(new_centers)

    # Stopping condition: total squared movement of the centres is negligible.
    if np.sum([np.sum((a - b)**2) for a, b in zip(new_centers, current_centers)]) < 0.001:
        print(f"Centers converged at round {r}")
        break
        




@interact(t=(0, len(cluster_history)-1))
def plot(t=-1):
    """Draw the clusters and centres recorded at index ``t``.

    Members of ``cluster_history[t]`` are scattered in one colour per cluster
    and the matching entries of ``centers_history[t]`` are drawn as blue
    stars. An empty cluster is reported on stdout instead of being drawn.

    Args:
        t: Index into ``cluster_history`` and ``centers_history``.
    """
    _, axes = plt.subplots(1, 1, figsize=(8, 8))
    palette = plt.get_cmap("tab10")
    for idx, members in enumerate(cluster_history[t]):
        if not members:
            print(f"EMTPY CLUSTER {idx}")
        else:
            # Stacking the members gives an (n, 2) array, including for n == 1.
            pts = np.array(members)
            axes.scatter(pts[:, 0], pts[:, 1], color=palette(idx))

        cx, cy = centers_history[t][idx]
        axes.plot([cx], [cy], "b*", markersize=15)
Centers converged at round 3

Question 3

3) Using the 1-dimensional dataset {1, 2.7, 3.3, 4.6, 6.0, 7.5, 9.1, 10.8, 12.6, 16.5}, agglomerative hierarchical clustering and Euclidean distance, show the dendrogram for each of single linkage, average linkage and complete linkage criteria!

from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform


# Question 3 data: each 1-D observation is wrapped in its own list so that
# pdist receives one row per observation.
inputs = [[value] for value in (1, 2.7, 3.3, 4.6, 6.0, 7.5, 9.1, 10.8, 12.6, 16.5)]

# Pairwise Euclidean distances, kept in the flat form returned by pdist
# (not expanded into a square matrix with squareform).
d = pdist(inputs, metric='euclidean')

# Notebook display: number of observations and number of pairwise distances.
len(inputs), len(d)
(10, 45)
# Question 3 asks for a dendrogram under each of single, average and complete
# linkage, so build and draw all three from the same distance vector.
linkages = {method: linkage(d, method) for method in ('single', 'average', 'complete')}
hc1 = linkages['average']  # kept under its original name

for method, merge_tree in linkages.items():
    dendrogram(merge_tree, labels=inputs)
    plt.title(f"{method} linkage")  # tell the three figures apart
    plt.show()
[[1], [2.7], [3.3], [4.6], [6.0], [7.5], [9.1], [10.8], [12.6], [16.5]]

Question 4 - 8

Link to written note:

Practical

Use Fisher’s famous Iris dataset.

  • It contains 3 classes (species), 150 instances, and 4 numeric attributes.
  • Take the Iris data set and subset the first four columns (i.e., discard the 5th column containing the species classification).
  • Read the scikit-learn documentation for k-means. Use k-means to cluster the unlabelled data into three clusters and extract the cluster assignments.
import pandas as pd
from sklearn.cluster import KMeans

# Load the Iris data and keep only the four numeric measurement columns for
# clustering; the species column is held back for evaluation.
irisall = pd.read_csv('./Workshop-materials-2022/iris.csv')
iris = irisall[['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']]

kmeans = KMeans(n_clusters=3, random_state=0).fit(iris)
labels = kmeans.labels_

# Attach the predicted cluster ids to the full table (species included).
iris_label = irisall.join(pd.DataFrame({'km_labels': labels}))

# Show how the members of each species are spread over the cluster ids.
for species in ('setosa', 'versicolor', 'virginica'):
    print(f'Predicted labels for {species}')
    print(iris_label.loc[iris_label.Species == species, 'km_labels'].value_counts())
    print("")

# K-means cluster ids are arbitrary and can change with the library version
# or the seed, so match each species to the cluster holding most of its
# members instead of hard-coding the ids.
species_by_cluster = pd.crosstab(iris_label['Species'], iris_label['km_labels'])
matched_cluster = species_by_cluster.idxmax(axis=1)

# Matches per iris type: members that fall in the species' majority cluster.
TP_setosa = species_by_cluster.loc['setosa', matched_cluster['setosa']]
TP_versicolor = species_by_cluster.loc['versicolor', matched_cluster['versicolor']]
TP_virginica = species_by_cluster.loc['virginica', matched_cluster['virginica']]

# Total matches.
matches = TP_setosa + TP_versicolor + TP_virginica

print("Total number of matches between cluster label and class label=",matches)
Predicted labels for setosa
1    50
Name: km_labels, dtype: int64

Predicted labels for versicolor
0    48
2     2
Name: km_labels, dtype: int64

Predicted labels for virginica
2    36
0    14
Name: km_labels, dtype: int64

Total number of matches between cluster label and class label= 134

Plot the first two columns and colour by the assigned clusters. Are the cluster assignments well separated?

import matplotlib.pyplot as plt


# Sepal length vs. sepal width, coloured by k-means cluster.
unique_labels = sorted(set(labels))  # deterministic order for the legend

colors = ['red', 'blue', 'green']
# K-means cluster ids are arbitrary, so name each cluster after the species
# that dominates it instead of relying on a fixed id -> species order.
majority_species = pd.crosstab(iris_label['km_labels'], iris_label['Species']).idxmax(axis=1)
IrisName = [majority_species[l] for l in unique_labels]

for l in unique_labels:
    in_cluster = iris_label['km_labels'] == l
    xi = iris_label.loc[in_cluster, 'Sepal.Length']
    yi = iris_label.loc[in_cluster, 'Sepal.Width']

    plt.scatter(xi, yi, c=colors[l], label=IrisName[l])

plt.legend()

plt.show()
Plot columns 3 and 4 coloured by cluster. Are these cluster assignments well separated?

Does the clustering coincide with the species? Why would this be the case?

import matplotlib.pyplot as plt


# Petal length vs. petal width, coloured by k-means cluster.
unique_labels = sorted(set(labels))  # deterministic order for the legend

colors = ['red', 'blue', 'green']
# K-means cluster ids are arbitrary, so name each cluster after the species
# that dominates it instead of relying on a fixed id -> species order.
majority_species = pd.crosstab(iris_label['km_labels'], iris_label['Species']).idxmax(axis=1)
IrisName = [majority_species[l] for l in unique_labels]

for l in unique_labels:
    in_cluster = iris_label['km_labels'] == l
    xi = iris_label.loc[in_cluster, 'Petal.Length']
    yi = iris_label.loc[in_cluster, 'Petal.Width']

    plt.scatter(xi, yi, c=colors[l], label=IrisName[l])

plt.legend()
plt.show()

Coloured by actual class label

import matplotlib.pyplot as plt


# Sepal length vs. sepal width, coloured by the true species.
unique_labels = iris_label.Species.unique()

colors = ['red', 'blue', 'green']
# enumerate supplies the colour index, replacing a hand-maintained counter.
for i, l in enumerate(unique_labels):
    of_species = iris_label['Species'] == l
    xi = iris_label.loc[of_species, 'Sepal.Length']
    yi = iris_label.loc[of_species, 'Sepal.Width']

    plt.scatter(xi, yi, c=colors[i], label=l)

plt.legend()

plt.show()
import matplotlib.pyplot as plt


# Petal length vs. petal width, coloured by the true species.
unique_labels = iris_label.Species.unique()

colors = ['red', 'blue', 'green']
# enumerate supplies the colour index, replacing a hand-maintained counter.
for i, l in enumerate(unique_labels):
    of_species = iris_label['Species'] == l
    xi = iris_label.loc[of_species, 'Petal.Length']
    yi = iris_label.loc[of_species, 'Petal.Width']

    plt.scatter(xi, yi, c=colors[i], label=l)

plt.legend()

plt.show()