Week 6 notebook
unsupervised learning, clustering
import random
# Shuffle a small list in place and display the result.
# A dedicated, seeded generator makes the displayed permutation identical on
# every "Restart & Run All" without touching the global `random` state.
_shuffle_rng = random.Random(0)
x = [1, 2, 3, 4, 9, 10]
_shuffle_rng.shuffle(x)
x
import numpy as np
from matplotlib import pyplot as plt
from ipywidgets import interact
Overview:
- Supervised vs unsupervised. Meta-picture still holds. What is "generalisation"?
- Mind map. Clustering. EM. KMeans. Hierarchical. Normal mixture.
def distance(x, y):
    """Return the squared Euclidean distance between two points.

    No square root is taken: the result is ``sum((x - y)**2)``. Because the
    square root is monotonic, this ranks candidates exactly like the true
    Euclidean distance, which is all that a nearest-center ``argmin`` needs.

    Parameters
    ----------
    x, y : scalar or array-like
        Points of matching (or broadcastable) shape. Plain Python lists and
        tuples are accepted in addition to NumPy arrays and scalars.

    Returns
    -------
    numpy scalar
        Sum of the squared coordinate differences.
    """
    # np.asarray lets list/tuple inputs work; `list - list` would raise TypeError.
    diff = np.asarray(x) - np.asarray(y)
    return np.sum(diff**2)
# --- K-means on a 1-D toy data set; one row of the figure is drawn per round ---
D = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
K = 2
# init_seed = [1, 2]  # alternative starting centers to experiment with
init_seed = [2, 9]
# centers_history[r] holds the centers that round r assigns points to;
# the centers computed in round r are appended as entry r + 1.
centers_history = [init_seed]
clusters = None

fig, ax = plt.subplots(1, 1, figsize=(8, 8))
cmap = plt.get_cmap("tab10")

num_rounds = 10
for r in range(num_rounds):
    current_centers = centers_history[-1]
    clusters = [[] for _ in range(K)]

    # Assignment step: every point joins the cluster of its nearest center.
    for x in D:
        best_cluster_index = np.argmin([distance(x, center) for center in current_centers])
        clusters[best_cluster_index].append(x)
    print(clusters)

    # Update step: each center moves to the mean of its members.
    # An empty cluster keeps its previous center; np.mean([]) would give NaN,
    # and a NaN center can never satisfy the convergence test below.
    new_centers = [
        np.mean(c) if len(c) > 0 else current_centers[i]
        for i, c in enumerate(clusters)
    ]
    centers_history.append(new_centers)

    # Plot this round at height r: members as dots, the updated center as a star.
    for i, c in enumerate(clusters):
        ax.scatter(c, np.full(len(c), r), color=cmap(i))
        ax.plot([new_centers[i]], [r], "r*", markersize=15, alpha=0.5)

    # Stopping condition: tolerance-based comparison instead of exact float equality.
    if np.allclose(new_centers, current_centers):
        print("Centers converged.")
        break

ax.set_xlabel("data value")
ax.set_ylabel("round")
ax.set_title("1-D k-means: assignments per round (stars = updated centers)")
# Scratch check of a single assignment step against the initial seeds.
# One bucket per seed: a hard-coded bucket count that exceeds the number of
# seeds leaves buckets that argmin can never select.
clusters = [[] for _ in range(len(init_seed))]
for x in D:
    nearest_index = np.argmin([distance(x, center) for center in init_seed])
    clusters[nearest_index].append(x)

# Mean of the first cluster; axis=0 averages over the points (rows).
np.array(clusters[0]).mean(axis=0)
# --- K-means on synthetic 2-D data drawn around four Gaussian means ---
# Seeded generator so the sampled data and the initial centers are the same
# on every "Restart & Run All".
rng = np.random.default_rng(0)

true_means = [(1, 1), (1, 8), (8, 8), (2, 3)]
num_data_per_cluster = 30
D = np.concatenate(
    [rng.multivariate_normal(mean=m, cov=np.eye(2), size=num_data_per_cluster) for m in true_means]
)

K = 4
# Initial centers: K distinct data points chosen at random.
rand_indices = rng.choice(np.arange(len(D)), size=K, replace=False)
init_seed = D[rand_indices, :]
# centers_history[t] are the centers that produced cluster_history[t];
# centers_history ends up one entry longer than cluster_history.
centers_history = [init_seed]
cluster_history = []
tol = 0.001  # total squared center movement below which we stop

num_rounds = 10
for r in range(num_rounds):
    current_centers = centers_history[-1]
    clusters = [[] for _ in range(K)]

    # Assignment step: every point joins the cluster of its nearest center.
    for x in D:
        best_cluster_index = np.argmin([distance(x, center) for center in current_centers])
        clusters[best_cluster_index].append(x)
    cluster_history.append(clusters)

    # Update step: each center moves to the mean of its members.
    # An empty cluster keeps its previous center; the mean of an empty array is
    # a NaN scalar, which would poison later argmin calls and the convergence test.
    new_centers = [
        np.array(c).mean(axis=0) if len(c) > 0 else np.asarray(current_centers[i])
        for i, c in enumerate(clusters)
    ]
    centers_history.append(new_centers)

    # Stopping condition: summed squared movement of all centers.
    total_shift = np.sum([np.sum((a - b)**2) for a, b in zip(new_centers, current_centers)])
    if total_shift < tol:
        print(f"Centers converged at round {r}")
        break
@interact(t=(0, len(cluster_history)-1))
def plot(t=-1):
    """Scatter the cluster assignment stored at ``cluster_history[t]``.

    Each cluster is drawn in its own colour, and the matching entry of
    ``centers_history[t]`` is drawn as a blue star.

    Parameters
    ----------
    t : int
        Index into ``cluster_history`` / ``centers_history``. When the function
        is called directly, the default ``-1`` selects the last recorded round
        via ordinary negative indexing.
    """
    fig, ax = plt.subplots(1, 1, figsize=(8, 8))
    cmap = plt.get_cmap("tab10")
    for i, c in enumerate(cluster_history[t]):
        if len(c) > 0:
            # One code path for any non-empty cluster: a single point simply
            # becomes a (1, 2) array.
            points = np.array(c)
            ax.scatter(points[:, 0], points[:, 1], color=cmap(i))
        else:
            print(f"EMTPY CLUSTER {i}")

        # A center can be a NaN scalar if it was computed from an empty
        # cluster; unpacking it into (x, y) would raise, so draw the star only
        # for a finite 2-D center.
        center = np.ravel(centers_history[t][i])
        if center.shape == (2,) and np.all(np.isfinite(center)):
            ax.plot([center[0]], [center[1]], "b*", markersize=15)

    ax.set_xlabel("feature 1")
    ax.set_ylabel("feature 2")
    ax.set_title(f"K-means cluster assignments (history index {t})")
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import pdist, squareform
# Ten one-dimensional observations; each value is wrapped in its own list so
# that every observation is a row with a single feature.
inputs = [[value] for value in (1, 2.7, 3.3, 4.6, 6.0, 7.5, 9.1, 10.8, 12.6, 16.5)]

# Pairwise Euclidean distances as a flat (condensed) vector.
# squareform(d) would expand it into the full square matrix if needed.
d = pdist(inputs, metric='euclidean')
len(inputs), len(d)

# Agglomerative clustering with average linkage, shown as a dendrogram.
hc1 = linkage(d, method='average')
dendrogram(hc1, labels=inputs)
plt.show()

# To compare linkage criteria, repeat the three lines above with another
# method, e.g. linkage(d, 'complete').
Practical
Use Fisher’s famous Iris dataset.
- It contains 3 classes (species), 150 instances, and 4 numeric attributes.
- Take the Iris data set and subset the first four columns (i.e., discard the 5th column containing the species classification).
- Read the scikit-learn documentation for k-means; use k-means to cluster the unlabelled data into three clusters and extract the cluster assignments.
import pandas as pd
from sklearn.cluster import KMeans
# --- Practical: k-means on the Iris measurements, compared with the true species ---
irisall = pd.read_csv('./Workshop-materials-2022/iris.csv')
# Keep only the four numeric measurements; the species column is withheld from k-means.
iris = irisall[['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width']]

# n_init is given explicitly because scikit-learn's default for it differs
# between releases; 10 restarts is the long-standing behaviour.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(iris)
labels = kmeans.labels_

# Attach the predicted cluster id to every row of the full data set.
iris_label = irisall.assign(km_labels=labels)

# K-means cluster ids are arbitrary, so the species <-> cluster correspondence
# is derived from the data (majority vote) instead of being hard-coded.
# NOTE: a majority vote does not force a one-to-one matching; if two species
# share the same dominant cluster, check the printed counts by eye.
species_to_cluster = {}
true_positives = {}
for species in ('setosa', 'versicolor', 'virginica'):
    counts = iris_label.loc[iris_label.Species == species, 'km_labels'].value_counts()
    print(f'Predicted labels for {species}')
    print(counts)
    print("")
    if counts.empty:
        # Species name not present in the file: nothing can match.
        species_to_cluster[species] = None
        true_positives[species] = 0
    else:
        species_to_cluster[species] = counts.idxmax()
        true_positives[species] = int(counts.max())

# Matches per iris type: rows whose cluster is the dominant cluster of their species.
TP_setosa = true_positives['setosa']
TP_versicolor = true_positives['versicolor']
TP_virginica = true_positives['virginica']

# Total matches; compare with the 150 rows of the Iris data set.
matches = TP_setosa + TP_versicolor + TP_virginica
print("Total number of matches between cluster label and class label=",matches)
import matplotlib.pyplot as plt
# Sepal length vs. width, coloured by k-means cluster.
unique_labels = sorted(set(labels))
colors = ['red', 'blue', 'green']
# Name each cluster after the most common species among its members.
# Cluster ids are arbitrary, so a fixed id -> species list can silently
# mislabel the legend when the ids come out in a different order.
IrisName = {
    l: iris_label.loc[iris_label['km_labels'] == l, 'Species'].mode().iloc[0]
    for l in unique_labels
}
for l in unique_labels:
    in_cluster = iris_label['km_labels'] == l
    xi = iris_label.loc[in_cluster, 'Sepal.Length']
    yi = iris_label.loc[in_cluster, 'Sepal.Width']
    plt.scatter(xi, yi, c=colors[l], label=IrisName[l])
plt.xlabel('Sepal.Length')
plt.ylabel('Sepal.Width')
plt.title('K-means clusters (named by majority species)')
plt.legend()
plt.show()
import matplotlib.pyplot as plt
# Petal length vs. width, coloured by k-means cluster.
unique_labels = sorted(set(labels))
colors = ['red', 'blue', 'green']
# Name each cluster after the most common species among its members.
# Cluster ids are arbitrary, so a fixed id -> species list can silently
# mislabel the legend when the ids come out in a different order.
IrisName = {
    l: iris_label.loc[iris_label['km_labels'] == l, 'Species'].mode().iloc[0]
    for l in unique_labels
}
for l in unique_labels:
    in_cluster = iris_label['km_labels'] == l
    xi = iris_label.loc[in_cluster, 'Petal.Length']
    yi = iris_label.loc[in_cluster, 'Petal.Width']
    plt.scatter(xi, yi, c=colors[l], label=IrisName[l])
plt.xlabel('Petal.Length')
plt.ylabel('Petal.Width')
plt.title('K-means clusters (named by majority species)')
plt.legend()
plt.show()
import matplotlib.pyplot as plt
# Sepal length vs. width, coloured by the true species (for comparison with
# the k-means colouring).
unique_labels = iris_label.Species.unique()
colors = ['red', 'blue', 'green']
# enumerate supplies the colour index directly, replacing a hand-maintained counter.
for i, l in enumerate(unique_labels):
    is_species = iris_label['Species'] == l
    xi = iris_label.loc[is_species, 'Sepal.Length']
    yi = iris_label.loc[is_species, 'Sepal.Width']
    plt.scatter(xi, yi, c=colors[i], label=l)
plt.xlabel('Sepal.Length')
plt.ylabel('Sepal.Width')
plt.title('True species')
plt.legend()
plt.show()
import matplotlib.pyplot as plt
# Petal length vs. width, coloured by the true species (for comparison with
# the k-means colouring).
unique_labels = iris_label.Species.unique()
colors = ['red', 'blue', 'green']
# enumerate supplies the colour index directly, replacing a hand-maintained counter.
for i, l in enumerate(unique_labels):
    is_species = iris_label['Species'] == l
    xi = iris_label.loc[is_species, 'Petal.Length']
    yi = iris_label.loc[is_species, 'Petal.Width']
    plt.scatter(xi, yi, c=colors[i], label=l)
plt.xlabel('Petal.Length')
plt.ylabel('Petal.Width')
plt.title('True species')
plt.legend()
plt.show()