Week 5 notebook
AdaBoost, gradient boosting, XGBoost
- Written notes
- Sampling with replacement for a given probability distribution
- AdaBoost walkthrough
- Scikit-learn AdaBoost implementation
- Bagging vs Boosting
- XGBoost demo walkthrough
import numpy as np
import sklearn
import matplotlib.pyplot as plt
def sample_one(probabilities):
    """
    Generate a single sample from the discrete distribution specified by
    `probabilities = [p1, p2, ..., pN]`.
    Returns an index `i` between 0 and N-1 inclusive.
    """
    assert np.abs(np.sum(probabilities) - 1) < 0.0001  # check this is an honest probability distribution
    N = len(probabilities)  # size of the sample space
    cumulative_density = np.cumsum(probabilities)  # compute the CDF
    cumulative_density = np.concatenate([[0], cumulative_density])  # prepend 0 to the list
    random_seed = np.random.rand()  # a single number drawn uniformly from [0, 1)
    # Look for the index i such that CDF[i] < random_seed <= CDF[i + 1]
    for i in range(N):
        if cumulative_density[i] < random_seed <= cumulative_density[i + 1]:
            return i  # break and return when found
def sample(n, items, probabilities):
    """For pedagogy only. Inefficient implementation."""
    samples = []
    for _ in range(n):
        index = sample_one(probabilities)
        samples.append(items[index])
    return samples
items = ['a', 'b', 'c']
probabilities = [2/3, 1/6, 1/6]
num_samples = 10000
samples = sample(num_samples, items, probabilities)
print(f"Empirical probabilities:")
a, b = np.unique(samples, return_counts=True)
for a, b in zip(*np.unique(samples, return_counts=True)):
print(f"Probability of oberving {a} = {b / num_samples}")
Generating the dataset.
from sklearn.datasets import make_circles
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import DecisionBoundaryDisplay
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
n = 1000
sigma = 0.3
factor = 0.1
X, y = make_circles(n_samples=n, noise=sigma, factor=factor)
y = (2 * y - 1).astype(int)  # relabel classes from {0, 1} to {-1, +1}
plt.plot(X[y == 1][:, 0], X[y == 1][:, 1], 'b+')
plt.plot(X[y == -1][:, 0], X[y == -1][:, 1], 'r+')
Using one weak learner.
clf = DecisionTreeClassifier(max_depth=1)
# clf = LogisticRegression(C=1e5)
clf.fit(X, y)
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
DecisionBoundaryDisplay.from_estimator(clf, X, alpha=0.2, response_method="predict", ax=ax)
ax.plot(X[y == 1][:, 0], X[y == 1][:, 1], 'b+')
ax.plot(X[y == -1][:, 0], X[y == -1][:, 1], 'r+')
Boosting to train a few learners.
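A quick note on the update the loop below performs (the standard AdaBoost recipe, with the weak learner trained on a resample drawn according to the weights): at iteration $t$, with normalized weights $w_i$, the weighted error of the weak learner $h_t$ is

$$\epsilon_t = \sum_{i:\, h_t(x_i) \neq y_i} w_i, \qquad \alpha_t = \tfrac{1}{2}\ln\frac{1-\epsilon_t}{\epsilon_t},$$

and each weight is then updated as $w_i \leftarrow w_i\, e^{-\alpha_t y_i h_t(x_i)}$ before renormalizing.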
classifiers = []
weights = np.ones(n) / n  # start with uniform weights over the training points
confidences = []
loss_fn = lambda f, x, y: np.exp(-y * f(x))  # exponential loss (for reference; not used directly below)
num_iter = 54
chosen = []
weights_rec = []
for t in range(num_iter):
    weights = weights / np.sum(weights)  # normalise weights to a probability distribution
    weights_rec.append(weights)
    # clf = SVC(kernel="linear", C=1.0)
    clf = DecisionTreeClassifier(max_depth=1)
    # clf = LogisticRegression(C=1e5)
    indices = sample(n, list(range(len(y))), weights)  # resample the training set according to the weights
    X_t = X[indices]
    y_t = y[indices]
    chosen.append(indices)
    clf.fit(X_t, y_t)
    classifiers.append(clf)
    pred = clf.predict(X)
    error = np.sum([weights[i] for i in range(n) if pred[i] != y[i]])  # weighted training error
    # error = np.clip(np.sum(pred != y), a_min=0.01, a_max=0.99)
    conf = 1/2 * np.log((1 - error) / error)  # confidence of this weak learner
    confidences.append(conf)
    weights = weights * np.exp(-(pred * y) * conf)  # upweight misclassified points, downweight correct ones
What those individual learners did.
N = min(15, num_iter)
ncol = 3
nrow = (N + ncol - 1) // ncol  # ceiling division: enough rows for N panels
fig, axes = plt.subplots(nrow, ncol, figsize=(5 * ncol, 5 * nrow))
axes = np.ravel(axes)
for i in range(N):
    ax = axes[i]
    clf = classifiers[i]
    indices = chosen[i]
    weights = 5 ** (np.array(weights_rec[i][indices]) * n)  # marker size grows with the point's AdaBoost weight
    # print(weights)
    ax.scatter(
        X[indices][y[indices] == 1][:, 0],
        X[indices][y[indices] == 1][:, 1],
        color="blue",
        marker="*",
        s=weights[y[indices] == 1]
    )
    ax.scatter(
        X[indices][y[indices] == -1][:, 0],
        X[indices][y[indices] == -1][:, 1],
        color="red",
        marker="*",
        s=weights[y[indices] == -1]
    )
    DecisionBoundaryDisplay.from_estimator(clf, X, alpha=0.2, response_method="predict", ax=ax)
Weighted majority voting of weak learners.
Question: Write down the decision rule.
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
ax.scatter(
    X[y == 1][:, 0],
    X[y == 1][:, 1],
    # s=10 ** (weights[y == 1] * 100),
    marker="*",
    color="blue"
)
ax.scatter(
    X[y == -1][:, 0],
    X[y == -1][:, 1],
    # s=10 ** (weights[y == -1] * 100),
    marker="*",
    color="red"
)
a = np.linspace(-2, 2, num=50)
b = np.linspace(-2, 2, num=50)
ps = np.array([(x1, x2) for x1 in a for x2 in b])
preds = np.array([clf.predict(ps) for clf in classifiers]).T  # each weak learner's ±1 prediction on the grid
preds = preds * np.array(confidences)  # scale each learner's vote by its confidence
pos = np.sum(preds, axis=1)
neg = np.sum(-preds, axis=1)
Z = 2 * (pos > neg) - 1  # weighted majority vote: sign of the confidence-weighted sum
Z = Z.reshape(50, 50)
XX, YY = np.meshgrid(a, b)
ax.contourf(XX, YY, Z, alpha=0.2)
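The cell above implements the weighted majority vote used by AdaBoost:

$$H(x) = \operatorname{sign}\Big(\sum_{t=1}^{T} \alpha_t\, h_t(x)\Big),$$

where $h_t$ are the weak learners and $\alpha_t$ the confidences computed in the boosting loop.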
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(
    # base_estimator=SVC(kernel="linear"),
    algorithm="SAMME",
    n_estimators=54
)
clf.fit(X, y)
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
DecisionBoundaryDisplay.from_estimator(clf, X, alpha=0.2, response_method="predict", ax=ax)
ax.plot(X[y == 1][:, 0], X[y == 1][:, 1], 'b+')
ax.plot(X[y == -1][:, 0], X[y == -1][:, 1], 'r+')
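As a quick sanity check (a sketch, assuming the fitted clf above), AdaBoostClassifier exposes staged_predict, which yields predictions after each boosting round and shows how training accuracy improves as stumps are added.
from sklearn.metrics import accuracy_score

staged_acc = [accuracy_score(y, y_pred) for y_pred in clf.staged_predict(X)]
plt.figure()
plt.plot(range(1, len(staged_acc) + 1), staged_acc)
plt.xlabel("number of weak learners")
plt.ylabel("training accuracy")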
Bagging vs Boosting
| | Bagging | Boosting |
|---|---|---|
| Sampling | Uniformly at random. | Weighted based on performance from the last iteration. |
| Parallelisable? | Learners can be trained in parallel. | Training each learner depends on the previous iteration. |
| Classification rule | Majority vote. | Weighted majority vote. |
| Bias or variance | Focuses on reducing variance. | Focuses on reducing bias. |
| Prone to overfitting? | Relatively safe from overfitting. | Can overfit. |
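A minimal sketch contrasting the two in scikit-learn, assuming the circles data X, y from above (note: in scikit-learn versions before 1.2 the estimator argument is called base_estimator):
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

stump = DecisionTreeClassifier(max_depth=1)
# Bagging: stumps trained independently on bootstrap samples, combined by majority vote
bagging = BaggingClassifier(estimator=stump, n_estimators=54)
# Boosting: stumps trained sequentially on reweighted data, combined by weighted majority vote
boosting = AdaBoostClassifier(estimator=stump, algorithm="SAMME", n_estimators=54)
print("Bagging accuracy:", cross_val_score(bagging, X, y, cv=5).mean())
print("Boosting accuracy:", cross_val_score(boosting, X, y, cv=5).mean())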
# Noisy piecewise-constant (step) target: a natural fit for shallow regression trees
x = np.linspace(0, 10, num=20)
y = np.ones_like(x)
y[(x > 3) & (x < 6)] *= 5
y[(x > 6)] *= 10
y += np.random.randn(len(x)) * 0.8
x = x.reshape(-1, 1)
plt.plot(x, y, "b*")
from xgboost import XGBRegressor, XGBClassifier
xgboostmodel = XGBRegressor(
    n_estimators=10,
    eta=1,
    # min_child_weight=1,
    max_depth=1,
    # max_leaves=2,
    # reg_alpha=10,
    # reg_lambda=100
)
xgboostmodel.fit(x, y)
plt.plot(x, y, "b*")
plt.plot(x, xgboostmodel.predict(x), "b--")  # fitted ensemble prediction
plt.plot(x, y - xgboostmodel.predict(x), "b*")  # residuals y - prediction
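For comparison with XGBoost, a minimal hand-rolled gradient boosting sketch for squared loss: each new depth-1 tree is fitted to the residuals of the current ensemble, assuming the x, y defined above and a learning rate of 1.
from sklearn.tree import DecisionTreeRegressor

def gradient_boost(x, y, n_rounds=10, lr=1.0):
    """Fit a sum of depth-1 regression trees, each trained on the current residuals."""
    prediction = np.zeros_like(y)
    trees = []
    for _ in range(n_rounds):
        residuals = y - prediction            # negative gradient of the squared loss
        tree = DecisionTreeRegressor(max_depth=1)
        tree.fit(x, residuals)                # weak learner fits the residuals
        prediction += lr * tree.predict(x)    # update the ensemble prediction
        trees.append(tree)
    return trees, prediction

trees, pred = gradient_boost(x, y)
plt.plot(x, y, "b*")
plt.plot(x, pred, "g--")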