From cf8fd3c3b447cd5c01b6b96ae128ac5a52a1b947 Mon Sep 17 00:00:00 2001
From: mvargas33 <maximiliano.vargas@ug.uchile.cl>
Date: Wed, 17 Nov 2021 18:11:49 +0100
Subject: [PATCH] Add model selection example with LFW dataset and KNN task

---
 examples/model_selection.py | 115 ++++++++++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 examples/model_selection.py

diff --git a/examples/model_selection.py b/examples/model_selection.py
new file mode 100644
index 00000000..b3063fa1
--- /dev/null
+++ b/examples/model_selection.py
@@ -0,0 +1,115 @@
+from metric_learn import LMNN, NCA, LFDA, MLKR, RCA_Supervised, ITML_Supervised, LSML_Supervised, MMC_Supervised, SDML_Supervised, SCML_Supervised
+from sklearn.utils import check_random_state
+from sklearn.model_selection import train_test_split
+import matplotlib.pyplot as plt
+from sklearn.datasets import fetch_lfw_people
+from time import time
+from sklearn.decomposition import PCA
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.metrics import precision_recall_fscore_support
+from sklearn.pipeline import Pipeline
+import matplotlib.pyplot as plt
+
+SEED = 33
+RNG = check_random_state(SEED)
+
+# #############################################################################
+# Load the dataset
+
+lfw_people = fetch_lfw_people(min_faces_per_person=40, resize=0.5)
+
+# introspect the images arrays to find the shapes (for plotting)
+n_samples, h, w = lfw_people.images.shape
+
+# for machine learning we use the 2 data directly (as relative pixel
+# positions info is ignored by this model)
+X = lfw_people.data
+n_features = X.shape[1]
+
+# the label to predict is the id of the person
+y = lfw_people.target
+target_names = lfw_people.target_names
+n_classes = target_names.shape[0]
+
+print("Total dataset size:")
+print("n_samples: %d" % n_samples)
+print("n_features: %d" % n_features)
+print("n_classes: %d" % n_classes)
+
+# #############################################################################
+# Split into a training set and a test set
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.25, random_state=42
+)
+
+# #############################################################################
+# Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
+# dataset): unsupervised feature extraction / dimensionality reduction
+n_components = 150
+
+print("Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]))
+t0 = time()
+pca = PCA(n_components=n_components, svd_solver="randomized", whiten=True).fit(X_train)
+print("done in %0.3fs" % (time() - t0))
+
+eigenfaces = pca.components_.reshape((n_components, h, w))
+
+print("Projecting the input data on the eigenfaces orthonormal basis")
+t0 = time()
+X_train_pca = pca.transform(X_train)
+X_test_pca = pca.transform(X_test)
+print("done in %0.3fs" % (time() - t0))
+
+# #############################################################################
+# Get a baseline score without using metric learning
+clf = KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree')
+clf.fit(X_train_pca, y_train)
+y_pred = clf.predict(X_test_pca)
+bs = precision_recall_fscore_support(y_test, y_pred, average='weighted', zero_division=0)
+print("{:<15} {:<10} {:<10} {:<10}".format('','Precision','Recall','F-beta'))
+print("{:<15} {:<10} {:<10} {:<10}".format('Baseline', str(round(bs[0], 2)), str(round(bs[1], 2)), str(round(bs[2], 2))))
+
+# #############################################################################
+# For each model, we train and learn a transformation, then we evaluate on KNN
+# Consider tunning the params of each metric learner beforehand
+
+models = [LMNN(random_state=RNG),
+          NCA(random_state=RNG),
+          LFDA(),
+          MLKR(random_state=RNG),
+          RCA_Supervised(random_state=RNG),
+          ITML_Supervised(random_state=RNG),
+          LSML_Supervised(random_state=RNG),
+          MMC_Supervised(random_state=RNG),
+          SDML_Supervised(random_state=RNG),
+          # SCML_Supervised(random_state=RNG)
+          ]
+
+pipes = []
+
+for model in models:
+  pipes.append(Pipeline([('metric-learner', model), ('knn', KNeighborsClassifier(n_neighbors=5, algorithm='kd_tree'))]))
+
+scores = []
+
+for index, pipe in enumerate(pipes):
+  name = models[index].__class__.__name__
+  pipe.fit(X_train_pca, y_train)
+  y_pred = pipe.predict(X_test_pca)
+  s = precision_recall_fscore_support(y_test, y_pred, average='weighted', zero_division=0)
+  scores.append(s)
+  print("{:<15} {:<10} {:<10} {:<10}".format(name, str(round(s[0], 2)), str(round(s[1], 2)), str(round(s[2], 2))))
+
+# #############################################################################
+# Plot precision, Recall, F-beta for each metric learner compared to the baseline
+x = [ m.__class__.__name__ for m in models]
+f1 = [a[2] for a in scores]
+
+plt.axhline(y=bs[2], color='k', linestyle='-') # Baseline
+plt.axhline(y=max(f1), color='r', linestyle='dotted') # Max score
+
+plt.bar(x, f1)
+plt.ylabel('F-beta score')
+plt.xlabel('Metric learning models');
+plt.title('F-beta score for different metric learners')
+plt.show()
\ No newline at end of file