
Fitting distance metric learning algorithms

>>> import numpy as np
>>> from sklearn.datasets import load_iris

>>> # Loading DML Algorithm
>>> from dml import NCA

>>> # Loading dataset
>>> iris = load_iris()
>>> X = iris['data']
>>> y = iris['target']

>>> # DML construction
>>> nca = NCA()

>>> # Fitting algorithm
>>> nca.fit(X,y)

>>> # We can look at the algorithm metadata after fitting it
>>> meta = nca.metadata()
>>> meta
{'final_expectance': 0.95771240234375,
 'initial_expectance': 0.8380491129557291,
 'num_iters': 3}

>>> # We can see the metric the algorithm has learned.
>>> # This metric is the PSD matrix M that defines how the (squared) distance is measured:
>>> # d(x,y)^2 = (x-y).T.dot(M).dot(x-y)
>>> M = nca.metric()
>>> M
array([[ 1.19098678,  0.51293714, -2.15818151, -2.01464351],
       [ 0.51293714,  1.58128238, -2.14573777, -2.10714773],
       [-2.15818151, -2.14573777,  6.46881853,  5.86280474],
       [-2.01464351, -2.10714773,  5.86280474,  6.83271473]])

>>> # Equivalently, we can see the learned linear map.
>>> # The distance coincides with the euclidean distance after applying the linear map.
>>> L = nca.transformer()
>>> L
array([[ 0.77961001, -0.01911998, -0.35862791, -0.23992861],
       [-0.04442949,  1.00747788, -0.29936559, -0.25812144],
       [-0.60744415, -0.57288453,  2.16095076,  1.35212555],
       [-0.46068713, -0.48755353,  1.25732916,  2.20913531]])

>>> # Finally, we can obtain the transformed data ...
>>> Lx = nca.transform()
>>> Lx[:5,:]
array([[ 3.35902632,  2.8288461 , -1.80730485, -1.85385382],
       [ 3.21266431,  2.33399305, -1.39937375, -1.51793964],
       [ 3.0887811 ,  2.57431109, -1.60855691, -1.64904583],
       [ 2.94100652,  2.41813313, -1.05833389, -1.30275593],
       [ 3.27915332,  2.93403684, -1.80384889, -1.85654046]])

>>> # ... or transform new data.
>>> X_ = np.array([[1.0,0.0,0.0,0.0],[1.0,1.0,0.0,0.0],[1.0,1.0,1.0,0.0]])
>>> Lx_ = nca.transform(X_)
>>> Lx_
array([[ 0.77961001, -0.04442949, -0.60744415, -0.46068713],
       [ 0.76049003,  0.9630484 , -1.18032868, -0.94824066],
       [ 0.40186212,  0.66368281,  0.98062208,  0.3090885 ]])

Similarity learning classifier extensions for Scikit-learn

>>> import numpy as np
>>> from sklearn.datasets import load_iris

>>> from dml import NCA, LDA, kNN, MultiDML_kNN, NCMC_Classifier

>>> # Loading dataset
>>> iris = load_iris()
>>> X = iris['data']
>>> y = iris['target']

>>> # Initializing transformer and predictor
>>> nca = NCA()
>>> knn = kNN(n_neighbors=7,dml_algorithm=nca)

>>> # Fitting transformer and predictor
>>> nca.fit(X,y)
>>> knn.fit(X,y)

>>> # Now we can predict the labels for k-NN with the learned distance.
>>> knn.predict() # Also we can use predict(X_) for other datasets.
>>>               # When using the training set predictions are made
>>>               # leaving the sample to predict out.
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  2.,  1.,  2.,  1.,  1.,  1.,  1.,  2.,
        1.,  1.,  1.,  1.,  1.,  2.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.])

>>> knn.predict_proba()[-10:,:] # Again it can be used for other datasets.
array([[ 0.        ,  0.        ,  1.        ],
       [ 0.        ,  0.        ,  1.        ],
       [ 0.        ,  0.        ,  1.        ],
       [ 0.        ,  0.        ,  1.        ],
       [ 0.        ,  0.        ,  1.        ],
       [ 0.        ,  0.        ,  1.        ],
       [ 0.        ,  0.14285714,  0.85714286],
       [ 0.        ,  0.        ,  1.        ],
       [ 0.        ,  0.        ,  1.        ],
       [ 0.        ,  0.14285714,  0.85714286]])

>>> knn.score() # The classification score (score(X_,y_) for other datasets).

>>> # We can also compare with the euclidean distance k-NN
>>> knn.score_orig()

>>> # With MultiDML_kNN we can test multiple dmls. In this case, dmls are fitted automatically.
>>> lda = LDA()
>>> mknn = MultiDML_kNN(n_neighbors=7,dmls=[lda,nca])
>>> mknn.fit(X,y)

>>> # And we can predict and take scores in the same way, for every dml.
>>> # The euclidean distance will be added always in first place.
>>> mknn.score_all() # It will show [euclidean, lda, nca]
array([ 0.96666667,  0.96666667,  0.97333333])

>>> # The NCMC Classifier works like every ClassifierMixin.
>>> ncmc = NCMC_Classifier(centroids_num=2)
>>> ncmc.fit(X,y)
>>> ncmc.score(X,y)

>>> # To learn a distance to use with NCMC Classifier, and with any other distance classifier
>>> # we can use pipelines.
>>> from sklearn.pipeline import Pipeline
>>> dml_ncmc = Pipeline([('nca',nca),('ncmc',ncmc)])
>>> dml_ncmc.fit(X,y)
>>> dml_ncmc.score(X,y)

Plotting classifier regions induced by different distances

>>> import numpy as np
>>> from sklearn.datasets import load_iris
>>> from dml import (NCA, LDA, NCMC_Classifier, classifier_plot, dml_plot, knn_plot,
>>>                  dml_multiplot, knn_pairplots)

>>> # Loading dataset
>>> iris = load_iris()
>>> X = iris['data']
>>> y = iris['target']

>>> # Initializing transformers and predictors
>>> nca = NCA()
>>> lda = LDA()
>>> ncmc = NCMC_Classifier(centroids_num=2)

>>> # We can plot regions for different classifiers
>>> f1 = classifier_plot(X[:,[0,1]],y,clf=ncmc,title = "NCMC Classification",
>>>                      cmap="rainbow",figsize=(12,6))
>>> f2 = knn_plot(X[:,[0,1]],y,k=3,title = "3-NN Classification", cmap="rainbow",
>>>               figsize=(12,6))
>>> # We can also plot the regions after applying the transformation determined by
>>> # a metric, a transformer or a DML algorithm
>>> f3 = dml_plot(X[:,[0,1]],y,clf=ncmc,dml=nca,title = "NCMC Classification + NCA",
>>>               cmap="rainbow",figsize=(12,6))
>>> f4 = knn_plot(X[:,[0,1]],y,k=3,dml=lda,title="3-NN Classification + LDA",
>>>               cmap="rainbow",figsize=(12,6))
>>> # Or we can see how the distance changes the classifier region
>>> # using the option transform=False
>>> f5 = dml_plot(X[:,[0,1]],y,clf=ncmc,dml=nca,title = "NCMC Classification + NCA",
>>>               cmap="rainbow",transform=False,figsize=(12,6))
>>> f6 = knn_plot(X[:,[0,1]],y,k=3,dml=lda,title="3-NN Classification + LDA",
>>>               cmap="rainbow",transform=False,figsize=(12,6))
>>> # We can compare different algorithms or distances together in the same figure
>>> f7 = dml_multiplot(X[:,[0,1]],y,nrow=2,ncol=2,ks=[None,None,3,3],
>>>                    clfs=[ncmc,ncmc,None,None],dmls=[None,nca,None,lda],
>>>                    transforms=[False,False,False,False],title="Comparing",
>>>                    subtitles=["NCMC","NCMC + NCA","3-NN","3-NN + LDA"],
>>>                    cmap="rainbow",figsize=(12,12))
>>> # Finally, we can also plot each pair of attributes. Here the classifier region
>>> # is made taking a section in the features space.
>>> f8 = knn_pairplots(X,y,k=3,sections="mean",dml=nca,title="pairplots",
>>>                    cmap="gist_rainbow",figsize=(24,24))

Tuning parameters

>>> import numpy as np
>>> from sklearn.datasets import load_iris
>>> from dml import NCA, tune

>>> # Loading dataset
>>> iris = load_iris()
>>> X = iris['data']
>>> y = iris['target']

>>> # Using cross validation we can tune parameters for the DML algorithms.
>>> # Here, we tune the NCA algorithm, with a fixed parameter learning_rate='constant'.
>>> # The parameters we tune are num_dims and eta0.
>>> # The metrics we use are 3-NN and 5-NN scores, and the final expectance metadata of NCA.
>>> # A 5-fold cross validation is done twice, to obtain the results.
>>> results,best,nca_best,detailed = tune(NCA,X,y,dml_params={'learning_rate':'constant'},
>>>                                       tune_args={'num_dims':[3,4],'eta0':[0.001,0.01,0.1]},
>>>                                       metrics=[3,5,'final_expectance'],
>>>                                       n_folds=5,n_reps=2,seed=28,verbose=True)
*** Tuning Case  {'num_dims': 3, 'eta0': 0.001} ...
** FOLD  1
** FOLD  2
** FOLD  3
** FOLD  4
** FOLD  5
** FOLD  6
** FOLD  7
** FOLD  8
** FOLD  9
** FOLD  10
*** Tuning Case  {'num_dims': 3, 'eta0': 0.01} ...
** FOLD  1
** FOLD  2
** FOLD  3
** FOLD  4
...

>>> # Now we can compare the results obtained for each case.
>>> results
                                    3-NN      5-NN  final_expectance
{'num_dims': 3, 'eta0': 0.001}  0.963333  0.970000          0.890105
{'num_dims': 3, 'eta0': 0.01}   0.966667  0.963333          0.916240
{'num_dims': 3, 'eta0': 0.1}    0.970000  0.963333          0.935243
{'num_dims': 4, 'eta0': 0.001}  0.956667  0.963333          0.897238
{'num_dims': 4, 'eta0': 0.01}   0.956667  0.963333          0.922415
{'num_dims': 4, 'eta0': 0.1}    0.960000  0.963333          0.947319

>>> # We can also take the best result (with respect to the first metric).
>>> best
({'eta0': 0.1, 'num_dims': 3}, 0.97000000000000008)

>>> # We also obtain the best DML algorithm already constructed to be used.
>>> nca_best.fit(X,y)

>>> # If we want, we can look at the detailed results of cross validation for each case.
>>> detailed["{'num_dims': 3, 'eta0': 0.01}"]
              3-NN      5-NN  final_expectance
SPLIT 1   0.966667  0.966667          0.923293
SPLIT 2   0.966667  0.966667          0.922091
SPLIT 3   1.000000  0.966667          0.907416
SPLIT 4   0.966667  0.966667          0.903700
SPLIT 5   0.966667  0.966667          0.915030
SPLIT 6   0.966667  0.966667          0.905189
SPLIT 7   0.966667  0.966667          0.922051
SPLIT 8   0.933333  0.933333          0.933400
SPLIT 9   0.966667  1.000000          0.912236
SPLIT 10  0.966667  0.933333          0.917992
MEAN      0.966667  0.963333          0.916240
STD       0.014907  0.017951          0.008888