In [0]:
import numpy as np

# Caricamento Dataset

### Dataset di default

In [0]:
from sklearn.datasets import load_breast_cancer

# Load the breast cancer dataset that ships with scikit-learn and show its description.
breast = load_breast_cancer()
# Single-argument parenthesised print: same output under Python 2 and Python 3
# (the former `print x` statement is a SyntaxError on Python 3).
print(breast.DESCR)

# Example matrix and the associated labels
X, y = breast.data, breast.target

### Da file svmlight

In [0]:
# Load an svmlight dataset from a URL
from contextlib import closing
try:
    from urllib.request import urlopen  # Python 3 location
except ImportError:
    from urllib import urlopen  # Python 2 location

#SVMLIGHT
from sklearn.datasets import load_svmlight_file

# closing() releases the HTTP response as soon as parsing is done (or fails),
# regardless of whether the response object supports `with` on its own.
with closing(urlopen("http://www.math.unipd.it/~mpolato/didattica/ml1819/tic-tac-toe.svmlight")) as raw_data:
    # A file name can be passed instead of the response when loading from disk
    X, y = load_svmlight_file(raw_data)

# Needed to convert from a sparse to a dense matrix (todense is an alternative)
X = X.toarray()

## Training e test set

In [0]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the examples for testing; the fixed random_state makes the
# partition identical on every run.
split_parts = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# Preprocessing

### Standard scaler

"Standardizza" le feature rimuovendo la media e scalando a varianza unitaria (media 0, varianza 1).

### Minmax scaler
Di default, scala le feature tra 0 e 1 secondo questa trasformazione:

```
X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))
X_scaled = X_std * (max - min) + min
```


In [0]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Choose the scaler; uncomment the alternative to scale with MinMaxScaler instead.
scaler = StandardScaler()
#scaler = MinMaxScaler()

# The scaler is fitted on the training split only, and that same fitted
# transformation is then applied to both the training and the test split.
X_train = scaler.fit(X_train).transform(X_train)
X_test = scaler.transform(X_test)

## Valutazione (classificazione)

$\textit{accuracy} = \frac{\text{TP}+\text{TN}}{\text{TP}+\text{TN}+\text{FP}+\text{FN}}$

$\textit{precision} = \frac{\text{TP}}{\text{TP}+\text{FP}}$

$\textit{recall} = \frac{\text{TP}}{\text{TP}+\text{FN}}$

*AUC* = Area Under the ROC Curve

In [0]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

def evaluate(y_test, y_pred):
    """Print accuracy, precision and recall of a set of predictions.

    Args:
        y_test: ground-truth labels.
        y_pred: predicted labels, compared element-wise against ``y_test``.

    Returns:
        None. The scores are written to standard output, one per line,
        followed by an empty line.
    """
    # Each line is formatted into ONE string before printing, so the output is
    # identical under the Python 2 print statement and the Python 3 print
    # function (the former `print "a:", x` form is a SyntaxError on Python 3).
    print("accuracy: %s" % accuracy_score(y_test, y_pred))
    print("precision: %s" % precision_score(y_test, y_pred))
    print("recall: %s" % recall_score(y_test, y_pred))
    #print("AUC: %s" % roc_auc_score(y_test, y_pred))
    print("")

# Metodi per classificazione

## Alberi di decisione

In [0]:
from sklearn import tree

# Constructor signature kept for reference (may not match newer scikit-learn releases):
#DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_split=1e-07, class_weight=None, presort=False)
# random_state is pinned (same seed as the train/test split) so that a
# Restart & Run All rebuilds the same tree and reports the same scores.
clf_tree = tree.DecisionTreeClassifier(random_state=42)

clf_tree = clf_tree.fit(X_train, y_train)
y_pred = clf_tree.predict(X_test)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("DT")
evaluate(y_test, y_pred)

## Multi Layer Perceptron

In [0]:
from sklearn.neural_network import MLPClassifier

# One hidden layer of 100 units. random_state is pinned (same seed as the
# train/test split) so that repeated runs of the notebook give the same scores.
mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=500,
    alpha=1,
    tol=1e-8,
    learning_rate_init=.01,
    random_state=42,
)

mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("MLP")
evaluate(y_test, y_pred)

## Support Vector Machine

In [0]:
#SVM
from sklearn import svm

# kernel is not passed, so the library default applies; per the original note
# the options are linear, poly, rbf (default) and sigmoid.
svc = svm.SVC(gamma=0.0001, C=100.0)
# Estimator repr kept for reference (other values may differ across scikit-learn
# releases); gamma corrected here to match the call above:
#SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,  gamma=0.0001, kernel='rbf', max_iter=-1, probability=False,  random_state=None, shrinking=True, tol=0.001, verbose=False)

svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("SVM")
evaluate(y_test, y_pred)

# Model selection

## k-fold cross validation e grid search

In [0]:
from sklearn.model_selection import GridSearchCV, KFold
from sklearn import metrics

Dizionario che contiene le griglie dei parametri per SVM (kernel RBF, polinomiale e custom/precomputed) e NN

In [0]:
# Candidate values for the SVM parameter C: 2**-5 up to 2**4
C_values = [2**exponent for exponent in range(-5, 5)]

# Parameter grids handed to GridSearchCV, keyed by model family.
p_grid = dict()

# SVM on raw features: one sub-grid per kernel, each with its kernel-specific parameter.
p_grid["svm"] = [
    {"C": C_values, "kernel": ["rbf"], "gamma": [10**exponent for exponent in range(-4, 4)]},
    {"C": C_values, "kernel": ["poly"], "degree": list(range(2, 6))},
]

# SVM on a precomputed (custom) kernel matrix: only C is searched.
p_grid["precomputed"] = {"C": C_values, "kernel": ["precomputed"]}

# Neural network: alpha values and hidden layer sizes.
p_grid["nn"] = {
    "alpha": [10**exponent for exponent in range(-5, 1)],
    "hidden_layer_sizes": [(width,) for width in (10, 50, 100, 200)],
}

In [0]:
#from sklearn import svm
#from sklearn.neural_network import MLPClassifier

### Hide sklearn warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

kernel = "precomputed" #rbf/linear/poly

# Example of a (precomputed) linear kernel: Gram matrix of all examples
Klin = np.dot(X, X.T)

# Select the parameter grid that matches the chosen kernel, instead of always
# using the "precomputed" grid (which would force kernel="precomputed" even when
# raw feature matrices are passed in).
if kernel == "precomputed":
    svm_grid = p_grid["precomputed"]
else:
    svm_grid = [grid for grid in p_grid["svm"] if kernel in grid["kernel"]]
    if not svm_grid:
        # No dedicated sub-grid for this kernel (e.g. "linear"): search over C only.
        svm_grid = {"C": C_values, "kernel": [kernel]}

### Outer evaluation loop: 5 shuffled folds with a fixed seed
skf = KFold(n_splits=5, shuffle=True, random_state=42)

accs = []
for fold, (train, test) in enumerate(skf.split(X, y), 1):
    print("FOLD: %s" % fold)

    # Fold-local names are used on purpose: reusing X_train / X_test here would
    # overwrite the train/test split created earlier in the notebook and leave
    # it out of sync with y_train / y_test.
    if kernel == "precomputed":
        ### The "features" must be the custom kernel:
        ### rows = examples to fit/score, columns = training examples of this fold
        X_fold_train = Klin[train][:, train]
        X_fold_test = Klin[test][:, train]
    else:
        X_fold_train, X_fold_test = X[train], X[test]

    ### Optional preprocessing goes here

    ### Parameter validation with Grid Search
    #GridSearchCV(estimator, param_grid, scoring=None, fit_params=None, n_jobs=1, iid=True, refit=True, cv=None, verbose=0, pre_dispatch='2*n_jobs', error_score='raise', return_train_score='warn')
    # kernel must be passed by keyword: the first positional parameter of SVC
    # is C, so SVC(kernel) silently stored the kernel name as C.
    clf = GridSearchCV(svm.SVC(kernel=kernel), param_grid=svm_grid, cv=5, scoring='accuracy') #SVM
    #clf = GridSearchCV(MLPClassifier(), param_grid=p_grid["nn"], cv=5, scoring='accuracy') #NN

    ### Training
    clf.fit(X_fold_train, y[train])

    ### Information about the validation
    #print("CV info: %s" % (clf.cv_results_,))#.keys()
    # Values are formatted into one string so the output is the same under
    # Python 2 and Python 3 (print statements are a SyntaxError on Python 3).
    print("VALIDATION score: %s" % clf.best_score_)
    print("BEST parameters: %s" % (clf.best_params_,))
    #clf.best_estimator_

    ### Note that this is the score, not the prediction! Useful to compute the AUC (ranking metric)
    #y_pred = clf.decision_function(X_fold_test)

    y_pred = clf.predict(X_fold_test)
    y_true = y[test]

    ### Classification report and confusion matrix on the held-out fold
    print(metrics.classification_report(y_true, y_pred))
    print(metrics.confusion_matrix(y_true, y_pred))

    #auc = roc_auc_score(y_true, y_pred) # AUC
    acc = accuracy_score(y_true, y_pred) # Accuracy
    print("TEST score: %s" % acc)
    print("")

    accs.append(acc)

print("AVG ACCURACY: %s +- %s" % (np.mean(accs), np.std(accs)))