# 8. Load the breast cancer data
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
# Split the data into train & test sets; stratify on the target so both
# splits keep the same class proportions, and fix the seed for repeatability
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=66
)
# Use the k-neighbors algorithm to perform classification
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)
# 9. Predict cancer on the test data
# (print the results so they are visible when run as a script, not only in a REPL)
print(clf.predict(X_test))
# Check accuracy on the held-out test set
print(clf.score(X_test, y_test))
# 13. KMeans algorithm
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2, random_state=0)
# Fit the clusters on the training data and get the cluster index of each sample
labels_km = km.fit_predict(X_train)
# Print the cluster indices next to the true labels for a visual comparison;
# cluster numbering is arbitrary, so 0/1 may be swapped relative to y_train
print(labels_km)
print(y_train)
17. Typical tasks
Categorical data —> one-hot encoding (dummy variables)
Multidimensional data —> scaling
Too many features —> Principal Component Analysis (PCA)
Text —> bag-of-words
18. One-hot-encoding
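| # of flights | account          | # of days since join | features             |
|--------------|------------------|----------------------|----------------------|
| 150          | google, facebook | 300                  | gmail_parsed_success |
| 200          | icloud           | 600                  | gmail_parsed_success |
| 1            | live             | 0                    |                      |
| 3            | google           | 1                    |                      |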
# 23. Load the Labeled Faces in the Wild dataset
from sklearn.datasets import fetch_lfw_people
people = fetch_lfw_people(min_faces_per_person=20, resize=0.7)
# Display 10 faces
image_shape = people.images[0].shape
import matplotlib.pyplot as plt
# 2 x 5 grid of axes with the tick marks hidden
fig, axes = plt.subplots(2, 5, figsize=(15, 8), subplot_kw={'xticks': (), 'yticks': ()})
# zip stops at the 10 axes, so only the first 10 images are drawn
for target, image, ax in zip(people.target, people.images, axes.ravel()):
    ax.imshow(image)
    ax.set_title(people.target_names[target])
plt.show()
# Use plt.ion() if the plot isn't displayed, or create .matplotlibrc in
# ./.matplotlib/ with the text 'backend: TkAgg'
# 24. Apply k-neighbors & estimate the score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Stratified split so every person appears in both train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    people.data, people.target, stratify=people.target, random_state=0
)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
# Print the test accuracy so it is visible when run as a script
print(knn.score(X_test, y_test))
without PCA
# 25. Apply PCA and then KNN
from sklearn.decomposition import PCA
# Fit PCA on the training data only, then project both splits with the
# same 100 whitened components
pca = PCA(n_components=100, whiten=True, random_state=0).fit(X_train)
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train_pca, y_train)
# Print the test accuracy so it is visible when run as a script
print(knn.score(X_test_pca, y_test))
with PCA
# 26. Display eigenfaces
# 3 x 5 grid of axes with the tick marks hidden
fig, axes = plt.subplots(3, 5, figsize=(15, 12), subplot_kw={'xticks': (), 'yticks': ()})
# zip stops at the 15 axes, so only the first 15 principal components are drawn
for i, (component, ax) in enumerate(zip(pca.components_, axes.ravel())):
    # Reshape each component back to the image dimensions before plotting
    ax.imshow(component.reshape(image_shape), cmap='viridis')
    ax.set_title("{}. component".format(i + 1))
plt.show()
Eigenfaces
31. Exercises
Predict user purchase (User, UserInfo, UserSessionAction)
Find clusters of users (User, UserInfo, UserSessionAction)
Determine if there is free wifi at the airport? (Tip)
Predict CBP wait times at the airport (regression)
Others?