PCA Steps
- Standardize the data.
- Use the standardized data to create a covariance matrix.
- Use the resulting matrix to calculate eigenvectors (principal components) and their corresponding eigenvalues.
- Sort the components in descending order by their eigenvalues.
- Choose the n components that explain the most variance within the data (a larger eigenvalue means the component explains more variance).
- Project the data onto the chosen n components to create a new, lower-dimensional matrix.
The concept of eigenvectors is applied in the machine learning algorithm Principal Component Analysis (PCA). Suppose you have data with a large number of features, i.e. it has very high dimensionality. It is possible that some of those features are redundant, and beyond that, a large number of features means reduced efficiency and more disk space. What PCA does is drop some of the less important features. But how do we determine which ones those are? This is where eigenvectors come to our rescue.
Obtain the Eigenvectors and Eigenvalues from the covariance matrix or correlation matrix, or perform Singular Value Decomposition.
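As a minimal sketch of the steps above via the eigendecomposition route (the variable names are illustrative, and X_std is random placeholder data standing in for a standardized feature matrix):
import numpy as np
X_std=np.random.randn(178,13)                 # placeholder for 178 standardized samples with 13 features
cov=np.cov(X_std,rowvar=False)                # covariance matrix of the features
eig_vals,eig_vecs=np.linalg.eigh(cov)         # eigh, because the covariance matrix is symmetric
order=np.argsort(eig_vals)[::-1]              # sort components by descending eigenvalue
eig_vals,eig_vecs=eig_vals[order],eig_vecs[:,order]
n=2                                           # keep the 2 components that explain the most variance
X_reduced=X_std@eig_vecs[:,:n]                # project the data onto the chosen components
X_reduced.shape                               # (178, 2)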
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data=pd.read_csv('Wine.csv')
data
Index | Alcohol | Malic_Acid | Ash | Ash_Alcanity | Magnesium | Total_Phenols | Flavanoids | Nonflavanoid_Phenols | Proanthocyanins | Color_Intensity | Hue | OD280 | Proline | Customer_Segment |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 14.23 | 1.71 | 2.43 | 15.6 | 127 | 2.80 | 3.06 | 0.28 | 2.29 | 5.64 | 1.04 | 3.92 | 1065 | 1 |
1 | 13.20 | 1.78 | 2.14 | 11.2 | 100 | 2.65 | 2.76 | 0.26 | 1.28 | 4.38 | 1.05 | 3.40 | 1050 | 1 |
2 | 13.16 | 2.36 | 2.67 | 18.6 | 101 | 2.80 | 3.24 | 0.30 | 2.81 | 5.68 | 1.03 | 3.17 | 1185 | 1 |
3 | 14.37 | 1.95 | 2.50 | 16.8 | 113 | 3.85 | 3.49 | 0.24 | 2.18 | 7.80 | 0.86 | 3.45 | 1480 | 1 |
4 | 13.24 | 2.59 | 2.87 | 21.0 | 118 | 2.80 | 2.69 | 0.39 | 1.82 | 4.32 | 1.04 | 2.93 | 735 | 1 |
… | … | … | … | … | … | … | … | … | … | … | … | … | … | … |
173 | 13.71 | 5.65 | 2.45 | 20.5 | 95 | 1.68 | 0.61 | 0.52 | 1.06 | 7.70 | 0.64 | 1.74 | 740 | 3 |
174 | 13.40 | 3.91 | 2.48 | 23.0 | 102 | 1.80 | 0.75 | 0.43 | 1.41 | 7.30 | 0.70 | 1.56 | 750 | 3 |
175 | 13.27 | 4.28 | 2.26 | 20.0 | 120 | 1.59 | 0.69 | 0.43 | 1.35 | 10.20 | 0.59 | 1.56 | 835 | 3 |
176 | 13.17 | 2.59 | 2.37 | 20.0 | 120 | 1.65 | 0.68 | 0.53 | 1.46 | 9.30 | 0.60 | 1.62 | 840 | 3 |
177 | 14.13 | 4.10 | 2.74 | 24.5 | 96 | 2.05 | 0.76 | 0.56 | 1.35 | 9.20 | 0.61 | 1.60 | 560 | 3 |
178 rows × 14 columns
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178 entries, 0 to 177
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   Alcohol               178 non-null    float64
 1   Malic_Acid            178 non-null    float64
 2   Ash                   178 non-null    float64
 3   Ash_Alcanity          178 non-null    float64
 4   Magnesium             178 non-null    int64
 5   Total_Phenols         178 non-null    float64
 6   Flavanoids            178 non-null    float64
 7   Nonflavanoid_Phenols  178 non-null    float64
 8   Proanthocyanins       178 non-null    float64
 9   Color_Intensity       178 non-null    float64
 10  Hue                   178 non-null    float64
 11  OD280                 178 non-null    float64
 12  Proline               178 non-null    int64
 13  Customer_Segment      178 non-null    int64
dtypes: float64(11), int64(3)
memory usage: 19.6 KB
X=data.iloc[:,:13].values   # the 13 feature columns
y=data.iloc[:,-1].values    # the Customer_Segment target
y
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], dtype=int64)
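The last column, Customer_Segment, is the target: each wine belongs to one of three segments (1, 2, or 3). The remaining 13 columns are the features that PCA will compress.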
pd.DataFrame(X[0])
Index | 0 |
---|---|
0 | 14.23 |
1 | 1.71 |
2 | 2.43 |
3 | 15.60 |
4 | 127.00 |
5 | 2.80 |
6 | 3.06 |
7 | 0.28 |
8 | 2.29 |
9 | 5.64 |
10 | 1.04 |
11 | 3.92 |
12 | 1065.00 |
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)
X_train
array([[ 0.87668336, 0.79842885, 0.64412971, ..., 0.0290166 , -1.06412236, -0.2059076 ], [-0.36659076, -0.7581304 , -0.39779858, ..., 0.0290166 , -0.73083231, -0.81704676], [-1.69689407, -0.34424759, -0.32337513, ..., 0.90197362, 0.51900537, -1.31256499], ..., [-0.70227477, -0.68615078, -0.65828065, ..., 0.46549511, 0.51900537, -1.31256499], [ 1.13777093, -0.62316862, -0.91876272, ..., -0.18922266, 1.03282752, 0.80164614], [ 1.4610222 , 0.12361993, 0.42085937, ..., -1.45501034, -1.2168803 , -0.2719767 ]])
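Note that the scaler is fit on the training set only; the test set is transformed with the training set's mean and standard deviation, so no information from the test data leaks into the preprocessing.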
from sklearn.decomposition import PCA
pca=PCA(n_components=2)
X_train=pca.fit_transform(X_train)
X_test=pca.transform(X_test)
pca.explained_variance_ratio_
array([0.36884109, 0.19318394])
0.36884109 + 0.19318394
0.56202503
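So the first two principal components together retain only about 56% of the total variance. If you would rather pick the number of components by a variance target, one option is the sketch below. It re-splits X and y (since X_train above has already been overwritten with the 2-component projection) and reuses the fitted scaler sc:
Xtr_raw,Xte_raw,ytr,yte=train_test_split(X,y,test_size=0.2,random_state=0)
pca_full=PCA().fit(sc.transform(Xtr_raw))                  # keep all 13 components
np.cumsum(pca_full.explained_variance_ratio_)              # cumulative variance as components are added
pca_95=PCA(n_components=0.95).fit(sc.transform(Xtr_raw))   # a float target keeps just enough components for 95% variance
pca_95.n_components_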
X_train
array([[-2.17884511e+00, 1.07218467e+00], [-1.80819239e+00, -1.57822344e+00], [ 1.09829474e+00, -2.22124345e+00], [-2.55584748e+00, 1.66210369e+00], [ 1.85698063e+00, -2.41573166e-01], [ 2.58288576e+00, 1.37668170e+00], [ 8.72876119e-01, -2.25618512e+00], [-4.18384029e-01, -2.35415681e+00], [-3.04977245e-01, -2.27659433e+00], [ 2.14082532e+00, 1.10052871e+00], [-2.98136465e+00, 2.47159183e-01], [ 1.96188242e+00, -1.25407738e+00], [-2.16177795e+00, 9.75966550e-01], [ 2.21976084e+00, 2.39507167e+00], [-2.30179529e+00, 2.05791962e-01], [-3.00953827e+00, 2.79141212e-01], [ 2.63443473e+00, 8.68313119e-01], [-1.09219965e+00, 3.53906625e+00], [ 2.62578435e+00, 2.96852840e-03], [ 1.98077342e-01, -2.29193443e+00], [-2.67442753e+00, 2.58800132e+00], [-2.54763698e+00, 4.52703891e-01], [ 1.77416736e+00, -8.43586940e-01], [-2.77786938e+00, 4.32090258e-01], [-2.86679938e+00, 1.87580875e+00], [ 1.35498845e+00, -3.99545184e-02], [-2.43900474e+00, -9.44074889e-02], [-2.27268121e+00, -5.05883053e-01], [ 1.17887166e+00, -2.50068415e+00], [-2.30673313e+00, -1.30502777e+00], [-2.53020738e+00, 5.51277126e-01], [ 1.63200028e-01, -1.29107817e+00], [ 2.57881158e+00, 1.17515982e+00], [-7.62471566e-01, -3.16097049e+00], [ 2.57005937e+00, 9.66718786e-02], [-4.78337042e-01, -5.77763823e-01], [ 5.47417096e-01, 3.77647780e-01], [ 3.55763538e+00, 1.45816125e+00], [ 1.69260971e+00, -1.37844174e+00], [ 2.65288395e+00, 2.39399539e-01], [-3.62047411e+00, 6.90153979e-01], [-1.61462317e+00, 2.41170340e+00], [ 1.50959767e+00, -1.32717326e+00], [ 5.36413494e-02, -2.07680094e+00], [-1.07889168e-01, -2.85115217e+00], [-2.39610454e+00, 2.45883860e+00], [-3.12315181e+00, -4.23261512e-01], [ 3.28569649e+00, 3.22859884e-01], [-3.55506872e+00, 1.74242946e+00], [-3.87020538e-01, -2.61510101e+00], [-4.74514016e-01, -1.98023790e+00], [-1.06865761e+00, -6.78906271e-01], [ 1.08546036e+00, -1.30817801e+00], [ 2.02340107e+00, -1.56926094e+00], [ 2.76257094e+00, 1.85603600e+00], [ 2.06778286e+00, 1.35861191e+00], [ 9.06600653e-01, -2.04110996e+00], [ 3.52336455e+00, 1.39946872e+00], [-3.79914321e+00, 4.92727617e-02], [ 1.75250993e+00, -4.64283651e-01], [-3.46539192e+00, 7.90385134e-01], [ 3.16221605e+00, 8.39879111e-01], [ 2.37656864e+00, 1.72475988e+00], [ 1.31278073e+00, -8.53348760e-01], [ 3.57258440e+00, 1.78091597e+00], [ 9.50877158e-01, -2.38927332e+00], [ 4.95461316e-01, -2.16498322e+00], [ 3.79294638e+00, 2.92787186e+00], [-2.37978591e+00, 2.13572422e+00], [-1.50346992e+00, -1.39146991e+00], [ 2.50566646e+00, 1.30365941e+00], [-6.97535788e-01, -2.78160736e-01], [-7.25562555e-01, -2.54007170e+00], [ 9.24047324e-01, -1.46344718e+00], [-1.25151294e+00, -2.74792621e-02], [ 2.20937835e+00, 8.05690832e-01], [-3.84416995e+00, 5.74263508e-01], [-1.77983157e+00, 1.38012167e+00], [ 4.35413058e+00, 2.33750318e+00], [ 3.33834347e+00, 1.51169086e+00], [-1.44769123e+00, -1.90826204e+00], [-2.75066706e+00, 2.07100640e+00], [ 2.79475799e+00, 1.36659228e+00], [ 1.84642601e+00, 6.82481476e-01], [-4.13332842e-01, -2.20440158e+00], [-4.81356617e-02, -1.17469609e+00], [ 1.99166500e+00, 2.50860656e-01], [ 2.26421169e+00, 1.32120813e+00], [ 7.85551414e-01, 2.46487051e-01], [-3.32586984e+00, 2.14485564e+00], [ 1.00496881e+00, -7.20390295e-01], [ 2.31479633e+00, -2.62129546e-01], [ 8.67032066e-01, -1.36440259e+00], [-2.28629769e+00, 4.54244754e-01], [ 3.14452871e+00, 1.29318898e+00], [-1.65297942e+00, -1.74177394e+00], [-2.84689388e+00, 7.78426712e-02], [-2.68393126e+00, 2.53813173e-01], [ 1.97280128e+00, 1.70171835e+00], [ 1.63120111e+00, -7.24762688e-01], 
[-2.05082836e+00, 2.11848206e-01], [ 2.28798382e+00, 1.95899701e+00], [-2.28266458e+00, 2.07243579e-01], [-3.45079842e-01, -1.92360626e+00], [ 1.49448758e+00, 7.18673825e-01], [ 2.26695932e+00, 7.93531817e-01], [-3.44673144e-01, -1.92686997e+00], [ 2.75927029e+00, 1.56391999e+00], [-2.86839562e+00, 1.85579453e+00], [-1.46228982e+00, -1.22151405e+00], [-3.31754434e+00, 1.20382601e+00], [ 1.72057718e+00, 1.36843828e-01], [-2.90065973e+00, 3.71521776e-01], [-2.30532411e+00, 2.14386284e+00], [-3.51377495e+00, 1.17981731e+00], [ 2.32780065e+00, 2.95396131e-01], [ 1.54528723e+00, -1.99996309e+00], [-3.32660657e-01, -2.37118865e+00], [ 4.96648201e-01, -9.57282660e-01], [ 6.69654741e-01, -3.80907536e+00], [-2.76009366e+00, 1.48785734e+00], [-3.19187371e+00, 2.70815669e+00], [-6.05023707e-01, -1.10583182e+00], [-1.34006934e+00, -1.51232906e+00], [ 1.05506599e+00, 9.49724036e-01], [ 1.16017702e+00, -1.39768493e+00], [-2.87675356e+00, 1.15157946e+00], [-2.35838421e+00, 2.44842974e+00], [ 2.54704855e+00, 1.86824592e+00], [ 3.20597222e+00, 1.85912926e+00], [-2.69949485e+00, 1.75638262e-01], [-9.67436859e-01, -1.81399824e+00], [-1.46454259e+00, -1.01680272e+00], [-4.04000223e-01, -2.40815711e+00], [ 1.46393837e+00, 6.90763351e-01], [ 1.15903114e+00, 2.91379684e-01], [ 2.82057099e+00, 8.99578955e-01], [-5.01011897e-01, -2.68453162e+00], [ 3.30453915e-01, -2.43396193e+00], [ 1.09727608e-02, -1.99585453e+00], [ 2.89176687e+00, 7.71555485e-01], [-2.44830439e+00, 2.11360296e+00]])
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression()
classifier.fit(X_train,y_train)
LogisticRegression()
y_test
array([1, 3, 2, 1, 2, 2, 1, 3, 2, 2, 3, 3, 1, 2, 3, 2, 1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 3, 1, 1, 2, 1, 1, 1], dtype=int64)
y_pred=classifier.predict(X_test)
y_pred
array([1, 3, 2, 1, 2, 1, 1, 3, 2, 2, 3, 3, 1, 2, 3, 2, 1, 1, 2, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 3, 1, 1, 2, 1, 1, 1], dtype=int64)
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
cm
array([[14,  0,  0],
       [ 1, 15,  0],
       [ 0,  0,  6]], dtype=int64)
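Only one of the 36 test wines is misclassified (a class-2 wine predicted as class 1, the off-diagonal 1 in the matrix). accuracy_score confirms this:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)   # 35/36 ≈ 0.972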
X_set,y_set = X_train,y_train
# Scatter the training points of each of the three classes
plt.scatter(X_set[y_set==1,0],X_set[y_set==1,1],label=1)
plt.scatter(X_set[y_set==2,0],X_set[y_set==2,1],label=2)
plt.scatter(X_set[y_set==3,0],X_set[y_set==3,1],label=3)
# Build a fine grid covering the 2-D PCA space, with a margin of 1 on each side
A1=np.arange(X_set[:,0].min()-1,X_set[:,0].max()+1,0.01)
A2=np.arange(X_set[:,1].min()-1,X_set[:,1].max()+1,0.01)
X1,X2=np.meshgrid(A1,A2)
# Classify every grid point, then shade the plane by predicted class
Z=classifier.predict(np.array([X1.ravel(),X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1,X2,Z,alpha=0.2)
plt.legend()
plt.show()
X1.ravel()
array([-4.84416995, -4.83416995, -4.82416995, ..., 5.32583005, 5.33583005, 5.34583005])
np.array([X1.ravel(),X2.ravel()]).T
array([[-4.84416995, -4.80907536], [-4.83416995, -4.80907536], [-4.82416995, -4.80907536], ..., [ 5.32583005, 4.53092464], [ 5.33583005, 4.53092464], [ 5.34583005, 4.53092464]])
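Each row of this flattened grid is one point in the 2-D PCA plane. The classifier assigns a class to every grid point, and contourf shades the plane by those predictions, which is what draws the decision regions behind the scattered samples.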
# The same decision-region plot, this time over the test set
X_set,y_set = X_test,y_test
plt.scatter(X_set[y_set==1,0],X_set[y_set==1,1],label=1)
plt.scatter(X_set[y_set==2,0],X_set[y_set==2,1],label=2)
plt.scatter(X_set[y_set==3,0],X_set[y_set==3,1],label=3)
A1=np.arange(X_set[:,0].min()-1,X_set[:,0].max()+1,0.01)
A2=np.arange(X_set[:,1].min()-1,X_set[:,1].max()+1,0.01)
X1,X2=np.meshgrid(A1,A2)
Z=classifier.predict(np.array([X1.ravel(),X2.ravel()]).T).reshape(X1.shape)
plt.contourf(X1,X2,Z,alpha=0.2)
plt.legend()
plt.show()
data = [[14.23,1.71,2.43,15.60,127.00,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.00]]   # the 13 feature values of the first wine
data = sc.transform(data)
data = pca.transform(data)
Pred=classifier.predict(data)
Pred
array([1], dtype=int64)
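As a closing note, the scale → project → classify chain can also be bundled into a single scikit-learn Pipeline. This is a sketch rather than part of the original walkthrough; it reuses the X and y arrays and takes a fresh split:
from sklearn.pipeline import make_pipeline
Xtr_raw,Xte_raw,ytr,yte=train_test_split(X,y,test_size=0.2,random_state=0)
pipe=make_pipeline(StandardScaler(),PCA(n_components=2),LogisticRegression())
pipe.fit(Xtr_raw,ytr)
pipe.predict([[14.23,1.71,2.43,15.60,127.00,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.00]])   # should reproduce array([1]) from the manual steps above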