Sie sind auf Seite 1von 13

9/7/2018

komal_knn1_minMaxScalar

import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

import seaborn as sns

sns.set(font_scale=1.5)

sns.set(style='white',color_codes=True)

In [36]: import numpy as np
import pandas as pd

True ) In [36]: import numpy as np import pandas as pd In [2]: location =

In [2]: location = r"D:\komal\SIMPLILEARN\MY COURSES\IN PROGRESS\DATA SCIENCE WITH PYTHON\Live class downloads\Aug 11 Sat - Sep 15 Sat - Attending\datasets\iris.csv"

11 Sat - Sep 15 Sat - Attending\datasets\iris.cs v" In [3]: # load the training data

In [3]: # load the training data from the iris data set
df_iris = pd.read_csv(location)
df_iris.head()

Out[3]:

   

sepal_length

sepal_width

petal_length

petal_width

class

0

5.1

3.5

1.4

0.2

Iris-setosa

1

4.9

3.0

1.4

0.2

Iris-setosa

2

4.7

3.2

1.3

0.2

Iris-setosa

3

4.6

3.1

1.5

0.2

Iris-setosa

4

5.0

3.6

1.4

0.2

Iris-setosa

9/7/2018

komal_knn1_minMaxScalar

9/7/2018 komal_knn1_minMaxScalar In [4]: # Check the available styles plt.style.available

In [4]: # Check the available styles plt.style.available

Out[4]: ['bmh', 'classic', 'dark_background', 'fast', 'fivethirtyeight', 'ggplot', 'grayscale', 'seaborn-bright', 'seaborn-colorblind', 'seaborn-dark-palette', 'seaborn-dark', 'seaborn-darkgrid', 'seaborn-deep', 'seaborn-muted', 'seaborn-notebook', 'seaborn-paper', 'seaborn-pastel', 'seaborn-poster', 'seaborn-talk', 'seaborn-ticks', 'seaborn-white', 'seaborn-whitegrid', 'seaborn',

'Solarize_Light2',

'tableau-colorblind10',

'_classic_test']

'tableau-colorblind10', '_classic_test'] In [5]: plt.style.use('ggplot') # might not be

In [5]: plt.style.use('ggplot')

In [6]: # Means are in the same order of magnitude for all features, so scaling
# might not be beneficial.
# If mean values were of different orders of magnitude, scaling could
# significantly improve accuracy of a classifier.
df_iris.describe()

Out[6]:

   

sepal_length

sepal_width

petal_length

petal_width

count

150.000000

150.000000

150.000000

150.000000

mean

5.843333

3.054000

3.758667

1.198667

std

0.828066

0.433594

1.764420

0.763161

min

4.300000

2.000000

1.000000

0.100000

25%

5.100000

2.800000

1.600000

0.300000

50%

5.800000

3.000000

4.350000

1.300000

75%

6.400000

3.300000

5.100000

1.800000

max

7.900000

4.400000

6.900000

2.500000

9/7/2018

komal_knn1_minMaxScalar

9/7/2018 komal_knn1_minMaxScalar In [7]: X = df_iris.drop('class' , 1).values # drop target variable y1 =

In [7]: X = df_iris.drop('class' , 1).values # drop target variable
y1 = df_iris['class'].values
y = df_iris['class']

df_iris['class'].values y = df_iris['class'] In [8]: scaler = MinMaxScaler() scaler Out[8]:

In [8]: scaler = MinMaxScaler()
scaler

Out[8]: MinMaxScaler(copy=True, feature_range=(0, 1))

9/7/2018

komal_knn1_minMaxScalar

In [9]: X_scaled = scaler.fit_transform(X)
print('X_scaled type is', type(X_scaled))
X_scaled

9/7/2018

komal_knn1_minMaxScalar

X_scaled type is <class 'numpy.ndarray'>

9/7/2018

komal_knn1_minMaxScalar

Out[9]: array([[0.22222222, 0.625

, 0.06779661, 0.04166667],

[0.16666667, 0.41666667, 0.06779661, 0.04166667],

[0.11111111, 0.5

[0.08333333, 0.45833333, 0.08474576, 0.04166667],

[0.19444444, 0.66666667, 0.06779661, 0.04166667],

],

[0.08333333, 0.58333333, 0.06779661, 0.08333333],

[0.19444444, 0.58333333, 0.08474576, 0.04166667],

[0.02777778, 0.375

],

[0.30555556, 0.70833333, 0.08474576, 0.04166667], [0.13888889, 0.58333333, 0.10169492, 0.04166667],

, 0.06779661, 0.04166667],

, 0.05084746, 0.04166667],

[0.30555556, 0.79166667, 0.11864407, 0.125

[0.16666667, 0.45833333, 0.08474576, 0.

[0.13888889, 0.41666667, 0.06779661, 0.

],

[0.

, 0.41666667, 0.01694915, 0.

],

[0.41666667, 0.83333333, 0.03389831, 0.04166667],

[0.38888889, 1.

, 0.08474576, 0.125

],

[0.30555556, 0.79166667, 0.05084746, 0.125

],

[0.22222222, 0.625 [0.38888889, 0.75 [0.22222222, 0.75

[0.30555556, 0.58333333, 0.11864407, 0.04166667],

],

, 0.04166667],

[0.22222222, 0.54166667, 0.11864407, 0.16666667], [0.13888889, 0.58333333, 0.15254237, 0.04166667], [0.19444444, 0.41666667, 0.10169492, 0.04166667],

],

[0.08333333, 0.66666667, 0.

[0.22222222, 0.70833333, 0.08474576, 0.125

, 0.06779661, 0.08333333], , 0.11864407, 0.08333333], , 0.08474576, 0.08333333],

[0.19444444, 0.58333333, 0.10169492, 0.125

[0.25

, 0.625

, 0.08474576, 0.04166667],

[0.25

, 0.58333333, 0.06779661, 0.04166667],

, 0.10169492, 0.04166667],

[0.13888889, 0.45833333, 0.10169492, 0.04166667],

[0.11111111, 0.5

[0.30555556, 0.58333333, 0.08474576, 0.125

],

[0.25

, 0.875

, 0.08474576, 0.

],

[0.33333333, 0.91666667, 0.06779661, 0.04166667],

],

, 0.03389831, 0.04166667],

, 0.05084746, 0.04166667],

],

[0.02777778, 0.41666667, 0.05084746, 0.04166667],

[0.22222222, 0.58333333, 0.08474576, 0.04166667],

[0.19444444, 0.625 [0.05555556, 0.125 [0.02777778, 0.5 [0.19444444, 0.625 [0.22222222, 0.75

[0.13888889, 0.41666667, 0.06779661, 0.08333333],

],

, 0.10169492, 0.20833333],

, 0.05084746, 0.08333333], , 0.05084746, 0.08333333], , 0.05084746, 0.04166667],

[0.16666667, 0.45833333, 0.08474576, 0.

[0.19444444, 0.5 [0.33333333, 0.625

[0.16666667, 0.45833333, 0.08474576, 0.

, 0.15254237, 0.125

, 0.10169492, 0.04166667], , 0.06779661, 0.04166667],

[0.27777778, 0.70833333, 0.08474576, 0.04166667],

[0.19444444, 0.54166667, 0.06779661, 0.04166667],

[0.75

[0.58333333, 0.5

[0.72222222, 0.45833333, 0.66101695, 0.58333333],

[0.33333333, 0.125

],

[0.61111111, 0.33333333, 0.61016949, 0.58333333],

[0.22222222, 0.75 [0.08333333, 0.5

, 0.5

, 0.62711864, 0.54166667], , 0.59322034, 0.58333333],

, 0.50847458, 0.5

[0.38888889, 0.33333333, 0.59322034, 0.5

],

[0.55555556, 0.54166667, 0.62711864, 0.625

],

9/7/2018

komal_knn1_minMaxScalar

[0.16666667, 0.16666667, 0.38983051, 0.375

],

[0.63888889, 0.375

, 0.61016949, 0.5

],

[0.25

[0.19444444, 0.

[0.44444444, 0.41666667, 0.54237288, 0.58333333],

],

[0.5

[0.36111111, 0.375

[0.66666667, 0.45833333, 0.57627119, 0.54166667],

[0.36111111, 0.41666667, 0.59322034, 0.58333333],

],

[0.52777778, 0.08333333, 0.59322034, 0.58333333],

[0.36111111, 0.20833333, 0.49152542, 0.41666667],

[0.44444444, 0.5

],

[0.55555556, 0.20833333, 0.66101695, 0.58333333],

[0.5

[0.58333333, 0.375

[0.63888889, 0.41666667, 0.57627119, 0.54166667],

[0.69444444, 0.33333333, 0.6440678 , 0.54166667], [0.66666667, 0.41666667, 0.6779661 , 0.66666667],

, 0.59322034, 0.58333333],

],

],

[0.5

, 0.6440678 , 0.70833333],

],

, 0.62711864, 0.54166667],

],

, 0.29166667, 0.49152542, 0.54166667],

, 0.42372881, 0.375

[0.47222222, 0.08333333, 0.50847458, 0.375

, 0.375

, 0.44067797, 0.5

[0.41666667, 0.29166667, 0.52542373, 0.375

, 0.33333333, 0.50847458, 0.5

, 0.33333333, 0.62711864, 0.45833333],

, 0.55932203, 0.5

[0.47222222, 0.375 [0.38888889, 0.25

, 0.42372881, 0.375

[0.33333333, 0.16666667, 0.47457627, 0.41666667],

],

[0.41666667, 0.29166667, 0.49152542, 0.45833333],

],

[0.30555556, 0.41666667, 0.59322034, 0.58333333],

[0.47222222, 0.58333333, 0.59322034, 0.625

],

[0.66666667, 0.45833333, 0.62711864, 0.58333333],

[0.47222222, 0.29166667, 0.69491525, 0.625

[0.33333333, 0.16666667, 0.45762712, 0.375

[0.55555556, 0.125

, 0.57627119, 0.5

],

[0.36111111, 0.41666667, 0.52542373, 0.5

],

[0.33333333, 0.20833333, 0.50847458, 0.5

],

[0.33333333, 0.25

[0.5

, 0.57627119, 0.45833333],

, 0.41666667, 0.61016949, 0.54166667],

[0.41666667, 0.25

, 0.50847458, 0.45833333],

[0.19444444, 0.125

, 0.38983051, 0.375

],

[0.36111111, 0.29166667, 0.54237288, 0.5

],

[0.38888889, 0.41666667, 0.54237288, 0.45833333],

[0.38888889, 0.375

, 0.54237288, 0.5

],

[0.52777778, 0.375

, 0.55932203, 0.5

],

[0.22222222, 0.20833333, 0.33898305, 0.41666667],

[0.38888889, 0.33333333, 0.52542373, 0.5

],

[0.55555556, 0.54166667, 0.84745763, 1.

],

[0.41666667, 0.29166667, 0.69491525, 0.75

],

[0.77777778, 0.41666667, 0.83050847, 0.83333333],

, 0.77966102, 0.70833333],

],

[0.91666667, 0.41666667, 0.94915254, 0.83333333],

[0.16666667, 0.20833333, 0.59322034, 0.66666667],

[0.83333333, 0.375

[0.66666667, 0.20833333, 0.81355932, 0.70833333],

],

[0.61111111, 0.5

],

[0.69444444, 0.41666667, 0.76271186, 0.83333333], [0.38888889, 0.20833333, 0.6779661 , 0.79166667],

[0.55555556, 0.375

[0.61111111, 0.41666667, 0.81355932, 0.875

, 0.89830508, 0.70833333],

[0.80555556, 0.66666667, 0.86440678, 1.

, 0.69491525, 0.79166667],

[0.58333333, 0.29166667, 0.72881356, 0.75

9/7/2018

komal_knn1_minMaxScalar

[0.41666667, 0.33333333, 0.69491525, 0.95833333],

[0.58333333, 0.5

[0.61111111, 0.41666667, 0.76271186, 0.70833333],

[0.94444444, 0.75 [0.94444444, 0.25

[0.47222222, 0.08333333, 0.6779661 , 0.58333333],

, 0.79661017, 0.91666667],

[0.72222222, 0.5

, 0.91666667],

],

, 0.72881356, 0.91666667],

, 0.96610169, 0.875

, 1.

[0.36111111, 0.33333333, 0.66101695, 0.79166667], [0.94444444, 0.33333333, 0.96610169, 0.79166667],

[0.55555556, 0.29166667, 0.66101695, 0.70833333], [0.66666667, 0.54166667, 0.79661017, 0.83333333],

, 0.84745763, 0.70833333],

[0.80555556, 0.5

[0.52777778, 0.33333333, 0.6440678 , 0.70833333],

[0.5

[0.58333333, 0.33333333, 0.77966102, 0.83333333],

, 0.41666667, 0.66101695, 0.70833333],

[0.80555556, 0.41666667, 0.81355932, 0.625

],

[0.86111111, 0.33333333, 0.86440678, 0.75

],

[1.

],

[0.55555556, 0.33333333, 0.69491525, 0.58333333],

[0.5

[0.94444444, 0.41666667, 0.86440678, 0.91666667], [0.55555556, 0.58333333, 0.77966102, 0.95833333], [0.58333333, 0.45833333, 0.76271186, 0.70833333], [0.47222222, 0.41666667, 0.6440678 , 0.70833333], [0.72222222, 0.45833333, 0.74576271, 0.83333333], [0.66666667, 0.45833333, 0.77966102, 0.95833333],

[0.72222222, 0.45833333, 0.69491525, 0.91666667],

],

, 0.83050847, 0.91666667],

[0.69444444, 0.5

],

[0.66666667, 0.41666667, 0.71186441, 0.91666667],

],

, 0.91525424, 0.79166667],

[0.58333333, 0.33333333, 0.77966102, 0.875

, 0.75

, 0.25

, 0.77966102, 0.54166667],

[0.41666667, 0.29166667, 0.69491525, 0.75

[0.66666667, 0.54166667, 0.79661017, 1.

[0.55555556, 0.20833333, 0.6779661 , 0.75

[0.61111111, 0.41666667, 0.71186441, 0.79166667], [0.52777778, 0.58333333, 0.74576271, 0.91666667], [0.44444444, 0.41666667, 0.69491525, 0.70833333]])

In [10]: # transform back to df for easier exploration/plotting (output of scaler)
X_scaled_df = pd.DataFrame(X_scaled, columns=['s_SepalLength','s_SepalWidth',
                                              's_PetalLength','s_PetalWidth'])
X_scaled_df.head()

Out[10]:

   

s_SepalLength

s_SepalWidth

s_PetalLength

s_PetalWidth

0

0.222222

0.625000

0.067797

0.041667

1

0.166667

0.416667

0.067797

0.041667

2

0.111111

0.500000

0.050847

0.041667

3

0.083333

0.458333

0.084746

0.041667

4

0.194444

0.666667

0.067797

0.041667

9/7/2018

komal_knn1_minMaxScalar

9/7/2018 komal_knn1_minMaxScalar In [11]: df_iris_scaled = pd.concat([X_scaled_df,y],axis=1) df_iris_scaled.head()

In [11]: df_iris_scaled = pd.concat([X_scaled_df,y],axis=1)
df_iris_scaled.head()

Out[11]:

   

s_SepalLength

s_SepalWidth

s_PetalLength

s_PetalWidth

class

0

0.222222

0.625000

0.067797

0.041667

Iris-setosa

1

0.166667

0.416667

0.067797

0.041667

Iris-setosa

2

0.111111

0.500000

0.050847

0.041667

Iris-setosa

3

0.083333

0.458333

0.084746

0.041667

Iris-setosa

4

0.194444

0.666667

0.067797

0.041667

Iris-setosa

9/7/2018

komal_knn1_minMaxScalar

In [12]: # Notice that the x-axis range on the subplots is the same for all
# features (0 to 1) after scaling.
fig = plt.figure(figsize=(14,9))
fig.suptitle('Frequency Distribution of Features by Species ',fontsize=20)

ax1 = fig.add_subplot(221)
df_iris_scaled.groupby("class").s_PetalLength.plot(kind='hist',
                                                   alpha=0.8,
                                                   legend=True,
                                                   title='s_PetalLength')

ax2 = fig.add_subplot(222,sharey=ax1)
df_iris_scaled.groupby("class").s_PetalWidth.plot(kind='hist',
                                                  alpha=0.8,
                                                  legend=True,
                                                  title='s_PetalWidth')

ax3 = fig.add_subplot(223,sharey=ax1)
df_iris_scaled.groupby("class").s_SepalLength.plot(kind='hist',
                                                   alpha=0.8,
                                                   legend=True,
                                                   title='s_SepalLength')

ax4 = fig.add_subplot(224,sharey=ax1)
df_iris_scaled.groupby("class").s_SepalWidth.plot(kind='hist',
                                                  alpha=0.8,
                                                  legend=True,
                                                  title='s_SepalWidth');

Distribution of Features by Species ',fontsize=20)

9/7/2018

komal_knn1_minMaxScalar

9/7/2018 komal_knn1_minMaxScalar In [13]: X_scaled_df.describe()

In [13]: X_scaled_df.describe()

Out[13]:

   

s_SepalLength

s_SepalWidth

s_PetalLength

s_PetalWidth

count

150.000000

150.000000

150.000000

150.000000

mean

0.428704

0.439167

0.467571

0.457778

std

0.230018

0.180664

0.299054

0.317984

min

0.000000

0.000000

0.000000

0.000000

25%

0.222222

0.333333

0.101695

0.083333

50%

0.416667

0.416667

0.567797

0.500000

75%

0.583333

0.541667

0.694915

0.708333

max

1.000000

1.000000

1.000000

1.000000

In [18]: # train and test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 0)

rando m_state = 0) In [18]: # train and test split In [19]: print("train sample size",X_train.shape,

In [19]: print("train sample size",X_train.shape, type(X_train))
print("test sample size",X_test.shape, type(X_test))

train sample size (105, 4) <class 'numpy.ndarray'>
test sample size (45, 4) <class 'numpy.ndarray'>

sample size (45, 4) <class 'numpy.ndarray'> In [23]: clf = KNeighborsClassifier(n_neighbors=5)

In [23]: clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train, y_train)

Out[23]: KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2, weights='uniform')

n_jobs=1, n_neighbors=5, p=2, weights='uniform') In [24]: y_pred = clf.predict(X_test) In [28]: # Creates a

In [24]: y_pred = clf.predict(X_test)

In [24]: y_pred = clf.predict(X_test) In [28]: # Creates a confusion matrix cm =

In [28]: # Creates a confusion matrix
cm = metrics.confusion_matrix(y_test, y_pred)

matrix cm = metrics.confusion_matrix(y_test, y_pred) In [29]: cm Out[29]: array([[16, 0, 0], [ 0, 17, 1],

In [29]: cm

Out[29]: array([[16, 0, 0],

[

0, 17,

1],

[

0, 0, 11]], dtype=int64)

9/7/2018

komal_knn1_minMaxScalar

9/7/2018 komal_knn1_minMaxScalar In [32]: CT=pd.crosstab(y_test, y_pred, rownames=['True'],

In [32]: CT=pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True)
CT

Out[32]:

 

Predicted

Iris-setosa

Iris-versicolor

Iris-virginica

All

True

       

Iris-setosa

16

0

0

16

Iris-versicolor

0

17

1

18

Iris-virginica

0

0

11

11

All

16

17

12

45

0 0 11 11 All 16 17 12 45 In [38]: from sklearn.metrics import accuracy_score An

In [38]: from sklearn.metrics import accuracy_score

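An insight we can get from the matrix is that every sample predicted as setosa or versicolor was correct (precision = True Positives / All predicted = 16/16 and 17/17 = 1.0). However, precision for virginica was lower (11/12 = 0.917), because one versicolor sample was misclassified as virginica; equivalently, recall for versicolor was 17/18 = 0.944, while recall for setosa and virginica was 1.0.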

However, accuracy for virginica was lower (11/12 = 0.917). In [39]: plt.figure(figsize=(6,4)) sns.heatmap(CT, annot=

In [39]: plt.figure(figsize=(6,4))
sns.heatmap(CT, annot=True)
plt.title('KNN classification model \nAccuracy:{0:.3f}'.format(accuracy_score(y_test, y_pred)))
plt.ylabel('True label')
plt.xlabel('Predicted label')

Out[39]: Text(0.5,16,'Predicted label')

label') Out[39]: Text(0.5,16,'Predicted label')

9/7/2018

komal_knn1_minMaxScalar

9/7/2018 komal_knn1_minMaxScalar In [42]: from sklearn.metrics import classification_report

In [42]: from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

 

precision

recall f1-score

support

Iris-setosa

1.00

1.00

1.00

16

Iris-versicolor

1.00

0.94

0.97

18

Iris-virginica

0.92

1.00

0.96

11

avg / total

0.98

0.98

0.98

45

In [43]: # Classification accuracy : Overall how often is the classifier correct?
print(metrics.accuracy_score(y_test, y_pred))

# Classification error : Overall how often is the classifier incorrect?
print(1-metrics.accuracy_score(y_test, y_pred))

0.9777777777777777

0.022222222222222254

In [45]: # Sensitivity : when the actual value is +ve, how often is the prediction correct?
#   Also known as "True Positive Rate" or "Recall"
#   should be MAXIMIZED
# print(metrics.recall_score(y_test, y_pred, average=None))

# Specificity : when the actual value is -ve, how often is the prediction correct?
#   Also known as "Selectivity"
#   should be MAXIMIZED

# False Positive Rate : when the actual value is negative, how often is the
#   prediction incorrect?
#   1 - Specificity

# Precision : when a +ve value is predicted, how often is the prediction correct?
# print(metrics.precision_score(y_test, y_pred, average=None))