Sie sind auf Seite 1von 17

9/21/2018 Untitled35

RANDOM FOREST/roc&auc - HYPERPARAMER TUNING


with for loop - TITANIC DB
In [92]: import numpy as np
import pandas as pd

import matplotlib.pyplot as plt


import seaborn as sns
sns.set(style="white", color_codes=True)
sns.set(font_scale=1.5)

from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier


from sklearn import metrics

In [93]: location = r"D:\KOMAL\SIMPLILEARN\MY COURSES\IN PROGRESS\DATA SCIENCE WITH PYT


HON\Live class downloads\Aug 11 Sat - Sep 15 Sat - Attending\datasets\titanic-
train.csv"

In [94]: df_train = pd.read_csv(location)

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypTu… 1/17
9/21/2018 Untitled35

In [95]: df_train.head()

Out[95]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket F

Braund,
0 1 0 3 Mr. Owen male 22.0 1 0 A/5 21171 7.250
Harris

Cumings,
Mrs. John
Bradley
1 2 1 1 female 38.0 1 0 PC 17599 71.28
(Florence
Briggs
Th...

Heikkinen,
STON/O2.
2 3 1 3 Miss. female 26.0 0 0 7.925
3101282
Laina

Futrelle,
Mrs.
Jacques
3 4 1 1 female 35.0 1 0 113803 53.10
Heath
(Lily May
Peel)

Allen, Mr.
4 5 0 3 William male 35.0 0 0 373450 8.050
Henry

In [114]: # Cleaning
# We will remove ‘Cabin’, ‘Name’ and ‘Ticket’ columns

df_train_dropped = df_train.drop(['Cabin', 'Name', 'Ticket'], axis=1)


df_train_dropped.head()

Out[114]:
PassengerId Survived Pclass Sex Age SibSp Parch Fare Embarked

0 1 0 3 male 22.0 1 0 7.2500 S

1 2 1 1 female 38.0 1 0 71.2833 C

2 3 1 3 female 26.0 0 0 7.9250 S

3 4 1 1 female 35.0 1 0 53.1000 S

4 5 0 3 male 35.0 0 0 8.0500 S

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypTu… 2/17
9/21/2018 Untitled35

In [115]: # Examine any missing


df_train_dropped.isnull().sum()

Out[115]: PassengerId 0
Survived 0
Pclass 0
Sex 0
Age 177
SibSp 0
Parch 0
Fare 0
Embarked 2
dtype: int64

In [116]: # Fill the missing age values by the mean value


# Filling missing Age values with mean
# Filling missing Embarked values with most common value

In [117]: df_train_dropped['Age'] = df_train_dropped['Age'].fillna(df_train_dropped['Ag


e'].mean())

In [118]: df_train_dropped['Embarked'] = df_train_dropped['Embarked'].fillna(df_train_dr


opped['Embarked'].mode()[0])

In [119]: df_train_dropped.isnull().sum()

Out[119]: PassengerId 0
Survived 0
Pclass 0
Sex 0
Age 0
SibSp 0
Parch 0
Fare 0
Embarked 0
dtype: int64

In [120]: # check wich embark value appears the most (mode)

df_train_dropped['Embarked'].value_counts()

Out[120]: S 646
C 168
Q 77
Name: Embarked, dtype: int64

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypTu… 3/17
9/21/2018 Untitled35

In [121]: df_train_dropped.head()

Out[121]:
PassengerId Survived Pclass Sex Age SibSp Parch Fare Embarked

0 1 0 3 male 22.0 1 0 7.2500 S

1 2 1 1 female 38.0 1 0 71.2833 C

2 3 1 3 female 26.0 0 0 7.9250 S

3 4 1 1 female 35.0 1 0 53.1000 S

4 5 0 3 male 35.0 0 0 8.0500 S

In [122]: df_train_dropped['Pclass'].value_counts()

Out[122]: 3 491
1 216
2 184
Name: Pclass, dtype: int64

In [123]: # one hot encoding of categorical features

In [124]: df_train_dummied = pd.get_dummies(df_train_dropped,columns = ['Pclass','Embark


ed','Sex'])
df_train_dummied.head()

Out[124]:
PassengerId Survived Age SibSp Parch Fare Pclass_1 Pclass_2 Pclass_3 Em

0 1 0 22.0 1 0 7.2500 0 0 1 0

1 2 1 38.0 1 0 71.2833 1 0 0 1

2 3 1 26.0 0 0 7.9250 0 0 1 0

3 4 1 35.0 1 0 53.1000 1 0 0 0

4 5 0 35.0 0 0 8.0500 0 0 1 0

In [125]: X_df = df_train_dummied.drop('Survived',axis = 1)


y_df = df_train_dummied['Survived']

In [126]: X_train, X_test, y_train, y_test = train_test_split(X_df,y_df,test_size = 0.25


, random_state =42)

In [127]: # instantiate the RF classifier


# Set the random state for reproducibility

clf = RandomForestClassifier(random_state=42)

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypTu… 4/17
9/21/2018 Untitled35

In [128]: # train the algorithm utilizing the training and target class

clf.fit(X_train, y_train)

Out[128]: RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',


max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=42, verbose=0, warm_start=False)

In [130]: #feature_importances_ is an internal variable

importances = list(clf.feature_importances_)

In [131]: importances

Out[131]: [0.17704930921912193,
0.1701609513162492,
0.028821370222286914,
0.03565150653155073,
0.1973100861519638,
0.019865619648430373,
0.016223643943662054,
0.05675591752670881,
0.014813114457404308,
0.0058535692652198195,
0.0158160841729409,
0.11751209931796955,
0.14416672822649168]

In [132]: df_train_dummied.columns

Out[132]: Index(['PassengerId', 'Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'Pclass_


1',
'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
'Sex_female', 'Sex_male'],
dtype='object')

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypTu… 5/17
9/21/2018 Untitled35

In [135]: # List of tuples with variables/features along with their importance


features_importances = [(feature,round(importances,2)) for feature,importances
in zip(df_train_dummied.columns,importances) ]
features_importances

Out[135]: [('PassengerId', 0.18),


('Survived', 0.17),
('Age', 0.03),
('SibSp', 0.04),
('Parch', 0.2),
('Fare', 0.02),
('Pclass_1', 0.02),
('Pclass_2', 0.06),
('Pclass_3', 0.01),
('Embarked_C', 0.01),
('Embarked_Q', 0.02),
('Embarked_S', 0.12),
('Sex_female', 0.14)]

In [137]: # Sort the feature importances by most important first


features_importances = sorted(features_importances, key = lambda x: x[1], reve
rse = True)
features_importances

Out[137]: [('Parch', 0.2),


('PassengerId', 0.18),
('Survived', 0.17),
('Sex_female', 0.14),
('Embarked_S', 0.12),
('Pclass_2', 0.06),
('SibSp', 0.04),
('Age', 0.03),
('Fare', 0.02),
('Pclass_1', 0.02),
('Embarked_Q', 0.02),
('Pclass_3', 0.01),
('Embarked_C', 0.01)]

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypTu… 6/17
9/21/2018 Untitled35

In [142]: # plot the graph - feature importance wise

df_feature_importance = pd.DataFrame(features_importances, columns=['Feature',


'Importance'])
df_feature_importance

Out[142]:
Feature Importance

0 Parch 0.20

1 PassengerId 0.18

2 Survived 0.17

3 Sex_female 0.14

4 Embarked_S 0.12

5 Pclass_2 0.06

6 SibSp 0.04

7 Age 0.03

8 Fare 0.02

9 Pclass_1 0.02

10 Embarked_Q 0.02

11 Pclass_3 0.01

12 Embarked_C 0.01

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypTu… 7/17
9/21/2018 Untitled35

In [141]: ax = df_feature_importance.plot(kind='bar',
x='feature',
y='importance',
figsize=(10,8),
title= 'Feature importances for Random Forest Model',
grid=True,
legend=True,
fontsize = 12,
color='orange',
);

In [143]: # predict the test set


y_pred = clf.predict(X_test)

In [149]: from sklearn.metrics import confusion_matrix

In [150]: # Confusion Matrix


conf_mat = confusion_matrix(y_test, y_pred)

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypTu… 8/17
9/21/2018 Untitled35

In [151]: sns.heatmap(conf_mat, annot=True, fmt='d', cbar=False)


plt.xlabel('Predicted Values')
plt.ylabel('Actual Values')
plt.title('Actual vs. Predicted Confusion Matrix')

Out[151]: Text(0.5,1,'Actual vs. Predicted Confusion Matrix')

In [152]: # comparing actual response values (y_test) with predicted response values (y_
pred)
print("model accuracy:", metrics.accuracy_score(y_test, y_pred)* 100)

model accuracy: 82.95964125560538

ROC Curve Metrics

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypTu… 9/17
9/21/2018 Untitled35

In [153]: # We grab the second array from the output which corresponds to
# to the predicted probabilites of positive classes
# Ordered wrt fit.classes_ in our case [0, 1] where 1 is our positive class

predictions_prob = clf.predict_proba(X_test)[:, 1]

predictions_prob

Out[153]: array([0.4, 0. , 0.2, 1. , 0.1, 1. , 0.9, 0. , 0.9, 0.7, 0.4, 0.1, 0.2,
0. , 0. , 0.9, 0.6, 0.9, 0.1, 0.1, 0.3, 0.7, 0.2, 0.1, 0.1, 0.1,
0.2, 0. , 0.2, 0.7, 0. , 0.7, 0.6, 0.3, 0.4, 0.5, 0.1, 0.8, 1. ,
0. , 0. , 0.1, 0.2, 0.2, 0.4, 0. , 0.3, 0. , 0.3, 0.6, 1. , 1. ,
0.1, 0.4, 0. , 0.7, 0.1, 0.9, 0.9, 0.9, 0.3, 1. , 1. , 0.1, 0. ,
1. , 0.1, 0.4, 0.4, 0.9, 1. , 1. , 0.9, 0.9, 0. , 0. , 1. , 1. ,
1. , 0.4, 0.1, 1. , 1. , 0. , 0.1, 0.3, 1. , 1. , 0.1, 0.1, 0.3,
0.1, 0.4, 0. , 0. , 0.2, 0.5, 0.1, 1. , 0.1, 0.2, 0.1, 1. , 0. ,
0.4, 0.5, 1. , 0.1, 0.1, 0. , 1. , 0.1, 1. , 0.7, 0.3, 0. , 0.3,
0.4, 1. , 0. , 0.3, 1. , 1. , 0.5, 0.1, 0.5, 1. , 0.6, 0.2, 0. ,
0.9, 0.2, 0. , 0.7, 1. , 0.2, 1. , 0.2, 0. , 0. , 0.1, 1. , 0. ,
0.3, 0.3, 1. , 0. , 0.6, 1. , 0.2, 0.1, 0.2, 0.1, 0.7, 0. , 0.1,
0.7, 1. , 1. , 0.5, 0.3, 0.5, 0.1, 1. , 0.1, 0.4, 0. , 1. , 0. ,
0.1, 0.6, 1. , 0.7, 0.7, 0.2, 0. , 0.2, 1. , 0.6, 0.9, 0. , 0.6,
0.1, 0.2, 0.5, 0.6, 0.2, 0.2, 0. , 1. , 0.1, 0. , 0.1, 0. , 1. ,
1. , 1. , 0. , 1. , 0.1, 0.1, 0.3, 1. , 0. , 0.2, 0.6, 0.2, 0.3,
0.3, 0.1, 0.5, 0.1, 0.9, 0. , 0.2, 0.4, 1. , 0.4, 0.9, 0.3, 0. ,
1. , 0.1])

In [154]: FPR,TPR,thresholds = metrics.roc_curve(y_test,


predictions_prob,
pos_label = 1)

A low AUC might say that you are not using the best metric for the problem at hand.

It could also mean overfitting but this is hard to tell if you don’t specify on which type of dataset you are getting
this low value.

Is it in your training, cross-validation, or test set?

In [156]: auc_rf = metrics.auc(FPR, TPR)


auc_rf

Out[156]: 0.8541002850913971

In [158]: # Classification Report


print(metrics.classification_report(y_test, y_pred))

precision recall f1-score support

0 0.84 0.89 0.86 134


1 0.81 0.74 0.78 89

avg / total 0.83 0.83 0.83 223

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypT… 10/17
9/21/2018 Untitled35

HYPERPARAMETER TUNING

n_estimators

In [161]: # represents the number of trees in the forest.


# Usually the higher the number of trees the better to learn the data.
# However, adding a lot of trees can slow down the training process considerab
ly, therefore
# we do a parameter search to find the sweet spot.

In [166]: from sklearn.metrics import roc_curve


from sklearn.metrics import auc

In [167]: n_estimators = [1, 2, 4, 8, 16, 32, 64, 100, 200]

train_results = []
test_results = []

for estimator in n_estimators:


rf = RandomForestClassifier(n_estimators=estimator, n_jobs=-1)
rf.fit(X_train, y_train)

# predict for all the training samples in X_train


train_pred = rf.predict(X_train)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, tr
ain_pred)

roc_auc = auc(false_positive_rate, true_positive_rate)


train_results.append(roc_auc)

# predict for all the training samples in X_test


y_pred = rf.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_p
red)

roc_auc = auc(false_positive_rate, true_positive_rate)


test_results.append(roc_auc)

In [169]: from matplotlib import style

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypT… 11/17
9/21/2018 Untitled35

In [170]: # select the style of the plot


style.use('seaborn')

plt.plot(n_estimators, train_results, 'b', label= 'Train AUC')


plt.plot(n_estimators, test_results, 'r', label= 'Test AUC')

plt.ylabel('AUC score')
plt.xlabel('n_estimators')
plt.legend();

max_depth

In [171]: # max_depth represents the depth of each tree in the forest. The deeper the tr
ee,
# the more splits it has and it captures more information about the data. We f
it each decision
# tree with depths ranging from 1 to 32 and plot the training and test errors.

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypT… 12/17
9/21/2018 Untitled35

In [172]: max_depths = np.linspace(1, 32, 32, endpoint=True)

train_results = []
test_results = []

for max_depth in max_depths:


rf = RandomForestClassifier(max_depth=max_depth, n_jobs=-1)
rf.fit(X_train, y_train)

# predict for all the training samples in X_train


train_pred = rf.predict(X_train)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, tr
ain_pred)

roc_auc = auc(false_positive_rate, true_positive_rate)


train_results.append(roc_auc)

# predict for all the training samples in X_test


y_pred = rf.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_p
red)

roc_auc = auc(false_positive_rate, true_positive_rate)


test_results.append(roc_auc)

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypT… 13/17
9/21/2018 Untitled35

In [173]: # model overfits for large depth values.


# The trees perfectly predicts all of the train data,
# however, it fails to generalize the findings for new data

# select the style of the plot


style.use('seaborn')

plt.plot(max_depths, train_results, 'b', label= 'Train AUC')


plt.plot(max_depths, test_results, 'r', label= 'Test AUC')

plt.ylabel('AUC score')
plt.xlabel('Tree depth')
plt.legend();

min_samples_split

In [174]: # represents the minimum number of samples required to split an internal node.

# This can vary between considering at least one sample at each node to
# considering all of the samples at each node. When we increase this paramete
r, each tree
# in the forest becomes more constrained as it has to consider more samples at
each node.
# Here we will vary the parameter from 10% to 100% of the samples

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypT… 14/17
9/21/2018 Untitled35

In [175]: min_samples_splits = np.linspace(0.1, 1.0, 10, endpoint=True)

train_results = []
test_results = []

for min_samples_split in min_samples_splits:


rf = RandomForestClassifier(min_samples_split=min_samples_split, n_jobs=-1)
rf.fit(X_train, y_train)

# predict for all the training samples in X_train


train_pred = rf.predict(X_train)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, tr
ain_pred)

roc_auc = auc(false_positive_rate, true_positive_rate)


train_results.append(roc_auc)

# predict for all the training samples in X_test


y_pred = rf.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_p
red)

roc_auc = auc(false_positive_rate, true_positive_rate)


test_results.append(roc_auc)

In [176]: # select the style of the plot


style.use('seaborn')

plt.plot(min_samples_splits, train_results, 'b', label= 'Train AUC')


plt.plot(min_samples_splits, test_results, 'r', label= 'Test AUC')

plt.ylabel('AUC score')
plt.xlabel('Tree depth')
plt.legend();

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypT… 15/17
9/21/2018 Untitled35

In [177]: # We can clearly see that when we require all of the samples at each node, the
model cannot
# learn enough about the data.
# This is an underfitting case.

In [178]: list(range(1,X_train.shape[1]))

Out[178]: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

max_features

In [179]: # represents the number of features to consider when looking for the best spli
t.

max_features = list(range(1,X_train.shape[1]))

train_results = []
test_results = []

for max_feature in max_features:


rf = RandomForestClassifier(max_features=max_feature, n_jobs=-1)
rf.fit(X_train, y_train)

# predict for all the training samples in X_train


train_pred = rf.predict(X_train)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, tr
ain_pred)

roc_auc = auc(false_positive_rate, true_positive_rate)


train_results.append(roc_auc)

# predict for all the training samples in X_test


y_pred = rf.predict(X_test)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_p
red)

roc_auc = auc(false_positive_rate, true_positive_rate)


test_results.append(roc_auc)

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypT… 16/17
9/21/2018 Untitled35

In [180]: # select the style of the plot


style.use('seaborn')

plt.plot(max_features, train_results, 'b', label= 'Train AUC')


plt.plot(max_features, test_results, 'r', label= 'Test AUC')

plt.ylabel('AUC score')
plt.xlabel('Tree depth')
plt.legend();

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_RF_roc_auc_HypT… 17/17