Sie sind auf Seite 1von 9

9/20/2018 komal_DT1_EDAWithFunctions_Titanic

Decision Tree and EDA with functions


In [51]: import numpy as np
import pandas as pd

In [52]: datafile = "D:\komal\SIMPLILEARN\MY COURSES\IN PROGRESS\MACHINE LEARNING RECOR


DINGS\Jul 28 Sat - Aug 25 Sat\Drive downloads\Machine Learning _ Jul 28 - Aug
25 _ Sayan\Decision Trees/titanicdata.htm"

In [53]: #BeautifulSoup is the library used for web scrapping

from bs4 import BeautifulSoup


with open(datafile,"r",encoding="Latin-1") as f:
soup = BeautifulSoup(f,"html.parser")

In [54]: table = soup.find('table')

In [55]: import pandas as pd


data = data = pd.read_html(str(table).encode('ascii', errors='replace'), flavo
r='bs4')[0]

In [56]: data.head()

Out[56]:
Boat Unnamed:
Name Age Class/Dept Ticket Joined Job
[Body] 7

AB??-AL-
MUN??, Mr 3rd Class 2699?18
0 27 Cherbourg ? 15? NaN
N??s??f Passenger 15s 9d
Q??sim

ABBING, Mr 3rd Class 5547?7 Blacksmith


1 42 Southampton ?? NaN
Anthony Passenger 11s ?

ABBOTT,
3rd Class CA2673?
2 Mrs Rhoda 39 Southampton ? A? NaN
Passenger 20 5s
Mary 'Rosa'

ABBOTT, Mr
3rd Class CA2673?
3 Rossmore 16 Southampton Jeweller ? ?[190] NaN
Passenger 20 5s
Edward

ABBOTT, Mr
3rd Class CA2673?
4 Eugene 13 Southampton Scholar ? ?? NaN
Passenger 20 5s
Joseph

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 1/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic

In [57]: def cleanup(value):


return value.replace("?"," ")

In [58]: data['Name'] = data['Name'].apply(cleanup)


data['Boat [Body]'] = data['Boat [Body]'].apply(cleanup)

data['Age'] = data['Age'].apply(pd.to_numeric,errors='coerce')
data.head()

Out[58]:
Boat Unnamed:
Name Age Class/Dept Ticket Joined Job
[Body] 7

AB -AL-
3rd Class 2699?18
0 MUN , Mr N 27.0 Cherbourg ? 15 NaN
Passenger 15s 9d
s f Q sim

ABBING, Mr 3rd Class 5547?7 Blacksmith


1 42.0 Southampton NaN
Anthony Passenger 11s ?

ABBOTT,
3rd Class CA2673?
2 Mrs Rhoda 39.0 Southampton ? A NaN
Passenger 20 5s
Mary 'Rosa'

ABBOTT, Mr
3rd Class CA2673?
3 Rossmore 16.0 Southampton Jeweller ? [190] NaN
Passenger 20 5s
Edward

ABBOTT, Mr
3rd Class CA2673?
4 Eugene 13.0 Southampton Scholar ? NaN
Passenger 20 5s
Joseph

In [59]: data = data[["Name","Age","Class/Dept","Boat [Body]"]]


data.head()

Out[59]:
Name Age Class/Dept Boat [Body]

0 AB -AL-MUN , Mr N s f Q sim 27.0 3rd Class Passenger 15

1 ABBING, Mr Anthony 42.0 3rd Class Passenger

2 ABBOTT, Mrs Rhoda Mary 'Rosa' 39.0 3rd Class Passenger A

3 ABBOTT, Mr Rossmore Edward 16.0 3rd Class Passenger [190]

4 ABBOTT, Mr Eugene Joseph 13.0 3rd Class Passenger

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 2/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic

In [60]: def checkPass(class_type):


if "Passenger" in class_type:
return "Passenger"
else:
return "Crew"

data["Crew/Pass"]=data["Class/Dept"].apply(checkPass)
data.head()

Out[60]:
Name Age Class/Dept Boat [Body] Crew/Pass

0 AB -AL-MUN , Mr N s f Q sim 27.0 3rd Class Passenger 15 Passenger

1 ABBING, Mr Anthony 42.0 3rd Class Passenger Passenger

2 ABBOTT, Mrs Rhoda Mary 'Rosa' 39.0 3rd Class Passenger A Passenger

3 ABBOTT, Mr Rossmore Edward 16.0 3rd Class Passenger [190] Passenger

4 ABBOTT, Mr Eugene Joseph 13.0 3rd Class Passenger Passenger

In [61]: def class_person(class_type):


if "Passenger" in class_type:
return class_type.split(" ")[0]
else:
return 'crew'

data['Class'] = data['Class/Dept'].apply(class_person)
data.head()

Out[61]:
Boat
Name Age Class/Dept Crew/Pass Class
[Body]

3rd Class
0 AB -AL-MUN , Mr N s f Q sim 27.0 15 Passenger 3rd
Passenger

3rd Class
1 ABBING, Mr Anthony 42.0 Passenger 3rd
Passenger

ABBOTT, Mrs Rhoda Mary 3rd Class


2 39.0 A Passenger 3rd
'Rosa' Passenger

ABBOTT, Mr Rossmore 3rd Class


3 16.0 [190] Passenger 3rd
Edward Passenger

3rd Class
4 ABBOTT, Mr Eugene Joseph 13.0 Passenger 3rd
Passenger

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 3/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic

In [62]: def child_class(value):


if value>=18:
return 'adult'
else:
return 'child'

data['Adult/Child'] = data['Age'].apply(child_class)
data.head()

Out[62]:
Boat
Name Age Class/Dept Crew/Pass Class Adult/Child
[Body]

AB -AL-MUN , Mr N s f 3rd Class


0 27.0 15 Passenger 3rd adult
Q sim Passenger

3rd Class
1 ABBING, Mr Anthony 42.0 Passenger 3rd adult
Passenger

ABBOTT, Mrs Rhoda 3rd Class


2 39.0 A Passenger 3rd adult
Mary 'Rosa' Passenger

ABBOTT, Mr Rossmore 3rd Class


3 16.0 [190] Passenger 3rd child
Edward Passenger

ABBOTT, Mr Eugene 3rd Class


4 13.0 Passenger 3rd child
Joseph Passenger

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 4/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic

In [63]: def gender_determiner(name):


firstname = name[name.index(",")+ 2:]
salutation = firstname.split(" ")[0]
if salutation in ['Mr','Master']:
return 'Male'
else:
return 'Female'

data['Gender'] = data['Name'].apply(gender_determiner)
data.head()

Out[63]:
Boat
Name Age Class/Dept Crew/Pass Class Adult/Child Gender
[Body]

AB -AL-MUN , Mr 3rd Class


0 27.0 15 Passenger 3rd adult Male
N s f Q sim Passenger

ABBING, Mr 3rd Class


1 42.0 Passenger 3rd adult Male
Anthony Passenger

ABBOTT, Mrs
3rd Class
2 Rhoda Mary 39.0 A Passenger 3rd adult Female
Passenger
'Rosa'

ABBOTT, Mr
3rd Class
3 Rossmore 16.0 [190] Passenger 3rd child Male
Passenger
Edward

ABBOTT, Mr 3rd Class


4 13.0 Passenger 3rd child Male
Eugene Joseph Passenger

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 5/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic

In [65]: def checkSurvival(value):


if value.strip() == " " or "[" in value:
return 0
else:
return 1

data["Survival"]=data["Boat [Body]"].apply(checkSurvival)
data.head()

Out[65]:
Boat
Name Age Class/Dept Crew/Pass Class Adult/Child Gender Survival
[Body]

AB -AL-
MUN , Mr 3rd Class
0 27.0 15 Passenger 3rd adult Male 1
NsfQ Passenger
sim

ABBING,
3rd Class
1 Mr 42.0 Passenger 3rd adult Male 1
Passenger
Anthony

ABBOTT,
Mrs
3rd Class
2 Rhoda 39.0 A Passenger 3rd adult Female 1
Passenger
Mary
'Rosa'

ABBOTT,
Mr 3rd Class
3 16.0 [190] Passenger 3rd child Male 0
Rossmore Passenger
Edward

ABBOTT,
Mr 3rd Class
4 13.0 Passenger 3rd child Male 1
Eugene Passenger
Joseph

In [67]: data.groupby(['Crew/Pass'])['Survival'].sum()*100/data.groupby(['Crew/Pass'])[
'Survival'].count()

Out[67]: Crew/Pass
Crew 90.217391
Passenger 90.310651
Name: Survival, dtype: float64

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 6/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic

In [69]: def compare(group,data):


return data.groupby([group])['Survival'].sum()*100/data.groupby([group])[
'Survival'].count()

compare("Class",data)

Out[69]: Class
1st 89.714286
2nd 88.395904
3rd 91.396333
crew 90.217391
Name: Survival, dtype: float64

In [70]: compare("Gender",data)

Out[70]: Gender
Female 95.840555
Male 88.557743
Name: Survival, dtype: float64

In [71]: compare("Adult/Child",data)

Out[71]: Adult/Child
adult 89.699955
child 95.964126
Name: Survival, dtype: float64

In [72]: trainingData=data[["Age","Crew/Pass","Class","Adult/Child","Gender","Survival"
]]
trainingData.head()

Out[72]:
Age Crew/Pass Class Adult/Child Gender Survival

0 27.0 Passenger 3rd adult Male 1

1 42.0 Passenger 3rd adult Male 1

2 39.0 Passenger 3rd adult Female 1

3 16.0 Passenger 3rd child Male 0

4 13.0 Passenger 3rd child Male 1

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 7/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic

In [73]: def catToNum(series):


series = series.astype('category')
return series.cat.codes

catData=trainingData[["Crew/Pass","Class","Adult/Child","Gender"]].apply(catTo
Num)
trainingData[["Crew/Pass","Class","Adult/Child","Gender"]]=catData
trainingData.head()

C:\Users\hariz\Anaconda3\lib\site-packages\pandas\core\frame.py:3137: Setting
WithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/st


able/indexing.html#indexing-view-versus-copy
self[k1] = value[k2]
Out[73]:
Age Crew/Pass Class Adult/Child Gender Survival

0 27.0 1 2 0 1 1

1 42.0 1 2 0 1 1

2 39.0 1 2 0 0 1

3 16.0 1 2 1 1 0

4 13.0 1 2 1 1 1

In [74]: len(trainingData)

Out[74]: 2456

In [75]: trainingData = trainingData.dropna()


len(trainingData)

Out[75]: 2426

In [76]: from sklearn.model_selection import train_test_split


train, test = train_test_split(trainingData, test_size = 0.2)

In [77]: len(train)

Out[77]: 1940

In [78]: len(test)

Out[78]: 486

In [79]: from sklearn.tree import DecisionTreeClassifier


clf=DecisionTreeClassifier(max_leaf_nodes=25)
clf=clf.fit(train[["Age","Crew/Pass","Class","Adult/Child","Gender"]],train["S
urvival"])

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 8/9
9/20/2018 komal_DT1_EDAWithFunctions_Titanic

In [81]: clf

Out[81]: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,


max_features=None, max_leaf_nodes=25,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, presort=False, random_state=None,
splitter='best')

In [82]: clf.feature_importances_

Out[82]: array([0.72325166, 0.03119177, 0.15634522, 0. , 0.08921135])

In [83]: predictions = clf.predict(test[["Age","Crew/Pass","Class","Adult/Child","Gende


r"]])

In [89]: from sklearn.metrics import accuracy_score


accuracy_score(test["Survival"], predictions)

Out[89]: 0.8847736625514403

file:///D:/KOMAL/SIMPLILEARN/MY%20COURSES/IN%20PROGRESS/My%20Codes_ML_DS/pdf%20conversion/htmls/komal_DT1_EDAWithFunctio… 9/9