
In [5]:

# Import the needed modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the Iris data set
rawdat = pd.read_excel('./Irisdata.xls')

# Extract the training set: the rows with TRAIN == 1 are assigned to training_set
training_set = rawdat[rawdat['TRAIN'] == 1]
# Convert each column into a NumPy array
sepal_len = training_set['SEPALLEN Length of Sepals'].to_numpy()
sepal_width = training_set['SEPALWID Width of Sepals'].to_numpy()
pedal_len = training_set['PETALLEN Length of Petals'].to_numpy()
pedal_width = training_set['PETALWID Width of Petals'].to_numpy()
type_flower = training_set['IRISTYPE Three types of iris'].to_numpy()
train = {'sepal_len': sepal_len, 'sepal_width': sepal_width,
         'pedal_len': pedal_len, 'pedal_width': pedal_width}
flower = ['SETOSA', 'VERSICOL', 'VIRGINIC']
color = ['red', 'blue', 'green']

# A. Plot Matrix of Scatter Plots

### A scatter helper that draws each flower type in its own color
def scatter(x, y, color, flower, type_flower):
    for i in range(len(color)):
        # x[type_flower == flower[i]] keeps only the points belonging to flower[i]
        plt.scatter(x[type_flower == flower[i]], y[type_flower == flower[i]],
                    color=color[i])
    return

## Draw the matrix of scatter plots

list_of_data = [sepal_len, sepal_width, pedal_len, pedal_width]
name_of_list = ['sepal_len', 'sepal_width', 'pedal_len', 'pedal_width']
import matplotlib as mp
mp.rcParams['figure.figsize'] = (20, 20)  # set the output figure size
num = 1
bin_ = [9, 9, 6, 5]  # adjust the number of bins per feature (following the lecture notes)
for i in range(len(list_of_data)):
    for j in range(len(list_of_data)):
        if i == j:  # on the diagonal, draw a histogram
            plt.subplot(4, 4, num)
            plt.hist(list_of_data[i], bins=bin_[i])
            plt.grid(True)
        else:  # off the diagonal, draw a scatter plot
            plt.subplot(4, 4, num)
            scatter(list_of_data[j], list_of_data[i], color, flower, type_flower)
            plt.grid(True)

        if num in [1, 5, 9, 13]:
            plt.ylabel(name_of_list[i])  # put the y label on the leftmost column
        if num in [13, 14, 15, 16]:
            plt.xlabel(name_of_list[j])  # put the x label on the bottom row
        # set the x-axis range per feature
        if j == 0:
            plt.xlim((4, 8))
        elif j == 1:
            plt.xlim((1.5, 4.5))
        elif j == 2:
            plt.xlim((1, 7))
        else:
            plt.xlim((0, 2.5))

        num += 1
plt.show()

# B. Histogram with Normal Probability Fit & Normal Probability Plot

# Compute the mean and standard deviation of each feature, since they are
# needed later to build the probability density functions
mean = [sepal_len.mean(), sepal_width.mean(), pedal_len.mean(), pedal_width.mean()]
std = [sepal_len.std(), sepal_width.std(), pedal_len.std(), pedal_width.std()]

from scipy.stats import norm
from scipy import stats

mp.rcParams['figure.figsize'] = (20, 10)  # set the output figure size

for i in range(4):
    plt.subplot(2, 4, i + 1)
    plt.title(name_of_list[i])
    n, bins, patches = plt.hist(list_of_data[i], bins=20)
    y = norm.pdf(bins, loc=mean[i], scale=std[i]) * n.max()
    # norm.pdf is normalized, so to show the histogram and the fitted curve on
    # the same axes, the pdf is scaled by the largest bin count of the histogram
    plt.plot(bins, y, 'r')  # draw the fitted Gaussian distribution in red

    plt.subplot(2, 4, 4 + i + 1)  # draw the normal probability plot
    stats.probplot(list_of_data[i], plot=plt)
plt.show()
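#### The overlay above scales the pdf by the tallest bin, which keeps the curve visible but
#### distorts the fitted density. A minimal alternative sketch (not part of the original
#### assignment): draw the histogram with density=True so bars and curve share one scale.
n, bins, _ = plt.hist(sepal_len, bins=20, density=True)  # bar areas integrate to 1
plt.plot(bins, norm.pdf(bins, loc=mean[0], scale=std[0]), 'r')  # unscaled fitted pdf
plt.show()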

# C. Build up the classifier

## QDA (Quadratic Discriminant Analysis)

# Record the prior probabilities first
prior_list = []
for i in range(3):
    # pedal_len[type_flower == flower[i]] returns the data of flower type i;
    # len() then counts the number of samples of each flower type
    prior_list.append(len(pedal_len[type_flower == flower[i]]) / len(training_set))
    print('P(', flower[i], ')=', prior_list[i])

# Store the estimated means in a dict; this mapping makes them convenient to
# access, with no need to memorize which index maps to which data
est_m = {}
# Maximum Likelihood Method
for i in range(3):
    est_m['SEPAL_LEN ' + flower[i]] = sepal_len[type_flower == flower[i]].mean()
for i in range(3):
    est_m['SEPAL_WIDTH ' + flower[i]] = sepal_width[type_flower == flower[i]].mean()
for i in range(3):
    est_m['PEDAL_LEN ' + flower[i]] = pedal_len[type_flower == flower[i]].mean()
for i in range(3):
    est_m['PEDAL_WIDTH ' + flower[i]] = pedal_width[type_flower == flower[i]].mean()

est_std = {}
for i in range(3):
    est_std['SEPAL_LEN ' + flower[i]] = sepal_len[type_flower == flower[i]].std()
for i in range(3):
    est_std['SEPAL_WIDTH ' + flower[i]] = sepal_width[type_flower == flower[i]].std()
for i in range(3):
    est_std['PEDAL_LEN ' + flower[i]] = pedal_len[type_flower == flower[i]].std()
for i in range(3):
    est_std['PEDAL_WIDTH ' + flower[i]] = pedal_width[type_flower == flower[i]].std()


# Wrap the QDA discriminant expression in a function
import math
def qda_discriminate(mean, std_deviation, prior, x):
    return -math.log(std_deviation) - (x - mean)**2 / (2 * std_deviation**2) \
           + math.log(prior)
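#### For reference, this discriminant is the log of the class-conditional Gaussian density
#### times the prior: $g_i(x)=\ln p(x\mid\omega_i)+\ln P(\omega_i)
#### =-\frac{1}{2}\ln(2\pi)-\ln\sigma_i-\frac{(x-\mu_i)^2}{2\sigma_i^2}+\ln P(\omega_i)$;
#### the constant $-\frac{1}{2}\ln(2\pi)$ is the same for every class, so the code drops it.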

# QDA classifier
def QDA_sepal_len_classifier(x):
    g = []  # create an empty list g
    for i in range(3):
        # For each flower type, look up est_m and est_std under the SEPAL_LEN key,
        # feed them into the discriminant function, and record the score in g
        g.append(qda_discriminate(est_m['SEPAL_LEN ' + flower[i]],
                                  est_std['SEPAL_LEN ' + flower[i]],
                                  prior_list[i], x))
    # Find the maximum value in g and its index
    max_g = g[0]
    max_i = 0
    for i in range(3):
        if max_g < g[i]:
            max_i = i
            max_g = g[i]
    return flower[max_i]  # return the flower name corresponding to that index


def QDA_sepal_width_classifier(x):
    g = []
    for i in range(3):
        g.append(qda_discriminate(est_m['SEPAL_WIDTH ' + flower[i]],
                                  est_std['SEPAL_WIDTH ' + flower[i]],
                                  prior_list[i], x))
    max_g = g[0]
    max_i = 0
    for i in range(3):
        if max_g < g[i]:
            max_i = i
            max_g = g[i]
    return flower[max_i]

def QDA_pedal_width_classifier(x):
    g = []
    for i in range(3):
        g.append(qda_discriminate(est_m['PEDAL_WIDTH ' + flower[i]],
                                  est_std['PEDAL_WIDTH ' + flower[i]],
                                  prior_list[i], x))
    max_g = g[0]
    max_i = 0
    for i in range(3):
        if max_g < g[i]:
            max_i = i
            max_g = g[i]
    return flower[max_i]

def QDA_pedal_len_classifier(x):
    g = []
    for i in range(3):
        g.append(qda_discriminate(est_m['PEDAL_LEN ' + flower[i]],
                                  est_std['PEDAL_LEN ' + flower[i]],
                                  prior_list[i], x))
    max_g = g[0]
    max_i = 0
    for i in range(3):
        if max_g < g[i]:
            max_i = i
            max_g = g[i]
    return flower[max_i]

# Pack all the classifiers into a dictionary for convenient later use
qda_classifier = {'sepal_len': QDA_sepal_len_classifier,
                  'sepal_width': QDA_sepal_width_classifier,
                  'pedal_len': QDA_pedal_len_classifier,
                  'pedal_width': QDA_pedal_width_classifier}

# Since LDA assumes the classes share a common variance, we compute a weighted
# average of the class variances as the estimated parameter,
# using the prior probabilities as the weights
sepal_len_weighted_var = est_std['SEPAL_LEN SETOSA']**2 * prior_list[0] + \
    est_std['SEPAL_LEN VERSICOL']**2 * prior_list[1] + \
    est_std['SEPAL_LEN VIRGINIC']**2 * prior_list[2]
sepal_width_weighted_var = est_std['SEPAL_WIDTH SETOSA']**2 * prior_list[0] + \
    est_std['SEPAL_WIDTH VERSICOL']**2 * prior_list[1] + \
    est_std['SEPAL_WIDTH VIRGINIC']**2 * prior_list[2]
pedal_len_weighted_var = est_std['PEDAL_LEN SETOSA']**2 * prior_list[0] + \
    est_std['PEDAL_LEN VERSICOL']**2 * prior_list[1] + \
    est_std['PEDAL_LEN VIRGINIC']**2 * prior_list[2]
pedal_width_weighted_var = est_std['PEDAL_WIDTH SETOSA']**2 * prior_list[0] + \
    est_std['PEDAL_WIDTH VERSICOL']**2 * prior_list[1] + \
    est_std['PEDAL_WIDTH VIRGINIC']**2 * prior_list[2]
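#### In equation form, the shared variance estimated above is the prior-weighted
#### average of the per-class variances: $\hat{\sigma}^2=\sum_{i=1}^{3}P(\omega_i)\,\hat{\sigma}_i^2$.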

# Wrap the LDA discriminant in a function
def lda_discriminate(mean, variance, prior, x):
    return (2*x*mean - mean**2) / (2*variance) + math.log(prior)
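#### With a shared variance, the Gaussian discriminant $-\ln\sigma-\frac{(x-\mu_i)^2}{2\sigma^2}+\ln P(\omega_i)$
#### has $-\ln\sigma$ and $-\frac{x^2}{2\sigma^2}$ terms that are identical for every class; dropping
#### them leaves exactly the expression in the code, $\frac{2x\mu_i-\mu_i^2}{2\sigma^2}+\ln P(\omega_i)$.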

# LDA classifier functions
def LDA_sepal_len_classifier(x):
    g = []
    for i in range(3):
        g.append(lda_discriminate(est_m['SEPAL_LEN ' + flower[i]],
                                  sepal_len_weighted_var, prior_list[i], x))
    max_g = g[0]
    max_i = 0
    for i in range(3):
        if max_g < g[i]:
            max_i = i
            max_g = g[i]
    return flower[max_i]
def LDA_sepal_width_classifier(x):
    g = []
    for i in range(3):
        g.append(lda_discriminate(est_m['SEPAL_WIDTH ' + flower[i]],
                                  sepal_width_weighted_var, prior_list[i], x))
    max_g = g[0]
    max_i = 0
    for i in range(3):
        if max_g < g[i]:
            max_i = i
            max_g = g[i]
    return flower[max_i]

def LDA_pedal_len_classifier(x):
    g = []
    for i in range(3):
        g.append(lda_discriminate(est_m['PEDAL_LEN ' + flower[i]],
                                  pedal_len_weighted_var, prior_list[i], x))
    max_g = g[0]
    max_i = 0
    for i in range(3):
        if max_g < g[i]:
            max_i = i
            max_g = g[i]
    return flower[max_i]

def LDA_pedal_width_classifier(x):
    g = []
    for i in range(3):
        g.append(lda_discriminate(est_m['PEDAL_WIDTH ' + flower[i]],
                                  pedal_width_weighted_var, prior_list[i], x))
    max_g = g[0]
    max_i = 0
    for i in range(3):
        if max_g < g[i]:
            max_i = i
            max_g = g[i]
    return flower[max_i]

# Pack all the classifiers into a dictionary for convenient later use
lda_classifier = {'sepal_len': LDA_sepal_len_classifier,
                  'sepal_width': LDA_sepal_width_classifier,
                  'pedal_len': LDA_pedal_len_classifier,
                  'pedal_width': LDA_pedal_width_classifier}

## NMC: Nearest Mean Classifier

### No other new estimated parameters are needed

# Wrap the NMC discriminant in a function
def nmc_discriminate(mean, x):
    return 2*x*mean - mean**2
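#### Maximizing this score picks the nearest class mean: since $2x\mu_i-\mu_i^2=x^2-(x-\mu_i)^2$
#### and $x^2$ does not depend on the class, $\arg\max_i\,(2x\mu_i-\mu_i^2)=\arg\min_i\,(x-\mu_i)^2$.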

def nmc_sepal_len_classifier(x):
    g = []
    for i in range(3):
        g.append(nmc_discriminate(est_m['SEPAL_LEN ' + flower[i]], x))
    max_g = g[0]
    max_i = 0
    for i in range(3):
        if max_g < g[i]:
            max_i = i
            max_g = g[i]
    return flower[max_i]

def nmc_sepal_width_classifier(x):
    g = []
    for i in range(3):
        g.append(nmc_discriminate(est_m['SEPAL_WIDTH ' + flower[i]], x))
    max_g = g[0]
    max_i = 0
    for i in range(3):
        if max_g < g[i]:
            max_i = i
            max_g = g[i]
    return flower[max_i]

def nmc_pedal_len_classifier(x):
    g = []
    for i in range(3):
        g.append(nmc_discriminate(est_m['PEDAL_LEN ' + flower[i]], x))
    max_g = g[0]
    max_i = 0
    for i in range(3):
        if max_g < g[i]:
            max_i = i
            max_g = g[i]
    return flower[max_i]

def nmc_pedal_width_classifier(x):
    g = []
    for i in range(3):
        g.append(nmc_discriminate(est_m['PEDAL_WIDTH ' + flower[i]], x))
    max_g = g[0]
    max_i = 0
    for i in range(3):
        if max_g < g[i]:
            max_i = i
            max_g = g[i]
    return flower[max_i]

nmc_classifier = {'sepal_len': nmc_sepal_len_classifier,
                  'sepal_width': nmc_sepal_width_classifier,
                  'pedal_len': nmc_pedal_len_classifier,
                  'pedal_width': nmc_pedal_width_classifier}
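#### The twelve classifier functions above differ only in the feature and the discriminant.
#### As a refactoring sketch (make_classifier is a hypothetical helper, not part of the
#### original code), a single factory could build any of them:
def make_classifier(score):
    # score(i, x) returns the discriminant value of class i at point x
    def classify(x):
        g = [score(i, x) for i in range(3)]
        return flower[g.index(max(g))]  # name of the best-scoring class
    return classify

# e.g. an equivalent of QDA_sepal_len_classifier:
qda_sepal_len = make_classifier(
    lambda i, x: qda_discriminate(est_m['SEPAL_LEN ' + flower[i]],
                                  est_std['SEPAL_LEN ' + flower[i]],
                                  prior_list[i], x))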

# Extract the test set from the raw data
test_set = rawdat[rawdat['TRAIN'] == 0]
test_sepal_len = test_set['SEPALLEN Length of Sepals'].to_numpy()
test_sepal_width = test_set['SEPALWID Width of Sepals'].to_numpy()
test_pedal_len = test_set['PETALLEN Length of Petals'].to_numpy()
test_pedal_width = test_set['PETALWID Width of Petals'].to_numpy()
test_type_flower = test_set['IRISTYPE Three types of iris'].to_numpy()
test = {'sepal_len': test_sepal_len, 'sepal_width': test_sepal_width,
        'pedal_len': test_pedal_len, 'pedal_width': test_pedal_width}

## Confusion Matrix
### Create the confusion matrix function; it will be used for all the classifiers

# Wrap the construction of the confusion matrix in a function
def confusion_matrix(classifier, testdata, sol, flower):
    size = len(flower)
    confusion_m = np.zeros((size + 1, size + 1))

    for i in range(len(testdata)):
        predict = classifier(testdata[i])
        col = flower.index(sol[i])    # the column corresponds to the observed class
        row = flower.index(predict)   # the row corresponds to the predicted class
        confusion_m[row, col] += 1

    # This part is added to build up the sums
    for i in range(size):
        confusion_m[i, -1] = confusion_m[i, :size].sum()  # the last column holds each row's total
    for i in range(size):
        confusion_m[-1, i] = confusion_m[:size, i].sum()  # the last row holds each column's total
    # -----------------------------------------------------------
    confusion_m[-1, -1] = confusion_m[:size, -1].sum()  # the bottom-right corner is the total test size
    return confusion_m
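#### A worked reading of the layout: in the pedal_width matrix printed below, the diagonal
#### holds the correct predictions, $15+20+14=49$ of the 50 test samples, so the accuracy is
#### $49/50=0.98$; the single off-diagonal 1 is one observed VERSICOL predicted as VIRGINIC.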

# Since the different classifiers were packed into dictionaries earlier,
# this function can generate the confusion matrices for any of them
def Analysis_Matrix(Classifier, test, test_type_flower):
    Dict = {}
    for i in range(4):
        name = name_of_list[i]
        Dict[name] = confusion_matrix(Classifier[name], test[name],
                                      test_type_flower, flower)
        # the returned confusion matrices are keyed by the feature name
    return Dict

QDA_matrix = Analysis_Matrix(qda_classifier, test, test_type_flower)
LDA_matrix = Analysis_Matrix(lda_classifier, test, test_type_flower)
NMC_matrix = Analysis_Matrix(nmc_classifier, test, test_type_flower)
Confusion_Matrix = [QDA_matrix, LDA_matrix, NMC_matrix]
# packing everything into one big Confusion_Matrix list just simplifies the output
print('----------------Confusion Matrix-----------------')
Confusion_Matrix_name = ['QDA_matrix', 'LDA_matrix', 'NMC_matrix']
Average_Accuracy = {}
All_Accuracy = {}
for i in range(len(Confusion_Matrix)):
    a = Confusion_Matrix_name[i]
    print(a)  # first print the analysis method behind this set of confusion matrices
    sum_of_accuracy = 0

    for b in Confusion_Matrix[i]:
        print(b)  # print the key of Confusion_Matrix[i], i.e. the feature name
        print(Confusion_Matrix[i][b])  # print the confusion matrix itself
        diagonal_sum = Confusion_Matrix[i][b].diagonal().sum()  # trace
        total_test = Confusion_Matrix[i][b][-1, -1]  # bottom-right entry: the total test size
        # the trace includes the bottom-right total, so subtracting it leaves
        # the number of correct predictions
        accuracy = (diagonal_sum - total_test) / total_test
        print('Accuracy: ', accuracy)
        All_Accuracy[a + b] = accuracy
        sum_of_accuracy += accuracy  # accumulate to compute the average accuracy later
    print('Average Accuracy', sum_of_accuracy / len(Confusion_Matrix[i]))
    Average_Accuracy[a] = sum_of_accuracy / len(Confusion_Matrix[i])
    # the average accuracy is stored in the Average_Accuracy dictionary, so it can
    # later be accessed directly with the analysis method name as key
    print('------------------------------------')

print('\n\n--------------------Average Accuracy--------------------------')
print(Average_Accuracy)

# Rank:
## 1. QDA & NMC
## 2. LDA

#### In the plots below, the Gaussian PDF drawn in black uses the estimated mean
#### from the training set, with the weighted variance from the LDA estimated parameters
#### The Gaussian distribution drawn in yellow uses the mean and std of the test set
#### The histograms come from the test-set data, plotted separately for each flower
list_of_test = [test_sepal_len, test_sepal_width, test_pedal_len, test_pedal_width]
import matplotlib as mp
mp.rcParams['figure.figsize'] = (20, 4)  # set the output figure size
lim = [(4, 7.5), (2, 4), (0, 8), (0, 3)]
weighted_variance = [sepal_len_weighted_var, sepal_width_weighted_var,
                     pedal_len_weighted_var, pedal_width_weighted_var]
for j in range(len(flower)):
    num = 1
    for i in range(len(list_of_test)):
        plt.subplot(1, 4, num)
        data = list_of_test[i][test_type_flower == flower[j]]

        n, bins, patch = plt.hist(data, bins=10, color=color[j])

        m = est_m[name_of_list[i].upper() + ' ' + flower[j]]
        y = norm.pdf(bins, loc=m, scale=math.sqrt(weighted_variance[i])) * n.max()
        plt.plot(bins, y, 'k')
        y_test = norm.pdf(bins, loc=data.mean(), scale=data.std()) * n.max()
        plt.plot(bins, y_test, 'y')
        plt.grid(True)
        plt.xlim(lim[num - 1])
        plt.title(flower[j] + ' ' + name_of_list[i])
        num += 1
    plt.show()

print('----------------------- Accuracy for each analysis method and feature ------------------------')
for j in range(len(name_of_list)):
    for i in range(len(Confusion_Matrix)):
        a = Confusion_Matrix_name[i]
        print(a, name_of_list[j], All_Accuracy[a + name_of_list[j]])

### We can see that the only feature where the methods disagree is sepal_len

## So let's print out -1/(2*weighted variance) and log(prior probability)
print('******************************** -1/(2*weighted variance) **********************************')
for i in range(len(weighted_variance)):
    print(name_of_list[i], -1 / (2 * weighted_variance[i]))

print('\n\n\n******************************** log(prior probability) *********************************')
for i in range(len(prior_list)):
    print(flower[i], math.log(prior_list[i]))

## The weighted variance of sepal_len has the smallest influence of the four
## features (its -1/(2*weighted variance) factor is the closest to zero),
## so the prior probability's influence is relatively larger;
## this is very likely why LDA's accuracy with the sepal_len classifier is
## lower than that of the other two methods
P( SETOSA )= 0.35
P( VERSICOL )= 0.29
P( VIRGINIC )= 0.36
----------------Confusion Matrix-----------------
QDA_matrix
pedal_width
[[ 15. 0. 0. 15.]
[ 0. 20. 0. 20.]
[ 0. 1. 14. 15.]
[ 15. 21. 14. 50.]]
Accuracy: 0.98
sepal_width
[[ 11. 0. 2. 13.]
[ 1. 7. 3. 11.]
[ 3. 14. 9. 26.]
[ 15. 21. 14. 50.]]
Accuracy: 0.54
sepal_len
[[ 14. 2. 0. 16.]
[ 1. 11. 3. 15.]
[ 0. 8. 11. 19.]
[ 15. 21. 14. 50.]]
Accuracy: 0.72
pedal_len
[[ 15. 0. 0. 15.]
[ 0. 18. 0. 18.]
[ 0. 3. 14. 17.]
[ 15. 21. 14. 50.]]
Accuracy: 0.94
Average Accuracy 0.795
------------------------------------
LDA_matrix
pedal_width
[[ 15. 0. 0. 15.]
[ 0. 20. 0. 20.]
[ 0. 1. 14. 15.]
[ 15. 21. 14. 50.]]
Accuracy: 0.98
sepal_width
[[ 11. 0. 2. 13.]
[ 1. 7. 3. 11.]
[ 3. 14. 9. 26.]
[ 15. 21. 14. 50.]]
Accuracy: 0.54
sepal_len
[[ 14. 2. 0. 16.]
[ 1. 10. 3. 14.]
[ 0. 9. 11. 20.]
[ 15. 21. 14. 50.]]
Accuracy: 0.7
pedal_len
[[ 15. 0. 0. 15.]
[ 0. 18. 0. 18.]
[ 0. 3. 14. 17.]
[ 15. 21. 14. 50.]]
Accuracy: 0.94
Average Accuracy 0.79
------------------------------------
NMC_matrix
pedal_width
[[ 15. 0. 0. 15.]
[ 0. 20. 0. 20.]
[ 0. 1. 14. 15.]
[ 15. 21. 14. 50.]]
Accuracy: 0.98
sepal_width
[[ 11. 0. 2. 13.]
[ 1. 10. 6. 17.]
[ 3. 11. 6. 20.]
[ 15. 21. 14. 50.]]
Accuracy: 0.54
sepal_len
[[ 14. 2. 0. 16.]
[ 1. 11. 3. 15.]
[ 0. 8. 11. 19.]
[ 15. 21. 14. 50.]]
Accuracy: 0.72
pedal_len
[[ 15. 0. 0. 15.]
[ 0. 18. 0. 18.]
[ 0. 3. 14. 17.]
[ 15. 21. 14. 50.]]
Accuracy: 0.94
Average Accuracy 0.795
------------------------------------

--------------------Average Accuracy--------------------------
{'NMC_matrix': 0.79500000000000004, 'LDA_matrix': 0.78999999999999992, 'QDA_matrix': 0.79500000000000004}
----------------------- Accuracy for each analysis method and feature ------------------------
QDA_matrix sepal_len 0.72
LDA_matrix sepal_len 0.7
NMC_matrix sepal_len 0.72
QDA_matrix sepal_width 0.54
LDA_matrix sepal_width 0.54
NMC_matrix sepal_width 0.54
QDA_matrix pedal_len 0.94
LDA_matrix pedal_len 0.94
NMC_matrix pedal_len 0.94
QDA_matrix pedal_width 0.98
LDA_matrix pedal_width 0.98
NMC_matrix pedal_width 0.98
******************************** -1/(2*weighted variance) **********************************
sepal_len -1.87688852277
sepal_width -4.32780649309
pedal_len -2.66146400711
pedal_width -10.9069131084

******************************** log(prior probability) *********************************
SETOSA -1.0498221244986778
VERSICOL -1.2378743560016174
VIRGINIC -1.0216512475319814

