
In [1]:  import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import randint

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import warnings
warnings.filterwarnings('ignore')

In [2]:  # train = pd.read_csv("_Data/train.csv", encoding='utf8', index_col='ID')
# test = pd.read_csv("_Data/test.csv", encoding='utf8', index_col='ID')

train = pd.read_csv("_Data/train.csv", encoding='utf8')
test = pd.read_csv("_Data/test.csv", encoding='utf8')

In [3]:  # Features are every column except the first (ID) and the last (target)
train_X = train.iloc[:, 1:-1]
train_y = train.iloc[:, -1]

test_X = test.iloc[:, 1:-1]
test_y = test.iloc[:, -1]

# Encode the class labels as integers; fit on train, reuse the same mapping on test
le = LabelEncoder()
train_y = le.fit_transform(train_y)
test_y = le.transform(test_y)
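
For reference, the fitted encoder keeps the original class names in le.classes_ (their position is the encoded integer), and encoded values can be mapped back with inverse_transform. A minimal sketch, assuming le and train_y from the cell above:

# Inspect the label-to-integer mapping and recover the original labels
print(le.classes_)                      # class names, ordered by encoded value
original_labels = le.inverse_transform(train_y)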

KNN
Base Model 1
In [4]:  knn = KNeighborsClassifier()
param_dist = {'n_neighbors': [1, 20],
              'weights': ['uniform', 'distance'],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'leaf_size': randint(10, 50),
              'p': [1, 2]
              }

knn_cv = RandomizedSearchCV(knn, param_dist, cv=20, n_iter=50, random_state=42)
knn_cv.fit(train_X, train_y)

Out[4]: RandomizedSearchCV(cv=20, error_score='raise-deprecating',
                           estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                                          metric='minkowski', metric_params=None,
                                                          n_jobs=None, n_neighbors=5,
                                                          p=2, weights='uniform'),
                           iid='warn', n_iter=50, n_jobs=None,
                           param_distributions={'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
                                                'leaf_size': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BECD1631D0>,
                                                'n_neighbors': [1, 20], 'p': [1, 2],
                                                'weights': ['uniform', 'distance']},
                           pre_dispatch='2*n_jobs', random_state=42, refit=True,
                           return_train_score=False, scoring=None, verbose=0)

In [5]:  print("Tuned Parameters: {}".format(knn_cv.best_params_))
print("Best score is {}".format(knn_cv.best_score_))

Tuned Parameters: {'algorithm': 'auto', 'leaf_size': 30, 'n_neighbors': 1, 'p': 2, 'weights': 'uniform'}
Best score is 0.6933333333333334
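
Note that 'n_neighbors': [1, 20] gives the search only two candidate values, 1 and 20, which is why the tuned model lands on n_neighbors=1. If the intent was to try every integer in that range, the already-imported randint covers it; a sketch of that alternative (not what was run above):

# Hypothetical alternative: sample n_neighbors uniformly from 1..20 (upper bound exclusive)
param_dist = {'n_neighbors': randint(1, 21),
              'weights': ['uniform', 'distance'],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
              'leaf_size': randint(10, 50),
              'p': [1, 2]}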

In [6]:  knn_cv_score = knn_cv.score(test_X, test_y)

In [7]:  score_comparisons = {'knn': knn_cv_score}

def show_scores(score_dict):
    return pd.DataFrame.from_dict(score_dict, orient='index', columns=['score'])

show_scores(score_comparisons)

Out[7]:
        score
knn  0.702703

SVM Classifier
Base Model 2
In [8]:  svc = SVC()
param_dist = {'C': np.linspace(0.01, 3000, 5000),
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'degree': randint(1, 10),
              'class_weight': ['balanced', None],
              'random_state': [42]
              }

svc_cv = RandomizedSearchCV(svc, param_dist, cv=20, n_iter=50, random_state=42)
svc_cv.fit(train_X, train_y)

Out[8]: RandomizedSearchCV(cv=20, error_score='raise-deprecating',
                           estimator=SVC(C=1.0, cache_size=200, class_weight=None,
                                         coef0=0.0, decision_function_shape='ovr',
                                         degree=3, gamma='auto_deprecated',
                                         kernel='rbf', max_iter=-1, probability=False,
                                         random_state=None, shrinking=True, tol=0.001,
                                         verbose=False),
                           iid='warn', n_iter=50, n_jobs=None,
                           param_distributions={'C': array([1.00000000e-02, 6.10118024e-01, 1.21023605e+00, ...,
                                                            2.99879976e+03, 2.99939988e+03, 3.00000000e+03]),
                                                'class_weight': ['balanced', None],
                                                'degree': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001BECD24D198>,
                                                'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
                                                'random_state': [42]},
                           pre_dispatch='2*n_jobs', random_state=42, refit=True,
                           return_train_score=False, scoring=None, verbose=0)

In [9]:  print("Tuned Parameters: {}".format(svc_cv.best_params_))
print("Best score is {}".format(svc_cv.best_score_))

Tuned Parameters: {'C': 360.0808141628325, 'class_weight': None, 'degree': 2, 'kernel': 'poly', 'random_state': 42}
Best score is 0.8133333333333334
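
Both SVC and KNN are distance/margin based, so feature scaling often changes their results. A minimal sketch of folding a StandardScaler into the search via a Pipeline; this is an optional variation, not something run in this notebook:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Scaling is fit inside each CV fold, so no information leaks across folds
pipe = Pipeline([('scale', StandardScaler()), ('svc', SVC())])
pipe_params = {'svc__C': np.linspace(0.01, 3000, 5000),
               'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid']}
pipe_cv = RandomizedSearchCV(pipe, pipe_params, cv=20, n_iter=50, random_state=42)
# pipe_cv.fit(train_X, train_y)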

In [10]:  svc_cv_score = svc_cv.score(test_X, test_y)

In [11]:  score_comparisons['svc'] = svc_cv_score
show_scores(score_comparisons)

Out[11]:
        score
knn  0.702703
svc  0.837838

Plotting KNN & SVC


In [12]:  # from matplotlib.colors import ListedColormap

In [13]:  # h = .02  # step size in the mesh

# # Create color maps
# cmap_bold = ListedColormap(['#33CC00', '#993399', '#330099', '#FF3366'])
# cmap_light = ListedColormap(['#bde0b1', '#e3c8e3', '#c1aee6', '#ffbfcf'])

In [14]:  # # Plot the decision boundary. For that, we will assign a color to each
# # point in the mesh [x_min, x_max]x[y_min, y_max].
# x_min, x_max = train_X.iloc[:, 0].min() - 1, train_X.iloc[:, 0].max() + 1
# y_min, y_max = train_X.iloc[:, 1].min() - 1, train_X.iloc[:, 1].max() + 1
# xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
#                      np.arange(y_min, y_max, h))

In [15]:  # Z = knn_cv.predict(np.c_[xx.ravel(), yy.ravel()])

# # Put the result into a color plot
# Z = Z.reshape(xx.shape)
# plt.figure()
# plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# # Plot also the training points
# plt.scatter(train_X.iloc[:, 0], train_X.iloc[:, 1], c=train_y, cmap=cmap_bold,
#             edgecolor='k', s=20)
# plt.xlim(xx.min(), xx.max())
# plt.ylim(yy.min(), yy.max())
# plt.title("KNN 4-Class classification")

# neg_padding = 0.7
# plt.xlim(x_min + neg_padding, x_max - neg_padding)
# plt.ylim(y_min + neg_padding, y_max - neg_padding)
# plt.show()

In [16]:  # Z = svc_cv.predict(np.c_[xx.ravel(), yy.ravel()])

# # Put the result into a color plot
# Z = Z.reshape(xx.shape)
# plt.figure()
# plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# # Plot also the training points
# plt.scatter(train_X.iloc[:, 0], train_X.iloc[:, 1], c=train_y, cmap=cmap_bold,
#             edgecolor='k', s=20)
# plt.xlim(xx.min(), xx.max())
# plt.ylim(yy.min(), yy.max())
# plt.title("SVC 4-Class classification")

# neg_padding = 0.7
# plt.xlim(x_min + neg_padding, x_max - neg_padding)
# plt.ylim(y_min + neg_padding, y_max - neg_padding)
# plt.show()
Stacking KNN & SVC
In [17]:  # Get predictions on training & test sets
train_pred_knn = pd.Series(knn_cv.predict(train_X))
train_pred_svc = pd.Series(svc_cv.predict(train_X))

test_pred_knn = pd.Series(knn_cv.predict(test_X))
test_pred_svc = pd.Series(svc_cv.predict(test_X))

# Concat training & test predictions into their own dataframes
df = pd.concat([train_pred_knn, train_pred_svc], axis=1)
df_test = pd.concat([test_pred_knn, test_pred_svc], axis=1)
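
One caveat with this setup: the training meta-features are the base models' predictions on the same rows those models were fit on, so the meta-learner's cross-validation score (the perfect 1.0 below) can overstate how well the stack generalises. A common alternative, sketched here but not run in this notebook, is to build the training meta-features from out-of-fold predictions:

from sklearn.model_selection import cross_val_predict

# Each training row is predicted by a model that never saw it during fitting
oof_knn = cross_val_predict(knn_cv.best_estimator_, train_X, train_y, cv=5)
oof_svc = cross_val_predict(svc_cv.best_estimator_, train_X, train_y, cv=5)
df_oof = pd.concat([pd.Series(oof_knn), pd.Series(oof_svc)], axis=1)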

In [18]:  lr = LogisticRegression()

param_dist = {'C': np.linspace(0.01, 3000, 5000),
              'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
              'class_weight': ['balanced', None],
              'multi_class': ['auto'],
              'random_state': [42]
              }

lr_cv = RandomizedSearchCV(lr, param_dist, cv=40, n_iter=20, random_state=42)
lr_cv.fit(df, train_y)

Out[18]: RandomizedSearchCV(cv=40, error_score='raise-deprecating',
                            estimator=LogisticRegression(C=1.0, class_weight=None,
                                                         dual=False, fit_intercept=True,
                                                         intercept_scaling=1,
                                                         l1_ratio=None, max_iter=100,
                                                         multi_class='warn', n_jobs=None,
                                                         penalty='l2', random_state=None,
                                                         solver='warn', tol=0.0001,
                                                         verbose=0, warm_start=False),
                            iid='warn', n_iter=20, n_jobs=None,
                            param_distributions={'C': array([1.00000000e-02, 6.10118024e-01, 1.21023605e+00, ...,
                                                             2.99879976e+03, 2.99939988e+03, 3.00000000e+03]),
                                                 'class_weight': ['balanced', None],
                                                 'multi_class': ['auto'],
                                                 'random_state': [42],
                                                 'solver': ['newton-cg', 'sag', 'saga', 'lbfgs']},
                            pre_dispatch='2*n_jobs', random_state=42, refit=True,
                            return_train_score=False, scoring=None, verbose=0)

In [19]:  print("Tuned Parameters: {}".format(lr_cv.best_params_))
print("Best score is {}".format(lr_cv.best_score_))

Tuned Parameters: {'solver': 'lbfgs', 'random_state': 42, 'multi_class': 'auto', 'class_weight': 'balanced', 'C': 1184.6429785957191}
Best score is 1.0
In [20]:  lr_cv.best_estimator_

Out[20]: LogisticRegression(C=1184.6429785957191, class_weight='balanced', dual=False,
                            fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                            max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                            random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                            warm_start=False)

In [21]:  lr_cv_score = lr_cv.score(df_test, test_y)

In [22]:  score_comparisons['stacked_lr'] = lr_cv_score
show_scores(score_comparisons)

Out[22]:
               score
knn         0.702703
svc         0.837838
stacked_lr  0.702703
