Sie sind auf Seite 1von 24

Machine Learning LAB CSA-405


AIM: Implement Exploratory Data Analysis on any data set.

Dataset Used: Iris Flower Data Set
# Importing packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
# %matplotlib inline  # IPython magic: enable only when running inside a Jupyter notebook

# Load the dataset. The raw file has no header row, so the column names are
# supplied explicitly.
# NOTE(review): the URL was missing from the original listing; the UCI copy of
# the Iris data matches the five column names below -- confirm the source.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pd.read_csv(url, names=names)

# Analyse the data frame and print useful information about the data:
# its shape, column info and data types, the first rows, summary statistics
# (count, mean, quantiles, min, max) and the number of rows per class.
print(dataset.shape)
dataset.info()
print(dataset.head())
print(dataset.describe())
print(dataset.groupby('class').size())

# Graphical representation of the dataset: box plot, histogram and
# scatter matrix.
dataset.plot(kind='box', subplots=True, layout=(2,2), sharex=False, sharey=False)
plt.show()

dataset.hist()
plt.show()

scatter_matrix(dataset)
plt.show()


Machine Learning LAB CSA-405

Machine Learning LAB CSA-405


AIM: Implement Linear Regression on any data set.

Dataset Used: Random values generated with the NumPy library
# Importing packages
import numpy as np
import tensorflow as tf
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
# %matplotlib inline  # IPython magic: enable only when running inside a Jupyter notebook
plt.rcParams['figure.figsize'] = (10,6)

# NOTE(review): this script uses tf.Session, tf.train.GradientDescentOptimizer
# and tf.global_variables_initializer, i.e. the TensorFlow 1.x graph API --
# confirm the installed TensorFlow version provides them.

# Trial data: independent variable "x" and dependent variable "y = a*x + b"
x = np.arange(0.0,5.0,0.1)
a,b = 1,0
y = a*x+b

# Draw the relationship between the dependent and the independent variable
plt.plot(x,y)
plt.ylabel('Dependent Variable')
plt.xlabel('Independent Variable')
plt.show()

# Training data: random inputs "x_data" and targets "y_data" on the line
# y = 3x + 2, with Gaussian noise (std 0.1) added to every target value.
x_data = np.random.rand(100).astype(np.float32)
y_data = 3*x_data+2
y_data = np.vectorize(lambda y:y+np.random.normal(loc=0.0,scale=0.1))(y_data)

# Trainable slope and intercept (arbitrary start values) and the model output
a = tf.Variable(1.0)
b = tf.Variable(0.2)
y = a*x_data+b

# Mean squared error between model output and targets, minimised with
# gradient descent at a learning rate of 0.5.
loss = tf.reduce_mean(tf.square(y-y_data))
optimizer = tf.train.GradientDescentOptimizer(0.5)
train = optimizer.minimize(loss)

# Initialise the variables before the first training step
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)

# Run 100 optimisation steps; every 5th step print and keep the current (a, b)
train_data = []
for step in range(100):
    # sess.run returns [train_result, a, b]; drop the training op's result
    evals = sess.run([train,a,b])[1:]
    if step%5 == 0:
        print(step," ",evals)
        train_data.append(evals)

# Draw one regression line per recorded (a, b) pair, shifting the line colour
# a little at each step, then overlay the data points.
cr,cg,cb = (1.0,1.0,1.0)
for f in train_data:
    cb += 1.0/len(train_data)
    cg -= 1.0/len(train_data)
    # keep the colour components inside [0, 1]
    if cb> 1.0:
        cb = 1.0
    if cg < 0.0:
        cg = 0.0
    # separate names so the tf.Variable objects a and b are not overwritten
    slope, intercept = f
    f_y = slope*x_data+intercept
    line = plt.plot(x_data,f_y)
    plt.setp(line, color=(cr,cg,cb))

plt.plot(x_data, y_data, 'ro')
green_line = mpatches.Patch(color='red',label='Data Points')
plt.legend(handles=[green_line])
plt.show()

Machine Learning LAB CSA-405


AIM: Program to implement Confusion Matrix and ROC Curve.

Dataset Used: Iris Flower imported from sklearn library
# Import the libraries needed to fit a logistic regression and evaluate it
# with a confusion matrix and a ROC curve.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

# Split the dataset into a training and a test set so the model can be
# evaluated on unseen data.
# NOTE(review): both data arguments were missing from the original listing;
# the Iris features and targets from load_iris() are presumed -- confirm.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
# Fit the logistic regression and predict the test set
logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
pred_logreg = logreg.predict(X_test)
print("logreg score: %f" % logreg.score(X_test, y_test))

# Confusion matrix of the true test labels against the predictions of the
# logistic regression fitted above.
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_test, pred_logreg)
print(confusion)

# ROC curve evaluation on a binary task built from the digits dataset.
from sklearn.metrics import precision_recall_curve
from sklearn.datasets import make_blobs
from sklearn.svm import SVC
from sklearn.datasets import load_digits
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
digits = load_digits()
# NOTE(review): the left-hand operand was missing from the original listing;
# "digit is a 9" (digits.target == 9) is presumed as the binary target.
y = digits.target == 9

X_train, X_test, y_train, y_test = train_test_split(digits.data, y, random_state=0)
plt.figure()
for gamma in [1, 0.05, 0.01]:
    svc = SVC(gamma=gamma).fit(X_train, y_train)
    accuracy = svc.score(X_test, y_test)
    # compute the decision scores once and reuse them for AUC and the curve
    scores = svc.decision_function(X_test)
    auc = roc_auc_score(y_test, scores)
    fpr, tpr, _ = roc_curve(y_test, scores)
    print("gamma = %.02f accuracy = %.02f AUC = %.02f" % (gamma, accuracy, auc))
    plt.plot(fpr, tpr, label="gamma=%.03f" % gamma, linewidth=4)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend(loc="best")
plt.show()

Machine Learning LAB CSA-405


AIM: Implement Support Vector Machine on any dataset and analyse the accuracy
with Logistic Regression.
Dataset Used: Iris Flower dataset
# Importing library to load dataset
import pandas as pd
dataset = pd.read_csv('/Iris.csv')
# Show the top 5 rows of the Iris data frame
print(dataset.head())

# Delete the unwanted column from the data frame and collect the distinct
# target values using a set.
dataset = dataset.drop(['Id'],axis=1)
target = dataset['Species']
s = list(set(target))
# The Iris dataset has three classes. Drop rows 100-149 so that only two
# classes remain, which leaves a binary classification problem.
rows = list(range(100,150))
dataset = dataset.drop(dataset.index[rows])

# Four features are available, but only two are used: sepal length and petal
# length. Take these two features and split them by position into the first
# 50 rows and the remaining rows for plotting.
import matplotlib.pyplot as plt
x = dataset['SepalLengthCm']
y = dataset['PetalLengthCm']
setosa_x = x[:50]
setosa_y = y[:50]
versicolor_x = x[50:]
versicolor_y = y[50:]
# Scatter plot of the two groups, to give an idea of where the separating
# hyperplane should lie.
# NOTE(review): the plotting calls were missing from the original listing;
# marker and colour choices are arbitrary.
plt.figure(figsize=(8,6))
plt.scatter(setosa_x,setosa_y,marker='+',color='green')
plt.scatter(versicolor_x,versicolor_y,marker='_',color='red')
plt.show()

# Split the dataset into a training and a test set
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import numpy as np
# Drop every feature except the two selected ones and the target
dataset = dataset.drop(['SepalWidthCm','PetalWidthCm'],axis=1)
# Encode the target as -1 / +1, as needed by the hinge-loss update below.
# NOTE(review): the loop body was missing from the original listing; the
# label 'Iris-setosa' for the first class is presumed -- confirm against the CSV.
target = dataset['Species']
Y = [-1 if val == 'Iris-setosa' else 1 for val in target]
dataset = dataset.drop(['Species'],axis=1)
X = dataset.values.tolist()
# Shuffle and split the data into training (90%) and test (10%) sets
X,Y = shuffle(X,Y)
x_train,x_test,y_train,y_test = train_test_split(X,Y,train_size=0.9)
x_train,y_train = np.array(x_train),np.array(y_train)
x_test,y_test = np.array(x_test),np.array(y_test)
# Column vectors; -1 lets NumPy derive the row count (90 and 10 for 100 rows)
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)

# SVM training with a learning rate of 0.0001 for 10000 iterations. The
# regularisation factor is 1/epochs, where epochs is the iteration number,
# so the regularisation shrinks as the number of epochs grows.
train_f1 = x_train[:,0]
train_f2 = x_train[:,1]
train_f1 = train_f1.reshape(90,1)
train_f2 = train_f2.reshape(90,1)
# One weight pair per training row: w1 and w2 are (90, 1) arrays
w1 = np.zeros((90,1))
w2 = np.zeros((90,1))
# epochs starts at 1 because it is used as a divisor (1/epochs)
epochs = 1
alpha = 0.0001
while epochs < 10000:
    y = w1*train_f1 + w2*train_f2
    prod = y*y_train
    count = 0
    for val in prod:
        if val >= 1:
            # margin satisfied: zero hinge cost, apply only the regularisation step
            cost = 0
            w1 = w1-alpha*(2*1/epochs*w1)
            w2 = w2-alpha*(2*1/epochs*w2)
        else:
            # margin violated: hinge cost 1 - y*f(x), step towards the sample
            cost = 1-val
            w1 = w1+alpha*(train_f1[count]*y_train[count]-2*1/epochs*w1)
            w2 = w2+alpha*(train_f2[count]*y_train[count]-2*1/epochs*w2)
        count += 1
    epochs += 1

11 | P a g e
Machine Learning LAB CSA-405
# Clip the weights down to 10 entries because the test data contains only 10
# data points, extract the test features, predict, and compare the
# predictions with the actual values to get the accuracy of the model.
from sklearn.metrics import accuracy_score
#Weight clipping: delete entries 10..89, keeping the first 10
index = list(range(10,90))
w1 = np.delete(w1,index)
w2 = np.delete(w2,index)
w1 = w1.reshape(10,1)
w2 = w2.reshape(10,1)
#Extracting the test data features
test_f1 = x_test[:,0]
test_f2 = x_test[:,1]
test_f1 = test_f1.reshape(10,1)
test_f2 = test_f2.reshape(10,1)
#Now prediction will be started
y_pred = w1*test_f1 + w2*test_f2
# NOTE(review): the loop body was missing from the original listing; the sign
# of the decision value is used as the class (+1 / -1) -- confirm the threshold.
predictions = [1 if val > 0 else -1 for val in y_pred]
print(accuracy_score(y_test,predictions))

# Fit a logistic regression with scikit-learn so that its accuracy can be
# compared with the SVM above.
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
# NOTE(review): both data arguments were missing from the original listing;
# the Iris features and targets from load_iris() are presumed -- confirm.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=0)
#Implementing the Logistic Regression in below steps
logreg = LogisticRegression(C=0.1).fit(X_train, y_train)
pred_logreg = logreg.predict(X_test)
print("logreg score: %f" % logreg.score(X_test, y_test))

We can now clearly see that the prediction accuracy of logistic regression is around 63.15%, which is
much lower than the prediction accuracy of the SVM. We can therefore say that the SVM classifies this
dataset more accurately than logistic regression.

12 | P a g e
Machine Learning LAB CSA-405


AIM: Implement K-Nearest Neighbor on any dataset.

Dataset Used: Iris Flower dataset
#Importing necessary libraries to implement the program
import pandas as pd
import numpy as np
import math
import operator
#Load the dataset
# NOTE(review): the path "/" is a bare directory, not a CSV file -- it looks
# truncated; confirm the intended file name.
x = pd.read_csv("/")

#Now we are calculating Euclidean distance in this step

def ED(x1, x2, length):
    """Return the Euclidean distance over the first ``length`` components.

    Components are read as ``x1[i]`` and ``x2[i]`` for ``i`` in
    ``range(length)``, so any pair of objects supporting that indexing and
    subtraction works. The result has whatever type NumPy produces for the
    element type (a scalar for plain numbers).
    """
    distance = 0
    # "i" rather than "x", so the module-level dataset name is not shadowed
    for i in range(length):
        distance += np.square(x1[i] - x2[i])
    return np.sqrt(distance)

#Here we are defining our model

def knn(trainingSet, testInstance, k):
    """Classify ``testInstance`` by majority vote among its k nearest rows.

    Args:
        trainingSet: DataFrame accessed with ``.iloc``; the last value of
            each row is taken as the class label.
        testInstance: object with ``.shape[1]`` giving the number of
            features. Presumably a one-row DataFrame with row label 0 and
            integer column labels, since the distance is read as ``dist[0]``
            -- TODO confirm.
        k: number of neighbours that vote.

    Returns:
        Tuple ``(label, neighbors)``: the label with the most votes and the
        list of row positions of the k nearest training rows, nearest first.
    """
    length = testInstance.shape[1]
    distances = {}
    for i in range(len(trainingSet)):
        dist = ED(testInstance, trainingSet.iloc[i], length)
        distances[i] = dist[0]
    sortdist = sorted(distances.items(), key=operator.itemgetter(1))
    # row positions of the k closest training rows
    neighbors = [position for position, _ in sortdist[:k]]
    Votes = {}
    for position in neighbors:
        # .iloc[-1] reads the last value by position, independent of the
        # column labels
        response = trainingSet.iloc[position].iloc[-1]
        Votes[response] = Votes.get(response, 0) + 1
    sortvotes = sorted(Votes.items(), key=operator.itemgetter(1), reverse=True)
    return (sortvotes[0][0], neighbors)

# Take some arbitrary values as the test instance, call the function
# implemented above and print the calculated result.
testSet = [[6.8, 3.4, 4.8, 2.4]]
test = pd.DataFrame(testSet)
# NOTE(review): k was used but never defined in the original listing;
# 1 is presumed -- confirm.
k = 1
k1 = 3
result,neigh = knn(x, test, k)
result1,neigh1 = knn(x, test, k1)
print(result, neigh)
print(result1, neigh1)

14 | P a g e
Machine Learning LAB CSA-405


AIM: Implement Random Forest on any dataset.

Dataset Used: Sonar dataset which describes sonar chirp returns bouncing off different surfaces
#Importing necessary libraries to implement the program
from random import seed
from random import randrange
from csv import reader
from math import sqrt

#In this step we are going to build a function to load our dataset.
def load_csv(filename):
    """Read a CSV file and return its non-empty rows as lists of strings."""
    # newline='' is what the csv module documents for files passed to reader
    with open(filename, 'r', newline='') as file:
        return [row for row in reader(file) if row]

'''Now that the dataset is loaded, the string values are converted to numeric
values and the output column is converted from strings to the integer values 0 and 1.
Helper functions for the validation of the model are defined as well.'''
def str_column_to_float(dataset, column):
    """Convert, in place, ``row[column]`` of every row from string to float.

    Surrounding whitespace is stripped before conversion. Returns None.
    """
    for row in dataset:
        row[column] = float(row[column].strip())

def str_column_to_int(dataset, column):
    """Replace, in place, the values of ``column`` by integer codes.

    The distinct values are numbered 0, 1, ... in sorted order, so the
    mapping is the same on every run (iterating a plain set of strings can
    give a different order from run to run). The values must therefore be
    mutually comparable.

    Returns:
        The dict mapping each original value to its integer code.
    """
    distinct = sorted({row[column] for row in dataset})
    lookup = {value: code for code, value in enumerate(distinct)}
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

def cross_validation_split(dataset, n_folds):
    """Split ``dataset`` into ``n_folds`` random folds of equal size.

    Each fold holds ``len(dataset) // n_folds`` rows drawn without
    replacement; leftover rows are not assigned to any fold. The input list
    itself is not modified (rows are drawn from a shallow copy).

    Returns:
        A list of ``n_folds`` lists of rows.
    """
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = len(dataset) // n_folds
    for _ in range(n_folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            # pop so that a row can end up in at most one fold
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split

def accuracy_metric(actual, predicted):
    """Return the percentage (0-100) of positions where the two lists agree.

    Returns 0.0 for empty input instead of dividing by zero.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    if len(actual) != len(predicted):
        raise ValueError("actual and predicted must have the same length")
    if not actual:
        return 0.0
    correct = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
    return correct / float(len(actual)) * 100.0

def evaluate_algorithm(dataset, algorithm, n_folds, *args):
    """Evaluate ``algorithm`` with k-fold cross-validation.

    For every fold, the rows of all other folds form the training set and
    copies of the fold's rows, with the last value set to None, form the
    test set. ``algorithm(train_set, test_set, *args)`` must return one
    prediction per test row.

    Returns:
        A list with one accuracy percentage per fold.
    """
    folds = cross_validation_split(dataset, n_folds)
    scores = list()
    for held_out, fold in enumerate(folds):
        # training rows: every fold except the one held out, flattened
        train_set = [
            row
            for position, other in enumerate(folds)
            if position != held_out
            for row in other
        ]
        test_set = list()
        for row in fold:
            row_copy = list(row)
            # hide the label so the algorithm cannot read it
            row_copy[-1] = None
            test_set.append(row_copy)
        predicted = algorithm(train_set, test_set, *args)
        actual = [row[-1] for row in fold]
        accuracy = accuracy_metric(actual, predicted)
        scores.append(accuracy)
    return scores

#Split a dataset based on an attribute and an attribute value

def test_split(index, value, dataset):
left, right = list(), list()
for row in dataset:
if row[index] < value:
return left, right

#Calculate the Gini index for a split dataset

def gini_index(groups, classes):
    """Return the size-weighted Gini index of a split.

    Args:
        groups: iterable of row lists; the class of a row is ``row[-1]``.
        classes: the class values to score.

    Returns:
        The sum over groups of ``(1 - sum(p_class ** 2)) * size / total``.
        Empty groups contribute nothing.
    """
    #Count all samples at split point
    n_instances = float(sum(len(group) for group in groups))
    #Sum weighted Gini index for each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        #Avoid divide by zero
        if size == 0:
            continue
        # extract the labels once instead of once per class
        labels = [row[-1] for row in group]
        score = 0.0
        #Score the group based on the score for each class
        for class_val in classes:
            p = labels.count(class_val) / size
            score += p * p
        #Weight the group score by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini
#Select the best split point for a dataset
def get_split(dataset, n_features):
    """Find the best split of ``dataset`` over a random subset of features.

    ``n_features`` distinct column indices (the last column, the class, is
    excluded) are drawn at random. Every value of those columns is tried as
    a split point and the one with the lowest Gini index is kept.

    Returns:
        Dict with keys 'index', 'value' and 'groups' describing the split.
    """
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    n_columns = len(dataset[0]) - 1
    # cannot draw more distinct columns than exist; without this cap the
    # sampling loop below would never terminate
    n_features = min(n_features, n_columns)
    features = list()
    while len(features) < n_features:
        index = randrange(n_columns)
        if index not in features:
            features.append(index)
    for index in features:
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index':b_index, 'value':b_value, 'groups':b_groups}
#Create a terminal node value
def to_terminal(group):
    """Return the most common class (``row[-1]``) among the rows of ``group``."""
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)
#Create child splits for a node or make terminal
def split(node, max_depth, min_size, n_features, depth):
    """Recursively grow the tree below ``node``, in place.

    ``node`` is a dict as returned by ``get_split``. Its 'groups' entry is
    consumed and 'left' / 'right' are set either to a class value (terminal)
    or to a child node dict that is split further.

    Args:
        node: split dict holding 'groups' = (left_rows, right_rows).
        max_depth: depth at which both children become terminal.
        min_size: a child with at most this many rows becomes terminal.
        n_features: number of features tried per split.
        depth: depth of ``node`` (the root is built with depth 1).
    """
    left, right = node['groups']
    # the row lists are no longer needed on the node once unpacked
    del node['groups']
    #Check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    #Check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    #Process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left, n_features)
        split(node['left'], max_depth, min_size, n_features, depth+1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right, n_features)
        split(node['right'], max_depth, min_size, n_features, depth+1)
#Build a decision tree
def build_tree(train, max_depth, min_size, n_features):
    """Build a decision tree from ``train`` and return its root node dict."""
    root = get_split(train, n_features)
    # the root is at depth 1
    split(root, max_depth, min_size, n_features, 1)
    return root
#Make a prediction with a decision tree
def predict(node, row):
    """Return the class predicted for ``row`` by the tree rooted at ``node``.

    At each node the row goes left when ``row[node['index']] < node['value']``
    and right otherwise. A child that is a dict is descended into; any other
    child is returned as the prediction.
    """
    branch = 'left' if row[node['index']] < node['value'] else 'right'
    child = node[branch]
    if isinstance(child, dict):
        return predict(child, row)
    return child
#Create a random subsample from the dataset with replacement
def subsample(dataset, ratio):
    """Return a random sample of rows drawn with replacement.

    The sample holds ``round(len(dataset) * ratio)`` rows; the same row may
    appear more than once.
    """
    n_sample = round(len(dataset) * ratio)
    return [dataset[randrange(len(dataset))] for _ in range(n_sample)]
#Make a prediction with a list of bagged trees
def bagging_predict(trees, row):
    """Return the class predicted most often for ``row`` by the given trees."""
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)

def random_forest(train, test, max_depth, min_size, sample_size, n_trees, n_features):
    """Train a random forest on ``train`` and predict every row of ``test``.

    Each of the ``n_trees`` trees is built on its own subsample of ``train``
    (``sample_size`` is the ratio passed to ``subsample``).

    Returns:
        A list with one bagged prediction per row of ``test``.
    """
    trees = list()
    for _ in range(n_trees):
        sample = subsample(train, sample_size)
        tree = build_tree(sample, max_depth, min_size, n_features)
        trees.append(tree)
    predictions = [bagging_predict(trees, row) for row in test]
    return predictions

18 | P a g e
Machine Learning LAB CSA-405

# All the major operations are now available as functions. Call them on the
# actual dataset to test the random forest algorithm.
#Test the random forest algorithm
# Seed the generator so the folds, subsamples and feature draws repeat from
# run to run.
# NOTE(review): no seed call appears in the original listing although `seed`
# is imported; the value 2 is arbitrary -- confirm.
seed(2)
#Load and prepare data
filename = '/sonar.all-data.csv'
dataset = load_csv(filename)
#Convert string attributes to floats
for i in range(0, len(dataset[0])-1):
    str_column_to_float(dataset, i)
#Convert class column to integers
str_column_to_int(dataset, len(dataset[0])-1)
#Evaluate algorithm
n_folds = 5
max_depth = 10
min_size = 1
sample_size = 1.0
# number of features tried per split: square root of the number of inputs
n_features = int(sqrt(len(dataset[0])-1))
for n_trees in [1, 5, 10, 15, 20]:
    scores = evaluate_algorithm(dataset, random_forest, n_folds, max_depth, min_size, sample_size, n_trees,
                                n_features)
    print('Trees: %d' % n_trees)
    print('Scores: %s' % scores)
    print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

19 | P a g e
Machine Learning LAB CSA-405


AIM: Implement K-means clustering on any sample dataset.

Dataset Used: Iris Flower dataset
#Importing necessary libraries to implement the program
from copy import deepcopy
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

#Load the dataset and delete the unwanted columns from data frame
df = pd.read_csv("/Iris.csv")
# Drop the row-id column if present, so that columns 0-3 are the features
# and column 4 is the species.
df = df.drop(columns=['Id'], errors='ignore')

# Change categorical data to number 0-2
df["Species"] = pd.Categorical(df["Species"])
df["Species"] = df["Species"].cat.codes
# Change dataframe to numpy matrix
data = df.values[:, 0:4]
category = df.values[:, 4]
# Number of clusters
# NOTE(review): k was used but never defined in the original listing; 3 matches
# the "0-2" species codes and the three plot colours.
k = 3
# Number of training data
n = data.shape[0]
# Number of features in the data
c = data.shape[1]
# Generate random centers, here we use sigma and mean to ensure it represent the whole data
mean = np.mean(data, axis = 0)
std = np.std(data, axis = 0)
centers = np.random.randn(k,c)*std + mean

# Plot the data and the centers generated as random

colors=['orange', 'blue', 'green']
# one colour per point, chosen by its species code; a single scatter call
# draws all points
point_colors = [colors[int(code)] for code in category]
plt.scatter(data[:, 0], data[:, 1], s=7, c=point_colors)
plt.scatter(centers[:,0], centers[:,1], marker='*', c='g', s=150)
plt.show()

centers_old = np.zeros(centers.shape) # to store old centers

centers_new = deepcopy(centers) # Store new centers
clusters = np.zeros(n)
distances = np.zeros((n,k))
error = np.linalg.norm(centers_new - centers_old)
# When, after an update, the estimate of that center stays the same, exit loop
while error != 0:
    # Measure the distance to every *current* center (centers_new); measuring
    # against the initial `centers` would never move the assignments
    for i in range(k):
        distances[:,i] = np.linalg.norm(data - centers_new[i], axis=1)
    # Assign all training data to closest center
    clusters = np.argmin(distances, axis = 1)

    centers_old = deepcopy(centers_new)
    # Calculate mean for every cluster and update the center
    for i in range(k):
        members = data[clusters == i]
        # an empty cluster keeps its previous center; the mean of no rows is
        # NaN, which would keep `error != 0` true forever
        if len(members) > 0:
            centers_new[i] = np.mean(members, axis=0)
    error = np.linalg.norm(centers_new - centers_old)

21 | P a g e
Machine Learning LAB CSA-405
# Plot the data and the centers found by k-means
colors=['orange', 'blue', 'green']
# one colour per point, chosen by its species code
point_colors = [colors[int(code)] for code in category]
plt.scatter(data[:, 0], data[:, 1], s=7, c=point_colors)
plt.scatter(centers_new[:,0], centers_new[:,1], marker='*', c='g', s=150)
plt.show()

22 | P a g e
Machine Learning LAB CSA-405


AIM: Implement Principal Component Analysis on any sample dataset.

Dataset Used: MNIST dataset
#Importing necessary libraries to implement the program
# fetch_mldata has been removed from scikit-learn (mldata.org is offline);
# fetch_openml is its replacement.
from sklearn.datasets import fetch_openml
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import pandas as pd

# NOTE(review): apart from the dataset load, the code for every step below was
# missing from the original listing. It is reconstructed from the step
# comments; parameter values (test size, retained variance, solver) are
# presumed -- confirm.

#Download and Load the dataset
mnist = fetch_openml('mnist_784', version=1)

#Splitting the data into train and test set
train_img, test_img, train_lbl, test_lbl = train_test_split(
    mnist.data, mnist.target, test_size=1/7.0, random_state=0)

#Standardizing the dataset
scaler = StandardScaler()
#Fit on training set only.
scaler.fit(train_img)
#Apply transform to both the training set and the test set.
train_img = scaler.transform(train_img)
test_img = scaler.transform(test_img)

#PCA to speed up the logistic regression algorithm of machine learning
# keep as many components as needed to retain 95% of the variance
pca = PCA(.95)
pca.fit(train_img)
train_img = pca.transform(train_img)
test_img = pca.transform(test_img)

logisticRegr = LogisticRegression(solver='lbfgs')
logisticRegr.fit(train_img, train_lbl)

#Measuring the model performances
predictions = logisticRegr.predict(test_img)
print(metrics.accuracy_score(test_lbl, predictions))


24 | P a g e