Sie sind auf Seite 1von 23

12/8/2019 Untitled19

In [53]: import pandas as pd


%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
df=pd.read_csv(r"C:\Users\User\Desktop\Cristano_Ronaldo_Final_v1\data.csv")

In [54]: df.head() #Printing Out the 1st 5 elements

Out[54]:
Unnamed:
match_event_id location_x location_y remaining_min power_of_shot knockout_match game_season remaining_sec distance_of_sho
0

0 0 10.0 167.0 72.0 10.0 1.0 0.0 2000-01 27.0 38

1 1 12.0 -157.0 0.0 10.0 1.0 0.0 2000-01 22.0 35

2 2 35.0 -101.0 135.0 7.0 1.0 0.0 2000-01 45.0 36

3 3 43.0 138.0 175.0 6.0 1.0 0.0 2000-01 52.0 42

4 4 155.0 0.0 0.0 NaN 2.0 0.0 2000-01 19.0 20

localhost:8888/notebooks/Untitled19.ipynb# 1/23
12/8/2019 Untitled19

In [55]: pd.set_option('display.max_columns', None) #this function helps us view all the columns available
df=df.rename(columns={ df.columns[0]: "Index" })
df.head()

Out[55]:
Index match_event_id location_x location_y remaining_min power_of_shot knockout_match game_season remaining_sec distance_of_shot is

0 0 10.0 167.0 72.0 10.0 1.0 0.0 2000-01 27.0 38.0

1 1 12.0 -157.0 0.0 10.0 1.0 0.0 2000-01 22.0 35.0

2 2 35.0 -101.0 135.0 7.0 1.0 0.0 2000-01 45.0 36.0

3 3 43.0 138.0 175.0 6.0 1.0 0.0 2000-01 52.0 42.0

4 4 155.0 0.0 0.0 NaN 2.0 0.0 2000-01 19.0 20.0

In [56]: df.is_goal.unique() #checking out unique values in is_goal

Out[56]: array([nan, 0., 1.])

In [57]: pd.value_counts(df.is_goal) #counting the number of values of each unique type in is_goal

Out[57]: 0.0 13550


1.0 10879
Name: is_goal, dtype: int64

In [58]: df.area_of_shot.unique() #checking out unique values

Out[58]: array(['Right Side(R)', 'Left Side(L)', 'Left Side Center(LC)',


'Right Side Center(RC)', 'Center(C)', nan, 'Mid Ground(MG)'],
dtype=object)

localhost:8888/notebooks/Untitled19.ipynb# 2/23
12/8/2019 Untitled19

In [59]: pd.value_counts(df.area_of_shot) #counting the number of values of each unique type

Out[59]: Center(C) 12761


Right Side Center(RC) 4562
Right Side(R) 4370
Left Side Center(LC) 3848
Left Side(L) 3573
Mid Ground(MG) 81
Name: area_of_shot, dtype: int64

In [60]: df.area_of_shot.fillna("Center(C)",inplace=True) #filling the missing values

In [61]: pd.value_counts(df.area_of_shot) #counting the number of values of each unique type

Out[61]: Center(C) 14263


Right Side Center(RC) 4562
Right Side(R) 4370
Left Side Center(LC) 3848
Left Side(L) 3573
Mid Ground(MG) 81
Name: area_of_shot, dtype: int64

localhost:8888/notebooks/Untitled19.ipynb# 3/23
12/8/2019 Untitled19

In [62]: # Assigning integral values for each unique value


df.area_of_shot.replace({'Center(C)':1, 'Right Side Center(RC)':2,'Right Side(R)':3,'Left Side Center(LC)':4,'Left Side(
df.area_of_shot

Out[62]: 0 3
1 5
2 4
3 2
4 1
5 5
6 1
7 1
8 5
9 1
10 4
11 2
12 5
13 5
14 1
15 1
16 1
17 4
18 4
19 1
20 1
21 2
22 1
23 4
24 1
25 1
26 1
27 4
28 2
29 1
..
30667 1
30668 1
30669 1
30670 1
30671 1

localhost:8888/notebooks/Untitled19.ipynb# 4/23
12/8/2019 Untitled19

30672 3
30673 3
30674 2
30675 4
30676 5
30677 4
30678 1
30679 1
30680 1
30681 1
30682 5
30683 1
30684 5
30685 1
30686 1
30687 1
30688 5
30689 6
30690 5
30691 1
30692 1
30693 1
30694 4
30695 1
30696 1
Name: area_of_shot, Length: 30697, dtype: int64

In [63]: df.area_of_shot.unique() #checking out unique values

Out[63]: array([3, 5, 4, 2, 1, 6], dtype=int64)

In [64]: df.shot_basics.unique() #checking out unique values

Out[64]: array(['Mid Range', 'Goal Area', 'Goal Line', 'Penalty Spot', nan,
'Right Corner', 'Mid Ground Line', 'Left Corner'], dtype=object)

localhost:8888/notebooks/Untitled19.ipynb# 5/23
12/8/2019 Untitled19

In [65]: pd.value_counts(df.shot_basics) #counting the number of values of each unique type

Out[65]: Mid Range 11955


Goal Area 6787
Penalty Spot 5321
Goal Line 4357
Right Corner 367
Left Corner 268
Mid Ground Line 67
Name: shot_basics, dtype: int64

In [66]: df.shot_basics.fillna("Mid Range",inplace=True) #filling the missing values


pd.value_counts(df.shot_basics) #counting the number of values of each unique type

Out[66]: Mid Range 13530


Goal Area 6787
Penalty Spot 5321
Goal Line 4357
Right Corner 367
Left Corner 268
Mid Ground Line 67
Name: shot_basics, dtype: int64

localhost:8888/notebooks/Untitled19.ipynb# 6/23
12/8/2019 Untitled19

In [67]: # Assigning integral values for each unique value


df.shot_basics.replace({'Mid Range':1, 'Goal Area':2,'Penalty Spot':3,'Goal Line':4,'Right Corner':5,'Mid Ground Line':6
df.shot_basics

Out[67]: 0 1
1 1
2 1
3 1
4 2
5 1
6 2
7 2
8 4
9 4
10 3
11 1
12 4
13 1
14 4
15 1
16 2
17 1
18 1
19 1
20 1
21 1
22 4
23 1
24 4
25 2
26 1
27 3
28 1
29 4
..
30667 3
30668 1
30669 1
30670 4
30671 2

localhost:8888/notebooks/Untitled19.ipynb# 7/23
12/8/2019 Untitled19

30672 1
30673 1
30674 3
30675 3
30676 1
30677 1
30678 2
30679 2
30680 2
30681 3
30682 4
30683 1
30684 1
30685 3
30686 4
30687 4
30688 1
30689 6
30690 1
30691 2
30692 1
30693 2
30694 1
30695 3
30696 4
Name: shot_basics, Length: 30697, dtype: int64

In [68]: df.range_of_shot.unique() #checking out unique values

Out[68]: array(['16-24 ft.', '8-16 ft.', 'Less Than 8 ft.', '24+ ft.', nan,
'Back Court Shot'], dtype=object)

In [69]: pd.value_counts(df.range_of_shot) #counting the number of values of each unique type

Out[69]: Less Than 8 ft. 8933


16-24 ft. 7892
8-16 ft. 6290
24+ ft. 5937
Back Court Shot 81
Name: range_of_shot, dtype: int64

localhost:8888/notebooks/Untitled19.ipynb# 8/23
12/8/2019 Untitled19

In [70]: df.range_of_shot.fillna("Less Than 8 ft.",inplace=True) #filling the missing values


pd.value_counts(df.range_of_shot) #counting the number of values of each unique type

Out[70]: Less Than 8 ft. 10497


16-24 ft. 7892
8-16 ft. 6290
24+ ft. 5937
Back Court Shot 81
Name: range_of_shot, dtype: int64

localhost:8888/notebooks/Untitled19.ipynb# 9/23
12/8/2019 Untitled19

In [71]: # Assigning integral values for each unique value


df.range_of_shot.replace({'Less Than 8 ft.':1, '8-16 ft.':2,'16-24 ft.':3,'24+ ft.':4,'Back Court Shot':5},inplace=True)
df.range_of_shot

Out[71]: 0 3
1 2
2 3
3 3
4 1
5 2
6 1
7 1
8 2
9 2
10 4
11 3
12 2
13 2
14 1
15 3
16 1
17 1
18 3
19 1
20 2
21 3
22 2
23 3
24 1
25 1
26 3
27 4
28 3
29 2
..
30667 4
30668 3
30669 3
30670 1
30671 1
30672 2

localhost:8888/notebooks/Untitled19.ipynb# 10/23
12/8/2019 Untitled19

30673 2
30674 4
30675 4
30676 2
30677 3
30678 1
30679 1
30680 1
30681 4
30682 2
30683 3
30684 2
30685 4
30686 2
30687 2
30688 2
30689 5
30690 2
30691 1
30692 1
30693 1
30694 3
30695 1
30696 1
Name: range_of_shot, Length: 30697, dtype: int64

In [72]: pd.value_counts(df.power_of_shot) #counting the number of values of each unique type

Out[72]: 3.0 7885


1.0 7659
4.0 6910
2.0 6399
5.0 314
6.0 37
7.0 7
Name: power_of_shot, dtype: int64

In [73]: df.power_of_shot.fillna(df.power_of_shot.mean(),inplace=True) #filling the missing values

localhost:8888/notebooks/Untitled19.ipynb# 11/23
12/8/2019 Untitled19

In [74]: pd.value_counts(df.power_of_shot) #counting the number of values of each unique type

Out[74]: 3.000000 7885


1.000000 7659
4.000000 6910
2.000000 6399
2.519359 1486
5.000000 314
6.000000 37
7.000000 7
Name: power_of_shot, dtype: int64

In [75]: df.distance_of_shot.fillna(df.distance_of_shot.mean(),inplace=True) #filling the missing values

localhost:8888/notebooks/Untitled19.ipynb# 12/23
12/8/2019 Untitled19

In [76]: df.distance_of_shot

Out[76]: 0 38.000000
1 35.000000
2 36.000000
3 42.000000
4 20.000000
5 34.000000
6 20.000000
7 22.000000
8 32.000000
9 32.000000
10 45.000000
11 37.000000
12 33.448884
13 29.000000
14 25.000000
15 40.000000
16 20.000000
17 45.000000
18 36.000000
19 20.000000
20 34.000000
21 38.000000
22 31.000000
23 38.000000
24 27.000000
25 20.000000
26 40.000000
27 46.000000
28 39.000000
29 28.000000
...
30667 45.000000
30668 42.000000
30669 37.000000
30670 27.000000
30671 20.000000
30672 30.000000
30673 31.000000
30674 44.000000

localhost:8888/notebooks/Untitled19.ipynb# 13/23
12/8/2019 Untitled19

30675 45.000000
30676 34.000000
30677 38.000000
30678 33.448884
30679 20.000000
30680 20.000000
30681 46.000000
30682 28.000000
30683 41.000000
30684 33.000000
30685 46.000000
30686 29.000000
30687 30.000000
30688 33.000000
30689 87.000000
30690 35.000000
30691 20.000000
30692 24.000000
30693 20.000000
30694 41.000000
30695 46.000000
30696 27.000000
Name: distance_of_shot, Length: 30697, dtype: float64

In [ ]:

In [118]: df.location_x.fillna(df.location_x.mean(),inplace=True) #filling the missing values

In [82]: df.location_y.fillna(df.location_y.mean(),inplace=True) #filling the missing values

localhost:8888/notebooks/Untitled19.ipynb# 14/23
12/8/2019 Untitled19

In [84]: import numpy as np


df2=df[~np.isfinite(df.is_goal)] #Assigning all the rows of df with missing values in is_goal to df2
df2

Out[84]:
Index match_event_id location_x location_y remaining_min power_of_shot knockout_match game_season remaining_sec distance_o

0 0 10.0 167.000000 72.000000 10.0 1.000000 0.0 2000-01 27.0

7 7 254.0 1.000000 28.000000 8.0 3.000000 0.0 2000-01 5.0

16 16 100.0 0.000000 0.000000 0.0 1.000000 0.0 2000-01 1.0

19 19 249.0 0.000000 0.000000 10.0 3.000000 0.0 NaN 46.0

21 21 265.0 134.000000 127.000000 9.0 3.000000 0.0 NaN 4.0

32 32 4.0 163.000000 76.000000 11.0 1.000000 0.0 2000-01 26.0

33 33 8.0 70.000000 194.000000 10.0 1.000000 0.0 2000-01 58.0

localhost:8888/notebooks/Untitled19.ipynb# 15/23
12/8/2019 Untitled19

In [85]: df=df[np.isfinite(df.is_goal)] #Assigning all the rows of df with finding values in is_goal to df
df

Out[85]:
Index match_event_id location_x location_y remaining_min power_of_shot knockout_match game_season remaining_sec distance_o

1 1 12.0 -157.000000 0.000000 10.0 1.000000 0.0 2000-01 22.0 35.

2 2 35.0 -101.000000 135.000000 7.0 1.000000 0.0 2000-01 45.0 36.

3 3 43.0 138.000000 175.000000 6.0 1.000000 0.0 2000-01 52.0 42.

4 4 155.0 0.000000 0.000000 NaN 2.000000 0.0 2000-01 19.0 20.

5 5 244.0 -145.000000 -11.000000 9.0 3.000000 0.0 NaN 32.0 34.

6 6 251.0 0.000000 0.000000 8.0 2.519359 0.0 2000-01 52.0 20.

8 8 265.0 -65.000000 91.126933 6.0 3.000000 0.0 2000-01 12.0 32.

In [86]: # The column shot_id_number has a lot of nan values.It is important to fill those values since this column us needed in
# submission file. A loop is written and and the missing values of shot_id_number is assigned by adding to the correspon
# value of the Index column
i=0
while i<6268:
df2.shot_id_number.iloc[i]=df2.Index.iloc[i]+1
i=i+1

C:\ProgramData\Anaconda3\lib\site-packages\pandas\core\indexing.py:189: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-co


py (http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy)
self._setitem_with_indexer(indexer, value)

localhost:8888/notebooks/Untitled19.ipynb# 16/23
12/8/2019 Untitled19

In [87]: df2.shot_id_number

Out[87]: 0 1.0
7 8.0
16 17.0
19 20.0
21 22.0
32 33.0
33 34.0
34 35.0
35 36.0
36 37.0
37 38.0
44 45.0
49 50.0
54 55.0
59 60.0
61 62.0
65 66.0
66 67.0
70 71.0
71 72.0
75 76.0
79 80.0
84 85.0
85 86.0
86 87.0
91 92.0
94 95.0
96 97.0
103 104.0
112 113.0
...
30567 30568.0
30569 30570.0
30580 30581.0
30583 30584.0
30590 30591.0
30593 30594.0
30613 30614.0
30616 30617.0

localhost:8888/notebooks/Untitled19.ipynb# 17/23
12/8/2019 Untitled19

30617 30618.0
30625 30626.0
30629 30630.0
30630 30631.0
30631 30632.0
30633 30634.0
30635 30636.0
30636 30637.0
30638 30639.0
30646 30647.0
30648 30649.0
30655 30656.0
30659 30660.0
30664 30665.0
30668 30669.0
30679 30680.0
30680 30681.0
30681 30682.0
30682 30683.0
30686 30687.0
30687 30688.0
30693 30694.0
Name: shot_id_number, Length: 6268, dtype: float64

In [ ]:

In [88]: model=LogisticRegression() #Logistic regression is used

In [89]: X=df[['location_x','location_y','power_of_shot','distance_of_shot','area_of_shot','shot_basics','range_of_shot']]
# important features are taken to be fed to logistic regression

localhost:8888/notebooks/Untitled19.ipynb# 18/23
12/8/2019 Untitled19

In [90]: X.head()

Out[90]:
location_x location_y power_of_shot distance_of_shot area_of_shot shot_basics range_of_shot

1 -157.0 0.0 1.0 35.0 5 1 2

2 -101.0 135.0 1.0 36.0 4 1 3

3 138.0 175.0 1.0 42.0 2 1 3

4 0.0 0.0 2.0 20.0 1 2 1

5 -145.0 -11.0 3.0 34.0 5 1 2

In [91]: Y=df[['is_goal']]

In [92]: model.fit(X,Y) #the model is trained

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:433: FutureWarning: Default solver will be


changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\validation.py:761: DataConversionWarning: A column-vector y wa
s passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
y = column_or_1d(y, warn=True)

Out[92]: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,


intercept_scaling=1, max_iter=100, multi_class='warn',
n_jobs=None, penalty='l2', random_state=None, solver='warn',
tol=0.0001, verbose=0, warm_start=False)

In [93]: X2=df2[['location_x','location_y','power_of_shot','distance_of_shot','area_of_shot','shot_basics','range_of_shot']]

localhost:8888/notebooks/Untitled19.ipynb# 19/23
12/8/2019 Untitled19

In [94]: X2.head()

Out[94]:
location_x location_y power_of_shot distance_of_shot area_of_shot shot_basics range_of_shot

0 167.0 72.0 1.0 38.0 3 1 3

7 1.0 28.0 3.0 22.0 1 2 1

16 0.0 0.0 1.0 20.0 1 2 1

19 0.0 0.0 3.0 20.0 1 1 1

21 134.0 127.0 3.0 38.0 2 1 3

In [101]: k=model.predict_proba(X2) # probability of occurence of goal is predicted


print(k[:,1])

[0.40169024 0.57141636 0.61488025 ... 0.48234342 0.46908281 0.58139038]

In [102]: df2.is_goal=k

localhost:8888/notebooks/Untitled19.ipynb# 20/23
12/8/2019 Untitled19

In [111]: k2=df2[['shot_id_number','is_goal']]
k2

Out[111]:
shot_id_number is_goal

0 1.0 0.598310

7 8.0 0.428584

16 17.0 0.385120

19 20.0 0.396475

21 22.0 0.600862

32 33.0 0.589593

33 34.0 0.595674

34 35.0 0.396205

35 36.0 0.403566

36 37.0 0.385752

37 38.0 0.594587

44 45.0 0.425638

49 50.0 0.628595

54 55.0 0.697189

59 60.0 0.684408

61 62.0 0.471838

65 66.0 0.532036

66 67.0 0.553568

70 71.0 0.610627

71 72.0 0.630920

75 76.0 0.652395

79 80.0 0.517826

84 85.0 0.473821

localhost:8888/notebooks/Untitled19.ipynb# 21/23
12/8/2019 Untitled19

shot_id_number is_goal

85 86.0 0.580721

86 87.0 0.578107

91 92.0 0.571578

94 95.0 0.541471

96 97.0 0.607626

103 104.0 0.606294

112 113.0 0.645887

... ... ...

30567 30568.0 0.561095

30569 30570.0 0.396179

30580 30581.0 0.611101

30583 30584.0 0.724255

30590 30591.0 0.488396

30593 30594.0 0.396179

30613 30614.0 0.578843

30616 30617.0 0.627606

30617 30618.0 0.481273

30625 30626.0 0.532360

30629 30630.0 0.534439

30630 30631.0 0.627044

30631 30632.0 0.535716

30633 30634.0 0.484675

30635 30636.0 0.432006

30636 30637.0 0.432006

30638 30639.0 0.601569

30646 30647.0 0.688807

localhost:8888/notebooks/Untitled19.ipynb# 22/23
12/8/2019 Untitled19

shot_id_number is_goal

30648 30649.0 0.463663

30655 30656.0 0.569293

30659 30660.0 0.596168

30664 30665.0 0.494496

30668 30669.0 0.615066

30679 30680.0 0.396179

30680 30681.0 0.396179

30681 30682.0 0.647187

30682 30683.0 0.593148

30686 30687.0 0.517657

30687 30688.0 0.530917

30693 30694.0 0.418610

6268 rows × 2 columns

In [115]: # the predicted values are written into a csv file


k2.to_csv("C:/Users/User/Desktop/Cristano_Ronaldo_Final_v1/sample_submission.csv",index=False)

In [ ]:

localhost:8888/notebooks/Untitled19.ipynb# 23/23