Sie sind auf Seite 1von 14

In [1]: import numpy as np # linear algebra

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# List every file available under the Kaggle input directory.
# os.walk yields nothing when the directory does not exist, so this is a
# harmless no-op outside Kaggle.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]: !pip install pyspark


from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from tqdm import tqdm
import pandas as pd

def load_raw(path='C:/Users/Administrator/Notebook/dataset.xlsx'):
    """Load the Excel dataset into a Spark DataFrame whose columns are all strings.

    Parameters
    ----------
    path : str, optional
        Location of the Excel workbook. Defaults to the path this notebook
        originally hard-coded, so a bare ``load_raw()`` call behaves as before.

    Returns
    -------
    tuple
        ``(df, sqlContext)``: the Spark DataFrame built from the workbook and
        the ``SQLContext`` used to create it.
    """
    # Create (or reuse) the local Spark session.
    spark = (
        SparkSession.builder
        .master('local')
        .appName('myAppName')
        .config('spark.executor.memory', '12gb')
        .config("spark.cores.max", "10")
        .getOrCreate()
    )

    # Callers unpack the SQLContext from the return value, so it is still
    # built from the session's SparkContext.
    sqlContext = SQLContext(spark.sparkContext)

    # Cast every column to str in one step (this subsumes the former
    # per-column casts of the virus-panel columns). Missing cells become the
    # literal string "nan".
    # NOTE(review): presumably done so Spark's schema inference does not trip
    # over mixed-type columns -- confirm before relaxing.
    df = pd.read_excel(path).astype(str)

    return sqlContext.createDataFrame(df), sqlContext

Requirement already satisfied: pyspark in c:\anaconda3\lib\site-packages (3.0.1)


Requirement already satisfied: py4j==0.10.9 in c:\anaconda3\lib\site-packages (from pyspark) (0.10.9)

In [3]: df,sqlContext=load_raw()

In [4]: print('Number of lines ',df.count())

Number of lines 5644

Exploration of data: The first operation to perform after importing the data is to get some information about what it looks like. This can be done with the following commands:
In [5]: df.printSchema()
root
|-- Patient ID: string (nullable = true)
|-- Patient age quantile: string (nullable = true)
|-- SARS-Cov-2 exam result: string (nullable = true)
|-- Patient addmited to regular ward (1=yes, 0=no): string (nullable = true)
|-- Patient addmited to semi-intensive unit (1=yes, 0=no): string (nullable = true)
|-- Patient addmited to intensive care unit (1=yes, 0=no): string (nullable = true)
|-- Hematocrit: string (nullable = true)
|-- Hemoglobin: string (nullable = true)
|-- Platelets: string (nullable = true)
|-- Mean platelet volume : string (nullable = true)
|-- Red blood Cells: string (nullable = true)
|-- Lymphocytes: string (nullable = true)
|-- Mean corpuscular hemoglobin concentration (MCHC): string (nullable = true)
|-- Leukocytes: string (nullable = true)
|-- Basophils: string (nullable = true)
|-- Mean corpuscular hemoglobin (MCH): string (nullable = true)
|-- Eosinophils: string (nullable = true)
|-- Mean corpuscular volume (MCV): string (nullable = true)
|-- Monocytes: string (nullable = true)
|-- Red blood cell distribution width (RDW): string (nullable = true)
|-- Serum Glucose: string (nullable = true)
|-- Respiratory Syncytial Virus: string (nullable = true)
|-- Influenza A: string (nullable = true)
|-- Influenza B: string (nullable = true)
|-- Parainfluenza 1: string (nullable = true)
|-- CoronavirusNL63: string (nullable = true)
|-- Rhinovirus/Enterovirus: string (nullable = true)
|-- Mycoplasma pneumoniae: string (nullable = true)
|-- Coronavirus HKU1: string (nullable = true)
|-- Parainfluenza 3: string (nullable = true)
|-- Chlamydophila pneumoniae: string (nullable = true)
|-- Adenovirus: string (nullable = true)
|-- Parainfluenza 4: string (nullable = true)
|-- Coronavirus229E: string (nullable = true)
|-- CoronavirusOC43: string (nullable = true)
|-- Inf A H1N1 2009: string (nullable = true)
|-- Bordetella pertussis: string (nullable = true)
|-- Metapneumovirus: string (nullable = true)
|-- Parainfluenza 2: string (nullable = true)
|-- Neutrophils: string (nullable = true)
|-- Urea: string (nullable = true)
|-- Proteina C reativa mg/dL: string (nullable = true)
|-- Creatinine: string (nullable = true)
|-- Potassium: string (nullable = true)
|-- Sodium: string (nullable = true)
|-- Influenza B, rapid test: string (nullable = true)
|-- Influenza A, rapid test: string (nullable = true)
|-- Alanine transaminase: string (nullable = true)
|-- Aspartate transaminase: string (nullable = true)
|-- Gamma-glutamyltransferase : string (nullable = true)
|-- Total Bilirubin: string (nullable = true)
|-- Direct Bilirubin: string (nullable = true)
|-- Indirect Bilirubin: string (nullable = true)
|-- Alkaline phosphatase: string (nullable = true)
|-- Ionized calcium : string (nullable = true)
|-- Strepto A: string (nullable = true)
|-- Magnesium: string (nullable = true)
|-- pCO2 (venous blood gas analysis): string (nullable = true)
|-- Hb saturation (venous blood gas analysis): string (nullable = true)
|-- Base excess (venous blood gas analysis): string (nullable = true)
|-- pO2 (venous blood gas analysis): string (nullable = true)
|-- Fio2 (venous blood gas analysis): string (nullable = true)
|-- Total CO2 (venous blood gas analysis): string (nullable = true)
|-- pH (venous blood gas analysis): string (nullable = true)
|-- HCO3 (venous blood gas analysis): string (nullable = true)
|-- Rods #: string (nullable = true)
|-- Segmented: string (nullable = true)
|-- Promyelocytes: string (nullable = true)
|-- Metamyelocytes: string (nullable = true)
|-- Myelocytes: string (nullable = true)
|-- Myeloblasts: string (nullable = true)
|-- Urine - Esterase: string (nullable = true)
|-- Urine - Aspect: string (nullable = true)
|-- Urine - pH: string (nullable = true)
|-- Urine - Hemoglobin: string (nullable = true)
|-- Urine - Bile pigments: string (nullable = true)
|-- Urine - Ketone Bodies: string (nullable = true)
|-- Urine - Nitrite: string (nullable = true)
|-- Urine - Density: string (nullable = true)
|-- Urine - Urobilinogen: string (nullable = true)
|-- Urine - Protein: string (nullable = true)
|-- Urine - Sugar: string (nullable = true)
|-- Urine - Leukocytes: string (nullable = true)
|-- Urine - Crystals: string (nullable = true)
|-- Urine - Red blood cells: string (nullable = true)
|-- Urine - Hyaline cylinders: string (nullable = true)
|-- Urine - Granular cylinders: string (nullable = true)
|-- Urine - Yeasts: string (nullable = true)
|-- Urine - Color: string (nullable = true)
|-- Partial thromboplastin time (PTT) : string (nullable = true)
|-- Relationship (Patient/Normal): string (nullable = true)
|-- International normalized ratio (INR): string (nullable = true)
|-- Lactic Dehydrogenase: string (nullable = true)
|-- Prothrombin time (PT), Activity: string (nullable = true)
|-- Vitamin B12: string (nullable = true)
|-- Creatine phosphokinase (CPK) : string (nullable = true)
|-- Ferritin: string (nullable = true)
|-- Arterial Lactic Acid: string (nullable = true)
|-- Lipase dosage: string (nullable = true)
|-- D-Dimer: string (nullable = true)
|-- Albumin: string (nullable = true)
|-- Hb saturation (arterial blood gases): string (nullable = true)
|-- pCO2 (arterial blood gas analysis): string (nullable = true)
|-- Base excess (arterial blood gas analysis): string (nullable = true)
|-- pH (arterial blood gas analysis): string (nullable = true)
|-- Total CO2 (arterial blood gas analysis): string (nullable = true)
|-- HCO3 (arterial blood gas analysis): string (nullable = true)
|-- pO2 (arterial blood gas analysis): string (nullable = true)
|-- Arteiral Fio2: string (nullable = true)
|-- Phosphor: string (nullable = true)
|-- ctO2 (arterial blood gas analysis): string (nullable = true)

In [26]: df.describe()

Out[26]: DataFrame[summary: string, Patient ID: string, Patient age quantile: string, SARS-Cov-2 exam result: string, Patient addmited to regular
ward (1=yes, 0=no): string, Patient addmited to semi-intensive unit (1=yes, 0=no): string, Patient addmited to intensive care unit (1=ye
s, 0=no): string, Hematocrit: string, Hemoglobin: string, Platelets: string, Mean platelet volume : string, Red blood Cells: string, Lym
phocytes: string, Mean corpuscular hemoglobin concentration (MCHC): string, Leukocytes: string, Basophils: string, Mean corpuscular hemo
globin (MCH): string, Eosinophils: string, Mean corpuscular volume (MCV): string, Monocytes: string, Red blood cell distribution width
(RDW): string, Serum Glucose: string, Respiratory Syncytial Virus: string, Influenza A: string, Influenza B: string, Parainfluenza 1: st
ring, CoronavirusNL63: string, Rhinovirus/Enterovirus: string, Mycoplasma pneumoniae: string, Coronavirus HKU1: string, Parainfluenza 3:
string, Chlamydophila pneumoniae: string, Adenovirus: string, Parainfluenza 4: string, Coronavirus229E: string, CoronavirusOC43: string,
Inf A H1N1 2009: string, Bordetella pertussis: string, Metapneumovirus: string, Parainfluenza 2: string, Neutrophils: string, Urea: stri
ng, Proteina C reativa mg/dL: string, Creatinine: string, Potassium: string, Sodium: string, Influenza B, rapid test: string, Influenza
A, rapid test: string, Alanine transaminase: string, Aspartate transaminase: string, Gamma-glutamyltransferase : string, Total Bilirubi
n: string, Direct Bilirubin: string, Indirect Bilirubin: string, Alkaline phosphatase: string, Ionized calcium : string, Strepto A: stri
ng, Magnesium: string, pCO2 (venous blood gas analysis): string, Hb saturation (venous blood gas analysis): string, Base excess (venous
blood gas analysis): string, pO2 (venous blood gas analysis): string, Fio2 (venous blood gas analysis): string, Total CO2 (venous blood
gas analysis): string, pH (venous blood gas analysis): string, HCO3 (venous blood gas analysis): string, Rods #: string, Segmented: stri
ng, Promyelocytes: string, Metamyelocytes: string, Myelocytes: string, Myeloblasts: string, Urine - Esterase: string, Urine - Aspect: st
ring, Urine - pH: string, Urine - Hemoglobin: string, Urine - Bile pigments: string, Urine - Ketone Bodies: string, Urine - Nitrite: str
ing, Urine - Density: string, Urine - Urobilinogen: string, Urine - Protein: string, Urine - Sugar: string, Urine - Leukocytes: string,
Urine - Crystals: string, Urine - Red blood cells: string, Urine - Hyaline cylinders: string, Urine - Granular cylinders: string, Urine
- Yeasts: string, Urine - Color: string, Partial thromboplastin time (PTT) : string, Relationship (Patient/Normal): string, Internationa
l normalized ratio (INR): string, Lactic Dehydrogenase: string, Prothrombin time (PT), Activity: string, Vitamin B12: string, Creatine p
hosphokinase (CPK) : string, Ferritin: string, Arterial Lactic Acid: string, Lipase dosage: string, D-Dimer: string, Albumin: string, Hb
saturation (arterial blood gases): string, pCO2 (arterial blood gas analysis): string, Base excess (arterial blood gas analysis): strin
g, pH (arterial blood gas analysis): string, Total CO2 (arterial blood gas analysis): string, HCO3 (arterial blood gas analysis): strin
g, pO2 (arterial blood gas analysis): string, Arteiral Fio2: string, Phosphor: string, ctO2 (arterial blood gas analysis): string]
In [27]: df.dtypes
Out[27]: [('Patient ID', 'int'),
('Patient age quantile', 'int'),
('SARS-Cov-2 exam result', 'int'),
('Patient addmited to regular ward (1=yes, 0=no)', 'int'),
('Patient addmited to semi-intensive unit (1=yes, 0=no)', 'int'),
('Patient addmited to intensive care unit (1=yes, 0=no)', 'int'),
('Hematocrit', 'int'),
('Hemoglobin', 'int'),
('Platelets', 'int'),
('Mean platelet volume ', 'int'),
('Red blood Cells', 'int'),
('Lymphocytes', 'int'),
('Mean corpuscular hemoglobin concentration\xa0(MCHC)', 'int'),
('Leukocytes', 'int'),
('Basophils', 'int'),
('Mean corpuscular hemoglobin (MCH)', 'int'),
('Eosinophils', 'int'),
('Mean corpuscular volume (MCV)', 'int'),
('Monocytes', 'int'),
('Red blood cell distribution width (RDW)', 'int'),
('Serum Glucose', 'int'),
('Respiratory Syncytial Virus', 'int'),
('Influenza A', 'int'),
('Influenza B', 'int'),
('Parainfluenza 1', 'int'),
('CoronavirusNL63', 'int'),
('Rhinovirus/Enterovirus', 'int'),
('Mycoplasma pneumoniae', 'int'),
('Coronavirus HKU1', 'int'),
('Parainfluenza 3', 'int'),
('Chlamydophila pneumoniae', 'int'),
('Adenovirus', 'int'),
('Parainfluenza 4', 'int'),
('Coronavirus229E', 'int'),
('CoronavirusOC43', 'int'),
('Inf A H1N1 2009', 'int'),
('Bordetella pertussis', 'int'),
('Metapneumovirus', 'int'),
('Parainfluenza 2', 'int'),
('Neutrophils', 'int'),
('Urea', 'int'),
('Proteina C reativa mg/dL', 'int'),
('Creatinine', 'int'),
('Potassium', 'int'),
('Sodium', 'int'),
('Influenza B, rapid test', 'int'),
('Influenza A, rapid test', 'int'),
('Alanine transaminase', 'int'),
('Aspartate transaminase', 'int'),
('Gamma-glutamyltransferase\xa0', 'int'),
('Total Bilirubin', 'int'),
('Direct Bilirubin', 'int'),
('Indirect Bilirubin', 'int'),
('Alkaline phosphatase', 'int'),
('Ionized calcium\xa0', 'int'),
('Strepto A', 'int'),
('Magnesium', 'int'),
('pCO2 (venous blood gas analysis)', 'int'),
('Hb saturation (venous blood gas analysis)', 'int'),
('Base excess (venous blood gas analysis)', 'int'),
('pO2 (venous blood gas analysis)', 'int'),
('Fio2 (venous blood gas analysis)', 'int'),
('Total CO2 (venous blood gas analysis)', 'int'),
('pH (venous blood gas analysis)', 'int'),
('HCO3 (venous blood gas analysis)', 'int'),
('Rods #', 'int'),
('Segmented', 'int'),
('Promyelocytes', 'int'),
('Metamyelocytes', 'int'),
('Myelocytes', 'int'),
('Myeloblasts', 'int'),
('Urine - Esterase', 'int'),
('Urine - Aspect', 'int'),
('Urine - pH', 'int'),
('Urine - Hemoglobin', 'int'),
('Urine - Bile pigments', 'int'),
('Urine - Ketone Bodies', 'int'),
('Urine - Nitrite', 'int'),
('Urine - Density', 'int'),
('Urine - Urobilinogen', 'int'),
('Urine - Protein', 'int'),
('Urine - Sugar', 'int'),
('Urine - Leukocytes', 'int'),
('Urine - Crystals', 'int'),
('Urine - Red blood cells', 'int'),
('Urine - Hyaline cylinders', 'int'),
('Urine - Granular cylinders', 'int'),
('Urine - Yeasts', 'int'),
('Urine - Color', 'int'),
('Partial thromboplastin time\xa0(PTT)\xa0', 'int'),
('Relationship (Patient/Normal)', 'int'),
('International normalized ratio (INR)', 'int'),
('Lactic Dehydrogenase', 'int'),
('Prothrombin time (PT), Activity', 'int'),
('Vitamin B12', 'int'),
('Creatine phosphokinase\xa0(CPK)\xa0', 'int'),
('Ferritin', 'int'),
('Arterial Lactic Acid', 'int'),
('Lipase dosage', 'int'),
('D-Dimer', 'int'),
('Albumin', 'int'),
('Hb saturation (arterial blood gases)', 'int'),
('pCO2 (arterial blood gas analysis)', 'int'),
('Base excess (arterial blood gas analysis)', 'int'),
('pH (arterial blood gas analysis)', 'int'),
('Total CO2 (arterial blood gas analysis)', 'int'),
('HCO3 (arterial blood gas analysis)', 'int'),
('pO2 (arterial blood gas analysis)', 'int'),
('Arteiral Fio2', 'int'),
('Phosphor', 'int'),
('ctO2 (arterial blood gas analysis)', 'int')]

In [29]: #data cleaning and explore data


# Replace nulls with 0.
# NOTE(review): the execution counts suggest this cell (In [29]) ran after the
# integer cast (In [18]); if so, this zeroes the nulls that cast produced.
# On the all-string frame returned by load_raw() it likely changes nothing --
# confirm the intended cell order.
df=df.fillna(0)
# NOTE(review): the star import shadows Python builtins such as max/min/sum
# with their Spark column versions. A later cell calls col(...), which is only
# in scope because of this import.
from pyspark.sql.functions import *
# Turn the literal string "nan" (left by the astype(str) conversion in
# load_raw) into the string "0".
df=df.replace("nan", "0")
# Preview the first five rows as a pandas DataFrame for rich display.
pd.DataFrame(df.head(5),columns=df.schema.names)

Out[29]:
Patient ID | Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | ... | Hb saturation (arterial blood gases) | pCO2 (arterial blood gas analysis) | Base excess (arterial blood gas analysis) | pH (arterial blood gas analysis) | Total CO2 (arterial blood gas analysis) | HCO3 (arterial blood gas analysis)

0 0 13 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0

1 0 17 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0

2 0 8 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0

3 0 5 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0

4 0 15 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0

5 rows × 111 columns

In [30]: #Hemoglobin values observation


# Pull the Hemoglobin column into pandas, coerce it to numeric, and plot its
# distribution as a histogram.
df_hemoglobin = (
    df.select("Hemoglobin")
    .toPandas()
    .assign(Hemoglobin=lambda frame: pd.to_numeric(frame["Hemoglobin"]))
)
df_hemoglobin["Hemoglobin"].hist()

Out[30]: <matplotlib.axes._subplots.AxesSubplot at 0xa5f7af0>

In [31]: df.select("SARS-Cov-2 exam result").show(5)

+----------------------+
|SARS-Cov-2 exam result|
+----------------------+
| 0|
| 0|
| 0|
| 0|
| 0|
+----------------------+
only showing top 5 rows
In [34]: df.select("CoronavirusNL63").show(5)
df.select("CoronavirusOC43").show(5)

+---------------+
|CoronavirusNL63|
+---------------+
| 0|
| 0|
| 0|
| 0|
| 0|
+---------------+
only showing top 5 rows

+---------------+
|CoronavirusOC43|
+---------------+
| 0|
| 0|
| 0|
| 0|
| 0|
+---------------+
only showing top 5 rows

In [35]: df_=df.select(col("SARS-Cov-2 exam result").alias("result"),col('Patient age quantile').alias('age'))


df_.show(5)

+------+---+
|result|age|
+------+---+
| 0| 13|
| 0| 17|
| 0| 8|
| 0| 5|
| 0| 15|
+------+---+
only showing top 5 rows
In [49]: df.printSchema()
root
|-- Patient ID: integer (nullable = true)
|-- Patient age quantile: integer (nullable = true)
|-- SARS-Cov-2 exam result: integer (nullable = true)
|-- Patient addmited to regular ward (1=yes, 0=no): integer (nullable = true)
|-- Patient addmited to semi-intensive unit (1=yes, 0=no): integer (nullable = true)
|-- Patient addmited to intensive care unit (1=yes, 0=no): integer (nullable = true)
|-- Hematocrit: integer (nullable = true)
|-- Hemoglobin: integer (nullable = true)
|-- Platelets: integer (nullable = true)
|-- Mean platelet volume : integer (nullable = true)
|-- Red blood Cells: integer (nullable = true)
|-- Lymphocytes: integer (nullable = true)
|-- Mean corpuscular hemoglobin concentration (MCHC): integer (nullable = true)
|-- Leukocytes: integer (nullable = true)
|-- Basophils: integer (nullable = true)
|-- Mean corpuscular hemoglobin (MCH): integer (nullable = true)
|-- Eosinophils: integer (nullable = true)
|-- Mean corpuscular volume (MCV): integer (nullable = true)
|-- Monocytes: integer (nullable = true)
|-- Red blood cell distribution width (RDW): integer (nullable = true)
|-- Serum Glucose: integer (nullable = true)
|-- Respiratory Syncytial Virus: integer (nullable = true)
|-- Influenza A: integer (nullable = true)
|-- Influenza B: integer (nullable = true)
|-- Parainfluenza 1: integer (nullable = true)
|-- CoronavirusNL63: integer (nullable = true)
|-- Rhinovirus/Enterovirus: integer (nullable = true)
|-- Mycoplasma pneumoniae: integer (nullable = true)
|-- Coronavirus HKU1: integer (nullable = true)
|-- Parainfluenza 3: integer (nullable = true)
|-- Chlamydophila pneumoniae: integer (nullable = true)
|-- Adenovirus: integer (nullable = true)
|-- Parainfluenza 4: integer (nullable = true)
|-- Coronavirus229E: integer (nullable = true)
|-- CoronavirusOC43: integer (nullable = true)
|-- Inf A H1N1 2009: integer (nullable = true)
|-- Bordetella pertussis: integer (nullable = true)
|-- Metapneumovirus: integer (nullable = true)
|-- Parainfluenza 2: integer (nullable = true)
|-- Neutrophils: integer (nullable = true)
|-- Urea: integer (nullable = true)
|-- Proteina C reativa mg/dL: integer (nullable = true)
|-- Creatinine: integer (nullable = true)
|-- Potassium: integer (nullable = true)
|-- Sodium: integer (nullable = true)
|-- Influenza B, rapid test: integer (nullable = true)
|-- Influenza A, rapid test: integer (nullable = true)
|-- Alanine transaminase: integer (nullable = true)
|-- Aspartate transaminase: integer (nullable = true)
|-- Gamma-glutamyltransferase : integer (nullable = true)
|-- Total Bilirubin: integer (nullable = true)
|-- Direct Bilirubin: integer (nullable = true)
|-- Indirect Bilirubin: integer (nullable = true)
|-- Alkaline phosphatase: integer (nullable = true)
|-- Ionized calcium : integer (nullable = true)
|-- Strepto A: integer (nullable = true)
|-- Magnesium: integer (nullable = true)
|-- pCO2 (venous blood gas analysis): integer (nullable = true)
|-- Hb saturation (venous blood gas analysis): integer (nullable = true)
|-- Base excess (venous blood gas analysis): integer (nullable = true)
|-- pO2 (venous blood gas analysis): integer (nullable = true)
|-- Fio2 (venous blood gas analysis): integer (nullable = true)
|-- Total CO2 (venous blood gas analysis): integer (nullable = true)
|-- pH (venous blood gas analysis): integer (nullable = true)
|-- HCO3 (venous blood gas analysis): integer (nullable = true)
|-- Rods #: integer (nullable = true)
|-- Segmented: integer (nullable = true)
|-- Promyelocytes: integer (nullable = true)
|-- Metamyelocytes: integer (nullable = true)
|-- Myelocytes: integer (nullable = true)
|-- Myeloblasts: integer (nullable = true)
|-- Urine - Esterase: integer (nullable = true)
|-- Urine - Aspect: integer (nullable = true)
|-- Urine - pH: integer (nullable = true)
|-- Urine - Hemoglobin: integer (nullable = true)
|-- Urine - Bile pigments: integer (nullable = true)
|-- Urine - Ketone Bodies: integer (nullable = true)
|-- Urine - Nitrite: integer (nullable = true)
|-- Urine - Density: integer (nullable = true)
|-- Urine - Urobilinogen: integer (nullable = true)
|-- Urine - Protein: integer (nullable = true)
|-- Urine - Sugar: integer (nullable = true)
|-- Urine - Leukocytes: integer (nullable = true)
|-- Urine - Crystals: integer (nullable = true)
|-- Urine - Red blood cells: integer (nullable = true)
|-- Urine - Hyaline cylinders: integer (nullable = true)
|-- Urine - Granular cylinders: integer (nullable = true)
|-- Urine - Yeasts: integer (nullable = true)
|-- Urine - Color: integer (nullable = true)
|-- Partial thromboplastin time (PTT) : integer (nullable = true)
|-- Relationship (Patient/Normal): integer (nullable = true)
|-- International normalized ratio (INR): integer (nullable = true)
|-- Lactic Dehydrogenase: integer (nullable = true)
|-- Prothrombin time (PT), Activity: integer (nullable = true)
|-- Vitamin B12: integer (nullable = true)
|-- Creatine phosphokinase (CPK) : integer (nullable = true)
|-- Ferritin: integer (nullable = true)
|-- Arterial Lactic Acid: integer (nullable = true)
|-- Lipase dosage: integer (nullable = true)
|-- D-Dimer: integer (nullable = true)
|-- Albumin: integer (nullable = true)
|-- Hb saturation (arterial blood gases): integer (nullable = true)
|-- pCO2 (arterial blood gas analysis): integer (nullable = true)
|-- Base excess (arterial blood gas analysis): integer (nullable = true)
|-- pH (arterial blood gas analysis): integer (nullable = true)
|-- Total CO2 (arterial blood gas analysis): integer (nullable = true)
|-- HCO3 (arterial blood gas analysis): integer (nullable = true)
|-- pO2 (arterial blood gas analysis): integer (nullable = true)
|-- Arteiral Fio2: integer (nullable = true)
|-- Phosphor: integer (nullable = true)
|-- ctO2 (arterial blood gas analysis): integer (nullable = true)

In [36]: df.groupBy('Hemoglobin').count().show()

+----------+-----+
|Hemoglobin|count|
+----------+-----+
| -1| 64|
| 1| 80|
| -4| 1|
| -2| 12|
| 2| 7|
| -3| 7|
| 0| 5473|
+----------+-----+

In [13]: df_.printSchema()

root
|-- result: string (nullable = true)
|-- age: string (nullable = true)

In [21]: rdd = df.rdd

In [23]: rdd = df.rdd.map(list)

In [14]: import pyspark.sql.functions as func

In [16]: #Aggregation
df_.groupBy("result").agg(func.max("age"), func.avg("age")).show()

+--------+--------+------------------+
| result|max(age)| avg(age)|
+--------+--------+------------------+
|positive| 9|10.630824372759857|
|negative| 9| 9.174400314589068|
+--------+--------+------------------+

In [17]: df_pandas_age=df_.groupBy("result").agg(func.max("age"), func.avg("age")).toPandas()


df_pandas_age.plot()

Out[17]: <matplotlib.axes._subplots.AxesSubplot at 0xbc619d0>

In [18]: from pyspark.sql.types import IntegerType


# Cast every column to IntegerType in a single projection. Chaining one
# withColumn call per column (~100 here) builds a very deep query plan, so a
# single select is used instead.
# NOTE(review): an integer cast likely turns non-numeric strings (e.g. the
# "positive"/"negative" exam result) into null and truncates fractional lab
# values -- confirm this is intended before modelling on the result.
columns = df.schema.names
df = df.select([df[column].cast(IntegerType()).alias(column) for column in columns])
In [19]: df.printSchema()
root
|-- Patient ID: integer (nullable = true)
|-- Patient age quantile: integer (nullable = true)
|-- SARS-Cov-2 exam result: integer (nullable = true)
|-- Patient addmited to regular ward (1=yes, 0=no): integer (nullable = true)
|-- Patient addmited to semi-intensive unit (1=yes, 0=no): integer (nullable = true)
|-- Patient addmited to intensive care unit (1=yes, 0=no): integer (nullable = true)
|-- Hematocrit: integer (nullable = true)
|-- Hemoglobin: integer (nullable = true)
|-- Platelets: integer (nullable = true)
|-- Mean platelet volume : integer (nullable = true)
|-- Red blood Cells: integer (nullable = true)
|-- Lymphocytes: integer (nullable = true)
|-- Mean corpuscular hemoglobin concentration (MCHC): integer (nullable = true)
|-- Leukocytes: integer (nullable = true)
|-- Basophils: integer (nullable = true)
|-- Mean corpuscular hemoglobin (MCH): integer (nullable = true)
|-- Eosinophils: integer (nullable = true)
|-- Mean corpuscular volume (MCV): integer (nullable = true)
|-- Monocytes: integer (nullable = true)
|-- Red blood cell distribution width (RDW): integer (nullable = true)
|-- Serum Glucose: integer (nullable = true)
|-- Respiratory Syncytial Virus: integer (nullable = true)
|-- Influenza A: integer (nullable = true)
|-- Influenza B: integer (nullable = true)
|-- Parainfluenza 1: integer (nullable = true)
|-- CoronavirusNL63: integer (nullable = true)
|-- Rhinovirus/Enterovirus: integer (nullable = true)
|-- Mycoplasma pneumoniae: integer (nullable = true)
|-- Coronavirus HKU1: integer (nullable = true)
|-- Parainfluenza 3: integer (nullable = true)
|-- Chlamydophila pneumoniae: integer (nullable = true)
|-- Adenovirus: integer (nullable = true)
|-- Parainfluenza 4: integer (nullable = true)
|-- Coronavirus229E: integer (nullable = true)
|-- CoronavirusOC43: integer (nullable = true)
|-- Inf A H1N1 2009: integer (nullable = true)
|-- Bordetella pertussis: integer (nullable = true)
|-- Metapneumovirus: integer (nullable = true)
|-- Parainfluenza 2: integer (nullable = true)
|-- Neutrophils: integer (nullable = true)
|-- Urea: integer (nullable = true)
|-- Proteina C reativa mg/dL: integer (nullable = true)
|-- Creatinine: integer (nullable = true)
|-- Potassium: integer (nullable = true)
|-- Sodium: integer (nullable = true)
|-- Influenza B, rapid test: integer (nullable = true)
|-- Influenza A, rapid test: integer (nullable = true)
|-- Alanine transaminase: integer (nullable = true)
|-- Aspartate transaminase: integer (nullable = true)
|-- Gamma-glutamyltransferase : integer (nullable = true)
|-- Total Bilirubin: integer (nullable = true)
|-- Direct Bilirubin: integer (nullable = true)
|-- Indirect Bilirubin: integer (nullable = true)
|-- Alkaline phosphatase: integer (nullable = true)
|-- Ionized calcium : integer (nullable = true)
|-- Strepto A: integer (nullable = true)
|-- Magnesium: integer (nullable = true)
|-- pCO2 (venous blood gas analysis): integer (nullable = true)
|-- Hb saturation (venous blood gas analysis): integer (nullable = true)
|-- Base excess (venous blood gas analysis): integer (nullable = true)
|-- pO2 (venous blood gas analysis): integer (nullable = true)
|-- Fio2 (venous blood gas analysis): integer (nullable = true)
|-- Total CO2 (venous blood gas analysis): integer (nullable = true)
|-- pH (venous blood gas analysis): integer (nullable = true)
|-- HCO3 (venous blood gas analysis): integer (nullable = true)
|-- Rods #: integer (nullable = true)
|-- Segmented: integer (nullable = true)
|-- Promyelocytes: integer (nullable = true)
|-- Metamyelocytes: integer (nullable = true)
|-- Myelocytes: integer (nullable = true)
|-- Myeloblasts: integer (nullable = true)
|-- Urine - Esterase: integer (nullable = true)
|-- Urine - Aspect: integer (nullable = true)
|-- Urine - pH: integer (nullable = true)
|-- Urine - Hemoglobin: integer (nullable = true)
|-- Urine - Bile pigments: integer (nullable = true)
|-- Urine - Ketone Bodies: integer (nullable = true)
|-- Urine - Nitrite: integer (nullable = true)
|-- Urine - Density: integer (nullable = true)
|-- Urine - Urobilinogen: integer (nullable = true)
|-- Urine - Protein: integer (nullable = true)
|-- Urine - Sugar: integer (nullable = true)
|-- Urine - Leukocytes: integer (nullable = true)
|-- Urine - Crystals: integer (nullable = true)
|-- Urine - Red blood cells: integer (nullable = true)
|-- Urine - Hyaline cylinders: integer (nullable = true)
|-- Urine - Granular cylinders: integer (nullable = true)
|-- Urine - Yeasts: integer (nullable = true)
|-- Urine - Color: integer (nullable = true)
|-- Partial thromboplastin time (PTT) : integer (nullable = true)
|-- Relationship (Patient/Normal): integer (nullable = true)
|-- International normalized ratio (INR): integer (nullable = true)
|-- Lactic Dehydrogenase: integer (nullable = true)
|-- Prothrombin time (PT), Activity: integer (nullable = true)
|-- Vitamin B12: integer (nullable = true)
|-- Creatine phosphokinase (CPK) : integer (nullable = true)
|-- Ferritin: integer (nullable = true)
|-- Arterial Lactic Acid: integer (nullable = true)
|-- Lipase dosage: integer (nullable = true)
|-- D-Dimer: integer (nullable = true)
|-- Albumin: integer (nullable = true)
|-- Hb saturation (arterial blood gases): integer (nullable = true)
|-- pCO2 (arterial blood gas analysis): integer (nullable = true)
|-- Base excess (arterial blood gas analysis): integer (nullable = true)
|-- pH (arterial blood gas analysis): integer (nullable = true)
|-- Total CO2 (arterial blood gas analysis): integer (nullable = true)
|-- HCO3 (arterial blood gas analysis): integer (nullable = true)
|-- pO2 (arterial blood gas analysis): integer (nullable = true)
|-- Arteiral Fio2: integer (nullable = true)
|-- Phosphor: integer (nullable = true)
|-- ctO2 (arterial blood gas analysis): integer (nullable = true)

In [44]: from pyspark.ml.feature import (VectorAssembler,OneHotEncoder,


StringIndexer)

Das könnte Ihnen auch gefallen