Beruflich Dokumente
Kultur Dokumente
import os
# List every file available under the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)
def load_raw(excel_path='C:/Users/Administrator/Notebook/dataset.xlsx'):
    """Load the Excel dataset into a Spark DataFrame.

    Builds (or reuses) a local Spark session, reads the workbook with pandas
    and converts the resulting pandas DataFrame into a Spark DataFrame.

    Args:
        excel_path: Path of the Excel workbook to read. Defaults to the
            location that was previously hard-coded in this function.

    Returns:
        A ``(df, sqlContext)`` tuple: the Spark DataFrame and the SQLContext
        that was used to create it.
    """
    # Create Spark Session
    spark = (
        SparkSession.builder
        .master('local')
        .appName('myAppName')
        .config('spark.executor.memory', '12gb')
        .config("spark.cores.max", "10")
        .getOrCreate()
    )
    # Take the SparkContext from the session built above instead of relying
    # on a global `sc`, which is not defined anywhere in this function.
    sqlContext = SQLContext(spark.sparkContext)
    df = pd.read_excel(excel_path)
    df = sqlContext.createDataFrame(df)
    return df, sqlContext
In [3]: df,sqlContext=load_raw()
Exploration of data
The first step after importing the data is to get some information about what it looks like. This can be done with the following commands:
In [5]: df.printSchema()
root
|-- Patient ID: string (nullable = true)
|-- Patient age quantile: string (nullable = true)
|-- SARS-Cov-2 exam result: string (nullable = true)
|-- Patient addmited to regular ward (1=yes, 0=no): string (nullable = true)
|-- Patient addmited to semi-intensive unit (1=yes, 0=no): string (nullable = true)
|-- Patient addmited to intensive care unit (1=yes, 0=no): string (nullable = true)
|-- Hematocrit: string (nullable = true)
|-- Hemoglobin: string (nullable = true)
|-- Platelets: string (nullable = true)
|-- Mean platelet volume : string (nullable = true)
|-- Red blood Cells: string (nullable = true)
|-- Lymphocytes: string (nullable = true)
|-- Mean corpuscular hemoglobin concentration (MCHC): string (nullable = true)
|-- Leukocytes: string (nullable = true)
|-- Basophils: string (nullable = true)
|-- Mean corpuscular hemoglobin (MCH): string (nullable = true)
|-- Eosinophils: string (nullable = true)
|-- Mean corpuscular volume (MCV): string (nullable = true)
|-- Monocytes: string (nullable = true)
|-- Red blood cell distribution width (RDW): string (nullable = true)
|-- Serum Glucose: string (nullable = true)
|-- Respiratory Syncytial Virus: string (nullable = true)
|-- Influenza A: string (nullable = true)
|-- Influenza B: string (nullable = true)
|-- Parainfluenza 1: string (nullable = true)
|-- CoronavirusNL63: string (nullable = true)
|-- Rhinovirus/Enterovirus: string (nullable = true)
|-- Mycoplasma pneumoniae: string (nullable = true)
|-- Coronavirus HKU1: string (nullable = true)
|-- Parainfluenza 3: string (nullable = true)
|-- Chlamydophila pneumoniae: string (nullable = true)
|-- Adenovirus: string (nullable = true)
|-- Parainfluenza 4: string (nullable = true)
|-- Coronavirus229E: string (nullable = true)
|-- CoronavirusOC43: string (nullable = true)
|-- Inf A H1N1 2009: string (nullable = true)
|-- Bordetella pertussis: string (nullable = true)
|-- Metapneumovirus: string (nullable = true)
|-- Parainfluenza 2: string (nullable = true)
|-- Neutrophils: string (nullable = true)
|-- Urea: string (nullable = true)
|-- Proteina C reativa mg/dL: string (nullable = true)
|-- Creatinine: string (nullable = true)
|-- Potassium: string (nullable = true)
|-- Sodium: string (nullable = true)
|-- Influenza B, rapid test: string (nullable = true)
|-- Influenza A, rapid test: string (nullable = true)
|-- Alanine transaminase: string (nullable = true)
|-- Aspartate transaminase: string (nullable = true)
|-- Gamma-glutamyltransferase : string (nullable = true)
|-- Total Bilirubin: string (nullable = true)
|-- Direct Bilirubin: string (nullable = true)
|-- Indirect Bilirubin: string (nullable = true)
|-- Alkaline phosphatase: string (nullable = true)
|-- Ionized calcium : string (nullable = true)
|-- Strepto A: string (nullable = true)
|-- Magnesium: string (nullable = true)
|-- pCO2 (venous blood gas analysis): string (nullable = true)
|-- Hb saturation (venous blood gas analysis): string (nullable = true)
|-- Base excess (venous blood gas analysis): string (nullable = true)
|-- pO2 (venous blood gas analysis): string (nullable = true)
|-- Fio2 (venous blood gas analysis): string (nullable = true)
|-- Total CO2 (venous blood gas analysis): string (nullable = true)
|-- pH (venous blood gas analysis): string (nullable = true)
|-- HCO3 (venous blood gas analysis): string (nullable = true)
|-- Rods #: string (nullable = true)
|-- Segmented: string (nullable = true)
|-- Promyelocytes: string (nullable = true)
|-- Metamyelocytes: string (nullable = true)
|-- Myelocytes: string (nullable = true)
|-- Myeloblasts: string (nullable = true)
|-- Urine - Esterase: string (nullable = true)
|-- Urine - Aspect: string (nullable = true)
|-- Urine - pH: string (nullable = true)
|-- Urine - Hemoglobin: string (nullable = true)
|-- Urine - Bile pigments: string (nullable = true)
|-- Urine - Ketone Bodies: string (nullable = true)
|-- Urine - Nitrite: string (nullable = true)
|-- Urine - Density: string (nullable = true)
|-- Urine - Urobilinogen: string (nullable = true)
|-- Urine - Protein: string (nullable = true)
|-- Urine - Sugar: string (nullable = true)
|-- Urine - Leukocytes: string (nullable = true)
|-- Urine - Crystals: string (nullable = true)
|-- Urine - Red blood cells: string (nullable = true)
|-- Urine - Hyaline cylinders: string (nullable = true)
|-- Urine - Granular cylinders: string (nullable = true)
|-- Urine - Yeasts: string (nullable = true)
|-- Urine - Color: string (nullable = true)
|-- Partial thromboplastin time (PTT) : string (nullable = true)
|-- Relationship (Patient/Normal): string (nullable = true)
|-- International normalized ratio (INR): string (nullable = true)
|-- Lactic Dehydrogenase: string (nullable = true)
|-- Prothrombin time (PT), Activity: string (nullable = true)
|-- Vitamin B12: string (nullable = true)
|-- Creatine phosphokinase (CPK) : string (nullable = true)
|-- Ferritin: string (nullable = true)
|-- Arterial Lactic Acid: string (nullable = true)
|-- Lipase dosage: string (nullable = true)
|-- D-Dimer: string (nullable = true)
|-- Albumin: string (nullable = true)
|-- Hb saturation (arterial blood gases): string (nullable = true)
|-- pCO2 (arterial blood gas analysis): string (nullable = true)
|-- Base excess (arterial blood gas analysis): string (nullable = true)
|-- pH (arterial blood gas analysis): string (nullable = true)
|-- Total CO2 (arterial blood gas analysis): string (nullable = true)
|-- HCO3 (arterial blood gas analysis): string (nullable = true)
|-- pO2 (arterial blood gas analysis): string (nullable = true)
|-- Arteiral Fio2: string (nullable = true)
|-- Phosphor: string (nullable = true)
|-- ctO2 (arterial blood gas analysis): string (nullable = true)
In [26]: df.describe()
Out[26]: DataFrame[summary: string, Patient ID: string, Patient age quantile: string, SARS-Cov-2 exam result: string, Patient addmited to regular
ward (1=yes, 0=no): string, Patient addmited to semi-intensive unit (1=yes, 0=no): string, Patient addmited to intensive care unit (1=ye
s, 0=no): string, Hematocrit: string, Hemoglobin: string, Platelets: string, Mean platelet volume : string, Red blood Cells: string, Lym
phocytes: string, Mean corpuscular hemoglobin concentration (MCHC): string, Leukocytes: string, Basophils: string, Mean corpuscular hemo
globin (MCH): string, Eosinophils: string, Mean corpuscular volume (MCV): string, Monocytes: string, Red blood cell distribution width
(RDW): string, Serum Glucose: string, Respiratory Syncytial Virus: string, Influenza A: string, Influenza B: string, Parainfluenza 1: st
ring, CoronavirusNL63: string, Rhinovirus/Enterovirus: string, Mycoplasma pneumoniae: string, Coronavirus HKU1: string, Parainfluenza 3:
string, Chlamydophila pneumoniae: string, Adenovirus: string, Parainfluenza 4: string, Coronavirus229E: string, CoronavirusOC43: string,
Inf A H1N1 2009: string, Bordetella pertussis: string, Metapneumovirus: string, Parainfluenza 2: string, Neutrophils: string, Urea: stri
ng, Proteina C reativa mg/dL: string, Creatinine: string, Potassium: string, Sodium: string, Influenza B, rapid test: string, Influenza
A, rapid test: string, Alanine transaminase: string, Aspartate transaminase: string, Gamma-glutamyltransferase : string, Total Bilirubi
n: string, Direct Bilirubin: string, Indirect Bilirubin: string, Alkaline phosphatase: string, Ionized calcium : string, Strepto A: stri
ng, Magnesium: string, pCO2 (venous blood gas analysis): string, Hb saturation (venous blood gas analysis): string, Base excess (venous
blood gas analysis): string, pO2 (venous blood gas analysis): string, Fio2 (venous blood gas analysis): string, Total CO2 (venous blood
gas analysis): string, pH (venous blood gas analysis): string, HCO3 (venous blood gas analysis): string, Rods #: string, Segmented: stri
ng, Promyelocytes: string, Metamyelocytes: string, Myelocytes: string, Myeloblasts: string, Urine - Esterase: string, Urine - Aspect: st
ring, Urine - pH: string, Urine - Hemoglobin: string, Urine - Bile pigments: string, Urine - Ketone Bodies: string, Urine - Nitrite: str
ing, Urine - Density: string, Urine - Urobilinogen: string, Urine - Protein: string, Urine - Sugar: string, Urine - Leukocytes: string,
Urine - Crystals: string, Urine - Red blood cells: string, Urine - Hyaline cylinders: string, Urine - Granular cylinders: string, Urine
- Yeasts: string, Urine - Color: string, Partial thromboplastin time (PTT) : string, Relationship (Patient/Normal): string, Internationa
l normalized ratio (INR): string, Lactic Dehydrogenase: string, Prothrombin time (PT), Activity: string, Vitamin B12: string, Creatine p
hosphokinase (CPK) : string, Ferritin: string, Arterial Lactic Acid: string, Lipase dosage: string, D-Dimer: string, Albumin: string, Hb
saturation (arterial blood gases): string, pCO2 (arterial blood gas analysis): string, Base excess (arterial blood gas analysis): strin
g, pH (arterial blood gas analysis): string, Total CO2 (arterial blood gas analysis): string, HCO3 (arterial blood gas analysis): strin
g, pO2 (arterial blood gas analysis): string, Arteiral Fio2: string, Phosphor: string, ctO2 (arterial blood gas analysis): string]
In [27]: df.dtypes
Out[27]: [('Patient ID', 'int'),
('Patient age quantile', 'int'),
('SARS-Cov-2 exam result', 'int'),
('Patient addmited to regular ward (1=yes, 0=no)', 'int'),
('Patient addmited to semi-intensive unit (1=yes, 0=no)', 'int'),
('Patient addmited to intensive care unit (1=yes, 0=no)', 'int'),
('Hematocrit', 'int'),
('Hemoglobin', 'int'),
('Platelets', 'int'),
('Mean platelet volume ', 'int'),
('Red blood Cells', 'int'),
('Lymphocytes', 'int'),
('Mean corpuscular hemoglobin concentration\xa0(MCHC)', 'int'),
('Leukocytes', 'int'),
('Basophils', 'int'),
('Mean corpuscular hemoglobin (MCH)', 'int'),
('Eosinophils', 'int'),
('Mean corpuscular volume (MCV)', 'int'),
('Monocytes', 'int'),
('Red blood cell distribution width (RDW)', 'int'),
('Serum Glucose', 'int'),
('Respiratory Syncytial Virus', 'int'),
('Influenza A', 'int'),
('Influenza B', 'int'),
('Parainfluenza 1', 'int'),
('CoronavirusNL63', 'int'),
('Rhinovirus/Enterovirus', 'int'),
('Mycoplasma pneumoniae', 'int'),
('Coronavirus HKU1', 'int'),
('Parainfluenza 3', 'int'),
('Chlamydophila pneumoniae', 'int'),
('Adenovirus', 'int'),
('Parainfluenza 4', 'int'),
('Coronavirus229E', 'int'),
('CoronavirusOC43', 'int'),
('Inf A H1N1 2009', 'int'),
('Bordetella pertussis', 'int'),
('Metapneumovirus', 'int'),
('Parainfluenza 2', 'int'),
('Neutrophils', 'int'),
('Urea', 'int'),
('Proteina C reativa mg/dL', 'int'),
('Creatinine', 'int'),
('Potassium', 'int'),
('Sodium', 'int'),
('Influenza B, rapid test', 'int'),
('Influenza A, rapid test', 'int'),
('Alanine transaminase', 'int'),
('Aspartate transaminase', 'int'),
('Gamma-glutamyltransferase\xa0', 'int'),
('Total Bilirubin', 'int'),
('Direct Bilirubin', 'int'),
('Indirect Bilirubin', 'int'),
('Alkaline phosphatase', 'int'),
('Ionized calcium\xa0', 'int'),
('Strepto A', 'int'),
('Magnesium', 'int'),
('pCO2 (venous blood gas analysis)', 'int'),
('Hb saturation (venous blood gas analysis)', 'int'),
('Base excess (venous blood gas analysis)', 'int'),
('pO2 (venous blood gas analysis)', 'int'),
('Fio2 (venous blood gas analysis)', 'int'),
('Total CO2 (venous blood gas analysis)', 'int'),
('pH (venous blood gas analysis)', 'int'),
('HCO3 (venous blood gas analysis)', 'int'),
('Rods #', 'int'),
('Segmented', 'int'),
('Promyelocytes', 'int'),
('Metamyelocytes', 'int'),
('Myelocytes', 'int'),
('Myeloblasts', 'int'),
('Urine - Esterase', 'int'),
('Urine - Aspect', 'int'),
('Urine - pH', 'int'),
('Urine - Hemoglobin', 'int'),
('Urine - Bile pigments', 'int'),
('Urine - Ketone Bodies', 'int'),
('Urine - Nitrite', 'int'),
('Urine - Density', 'int'),
('Urine - Urobilinogen', 'int'),
('Urine - Protein', 'int'),
('Urine - Sugar', 'int'),
('Urine - Leukocytes', 'int'),
('Urine - Crystals', 'int'),
('Urine - Red blood cells', 'int'),
('Urine - Hyaline cylinders', 'int'),
('Urine - Granular cylinders', 'int'),
('Urine - Yeasts', 'int'),
('Urine - Color', 'int'),
('Partial thromboplastin time\xa0(PTT)\xa0', 'int'),
('Relationship (Patient/Normal)', 'int'),
('International normalized ratio (INR)', 'int'),
('Lactic Dehydrogenase', 'int'),
('Prothrombin time (PT), Activity', 'int'),
('Vitamin B12', 'int'),
('Creatine phosphokinase\xa0(CPK)\xa0', 'int'),
('Ferritin', 'int'),
('Arterial Lactic Acid', 'int'),
('Lipase dosage', 'int'),
('D-Dimer', 'int'),
('Albumin', 'int'),
('Hb saturation (arterial blood gases)', 'int'),
('pCO2 (arterial blood gas analysis)', 'int'),
('Base excess (arterial blood gas analysis)', 'int'),
('pH (arterial blood gas analysis)', 'int'),
('Total CO2 (arterial blood gas analysis)', 'int'),
('HCO3 (arterial blood gas analysis)', 'int'),
('pO2 (arterial blood gas analysis)', 'int'),
('Arteiral Fio2', 'int'),
('Phosphor', 'int'),
('ctO2 (arterial blood gas analysis)', 'int')]
Out[29]:
| | Patient ID | Patient age quantile | SARS-Cov-2 exam result | Patient addmited to regular ward (1=yes, 0=no) | Patient addmited to semi-intensive unit (1=yes, 0=no) | Patient addmited to intensive care unit (1=yes, 0=no) | Hematocrit | Hemoglobin | Platelets | Mean platelet volume | ... | Hb saturation (arterial blood gases) | pCO2 (arterial blood gas analysis) | Base excess (arterial blood gas analysis) | pH (arterial blood gas analysis) | Total CO2 (arterial blood gas analysis) | HCO3 (arterial blood gas analysis) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 13 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 0 | 17 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 |
| 2 | 0 | 8 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 |
| 3 | 0 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 |
| 4 | 0 | 15 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 |
+----------------------+
|SARS-Cov-2 exam result|
+----------------------+
| 0|
| 0|
| 0|
| 0|
| 0|
+----------------------+
only showing top 5 rows
In [34]: df.select("CoronavirusNL63").show(5)
df.select("CoronavirusOC43").show(5)
+---------------+
|CoronavirusNL63|
+---------------+
| 0|
| 0|
| 0|
| 0|
| 0|
+---------------+
only showing top 5 rows
+---------------+
|CoronavirusOC43|
+---------------+
| 0|
| 0|
| 0|
| 0|
| 0|
+---------------+
only showing top 5 rows
+------+---+
|result|age|
+------+---+
| 0| 13|
| 0| 17|
| 0| 8|
| 0| 5|
| 0| 15|
+------+---+
only showing top 5 rows
In [49]: df.printSchema()
root
|-- Patient ID: integer (nullable = true)
|-- Patient age quantile: integer (nullable = true)
|-- SARS-Cov-2 exam result: integer (nullable = true)
|-- Patient addmited to regular ward (1=yes, 0=no): integer (nullable = true)
|-- Patient addmited to semi-intensive unit (1=yes, 0=no): integer (nullable = true)
|-- Patient addmited to intensive care unit (1=yes, 0=no): integer (nullable = true)
|-- Hematocrit: integer (nullable = true)
|-- Hemoglobin: integer (nullable = true)
|-- Platelets: integer (nullable = true)
|-- Mean platelet volume : integer (nullable = true)
|-- Red blood Cells: integer (nullable = true)
|-- Lymphocytes: integer (nullable = true)
|-- Mean corpuscular hemoglobin concentration (MCHC): integer (nullable = true)
|-- Leukocytes: integer (nullable = true)
|-- Basophils: integer (nullable = true)
|-- Mean corpuscular hemoglobin (MCH): integer (nullable = true)
|-- Eosinophils: integer (nullable = true)
|-- Mean corpuscular volume (MCV): integer (nullable = true)
|-- Monocytes: integer (nullable = true)
|-- Red blood cell distribution width (RDW): integer (nullable = true)
|-- Serum Glucose: integer (nullable = true)
|-- Respiratory Syncytial Virus: integer (nullable = true)
|-- Influenza A: integer (nullable = true)
|-- Influenza B: integer (nullable = true)
|-- Parainfluenza 1: integer (nullable = true)
|-- CoronavirusNL63: integer (nullable = true)
|-- Rhinovirus/Enterovirus: integer (nullable = true)
|-- Mycoplasma pneumoniae: integer (nullable = true)
|-- Coronavirus HKU1: integer (nullable = true)
|-- Parainfluenza 3: integer (nullable = true)
|-- Chlamydophila pneumoniae: integer (nullable = true)
|-- Adenovirus: integer (nullable = true)
|-- Parainfluenza 4: integer (nullable = true)
|-- Coronavirus229E: integer (nullable = true)
|-- CoronavirusOC43: integer (nullable = true)
|-- Inf A H1N1 2009: integer (nullable = true)
|-- Bordetella pertussis: integer (nullable = true)
|-- Metapneumovirus: integer (nullable = true)
|-- Parainfluenza 2: integer (nullable = true)
|-- Neutrophils: integer (nullable = true)
|-- Urea: integer (nullable = true)
|-- Proteina C reativa mg/dL: integer (nullable = true)
|-- Creatinine: integer (nullable = true)
|-- Potassium: integer (nullable = true)
|-- Sodium: integer (nullable = true)
|-- Influenza B, rapid test: integer (nullable = true)
|-- Influenza A, rapid test: integer (nullable = true)
|-- Alanine transaminase: integer (nullable = true)
|-- Aspartate transaminase: integer (nullable = true)
|-- Gamma-glutamyltransferase : integer (nullable = true)
|-- Total Bilirubin: integer (nullable = true)
|-- Direct Bilirubin: integer (nullable = true)
|-- Indirect Bilirubin: integer (nullable = true)
|-- Alkaline phosphatase: integer (nullable = true)
|-- Ionized calcium : integer (nullable = true)
|-- Strepto A: integer (nullable = true)
|-- Magnesium: integer (nullable = true)
|-- pCO2 (venous blood gas analysis): integer (nullable = true)
|-- Hb saturation (venous blood gas analysis): integer (nullable = true)
|-- Base excess (venous blood gas analysis): integer (nullable = true)
|-- pO2 (venous blood gas analysis): integer (nullable = true)
|-- Fio2 (venous blood gas analysis): integer (nullable = true)
|-- Total CO2 (venous blood gas analysis): integer (nullable = true)
|-- pH (venous blood gas analysis): integer (nullable = true)
|-- HCO3 (venous blood gas analysis): integer (nullable = true)
|-- Rods #: integer (nullable = true)
|-- Segmented: integer (nullable = true)
|-- Promyelocytes: integer (nullable = true)
|-- Metamyelocytes: integer (nullable = true)
|-- Myelocytes: integer (nullable = true)
|-- Myeloblasts: integer (nullable = true)
|-- Urine - Esterase: integer (nullable = true)
|-- Urine - Aspect: integer (nullable = true)
|-- Urine - pH: integer (nullable = true)
|-- Urine - Hemoglobin: integer (nullable = true)
|-- Urine - Bile pigments: integer (nullable = true)
|-- Urine - Ketone Bodies: integer (nullable = true)
|-- Urine - Nitrite: integer (nullable = true)
|-- Urine - Density: integer (nullable = true)
|-- Urine - Urobilinogen: integer (nullable = true)
|-- Urine - Protein: integer (nullable = true)
|-- Urine - Sugar: integer (nullable = true)
|-- Urine - Leukocytes: integer (nullable = true)
|-- Urine - Crystals: integer (nullable = true)
|-- Urine - Red blood cells: integer (nullable = true)
|-- Urine - Hyaline cylinders: integer (nullable = true)
|-- Urine - Granular cylinders: integer (nullable = true)
|-- Urine - Yeasts: integer (nullable = true)
|-- Urine - Color: integer (nullable = true)
|-- Partial thromboplastin time (PTT) : integer (nullable = true)
|-- Relationship (Patient/Normal): integer (nullable = true)
|-- International normalized ratio (INR): integer (nullable = true)
|-- Lactic Dehydrogenase: integer (nullable = true)
|-- Prothrombin time (PT), Activity: integer (nullable = true)
|-- Vitamin B12: integer (nullable = true)
|-- Creatine phosphokinase (CPK) : integer (nullable = true)
|-- Ferritin: integer (nullable = true)
|-- Arterial Lactic Acid: integer (nullable = true)
|-- Lipase dosage: integer (nullable = true)
|-- D-Dimer: integer (nullable = true)
|-- Albumin: integer (nullable = true)
|-- Hb saturation (arterial blood gases): integer (nullable = true)
|-- pCO2 (arterial blood gas analysis): integer (nullable = true)
|-- Base excess (arterial blood gas analysis): integer (nullable = true)
|-- pH (arterial blood gas analysis): integer (nullable = true)
|-- Total CO2 (arterial blood gas analysis): integer (nullable = true)
|-- HCO3 (arterial blood gas analysis): integer (nullable = true)
|-- pO2 (arterial blood gas analysis): integer (nullable = true)
|-- Arteiral Fio2: integer (nullable = true)
|-- Phosphor: integer (nullable = true)
|-- ctO2 (arterial blood gas analysis): integer (nullable = true)
In [36]: df.groupBy('Hemoglobin').count().show()
+----------+-----+
|Hemoglobin|count|
+----------+-----+
| -1| 64|
| 1| 80|
| -4| 1|
| -2| 12|
| 2| 7|
| -3| 7|
| 0| 5473|
+----------+-----+
In [13]: df_.printSchema()
root
|-- result: string (nullable = true)
|-- age: string (nullable = true)
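In [16]: #Aggregation (note: "age" is a string column per the schema above, so max("age") is a lexicographic maximum, not a numeric one; cast it to int for a numeric max)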
df_.groupBy("result").agg(func.max("age"), func.avg("age")).show()
+--------+--------+------------------+
| result|max(age)| avg(age)|
+--------+--------+------------------+
|positive| 9|10.630824372759857|
|negative| 9| 9.174400314589068|
+--------+--------+------------------+