Sie sind auf Seite 1von 29

BioSense

P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Creating Syndrome Definitions Using


RStudio

Tim Hopper
Data Scientist
RTI International

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Code Is Available Online

https://gist.github.com/tdhopper/d5939aaf74886143224e/raw/3ae
883a25ef078a5edd2fcced0f0268b34be3d6b/Custom+Syndromes

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Setup
# Connect to TarrantCounty_FP database
# Credentials
USERNAME <- 'username'
PASSWORD <- 'password'
HOSTNAME <- 'data3.biosen.se'
DBNAME <- 'TarrantCounty_FP'
TABLE <- 'TC_Meaningful_Use_Base'
# Create database connection
con <- dbConnect(dbDriver('MySQL'), user=USERNAME,
password=PASSWORD,
host=HOSTNAME, dbname=DBNAME)

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Example: Co-morbid Syndrome

We want to see the co-occurrence of


influenza (influenza-like illness) and
asthma.
Data source: Texas region 2/3
Location: Tarrant County
Time: February 1October 31, 2013

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Query for Asthma


SELECT Facility_City, Facility_State, Diagnosis_Code, Diagnosis_Text,
Chief_Complaint, Age, Gender, Visit_Date_Time,
Row_Number
FROM TC_Meaningful_Use_Base
WHERE Visit_Date_Time
BETWEEN '2013-02-01 00:00:00' AND '2013-10-31 23:59:59'
AND (Diagnosis_Code LIKE '%493%')

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Query for Influenza-Like Illness


SELECT Facility_City, Facility_State, Diagnosis_Code, Diagnosis_Text,
Chief_Complaint, Age, Gender, Visit_Date_Time,
Row_Number
FROM TC_Meaningful_Use_Base
WHERE Visit_Date_Time
BETWEEN '2013-02-01 00:00:00' AND '2013-10-31 23:59:59'
AND (Diagnosis_Code LIKE '%487%'
OR Diagnosis_Code LIKE '%488%'
OR Diagnosis_Code LIKE '%V04.8%'
OR Diagnosis_Code LIKE '%V0481%'
OR Diagnosis_Code LIKE '%V06.6%'
OR Diagnosis_Code LIKE '%V066%')

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Run Query and Process Data


# Run Query
df.asthma <- dbGetQuery(con, query.asthma)
df.ili <- dbGetQuery(con, query.ili)
# Add column naming each as a syndrome
df.asthma$Syndrome <- 'ASTHMA'
df.ili$Syndrome <- 'ILI'
# Combine these two data sets into one data.frame
df <- rbind(df.asthma, df.ili)
# Format dates and add date column (without time)
df$Visit_Date_Time <- ymd_hms(df$Visit_Date_Time)
df$Visit_Date <- as.Date(df$Visit_Date_Time)

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Create Summary Data Set


events.per.day.split <- ddply(df,
.(Visit_Date, Syndrome),
summarize,
Number_of_Visits=length(Visit_Date))
############################################
# Visit_Date
Syndrome Number_of_Visits
# 1 2013-02-01
ASTHMA
49
# 2 2013-02-01 ASTHMA & ILI
2
# 3 2013-02-01
ILI
5
# 4 2013-02-02
ASTHMA
60
# 5 2013-02-02
ILI
21
# 6 2013-02-03
ASTHMA
89
############################################

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Visits Per Day by Syndrome


ggplot(events.per.day.split) +
aes(Visit_Date, Number_of_Visits, color=Syndrome) +
geom_line()

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Create Summary Data Set


events.per.day <- ddply(df,
.(Visit_Date),
summarize,
Number_of_Visits=length(Visit_Date))
###############################
# Visit_Date Number_of_Visits
# 1 2013-02-01
513
# 2 2013-02-02
396
# 3 2013-02-03
428
# 4 2013-02-04
409
# 5 2013-02-05
580
# 6 2013-02-06
391
###############################

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Visits Per Day by Syndrome


ggplot(events.per.day) +
aes(Visit_Date, Number_of_Visits) +
geom_line()

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Example: New Syndrome

We want to see create a new syndrome to


identify visits during which the patient had
cough AND dizziness AND headache.
Data source: Texas region 2/3
Location: Tarrant County
Time: February 1October 31, 2013

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Query
SELECT Facility_City, Facility_State, Diagnosis_Code, Diagnosis_Text,
Chief_Complaint, Age, Gender, Visit_Date_Time,
Row_Number
FROM TC_Meaningful_Use_Base
WHERE Visit_Date_Time BETWEEN '2013-02-01 00:00:00'
AND '2013-10-31 23:59:59'
AND (Diagnosis_Code LIKE '%786.2%' OR
Diagnosis_Code LIKE '%7862%')
AND (Diagnosis_Code LIKE '%780.4%' OR
Diagnosis_Code LIKE '%7804%')
AND (Diagnosis_Code LIKE '%784.0%' OR
Diagnosis_Code LIKE '%7840%');

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Run Query

# Run Query
df.sick <- dbGetQuery(con, query)
# Fix dates using lubridate
df.sick$Visit_Date_Time <- ymd_hms(df.sick$Visit_Date_Time)
# Create a month column
df.sick$Month <- month(df.sick$Visit_Date, label=T)

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Run Query
ggplot(df.sick) +
aes(Month) +
geom_histogram()

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Create Line Listing


write.csv(df.sick, 'sick.csv', quote=F, row.names=F)
#
#
#
#
#
#
#
#
#
#
#

sick.csv:
Row_Number,Facility_City,Facility_State,Diagnosis_Code,D...
1374852,Houston,TX,473.9:780.4:300.00:786.2:784.0:305.1:...
1536525,Houston,TX,486:786.2:780.4:784.0:794.00:789.00:7...
2100347,Rowlett,TX,780.4:784.0:786.2,NA,SCREENING - HA -...
2189305,Rowlett,TX,780.4:784.0:786.2:V76.12,NA,SCREENING...
3108090,Rowlett,TX,780.4:784.0:786.2:V76.12,NA,SCREENING...
5887191,Rowlett,TX,786.2:780.1:780.4:784.0,NA,786.2:SEP:...
7968958,Houston,TX,493.90:780.4:780.60:784.0:786.2:787.0...
9197758,Houston,TX,493.90:780.4:780.60:784.0:786.2:787.0...

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Example: Refined Age Groups

We want to see motor vehicle traffic


accidents involving young people. We
recombine the ages to the following groups:
015, 1620, 2125, 2630, and 3135 years.
Data source: Texas region 2/3
Location: Tarrant County
Time: February 1October 31, 2013

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Query
SELECT Facility_City, Facility_State, Diagnosis_Code, Diagnosis_Text,
Chief_Complaint, Age, Gender, Visit_Date_Time,
Row_Number
FROM TC_Meaningful_Use_Base
WHERE Visit_Date_Time BETWEEN '2013-02-01 00:00:00'
AND '2013-10-31 23:59:59'
AND (Diagnosis_Code LIKE '%E81_%')
AND Age <= 35;

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Run Query

# Run Query
df.auto <- dbGetQuery(con, query)
# Fix dates using lubridate
df.auto$Visit_Date_Time <ymd_hms(df.auto$Visit_Date_Time)
# Create a date column
df.auto$Visit_Date <- as.Date(df.auto$Visit_Date_Time)

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Bin Ages
# Drop all rows where age is greater than 35 years or is
undefined
df.auto <- df.auto[!is.na(df.auto$Age),]
df.auto <- df.auto[df.auto$Age <= 35,]
# Bin ages
df.auto$Age_binned <- cut(df.auto$Age,
breaks=c(0, 15, 20, 25, 30, 35), include.lowest=T)

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Histogram of Visits by Age Group


ggplot(df.auto) + aes(Age_binned) + geom_histogram()

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Create Summary Data Set


df.auto.daily.counts <- ddply(df.auto,
.(Visit_Date, Age_binned),
summarize,
count=length(Chief_Complaint))
################################
# Visit_Date Age_binned count
# 1 2013-02-01
[0,15]
3
# 2 2013-02-01 (15,20] 25
# 3 2013-02-01 (20,25] 16
# 4 2013-02-01 (25,30] 16
# 5 2013-02-02
[0,15] 13
# 6 2013-02-02 (15,20]
6
###############################

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Visits per Week by Age


ggplot(df.auto.daily.counts)
+ aes(x = Visit_Date, y = count, color=Age_binned)
+ geom_line(size=2, alpha=.7)

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Visits per Week by Age


ggplot(df.auto.daily.counts)
+ aes(x = Visit_Date, y = count, color=Age_binned)
+ geom_smooth(size=3, alpha=.7)

BioSense
P u b lic H e a lth S u r v e illa n c e T h r o u g h C o lla b o r a tio n

https://biosen.se

Questions?