Sie sind auf Seite 1von 6

A pain researcher is interested in finding methods to reduce lower back pain in individuals

without having to use drugs. The researcher thinks that having acupuncture in the lower back
might reduce back pain. To investigate this, the researcher recruits 25 participants to their
study. At the beginning of the study, the researcher asks the participants to rate their back pain
on a scale of 1 to 10, with 10 indicating the greatest level of pain. After 4 weeks of twice weekly
acupuncture, the participants are asked again to indicate their level of back pain on a scale of 1
to 10, with 10 indicating the greatest level of pain. The researcher wishes to understand whether
the participants' pain levels changed after they had undergone the acupuncture, so a Wilcoxon
signed-rank test is run.

R Functions
# List the contents of the `datasets` package.
library(help = "datasets")

# Load the `cars` data set, open it in the data viewer, then print its
# structure.
data("cars")
View(cars)
str(cars)

#Session / Set working directory


# Titanic data: read the data. Objective?
# Reads the test and training CSV files from the working directory and takes
# a first look at their structure and at the Age column.
My_Test <- read.csv("test.csv", header = TRUE)
View(My_Test)
My_Train <- read.csv("train.csv", header = TRUE)
str(My_Train)
str(My_Test)
hist(My_Train$Age)
# Fixed: this line used an undefined object `Train`; the training data read
# above is called `My_Train`.
boxplot(My_Train$Age)

# To stack these two sets together they must have the same columns, so we need to add
# a Survived column to the My_Test set. Indexing is My_Test[row, column].

# Prepend a Survived column to the test set, filled with the placeholder
# "None" (one entry per test row), so it has the same columns as My_Train.
My_Test.survived <- data.frame(
  Survived = rep("None", nrow(My_Test)),
  My_Test[, ]
)

# Combine the data sets: training rows first, then the test rows.
data.combined <- rbind(My_Train, My_Test.survived)

# A factor is like a drop-down list or enumeration: a fixed set of categories (e.g. the states of India).


# str() shows how R read each column from the CSV file, so the column types
# may still need to be prepared for R.
is.factor(My_Test[["Pclass"]])
# Store Pclass as a factor in the combined data.
data.combined[["Pclass"]] <- as.factor(data.combined[["Pclass"]])
str(data.combined[["Pclass"]])
# Distribution of the data: number of rows per Pclass level.
table(data.combined[["Pclass"]])
# ggplot2: first install the package (install.packages("ggplot2")), then load it with library(ggplot2).
# If you get an error, also install the package stringr and run library(ggplot2) again.
 library (ggplot2)

 hist(My_Train$Age)
# Let us use the ggplot … this graph is more powerful and informative
 ggplot(data.combined, aes (x= data.combined$Sex, fill = Survived)) +
geom_bar(width= 0.5)+
facet_wrap(~Pclass) + ggtitle ("Pclass") +
xlab("Title")+ ylab ("Total count")+
labs (fill= "Survived")
 My_Train$Survived <- as.factor(My_Train$Survived)
 ggplot(My_Train, aes (x= My_Train$Sex, fill = Survived)) +
geom_bar(width= 0.5)+
xlab("Title")+ ylab ("Total count")+
labs (fill= "Survived")

 ggplot(data.combined, aes (x= data.combined$Sex, fill = Survived)) +


geom_bar(width= 0.5)+
facet_wrap(~Pclass) + ggtitle ("Pclass") +
xlab("Title")+ ylab ("Total count")+
labs (fill= "Survived")

 ggplot(My_Train, aes (x= My_Train$Sex, fill = Survived)) +


geom_bar(width= 0.5)+
facet_wrap(~Pclass) + ggtitle ("Pclass") +
xlab("Title")+ ylab ("Total count")+
labs (fill= "Survived")

 ggplot(TrainTitle, aes (x= TrainTitle$Age, fill = Survived)) +


geom_bar(width= 0.5)+
facet_wrap(~Pclass + Sex) + ggtitle ("Pclass") +
xlab("Title")+ ylab ("Total count")+
labs (fill= "Survived")

# added Title
TrainTitle <- read.csv ("trainTitle.csv", header = TRUE)
TrainTitle$Title <- as.factor(TrainTitle$Title)
TrainTitle$Survived <- as.factor(TrainTitle$Survived)
 ggplot(TrainTitle, aes (x= TrainTitle$Age, fill = Survived)) +
geom_bar(width= 0.5)+
facet_wrap(~Pclass + Sex) + ggtitle ("Pclass") +
xlab("Title")+ ylab ("Total count")+
labs (fill= "Survived")

 ggplot(TrainTitle, aes (x= TrainTitle$Age, fill = Survived)) +


geom_bar(width= 10)+
facet_wrap(~Pclass + Sex) + ggtitle ("Pclass") +
xlab("Title")+ ylab ("Total count")+
labs (fill= "Survived")
Statistical Operations
# Build a small data frame from scratch out of three equal-length vectors.
age    <- c(25, 30, 56)
gender <- c("male", "female", "male")
weight <- c(160, 110, 220)

# One column per vector, named after it.
mydata <- data.frame(age = age, gender = gender, weight = weight)
How to deal with Missing Data
 My_Train_NA_Omit <- na.omit(My_Train)
 summary(My_Train_NA_Omit$Age)
 summary(My_Train$Age)
 mean(My_Train_NA_Omit$Age)

dnorm(x): normal density function (by default mean = 0, sd = 1)

# Draw the standard normal density curve on a grid spanning [-3, 3].
x <- pretty(c(-3, 3), n = 30)
y <- dnorm(x)

# Line plot of density against deviate; yaxs = "i" stops the y axis from
# being padded beyond the data range.
plot(
  x, y,
  type = "l",
  xlab = "Normal Deviate",
  ylab = "Density",
  yaxs = "i"
)


pnorm(q)

cumulative normal probability for q (area under the normal curve to the left of q)

pnorm(1.96) is 0.975

# mean() returns NA when the input contains a missing value unless the NAs
# are dropped with na.rm = TRUE.
# Fixed: the opening parenthesis was never closed (a syntax error). The outer
# parentheses print the value that was just assigned.
(x <- c(1, 2, NA, 3))
mean(x)                # returns NA
mean(x, na.rm = TRUE)  # returns 2
 mean(My_Train_NA_Omit$Age)
 sd(My_Train_NA_Omit$Age)
 range(My_Train_NA_Omit$Age)

 write.csv(My_Train_NA_Omit, "My_Train_NA_Omit.csv")

One sample T test


Before running an analysis, it is always better to plot a histogram, scatter plot or box plot
of the data.
 Boxplot (My_Train_NA_Omit$Age)
#H0: Mu = 35 , Mu != 35 …We want to be 95 % confident on our finding. It can be one tailed
too… like Mu>35, Mu< 35.
> t.test(My_Train_NA_Omit$Age, mu = 35)
> t.test(My_Train_NA_Omit$Age, mu = 35, alt = "two.sided", con= 0.90)

 t.test(My_Train_NA_Omit$Age, mu = 35, alternative = "less", conf.level= 0.95)

#'arg' should be one of “two.sided”, “less”, “greater”

Two sided is by default…

 t.test(My_Train_NA_Omit$Age, mu = 35, alt = "two.sided", con= 0.95)


Two Sample T test (Independent Two sample Test)
 t.test (My_Train$Age~My_Train$Sex, paired= F)
 t.test (My_Train$Age~My_Train$Sex, mu=0, alt= "two.sided", conf = 0.95, var.eq = T,
paired= F)
 t.test (My_Train_NA_Omit$Age~My_Train_NA_Omit$Sex, mu=0, alt= "two.sided", conf =
0.95, var.eq = T, paired= F)

 boxplot(My_Train_NA_Omit$Age~My_Train_NA_Omit$Sex)

Paired T test Two tail test


 t.test (Test_paired$Hotel.16, Test_paired$Hotel.17, paired = T)
 t.test (Test_paired$Hotel.16, Test_paired$Hotel.17, mu=0, alternative = "two.sided",
paired = T, conf.level = 0.95)

Anova
# One-way ANOVA (completely randomized design): general form of the call.
# y, A and mydataframe are placeholder names.
fit <- aov(formula = y ~ A, data = mydataframe)
y holds the values (numeric) and A is the factor (categorical).

# One-way ANOVA on three groups of unequal size (11, 9 and 6 observations).
Group1 <- c(1, 3, 4, 6, 7, 8, 6, 4, 5, 3, 5)
Group2 <- c(3, 6, 3, 4, 5, 6, 7, 5, 6)
Group3 <- c(4, 5, 6, 7, 8, 9)

# Fixed: cbind() recycles the shorter vectors up to the longest length, which
# silently invented 7 extra observations. Pad the shorter groups with NA
# instead, so the wide table keeps its 11 x 3 shape without fabricated data.
groups <- list(Group1 = Group1, Group2 = Group2, Group3 = Group3)
longest <- max(lengths(groups))
combines_group <- as.data.frame(lapply(groups, `length<-`, longest))

# Long format: `values` holds the measurements, `ind` the group label.
# The NA padding rows are dropped so only real observations remain.
S <- stack(combines_group)
S <- S[!is.na(S$values), ]

ANV <- aov(values ~ ind, data = S)
summary(ANV)

Correlation

# Correlation between two numeric vectors x and y of equal length
# (placeholder names). Fixed: cor() takes the two vectors as separate
# arguments rather than a two-sided formula, and the pasted console prompts
# ("> ") were removed so the lines parse.
cor(x, y)
cor.test(x, y)

# Correlation matrix of Age and Fare in the test set, computed only from rows
# where both values are present.
x <- data.frame(My_Test$Age, My_Test$Fare)
cor(x, use = "complete.obs", method = "pearson")

Simple Linear regression


# Simple linear regression of a on b (placeholder names).
# Fixed: the function is the lower-case `lm`; R is case-sensitive and `Lm`
# does not exist.
lm(a ~ b)

Multiple Regression
# Multiple linear regression example: y modelled on x1, x2 and x3.
# NOTE(review): the mydata built earlier in these notes has the columns age,
# gender and weight, so y/x1/x2/x3 look like placeholder names - confirm.
fit <- lm(formula = y ~ x1 + x2 + x3, data = mydata)
summary(fit)  # show results

Non Parametric Test


# Chi-squared test on passenger class and sex in the test set.
chisq.test(My_Test[["Pclass"]], My_Test[["Sex"]])

# Wilcoxon tests (y, x, y1, y2 and A are placeholder names).
wilcox.test(y ~ A)                  # y is numeric and A is a binary factor
wilcox.test(y, x)                   # y and x are numeric
wilcox.test(y1, y2, paired = TRUE)  # y1 and y2 are numeric (paired samples)

# Kruskal-Wallis test.
kruskal.test(y ~ A)                 # y is numeric and A is a factor