Sie sind auf Seite 1 von 8

# Sample data set

Introduction
Presentation Title Goes Here to R: mydata[3,4]
Data Manipulation and Statistical Analysis
…presentation subtitle.

Data Manipulation

Violeta I. Bartolome
Senior Associate Scientist-Biometrics
Crop Research Informatics Laboratory
International Rice Research Institute

## Selecting Variables Selecting Variables

• Select variable Y1 • Select variables Y1, Y2, Y3, Y4
o mydata[“Y1”] o mydata[c(3,4,5,6)]

o mydata[,3] o mydata[3:6]

o mydata[3] o mydata[-c(1,2)]

## o mydata[c(FALSE, FALSE, TRUE, FALSE, o mydata[-I(1:2)] # I() is the “AsIs” function (inhibits interpretation/conversion)

FALSE, FALSE)] o mydata[c(“Y1”, “Y2”, “Y3”, “Y4”)]
o mydata[as.logical(c(0,0,1,0,0,0))] o mydata[c(FALSE, FALSE, TRUE, TRUE, TRUE,
o mydata[names(mydata)==“Y1”] TRUE)]
o mydata\$Y1 o mydata[as.logical(c(0,0,1,1,1,1))]

To create a data frame containing Y1 To create a data frame containing Y1, Y2, Y3, Y4
Dataset
myA<- mydata[“Y1”] myB<- mydata[c(3,4,5,6)]
:: color, composition, and layout :: color, composition, and layout
Selecting Variables
Selecting Observations
• Select variables Y1, Y2, Y3, Y4
• Select observation numbers 3 to 8
o myB<-data.frame(mydata\$Y1, mydata\$Y2,
o mydata[3:8, ]
mydata\$Y3, mydata\$Y4)
o mydata[-c(1,2), ]
this is equivalent to
attach(mydata) • Select observations of Site B
o mydata[mydata\$Site==“B”, ]
myB<-data.frame(Y1,Y2,Y3,Y4)
o subset(mydata,subset=Site==“B”)
detach(mydata)
o mydata[which(mydata\$Site==“B”),]
o myB<-subset(mydata, select=Y1:Y4)

## To create a data frame

myC<- mydata[mydata\$Site==“B”, ]
Dataset

## Selecting Both Variables and

Selecting Observations Observations
Select observations of Sites A and B, and
Trt 1 and 2 • Data frame containing Site B and Y1-Y4
o attach(mydata)
o myD<-mydata[4:6, 3:6]
mydata[(Site==“A” | Site==“B”) &
(Trt==1 | Trt==2), ] myD<-mydata[mydata\$Site==“B”,
c(“Y1”,”Y2”,”Y3”,”Y4”)]
detach(mydata)
o myD<-subset(mydata,subset=Site==“B”,select=Y1:Y4)
o subset(mydata,subset=((Site==“A” |
Site==“B”) & (Trt==1 | Trt==2)))
o mydata[which((mydata\$Site==“A” |
mydata\$Site==“B”) &
(mydata\$Trt==1 | mydata\$Trt==2)),] Dataset Dataset

## :: color, composition, and layout Hands-on :: color, composition, and layout

Transforming/Creating New Variables
• Using Numerical Expressions
o mydata\$Y5 <- mydata\$Y3
sample dataset
Using Numerical Expressions Using Mathematical Operations
o mydata\$Y6 <- 0
• Using Mathematical Operations (+, -, *, /, **)
o mydata\$sum <-
mydata\$Y1+mydata\$Y2+mydata\$Y3+mydata\$Y4
o attach(mydata)
mydata\$sum<-Y1+Y2+Y3+Y4
detach(mydata)
o mydata<-transform(mydata, sum=Y1+Y2+Y3+Y4)
o If there is more than one transformation
mydata<-transform(mydata, sample dataset back
forward
sum=Y1+Y2+Y3+Y4,
mean=sum/4)
:: color, composition, and layout :: color, composition, and layout

## Transforming/Creating New Variables Missing data: using the na.rm option

• Using functions
• Consider the statement
o mydata\$sqrtY3 <- sqrt(mydata\$Y3)
o mydata\$sumy<-
o mydata\$Y4 <- log10(mydata\$Y4)
mydata\$Y1+mydata\$Y2+mydata\$Y3
Note: if any of the Y’s is missing, the sum will be missing (NA)
• To get sum of non-missing observations
o myYs<-subset(mydata,select=c(Y1,Y2,Y3))
o mydata\$sum<-rowSums(myYs,na.rm=TRUE)

## :: color, composition, and layout :: color, composition, and layout

Missing data: using the is.na()
• Selecting observations with at least one missing
value
o missing <- subset(mydata,subset=(is.na(Y1)==T|
is.na(Y2)==T|is.na(Y3)==T|is.na(Y4)==T))

back
forward

## Keeping and Dropping Variables Renaming Variables

• Create a copy of mydata • Rename Y1-Y4 to X1-X4, respectively
mysubset <- mydata o library (reshape)
• Drop Y3 and Y4 from mysubset mydata <- rename(mydata, c(Y1=“X1”))
mysubset\$Y3 <- mysubset\$Y4 <- NULL mydata <- rename(mydata, c(Y2=“X2”))
mydata <- rename(mydata, c(Y3=“X3”))
mydata <- rename(mydata, c(Y4=“X4”))
o names(mydata) <- c(“Site”, “Trt”, “X1”, “X2”, “X3”, “X4”)

## :: color, composition, and layout Hands-on :: color, composition, and layout

Merging Data Frames
Stacking/Concatenating Data Frames

## • Data frame containing Site A only • Data frame containing Y1 and Y2

attach(mydata) attach(mydata)
A <- mydata[Site==“A”, ] left <- mydata[c(“Site”,”Trt”,”Y1”,”Y2”)]
• Data frame containing Site B only • Data frame containing Y3 and Y4
B <- mydata[Site==“B”, ] right <- mydata[c(“Site”,”Trt”,”Y3”,”Y4”)]
• Combine the two data frames • Merge the two data frames
both <- rbind(A,B) both <- merge(left, right,
detach(mydata) by=c(“Site”,”Trt”))
detach(mydata)

Hands-on :: color, composition, and layout Hands-on :: color, composition, and layout

Parallel to Serial
Sorting Data Frames
• Sort by Trt and Site
mydataSorted <-
mydata[order(mydata\$Trt,
mydata\$Site), ]

## Note: Default is ascending data.serial <- reshape(mydata, # object to be reshaped

order. Prefix a variable by a varying=list(3:6), # if >1 variable -- list(3:4,5:6)
minus sign to get descending v.names="Y", # v.names=c("Y","X")
order idvar=c("Site","Trt"), # to be used as row names
timevar="Rep", # new variable to be created
mydataSorted <- times=c(1:4), # values of new variable
mydata[order(-mydata\$Trt, direction="long")
mydata\$Site), ] data.serial
Hands-on :: color, composition, and layout :: color, composition, and layout
Parallel to Serial Parallel to Serial

idvar used as
row names

## row.names(data.serial) <- 1:NROW(data.serial)

data.serial
:: color, composition, and layout Hands-on :: color, composition, and layout

## data.parallel <- reshape(serialdata, # object to be reshaped

v.names=c("yld","dm"), # variables to be converted Remove “.” from column names
idvar=c("plot","date"), # variables to be retained
timevar="rep", # values of which will be colnames(data.parallel) <- gsub("[.]", "", colnames(data.parallel))
affixed to column names data.parallel
drop=c("var1","var2"), # variables to be removed
from the reshaped data
direction="wide")
data.parallel :: color, composition, and layout :: color, composition, and layout
Serial to Parallel
Serial to Parallel

## Change row names

row.names(data.parallel) <-
1:NROW(data.parallel)
data.parallel

## Aggregating data Aggregating data

• With only one response variable • With more than one response variable
meanY <- aggregate(data.serial\$Y, Ys <- subset(mydata,select=Y1:Y4) # data frame of numerical variables
by = list(data.serial\$Site,data.serial\$Trt), meanYs <- aggregate(Ys,
FUN=mean, by=list(mydata\$Site), # subsetting variables
na.rm=TRUE) # gets statistics from nonmissing values FUN=mean, # function to be performed

meanY na.rm=TRUE)
na.rm=TRUE na.rm=FALSE
meanYs