Sie sind auf Seite 1von 66

#=================================================================================

Basics of R and Data Types


#=================================================================================
#ff6600
#ff0000
#fb641b

R in Action

Hands-on Programming with R

R for Everyone: Advanced Analytics and Graphics

R Cookbook

Machine Learning with R

Practical Data Science with R

Introduction to Statistical Learning

# Example - 6 ( Second Maximum Number )


vec <- c(1,8,6,4,5)
#' Return the second-largest distinct value of a numeric vector.
#'
#' The previous version seeded its running maxima with 0, so any input made
#' up of negative numbers returned 0 instead of a value from the vector.
#'
#' @param vec A numeric vector. Missing values are ignored.
#' @return The largest value that is strictly smaller than the maximum, or
#'   `NA_real_` when `vec` has fewer than two distinct non-missing values.
second.max <- function(vec) {
  # unique() collapses ties with the maximum; sort() drops NA by default.
  distinct_desc <- sort(unique(vec), decreasing = TRUE)

  if (length(distinct_desc) < 2L) {
    return(NA_real_)
  }

  distinct_desc[2L]
}

# Evaluate the function with arguments


second.max (vec)

// Map a numeric score to a letter grade, printing a remark for the top bands.
// NOTE(review): score and grade are declared outside this fragment
// (presumably an int and a char) -- confirm against the enclosing function.
// Character literals use plain ASCII apostrophes; the typographic quotes in
// the pasted text are not valid C++.
if (score == 100) {
    grade = 'A';
    cout << "Superb" << endl;
}
else if (score >= 90) {
    grade = 'A';
    cout << "Excellent" << endl;
}
else if (score >= 80) {
    grade = 'B';
    cout << "Very Good" << endl;
}
else if (score >= 70) {
    grade = 'C';
    cout << "Good" << endl;
}
else if (score >= 60)
    grade = 'D';
else
    grade = 'F';

// Classify the character Ch as lower-case letter, upper-case letter, digit,
// or anything else. Each branch is a placeholder for the matching handler.
// NOTE(review): Ch is declared outside this fragment (presumably a char).
// Character literals use plain ASCII apostrophes; the typographic quotes in
// the pasted text are not valid C++.
if ((Ch >= 'a') && (Ch <= 'z')) {
    // code to process lower-case letter
}
else if ((Ch >= 'A') && (Ch <= 'Z')) {
    // code to process upper-case letter
}
else if ((Ch >= '0') && (Ch <= '9')) {
    // code to process digit character
}
else {
    // code to process non-letter/non-digit characters
}

Assign a value to double variable cost depending on the value of integer variable
distance as follows:
Distance Cost
----------------------------------- ----------
0 through 100 5.00
More than 100 but not more than 500 8.00
More than 500 but less than 1,000 10.00
1,000 or more 12.00

#--------------------------------------------------------------------------------
# R Basic Syntax & Data Types
#---------------------------------------------------------------------------------

1. PRINT HELLO WORLD :


print("Hello World")

2. ADD TWO NUMBERS :


print(23.9 + 11.6)

3. R Script File / Comments ( My first program in R Programming )

if(FALSE){
"This is a demo for multi-line comments and it should be put
inside either a single of double quote"
}
myString <- "Hello, World!"
print ( myString)

4. R WORK AS A CALCULATOR :

1+2 #Addition
5-2 #Subtraction
2*3 #Multiplication
3/2 #Division
2**3 #Exponentiation (the parser translates ** to ^)
2^3 #Exponentiation (preferred spelling)

#------------------------------------------------------------------------------
# WorkSpace
#-----------------------------------------------------------------------------

5. REGARDING WORKSPACE :
getwd() #Get working directory
setwd("C:\\Users\\Anup\\Desktop\\RFiles") #Setting a new working directory
x<-10
print(x)
X<-20
print(X)
ls() #Lists objects in current workspace
rm("x") #Remove object x from workspace
rm(list=ls()) #Remove all objects in workspace

#--------------------------------------------------------------------------------
# Create Variables
#--------------------------------------------------------------------------------

6. HOW TO CREATE A VARIABLE :

x<-10
print(x)
X<-20
print(X) # R is case sensitive
assign("x",15)
assign("X",25) # Alternative way of creating variable but not preferred

7. NAMING CONVENTIONS ( CORRECT WAYS )

studentNames <- c("Anup","Sangram","Subhajit")


student.names <- c("Anup","Sangram","Subhajit") #Best way
student_names <- c("Anup","Sangram","Subhajit")
studentnames3 <- c("Anup","Sangram","Subhajit")
.studentnames <- c("Anup","Sangram","Subhajit") # Correct but not preferred

8. NAMING CONVENTIONS ( INCORRECT WAYS )

3studentnames <- c("Anup","Sangram","Subhajit")


student names <- c("Anup","Sangram","Subhajit")
student@names <- c("Anup","Sangram","Subhajit")
student!names <- c("Anup","Sangram","Subhajit")

9. ?Reserved # Cannot use reserved words as variable names

10. QUITTING R
q()

#---------------------------------------------------------------------------------
# Getting Help
#---------------------------------------------------------------------------------

student.marks<-c(30,60,90)
student.marks
mean(student.marks) #Arithmetic mean
median(student.marks) #Arithmetic median

help.start()
?mean #if we know the function
help(mean) #Same as ?mean
??mean #if you do not know the function
??"arithmetic mean"
help.search("arithmetic mean") #same as ??"arithmetic mean"
#---------------------------------------------------------------------------------
# Packages
#--------------------------------------------------------------------------------

library() #All installed packages


search() #All loaded packages
install.packages("tm",dependencies = TRUE) #Install package "tm"
library(tm) #Load package "tm"
detach("package:tm") #Unload package

#---------------------------------------------------------------------------------
# Simple Data Types
#---------------------------------------------------------------------------------

1. LOGICAL ( TRUE , FALSE )

v <- TRUE
print(class(v))

2. NUMERIC ( 12.3, 5, 999 )

v <- 23.5
print(class(v))

3. INTEGER ( 2L, 34L, 0L )

v <- 2L
print(class(v))

4. COMPLEX ( 3 + 2i )

v <- 2+5i
print(class(v))

5. CHARACTER ( 'A' , "GOOD", "TRUE", '23.4' )

v <- "TRUE"
print(class(v))

6. RAW ( "HELLO" IS STORED AS 48 65 6C 6C 6F )

v <- charToRaw("Hello")
print(class(v))

==================================================================================
R data structures

There are numerous types of data structures across programming languages, each
with strengths and weaknesses suited to particular tasks. Since R is a programming
language used widely for statistical data analysis, the data structures it utilizes
were designed with this type of work in mind.

The R data structures used most frequently in machine learning are vectors,
factors, lists, arrays and matrices, and data frames. Each is tailored to a
specific data management task, which makes it important to understand how they will
interact in your R project. In the sections that follow, we will review their
similarities and differences.
1. Vectors ( A vector can contain any number of elements, but all of the elements
must be of the same type of values. For instance, a vector cannot contain both
numbers and text. To determine the type of vector v, use the typeof(v) command. )

subject_name <- c("Mathematics", "Statistic", "Machine Learning","Data Science")


#Character vector
temperature <- c(98.1, 98.6, 101.4) #Numeric vector
flu_status <- c(FALSE, FALSE, TRUE) #Logical Vector
num <- c(1L,2L,3L) #Integer Vector
color <- c('red','green',"yellow")
print(color)
print(class(color))

Creating R Vector using Range ( In R programming there is a special operator called


Range or Colon and this will help to create a vector. For example, i <- 1:10 means
1, 2, 3, 4, 5, 6, 7, 8, 9, 10 )

# Creating a sequence from 5 to 13.
v <- 5:13
print(v)

# Creating a sequence from 6.6 to 12.6.
v <- 6.6:12.6
print(v)

# If the final element specified does not belong to the sequence then it is
# discarded: 3.8:11.4 steps by 1 from 3.8 and stops at 10.8.
v <- 3.8:11.4
print(v)

Creating R Vector using Sequence (seq) Operator ( In this example we will show you,
How to create a vector using sequence operator, or simply seq operator. The
Sequence operator will return values sequentially )

# Vector with Sequence


# Build the sequence 1, 2, ..., 10 with every argument named explicitly.
a <- seq(from = 1, to = 10, by = 1)
print(a)
# Here, the argument names from =, to =, by = are optional, so you can omit them too
b <- seq(11, to = 15, by = 1) # Removing from
print(b)
# NOTE: the variable name c shadows nothing at call time (c(...) still finds
# the base function), but a different name would be clearer.
c <- seq(15, 25, by = 3) # Removing both from, to
print(c)
d <- seq(2, 20, 2) # removing from, to, and by
print(d)

Accessing R Vector Elements ( In R programming, We can use the index position to


access the elements in a Vector. Using this index value, we can access or
alter/change each and every individual element present in the vector. Index value
starts at 1 and end at n where n is the vector length. For example, if we declare a
vector that stores 10 elements then index starts at 1 and ends at 10. To access or
alter 1st value use Vector_Name[1] and to alter or access the 10th value, use
Vector_Name[10]. Lets see the example for better understanding: )

# R Vector Elements Accessing


a <- c("India", "China", "Japan", "UK", "USA", "Russia", "Sri Lanka")
print(a)
print(a[1])
print(a[3])
print(a[1:3])
print(a[4:6])
Access using Vector ( In this example we will show you, How to access the Vector
elements using another Vector.)

# R Vector Elements Accessing


a <- c("India", "China", "Japan", "UK", "USA", "Russia", "Sri Lanka")
print(a)
b <- c(2, 4, 6)
print(a[b])
print(a[c(5, 7)])
print(a[c(7, 4, 1)])

Using Negative Values ( In this example we will show you, How to access the Vector
elements using Negative values and the Boolean values. In R Vectors, Negative index
position are used to omit those values )

# R Vector Elements Accessing


a <- c("India", "China", "Japan", "UK", "USA", "Russia", "Sri Lanka")
print(a)
b <- c(-3, -7)
print(a[b])
print(a[c(-4, -6, -7)])
print(a[c(TRUE, FALSE, TRUE, TRUE, FALSE, TRUE, FALSE)])

Use Character Vectors as Index ( In this example we will show you, How to access
the Vector elements using Character Vectors Index values. From the below code
snippet you can observe that, we declare a vector with alphabet indexes. This can
help us to extract the vector elements using the alphabets. )

# R Vector Elements Accessing


v <- c("a" = 10, "b" = 20, "c" = 30, "d" = -15, "e" = 40)
print(v)
print(v["a"])
print(v["d"])
print(v[c("a", "c")])

Manipulating R Vector Elements ( In R Programming, we can manipulate the Vector


elements in following ways: )

# R Vector Elements Manipulation


a <- c(10, 20, 30, -15, 40, -25, 60, -5)
print(a)
a[7] <- 77
print(a)
a[a < 0] <- 99
print(a)
# Truncating the Vector Elements
a <- a[1:5]
print(a)
# Deleting Vector
a <- NULL
print(a)

Important Functions ( Following functions are some of the most useful functions
supported by the Vectors in R programming. )

# typeof(Vector): This function tells you the data type of the vector
# sort(Vector): This function sorts the items in ascending order
# length(Vector): This function counts the number of elements in a vector
# head(Vector, limit): This function returns the first six elements (if you omit
# the limit). If you specify the limit as 4, it returns the first 4 elements.
# tail(Vector, limit): This function returns the last six elements (if you omit
# the limit). If you specify the limit as 2, it returns the last two elements.

# R Vector Elements Accessing


a <- c(10, 20, 30, -15, 40, -25, 60, -5)
print(a)
# Some of the Important Functions
typeof(a)
length(a)
head(a)
head(a, 4)
tail(a)
tail(a, 3)
x <- sort(a)
print(x)

2. Factors ( A factor is a special case of vector that is solely used to represent


categorical or ordinal variables. In the medical dataset we are building, we might
use a factor to represent gender, because it uses two categories: MALE and
FEMALE. )

gender <- factor(c("MALE", "FEMALE", "MALE"))


print(gender)
blood <- factor(c("O", "AB", "A"),levels = c("A", "B", "AB", "O"))
print(blood[1:2])
symptoms <- factor(c("SEVERE", "MILD", "MODERATE"), levels = c("MILD", "MODERATE",
"SEVERE"), ordered = TRUE)
print(symptoms)
colors <- c('green','green','yellow','red','red','red','green')
factor_color <- factor(colors)
print(factor_color)
print(nlevels(factor_color))

# Factors
# Create a vector as input.
data <-
c("East","West","East","North","North","East","West","West","West","East","North")
print(data)
print(is.factor(data))
# Apply the factor function.
factor_data <- factor(data)
print(factor_data)
print(is.factor(factor_data))

# Factors in Data Frame


# Create the vectors for data frame.
height <- c(132,151,162,139,166,147,122)
weight <- c(48,49,66,53,67,52,40)
gender <- c("male","male","female","female","male","female","male")
# Create the data frame.
input_data <- data.frame(height,weight,gender)
print(input_data)
# Test if the gender column is a factor.
print(is.factor(input_data$gender))
# Print the gender column so see the levels.
print(input_data$gender)
# Changing the Order of Levels
data <-
c("East","West","East","North","North","East","West","West","West","East","North")
# Create the factors
factor_data <- factor(data)
print(factor_data)
# Apply the factor function with required order of the level.
new_order_data <- factor(factor_data,levels = c("East","West","North"))
print(new_order_data)

# Generating Factor Levels


v <- gl(3, 4, labels = c("Tampa", "Seattle","Boston"))
print(v)

3. Lists ( A list is a data structure, much like a vector, in that it is used for
storing an ordered set of elements. However, where a vector requires all its
elements to be the same type, a list allows different types of elements to be
collected. Due to this flexibility, lists are often used to store various types of
input and output data and sets of configuration parameters for machine learning
models )

# Create R List
list.data <- list("R", "Basic Programming", TRUE, c(10, 20, 30), 95, 105.61)
print(list.data)

Creating R List using Vectors

Example - 1
# Build the individual vectors and factors (console prompts removed so the
# snippet can be run as a script).
subject_name <- c("Mathematics", "Statistic", "Machine Learning",
                  "Data Science") # Character vector
temperature <- c(98.1, 98.6, 101.4) # Numeric vector
flu_status <- c(FALSE, FALSE, TRUE) # Logical vector
gender <- factor(c("MALE", "FEMALE", "MALE"))
num <- c(1L, 2L, 3L) # Integer vector
blood <- factor(c("O", "AB", "A"), levels = c("A", "B", "AB", "O"))
symptoms <- factor(c("SEVERE", "MILD", "MODERATE"),
                   levels = c("MILD", "MODERATE", "SEVERE"), ordered = TRUE)

# Collect the first element of each vector into one named list.
list.data <- list(
  fullname = subject_name[1],
  temperature = temperature[1],
  flu_status = flu_status[1],
  gender = gender[1],
  blood = blood[1],
  symptoms = symptoms[1]
)
# Print the list that was just created (the original printed an undefined
# object named subject1).
print(list.data)

Example - 2
# Creating three vectors
vect.a <- c(10.25, 20.45, 30.75, 40.85)
vect.b <- c(25, 50, 75, 100, 125)
vect.c <- c("India", "China", "Japan", "Russia", "USA")
# Creating list
list.data <- list(vect.a, vect.b, vect.c )
print(list.data)

Example - 3
vect.a <- c(10.25, 30.75, 20.45, 40.85)
vect.b <- c("India", "Japan", "Russia", "China", "USA")
list.data <- list(vect.a, vect.b )
print(list.data)

#Creating R List using Matrix


A <- matrix(c(1:12), nrow = 3)
vect.c <- c(50, 75, 25, 100, 125)
list.mixed <- list(A, list.data, vect.c )
print(list.mixed)

Creating Named List in R


list.data <- list("Company" = "R Basic Programming", "Flag" = TRUE, "prod" = c(10,
20, 30), "val" = 95, "sale" = 105.61)
print(list.data)

Creating Named List in R using names function


vect.x <- c(10, 30, 50, 70)
vect.y <- c("India", "Japan", "UK", "Russia", "China", "USA")
list.a <- list(vect.x, vect.y )
# Assigning Names
names(list.a) <- c("Num_Vector", "Country")
print(list.a)

matrix.A <- matrix(c(1:12), 3, 4)


vect.z <- c(55, 75, 25, 105, 125)
list.mixed <- list(matrix.A, list.a, vect.z, "R Basic Programming")
names(list.mixed) <- c("Num_Matrix", "Inner_List", "Rand_vector", "Company")
print(list.mixed)

Accessing R List Elements


vect.a <- c(10.25, 30.75, 20.45, 40.85)
vect.b <- c("India", "Japan", "Russia", "China", "USA")
vect.c <- c(50, 75, 25, 100, 125)

A <- matrix(c(1:12), 3, 4)

list.data <- list(A, vect.a, "R Basic Programming", vect.b, 95, vect.c )
print(list.data)

# Accessing First Element


print(list.data[1])

# Accessing Fourth Element


print(list.data[4])

Accessing R List items using Names


vect.x <- c(10, 30, 50, 70)
vect.y <- c("India", "Russia", "Japan", "UK", "China", "USA")

list.data <- list(vect.x, vect.y )


names(list.data) <- c("Numeric_Vector", "Country")

matrix.A <- matrix(c(1:12), 3, 4)


vect.z <- c(55, 75, 25, 105, 125)

list.mixed <- list(matrix.A, list.data, vect.z, "R Basic Programming")


names(list.mixed) <- c("Numeric_Matrix", "Nested_List", "Random_vector", "Company")
print(list.mixed)

# Accessing Vector.z Elements


print(list.mixed$Random_vector)

# Accessing Numeric_Matrix Elements


print(list.mixed$Numeric_Matrix)
# Accessing Nested List Elements
print(list.mixed$Nested_List)

Accessing R List items using Boolean vector


vect.a <- c(10.25, 30.75, 20.45, 40.85)
vect.b <- c("India", "Japan", "Russia", "China", "USA")
vect.c <- c(50, 75, 25, 100, 125)

A <- matrix(c(1:12), 3, 4)

list.data <- list(A, vect.a, "R Programming", vect.b, 95, vect.c )


print(list.data)

# Accessing List Element using Boolean Vector


print(list.data[c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE)])

# Accessing All Element except 1


print(list.data[-1])

# Accessing All Element except 4


print(list.data[-4])

# Accessing All Element except 1 and 6th element


print(list.data[c(-1, -6)])

Manipulating R List Elements


vect.x <- c(10, 30, 50, 70)
vect.y <- c("India", "Russia", "Japan", "UK", "China", "USA")

list.data <- list(vect.x, vect.y )


names(list.data) <- c("Numeric_Vector", "Country")

matrix.A <- matrix(c(1:12), 3, 4)


vect.z <- c(55, 75, 25, 105, 125)

list.mixed <- list(matrix.A, list.data, 95, vect.z, "R Programming")


names(list.mixed) <- c("Numeric_Matrix", "Nested_List", "favNum", "Random_vector",
"Company")
print(list.mixed)

list.mixed$Company <- "TutortialProgramming.org"


print(list.mixed$Company)

list.mixed$Random_vector <- c(22, 44, 66, 88)


print(list.mixed$Random_vector)

list.mixed$Numeric_Matrix <- NULL


print(list.mixed)

Merging two lists in R programming


vect.a <- c(10.25, 30.75, 20.45, 40.85)
vect.b <- c("India", "Japan", "Russia", "China", "USA")

# Creating List 1 from those two Vectors Vect.a, Vect.b


list.x <- list(vect.a, 95, vect.b )
print(list.x)

# Declared One 4 * 3 matrix


A <- matrix(c(1:12), 4, 3)
# Creating second list with one String, Matrix, and a vector
list.y <- list("R Programming", A, c(5, 10, 15) )
print(list.y)

# Combining or Merging two List


list.z <- c(list.x, list.y)
print(list.z)

Convert List to Vector in R


list.x <- list(1:15)
list.y <- list(25:40)
print(list.x)
print(list.y)
typeof(list.x)

vect.a <- unlist(list.x)


print(vect.a)

vect.b <- unlist(list.y)


print(vect.b)

4. Matrices ( R provides other structures that store values in a tabular


form. A matrix is a data structure that represents a two-dimensional table with
rows and columns of data. Like vectors, R matrixes can contain any one type of
data, although they are most often used for mathematical operations and, therefore,
typically store only numeric data. )

# Console prompts removed so the snippet can be run as a script.
m1 <- matrix(1:30, nrow = 5, ncol = 6) # Filled column-wise by default
m2 <- matrix(1:30, nrow = 5, ncol = 6, byrow = TRUE) # Filled row-wise

# A character matrix, filled row by row.
x <- matrix(c("a", "a", "b", "c", "b", "a"), nrow = 2, ncol = 3, byrow = TRUE)
print(x)

Create R Matrix using cbind and rbind


A <- c(1, 2, 3)
B <- c(20, 30, 40)
X <- cbind(A, B)
print(X)
Y <- rbind(A, B)
print(Y)

Defining Row names and Column names for matrix in R


A <- matrix(20:31, 3, 4, byrow = TRUE, dimnames = list(c("X", "Y", "Z"), c("A",
"B", "C", "D")))
print(A)

# Defining Row names and Column names of Matrix in R


row.names <- c("Row1", "Row2", "Row3")
column.names <-c("Col1", "Col2", "Col3", "Col4")
B <- matrix(c(1:12), nrow = 3, dimnames = list(row.names, column.names))
print(B)

Important Function for Matrix in R


A <- matrix(c(1:12), nrow = 3, ncol = 4, byrow = TRUE)
print(A)
class(A)
dim(A)
R Vector with Recycling elements ( All the above specify examples are working fine
because we specify exact elements in Rows and columns such as 12 elements arranged
in 3 rows and 4 columns. In this example we will show you, What will happen if we
specify less number of element )

A <- matrix(c(44: 46), nrow = 3, ncol = 3)


print(A)

B <- matrix(c(44: 46), nrow = 3, ncol = 3, byrow = TRUE)


print(B)
Accessing R Matrix Elements
# 3 x 4 matrix holding 1..12, filled row by row.
A <- matrix(c(1:12), nrow = 3, ncol = 4, byrow = TRUE)
print(A)
# Access the element at 1st row and 2nd column.
print(A[1, 2])
# Access the element at 3rd row and 4th column.
print(A[3, 4])
# Access only the 2nd row.
print(A[2, ])
# Access only the 4th column.
print(A[, 4])
# Access Complete Matrix.
print(A[, ])
# Access the elements at 1st, 3rd row and 2nd, 4th column.
# (The index vectors now match this description; they previously selected
# rows 1-2 and columns 3-4.)
print(A[c(1, 3), c(2, 4)])
# Access all the elements at 2nd and 3rd row.
print(A[c(2, 3), ])
# Access all the elements at 1st and 4th column.
print(A[, c(1, 4)])
# Access all the elements except 2nd row.
print(A[-2, ])
# Access all the elements except 2nd row and 3rd column.
print(A[-2, -3])
# Access all the elements except 3rd and 4th column.
print(A[, c(-3, -4)])

Modifying R Matrix Elements


A <- matrix(c(1:9), nrow = 3, ncol = 3)
print(A)
A[2, 2] <- 100
print(A)
A[A < 5] <- 222
print(A)

R Matrix Addition, Subtraction, Multiplication and Division

# Create 2x3 matrices.
A <- matrix(c(15, 34, 38, 44, 75, 93), nrow = 2)
B <- matrix(c(10, 20, 30, 40, 50, 60), nrow = 2)
print(A)
print(B)

# Adding two matrices (element by element)
print(A + B)

# Subtracting one matrix from another
print(A - B)

# Element-wise multiplication of the two matrices defined above.
# (Names are case sensitive: the matrices are A and B, not a and b.
# Use %*% for the algebraic matrix product.)
print(A * B)

# Element-wise division
print(A / B)

5. Arrays ( While matrices are confined to two dimensions, arrays can be of any
number of dimensions. The array function takes a dim attribute which creates the
required number of dimensions. In the example below we create an array with three
elements, each of which is a 4x4 matrix )

# Create Array in R
a <- array(c('green','yellow'),dim=c(4,4,3))
print(a)
z<-array(1:30,c(2,3,5))
print(z)

# Create two vectors of different lengths.


vector1 <- c(5,9,3)
vector2 <- c(10,11,12,13,14,15)
# Take these vectors as input to the array.
result <- array(c(vector1,vector2),dim=c(3,3,2))
print(result)

# Defining Row names and Column names for Array in R


# Create two vectors of different lengths.
vector1 <- c(5, 9, 3)
vector2 <- c(10, 11, 12, 13, 14, 15)
column.names <- c("COL1", "COL2", "COL3")
row.names <- c("ROW1", "ROW2", "ROW3")
matrix.names <- c("Matrix1", "Matrix2")
# Take these vectors as input to the array.
# dimnames must follow the order of dim: rows first, then columns, then
# matrices. The original listed the column names first, which labelled the
# rows COL1..COL3 and the columns ROW1..ROW3.
result <- array(
  c(vector1, vector2),
  dim = c(3, 3, 2),
  dimnames = list(row.names, column.names, matrix.names)
)
print(result)

# Accessing R Array Elements


# Create two vectors of different lengths.
vector1 <- c(5, 9, 3)
vector2 <- c(10, 11, 12, 13, 14, 15)
column.names <- c("COL1", "COL2", "COL3")
row.names <- c("ROW1", "ROW2", "ROW3")
matrix.names <- c("Matrix1", "Matrix2")
# Take these vectors as input to the array.
# dimnames follow the order of dim (rows, columns, matrices); the original
# passed the column names for the row dimension and vice versa.
result <- array(
  c(vector1, vector2),
  dim = c(3, 3, 2),
  dimnames = list(row.names, column.names, matrix.names)
)
# Print the third row of the second matrix of the array.
print(result[3, , 2])
# Print the element in the 1st row and 3rd column of the 1st matrix.
print(result[1, 3, 1])
# Print the 2nd Matrix.
print(result[, , 2])

# Manipulating Array Elements


# Create two vectors of different lengths.
vector1 <- c(5, 9, 3)
vector2 <- c(10, 11, 12, 13, 14, 15)
# Take these vectors as input to the array.
array1 <- array(c(vector1, vector2), dim = c(3, 3, 2))
# Create two more vectors of different lengths.
vector3 <- c(9, 1, 0)
vector4 <- c(6, 0, 11, 3, 14, 1, 2, 6, 9)
# Build the second array from vector3 and vector4. The original reused
# vector1 and vector2 here, so both arrays were identical and vector3 and
# vector4 were never used. array() recycles the 12 values to fill 18 cells.
array2 <- array(c(vector3, vector4), dim = c(3, 3, 2))
# Create matrices from these arrays (second layer of each).
matrix1 <- array1[, , 2]
matrix2 <- array2[, , 2]
# Add the matrices.
result <- matrix1 + matrix2
print(result)

# Calculations Across Array Elements


# Create two vectors of different lengths.
vector1 <- c(5,9,3)
vector2 <- c(10,11,12,13,14,15)
# Take these vectors as input to the array.
new.array <- array(c(vector1,vector2),dim=c(3,3,2))
print(new.array)
# Use apply to calculate the sum of the rows across all the matrices.
result <- apply(new.array, c(1), sum)
print(result)

#R Array Addition and Subtraction


vect1 <- c(10, 20, 40 )
vect2 <- c(55, 67, 89, 96, 100)
A <- array(c(vect1, vect2), dim = c(3, 4, 2))
print(A)
mat.A <- A[, , 1]
mat.B <- A[, , 2]
print(mat.A + mat.B)
print(mat.B - mat.A)

6. Data Frames ( By far, the most important R data structure utilized in machine
learning is the data frame, a structure analogous to a spreadsheet or database,
since it has both rows and columns of data. In R terms, a data frame can be
understood as a list of vectors or factors, each having exactly the same number of
values. Because the data frame is literally a list of vector type objects, it
combines aspects of both vectors and lists. )

Lets create a data frame for our patient dataset. Using the patient data vectors we
created previously, the data.frame() function combines them into a data frame:

# Console prompts removed so the snippet can be run as a script.
# Every column of a data frame needs the same number of values, so
# subject_name is limited to three entries to match the other vectors
# (the original four-element vector made data.frame() fail).
subject_name <- c("Mathematics", "Statistic", "Machine Learning") # Character vector
temperature <- c(98.1, 98.6, 101.4) # Numeric vector
flu_status <- c(FALSE, FALSE, TRUE) # Logical vector
gender <- factor(c("MALE", "FEMALE", "MALE"))
num <- c(1L, 2L, 3L) # Integer vector
blood <- factor(c("O", "AB", "A"), levels = c("A", "B", "AB", "O"))
symptoms <- factor(c("SEVERE", "MILD", "MODERATE"),
                   levels = c("MILD", "MODERATE", "SEVERE"), ordered = TRUE)

# stringsAsFactors = FALSE keeps subject_name as plain character data.
pt_data <- data.frame(subject_name, temperature, flu_status,
                      gender, blood, symptoms, stringsAsFactors = FALSE)

# R Create Data Frame


Id <- c(1:10)
Name <- c("John", "Rob", "Ruben", "Christy","Johnson", "Miller", "Carlson", "Ruiz",
"Yang","Zhu")
Occupation <- c("Professional", "Programmer","Management", "Clerical",
"Developer", "Programmer", "Management", "Clerical",
"Developer","Programmer")
Salary <- c(80000, 70000, 90000, 50000, 60000, 75000, 92000, 68000, 55000, 82000)

employee <- data.frame(Id, Name, Occupation, Salary)


print(employee)

# Create Named Data Frame in R Programming


Id <- c(1:6)
Name <- c("John", "Rob", "Christy","Johnson", "Miller", "Zhu")
Occupation <- c("Professional", "Management", "Developer", "Programmer",
"Clerical", "Admin")
Salary <- c(80000, 90000, 75000, 92000, 68000, 82000)
# We are assigning new names to the Columns
employee <- data.frame("Empid" = Id, "Full_Name" = Name, "Profession" = Occupation,
"income" = Salary)
print(employee)
# Names function will display the Index Names of each Item
print(names(employee))

# Accessing R Data Frame Elements


Id <- c(1:6)
Name <- c("John", "Rob", "Christy","Johnson", "Miller", "Zhu")
Occupation <- c("Professional", "Management", "Developer", "Programmer",
"Clerical", "Admin")
Salary <- c(80000, 90000, 75000, 92000, 68000, 82000)
employee <- data.frame(Id, Name, Occupation, Salary)
print(employee)
# Accessing all the Elements (Rows) Present in the Name Items (Column)
employee["Name"]
# Accessing all the Elements (Rows) Present in the 3rd Column (i.e., Occupation)
employee[3] # Index Values: 1 = Id, 2 = Name, 3 = Occupation, 4 = Salary

# Accessing R Data Frame Elements


employee[["Name"]]
employee[[3]]

# Accessing all the Elements (Rows) Present in the Name Item (Column)
employee$Name
# Accessing all the Elements (Rows) Present in the Salary Item (Column)
employee$Salary

# Accessing Element at 1st Row and 2nd Column


employee[1, 2]

# Accessing Element at 4th Row and 3rd Column


employee[4, 3]

# Accessing All Elements at 5th Row


employee[5, ]

# Accessing All Item of the 4th Column


employee[, 4]

# Accessing Item at 1st, 2nd Rows and 3rd, 4th Columns


employee[c(1, 2), c(3, 4)]
# Accessing Item at 2nd, 3rd, 4th Rows and 2nd, 4th Columns
employee[2:4, c(2, 4)]

# Accessing All Item at 2nd, 3rd, 4th, 5th Rows


employee[2:5, ]

# Accessing All Item of 2nd and 4th Column


employee[c(2, 4)]

# Accessing Item at 2nd, 4th Rows of Name Columns


employee$Name[c(2, 4)]

# Accessing Item at 2nd, 3rd, 4th, 5th Rows of Occupation Column


employee$Occupation[2:5]

# The edits below assume a three-column data frame (Id, Name, Salary): the
# 3rd column receives a number, the appended row supplies three values, and
# Occupation is added as a new column at the end. The four-column employee
# frame built earlier does not fit, so the frame is rebuilt here.
Id <- c(1:6)
Name <- c("John", "Rob", "Christy", "Johnson", "Miller", "Zhu")
Salary <- c(80000, 90000, 75000, 92000, 68000, 82000)
employee <- data.frame(Id, Name, Salary)

# Modifying Item at 2nd Row and 3rd Column (Salary)
employee[2, 3] <- 100000
print(employee)

# Modifying All Item of 1st Column
employee[, 1] <- c(10:15)
print(employee)

# Adding Extra Row (the result is displayed only; employee is not changed)
rbind(employee, list(7, "Programming", 105505))

# Adding Extra Column (the result is displayed only; employee is not changed)
Occupation <- c("Management", "Developer", "User", "Programmer", "Clerical",
                "Admin")
cbind(employee, Occupation)

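You might notice something new in the preceding code. We included an additional
parameter: stringsAsFactors = FALSE. In R versions before 4.0.0, omitting this
option makes R automatically convert every character vector to a factor; since
R 4.0.0 the default is already FALSE, so the argument is only needed for
compatibility with older versions.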

This feature is occasionally useful, but also sometimes unwarranted. Here, for
example, the subject_name field is definitely not categorical data, as names are
not categories of values. Therefore, setting the stringsAsFactors option to FALSE
allows us to convert character vectors to factors only where it makes sense for the
project.

When we display the pt_data data frame, we see that the structure is quite
different from the data structures we worked with previously:

# Display the patient data frame (console prompt removed so the line runs as
# script code).
print(pt_data)

#=================================================================================
Operators in R
#=================================================================================

#---------------------------------------------------------------------------------
# Arithmetic/Relational Operators
#---------------------------------------------------------------------------------
Operators are particular symbols which operate on some values and produce an
output. The values are known as operands.
x <- 4 + 5
Here 4 and 5 are operands and the (+) and (<-) signs are the operators. The
addition produces the output 9, which the assignment operator stores in x.

Arithmetic Operators

1. ADDS TWO VECTORS


a <- c( 2,5.5,6)
b <- c(8, 3, 4)
print(a+b)

2. SUBTRACTS SECOND VECTOR FROM THE FIRST


v <- c( 2,5.5,6)
t <- c(8, 3, 4)
print(v-t)

3. MULTIPLIES BOTH VECTORS


v <- c( 2,5.5,6)
t <- c(8, 3, 4)
print(v*t)

4. DIVIDE THE FIRST VECTOR WITH THE SECOND


v <- c( 2,5.5,6)
t <- c(8, 3, 4)
print(v/t)

5. GIVE THE REMAINDER OF THE FIRST VECTOR WITH THE SECOND


v <- c( 2,5.5,6)
t <- c(8, 3, 4)
print(v%%t)

6. THE RESULT OF DIVISION OF FIRST VECTOR WITH SECOND (QUOTIENT)


v <- c( 2,5.5,6)
t <- c(8, 3, 4)
print(v%/%t)

7. THE FIRST VECTOR RAISED TO THE EXPONENT OF SECOND VECTOR


v <- c( 2,5.5,6)
t <- c(8, 3, 4)
print(v^t)

Relational Operators

1. CHECKS IF EACH ELEMENT OF THE FIRST VECTOR IS GREATER THAN THE CORRESPONDING
ELEMENT OF THE SECOND VECTOR.
v <- c(2,5.5,6,9)
t <- c(8,2.5,14,9)
print(v > t)

2. CHECKS IF EACH ELEMENT OF THE FIRST VECTOR IS LESS THAN THE CORRESPONDING
ELEMENT OF THE SECOND VECTOR.
v <- c(2,5.5,6,9)
t <- c(8,2.5,14,9)
print(v < t)

3. CHECKS IF EACH ELEMENT OF THE FIRST VECTOR IS EQUAL TO THE CORRESPONDING ELEMENT
OF THE SECOND VECTOR.
v <- c(2,5.5,6,9)
t <- c(8,2.5,14,9)
print(v==t)
4. CHECKS IF EACH ELEMENT OF THE FIRST VECTOR IS LESS THAN OR EQUAL TO THE
CORRESPONDING ELEMENT OF THE SECOND VECTOR.
v <- c(2,5.5,6,9)
t <- c(8,2.5,14,9)
print(v<=t)

5. CHECKS IF EACH ELEMENT OF THE FIRST VECTOR IS GREATER THAN OR EQUAL TO THE
CORRESPONDING ELEMENT OF THE SECOND VECTOR.
v <- c(2,5.5,6,9)
t <- c(8,2.5,14,9)
print(v>=t)

6. CHECKS IF EACH ELEMENT OF THE FIRST VECTOR IS UNEQUAL TO THE CORRESPONDING


ELEMENT OF THE SECOND VECTOR.
v <- c(2,5.5,6,9)
t <- c(8,2.5,14,9)
print(v!=t)

#---------------------------------------------------------------------------------
# Logical/Assignment/Miscellaneous Operators
#---------------------------------------------------------------------------------

1. IT IS CALLED ELEMENT-WISE LOGICAL AND OPERATOR. IT COMBINES EACH ELEMENT OF THE


FIRST VECTOR WITH THE CORRESPONDING ELEMENT OF THE SECOND VECTOR AND GIVES A OUTPUT
TRUE IF BOTH THE ELEMENTS ARE TRUE.
a <- c(3,1,TRUE,2+3i)
b <- c(4,1,FALSE,2+3i)
print(a&b)

2. IT IS CALLED ELEMENT-WISE LOGICAL OR OPERATOR. IT COMBINES EACH ELEMENT OF THE
FIRST VECTOR WITH THE CORRESPONDING ELEMENT OF THE SECOND VECTOR AND GIVES AN OUTPUT
TRUE IF AT LEAST ONE OF THE ELEMENTS IS TRUE.
x <- c(3,0,TRUE,2+2i)
y <- c(4,0,FALSE,2+3i)
print(x|y)

3. IT IS CALLED LOGICAL NOT OPERATOR. TAKES EACH ELEMENT OF THE VECTOR AND GIVES
THE OPPOSITE LOGICAL VALUE
p <- c(3,0,TRUE,2+2i)
print(!p)

#---------------------------------------------------------------------------------
4. CALLED LOGICAL AND OPERATOR. TAKES FIRST ELEMENT OF BOTH THE VECTORS AND GIVES
THE TRUE ONLY IF BOTH ARE TRUE.
a <- c(3, 0, TRUE, 2 + 2i)
b <- c(1, 3, TRUE, 2 + 3i)
# && compares single values only; passing vectors longer than one is an
# error since R 4.3.0, so the first elements are selected explicitly.
print(a[1] && b[1])

5. CALLED LOGICAL OR OPERATOR. TAKES FIRST ELEMENT OF BOTH THE VECTORS AND GIVES
TRUE IF AT LEAST ONE OF THEM IS TRUE.
x <- c(0, 0, TRUE, 2 + 2i)
y <- c(0, 3, TRUE, 2 + 3i)
# || compares single values only; passing vectors longer than one is an
# error since R 4.3.0, so the first elements are selected explicitly.
print(x[1] || y[1])

#---------------------------------------------------------------------------------
# Assignment Operators
#---------------------------------------------------------------------------------
6. CALLED LEFT ASSIGNMENT
x1 <- c(3,1,TRUE,2+3i)
print(x1)

x2 <<- c(3,1,TRUE,2+3i)
print(x2)

x3 = c(3,1,TRUE,2+3i)
print(x3)

7. CALLED RIGHT ASSIGNMENT


c(3,1,TRUE,2+3i) -> y1
print(y1)

c(3,1,TRUE,2+3i) ->> y2
print(y2)

#---------------------------------------------------------------------------------
# Miscellaneous Operators
#---------------------------------------------------------------------------------
8.COLON OPERATOR. IT CREATES THE SERIES OF NUMBERS IN SEQUENCE FOR A VECTOR.
a <- 2:8
print(a)

9. THIS OPERATOR IS USED TO IDENTIFY IF AN ELEMENT BELONGS TO A VECTOR.


x1 <- 8
x2 <- 12
f <- 1:10
print(x1 %in% f)
print(x2 %in% f)

10. %*% IS THE MATRIX MULTIPLICATION OPERATOR. HERE IT IS USED TO MULTIPLY A MATRIX WITH ITS TRANSPOSE.


# Build a 2 x 3 matrix row by row, then multiply it by its transpose.
# The product of a 2 x 3 and a 3 x 2 matrix is a 2 x 2 matrix.
m <- matrix(
  c(2, 6, 5, 1, 10, 4),
  nrow = 2,
  ncol = 3,
  byrow = TRUE
)
r <- m %*% t(m)
print(r)

#=================================================================================
Programming with R
#=================================================================================

#---------------------------------------------------------------------------------
# if, if else, if else if & switch Statements
#---------------------------------------------------------------------------------

#---------------------------------------------------------------------------------
The R if statement is used to test the condition. It checks boolean
condition:true or false. There are various types of if statement in R.

if statement
if-else statement
nested if statement
if-else-if ladder

1.if ( The R If Statement is the basic decision-making statement in real


programming world. It allows the compiler to test the condition first, and
depending upon the result it will execute the statements. If the test condition is
true then only statements with in the if statement is executed. )
Here is the syntax for an if statement in R:
if(condition){
# code to be executed
}

# EXAMPLE -1
# Read an integer from the console. as.integer() yields NA (with a warning)
# when the text typed is not a number.
number <- as.integer(readline(prompt="Please Enter any integer Value: "))

# A number is positive when it is greater than 0, so 1 must pass the test too.
# The NA check keeps if() from stopping with an error on unusable input.
if (!is.na(number) && number > 0) {
  print("You have entered POSITIVE Number")
}

# EXAMPLE - 2
hot <- FALSE
temp <- 60
if (temp > 80){
hot <- TRUE
}
print(hot)

# EXAMPLE - 3
temp <- 100
if (temp > 80){
hot <- TRUE
}
print(hot)

# EXAMPLE - 4
if( 1 == 1){ print('hi')}

# EXAMPLE - 5
x <- 30L
if(is.integer(x)){
print("X is an Integer")
}

2.if-else ( The R if-else statement also tests the condition. It executes the if
block if condition is true otherwise else block is executed. )

It has the syntax:


if(condition){
# code if condition is true
}else{
# code if condition is false
}

# EXAMPLE - 1
my.age <- as.integer(readline(prompt="Please Enter your Age: "))
if (my.age < 18) {
print("You are Not a Major.")
print("You are Not Eligible to Work")
} else {
if (my.age >= 18 && my.age <= 60 ) {
print("You are Eligible to Work")
print("Please fill the Application Form and Email to us")
} else {
print("As per the Government Rules, You are too Old to Work")
print("Please Collect your pension!")
}
}

# EXAMPLE - 2
temp <- 30
if (temp > 90){
print("Hot outside!")
} else{
print("Its not too hot today!")
}

# EXAMPLE - 3
x <- c("what","is","truth")
if("Truth" %in% x){
print("Truth is found")
} else {
print("Truth is not found")
}

3. if-else-if.. ( The if-else-if ladder statement executes one condition from


multiple statements. )
It has the syntax:
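if(condition1){
# code to be executed if condition1 is true
}else if(condition2){
# code to be executed if condition2 is true
}else if(condition3){
# code to be executed if condition3 is true
# ... (further else if branches as needed)
}else{
# code to be executed if all the conditions are false
}
Note: at the top level of a script, else must start on the same line as the
closing brace of the preceding block; otherwise R reports a syntax error.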

# EXAMPLE - 1
my.marks <- as.integer(readline(prompt="Please Enter your Total Marks: "))
if (my.marks >= 550) {
print("Congratulations!!")
print("You are eligible for Full Scholarship")
} else if (my.marks >= 490) {
print("Congratulations!!")
print("You are eligible for 50% Scholarship")
} else if (my.marks >= 400) {
print("Congratulations!!")
print("You are eligible for 10% Scholarship")
} else {
print("You are NOT eligible for Scholarship")
print("We are really Sorry for You")
}

# EXAMPLE - 2
temp <- 30
# Each branch is reached only when every earlier test failed, so a single
# lower bound per branch is enough. This also covers exactly 80 and exactly 50,
# which explicit open ranges (50 < temp < 80, 32 < temp < 50) would leave out.
if (temp > 80) {
  print("Hot outside!")
} else if (temp > 50) {
  print('Nice outside!')
} else if (temp > 32) {
  print("Its cooler outside!")
} else {
  print("Its really cold outside!")
}

# EXAMPLE - 3
x <- c("what","is","truth")
if("Truth" %in% x){
print("Truth is found the first time")
} else if ("truth" %in% x) {
print("truth is found the second time")
} else {
print("No truth found")
}

# EXAMPLE - 4
# Sales counts for the two items, and a placeholder for the summary text.
ham <- 10
cheese <- 10
report <- 'blank'
# if() needs a single TRUE/FALSE, so the scalar, short-circuiting && is used
# rather than the elementwise &.
if (ham >= 10 && cheese >= 10) {
  report <- "Strong sales of both items"
} else if (ham == 0 && cheese == 0) {
  report <- "Nothing sold!"
} else {
  report <- 'We had some sales'
}
print(report)

4. Switch Statement ( The R switch statement executes one statement from multiple
conditions. It is like if-else-if ladder statement. )

# EXAMPLE - 1
subject <- switch( 3,
"Learn",
"R Programming",
"R ",
"Programming"
)
print(subject)

# EXAMPLE - 2
# Apply the arithmetic operator typed by the user to two fixed numbers.
number1 <- 30
number2 <- 20
operator <- readline(prompt="Please enter any ARITHMETIC OPERATOR You wish!: ")

# switch() matches the text in `operator` against the branch names.
# The final unnamed branch is the default and runs when nothing matches.
switch(operator,
  "+" = print(paste("Addition of two numbers is: ", number1 + number2)),
  "-" = print(paste("Subtraction of two numbers is: ", number1 - number2)),
  "*" = print(paste("Multiplication of two numbers is: ", number1 * number2)),
  "^" = print(paste("Exponent of two numbers is: ", number1 ^ number2)),
  "/" = print(paste("Division of two numbers is: ", number1 / number2)),
  "%/%" = print(paste("Integer Division of two numbers is: ",
                      number1 %/% number2)),
  # %% yields the remainder, not the quotient.
  "%%" = print(paste("Remainder of two numbers is: ", number1 %% number2)),
  print(paste("Unsupported operator: ", operator))
)

#---------------------------------------------------------------------------------
# Different Loops
#--------------------------------------------------------------------------------
1. Repeat Loop ( The R repeat loop is used to execute a part of the program several
times. If the number of iterations is not fixed and the loop body must run at least
once, the repeat loop is recommended. The body always runs at least once because
repeat has no condition of its own: the loop ends only when a break statement
inside the body is reached.)

The basic syntax for creating a repeat loop in R is:


repeat {
commands
if(condition){
break
}
}

# EXAMPLE - 1
total <- 0
number <- as.integer(readline(prompt="Please Enter any integer Value below 10: "))
repeat {
total = total + number
number = number + 1
if (number > 10) {
break
}
}
print(paste("The total Sum of Numbers From the Repeat Loop is: ", total))

# EXAMPLE - 2
v <- c("Hello","loop")
cnt <- 2
repeat{
print(v)
cnt <- cnt+1
if(cnt > 5){
break
}
}

2. While Loop ( The R while loop is used to iterate a part of the program several
times. If the number of iteration is not fixed, it is recommended to use while
loop.)
The basic syntax for creating a while loop in R is :
while (test_expression) {
statement
}

# EXAMPLE - 1
v <- c("Hello","while loop")
cnt <- 2
while (cnt < 7){
print(v)
cnt = cnt + 1
}

# EXAMPLE - 2
x <- 0
while(x < 10){
cat('x is currently: ',x)
print(' x is still less than 10, adding 1 to x')
# add one to x
x <- x+1
}

# EXAMPLE - 3
x <- 0
while(x < 10){
cat('x is currently: ',x)
print(' x is still less than 10, adding 1 to x')
# add one to x
x <- x+1
if(x==10){
print("x is equal to 10! Terminating loop")
}
}

# EXAMPLE - 4
total = 0
number <- as.integer(readline(prompt="Please Enter any integer Value below 10: "))

while (number <= 10) {


total = total + number
number = number + 1
}
print(paste("The total Sum of Numbers From the While Loop is: ", total))

3. For Loop ( The R for loop is used to iterate a part of the program several
times. If the number of iteration is fixed, it is recommended to use for loop.)
The basic syntax for creating a for loop statement in R is:
for (value in vector) {
statements
}

# EXAMPLE - 1
v <- LETTERS[1:4]
for ( i in v) {
print(i)
}

# EXAMPLE - 2
vec <- c(1,2,3,4,5)
for (temp_var in vec){
print(temp_var)
}

# EXAMPLE - 3
vec <- c(1,2,3,4,5)
# Loop over the positions of vec. seq_along() yields no positions for an empty
# vector, whereas 1:length(vec) would count down from 1 to 0.
for (i in seq_along(vec)) {
  print(vec[i])
}

# EXAMPLE - 4
li <- list(1,2,3,4,5)
for (temp_var in li){
print(temp_var)
}

# EXAMPLE - 5
bins <- seq(20,130,by=10)
li <- list(1,2,3,4,5)
# Loop over the positions of the list. seq_along() is safe for an empty list,
# whereas 1:length(li) would count down from 1 to 0.
for (i in seq_along(li)) {
  # Double brackets extract the element itself; single brackets would return
  # a one-element list.
  print(li[[i]])
}

# EXAMPLE - 6
mat <- matrix(1:25,nrow=5)
for (num in mat){
print(num)
}

# EXAMPLE - 7 ( Nested for loops )


mat <- matrix(1:25,nrow=5)
# Visit every cell row by row. seq_len() yields nothing for a matrix with zero
# rows or columns, whereas 1:nrow(mat) would count down from 1 to 0.
for (row in seq_len(nrow(mat))) {
  for (col in seq_len(ncol(mat))) {
    print(paste('The element at row:',row,'and col:',col,'is',mat[row,col]))
  }
}

4. Break Statement ( The R break statement is used to exit a loop (for, while or
repeat) immediately when a specified condition is met. Unlike in C-style
languages, it is not needed inside switch. In the case of nested loops, it exits
only the innermost loop. )

# EXAMPLE - 1
v <- c("Hello","loop")
cnt <- 2
repeat{
print(v)
cnt <- cnt+1
if(cnt > 5){
break
}
}

# EXAMPLE - 2
number <- 10
while (number > 0) {
if (number == 3) {
print(paste("Coming out from While loop Where number = ", number))
break
}
print(paste("Values are : ", number))
number = number - 1
}

# EXAMPLE - 3
number <- 1:10
for (val in number) {
if (val == 7) {
print(paste("Coming out from for loop Where i = ", val))
break
}
print(paste("Values are : ", val))
}

5. Next Statement ( The R next statement is used to continue loop. It continues the
current flow of the program and skips the remaining code at specified condition. In
case of inner loop, it continues only inner loop. )
# EXAMPLE - 1
number <- 1:20
for (val in number) {
if (val %% 2 != 0) {
print(paste("ODD Number = ", val, "(Skipped by Next Statement)"))
next
}
print(paste("EVEN Number = ", val))
}

# EXAMPLE - 2
v <- LETTERS[1:6]
for ( i in v){
if (i == "D"){
next
}
print(i)
}

# EXAMPLE - 3
number <- 0
while (number <= 10) {
if (number == 4 || number == 7) {
print(paste("Skipped by the Next Statement = ", number))
number = number + 1
next
}
print(paste("Values are : ", number))
number = number + 1
}

#=================================================================================
Functions with R
#================================================================================

A Function is a self-contained block of code.


A Function can be called as a section of a program that is written once and can be
executed whenever required in the program, thus making code reusability.
A Function is a subprogram that works on data and produce some output.

Types of Functions:
There are three types of Functions.
a) Built-in Functions: Functions that are predefined. We have used many predefined
functions in R.
b) User - Defined: Functions that are created according to the requirements.
c) Recursive Functions : Recursive functions in R means a function calling itself

Function Definition
An R function is created by using the keyword function. The basic syntax of an R
function definition is as follows:

function_name <- function(arg_1, arg_2, ...) {


Function body
}

1. Built-in Function
# Create a sequence of numbers from 32 to 44.
print(seq(32,44))
# Find mean of numbers from 25 to 82.
print(mean(25:82))
# Find sum of numbers from 41 to 68.
print(sum(41:68))

2. User-defined Function
# EXAMPLE - 1 ( CREATE A FUNCTION TO PRINT SQUARES OF NUMBERS IN SEQUENCE )
# Print the squares of the whole numbers 1 to a, one per line.
#
# a: how many squares to print; nothing is printed when a is 0.
# Returns NULL invisibly (the value of the for loop).
new.function <- function(a) {
  # seq_len(a) is empty for a = 0, whereas 1:a would iterate over 1 and 0.
  for (i in seq_len(a)) {
    b <- i^2
    print(b)
  }
}
# Call the function new.function supplying 6 as an argument.
new.function(6)

# EXAMPLE - 2 ( CREATE A FUNCTION WITHOUT AN ARGUMENT )


# Print the squares of 1 through 5, one per line; takes no arguments.
# Returns NULL invisibly (the value of the for loop).
new.function <- function() {
  squares <- (1:5)^2
  for (value in squares) {
    print(value)
  }
}
# Call the function without supplying an argument.
new.function()

# EXAMPLE -3 ( CALLING A FUNCTION WITH ARGUMENT VALUES (BY POSITION AND BY NAME))
# Compute a * b + c and print it.
#
# a, b, c: numbers; they can be passed by position or by name.
# Returns the printed value invisibly, because print() hands back its argument.
new.function <- function(a,b,c) {
  print(a * b + c)
}
# Call the function by position of arguments.
new.function(5,3,11)
# Call the function by names of the arguments.
new.function(a=11,b=5,c=3)

# EXAMPLE -4 ( CREATE A FUNCTION WITH DEFAULT ARGUMENT )


# Multiply a by b and print the product.
#
# a: first factor, 3 when omitted.
# b: second factor, 6 when omitted.
# Returns the product invisibly, because print() hands back its argument.
new.function <- function(a = 3,b =6) {
  result <- a*b
  print(result)
}
# Call the function without giving any argument.
new.function()
# Call the function with giving new values of the argument.
new.function(9,5)

# EXAMPLE - 5 ( LAZY EVALUATION OF FUNCTION.CREATE A FUNCTION WITH ARGUMENTS.)


# Demonstrates lazy evaluation: an argument is evaluated only when it is used.
# a: value printed squared and then as given.
# b: value printed last; it has no default.
new.function <- function(a, b) {
print(a^2)
print(a)
# b is used for the first time here, so a call without b fails only at this line.
print(b)
}
# Evaluate the function without supplying one of the arguments.
# a^2 and a are printed before print(b) stops with a missing-argument error.
new.function(6)

# EXAMPLE - 6 ( LET US LOOK AT AN EXAMPLE WHICH WILL RETURN WHETHER A GIVEN NUMBER
# IS POSITIVE, NEGATIVE OR ZERO.)
# Classify a single number by its sign.
#
# x: a numeric value.
# Returns "Positive", "Negative" or "Zero".
check <- function(x) {
  if (x > 0) {
    return("Positive")
  }
  if (x < 0) {
    return("Negative")
  }
  # Neither above nor below zero.
  "Zero"
}

# evaluate the function with arguments


check(1)
check(-10)
check(0)

# EXERCISE : SUM AND AVERAGE OF 3 NUMBERS USING R FUNCTIONS


# Print the sum and the average of three numbers.
#
# a, b, c: the numbers to combine.
# Returns the "Average of ..." message invisibly, because the final print()
# hands back its argument.
sum.numbers <- function(a, b, c) {
  total <- a + b + c

  # Both report lines share one layout; only the label and the value differ.
  report <- function(label, value) {
    print(paste(label, a, ",", b, ",", c, "is = ", value))
  }

  report("Sum of ", total)
  report("Average of ", total / 3)
}
sum.numbers(20, 10, 70)

# Exercise : Sum and Average of 3 Numbers using R Functions ( Input Should be taken
# from keyboard )
# Prompt for three integers on the console, then print their sum and average.
#
# a, b, c: never read; each one is overwritten by the value typed at its prompt.
# Returns the "Average of ..." message invisibly (the value of the last print()).
sum.numbers <- function(a, b, c) {
  # Ask for all three values first, in the order a, b, c.
  answers <- list(
    readline(prompt="Enter a Value: "),
    readline(prompt="Enter b Value: "),
    readline(prompt="Enter c Value: ")
  )

  # readline() returns text, so convert each answer to an integer.
  a <- as.integer(answers[[1]])
  b <- as.integer(answers[[2]])
  c <- as.integer(answers[[3]])

  total <- a + b + c
  mean_value <- total / 3

  print(paste("Sum of ",a, ",", b, ",", c, "is = ", total))
  print(paste("Average of ",a, ",", b, ",", c, "is = ", mean_value))
}
sum.numbers(a, b, c)

3. Recursive Functions :
R supports recursion, a technique that allows simple and elegant code for some
problems. A recursive function in R is a function that calls itself. To
understand recursive programming, let us consider a well-known yet simple
example: the factorial.

# EXAMPLE - 1
# Compute the factorial of a non-negative whole number recursively.
#
# number: a non-negative whole number.
# Returns 1 for 0 and 1, otherwise number * factorial of (number - 1).
# Stops with an error for negative input, which would otherwise recurse
# without ever reaching the base case.
Number.factorial <- function(number)
{
  if (number < 0) {
    stop("Number.factorial() is not defined for negative numbers")
  }
  if (number == 0 || number == 1) {
    return(1)
  }
  number * Number.factorial(number - 1)
}
# Call the factorial function defined above.
Number.factorial(6)

# EXERCISE : FIND SUM OF SERIES 1²+2²+3²+…..+N²

# Recursive Functions in R Example


# Recursively add up the squares 1^2 + 2^2 + ... + number^2.
#
# number: how many terms to sum; 0 yields 0.
Sum.Series <- function(number)
{
  if (number != 0) {
    # Sum of the first (number - 1) squares plus the square of this term.
    return(Sum.Series(number - 1) + (number * number))
  }
  0
}
Sum.Series(5)

#================================================================================
Strings in R
#================================================================================

In R, a string is a sequence of characters written inside single or double
quotes. Strings are stored as elements of a character vector, so even a single
string is a character vector of length one.

# EXAMPLES OF VALID STRINGS


a <- 'Start and end with single quote'
print(a)
b <- "Start and end with double quotes"
print(b)
c <- "single quote ' in between double quotes"
print(c)
d <- 'Double quotes " in between single quote'
print(d)

# EXAMPLES OF INVALID STRINGS


e <- 'Mixed quotes"
print(e)
f <- 'Single quote ' inside single quote'
print(f)
g <- "Double quotes " inside double quotes"
print(g)

# EXAMPLE - 1( CONCATENATING STRINGS USING PASTE() FUNCTION )


a <- "Hello"
b <- 'How'
c <- "are you? "
print(paste(a,b,c))
print(paste(a,b,c, sep = "-"))
print(paste(a,b,c, sep = "", collapse = ""))

# EXAMPLE -2 ( FORMATTING NUMBERS & STRINGS USING FORMAT() FUNCTION )


# Total number of digits displayed. Last digit rounded off.
result <- format(23.123456789, digits = 9)
print(result)
# Display numbers in scientific notation.
result <- format(c(6, 13.14521), scientific = TRUE)
print(result)
# The minimum number of digits to the right of the decimal point.
result <- format(23.47, nsmall = 5)
print(result)
# Format treats everything as a string.
result <- format(6)
print(result)
# Numbers are padded with blank in the beginning for width.
result <- format(13.7, width = 6)
print(result)
# Left justify strings.
result <- format("Hello",width = 8, justify = "l")
print(result)
# Justify string with center.
result <- format("Hello",width = 8, justify = "c")
print(result)

# Example -3 ( Counting number of characters in a string - nchar() function )


result <- nchar("Count the number of characters")
print(result)

# Example -4 ( Changing the case - toupper() & tolower() functions )


# Changing to Upper case.
result <- toupper("Changing To Upper")
print(result)
# Changing to lower case.
result <- tolower("Changing To Lower")
print(result)

# Example -5 ( Extracting parts of a string - substring() function )


# Extract characters from 5th to 7th position.
result <- substring("Extract", 5, 7)
print(result)

#================================================================================
Data Reshaping in R
#================================================================================

# -------------------------------------------------------------------------------
# Data Reshaping
# Joining Columns and Rows in a Data Frame
# Create vector objects.
city <- c("Tampa","Seattle","Hartford","Denver")
state <- c("FL","WA","CT","CO")
# ZIP codes are identifiers, not quantities. Keep them as text so the leading
# zero of "06161" is not lost.
zipcode <- c("33602","98104","06161","80294")
# Combine above three vectors into one data frame. cbind() on plain vectors
# would build a character matrix instead.
addresses <- data.frame(city, state, zipcode, stringsAsFactors = FALSE)

# Print a header.
cat("# # # # The First data frame\n")
# Print the data frame.
print(addresses)
# Create another data frame with similar columns
new.address <- data.frame(
city = c("Lowry","Charlotte"),
state = c("CO","FL"),
zipcode = c("80230","33949"),
stringsAsFactors=FALSE
)
# Print a header.
cat("# # # The Second data frame\n")
# Print the data frame.
print(new.address)
# Combine rows from both the data frames.
all.addresses <- rbind(addresses,new.address)
# Print a header.
cat("# # # The combined data frame\n")
# Print the result.
print(all.addresses)

# Merging Data Frames

# Example - 1
df1 = data.frame(CustomerId = c(1:6), Product = c(rep("Toaster", 3), rep("Radio",
3)))
df2 = data.frame(CustomerId = c(2, 4, 6), State = c(rep("Alabama", 2), rep("Ohio",
1)))

# Inner join: merge(df1, df2) will work for these examples because R automatically
# joins the frames by common variable names, but you would most likely want to
# specify merge(df1, df2, by = "CustomerId") to make sure that you were matching on
# only the fields you desired. You can also use the by.x and by.y parameters if the
# matching variables have different names in the different data frames.

inner.join <- merge(df1, df2)


print(inner.join)

# Outer join:
outer.join <- merge(x = df1, y = df2, by = "CustomerId", all = TRUE)

# Left outer:
left.join <- merge(x = df1, y = df2, by = "CustomerId", all.x = TRUE)

# Right outer:
right.join <- merge(x = df1, y = df2, by = "CustomerId", all.y = TRUE)

# Cross join:
cross.join <- merge(x = df1, y = df2, by = NULL)

library(MASS)
merged.Pima <- merge(x=Pima.te, y=Pima.tr,
by.x=c("bp", "bmi"),
by.y=c("bp", "bmi")
)
print(merged.Pima)
nrow(merged.Pima)

# Melting and Casting


library(MASS)
print(ships)
install.packages("reshape")

library(reshape)
# Melt the Data
molten.ships <- melt(ships, id = c("type","year"))
print(molten.ships)

# Cast the Molten Data


recasted.ship <- cast(molten.ships, type+year~variable,sum)
print(recasted.ship)
=================================================

Basically, you "melt" data so that each row is a unique id-variable combination.
Then you "cast" the melted data into any shape you would like. Here is a very
simple example.

mydata

id time x1 x2
1 1 5 6
1 2 3 5
2 1 6 1
2 2 2 4

# example of melt function


library(reshape)
mdata <- melt(mydata, id=c("id","time"))

mdata

id time variable value


1 1 x1 5
1 2 x1 3
2 1 x1 6
2 2 x1 2
1 1 x2 6
1 2 x2 5
2 1 x2 1
2 2 x2 4

# cast the melted data


# cast(data, formula, function)
subjmeans <- cast(mdata, id~variable, mean)
timemeans <- cast(mdata, time~variable, mean)

subjmeans

id x1 x2
1 4 5.5
2 4 2.5

timemeans

time x1 x2
1 5.5 3.5
2 2.5 4.5
There is much more that you can do with the melt( ) and cast( ) functions. See the
documentation for more details.

===================================================

An Introduction to reshape2
reshape2 is an R package written by Hadley Wickham that makes it easy to transform
data between wide and long formats.

What makes data wide or long?


Wide data has a column for each variable. For example, this is wide-format data:
# ozone wind temp
# 1 23.62 11.623 65.55
# 2 29.44 10.267 79.10
# 3 59.12 8.942 83.90
# 4 59.96 8.794 83.97

And this is long-format data:

# variable value
# 1 ozone 23.615
# 2 ozone 29.444
# 3 ozone 59.115
# 4 ozone 59.962
# 5 wind 11.623
# 6 wind 10.267
# 7 wind 8.942
# 8 wind 8.794
# 9 temp 65.548
# 10 temp 79.100
# 11 temp 83.903
# 12 temp 83.968

Long-format data has a column for possible variable types and a column for the
values of those variables. Long-format data isn’t necessarily only two columns. For
example, we might have ozone measurements for each day of the year. In that case,
we could have another column for day. In other words, there are different levels of
“longness”. The ultimate shape you want to get your data into will depend on what
you are doing with it.

It turns out that you need wide-format data for some types of data analysis and
long-format data for others. In reality, you need long-format data much more
commonly than wide-format data. For example, ggplot2 requires long-format data
(technically tidy data), plyr requires long-format data, and most modelling
functions (such as lm(), glm(), and gam()) require long-format data. But people
often find it easier to record their data in wide format.

The reshape2 package

reshape2 is based around two key functions: melt and cast:

melt takes wide-format data and melts it into long-format data.

cast takes long-format data and casts it into wide-format data.

Think of working with metal: if you melt metal, it drips and becomes long. If you
cast it into a mould, it becomes wide.

Wide- to long-format data: the melt function


For this example we’ll work with the airquality dataset that is built into R. First
we’ll change the column names to lower case to make them easier to work with. Then
we’ll look at the data:

names(airquality) <- tolower(names(airquality))


head(airquality)

# ozone solar.r wind temp month day


# 1 41 190 7.4 67 5 1
# 2 36 118 8.0 72 5 2
# 3 12 149 12.6 74 5 3
# 4 18 313 11.5 62 5 4
# 5 NA NA 14.3 56 5 5
# 6 28 NA 14.9 66 5 6

What happens if we run the function melt with all the default argument values?

aql <- melt(airquality) # [a]ir [q]uality [l]ong format


head(aql)
# variable value
# 1 ozone 41
# 2 ozone 36
# 3 ozone 12
# 4 ozone 18
# 5 ozone NA
# 6 ozone 28

tail(aql)
# variable value
# 913 day 25
# 914 day 26
# 915 day 27
# 916 day 28
# 917 day 29
# 918 day 30

By default, melt has assumed that all columns with numeric values are variables
with values. Often this is what you want. Maybe here we want to know the values of
ozone, solar.r, wind, and temp for each month and day. We can do that with melt by
telling it that we want month and day to be “ID variables”. ID variables are the
variables that identify individual rows of data.

aql <- melt(airquality, id.vars = c("month", "day"))


head(aql)
# month day variable value
# 1 5 1 ozone 41
# 2 5 2 ozone 36
# 3 5 3 ozone 12
# 4 5 4 ozone 18
# 5 5 5 ozone NA
# 6 5 6 ozone 28

What if we wanted to control the column names in our long-format data? melt lets us
set those too all in one step:

aql <- melt(airquality, id.vars = c("month", "day"),


variable.name = "climate_variable",
value.name = "climate_value")

head(aql)
# month day climate_variable climate_value
# 1 5 1 ozone 41
# 2 5 2 ozone 36
# 3 5 3 ozone 12
# 4 5 4 ozone 18
# 5 5 5 ozone NA
# 6 5 6 ozone 28

Long- to wide-format data: the cast functions


Whereas going from wide- to long-format data is pretty straightforward, going from
long- to wide-format data can take a bit more thought. It usually involves some
head scratching and some trial and error for all but the simplest cases. Let’s go
through some examples.

In reshape2 there are multiple cast functions. Since you will most commonly work
with data.frame objects, we’ll explore the dcast function. (There is also acast to
return a vector, matrix, or array.)

Let’s take the long-format airquality data and cast it into some different wide
formats. To start with, we’ll recover the same format we started with and compare
the two.

dcast uses a formula to describe the shape of the data. The arguments on the left
refer to the ID variables and the arguments on the right refer to the measured
variables. Coming up with the right formula can take some trial and error at first.
So, if you’re stuck don’t feel bad about just experimenting with formulas. There
are usually only so many ways you can write the formula.

Here, we need to tell dcast that month and day are the ID variables (we want a
column for each) and that variable describes the measured variables. Since there is
only one remaining column, dcast will figure out that it contains the values
themselves. We could explicitly declare this with value.var. (And in some cases it
will be necessary to do so.)

aql <- melt(airquality, id.vars = c("month", "day"))


aqw <- dcast(aql, month + day ~ variable)
head(aqw)
# month day ozone solar.r wind temp
# 1 5 1 41 190 7.4 67
# 2 5 2 36 118 8.0 72
# 3 5 3 12 149 12.6 74
# 4 5 4 18 313 11.5 62
# 5 5 5 NA NA 14.3 56
# 6 5 6 28 NA 14.9 66
head(airquality) # original data
# ozone solar.r wind temp month day
# 1 41 190 7.4 67 5 1
# 2 36 118 8.0 72 5 2
# 3 12 149 12.6 74 5 3
# 4 18 313 11.5 62 5 4
# 5 NA NA 14.3 56 5 5
# 6 28 NA 14.9 66 5 6
So, besides re-arranging the columns, we’ve recovered our original data.

If it isn’t clear to you what just happened there, then have a look at this
illustration:

Figure 1: An illustration of the dcast function. The blue shading indicates ID


variables that we want to represent individual rows. The red shading represents
variable names that we want to swing into column names. The grey shading represents
the data values that we want to fill in the cells with.

One confusing “mistake” you might make is casting a dataset in which there is more
than one value per data cell. For example, this time we won’t include day as an ID
variable:

dcast(aql, month ~ variable)


# month ozone solar.r wind temp
# 1 5 31 31 31 31
# 2 6 30 30 30 30
# 3 7 31 31 31 31
# 4 8 31 31 31 31
# 5 9 30 30 30 30
When you run this in R, you’ll notice the warning message:

# Aggregation function missing: defaulting to length


And if you look at the output, the cells are filled with the number of data rows
for each month-climate combination. The numbers we’re seeing are the number of days
recorded in each month. When you cast your data and there are multiple values per
cell, you also need to tell dcast how to aggregate the data. For example, maybe you
want to take the mean, or the median, or the sum. Let’s try the last example, but
this time we’ll take the mean of the climate values. We’ll also pass the option
na.rm = TRUE through the ... argument to remove NA values. (The ... lets you pass
on additional arguments to your fun.aggregate function, here mean.)

dcast(aql, month ~ variable, fun.aggregate = mean,


na.rm = TRUE)
# month ozone solar.r wind temp
# 1 5 23.62 181.3 11.623 65.55
# 2 6 29.44 190.2 10.267 79.10
# 3 7 59.12 216.5 8.942 83.90
# 4 8 59.96 171.9 8.794 83.97
# 5 9 31.45 167.4 10.180 76.90
Unlike melt, there are some other fancy things you can do with dcast that I’m not
covering here. It’s worth reading the help file ?dcast. For example, you can
compute summaries for rows and columns, subset the columns, and fill in missing
cells in one call to dcast.

Additional help
Read the package help: help(package = "reshape2")

#================================================================================
Data Input and Output with R
#================================================================================

#--------------------------------------------------------------------------------
# CSV Files
#--------------------------------------------------------------------------------

#-------------------------------------------------------------------------------

#Please copy the below data and paste it in notepad, and save it as employee.csv

EMPNO,ENAME,JOB,MGR,SAL,COMM,DEPTNO
7369,SMITH,CLERK,7902,800,,20
7499,ALLEN,SALESMAN,7698,1600,300,30
7521,WARD,SALESMAN,7698,1250,500,30
7566,JONES,MANAGER,7839,2975,,20
7654,MARTIN,SALESMAN,7698,1250,1400,30
7698,BLAKE,MANAGER,7839,2850,,30
7782,CLARK,MANAGER,7839,2450,,10
7788,SCOTT,ANALYST,7566,3000,,20
7839,KING,PRESIDENT,,5000,,10
7844,TURNER,SALESMAN,7698,1500,0,30
7876,ADAMS,CLERK,7788,1100,,20
7900,JAMES,CLERK,7698,950,,30
7902,FORD,ANALYST,7566,3000,,20
7934,MILLER,CLERK,7782,1300,,10

# Read every CSV file in the working directory into its own data frame.
# The pattern is a regular expression, so the dot is escaped and the match is
# anchored to the end of the file name.
files <- list.files(pattern = "\\.csv$")
# seq_along() skips the loop entirely when no CSV file is present.
for (i in seq_along(files)) {
  filename <- files[i]
  data <- read.csv(file = filename, header = TRUE)
  # Name the data frame after the file without its extension, so that
  # employee.csv becomes `employee`, the name used below.
  assign(x = tools::file_path_sans_ext(filename), value = data)
}

# Total salary per job. The column names in the formula are looked up in
# `employee`, so the result columns are called JOB and SAL.
group.by <- aggregate(SAL ~ JOB, data = employee, FUN = sum)

print(group.by)

CSV stands for comma-separated values, and it's one of the most common ways we'll
be working with data throughout this course. The basic format of a csv file is the
first line indicating the column names and the rest of the rows/lines being data
points separated by commas. One of the most basic ways to read in csv files in R is
to use read.csv(), which is built in to R. Later on we'll learn about fread, which
will be a bit faster and more convenient, but it's important to understand all your
options!
When using read.csv() you'll need to either pass in the entire path of the file or
have the file be in the same directory as your R script (more precisely, in the
current working directory). Spaces in the path are fine inside the quotes, but on
Windows each backslash in the path must be doubled (\\) or replaced by a forward
slash. This is often a point of confusion for people new to programming, so make
sure you understand the above before continuing!

read.csv(file, header = TRUE, sep = ",", quote = "\"",


dec = ".", fill = TRUE, comment.char = "", ...)

read.csv2(file, header = TRUE, sep = ";", quote = "\"",


dec = ",", fill = TRUE, comment.char = "", ...)

read.delim(file, header = TRUE, sep = "\t", quote = "\"",


dec = ".", fill = TRUE, comment.char = "", ...)

read.delim2(file, header = TRUE, sep = "\t", quote = "\"",


dec = ",", fill = TRUE, comment.char = "", ...)

#R Read csv File from Current Working Directory


# Locate the Current Working Directory
getwd()
setwd("D:\\R Programming\\Data")
mydata <- read.csv("employee.csv")
print(mydata)

# If you don't have the names of the variables in the first row, pass
# header = FALSE so the first line is read as data. The columns are then
# named V1, V2, ...
employee <- read.csv("employee_csv.csv", header=FALSE)
print(employee)

# If you want to set any value to a missing value


employee <- read.csv("employee_csv.csv", header=TRUE, na.strings=".")
print(employee)

# If you want to set multiple values to missing values


employee <- read.csv("employee_csv.csv", header=TRUE, na.strings= c("A" , "B" ))
print(employee)
# Accessing all the Elements (Rows) Present in the 5th Column (i.e., SAL)
# Index Values: 1 = empno, 2 = ename, 3 = job, 4 = mgr, 5 = Salary, 6 = comm and
# 7 = deptno
employee[[5]]

# Accessing all the Elements (Rows) Present in the JOB (Column)


employee$JOB

# Accessing Element at 4th Row and 3rd Column


employee[4, 3]

# Accessing Item at 1st, 2nd 4th Rows and 4th, 5th, 6th, 7th Columns
employee[c(1, 2, 4), c(4:7)]

# It returns the Maximum Value within the SAL Column


maximum.salary <- max(employee$SAL)
print(maximum.salary)

# It returns the Minimum Value within the SAL Column


minimum.sales <- min(employee$SAL)
print(minimum.sales)

# It will calculate and returns the SAL Column Mean Value


mean.sales <- mean(employee$SAL)
print(mean.sales)

# It returns all the records, whose JOB is equal to SALESMAN


subdata <- subset(employee, JOB == "SALESMAN")
print(subdata)

# It returns all the records, whose JOB is equal to SALESMAN and SAL > 1000
partialdata <- subset(employee, JOB == "SALESMAN" & SAL > 1000 )
print(partialdata)

print(employee)
typeof(employee)
class(employee)
names(employee)
length(employee)
nrow(employee)
ncol(employee)
dim(employee)
str(employee)
summary(employee)

stringsAsFactors in the R read.csv function :

If your csv file contains both character and numeric variables, then in R
versions before 4.0.0 the character variables are automatically converted to the
factor type. To prevent this automatic conversion, specify
stringsAsFactors = FALSE explicitly. Since R 4.0.0 the default is already FALSE,
so pass stringsAsFactors = TRUE if you want factors.

# Locate the Current Working Directory


getwd()
employee <- read.csv("employee_csv.csv", TRUE, sep = ",", stringsAsFactors = FALSE)
str(employee)

employee <- read.csv("employee_csv.csv", TRUE, sep = ",", stringsAsFactors = TRUE)


str(employee)
=================================================================================
R Read table Function

The read.table function is very useful to import the data from text files from file
system & URLs, and store the data in a Data Frame. In this article we will show
you, How to use this R read table function, how to manipulate the data in R
Programming with example

read.table(file, header = FALSE, sep = "", quote = "\"'",


dec = ".", numerals = c("allow.loss", "warn.loss", "no.loss"),
row.names, col.names, as.is = !stringsAsFactors,
na.strings = "NA", colClasses = NA, nrows = -1,
skip = 0, check.names = TRUE, fill = !blank.lines.skip,
strip.white = FALSE, blank.lines.skip = TRUE,
comment.char = "#",
allowEscapes = FALSE, flush = FALSE,
stringsAsFactors = default.stringsAsFactors(),
fileEncoding = "", encoding = "unknown", text, skipNul = FALSE)

Arguments
1. file
the name of the file which the data are to be read from. Each row of the table
appears as one line of the file. If it does not contain an absolute path, the file
name is relative to the current working directory, getwd(). Tilde-expansion is
performed where supported. This can be a compressed file (see file).
Alternatively, file can be a readable text-mode connection (which will be opened
for reading if necessary, and if so closed (and hence destroyed) at the end of the
function call). (If stdin() is used, the prompts for lines may be somewhat
confusing. Terminate input with a blank line or an EOF signal, Ctrl-D on Unix and
Ctrl-Z on Windows. Any pushback on stdin() will be cleared before return.)
file can also be a complete URL. (For the supported URL schemes, see the ‘URLs’
section of the help for url.)
2. header
a logical value indicating whether the file contains the names of the variables as
its first line. If missing, the value is determined from the file format: header is
set to TRUE if and only if the first row contains one fewer field than the number
of columns.
3. sep
the field separator character. Values on each line of the file are separated by
this character. If sep = "" (the default for read.table) the separator is ‘white
space’, that is one or more spaces, tabs, newlines or carriage returns.
4. quote
the set of quoting characters. To disable quoting altogether, use quote = "". See
scan for the behaviour on quotes embedded in quotes. Quoting is only considered for
columns read as character, which is all of them unless colClasses is specified.
5. dec
the character used in the file for decimal points.
6. numerals
string indicating how to convert numbers whose conversion to double precision would
lose accuracy, see type.convert. Can be abbreviated.
7. row.names
a vector of row names. This can be a vector giving the actual row names, or a
single number giving the column of the table which contains the row names, or
character string giving the name of the table column containing the row names.
If there is a header and the first row contains one fewer field than the number of
columns, the first column in the input is used for the row names. Otherwise if
row.names is missing, the rows are numbered.
Using row.names = NULL forces row numbering. Missing or NULL row.names generate row
names that are considered to be ‘automatic’ (and not preserved by as.matrix).
8. col.names
a vector of optional names for the variables. The default is to use "V" followed by
the column number.
9. as.is
the default behavior of read.table is to convert character variables (which are not
converted to logical, numeric or complex) to factors. The variable as.is controls
the conversion of columns not otherwise specified by colClasses. Its value is
either a vector of logicals (values are recycled if necessary), or a vector of
numeric or character indices which specify which columns should not be converted to
factors.
Note: to suppress all conversions including those of numeric columns, set
colClasses = "character".
Note that as.is is specified per column (not per variable) and so includes the
column of row names (if any) and any columns to be skipped.
10. na.strings
a character vector of strings which are to be interpreted as NA values. Blank
fields are also considered to be missing values in logical, integer, numeric and
complex fields.
11. colClasses
character. A vector of classes to be assumed for the columns. Recycled as
necessary. If named and shorter than required, names are matched to the column
names with unspecified values are taken to be NA.
Possible values are NA (the default, when type.convert is used), "NULL" (when the
column is skipped), one of the atomic vector classes (logical, integer, numeric,
complex, character, raw), or "factor", "Date" or "POSIXct". Otherwise there needs
to be an as method (from package methods) for conversion from "character" to the
specified formal class.
Note that colClasses is specified per column (not per variable) and so includes the
column of row names (if any).
12. nrows
integer: the maximum number of rows to read in. Negative and other invalid values
are ignored.
13. skip
integer: the number of lines of the data file to skip before beginning to read
data.
14. check.names
logical. If TRUE then the names of the variables in the data frame are checked to
ensure that they are syntactically valid variable names. If necessary they are
adjusted (by make.names) so that they are, and also to ensure that there are no
duplicates.
15. fill
logical. If TRUE then in case the rows have unequal length, blank fields are
implicitly added. See ‘Details’.
17. strip.white
logical. Used only when sep has been specified, and allows the stripping of leading
and trailing white space from unquoted character fields (numeric fields are always
stripped). See scan for further details (including the exact meaning of ‘white
space’), remembering that the columns may include the row names.
18. blank.lines.skip
logical: if TRUE blank lines in the input are ignored.
19. comment.char
character: a character vector of length one containing a single character or an
empty string. Use "" to turn off the interpretation of comments altogether.
20. allowEscapes
logical. Should C-style escapes such as \n be processed or read verbatim (the
default)? Note that if not within quotes these could be interpreted as a delimiter
(but not as a comment character). For more details see scan.
21. flush
logical: if TRUE, scan will flush to the end of the line after reading the last of
the fields requested. This allows putting comments after the last field.
22. stringsAsFactors
logical: should character vectors be converted to factors? Note that this is
overridden by as.is and colClasses, both of which allow finer control.
23. fileEncoding
character string: if non-empty declares the encoding used on a file (not a
connection) so the character data can be re-encoded. See the ‘Encoding’ section of
the help for file, the ‘R Data Import/Export Manual’ and ‘Note’.
24. encoding
encoding to be assumed for input strings. It is used to mark character strings as
known to be in Latin-1 or UTF-8 (see Encoding): it is not used to re-encode the
input, but allows R to handle encoded strings in their native encoding (if one of
those two). See ‘Value’ and ‘Note’.
25. text
character string: if file is not supplied and this is, then data are read from the
value of text via a text connection. Notice that a literal string can be used to
include (small) data sets within R code.
26. skipNul
logical: should nuls be skipped?
...
Further arguments to be passed to read.table.

# R Read table function to read Text File from Current Working Directory
# To Locate the Current Working Directory
# Report the current working directory, then switch to the folder holding the data.
getwd()
setwd("D:\\R Programming\\Data")

# Parse the comma-separated text file, taking the column names from its first line.
Company.employees <- read.table(
  file = "employee.txt",
  header = TRUE,
  sep = ","
)
print(Company.employees)

R Read table Function – testing arguments


# R Read table function - Testing argument
# To Locate the Current Working Directory
# Report the current working directory, then switch to the folder holding the data.
getwd()
setwd("D:\\R Programming\\Data")

# Import the comma-separated file while exercising several read.table arguments:
#   quote            - only the double quote is treated as a quoting character
#   na.strings       - character vector of markers to read as NA. The logical TRUE
#                      used previously is not a valid NA marker (at best it is read
#                      as the text "TRUE"), so the standard "NA" marker is used.
#   strip.white      - trim leading/trailing blanks from unquoted character fields
#   comment.char     - lines (or line tails) starting with "$" are comments
#   blank.lines.skip - ignore empty lines in the input
employees <- read.table(
  "employee_update.txt",
  header = TRUE,
  sep = ",",
  quote = "\"",
  na.strings = "NA",
  strip.white = TRUE,
  comment.char = "$",
  blank.lines.skip = TRUE
)

print(employees)

In this example we will show you, How to read NA records, escape the blank lines,
and comment lines while reading data from the text file.

# allowEscapes: A Boolean value that indicates whether escapes (such as \n for a
# new line) should be processed or read verbatim.
# strip.white: If the sep argument is not equal to "" then you may use this Boolean
# value to trim the extra leading & trailing white space from character fields.
# comment.char: If there are any comment lines in your text file then you can use
# this argument to ignore those lines. Here, you have to give the single special
# character that marks a comment. For example, if your text file contains comments
# starting with $ then use comment.char = "$" to skip those comment lines while
# reading.
# blank.lines.skip: A Boolean value that specifies whether you want to skip/ignore
# the blank lines or not.
# na.strings: A character vector specifying values that should be read as NA.

# R Read table function - Testing argument


# To Locate the Current Working Directory
# Report the working directory that the relative file path is resolved against.
getwd()

# Column names to impose on the imported data (these replace the file's header).
employeeNames <- c(
  "Employee_ID", "First Name", "Last Name", "Education",
  "Profession", "Salary", "Sales"
)

# Import the file, skipping its first 3 lines before reading starts.
#   na.strings - must be a character vector of NA markers; the logical TRUE used
#                previously is not one, so the standard "NA" marker is used.
#   as.is      - one flag per column (7 columns). TRUE keeps a character column
#                as is, FALSE converts it to a factor. The original 5-element
#                vector relied on recycling; the recycled values are spelled out
#                here so the vector visibly matches the column count.
employees <- read.table(
  "EmployeeSales.txt",
  header = TRUE,
  sep = ",",
  quote = "\"",
  na.strings = "NA",
  strip.white = TRUE,
  skip = 3,
  as.is = c(TRUE, TRUE, FALSE, FALSE, TRUE, TRUE, TRUE),
  col.names = employeeNames,
  comment.char = "$",
  blank.lines.skip = TRUE
)
print(employees)

# str() prints the structure itself and returns NULL, so it is not wrapped in
# print() (which would only add a stray "NULL" line).
str(employees)

# col.names: A character vector that contains the column names for the returned
# data frame.
# as.is: A logical vector with one entry per column. It controls whether character
# columns are kept as they are (TRUE) or converted to factors (FALSE). For example,
# with two columns (FirstName, Occupation) and as.is = c(TRUE, FALSE), FirstName
# stays character (not an implicit factor) and Occupation becomes a factor.
# skip: The number of lines to skip at the top of the text file before beginning
# to read data. For example, to skip the first 3 lines, use skip = 3.

#--------------------------------------------------------------------------------
# EXCEL Files
#--------------------------------------------------------------------------------

#-------------------------------------------------------------------------------
# In case you don't have readxl (you may not need to specify repos)
install.packages('readxl',repos="http://cran.rstudio.com/")

# Load the readxl package


library(readxl)

# To Locate the Current Working Directory


getwd()
setwd("D:\\R Programming\\Data")

# list the sheets of the excel file


# Show the names of the worksheets contained in the workbook.
excel_sheets(path = "Sample-Sales-Data.xlsx")

# Import the worksheet called "Sheet1" and take a first look at it.
df <- read_excel(path = "Sample-Sales-Data.xlsx", sheet = "Sheet1")
head(df)

# Total of the Value column, followed by structure and summary statistics.
sum(df["Value"])
str(df)
summary(df)

# If you had multiple sheets that you wanted to import into a list, you could do
# this with lapply():

# Names of all worksheets in the workbook.
excel_sheets(path = "Sample-Superstore.xls")

# Import every worksheet: one list element per sheet, in sheet order.
entire_workbook <- lapply(
  excel_sheets("Sample-Superstore.xls"),
  function(sheet_name) {
    read_excel(path = "Sample-Superstore.xls", sheet = sheet_name)
  }
)

# Show the whole list.
print(entire_workbook)

Question : How to read first sheet & second sheet attribute.


#--------------------------------------------------------------------------------
# XML Files
#--------------------------------------------------------------------------------
#--------------------------------------------------------------------------------

# You can read an XML file in R using the "XML" package. This package can be
# installed using the following command.
install.packages("XML")

# Load the package required to read XML files.


library("XML")

# To Locate the Current Working Directory


getwd()
setwd("D:\\R Programming\\Data")

# Also load the other required package.


library("methods")

# Give the input file name to the function.


# Parse the XML file into an in-memory document.
result <- xmlParse(file = "input.xml")

# Display the parsed document.
print(result)

# Extract the root node from the parsed document.
rootnode <- xmlRoot(result)

# Number of nodes in the root, stored and then displayed.
rootsize <- xmlSize(rootnode)
print(rootsize)

# First element of the first node.
print(rootnode[[1]][[1]])

# Fifth element of the first node.
print(rootnode[[1]][[5]])

# Second element of the third node.
print(rootnode[[3]][[2]])

# Read the same XML file straight into a data frame and display it.
xmldataframe <- xmlToDataFrame("input.xml")
print(xmldataframe)

#--------------------------------------------------------------------------------
# JSON Files
#--------------------------------------------------------------------------------
#--------------------------------------------------------------------------------
# In the R console, you can issue the following commands to install the rjson and
# jsonlite packages.
install.packages("rjson")
install.packages("jsonlite")

# Load the package required to read JSON files.


library("jsonlite")
library("rjson")

# To Locate the Current Working Directory


getwd()
setwd("D:\\R Programming\\Data")

# Give the input file name to the function.


# Both rjson and jsonlite export fromJSON(). rjson is attached last above, so an
# unqualified call would reach rjson::fromJSON(), whose first argument is a JSON
# string rather than a file path. The jsonlite version is therefore called
# explicitly.
# The notes below describe this file as NDJSON, so this single-document parse is
# expected to fail; the error is reported as a message and `result` is set to
# NULL so that a sourced script can carry on to the stream_in() approach.
result <- tryCatch(
  jsonlite::fromJSON("artistsen.json"),
  error = function(err) {
    message("fromJSON() could not read artistsen.json: ", conditionMessage(err))
    NULL
  }
)

# The call above fails because this JSON file turned out to be 'NDJSON' (newline
# delimited JSON), which means there are multiple JSON values inside this file and
# each of the JSON values is considered an independent object. In this particular
# case, each record makes up one single JSON value, therefore there are many JSON
# values inside this JSON file. This format is often used in data streaming
# situations, where each JSON value can be separated from the other parts of the
# file so that it can be processed without waiting for the whole document to load.
# 'jsonlite' has a function to deal with this 'NDJSON' file type, stream_in(), so
# we can use it instead, as shown below.

# Read the artists file record by record (one JSON value per line).
result <- stream_in(con = file("artistsen.json"))

# Turn the streamed records into a data frame and inspect it.
json_data_frame <- as.data.frame(result)
str(json_data_frame)
head(json_data_frame)
print(json_data_frame)

# Same procedure for the movies file.
result <- stream_in(con = file("movies_en.json"))

# Turn the streamed records into a data frame; only the first row is previewed.
json_data_frame <- as.data.frame(result)
str(json_data_frame)
head(json_data_frame, 1)
print(json_data_frame)

#--------------------------------------------------------------------------------
# Database
#--------------------------------------------------------------------------------
#--------------------------------------------------------------------------------

The RODBC package provides access to databases (including Microsoft Access and
Microsoft SQL Server) through an ODBC interface.

The primary functions are given below.

Function Description
odbcConnect(dsn, uid="", pwd="") Open a connection to an ODBC database
sqlFetch(channel, sqtable) Read a table from an ODBC database into a data frame
sqlQuery(channel, query) Submit a query to an ODBC database and return the
results
sqlSave(channel, mydf, tablename = sqtable, append = FALSE) Write or update
(append = TRUE) a data frame to a table in the ODBC database
sqlDrop(channel, sqtable) Remove a table from the ODBC database
close(channel) Close the connection

# RODBC Example
# import 2 tables (Crime and Punishment) from a DBMS
# into R data frames (and call them crimedat and pundat)

library(RODBC)

# Open the ODBC channel for the data source "mydsn".
myconn <- odbcConnect(dsn = "mydsn", uid = "Rob", pwd = "aardvark")

# Pull the whole "Crime" table, then the "Punishment" table via an SQL query.
crimedat <- sqlFetch(channel = myconn, sqtable = "Crime")
pundat <- sqlQuery(channel = myconn, query = "select * from Punishment")

# Close the channel once both tables are in R.
close(myconn)

# "RMySQL" is an add-on package (it is not built into R) that provides native
# connectivity between R and a MySQL database. You can install this package in the
# R environment using the following command.

install.packages("RMySQL")

# Create a connection Object to MySQL database.


# The original example connected to the sample database named "sakila" that comes
# with a MySQL installation; the connection below uses the "retail_db" database.

library("RMySQL")

# Open a connection to the "retail_db" database.
# NOTE(review): user, password and host are hard-coded exactly as in the original
# notes; move them to environment variables or a config file before reusing this.
mysqlconnection <- dbConnect(
  MySQL(),
  user = "retail_dba",
  password = "cloudera",
  dbname = "retail_db",
  host = "192.168.153.129"
)

# List the tables available in this database.
dbListTables(mysqlconnection)

# Query the "order_items" table and store the first 5 rows (n = 5) in a data
# frame. The data frame no longer uses the name `data.frame`, which shadowed the
# base function and led to the mistyped print(data.fame) call.
result <- dbSendQuery(mysqlconnection, "select * from order_items")
order_items_df <- fetch(result, n = 5)
print(order_items_df)
# Release the result set before sending the next query on the same connection.
dbClearResult(result)

# Fetch all matching product records (n = -1) and store them as a data frame.
result <- dbSendQuery(mysqlconnection, "select * from products where product_id <


20;")
products_df <- fetch(result, n = -1)
# Previously print(data) was called here, which prints the base function `data`.
print(products_df)
dbClearResult(result)

# Statements that return no rows: run each one and release its result at once.
dbClearResult(
  dbSendQuery(mysqlconnection, "update mtcars set disp = 168.5 where hp = 110")
)

dbClearResult(
  dbSendQuery(
    mysqlconnection,
    "insert into mtcars(row_names, mpg, cyl, disp, hp, drat, wt, qsec, vs, am,
gear, carb)
values('New Mazda RX4 Wag', 21, 6, 168.5, 110, 3.9, 2.875, 17.02, 0, 1, 4, 4)"
  )
)

dbClearResult(
  dbSendQuery(mysqlconnection, 'drop table if exists mtcars')
)

# Close the connection now that the example is finished.
dbDisconnect(mysqlconnection)

#================================================================================
Advanced Programming with R
#================================================================================

Do Faster Data Manipulation using These 7 R Packages

Introduction

Data Manipulation is an inevitable phase of predictive modeling. A robust
predictive model cannot be built using machine learning algorithms alone; it also
requires an approach of understanding the business problem and the underlying data,
performing the required data manipulations and then extracting business insights.

Among these several phases of model building, most of the time is usually spent in
understanding underlying data and performing required manipulations. This would
also be the focus of this article – packages to perform faster data manipulation in
R.

What is Data Manipulation ?

If you are still confused by this term, let me explain it to you. ‘Data
Manipulation’ is a term often used loosely alongside ‘Data Exploration’. It
involves ‘manipulating’ data using the available set of variables. This is done to
enhance the accuracy and precision associated with the data.

Actually, the data collection process can have many loopholes. There are various
uncontrollable factors which lead to inaccuracy in data such as mental situation of
respondents, personal biases, difference / error in readings of machines etc. To
mitigate these inaccuracies, data manipulation is done to increase the possible
(highest) accuracy in data.

At times, this stage is also known as data wrangling or data cleaning.

Different Ways to Manipulate / Treat Data:

There is no right or wrong way in manipulating data, as long as you understand the
data and have taken the necessary actions by the end of the exercise. However, here
are a few broad ways in which people try and approach data manipulation. Here are
they:

Usually, beginners on R find themselves comfortable manipulating data using inbuilt


base R functions. This is a good first step, but is often repetitive and time
consuming. Hence, it is a less efficient way to solve the problem.
Use of packages for data manipulation. CRAN has more than 7000 packages available
today. In simple words, these packages are nothing but collections of pre-written,
commonly used pieces of code. They help you perform repetitive tasks faster,
reduce errors in coding and take advantage of code written by experts (across the
open source eco-system for R) to make your code more efficient. This is usually the
most common way of performing data manipulation.
Use of ML algorithms for data manipulation. You can use tree based boosting
algorithms to take care of missing data & outliers. While these are definitely less
time consuming, these approaches typically leave you wanting for a better
understanding of data at the end of it.
Hence, more often than not, use of packages is the de-facto method to perform data
manipulation. In this article, I have explained several packages which make ‘R’
life easier during the data manipulation stage.

Note: This article is best suited for beginners in R Language. You can install a
packages using:

install.packages('package name')

List of Packages

For better understanding, I’ve also demonstrated their usage by undertaking


commonly used operations. Below is the list of packages discussed in this article:

dplyr
data.table
ggplot2
reshape2
readr
tidyr
lubridate

Note: I understand ggplot2 is a graphical package. But, it generally helps in


visualizing data ( distributions, correlations) and making manipulations
accordingly. Hence, I’ve added it in this list. In all packages, I’ve covered only
the most commonly used commands in data manipulation.

dplyr Package

This package is created and maintained by Hadley Wickham. It has (almost)
everything you need to accelerate your data manipulation efforts. It is best known
for data exploration and transformation. Its chaining syntax makes it highly
convenient to use. It includes 5 major data manipulation commands:

filter – It filters the data based on a condition


select – It is used to select columns of interest from a data set
arrange – It is used to arrange data set values on ascending or descending order
mutate – It is used to create new variables from existing variables
summarise (with group_by) – It is used to perform analysis with commonly used
operations such as min, max, mean, count etc.
Simply focus on these commands and you will do great in data exploration. Let’s
understand these commands one by one. I have used 2 pre-installed R data sets,
namely mtcars and iris.

library(dplyr)
# Load the two built-in example data sets into the workspace.
data(list = c("mtcars", "iris"))

# Work on a copy of mtcars and preview its first rows.
mydata <- mtcars
head(mydata)

# creating a local dataframe. Local data frame are easier to read


# data.frame() is R's function for creating regular data frames.
# data_frame() is dplyr's function for creating local data frames.
# tbl_df() and as_data_frame() are dplyr's functions for converting a regular data
frame (or a list) into a local data frame.
# So, what is the difference between regular and local data frames? Very little. A
local data frame is just a regular data frame that has been wrapped with the tbl_df
class for nicer printing. (The data is still stored in a regular data frame "under
the hood".)
# Specifically, printing a local data frame only shows the first 10 rows, and as
many columns as can fit on your screen. (You can see an example of this behavior at
the top of the RMarkdown document from my first dplyr video tutorial, which
precedes the tutorial linked above).

# Convert the regular data frames to tibbles for compact printing.
# as_tibble() replaces tbl_df(), which is deprecated in current dplyr releases.
mynewdata <- as_tibble(mydata)
myirisdata <- as_tibble(iris)

# The data now prints in a compact tabular form.
mynewdata
myirisdata

# filter() keeps the rows that satisfy the given condition(s).
# Cars with more than 4 cylinders and more than 4 gears:
filter(mynewdata, cyl > 4 & gear > 4)

# Cars with more than 4 cylinders:
filter(mynewdata, cyl > 4)

# Flowers of the setosa or virginica species:
filter(myirisdata, Species %in% c("setosa", "virginica"))

#use select to pick columns by name


mynewdata %>% select(cyl, mpg, hp)

# A leading minus drops the named columns instead of keeping them.
mynewdata %>% select(-cyl, -mpg)

# Several columns can be dropped at once by negating a c() of names.
mynewdata %>% select(-c(cyl, mpg))

# first:last keeps the run of columns from cyl through gear.
mynewdata %>% select(cyl:gear)

#chaining or pipelining - a way to perform multiple operations


#in one line
mynewdata %>%
  select(cyl, wt, gear) %>%
  filter(wt > 2)

# arrange() reorders rows; here in ascending order of wt.
# (This pipeline appeared twice verbatim; the duplicate has been removed.)
mynewdata %>%
  select(cyl, wt, gear) %>%
  arrange(wt)

# Wrap the column in desc() to sort in descending order instead.
mynewdata %>%
  select(cyl, wt, gear) %>%
  arrange(desc(wt))

#mutate - create new variables

# Show mpg and cyl together with a derived column holding their product.
mynewdata %>%
  select(mpg, cyl) %>%
  mutate(newvariable = mpg * cyl)

# Keep the full table, with the derived column appended, for later use.
newvariable <- mutate(mynewdata, newvariable = mpg * cyl)

#summarise - this is used to find insights from data


# Average sepal length per species.
myirisdata %>%
  group_by(Species) %>%
  summarise(Average = mean(Sepal.Length, na.rm = TRUE))

# Mean and row count of both sepal measurements per species.
# across() replaces the deprecated summarise_each()/funs() pair. The result
# columns are named <column>_mean and <column>_n, grouped by column.
myirisdata %>%
  group_by(Species) %>%
  summarise(
    across(c(Sepal.Length, Sepal.Width), list(mean = mean, n = length))
  )

# rename() changes column names using new_name = old_name.
mynewdata %>% rename(miles = mpg)
data.table Package

This package allows you to perform faster manipulation in a data set. Leave your
traditional ways of sub setting rows and columns and use this package. With minimum
coding, you can do much more. Using data.table helps in reducing computing time as
compared to data.frame. You’ll be astonished by the simplicity of this package.

A data table has 3 parts namely DT[i,j,by]. You can understand this as, we can tell
R to subset the rows using ‘i’, to calculate ‘j’ which is grouped by ‘by’. Most of
the times, ‘by’ relates to categorical variable. In the code below, I’ve used 2
data sets (airquality and iris).

#load data
# Load the airquality data set, keep a working copy and preview six rows.
data("airquality")
mydata <- airquality
head(airquality, n = 6)

# Same for iris.
data(iris)
myiris <- iris

# Attach data.table.
library(data.table)

# Convert both working copies to data.tables and print each of them.
mydata <- data.table(mydata)
mydata

myiris <- data.table(myiris)
myiris

#subset rows - select 2nd to 4th row

mydata[2:4, ]

# Row filter: keep the rows whose Species equals "setosa".
myiris[Species == "setosa"]

# Row filter on several values: rows whose Species is setosa or virginica.
myiris[Species %in% c("setosa", "virginica")]

# A single column, returned as a plain vector.
mydata[, Temp]

# Two columns, returned as a data.table.
mydata[, list(Temp, Month)]

# Sum of one column, ignoring missing values.
mydata[, sum(Ozone, na.rm = TRUE)]

# Sum and standard deviation of the same column in one call.
mydata[, list(sum(Ozone, na.rm = TRUE), sd(Ozone, na.rm = TRUE))]

# Several expressions in j: print one column, plot another, return NULL.
myiris[, {
  print(Sepal.Length)
  plot(Sepal.Width)
  NULL
}]

#grouping by a variable
myiris[, list(sepalsum = sum(Sepal.Length)), by = Species]

# Set Species as the key so rows can be looked up by key value.
setkey(myiris, Species)

# Look up all rows for one key value, then for two key values.
myiris["setosa"]
myiris[c("setosa", "virginica")]

reshape2 Package

As the name suggests, this package is useful in reshaping data. We all know the
data come in many forms. Hence, we are required to tame it according to our need.
Usually, the process of reshaping data in R is tedious and worrisome. R base
functions consist of ‘Aggregation’ option using which data can be reduced and
rearranged into smaller forms, but with reduction in amount of information.
Aggregation includes tapply, by and aggregate base functions. The reshape package
overcome these problems. Here we try to combine features which have unique values.
It has 2 functions namely melt and cast.

melt : This function converts data from wide format to long format. It’s a form of
restructuring where multiple categorical columns are ‘melted’ into unique rows.
Let’s understand it using the code below.

# Build a small wide-format data set.
# (Console prompts ">" were removed so the lines run as ordinary R code.)
ID <- c(1, 2, 3, 4, 5)
Names <- c('Joseph', 'Matrin', 'Joseph', 'James', 'Matrin')
DateofBirth <- c(1993, 1992, 1993, 1994, 1992)
Subject <- c('Maths', 'Biology', 'Science', 'Psycology', 'Physics')

thisdata <- data.frame(ID, Names, DateofBirth, Subject)
data.table(thisdata)

# Install (only needed once) and load reshape2.
install.packages('reshape2')
library(reshape2)

# melt: keep ID and Names as identifiers; every other column is stacked into
# variable/value pairs.
mt <- melt(thisdata, id = c('ID', 'Names'))
# Show the melted result (previously the unrelated mtcars data set was printed).
mt

cast : This function converts data from long format to wide format. It starts with
melted data and reshapes it into wide format. It is simply the reverse of the melt
function. It comes in two variants, namely dcast and acast: dcast returns a data
frame as output, while acast returns a vector/matrix/array. Let’s understand it
using the code below.

# cast: spread the melted data back to wide format.
# After melt(), DateofBirth and Subject are no longer columns; they are values of
# the `variable` column. The formula therefore keeps the id columns (ID, Names)
# on the left and spreads `variable` across the columns.
mcast <- dcast(mt, ID + Names ~ variable)
mcast

readr Package

As the name suggests, ‘readr’ helps in reading various forms of data into R, at up
to 10x faster speed. Here, characters are never converted to factors (so no more
stringsAsFactors = FALSE). This package can replace the traditional read.csv() and
read.table() base R functions. It helps in reading the following data:
Delimited files with read_delim(), read_csv(), read_tsv(), and read_csv2().
Fixed width files with read_fwf(), and read_table().
Web log files with read_log()
If the data loading time is more than 5 seconds, these functions will show you a
progress bar too. You can suppress the progress bar by setting it to FALSE. Let’s
look at the code below:

# Install (only needed once) and load readr, then read a CSV file whose first row
# holds the column names.
# (Console prompts ">" were removed so the lines run as ordinary R code.)
install.packages('readr')
library(readr)
read_csv('test.csv', col_names = TRUE)
You can also specify the data type of every column loaded in data using the code
below:

# Read iris.csv with an explicit type for every column.
# (The console prompt ">" was removed so the call runs as ordinary R code.)
read_csv(
  "iris.csv",
  col_types = list(
    Sepal.Length = col_double(),
    Sepal.Width = col_double(),
    Petal.Length = col_double(),
    Petal.Width = col_double(),
    Species = col_factor(c("setosa", "versicolor", "virginica"))
  )
)
However, if you choose to omit unimportant columns, it will take care of it
automatically. So, the code above can also be re-written as:

# Only Species is given an explicit type here; the other columns are left for
# readr to work out.
# The original call was missing its final closing parenthesis and carried a
# console prompt; both are fixed so the call parses.
read_csv(
  "iris.csv",
  col_types = list(
    Species = col_factor(c("setosa", "versicolor", "virginica"))
  )
)
P.S – readr has many helper functions. So, next when you write a csv file, use
write_csv instead. It’s a lot faster than write.csv.

tidyr Package

This package can make your data look ‘tidy’. It has 4 major functions to accomplish
this task. Needless to say, if you find yourself stuck in data exploration phase,
you can use them anytime (along with dplyr). This duo makes a formidable team. They
are easy to learn, code and implement. These 4 functions are:

gather() – It ‘gathers’ multiple columns and converts them into key:value pairs.
This function transforms the wide form of data into the long form. You can use it
as an alternative to ‘melt’ in the reshape package.
spread() – It does the reverse of gather. It takes a key:value pair and converts it
into separate columns.
separate() – It splits a column into multiple columns.
unite() – It does the reverse of separate. It unites multiple columns into a single
column.
Let’s understand it closely using the code below:

# Load tidyr.
# (Console prompts ">" were removed so the lines run as ordinary R code.)
library(tidyr)

# Create a dummy data set.
names <- c('A', 'B', 'C', 'D', 'E', 'A', 'B')
weight <- c(55, 49, 76, 71, 65, 44, 34)
age <- c(21, 20, 25, 29, 33, 32, 38)
Class <- c('Maths', 'Science', 'Social', 'Physics', 'Biology', 'Economics', 'Accounts')

# Create the data frame; the columns are named after the vectors.
tdata <- data.frame(names, age, weight, Class)
tdata

# gather(): stack the weight and Class columns into Key/Value pairs (wide to
# long). weight is numeric and Class is text, so the combined Value column is
# stored as text.
long_t <- tdata %>% gather(Key, Value, weight:Class)
long_t

Separate function comes best in use when we are provided a date time variable in
the data set. Since, the column contains multiple information, hence it makes sense
to split it and use those values individually. Using the code below, I have
separated a column into date, month and year.

# Create a data set with a combined date-time text column.
# (Console prompts ">" were removed so the lines run as ordinary R code.)
Humidity <- c(37.79, 42.34, 52.16, 44.57, 43.83, 44.59)
Rain <- c(
  0.971360441, 1.10969716, 1.064475853, 0.953183435, 0.98878849,
  0.939676146
)
# The fourth timestamp previously had a line break between date and time; it now
# uses a single space like the other values.
Time <- c(
  "27/01/2015 15:44", "23/02/2015 23:24", "31/03/2015 19:15",
  "20/01/2015 20:52", "23/02/2015 07:46", "31/01/2015 01:55"
)

# Build a data frame.
d_set <- data.frame(Humidity, Rain, Time)

# separate(): split Time at non-alphanumeric characters into day ("Date"), Month
# and Year. Each value also holds hour and minute pieces; extra = "drop" discards
# them explicitly instead of relying on a warning.
separate_d <- d_set %>%
  separate(Time, c('Date', 'Month', 'Year'), extra = "drop")
separate_d

# unite(): the reverse of separate; join the three columns back into one.
unite_d <- separate_d %>% unite(Time, c(Date, Month, Year), sep = "/")
unite_d

# spread(): the reverse of gather; turn the Key/Value pairs back into columns.
# (Console prompts ">" were removed so the lines run as ordinary R code.)
wide_t <- long_t %>% spread(Key, Value)
wide_t

Lubridate Package

The lubridate package reduces the pain of working with date-time variables in R.
The functions of this package offer a convenient way to parse dates and times. This
package is frequently used with data that contains time-based information. Here I
have covered three basic tasks accomplished using lubridate.

These are the update function, the duration functions and date extraction. As a
beginner, knowing these 3 would give you good enough expertise to deal with time
variables. R has inbuilt functions for handling dates too, but this is much faster
to work with. Let’s understand it using the code below:

# Install (only needed once) and load lubridate.
# Console prompts ">" were removed and the recorded console output was turned
# into comments so that the block runs as ordinary R code.
install.packages('lubridate')
library(lubridate)

# Current date and time.
now()

# Assign the current date and time to the variable n_time.
n_time <- now()

# update(): replace individual components of a date-time.
n_update <- update(n_time, year = 2013, month = 10)
n_update

# Add durations (days, weeks, years, hours, minutes, seconds).
# The commented values are the sample output from the original notes, recorded
# for d_time = "2015-12-11 13:24:54 IST"; what you see depends on when the code
# runs (and, for dyears(), possibly on the lubridate version).
d_time <- now()
d_time + ddays(1)
# [1] "2015-12-12 13:24:54 IST"

d_time + dweeks(2)
# [1] "2015-12-25 13:24:54 IST"   (the notes wrongly repeated the ddays(1) value)

d_time + dyears(3)
# [1] "2018-12-10 13:24:54 IST"

d_time + dhours(2)
# [1] "2015-12-11 15:24:54 IST"

d_time + dminutes(50)
# [1] "2015-12-11 14:14:54 IST"

d_time + dseconds(60)
# [1] "2015-12-11 13:25:54 IST"

# Extract the components of n_time into separate columns.
# The original assigned n_time$hour etc., which turns the POSIXct value into a
# list, and called now() once per component, so the pieces could come from
# different instants. All components are now taken from n_time itself.
# NOTE(review): the columns are now named hour, minute, ... instead of
# n_time.hour, n_time.minute, ... - adjust any code that used the old names.
new_data <- data.frame(
  hour = hour(n_time),
  minute = minute(n_time),
  second = second(n_time),
  month = month(n_time),
  year = year(n_time)
)
new_data

Note: The best use of these packages is not in isolation but in conjunction. You
could easily use this package with dplyr where you can easily select a data
variable and extract the useful data from it using the chain command.

End Notes

These packages would not only enhance your data manipulation experience, but also
give you reasons to explore R in depth. Now we have seen, these packages make
coding in R easier. You no longer need to write long codes. Instead write short
codes and do more.

Every package can do many things. Hence, I would suggest that you first get hold
of the important functions that are used frequently. Once you are familiar with
them, you can dig deeper. I made this mistake initially: I tried to explore all
the features of ggplot2 at once and ended up confused. I’d suggest that you
practice this code as you read. This will help you build confidence in using
these packages.

In this article, I’ve explained the use of 7 R packages which can make data
exploration easier and faster. R, already known for its awesome statistical
functions, has with these newly updated packages become a favorite tool of data
scientists too.

#--------------------------------------------------------------------------------

A brief introduction to “apply” in R

So, what are these wondrous apply functions and how do they work? I think the best
way to figure out anything in R is to learn by experimentation, using
embarrassingly trivial data and functions.

If you fire up your R console, type “??apply” and scroll down to the functions in
the base package, you’ll see something like this:
base::apply Apply Functions Over Array Margins
base::by Apply a Function to a Data Frame Split by Factors
base::eapply Apply a Function Over Values in an Environment
base::lapply Apply a Function over a List or Vector
base::mapply Apply a Function to Multiple List or Vector Arguments
base::rapply Recursively Apply a Function to a List
base::tapply Apply a Function Over a Ragged Array

Let’s examine each of those.

1. apply
Description: “Returns a vector or array or list of values obtained by applying a
function to margins of an array or matrix.”

OK – we know about vectors/arrays and functions, but what are these “margins”?
Simple: either the rows (1), the columns (2) or both (1:2). By “both”, we mean
“apply the function to each individual value.” An example:

# create a matrix of 10 rows x 2 columns


m <- matrix(c(1:10, 11:20), nrow = 10, ncol = 2)
# mean of the rows
apply(m, 1, mean)
[1] 6 7 8 9 10 11 12 13 14 15
# mean of the columns
apply(m, 2, mean)
[1] 5.5 15.5
# divide all values by 2
apply(m, 1:2, function(x) x/2)

[,1] [,2]
[1,] 0.5 5.5
[2,] 1.0 6.0
[3,] 1.5 6.5
[4,] 2.0 7.0
[5,] 2.5 7.5
[6,] 3.0 8.0
[7,] 3.5 8.5
[8,] 4.0 9.0
[9,] 4.5 9.5
[10,] 5.0 10.0

That last example was rather trivial; you could just as easily do “m[, 1:2]/2” –
but you get the idea.

2. by

Updated 27/2/14: note that the original example in this section no longer works;
use colMeans now instead of mean.
Description: “Function ‘by’ is an object-oriented wrapper for ‘tapply’ applied to
data frames.”

The by function is a little more complex than that. Read a little further and the
documentation tells you that “a data frame is split by row into data frames
subsetted by the values of one or more factors, and function ‘FUN’ is applied to
each subset in turn.” So, we use this one where factors are involved.

To illustrate, we can load up the classic R dataset “iris”, which contains a bunch
of flower measurements:
attach(iris)
head(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa

# get the mean of the first 4 variables, by species


by(iris[, 1:4], Species, colMeans)
Species: setosa
Sepal.Length Sepal.Width Petal.Length Petal.Width
5.006 3.428 1.462 0.246
------------------------------------------------------------
Species: versicolor
Sepal.Length Sepal.Width Petal.Length Petal.Width
5.936 2.770 4.260 1.326
------------------------------------------------------------
Species: virginica
Sepal.Length Sepal.Width Petal.Length Petal.Width
6.588 2.974 5.552 2.026

Essentially, by provides a way to split your data by factors and do calculations on


each subset. It returns an object of class “by” and there are many, more complex
ways to use it.

3. eapply
Description: “eapply applies FUN to the named values from an environment and
returns the results as a list.”

This one is a little trickier, since you need to know something about environments
in R. An environment, as the name suggests, is a self-contained object with its own
variables and functions. To continue using our very simple example:

# a new environment
e <- new.env()
# two environment variables, a and b
e$a <- 1:10
e$b <- 11:20
# mean of the variables
eapply(e, mean)
$b
[1] 15.5

$a
[1] 5.5

I don’t often create my own environments, but they’re commonly used by R packages
such as Bioconductor so it’s good to know how to handle them.

4. lapply
Description: “lapply returns a list of the same length as X, each element of which
is the result of applying FUN to the corresponding element of X.”

That’s a nice, clear description which makes lapply one of the easier apply
functions to understand. A simple example:
# create a list with 2 elements
l <- list(a = 1:10, b = 11:20)
# the mean of the values in each element
lapply(l, mean)
$a
[1] 5.5

$b
[1] 15.5

# the sum of the values in each element


lapply(l, sum)
$a
[1] 55

$b
[1] 155

The lapply documentation tells us to consult further documentation for sapply,


vapply and replicate. Let’s do that.

4.1 sapply
Description: “sapply is a user-friendly version of lapply by default returning a
vector or matrix if appropriate.”

That simply means that if lapply would have returned a list with elements $a and
$b, sapply will return either a vector, with elements [[‘a’]] and [[‘b’]], or a
matrix with column names “a” and “b”. Returning to our previous simple example:

# create a list with 2 elements


l <- list(a = 1:10, b = 11:20)
# mean of values using sapply
l.mean <- sapply(l, mean)
# what type of object was returned?
class(l.mean)
[1] "numeric"
# it's a numeric vector, so we can get element "a" like this
l.mean[['a']]
[1] 5.5

4.2 vapply
Description: “vapply is similar to sapply, but has a pre-specified type of return
value, so it can be safer (and sometimes faster) to use.”

A third argument is supplied to vapply, which you can think of as a kind of


template for the output. The documentation uses the fivenum function as an example,
so let’s go with that:

l <- list(a = 1:10, b = 11:20)


# fivenum of values using vapply
l.fivenum <- vapply(l, fivenum, c(Min.=0, "1st Qu."=0, Median=0, "3rd Qu."=0,
Max.=0))
class(l.fivenum)
[1] "matrix"
# let's see it
l.fivenum
a b
Min. 1.0 11.0
1st Qu. 3.0 13.0
Median 5.5 15.5
3rd Qu. 8.0 18.0
Max. 10.0 20.0

So, vapply returned a matrix, where the column names correspond to the original
list elements and the row names to the output template. Nice.

4.3 replicate
Description: “replicate is a wrapper for the common use of sapply for repeated
evaluation of an expression (which will usually involve random number generation).”

The replicate function is very useful. Give it two mandatory arguments: the number
of replications and the function to replicate; a third optional argument, simplify
= T, tries to simplify the result to a vector or matrix. An example – let’s
simulate 10 normal distributions, each with 10 observations:

replicate(10, rnorm(10))
[,1] [,2] [,3] [,4] [,5] [,6]
[1,] 0.67947001 -1.94649409 0.28144696 0.5872913 2.22715085 -0.275918282
[2,] 1.17298643 -0.01529898 -1.47314092 -1.3274354 -0.04105249 0.528666264
[3,] 0.77272662 -2.36122644 0.06397576 1.5870779 -0.33926083 1.121164338
[4,] -0.42702542 -0.90613885 0.83645668 -0.5462608 -0.87458396 -0.723858258
[5,] -0.73892937 -0.57486661 -0.04418200 -0.1120936 0.08253614 1.319095242
[6,] 2.93827883 -0.33363446 0.55405024 -0.4942736 0.66407615 -0.153623614
[7,] 1.30037496 -0.26207115 0.49818215 1.0774543 -0.28206908 0.825488436
[8,] -0.04153545 -0.23621632 -1.01192741 0.4364413 -2.28991601 -0.002867193
[9,] 0.01262547 0.40247248 0.65816829 0.9541927 -1.63770154 0.328180660
[10,] 0.96525278 -0.37850821 -0.85869035 -0.6055622 1.13756753 -0.371977151
[,7] [,8] [,9] [,10]
[1,] 0.03928297 0.34990909 -0.3159794 1.08871657
[2,] -0.79258805 -0.30329668 -1.0902070 0.73356542
[3,] 0.10673459 -0.02849216 0.8094840 0.06446245
[4,] -0.84584079 -0.57308461 -1.3570979 -0.89801330
[5,] -1.50226560 -2.35751419 1.2104163 0.74650696
[6,] -0.32790991 0.80144695 -0.0071844 0.05742356
[7,] 1.36719970 2.34148354 0.9148911 0.20451421
[8,] -0.51112579 -0.53658159 1.5194130 -0.94250069
[9,] 0.52017814 -1.22252527 0.4519702 0.08779704
[10,] 1.35908918 1.09024342 0.5912627 -0.20709053

5. mapply
Description: “mapply is a multivariate version of sapply. mapply applies FUN to the
first elements of each (…) argument, the second elements, the third elements, and
so on.”

The mapply documentation is full of quite complex examples, but here’s a simple,
silly one:

l1 <- list(a = c(1:10), b = c(11:20))


l2 <- list(c = c(21:30), d = c(31:40))
# sum the corresponding elements of l1 and l2
mapply(sum, l1$a, l1$b, l2$c, l2$d)
[1] 64 68 72 76 80 84 88 92 96 100

Here, we sum l1$a[1] + l1$b[1] + l2$c[1] + l2$d[1] (1 + 11 + 21 + 31) to get 64,
the first element of the returned vector. This continues all the way through to
l1$a[10] + l1$b[10] + l2$c[10] + l2$d[10] (10 + 20 + 30 + 40) = 100, the last
element.

6. rapply
Description: “rapply is a recursive version of lapply.”

I think “recursive” is a little misleading. What rapply does is apply functions to


lists in different ways, depending on the arguments supplied. Best illustrated by
examples:

# let's start with our usual simple list example


l <- list(a = 1:10, b = 11:20)
# log2 of each value in the list
rapply(l, log2)
a1 a2 a3 a4 a5 a6 a7 a8
0.000000 1.000000 1.584963 2.000000 2.321928 2.584963 2.807355 3.000000
a9 a10 b1 b2 b3 b4 b5 b6
3.169925 3.321928 3.459432 3.584963 3.700440 3.807355 3.906891 4.000000
b7 b8 b9 b10
4.087463 4.169925 4.247928 4.321928
# log2 of each value in each list
rapply(l, log2, how = "list")
$a
[1] 0.000000 1.000000 1.584963 2.000000 2.321928 2.584963 2.807355 3.000000
[9] 3.169925 3.321928

$b
[1] 3.459432 3.584963 3.700440 3.807355 3.906891 4.000000 4.087463 4.169925
[9] 4.247928 4.321928

# what if the function is the mean?


rapply(l, mean)
a b
5.5 15.5

rapply(l, mean, how = "list")


$a
[1] 5.5

$b
[1] 15.5

So, the output of rapply depends on both the function and the how argument. When
how = “list” (or “replace”), the original list structure is preserved. Otherwise,
the default is to unlist, which results in a vector.

You can also pass a “classes =” argument to rapply. For example, in a mixed list of
numeric and character variables, you could specify that the function act only on
the numeric values with “classes = "numeric"”.

7. tapply
Description: “Apply a function to each cell of a ragged array, that is to each
(non-empty) group of values given by a unique combination of the levels of certain
factors.”

Woah there. That sounds complicated. Don’t panic though, it becomes clearer when
the required arguments are described. Usage is “tapply(X, INDEX, FUN = NULL, …,
simplify = TRUE)”, where X is “an atomic object, typically a vector” and INDEX is
“a list of factors, each of same length as X”.

So, to go back to the famous iris data, “Species” might be a factor and
“iris$Petal.Length” would give us a vector of values. We could then run something
like:

# mean petal length by species
# The grouping factor is referenced as iris$Species instead of attach()-ing the
# data frame, so nothing is added to the search path and no names get masked.
tapply(iris$Petal.Length, iris$Species, mean)
setosa versicolor virginica
1.462 4.260 5.552

# Built-in R Features - Data Structures


# R contains quite a few useful built-in functions to work with data structures.
# Here are some of the key functions to know:
# seq(): Create sequences
# sort(): Sort a vector
# rev(): Reverse elements in object
# str(): Show the structure of an object
# append(): Merge objects together (works on vectors and lists)

# seq(from, to, by): arithmetic sequence from 0 up to at most 100 in steps of 3
seq(from = 0, to = 100, by = 3)

# Sorting: ascending by default, descending on request
num <- c(1, 4, 6, 7, 2, 13, 2)
sort(num)
sort(num, decreasing = TRUE)

# Reversing a vector and inspecting an object's structure
num2 <- c(1, 2, 3, 4, 5)
rev(num2)
str(num)

# append() concatenates its arguments; the result can go straight into sort()
append(num, num2)
sort(append(num, num2))

a <- c(1, 2, 3)
is.vector(a)

# Data Types
# is.*(): test what kind of object something is
# as.*(): convert an object to another kind
v <- c(1, 2, 3)
is.vector(v)
is.list(v)
as.list(v)
as.matrix(v)

#--------------------------------------------------------------------------------
#Built-in R Features - Math
#We've talked a bit about some of the built-in math functions and features in R,
#but let's have one more look at a few of them:
#abs(): computes the absolute value.
#sum(): returns the sum of all the values present in the input.
#mean(): computes the arithmetic mean.
#round(): rounds values (additional arguments to nearest)

# Sample data with a negative value, zero and positives
vec <- c(-1, 0, 1, 2, 3, 4, 5)

# abs() works on a single number and element-wise on a vector
abs(-2)
abs(vec)

# Aggregates over the whole vector
sum(vec)
mean(vec)

# round() defaults to 0 decimal places; the second argument is `digits`
round(23.1231)
round(23.1231234, digits = 2)

#Numeric Functions
#Function Description
#abs(x) absolute value
#sqrt(x) square root
#ceiling(x) ceiling(3.475) is 4
#floor(x) floor(3.475) is 3
#trunc(x) trunc(5.99) is 5
#round(x, digits=n) round(3.475, digits=2) is 3.48
#signif(x, digits=n) signif(3.475, digits=2) is 3.5
#cos(x), sin(x), tan(x) also acos(x), cosh(x), acosh(x), etc.
#log(x) natural logarithm
#log10(x) common logarithm
#exp(x) e^x

#--------------------------------------------------------------------------------
#For now we'll learn about two useful functions for regular expressions and pattern
#searching (we'll go deeper into this topic in general later on):
#grepl(), which returns a logical indicating if the pattern was found
#grep(), which returns a vector of index locations of matching pattern instances

# grepl(): TRUE/FALSE depending on whether the pattern occurs in the string
text <- "Hi there, do you know who you are voting for?"
grepl(pattern = "voting", x = text)
grepl(pattern = "Hi", x = text)
grepl(pattern = "Sammy", x = text)

# grep(): index positions of the elements that match the pattern
v <- c("a", "b", "c", "d")
grep(pattern = "a", x = v)
grep(pattern = "c", x = v)

#--------------------------------------------------------------------------------
#Timestamps
#R gives us a variety of tools for working with timestamp information. Let's start
#off by exploring the Date object:
#Dates
#You can use the as.Date() function to convert a character string to a Date object,
#which will allow it to contain more time information. The string will need to be in
#a standard time format. We can ask for today's date by asking the system (Sys.) for
#the Date:

# Ask the system for today's date.
Sys.Date()

# Set as a variable
today <- Sys.Date()
# Evaluate the variable on its own to display the stored value.
today

#You can also convert character strings in R to a Date object using as.Date().
#You'll need to make sure it's in the correct format, or use % symbols that
#correlate with your given format:
#Code Value
#%d Day of the month (decimal number)
#%m Month (decimal number)
#%b Month (abbreviated)
#%B Month (full name)
#%y Year (2 digit)
#%Y Year (4 digit)

# ISO style YYYY-MM-DD parses without an explicit format
as.Date("1990-11-03")

# Abbreviated month name, day, 2-digit year
# NOTE(review): %b / %B match month names in the current locale; these examples
# assume an English locale.
as.Date("Nov-03-90", format = "%b-%d-%y")

# Full month name, day, 4-digit year
as.Date("November-03-1990", format = "%B-%d-%Y")

# Date-times: with only a time of day given, the current date is filled in
as.POSIXct("11:02:03", format = "%H:%M:%S")
as.POSIXct("November-03-1990 11:02:03", format = "%B-%d-%Y %H:%M:%S")

# strptime() parses the same way but returns a POSIXlt object
strptime("11:02:03", format = "%H:%M:%S")

#================================================================================
Data Manipulation in R
#================================================================================

#--------------------------------------------------------------------------------
#lapply()
#lapply() will apply a function over a list or vector:
#lapply(X, FUN, ...)
#where X is your list/vector and FUN is your function. For more info you can use:

# Open the documentation page for lapply()
help(lapply)

# Draw a single random integer from 1..10
sample(x = 1:10, size = 1)

# Input vector used by the lapply()/sapply() examples
v <- c(1, 2, 3, 4, 5)

# our custom function


#' Add a random integer between 1 and 10 to `x`.
#'
#' @param x A numeric value (or vector); the same single random offset is added
#'   to every element.
#' @return `x` shifted by one draw from `sample(1:10, 1)`.
addrand <- function(x) {
  # One draw per call, so the result changes unless a seed is set.
  offset <- sample(x = 1:10, size = 1)
  x + offset
}

# lapply(): call addrand() once per element of v; the result is a list
lapply(v, addrand)

# v itself is left untouched
print(v)

# Same idea with an anonymous function instead of a named one
lapply(v, function(elem) {
  elem + sample(x = 1:10, size = 1)
})

# Anonymous function that adds two to every element
lapply(v, function(elem) {
  elem + 2
})

#' Add `choice` to `num`.
#'
#' @param num Numeric value (or vector) to start from.
#' @param choice Numeric amount to add; it has no default, so it must be given.
#' @return The sum `num + choice`.
add_choice <- function(num, choice) {
  num + choice
}

# Direct call with both arguments
add_choice(2, 3)

# Uh oh! Forgot to add other arguments!
# This call fails on purpose because `choice` has no default. try() still
# reports the error but lets the rest of the script keep running instead of
# aborting here.
try(lapply(v, add_choice))

# Nice! Arguments given after FUN are forwarded to every call of add_choice().
lapply(v, add_choice, choice = 10)

#sapply() vs. lapply()


#Notice that lapply returned a list, we can use sapply, which simplifies the
#process by returning a vector or matrix. For example:

# Open the documentation page for sapply()
help(sapply)

# Nice! sapply() simplifies the result to a vector
sapply(v, add_choice, choice = 10)

# Let's prove it to ourselves: run the same call through both functions
lapp <- lapply(v, add_choice, choice = 10)
sapp <- sapply(v, add_choice, choice = 10)

# ... and compare the classes of the results
class(lapp)  # a list
class(sapp)  # vector of numerics

# Checks for even numbers


#' Keep only the even values of `x`.
#'
#' @param x A numeric vector.
#' @return The elements of `x` that are divisible by 2; a zero-length vector
#'   when there are none.
even <- function(x) {
  is_even <- x %% 2 == 0
  x[is_even]
}

# Apply even() to each element separately. Odd elements give zero-length
# results, so sapply() cannot simplify and returns a list just like lapply().
nums <- c(1, 2, 3, 4, 5)

sapply(nums, even)

lapply(nums, even)

#================================================================================
Data Visualization with ggplot2
#================================================================================

#--------------------------------------------------------------------------------
# Line Graphs
#--------------------------------------------------------------------------------
The basic syntax to create a line chart in R is:
plot(v,type,col,xlab,ylab)
Following is the description of the parameters used:
• v is a vector containing the numeric values.
• type takes the value "p" to draw only the points, "l" to draw only the lines and
  "o" to draw both points and lines.
• xlab is the label for x axis.
• ylab is the label for y axis.
• main is the title of the chart.
• col is used to give colors to both the points and lines.

# Create the data for the chart.
v <- c(7, 12, 28, 3, 41)
# Open a JPEG device so the file content matches its ".jpg" extension
# (png() would write PNG data under a .jpg name).
jpeg(filename = "line_chart.jpg")
# Plot the line chart; type = "o" draws both the points and the lines.
plot(v, type = "o")
# Close the device, which finishes writing the file.
dev.off()

# Give the chart file a name. Without opening a device first there is no file
# for the dev.off() below to save.
jpeg(filename = "line_chart_label_colored.jpg")
# Plot the line chart with a colour, axis labels and a title.
plot(v, type = "o", col = "red", xlab = "Month", ylab = "Rain fall",
     main = "Rain fall chart")
# Close the device, which finishes writing the file.
dev.off()
# Create the data for the chart: two series of equal length.
v <- c(7, 12, 28, 3, 41)
t <- c(14, 7, 6, 19, 3)
# Open a JPEG device so the file content matches its ".jpg" extension
# (png() would write PNG data under a .jpg name).
jpeg(filename = "line_chart_2_lines.jpg")
# Plot the first series as a line chart ...
plot(v, type = "o", col = "red", xlab = "Month", ylab = "Rain fall",
     main = "Rain fall chart")
# ... and add the second series to the same plot.
lines(t, type = "o", col = "blue")
# Close the device, which finishes writing the file.
dev.off()

#--------------------------------------------------------------------------------
# Bar Charts
#--------------------------------------------------------------------------------

The basic syntax to create a bar-chart in R is:


barplot(H,xlab,ylab,main, names.arg,col)
Following is the description of the parameters used:
• H is a vector or matrix containing numeric values used in bar chart.
• xlab is the label for x axis.
• ylab is the label for y axis.
• main is the title of the bar chart.
• names.arg is a vector of names appearing under each bar.
• col is used to give colors to the bars in the graph.

# Bar heights
H <- c(7, 12, 28, 3, 41)
# Open the PNG device that receives the chart
png(filename = "barchart.png")
# One bar per value of H
barplot(H)
# Close the device, which finishes writing the file
dev.off()

# Bar heights and the label shown under each bar
H <- c(7, 12, 28, 3, 41)
M <- c("Mar", "Apr", "May", "Jun", "Jul")
# Open the PNG device that receives the chart
png(filename = "barchart_months_revenue.png")
# Blue bars with red borders, labelled axes and a title
barplot(
  H,
  names.arg = M,
  xlab = "Month",
  ylab = "Revenue",
  col = "blue",
  main = "Revenue chart",
  border = "red"
)
# Close the device, which finishes writing the file
dev.off()

# Create the input vectors: one colour per region, one label per month.
colors <- c("green", "orange", "brown")
months <- c("Mar", "Apr", "May", "Jun", "Jul")
regions <- c("East", "West", "North")
# Create the matrix of the values: 3 rows (regions) x 5 columns (months),
# filled row by row.
Values <- matrix(c(2, 9, 3, 11, 9, 4, 8, 7, 3, 12, 5, 2, 8, 10, 11),
                 nrow = 3, ncol = 5, byrow = TRUE)
# Give the chart file a name.
png(filename = "barchart_stacked.png")
# Create the bar chart; a matrix input gives one stacked bar per column.
# The title is a single-line string (the original literal was split across
# two lines, which put a newline into the title).
barplot(Values, main = "total revenue", names.arg = months, xlab = "month",
        ylab = "revenue", col = colors)
# Add the legend to the chart.
legend("topleft", regions, cex = 1.3, fill = colors)
# Close the device, which finishes writing the file.
dev.off()

#--------------------------------------------------------------------------------
# Pie Charts
#--------------------------------------------------------------------------------
Syntax
The basic syntax for creating a pie-chart using the R is:
pie(x, labels, radius, main, col, clockwise)

Following is the description of the parameters used:


• x is a vector containing the numeric values used in the pie chart.
• labels is used to give description to the slices.
• radius indicates the radius of the circle of the pie chart (value between -1 and
  +1).
• main indicates the title of the chart.
• col indicates the color palette.
• clockwise is a logical value indicating if the slices are drawn clockwise or
  anti-clockwise.

# A very simple pie-chart is created using just the input vector and labels. The
# below script will create and save the pie chart in the current R working
# directory.

# Create data for the graph.
x <- c(21, 62, 10, 53)
labels <- c("London", "New York", "Singapore", "Mumbai")
# Open a JPEG device so the file content matches its ".jpg" extension
# (png() would write PNG data under a .jpg name).
jpeg(filename = "city.jpg")
# Plot the chart: one slice per value of x, labelled with the city names.
pie(x, labels)
# Close the device, which finishes writing the file.
dev.off()

# The below script will create and save the pie chart in the current R working
# directory. It reuses x and labels from the previous example.

# Open a JPEG device so the file content matches its ".jpg" extension
# (png() would write PNG data under a .jpg name).
jpeg(filename = "city_title_colours.jpg")
# Plot the chart with a title and one rainbow colour per slice.
pie(x, labels, main = "City pie chart", col = rainbow(length(x)))
# Close the device, which finishes writing the file.
dev.off()

# We can add slice percentage and a chart legend by creating additional chart
# variables.

# Create data for the graph.
x <- c(21, 62, 10, 53)
labels <- c("London", "New York", "Singapore", "Mumbai")
# Share of each slice in percent, rounded to one decimal place.
piepercent <- round(100 * x / sum(x), 1)
# Open a JPEG device so the file content matches its ".jpg" extension
# (png() would write PNG data under a .jpg name).
jpeg(filename = "city_percentage_legends.jpg")
# Plot the chart with the percentages as slice labels.
pie(x, labels = piepercent, main = "City pie chart", col = rainbow(length(x)))
# The legend maps each colour back to its city name.
legend("topright", legend = labels, cex = 0.8, fill = rainbow(length(x)))
# Close the device, which finishes writing the file.
dev.off()
# A pie chart with 3 dimensions can be drawn using additional packages. The package
# plotrix has a function called pie3D() that is used for this.

# Get the library.
library(plotrix)
# Create data for the graph.
x <- c(21, 62, 10, 53)
lbl <- c("London", "New York", "Singapore", "Mumbai")
# Open a JPEG device so the file content matches its ".jpg" extension
# (png() would write PNG data under a .jpg name).
jpeg(filename = "3d_pie_chart.jpg")
# Plot the chart; explode controls how far the slices are pulled apart.
pie3D(x, labels = lbl, explode = 0.1, main = "Pie Chart of Countries ")
# Close the device, which finishes writing the file.
dev.off()

#--------------------------------------------------------------------------------
# Boxplots
#--------------------------------------------------------------------------------
The basic syntax to create a boxplot in R is :
boxplot(x,data,notch,varwidth,names,main)
Following is the description of the parameters used:
• x is a vector or a formula.
• data is the data frame.
• notch is a logical value. Set as TRUE to draw a notch.
• varwidth is a logical value. Set as TRUE to draw the width of the box
  proportionate to the sample size.
• names are the group labels which will be printed under each boxplot.
• main is used to give a title to the graph.

# Keep only the two mtcars columns used in the boxplot example and preview them
input <- mtcars[, c("mpg", "cyl")]
print(head(input))

# Open the PNG device that receives the chart
png(filename = "boxplot.png")
# One box of mpg per distinct value of cyl (formula interface on mtcars)
boxplot(
  mpg ~ cyl,
  data = mtcars,
  xlab = "Number of Cylinders",
  ylab = "Miles Per Gallon",
  main = "Mileage Data"
)
# Close the device, which finishes writing the file
dev.off()

#--------------------------------------------------------------------------------
# Histograms
#--------------------------------------------------------------------------------
Syntax
The basic syntax for creating a histogram using R is:
hist(v,main,xlab,xlim,ylim,breaks,col,border)
Following is the description of the parameters used:
• v is a vector containing numeric values used in histogram.
• main indicates title of the chart.
• col is used to set color of the bars.
• border is used to set border color of each bar.
• xlab is used to give description of x-axis.
• xlim is used to specify the range of values on the x-axis.
• ylim is used to specify the range of values on the y-axis.
• breaks is used to suggest the number of bars (or to give the break points
  between them).

# Values to be binned
v <- c(9, 13, 21, 8, 36, 22, 12, 41, 31, 33, 19)
# Open the PNG device that receives the chart
png(filename = "histogram.png")
# Yellow bars with blue borders; the x axis is labelled "Weight"
hist(v, xlab = "Weight", col = "yellow", border = "blue")
# Close the device, which finishes writing the file
dev.off()

# Give the chart file a name. Without opening a device first there is no file
# for the dev.off() below to save.
png(filename = "histogram_lim_breaks.png")
# Create the histogram with explicit axis ranges and a suggested number of bins.
hist(v, xlab = "Weight", col = "green", border = "red",
     xlim = c(0, 40), ylim = c(0, 5), breaks = 5)
# Close the device, which finishes writing the file.
dev.off()

#--------------------------------------------------------------------------------
# Scatterplots
#--------------------------------------------------------------------------------

The basic syntax for creating scatterplot in R is :


plot(x, y, main, xlab, ylab, xlim, ylim, axes)
Following is the description of the parameters used:
• x is the data set whose values are the horizontal coordinates.
• y is the data set whose values are the vertical coordinates.
• main is the title of the graph.
• xlab is the label in the horizontal axis.
• ylab is the label in the vertical axis.
• xlim is the limits of the values of x used for plotting.
• ylim is the limits of the values of y used for plotting.
• axes indicates whether both axes should be drawn on the plot.

# Keep only the weight and mileage columns of mtcars and preview them
input <- mtcars[, c("wt", "mpg")]
print(head(input))

# Give the chart file a name. Without opening a device first there is no file
# for the dev.off() below to save.
png(filename = "scatterplot.png")
# Plot the chart for cars with weight between 2.5 to 5 and mileage between 15
# and 30 (the axis limits restrict what is shown).
plot(
  x = input$wt,
  y = input$mpg,
  xlab = "Weight",
  ylab = "Milage",
  xlim = c(2.5, 5),
  ylim = c(15, 30),
  main = "Weight vs Milage"
)
# Close the device, which finishes writing the file.
dev.off()

# Give the chart file a name. Without opening a device first there is no file
# for the dev.off() below to save.
png(filename = "scatterplot_matrices.png")
# Plot the matrices between 4 variables giving 12 plots.
# One variable with 3 others and total 4 variables.
pairs(~ wt + mpg + disp + cyl, data = mtcars, main = "Scatterplot Matrix")
# Close the device, which finishes writing the file.
dev.off()

http://www.r-graph-gallery.com/portfolio/ggplot2-package/

Das könnte Ihnen auch gefallen