Beruflich Dokumente
Kultur Dokumente
(a) Particionar los datos en 70% para entrenamiento del modelo (TRAIN) y 30% para prueba (TEST).
# 70/30 split on the outcome `delay`; `particion` holds the row indices
# selected for training.
# NOTE(review): no set.seed() is visible before this call, so the split
# (and every accuracy derived from it) may differ between runs.
particion <- createDataPartition(
  y = datos$delay,
  times = 1,
  p = 0.7,
  list = FALSE
)
# Rows not selected for training form the test set.
TEST <- datos[-particion, ]
TRAIN <- datos[particion, ]
(b) Para cada variable, entrene un modelo de bosque aleatorio con una sola variable. Utilice los datos de
entrenamiento.
# Variable 1 – schedtime
# Random forest (50 trees, 10-fold CV) trained on `schedtime` alone.
v1 <- train(
  x = subset(TRAIN, select = schedtime),
  y = TRAIN$delay,
  method = "rf",
  ntree = 50,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v1[["finalModel"]]
# Score the test set and compare against the observed `delay` labels.
prediccion1 <- predict(v1, newdata = TEST)
confusionMatrix(data = prediccion1, reference = TEST$delay)
# Variable 2 – carrier
# Random forest (50 trees, 10-fold CV) trained on `carrier` alone.
v2 <- train(
  x = subset(TRAIN, select = carrier),
  y = TRAIN$delay,
  method = "rf",
  ntree = 50,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v2[["finalModel"]]
# Score the test set and compare against the observed `delay` labels.
prediccion2 <- predict(v2, newdata = TEST)
confusionMatrix(data = prediccion2, reference = TEST$delay)
# Variable 3 – dest
# Random forest (100 trees, 10-fold CV) trained on `dest` alone.
# The label above is now a comment; left bare it is not valid R.
v3 <- train(
  x = subset(TRAIN, select = dest),
  y = TRAIN$delay,
  method = "rf",
  ntree = 100,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v3$finalModel
# Score the test set and compare against the observed `delay` labels.
prediccion3 <- predict(v3, TEST)
confusionMatrix(prediccion3, TEST$delay)
# Variable 4 – distance
# Random forest (100 trees, 10-fold CV) trained on `distance` alone.
# The label above is now a comment; left bare it is not valid R.
v4 <- train(
  x = subset(TRAIN, select = distance),
  y = TRAIN$delay,
  method = "rf",
  ntree = 100,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v4$finalModel
# Score the test set and compare against the observed `delay` labels.
prediccion4 <- predict(v4, TEST)
confusionMatrix(prediccion4, TEST$delay)
# Variable 5 – date
# Random forest (100 trees, 10-fold CV) trained on `date` alone.
# The label above is now a comment; left bare it is not valid R.
v5 <- train(
  x = subset(TRAIN, select = date),
  y = TRAIN$delay,
  method = "rf",
  ntree = 100,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v5$finalModel
# Score the test set and compare against the observed `delay` labels.
prediccion5 <- predict(v5, TEST)
confusionMatrix(prediccion5, TEST$delay)
# Variable 6 – flightnumber
# Random forest (30 trees, 10-fold CV) trained on `flightnumber` alone.
# The label above is now a comment; left bare it is not valid R.
v6 <- train(
  x = subset(TRAIN, select = flightnumber),
  y = TRAIN$delay,
  method = "rf",
  ntree = 30,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v6$finalModel
# Score the test set and compare against the observed `delay` labels.
prediccion6 <- predict(v6, TEST)
confusionMatrix(prediccion6, TEST$delay)
# Variable 7 – origin
# Random forest (30 trees, 10-fold CV) trained on `origin` alone.
# The label above is now a comment; left bare it is not valid R.
v7 <- train(
  x = subset(TRAIN, select = origin),
  y = TRAIN$delay,
  method = "rf",
  ntree = 30,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v7$finalModel
# Score the test set and compare against the observed `delay` labels.
prediccion7 <- predict(v7, TEST)
confusionMatrix(prediccion7, TEST$delay)
# Variable 8 – weather
# Random forest (100 trees, 10-fold CV) trained on `weather` alone.
# The label above is now a comment; left bare it is not valid R.
v8 <- train(
  x = subset(TRAIN, select = weather),
  y = TRAIN$delay,
  method = "rf",
  ntree = 100,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v8$finalModel
# Score the test set and compare against the observed `delay` labels.
prediccion8 <- predict(v8, TEST)
confusionMatrix(prediccion8, TEST$delay)
# Variable 9 – dayweek
# Random forest (30 trees, 10-fold CV) trained on `dayweek` alone.
# The label above is now a comment; left bare it is not valid R.
v9 <- train(
  x = subset(TRAIN, select = dayweek),
  y = TRAIN$delay,
  method = "rf",
  ntree = 30,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v9$finalModel
# Score the test set and compare against the observed `delay` labels.
prediccion9 <- predict(v9, TEST)
confusionMatrix(prediccion9, TEST$delay)
# Variable 10 – daymonth
# Random forest (100 trees, 10-fold CV) trained on `daymonth` alone.
# The label above is now a comment; left bare it is not valid R.
v10 <- train(
  x = subset(TRAIN, select = daymonth),
  y = TRAIN$delay,
  method = "rf",
  ntree = 100,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v10$finalModel
# Score the test set and compare against the observed `delay` labels.
prediccion10 <- predict(v10, TEST)
confusionMatrix(prediccion10, TEST$delay)
(c) Identifique el modelo con el menor error de predicción. Utilice los datos de prueba.
Variable Accuracy
El modelo con menor error de predicción es el de la variable 8 – Weather, con un accuracy de 0.8179:
CÓDIGO:
# Random forest (100 trees, 10-fold CV) trained on every column of TRAIN
# except the outcome `delay`.
modelo <- train(
  x = subset(TRAIN, select = -delay),
  y = TRAIN$delay,
  method = "rf",
  ntree = 100,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
modelo[["finalModel"]]
# Score the test set and compare against the observed `delay` labels.
prediccionModelo <- predict(modelo, newdata = TEST)
confusionMatrix(data = prediccionModelo, reference = TEST$delay)
(e) Determine el "accuracy" (ACC) del modelo ajustado. Utilice los datos de TEST.
El accuracy del modelo de bosques aleatorios es 0.8361.
(f) ¿Cuál es la probabilidad de que un vuelo con los datos: schedtime 840, carrier DL, dest LGA, distance
214, date 1/15/2004, flightnumber 4964, origin DCA, weather 0, dayweek 4, daymonth 15, tailnu N703MQ,
se retrase?
CÓDIGO:
# rpart classifier (10-fold CV) trained on every column of TRAIN except the
# outcome `delay`; used here to obtain class probabilities for one flight.
prob <- train(
  x = subset(TRAIN, select = -delay),
  y = TRAIN$delay,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10)
)
# The flight described in question (f). `dayweek` is 4 as stated in the
# question (the previous value "0" did not match it).
# NOTE(review): weather/dayweek/daymonth are passed as strings, which assumes
# those columns are categorical in TRAIN — confirm against the data.
# NOTE(review): the question also lists tailnu N703MQ; it is omitted here,
# presumably because TRAIN has no such column — confirm.
variables <- data.frame(
  schedtime = 840, carrier = "DL", dest = "LGA", distance = 214,
  date = "1/15/2004", flightnumber = 4964,
  origin = "DCA", weather = "0", dayweek = "4", daymonth = "15"
)
# Class probabilities for that flight (one column per level of `delay`).
predict(prob, newdata = variables, type = "prob")
(g) Considere que el costo de predecir un vuelo atrasado equivocadamente es la mitad que el de predecir un
vuelo no atrasado equivocadamente. Determine el umbral de probabilidad óptimo en esta situación.
Gráfica:
El umbral óptimo es el punto más bajo de la curva.
(h) Determine si el modelo encontrado en (d) tiene un menor o mayor ACC que el encontrado en (b). Utilice
los datos de prueba.
Modelo D Modelo B
El modelo D tiene mayor accuracy al realizar el análisis con más variables que puedan explicar los retrasos
en los vuelos.
TEMA 2: Conjunto de Datos Individual
El objetivo es utilizar los datos individuales que cada estudiante ha entregado en la tarea anterior, y utilizarlo
para entrenar un modelo de predicción. Específicamente, se requiere utilizar el paquete caret para lo siguiente:
(a) Asegure que las variables sean reconocidas correctamente en R según su tipo (numéricas o categóricas)
CÓDIGO:
# Per-column summary of the individual data set, used to check how each
# variable is being read.
summary(base)
# rpart classifier (10-fold CV) predicting `favorable` from every other
# column of TRAIN2.
modelo_AC <- train(
  x = subset(TRAIN2, select = -favorable),
  y = TRAIN2$favorable,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10)
)
(e) Determine el "accuracy" (ACC) del modelo ajustado. Utilice los datos de PRUEBA.
El accuracy es de 0.5237. Este valor podría mejorar si encontramos las variables óptimas para nuestro
modelo; esto se puede realizar utilizando la función de eliminación recursiva de características (rfe).
CÓDIGO UTILIZADO PARA FLIGHT DELAYS
# Packages used by the script, attached in the original order.
# Each one is installed only when it is not already available, so re-running
# the script does not reinstall (or need network access) every time.
paquetes <- c("caret", "klaR", "randomForest", "e1071", "rpart.plot")
for (paquete in paquetes) {
  if (!requireNamespace(paquete, quietly = TRUE)) {
    install.packages(paquete)
  }
  library(paquete, character.only = TRUE)
}
# Per-column summary of the flight-delay data.
summary(datos)
# For each variable, train a random forest that uses that variable alone.
# V1 - schedtime (50 trees, 10-fold CV)
v1 <- train(
  x = subset(TRAIN, select = schedtime),
  y = TRAIN$delay,
  method = "rf",
  ntree = 50,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v1[["finalModel"]]
# Test-set predictions versus the observed `delay` labels.
prediccion1 <- predict(v1, newdata = TEST)
confusionMatrix(data = prediccion1, reference = TEST$delay)
# V2 - carrier (50 trees, 10-fold CV)
v2 <- train(
  x = subset(TRAIN, select = carrier),
  y = TRAIN$delay,
  method = "rf",
  ntree = 50,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v2[["finalModel"]]
# Test-set predictions versus the observed `delay` labels.
prediccion2 <- predict(v2, newdata = TEST)
confusionMatrix(data = prediccion2, reference = TEST$delay)
# V3 - dest (100 trees, 10-fold CV)
v3 <- train(
  x = subset(TRAIN, select = dest),
  y = TRAIN$delay,
  method = "rf",
  ntree = 100,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v3[["finalModel"]]
# Test-set predictions versus the observed `delay` labels.
prediccion3 <- predict(v3, newdata = TEST)
confusionMatrix(data = prediccion3, reference = TEST$delay)
# V4 - distance (100 trees, 10-fold CV)
v4 <- train(
  x = subset(TRAIN, select = distance),
  y = TRAIN$delay,
  method = "rf",
  ntree = 100,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v4[["finalModel"]]
# Test-set predictions versus the observed `delay` labels.
prediccion4 <- predict(v4, newdata = TEST)
confusionMatrix(data = prediccion4, reference = TEST$delay)
# V5 - date (100 trees, 10-fold CV)
v5 <- train(
  x = subset(TRAIN, select = date),
  y = TRAIN$delay,
  method = "rf",
  ntree = 100,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v5[["finalModel"]]
# Test-set predictions versus the observed `delay` labels.
prediccion5 <- predict(v5, newdata = TEST)
confusionMatrix(data = prediccion5, reference = TEST$delay)
# V6 - flightnumber (30 trees, 10-fold CV)
v6 <- train(
  x = subset(TRAIN, select = flightnumber),
  y = TRAIN$delay,
  method = "rf",
  ntree = 30,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v6[["finalModel"]]
# Test-set predictions versus the observed `delay` labels.
prediccion6 <- predict(v6, newdata = TEST)
confusionMatrix(data = prediccion6, reference = TEST$delay)
# V7 - origin (30 trees, 10-fold CV)
v7 <- train(
  x = subset(TRAIN, select = origin),
  y = TRAIN$delay,
  method = "rf",
  ntree = 30,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v7[["finalModel"]]
# Test-set predictions versus the observed `delay` labels.
prediccion7 <- predict(v7, newdata = TEST)
confusionMatrix(data = prediccion7, reference = TEST$delay)
# V8 - weather (100 trees, 10-fold CV)
v8 <- train(
  x = subset(TRAIN, select = weather),
  y = TRAIN$delay,
  method = "rf",
  ntree = 100,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v8[["finalModel"]]
# Test-set predictions versus the observed `delay` labels.
prediccion8 <- predict(v8, newdata = TEST)
confusionMatrix(data = prediccion8, reference = TEST$delay)
# V9 - dayweek (30 trees, 10-fold CV)
v9 <- train(
  x = subset(TRAIN, select = dayweek),
  y = TRAIN$delay,
  method = "rf",
  ntree = 30,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v9[["finalModel"]]
# Test-set predictions versus the observed `delay` labels.
prediccion9 <- predict(v9, newdata = TEST)
confusionMatrix(data = prediccion9, reference = TEST$delay)
# V10 - daymonth (100 trees, 10-fold CV)
v10 <- train(
  x = subset(TRAIN, select = daymonth),
  y = TRAIN$delay,
  method = "rf",
  ntree = 100,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
v10[["finalModel"]]
# Test-set predictions versus the observed `delay` labels.
prediccion10 <- predict(v10, newdata = TEST)
confusionMatrix(data = prediccion10, reference = TEST$delay)
# Random forest (100 trees, 10-fold CV) trained on every column of TRAIN
# except the outcome `delay`.
modelo <- train(
  x = subset(TRAIN, select = -delay),
  y = TRAIN$delay,
  method = "rf",
  ntree = 100,
  trControl = trainControl(method = "cv", number = 10)
)
# Inspect the final fitted model.
modelo[["finalModel"]]
# Test-set predictions versus the observed `delay` labels.
prediccionModelo <- predict(modelo, newdata = TEST)
confusionMatrix(data = prediccionModelo, reference = TEST$delay)
# Show the caret training summary.
modelo
# rpart classifier (10-fold CV) trained on every column of TRAIN except the
# outcome `delay`.
prob <- train(
  x = subset(TRAIN, select = -delay),
  y = TRAIN$delay,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10)
)
# Optimal threshold: sweep the probability cut-off over [0, 1], compute the
# misclassification cost on TEST at each cut-off, and keep the cheapest one.
# NOTE(review): `prediccion.prob` is not defined in the visible script; it is
# presumably predict(<model>, TEST, type = "prob") — confirm which model.
umbral <- seq(0, 1, 0.001)
costo.beta <- 100
costo.alpha <- 0.5 * costo.beta
# Preallocate instead of growing the vector from NULL inside the loop.
costo <- numeric(length(umbral))
# Predictions are expressed in the outcome's own levels so that the
# cross-tabulation always has matching rows and columns (a bare 0/1 numeric
# vector is not a factor and is rejected by caret::confusionMatrix).
niveles <- levels(factor(TEST$delay))
referencia <- factor(TEST$delay, levels = niveles)
for (i in seq_along(umbral)) {
  # Column 2 of the probability matrix is mapped to the second level.
  prediccion.clase <- factor(
    ifelse(prediccion.prob[, 2] > umbral[i], niveles[2], niveles[1]),
    levels = niveles
  )
  # Rows = predicted class, columns = observed class; built once per cut-off.
  matriz <- table(prediccion.clase, referencia)
  # NOTE(review): which of these two cells is the "half-cost" error depends
  # on the level order of `delay` — confirm against the data.
  alpha <- matriz[1, 2]
  beta <- matriz[2, 1]
  costo[i] <- costo.alpha * alpha + costo.beta * beta
}
plot(umbral, costo, type = "h")
# Cut-off with the lowest total cost (the first one in case of ties).
umbral.optimo <- umbral[which.min(costo)]
umbral.optimo