In this assignment, I clean the training data by keeping columns without missing values and removing some columns flagged by nearZeroVar. I split the training data into a training / testing set to validate the model before running the 20 test cases of this assignment. I use Random Forest to train my model for prediction. I vary some parameters to find a better model to predict the 20 test cases to submit for this assignment. The data for this project come from this source: http://groupware.les.inf.puc-rio.br/har.
# Working directory that holds the Weight Lifting Exercise data files.
setwd("/Users/hadoop/Dropbox/R/MachineLearning")

# Read the raw training data (comma-separated file with a header row).
trainData <- read.csv("pml-training.csv", header = TRUE, sep = ",")

# Bar plot showing how the observations are distributed across the
# outcome classes.
barplot(
  table(trainData$classe),
  xlab = "classe",
  ylab = "quantity",
  main = "Quantity by classes - Weight Lifting Exercise training dataset",
  col = 2:6
)
Random Forest takes a lot of time to train a model, so I decided to skip columns that contain missing values or that nearZeroVar flags with nzv or zeroVar equal to TRUE. The data cleaning proceeds in three steps: 1. Remove columns not related to the measurements and classe.
# 1. Remove the first seven bookkeeping columns (row id, user, timestamps,
#    windows) — they are not measurements and must not leak into the model.
trainData <- trainData[, -(1:7)]

# Coerce every predictor column to numeric; the last column is the outcome
# `classe` and is left untouched.  Deriving the range from ncol() replaces
# the brittle hard-coded `1:152`, so the loop stays correct if the column
# set changes.
# NOTE(review): as.numeric() on a factor column yields the integer level
# codes, not the parsed values — presumably acceptable for these sensor
# columns; confirm if the raw file format changes.
for (i in seq_len(ncol(trainData) - 1)) {
  trainData[, i] <- as.numeric(trainData[, i])
}
2. Remove columns containing N/A values.
# 2. Remove every column that contains at least one NA.
# vapply() visits the data frame column-by-column without the whole-frame
# matrix coercion that apply(<data.frame>, 2, ...) performs (that coercion
# turns every column to character when any non-numeric column is present).
naTrainIdx <- vapply(trainData, function(x) any(is.na(x)), logical(1))
cleanData <- trainData[, which(!naTrainIdx)]
# Sanity check: the cleaned data must contain no NA at all.
sum(is.na(cleanData))
## [1] 0
3. Keep only the columns selected by nearZeroVar: which(!nsv$zeroVar &amp; !nsv$nzv).
# 3. Check nearZeroVar; remove columns flagged as zeroVar or nzv.
# library() stops with an error if the package is missing, whereas
# require() merely returns FALSE and lets the script fail later.
library(caret)
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
nsv <- nearZeroVar(cleanData, saveMetrics = TRUE)
head(nsv)
## freqRatio percentUnique zeroVar nzv
## roll_belt 1.101904 6.7781062 FALSE FALSE
## pitch_belt 1.036082 9.3772296 FALSE FALSE
## yaw_belt 1.058480 9.9734991 FALSE FALSE
## total_accel_belt 1.063160 0.1477933 FALSE FALSE
## kurtosis_roll_belt 1921.600000 2.0232392 FALSE TRUE
## kurtosis_picth_belt 600.500000 1.6155336 FALSE TRUE
# Names of the columns to keep: neither zero- nor near-zero-variance.
# Renamed from `subset`, which shadowed base::subset().
keepCols <- row.names(nsv)[which(!nsv$zeroVar & !nsv$nzv)]
library(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Keep only the selected columns; `classe` survives because nearZeroVar
# does not flag it (it is used below for partitioning).
cleanData <- select(cleanData, which(names(cleanData) %in% keepCols))
# Split into training (70%) / testing (30%) sets to validate the model
# before predicting the 20 assignment test cases.
# library() (not require()) so a missing caret install fails immediately.
library(caret)
set.seed(1000)  # reproducible partition
inTrain <- createDataPartition(cleanData$classe, p = 0.70, list = FALSE)
training <- cleanData[inTrain, ]
testing <- cleanData[-inTrain, ]
# Parallel Random Forest.
# library() (not require()) so missing packages fail loudly up front.
library(e1071)
## Loading required package: e1071
library(foreach)
## Loading required package: foreach
# Parameters I used to train the model:
#modFit <- train(training$classe ~., data = training, method = "parRF") #default
## ,trControl = trainControl(method = "cv", number = 4) #option
# ,importance = TRUE) ) #option
# To build the compiled HTML quickly, the trained model is read from disk
# instead of being retrained here.
setwd("/Users/hadoop/Dropbox/R/MachineLearning")
# NOTE(review): the file uses a .RData extension but is read with
# readRDS(), so it was presumably written with saveRDS() — confirm
# against the training script.
modResult <- readRDS("RF70CV5/modFit.RData")
modFit <- modResult$mod
show(modFit)
## Random Forest
##
## 13737 samples
## 52 predictor
## 5 classes: 'A', 'B', 'C', 'D', 'E'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 10990, 10988, 10990, 10989, 10991
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa Accuracy SD Kappa SD
## 2 0.9903908 0.9878434 0.0020693270 0.0026179406
## 27 0.9904637 0.9879363 0.0004754991 0.0006010356
## 52 0.9858774 0.9821347 0.0030071642 0.0038044322
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 27.
#Predicting testing and display confusion matrix
# Predict the held-out 30% testing set with the loaded model; `pred` is
# reused below for the error scatter plot.
pred <- predict(modFit, testing)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
# Confusion matrix of true classes vs predictions; `cm` is reused below
# for the heat-map plot and the overall.txt export.
cm <- confusionMatrix(testing$classe, pred)
show(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction A B C D E
## A 1672 2 0 0 0
## B 10 1128 1 0 0
## C 0 5 1018 3 0
## D 0 0 14 949 1
## E 0 2 0 7 1073
##
## Overall Statistics
##
## Accuracy : 0.9924
## 95% CI : (0.9898, 0.9944)
## No Information Rate : 0.2858
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9903
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: A Class: B Class: C Class: D Class: E
## Sensitivity 0.9941 0.9921 0.9855 0.9896 0.9991
## Specificity 0.9995 0.9977 0.9984 0.9970 0.9981
## Pos Pred Value 0.9988 0.9903 0.9922 0.9844 0.9917
## Neg Pred Value 0.9976 0.9981 0.9969 0.9980 0.9998
## Prevalence 0.2858 0.1932 0.1755 0.1630 0.1825
## Detection Rate 0.2841 0.1917 0.1730 0.1613 0.1823
## Detection Prevalence 0.2845 0.1935 0.1743 0.1638 0.1839
## Balanced Accuracy 0.9968 0.9949 0.9919 0.9933 0.9986
# Save the overall statistics so this run can be compared with other
# parameter settings (see the model comparison section below).
write.csv(data.frame(overall= cm$overall), file="overall.txt")
# write.csv(cm$byClass, file = paste0(modDir, "confusionByClass.csv"))
# Display the confusion matrix as a tile heat map with frequency labels.
require(ggplot2)
# NOTE(review): data.matrix() is applied to the confusionMatrix object
# itself rather than cm$table — it worked when this document was knitted;
# confirm it still extracts the count table on current caret versions.
cmFreq <- as.data.frame(as.table(data.matrix(cm)))
# Renamed from `plot`, which shadowed graphics::plot().
cmPlot <- ggplot(cmFreq) +
  geom_tile(aes(x = Var1, y = Var2, fill = Freq)) +
  scale_x_discrete(name = "Actual Classe") +
  scale_y_discrete(name = "Predicted Classe") +
  scale_fill_gradient(breaks = seq(from = 3000, to = 0, by = -500),
                      low = "pink", high = "red") +
  labs(fill = "Frequency") +
  geom_text(aes(x = Var1, y = Var2, label = Freq), size = 3)
print(cmPlot)
# save plot
#ggsave(filename="WLE/ConfusionMatrix.jpg" , plot=cmPlot)
# Display the most important variables of the final random forest,
# ranked by the sixth importance column (MeanDecreaseAccuracy).
varImp <- importance(modFit$finalModel)
tail(varImp[order(varImp[, 6]), ])
## A B C D E
## magnet_dumbbell_z 46.31477 36.00274 43.34114 32.42444 33.18925
## magnet_dumbbell_y 40.34623 39.71227 50.15887 38.70864 35.45309
## pitch_belt 21.00786 56.53161 39.03031 31.76992 26.55756
## pitch_forearm 39.17611 50.66011 65.41022 39.01924 40.97057
## yaw_belt 41.72957 39.09945 44.32857 45.02114 33.22362
## roll_belt 46.65033 54.21058 52.03500 54.81043 57.38519
## MeanDecreaseAccuracy MeanDecreaseGini
## magnet_dumbbell_z 45.63266 628.9257
## magnet_dumbbell_y 47.66969 619.1564
## pitch_belt 49.40580 644.1595
## pitch_forearm 60.51466 872.5259
## yaw_belt 69.78095 807.2493
## roll_belt 76.54471 1393.8640
# TRUE where the prediction matched the true class.
isCorrect <- pred == testing$classe
# Scatter plot of yaw_belt vs roll_belt, colored by class, with the
# misclassified observations overlaid as large crosses.
basePlot <- qplot(yaw_belt, roll_belt, col = classe, data = testing)
basePlot + geom_point(aes(x = yaw_belt, y = roll_belt, col = "Error"),
                      size = 6, shape = 4,
                      data = testing[which(!isCorrect), ])
setwd("/Users/hadoop/Dropbox/R/MachineLearning")
# Compare the overall statistics of the candidate models.  Each model
# directory holds an overall.txt written like the one above (statistic
# name in column 1, value in column 2).  The original code duplicated
# the read/round/mutate pattern four times and read parRF60CtrlN8 twice;
# a single loop reads each model once.  The final data frame is the
# same: one name column plus one rounded column per model, same order.
modelDirs <- c("parRF60CtrlN8", "parRF70", "RF50NACtrlN4", "RF60CtrlN4")
firstStats <- read.csv(file.path(modelDirs[1], "overall.txt"))
overall <- data.frame(overall = firstStats[, 1])
for (m in modelDirs) {
  stats <- read.csv(file.path(m, "overall.txt"))
  overall[[m]] <- round(stats[, 2], 5)
}
# Show the comparison, dropping the first and last statistics (rows 1
# and 7) exactly as the original display did.
overall[-c(1, 7), ]
## overall parRF60CtrlN8 parRF70 RF50NACtrlN4 RF60CtrlN4
## 2 Kappa 0.98807 0.99033 0.98749 0.98758
## 3 AccuracyLower 0.98817 0.98978 0.98795 0.98775
## 4 AccuracyUpper 0.99259 0.99442 0.99197 0.99225
## 5 AccuracyNull 0.28652 0.28581 0.28481 0.28715
## 6 AccuracyPValue 0.00000 0.00000 0.00000 0.00000
I choose the final model by comparing the overall statistics (Kappa and the accuracy confidence interval) across the candidate models above, then use it to predict the 20 assignment test cases.
setwd("/Users/hadoop/Dropbox/R/MachineLearning")
# Load the 20 assignment test cases.
testData <- read.csv("pml-testing.csv", header = TRUE, sep = ",")
# Predict their classes with the selected model.
testPred <- predict(modFit, testData)
# Write one submission file per prediction (disabled for the knitted HTML).
##pml_write_files("WLE/", testPred)