In this assignment, I clean the training data by keeping only the columns without missing values and by removing near-zero-variance columns with nearZeroVar. I split the training data into a training and a testing set to validate the model before predicting the 20 test cases of the assignment. I use Random Forest to train the prediction model, and I vary some parameters to find a better model for the 20 test cases submitted for this assignment. The data for this project come from this source: http://groupware.les.inf.puc-rio.br/har.

Read training data

setwd("/Users/hadoop/Dropbox/R/MachineLearning")
trainData <- read.csv("pml-training.csv", 
                      header = TRUE, sep=",")
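read.csv assumes the CSV is already on disk. If it is not, it can be downloaded first; a minimal sketch, assuming the course's CloudFront URL is still valid:

# download the training data once if it is not already present (URL is an assumption)
trainUrl <- "https://d396qusza40orc.cloudfront.net/predmachlearn/pml-training.csv"
if (!file.exists("pml-training.csv")) {
    download.file(trainUrl, destfile = "pml-training.csv", method = "curl")
}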

# bar plot showing the distribution of classe
barplot(table(trainData$classe), 
        xlab="classe", 
        ylab="quantity", 
        main="Quantity by classes - Weight Lifting Exercise training dataset",
        col=c(2:6))

Clean the data

Random Forest takes a long time to train, so I decided to skip the columns that have missing values or that nearZeroVar flags as zeroVar or nzv. The cleaning proceeds in three steps:

1. remove columns not related to measurement and classe

# remove columns not related to measurement and classe
trainData <- trainData[,-(1:7)]
# convert each column to numeric, except the last one (classe)
for(i in c(1:152)) {
    trainData[,i] <- as.numeric(trainData[,i])
}
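The hard-coded 152 is ncol(trainData) - 1, i.e. every column except the trailing classe; a more defensive version of the same loop, as a sketch:

# convert every column except the last (classe) to numeric, without a magic number
for (i in seq_len(ncol(trainData) - 1)) {
    trainData[, i] <- as.numeric(trainData[, i])
}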
2. remove columns that contain NA values
# remove columns that contain any NA
naTrainIdx <-apply(trainData, 2, function(x) any(is.na(x)))
cleanData <- trainData[, which(!naTrainIdx)]
# check every element is not N/A
sum(is.na(cleanData))
## [1] 0
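For the record, the number of columns dropped by this step is just the number of TRUE entries in naTrainIdx:

# how many columns were removed because they contained NAs
sum(naTrainIdx)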
3. keep only the columns for which nearZeroVar reports !nsv$zeroVar & !nsv$nzv
# check nearZeroVar and drop columns flagged as zeroVar or nzv
require(caret)
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
nsv <- nearZeroVar(cleanData, saveMetrics= TRUE)
head(nsv)
##                       freqRatio percentUnique zeroVar   nzv
## roll_belt              1.101904     6.7781062   FALSE FALSE
## pitch_belt             1.036082     9.3772296   FALSE FALSE
## yaw_belt               1.058480     9.9734991   FALSE FALSE
## total_accel_belt       1.063160     0.1477933   FALSE FALSE
## kurtosis_roll_belt  1921.600000     2.0232392   FALSE  TRUE
## kurtosis_picth_belt  600.500000     1.6155336   FALSE  TRUE
# names of the columns to keep (note: this `subset` masks base::subset)
subset <- row.names(nsv)[which(!nsv$zeroVar & !nsv$nzv)]
  
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
cleanData <- select(cleanData, which(names(cleanData) %in% subset))
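The same selection also works in base R without dplyr; a one-line sketch:

# base-R equivalent of the dplyr select above
cleanData <- cleanData[, names(cleanData) %in% subset]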

Split the data into training and testing sets

# split into training (70%) / testing (30%) sets to validate the model
require(caret)
set.seed(1000)
inTrain <- createDataPartition(cleanData$classe, p = 0.70, list = FALSE)
training <- cleanData[inTrain,]
testing <- cleanData[-inTrain,]
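createDataPartition samples within each level of classe, so both sets keep the class balance of the full data; a quick check:

# class proportions should be nearly identical in the two splits
round(prop.table(table(training$classe)), 3)
round(prop.table(table(testing$classe)), 3)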

Train the model on the training data

#Parallel Random Forest 
require(e1071)
## Loading required package: e1071
require(foreach)
## Loading required package: foreach
# parameters I used to train the model
# modFit <- train(classe ~ ., data = training, method = "parRF"          # default
#                 , trControl = trainControl(method = "cv", number = 4)  # option
#                 , importance = TRUE)                                   # option
# to build the compiled HTML quickly, I load the trained model from a file instead of re-training
setwd("/Users/hadoop/Dropbox/R/MachineLearning")
modResult <- readRDS("RF70CV5/modFit.RData")
modFit <- modResult$mod
show(modFit)
## Random Forest 
## 
## 13737 samples
##    52 predictor
##     5 classes: 'A', 'B', 'C', 'D', 'E' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 10990, 10988, 10990, 10989, 10991 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa      Accuracy SD   Kappa SD    
##    2    0.9903908  0.9878434  0.0020693270  0.0026179406
##   27    0.9904637  0.9879363  0.0004754991  0.0006010356
##   52    0.9858774  0.9821347  0.0030071642  0.0038044322
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was mtry = 27.
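For reference, here is a runnable sketch of a call that could have produced the 5-fold model above; the doParallel backend and the saved list structure are my assumptions:

# register a parallel backend so method = "parRF" can use several cores
require(doParallel)
cl <- makeCluster(max(1, detectCores() - 1))
registerDoParallel(cl)
set.seed(1000)
modFit <- train(classe ~ ., data = training, method = "parRF",
                trControl = trainControl(method = "cv", number = 5),
                importance = TRUE)
stopCluster(cl)
# save the fit in the list layout expected by the readRDS call above
saveRDS(list(mod = modFit), "RF70CV5/modFit.RData")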

Predict testing cases

# predict on the testing set and display the confusion matrix
pred <- predict(modFit, testing)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
# note: confusionMatrix(data, reference) treats its first argument as the
# prediction, so the rows labelled 'Prediction' below are the true classes
cm <- confusionMatrix(testing$classe, pred)
show(cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1672    2    0    0    0
##          B   10 1128    1    0    0
##          C    0    5 1018    3    0
##          D    0    0   14  949    1
##          E    0    2    0    7 1073
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9924          
##                  95% CI : (0.9898, 0.9944)
##     No Information Rate : 0.2858          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9903          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9941   0.9921   0.9855   0.9896   0.9991
## Specificity            0.9995   0.9977   0.9984   0.9970   0.9981
## Pos Pred Value         0.9988   0.9903   0.9922   0.9844   0.9917
## Neg Pred Value         0.9976   0.9981   0.9969   0.9980   0.9998
## Prevalence             0.2858   0.1932   0.1755   0.1630   0.1825
## Detection Rate         0.2841   0.1917   0.1730   0.1613   0.1823
## Detection Prevalence   0.2845   0.1935   0.1743   0.1638   0.1839
## Balanced Accuracy      0.9968   0.9949   0.9919   0.9933   0.9986
write.csv(data.frame(overall= cm$overall), file="overall.txt")
# write.csv(cm$byClass, file = paste0(modDir, "confusionByClass.csv"))
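The expected out-of-sample error is simply one minus the validation accuracy shown above:

# estimated out-of-sample error on the held-out 30%: 1 - 0.9924, about 0.8%
1 - cm$overall["Accuracy"]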

Plot the frequency of predicted vs. actual classes

require(ggplot2)
# caret provides as.matrix for confusionMatrix objects, so this extracts the table
input.matrix <- data.matrix(cm)
confusion <- as.data.frame(as.table(input.matrix))
plot <- ggplot(confusion)
plot <- plot + geom_tile(aes(x = Var1, y = Var2, fill = Freq)) +
    scale_x_discrete(name = "Actual Classe") +
    scale_y_discrete(name = "Predicted Classe") +
    scale_fill_gradient(breaks = seq(from = 3000, to = 0, by = -500),
                        low = "pink", high = "red") +
    labs(fill = "Frequency") +
    geom_text(aes(x = Var1, y = Var2, label = Freq), size = 3)
print(plot)

# save plot
#ggsave(filename="WLE/ConfusionMatrix.jpg" , plot=plot)

Plot the misclassified points

# display the most important variables, ordered by MeanDecreaseAccuracy (column 6)
imp_vals <- importance(modFit$finalModel)
tail(imp_vals[order(imp_vals[, 6]), ])
##                          A        B        C        D        E
## magnet_dumbbell_z 46.31477 36.00274 43.34114 32.42444 33.18925
## magnet_dumbbell_y 40.34623 39.71227 50.15887 38.70864 35.45309
## pitch_belt        21.00786 56.53161 39.03031 31.76992 26.55756
## pitch_forearm     39.17611 50.66011 65.41022 39.01924 40.97057
## yaw_belt          41.72957 39.09945 44.32857 45.02114 33.22362
## roll_belt         46.65033 54.21058 52.03500 54.81043 57.38519
##                   MeanDecreaseAccuracy MeanDecreaseGini
## magnet_dumbbell_z             45.63266         628.9257
## magnet_dumbbell_y             47.66969         619.1564
## pitch_belt                    49.40580         644.1595
## pitch_forearm                 60.51466         872.5259
## yaw_belt                      69.78095         807.2493
## roll_belt                     76.54471        1393.8640
predRight <- pred == testing$classe
# plot yaw_belt vs. roll_belt, marking the misclassified points with an X
p <- qplot(yaw_belt, roll_belt, col = classe, data = testing)
p + geom_point(aes(x = yaw_belt, y = roll_belt, col = "Error"), size = 6, shape = 4,
               data = testing[which(!predRight), ])
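randomForest also ships a built-in importance plot that shows the same ranking at a glance; a sketch:

# dot chart of the ten most important variables in the final forest
varImpPlot(modFit$finalModel, n.var = 10, main = "Top 10 variables")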

Compare the overall accuracy of the trained models

setwd("/Users/hadoop/Dropbox/R/MachineLearning")
overall <- read.csv("parRF60CtrlN8/overall.txt")
names(overall) <- c("overall", "parRF60CtrlN8")
overall[c(2:6), 2] <- round(overall[c(2:6), 2] ,5)

acc <- read.csv("parRF70/overall.txt")
overall <- mutate(overall, parRF70 = round(acc[,2],5))

acc1 <- read.csv("RF50NACtrlN4/overall.txt")
overall <- mutate(overall, RF50NACtrlN4 = round(acc1[,2],5))

acc2 <- read.csv("parRF60CtrlN8/overall.txt")
overall <- mutate(overall, parRF60CtrlN8 = round(acc2[,2],5))

acc3 <- read.csv("RF60CtrlN4/overall.txt")
overall <- mutate(overall, RF60CtrlN4 = round(acc3[,2],5))
overall[-c(1,7),]
##          overall parRF60CtrlN8 parRF70 RF50NACtrlN4 RF60CtrlN4
## 2          Kappa       0.98807 0.99033      0.98749    0.98758
## 3  AccuracyLower       0.98817 0.98978      0.98795    0.98775
## 4  AccuracyUpper       0.99259 0.99442      0.99197    0.99225
## 5   AccuracyNull       0.28652 0.28581      0.28481    0.28715
## 6 AccuracyPValue       0.00000 0.00000      0.00000    0.00000
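The repeated read/mutate blocks above could be collapsed into a loop over the model folders; a sketch using the same file layout:

# build the comparison table in one pass over the model directories
dirs <- c("parRF60CtrlN8", "parRF70", "RF50NACtrlN4", "RF60CtrlN4")
overall <- read.csv(file.path(dirs[1], "overall.txt"))[, 1, drop = FALSE]
names(overall) <- "overall"
for (d in dirs) {
    overall[[d]] <- round(read.csv(file.path(d, "overall.txt"))[, 2], 5)
}
overall[-c(1, 7), ]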

Predict 20 new testing cases

I choose the model by comparing the overall accuracies of the trained models above.

setwd("/Users/hadoop/Dropbox/R/MachineLearning")
# read new test data
testData <-  read.csv("pml-testing.csv", header = TRUE, sep=",")
# predict new data with model 
testPred <- predict(modFit, testData)
# create output for assignment
##pml_write_files("WLE/", testPred) 
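pml_write_files is not defined in this report; a hypothetical helper consistent with the call above (an output directory plus the predictions), modeled on the course's standard submission script:

# hypothetical helper: one problem_id_N.txt file per prediction
pml_write_files <- function(dir, x) {
    for (i in seq_along(x)) {
        write.table(x[i], file = file.path(dir, paste0("problem_id_", i, ".txt")),
                    quote = FALSE, row.names = FALSE, col.names = FALSE)
    }
}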

Reference

  1. Velloso, E.; Bulling, A.; Gellersen, H.; Ugulino, W.; Fuks, H. Qualitative Activity Recognition of Weight Lifting Exercises. Proceedings of 4th International Conference in Cooperation with SIGCHI (Augmented Human '13). Stuttgart, Germany: ACM SIGCHI, 2013. http://groupware.les.inf.puc-rio.br/har#wle_paper_section