The cleaningdataproject from renebroekhoven

title	author	date	output
Cleaning Data Project	Rene Broekhoven	Thursday, July 23, 2015	html_document

The data file for this project can be obtained at:

https://d396qusza40orc.cloudfront.net/getdata%2Fprojectfiles%2FUCI%20HAR%20Dataset.zip

This is a ZIP file, which you should unzip with your own tools (Windows or Mac) into a directory of your choosing, for example: c:/desktop/project. After unzipping, you will get a folder "getdata-projectfiles-UCI HAR Dataset". In this folder you will find "UCI HAR Dataset": this directory will be your starting point. The script can be executed from the original directory (in this example: c:/desktop/project).


## Unzipping the dataset yields the directory "getdata-projectfiles-UCI HAR Dataset".
## The working directory is then set to the subdirectory inside it:
##      "<my_unzip_dir>/getdata-projectfiles-UCI HAR Dataset/UCI HAR Dataset"

## Remembering the starting directory so it can be restored at the end ##
## (The former rm(list = ls()) was dropped: every object used below is assigned
##  before it is read, so wiping the caller's workspace is not needed.)
startwd <- getwd()


## !! Change the dots (....) to the directory where you unzipped the UCI HAR dataset.
## After unzipping, that directory should contain the folder:
##      "getdata-projectfiles-UCI HAR Dataset"
## A trailing slash is optional.
##      EXAMPLE : my_unzip_dir <- "c:/users/rene/desktop/cleaningdata/project/"
my_unzip_dir <- "...."


# setting the working directory
project_working_dir <- "getdata-projectfiles-UCI HAR Dataset/UCI HAR Dataset"

# Join the two parts with exactly one separator: trailing slashes/backslashes
# are stripped first, so "c:/project" and "c:/project/" both work.
# An empty my_unzip_dir keeps the path relative to the current directory.
unzip_root <- sub("[/\\\\]+$", "", my_unzip_dir)
data_dir <- if (nzchar(my_unzip_dir)) {
  file.path(unzip_root, project_working_dir)
} else {
  project_working_dir
}

# Fail early with an actionable message instead of setwd()'s generic error.
if (!dir.exists(data_dir)) {
  stop(
    "UCI HAR Dataset directory not found: ", data_dir,
    " -- set 'my_unzip_dir' to the directory where you unzipped the data.",
    call. = FALSE
  )
}
setwd(data_dir)

# using library(dplyr) for select(), matches(), group_by() and summarising
library(dplyr)

## QUESTION 1 ##

# Reading the measurement tables of both partitions and stacking them,
# training rows first and test rows after them.
measurement_files <- c(train = "./train/X_train.txt", test = "./test/X_test.txt")
measurement_parts <- lapply(measurement_files, read.table)

train_df <- measurement_parts[["train"]]
test_df <- measurement_parts[["test"]]

my_df <- rbind(train_df, test_df)

## QUESTION 2 ##

# Keeping only the mean and std variables.
# The measurement names are taken from the second column of features.txt.
features <- read.table("features.txt")
features <- as.character(features[, 2])
colnames(my_df) <- features

# dropping every column whose name repeats an earlier column name
# (presumably needed because select() does not accept duplicated names)
my_df <- my_df[, !duplicated(colnames(my_df)), drop = FALSE]

# selecting the right columns with a simple regular expression.
# matches() ignores case by default, so names containing "Mean" are kept as well.
# The meanFreq variables are removed again: my interpretation is that these are
# derived variables (= 'untidy').
my_df <- select(my_df, matches("mean|std")) # as requested in the project
my_df <- select(my_df, -matches("meanFreq")) # my interpretation !

# reading the activity and subject columns of train and test
test_activity  <- read.table("./test/y_test.txt")
test_subject   <- read.table("./test/subject_test.txt")
train_activity <- read.table("./train/y_train.txt")
train_subject  <- read.table("./train/subject_train.txt")

# binding train and test with activity as factor.
# The order MUST be train first, then test, because the measurements were
# stacked as rbind(train_df, test_df); the reverse order attaches every
# activity/subject to the wrong measurement row.
tot_activity     <- rbind(train_activity, test_activity)
tot_activity[, 1] <- as.factor(tot_activity[, 1])
tot_subject      <- rbind(train_subject, test_subject)

# guarding against label files that do not line up with the measurements
stopifnot(
  nrow(tot_activity) == nrow(my_df),
  nrow(tot_subject) == nrow(my_df)
)

# appending the two columns directly under their final names
my_df <- cbind(
  my_df,
  activity = tot_activity[, 1],
  subject = tot_subject[, 1]
)

## QUESTION 3 ##
# Turning the activity codes into descriptive labels from activity_labels.txt.
# Column 1 of that file holds the activity code, column 2 the label.
labels <- read.table("activity_labels.txt", stringsAsFactors = FALSE)

# Look every factor level up by its code instead of relying on the row order
# of the file, so a reordered file or an absent level cannot shift the labels.
label_index <- match(levels(my_df$activity), as.character(labels[, 1]))
if (anyNA(label_index)) {
  stop("activity_labels.txt has no label for some activity codes", call. = FALSE)
}

labels <- as.character(labels[, 2])
levels(my_df$activity) <- labels[label_index]


## QUESTION 4 ##
# Appropriately labels the data set with descriptive variable names derived
# from the features.txt names.
# "activity" and "subject" are already labeled; none of the patterns below
# occurs in them, so they pass through unchanged.
# this was a lot of work : I wonder if this was meant in the assignment ?
# I had little clue as to what the variables really meant
temp <- tolower(names(my_df))

# expanding the abbreviations
temp <- gsub("^t", "time", temp)
temp <- gsub("^f", "frequency", temp)
temp <- gsub("acc", "acceleration", temp)
temp <- gsub("gyro", "gyroscope", temp)
temp <- gsub("mag", "magnitude", temp)

# a "tbody" directly after an opening parenthesis is not at the start of the
# name, so the "^t" rule above did not expand it
temp <- gsub("\\(tbody", "timebody", temp)

# removing every remaining parenthesis, opening AND closing
# (the second removal used to repeat "\\(" and left the ")" in the names)
temp <- gsub("[()]", "", temp)

names(my_df) <- temp


# creating the table format for use in dplyr
# (as_tibble() replaces the deprecated tbl_df())
my_tb <- as_tibble(my_df)

## QUESTION 5 ##

# Averaging every measurement per activity/subject combination.
# summarise_all(mean) replaces the deprecated summarise_each(funs(mean));
# ungroup() drops the grouping that summarise() leaves on the result.
my_tidy_data <- my_tb %>%
  arrange(activity, subject) %>%
  group_by(activity, subject) %>%
  summarise_all(mean) %>%
  ungroup()

# "View" opens a separate window, so it is only called in interactive sessions
if (interactive()) {
  View(my_tidy_data)
}

# writing the file as .txt file to my_tidy_data.txt
write.table(my_tidy_data, file = "my_tidy_data.txt", row.names = FALSE)

# get back to the original directory from where we started
setwd(startwd)

renebroekhoven / cleaningdataproject Goto Github PK

cleaningdataproject's Introduction

cleaningdataproject's People

Contributors

Watchers

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent