EasyD · ghost · Nov 11, 2017
diff --git a/.Rhistory b/.Rhistory
diff --git a/TitanicDataAnalysis_Video1.R b/TitanicDataAnalysis_Video1.R
@@ -29,20 +29,20 @@ train <- read.csv("train.csv", header = TRUE)
 test <- read.csv("test.csv", header = TRUE)
 
 # Add a "Survived" variable to the test set to allow for combining data sets
-test.survived <- data.frame(survived = rep("None", nrow(test)), test[,])
+test.survived <- data.frame(Survived = rep("None", nrow(test)), test[,])
 
 # Combine data sets
 data.combined <- rbind(train, test.survived)
 
 # A bit about R data types (e.g., factors)
 str(data.combined)
 
-data.combined$survived <- as.factor(data.combined$survived)
-data.combined$pclass <- as.factor(data.combined$pclass)
+data.combined$Survived <- as.factor(data.combined$Survived)
+data.combined$Pclass <- as.factor(data.combined$Pclass)
 
 
 # Take a look at gross survival rates
-table(data.combined$survived)
+table(data.combined$Survived)
 
 
 # Distribution across classes
@@ -54,47 +54,47 @@ library(ggplot2)
 
 
 # Hypothesis - Rich folks survived at a higher rate
-train$pclass <- as.factor(train$pclass)
-ggplot(train, aes(x = pclass, fill = factor(survived))) +
+train$Pclass <- as.factor(train$Pclass)
+ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
   geom_bar() +
   xlab("Pclass") +
   ylab("Total Count") +
   labs(fill = "Survived") 
 
 
 # Examine the first few names in the training data set
-head(as.character(train$name))
+head(as.character(train$Name))
 
 
 # How many unique names are there across both train & test?
-length(unique(as.character(data.combined$name)))
+length(unique(as.character(data.combined$Name)))
 
 
 # Two duplicate names, take a closer look
 # First, get the duplicate names and store them as a vector
-dup.names <- as.character(data.combined[which(duplicated(as.character(data.combined$name))), "name"])
+dup.names <- as.character(data.combined[which(duplicated(as.character(data.combined$Name))), "Name"])
 
 
 # Next, take a look at the records in the combined data set
-data.combined[which(data.combined$name %in% dup.names),]
+data.combined[which(data.combined$Name %in% dup.names),]
 
 
 # What is up with the 'Miss.' and 'Mr.' thing?
 library(stringr)
 
 
 # Any correlation with other variables (e.g., sibsp)?
-misses <- data.combined[which(str_detect(data.combined$name, "Miss.")),]
+misses <- data.combined[which(str_detect(data.combined$Name, "Miss.")),]
 misses[1:5,]
 
 
 # Hypothesis - Name titles correlate with age
-mrses <- data.combined[which(str_detect(data.combined$name, "Mrs.")), ]
+mrses <- data.combined[which(str_detect(data.combined$Name, "Mrs.")), ]
 mrses[1:5,]
 
 
 # Check out males to see if pattern continues
-males <- data.combined[which(data.combined$sex == "male"), ]
+males <- data.combined[which(data.combined$Sex == "male"), ]
 males[1:5,]
 
 
@@ -124,17 +124,17 @@ extractTitle <- function(name) {
 #        doing things
 titles <- NULL
 for (i in 1:nrow(data.combined)) {
-  titles <- c(titles, extractTitle(data.combined[i,"name"]))
+  titles <- c(titles, extractTitle(data.combined[i,"Name"]))
 }
-data.combined$title <- as.factor(titles)
+data.combined$Title <- as.factor(titles)
 
 
 # Since we only have survived labels for the train set, only use the
 # first 891 rows
-ggplot(data.combined[1:891,], aes(x = title, fill = survived)) +
+ggplot(data.combined[1:891,], aes(x = Title, fill = Survived)) +
   geom_bar() +
-  facet_wrap(~pclass) + 
+  facet_wrap(~Pclass) + 
   ggtitle("Pclass") +
   xlab("Title") +
   ylab("Total Count") +
-  labs(fill = "Survived")
+  labs(fill = "Survived")
diff --git a/TitanicDataAnalysis_Video2.R b/TitanicDataAnalysis_Video2.R
@@ -28,59 +28,59 @@ train <- read.csv("train.csv", header = TRUE)
 test <- read.csv("test.csv", header = TRUE)
 
 # Add a "Survived" variable to the test set to allow for combining data sets
-test.survived <- data.frame(survived = rep("None", nrow(test)), test[,])
+test.survived <- data.frame(Survived = rep("None", nrow(test)), test[,])
 
 # Combine data sets
 data.combined <- rbind(train, test.survived)
 
 # A bit about R data types (e.g., factors)
 str(data.combined)
 
-data.combined$survived <- as.factor(data.combined$survived)
-data.combined$pclass <- as.factor(data.combined$pclass)
+data.combined$Survived <- as.factor(data.combined$Survived)
+data.combined$Pclass <- as.factor(data.combined$Pclass)
 
 
 # Take a look at gross survival rates
-table(data.combined$survived)
+table(data.combined$Survived)
 
 
 # Distribution across classes
-table(data.combined$pclass)
+table(data.combined$Pclass)
 
 
 # Load up ggplot2 package to use for visualizations
 library(ggplot2)
 
 
 # Hypothesis - Rich folks survived at a higher rate
-train$pclass <- as.factor(train$pclass)
-ggplot(train, aes(x = pclass, fill = factor(survived))) +
+train$Pclass <- as.factor(train$Pclass)
+ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
   geom_bar() +
   xlab("Pclass") +
   ylab("Total Count") +
   labs(fill = "Survived") 
 
 
 # Examine the first few names in the training data set
-head(as.character(train$name))
+head(as.character(train$Name))
 
 # How many unique names are there across both train & test?
-length(unique(as.character(data.combined$name)))
+length(unique(as.character(data.combined$Name)))
 
 
 # Two duplicate names, take a closer look
 # First, get the duplicate names and store them as a vector
-dup.names <- as.character(data.combined[which(duplicated(as.character(data.combined$name))), "name"])
+dup.names <- as.character(data.combined[which(duplicated(as.character(data.combined$Name))), "Name"])
 
 # Next, take a look at the records in the combined data set
-data.combined[which(data.combined$name %in% dup.names),]
+data.combined[which(data.combined$Name %in% dup.names),]
 
 
 # What is up with the 'Miss.' and 'Mr.' thing?
 library(stringr)
 
 # Any correlation with other variables (e.g., sibsp)?
-misses <- data.combined[which(str_detect(data.combined$name, "Miss.")),]
+misses <- data.combined[which(str_detect(data.combined$Name, "Miss.")),]
 misses[1:5,]
 
 
@@ -89,7 +89,7 @@ mrses <- data.combined[which(str_detect(data.combined$name, "Mrs.")), ]
 mrses[1:5,]
 
 # Check out males to see if pattern continues
-males <- data.combined[which(data.combined$sex == "male"), ]
+males <- data.combined[which(data.combined$Sex == "male"), ]
 males[1:5,]
 
 
@@ -115,28 +115,28 @@ extractTitle <- function(name) {
 
 titles <- NULL
 for (i in 1:nrow(data.combined)) {
-  titles <- c(titles, extractTitle(data.combined[i,"name"]))
+  titles <- c(titles, extractTitle(data.combined[i,"Name"]))
 }
-data.combined$title <- as.factor(titles)
+data.combined$Title <- as.factor(titles)
 
 # Since we only have survived labels for the train set, only use the
 # first 891 rows
-ggplot(data.combined[1:891,], aes(x = title, fill = survived)) +
+ggplot(data.combined[1:891,], aes(x = Title, fill = Survived)) +
   geom_bar() +
-  facet_wrap(~pclass) + 
+  facet_wrap(~Pclass) + 
   ggtitle("Pclass") +
   xlab("Title") +
   ylab("Total Count") +
   labs(fill = "Survived")
 
 # What's the distribution of females to males across train & test?
-table(data.combined$sex)
+table(data.combined$Sex)
 
 
 # Visualize the 3-way relationship of sex, pclass, and survival, compare to analysis of title
-ggplot(data.combined[1:891,], aes(x = sex, fill = survived)) +
+ggplot(data.combined[1:891,], aes(x = Sex, fill = Survived)) +
   geom_bar() +
-  facet_wrap(~pclass) + 
+  facet_wrap(~Pclass) + 
   ggtitle("Pclass") +
   xlab("Sex") +
   ylab("Total Count") +
@@ -145,28 +145,28 @@ ggplot(data.combined[1:891,], aes(x = sex, fill = survived)) +
 
 # OK, age and sex seem pretty important as derived from analysis of title, let's take a closer 
 # look at the distributions of age over entire data set
-summary(data.combined$age)
-summary(data.combined[1:891,"age"])
+summary(data.combined$Age)
+summary(data.combined[1:891,"Age"])
 
 # Just to be thorough, take a look at survival rates broken out by sex, pclass, and age
-ggplot(data.combined[1:891,], aes(x = age, fill = survived)) +
-  facet_wrap(~sex + pclass) +
+ggplot(data.combined[1:891,], aes(x = Age, fill = Survived)) +
+  facet_wrap(~Sex + Pclass) +
   geom_histogram(binwidth = 10) +
   xlab("Age") +
   ylab("Total Count")
 
 
 # Validate that "Master." is a good proxy for male children
-boys <- data.combined[which(data.combined$title == "Master."),]
-summary(boys$age)
+boys <- data.combined[which(data.combined$Title == "Master."),]
+summary(boys$Age)
 
 
 # We know that "Miss." is more complicated, let's examine further
-misses <- data.combined[which(data.combined$title == "Miss."),]
-summary(misses$age)
+misses <- data.combined[which(data.combined$Title == "Miss."),]
+summary(misses$Age)
 
-ggplot(misses[misses$survived != "None",], aes(x = age, fill = survived)) +
-  facet_wrap(~pclass) +
+ggplot(misses[misses$Survived != "None",], aes(x = Age, fill = Survived)) +
+  facet_wrap(~Pclass) +
   geom_histogram(binwidth = 5) +
   ggtitle("Age for 'Miss.' by Pclass") + 
   xlab("Age") +
@@ -175,26 +175,26 @@ ggplot(misses[misses$survived != "None",], aes(x = age, fill = survived)) +
 
 # OK, appears female children may have different survival rate, 
 # could be a candidate for feature engineering later
-misses.alone <- misses[which(misses$sibsp == 0 & misses$parch == 0),]
-summary(misses.alone$age)
-length(which(misses.alone$age <= 14.5))
+misses.alone <- misses[which(misses$SibSp == 0 & misses$Parch == 0),]
+summary(misses.alone$Age)
+length(which(misses.alone$Age <= 14.5))
 
 
 # Move on to the sibsp variable, summarize the variable
-summary(data.combined$sibsp)
+summary(data.combined$SibSp)
 
 
 # Can we treat as a factor?
-length(unique(data.combined$sibsp))
+length(unique(data.combined$SibSp))
 
 
-data.combined$sibsp <- as.factor(data.combined$sibsp)
+data.combined$SibSp <- as.factor(data.combined$SibSp)
 
 
 # We believe title is predictive. Visualize survival rates by sibsp, pclass, and title
-ggplot(data.combined[1:891,], aes(x = sibsp, fill = survived)) +
+ggplot(data.combined[1:891,], aes(x = SibSp, fill = Survived)) +
   geom_bar() +
-  facet_wrap(~pclass + title) + 
+  facet_wrap(~Pclass + Title) + 
   ggtitle("Pclass, Title") +
   xlab("SibSp") +
   ylab("Total Count") +
@@ -203,10 +203,10 @@ ggplot(data.combined[1:891,], aes(x = sibsp, fill = survived)) +
 
 
 # Treat the parch variable as a factor and visualize
-data.combined$parch <- as.factor(data.combined$parch)
-ggplot(data.combined[1:891,], aes(x = parch, fill = survived)) +
+data.combined$Parch <- as.factor(data.combined$Parch)
+ggplot(data.combined[1:891,], aes(x = Parch, fill = Survived)) +
   geom_bar() +
-  facet_wrap(~pclass + title) + 
+  facet_wrap(~Pclass + Title) + 
   ggtitle("Pclass, Title") +
   xlab("ParCh") +
   ylab("Total Count") +
@@ -215,15 +215,15 @@ ggplot(data.combined[1:891,], aes(x = parch, fill = survived)) +
 
 
 # Let's try some feature engineering. What about creating a family size feature?
-temp.sibsp <- c(train$sibsp, test$sibsp)
-temp.parch <- c(train$parch, test$parch)
+temp.sibsp <- c(train$SibSp, test$SibSp)
+temp.parch <- c(train$Parch, test$Parch)
 data.combined$family.size <- as.factor(temp.sibsp + temp.parch + 1)
 
 
 # Visualize it to see if it is predictive
-ggplot(data.combined[1:891,], aes(x = family.size, fill = survived)) +
+ggplot(data.combined[1:891,], aes(x = family.size, fill = Survived)) +
   geom_bar() +
-  facet_wrap(~pclass + title) + 
+  facet_wrap(~Pclass + Title) + 
   ggtitle("Pclass, Title") +
   xlab("family.size") +
   ylab("Total Count") +