Skip to content

adapt to 11.2017 Titanic competition #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
986 changes: 493 additions & 493 deletions .Rhistory

Large diffs are not rendered by default.

36 changes: 18 additions & 18 deletions TitanicDataAnalysis_Video1.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,20 @@ train <- read.csv("train.csv", header = TRUE)
test <- read.csv("test.csv", header = TRUE)

# Add a "Survived" variable to the test set to allow for combining data sets
test.survived <- data.frame(survived = rep("None", nrow(test)), test[,])
test.survived <- data.frame(Survived = rep("None", nrow(test)), test[,])

# Combine data sets
data.combined <- rbind(train, test.survived)

# A bit about R data types (e.g., factors)
str(data.combined)

data.combined$survived <- as.factor(data.combined$survived)
data.combined$pclass <- as.factor(data.combined$pclass)
data.combined$Survived <- as.factor(data.combined$Survived)
data.combined$Pclass <- as.factor(data.combined$Pclass)


# Take a look at gross survival rates
table(data.combined$survived)
table(data.combined$Survived)


# Distribution across classes
Expand All @@ -54,47 +54,47 @@ library(ggplot2)


# Hypothesis - Rich folks survived at a higher rate
train$pclass <- as.factor(train$pclass)
ggplot(train, aes(x = pclass, fill = factor(survived))) +
train$Pclass <- as.factor(train$Pclass)
ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
geom_bar() +
xlab("Pclass") +
ylab("Total Count") +
labs(fill = "Survived")


# Examine the first few names in the training data set
head(as.character(train$name))
head(as.character(train$Name))


# How many unique names are there across both train & test?
length(unique(as.character(data.combined$name)))
length(unique(as.character(data.combined$Name)))


# Two duplicate names, take a closer look
# First, get the duplicate names and store them as a vector
dup.names <- as.character(data.combined[which(duplicated(as.character(data.combined$name))), "name"])
dup.names <- as.character(data.combined[which(duplicated(as.character(data.combined$Name))), "Name"])


# Next, take a look at the records in the combined data set
data.combined[which(data.combined$name %in% dup.names),]
data.combined[which(data.combined$Name %in% dup.names),]


# What is up with the 'Miss.' and 'Mr.' thing?
library(stringr)


# Any correlation with other variables (e.g., sibsp)?
misses <- data.combined[which(str_detect(data.combined$name, "Miss.")),]
misses <- data.combined[which(str_detect(data.combined$Name, "Miss.")),]
misses[1:5,]


# Hypothesis - Name titles correlate with age
mrses <- data.combined[which(str_detect(data.combined$name, "Mrs.")), ]
mrses <- data.combined[which(str_detect(data.combined$Name, "Mrs.")), ]
mrses[1:5,]


# Check out males to see if pattern continues
males <- data.combined[which(data.combined$sex == "male"), ]
males <- data.combined[which(data.combined$Sex == "male"), ]
males[1:5,]


Expand Down Expand Up @@ -124,17 +124,17 @@ extractTitle <- function(name) {
# doing things
titles <- NULL
for (i in 1:nrow(data.combined)) {
titles <- c(titles, extractTitle(data.combined[i,"name"]))
titles <- c(titles, extractTitle(data.combined[i,"Name"]))
}
data.combined$title <- as.factor(titles)
data.combined$Title <- as.factor(titles)


# Since we only have survived labels for the train set, only use the
# first 891 rows
ggplot(data.combined[1:891,], aes(x = title, fill = survived)) +
ggplot(data.combined[1:891,], aes(x = Title, fill = Survived)) +
geom_bar() +
facet_wrap(~pclass) +
facet_wrap(~Pclass) +
ggtitle("Pclass") +
xlab("Title") +
ylab("Total Count") +
labs(fill = "Survived")
labs(fill = "Survived")
90 changes: 45 additions & 45 deletions TitanicDataAnalysis_Video2.R
Original file line number Diff line number Diff line change
Expand Up @@ -28,59 +28,59 @@ train <- read.csv("train.csv", header = TRUE)
test <- read.csv("test.csv", header = TRUE)

# Add a "Survived" variable to the test set to allow for combining data sets
test.survived <- data.frame(survived = rep("None", nrow(test)), test[,])
test.survived <- data.frame(Survived = rep("None", nrow(test)), test[,])

# Combine data sets
data.combined <- rbind(train, test.survived)

# A bit about R data types (e.g., factors)
str(data.combined)

data.combined$survived <- as.factor(data.combined$survived)
data.combined$pclass <- as.factor(data.combined$pclass)
data.combined$Survived <- as.factor(data.combined$Survived)
data.combined$Pclass <- as.factor(data.combined$Pclass)


# Take a look at gross survival rates
table(data.combined$survived)
table(data.combined$Survived)


# Distribution across classes
table(data.combined$pclass)
table(data.combined$Pclass)


# Load up ggplot2 package to use for visualizations
library(ggplot2)


# Hypothesis - Rich folks survived at a higher rate
train$pclass <- as.factor(train$pclass)
ggplot(train, aes(x = pclass, fill = factor(survived))) +
train$Pclass <- as.factor(train$Pclass)
ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
geom_bar() +
xlab("Pclass") +
ylab("Total Count") +
labs(fill = "Survived")


# Examine the first few names in the training data set
head(as.character(train$name))
head(as.character(train$Name))

# How many unique names are there across both train & test?
length(unique(as.character(data.combined$name)))
length(unique(as.character(data.combined$Name)))


# Two duplicate names, take a closer look
# First, get the duplicate names and store them as a vector
dup.names <- as.character(data.combined[which(duplicated(as.character(data.combined$name))), "name"])
dup.names <- as.character(data.combined[which(duplicated(as.character(data.combined$Name))), "Name"])

# Next, take a look at the records in the combined data set
data.combined[which(data.combined$name %in% dup.names),]
data.combined[which(data.combined$Name %in% dup.names),]


# What is up with the 'Miss.' and 'Mr.' thing?
library(stringr)

# Any correlation with other variables (e.g., sibsp)?
misses <- data.combined[which(str_detect(data.combined$name, "Miss.")),]
misses <- data.combined[which(str_detect(data.combined$Name, "Miss.")),]
misses[1:5,]


Expand All @@ -89,7 +89,7 @@ mrses <- data.combined[which(str_detect(data.combined$name, "Mrs.")), ]
mrses[1:5,]

# Check out males to see if pattern continues
males <- data.combined[which(data.combined$sex == "male"), ]
males <- data.combined[which(data.combined$Sex == "male"), ]
males[1:5,]


Expand All @@ -115,28 +115,28 @@ extractTitle <- function(name) {

titles <- NULL
for (i in 1:nrow(data.combined)) {
titles <- c(titles, extractTitle(data.combined[i,"name"]))
titles <- c(titles, extractTitle(data.combined[i,"Name"]))
}
data.combined$title <- as.factor(titles)
data.combined$Title <- as.factor(titles)

# Since we only have survived labels for the train set, only use the
# first 891 rows
ggplot(data.combined[1:891,], aes(x = title, fill = survived)) +
ggplot(data.combined[1:891,], aes(x = Title, fill = Survived)) +
geom_bar() +
facet_wrap(~pclass) +
facet_wrap(~Pclass) +
ggtitle("Pclass") +
xlab("Title") +
ylab("Total Count") +
labs(fill = "Survived")

# What's the distribution of females to males across train & test?
table(data.combined$sex)
table(data.combined$Sex)


# Visualize the 3-way relationship of sex, pclass, and survival, compare to analysis of title
ggplot(data.combined[1:891,], aes(x = sex, fill = survived)) +
ggplot(data.combined[1:891,], aes(x = Sex, fill = Survived)) +
geom_bar() +
facet_wrap(~pclass) +
facet_wrap(~Pclass) +
ggtitle("Pclass") +
xlab("Sex") +
ylab("Total Count") +
Expand All @@ -145,28 +145,28 @@ ggplot(data.combined[1:891,], aes(x = sex, fill = survived)) +

# OK, age and sex seem pretty important as derived from analysis of title, let's take a closer
# look at the distributions of age over entire data set
summary(data.combined$age)
summary(data.combined[1:891,"age"])
summary(data.combined$Age)
summary(data.combined[1:891,"Age"])

# Just to be thorough, take a look at survival rates broken out by sex, pclass, and age
ggplot(data.combined[1:891,], aes(x = age, fill = survived)) +
facet_wrap(~sex + pclass) +
ggplot(data.combined[1:891,], aes(x = Age, fill = Survived)) +
facet_wrap(~Sex + Pclass) +
geom_histogram(binwidth = 10) +
xlab("Age") +
ylab("Total Count")


# Validate that "Master." is a good proxy for male children
boys <- data.combined[which(data.combined$title == "Master."),]
summary(boys$age)
boys <- data.combined[which(data.combined$Title == "Master."),]
summary(boys$Age)


# We know that "Miss." is more complicated, let's examine further
misses <- data.combined[which(data.combined$title == "Miss."),]
summary(misses$age)
misses <- data.combined[which(data.combined$Title == "Miss."),]
summary(misses$Age)

ggplot(misses[misses$survived != "None",], aes(x = age, fill = survived)) +
facet_wrap(~pclass) +
ggplot(misses[misses$Survived != "None",], aes(x = Age, fill = Survived)) +
facet_wrap(~Pclass) +
geom_histogram(binwidth = 5) +
ggtitle("Age for 'Miss.' by Pclass") +
xlab("Age") +
Expand All @@ -175,26 +175,26 @@ ggplot(misses[misses$survived != "None",], aes(x = age, fill = survived)) +

# OK, appears female children may have different survival rate,
# could be a candidate for feature engineering later
misses.alone <- misses[which(misses$sibsp == 0 & misses$parch == 0),]
summary(misses.alone$age)
length(which(misses.alone$age <= 14.5))
misses.alone <- misses[which(misses$SibSp == 0 & misses$Parch == 0),]
summary(misses.alone$Age)
length(which(misses.alone$Age <= 14.5))


# Move on to the sibsp variable, summarize the variable
summary(data.combined$sibsp)
summary(data.combined$SibSp)


# Can we treat as a factor?
length(unique(data.combined$sibsp))
length(unique(data.combined$SibSp))


data.combined$sibsp <- as.factor(data.combined$sibsp)
data.combined$SibSp <- as.factor(data.combined$SibSp)


# We believe title is predictive. Visualize survival rates by sibsp, pclass, and title
ggplot(data.combined[1:891,], aes(x = sibsp, fill = survived)) +
ggplot(data.combined[1:891,], aes(x = SibSp, fill = Survived)) +
geom_bar() +
facet_wrap(~pclass + title) +
facet_wrap(~Pclass + Title) +
ggtitle("Pclass, Title") +
xlab("SibSp") +
ylab("Total Count") +
Expand All @@ -203,10 +203,10 @@ ggplot(data.combined[1:891,], aes(x = sibsp, fill = survived)) +


# Treat the parch variable as a factor and visualize
data.combined$parch <- as.factor(data.combined$parch)
ggplot(data.combined[1:891,], aes(x = parch, fill = survived)) +
data.combined$Parch <- as.factor(data.combined$Parch)
ggplot(data.combined[1:891,], aes(x = Parch, fill = Survived)) +
geom_bar() +
facet_wrap(~pclass + title) +
facet_wrap(~Pclass + Title) +
ggtitle("Pclass, Title") +
xlab("ParCh") +
ylab("Total Count") +
Expand All @@ -215,15 +215,15 @@ ggplot(data.combined[1:891,], aes(x = parch, fill = survived)) +


# Let's try some feature engineering. What about creating a family size feature?
temp.sibsp <- c(train$sibsp, test$sibsp)
temp.parch <- c(train$parch, test$parch)
temp.sibsp <- c(train$SibSp, test$SibSp)
temp.parch <- c(train$Parch, test$Parch)
data.combined$family.size <- as.factor(temp.sibsp + temp.parch + 1)


# Visualize it to see if it is predictive
ggplot(data.combined[1:891,], aes(x = family.size, fill = survived)) +
ggplot(data.combined[1:891,], aes(x = family.size, fill = Survived)) +
geom_bar() +
facet_wrap(~pclass + title) +
facet_wrap(~Pclass + Title) +
ggtitle("Pclass, Title") +
xlab("family.size") +
ylab("Total Count") +
Expand Down
Loading