diff --git a/EduardoHidalgo/algas/00-load.R b/EduardoHidalgo/algas/00-load.R new file mode 100644 index 0000000..a68f80d --- /dev/null +++ b/EduardoHidalgo/algas/00-load.R @@ -0,0 +1 @@ +algas_data <- load() diff --git a/EduardoHidalgo/algas/01-prepare.R b/EduardoHidalgo/algas/01-prepare.R new file mode 100644 index 0000000..f8cddc2 --- /dev/null +++ b/EduardoHidalgo/algas/01-prepare.R @@ -0,0 +1,15 @@ +colnames(algas_data) <- algas_colnames + +#german_data$good_loan <- as.factor( +# ifelse( +# german_data$good_loan == 1, +# 'GoodLoan', +# 'BadLoan' +# ) +#) + +#german_data <- german_data %>% +# mutate_all(funs(german_decode)) + +#german_data <- german_data %>% +# mutate_at(c(1,3,4,6,7,9,10,12,14,15,17,19,20),funs(as.factor)) diff --git a/EduardoHidalgo/algas/02-clean.R b/EduardoHidalgo/algas/02-clean.R new file mode 100644 index 0000000..a3e8c02 --- /dev/null +++ b/EduardoHidalgo/algas/02-clean.R @@ -0,0 +1,13 @@ +colnames(algas_data) <- algas_clean_colnames(algas_colnames) + +problematic_rows <- problems(algas_data)$row + +algas_data[problematic_rows,] <- algas_data %>% + slice(problematic_rows) %>% + unite(col="all", -seq(1:6), sep = "/", remove=TRUE) %>% + extract(all, into=c("NO3", "NH4", "resto"), regex="([0-9]*.[0-9]{5})([0-9]*.[0-9]*)/(.*)/NA", remove=TRUE) %>% + separate(resto, into=names(algas_data)[9:18], sep="/", remove=TRUE) + +algas_data <- algas_data %>% mutate_at(c(2,3), funs(algas_clean_data)) + +algas_data <- readr::type_convert(algas_data) diff --git a/EduardoHidalgo/algas/algas.R b/EduardoHidalgo/algas/algas.R new file mode 100644 index 0000000..9822576 --- /dev/null +++ b/EduardoHidalgo/algas/algas.R @@ -0,0 +1,79 @@ +setwd("~/GitHub/MineriaYAnalisisDeDatos/algas") + +library(readr) +library(stringr) +library(tidyr) +library(dplyr) +library(ggplot2) +library(ggthemes) + +source("metadata.R") +source("utils.R") +source("00-load.R") +source("01-prepare.R") +source("02-clean.R") + +summary(algas_data) + +glimpse(algas_data) + +problems(algas_data) + + +library(mice) +md.pattern(algas_data) + +library("VIM") +aggr(algas_data, prop=FALSE, numbers=TRUE) + +matrixplot(algas_data) + + +x <- as.data.frame(abs(is.na(algas_data))) # df es un data.frame + +head(algas_data) + +head(x) + +# Extrae las variables que tienen algunas celdas con NAs +y <- x[which(sapply(x, sd) > 0)] + +# Da la correación un valor alto positivo significa que desaparecen juntas. +cor(y,y) + +summary(algas_data[-grep(colnames(algas_data),pattern = "^a[1-9]")]) + +algas_con_NAs <- algas_data[!complete.cases(algas_data),] + +algas_con_NAs[c('max_ph', 'min_o2', 'cl', 'no3', 'nh4', 'opo4', 'po4', 'chla')] %>% + print(n = 33) + + +algas_data %>% + select(-c(1:3)) %>% + cor(use="complete.obs") %>% + symnum() + +ggplot(data=algas_data) + + aes(x=opo4, y=po4) + + geom_point(shape=1) + # Usamos una bolita para los puntos + geom_smooth(method=lm, se=FALSE) + + theme_hc() +# Mostramos la linea de la regresión y no mostramos la región de confianza + + +### + + +c2 <- ggplot(algas_data, aes(max_ph)) + + geom_histogram(aes(y = ..density..), binwidth=1) + + geom_density()+ + xlab("PH Maximo") + ylab("") + ggtitle("Distribucion Empirica del PH Maximo Por Estacion")+facet_wrap(~ season, nrow = 3) + +p2 <- ggplot(father.son, aes(fheight)) + + geom_histogram(aes(y = ..density..), binwidth=1) + + geom_density() + xlim(58, 80) + ylim(0, 0.16) + + xlab("ht (inches)") + ylab("") + + ggtitle("Fathers") + +grid.arrange(c2, p2, nrow = 1) \ No newline at end of file diff --git a/EduardoHidalgo/algas/algas.rds b/EduardoHidalgo/algas/algas.rds new file mode 100644 index 0000000..33fca75 Binary files /dev/null and b/EduardoHidalgo/algas/algas.rds differ diff --git a/EduardoHidalgo/algas/metadata.R b/EduardoHidalgo/algas/metadata.R new file mode 100644 index 0000000..49f5a5f --- /dev/null +++ b/EduardoHidalgo/algas/metadata.R @@ -0,0 +1,81 @@ +.## German credit --------------------------------------------------------------- + +## Nombres de columnas --------------------------------------------------------- +german_colnames <- c('Status of existing checking account', + 'Duration in month', + 'Credit history', + 'Purpose', + 'Credit amount', + 'Savings account/bonds', + 'Present employment since', + 'Installment rate in percentage of disposable income', + 'Personal status and sex', + 'Other debtors / guarantors', + 'Present residence since', + 'Property', + 'Age in years', + 'Other installment plans', + 'Housing', + 'Number of existing credits at this bank', + 'Job', + 'Number of people being liable to provide maintenance for', + 'Telephone', + 'foreign worker', + 'good_loan' +) + +## Códigos --------------------------------------------------------------------- +german_codes <- list('A11'='... < 0 DM', + 'A12'='0 <= ... < 200 DM', + 'A13'='... >= 200 DM / salary assignments for at least 1 year', + 'A14'='no checking account', + 'A30'='no credits taken/all credits paid back duly', + 'A31'='all credits at this bank paid back duly', + 'A32'='existing credits paid back duly till now', + 'A33'='delay in paying off in the past', + 'A34'='critical account/other credits existing (not at this bank)', + 'A40'='car (new)', + 'A41'='car (used)', + 'A42'='furniture/equipment', + 'A43'='radio/television', 'A44'='domestic appliances', 'A45'='repairs', + 'A46'='education', 'A47'='(vacation - does not exist?)', + 'A48'='retraining', 'A49'='business', 'A410'='others', 'A61'='... < 100 DM', + 'A62'='100 <= ... < 500 DM', 'A63'='500 <= ... < 1000 DM', + 'A64'='.. >= 1000 DM', 'A65'='unknown/ no savings account', + 'A71'='unemployed', 'A72'='... < 1 year', 'A73'='1 <= ... < 4 years', + 'A74'='4 <= ... < 7 years', 'A75'='.. >= 7 years', 'A91'='male : divorced/separated', + 'A92'='female : divorced/separated/married', + 'A93'='male : single', + 'A94'='male : married/widowed', + 'A95'='female : single', + 'A101'='none', + 'A102'='co-applicant', + 'A103'='guarantor', 'A121'='real estate', + 'A122'='if not A121 : building society savings agreement/life insurance', + 'A123'='if not A121/A122 : car or other, not in attribute 6', + 'A124'='unknown / no property', + 'A141'='bank', 'A142'='stores', 'A143'='none', 'A151'='rent', 'A152'='own', + 'A153'='for free', 'A171'='unemployed/ unskilled - non-resident', + 'A172'='unskilled - resident', 'A173'='skilled employee / official', + 'A174'='management/ self-employed/highly qualified employee/ officer', + 'A191'='none', 'A192'='yes, registered under the customers name', + 'A201'='yes', 'A202'='no' +) + + +## Algas ----------------------------------------------------------------------- + +## Nombre de columnas ---------------------------------------------------------- +algas_colnames <- c('season', + 'river_size', + 'fluid_velocity', + 'max_PH', + 'min_O2', + 'Cl', + 'NO3', + 'NH4', + 'oPO4', + 'PO4', + 'Chla', + paste('a', seq(1:7), sep="") +) \ No newline at end of file diff --git a/EduardoHidalgo/algas/utils.R b/EduardoHidalgo/algas/utils.R new file mode 100644 index 0000000..d05e437 --- /dev/null +++ b/EduardoHidalgo/algas/utils.R @@ -0,0 +1,34 @@ + +load <- function(){ + if(!file.exists('algas.rds')){ + algas_url <- 'https://archive.ics.uci.edu/ml/machine-learning-databases/coil-mld/analysis.data' + + algas_data <- read_csv(algas_url, + col_names = algas_colnames, + na = 'XXXXXXX') + saveRDS(algas_data, "algas.rds") + print('algas.rds se bajó y guardó\n') + } + else{ + warning('algas.rds ya existe\n') + algas_data <- readRDS("algas.rds") + } + + return(algas_data) +} + +algas_decode <- function(columna){ + if(is.character(columna)){ + unlist(german_codes[columna],use.names = F) + }else{ + columna + } +} + +algas_clean_colnames <- function(x){ + str_replace_all(tolower(x),"/| ",'_') +} + +algas_clean_data <- function(x){ + str_replace_all(tolower(x),"_",'') +} diff --git a/EduardoHidalgo/german/00-load.R b/EduardoHidalgo/german/00-load.R new file mode 100644 index 0000000..62c7f44 --- /dev/null +++ b/EduardoHidalgo/german/00-load.R @@ -0,0 +1 @@ +german_data <- load() \ No newline at end of file diff --git a/EduardoHidalgo/german/01-prepare.R b/EduardoHidalgo/german/01-prepare.R new file mode 100644 index 0000000..2ca6ec3 --- /dev/null +++ b/EduardoHidalgo/german/01-prepare.R @@ -0,0 +1,15 @@ +colnames(german_data) <- german_colnames + +german_data$good_loan <- as.factor( + ifelse( + german_data$good_loan == 1, + 'GoodLoan', + 'BadLoan' + ) +) + +german_data <- german_data %>% + mutate_all(funs(german_decode)) + +german_data <- german_data %>% + mutate_at(c(1,3,4,6,7,9,10,12,14,15,17,19,20),funs(as.factor)) diff --git a/EduardoHidalgo/german/02-clean.R b/EduardoHidalgo/german/02-clean.R new file mode 100644 index 0000000..6c22fea --- /dev/null +++ b/EduardoHidalgo/german/02-clean.R @@ -0,0 +1 @@ +colnames(german_data) <- german_clean_colnames(german_colnames) diff --git a/EduardoHidalgo/german/german.R b/EduardoHidalgo/german/german.R new file mode 100644 index 0000000..8ac8c74 --- /dev/null +++ b/EduardoHidalgo/german/german.R @@ -0,0 +1,37 @@ +setwd("~/GitHub/MineriaYAnalisisDeDatos/german") + +library(readr) +library(stringr) +library(dplyr) +library(ggplot2) +library(ggthemes) + +source("metadata.R") +source("utils.R") +source("00-load.R") +source("01-prepare.R") +source("02-clean.R") + +ggplot(data = german_data) + + geom_bar(mapping = aes(x = `personal_status_and_sex`, fill = `good_loan`), position = "fill")+ + theme(axis.text.x = element_text(angle = 60, hjust = 1)) + +ggplot(data = german_data) + + geom_bar(mapping = aes(x = `good_loan`, fill = `credit_history`), position = "fill") + +ggplot(data = german_data) + + geom_bar(mapping = aes(x = `credit_history`, fill = `good_loan`), position = "fill")+ + theme(axis.text.x = element_text(angle = 70, hjust = 1)) + +german_data %>% + group_by(credit_history) %>% + dplyr::summarise(count = n()) %>% + arrange(desc(count)) %>% + ggplot(.) + + geom_bar(aes(x=reorder(credit_history, count), y = count), stat="identity", fill="gray") + + coord_flip() + + theme_hc() + + ylab('casos') + + xlab('Historial de crédito') + +summary(german_data) diff --git a/EduardoHidalgo/german/german.Rproj b/EduardoHidalgo/german/german.Rproj new file mode 100644 index 0000000..f43d039 --- /dev/null +++ b/EduardoHidalgo/german/german.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: ISO8859-1 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/EduardoHidalgo/german/german.rds b/EduardoHidalgo/german/german.rds new file mode 100644 index 0000000..313c080 Binary files /dev/null and b/EduardoHidalgo/german/german.rds differ diff --git a/EduardoHidalgo/german/german.zip b/EduardoHidalgo/german/german.zip new file mode 100644 index 0000000..38f1dea Binary files /dev/null and b/EduardoHidalgo/german/german.zip differ diff --git a/EduardoHidalgo/german/metadata.R b/EduardoHidalgo/german/metadata.R new file mode 100644 index 0000000..4bc5fca --- /dev/null +++ b/EduardoHidalgo/german/metadata.R @@ -0,0 +1,81 @@ +## German credit --------------------------------------------------------------- + +## Nombres de columnas --------------------------------------------------------- +german_colnames <- c('Status of existing checking account', + 'Duration in month', + 'Credit history', + 'Purpose', + 'Credit amount', + 'Savings account/bonds', + 'Present employment since', + 'Installment rate in percentage of disposable income', + 'Personal status and sex', + 'Other debtors / guarantors', + 'Present residence since', + 'Property', + 'Age in years', + 'Other installment plans', + 'Housing', + 'Number of existing credits at this bank', + 'Job', + 'Number of people being liable to provide maintenance for', + 'Telephone', + 'foreign worker', + 'good_loan' +) + +## Códigos --------------------------------------------------------------------- +german_codes <- list('A11'='... < 0 DM', + 'A12'='0 <= ... < 200 DM', + 'A13'='... >= 200 DM / salary assignments for at least 1 year', + 'A14'='no checking account', + 'A30'='no credits taken/all credits paid back duly', + 'A31'='all credits at this bank paid back duly', + 'A32'='existing credits paid back duly till now', + 'A33'='delay in paying off in the past', + 'A34'='critical account/other credits existing (not at this bank)', + 'A40'='car (new)', + 'A41'='car (used)', + 'A42'='furniture/equipment', + 'A43'='radio/television', 'A44'='domestic appliances', 'A45'='repairs', + 'A46'='education', 'A47'='(vacation - does not exist?)', + 'A48'='retraining', 'A49'='business', 'A410'='others', 'A61'='... < 100 DM', + 'A62'='100 <= ... < 500 DM', 'A63'='500 <= ... < 1000 DM', + 'A64'='.. >= 1000 DM', 'A65'='unknown/ no savings account', + 'A71'='unemployed', 'A72'='... < 1 year', 'A73'='1 <= ... < 4 years', + 'A74'='4 <= ... < 7 years', 'A75'='.. >= 7 years', 'A91'='male : divorced/separated', + 'A92'='female : divorced/separated/married', + 'A93'='male : single', + 'A94'='male : married/widowed', + 'A95'='female : single', + 'A101'='none', + 'A102'='co-applicant', + 'A103'='guarantor', 'A121'='real estate', + 'A122'='if not A121 : building society savings agreement/life insurance', + 'A123'='if not A121/A122 : car or other, not in attribute 6', + 'A124'='unknown / no property', + 'A141'='bank', 'A142'='stores', 'A143'='none', 'A151'='rent', 'A152'='own', + 'A153'='for free', 'A171'='unemployed/ unskilled - non-resident', + 'A172'='unskilled - resident', 'A173'='skilled employee / official', + 'A174'='management/ self-employed/highly qualified employee/ officer', + 'A191'='none', 'A192'='yes, registered under the customers name', + 'A201'='yes', 'A202'='no' +) + + +## Algas ----------------------------------------------------------------------- + +## Nombre de columnas ---------------------------------------------------------- +algas_colnames <- c('season', + 'river_size', + 'fluid_velocity', + 'max_PH', + 'min_O2', + 'Cl', + 'NO3', + 'NH4', + 'oPO4', + 'PO4', + 'Chla', + paste('a', seq(1:7), sep="") +) \ No newline at end of file diff --git a/EduardoHidalgo/german/utils.R b/EduardoHidalgo/german/utils.R new file mode 100644 index 0000000..c04456e --- /dev/null +++ b/EduardoHidalgo/german/utils.R @@ -0,0 +1,32 @@ + +load <- function(){ + if(!file.exists('german.rds')){ + german_url <- paste('http://archive.ics.uci.edu/ml', + '/machine-learning-databases/statlog', + '/german/german.data', + sep='') + german_data <- read_delim(german_url, + col_names=FALSE, + delim = " ") + saveRDS(german_data, "german.rds") + print('german.rds se bajó y guardó\n') + } + else{ + warning('german.rds ya existe\n') + german_data <- readRDS("german.rds") + } + + return(german_data) +} + +german_decode <- function(columna){ + if(is.character(columna)){ + unlist(german_codes[columna],use.names = F) + }else{ + columna + } +} + +german_clean_colnames <- function(x){ + str_replace_all(tolower(x),"/| ",'_') +} \ No newline at end of file diff --git a/intro-to-data-science-2017 b/intro-to-data-science-2017 new file mode 160000 index 0000000..7d86024 --- /dev/null +++ b/intro-to-data-science-2017 @@ -0,0 +1 @@ +Subproject commit 7d860243917fd2a67ef00356d573ac1dfbce8b3c