05-Core.Rmd

# Working with big data in R

## Read in CSV files

### Read one large CSV file 

#### Read a CSV file with header
<center>

![](pic/tranall.png)
</center>
+ [Explanations of important fields in the new attribute-linked residential property price dataset](https://github.com/BINCHI1990/Link-LR-PPD-and-Domestic-EPCs)
```{r}

# Reading the 5,732,838 records (105 variables) takes around 3 minutes.
tran2 <- fread("tranall2011_19.csv")

```


#### Read a CSV file without header

```{r,eval=FALSE}

# header = FALSE treats the first line as data rather than as column names.
# Spell out FALSE: the shorthand F is an ordinary variable that can be
# reassigned.
tran2 <- fread("tranall2011_19.csv", header = FALSE)

```

#### Read a CSV file with the second row as header and dropping the first row
<center>

![](pic/sepc.png)
</center>
```{r}

# skip = 1 drops the first line of each file before it is parsed.
# The first file is read with data.table's fread(), the second with base
# read.csv().
epcdata1 <- fread("D_EPC_data_2012_Q4_extract_0221.csv", skip = 1)
epcdata14 <- read.csv("D_EPC_data_2020_Q4_extract_0221.csv", skip = 1)

```


### Fast reading multiple EPC csv files together in R

#### Code for reading in EPCs in England and Wales

<center>

![](pic/epc0.png)
</center>
***

<center>

![](pic/epc1.png)
</center>
```{r,eval=FALSE}
## Assume all the unzipped EPC downloads are stored in the EPC folder on the
## D drive, and that each entry in that folder contains a certificates.csv file.
epc_root <- "D:/EPC"

# Names of the entries directly under the EPC folder. Listing the folder by its
# path avoids changing the session's working directory with setwd().
x1 <- list.files(path = epc_root, pattern = NULL, all.files = FALSE,
                 full.names = FALSE, recursive = FALSE)

# Full path to the certificates.csv file inside each entry. file.path() returns
# an empty vector when the folder is empty, so no bogus path is produced.
datalist <- file.path(epc_root, x1, "certificates.csv")

# Read every file with fread() and stack the results into a single table.
epcdata <- data.table::rbindlist(
  lapply(datalist, data.table::fread, showProgress = FALSE)
)

```

#### Code for reading in EPCs in Scotland
<center>

![](pic/epc3.png)
</center>
```{r,eval=FALSE}
# List the CSV files in the working directory. `pattern` is a regular
# expression, not a shell glob, so match a literal ".csv" at the end of the
# file name.
datalist <- list.files(pattern = "\\.csv$")

# skip = 1 drops the first line of each file; the tables are then stacked.
epcdata <- data.table::rbindlist(
  lapply(datalist, data.table::fread, skip = 1, showProgress = FALSE)
)

```


## Basic larger dataset munging/wrangling
### Select columns

```{r}

# Check the object type before using data.table syntax.
class(tran2)

# Names of the columns to keep.
needlist <- c(
  "transactionid", "postcode", "price", "dateoftransfer", "propertytype",
  "laua", "lad11nm", "tfarea", "priceper", "TRANSACTION_TYPE"
)

# with = FALSE makes data.table treat needlist as a vector of column names.
tran2 <- tran2[, needlist, with = FALSE]

head(tran2)
```


### Changing column names to lower case or upper case
#### Changing column names to lower case
```{r}

# setnames() renames the columns by reference, so tran2 is not reassigned.
setnames(
  tran2,
  tolower(names(tran2))
)

head(tran2)

```


#### Changing column names to upper case

```{r,eval=FALSE}

# setnames() renames the columns by reference, so tran2 is not reassigned.
setnames(
  tran2,
  toupper(names(tran2))
)

```


### Filter rows based on conditions

```{r}

# Print the rows whose laua code equals "E09000007".
tran2[laua == "E09000007"]

# Keep the same subset as its own table.
Camden <- tran2[laua == "E09000007"]
head(Camden)
```

### Add in the ID column
```{r}

# .I holds the row numbers, so tranid runs from 1 to nrow(Camden).
# := adds the column by reference.
Camden[, `:=`(tranid = .I)]
head(Camden)

# Alternative: start the IDs from an offset.
# Camden[, tranid := .I + 1000000]
```

### Convert datatable values to uppercase
```{r}

# Add tran_type: an upper-case copy of the transaction_type column.
Camden[, tran_type := toupper(transaction_type)]
head(Camden)
```

### Delete a column


```{r}

# Assigning NULL with := removes the column by reference.
Camden[, "transaction_type" := NULL]

head(Camden)
```


### Remove Duplicates
```{r}

# Row and column counts before de-duplication.
dim(Camden)

# unique() returns a new table rather than modifying Camden, so the result must
# be assigned back. Rows are compared on all columns by default; use the `by`
# argument to compare on a subset of columns instead.
Camden <- unique(Camden)

# Row and column counts after de-duplication.
dim(Camden)

```


### Write files
```{r,eval=FALSE}

# Write the Camden table to Camden.csv in the working directory.
fwrite(Camden, file = "Camden.csv")

```

### Bind datasets

```{r}

# Check the class of each table
class(epcdata1)
class(epcdata14)

# Convert epcdata14 to a data.table by reference
setDT(epcdata14)


# Select columns
needlist <- c(
  "BUILDING_REFERENCE_NUMBER", "OSG_REFERENCE_NUMBER",
  "ADDRESS1", "ADDRESS2", "ADDRESS3", "POSTCODE",
  "INSPECTION_DATE", "LODGEMENT_DATE",
  "PROPERTY_TYPE", "TYPE_OF_ASSESSMENT", "TRANSACTION_TYPE",
  "TOTAL_FLOOR_AREA", "NUMBER_HABITABLE_ROOMS",
  "CURRENT_ENERGY_EFFICIENCY", "POTENTIAL_ENERGY_EFFICIENCY"
)

epcdata1 <- epcdata1[, ..needlist]
epcdata14 <- epcdata14[, ..needlist]

# Bind by names
l <- list(epcdata1, epcdata14)
epc <- rbindlist(l, use.names = TRUE)

# Remove duplicates. unique() returns a new table rather than modifying epc,
# so the result must be assigned back for the second dim() to reflect it.
dim(epc)
epc <- unique(epc)
dim(epc)
```
## Work with PostGIS database in R
### Write files to PostGIS
```{r,eval=FALSE}

# Load the PostgreSQL driver
drv <- dbDriver("PostgreSQL")
# Create a connection to the casa PostGIS database. The password is read from
# the PGPASSWORD environment variable rather than being written in the script.
con <- dbConnect(drv, dbname = "casa", port = 5432, user = "postgres",
                 password = Sys.getenv("PGPASSWORD"))

# Write Camden to the database (append = TRUE adds rows to an existing table)
dbWriteTable(con, "Camden", value = Camden, append = TRUE, row.names = FALSE)

# Close the connection once the write has finished
dbDisconnect(con)

# Delete some objects from workspace
rm(Camden,tran2,epc,epcdata1,epcdata14,epcdata)
```


### Read files from PostGIS
```{r,eval=FALSE}

# Load the PostgreSQL driver and connect to the casa PostGIS database. The
# password is read from the PGPASSWORD environment variable rather than being
# written in the script.
drv <- dbDriver("PostgreSQL")
con <- dbConnect(drv, dbname = "casa", port = 5432, user = "postgres",
                 password = Sys.getenv("PGPASSWORD"))

# Load the data from the PostGIS database. The table name is double-quoted
# because PostgreSQL folds unquoted identifiers to lower case, so an unquoted
# Camden would be looked up as camden.
tran <- dbGetQuery(con, 'select * from "Camden"')

# Close the connection once the query has finished
dbDisconnect(con)

```


## Measure code performance

### Measure running time of the code
```{r}
class(epc)
# Drop the existing epc object; it is rebuilt by the timed calls below
rm(epc)

# 1. rbindlist() on data.tables, matching columns by name
start_time <- Sys.time()
epc <- rbindlist(list(epcdata1, epcdata14), use.names = TRUE)
end_time <- Sys.time()

end_time - start_time

# 2. Base rbind() on data.frames
# setDF() converts each data.table to a data.frame by reference
setDF(epcdata1)
setDF(epcdata14)

start_time <- Sys.time()
epcdata <- rbind(epcdata1, epcdata14)
end_time <- Sys.time()
end_time - start_time

# 3. rbindlist() on data.tables, without use.names (combine by position)
# setDT() converts each data.frame back to a data.table by reference
setDT(epcdata14)
setDT(epcdata1)

start_time <- Sys.time()
epc <- rbindlist(list(epcdata1, epcdata14))
end_time <- Sys.time()

end_time - start_time

```


### profvis- an interactive profile visualizations

```{r,eval=FALSE}
profvis({
# Keep only the rows of x whose epcdataid occurs exactly once in x.
uniqueresult <- function(x) {
  id_counts <- as.data.table(x)[, .(count = .N), by = epcdataid]
  singleton_ids <- id_counts[count == 1, epcdataid]
  x[x$epcdataid %in% singleton_ids, ]
}

# Build a space-free "addressf" key in both tables and inner-join on it.
#   x: table with saotext, subbuildingname, buildingnumber,
#      streetdescription and postcodelocator columns
#   y: table with add and postcode columns
function1 <- function(x, y) {
  # Keep the rows of x where both saotext and subbuildingname are missing
  x <- x[is.na(x$saotext) & is.na(x$subbuildingname), ]

  # Key for x: "<postcodelocator>, <buildingnumber>,<streetdescription>",
  # with every space removed from the building/street part
  x$bnstreet <- gsub(
    " ", "",
    paste(x$buildingnumber, x$streetdescription, sep = ", ")
  )
  x$addressf <- paste(x$postcodelocator, x$bnstreet, sep = ", ")

  # Key for y: "<postcode>, <add>", with every space removed from add
  y$addressfinal <- gsub(" ", "", trimws(y$add))
  y$addressf <- paste(y$postcode, y$addressfinal, sep = ", ")

  inner_join(x, y, by = "addressf")
}

# Link the two tables, then keep the rows whose epcdataid matched only once
link1 <- function1(add, epc)

link1u <- uniqueresult(link1)

})

```

***
<center>

![](pic/first.png)
</center>
***
<center>

![](pic/pro2.png)
</center>
***
<center>

![](pic/pro3.png)
</center>


## Execute R code in Alteryx 
<center>

![](pic/alt1.png)
</center>

+ Step 1: build a workflow in Alteryx

<center>

![](pic/alt2.png)
</center>

+ Step 2: Write the R code in Alteryx

<center>

![](pic/alt3.png)
</center>

+ Step 3: Tick the "Use AMP Engine" option and run the workflow

<center>

![](pic/alt5.png)
</center>

+ Step 4: Finish

<center>

![](pic/1.png)
</center>

<center>

![](pic/2.png)
</center>

**Note:** If you have any questions about Alteryx, [Steven Zhang](https://twitter.com/steven4320555) at [Billigence](https://billigence.com/solutions/business-intelligence/) would be happy to help.