This repository has been archived by the owner on Apr 20, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 6c23817
Showing
9 changed files
with
5,934 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,203 @@ | ||
# Setup and data load.
# The workspace is deliberately NOT cleared and the working directory is NOT
# changed: both are session-wide side effects that surprise anyone who sources
# this script from another project. The input folder is held in one variable
# instead, so there is a single place to adjust when running elsewhere.
library(tidyverse)
library(ggplot2)
library(tidyr)
library(reshape2)

# Folder that contains the input csv
data_dir <- "C:/Users/tomfr/OneDrive/Studium/UNI/5. Semester/Data Sience/Prüfung"

# Read the data from csv with the ";" as separator
data <- read.csv(
  file.path(data_dir, "open-data_berlin_Nutzerdaten.csv"),
  sep = ";"
)
str(data)
# View(data) | ||
|
||
# After looking into the data: | ||
# 1. The Names of the pages with ä,ö,ü,ß are escaped --> First step is to remove the escape | ||
# 2. Every Month has two columns: pi (Page impressions), v (Page Visits) | ||
# 3. Many NA's | ||
|
||
####### REMOVE ESCAPED LETTERS START ####### | ||
# Next: Start with 1. step and remove escaped letters | ||
# - √º = ü | ||
# - √∂ = ö | ||
# - √§ = ä | ||
# - √ü = ß | ||
|
||
# Row indices (not the rows themselves) of all page names that contain the
# single-layer escape sequence of each umlaut.
ü <- grep("√º", data$page)
ö <- grep("√∂", data$page)
ä <- grep("√§", data$page)

# Page names containing 'Straße' show that √ü stands for ß
ß <- grep("√ü", data$page)

# Hypothesis: every escaped char starts with '√'.
# `escaped` = all rows with any escape sequence; `others` = the rows that none
# of the four simple patterns above accounts for. Removing the four index sets
# one after another is the same as the nested setdiff() calls.
escaped <- grep("√", data$page)
others <- Reduce(setdiff, list(ß, ü, ö, ä), escaped)
# View(data[others, ]) | ||
# There exist more: | ||
# √É≈∏ = ß | ||
# √É∆í√Ü‚Äô√ɂĶ√Ǭ∏ = ß | ||
|
||
# Ö: | ||
# √ɬ∂ = ö | ||
# √É∆í√Ü‚Äô√ɂĆ√¢‚Ǩ‚Ñ¢√É∆í√¢‚Ǩ √ɬ¢√¢‚Äö¬¨√¢‚Äû¬¢√É∆í√Ü‚Äô√ɬ¢√¢‚Äö¬¨√Ö¬°√É∆í√¢‚Ǩ≈°√É‚Äö√Ǭ∂ = ö | ||
# √É∆í√Ü‚Äô√É‚Äö√Ǭ∂ = ö | ||
|
||
# Ü: | ||
# √ɬº = ü | ||
# √É∆í√Ü‚Äô√ɂĆ√¢‚Ǩ‚Ñ¢√É∆í√¢‚Ǩ √ɬ¢√¢‚Äö¬¨√¢‚Äû¬¢√É∆í√Ü‚Äô√ɬ¢√¢‚Äö¬¨√Ö¬°√É∆í√¢‚Ǩ≈°√É‚Äö√Ǭº = ü | ||
# √É∆í√Ü‚Äô√ɂĆ√¢‚Ǩ‚Ñ¢√É∆í√¢‚Ǩ≈°√É‚Äö√Ǭº = ü | ||
# √É∆í√Ü‚Äô√É‚Äö√Ǭº = ü | ||
# √ɬɬº = ü | ||
|
||
# Ä: | ||
# √ɬ§A4 = ä | ||
# √É∆í√Ü‚Äô√É‚Äö√Ǭ§ = ä | ||
# √ɬ§ = ä | ||
# √É∆í√Ǭ§ = ä | ||
|
||
others <- setdiff(others, grep('ß', data$page)) | ||
others <- setdiff(others, grep('√É∆í√Ü‚Äô√ɂĆ√¢‚Ǩ‚Ñ¢√É∆í√¢‚Ǩ √ɬ¢√¢‚Äö¬¨√¢‚Äû¬¢√É∆í√Ü‚Äô√ɬ¢√¢‚Äö¬¨√Ö¬°√É∆í√¢‚Ǩ≈°√É‚Äö√Ǭ', data$page)) | ||
others <- setdiff(others, grep('ö', data$page)) | ||
others <- setdiff(others, grep('ö', data$page)) | ||
others <- setdiff(others, grep('ü', data$page)) | ||
others <- setdiff(others, grep('ü', data$page)) | ||
others <- setdiff(others, grep('äA4', data$page)) | ||
others <- setdiff(others, grep('ä', data$page)) | ||
others <- setdiff(others, grep('ä', data$page)) | ||
others <- setdiff(others, grep('ä', data$page)) | ||
others <- setdiff(others, grep('ü', data$page)) | ||
others <- setdiff(others, grep('ü', data$page)) | ||
# View(data[others,]) | ||
|
||
# Hypothesis: most escaped letters follow one of these patterns: | ||
# - √(something)º = ü Problem: | ||
# √É∆í√Ü‚Äô√ɂĆ√¢‚Ǩ‚Ñ¢√É∆í√¢‚Ǩ≈°√É‚Äö√Ǭº because 2 times ° | ||
# ü | ||
# Solution: ° is different to º and º only one time at the end! | ||
# - %C3%BC = ü | ||
# - √(something)∂ = ö Problem: none | ||
# - %C3%B6 = ö | ||
# - √(something)§ = ä Problem: √ɬ§A4 and because one more letter | ||
# - %C3%A4 = ä | ||
# - A4 = ä | ||
# - "¬® " = ä | ||
# - √(something)ü or √(something)∏ = ß Problem: should be parsed before ü because it uses ü | ||
# - %C3%9F = ß | ||
# | ||
# Replace letters: | ||
temp <- data[others, "page"] | ||
data[ß, "page"] | ||
data_sub <- data | ||
data_sub$page <- gsub("(√[^A-Za-z]*?[ü∏])|%C3%9F", "ß", data_sub$page) # ß | ||
data_sub$page <- gsub("(√[^A-Za-z]*?º)|%C3%BC", "ü", data_sub$page) # ü | ||
data_sub$page <- gsub("(√[^A-Za-z]*?∂)|%C3%B6", "ö", data_sub$page) # ö | ||
data_sub$page <- gsub("(√[^A-Za-z]*?§(A4)?)|(A4)|¬® ", "ä", data_sub$page) # ä | ||
# View(data_sub) | ||
# Unsolved problem: ¬ sometimes appears in the data, but it is unclear what it encodes | ||
|
||
####### REMOVE ESCAPED LETTERS END ####### | ||
|
||
|
||
|
||
|
||
####### FIND DUPLICATES START ####### | ||
# Rows whose (un-escaped) page name occurs more than once, kept for inspection
is_duplicate <- duplicated(data_sub$page) |
  duplicated(data_sub$page, fromLast = TRUE)
data_dup <- data_sub[is_duplicate, ]

# Collapse duplicated pages into one row each by SUMMING every count column.
# Naming the grouping element makes aggregate() call the key column "page"
# directly; drop = FALSE keeps a data frame even with a single count column.
count_cols <- setdiff(colnames(data_sub), "page")
data_agg <- aggregate(
  x = data_sub[, count_cols, drop = FALSE],
  by = list(page = data_sub$page),
  FUN = sum,
  na.rm = TRUE
)

# sum(..., na.rm = TRUE) yields 0 when every input was NA, so zeros are turned
# back into NA. Only the count columns are touched; the page names are never
# compared against 0.
data_agg[count_cols][data_agg[count_cols] == 0] <- NA
####### FIND DUPLICATES END ####### | ||
|
||
|
||
|
||
|
||
|
||
|
||
####### PREPARE SUMS START####### | ||
data_enr <- data_agg

# Monthly columns are named "X<digits>.<two digits>.pi" (page impressions)
# and "X<digits>.<two digits>.v" (visits)
month_cols_pi <- grep(
  "X[0-9]*[.][0-9][0-9][.]pi", names(data_enr),
  value = TRUE
)
month_cols_v <- grep(
  "X[0-9]*[.][0-9][0-9][.]v", names(data_enr),
  value = TRUE
)

# Per-page totals over all months, ignoring missing months.
# rowSums() replaces apply(): it does not loop in R, and drop = FALSE keeps it
# working when only one month column exists.
sum_pi <- rowSums(data_enr[, month_cols_pi, drop = FALSE], na.rm = TRUE)
sum_v <- rowSums(data_enr[, month_cols_v, drop = FALSE], na.rm = TRUE)

data_enr["sum_pi"] <- sum_pi
data_enr["sum_v"] <- sum_v
# The combined total is written straight into its column, so no global
# variable named `sum` masks base::sum()
data_enr["sum"] <- sum_pi + sum_v
|
||
# View(data_enr[,c('page','sum_v', 'sum_pi')]) | ||
####### PREPARE SUMS END ####### | ||
|
||
|
||
|
||
|
||
|
||
####### TASK A) 10 most used pages ALTERNATIVE START####### | ||
# Function for chart | ||
# Build a horizontal stacked bar chart of total visits and total page
# impressions for the most used or least used pages.
#
# Args:
#   data_enr_temp:     data frame with the columns `page`, `sum_v`, `sum_pi`
#                      and `sum` (the total used for ranking).
#   number_to_display: how many pages to show.
#   decreasing:        TRUE  -> pages with the highest `sum`,
#                      FALSE -> pages with the lowest `sum`.
#
# Returns:
#   A ggplot object; nothing is drawn until it is printed.
fun_bar_chart <- function(data_enr_temp, number_to_display, decreasing = TRUE) {
  needed_cols <- c("page", "sum_v", "sum_pi", "sum")
  stopifnot(
    is.data.frame(data_enr_temp),
    all(needed_cols %in% names(data_enr_temp)),
    is.numeric(number_to_display), length(number_to_display) == 1,
    is.logical(decreasing), length(decreasing) == 1, !is.na(decreasing)
  )

  # Rank by the sum of visits and page impressions
  ranked <- data_enr_temp[order(data_enr_temp$sum, decreasing = decreasing), ]

  # Keep every row that belongs to one of the first `number_to_display` pages
  shown_pages <- head(ranked, number_to_display)$page
  selected <- ranked[ranked$page %in% shown_pages, needed_cols]

  # Long format: one row per page and measure (sum_v, sum_pi, sum)
  melted <- melt(selected, id = "page")

  # Fix the page order on the axis by the total `sum`, largest first. `sum` is
  # the last level of `variable`, so a decreasing sort lists its rows first and
  # unique() then yields the pages ordered by their total.
  by_total <- order(melted$variable, melted$value, decreasing = TRUE)
  melted$page <- factor(melted$page, levels = unique(melted$page[by_total]))

  # The total was only needed for ordering; the bars show the two parts
  melted <- melted[melted$variable != "sum", ]

  # Define title
  rank_word <- if (decreasing) "most" else "least"
  title <- paste(
    "Sum of page impressions and visits of the", number_to_display,
    rank_word, "visited pages"
  )

  # `group = variable` is shared by bars and labels so that position_stack()
  # stacks the text in the same order as the filled segments.
  # x carries the values and y the pages, so x is titled "Sum" and y "Page".
  ggplot(melted, aes(value, page, label = value, group = variable)) +
    geom_bar(aes(fill = variable), position = "stack", stat = "identity") +
    ggtitle(title) +
    xlab("Sum") +
    ylab("Page") +
    scale_fill_discrete(labels = c("Visits", "Page impressions")) +
    labs(fill = "") +
    geom_text(size = 3, position = position_stack(vjust = .5)) +
    theme(
      legend.position = "top",
      plot.title = element_text(hjust = 0.5, size = 18),
      axis.title = element_text(size = 14, face = "bold")
    )
}
|
||
# Chart of the 10 pages with the highest combined total.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
p1 <- fun_bar_chart(data_enr, number_to_display = 10, decreasing = TRUE)
p1
####### TASK A) 10 most used pages END####### | ||
|
||
|
||
|
||
|
||
####### TASK B) 10 least used pages START####### | ||
# Use the function of a), this time for the 10 pages with the lowest combined
# total. FALSE is spelled out because `F` is an ordinary, reassignable variable.
p2 <- fun_bar_chart(data_enr, number_to_display = 10, decreasing = FALSE)
p2
|
||
# View(melted) | ||
####### TASK B) 10 least used pages END####### | ||
|
||
|
||
|
||
|
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
####### TASK A) 10 most used pages START####### | ||
# Add the sum of visits | sum of impressions.
# Expects `page_impressions` and `page_visits` to exist already (one row per
# page, a `page` column plus one column per month).
month_pattern <- "[0-9]*[.][0-9]"
month_cols <- grep(month_pattern, names(page_impressions), value = TRUE)

# Both tables must have the same month column names; stop early if they do
# not, instead of printing a logical vector nobody checks.
stopifnot(identical(
  month_cols,
  grep(month_pattern, names(page_visits), value = TRUE)
))

# Calculate the sums per page over all months, ignoring missing months
sum_pi <- rowSums(page_impressions[, month_cols, drop = FALSE], na.rm = TRUE)
sum_v <- rowSums(page_visits[, month_cols, drop = FALSE], na.rm = TRUE)
page_impressions["sum"] <- sum_pi
page_visits["sum"] <- sum_v

# Order decreasing by sum
page_impressions <- page_impressions[
  order(page_impressions$sum, decreasing = TRUE),
]
head(page_impressions$sum)
page_visits <- page_visits[order(page_visits$sum, decreasing = TRUE), ]
head(page_visits$sum)

# Get the 10 highest per class
top10_imperssions <- page_impressions[
  page_impressions$page %in% head(page_impressions, 10)$page,
]
top10_visits <- page_visits[page_visits$page %in% head(page_visits, 10)$page, ]

# geom_col() draws the summed impressions. geom_bar() with only a page
# mapping would count rows, i.e. draw a bar of height 1 for every page.
ggplot(data = top10_imperssions) +
  geom_col(mapping = aes(x = sum, y = page))
|
||
####### TASK A) 10 most used pages END####### | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
####### DIVIDE IN TWO TABLES START ####### | ||
# For the shiny app the month must be detectable from the column name alone.
# Goal: two data frames, one for the page impressions and one for the page
# visits, each with a `page` column followed by one column per month.
cols <- colnames(data_agg)

# Select by SUFFIX. An unanchored "v" or "pi" would also pick up any other
# column that merely contains those letters.
cols_pi <- grep("[.]pi$", cols, value = TRUE)
cols_v <- grep("[.]v$", cols, value = TRUE)

page_impressions <- data_agg[, c("page", cols_pi)]
page_visits <- data_agg[, c("page", cols_v)]

# Rename the columns: strip only the leading "X" and the trailing measure
# suffix, so the month part of the name stays untouched
names(page_impressions) <- sub(
  "[.]pi$", "", sub("^X", "", names(page_impressions))
)
names(page_visits) <- sub("[.]v$", "", sub("^X", "", names(page_visits)))
|
||
####### DIVIDE IN TWO TABLES END ####### | ||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# User interface of the app: a title, a sidebar holding a single slider
# (input id "Month"), and a main panel showing the plot that the server
# publishes under the output id "distPlot".
ui <- fluidPage(
  titlePanel(
    title = "Mothly visits and page impressions of open data pages in Berlin"
  ),
  sidebarLayout(
    sidebarPanel = sidebarPanel(
      sliderInput(
        inputId = "Month",
        label = "Number of bins:",
        min = 1,
        max = 50,
        value = 30
      )
    ),
    mainPanel = mainPanel(
      plotOutput(outputId = "distPlot")
    )
  )
)
|
||
# Server logic: renders a histogram of the second column of the built-in
# `faithful` data set into the output slot "distPlot".
#
# Args:
#   input:  shiny input list; the slider value is read from `input$Month`.
#   output: shiny output list; `output$distPlot` is assigned here.
server <- function(input, output) {
  output$distPlot <- renderPlot({
    # The only slider in the UI is registered under the id "Month".
    # `input$bins` does not exist there, is always NULL, and makes seq() fail.
    n_bins <- input$Month
    req(n_bins)

    # n_bins + 1 equally spaced break points give n_bins bins
    x <- faithful[, 2]
    bins <- seq(min(x), max(x), length.out = n_bins + 1)

    # draw the histogram with the specified number of bins
    hist(x,
      breaks = bins, col = "darkgray", border = "white",
      xlab = "Waiting time to next eruption (in mins)",
      main = "Histogram of waiting times"
    )
  })
}
|
||
# Run the application | ||
shinyApp(ui = ui, server = server) | ||
|
||
|
||
|
Oops, something went wrong.