Skip to content
This repository has been archived by the owner on Apr 20, 2023. It is now read-only.


INIT: Initial Commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Super-T02 committed Nov 27, 2022
0 parents commit 6c23817
Show file tree
Hide file tree
Showing 9 changed files with 5,934 additions and 0 deletions.
Binary file added .RData
Binary file not shown.
512 changes: 512 additions & 0 deletions .Rhistory

Large diffs are not rendered by default.

203 changes: 203 additions & 0 deletions Open-Data-Berlin.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# Directory that holds the exported usage statistics. It can be overridden
# with the OPEN_DATA_BERLIN_DIR environment variable; the fallback is the
# location the script was originally developed against.
# The session is deliberately not wiped with rm(list = ls()) and the working
# directory is not changed with setwd(): both are global side effects that
# destroy unrelated objects and tie the script to one machine.
data_dir <- Sys.getenv(
  "OPEN_DATA_BERLIN_DIR",
  unset = "C:/Users/tomfr/OneDrive/Studium/UNI/5. Semester/Data Sience/Prüfung"
)

# Read the data from csv with ";" as separator
data <- read.csv(
  file.path(data_dir, "open-data_berlin_Nutzerdaten.csv"),
  sep = ";"
)
# View(data)

# After looking into the data:
# 1. The Names of the pages with ä,ö,ü,ß are escaped --> First step is to remove the escape
# 2. Every Month has two columns: pi (Page impressions), v (Page Visits)
# 3. Many NA's

# Next: Start with 1. step and remove escaped letters
# - √º = ü
# - √∂ = ö
# - √§ = ä
# - √ü = ß

# ü <- data[grep('√º', data$page), ]
# ö <- data[grep('√∂', data$page), ]
# ä <- data[grep('√§', data$page), ]
# Row indices of all pages whose name contains the escaped form of an umlaut.
# which(grepl(...)) yields the same integer indices as grep(...).
ü <- which(grepl('√º', data$page))
ö <- which(grepl('√∂', data$page))
ä <- which(grepl('√§', data$page))

# Searching for 'Straße' showed that '√ü' stands for ß
# View(data[grep('stra', data$page), ])
ß <- which(grepl('√ü', data$page))

# Thesis: every escaped char starts with '√', so every row whose page name
# contains a literal '√' is a candidate for an escape sequence.
# An empty pattern would match every row and make `escaped` meaningless,
# therefore the literal '√' is searched for (fixed = TRUE: no regex needed).
escaped <- grep('√', data$page, fixed = TRUE)

# Candidates that are not explained by one of the four known escapes.
# A single setdiff() against the combined index vector gives the same result
# as chaining one setdiff() per letter.
others <- setdiff(escaped, c(ß, ü, ö, ä))
# View(data[others, ])
# There exist more:
# √É≈∏ = ß
# √É∆í√Ü‚Äô√ɂĶ√Ǭ∏ = ß

# Ö:
# √ɬ∂ = ö
# √É∆í√Ü‚Äô√ɂĆ√¢‚Ǩ‚Ñ¢√É∆í√¢‚Ǩ √ɬ¢√¢‚Äö¬¨√¢‚Äû¬¢√É∆í√Ü‚Äô√ɬ¢√¢‚Äö¬¨√Ö¬°√É∆í√¢‚Ǩ≈°√É‚Äö√Ǭ∂ = ö
# √É∆í√Ü‚Äô√É‚Äö√Ǭ∂ = ö

# Ü:
# √ɬº = ü
# √É∆í√Ü‚Äô√ɂĆ√¢‚Ǩ‚Ñ¢√É∆í√¢‚Ǩ √ɬ¢√¢‚Äö¬¨√¢‚Äû¬¢√É∆í√Ü‚Äô√ɬ¢√¢‚Äö¬¨√Ö¬°√É∆í√¢‚Ǩ≈°√É‚Äö√Ǭº = ü
# √É∆í√Ü‚Äô√ɂĆ√¢‚Ǩ‚Ñ¢√É∆í√¢‚Ǩ≈°√É‚Äö√Ǭº = ü
# √É∆í√Ü‚Äô√É‚Äö√Ǭº = ü
# √ɬɬº = ü

# Ä:
# √ɬ§A4 = ä
# √É∆í√Ü‚Äô√É‚Äö√Ǭ§ = ä
# √ɬ§ = ä
# √É∆í√Ǭ§ = ä

# Drop every candidate row that is already explained by one of the sequences
# catalogued above, so that `others` only keeps rows that still need a look.
# Each pattern is listed once and all of them are applied in a single pass;
# removing the same row indices repeatedly has no additional effect, so this
# is equivalent to one setdiff()/grep() line per pattern.
known_escape_patterns <- c(
  'ß',
  '√É∆í√Ü‚Äô√ɂĆ√¢‚Ǩ‚Ñ¢√É∆í√¢‚Ǩ √ɬ¢√¢‚Äö¬¨√¢‚Äû¬¢√É∆í√Ü‚Äô√ɬ¢√¢‚Äö¬¨√Ö¬°√É∆í√¢‚Ǩ≈°√É‚Äö√Ǭ',
  'ö',
  'ü',
  'äA4',
  'ä'
)
known_rows <- unlist(lapply(known_escape_patterns, grep, x = data$page))
others <- setdiff(others, known_rows)
# View(data[others,])

# Working hypothesis: most escaped letters follow one pattern per letter.
# - √(something)º = ü
#     Problem: √É∆í√Ü‚Äô√ɂĆ√¢‚Ǩ‚Ñ¢√É∆í√¢‚Ǩ≈°√É‚Äö√Ǭº looks as if it had the end marker twice
#     Solution: ° is different from º, and º occurs only once, at the end
# - %C3%BC = ü
# - √(something)∂ = ö   Problem: none
# - %C3%B6 = ö
# - √(something)§ = ä   Problem: √ɬ§A4 carries one more token ("A4")
# - %C3%A4 = ä
# - A4 = ä
# - "¬® " = ä
# - √(something)ü or √(something)∏ = ß
#     Problem: must be replaced before ü because its escape contains ü
# - %C3%9F = ß

# Page names of the rows that are still unexplained (kept for inspection).
temp <- data[others, "page"]
# Bare expression: shows the ß rows when run interactively, assigns nothing.
data[ß, "page"]

# Replace letters on a copy so that the raw import stays available in `data`.
# Every rule matches '√', then the shortest run of characters that are not
# ASCII letters, then the end marker of the letter; the percent-encoded form
# is accepted as an alternative. The order matters: ß first (see above).
data_sub <- data
data_sub$page <- gsub("(√[^A-Za-z]*?[ü∏])|%C3%9F", "ß", data_sub$page) # ß
data_sub$page <- gsub("(√[^A-Za-z]*?º)|%C3%BC", "ü", data_sub$page) # ü
data_sub$page <- gsub("(√[^A-Za-z]*?∂)|%C3%B6", "ö", data_sub$page) # ö
# '%C3%A4' is listed explicitly: without it the bare 'A4' alternative would
# turn '%C3%A4' into '%C3%ä' instead of 'ä'.
# NOTE(review): the bare (A4) alternative replaces every "A4" in a page name,
# including names that may legitimately contain "A4" - confirm.
data_sub$page <- gsub("(√[^A-Za-z]*?§(A4)?)|%C3%A4|(A4)|¬® ", "ä", data_sub$page) # ä
# View(data_sub)
# Unsolved problem: '¬' sometimes appears in the data; its meaning is unknown.


# Find the duplicates: rows whose cleaned page name occurs more than once.
# Scanning from both ends flags every member of a duplicate group, not only
# the repeated occurrences. Kept for inspection.
is_duplicated <- duplicated(data_sub$page) |
  duplicated(data_sub$page, fromLast = TRUE)
data_dup <- data_sub[is_duplicated, ]

# Merge the duplicates: sum every value column per page name (NA values are
# skipped). Naming the grouping element `page` makes aggregate() emit the key
# column under that name directly; drop = FALSE keeps a data frame even if
# only one value column exists.
value_cols <- colnames(data_sub) != "page"
data_agg <- aggregate(
  x = data_sub[, value_cols, drop = FALSE],
  by = list(page = data_sub$page),
  FUN = sum,
  na.rm = TRUE
)

# Replace the 0's with NA (a group that only had NA sums to 0 because of
# na.rm = TRUE). Only the value columns are touched, so a page name is never
# compared against 0.
agg_value_names <- setdiff(names(data_agg), "page")
data_agg[agg_value_names] <- lapply(
  data_agg[agg_value_names],
  function(column) replace(column, column == 0, NA)
)
####### FIND DUPLICATES END #######

####### PREPARE SUMS START#######
data_enr <- data_agg

# Names of the monthly columns: the ".pi" columns hold page impressions, the
# ".v" columns hold visits.
month_cols_pi <- grep(
  "X[0-9]*[.][0-9][0-9][.]pi", names(data_enr), value = TRUE
)
month_cols_v <- grep(
  "X[0-9]*[.][0-9][0-9][.]v", names(data_enr), value = TRUE
)

# Row-wise totals over all months, ignoring NA. rowSums() is the vectorised
# form of the per-row sum; drop = FALSE keeps a data frame even when only a
# single month column matches.
sum_pi <- rowSums(data_enr[, month_cols_pi, drop = FALSE], na.rm = TRUE)
sum_v <- rowSums(data_enr[, month_cols_v, drop = FALSE], na.rm = TRUE)
data_enr["sum_pi"] <- sum_pi
data_enr["sum_v"] <- sum_v
# Combined total, computed inline so that no global variable named `sum`
# shadows base::sum().
data_enr["sum"] <- sum_pi + sum_v

# View(data_enr[,c('page','sum_v', 'sum_pi')])
####### PREPARE SUMS END #######

####### TASK A) 10 most used pages ALTERNATIVE START#######
# Build a stacked bar chart of visits and page impressions for the most or
# least used pages.
#
# Arguments:
#   data_enr_temp     data frame with the columns `page`, `sum_v`, `sum_pi`
#                     and `sum`.
#   number_to_display number of pages taken from the top of the ordering.
#   decreasing        TRUE orders by `sum` descending (most used pages),
#                     FALSE ascending (least used pages).
#
# Returns the ggplot object.
#
# NOTE(review): relies on ggplot2 and on a melt() implementation (presumably
# reshape2) being attached by the surrounding script - confirm.
fun_bar_chart <- function(data_enr_temp, number_to_display, decreasing = TRUE) {
  # Order by the combined sum of visits and page impressions
  row_order <- order(data_enr_temp$sum, decreasing = decreasing)
  data_enr_temp <- data_enr_temp[row_order, ]

  # Keep every row whose page is among the first `number_to_display` rows
  selected_pages <- head(data_enr_temp, number_to_display)$page
  top_pages <- data_enr_temp[data_enr_temp$page %in% selected_pages, ]

  # Long format: one row per page and measure (sum_v, sum_pi, sum)
  melted <- melt(
    top_pages[, c("page", "sum_v", "sum_pi", "sum")],
    id.vars = "page"
  )

  # Fix the order of the bars through the factor levels of `page`: the rows
  # are ranked by measure and value (descending) and the pages keep the order
  # of their first appearance. The helper rows of the combined `sum` are only
  # needed for this ranking and are removed afterwards.
  page_rank <- order(melted$variable, melted$value, decreasing = TRUE)
  melted$page <- factor(melted$page, levels = unique(melted$page[page_rank]))
  melted <- melted[melted$variable != "sum", ]

  # Define title
  ranking <- if (decreasing) "most" else "least"
  title <- paste(
    "Sum of page impressions and visits of the", number_to_display,
    ranking, "visited pages"
  )

  # Make plot. x carries the summed values and y the pages, so the axis
  # titles follow that mapping.
  bar_chart <- ggplot(melted, aes(value, page, label = value)) +
    geom_bar(aes(fill = variable), position = "stack", stat = "identity") +
    ggtitle(title) +
    xlab("Sum") + ylab("Page") +
    scale_fill_discrete(labels = c("Visits", "Page impressions")) +
    labs(fill = "") +
    geom_text(size = 3, position = position_stack(vjust = .5)) +
    theme(
      legend.position = "top",
      plot.title = element_text(hjust = 0.5, size = 18),
      axis.title = element_text(size = 14, face = "bold")
    )

  bar_chart
}

# Chart of the 10 pages with the highest combined sum.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
p1 <- fun_bar_chart(data_enr, 10, decreasing = TRUE)
####### TASK A) 10 most used pages END#######

####### TASK B) 10 least used pages START#######
# Use the function of a), ordered ascending
p2 <- fun_bar_chart(data_enr, 10, decreasing = FALSE)

# View(melted)
####### TASK B) 10 least used pages END#######

Binary file added Prüfungsaufgabe.pdf
Binary file not shown.
Binary file added README.pdf
Binary file not shown.
126 changes: 126 additions & 0 deletions Sonstiges.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
####### TASK A) 10 most used pages START#######
# NOTE(review): this section reads `page_impressions` and `page_visits`,
# which are only created further down in this file - run that part first.

# Month columns: every column whose name contains "<digits>.<digit>"
month_cols <- grep("[0-9]*[.][0-9]", names(page_impressions), value = TRUE)

# Both tables must provide the same month columns, because the visit sums
# below are selected with the names taken from the impressions table.
stopifnot(all(month_cols %in% names(page_visits)))

# Calculate the row-wise sums over all months, ignoring NA
sum_pi <- rowSums(page_impressions[, month_cols, drop = FALSE], na.rm = TRUE)
sum_v <- rowSums(page_visits[, month_cols, drop = FALSE], na.rm = TRUE)
page_impressions["sum"] <- sum_pi
page_visits["sum"] <- sum_v

# Order decreasing by sum
page_impressions <- page_impressions[
  order(page_impressions$sum, decreasing = TRUE),
]
page_visits <- page_visits[order(page_visits$sum, decreasing = TRUE), ]

# Get the 10 highest per class
top10_imperssions <- page_impressions[
  page_impressions$page %in% head(page_impressions, 10)$page,
]
top10_visits <- page_visits[
  page_visits$page %in% head(page_visits, 10)$page,
]

# Bar height is the summed page impressions per page. The columns are mapped
# by name through `data`; mapping `top10_imperssions$page` alone would bypass
# `data` and only count rows, i.e. draw one bar of height 1 per page.
ggplot(data = top10_imperssions) +
  geom_bar(mapping = aes(x = page, y = sum), stat = "identity")

####### TASK A) 10 most used pages END#######

# For the shiny app the months must be detected automatically. Goal: two data
# frames, one for the page impressions and one for the page visits.
cols <- colnames(data_agg)
# The measure is identified by the column suffix. Anchoring the patterns
# keeps unrelated columns that merely contain "pi" or "v" out of the tables.
cols_pi <- grep("[.]pi$", cols, value = TRUE)
cols_v <- grep("[.]v$", cols, value = TRUE)

page_impressions <- data_agg[, c("page", cols_pi)]
page_visits <- data_agg[, c("page", cols_v)]

# Rename the columns: strip the leading "X" and the measure suffix, and
# nothing else
names(page_impressions) <- sub(
  "^X", "", sub("[.]pi$", "", names(page_impressions))
)
names(page_visits) <- sub("^X", "", sub("[.]v$", "", names(page_visits)))

####### DIVIDE IN TWO TABLES END #######

# Define UI for application that draws a histogram
# NOTE(review): needs the shiny package to be attached - confirm it is loaded
# before this point.
ui <- fluidPage(

  # Application title
  titlePanel("Monthly visits and page impressions of open data pages in Berlin"),

  # Sidebar with a slider input for number of bins
  sidebarLayout(
    sidebarPanel(
      # "bins" is the input id the server reads as input$bins
      sliderInput("bins",
                  "Number of bins:",
                  min = 1,
                  max = 50,
                  value = 30)
    ),

    # Show a plot of the generated distribution
    mainPanel(
      # "distPlot" is the output id the server fills as output$distPlot
      plotOutput("distPlot")
    )
  )
)

# Define server logic required to draw a histogram
# NOTE(review): the plot still uses the built-in `faithful` data set, not the
# Berlin page statistics - confirm whether that is intended.
server <- function(input, output) {

  output$distPlot <- renderPlot({
    # generate bins based on input$bins from the UI
    x <- faithful[, 2]
    bins <- seq(min(x), max(x), length.out = input$bins + 1)

    # draw the histogram with the specified number of bins
    hist(x, breaks = bins, col = 'darkgray', border = 'white',
         xlab = 'Waiting time to next eruption (in mins)',
         main = 'Histogram of waiting times')
  })
}

# Run the application
shinyApp(ui = ui, server = server)


0 comments on commit 6c23817

Please sign in to comment.