INIT: Initial Commit

Super-T02 · Nov 27, 2022 · 6c23817 · 6c23817
commit 6c23817
Show file tree

Hide file tree

Showing 9 changed files with 5,934 additions and 0 deletions.
diff --git a/.RData b/.RData
diff --git a/.Rhistory b/.Rhistory
diff --git a/Open-Data-Berlin.R b/Open-Data-Berlin.R
@@ -0,0 +1,203 @@
+# Switch into the author's project folder only when it exists on this machine.
+# An unconditional setwd() to a user-specific path aborts the script everywhere
+# else; when the folder is missing, the relative paths below resolve against
+# the current working directory, so start the script from the project root.
+# The former rm(list = ls()) was dropped: it wipes the caller's workspace and
+# the visible script assigns every object it uses before reading it.
+project_dir <- "C:/Users/tomfr/OneDrive/Studium/UNI/5. Semester/Data Sience/Prüfung"
+if (dir.exists(project_dir)) {
+  setwd(project_dir)
+}
+library(tidyverse)
+library(ggplot2)
+library(tidyr)
+library(reshape2)
+
+# Load the raw usage export; its fields are separated by semicolons.
+data <- read.csv(file = "./open-data_berlin_Nutzerdaten.csv", sep = ";")
+# Print a compact overview of the imported columns.
+str(object = data)
+# View(data)
+
+# After looking into the data:
+# 1. The Names of the pages with ä,ö,ü,ß are escaped --> First step is to remove the escape
+# 2. Every Month has two columns: pi (Page impressions), v (Page Visits)
+# 3. Many NA's
+
+####### REMOVE ESCAPED LETTERS START ####### 
+# Next: Start with 1. step and remove escaped letters
+# - √º = ü
+# - √∂ = ö
+# - √§ = ä
+# - √ü = ß
+
+# ü <- data[grep('√º', data$page), ]
+# ö <- data[grep('√∂', data$page), ]
+# ä <- data[grep('√§', data$page), ]
+# Row indices of the page names that contain the short escape of each umlaut.
+ü <- grep(pattern = "√º", x = data$page)
+ö <- grep(pattern = "√∂", x = data$page)
+ä <- grep(pattern = "√§", x = data$page)
+
+# Looking at 'Straße' shows that √ü stands for ß.
+# View(data[grep('stra', data$page), ])
+ß <- grep(pattern = "√ü", x = data$page)
+
+# Hypothesis: every escaped character starts with '√'. Collect all rows that
+# carry this marker and drop those already explained by the four escapes above.
+# escaped <- data[grep('√', data$page)]
+escaped <- grep(pattern = "√", x = data$page)
+others <- Reduce(setdiff, list(ß, ü, ö, ä), escaped)
+# View(data[others, ])
+# There exist more:
+# √É≈∏ = ß
+# √É∆í√Ü‚Äô√É‚Ä¶√Ç¬∏ = ß
+
+# Ö:
+# √É¬∂ = ö
+# √É∆í√Ü‚Äô√É‚Ä†√¢‚Ç¨‚Ñ¢√É∆í√¢‚Ç¨ √É¬¢√¢‚Äö¬¨√¢‚Äû¬¢√É∆í√Ü‚Äô√É¬¢√¢‚Äö¬¨√Ö¬°√É∆í√¢‚Ç¨≈°√É‚Äö√Ç¬∂ = ö
+# √É∆í√Ü‚Äô√É‚Äö√Ç¬∂ = ö
+
+# Ü:
+# √É¬º = ü
+# √É∆í√Ü‚Äô√É‚Ä†√¢‚Ç¨‚Ñ¢√É∆í√¢‚Ç¨ √É¬¢√¢‚Äö¬¨√¢‚Äû¬¢√É∆í√Ü‚Äô√É¬¢√¢‚Äö¬¨√Ö¬°√É∆í√¢‚Ç¨≈°√É‚Äö√Ç¬º = ü
+# √É∆í√Ü‚Äô√É‚Ä†√¢‚Ç¨‚Ñ¢√É∆í√¢‚Ç¨≈°√É‚Äö√Ç¬º = ü
+# √É∆í√Ü‚Äô√É‚Äö√Ç¬º = ü
+# √É¬É¬º = ü
+
+# Ä:
+# √É¬§A4 = ä
+# √É∆í√Ü‚Äô√É‚Äö√Ç¬§ = ä
+# √É¬§ = ä
+# √É∆í√Ç¬§ = ä
+
+# Narrow `others` down step by step: each line removes the row indices whose
+# page name matches one of the longer escape variants listed in the comments
+# above, so that only rows with still-unexplained escapes remain.
+others <- setdiff(others, grep('√É≈∏', data$page))
+# This pattern stops before the final character, so it matches both the ö and
+# the ü variant that share this long prefix.
+others <- setdiff(others, grep('√É∆í√Ü‚Äô√É‚Ä†√¢‚Ç¨‚Ñ¢√É∆í√¢‚Ç¨ √É¬¢√¢‚Äö¬¨√¢‚Äû¬¢√É∆í√Ü‚Äô√É¬¢√¢‚Äö¬¨√Ö¬°√É∆í√¢‚Ç¨≈°√É‚Äö√Ç¬', data$page))
+others <- setdiff(others, grep('√É¬∂', data$page))
+others <- setdiff(others, grep('√É∆í√Ü‚Äô√É‚Äö√Ç¬∂', data$page))
+others <- setdiff(others, grep('√É¬º', data$page))
+others <- setdiff(others, grep('√É∆í√Ü‚Äô√É‚Äö√Ç¬º', data$page))
+others <- setdiff(others, grep('√É¬§A4', data$page))
+others <- setdiff(others, grep('√É∆í√Ü‚Äô√É‚Äö√Ç¬§', data$page))
+others <- setdiff(others, grep('√É¬§', data$page))
+others <- setdiff(others, grep('√É∆í√Ç¬§', data$page))
+others <- setdiff(others, grep('√É∆í√Ü‚Äô√É‚Ä†√¢‚Ç¨‚Ñ¢√É∆í√¢‚Ç¨≈°√É‚Äö√Ç¬º', data$page))
+others <- setdiff(others, grep('√É¬É¬º', data$page))
+# View(data[others,])
+
+# Hypothesis: most escaped letters follow a pattern:
+# - √(something)º = ü Problem:
+#                     √É∆í√Ü‚Äô√É‚Ä†√¢‚Ç¨‚Ñ¢√É∆í√¢‚Ç¨≈°√É‚Äö√Ç¬º because 2 times °
+#                     √É∆í√Ü‚Äô√É‚Ä†√¢‚Ç¨‚Ñ¢√É∆í√¢‚Ç¨ √É¬¢√¢‚Äö¬¨√¢‚Äû¬¢√É∆í√Ü‚Äô√É¬¢√¢‚Äö¬¨√Ö¬°√É∆í√¢‚Ç¨≈°√É‚Äö√Ç¬º
+#                     Solution: ° is different to º and º only one time at the end!
+# - %C3%BC = ü
+# - √(something)∂ = ö Problem: none
+# - %C3%B6 = ö
+# - √(something)§ = ä Problem: √É¬§A4 and because one more letter
+# - %C3%A4 = ä
+# - A4 = ä
+# - "¬® " = ä
+# - √(something)ü or √(something)∏ = ß Problem: should be parsed before ü because it uses ü
+# - %C3%9F = ß
+#
+# Replace letters:
+# Keep the page names that are still unexplained, and print the ß rows.
+temp <- data[others, "page"]
+data[ß, "page"]
+
+# Work on a copy so the raw import stays untouched.
+data_sub <- data
+
+# Escape patterns and the letters they stand for, applied in this order:
+# ß has to be handled before ü because its escape contains the ü character.
+escape_patterns <- c(
+  "(√[^A-Za-z]*?[ü∏])|%C3%9F",
+  "(√[^A-Za-z]*?º)|%C3%BC",
+  "(√[^A-Za-z]*?∂)|%C3%B6",
+  "(√[^A-Za-z]*?§(A4)?)|(A4)|¬® "
+)
+plain_letters <- c("ß", "ü", "ö", "ä")
+for (i in seq_along(escape_patterns)) {
+  data_sub$page <- gsub(escape_patterns[i], plain_letters[i], data_sub$page)
+}
+# View(data_sub)
+# Unsolved problem: "¬" sometimes appears in the data, but it is unclear what it stands for
+
+####### REMOVE ESCAPED LETTERS END ####### 
+
+
+
+
+####### FIND DUPLICATES START #######
+# Find the duplicates
+# Rows whose page name occurs more than once after the clean-up; scanning from
+# both ends marks every member of a duplicate group, not only the repeats.
+seen_before <- duplicated(data_sub$page)
+seen_after <- duplicated(data_sub$page, fromLast = TRUE)
+data_dup <- data_sub[seen_before | seen_after, ]
+
+# Collapse rows that share a page name by summing every other column (NAs are
+# ignored), then turn the resulting 0 entries back into NA.
+value_cols <- colnames(data_sub) != "page"
+data_agg <- aggregate(
+  x = data_sub[, value_cols],
+  by = list(page = data_sub$page), # Sum by group, grouping column named "page"
+  FUN = sum,
+  na.rm = TRUE
+)
+data_agg[data_agg == 0] <- NA
+####### FIND DUPLICATES END ####### 
+
+
+
+
+
+
+####### PREPARE SUMS START####### 
+data_enr <- data_agg
+
+# Select the monthly columns by name: "...pi" and "...v" suffixes
+month_cols_pi <- grep("X[0-9]*[.][0-9][0-9][.]pi", names(data_enr), value = TRUE)
+month_cols_v <- grep("X[0-9]*[.][0-9][0-9][.]v", names(data_enr), value = TRUE)
+
+# Row-wise totals over all selected months, skipping missing values
+row_total <- function(values) {
+  sum(values, na.rm = TRUE)
+}
+sum_pi <- apply(data_enr[, month_cols_pi], 1, row_total)
+sum_v <- apply(data_enr[, month_cols_v], 1, row_total)
+sum <- sum_pi + sum_v
+
+# Attach the totals as new columns
+data_enr["sum_pi"] <- sum_pi
+data_enr["sum_v"] <- sum_v
+data_enr["sum"] <- sum
+
+# View(data_enr[,c('page','sum_v', 'sum_pi')])
+####### PREPARE SUMS END ####### 
+
+
+
+
+
+####### TASK A) 10 most used pages ALTERNATIVE START#######
+#' Stacked horizontal bar chart of the most or least used pages
+#'
+#' @param data_enr_temp Data frame with the columns `page`, `sum_v`, `sum_pi`
+#'   and `sum`.
+#' @param number_to_display Number of pages taken from the top of the ordering.
+#' @param decreasing `TRUE` (default) picks the pages with the largest `sum`,
+#'   `FALSE` the pages with the smallest.
+#' @return A ggplot object; `sum_v` and `sum_pi` are stacked per page.
+fun_bar_chart <- function(data_enr_temp, number_to_display, decreasing = TRUE) {
+  # Order by the combined sum of visits and page impressions
+  data_enr_temp <- data_enr_temp[order(data_enr_temp$sum, decreasing = decreasing), ]
+
+  # Keep the rows belonging to the first `number_to_display` pages of that order
+  selected_pages <- head(data_enr_temp, number_to_display)$page
+  top_pages <- data_enr_temp[data_enr_temp$page %in% selected_pages, ]
+
+  # Long format: one row per page and measure
+  melted <- melt(top_pages[, c("page", "sum_v", "sum_pi", "sum")], id = "page")
+
+  # Fix the bar order through the factor levels: sort by measure, then by value
+  level_order <- order(melted$variable, melted$value, decreasing = TRUE)
+  melted$page <- factor(melted$page, levels = unique(melted$page[level_order]))
+  # "sum" is only used for the ordering; stacking it too would double each bar
+  melted <- melted[melted$variable != "sum", ]
+
+  # Build the title for the chosen direction
+  ranking <- if (decreasing) "most" else "least"
+  chart_title <- paste(
+    "Sum of page impressions and visits of the", number_to_display,
+    ranking, "visited pages",
+    sep = " "
+  )
+
+  # `value` is mapped to the x axis and `page` to the y axis, so the axis
+  # titles are "Sum" for x and "Page" for y (they were swapped before)
+  ggplot(melted, aes(value, page, label = value)) +
+    geom_bar(aes(fill = variable), position = "stack", stat = "identity") +
+    ggtitle(chart_title) +
+    xlab("Sum") + ylab("Page") +
+    scale_fill_discrete(labels = c("Visits", "Page impressions")) +
+    labs(fill = "") +
+    geom_text(size = 3, position = position_stack(vjust = .5)) +
+    theme(
+      legend.position = "top",
+      plot.title = element_text(hjust = 0.5, size = 18),
+      axis.title = element_text(size = 14, face = "bold")
+    )
+}
+
+# Chart of the 10 pages at the top of the decreasing order
+p1 <- fun_bar_chart(data_enr, number_to_display = 10, decreasing = TRUE)
+p1
+####### TASK A) 10 most used pages END#######
+
+
+
+
+####### TASK B) 10 least used pages START#######
+# Use the function of a)
+# Same chart with increasing order, i.e. the 10 pages with the smallest totals
+p2 <- fun_bar_chart(data_enr, number_to_display = 10, decreasing = FALSE)
+p2
+
+# View(melted)
+####### TASK B) 10 least used pages END#######
+
+
+
+
diff --git a/Prüfungsaufgabe.pdf b/Prüfungsaufgabe.pdf
diff --git a/README.pdf b/README.pdf
diff --git a/Sonstiges.R b/Sonstiges.R
@@ -0,0 +1,126 @@
+####### TASK A) 10 most used pages START#######
+# Add the sum of visits | sum of impressions
+# NOTE(review): this section reads page_impressions and page_visits, which are
+# only created by the "DIVIDE IN TWO TABLES" section further down in this file;
+# run that section first.
+month_cols <- names(page_impressions)[grep("[0-9]*[.][0-9]", names(page_impressions))]
+
+# Both tables have the same month col names!
+month_cols == names(page_visits)[grep("[0-9]*[.][0-9]", names(page_visits))]
+
+# Calculate the sums per page over all month columns, ignoring NAs
+sum_pi <- apply(page_impressions[, month_cols], 1, function(x) sum(x, na.rm = TRUE))
+sum_v <- apply(page_visits[, month_cols], 1, function(x) sum(x, na.rm = TRUE))
+page_impressions["sum"] <- sum_pi
+page_visits["sum"] <- sum_v
+
+# Order decreasing by sum
+page_impressions <- page_impressions[order(page_impressions$sum, decreasing = TRUE), ]
+head(page_impressions$sum)
+page_visits <- page_visits[order(page_visits$sum, decreasing = TRUE), ]
+head(page_visits$sum)
+
+# Get the 10 highest per class
+top10_imperssions <- page_impressions[page_impressions$page %in% head(page_impressions, 10)$page, ]
+top10_visits <- page_visits[page_visits$page %in% head(page_visits, 10)$page, ]
+
+# Plot the summed impressions per page. The previous geom_bar() counted rows,
+# which gives every page a bar of height 1; the aesthetics now name columns of
+# the plot data instead of using top10_imperssions$page.
+ggplot(data = top10_imperssions) +
+  geom_col(mapping = aes(x = page, y = sum))
+
+####### TASK A) 10 most used pages END#######
+
+
+
+
+
+
+
+
+
+####### DIVIDE IN TWO TABLES START #######
+# For the shiny app the Month must be detected by itself. Goal: two data frames
+# One for the page impressions and one for the page visits
+# Split the column names of the aggregated table by the measure they contain
+cols <- colnames(data_agg)
+cols_pi <- grep("pi", cols, value = TRUE)
+cols_v <- grep("v", cols, value = TRUE)
+
+# One table per measure, each keeping the page name as first column
+page_impressions <- data_agg[, c("page", cols_pi)]
+page_visits <- data_agg[, c("page", cols_v)]
+
+# Strip the "X" and the measure suffix from the column names
+names(page_impressions) <- gsub("X|([.]pi)", "", names(page_impressions))
+names(page_visits) <- gsub("X|[.]v", "", names(page_visits))
+
+####### DIVIDE IN TWO TABLES END #######
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+# UI definition: a title plus a sidebar layout with one slider and one plot
+ui <- fluidPage(
+
+  # Application title
+  titlePanel("Monthly visits and page impressions of open data pages in Berlin"),
+
+  # Sidebar with a slider; its value reaches the server as input$Month
+  sidebarLayout(
+    sidebarPanel(
+      sliderInput("Month",
+                  "Number of bins:",
+                  min = 1,
+                  max = 50,
+                  value = 30)
+    ),
+
+    # Main area showing the plot registered under the id "distPlot"
+    mainPanel(
+      plotOutput("distPlot")
+    )
+  )
+)
+
+# Server logic: renders a histogram of the second column of `faithful`
+server <- function(input, output) {
+
+  output$distPlot <- renderPlot({
+    # The slider in the UI is registered under the id "Month", so the number of
+    # bins arrives as input$Month; input$bins does not exist and would be NULL,
+    # which makes seq() fail
+    x    <- faithful[, 2]
+    bins <- seq(min(x), max(x), length.out = input$Month + 1)
+
+    # draw the histogram with the specified number of bins
+    hist(x, breaks = bins, col = 'darkgray', border = 'white',
+         xlab = 'Waiting time to next eruption (in mins)',
+         main = 'Histogram of waiting times')
+  })
+}
+
+# Start the Shiny app from the UI definition and the server function
+shinyApp(ui, server)
+
+
+