Skip to content

Commit

Permalink
Merge pull request #4 from iramler/main
Browse files Browse the repository at this point in the history
adding spring 2024 data from SLU seminar group
  • Loading branch information
ryurko authored Apr 17, 2024
2 parents f09d3b4 + 30baa44 commit 17d1fb3
Show file tree
Hide file tree
Showing 26 changed files with 8,478 additions and 1 deletion.
143 changes: 143 additions & 0 deletions _prep/NordicW10k/init-nordicw10k.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
---
title: "Scraping Data From FIS Website"
author: "Abigail Smith"
format: html
---

Load needed libraries

```{r}
library(rvest)
library(chromote)
library(tidyverse)
library(lubridate)
```

Pick a page to scrape from and read it with embedded chrome browser

```{r}
# Copy and paste the page's URL into the code below
url <- "https://live.fis-ski.com/cc-2257/results-pda.htm"
# Start a headless Chrome session (chromote); the results page is
# rendered by JavaScript, so plain read_html() would miss the table.
newpage <- ChromoteSession$new()
{
# Navigate, then block until the page's load event fires.
newpage$Page$navigate(url)
newpage$Page$loadEventFired()
}
# This opens a viewer window that displays the rendered page
newpage$view()
```

After you have what you want to scrape displayed in the browser, this code chunk will pull out all of the page's HTML elements.

```{r}
# Pull the fully rendered HTML out of the headless browser and
# re-parse it with rvest so html_elements()/html_text2() can be used.
elements <- newpage$Runtime$evaluate("document.querySelector('html').outerHTML")$result$value |>
read_html()
```

```{r}
# Close the headless browser session so that it does not time out
newpage$close()
```

Create a vector for each variable by piping `elements` into the `html_elements()` function. Each call uses a different CSS selector, which can be identified with the SelectorGadget tool.

```{r}
# Helper: extract the text of every element on the scraped page that
# matches the given CSS selector.
grab_col <- function(selector) {
  elements |>
    html_elements(selector) |>
    html_text2()
}
# One vector per results-table column, each identified by its CSS class.
Bib    <- grab_col(".col_bib")
Rank   <- grab_col(".col_rank")
Name   <- grab_col(".col_name")
NSA    <- grab_col(".col_nsa")
Result <- grab_col(".col_result")
Diff   <- grab_col(".col_diff")
```

Make a data frame with all the columns that you just created.

```{r}
# Combine the scraped column vectors into a single data frame.
x <- data.frame(Bib, Name, NSA, Rank, Result, Diff)
# The page repeats its header labels between sections; those rows carry
# the literal text "Bib" in the Bib column, so drop them.
header_rows <- x$Bib == "Bib"
x <- x[!header_rows, ]
```


Make the values in each variable numeric.

```{r}
# Convert the identifier and ranking columns from text to numbers.
numeric_cols <- c("Bib", "Rank", "Diff")
x[numeric_cols] <- lapply(x[numeric_cols], as.numeric)
```

Convert the values of the Result variable from minutes:seconds format into total seconds, then divide by 60 to express them in minutes, rounded to three decimal places.

```{r}
# Parse "m:ss" strings into lubridate periods, then flatten to seconds.
x$Result<-x$Result |> ms() |> period_to_seconds()
# NOTE(review): dividing seconds by 60 expresses Result in minutes
# (rounded to 3 decimals), not seconds -- confirm the intended units.
x$Result<-round(x$Result/60,3)
```

Right now we have data for 4 different points in the race all in one table; we want to split that up into four individual tables.


Create a vector of the distances being used as reference points for each of the four tables.

```{r}
# Checkpoint distances reported on the page (presumably km) --
# TODO confirm against the event's live-timing page.
Dist <- c(2,5.2,7.5,11)
```

Store the number of rows of the data frame, then divide it by 4 to get the number of rows per checkpoint. The webpage shows results at 4 different points in the race, so this lets us split the table into one group per distance.

```{r}
# The page stacks the results for all 4 checkpoints into one table,
# so each checkpoint accounts for a quarter of the rows.
rows <- nrow(x)
n <- rows / 4
```

Make groups for each distance and create columns for the rank and difference at each distance.

```{r}
# Split the stacked table into one block per checkpoint: rows come in
# four consecutive runs of n rows (checkpoints 1-4 in page order).
# NOTE(review): each slice stops 3 rows short of a multiple of n, so
# the last 3 rows of every block are dropped -- presumably non-skier
# label rows; confirm against the scraped page.
group1 <- x[1: (n-3),]
group2 <- x[(n+1):(2*n-3),]
group3 <- x[(2*n+1):(3*n-3),]
group4 <- x[(3*n+1):(4*n-3),]
# Tag each block with its checkpoint distance from Dist.
group1$Dist1 = Dist[1]
group2$Dist2 = Dist[2]
group3$Dist3 = Dist[3]
group4$Dist4 = Dist[4]
# Suffix the per-checkpoint columns (Rank/Result/Diff -> Rank_k etc.)
# so the four blocks can later be joined side by side on Bib/Name/NSA.
names(group1)[4:6]=c("Rank1", "Time1", "Diff1")
names(group2)[4:6]=c("Rank2", "Time2", "Diff2")
names(group3)[4:6]=c("Rank3", "Time3", "Diff3")
names(group4)[4:6]=c("Rank4", "Time4", "Diff4")
```

Join the groups into one big data frame and create a variable with the events code.

```{r}
# left_join() with no `by` joins on all shared columns (Bib, Name,
# NSA), lining up each skier's four checkpoint results in one row.
y<-group1 |> left_join(group2) |> left_join(group3) |> left_join(group4)
# Order by rank at the final checkpoint (the finish).
y <- arrange(y, Rank4)
# FIS event code for this race (from the URL, cc-2257).
y$event = 2257
```

Make the data frame into a csv file so that it will be saved permanently.

```{r}
# Write the final table to CSV so it is saved permanently;
# the user specifies the name and path to the file.
write.csv(y, "W10kTro_2257.csv", row.names = FALSE)
```

30 changes: 30 additions & 0 deletions _prep/UFC_stats/init-UFC_stats.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Cleaning process of Kaggle's UFC Data

# This prep file assumes you have already downloaded
# the data from https://www.kaggle.com/datasets/rajeevw/ufcdata

library(tidyverse)
library(lubridate)
# Read the raw fighter file, parse the percentage/measurement columns
# to numbers, derive height in inches and birth year, drop error rows,
# and write the cleaned CSV.
fight_data <- read.csv("raw_fighter_details.csv") # change to your local path

ufc_data <- fight_data |>
  mutate(
    Str_Acc = parse_number(Str_Acc),   # e.g. "55%" -> 55
    Weight  = parse_number(Weight),    # e.g. "155 lbs." -> 155
    Reach   = parse_number(Reach),     # e.g. 70" -> 70; blank -> NA
    Str_Def = parse_number(Str_Def),
    TD_Acc  = parse_number(TD_Acc),
    TD_Def  = parse_number(TD_Def),
    birthyear = year(mdy(DOB)),
    # Height is recorded like 5' 11": pull feet and inches separately,
    # then combine into total inches.
    feet   = as.numeric(str_extract(Height, "\\d+(?=')")),
    inches = as.numeric(str_extract(Height, "\\d+(?=\")")),
    height_inches = (feet * 12) + inches
  ) |>
  # dump error rows: zero striking accuracy (filter() also drops NA
  # Str_Acc here) and fighters with no recorded reach
  filter(Str_Acc != 0) |>
  # was `Reach != ""`: Reach is numeric after parse_number(), so the
  # string comparison only worked by coercion -- test for NA directly
  filter(!is.na(Reach)) |>
  select(-feet, -inches, -DOB, -Height) |> # remove intermediate columns
  rename(Height = height_inches, Birthyear = birthyear) |>
  relocate(fighter_name, Height, Weight, Reach, Stance, Birthyear, SLpM, Str_Acc,
           SApM, Str_Def, TD_Avg, TD_Acc, TD_Def, Sub_Avg)

# save file; row.names = FALSE keeps a spurious row-number column out
# of the CSV (consistent with the other prep scripts in this repo)
write.csv(ufc_data, "UFC_stats.csv", row.names = FALSE)
48 changes: 48 additions & 0 deletions _prep/WNBA20Yrs/init-WNBA20Yrs.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
---
title: "data_cleaning"
format: html
editor: visual
---

The following code can be used to recreate the `wnba_data.csv` file.

```{r}
library(wehoop)
library(tidyverse)
```

```{r}
# Pull WNBA team box scores for the 2003-2022 seasons via wehoop
# (presumably one record per team per game -- verify wehoop's grain).
team_data <- load_wnba_team_box(
  seasons = 2003:2022)
```

```{r}
# Keep only actual franchises; ids above 20 are non-team entries
# (presumed -- see the id listing produced by the next chunk).
team_data <- filter(team_data, team_id <= 20)
```

```{r}
# Inspect which team ids map to which franchises, using 2008 as a
# sample season.
team_data %>%
  filter(season == 2008) %>%
  select(team_id, team_display_name) %>%
  distinct() %>%
  arrange(team_id)
```

```{r}
# make new team name variable for teams that changed their name, so a
# single label covers every season of the franchise.
# NOTE(review): mapping is by team_id (3 and 17) -- confirm these ids
# against the listing printed by the previous chunk. case_when's TRUE
# branch keeps the displayed name for all other teams.
team_data <- team_data %>% mutate(team_name = case_when(team_id == 3 ~ 'Dallas Wings',
                                                        team_id == 17 ~ 'Las Vegas Aces',
                                                        TRUE ~ team_display_name))
```

```{r}
# Keep only the identifier and outcome columns used downstream.
wnba_data <- team_data %>%
  select(game_id, season, season_type, game_date, team_id,
         team_display_name, team_winner, opponent_team_id,
         team_home_away)
```

```{r}
# NOTE(review): this chunk repeats the previous select() verbatim --
# the comment said "filter" but nothing is filtered. The reassignment
# is redundant but harmless; consider dropping one of the two chunks.
wnba_data <- team_data %>% select(game_id, season, season_type, game_date, team_id, team_display_name, team_winner, opponent_team_id, team_home_away)
```

```{r}
# Write the cleaned data set out for the repository.
write.csv(wnba_data, "wnba_data.csv", row.names = FALSE)
```
105 changes: 105 additions & 0 deletions _prep/Womens_Tremblant_GS/init-womens_tremblant_gs.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
---
title: "Alpine Ski Data Scraping Description"
format: html
---

Loading necessary packages
```{r}
library(rvest)
library(chromote)
library(tidyverse)
library(lubridate)
```

We need a function to convert times to seconds. This function works for times over a minute.
```{r}
# Convert a vector of race times to total seconds.
#
# `time[1]` is the leader's time in "m:ss.xx" form; every remaining
# entry is a gap behind the leader such as "+1.23". Returns a numeric
# vector of seconds, same length and order as `time`.
fix_time <- function(time){
  n <- length(time)
  # Leader's time: parse minutes:seconds, then flatten to seconds.
  t1 <- time[1] |> ms() |> period_to_seconds()
  if (n < 2) {
    # Guard: with a single entry, the original `time[2:n]` would be
    # `time[2:1]` and silently produce a reversed 2-element result.
    return(t1)
  }
  gaps <- parse_number(time[2:n]) # "+1.23" -> 1.23
  c(t1, t1 + gaps)
}
```

Here is the main function to scrape the data and reformat it.
```{r}
## Scrape one alpine race from the FIS live-results pages and return a
## tidied tibble: one row per skier, with per-run times (in seconds)
## and per-run ranks in separate columns. `id_code` is the 4-digit
## code embedded in the results URL (e.g. lv-al5009.htm -> 5009).
get_ski <- function(id_code){
## building the url
first <- "https://live.fis-ski.com/lv-al"
end <- ".htm#/follow"
url <- paste(first, id_code, end, sep = "")
## opening up a headless browser session
newpage <- ChromoteSession$new()
{
newpage$Page$navigate(url)
newpage$Page$loadEventFired()
}
## waiting a second for the browser to load before extracting
## elements
Sys.sleep(1)
## extracting the fully rendered HTML and re-parsing it with rvest
elements <- newpage$Runtime$evaluate("document.querySelector('html').outerHTML")$result$value |>
read_html()
## closing the browser session
newpage$close()
## grabbing the relevant elements (cells of the sortable results table)
results_table<- elements |>
html_elements(".sortable") |>
html_text2()
## constructing a data frame; first matrix row is the table header.
## NOTE(review): assumes the page always has exactly 9 columns --
## the printed dim() is a manual sanity check for that.
results_matrix = matrix(results_table, ncol = 9,byrow = TRUE)
print(dim(results_matrix))
colnames(results_matrix) = results_matrix[1,]
results_matrix = results_matrix[-1,] |>
as_tibble()
## tidying the data and using the fix_time() function;
## rename the non-ASCII "N°" column to a safe name first
results_matrix1 <- results_matrix |> rename(Number = `N°`)
## Total: leader's "m:ss.xx" plus "+gap" strings -> seconds
results_matrix1$Total <- fix_time(results_matrix$Total)
results_matrix2 <- results_matrix1 |>
mutate(Pr=parse_number(Pr)) |>
## split "time (rank)" style cells at the "(" into two columns
separate(col = `Run 1`, into = c("Run_1_Time", "Run_1_Rank"),
sep = "\\(")|>
mutate(Run_1_Rank = parse_number(Run_1_Rank)) |>
separate(col = `Run 2`, into = c("Run_2_Time", "Run_2_Rank"),
sep = "\\(")|>
mutate(Run_2_Rank = parse_number(Run_2_Rank))
results_matrix2$Run_1_Time <- fix_time(results_matrix2$Run_1_Time)
results_matrix2$Run_2_Time <- fix_time(results_matrix2$Run_2_Time)
return(results_matrix2)
}
```

Testing out the scraping function with an id code. The id code comes from the 4 digit code in the url. For example, the url https://live.fis-ski.com/lv-al5009.htm#/follow has id code 5009.
```{r}
# Scrape the race with id code 5009 (from the URL lv-al5009.htm).
x <- get_ski(5009)
```

Renaming and reordering variables.
```{r}
# Rename to the final column names.
# NOTE(review): mapping assumes the bib number is the run-1 start
# order and the "N°" column the run-2 order -- confirm against the
# FIS page before reuse.
x <- x |> rename(Final_Rank = Rank,
                 Run2_Order = Number,
                 Run1_Order = Bib,
                 Total_Time = Total,
                 Rank_Diff = Pr,
                 Run1_Time = Run_1_Time,
                 Run1_Rank = Run_1_Rank,
                 Run2_Time = Run_2_Time,
                 Run2_Rank = Run_2_Rank) |>
  # Reorder columns by position; positions refer to the column order
  # produced by get_ski(), so this is fragile if the page layout changes.
  select(4, 5, 3, 8, 9, 2, 10, 11, 6, 1, 7)
x
```

Saving as csv
```{r}
# Save the cleaned race table; the user chooses the file name/path.
write.csv(x, "Tremblant.csv", row.names = FALSE)
```
Binary file added _prep/bull_riding/bullRider.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 17d1fb3

Please sign in to comment.