Skip to content

Commit

Permalink
Merge pull request #4 from iramler/main
Browse files Browse the repository at this point in the history
adding spring 2024 data from SLU seminar group
  • Loading branch information
ryurko authored Apr 17, 2024
2 parents f09d3b4 + 30baa44 commit 17d1fb3
Show file tree
Hide file tree
Showing 26 changed files with 8,478 additions and 1 deletion.
143 changes: 143 additions & 0 deletions _prep/NordicW10k/init-nordicw10k.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
---
title: "Scraping Data From FIS Website"
author: "Abigail Smith"
format: html
---

Load needed libraries

```{r}
library(rvest)
library(chromote)
library(tidyverse)
library(lubridate)
```

Pick a page to scrape from and read it with embedded chrome browser

```{r}
# Copy and paste the page's URL into the code below
url <- "https://live.fis-ski.com/cc-2257/results-pda.htm"
# Start a headless Chrome session (chromote); the results page is
# rendered by JavaScript, so plain read_html() would miss the table.
newpage <- ChromoteSession$new()
{
# Navigate, then block until the page's load event fires.
newpage$Page$navigate(url)
newpage$Page$loadEventFired()
}
# This opens a viewer window that displays the rendered page
newpage$view()
```

After you have what you want to scrape displayed in the browser, this code chunk will pull out all of the page's HTML elements.

```{r}
# Pull the fully rendered HTML out of the headless browser and
# re-parse it with rvest so html_elements()/html_text2() can be used.
elements <- newpage$Runtime$evaluate("document.querySelector('html').outerHTML")$result$value |>
read_html()
```

```{r}
# Close the headless browser session so that it does not time out
newpage$close()
```

Create a vector for each variable by piping `elements` into the `html_elements()` function. Each call uses a different CSS selector, which can be identified with the SelectorGadget tool.

```{r}
# Helper: extract the text of every element on the scraped page that
# matches the given CSS selector.
grab_col <- function(selector) {
  elements |>
    html_elements(selector) |>
    html_text2()
}
# One vector per results-table column, each identified by its CSS class.
Bib    <- grab_col(".col_bib")
Rank   <- grab_col(".col_rank")
Name   <- grab_col(".col_name")
NSA    <- grab_col(".col_nsa")
Result <- grab_col(".col_result")
Diff   <- grab_col(".col_diff")
```

Make a data frame with all the columns that you just created.

```{r}
# Combine the scraped column vectors into a single data frame.
x <- data.frame(Bib, Name, NSA, Rank, Result, Diff)
# The page repeats its header labels between sections; those rows carry
# the literal text "Bib" in the Bib column, so drop them.
header_rows <- x$Bib == "Bib"
x <- x[!header_rows, ]
```


Make the values in each variable numeric.

```{r}
# Convert the identifier and ranking columns from text to numbers.
numeric_cols <- c("Bib", "Rank", "Diff")
x[numeric_cols] <- lapply(x[numeric_cols], as.numeric)
```

Convert the values of the Result variable from minutes:seconds format into total seconds, then divide by 60 to express them in minutes, rounded to three decimal places.

```{r}
# Parse "m:ss" strings into lubridate periods, then flatten to seconds.
x$Result<-x$Result |> ms() |> period_to_seconds()
# NOTE(review): dividing seconds by 60 expresses Result in minutes
# (rounded to 3 decimals), not seconds -- confirm the intended units.
x$Result<-round(x$Result/60,3)
```

Right now we have data for 4 different points in the race all in one table; we want to split that up into four individual tables.


Create a vector of the distances being used as reference points for each of the four tables.

```{r}
# Checkpoint distances reported on the page (presumably km) --
# TODO confirm against the event's live-timing page.
Dist <- c(2,5.2,7.5,11)
```

Store the number of rows of the data frame, then divide it by 4 to get the number of rows per checkpoint. The webpage shows results at 4 different points in the race, so this lets us split the table into one group per distance.

```{r}
# The page stacks the results for all 4 checkpoints into one table,
# so each checkpoint accounts for a quarter of the rows.
rows <- nrow(x)
n <- rows / 4
```

Make groups for each distance and create columns for the rank and difference at each distance.

```{r}
# Split the stacked table into one block per checkpoint: rows come in
# four consecutive runs of n rows (checkpoints 1-4 in page order).
# NOTE(review): each slice stops 3 rows short of a multiple of n, so
# the last 3 rows of every block are dropped -- presumably non-skier
# label rows; confirm against the scraped page.
group1 <- x[1: (n-3),]
group2 <- x[(n+1):(2*n-3),]
group3 <- x[(2*n+1):(3*n-3),]
group4 <- x[(3*n+1):(4*n-3),]
# Tag each block with its checkpoint distance from Dist.
group1$Dist1 = Dist[1]
group2$Dist2 = Dist[2]
group3$Dist3 = Dist[3]
group4$Dist4 = Dist[4]
# Suffix the per-checkpoint columns (Rank/Result/Diff -> Rank_k etc.)
# so the four blocks can later be joined side by side on Bib/Name/NSA.
names(group1)[4:6]=c("Rank1", "Time1", "Diff1")
names(group2)[4:6]=c("Rank2", "Time2", "Diff2")
names(group3)[4:6]=c("Rank3", "Time3", "Diff3")
names(group4)[4:6]=c("Rank4", "Time4", "Diff4")
```

Join the groups into one big data frame and create a variable with the events code.

```{r}
# left_join() with no `by` joins on all shared columns (Bib, Name,
# NSA), lining up each skier's four checkpoint results in one row.
y<-group1 |> left_join(group2) |> left_join(group3) |> left_join(group4)
# Order by rank at the final checkpoint (the finish).
y <- arrange(y, Rank4)
# FIS event code for this race (from the URL, cc-2257).
y$event = 2257
```

Make the data frame into a csv file so that it will be saved permanently.

```{r}
# Write the final table to CSV so it is saved permanently;
# the user specifies the name and path to the file.
write.csv(y, "W10kTro_2257.csv", row.names = FALSE)
```

30 changes: 30 additions & 0 deletions _prep/UFC_stats/init-UFC_stats.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Cleaning process of Kaggle's UFC Data

# This prep file assumes you have already downloaded
# the data from https://www.kaggle.com/datasets/rajeevw/ufcdata

library(tidyverse)
library(lubridate)
# Read the raw fighter file, parse the percentage/measurement columns
# to numbers, derive height in inches and birth year, drop error rows,
# and write the cleaned CSV.
fight_data <- read.csv("raw_fighter_details.csv") # change to your local path

ufc_data <- fight_data |>
  mutate(
    Str_Acc = parse_number(Str_Acc),   # e.g. "55%" -> 55
    Weight  = parse_number(Weight),    # e.g. "155 lbs." -> 155
    Reach   = parse_number(Reach),     # e.g. 70" -> 70; blank -> NA
    Str_Def = parse_number(Str_Def),
    TD_Acc  = parse_number(TD_Acc),
    TD_Def  = parse_number(TD_Def),
    birthyear = year(mdy(DOB)),
    # Height is recorded like 5' 11": pull feet and inches separately,
    # then combine into total inches.
    feet   = as.numeric(str_extract(Height, "\\d+(?=')")),
    inches = as.numeric(str_extract(Height, "\\d+(?=\")")),
    height_inches = (feet * 12) + inches
  ) |>
  # dump error rows: zero striking accuracy (filter() also drops NA
  # Str_Acc here) and fighters with no recorded reach
  filter(Str_Acc != 0) |>
  # was `Reach != ""`: Reach is numeric after parse_number(), so the
  # string comparison only worked by coercion -- test for NA directly
  filter(!is.na(Reach)) |>
  select(-feet, -inches, -DOB, -Height) |> # remove intermediate columns
  rename(Height = height_inches, Birthyear = birthyear) |>
  relocate(fighter_name, Height, Weight, Reach, Stance, Birthyear, SLpM, Str_Acc,
           SApM, Str_Def, TD_Avg, TD_Acc, TD_Def, Sub_Avg)

# save file; row.names = FALSE keeps a spurious row-number column out
# of the CSV (consistent with the other prep scripts in this repo)
write.csv(ufc_data, "UFC_stats.csv", row.names = FALSE)
48 changes: 48 additions & 0 deletions _prep/WNBA20Yrs/init-WNBA20Yrs.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
---
title: "data_cleaning"
format: html
editor: visual
---

The following code can be used to recreate the `wnba_data.csv` file.

```{r}
library(wehoop)
library(tidyverse)
```

```{r}
# Pull WNBA team box scores for the 2003-2022 seasons via wehoop
# (presumably one record per team per game -- verify wehoop's grain).
team_data <- load_wnba_team_box(
  seasons = 2003:2022)
```

```{r}
# Keep only actual franchises; ids above 20 are non-team entries
# (presumed -- see the id listing produced by the next chunk).
team_data <- filter(team_data, team_id <= 20)
```

```{r}
# Inspect which team ids map to which franchises, using 2008 as a
# sample season.
team_data %>%
  filter(season == 2008) %>%
  select(team_id, team_display_name) %>%
  distinct() %>%
  arrange(team_id)
```

```{r}
# make new team name variable for teams that changed their name, so a
# single label covers every season of the franchise.
# NOTE(review): mapping is by team_id (3 and 17) -- confirm these ids
# against the listing printed by the previous chunk. case_when's TRUE
# branch keeps the displayed name for all other teams.
team_data <- team_data %>% mutate(team_name = case_when(team_id == 3 ~ 'Dallas Wings',
                                                        team_id == 17 ~ 'Las Vegas Aces',
                                                        TRUE ~ team_display_name))
```

```{r}
# Keep only the identifier and outcome columns used downstream.
wnba_data <- team_data %>%
  select(game_id, season, season_type, game_date, team_id,
         team_display_name, team_winner, opponent_team_id,
         team_home_away)
```

```{r}
# NOTE(review): this chunk repeats the previous select() verbatim --
# the comment said "filter" but nothing is filtered. The reassignment
# is redundant but harmless; consider dropping one of the two chunks.
wnba_data <- team_data %>% select(game_id, season, season_type, game_date, team_id, team_display_name, team_winner, opponent_team_id, team_home_away)
```

```{r}
# Write the cleaned data set out for the repository.
write.csv(wnba_data, "wnba_data.csv", row.names = FALSE)
```
105 changes: 105 additions & 0 deletions _prep/Womens_Tremblant_GS/init-womens_tremblant_gs.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
---
title: "Alpine Ski Data Scraping Description"
format: html
---

Loading necessary packages
```{r}
library(rvest)
library(chromote)
library(tidyverse)
library(lubridate)
```

We need a function to convert times to seconds. This function works for times over a minute.
```{r}
# Convert a vector of race times to total seconds.
#
# `time[1]` is the leader's time in "m:ss.xx" form; every remaining
# entry is a gap behind the leader such as "+1.23". Returns a numeric
# vector of seconds, same length and order as `time`.
fix_time <- function(time){
  n <- length(time)
  # Leader's time: parse minutes:seconds, then flatten to seconds.
  t1 <- time[1] |> ms() |> period_to_seconds()
  if (n < 2) {
    # Guard: with a single entry, the original `time[2:n]` would be
    # `time[2:1]` and silently produce a reversed 2-element result.
    return(t1)
  }
  gaps <- parse_number(time[2:n]) # "+1.23" -> 1.23
  c(t1, t1 + gaps)
}
```

Here is the main function to scrape the data and reformat it.
```{r}
## Scrape one alpine race from the FIS live-results pages and return a
## tidied tibble: one row per skier, with per-run times (in seconds)
## and per-run ranks in separate columns. `id_code` is the 4-digit
## code embedded in the results URL (e.g. lv-al5009.htm -> 5009).
get_ski <- function(id_code){
## building the url
first <- "https://live.fis-ski.com/lv-al"
end <- ".htm#/follow"
url <- paste(first, id_code, end, sep = "")
## opening up a headless browser session
newpage <- ChromoteSession$new()
{
newpage$Page$navigate(url)
newpage$Page$loadEventFired()
}
## waiting a second for the browser to load before extracting
## elements
Sys.sleep(1)
## extracting the fully rendered HTML and re-parsing it with rvest
elements <- newpage$Runtime$evaluate("document.querySelector('html').outerHTML")$result$value |>
read_html()
## closing the browser session
newpage$close()
## grabbing the relevant elements (cells of the sortable results table)
results_table<- elements |>
html_elements(".sortable") |>
html_text2()
## constructing a data frame; first matrix row is the table header.
## NOTE(review): assumes the page always has exactly 9 columns --
## the printed dim() is a manual sanity check for that.
results_matrix = matrix(results_table, ncol = 9,byrow = TRUE)
print(dim(results_matrix))
colnames(results_matrix) = results_matrix[1,]
results_matrix = results_matrix[-1,] |>
as_tibble()
## tidying the data and using the fix_time() function;
## rename the non-ASCII "N°" column to a safe name first
results_matrix1 <- results_matrix |> rename(Number = `N°`)
## Total: leader's "m:ss.xx" plus "+gap" strings -> seconds
results_matrix1$Total <- fix_time(results_matrix$Total)
results_matrix2 <- results_matrix1 |>
mutate(Pr=parse_number(Pr)) |>
## split "time (rank)" style cells at the "(" into two columns
separate(col = `Run 1`, into = c("Run_1_Time", "Run_1_Rank"),
sep = "\\(")|>
mutate(Run_1_Rank = parse_number(Run_1_Rank)) |>
separate(col = `Run 2`, into = c("Run_2_Time", "Run_2_Rank"),
sep = "\\(")|>
mutate(Run_2_Rank = parse_number(Run_2_Rank))
results_matrix2$Run_1_Time <- fix_time(results_matrix2$Run_1_Time)
results_matrix2$Run_2_Time <- fix_time(results_matrix2$Run_2_Time)
return(results_matrix2)
}
```

Testing out the scraping function with an id code. The id code comes from the 4 digit code in the url. For example, the url https://live.fis-ski.com/lv-al5009.htm#/follow has id code 5009.
```{r}
# Scrape the race with id code 5009 (from the URL lv-al5009.htm).
x <- get_ski(5009)
```

Renaming and reordering variables.
```{r}
# Rename to the final column names.
# NOTE(review): mapping assumes the bib number is the run-1 start
# order and the "N°" column the run-2 order -- confirm against the
# FIS page before reuse.
x <- x |> rename(Final_Rank = Rank,
                 Run2_Order = Number,
                 Run1_Order = Bib,
                 Total_Time = Total,
                 Rank_Diff = Pr,
                 Run1_Time = Run_1_Time,
                 Run1_Rank = Run_1_Rank,
                 Run2_Time = Run_2_Time,
                 Run2_Rank = Run_2_Rank) |>
  # Reorder columns by position; positions refer to the column order
  # produced by get_ski(), so this is fragile if the page layout changes.
  select(4, 5, 3, 8, 9, 2, 10, 11, 6, 1, 7)
x
```

Saving as csv
```{r}
# Save the cleaned race table; the user chooses the file name/path.
write.csv(x, "Tremblant.csv", row.names = FALSE)
```
Binary file added _prep/bull_riding/bullRider.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 17d1fb3

Please sign in to comment.