-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from iramler/main
adding spring 2024 data from SLU seminar group
- Loading branch information
Showing
26 changed files
with
8,478 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
--- | ||
title: "Scraping Data From FIS Website" | ||
author: "Abigail Smith" | ||
format: html | ||
--- | ||
|
||
Load needed libraries | ||
|
||
```{r}
# Load required packages:
# rvest for HTML parsing, chromote for a headless Chrome session,
# tidyverse for data wrangling, lubridate for time parsing.
library(rvest)
library(chromote)
library(tidyverse)
library(lubridate)
```
|
||
Pick a page to scrape from and read it with embedded chrome browser | ||
|
||
```{r}
# Copy and paste the page's URL into the code below.
url <- "https://live.fis-ski.com/cc-2257/results-pda.htm"
# Start a headless Chrome session.
newpage <- ChromoteSession$new()
{
  newpage$Page$navigate(url)
  # Block until the page's load event fires so the DOM is rendered.
  newpage$Page$loadEventFired()
}
# This opens a browser window that displays the page.
newpage$view()
```
|
||
After you have what you want to scrape displayed in the browser, this code chunk will pull out all of the page's HTML elements.
|
||
```{r}
# Pull the full rendered HTML out of the browser session, then parse it
# into an xml_document for element extraction.
page_source <- newpage$Runtime$evaluate(
  "document.querySelector('html').outerHTML"
)$result$value
elements <- read_html(page_source)
```
|
||
```{r}
# Close the browser session so that it does not time out while idle.
newpage$close()
```
|
||
Create a vector for each variable by piping `elements` into the `html_elements()` function. Each call uses a different CSS selector, which can be identified with the SelectorGadget tool.
|
||
```{r}
# Extract the text of every element matching a CSS class selector from
# the parsed page. Selectors were identified with the SelectorGadget
# browser tool. Returns a character vector, one entry per element.
scrape_col <- function(selector) {
  elements |>
    html_elements(selector) |>
    html_text2()
}

Bib    <- scrape_col(".col_bib")
Rank   <- scrape_col(".col_rank")
Name   <- scrape_col(".col_name")
NSA    <- scrape_col(".col_nsa")
Result <- scrape_col(".col_result")
Diff   <- scrape_col(".col_diff")
```
|
||
Make a data frame with all the columns that you just created. | ||
|
||
```{r}
# Assemble the scraped columns into a single data frame.
x <- data.frame(Bib, Name, NSA, Rank, Result, Diff)
# Repeated header rows carry the literal label "Bib" in the Bib column;
# keep only rows that describe actual skiers.
is_skier_row <- x$Bib != "Bib"
x <- x[is_skier_row, ]
```
|
||
|
||
Make the values in each variable numeric. | ||
|
||
```{r}
# Convert the identifier and ranking columns from character to numeric.
# (Values that cannot be parsed become NA with a warning.)
num_cols <- c("Bib", "Rank", "Diff")
x[num_cols] <- lapply(x[num_cols], as.numeric)
```
|
||
Convert the values of the Result variable from minutes:seconds format into total seconds, then divide by 60 to express them in minutes, rounded to three decimal places.
|
||
```{r}
# Parse "minutes:seconds" results into total seconds, then express them
# in minutes, rounded to three decimal places.
result_seconds <- period_to_seconds(ms(x$Result))
x$Result <- round(result_seconds / 60, 3)
```
|
||
Right now we have data for 4 different points in the race all in one table; we want to split it into four individual tables, one per checkpoint.
|
||
|
||
Create a vector of the distances being used as reference points for each of the four tables. | ||
|
||
```{r}
# Distances at which intermediate results are reported (presumably in
# km — confirm against the event page); one table per distance below.
Dist <- c(2,5.2,7.5,11)
```
|
||
Store the number of rows in the data frame, then divide it by 4 to get `n`, the number of skiers. The webpage shows results at 4 different points in the race, so each skier appears four times; `n` lets us split the table into one group per distance.
|
||
```{r}
# Total number of result rows. The page reports results at 4 points in
# the race, so each skier appears 4 times; n is the number of skiers.
rows <- nrow(x)
n <- rows / 4
```
|
||
Make groups for each distance and create columns for the rank and difference at each distance. | ||
|
||
```{r}
# Split the stacked table into the four distance checkpoints: each
# quarter of the rows is one checkpoint. The upper index (k*n - 3)
# trims 3 rows from the end of each quarter — presumably leftover
# label/padding rows; TODO confirm against the scraped page.
group1 <- x[1: (n-3),]
group2 <- x[(n+1):(2*n-3),]
group3 <- x[(2*n+1):(3*n-3),]
group4 <- x[(3*n+1):(4*n-3),]
# Tag each group with its checkpoint distance.
group1$Dist1 = Dist[1]
group2$Dist2 = Dist[2]
group3$Dist3 = Dist[3]
group4$Dist4 = Dist[4]
# Rename the Rank/Result/Diff columns (positions 4:6) so the names are
# unique per checkpoint before the groups are joined back together.
names(group1)[4:6]=c("Rank1", "Time1", "Diff1")
names(group2)[4:6]=c("Rank2", "Time2", "Diff2")
names(group3)[4:6]=c("Rank3", "Time3", "Diff3")
names(group4)[4:6]=c("Rank4", "Time4", "Diff4")
```
|
||
Join the groups into one big data frame and create a variable with the events code. | ||
|
||
```{r}
# Join the four checkpoint tables on their shared skier columns, order
# by final rank, and record the FIS event code.
y <- Reduce(left_join, list(group1, group2, group3, group4))
y <- arrange(y, Rank4)
y$event <- 2257
```
|
||
Make the data frame into a csv file so that it will be saved permanently. | ||
|
||
```{r}
# User specifies the name and path of the output file; row.names =
# FALSE avoids writing a spurious index column.
write.csv(y, "W10kTro_2257.csv", row.names = FALSE)
```
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Cleaning process of Kaggle's UFC Data | ||
|
||
# This prep file assumes you have already downloaded | ||
# the data from https://www.kaggle.com/datasets/rajeevw/ufcdata | ||
|
||
# Load tidyverse for data wrangling and lubridate for date parsing.
library(tidyverse)
library(lubridate)
# Raw fighter stats downloaded from Kaggle.
fight_data <- read.csv("raw_fighter_details.csv") # change to your local path
|
||
# Clean the raw fighter table: parse numeric fields out of their string
# representations, derive birth year and height in inches, drop rows
# with missing/zero stats, and put columns in a sensible order.
ufc_data <- fight_data |>
  mutate(
    # parse_number strips non-numeric characters (presumably percent
    # signs and units in the raw Kaggle export — confirm against data)
    Str_Acc = parse_number(Str_Acc),
    Weight = parse_number(Weight),
    Reach = parse_number(Reach),
    Str_Def = parse_number(Str_Def),
    TD_Acc = parse_number(TD_Acc),
    TD_Def = parse_number(TD_Def),
    birthyear = year(mdy(DOB)),
    # Height is recorded like 5'11" — split into feet and inches via
    # lookahead regexes, then combine into total inches.
    feet = as.numeric(str_extract(Height, "\\d+(?=')")),
    inches = as.numeric(str_extract(Height, "\\d+(?=\")")),
    height_inches = (feet * 12) + inches
  ) |>
  filter(Str_Acc != 0) |>   # dump error rows with no recorded accuracy
  filter(!is.na(Reach)) |>  # dump rows whose Reach failed to parse
                            # (was `Reach != ""`, comparing a numeric
                            # column to a string — same effect, clearer)
  select(-feet, -inches, -DOB, -Height) |>  # remove intermediate columns
  rename(Height = height_inches, Birthyear = birthyear) |>
  relocate(fighter_name, Height, Weight, Reach, Stance, Birthyear, SLpM, Str_Acc,
           SApM, Str_Def, TD_Avg, TD_Acc, TD_Def, Sub_Avg)
|
||
# Save the cleaned file. row.names = FALSE prevents write.csv from
# adding a spurious index column (consistent with the repo's other
# prep scripts).
write.csv(ufc_data, "UFC_stats.csv", row.names = FALSE)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
--- | ||
title: "data_cleaning" | ||
format: html | ||
editor: visual | ||
--- | ||
|
||
The following code can be used to recreate the `wnba_data.csv` file. | ||
|
||
```{r}
# wehoop provides the WNBA data loaders; tidyverse for wrangling.
library(wehoop)
library(tidyverse)
```
|
||
```{r}
# Download team box scores for the 2003-2022 WNBA seasons.
team_data <- load_wnba_team_box(
  seasons = 2003:2022)
```
|
||
```{r}
# Filter for actual teams (franchise ids are 20 or below).
team_data <- filter(team_data, team_id <= 20)
```
|
||
```{r}
# Inspect the distinct teams present in one season, ordered by id.
team_data |>
  filter(season == 2008) |>
  select(team_id, team_display_name) |>
  distinct() |>
  arrange(team_id)
```
|
||
```{r}
# Give franchises that changed their name one consistent team name;
# all other teams keep their display name.
team_data <- team_data |>
  mutate(
    team_name = case_when(
      team_id == 3 ~ "Dallas Wings",
      team_id == 17 ~ "Las Vegas Aces",
      TRUE ~ team_display_name
    )
  )
```
|
||
```{r}
# Keep only the variables needed for the final data set.
keep_vars <- c(
  "game_id", "season", "season_type", "game_date", "team_id",
  "team_display_name", "team_winner", "opponent_team_id", "team_home_away"
)
wnba_data <- select(team_data, all_of(keep_vars))
```
|
||
```{r}
# NOTE(review): this chunk duplicates the previous one — it re-selects
# the same columns from team_data and performs no filtering (despite
# the original "Filter for what we need" comment). The result is
# identical to the chunk above; consider removing one of the two.
wnba_data <- team_data %>% select(game_id, season, season_type, game_date, team_id, team_display_name, team_winner, opponent_team_id, team_home_away)
```
|
||
```{r}
# Save the final data set; row.names = FALSE avoids an index column.
wnba_data %>% write.csv("wnba_data.csv", row.names = FALSE)
```
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
--- | ||
title: "Alpine Ski Data Scraping Description" | ||
format: html | ||
--- | ||
|
||
Loading necessary packages | ||
```{r}
# rvest for HTML parsing, chromote for a headless Chrome session,
# tidyverse for wrangling, lubridate for time parsing.
library(rvest)
library(chromote)
library(tidyverse)
library(lubridate)
```
|
||
We need a function to convert times to seconds. This function works for times over a minute. | ||
```{r}
# Convert a vector of race times to absolute seconds. The first entry
# is the leader's time in "minutes:seconds" form; any remaining entries
# are "+x.xx" gaps behind the leader. Returns a numeric vector of
# absolute times in seconds, same length as the input.
# Note: only handles leader times of at least one minute ("m:s").
fix_time <- function(time) {
  n <- length(time)
  leader_seconds <- time[1] |> ms() |> period_to_seconds()
  if (n < 2) {
    # No trailing "+diff" entries; original code indexed time[2:1]
    # here, which produced garbage for a length-1 input.
    return(leader_seconds)
  }
  # Strip the "+" and add each gap to the leader's time.
  gaps <- parse_number(time[2:n])
  c(leader_seconds, leader_seconds + gaps)
}
```
|
||
Here is the main function to scrape the data and reformat it. | ||
```{r}
# Scrape the live results table for one FIS alpine event and return it
# as a tidy tibble. id_code is the 4-digit code embedded in the URL.
get_ski <- function(id_code){
  ## building the url
  first <- "https://live.fis-ski.com/lv-al"
  end <- ".htm#/follow"
  url <- paste(first, id_code, end, sep = "")
  ## opening up a headless browser session
  newpage <- ChromoteSession$new()
  {
    newpage$Page$navigate(url)
    newpage$Page$loadEventFired()
  }
  ## waiting a second for the browser to load before extracting
  ## elements
  Sys.sleep(1)
  ## extracting the full rendered HTML and parsing it
  elements <- newpage$Runtime$evaluate("document.querySelector('html').outerHTML")$result$value |>
    read_html()
  ## closing the browser session so it does not time out
  newpage$close()
  ## grabbing the relevant elements (the sortable results table)
  results_table<- elements |>
    html_elements(".sortable") |>
    html_text2()
  ## constructing a data frame: the flat text vector is folded row-wise
  ## into a 9-column matrix; the first matrix row holds column names
  results_matrix = matrix(results_table, ncol = 9,byrow = TRUE)
  print(dim(results_matrix))
  colnames(results_matrix) = results_matrix[1,]
  results_matrix = results_matrix[-1,] |>
    as_tibble()
  ## tidying the data and using the fix_time() function to convert the
  ## leader's "m:s" total plus "+diff" gaps into absolute seconds
  results_matrix1 <- results_matrix |> rename(Number = `N°`)
  results_matrix1$Total <- fix_time(results_matrix$Total)
  ## run columns look like "time(rank" — split on the "(" and parse the
  ## rank portion to a number
  results_matrix2 <- results_matrix1 |>
    mutate(Pr=parse_number(Pr)) |>
    separate(col = `Run 1`, into = c("Run_1_Time", "Run_1_Rank"),
             sep = "\\(")|>
    mutate(Run_1_Rank = parse_number(Run_1_Rank)) |>
    separate(col = `Run 2`, into = c("Run_2_Time", "Run_2_Rank"),
             sep = "\\(")|>
    mutate(Run_2_Rank = parse_number(Run_2_Rank))
  results_matrix2$Run_1_Time <- fix_time(results_matrix2$Run_1_Time)
  results_matrix2$Run_2_Time <- fix_time(results_matrix2$Run_2_Time)
  return(results_matrix2)
}
```
``` | ||
|
||
Testing out the scraping function with an id code. The id code comes from the 4 digit code in the url. For example, the url https://live.fis-ski.com/lv-al5009.htm#/follow has id code 5009. | ||
```{r}
# Scrape the event with id code 5009 (from the 4-digit code in its URL).
x <- get_ski(5009)
```
|
||
Renaming and reordering variables. | ||
```{r}
# Rename columns to consistent names, then reorder them by position.
# NOTE(review): select() uses column positions, which depend on the
# exact column order produced by get_ski() — confirm if that changes.
x <- x |> rename(Final_Rank = Rank,
                 Run2_Order = Number,
                 Run1_Order = Bib,
                 Total_Time = Total,
                 Rank_Diff = Pr,
                 Run1_Time = Run_1_Time,
                 Run1_Rank = Run_1_Rank,
                 Run2_Time = Run_2_Time,
                 Run2_Rank = Run_2_Rank) |>
  select(4, 5, 3, 8, 9, 2, 10, 11, 6, 1, 7)
x
```
|
||
Saving as csv | ||
```{r}
# Save the cleaned results; row.names = FALSE avoids an index column.
write.csv(x, "Tremblant.csv", row.names = FALSE)
```
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.