From f9d7e9f8d356e7b7026dd11a26d609a132ec71fa Mon Sep 17 00:00:00 2001
From: aymennasri
Date: Wed, 29 Jan 2025 10:50:27 +0100
Subject: [PATCH 1/2] Collects players info from fbref.com but misses the age column due to it being inside a tag.

---
 NAMESPACE                |  1 +
 R/fb_player_attributes.R | 57 ++++++++++++++++++++++++++++++++++++++++
 man/get_player_info.Rd   | 17 ++++++++++++
 3 files changed, 75 insertions(+)
 create mode 100644 R/fb_player_attributes.R
 create mode 100644 man/get_player_info.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 78dabe42..b40b7556 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -31,6 +31,7 @@ export(get_match_results)
 export(get_match_shooting)
 export(get_match_summary)
 export(get_match_urls)
+export(get_player_info)
 export(get_player_market_values)
 export(get_season_team_stats)
 export(get_team_match_results)
diff --git a/R/fb_player_attributes.R b/R/fb_player_attributes.R
new file mode 100644
index 00000000..a1fc8bf4
--- /dev/null
+++ b/R/fb_player_attributes.R
@@ -0,0 +1,57 @@
+#' Get Player Attributes
+#'
+#' Return a data frame of a player's info
+#'
+#' @param url URL of the player's profile page on fbref.com
+#'
+#' @return A data frame
+#' @export
+
+get_player_info <- function(url) {
+  page <- xml2::read_html(url)
+
+  full_name <- page %>% rvest::html_node("h1") %>% rvest::html_text2() %>% stringr::str_trim()
+
+  position_node <- page %>% rvest::html_node(xpath = "//p[contains(., 'Position:')]")
+  position_footed <- position_node %>% rvest::html_text2() %>% stringr::str_trim()
+  position <- stringr::str_split(position_footed, "▪", 2)[[1]][1] %>%
+    stringr::str_remove("Position:") %>% stringr::str_trim()
+  footed <- stringr::str_split(position_footed, "▪", 2)[[1]][2] %>%
+    stringr::str_remove("Footed:") %>% stringr::str_trim()
+
+  height_weight <- page %>% rvest::html_node(xpath = "//p[contains(., 'cm')]") %>% rvest::html_text2()
+  height <- stringr::str_extract(height_weight, "\\d+cm")
+  weight <- stringr::str_extract(height_weight, "\\d+kg")
+
+  birth_date <- page %>% rvest::html_node("#necro-birth") %>% rvest::html_attr("data-birth")
+  age <- page %>% rvest::html_node(xpath = "//nobr[contains(., 'Age:')]") %>% rvest::html_text2() %>%
+    stringr::str_extract("\\d+-\\d+d")
+
+  birth_place <- page %>% rvest::html_node(xpath = "//p[contains(., 'Born:')]//span[contains(., 'in ')]") %>%
+    rvest::html_text2() %>% stringr::str_remove("^in ") %>% stringr::str_trim()
+
+  national_team <- page %>% rvest::html_node(xpath = "//p[contains(., 'National Team:')]/a") %>% rvest::html_text2()
+  club <- page %>% rvest::html_node(xpath = "//p[contains(., 'Club:')]/a") %>% rvest::html_text2()
+
+  wages <- page %>% rvest::html_node(".important.poptip") %>% rvest::html_text2() %>% stringr::str_trim()
+
+  twitter <- page %>% rvest::html_node(xpath = "//p[contains(., 'Twitter:')]/a") %>% rvest::html_text2()
+  instagram <- page %>% rvest::html_node(xpath = "//p[contains(., 'Instagram:')]/a") %>% rvest::html_text2()
+
+  data.frame(
+    full_name = .replace_empty_na(full_name),
+    position = .replace_empty_na(position),
+    footed = .replace_empty_na(footed),
+    height = .replace_empty_na(height),
+    weight = .replace_empty_na(weight),
+    birth_date = .replace_empty_na(birth_date),
+    age = .replace_empty_na(age),
+    birth_place = .replace_empty_na(birth_place),
+    national_team = .replace_empty_na(national_team),
+    club = .replace_empty_na(club),
+    wages = .replace_empty_na(wages),
+    twitter = .replace_empty_na(twitter),
+    instagram = .replace_empty_na(instagram),
+    stringsAsFactors = FALSE
+  )
+}
diff --git a/man/get_player_info.Rd b/man/get_player_info.Rd
new file mode 100644
index 00000000..697faaa2
--- /dev/null
+++ b/man/get_player_info.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/fb_player_attributes.R
+\name{get_player_info}
+\alias{get_player_info}
+\title{Get Player Attributes}
+\usage{
+get_player_info(url)
+}
+\arguments{
+\item{url}{URL of the player's profile page on fbref.com}
+}
+\value{
+A data frame
+}
+\description{
+Return a data frame of a player's info
+}

From fdbc25b6f76981b85fcd0ec8d3e56c42c3f1cd71 Mon Sep 17 00:00:00 2001
From: aymennasri
Date: Thu, 30 Jan 2025 22:40:10 +0100
Subject: [PATCH 2/2] Added manual age calculation instead of scraping it.

---
Notes:
  The two recomputed-birthday parses pass format = "%Y-%m-%d" to as.Date().
  Without an explicit format, as.Date() raises an error for an impossible
  date such as Feb 29 in a non-leap year, so the is.na() fallbacks to
  March 1 could never run. With the format given, the parse yields NA and
  the fallback applies. The post-image blob id on the index line below
  predates this amendment.

 R/fb_player_attributes.R | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/R/fb_player_attributes.R b/R/fb_player_attributes.R
index a1fc8bf4..311440ba 100644
--- a/R/fb_player_attributes.R
+++ b/R/fb_player_attributes.R
@@ -24,8 +24,40 @@ get_player_info <- function(url) {
   weight <- stringr::str_extract(height_weight, "\\d+kg")
 
   birth_date <- page %>% rvest::html_node("#necro-birth") %>% rvest::html_attr("data-birth")
-  age <- page %>% rvest::html_node(xpath = "//nobr[contains(., 'Age:')]") %>% rvest::html_text2() %>%
-    stringr::str_extract("\\d+-\\d+d")
+
+  # Calculates age as "<years>-<days>d" from birth_date (stays NA when birth_date is missing)
+  age <- NA
+  birth_date_clean <- .replace_empty_na(birth_date)
+
+  if (!is.na(birth_date_clean)) {
+    birth_date_date <- as.Date(birth_date_clean)
+    today <- Sys.Date()
+
+    # Calculates years
+    years <- as.integer(format(today, "%Y")) - as.integer(format(birth_date_date, "%Y"))
+    current_year_birthday <- as.Date(paste0(format(today, "%Y"), "-", format(birth_date_date, "%m-%d")), format = "%Y-%m-%d")
+
+    # Feb 29 in a non-leap year parses to NA (explicit format, so no error): fall back to March 1
+    if (is.na(current_year_birthday)) {
+      current_year_birthday <- as.Date(paste0(format(today, "%Y"), "-03-01"))
+    }
+
+    # Adjusts years and find last valid birthday
+    if (current_year_birthday > today) {
+      years <- years - 1
+      last_birthday <- as.Date(paste0(as.integer(format(today, "%Y")) - 1, "-", format(birth_date_date, "%m-%d")), format = "%Y-%m-%d")
+      # Same Feb 29 fallback, applied to the previous year's birthday
+      if (is.na(last_birthday)) {
+        last_birthday <- as.Date(paste0(as.integer(format(today, "%Y")) - 1, "-03-01"))
+      }
+    } else {
+      last_birthday <- current_year_birthday
+    }
+
+    # Calculates days since last birthday
+    days <- as.integer(difftime(today, last_birthday, units = "days"))
+    age <- paste0(years, "-", days, "d")
+  }
 
   birth_place <- page %>% rvest::html_node(xpath = "//p[contains(., 'Born:')]//span[contains(., 'in ')]") %>%
     rvest::html_text2() %>% stringr::str_remove("^in ") %>% stringr::str_trim()