diff --git a/fetch_napari_data.py b/fetch_napari_data.py index 6a4611448..314dfe8cd 100644 --- a/fetch_napari_data.py +++ b/fetch_napari_data.py @@ -7,6 +7,7 @@ import logging import re +import string import sys from concurrent.futures import ThreadPoolExecutor from urllib.parse import urljoin @@ -19,6 +20,8 @@ API_CONDA_BASE_URL = "https://npe2api.vercel.app/api/conda/" API_PYPI_BASE_URL = "https://npe2api.vercel.app/api/pypi/" API_MANIFEST_BASE_URL = "https://npe2api.vercel.app/api/manifest/" +HOME_PYPI_REGEX = r"(.*)(pypi.org)(/)(project)(/)(.*)" +HOME_GITHUB_REGEX = r"(http(s)?)(:(//)?)(.*)(github.com)(/)?(.+)(/)(.+)(\.git)?$" # Define columns needed for the plugin html page PLUGIN_PAGE_COLUMNS = [ @@ -31,7 +34,9 @@ "author", "package_metadata_author_email", "license", - "home", + "home_github", + "home_pypi", + "home_other", "package_metadata_home_page", "summary", "package_metadata_requires_python", @@ -114,6 +119,57 @@ def classify_url(url: str) -> str: return "other" +def expand_proj_url(plugin_data: dict) -> None: + """ + Expands the project URL in the plugin data dictionary. + + This function checks if the 'project_url' key exists in + the plugin data and extracts homepage and github repository URLs, + if present. + + Parameters + ---------- + plugin_data : dict + The dictionary containing plugin data. + + Returns + ------- + None + The function modifies the `plugin_data` dictionary in place. + """ + urls = plugin_data.get("project_url", []) + + # If urls do not exist, we try using the 'home_page' key (like in the old metadata spec). + if not urls and "home_page" in plugin_data: + urls = f"homepage, {plugin_data['home_page']}" + + plugin_data["home_github"] = "" + plugin_data["home_other"] = "" + for url_info in urls: + label, url = url_info.split(", ") + plugin_data[normalize_label(label)] = url + if re.match(HOME_GITHUB_REGEX, url): + # If url matches github repository link structure, + # we display the github icon. + # Otherwise, display the homepage label + # as some other url + plugin_data["home_github"] = url + elif label == "homepage": + plugin_data["home_other"] = url + del plugin_data["project_url"] + + +def normalize_label(label: str) -> str: + """Normalize project URL label. + + Code reproduced from: + https://packaging.python.org/en/latest/specifications/well-known-project-urls/#label-normalization + """ + chars_to_remove = string.punctuation + string.whitespace + removal_map = str.maketrans("", "", chars_to_remove) + return label.translate(removal_map).lower() + + def flatten_and_merge(original, additional, parent_key="") -> None: """ Recursively flattens a nested dictionary or list of dictionaries and merges the result into the original dictionary. @@ -258,13 +314,13 @@ def process_plugin(plugin): # need pypi info to get the initial and latest release date pypi_info = fetch(urljoin(API_PYPI_BASE_URL, plugin_normalized_name)) + expand_proj_url(plugin_data) + if conda_info: # we only want a limited set of conda info conda_info = { "conda_name": conda_info["name"], "conda_html_url": conda_info["html_url"], - # TODO: this should come from project_url not conda info - "home": conda_info["home"], } plugin_data.update(conda_info) @@ -278,10 +334,14 @@ def process_plugin(plugin): last_updated_date = get_version_release_date( pypi_info, plugin_latest_release ) + + # grab pypi project link + home_pypi = pypi_info["info"].get("package_url", "") plugin_data.update( { "created_at": initial_release_date, "modified_at": last_updated_date, + "home_pypi": home_pypi, } ) @@ -335,23 +395,7 @@ def process_plugin(plugin): df_plugins["modified_at"], format="mixed" ).dt.date - # Set a temporary helper column 'home_type' by classifying the 'home' URL to a common package repository name, like 'pypi', 'github', or 'other' - df_plugins["home_type"] = df_plugins["home"].apply(classify_url) - # Using the 'home_type' column, create new colums for 'pypi', 'github', and 'other' - df_plugins["home_pypi"] = df_plugins["home"].where( - df_plugins["home_type"] == "pypi", "" - ) - df_plugins["home_github"] = df_plugins["home"].where( - df_plugins["home_type"] == "github", "" - ) - df_plugins["home_other"] = df_plugins["home"].where( - df_plugins["home_type"] == "other", "" - ) - - # Delete the temporary 'home_type' column as it is no longer needed - df_plugins.drop("home_type", axis=1, inplace=True) - - # Perform row-wise cleaning of the DataFrame for author, license, and home_pypi fields + # Perform row-wise cleaning of the DataFrame for author and license fields for index, row in df_plugins.iterrows(): # Check if 'author' is NaN or contains quotation marks if pd.isna(row["author"]) or '"' in str(row["author"]): @@ -371,11 +415,5 @@ def process_plugin(plugin): elif '"' in str(row["license"]): df_plugins.at[index, "license"] = f"{row['license'][:30]}..." - # Fill home_pypi - if not row["home_pypi"]: - df_plugins.at[index, "home_pypi"] = ( - f"https://pypi.org/project/{row['name']}" - ) - # Save the final DataFrame of plugin page information to a CSV file df_plugins.to_csv(f"{data_dir}/final_plugins.csv")