diff --git a/cfa/cloudops/_cloudclient.py b/cfa/cloudops/_cloudclient.py index f7d0d87..57a0fc6 100644 --- a/cfa/cloudops/_cloudclient.py +++ b/cfa/cloudops/_cloudclient.py @@ -3,6 +3,7 @@ import logging import os from graphlib import CycleError, TopologicalSorter +from typing import Optional import networkx as nx import pandas as pd @@ -13,6 +14,9 @@ OnAllTasksComplete, OnTaskFailure, ) +from azure.keyvault.secrets import SecretClient + +# from azure.batch.models import TaskAddParameter from azure.mgmt.batch import models from azure.mgmt.resource import SubscriptionClient @@ -45,6 +49,7 @@ class CloudClient: provides convenient methods for common batch operations. Args: + keyvault (str, optional): Name of the Azure Key Vault to use for secrets. dotenv_path (str, optional): Path to .env file containing environment variables. If None, uses default .env file discovery. Default is None. use_sp (bool, optional): Whether to use Service Principal authentication. @@ -87,23 +92,47 @@ class CloudClient: def __init__( self, + keyvault: str = None, dotenv_path: str = None, use_sp: bool = False, use_federated: bool = False, + force_keyvault: bool = False, **kwargs, ): logger.debug("Initializing CloudClient.") + if keyvault is None: + dotenv_path = dotenv_path or ".env" + if keyvault is None and force_keyvault: + logger.error( + "Keyvault information not found but force_keyvault set to True." 
+ ) + raise ValueError("Keyvault information is required but not found.") # authenticate to get credentials if not use_sp and not use_federated: - self.cred = EnvCredentialHandler(dotenv_path=dotenv_path, **kwargs) + self.cred = EnvCredentialHandler( + dotenv_path=dotenv_path, + keyvault=keyvault, + force_keyvault=force_keyvault, + **kwargs, + ) self.method = "env" - logger.info("Using environment-based credentials.") + logger.info("Using managed identity credentials.") elif use_federated: - self.cred = DefaultCredentialHandler(dotenv_path=dotenv_path, **kwargs) + self.cred = DefaultCredentialHandler( + dotenv_path=dotenv_path, + keyvault=keyvault, + force_keyvault=force_keyvault, + **kwargs, + ) self.method = "default" logger.info("Using default credentials.") else: - self.cred = SPCredentialHandler(dotenv_path=dotenv_path, **kwargs) + self.cred = SPCredentialHandler( + dotenv_path=dotenv_path, + keyvault=keyvault, + force_keyvault=force_keyvault, + **kwargs, + ) self.method = "sp" logger.info("Using service principal credentials.") # get clients @@ -1701,6 +1730,7 @@ def async_upload_folder( location_in_blob="project") Note: + The blob container must exist before uploading. Directory structure is preserved in the container. Use filtering options to avoid uploading unnecessary files like temporary files or build artifacts. @@ -2180,3 +2210,33 @@ def run_dag(self, *args: batch_helpers.Task, job_name: str, **kwargs): dlist.append(str(dp)) task_df.at[i, "deps"] = dlist logger.info(f"Completed DAG run for job '{job_name}'.") + + def get_kv_secret(self, secret_name: str, keyvault: str) -> Optional[str]: + """Retrieve a secret from Azure Key Vault. + + Args: + secret_name (str): The name of the secret to retrieve. + keyvault (str): The name of the Key Vault. + + Returns: + Optional[str]: The value of the secret, or None if not found. 
+ """ + if self.method == "env": + cred = self.cred.user_credential + elif self.method == "default": + cred = self.cred.user_credential + else: + cred = self.cred.client_secret_credential + try: + secret_client = SecretClient( + vault_url=f"https://{keyvault}.vault.azure.net/", + credential=cred, + ) + secret = secret_client.get_secret(secret_name) + return secret.value + except Exception as e: + logger.error( + f"Failed to retrieve secret '{secret_name}' from Key Vault '{keyvault}': {e}" + ) + print(f"Error retrieving secret '{secret_name}': {e}") + return None diff --git a/cfa/cloudops/auth.py b/cfa/cloudops/auth.py index 026fa52..0b740c1 100644 --- a/cfa/cloudops/auth.py +++ b/cfa/cloudops/auth.py @@ -533,7 +533,13 @@ class EnvCredentialHandler(CredentialHandler): >>> handler = EnvCredentialHandler(dotenv_path="/path/to/.env") """ - def __init__(self, dotenv_path: str = None, **kwargs) -> None: + def __init__( + self, + dotenv_path: str = ".env", + keyvault: str = None, + force_keyvault: bool = False, + **kwargs, + ) -> None: """Initialize the EnvCredentialHandler. Loads environment variables from .env file and populates credential attributes from them. @@ -541,10 +547,17 @@ def __init__(self, dotenv_path: str = None, **kwargs) -> None: Args: dotenv_path (str, optional): Path to .env file to load environment variables from. If None, uses default .env file discovery. + keyvault (str, optional): Name of the Azure Key Vault to use for secrets. + force_keyvault (bool, optional): If True, forces loading of Key Vault secrets even if they are already set in the environment. **kwargs: Additional keyword arguments to override specific credential attributes. 
""" logger.debug("Initializing EnvCredentialHandler.") - load_env_vars(dotenv_path=dotenv_path) + load_env_vars( + dotenv_path=dotenv_path, + keyvault_name=keyvault, + force_keyvault=force_keyvault, + ) + get_conf = partial(get_config_val, config_dict=kwargs, try_env=True) for key in self.__dataclass_fields__.keys(): @@ -556,7 +569,9 @@ def __init__(self, dotenv_path: str = None, **kwargs) -> None: self.__setattr__("azure_batch_location", d.default_azure_batch_location) -def load_env_vars(dotenv_path=None): +def load_env_vars( + dotenv_path=None, keyvault_name: str = None, force_keyvault: bool = False +): """Load environment variables and Azure subscription information. Loads variables from a .env file (if specified), retrieves Azure subscription @@ -564,21 +579,34 @@ def load_env_vars(dotenv_path=None): Args: dotenv_path: Path to .env file to load. If None, uses default .env file discovery. + keyvault_name: Name of the Azure Key Vault to use for secrets. + force_keyvault: If True, forces loading of Key Vault secrets even if they are already set in the environment. 
Example: >>> load_env_vars() # Load from default .env >>> load_env_vars("/path/to/.env") # Load from specific file """ + # get ManagedIdentityCredential + mid_cred = ManagedIdentityCredential() + logger.debug("Loading environment variables.") load_dotenv(dotenv_path=dotenv_path, override=True) - # get ManagedIdentityCredential to pull SubscriptionClient - mid_cred = ManagedIdentityCredential() + sub_c = SubscriptionClient(mid_cred) # pull in account info and save to environment vars account_info = list(sub_c.subscriptions.list())[0] os.environ["AZURE_SUBSCRIPTION_ID"] = account_info.subscription_id os.environ["AZURE_TENANT_ID"] = account_info.tenant_id os.environ["AZURE_RESOURCE_GROUP_NAME"] = account_info.display_name + + # get Key Vault secrets + if keyvault_name is not None: + get_keyvault_vars( + keyvault_name=keyvault_name, + credential=mid_cred, + force_keyvault=force_keyvault, + ) + # save default values d.set_env_vars() @@ -590,7 +618,9 @@ def __init__( azure_subscription_id: str = None, azure_client_id: str = None, azure_client_secret: str = None, - dotenv_path: str = None, + dotenv_path: str = ".env", + keyvault: str = None, + force_keyvault: bool = False, **kwargs, ): """Initialize a Service Principal Credential Handler. @@ -611,6 +641,8 @@ def __init__( attempt to load from AZURE_CLIENT_SECRET environment variable. dotenv_path: Path to .env file to load environment variables from. If None, uses default .env file discovery. + keyvault: Name of the Azure Key Vault to use for secrets. + force_keyvault: If True, forces loading of Key Vault secrets even if they are already set in the environment. **kwargs: Additional keyword arguments to override specific credential attributes. 
Raises: @@ -681,6 +713,18 @@ def __init__( [x.lower() for x in mandatory_environment_variables], goal="service principal credentials", ) + sp_cred = ClientSecretCredential( + tenant_id=self.azure_tenant_id, + client_id=self.azure_client_id, + client_secret=self.azure_client_secret, + ) + # load keyvault secrets + if keyvault is not None: + get_keyvault_vars( + keyvault_name=keyvault, + credential=sp_cred, + force_keyvault=force_keyvault, + ) d.set_env_vars() @@ -698,7 +742,9 @@ def __init__( class DefaultCredentialHandler(CredentialHandler): def __init__( self, - dotenv_path: str | None = None, + dotenv_path: str | None = ".env", + keyvault: str = None, + force_keyvault: bool = False, **kwargs, ) -> None: """Initialize a Default Credential Handler. @@ -711,6 +757,8 @@ def __init__( Args: dotenv_path: Path to .env file to load environment variables from. If None, uses default .env file discovery. + keyvault: Name of the Azure Key Vault to use for secrets. + force_keyvault: If True, forces loading of Key Vault secrets even if they are already set in the environment. **kwargs: Additional keyword arguments to override specific credential attributes. Raises: @@ -731,7 +779,25 @@ def __init__( "Retrieving Azure subscription information using DefaultCredential." 
) d_cred = DefaultCredential() - sub_c = SubscriptionClient(d_cred) + + # load keyvault secrets + if keyvault is None: + try: + keyvault = os.environ["AZURE_KEYVAULT_NAME"] + except KeyError: + keyvault = None + if keyvault is not None: + get_keyvault_vars( + keyvault_name=keyvault, + credential=d_cred, + force_keyvault=force_keyvault, + ) + + try: + sub_c = SubscriptionClient(d_cred) + except Exception as e: + logger.error(f"Failed to create SubscriptionClient: {e}") + raise sub_id = os.getenv("AZURE_SUBSCRIPTION_ID", None) if sub_id is None: logger.error("AZURE_SUBSCRIPTION_ID not found in environment variables.") @@ -929,3 +995,98 @@ def get_compute_node_identity_reference( ch = EnvCredentialHandler() logger.debug("Retrieving compute_node_identity_reference from CredentialHandler.") return ch.compute_node_identity_reference + + +def get_secret_client(keyvault: str, credential: object) -> SecretClient: + """Get an Azure Key Vault SecretClient using a CredentialHandler. + + Args: + keyvault: Name of the Azure Key Vault to connect to. + credential: Credential handler for connecting and authenticating to Azure resources. + + Returns: + SecretClient: An authenticated SecretClient for the specified Key Vault. + + Example: + >>> handler = CredentialHandler() + >>> secret_client = get_secret_client("myvault", handler) + """ + logger.debug("Creating SecretClient for Azure Key Vault.") + vault_url = f"https://{keyvault}.{d.default_azure_keyvault_endpoint_subdomain}" + secret_client = SecretClient(vault_url=vault_url, credential=credential) + logger.debug("Created SecretClient for Azure Key Vault.") + return secret_client + + +def load_keyvault_vars( + secret_client: SecretClient, + force_keyvault: bool = False, +): + """Load secrets from an Azure Key Vault into environment variables. + + Args: + secret_client: SecretClient for accessing the Azure Key Vault. + force_keyvault: If True, forces loading of Key Vault secrets even if they are already set in the environment. 
+ """ + kv_keys = d.default_kv_keys + + for key in kv_keys: + if force_keyvault: + logger.debug( + "Force Key Vault load enabled; loading secret regardless of existing environment variable." + ) + try: + secret = secret_client.get_secret(key.replace("_", "-")).value + os.environ[key] = secret + logger.debug( + f"Loaded secret '{key}' from Key Vault into environment variable." + ) + except Exception as e: + logger.warning(f"Could not load secret '{key}' from Key Vault: {e}") + print("Error loading secret: ", e) + else: + if key in os.environ: + logger.debug( + f"Environment variable '{key}' already set; skipping Key Vault load." + ) + continue + else: + try: + secret = secret_client.get_secret(key.replace("_", "-")).value + os.environ[key] = secret + logger.debug( + f"Loaded secret '{key}' from Key Vault into environment variable." + ) + except Exception as e: + logger.warning(f"Could not load secret '{key}' from Key Vault: {e}") + print(f"Error loading secret: {e}") + + +def get_keyvault_vars( + keyvault_name: str, + credential: object, + force_keyvault: bool = False, +): + """Retrieve secrets from an Azure Key Vault and save to environment. + + Args: + keyvault_name: Name of the Azure Key Vault to connect to. + credential: Credential handler for connecting and authenticating to Azure resources. + force_keyvault: If True, forces loading of Key Vault secrets even if they are already set in the environment. 
+ """ + if keyvault_name is None: + logger.debug("No Key Vault name provided; skipping Key Vault variable loading.") + return None + else: + os.environ["AZURE_KEYVAULT_NAME"] = keyvault_name + logger.debug("Getting SecretClient for Azure Key Vault.") + try: + secret_client = get_secret_client( + keyvault=keyvault_name, + credential=credential, + ) + except Exception as e: + logger.error(f"Failed to get SecretClient: {e}") + raise + logger.debug("Loading Key Vault secrets into environment variables.") + load_keyvault_vars(secret_client, force_keyvault=force_keyvault) diff --git a/cfa/cloudops/defaults.py b/cfa/cloudops/defaults.py index 8558421..5ba2dfb 100644 --- a/cfa/cloudops/defaults.py +++ b/cfa/cloudops/defaults.py @@ -137,6 +137,17 @@ def remaining_task_autoscale_formula( ), ) +default_kv_keys = [ + "AZURE_BATCH_ACCOUNT", + "AZURE_BATCH_LOCATION", + "AZURE_USER_ASSIGNED_IDENTITY", + "AZURE_SUBNET_ID", + "AZURE_CLIENT_ID", + "AZURE_KEYVAULT_SP_SECRET_ID", + "AZURE_BLOB_STORAGE_ACCOUNT", + "AZURE_CONTAINER_REGISTRY_ACCOUNT", +] + def set_env_vars(): """Set default Azure environment variables. diff --git a/docs/CloudClient/authentication.md b/docs/CloudClient/authentication.md index 3678a83..513a1a0 100644 --- a/docs/CloudClient/authentication.md +++ b/docs/CloudClient/authentication.md @@ -1,6 +1,6 @@ # Authentication with `cfa.cloudops.CloudClient` -Authentication with the `CloudClient` class is meant to be user-friendly while maintaining flexibility. There are three different ways to authenticate to the Azure environment, all of which center around environment variables for Azure account information. These environment variables can be pulled from the local environment or instantiated from a .env file specified during the `CloudClient` instantiation. +Authentication with the `CloudClient` class is meant to be user-friendly while maintaining flexibility. 
There are three different ways to authenticate to the Azure environment, all of which center around either a Key Vault or environment variables for Azure account information. A key vault name can be provided to pull necessary values for instantiating the CloudClient. Alternatively, these environment variables can be pulled from the local environment or instantiated from a .env file specified during the `CloudClient` instantiation. The three authentication methods available are: @@ -8,9 +8,38 @@ The three authentication methods available are: - Service Principal credential - Federated Token credential +## Using Key Vault Setup + +When the `CloudClient` class gets instantiated, one way it attempts to get one of the three credentials listed above is by pulling values from the specified `keyvault`. The Key Vault to be used by CFA individuals can be found in the documentation [here](https://github.com/cdcent/cfa-cloudops-example). This will then pull the following values from the Key Vault (each is looked up as a secret whose name uses hyphens in place of underscores, e.g. `AZURE-BATCH-ACCOUNT`): + +- azure_batch_account +- azure_batch_location +- azure_user_assigned_identity +- azure_subnet_id +- azure_client_id +- azure_keyvault_sp_secret_id +- azure_blob_storage_account +- azure_container_registry_account + +If the Key Vault is set up with these keys/values (the correct CFA Key Vault is), then no .env file is necessary. If a .env file is still provided, then values from the .env file will be used over what is stored in the Key Vault. If you want to use values in the Key Vault over those in the .env file, provide the flag `force_keyvault=True` when instantiating the `CloudClient`. Note that if you are using a service principal then "AZURE_TENANT_ID", "AZURE_SUBSCRIPTION_ID", "AZURE_CLIENT_ID", and "AZURE_CLIENT_SECRET" need to be in the .env file, saved as local environment variables, or passed to the `CloudClient`. 
+ +For ease of use, you can also set AZURE_KEYVAULT_NAME as a global environment variable in your development workspace so that it will be passed to the `CloudClient`, eliminating the need for any parameters when instantiating the `CloudClient`. + +For example, the following pulls values from a Key Vault called 'my-key-vault'. + +```python3 +client = CloudClient(keyvault = "my-key-vault") +``` + +If we want to force the use of Key Vault values, the following should be run: + +```python3 +client = CloudClient(keyvault = "my-key-vault", force_keyvault = True) +``` + ## Environment Variable Setup -When the `CloudClient` class gets instantiated, it attempts to get one of the three credentials listed above based on environment variables. These environment variables can be stored locally on your system before calling out to the `CloudClient` class. A potentially easier way is to store the required variables is in a .env file. This allows for easier changing of variables or sharing between individuals. +When the `CloudClient` class gets instantiated, the other way it attempts to get one of the three credentials listed above is based on environment variables. These environment variables can be stored locally on your system before calling out to the `CloudClient` class. A potentially easier way to store the required variables is in a .env file. This allows for easier changing of variables or sharing between individuals. The path to the .env file can be provided via the `dotenv_path` parameter when calling `CloudClient()`. By default, it looks for a file called `.env`. If the name of the file is anything else, it should be passed to `dotenv_path`. 
For example, instantiating the client in the following ways would be identical: ```python @@ -23,8 +52,7 @@ If the .env file is called "my_azure.env" then the following should be run: client = CloudClient(dotenv_path = "my_azure.env") ``` -During instantiation of the `CloudClient`, the variables from the .env file get added to the local environment variables, overriding any variables with the same name. Then all the environment variables from the local environment are used to create a cre -dential. +During instantiation of the `CloudClient`, the variables from the .env file get added to the local environment variables, overriding any variables with the same name. Then all the environment variables from the local environment are used to create a credential. An example .env file can be found [here](../files/sample.env). @@ -35,7 +63,7 @@ An example .env file can be found [here](../files/sample.env). The default method for authenticating to the Azure environment via the `CloudClient` is a Managed Identity. Data Scientists at CFA should already have identities associated with Azure in their development environment (VAP). Because of this, we can reduce the number of inputs to authenticate with Azure because your machine is already approved. This is the encouraged method when possible. When this method is used, we are able to pull in AZURE_SUBSCRIPTION_ID, AZURE_TENANT_ID, and AZURE_RESOURCE_GROUP_NAME from the linked subscription. Therefore, these values do not need to exist in the local environment or .env file. To instantiate a `CloudClient` object using a Managed Identity credential, no additional arguments need to be passed in, except from `dotenv_path` if needed. 
For example: -```python3 +```python client = CloudClient() ``` diff --git a/docs/examples/getting_started/cloudclient_walkthrough.ipynb b/docs/examples/getting_started/cloudclient_walkthrough.ipynb index 476f8bb..dea8f9f 100644 --- a/docs/examples/getting_started/cloudclient_walkthrough.ipynb +++ b/docs/examples/getting_started/cloudclient_walkthrough.ipynb @@ -38,7 +38,7 @@ "id": "b60f671d", "metadata": {}, "source": [ - "The initialization below is the simplest way to create and instance of the `CloudClient` class. It will use environment variables or values stored in a .env file to authenticate, like the .env file stored [here](../../files/sample.env), and a managed identity credential based on your local working environment. The .env file should be stored at the same level in the directory in which you're working." + "The initialization below is the simplest way to create an instance of the `CloudClient` class. If a variable called AZURE_KEYVAULT_NAME is saved to your environment, the `CloudClient` will initialize based on some Azure values stored in the Key Vault. Otherwise it will use environment variables or values stored in a .env file to authenticate, like the .env file stored [here](../../files/sample.env), and a managed identity credential based on your local working environment. The .env file should be stored at the same level in the directory in which you're working." ] }, { @@ -56,6 +56,50 @@ "cc = CloudClient()" ] }, + { + "cell_type": "markdown", + "id": "a2643149", + "metadata": {}, + "source": [ + "We could also specify the Key Vault directly." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c2f7e39", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "#cc = CloudClient(keyvault = 'my-key-vault')" + ] + }, + { + "cell_type": "markdown", + "id": "9d6d3c1d", + "metadata": {}, + "source": [ + "If we want to ensure we use values from the Key Vault even though they might exist in the .env file, we can specify this by setting `force_keyvault=True`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f33ea7d0", + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "# cc = CloudClient(keyvault = 'my-key-vault', force_keyvault = True)" + ] + }, { "cell_type": "markdown", "id": "8a3e55aa",