diff --git a/README.md b/README.md index c36a8457..50c1af44 100644 --- a/README.md +++ b/README.md @@ -54,3 +54,33 @@ In order to install this code you should clone this repository and do:: (ActSNClass) >> pip install -r requirements (ActSNClass) >> python setup.py install + +# MLflow Integration + +MLflow is integrated into the project for experiment tracking and model registry. This integration is implemented in the actsnclass/classifiers.py file. + +### Function Name: + +- mlflow_tracking_And_Registry: this function logs the model parameters, the training data (as a CSV artifact) and the fitted model to MLflow, and registers the trained model in the MLflow Model Registry. + + +### Enabling MLflow + +To use MLflow for tracking and model registry, set the mlflow parameter to True in the random_forest function, which is located in the actsnclass/classifiers.py file. + +- Example usage: random_forest(train_features, train_labels, test_features, mlflow=True) + + +### Accessing the MLflow User Interface + +Once you have run the code with MLflow enabled, you can access the MLflow user interface to monitor your experiments and models. + +- Start the MLflow UI by running the following command in your terminal:: + + (ActSNClass) >> mlflow ui + +- Visit the MLflow UI by opening your web browser and navigating to: http://127.0.0.1:5000 + +### Reusing Models from MLflow Registry + +To download and reuse models that have been registered in the MLflow Model Registry, refer to the example provided in the managemodel.py file located in the actsnclass directory. diff --git a/actsnclass/classifiers.py b/actsnclass/classifiers.py index 2e5a15f5..19d7e27d 100644 --- a/actsnclass/classifiers.py +++ b/actsnclass/classifiers.py @@ -16,15 +16,105 @@ # See the License for the specific language governing permissions and # limitations under the License. 
def mlflow_tracking_And_Registry(clf, train_features):
    """Log a classifier to MLflow and register it in the Model Registry.

    Inside a single MLflow run of the ``Random_Forest_Experiment``
    experiment, this function logs the estimator parameters, a CSV copy of
    the training features and the model itself, then registers the logged
    model and attaches a description to the new model version.

    The run is named ``Train_size_<n>`` where ``n`` is the number of rows
    in ``train_features``. The registered model is named
    ``Random_Forest_Experiment_<dd-mm-YYYY>`` using the current local date.

    Parameters
    ----------
    clf : estimator
        A fitted model or classifier. It must provide ``get_params()``
        and be loggable through ``mlflow.sklearn.log_model``.
    train_features : np.array
        Features of the training data used to fit the model. Only
        ``shape[0]`` and the values themselves are used.

    Returns
    -------
    None

    Notes
    -----
    - Requires a configured MLflow tracking backend that supports the
      Model Registry.
    - The training data is written to a temporary directory that is
      removed once the artifact has been logged, so nothing is left in
      (or overwritten inside) the current working directory.

    Examples
    --------
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> train_features = np.random.rand(100, 4)  # 100 samples, 4 features
    >>> train_labels = np.random.randint(0, 2, 100)
    >>> clf = RandomForestClassifier(n_estimators=100)
    >>> clf = clf.fit(train_features, train_labels)
    >>> mlflow_tracking_And_Registry(clf, train_features)  # doctest: +SKIP
    """
    # Function-scope imports: only the temporary CSV export needs them.
    import tempfile
    from pathlib import Path

    experiment_name = "Random_Forest_Experiment"
    mlflow.set_experiment(experiment_name)

    # NOTE(review): autolog is a process-wide switch and `clf` is already
    # fitted when it reaches this function, so it presumably only affects
    # *later* `fit` calls in the same process -- confirm this is intended.
    mlflow.sklearn.autolog()

    train_size = train_features.shape[0]
    run_name = f"Train_size_{train_size}"

    # Day-month-year stamp used in the model name and the description.
    current_date = datetime.now().strftime("%d-%m-%Y")
    model_name = f"{experiment_name}_{current_date}"

    with mlflow.start_run(run_name=run_name) as run:
        # One batched call instead of one request per parameter.
        mlflow.log_params(clf.get_params())

        # Export the training data to a throw-away directory; the file name
        # (and therefore the artifact name) stays "train_sample.csv".
        with tempfile.TemporaryDirectory() as tmp_dir:
            train_sample_file = Path(tmp_dir) / "train_sample.csv"
            pd.DataFrame(train_features).to_csv(train_sample_file,
                                                index=False)
            mlflow.log_artifact(str(train_sample_file))

        # The model is logged under an artifact path equal to its name,
        # which is what the "runs:/" URI below points at.
        mlflow.sklearn.log_model(clf, model_name)

        model_uri = f"runs:/{run.info.run_id}/{model_name}"
        registered_model = mlflow.register_model(model_uri, model_name)

        # Describe the freshly created model version.
        MlflowClient().update_model_version(
            name=model_name,
            version=registered_model.version,
            description="Random Forest model registered on " + current_date,
        )
"""Example: reload a model from the MLflow Model Registry and predict.

Run this file as a script. Nothing is executed on import, so importing the
package never triggers a registry download.
"""

from mlflow.tracking import MlflowClient  # used by the optional snippet in main()
import mlflow.pyfunc
import numpy as np
import pandas as pd

# Registry URI of the model to reload: "models:/<registered name>/<version>".
MODEL_URI = "models:/Random_Forest_Experiment_Vi_19-06-2024/5"


def main(model_uri=MODEL_URI, n_samples=100, n_features=12):
    """Load a registered model and print predictions for random input.

    Parameters
    ----------
    model_uri : str
        MLflow model URI passed to ``mlflow.pyfunc.load_model``.
    n_samples : int
        Number of random rows to generate.
    n_features : int
        Number of random columns to generate.
        NOTE(review): presumably this must match the number of features
        the registered model was trained on -- confirm.

    Returns
    -------
    The object returned by ``model.predict`` for the generated input.
    """
    # To move a registered version to the "Staging" stage beforehand:
    #
    #     MlflowClient().transition_model_version_stage(
    #         name="Random_Forest_Experiment_18-06-2024",
    #         version=2,
    #         stage="Staging",
    #     )

    # Load the model version addressed by `model_uri`.
    model = mlflow.pyfunc.load_model(model_uri=model_uri)

    data = np.random.rand(n_samples, n_features)

    feature_names = [f"feature_{i + 1}" for i in range(n_features)]
    df = pd.DataFrame(data, columns=feature_names)

    # `.values` drops the column names, so the model receives a plain array.
    X_test = df.values
    predictions = model.predict(X_test)
    print(X_test)
    print(predictions)
    return predictions


if __name__ == "__main__":
    main()