From 0215a9a5a4f8cff351636dbfa22876af8db71063 Mon Sep 17 00:00:00 2001 From: zhongtianq <141391912+zhongtianq@users.noreply.github.com> Date: Thu, 20 Jun 2024 19:32:40 +0800 Subject: [PATCH] repo-sync-2024-06-19T10:45:51+0800 (#72) --- docs/architecture/apps/index.rst | 2 + docs/architecture/apps/lgbm_predict.md | 123 ++++++ docs/architecture/apps/lgbm_train.md | 141 +++++++ docs/architecture/index.rst | 2 +- docs/architecture/policy.md | 75 ++-- docs/development/index.rst | 15 + docs/development/new_component.ipynb | 559 +++++++++++++++++++++++++ docs/index.rst | 4 + docs/quick_start/step3.ipynb | 88 ++-- 9 files changed, 923 insertions(+), 86 deletions(-) create mode 100644 docs/architecture/apps/lgbm_predict.md create mode 100644 docs/architecture/apps/lgbm_train.md create mode 100644 docs/development/index.rst create mode 100644 docs/development/new_component.ipynb diff --git a/docs/architecture/apps/index.rst b/docs/architecture/apps/index.rst index 5fa23f1..f576345 100644 --- a/docs/architecture/apps/index.rst +++ b/docs/architecture/apps/index.rst @@ -31,6 +31,8 @@ TrustedFlow内置了多种可信APP,每一个可信APP在执行计算逻辑之 lr_train xgb_predict lr_predict + lgbm_train + lgbm_predict binary_evaluation prediction_bias_eval diff --git a/docs/architecture/apps/lgbm_predict.md b/docs/architecture/apps/lgbm_predict.md new file mode 100644 index 0000000..5b2719f --- /dev/null +++ b/docs/architecture/apps/lgbm_predict.md @@ -0,0 +1,123 @@ +# LightGBM预测 + +使用给定的LightGBM模型对数据进行预测。 + +## 组件定义 + +1. 参数 + (1) pred_name: 预测值的列名。 + (2) save_label: 输出结果是否包含标签列,true表示保存。 + (3) label_name: 标签列的名称,默认为“label”。 + (4) save_id: 输出结果是否保存ID列,true表示保存。 + (5) id_name: ID列的名称。 + (6) col_names: 可选,输出指定的列到结果中,默认为空。 +2. 输入:待预测的数据以及LightGBM模型。 +3. 
输出:预测结果。 + +```json +{ + "domain": "ml.predict", + "name": "lgbm_predict", + "desc": "Predict using the lgbm model.", + "version": "0.0.1", + "attrs": [ + { + "name": "pred_name", + "desc": "Column name for predictions.", + "type": "AT_STRING", + "atomic": { + "is_optional": true, + "default_value": { + "s": "pred" + } + } + }, + { + "name": "save_label", + "desc": "Whether or not to save real label column into output pred table. If true, input feature_dataset must contain label column.", + "type": "AT_BOOL", + "atomic": { + "is_optional": true, + "default_value": {} + } + }, + { + "name": "label_name", + "desc": "Column name for label.", + "type": "AT_STRING", + "atomic": { + "is_optional": true, + "default_value": { + "s": "label" + } + } + }, + { + "name": "save_id", + "desc": "Whether to save id column into output pred table. If true, input feature_dataset must contain id column.", + "type": "AT_BOOL", + "atomic": { + "is_optional": true, + "default_value": {} + } + }, + { + "name": "id_name", + "desc": "Column name for id.", + "type": "AT_STRING", + "atomic": { + "is_optional": true, + "default_value": { + "s": "id" + } + } + }, + { + "name": "col_names", + "desc": "Extra column names into output pred table.", + "type": "AT_STRINGS", + "atomic": { + "list_max_length_inclusive": "-1", + "is_optional": true + } + } + ], + "inputs": [ + { + "name": "feature_dataset", + "desc": "Input feature dataset.", + "types": [ + "sf.table.individual" + ], + "attrs": [ + { + "name": "ids", + "desc": "Id columns.", + "col_max_cnt_inclusive": "1" + }, + { + "name": "label", + "desc": "Label column.", + "col_max_cnt_inclusive": "1" + } + ] + }, + { + "name": "model", + "desc": "Input model.", + "types": [ + "sf.model.lgbm" + ] + } + ], + "outputs": [ + { + "name": "pred", + "desc": "Output prediction.", + "types": [ + "sf.table.individual" + ] + } + ] +} +``` \ No newline at end of file diff --git a/docs/architecture/apps/lgbm_train.md b/docs/architecture/apps/lgbm_train.md new 
file mode 100644 index 0000000..d8684e0 --- /dev/null +++ b/docs/architecture/apps/lgbm_train.md @@ -0,0 +1,141 @@ +# LightGBM训练 + +使用LightGBM对数据集进行训练,得到LightGBM模型,支持二分类和回归。 + +## 组件定义 + +```json +{ + "domain": "ml.train", + "name": "lgbm_train", + "desc": "LightGBM train component for individual dataset.", + "version": "0.0.1", + "attrs": [ + { + "name": "n_estimators", + "desc": "Number of boosted trees to fit.", + "type": "AT_INT", + "atomic": { + "is_optional": true, + "default_value": { + "i64": "10" + }, + "lower_bound_enabled": true, + "lower_bound": { + "i64": "1" + }, + "lower_bound_inclusive": true, + "upper_bound_enabled": true, + "upper_bound": { + "i64": "1024" + }, + "upper_bound_inclusive": true + } + }, + { + "name": "objective", + "desc": "Specify the learning objective.", + "type": "AT_STRING", + "atomic": { + "is_optional": true, + "default_value": { + "s": "binary" + }, + "allowed_values": { + "ss": [ + "regression", + "binary" + ] + } + } + }, + { + "name": "boosting_type", + "desc": "Boosting type.", + "type": "AT_STRING", + "atomic": { + "is_optional": true, + "default_value": { + "s": "gbdt" + }, + "allowed_values": { + "ss": [ + "gbdt", + "rf", + "dart" + ] + } + } + }, + { + "name": "learning_rate", + "desc": "Learning rate.", + "type": "AT_FLOAT", + "atomic": { + "is_optional": true, + "default_value": { + "f": 0.1 + }, + "lower_bound_enabled": true, + "lower_bound": {}, + "upper_bound_enabled": true, + "upper_bound": { + "f": 1 + }, + "upper_bound_inclusive": true + } + }, + { + "name": "num_leaves", + "desc": "Max number of leaves in one tree.", + "type": "AT_INT", + "atomic": { + "is_optional": true, + "default_value": { + "i64": "31" + }, + "lower_bound_enabled": true, + "lower_bound": { + "i64": "2" + }, + "lower_bound_inclusive": true, + "upper_bound_enabled": true, + "upper_bound": { + "i64": "1024" + }, + "upper_bound_inclusive": true + } + } + ], + "inputs": [ + { + "name": "train_dataset", + "desc": "Input table.", + "types": 
[ + "sf.table.individual" + ], + "attrs": [ + { + "name": "ids", + "desc": "Id columns will not be trained." + }, + { + "name": "label", + "desc": "Label column.", + "col_min_cnt_inclusive": "1", + "col_max_cnt_inclusive": "1" + } + ] + } + ], + "outputs": [ + { + "name": "output_model", + "desc": "Output model.", + "types": [ + "sf.model.lgbm" + ] + } + ] +} +``` \ No newline at end of file diff --git a/docs/architecture/index.rst b/docs/architecture/index.rst index 012aa17..d366782 100644 --- a/docs/architecture/index.rst +++ b/docs/architecture/index.rst @@ -1,4 +1,4 @@ -核心功能 +架构设计 ======================== 想了解TrustedFlow原理和功能,欢迎阅读下列文章! diff --git a/docs/architecture/policy.md b/docs/architecture/policy.md index 523ea54..75eb64c 100644 --- a/docs/architecture/policy.md +++ b/docs/architecture/policy.md @@ -16,7 +16,15 @@ constraint本质上是描述“访问控制”这件事,比如允许对数据 ### 可限制的元信息 constraint支持对以下元信息进行限制。具体语法上,每一条constraint的元素都是以`r.`作为开头。(TrustedFlow采用了[casbin](https://github.com/casbin/casbin)作为底层的访问控制实现) +#### platform +在[global_constraints](#global_constraints)下设置。 +限制代码运行的TEE平台。目前可选`sim/sgx/tdx/csv`。 +```yaml +r.env.tee.platform=="tee platform type" +``` + #### mr_enclave +在[global_constraints](#global_constraints)下设置。 限制代码的MRENCLAVE,关于MRENCLAVE的说明参见 [Enclave](./tee/sgx.md#enclave) 。 ```yaml @@ -24,41 +32,26 @@ r.env.tee.sgx.mr_encalve=="mrenclave of the enclave" ``` #### mr_signer +在[global_constraints](#global_constraints)下设置。 限制代码的MRSIGNER,关于MRSIGNER的说明参见 [Enclave](./tee/sgx.md#enclave) 。 ```yaml r.env.tee.sgx.mr_signer=="mrsigner of the enclave" ``` -#### op -限制可以使用哪些[可信APP](./apps/index.rst)进行计算,需要配合rule一齐生效(具体参见后面的rule说明)。 -目前可信APP对应的op名称为 - -- [数据求交](./apps/intersect.md): `OP_PSI` -- [数据随机切割](./apps/split.md): `OP_DATASET_SPLIT` -- [特征过滤](./apps/feature_filter.md): `OP_DATASET_FILTER` -- [全表统计](./apps/data_describe.md): `OP_TABLE_STATISTICS` -- [WOE分箱](./apps/woe_binning.md): `OP_WOE_BINNING` -- [WOE转换](./apps/woe_substitution.md): `OP_WOE_SUBSTITUTION` -- 
[相关系数矩阵](./apps/corr.md): `OP_STATS_CORR` -- [VIF](./apps/vif.md): `OP_LR` -- [LR训练](./apps/lr_train.md): `OP_WOE_SUBSTITUTION` -- [LR预测](./apps/lr_predict.md): `OP_PREDICT` -- [XGBoost训练](./apps/xgb_train.md): `OP_XGB` -- [XGBoost预测](./apps/xgb_train.md): `OP_PREDICT` -- [二分类评估](./apps/binary_evaluation.md): `OP_BICLASSIFIER_EVALUATION` -- [预测偏差评估](./apps/prediction_bias_eval.md): `OP_PREDICTION_BIAS_EVALUATION` - -示例写法如下。 +#### (暂不可用) execution_time +在[global_constraints](#global_constraints)下设置。 +限制执行时间。 ```yaml -# 表示限制仅能对数据执行XGBoost训练。 -r.op=="OP_XGB" +r.execution_time<="2023-10-01 23:59:59" ``` -#### (暂不可用)execution_time -限制执行时间。 +#### (暂不可用) op参数 +在[op_constraints](#op_constraints)下设置。 +限制可信app的参数。具体参数名可以在[可信APP](./apps/index.rst)中找到对应的app查询。 +例如限制回归类型为逻辑回归: ```yaml -r.execution_time<="2023-10-01 23:59:59" +r.op.params.reg_type=="logistic" ``` ### 元素之间支持的操作符 @@ -93,8 +86,10 @@ op_constraints表示作用于特定算法的约束,由一条或者多条op_con 下列rule描述了以下限制 1. 被授权方为bob和carol 2. 允许使用数据列f1、f2和f3 -3. 限制XGB和LR的mrenclave -4. 限制所有代码的mrsigner +3. 允许xgb_train组件使用数据 +4. 允许lr_train组件进行逻辑回归时使用数据 +5. 限制组件运行平台为sgx +6. 
限制代码的mr_enclave为MRENCLAVE ```json { @@ -110,20 +105,19 @@ op_constraints表示作用于特定算法的约束,由一条或者多条op_con ], "op_constraints":[ { - "op_name":"OP_XGB", - "constraints":[ - "r.op==\"OP_XGB\" && r.env.tee.sgx.mr_enclave==\"XGB_ENCLAVE\"" - ] + "op_name": "xgb_train", + "constraints":[] }, { - "op_name":"OP_LR", + "op_name": "lr_train", "constraints":[ - "r.op==\"OP_LR\" && r.env.tee.sgx.mr_enclave==\"LR_ENCLAVE\"" + "r.op.params.reg_type==\"logistic\"" ] } ], "global_constraints":[ - "r.env.tee.sgx.mr_signer==\"MRSIGNER\"" + "r.env.tee.platform==\"sgx\"", + "r.env.tee.sgx.mr_enclave==\"MRENCLAVE\"" ] } ``` @@ -160,20 +154,19 @@ op_constraints表示作用于特定算法的约束,由一条或者多条op_con ], "op_constraints":[ { - "op_name":"OP_XGB", - "constraints":[ - "r.op==\"OP_XGB\" && r.env.tee.sgx.mr_enclave==\"XGB_ENCLAVE\"" - ] + "op_name": "xgb_train", + "constraints":[] }, { - "op_name":"OP_LR", + "op_name": "lr_train", "constraints":[ - "r.op==\"OP_LR\" && r.env.tee.sgx.mr_enclave==\"LR_ENCLAVE\"" + "r.op.params.reg_type==\"logistic\"" ] } ], "global_constraints":[ - "r.env.tee.sgx.mr_signer==\"MRSIGNER\"" + "r.env.tee.platform==\"sgx\"", + "r.env.tee.sgx.mr_enclave==\"MRENCLAVE\"" ] } ] diff --git a/docs/development/index.rst b/docs/development/index.rst new file mode 100644 index 0000000..5d4b216 --- /dev/null +++ b/docs/development/index.rst @@ -0,0 +1,15 @@ +.. _development: + +开发者教程 +=============== +本文档面向想要基于TrustedFlow进行二次开发的工程人员。 + +新组件开发 +--------------- +TrustedFlow已经提供了一些常用的组件。 +但有时候您可能想新增一个组件,或者想要修改已有组件的参数。这时您可以阅读 `新组件开发教程 `_ 。 + +.. 
toctree:: + :maxdepth: 2 + + new_component \ No newline at end of file diff --git a/docs/development/new_component.ipynb b/docs/development/new_component.ipynb new file mode 100644 index 0000000..0430a8f --- /dev/null +++ b/docs/development/new_component.ipynb @@ -0,0 +1,559 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 新组件开发教程\n", + "TrustedFlow中组件定义和实现都在[Teeapps](https://github.com/secretflow/teeapps)中。\n", + "我们的组件通过secretflow的[component spec](https://github.com/secretflow/spec)来统一定义。这是隐语开放标准中用于定义组件的标准。用这套标准我们可以定义组件的名称、版本,参数的类型、取值范围、说明,定义输入输出的格式。建议您先阅读这套组件定义标准,便于理解接下来的开发流程。\n", + "\n", + "下面我们以新增LightGBM训练算法为例来说明新增组件的开发流程。\n", + "准备好代码:\n", + "```bash\n", + "git clone https://github.com/secretflow/teeapps.git\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 定义组件\n", + "\n", + "### 1. 新建文件\n", + "在`teeapps/component`目录下找到合适的组件分类,您也可以新建新的组件分类。\n", + "LightGBM训练算法属于机器学习领域,因此在`ml/train`分类下新建`lgbm_component.h`和`lgbm_component.cc`文件。\n", + "\n", + "### 2. 
声明组件\n", + "LightGBM训练组件头文件`lgbm_component.h`示例如下:\n", + "```c++\n", + "#pragma once\n", + "\n", + "#include \"../../component.h\"\n", + "\n", + "namespace teeapps {\n", + "namespace component {\n", + "\n", + "class LgbmTrainComponent : public Component {\n", + " private:\n", + " void Init();\n", + "\n", + " explicit LgbmTrainComponent(\n", + " const std::string& name = \"lgbm_train\",\n", + " const std::string& domain = \"ml.train\",\n", + " const std::string& version = \"0.0.1\",\n", + " const std::string& desc =\n", + " \"LightGBM train component for individual dataset.\")\n", + " : Component(name, domain, version, desc) {\n", + " Init();\n", + " }\n", + " ~LgbmTrainComponent() {}\n", + " LgbmTrainComponent(const LgbmTrainComponent&) = delete;\n", + " const LgbmTrainComponent& operator=(const LgbmTrainComponent&) = delete;\n", + "\n", + " public:\n", + " static LgbmTrainComponent& GetInstance() {\n", + " static LgbmTrainComponent instance;\n", + " return instance;\n", + " }\n", + "};\n", + "\n", + "} // namespace component\n", + "} // namespace teeapps\n", + "```\n", + "\n", + "这段代码声明了新组件的一些基本信息:\n", + "\n", + "- 组件名称:\"lgbm_train\"\n", + "- 组件所属领域:\"ml.train\"\n", + "- 组件版本号:\"0.0.1\"\n", + "- 组件描述:\"LightGBM train component for individual dataset.\"\n", + "\n", + "### 3. 
定义组件参数\n", + "我们在`lgbm_component.cc`中来定义组件的详细参数,包含每个参数的名字、类型、取值范围、默认值、是否是可选参数等。\n", + "添加参数需要用到的`AddAttr`函数的具体定义如下:\n", + "\n", + "```c++\n", + "template \n", + "void AddAttr(\n", + " const std::string& name, const std::string& desc, bool is_list,\n", + " bool is_optional,\n", + " const std::optional>& default_values = std::nullopt,\n", + " const std::optional>& allowed_values = std::nullopt,\n", + " const std::optional& lower_bound = std::nullopt,\n", + " const std::optional& upper_bound = std::nullopt,\n", + " const std::optional& lower_bound_inclusive = std::nullopt,\n", + " const std::optional& upper_bound_inclusive = std::nullopt,\n", + " const std::optional& list_min_length_inclusive = std::nullopt,\n", + " const std::optional& list_max_length_inclusive = std::nullopt);\n", + "```\n", + "- name:参数名称。\n", + "- desc:参数的详细描述。\n", + "- is_list:参数是否是一个列表。如果为false,则代表了我们允许用户输入一个T类型的值;如果是true,则代表了允许用户输入一个T类型的列表。\n", + "- is_optional:参数是否是optional的。如果为true,则代表了用户可以不填,此时会使用该参数的默认值;如果为false,则代表用户必须传递该值。\n", + "- default_values:参数的默认值,std::nullopt表示不设置默认值。当is_optional为true时必须定义该默认值。\n", + "- allowed_values:参数的允许值,std::nullopt表示不设置允许值。如果设置了该值,那么用户就必须在给出的allowed_values中选择输入。\n", + "- lower_bound:参数下限,std::nullopt表示不设置下限。\n", + "- lower_bound_inclusive:下限是否包含。如果为true,则代表了lower_bound也是一个合法的输入。\n", + "- upper_bound:参数上限,std::nullopt表示不设置上限。\n", + "- upper_bound_inclusive:上限是否包含。如果为true,则代表了upper_bound也是一个合法的输入。\n", + "- list_min_length_inclusive:列表类型参数的最小长度,std::nullopt表示不设置列表最小长度。该值仅在is_list为true的时候可选设置。\n", + "- list_max_length_inclusive:列表类型参数的最大长度,std::nullopt表示不设置列表最大长度。该值仅在is_list为true的时候可选设置。\n", + "\n", + "我们先添加一个训练轮数的参数:\n", + "```c++\n", + "#include \"lgbm_component.h\"\n", + "\n", + "namespace teeapps {\n", + "namespace component {\n", + "\n", + "void LgbmTrainComponent::Init() {\n", + " AddAttr(\"n_estimators\", \"Number of boosted trees to fit.\", false,\n", + " true, std::vector{10}, std::nullopt, 1, 1024, true,\n", + " true);\n", + "}\n", + "\n", + "} // 
namespace component\n", + "} // namespace teeapps\n", + "```\n", + "这段代码使用`AddAttr`函数为`lgbm_train`这个组件添加了一个参数:\n", + "\n", + "- 参数名称:\"n_estimators\"\n", + "- 参数详细描述:\"Number of boosted trees to fit.\"\n", + "- 非列表类型参数,代表输入的是一个值而非一个列表\n", + "- 可选参数,代表用户如果不填该参数,就会使用默认值\n", + "- 参数的默认值为10\n", + "- 不设定特定的某几个可选值,但是设定了取值范围为[1, 1024],包含下限1和上限1024。\n", + "\n", + "接下来,您可以在`lgbm_component.cc`中继续添加您需要的参数,下面为一个示例,您可以根据需要进行删改:\n", + "```c++\n", + "#include \"lgbm_component.h\"\n", + "\n", + "namespace teeapps {\n", + "namespace component {\n", + "\n", + "void LgbmTrainComponent::Init() {\n", + " AddAttr(\"n_estimators\", \"Number of boosted trees to fit.\", false,\n", + " true, std::vector{10}, std::nullopt, 1, 1024, true,\n", + " true);\n", + " AddAttr(\"objective\", \"Specify the learning objective.\", false,\n", + " true, std::vector{\"binary\"},\n", + " std::vector{\"regression\", \"binary\"});\n", + " AddAttr(\"boosting_type\", \"Boosting type.\", false, true,\n", + " std::vector{\"gbdt\"},\n", + " std::vector{\"gbdt\", \"rf\", \"dart\"});\n", + " AddAttr(\"learning_rate\", \"Learning rate.\", false, true,\n", + " std::vector{0.1}, std::nullopt, 0, 1, false, true);\n", + " AddAttr(\"num_leaves\", \"Max number of leaves in one tree.\", false,\n", + " true, std::vector{31}, std::nullopt, 2, 1024, true,\n", + " true);\n", + "}\n", + "\n", + "} // namespace component\n", + "} // namespace teeapps\n", + "```\n", + "\n", + "### 4. 
定义组件输入输出\n", + "我们继续在`lgbm_component.cc`中来定义组件的输入输出。\n", + "定义输入输出需要用到的`AddIo`函数定义如下:\n", + "```c++\n", + "void AddIo(const IoType io_type, const std::string& name,\n", + " const std::string& desc, const std::vector& types,\n", + " const std::optional>& col_params =\n", + " std::nullopt);\n", + "```\n", + "- io_type:表示这是输入还是输出,可选IoType::INPUT和IoType::OUTPUT。\n", + "- name:io名称。\n", + "- desc:io的详细说明。\n", + "- types: io的类型,使用定义在teeapps/component/util.h的DistDataType中的字符串作为类型名称,如果需要添加新的类型,请在DistDataType中添加定义。\n", + "- col_params: 额外列参数。对于输入的数据表,如果需要额外指定一些列参数,可以在这一项进行设置。比如在psi组件中,可以用\"key\"指定哪一列用于求交;比如在woe组件中,可以用\"feature_selects\"指定对哪些列进行woe binning;再比如训练组件中可以用\"label\"指定哪一列作为训练标签。\n", + "\n", + "\n", + "定义输入输出:\n", + "```c++\n", + "#include \"lgbm_component.h\"\n", + "\n", + "namespace teeapps {\n", + "namespace component {\n", + "\n", + "void LgbmTrainComponent::Init() {\n", + " //省略了前面定义的组件参数....\n", + "\n", + " AddIo(IoType::INPUT, \"train_dataset\", \"Input table.\",\n", + " {DistDataType::INDIVIDUAL_TABLE},\n", + " std::vector{\n", + " TableColParam(\"ids\", \"Id columns will not be trained.\"),\n", + " TableColParam(\"label\", \"Label column.\", 1, 1)});\n", + " AddIo(IoType::OUTPUT, \"output_model\", \"Output model.\",\n", + " {DistDataType::LGBM_MODEL});\n", + "}\n", + "\n", + "} // namespace component\n", + "} // namespace teeapps\n", + "```\n", + "\n", + "这段代码定义了`lgbm_train`组件的输入输出:\n", + "\n", + "- 输入名称为\"train_dataset\"\n", + "- 输入的类型为DistDataType::INDIVIDUAL_TABLE,单边表,通常就是一个csv表格\n", + "- 输入的额外列参数\"ids\":指明了哪些列作为id列,没有设定数量的上下限,也就是可以不指定id列,也可以指定多列都作为id列,被作为id列的那些列不会被用于训练。\n", + "- 输入的额外列参数\"label\":指明了哪一列作为训练的标签列,设定数量的上下限均为1,也就是有且仅有一列标签用于训练(不同于数据表的schema中可以有多列label,训练时必须指明一列作为训练标签)。\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 编写组件执行逻辑\n", + "\n", + 
"Teeapps框架会解析json化的[sf_node_eval_param](https://github.com/secretflow/spec/blob/main/secretflow/spec/v1/evaluation.proto),检查参数范围,对默认值进行赋值等,然后生成一个执行时的配置文件。组件执行代码可以直接读取该配置文件中的参数值,配置文件格式示例如下:\n", + "```json\n", + "{\n", + " \"component_name\": \"lgbm_train\",\n", + " \"n_estimators\": 10,\n", + " \"objective\": \"binary\",\n", + " \"boosting_type\": \"gbdt\",\n", + " \"num_leaves\": 15,\n", + " \"learning_rate\": 0.1,\n", + " \"inputs\": [\n", + " {\n", + " \"data_path\": \"teeapps/biz/testdata/breast_cancer/breast_cancer.csv\",\n", + " \"schema\": {\n", + " \"ids\": [\n", + " \"id\"\n", + " ],\n", + " \"features\": [\n", + " \"mean radius\",\n", + " \"mean texture\",\n", + " \"mean perimeter\",\n", + " \"mean area\",\n", + " \"mean smoothness\",\n", + " \"mean compactness\",\n", + " \"mean concavity\",\n", + " \"mean concave points\",\n", + " \"mean symmetry\",\n", + " \"mean fractal dimension\"\n", + " ],\n", + " \"labels\": [\n", + " \"target\"\n", + " ],\n", + " \"id_types\": [\n", + " \"int\"\n", + " ],\n", + " \"feature_types\": [\n", + " \"float\",\n", + " \"float\",\n", + " \"float\",\n", + " \"float\",\n", + " \"float\",\n", + " \"float\",\n", + " \"float\",\n", + " \"float\",\n", + " \"float\",\n", + " \"float\"\n", + " ],\n", + " \"label_types\": [\n", + " \"bool\"\n", + " ]\n", + " },\n", + " \"ids\": [\"id\"],\n", + " \"label\": [\"target\"]\n", + " }\n", + " ],\n", + " \"outputs\": [\n", + " {\n", + " \"data_path\": \"lgbm_bin_class.model\"\n", + " }\n", + " ]\n", + "}\n", + "```\n", + "\n", + "在`teeapps/biz`目录下新建`lgbm/lgbm.py`实现组件执行逻辑,它将按照上述json中的配置执行相应算法:\n", + "```python\n", + "import json\n", + "import logging\n", + "import sys\n", + "\n", + "import joblib\n", + "import lightgbm as lgb\n", + "import pandas\n", + "\n", + "from teeapps.biz.common import common\n", + "\n", + "COMPONENT_NAME = \"lgbm_train\"\n", + "\n", + "IDS = \"ids\"\n", + "LABEL = \"label\"\n", + "\n", + "N_ESTIMATORS = \"n_estimators\"\n", + "OBJECTIVE = \"objective\"\n", + 
"BOOSTING_TYPE = \"boosting_type\"\n", + "LEARNING_RATE = \"learning_rate\"\n", + "NUM_LEAVES = \"num_leaves\"\n", + "\n", + "REGRESSION = \"regression\"\n", + "BINARY = \"binary\"\n", + "\n", + "\n", + "def run_lgbm(task_config: dict):\n", + " logging.info(\"Running lgbm training...\")\n", + "\n", + " assert (\n", + " task_config[common.COMPONENT_NAME] == COMPONENT_NAME\n", + " ), f\"Component name should be {COMPONENT_NAME}, but got {task_config[common.COMPONENT_NAME]}\"\n", + "\n", + " inputs = task_config[common.INPUTS]\n", + " outputs = task_config[common.OUTPUTS]\n", + "\n", + " assert len(inputs) == 1, f\"{COMPONENT_NAME} should have only 1 input\"\n", + " assert len(outputs) == 1, f\"{COMPONENT_NAME} should have only 1 output\"\n", + "\n", + " # get train data\n", + " logging.info(\"Loading training data...\")\n", + " df = common.gen_data_frame(inputs[0])\n", + "\n", + " # labels in schema can be multiple, but eval target label is unique(in params)\n", + " ids = inputs[0][IDS]\n", + " labels = inputs[0][LABEL]\n", + " assert len(labels) == 1, f\"{COMPONENT_NAME} should have only 1 labels column\"\n", + "\n", + " features = inputs[0][common.SCHEMA][common.FEATURES]\n", + " features = [feature for feature in features if feature not in ids + labels]\n", + "\n", + " X = df[features]\n", + " Y = pandas.to_numeric(df[labels[0]], errors=\"coerce\")\n", + "\n", + " param = dict()\n", + " param_keys = [N_ESTIMATORS, OBJECTIVE, BOOSTING_TYPE, LEARNING_RATE, NUM_LEAVES]\n", + "\n", + " for key in param_keys:\n", + " param[key] = task_config[key]\n", + "\n", + " if param[OBJECTIVE] == REGRESSION:\n", + " model = lgb.LGBMRegressor(**param)\n", + " elif param[OBJECTIVE] == BINARY:\n", + " model = lgb.LGBMClassifier(**param)\n", + " else:\n", + " raise RuntimeError(f\"unsupported objective function: {param[OBJECTIVE]}\")\n", + "\n", + " # train model\n", + " model.fit(X, Y)\n", + "\n", + " logging.info(\"Setting origin feature_name in model...\")\n", + " 
model.origin_feature_name_ = features\n", + "\n", + " # dump model\n", + " logging.info(\"Dumping model...\")\n", + " model_data_path = outputs[0][common.DATA_PATH]\n", + " joblib.dump(model, model_data_path)\n", + "\n", + "\n", + "def main():\n", + " assert len(sys.argv) == 2, f\"Wrong arguments number: {len(sys.argv)}\"\n", + " # load task_config json\n", + " task_config_path = sys.argv[1]\n", + " logging.info(\"Reading task config file...\")\n", + " with open(task_config_path, \"r\") as task_config_f:\n", + " task_config = json.load(task_config_f)\n", + " logging.debug(f\"Configurations: {task_config}\")\n", + " run_lgbm(task_config)\n", + "\n", + "\n", + "\"\"\"\n", + "This app is expected to be launched by app framework via running a subprocess \n", + "`python3 lgbm.py config`. Before launching the subprocess, the app framework will \n", + "firstly generate a config file which is a json file containing all the required \n", + "parameters and is serialized from the task.proto. Currently we do not handle any \n", + "errors/exceptions in this file as the outer app framework will capture the stderr \n", + "and stdout.\n", + "\"\"\"\n", + "if __name__ == \"__main__\":\n", + " # TODO set log level\n", + " logging.basicConfig(\n", + " stream=sys.stdout,\n", + " level=logging.INFO,\n", + " format=\"%(asctime)s - %(levelname)s - %(message)s\",\n", + " )\n", + " main()\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 注册组件\n", + "\n", + "### 1. 
在`teeapps/component/component_list.h`中注册组件\n", + "\n", + "在ComponentDomain中新增Domain名,没有新增Domain则不需要添加。\n", + "\n", + "在ComponentName中新增组件名:\n", + "\n", + "```c++\n", + "struct ComponentName {\n", + " ...\n", + " static constexpr char kLgbmTrainComp[] = \"lgbm_train\";\n", + " ...\n", + "};\n", + "```\n", + "\n", + "在ComponentPyFile中新增组件执行逻辑的python文件名:\n", + "\n", + "```c++\n", + "struct ComponentPyFile {\n", + " ...\n", + " static constexpr char kLgbmPy[] = \"lgbm.py\";\n", + " ...\n", + "};\n", + "```\n", + "\n", + "在comp_py_map中新增组件名与组件执行python文件名的映射关系:\n", + "\n", + "```c++\n", + "const std::unordered_map comp_py_map = {\n", + " ...\n", + " {ComponentName::kLgbmTrainComp, ComponentPyFile::kLgbmPy},\n", + " ...\n", + "};\n", + "```\n", + "\n", + "在COMP_DEF_MAP中新增组件全名与组件定义的映射关系:\n", + "\n", + "```c++\n", + "const std::map COMP_DEF_MAP = {\n", + " ...\n", + " {GenCompFullName(ComponentDomain::kMlTrainDomain,\n", + " ComponentName::kLgbmTrainComp, kCompVersion),\n", + " secretflow::spec::v1::ComponentDef(\n", + " *teeapps::component::LgbmTrainComponent::GetInstance().Definition())},\n", + " ...\n", + " };\n", + "```\n", + "\n", + "### 2. 
增加翻译(可选)\n", + "在teeapps/component/all_translation_cn.json中增加组件名称和参数的翻译,例如:\n", + "```json\n", + "{\n", + " ...\n", + " \"ml.train/lgbm_train:0.0.1\": {\n", + " \"ml.train\": \"模型训练\",\n", + " \"lgbm_train\": \"LightGBM训练\",\n", + " \"LightGBM train component for individual dataset.\": \"为独立数据集提供LightGBM训练能力的组件\",\n", + " \"0.0.1\": \"0.0.1\",\n", + " \"n_estimators\": \"训练轮数\",\n", + " \"Number of boosted trees to fit.\": \"训练轮数\",\n", + " \"objective\": \"学习目标\",\n", + " \"Specify the learning objective.\": \"指定学习目标(二分类或回归)\",\n", + " \"boosting_type\": \"基学习类型\",\n", + " \"Boosting type.\": \"基学习类型\",\n", + " \"learning_rate\": \"学习率\",\n", + " \"Learning rate.\": \"学习率\",\n", + " \"num_leaves\": \"叶子数\",\n", + " \"Max number of leaves in one tree.\": \"一棵树中的最大叶子数量\",\n", + " \"train_dataset\": \"训练数据集\",\n", + " \"Input table.\": \"输入的训练数据集\",\n", + " \"ids\": \"id列\",\n", + " \"Id columns will not be trained.\": \"指定的id列不会作为训练的特征\",\n", + " \"label\": \"标签列\",\n", + " \"Label column.\": \"标签列\",\n", + " \"output_model\": \"输出模型\",\n", + " \"Output model.\": \"输出模型\"\n", + " },\n", + " ...\n", + "}\n", + "```\n", + "\n", + "### 3. 
生成新的组件列表(可选)\n", + "进入开发容器\n", + "```bash\n", + "bash env.sh\n", + "bash env.sh enter\n", + "```\n", + "\n", + "编译component目录\n", + "```bash\n", + "bazel --output_base=target build //teeapps/component/...\n", + "```\n", + "\n", + "生成组件列表和翻译列表\n", + "```bash\n", + "./bazel-bin/teeapps/component/main\n", + "```\n", + "您将在`teeapps/component/comp_list.json`中看到新的组件列表,它对应secretpad中的[trustedflow组件定义](https://github.com/secretflow/secretpad/blob/main/config/components/trustedflow.json)\n", + "在`teeapps/component/translation.json`中看到相关字段的翻译,它对应secretpad中的[trustedflow组件翻译](https://github.com/secretflow/secretpad/blob/main/config/i18n/trustedflow.json)。\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 构建Teeapps镜像\n", + "在主机上用`deployment`目录下的`build.sh`脚本来构建不同平台下的镜像。\n", + "\n", + "对于sgx平台,在运行脚本前还需要在`deployment/occlum/python.yaml`中添加`lgbm.py`,如下:\n", + "\n", + "```yaml\n", + "includes:\n", + " - base.yaml\n", + "targets:\n", + " - target: /bin\n", + " createlinks:\n", + " - src: /opt/python-occlum/bin/python3\n", + " linkname: python3\n", + " # python packages\n", + " - target: /opt\n", + " copy: \n", + " - dirs:\n", + " - /home/teeapp/python-occlum\n", + " - target: /\n", + " copy:\n", + " - from: /home/teeapp/occlum/teeapps/biz\n", + " dirs:\n", + " - secretflow\n", + " - teeapps\n", + " files: \n", + " - biclassification_eval.py\n", + " - feature_filter.py\n", + " - train_test_split.py\n", + " - lr.py\n", + " - predict.py\n", + " - prediction_bias_eval.py\n", + " - psi.py\n", + " - pearsonr.py\n", + " - vif.py\n", + " - table_statistics.py\n", + " - woe_binning.py\n", + " - woe_substitution.py\n", + " - xgb.py\n", + " - lgbm.py\n", + " - __init__.py\n", + "```\n", + "\n", + "`build.sh`镜像构建脚本执行方式如下:\n", + "\n", + "```bash\n", + "cd deployment\n", + "\n", + "bash build.sh -p sim -v ${VERSION}\n", + "\n", + "bash build.sh -p sgx -v ${VERSION}\n", + "\n", + "bash build.sh -p tdx -v ${VERSION}\n", + "\n", + "bash build.sh -p csv -v ${VERSION}\n", + 
"```" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/index.rst b/docs/index.rst index 75d9021..5a7274b 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -47,6 +47,9 @@ TrustedFlow保护了数据使用中(data-in-use)、数据存储(data-at-re 2. `授权策略 `_ 3. `可信应用 `_ +自定义可信组件 +--------------- +开发新的自定义可信组件,欢迎阅读 `新组件开发教程 `_ 。 高阶话题 ----------- @@ -72,4 +75,5 @@ TrustedFlow功能列表和路线图,欢迎阅读 `路线图 <./advanced_topic/ quick_start/index architecture/index + development/index advanced_topic/index diff --git a/docs/quick_start/step3.ipynb b/docs/quick_start/step3.ipynb index 1f4b974..5258e83 100644 --- a/docs/quick_start/step3.ipynb +++ b/docs/quick_start/step3.ipynb @@ -40,7 +40,7 @@ "- `rule_id`:alice为它要授权的规则取了id号为alice_rule_id_1。如果后续有删除该条规则的需求,可以根据该id号来做删除。\n", "- `grantee_party_ids`: alice指定被他授权的人是carol,因为可以授权给多个人,所以是一个列表。\n", "- `columns`: alice允许carol使用数据的这些列:id、mean radius、mean texture、mean perimeter、mean area、mean smoothness。\n", - "- `op_constraints`: alice允许carol执行以下计算:数据求交(OP_PSI)、数据拆分(OP_DATASET_SPLIT)、XGB训练(OP_XGB)、XGB预测(OP_XGB_PREDICT)、二分类评估(OP_BICLASSIFIER_EVALUATION)。关于算子的更详细说明,可以阅读[可信应用](../architecture/apps/index.rst)。\n", + "- `op_constraints`: alice允许carol执行以下计算:数据求交(`psi`)、数据拆分(`train_test_split`)、XGB训练(`xgb_train`)、XGB预测(`xgb_predict`)、二分类评估(`biclassification_eval`)。关于算子的更详细说明,可以阅读[可信APP](../architecture/apps/index.rst)。\n", "\n", "\n", "下面的配置还需要您根据实际情况进行完善,包含:\n", @@ -73,27 +73,27 @@ " op_constraints:\n", " - \n", " # (required) str\n", - " op_name: OP_PSI\n", + " op_name: psi\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_DATASET_SPLIT\n", + " op_name: train_test_split\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_XGB\n", + " op_name: xgb_train\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_PREDICT\n", + " op_name: xgb_predict\n", " # 
(optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_BICLASSIFIER_EVALUATION\n", + " op_name: biclassification_eval\n", " # (optional) List[str]\n", " constraints:\n", "```\n", @@ -130,27 +130,27 @@ " op_constraints:\n", " - \n", " # (required) str\n", - " op_name: OP_PSI\n", + " op_name: psi\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_DATASET_SPLIT\n", + " op_name: train_test_split\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_XGB\n", + " op_name: xgb_train\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_PREDICT\n", + " op_name: xgb_predict\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_BICLASSIFIER_EVALUATION\n", + " op_name: biclassification_eval\n", " # (optional) List[str]\n", " constraints:\n", "```\n", @@ -245,7 +245,7 @@ "- `rule_id`:alice为它要授权的规则取了id号为alice_rule_id_1。如果后续有删除该条规则的需求,可以根据该id号来做删除。\n", "- `grantee_party_ids`: alice指定被他授权的人是carol,因为可以授权给多个人,所以是一个列表。\n", "- `columns`: alice允许carol使用数据的这些列:id、mean radius、mean texture、mean perimeter、mean area、mean smoothness。\n", - "- `op_constraints`: alice允许carol执行以下计算:数据求交(OP_PSI)、数据拆分(OP_DATASET_SPLIT)、XGB训练(OP_XGB)、XGB预测(OP_XGB_PREDICT)、二分类评估(OP_BICLASSIFIER_EVALUATION)。关于算子的更详细说明,可以阅读[可信应用](../architecture/apps/index.rst)。\n", + "- `op_constraints`: alice允许carol执行以下计算:数据求交(`psi`)、数据拆分(`train_test_split`)、XGB训练(`xgb_train`)、XGB预测(`xgb_predict`)、二分类评估(`biclassification_eval`)。关于算子的更详细说明,可以阅读[可信APP](../architecture/apps/index.rst)。\n", "\n", "下面的配置还需要您根据实际情况进行完善,包含:\n", "\n", @@ -281,27 +281,27 @@ " op_constraints:\n", " - \n", " # (required) str\n", - " op_name: OP_PSI\n", + " op_name: psi\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_DATASET_SPLIT\n", + " op_name: train_test_split\n", " # (optional) 
List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_XGB\n", + " op_name: xgb_train\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_PREDICT\n", + " op_name: xgb_predict\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_BICLASSIFIER_EVALUATION\n", + " op_name: biclassification_eval\n", " # (optional) List[str]\n", " constraints:\n", "```\n", @@ -338,27 +338,27 @@ " op_constraints:\n", " - \n", " # (required) str\n", - " op_name: OP_PSI\n", + " op_name: psi\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_DATASET_SPLIT\n", + " op_name: train_test_split\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_XGB\n", + " op_name: xgb_train\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_PREDICT\n", + " op_name: xgb_predict\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_BICLASSIFIER_EVALUATION\n", + " op_name: biclassification_eval\n", " # (optional) List[str]\n", " constraints:\n", "```\n", @@ -400,7 +400,7 @@ "- `rule_id`:alice为它要授权的规则取了id号为alice_rule_id_1。如果后续有删除该条规则的需求,可以根据该id号来做删除。\n", "- `grantee_party_ids`: alice指定被他授权的人是carol,因为可以授权给多个人,所以是一个列表。\n", "- `columns`: alice允许carol使用数据的这些列:id、mean radius、mean texture、mean perimeter、mean area、mean smoothness。\n", - "- `op_constraints`: alice允许carol执行以下计算:数据求交(OP_PSI)、数据拆分(OP_DATASET_SPLIT)、XGB训练(OP_XGB)、XGB预测(OP_XGB_PREDICT)、二分类评估(OP_BICLASSIFIER_EVALUATION)。关于算子的更详细说明,可以阅读[可信应用](../architecture/apps/index.rst)。\n", + "- `op_constraints`: alice允许carol执行以下计算:数据求交(`psi`)、数据拆分(`train_test_split`)、XGB训练(`xgb_train`)、XGB预测(`xgb_predict`)、二分类评估(`biclassification_eval`)。关于算子的更详细说明,可以阅读[可信APP](../architecture/apps/index.rst)。\n", "\n", "下面的配置还需要您根据实际情况进行完善,包含:\n", "\n", @@ -433,27 +433,27 @@ " 
op_constraints:\n", " - \n", " # (required) str\n", - " op_name: OP_PSI\n", + " op_name: psi\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_DATASET_SPLIT\n", + " op_name: train_test_split\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_XGB\n", + " op_name: xgb_train\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_PREDICT\n", + " op_name: xgb_predict\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_BICLASSIFIER_EVALUATION\n", + " op_name: biclassification_eval\n", " # (optional) List[str]\n", " constraints:\n", "```\n", @@ -490,27 +490,27 @@ " op_constraints:\n", " - \n", " # (required) str\n", - " op_name: OP_PSI\n", + " op_name: psi\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_DATASET_SPLIT\n", + " op_name: train_test_split\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_XGB\n", + " op_name: xgb_train\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_PREDICT\n", + " op_name: xgb_predict\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_BICLASSIFIER_EVALUATION\n", + " op_name: biclassification_eval\n", " # (optional) List[str]\n", " constraints:\n", "```\n", @@ -554,7 +554,7 @@ "- `rule_id`:alice为它要授权的规则取了id号为alice_rule_id_1。如果后续有删除该条规则的需求,可以根据该id号来做删除。\n", "- `grantee_party_ids`: alice指定被他授权的人是carol,因为可以授权给多个人,所以是一个列表。\n", "- `columns`: alice允许carol使用数据的这些列:id、mean radius、mean texture、mean perimeter、mean area、mean smoothness。\n", - "- `op_constraints`: alice允许carol执行以下计算:数据求交(OP_PSI)、数据拆分(OP_DATASET_SPLIT)、XGB训练(OP_XGB)、XGB预测(OP_XGB_PREDICT)、二分类评估(OP_BICLASSIFIER_EVALUATION)。关于算子的更详细说明,可以阅读[可信应用](../architecture/apps/index.rst)。\n", + "- `op_constraints`: 
alice允许carol执行以下计算:数据求交(`psi`)、数据拆分(`train_test_split`)、XGB训练(`xgb_train`)、XGB预测(`xgb_predict`)、二分类评估(`biclassification_eval`)。关于算子的更详细说明,可以阅读[可信APP](../architecture/apps/index.rst)。\n", "\n", "下面的配置还需要您根据实际情况进行完善,包含:\n", "\n", @@ -588,27 +588,27 @@ " op_constraints:\n", " - \n", " # (required) str\n", - " op_name: OP_PSI\n", + " op_name: psi\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_DATASET_SPLIT\n", + " op_name: train_test_split\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_XGB\n", + " op_name: xgb_train\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_PREDICT\n", + " op_name: xgb_predict\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_BICLASSIFIER_EVALUATION\n", + " op_name: biclassification_eval\n", " # (optional) List[str]\n", " constraints:\n", "```\n", @@ -643,27 +643,27 @@ " op_constraints:\n", " - \n", " # (required) str\n", - " op_name: OP_PSI\n", + " op_name: psi\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_DATASET_SPLIT\n", + " op_name: train_test_split\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_XGB\n", + " op_name: xgb_train\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_PREDICT\n", + " op_name: xgb_predict\n", " # (optional) List[str]\n", " constraints:\n", " -\n", " # (required) str\n", - " op_name: OP_BICLASSIFIER_EVALUATION\n", + " op_name: biclassification_eval\n", " # (optional) List[str]\n", " constraints:\n", "```\n",