From 33af18051b303d3119d4a4c1e633f14eac7113eb Mon Sep 17 00:00:00 2001 From: wangbingnan136 <42114835+wangbingnan136@users.noreply.github.com> Date: Mon, 28 Mar 2022 13:17:12 +0800 Subject: [PATCH 1/2] Create adversial_training.py --- stabilizer/adversial_training.py | 219 +++++++++++++++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 stabilizer/adversial_training.py diff --git a/stabilizer/adversial_training.py b/stabilizer/adversial_training.py new file mode 100644 index 0000000..0b2d232 --- /dev/null +++ b/stabilizer/adversial_training.py @@ -0,0 +1,219 @@ +import torch +import torch.nn + +class FGM(object): + """ + Example + # initialization + fgm = FGM(model,epsilon=1,emb_name='word_embeddings.') + for batch_input, batch_label in processor: + # normal training + loss = model(batch_input, batch_label) + loss.backward() # backward,get the normal gradients + # begin adversarial training + fgm.attack() # ddd adversarial perturbation to the embedding + loss_adv = model(batch_input, batch_label) + loss_adv.backward() # Backpropagation, and on the basis of the normal grad, accumulate the gradient of the adversarial training + fgm.restore() # restore embedding parameters + # gradient descent, update parameters as usual + optimizer.step() + model.zero_grad() + """ + + def __init__(self, model, epsilon=1.0,emb_name='word_embeddings.'): + self.model = model + self.epsilon = epsilon + self.backup = {} + self.emb_name=emb_name + + def attack(self): + emb_name=self.emb_name + for name, param in self.model.named_parameters(): + if param.requires_grad and emb_name in name: + self.backup[name] = param.data.clone() + norm = torch.norm(param.grad) + if norm != 0 and not torch.isnan(norm): + r_at = self.epsilon * param.grad / norm + param.data.add_(r_at) + + def restore(self): + emb_name=self.emb_name + for name, param in self.model.named_parameters(): + if param.requires_grad and emb_name in name: + assert name in self.backup + param.data = self.backup[name] + self.backup = {} + + +class PGD(object): + """ + Example + pgd = PGD(model,emb_name='word_embeddings.',epsilon=1.0,alpha=0.3) + K = 3 + for batch_input, batch_label in processor: + # normal training + loss = model(batch_input, batch_label) + loss.backward() # Backpropagation, get normal grad + pgd.backup_grad() + # adversarial training + for t in range(K): + pgd.attack(is_first_attack=(t==0)) #add adversarial perturbation to embedding, backup param.processor during first attack + if t != K-1: + model.zero_grad() + else: + pgd.restore_grad() + loss_adv = model(batch_input, batch_label) + loss_adv.backward() # Backpropagation, and on the basis of the normal grad, accumulate the gradient of the adversarial training + pgd.restore() # restore embedding parameters + # gradient descent, update parameters + optimizer.step() + model.zero_grad() + """ + + def __init__(self, model, emb_name, epsilon=1.0, alpha=0.3): + self.model = model + self.emb_name = emb_name + self.epsilon = epsilon + self.alpha = alpha + self.emb_backup = {} + self.grad_backup = {} + + def attack(self, is_first_attack=False): + for name, param in self.model.named_parameters(): + if param.requires_grad and self.emb_name in name: + if is_first_attack: + self.emb_backup[name] = param.data.clone() + norm = torch.norm(param.grad) + if norm != 0: + r_at = self.alpha * param.grad / norm + param.data.add_(r_at) + param.data = self.project(name, param.data, self.epsilon) + + def restore(self): + for name, param in self.model.named_parameters(): + if param.requires_grad and 
self.emb_name in name: + assert name in self.emb_backup + param.data = self.emb_backup[name] + self.emb_backup = {} + + def project(self, param_name, param_data, epsilon): + r = param_data - self.emb_backup[param_name] + if torch.norm(r) > epsilon: + r = epsilon * r / torch.norm(r) + return self.emb_backup[param_name] + r + + def backup_grad(self): + for name, param in self.model.named_parameters(): + if param.requires_grad and param.grad is not None: + self.grad_backup[name] = param.grad.clone() + + def restore_grad(self): + for name, param in self.model.named_parameters(): + if param.requires_grad and param.grad is not None: + param.grad = self.grad_backup[name] + + +class FreeLB(object): + """ + https://arxiv.org/pdf/1909.11764.pdf + ExampleL:Currently only for bert + freelb = FreeLB() + K = 3 + for batch_input, batch_label in processor: + loss = freelb.attack(model,inputs,.....) + """ + + def __init__( + self, + adv_K, + adv_lr, + adv_init_mag, + adv_max_norm=0.0, + adv_norm_type="l2", + base_model="bert", + ): + self.adv_K = adv_K + self.adv_lr = adv_lr + self.adv_max_norm = adv_max_norm + self.adv_init_mag = adv_init_mag + self.adv_norm_type = adv_norm_type + self.base_model = base_model + + def attack(self, model, inputs, gradient_accumulation_steps=1): + input_ids = inputs["input_ids"] + if isinstance(model, torch.nn.DataParallel): + embeds_init = getattr( + model.module, self.base_model + ).embeddings.word_embeddings(input_ids) + else: + embeds_init = getattr(model, self.base_model).embeddings.word_embeddings( + input_ids + ) + if self.adv_init_mag > 0: + input_mask = inputs["attention_mask"].to(embeds_init) + input_lengths = torch.sum(input_mask, 1) + if self.adv_norm_type == "l2": + delta = torch.zeros_like(embeds_init).uniform_( + -1, 1 + ) * input_mask.unsqueeze(2) + dims = input_lengths * embeds_init.size(-1) + mag = self.adv_init_mag / torch.sqrt(dims) + delta = (delta * mag.view(-1, 1, 1)).detach() + elif self.adv_norm_type == "linf": + delta = torch.zeros_like(embeds_init).uniform_( + -self.adv_init_mag, self.adv_init_mag + ) + delta = delta * input_mask.unsqueeze(2) + else: + delta = torch.zeros_like(embeds_init) + + for astep in range(self.adv_K): + delta.requires_grad_() + inputs["inputs_embeds"] = delta + embeds_init + inputs["input_ids"] = None + outputs = model(**inputs) + loss, logits = outputs[ + :2 + ] # model outputs are always tuple in transformers (see doc) + loss = loss.mean() # mean() to average on multi-gpu parallel training + loss = loss / gradient_accumulation_steps + loss.backward() + delta_grad = delta.grad.clone().detach() + if self.adv_norm_type == "l2": + denorm = torch.norm( + delta_grad.view(delta_grad.size(0), -1), dim=1 + ).view(-1, 1, 1) + denorm = torch.clamp(denorm, min=1e-8) + delta = (delta + self.adv_lr * delta_grad / denorm).detach() + if self.adv_max_norm > 0: + delta_norm = torch.norm( + delta.view(delta.size(0), -1).float(), p=2, dim=1 + ).detach() + exceed_mask = (delta_norm > self.adv_max_norm).to(embeds_init) + reweights = ( + self.adv_max_norm / delta_norm * exceed_mask + (1 - exceed_mask) + ).view(-1, 1, 1) + delta = (delta * reweights).detach() + elif self.adv_norm_type == "linf": + denorm = torch.norm( + delta_grad.view(delta_grad.size(0), -1), dim=1, p=float("inf") + ).view(-1, 1, 1) + denorm = torch.clamp(denorm, min=1e-8) + delta = (delta + self.adv_lr * delta_grad / denorm).detach() + if self.adv_max_norm > 0: + delta = torch.clamp( + delta, -self.adv_max_norm, self.adv_max_norm + ).detach() + else: + raise ValueError( + "Norm 
type {} not specified.".format(self.adv_norm_type) + ) + if isinstance(model, torch.nn.DataParallel): + embeds_init = getattr( + model.module, self.base_model + ).embeddings.word_embeddings(input_ids) + else: + embeds_init = getattr( + model, self.base_model + ).embeddings.word_embeddings(input_ids) + return loss \ No newline at end of file From c3262acee358b929851ccde3920875796503866e Mon Sep 17 00:00:00 2001 From: wangbingnan136 <42114835+wangbingnan136@users.noreply.github.com> Date: Tue, 29 Mar 2022 01:22:07 +0800 Subject: [PATCH 2/2] Create deberta in kaggle U.S. Patent Phrase to Phrase Matching with FGM and custom huggingface trainer.ipynb --- ...h FGM and custom huggingface trainer.ipynb | 614 ++++++++++++++++++ 1 file changed, 614 insertions(+) create mode 100644 examples/deberta in kaggle U.S. Patent Phrase to Phrase Matching with FGM and custom huggingface trainer.ipynb diff --git a/examples/deberta in kaggle U.S. Patent Phrase to Phrase Matching with FGM and custom huggingface trainer.ipynb b/examples/deberta in kaggle U.S. Patent Phrase to Phrase Matching with FGM and custom huggingface trainer.ipynb new file mode 100644 index 0000000..0d78c4c --- /dev/null +++ b/examples/deberta in kaggle U.S. Patent Phrase to Phrase Matching with FGM and custom huggingface trainer.ipynb @@ -0,0 +1,614 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "QKcPrW9PfEgm" + }, + "source": [ + "# Begomg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EwQu561KaVZu" + }, + "outputs": [], + "source": [ + "!pip install transformers\n", + "!pip install datasets\n", + "!pip install sentencepiece" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "r-DR0OHwjIYj" + }, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "import joblib\n", + "from sklearn.model_selection import KFold, StratifiedKFold\n", + "import shutil\n", + "from sklearn.model_selection import StratifiedKFold,GroupKFold\n", + "\n", + "from torch.utils.data import DataLoader, Dataset\n", + "import transformers\n", + "#datasets, transformers\n", + "from transformers import TrainingArguments, Trainer\n", + "from transformers import AutoModelForSequenceClassification, AutoTokenizer,AutoConfig\n", + "import joblib\n", + "\n", + "import random\n", + "import os\n", + "import numpy as np\n", + "import torch\n", + "def seed_everything(seed=42):\n", + " random.seed(seed)\n", + " os.environ['PYTHONHASHSEED'] = str(seed)\n", + " np.random.seed(seed)\n", + " torch.manual_seed(seed)\n", + " torch.cuda.manual_seed(seed)\n", + " torch.backends.cudnn.deterministic = True\n", + " \n", + "seed_everything(seed=42)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5uGQc5L2zfAr" + }, + "outputs": [], + "source": [ + "class CFG:\n", + " input_path = '/content/drive/MyDrive/kaggle_competition/U.S. Patent Phrase to Phrase Matching/'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4v7noOvBTmAe" + }, + "outputs": [], + "source": [ + "cpc_texts=joblib.load('/content/drive/MyDrive/kaggle_competition/U.S. 
Patent Phrase to Phrase Matching/cpc_texts')\n", + "test_df = pd.read_csv(f\"{CFG.input_path}test.csv\")\n", + "train_df = pd.read_csv(f\"{CFG.input_path}train.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4kHgYZmbx0sv" + }, + "source": [ + "# deberta v3\n", + "from https://www.kaggle.com/code/yasufuminakama/pppm-deberta-v3-large-baseline-w-w-b-train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "I4y_6x8_x16u" + }, + "outputs": [], + "source": [ + "from IPython.display import display\n", + "class CFG:\n", + " wandb=True\n", + " competition='PPPM'\n", + " _wandb_kernel='nakama'\n", + " print_freq=100\n", + " num_workers=4\n", + " model=\"microsoft/deberta-v3-large\"\n", + " scheduler='linear' # ['linear', 'cosine']\n", + " batch_scheduler=True\n", + " num_cycles=0.5\n", + " num_warmup_steps=0\n", + " epochs=50\n", + " encoder_lr=2e-5\n", + " decoder_lr=2e-3\n", + " min_lr=1e-6\n", + " eps=1e-6\n", + " betas=(0.9, 0.999)\n", + " batch_size=16\n", + " fc_dropout=0.2\n", + " target_size=1\n", + " max_len=512\n", + " weight_decay=0.01\n", + " gradient_accumulation_steps=1\n", + " max_grad_norm=1\n", + " seed=42\n", + " n_fold=5\n", + " trn_fold=[0, 1, 2, 3]\n", + " train=True\n", + " model_path = '/content/drive/MyDrive/kaggle_competition/U.S. Patent Phrase to Phrase Matching/'\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vm3GqjZ3zrnm" + }, + "outputs": [], + "source": [ + "def get_logger(filename='train'):\n", + " from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter\n", + " logger = getLogger(__name__)\n", + " logger.setLevel(INFO)\n", + " handler1 = StreamHandler()\n", + " handler1.setFormatter(Formatter(\"%(message)s\"))\n", + " handler2 = FileHandler(filename=f\"{filename}.log\")\n", + " handler2.setFormatter(Formatter(\"%(message)s\"))\n", + " logger.addHandler(handler1)\n", + " logger.addHandler(handler2)\n", + " return logger\n", + "\n", + "LOGGER = get_logger()\n", + "\n", + "\n", + "train=train_df\n", + "test=test_df\n", + "\n", + "cpc_texts=joblib.load(\"/content/drive/MyDrive/kaggle_competition/U.S. 
Patent Phrase to Phrase Matching/cpc_texts\")\n", + "\n", + "train['context_text'] = train['context'].map(cpc_texts)\n", + "test['context_text'] = test['context'].map(cpc_texts)\n", + "\n", + "train['text'] = train['anchor'] + '[SEP]' + train['target'] + '[SEP]' + train['context_text']\n", + "test['text'] = test['anchor'] + '[SEP]' + test['target'] + '[SEP]' + test['context_text']\n", + "display(train.head())\n", + "display(test.head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LfMyAvaYyr08" + }, + "outputs": [], + "source": [ + "train['score_map'] = train['score'].map({0.00: 0, 0.25: 1, 0.50: 2, 0.75: 3, 1.00: 4})\n", + "Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)\n", + "for n, (train_index, val_index) in enumerate(Fold.split(train, train['score_map'])):\n", + " train.loc[val_index, 'fold'] = int(n)\n", + "train['fold'] = train['fold'].astype(int)\n", + "display(train.groupby('fold').size())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MgHZfbMEzDpm" + }, + "outputs": [], + "source": [ + "tokenizer = AutoTokenizer.from_pretrained(CFG.model)\n", + "CFG.tokenizer = tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NLhSoTJ8zLLR" + }, + "outputs": [], + "source": [ + "from tqdm import tqdm\n", + "lengths_dict = {}\n", + "\n", + "lengths = []\n", + "tk0 = tqdm(cpc_texts.values(), total=len(cpc_texts))\n", + "for text in tk0:\n", + " length = len(tokenizer(text, add_special_tokens=False)['input_ids'])\n", + " lengths.append(length)\n", + "lengths_dict['context_text'] = lengths\n", + "\n", + "for text_col in ['anchor', 'target']:\n", + " lengths = []\n", + " tk0 = tqdm(train[text_col].fillna(\"\").values, total=len(train))\n", + " for text in tk0:\n", + " length = len(tokenizer(text, add_special_tokens=False)['input_ids'])\n", + " lengths.append(length)\n", + " lengths_dict[text_col] = lengths\n", + " \n", + "CFG.max_len = max(lengths_dict['anchor']) + max(lengths_dict['target'])\\\n", + " + max(lengths_dict['context_text']) + 4 # CLS + SEP + SEP + SEP\n", + "LOGGER.info(f\"max_len: {CFG.max_len}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ki-fHB3L0FTe" + }, + "outputs": [], + "source": [ + "from torch import nn\n", + "\n", + "class Deberta(nn.Module):\n", + " def __init__(self, cfg, config_path=None, pretrained=False):\n", + " super().__init__()\n", + " self.cfg = cfg\n", + " if config_path is None:\n", + " self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)\n", + " else:\n", + " self.config = torch.load(config_path)\n", + " if pretrained:\n", + " self.model = AutoModel.from_pretrained(cfg.model, config=self.config)\n", + " else:\n", + " self.model = AutoModel.from_config(self.config)\n", + " self.fc_dropout = nn.Dropout(cfg.fc_dropout)\n", + " self.fc = nn.Linear(self.config.hidden_size, self.cfg.target_size)\n", + " self._init_weights(self.fc)\n", + " self.attention = nn.Sequential(\n", + " nn.Linear(self.config.hidden_size, 512),\n", + " nn.Tanh(),\n", + " nn.Linear(512, 1),\n", + " nn.Softmax(dim=1)\n", + " )\n", + " self._init_weights(self.attention)\n", + " \n", + " def _init_weights(self, module):\n", + " if isinstance(module, nn.Linear):\n", + " module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)\n", + " if module.bias is not None:\n", + " module.bias.data.zero_()\n", + " elif isinstance(module, nn.Embedding):\n", + " 
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)\n", + " if module.padding_idx is not None:\n", + " module.weight.data[module.padding_idx].zero_()\n", + " elif isinstance(module, nn.LayerNorm):\n", + " module.bias.data.zero_()\n", + " module.weight.data.fill_(1.0)\n", + " \n", + " def feature(self, inputs):\n", + " for key in inputs.keys():\n", + " if inputs[key].dim()==3:\n", + " inputs[key].squeeze_(1)\n", + " outputs = self.model(**inputs)\n", + " last_hidden_states = outputs[0]\n", + " # feature = torch.mean(last_hidden_states, 1)\n", + " weights = self.attention(last_hidden_states)\n", + " feature = torch.sum(weights * last_hidden_states, dim=1)\n", + " return feature\n", + "\n", + " def forward(self, inputs):\n", + " feature = self.feature(inputs)\n", + " output = self.fc(self.fc_dropout(feature))\n", + " return output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "cR2DJa6e0dJq" + }, + "outputs": [], + "source": [ + "from transformers import EarlyStoppingCallback\n", + "from transformers import AutoModel,AutoConfig\n", + "from transformers import Trainer\n", + "from transformers import TrainingArguments\n", + "from torch import nn as nn\n", + "from transformers.trainer_pt_utils import get_parameter_names\n", + "from transformers.optimization import get_scheduler\n", + "from transformers import get_scheduler\n", + "from transformers.trainer_pt_utils import get_parameter_names\n", + "from transformers.optimization import get_scheduler,get_linear_schedule_with_warmup,get_cosine_schedule_with_warmup\n", + "AdamW=torch.optim.AdamW" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EOmgl_aa0jm8" + }, + "outputs": [], + "source": [ + "def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):\n", + " param_optimizer = list(model.named_parameters())\n", + " no_decay = [\"bias\", \"LayerNorm.bias\", \"LayerNorm.weight\"]\n", + " optimizer_parameters = [\n", + " {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],\n", + " 'lr': encoder_lr, 'weight_decay': weight_decay},\n", + " {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],\n", + " 'lr': encoder_lr, 'weight_decay': 0.0},\n", + " {'params': [p for n, p in model.named_parameters() if \"model\" not in n],\n", + " 'lr': decoder_lr, 'weight_decay': 0.0}\n", + " ]\n", + " return optimizer_parameters\n", + "\n", + "\n", + "# ====================================================\n", + "# scheduler\n", + "# ====================================================\n", + "def get_scheduler(cfg, optimizer, num_train_steps):\n", + " if cfg.scheduler == 'linear':\n", + " scheduler = get_linear_schedule_with_warmup(\n", + " optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps\n", + " )\n", + " elif cfg.scheduler == 'cosine':\n", + " scheduler = get_cosine_schedule_with_warmup(\n", + " optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles\n", + " )\n", + " return scheduler\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "419XYkG74dv_" + }, + "outputs": [], + "source": [ + "def prepare_input(cfg, text):\n", + " inputs = cfg.tokenizer(text,\n", + " add_special_tokens=True,\n", + " max_length=cfg.max_len,\n", + " padding=\"max_length\",\n", + " return_offsets_mapping=False)\n", + " for k, v in inputs.items():\n", + " 
inputs[k] = torch.tensor(v, dtype=torch.long)\n", + " return inputs\n", + "\n", + "\n", + "class TrainDataset(Dataset):\n", + " def __init__(self, cfg, df):\n", + " self.cfg = cfg\n", + " self.texts = df['text'].values\n", + " self.labels = df['score'].values\n", + "\n", + " def __len__(self):\n", + " return len(self.labels)\n", + "\n", + " def __getitem__(self, item):\n", + " inputs = prepare_input(self.cfg, self.texts[item])\n", + " label = torch.tensor(self.labels[item], dtype=torch.float)\n", + " return {'inputs':inputs, 'labels':label}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "bzq9uAKB0IcB" + }, + "outputs": [], + "source": [ + "from transformers import EarlyStoppingCallback\n", + "import scipy as sp\n", + "from apex import amp\n", + "from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union\n", + "\n", + "\n", + "def compute_metrics(eval_pred):\n", + "\n", + " predictions, labels = eval_pred\n", + " predictions = sp.special.expit(predictions.flatten())\n", + " #reshape(len(predictions))\n", + " labels = labels.flatten()\n", + " return {\n", + " 'pearson': sp.stats.pearsonr(predictions, labels)[0]\n", + " }\n", + "from transformers import Trainer\n", + "\n", + "class FGM(object):\n", + " \"\"\"\n", + " Example\n", + " fgm = FGM(model,epsilon=1,emb_name='word_embeddings.')\n", + " for batch_input, batch_label in processor:\n", + " # 正常训练\n", + " loss = model(batch_input, batch_label)\n", + " loss.backward() \n", + "\n", + " # 对抗训练\n", + " fgm.attack() \n", + "\n", + " loss_adv = model(batch_input, batch_label)\n", + "\n", + " loss_adv.backward()\n", + " fgm.restore()\n", + " optimizer.step()\n", + " model.zero_grad()\n", + " \"\"\"\n", + "\n", + " def __init__(self, model, epsilon=1.0,emb_name='word_embeddings.'):\n", + " self.model = model\n", + " self.epsilon = epsilon\n", + " self.backup = {}\n", + " self.emb_name=emb_name\n", + "\n", + " def attack(self):\n", + " emb_name=self.emb_name\n", + " for name, param in self.model.named_parameters():\n", + " if param.requires_grad and emb_name in name:\n", + " self.backup[name] = param.data.clone()\n", + " norm = torch.norm(param.grad)\n", + " if norm != 0 and not torch.isnan(norm):\n", + " r_at = self.epsilon * param.grad / norm\n", + " param.data.add_(r_at)\n", + "\n", + " def restore(self):\n", + " emb_name = self.emb_name\n", + " for name, param in self.model.named_parameters():\n", + " if param.requires_grad and emb_name in name:\n", + " assert name in self.backup\n", + " param.data = self.backup[name]\n", + " self.backup = {}\n", + "\n", + "\n", + "class Trainers(Trainer):\n", + "\n", + " def compute_loss(self, model, inputs, return_outputs=False): ## compute loss这个步骤实际上定义了 model的forward和output以及model的loss计算的结果\n", + " labels = inputs.get(\"labels\")\n", + " logits = model(inputs.get('inputs')) ##在这里定义了foward和batch的计算过程\n", + " loss_fct = nn.BCEWithLogitsLoss(reduction=\"mean\") ## loss可以不断重新定义无所谓的\n", + " loss = loss_fct(logits.view(-1, 1), labels.float().view(-1,1))\n", + " #final_loss = torch.masked_select(loss, labels.view(-1, 1) != -1).mean()\n", + " #return (torch.masked_select(loss, labels.view(-1, 1) != -1).mean(), outputs) if return_outputs else loss\n", + " return (loss, {'outputs':logits}) if return_outputs else loss\n", + "\n", + " # We can add any adversial training methods here by custom training step of huggingface trainer\n", + " def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:\n", + " 
model.train()\n", + " inputs = self._prepare_inputs(inputs)\n", + " fgm = FGM(model,epsilon=1,emb_name='word_embeddings.')\n", + "\n", + " with self.autocast_smart_context_manager():\n", + " loss = self.compute_loss(model, inputs)\n", + " loss = loss / self.args.gradient_accumulation_steps\n", + " loss=self.scaler.scale(loss) ##半精度用torch自带的就行了\n", + " loss.backward() \n", + " fgm.attack() # 在embedding上添加对抗扰动\n", + " loss_adv = self.compute_loss(model, inputs)\n", + " loss_adv.backward()\n", + " fgm.restore()\n", + " return loss_adv.detach()\n", + "\n", + " \n", + "es=EarlyStoppingCallback(early_stopping_patience=10)\n", + "\n", + "\n", + "args=TrainingArguments(output_dir='/content/drive/tmp/deberta_temp',evaluation_strategy='steps',prediction_loss_only =False,overwrite_output_dir =True, \\\n", + " per_device_train_batch_size=16,per_device_eval_batch_size=256,learning_rate=2e-5, \\\n", + " weight_decay=0.01,max_grad_norm=1,num_train_epochs =50,logging_strategy ='epoch',save_steps=72*2,save_total_limit =1,seed=42,eval_steps=72*2,dataloader_num_workers=4, \\\n", + " fp16=True,dataloader_drop_last =False,disable_tqdm =False,load_best_model_at_end=True,sharded_ddp=False,label_smoothing_factor=0.000, \\\n", + " group_by_length=False,gradient_accumulation_steps=1*16,dataloader_pin_memory=False, \\\n", + " metric_for_best_model='pearson', greater_is_better=True,) \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0AsiTfKGnx5P" + }, + "outputs": [], + "source": [ + "import gc\n", + "y_oof=np.zeros(train_df.shape[0])\n", + "pearsons=0\n", + "model_function=Deberta\n", + "datasets=TrainDataset\n", + "args=args\n", + "callbacks=[es]\n", + "\n", + "\n", + "for i in range(5):\n", + " model=model_function(CFG,config_path=None, pretrained=True)\n", + " if i==0:\n", + " torch.save(model.state_dict(), CFG.model_path+CFG.model.split('/')[1]+'-'+'initialization')\n", + "\n", + " model.load_state_dict(torch.load(CFG.model_path+CFG.model.split('/')[1]+'-'+'initialization'))\n", + "\n", + " model.train()\n", + "\n", + " train_dataset=datasets(CFG,train.query('fold!=@i'))\n", + "\n", + " eval_dataset=datasets(CFG,train.query('fold==@i'))\n", + "\n", + " optimizer_parameters = get_optimizer_params(model,\n", + " encoder_lr=CFG.encoder_lr, \n", + " decoder_lr=CFG.decoder_lr,\n", + " weight_decay=CFG.weight_decay)\n", + " AdamW=torch.optim.AdamW\n", + " optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)\n", + " num_train_steps = int(train_df.query('fold!=@i').size / CFG.batch_size * 5)\n", + " lr_scheduler = get_scheduler(CFG, optimizer, num_train_steps)\n", + "\n", + "\n", + " trainer = Trainers(\n", + " model,\n", + " args,\n", + " train_dataset = train_dataset,\n", + " eval_dataset = eval_dataset,\n", + " tokenizer=None,\n", + " data_collator = None,\n", + " callbacks=callbacks,\n", + " optimizers=(optimizer,lr_scheduler),compute_metrics=compute_metrics\n", + " )\n", + " trainer.data_collator=None#You must set the data_collator=None\n", + "\n", + " trainer.train()\n", + "\n", + " torch.save(trainer.model.state_dict(), CFG.model_path+CFG.model.split('/')[1]+'-best_fold_{}'.format(i))\n", + " y_oof[train_df.query('fold==@i').index]=trainer.predict(eval_dataset).predictions.flatten()\n", + " pearsons+=(compute_metrics((y_oof[train_df.query('fold==@i').index],train_df.query('fold==@i').score.values))).get('pearson')/5.0\n", + " del model,trainer,train_dataset,eval_dataset;\n", + " gc.collect()\n", + " torch.cuda.empty_cache()\n", + "\n", 
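+ "# overall out-of-fold Pearson on the concatenated fold predictions, reported alongside the per-fold average computed above\n",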
+ "oof_pearson=compute_metrics((y_oof,train_df.score.values)).get('pearson')\n", + "print(f'average pearson is {pearsons}')\n", + "print(f'oof pearson is {oof_pearson}')\n", + "#pearsons,oof_pearson" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "background_execution": "on", + "collapsed_sections": [ + "4kHgYZmbx0sv" + ], + "machine_shape": "hm", + "name": "deberta.ipynb", + "private_outputs": true, + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}