diff --git a/.gitignore b/.gitignore index d6ba010..b6f9544 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,5 @@ cython_debug/ .DS_Store wandb/ +model/ +.idea/ \ No newline at end of file diff --git a/examples/flock_llm/Dockerfile b/examples/flock_llm/Dockerfile index 85b57a6..5316a1f 100644 --- a/examples/flock_llm/Dockerfile +++ b/examples/flock_llm/Dockerfile @@ -1,4 +1,17 @@ -FROM nvidia/cuda:11.8.0-base-ubuntu22.04 +FROM nvidia/cuda:12.2.0-base-ubuntu22.04 +#FROM nvidia/cuda:11.8.0-base-ubuntu22.04 + +# ================ Setup system environment ================= +ARG WANDB_API_KEY=your_wandb_api_key_here + +ENV WANDB_API_KEY=$WANDB_API_KEY + +# AWS S3 condential info +ARG AWS_ACCESS_KEY_ID +ARG AWS_SECRET_ACCESS_KEY + +ENV AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID +ENV AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY # ================ Install Python from source code ================= @@ -42,4 +55,5 @@ RUN pip install --no-cache-dir -r requirements.txt COPY . . EXPOSE 5000/tcp -CMD [ "python3.11", "./flock_llm_example.py" ] +CMD [ "python3.11", "main.py", "--conf", "config.yaml" ] + diff --git a/examples/flock_llm/FLockLLMFinetuneModel.py b/examples/flock_llm/FLockLLMFinetuneModel.py new file mode 100644 index 0000000..ca214fe --- /dev/null +++ b/examples/flock_llm/FLockLLMFinetuneModel.py @@ -0,0 +1,414 @@ +""" + +FLock LLM example code based on the FLock sdk + +""" +import io +import os +import datasets as pypi_datasets +from datasets import load_dataset as pypi_load_dataset +from utils.helper import print_number_of_trainable_model_parameters +import torch + +from peft import ( + LoraConfig, + get_peft_model, + prepare_model_for_int8_training, + prepare_model_for_kbit_training, + set_peft_model_state_dict, +) +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, AutoConfig + +import bitsandbytes as bnb +from trl import SFTTrainer + +from loguru import logger + +from utils.helper import test_mkdir +from flock_sdk 
import FlockModel +from client import GeneralClient +from prompters.prompter_hub import get_prompter + +pypi_datasets.utils.logging.set_verbosity_error() + +class FLockLLMFinetuneModel(FlockModel): + def __init__( + self, + args, + ): + + self.args = args + + # Model args + self.model_name = args.foundation_model + self.global_model_path = args.foundation_model_pre_trained_weights_path + self.finetune_adapter = args.finetune_adapter + + if args.finetune_adapter.lower() == "lora": + self.lora_r = 16 + self.lora_alpha = 16 + elif args.finetune_adapter.lower() == "qlora": + self.lora_r = 4 + self.lora_alpha = 4 + else: + raise ValueError(f"Adapter type {self.finetune_adapter} not recognized") + self.lora_dropout = args.lora_dropout + self.lora_target_modules = args.lora_target_modules + + + # Train args + self.local_batch_size = args.proposer_train_batch_size + self.local_micro_batch_size = args.proposer_train_micro_batch_size + self.local_num_epochs = args.proposer_num_epochs + self.local_learning_rate = args.proposer_learning_rate + self.local_val_set_size = args.proposer_val_set_size + self.voter_val_set_size = args.voter_val_set_size + self.local_save_steps = args.proposer_save_steps + self.cutoff_len = args.cutoff_len + self.group_by_length = args.proposer_train_group_by_length + self.optim = args.proposer_train_optimizer + self.lr_scheduler_type = args.proposer_train_lr_scheduler_type + self.warmup_steps = args.proposer_train_warmup_steps + self.weight_decay = args.proposer_train_weight_decay + self.block_size = args.proposer_train_block_size + + + # Tracking args + self.output_dir = args.finetune_adapter_checkpoint_save_dir + self.gradient_checkpointing = args.proposer_train_gradient_checkpointing + self.logging_steps = args.proposer_train_logging_steps + self.report_to = args.report_to + self.save_total_limit = args.save_total_limit + + logger.debug( + f"FLockLLM finetuning using LoRA with params:\n" + f"global_model: {self.global_model_path}\n" + 
f"output_dir: {self.output_dir}\n" + f"local_batch_size: {self.local_batch_size}\n" + f"local_micro_batch_size: {self.local_micro_batch_size}\n" + f"local_num_epochs: {self.local_num_epochs}\n" + f"local_learning_rate: {self.local_learning_rate}\n" + f"local_val_set_size: {self.local_val_set_size}\n" + f"local_save_steps: {self.local_save_steps}\n" + f"cutoff_len: {self.cutoff_len}\n" + f"lora_r: {self.lora_r}\n" + f"lora_alpha: {self.lora_alpha}\n" + f"lora_dropout: {self.lora_dropout}\n" + f"lora_target_modules: {self.lora_target_modules}\n" + f"group_by_length: {self.group_by_length}\n" + f"gradient_checkpointing: {self.gradient_checkpointing or False}\n" + # f"prompt template: {prompt_template_name}\n" + f"logging_steps: {self.logging_steps}\n" + f"optim: {self.optim}\n" + f"lr_scheduler_type: {self.lr_scheduler_type}\n" + f"warmup_steps: {self.warmup_steps}\n" + f"weight_decay: {self.weight_decay}\n" + f"report_to: {self.report_to}\n" + f"save_total_limit: {self.save_total_limit}\n" + f"block_size: {self.block_size}\n" + ) + + if torch.cuda.is_available(): + logger.debug("CUDA is available. 
Here are the device details:") + # 获取CUDA设备数量 + num_devices = torch.cuda.device_count() + logger.debug(f"Number of CUDA devices available: {num_devices}") + + # 遍历每个CUDA设备 + for i in range(num_devices): + logger.debug(f"Device {i}: {torch.cuda.get_device_name(i)}") + # 获取当前设备的详细信息 + device_properties = torch.cuda.get_device_properties(i) + logger.debug(f" Memory Allocation: {device_properties.total_memory / 1e9} GB") + logger.debug(f" CUDA Capability: {device_properties.major}.{device_properties.minor}") + else: + logger.warning("CUDA is not available.") + + self.local_comm_round_idx = 0 + + """ + LLM Settings & Preparation + """ + + self.prompter = get_prompter() + self.gradient_accumulation_steps = self.local_batch_size // self.local_micro_batch_size + + self.tokenizer = AutoTokenizer.from_pretrained(self.global_model_path, + trust_remote_code=False, + use_fast=True) + + pad_token_id = 0 + self.tokenizer.pad_token_id = pad_token_id + self.tokenizer.pad_token = self.tokenizer.convert_ids_to_tokens(pad_token_id) + self.tokenizer.padding_side = "right" # "left" + + """ + Device and DDP setting + """ + # TODO temp solution + if "gemma" in self.model_name: + self.device_map = "cuda:0" + else: + self.device_map = "auto" + + world_size = int(os.environ.get("WORLD_SIZE", 1)) + self.ddp = world_size != 1 + self.global_model = self.get_model() + + if self.ddp: + self.device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)} + self.gradient_accumulation_steps = self.gradient_accumulation_steps // world_size + + """ + Dataset loading + """ + self.local_train_dataset, self.local_eval_dataset = self.init_dataset(self.args.data_path) + + def init_dataset(self, dataset_path: str): + logger.info("\nPreparing the local training and validation dataset") + + local_data = pypi_load_dataset("json", data_files=dataset_path) + + if self.voter_val_set_size > 0: + split_params = { + "test_size": self.voter_val_set_size, + "shuffle": True + } + if hasattr(self.args, 'random_seed'): + 
split_params['seed'] = self.args.random_seed + local_train_val = local_data["train"].train_test_split(**split_params) + + self.local_train_dataset = ( + local_train_val["train"].shuffle().map(self.generate_and_tokenize_prompt) + ) + self.local_eval_dataset = ( + local_train_val["test"].shuffle().map(self.generate_and_tokenize_prompt) + ) + else: + self.local_train_dataset = local_data["train"].shuffle().map(self.generate_and_tokenize_prompt) + self.local_eval_dataset = None + + return self.local_train_dataset, self.local_eval_dataset + + # def tokenize(self, prompt, add_eos_token=True): + # result = self.tokenizer( + # prompt, + # truncation=True, + # max_length=self.cutoff_len, + # padding="max_length", + # return_tensors=None, + # ) + # + # if ( + # result["input_ids"][-1] != self.tokenizer.eos_token_id + # and len(result["input_ids"]) < self.cutoff_len + # and add_eos_token + # ): + # result["input_ids"].append(self.tokenizer.eos_token_id) + # result["attention_mask"].append(1) + # + # result["labels"] = result["input_ids"].copy() + # + # return result + + def generate_and_tokenize_prompt(self, data_point): + full_prompt = self.prompter.generate_prompt( + data_point["instruction"], + data_point["context"], + data_point["response"], + ) + + tokenized_full_prompt = {"text": full_prompt} + + return tokenized_full_prompt + + def get_model(self): + torch_dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16 + + if self.finetune_adapter.lower() == "lora": + model = AutoModelForCausalLM.from_pretrained(self.global_model_path, + load_in_8bit=True, + trust_remote_code=False, + device_map=self.device_map) + model = prepare_model_for_int8_training(model) + elif self.finetune_adapter.lower() == "qlora": + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", # 量化时使用nf4最优,也可以使用fp4 + bnb_4bit_compute_dtype=torch_dtype, + bnb_4bit_use_double_quant=True, # 二次量化 + ) + + config = AutoConfig.from_pretrained(self.global_model_path) 
+ config.use_cache = False + config.gradient_checkpointing = True + + model = AutoModelForCausalLM.from_pretrained(self.global_model_path, + config=config, + quantization_config=bnb_config, + trust_remote_code=False, + torch_dtype=torch_dtype, + device_map=self.device_map) + model = prepare_model_for_kbit_training(model, + use_gradient_checkpointing=True) + else: + raise ValueError(f"Adapter type {self.finetune_adapter} not recognized") + + def find_all_linear_names(model, add_lm_head=True): + cls = bnb.nn.Linear4bit + lora_module_names = set() + for name, module in model.named_modules(): + if isinstance(module, cls): + names = name.split('.') + lora_module_names.add(names[0] if len(names) == 1 else names[-1]) + + if add_lm_head and not "lm_head" in lora_module_names: + lora_module_names.add("lm_head") + + return list(lora_module_names) + + if len(self.lora_target_modules) == 0: + self.lora_target_modules = find_all_linear_names(model) + + self.lora_config = LoraConfig( + r=self.lora_r, + lora_alpha=self.lora_alpha, + target_modules=self.lora_target_modules, + lora_dropout=self.lora_dropout, + bias="none", + task_type="CAUSAL_LM", + # inference_mode=False + ) + + # Inject QLoRA into pre-trained model + + model = get_peft_model(model, self.lora_config) + + if self.ddp and torch.cuda.device_count() > 1: + model.is_parallelizable = True + model.model_parallel = True + + print_number_of_trainable_model_parameters(model) + + return model + + def train(self, parameters) -> bytes: + self.local_comm_round_idx += 1 + + # Load model template with pre-trained weights + model = self.global_model + if parameters is not None: + logger.debug("Loading latest global adapter model parameters to local model...") + set_peft_model_state_dict(model, torch.load(io.BytesIO(parameters)), "default") + + model.train() + + client = GeneralClient(model=model, local_train_dataset=self.local_train_dataset, local_eval_dataset=None, + local_val_set_size=self.local_val_set_size, 
output_dir=self.output_dir) + client.build_local_trainer(model_name=self.model_name, + tokenizer=self.tokenizer, + local_micro_batch_size=self.local_micro_batch_size, + gradient_accumulation_steps=self.gradient_accumulation_steps, + local_num_epochs=self.local_num_epochs, + local_learning_rate=self.local_learning_rate, + group_by_length=self.group_by_length, + logging_steps=self.logging_steps, + optim=self.optim, + lr_scheduler_type=self.lr_scheduler_type, + warmup_steps=self.warmup_steps, + weight_decay=self.weight_decay, + report_to=self.report_to, + save_total_limit=self.save_total_limit, + block_size=self.block_size, + gradient_checkpointing=self.gradient_checkpointing, + ddp=self.ddp, + ) + + logger.info("Initiating the local training...") + client.initiate_local_training() + + logger.info("Local training starts...") + client.train() + + logger.info("Terminating the local training...") + model = client.terminate_local_training(self.local_comm_round_idx) + + logger.info("Wrapping up the local model parameters and sending to voters...") + buffer = io.BytesIO() + torch.save(model.state_dict(), buffer) + return buffer.getvalue() + + def evaluate(self, parameters: bytes) -> float: + model = self.global_model + if parameters is not None: + logger.debug("\nLoading latest global adapter model parameters to local model...") + set_peft_model_state_dict(model, torch.load(io.BytesIO(parameters)), "default") + + eval_args = TrainingArguments( + do_train=False, + do_eval=True, + output_dir=self.output_dir, + ) + + trainer = SFTTrainer( + model=model, + args=eval_args, + eval_dataset=self.local_eval_dataset, + dataset_text_field="text", + max_seq_length=self.block_size, + tokenizer=self.tokenizer, + data_collator=None, + packing=None + ) + + logger.info( + f"Global adapter model evaluation start..." 
+ ) + + eval_result = trainer.evaluate() + + logger.info( + f"Global adapter model loss: {round(eval_result['eval_loss'], 6)}" + ) + + # Using miners for temp + return eval_result['eval_loss'] + + def aggregate(self, parameters_list: list[bytes]) -> bytes: + # Handle DDP alignment problem: relocate the model weights to unified device + device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') + parameters_list = [ + torch.load(io.BytesIO(parameters), map_location=device) for parameters in parameters_list + ] + + logger.info("Aggregating the all local model parameters...") + if self.args.federated_optimizer.lower() == "fedavg": + averaged_params_template = parameters_list[0] + for k in averaged_params_template.keys(): + temp_w = [] + for local_w in parameters_list: + temp_w.append(local_w[k]) + averaged_params_template[k] = sum(temp_w) / torch.tensor(len(temp_w)).to(device) + else: + raise NotImplementedError(f"The federated optimizer ({self.args.federated_optimizer}) is not supported.") + + # Check output dir + target_path = os.path.join(self.output_dir, str(self.local_comm_round_idx)) + test_mkdir(target_path) + + # Save the averaged parameters to the file + global_model_output_path = os.path.join(target_path, "pytorch_local_model_lora.bin") + logger.info(f"Saving the global adapter model parameters to {global_model_output_path}...") + torch.save(averaged_params_template, + global_model_output_path) + self.lora_config.save_pretrained(self.output_dir) + + logger.info("Wrapping up the global adapter model parameters and sending to all Proposers...") + # Create a buffer + buffer = io.BytesIO() + # Save state dict to the buffer + torch.save(averaged_params_template, buffer) + # Get the byte representation + aggregated_parameters = buffer.getvalue() + + return aggregated_parameters diff --git a/examples/flock_llm/README.md b/examples/flock_llm/README.md index 0e9a1d1..e05d6f0 100644 --- a/examples/flock_llm/README.md +++ b/examples/flock_llm/README.md @@ 
-110,3 +110,16 @@ We compared the fine-tuned Vicuna model with the original pre-trained Vicuna mod 
The finetuned model still has similar, or even better, performance for answering general QA.
+ + +# Additional Information (new S3) +put the s3 credentials in the `~/.aws/credentials` file + +```bash +[default] +aws_access_key_id = YOUR_ACCESS_KEY_ID +aws_secret_access_key = YOUR_SECRET_ACCESS_KEY +``` + +## TODO +- [ ] Optimize the model host \ No newline at end of file diff --git a/examples/flock_llm/arguments.py b/examples/flock_llm/arguments.py new file mode 100644 index 0000000..828853c --- /dev/null +++ b/examples/flock_llm/arguments.py @@ -0,0 +1,76 @@ +''' + Ref FedML: https://github.com/FedML-AI/FedML/blob/master/python/fedml/arguments.py +''' + +import argparse +from os import path + +import yaml +from loguru import logger + +def add_args(): + parser = argparse.ArgumentParser(description="FedContinuum") + parser.add_argument( + "--yaml_config_file", + "--conf", + help="configuration file in yaml", + type=str, + default="", + ) + + args, unknown = parser.parse_known_args() + return args + +class Arguments: + + def __init__(self, cmd_args, override_cmd_args=True): + # set the command line arguments + cmd_args_dict = cmd_args.__dict__ + for arg_key, arg_val in cmd_args_dict.items(): + setattr(self, arg_key, arg_val) + + self.get_default_yaml_config(cmd_args) + if not override_cmd_args: + # reload cmd args again + for arg_key, arg_val in cmd_args_dict.items(): + setattr(self, arg_key, arg_val) + + def load_yaml_config(self, yaml_path): + try: + with open(yaml_path, "r") as stream: + try: + return yaml.safe_load(stream) + except yaml.YAMLError as exc: + raise ValueError("Yaml error - check yaml file") + except Exception as e: + logger.error(f"Error loading yaml file: {e}") + return None + + def get_default_yaml_config(self, cmd_args): + if cmd_args.yaml_config_file == "": + path_current_file = path.abspath(path.dirname(__file__)) + raise Exception(f"yaml_config_file is not specified or cannot fined via {path_current_file}") + + self.yaml_paths = [cmd_args.yaml_config_file] + # Load all arguments from yaml config + # 
https://www.cloudbees.com/blog/yaml-tutorial-everything-you-need-get-started + configuration = self.load_yaml_config(cmd_args.yaml_config_file) + + # Override class attributes from current yaml config + if configuration is not None: + self.set_attr_from_config(configuration) + + return configuration + + def set_attr_from_config(self, configuration): + for _, param_family in configuration.items(): + for key, val in param_family.items(): + setattr(self, key, val) + + +def load_arguments(): + cmd_args = add_args() + # Load all arguments from YAML config file + args = Arguments(cmd_args) + + return args \ No newline at end of file diff --git a/examples/flock_llm/build_and_upload.sh b/examples/flock_llm/build_and_upload.sh deleted file mode 100755 index f67b057..0000000 --- a/examples/flock_llm/build_and_upload.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash -set -e - -IMAGE_TAG="flock_model" -OUTPUT_FILE=`mktemp` -echo "Building the model image." -docker build -t $IMAGE_TAG . - -echo "Saving the docker image to a file and compressing it. It may take a while.." -time (docker save $IMAGE_TAG | xz -T 0 > $OUTPUT_FILE) - -echo "Uploading the compressed image to IPFS.." -json=`curl -F "file=@$OUTPUT_FILE" ipfs.flock.io/api/v0/add` - -# Uncomment if you'd like to upload to your local IPFS -#json=`curl -F "file=@$OUTPUT_FILE" 127.0.0.1:5001/api/v0/add` - -hash=`echo $json | grep -o '"Hash":"[^"]*' | grep -o '[^"]*$'` -rm $OUTPUT_FILE -echo "Model definition IPFS hash: $hash" diff --git a/examples/flock_llm/build_and_upload_IPFS.sh b/examples/flock_llm/build_and_upload_IPFS.sh new file mode 100755 index 0000000..634de8a --- /dev/null +++ b/examples/flock_llm/build_and_upload_IPFS.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +OUTPUT_FILE=`mktemp` + +time (tar -czf $OUTPUT_FILE .) + +echo "Uploading the compressed archive to IPFS.." 
+# json=`curl -F "file=@$OUTPUT_FILE" ipfs.flock.io/api/v0/add` + +# Uncomment if you'd like to upload to your local IPFS +json=`curl -F "file=@$OUTPUT_FILE" 127.0.0.1:5001/api/v0/add` + +echo "Extracting IpfsHash.." +hash=`echo $json | grep -o '"Hash":"[^"]*' | grep -o '[^"]*$'` +rm $OUTPUT_FILE +echo "Model definition IPFS hash: $hash" diff --git a/examples/flock_llm/build_and_upload_S3.py b/examples/flock_llm/build_and_upload_S3.py new file mode 100644 index 0000000..c1e71e0 --- /dev/null +++ b/examples/flock_llm/build_and_upload_S3.py @@ -0,0 +1,50 @@ +import os +import tempfile + +from loguru import logger +from s3_storage_manager import S3StorageManager, S3_MODEL_IMAGES_BUCKET +from utils.file_operations import compress_directory, get_file_size + +if __name__ == "__main__": + + s3_storage_manager = S3StorageManager(images_bucket_name=S3_MODEL_IMAGES_BUCKET) + + # Create a temp file name + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + temp_file_path = temp_file.name + target_file_path = None + + try: + logger.info("Compressing files..") + compress_directory(temp_file_path) + + logger.info("Generating file name based on hash..") + upload_file_name_in_hash = s3_storage_manager.generate_file_name_in_hash(temp_file_path) + + # Generate the target file path + target_file_path = os.path.join(os.path.dirname(temp_file_path), upload_file_name_in_hash) + + if os.path.exists(target_file_path): + os.remove(target_file_path) + + # Rename the temp file to the target file path + os.rename(temp_file_path, target_file_path) + logger.info(f"File {temp_file_path} renamed to: {target_file_path}") + + # Print the size of the compressed file + file_size_str = get_file_size(target_file_path) + logger.info(f"The size of the compressed file is: {file_size_str}") + + logger.info("Uploading the file to S3..") + s3_storage_manager.upload_file(target_file_path, upload_file_name_in_hash) + + logger.warning(f"Please keep this hash code for the FLock client to download the 
model, hash: {upload_file_name_in_hash}") + + except Exception as e: + logger.error(f"An error occurred: {e}") + finally: + # Clean up the temp file + if os.path.exists(temp_file_path): + os.remove(temp_file_path) + if target_file_path is not None and os.path.exists(target_file_path) and temp_file_path != target_file_path: + os.remove(target_file_path) diff --git a/examples/flock_llm/fl_libs/client.py b/examples/flock_llm/client.py similarity index 65% rename from examples/flock_llm/fl_libs/client.py rename to examples/flock_llm/client.py index 6972927..76cf042 100644 --- a/examples/flock_llm/fl_libs/client.py +++ b/examples/flock_llm/client.py @@ -12,12 +12,13 @@ from collections import OrderedDict import torch -import transformers from peft import get_peft_model_state_dict +from trl import SFTTrainer +from transformers import TrainingArguments +import transformers class GeneralClient: def __init__(self, - client_id, model, local_train_dataset, local_eval_dataset, @@ -25,8 +26,6 @@ def __init__(self, output_dir, model_eval_steps=40, model_save_steps=40): - self.client_id = client_id - self.model = model self.output_dir = output_dir self.local_output_dir = os.path.join(self.output_dir, "local_trainer_saved", "local_output") @@ -38,42 +37,65 @@ def __init__(self, self.model_save_steps = model_save_steps def build_local_trainer(self, + model_name, tokenizer, local_micro_batch_size, gradient_accumulation_steps, local_num_epochs, local_learning_rate, + logging_steps, + optim, + lr_scheduler_type, + warmup_steps, + weight_decay, + report_to, + save_total_limit, + block_size, + gradient_checkpointing, group_by_length, ddp): - self.train_args = transformers.TrainingArguments( - per_device_train_batch_size=local_micro_batch_size, - gradient_accumulation_steps=gradient_accumulation_steps, - warmup_steps=0, - num_train_epochs=local_num_epochs, - learning_rate=local_learning_rate, - fp16=True, - logging_steps=1, - optim="adamw_torch", + self.train_args = TrainingArguments( + 
do_train=True, + do_eval=True, + output_dir=self.local_output_dir, + dataloader_drop_last=False, # Ori: True evaluation_strategy="steps" if self.local_val_set_size > 0 else "no", save_strategy="steps", + logging_strategy="steps", + num_train_epochs=local_num_epochs, eval_steps=self.model_eval_steps if self.local_val_set_size > 0 else None, save_steps=self.model_save_steps, - output_dir=self.local_output_dir, - save_total_limit=1, + logging_steps=logging_steps, + per_device_train_batch_size=local_micro_batch_size, + per_device_eval_batch_size=local_micro_batch_size * 2, + optim=optim, + learning_rate=local_learning_rate, + lr_scheduler_type=lr_scheduler_type, + warmup_steps=warmup_steps, + gradient_accumulation_steps=gradient_accumulation_steps, + gradient_checkpointing=gradient_checkpointing, + weight_decay=weight_decay, + report_to=report_to, load_best_model_at_end=True if self.local_val_set_size > 0 else False, + save_total_limit=save_total_limit, + bf16=True if torch.cuda.is_bf16_supported() else False, + fp16=False if torch.cuda.is_bf16_supported() else True, ddp_find_unused_parameters=False if ddp else None, group_by_length=group_by_length, - dataloader_drop_last=False ) - self.local_trainer = transformers.Trainer(model=self.model, - train_dataset=self.local_train_dataset, - eval_dataset=self.local_eval_dataset, - args=self.train_args, - data_collator=transformers.DataCollatorForSeq2Seq( - tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True - ), - ) + + self.local_trainer = SFTTrainer( + model=self.model, + args=self.train_args, + train_dataset=self.local_train_dataset, + eval_dataset=self.local_eval_dataset, + dataset_text_field="text", + max_seq_length=block_size, + tokenizer=tokenizer, + data_collator=None, + packing=None, + ) def initiate_local_training(self): self.model.config.use_cache = False @@ -91,7 +113,7 @@ def train(self): def terminate_local_training(self, local_comm_round_idx): new_adapter_weight = self.model.state_dict() - 
single_output_dir = os.path.join(self.output_dir, str(local_comm_round_idx), "local_output_{}".format(self.client_id)) + single_output_dir = os.path.join(self.output_dir, str(local_comm_round_idx), "local_output") os.makedirs(single_output_dir, exist_ok=True) torch.save(new_adapter_weight, single_output_dir + "/pytorch_local_model_lora.bin") diff --git a/examples/flock_llm/config.yaml b/examples/flock_llm/config.yaml new file mode 100644 index 0000000..7e55f3c --- /dev/null +++ b/examples/flock_llm/config.yaml @@ -0,0 +1,42 @@ +common_args: + project_name: "FLockLLM_finetune" + mode: "experiment" # Options: 1. experiment 2. deployment + random_seed: 1993 # Note: only works for experiment mode + +data_args: + data_path: "/dataset.json" + +model_args: + foundation_model: "google/gemma-2b" # google/gemma-2b, mistralai/Mistral-7B-v0.1, lmsys/vicuna-7b-v1.5 + foundation_model_pre_trained_weights_source: "flock_s3" # Options: "huggingface", "flock_s3" Defaults: "huggingface" + finetune_adapter: "qlora" + lora_r: 4 + lora_alpha: 4 + lora_dropout: 0.05 + lora_target_modules: [] # Options: "q_proj","k_proj","v_proj","o_proj" Defaults: [] (let system auto search) + +train_args: + proposer_train_batch_size: 32 + proposer_train_micro_batch_size: 8 + proposer_num_epochs: 1 + proposer_learning_rate: 0.0003 + proposer_val_set_size: 0 + proposer_save_steps: 3 + cutoff_len: 512 + proposer_train_group_by_length: false + proposer_train_optimizer: "paged_adamw_8bit" + proposer_train_lr_scheduler_type: "constant" + proposer_train_warmup_steps: 1 + proposer_train_weight_decay: 0.05 + proposer_train_block_size: 8 + federated_optimizer: "fedavg" + +evaluation_args: + voter_val_set_size: 5 + +tracking_args: + finetune_adapter_checkpoint_save_dir: "output/checkpoints/gemma-2b" + proposer_train_gradient_checkpointing: true + proposer_train_logging_steps: 10 + report_to: "wandb" + save_total_limit: 3 \ No newline at end of file diff --git a/examples/flock_llm/configs/README.md 
b/examples/flock_llm/configs/README.md new file mode 100644 index 0000000..2556429 --- /dev/null +++ b/examples/flock_llm/configs/README.md @@ -0,0 +1,46 @@ +