From 2bc64597aad2a2bcc7122986fc5457b27418a7b1 Mon Sep 17 00:00:00 2001
From: kooshi <1934337+kooshi@users.noreply.github.com>
Date: Fri, 24 Mar 2023 23:03:43 -0500
Subject: [PATCH 1/2] model parallelism

---
 GPTQ-for-LLaMa/autograd_4bit.py | 11 +++++++++--
 finetune.py                     |  5 +++++
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/GPTQ-for-LLaMa/autograd_4bit.py b/GPTQ-for-LLaMa/autograd_4bit.py
index c55b7d3..c5c5489 100644
--- a/GPTQ-for-LLaMa/autograd_4bit.py
+++ b/GPTQ-for-LLaMa/autograd_4bit.py
@@ -15,6 +15,8 @@ auto_switch_thd = 16
 def get_buffer(shape_of_qweight, dtype=torch.float16, device='cuda'):
     if shape_of_qweight not in buffer_mat_dic.keys():
         buffer_mat_dic[shape_of_qweight] = torch.zeros((shape_of_qweight[0] * 8, shape_of_qweight[1]), dtype=dtype, device=device)
+    elif buffer_mat_dic[shape_of_qweight].device != device:
+        buffer_mat_dic[shape_of_qweight] = buffer_mat_dic[shape_of_qweight].to(device)
     return buffer_mat_dic[shape_of_qweight]
 
 
@@ -217,8 +219,13 @@ def load_llama_model_4bit_low_ram(config_path, model_path, half=False):
         if name in layers:
             del layers[name]
     make_quant_for_4bit_autograd(model, layers)
-    model = accelerate.load_checkpoint_and_dispatch(model=model, checkpoint=model_path, device_map='auto')
-    model.cuda()
+    model = accelerate.load_checkpoint_and_dispatch(
+        model=model,
+        checkpoint=model_path,
+        device_map='auto',
+        no_split_module_classes=["LlamaDecoderLayer"]
+    )
+
     model.seqlen = 2048
 
     if half:
diff --git a/finetune.py b/finetune.py
index 6224f2e..a259372 100644
--- a/finetune.py
+++ b/finetune.py
@@ -92,6 +92,11 @@ if not ft_config.skip:
         from gradient_checkpointing import apply_gradient_checkpointing
         apply_gradient_checkpointing(model, checkpoint_ratio=ft_config.gradient_checkpointing_ratio)
 
+    # Disable Trainer's DataParallel for multigpu
+    if torch.cuda.device_count() > 1:
+        model.is_parallelizable = True
+        model.model_parallel = True
+
     trainer = transformers.Trainer(
         model=model,
         train_dataset=data.train_data,

From 8e471516b86cee141be6a4ea72e29eff1b2bfb3e Mon Sep 17 00:00:00 2001
From: kooshi <1934337+kooshi@users.noreply.github.com>
Date: Fri, 24 Mar 2023 23:56:06 -0500
Subject: [PATCH 2/2] distributed data parallelism with torchrun

---
 .gitignore                      |  3 +++
 Finetune4bConfig.py             | 10 +++++++++-
 GPTQ-for-LLaMa/autograd_4bit.py |  4 ++--
 finetune.py                     | 10 ++++++----
 4 files changed, 20 insertions(+), 7 deletions(-)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..85c8fed
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+alpaca_lora/
+repository/
+__pycache__/
diff --git a/Finetune4bConfig.py b/Finetune4bConfig.py
index c074028..3514060 100644
--- a/Finetune4bConfig.py
+++ b/Finetune4bConfig.py
@@ -1,3 +1,4 @@
+import os
 class Finetune4bConfig:
     """Config holder for LLaMA 4bit finetuning
     """
@@ -64,6 +65,12 @@ class Finetune4bConfig:
         self.logging_steps = logging_steps
         self.checkpoint = checkpoint
         self.skip = skip
+        self.world_size = int(os.environ.get("WORLD_SIZE", 1))
+        self.local_rank = int(os.environ.get("LOCAL_RANK", 0))
+        self.ddp = self.world_size != 1
+        self.device_map = "auto" if not self.ddp else {"": self.local_rank}
+        if self.ddp:
+            self.gradient_accumulation_steps = self.gradient_accumulation_steps // self.world_size
 
 
     def __str__(self) -> str:
@@ -74,5 +81,6 @@ class Finetune4bConfig:
             f"{self.gradient_checkpointing=}\n{self.gradient_checkpointing_ratio=}\n" +\
             f"{self.warmup_steps=}\n{self.save_steps=}\n{self.save_total_limit=}\n" +\
             f"{self.logging_steps=}\n" +\
-            f"{self.checkpoint=}\n{self.skip=}"
+            f"{self.checkpoint=}\n{self.skip=}\n" +\
+            f"{self.world_size=}\n{self.ddp=}\n{self.device_map=}"
         return s.replace("self.", "")
diff --git a/GPTQ-for-LLaMa/autograd_4bit.py b/GPTQ-for-LLaMa/autograd_4bit.py
index c5c5489..ac90789 100644
--- a/GPTQ-for-LLaMa/autograd_4bit.py
+++ b/GPTQ-for-LLaMa/autograd_4bit.py
@@ -197,7 +197,7 @@ def model_to_float(model):
     print('Converted as Float.')
 
 
-def load_llama_model_4bit_low_ram(config_path, model_path, half=False):
+def load_llama_model_4bit_low_ram(config_path, model_path, half=False, device_map="auto"):
     import transformers
     import accelerate
     from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
@@ -222,7 +222,7 @@ def load_llama_model_4bit_low_ram(config_path, model_path, half=False):
     model = accelerate.load_checkpoint_and_dispatch(
         model=model,
         checkpoint=model_path,
-        device_map='auto',
+        device_map=device_map,
         no_split_module_classes=["LlamaDecoderLayer"]
     )
 
diff --git a/finetune.py b/finetune.py
index a259372..3f8821e 100644
--- a/finetune.py
+++ b/finetune.py
@@ -38,13 +38,14 @@ import train_data
 ft_config = get_config()
 
 # * Show loaded parameters
-print(f"{ft_config}\n")
+if ft_config.local_rank == 0:
+    print(f"{ft_config}\n")
 
 if ft_config.gradient_checkpointing:
     print('Disable Dropout.')
 
 # Load Basic Model
-model, tokenizer = load_llama_model_4bit_low_ram(ft_config.llama_q4_config_dir, ft_config.llama_q4_model)
+model, tokenizer = load_llama_model_4bit_low_ram(ft_config.llama_q4_config_dir, ft_config.llama_q4_model, device_map=ft_config.device_map)
 
 # Config Lora
 lora_config = LoraConfig(
@@ -93,7 +94,7 @@ if not ft_config.skip:
         apply_gradient_checkpointing(model, checkpoint_ratio=ft_config.gradient_checkpointing_ratio)
 
     # Disable Trainer's DataParallel for multigpu
-    if torch.cuda.device_count() > 1:
+    if not ft_config.ddp and torch.cuda.device_count() > 1:
         model.is_parallelizable = True
         model.model_parallel = True
 
@@ -115,7 +116,8 @@ if not ft_config.skip:
             save_steps=ft_config.save_steps,
             output_dir=ft_config.lora_out_dir,
             save_total_limit=ft_config.save_total_limit,
-            load_best_model_at_end=False
+            load_best_model_at_end=False,
+            ddp_find_unused_parameters=False if ft_config.ddp else None,
         ),
         data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
     )