diff --git a/Finetune4bConfig.py b/Finetune4bConfig.py
new file mode 100644
index 0000000..62f8091
--- /dev/null
+++ b/Finetune4bConfig.py
@@ -0,0 +1,69 @@
+class Finetune4bConfig:
+    """Config holder for LLaMA 4bit finetuning
+    """
+    def __init__(self, dataset: str, ds_type: str,
+                 lora_out_dir: str, lora_apply_dir: str,
+                 llama_q4_config_dir: str, llama_q4_model: str,
+                 mbatch_size: int, batch_size: int,
+                 epochs: int, lr: float,
+                 cutoff_len: int,
+                 lora_r: int, lora_alpha: int, lora_dropout: float,
+                 val_set_size: float,
+                 warmup_steps: int, save_steps: int, save_total_limit: int, logging_steps: int,
+                 checkpoint: bool, skip: bool
+                 ):
+        """
+        Args:
+            dataset (str): Path to dataset file
+            ds_type (str): Dataset structure format
+            lora_out_dir (str): Directory to place the new LoRA in
+            lora_apply_dir (str): Path to a directory containing a LoRA to apply before training
+            llama_q4_config_dir (str): Path to the directory with config.json, tokenizer_config.json, etc.
+            llama_q4_model (str): Path to the quantized model in Hugging Face format
+            mbatch_size (int): Micro-batch size
+            batch_size (int): Batch size
+            epochs (int): Epochs
+            lr (float): Learning rate
+            cutoff_len (int): Cutoff length
+            lora_r (int): LoRA R
+            lora_alpha (int): LoRA Alpha
+            lora_dropout (float): LoRA Dropout
+            val_set_size (float): Validation set size: a fraction if <= 1.0, else an absolute sample count
+            warmup_steps (int): Warmup steps before training
+            save_steps (int): Save steps
+            save_total_limit (int): Save total limit
+            logging_steps (int): Logging steps
+            checkpoint (bool): Produce checkpoint instead of LoRA
+            skip (bool): Don't train model
+        """
+        self.dataset = dataset
+        self.ds_type = ds_type
+        self.lora_out_dir = lora_out_dir
+        self.lora_apply_dir = lora_apply_dir
+        self.llama_q4_config_dir = llama_q4_config_dir
+        self.llama_q4_model = llama_q4_model
+        self.mbatch_size = mbatch_size
+        self.batch_size = batch_size
+        self.gradient_accumulation_steps = self.batch_size // self.mbatch_size
+        self.epochs = epochs
+        self.lr = lr
+        self.cutoff_len = cutoff_len
+        self.lora_r = lora_r
+        self.lora_alpha = lora_alpha
+        self.lora_dropout = lora_dropout
+        self.val_set_size = int(val_set_size) if val_set_size > 1.0 else float(val_set_size)
+        self.warmup_steps = warmup_steps
+        self.save_steps = save_steps
+        self.save_total_limit = save_total_limit
+        self.logging_steps = logging_steps
+        self.checkpoint = checkpoint
+        self.skip = skip
+
+    def __str__(self) -> str:
+        return f"\nParameters:\n{'config':-^20}\n{self.dataset=}\n{self.ds_type=}\n{self.lora_out_dir=}\n{self.lora_apply_dir=}\n{self.llama_q4_config_dir=}\n{self.llama_q4_model=}\n\n" +\
+            f"{'training':-^20}\n" +\
+            f"{self.mbatch_size=}\n{self.batch_size=}\n{self.gradient_accumulation_steps=}\n{self.epochs=}\n{self.lr=}\n{self.cutoff_len=}\n" +\
+            f"{self.lora_r=}\n{self.lora_alpha=}\n{self.lora_dropout=}\n{self.val_set_size=}\n{self.warmup_steps=}\n{self.save_steps=}\n{self.save_total_limit=}\n" +\
+            f"{self.logging_steps=}\n" +\
+            f"{self.checkpoint=}\n{self.skip=}"
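For reference, a minimal usage sketch of this config class (the values are hypothetical, mirroring the CLI defaults from arg_parser.py below); the only derived field is gradient_accumulation_steps = batch_size // mbatch_size:

```python
# Minimal sketch, assuming Finetune4bConfig.py is importable; values match the CLI defaults.
from Finetune4bConfig import Finetune4bConfig

cfg = Finetune4bConfig(
    dataset="./dataset.json", ds_type="alpaca",
    lora_out_dir="alpaca_lora", lora_apply_dir=None,
    llama_q4_config_dir="./llama-13b-4bit/", llama_q4_model="./llama-13b-4bit.pt",
    mbatch_size=1, batch_size=2, epochs=3, lr=2e-4, cutoff_len=256,
    lora_r=8, lora_alpha=16, lora_dropout=0.05, val_set_size=0.2,
    warmup_steps=50, save_steps=50, save_total_limit=3, logging_steps=10,
    checkpoint=False, skip=False,
)
assert cfg.gradient_accumulation_steps == 2  # batch_size // mbatch_size
print(cfg)                                   # __str__ dumps all parameters
```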
Default: %(default)s" + ) + + parser_config = parser.add_argument_group("config") + parser_training = parser.add_argument_group("training") + + # Config args group + parser_config.add_argument("--ds_type", choices=["txt", "alpaca"], default="alpaca", required=False, + help="Dataset structure format. Default: %(default)s" + ) + parser_config.add_argument("--lora_out_dir", default="alpaca_lora", required=False, + help="Directory to place new LoRA. Default: %(default)s" + ) + parser_config.add_argument("--lora_apply_dir", default=None, required=False, + help="Path to directory from which LoRA has to be applied before training. Default: %(default)s" + ) + parser_config.add_argument("--llama_q4_config_dir", default="./llama-13b-4bit/", required=False, + help="Path to the config.json, tokenizer_config.json, etc. Default: %(default)s" + ) + parser_config.add_argument("--llama_q4_model", default="./llama-13b-4bit.pt", required=False, + help="Path to the quantized model in huggingface format. Default: %(default)s" + ) + + # Training args group + parser_training.add_argument("--mbatch_size", default=1, type=int, help="Micro-batch size. Default: %(default)s") + parser_training.add_argument("--batch_size", default=2, type=int, help="Batch size. Default: %(default)s") + parser_training.add_argument("--epochs", default=3, type=int, help="Epochs. Default: %(default)s") + parser_training.add_argument("--lr", default=2e-4, type=float, help="Learning rate. Default: %(default)s") + parser_training.add_argument("--cutoff_len", default=256, type=int, help="Default: %(default)s") + parser_training.add_argument("--lora_r", default=8, type=int, help="Default: %(default)s") + parser_training.add_argument("--lora_alpha", default=16, type=int, help="Default: %(default)s") + parser_training.add_argument("--lora_dropout", default=0.05, type=float, help="Default: %(default)s") + parser_training.add_argument("--val_set_size", default=0.2, type=float, help="Validation set size. Default: %(default)s") + parser_training.add_argument("--warmup_steps", default=50, type=int, help="Default: %(default)s") + parser_training.add_argument("--save_steps", default=50, type=int, help="Default: %(default)s") + parser_training.add_argument("--save_total_limit", default=3, type=int, help="Default: %(default)s") + parser_training.add_argument("--logging_steps", default=10, type=int, help="Default: %(default)s") + parser_training.add_argument("-c", "--checkpoint", action="store_true", help="Produce checkpoint instead of LoRA. Default: %(default)s") + parser_training.add_argument("--skip", action="store_true", help="Don't train model. Can be useful to produce checkpoint from existing LoRA. 
Default: %(default)s") + + return vars(parser.parse_args()) + + +def get_config() -> Finetune4bConfig: + args = parse_commandline() + return Finetune4bConfig( + dataset=args["dataset"], + ds_type=args["ds_type"], + lora_out_dir=args["lora_out_dir"], + lora_apply_dir=args["lora_apply_dir"], + llama_q4_config_dir=args["llama_q4_config_dir"], + llama_q4_model=args["llama_q4_model"], + mbatch_size=args["mbatch_size"], + batch_size=args["batch_size"], + epochs=args["epochs"], + lr=args["lr"], + cutoff_len=args["cutoff_len"], + lora_r=args["lora_r"], + lora_alpha=args["lora_alpha"], + lora_dropout=args["lora_dropout"], + val_set_size=args["val_set_size"], + warmup_steps=args["warmup_steps"], + save_steps=args["save_steps"], + save_total_limit=args["save_total_limit"], + logging_steps=args["logging_steps"], + checkpoint=args["checkpoint"], + skip=args["skip"] + ) diff --git a/finetune.py b/finetune.py index 7047a5c..b8809f6 100644 --- a/finetune.py +++ b/finetune.py @@ -1,144 +1,127 @@ -import os -import sys -sys.path.insert(0, './repository/transformers/src') -sys.path.insert(0, './repository/GPTQ-for-LLaMa') -sys.path.insert(0, './repository/peft/src') - -import peft -import peft.tuners.lora -assert peft.tuners.lora.is_gptq_available() - -import time -import torch -import transformers -from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer -import accelerate -from modelutils import find_layers -from autograd_4bit import make_quant_for_4bit_autograd -from autograd_4bit import load_llama_model_4bit_low_ram -from datasets import load_dataset, Dataset -import json -from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, get_peft_model_state_dict, PeftModel - - -# Parameters -DATA_PATH = "./data.txt" -OUTPUT_DIR = "alpaca_lora" -lora_path_old = '' -config_path = './llama-13b-4bit/' -model_path = './llama-13b-4bit.pt' - -MICRO_BATCH_SIZE = 1 -BATCH_SIZE = 2 -GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE -EPOCHS = 3 -LEARNING_RATE = 2e-4 -CUTOFF_LEN = 256 -LORA_R = 8 -LORA_ALPHA = 16 -LORA_DROPOUT = 0.05 -VAL_SET_SIZE = 0 -TARGET_MODULES = [ - "q_proj", - "v_proj", -] -warmup_steps = 50 -save_steps = 50 -save_total_limit = 3 -logging_steps = 10 - -# Load Basic Model -model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path) - -# Config Lora -config = LoraConfig( - r=LORA_R, - lora_alpha=LORA_ALPHA, - target_modules=["q_proj", "v_proj"], - lora_dropout=LORA_DROPOUT, - bias="none", - task_type="CAUSAL_LM", -) -if lora_path_old == '': - model = get_peft_model(model, config) -else: - model = PeftModel.from_pretrained(model, lora_path_old) - print(lora_path_old, 'loaded') - -# Scales to half -print('Fitting 4bit scales and zeros to half') -for n, m in model.named_modules(): - if '4bit' in str(type(m)): - m.zeros = m.zeros.half() - m.scales = m.scales.half() - -# Set tokenizer -tokenizer.pad_token_id = 0 - -# Load Data -with open(DATA_PATH, 'r', encoding='utf8') as file: - txt = file.read() -txt = txt.replace('\r\n', '\n') -rows = [r for r in txt.split('\n') if r != ''] -data = Dataset.from_dict({"input": rows}) -exceed_count = 0 -def tokenize(prompt): - # there's probably a way to do this with the tokenizer settings - # but again, gotta move fast - global exceed_count - prompt = prompt['input'] - result = tokenizer( - prompt, - truncation=True, - max_length=CUTOFF_LEN + 1, - padding="max_length", - ) - d = { - "input_ids": result["input_ids"][:-1], - "attention_mask": result["attention_mask"][:-1], - } - if sum(d['attention_mask']) >= 
+"""
+    llama-4b trainer with support of Stanford Alpaca-like JSON datasets (short for SAD)
+    Intended for use with https://github.com/johnsmith0031/alpaca_lora_4bit
+
+    SAD structure:
+    [
+        {
+            "instruction": "Give null hypothesis",
+            "input": "6 subjects were given a drug (treatment group) and an additional 6 subjects a placebo (control group).",
+            "output": "Drug is equivalent of placebo"
+        },
+        {
+            "instruction": "What does RNA stand for?",
+            "input": "",
+            "output": "RNA stands for ribonucleic acid."
+        }
+    ]
+"""
+
+import sys
+sys.path.insert(0, './repository/transformers/src')
+sys.path.insert(0, './repository/GPTQ-for-LLaMa')
+sys.path.insert(0, './repository/peft/src')
+
+import peft
+import peft.tuners.lora
+assert peft.tuners.lora.is_gptq_available()
+
+import torch
+import transformers
+from autograd_4bit import load_llama_model_4bit_low_ram
+from peft import LoraConfig, get_peft_model, get_peft_model_state_dict, PeftModel
+
+# ! Config
+from arg_parser import get_config
+import train_data
+
+ft_config = get_config()
+
+# * Show loaded parameters
+print(f"{ft_config}\n")
+
+
+# Load Basic Model
+model, tokenizer = load_llama_model_4bit_low_ram(ft_config.llama_q4_config_dir, ft_config.llama_q4_model)
+
+# Config Lora
+lora_config = LoraConfig(
+    r=ft_config.lora_r,
+    lora_alpha=ft_config.lora_alpha,
+    target_modules=["q_proj", "v_proj"],
+    lora_dropout=ft_config.lora_dropout,
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+if ft_config.lora_apply_dir is None:
+    model = get_peft_model(model, lora_config)
+else:
+    model = PeftModel.from_pretrained(model, ft_config.lora_apply_dir, device_map={'': 0}, torch_dtype=torch.float32)  # ! Direct copy from inference.py
+    print(ft_config.lora_apply_dir, 'loaded')
+
+# Scales to half
+print('Fitting 4bit scales and zeros to half')
+for n, m in model.named_modules():
+    if '4bit' in str(type(m)):
+        m.zeros = m.zeros.half()
+        m.scales = m.scales.half()
+
+# Set tokenizer
+tokenizer.pad_token_id = 0
+
+if not ft_config.skip:
+    # ! TODO: Refactor to load both SAD and LLaMA datasets
+    # Load Data
+    data = None
+    match ft_config.ds_type:
+        case "txt":
+            #### LLaMA
+            data = train_data.TrainTxt(ft_config.dataset, ft_config.val_set_size, tokenizer, ft_config.cutoff_len)
+        case "alpaca":
+            #### Stanford Alpaca-like Data
+            data = train_data.TrainSAD(ft_config.dataset, ft_config.val_set_size, tokenizer, ft_config.cutoff_len)
+        case _:
+            raise NotImplementedError("ERROR: Unknown dataset format")
+    data.prepare_data()
+    ####
+
+    trainer = transformers.Trainer(
+        model=model,
+        train_dataset=data.train_data,
+        eval_dataset=data.val_data,
+        args=transformers.TrainingArguments(
+            per_device_train_batch_size=ft_config.mbatch_size,
+            gradient_accumulation_steps=ft_config.gradient_accumulation_steps,
+            warmup_steps=ft_config.warmup_steps,
+            num_train_epochs=ft_config.epochs,
+            learning_rate=ft_config.lr,
+            fp16=True,
+            logging_steps=ft_config.logging_steps,
+            evaluation_strategy="no",
+            save_strategy="steps",
+            eval_steps=None,
+            save_steps=ft_config.save_steps,
+            output_dir=ft_config.lora_out_dir,
+            save_total_limit=ft_config.save_total_limit,
+            load_best_model_at_end=False
+        ),
+        data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
+    )
+    model.config.use_cache = False
+
+    # Set Model dict so checkpoints contain only the LoRA weights
+    old_state_dict = model.state_dict
+    model.state_dict = (
+        lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
+    ).__get__(model, type(model))
+
+    # Run Trainer (kept inside the guard: with --skip, `trainer` is never created)
+    trainer.train()
+
+    print('Train completed.')
+
+# Save Model
+model.save_pretrained(ft_config.lora_out_dir)
+
+print('Model Saved.')
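The state_dict override above is what keeps checkpoints small: get_peft_model_state_dict filters the full state dict down to just the LoRA adapter tensors, so Trainer checkpoints and the final save_pretrained write only the adapter. A rough illustration (a sketch, assuming a PEFT-wrapped model as built earlier and run before the override is applied):

```python
# Illustration only: compare the full state dict with the LoRA-only one.
from peft import get_peft_model_state_dict

full_sd = model.state_dict()                         # base + adapter tensors
lora_sd = get_peft_model_state_dict(model, full_sd)  # adapter tensors only
print(f"{len(full_sd)} tensors in total, {len(lora_sd)} end up in the saved LoRA")
```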
diff --git a/train_data.py b/train_data.py
new file mode 100644
index 0000000..d2d30a8
--- /dev/null
+++ b/train_data.py
@@ -0,0 +1,126 @@
+from abc import ABC, abstractmethod
+from typing import Dict, Any
+from datasets import load_dataset, Dataset
+
+
+# Abstract train data loader
+class ATrainData(ABC):
+    """Abstract base class for train data loaders
+    """
+    @abstractmethod
+    def __init__(self, dataset: str, val_set_size, tokenizer, cutoff_len: int) -> None:
+        """
+        Args:
+            dataset (str): Path to dataset
+            val_set_size (int | float): Size of validation set (fraction if <= 1.0, else absolute count)
+            tokenizer (_type_): Tokenizer
+            cutoff_len (int): Cutoff length for tokenization
+        """
+        self.tokenizer = tokenizer
+        self.dataset = dataset
+        self.val_set_size = val_set_size
+        self.cutoff_len = cutoff_len
+        self.train_data = None
+        self.val_data = None
+
+    @abstractmethod
+    def tokenize(self, prompt: str) -> Dict[str, Any]:
+        """Tokenization method
+
+        Args:
+            prompt (str): Prompt string from dataset
+
+        Returns:
+            Dict[str, Any]: Tokenized prompt (input_ids and attention_mask)
+        """
+        pass
+
+    @abstractmethod
+    def prepare_data(self) -> None:
+        """Loads dataset from file and prepares train_data property for trainer
+        """
+        pass
+
+
+# LLaMA txt train data loader
+class TrainTxt(ATrainData):
+    def __init__(self, dataset: str, val_set_size, tokenizer, cutoff_len):
+        super().__init__(dataset, val_set_size, tokenizer, cutoff_len)  # TODO: Validation size isn't used
+        self.exceed_count = 0
+
+    def tokenize(self, prompt: Dict[str, Any]) -> Dict[str, Any]:
+        # Receives a dataset row ({"input": ...}), not a bare string.
+        # there's probably a way to do this with the tokenizer settings
+        # but again, gotta move fast
+        prompt = prompt['input']
+        result = self.tokenizer(
+            prompt,
+            truncation=True,
+            max_length=self.cutoff_len + 1,
+            padding="max_length",
+        )
+        d = {
+            "input_ids": result["input_ids"][:-1],
+            "attention_mask": result["attention_mask"][:-1],
+        }
+        if sum(d['attention_mask']) >= self.cutoff_len:
+            self.exceed_count += 1
+        return d
+
+    def prepare_data(self):
+        with open(self.dataset, 'r', encoding='utf8') as file:
+            txt = file.read()
+        txt = txt.replace('\r\n', '\n')
+        rows = [r for r in txt.split('\n') if r != '']
+        data = Dataset.from_dict({"input": rows})
+        data = data.shuffle().map(lambda x: self.tokenize(x))
+        print('Train data: {:.2f}% of rows reach cutoff_len'.format(self.exceed_count / len(data) * 100))
+        self.train_data = data
+
+
+# Stanford Alpaca-like Data
+class TrainSAD(ATrainData):
+    def __init__(self, dataset: str, val_set_size, tokenizer, cutoff_len) -> None:
+        super().__init__(dataset, val_set_size, tokenizer, cutoff_len)
+
+    def tokenize(self, prompt: str) -> Dict[str, Any]:
+        # there's probably a way to do this with the tokenizer settings
+        # but again, gotta move fast
+        result = self.tokenizer(
+            prompt,
+            truncation=True,
+            max_length=self.cutoff_len + 1,
+            padding="max_length",
+        )
+        return {
+            "input_ids": result["input_ids"][:-1],
+            "attention_mask": result["attention_mask"][:-1],
+        }
+
+    def prepare_data(self) -> None:
+        data = load_dataset("json", data_files=self.dataset)
+
+        if self.val_set_size > 0:
+            train_val = data["train"].train_test_split(
+                test_size=self.val_set_size, shuffle=True, seed=42  # ! Seed = 42 (?)
+            )
+            self.train_data = train_val["train"].shuffle().map(self.generate_and_tokenize_prompt)
+            self.val_data = train_val["test"].shuffle().map(self.generate_and_tokenize_prompt)
+        else:
+            self.train_data = data["train"].shuffle().map(self.generate_and_tokenize_prompt)
+            self.val_data = None
+
+    # Auxiliary methods
+    def generate_prompt(self, data_point):
+        return "{0}\n\n{1}\n{2}\n\n{3}\n{4}\n\n{5}\n{6}".format(
+            "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.",
+            "### Instruction:",
+            data_point["instruction"],
+            "### Input:",
+            data_point["input"],
+            "### Response:",
+            data_point["output"]
+        )
+
+    def generate_and_tokenize_prompt(self, data_point):
+        prompt = self.generate_prompt(data_point)
+        return self.tokenize(prompt)
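To preview what the Alpaca-style template in generate_prompt produces, one can feed it the sample record from finetune.py's docstring (a sketch; generate_prompt never touches the tokenizer, so passing None is safe here):

```python
# Preview the prompt template on a known-good SAD record.
from train_data import TrainSAD

sample = {
    "instruction": "What does RNA stand for?",
    "input": "",
    "output": "RNA stands for ribonucleic acid.",
}
loader = TrainSAD("./dataset.json", val_set_size=0.2, tokenizer=None, cutoff_len=256)
print(loader.generate_prompt(sample))
# -> "Below is an instruction that describes a task, ..." followed by the
#    "### Instruction:", "### Input:" and "### Response:" sections.
```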