parent 50dbb101e9
commit 397f5041c3

@@ -1,16 +1,18 @@
 class Finetune4bConfig:
     """Config holder for LLaMA 4bit finetuning
     """
-    def __init__(self, dataset : str, ds_type : str,
-                 lora_out_dir : str, lora_apply_dir : str,
-                 llama_q4_config_dir : str, llama_q4_model : str,
-                 mbatch_size : int, batch_size : int,
-                 epochs : int, lr : float,
-                 cutoff_len : int,
-                 lora_r : int, lora_alpha : int, lora_dropout : float,
-                 val_set_size : float,
-                 warmup_steps : int, save_steps : int, save_total_limit : int, logging_steps : int,
-                 checkpoint : bool, skip : bool
+    def __init__(self, dataset: str, ds_type: str,
+                 lora_out_dir: str, lora_apply_dir : str,
+                 llama_q4_config_dir: str, llama_q4_model: str,
+                 mbatch_size: int, batch_size: int,
+                 epochs: int, lr: float,
+                 cutoff_len: int,
+                 lora_r: int, lora_alpha: int, lora_dropout: float,
+                 val_set_size: float,
+                 gradient_checkpointing: bool,
+                 gradient_checkpointing_ratio: float,
+                 warmup_steps: int, save_steps: int, save_total_limit: int, logging_steps: int,
+                 checkpoint: bool, skip: bool
                  ):
         """
         Args:
@@ -28,6 +30,8 @@ class Finetune4bConfig:
             lora_r (int): LoRA R
             lora_alpha (int): LoRA Alpha
             lora_dropout (float): LoRA Dropout
+            gradient_checkpointing (bool) : Use gradient checkpointing
+            gradient_checkpointing_ratio (float) : Gradient checkpoint ratio
             val_set_size (int): Validation set size
             warmup_steps (int): Warmup steps before training
             save_steps (int): Save steps
@@ -50,8 +54,10 @@ class Finetune4bConfig:
         self.cutoff_len = cutoff_len
         self.lora_r = lora_r
         self.lora_alpha = lora_alpha
-        self.lora_dropout = lora_dropout
+        self.lora_dropout = 0 if gradient_checkpointing else lora_dropout  # should be 0 if gradient checkpointing is on
         self.val_set_size = int(val_set_size) if val_set_size > 1.0 else float(val_set_size)
+        self.gradient_checkpointing = gradient_checkpointing
+        self.gradient_checkpointing_ratio = gradient_checkpointing_ratio
         self.warmup_steps = warmup_steps
         self.save_steps = save_steps
         self.save_total_limit = save_total_limit
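
Note: the two coercions in the hunk above are easy to misread. A standalone sketch with hypothetical values (not part of the commit):

    gradient_checkpointing = True
    lora_dropout = 0.05
    val_set_size = 0.2

    lora_dropout = 0 if gradient_checkpointing else lora_dropout
    # -> 0: any --lora_dropout value is discarded while checkpointing is on
    val_set_size = int(val_set_size) if val_set_size > 1.0 else float(val_set_size)
    # -> 0.2 stays a float (a split fraction); a value such as 2000 would be
    #    kept as an int, presumably an absolute number of validation rows
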
@@ -61,9 +67,12 @@ class Finetune4bConfig:
 
 
     def __str__(self) -> str:
-        return f"\nParameters:\n{'config':-^20}\n{self.dataset=}\n{self.ds_type=}\n{self.lora_out_dir=}\n{self.lora_apply_dir=}\n{self.llama_q4_config_dir=}\n{self.llama_q4_model=}\n\n" +\
+        s = f"\nParameters:\n{'config':-^20}\n{self.dataset=}\n{self.ds_type=}\n{self.lora_out_dir=}\n{self.lora_apply_dir=}\n{self.llama_q4_config_dir=}\n{self.llama_q4_model=}\n\n" +\
             f"{'training':-^20}\n" +\
             f"{self.mbatch_size=}\n{self.batch_size=}\n{self.gradient_accumulation_steps=}\n{self.epochs=}\n{self.lr=}\n{self.cutoff_len=}\n" +\
-            f"{self.lora_r=}\n{self.lora_alpha=}\n{self.lora_dropout=}\n{self.val_set_size=}\n{self.warmup_steps=}\n{self.save_steps=}\n{self.save_total_limit=}\n" +\
+            f"{self.lora_r=}\n{self.lora_alpha=}\n{self.lora_dropout=}\n{self.val_set_size=}\n" +\
+            f"{self.gradient_checkpointing=}\n{self.gradient_checkpointing_ratio=}\n" +\
+            f"{self.warmup_steps=}\n{self.save_steps=}\n{self.save_total_limit=}\n" +\
             f"{self.logging_steps=}\n" +\
             f"{self.checkpoint=}\n{self.skip=}"
+        return s.replace("self.", "")
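
Note: the rewritten __str__ relies on the f-string `=` specifier (Python 3.8+), which renders each expression as `self.name=value`; the trailing `s.replace("self.", "")` is what reduces the dump to plain `name=value` lines. A minimal sketch with a hypothetical class (not part of the commit):

    class _Demo:
        def __init__(self):
            self.lora_r = 8
            self.lora_alpha = 16

        def __str__(self) -> str:
            s = f"{self.lora_r=}\n{self.lora_alpha=}"   # "self.lora_r=8\nself.lora_alpha=16"
            return s.replace("self.", "")

    print(_Demo())   # prints:
                     # lora_r=8
                     # lora_alpha=16
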
@@ -43,6 +43,8 @@ def parse_commandline():
     parser_training.add_argument("--lora_r", default=8, type=int, help="Default: %(default)s")
     parser_training.add_argument("--lora_alpha", default=16, type=int, help="Default: %(default)s")
     parser_training.add_argument("--lora_dropout", default=0.05, type=float, help="Default: %(default)s")
+    parser_training.add_argument("--grad_chckpt", action="store_true", required=False, help="Use gradient checkpoint. For 30B model. Default: %(default)s")
+    parser_training.add_argument("--grad_chckpt_ratio", default=1, type=float, help="Gradient checkpoint ratio. Default: %(default)s")
     parser_training.add_argument("--val_set_size", default=0.2, type=float, help="Validation set size. Default: %(default)s")
     parser_training.add_argument("--warmup_steps", default=50, type=int, help="Default: %(default)s")
     parser_training.add_argument("--save_steps", default=50, type=int, help="Default: %(default)s")
@@ -72,6 +74,8 @@ def get_config() -> Finetune4bConfig:
         lora_alpha=args["lora_alpha"],
         lora_dropout=args["lora_dropout"],
         val_set_size=args["val_set_size"],
+        gradient_checkpointing=args["grad_chckpt"],
+        gradient_checkpointing_ratio=args["grad_chckpt_ratio"],
         warmup_steps=args["warmup_steps"],
         save_steps=args["save_steps"],
         save_total_limit=args["save_total_limit"],
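
Note: a standalone sketch of how the two new flags parse, copying the add_argument calls above into a throwaway parser (illustrative only; this is not the project's parse_commandline()). get_config() then maps the parsed values onto gradient_checkpointing / gradient_checkpointing_ratio, as in the hunk above.

    import argparse

    p = argparse.ArgumentParser()
    p.add_argument("--grad_chckpt", action="store_true", required=False,
                   help="Use gradient checkpoint. For 30B model. Default: %(default)s")
    p.add_argument("--grad_chckpt_ratio", default=1, type=float,
                   help="Gradient checkpoint ratio. Default: %(default)s")

    args = vars(p.parse_args(["--grad_chckpt", "--grad_chckpt_ratio", "0.5"]))
    print(args["grad_chckpt"], args["grad_chckpt_ratio"])   # True 0.5
    # With neither flag given, grad_chckpt is False and grad_chckpt_ratio is 1,
    # so gradient checkpointing stays off by default.
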
finetune.py (33 changed lines)

@@ -40,6 +40,8 @@ ft_config = get_config()
 # * Show loaded parameters
 print(f"{ft_config}\n")
 
+if ft_config.gradient_checkpointing:
+    print('Disable Dropout.')
 
 # Load Basic Model
 model, tokenizer = load_llama_model_4bit_low_ram(ft_config.llama_q4_config_dir, ft_config.llama_q4_model)
@@ -70,7 +72,6 @@ for n, m in model.named_modules():
 tokenizer.pad_token_id = 0
 
 if not ft_config.skip:
-    # ! TODO: Refactor to load both SAD and LLAMA datasets
     # Load Data
     data = None
     match ft_config.ds_type:
@@ -85,6 +86,12 @@ if not ft_config.skip:
     data.prepare_data()
     ####
 
+    # Use gradient checkpointing
+    if ft_config.gradient_checkpointing:
+        print('Applying gradient checkpointing ...')
+        from gradient_checkpointing import apply_gradient_checkpointing
+        apply_gradient_checkpointing(model, checkpoint_ratio=ft_config.gradient_checkpointing_ratio)
+
     trainer = transformers.Trainer(
         model=model,
         train_dataset=data.train_data,
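
Note: apply_gradient_checkpointing itself lives in gradient_checkpointing.py and is not part of this diff, so the following is only a sketch of the general technique, under the assumption that checkpoint_ratio means "fraction of decoder layers to checkpoint": wrap each selected layer's forward in torch.utils.checkpoint so its activations are recomputed during backward instead of being stored.

    from torch.utils.checkpoint import checkpoint

    def checkpoint_some_layers(layers, checkpoint_ratio: float = 1.0):
        """Monkey-patch a fraction of the layers to recompute activations in backward."""
        n = int(len(layers) * checkpoint_ratio)
        for layer in list(layers)[:n]:
            orig_forward = layer.forward
            # default arg binds this layer's original forward (avoids late binding)
            def wrapped(*args, _orig=orig_forward, **kwargs):
                # non-reentrant checkpointing accepts kwargs (needs a PyTorch version
                # that supports use_reentrant)
                return checkpoint(_orig, *args, use_reentrant=False, **kwargs)
            layer.forward = wrapped

    # e.g. for a HF LLaMA-style model the decoder blocks usually sit at model.model.layers:
    # checkpoint_some_layers(model.model.layers, ft_config.gradient_checkpointing_ratio)
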
@@ -109,19 +116,21 @@ if not ft_config.skip:
     )
     model.config.use_cache = False
 
     # Set Model dict
     old_state_dict = model.state_dict
     model.state_dict = (
         lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
     ).__get__(model, type(model))
 
     # Run Trainer
     trainer.train()
 
     print('Train completed.')
 
-    # Save Model
-    model.save_pretrained(ft_config.lora_out_dir)
+    if not ft_config.checkpoint:
+        # Save Model
+        model.save_pretrained(ft_config.lora_out_dir)
+    else:
+        raise NotImplemented("TODO: Merge model + LoRA and save the whole checkpoint")
 
 print('Model Saved.')