From 62e54ac1c789172f9fff4c98ca30bb4823dd5131 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Mon, 27 Mar 2023 16:08:20 -0400
Subject: [PATCH 1/2] backwards support for pre-py3.10, add datasets
 requirement used in train

---
 finetune.py      | 21 ++++++++++-----------
 requirements.txt |  1 +
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/finetune.py b/finetune.py
index 3970cc8..32124f9 100644
--- a/finetune.py
+++ b/finetune.py
@@ -1,7 +1,7 @@
 """
 llama-4b trainer with support of Stanford Alpaca-like JSON datasets (short for SAD)
 Intended to use with https://github.com/johnsmith0031/alpaca_lora_4bit
-
+
 SAD structure:
 [
 {
@@ -72,15 +72,14 @@ tokenizer.pad_token_id = 0
 if not ft_config.skip:
     # Load Data
     data = None
-    match ft_config.ds_type:
-        case "txt" if not ft_config.skip:
-            #### LLaMA
-            data = train_data.TrainTxt(ft_config.dataset, ft_config.val_set_size, tokenizer, ft_config.cutoff_len)
-        case "alpaca" if not ft_config.skip:
-            #### Stanford Alpaca-like Data
-            data = train_data.TrainSAD(ft_config.dataset, ft_config.val_set_size, tokenizer, ft_config.cutoff_len)
-        case _:
-            raise NotImplementedError("ERROR: Unknown dataset format")
+    if ft_config.ds_type == "txt" and not ft_config.skip:
+        #### LLaMa
+        data = train_data.TrainTxt(ft_config.dataset, ft_config.val_set_size, tokenizer, ft_config.cutoff_len)
+    elif ft_config.ds_type == "alpaca" and not ft_config.skip:
+        #### Stanford Alpaca-like Data
+        data = train_data.TrainSAD(ft_config.dataset, ft_config.val_set_size, tokenizer, ft_config.cutoff_len)
+    else:
+        raise NotImplementedError("ERROR: Unknown dataset format")
     data.prepare_data()
 
     ####
@@ -136,5 +135,5 @@ model.save_pretrained(ft_config.lora_out_dir)
 
 if ft_config.checkpoint:
     print("Warning: Merge model + LoRA and save the whole checkpoint not implemented yet.")
-
+
 print('Model Saved.')
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index b8128e3..dadcad1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 torch
 accelerate
 bitsandbytes
+datasets
 git+https://github.com/huggingface/transformers.git
 git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
 git+https://github.com/sterlind/peft.git

From 101d314bd9437a236e485956438ef48a867aea78 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Mon, 27 Mar 2023 16:13:46 -0400
Subject: [PATCH 2/2] add missing dependency to train with LlamaTokenizer

---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index dadcad1..c752c4b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@ torch
 accelerate
 bitsandbytes
 datasets
+sentencepiece
 git+https://github.com/huggingface/transformers.git
 git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
 git+https://github.com/sterlind/peft.git
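
Note on the pre-py3.10 change in PATCH 1/2: match/case (structural pattern
matching, PEP 634) only exists in Python 3.10+, and older interpreters reject
it at parse time with a SyntaxError, so finetune.py could not even be imported
on 3.8/3.9 before this patch. The if/elif chain it switches to is semantically
equivalent here. (The "and not ft_config.skip" guards are carried over from
the original "case ... if" guards and are redundant inside the enclosing
"if not ft_config.skip:" block, matching the original behavior exactly.)
Below is a minimal self-contained Python sketch of the equivalence;
ds_type, load_txt, and load_alpaca are illustrative stand-ins for
ft_config.ds_type, train_data.TrainTxt, and train_data.TrainSAD, not
names from the repo:

    def load_txt():
        # illustrative stand-in for train_data.TrainTxt(...)
        return "plain-text dataset"

    def load_alpaca():
        # illustrative stand-in for train_data.TrainSAD(...)
        return "alpaca-format dataset"

    ds_type = "alpaca"

    # Python >= 3.10 only: on 3.9 and earlier this block is a
    # SyntaxError before any code runs.
    match ds_type:
        case "txt":
            data = load_txt()
        case "alpaca":
            data = load_alpaca()
        case _:
            raise NotImplementedError("ERROR: Unknown dataset format")

    # Equivalent form on any Python 3.x -- the shape this patch uses.
    if ds_type == "txt":
        data = load_txt()
    elif ds_type == "alpaca":
        data = load_alpaca()
    else:
        raise NotImplementedError("ERROR: Unknown dataset format")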