From f91d4cbb593b097f5dfb60866a04e90044414da6 Mon Sep 17 00:00:00 2001
From: John Smith
Date: Fri, 7 Apr 2023 16:10:36 +0800
Subject: [PATCH] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 4256ac0..d5336fd 100644
--- a/README.md
+++ b/README.md
@@ -15,6 +15,8 @@ Made some adjust for the code in peft and gptq for llama, and make it possible f
 * Added some finetuning options: use eos_token by default instead of padding, and resume_checkpoint to continue training from a checkpoint
 * Added offload support; the load_llama_model_4bit_low_ram_and_offload_to_cpu function can be used.
 * Added a monkey patch for the text generation webui to fix the initial eos token issue.
+* Added Flash Attention support. (Use --flash-attention)
+* Added a Triton backend to support models that use groupsize and act-order. (Use --backend=triton)

 # Requirements
 gptq-for-llama
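For context, here is a minimal sketch of how the offload loader named in the changelog above might be called. The module path, argument order, and file paths are assumptions for illustration, not the confirmed API; check the repository source for the real signature:

```
# Hypothetical usage sketch of the offload loader from the changelog above.
# Module path, arguments, and paths are assumptions, not the confirmed API.
from autograd_4bit import load_llama_model_4bit_low_ram_and_offload_to_cpu

model, tokenizer = load_llama_model_4bit_low_ram_and_offload_to_cpu(
    "./llama-7b-4bit/",    # placeholder: directory containing the model config
    "./llama-7b-4bit.pt",  # placeholder: 4-bit quantized checkpoint
)
model.eval()  # per the changelog, layers are offloaded to CPU and paged to GPU as needed
```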
@@ -82,6 +84,4 @@ python server.py
 # Flash Attention
 It seems that a monkey patch can be applied to the llama model. To use it, simply download the file from [MonkeyPatch](https://github.com/lm-sys/FastChat/blob/daa9c11080ceced2bd52c3e0027e4f64b1512683/fastchat/train/llama_flash_attn_monkey_patch.py).
 The flash-attn package is also required, and it currently does not support PyTorch 2.0.
-```
-pip install flash-attn
-```
+Just add --flash-attention to enable it during finetuning.
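For readers who want to apply the patch manually instead of passing --flash-attention, a sketch under the assumption that the downloaded file sits next to your script; the function name comes from the linked FastChat file:

```
# Manual application of the FastChat flash-attention monkey patch.
# Assumes llama_flash_attn_monkey_patch.py was downloaded into the working
# directory, and that flash-attn is installed (pip install flash-attn;
# not compatible with PyTorch 2.0 at the time of this patch).
from llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn

# Must run before the llama model is instantiated, so the patched
# attention forward is the one that gets used.
replace_llama_attn_with_flash_attn()
```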