From 417eba372ae5492ef8b12997538c28643a191133 Mon Sep 17 00:00:00 2001 From: Andy Barry Date: Wed, 5 Apr 2023 23:13:35 -0400 Subject: [PATCH] Add dockerfile and change some numbers to use 7bn model. --- Dockerfile | 46 ++++++++++ README.md | 75 +++++---------- requirements.txt | 4 +- requirements2.txt | 96 ++++++++++++++++++++ text-generation-webui/custom_monkey_patch.py | 12 ++- 5 files changed, 173 insertions(+), 60 deletions(-) create mode 100644 Dockerfile create mode 100644 requirements2.txt diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..d795a74 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,46 @@ +#FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel +FROM nvidia/cuda:11.7.0-devel-ubuntu22.04 + +# Get + +RUN apt-get update && apt-get install -y git wget python3 python3-pip +RUN ln -s `which python3` /usr/bin/python + +RUN pip3 install --upgrade pip requests tqdm + +# Some of the requirements expect some python packages in their setup.py, just install them first. +RUN pip install torch==2.0.0 +RUN pip install semantic-version==2.10.0 + +RUN git clone --depth=1 --branch main https://github.com/andybarry/alpaca_lora_4bit_docker.git && cd alpaca_lora_4bit +# && git checkout 86387a0a3575c82e689a452c20b2c9a5cc94a0f3 + +WORKDIR alpaca_lora_4bit + +COPY requirements2.txt requirements2.txt +RUN pip install -r requirements2.txt + +# The docker build environment has trouble detecting CUDA version, build for all reasonable archs +ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6" +COPY requirements.txt requirements.txt +RUN pip install -r requirements.txt + +RUN git clone --depth=1 --branch main https://github.com/andybarry/text-generation-webui-4bit.git text-generation-webui-tmp && cd text-generation-webui-tmp +# && git checkout 378d21e80c3d6f11a4835e57597c69e340008e2c + +RUN mv -f text-generation-webui-tmp/* text-generation-webui/ + +# Get the model +RUN cd text-generation-webui && python download-model.py --text-only decapoda-research/llama-7b-hf && mv models/decapoda-research_llama-7b-hf ../llama-7b-4bit + +RUN wget https://huggingface.co/decapoda-research/llama-7b-hf-int4/resolve/main/llama-7b-4bit.pt -O llama-7b-4bit.pt + +# Get LoRA +RUN cd text-generation-webui && python download-model.py samwit/alpaca7B-lora && mv loras/samwit_alpaca7B-lora ../alpaca7b_lora + +# Symlink for monkeypatch +RUN cd text-generation-webui && ln -s ../autograd_4bit.py ./autograd_4bit.py && ln -s ../matmul_utils_4bit.py . + +# Run the server +WORKDIR /alpaca_lora_4bit/text-generation-webui +CMD ["python", "server.py"] \ No newline at end of file diff --git a/README.md b/README.md index 2326cde..1ee9ee5 100644 --- a/README.md +++ b/README.md @@ -1,69 +1,36 @@ -# Alpaca Lora 4bit -Made some adjust for the code in peft and gptq for llama, and make it possible for lora finetuning with a 4 bits base model. The same adjustment can be made for 2, 3 and 8 bits. +# Run LLM chat in realtime on an 8GB NVIDIA GPU -* Install Manual by s4rduk4r: https://github.com/s4rduk4r/alpaca_lora_4bit_readme/blob/main/README.md (**NOTE:** don't use the install script, use the requirements.txt instead.) -* Also Remember to create a venv if you do not want the packages be overwritten. +## Dockerfile for alpaca_lora_4bit +Based on https://github.com/johnsmith0031/alpaca_lora_4bit -# Update Logs -* Resolved numerically unstable issue -* Reconstruct fp16 matrix from 4bit data and call torch.matmul largely increased the inference speed. -* Added install script for windows and linux. -* Added Gradient Checkpointing. 
Now It can finetune 30b model 4bit on a single GPU with 24G VRAM with Gradient Checkpointing enabled. (finetune.py updated) (but would reduce training speed, so if having enough VRAM this option is not needed)
-* Added install manual by s4rduk4r
-* Added pip install support by sterlind, preparing to merge changes upstream
-* Added V2 model support (with groupsize, both inference + finetune)
-* Added some options on finetune: set default to use eos_token instead of padding, add resume_checkpoint to continue training
-* Added offload support. load_llama_model_4bit_low_ram_and_offload_to_cpu function can be used.
+## Use
+Runs real-time LLM chat using Alpaca on an 8GB NVIDIA/CUDA GPU (e.g. a 3070 Ti mobile).
-# Requirements
-gptq-for-llama
-peft
-The specific version is inside requirements.txt
+## Requirements
+- Linux with Docker
+- An NVIDIA GPU

-# Install
-~copy files from GPTQ-for-LLaMa into GPTQ-for-LLaMa path and re-compile cuda extension~
-~copy files from peft/tuners/lora.py to peft path, replace it~
-
-**NOTE:** Install scripts are no longer needed! requirements.txt now pulls from forks with the necessary patches.
+## Installation
 ```
-pip install -r requirements.txt
+docker build -t alpaca_lora_4bit .
+docker run -p 7086:7086 alpaca_lora_4bit
 ```
+Point your browser to http://localhost:7086

-# Finetune
-~The same finetune script from https://github.com/tloen/alpaca-lora can be used.~
+## Results +It's fast on a 3070 Ti. -After installation, this script can be used: +### Discussion +The model isn't all that good, sometimes it goes crazy. But hey, "when 4-bits _you reach_ look this good you will not." -``` -python finetune.py -``` +But it is fast (on my 3070 Ti mobile at least) -# Inference -After installation, this script can be used: -``` -python inference.py -``` +## References -# Text Generation Webui Monkey Patch +- https://github.com/johnsmith0031/alpaca_lora_4bit +- https://github.com/s4rduk4r/alpaca_lora_4bit_readme/blob/main/README.md +- https://github.com/tloen/alpaca-lora -Clone the latest version of text generation webui and copy all the files into ./text-generation-webui/ -``` -git clone https://github.com/oobabooga/text-generation-webui.git -``` - -Open server.py and insert a line at the beginning -``` -import custom_monkey_patch # apply monkey patch -import gc -import io -... -``` - -Use the command to run - -``` -python server.py -``` diff --git a/requirements.txt b/requirements.txt index 605c0d1..9cc37db 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,9 @@ accelerate bitsandbytes datasets sentencepiece -safetensors +safetensors==0.3.0 +gradio +semantic-version==2.10.0 git+https://github.com/huggingface/transformers.git git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit git+https://github.com/sterlind/peft.git diff --git a/requirements2.txt b/requirements2.txt new file mode 100644 index 0000000..439e9fa --- /dev/null +++ b/requirements2.txt @@ -0,0 +1,96 @@ +accelerate==0.18.0 +aiofiles==23.1.0 +aiohttp==3.8.4 +aiosignal==1.3.1 +altair==4.2.2 +anyio==3.6.2 +async-timeout==4.0.2 +attrs==22.2.0 +bitsandbytes==0.37.2 +certifi==2022.12.7 +charset-normalizer==3.1.0 +click==8.1.3 +cmake==3.26.1 +contourpy==1.0.7 +cycler==0.11.0 +datasets==2.11.0 +dill==0.3.6 +entrypoints==0.4 +fastapi==0.95.0 +ffmpy==0.3.0 +filelock==3.10.7 +fonttools==4.39.3 +frozenlist==1.3.3 +fsspec==2023.3.0 +gradio==3.24.1 +gradio_client==0.0.7 +h11==0.14.0 +httpcore==0.16.3 +httpx==0.23.3 +huggingface-hub==0.13.3 +idna==3.4 +Jinja2==3.1.2 +jsonschema==4.17.3 +kiwisolver==1.4.4 +linkify-it-py==2.0.0 +lit==16.0.0 +Markdown==3.4.3 +markdown-it-py==2.2.0 +MarkupSafe==2.1.2 +matplotlib==3.7.1 +mdit-py-plugins==0.3.3 +mdurl==0.1.2 +mpmath==1.3.0 +multidict==6.0.4 +multiprocess==0.70.14 +networkx==3.1 +numpy==1.24.2 +nvidia-cublas-cu11==11.10.3.66 +nvidia-cuda-cupti-cu11==11.7.101 +nvidia-cuda-nvrtc-cu11==11.7.99 +nvidia-cuda-runtime-cu11==11.7.99 +nvidia-cudnn-cu11==8.5.0.96 +nvidia-cufft-cu11==10.9.0.58 +nvidia-curand-cu11==10.2.10.91 +nvidia-cusolver-cu11==11.4.0.1 +nvidia-cusparse-cu11==11.7.4.91 +nvidia-nccl-cu11==2.14.3 +nvidia-nvtx-cu11==11.7.91 +orjson==3.8.9 +packaging==23.0 +pandas==2.0.0 +Pillow==9.5.0 +psutil==5.9.4 +pyarrow==11.0.0 +pydantic==1.10.7 +pydub==0.25.1 +pyparsing==3.0.9 +pyrsistent==0.19.3 +python-dateutil==2.8.2 +python-multipart==0.0.6 +pytz==2023.3 +PyYAML==6.0 +regex==2023.3.23 +requests==2.28.2 +responses==0.18.0 +rfc3986==1.5.0 +safetensors==0.3.0 +semantic-version==2.10.0 +sentencepiece==0.1.97 +six==1.16.0 +sniffio==1.3.0 +starlette==0.26.1 +sympy==1.11.1 +tokenizers==0.13.3 +toolz==0.12.0 +torch==2.0.0 +tqdm==4.65.0 +triton==2.0.0 +typing_extensions==4.5.0 +tzdata==2023.3 +uc-micro-py==1.0.1 +urllib3==1.26.15 +uvicorn==0.21.1 +websockets==11.0 +xxhash==3.2.0 +yarl==1.8.2 diff --git a/text-generation-webui/custom_monkey_patch.py b/text-generation-webui/custom_monkey_patch.py index 6f586e3..cfdf724 100644 --- 
a/text-generation-webui/custom_monkey_patch.py +++ b/text-generation-webui/custom_monkey_patch.py @@ -7,9 +7,9 @@ from peft.tuners.lora import Linear4bitLt def load_model_llama(*args, **kwargs): - config_path = '../llama-13b-4bit/' - model_path = '../llama-13b-4bit.pt' - lora_path = '../alpaca13b_lora/' + config_path = '../llama-7b-4bit/' + model_path = '../llama-7b-4bit.pt' + lora_path = '../alpaca7b_lora/' print("Loading {} ...".format(model_path)) t0 = time.time() @@ -25,7 +25,9 @@ def load_model_llama(*args, **kwargs): if m.groupsize == -1: m.zeros = m.zeros.half() m.scales = m.scales.half() - m.bias = m.bias.half() + + # This line failed for me, commenting it out seems to work... + #m.bias = m.bias.half() autograd_4bit.use_new = True autograd_4bit.auto_switch = True @@ -35,7 +37,7 @@ def load_model_llama(*args, **kwargs): from modules import models from modules import shared models.load_model = load_model_llama -shared.args.model = 'llama-13b-4bit' +shared.args.model = 'llama-7b-4bit' shared.settings['name1'] = 'You' shared.settings['name2'] = 'Assistant' shared.settings['chat_prompt_size_max'] = 2048
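A minimal usage sketch for the image defined by the Dockerfile in this patch. It assumes the host has the NVIDIA Container Toolkit installed; the `--gpus all` flag is an assumption (it is not in the README's run command) but is typically required so the CUDA runtime inside the container can see the GPU. The image tag and port mapping follow the README's Installation section.

```
# Build the image from the repository root (uses the Dockerfile added by this patch)
docker build -t alpaca_lora_4bit .

# Run the chat server; --gpus all is assumed to be needed for GPU access
# (requires the NVIDIA Container Toolkit on the host)
docker run --gpus all -p 7086:7086 alpaca_lora_4bit

# Then open http://localhost:7086 in a browser, as described in the README
```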