Add Dockerfile and change some numbers to use the 7B model.

Andy Barry 2023-04-05 23:13:35 -04:00
parent 86387a0a35
commit 417eba372a
5 changed files with 173 additions and 60 deletions

Dockerfile (new file, 46 lines)

@@ -0,0 +1,46 @@
#FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
FROM nvidia/cuda:11.7.0-devel-ubuntu22.04
# Get basic tooling and Python
RUN apt-get update && apt-get install -y git wget python3 python3-pip
RUN ln -s `which python3` /usr/bin/python
RUN pip3 install --upgrade pip requests tqdm
# Some of the requirements expect certain Python packages at setup.py time, so install those first.
RUN pip install torch==2.0.0
RUN pip install semantic-version==2.10.0
RUN git clone --depth=1 --branch main https://github.com/andybarry/alpaca_lora_4bit_docker.git alpaca_lora_4bit
# && git checkout 86387a0a3575c82e689a452c20b2c9a5cc94a0f3
WORKDIR alpaca_lora_4bit
COPY requirements2.txt requirements2.txt
RUN pip install -r requirements2.txt
# The Docker build environment has trouble detecting the CUDA version, so build for all reasonable architectures
ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6"
COPY requirements.txt requirements.txt
RUN pip install -r requirements.txt
RUN git clone --depth=1 --branch main https://github.com/andybarry/text-generation-webui-4bit.git text-generation-webui-tmp
# && git checkout 378d21e80c3d6f11a4835e57597c69e340008e2c
RUN mv -f text-generation-webui-tmp/* text-generation-webui/
# Get the model
RUN cd text-generation-webui && python download-model.py --text-only decapoda-research/llama-7b-hf && mv models/decapoda-research_llama-7b-hf ../llama-7b-4bit
RUN wget https://huggingface.co/decapoda-research/llama-7b-hf-int4/resolve/main/llama-7b-4bit.pt -O llama-7b-4bit.pt
# Get LoRA
RUN cd text-generation-webui && python download-model.py samwit/alpaca7B-lora && mv loras/samwit_alpaca7B-lora ../alpaca7b_lora
# Symlink for monkeypatch
RUN cd text-generation-webui && ln -s ../autograd_4bit.py ./autograd_4bit.py && ln -s ../matmul_utils_4bit.py .
# Run the server
WORKDIR /alpaca_lora_4bit/text-generation-webui
CMD ["python", "server.py"]

README.md

@@ -1,69 +1,36 @@
# Alpaca Lora 4bit
Made some adjustments to the code in peft and GPTQ-for-LLaMa to make LoRA finetuning possible with a 4-bit base model. The same adjustment can be made for 2, 3 and 8 bits.
# Run LLM chat in realtime on an 8GB NVIDIA GPU
* Install Manual by s4rduk4r: https://github.com/s4rduk4r/alpaca_lora_4bit_readme/blob/main/README.md (**NOTE:** don't use the install script, use the requirements.txt instead.)
* Also remember to create a venv if you do not want the packages to be overwritten.
## Dockerfile for alpaca_lora_4bit
Based on https://github.com/johnsmith0031/alpaca_lora_4bit
# Update Logs
* Resolved a numerical instability issue.
* Reconstructing the fp16 matrix from 4-bit data and calling torch.matmul greatly increased inference speed.
* Added install script for windows and linux.
* Added gradient checkpointing. It can now finetune a 30B model in 4-bit on a single GPU with 24GB VRAM when gradient checkpointing is enabled (finetune.py updated). This reduces training speed, so skip it if you have enough VRAM.
* Added install manual by s4rduk4r
* Added pip install support by sterlind, preparing to merge changes upstream
* Added V2 model support (with groupsize, both inference + finetune)
* Added some finetune options: use eos_token instead of padding by default, and resume_checkpoint to continue training.
* Added offload support; the load_llama_model_4bit_low_ram_and_offload_to_cpu function can be used (a rough call sketch follows this list).
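The offload loader in the last item is only mentioned by name; a rough sketch of a call, reusing the config_path and model_path values from custom_monkey_patch.py (the import location and argument order here are assumptions, not a confirmed signature), might look like:
```
from autograd_4bit import load_llama_model_4bit_low_ram_and_offload_to_cpu

# Paths follow custom_monkey_patch.py; the argument order and any extra
# offload parameters are assumptions -- check autograd_4bit.py for the
# actual signature.
model, tokenizer = load_llama_model_4bit_low_ram_and_offload_to_cpu(
    '../llama-7b-4bit/',    # config_path
    '../llama-7b-4bit.pt',  # model_path
)
```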
## Use
Runs real-time LLM chat using Alpaca on an 8GB NVIDIA/CUDA GPU (e.g. a 3070 Ti mobile).
# Requirements
gptq-for-llama <br>
peft<br>
The specific versions are in requirements.txt<br>
## Requirements
- Linux with Docker
- an NVIDIA GPU
# Install
~copy files from GPTQ-for-LLaMa into GPTQ-for-LLaMa path and re-compile cuda extension~<br>
~copy files from peft/tuners/lora.py to peft path, replace it~<br>
**NOTE:** Install scripts are no longer needed! requirements.txt now pulls from forks with the necessary patches.
## Installation
```
pip install -r requirements.txt
docker build -t alpaca_lora_4bit .
docker run --gpus all -p 7086:7086 alpaca_lora_4bit
```
Point your browser to http://localhost:7086
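If the page does not come up or the model fails to load, it is worth confirming that the container actually sees the GPU. A quick check from the image's Python environment (assuming the container was started with GPU access):
```
import torch

# Should print True and the GPU name when the container was started with
# GPU access (e.g. `docker run --gpus all ...`); False means CUDA is not
# visible inside the container.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```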
# Finetune
~The same finetune script from https://github.com/tloen/alpaca-lora can be used.~<br>
## Results
It's fast on a 3070 Ti.
After installation, this script can be used:
### Discussion
The model isn't all that good; sometimes it goes crazy. But hey, "when 4-bits _you reach_ look this good you will not."
```
python finetune.py
```
But it is fast (on my 3070 Ti mobile at least)
# Inference
After installation, this script can be used:
```
python inference.py
```
## References
# Text Generation Webui Monkey Patch
- https://github.com/johnsmith0031/alpaca_lora_4bit
- https://github.com/s4rduk4r/alpaca_lora_4bit_readme/blob/main/README.md
- https://github.com/tloen/alpaca-lora
Clone the latest version of text-generation-webui and copy all the files into ./text-generation-webui/
```
git clone https://github.com/oobabooga/text-generation-webui.git
```
Open server.py and insert a line at the beginning
```
import custom_monkey_patch # apply monkey patch
import gc
import io
...
```
Use this command to run it:
```
python server.py
```

requirements.txt

@@ -3,7 +3,9 @@ accelerate
bitsandbytes
datasets
sentencepiece
safetensors
safetensors==0.3.0
gradio
semantic-version==2.10.0
git+https://github.com/huggingface/transformers.git
git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
git+https://github.com/sterlind/peft.git

requirements2.txt (new file, 96 lines)

@@ -0,0 +1,96 @@
accelerate==0.18.0
aiofiles==23.1.0
aiohttp==3.8.4
aiosignal==1.3.1
altair==4.2.2
anyio==3.6.2
async-timeout==4.0.2
attrs==22.2.0
bitsandbytes==0.37.2
certifi==2022.12.7
charset-normalizer==3.1.0
click==8.1.3
cmake==3.26.1
contourpy==1.0.7
cycler==0.11.0
datasets==2.11.0
dill==0.3.6
entrypoints==0.4
fastapi==0.95.0
ffmpy==0.3.0
filelock==3.10.7
fonttools==4.39.3
frozenlist==1.3.3
fsspec==2023.3.0
gradio==3.24.1
gradio_client==0.0.7
h11==0.14.0
httpcore==0.16.3
httpx==0.23.3
huggingface-hub==0.13.3
idna==3.4
Jinja2==3.1.2
jsonschema==4.17.3
kiwisolver==1.4.4
linkify-it-py==2.0.0
lit==16.0.0
Markdown==3.4.3
markdown-it-py==2.2.0
MarkupSafe==2.1.2
matplotlib==3.7.1
mdit-py-plugins==0.3.3
mdurl==0.1.2
mpmath==1.3.0
multidict==6.0.4
multiprocess==0.70.14
networkx==3.1
numpy==1.24.2
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
orjson==3.8.9
packaging==23.0
pandas==2.0.0
Pillow==9.5.0
psutil==5.9.4
pyarrow==11.0.0
pydantic==1.10.7
pydub==0.25.1
pyparsing==3.0.9
pyrsistent==0.19.3
python-dateutil==2.8.2
python-multipart==0.0.6
pytz==2023.3
PyYAML==6.0
regex==2023.3.23
requests==2.28.2
responses==0.18.0
rfc3986==1.5.0
safetensors==0.3.0
semantic-version==2.10.0
sentencepiece==0.1.97
six==1.16.0
sniffio==1.3.0
starlette==0.26.1
sympy==1.11.1
tokenizers==0.13.3
toolz==0.12.0
torch==2.0.0
tqdm==4.65.0
triton==2.0.0
typing_extensions==4.5.0
tzdata==2023.3
uc-micro-py==1.0.1
urllib3==1.26.15
uvicorn==0.21.1
websockets==11.0
xxhash==3.2.0
yarl==1.8.2

custom_monkey_patch.py

@@ -7,9 +7,9 @@ from peft.tuners.lora import Linear4bitLt
def load_model_llama(*args, **kwargs):
config_path = '../llama-13b-4bit/'
model_path = '../llama-13b-4bit.pt'
lora_path = '../alpaca13b_lora/'
config_path = '../llama-7b-4bit/'
model_path = '../llama-7b-4bit.pt'
lora_path = '../alpaca7b_lora/'
print("Loading {} ...".format(model_path))
t0 = time.time()
@@ -25,7 +25,9 @@ def load_model_llama(*args, **kwargs):
if m.groupsize == -1:
m.zeros = m.zeros.half()
m.scales = m.scales.half()
m.bias = m.bias.half()
# This line failed for me; commenting it out seems to work
# (a guarded alternative is sketched after this diff).
#m.bias = m.bias.half()
autograd_4bit.use_new = True
autograd_4bit.auto_switch = True
@@ -35,7 +37,7 @@ def load_model_llama(*args, **kwargs):
from modules import models
from modules import shared
models.load_model = load_model_llama
shared.args.model = 'llama-13b-4bit'
shared.args.model = 'llama-7b-4bit'
shared.settings['name1'] = 'You'
shared.settings['name2'] = 'Assistant'
shared.settings['chat_prompt_size_max'] = 2048
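A note on the bias conversion commented out above: if the failure is simply that m.bias is None for bias-free LLaMA linear layers (an assumption; the original error is not shown), the line inside the module loop can be guarded instead of removed:
```
# Sketch of a guarded alternative, assuming the failure is m.bias being
# None; layers that do carry a bias are still converted to fp16.
if getattr(m, 'bias', None) is not None:
    m.bias = m.bias.half()
```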