Merge pull request #63 from andybarry/dockerfile

Add a Dockerfile and readme changes for quick start
2023-04-08 15:48:27 +08:00 · 2023-04-08 15:48:27 +08:00 · 56e5bf2854
parent f91d4cbb59 a93cf1264a
commit 56e5bf2854
4 changed files with 94 additions and 0 deletions
--- a/78
+++ b/78
@ -0,0 +1,78 @@
+# syntax=docker/dockerfile:1
+
+# Dockerfile is split into parts because we want to cache building the requirements and downloading the model, both of which can take a long time.
+
+FROM nvidia/cuda:11.7.0-devel-ubuntu22.04 AS builder
+
+# Python toolchain used to build the requirements.
+# DEBIAN_FRONTEND is set inline so a package prompt cannot stall the build
+# without leaking the setting into the image environment.
+# NOTE(review): --no-install-recommends is intentionally not used here;
+# python3-pip's recommended packages presumably supply the headers/compilers
+# the source builds below need -- confirm before trimming.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        git \
+        python3 \
+        python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN --mount=type=cache,target=/root/.cache/pip pip3 install --upgrade pip
+
+# Some of the requirements expect some python packages in their setup.py, just install them first.
+RUN --mount=type=cache,target=/root/.cache/pip pip install --user torch==2.0.0
+RUN --mount=type=cache,target=/root/.cache/pip pip install --user semantic-version==2.10.0 requests tqdm
+
+# The docker build environment has trouble detecting CUDA version, build for all reasonable archs
+ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6"
+COPY requirements.txt requirements.txt
+# Everything is installed with --user so it lands under /root/.local.
+RUN --mount=type=cache,target=/root/.cache pip install --user -r requirements.txt
+
+# -------------------------------
+
+# Download the model
+FROM nvidia/cuda:11.7.0-devel-ubuntu22.04 AS downloader
+# wget needs CA certificates for the HTTPS download; they are listed explicitly
+# because recommended packages are disabled.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ca-certificates \
+        wget \
+    && rm -rf /var/lib/apt/lists/*
+
+# Write the weights to an explicit absolute path at the image root instead of
+# relying on the stage's implicit working directory.
+RUN wget --progress=bar:force:noscroll -O /llama-7b-4bit.pt https://huggingface.co/decapoda-research/llama-7b-hf-int4/resolve/main/llama-7b-4bit.pt
+
+
+
+# -------------------------------
+
+#FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-devel
+FROM nvidia/cuda:11.7.0-devel-ubuntu22.04 AS runtime
+
+# DEBIAN_FRONTEND is set inline so a package prompt cannot stall the build;
+# the apt lists are removed in the same layer so they do not ship in the image.
+RUN --mount=type=cache,target=/var/cache/apt apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
+        git \
+        python3 \
+        python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Later build steps and the final CMD invoke the interpreter as `python`.
+RUN ln -s "$(command -v python3)" /usr/bin/python
+
+# Copy the installed packages from the first stage
+COPY --from=builder /root/.local /root/.local
+
+# WORKDIR creates the directory if it is missing, so no separate mkdir is needed.
+WORKDIR /alpaca_lora_4bit
+
+# The downloader stage saves the weights at its filesystem root.
+COPY --from=downloader /llama-7b-4bit.pt llama-7b-4bit.pt
+
+#RUN git clone --depth=1 --branch main https://github.com/andybarry/text-generation-webui-4bit.git text-generation-webui-tmp
+
+# NOTE(review): this tracks the moving `main` branch, so rebuilds are not
+# reproducible; consider pinning a known-good commit.
+RUN git clone --depth=1 --branch main https://github.com/oobabooga/text-generation-webui.git text-generation-webui-tmp
+
+RUN --mount=type=cache,target=/root/.cache pip install --user markdown gradio
+
+# Work inside the fresh checkout for the patch and download steps.
+WORKDIR /alpaca_lora_4bit/text-generation-webui-tmp
+
+# Apply monkey patch: prepend the two import lines to server.py.
+# The text is passed as the printf format itself (it contains no % conversions).
+RUN printf 'import custom_monkey_patch # apply monkey patch\nimport gc\n\n' | cat - server.py > tmpfile && mv tmpfile server.py
+
+# Get the model config
+RUN python download-model.py --text-only decapoda-research/llama-7b-hf && mv models/decapoda-research_llama-7b-hf ../llama-7b-4bit
+
+# Get LoRA
+RUN python download-model.py samwit/alpaca7b-lora && mv loras/samwit_alpaca7b-lora ../alpaca7b_lora
+
+# Return to the project root for the remaining steps.
+WORKDIR /alpaca_lora_4bit
+
+# Project sources go into the current working directory.
+COPY *.py ./
+COPY text-generation-webui text-generation-webui
+COPY monkeypatch ./
+
+# Merge the upstream checkout into the project's text-generation-webui directory.
+RUN mv -f text-generation-webui-tmp/* text-generation-webui/
+
+# Symlink for monkeypatch: expose both helper modules inside the webui directory.
+RUN ln -s ../autograd_4bit.py ../matmul_utils_4bit.py text-generation-webui/
+
+# Swap to the 7bn parameter model
+RUN sed -i \
+        -e 's/llama-13b-4bit/llama-7b-4bit/g' \
+        -e 's/alpaca13b_lora/alpaca7b_lora/g' \
+        text-generation-webui/custom_monkey_patch.py
+
+# Port the README quick start publishes with `-p 7860:7860`.
+EXPOSE 7860
+
+# Run the server
+WORKDIR /alpaca_lora_4bit/text-generation-webui
+CMD ["python", "-u", "server.py", "--listen", "--chat"]
--- a/README.md
+++ b/README.md
@ -1,6 +1,21 @@
 # Alpaca Lora 4bit
 Made some adjustments to the code in peft and gptq for llama, making LoRA finetuning possible with a 4-bit base model. The same adjustment can be made for 2, 3 and 8 bits.

+## Quick start for running the chat UI
+
+```
+git clone https://github.com/andybarry/alpaca_lora_4bit_docker.git
+DOCKER_BUILDKIT=1 docker build -t alpaca_lora_4bit . # build step can take 12 min
+docker run --gpus=all -p 7860:7860 alpaca_lora_4bit
+```
+Point your browser to http://localhost:7860
+
+## Results
+It's fast on a 3070 Ti mobile.  Uses 5-6 GB of GPU RAM.
+
+![](alpaca_lora_4bit_penguin_fact.gif)
+
+# Development
 * Install Manual by s4rduk4r: https://github.com/s4rduk4r/alpaca_lora_4bit_readme/blob/main/README.md (**NOTE:** don't use the install script, use the requirements.txt instead.)
 * Also remember to create a venv if you do not want your packages to be overwritten.

--- a/alpaca_lora_4bit_penguin_fact.gif
+++ b/alpaca_lora_4bit_penguin_fact.gif
--- a/autograd_4bit.py
+++ b/autograd_4bit.py
@ -107,6 +107,7 @@ class Autograd4bitQuantLinear(nn.Module):
        self.bits = bits
        self.maxq = 2 ** self.bits - 1
        self.groupsize = groupsize
+        self.g_idx = 0
        if groupsize == -1:
            self.register_buffer('zeros', torch.empty((out_features, 1)))
            self.register_buffer('scales', torch.empty((out_features, 1)))