add monkey patch for webui
parent 45d2f22c14
commit 9b04b8eec6

README.md (+21)
custom_monkey_patch.py (+47, new file)
@@ -45,3 +45,24 @@ After installation, this script can be used:
```
python inference.py
```

# Text Generation Webui Monkey Patch

Clone the latest version of text-generation-webui and copy all the files into ./text-generation-webui/ (one way to do the copy is sketched below):

```
git clone https://github.com/oobabooga/text-generation-webui.git
```
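The "copy all the files" step is not spelled out in the commit. A minimal sketch of one way to do it, assuming the patch files (custom_monkey_patch.py and any companions) sit in the current working directory; the `*.py` glob is illustrative:

```
import glob
import shutil

# Copy this repository's Python files into the freshly cloned webui.
for f in glob.glob('*.py'):
    shutil.copy(f, 'text-generation-webui/')
```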
Open server.py and insert a line at the very beginning, so the patch is applied before the webui loads the model:

```
import custom_monkey_patch # apply monkey patch
import gc
import io
...
```

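If you prefer not to edit server.py by hand, a small helper can prepend the import. This is only a sketch; it assumes server.py has no shebang or encoding line that must stay first:

```
# Insert the monkey-patch import at the top of server.py (idempotent).
path = 'text-generation-webui/server.py'
with open(path) as f:
    src = f.read()
if 'custom_monkey_patch' not in src:
    with open(path, 'w') as f:
        f.write('import custom_monkey_patch  # apply monkey patch\n' + src)
```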
Use this command to run the webui:

```
python server.py
```

custom_monkey_patch.py (new file)

@@ -0,0 +1,47 @@
import sys

# Put the local clones of the patched transformers, GPTQ-for-LLaMa and peft
# ahead of any installed versions on the import path.
sys.path.insert(0, '../repository/transformers/src')
sys.path.insert(0, '../repository/GPTQ-for-LLaMa')
sys.path.insert(0, '../repository/peft/src')

import time
import torch
import autograd_4bit
from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
from peft import PeftModel
from peft.tuners.lora import Linear4bitLt

def load_model_llama(*args, **kwargs):

    config_path = '../llama-13b-4bit/'
    model_path = '../llama-13b-4bit.pt'
    lora_path = '../alpaca13b_lora/'

    print("Loading {} ...".format(model_path))
    t0 = time.time()

    model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path)
    print('Loaded in {:.2f} seconds.'.format(time.time() - t0))

    model = PeftModel.from_pretrained(model, lora_path, device_map={'': 0}, torch_dtype=torch.float32)
    print('{} Lora Applied.'.format(lora_path))

    print('Apply auto switch and half')
    # Cast the quantization parameters of every 4-bit linear layer to fp16.
    for n, m in model.named_modules():
        if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
            m.zeros = m.zeros.half()
            m.scales = m.scales.half()
            m.bias = m.bias.half()
    autograd_4bit.use_new = True
    autograd_4bit.auto_switch = True

    return model, tokenizer

# Monkey Patch: swap the webui's model loader for the 4-bit loader above
# and pre-seed the model name and chat settings it expects.
from modules import models
from modules import shared

models.load_model = load_model_llama
shared.args.model = 'llama-13b-4bit'
shared.settings['name1'] = 'You'
shared.settings['name2'] = 'Assistant'
shared.settings['chat_prompt_size_max'] = 2048
shared.settings['chat_prompt_size'] = 2048

print('Monkey Patch Completed.')
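To check that the patch actually took, you can import it from a Python shell inside text-generation-webui/ and confirm the loader was swapped. A minimal sketch using only names defined in the file above:

```
import custom_monkey_patch  # the patch is applied at import time
from modules import models, shared

# The webui's loader should now be the 4-bit loader defined above.
assert models.load_model is custom_monkey_patch.load_model_llama
print(shared.args.model)  # -> llama-13b-4bit
```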