add monkey patch for webui

2023-03-22 07:58:51 +00:00
parent 45d2f22c14
commit 9b04b8eec6
2 changed files with 68 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -45,3 +45,24 @@ After installation, this script can be used:
 ```
 python inference.py
 ```
 # Text Generation Webui Monkey Patch
 Clone the latest version of text-generation-webui and copy all of its files into this repository's `./text-generation-webui/` directory (the one that contains `custom_monkey_patch.py`):
 ```
 git clone https://github.com/oobabooga/text-generation-webui.git
 ```
 Open `server.py` and insert the monkey-patch import as the very first line, above the existing imports:
 ```
 import custom_monkey_patch # apply monkey patch
 import gc
 import io
 ...
 ```
 Then start the web UI with:
 ```
 python server.py
 ```
--- a/text-generation-webui/custom_monkey_patch.py
+++ b/text-generation-webui/custom_monkey_patch.py
@@ -0,0 +1,47 @@
 import sys
 # Put the vendored checkouts at the front of the module search path so the
 # imports below resolve to them first. Each insert goes to index 0, so the
 # last one inserted (peft/src) is searched first. The paths are relative and
 # therefore resolve against the current working directory of the process.
 sys.path.insert(0, '../repository/transformers/src')
 sys.path.insert(0, '../repository/GPTQ-for-LLaMa')
 sys.path.insert(0, '../repository/peft/src')
 import time
 import torch
 # The module itself is imported as well because load_model_llama sets
 # module-level flags on it (use_new, auto_switch).
 import autograd_4bit
 from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
 from peft import PeftModel
 from peft.tuners.lora import Linear4bitLt
 def load_model_llama(*args, **kwargs):
     """Load the 4-bit LLaMA model with its LoRA adapter applied.

     Installed as a replacement for ``modules.models.load_model``. Any
     positional or keyword arguments are accepted for call compatibility
     but ignored: the config, checkpoint and LoRA locations are hard-coded
     relative paths, so they resolve against the current working directory.

     Returns:
         A ``(model, tokenizer)`` tuple, where ``model`` is the
         ``PeftModel`` wrapping the loaded 4-bit model.
     """
     config_path = '../llama-13b-4bit/'
     model_path = '../llama-13b-4bit.pt'
     lora_path = '../alpaca13b_lora/'
     print("Loading {} ...".format(model_path))
     t0 = time.time()
     model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path)
     model = PeftModel.from_pretrained(model, lora_path, device_map={'': 0}, torch_dtype=torch.float32)
     print('{} Lora Applied.'.format(lora_path))
     print('Apply auto switch and half')
     # Cast the quantization parameters of every 4-bit linear layer to fp16.
     for _, m in model.named_modules():
         if isinstance(m, (Autograd4bitQuantLinear, Linear4bitLt)):
             m.zeros = m.zeros.half()
             m.scales = m.scales.half()
             # NOTE(review): guard in case a layer is built without a bias;
             # calling .half() on None would raise AttributeError.
             if m.bias is not None:
                 m.bias = m.bias.half()
     # Module-level switches read by autograd_4bit.
     # NOTE(review): exact semantics are defined in that module - confirm there.
     autograd_4bit.use_new = True
     autograd_4bit.auto_switch = True
     # t0 was previously recorded but never reported.
     print('Model loaded in {:.2f} seconds.'.format(time.time() - t0))
     return model, tokenizer
 # Monkey patch: runs at import time. Replaces the web UI's model loader with
 # load_model_llama, then presets the model name and the chat settings.
 from modules import models, shared
 models.load_model = load_model_llama
 shared.args.model = 'llama-13b-4bit'
 for _key, _value in (
     ('name1', 'You'),
     ('name2', 'Assistant'),
     ('chat_prompt_size_max', 2048),
     ('chat_prompt_size', 2048),
 ):
     shared.settings[_key] = _value
 print('Monkey Patch Completed.')