From 9b04b8eec6d864b30813d6c068d19bdf05e0fddd Mon Sep 17 00:00:00 2001
From: John Smith
Date: Wed, 22 Mar 2023 07:58:51 +0000
Subject: [PATCH] add monkey patch for webui

---
 README.md                                    | 21 +++++++++
 text-generation-webui/custom_monkey_patch.py | 55 ++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 text-generation-webui/custom_monkey_patch.py

diff --git a/README.md b/README.md
index a231710..c926eac 100644
--- a/README.md
+++ b/README.md
@@ -45,3 +45,24 @@ After installation, this script can be used:
 ```
 python inference.py
 ```
+
+# Text Generation Webui Monkey Patch
+
+Clone the latest version of text-generation-webui and copy all of its files into ./text-generation-webui/, so they sit alongside custom_monkey_patch.py:
+```
+git clone https://github.com/oobabooga/text-generation-webui.git
+```
+
+Open server.py and insert this line at the very top, before the existing imports:
+```
+import custom_monkey_patch # apply monkey patch
+import gc
+import io
+...
+```
+
+Then, from inside ./text-generation-webui/, start the server:
+
+```
+python server.py
+```

diff --git a/text-generation-webui/custom_monkey_patch.py b/text-generation-webui/custom_monkey_patch.py
new file mode 100644
index 0000000..83bbddb
--- /dev/null
+++ b/text-generation-webui/custom_monkey_patch.py
@@ -0,0 +1,55 @@
+import sys
+
+# Put the patched local checkouts ahead of any installed versions
+sys.path.insert(0, '../repository/transformers/src')
+sys.path.insert(0, '../repository/GPTQ-for-LLaMa')
+sys.path.insert(0, '../repository/peft/src')
+
+import time
+import torch
+import autograd_4bit
+from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
+from peft import PeftModel
+from peft.tuners.lora import Linear4bitLt
+
+
+def load_model_llama(*args, **kwargs):
+    config_path = '../llama-13b-4bit/'
+    model_path = '../llama-13b-4bit.pt'
+    lora_path = '../alpaca13b_lora/'
+
+    print("Loading {} ...".format(model_path))
+    t0 = time.time()
+
+    model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path)
+
+    model = PeftModel.from_pretrained(model, lora_path, device_map={'': 0}, torch_dtype=torch.float32)
+    print('{} Lora Applied.'.format(lora_path))
+
+    # Cast the 4-bit quantization parameters (and bias, if any) to fp16
+    print('Apply auto switch and half')
+    for _, m in model.named_modules():
+        if isinstance(m, (Autograd4bitQuantLinear, Linear4bitLt)):
+            m.zeros = m.zeros.half()
+            m.scales = m.scales.half()
+            if m.bias is not None:
+                m.bias = m.bias.half()
+    autograd_4bit.use_new = True
+    autograd_4bit.auto_switch = True
+
+    print('Loaded in {:.2f} seconds.'.format(time.time() - t0))
+    return model, tokenizer
+
+
+# Monkey patch: swap in the 4-bit loader and set the chat defaults
+from modules import models
+from modules import shared
+
+models.load_model = load_model_llama
+shared.args.model = 'llama-13b-4bit'
+shared.settings['name1'] = 'You'
+shared.settings['name2'] = 'Assistant'
+shared.settings['chat_prompt_size_max'] = 2048
+shared.settings['chat_prompt_size'] = 2048
+
+print('Monkey Patch Completed.')
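
For readers new to the technique: the patch works because Python module attributes can be reassigned at runtime, and every later caller that looks the attribute up on the module sees the replacement. A minimal, self-contained sketch of the same pattern (the choice of `json.dumps` here is purely illustrative, not part of the patch):

```
import json

_original_dumps = json.dumps  # keep a reference to the original

def patched_dumps(obj, **kwargs):
    # Wrap the original, analogous to load_model_llama replacing
    # models.load_model in custom_monkey_patch.py.
    return '<patched> ' + _original_dumps(obj, **kwargs)

json.dumps = patched_dumps  # every subsequent json.dumps call sees this

print(json.dumps({'a': 1}))  # -> <patched> {"a": 1}
```

This is also why the import must be the first line of server.py: the replacement loader and the `shared.settings` overrides have to be in place before the rest of the script reads them.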
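
To confirm the patch actually took effect, a quick sanity check can be added right after the import in server.py (a hypothetical addition, not part of the patch):

```
import custom_monkey_patch  # apply monkey patch
from modules import models

# The web UI's loader should now be the 4-bit loader from the patch.
assert models.load_model is custom_monkey_patch.load_model_llama, \
    'monkey patch was not applied'
```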