From 9b04b8eec6d864b30813d6c068d19bdf05e0fddd Mon Sep 17 00:00:00 2001
From: John Smith
Date: Wed, 22 Mar 2023 07:58:51 +0000
Subject: [PATCH] add monkey patch for webui

---
 README.md                                    | 21 +++++++++
 text-generation-webui/custom_monkey_patch.py | 55 ++++++++++++++++++++
 2 files changed, 76 insertions(+)
 create mode 100644 text-generation-webui/custom_monkey_patch.py

diff --git a/README.md b/README.md
index a231710..c926eac 100644
--- a/README.md
+++ b/README.md
@@ -45,3 +45,24 @@ After installation, this script can be used:
 ```
 python inference.py
 ```
+
+# Text Generation Webui Monkey Patch
+
+Clone the latest version of text-generation-webui and copy all of its files into ./text-generation-webui/, so they sit alongside custom_monkey_patch.py:
+```
+git clone https://github.com/oobabooga/text-generation-webui.git
+```
+
+Open server.py and insert this line at the very top, before the existing imports:
+```
+import custom_monkey_patch # apply monkey patch
+import gc
+import io
+...
+```
+
+Then, from inside ./text-generation-webui/, start the server:
+
+```
+python server.py
+```

diff --git a/text-generation-webui/custom_monkey_patch.py b/text-generation-webui/custom_monkey_patch.py
new file mode 100644
index 0000000..83bbddb
--- /dev/null
+++ b/text-generation-webui/custom_monkey_patch.py
@@ -0,0 +1,55 @@
+import sys
+
+# Put the patched local checkouts ahead of any installed versions
+sys.path.insert(0, '../repository/transformers/src')
+sys.path.insert(0, '../repository/GPTQ-for-LLaMa')
+sys.path.insert(0, '../repository/peft/src')
+
+import time
+import torch
+import autograd_4bit
+from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
+from peft import PeftModel
+from peft.tuners.lora import Linear4bitLt
+
+
+def load_model_llama(*args, **kwargs):
+    config_path = '../llama-13b-4bit/'
+    model_path = '../llama-13b-4bit.pt'
+    lora_path = '../alpaca13b_lora/'
+
+    print("Loading {} ...".format(model_path))
+    t0 = time.time()
+
+    model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path)
+
+    model = PeftModel.from_pretrained(model, lora_path, device_map={'': 0}, torch_dtype=torch.float32)
+    print('{} Lora Applied.'.format(lora_path))
+
+    # Cast the 4-bit quantization parameters (and bias, if any) to fp16
+    print('Apply auto switch and half')
+    for _, m in model.named_modules():
+        if isinstance(m, (Autograd4bitQuantLinear, Linear4bitLt)):
+            m.zeros = m.zeros.half()
+            m.scales = m.scales.half()
+            if m.bias is not None:
+                m.bias = m.bias.half()
+    autograd_4bit.use_new = True
+    autograd_4bit.auto_switch = True
+
+    print('Loaded in {:.2f} seconds.'.format(time.time() - t0))
+    return model, tokenizer
+
+
+# Monkey patch: swap in the 4-bit loader and set the chat defaults
+from modules import models
+from modules import shared
+
+models.load_model = load_model_llama
+shared.args.model = 'llama-13b-4bit'
+shared.settings['name1'] = 'You'
+shared.settings['name2'] = 'Assistant'
+shared.settings['chat_prompt_size_max'] = 2048
+shared.settings['chat_prompt_size'] = 2048
+
+print('Monkey Patch Completed.')
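
For readers new to the technique: the patch works because Python module attributes can be reassigned at runtime, and every later caller that looks the attribute up on the module sees the replacement. A minimal, self-contained sketch of the same pattern (the choice of `json.dumps` here is purely illustrative, not part of the patch):

```
import json

_original_dumps = json.dumps  # keep a reference to the original

def patched_dumps(obj, **kwargs):
    # Wrap the original, analogous to load_model_llama replacing
    # models.load_model in custom_monkey_patch.py.
    return '<patched> ' + _original_dumps(obj, **kwargs)

json.dumps = patched_dumps  # every subsequent json.dumps call sees this

print(json.dumps({'a': 1}))  # -> <patched> {"a": 1}
```

This is also why the import must be the first line of server.py: the replacement loader and the `shared.settings` overrides have to be in place before the rest of the script reads them.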
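
To confirm the patch actually took effect, a quick sanity check can be added right after the import in server.py (a hypothetical addition, not part of the patch):

```
import custom_monkey_patch  # apply monkey patch
from modules import models

# The web UI's loader should now be the 4-bit loader from the patch.
assert models.load_model is custom_monkey_patch.load_model_llama, \
    'monkey patch was not applied'
```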