From 8a62560e6cb7d91e43f1976f477d7319a6e46b1f Mon Sep 17 00:00:00 2001
From: John Smith <yfshi123@163.com>
Date: Thu, 30 Mar 2023 11:21:21 +0800
Subject: [PATCH] add offload support

---
 autograd_4bit.py | 72 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/autograd_4bit.py b/autograd_4bit.py
index d89f116..166a2a1 100644
--- a/autograd_4bit.py
+++ b/autograd_4bit.py
@@ -146,4 +146,74 @@ def load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1, half=Fa
     print(f"Loaded the model in {(time.time()-t0):.2f} seconds.")
     
     return model, tokenizer
-    
\ No newline at end of file
+    
+def load_llama_model_4bit_low_ram_and_offload_to_cpu(config_path, model_path, lora_path=None, groupsize=-1, seqlen=2048, max_memory=None):
+    """Load a 4-bit LLaMA checkpoint onto the CPU, then dispatch the model with accelerate.
+
+    config_path feeds LlamaConfig.from_pretrained and LlamaTokenizer.from_pretrained; model_path is
+    the checkpoint for accelerate.load_checkpoint_in_model; lora_path (optional) is loaded via
+    peft.PeftModel.from_pretrained; groupsize goes to make_quant_for_4bit_autograd; seqlen is set
+    as model.seqlen; max_memory (default {0: '24Gib', 'cpu': '48Gib'}) is given to
+    accelerate.infer_auto_device_map. Returns the (model, tokenizer) pair.
+    """
+    import accelerate
+    from transformers import LlamaConfig, LlamaForCausalLM, LlamaTokenizer
+
+    if max_memory is None:
+        max_memory = {0: '24Gib', 'cpu': '48Gib'}
+
+    print("Loading Model ...")
+    t0 = time.time()
+
+    with accelerate.init_empty_weights():
+        config = LlamaConfig.from_pretrained(config_path)
+        model = LlamaForCausalLM(config)
+        model = model.eval()
+        layers = find_layers(model)
+        for name in ['lm_head']:
+            if name in layers:
+                del layers[name]
+        make_quant_for_4bit_autograd(model, layers, groupsize=groupsize)
+    accelerate.load_checkpoint_in_model(model, checkpoint=model_path, device_map={'': 'cpu'})
+
+    # rotary_emb fix, part 1: keep CPU copies of the first rotary module's cos/sin caches before dispatch.
+    for n, m in model.named_modules():
+        if 'rotary_emb' in n:
+            cos_cached = m.cos_cached.clone().cpu()
+            sin_cached = m.sin_cached.clone().cpu()
+            break
+
+    if lora_path is not None:
+        from peft import PeftModel
+        from peft.tuners.lora import Linear4bitLt
+        model = PeftModel.from_pretrained(model, lora_path, device_map={'': 'cpu'}, torch_dtype=torch.float32)
+        print('{} Lora Applied.'.format(lora_path))
+
+    model.seqlen = seqlen
+    print('Apply half ...')
+    for n, m in model.named_modules():
+        if isinstance(m, Autograd4bitQuantLinear) or ((lora_path is not None) and isinstance(m, Linear4bitLt)):
+            if m.groupsize == -1:
+                m.zeros = m.zeros.half()
+            m.scales = m.scales.half()
+            m.bias = m.bias.half()
+
+    print('Dispatching model ...')
+    device_map = accelerate.infer_auto_device_map(model, max_memory=max_memory, no_split_module_classes=["LlamaDecoderLayer"])
+    model = accelerate.dispatch_model(model, device_map=device_map, offload_buffers=True, main_device=0)
+    torch.cuda.empty_cache()
+    print('Total {:.2f} Gib VRAM used.'.format(torch.cuda.memory_allocated() / 1024 ** 3))  # bytes -> GiB
+
+    # rotary_emb fix, part 2: add the saved caches to the weights map of offloading hooks that lack them.
+    for n, m in model.named_modules():
+        if 'rotary_emb' in n and getattr(m, '_hf_hook', None):
+            hooks = m._hf_hook.hooks if isinstance(m._hf_hook, accelerate.hooks.SequentialHook) else [m._hf_hook]
+            for hook in hooks:
+                if hook.offload and n + '.sin_cached' not in hook.weights_map.dataset.state_dict.keys():
+                    hook.weights_map.dataset.state_dict[n + '.sin_cached'] = sin_cached.clone().cpu()
+                    hook.weights_map.dataset.state_dict[n + '.cos_cached'] = cos_cached.clone().cpu()
+
+    tokenizer = LlamaTokenizer.from_pretrained(config_path)
+    tokenizer.truncation_side = 'left'
+    print(f"Loaded the model in {(time.time()-t0):.2f} seconds.")
+    return model, tokenizer