import time

import torch
import autograd_4bit
from autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
from peft import PeftModel
from monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_gptq_lora_model, Linear4bitLt

# Swap PEFT's LoRA model class for the GPTQ-aware variant before any model is built.
replace_peft_model_with_gptq_lora_model()

# Set to True to strip a leading token id 0 from encoded prompts (see the patch below).
patch_encode_func = False


def load_model_llama(*args, **kwargs):
    # Paths to the 4-bit base model and the LoRA adapter; adjust to your setup.
    config_path = '../llama-13b-4bit/'
    model_path = '../llama-13b-4bit.pt'
    lora_path = '../alpaca13b_lora/'

    print("Loading {} ...".format(model_path))
    t0 = time.time()

    model, tokenizer = load_llama_model_4bit_low_ram(config_path, model_path, groupsize=-1, is_v1_model=True)

    model = PeftModel.from_pretrained(model, lora_path, device_map={'': 0}, torch_dtype=torch.float32)
    print('{} Lora Applied.'.format(lora_path))

    print('Apply auto switch and half')
    # Cast the quantization parameters to fp16 and enable the automatic kernel switch.
    for n, m in model.named_modules():
        if isinstance(m, Autograd4bitQuantLinear) or isinstance(m, Linear4bitLt):
            if m.is_v1_model:
                m.zeros = m.zeros.half()
            m.scales = m.scales.half()
            m.bias = m.bias.half()
    autograd_4bit.use_new = True
    autograd_4bit.auto_switch = True

    return model, tokenizer


# Monkey Patch: route the webui's model loading through load_model_llama above
# and set sensible chat defaults.
from modules import models
from modules import shared

models.load_model = load_model_llama
shared.args.model = 'llama-13b-4bit'
shared.settings['name1'] = 'You'
shared.settings['name2'] = 'Assistant'
shared.settings['chat_prompt_size_max'] = 2048
shared.settings['chat_prompt_size'] = 2048

if patch_encode_func:
    from modules import text_generation

    text_generation.encode_old = text_generation.encode

    def encode_patched(*args, **kwargs):
        # Strip a leading token id 0 from the encoded prompt if present.
        input_ids = text_generation.encode_old(*args, **kwargs)
        if input_ids[0, 0] == 0:
            input_ids = input_ids[:, 1:]
        return input_ids

    text_generation.encode = encode_patched
    print('Encode Function Patched.')

print('Monkey Patch Completed.')
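

# A minimal usage sketch (an assumption, not part of the patch itself): this file is
# meant to live inside the text-generation-webui tree, since it imports the webui's
# `modules` package at import time, and importing it is enough to apply the patch.
# The guard below only does a direct sanity-check load when the file is run as a
# script, assuming the relative model/LoRA paths above actually exist.
if __name__ == '__main__':
    model, tokenizer = load_model_llama()
    print('Sanity check: loaded {} / {}'.format(type(model).__name__, type(tokenizer).__name__))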