add server

John Smith 2023-04-26 12:50:36 +08:00
parent 633c28fd25
commit 1abdc99675
6 changed files with 542 additions and 0 deletions


@ -5,6 +5,7 @@ sentencepiece
safetensors
einops
colorama
pyzmq
git+https://github.com/huggingface/peft.git@70af02a2bca5a63921790036b2c9430edf4037e2
git+https://github.com/huggingface/transformers.git
git+https://github.com/sterlind/GPTQ-for-LLaMa.git@lora_4bit
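
The only new dependency is pyzmq, which provides the ZeroMQ sockets the server below is built on: a REQ/REP pair for commands and a PUB/SUB pair that streams generated text back under a topic prefix. As a rough, self-contained illustration of that pattern (not part of this commit; the ports and payloads here are arbitrary):

import time
import zmq

context = zmq.Context()

# Command channel: one side binds a REP socket, the other connects a REQ socket.
rep = context.socket(zmq.REP)
rep.bind("tcp://*:6000")
req = context.socket(zmq.REQ)
req.connect("tcp://localhost:6000")

req.send(b"ping")
print(rep.recv())   # b'ping'
rep.send(b"pong")
print(req.recv())   # b'pong'

# Streaming channel: PUB broadcasts frames that start with a topic, SUB filters on that prefix.
pub = context.socket(zmq.PUB)
pub.bind("tcp://*:6001")
sub = context.socket(zmq.SUB)
sub.setsockopt(zmq.SUBSCRIBE, b"10001")
sub.connect("tcp://localhost:6001")

time.sleep(0.2)                      # give the subscription time to propagate
pub.send(b"10001" + b"hello")
print(sub.recv()[len(b"10001"):])    # b'hello'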

scripts/run_server.py (new file, 26 lines)

@ -0,0 +1,26 @@
from server import ModelServer
import argparse

if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--config_path', type=str, required=True)
    arg_parser.add_argument('--model_path', type=str, required=True)
    arg_parser.add_argument('--lora_path', type=str, default=None)
    arg_parser.add_argument('--groupsize', type=int, default=-1)
    arg_parser.add_argument('--v1', action='store_true')
    arg_parser.add_argument('--quant_attn', action='store_true')
    arg_parser.add_argument('--port', type=int, default=5555)
    arg_parser.add_argument('--pub_port', type=int, default=5556)
    args = arg_parser.parse_args()

    server = ModelServer(
        config_path=args.config_path,
        model_path=args.model_path,
        lora_path=args.lora_path,
        groupsize=args.groupsize,
        is_v1_model=args.v1,
        quant_attn=args.quant_attn,
        port=args.port,
        pub_port=args.pub_port)
    server.run()
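
The server is launched from the command line with the 4-bit config directory and weight file. A typical invocation might look like the line below; the paths are placeholders, --groupsize only applies to group-quantized checkpoints, and --v1 / --quant_attn toggle the v1 weight format and the fused attention path respectively:

python scripts/run_server.py --config_path ./llama-13b-4bit/ --model_path ./llama-13b-4bit-128g.safetensors --groupsize 128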

server/__init__.py (new file, 1 line)

@ -0,0 +1 @@
from .server import ModelClient, ModelServer

server/server.py (new file, 264 lines)

@ -0,0 +1,264 @@
from .. import autograd_4bit
import time
import torch
from ..autograd_4bit import load_llama_model_4bit_low_ram, Autograd4bitQuantLinear
from alpaca_lora_4bit.model_attn_mlp_patch import make_quant_attn, make_fused_mlp, inject_lora_layers
import zmq
from transformers import StoppingCriteria, StoppingCriteriaList
from io import BytesIO
import gc
import threading


def decode(output_ids, tokenizer, skip_special_tokens=True):
    if skip_special_tokens:
        reply = tokenizer.decode(output_ids, skip_special_tokens=True)
        reply = reply.replace(r'<|endoftext|>', '')
        return reply
    else:
        return tokenizer.decode(output_ids, skip_special_tokens=False)


def clear_torch_cache():
    gc.collect()
    torch.cuda.empty_cache()


# Copy from text-generation-webui/modules/callbacks.py
class Stream(StoppingCriteria):
    def __init__(self, callback_func=None):
        self.callback_func = callback_func

    def __call__(self, input_ids, scores) -> bool:
        if self.callback_func is not None:
            self.callback_func(input_ids[0])
        return False
class ModelServer:

    def __init__(self, config_path, model_path, lora_path=None, groupsize=128, is_v1_model=False, quant_attn=False, port=5555, pub_port=5556):
        self.config_path = config_path
        self.model_path = model_path
        self.lora_path = lora_path
        self.groupsize = groupsize
        self.is_v1_model = is_v1_model
        self.quant_attn = quant_attn
        self.port = port
        self.model = None
        self.tokenizer = None
        self.is_generating = False
        self.socket = None
        self.socket_pub = None
        self.pub_port = pub_port
        self.topic = b'10001'

    def load_model(self):
        print("Loading {} ...".format(self.model_path))
        t0 = time.time()
        model, tokenizer = load_llama_model_4bit_low_ram(self.config_path, self.model_path, groupsize=self.groupsize, is_v1_model=self.is_v1_model)

        if not self.quant_attn and self.lora_path is not None:
            from peft import PeftModel
            from ..monkeypatch.peft_tuners_lora_monkey_patch import replace_peft_model_with_int4_lora_model
            replace_peft_model_with_int4_lora_model()
            model = PeftModel.from_pretrained(model, self.lora_path, device_map={'': 0}, torch_dtype=torch.float16)
            print('{} Lora Applied.'.format(self.lora_path))

        print('Apply half ...')
        model.half()
        for n, m in model.named_modules():
            if isinstance(m, Autograd4bitQuantLinear):
                if m.is_v1_model:
                    m.zeros = m.zeros.half()
                m.scales = m.scales.half()
                m.bias = m.bias.half()
        torch.cuda.empty_cache()
        print('Total {:.2f} MiB VRAM used.'.format(torch.cuda.memory_allocated() / 1024 / 1024))

        if not self.quant_attn and self.lora_path is not None:
            from ..amp_wrapper import AMPWrapper
            wrapper = AMPWrapper(model)
            wrapper.apply_generate()
            print('AMP applied.')

        if self.quant_attn:
            make_quant_attn(model, is_v1_model=self.is_v1_model)
            make_fused_mlp(model, is_v1_model=self.is_v1_model)
            print('Quantized attention applied.')
            if self.lora_path is not None:
                inject_lora_layers(model, self.lora_path, device='cuda', torch_dtype=torch.float16)

        self.model, self.tokenizer = model, tokenizer
        print("Loaded in {:.2f} seconds.".format(time.time() - t0))
    def wrap_result(self, result):
        with BytesIO() as bio:
            torch.save(result, bio)
            return bio.getvalue()

    def unwrap_result(self, result):
        with BytesIO(result) as bio:
            return torch.load(bio, map_location='cuda')

    def send_generate_end_flag(self):
        data = {
            'type': 'generate_end'
        }
        self.socket_pub.send(self.topic + self.wrap_result(data))

    def generate_thread(self, *args, **kwargs):
        clear_torch_cache()
        self.is_generating = True
        try:
            self.model.generate(*args, **kwargs)
        except ValueError:
            pass
        finally:
            self.is_generating = False
            self.send_generate_end_flag()
            clear_torch_cache()

    def stop_generate(self):
        self.is_generating = False
    def run(self):
        self.load_model()

        context = zmq.Context()
        socket = context.socket(zmq.REP)
        socket.bind("tcp://*:{}".format(self.port))
        self.socket = socket

        context_pub = zmq.Context()
        socket_pub = context_pub.socket(zmq.PUB)
        socket_pub.bind("tcp://*:{}".format(self.pub_port))
        self.socket_pub = socket_pub

        print('Server started at port {} and {}.'.format(self.port, self.pub_port))

        '''
        Message Format:
        {'function': 'generate',
         'args': ...,
         'kwargs': ...}
        '''
        while True:
            try:
                # Wait for next request from client
                message = socket.recv()
                message = self.unwrap_result(message)
                function = message['function']
                if function == 'generate':
                    if not self.is_generating:
                        self.is_generating = True
                        args = message['args']
                        kwargs = message['kwargs']
                        input_ids = kwargs['inputs']

                        def func(x):
                            if not self.is_generating:
                                raise ValueError
                            new_tokens = len(x) - len(input_ids[0])
                            result = decode(x[-new_tokens:], self.tokenizer, True)
                            data = {
                                'type': 'generate',
                                'data': result
                            }
                            socket_pub.send(self.topic + self.wrap_result(data))

                        kwargs['stopping_criteria'] = StoppingCriteriaList([Stream(callback_func=func)])
                        t = threading.Thread(target=self.generate_thread, args=args, kwargs=kwargs, daemon=True)
                        t.start()
                        socket.send(self.wrap_result({'type': 'generate_rsp', 'data': 'ok'}))
                    else:
                        print('Already generating.')
                        socket.send(self.wrap_result({'type': 'generate_rsp', 'data': 'already generating'}))
                elif function == 'stop_generate':
                    self.stop_generate()
                    socket.send(self.wrap_result({'type': 'stop_generate_rsp', 'data': 'ok'}))
                elif function == 'test':
                    print('test ok.')
                    self.socket.send(self.wrap_result(
                        {
                            'type': 'test',
                            'data': 'test ok.'
                        }
                    ))
                elif function == 'exit':
                    socket.send(self.wrap_result({'type': 'exit_rsp', 'data': 'ok'}))
                    break
                else:
                    socket.send(self.wrap_result({'type': 'rsp', 'data': 'no function'}))
                    raise ValueError('Unknown function {}'.format(function))
            except Exception as e:
                print(str(e))
                raise

        print('Server stopped.')
class ModelClient:

    def __init__(self, port=5555, port_sub=5556):
        self.port = port
        self.context = zmq.Context()
        self.socket = self.context.socket(zmq.REQ)
        self.socket.connect("tcp://localhost:{}".format(self.port))
        self.socket_sub = self.context.socket(zmq.SUB)
        self.topic = b'10001'
        self.socket_sub.setsockopt(zmq.SUBSCRIBE, self.topic)
        self.socket_sub.connect("tcp://localhost:{}".format(port_sub))
        self.callback_func = None

    def wrap_result(self, result):
        with BytesIO() as bio:
            torch.save(result, bio)
            return bio.getvalue()

    def unwrap_result(self, result):
        with BytesIO(result) as bio:
            return torch.load(bio, map_location='cuda')

    def recieve_thread(self):
        while True:
            message = self.socket_sub.recv()
            message = message[len(self.topic):]
            message = self.unwrap_result(message)
            if message['type'] == 'generate':
                if self.callback_func is not None:
                    self.callback_func(message['data'], is_end=False)
            elif message['type'] == 'generate_end':
                if self.callback_func is not None:
                    self.callback_func(None, is_end=True)
                break
            else:
                print(message)
                break
        print('receive completed.')

    def start_recieving(self):
        t = threading.Thread(target=self.recieve_thread, daemon=True)
        t.start()

    def generate(self, *args, **kwargs):
        data = {
            'function': 'generate',
            'args': args,
            'kwargs': kwargs
        }
        self.socket.send(self.wrap_result(data))
        result = self.socket.recv()
        return result

    def stop(self):
        data = {
            'function': 'stop_generate'
        }
        self.socket.send(self.wrap_result(data))
        result = self.socket.recv()
        return result

    def test(self):
        data = {
            'function': 'test'
        }
        self.socket.send(self.wrap_result(data))
        result = self.socket.recv()
        return result
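
ModelServer and ModelClient speak a small torch-serialized protocol: each request is a dict like {'function': 'generate', 'args': ..., 'kwargs': ...} sent over the REQ/REP socket, and streamed text comes back on the PUB/SUB socket prefixed with the b'10001' topic, with every 'generate' message carrying the full decoded continuation so far. A minimal client-side sketch (the tokenizer path, prompt, and sampling settings are illustrative, not part of this commit):

import threading
from transformers import LlamaTokenizer
from server import ModelClient

# Placeholder path; use the same config directory the server was started with.
tokenizer = LlamaTokenizer.from_pretrained('../llama-13b-4bit/')
client = ModelClient(port=5555, port_sub=5556)

latest = {'text': ''}
done = threading.Event()

def on_update(text, is_end=False):
    # 'generate' messages carry the whole reply so far; 'generate_end' signals completion.
    if is_end:
        done.set()
    else:
        latest['text'] = text

client.callback_func = on_update

input_ids = tokenizer('Tell me about alpacas.', return_tensors='pt').input_ids
# kwargs are forwarded to model.generate() on the server; 'inputs' must be present
# because the server reads kwargs['inputs'] to measure the prompt length.
client.generate(inputs=input_ids, do_sample=True, max_new_tokens=128, temperature=0.7)
client.start_recieving()

done.wait()
print(latest['text'])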


@ -0,0 +1,37 @@
from server import ModelClient
from transformers import LlamaTokenizer


def load_model_llama(*args, **kwargs):
    config_path = '../llama-13b-4bit/'
    tokenizer = LlamaTokenizer.from_pretrained(config_path)
    tokenizer.truncation_side = 'left'
    model = ModelClient(port=5555, port_sub=5556)
    return model, tokenizer


patch_encode_func = True

# Monkey Patch
from modules import models
from modules import shared
models.load_model = load_model_llama
shared.args.model = 'llama-13b-4bit'
shared.settings['name1'] = 'You'
shared.settings['name2'] = 'Assistant'
shared.settings['chat_prompt_size_max'] = 2048
shared.settings['chat_prompt_size'] = 2048

if patch_encode_func:
    from modules import text_generation
    text_generation.encode_old = text_generation.encode

    def encode_patched(*args, **kwargs):
        input_ids = text_generation.encode_old(*args, **kwargs)
        # Drop a leading token id 0 if the tokenizer prepended one.
        if input_ids[0, 0] == 0:
            input_ids = input_ids[:, 1:]
        return input_ids

    text_generation.encode = encode_patched
    print('Encode Function Patched.')

print('Monkey Patch Completed.')

# Apply Generate Monkey Patch
import generate_monkey_patch


@ -0,0 +1,213 @@
import modules.text_generation
from modules.text_generation import *
from modules.callbacks import _SentinelTokenStoppingCriteria


def generate_reply_patched(question, state, eos_token=None, stopping_strings=[]):
    if shared.model_name == 'None' or shared.model is None:
        print("No model is loaded! Select one in the Model tab.")
        yield formatted_outputs(question, shared.model_name)
        return

    clear_torch_cache()
    seed = set_manual_seed(state['seed'])
    shared.stop_everything = False
    generate_params = get_generate_params(state)
    t0 = time.time()

    # Preparing the input
    original_question = question
    if not shared.is_chat():
        question = apply_extensions('input', question)

    # If the model is not on transformers, handle it separately and end this
    # function call earlier.
    if shared.model_type in ['rwkv', 'llamacpp']:
        if shared.args.verbose:
            print(f'\n\n{question}\n--------------------\n')

        try:
            if shared.args.no_stream:
                reply = shared.model.generate(context=question, **generate_params)
                output = original_question + reply
                if not shared.is_chat():
                    reply = original_question + apply_extensions('output', reply)
                yield formatted_outputs(reply, shared.model_name)
            else:
                if not shared.is_chat():
                    yield formatted_outputs(question, shared.model_name)

                for reply in shared.model.generate_with_streaming(context=question, **generate_params):
                    output = original_question + reply
                    if not shared.is_chat():
                        reply = original_question + apply_extensions('output', reply)
                    yield formatted_outputs(reply, shared.model_name)
        except Exception:
            traceback.print_exc()
        finally:
            t1 = time.time()
            original_tokens = len(encode(original_question)[0])
            new_tokens = len(encode(output)[0]) - original_tokens
            print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
            return
    # Encode the input
    input_ids = encode(question, add_bos_token=state['add_bos_token'], truncation_length=get_max_prompt_length(state))
    output = input_ids[0]
    cuda = not any((shared.args.cpu, shared.args.deepspeed, shared.args.flexgen))

    if shared.args.verbose:
        print(f'\n\n{decode(input_ids[0], state["skip_special_tokens"])}\n--------------------\n')

    # Find the eos tokens
    eos_token_ids = [shared.tokenizer.eos_token_id] if shared.tokenizer.eos_token_id is not None else []
    if eos_token is not None:
        eos_token_ids.append(int(encode(eos_token)[0][-1]))

    # Create the StoppingCriteriaList with the stopping strings
    stopping_criteria_list = transformers.StoppingCriteriaList()
    for st in (stopping_strings, ast.literal_eval(f"[{state['custom_stopping_strings']}]")):
        if type(st) is list and len(st) > 0:
            sentinel_token_ids = [encode(string, add_special_tokens=False) for string in st]
            stopping_criteria_list.append(_SentinelTokenStoppingCriteria(sentinel_token_ids=sentinel_token_ids, starting_idx=len(input_ids[0])))
            break

    # Update generate_params with the eos token and the stopping strings
    if shared.args.flexgen:
        generate_params['stop'] = eos_token_ids[-1]
    else:
        generate_params['eos_token_id'] = eos_token_ids
        generate_params['stopping_criteria'] = stopping_criteria_list

    # Add the encoded tokens to generate_params
    if shared.soft_prompt:
        inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
        question, filler_input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, filler_input_ids, inputs_embeds)
        original_input_ids = input_ids
        generate_params.update({'inputs_embeds': inputs_embeds})
        generate_params.update({'inputs': filler_input_ids})
    else:
        question, input_ids, inputs_embeds = apply_extensions('tokenizer', state, question, input_ids, None)
        original_input_ids = input_ids
        generate_params.update({'inputs': input_ids})
        if inputs_embeds is not None:
            generate_params.update({'inputs_embeds': inputs_embeds})
    # Initialized here so the final stats line works even when the streaming branch is not taken.
    token_count = 0
    try:
        # Generate the entire reply at once.
        if shared.args.no_stream:
            with torch.no_grad():
                output = shared.model.generate(**generate_params)[0]
                if cuda:
                    output = output.cuda()

            if shared.soft_prompt:
                output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))

            new_tokens = len(output) - len(input_ids[0])
            reply = decode(output[-new_tokens:], state['skip_special_tokens'])
            if not shared.is_chat():
                reply = original_question + apply_extensions('output', reply)

            yield formatted_outputs(reply, shared.model_name)

        # Stream the reply 1 token at a time.
        # This is based on the trick of using 'stopping_criteria' to create an iterator.
        elif not shared.args.flexgen:
            # def generate_with_callback(callback=None, **kwargs):
            #     kwargs['stopping_criteria'].append(Stream(callback_func=callback))
            #     clear_torch_cache()
            #     with torch.no_grad():
            #         shared.model.generate(**kwargs)

            # def generate_with_streaming(**kwargs):
            #     return Iteratorize(generate_with_callback, kwargs, callback=None)

            # if not shared.is_chat():
            #     yield formatted_outputs(original_question, shared.model_name)

            # with generate_with_streaming(**generate_params) as generator:
            #     for output in generator:
            #         if shared.soft_prompt:
            #             output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))
            #         new_tokens = len(output) - len(input_ids[0])
            #         reply = decode(output[-new_tokens:], state['skip_special_tokens'])
            #         if not shared.is_chat():
            #             reply = original_question + apply_extensions('output', reply)
            #         if output[-1] in eos_token_ids:
            #             break
            #         yield formatted_outputs(reply, shared.model_name)

            from queue import Queue
            queue = Queue()

            def callback_func(x, is_end=False):
                if not is_end:
                    queue.put(x)
                else:
                    queue.put(None)

            # remove stopping_criteria; the remote server installs its own streaming criterion
            generate_params.pop('stopping_criteria')

            shared.model.callback_func = callback_func
            shared.model.generate(**generate_params)
            shared.model.start_recieving()

            token_count = 0
            while True:
                reply = queue.get()
                if reply is None:
                    break
                token_count += 1
                yield formatted_outputs(reply, shared.model_name)
        # Stream the output naively for FlexGen since it doesn't support 'stopping_criteria'
        else:
            for i in range(state['max_new_tokens'] // 8 + 1):
                clear_torch_cache()
                with torch.no_grad():
                    output = shared.model.generate(**generate_params)[0]

                if shared.soft_prompt:
                    output = torch.cat((input_ids[0], output[filler_input_ids.shape[1]:]))

                new_tokens = len(output) - len(original_input_ids[0])
                reply = decode(output[-new_tokens:], state['skip_special_tokens'])
                if not shared.is_chat():
                    reply = original_question + apply_extensions('output', reply)

                if np.count_nonzero(np.isin(input_ids[0], eos_token_ids)) < np.count_nonzero(np.isin(output, eos_token_ids)):
                    break

                yield formatted_outputs(reply, shared.model_name)

                input_ids = np.reshape(output, (1, output.shape[0]))
                if shared.soft_prompt:
                    inputs_embeds, filler_input_ids = generate_softprompt_input_tensors(input_ids)
                    generate_params.update({'inputs_embeds': inputs_embeds})
                    generate_params.update({'inputs': filler_input_ids})
                else:
                    generate_params.update({'inputs': input_ids})

            yield formatted_outputs(reply, shared.model_name)

    except Exception:
        traceback.print_exc()
    finally:
        t1 = time.time()
        try:
            shared.model.stop()
        except Exception:
            pass

        original_tokens = len(original_input_ids[0])
        new_tokens = token_count
        print(f'Output generated in {(t1-t0):.2f} seconds ({new_tokens/(t1-t0):.2f} tokens/s, {new_tokens} tokens, context {original_tokens}, seed {seed})')
        return
modules.text_generation.generate_reply_old = modules.text_generation.generate_reply
modules.text_generation.generate_reply = generate_reply_patched
print('Generate Patch Applied')