Upload 2 files
app.py
CHANGED
@@ -16,31 +16,31 @@ with gr.Blocks() as demo:
 
         with gr.Column(scale=1):
             max_new_tokens = gr.Textbox(
-                value=fn.…
+                value=fn.default_args['max_new_tokens'],
                 label='max_new_tokens',
                 interactive=True,
                 show_copy_button=True,
             )
             temperature = gr.Textbox(
-                value=fn.…
+                value=fn.default_args['temperature'],
                 label='temperature',
                 interactive=True,
                 show_copy_button=True,
             )
             top_p = gr.Textbox(
-                value=fn.…
+                value=fn.default_args['top_p'],
                 label='top_p',
                 interactive=True,
                 show_copy_button=True,
             )
             top_k = gr.Textbox(
-                value=fn.…
+                value=fn.default_args['top_k'],
                 label='top_k',
                 interactive=True,
                 show_copy_button=True,
             )
             repetition_penalty = gr.Textbox(
-                value=fn.…
+                value=fn.default_args['repetition_penalty'],
                 label='repetition_penalty',
                 interactive=True,
                 show_copy_button=True,
@@ -48,17 +48,18 @@ with gr.Blocks() as demo:
 
     with gr.Row():
         with gr.Column(scale=1):
-            …
+            first_assistant = gr.Textbox(
                 value='',
-                …
-                label='inst_template',
+                label='first_assistant',
                 interactive=True,
                 show_copy_button=True,
             )
-            …
-                value=…
-                …
+            chat_template = gr.Textbox(
+                value='',
+                lines=10,
+                label='chat_template',
                 interactive=True,
+                show_copy_button=True,
             )
 
     set_button = gr.Button(value='Save')
@@ -97,7 +98,7 @@ with gr.Blocks() as demo:
 
     set_button.click(
         fn=fn.set_config,
-        inputs=[size, instruction, …
+        inputs=[size, instruction, first_assistant, chat_template, max_new_tokens, temperature, top_p, top_k, repetition_penalty],
         outputs=[info],
     )
 
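The Save wiring above passes each gr.Textbox straight into fn.set_config. Textbox values reach the click handler as strings, which is why set_config on the fn.py side casts them with int()/float(). Below is a minimal, self-contained sketch of the same pattern; demo_config and save are illustrative stand-ins, not part of this Space.

import gradio as gr

# Illustrative stand-ins for fn.default_args / fn.set_config, not the Space's code.
demo_config = {'max_new_tokens': 1024, 'temperature': 0.9}

def save(max_new_tokens, temperature):
    # Textbox values arrive as strings, so cast before storing.
    demo_config['max_new_tokens'] = int(max_new_tokens)
    demo_config['temperature'] = float(temperature)
    return 'done.'

with gr.Blocks() as demo:
    max_new_tokens = gr.Textbox(value=str(demo_config['max_new_tokens']),
                                label='max_new_tokens', interactive=True)
    temperature = gr.Textbox(value=str(demo_config['temperature']),
                             label='temperature', interactive=True)
    info = gr.Textbox(label='info')
    set_button = gr.Button(value='Save')
    set_button.click(fn=save, inputs=[max_new_tokens, temperature], outputs=[info])

if __name__ == '__main__':
    demo.launch()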
fn.py
CHANGED
@@ -14,20 +14,22 @@ from threading import Thread
 
 tokenizer = None
 model = None
-…
+cfg = {
     'size': None,
+}
+default_args = {
     'instruction': None,
-    '…
-    '…
+    'first_assistant': None,
+    'chat_template': None,
     'max_new_tokens': 1024,
     'temperature': 0.9,
     'top_p': 0.95,
     'top_k': 40,
     'repetition_penalty': 1.2,
 }
-…
-…
-…
+
+chat_past_key_values = {}
+chat_messages = {}
 
 def load_model(size = '9b'):
     global tokenizer, model, cfg
@@ -50,17 +52,13 @@ def load_model(size = '9b'):
 
     cfg['size'] = size
 
-def …
-    global …
-    cfg = default_cfg.copy()
-…
-def set_config(size, instruction, inst_template, is_use_cache, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
-    global cfg
+def set_config(size, instruction, first_assistant, chat_template, max_new_tokens, temperature, top_p, top_k, repetition_penalty):
+    global default_args
     load_model(size)
-    …
+    default_args.update({
         'instruction': instruction,
-        '…
-        '…
+        'first_assistant': first_assistant,
+        'chat_template': chat_template,
         'max_new_tokens': int(max_new_tokens),
         'temperature': float(temperature),
         'top_p': float(top_p),
@@ -70,24 +68,16 @@ def set_config(size, instruction, inst_template, is_use_cache, max_new_tokens, t
     return 'done.'
 
 def set_config_args(args):
-    global …
+    global default_args
 
     load_model(args['size'])
-    …
+    default_args.update(args)
 
     return 'done.'
 
-def chatinterface_to_messages(…
-    global cfg
-…
+def chatinterface_to_messages(history):
     messages = []
 
-    if cfg['instruction']:
-        messages.append({'role': 'user', 'content': cfg['instruction']})
-        # user and assistant turns must alternate
-        if message:
-            messages.append({'role': 'assistant', 'content': '了解しました。'})
-
     for pair in history:
         [user, assistant] = pair
         if user:
@@ -95,41 +85,76 @@ def chatinterface_to_messages(message, history):
         if assistant:
             messages.append({'role': 'assistant', 'content': assistant})
 
-    if message:
-        messages.append({'role': 'user', 'content': message})
-
     return messages
 
-…
-…
-…
-…
-…
-…
-…
-…
+# fairly tricky
+def tokenize(user_input, history = [], instruction = None, conversation_id = 'gradio', args = {}):
+    global tokenizer, chat_messages
+
+    # build the messages inserted at the head of the conversation
+    inst_messages = []
+    if instruction:
+        if 'first_assistant' in args and args['first_assistant']:
+            # Claude-compatible format
+            # user and assistant turns must alternate
+            inst_messages = [
+                {'role': 'user', 'content': instruction},
+                {'role': 'assistant', 'content': args['first_assistant']},
+            ]
+        else:
+            # OpenAI-compatible format
+            inst_messages = [{'role': 'system', 'content': instruction}]
+
+    # when messages are passed in, overwrite the whole history
+    if conversation_id and 'messages' in args:
+        chat_messages[conversation_id] = inst_messages + args['messages']
+
+    # if a cache exists, send the conversation in messages format
+    # the instruction is already cached, so it is not needed here (it cannot be changed mid-conversation)
+    if conversation_id and conversation_id in chat_messages and chat_messages[conversation_id]:
+        # append user_input
+        chat_messages[conversation_id] += [{'role': 'user', 'content': user_input}]
         tokenized_chat = tokenizer.apply_chat_template(
-            …
+            chat_messages[conversation_id], tokenize=True, add_generation_prompt=True, return_tensors="pt"
         )
+    else:
+        # apply the instruction if present (the input is optional)
+        if instruction:
+            user_input = instruction.format(input=user_input)
+        # having neither is an error
+        if not user_input:
+            raise ValueError('require input or instruction.')
+        tokenized_chat = tokenizer(user_input, return_tensors="pt").input_ids
+
     return tokenized_chat
 
-def chat(message, history = [], instruction = None, args = {}):
-    global tokenizer, model, …
+def chat(message, history = [], instruction = None, conversation_id = 'gradio', args = {}):
+    global tokenizer, model, chat_past_key_values, chat_messages
 
-…
-…
-…
-…
-…
-…
+    for k, v in default_args.items():
+        args.setdefault(k, v)
+
+    cache = None
+    # read the cache when a conversation_id is given
+    if conversation_id and conversation_id in chat_messages and chat_messages[conversation_id]:
+        # if clear is requested, wipe it first
+        if 'clear' in args and args['clear']:
+            chat_past_key_values[conversation_id] = None
+            chat_messages[conversation_id] = None
+        else:
+            cache = chat_past_key_values[conversation_id]
+
+    # apply chat_template if present
+    if args['chat_template']:
+        tokenizer.chat_template = args['chat_template']
+
+    # tokenize
+    tokenized_chat = tokenize(message, history, instruction, conversation_id, args).to(device)
 
     device = local_gemma.utils.config.infer_device(None)
-    is_use_cache = cfg['is_use_cache']
     generation_kwargs = local_gemma.utils.config.get_generation_kwargs('chat')
 
     streamer = TextStreamer(tokenizer, skip_prompt=True, **{"skip_special_tokens": True})
-    tokenized_chat = tokenized_chat.to(device)
     generation_kwargs.update(
         {
             "streamer": streamer,
@@ -146,16 +171,16 @@ def chat(message, history = [], instruction = None, args = {}):
         'top_k',
         'repetition_penalty'
     ]:
-        if …
-            generation_kwargs[k] = …
+        if args[k]:
+            generation_kwargs[k] = args[k]
 
     # TODO(joao): this if shouldn't be needed, fix in transformers
     if cache is not None:
         generation_kwargs["cache_implementation"] = None
 
-    if …
+    if args['max_new_tokens'] is not None:
         input_ids_len = tokenized_chat.shape[-1]
-        max_cache_len = …
+        max_cache_len = args['max_new_tokens'] + input_ids_len
         if cache is not None and cache.max_cache_len < max_cache_len:
             # reset the cache
             generation_kwargs.pop("past_key_values")
@@ -169,34 +194,27 @@ def chat(message, history = [], instruction = None, args = {}):
     cache = gen_out.past_key_values
     model_tokens = gen_out.sequences[0, tokenized_chat.shape[1]:]
     model_output_text = tokenizer.decode(model_tokens, skip_special_tokens=True)
-    …
-    …
+    chat_messages += [{"role": "user", "content": message},]
+    chat_messages += [{"role": "assistant", "content": model_output_text},]
 
     # Sanity check: EOS was removed, ends in "<end_of_turn>\n"
     tokenized_chat = tokenizer.apply_chat_template(
-        …
+        chat_messages, tokenize=True, add_generation_prompt=False, return_tensors="pt"
     ).tolist()[0]
     assert tokenized_chat[0] == 2
    assert tokenized_chat[-1] == 108
     assert tokenized_chat[-2] == 107
 
-    …
-    cache = None
-    chat_history = []
-
+    # TODO: support streaming
     return model_output_text
 
-…
-…
-
-def numel(message, history = [], instruction = None, args = {}):
-    global tokenizer, …
+# return the answer without streaming
+def infer(message, history = [], instruction = None, conversation_id = 'gradio', args = {}):
+    return chat(message, history, instruction, conversation_id, args)
+
+def numel(message, history = [], instruction = None, conversation_id = 'gradio', args = {}):
+    global tokenizer, chat_messages
 
-    …
-    cfg['instruction'] = instruction
-    tokenized_chat = apply_template(message)
-    else:
-        messages = chatinterface_to_messages(message, history)
-        tokenized_chat = apply_template(messages)
+    tokenized_chat = tokenize(message, history, instruction, conversation_id, args).to(device)
 
     return torch.numel(tokenized_chat)
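The reworked tokenize helper builds the head of the conversation in one of two layouts: when first_assistant is set, the instruction goes in as a user turn followed by a fixed assistant reply so that user and assistant turns keep alternating (the Claude-compatible form in the diff's comments); otherwise it falls back to a single system message (the OpenAI-compatible form). A small stand-alone sketch of just that branching, independent of the tokenizer and model; build_inst_messages is an illustrative name, not part of fn.py.

def build_inst_messages(instruction, first_assistant=None):
    # Mirrors the branching in fn.tokenize (illustrative helper, not in the repo).
    if first_assistant:
        # Claude-compatible: the instruction becomes a user turn, answered by a
        # fixed assistant turn, so user/assistant keep alternating.
        return [
            {'role': 'user', 'content': instruction},
            {'role': 'assistant', 'content': first_assistant},
        ]
    # OpenAI-compatible: a single system message.
    return [{'role': 'system', 'content': instruction}]

print(build_inst_messages('Answer briefly in English.', 'Understood.'))
print(build_inst_messages('Answer briefly in English.'))

Keeping the alternation matters here because, as the diff's own comment notes, user and assistant turns must alternate before the messages are handed to tokenizer.apply_chat_template.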