minyichen committed on
Commit
263d7ce
1 Parent(s): 609a0eb

Upload tokenizer_config.json

Browse files

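The current `chat_template` appends an extra ChatML-style EOS token (`<|im_end|>`) after the final `<|eot_id|>` when `add_generation_prompt=False`, because its `{% else %}` branch emits `eos_token`.
Please replace it with the corrected `chat_template`, which drops that branch, and set `eos_token` to `<|eot_id|>` to fix this behavior.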
```
from transformers import AutoTokenizer
message = [{"role": "user" , "content": 'How are you?'}]
tame_tokenizer = AutoTokenizer.from_pretrained("yentinglin/Llama-3-Taiwan-8B-Instruct")
tame_tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
```
The output ends with an extra `<|im_end|>` token after `<|eot_id|>`:
```
<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow are you?<|eot_id|><|im_end|>
```

Files changed (1) hide show
  1. tokenizer_config.json +2 -3
tokenizer_config.json CHANGED
@@ -2066,14 +2066,13 @@
2066
  }
2067
  },
2068
  "bos_token": "<|begin_of_text|>",
2069
- "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% else %}{{ eos_token }}{% endif %}",
2070
  "clean_up_tokenization_spaces": true,
2071
- "eos_token": "<|im_end|>",
2072
  "model_input_names": [
2073
  "input_ids",
2074
  "attention_mask"
2075
  ],
2076
  "model_max_length": 1000000000000000019884624838656,
2077
- "pad_token": "<|end_of_text|>",
2078
  "tokenizer_class": "PreTrainedTokenizerFast"
2079
  }
 
2066
  }
2067
  },
2068
  "bos_token": "<|begin_of_text|>",
2069
+ "chat_template": "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}",
2070
  "clean_up_tokenization_spaces": true,
2071
+ "eos_token": "<|eot_id|>",
2072
  "model_input_names": [
2073
  "input_ids",
2074
  "attention_mask"
2075
  ],
2076
  "model_max_length": 1000000000000000019884624838656,
 
2077
  "tokenizer_class": "PreTrainedTokenizerFast"
2078
  }