Add source files directly to repo

#1
config.json CHANGED
@@ -1,7 +1,13 @@
1
  {
 
2
  "architectures": [
3
  "CharacterBertForPreTraining"
4
  ],
 
 
 
 
 
5
  "attention_probs_dropout_prob": 0.1,
6
  "character_embeddings_dim": 16,
7
  "cnn_activation": "relu",
@@ -52,4 +58,4 @@
52
  "transformers_version": "4.7.0.dev0",
53
  "type_vocab_size": 2,
54
  "use_cache": true
55
- }
 
1
  {
2
+ "_name_or_path": "helboukkouri/character-bert-medical",
3
  "architectures": [
4
  "CharacterBertForPreTraining"
5
  ],
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_character_bert.CharacterBertConfig",
8
+ "AutoModel": "modeling_character_bert.CharacterBertForPreTraining",
9
+ "AutoModelForMaskedLM": "modeling_character_bert.CharacterBertForMaskedLM"
10
+ },
11
  "attention_probs_dropout_prob": 0.1,
12
  "character_embeddings_dim": 16,
13
  "cnn_activation": "relu",
 
58
  "transformers_version": "4.7.0.dev0",
59
  "type_vocab_size": 2,
60
  "use_cache": true
61
+ }
configuration_character_bert.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright Hicham EL BOUKKOURI, Olivier FERRET, Thomas LAVERGNE, Hiroshi NOJI,
3
+ # Pierre ZWEIGENBAUM, Junichi TSUJII and The HuggingFace Inc. team.
4
+ # All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """ CharacterBERT model configuration"""
18
+
19
+ from transformers.configuration_utils import PretrainedConfig
20
+ from transformers.utils import logging
21
+
22
+
# Module-level logger, obtained from the transformers logging utilities and named after this module.
23
+ logger = logging.get_logger(__name__)
24
+
# Maps each published CharacterBERT checkpoint identifier to the URL of its hosted `config.json`.
25
+ CHARACTER_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
26
+ "helboukkouri/character-bert": "https://huggingface.co/helboukkouri/character-bert/resolve/main/config.json",
27
+ "helboukkouri/character-bert-medical": "https://huggingface.co/helboukkouri/character-bert-medical/resolve/main/config.json",
28
+ # See all CharacterBERT models at https://huggingface.co/models?filter=character_bert
29
+ }
30
+
31
+
class CharacterBertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`CharacterBertModel`]. It is used to instantiate
    a CharacterBERT model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the CharacterBERT
    [helboukkouri/character-bert](https://huggingface.co/helboukkouri/character-bert) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        character_embeddings_dim (`int`, *optional*, defaults to `16`):
            The size of the character embeddings.
        cnn_activation (`str`, *optional*, defaults to `"relu"`):
            The activation function to apply to the cnn representations.
        cnn_filters (`list(list(int))`, *optional*, defaults to `[[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]]`):
            The list of CNN filters to use in the CharacterCNN module, given as `[width, number_of_filters]` pairs.
        num_highway_layers (`int`, *optional*, defaults to `2`):
            The number of Highway layers to apply to the CNNs output.
        max_word_length (`int`, *optional*, defaults to `50`):
            The maximum token length in characters (actually, in bytes as any non-ascii characters will be converted
            to a sequence of utf-8 bytes).
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something
            large just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (`int`, *optional*, defaults to 2):
            The vocabulary size of the `token_type_ids` passed when calling [`CharacterBertModel`] or
            [`TFCharacterBertModel`].
        mlm_vocab_size (`int`, *optional*, defaults to 100000):
            Size of the output vocabulary for MLM.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        is_encoder_decoder (`bool`, *optional*, defaults to `False`):
            Forwarded unchanged to [`PretrainedConfig`].
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        kwargs:
            Remaining keyword arguments, forwarded to [`PretrainedConfig`]. `tie_word_embeddings`, if given, must be
            falsy.

    Raises:
        ValueError: If `tie_word_embeddings` is passed with a truthy value.

    Example:

    ```python
    >>> from transformers import CharacterBertModel, CharacterBertConfig

    >>> # Initializing a CharacterBERT helboukkouri/character-bert style configuration
    >>> configuration = CharacterBertConfig()

    >>> # Initializing a model from the helboukkouri/character-bert style configuration
    >>> model = CharacterBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    """
    model_type = "character_bert"

    def __init__(
        self,
        character_embeddings_dim=16,
        cnn_activation="relu",
        cnn_filters=None,
        num_highway_layers=2,
        max_word_length=50,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        mlm_vocab_size=100000,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        is_encoder_decoder=False,
        use_cache=True,
        **kwargs
    ):
        # The model has no word-embedding matrix to share with an output layer, so tying is rejected up front.
        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
        if tie_word_embeddings:
            raise ValueError(
                "Cannot tie word embeddings in CharacterBERT. Please set " "`config.tie_word_embeddings=False`."
            )
        # `type_vocab_size`, `layer_norm_eps` and `use_cache` are handed to the base class rather than assigned
        # here; the base class is relied upon to store them as attributes (NOTE(review): confirm against the
        # installed transformers version).
        # `is_encoder_decoder` used to be accepted by this signature but silently dropped; it is now forwarded.
        super().__init__(
            type_vocab_size=type_vocab_size,
            layer_norm_eps=layer_norm_eps,
            use_cache=use_cache,
            tie_word_embeddings=tie_word_embeddings,
            is_encoder_decoder=is_encoder_decoder,
            **kwargs,
        )
        # A fresh list is built per instance so that configurations never share a mutable default.
        if cnn_filters is None:
            cnn_filters = [[1, 32], [2, 32], [3, 64], [4, 128], [5, 256], [6, 512], [7, 1024]]
        self.character_embeddings_dim = character_embeddings_dim
        self.cnn_activation = cnn_activation
        self.cnn_filters = cnn_filters
        self.num_highway_layers = num_highway_layers
        self.max_word_length = max_word_length
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.mlm_vocab_size = mlm_vocab_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
modeling_character_bert.py ADDED
@@ -0,0 +1,1954 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright Hicham EL BOUKKOURI, Olivier FERRET, Thomas LAVERGNE, Hiroshi NOJI,
3
+ # Pierre ZWEIGENBAUM, Junichi TSUJII, The HuggingFace Inc. and AllenNLP teams.
4
+ # All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """
18
+ PyTorch CharacterBERT model: this is a variant of BERT that uses the CharacterCNN module from ELMo instead of a
19
+ WordPiece embedding matrix. See: "CharacterBERT: Reconciling ELMo and BERT for Word-Level Open-Vocabulary
20
+ Representations From Characters" https://www.aclweb.org/anthology/2020.coling-main.609/
21
+ """
22
+
23
+ import math
24
+ import warnings
25
+ from dataclasses import dataclass
26
+ from typing import Callable, Optional, Tuple
27
+
28
+ import torch
29
+ import torch.utils.checkpoint
30
+ from torch import nn
31
+ from torch.nn import CrossEntropyLoss, MSELoss
32
+
33
+ from transformers.activations import ACT2FN
34
+ from transformers.file_utils import (
35
+ ModelOutput,
36
+ add_code_sample_docstrings,
37
+ add_start_docstrings,
38
+ add_start_docstrings_to_model_forward,
39
+ replace_return_docstrings,
40
+ )
41
+ from transformers.modeling_outputs import (
42
+ BaseModelOutputWithPastAndCrossAttentions,
43
+ BaseModelOutputWithPoolingAndCrossAttentions,
44
+ CausalLMOutputWithCrossAttentions,
45
+ MaskedLMOutput,
46
+ MultipleChoiceModelOutput,
47
+ NextSentencePredictorOutput,
48
+ QuestionAnsweringModelOutput,
49
+ SequenceClassifierOutput,
50
+ TokenClassifierOutput,
51
+ )
52
+ from transformers.modeling_utils import (
53
+ PreTrainedModel,
54
+ apply_chunking_to_forward,
55
+ find_pruneable_heads_and_indices,
56
+ prune_linear_layer,
57
+ )
58
+ from transformers.utils import logging
59
+ from .configuration_character_bert import CharacterBertConfig
60
+ from .tokenization_character_bert import CharacterMapper
61
+
62
+
# Module-level logger, obtained from the transformers logging utilities and named after this module.
63
+ logger = logging.get_logger(__name__)
64
+
# Names of the reference checkpoint, configuration class and tokenizer class.
# NOTE(review): presumably consumed by the docstring decorators imported above - confirm where they are used.
65
+ _CHECKPOINT_FOR_DOC = "helboukkouri/character-bert"
66
+ _CONFIG_FOR_DOC = "CharacterBertConfig"
67
+ _TOKENIZER_FOR_DOC = "CharacterBertTokenizer"
68
+
# Identifiers of the published CharacterBERT checkpoints.
69
+ CHARACTER_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
70
+ "helboukkouri/character-bert",
71
+ "helboukkouri/character-bert-medical",
72
+ # See all CharacterBERT models at https://huggingface.co/models?filter=character_bert
73
+ ]
74
+
75
+
76
+ # NOTE: the following class is taken from:
77
+ # https://github.com/allenai/allennlp/blob/main/allennlp/modules/highway.py
class Highway(torch.nn.Module):
    """
    A stack of `Highway layers <https://arxiv.org/abs/1505.00387>`__.

    Each layer blends its input with a non-linear transformation of that input:
    :math:`y = g * x + (1 - g) * f(A(x))`, where :math:`A` is a linear transformation, :math:`f` is an element-wise
    non-linearity and :math:`g = sigmoid(B(x))` is an element-wise gate. The layers are applied one after the other
    and the output of the last one is returned.

    # Parameters

    input_dim : `int`, required
        The dimensionality of :math:`x`. The input is assumed to have shape `(batch_size, ..., input_dim)`.
    num_layers : `int`, optional (default=`1`)
        The number of highway layers to apply to the input.
    activation : `Callable[[torch.Tensor], torch.Tensor]`, optional (default=`torch.nn.functional.relu`)
        The non-linearity to use in the highway layers.
    """

    def __init__(
        self,
        input_dim: int,
        num_layers: int = 1,
        activation: Callable[[torch.Tensor], torch.Tensor] = torch.nn.functional.relu,
    ) -> None:
        super().__init__()
        self._input_dim = input_dim
        self._activation = activation

        stacked = []
        for _ in range(num_layers):
            # A single Linear produces both A(x) (first half of the output) and B(x) (second half).
            projection = torch.nn.Linear(input_dim, 2 * input_dim)
            # Start with a positive bias on B(x): the gate is then biased towards 1, so a freshly
            # initialised layer mostly carries its input forward unchanged.
            projection.bias[input_dim:].data.fill_(1)
            stacked.append(projection)
        self._layers = torch.nn.ModuleList(stacked)

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        carried = inputs
        for projection in self._layers:
            # NOTE: the split below must stay in sync with the bias initialisation in `__init__`
            # (first half = transform, second half = gate).
            transformed, gate_logits = projection(carried).chunk(2, dim=-1)
            carry_gate = torch.sigmoid(gate_logits)
            carried = carry_gate * carried + (1 - carry_gate) * self._activation(transformed)
        return carried
124
+
125
+
126
+ # NOTE: The CharacterCnn was adapted from `_ElmoCharacterEncoder`:
127
+ # https://github.com/allenai/allennlp/blob/main/allennlp/modules/elmo.py#L254
class CharacterCnn(torch.nn.Module):
    """
    Computes context insensitive token representations using multiple CNNs over character ids.

    The module takes character ids of size `(batch_size, sequence_length, max_word_length)` and returns a tensor of
    size `(batch_size, sequence_length, hidden_size)`. The pipeline is: character embedding lookup, one 1-D
    convolution per entry of `config.cnn_filters` followed by max-pooling over character positions, concatenation
    of the pooled features, highway layers, then a linear projection to `hidden_size`.

    Args:
        config: Configuration object providing `character_embeddings_dim`, `cnn_activation`, `cnn_filters`,
            `num_highway_layers`, `max_word_length` and `hidden_size`.
    """

    def __init__(self, config):
        super().__init__()
        self.character_embeddings_dim = config.character_embeddings_dim
        self.cnn_activation = config.cnn_activation
        self.cnn_filters = config.cnn_filters
        self.num_highway_layers = config.num_highway_layers
        self.max_word_length = config.max_word_length
        self.hidden_size = config.hidden_size
        # NOTE: this is the 256 possible utf-8 bytes + special slots for the
        # [CLS]/[SEP]/[PAD]/[MASK] characters as well as beginning/end of
        # word symbols and character padding for short words -> total of 263
        self.character_vocab_size = 263
        self._init_weights()

    def get_output_dim(self):
        """Return the size of the last dimension of the tensor produced by `forward`."""
        return self.hidden_size

    def _init_weights(self):
        """Create every parameter of the module (embedding table, convolutions, highway, projection)."""
        self._init_char_embedding()
        self._init_cnn_weights()
        self._init_highway()
        self._init_projection()

    def _init_char_embedding(self):
        """Create the character embedding table, with the two padding rows set to zero."""
        weights = torch.empty((self.character_vocab_size, self.character_embeddings_dim))
        nn.init.normal_(weights)
        weights[0].fill_(0.0)  # token padding
        weights[CharacterMapper.padding_character + 1].fill_(0.0)  # character padding
        self._char_embedding_weights = torch.nn.Parameter(torch.FloatTensor(weights), requires_grad=True)

    def _init_cnn_weights(self):
        """Create one `Conv1d` per `(width, num_filters)` pair and register it as `char_conv_{i}`."""
        convolutions = []
        for i, (width, num) in enumerate(self.cnn_filters):
            conv = torch.nn.Conv1d(
                in_channels=self.character_embeddings_dim, out_channels=num, kernel_size=width, bias=True
            )
            conv.weight.requires_grad = True
            conv.bias.requires_grad = True
            convolutions.append(conv)
            self.add_module(f"char_conv_{i}", conv)
        # Plain Python list kept alongside the registered submodules; `forward` only uses its length.
        self._convolutions = convolutions

    def _init_highway(self):
        """Create the highway stack applied to the concatenated CNN features."""
        # the highway layers have same dimensionality as the number of cnn filters
        n_filters = sum(f[1] for f in self.cnn_filters)
        self._highways = Highway(n_filters, self.num_highway_layers, activation=nn.functional.relu)
        # The AllenNLP highway is one matrix multiplication with concatenation of
        # transform and carry weights.
        for layer in self._highways._layers:
            layer.weight.requires_grad = True
            layer.bias.requires_grad = True

    def _init_projection(self):
        """Create the final linear projection from the CNN feature size to `hidden_size`."""
        n_filters = sum(f[1] for f in self.cnn_filters)
        self._projection = torch.nn.Linear(n_filters, self.hidden_size, bias=True)
        self._projection.weight.requires_grad = True
        self._projection.bias.requires_grad = True

    def forward(self, inputs: torch.Tensor) -> torch.Tensor:
        """
        Compute context insensitive token embeddings from characters.

        Args:
            inputs (`torch.Tensor`): Shape `(batch_size, sequence_length, max_word_length)` of character ids
                representing the current batch.

        Returns:
            `torch.Tensor`: Shape `(batch_size, sequence_length, hidden_size)` tensor with context insensitive
            token representations.

        Raises:
            ValueError: If `cnn_activation` is neither `"tanh"` nor `"relu"`.
        """
        # Resolve the activation first so that a bad configuration fails before any compute is done.
        if self.cnn_activation == "tanh":
            activation = torch.tanh
        elif self.cnn_activation == "relu":
            activation = torch.nn.functional.relu
        else:
            raise ValueError("ConfigurationError: Unknown activation")

        # character embeddings
        # (batch_size * sequence_length, max_word_length, embed_dim)
        # `reshape` instead of `view`: identical for contiguous inputs, and it also accepts
        # non-contiguous id tensors (e.g. slices or transposes) on which `view` raises.
        character_embedding = torch.nn.functional.embedding(
            inputs.reshape(-1, self.max_word_length), self._char_embedding_weights
        )

        # (batch_size * sequence_length, embed_dim, max_word_length)
        character_embedding = torch.transpose(character_embedding, 1, 2)
        convs = []
        for i in range(len(self._convolutions)):
            conv = getattr(self, f"char_conv_{i}")
            convolved = conv(character_embedding)
            # max-pool over character positions:
            # (batch_size * sequence_length, n_filters for this width)
            convolved, _ = torch.max(convolved, dim=-1)
            convs.append(activation(convolved))

        # (batch_size * sequence_length, n_filters)
        token_embedding = torch.cat(convs, dim=-1)

        # apply the highway layers (batch_size * sequence_length, n_filters)
        token_embedding = self._highways(token_embedding)

        # final projection (batch_size * sequence_length, embedding_dim)
        token_embedding = self._projection(token_embedding)

        # reshape to (batch_size, sequence_length, embedding_dim)
        batch_size, sequence_length, _ = inputs.size()
        return token_embedding.view(batch_size, sequence_length, -1)
240
+
241
+
class CharacterBertEmbeddings(nn.Module):
    """Build the input embeddings as the sum of word (character-CNN), position and token-type embeddings."""

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        n_positions = config.max_position_embeddings

        self.word_embeddings = CharacterCnn(config)
        self.position_embeddings = nn.Embedding(n_positions, hidden)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, hidden)

        # The attribute is deliberately named `LayerNorm` (not snake_case) so that it matches the
        # TensorFlow variable name and TensorFlow checkpoint files can be loaded.
        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # `position_ids` has shape (1, max_position_embeddings); it is registered as a buffer and
        # is therefore exported when the module is serialized.
        self.register_buffer("position_ids", torch.arange(n_positions).expand((1, -1)))
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        # (batch, sequence): `input_ids` carries an extra trailing character dimension, which is dropped.
        if input_ids is None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            input_shape = input_ids[:, :, 0].size()
        seq_length = input_shape[1]

        if position_ids is None:
            first = past_key_values_length
            position_ids = self.position_ids[:, first : first + seq_length]

        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds + self.token_type_embeddings(token_type_ids)
        if self.position_embedding_type == "absolute":
            embeddings += self.position_embeddings(position_ids)

        return self.dropout(self.LayerNorm(embeddings))
287
+
288
+
289
+ # Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->CharacterBert
class CharacterBertSelfAttention(nn.Module):
    """
    Multi-head scaled dot-product attention, usable as self-attention or as cross-attention.

    Supports the `"absolute"`, `"relative_key"` and `"relative_key_query"` position embedding types and, when
    `config.is_decoder` is set, returns the key/value tensors so that they can be reused on later calls.
    """

    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        heads_divide_hidden = config.hidden_size % config.num_attention_heads == 0
        if not heads_divide_hidden and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

        # An explicit argument wins over the configuration; "absolute" is the fallback.
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            self.max_position_embeddings = config.max_position_embeddings
            # One embedding per signed distance in [-(max - 1), max - 1].
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder

    def transpose_for_scores(self, x):
        """Reshape `(batch, seq, all_head_size)` into `(batch, heads, seq, head_size)`."""
        split_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        return x.view(*split_shape).permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """
        Return `(context_layer,)`, followed by the attention probabilities when `output_attentions` is truthy,
        followed by the `(key, value)` pair when the module is configured as a decoder.
        """
        query_layer = self.transpose_for_scores(self.query(hidden_states))

        if encoder_hidden_states is not None:
            # Cross-attention: keys/values come from the encoder, and the mask must be the encoder's
            # so that the encoder's padding tokens are not attended to.
            attention_mask = encoder_attention_mask
            if past_key_value is not None:
                # Reuse the cached cross-attention keys/values.
                key_layer = past_key_value[0]
                value_layer = past_key_value[1]
            else:
                key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
                value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))
            if past_key_value is not None:
                # Uni-directional self-attention with a cache: prepend the cached keys/values
                # along the sequence axis.
                key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
                value_layer = torch.cat([past_key_value[1], value_layer], dim=2)

        if self.is_decoder:
            # Hand the keys/values back to the caller so that later calls can reuse them: the
            # cross-attention branch reuses them as-is, the self-attention branch concatenates the
            # newly projected states to them. For a bi-directional encoder this is never set.
            past_key_value = (key_layer, value_layer)

        # Raw attention scores: dot product between "query" and "key".
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type in ("relative_key", "relative_key_query"):
            seq_length = hidden_states.size()[1]
            positions = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device)
            distance = positions.view(-1, 1) - positions.view(1, -1)
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                query_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                key_scores = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + query_scores + key_scores

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # The mask is additive (precomputed for all layers in CharacterBertModel forward() function).
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This drops out entire tokens to attend to, which might seem a bit unusual, but is taken
        # from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        # Weighted sum of the values, then merge the heads back into a single feature axis.
        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        merged_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*merged_shape)

        outputs = (context_layer,)
        if output_attentions:
            outputs = outputs + (attention_probs,)
        if self.is_decoder:
            outputs = outputs + (past_key_value,)
        return outputs
415
+
416
+
417
+ # Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->CharacterBert
418
class CharacterBertSelfOutput(nn.Module):
    """Output stage of the attention sub-layer: dense, dropout, residual add, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.LayerNorm = nn.LayerNorm(width, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project `hidden_states`, apply dropout, add `input_tensor` and normalize the sum."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
430
+
431
+
432
+ # Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->CharacterBert
433
class CharacterBertAttention(nn.Module):
    """Attention block: a self-attention module followed by its output projection.

    Keeps track of the heads that were removed through `prune_heads`.
    """

    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        self.self = CharacterBertSelfAttention(config, position_embedding_type=position_embedding_type)
        self.output = CharacterBertSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        """Remove the given attention heads from the query/key/value and output projections."""
        if len(heads) == 0:
            return

        attn = self.self
        heads, index = find_pruneable_heads_and_indices(
            heads, attn.num_attention_heads, attn.attention_head_size, self.pruned_heads
        )

        # Shrink the three input projections, then the output projection along its input dim.
        for proj_name in ("query", "key", "value"):
            setattr(attn, proj_name, prune_linear_layer(getattr(attn, proj_name), index))
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Keep the bookkeeping attributes in sync with the pruned layers.
        attn.num_attention_heads = attn.num_attention_heads - len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """Run self-attention and the output projection.

        Returns a tuple whose first element is the projected attention output; every
        additional element returned by the self-attention module is appended unchanged.
        """
        attn_results = self.self(
            hidden_states,
            attention_mask,
            head_mask,
            encoder_hidden_states,
            encoder_attention_mask,
            past_key_value,
            output_attentions,
        )
        projected = self.output(attn_results[0], hidden_states)
        return (projected,) + attn_results[1:]
480
+
481
+
482
+ # Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->CharacterBert
483
class CharacterBertIntermediate(nn.Module):
    """First half of the feed-forward sub-layer: dense expansion followed by an activation."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        # A string is looked up in ACT2FN; anything else is used directly as the activation.
        self.intermediate_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation

    def forward(self, hidden_states):
        """Apply the dense layer and then the configured activation."""
        return self.intermediate_act_fn(self.dense(hidden_states))
496
+
497
+
498
+ # Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->CharacterBert
499
class CharacterBertOutput(nn.Module):
    """Second half of the feed-forward sub-layer: dense, dropout, residual add, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        self.dense = nn.Linear(config.intermediate_size, hidden)
        self.LayerNorm = nn.LayerNorm(hidden, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Project `hidden_states` back to `hidden_size`, add `input_tensor` and normalize."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
511
+
512
+
513
+ # Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->CharacterBert
514
class CharacterBertLayer(nn.Module):
    """One transformer layer: self-attention, optional cross-attention, then feed-forward."""

    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = CharacterBertAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        # Cross-attention is only allowed on decoder layers.
        if self.add_cross_attention and not self.is_decoder:
            raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
        if self.add_cross_attention:
            self.crossattention = CharacterBertAttention(config, position_embedding_type="absolute")
        self.intermediate = CharacterBertIntermediate(config)
        self.output = CharacterBertOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        output_attentions=False,
    ):
        """Run the layer.

        Returns a tuple starting with the layer output, followed by whatever extra
        elements the attention modules returned, and, for decoder layers, ending
        with the present key/value cache.

        Raises:
            ValueError: if `encoder_hidden_states` is given to a decoder layer that
                was built without cross-attention.
        """
        has_past = past_key_value is not None

        # The first two cached entries belong to self-attention.
        self_attn_past = past_key_value[:2] if has_past else None
        self_attn_results = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
            past_key_value=self_attn_past,
        )
        attention_output = self_attn_results[0]

        if self.is_decoder:
            # For decoders the last element is the self-attention cache.
            extras = self_attn_results[1:-1]
            present_key_value = self_attn_results[-1]
        else:
            extras = self_attn_results[1:]

        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise ValueError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`"
                )

            # The last two cached entries belong to cross-attention.
            cross_attn_past = past_key_value[-2:] if has_past else None
            cross_results = self.crossattention(
                attention_output,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                cross_attn_past,
                output_attentions,
            )
            attention_output = cross_results[0]
            extras = extras + cross_results[1:-1]
            # Append the cross-attention cache after the self-attention cache.
            present_key_value = present_key_value + cross_results[-1]

        layer_output = apply_chunking_to_forward(
            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
        )
        outputs = (layer_output,) + extras
        if self.is_decoder:
            outputs = outputs + (present_key_value,)
        return outputs

    def feed_forward_chunk(self, attention_output):
        """Feed-forward computation applied (possibly chunk by chunk) to the attention output."""
        expanded = self.intermediate(attention_output)
        return self.output(expanded, attention_output)
597
+
598
+
599
+ # Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->CharacterBert
600
class CharacterBertEncoder(nn.Module):
    """Stack of `config.num_hidden_layers` `CharacterBertLayer` modules."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([CharacterBertLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @staticmethod
    def _bind_layer_call(module, layer_past, with_attentions):
        """Return a callable for `torch.utils.checkpoint` with the non-tensor arguments bound now.

        Binding `layer_past` and `with_attentions` as parameters (instead of reading
        them from the enclosing loop) makes sure the recomputation performed during
        the backward pass sees the values of *this* layer, not of the last one.
        """

        def custom_forward(*inputs):
            return module(*inputs, layer_past, with_attentions)

        return custom_forward

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    ):
        """Run every layer in order and collect the requested outputs.

        Args:
            hidden_states: input to the first layer.
            attention_mask, encoder_hidden_states, encoder_attention_mask: forwarded
                unchanged to every layer.
            head_mask: indexed per layer (`head_mask[i]`) when not `None`.
            past_key_values: indexed per layer (`past_key_values[i]`) when not `None`.
            use_cache: when truthy, the last element of each layer's output is
                collected into the returned cache. Forced to `False` (with a warning)
                when gradient checkpointing is active during training.
            output_attentions: collect `layer_outputs[1]` (and `layer_outputs[2]` when
                `config.add_cross_attention` is set) from every layer.
            output_hidden_states: collect the input of every layer plus the final output.
            return_dict: return a `BaseModelOutputWithPastAndCrossAttentions` instead
                of a tuple of the non-`None` values.
        """
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        use_checkpointing = self.gradient_checkpointing and self.training
        # Resolve the cache/checkpointing conflict BEFORE the cache container is created,
        # otherwise an empty tuple would be returned as `past_key_values` (and would shift
        # the positions of the tuple output) even though caching was disabled.
        if use_checkpointing and use_cache:
            logger.warning(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        next_decoder_cache = () if use_cache else None
        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            past_key_value = past_key_values[i] if past_key_values is not None else None

            if use_checkpointing:
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    self._bind_layer_call(layer_module, past_key_value, output_attentions),
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states,
                    attention_mask,
                    layer_head_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    past_key_value,
                    output_attentions,
                )

            hidden_states = layer_outputs[0]
            if use_cache:
                next_decoder_cache += (layer_outputs[-1],)
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(
                v
                for v in [
                    hidden_states,
                    next_decoder_cache,
                    all_hidden_states,
                    all_self_attentions,
                    all_cross_attentions,
                ]
                if v is not None
            )
        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_decoder_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )
695
+
696
+
697
+ # Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->CharacterBert
698
class CharacterBertPooler(nn.Module):
    """Pools a sequence by transforming the hidden state at position 0."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        """Return `tanh(dense(hidden_states[:, 0]))`."""
        # Only the hidden state of the first token is used as the summary.
        leading_state = hidden_states[:, 0]
        return self.activation(self.dense(leading_state))
711
+
712
+
713
+ # Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->CharacterBert
714
class CharacterBertPredictionHeadTransform(nn.Module):
    """Transformation applied before the LM decoder: dense, activation, LayerNorm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        activation = config.hidden_act
        # A string is looked up in ACT2FN; anything else is used directly as the activation.
        self.transform_act_fn = ACT2FN[activation] if isinstance(activation, str) else activation
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        """Apply dense, activation and LayerNorm, in that order."""
        activated = self.transform_act_fn(self.dense(hidden_states))
        return self.LayerNorm(activated)
729
+
730
+
731
class CharacterBertLMPredictionHead(nn.Module):
    """Masked-LM head: a transform followed by a projection onto `config.mlm_vocab_size` scores."""

    def __init__(self, config):
        super().__init__()
        mlm_vocab = config.mlm_vocab_size
        self.transform = CharacterBertPredictionHeadTransform(config)

        # The decoder is built without its own bias; a separately registered
        # per-token bias parameter is attached to it right below.
        self.decoder = nn.Linear(config.hidden_size, mlm_vocab, bias=False)
        self.bias = nn.Parameter(torch.zeros(mlm_vocab))

        # Share the parameter so that `self.bias` and `self.decoder.bias` stay the same object.
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        """Return the decoder scores of the transformed hidden states."""
        return self.decoder(self.transform(hidden_states))
749
+
750
+
751
+ # Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->CharacterBert
752
class CharacterBertOnlyMLMHead(nn.Module):
    """Wrapper exposing only the masked-LM prediction head under the `predictions` attribute."""

    def __init__(self, config):
        super().__init__()
        self.predictions = CharacterBertLMPredictionHead(config)

    def forward(self, sequence_output):
        """Return the prediction scores computed from `sequence_output`."""
        return self.predictions(sequence_output)
760
+
761
+
762
+ # Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->CharacterBert
763
class CharacterBertOnlyNSPHead(nn.Module):
    """Next-sentence-prediction head: a linear layer producing two scores."""

    def __init__(self, config):
        super().__init__()
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, pooled_output):
        """Return the two sequence-relationship scores for `pooled_output`."""
        return self.seq_relationship(pooled_output)
771
+
772
+
773
+ # Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->CharacterBert
774
class CharacterBertPreTrainingHeads(nn.Module):
    """Both pre-training heads: masked-LM predictions and the two-way sequence relationship."""

    def __init__(self, config):
        super().__init__()
        self.predictions = CharacterBertLMPredictionHead(config)
        self.seq_relationship = nn.Linear(config.hidden_size, 2)

    def forward(self, sequence_output, pooled_output):
        """Return `(prediction_scores, seq_relationship_score)`."""
        mlm_scores = self.predictions(sequence_output)
        nsp_scores = self.seq_relationship(pooled_output)
        return mlm_scores, nsp_scores
784
+
785
+
786
class CharacterBertPreTrainedModel(PreTrainedModel):
    """
    Abstract base class that takes care of weight initialization and provides the shared interface for downloading
    and loading pretrained models.
    """

    config_class = CharacterBertConfig
    load_tf_weights = None
    base_model_prefix = "character_bert"
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """Initialize the weights of `module` according to its type."""
        if isinstance(module, CharacterCnn):
            # These weights are a raw parameter rather than a sub-module, so they
            # have to be initialized explicitly here.
            char_weights = module._char_embedding_weights.data
            char_weights.normal_()
            # Row 0 is the token-padding entry.
            char_weights[0].fill_(0.0)
            # Row of the character-padding entry.
            char_weights[CharacterMapper.padding_character + 1].fill_(0.0)

        if isinstance(module, nn.Linear):
            # Plain normal init (the TF version uses truncated_normal instead),
            # cf https://github.com/pytorch/pytorch/pull/5617
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            pad_row = module.padding_idx
            if pad_row is not None:
                module.weight.data[pad_row].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
819
+
820
+
821
@dataclass
# Copied from transformers.models.bert.modeling_bert.BertForPreTrainingOutput with Bert->CharacterBert
class CharacterBertForPreTrainingOutput(ModelOutput):
    """
    Output type of [`CharacterBertForPreTraining`].

    Args:
        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the masked language modeling loss and the next sequence prediction
            (classification) loss.
        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.mlm_vocab_size)`):
            Prediction scores of the language modeling head (scores for each entry of the masked language modeling
            vocabulary before SoftMax).
        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
            before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # Every field defaults to None, so every field is annotated as Optional.
    loss: Optional[torch.FloatTensor] = None
    prediction_logits: Optional[torch.FloatTensor] = None
    seq_relationship_logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
854
+
855
+
856
# Shared introduction prepended to the docstring of every CharacterBERT model class.
CHARACTER_BERT_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`CharacterBertConfig`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model
            weights.
"""
868
+
869
# Shared description of the forward() inputs. `{0}` is replaced by the shape of `input_ids`
# and `{1}` by the shape of the per-token tensors via `str.format`.
CHARACTER_BERT_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (`torch.LongTensor` of shape `{0}`):
            Indices of input sequence tokens.

            Indices can be obtained using [`CharacterBertTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for
            details.

            [What are input IDs?](../glossary#input-ids)
        attention_mask (`torch.FloatTensor` of shape `{1}`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        token_type_ids (`torch.LongTensor` of shape `{1}`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `{1}`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert *input_ids* indices into associated vectors
            than the model's internal embedding lookup matrix.
        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
"""
918
+
919
+
920
@add_start_docstrings(
    "The bare CharacterBERT Model transformer outputting raw hidden-states without any specific head on top.",
    CHARACTER_BERT_START_DOCSTRING,
)
class CharacterBertModel(CharacterBertPreTrainedModel):
    """

    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration
    set to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder`
    argument and `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an
    input to the forward pass.
    """

    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config)
        self.config = config

        self.embeddings = CharacterBertEmbeddings(config)
        self.encoder = CharacterBertEncoder(config)

        # The pooler is optional; `forward` returns `None` as pooled output without it.
        self.pooler = CharacterBertPooler(config) if add_pooling_layer else None

        self.init_weights()

    def get_input_embeddings(self):
        """Return the `word_embeddings` attribute of the embeddings module."""
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the `word_embeddings` attribute of the embeddings module with `value`."""
        self.embeddings.word_embeddings = value

    def resize_token_embeddings(self, *args, **kwargs):
        """Unsupported for this model: always raises `NotImplementedError`."""
        raise NotImplementedError("Cannot resize CharacterBERT's token embeddings.")

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer_index, layer_heads in heads_to_prune.items():
            self.encoder.layer[layer_index].attention.prune_heads(layer_heads)

    @add_start_docstrings_to_model_forward(
        CHARACTER_BERT_INPUTS_DOCSTRING.format(
            "(batch_size, sequence_length, maximum_token_length)", "(batch_size, sequence_length)"
        )
    )
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up
            decoding. If `past_key_values` are used, the user can optionally input only the last
            `decoder_input_ids` (those that don't have their past key value states given to this model) of shape
            `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up
            decoding (see `past_key_values`).
        """
        cfg = self.config
        if output_attentions is None:
            output_attentions = cfg.output_attentions
        if output_hidden_states is None:
            output_hidden_states = cfg.output_hidden_states
        if return_dict is None:
            return_dict = cfg.use_return_dict

        # Caching only applies to decoders; for encoders it is always switched off.
        if not cfg.is_decoder:
            use_cache = False
        elif use_cache is None:
            use_cache = cfg.use_cache

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        # Whichever input was given, its trailing dimension is dropped to obtain
        # the (batch_size, seq_length) shape.
        model_input = input_ids if input_ids is not None else inputs_embeds
        input_shape = model_input.size()[:-1]
        batch_size, seq_length = input_shape
        device = model_input.device

        # Length already covered by the cached keys/values (0 without a cache).
        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = past_key_values[0][0].shape[2]

        if attention_mask is None:
            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D or 3D attention mask is provided for the cross-attention
        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
        encoder_extended_attention_mask = None
        if cfg.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones((encoder_batch_size, encoder_sequence_length), device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, cfg.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = None if self.pooler is None else self.pooler(sequence_output)

        if return_dict:
            return BaseModelOutputWithPoolingAndCrossAttentions(
                last_hidden_state=sequence_output,
                pooler_output=pooled_output,
                past_key_values=encoder_outputs.past_key_values,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
                cross_attentions=encoder_outputs.cross_attentions,
            )
        return (sequence_output, pooled_output) + encoder_outputs[1:]
1102
+
1103
+
1104
+ @add_start_docstrings(
1105
+ """
1106
+ CharacterBert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a
1107
+ `next sentence prediction (classification)` head.
1108
+ """,
1109
+ CHARACTER_BERT_START_DOCSTRING,
1110
+ )
1111
class CharacterBertForPreTraining(CharacterBertPreTrainedModel):
    # CharacterBERT encoder followed by `CharacterBertPreTrainingHeads`, which
    # returns both masked-LM scores and next-sentence scores.

    def __init__(self, config):
        super().__init__(config)

        self.character_bert = CharacterBertModel(config)
        self.cls = CharacterBertPreTrainingHeads(config)

        self.init_weights()

    def get_output_embeddings(self):
        # Expose the `decoder` module of the MLM prediction head.
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        # Swap the `decoder` module of the MLM prediction head.
        self.cls.predictions.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(
        CHARACTER_BERT_INPUTS_DOCSTRING.format(
            "(batch_size, sequence_length, maximum_token_length)", "(batch_size, sequence_length)"
        )
    )
    @replace_return_docstrings(output_type=CharacterBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        next_sentence_label=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in
            `[-100, 0, ..., config.mlm_vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to
            `-100` are ignored (masked), the loss is only computed for the tokens with labels in
            `[0, ..., config.mlm_vocab_size - 1]`.
        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
            pair (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Returns:

        Example:

        ```python
        >>> from transformers import CharacterBertTokenizer, CharacterBertForPreTraining
        >>> import torch

        >>> tokenizer = CharacterBertTokenizer.from_pretrained('helboukkouri/character-bert')
        >>> model = CharacterBertForPreTraining.from_pretrained('helboukkouri/character-bert')

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.prediction_logits
        >>> seq_relationship_logits = outputs.seq_relationship_logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.character_bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)

        # The loss is only produced when BOTH label tensors are supplied;
        # it is the plain sum of the two cross-entropy terms.
        total_loss = None
        if labels is not None and next_sentence_label is not None:
            loss_fct = CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.mlm_vocab_size), labels.view(-1))
            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
            total_loss = masked_lm_loss + next_sentence_loss

        if not return_dict:
            # Tuple output: optional loss, both score tensors, then everything
            # the encoder returned after its first two entries.
            output = (prediction_scores, seq_relationship_score) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return CharacterBertForPreTrainingOutput(
            loss=total_loss,
            prediction_logits=prediction_scores,
            seq_relationship_logits=seq_relationship_score,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
1210
+
1211
+
1212
@add_start_docstrings(
    """CharacterBert Model with a `language modeling` head on top for CLM fine-tuning.""",
    CHARACTER_BERT_START_DOCSTRING,
)
class CharacterBertLMHeadModel(CharacterBertPreTrainedModel):
    # CharacterBERT encoder (built without its pooling layer) followed by
    # `CharacterBertOnlyMLMHead`, trained with a shifted next-token loss.

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        if not config.is_decoder:
            logger.warning("If you want to use `CharacterBertLMHeadModel` as a standalone, add `is_decoder=True.`")

        self.character_bert = CharacterBertModel(config, add_pooling_layer=False)
        self.cls = CharacterBertOnlyMLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        # Expose the `decoder` module of the LM prediction head.
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        # Swap the `decoder` module of the LM prediction head.
        self.cls.predictions.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(
        CHARACTER_BERT_INPUTS_DOCSTRING.format(
            "(batch_size, sequence_length, maximum_token_length)", "(batch_size, sequence_length)"
        )
    )
    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        past_key_values=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            if the model is configured as a decoder.
        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is
            used in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should
            be in `[-100, 0, ..., config.mlm_vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set
            to `-100` are ignored (masked), the loss is only computed for the tokens with labels in
            `[0, ..., config.mlm_vocab_size - 1]`.
        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up
            decoding.

            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids`
            (those that don't have their past key value states given to this model) of shape `(batch_size, 1)`
            instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
        use_cache (`bool`, *optional*):
            If set to `True`, `past_key_values` key value states are returned and can be used to speed up
            decoding (see `past_key_values`).

        Returns:

        Example:

        ```python
        >>> from transformers import CharacterBertTokenizer, CharacterBertLMHeadModel, CharacterBertConfig
        >>> import torch

        >>> tokenizer = CharacterBertTokenizer.from_pretrained('helboukkouri/character-bert')
        >>> config = CharacterBertConfig.from_pretrained("helboukkouri/character-bert")
        >>> config.is_decoder = True
        >>> model = CharacterBertLMHeadModel.from_pretrained('helboukkouri/character-bert', config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        if labels is not None:
            # Caching is forced off whenever a loss is requested.
            use_cache = False

        outputs = self.character_bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        lm_loss = None
        if labels is not None:
            # we are doing next-token prediction; shift prediction scores and input ids by one
            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
            labels = labels[:, 1:].contiguous()
            loss_fct = CrossEntropyLoss()
            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.mlm_vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((lm_loss,) + output) if lm_loss is not None else output

        return CausalLMOutputWithCrossAttentions(
            loss=lm_loss,
            logits=prediction_scores,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            cross_attentions=outputs.cross_attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, **model_kwargs):
        """Build the keyword arguments passed to `forward` at each generation step.

        Returns a dict with `input_ids`, `attention_mask` and `past_key_values`.
        """
        input_shape = input_ids.shape
        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            # The mask is documented as (batch_size, sequence_length) while
            # input_ids carry an extra per-token character axis, so only the
            # first two dimensions are used. For 2-D input_ids this is the
            # full shape, as before.
            attention_mask = input_ids.new_ones(input_shape[:2])

        # cut decoder_input_ids if past is used
        if past is not None:
            input_ids = input_ids[:, -1:]

        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past}

    def _reorder_cache(self, past, beam_idx):
        """Return `past` with every cached tensor re-indexed along dim 0 by `beam_idx`."""
        reordered_past = ()
        for layer_past in past:
            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
        return reordered_past
1366
+
1367
+
1368
@add_start_docstrings(
    """CharacterBert Model with a `language modeling` head on top.""", CHARACTER_BERT_START_DOCSTRING
)
class CharacterBertForMaskedLM(CharacterBertPreTrainedModel):
    # CharacterBERT encoder (built without its pooling layer) followed by
    # `CharacterBertOnlyMLMHead`, trained with a masked-LM cross-entropy loss.

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `CharacterBertForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )
        self.character_bert = CharacterBertModel(config, add_pooling_layer=False)
        self.cls = CharacterBertOnlyMLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        # Expose the `decoder` module of the MLM prediction head.
        return self.cls.predictions.decoder

    def set_output_embeddings(self, new_embeddings):
        # Swap the `decoder` module of the MLM prediction head.
        self.cls.predictions.decoder = new_embeddings

    @add_start_docstrings_to_model_forward(
        CHARACTER_BERT_INPUTS_DOCSTRING.format(
            "(batch_size, sequence_length, maximum_token_length)", "(batch_size, sequence_length)"
        )
    )
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MaskedLMOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in
            `[-100, 0, ..., config.mlm_vocab_size - 1]` (see `input_ids` docstring). Tokens with indices set to
            `-100` are ignored (masked), the loss is only computed for the tokens with labels in
            `[0, ..., config.mlm_vocab_size - 1]`.
        """

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.character_bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        prediction_scores = self.cls(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.mlm_vocab_size), labels.view(-1))

        if not return_dict:
            output = (prediction_scores,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
        """Append one masked-out padding position to the inputs for generation.

        Returns a dict with the extended `input_ids` and `attention_mask`.

        Raises:
            ValueError: if `config.pad_token_id` is not defined.
        """
        input_shape = input_ids.shape
        effective_batch_size = input_shape[0]

        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        if self.config.pad_token_id is None:
            raise ValueError("The PAD token should be defined for generation")

        # `attention_mask` defaults to None; build a (batch, sequence) mask of
        # ones in that case instead of failing on `None.new_zeros`.
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_shape[:2])

        # add a dummy token (attention on it is switched off by the appended 0)
        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
        # The dummy token carries the trailing dimensions of `input_ids` so the
        # concatenation also works for 3-D character-level inputs; for 2-D
        # inputs the shape is (batch, 1) exactly as before.
        # NOTE(review): assumes a padding token is encoded as `pad_token_id` at
        # every character position - confirm against the tokenizer.
        dummy_token = torch.full(
            (effective_batch_size, 1) + tuple(input_shape[2:]),
            self.config.pad_token_id,
            dtype=torch.long,
            device=input_ids.device,
        )
        input_ids = torch.cat([input_ids, dummy_token], dim=1)

        return {"input_ids": input_ids, "attention_mask": attention_mask}
1475
+
1476
+
1477
@add_start_docstrings(
    """CharacterBert Model with a `next sentence prediction (classification)` head on top.""",
    CHARACTER_BERT_START_DOCSTRING,
)
class CharacterBertForNextSentencePrediction(CharacterBertPreTrainedModel):
    # CharacterBERT encoder followed by `CharacterBertOnlyNSPHead`, which is
    # applied to the encoder's second output (the pooled output).

    def __init__(self, config):
        super().__init__(config)

        self.character_bert = CharacterBertModel(config)
        self.cls = CharacterBertOnlyNSPHead(config)

        self.init_weights()

    @add_start_docstrings_to_model_forward(
        CHARACTER_BERT_INPUTS_DOCSTRING.format(
            "(batch_size, sequence_length, maximum_token_length)", "(batch_size, sequence_length)"
        )
    )
    @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
            pair (see `input_ids` docstring). Indices should be in `[0, 1]`:

            - 0 indicates sequence B is a continuation of sequence A,
            - 1 indicates sequence B is a random sequence.

        Returns:

        Example:

        ```python
        >>> from transformers import CharacterBertTokenizer, CharacterBertForNextSentencePrediction
        >>> import torch

        >>> tokenizer = CharacterBertTokenizer.from_pretrained('helboukkouri/character-bert')
        >>> model = CharacterBertForNextSentencePrediction.from_pretrained('helboukkouri/character-bert')

        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
        >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')

        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
        >>> logits = outputs.logits
        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
        ```
        """

        # Deprecated alias: when present, `next_sentence_label` overrides `labels`.
        if "next_sentence_label" in kwargs:
            warnings.warn(
                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use `labels` instead.",
                FutureWarning,
            )
            labels = kwargs.pop("next_sentence_label")

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.character_bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = outputs[1]

        seq_relationship_scores = self.cls(pooled_output)

        next_sentence_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))

        if not return_dict:
            output = (seq_relationship_scores,) + outputs[2:]
            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output

        return NextSentencePredictorOutput(
            loss=next_sentence_loss,
            logits=seq_relationship_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
1578
+
1579
+
1580
@add_start_docstrings(
    """
    CharacterBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    """,
    CHARACTER_BERT_START_DOCSTRING,
)
class CharacterBertForSequenceClassification(CharacterBertPreTrainedModel):
    # Encoder -> dropout -> linear classifier applied to the encoder's second
    # output (the pooled output).

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.character_bert = CharacterBertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_model_forward(
        CHARACTER_BERT_INPUTS_DOCSTRING.format(
            "(batch_size, sequence_length, maximum_token_length)", "(batch_size, sequence_length)"
        )
    )
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=SequenceClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in
            `[0, ..., config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed
            (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.character_bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Index 1 of the encoder output is the pooled representation.
        logits = self.classifier(self.dropout(encoder_outputs[1]))

        loss = None
        if labels is not None:
            if self.num_labels == 1:
                # A single output unit means regression.
                loss = MSELoss()(logits.view(-1), labels.view(-1))
            else:
                loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))

        if return_dict:
            return SequenceClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        # Tuple output: optional loss, logits, then the encoder extras.
        tuple_output = (logits,) + encoder_outputs[2:]
        return tuple_output if loss is None else (loss,) + tuple_output
1665
+
1666
+
1667
@add_start_docstrings(
    """
    CharacterBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output
    and a softmax) e.g. for RocStories/SWAG tasks.
    """,
    CHARACTER_BERT_START_DOCSTRING,
)
class CharacterBertForMultipleChoice(CharacterBertPreTrainedModel):
    # Every choice is scored independently by a one-unit linear layer on the
    # pooled output; scores are then regrouped per example.

    def __init__(self, config):
        super().__init__(config)

        self.character_bert = CharacterBertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)

        self.init_weights()

    @add_start_docstrings_to_model_forward(
        CHARACTER_BERT_INPUTS_DOCSTRING.format(
            "(batch_size, sequence_length, maximum_token_length)", "(batch_size, sequence_length)"
        )
    )
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=MultipleChoiceModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in
            `[0, ..., num_choices-1]` where `num_choices` is the size of the second dimension of the input
            tensors. (See `input_ids` above)
        """

        def merge_choices(tensor, kept_axes):
            # Collapse all leading axes into one, keeping the last `kept_axes`
            # axes unchanged; None is passed through.
            if tensor is None:
                return None
            return tensor.view(-1, *(tensor.size(-axis) for axis in range(kept_axes, 0, -1)))

        if return_dict is None:
            return_dict = self.config.use_return_dict

        # The number of choices is read from axis 1 of whichever input was given.
        reference = input_ids if input_ids is not None else inputs_embeds
        num_choices = reference.shape[1]

        input_ids = merge_choices(input_ids, 2)
        attention_mask = merge_choices(attention_mask, 1)
        token_type_ids = merge_choices(token_type_ids, 1)
        position_ids = merge_choices(position_ids, 1)
        inputs_embeds = merge_choices(inputs_embeds, 2)

        encoder_outputs = self.character_bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # One score per flattened row, regrouped into rows of `num_choices`.
        choice_scores = self.classifier(self.dropout(encoder_outputs[1]))
        reshaped_logits = choice_scores.view(-1, num_choices)

        loss = None
        if labels is not None:
            loss = CrossEntropyLoss()(reshaped_logits, labels)

        if return_dict:
            return MultipleChoiceModelOutput(
                loss=loss,
                logits=reshaped_logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        tuple_output = (reshaped_logits,) + encoder_outputs[2:]
        return tuple_output if loss is None else (loss,) + tuple_output
1759
+
1760
+
1761
@add_start_docstrings(
    """
    CharacterBERT Model with a token classification head on top (a linear layer on top of the hidden-states output)
    e.g. for Named-Entity-Recognition (NER) tasks.
    """,
    CHARACTER_BERT_START_DOCSTRING,
)
class CharacterBertForTokenClassification(CharacterBertPreTrainedModel):
    # Encoder -> dropout -> linear classifier applied to the encoder's first
    # output (the per-token sequence output).

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.character_bert = CharacterBertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_model_forward(
        CHARACTER_BERT_INPUTS_DOCSTRING.format(
            "(batch_size, sequence_length, maximum_token_length)", "(batch_size, sequence_length)"
        )
    )
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=TokenClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in
            `[0, ..., config.num_labels - 1]`.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_outputs = self.character_bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        logits = self.classifier(self.dropout(encoder_outputs[0]))

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            if attention_mask is not None:
                # Positions whose mask value is not 1 get the loss's
                # ignore_index as label, so they do not contribute.
                is_active = attention_mask.view(-1) == 1
                ignored = torch.tensor(loss_fct.ignore_index).type_as(labels)
                flat_labels = torch.where(is_active, flat_labels, ignored)
            loss = loss_fct(flat_logits, flat_labels)

        if return_dict:
            return TokenClassifierOutput(
                loss=loss,
                logits=logits,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        tuple_output = (logits,) + encoder_outputs[2:]
        return tuple_output if loss is None else (loss,) + tuple_output
1850
+
1851
+
1852
@add_start_docstrings(
    """
    CharacterBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
    linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
    """,
    CHARACTER_BERT_START_DOCSTRING,
)
class CharacterBertForQuestionAnswering(CharacterBertPreTrainedModel):
    # Encoder followed by a two-unit linear layer on the sequence output; the
    # two units are split into span-start and span-end logits.

    def __init__(self, config):
        super().__init__(config)

        # The head always has two outputs (start, end); note that this
        # overwrites `num_labels` on the config object that was passed in.
        config.num_labels = 2
        self.num_labels = config.num_labels

        self.character_bert = CharacterBertModel(config)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @add_start_docstrings_to_model_forward(
        CHARACTER_BERT_INPUTS_DOCSTRING.format(
            "(batch_size, sequence_length, maximum_token_length)", "(batch_size, sequence_length)"
        )
    )
    @add_code_sample_docstrings(
        processor_class=_TOKENIZER_FOR_DOC,
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=QuestionAnsweringModelOutput,
        config_class=_CONFIG_FOR_DOC,
    )
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification
            loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
            sequence are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification
            loss. Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the
            sequence are not taken into account for computing the loss.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.character_bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        # The loss is only produced when BOTH position tensors are supplied.
        total_loss = None
        if start_positions is not None and end_positions is not None:
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            ignored_index = start_logits.size(1)
            # Out-of-place clamp: the in-place `clamp_` used previously
            # silently rewrote the caller's label tensors.
            start_positions = start_positions.clamp(0, ignored_index)
            end_positions = end_positions.clamp(0, ignored_index)

            # Positions clamped to `ignored_index` (one past the last valid
            # index) are skipped by the loss.
            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)
            total_loss = (start_loss + end_loss) / 2

        if not return_dict:
            output = (start_logits, end_logits) + outputs[2:]
            return ((total_loss,) + output) if total_loss is not None else output

        return QuestionAnsweringModelOutput(
            loss=total_loss,
            start_logits=start_logits,
            end_logits=end_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
tokenization_character_bert.py ADDED
@@ -0,0 +1,930 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright Hicham EL BOUKKOURI, Olivier FERRET, Thomas LAVERGNE, Hiroshi NOJI,
3
+ # Pierre ZWEIGENBAUM, Junichi TSUJII and The HuggingFace Inc. team.
4
+ # All rights reserved.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+ """Tokenization classes for CharacterBERT."""
18
+ import json
19
+ import os
20
+ import unicodedata
21
+ from collections import OrderedDict
22
+ from typing import Dict, List, Optional, Tuple, Union
23
+
24
+ import numpy as np
25
+
26
+ from transformers.file_utils import is_tf_available, is_torch_available, to_py_obj
27
+ from transformers.tokenization_utils import (
28
+ BatchEncoding,
29
+ EncodedInput,
30
+ PaddingStrategy,
31
+ PreTrainedTokenizer,
32
+ TensorType,
33
+ _is_control,
34
+ _is_punctuation,
35
+ _is_whitespace,
36
+ )
37
+ from transformers.tokenization_utils_base import ADDED_TOKENS_FILE
38
+ from transformers.utils import logging
39
+
40
+
41
# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)

# Maps the tokenizer's `mlm_vocab_file` init argument to the file name of the
# Masked Language Modeling vocabulary.
VOCAB_FILES_NAMES = {
    "mlm_vocab_file": "mlm_vocab.txt",
}

# URL of the MLM vocabulary file for each pretrained checkpoint.
PRETRAINED_VOCAB_FILES_MAP = {
    "mlm_vocab_file": {
        "helboukkouri/character-bert": "https://huggingface.co/helboukkouri/character-bert/resolve/main/mlm_vocab.txt",
        "helboukkouri/character-bert-medical": "https://huggingface.co/helboukkouri/character-bert-medical/resolve/main/mlm_vocab.txt",
    }
}

# Exposed as `max_model_input_sizes` on the tokenizer class, keyed by checkpoint name.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "helboukkouri/character-bert": 512,
    "helboukkouri/character-bert-medical": 512,
}

# Exposed as `pretrained_init_configuration` on the tokenizer class, keyed by checkpoint name.
PRETRAINED_INIT_CONFIGURATION = {
    "helboukkouri/character-bert": {"max_word_length": 50, "do_lower_case": True},
    "helboukkouri/character-bert-medical": {"max_word_length": 50, "do_lower_case": True},
}

# NOTE(review): presumably the character id used to pad character sequences;
# it is not referenced in the visible part of this file -- confirm against its users.
PAD_TOKEN_CHAR_ID = 0
65
+
66
+
67
def whitespace_tokenize(text):
    """Split `text` on runs of whitespace and return the resulting tokens.

    Leading and trailing whitespace is ignored; an empty or whitespace-only
    input yields an empty list.
    """
    # `str.split()` without arguments already discards surrounding whitespace
    # and returns `[]` when nothing is left.
    return text.split()
74
+
75
+
76
def build_mlm_ids_to_tokens_mapping(mlm_vocab_file):
    """Read an MLM vocabulary file and return an ordered `id -> token` mapping.

    Each non-blank line of the file (with surrounding whitespace removed) is one
    token; ids are assigned in file order, starting at 0. Blank lines are
    skipped and do not consume an id.
    """
    with open(mlm_vocab_file, "r", encoding="utf-8") as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        tokens = [entry for entry in stripped_lines if entry]
    return OrderedDict(enumerate(tokens))
85
+
86
+
87
class CharacterBertTokenizer(PreTrainedTokenizer):
    """
    Construct a CharacterBERT tokenizer. Based on characters.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
    Users should refer to this superclass for more information regarding those methods.

    Unlike BERT, a token "id" is here a *sequence* of character ids (see `_convert_token_to_id`), so there is no
    token vocabulary. The optional MLM vocabulary is only used to map the output ids of a masked language modeling
    head back to tokens.

    Args:
        mlm_vocab_file (`str`, *optional*, defaults to `None`):
            Path to the Masked Language Modeling vocabulary. This is used for converting the output (token ids) of the
            MLM model into tokens.
        max_word_length (`int`, *optional*, defaults to `50`):
            The maximum token length in characters (actually, in bytes as any non-ascii characters will be converted to
            a sequence of utf-8 bytes).
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
            Whether or not to run basic tokenization. When disabled, the text is only split on whitespace.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
            The unknown token.
        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
            The token used for padding, for example when batching sequences of different lengths.
        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.
        strip_accents: (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

    def __init__(
        self,
        mlm_vocab_file=None,
        max_word_length=50,
        do_lower_case=True,
        do_basic_tokenize=True,
        never_split=None,
        unk_token="[UNK]",
        sep_token="[SEP]",
        pad_token="[PAD]",
        cls_token="[CLS]",
        mask_token="[MASK]",
        tokenize_chinese_chars=True,
        strip_accents=None,
        **kwargs
    ):
        """
        Build the tokenizer.

        Raises:
            ValueError: If `mlm_vocab_file` is given but does not point to an existing file.
        """
        super().__init__(
            max_word_length=max_word_length,
            do_lower_case=do_lower_case,
            do_basic_tokenize=do_basic_tokenize,
            never_split=never_split,
            unk_token=unk_token,
            sep_token=sep_token,
            pad_token=pad_token,
            cls_token=cls_token,
            mask_token=mask_token,
            tokenize_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            **kwargs,
        )
        # Remembered so that the `do_lower_case` property also works when no BasicTokenizer is created.
        self._do_lower_case = do_lower_case
        # This prevents splitting special tokens during tokenization
        self.unique_no_split_tokens = [self.cls_token, self.mask_token, self.pad_token, self.sep_token, self.unk_token]
        # This is used for converting MLM ids into tokens
        if mlm_vocab_file is None:
            self.ids_to_tokens = None
        else:
            if not os.path.isfile(mlm_vocab_file):
                raise ValueError(
                    f"Can't find a vocabulary file at path '{mlm_vocab_file}'. "
                    "To load the vocabulary from a pretrained model use "
                    "`tokenizer = CharacterBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
                )
            self.ids_to_tokens = build_mlm_ids_to_tokens_mapping(mlm_vocab_file)
        # Tokenization is handled by BasicTokenizer
        self.do_basic_tokenize = do_basic_tokenize
        if do_basic_tokenize:
            self.basic_tokenizer = BasicTokenizer(
                do_lower_case=do_lower_case,
                never_split=never_split,
                tokenize_chinese_chars=tokenize_chinese_chars,
                strip_accents=strip_accents,
            )
        # Then, a CharacterMapper is responsible for converting tokens into character ids
        self.max_word_length = max_word_length
        self._mapper = CharacterMapper(max_word_length=max_word_length)

    def __repr__(self) -> str:
        # NOTE: we overwrite this because CharacterBERT does not have self.vocab_size
        return (
            f"CharacterBertTokenizer(name_or_path='{self.name_or_path}', "
            + (f"mlm_vocab_size={self.mlm_vocab_size}, " if self.ids_to_tokens else "")
            + f"model_max_len={self.model_max_length}, is_fast={self.is_fast}, "
            + f"padding_side='{self.padding_side}', special_tokens={self.special_tokens_map_extended})"
        )

    def __len__(self):
        """
        Number of added tokens. There is no token vocabulary, so nothing else contributes to the size.
        """
        return len(self.added_tokens_encoder)

    @property
    def do_lower_case(self):
        """Whether the input is lowercased when tokenizing."""
        # `basic_tokenizer` only exists when `do_basic_tokenize=True`; fall back to the init value otherwise.
        basic_tokenizer = getattr(self, "basic_tokenizer", None)
        if basic_tokenizer is not None:
            return basic_tokenizer.do_lower_case
        return self._do_lower_case

    @property
    def vocab_size(self):
        raise NotImplementedError("CharacterBERT does not use a token vocabulary.")

    def _require_mlm_vocab(self):
        """Return the MLM `id -> token` mapping, raising `ValueError` when none was loaded."""
        if self.ids_to_tokens is None:
            raise ValueError(
                "CharacterBertTokenizer was initialized without a MLM "
                "vocabulary. You can either pass one manually or load a "
                "pre-trained tokenizer using: "
                "`tokenizer = CharacterBertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
            )
        return self.ids_to_tokens

    @property
    def mlm_vocab_size(self):
        """Number of entries in the MLM vocabulary. Raises `ValueError` when no MLM vocabulary was loaded."""
        return len(self._require_mlm_vocab())

    def add_special_tokens(self, *args, **kwargs):
        raise NotImplementedError("Adding special tokens is not supported for now.")

    def add_tokens(self, *args, **kwargs):
        """Ignore the request and report that no token was added."""
        # We don't raise an Exception here to allow for ignoring this step.
        # Otherwise, many inherited methods would need to be re-implemented...
        return 0

    def get_vocab(self):
        raise NotImplementedError("CharacterBERT does not have a token vocabulary.")

    def get_mlm_vocab(self):
        """Return the MLM vocabulary as a `token -> id` dict. Raises `ValueError` when none was loaded."""
        return {token: i for i, token in self._require_mlm_vocab().items()}

    def _tokenize(self, text):
        """Split `text` into tokens with the BasicTokenizer, or on whitespace when basic tokenization is off."""
        if self.do_basic_tokenize:
            return self.basic_tokenizer.tokenize(text=text, never_split=self.all_special_tokens)
        return whitespace_tokenize(text)  # Default to whitespace tokenization

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        return " ".join(tokens).strip()

    def _convert_token_to_id(self, token):
        """Converts a token (str) into a sequence of character ids."""
        return self._mapper.convert_word_to_char_ids(token)

    def _convert_id_to_token(self, index: List[int]):
        """Converts an index (actually, a list of indices) in a token (str)."""
        # NOTE: keeping the same variable name `index` although this will
        # always be a sequence of indices.
        return self._mapper.convert_char_ids_to_word(index)

    def convert_ids_to_tokens(
        self, ids: Union[List[int], List[List[int]]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single sequence of character indices or a sequence of character id sequences in a token or a
        sequence of tokens.

        Args:
            ids (`List[int]` or `List[List[int]]`):
                The character ids of one token, or a sequence of such character id sequences.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding. Only applies when a sequence of character id
                sequences is given.

        Returns:
            `str` or `List[str]`: The decoded token(s). An empty input yields an empty list.
        """
        # A flat list of ints is the character ids of ONE token. The emptiness guard avoids indexing `ids[0]`
        # on an empty list.
        if isinstance(ids, list) and ids and isinstance(ids[0], int):
            key = tuple(ids)
            if key in self.added_tokens_decoder:
                return self.added_tokens_decoder[key]
            return self._convert_id_to_token(ids)

        # Special token ids are character id sequences too: normalise them to tuples once so that the membership
        # test below compares like with like (and is not recomputed for every token).
        special_ids = set()
        if skip_special_tokens:
            special_ids = {tuple(map(int, special)) for special in self.all_special_ids}

        tokens = []
        for indices in ids:
            indices = list(map(int, indices))
            key = tuple(indices)
            if key in special_ids:
                continue
            if key in self.added_tokens_decoder:
                tokens.append(self.added_tokens_decoder[key])
            else:
                tokens.append(self._convert_id_to_token(indices))
        return tokens

    def convert_mlm_id_to_token(self, mlm_id):
        """
        Converts an index (integer) in a token (str) using the MLM vocabulary.

        Raises:
            ValueError: If the tokenizer was initialized without a MLM vocabulary.
        """
        ids_to_tokens = self._require_mlm_vocab()
        assert mlm_id < len(
            ids_to_tokens
        ), "Attempting to convert a MLM id that is greater than the MLM vocabulary size."
        return ids_to_tokens[mlm_id]

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[List[int]], token_ids_1: Optional[List[List[int]]] = None
    ) -> List[List[int]]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A CharacterBERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[List[int]]`):
                List of token ids (each one a sequence of character ids) to which the special tokens will be added.
            token_ids_1 (`List[List[int]]`, *optional*):
                Optional second list of token ids for sequence pairs.

        Returns:
            `List[List[int]]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self,
        token_ids_0: List[List[int]],
        token_ids_1: Optional[List[List[int]]] = None,
        already_has_special_tokens: bool = False,
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[List[int]]`):
                List of token ids.
            token_ids_1 (`List[List[int]]`, *optional*):
                Optional second list of token ids for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            ValueError: If `already_has_special_tokens` is set and a second sequence is given.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formatted with special tokens for the model."
                )
            # Look the special ids up once instead of once per token.
            special_ids = [self.sep_token_id, self.cls_token_id]
            return [1 if token_id in special_ids else 0 for token_id in token_ids_0]

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[List[int]], token_ids_1: Optional[List[List[int]]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. A CharacterBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second sequence |
        ```

        If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[List[int]]`):
                List of token ids.
            token_ids_1 (`List[List[int]]`, *optional*):
                Optional second list of token ids for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given
            sequence(s).
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        first_segment = len(cls + token_ids_0 + sep) * [0]
        if token_ids_1 is None:
            return first_segment
        return first_segment + len(token_ids_1 + sep) * [1]

    # NOTE: `pad` and `_pad` are not overridden here; the implementations inherited from the base class are used.
    # (A commented-out copy of those methods used to live at this spot.)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """Save nothing and return an empty tuple: there is no token vocabulary to write."""
        logger.warning("CharacterBERT does not have a token vocabulary. Skipping saving `vocab.txt`.")
        return ()

    def save_mlm_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Write the MLM vocabulary, one token per line, and return the path of the written file as a 1-tuple.

        If `save_directory` is an existing directory the file is named `mlm_vocab.txt` (with the optional prefix)
        inside it; otherwise `save_directory` itself is used as the file path. When the tokenizer has no MLM
        vocabulary nothing is written and an empty tuple is returned.
        """
        # NOTE: CharacterBERT has no token vocabulary, this is just to allow
        # saving tokenizer configuration via CharacterBertTokenizer.save_pretrained
        if self.ids_to_tokens is None:
            # A tokenizer built without a MLM vocabulary must still be saveable.
            logger.warning("CharacterBertTokenizer has no MLM vocabulary. Skipping saving `mlm_vocab.txt`.")
            return ()
        prefix = filename_prefix + "-" if filename_prefix else ""
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(save_directory, prefix + "mlm_vocab.txt")
        else:
            vocab_file = prefix + save_directory
        with open(vocab_file, "w", encoding="utf-8") as f:
            f.writelines(token + "\n" for token in self.ids_to_tokens.values())
        return (vocab_file,)

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: MLM vocabulary + added tokens.

        Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
        specific [`~tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`]

        Returns:
            `Tuple[str]`: `file_names` followed by the MLM vocabulary path (if one was written) and the added tokens
            file path. The added tokens path is returned even when there were no added tokens to write.

        Raises:
            ValueError: If `legacy_format` is `False`.
        """
        if legacy_format is False:
            raise ValueError(
                "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
            )

        save_directory = str(save_directory)

        added_tokens_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
        )
        added_vocab = self.get_added_vocab()
        if added_vocab:
            with open(added_tokens_file, "w", encoding="utf-8") as f:
                out_str = json.dumps(added_vocab, ensure_ascii=False)
                f.write(out_str)
                logger.info(f"added tokens file saved in {added_tokens_file}")

        vocab_files = self.save_mlm_vocabulary(save_directory, filename_prefix=filename_prefix)

        return file_names + vocab_files + (added_tokens_file,)
688
+
689
+
690
class BasicTokenizer:
    """
    Runs basic tokenization: text cleanup, whitespace splitting, optional lower casing / accent stripping and
    punctuation splitting.

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which are kept as they are (no lower casing, accent stripping or punctuation
            splitting).
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to surround CJK characters with spaces so each one becomes its own token.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If left to `None`, accents are stripped exactly when
            `do_lower_case` is enabled (as in the original BERT).
    """

    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split) if never_split is not None else set()
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents

    def tokenize(self, text, never_split=None):
        """
        Split `text` into basic tokens (whitespace and punctuation boundaries).

        Args:
            text: The string to tokenize.
            never_split (*optional*): Extra tokens to protect for this call, merged with the ones given at
                construction time.

        Returns:
            The list of resulting token strings.
        """
        # union() builds a new set, so the instance-level collection is left untouched.
        protected = self.never_split
        if never_split:
            protected = protected.union(set(never_split))

        cleaned = self._clean_text(text)
        # Isolating CJK characters was introduced for the multilingual and Chinese models; it is harmless for
        # text that contains no such characters.
        if self.tokenize_chinese_chars:
            cleaned = self._tokenize_chinese_chars(cleaned)

        pieces = []
        for word in whitespace_tokenize(cleaned):
            if word not in protected:
                if self.do_lower_case:
                    word = word.lower()
                    # With lower casing on, accents are stripped unless explicitly disabled.
                    strip = self.strip_accents is not False
                else:
                    # Without lower casing, accents are stripped only on explicit request.
                    strip = bool(self.strip_accents)
                if strip:
                    word = self._run_strip_accents(word)
            pieces.extend(self._run_split_on_punc(word, protected))

        return whitespace_tokenize(" ".join(pieces))

    def _run_strip_accents(self, text):
        """Return `text` with combining marks (Unicode category "Mn") removed after NFD normalization."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    def _run_split_on_punc(self, text, never_split=None):
        """Split `text` so that every punctuation character becomes a separate piece."""
        if never_split is not None and text in never_split:
            return [text]

        groups = []
        open_group = False  # True while the last group is a run of non-punctuation characters
        for ch in text:
            if _is_punctuation(ch):
                groups.append([ch])
                open_group = False
                continue
            if not open_group:
                groups.append([])
                open_group = True
            groups[-1].append(ch)

        return ["".join(group) for group in groups]

    def _tokenize_chinese_chars(self, text):
        """Return `text` with a space inserted on both sides of every CJK character."""
        return "".join(f" {ch} " if self._is_chinese_char(ord(ch)) else ch for ch in text)

    def _is_chinese_char(self, cp):
        """Tell whether the codepoint `cp` belongs to one of the CJK ideograph ranges listed below."""
        # "Chinese character" here means anything in the CJK Unicode blocks:
        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Despite the name, these blocks do NOT cover all Japanese and Korean characters: Hangul, Hiragana and
        # Katakana live in other blocks. Those scripts write space-separated words, so they get no special
        # treatment and are handled like every other language.
        cjk_blocks = (
            (0x4E00, 0x9FFF),
            (0x3400, 0x4DBF),
            (0x20000, 0x2A6DF),
            (0x2A700, 0x2B73F),
            (0x2B740, 0x2B81F),
            (0x2B820, 0x2CEAF),
            (0xF900, 0xFAFF),
            (0x2F800, 0x2FA1F),
        )
        return any(low <= cp <= high for low, high in cjk_blocks)

    def _clean_text(self, text):
        """Drop NUL, U+FFFD and control characters from `text` and turn every whitespace character into " "."""
        kept = []
        for ch in text:
            if ord(ch) in (0, 0xFFFD) or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)
836
+
837
+
838
class CharacterMapper:
    """
    Maps individual tokens to fixed-length sequences of character ids, compatible with CharacterBERT, and back.

    NOTE: Adapted from ElmoCharacterMapper:
    https://github.com/allenai/allennlp/blob/main/allennlp/data/token_indexers/elmo_indexer.py

    Args:
        max_word_length (`int`, *optional*, defaults to 50):
            Length of every character id sequence. Words are encoded as UTF-8 bytes and truncated to
            `max_word_length - 2` bytes to leave room for the begin/end-of-word markers.
    """

    # char ids 0-255 come from utf-8 encoding bytes
    # assign 256-300 to special chars
    beginning_of_sentence_character = 256  # <begin sentence>
    end_of_sentence_character = 257  # <end sentence>
    beginning_of_word_character = 258  # <begin word>
    end_of_word_character = 259  # <end word>
    padding_character = 260  # <padding> | short tokens are padded using this + 1
    mask_character = 261  # <mask>

    bos_token = "[CLS]"  # previously: bos_token = "<S>"
    eos_token = "[SEP]"  # previously: eos_token = "</S>"
    pad_token = "[PAD]"
    mask_token = "[MASK]"

    def __init__(
        self,
        max_word_length: int = 50,
    ):
        self.max_word_length = max_word_length
        self.beginning_of_sentence_characters = self._make_char_id_sequence(self.beginning_of_sentence_character)
        self.end_of_sentence_characters = self._make_char_id_sequence(self.end_of_sentence_character)
        self.mask_characters = self._make_char_id_sequence(self.mask_character)
        # This is the character id sequence for the pad token (i.e. [PAD]).
        # 1 is subtracted here because `convert_word_to_char_ids` adds 1 to every id it returns, so the emitted
        # ids end up equal to PAD_TOKEN_CHAR_ID.
        self.pad_characters = [PAD_TOKEN_CHAR_ID - 1] * self.max_word_length

    def _make_char_id_sequence(self, character: int):
        """Build the (unshifted) sequence `<begin word> character <end word> <padding>...` for a special symbol."""
        char_ids = [self.padding_character] * self.max_word_length
        char_ids[0] = self.beginning_of_word_character
        char_ids[1] = character
        char_ids[2] = self.end_of_word_character
        return char_ids

    def convert_word_to_char_ids(self, word: str) -> List[int]:
        """
        Convert a token into its sequence of `max_word_length` character ids.

        The special tokens `[CLS]`, `[SEP]`, `[MASK]` and `[PAD]` map to their dedicated sequences. Any other
        word is encoded as UTF-8 bytes (truncated to `max_word_length - 2` bytes), wrapped in begin/end-of-word
        markers and right-padded.

        Returns:
            `List[int]`: The character ids, each shifted by +1.
        """
        if word == self.bos_token:
            char_ids = self.beginning_of_sentence_characters
        elif word == self.eos_token:
            char_ids = self.end_of_sentence_characters
        elif word == self.mask_token:
            char_ids = self.mask_characters
        elif word == self.pad_token:
            char_ids = self.pad_characters
        else:
            # Convert characters to indices (byte-level truncation: a trailing multi-byte character may be cut)
            word_encoded = word.encode("utf-8", "ignore")[: (self.max_word_length - 2)]
            # Initialize character_ids with padding
            char_ids = [self.padding_character] * self.max_word_length
            # First character is BeginningOfWord
            char_ids[0] = self.beginning_of_word_character
            # Populate character_ids with computed indices
            for k, chr_id in enumerate(word_encoded, start=1):
                char_ids[k] = chr_id
            # Last character is EndOfWord
            char_ids[len(word_encoded) + 1] = self.end_of_word_character

        # +1 one for masking so that character padding == 0
        # char_ids domain is therefore: (1, 256) for actual characters
        # and (257-262) for special symbols (BOS/EOS/BOW/EOW/padding/MLM Mask)
        # A new list is built, so the shared special-token sequences are never mutated.
        return [c + 1 for c in char_ids]

    def convert_char_ids_to_word(self, char_ids: List[int]) -> str:
        """
        Convert a sequence of character ids (as produced by `convert_word_to_char_ids`) back into a word.

        Sequences matching one of the special tokens return that token. Otherwise the word-boundary and padding
        markers are removed and the remaining values are decoded as UTF-8. Bytes that do not form a complete
        character (e.g. a multi-byte character cut by the length truncation done at encoding time) are dropped
        instead of raising `UnicodeDecodeError`.

        Raises:
            AssertionError: If `char_ids` is longer than `max_word_length`.
        """
        assert len(char_ids) <= self.max_word_length, (
            f"Got character sequence of length {len(char_ids)} while `max_word_length={self.max_word_length}`"
        )

        # Undo the +1 shift applied by `convert_word_to_char_ids`.
        char_ids_ = [(i - 1) for i in char_ids]
        if char_ids_ == self.beginning_of_sentence_characters:
            return self.bos_token
        elif char_ids_ == self.end_of_sentence_characters:
            return self.eos_token
        elif char_ids_ == self.mask_characters:
            return self.mask_token
        elif char_ids_ == self.pad_characters:  # token padding
            return self.pad_token

        markers = (
            self.padding_character,
            self.beginning_of_word_character,
            self.end_of_word_character,
        )
        utf8_codes = [code for code in char_ids_ if code not in markers]
        # Mirror the "ignore" policy used when encoding: a truncated trailing character is silently dropped.
        return bytes(utf8_codes).decode("utf-8", errors="ignore")
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"max_word_length": 50, "do_lower_case": true, "do_basic_tokenize": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
 
1
+ {"name_or_path": "helboukkouri/character-bert-medical", "tokenizer_class": "CharacterBertTokenizer", "max_word_length": 50, "do_lower_case": true, "do_basic_tokenize": true, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "auto_map": {"AutoTokenizer": ["tokenization_character_bert.CharacterBertTokenizer", null]}}