"""Tokenization classes for Italian AlBERTo models.""" |
|
import collections |
|
import logging |
|
import os |
|
import re |
|
import logger |
|
|
|
try:
    from ekphrasis.classes.preprocessor import TextPreProcessor
    from ekphrasis.classes.tokenizer import SocialTokenizer
    from ekphrasis.dicts.emoticons import emoticons
except ImportError:
    logger.warning(
        "You need to install ekphrasis to use AlBERToTokenizer: "
        "pip install ekphrasis"
    )
    from pip._internal import main as pip
    pip(['install', '--user', 'ekphrasis'])
    from ekphrasis.classes.preprocessor import TextPreProcessor
    from ekphrasis.classes.tokenizer import SocialTokenizer
    from ekphrasis.dicts.emoticons import emoticons

try:
    import numpy as np
except ImportError:
    logger.warning(
        "You need to install numpy to use AlBERToTokenizer: "
        "pip install numpy"
    )
    from pip._internal import main as pip
    pip(['install', '--user', 'numpy'])
    import numpy as np

try:
    from transformers import BertTokenizer, WordpieceTokenizer
    from transformers.tokenization_bert import load_vocab
except ImportError:
    logger.warning(
        "You need to install transformers to use AlBERToTokenizer: "
        "pip install transformers"
    )
    from pip._internal import main as pip
    pip(['install', '--user', 'transformers'])
    from transformers import BertTokenizer, WordpieceTokenizer
    from transformers.tokenization_bert import load_vocab

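# Shared ekphrasis pre-processor: normalises URLs, e-mails, user mentions,
# percentages, money, phone numbers, times, dates and numbers, annotates and
# unpacks hashtags, fixes HTML entities, tokenises with the social-media-aware
# tokenizer and maps emoticons via the ekphrasis emoticon dictionary.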
text_processor = TextPreProcessor(
    normalize=['url', 'email', 'user', 'percent', 'money', 'phone', 'time', 'date', 'number'],
    annotate={"hashtag"},
    fix_html=True,
    unpack_hashtags=True,
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    dicts=[emoticons]
)

class AlBERTo_Preprocessing(object):
    """Stand-alone AlBERTo preprocessing: lower-casing, ekphrasis normalisation
    and regex clean-up of social-media text."""

    def __init__(self, do_lower_case=True, **kwargs):
        self.do_lower_case = do_lower_case

    def preprocess(self, text):
        if self.do_lower_case:
            text = text.lower()
        text = str(" ".join(text_processor.pre_process_doc(text)))
        # Keep letters (including accented ones), the markers < / > ! ?, hearts,
        # whitespace and emoji; replace every other character with a space.
        text = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ', text)
        # Collapse whitespace and shorten runs of three or more identical
        # characters to two (e.g. "ciaoooo" -> "ciaoo").
        text = re.sub(r'\s+', ' ', text)
        text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
        # Strip leading and trailing whitespace.
        text = re.sub(r'^\s', '', text)
        text = re.sub(r'\s$', '', text)
        return text

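# Tokenizer with the BertTokenizer interface that can optionally run the
# AlBERTo social-media preprocessing step before tokenization.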
class AlBERToTokenizer(BertTokenizer):

    def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True,
                 do_char_tokenize=False, do_wordpiece_tokenize=False,
                 do_preprocessing=True, unk_token='[UNK]', sep_token='[SEP]',
                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
        # Bypass BertTokenizer.__init__ and initialise the next class in the MRO
        # directly; a plain BertTokenizer is kept in self.base_bert_tok instead.
        super(BertTokenizer, self).__init__(
            unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
            cls_token=cls_token, mask_token=mask_token, **kwargs)

        self.do_wordpiece_tokenize = do_wordpiece_tokenize
        self.do_lower_case = do_lower_case
        self.vocab_file = vocab_file
        self.do_basic_tokenize = do_basic_tokenize
        self.do_char_tokenize = do_char_tokenize
        self.unk_token = unk_token
        self.do_preprocessing = do_preprocessing

        if not os.path.isfile(vocab_file):
            raise ValueError(
                "Can't find a vocabulary file at path '{}'.".format(vocab_file))

        self.vocab = load_vocab(vocab_file)
        self.ids_to_tokens = collections.OrderedDict(
            [(ids, tok) for tok, ids in self.vocab.items()])

        if do_wordpiece_tokenize:
            self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab,
                                                          unk_token=self.unk_token)

        self.base_bert_tok = BertTokenizer(vocab_file=self.vocab_file,
                                           do_lower_case=do_lower_case,
                                           unk_token=unk_token, sep_token=sep_token,
                                           pad_token=pad_token, cls_token=cls_token,
                                           mask_token=mask_token, **kwargs)

    def _convert_token_to_id(self, token):
        """Converts a token (str/unicode) to an id using the vocab."""
        return self.vocab.get(token, self.vocab.get(self.unk_token))

    def convert_token_to_id(self, token):
        return self._convert_token_to_id(token)

    def _convert_id_to_token(self, id):
        """Converts an index (integer) to a token (str/unicode) using the vocab."""
        return list(self.vocab.keys())[int(id)]

    def convert_id_to_token(self, id):
        return self._convert_id_to_token(id)

    def _convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) to a single string."""
        # Merge WordPiece continuation pieces (prefixed with ' ##') back into words.
        out_string = ' '.join(tokens).replace(' ##', '').strip()
        return out_string

    def convert_tokens_to_string(self, tokens):
        return self._convert_tokens_to_string(tokens)

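    # Tokenization entry point: optionally apply the AlBERTo preprocessing, then
    # dispatch to WordPiece, character-level or plain BERT tokenization according
    # to the flags passed at construction time.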
    def _tokenize(self, text, never_split=None, **kwargs):
        if self.do_preprocessing:
            if self.do_lower_case:
                text = text.lower()
            text = str(" ".join(text_processor.pre_process_doc(text)))
            text = re.sub(r'[^a-zA-ZÀ-ú</>!?♥♡\s\U00010000-\U0010ffff]', ' ', text)
            text = re.sub(r'\s+', ' ', text)
            text = re.sub(r'(\w)\1{2,}', r'\1\1', text)
            text = re.sub(r'^\s', '', text)
            text = re.sub(r'\s$', '', text)

        split_tokens = [text]
        if self.do_wordpiece_tokenize:
            wordpiece_tokenizer = WordpieceTokenizer(self.vocab, self.unk_token)
            split_tokens = wordpiece_tokenizer.tokenize(text)
        elif self.do_char_tokenize:
            tokenizer = CharacterTokenizer(self.vocab, self.unk_token)
            split_tokens = tokenizer.tokenize(text)
        elif self.do_basic_tokenize:
            # Default path: delegate to the wrapped BertTokenizer.
            split_tokens = self.base_bert_tok.tokenize(text)

        return split_tokens

    def tokenize(self, text, never_split=None, **kwargs):
        return self._tokenize(text, never_split)

class CharacterTokenizer(object):
    """Runs character tokenization."""

    def __init__(self, vocab, unk_token,
                 max_input_chars_per_word=100, with_markers=True):
        """Constructs a CharacterTokenizer.

        Args:
            vocab: Vocabulary object.
            unk_token: A special symbol for out-of-vocabulary tokens.
            with_markers: If True, "##" is prepended to each output character
                except the first one.
        """
        self.vocab = vocab
        self.unk_token = unk_token
        self.max_input_chars_per_word = max_input_chars_per_word
        self.with_markers = with_markers

    def tokenize(self, text):
        """Tokenizes a piece of text into characters.

        For example:
            input = "apple"
            output = ["a", "##p", "##p", "##l", "##e"] (if self.with_markers is True)
            output = ["a", "p", "p", "l", "e"] (if self.with_markers is False)

        Args:
            text: A single token or whitespace-separated tokens. This should have
                already been passed through `BasicTokenizer`.

        Returns:
            A list of characters.
        """
        output_tokens = []
        for i, char in enumerate(text):
            # Characters missing from the vocabulary are mapped to the unk token.
            if char not in self.vocab:
                output_tokens.append(self.unk_token)
                continue
            if self.with_markers and i != 0:
                output_tokens.append('##' + char)
            else:
                output_tokens.append(char)
        return output_tokens

if __name__ == "__main__":
    a = AlBERTo_Preprocessing(do_lower_case=True)
    s = "#IlGOverno presenta le linee guida sulla scuola #labuonascuola - http://t.co/SYS1T9QmQN"
    b = a.preprocess(s)
    print(b)

    c = AlBERToTokenizer(do_lower_case=True, vocab_file="vocab.txt", do_preprocessing=True)
    d = c.tokenize(s)
    print(d)