Upload folder using huggingface_hub
- .gitattributes +4 -0
- IndicTransTokenizer/.gitignore +3 -0
- IndicTransTokenizer/IndicTransTokenizer/__init__.py +2 -0
- IndicTransTokenizer/IndicTransTokenizer/en-indic/dict.SRC.json +0 -0
- IndicTransTokenizer/IndicTransTokenizer/en-indic/dict.TGT.json +0 -0
- IndicTransTokenizer/IndicTransTokenizer/en-indic/model.SRC +0 -0
- IndicTransTokenizer/IndicTransTokenizer/en-indic/model.TGT +3 -0
- IndicTransTokenizer/IndicTransTokenizer/indic-en/dict.SRC.json +0 -0
- IndicTransTokenizer/IndicTransTokenizer/indic-en/dict.TGT.json +0 -0
- IndicTransTokenizer/IndicTransTokenizer/indic-en/model.SRC +3 -0
- IndicTransTokenizer/IndicTransTokenizer/indic-en/model.TGT +0 -0
- IndicTransTokenizer/IndicTransTokenizer/indic-indic/dict.SRC.json +0 -0
- IndicTransTokenizer/IndicTransTokenizer/indic-indic/dict.TGT.json +0 -0
- IndicTransTokenizer/IndicTransTokenizer/indic-indic/model.SRC +3 -0
- IndicTransTokenizer/IndicTransTokenizer/indic-indic/model.TGT +3 -0
- IndicTransTokenizer/IndicTransTokenizer/tokenizer.py +262 -0
- IndicTransTokenizer/IndicTransTokenizer/utils.py +530 -0
- IndicTransTokenizer/IndicTransTokenizer/version.py +1 -0
- IndicTransTokenizer/IndicTransTokenizer/version.txt +1 -0
- IndicTransTokenizer/LICENSE +21 -0
- IndicTransTokenizer/README.md +77 -0
- IndicTransTokenizer/requirements.txt +6 -0
- IndicTransTokenizer/setup.py +47 -0
- README.md +5 -4
- app.py +87 -0
- config.py +5 -0
- examples.py +11 -0
- indictrans2.py +98 -0
- requirements.txt +8 -0
.gitattributes
CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+IndicTransTokenizer/IndicTransTokenizer/en-indic/model.TGT filter=lfs diff=lfs merge=lfs -text
+IndicTransTokenizer/IndicTransTokenizer/indic-en/model.SRC filter=lfs diff=lfs merge=lfs -text
+IndicTransTokenizer/IndicTransTokenizer/indic-indic/model.SRC filter=lfs diff=lfs merge=lfs -text
+IndicTransTokenizer/IndicTransTokenizer/indic-indic/model.TGT filter=lfs diff=lfs merge=lfs -text
IndicTransTokenizer/.gitignore
ADDED
@@ -0,0 +1,3 @@
+dist/
+IndicTransTokenizer.egg-info
+IndicTransTokenizer/__pycache__/
IndicTransTokenizer/IndicTransTokenizer/__init__.py
ADDED
@@ -0,0 +1,2 @@
+from .tokenizer import IndicTransTokenizer
+from .utils import IndicProcessor
IndicTransTokenizer/IndicTransTokenizer/en-indic/dict.SRC.json
ADDED
The diff for this file is too large to render; see the raw diff.
IndicTransTokenizer/IndicTransTokenizer/en-indic/dict.TGT.json
ADDED
The diff for this file is too large to render; see the raw diff.
IndicTransTokenizer/IndicTransTokenizer/en-indic/model.SRC
ADDED
Binary file (759 kB).
IndicTransTokenizer/IndicTransTokenizer/en-indic/model.TGT
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac9257c8e76b8b607705b959cc3d075656ea33032f7a974e467b8941df6e98d4
+size 3256903
IndicTransTokenizer/IndicTransTokenizer/indic-en/dict.SRC.json
ADDED
The diff for this file is too large to render; see the raw diff.
IndicTransTokenizer/IndicTransTokenizer/indic-en/dict.TGT.json
ADDED
The diff for this file is too large to render; see the raw diff.
IndicTransTokenizer/IndicTransTokenizer/indic-en/model.SRC
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac9257c8e76b8b607705b959cc3d075656ea33032f7a974e467b8941df6e98d4
+size 3256903
IndicTransTokenizer/IndicTransTokenizer/indic-en/model.TGT
ADDED
Binary file (759 kB).
IndicTransTokenizer/IndicTransTokenizer/indic-indic/dict.SRC.json
ADDED
The diff for this file is too large to render; see the raw diff.
IndicTransTokenizer/IndicTransTokenizer/indic-indic/dict.TGT.json
ADDED
The diff for this file is too large to render; see the raw diff.
IndicTransTokenizer/IndicTransTokenizer/indic-indic/model.SRC
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac9257c8e76b8b607705b959cc3d075656ea33032f7a974e467b8941df6e98d4
+size 3256903
IndicTransTokenizer/IndicTransTokenizer/indic-indic/model.TGT
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ac9257c8e76b8b607705b959cc3d075656ea33032f7a974e467b8941df6e98d4
+size 3256903
IndicTransTokenizer/IndicTransTokenizer/tokenizer.py
ADDED
@@ -0,0 +1,262 @@
+import os
+import json
+import torch
+from transformers import BatchEncoding
+from typing import Dict, List, Tuple, Union
+from sentencepiece import SentencePieceProcessor
+
+_PATH = os.path.dirname(os.path.realpath(__file__))
+
+
+class IndicTransTokenizer:
+    def __init__(
+        self,
+        direction=None,
+        model_name=None,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="<pad>",
+        model_max_length=256,
+    ):
+        self.model_max_length = model_max_length
+
+        self.supported_langs = [
+            "asm_Beng",
+            "awa_Deva",
+            "ben_Beng",
+            "bho_Deva",
+            "brx_Deva",
+            "doi_Deva",
+            "eng_Latn",
+            "gom_Deva",
+            "gon_Deva",
+            "guj_Gujr",
+            "hin_Deva",
+            "hne_Deva",
+            "kan_Knda",
+            "kas_Arab",
+            "kas_Deva",
+            "kha_Latn",
+            "lus_Latn",
+            "mag_Deva",
+            "mai_Deva",
+            "mal_Mlym",
+            "mar_Deva",
+            "mni_Beng",
+            "mni_Mtei",
+            "npi_Deva",
+            "ory_Orya",
+            "pan_Guru",
+            "san_Deva",
+            "sat_Olck",
+            "snd_Arab",
+            "snd_Deva",
+            "tam_Taml",
+            "tel_Telu",
+            "urd_Arab",
+            "unr_Deva",
+        ]
+
+        if model_name is None and direction is None:
+            raise ValueError("Either model_name or direction must be provided!")
+
+        if model_name is not None:
+            direction = self.get_direction(model_name)  # model_name overrides direction
+
+        self.src_vocab_fp = os.path.join(_PATH, direction, "dict.SRC.json")
+        self.tgt_vocab_fp = os.path.join(_PATH, direction, "dict.TGT.json")
+        self.src_spm_fp = os.path.join(_PATH, direction, "model.SRC")
+        self.tgt_spm_fp = os.path.join(_PATH, direction, "model.TGT")
+
+        self.unk_token = unk_token
+        self.pad_token = pad_token
+        self.eos_token = eos_token
+        self.bos_token = bos_token
+
+        self.encoder = self._load_json(self.src_vocab_fp)
+        if self.unk_token not in self.encoder:
+            raise KeyError("<unk> token must be in vocab")
+        assert self.pad_token in self.encoder
+        self.encoder_rev = {v: k for k, v in self.encoder.items()}
+
+        self.decoder = self._load_json(self.tgt_vocab_fp)
+        if self.unk_token not in self.encoder:
+            raise KeyError("<unk> token must be in vocab")
+        assert self.pad_token in self.encoder
+        self.decoder_rev = {v: k for k, v in self.decoder.items()}
+
+        # load SentencePiece model for pre-processing
+        self.src_spm = self._load_spm(self.src_spm_fp)
+        self.tgt_spm = self._load_spm(self.tgt_spm_fp)
+
+        self.unk_token_id = self.encoder[self.unk_token]
+        self.pad_token_id = self.encoder[self.pad_token]
+        self.eos_token_id = self.encoder[self.eos_token]
+        self.bos_token_id = self.encoder[self.bos_token]
+
+    def get_direction(self, model_name: str) -> str:
+        pieces = model_name.split("/")[-1].split("-")
+        return f"{pieces[1]}-{pieces[2]}"
+
+    def is_special_token(self, x: str):
+        return (x == self.pad_token) or (x == self.bos_token) or (x == self.eos_token)
+
+    def get_vocab_size(self, src: bool) -> int:
+        """Returns the size of the vocabulary"""
+        return len(self.encoder) if src else len(self.decoder)
+
+    def _load_spm(self, path: str) -> SentencePieceProcessor:
+        return SentencePieceProcessor(model_file=path)
+
+    def _save_json(self, data, path: str) -> None:
+        with open(path, "w", encoding="utf-8") as f:
+            json.dump(data, f, indent=2)
+
+    def _load_json(self, path: str) -> Union[Dict, List]:
+        with open(path, "r", encoding="utf-8") as f:
+            return json.load(f)
+
+    def _convert_token_to_id(self, token: str, src: bool) -> int:
+        """Converts an token (str) into an index (integer) using the source/target vocabulary map."""
+        return (
+            self.encoder.get(token, self.encoder[self.unk_token])
+            if src
+            else self.decoder.get(token, self.encoder[self.unk_token])
+        )
+
+    def _convert_id_to_token(self, index: int, src: bool) -> str:
+        """Converts an index (integer) into a token (str) using the source/target vocabulary map."""
+        return (
+            self.encoder_rev.get(index, self.unk_token)
+            if src
+            else self.decoder_rev.get(index, self.unk_token)
+        )
+
+    def _convert_tokens_to_string(self, tokens: List[str], src: bool) -> str:
+        """Uses sentencepiece model for detokenization"""
+        if src:
+            if tokens[0] in self.supported_langs and tokens[1] in self.supported_langs:
+                tokens = tokens[2:]
+            return " ".join(tokens)
+        else:
+            return " ".join(tokens)
+
+    def _remove_translation_tags(self, text: str) -> Tuple[List, str]:
+        """Removes the translation tags before text normalization and tokenization."""
+        tokens = text.split(" ")
+        return tokens[:2], " ".join(tokens[2:])
+
+    def _tokenize_src_line(self, line: str) -> List[str]:
+        """Tokenizes a source line."""
+        tags, text = self._remove_translation_tags(line)
+        tokens = self.src_spm.encode(text, out_type=str)
+        return tags + tokens
+
+    def _tokenize_tgt_line(self, line: str) -> List[str]:
+        """Tokenizes a target line."""
+        return self.tgt_spm.encode(line, out_type=str)
+
+    def tokenize(self, text: str, src: bool) -> List[str]:
+        """Tokenizes a string into tokens using the source/target vocabulary."""
+        return self._tokenize_src_line(text) if src else self._tokenize_tgt_line(text)
+
+    def batch_tokenize(self, batch: List[str], src: bool) -> List[List[str]]:
+        """Tokenizes a list of strings into tokens using the source/target vocabulary."""
+        return [self.tokenize(line, src) for line in batch]
+
+    def _create_attention_mask(self, ids: List[int], max_seq_len: int, src: bool) -> List[int]:
+        """Creates a attention mask for the input sequence."""
+        if src:
+            return [0] * (max_seq_len - len(ids)) + [1] * (len(ids) + 1)
+        else:
+            return [1] * (len(ids) + 1) + [0] * (max_seq_len - len(ids))
+
+    def _pad_batch(self, tokens: List[str], max_seq_len: int, src: bool) -> List[str]:
+        """Pads a batch of tokens and adds BOS/EOS tokens."""
+        if src:
+            return [self.pad_token] * (max_seq_len - len(tokens)) + tokens + [self.eos_token]
+        else:
+            return tokens + [self.eos_token] + [self.pad_token] * (max_seq_len - len(tokens))
+
+    def _decode_line(self, ids: List[int], src: bool) -> List[str]:
+        return [self._convert_id_to_token(_id, src) for _id in ids]
+
+    def _encode_line(self, tokens: List[str], src: bool) -> List[int]:
+        return [self._convert_token_to_id(token, src) for token in tokens]
+
+    def _strip_special_tokens(self, tokens: List[str]) -> List[str]:
+        return [token for token in tokens if not self.is_special_token(token)]
+
+    def _single_input_preprocessing(
+        self, tokens: List[str], src: bool, max_seq_len: int
+    ) -> Tuple[List[int], List[int], int]:
+        """Tokenizes a string into tokens and also converts them into integers using source/target vocabulary map."""
+        attention_mask = self._create_attention_mask(tokens, max_seq_len, src)
+        padded_tokens = self._pad_batch(tokens, max_seq_len, src)
+        input_ids = self._encode_line(padded_tokens, src)
+        return input_ids, attention_mask
+
+    def _single_output_postprocessing(self, ids: List[int], src: bool) -> str:
+        """Detokenizes a list of integer ids into a string using the source/target vocabulary."""
+        tokens = self._decode_line(ids, src)
+        tokens = self._strip_special_tokens(tokens)
+        return (
+            self._convert_tokens_to_string(tokens, src).replace(" ", "").replace("▁", " ").strip()
+        )
+
+    def __call__(
+        self,
+        batch: Union[list, str],
+        src: bool,
+        truncation: bool = False,
+        padding: str = "longest",
+        max_length: int = None,
+        return_tensors: str = "pt",
+        return_attention_mask: bool = True,
+        return_length: bool = False,
+    ) -> BatchEncoding:
+        """Tokenizes a string into tokens and also converts them into integers using source/target vocabulary map."""
+        assert padding in [
+            "longest",
+            "max_length",
+        ], "Padding should be either 'longest' or 'max_length'"
+
+        if not isinstance(batch, list):
+            raise TypeError(f"Batch must be a list, but current batch is of type {type(batch)}")
+
+        # tokenize the source sentences
+        batch = self.batch_tokenize(batch, src)
+
+        # truncate the sentences if needed
+        if truncation and max_length is not None:
+            batch = [ids[:max_length] for ids in batch]
+
+        lengths = [len(ids) for ids in batch]
+
+        max_seq_len = max(lengths) if padding == "longest" else max_length
+
+        input_ids, attention_mask = zip(
+            *[
+                self._single_input_preprocessing(tokens=tokens, src=src, max_seq_len=max_seq_len)
+                for tokens in batch
+            ]
+        )
+
+        _data = {"input_ids": input_ids}
+
+        if return_attention_mask:
+            _data["attention_mask"] = attention_mask
+
+        if return_length:
+            _data["lengths"] = lengths
+
+        return BatchEncoding(_data, tensor_type=return_tensors)
+
+    def batch_decode(self, batch: Union[list, torch.Tensor], src: bool) -> List[List[str]]:
+        """Detokenizes a list of integer ids or a tensor into a list of strings using the source/target vocabulary."""
+
+        if isinstance(batch, torch.Tensor):
+            batch = batch.detach().cpu().tolist()
+
+        return [self._single_output_postprocessing(ids=ids, src=src) for ids in batch]
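Editor's note: the tokenizer above builds encoder inputs by hand; source sequences are left-padded with `<pad>` and terminated with `</s>`, and the attention mask covers only the real tokens plus the EOS, while target sequences pad on the right (the two branches of `_pad_batch`). A minimal, self-contained sketch of that padding behaviour follows; the sentences and subword splits are invented for illustration, and only the padding logic mirrors `_pad_batch` / `_create_attention_mask` from the file above.

```python
# Illustrative sketch (not library code) of the source-side padding in tokenizer.py.
pad_token, eos_token = "<pad>", "</s>"

def pad_src(tokens, max_seq_len):
    # mirrors IndicTransTokenizer._pad_batch / _create_attention_mask for src=True
    padded = [pad_token] * (max_seq_len - len(tokens)) + tokens + [eos_token]
    mask = [0] * (max_seq_len - len(tokens)) + [1] * (len(tokens) + 1)
    return padded, mask

# made-up subword splits, only to show the shapes
batch = [
    ["eng_Latn", "hin_Deva", "▁This", "▁is", "▁a", "▁test", "▁sentence", "."],
    ["eng_Latn", "hin_Deva", "▁Hello", "."],
]
max_seq_len = max(len(tokens) for tokens in batch)
for tokens in batch:
    padded, mask = pad_src(tokens, max_seq_len)
    print(len(padded), len(mask), padded, mask)
```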
IndicTransTokenizer/IndicTransTokenizer/utils.py
ADDED
@@ -0,0 +1,530 @@
+import re
+from typing import List, Tuple, Union
+
+from indicnlp.tokenize import indic_tokenize, indic_detokenize
+from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
+from sacremoses import MosesPunctNormalizer, MosesTokenizer, MosesDetokenizer
+from indicnlp.transliterate.unicode_transliterate import UnicodeIndicTransliterator
+
+
+class IndicProcessor:
+    def __init__(self, inference=True):
+        self.inference = inference
+
+        self._flores_codes = {
+            "asm_Beng": "as",
+            "awa_Deva": "hi",
+            "ben_Beng": "bn",
+            "bho_Deva": "hi",
+            "brx_Deva": "hi",
+            "doi_Deva": "hi",
+            "eng_Latn": "en",
+            "gom_Deva": "kK",
+            "gon_Deva": "hi",
+            "guj_Gujr": "gu",
+            "hin_Deva": "hi",
+            "hne_Deva": "hi",
+            "kan_Knda": "kn",
+            "kas_Arab": "ur",
+            "kas_Deva": "hi",
+            "kha_Latn": "en",
+            "lus_Latn": "en",
+            "mag_Deva": "hi",
+            "mai_Deva": "hi",
+            "mal_Mlym": "ml",
+            "mar_Deva": "mr",
+            "mni_Beng": "bn",
+            "mni_Mtei": "hi",
+            "npi_Deva": "ne",
+            "ory_Orya": "or",
+            "pan_Guru": "pa",
+            "san_Deva": "hi",
+            "sat_Olck": "or",
+            "snd_Arab": "ur",
+            "snd_Deva": "hi",
+            "tam_Taml": "ta",
+            "tel_Telu": "te",
+            "urd_Arab": "ur",
+            "unr_Deva": "hi",
+        }
+
+        self._indic_num_map = {
+            "\u09e6": "0",
+            "0": "0",
+            "\u0ae6": "0",
+            "\u0ce6": "0",
+            "\u0966": "0",
+            "\u0660": "0",
+            "\uabf0": "0",
+            "\u0b66": "0",
+            "\u0a66": "0",
+            "\u1c50": "0",
+            "\u06f0": "0",
+            "\u09e7": "1",
+            "1": "1",
+            "\u0ae7": "1",
+            "\u0967": "1",
+            "\u0ce7": "1",
+            "\u06f1": "1",
+            "\uabf1": "1",
+            "\u0b67": "1",
+            "\u0a67": "1",
+            "\u1c51": "1",
+            "\u0c67": "1",
+            "\u09e8": "2",
+            "2": "2",
+            "\u0ae8": "2",
+            "\u0968": "2",
+            "\u0ce8": "2",
+            "\u06f2": "2",
+            "\uabf2": "2",
+            "\u0b68": "2",
+            "\u0a68": "2",
+            "\u1c52": "2",
+            "\u0c68": "2",
+            "\u09e9": "3",
+            "3": "3",
+            "\u0ae9": "3",
+            "\u0969": "3",
+            "\u0ce9": "3",
+            "\u06f3": "3",
+            "\uabf3": "3",
+            "\u0b69": "3",
+            "\u0a69": "3",
+            "\u1c53": "3",
+            "\u0c69": "3",
+            "\u09ea": "4",
+            "4": "4",
+            "\u0aea": "4",
+            "\u096a": "4",
+            "\u0cea": "4",
+            "\u06f4": "4",
+            "\uabf4": "4",
+            "\u0b6a": "4",
+            "\u0a6a": "4",
+            "\u1c54": "4",
+            "\u0c6a": "4",
+            "\u09eb": "5",
+            "5": "5",
+            "\u0aeb": "5",
+            "\u096b": "5",
+            "\u0ceb": "5",
+            "\u06f5": "5",
+            "\uabf5": "5",
+            "\u0b6b": "5",
+            "\u0a6b": "5",
+            "\u1c55": "5",
+            "\u0c6b": "5",
+            "\u09ec": "6",
+            "6": "6",
+            "\u0aec": "6",
+            "\u096c": "6",
+            "\u0cec": "6",
+            "\u06f6": "6",
+            "\uabf6": "6",
+            "\u0b6c": "6",
+            "\u0a6c": "6",
+            "\u1c56": "6",
+            "\u0c6c": "6",
+            "\u09ed": "7",
+            "7": "7",
+            "\u0aed": "7",
+            "\u096d": "7",
+            "\u0ced": "7",
+            "\u06f7": "7",
+            "\uabf7": "7",
+            "\u0b6d": "7",
+            "\u0a6d": "7",
+            "\u1c57": "7",
+            "\u0c6d": "7",
+            "\u09ee": "8",
+            "8": "8",
+            "\u0aee": "8",
+            "\u096e": "8",
+            "\u0cee": "8",
+            "\u06f8": "8",
+            "\uabf8": "8",
+            "\u0b6e": "8",
+            "\u0a6e": "8",
+            "\u1c58": "8",
+            "\u0c6e": "8",
+            "\u09ef": "9",
+            "9": "9",
+            "\u0aef": "9",
+            "\u096f": "9",
+            "\u0cef": "9",
+            "\u06f9": "9",
+            "\uabf9": "9",
+            "\u0b6f": "9",
+            "\u0a6f": "9",
+            "\u1c59": "9",
+            "\u0c6f": "9",
+        }
+
+        self._placeholder_entity_maps = []
+
+        self._en_tok = MosesTokenizer(lang="en")
+        self._en_normalizer = MosesPunctNormalizer()
+        self._en_detok = MosesDetokenizer(lang="en")
+        self._xliterator = UnicodeIndicTransliterator()
+
+        self._multispace_regex = re.compile("[ ]{2,}")
+        self._digit_space_percent = re.compile(r"(\d) %")
+        self._double_quot_punc = re.compile(r"\"([,\.]+)")
+        self._digit_nbsp_digit = re.compile(r"(\d) (\d)")
+        self._end_bracket_space_punc_regex = re.compile(r"\) ([\.!:?;,])")
+
+        self._URL_PATTERN = r"\b(?<![\w/.])(?:(?:https?|ftp)://)?(?:(?:[\w-]+\.)+(?!\.))(?:[\w/\-?#&=%.]+)+(?!\.\w+)\b"
+        self._NUMERAL_PATTERN = r"(~?\d+\.?\d*\s?%?\s?-?\s?~?\d+\.?\d*\s?%|~?\d+%|\d+[-\/.,:']\d+[-\/.,:'+]\d+(?:\.\d+)?|\d+[-\/.:'+]\d+(?:\.\d+)?)"
+        self._EMAIL_PATTERN = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}"
+        self._OTHER_PATTERN = r"[A-Za-z0-9]*[#|@]\w+"
+
+    def _add_placeholder_entity_map(self, placeholder_entity_map):
+        self._placeholder_entity_maps.append(placeholder_entity_map)
+
+    def get_placeholder_entity_maps(self):
+        return self._placeholder_entity_maps
+
+    def _punc_norm(self, text) -> str:
+        text = (
+            text.replace("\r", "")
+            .replace("(", " (")
+            .replace(")", ") ")
+            .replace("( ", "(")
+            .replace(" )", ")")
+            .replace(" :", ":")
+            .replace(" ;", ";")
+            .replace("`", "'")
+            .replace("„", '"')
+            .replace("“", '"')
+            .replace("”", '"')
+            .replace("–", "-")
+            .replace("—", " - ")
+            .replace("´", "'")
+            .replace("‘", "'")
+            .replace("‚", "'")
+            .replace("’", "'")
+            .replace("''", '"')
+            .replace("´´", '"')
+            .replace("…", "...")
+            .replace(" « ", ' "')
+            .replace("« ", '"')
+            .replace("«", '"')
+            .replace(" » ", '" ')
+            .replace(" »", '"')
+            .replace("»", '"')
+            .replace(" %", "%")
+            .replace("nº ", "nº ")
+            .replace(" :", ":")
+            .replace(" ºC", " ºC")
+            .replace(" cm", " cm")
+            .replace(" ?", "?")
+            .replace(" !", "!")
+            .replace(" ;", ";")
+            .replace(", ", ", ")
+        )
+
+        text = self._multispace_regex.sub(" ", text)
+        text = self._end_bracket_space_punc_regex.sub(r")\1", text)
+        text = self._digit_space_percent.sub(r"\1%", text)
+        text = self._double_quot_punc.sub(r'\1"', text)
+        text = self._digit_nbsp_digit.sub(r"\1.\2", text)
+        return text.strip()
+
+    def _normalize_indic_numerals(self, line: str) -> str:
+        """
+        Normalize the numerals in Indic languages from native script to Roman script (if present).
+
+        Args:
+            line (str): an input string with Indic numerals to be normalized.
+
+        Returns:
+            str: an input string with the all Indic numerals normalized to Roman script.
+        """
+        return "".join([self._indic_num_map.get(c, c) for c in line])
+
+    def _wrap_with_placeholders(self, text: str, patterns: list) -> str:
+        """
+        Wraps substrings with matched patterns in the given text with placeholders and returns
+        the modified text along with a mapping of the placeholders to their original value.
+
+        Args:
+            text (str): an input string which needs to be wrapped with the placeholders.
+            pattern (list): list of patterns to search for in the input string.
+
+        Returns:
+            text (str): a modified text.
+        """
+
+        serial_no = 1
+
+        placeholder_entity_map = dict()
+
+        indic_failure_cases = [
+            "آی ڈی ",
+            "ꯑꯥꯏꯗꯤ",
+            "आईडी",
+            "आई . डी . ",
+            "आई . डी .",
+            "आई. डी. ",
+            "आई. डी.",
+            "ऐटि",
+            "آئی ڈی ",
+            "ᱟᱭᱰᱤ ᱾",
+            "आयडी",
+            "ऐडि",
+            "आइडि",
+            "ᱟᱭᱰᱤ",
+        ]
+
+        for pattern in patterns:
+            matches = set(re.findall(pattern, text))
+
+            # wrap common match with placeholder tags
+            for match in matches:
+                if pattern == self._URL_PATTERN:
+                    # Avoids false positive URL matches for names with initials.
+                    if len(match.replace(".", "")) < 4:
+                        continue
+                if pattern == self._NUMERAL_PATTERN:
+                    # Short numeral patterns do not need placeholder based handling.
+                    if (
+                        len(match.replace(" ", "").replace(".", "").replace(":", ""))
+                        < 4
+                    ):
+                        continue
+
+                # Set of Translations of "ID" in all the suppported languages have been collated.
+                # This has been added to deal with edge cases where placeholders might get translated.
+                base_placeholder = f"<ID{serial_no}>"
+
+                placeholder_entity_map[f"<ID{serial_no}]"] = match
+                placeholder_entity_map[f"< ID{serial_no} ]"] = match
+                placeholder_entity_map[f"<ID{serial_no}>"] = match
+                placeholder_entity_map[f"< ID{serial_no} >"] = match
+
+                for i in indic_failure_cases:
+                    placeholder_entity_map[f"<{i}{serial_no}>"] = match
+                    placeholder_entity_map[f"< {i}{serial_no} >"] = match
+                    placeholder_entity_map[f"< {i} {serial_no} >"] = match
+                    placeholder_entity_map[f"<{i} {serial_no}]"] = match
+                    placeholder_entity_map[f"< {i} {serial_no} ]"] = match
+                    placeholder_entity_map[f"[{i} {serial_no}]"] = match
+                    placeholder_entity_map[f"[ {i} {serial_no} ]"] = match
+
+                text = text.replace(match, base_placeholder)
+                serial_no += 1
+
+        text = re.sub("\s+", " ", text).replace(">/", ">").replace("]/", "]")
+        self._add_placeholder_entity_map(placeholder_entity_map)
+        return text
+
+    def _normalize(
+        self,
+        text: str,
+    ) -> Tuple[str, dict]:
+        """
+        Normalizes and wraps the spans of input string with placeholder tags. It first normalizes
+        the Indic numerals in the input string to Roman script. Later, it uses the input string with normalized
+        Indic numerals to wrap the spans of text matching the pattern with placeholder tags.
+
+        Args:
+            text (str): input string.
+            pattern (list): list of patterns to search for in the input string.
+
+        Returns:
+            text (str): the modified text
+        """
+        patterns = [
+            self._EMAIL_PATTERN,
+            self._URL_PATTERN,
+            self._NUMERAL_PATTERN,
+            self._OTHER_PATTERN,
+        ]
+
+        text = self._normalize_indic_numerals(text.strip())
+
+        if self.inference:
+            text = self._wrap_with_placeholders(text, patterns)
+
+        return text
+
+    def _apply_lang_tags(
+        self, sents: List[str], src_lang: str, tgt_lang: str, delimiter=" "
+    ) -> List[str]:
+        """
+        Add special tokens indicating source and target language to the start of the each input sentence.
+        Each resulting input sentence will have the format: "`{src_lang} {tgt_lang} {input_sentence}`".
+
+        Args:
+            sent (str): input sentence to be translated.
+            src_lang (str): flores lang code of the input sentence.
+            tgt_lang (str): flores lang code in which the input sentence will be translated.
+
+        Returns:
+            List[str]: list of input sentences with the special tokens added to the start.
+        """
+        return [f"{src_lang}{delimiter}{tgt_lang}{delimiter}{x.strip()}" for x in sents]
+
+    def _preprocess(
+        self,
+        sent: str,
+        lang: str,
+        normalizer: Union[MosesPunctNormalizer, IndicNormalizerFactory],
+    ) -> str:
+        """
+        Preprocess an input text sentence by normalizing, tokenization, and possibly transliterating it.
+
+        Args:
+            sent (str): input text sentence to preprocess.
+            normalizer (Union[MosesPunctNormalizer, IndicNormalizerFactory]): an object that performs normalization on the text.
+            lang (str): flores language code of the input text sentence.
+
+        Returns:
+            sent (str): a preprocessed input text sentence
+        """
+        iso_lang = self._flores_codes[lang]
+        sent = self._punc_norm(sent)
+        sent = self._normalize(sent)
+
+        transliterate = True
+        if lang.split("_")[1] in ["Arab", "Aran", "Olck", "Mtei", "Latn"]:
+            transliterate = False
+
+        if iso_lang == "en":
+            processed_sent = " ".join(
+                self._en_tok.tokenize(
+                    self._en_normalizer.normalize(sent.strip()), escape=False
+                )
+            )
+        elif transliterate:
+            # transliterates from the any specific language to devanagari
+            # which is why we specify lang2_code as "hi".
+            processed_sent = self._xliterator.transliterate(
+                " ".join(
+                    indic_tokenize.trivial_tokenize(
+                        normalizer.normalize(sent.strip()), iso_lang
+                    )
+                ),
+                iso_lang,
+                "hi",
+            ).replace(" ् ", "्")
+        else:
+            # we only need to transliterate for joint training
+            processed_sent = " ".join(
+                indic_tokenize.trivial_tokenize(
+                    normalizer.normalize(sent.strip()), iso_lang
+                )
+            )
+
+        return processed_sent
+
+    def preprocess_batch(
+        self, batch: List[str], src_lang: str, tgt_lang: str, is_target: bool = False
+    ) -> List[str]:
+        """
+        Preprocess an array of sentences by normalizing, tokenization, and possibly transliterating it. It also tokenizes the
+        normalized text sequences using sentence piece tokenizer and also adds language tags.
+
+        Args:
+            batch (List[str]): input list of sentences to preprocess.
+            src_lang (str): flores language code of the input text sentences.
+            tgt_lang (str): flores language code of the output text sentences.
+            is_target (bool): add language tags if false otherwise skip it.
+
+        Returns:
+            List[str]: a list of preprocessed input text sentences.
+        """
+        # reset the placeholder entity map for each batch
+
+        normalizer = (
+            IndicNormalizerFactory().get_normalizer(self._flores_codes[src_lang])
+            if src_lang != "eng_Latn"
+            else None
+        )
+
+        preprocessed_sents = [
+            self._preprocess(sent, src_lang, normalizer) for sent in batch
+        ]
+
+        tagged_sents = (
+            self._apply_lang_tags(preprocessed_sents, src_lang, tgt_lang)
+            if not is_target
+            else preprocessed_sents
+        )
+
+        return tagged_sents
+
+    def _postprocess(
+        self,
+        sent: str,
+        placeholder_entity_map: dict,
+        lang: str = "hin_Deva",
+    ):
+        """
+        Postprocesses a single input sentence after the translation generation.
+
+        Args:
+            sent (str): input sentence to postprocess.
+            placeholder_entity_map (dict): dictionary mapping placeholders to the original entity values.
+            lang (str): flores language code of the input sentence.
+
+        Returns:
+            text (str): postprocessed input sentence.
+        """
+
+        lang_code, script_code = lang.split("_")
+        iso_lang = self._flores_codes[lang]
+
+        # Fixes for Perso-Arabic scripts
+        if script_code in ["Arab", "Aran"]:
+            sent = (
+                sent.replace(" ؟", "؟")
+                .replace(" ۔", "۔")
+                .replace(" ،", "،")
+                .replace("ٮ۪", "ؠ")
+            )
+
+        if lang_code == "ory":
+            sent = sent.replace("ଯ଼", "ୟ")
+
+        for k, v in placeholder_entity_map.items():
+            sent = sent.replace(k, v)
+
+        return (
+            self._en_detok.detokenize(sent.split(" "))
+            if lang == "eng_Latn"
+            else indic_detokenize.trivial_detokenize(
+                self._xliterator.transliterate(sent, "hi", iso_lang),
+                iso_lang,
+            )
+        )
+
+    def postprocess_batch(
+        self,
+        sents: List[str],
+        lang: str = "hin_Deva",
+    ) -> List[str]:
+        """
+        Postprocesses a batch of input sentences after the translation generations.
+
+        Args:
+            sents (List[str]): batch of translated sentences to postprocess.
+            placeholder_entity_map (List[dict]): dictionary mapping placeholders to the original entity values.
+            lang (str): flores language code of the input sentences.
+
+        Returns:
+            List[str]: postprocessed batch of input sentences.
+        """
+
+        placeholder_entity_maps = self.get_placeholder_entity_maps()
+
+        postprocessed_sents = [
+            self._postprocess(sent, placeholder_entity_map, lang)
+            for sent, placeholder_entity_map in zip(sents, placeholder_entity_maps)
+        ]
+
+        # reset the placeholder entity map after each batch
+        self._placeholder_entity_maps.clear()
+
+        return postprocessed_sents
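Editor's note: `IndicProcessor` above protects emails, URLs, long numerals, and hashtags/handles from being mangled by the model. During preprocessing each match is replaced by an `<IDn>` placeholder and the mapping is stored; `postprocess_batch` later substitutes the original strings back into the generated translations. The following is a toy, self-contained illustration of that round trip, not the library code: the regex and the sentence are invented, and a real run would put the model's translation where `translated` is assigned.

```python
# Toy illustration of the placeholder round trip used by IndicProcessor.
import re

EMAIL = r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"  # simplified pattern

def wrap(text):
    # replace each protected span with an <IDn> tag and remember the mapping
    entity_map, serial = {}, 1
    for match in set(re.findall(EMAIL, text)):
        tag = f"<ID{serial}>"
        entity_map[tag] = match
        text = text.replace(match, tag)
        serial += 1
    return text, entity_map

def unwrap(text, entity_map):
    # restore the original spans after translation
    for tag, value in entity_map.items():
        text = text.replace(tag, value)
    return text

source = "Please mail [email protected] by Friday."
wrapped, entities = wrap(source)   # "Please mail <ID1> by Friday."
translated = wrapped               # stand-in for the model's output
print(unwrap(translated, entities))
```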
IndicTransTokenizer/IndicTransTokenizer/version.py
ADDED
@@ -0,0 +1 @@
+__version__ = "0.1.1"
IndicTransTokenizer/IndicTransTokenizer/version.txt
ADDED
@@ -0,0 +1 @@
+0.1.1
IndicTransTokenizer/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) Varun Gumma.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE
IndicTransTokenizer/README.md
ADDED
@@ -0,0 +1,77 @@
+# IndicTransTokenizer
+
+The goal of this repository is to provide a simple, modular, and extendable tokenizer for [IndicTrans2](https://github.com/AI4Bharat/IndicTrans2) and be compatible with the HuggingFace models released.
+
+## Pre-requisites
+ - `Python 3.8+`
+ - [Indic NLP Library](https://github.com/VarunGumma/indic_nlp_library)
+ - Other requirements as listed in `requirements.txt`
+
+## Configuration
+ - Editable installation (Note, this may take a while):
+```bash
+git clone https://github.com/VarunGumma/IndicTransTokenizer
+cd IndicTransTokenizer
+
+pip install --editable ./
+```
+
+## Usage
+```python
+import torch
+from transformers import AutoModelForSeq2SeqLM
+from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer
+
+tokenizer = IndicTransTokenizer(direction="en-indic")
+ip = IndicProcessor(inference=True)
+model = AutoModelForSeq2SeqLM.from_pretrained("ai4bharat/indictrans2-en-indic-dist-200M", trust_remote_code=True)
+
+sentences = [
+    "This is a test sentence.",
+    "This is another longer different test sentence.",
+    "Please send an SMS to 9876543210 and an email on [email protected] by 15th October, 2023.",
+]
+
+batch = ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="hin_Deva")
+batch = tokenizer(batch, src=True, return_tensors="pt")
+
+with torch.inference_mode():
+    outputs = model.generate(**batch, num_beams=5, num_return_sequences=1, max_length=256)
+
+outputs = tokenizer.batch_decode(outputs, src=False)
+outputs = ip.postprocess_batch(outputs, lang="hin_Deva")
+print(outputs)
+
+>>> ['यह एक परीक्षण वाक्य है।', 'यह एक और लंबा अलग परीक्षण वाक्य है।', 'कृपया 9876543210 पर एक एस. एम. एस. भेजें और 15 अक्टूबर, 2023 तक [email protected] पर एक ईमेल भेजें।']
+```
+
+For using the tokenizer to train/fine-tune the model, just set the `inference` argument of IndicProcessor to `False`.
+
+## Authors
+ - Varun Gumma ([email protected])
+ - Jay Gala ([email protected])
+ - Pranjal Agadh Chitale ([email protected])
+ - Raj Dabre ([email protected])
+
+
+## Bugs and Contribution
+Since this is a bleeding-edge module, you may encounter broken stuff and import issues once in a while. In case you encounter any bugs or want additional functionalities, please feel free to raise `Issues`/`Pull Requests` or contact the authors.
+
+
+## Citation
+If you use our codebase, models or tokenizer, please do cite the following paper:
+```bibtex
+@article{
+gala2023indictrans,
+title={IndicTrans2: Towards High-Quality and Accessible Machine Translation Models for all 22 Scheduled Indian Languages},
+author={Jay Gala and Pranjal A Chitale and A K Raghavan and Varun Gumma and Sumanth Doddapaneni and Aswanth Kumar M and Janki Atul Nawale and Anupama Sujatha and Ratish Puduppully and Vivek Raghavan and Pratyush Kumar and Mitesh M Khapra and Raj Dabre and Anoop Kunchukuttan},
+journal={Transactions on Machine Learning Research},
+issn={2835-8856},
+year={2023},
+url={https://openreview.net/forum?id=vfT4YuzAYA},
+note={}
+}
+```
+
+## Note
+This tokenizer module is currently **not** compatible with the [PreTrainedTokenizer](https://huggingface.co/docs/transformers/v4.36.1/en/main_classes/tokenizer#transformers.PreTrainedTokenizer) module from HuggingFace. Hence, we are actively looking for `Pull Requests` to port this tokenizer to HF. Any leads on that front are welcome!
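Editor's note: the README above says that training or fine-tuning only requires constructing `IndicProcessor` with `inference=False`. Below is an untested sketch of one plausible way to wire that up with the classes defined in this commit; the Hindi target sentence is only illustrative. `inference=False` skips placeholder wrapping, and `is_target=True` skips the language-tag prefix for target-side text.

```python
# Possible fine-tuning data preparation (sketch, untested).
from IndicTransTokenizer import IndicProcessor, IndicTransTokenizer

ip = IndicProcessor(inference=False)          # no placeholder wrapping during training
tokenizer = IndicTransTokenizer(direction="en-indic")

src_sents = ["Ankita to Avantika: can you come with me to tour?"]
tgt_sents = ["अंकिता से अवंतिका: क्या तुम मेरे साथ यात्रा पर चल सकती हो?"]  # illustrative reference

src_batch = ip.preprocess_batch(src_sents, src_lang="eng_Latn", tgt_lang="hin_Deva")
tgt_batch = ip.preprocess_batch(
    tgt_sents, src_lang="hin_Deva", tgt_lang="hin_Deva", is_target=True  # tgt_lang unused here
)

model_inputs = tokenizer(src_batch, src=True, return_tensors="pt")
labels = tokenizer(tgt_batch, src=False, return_tensors="pt")["input_ids"]
```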
IndicTransTokenizer/requirements.txt
ADDED
@@ -0,0 +1,6 @@
+setuptools==68.2.2
+torch
+sacremoses
+sentencepiece
+transformers
+indic-nlp-library-IT2 @ git+https://github.com/VarunGumma/indic_nlp_library
IndicTransTokenizer/setup.py
ADDED
@@ -0,0 +1,47 @@
+import os
+import pathlib
+from sys import version_info, exit
+from setuptools import setup, find_packages
+from pkg_resources import parse_requirements
+
+
+def write_version_py():
+    with open(os.path.join("IndicTransTokenizer", "version.txt"), "r") as f:
+        version = f.read().strip()
+
+    with open(os.path.join("IndicTransTokenizer", "version.py"), "w") as f:
+        f.write(f'__version__ = "{version}"\n')
+    return version
+
+
+if version_info < (3, 8):
+    exit("Sorry, Python >= 3.8 is required for IndicTransTokenizer.")
+
+
+with open("README.md", "r", errors="ignore", encoding="utf-8") as fh:
+    long_description = fh.read().strip()
+
+version = write_version_py()
+
+setup(
+    name="IndicTransTokenizer",
+    version=version,
+    author="Varun Gumma",
+    author_email="[email protected]",
+    description="A simple, consistent, and extendable module for IndicTrans2 tokenizer compatible with the HuggingFace models",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/VarunGumma/IndicTransTokenizer",
+    packages=find_packages(),
+    license="MIT",
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: MIT License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires=">=3.8",
+    install_requires=[
+        str(requirement)
+        for requirement in parse_requirements(pathlib.Path(f"requirements.txt").open())
+    ],
+)
README.md
CHANGED
@@ -1,12 +1,13 @@
 ---
-title:
-emoji:
-colorFrom:
-colorTo:
+title: IndicTrans2 for Conversation
+emoji: 📚
+colorFrom: purple
+colorTo: red
 sdk: gradio
 sdk_version: 4.21.0
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,87 @@
+import gradio as gr
+import time
+from config import model_repo_id, src_lang, tgt_lang
+from indictrans2 import initialize_model_and_tokenizer, batch_translate
+from examples import example_sentences
+
+
+def load_models():
+    model_dict = {}
+
+    print("\tLoading model: %s" % model_repo_id)
+
+    # build model and tokenizer
+    en_indic_tokenizer, en_indic_model, en_indic_lora_model = (
+        initialize_model_and_tokenizer()
+    )
+
+    model_dict["_tokenizer"] = en_indic_tokenizer
+    model_dict["_model"] = en_indic_model
+    model_dict["_lora_model"] = en_indic_lora_model
+
+    return model_dict
+
+
+def translation(text):
+
+    start_time = time.time()
+
+    tokenizer = model_dict["_tokenizer"]
+    model = model_dict["_model"]
+    lora_model = model_dict["_lora_model"]
+
+    # org translation
+    org_translation = batch_translate(
+        [text],
+        model=model,
+        tokenizer=tokenizer,
+    )
+    org_output = org_translation[0]
+    end_time = time.time()
+
+    # lora translation
+    lora_translation = batch_translate(
+        [text],
+        model=lora_model,
+        tokenizer=tokenizer,
+    )
+    lora_output = lora_translation[0]
+    end_time2 = time.time()
+
+    result = {
+        "source": src_lang,
+        "target": tgt_lang,
+        "input": text,
+        "it2_result": org_output,
+        "it2_conv_result": lora_output,
+        "it2_inference_time": end_time - start_time,
+        "it2_conv_inference_time": end_time2 - end_time,
+    }
+
+    return result
+
+
+print("\tinit models")
+
+global model_dict
+
+model_dict = load_models()
+
+inputs = gr.Textbox(lines=5, label="Input text")
+outputs = gr.JSON(container=True)
+submit_btn = gr.Button("Translate", variant="primary")
+
+title = "IndicTrans2 fine-tuned on conversation"
+description = f"Note: LoRA is trained only on En-Hi pair.\nDetails: https://github.com/AI4Bharat/IndicTrans2.\nLoRA Model: https://huggingface.co/sam749/IndicTrans2-Conv"
+
+gr.Interface(
+    fn=translation,
+    inputs=inputs,
+    outputs=outputs,
+    title=title,
+    description=description,
+    submit_btn=submit_btn,
+    examples=example_sentences,
+    examples_per_page=10,
+    cache_examples=False,
+).launch(share=True)
config.py
ADDED
@@ -0,0 +1,5 @@
+model_repo_id = "ai4bharat/indictrans2-en-indic-dist-200M"
+lora_repo_id = "sam749/IndicTrans2-Conv"
+src_lang = "eng_Latn"
+tgt_lang = "hin_Deva"
+batch_size = 8
examples.py
ADDED
@@ -0,0 +1,11 @@
+example_sentences = [
+    ['Avantika to Prakash: Did you mean "I play cricket"? What position do you play?'],
+    ["'do you eat pizza?', Manoj said to Jaya"],
+    ["Ankita to Avantika: can you come with me to tour?"],
+    [
+        'Sudha to Sakshi: Did you mean "I\'ll grab some coffee before the meeting starts."? Can I join you too?'
+    ],
+    [
+        'Anil to Sakshi: Did you mean "I\'ll grab some coffee before the meeting starts."? Can I join you too?'
+    ],
+]
indictrans2.py
ADDED
@@ -0,0 +1,98 @@
+import torch
+from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig
+from IndicTransTokenizer.IndicTransTokenizer.utils import IndicProcessor
+from IndicTransTokenizer.IndicTransTokenizer.tokenizer import IndicTransTokenizer
+from peft import PeftModel
+from config import lora_repo_id, model_repo_id, batch_size, src_lang, tgt_lang
+
+
+DIRECTION = "en-indic"
+QUANTIZATION = None
+IP = IndicProcessor(inference=True)
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+HALF = True if torch.cuda.is_available() else False
+
+
+def initialize_model_and_tokenizer():
+
+    if QUANTIZATION == "4-bit":
+        qconfig = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+        )
+    elif QUANTIZATION == "8-bit":
+        qconfig = BitsAndBytesConfig(
+            load_in_8bit=True,
+            bnb_8bit_use_double_quant=True,
+            bnb_8bit_compute_dtype=torch.bfloat16,
+        )
+    else:
+        qconfig = None
+
+    tokenizer = IndicTransTokenizer(direction=DIRECTION)
+    model = AutoModelForSeq2SeqLM.from_pretrained(
+        model_repo_id,
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+        quantization_config=qconfig,
+    )
+    model2 = AutoModelForSeq2SeqLM.from_pretrained(
+        model_repo_id,
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+        quantization_config=qconfig,
+    )
+
+    if qconfig == None:
+        model = model.to(DEVICE)
+        model2 = model2.to(DEVICE)
+
+    model.eval()
+    model2.eval()
+
+    lora_model = PeftModel.from_pretrained(model2, lora_repo_id)
+
+    return tokenizer, model, lora_model
+
+
+def batch_translate(input_sentences, model, tokenizer):
+    translations = []
+    for i in range(0, len(input_sentences), batch_size):
+        batch = input_sentences[i : i + batch_size]
+
+        # Preprocess the batch and extract entity mappings
+        batch = IP.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)
+
+        # Tokenize the batch and generate input encodings
+        inputs = tokenizer(
+            batch,
+            src=True,
+            truncation=True,
+            padding="longest",
+            return_tensors="pt",
+            return_attention_mask=True,
+        ).to(DEVICE)
+
+        # Generate translations using the model
+        with torch.inference_mode():
+            generated_tokens = model.generate(
+                **inputs,
+                use_cache=True,
+                min_length=0,
+                max_length=256,
+                num_beams=5,
+                num_return_sequences=1,
+            )
+
+        # Decode the generated tokens into text
+        generated_tokens = tokenizer.batch_decode(
+            generated_tokens.detach().cpu().tolist(), src=False
+        )
+
+        # Postprocess the translations, including entity replacement
+        translations += IP.postprocess_batch(generated_tokens, lang=tgt_lang)
+
+        del inputs
+
+    return translations
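Editor's note: a short usage sketch for the two helpers above, mirroring how `app.py` calls them. It assumes the checkpoints can be downloaded, and the generated text will vary between the stock model and the LoRA-adapted one.

```python
# Usage sketch for indictrans2.py (illustrative driver, output text will vary).
from indictrans2 import initialize_model_and_tokenizer, batch_translate

tokenizer, base_model, lora_model = initialize_model_and_tokenizer()

sentences = ["Ankita to Avantika: can you come with me to tour?"]
print(batch_translate(sentences, model=base_model, tokenizer=tokenizer))  # stock IndicTrans2
print(batch_translate(sentences, model=lora_model, tokenizer=tokenizer))  # LoRA-adapted model
```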
requirements.txt
ADDED
@@ -0,0 +1,8 @@
+indic-nlp-library-IT2 @ git+https://github.com/VarunGumma/indic_nlp_library
+setuptools==68.2.2
+transformers
+gradio
+torch
+peft
+sacremoses
+sentencepiece