launch with queue
- app.py +1 -1
- compression_util.py +3 -5
- vocab.py +6 -4
app.py
CHANGED
@@ -21,4 +21,4 @@ demo = TabbedInterface(
 demo.load(js=open("js/onload.js", "r", encoding="utf-8").read())
 
 if __name__ == "__main__":
-    demo.launch()
+    demo.queue(max_size=1024, default_concurrency_limit=80).launch()
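The change above switches the app from launching directly to routing requests through Gradio's queue. A minimal sketch of the same pattern, assuming Gradio 4.x (the Blocks content is a placeholder, not this Space's real UI):

import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("demo placeholder")

if __name__ == "__main__":
    # max_size caps how many requests may wait in the queue at once;
    # default_concurrency_limit caps how many queued requests per event
    # run concurrently. queue() returns the Blocks, so launch() chains.
    demo.queue(max_size=1024, default_concurrency_limit=80).launch()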
compression_util.py
CHANGED
@@ -133,15 +133,14 @@ cache = {}
 
 
 def tokenize_corpus(
-        tokenizer_name: str,
+        tokenizer_name: str,
         corpuses: List[str],
         cache_dir: str = "stats"
 ) -> dict:
     """
-
-    :param tokenizer_config: can be used without loading
+    :param tokenizer_name:
     :param corpuses:
-    :param
+    :param cache_dir:
     :return:
     """
 
@@ -157,7 +156,6 @@ def tokenize_corpus(
 
 def _tokenize(tokenizer, datasets, detail_path=None):
     """
-    export_diff: true | false
     :param tokenizer:
     :param datasets:
     :param detail_path:
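The tokenizer_name parameter together with cache_dir suggests stats are memoized per tokenizer on disk. A hypothetical sketch of that caching pattern (the file layout and the compute step are assumptions, not taken from this repo):

import json
import os
from typing import List

def tokenize_corpus(tokenizer_name: str, corpuses: List[str], cache_dir: str = "stats") -> dict:
    # Hypothetical cache file: one JSON per tokenizer/corpus combination.
    fname = tokenizer_name.replace("/", "--") + "_" + "-".join(corpuses) + ".json"
    cache_path = os.path.join(cache_dir, fname)
    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f:
            return json.load(f)  # cache hit: skip tokenization entirely
    stats: dict = {}  # placeholder: tokenize each corpus and collect counts here
    os.makedirs(cache_dir, exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(stats, f, ensure_ascii=False, indent=2)
    return stats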
vocab.py
CHANGED
@@ -182,8 +182,8 @@ class TokenizerConfig:
         return hash(self.name_or_path)
 
 
-# format: , description, hf_path, tokenizer_class/type, comments, Organization
 # TODO: append link and description to the end of dropdown button.
+# Add tokenizer_class/type, comments
 _all_tokenizer_config = [
     ##### bert series
     TokenizerConfig("google-bert/bert-base-cased", impl=TokenizerImpl.BertTokenizer, org="Google",
@@ -229,7 +229,9 @@ _all_tokenizer_config = [
     TokenizerConfig("EleutherAI/gpt-neox-20b", impl=TokenizerImpl.SentencePiece, org="EleutherAI"),  # 50k
     TokenizerConfig("cyberagent/open-calm-7b", impl=TokenizerImpl.SentencePiece, org="CyberAgent"),  # GPTNeoXTokenizer
     TokenizerConfig("abeja/gpt-neox-japanese-2.7b", impl=TokenizerImpl.SentencePiece, org="ABEJA"),
-    TokenizerConfig("Qwen/Qwen1.5-14B
+    TokenizerConfig("Qwen/Qwen1.5-14B", impl=TokenizerImpl.SentencePiece, org="Alibaba"),  # 150k, a bit slow
+    TokenizerConfig("Qwen/Qwen1.5-110B ", impl=TokenizerImpl.SentencePiece, org="Alibaba"),
+    TokenizerConfig("Qwen/Qwen1.5-1.8B ", impl=TokenizerImpl.SentencePiece, org="Alibaba"),
     TokenizerConfig("HuggingFaceH4/starchat-alpha", impl=TokenizerImpl.SentencePiece, org="-"),
 
     ####### google/sentencepiece tokenizer:
@@ -385,7 +387,7 @@ class TokenizerFactory:
 
     def get_tokenizer(self, tokenizer_name: str):
         """
-        :param
+        :param tokenizer_name:
         :return:
         """
         tokenizer_config = self.get_tokenizer_config(tokenizer_name)
@@ -407,7 +409,7 @@ class TokenizerFactory:
         self.tokenizer_cache[tokenizer_config] = tokenizer
         return tokenizer
 
-    def get_name_with_hyperlink(self, tokenizer_name):
+    def get_name_with_hyperlink(self, tokenizer_name: str):
         def model_hyperlink(link, model_name):
             model_name = model_name
             return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
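To illustrate the newly typed get_name_with_hyperlink, a quick usage sketch: the nested helper is copied from the diff, while the huggingface.co URL construction is an assumption about how the factory builds links.

def model_hyperlink(link, model_name):
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'

# Hypothetical call: label a dropdown entry with a link to the model page.
name = "Qwen/Qwen1.5-14B"
html = model_hyperlink(f"https://huggingface.co/{name}", name)
# -> '<a target="_blank" href="https://huggingface.co/Qwen/Qwen1.5-14B" ...>Qwen/Qwen1.5-14B</a>'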