KoichiYasuoka
committed on
Commit
•
7504d8b
1
Parent(s):
a3a1ef7
model improved
Browse files
- config.json +2 -2
- maker.sh → maker.py +28 -48
- oldtokenizer.json +0 -0
- pytorch_model-00001-of-00002.bin +1 -1
- pytorch_model-00002-of-00002.bin +1 -1
- tokenizer_config.json +1 -1
- ud.py +8 -2
config.json
CHANGED
@@ -365,9 +365,9 @@
|
|
365 |
"rotary_emb_base": 10000,
|
366 |
"rotary_pct": 1.0,
|
367 |
"tie_word_embeddings": false,
|
368 |
-
"tokenizer_class": "
|
369 |
"torch_dtype": "float32",
|
370 |
-
"transformers_version": "4.
|
371 |
"use_cache": true,
|
372 |
"use_parallel_residual": false,
|
373 |
"vocab_size": 52096
|
|
|
365 |
"rotary_emb_base": 10000,
|
366 |
"rotary_pct": 1.0,
|
367 |
"tie_word_embeddings": false,
|
368 |
+
"tokenizer_class": "GPTNeoXTokenizerFast",
|
369 |
"torch_dtype": "float32",
|
370 |
+
"transformers_version": "4.44.2",
|
371 |
"use_cache": true,
|
372 |
"use_parallel_residual": false,
|
373 |
"vocab_size": 52096
|
maker.sh → maker.py
RENAMED
@@ -1,22 +1,17 @@
|
|
1 |
-
#! /bin/
|
2 |
-
|
3 |
-
|
4 |
-
|
5 |
-
D=`basename $U`
|
6 |
-
test -d $D || git clone --depth=1 $U
|
7 |
-
for F in train dev test
|
8 |
-
do cp $D/*-$F.conllu $F.conllu
|
9 |
-
done
|
10 |
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
d=json.loads(
|
20 |
form=set()
|
21 |
with open("train.conllu","r",encoding="utf-8") as r:
|
22 |
for s in r:
|
@@ -24,30 +19,21 @@ with open("train.conllu","r",encoding="utf-8") as r:
|
|
24 |
if len(w)==10 and w[0].isdecimal():
|
25 |
form.add(w[1])
|
26 |
m=[t for t in d["model"]["merges"] if len(t)<5]
|
27 |
-
for i in range(len(
|
28 |
-
w=
|
29 |
if len(w)==2 and w in form and not unicodedata.name(w[0]).startswith("HIRAGANA"):
|
30 |
-
k=
|
31 |
if len(k[0])==1 and len(k[1])==1:
|
32 |
-
m.append(" ".join(
|
33 |
d["model"]["merges"]=m
|
34 |
-
|
35 |
-
|
36 |
-
) > $TMPA
|
37 |
-
chmod 755 $TMPA
|
38 |
-
$TMPA
|
39 |
-
|
40 |
-
TMPB=./maker$$b.py
|
41 |
-
( echo '#! /usr/bin/env deepspeed'
|
42 |
-
echo 'src="'$S'"'
|
43 |
-
echo 'tgt="'$T'"'
|
44 |
-
cat << 'EOF'
|
45 |
-
from transformers import PreTrainedTokenizerFast,AutoConfig,GPTNeoXForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
|
46 |
|
47 |
class UDCausalDataset(object):
|
48 |
-
def __init__(self,conllu,tokenizer,embeddings=None):
|
49 |
self.conllu=open(conllu,"r",encoding="utf-8")
|
50 |
self.tokenizer=tokenizer
|
|
|
51 |
self.embeddings=embeddings
|
52 |
self.max_tokens=3
|
53 |
self.seeks=[(0,0)]
|
@@ -92,8 +78,8 @@ class UDCausalDataset(object):
|
|
92 |
if w[0].isdecimal():
|
93 |
upos.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
|
94 |
deps.append((int(w[6]),w[7]))
|
95 |
-
v=self.tokenizer(form,add_special_tokens=False)
|
96 |
if t==0:
|
|
|
97 |
i,u=[],[]
|
98 |
for j,(x,y) in enumerate(zip(v["input_ids"],upos)):
|
99 |
if x!=[]:
|
@@ -103,6 +89,7 @@ class UDCausalDataset(object):
|
|
103 |
pad=self.tokenizer.pad_token_id
|
104 |
else:
|
105 |
import torch
|
|
|
106 |
m=[]
|
107 |
for x in v["input_ids"]:
|
108 |
if x==[]:
|
@@ -130,23 +117,16 @@ class UDCausalDataset(object):
|
|
130 |
upos=u[0:self.max_tokens]
|
131 |
return {"inputs_embeds":emb[ids,:],"labels":[self.label2id[p] for p in upos]}
|
132 |
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
testDS=UDCausalDataset("test.conllu",tkz)
|
137 |
lid=trainDS(devDS,testDS)
|
138 |
cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
|
139 |
mdl=GPTNeoXForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True)
|
140 |
trainDS.embeddings=mdl.get_input_embeddings().weight
|
141 |
trainDS.max_tokens=min(trainDS.max_tokens,cfg.max_position_embeddings)
|
142 |
-
|
143 |
-
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=16,deepspeed=dsp,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
|
144 |
trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
|
145 |
trn.train()
|
146 |
trn.save_model(tgt)
|
147 |
-
|
148 |
-
EOF
|
149 |
-
) > $TMPB
|
150 |
-
chmod 755 $TMPB
|
151 |
-
$TMPB
|
152 |
-
exit
|
|
|
1 |
+
#! /usr/bin/python3
|
2 |
+
src="cyberagent/open-calm-1b"
|
3 |
+
tgt="KoichiYasuoka/open-calm-1b-ud-causal"
|
4 |
+
url="https://github.com/UniversalDependencies/UD_Japanese-GSDLUW"
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
+
import os,json,unicodedata
|
7 |
+
from transformers import AutoTokenizer,AutoConfig,GPTNeoXForTokenClassification,DefaultDataCollator,TrainingArguments,Trainer
|
8 |
+
d=os.path.basename(url)
|
9 |
+
os.system("test -d "+d+" || git clone --depth=1 "+url)
|
10 |
+
os.system("for F in train dev test ; do cp "+d+"/*-$F.conllu $F.conllu ; done")
|
11 |
+
otk=AutoTokenizer.from_pretrained(src,cls_token="<|endoftext|>",sep_token="<|endoftext|>",mask_token="<|endoftext|>",model_max_length=2048)
|
12 |
+
otk.save_pretrained("tmpdir")
|
13 |
+
os.rename("tmpdir/tokenizer.json","tmpdir/oldtokenizer.json")
|
14 |
+
d=json.loads(otk.backend_tokenizer.to_str())
|
15 |
form=set()
|
16 |
with open("train.conllu","r",encoding="utf-8") as r:
|
17 |
for s in r:
|
|
|
19 |
if len(w)==10 and w[0].isdecimal():
|
20 |
form.add(w[1])
|
21 |
m=[t for t in d["model"]["merges"] if len(t)<5]
|
22 |
+
for i in range(len(otk)):
|
23 |
+
w=otk.decode(i)
|
24 |
if len(w)==2 and w in form and not unicodedata.name(w[0]).startswith("HIRAGANA"):
|
25 |
+
k=otk([w[0],w[1]],add_special_tokens=False)["input_ids"]
|
26 |
if len(k[0])==1 and len(k[1])==1:
|
27 |
+
m.append(" ".join(otk.convert_ids_to_tokens([k[0][0],k[1][0]])))
|
28 |
d["model"]["merges"]=m
|
29 |
+
otk.backend_tokenizer.from_str(json.dumps(d)).save("tmpdir/tokenizer.json")
|
30 |
+
ntk=AutoTokenizer.from_pretrained("tmpdir")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
|
32 |
class UDCausalDataset(object):
|
33 |
+
def __init__(self,conllu,tokenizer,oldtokenizer=None,embeddings=None):
|
34 |
self.conllu=open(conllu,"r",encoding="utf-8")
|
35 |
self.tokenizer=tokenizer
|
36 |
+
self.oldtokenizer=oldtokenizer if oldtokenizer else tokenizer
|
37 |
self.embeddings=embeddings
|
38 |
self.max_tokens=3
|
39 |
self.seeks=[(0,0)]
|
|
|
78 |
if w[0].isdecimal():
|
79 |
upos.append(w[3] if w[5]=="_" else w[3]+"|"+w[5])
|
80 |
deps.append((int(w[6]),w[7]))
|
|
|
81 |
if t==0:
|
82 |
+
v=self.tokenizer(form,add_special_tokens=False)
|
83 |
i,u=[],[]
|
84 |
for j,(x,y) in enumerate(zip(v["input_ids"],upos)):
|
85 |
if x!=[]:
|
|
|
89 |
pad=self.tokenizer.pad_token_id
|
90 |
else:
|
91 |
import torch
|
92 |
+
v=self.oldtokenizer(form,add_special_tokens=False)
|
93 |
m=[]
|
94 |
for x in v["input_ids"]:
|
95 |
if x==[]:
|
|
|
117 |
upos=u[0:self.max_tokens]
|
118 |
return {"inputs_embeds":emb[ids,:],"labels":[self.label2id[p] for p in upos]}
|
119 |
|
120 |
+
trainDS=UDCausalDataset("train.conllu",ntk,otk)
|
121 |
+
devDS=UDCausalDataset("dev.conllu",ntk,otk)
|
122 |
+
testDS=UDCausalDataset("test.conllu",ntk,otk)
|
|
|
123 |
lid=trainDS(devDS,testDS)
|
124 |
cfg=AutoConfig.from_pretrained(src,num_labels=len(lid),label2id=lid,id2label={i:l for l,i in lid.items()},ignore_mismatched_sizes=True)
|
125 |
mdl=GPTNeoXForTokenClassification.from_pretrained(src,config=cfg,ignore_mismatched_sizes=True)
|
126 |
trainDS.embeddings=mdl.get_input_embeddings().weight
|
127 |
trainDS.max_tokens=min(trainDS.max_tokens,cfg.max_position_embeddings)
|
128 |
+
arg=TrainingArguments(num_train_epochs=3,per_device_train_batch_size=24,dataloader_pin_memory=False,output_dir=tgt,overwrite_output_dir=True,save_total_limit=2,learning_rate=5e-05,warmup_ratio=0.1,save_safetensors=False)
|
|
|
129 |
trn=Trainer(args=arg,data_collator=DefaultDataCollator(),model=mdl,train_dataset=trainDS)
|
130 |
trn.train()
|
131 |
trn.save_model(tgt)
|
132 |
+
ntk.save_pretrained(tgt)
|
|
|
|
|
|
|
|
|
|
oldtokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pytorch_model-00001-of-00002.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 4992712735
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bbcbc6d719e352d5f58342e7bbb5fa7fec821081125009b3a55bdc51af03e031
|
3 |
size 4992712735
|
pytorch_model-00002-of-00002.bin
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 269925742
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3051174186ace7c287ba585ed20f81b1178c2e71e42803be81c823f30e2b94ea
|
3 |
size 269925742
|
tokenizer_config.json
CHANGED
@@ -28,6 +28,6 @@
|
|
28 |
"model_max_length": 2048,
|
29 |
"pad_token": "<|padding|>",
|
30 |
"sep_token": "<|endoftext|>",
|
31 |
-
"tokenizer_class": "
|
32 |
"unk_token": "<|endoftext|>"
|
33 |
}
|
|
|
28 |
"model_max_length": 2048,
|
29 |
"pad_token": "<|padding|>",
|
30 |
"sep_token": "<|endoftext|>",
|
31 |
+
"tokenizer_class": "GPTNeoXTokenizerFast",
|
32 |
"unk_token": "<|endoftext|>"
|
33 |
}
|
ud.py
CHANGED
@@ -1,5 +1,10 @@
|
|
1 |
import numpy
|
2 |
-
from transformers import TokenClassificationPipeline
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
5 |
def __init__(self,**kwargs):
|
@@ -42,6 +47,7 @@ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline
|
|
42 |
def __init__(self,**kwargs):
|
43 |
kwargs["aggregation_strategy"]="simple"
|
44 |
super().__init__(**kwargs)
|
|
|
45 |
x=self.model.config.label2id
|
46 |
self.root=numpy.full((len(x)),numpy.nan)
|
47 |
self.left_arc=numpy.full((len(x)),numpy.nan)
|
@@ -87,7 +93,7 @@ class UniversalDependenciesCausalPipeline(BellmanFordTokenClassificationPipeline
|
|
87 |
if d[i].strip()=="":
|
88 |
d.pop(i)
|
89 |
w.pop(i)
|
90 |
-
v=self.
|
91 |
e=self.model.get_input_embeddings().weight
|
92 |
m=[]
|
93 |
for x in v["input_ids"]:
|
|
|
1 |
import numpy
|
2 |
+
from transformers import TokenClassificationPipeline,AutoTokenizer
|
3 |
+
try:
|
4 |
+
from transformers.utils import cached_file
|
5 |
+
except:
|
6 |
+
from transformers.file_utils import cached_path,hf_bucket_url
|
7 |
+
cached_file=lambda x,y:os.path.join(x,y) if os.path.isdir(x) else cached_path(hf_bucket_url(x,y))
|
8 |
|
9 |
class BellmanFordTokenClassificationPipeline(TokenClassificationPipeline):
|
10 |
def __init__(self,**kwargs):
|
|
|
47 |
def __init__(self,**kwargs):
|
48 |
kwargs["aggregation_strategy"]="simple"
|
49 |
super().__init__(**kwargs)
|
50 |
+
self.oldtokenizer=AutoTokenizer.from_pretrained(self.tokenizer.name_or_path,tokenizer_file=cached_file(self.tokenizer.name_or_path,"oldtokenizer.json"))
|
51 |
x=self.model.config.label2id
|
52 |
self.root=numpy.full((len(x)),numpy.nan)
|
53 |
self.left_arc=numpy.full((len(x)),numpy.nan)
|
|
|
93 |
if d[i].strip()=="":
|
94 |
d.pop(i)
|
95 |
w.pop(i)
|
96 |
+
v=self.oldtokenizer(d,add_special_tokens=False)
|
97 |
e=self.model.get_input_embeddings().weight
|
98 |
m=[]
|
99 |
for x in v["input_ids"]:
|