Adding correct models at 10k steps
Files changed:
- flax_model.msgpack (+2 -2)
- pytorch_model.bin (+3 -0)
- tokens.py (+2 -2)
flax_model.msgpack CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:5ff31ebb2460dbc41a160cc755d0555bb8c84672563808b968a2a121c1b2414a
+size 711587941
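The model weights are tracked with git-LFS, so the repository itself only stores a three-line pointer (version, oid, size); the oid is the SHA-256 of the actual checkpoint file. A minimal sketch (the local path is hypothetical) of checking a downloaded file against the pointer above:

import hashlib

def sha256_of(path, chunk_size=1 << 20):
    # Hash in 1 MiB chunks so a multi-GB checkpoint never has to fit in memory.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# oid taken from the new flax_model.msgpack pointer in this commit
expected = "5ff31ebb2460dbc41a160cc755d0555bb8c84672563808b968a2a121c1b2414a"
assert sha256_of("flax_model.msgpack") == expected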
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b4265b625a915f8a622926c9be27d6b1f3f2bc44481f81ab5d53eace54a0bc06
+size 1421780139
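With both flax_model.msgpack and pytorch_model.bin present, the checkpoint can be loaded from either framework through transformers. A sketch, assuming a standard transformers repo layout; "user/repo" is a placeholder, since the diff does not show this repository's id:

from transformers import AutoModel, FlaxAutoModel

repo_id = "user/repo"  # placeholder: the repository id is not part of this diff
pt_model = AutoModel.from_pretrained(repo_id)        # reads pytorch_model.bin
flax_model = FlaxAutoModel.from_pretrained(repo_id)  # reads flax_model.msgpack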
tokens.py CHANGED
@@ -3,11 +3,11 @@ from datasets import load_dataset
 from tokenizers import ByteLevelBPETokenizer
 
 # Load dataset
-dataset = load_dataset("oscar", "unshuffled_deduplicated_es")
+dataset = load_dataset("oscar", "unshuffled_deduplicated_es", split="train")
 
 # Instantiate tokenizer
 tokenizer = ByteLevelBPETokenizer()
-def batch_iterator(batch_size=
+def batch_iterator(batch_size=1_000_000):
     for i in range(0, len(dataset), batch_size):
         yield dataset["text"][i: i + batch_size]
 
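The split="train" argument is the substantive fix: without it, load_dataset returns a DatasetDict keyed by split name, so len(dataset) and dataset["text"] inside batch_iterator do not index rows at all. The hunk ends before any training call; a minimal sketch of how the iterator would typically feed the tokenizer (the vocab size and special tokens are assumptions, not taken from this commit):

# Hypothetical continuation of tokens.py after the hunk above
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50_265,  # assumed; not shown in this diff
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
tokenizer.save("tokenizer.json")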