File size: 2,406 Bytes
f650f50
 
5ac0c16
8f8e283
f650f50
 
 
 
 
 
 
 
 
 
 
 
 
 
27af78d
f650f50
 
 
c4ba27c
f650f50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import numpy as np, os
from pathlib import Path

# Token id of the <sep> document separator.
# NOTE(review): the original author marked this value as unconfirmed — verify
# it against the tokenizer before trusting the output.
SEP_ID = 4

# Maximum number of documents taken from each source shard. With the 17 shards
# listed below this caps the validation set at 1,700 documents.
DOCS_PER_SOURCE = 100

# Destination of the assembled validation split.
OUT_PATH = "/data/val.bin"

# Languages that have a code shard, in output order.
# NOTE(review): an earlier comment claimed 16 languages; only these 13 are
# listed — confirm whether three shards are missing.
_CODE_LANGUAGES = (
    "CSS",
    "C",
    "C++",
    "Java",
    "GO",
    "Rust",
    "Ruby",
    "PHP",
    "SQL",
    "C%23",  # percent-encoded "#", presumably C#
    "Scala",
    "Lua",
    "Perl",
)

# All source files except textbook, as paths relative to /data.
SOURCES = [
    "tokenized/fineweb__000_00007.bin",
    "tokenized/wikipedia__train-00005-of-00041.bin",
    "tokenized/openwebmath__train-00000-of-00114.bin",
    "tokenized/phi__programming_books.bin",
    *(f"tokenized/code__shard_000000_{lang}.bin" for lang in _CODE_LANGUAGES),
]

def extract_docs(bin_path, sep_id, n_docs, *, chunk_tokens=1_000_000):
    """Return the first ``n_docs`` non-empty documents stored in a token file.

    The file is read as a flat stream of native-endian ``uint16`` token ids and
    split wherever ``sep_id`` occurs. Empty documents (a leading separator or
    consecutive separators) are skipped. If the file ends while a document is
    still open, that trailing document is returned as well instead of being
    silently discarded.

    Args:
        bin_path: Path of the binary token file.
        sep_id: Token id that separates documents.
        n_docs: Maximum number of documents to return.
        chunk_tokens: Number of tokens read from disk per iteration; only
            affects memory use and I/O granularity, never the result.

    Returns:
        A list of at most ``n_docs`` one-dimensional ``np.uint16`` arrays, in
        file order, none of which contains the separator.

    Raises:
        ValueError: If ``chunk_tokens`` is smaller than 1, or (from
            ``np.frombuffer``) if the last chunk read has an odd byte count.
    """
    if chunk_tokens < 1:
        raise ValueError("chunk_tokens must be at least 1")

    docs = []
    # Slices of the document currently being assembled; a document may span
    # several chunks, so its parts are joined only once it is complete.
    pieces = []
    with open(bin_path, "rb") as f:
        while len(docs) < n_docs:
            raw = f.read(chunk_tokens * 2)  # 2 bytes per uint16 token
            if not raw:
                break
            tokens = np.frombuffer(raw, dtype=np.uint16)
            start = 0
            # Locate every separator at once instead of testing token by token.
            for end in np.flatnonzero(tokens == sep_id).tolist():
                if end > start:
                    pieces.append(tokens[start:end])
                start = end + 1
                if not pieces:
                    continue  # empty document: nothing between separators
                # np.concatenate copies, so the result owns writable memory
                # and no longer references the read buffer.
                docs.append(np.concatenate(pieces))
                pieces = []
                if len(docs) >= n_docs:
                    return docs
            if start < len(tokens):
                # Tail of the chunk belongs to a document that continues.
                pieces.append(tokens[start:])
    if pieces:
        # End of file reached with an unterminated final document.
        docs.append(np.concatenate(pieces))
    return docs

# Collect up to DOCS_PER_SOURCE documents from every shard present under /data;
# shards that do not exist are reported and skipped.
all_docs = []
for src in SOURCES:
    path = f"/data/{src}"
    if os.path.exists(path):
        docs = extract_docs(path, SEP_ID, DOCS_PER_SOURCE)
        all_docs += docs
        shard_name = src.rsplit("/", 1)[-1]  # file name without its directory
        print(f"  {shard_name}: {len(docs)} docs")
    else:
        print(f"  Missing: {src}")

total_docs = len(all_docs)
print(f"\nTotal val docs: {total_docs:,}")

# Serialize every collected document to OUT_PATH as uint16 tokens, each
# document followed by a single SEP_ID token.
with open(OUT_PATH, "wb") as out:
    for doc in all_docs:
        terminated = np.concatenate((doc, [SEP_ID])).astype(np.uint16)
        out.write(terminated.tobytes())

written_mb = os.path.getsize(OUT_PATH) / 1e6
print(f"val.bin written: {written_mb:.1f} MB")