"""Build a small validation set (val.bin) from the first documents of each tokenized shard.

Each source shard is read as a flat stream of uint16 token ids in which
documents are delimited by SEP_ID. The first DOCS_PER_SOURCE non-empty
documents of every shard are concatenated, each followed by SEP_ID, and
written to OUT_PATH.
"""
import os
from pathlib import Path

import numpy as np

SEP_ID = 4  # token id — confirm this
DOCS_PER_SOURCE = 100  # up to 100 docs per source; 17 sources listed below => at most 1,700 total
DATA_DIR = Path("/data")
OUT_PATH = "/data/val.bin"

# All source files except textbook
SOURCES = [
    # fineweb
    "tokenized/fineweb__000_00007.bin",
    # wikipedia
    "tokenized/wikipedia__train-00005-of-00041.bin",
    # openwebmath
    "tokenized/openwebmath__train-00000-of-00114.bin",
    # phi
    "tokenized/phi__programming_books.bin",
    # code — one shard per language (13 listed here)
    "tokenized/code__shard_000000_CSS.bin",
    "tokenized/code__shard_000000_C.bin",
    "tokenized/code__shard_000000_C++.bin",
    "tokenized/code__shard_000000_Java.bin",
    "tokenized/code__shard_000000_GO.bin",
    "tokenized/code__shard_000000_Rust.bin",
    "tokenized/code__shard_000000_Ruby.bin",
    "tokenized/code__shard_000000_PHP.bin",
    "tokenized/code__shard_000000_SQL.bin",
    "tokenized/code__shard_000000_C%23.bin",
    "tokenized/code__shard_000000_Scala.bin",
    "tokenized/code__shard_000000_Lua.bin",
    "tokenized/code__shard_000000_Perl.bin",
]


def extract_docs(bin_path, sep_id: int, n_docs: int, chunk_tokens: int = 1_000_000) -> list:
    """Stream a uint16 token file, split on ``sep_id``, return the first ``n_docs`` docs.

    Args:
        bin_path: Path of a binary file holding native-endian uint16 token ids.
        sep_id: Token id that terminates a document. It is not included in
            the returned documents.
        n_docs: Maximum number of documents to return.
        chunk_tokens: Number of tokens read from disk per iteration.

    Returns:
        A list of at most ``n_docs`` one-dimensional ``np.uint16`` arrays, in
        file order. Empty documents (consecutive separators) are skipped. A
        trailing run of tokens that is not terminated by ``sep_id`` before
        end-of-file is dropped.

    Raises:
        ValueError: If ``chunk_tokens`` is smaller than 1.
    """
    if chunk_tokens < 1:
        raise ValueError("chunk_tokens must be >= 1")

    token_bytes = np.dtype(np.uint16).itemsize
    docs = []
    pieces = []  # non-empty slices of the document currently being assembled
    carry = b""  # odd byte left over from a read that split a token in half

    with open(bin_path, "rb") as f:
        while len(docs) < n_docs:
            raw = f.read(chunk_tokens * token_bytes)
            if not raw:
                break
            if carry:
                raw = carry + raw
            # np.frombuffer requires a whole number of tokens; hold back any
            # odd byte (a stray one at end-of-file is ultimately ignored).
            usable = len(raw) - len(raw) % token_bytes
            carry = raw[usable:]
            if not usable:
                continue
            tokens = np.frombuffer(raw[:usable], dtype=np.uint16)

            start = 0
            for sep_pos in np.flatnonzero(tokens == sep_id):
                piece = tokens[start:sep_pos]
                start = sep_pos + 1
                if piece.size:
                    pieces.append(piece)
                if pieces:
                    # concatenate copies, so the result does not alias the
                    # read-only buffer backing ``tokens``.
                    docs.append(np.concatenate(pieces))
                    pieces = []
                    if len(docs) >= n_docs:
                        break
            else:
                # Chunk exhausted without reaching the quota: keep the
                # unterminated remainder for the next chunk.
                tail = tokens[start:]
                if tail.size:
                    pieces.append(tail)
    return docs


def main() -> None:
    """Collect documents from every available source and write them to OUT_PATH."""
    all_docs = []
    for src in SOURCES:
        path = DATA_DIR / src
        if not path.exists():
            # Best-effort: a missing shard is reported and skipped.
            print(f" Missing: {src}")
            continue
        docs = extract_docs(path, SEP_ID, DOCS_PER_SOURCE)
        all_docs.extend(docs)
        print(f" {src.split('/')[-1]}: {len(docs)} docs")

    print(f"\nTotal val docs: {len(all_docs):,}")

    # Write to val.bin: every document followed by one separator token.
    separator = np.array([SEP_ID], dtype=np.uint16)
    with open(OUT_PATH, "wb") as f:
        for doc in all_docs:
            doc.tofile(f)
            separator.tofile(f)
    print(f"val.bin written: {os.path.getsize(OUT_PATH)/1e6:.1f} MB")


if __name__ == "__main__":
    main()