Spaces:
Runtime error
Runtime error
| import numpy as np, os | |
| from pathlib import Path | |
# Token id used as the document separator (<sep>) both when splitting the
# source shards and when writing the output. NOTE(review): the value 4 is
# unverified — confirm it against the tokenizer.
SEP_ID = 4
# Maximum number of documents taken from each source file, so the output holds
# at most DOCS_PER_SOURCE * len(SOURCES) documents.
DOCS_PER_SOURCE = 100
# Destination file for the collected val tokens.
OUT_PATH = "/data/val.bin"
# Language suffixes of the code shards to sample (one shard per language).
# "C%23" is kept exactly as spelled in the shard filename.
_CODE_LANGUAGES = (
    "CSS",
    "C",
    "C++",
    "Java",
    "GO",
    "Rust",
    "Ruby",
    "PHP",
    "SQL",
    "C%23",
    "Scala",
    "Lua",
    "Perl",
)

# Source shards to sample from, relative to /data (textbook data is excluded).
# Order: fineweb, wikipedia, openwebmath, phi, then the code shards.
SOURCES = [
    "tokenized/fineweb__000_00007.bin",
    "tokenized/wikipedia__train-00005-of-00041.bin",
    "tokenized/openwebmath__train-00000-of-00114.bin",
    "tokenized/phi__programming_books.bin",
    *(f"tokenized/code__shard_000000_{lang}.bin" for lang in _CODE_LANGUAGES),
]
def extract_docs(bin_path, sep_id, n_docs):
    """Return the first ``n_docs`` non-empty documents stored in a token file.

    The file is read as a flat stream of native-endian ``uint16`` token ids in
    fixed-size chunks, so only the chunk being scanned plus the documents
    collected so far are held in memory.

    Args:
        bin_path: Path of the binary token file to read.
        sep_id: Token id that terminates a document.
        n_docs: Maximum number of documents to return.

    Returns:
        A list of at most ``n_docs`` one-dimensional ``np.uint16`` arrays, one
        per document, without the separator token. Empty documents (two
        adjacent separators) are skipped. Tokens after the last separator are
        not returned, because a document only counts once its separator has
        been seen.
    """
    docs = []
    # Slices of the document currently being assembled; a document may span
    # several chunks, so the pieces are joined once its separator is found.
    pending = []
    chunk_tokens = 1_000_000
    itemsize = np.dtype(np.uint16).itemsize
    # Bytes that did not form a whole token in the previous read.
    leftover = b""
    with open(bin_path, "rb") as f:
        while len(docs) < n_docs:
            raw = f.read(chunk_tokens * itemsize)
            if not raw:
                break
            raw = leftover + raw
            # np.frombuffer rejects buffers that are not a multiple of the
            # item size, so carry any dangling byte over to the next read
            # (a dangling byte at end of file is ignored).
            usable = len(raw) - len(raw) % itemsize
            leftover = raw[usable:]
            if not usable:
                continue
            tokens = np.frombuffer(raw[:usable], dtype=np.uint16)
            start = 0
            # Locate all separators at once instead of testing every token
            # in a Python-level loop.
            for sep_pos in np.flatnonzero(tokens == sep_id):
                pending.append(tokens[start:sep_pos])
                start = sep_pos + 1
                # np.concatenate copies, so the document no longer refers to
                # the read-only chunk buffer.
                doc = np.concatenate(pending)
                pending = []
                if doc.size:
                    docs.append(doc)
                    if len(docs) >= n_docs:
                        break
            else:
                # No early exit: keep the unterminated tail for the next chunk.
                pending.append(tokens[start:])
    return docs
# Collect up to DOCS_PER_SOURCE documents from every source shard that exists
# under /data, reporting per-shard counts and any shard that is missing.
all_docs = []
for src in SOURCES:
    path = f"/data/{src}"
    if os.path.exists(path):
        docs = extract_docs(path, SEP_ID, DOCS_PER_SOURCE)
        all_docs.extend(docs)
        shard_name = src.rsplit("/", 1)[-1]
        print(f" {shard_name}: {len(docs)} docs")
    else:
        print(f" Missing: {src}")
print(f"\nTotal val docs: {len(all_docs):,}")
# Write every collected document to OUT_PATH as uint16 tokens, each followed
# by the separator token, then report the resulting file size.
with open(OUT_PATH, "wb") as f:
    for doc in all_docs:
        doc_with_sep = np.concatenate((doc, [SEP_ID])).astype(np.uint16)
        doc_with_sep.tofile(f)
size_mb = os.path.getsize(OUT_PATH) / 1e6
print(f"val.bin written: {size_mb:.1f} MB")