Atrain / app.py
Neon-tech's picture
Update app.py
8f8e283 verified
import numpy as np, os
from pathlib import Path
# Token id written between documents.  TODO: confirm this is the tokenizer's
# <sep> id before trusting the output.
SEP_ID = 4

# Upper bound on documents sampled from each source file.  With the 17 files
# listed in SOURCES this yields at most 1,700 validation documents.
DOCS_PER_SOURCE = 100

# Destination of the concatenated validation tokens.
OUT_PATH = "/data/val.bin"

# Language suffixes of the code shards, in the order they are sampled
# (13 entries).  "C%23" is spelled exactly as it appears in the shard name.
_CODE_LANGUAGES = (
    "CSS",
    "C",
    "C++",
    "Java",
    "GO",
    "Rust",
    "Ruby",
    "PHP",
    "SQL",
    "C%23",
    "Scala",
    "Lua",
    "Perl",
)

# Every source file except textbook: one shard each of fineweb, wikipedia,
# openwebmath and phi, followed by the first shard of each code language.
SOURCES = [
    "tokenized/fineweb__000_00007.bin",
    "tokenized/wikipedia__train-00005-of-00041.bin",
    "tokenized/openwebmath__train-00000-of-00114.bin",
    "tokenized/phi__programming_books.bin",
    *(f"tokenized/code__shard_000000_{lang}.bin" for lang in _CODE_LANGUAGES),
]
def extract_docs(bin_path, sep_id, n_docs):
    """Return the first ``n_docs`` non-empty documents from a uint16 token file.

    The file is read in fixed-size chunks and split on ``sep_id``; the
    separator itself is not included in the returned documents.

    Args:
        bin_path: Path to a binary file of native-endian uint16 tokens.
        sep_id: Token id that terminates a document.
        n_docs: Maximum number of documents to return.

    Returns:
        A list of at most ``n_docs`` one-dimensional ``np.uint16`` arrays,
        each an independent copy (not a view of the read buffer).

    Notes:
        * Empty documents (consecutive separators) are skipped.
        * Tokens after the last separator in the file are dropped, because
          only separator-terminated documents are emitted.
        * A dangling odd byte at the end of the file is ignored rather than
          making ``np.frombuffer`` raise.
    """
    token_dtype = np.dtype(np.uint16)
    item_size = token_dtype.itemsize
    chunk_bytes = 1_000_000 * item_size

    docs = []
    pieces = []  # non-empty slices of the document currently being assembled
    carry = b""  # bytes of a token split across two reads
    with open(bin_path, "rb") as f:
        while len(docs) < n_docs:
            data = f.read(chunk_bytes)
            if not data:
                break
            raw = carry + data if carry else data
            n_tokens = len(raw) // item_size
            carry = raw[n_tokens * item_size:]
            # ``count`` lets frombuffer ignore a trailing partial token.
            tokens = np.frombuffer(raw, dtype=token_dtype, count=n_tokens)

            start = 0
            for end in np.flatnonzero(tokens == sep_id):
                if end > start:
                    pieces.append(tokens[start:end])
                start = end + 1
                if pieces:
                    # concatenate always copies, so the result does not keep
                    # the read-only chunk buffer alive.
                    docs.append(np.concatenate(pieces))
                    pieces = []
                    if len(docs) >= n_docs:
                        break

            # Keep the unterminated tail; it continues in the next chunk.
            if start < n_tokens:
                pieces.append(tokens[start:])
    return docs
# Gather up to DOCS_PER_SOURCE documents from every source that exists on
# disk, reporting each file as it is processed.
all_docs = []
for src in SOURCES:
    src_path = "/data/" + src
    if os.path.exists(src_path):
        found = extract_docs(src_path, SEP_ID, DOCS_PER_SOURCE)
        all_docs += found
        print(f" {src.split('/')[-1]}: {len(found)} docs")
    else:
        # A missing shard is reported and skipped rather than aborting.
        print(f" Missing: {src}")

print(f"\nTotal val docs: {len(all_docs):,}")

# Write every document followed by one separator token, back to back, as
# uint16 values.
with open(OUT_PATH, "wb") as out_file:
    for tokens in all_docs:
        terminated = np.append(tokens, SEP_ID).astype(np.uint16)
        terminated.tofile(out_file)

print(f"val.bin written: {os.path.getsize(OUT_PATH)/1e6:.1f} MB")