Spaces:

Neon-coding
/

Atrain

Runtime error

App Files Files Community

Atrain / app.py

Neon-tech

Update app.py

8f8e283 verified 5 days ago

raw

history blame contribute delete

2.41 kB

	import numpy as np, os
	from pathlib import Path

	# Token id used as the document separator when splitting the tokenized shards.
	# TODO(review): confirm this matches the tokenizer's <sep> id.
	SEP_ID = 4 # <sep> token id — confirm this
	# Upper bound on documents taken from each source file. With the 17 files
	# listed in SOURCES this yields at most 1,700 documents (not ~2000).
	DOCS_PER_SOURCE = 100 # 100 docs per source = ~2000 total
	# Destination file for the combined validation token stream.
	OUT_PATH = "/data/val.bin"

	# Paths of the tokenized shards to sample from; each is joined onto "/data/"
	# by the driver loop further down in this file.
	# All source files except textbook
	SOURCES = [
	# fineweb
	"tokenized/fineweb__000_00007.bin",
	# wikipedia
	"tokenized/wikipedia__train-00005-of-00041.bin",
	# openwebmath
	"tokenized/openwebmath__train-00000-of-00114.bin",
	# phi
	"tokenized/phi__programming_books.bin",
	# code — 13 language shards are listed here.
	# NOTE(review): this comment previously said "all 16 languages"; confirm
	# whether three shards are missing from the list.
	"tokenized/code__shard_000000_CSS.bin",
	"tokenized/code__shard_000000_C.bin",
	"tokenized/code__shard_000000_C++.bin",
	"tokenized/code__shard_000000_Java.bin",
	"tokenized/code__shard_000000_GO.bin",
	"tokenized/code__shard_000000_Rust.bin",
	"tokenized/code__shard_000000_Ruby.bin",
	"tokenized/code__shard_000000_PHP.bin",
	"tokenized/code__shard_000000_SQL.bin",
	# "%23" is a URL-encoded "#" (i.e. C#) — presumably the on-disk file name
	# really contains "%23"; verify before relying on it.
	"tokenized/code__shard_000000_C%23.bin",
	"tokenized/code__shard_000000_Scala.bin",
	"tokenized/code__shard_000000_Lua.bin",
	"tokenized/code__shard_000000_Perl.bin",
	]

	def extract_docs(bin_path, sep_id, n_docs):
	    """Return the first ``n_docs`` non-empty documents stored in ``bin_path``.

	    The file is read as a flat stream of native-endian ``uint16`` token ids
	    in chunks of 1,000,000 tokens, so arbitrarily large shards never have to
	    fit in memory. Documents are the runs of tokens between occurrences of
	    ``sep_id``; the separator itself is not included in the returned arrays.

	    Args:
	        bin_path: Path of the binary token file to read.
	        sep_id: Token id that terminates a document.
	        n_docs: Maximum number of documents to return. Values <= 0 yield
	            an empty list without reading the file contents.

	    Returns:
	        A list of at most ``n_docs`` one-dimensional ``np.uint16`` arrays, in
	        file order. Empty documents (consecutive separators) are skipped, and
	        a trailing run of tokens that is not followed by a separator is
	        dropped because it cannot be told apart from a truncated document.

	    Raises:
	        OSError: If ``bin_path`` cannot be opened or read.
	    """
	    docs = []
	    pieces = []  # slices of the document currently being assembled
	    leftover = b""  # odd trailing byte carried over to the next read
	    chunk_bytes = 1_000_000 * 2  # 1M uint16 tokens per read
	    with open(bin_path, "rb") as f:
	        while len(docs) < n_docs:
	            raw = f.read(chunk_bytes)
	            if not raw:
	                break
	            # A read (or the file itself) may end on an odd byte; only whole
	            # 2-byte tokens can be decoded, so keep the stray byte for later
	            # instead of letting np.frombuffer raise ValueError.
	            raw = leftover + raw
	            usable = len(raw) - (len(raw) % 2)
	            leftover = raw[usable:]
	            if usable == 0:
	                continue
	            tokens = np.frombuffer(raw[:usable], dtype=np.uint16)
	            start = 0
	            # Locate separators in one vectorised pass rather than testing
	            # every token in a Python-level loop.
	            for sep_pos in np.flatnonzero(tokens == sep_id).tolist():
	                if sep_pos > start:
	                    pieces.append(tokens[start:sep_pos])
	                start = sep_pos + 1
	                if pieces:
	                    # np.concatenate copies, so the result is an independent
	                    # array that does not pin the 2 MB read buffer.
	                    docs.append(np.concatenate(pieces))
	                    pieces = []
	                    if len(docs) >= n_docs:
	                        break
	            else:
	                # No early exit: tokens after the last separator belong to a
	                # document that continues in the next chunk.
	                if start < len(tokens):
	                    pieces.append(tokens[start:])
	    return docs

	# Collect up to DOCS_PER_SOURCE documents from every shard that exists under
	# /data, reporting shards that are absent instead of failing on them.
	all_docs = []
	for src in SOURCES:
	    path = f"/data/{src}"
	    if os.path.exists(path):
	        docs = extract_docs(path, SEP_ID, DOCS_PER_SOURCE)
	        all_docs.extend(docs)
	        print(f" {src.split('/')[-1]}: {len(docs)} docs")
	    else:
	        print(f" Missing: {src}")

	print(f"\nTotal val docs: {len(all_docs):,}")

	# Serialise the documents back to back as uint16, re-adding the separator
	# token after each one.
	sep_token = np.array([SEP_ID])
	with open(OUT_PATH, "wb") as out_file:
	    for tokens in all_docs:
	        terminated = np.concatenate((tokens, sep_token))
	        terminated.astype(np.uint16).tofile(out_file)

	print(f"val.bin written: {os.path.getsize(OUT_PATH)/1e6:.1f} MB")