Tok6

Paused

Tok6 / app.py

Update app.py

6045381 verified about 1 month ago

1.35 kB

	import gc
	import json
	import os
	import shutil
	from pathlib import Path

	# Directory that is globbed for the per-crawl shard files to merge.
	TOK_DIR = "/data/tokenized"
	# Directory that receives one merged file per crawl.
	OUT_DIR = "/data/ready"

	# Crawl IDs in output order: the n-th ID is written to the n-th output file.
	_CRAWL_IDS = ("2025-05", "2025-08", "2025-13", "2025-18", "2025-21", "2025-28")

	# Maps each crawl ID to its merged output name, "fw-NNNNN-of-00006.bin",
	# where NNNNN is the 1-based position of the crawl in _CRAWL_IDS.
	CRAWL_MAP = {
	    crawl: f"fw-{position:05d}-of-{len(_CRAWL_IDS):05d}.bin"
	    for position, crawl in enumerate(_CRAWL_IDS, start=1)
	}

	# Merge every crawl's shards from TOK_DIR into one file under OUT_DIR and
	# record a per-output token count in /data/index.json.
	os.makedirs(OUT_DIR, exist_ok=True)
	print("Starting merge...")
	print(f"TOK_DIR sample: {list(Path(TOK_DIR).iterdir())[:5]}")

	index_path = Path("/data/index.json")

	# Seed the index from the previous run. Outputs that already exist are
	# skipped below, so rebuilding the index from scratch would silently drop
	# their entries on every re-run.
	index = {}
	try:
	    previous = json.loads(index_path.read_text())
	except FileNotFoundError:
	    previous = {}
	except json.JSONDecodeError:
	    print(f" WARN could not parse {index_path}, rebuilding it")
	    previous = {}
	if isinstance(previous, dict):
	    index.update(previous)

	for crawl_id, out_name in CRAWL_MAP.items():
	    out_path = Path(OUT_DIR) / out_name
	    if out_path.exists():
	        print(f" SKIP {out_name}")
	        # Backfill a missing entry from the file size, using the same
	        # 2-bytes-per-token accounting as the merge loop below.
	        index.setdefault(
	            out_name,
	            {"crawl": crawl_id, "total_tokens": out_path.stat().st_size // 2},
	        )
	        continue

	    shards = sorted(Path(TOK_DIR).glob(f"cc{crawl_id}_*.bin"))
	    print(f" {crawl_id}: {len(shards)} shards")
	    if not shards:
	        continue

	    # Write to a temporary sibling and rename it when complete, so an
	    # interrupted run never leaves a truncated file at out_path that the
	    # exists() check above would later mistake for a finished merge.
	    part_path = out_path.with_name(out_name + ".part")
	    total_tokens = 0
	    with open(part_path, "wb") as out_f:
	        for shard in shards:
	            # Stream the shard instead of loading it whole into memory.
	            with shard.open("rb") as in_f:
	                shutil.copyfileobj(in_f, out_f)
	            total_tokens += shard.stat().st_size // 2
	            print(f" ✓ {shard.name}")
	    os.replace(part_path, out_path)

	    # Delete the inputs only once the merged file is in place; removing them
	    # mid-merge would lose data if the run were interrupted.
	    # NOTE(review): shards and the merged copy now coexist briefly, so peak
	    # disk use is higher than with delete-as-you-go — confirm /data has room.
	    for shard in shards:
	        shard.unlink()

	    index[out_name] = {"crawl": crawl_id, "total_tokens": total_tokens}
	    print(f" ✓ {out_name} | {total_tokens:,} tokens")

	index_path.write_text(json.dumps(index, indent=2))
	print("Done")