"""Merge per-crawl tokenized shards into one output ``.bin`` file per crawl.

For every entry in ``CRAWL_MAP`` the shards matching ``cc<crawl_id>_*.bin``
in the token directory are concatenated (in sorted name order) into a single
output file, and a JSON index with the token count per output file is written.
"""

import gc  # noqa: F401  (existing file-level import, no longer needed here)
import json
import os
import shutil
from itertools import islice
from pathlib import Path

TOK_DIR = "/data/tokenized"
OUT_DIR = "/data/ready"
INDEX_PATH = "/data/index.json"

# Token counts are derived from byte counts as bytes // 2.
BYTES_PER_TOKEN = 2
# Shards are streamed in chunks of this size instead of being loaded whole.
COPY_CHUNK_BYTES = 16 * 1024 * 1024

CRAWL_MAP = {
    "2025-05": "fw-00001-of-00006.bin",
    "2025-08": "fw-00002-of-00006.bin",
    "2025-13": "fw-00003-of-00006.bin",
    "2025-18": "fw-00004-of-00006.bin",
    "2025-21": "fw-00005-of-00006.bin",
    "2025-28": "fw-00006-of-00006.bin",
}


def _load_index(index_path):
    """Return the index stored at *index_path*, or ``{}`` if there is none."""
    try:
        loaded = json.loads(index_path.read_text())
    except FileNotFoundError:
        return {}
    except ValueError as err:
        # Covers malformed JSON and undecodable bytes; start from scratch
        # rather than aborting the merge over a broken index file.
        print(f" WARN ignoring unreadable index {index_path}: {err}")
        return {}
    return loaded if isinstance(loaded, dict) else {}


def _merge_shards(shards, out_path):
    """Concatenate *shards* into *out_path* and return the total token count.

    The data is first written to a ``.partial`` sibling and moved into place
    only once every shard has been copied and flushed to disk. The source
    shards are deleted after that move, so an interrupted run never leaves a
    truncated output under its final name with its inputs already gone.
    """
    partial_path = out_path.with_name(out_path.name + ".partial")
    total_tokens = 0
    with open(partial_path, "wb") as out_f:
        for shard in shards:
            start = out_f.tell()
            with open(shard, "rb") as in_f:
                shutil.copyfileobj(in_f, out_f, COPY_CHUNK_BYTES)
            # Per-shard floor division, matching how shards are counted.
            total_tokens += (out_f.tell() - start) // BYTES_PER_TOKEN
            print(f" ✓ {shard.name}")
        out_f.flush()
        os.fsync(out_f.fileno())
    os.replace(partial_path, out_path)
    # Only now is it safe to drop the inputs.
    for shard in shards:
        shard.unlink()
    return total_tokens


def merge_crawls(tok_dir=TOK_DIR, out_dir=OUT_DIR, crawl_map=None,
                 index_path=INDEX_PATH):
    """Merge the shards of every crawl in *crawl_map* and write the index.

    Args:
        tok_dir: Directory containing the ``cc<crawl_id>_*.bin`` shards.
        out_dir: Directory that receives the merged files (created if missing).
        crawl_map: Mapping of crawl id to output file name; defaults to
            ``CRAWL_MAP``.
        index_path: Location of the JSON index file.

    Returns:
        The index that was written: ``{out_name: {"crawl", "total_tokens"}}``.

    Outputs that already exist are skipped. Their existing index entry is
    kept; if they have none, one is derived from the output file's size.
    Crawls without any shards are skipped and get no index entry.
    """
    tok_dir = Path(tok_dir)
    out_dir = Path(out_dir)
    index_path = Path(index_path)
    if crawl_map is None:
        crawl_map = CRAWL_MAP

    os.makedirs(out_dir, exist_ok=True)
    print("Starting merge...")
    print(f"TOK_DIR sample: {list(islice(tok_dir.iterdir(), 5))}")

    # Start from the previous index so entries of skipped crawls survive.
    index = _load_index(index_path)

    for crawl_id, out_name in crawl_map.items():
        out_path = out_dir / out_name
        if out_path.exists():
            print(f" SKIP {out_name}")
            if out_name not in index:
                index[out_name] = {
                    "crawl": crawl_id,
                    "total_tokens": out_path.stat().st_size // BYTES_PER_TOKEN,
                }
            continue

        shards = sorted(tok_dir.glob(f"cc{crawl_id}_*.bin"))
        print(f" {crawl_id}: {len(shards)} shards")
        if not shards:
            continue

        total_tokens = _merge_shards(shards, out_path)
        index[out_name] = {"crawl": crawl_id, "total_tokens": total_tokens}
        print(f" ✓ {out_name} | {total_tokens:,} tokens")

    index_path.write_text(json.dumps(index, indent=2))
    print("Done")
    return index


if __name__ == "__main__":
    merge_crawls()