"""Merge per-crawl tokenized shards into one output file per crawl.

For every ``crawl_id -> out_name`` entry in ``CRAWL_MAP``, the shards matching
``cc<crawl_id>_*.bin`` under ``TOK_DIR`` are concatenated, in sorted name
order, into ``OUT_DIR/<out_name>``.  A JSON index mapping each output file to
its crawl id and token count is written to ``INDEX_PATH``.

The script is safe to re-run: outputs that already exist are skipped, and an
output only appears under its final name once it is complete.
"""

import gc  # no longer used by the merge itself; kept in case other code needs it
import json
import os
from itertools import islice
from pathlib import Path


TOK_DIR = "/data/tokenized"
OUT_DIR = "/data/ready"
INDEX_PATH = "/data/index.json"

# Token counts are derived as ``bytes // 2``.
# NOTE(review): presumably tokens are stored as 2-byte ids -- confirm with the
# tokenizer that produced the shards.
BYTES_PER_TOKEN = 2

# Shards are streamed in chunks of this size instead of being read whole.
COPY_CHUNK_BYTES = 16 * 1024 * 1024


CRAWL_MAP = {
    "2025-05": "fw-00001-of-00006.bin",
    "2025-08": "fw-00002-of-00006.bin",
    "2025-13": "fw-00003-of-00006.bin",
    "2025-18": "fw-00004-of-00006.bin",
    "2025-21": "fw-00005-of-00006.bin",
    "2025-28": "fw-00006-of-00006.bin",
}


def _load_index(index_path):
    """Return the index previously written to *index_path*, or ``{}``."""
    try:
        loaded = json.loads(index_path.read_text())
    except (FileNotFoundError, ValueError):
        # Missing or unparsable index: entries are rebuilt from file sizes.
        return {}
    return loaded if isinstance(loaded, dict) else {}


def _merge_shards(shards, out_path):
    """Concatenate *shards* into *out_path* and return the total token count.

    The data is first written to ``<out_path>.part`` and moved into place with
    an atomic rename, so *out_path* never exists in a truncated state.  The
    source shards are deleted only after the rename succeeds; an interrupted
    run therefore loses nothing and is simply redone on the next invocation
    (at the cost of temporarily holding both copies of one crawl on disk).
    """
    tmp_path = out_path.with_name(out_path.name + ".part")
    total_tokens = 0

    # "wb" truncates any stale .part file left behind by an interrupted run.
    with open(tmp_path, "wb") as out_f:
        for shard in shards:
            shard_bytes = 0
            with open(shard, "rb") as in_f:
                while True:
                    chunk = in_f.read(COPY_CHUNK_BYTES)
                    if not chunk:
                        break
                    out_f.write(chunk)
                    shard_bytes += len(chunk)
            # Counted per shard (not on the grand total) so that rounding
            # matches a shard-by-shard ``len(data) // 2``.
            total_tokens += shard_bytes // BYTES_PER_TOKEN
        # Make sure the bytes are on disk before the shards are removed.
        out_f.flush()
        os.fsync(out_f.fileno())

    os.replace(tmp_path, out_path)

    for shard in shards:
        shard.unlink()
        print(f" ✓ {shard.name}")

    return total_tokens


def main(tok_dir=TOK_DIR, out_dir=OUT_DIR, crawl_map=None, index_path=INDEX_PATH):
    """Merge all crawls and write the index; return the index as a dict.

    Args:
        tok_dir: Directory containing the ``cc<crawl_id>_*.bin`` shards.
        out_dir: Directory receiving the merged files (created if missing).
        crawl_map: Mapping of crawl id to output file name; defaults to
            ``CRAWL_MAP``.
        index_path: Location of the JSON index to write.

    Returns:
        ``{out_name: {"crawl": crawl_id, "total_tokens": int}}`` for every
        output that exists after the run.  Crawls with no shards and no
        existing output are omitted.
    """
    tok_dir = Path(tok_dir)
    out_dir = Path(out_dir)
    index_path = Path(index_path)
    if crawl_map is None:
        crawl_map = CRAWL_MAP

    os.makedirs(out_dir, exist_ok=True)
    print("Starting merge...")
    # islice avoids listing the whole directory just to show five entries.
    print(f"TOK_DIR sample: {list(islice(tok_dir.iterdir(), 5))}")

    previous_index = _load_index(index_path)
    index = {}

    for crawl_id, out_name in crawl_map.items():
        out_path = out_dir / out_name
        if out_path.exists():
            print(f" SKIP {out_name}")
            # Keep already-merged outputs in the index; otherwise a re-run
            # would overwrite the index without them.
            index[out_name] = previous_index.get(out_name) or {
                "crawl": crawl_id,
                "total_tokens": out_path.stat().st_size // BYTES_PER_TOKEN,
            }
            continue

        # NOTE(review): lexicographic order -- assumes shard numbers in the
        # file names are zero-padded; confirm against the shard writer.
        shards = sorted(tok_dir.glob(f"cc{crawl_id}_*.bin"))
        print(f" {crawl_id}: {len(shards)} shards")
        if not shards:
            continue

        total_tokens = _merge_shards(shards, out_path)
        index[out_name] = {"crawl": crawl_id, "total_tokens": total_tokens}
        print(f" ✓ {out_name} | {total_tokens:,} tokens")

    index_path.write_text(json.dumps(index, indent=2))
    print("Done")
    return index


if __name__ == "__main__":
    main()