Tok6 / app.py
Neon-tech's picture
Update app.py
6045381 verified
import gc
import json
import os
import shutil
from pathlib import Path
# Directory that is scanned for the input shard files.
TOK_DIR = "/data/tokenized"
# Directory that receives one merged output file per crawl.
OUT_DIR = "/data/ready"

# Crawl id -> name of the merged output file for that crawl.  Output files
# are numbered 1..6 in the order the crawl ids are listed here.
CRAWL_MAP = {
    crawl_id: f"fw-{position:05d}-of-00006.bin"
    for position, crawl_id in enumerate(
        ("2025-05", "2025-08", "2025-13", "2025-18", "2025-21", "2025-28"),
        start=1,
    )
}
# Merge the shards of every crawl in CRAWL_MAP into one file under OUT_DIR and
# write the per-file token counts to /data/index.json.
#
# The script is written to be re-run safely (e.g. after a crash or restart):
#   * a crawl's data is accumulated in "<output>.part" and only renamed to its
#     final name once every shard has been appended, so a half-written file is
#     never mistaken for a finished one;
#   * shards are still deleted one by one right after they are merged (keeping
#     peak disk usage low), and an interrupted merge resumes by appending the
#     remaining shards to the existing ".part" file;
#   * index entries of outputs that are skipped because they already exist are
#     carried over instead of being dropped from index.json.

_INDEX_PATH = Path("/data/index.json")
# Token counts are derived as "bytes // 2", i.e. every token is assumed to
# occupy two bytes (presumably uint16 token ids -- confirm with the tokenizer).
_BYTES_PER_TOKEN = 2
# Shards are streamed in chunks of this size instead of being read whole.
_COPY_CHUNK_BYTES = 16 * 1024 * 1024


def _load_previous_index(index_path):
    """Return the index written by an earlier run, or ``{}`` if unusable.

    A missing, unreadable or malformed index file is not an error: the caller
    falls back to deriving token counts from file sizes.
    """
    try:
        loaded = json.loads(index_path.read_text())
    except (OSError, ValueError):  # ValueError covers JSONDecodeError
        return {}
    return loaded if isinstance(loaded, dict) else {}


def _append_shard(shard, out_f):
    """Stream *shard* onto the end of *out_f*; return the number of bytes added.

    The output is flushed and fsync'ed before returning, so the caller may
    delete the shard afterwards.  If the copy fails part-way, the output is
    truncated back to its previous length so that a retry does not leave a
    duplicated partial shard behind.
    """
    start = out_f.tell()
    try:
        with shard.open("rb") as src:
            shutil.copyfileobj(src, out_f, _COPY_CHUNK_BYTES)
        out_f.flush()
        os.fsync(out_f.fileno())
    except BaseException:
        # Also roll back on KeyboardInterrupt/SystemExit; always re-raised.
        out_f.truncate(start)
        raise
    return out_f.tell() - start


def _merge_crawl(shards, part_path, out_path):
    """Append *shards* to *part_path*, then rename it to *out_path*.

    Returns the token count of the merged file.  *part_path* is opened in
    append mode, so data left there by an interrupted run is kept and counted.

    NOTE: a hard kill (SIGKILL, power loss) in the middle of a shard cannot be
    rolled back here; the next run would append that shard again in full.
    """
    with open(part_path, "ab") as out_f:
        # Non-zero only when resuming a ".part" file from an interrupted run.
        total_tokens = out_f.tell() // _BYTES_PER_TOKEN
        for shard in shards:
            total_tokens += _append_shard(shard, out_f) // _BYTES_PER_TOKEN
            # Safe to delete: the shard's bytes were synced to disk above.
            shard.unlink()
            print(f" ✓ {shard.name}")
    # Atomic on the same filesystem: the final name only ever refers to a
    # completely merged file.
    os.replace(part_path, out_path)
    return total_tokens


os.makedirs(OUT_DIR, exist_ok=True)
print("Starting merge...")
print(f"TOK_DIR sample: {list(Path(TOK_DIR).iterdir())[:5]}")

previous_index = _load_previous_index(_INDEX_PATH)
index = {}
for crawl_id, out_name in CRAWL_MAP.items():
    out_path = Path(OUT_DIR) / out_name
    if out_path.exists():
        print(f" SKIP {out_name}")
        # Keep the entry from the previous run; if there is none, derive the
        # count from the file size so index.json still lists this file.
        index[out_name] = previous_index.get(out_name) or {
            "crawl": crawl_id,
            "total_tokens": out_path.stat().st_size // _BYTES_PER_TOKEN,
        }
        continue

    shards = sorted(Path(TOK_DIR).glob(f"cc{crawl_id}_*.bin"))
    print(f" {crawl_id}: {len(shards)} shards")
    part_path = out_path.with_name(out_path.name + ".part")
    # A non-empty ".part" with no shards left means the previous run was
    # interrupted after the last shard but before the final rename.
    has_partial = part_path.exists() and part_path.stat().st_size > 0
    if not shards and not has_partial:
        continue

    total_tokens = _merge_crawl(shards, part_path, out_path)
    index[out_name] = {"crawl": crawl_id, "total_tokens": total_tokens}
    print(f" ✓ {out_name} | {total_tokens:,} tokens")

_INDEX_PATH.write_text(json.dumps(index, indent=2))
print("Done")