File size: 1,673 Bytes
511a4f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
Run the chunking pipeline on all processed documents.

OPTIMIZATION: Checks for existing chunks before loading the model,
so if everything is already chunked, we exit immediately
without loading the 110MB embedding model.
"""


import json
from pathlib import Path

from src.utils.logger import setup_logger, get_logger
from src.processing.chunker import ChunkingPipeline
from config.settings import PROCESSED_DIR, CHUNKS_DIR



# Logging is set up as an import-time side effect, before anything else in
# this module runs; `logger` is the module-level logger used by main().
# NOTE(review): setup_logger/get_logger come from src.utils.logger; what they
# configure (handlers, levels, files) is not visible here — confirm there.
setup_logger()
logger = get_logger(__name__)



def count_remaining(strategy: str) -> int:
    """Count how many processed papers still need chunking.

    A paper is every ``*.json`` file directly inside ``PROCESSED_DIR``. It
    counts as already chunked when ``CHUNKS_DIR`` contains a file named
    ``<paper_id>_<strategy>.json``, where ``paper_id`` is the processed
    file's stem.

    Args:
        strategy: Chunking strategy name used as the output filename suffix.

    Returns:
        The number of processed papers with no chunk file for ``strategy``.
        Returns 0 when there are no processed papers at all.
    """
    # Only file existence is checked; chunk file contents are never read here.
    return sum(
        1
        for processed_file in PROCESSED_DIR.glob("*.json")
        if not (CHUNKS_DIR / f"{processed_file.stem}_{strategy}.json").exists()
    )



def main():
    """Chunk every processed paper that has no chunk file yet.

    First counts the papers still missing a ``semantic`` chunk file. If none
    remain, logs a summary of the existing chunk files and returns without
    constructing the ChunkingPipeline. Otherwise runs the pipeline over
    ``PROCESSED_DIR`` and logs the stats it returns.
    """
    strategy = 'semantic'
    remaining = count_remaining(strategy)

    logger.info(f"Papers remaining to chunk: {remaining}")

    if remaining == 0:
        logger.info("All papers already chunked. Nothing to do.")

        # Print summary of existing chunks.
        # NOTE(review): len() counts top-level JSON entries; this assumes each
        # chunk file holds a list of chunks — confirm against the chunker.
        chunk_files = list(CHUNKS_DIR.glob(f"*_{strategy}.json"))
        total = 0
        for chunk_file in chunk_files:
            with open(chunk_file) as f:
                total += len(json.load(f))

        logger.info(f"Existing chunks: {total} across {len(chunk_files)} papers")

        # Stop here so the pipeline (and its model) is never constructed when
        # there is nothing left to chunk.
        return

    logger.info(f"Starting chunking pipeline for {remaining} papers...")
    pipeline = ChunkingPipeline(strategy=strategy)
    stats = pipeline.run(PROCESSED_DIR)
    logger.info(f"Done: {stats}")


# Run the pipeline only when this file is executed as a script; importing the
# module does not call main().
if __name__ == "__main__":
    main()