#!/usr/bin/env python3
"""
Demo: Hypothesis Generation (Phase 7).

This script demonstrates the REAL hypothesis generation pipeline:
1. REAL search: PubMed + ClinicalTrials + bioRxiv (actual API calls)
2. REAL embeddings: Semantic deduplication
3. REAL LLM: Mechanistic hypothesis generation

Usage:
    # Requires OPENAI_API_KEY or ANTHROPIC_API_KEY
    uv run python examples/hypothesis_demo/run_hypothesis.py "metformin Alzheimer's"
    uv run python examples/hypothesis_demo/run_hypothesis.py "sildenafil heart failure"
"""

import argparse
import asyncio
import os
import sys
from typing import Any

from src.agents.hypothesis_agent import HypothesisAgent
from src.services.embeddings import EmbeddingService
from src.tools.biorxiv import BioRxivTool
from src.tools.clinicaltrials import ClinicalTrialsTool
from src.tools.pubmed import PubMedTool
from src.tools.search_handler import SearchHandler


async def run_hypothesis_demo(query: str) -> None:
    """Run the REAL hypothesis generation pipeline."""
    try:
        print(f"\n{'='*60}")
        print("DeepCritical Hypothesis Agent Demo (Phase 7)")
        print(f"Query: {query}")
        print("Mode: REAL (Live API calls)")
        print(f"{'='*60}\n")

        # Step 1: REAL Search
        print("[Step 1] Searching PubMed + ClinicalTrials + bioRxiv...")
        search_handler = SearchHandler(
            tools=[PubMedTool(), ClinicalTrialsTool(), BioRxivTool()], timeout=30.0
        )
        result = await search_handler.execute(query, max_results_per_tool=5)

        print(f"  Found {result.total_found} results from {result.sources_searched}")
        if result.errors:
            print(f"  Warnings: {result.errors}")

        if not result.evidence:
            print("\nNo evidence found. Try a different query.")
            return

        # Step 2: REAL Embeddings - Deduplicate
        print("\n[Step 2] Semantic deduplication...")
        embedding_service = EmbeddingService()
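        # Assumption: evidence items whose embedding similarity exceeds the
        # threshold (0.85 here) are treated as duplicates and collapsed.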
        unique_evidence = await embedding_service.deduplicate(result.evidence, threshold=0.85)
        print(f"  {len(result.evidence)} -> {len(unique_evidence)} unique papers")

        # Show what we found
        print("\n[Evidence collected]")
        max_title_len = 50
        for i, e in enumerate(unique_evidence[:5], 1):
            raw_title = e.citation.title
            if len(raw_title) > max_title_len:
                title = raw_title[:max_title_len] + "..."
            else:
                title = raw_title
            print(f"  {i}. [{e.citation.source.upper()}] {title}")

        # Step 3: REAL LLM - Generate hypotheses
        print("\n[Step 3] Generating mechanistic hypotheses (LLM)...")
        evidence_store: dict[str, Any] = {"current": unique_evidence, "hypotheses": []}
        agent = HypothesisAgent(evidence_store, embedding_service)

        print("-" * 60)
        response = await agent.run(query)
        print(response.messages[0].text)
        print("-" * 60)

        # Show stored hypotheses
        hypotheses = evidence_store.get("hypotheses", [])
        print(f"\n{len(hypotheses)} hypotheses stored")

        if hypotheses:
            print("\nGenerated search queries for further investigation:")
            for h in hypotheses:
                queries = h.to_search_queries()
                print(f"  {h.drug} -> {h.target}:")
                for q in queries[:3]:
                    print(f"    - {q}")

    except Exception as e:
        print(f"\n❌ Error during hypothesis generation: {e}")
        raise


async def main() -> None:
    """Entry point."""
    parser = argparse.ArgumentParser(
        description="Hypothesis Generation Demo (REAL - No Mocks)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    uv run python examples/hypothesis_demo/run_hypothesis.py "metformin Alzheimer's"
    uv run python examples/hypothesis_demo/run_hypothesis.py "sildenafil heart failure"
    uv run python examples/hypothesis_demo/run_hypothesis.py "aspirin cancer prevention"
        """,
    )
    parser.add_argument(
        "query",
        nargs="?",
        default="metformin Alzheimer's disease",
        help="Research query",
    )
    args = parser.parse_args()

    # Fail fast: require API key
    if not (os.getenv("OPENAI_API_KEY") or os.getenv("ANTHROPIC_API_KEY")):
        print("=" * 60)
        print("ERROR: This demo requires a real LLM.")
        print()
        print("Set one of the following in your .env file:")
        print("  OPENAI_API_KEY=sk-...")
        print("  ANTHROPIC_API_KEY=sk-ant-...")
        print()
        print("This is a REAL demo, not a mock. No fake data.")
        print("=" * 60)
        sys.exit(1)

    await run_hypothesis_demo(args.query)

    print("\n" + "=" * 60)
    print("Demo complete! This was a REAL pipeline:")
    print("  1. REAL search: PubMed + ClinicalTrials + bioRxiv APIs")
    print("  2. REAL embeddings: Actual sentence-transformers")
    print("  3. REAL LLM: Actual hypothesis generation")
    print("=" * 60 + "\n")


if __name__ == "__main__":
    asyncio.run(main())