# Hugging Face Spaces status banner captured with the page ("Runtime error"),
# kept here as a comment so it does not read as stray code.
| # Complete Medical Literature Health Dataset Generator with Gradio Interface | |
| # | |
| # This creates a web-based interface for generating synthetic health optimization datasets | |
| # ===================================================================== | |
| # STEP 1: INSTALLATIONS AND IMPORTS | |
| # ===================================================================== | |
| # Install required packages | |
| import subprocess | |
| import sys | |
# Pip distribution name -> importable module name. Needed because some
# distributions install under a different module name (e.g. python-dotenv
# imports as `dotenv`); without this mapping the import check below always
# failed for those packages and pip was re-invoked on every run.
PIP_TO_MODULE = {
    'python-dotenv': 'dotenv',
}

def install_packages():
    """Install required third-party packages if they are not importable.

    Tries to import each dependency first (using its real module name) and
    only shells out to pip for the ones that are missing, so repeated runs
    are cheap.
    """
    packages = ['openai', 'gradio', 'python-dotenv', 'requests', 'pandas']
    for package in packages:
        module_name = PIP_TO_MODULE.get(package, package)
        try:
            __import__(module_name)
        except ImportError:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])
# Run installation eagerly at import time so the third-party imports below
# succeed even in a fresh environment (pip is only invoked for packages that
# fail to import).
install_packages()
| # Import libraries | |
| import gradio as gr | |
| import json | |
| import random | |
| import re | |
| import time | |
| import os | |
| import io | |
| import zipfile | |
| from datetime import datetime | |
| from typing import Dict, List, Any, Optional, Tuple | |
| from openai import OpenAI | |
| import pandas as pd | |
| # ===================================================================== | |
| # STEP 2: CORE CLASSES (Same as before but with progress callbacks) | |
| # ===================================================================== | |
class MedicalLiteratureSimulator:
    """Simulates medical literature research for health dataset generation.

    Keeps per-domain pools of interventions, biomarkers and outcomes, and
    assembles synthetic study records (PMID, title, abstract, design facts)
    by sampling from those pools.
    """

    def __init__(self):
        # Research-domain catalogue: each entry supplies the vocabulary used
        # when fabricating a study for that domain.
        self.research_domains = {
            "longevity": {
                "interventions": ["NAD+ supplementation", "resveratrol", "metformin", "caloric restriction"],
                "biomarkers": ["telomere length", "cellular senescence", "inflammatory markers", "mitochondrial function"],
                "outcomes": ["biological age reduction", "improved healthspan", "enhanced cellular repair"]
            },
            "metabolic_health": {
                "interventions": ["berberine", "intermittent fasting", "alpha-lipoic acid", "chromium"],
                "biomarkers": ["glucose levels", "insulin sensitivity", "HbA1c", "HOMA-IR"],
                "outcomes": ["improved glucose control", "enhanced insulin sensitivity", "reduced inflammation"]
            },
            "cardiovascular": {
                "interventions": ["omega-3 fatty acids", "coenzyme Q10", "magnesium", "nattokinase"],
                "biomarkers": ["blood pressure", "cholesterol levels", "CRP", "endothelial function"],
                "outcomes": ["reduced blood pressure", "improved lipid profile", "decreased inflammation"]
            },
            "cognitive": {
                "interventions": ["lion's mane mushroom", "phosphatidylserine", "bacopa monnieri", "acetyl-L-carnitine"],
                "biomarkers": ["cognitive performance", "BDNF levels", "neuroinflammation", "memory function"],
                "outcomes": ["enhanced memory", "improved cognitive function", "neuroprotection"]
            },
            "hormonal": {
                "interventions": ["ashwagandha", "vitamin D", "DHEA", "maca root"],
                "biomarkers": ["cortisol levels", "thyroid hormones", "sex hormones", "stress markers"],
                "outcomes": ["hormone balance", "improved energy", "better sleep quality"]
            },
            "inflammation": {
                "interventions": ["curcumin", "omega-3", "quercetin", "boswellia"],
                "biomarkers": ["CRP", "IL-6", "TNF-alpha", "oxidative stress"],
                "outcomes": ["reduced inflammation", "improved immune function", "enhanced recovery"]
            }
        }

    def generate_study_data(self, domain: str) -> Dict[str, Any]:
        """Generate a realistic synthetic study record for *domain*.

        Unknown domains silently fall back to "longevity".
        """
        if domain not in self.research_domains:
            domain = "longevity"
        pools = self.research_domains[domain]
        record = {
            "pmid": f"PMID{random.randint(35000000, 40000000)}",
            "title": self._generate_study_title(domain, pools),
            "abstract": self._generate_study_abstract(domain, pools),
            "journal": random.choice([
                "Nature Medicine", "Cell Metabolism", "Journal of Clinical Medicine",
                "Circulation", "Aging Cell", "Nutrients", "Clinical Nutrition"
            ]),
            "year": random.choice([2023, 2024]),
            "domain": domain,
            "interventions": random.sample(pools["interventions"], min(2, len(pools["interventions"]))),
            "biomarkers": random.sample(pools["biomarkers"], min(3, len(pools["biomarkers"]))),
            "outcomes": random.sample(pools["outcomes"], min(2, len(pools["outcomes"]))),
            "participant_count": random.randint(50, 300),
            "duration_weeks": random.choice([8, 12, 16, 24]),
            # Dosages are keyed off the domain's first (primary) intervention.
            "dosages": self._generate_dosages(pools["interventions"][0]),
        }
        return record

    def _generate_study_title(self, domain: str, pools: Dict) -> str:
        """Pick a random intervention/outcome pair and drop it into a template."""
        intervention = random.choice(pools["interventions"])
        outcome = random.choice(pools["outcomes"])
        template = random.choice([
            "Effects of {i} on {o}: A randomized controlled trial",
            "{i} supplementation improves {o} in healthy adults",
            "Clinical evaluation of {i} for {o} optimization",
            "Randomized trial of {i} in {o} enhancement",
        ])
        return template.format(i=intervention, o=outcome)

    def _generate_study_abstract(self, domain: str, pools: Dict) -> str:
        """Compose a structured abstract around the domain's primary intervention."""
        intervention = pools["interventions"][0]
        biomarker = random.choice(pools["biomarkers"])
        outcome = random.choice(pools["outcomes"])
        n_participants = random.randint(120, 250)
        n_weeks = random.randint(12, 24)
        pct_change = random.randint(15, 35)
        return f"""
Background: {intervention} has shown promise in preliminary studies for health optimization.
Objective: To evaluate the effects of {intervention} supplementation on {biomarker} and related health outcomes.
Methods: Randomized, double-blind, placebo-controlled trial with {n_participants} participants aged 40-65 years.
Subjects received {intervention} or placebo for {n_weeks} weeks.
Results: {intervention} supplementation significantly improved {outcome} compared to placebo (p<0.05).
{biomarker.capitalize()} showed {pct_change}% improvement from baseline.
Secondary outcomes included improved quality of life and no serious adverse events.
Conclusions: {intervention} supplementation provides significant benefits for {outcome} with excellent safety profile.
""".strip()

    def _generate_dosages(self, intervention: str) -> List[str]:
        """Return up to two plausible dosages for *intervention*.

        Substring-matches against a small lookup table; unknown compounds get
        a generic ["500mg", "1000mg"] pair.
        """
        dosage_options = {
            "NAD+": ["250mg", "500mg", "1000mg"],
            "resveratrol": ["100mg", "250mg", "500mg"],
            "berberine": ["500mg", "1000mg", "1500mg"],
            "omega-3": ["1000mg", "2000mg", "3000mg"],
            "magnesium": ["200mg", "400mg", "600mg"],
            "curcumin": ["500mg", "1000mg", "1500mg"]
        }
        matched = next(
            (opts for name, opts in dosage_options.items()
             if name.lower() in intervention.lower()),
            None,
        )
        if matched is not None:
            return random.sample(matched, min(2, len(matched)))
        return ["500mg", "1000mg"]
class HealthProfileGenerator:
    """Generates realistic health profiles based on medical studies.

    Each profile bundles synthetic lab panels, microbiome / epigenetic /
    wearable / CGM summaries, and a first-person user query, all skewed by a
    severity tier.
    """

    def __init__(self):
        # Severity tiers: the multiplier pushes lab values away from a healthy
        # baseline (risk markers up, protective markers down).
        self.severity_levels = {
            "optimal": {"multiplier": 1.0, "description": "excellent baseline health with optimization focus"},
            "mild": {"multiplier": 1.2, "description": "minor health concerns with good overall function"},
            "moderate": {"multiplier": 1.5, "description": "noticeable health issues requiring intervention"},
            "severe": {"multiplier": 2.0, "description": "significant health challenges needing intensive protocols"}
        }

    def generate_profile_from_study(self, study: Dict[str, Any], severity: str = "moderate") -> Dict[str, Any]:
        """Generate a complete health profile from *study* at *severity*.

        Unknown severities fall back to the "moderate" multiplier.
        """
        domain = study.get("domain", "longevity")
        tier = self.severity_levels.get(severity, self.severity_levels["moderate"])
        multiplier = tier["multiplier"]
        age = random.randint(35, 65)
        gender = random.choice(["male", "female"])
        return {
            "user_tests_result_data": {
                "Labs": self._generate_lab_values(domain, multiplier),
                "gut_microbiome": self._generate_gut_microbiome(severity),
                "epigenetics": self._generate_epigenetics(severity),
                "wearables": self._generate_wearables(severity),
                "cgm": self._generate_cgm(severity)
            },
            "user_query": self._generate_user_query(study, age, gender, severity),
            "source_study": {
                "pmid": study.get("pmid"),
                "domain": domain,
                "severity": severity,
                "title": study.get("title")
            }
        }

    def _generate_lab_values(self, domain: str, multiplier: float) -> Dict[str, Any]:
        """Generate realistic lab panels scaled by the severity multiplier."""
        # Risk markers scale up with the multiplier; protective markers
        # (HDL, vitamin D, omega-3 index) scale down.
        blood_tests = {
            "systolic_bp": int(random.randint(120, 140) * multiplier),
            "diastolic_bp": int(random.randint(70, 90) * multiplier),
            "total_cholesterol": int(random.randint(180, 220) * multiplier),
            "ldl": int(random.randint(100, 140) * multiplier),
            "hdl": int(random.randint(40, 60) / multiplier),
            "triglycerides": int(random.randint(80, 150) * multiplier),
            "apoB": int(random.randint(70, 110) * multiplier),
            "lp_a": random.randint(10, 50)
        }
        inflammatory = {
            "hscrp": round(random.uniform(1.0, 4.0) * multiplier, 1),
            "esr": int(random.randint(5, 25) * multiplier),
            "il6": round(random.uniform(1.0, 5.0) * multiplier, 1),
            "tnf_alpha": round(random.uniform(1.0, 3.0) * multiplier, 1),
            "oxidative_stress_markers": "elevated" if multiplier > 1.3 else "normal",
            "homocysteine": round(random.uniform(8, 15) * multiplier, 1)
        }
        nutritional = {
            "vitamin_d": int(random.randint(25, 50) / multiplier),
            "b12": random.randint(250, 400),
            "folate": round(random.uniform(6, 14), 1),
            "iron": random.randint(60, 120),
            "ferritin": random.randint(30, 100),
            "selenium": random.randint(80, 120),
            "zinc": random.randint(70, 110),
            "magnesium": round(random.uniform(1.5, 2.2), 1),
            "omega3_index": round(random.uniform(4, 8) / multiplier, 1)
        }
        labs = {
            "blood_tests": blood_tests,
            "inflammatory": inflammatory,
            "nutritional": nutritional,
        }
        # Only metabolic-health profiles carry the extra glucose panel.
        if domain == "metabolic_health":
            labs["metabolic"] = {
                "fasting_glucose": int(random.randint(85, 110) * multiplier),
                "hba1c": round(random.uniform(5.2, 6.0) * min(multiplier, 1.4), 1),
                "insulin_fasting": round(random.uniform(5, 15) * multiplier, 1),
                "homa_ir": round(random.uniform(1.5, 4.0) * multiplier, 1)
            }
        return labs

    def _generate_gut_microbiome(self, severity: str) -> str:
        """Summarize gut diversity as a one-line report for *severity*."""
        score_bands = {
            "optimal": (8.5, 9.5),
            "mild": (7.0, 8.5),
            "moderate": (5.5, 7.0),
            "severe": (3.5, 5.5),
        }
        score = random.uniform(*score_bands[severity]) if severity in score_bands else 6.5
        narratives = {
            "optimal": "excellent diversity with optimal bacterial balance",
            "mild": "good diversity with minor imbalances",
            "moderate": "moderate dysbiosis with reduced beneficial bacteria",
            "severe": "significant dysbiosis with pathogenic overgrowth"
        }
        desc = narratives.get(severity, "moderate dysbiosis")
        return f"Diversity score {score:.1f}/10, {desc}, beneficial bacteria {random.randint(60, 90)}%"

    def _generate_epigenetics(self, severity: str) -> str:
        """Summarize biological-age markers as a one-line report."""
        acceleration_bands = {
            "optimal": (-2, 1),
            "mild": (1, 3),
            "moderate": (3, 6),
            "severe": (6, 12),
        }
        acceleration = (random.randint(*acceleration_bands[severity])
                        if severity in acceleration_bands else 4)
        # Faster aging pushes the telomere percentile down, floored at 10.
        telomere_percentile = max(10, random.randint(30, 80) - acceleration * 5)
        pace = round(random.uniform(0.9, 1.4), 2)
        return f"Biological age acceleration: {acceleration} years, telomere length: {telomere_percentile}th percentile, DunedinPACE: {pace}"

    def _generate_wearables(self, severity: str) -> Dict[str, int]:
        """Generate wearable-device metrics consistent with *severity*."""
        band_table = {
            "optimal": {"hrv": (55, 75), "rhr": (45, 60), "sleep": (85, 95)},
            "mild": {"hrv": (45, 65), "rhr": (55, 70), "sleep": (75, 85)},
            "moderate": {"hrv": (30, 50), "rhr": (65, 80), "sleep": (60, 75)},
            "severe": {"hrv": (20, 35), "rhr": (75, 95), "sleep": (45, 65)}
        }
        bands = band_table.get(severity, band_table["moderate"])
        sleep_lo, sleep_hi = bands["sleep"]
        return {
            "hrv_avg": random.randint(*bands["hrv"]),
            "rhr": random.randint(*bands["rhr"]),
            "sleep_score": random.randint(sleep_lo, sleep_hi),
            # Recovery tracks sleep; stress runs inverse to it.
            "recovery_score": random.randint(sleep_lo - 10, sleep_hi - 5),
            "stress_score": random.randint(100 - sleep_hi, 100 - sleep_lo + 20),
            "vo2_max": random.randint(25, 50),
            "fitness_age": random.randint(30, 65)
        }

    def _generate_cgm(self, severity: str) -> str:
        """Summarize continuous-glucose-monitor stats as a one-line report."""
        # (avg_lo, avg_hi, time-in-range_lo, time-in-range_hi) per tier.
        cgm_bands = {
            "optimal": (80, 95, 92, 98),
            "mild": (85, 105, 85, 95),
            "moderate": (95, 120, 70, 85),
            "severe": (110, 140, 55, 75)
        }
        avg_lo, avg_hi, tir_lo, tir_hi = cgm_bands.get(severity, cgm_bands["moderate"])
        return f"Average glucose {random.randint(avg_lo, avg_hi)} mg/dL, time in range {random.randint(tir_lo, tir_hi)}%"

    def _generate_user_query(self, study: Dict[str, Any], age: int, gender: str, severity: str) -> str:
        """Compose the first-person request matching the study's domain."""
        query_templates = {
            "longevity": "I'm a {age}-year-old {gender} interested in longevity optimization and anti-aging protocols",
            "metabolic_health": "I'm a {age}-year-old {gender} with metabolic dysfunction seeking evidence-based glucose control",
            "cardiovascular": "I'm a {age}-year-old {gender} with cardiovascular risk factors wanting heart health optimization",
            "cognitive": "I'm a {age}-year-old {gender} seeking cognitive enhancement and brain health optimization",
            "hormonal": "I'm a {age}-year-old {gender} with hormonal imbalances needing optimization protocols",
            "inflammation": "I'm a {age}-year-old {gender} with chronic inflammation seeking anti-inflammatory interventions"
        }
        domain = study.get("domain", "longevity")
        opening = query_templates.get(domain, query_templates["longevity"]).format(age=age, gender=gender)
        severity_context = {
            "optimal": "I have excellent baseline health but want to push the boundaries of optimization",
            "mild": "I have minor health concerns and want targeted interventions",
            "moderate": "I have noticeable health issues and need comprehensive protocols",
            "severe": "I have significant health challenges and require intensive interventions"
        }
        context = severity_context.get(severity, "")
        return f"{opening}. {context}."
class AIProtocolGenerator:
    """Uses OpenAI to generate health optimization protocols.

    Tracks an estimated running cost in ``self.total_cost`` based on token
    usage reported by the API.
    """

    def __init__(self, api_key: str, model: str = "gpt-4"):
        """Create an OpenAI client bound to *model*; cost starts at zero."""
        self.client = OpenAI(api_key=api_key)
        self.model = model
        self.total_cost = 0.0

    def generate_protocol(self, health_profile: Dict[str, Any], study_context: Dict[str, Any], progress_callback=None) -> Optional[str]:
        """Generate a comprehensive health optimization protocol.

        Returns the protocol text, or None on any failure (the error is
        reported through *progress_callback* when one is provided).
        """
        def notify(text):
            # Progress reporting is optional and best-effort.
            if progress_callback:
                progress_callback(text)

        messages = [
            {"role": "system", "content": self._create_system_prompt(study_context)},
            {"role": "user", "content": self._create_user_prompt(health_profile, study_context)},
        ]
        try:
            notify(f"π Generating protocol using {self.model}...")
            response = self.client.chat.completions.create(
                model=self.model,
                messages=messages,
                max_tokens=4000,
                temperature=0.7,
                top_p=0.9
            )
            self._update_cost(response.usage)
            notify(f"β Protocol generated ({response.usage.total_tokens} tokens)")
            return response.choices[0].message.content
        except Exception as e:
            notify(f"β Error generating protocol: {e}")
            return None

    def _create_system_prompt(self, study_context: Dict[str, Any]) -> str:
        """Build the system message describing research context and output format."""
        domain = study_context.get("domain", "health")
        interventions = ", ".join(study_context.get("interventions", []))
        return f"""You are an advanced AI health optimization system specializing in evidence-based medicine and personalized protocols.
RESEARCH CONTEXT:
- Domain: {domain} optimization
- Key Interventions: {interventions}
- Evidence Level: Peer-reviewed clinical research
PROTOCOL REQUIREMENTS:
1. Executive Summary with current health assessment
2. Multi-Phase Protocol:
- Phase 1: Foundation (0-3 months)
- Phase 2: Optimization (3-6 months)
- Phase 3: Advanced Enhancement (6-12 months)
3. Specific supplement protocols with dosages and timing
4. Lifestyle interventions (exercise, nutrition, sleep)
5. Monitoring and assessment plans
6. Expected outcomes with realistic timelines
STYLE: Professional, authoritative, using Medicine 3.0 terminology. Reference biological age, biomarkers, and cellular health.
SAFETY: Keep dosages within evidence-based safe ranges. Include monitoring recommendations.
Generate comprehensive protocols (3000+ words) with actionable precision medicine recommendations."""

    def _create_user_prompt(self, health_profile: Dict[str, Any], study_context: Dict[str, Any]) -> str:
        """Build the user message embedding the full profile JSON."""
        return f"""
COMPREHENSIVE HEALTH OPTIMIZATION REQUEST:
Health Profile Analysis:
{json.dumps(health_profile, indent=2)}
Research Context:
- Study: {study_context.get('title', 'Health Optimization Study')}
- Domain: {study_context.get('domain', 'general health')}
- Key Findings: Based on clinical research showing significant improvements in health biomarkers
Please analyze this health profile and generate a detailed, personalized optimization protocol. Address the specific biomarker patterns, deficiencies, and health challenges identified in the data. Provide targeted interventions with precise dosing, timing, and monitoring protocols.
"""

    def _update_cost(self, usage):
        """Accumulate estimated USD cost from a response's token usage."""
        # Per-1K-token prices; unknown models are billed at gpt-4 rates.
        rate_table = {
            "gpt-3.5-turbo": {"input": 0.0015, "output": 0.002},
            "gpt-4": {"input": 0.03, "output": 0.06},
            "gpt-4-turbo": {"input": 0.01, "output": 0.03}
        }
        rates = rate_table.get(self.model, rate_table["gpt-4"])
        self.total_cost += (usage.prompt_tokens * rates["input"] / 1000
                            + usage.completion_tokens * rates["output"] / 1000)
class HealthDatasetGenerator:
    """Complete system that orchestrates the entire dataset generation process.

    Pipeline per example: simulate a study (MedicalLiteratureSimulator) ->
    derive a synthetic health profile at a random severity
    (HealthProfileGenerator) -> request a matching protocol from the OpenAI
    API (AIProtocolGenerator). Results accumulate in
    ``self.generated_examples`` and can be packaged via ``export_dataset``.
    """

    def __init__(self, api_key: str, model: str = "gpt-4"):
        """Wire up the three pipeline stages; *api_key* is the OpenAI key."""
        self.literature_sim = MedicalLiteratureSimulator()
        self.profile_gen = HealthProfileGenerator()
        self.protocol_gen = AIProtocolGenerator(api_key, model)
        self.generated_examples = []

    def generate_dataset(self,
                         domains: List[str] = None,
                         examples_per_domain: int = 2,
                         rate_limit_delay: float = 2.0,
                         progress_callback=None) -> Tuple[List[Dict[str, Any]], str]:
        """Generate complete health optimization dataset with progress updates.

        Args:
            domains: research domains to cover; defaults to four core domains.
            examples_per_domain: examples per domain (severity is drawn
                randomly from mild/moderate/severe for each).
            rate_limit_delay: sleep between API calls, in seconds.
            progress_callback: optional fn(str) receiving progress lines.

        Returns:
            (examples, human-readable summary). Examples are also stored on
            ``self.generated_examples`` for later export.
        """
        if domains is None:
            domains = ["longevity", "metabolic_health", "cardiovascular", "cognitive"]
        if progress_callback:
            progress_callback(f"π Starting Health Dataset Generation")
            progress_callback(f"Domains: {domains}")
            progress_callback(f"Examples per domain: {examples_per_domain}")
            progress_callback(f"Total examples to generate: {len(domains) * examples_per_domain}")
        examples = []
        total_examples = len(domains) * examples_per_domain
        current_example = 0
        for domain in domains:
            if progress_callback:
                progress_callback(f"\nπ Processing domain: {domain}")
            for i in range(examples_per_domain):
                current_example += 1
                try:
                    if progress_callback:
                        progress_callback(f" Creating example {i+1}/{examples_per_domain} (Overall: {current_example}/{total_examples})")
                    # Stage 1: synthesize a study for this domain.
                    study = self.literature_sim.generate_study_data(domain)
                    if progress_callback:
                        progress_callback(f" π Generated study: {study['title'][:50]}...")
                    # Stage 2: derive a health profile at a random severity.
                    severity = random.choice(["mild", "moderate", "severe"])
                    health_profile = self.profile_gen.generate_profile_from_study(study, severity)
                    if progress_callback:
                        progress_callback(f" π€ Created {severity} health profile")
                    # Stage 3: request a protocol (returns None on API failure,
                    # in which case the example is silently dropped).
                    protocol = self.protocol_gen.generate_protocol(health_profile, study, progress_callback)
                    if protocol:
                        training_example = {
                            "user_context": health_profile,
                            "response": protocol,
                            "citations": self._generate_citations(study),
                            "metadata": {
                                "domain": domain,
                                "severity": severity,
                                "study_pmid": study["pmid"],
                                "generated_at": datetime.now().isoformat()
                            }
                        }
                        examples.append(training_example)
                        if progress_callback:
                            progress_callback(f" β Complete example generated")
                    # Rate limiting between successive calls within a domain.
                    if i < examples_per_domain - 1:
                        if progress_callback:
                            progress_callback(f" β³ Rate limit delay: {rate_limit_delay}s")
                        time.sleep(rate_limit_delay)
                except Exception as e:
                    # Best-effort: a failed example is reported and skipped.
                    if progress_callback:
                        progress_callback(f" β Error generating example: {e}")
                    continue
        if progress_callback:
            progress_callback(f"\nπ Dataset generation complete!")
            progress_callback(f"Generated: {len(examples)} examples")
            progress_callback(f"Total cost: ${self.protocol_gen.total_cost:.4f}")
        self.generated_examples = examples
        return examples, f"Generated {len(examples)} examples. Total cost: ${self.protocol_gen.total_cost:.4f}"

    def _generate_citations(self, study: Dict[str, Any]) -> Dict[str, List[str]]:
        """Build the tiered citation dict anchored on the study's PMID."""
        return {
            "tier_1_peer_reviewed": [study["pmid"], f"PMC{random.randint(1000000, 9999999)}"],
            "tier_2_rct": [f"{study['domain'].upper()}.2024.{random.randint(100000, 999999)}"],
            "tier_3_cohort": [f"HEALTH.2023.{random.randint(100000, 999999)}"],
            "real_world_cases": ["Evidence-based health optimization protocols"]
        }

    def export_dataset(self, filename: str = None) -> Tuple[str, List[str]]:
        """Export dataset and return (zip file path, list of member names).

        Writes four artifacts — raw JSON, OpenAI fine-tuning JSONL, a small
        sample file, and run metadata — into ``<filename>.zip``.

        Bug fix: the ``filename`` base computed below was previously ignored
        and a literal placeholder was written for every artifact name, so
        every export collided on identical file names. All names now derive
        from ``filename``.
        """
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"health_dataset_{timestamp}"
        # Build all artifacts in memory as (name, content) pairs.
        files_created = []
        # Raw dataset
        raw_data = json.dumps(self.generated_examples, indent=2, ensure_ascii=False)
        files_created.append((f"{filename}.json", raw_data))
        # Fine-tuning format: one chat-formatted JSON object per line.
        fine_tune_lines = []
        for example in self.generated_examples:
            fine_tune_example = {
                "messages": [
                    {
                        "role": "system",
                        "content": "You are an advanced AI health optimization system that creates evidence-based protocols."
                    },
                    {
                        "role": "user",
                        "content": f"Create a health optimization protocol for this profile:\n\n{json.dumps(example['user_context'], indent=2)}"
                    },
                    {
                        "role": "assistant",
                        "content": example["response"]
                    }
                ]
            }
            fine_tune_lines.append(json.dumps(fine_tune_example, ensure_ascii=False))
        fine_tune_data = '\n'.join(fine_tune_lines)
        files_created.append((f"{filename}_fine_tuning.jsonl", fine_tune_data))
        # Sample examples (first few records for quick inspection).
        sample_size = min(3, len(self.generated_examples))
        sample_data = json.dumps(self.generated_examples[:sample_size], indent=2, ensure_ascii=False)
        files_created.append((f"{filename}_samples.json", sample_data))
        # Metadata describing this generation run.
        metadata = {
            "generation_info": {
                "generated_at": datetime.now().isoformat(),
                "total_examples": len(self.generated_examples),
                "total_cost": self.protocol_gen.total_cost,
                "model_used": self.protocol_gen.model
            },
            # sorted() keeps the listing deterministic across runs.
            "domains_covered": sorted(set(ex["metadata"]["domain"] for ex in self.generated_examples)),
            "severity_distribution": {
                severity: sum(1 for ex in self.generated_examples if ex["metadata"]["severity"] == severity)
                for severity in ["mild", "moderate", "severe"]
            }
        }
        metadata_data = json.dumps(metadata, indent=2, ensure_ascii=False)
        files_created.append((f"{filename}_metadata.json", metadata_data))
        # Zip everything and flush to disk.
        zip_buffer = io.BytesIO()
        with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file:
            for file_name, file_content in files_created:
                zip_file.writestr(file_name, file_content)
        zip_filename = f"{filename}.zip"
        with open(zip_filename, 'wb') as f:
            f.write(zip_buffer.getvalue())
        file_list = [f[0] for f in files_created]
        return zip_filename, file_list
| # ===================================================================== | |
| # STEP 3: GRADIO INTERFACE | |
| # ===================================================================== | |
| class HealthDatasetGradioInterface: | |
| """Gradio web interface for the health dataset generator""" | |
| def __init__(self): | |
| self.generator = None | |
| self.available_domains = list(MedicalLiteratureSimulator().research_domains.keys()) | |
| def estimate_cost(self, domains, examples_per_domain, model): | |
| """Estimate generation cost""" | |
| if not domains: | |
| return "Please select at least one domain" | |
| total_examples = len(domains) * examples_per_domain | |
| cost_per_example = { | |
| "gpt-3.5-turbo": 0.05, | |
| "gpt-4": 0.25, | |
| "gpt-4-turbo": 0.15 | |
| } | |
| estimated_cost = total_examples * cost_per_example.get(model, 0.25) | |
| return f"π° Estimated cost: ${estimated_cost:.2f} for {total_examples} examples" | |
| def validate_inputs(self, api_key, domains, examples_per_domain): | |
| """Validate user inputs""" | |
| if not api_key or not api_key.strip(): | |
| return False, "β Please provide your OpenAI API key" | |
| if not domains: | |
| return False, "β Please select at least one domain" | |
| if examples_per_domain < 1 or examples_per_domain > 10: | |
| return False, "β Examples per domain must be between 1 and 10" | |
| return True, "β Inputs are valid" | |
    def generate_dataset_interface(self, api_key, domains, examples_per_domain, model, rate_limit):
        """Main dataset generation function for Gradio interface"""
        # Generator function: each yield is a 5-tuple of
        # (progress text, summary, preview markdown, download file, file list)
        # matching the interface's output components.
        # Validate inputs
        is_valid, message = self.validate_inputs(api_key, domains, examples_per_domain)
        if not is_valid:
            yield message, "", "", None, None
            return
        # Initialize generator
        try:
            self.generator = HealthDatasetGenerator(api_key.strip(), model)
        except Exception as e:
            yield f"β Error initializing generator: {e}", "", "", None, None
            return
        # Progress tracking
        progress_messages = []
        def progress_callback(message):
            # Accumulates progress lines and returns the rolling tail for
            # display.
            # NOTE(review): messages collected during generate_dataset are
            # only surfaced after it returns (the generator doesn't yield
            # inside the callback), so the UI does not stream intermediate
            # progress live — confirm whether that is intended.
            progress_messages.append(message)
            progress_text = "\n".join(progress_messages[-20:])  # Keep last 20 messages
            return progress_text
        try:
            # Generate dataset
            yield "π Starting dataset generation...", "", "", None, None
            dataset, summary = self.generator.generate_dataset(
                domains=domains,
                examples_per_domain=examples_per_domain,
                rate_limit_delay=rate_limit,
                progress_callback=progress_callback
            )
            if not dataset:
                yield "β No examples generated", "", "", None, None
                return
            # Export dataset
            progress_callback("πΎ Exporting dataset...")
            zip_filename, file_list = self.generator.export_dataset()
            # Create preview
            preview = self.create_dataset_preview(dataset)
            # Final progress
            final_progress = progress_callback(f"π Generation complete! Files: {', '.join(file_list)}")
            yield final_progress, summary, preview, zip_filename, file_list
        except Exception as e:
            yield f"β Error during generation: {e}", "", "", None, None
| def create_dataset_preview(self, dataset): | |
| """Create a preview of the generated dataset""" | |
| if not dataset: | |
| return "No data to preview" | |
| preview = "π **Dataset Preview**\n\n" | |
| # Summary statistics | |
| preview += f"**Total Examples:** {len(dataset)}\n" | |
| # Domain distribution | |
| domains = [ex['metadata']['domain'] for ex in dataset] | |
| domain_counts = {d: domains.count(d) for d in set(domains)} | |
| preview += f"**Domain Distribution:** {domain_counts}\n" | |
| # Severity distribution | |
| severities = [ex['metadata']['severity'] for ex in dataset] | |
| severity_counts = {s: severities.count(s) for s in set(severities)} | |
| preview += f"**Severity Distribution:** {severity_counts}\n\n" | |
| # Sample example | |
| if dataset: | |
| example = dataset[0] | |
| preview += "**Sample Example:**\n" | |
| preview += f"- **Domain:** {example['metadata']['domain']}\n" | |
| preview += f"- **Severity:** {example['metadata']['severity']}\n" | |
| preview += f"- **User Query:** {example['user_context']['user_query'][:150]}...\n" | |
| preview += f"- **Response Length:** {len(example['response'])} characters\n" | |
| preview += f"- **PMID:** {example['metadata']['study_pmid']}\n" | |
| return preview | |
| def analyze_dataset_file(self, zip_file): | |
| """Analyze uploaded dataset file""" | |
| if zip_file is None: | |
| return "No file uploaded" | |
| try: | |
| # Read the zip file | |
| with zipfile.ZipFile(zip_file.name, 'r') as zip_ref: | |
| # Look for the main dataset file | |
| json_files = [f for f in zip_ref.namelist() if f.endswith('.json') and not f.endswith('_samples.json') and not f.endswith('_metadata.json')] | |
| if json_files: | |
| dataset_file = json_files[0] | |
| with zip_ref.open(dataset_file) as f: | |
| dataset = json.load(f) | |
| analysis = "π **Dataset Analysis**\n\n" | |
| analysis += f"**Total Examples:** {len(dataset)}\n" | |
| analysis += f"**Average Response Length:** {sum(len(ex['response']) for ex in dataset) / len(dataset):.0f} characters\n" | |
| # Quality checks | |
| long_responses = sum(1 for ex in dataset if len(ex['response']) > 2000) | |
| has_phases = sum(1 for ex in dataset if "Phase" in ex['response']) | |
| has_dosages = sum(1 for ex in dataset if re.search(r'\d+\s*mg', ex['response'])) | |
| analysis += f"**Quality Metrics:**\n" | |
| analysis += f"- Responses >2000 chars: {long_responses}/{len(dataset)} ({long_responses/len(dataset)*100:.1f}%)\n" | |
| analysis += f"- Responses with phases: {has_phases}/{len(dataset)} ({has_phases/len(dataset)*100:.1f}%)\n" | |
| analysis += f"- Responses with dosages: {has_dosages}/{len(dataset)} ({has_dosages/len(dataset)*100:.1f}%)\n" | |
| return analysis | |
| else: | |
| return "No dataset JSON file found in zip" | |
| except Exception as e: | |
| return f"Error analyzing file: {e}" | |
    def create_interface(self) -> "gr.Blocks":
        """Build and return the full Gradio Blocks UI.

        Three tabs are created:
          1. "Generate Dataset" — API/config inputs on the left, live progress,
             summary, preview and download widgets on the right.
          2. "Analyze Dataset"  — upload a previously generated zip and run
             self.analyze_dataset_file over it.
          3. "Information"      — static usage / cost / disclaimer notes.

        Event wiring (done at the end, inside the Blocks context):
          - any change to domains / examples_per_domain / model refreshes the
            cost estimate via self.estimate_cost;
          - the generate button calls self.generate_dataset_interface;
          - the analyze button calls self.analyze_dataset_file.

        Returns:
            The assembled ``gr.Blocks`` app (not yet launched).

        NOTE(review): many labels contain mojibake (e.g. "π", "β οΈ") —
        presumably emoji corrupted by an encoding round-trip; confirm the
        intended glyphs before changing any of these runtime strings.
        """
        with gr.Blocks(title="Medical Literature Health Dataset Generator", theme=gr.themes.Soft()) as interface:
            # Page header / global disclaimer shown above all tabs.
            gr.Markdown("""
            # π₯ Medical Literature Health Dataset Generator
            This tool generates synthetic health optimization datasets based on medical literature patterns.
            Perfect for training AI models on evidence-based health protocols.
            β οΈ **Important:** Generated content is for research/educational purposes only. Not medical advice.
            """)
            with gr.Tab("π Generate Dataset"):
                with gr.Row():
                    # Left column: all generation parameters.
                    with gr.Column(scale=1):
                        gr.Markdown("### βοΈ Configuration")
                        # Never persisted; passed straight to generate_dataset_interface.
                        api_key = gr.Textbox(
                            label="OpenAI API Key",
                            placeholder="sk-...",
                            type="password",
                            info="Your OpenAI API key for generating protocols"
                        )
                        # Choices come from the instance; defaults mirror the two
                        # most common domains.
                        domains = gr.CheckboxGroup(
                            label="Research Domains",
                            choices=self.available_domains,
                            value=["longevity", "metabolic_health"],
                            info="Select medical research domains to include"
                        )
                        examples_per_domain = gr.Slider(
                            label="Examples per Domain",
                            minimum=1,
                            maximum=10,
                            value=2,
                            step=1,
                            info="Number of examples to generate for each domain"
                        )
                        model = gr.Dropdown(
                            label="OpenAI Model",
                            choices=["gpt-3.5-turbo", "gpt-4", "gpt-4-turbo"],
                            value="gpt-4",
                            info="Model for generating protocols (GPT-4 recommended for quality)"
                        )
                        # Seconds slept between API calls inside the generator loop.
                        rate_limit = gr.Slider(
                            label="Rate Limit Delay (seconds)",
                            minimum=0.5,
                            maximum=5.0,
                            value=2.0,
                            step=0.5,
                            info="Delay between API calls to avoid rate limits"
                        )
                        # Read-only; refreshed by the .change() handlers wired below.
                        cost_estimate = gr.Textbox(
                            label="Cost Estimate",
                            value="Select domains and examples to see estimate",
                            interactive=False
                        )
                        generate_btn = gr.Button(
                            "π Generate Dataset",
                            variant="primary",
                            size="lg"
                        )
                    # Right column: live feedback while generation runs.
                    with gr.Column(scale=2):
                        gr.Markdown("### π Progress & Results")
                        progress_output = gr.Textbox(
                            label="Generation Progress",
                            lines=15,
                            max_lines=20,
                            value="Ready to generate dataset...",
                            interactive=False
                        )
                        summary_output = gr.Textbox(
                            label="Generation Summary",
                            lines=3,
                            interactive=False
                        )
                        preview_output = gr.Markdown(
                            label="Dataset Preview",
                            value="Dataset preview will appear here..."
                        )
                # Full-width row below both columns: download artifacts.
                with gr.Row():
                    download_file = gr.File(
                        label="π₯ Download Generated Dataset",
                        interactive=False
                    )
                    file_list = gr.Textbox(
                        label="Generated Files",
                        placeholder="Files included in download will be listed here",
                        interactive=False
                    )
            with gr.Tab("π Analyze Dataset"):
                gr.Markdown("### π Dataset Analysis")
                gr.Markdown("Upload a generated dataset zip file to analyze its quality and structure.")
                with gr.Row():
                    with gr.Column():
                        # Only zip archives produced by the generator are expected.
                        upload_file = gr.File(
                            label="Upload Dataset Zip File",
                            file_types=[".zip"]
                        )
                        analyze_btn = gr.Button(
                            "π Analyze Dataset",
                            variant="secondary"
                        )
                    with gr.Column():
                        analysis_output = gr.Markdown(
                            label="Analysis Results",
                            value="Upload a dataset file to see analysis..."
                        )
            # Static help tab — no event wiring.
            with gr.Tab("βΉοΈ Information"):
                gr.Markdown("""
                ### π How It Works
                1. **Literature Simulation**: Creates realistic medical studies with proper abstracts, interventions, and outcomes
                2. **Health Profile Generation**: Generates comprehensive health profiles based on study domains and severity levels
                3. **AI Protocol Generation**: Uses OpenAI to create detailed health optimization protocols
                4. **Dataset Export**: Outputs data in multiple formats including OpenAI fine-tuning format
                ### π― Output Files
                - **`dataset.json`**: Complete raw dataset
                - **`dataset_fine_tuning.jsonl`**: OpenAI fine-tuning format
                - **`dataset_samples.json`**: Sample examples for review
                - **`dataset_metadata.json`**: Generation statistics and info
                ### π° Cost Information
                - **GPT-3.5-turbo**: ~$0.05 per example
                - **GPT-4**: ~$0.25 per example
                - **GPT-4-turbo**: ~$0.15 per example
                ### β οΈ Important Notes
                - Generated content is for **research/educational purposes only**
                - **Not medical advice** - always consult healthcare professionals
                - Include appropriate medical disclaimers when using generated content
                - Review sample outputs before using in production
                ### π§ Recommended Settings
                - **Start small**: Generate 2-4 examples first to test quality
                - **Use GPT-4**: Better quality than GPT-3.5-turbo
                - **Rate limiting**: Use 2+ second delays to avoid API limits
                - **Multiple domains**: Include diverse domains for comprehensive dataset
                """)
            # ---- Event handlers -------------------------------------------
            # Update cost estimate when inputs change
            def update_cost_estimate(domains, examples_per_domain, model):
                # Thin closure so .change() can pass current widget values through.
                return self.estimate_cost(domains, examples_per_domain, model)
            # Every cost-relevant input re-triggers the estimate.
            for input_component in [domains, examples_per_domain, model]:
                input_component.change(
                    fn=update_cost_estimate,
                    inputs=[domains, examples_per_domain, model],
                    outputs=[cost_estimate]
                )
            # Generate dataset
            generate_btn.click(
                fn=self.generate_dataset_interface,
                inputs=[api_key, domains, examples_per_domain, model, rate_limit],
                outputs=[progress_output, summary_output, preview_output, download_file, file_list]
            )
            # Analyze dataset
            analyze_btn.click(
                fn=self.analyze_dataset_file,
                inputs=[upload_file],
                outputs=[analysis_output]
            )
        return interface
| # ===================================================================== | |
| # STEP 4: LAUNCH THE INTERFACE | |
| # ===================================================================== | |
def main():
    """Build the Gradio app and serve it over HTTP.

    Prints a short startup banner, constructs the Blocks interface via
    HealthDatasetGradioInterface, then blocks while the server runs.
    """
    banner = (
        "π Launching Medical Literature Health Dataset Generator",
        "This will start a web interface accessible through your browser",
    )
    for line in banner:
        print(line)

    # Assemble the web UI from the interface builder class.
    app = HealthDatasetGradioInterface().create_interface()

    # Serve the app; these mirror the project's chosen launch settings.
    launch_options = {
        "share": True,              # create a public shareable link
        "server_name": "0.0.0.0",   # listen on all network interfaces
        "server_port": 7860,        # Gradio's default port
        "show_error": True,         # surface detailed errors in the UI
        "quiet": False,             # keep startup info visible
    }
    app.launch(**launch_options)
# Standard entry-point guard: run the app only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
# For Google Colab, uncomment the following (Colab does not set __name__
# to "__main__" the same way a script invocation does):
# main()