naazimsnh02 committed on
Commit
ec4aa90
·
1 Parent(s): 110a838

Initial deployment: Autonomous AI agent for code modernization

.env.example ADDED
@@ -0,0 +1,29 @@
1
+ # AI Provider Configuration
2
+ # ============================================
3
+ # Choose your AI provider: gemini or nebius
4
+ AI_PROVIDER=gemini
5
+
6
+ # Gemini API Configuration
7
+ GEMINI_API_KEY=your_gemini_api_key
8
+ # Optional: Change the Gemini model (default: gemini-2.5-flash)
9
+ # Other options: gemini-3-pro, gemini-2.5-pro, etc.
10
+ GEMINI_MODEL=gemini-2.5-flash
11
+
12
+ # Nebius Token Factory Configuration
13
+ NEBIUS_API_KEY=your_nebius_api_key
14
+ # Optional: Change the Nebius model (default: zai-org/GLM-4.5)
15
+ NEBIUS_MODEL=zai-org/GLM-4.5
16
+
17
+ # Modal Configuration
18
+ MODAL_TOKEN_ID=your_modal_token_id
19
+ MODAL_TOKEN_SECRET=your_modal_token_secret
20
+ MODAL_API_URL=your_modal_api_url
21
+
22
+ # GitHub Configuration for code fetching
23
+ GITHUB_TOKEN=your_github_personal_access_token
24
+
25
+ # Optional: Database Configuration
26
+ DATABASE_URL=sqlite:///./modernizer.db
27
+
28
+ # Tavily API Configuration (Optional)
29
+ TAVILY_API_KEY=your_tavily_api_key
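A minimal sketch of how these settings can be read at startup with `python-dotenv` (variable names match `.env.example`; the fallback defaults are assumptions):

```python
# Illustrative only: select the AI provider configured in .env
import os
from dotenv import load_dotenv

load_dotenv()  # read variables from a local .env file

provider = os.getenv("AI_PROVIDER", "gemini")
if provider == "gemini":
    api_key = os.getenv("GEMINI_API_KEY")
    model = os.getenv("GEMINI_MODEL", "gemini-2.5-flash")
else:  # nebius
    api_key = os.getenv("NEBIUS_API_KEY")
    model = os.getenv("NEBIUS_MODEL", "zai-org/GLM-4.5")

if not api_key:
    raise RuntimeError(f"Missing API key for provider '{provider}'")
```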
.gitignore ADDED
@@ -0,0 +1,69 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ build/
8
+ develop-eggs/
9
+ dist/
10
+ downloads/
11
+ eggs/
12
+ .eggs/
13
+ lib/
14
+ lib64/
15
+ parts/
16
+ sdist/
17
+ var/
18
+ wheels/
19
+ *.egg-info/
20
+ .installed.cfg
21
+ *.egg
22
+
23
+ # Virtual Environment
24
+ venv/
25
+ env/
26
+ ENV/
27
+ .venv
28
+
29
+ # IDE
30
+ .vscode/
31
+ .idea/
32
+ *.swp
33
+ *.swo
34
+ *~
35
+
36
+ # Testing
37
+ .pytest_cache/
38
+ .pattern_cache/
39
+ .coverage
40
+ htmlcov/
41
+ .tox/
42
+ .mypy_cache/
43
+ .dmypy.json
44
+ dmypy.json
45
+
46
+ # Environment variables
47
+ .env
48
+ .env.local
49
+
50
+ # Uploads and temporary files
51
+ uploads/
52
+ *.zip
53
+ *.tar.gz
54
+
55
+ # Output directories
56
+ modernized_output/
57
+ output/
58
+ temp/
59
+ tmp/
60
+
61
+ # OS
62
+ .DS_Store
63
+ Thumbs.db
64
+
65
+ # Logs
66
+ *.log
67
+
68
+ # Modal
69
+ .modal/
README.md CHANGED
@@ -1,14 +1,295 @@
1
  ---
2
- title: Legacy Code Modernizer
3
- emoji: 🌍
4
  colorFrom: purple
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 6.0.1
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: AI-powered code modernization agent
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: Legacy Code Modernizer - Autonomous AI Agent
3
+ emoji: 🤖
4
  colorFrom: purple
5
+ colorTo: blue
6
  sdk: gradio
7
  sdk_version: 6.0.1
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ short_description: Autonomous AI agent for code modernization with MCP tools
12
+ tags:
13
+ - mcp-in-action-track-enterprise
14
+ - code-modernization
15
+ - autonomous-agent
16
+ - mcp
17
+ - gradio
18
  ---
19
 
20
+ # 🤖 Legacy Code Modernizer - Autonomous AI Agent
21
+
22
+ **Track 2: MCP in Action - Enterprise Applications**
23
+
24
+ An autonomous AI agent that modernizes legacy codebases through intelligent planning, reasoning, and execution using Model Context Protocol (MCP) tools.
25
+
26
+ ## 🎯 Project Overview
27
+
28
+ Legacy Code Modernizer is a complete autonomous agent system that transforms outdated code into modern, secure, and maintainable software. The agent autonomously:
29
+
30
+ 1. **Plans** - Analyzes codebases and creates modernization strategies
31
+ 2. **Reasons** - Makes intelligent decisions about transformation priorities
32
+ 3. **Executes** - Applies transformations, generates tests, and validates changes
33
+ 4. **Integrates** - Creates GitHub PRs with comprehensive documentation
34
+
35
+ ## 🏆 Why This Project Stands Out
36
+
37
+ ### Autonomous Agent Capabilities
38
+
39
+ **Multi-Phase Planning & Reasoning:**
40
+ - **Phase 1**: Intelligent file discovery and classification using AI pattern detection
41
+ - **Phase 2**: Semantic code analysis with vector-based similarity search (LlamaIndex + Chroma)
42
+ - **Phase 3**: Deep pattern analysis using multiple AI models (Gemini, Nebius AI)
43
+ - **Phase 4**: Autonomous code transformation with context-aware reasoning
44
+ - **Phase 5**: Automated testing in isolated sandbox + GitHub PR creation
45
+
46
+ **Context Engineering & RAG:**
47
+ - Vector embeddings for semantic code search
48
+ - Pattern grouping across similar files
49
+ - Historical transformation caching via MCP Memory
50
+ - Real-time migration guide retrieval via MCP Search
51
+
52
+ ### MCP Tools Integration
53
+
54
+ The agent uses **4 MCP servers** as autonomous tools:
55
+
56
+ 1. **GitHub MCP** - Autonomous PR creation with comprehensive documentation
57
+ 2. **Tavily Search MCP** - Real-time migration guide discovery
58
+ 3. **Memory MCP** - Pattern analysis caching and learning
59
+ 4. **Filesystem MCP** - Safe file operations (planned)
60
+
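+ A minimal sketch of how an agent can drive one of these servers with the Python `mcp` SDK over stdio; the server command and tool name below are illustrative, not the project's exact configuration:
+
+ ```python
+ # Hedged sketch: launch an MCP server as a subprocess and call a single tool
+ import asyncio
+ from mcp import ClientSession, StdioServerParameters
+ from mcp.client.stdio import stdio_client
+
+ async def search_migration_guides(query: str):
+     # "tavily-mcp" and the "search" tool name are placeholders for illustration
+     params = StdioServerParameters(command="npx", args=["-y", "tavily-mcp"])
+     async with stdio_client(params) as (read, write):
+         async with ClientSession(read, write) as session:
+             await session.initialize()
+             await session.list_tools()  # discover the tools the server exposes
+             return await session.call_tool("search", {"query": query})
+
+ asyncio.run(search_migration_guides("MySQLdb to PyMySQL migration guide"))
+ ```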
61
+ ### Real-World Enterprise Value
62
+
63
+ - **Multi-language support**: Python, Java, JavaScript, TypeScript
64
+ - **Secure execution**: Modal sandbox with isolated test environments
65
+ - **Production-ready**: Comprehensive test generation with coverage reporting
66
+
67
+ ## 🚀 Demo
68
+
69
+ ### Video Demo
70
+ **[Demo video](https://drive.google.com/file/d/1ph0NK8QKXRStjydqBV9w6HJaViirswE2/view?usp=sharing)**
71
+
72
+ ### Social Media Post
73
+ **X post link will be added here**
74
+
75
+ ## 🎬 Quick Start
76
+
77
+ ### Try It Live on Hugging Face Spaces
78
+
79
+ 1. **Upload a code file** (Python, Java, JavaScript, TypeScript)
80
+ 2. **Select target version** (auto-detected from your code)
81
+ 3. **Click "Start Modernization"**
82
+ 4. **Watch the autonomous agent work** through all 5 phases
83
+ 5. **Download modernized code, tests, and reports**
84
+
85
+ ### Local Installation
86
+
87
+ ```bash
88
+ # Clone repository
89
+ git clone https://huggingface.co/spaces/MCP-1st-Birthday/legacy_code_modernizer
90
+ cd legacy_code_modernizer
91
+
92
+ # Set up environment variables
93
+ cp .env.example .env
94
+ # Edit .env with your API keys:
95
+ # - GEMINI_API_KEY (required)
96
+ # - GITHUB_TOKEN (for PR creation)
97
+ # - TAVILY_API_KEY (for search)
98
+ # - MODAL_TOKEN_ID & MODAL_TOKEN_SECRET (for sandbox)
99
+
100
+ # Set up Python virtual environment
101
+ # On macOS / Linux:
102
+ source venv/bin/activate
103
+ # On Windows PowerShell:
104
+ .\venv\Scripts\Activate.ps1
105
+ # On Windows CMD:
106
+ venv\Scripts\activate.bat
107
+
108
+ # Install dependencies
109
+ pip install -r requirements.txt
110
+
111
+ # Run the Gradio app
112
+ python app.py
113
+ ```
114
+
115
+ ## 🧠 Autonomous Agent Architecture
116
+
117
+ ### Planning Phase
118
+ ```
119
+ Input: Legacy codebase
120
+
121
+ Agent analyzes file structure and content
122
+
123
+ Classifies files by modernization priority
124
+
125
+ Creates transformation roadmap
126
+ ```
127
+
128
+ ### Reasoning Phase
129
+ ```
130
+ Agent groups similar patterns using vector search
131
+
132
+ Retrieves migration guides via Tavily MCP
133
+
134
+ Checks cached analyses via Memory MCP
135
+
136
+ Prioritizes transformations by risk/impact
137
+ ```
138
+
139
+ ### Execution Phase
140
+ ```
141
+ Agent transforms code with AI models
142
+
143
+ Generates comprehensive test suites
144
+
145
+ Validates in isolated Modal sandbox
146
+
147
+ Auto-fixes export/import issues
148
+ ```
149
+
150
+ ### Integration Phase
151
+ ```
152
+ Agent creates GitHub branch via GitHub MCP
153
+
154
+ Commits transformed files
155
+
156
+ Generates PR with deployment checklist
157
+
158
+ Adds rollback plan and test results
159
+ ```
160
+
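+ A compressed sketch of how these four phases could be wired together with LangGraph (listed under Orchestration below); the state fields and node bodies are placeholders, not the project's actual implementation:
+
+ ```python
+ # Hedged sketch: a linear plan -> reason -> execute -> integrate graph
+ from typing import TypedDict
+ from langgraph.graph import StateGraph, START, END
+
+ class AgentState(TypedDict):
+     files: dict      # file path -> contents
+     roadmap: list    # transformation plan
+     results: dict    # transformed code, tests, reports
+
+ def plan(state: AgentState) -> AgentState: return state       # classify files, build roadmap
+ def reason(state: AgentState) -> AgentState: return state     # group patterns, fetch guides
+ def execute(state: AgentState) -> AgentState: return state    # transform, test, validate
+ def integrate(state: AgentState) -> AgentState: return state  # open the GitHub PR
+
+ graph = StateGraph(AgentState)
+ for name, fn in [("plan", plan), ("reason", reason), ("execute", execute), ("integrate", integrate)]:
+     graph.add_node(name, fn)
+ graph.add_edge(START, "plan")
+ graph.add_edge("plan", "reason")
+ graph.add_edge("reason", "execute")
+ graph.add_edge("execute", "integrate")
+ graph.add_edge("integrate", END)
+ agent = graph.compile()
+ # final_state = agent.invoke({"files": {...}, "roadmap": [], "results": {}})
+ ```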
161
+ ## 🛠️ Technical Stack
162
+
163
+ ### AI & LLM
164
+ - **Google Gemini** - Primary reasoning engine with large context window
165
+ - **Nebius AI** - Alternative model for diverse perspectives
166
+ - **LlamaIndex** - RAG framework for semantic code search
167
+ - **Chroma** - Vector database for embeddings
168
+
169
+ ### MCP Integration
170
+ - **mcp** (v1.22.0) - Model Context Protocol SDK
171
+ - **@modelcontextprotocol/server-github** - GitHub operations
172
+ - **@modelcontextprotocol/server-tavily** - Web search
173
+ - **@modelcontextprotocol/server-memory** - Persistent storage
174
+
175
+ ### Execution & Testing
176
+ - **Modal** - Serverless sandbox for secure test execution
177
+ - **pytest/Jest/JUnit** - Language-specific test frameworks
178
+ - **Coverage.py/JaCoCo** - Code coverage analysis
179
+
180
+ ### UI & Orchestration
181
+ - **Gradio 6.0** - Interactive web interface
182
+ - **LangGraph** - Agent workflow orchestration
183
+ - **asyncio** - Asynchronous execution
184
+
185
+ ## 📊 Features Showcase
186
+
187
+ ### 1. Intelligent Pattern Detection
188
+ ```python
189
+ # Agent automatically detects legacy patterns:
190
+ - Deprecated libraries (MySQLdb → PyMySQL)
191
+ - Security vulnerabilities (SQL injection)
192
+ - Python 2 syntax → Python 3
193
+ - Missing type hints
194
+ - Old-style string formatting
195
+ ```
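+ For example, the first item above typically means replacing `MySQLdb` plus string interpolation with `PyMySQL` and parameterized queries (a simplified before/after, not actual agent output):
+
+ ```python
+ # Before (legacy): MySQLdb with string interpolation -> SQL injection risk
+ #   import MySQLdb
+ #   cursor.execute("SELECT * FROM users WHERE name = '%s'" % name)
+
+ # After (modernized): PyMySQL with a parameterized query
+ import pymysql
+
+ name = "alice"  # example input
+ conn = pymysql.connect(host="localhost", user="app", password="...", database="app")
+ with conn.cursor() as cursor:
+     cursor.execute("SELECT * FROM users WHERE name = %s", (name,))
+     rows = cursor.fetchall()
+ ```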
196
+
197
+ ### 2. Semantic Code Search
198
+ ```python
199
+ # Vector-based similarity search finds:
200
+ - Files with similar legacy patterns
201
+ - Related security vulnerabilities
202
+ - Common refactoring opportunities
203
+ ```
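+ An illustrative sketch of the underlying idea using Chroma directly (the project wires this through LlamaIndex; the collection name and documents are made up):
+
+ ```python
+ # Hedged sketch: index code snippets, then query for similar legacy patterns
+ import chromadb
+
+ client = chromadb.Client()  # in-memory; the real pipeline persists embeddings
+ collection = client.create_collection("legacy_code")
+
+ collection.add(
+     ids=["utils/db.py", "models/user.py"],
+     documents=[
+         "import MySQLdb\ncursor.execute('SELECT ... %s' % name)",
+         "def get_user(user_id):\n    return db.query(User).get(user_id)",
+     ],
+ )
+
+ hits = collection.query(query_texts=["MySQLdb usage with string interpolation"], n_results=2)
+ print(hits["ids"][0])  # most similar files first
+ ```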
204
+
205
+ ### 3. Autonomous Test Generation
206
+ ```python
207
+ # Agent generates:
208
+ - Unit tests with pytest/Jest/JUnit
209
+ - Integration tests
210
+ - Edge case coverage
211
+ - Performance benchmarks
212
+ ```
213
+
214
+ ### 4. GitHub Integration via MCP
215
+ ```python
216
+ # Automated PR includes:
217
+ - Comprehensive change summary
218
+ - Test results with coverage
219
+ - Risk assessment
220
+ - Deployment checklist
221
+ - Rollback plan
222
+ ```
223
+
224
+ ## 🎯 Supported Languages & Versions
225
+
226
+ ### Python
227
+ - **Versions**: 3.10, 3.11, 3.12, 3.13, 3.14
228
+ - **Frameworks**: Django 5.2 LTS, Flask 3.1, FastAPI 0.122
229
+ - **Testing**: pytest with coverage
230
+
231
+ ### Java
232
+ - **Versions**: Java 17 LTS, 21 LTS, 23, 25 LTS
233
+ - **Frameworks**: Spring Boot 3.4, 4.0
234
+ - **Testing**: Maven + JUnit 5 + JaCoCo
235
+
236
+ ### JavaScript
237
+ - **Standards**: ES2024, ES2025
238
+ - **Runtimes**: Node.js 22 LTS, 24 LTS, 25
239
+ - **Frameworks**: React 19, Angular 21, Vue 3.5, Express 5.1, Next.js 16
240
+ - **Testing**: Jest with coverage
241
+
242
+ ### TypeScript
243
+ - **Versions**: 5.6, 5.7, 5.8, 5.9
244
+ - **Frameworks**: React 19, Angular 21, Next.js 16
245
+ - **Testing**: Jest with ts-jest
246
+
247
+ ## 🔒 Security & Isolation
248
+
249
+ ### Modal Sandbox Execution
250
+ - **Network isolation**: No external network access during tests
251
+ - **Filesystem isolation**: Temporary containers per execution
252
+ - **Resource limits**: CPU and memory constraints
253
+ - **Automatic cleanup**: Containers destroyed after execution
254
+
255
+ ### Code Validation
256
+ - **Syntax checking**: Pre-execution validation
257
+ - **Import/export fixing**: Automatic resolution of module issues
258
+ - **Security scanning**: Detection of vulnerabilities
259
+ - **Type checking**: Language-specific validation
260
+
261
+
262
+ ## 🎓 Advanced Features
263
+
264
+ ### Context Engineering
265
+ - **Sliding window context**: Manages large files efficiently
266
+ - **Cross-file analysis**: Understands dependencies
267
+ - **Pattern learning**: Improves with usage via Memory MCP
268
+
269
+ ### RAG Implementation
270
+ - **Semantic chunking**: Intelligent code splitting
271
+ - **Vector similarity**: Finds related patterns
272
+ - **Hybrid search**: Combines keyword + semantic search
273
+
274
+ ### Agent Reasoning
275
+ - **Priority scoring**: Risk vs. impact analysis
276
+ - **Dependency tracking**: Understands file relationships
277
+
278
+ ## 📝 License
279
+
280
+ Apache 2.0 - See LICENSE file for details
281
+
282
+ ## 🙏 Acknowledgments
283
+
284
+ Built for **MCP's 1st Birthday Hackathon** hosted by Anthropic and Gradio.
285
+
286
+ **Powered by:**
287
+ - Google Gemini & Nebius AI
288
+ - Model Context Protocol (MCP)
289
+ - LlamaIndex & Chroma
290
+ - Modal
291
+ - Gradio
292
+
293
+ ---
294
+
295
+ *Autonomous agents + MCP tools = The future of software development*
app.py ADDED
@@ -0,0 +1,21 @@
1
+ """
2
+ Entry point for HuggingFace Spaces
3
+ Redirects to the actual app in src/ui/app.py
4
+ """
5
+
6
+ import sys
7
+ import os
8
+
9
+ # Add src directory to Python path
10
+ sys.path.insert(0, os.path.dirname(__file__))
11
+
12
+ # Import and run the actual app
13
+ from src.ui.app import app
14
+
15
+ if __name__ == "__main__":
16
+ app.launch(
17
+ server_name="0.0.0.0",
18
+ server_port=7860,
19
+ share=False,
20
+ show_error=True
21
+ )
modal/api_test.py ADDED
@@ -0,0 +1,59 @@
1
+ import requests
2
+ import json
3
+ import os
4
+ from dotenv import load_dotenv
5
+
6
+ load_dotenv()
7
+
8
+ # ---------------------------------------------------------
9
+ # Modal API URL is loaded from .env file
10
+ # Set MODAL_API_URL in your .env file
11
+ # It usually looks like: https://your-username--text-embeddings-inference-api-text-embed-7389a1.modal.run
12
+ # ---------------------------------------------------------
13
+ API_URL = os.getenv("MODAL_API_URL", "").strip()
14
+
15
+ if not API_URL:
16
+ raise ValueError("MODAL_API_URL not found in .env file. Please set it to your Modal endpoint URL.")
17
+
18
+ def test_embeddings():
19
+ print(f"Testing API at: {API_URL}")
20
+
21
+ # 1. Define the input text
22
+ payload = {
23
+ "inputs": [
24
+ "Hello, this is a test sentence.",
25
+ "Running text embeddings on Modal is fast."
26
+ ]
27
+ }
28
+
29
+ try:
30
+ # 2. Send POST request
31
+ response = requests.post(API_URL, json=payload)
32
+
33
+ # 3. Check for errors
34
+ response.raise_for_status()
35
+
36
+ # 4. Parse the result
37
+ data = response.json()
38
+
39
+ # 5. Display results
40
+ model_name = data.get("model", "Unknown")
41
+ embeddings = data.get("embeddings", [])
42
+ dims = data.get("dimensions", 0)
43
+
44
+ print("\n--- Success! ---")
45
+ print(f"Model used: {model_name}")
46
+ print(f"Vector dimensions: {dims}")
47
+ print(f"Number of texts embedded: {len(embeddings)}")
48
+
49
+ # Print the first few numbers of the first embedding to verify
50
+ if embeddings:
51
+ print(f"\nFirst 5 values of first embedding:\n{embeddings[0][:5]}...")
52
+
53
+ except requests.exceptions.RequestException as e:
54
+ print(f"\nError calling API: {e}")
55
+ # e.response is None when the request never reached the server
+ if e.response is not None:
56
+ print(f"Server response: {e.response.text}")
57
+
58
+ if __name__ == "__main__":
59
+ test_embeddings()
pytest.ini ADDED
@@ -0,0 +1,31 @@
1
+ # pytest.ini uses the [pytest] section header ([tool:pytest] is for setup.cfg)
+ [pytest]
2
+ testpaths = tests
3
+ python_files = test_*.py
4
+ python_classes = Test*
5
+ python_functions = test_*
6
+ addopts =
7
+ -v
8
+ --tb=short
9
+ --strict-markers
10
+ --cov=src
11
+ --cov-report=html
12
+ --cov-report=term-missing
13
+ markers =
14
+ integration: Integration tests (deselect with '-m "not integration"')
15
+ slow: Slow tests (deselect with '-m "not slow"')
16
+
17
+ [coverage:run]
18
+ source = src
19
+ omit =
20
+ */tests/*
21
+ */__pycache__/*
22
+ */venv/*
23
+ */env/*
24
+
25
+ [coverage:report]
26
+ precision = 2
27
+ show_missing = True
28
+ skip_covered = False
29
+
30
+ [coverage:html]
31
+ directory = htmlcov
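The `integration` and `slow` markers above let long-running tests be deselected. A programmatic equivalent of running `pytest -m "not slow and not integration"` (assuming the same markers):

```python
# Run the suite from Python, skipping tests marked slow or integration
import pytest

exit_code = pytest.main(["-m", "not slow and not integration"])
```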
requirements.txt ADDED
@@ -0,0 +1,39 @@
1
+ # AI & LLM
2
+ google-genai>=1.0.0
3
+ openai>=1.0.0
4
+ llama-index>=0.14.0
5
+ llama-index-llms-google-genai>=0.4.0
6
+ llama-index-llms-openai>=0.4.0
7
+ llama-index-embeddings-huggingface>=0.5.0
8
+
9
+ # Vector Store & Embeddings
10
+ chromadb>=1.3.0
11
+ llama-index-vector-stores-chroma>=0.4.0
12
+
13
+ # Agent Orchestration
14
+ langgraph>=1.0.0
15
+ langchain-core>=0.3.0
16
+
17
+ # Compute & Sandbox
18
+ modal>=1.2.0
19
+
20
+ # MCP Protocol
21
+ mcp>=1.22.0
22
+
23
+ # UI Framework
24
+ gradio>=6.0.0
25
+
26
+ # Database & ORM
27
+ sqlalchemy>=2.0.0
28
+ pymysql>=1.1.0
29
+
30
+ # Testing
31
+ pytest>=9.0.0
32
+ pytest-cov>=6.0.0
33
+ pytest-timeout>=2.3.0
34
+ pytest-asyncio>=0.24.0
35
+
36
+ # Utilities
37
+ python-dotenv>=1.0.0
38
+ pydantic>=2.10.0
39
+ transformers>=4.30.0 # For proper tokenization
src/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ """Legacy Code Modernizer Agent - AI-powered code modernization system."""
2
+
3
+ __version__ = "0.1.0"
src/agents/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ """Agent components for code analysis and transformation."""
2
+
3
+ from .classifier import CodeClassifier
4
+ from .analyzer import CodeAnalyzer
5
+ from .transformer import CodeTransformer
6
+ from .test_generator import CodeTestGenerator
7
+
8
+ # Keep backward compatibility
9
+ TestGenerator = CodeTestGenerator
10
+
11
+ __all__ = ['CodeClassifier', 'CodeAnalyzer', 'CodeTransformer', 'CodeTestGenerator', 'TestGenerator']
src/agents/analyzer.py ADDED
@@ -0,0 +1,322 @@
1
+ """
2
+ Deep code analyzer using AI with RAG and MCP integration.
3
+ Supports multiple AI providers (Gemini, Nebius, OpenAI).
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import logging
9
+ from typing import Dict, List, Optional
10
+
11
+ from src.config import AIManager, GeminiSchemas
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class CodeAnalyzer:
17
+ """
18
+ Deep analyzer for legacy code patterns using AI + RAG.
19
+ Integrates with MCP servers for enhanced analysis.
20
+ """
21
+
22
+ def __init__(self, mcp_manager=None, search_engine=None):
23
+ """
24
+ Initialize Code Analyzer.
25
+
26
+ Args:
27
+ mcp_manager: Optional MCPManager instance
28
+ search_engine: Optional CodeSearchEngine instance
29
+ """
30
+ self.mcp_manager = mcp_manager
31
+ self.search_engine = search_engine
32
+
33
+ # Use centralized AI manager
34
+ self.ai_manager = AIManager()
35
+
36
+ logger.info(
37
+ f"CodeAnalyzer initialized with provider: {self.ai_manager.provider_name}, "
38
+ f"model: {self.ai_manager.model_name}"
39
+ )
40
+
41
+
42
+ async def analyze_pattern(self, files: List[str], pattern_name: str,
43
+ file_contents: Dict[str, str]) -> Dict:
44
+ """
45
+ Deep analysis of legacy pattern with full context.
46
+
47
+ Args:
48
+ files: List of file paths to analyze
49
+ pattern_name: Name of the pattern (e.g., "MySQLdb usage")
50
+ file_contents: Dictionary mapping file paths to their contents
51
+
52
+ Returns:
53
+ Analysis result dictionary
54
+ """
55
+ logger.info(f"Analyzing pattern: {pattern_name} in {len(files)} files")
56
+
57
+ # Check cache first (if MCP manager available)
58
+ if self.mcp_manager:
59
+ try:
60
+ from src.mcp.memory_client import MemoryMCPClient
61
+ memory_client = MemoryMCPClient(self.mcp_manager)
62
+
63
+ pattern_id = self._generate_pattern_id(pattern_name, files)
64
+ cached_analysis = await memory_client.retrieve_pattern_analysis(pattern_id)
65
+
66
+ if cached_analysis:
67
+ logger.info(f"Using cached analysis for {pattern_name}")
68
+ return cached_analysis
69
+ except Exception as e:
70
+ logger.warning(f"Could not retrieve cached analysis: {e}")
71
+
72
+ # Get context from search engine if available
73
+ context = ""
74
+ if self.search_engine:
75
+ try:
76
+ similar_files = self.search_engine.find_similar_patterns(
77
+ f"Files with {pattern_name}",
78
+ top_k=10
79
+ )
80
+ context = f"\n\nSimilar patterns found in: {', '.join([f['file_path'] for f in similar_files[:5]])}"
81
+ except Exception as e:
82
+ logger.warning(f"Could not get search context: {e}")
83
+
84
+ # Get migration guides from Tavily if available
85
+ migration_guides = ""
86
+ if self.mcp_manager:
87
+ try:
88
+ from src.mcp.search_client import SearchMCPClient
89
+ search_client = SearchMCPClient(self.mcp_manager)
90
+
91
+ # Extract technologies from pattern name
92
+ guides = await search_client.find_migration_guide(
93
+ from_tech=pattern_name.split()[0],
94
+ to_tech="modern alternative",
95
+ max_results=3
96
+ )
97
+
98
+ if guides:
99
+ migration_guides = "\n\nRelevant migration guides:\n"
100
+ for guide in guides:
101
+ migration_guides += f"- {guide['title']}: {guide['url']}\n"
102
+ except Exception as e:
103
+ logger.warning(f"Could not fetch migration guides: {e}")
104
+
105
+ # Combine file contents
106
+ code_samples = "\n\n".join([
107
+ f"=== {file_path} ===\n{content[:1000]}..." # Limit to first 1000 chars per file
108
+ for file_path, content in list(file_contents.items())[:5] # Limit to 5 files
109
+ ])
110
+
111
+ # Build analysis prompt
112
+ prompt = f"""You are a senior software architect analyzing legacy code for modernization.
113
+
114
+ PATTERN TO ANALYZE: {pattern_name}
115
+
116
+ FILES AFFECTED: {', '.join(files)}
117
+
118
+ CODE SAMPLES:
119
+ {code_samples}
120
+
121
+ {context}
122
+ {migration_guides}
123
+
124
+ TASK: Provide a comprehensive analysis with:
125
+ 1. **Current Implementation**: What the code currently does
126
+ 2. **Issues**: Specific problems (security, performance, maintainability)
127
+ 3. **Modern Recommendation**: Recommended library/pattern with version
128
+ 4. **Migration Steps**: Detailed step-by-step migration plan
129
+ 5. **Risk Assessment**: Potential risks and mitigation strategies
130
+ 6. **Estimated Effort**: Time estimate for migration
131
+
132
+ Respond in JSON format with these exact keys:
133
+ {{
134
+ "pattern": "{pattern_name}",
135
+ "files": {json.dumps(files)},
136
+ "analysis": "detailed analysis",
137
+ "issues": ["issue1", "issue2", ...],
138
+ "recommendation": "recommended approach",
139
+ "steps": ["step1", "step2", ...],
140
+ "risks": "risk assessment",
141
+ "effort_hours": estimated_hours
142
+ }}
143
+ """
144
+
145
+ try:
146
+ # Use JSON schema for guaranteed structure
147
+ schema = GeminiSchemas.code_analysis()
148
+
149
+ # Call AI with configured model
150
+ response_text = self.ai_manager.generate_content(
151
+ prompt=prompt,
152
+ temperature=AIManager.TEMPERATURE_PRECISE,
153
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_MEDIUM,
154
+ response_format="json",
155
+ response_schema=schema
156
+ )
157
+
158
+ # Parse JSON response
159
+ analysis = json.loads(response_text)
160
+
161
+ # Cache the analysis
162
+ if self.mcp_manager:
163
+ try:
164
+ from src.mcp.memory_client import MemoryMCPClient
165
+ memory_client = MemoryMCPClient(self.mcp_manager)
166
+ pattern_id = self._generate_pattern_id(pattern_name, files)
167
+ await memory_client.store_pattern_analysis(pattern_id, analysis)
168
+ except Exception as e:
169
+ logger.warning(f"Could not cache analysis: {e}")
170
+
171
+ logger.info(f"Analysis complete for {pattern_name}")
172
+ return analysis
173
+
174
+ except Exception as e:
175
+ logger.error(f"Error during analysis: {e}")
176
+ # Return fallback analysis
177
+ return {
178
+ "pattern": pattern_name,
179
+ "files": files,
180
+ "analysis": f"Error during analysis: {str(e)}",
181
+ "issues": ["Analysis failed"],
182
+ "recommendation": "Manual review required",
183
+ "steps": ["Review error logs", "Retry analysis"],
184
+ "risks": "High - analysis incomplete",
185
+ "effort_hours": 0
186
+ }
187
+
188
+ def _generate_pattern_id(self, pattern_name: str, files: List[str]) -> str:
189
+ """
190
+ Generate unique ID for a pattern.
191
+
192
+ Args:
193
+ pattern_name: Name of the pattern
194
+ files: List of files
195
+
196
+ Returns:
197
+ Unique pattern ID
198
+ """
199
+ import hashlib
200
+
201
+ # Create hash from pattern name and sorted file list
202
+ content = f"{pattern_name}:{'|'.join(sorted(files))}"
203
+ return hashlib.md5(content.encode()).hexdigest()
204
+
205
+ async def analyze_security_issues(self, file_path: str, code: str) -> Dict:
206
+ """
207
+ Analyze code for security vulnerabilities.
208
+
209
+ Args:
210
+ file_path: Path to the file
211
+ code: Code content
212
+
213
+ Returns:
214
+ Security analysis result
215
+ """
216
+ logger.info(f"Analyzing security issues in {file_path}")
217
+
218
+ prompt = f"""Analyze this code for security vulnerabilities:
219
+
220
+ FILE: {file_path}
221
+
222
+ CODE:
223
+ {code[:2000]}
224
+
225
+ Identify:
226
+ 1. SQL injection risks
227
+ 2. Hardcoded credentials
228
+ 3. Insecure cryptography
229
+ 4. Path traversal vulnerabilities
230
+ 5. Command injection risks
231
+ 6. Other security issues
232
+
233
+ Respond in JSON format:
234
+ {{
235
+ "vulnerabilities": [
236
+ {{
237
+ "type": "vulnerability type",
238
+ "severity": "critical|high|medium|low",
239
+ "line_number": estimated_line,
240
+ "description": "description",
241
+ "recommendation": "how to fix"
242
+ }}
243
+ ],
244
+ "security_score": 0-100
245
+ }}
246
+ """
247
+
248
+ try:
249
+ response_text = self.ai_manager.generate_content(
250
+ prompt=prompt,
251
+ temperature=AIManager.TEMPERATURE_PRECISE,
252
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_SMALL,
253
+ response_format="json"
254
+ )
255
+
256
+ return json.loads(response_text)
257
+
258
+ except Exception as e:
259
+ logger.error(f"Error during security analysis: {e}")
260
+ return {
261
+ "vulnerabilities": [],
262
+ "security_score": 0
263
+ }
264
+
265
+ async def suggest_refactoring(self, file_path: str, code: str) -> Dict:
266
+ """
267
+ Suggest code refactoring improvements.
268
+
269
+ Args:
270
+ file_path: Path to the file
271
+ code: Code content
272
+
273
+ Returns:
274
+ Refactoring suggestions
275
+ """
276
+ logger.info(f"Suggesting refactoring for {file_path}")
277
+
278
+ prompt = f"""Suggest refactoring improvements for this code:
279
+
280
+ FILE: {file_path}
281
+
282
+ CODE:
283
+ {code[:2000]}
284
+
285
+ Focus on:
286
+ 1. Code duplication
287
+ 2. Complex functions (high cyclomatic complexity)
288
+ 3. Poor naming conventions
289
+ 4. Missing error handling
290
+ 5. Performance optimizations
291
+ 6. Type hints and documentation
292
+
293
+ Respond in JSON format:
294
+ {{
295
+ "suggestions": [
296
+ {{
297
+ "category": "category",
298
+ "priority": "high|medium|low",
299
+ "description": "what to improve",
300
+ "benefit": "why improve it"
301
+ }}
302
+ ],
303
+ "code_quality_score": 0-100
304
+ }}
305
+ """
306
+
307
+ try:
308
+ response_text = self.ai_manager.generate_content(
309
+ prompt=prompt,
310
+ temperature=AIManager.TEMPERATURE_PRECISE,
311
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_SMALL,
312
+ response_format="json"
313
+ )
314
+
315
+ return json.loads(response_text)
316
+
317
+ except Exception as e:
318
+ logger.error(f"Error during refactoring analysis: {e}")
319
+ return {
320
+ "suggestions": [],
321
+ "code_quality_score": 0
322
+ }
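A minimal usage sketch for the analyzer above, run without an MCP manager or search engine (so caching and migration-guide retrieval are skipped); the file path and contents are illustrative, and the configured provider's API key must be set:

```python
import asyncio
from src.agents.analyzer import CodeAnalyzer

contents = {"utils/db.py": "import MySQLdb\n# ...legacy query code..."}
analyzer = CodeAnalyzer()  # mcp_manager and search_engine are optional
result = asyncio.run(
    analyzer.analyze_pattern(
        files=list(contents.keys()),
        pattern_name="MySQLdb usage",
        file_contents=contents,
    )
)
print(result["recommendation"], result["effort_hours"])
```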
src/agents/classifier.py ADDED
@@ -0,0 +1,119 @@
1
+ """Code classification using AI."""
2
+
3
+ import json
4
+ from typing import Dict, List
5
+ import os
6
+ from dotenv import load_dotenv
7
+
8
+ from src.config import AIManager, GeminiSchemas
9
+
10
+ load_dotenv()
11
+
12
+
13
+ class CodeClassifier:
14
+ """Classifies code files into modernization categories using Gemini."""
15
+
16
+ def __init__(self):
17
+ """Initialize the classifier with AI client."""
18
+ # Use centralized AI manager
19
+ self.ai_manager = AIManager()
20
+
21
+ def classify_files(self, file_list: List[str], batch_size: int = 25) -> Dict[str, str]:
22
+ """
23
+ Classify files with few-shot prompting via the configured AI provider.
24
+
25
+ Args:
26
+ file_list: List of file paths to classify
27
+ batch_size: Number of files to process per API call
28
+
29
+ Returns:
30
+ Dictionary mapping filenames to categories
31
+ """
32
+ all_results = {}
33
+
34
+ # Process in batches to avoid token limits
35
+ for i in range(0, len(file_list), batch_size):
36
+ batch = file_list[i:i + batch_size]
37
+ batch_results = self._classify_batch(batch)
38
+ all_results.update(batch_results)
39
+
40
+ return all_results
41
+
42
+ def _classify_batch(self, file_list: List[str]) -> Dict[str, str]:
43
+ """Classify a batch of files."""
44
+
45
+ prompt = f"""You are a code modernization expert. Classify these files into categories.
46
+
47
+ CATEGORIES:
48
+ - modernize_high: Legacy patterns that need immediate update (Python 2, deprecated libs, security issues)
49
+ - modernize_low: Minor improvements needed (add type hints, optimize imports)
50
+ - skip: Already modern or non-code files
51
+
52
+ FEW-SHOT EXAMPLES:
53
+ 1. utils/db.py (uses MySQLdb, string interpolation) → modernize_high
54
+ 2. config.py (hardcoded credentials) → modernize_high
55
+ 3. models/user.py (missing type hints) → modernize_low
56
+ 4. src/api/UserController.java (uses deprecated Vector, no generics) → modernize_high
57
+ 5. frontend/app.js (uses jQuery 1.x, inline event handlers) → modernize_high
58
+ 6. legacy_php/login.php (mysql_connect, no prepared statements) → modernize_high
59
+ 7. README.md → skip
60
+ 8. tests/test_api.py (uses unittest, modern Python 3) → skip
61
+ 9. package.json → skip
62
+ 10. .gitignore → skip
63
+
64
+ FILES TO CLASSIFY:
65
+ {json.dumps(file_list, indent=2)}
66
+
67
+ Return JSON object with filename as key and category as value.
68
+ Example: {{"file1.py": "modernize_high", "file2.js": "skip"}}
69
+ """
70
+
71
+ try:
72
+ # Use JSON schema for guaranteed structure
73
+ schema = GeminiSchemas.file_classification()
74
+
75
+ response_text = self.ai_manager.generate_content(
76
+ prompt=prompt,
77
+ temperature=AIManager.TEMPERATURE_PRECISE,
78
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_MEDIUM,
79
+ response_format="json",
80
+ response_schema=schema
81
+ )
82
+
83
+ result = json.loads(response_text)
84
+
85
+ # Validate results
86
+ valid_categories = {"modernize_high", "modernize_low", "skip"}
87
+ for filename, category in result.items():
88
+ if category not in valid_categories:
89
+ result[filename] = "skip" # Default to skip if invalid
90
+
91
+ return result
92
+
93
+ except Exception as e:
94
+ print(f"Error classifying batch: {e}")
95
+ # Return default classifications on error
96
+ return {f: "skip" for f in file_list}
97
+
98
+ def get_statistics(self, classifications: Dict[str, str]) -> Dict[str, int]:
99
+ """
100
+ Get statistics about classifications.
101
+
102
+ Args:
103
+ classifications: Dictionary of file classifications
104
+
105
+ Returns:
106
+ Dictionary with counts per category
107
+ """
108
+ stats = {
109
+ "modernize_high": 0,
110
+ "modernize_low": 0,
111
+ "skip": 0,
112
+ "total": len(classifications)
113
+ }
114
+
115
+ for category in classifications.values():
116
+ if category in stats:
117
+ stats[category] += 1
118
+
119
+ return stats
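A short usage sketch for the classifier (file names are illustrative; the configured provider's API key must be set):

```python
from src.agents.classifier import CodeClassifier

classifier = CodeClassifier()
labels = classifier.classify_files(["utils/db.py", "README.md", "models/user.py"])
# e.g. {"utils/db.py": "modernize_high", "README.md": "skip", ...}
print(classifier.get_statistics(labels))
```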
src/agents/code_validator.py ADDED
@@ -0,0 +1,346 @@
1
+ """
2
+ Code Validator - Validates generated code for common issues.
3
+ Catches problems before they reach the sandbox execution phase.
4
+ """
5
+
6
+ import re
7
+ import logging
8
+ from typing import Dict, List, Tuple
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class CodeValidator:
14
+ """Validates generated code for common issues and inconsistencies."""
15
+
16
+ @staticmethod
17
+ def validate_typescript_module_system(source_code: str) -> Tuple[bool, List[str]]:
18
+ """
19
+ Validate that TypeScript code is compatible with Jest/ts-jest (CommonJS).
20
+
21
+ Args:
22
+ source_code: TypeScript source code
23
+
24
+ Returns:
25
+ (is_valid, list_of_issues)
26
+ """
27
+ issues = []
28
+
29
+ # Check for ES module-only features that break Jest/ts-jest
30
+ if 'import.meta' in source_code:
31
+ issues.append(
32
+ "Code uses 'import.meta' which requires ES modules. "
33
+ "Jest/ts-jest uses CommonJS. Remove import.meta usage."
34
+ )
35
+
36
+ if re.search(r'\btop-level\s+await\b', source_code) or re.search(r'^await\s+', source_code, re.MULTILINE):
37
+ issues.append(
38
+ "Code uses top-level await which requires ES modules. "
39
+ "Jest/ts-jest uses CommonJS. Wrap in async function."
40
+ )
41
+
42
+ # Check for CLI execution patterns that shouldn't be in library code
43
+ if 'process.argv[1]' in source_code or 'if (require.main === module)' in source_code:
44
+ issues.append(
45
+ "Code includes CLI execution logic. "
46
+ "Library code should not include main execution blocks."
47
+ )
48
+
49
+ return len(issues) == 0, issues
50
+
51
+ @staticmethod
52
+ def validate_typescript_exports(source_code: str, test_code: str) -> Tuple[bool, List[str]]:
53
+ """
54
+ Validate that all TypeScript types/enums/interfaces imported in tests are exported in source.
55
+
56
+ Args:
57
+ source_code: TypeScript source code
58
+ test_code: TypeScript test code
59
+
60
+ Returns:
61
+ (is_valid, list_of_issues)
62
+ """
63
+ issues = []
64
+
65
+ # Extract imports from test code
66
+ import_pattern = r'import\s+\{([^}]+)\}\s+from\s+["\']\./'
67
+ test_imports = re.findall(import_pattern, test_code)
68
+
69
+ if not test_imports:
70
+ return True, []
71
+
72
+ # Get all imported names
73
+ imported_names = set()
74
+ for import_group in test_imports:
75
+ names = [name.strip() for name in import_group.split(',')]
76
+ imported_names.update(names)
77
+
78
+ # Check if each imported name is exported in source
79
+ for name in imported_names:
80
+ # Check for export function/class/enum/interface/type
81
+ export_patterns = [
82
+ rf'export\s+(function|class|enum|interface|type)\s+{name}\b',
83
+ rf'export\s+\{{\s*[^}}]*\b{name}\b[^}}]*\}}',
84
+ rf'export\s+const\s+{name}\s*=',
85
+ ]
86
+
87
+ is_exported = any(re.search(pattern, source_code) for pattern in export_patterns)
88
+
89
+ if not is_exported:
90
+ # Check if it's declared but not exported
91
+ declaration_patterns = [
92
+ rf'\b(function|class|enum|interface|type)\s+{name}\b',
93
+ rf'\bconst\s+{name}\s*=',
94
+ ]
95
+ is_declared = any(re.search(pattern, source_code) for pattern in declaration_patterns)
96
+
97
+ if is_declared:
98
+ issues.append(
99
+ f"'{name}' is declared in source but not exported. "
100
+ f"Add 'export' keyword before the declaration."
101
+ )
102
+ else:
103
+ issues.append(
104
+ f"'{name}' is imported in tests but not found in source code."
105
+ )
106
+
107
+ return len(issues) == 0, issues
108
+
109
+ @staticmethod
110
+ def validate_javascript_exports(source_code: str, test_code: str) -> Tuple[bool, List[str]]:
111
+ """
112
+ Validate that all JavaScript functions/classes imported in tests are exported in source.
113
+
114
+ Args:
115
+ source_code: JavaScript source code
116
+ test_code: JavaScript test code
117
+
118
+ Returns:
119
+ (is_valid, list_of_issues)
120
+ """
121
+ issues = []
122
+
123
+ # Extract imports from test code (ES6 imports)
124
+ import_pattern = r'import\s+\{([^}]+)\}\s+from\s+["\']\./'
125
+ test_imports = re.findall(import_pattern, test_code)
126
+
127
+ if not test_imports:
128
+ return True, []
129
+
130
+ # Get all imported names
131
+ imported_names = set()
132
+ for import_group in test_imports:
133
+ names = [name.strip() for name in import_group.split(',')]
134
+ imported_names.update(names)
135
+
136
+ # Check if each imported name is exported in source
137
+ for name in imported_names:
138
+ # Check for various export patterns
139
+ export_patterns = [
140
+ rf'export\s+(function|class|const|let|var)\s+{name}\b',
141
+ rf'export\s+\{{\s*[^}}]*\b{name}\b[^}}]*\}}',
142
+ rf'module\.exports\s*=\s*\{{[^}}]*\b{name}\b[^}}]*\}}',
143
+ rf'exports\.{name}\s*=',
144
+ ]
145
+
146
+ is_exported = any(re.search(pattern, source_code) for pattern in export_patterns)
147
+
148
+ if not is_exported:
149
+ issues.append(
150
+ f"'{name}' is imported in tests but not exported in source. "
151
+ f"Add it to the export statement."
152
+ )
153
+
154
+ return len(issues) == 0, issues
155
+
156
+ @staticmethod
157
+ def validate_python_imports(source_code: str, test_code: str) -> Tuple[bool, List[str]]:
158
+ """
159
+ Validate that all Python functions/classes imported in tests exist in source.
160
+
161
+ Args:
162
+ source_code: Python source code
163
+ test_code: Python test code
164
+
165
+ Returns:
166
+ (is_valid, list_of_issues)
167
+ """
168
+ issues = []
169
+
170
+ # Extract imports from test code
171
+ import_patterns = [
172
+ r'from\s+\w+\s+import\s+([^#\n]+)',
173
+ r'import\s+(\w+)',
174
+ ]
175
+
176
+ imported_names = set()
177
+ for pattern in import_patterns:
178
+ matches = re.findall(pattern, test_code)
179
+ for match in matches:
180
+ names = [name.strip() for name in match.split(',')]
181
+ imported_names.update(names)
182
+
183
+ # Check if each imported name is defined in source
184
+ for name in imported_names:
185
+ # Check for function/class definitions
186
+ definition_patterns = [
187
+ rf'^def\s+{name}\s*\(',
188
+ rf'^class\s+{name}\b',
189
+ rf'^{name}\s*=',
190
+ ]
191
+
192
+ is_defined = any(re.search(pattern, source_code, re.MULTILINE) for pattern in definition_patterns)
193
+
194
+ if not is_defined:
195
+ issues.append(
196
+ f"'{name}' is imported in tests but not defined in source code."
197
+ )
198
+
199
+ return len(issues) == 0, issues
200
+
201
+ @staticmethod
202
+ def validate_code(source_code: str, test_code: str, language: str) -> Tuple[bool, List[str]]:
203
+ """
204
+ Validate code based on language.
205
+
206
+ Args:
207
+ source_code: Source code
208
+ test_code: Test code
209
+ language: Programming language
210
+
211
+ Returns:
212
+ (is_valid, list_of_issues)
213
+ """
214
+ language = language.lower()
215
+ all_issues = []
216
+
217
+ if language == 'typescript':
218
+ # Check module system compatibility
219
+ is_valid_module, module_issues = CodeValidator.validate_typescript_module_system(source_code)
220
+ all_issues.extend(module_issues)
221
+
222
+ # Check exports
223
+ is_valid_exports, export_issues = CodeValidator.validate_typescript_exports(source_code, test_code)
224
+ all_issues.extend(export_issues)
225
+
226
+ return len(all_issues) == 0, all_issues
227
+ elif language == 'javascript':
228
+ return CodeValidator.validate_javascript_exports(source_code, test_code)
229
+ elif language == 'python':
230
+ return CodeValidator.validate_python_imports(source_code, test_code)
231
+ else:
232
+ # No validation for other languages yet
233
+ return True, []
234
+
235
+ @staticmethod
236
+ def auto_fix_typescript_module_system(source_code: str) -> str:
237
+ """
238
+ Remove ES module-only features that break Jest/ts-jest.
239
+
240
+ Args:
241
+ source_code: TypeScript source code
242
+
243
+ Returns:
244
+ Fixed source code
245
+ """
246
+ fixed_code = source_code
247
+
248
+ # Remove import.meta usage and related code
249
+ if 'import.meta' in fixed_code:
250
+ # Remove the entire CLI execution block that uses import.meta
251
+ # Pattern: from import statement to the end of the if block
252
+ pattern = r'\n// Modern ES module.*?\n.*?import.*?from [\'"]url[\'"];.*?\n.*?import.*?from [\'"]path[\'"];.*?\n\nconst __filename.*?import\.meta\.url\);.*?\n.*?if \(process\.argv\[1\].*?\{.*?\n.*?\n.*?\n\}'
253
+ fixed_code = re.sub(pattern, '', fixed_code, flags=re.DOTALL)
254
+
255
+ # Fallback: remove just the import.meta line
256
+ if 'import.meta' in fixed_code:
257
+ fixed_code = re.sub(r'.*import\.meta.*\n', '', fixed_code)
258
+
259
+ logger.info("Auto-fixed: Removed import.meta usage")
260
+
261
+ # Remove CLI execution patterns
262
+ if 'process.argv[1]' in fixed_code:
263
+ # Remove if (process.argv[1] === __filename) blocks
264
+ pattern = r'\nif \(process\.argv\[1\].*?\{[^}]*\}'
265
+ fixed_code = re.sub(pattern, '', fixed_code, flags=re.DOTALL)
266
+ logger.info("Auto-fixed: Removed CLI execution block")
267
+
268
+ return fixed_code
269
+
270
+ @staticmethod
271
+ def auto_fix_typescript_exports(source_code: str, missing_exports: List[str]) -> str:
272
+ """
273
+ Automatically add export keywords to TypeScript declarations.
274
+
275
+ Args:
276
+ source_code: TypeScript source code
277
+ missing_exports: List of names that need to be exported
278
+
279
+ Returns:
280
+ Fixed source code
281
+ """
282
+ fixed_code = source_code
283
+
284
+ for name in missing_exports:
285
+ # Try to add export keyword before declaration
286
+ patterns = [
287
+ (rf'(\n)(enum\s+{name}\b)', r'\1export \2'),
288
+ (rf'(\n)(interface\s+{name}\b)', r'\1export \2'),
289
+ (rf'(\n)(type\s+{name}\b)', r'\1export \2'),
290
+ (rf'(\n)(class\s+{name}\b)', r'\1export \2'),
291
+ (rf'(\n)(function\s+{name}\b)', r'\1export \2'),
292
+ (rf'(\n)(const\s+{name}\s*=)', r'\1export \2'),
293
+ ]
294
+
295
+ for pattern, replacement in patterns:
296
+ new_code = re.sub(pattern, replacement, fixed_code)
297
+ if new_code != fixed_code:
298
+ logger.info(f"Auto-fixed: Added 'export' to '{name}'")
299
+ fixed_code = new_code
300
+ break
301
+
302
+ return fixed_code
303
+
304
+
305
+ def validate_and_fix_code(source_code: str, test_code: str, language: str) -> Tuple[str, bool, List[str]]:
306
+ """
307
+ Validate code and attempt to auto-fix common issues.
308
+
309
+ Args:
310
+ source_code: Source code
311
+ test_code: Test code
312
+ language: Programming language
313
+
314
+ Returns:
315
+ (fixed_source_code, is_valid, list_of_remaining_issues)
316
+ """
317
+ validator = CodeValidator()
318
+ is_valid, issues = validator.validate_code(source_code, test_code, language)
319
+
320
+ if not is_valid and language.lower() == 'typescript':
321
+ fixed_code = source_code
322
+
323
+ # Auto-fix module system issues (import.meta, etc.)
324
+ module_issues = [issue for issue in issues if 'import.meta' in issue or 'top-level await' in issue or 'CLI execution' in issue]
325
+ if module_issues:
326
+ logger.info(f"Attempting to auto-fix {len(module_issues)} module system issues")
327
+ fixed_code = validator.auto_fix_typescript_module_system(fixed_code)
328
+
329
+ # Auto-fix export issues
330
+ missing_names = []
331
+ for issue in issues:
332
+ # Extract name from issue message
333
+ match = re.search(r"'(\w+)'", issue)
334
+ if match and "not exported" in issue:
335
+ missing_names.append(match.group(1))
336
+
337
+ if missing_names:
338
+ logger.info(f"Attempting to auto-fix {len(missing_names)} export issues")
339
+ fixed_code = validator.auto_fix_typescript_exports(fixed_code, missing_names)
340
+
341
+ # Re-validate if we made any fixes
342
+ if fixed_code != source_code:
343
+ is_valid, issues = validator.validate_code(fixed_code, test_code, language)
344
+ return fixed_code, is_valid, issues
345
+
346
+ return source_code, is_valid, issues
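A short sketch of how the validator is meant to be used on a generated source/test pair; the TypeScript snippets are illustrative:

```python
from src.agents.code_validator import validate_and_fix_code

source = """
enum Status { Active, Inactive }
export function isActive(s: Status) { return s === Status.Active; }
"""
tests = "import { Status, isActive } from './legacy_service';"

fixed_source, is_valid, issues = validate_and_fix_code(source, tests, "typescript")
if is_valid:
    print(fixed_source)                # 'export' was added in front of the Status enum
else:
    print("Remaining issues:", issues)
```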
src/agents/pattern_integration.py ADDED
@@ -0,0 +1,296 @@
1
+ """
2
+ Integration layer for the new IntelligentPatternMatcher with existing workflow.
3
+ Provides backward compatibility while enabling advanced pattern detection.
4
+ """
5
+
6
+ import logging
7
+ from typing import Dict, List, Optional
8
+ from pathlib import Path
9
+
10
+ from .pattern_matcher import (
11
+ IntelligentPatternMatcher,
12
+ FileAnalysis,
13
+ PatternSeverity
14
+ )
15
+ from .classifier import CodeClassifier
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ class PatternMatcherIntegration:
21
+ """
22
+ Integrates IntelligentPatternMatcher with existing workflow.
23
+ Provides compatibility layer for gradual migration.
24
+ """
25
+
26
+ def __init__(self, use_intelligent_matcher: bool = True, cache_dir: Optional[str] = None):
27
+ """
28
+ Initialize integration layer.
29
+
30
+ Args:
31
+ use_intelligent_matcher: If True, use new AI-powered matcher
32
+ cache_dir: Optional cache directory for pattern analysis
33
+ """
34
+ self.use_intelligent_matcher = use_intelligent_matcher
35
+
36
+ if use_intelligent_matcher:
37
+ self.pattern_matcher = IntelligentPatternMatcher(cache_dir=cache_dir)
38
+ logger.info("Using IntelligentPatternMatcher")
39
+ else:
40
+ self.classifier = CodeClassifier()
41
+ logger.info("Using legacy CodeClassifier")
42
+
43
+ def classify_files(self, files: List[str], file_contents: Optional[Dict[str, str]] = None) -> Dict[str, str]:
44
+ """
45
+ Classify files using either intelligent matcher or legacy classifier.
46
+
47
+ Args:
48
+ files: List of file paths
49
+ file_contents: Optional dict of file contents (required for intelligent matcher)
50
+
51
+ Returns:
52
+ Dictionary mapping filenames to categories
53
+ Categories: 'modernize_high', 'modernize_low', 'skip'
54
+ """
55
+ if self.use_intelligent_matcher:
56
+ return self._classify_with_intelligent_matcher(files, file_contents)
57
+ else:
58
+ return self.classifier.classify_files(files)
59
+
60
+ def _classify_with_intelligent_matcher(
61
+ self,
62
+ files: List[str],
63
+ file_contents: Optional[Dict[str, str]]
64
+ ) -> Dict[str, str]:
65
+ """
66
+ Classify files using intelligent pattern matcher.
67
+
68
+ Args:
69
+ files: List of file paths
70
+ file_contents: Dictionary of file contents
71
+
72
+ Returns:
73
+ Dictionary mapping filenames to categories
74
+ """
75
+ if not file_contents:
76
+ logger.warning("No file contents provided, falling back to legacy classifier")
77
+ return self.classifier.classify_files(files)
78
+
79
+ classifications = {}
80
+
81
+ # Analyze files
82
+ analyses = self.pattern_matcher.analyze_batch(file_contents)
83
+
84
+ # Convert analyses to legacy classification format
85
+ for file_path, analysis in analyses.items():
86
+ category = self._analysis_to_category(analysis)
87
+ classifications[file_path] = category
88
+
89
+ return classifications
90
+
91
+ def _analysis_to_category(self, analysis: FileAnalysis) -> str:
92
+ """
93
+ Convert FileAnalysis to legacy category format.
94
+
95
+ Args:
96
+ analysis: FileAnalysis object
97
+
98
+ Returns:
99
+ Category string: 'modernize_high', 'modernize_low', or 'skip'
100
+ """
101
+ if not analysis.requires_modernization:
102
+ return 'skip'
103
+
104
+ # Check for critical or high severity patterns
105
+ has_critical = any(
106
+ p.severity == PatternSeverity.CRITICAL
107
+ for p in analysis.patterns
108
+ )
109
+ has_high = any(
110
+ p.severity == PatternSeverity.HIGH
111
+ for p in analysis.patterns
112
+ )
113
+
114
+ # Check modernization score
115
+ if has_critical or analysis.modernization_score < 50:
116
+ return 'modernize_high'
117
+ elif has_high or analysis.modernization_score < 75:
118
+ return 'modernize_high'
119
+ elif analysis.requires_modernization:
120
+ return 'modernize_low'
121
+ else:
122
+ return 'skip'
123
+
124
+ def get_detailed_analysis(self, file_path: str, code: str) -> FileAnalysis:
125
+ """
126
+ Get detailed pattern analysis for a single file.
127
+
128
+ Args:
129
+ file_path: Path to the file
130
+ code: File contents
131
+
132
+ Returns:
133
+ FileAnalysis object with detailed pattern information
134
+ """
135
+ if not self.use_intelligent_matcher:
136
+ raise ValueError("Detailed analysis requires intelligent matcher")
137
+
138
+ return self.pattern_matcher.analyze_file(file_path, code)
139
+
140
+ def get_transformation_plan(self, analysis: FileAnalysis) -> Dict:
141
+ """
142
+ Convert FileAnalysis to transformation plan format.
143
+
144
+ Args:
145
+ analysis: FileAnalysis object
146
+
147
+ Returns:
148
+ Transformation plan dictionary compatible with CodeTransformer
149
+ """
150
+ # Group patterns by type
151
+ pattern_groups = {}
152
+ for pattern in analysis.patterns:
153
+ if pattern.pattern_type not in pattern_groups:
154
+ pattern_groups[pattern.pattern_type] = []
155
+ pattern_groups[pattern.pattern_type].append(pattern)
156
+
157
+ # Build transformation steps
158
+ steps = []
159
+ total_effort = 0
160
+
161
+ for pattern_type, patterns in pattern_groups.items():
162
+ # Get highest severity pattern for this type
163
+ highest_severity = max(patterns, key=lambda p: self._severity_to_int(p.severity))
164
+
165
+ steps.append({
166
+ 'pattern': pattern_type,
167
+ 'severity': highest_severity.severity.value,
168
+ 'description': highest_severity.description,
169
+ 'recommendation': highest_severity.recommendation,
170
+ 'line_numbers': highest_severity.line_numbers,
171
+ 'confidence': highest_severity.confidence
172
+ })
173
+
174
+ total_effort += highest_severity.estimated_effort_hours
175
+
176
+ return {
177
+ 'file_path': analysis.file_path,
178
+ 'language': analysis.language,
179
+ 'framework': analysis.framework,
180
+ 'pattern': f"{analysis.language} modernization",
181
+ 'steps': steps,
182
+ 'estimated_effort_hours': total_effort,
183
+ 'priority': analysis.overall_priority.value,
184
+ 'modernization_score': analysis.modernization_score
185
+ }
186
+
187
+ def _severity_to_int(self, severity: PatternSeverity) -> int:
188
+ """Convert severity to integer for comparison."""
189
+ severity_map = {
190
+ PatternSeverity.CRITICAL: 5,
191
+ PatternSeverity.HIGH: 4,
192
+ PatternSeverity.MEDIUM: 3,
193
+ PatternSeverity.LOW: 2,
194
+ PatternSeverity.INFO: 1
195
+ }
196
+ return severity_map.get(severity, 0)
197
+
198
+ def generate_statistics(self, analyses: Dict[str, FileAnalysis]) -> Dict:
199
+ """
200
+ Generate statistics from pattern analyses.
201
+
202
+ Args:
203
+ analyses: Dictionary of file analyses
204
+
205
+ Returns:
206
+ Statistics dictionary
207
+ """
208
+ total_files = len(analyses)
209
+
210
+ # Count by category
211
+ modernize_high = sum(
212
+ 1 for a in analyses.values()
213
+ if self._analysis_to_category(a) == 'modernize_high'
214
+ )
215
+ modernize_low = sum(
216
+ 1 for a in analyses.values()
217
+ if self._analysis_to_category(a) == 'modernize_low'
218
+ )
219
+ skip = total_files - modernize_high - modernize_low
220
+
221
+ # Count patterns by severity
222
+ severity_counts = {s.value: 0 for s in PatternSeverity}
223
+ for analysis in analyses.values():
224
+ for pattern in analysis.patterns:
225
+ severity_counts[pattern.severity.value] += 1
226
+
227
+ # Calculate average scores
228
+ avg_modernization_score = (
229
+ sum(a.modernization_score for a in analyses.values()) / max(total_files, 1)
230
+ )
231
+
232
+ # Estimate total effort
233
+ total_effort = sum(
234
+ sum(p.estimated_effort_hours for p in a.patterns)
235
+ for a in analyses.values()
236
+ )
237
+
238
+ return {
239
+ 'total_files': total_files,
240
+ 'modernize_high': modernize_high,
241
+ 'modernize_low': modernize_low,
242
+ 'skip': skip,
243
+ 'severity_counts': severity_counts,
244
+ 'average_modernization_score': round(avg_modernization_score, 2),
245
+ 'total_estimated_effort_hours': round(total_effort, 2),
246
+ 'patterns_detected': sum(len(a.patterns) for a in analyses.values())
247
+ }
248
+
249
+
250
+ def migrate_to_intelligent_matcher(
251
+ orchestrator,
252
+ repo_path: str,
253
+ file_contents: Dict[str, str]
254
+ ) -> Dict:
255
+ """
256
+ Helper function to migrate existing orchestrator to use intelligent matcher.
257
+
258
+ Args:
259
+ orchestrator: ModernizationOrchestrator instance
260
+ repo_path: Path to repository
261
+ file_contents: Dictionary of file contents
262
+
263
+ Returns:
264
+ Enhanced results with detailed pattern analysis
265
+ """
266
+ logger.info("Migrating to IntelligentPatternMatcher")
267
+
268
+ # Create integration layer
269
+ integration = PatternMatcherIntegration(
270
+ use_intelligent_matcher=True,
271
+ cache_dir=Path(repo_path) / ".pattern_cache"
272
+ )
273
+
274
+ # Analyze all files
275
+ analyses = integration.pattern_matcher.analyze_batch(file_contents)
276
+
277
+ # Generate prioritized list
278
+ prioritized = integration.pattern_matcher.prioritize_files(analyses)
279
+
280
+ # Convert to transformation plans
281
+ transformation_plans = {}
282
+ for file_path, analysis in prioritized:
283
+ if analysis.requires_modernization:
284
+ plan = integration.get_transformation_plan(analysis)
285
+ transformation_plans[file_path] = plan
286
+
287
+ # Generate report
288
+ report = integration.pattern_matcher.generate_report(analyses)
289
+
290
+ return {
291
+ 'analyses': analyses,
292
+ 'prioritized_files': prioritized,
293
+ 'transformation_plans': transformation_plans,
294
+ 'statistics': integration.generate_statistics(analyses),
295
+ 'report': report
296
+ }
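A brief sketch of the integration layer in use; the file contents are illustrative, and the intelligent matcher needs both the contents and the configured AI provider credentials:

```python
from src.agents.pattern_integration import PatternMatcherIntegration

contents = {
    "utils/db.py": "import MySQLdb\ncursor.execute('SELECT ... %s' % name)",
    "models/user.py": "def get_user(user_id):\n    return db.get(user_id)",
}

integration = PatternMatcherIntegration(use_intelligent_matcher=True)
labels = integration.classify_files(list(contents.keys()), file_contents=contents)

analysis = integration.get_detailed_analysis("utils/db.py", contents["utils/db.py"])
plan = integration.get_transformation_plan(analysis)
print(labels, plan["priority"], plan["estimated_effort_hours"])
```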
src/agents/pattern_matcher.py ADDED
@@ -0,0 +1,838 @@
1
+ """
2
+ Production-grade pattern matching system with AI-powered file type detection.
3
+ Replaces the simple primary/secondary classification with intelligent pattern detection.
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ from typing import Dict, List, Optional, Tuple
9
+ from pathlib import Path
10
+ import json
11
+ from dataclasses import dataclass
12
+ from enum import Enum
13
+
14
+ from src.config import AIManager, GeminiSchemas
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class PatternSeverity(Enum):
20
+ """Severity levels for detected patterns."""
21
+ CRITICAL = "critical" # Security issues, breaking changes
22
+ HIGH = "high" # Deprecated APIs, performance issues
23
+ MEDIUM = "medium" # Code quality, maintainability
24
+ LOW = "low" # Style, minor improvements
25
+ INFO = "info" # Informational only
26
+
27
+
28
+ @dataclass
29
+ class DetectedPattern:
30
+ """Represents a detected legacy pattern."""
31
+ pattern_type: str
32
+ severity: PatternSeverity
33
+ file_path: str
34
+ language: str
35
+ description: str
36
+ line_numbers: List[int]
37
+ confidence: float # 0.0 to 1.0
38
+ recommendation: str
39
+ estimated_effort_hours: float
40
+
41
+
42
+ @dataclass
43
+ class FileAnalysis:
44
+ """Complete analysis of a single file."""
45
+ file_path: str
46
+ language: str
47
+ framework: Optional[str]
48
+ patterns: List[DetectedPattern]
49
+ overall_priority: PatternSeverity
50
+ modernization_score: float # 0-100, higher = more modern
51
+ requires_modernization: bool
52
+
53
+
54
+ class IntelligentPatternMatcher:
55
+ """
56
+ Production-grade pattern matcher using AI for intelligent detection.
57
+
58
+ Features:
59
+ - Language-agnostic pattern detection
60
+ - Context-aware analysis
61
+ - Confidence scoring
62
+ - Batch processing optimization
63
+ - Caching for performance
64
+ """
65
+
66
+ # Language detection patterns
67
+ LANGUAGE_PATTERNS = {
68
+ # Python
69
+ '.py': 'Python',
70
+ '.pyw': 'Python',
71
+ '.pyx': 'Python (Cython)',
72
+ # Java
73
+ '.java': 'Java',
74
+ # JavaScript/TypeScript
75
+ '.js': 'JavaScript',
76
+ '.jsx': 'JavaScript (React)',
77
+ '.mjs': 'JavaScript (ES Module)',
78
+ '.cjs': 'JavaScript (CommonJS)',
79
+ '.ts': 'TypeScript',
80
+ '.tsx': 'TypeScript (React)',
81
+ # PHP
82
+ '.php': 'PHP',
83
+ '.php3': 'PHP',
84
+ '.php4': 'PHP',
85
+ '.php5': 'PHP',
86
+ '.phtml': 'PHP',
87
+ # Ruby
88
+ '.rb': 'Ruby',
89
+ '.rbw': 'Ruby',
90
+ # Go
91
+ '.go': 'Go',
92
+ # C/C++
93
+ '.c': 'C',
94
+ '.h': 'C/C++ Header',
95
+ '.cpp': 'C++',
96
+ '.cc': 'C++',
97
+ '.cxx': 'C++',
98
+ '.c++': 'C++',
99
+ '.hpp': 'C++ Header',
100
+ '.hh': 'C++ Header',
101
+ '.hxx': 'C++ Header',
102
+ '.h++': 'C++ Header',
103
+ # C#
104
+ '.cs': 'C#',
105
+ # Rust
106
+ '.rs': 'Rust',
107
+ # Kotlin
108
+ '.kt': 'Kotlin',
109
+ '.kts': 'Kotlin Script',
110
+ # Swift
111
+ '.swift': 'Swift',
112
+ # Scala
113
+ '.scala': 'Scala',
114
+ '.sc': 'Scala Script',
115
+ # R
116
+ '.r': 'R',
117
+ '.R': 'R',
118
+ # Perl
119
+ '.pl': 'Perl',
120
+ '.pm': 'Perl Module',
121
+ '.t': 'Perl Test',
122
+ '.pod': 'Perl Documentation',
123
+ # Shell
124
+ '.sh': 'Shell',
125
+ '.bash': 'Bash',
126
+ '.zsh': 'Zsh',
127
+ '.fish': 'Fish Shell'
128
+ }
129
+
130
+ # Common legacy patterns by language
131
+ LEGACY_PATTERNS = {
132
+ 'Python': [
133
+ 'Python 2 syntax (print statements, old-style classes)',
134
+ 'Deprecated libraries (MySQLdb, urllib2, optparse)',
135
+ 'Missing type hints',
136
+ 'Hardcoded credentials',
137
+ 'SQL injection vulnerabilities',
138
+ 'Insecure cryptography (MD5, SHA1 for passwords)',
139
+ 'Global variables and mutable defaults',
140
+ 'Missing error handling',
141
+ 'Synchronous I/O in async contexts'
142
+ ],
143
+ 'Java': [
144
+ 'Pre-Java 8 code (no lambdas, streams)',
145
+ 'Deprecated APIs (Vector, Hashtable, Date)',
146
+ 'Missing generics',
147
+ 'Raw JDBC without ORM',
148
+ 'Synchronization issues',
149
+ 'Resource leaks (missing try-with-resources)',
150
+ 'Hardcoded configuration',
151
+ 'Missing null checks'
152
+ ],
153
+ 'JavaScript': [
154
+ 'var instead of let/const',
155
+ 'Callback hell (no Promises/async-await)',
156
+ 'jQuery for DOM manipulation',
157
+ 'eval() usage',
158
+ 'Missing strict mode',
159
+ 'Prototype-based inheritance',
160
+ 'Global namespace pollution',
161
+ 'XSS vulnerabilities'
162
+ ],
163
+ 'TypeScript': [
164
+ 'any type overuse',
165
+ 'Missing strict mode',
166
+ 'Old module syntax',
167
+ 'Missing null checks',
168
+ 'Implicit any',
169
+ 'Type assertions instead of guards'
170
+ ],
171
+ 'PHP': [
172
+ 'mysql_* functions (deprecated)',
173
+ 'No prepared statements',
174
+ 'register_globals usage',
175
+ 'eval() and create_function()',
176
+ 'Missing input validation',
177
+ 'Outdated PHP version syntax',
178
+ 'No namespace usage',
179
+ 'Missing error handling'
180
+ ],
181
+ 'Ruby': [
182
+ 'Ruby 1.8/1.9 syntax',
183
+ 'Missing bundler',
184
+ 'Deprecated gem versions',
185
+ 'Missing RSpec/Minitest',
186
+ 'Global variables',
187
+ 'Missing error handling',
188
+ 'Synchronous I/O'
189
+ ],
190
+ 'Go': [
191
+ 'Missing error handling',
192
+ 'Deprecated packages',
193
+ 'No context usage',
194
+ 'Missing defer for cleanup',
195
+ 'Goroutine leaks',
196
+ 'Race conditions'
197
+ ],
198
+ 'C++': [
199
+ 'Raw pointers instead of smart pointers',
200
+ 'Manual memory management',
201
+ 'Missing RAII',
202
+ 'C-style casts',
203
+ 'Missing const correctness',
204
+ 'No move semantics',
205
+ 'Deprecated C++98/03 features'
206
+ ],
207
+ 'C#': [
208
+ 'Missing async/await patterns',
209
+ 'Old collection types',
210
+ 'Missing LINQ usage',
211
+ 'Deprecated .NET Framework APIs',
212
+ 'Missing nullable reference types',
213
+ 'Old string concatenation',
214
+ 'Missing using statements'
215
+ ],
216
+ 'Rust': [
217
+ 'Deprecated Rust 2015/2018 syntax',
218
+ 'Missing error handling with Result',
219
+ 'Unsafe code blocks',
220
+ 'Missing lifetime annotations',
221
+ 'Deprecated crate versions',
222
+ 'Missing async/await'
223
+ ],
224
+ 'Kotlin': [
225
+ 'Java-style code in Kotlin',
226
+ 'Missing null safety',
227
+ 'Not using coroutines',
228
+ 'Missing data classes',
229
+ 'Old collection APIs',
230
+ 'Missing extension functions'
231
+ ],
232
+ 'Swift': [
233
+ 'Objective-C style code',
234
+ 'Missing optionals',
235
+ 'Old closure syntax',
236
+ 'Missing guard statements',
237
+ 'Deprecated Swift 4 features',
238
+ 'Missing Codable protocol'
239
+ ],
240
+ 'Scala': [
241
+ 'Scala 2.x syntax',
242
+ 'Missing for-comprehensions',
243
+ 'Old collection APIs',
244
+ 'Missing implicit conversions',
245
+ 'Deprecated Future usage',
246
+ 'Missing case classes'
247
+ ],
248
+ 'R': [
249
+ 'Old R syntax',
250
+ 'Missing tidyverse usage',
251
+ 'Deprecated package versions',
252
+ 'Missing pipe operators',
253
+ 'Old data.frame usage',
254
+ 'Missing ggplot2'
255
+ ],
256
+ 'Perl': [
257
+ 'Perl 4 syntax',
258
+ 'Missing strict and warnings',
259
+ 'Old module system',
260
+ 'Deprecated CPAN modules',
261
+ 'Missing Moose/Moo',
262
+ 'Old regex syntax'
263
+ ],
264
+ 'Shell': [
265
+ 'Missing error handling (set -e)',
266
+ 'Unquoted variables',
267
+ 'Missing shellcheck compliance',
268
+ 'Deprecated commands',
269
+ 'Missing function usage',
270
+ 'Security vulnerabilities'
271
+ ]
272
+ }
273
+
274
+ def __init__(self, cache_dir: Optional[str] = None):
275
+ """
276
+ Initialize pattern matcher.
277
+
278
+ Args:
279
+ cache_dir: Optional directory for caching analysis results
280
+ """
281
+ # Use centralized AI manager
282
+ self.ai_manager = AIManager()
283
+ self.cache_dir = Path(cache_dir) if cache_dir else None
284
+
285
+ if self.cache_dir:
286
+ self.cache_dir.mkdir(exist_ok=True, parents=True)
287
+
288
+ logger.info(
289
+ f"IntelligentPatternMatcher initialized with provider: {self.ai_manager.provider_name}, "
290
+ f"model: {self.ai_manager.model_name}"
291
+ )
292
+
293
+ def detect_language(self, file_path: str, code_sample: str) -> Tuple[str, Optional[str]]:
294
+ """
295
+ Detect programming language and framework using AI.
296
+
297
+ Args:
298
+ file_path: Path to the file
299
+ code_sample: Sample of code (first 500 chars)
300
+
301
+ Returns:
302
+ Tuple of (language, framework)
303
+ """
304
+ # First try extension-based detection
305
+ ext = Path(file_path).suffix.lower()
306
+ base_language = self.LANGUAGE_PATTERNS.get(ext, 'Unknown')
307
+
308
+ # Use AI for framework detection
309
+ prompt = f"""Analyze this code and identify:
310
+ 1. Programming language (confirm or correct: {base_language})
311
+ 2. Framework/library being used (if any)
312
+
313
+ FILE: {file_path}
314
+ CODE SAMPLE:
315
+ ```
316
+ {code_sample[:500]}
317
+ ```
318
+
319
+ Respond in JSON format:
320
+ {{
321
+ "language": "detected language",
322
+ "framework": "framework name or null",
323
+ "confidence": 0.0-1.0
324
+ }}
325
+ """
326
+
327
+ try:
328
+ # Use JSON schema for guaranteed structure
329
+ schema = GeminiSchemas.language_detection()
330
+
331
+ response_text = self.ai_manager.generate_content(
332
+ prompt=prompt,
333
+ temperature=AIManager.TEMPERATURE_PRECISE,
334
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_SMALL,
335
+ response_format="json",
336
+ response_schema=schema if self.ai_manager.provider_type == "gemini" else None
337
+ )
338
+
339
+ result = json.loads(response_text)
340
+ language = result.get('language', base_language)
341
+ framework = result.get('framework')
342
+
343
+ logger.info(f"Language detection: {language}, Framework: {framework}, Confidence: {result.get('confidence', 0)}")
344
+ return language, framework
345
+
346
+ except Exception as e:
347
+ logger.warning(f"AI language detection failed: {e}, using extension-based")
348
+ return base_language, None
349
+
350
+ def analyze_file(self, file_path: str, code: str) -> FileAnalysis:
351
+ """
352
+ Perform comprehensive pattern analysis on a single file.
353
+
354
+ Args:
355
+ file_path: Path to the file
356
+ code: File contents
357
+
358
+ Returns:
359
+ FileAnalysis object with detected patterns
360
+ """
361
+ logger.info(f"Analyzing patterns in {file_path}")
362
+
363
+ # Check cache
364
+ if self.cache_dir:
365
+ cache_file = self.cache_dir / f"{hash(file_path + code)}.json"
366
+ if cache_file.exists():
367
+ try:
368
+ cached = json.loads(cache_file.read_text())
369
+ return self._deserialize_analysis(cached)
370
+ except Exception as e:
371
+ logger.warning(f"Cache read failed: {e}")
372
+
373
+ # Detect language and framework
374
+ language, framework = self.detect_language(file_path, code[:500])
375
+
376
+ # Get relevant patterns for this language
377
+ relevant_patterns = self.LEGACY_PATTERNS.get(language, [])
378
+
379
+ # Build analysis prompt - limit code size to prevent output token overflow
380
+ # For large files, we need to be more conservative to leave room for detailed analysis
381
+ code_limit = 4000 if len(code) > 6000 else 6000
382
+
383
+ prompt = f"""You are a senior code auditor. Analyze this code for legacy patterns and modernization opportunities.
384
+
385
+ FILE: {file_path}
386
+ LANGUAGE: {language}
387
+ FRAMEWORK: {framework or 'None detected'}
388
+
389
+ PATTERNS TO CHECK:
390
+ {json.dumps(relevant_patterns, indent=2)}
391
+
392
+ CODE:
393
+ ```{language.lower()}
394
+ {code[:code_limit]}
395
+ ```
396
+
397
+ IMPORTANT: Focus on the MOST CRITICAL patterns. Limit your response to the top 10 most important issues.
398
+
399
+ For each detected pattern, provide:
400
+ 1. Pattern type (from the list above or new if discovered)
401
+ 2. Severity (critical/high/medium/low/info)
402
+ 3. Line numbers where pattern appears (first occurrence only)
403
+ 4. Confidence score (0.0-1.0)
404
+ 5. Brief description (max 100 chars)
405
+ 6. Concise recommendation (max 100 chars)
406
+ 7. Estimated effort in hours
407
+
408
+ Also provide:
409
+ - Overall modernization score (0-100, where 100 is fully modern)
410
+ - Whether modernization is required (true/false)
411
+ - Overall priority (critical/high/medium/low/info)
412
+
413
+ Respond in JSON format:
414
+ {{
415
+ "patterns": [
416
+ {{
417
+ "pattern_type": "string",
418
+ "severity": "critical|high|medium|low|info",
419
+ "line_numbers": [1],
420
+ "confidence": 0.95,
421
+ "description": "brief description",
422
+ "recommendation": "concise fix",
423
+ "estimated_effort_hours": 2.5
424
+ }}
425
+ ],
426
+ "modernization_score": 65,
427
+ "requires_modernization": true,
428
+ "overall_priority": "high"
429
+ }}
430
+ """
431
+
432
+ try:
433
+ # Use JSON schema for guaranteed structure - no more parsing failures!
434
+ # Use LARGE token limit for detailed pattern analysis
435
+ schema = GeminiSchemas.pattern_analysis()
436
+
437
+ response_text = self.ai_manager.generate_content(
438
+ prompt=prompt,
439
+ temperature=AIManager.TEMPERATURE_PRECISE,
440
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_LARGE,
441
+ response_format="json",
442
+ response_schema=schema if self.ai_manager.provider_type == "gemini" else None
443
+ )
444
+
445
+ if not response_text:
446
+ logger.error(f"Empty response from AI for {file_path}")
447
+ raise ValueError(f"Empty response from AI API for {file_path}")
448
+
449
+ # With JSON schema, response is guaranteed to be valid JSON
450
+ result = json.loads(response_text)
451
+ logger.info(f"Pattern analysis successful for {file_path}: {len(result.get('patterns', []))} patterns found")
452
+
453
+ # Convert to DetectedPattern objects
454
+ patterns = []
455
+ for p in result.get('patterns', []):
456
+ patterns.append(DetectedPattern(
457
+ pattern_type=p['pattern_type'],
458
+ severity=PatternSeverity(p['severity']),
459
+ file_path=file_path,
460
+ language=language,
461
+ description=p['description'],
462
+ line_numbers=p.get('line_numbers', []),
463
+ confidence=p.get('confidence', 0.8),
464
+ recommendation=p['recommendation'],
465
+ estimated_effort_hours=p.get('estimated_effort_hours', 1.0)
466
+ ))
467
+
468
+ analysis = FileAnalysis(
469
+ file_path=file_path,
470
+ language=language,
471
+ framework=framework,
472
+ patterns=patterns,
473
+ overall_priority=PatternSeverity(result.get('overall_priority', 'medium')),
474
+ modernization_score=result.get('modernization_score', 50),
475
+ requires_modernization=result.get('requires_modernization', True)
476
+ )
477
+
478
+ # Cache the result
479
+ if self.cache_dir:
480
+ try:
481
+ cache_file = self.cache_dir / f"{hash(file_path + code)}.json"
482
+ cache_file.write_text(json.dumps(self._serialize_analysis(analysis), indent=2))
483
+ except Exception as e:
484
+ logger.warning(f"Cache write failed: {e}")
485
+
486
+ logger.info(f"Found {len(patterns)} patterns in {file_path}")
487
+ return analysis
488
+
489
+ except Exception as e:
490
+ logger.error(f"Pattern analysis failed for {file_path}: {e}")
491
+ # Return minimal analysis on error
492
+ return FileAnalysis(
493
+ file_path=file_path,
494
+ language=language,
495
+ framework=framework,
496
+ patterns=[],
497
+ overall_priority=PatternSeverity.INFO,
498
+ modernization_score=100,
499
+ requires_modernization=False
500
+ )
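One caveat with the cache key used in `analyze_file`: `hash(file_path + code)` relies on Python's built-in `hash()`, which is salted per interpreter process for strings, so the cache file names change between runs and the on-disk cache is rarely reused. A hedged sketch of a deterministic alternative, not part of this commit:

```python
# Sketch only: a run-stable cache key using hashlib instead of the salted
# built-in hash(). Intended as a drop-in replacement for the
# f"{hash(file_path + code)}.json" naming above.
import hashlib
from pathlib import Path

def stable_cache_file(cache_dir: Path, file_path: str, code: str) -> Path:
    digest = hashlib.sha256(f"{file_path}\n{code}".encode("utf-8")).hexdigest()
    return cache_dir / f"{digest[:32]}.json"
```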
501
+
502
+ def analyze_batch(self, files: Dict[str, str], batch_size: int = 3) -> Dict[str, FileAnalysis]:
503
+ """
504
+ Analyze multiple files efficiently by batching API calls.
505
+
506
+ Args:
507
+ files: Dictionary mapping file paths to contents
508
+ batch_size: Number of files to analyze per API call (default: 3)
509
+
510
+ Returns:
511
+ Dictionary mapping file paths to FileAnalysis objects
512
+ """
513
+ logger.info(f"Batch analyzing {len(files)} files with batch_size={batch_size}")
514
+
515
+ results = {}
516
+ file_items = list(files.items())
517
+
518
+ # Process in batches to reduce API calls
519
+ for i in range(0, len(file_items), batch_size):
520
+ batch = file_items[i:i + batch_size]
521
+
522
+ if len(batch) == 1:
523
+ # Single file - use individual analysis
524
+ file_path, code = batch[0]
525
+ try:
526
+ analysis = self.analyze_file(file_path, code)
527
+ results[file_path] = analysis
528
+ except Exception as e:
529
+ logger.error(f"Failed to analyze {file_path}: {e}")
530
+ else:
531
+ # Multiple files - use batch analysis
532
+ try:
533
+ batch_results = self._analyze_batch_api(batch)
534
+ results.update(batch_results)
535
+ except Exception as e:
536
+ logger.error(f"Batch analysis failed: {e}, falling back to individual")
537
+ # Fallback to individual analysis
538
+ for file_path, code in batch:
539
+ try:
540
+ analysis = self.analyze_file(file_path, code)
541
+ results[file_path] = analysis
542
+ except Exception as e2:
543
+ logger.error(f"Failed to analyze {file_path}: {e2}")
544
+
545
+ logger.info(f"Batch analysis complete: {len(results)} files analyzed")
546
+ return results
547
+
548
+ def _analyze_batch_api(self, batch: List[Tuple[str, str]]) -> Dict[str, FileAnalysis]:
549
+ """
550
+ Analyze multiple files in a single API call.
551
+
552
+ Args:
553
+ batch: List of (file_path, code) tuples
554
+
555
+ Returns:
556
+ Dictionary mapping file paths to FileAnalysis objects
557
+ """
558
+ logger.info(f"Analyzing {len(batch)} files in single API call")
559
+
560
+ # Build combined prompt for all files
561
+ # Reduce code sample size for batch processing to prevent token overflow
562
+ files_info = []
563
+ for file_path, code in batch:
564
+ ext = Path(file_path).suffix.lower()
565
+ language = self.LANGUAGE_PATTERNS.get(ext, 'Unknown')
566
+
567
+ # Use smaller samples for batch to leave room for multiple file analyses
568
+ code_sample_size = 2000 if len(batch) > 2 else 3000
569
+
570
+ files_info.append({
571
+ 'file_path': file_path,
572
+ 'language': language,
573
+ 'code_sample': code[:code_sample_size]
574
+ })
575
+
576
+ prompt = f"""Analyze these {len(batch)} code files for legacy patterns and modernization opportunities.
577
+
578
+ For EACH file, provide a complete analysis with patterns, scores, and priorities.
579
+ IMPORTANT: Limit to top 5 most critical patterns per file to keep response concise.
580
+
581
+ FILES TO ANALYZE:
582
+ {json.dumps(files_info, indent=2)}
583
+
584
+ For each file, detect:
585
+ - Deprecated libraries and APIs
586
+ - Security vulnerabilities (SQL injection, XSS, hardcoded credentials)
587
+ - Code quality issues (missing type hints, error handling)
588
+ - Performance problems
589
+
590
+ Keep descriptions and recommendations brief (max 80 chars each).
591
+
592
+ Respond in JSON format with this structure:
593
+ {{
594
+ "files": [
595
+ {{
596
+ "file_path": "file1.py",
597
+ "language": "Python",
598
+ "framework": "Flask or null",
599
+ "patterns": [
600
+ {{
601
+ "pattern_type": "SQL injection vulnerability",
602
+ "severity": "critical",
603
+ "line_numbers": [10, 11],
604
+ "confidence": 0.95,
605
+ "description": "Direct string concatenation in SQL query",
606
+ "recommendation": "Use parameterized queries",
607
+ "estimated_effort_hours": 2.0
608
+ }}
609
+ ],
610
+ "modernization_score": 35,
611
+ "requires_modernization": true,
612
+ "overall_priority": "critical"
613
+ }}
614
+ ]
615
+ }}
616
+ """
617
+
618
+ try:
619
+ # Use JSON schema for guaranteed structure
620
+ schema = GeminiSchemas.batch_pattern_analysis()
621
+
622
+ response_text = self.ai_manager.generate_content(
623
+ prompt=prompt,
624
+ temperature=AIManager.TEMPERATURE_PRECISE,
625
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_LARGE,
626
+ response_format="json",
627
+ response_schema=schema if self.ai_manager.provider_type == "gemini" else None
628
+ )
629
+
630
+ # With JSON schema, response is guaranteed to be valid JSON
631
+ result = json.loads(response_text)
632
+ logger.info(f"Batch analysis successful: received data for {len(result.get('files', []))} files")
633
+
634
+ # Schema guarantees 'files' key exists
635
+ files_data = result.get('files', [])
636
+
637
+ # Convert to FileAnalysis objects
638
+ analyses = {}
639
+ for file_data in files_data:
640
+ file_path = file_data['file_path']
641
+ language = file_data.get('language', 'Unknown')
642
+ framework = file_data.get('framework')
643
+
644
+ patterns = []
645
+ for p in file_data.get('patterns', []):
646
+ patterns.append(DetectedPattern(
647
+ pattern_type=p['pattern_type'],
648
+ severity=PatternSeverity(p['severity']),
649
+ file_path=file_path,
650
+ language=language,
651
+ description=p['description'],
652
+ line_numbers=p.get('line_numbers', []),
653
+ confidence=p.get('confidence', 0.8),
654
+ recommendation=p['recommendation'],
655
+ estimated_effort_hours=p.get('estimated_effort_hours', 1.0)
656
+ ))
657
+
658
+ analysis = FileAnalysis(
659
+ file_path=file_path,
660
+ language=language,
661
+ framework=framework,
662
+ patterns=patterns,
663
+ overall_priority=PatternSeverity(file_data.get('overall_priority', 'medium')),
664
+ modernization_score=file_data.get('modernization_score', 50),
665
+ requires_modernization=file_data.get('requires_modernization', True)
666
+ )
667
+
668
+ analyses[file_path] = analysis
669
+
670
+ logger.info(f"Batch API call successful: analyzed {len(analyses)} files")
671
+ return analyses
672
+
673
+ except Exception as e:
674
+ logger.error(f"Batch API call failed: {e}")
675
+ raise
676
+
677
+ def prioritize_files(self, analyses: Dict[str, FileAnalysis]) -> List[Tuple[str, FileAnalysis]]:
678
+ """
679
+ Prioritize files for modernization based on analysis.
680
+
681
+ Args:
682
+ analyses: Dictionary of file analyses
683
+
684
+ Returns:
685
+ Sorted list of (file_path, analysis) tuples, highest priority first
686
+ """
687
+ # Define priority weights
688
+ severity_weights = {
689
+ PatternSeverity.CRITICAL: 100,
690
+ PatternSeverity.HIGH: 75,
691
+ PatternSeverity.MEDIUM: 50,
692
+ PatternSeverity.LOW: 25,
693
+ PatternSeverity.INFO: 10
694
+ }
695
+
696
+ def calculate_priority_score(analysis: FileAnalysis) -> float:
697
+ """Calculate priority score for an analysis."""
698
+ # Base score from overall priority
699
+ base_score = severity_weights.get(analysis.overall_priority, 50)
700
+
701
+ # Add points for each pattern weighted by severity and confidence
702
+ pattern_score = sum(
703
+ severity_weights.get(p.severity, 25) * p.confidence
704
+ for p in analysis.patterns
705
+ )
706
+
707
+ # Factor in modernization score (lower = higher priority)
708
+ modernization_penalty = (100 - analysis.modernization_score) / 10
709
+
710
+ return base_score + pattern_score + modernization_penalty
711
+
712
+ # Sort by priority score
713
+ prioritized = sorted(
714
+ analyses.items(),
715
+ key=lambda x: calculate_priority_score(x[1]),
716
+ reverse=True
717
+ )
718
+
719
+ return prioritized
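To make the weighting in `calculate_priority_score` concrete, a hand-worked example with illustrative values:

```python
# Illustration only: a file with HIGH overall priority (75), one CRITICAL
# pattern at confidence 0.9 (100 * 0.9), one MEDIUM pattern at confidence 0.5
# (50 * 0.5), and a modernization_score of 40 ((100 - 40) / 10 penalty).
example_score = 75 + 100 * 0.9 + 50 * 0.5 + (100 - 40) / 10
assert example_score == 196.0  # sorts well ahead of an already-modern file
```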
720
+
721
+ def generate_report(self, analyses: Dict[str, FileAnalysis]) -> str:
722
+ """
723
+ Generate human-readable report from analyses.
724
+
725
+ Args:
726
+ analyses: Dictionary of file analyses
727
+
728
+ Returns:
729
+ Formatted report string
730
+ """
731
+ report = []
732
+ report.append("=" * 80)
733
+ report.append("INTELLIGENT PATTERN MATCHING REPORT")
734
+ report.append("=" * 80)
735
+ report.append("")
736
+
737
+ # Summary statistics
738
+ total_files = len(analyses)
739
+ files_needing_modernization = sum(1 for a in analyses.values() if a.requires_modernization)
740
+ total_patterns = sum(len(a.patterns) for a in analyses.values())
741
+ avg_modernization_score = sum(a.modernization_score for a in analyses.values()) / max(total_files, 1)
742
+
743
+ report.append("SUMMARY:")
744
+ report.append(f" Total Files Analyzed: {total_files}")
745
+ report.append(f" Files Requiring Modernization: {files_needing_modernization}")
746
+ report.append(f" Total Patterns Detected: {total_patterns}")
747
+ report.append(f" Average Modernization Score: {avg_modernization_score:.1f}/100")
748
+ report.append("")
749
+
750
+ # Language breakdown
751
+ language_counts = {}
752
+ for analysis in analyses.values():
753
+ language_counts[analysis.language] = language_counts.get(analysis.language, 0) + 1
754
+
755
+ report.append("LANGUAGES:")
756
+ for lang, count in sorted(language_counts.items(), key=lambda x: x[1], reverse=True):
757
+ report.append(f" {lang}: {count} files")
758
+ report.append("")
759
+
760
+ # Severity breakdown
761
+ severity_counts = {s: 0 for s in PatternSeverity}
762
+ for analysis in analyses.values():
763
+ for pattern in analysis.patterns:
764
+ severity_counts[pattern.severity] += 1
765
+
766
+ report.append("PATTERNS BY SEVERITY:")
767
+ for severity in [PatternSeverity.CRITICAL, PatternSeverity.HIGH,
768
+ PatternSeverity.MEDIUM, PatternSeverity.LOW, PatternSeverity.INFO]:
769
+ count = severity_counts[severity]
770
+ if count > 0:
771
+ report.append(f" {severity.value.upper()}: {count}")
772
+ report.append("")
773
+
774
+ # Top priority files
775
+ prioritized = self.prioritize_files(analyses)[:10]
776
+ report.append("TOP 10 PRIORITY FILES:")
777
+ for i, (file_path, analysis) in enumerate(prioritized, 1):
778
+ report.append(f" {i}. {file_path}")
779
+ report.append(f" Priority: {analysis.overall_priority.value}")
780
+ report.append(f" Modernization Score: {analysis.modernization_score}/100")
781
+ report.append(f" Patterns: {len(analysis.patterns)}")
782
+
783
+ report.append("")
784
+ report.append("=" * 80)
785
+
786
+ return "\n".join(report)
787
+
788
+ def _serialize_analysis(self, analysis: FileAnalysis) -> dict:
789
+ """Serialize FileAnalysis to dict for caching."""
790
+ return {
791
+ 'file_path': analysis.file_path,
792
+ 'language': analysis.language,
793
+ 'framework': analysis.framework,
794
+ 'patterns': [
795
+ {
796
+ 'pattern_type': p.pattern_type,
797
+ 'severity': p.severity.value,
798
+ 'file_path': p.file_path,
799
+ 'language': p.language,
800
+ 'description': p.description,
801
+ 'line_numbers': p.line_numbers,
802
+ 'confidence': p.confidence,
803
+ 'recommendation': p.recommendation,
804
+ 'estimated_effort_hours': p.estimated_effort_hours
805
+ }
806
+ for p in analysis.patterns
807
+ ],
808
+ 'overall_priority': analysis.overall_priority.value,
809
+ 'modernization_score': analysis.modernization_score,
810
+ 'requires_modernization': analysis.requires_modernization
811
+ }
812
+
813
+ def _deserialize_analysis(self, data: dict) -> FileAnalysis:
814
+ """Deserialize dict to FileAnalysis."""
815
+ patterns = [
816
+ DetectedPattern(
817
+ pattern_type=p['pattern_type'],
818
+ severity=PatternSeverity(p['severity']),
819
+ file_path=p['file_path'],
820
+ language=p['language'],
821
+ description=p['description'],
822
+ line_numbers=p['line_numbers'],
823
+ confidence=p['confidence'],
824
+ recommendation=p['recommendation'],
825
+ estimated_effort_hours=p['estimated_effort_hours']
826
+ )
827
+ for p in data['patterns']
828
+ ]
829
+
830
+ return FileAnalysis(
831
+ file_path=data['file_path'],
832
+ language=data['language'],
833
+ framework=data['framework'],
834
+ patterns=patterns,
835
+ overall_priority=PatternSeverity(data['overall_priority']),
836
+ modernization_score=data['modernization_score'],
837
+ requires_modernization=data['requires_modernization']
838
+ )
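A short, hedged usage sketch for the matcher defined in this file. The file paths and contents are placeholders, and the AI provider credentials from `.env.example` must be configured because `AIManager()` is constructed inside the class.

```python
# Sketch: batch analysis, prioritization, and reporting with the matcher above.
from pathlib import Path

matcher = IntelligentPatternMatcher(cache_dir=".pattern_cache")

files = {
    "legacy/db.py": Path("legacy/db.py").read_text(),        # placeholder paths
    "legacy/auth.php": Path("legacy/auth.php").read_text(),
}

analyses = matcher.analyze_batch(files, batch_size=3)

for path, analysis in matcher.prioritize_files(analyses):
    if analysis.requires_modernization:
        print(f"{path}: {analysis.overall_priority.value} "
              f"({len(analysis.patterns)} patterns)")

print(matcher.generate_report(analyses))
```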
src/agents/test_generator.py ADDED
@@ -0,0 +1,706 @@
1
+ """
2
+ Test Generator - Generates unit tests for code transformations using AI.
3
+ Supports multiple AI providers (Gemini, Nebius, OpenAI).
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ from typing import Dict, Optional
9
+ from pathlib import Path
10
+
11
+ from src.config import AIManager
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class CodeTestGenerator:
17
+ """
18
+ Generates comprehensive unit tests for code transformations.
19
+ Uses the configured AI provider (Gemini, Nebius, or OpenAI) to create behavioral equivalence tests.
20
+
21
+ Note: Renamed from TestGenerator to avoid pytest collection conflicts.
22
+ """
23
+
24
+ def __init__(self):
25
+ """Initialize Code Test Generator."""
26
+ # Use centralized AI manager
27
+ self.ai_manager = AIManager()
28
+
29
+ logger.info(
30
+ f"CodeTestGenerator initialized with provider: {self.ai_manager.provider_name}, "
31
+ f"model: {self.ai_manager.model_name}"
32
+ )
33
+
34
+ def generate_tests(self, original_code: str, modernized_code: str,
35
+ file_path: str, language: str = None) -> str:
36
+ """
37
+ Generate comprehensive unit tests for code transformation.
38
+
39
+ Args:
40
+ original_code: Original legacy code
41
+ modernized_code: Modernized code
42
+ file_path: Path to the file
43
+ language: Programming language (auto-detected if not provided)
44
+
45
+ Returns:
46
+ Generated test code as string
47
+ """
48
+ logger.info(f"Generating tests for {file_path}")
49
+
50
+ # Auto-detect language from file extension if not provided
51
+ if language is None:
52
+ language = self._detect_language(file_path, modernized_code)
53
+
54
+ logger.info(f"Detected language: {language}")
55
+
56
+ # Language-specific test framework
57
+ framework_map = {
58
+ "python": "pytest",
59
+ "java": "JUnit 5",
60
+ "javascript": "Jest",
61
+ "typescript": "Jest",
62
+ "go": "testing package",
63
+ "ruby": "RSpec",
64
+ "csharp": "xUnit",
65
+ "cpp": "Google Test",
66
+ "kotlin": "JUnit 5",
67
+ "scala": "ScalaTest"
68
+ }
69
+
70
+ framework = framework_map.get(language.lower(), "pytest")
71
+
72
+ # Truncate code if too long to avoid token limits
73
+ # Increased from 3000 to 8000 to give AI more context
74
+ max_code_length = 8000 # chars per code block
75
+ original_truncated = original_code[:max_code_length] + ("\n\n# ... (truncated)" if len(original_code) > max_code_length else "")
76
+ modernized_truncated = modernized_code[:max_code_length] + ("\n\n# ... (truncated)" if len(modernized_code) > max_code_length else "")
77
+
78
+ # Extract module name for proper imports
79
+ module_name = Path(file_path).stem
80
+
81
+ # Language-specific setup instructions
82
+ setup_instructions = ""
83
+ import_instructions = ""
84
+
85
+ if language == "python":
86
+ setup_instructions = f"""1. **CRITICAL SANDBOX ENVIRONMENT**: Modal Sandbox Execution:
87
+ - Test file location: `/workspace/test_{module_name}.py`
88
+ - IMPORTANT: The test file contains BOTH source code AND tests combined in one file
89
+ - Implementation code is defined first, then test functions use it
90
+ - Start the test file with:
91
+ ```python
92
+ import sys
93
+ import os
94
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
95
+ ```"""
96
+ import_instructions = f'2. Import/Usage: Either "from {module_name} import ..." OR call functions directly (same file)'
97
+ elif language == "java":
98
+ setup_instructions = f"""1. **CRITICAL SANDBOX ENVIRONMENT**: Modal Sandbox Maven Execution:
99
+ - Source file: `/workspace/{module_name}.java` with package `com.modernizer`
100
+ - Test file: `/workspace/{module_name}Test.java` with package `com.modernizer`
101
+ - Both files are compiled together by Maven in the `/workspace/` directory
102
+ - Use proper JUnit 5 annotations:
103
+ ```java
104
+ package com.modernizer;
105
+ import org.junit.jupiter.api.Test;
106
+ import org.junit.jupiter.api.BeforeEach;
107
+ import static org.junit.jupiter.api.Assertions.*;
108
+
109
+ public class {module_name}Test {{
110
+ @BeforeEach
111
+ void setUp() {{
112
+ // Setup code
113
+ }}
114
+
115
+ @Test
116
+ void testMethodName() {{
117
+ // Test code with assertions
118
+ }}
119
+ }}
120
+ ```"""
121
+ import_instructions = f'2. Package: Use "package com.modernizer;" in both files - no imports needed (same package)'
122
+ elif language in ["javascript", "typescript"]:
123
+ ext = '.ts' if language == 'typescript' else '.js'
124
+ if language == 'typescript':
125
+ import_example = f'import {{ ... }} from "./{module_name}";'
126
+ import_note = "WITHOUT .ts extension (TypeScript resolves automatically)"
127
+ else:
128
+ import_example = f'import {{ ... }} from "./{module_name}.js";'
129
+ import_note = "WITH .js extension (ES modules require explicit extensions)"
130
+
131
+ setup_instructions = f"""1. **CRITICAL SANDBOX ENVIRONMENT**: Modal Sandbox Jest Execution:
132
+ - Source file: `/workspace/{module_name}{ext}`
133
+ - Test file: `/workspace/{module_name}.test{ext}`
134
+ - Framework: Jest configured for {'TypeScript (ts-jest preset)' if language == 'typescript' else 'JavaScript (ES modules)'}
135
+ - Use proper module import statements"""
136
+ import_instructions = f'2. Import: Use relative path {import_note}: `{import_example}`'
137
+ else:
138
+ setup_instructions = "1. Ensure proper imports/includes for the sandbox environment."
139
+ import_instructions = "2. Import the module/class to be tested from the same /workspace/ directory."
140
+
141
+ prompt = f"""Generate comprehensive unit tests for this code transformation.
142
+
143
+ FILE: {file_path}
144
+ MODULE NAME: {module_name}
145
+ LANGUAGE: {language}
146
+ TEST FRAMEWORK: {framework}
147
+
148
+ ORIGINAL CODE (truncated for context):
149
+ ```{language}
150
+ {original_truncated}
151
+ ```
152
+
153
+ MODERNIZED CODE (truncated for context):
154
+ ```{language}
155
+ {modernized_truncated}
156
+ ```
157
+
158
+ REQUIREMENTS:
159
+ {setup_instructions}
160
+
161
+ {import_instructions}
162
+ 3. Test behavioral equivalence (same inputs → same outputs)
163
+ 4. Test edge cases (empty inputs, None/null, invalid types, boundary values)
164
+ 5. Test error handling and exceptions
165
+ 6. Use {framework} framework
166
+ 7. Mock external dependencies (databases, APIs, file system)
167
+ 8. Include fixtures for common test data
168
+ 9. Test both success and failure scenarios
169
+ 10. Add descriptive test names and docstrings
170
+ 11. Ensure tests are independent and can run in any order
171
+ 12. Include setup and teardown if needed
172
+
173
+ SANDBOX FILE STRUCTURE:
174
+ - Python: test_{module_name}.py contains BOTH source code and tests combined
175
+ - Java: {module_name}.java and {module_name}Test.java in package com.modernizer, compiled together by Maven
176
+ - JavaScript: {module_name}.js and {module_name}.test.js (ES modules with "type": "module" in package.json)
177
+ - TypeScript: {module_name}.ts and {module_name}.test.ts (ts-jest preset handles compilation)
178
+ - All files are in /workspace/ directory in the Modal Sandbox
179
+
180
+ CRITICAL IMPORT INSTRUCTIONS:
181
+ - JavaScript: MUST use .js extension in imports: `import {{ ... }} from "./{module_name}.js";`
182
+ - TypeScript: MUST NOT use .ts extension in imports: `import {{ ... }} from "./{module_name}";`
183
+ - This is critical - wrong extensions will cause compilation/runtime errors!
184
+
185
+ CRITICAL OUTPUT INSTRUCTIONS:
186
+ - Return ONLY the complete test code in a single code block
187
+ - For Python: Source and tests are in SAME file, define functions first then tests
188
+ - For Java: Source and tests are SEPARATE files, same package, no imports needed
189
+ - For JS/TS: Tests are SEPARATE files, use relative imports with correct extensions (see above)
190
+ - DO NOT include any explanatory text, descriptions, or commentary before or after the code
191
+ - The response must be executable code that can run directly in a sandbox environment
192
+ - Start your response with the code block marker (```{language}) and end with the closing marker (```)
193
+ """
194
+ try:
195
+ response_text = self.ai_manager.generate_content(
196
+ prompt=prompt,
197
+ temperature=AIManager.TEMPERATURE_MEDIUM,
198
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_LARGE
199
+ )
200
+
201
+ # Check if response has text
202
+ if not response_text:
203
+ logger.warning(f"Empty response from AI for {file_path}")
204
+ return self._generate_fallback_test(file_path, language, framework)
205
+
206
+ test_code = self._extract_code(response_text)
207
+
208
+ # Validate that we got actual test code, not just fallback
209
+ if not test_code or len(test_code.strip()) < 100:
210
+ logger.warning(f"Generated test code too short for {file_path}, using fallback")
211
+ return self._generate_fallback_test(file_path, language, framework)
212
+
213
+ # Check if it contains actual test functions
214
+ if language == "python" and "def test_" not in test_code:
215
+ logger.warning(f"No test functions found in generated code for {file_path}")
216
+ return self._generate_fallback_test(file_path, language, framework)
217
+
218
+ logger.info(f"Test generation complete for {file_path} ({len(test_code)} chars)")
219
+ return test_code
220
+
221
+ except Exception as e:
222
+ logger.error(f"Error generating tests for {file_path}: {e}")
223
+ return self._generate_fallback_test(file_path, language, framework)
224
+
225
+ def generate_integration_tests(self, files: Dict[str, str],
226
+ language: str = "python") -> str:
227
+ """
228
+ Generate integration tests for multiple related files.
229
+
230
+ Args:
231
+ files: Dictionary mapping file paths to their contents
232
+ language: Programming language
233
+
234
+ Returns:
235
+ Generated integration test code
236
+ """
237
+ logger.info(f"Generating integration tests for {len(files)} files")
238
+
239
+ files_summary = "\n\n".join([
240
+ f"FILE: {path}\n```{language}\n{content[:500]}...\n```"
241
+ for path, content in list(files.items())[:5]
242
+ ])
243
+
244
+ prompt = f"""Generate integration tests for these related files.
245
+
246
+ {files_summary}
247
+
248
+ REQUIREMENTS:
249
+ 1. Test interactions between modules
250
+ 2. Test data flow across components
251
+ 3. Test end-to-end scenarios
252
+ 4. Mock external dependencies
253
+ 5. Include setup and teardown for test environment
254
+ 6. Test error propagation across modules
255
+ 7. Ensure tests are comprehensive but maintainable
256
+
257
+ CRITICAL: Return ONLY the complete test code in a single code block.
258
+ DO NOT include any explanatory text, descriptions, or commentary.
259
+ The response must be executable code that can run directly in a sandbox.
260
+ """
261
+
262
+ try:
263
+ response_text = self.ai_manager.generate_content(
264
+ prompt=prompt,
265
+ temperature=AIManager.TEMPERATURE_MEDIUM,
266
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_LARGE
267
+ )
268
+
269
+ if not response_text:
270
+ logger.warning("Empty response for integration tests")
271
+ return ""
272
+
273
+ test_code = self._extract_code(response_text)
274
+ logger.info(f"Integration test generation complete ({len(test_code)} chars)")
275
+ return test_code
276
+
277
+ except Exception as e:
278
+ logger.error(f"Error generating integration tests: {e}")
279
+ return ""
280
+
281
+ def generate_security_tests(self, file_path: str, code: str,
282
+ vulnerabilities: list) -> str:
283
+ """
284
+ Generate security-focused tests.
285
+
286
+ Args:
287
+ file_path: Path to the file
288
+ code: Code content
289
+ vulnerabilities: List of identified vulnerabilities
290
+
291
+ Returns:
292
+ Generated security test code
293
+ """
294
+ logger.info(f"Generating security tests for {file_path}")
295
+
296
+ vulns_text = "\n".join([
297
+ f"- {v.get('type', 'Unknown')}: {v.get('description', '')}"
298
+ for v in vulnerabilities
299
+ ])
300
+
301
+ # Detect language
302
+ language = self._detect_language(file_path, code)
303
+ framework_map = {
304
+ "python": "pytest",
305
+ "java": "JUnit 5",
306
+ "javascript": "Jest",
307
+ "typescript": "Jest",
308
+ "go": "testing package",
309
+ "ruby": "RSpec",
310
+ "csharp": "xUnit",
311
+ "cpp": "Google Test",
312
+ "kotlin": "JUnit 5",
313
+ "scala": "ScalaTest"
314
+ }
315
+ framework = framework_map.get(language.lower(), "pytest")
316
+
317
+ prompt = f"""Generate security-focused tests for this code.
318
+
319
+ FILE: {file_path}
320
+ LANGUAGE: {language}
321
+ TEST FRAMEWORK: {framework}
322
+
323
+ CODE:
324
+ ```{language}
325
+ {code}
326
+ ```
327
+
328
+ IDENTIFIED VULNERABILITIES:
329
+ {vulns_text}
330
+
331
+ REQUIREMENTS:
332
+ 1. Test for SQL injection prevention
333
+ 2. Test for XSS prevention
334
+ 3. Test for authentication/authorization
335
+ 4. Test for input validation
336
+ 5. Test for secure credential handling
337
+ 6. Test for proper error handling (no info leakage)
338
+ 7. Use {framework} framework
339
+ 8. Include security-specific assertions
340
+
341
+ CRITICAL: Return ONLY the complete test code in a single code block.
342
+ DO NOT include any explanatory text, descriptions, or commentary.
343
+ The response must be executable code that can run directly in a sandbox.
344
+ """
345
+
346
+ try:
347
+ response_text = self.ai_manager.generate_content(
348
+ prompt=prompt,
349
+ temperature=AIManager.TEMPERATURE_PRECISE,
350
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_LARGE
351
+ )
352
+
353
+ if not response_text:
354
+ logger.warning(f"Empty response for security tests: {file_path}")
355
+ return ""
356
+
357
+ test_code = self._extract_code(response_text)
358
+ logger.info(f"Security test generation complete for {file_path} ({len(test_code)} chars)")
359
+ return test_code
360
+
361
+ except Exception as e:
362
+ logger.error(f"Error generating security tests: {e}")
363
+ return ""
364
+
365
+ def generate_performance_tests(self, file_path: str, code: str) -> str:
366
+ """
367
+ Generate performance/benchmark tests.
368
+
369
+ Args:
370
+ file_path: Path to the file
371
+ code: Code content
372
+
373
+ Returns:
374
+ Generated performance test code
375
+ """
376
+ logger.info(f"Generating performance tests for {file_path}")
377
+
378
+ # Detect language
379
+ language = self._detect_language(file_path, code)
380
+ framework_map = {
381
+ "python": "pytest-benchmark",
382
+ "java": "JMH (Java Microbenchmark Harness)",
383
+ "javascript": "Jest (with performance hooks)",
384
+ "typescript": "Jest (with performance hooks)",
385
+ "go": "testing package benchmarks",
386
+ "ruby": "Benchmark module",
387
+ "csharp": "BenchmarkDotNet",
388
+ "cpp": "Google Benchmark",
389
+ }
390
+ framework = framework_map.get(language.lower(), "pytest-benchmark")
391
+
392
+ prompt = f"""Generate performance tests for this code.
393
+
394
+ FILE: {file_path}
395
+ LANGUAGE: {language}
396
+ TEST FRAMEWORK: {framework}
397
+
398
+ CODE:
399
+ ```{language}
400
+ {code}
401
+ ```
402
+
403
+ REQUIREMENTS:
404
+ 1. Use {framework} for performance testing
405
+ 2. Test execution time for critical functions
406
+ 3. Test memory usage
407
+ 4. Test scalability with different input sizes
408
+ 5. Include baseline performance metrics
409
+ 6. Test for performance regressions
410
+ 7. Add timeout tests for long-running operations
411
+
412
+ CRITICAL: Return ONLY the complete test code in a single code block.
413
+ DO NOT include any explanatory text, descriptions, or commentary.
414
+ The response must be executable code that can run directly in a sandbox.
415
+ """
416
+
417
+ try:
418
+ response_text = self.ai_manager.generate_content(
419
+ prompt=prompt,
420
+ temperature=AIManager.TEMPERATURE_PRECISE,
421
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_LARGE
422
+ )
423
+
424
+ if not response_text:
425
+ logger.warning(f"Empty response for performance tests: {file_path}")
426
+ return ""
427
+
428
+ test_code = self._extract_code(response_text)
429
+ logger.info(f"Performance test generation complete for {file_path} ({len(test_code)} chars)")
430
+ return test_code
431
+
432
+ except Exception as e:
433
+ logger.error(f"Error generating performance tests: {e}")
434
+ return ""
435
+
436
+ def _extract_code(self, text: str) -> str:
437
+ """
438
+ Extract code from markdown code blocks, removing any explanatory text.
439
+
440
+ Args:
441
+ text: Text that may contain markdown code blocks
442
+
443
+ Returns:
444
+ Extracted code only, without explanatory text
445
+ """
446
+ # Handle None or empty text
447
+ if not text:
448
+ return ""
449
+
450
+ # Try to extract from markdown code blocks
451
+ if "```" in text:
452
+ parts = text.split("```")
453
+
454
+ # Find all code blocks
455
+ code_blocks = []
456
+ for i in range(1, len(parts), 2): # Code blocks are at odd indices
457
+ if i < len(parts):
458
+ code_block = parts[i]
459
+ lines = code_block.split('\n')
460
+
461
+ # Remove language identifier if present
462
+ first_line = lines[0].strip().lower()
463
+ if first_line in ['python', 'java', 'javascript', 'typescript', 'pytest', 'py', 'js', 'ts', 'go', 'ruby', 'rb']:
464
+ code_block = '\n'.join(lines[1:])
465
+
466
+ extracted = code_block.strip()
467
+
468
+ # Only add substantial code blocks
469
+ if len(extracted) > 50:
470
+ code_blocks.append(extracted)
471
+
472
+ # Return the largest code block (usually the main test file)
473
+ if code_blocks:
474
+ return max(code_blocks, key=len)
475
+
476
+ # If no code blocks found, check if the text itself looks like code
477
+ # (starts with import, def, class, etc.)
478
+ text_stripped = text.strip()
479
+ code_indicators = ['import ', 'from ', 'def ', 'class ', 'async def ', '@pytest', '@test']
480
+
481
+ # If text starts with code indicators, it might be plain code without markdown
482
+ if any(text_stripped.startswith(indicator) for indicator in code_indicators):
483
+ return text_stripped
484
+
485
+ # Otherwise, return empty string to trigger fallback
486
+ return ""
487
+
488
+ def _detect_language(self, file_path: str, code: str) -> str:
489
+ """
490
+ Detect programming language from file extension or code content.
491
+
492
+ Args:
493
+ file_path: Path to the file
494
+ code: Source code content
495
+
496
+ Returns:
497
+ Detected language name
498
+ """
499
+ if file_path:
500
+ ext = Path(file_path).suffix.lower()
501
+ extension_map = {
502
+ # Python
503
+ '.py': 'python', '.pyw': 'python', '.pyx': 'python',
504
+ # Java
505
+ '.java': 'java',
506
+ # JavaScript/TypeScript
507
+ '.js': 'javascript', '.jsx': 'javascript', '.mjs': 'javascript', '.cjs': 'javascript',
508
+ '.ts': 'typescript', '.tsx': 'typescript',
509
+ # PHP
510
+ '.php': 'php', '.php3': 'php', '.php4': 'php', '.php5': 'php', '.phtml': 'php',
511
+ # Ruby
512
+ '.rb': 'ruby', '.rbw': 'ruby',
513
+ # Go
514
+ '.go': 'go',
515
+ # C/C++
516
+ '.c': 'c', '.h': 'c',
517
+ '.cpp': 'cpp', '.cc': 'cpp', '.cxx': 'cpp', '.c++': 'cpp',
518
+ '.hpp': 'cpp', '.hh': 'cpp', '.hxx': 'cpp', '.h++': 'cpp',
519
+ # C#
520
+ '.cs': 'csharp',
521
+ # Rust
522
+ '.rs': 'rust',
523
+ # Kotlin
524
+ '.kt': 'kotlin', '.kts': 'kotlin',
525
+ # Swift
526
+ '.swift': 'swift',
527
+ # Scala
528
+ '.scala': 'scala', '.sc': 'scala',
529
+ # R
530
+ '.r': 'r', '.R': 'r',
531
+ # Perl
532
+ '.pl': 'perl', '.pm': 'perl', '.t': 'perl', '.pod': 'perl',
533
+ # Shell
534
+ '.sh': 'shell', '.bash': 'shell', '.zsh': 'shell', '.fish': 'shell'
535
+ }
536
+ if ext in extension_map:
537
+ return extension_map[ext]
538
+
539
+ # Fallback: detect from code content
540
+ if code:
541
+ if 'public class' in code or 'import java.' in code:
542
+ return 'java'
543
+ elif 'package main' in code or 'func main()' in code:
544
+ return 'go'
545
+ elif 'def ' in code and ('import ' in code or 'from ' in code):
546
+ return 'python'
547
+ elif 'function ' in code or 'const ' in code or 'let ' in code:
548
+ return 'javascript'
549
+ elif 'namespace ' in code and 'using ' in code:
550
+ return 'csharp'
551
+ elif 'fn main()' in code or 'use std::' in code:
552
+ return 'rust'
553
+ elif '<?php' in code:
554
+ return 'php'
555
+ elif 'class ' in code and 'def ' in code and 'end' in code:
556
+ return 'ruby'
557
+
558
+ return 'python' # Default
559
+
560
+ def _generate_fallback_test(self, file_path: str, language: str,
561
+ framework: str) -> str:
562
+ """
563
+ Generate a basic fallback test when generation fails.
564
+
565
+ Args:
566
+ file_path: Path to the file
567
+ language: Programming language
568
+ framework: Test framework
569
+
570
+ Returns:
571
+ Basic test template
572
+ """
573
+ if language == "python":
574
+ module_name = Path(file_path).stem
575
+ return f"""import sys
576
+ import os
577
+ # Ensure module can be imported from any directory structure
578
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
579
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
580
+
581
+ import pytest
582
+ from unittest.mock import Mock, patch
583
+
584
+ # Tests for {file_path}
585
+ # Note: These are placeholder tests. AI generation failed.
586
+ # Please add comprehensive tests based on your code's functionality.
587
+
588
+ class Test{module_name.title().replace('_', '')}:
589
+ \"\"\"Test suite for {module_name}\"\"\"
590
+
591
+ def test_module_imports(self):
592
+ \"\"\"Test that the module can be imported without errors\"\"\"
593
+ try:
594
+ import {module_name}
595
+ assert True
596
+ except ImportError:
597
+ pytest.skip("Module not in path")
598
+
599
+ def test_placeholder_basic(self):
600
+ \"\"\"Placeholder test - replace with actual tests\"\"\"
601
+ assert True
602
+
603
+ def test_placeholder_edge_cases(self):
604
+ \"\"\"Test edge cases - implement based on your code\"\"\"
605
+ # TODO: Add edge case tests
606
+ assert True
607
+
608
+ def test_placeholder_error_handling(self):
609
+ \"\"\"Test error handling - implement based on your code\"\"\"
610
+ # TODO: Add error handling tests
611
+ assert True
612
+
613
+ # TODO: Add comprehensive tests for {file_path}
614
+ # Consider testing:
615
+ # - Normal operation with valid inputs
616
+ # - Edge cases (empty, None, boundary values)
617
+ # - Error conditions and exceptions
618
+ # - Integration with other modules
619
+ """
620
+ elif language == "java":
621
+ class_name = Path(file_path).stem.replace('_', '').title()
622
+ return f"""import org.junit.jupiter.api.Test;
623
+ import org.junit.jupiter.api.BeforeEach;
624
+ import org.junit.jupiter.api.DisplayName;
625
+ import static org.junit.jupiter.api.Assertions.*;
626
+
627
+ /**
628
+ * Tests for {file_path}
629
+ * Note: These are placeholder tests. AI generation failed.
630
+ * Please add comprehensive tests based on your code's functionality.
631
+ */
632
+ class {class_name}Test {{
633
+
634
+ @BeforeEach
635
+ void setUp() {{
636
+ // Initialize test fixtures
637
+ }}
638
+
639
+ @Test
640
+ @DisplayName("Placeholder test - replace with actual tests")
641
+ void testPlaceholderBasic() {{
642
+ assertTrue(true);
643
+ }}
644
+
645
+ @Test
646
+ @DisplayName("Test edge cases - implement based on your code")
647
+ void testEdgeCases() {{
648
+ // TODO: Add edge case tests
649
+ assertTrue(true);
650
+ }}
651
+
652
+ @Test
653
+ @DisplayName("Test error handling - implement based on your code")
654
+ void testErrorHandling() {{
655
+ // TODO: Add error handling tests
656
+ assertTrue(true);
657
+ }}
658
+ }}
659
+
660
+ // TODO: Add comprehensive tests for {file_path}
661
+ // Consider testing:
662
+ // - Normal operation with valid inputs
663
+ // - Edge cases (null, empty, boundary values)
664
+ // - Exception handling
665
+ // - Integration with other classes
666
+ """
667
+ elif language in ("javascript", "typescript"):
668
+ module_name = Path(file_path).stem
669
+ return f"""// Tests for {file_path}
670
+ // Note: These are placeholder tests. AI generation failed.
671
+ // Please add comprehensive tests based on your code's functionality.
672
+
673
+ describe('{module_name}', () => {{
674
+ beforeEach(() => {{
675
+ // Initialize test fixtures
676
+ }});
677
+
678
+ test('placeholder test - replace with actual tests', () => {{
679
+ expect(true).toBe(true);
680
+ }});
681
+
682
+ test('edge cases - implement based on your code', () => {{
683
+ // TODO: Add edge case tests
684
+ expect(true).toBe(true);
685
+ }});
686
+
687
+ test('error handling - implement based on your code', () => {{
688
+ // TODO: Add error handling tests
689
+ expect(true).toBe(true);
690
+ }});
691
+ }});
692
+
693
+ // TODO: Add comprehensive tests for {file_path}
694
+ // Consider testing:
695
+ // - Normal operation with valid inputs
696
+ // - Edge cases (null, undefined, empty, boundary values)
697
+ // - Error conditions and exceptions
698
+ // - Async operations (if applicable)
699
+ """
700
+ else:
701
+ return f"""// Tests for {file_path}
702
+ // Language: {language}
703
+ // Note: AI test generation failed. Please add tests manually.
704
+
705
+ // TODO: Add comprehensive tests for {file_path}
706
+ """
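A hedged sketch of driving `CodeTestGenerator` for a single transformed file; the code snippets are placeholders and a configured AI provider is required.

```python
# Sketch: generating behavioral-equivalence tests for one transformation.
# The original/modernized snippets below are illustrative placeholders.
generator = CodeTestGenerator()

original = "def add(a, b):\n    return a + b\n"
modernized = "def add(a: int, b: int) -> int:\n    return a + b\n"

test_code = generator.generate_tests(
    original_code=original,
    modernized_code=modernized,
    file_path="src/math_utils.py",
    language="python",   # optional; auto-detected from the extension otherwise
)

# Written next to the source so the Modal sandbox layout described in the
# prompt (everything under /workspace/) can pick it up.
with open("test_math_utils.py", "w", encoding="utf-8") as fh:
    fh.write(test_code)
```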
src/agents/transformer.py ADDED
@@ -0,0 +1,358 @@
1
+ """
2
+ Code Transformer - Generates modernized code using AI with RAG.
3
+ Supports multiple AI providers (Gemini, Nebius, OpenAI).
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import logging
9
+ from typing import Dict, List, Optional
10
+
11
+ from src.config import AIManager
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
+ class CodeTransformer:
17
+ """
18
+ Transforms legacy code to modern equivalents using the configured AI provider.
19
+ Integrates with MCP servers for examples and context.
20
+ """
21
+
22
+ def __init__(self, mcp_manager=None, search_engine=None):
23
+ """
24
+ Initialize Code Transformer.
25
+
26
+ Args:
27
+ mcp_manager: Optional MCPManager instance
28
+ search_engine: Optional CodeSearchEngine instance
29
+ """
30
+ self.mcp_manager = mcp_manager
31
+ self.search_engine = search_engine
32
+
33
+ # Use centralized AI manager
34
+ self.ai_manager = AIManager()
35
+
36
+ logger.info(
37
+ f"CodeTransformer initialized with provider: {self.ai_manager.provider_name}, "
38
+ f"model: {self.ai_manager.model_name}"
39
+ )
40
+
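A hedged sketch of constructing the transformer and awaiting `transform_code` (defined below). The shape of `transformation_plan` is an assumption based on how it is embedded in the prompt, and the optional MCP/search integrations are left out.

```python
# Sketch: calling the async transformer without the optional MCP / search
# integrations. The plan dict shape is assumed for illustration only.
import asyncio

async def modernize_one_file() -> str:
    transformer = CodeTransformer(mcp_manager=None, search_engine=None)
    plan = {
        "pattern": "Deprecated libraries (urllib2)",          # assumed plan fields
        "steps": ["Replace urllib2 with urllib.request", "Add type hints"],
    }
    legacy = "import urllib2\n\ndef fetch(url):\n    return urllib2.urlopen(url).read()\n"
    return await transformer.transform_code("src/fetcher.py", legacy, plan)

modernized = asyncio.run(modernize_one_file())
print(modernized)
```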
41
+ async def transform_code(self, file_path: str, original_code: str,
42
+ transformation_plan: Dict) -> str:
43
+ """
44
+ Transform legacy code using the configured AI provider.
45
+
46
+ Args:
47
+ file_path: Path to the file being transformed
48
+ original_code: Original code content
49
+ transformation_plan: Plan from analyzer with steps and recommendations
50
+
51
+ Returns:
52
+ Modernized code as string
53
+ """
54
+ logger.info(f"Transforming code: {file_path}")
55
+
56
+ # Get transformation examples from Memory MCP if available
57
+ examples_text = ""
58
+ if self.mcp_manager:
59
+ try:
60
+ from src.mcp.memory_client import MemoryMCPClient
61
+ memory_client = MemoryMCPClient(self.mcp_manager)
62
+
63
+ pattern_type = transformation_plan.get('pattern', '')
64
+ examples = await memory_client.get_transformation_examples(
65
+ pattern_type,
66
+ limit=3
67
+ )
68
+
69
+ if examples:
70
+ examples_text = "\n\nSUCCESSFUL TRANSFORMATION EXAMPLES:\n"
71
+ for i, ex in enumerate(examples, 1):
72
+ examples_text += f"\nExample {i}:\n"
73
+ examples_text += f"Before: {ex.get('before', '')[:200]}...\n"
74
+ examples_text += f"After: {ex.get('after', '')[:200]}...\n"
75
+ except Exception as e:
76
+ logger.warning(f"Could not retrieve transformation examples: {e}")
77
+
78
+ # Get similar code from search engine if available
79
+ context_text = ""
80
+ if self.search_engine:
81
+ try:
82
+ similar_files = self.search_engine.find_similar_patterns(
83
+ f"Modern code similar to {file_path}",
84
+ top_k=3
85
+ )
86
+
87
+ if similar_files:
88
+ context_text = "\n\nSIMILAR MODERN CODE EXAMPLES:\n"
89
+ for f in similar_files[:2]:
90
+ context_text += f"- {f['file_path']}: {f['text_snippet']}\n"
91
+ except Exception as e:
92
+ logger.warning(f"Could not get similar code context: {e}")
93
+
94
+ # Build transformation prompt
95
+ prompt = f"""You are an expert code modernization assistant. Transform this legacy code to modern best practices.
96
+
97
+ FILE: {file_path}
98
+
99
+ TRANSFORMATION PLAN:
100
+ {json.dumps(transformation_plan, indent=2)}
101
+
102
+ {examples_text}
103
+ {context_text}
104
+
105
+ ORIGINAL CODE:
106
+ ```
107
+ {original_code}
108
+ ```
109
+
110
+ SANDBOX EXECUTION CONTEXT (for reference when generating imports):
111
+ - This code will be tested in Modal Sandbox at /workspace/
112
+ - Python: Tests will be combined with source in test_<module>.py
113
+ - Java: Source in <Module>.java (package: com.modernizer), tests in <Module>Test.java
114
+ - JavaScript: Source in <module>.js (ES modules with Jest), tests in <module>.test.js
115
+ - TypeScript: Source in <module>.ts (CommonJS for Jest/ts-jest), tests in <module>.test.ts
116
+ - All files in same /workspace/ directory
117
+ - Use relative imports and ensure all external dependencies are available
118
+
119
+ CRITICAL MODULE SYSTEM RULES:
120
+ - TypeScript: Use CommonJS-compatible code (NO import.meta, NO top-level await)
121
+ - TypeScript: Jest uses ts-jest with module: "commonjs" - avoid ES module-only features
122
+ - JavaScript: Can use ES modules but avoid Node.js-specific ES module features
123
+ - Do NOT add CLI execution code (if __name__ == "__main__", import.meta.url checks, etc.)
124
+ - Focus on library/module code that can be imported and tested
125
+
126
+ REQUIREMENTS:
127
+ 1. Apply the transformation plan exactly
128
+ 2. Maintain behavioral equivalence (same inputs → same outputs)
129
+ 3. Add type hints for all functions (Python) or appropriate types
130
+ 4. Include docstrings for public functions
131
+ 5. Follow language-specific style guides (PEP 8 for Python, Java conventions, etc.)
132
+ 6. Add error handling where missing
133
+ 7. Use environment variables for secrets/credentials
134
+ 8. Add comments explaining complex logic
135
+ 9. Ensure all imports are at the top
136
+ 10. Remove unused imports and variables
137
+ 11. Use correct relative paths for local imports (same directory imports)
138
+ 12. Include necessary package declarations (Java) or module exports
139
+ 13. CRITICAL: Export ALL types, interfaces, enums, and classes that might be used in tests
140
+ - TypeScript: Use 'export' keyword for all public types, interfaces, enums, classes
141
+ - JavaScript: Include all functions/classes in module.exports or export statements
142
+ - Python: All public functions/classes should be importable
143
+ - Java: Use public access modifiers for classes/methods that will be tested
144
+
145
+ IMPORTANT:
146
+ - Return ONLY the transformed code, no explanations or markdown formatting
147
+ - Do NOT include markdown code fences in the response
148
+ - Ensure imports work in sandbox environment where all files are in /workspace/
149
+ """
150
+
151
+ try:
152
+ # Call AI with configured model
153
+ modernized_code = self.ai_manager.generate_content(
154
+ prompt=prompt,
155
+ temperature=AIManager.TEMPERATURE_MEDIUM,
156
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_LARGE
157
+ ).strip()
158
+
159
+ # Extract code from markdown if present
160
+ modernized_code = self._extract_code(modernized_code)
161
+
162
+ # Validate that code is complete (not truncated)
163
+ if modernized_code:
164
+ # Check for common truncation indicators
165
+ last_lines = modernized_code.split('\n')[-5:]
166
+ last_text = '\n'.join(last_lines)
167
+
168
+ # Warn if code appears truncated
169
+ if (not modernized_code.rstrip().endswith((')', '}', ']', '"', "'")) and
170
+ len(modernized_code) > 1000 and
171
+ not any(keyword in last_text for keyword in ['if __name__', 'main()', 'return'])):
172
+ logger.warning(f"Code for {file_path} may be truncated (length: {len(modernized_code)} chars)")
173
+ logger.warning(f"Last few lines: {last_text[:200]}")
174
+
175
+ # Store successful transformation as example
176
+ if self.mcp_manager:
177
+ try:
178
+ from src.mcp.memory_client import MemoryMCPClient
179
+ memory_client = MemoryMCPClient(self.mcp_manager)
180
+
181
+ example = {
182
+ "pattern": transformation_plan.get('pattern', ''),
183
+ "before": original_code[:500],
184
+ "after": modernized_code[:500],
185
+ "file_path": file_path
186
+ }
187
+
188
+ example_id = f"{transformation_plan.get('pattern', 'unknown')}_{hash(file_path)}"
189
+ await memory_client.store_transformation_example(example_id, example)
190
+ except Exception as e:
191
+ logger.warning(f"Could not store transformation example: {e}")
192
+
193
+ logger.info(f"Transformation complete for {file_path}")
194
+ return modernized_code
195
+
196
+ except Exception as e:
197
+ logger.error(f"Error during transformation: {e}")
198
+ return original_code # Return original on error
199
+
200
+ def _extract_code(self, text: str) -> str:
201
+ """
202
+ Extract code from markdown code blocks if present.
203
+ Handles both complete blocks and trailing markdown fences.
204
+
205
+ Args:
206
+ text: Text that may contain markdown code blocks
207
+
208
+ Returns:
209
+ Extracted code
210
+ """
211
+ if not text:
212
+ return ""
213
+
214
+ # Check for markdown code blocks
215
+ if "```" in text:
216
+ # Try to extract code between ``` markers
217
+ parts = text.split("```")
218
+ if len(parts) >= 3:
219
+ # Get the code block (skip language identifier)
220
+ code_block = parts[1]
221
+ # Remove language identifier if present
222
+ lines = code_block.split('\n')
223
+ if lines[0].strip() in ['python', 'java', 'javascript', 'typescript', 'cpp', 'c', 'go', 'js', 'ts', 'py']:
224
+ code_block = '\n'.join(lines[1:])
225
+ return code_block.strip()
226
+ elif len(parts) == 2:
227
+ # Only one ``` found - might be trailing fence
228
+ # Take everything before the fence
229
+ return parts[0].strip()
230
+
231
+ # Remove any trailing markdown fences
232
+ text = text.strip()
233
+ if text.endswith('```'):
234
+ text = text[:-3].strip()
235
+
236
+ return text
237
+
238
+ async def bulk_transform(self, files: Dict[str, str],
239
+ transformation_plan: Dict) -> Dict[str, str]:
240
+ """
241
+ Transform multiple files with the same pattern.
242
+
243
+ Args:
244
+ files: Dictionary mapping file paths to their contents
245
+ transformation_plan: Transformation plan to apply
246
+
247
+ Returns:
248
+ Dictionary mapping file paths to transformed code
249
+ """
250
+ logger.info(f"Bulk transforming {len(files)} files")
251
+
252
+ results = {}
253
+
254
+ for file_path, original_code in files.items():
255
+ try:
256
+ transformed = await self.transform_code(
257
+ file_path,
258
+ original_code,
259
+ transformation_plan
260
+ )
261
+ results[file_path] = transformed
262
+ logger.info(f"✓ Transformed {file_path}")
263
+ except Exception as e:
264
+ logger.error(f"✗ Failed to transform {file_path}: {e}")
265
+ results[file_path] = original_code
266
+
267
+ logger.info(f"Bulk transformation complete: {len(results)}/{len(files)} successful")
268
+ return results
269
+
270
+ async def add_type_hints(self, file_path: str, code: str) -> str:
271
+ """
272
+ Add type hints to Python code.
273
+
274
+ Args:
275
+ file_path: Path to the file
276
+ code: Code content
277
+
278
+ Returns:
279
+ Code with type hints added
280
+ """
281
+ logger.info(f"Adding type hints to {file_path}")
282
+
283
+ prompt = f"""Add comprehensive type hints to this Python code.
284
+
285
+ FILE: {file_path}
286
+
287
+ CODE:
288
+ ```python
289
+ {code}
290
+ ```
291
+
292
+ REQUIREMENTS:
293
+ 1. Add type hints to all function parameters and return types
294
+ 2. Use typing module for complex types (List, Dict, Optional, etc.)
295
+ 3. Add type hints to class attributes
296
+ 4. Maintain all existing functionality
297
+ 5. Follow PEP 484 type hinting standards
298
+
299
+ Return ONLY the code with type hints added, no explanations.
300
+ """
301
+
302
+ try:
303
+ typed_code = self.ai_manager.generate_content(
304
+ prompt=prompt,
305
+ temperature=AIManager.TEMPERATURE_PRECISE,
306
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_MEDIUM
307
+ )
308
+
309
+ return self._extract_code(typed_code)
310
+
311
+ except Exception as e:
312
+ logger.error(f"Error adding type hints: {e}")
313
+ return code
314
+
315
+ async def add_docstrings(self, file_path: str, code: str) -> str:
316
+ """
317
+ Add docstrings to code.
318
+
319
+ Args:
320
+ file_path: Path to the file
321
+ code: Code content
322
+
323
+ Returns:
324
+ Code with docstrings added
325
+ """
326
+ logger.info(f"Adding docstrings to {file_path}")
327
+
328
+ prompt = f"""Add comprehensive docstrings to this code.
329
+
330
+ FILE: {file_path}
331
+
332
+ CODE:
333
+ ```
334
+ {code}
335
+ ```
336
+
337
+ REQUIREMENTS:
338
+ 1. Add docstrings to all functions and classes
339
+ 2. Use Google-style or NumPy-style docstrings
340
+ 3. Include parameter descriptions, return values, and exceptions
341
+ 4. Add module-level docstring if missing
342
+ 5. Maintain all existing functionality
343
+
344
+ Return ONLY the code with docstrings added, no explanations.
345
+ """
346
+
347
+ try:
348
+ documented_code = self.ai_manager.generate_content(
349
+ prompt=prompt,
350
+ temperature=AIManager.TEMPERATURE_PRECISE,
351
+ max_tokens=AIManager.MAX_OUTPUT_TOKENS_MEDIUM
352
+ )
353
+
354
+ return self._extract_code(documented_code)
355
+
356
+ except Exception as e:
357
+ logger.error(f"Error adding docstrings: {e}")
358
+ return code
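
The transformer is fully async and both collaborators (`mcp_manager`, `search_engine`) are optional, so it can be exercised on its own. A minimal usage sketch — the import path, file name, and transformation plan below are illustrative assumptions, and `AIManager` needs a valid API key for the configured provider in the environment:

```python
# Minimal usage sketch (illustrative): module path and plan contents are assumptions.
import asyncio
from src.transformers.code_transformer import CodeTransformer  # import path assumed

async def main() -> None:
    transformer = CodeTransformer()  # mcp_manager / search_engine are optional

    plan = {
        "pattern": "string_formatting",
        "steps": ["Replace % formatting with f-strings"],
    }
    legacy = 'name = "world"\nprint("Hello, %s" % name)\n'

    modernized = await transformer.transform_code("hello.py", legacy, plan)
    print(modernized)

asyncio.run(main())
```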
src/config/__init__.py ADDED
@@ -0,0 +1,10 @@
1
+ """
2
+ Configuration module for AI providers (Gemini, Nebius, OpenAI).
3
+ """
4
+
5
+ from .gemini_config import GeminiConfig
6
+ from .gemini_schemas import GeminiSchemas
7
+ from .ai_manager import AIManager, AIProvider
8
+
9
+ __all__ = ['GeminiConfig', 'GeminiSchemas', 'AIManager', 'AIProvider']
10
+
src/config/ai_manager.py ADDED
@@ -0,0 +1,323 @@
1
+ """
2
+ Centralized AI Manager for multiple providers.
3
+ Supports Gemini, Nebius Token Factory, and other OpenAI-compatible providers.
4
+ """
5
+
6
+ import os
7
+ import json
8
+ import logging
9
+ from typing import Dict, Any, Optional, List
10
+ from enum import Enum
11
+ from dotenv import load_dotenv
12
+
13
+ # Load environment variables
14
+ load_dotenv()
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ class AIProvider(Enum):
20
+ """Supported AI providers."""
21
+ GEMINI = "gemini"
22
+ NEBIUS = "nebius"
23
+ OPENAI = "openai"
24
+
25
+
26
+ class AIManager:
27
+ """
28
+ Centralized manager for AI API calls across different providers.
29
+ Provides a unified interface regardless of the underlying provider.
30
+ """
31
+
32
+ # Default configurations
33
+ DEFAULT_PROVIDER = "gemini"
34
+ DEFAULT_GEMINI_MODEL = "gemini-2.5-flash"
35
+ DEFAULT_NEBIUS_MODEL = "zai-org/GLM-4.5"
36
+ DEFAULT_OPENAI_MODEL = "gpt-4"
37
+
38
+ # Temperature settings for different use cases
39
+ TEMPERATURE_PRECISE = 0.0 # For JSON schema responses
40
+ TEMPERATURE_LOW = 0.1 # For code generation
41
+ TEMPERATURE_MEDIUM = 0.2 # For transformations
42
+ TEMPERATURE_HIGH = 0.7 # For creative tasks
43
+
44
+ # Token limits
45
+ MAX_OUTPUT_TOKENS_SMALL = 8192
46
+ MAX_OUTPUT_TOKENS_MEDIUM = 16384
47
+ MAX_OUTPUT_TOKENS_LARGE = 32768
48
+
49
+ # Retry settings
50
+ MAX_RETRIES = 3
51
+ RETRY_DELAY = 1.0 # seconds
52
+
53
+ def __init__(self, provider: Optional[str] = None, model: Optional[str] = None):
54
+ """
55
+ Initialize AI Manager.
56
+
57
+ Args:
58
+ provider: AI provider to use (gemini, nebius, openai).
59
+ If None, reads from AI_PROVIDER env var or uses default.
60
+ model: Model name to use. If None, reads from provider-specific env var.
61
+ """
62
+ # Determine provider
63
+ self.provider_name = (
64
+ provider or
65
+ os.getenv("AI_PROVIDER", self.DEFAULT_PROVIDER)
66
+ ).lower()
67
+
68
+ try:
69
+ self.provider = AIProvider(self.provider_name)
70
+ except ValueError:
71
+ logger.warning(
72
+ f"Unknown provider '{self.provider_name}', falling back to Gemini"
73
+ )
74
+ self.provider = AIProvider.GEMINI
75
+ self.provider_name = "gemini"
76
+
77
+ # Initialize provider-specific client
78
+ if self.provider == AIProvider.GEMINI:
79
+ self._init_gemini(model)
80
+ elif self.provider == AIProvider.NEBIUS:
81
+ self._init_nebius(model)
82
+ elif self.provider == AIProvider.OPENAI:
83
+ self._init_openai(model)
84
+
85
+ logger.info(
86
+ f"AIManager initialized with provider: {self.provider_name}, "
87
+ f"model: {self.model_name}"
88
+ )
89
+
90
+ def _init_gemini(self, model: Optional[str] = None):
91
+ """Initialize Gemini provider."""
92
+ from google import genai
93
+
94
+ api_key = os.getenv("GEMINI_API_KEY")
95
+ if not api_key:
96
+ raise ValueError(
97
+ "GEMINI_API_KEY not found in environment variables. "
98
+ "Please set it in your .env file."
99
+ )
100
+
101
+ self.model_name = (
102
+ model or
103
+ os.getenv("GEMINI_MODEL", self.DEFAULT_GEMINI_MODEL)
104
+ )
105
+
106
+ self.client = genai.Client(api_key=api_key)
107
+ self.provider_type = "gemini"
108
+
109
+ def _init_nebius(self, model: Optional[str] = None):
110
+ """Initialize Nebius Token Factory provider (OpenAI-compatible)."""
111
+ from openai import OpenAI
112
+
113
+ api_key = os.getenv("NEBIUS_API_KEY")
114
+ if not api_key:
115
+ raise ValueError(
116
+ "NEBIUS_API_KEY not found in environment variables. "
117
+ "Please set it in your .env file."
118
+ )
119
+
120
+ self.model_name = (
121
+ model or
122
+ os.getenv("NEBIUS_MODEL", self.DEFAULT_NEBIUS_MODEL)
123
+ )
124
+
125
+ self.client = OpenAI(
126
+ base_url="https://api.tokenfactory.nebius.com/v1/",
127
+ api_key=api_key
128
+ )
129
+ self.provider_type = "openai_compatible"
130
+
131
+ def _init_openai(self, model: Optional[str] = None):
132
+ """Initialize OpenAI provider."""
133
+ from openai import OpenAI
134
+
135
+ api_key = os.getenv("OPENAI_API_KEY")
136
+ if not api_key:
137
+ raise ValueError(
138
+ "OPENAI_API_KEY not found in environment variables. "
139
+ "Please set it in your .env file."
140
+ )
141
+
142
+ self.model_name = (
143
+ model or
144
+ os.getenv("OPENAI_MODEL", self.DEFAULT_OPENAI_MODEL)
145
+ )
146
+
147
+ self.client = OpenAI(api_key=api_key)
148
+ self.provider_type = "openai_compatible"
149
+
150
+ def generate_content(
151
+ self,
152
+ prompt: str,
153
+ temperature: float = TEMPERATURE_LOW,
154
+ max_tokens: int = MAX_OUTPUT_TOKENS_MEDIUM,
155
+ response_format: Optional[str] = None,
156
+ response_schema: Optional[Dict[str, Any]] = None,
157
+ system_prompt: Optional[str] = None
158
+ ) -> str:
159
+ """
160
+ Generate content using the configured AI provider.
161
+
162
+ Args:
163
+ prompt: The prompt to send to the AI
164
+ temperature: Temperature setting (0.0-1.0)
165
+ max_tokens: Maximum output tokens
166
+ response_format: Response format ("json" or None)
167
+ response_schema: JSON schema for structured responses (Gemini format)
168
+ system_prompt: Optional system prompt (for OpenAI-compatible providers)
169
+
170
+ Returns:
171
+ Generated text content
172
+ """
173
+ if self.provider_type == "gemini":
174
+ return self._generate_gemini(
175
+ prompt, temperature, max_tokens,
176
+ response_format, response_schema
177
+ )
178
+ else: # openai_compatible
179
+ return self._generate_openai_compatible(
180
+ prompt, temperature, max_tokens,
181
+ response_format, system_prompt
182
+ )
183
+
184
+ def _generate_gemini(
185
+ self,
186
+ prompt: str,
187
+ temperature: float,
188
+ max_tokens: int,
189
+ response_format: Optional[str],
190
+ response_schema: Optional[Dict[str, Any]]
191
+ ) -> str:
192
+ """Generate content using Gemini API."""
193
+ config = {
194
+ "temperature": temperature,
195
+ "max_output_tokens": max_tokens,
196
+ "top_p": 0.95,
197
+ }
198
+
199
+ # Add JSON schema if provided
200
+ if response_schema:
201
+ config["response_mime_type"] = "application/json"
202
+ config["response_schema"] = response_schema
203
+ elif response_format == "json":
204
+ config["response_mime_type"] = "application/json"
205
+
206
+ response = self.client.models.generate_content(
207
+ model=self.model_name,
208
+ contents=prompt,
209
+ config=config
210
+ )
211
+
212
+ return response.text
213
+
214
+ def _generate_openai_compatible(
215
+ self,
216
+ prompt: str,
217
+ temperature: float,
218
+ max_tokens: int,
219
+ response_format: Optional[str],
220
+ system_prompt: Optional[str]
221
+ ) -> str:
222
+ """Generate content using OpenAI-compatible API."""
223
+ messages = []
224
+
225
+ # Add system prompt if provided
226
+ if system_prompt:
227
+ messages.append({"role": "system", "content": system_prompt})
228
+
229
+ messages.append({"role": "user", "content": prompt})
230
+
231
+ kwargs = {
232
+ "model": self.model_name,
233
+ "messages": messages,
234
+ "temperature": temperature,
235
+ "max_tokens": max_tokens,
236
+ }
237
+
238
+ # Add JSON mode if requested
239
+ if response_format == "json":
240
+ kwargs["response_format"] = {"type": "json_object"}
241
+
242
+ response = self.client.chat.completions.create(**kwargs)
243
+
244
+ return response.choices[0].message.content
245
+
246
+ def get_base_config(
247
+ self,
248
+ temperature: float = TEMPERATURE_LOW,
249
+ max_tokens: int = MAX_OUTPUT_TOKENS_MEDIUM
250
+ ) -> Dict[str, Any]:
251
+ """
252
+ Get base configuration for AI calls.
253
+
254
+ Args:
255
+ temperature: Temperature setting (0.0-1.0)
256
+ max_tokens: Maximum output tokens
257
+
258
+ Returns:
259
+ Configuration dictionary
260
+ """
261
+ return {
262
+ "temperature": temperature,
263
+ "max_tokens": max_tokens,
264
+ }
265
+
266
+ def get_json_config(
267
+ self,
268
+ schema: Optional[Dict[str, Any]] = None,
269
+ temperature: float = TEMPERATURE_PRECISE,
270
+ max_tokens: int = MAX_OUTPUT_TOKENS_MEDIUM
271
+ ) -> Dict[str, Any]:
272
+ """
273
+ Get configuration for JSON schema-enforced responses.
274
+
275
+ Args:
276
+ schema: JSON schema dictionary (Gemini format)
277
+ temperature: Temperature setting (default: 0.0 for precision)
278
+ max_tokens: Maximum output tokens
279
+
280
+ Returns:
281
+ Configuration dictionary
282
+ """
283
+ config = self.get_base_config(temperature, max_tokens)
284
+ config["response_format"] = "json"
285
+
286
+ if schema and self.provider_type == "gemini":
287
+ config["response_schema"] = schema
288
+
289
+ return config
290
+
291
+ @classmethod
292
+ def validate_config(cls) -> bool:
293
+ """
294
+ Validate that required configuration is present.
295
+
296
+ Returns:
297
+ True if configuration is valid
298
+
299
+ Raises:
300
+ ValueError: If required configuration is missing
301
+ """
302
+ provider = os.getenv("AI_PROVIDER", cls.DEFAULT_PROVIDER).lower()
303
+
304
+ if provider == "gemini":
305
+ if not os.getenv("GEMINI_API_KEY"):
306
+ raise ValueError(
307
+ "GEMINI_API_KEY not found in environment variables. "
308
+ "Please set it in your .env file."
309
+ )
310
+ elif provider == "nebius":
311
+ if not os.getenv("NEBIUS_API_KEY"):
312
+ raise ValueError(
313
+ "NEBIUS_API_KEY not found in environment variables. "
314
+ "Please set it in your .env file."
315
+ )
316
+ elif provider == "openai":
317
+ if not os.getenv("OPENAI_API_KEY"):
318
+ raise ValueError(
319
+ "OPENAI_API_KEY not found in environment variables. "
320
+ "Please set it in your .env file."
321
+ )
322
+
323
+ return True
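
`AIManager` resolves the provider and model once, at construction time, from `AI_PROVIDER` and the matching `*_API_KEY` / `*_MODEL` variables, so switching between Gemini, Nebius, and OpenAI is a configuration change rather than a code change. A minimal sketch, assuming a valid key for the chosen provider is set in the environment:

```python
# Minimal sketch: the same call path works for any configured provider.
from src.config import AIManager

manager = AIManager(provider="gemini")  # or "nebius" / "openai"; defaults to AI_PROVIDER

text = manager.generate_content(
    prompt="List three benefits of adding type hints to legacy Python code.",
    temperature=AIManager.TEMPERATURE_LOW,
    max_tokens=AIManager.MAX_OUTPUT_TOKENS_SMALL,
)
print(text)

# JSON mode: request a JSON object instead of free text.
raw_json = manager.generate_content(
    prompt='Classify this snippet and return {"language": ..., "confidence": ...}: print "hi"',
    response_format="json",
)
print(raw_json)
```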
src/config/gemini_config.py ADDED
@@ -0,0 +1,99 @@
1
+ """
2
+ Centralized Gemini API configuration.
3
+ Allows users to configure model settings from .env file.
4
+ """
5
+
6
+ import os
7
+ from typing import Optional
8
+ from dotenv import load_dotenv
9
+
10
+ # Load environment variables
11
+ load_dotenv()
12
+
13
+
14
+ class GeminiConfig:
15
+ """Centralized configuration for Gemini API."""
16
+
17
+ # Default model - can be overridden in .env
18
+ DEFAULT_MODEL = "gemini-2.5-flash"
19
+
20
+ # Model configuration from environment
21
+ MODEL_NAME: str = os.getenv("GEMINI_MODEL", DEFAULT_MODEL)
22
+ API_KEY: str = os.getenv("GEMINI_API_KEY", "")
23
+
24
+ # Temperature settings for different use cases
25
+ TEMPERATURE_PRECISE = 0.0 # For JSON schema responses
26
+ TEMPERATURE_LOW = 0.1 # For code generation
27
+ TEMPERATURE_MEDIUM = 0.2 # For transformations
28
+ TEMPERATURE_HIGH = 0.7 # For creative tasks
29
+
30
+ # Token limits
31
+ MAX_OUTPUT_TOKENS_SMALL = 8192
32
+ MAX_OUTPUT_TOKENS_MEDIUM = 16384
33
+ MAX_OUTPUT_TOKENS_LARGE = 32768
34
+
35
+ # Retry settings
36
+ MAX_RETRIES = 3
37
+ RETRY_DELAY = 1.0 # seconds
38
+
39
+ @classmethod
40
+ def validate(cls) -> bool:
41
+ """Validate that required configuration is present."""
42
+ if not cls.API_KEY:
43
+ raise ValueError(
44
+ "GEMINI_API_KEY not found in environment variables. "
45
+ "Please set it in your .env file."
46
+ )
47
+ return True
48
+
49
+ @classmethod
50
+ def get_model_name(cls) -> str:
51
+ """Get the configured model name."""
52
+ return cls.MODEL_NAME
53
+
54
+ @classmethod
55
+ def get_api_key(cls) -> str:
56
+ """Get the API key."""
57
+ cls.validate()
58
+ return cls.API_KEY
59
+
60
+ @classmethod
61
+ def get_base_config(cls, temperature: float = TEMPERATURE_LOW,
62
+ max_tokens: int = MAX_OUTPUT_TOKENS_MEDIUM) -> dict:
63
+ """
64
+ Get base configuration for Gemini API calls.
65
+
66
+ Args:
67
+ temperature: Temperature setting (0.0-1.0)
68
+ max_tokens: Maximum output tokens
69
+
70
+ Returns:
71
+ Configuration dictionary
72
+ """
73
+ return {
74
+ "temperature": temperature,
75
+ "max_output_tokens": max_tokens,
76
+ "top_p": 0.95,
77
+ }
78
+
79
+ @classmethod
80
+ def get_json_config(cls, schema: dict,
81
+ temperature: float = TEMPERATURE_PRECISE,
82
+ max_tokens: int = MAX_OUTPUT_TOKENS_MEDIUM) -> dict:
83
+ """
84
+ Get configuration for JSON schema-enforced responses.
85
+
86
+ Args:
87
+ schema: JSON schema dictionary
88
+ temperature: Temperature setting (default: 0.0 for precision)
89
+ max_tokens: Maximum output tokens
90
+
91
+ Returns:
92
+ Configuration dictionary with schema enforcement
93
+ """
94
+ config = cls.get_base_config(temperature, max_tokens)
95
+ config.update({
96
+ "response_mime_type": "application/json",
97
+ "response_schema": schema
98
+ })
99
+ return config
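
`GeminiConfig` is the Gemini-only counterpart to `AIManager`'s config helpers: it reads `GEMINI_MODEL` / `GEMINI_API_KEY` from the environment and builds plain configuration dictionaries. A minimal sketch, assuming `GEMINI_API_KEY` is set:

```python
# Minimal sketch: validate() fails fast if GEMINI_API_KEY is missing.
from src.config import GeminiConfig

GeminiConfig.validate()               # raises ValueError when the key is absent
print(GeminiConfig.get_model_name())  # e.g. "gemini-2.5-flash"

config = GeminiConfig.get_base_config(
    temperature=GeminiConfig.TEMPERATURE_LOW,
    max_tokens=GeminiConfig.MAX_OUTPUT_TOKENS_SMALL,
)
print(config)  # {'temperature': 0.1, 'max_output_tokens': 8192, 'top_p': 0.95}
```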
src/config/gemini_schemas.py ADDED
@@ -0,0 +1,261 @@
1
+ """
2
+ JSON schemas for Gemini API responses.
3
+ Ensures structured, predictable outputs from the AI model.
4
+
5
+ Note: Uses Google GenAI SDK schema format (uppercase types: STRING, NUMBER, etc.)
6
+ """
7
+
8
+ from typing import Dict, Any
9
+
10
+
11
+ class GeminiSchemas:
12
+ """Collection of JSON schemas for different response types."""
13
+
14
+ @staticmethod
15
+ def language_detection() -> Dict[str, Any]:
16
+ """Schema for language and framework detection."""
17
+ return {
18
+ "type": "OBJECT",
19
+ "properties": {
20
+ "language": {
21
+ "type": "STRING",
22
+ "description": "Detected programming language"
23
+ },
24
+ "framework": {
25
+ "type": "STRING",
26
+ "description": "Detected framework or empty string if none",
27
+ "nullable": True
28
+ },
29
+ "confidence": {
30
+ "type": "NUMBER",
31
+ "description": "Confidence score between 0.0 and 1.0"
32
+ }
33
+ },
34
+ "required": ["language", "framework", "confidence"]
35
+ }
36
+
37
+ @staticmethod
38
+ def pattern_analysis() -> Dict[str, Any]:
39
+ """Schema for pattern analysis results."""
40
+ return {
41
+ "type": "OBJECT",
42
+ "properties": {
43
+ "patterns": {
44
+ "type": "ARRAY",
45
+ "items": {
46
+ "type": "OBJECT",
47
+ "properties": {
48
+ "pattern_type": {"type": "STRING"},
49
+ "severity": {
50
+ "type": "STRING",
51
+ "enum": ["critical", "high", "medium", "low", "info"]
52
+ },
53
+ "line_numbers": {
54
+ "type": "ARRAY",
55
+ "items": {"type": "INTEGER"}
56
+ },
57
+ "confidence": {
58
+ "type": "NUMBER"
59
+ },
60
+ "description": {"type": "STRING"},
61
+ "recommendation": {"type": "STRING"},
62
+ "estimated_effort_hours": {
63
+ "type": "NUMBER"
64
+ }
65
+ },
66
+ "required": [
67
+ "pattern_type", "severity", "line_numbers",
68
+ "confidence", "description", "recommendation",
69
+ "estimated_effort_hours"
70
+ ]
71
+ }
72
+ },
73
+ "modernization_score": {
74
+ "type": "INTEGER"
75
+ },
76
+ "requires_modernization": {"type": "BOOLEAN"},
77
+ "overall_priority": {
78
+ "type": "STRING",
79
+ "enum": ["critical", "high", "medium", "low", "info"]
80
+ }
81
+ },
82
+ "required": [
83
+ "patterns", "modernization_score",
84
+ "requires_modernization", "overall_priority"
85
+ ]
86
+ }
87
+
88
+ @staticmethod
89
+ def batch_pattern_analysis() -> Dict[str, Any]:
90
+ """Schema for batch pattern analysis results."""
91
+ return {
92
+ "type": "OBJECT",
93
+ "properties": {
94
+ "files": {
95
+ "type": "ARRAY",
96
+ "items": {
97
+ "type": "OBJECT",
98
+ "properties": {
99
+ "file_path": {"type": "STRING"},
100
+ "language": {"type": "STRING"},
101
+ "framework": {
102
+ "type": "STRING",
103
+ "nullable": True
104
+ },
105
+ "patterns": {
106
+ "type": "ARRAY",
107
+ "items": {
108
+ "type": "OBJECT",
109
+ "properties": {
110
+ "pattern_type": {"type": "STRING"},
111
+ "severity": {
112
+ "type": "STRING",
113
+ "enum": ["critical", "high", "medium", "low", "info"]
114
+ },
115
+ "line_numbers": {
116
+ "type": "ARRAY",
117
+ "items": {"type": "INTEGER"}
118
+ },
119
+ "confidence": {
120
+ "type": "NUMBER"
121
+ },
122
+ "description": {"type": "STRING"},
123
+ "recommendation": {"type": "STRING"},
124
+ "estimated_effort_hours": {
125
+ "type": "NUMBER"
126
+ }
127
+ },
128
+ "required": [
129
+ "pattern_type", "severity", "line_numbers",
130
+ "confidence", "description", "recommendation",
131
+ "estimated_effort_hours"
132
+ ]
133
+ }
134
+ },
135
+ "modernization_score": {
136
+ "type": "INTEGER"
137
+ },
138
+ "requires_modernization": {"type": "BOOLEAN"},
139
+ "overall_priority": {
140
+ "type": "STRING",
141
+ "enum": ["critical", "high", "medium", "low", "info"]
142
+ }
143
+ },
144
+ "required": [
145
+ "file_path", "language", "framework", "patterns",
146
+ "modernization_score", "requires_modernization",
147
+ "overall_priority"
148
+ ]
149
+ }
150
+ }
151
+ },
152
+ "required": ["files"]
153
+ }
154
+
155
+ @staticmethod
156
+ def file_classification() -> Dict[str, Any]:
157
+ """Schema for file classification results."""
158
+ return {
159
+ "type": "OBJECT",
160
+ "properties": {
161
+ "classification": {
162
+ "type": "STRING",
163
+ "enum": ["primary", "secondary", "test", "config", "documentation"]
164
+ },
165
+ "confidence": {
166
+ "type": "NUMBER"
167
+ },
168
+ "reasoning": {"type": "STRING"},
169
+ "language": {"type": "STRING"},
170
+ "framework": {
171
+ "type": "STRING",
172
+ "nullable": True
173
+ }
174
+ },
175
+ "required": ["classification", "confidence", "reasoning", "language", "framework"]
176
+ }
177
+
178
+ @staticmethod
179
+ def code_analysis() -> Dict[str, Any]:
180
+ """Schema for detailed code analysis."""
181
+ return {
182
+ "type": "OBJECT",
183
+ "properties": {
184
+ "summary": {"type": "STRING"},
185
+ "issues": {
186
+ "type": "ARRAY",
187
+ "items": {
188
+ "type": "OBJECT",
189
+ "properties": {
190
+ "type": {"type": "STRING"},
191
+ "severity": {
192
+ "type": "STRING",
193
+ "enum": ["critical", "high", "medium", "low", "info"]
194
+ },
195
+ "description": {"type": "STRING"},
196
+ "line_numbers": {
197
+ "type": "ARRAY",
198
+ "items": {"type": "INTEGER"}
199
+ },
200
+ "recommendation": {"type": "STRING"}
201
+ },
202
+ "required": ["type", "severity", "description", "line_numbers", "recommendation"]
203
+ }
204
+ },
205
+ "transformation_steps": {
206
+ "type": "ARRAY",
207
+ "items": {
208
+ "type": "OBJECT",
209
+ "properties": {
210
+ "step": {"type": "STRING"},
211
+ "description": {"type": "STRING"},
212
+ "priority": {
213
+ "type": "STRING",
214
+ "enum": ["critical", "high", "medium", "low"]
215
+ },
216
+ "estimated_hours": {
217
+ "type": "NUMBER"
218
+ }
219
+ },
220
+ "required": ["step", "description", "priority", "estimated_hours"]
221
+ }
222
+ },
223
+ "dependencies": {
224
+ "type": "ARRAY",
225
+ "items": {"type": "STRING"}
226
+ },
227
+ "estimated_total_hours": {
228
+ "type": "NUMBER"
229
+ }
230
+ },
231
+ "required": [
232
+ "summary", "issues", "transformation_steps",
233
+ "dependencies", "estimated_total_hours"
234
+ ]
235
+ }
236
+
237
+ @staticmethod
238
+ def test_generation() -> Dict[str, Any]:
239
+ """Schema for test generation metadata."""
240
+ return {
241
+ "type": "OBJECT",
242
+ "properties": {
243
+ "test_framework": {"type": "STRING"},
244
+ "test_count": {
245
+ "type": "INTEGER"
246
+ },
247
+ "coverage_areas": {
248
+ "type": "ARRAY",
249
+ "items": {"type": "STRING"}
250
+ },
251
+ "test_types": {
252
+ "type": "ARRAY",
253
+ "items": {
254
+ "type": "STRING",
255
+ "enum": ["unit", "integration", "edge_case", "error_handling"]
256
+ }
257
+ },
258
+ "notes": {"type": "STRING"}
259
+ },
260
+ "required": ["test_framework", "test_count", "coverage_areas", "test_types", "notes"]
261
+ }
src/mcp/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ """
2
+ MCP (Model Context Protocol) integration module.
3
+ Manages connections to multiple MCP servers.
4
+ """
5
+
6
+ # Avoid circular import by not importing at module level
7
+ # Import these when needed in your code instead
8
+
9
+ __all__ = ['MCPManager', 'MemoryMCPClient', 'SearchMCPClient', 'GitHubMCPClient']
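
Because the package `__init__` deliberately avoids module-level imports, callers import the clients straight from their submodules, for example:

```python
# Import from the submodules directly, as the comment above suggests.
from src.mcp.manager import MCPManager
from src.mcp.memory_client import MemoryMCPClient

manager = MCPManager()
manager.register_memory_server()
memory = MemoryMCPClient(manager)
```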
src/mcp/github_client.py ADDED
@@ -0,0 +1,407 @@
1
+ """
2
+ GitHub MCP Client - Creates PRs using GitHub MCP server.
3
+ Phase 5: Automated PR creation with comprehensive documentation.
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ import time
9
+ import json
10
+ from typing import Dict, List, Optional
11
+
12
+ # Lazy imports to avoid circular dependency issues
13
+ ClientSession = None
14
+ StdioServerParameters = None
15
+ stdio_client = None
16
+
17
+ def _ensure_mcp_imports():
18
+ """Lazy load MCP imports to avoid circular dependency."""
19
+ global ClientSession, StdioServerParameters, stdio_client
20
+ if ClientSession is None:
21
+ from mcp import ClientSession as CS, StdioServerParameters as SSP
22
+ from mcp.client.stdio import stdio_client as sc
23
+ ClientSession = CS
24
+ StdioServerParameters = SSP
25
+ stdio_client = sc
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ class GitHubMCPClient:
31
+ """
32
+ GitHub MCP client for automated PR creation.
33
+ Uses Model Context Protocol to interact with GitHub.
34
+ """
35
+
36
+ def __init__(self, github_token: Optional[str] = None):
37
+ """
38
+ Initialize GitHub MCP Client.
39
+
40
+ Args:
41
+ github_token: Optional GitHub token. If not provided, uses GITHUB_TOKEN from environment.
42
+ """
43
+ self.github_token = github_token or os.getenv("GITHUB_TOKEN")
44
+ if not self.github_token:
45
+ logger.warning("GITHUB_TOKEN not set - PR creation will be disabled")
46
+
47
+ logger.info("GitHubMCPClient initialized")
48
+
49
+ async def create_pr(
50
+ self,
51
+ repo_url: str,
52
+ changed_files: Dict[str, str],
53
+ pr_summary: str,
54
+ test_results: Dict,
55
+ base_branch: str = "main"
56
+ ) -> Dict:
57
+ """
58
+ Create GitHub PR using MCP.
59
+
60
+ Args:
61
+ repo_url: GitHub repository URL (e.g., "owner/repo")
62
+ changed_files: Dictionary mapping file paths to new content
63
+ pr_summary: PR description summary
64
+ test_results: Test execution results
65
+ base_branch: Base branch to merge into
66
+
67
+ Returns:
68
+ Dictionary with PR URL and details
69
+ """
70
+ _ensure_mcp_imports() # Lazy load MCP
71
+
72
+ if not self.github_token:
73
+ return {
74
+ "success": False,
75
+ "error": "GITHUB_TOKEN not configured"
76
+ }
77
+
78
+ logger.info(f"Creating PR for {repo_url}")
79
+
80
+ try:
81
+ # Configure GitHub MCP server
82
+ server_params = StdioServerParameters(
83
+ command="npx",
84
+ args=["-y", "@modelcontextprotocol/server-github"],
85
+ env={"GITHUB_PERSONAL_ACCESS_TOKEN": self.github_token}
86
+ )
87
+
88
+ async with stdio_client(server_params) as (read, write):
89
+ async with ClientSession(read, write) as session:
90
+ await session.initialize()
91
+
92
+ # Create branch
93
+ branch_name = f"modernize/auto-{int(time.time())}"
94
+ logger.info(f"Creating branch: {branch_name}")
95
+
96
+ try:
97
+ await session.call_tool(
98
+ "create_branch",
99
+ arguments={
100
+ "repo": repo_url,
101
+ "branch": branch_name,
102
+ "from_branch": base_branch
103
+ }
104
+ )
105
+ except Exception as e:
106
+ logger.error(f"Error creating branch: {e}")
107
+ return {"success": False, "error": f"Branch creation failed: {e}"}
108
+
109
+ # Commit files (batch by 10 files)
110
+ file_items = list(changed_files.items())
111
+ for i in range(0, len(file_items), 10):
112
+ batch = file_items[i:i+10]
113
+ files_payload = [
114
+ {"path": path, "content": content}
115
+ for path, content in batch
116
+ ]
117
+
118
+ try:
119
+ await session.call_tool(
120
+ "push_files",
121
+ arguments={
122
+ "repo": repo_url,
123
+ "branch": branch_name,
124
+ "files": files_payload,
125
+ "message": f"Modernize batch {i//10 + 1}"
126
+ }
127
+ )
128
+ except Exception as e:
129
+ logger.error(f"Error pushing files: {e}")
130
+
131
+ # Generate comprehensive PR description
132
+ pr_description = self._generate_pr_description(
133
+ pr_summary,
134
+ test_results,
135
+ changed_files
136
+ )
137
+
138
+ # Create pull request
139
+ logger.info("Creating pull request")
140
+ pr_result = await session.call_tool(
141
+ "create_pull_request",
142
+ arguments={
143
+ "repo": repo_url,
144
+ "title": "[Automated] Modernize codebase",
145
+ "body": pr_description,
146
+ "head": branch_name,
147
+ "base": base_branch,
148
+ "draft": False
149
+ }
150
+ )
151
+
152
+ logger.info(f"PR created successfully: {pr_result}")
153
+
154
+ return {
155
+ "success": True,
156
+ "pr_url": pr_result.get("url", ""),
157
+ "pr_number": pr_result.get("number", 0),
158
+ "branch": branch_name
159
+ }
160
+
161
+ except Exception as e:
162
+ logger.error(f"Error creating PR: {e}")
163
+ return {
164
+ "success": False,
165
+ "error": str(e)
166
+ }
167
+
168
+ def _generate_pr_description(
169
+ self,
170
+ summary: str,
171
+ test_results: Dict,
172
+ changed_files: Dict[str, str]
173
+ ) -> str:
174
+ """
175
+ Generate comprehensive PR description.
176
+
177
+ Args:
178
+ summary: High-level summary
179
+ test_results: Test execution results
180
+ changed_files: Changed files dictionary
181
+
182
+ Returns:
183
+ Formatted PR description in Markdown
184
+ """
185
+ # Calculate statistics
186
+ total_files = len(changed_files)
187
+ total_lines_added = sum(content.count('\n') for content in changed_files.values())
188
+
189
+ tests_passed = test_results.get('tests_passed', 0)
190
+ tests_run = test_results.get('tests_run', 0)
191
+ pass_rate = (tests_passed / tests_run * 100) if tests_run > 0 else 0
192
+ coverage = test_results.get('coverage_percent', 0)
193
+
194
+ description = f"""## 🤖 Auto-generated by Legacy Code Modernizer Agent
195
+
196
+ ## Summary
197
+ {summary}
198
+
199
+ ## Key Changes
200
+
201
+ ### Files Modified
202
+ - **Total files changed**: {total_files}
203
+ - **Lines added**: +{total_lines_added}
204
+ - **Modernization patterns applied**: Multiple (see details below)
205
+
206
+ ### Testing Results
207
+ ✅ **{tests_passed}/{tests_run} tests passed** ({pass_rate:.1f}% pass rate)
208
+ - Test coverage: {coverage:.1f}%
209
+ - Execution time: {test_results.get('execution_time', 0):.2f}s
210
+ - All tests run in isolated Modal sandbox
211
+
212
+ ## Risk Assessment: **MEDIUM** ⚠️
213
+
214
+ ### Why Medium Risk:
215
+ - Automated code transformation requires thorough review
216
+ - Database and API changes need integration testing
217
+ - Environment variables may need configuration
218
+
219
+ ### Mitigation Steps:
220
+ 1. ✅ All changes validated in sandbox environment
221
+ 2. ✅ Comprehensive test suite generated and passing
222
+ 3. ✅ Rollback plan included below
223
+ 4. ⚠️ Manual review recommended before merging
224
+
225
+ ## Deployment Checklist
226
+
227
+ **Before merging:**
228
+ - [ ] Review all file changes
229
+ - [ ] Verify environment variables are configured
230
+ - [ ] Run integration tests against staging
231
+ - [ ] Check for breaking changes in dependencies
232
+ - [ ] Update documentation if needed
233
+
234
+ **After merging:**
235
+ - [ ] Monitor application logs for errors
236
+ - [ ] Check performance metrics
237
+ - [ ] Verify all features working as expected
238
+
239
+ ## Rollback Plan
240
+
241
+ If issues arise after deployment:
242
+
243
+ ### Immediate Rollback (< 5 minutes)
244
+ ```bash
245
+ # Revert to previous commit
246
+ git revert HEAD
247
+ git push origin main
248
+ ```
249
+
250
+ ### Alternative: Redeploy Previous Version
251
+ ```bash
252
+ # Checkout previous commit
253
+ git checkout HEAD~1
254
+ # Deploy previous version
255
+ ./deploy.sh
256
+ ```
257
+
258
+ ## Test Details
259
+
260
+ <details>
261
+ <summary>Click to expand test execution logs</summary>
262
+
263
+ ```
264
+ {test_results.get('stdout', 'No test output available')[:2000]}
265
+ ```
266
+
267
+ </details>
268
+
269
+ ## Changed Files
270
+
271
+ <details>
272
+ <summary>Click to expand file list ({total_files} files)</summary>
273
+
274
+ {self._format_file_list(changed_files)}
275
+
276
+ </details>
277
+
278
+ ---
279
+
280
+ **🙏 Generated with ❤️ by Legacy Code Modernizer**
281
+
282
+ **Pipeline Time**: {test_results.get('execution_time', 0):.1f}s
283
+ **Powered by**: Google Gemini, Nebius AI, LlamaIndex, Modal, MCP
284
+
285
+ **👥 Reviewers**: Please focus on:
286
+ 1. Code quality and maintainability
287
+ 2. Test coverage and edge cases
288
+ 3. Environment configuration requirements
289
+ """
290
+
291
+ return description
292
+
293
+ def _format_file_list(self, changed_files: Dict[str, str]) -> str:
294
+ """Format changed files list for PR description."""
295
+ file_list = []
296
+ for i, file_path in enumerate(sorted(changed_files.keys())[:50], 1):
297
+ file_list.append(f"{i}. `{file_path}`")
298
+
299
+ if len(changed_files) > 50:
300
+ file_list.append(f"\n... and {len(changed_files) - 50} more files")
301
+
302
+ return "\n".join(file_list)
303
+
304
+ async def create_issue(
305
+ self,
306
+ repo_url: str,
307
+ title: str,
308
+ body: str,
309
+ labels: Optional[List[str]] = None
310
+ ) -> Dict:
311
+ """
312
+ Create GitHub issue using MCP.
313
+
314
+ Args:
315
+ repo_url: GitHub repository URL
316
+ title: Issue title
317
+ body: Issue description
318
+ labels: Optional list of labels
319
+
320
+ Returns:
321
+ Dictionary with issue details
322
+ """
323
+ _ensure_mcp_imports() # Lazy load MCP
324
+
325
+ if not self.github_token:
326
+ return {"success": False, "error": "GITHUB_TOKEN not configured"}
327
+
328
+ logger.info(f"Creating issue in {repo_url}")
329
+
330
+ try:
331
+ server_params = StdioServerParameters(
332
+ command="npx",
333
+ args=["-y", "@modelcontextprotocol/server-github"],
334
+ env={"GITHUB_PERSONAL_ACCESS_TOKEN": self.github_token}
335
+ )
336
+
337
+ async with stdio_client(server_params) as (read, write):
338
+ async with ClientSession(read, write) as session:
339
+ await session.initialize()
340
+
341
+ result = await session.call_tool(
342
+ "create_issue",
343
+ arguments={
344
+ "repo": repo_url,
345
+ "title": title,
346
+ "body": body,
347
+ "labels": labels or []
348
+ }
349
+ )
350
+
351
+ return {
352
+ "success": True,
353
+ "issue_url": result.get("url", ""),
354
+ "issue_number": result.get("number", 0)
355
+ }
356
+
357
+ except Exception as e:
358
+ logger.error(f"Error creating issue: {e}")
359
+ return {"success": False, "error": str(e)}
360
+
361
+ async def add_pr_comment(
362
+ self,
363
+ repo_url: str,
364
+ pr_number: int,
365
+ comment: str
366
+ ) -> Dict:
367
+ """
368
+ Add comment to PR.
369
+
370
+ Args:
371
+ repo_url: GitHub repository URL
372
+ pr_number: PR number
373
+ comment: Comment text
374
+
375
+ Returns:
376
+ Success status
377
+ """
378
+ _ensure_mcp_imports() # Lazy load MCP
379
+
380
+ if not self.github_token:
381
+ return {"success": False, "error": "GITHUB_TOKEN not configured"}
382
+
383
+ try:
384
+ server_params = StdioServerParameters(
385
+ command="npx",
386
+ args=["-y", "@modelcontextprotocol/server-github"],
387
+ env={"GITHUB_PERSONAL_ACCESS_TOKEN": self.github_token}
388
+ )
389
+
390
+ async with stdio_client(server_params) as (read, write):
391
+ async with ClientSession(read, write) as session:
392
+ await session.initialize()
393
+
394
+ await session.call_tool(
395
+ "add_comment",
396
+ arguments={
397
+ "repo": repo_url,
398
+ "issue_number": pr_number,
399
+ "body": comment
400
+ }
401
+ )
402
+
403
+ return {"success": True}
404
+
405
+ except Exception as e:
406
+ logger.error(f"Error adding comment: {e}")
407
+ return {"success": False, "error": str(e)}
src/mcp/manager.py ADDED
@@ -0,0 +1,169 @@
1
+ """
2
+ MCP Manager - Central orchestrator for multiple MCP server connections.
3
+ """
4
+
5
+ import os
6
+ import logging
7
+ from typing import Dict, Optional
8
+ from mcp import ClientSession, StdioServerParameters
9
+ from mcp.client.stdio import stdio_client
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class MCPManager:
15
+ """
16
+ Manages multiple MCP server connections and sessions.
17
+ Provides centralized connection pooling and session management.
18
+ """
19
+
20
+ def __init__(self):
21
+ """Initialize MCP Manager."""
22
+ self.servers: Dict[str, StdioServerParameters] = {}
23
+ self.sessions: Dict[str, ClientSession] = {}
24
+ self.active_connections: Dict[str, bool] = {}
25
+
26
+ logger.info("MCPManager initialized")
27
+
28
+ def register_server(self, name: str, command: str, args: list, env: Optional[Dict] = None):
29
+ """
30
+ Register an MCP server configuration.
31
+
32
+ Args:
33
+ name: Unique name for the server
34
+ command: Command to start the server
35
+ args: Arguments for the command
36
+ env: Optional environment variables
37
+ """
38
+ server_params = StdioServerParameters(
39
+ command=command,
40
+ args=args,
41
+ env=env or {}
42
+ )
43
+
44
+ self.servers[name] = server_params
45
+ self.active_connections[name] = False
46
+
47
+ logger.info(f"Registered MCP server: {name}")
48
+
49
+ def register_github_server(self):
50
+ """Register GitHub MCP server."""
51
+ github_token = os.getenv("GITHUB_TOKEN")
52
+ if not github_token:
53
+ logger.warning("GITHUB_TOKEN not set, GitHub MCP will not be available")
54
+ return
55
+
56
+ self.register_server(
57
+ name="github",
58
+ command="npx",
59
+ args=["-y", "@modelcontextprotocol/server-github"],
60
+ env={"GITHUB_PERSONAL_ACCESS_TOKEN": github_token}
61
+ )
62
+
63
+ def register_tavily_server(self):
64
+ """Register Tavily Search MCP server."""
65
+ tavily_key = os.getenv("TAVILY_API_KEY")
66
+ if not tavily_key:
67
+ logger.warning("TAVILY_API_KEY not set, Tavily MCP will not be available")
68
+ return
69
+
70
+ self.register_server(
71
+ name="tavily",
72
+ command="npx",
73
+ args=["-y", "@modelcontextprotocol/server-tavily"],
74
+ env={"TAVILY_API_KEY": tavily_key}
75
+ )
76
+
77
+ def register_memory_server(self):
78
+ """Register Memory MCP server."""
79
+ self.register_server(
80
+ name="memory",
81
+ command="npx",
82
+ args=["-y", "@modelcontextprotocol/server-memory"]
83
+ )
84
+
85
+ def register_filesystem_server(self, allowed_directories: Optional[list] = None):
86
+ """
87
+ Register Filesystem MCP server.
88
+
89
+ Args:
90
+ allowed_directories: List of allowed directories for file access
91
+ """
92
+ args = ["-y", "@modelcontextprotocol/server-filesystem"]
93
+
94
+ if allowed_directories:
95
+ args.extend(allowed_directories)
96
+
97
+ self.register_server(
98
+ name="filesystem",
99
+ command="npx",
100
+ args=args
101
+ )
102
+
103
+ def get_server_params(self, name: str) -> Optional[StdioServerParameters]:
104
+ """
105
+ Get server parameters by name.
106
+
107
+ Args:
108
+ name: Server name
109
+
110
+ Returns:
111
+ Server parameters or None if not found
112
+ """
113
+ return self.servers.get(name)
114
+
115
+ def is_server_registered(self, name: str) -> bool:
116
+ """
117
+ Check if a server is registered.
118
+
119
+ Args:
120
+ name: Server name
121
+
122
+ Returns:
123
+ True if registered, False otherwise
124
+ """
125
+ return name in self.servers
126
+
127
+ def list_servers(self) -> list:
128
+ """
129
+ List all registered servers.
130
+
131
+ Returns:
132
+ List of server names
133
+ """
134
+ return list(self.servers.keys())
135
+
136
+ async def initialize_all_servers(self):
137
+ """Initialize all registered MCP servers."""
138
+ logger.info("Initializing all MCP servers...")
139
+
140
+ for name in self.servers:
141
+ try:
142
+ logger.info(f"Initializing {name} MCP server...")
143
+ # Note: Actual initialization happens when clients connect
144
+ self.active_connections[name] = True
145
+ except Exception as e:
146
+ logger.error(f"Failed to initialize {name}: {e}")
147
+ self.active_connections[name] = False
148
+
149
+ logger.info("MCP server initialization complete")
150
+
151
+ def get_active_servers(self) -> list:
152
+ """
153
+ Get list of active server connections.
154
+
155
+ Returns:
156
+ List of active server names
157
+ """
158
+ return [name for name, active in self.active_connections.items() if active]
159
+
160
+ def register_all_standard_servers(self):
161
+ """Register all standard MCP servers."""
162
+ logger.info("Registering all standard MCP servers...")
163
+
164
+ self.register_github_server()
165
+ self.register_tavily_server()
166
+ self.register_memory_server()
167
+ self.register_filesystem_server()
168
+
169
+ logger.info(f"Registered {len(self.servers)} MCP servers")
src/mcp/memory_client.py ADDED
@@ -0,0 +1,202 @@
1
+ """
2
+ Memory MCP Client - Store and retrieve analysis results using Memory MCP server.
3
+ """
4
+
5
+ import json
6
+ import logging
7
+ from typing import Dict, Optional, Any
8
+ from mcp import ClientSession
9
+ from mcp.client.stdio import stdio_client
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class MemoryMCPClient:
15
+ """
16
+ Client for Memory MCP server to cache analysis results and transformation examples.
17
+ """
18
+
19
+ def __init__(self, mcp_manager):
20
+ """
21
+ Initialize Memory MCP client.
22
+
23
+ Args:
24
+ mcp_manager: MCPManager instance
25
+ """
26
+ self.mcp_manager = mcp_manager
27
+ self.server_name = "memory"
28
+
29
+ logger.info("MemoryMCPClient initialized")
30
+
31
+ async def store_pattern_analysis(self, pattern_id: str, analysis: Dict) -> bool:
32
+ """
33
+ Store pattern analysis in MCP memory.
34
+
35
+ Args:
36
+ pattern_id: Unique identifier for the pattern
37
+ analysis: Analysis data to store
38
+
39
+ Returns:
40
+ True if successful, False otherwise
41
+ """
42
+ try:
43
+ server_params = self.mcp_manager.get_server_params(self.server_name)
44
+ if not server_params:
45
+ logger.error(f"{self.server_name} MCP server not registered")
46
+ return False
47
+
48
+ async with stdio_client(server_params) as (read, write):
49
+ async with ClientSession(read, write) as session:
50
+ await session.initialize()
51
+
52
+ # Store entity in memory
53
+ result = await session.call_tool(
54
+ "store_entity",
55
+ arguments={
56
+ "name": f"pattern_{pattern_id}",
57
+ "content": json.dumps(analysis)
58
+ }
59
+ )
60
+
61
+ logger.info(f"Stored pattern analysis: {pattern_id}")
62
+ return True
63
+
64
+ except Exception as e:
65
+ logger.error(f"Error storing pattern analysis: {e}")
66
+ return False
67
+
68
+ async def retrieve_pattern_analysis(self, pattern_id: str) -> Optional[Dict]:
69
+ """
70
+ Retrieve cached pattern analysis.
71
+
72
+ Args:
73
+ pattern_id: Unique identifier for the pattern
74
+
75
+ Returns:
76
+ Analysis data or None if not found
77
+ """
78
+ try:
79
+ server_params = self.mcp_manager.get_server_params(self.server_name)
80
+ if not server_params:
81
+ logger.error(f"{self.server_name} MCP server not registered")
82
+ return None
83
+
84
+ async with stdio_client(server_params) as (read, write):
85
+ async with ClientSession(read, write) as session:
86
+ await session.initialize()
87
+
88
+ # Retrieve entity from memory
89
+ result = await session.call_tool(
90
+ "retrieve_entity",
91
+ arguments={"name": f"pattern_{pattern_id}"}
92
+ )
93
+
94
+ if result and hasattr(result, 'content'):
95
+ data = json.loads(result.content[0].text)
96
+ logger.info(f"Retrieved pattern analysis: {pattern_id}")
97
+ return data
98
+
99
+ return None
100
+
101
+ except Exception as e:
102
+ logger.error(f"Error retrieving pattern analysis: {e}")
103
+ return None
104
+
105
+ async def store_transformation_example(self, example_id: str, example: Dict) -> bool:
106
+ """
107
+ Store a successful transformation example.
108
+
109
+ Args:
110
+ example_id: Unique identifier for the example
111
+ example: Example data containing before/after code
112
+
113
+ Returns:
114
+ True if successful, False otherwise
115
+ """
116
+ try:
117
+ server_params = self.mcp_manager.get_server_params(self.server_name)
118
+ if not server_params:
119
+ logger.error(f"{self.server_name} MCP server not registered")
120
+ return False
121
+
122
+ async with stdio_client(server_params) as (read, write):
123
+ async with ClientSession(read, write) as session:
124
+ await session.initialize()
125
+
126
+ result = await session.call_tool(
127
+ "store_entity",
128
+ arguments={
129
+ "name": f"example_{example_id}",
130
+ "content": json.dumps(example)
131
+ }
132
+ )
133
+
134
+ logger.info(f"Stored transformation example: {example_id}")
135
+ return True
136
+
137
+ except Exception as e:
138
+ logger.error(f"Error storing transformation example: {e}")
139
+ return False
140
+
141
+ async def get_transformation_examples(self, pattern_type: str, limit: int = 5) -> list:
142
+ """
143
+ Retrieve transformation examples for a pattern type.
144
+
145
+ Args:
146
+ pattern_type: Type of pattern to get examples for
147
+ limit: Maximum number of examples to return
148
+
149
+ Returns:
150
+ List of transformation examples
151
+ """
152
+ try:
153
+ server_params = self.mcp_manager.get_server_params(self.server_name)
154
+ if not server_params:
155
+ logger.error(f"{self.server_name} MCP server not registered")
156
+ return []
157
+
158
+ async with stdio_client(server_params) as (read, write):
159
+ async with ClientSession(read, write) as session:
160
+ await session.initialize()
161
+
162
+ # Search for examples matching pattern type
163
+ # Note: This is a simplified implementation
164
+ # In production, you'd want more sophisticated querying
165
+ examples = []
166
+
167
+ for i in range(limit):
168
+ try:
169
+ result = await session.call_tool(
170
+ "retrieve_entity",
171
+ arguments={"name": f"example_{pattern_type}_{i}"}
172
+ )
173
+
174
+ if result and hasattr(result, 'content'):
175
+ example = json.loads(result.content[0].text)
176
+ examples.append(example)
177
+ except Exception:
178
+ break
179
+
180
+ logger.info(f"Retrieved {len(examples)} transformation examples")
181
+ return examples
182
+
183
+ except Exception as e:
184
+ logger.error(f"Error retrieving transformation examples: {e}")
185
+ return []
186
+
187
+ async def clear_cache(self) -> bool:
188
+ """
189
+ Clear all cached data.
190
+
191
+ Returns:
192
+ True if successful, False otherwise
193
+ """
194
+ try:
195
+ # Note: Memory MCP may not have a clear_all method
196
+ # This is a placeholder for future implementation
197
+ logger.info("Cache cleared (placeholder)")
198
+ return True
199
+
200
+ except Exception as e:
201
+ logger.error(f"Error clearing cache: {e}")
202
+ return False
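
The store/retrieve helpers are symmetric wrappers around the Memory server's entity tools, with everything serialized as JSON strings. A minimal round-trip sketch — the pattern id and payload are placeholders, and `npx` must be available so the Memory server can be launched:

```python
# Minimal round-trip sketch for the Memory MCP cache.
import asyncio
from src.mcp.manager import MCPManager
from src.mcp.memory_client import MemoryMCPClient

async def main() -> None:
    manager = MCPManager()
    manager.register_memory_server()
    memory = MemoryMCPClient(manager)

    stored = await memory.store_pattern_analysis(
        "sql_injection_001",
        {"pattern_type": "sql_injection", "severity": "critical"},
    )
    if stored:
        print(await memory.retrieve_pattern_analysis("sql_injection_001"))

asyncio.run(main())
```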
src/mcp/search_client.py ADDED
@@ -0,0 +1,247 @@
1
+ """
2
+ Search MCP Client - Find migration guides and documentation using Tavily MCP server.
3
+ """
4
+
5
+ import logging
6
+ from typing import List, Dict, Optional
7
+ from mcp import ClientSession
8
+ from mcp.client.stdio import stdio_client
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class SearchMCPClient:
14
+ """
15
+ Client for Tavily Search MCP server to find migration guides and best practices.
16
+ """
17
+
18
+ def __init__(self, mcp_manager):
19
+ """
20
+ Initialize Search MCP client.
21
+
22
+ Args:
23
+ mcp_manager: MCPManager instance
24
+ """
25
+ self.mcp_manager = mcp_manager
26
+ self.server_name = "tavily"
27
+
28
+ logger.info("SearchMCPClient initialized")
29
+
30
+ async def find_migration_guide(self, from_tech: str, to_tech: str, max_results: int = 5) -> List[Dict]:
31
+ """
32
+ Find migration documentation for technology upgrade.
33
+
34
+ Args:
35
+ from_tech: Source technology (e.g., "Python 2.7")
36
+ to_tech: Target technology (e.g., "Python 3.12")
37
+ max_results: Maximum number of results to return
38
+
39
+ Returns:
40
+ List of search results with URLs and snippets
41
+ """
42
+ try:
43
+ server_params = self.mcp_manager.get_server_params(self.server_name)
44
+ if not server_params:
45
+ logger.warning(f"{self.server_name} MCP server not registered, returning empty results")
46
+ return []
47
+
48
+ query = f"{from_tech} to {to_tech} migration guide best practices"
49
+
50
+ async with stdio_client(server_params) as (read, write):
51
+ async with ClientSession(read, write) as session:
52
+ await session.initialize()
53
+
54
+ result = await session.call_tool(
55
+ "search",
56
+ arguments={
57
+ "query": query,
58
+ "max_results": max_results
59
+ }
60
+ )
61
+
62
+ # Parse results
63
+ results = []
64
+ if result and hasattr(result, 'content'):
65
+ for item in result.content:
66
+ if hasattr(item, 'text'):
67
+ results.append({
68
+ 'title': item.text.get('title', ''),
69
+ 'url': item.text.get('url', ''),
70
+ 'snippet': item.text.get('snippet', ''),
71
+ 'score': item.text.get('score', 0)
72
+ })
73
+
74
+ logger.info(f"Found {len(results)} migration guides for {from_tech} to {to_tech}")
75
+ return results
76
+
77
+ except Exception as e:
78
+ logger.error(f"Error finding migration guide: {e}")
79
+ return []
80
+
81
+ async def find_library_documentation(self, library_name: str, version: Optional[str] = None) -> List[Dict]:
82
+ """
83
+ Find official documentation for a library.
84
+
85
+ Args:
86
+ library_name: Name of the library
87
+ version: Optional specific version
88
+
89
+ Returns:
90
+ List of documentation links
91
+ """
92
+ try:
93
+ server_params = self.mcp_manager.get_server_params(self.server_name)
94
+ if not server_params:
95
+ logger.warning(f"{self.server_name} MCP server not registered, returning empty results")
96
+ return []
97
+
98
+ query = f"{library_name} official documentation"
99
+ if version:
100
+ query += f" version {version}"
101
+
102
+ async with stdio_client(server_params) as (read, write):
103
+ async with ClientSession(read, write) as session:
104
+ await session.initialize()
105
+
106
+ result = await session.call_tool(
107
+ "search",
108
+ arguments={
109
+ "query": query,
110
+ "max_results": 3
111
+ }
112
+ )
113
+
114
+ results = []
115
+ if result and hasattr(result, 'content'):
116
+ for item in result.content:
117
+ if hasattr(item, 'text'):
118
+ results.append({
119
+ 'title': item.text.get('title', ''),
120
+ 'url': item.text.get('url', ''),
121
+ 'snippet': item.text.get('snippet', '')
122
+ })
123
+
124
+ logger.info(f"Found {len(results)} documentation links for {library_name}")
125
+ return results
126
+
127
+ except Exception as e:
128
+ logger.error(f"Error finding library documentation: {e}")
129
+ return []
130
+
131
+ async def find_best_practices(self, topic: str, language: str = "python") -> List[Dict]:
132
+ """
133
+ Find best practices for a specific topic.
134
+
135
+ Args:
136
+ topic: Topic to search for (e.g., "database connection pooling")
137
+ language: Programming language
138
+
139
+ Returns:
140
+ List of best practice resources
141
+ """
142
+ try:
143
+ server_params = self.mcp_manager.get_server_params(self.server_name)
144
+ if not server_params:
145
+ logger.warning(f"{self.server_name} MCP server not registered, returning empty results")
146
+ return []
147
+
148
+ query = f"{language} {topic} best practices 2024"
149
+
150
+ async with stdio_client(server_params) as (read, write):
151
+ async with ClientSession(read, write) as session:
152
+ await session.initialize()
153
+
154
+ result = await session.call_tool(
155
+ "search",
156
+ arguments={
157
+ "query": query,
158
+ "max_results": 5
159
+ }
160
+ )
161
+
162
+ results = []
163
+ if result and hasattr(result, 'content'):
164
+ for item in result.content:
165
+ if hasattr(item, 'text'):
166
+ results.append({
167
+ 'title': item.text.get('title', ''),
168
+ 'url': item.text.get('url', ''),
169
+ 'snippet': item.text.get('snippet', '')
170
+ })
171
+
172
+ logger.info(f"Found {len(results)} best practice resources for {topic}")
173
+ return results
174
+
175
+ except Exception as e:
176
+ logger.error(f"Error finding best practices: {e}")
177
+ return []
178
+
179
+ async def find_security_vulnerabilities(self, pattern: str, language: str = "python") -> List[Dict]:
180
+ """
181
+ Find information about security vulnerabilities in a code pattern.
182
+
183
+ Args:
184
+ pattern: Code pattern to check (e.g., "SQL string interpolation")
185
+ language: Programming language
186
+
187
+ Returns:
188
+ List of security resources
189
+ """
190
+ try:
191
+ server_params = self.mcp_manager.get_server_params(self.server_name)
192
+ if not server_params:
193
+ logger.warning(f"{self.server_name} MCP server not registered, returning empty results")
194
+ return []
195
+
196
+ query = f"{language} {pattern} security vulnerability CVE"
197
+
198
+ async with stdio_client(server_params) as (read, write):
199
+ async with ClientSession(read, write) as session:
200
+ await session.initialize()
201
+
202
+ result = await session.call_tool(
203
+ "search",
204
+ arguments={
205
+ "query": query,
206
+ "max_results": 5
207
+ }
208
+ )
209
+
210
+ results = []
211
+ if result and hasattr(result, 'content'):
212
+ for item in result.content:
213
+ if hasattr(item, 'text'):
214
+ results.append({
215
+ 'title': item.text.get('title', ''),
216
+ 'url': item.text.get('url', ''),
217
+ 'snippet': item.text.get('snippet', ''),
218
+ 'severity': self._extract_severity(item.text.get('snippet', ''))
219
+ })
220
+
221
+ logger.info(f"Found {len(results)} security resources for {pattern}")
222
+ return results
223
+
224
+ except Exception as e:
225
+ logger.error(f"Error finding security vulnerabilities: {e}")
226
+ return []
227
+
228
+ def _extract_severity(self, text: str) -> str:
229
+ """
230
+ Extract severity level from text.
231
+
232
+ Args:
233
+ text: Text to analyze
234
+
235
+ Returns:
236
+ Severity level (critical, high, medium, low, unknown)
237
+ """
238
+ text_lower = text.lower()
239
+ if 'critical' in text_lower:
240
+ return 'critical'
241
+ elif 'high' in text_lower:
242
+ return 'high'
243
+ elif 'medium' in text_lower or 'moderate' in text_lower:
244
+ return 'medium'
245
+ elif 'low' in text_lower:
246
+ return 'low'
247
+ return 'unknown'
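For context, a rough usage sketch for the search client above (`src/mcp/search_client.py`); the `MCPManager` wiring and the `src.mcp.search_client` import path are assumptions based on this commit's layout, and a Tavily API key must be registered with the manager:

```python
import asyncio

from src.mcp.search_client import SearchMCPClient  # path per this commit's layout


async def demo(mcp_manager) -> None:
    search = SearchMCPClient(mcp_manager)
    guides = await search.find_migration_guide("Python 2.7", "Python 3.12", max_results=3)
    for guide in guides:
        print(f"{guide['title']} -> {guide['url']}")

# asyncio.run(demo(manager))  # requires a configured MCPManager with the "tavily" server registered
```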
src/sandbox/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Sandbox module for secure test execution."""
2
+
3
+ from .validator import ModalSandboxValidator, app
4
+
5
+ __all__ = ['ModalSandboxValidator', 'app']
src/sandbox/config.py ADDED
@@ -0,0 +1,124 @@
1
+ """
2
+ Sandbox execution configuration.
3
+ Handles environment-specific settings for local vs Hugging Face deployment.
4
+ """
5
+
6
+ import os
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ def is_huggingface_space() -> bool:
13
+ """Detect if running in Hugging Face Spaces environment."""
14
+ return os.getenv("SPACE_ID") is not None or os.getenv("SYSTEM") == "spaces"
15
+
16
+
17
+ def is_modal_configured() -> bool:
18
+ """Check if Modal is properly configured with credentials."""
19
+ # Check for Modal token in environment
20
+ token_id = os.getenv("MODAL_TOKEN_ID")
21
+ token_secret = os.getenv("MODAL_TOKEN_SECRET")
22
+
23
+ # Check if modal config exists
24
+ modal_config_exists = os.path.exists(os.path.expanduser("~/.modal.toml"))
25
+
26
+ return bool((token_id and token_secret) or modal_config_exists)
27
+
28
+
29
+ def get_execution_mode() -> str:
30
+ """
31
+ Determine the execution mode based on environment.
32
+
33
+ Returns:
34
+ "modal" - Use Modal for execution (required for Hugging Face)
35
+ "local" - Use local subprocess execution
36
+ "auto" - Try Modal first, fallback to local
37
+ """
38
+ # Explicit mode from environment
39
+ mode = os.getenv("EXECUTION_MODE", "").lower()
40
+ if mode in ("modal", "local", "auto"):
41
+ return mode
42
+
43
+ # Auto-detect based on environment
44
+ if is_huggingface_space():
45
+ # Hugging Face Spaces MUST use Modal
46
+ if is_modal_configured():
47
+ logger.info("Hugging Face Spaces detected - using Modal execution")
48
+ return "modal"
49
+ else:
50
+ logger.error("Hugging Face Spaces detected but Modal not configured!")
51
+ logger.error("Set MODAL_TOKEN_ID and MODAL_TOKEN_SECRET environment variables")
52
+ return "modal" # Still return modal, will fail with clear error
53
+
54
+ # Local development - try Modal first, fallback to local
55
+ if is_modal_configured():
56
+ return "auto"
57
+ else:
58
+ logger.info("Modal not configured - using local execution")
59
+ return "local"
60
+
61
+
62
+ def should_prefer_modal() -> bool:
63
+ """Determine if Modal should be preferred over local execution."""
64
+ mode = get_execution_mode()
65
+
66
+ if mode == "modal":
67
+ return True
68
+ elif mode == "local":
69
+ return False
70
+ else: # auto
71
+ return is_modal_configured()
72
+
73
+
74
+ def validate_environment():
75
+ """
76
+ Validate that the environment is properly configured.
77
+ Raises warnings or errors for configuration issues.
78
+ """
79
+ mode = get_execution_mode()
80
+ is_hf = is_huggingface_space()
81
+ modal_ok = is_modal_configured()
82
+
83
+ if is_hf and not modal_ok:
84
+ logger.error("=" * 60)
85
+ logger.error("CONFIGURATION ERROR: Hugging Face Spaces Deployment")
86
+ logger.error("=" * 60)
87
+ logger.error("Modal is REQUIRED for Hugging Face Spaces but not configured.")
88
+ logger.error("")
89
+ logger.error("To fix this:")
90
+ logger.error("1. Get Modal token from: https://modal.com/settings")
91
+ logger.error("2. Set Hugging Face Secrets:")
92
+ logger.error(" - MODAL_TOKEN_ID")
93
+ logger.error(" - MODAL_TOKEN_SECRET")
94
+ logger.error("3. Restart the Space")
95
+ logger.error("=" * 60)
96
+ return False
97
+
98
+ if mode == "modal" and not modal_ok:
99
+ logger.warning("Execution mode set to 'modal' but Modal not configured")
100
+ logger.warning("Tests will fail until Modal is configured")
101
+ return False
102
+
103
+ if mode == "local" and is_hf:
104
+ logger.warning("Local execution mode on Hugging Face Spaces will not work")
105
+ logger.warning("Change EXECUTION_MODE to 'modal'")
106
+ return False
107
+
108
+ # All good
109
+ logger.info(f"Environment validated: mode={mode}, huggingface={is_hf}, modal_configured={modal_ok}")
110
+ return True
111
+
112
+
113
+ # Configuration values
114
+ EXECUTION_MODE = get_execution_mode()
115
+ PREFER_MODAL = should_prefer_modal()
116
+ IS_HUGGINGFACE = is_huggingface_space()
117
+ MODAL_CONFIGURED = is_modal_configured()
118
+
119
+ # Log configuration on import
120
+ logger.info("Sandbox Configuration:")
121
+ logger.info(f" Execution Mode: {EXECUTION_MODE}")
122
+ logger.info(f" Prefer Modal: {PREFER_MODAL}")
123
+ logger.info(f" Hugging Face: {IS_HUGGINGFACE}")
124
+ logger.info(f" Modal Configured: {MODAL_CONFIGURED}")
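A short sketch of exercising the detection helpers in `src/sandbox/config.py`. The environment variables must be set before import because the module-level constants are computed at import time; the credential values are placeholders:

```python
import os

os.environ["EXECUTION_MODE"] = "auto"        # explicit override takes precedence
os.environ["MODAL_TOKEN_ID"] = "ak-placeholder"
os.environ["MODAL_TOKEN_SECRET"] = "as-placeholder"

from src.sandbox.config import get_execution_mode, should_prefer_modal, validate_environment

print(get_execution_mode())    # "auto"
print(should_prefer_modal())   # True, since Modal credentials are present
print(validate_environment())  # True when the combination is consistent
```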
src/sandbox/images.py ADDED
@@ -0,0 +1,122 @@
1
+ """
2
+ Modal Container Images for Multi-Language Test Execution.
3
+ Defines secure, isolated container images for each supported language.
4
+ """
5
+
6
+ import logging
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+ # Try to import Modal
11
+ try:
12
+ import modal
13
+ MODAL_AVAILABLE = True
14
+ except ImportError:
15
+ MODAL_AVAILABLE = False
16
+ modal = None
17
+ logger.warning("Modal not available - will use local execution only")
18
+
19
+ # Create Modal app only if available
20
+ if MODAL_AVAILABLE:
21
+ app = modal.App("legacy-code-validator")
22
+
23
+ # ============================================================================
24
+ # SUPPORTED LANGUAGES (Production Ready)
25
+ # ============================================================================
26
+
27
+ python_image = (
28
+ modal.Image.debian_slim()
29
+ .pip_install(
30
+ "pytest>=9.0.0",
31
+ "pytest-cov>=6.0.0",
32
+ "pytest-timeout>=2.3.0",
33
+ "pytest-benchmark>=4.0.0",
34
+ "pytest-mock>=3.12.0"
35
+ )
36
+ )
37
+
38
+ java_image = (
39
+ modal.Image.debian_slim()
40
+ .apt_install("openjdk-17-jdk", "maven", "wget")
41
+ .run_commands(
42
+ "mvn --version"
43
+ )
44
+ )
45
+
46
+ javascript_image = (
47
+ modal.Image.debian_slim()
48
+ .apt_install(
49
+ "curl", "ca-certificates", "gnupg", "libxt6", "libxmu6", "libxaw7",
50
+ "build-essential", "python3", "git"
51
+ )
52
+ .run_commands(
53
+ # Install Node.js 20.x
54
+ "mkdir -p /etc/apt/keyrings",
55
+ "curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg",
56
+ "echo 'deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main' | tee /etc/apt/sources.list.d/nodesource.list",
57
+ "apt-get update",
58
+ "apt-get install -y nodejs",
59
+ # Pre-install Jest globally for faster test execution
60
+ "npm install -g jest@latest ts-jest@latest typescript@latest @types/jest@latest",
61
+ # Create a working directory and set permissions
62
+ "mkdir -p /workspace",
63
+ "chmod 777 /workspace",
64
+ "node --version",
65
+ "npm --version",
66
+ "jest --version"
67
+ )
68
+ )
69
+
70
+ # TypeScript uses same image as JavaScript
71
+ typescript_image = javascript_image
72
+
73
+ # ============================================================================
74
+ # IMAGE REGISTRY
75
+ # ============================================================================
76
+
77
+ LANGUAGE_IMAGES = {
78
+ # Supported Languages
79
+ 'python': python_image,
80
+ 'java': java_image,
81
+ 'javascript': javascript_image,
82
+ 'typescript': typescript_image
83
+ }
84
+
85
+ # Support status for UI display
86
+ LANGUAGE_SUPPORT_STATUS = {
87
+ 'python': 'production',
88
+ 'java': 'production',
89
+ 'javascript': 'production',
90
+ 'typescript': 'production'
91
+ }
92
+
93
+ else:
94
+ # Fallback when Modal not available
95
+ app = None
96
+ LANGUAGE_IMAGES = {}
97
+ LANGUAGE_SUPPORT_STATUS = {}
98
+ python_image = None
99
+ java_image = None
100
+ javascript_image = None
101
+ typescript_image = None
102
+
103
+
104
+ def get_image_for_language(language: str):
105
+ """Get the appropriate Modal image for a language."""
106
+ if not MODAL_AVAILABLE:
107
+ return None
108
+
109
+ return LANGUAGE_IMAGES.get(language.lower())
110
+
111
+
112
+ def get_support_status(language: str) -> str:
113
+ """Get support status for a language: production, experimental, planned, or unsupported."""
114
+ if not MODAL_AVAILABLE:
115
+ return 'local_only'
116
+
117
+ return LANGUAGE_SUPPORT_STATUS.get(language.lower(), 'unsupported')
118
+
119
+
120
+ def is_language_supported(language: str) -> bool:
121
+ """Check if a language is supported in Modal."""
122
+ return language.lower() in LANGUAGE_IMAGES
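As a quick illustration of the image registry in `src/sandbox/images.py` (import path assumed from this commit's layout; results differ when Modal is not installed):

```python
from src.sandbox.images import (
    get_image_for_language,
    get_support_status,
    is_language_supported,
)

for lang in ("python", "typescript", "cobol"):
    print(lang, is_language_supported(lang), get_support_status(lang))
    # e.g. "python True production" ... "cobol False unsupported" (or "local_only" without Modal)

java_image = get_image_for_language("java")  # a modal.Image, or None when Modal is unavailable
```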
src/sandbox/modal_executor.py ADDED
@@ -0,0 +1,423 @@
1
+ """
2
+ Modal-based test executor using Modal Sandboxes for multi-language support.
3
+ Uses Sandbox.exec() API for more flexible and reliable language execution.
4
+ Supports: Python, Java, JavaScript, TypeScript, and more.
5
+ """
6
+
7
+ import logging
8
+ import tempfile
9
+ import json
10
+ from typing import Dict, List
11
+ from pathlib import Path
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+ # Try to import Modal
16
+ try:
17
+ import modal
18
+ import os
19
+
20
+ # Configure Modal authentication from environment if available
21
+ token_id = os.getenv("MODAL_TOKEN_ID")
22
+ token_secret = os.getenv("MODAL_TOKEN_SECRET")
23
+
24
+ if token_id and token_secret:
25
+ # Set Modal credentials from environment variables
26
+ # This is needed for Hugging Face Spaces deployment
27
+ os.environ["MODAL_TOKEN_ID"] = token_id
28
+ os.environ["MODAL_TOKEN_SECRET"] = token_secret
29
+ logger.info("Modal credentials loaded from environment")
30
+
31
+ MODAL_AVAILABLE = True
32
+ except ImportError:
33
+ MODAL_AVAILABLE = False
34
+ modal = None
35
+ logger.warning("Modal not available - install with: pip install modal")
36
+
37
+ if MODAL_AVAILABLE:
38
+ from .images import LANGUAGE_IMAGES
39
+
40
+ def _execute_python_in_sandbox(sb: modal.Sandbox, code: str, tests: str,
41
+ module_name: str) -> Dict:
42
+ """Execute Python tests in Modal Sandbox using pytest."""
43
+ try:
44
+ # Ensure workspace directory exists
45
+ p = sb.exec("mkdir", "-p", "/workspace", timeout=30)
46
+ p.wait()
47
+
48
+ # Create a combined test file
49
+ test_content = f"""# Test module
50
+ {code}
51
+
52
+ # Tests
53
+ {tests}
54
+ """
55
+
56
+ # Upload files to sandbox
57
+ with sb.open(f"/workspace/test_{module_name}.py", "w") as f:
58
+ f.write(test_content)
59
+
60
+ # Run pytest
61
+ p = sb.exec("python", "-m", "pytest", f"/workspace/test_{module_name}.py",
62
+ "-v", "--tb=short", timeout=120)
63
+ p.wait()
64
+
65
+ stdout = p.stdout.read()
66
+ stderr = p.stderr.read()
67
+
68
+ logger.info(f"Python test output: {stdout}")
69
+
70
+ # Parse results
71
+ success = p.returncode == 0
72
+
73
+ return {
74
+ "success": success,
75
+ "tests_run": 1,
76
+ "tests_passed": 1 if success else 0,
77
+ "tests_failed": 0 if success else 1,
78
+ "stdout": stdout,
79
+ "stderr": stderr,
80
+ "execution_mode": "modal",
81
+ "language": "python"
82
+ }
83
+ except Exception as e:
84
+ logger.error(f"Python sandbox execution failed: {e}", exc_info=True)
85
+ return {
86
+ "success": False,
87
+ "error": f"Python execution error: {str(e)}",
88
+ "tests_run": 0,
89
+ "tests_passed": 0,
90
+ "tests_failed": 0,
91
+ "execution_mode": "modal",
92
+ "language": "python"
93
+ }
94
+
95
+ def _execute_java_in_sandbox(sb: modal.Sandbox, code: str, tests: str,
96
+ module_name: str) -> Dict:
97
+ """Execute Java tests in Modal Sandbox using Maven."""
98
+ try:
99
+ # Ensure workspace directory exists
100
+ p = sb.exec("mkdir", "-p", "/workspace", timeout=30)
101
+ p.wait()
102
+
103
+ # Create Maven project structure
104
+ # Create pom.xml
105
+ pom_xml = f"""<?xml version="1.0" encoding="UTF-8"?>
106
+ <project xmlns="http://maven.apache.org/POM/4.0.0"
107
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
108
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
109
+ <modelVersion>4.0.0</modelVersion>
110
+ <groupId>com.example</groupId>
111
+ <artifactId>{module_name}</artifactId>
112
+ <version>1.0.0</version>
113
+ <properties>
114
+ <maven.compiler.source>17</maven.compiler.source>
115
+ <maven.compiler.target>17</maven.compiler.target>
116
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
117
+ </properties>
118
+ <dependencies>
119
+ <dependency>
120
+ <groupId>org.junit.jupiter</groupId>
121
+ <artifactId>junit-jupiter</artifactId>
122
+ <version>5.9.0</version>
123
+ <scope>test</scope>
124
+ </dependency>
125
+ </dependencies>
126
+ <build>
127
+ <plugins>
128
+ <plugin>
129
+ <groupId>org.apache.maven.plugins</groupId>
130
+ <artifactId>maven-surefire-plugin</artifactId>
131
+ <version>2.22.2</version>
132
+ </plugin>
133
+ </plugins>
134
+ </build>
135
+ </project>"""
136
+
137
+ # Upload files to sandbox
138
+ with sb.open(f"/workspace/{module_name}.java", "w") as f:
139
+ f.write(code)
140
+ with sb.open(f"/workspace/{module_name}Test.java", "w") as f:
141
+ f.write(tests)
142
+ with sb.open(f"/workspace/pom.xml", "w") as f:
143
+ f.write(pom_xml)
144
+ # Run Maven tests
145
+ p = sb.exec("bash", "-c", "cd /workspace && mvn test -q 2>&1", timeout=120)
146
+ p.wait()
147
+
148
+ stdout = p.stdout.read()
149
+ stderr = p.stderr.read()
150
+
151
+ logger.info(f"Maven test output: {stdout}")
152
+ if p.returncode == 0:
153
+ return {
154
+ "success": True,
155
+ "tests_run": 1,
156
+ "tests_passed": 1,
157
+ "tests_failed": 0,
158
+ "stdout": stdout,
159
+ "stderr": stderr,
160
+ "execution_mode": "modal",
161
+ "language": "java"
162
+ }
163
+ else:
164
+ return {
165
+ "success": False,
166
+ "error": f"Tests failed: {stderr}",
167
+ "tests_run": 1,
168
+ "tests_passed": 0,
169
+ "tests_failed": 1,
170
+ "stdout": stdout,
171
+ "stderr": stderr,
172
+ "execution_mode": "modal",
173
+ "language": "java"
174
+ }
175
+ except Exception as e:
176
+ logger.error(f"Java sandbox execution failed: {e}")
177
+ return {
178
+ "success": False,
179
+ "error": f"Java execution error: {str(e)}",
180
+ "tests_run": 0,
181
+ "tests_passed": 0,
182
+ "tests_failed": 0,
183
+ "execution_mode": "modal",
184
+ "language": "java"
185
+ }
186
+
187
+ def _execute_javascript_in_sandbox(sb: modal.Sandbox, code: str, tests: str,
188
+ module_name: str, language: str = 'javascript') -> Dict:
189
+ """Execute JavaScript/TypeScript tests in Modal Sandbox using Jest."""
190
+ try:
191
+ # Ensure workspace directory exists
192
+ p = sb.exec("mkdir", "-p", "/workspace", timeout=30)
193
+ p.wait()
194
+ ext = '.ts' if language == 'typescript' else '.js'
195
+
196
+ # Create package.json
197
+ package_json = {
198
+ "name": module_name.replace('_', '-'),
199
+ "version": "1.0.0",
200
+ "description": "Test suite",
201
+ "scripts": {
202
+ "test": "jest --json"
203
+ },
204
+ "devDependencies": {
205
+ "jest": "^29.0.0"
206
+ }
207
+ }
208
+
209
+ # For JavaScript, use ES modules with proper Jest config
210
+ # For TypeScript, use ts-jest preset
211
+ if language == 'javascript':
212
+ package_json["type"] = "module"
213
+ elif language == 'typescript':
214
+ package_json["devDependencies"]["ts-jest"] = "^29.0.0"
215
+ package_json["devDependencies"]["typescript"] = "^5.0.0"
216
+ package_json["devDependencies"]["@types/jest"] = "^29.0.0"
217
+
218
+ # Create Jest config
219
+ jest_config = {
220
+ "testEnvironment": "node",
221
+ "testMatch": ["**/*.test.js", "**/*.test.ts"]
222
+ }
223
+
224
+ if language == 'javascript':
225
+ # Configure Jest for ES modules
226
+ jest_config["transform"] = {}
227
+ jest_config["extensionsToTreatAsEsm"] = [".js"]
228
+ elif language == 'typescript':
229
+ jest_config["preset"] = "ts-jest"
230
+ jest_config["moduleNameMapper"] = {
231
+ "^(\\.{1,2}/.*)\\.ts$": "$1"
232
+ }
233
+
234
+ # Upload files to sandbox
235
+ with sb.open(f"/workspace/{module_name}{ext}", "w") as f:
236
+ f.write(code)
237
+ with sb.open(f"/workspace/{module_name}.test{ext}", "w") as f:
238
+ f.write(tests)
239
+ with sb.open(f"/workspace/package.json", "w") as f:
240
+ f.write(json.dumps(package_json, indent=2))
241
+ with sb.open(f"/workspace/jest.config.json", "w") as f:
242
+ f.write(json.dumps(jest_config, indent=2))
243
+
244
+ # For TypeScript, create tsconfig.json
245
+ if language == 'typescript':
246
+ tsconfig = {
247
+ "compilerOptions": {
248
+ "target": "ES2020",
249
+ "module": "commonjs",
250
+ "lib": ["ES2020"],
251
+ "strict": True,
252
+ "esModuleInterop": True,
253
+ "skipLibCheck": True,
254
+ "forceConsistentCasingInFileNames": True,
255
+ "resolveJsonModule": True,
256
+ "moduleResolution": "node",
257
+ "types": ["jest", "node"]
258
+ },
259
+ "include": ["*.ts"],
260
+ "exclude": ["node_modules"]
261
+ }
262
+ with sb.open(f"/workspace/tsconfig.json", "w") as f:
263
+ f.write(json.dumps(tsconfig, indent=2))
264
+
265
+ # Install dependencies and run tests
266
+ p = sb.exec("bash", "-c",
267
+ "cd /workspace && npm install --legacy-peer-deps && npm test 2>&1",
268
+ timeout=180)
269
+ p.wait()
270
+
271
+ stdout = p.stdout.read()
272
+ stderr = p.stderr.read()
273
+
274
+ logger.info(f"Jest test output: {stdout}")
275
+
276
+ # Parse Jest JSON output if available
277
+ try:
278
+ # Extract JSON from output (Jest outputs to stdout)
279
+ lines = stdout.split('\n')
280
+ json_str = None
281
+ for line in lines:
282
+ if line.strip().startswith('{') and 'numTotalTests' in line:
283
+ json_str = line
284
+ break
285
+
286
+ if json_str:
287
+ result = json.loads(json_str)
288
+ tests_run = result.get('numTotalTests', 0)
289
+ tests_passed = result.get('numPassedTests', 0)
290
+ tests_failed = result.get('numFailedTests', 0)
291
+ success = result.get('success', False)
292
+ else:
293
+ tests_run = 1  # a single combined test module is executed either way
294
+ tests_passed = 1 if p.returncode == 0 else 0
295
+ tests_failed = 0 if p.returncode == 0 else 1
296
+ success = p.returncode == 0
297
+ except Exception as parse_error:
298
+ logger.warning(f"Could not parse Jest JSON output: {parse_error}")
299
+ tests_run = 1
300
+ tests_passed = 1 if p.returncode == 0 else 0
301
+ tests_failed = 0 if p.returncode == 0 else 1
302
+ success = p.returncode == 0
303
+
304
+ return {
305
+ "success": success,
306
+ "tests_run": tests_run,
307
+ "tests_passed": tests_passed,
308
+ "tests_failed": tests_failed,
309
+ "stdout": stdout,
310
+ "stderr": stderr,
311
+ "execution_mode": "modal",
312
+ "language": language
313
+ }
314
+ except Exception as e:
315
+ logger.error(f"JavaScript sandbox execution failed: {e}")
316
+ return {
317
+ "success": False,
318
+ "error": f"{language} execution error: {str(e)}",
319
+ "tests_run": 0,
320
+ "tests_passed": 0,
321
+ "tests_failed": 0,
322
+ "execution_mode": "modal",
323
+ "language": language
324
+ }
325
+
326
+ def execute_in_modal(code: str, tests: str, requirements: List[str],
327
+ module_name: str, language: str) -> Dict:
328
+ """
329
+ Execute tests in Modal Sandbox with proper image configuration.
330
+ Uses Sandbox.exec() for better multi-language support.
331
+
332
+ Args:
333
+ code: Source code
334
+ tests: Test code
335
+ requirements: Package requirements
336
+ module_name: Module name
337
+ language: Programming language
338
+
339
+ Returns:
340
+ Test execution results
341
+ """
342
+ lang_lower = language.lower()
343
+
344
+ if lang_lower not in LANGUAGE_IMAGES:
345
+ return {
346
+ "success": False,
347
+ "error": f"Unsupported language: {language}",
348
+ "tests_run": 0,
349
+ "tests_passed": 0,
350
+ "tests_failed": 0,
351
+ "execution_mode": "unsupported",
352
+ "language": language
353
+ }
354
+
355
+ try:
356
+ logger.info(f"Executing {language} tests in Modal Sandbox...")
357
+
358
+ # Get the appropriate image for this language
359
+ image = LANGUAGE_IMAGES[lang_lower]
360
+
361
+ # Create app for this execution
362
+ app = modal.App.lookup("legacy-code-validator", create_if_missing=True)
363
+
364
+ # Create sandbox with appropriate image
365
+ with modal.enable_output():
366
+ sb = modal.Sandbox.create(
367
+ image=image,
368
+ app=app,
369
+ timeout=300,
370
+ cpu=2.0,
371
+ memory=4096
372
+ )
373
+
374
+ try:
375
+ # Dispatch to language-specific executor
376
+ if lang_lower == 'python':
377
+ result = _execute_python_in_sandbox(sb, code, tests, module_name)
378
+ elif lang_lower == 'java':
379
+ result = _execute_java_in_sandbox(sb, code, tests, module_name)
380
+ elif lang_lower in ('javascript', 'typescript'):
381
+ result = _execute_javascript_in_sandbox(sb, code, tests, module_name, lang_lower)
382
+ else:
383
+ result = {
384
+ "success": False,
385
+ "error": f"No executor for language: {language}",
386
+ "tests_run": 0,
387
+ "tests_passed": 0,
388
+ "tests_failed": 0,
389
+ "execution_mode": "modal",
390
+ "language": language
391
+ }
392
+
393
+ result['execution_mode'] = 'modal'
394
+ return result
395
+ finally:
396
+ sb.terminate()
397
+
398
+ except Exception as e:
399
+ logger.error(f"Modal sandbox execution failed: {e}", exc_info=True)
400
+ return {
401
+ "success": False,
402
+ "error": f"Modal sandbox error: {str(e)}",
403
+ "tests_run": 0,
404
+ "tests_passed": 0,
405
+ "tests_failed": 0,
406
+ "execution_mode": "modal_error",
407
+ "language": language
408
+ }
409
+
410
+ else:
411
+ # Stub when Modal not available
412
+ def execute_in_modal(code: str, tests: str, requirements: List[str],
413
+ module_name: str, language: str) -> Dict:
414
+ """Stub function when Modal is not available."""
415
+ return {
416
+ "success": False,
417
+ "error": "Modal not available",
418
+ "tests_run": 0,
419
+ "tests_passed": 0,
420
+ "tests_failed": 0,
421
+ "execution_mode": "modal_unavailable",
422
+ "language": language
423
+ }
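A minimal end-to-end sketch of calling `execute_in_modal` from `src/sandbox/modal_executor.py` for Python (Modal credentials must be configured; the sample module and test are deliberately trivial, and since the Python path concatenates code and tests into one file, the test can reference `add` directly):

```python
from src.sandbox.modal_executor import execute_in_modal  # path per this commit's layout

code = "def add(a, b):\n    return a + b\n"
tests = "def test_add():\n    assert add(2, 3) == 5\n"

result = execute_in_modal(
    code=code,
    tests=tests,
    requirements=[],
    module_name="calculator",
    language="python",
)
print(result["success"], result["execution_mode"], result.get("error"))
```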
src/sandbox/runners/__init__.py ADDED
@@ -0,0 +1,32 @@
1
+ """
2
+ Language-specific test runners for Modal sandbox execution.
3
+ Each runner handles project structure, build files, and test execution for its language.
4
+ """
5
+
6
+ from .python_runner import run_python_tests
7
+ from .java_runner import run_java_tests
8
+ from .javascript_runner import run_javascript_tests
9
+
10
+ __all__ = [
11
+ 'run_python_tests',
12
+ 'run_java_tests',
13
+ 'run_javascript_tests',
14
+ ]
15
+
16
+ # Registry of all available runners
17
+ LANGUAGE_RUNNERS = {
18
+ 'python': run_python_tests,
19
+ 'java': run_java_tests,
20
+ 'javascript': run_javascript_tests,
21
+ 'typescript': run_javascript_tests, # TypeScript uses JS runner
22
+ }
23
+
24
+
25
+ def get_runner_for_language(language: str):
26
+ """Get the appropriate test runner function for a language."""
27
+ return LANGUAGE_RUNNERS.get(language.lower())
28
+
29
+
30
+ def is_runner_available(language: str) -> bool:
31
+ """Check if a test runner is available for a language."""
32
+ return language.lower() in LANGUAGE_RUNNERS
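Dispatching through the runner registry above is straightforward; a hedged sketch (the runner signatures follow the modules added later in this commit, and the JS/TS runner takes an extra `language` argument that the Python and Java runners do not):

```python
from src.sandbox.runners import get_runner_for_language, is_runner_available

code = "export function double(x) { return x * 2; }\n"
tests = (
    "import { double } from './widget.js';\n"
    "test('double', () => { expect(double(2)).toBe(4); });\n"
)

language = "javascript"
if is_runner_available(language):
    runner = get_runner_for_language(language)  # -> run_javascript_tests
    result = runner(code, tests, [], "widget", language)
else:
    result = {"success": False, "error": f"No runner for {language}"}
```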
src/sandbox/runners/java_runner.py ADDED
@@ -0,0 +1,350 @@
1
+ """
2
+ Java test runner for Modal sandbox execution.
3
+ Handles Maven project structure, pom.xml generation, and JUnit 5 execution.
4
+ """
5
+
6
+ import subprocess
7
+ import tempfile
8
+ import time
9
+ import logging
10
+ import re
11
+ from pathlib import Path
12
+ from typing import Dict, List
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ def _extract_class_name(code: str, module_name: str) -> str:
18
+ """Extract Java class name from code."""
19
+ match = re.search(r'public\s+class\s+(\w+)', code)
20
+ if match:
21
+ return match.group(1)
22
+
23
+ # Fallback: convert module_name to PascalCase
24
+ return ''.join(word.capitalize() for word in module_name.split('_'))
25
+
26
+
27
+ def _create_maven_project(tmpdir: Path, module_name: str, code: str, tests: str) -> str:
28
+ """
29
+ Create Maven project structure with proper directory layout.
30
+
31
+ Returns:
32
+ Class name extracted from code
33
+ """
34
+ # Extract class names
35
+ main_class = _extract_class_name(code, module_name)
36
+ test_class = _extract_class_name(tests, f"{module_name}Test")
37
+
38
+ # Create Maven directory structure
39
+ src_main = tmpdir / "src" / "main" / "java" / "com" / "modernizer"
40
+ src_test = tmpdir / "src" / "test" / "java" / "com" / "modernizer"
41
+ src_main.mkdir(parents=True)
42
+ src_test.mkdir(parents=True)
43
+
44
+ # Add package declaration if not present
45
+ if "package " not in code:
46
+ code = "package com.modernizer;\n\n" + code
47
+ if "package " not in tests:
48
+ tests = "package com.modernizer;\n\n" + tests
49
+
50
+ # Write source files
51
+ (src_main / f"{main_class}.java").write_text(code, encoding='utf-8')
52
+ (src_test / f"{test_class}.java").write_text(tests, encoding='utf-8')
53
+
54
+ # Generate pom.xml
55
+ pom_xml = f"""<?xml version="1.0" encoding="UTF-8"?>
56
+ <project xmlns="http://maven.apache.org/POM/4.0.0"
57
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
58
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
59
+ http://maven.apache.org/xsd/maven-4.0.0.xsd">
60
+ <modelVersion>4.0.0</modelVersion>
61
+
62
+ <groupId>com.modernizer</groupId>
63
+ <artifactId>{module_name}</artifactId>
64
+ <version>1.0-SNAPSHOT</version>
65
+ <packaging>jar</packaging>
66
+
67
+ <properties>
68
+ <maven.compiler.source>17</maven.compiler.source>
69
+ <maven.compiler.target>17</maven.compiler.target>
70
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
71
+ <junit.version>5.10.1</junit.version>
72
+ </properties>
73
+
74
+ <dependencies>
75
+ <!-- JUnit 5 -->
76
+ <dependency>
77
+ <groupId>org.junit.jupiter</groupId>
78
+ <artifactId>junit-jupiter</artifactId>
79
+ <version>${{junit.version}}</version>
80
+ <scope>test</scope>
81
+ </dependency>
82
+
83
+ <!-- Mockito for mocking -->
84
+ <dependency>
85
+ <groupId>org.mockito</groupId>
86
+ <artifactId>mockito-core</artifactId>
87
+ <version>5.7.0</version>
88
+ <scope>test</scope>
89
+ </dependency>
90
+
91
+ <dependency>
92
+ <groupId>org.assertj</groupId>
93
+ <artifactId>assertj-core</artifactId>
94
+ <version>3.24.2</version>
95
+ <scope>test</scope>
96
+ </dependency>
97
+
98
+ <!-- Servlet API -->
99
+ <dependency>
100
+ <groupId>javax.servlet</groupId>
101
+ <artifactId>javax.servlet-api</artifactId>
102
+ <version>4.0.1</version>
103
+ <scope>provided</scope>
104
+ </dependency>
105
+ </dependencies>
106
+
107
+ <build>
108
+ <plugins>
109
+ <!-- Maven Compiler Plugin -->
110
+ <plugin>
111
+ <groupId>org.apache.maven.plugins</groupId>
112
+ <artifactId>maven-compiler-plugin</artifactId>
113
+ <version>3.11.0</version>
114
+ <configuration>
115
+ <source>17</source>
116
+ <target>17</target>
117
+ </configuration>
118
+ </plugin>
119
+
120
+ <!-- Maven Surefire Plugin for running tests -->
121
+ <plugin>
122
+ <groupId>org.apache.maven.plugins</groupId>
123
+ <artifactId>maven-surefire-plugin</artifactId>
124
+ <version>3.2.2</version>
125
+ <configuration>
126
+ <includes>
127
+ <include>**/*Test.java</include>
128
+ </includes>
129
+ </configuration>
130
+ </plugin>
131
+
132
+ <!-- JaCoCo for code coverage -->
133
+ <plugin>
134
+ <groupId>org.jacoco</groupId>
135
+ <artifactId>jacoco-maven-plugin</artifactId>
136
+ <version>0.8.11</version>
137
+ <executions>
138
+ <execution>
139
+ <goals>
140
+ <goal>prepare-agent</goal>
141
+ </goals>
142
+ </execution>
143
+ <execution>
144
+ <id>report</id>
145
+ <phase>test</phase>
146
+ <goals>
147
+ <goal>report</goal>
148
+ </goals>
149
+ </execution>
150
+ </executions>
151
+ </plugin>
152
+ </plugins>
153
+ </build>
154
+ </project>
155
+ """
156
+ (tmpdir / "pom.xml").write_text(pom_xml, encoding='utf-8')
157
+
158
+ return main_class
159
+
160
+
161
+ def _validate_java_tests(tests: str) -> tuple:
162
+ """
163
+ Validate Java test code before execution.
164
+
165
+ Returns:
166
+ (is_valid, error_message)
167
+ """
168
+ # Check for JUnit 5 annotations
169
+ if "@Test" not in tests:
170
+ return False, "No @Test annotations found (required for JUnit 5)"
171
+
172
+ # Check for JUnit imports
173
+ if "org.junit" not in tests:
174
+ return False, "Missing JUnit imports (import org.junit.jupiter.api.Test)"
175
+
176
+ # Check for test class
177
+ if "class" not in tests:
178
+ return False, "No test class found"
179
+
180
+ return True, ""
181
+
182
+
183
+ def run_java_tests(code: str, tests: str, requirements: List[str], module_name: str) -> Dict:
184
+ """
185
+ Run Java tests using Maven and JUnit 5 in Modal container.
186
+
187
+ Args:
188
+ code: Java source code
189
+ tests: JUnit test code
190
+ requirements: List of Maven dependencies (not used currently)
191
+ module_name: Name of the module
192
+
193
+ Returns:
194
+ Dictionary with test results
195
+ """
196
+ # Validate tests before execution
197
+ is_valid, error_msg = _validate_java_tests(tests)
198
+ if not is_valid:
199
+ logger.error(f"Test validation failed: {error_msg}")
200
+ return {
201
+ "success": False,
202
+ "error": f"Test validation failed: {error_msg}",
203
+ "tests_run": 0,
204
+ "tests_passed": 0,
205
+ "tests_failed": 0,
206
+ "execution_mode": "modal",
207
+ "language": "java"
208
+ }
209
+
210
+ with tempfile.TemporaryDirectory() as tmpdir:
211
+ tmpdir_path = Path(tmpdir)
212
+
213
+ try:
214
+ # Create Maven project structure
215
+ class_name = _create_maven_project(tmpdir_path, module_name, code, tests)
216
+ logger.info(f"Created Maven project for class: {class_name}")
217
+ except Exception as e:
218
+ logger.error(f"Failed to create Maven project: {e}")
219
+ return {
220
+ "success": False,
221
+ "error": f"Project setup failed: {str(e)}",
222
+ "tests_run": 0,
223
+ "tests_passed": 0,
224
+ "tests_failed": 0,
225
+ "execution_mode": "modal",
226
+ "language": "java"
227
+ }
228
+
229
+ start_time = time.time()
230
+
231
+ try:
232
+ # Run Maven clean test
233
+ logger.info("Running Maven tests...")
234
+ result = subprocess.run(
235
+ ["mvn", "clean", "test", "-B", "-q"],
236
+ cwd=tmpdir,
237
+ capture_output=True,
238
+ text=True,
239
+ timeout=300 # 5 minutes for Maven
240
+ )
241
+ except subprocess.TimeoutExpired:
242
+ return {
243
+ "success": False,
244
+ "error": "Maven test execution timeout (>5 minutes)",
245
+ "tests_run": 0,
246
+ "tests_passed": 0,
247
+ "tests_failed": 0,
248
+ "execution_time": 300.0,
249
+ "execution_mode": "modal",
250
+ "language": "java"
251
+ }
252
+ except FileNotFoundError:
253
+ return {
254
+ "success": False,
255
+ "error": "Maven (mvn) not found in container",
256
+ "tests_run": 0,
257
+ "tests_passed": 0,
258
+ "tests_failed": 0,
259
+ "execution_mode": "modal",
260
+ "language": "java"
261
+ }
262
+
263
+ execution_time = time.time() - start_time
264
+ stdout = result.stdout[:10000] # Truncate to prevent memory issues
265
+ stderr = result.stderr[:10000]
266
+
267
+ # Check for compilation/build failures first
268
+ if "BUILD FAILURE" in stdout or "COMPILATION ERROR" in stdout or "BUILD FAILURE" in stderr:
269
+ error_msg = "Maven build failed"
270
+ # Try to extract specific error
271
+ if "COMPILATION ERROR" in stdout:
272
+ error_msg = "Java compilation error"
273
+ elif "[ERROR]" in stdout:
274
+ # Extract first error line
275
+ for line in stdout.split('\n'):
276
+ if '[ERROR]' in line and 'Failed to execute goal' not in line:
277
+ error_msg = line.strip()
278
+ break
279
+
280
+ return {
281
+ "success": False,
282
+ "error": error_msg,
283
+ "tests_run": 0,
284
+ "tests_passed": 0,
285
+ "tests_failed": 0,
286
+ "execution_mode": "modal",
287
+ "language": "java",
288
+ "stdout": stdout,
289
+ "stderr": stderr
290
+ }
291
+
292
+ # Parse Maven Surefire output
293
+ # Format: "Tests run: X, Failures: Y, Errors: Z, Skipped: W"
294
+ tests_run = 0
295
+ tests_passed = 0
296
+ tests_failed = 0
297
+ tests_errors = 0
298
+ tests_skipped = 0
299
+
300
+ match = re.search(r'Tests run: (\d+),\s*Failures: (\d+),\s*Errors: (\d+),\s*Skipped: (\d+)', stdout)
301
+ if match:
302
+ tests_run = int(match.group(1))
303
+ failures = int(match.group(2))
304
+ tests_errors = int(match.group(3))
305
+ tests_skipped = int(match.group(4))
306
+ tests_failed = failures + tests_errors
307
+ tests_passed = tests_run - tests_failed - tests_skipped
308
+ elif tests_run == 0 and result.returncode == 0:
309
+ # Maven succeeded but no tests found - this is suspicious
310
+ logger.warning("Maven succeeded but no tests were detected")
311
+ return {
312
+ "success": False,
313
+ "error": "No tests detected by Maven Surefire (missing @Test annotations?)",
314
+ "tests_run": 0,
315
+ "tests_passed": 0,
316
+ "tests_failed": 0,
317
+ "execution_mode": "modal",
318
+ "language": "java",
319
+ "stdout": stdout,
320
+ "stderr": stderr
321
+ }
322
+
323
+ # Try to extract coverage from JaCoCo report
324
+ coverage_percent = 0.0
325
+ jacoco_report = tmpdir_path / "target" / "site" / "jacoco" / "index.html"
326
+ if jacoco_report.exists():
327
+ try:
328
+ report_content = jacoco_report.read_text()
329
+ # Extract coverage percentage from JaCoCo HTML report
330
+ cov_match = re.search(r'Total.*?(\d+)%', report_content)
331
+ if cov_match:
332
+ coverage_percent = float(cov_match.group(1))
333
+ except Exception as e:
334
+ logger.warning(f"Failed to parse JaCoCo coverage: {e}")
335
+
336
+ return {
337
+ "success": result.returncode == 0,
338
+ "tests_run": tests_run,
339
+ "tests_passed": tests_passed,
340
+ "tests_failed": tests_failed,
341
+ "tests_errors": tests_errors,
342
+ "tests_skipped": tests_skipped,
343
+ "execution_time": round(execution_time, 2),
344
+ "coverage_percent": coverage_percent,
345
+ "stdout": stdout,
346
+ "stderr": stderr,
347
+ "exit_code": result.returncode,
348
+ "execution_mode": "modal",
349
+ "language": "java"
350
+ }
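The result parsing in `java_runner.py` hinges on a single Surefire summary line; the regex can be sanity-checked in isolation (the sample output text is illustrative):

```python
import re

sample = "Tests run: 6, Failures: 1, Errors: 0, Skipped: 1"
match = re.search(r'Tests run: (\d+),\s*Failures: (\d+),\s*Errors: (\d+),\s*Skipped: (\d+)', sample)

tests_run, failures, errors, skipped = (int(g) for g in match.groups())
tests_failed = failures + errors                      # 1
tests_passed = tests_run - tests_failed - skipped     # 4
print(tests_run, tests_passed, tests_failed, skipped)  # 6 4 1 1
```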
src/sandbox/runners/javascript_runner.py ADDED
@@ -0,0 +1,318 @@
1
+ """
2
+ JavaScript/TypeScript test runner for Modal sandbox execution.
3
+ Handles Node.js project structure, package.json generation, and Jest execution.
4
+ """
5
+
6
+ import subprocess
7
+ import tempfile
8
+ import time
9
+ import logging
10
+ import json
11
+ import re
12
+ from pathlib import Path
13
+ from typing import Dict, List
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ def _create_nodejs_project(tmpdir: Path, module_name: str, code: str, tests: str, language: str):
19
+ """
20
+ Create Node.js project structure with package.json and config files.
21
+
22
+ Args:
23
+ tmpdir: Temporary directory path
24
+ module_name: Name of the module
25
+ code: Source code
26
+ tests: Test code
27
+ language: 'javascript' or 'typescript'
28
+ """
29
+ ext = '.ts' if language == 'typescript' else '.js'
30
+
31
+ # Write source files
32
+ (tmpdir / f"{module_name}{ext}").write_text(code, encoding='utf-8')
33
+ (tmpdir / f"{module_name}.test{ext}").write_text(tests, encoding='utf-8')
34
+
35
+ # Generate package.json
36
+ package_json = {
37
+ "name": module_name.replace('_', '-'),
38
+ "version": "1.0.0",
39
+ "type": "module" if language == 'javascript' else None,
40
+ "description": "Modernized code test suite",
41
+ "main": f"{module_name}{ext}",
42
+ "scripts": {
43
+ "test": "NODE_OPTIONS=--experimental-vm-modules jest --coverage --verbose --no-cache" if language == 'javascript' else "jest --coverage --verbose --no-cache"
44
+ },
45
+ "devDependencies": {
46
+ "jest": "^29.7.0"
47
+ }
48
+ }
49
+
50
+ # Remove None values
51
+ package_json = {k: v for k, v in package_json.items() if v is not None}
52
+
53
+ if language == 'typescript':
54
+ package_json["devDependencies"].update({
55
+ "typescript": "^5.3.0",
56
+ "ts-jest": "^29.1.0",
57
+ "@types/jest": "^29.5.0",
58
+ "ts-node": "^10.9.0"
59
+ })
60
+
61
+ # Generate jest.config.js for TypeScript
62
+ jest_config = """module.exports = {
63
+ preset: 'ts-jest',
64
+ testEnvironment: 'node',
65
+ testMatch: ['**/*.test.ts'],
66
+ collectCoverageFrom: ['*.ts', '!*.test.ts', '!jest.config.js'],
67
+ coverageReporters: ['text', 'text-summary'],
68
+ verbose: true
69
+ };
70
+ """
71
+ (tmpdir / "jest.config.js").write_text(jest_config, encoding='utf-8')
72
+
73
+ # Generate tsconfig.json
74
+ tsconfig = {
75
+ "compilerOptions": {
76
+ "target": "ES2020",
77
+ "module": "commonjs",
78
+ "lib": ["ES2020"],
79
+ "strict": True,
80
+ "esModuleInterop": True,
81
+ "skipLibCheck": True,
82
+ "forceConsistentCasingInFileNames": True,
83
+ "resolveJsonModule": True,
84
+ "moduleResolution": "node",
85
+ "types": ["jest", "node"]
86
+ },
87
+ "include": ["*.ts"],
88
+ "exclude": ["node_modules"]
89
+ }
90
+ (tmpdir / "tsconfig.json").write_text(json.dumps(tsconfig, indent=2), encoding='utf-8')
91
+ else:
92
+ # Generate jest.config.js for JavaScript with ES module support
93
+ jest_config = """module.exports = {
94
+ testEnvironment: 'node',
95
+ testMatch: ['**/*.test.js'],
96
+ collectCoverageFrom: ['*.js', '!*.test.js', '!jest.config.js'],
97
+ coverageReporters: ['text', 'text-summary'],
98
+ verbose: true,
99
+ transform: {},
100
+ extensionsToTreatAsEsm: ['.js'],
101
+ moduleNameMapper: {
102
+ '^(\\\\.{1,2}/.*)\\\\.js$': '$1',
103
+ },
104
+ };
105
+ """
106
+ (tmpdir / "jest.config.js").write_text(jest_config, encoding='utf-8')
107
+
108
+ (tmpdir / "package.json").write_text(json.dumps(package_json, indent=2), encoding='utf-8')
109
+
110
+
111
+ def _validate_javascript_tests(tests: str, language: str) -> tuple:
112
+ """
113
+ Validate JavaScript/TypeScript test code before execution.
114
+
115
+ Returns:
116
+ (is_valid, error_message)
117
+ """
118
+ # Check for Jest test structure
119
+ if "describe(" not in tests and "test(" not in tests and "it(" not in tests:
120
+ return False, "No Jest test functions found (describe/test/it)"
121
+
122
+ # Check for imports
123
+ if "import" not in tests and "require" not in tests:
124
+ return False, "No import/require statements found"
125
+
126
+ # Check for expect assertions
127
+ if "expect(" not in tests:
128
+ return False, "No expect() assertions found"
129
+
130
+ return True, ""
131
+
132
+
133
+ def run_javascript_tests(code: str, tests: str, requirements: List[str], module_name: str, language: str = 'javascript') -> Dict:
134
+ """
135
+ Run JavaScript/TypeScript tests using Jest in Modal container.
136
+
137
+ Args:
138
+ code: JavaScript/TypeScript source code
139
+ tests: Jest test code
140
+ requirements: List of npm packages to install (not used currently)
141
+ module_name: Name of the module
142
+ language: 'javascript' or 'typescript'
143
+
144
+ Returns:
145
+ Dictionary with test results
146
+ """
147
+ # Validate tests before execution
148
+ is_valid, error_msg = _validate_javascript_tests(tests, language)
149
+ if not is_valid:
150
+ logger.error(f"Test validation failed: {error_msg}")
151
+ return {
152
+ "success": False,
153
+ "error": f"Test validation failed: {error_msg}",
154
+ "tests_run": 0,
155
+ "tests_passed": 0,
156
+ "tests_failed": 0,
157
+ "execution_mode": "modal",
158
+ "language": language
159
+ }
160
+
161
+ with tempfile.TemporaryDirectory() as tmpdir:
162
+ tmpdir_path = Path(tmpdir)
163
+
164
+ try:
165
+ # Create Node.js project structure
166
+ _create_nodejs_project(tmpdir_path, module_name, code, tests, language)
167
+ logger.info(f"Created Node.js project for {module_name} ({language})")
168
+ except Exception as e:
169
+ logger.error(f"Failed to create Node.js project: {e}")
170
+ return {
171
+ "success": False,
172
+ "error": f"Project setup failed: {str(e)}",
173
+ "tests_run": 0,
174
+ "tests_passed": 0,
175
+ "tests_failed": 0,
176
+ "execution_mode": "modal",
177
+ "language": language
178
+ }
179
+
180
+ start_time = time.time()
181
+
182
+ try:
183
+ # Install dependencies
184
+ logger.info("Installing npm dependencies...")
185
+ install_result = subprocess.run(
186
+ ["npm", "install", "--silent", "--no-fund", "--no-audit"],
187
+ cwd=tmpdir,
188
+ capture_output=True,
189
+ text=True,
190
+ timeout=180 # 3 minutes for npm install
191
+ )
192
+
193
+ if install_result.returncode != 0:
194
+ logger.error(f"npm install failed with return code: {install_result.returncode}")
195
+ logger.error(f"npm install stderr: {install_result.stderr}")
196
+ logger.error(f"npm install stdout: {install_result.stdout}")
197
+ return {
198
+ "success": False,
199
+ "error": f"npm install failed: {install_result.stderr}",
200
+ "tests_run": 0,
201
+ "tests_passed": 0,
202
+ "tests_failed": 0,
203
+ "execution_mode": "modal",
204
+ "language": language
205
+ }
206
+
207
+ # Run tests
208
+ logger.info("Running Jest tests...")
209
+ result = subprocess.run(
210
+ ["npm", "test", "--", "--ci"],
211
+ cwd=tmpdir,
212
+ capture_output=True,
213
+ text=True,
214
+ timeout=120 # 2 minutes for tests
215
+ )
216
+ except subprocess.TimeoutExpired as e:
217
+ return {
218
+ "success": False,
219
+ "error": f"Test execution timeout: {str(e)}",
220
+ "tests_run": 0,
221
+ "tests_passed": 0,
222
+ "tests_failed": 0,
223
+ "execution_time": 300.0,
224
+ "execution_mode": "modal",
225
+ "language": language
226
+ }
227
+ except FileNotFoundError:
228
+ return {
229
+ "success": False,
230
+ "error": "Node.js/npm not found in container",
231
+ "tests_run": 0,
232
+ "tests_passed": 0,
233
+ "tests_failed": 0,
234
+ "execution_mode": "modal",
235
+ "language": language
236
+ }
237
+
238
+ execution_time = time.time() - start_time
239
+ stdout = result.stdout[:10000] # Truncate to prevent memory issues
240
+ stderr = result.stderr[:10000]
241
+
242
+ # Parse Jest output - handle all possible formats
243
+ # Jest format examples:
244
+ # - "Tests: 5 passed, 5 total"
245
+ # - "Tests: 1 failed, 4 passed, 5 total"
246
+ # - "Tests: 2 skipped, 3 passed, 5 total"
247
+ # - "Tests: 1 todo, 4 passed, 5 total"
248
+ # - "Tests: 0 total"
249
+ tests_run = 0
250
+ tests_passed = 0
251
+ tests_failed = 0
252
+ tests_skipped = 0
253
+
254
+ # Look for "Tests:" line
255
+ tests_line_match = re.search(r'Tests:\s+(.+)', stdout)
256
+ if tests_line_match:
257
+ tests_line = tests_line_match.group(1)
258
+
259
+ # Extract total
260
+ total_match = re.search(r'(\d+)\s+total', tests_line)
261
+ if total_match:
262
+ tests_run = int(total_match.group(1))
263
+
264
+ # Extract passed
265
+ passed_match = re.search(r'(\d+)\s+passed', tests_line)
266
+ if passed_match:
267
+ tests_passed = int(passed_match.group(1))
268
+
269
+ # Extract failed
270
+ failed_match = re.search(r'(\d+)\s+failed', tests_line)
271
+ if failed_match:
272
+ tests_failed = int(failed_match.group(1))
273
+
274
+ # Extract skipped
275
+ skipped_match = re.search(r'(\d+)\s+skipped', tests_line)
276
+ if skipped_match:
277
+ tests_skipped = int(skipped_match.group(1))
278
+
279
+ # If we have total but not passed, calculate it
280
+ if tests_run > 0 and tests_passed == 0 and tests_failed == 0:
281
+ tests_passed = tests_run - tests_failed - tests_skipped
282
+
283
+ # Check for test suite failures (compilation errors, etc.)
284
+ if "Test Suites: " in stdout and " failed" in stdout:
285
+ suite_match = re.search(r'Test Suites:\s+(\d+)\s+failed', stdout)
286
+ if suite_match and tests_run == 0:
287
+ return {
288
+ "success": False,
289
+ "error": "Test suite failed to run (compilation/syntax error)",
290
+ "tests_run": 0,
291
+ "tests_passed": 0,
292
+ "tests_failed": 0,
293
+ "execution_mode": "modal",
294
+ "language": language,
295
+ "stdout": stdout,
296
+ "stderr": stderr
297
+ }
298
+
299
+ # Extract coverage percentage
300
+ coverage_percent = 0.0
301
+ # Jest coverage format: "All files | 85.71 | 75 | 100 | 85.71 |"
302
+ cov_match = re.search(r'All files\s*\|\s*([\d.]+)', stdout)
303
+ if cov_match:
304
+ coverage_percent = float(cov_match.group(1))
305
+
306
+ return {
307
+ "success": result.returncode == 0,
308
+ "tests_run": tests_run,
309
+ "tests_passed": tests_passed,
310
+ "tests_failed": tests_failed,
311
+ "execution_time": round(execution_time, 2),
312
+ "coverage_percent": coverage_percent,
313
+ "stdout": stdout,
314
+ "stderr": stderr,
315
+ "exit_code": result.returncode,
316
+ "execution_mode": "modal",
317
+ "language": language
318
+ }
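The Jest summary parsing in `javascript_runner.py` can be checked the same way; the sample line mirrors the Jest console output formats quoted in the comments above:

```python
import re

sample = "Tests:       1 failed, 1 skipped, 3 passed, 5 total"
tests_line = re.search(r'Tests:\s+(.+)', sample).group(1)

def count(label: str) -> int:
    m = re.search(rf'(\d+)\s+{label}', tests_line)
    return int(m.group(1)) if m else 0

print(count("total"), count("passed"), count("failed"), count("skipped"))  # 5 3 1 1
```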
src/sandbox/runners/python_runner.py ADDED
@@ -0,0 +1,219 @@
1
+ """
2
+ Python test runner for Modal sandbox execution.
3
+ Handles pytest execution with proper path setup and result parsing.
4
+ """
5
+
6
+ import subprocess
7
+ import tempfile
8
+ import time
9
+ import logging
10
+ import re
11
+ from pathlib import Path
12
+ from typing import Dict, List
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ def _validate_python_tests(tests: str) -> tuple:
18
+ """
19
+ Validate Python test code before execution.
20
+
21
+ Returns:
22
+ (is_valid, error_message)
23
+ """
24
+ # Check for basic pytest structure
25
+ if "def test_" not in tests and "class Test" not in tests:
26
+ return False, "No test functions found (must start with 'test_' or be in 'Test' class)"
27
+
28
+ # Check for imports
29
+ if "import" not in tests:
30
+ return False, "No import statements found"
31
+
32
+ # Check for basic syntax issues
33
+ try:
34
+ compile(tests, '<string>', 'exec')
35
+ except SyntaxError as e:
36
+ return False, f"Syntax error in test code: {str(e)}"
37
+
38
+ return True, ""
39
+
40
+
41
+ def run_python_tests(code: str, tests: str, requirements: List[str], module_name: str) -> Dict:
42
+ """
43
+ Run Python tests using pytest in Modal container.
44
+
45
+ Args:
46
+ code: Python source code
47
+ tests: Pytest test code
48
+ requirements: List of pip packages to install
49
+ module_name: Name of the module
50
+
51
+ Returns:
52
+ Dictionary with test results
53
+ """
54
+ # Validate tests before execution
55
+ is_valid, error_msg = _validate_python_tests(tests)
56
+ if not is_valid:
57
+ logger.error(f"Test validation failed: {error_msg}")
58
+ return {
59
+ "success": False,
60
+ "error": f"Test validation failed: {error_msg}",
61
+ "tests_run": 0,
62
+ "tests_passed": 0,
63
+ "tests_failed": 0,
64
+ "execution_mode": "modal",
65
+ "language": "python"
66
+ }
67
+
68
+ with tempfile.TemporaryDirectory() as tmpdir:
69
+ tmpdir_path = Path(tmpdir)
70
+
71
+ # Write code and tests in same directory for proper imports
72
+ code_file = tmpdir_path / f"{module_name}.py"
73
+ test_file = tmpdir_path / f"test_{module_name}.py"
74
+
75
+ # Ensure tests have proper path setup
76
+ if "sys.path" not in tests and "import sys" not in tests:
77
+ path_setup = """import sys
78
+ import os
79
+ # Ensure module can be imported
80
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
81
+
82
+ """
83
+ tests = path_setup + tests
84
+
85
+ code_file.write_text(code, encoding='utf-8')
86
+ test_file.write_text(tests, encoding='utf-8')
87
+
88
+ # Install additional requirements
89
+ if requirements:
90
+ try:
91
+ logger.info(f"Installing requirements: {requirements}")
92
+ install_result = subprocess.run(
93
+ ["pip", "install", "-q", "--no-cache-dir"] + requirements,
94
+ capture_output=True,
95
+ text=True,
96
+ timeout=120
97
+ )
98
+ if install_result.returncode != 0:
99
+ logger.warning(f"Some requirements failed to install: {install_result.stderr}")
100
+ except Exception as e:
101
+ logger.warning(f"Failed to install requirements: {e}")
102
+
103
+ start_time = time.time()
104
+
105
+ try:
106
+ # Run pytest with coverage and verbose output
107
+ result = subprocess.run(
108
+ [
109
+ "pytest",
110
+ str(test_file),
111
+ "-v",
112
+ "--tb=short",
113
+ "--timeout=30",
114
+ "-p", "no:warnings",
115
+ "--cov=" + module_name,
116
+ "--cov-report=term-missing"
117
+ ],
118
+ cwd=tmpdir,
119
+ capture_output=True,
120
+ text=True,
121
+ timeout=120
122
+ )
123
+ except subprocess.TimeoutExpired:
124
+ return {
125
+ "success": False,
126
+ "error": "Test execution timeout (>2 minutes)",
127
+ "tests_run": 0,
128
+ "tests_passed": 0,
129
+ "tests_failed": 0,
130
+ "execution_time": 120.0,
131
+ "execution_mode": "modal",
132
+ "language": "python"
133
+ }
134
+ except FileNotFoundError:
135
+ return {
136
+ "success": False,
137
+ "error": "pytest not found in container",
138
+ "tests_run": 0,
139
+ "tests_passed": 0,
140
+ "tests_failed": 0,
141
+ "execution_mode": "modal",
142
+ "language": "python"
143
+ }
144
+
145
+ execution_time = time.time() - start_time
146
+ stdout = result.stdout[:10000] # Truncate to prevent memory issues
147
+ stderr = result.stderr[:10000]
148
+
149
+ # Parse pytest output from summary line (more reliable than counting)
150
+ # Format: "3 passed, 1 failed, 1 skipped in 0.5s" or "3 passed in 0.5s"
151
+ tests_run = 0
152
+ tests_passed = 0
153
+ tests_failed = 0
154
+ tests_errors = 0
155
+ tests_skipped = 0
156
+
157
+ # Look for summary line
158
+ summary_match = re.search(r'=+\s*(.*?)\s+in\s+[\d.]+s\s*=+', stdout)
159
+ if summary_match:
160
+ summary = summary_match.group(1)
161
+
162
+ # Parse each component
163
+ passed_match = re.search(r'(\d+)\s+passed', summary)
164
+ if passed_match:
165
+ tests_passed = int(passed_match.group(1))
166
+
167
+ failed_match = re.search(r'(\d+)\s+failed', summary)
168
+ if failed_match:
169
+ tests_failed = int(failed_match.group(1))
170
+
171
+ error_match = re.search(r'(\d+)\s+error', summary)
172
+ if error_match:
173
+ tests_errors = int(error_match.group(1))
174
+
175
+ skipped_match = re.search(r'(\d+)\s+skipped', summary)
176
+ if skipped_match:
177
+ tests_skipped = int(skipped_match.group(1))
178
+
179
+ tests_run = tests_passed + tests_failed + tests_errors + tests_skipped
180
+
181
+ # Fallback: count individual test results if summary not found
182
+ if tests_run == 0:
183
+ # Verbose pytest lines end with a progress marker (e.g. "[ 50%]"), so don't anchor on "\n"
+ passed = stdout.count(" PASSED")
184
+ failed = stdout.count(" FAILED")
185
+ errors = stdout.count(" ERROR")
186
+ skipped = stdout.count(" SKIPPED")
187
+ tests_run = passed + failed + errors + skipped
188
+ tests_passed = passed
189
+ tests_failed = failed
190
+ tests_errors = errors
191
+ tests_skipped = skipped
192
+
193
+ # Extract coverage percentage from summary
194
+ coverage_percent = 0.0
195
+ # Branch-coverage style summary: "TOTAL  100  20  10  2  80%" (Stmts Miss Branch BrPart Cover)
196
+ cov_match = re.search(r'TOTAL\s+\d+\s+\d+\s+\d+\s+\d+\s+(\d+)%', stdout)
197
+ if cov_match:
198
+ coverage_percent = float(cov_match.group(1))
199
+ else:
200
+ # Simpler statement-coverage format, e.g. "TOTAL  100  20  80%"
201
+ cov_match = re.search(r'TOTAL.*?(\d+)%', stdout)
202
+ if cov_match:
203
+ coverage_percent = float(cov_match.group(1))
204
+
205
+ return {
206
+ "success": result.returncode == 0,
207
+ "tests_run": tests_run,
208
+ "tests_passed": tests_passed,
209
+ "tests_failed": tests_failed,
210
+ "tests_errors": tests_errors,
211
+ "tests_skipped": tests_skipped,
212
+ "execution_time": round(execution_time, 2),
213
+ "coverage_percent": coverage_percent,
214
+ "stdout": stdout,
215
+ "stderr": stderr,
216
+ "exit_code": result.returncode,
217
+ "execution_mode": "modal",
218
+ "language": "python"
219
+ }
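
The summary-line parsing above can be checked in isolation against a captured pytest transcript. A minimal sketch, assuming the same regexes as `run_python_tests`; the sample output string is made up for illustration:

```python
import re

# Hypothetical verbose pytest output with a trailing summary line
stdout = """
test_module.py::test_add PASSED                 [ 50%]
test_module.py::test_div FAILED                 [100%]
=================== 1 passed, 1 failed in 0.42s ===================
"""

counts = {"passed": 0, "failed": 0, "error": 0, "skipped": 0}
summary_match = re.search(r'=+\s*(.*?)\s+in\s+[\d.]+s\s*=+', stdout)
if summary_match:
    summary = summary_match.group(1)  # "1 passed, 1 failed"
    for key in counts:
        m = re.search(rf'(\d+)\s+{key}', summary)
        if m:
            counts[key] = int(m.group(1))

print(counts)  # {'passed': 1, 'failed': 1, 'error': 0, 'skipped': 0}
```
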
src/sandbox/validator.py ADDED
@@ -0,0 +1,718 @@
1
+ """
2
+ Modal Sandbox Validator - Executes tests in isolated Modal containers.
3
+ Phase 5: Test execution in secure sandbox environment.
4
+
5
+ Supports multiple languages with dedicated Modal container images.
6
+ Falls back to local execution when Modal is not available.
7
+ """
8
+
9
+ import os
10
+ import logging
11
+ import subprocess
12
+ import tempfile
13
+ import json
14
+ import time
15
+ from typing import Dict, List, Optional
16
+ from pathlib import Path
17
+
18
+ # Import Modal images and runners
19
+ from .images import (
20
+ MODAL_AVAILABLE, app, LANGUAGE_IMAGES, LANGUAGE_SUPPORT_STATUS,
21
+ get_image_for_language, get_support_status, is_language_supported
22
+ )
23
+ from .runners import LANGUAGE_RUNNERS, get_runner_for_language, is_runner_available
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ def _detect_language(file_path: str, code: str) -> str:
29
+ """Detect programming language from file extension or code content."""
30
+ if file_path:
31
+ ext = Path(file_path).suffix.lower()
32
+ extension_map = {
33
+ # Python
34
+ '.py': 'python', '.pyw': 'python', '.pyx': 'python',
35
+ # Java
36
+ '.java': 'java',
37
+ # JavaScript/TypeScript
38
+ '.js': 'javascript', '.jsx': 'javascript', '.mjs': 'javascript', '.cjs': 'javascript',
39
+ '.ts': 'typescript', '.tsx': 'typescript'
40
+ }
41
+ if ext in extension_map:
42
+ return extension_map[ext]
43
+
44
+ # Fallback: detect from code content
45
+ if code:
46
+ if 'public class' in code or 'import java.' in code:
47
+ return 'java'
48
+ elif 'def ' in code and ('import ' in code or 'from ' in code):
49
+ return 'python'
50
+ elif 'function ' in code or 'const ' in code or 'let ' in code:
51
+ return 'javascript'
52
+ elif 'interface ' in code or 'type ' in code:
53
+ return 'typescript'
54
+
55
+ return 'python' # Default
56
+
57
+
58
+ def run_tests_locally(code: str, tests: str, requirements: List[str],
59
+ module_name: str = "module", language: str = "python") -> Dict:
60
+ """
61
+ Execute tests locally (fallback when Modal is not available).
62
+
63
+ Args:
64
+ code: Modernized code to test
65
+ tests: Generated test code
66
+ requirements: Additional packages needed
67
+ module_name: Name of the module
68
+ language: Programming language
69
+
70
+ Returns:
71
+ Dictionary with test results
72
+ """
73
+ # Only support Python, Java, JavaScript, and TypeScript
74
+ supported_languages = ['python', 'java', 'javascript', 'typescript']
75
+
76
+ if language not in supported_languages:
77
+ return {
78
+ "success": False,
79
+ "error": f"Unsupported language: {language}. Supported languages: {', '.join(supported_languages)}",
80
+ "tests_run": 0,
81
+ "tests_passed": 0,
82
+ "tests_failed": 0,
83
+ "execution_mode": "unsupported"
84
+ }
85
+
86
+ if language == 'python':
87
+ return _run_python_tests_locally(code, tests, requirements, module_name)
88
+ elif language == 'java':
89
+ return _run_java_tests_locally(code, tests, module_name)
90
+ elif language in ('javascript', 'typescript'):
91
+ return _run_js_tests_locally(code, tests, module_name, language)
92
+
93
+
94
+ def _run_python_tests_locally(code: str, tests: str, requirements: List[str],
95
+ module_name: str) -> Dict:
96
+ """Run Python tests locally using pytest."""
97
+ with tempfile.TemporaryDirectory() as tmpdir:
98
+ tmpdir_path = Path(tmpdir)
99
+
100
+ # Write code and tests in same directory for proper imports
101
+ code_file = tmpdir_path / f"{module_name}.py"
102
+ test_file = tmpdir_path / f"test_{module_name}.py"
103
+
104
+ # Add sys.path manipulation to tests if not already present
105
+ # This ensures tests can import the module even from subdirectories
106
+ if "sys.path" not in tests and "import sys" not in tests:
107
+ path_setup = """import sys
108
+ import os
109
+ # Ensure module can be imported
110
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
111
+
112
+ """
113
+ tests = path_setup + tests
114
+
115
+ code_file.write_text(code, encoding='utf-8')
116
+ test_file.write_text(tests, encoding='utf-8')
117
+
118
+ # Install additional requirements
119
+ if requirements:
120
+ try:
121
+ subprocess.run(
122
+ ["pip", "install", "-q", "--no-cache-dir"] + requirements,
123
+ capture_output=True,
124
+ timeout=60,
125
+ check=False # Don't fail on install errors
126
+ )
127
+ except Exception as e:
128
+ logger.warning(f"Failed to install requirements: {e}")
129
+
130
+ start_time = time.time()
131
+
132
+ try:
133
+ result = subprocess.run(
134
+ [
135
+ "pytest",
136
+ str(test_file),
137
+ "-v",
138
+ "--tb=short",
139
+ "--timeout=30",
140
+ "-p", "no:warnings"
141
+ ],
142
+ cwd=tmpdir,
143
+ capture_output=True,
144
+ text=True,
145
+ timeout=120
146
+ )
147
+ except subprocess.TimeoutExpired:
148
+ return {
149
+ "success": False,
150
+ "error": "Test execution timeout (>2 minutes)",
151
+ "tests_run": 0,
152
+ "tests_passed": 0,
153
+ "tests_failed": 0,
154
+ "execution_time": 120.0,
155
+ "execution_mode": "local"
156
+ }
157
+ except FileNotFoundError:
158
+ return {
159
+ "success": False,
160
+ "error": "pytest not found. Install with: pip install pytest",
161
+ "tests_run": 0,
162
+ "tests_passed": 0,
163
+ "tests_failed": 0,
164
+ "execution_mode": "local"
165
+ }
166
+
167
+ execution_time = time.time() - start_time
168
+ stdout = result.stdout
169
+
170
+ # Count tests
171
+ passed = stdout.count(" PASSED")
172
+ failed = stdout.count(" FAILED")
173
+ errors = stdout.count(" ERROR")
174
+ test_count = passed + failed + errors
175
+
176
+ return {
177
+ "success": result.returncode == 0,
178
+ "tests_run": test_count,
179
+ "tests_passed": passed,
180
+ "tests_failed": failed,
181
+ "tests_errors": errors,
182
+ "execution_time": round(execution_time, 2),
183
+ "coverage_percent": 0.0, # Coverage not measured in local mode
184
+ "stdout": stdout,
185
+ "stderr": result.stderr,
186
+ "exit_code": result.returncode,
187
+ "execution_mode": "local"
188
+ }
189
+
190
+
191
+ def _run_java_tests_locally(code: str, tests: str, module_name: str) -> Dict:
192
+ """Run Java tests locally using JUnit."""
193
+ with tempfile.TemporaryDirectory() as tmpdir:
194
+ tmpdir_path = Path(tmpdir)
195
+
196
+ # Extract class name from code
197
+ class_name = module_name.replace('_', '').title()
198
+ if 'public class ' in code:
199
+ import re
200
+ match = re.search(r'public class (\w+)', code)
201
+ if match:
202
+ class_name = match.group(1)
203
+
204
+ # Write Java files
205
+ code_file = tmpdir_path / f"{class_name}.java"
206
+ test_file = tmpdir_path / f"{class_name}Test.java"
207
+
208
+ code_file.write_text(code, encoding='utf-8')
209
+ test_file.write_text(tests, encoding='utf-8')
210
+
211
+ start_time = time.time()
212
+
213
+ try:
214
+ # Compile
215
+ compile_result = subprocess.run(
216
+ ["javac", str(code_file), str(test_file)],
217
+ cwd=tmpdir,
218
+ capture_output=True,
219
+ text=True,
220
+ timeout=60
221
+ )
222
+
223
+ if compile_result.returncode != 0:
224
+ return {
225
+ "success": False,
226
+ "error": f"Compilation failed: {compile_result.stderr}",
227
+ "tests_run": 0,
228
+ "tests_passed": 0,
229
+ "tests_failed": 0,
230
+ "execution_mode": "local"
231
+ }
232
+
233
+ # Run tests (simplified - would need JUnit runner in real scenario)
234
+ run_result = subprocess.run(
235
+ ["java", f"{class_name}Test"],
236
+ cwd=tmpdir,
237
+ capture_output=True,
238
+ text=True,
239
+ timeout=120
240
+ )
241
+
242
+ execution_time = time.time() - start_time
243
+
244
+ return {
245
+ "success": run_result.returncode == 0,
246
+ "tests_run": 1, # Simplified
247
+ "tests_passed": 1 if run_result.returncode == 0 else 0,
248
+ "tests_failed": 0 if run_result.returncode == 0 else 1,
249
+ "execution_time": round(execution_time, 2),
250
+ "stdout": run_result.stdout,
251
+ "stderr": run_result.stderr,
252
+ "exit_code": run_result.returncode,
253
+ "execution_mode": "local"
254
+ }
255
+
256
+ except FileNotFoundError:
257
+ return {
258
+ "success": False,
259
+ "error": "Java compiler (javac) not found. Install JDK.",
260
+ "tests_run": 0,
261
+ "tests_passed": 0,
262
+ "tests_failed": 0,
263
+ "execution_mode": "local"
264
+ }
265
+ except subprocess.TimeoutExpired:
266
+ return {
267
+ "success": False,
268
+ "error": "Java test execution timeout",
269
+ "tests_run": 0,
270
+ "tests_passed": 0,
271
+ "tests_failed": 0,
272
+ "execution_mode": "local"
273
+ }
274
+
275
+
276
+ def _run_js_tests_locally(code: str, tests: str, module_name: str,
277
+ language: str) -> Dict:
278
+ """Run JavaScript/TypeScript tests locally using Jest or Node."""
279
+ with tempfile.TemporaryDirectory() as tmpdir:
280
+ tmpdir_path = Path(tmpdir)
281
+
282
+ ext = '.ts' if language == 'typescript' else '.js'
283
+
284
+ # Write files
285
+ code_file = tmpdir_path / f"{module_name}{ext}"
286
+ test_file = tmpdir_path / f"{module_name}.test{ext}"
287
+
288
+ code_file.write_text(code, encoding='utf-8')
289
+ test_file.write_text(tests, encoding='utf-8')
290
+
291
+ # Create minimal package.json
292
+ package_json = {
293
+ "name": "test-sandbox",
294
+ "scripts": {"test": "jest"},
295
+ "devDependencies": {"jest": "^29.0.0"}
296
+ }
297
+ if language == 'typescript':
298
+ package_json["devDependencies"]["ts-jest"] = "^29.0.0"
299
+ package_json["devDependencies"]["typescript"] = "^5.0.0"
300
+
301
+ (tmpdir_path / "package.json").write_text(json.dumps(package_json))
302
+
303
+ start_time = time.time()
304
+
305
+ try:
306
+ # Try running with node directly for simple tests
307
+ run_result = subprocess.run(
308
+ ["node", str(test_file)],
309
+ cwd=tmpdir,
310
+ capture_output=True,
311
+ text=True,
312
+ timeout=60
313
+ )
314
+
315
+ execution_time = time.time() - start_time
316
+
317
+ return {
318
+ "success": run_result.returncode == 0,
319
+ "tests_run": 1,
320
+ "tests_passed": 1 if run_result.returncode == 0 else 0,
321
+ "tests_failed": 0 if run_result.returncode == 0 else 1,
322
+ "execution_time": round(execution_time, 2),
323
+ "stdout": run_result.stdout,
324
+ "stderr": run_result.stderr,
325
+ "exit_code": run_result.returncode,
326
+ "execution_mode": "local"
327
+ }
328
+
329
+ except FileNotFoundError:
330
+ return {
331
+ "success": False,
332
+ "error": "Node.js not found. Install Node.js.",
333
+ "tests_run": 0,
334
+ "tests_passed": 0,
335
+ "tests_failed": 0,
336
+ "execution_mode": "local"
337
+ }
338
+ except subprocess.TimeoutExpired:
339
+ return {
340
+ "success": False,
341
+ "error": "JavaScript test execution timeout",
342
+ "tests_run": 0,
343
+ "tests_passed": 0,
344
+ "tests_failed": 0,
345
+ "execution_mode": "local"
346
+ }
347
+
348
+
349
+ # Import Modal executor if available
350
+ if MODAL_AVAILABLE:
351
+ try:
352
+ from .modal_executor import execute_in_modal
353
+ MODAL_EXECUTOR_AVAILABLE = True
354
+ except Exception as e:
355
+ logger.warning(f"Failed to import Modal executor: {e}")
356
+ MODAL_EXECUTOR_AVAILABLE = False
357
+ else:
358
+ MODAL_EXECUTOR_AVAILABLE = False
359
+
360
+
361
+ def run_tests_in_sandbox(code: str, tests: str, requirements: List[str],
362
+ module_name: str = "module", language: str = "python") -> Dict:
363
+ """
364
+ Execute tests in sandbox (Modal if available, otherwise local).
365
+
366
+ Args:
367
+ code: Source code
368
+ tests: Test code
369
+ requirements: Package requirements
370
+ module_name: Module name
371
+ language: Programming language
372
+
373
+ Returns:
374
+ Test execution results
375
+ """
376
+ if MODAL_EXECUTOR_AVAILABLE:
377
+ try:
378
+ return execute_in_modal(code, tests, requirements, module_name, language)
379
+ except Exception as e:
380
+ logger.warning(f"Modal execution failed: {e}, falling back to local")
381
+ return run_tests_locally(code, tests, requirements, module_name, language)
382
+ else:
383
+ logger.info("Modal not available, running tests locally")
384
+ return run_tests_locally(code, tests, requirements, module_name, language)
385
+
386
+
387
+ class ModalSandboxValidator:
388
+ """
389
+ Validates code transformations using Modal sandbox.
390
+ Provides safe, isolated test execution environment.
391
+ Falls back to local execution when Modal is not available.
392
+
393
+ Supports multiple languages: Python, Java, JavaScript, TypeScript, etc.
394
+ """
395
+
396
+ def __init__(self, prefer_modal: bool = None):
397
+ """
398
+ Initialize Modal Sandbox Validator.
399
+
400
+ Args:
401
+ prefer_modal: If True, try Modal first, fallback to local.
402
+ If False, always use local execution.
403
+ If None (default), auto-detect based on environment.
404
+ """
405
+ # Import config to get environment-aware settings
406
+ from .config import should_prefer_modal, validate_environment, IS_HUGGINGFACE
407
+
408
+ # Auto-detect if not specified
409
+ if prefer_modal is None:
410
+ prefer_modal = should_prefer_modal()
411
+
412
+ self.prefer_modal = prefer_modal and MODAL_AVAILABLE
413
+ self.is_huggingface = IS_HUGGINGFACE
414
+ self.app = app
415
+
416
+ # Validate environment configuration
417
+ validate_environment()
418
+
419
+ if self.is_huggingface and not self.prefer_modal:
420
+ logger.error("Running on Hugging Face but Modal is not available!")
421
+ logger.error("Test execution will fail. Please configure Modal.")
422
+
423
+ if self.prefer_modal:
424
+ logger.info("ModalSandboxValidator initialized with Modal support")
425
+ else:
426
+ logger.info("ModalSandboxValidator initialized (local execution mode)")
427
+
428
+ def validate_transformation(
429
+ self,
430
+ original_code: str,
431
+ modernized_code: str,
432
+ tests: str,
433
+ requirements: Optional[List[str]] = None,
434
+ file_path: Optional[str] = None
435
+ ) -> Dict:
436
+ """
437
+ Validate code transformation by running tests in sandbox.
438
+
439
+ Args:
440
+ original_code: Original legacy code
441
+ modernized_code: Modernized code
442
+ tests: Generated test code
443
+ requirements: Additional packages needed
444
+ file_path: Path to the file (used to extract module name and language)
445
+
446
+ Returns:
447
+ Validation results with test metrics
448
+ """
449
+ logger.info("Starting sandbox validation")
450
+
451
+ # Detect language from file path or code
452
+ language = _detect_language(file_path, modernized_code)
453
+ logger.info(f"Detected language: {language}")
454
+
455
+ # Extract requirements based on language
456
+ if requirements is None:
457
+ requirements = self._extract_requirements(modernized_code, language)
458
+
459
+ # Extract module name from file path
460
+ if file_path:
461
+ module_name = Path(file_path).stem
462
+ else:
463
+ module_name = "module"
464
+
465
+ logger.info(f"Validating module: {module_name} (language: {language})")
466
+
467
+ # Try Modal first if available and preferred
468
+ if self.prefer_modal and MODAL_AVAILABLE:
469
+ try:
470
+ logger.info("Attempting Modal sandbox execution...")
471
+ results = run_tests_in_sandbox(
472
+ code=modernized_code,
473
+ tests=tests,
474
+ requirements=requirements,
475
+ module_name=module_name,
476
+ language=language
477
+ )
478
+
479
+ results['execution_mode'] = 'modal'
480
+ logger.info(f"Modal validation complete: {results['tests_passed']}/{results['tests_run']} passed")
481
+ return results
482
+
483
+ except Exception as e:
484
+ logger.warning(f"Modal execution failed: {e}, falling back to local")
485
+
486
+ # Fallback to local execution
487
+ logger.info("Running tests locally...")
488
+ try:
489
+ results = run_tests_locally(
490
+ code=modernized_code,
491
+ tests=tests,
492
+ requirements=requirements,
493
+ module_name=module_name,
494
+ language=language
495
+ )
496
+
497
+ logger.info(f"Local validation complete: {results['tests_passed']}/{results['tests_run']} passed")
498
+ return results
499
+
500
+ except Exception as e:
501
+ logger.error(f"Local validation error: {e}")
502
+ return {
503
+ "success": False,
504
+ "error": str(e),
505
+ "tests_run": 0,
506
+ "tests_passed": 0,
507
+ "tests_failed": 0,
508
+ "execution_mode": "failed"
509
+ }
510
+
511
+ def validate_batch(
512
+ self,
513
+ transformations: List[Dict]
514
+ ) -> List[Dict]:
515
+ """
516
+ Validate multiple transformations in parallel.
517
+
518
+ Args:
519
+ transformations: List of transformation dicts with code and tests
520
+
521
+ Returns:
522
+ List of validation results
523
+ """
524
+ logger.info(f"Starting batch validation for {len(transformations)} files")
525
+
526
+ results = []
527
+
528
+ # Try Modal batch execution if available
529
+ if self.prefer_modal and MODAL_AVAILABLE:
530
+ try:
531
+ # For batch operations, we can call functions directly
532
+ # Modal handles the parallelization internally
533
+ for t in transformations:
534
+ file_path = t.get('file_path', '')
535
+ language = _detect_language(file_path, t['modernized_code'])
536
+
537
+ try:
538
+ result = run_tests_in_sandbox(
539
+ code=t['modernized_code'],
540
+ tests=t['tests'],
541
+ requirements=t.get('requirements', []),
542
+ module_name=Path(file_path).stem if file_path else 'module',
543
+ language=language
544
+ )
545
+ result['file_path'] = file_path
546
+ result['execution_mode'] = 'modal'
547
+ results.append(result)
548
+ except Exception as e:
549
+ logger.error(f"Error validating {file_path}: {e}")
550
+ results.append({
551
+ "file_path": file_path,
552
+ "success": False,
553
+ "error": str(e),
554
+ "execution_mode": "modal_failed"
555
+ })
556
+
557
+ logger.info(f"Modal batch validation complete: {len(results)} results")
558
+ return results
559
+
560
+ except Exception as e:
561
+ logger.warning(f"Modal batch execution failed: {e}, falling back to local")
562
+ results = [] # Reset for local execution
563
+
564
+ # Fallback to local sequential execution
565
+ for t in transformations:
566
+ file_path = t.get('file_path', '')
567
+ language = _detect_language(file_path, t['modernized_code'])
568
+
569
+ try:
570
+ result = run_tests_locally(
571
+ code=t['modernized_code'],
572
+ tests=t['tests'],
573
+ requirements=t.get('requirements', []),
574
+ module_name=Path(file_path).stem if file_path else 'module',
575
+ language=language
576
+ )
577
+ result['file_path'] = file_path
578
+ results.append(result)
579
+ except Exception as e:
580
+ logger.error(f"Error validating {file_path}: {e}")
581
+ results.append({
582
+ "file_path": file_path,
583
+ "success": False,
584
+ "error": str(e),
585
+ "execution_mode": "local_failed"
586
+ })
587
+
588
+ logger.info(f"Local batch validation complete: {len(results)} results")
589
+ return results
590
+
591
+ def _extract_requirements(self, code: str, language: str = "python") -> List[str]:
592
+ """
593
+ Extract required packages from import statements.
594
+
595
+ Args:
596
+ code: Source code
597
+ language: Programming language
598
+
599
+ Returns:
600
+ List of package names
601
+ """
602
+ requirements = []
603
+
604
+ if language == 'python':
605
+ # Python import to package mappings
606
+ import_map = {
607
+ 'sqlalchemy': 'sqlalchemy',
608
+ 'pymysql': 'pymysql',
609
+ 'requests': 'requests',
610
+ 'flask': 'flask',
611
+ 'django': 'django',
612
+ 'numpy': 'numpy',
613
+ 'pandas': 'pandas',
614
+ 'fastapi': 'fastapi',
615
+ 'pydantic': 'pydantic',
616
+ 'aiohttp': 'aiohttp',
617
+ 'httpx': 'httpx',
618
+ 'pytest': 'pytest'
619
+ }
620
+
621
+ for line in code.split('\n'):
622
+ line = line.strip()
623
+ if line.startswith('import ') or line.startswith('from '):
624
+ parts = line.split()
625
+ if len(parts) >= 2:
626
+ module = parts[1].split('.')[0]
627
+ if module in import_map:
628
+ pkg = import_map[module]
629
+ if pkg not in requirements:
630
+ requirements.append(pkg)
631
+
632
+ elif language == 'java':
633
+ # Java dependencies would be handled via Maven/Gradle
634
+ # Return empty list - dependencies managed differently
635
+ pass
636
+
637
+ elif language in ('javascript', 'typescript'):
638
+ # JavaScript/TypeScript - look for require/import statements
639
+ import_map = {
640
+ 'express': 'express',
641
+ 'axios': 'axios',
642
+ 'lodash': 'lodash',
643
+ 'moment': 'moment',
644
+ 'react': 'react',
645
+ 'jest': 'jest'
646
+ }
647
+
648
+ for line in code.split('\n'):
649
+ line = line.strip()
650
+ for pkg in import_map:
651
+ if f"'{pkg}'" in line or f'"{pkg}"' in line:
652
+ if pkg not in requirements:
653
+ requirements.append(pkg)
654
+
655
+ return requirements
656
+
657
+ def test_behavioral_equivalence(
658
+ self,
659
+ original_code: str,
660
+ modernized_code: str,
661
+ test_cases: List[Dict]
662
+ ) -> Dict:
663
+ """
664
+ Test that modernized code produces same outputs as original.
665
+
666
+ Args:
667
+ original_code: Original code
668
+ modernized_code: Modernized code
669
+ test_cases: List of test case dicts with inputs and expected outputs
670
+
671
+ Returns:
672
+ Equivalence test results
673
+ """
674
+ logger.info("Testing behavioral equivalence")
675
+
676
+ # Generate equivalence test
677
+ equivalence_test = self._generate_equivalence_test(test_cases)
678
+
679
+ # Test both versions
680
+ original_results = self.validate_transformation(
681
+ original_code, original_code, equivalence_test
682
+ )
683
+
684
+ modernized_results = self.validate_transformation(
685
+ original_code, modernized_code, equivalence_test
686
+ )
687
+
688
+ # Compare results
689
+ equivalence_score = 0.0
690
+ if original_results['success'] and modernized_results['success']:
691
+ if original_results['tests_passed'] == modernized_results['tests_passed']:
692
+ equivalence_score = 1.0
693
+ else:
694
+ equivalence_score = (
695
+ modernized_results['tests_passed'] /
696
+ max(original_results['tests_passed'], 1)
697
+ )
698
+
699
+ return {
700
+ "behavioral_equivalence": equivalence_score >= 0.95,
701
+ "equivalence_score": round(equivalence_score, 3),
702
+ "original_results": original_results,
703
+ "modernized_results": modernized_results
704
+ }
705
+
706
+ def _generate_equivalence_test(self, test_cases: List[Dict]) -> str:
707
+ """Generate pytest code for equivalence testing."""
708
+ test_code = "import pytest\n\n"
709
+
710
+ for i, case in enumerate(test_cases):
711
+ test_code += f"""
712
+ def test_equivalence_{i}():
713
+ \"\"\"Test case {i}: {case.get('description', 'equivalence test')}\"\"\"
714
+ # Test implementation would go here
715
+ assert True
716
+ """
717
+
718
+ return test_code
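
For orientation, here is a hedged sketch of how the validator class above might be driven from application code; the legacy and modernized snippets are placeholders, and whether Modal or the local fallback executes depends on the environment configuration:

```python
from src.sandbox.validator import ModalSandboxValidator

validator = ModalSandboxValidator()  # auto-detects Modal vs. local execution

legacy = "def add(a, b):\n    return a + b\n"
modern = "def add(a: int, b: int) -> int:\n    return a + b\n"
tests = (
    "from module import add\n\n"
    "def test_add():\n"
    "    assert add(2, 3) == 5\n"
)

result = validator.validate_transformation(
    original_code=legacy,
    modernized_code=modern,
    tests=tests,
    file_path="module.py",  # drives language detection and the module name the tests import
)
print(result["execution_mode"], f"{result['tests_passed']}/{result['tests_run']} passed")
```
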
src/search/__init__.py ADDED
@@ -0,0 +1,8 @@
1
+ """
2
+ Search module for semantic code search using LlamaIndex and Chroma.
3
+ """
4
+
5
+ from .vector_store import CodeSearchEngine
6
+ from .embeddings import ModalEmbedding, GeminiEmbeddingWrapper, get_embedding_model
7
+
8
+ __all__ = ['CodeSearchEngine', 'ModalEmbedding', 'GeminiEmbeddingWrapper', 'get_embedding_model']
src/search/embeddings.py ADDED
@@ -0,0 +1,350 @@
1
+ """
2
+ Custom embedding implementations for Modal and Gemini.
3
+ """
4
+
5
+ import os
6
+ import logging
7
+ from typing import List, Optional
8
+ from llama_index.core.embeddings import BaseEmbedding
9
+ from llama_index.core.bridge.pydantic import PrivateAttr
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # Global tokenizer instance (lazy loaded)
14
+ _tokenizer = None
15
+
16
+ def get_tokenizer():
17
+ """Get or create the tokenizer for BAAI/bge-base-en-v1.5."""
18
+ global _tokenizer
19
+ if _tokenizer is None:
20
+ try:
21
+ from transformers import AutoTokenizer
22
+ _tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-base-en-v1.5")
23
+ logger.info("Tokenizer loaded successfully")
24
+ except Exception as e:
25
+ logger.warning(f"Failed to load tokenizer: {e}. Falling back to word-based truncation.")
26
+ _tokenizer = False # Mark as failed
27
+ return _tokenizer if _tokenizer else None
28
+
29
+
30
+ class ModalEmbedding(BaseEmbedding):
31
+ """
32
+ Custom embedding class that uses Modal's deployed TEI service.
33
+ Primary embedding model for the application.
34
+ """
35
+
36
+ _modal_instance: Optional[object] = PrivateAttr(default=None)
37
+ _model_name: str = PrivateAttr(default="BAAI/bge-base-en-v1.5")
38
+ _max_text_length: int = PrivateAttr(default=4000) # Reduced max chars per text
39
+ _batch_size: int = PrivateAttr(default=2) # Very small batches to avoid 413
40
+
41
+ def __init__(self, **kwargs):
42
+ """Initialize Modal embedding client."""
43
+ super().__init__(**kwargs)
44
+ try:
45
+ import modal
46
+ # Use modal.Cls.from_name and get an instance
47
+ TextEmbeddingsInference = modal.Cls.from_name(
48
+ "text-embeddings-inference-api",
49
+ "TextEmbeddingsInference"
50
+ )
51
+ # Create an instance and store it
52
+ self._modal_instance = TextEmbeddingsInference()
53
+ logger.info("ModalEmbedding initialized successfully")
54
+ except Exception as e:
55
+ logger.error(f"Failed to initialize Modal embedding: {e}")
56
+ raise
57
+
58
+ def _truncate_text(self, text: str) -> str:
59
+ """Truncate text to max token limit using proper tokenization."""
60
+ # Modal TEI has a hard limit of 512 tokens
61
+ # Use 500 tokens to be safe (leave some buffer)
62
+ max_tokens = 500
63
+
64
+ tokenizer = get_tokenizer()
65
+
66
+ if tokenizer:
67
+ # Use proper tokenization
68
+ try:
69
+ tokens = tokenizer.encode(text, add_special_tokens=False)
70
+ if len(tokens) > max_tokens:
71
+ # Truncate to max_tokens
72
+ truncated_tokens = tokens[:max_tokens]
73
+ # Decode back to text
74
+ return tokenizer.decode(truncated_tokens, skip_special_tokens=True)
75
+ return text
76
+ except Exception as e:
77
+ logger.warning(f"Tokenization failed: {e}. Using word-based fallback.")
78
+
79
+ # Fallback: word-based truncation (conservative estimate)
80
+ # Assume 1.3 tokens per word: 500 tokens ≈ 385 words
81
+ # Use 250 words to be very conservative
82
+ words = text.split()
83
+ if len(words) > 250:
84
+ truncated_words = words[:250]
85
+ return ' '.join(truncated_words)
86
+ return text
87
+
88
+ @classmethod
89
+ def class_name(cls) -> str:
90
+ return "ModalEmbedding"
91
+
92
+ async def _aget_query_embedding(self, query: str) -> List[float]:
93
+ """Get query embedding asynchronously."""
94
+ return await self._aget_text_embedding(query)
95
+
96
+ async def _aget_text_embedding(self, text: str) -> List[float]:
97
+ """Get text embedding asynchronously."""
98
+ try:
99
+ text = self._truncate_text(text)
100
+ embeddings = await self._modal_instance.embed.remote.aio([text])
101
+ return embeddings[0]
102
+ except Exception as e:
103
+ logger.error(f"Error getting embedding from Modal: {e}")
104
+ raise
105
+
106
+ def _get_query_embedding(self, query: str) -> List[float]:
107
+ """Get query embedding synchronously."""
108
+ return self._get_text_embedding(query)
109
+
110
+ def _get_text_embedding(self, text: str) -> List[float]:
111
+ """Get text embedding synchronously."""
112
+ try:
113
+ text = self._truncate_text(text)
114
+ embeddings = self._modal_instance.embed.remote([text])
115
+ return embeddings[0]
116
+ except Exception as e:
117
+ logger.error(f"Error getting embedding from Modal: {e}")
118
+ # If Modal fails due to size limits, try to fall back to Gemini for this request
119
+ if "413" in str(e) or "Payload Too Large" in str(e) or "Input validation error" in str(e):
120
+ logger.warning("Modal embedding failed due to size limits, attempting Gemini fallback for this request")
121
+ try:
122
+ gemini_wrapper = GeminiEmbeddingWrapper()
123
+ return gemini_wrapper._get_text_embedding(text)
124
+ except Exception as gemini_e:
125
+ logger.error(f"Gemini fallback also failed: {gemini_e}")
126
+ raise e
127
+ raise
128
+
129
+ def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
130
+ """Get embeddings for multiple texts with batching."""
131
+ # Truncate all texts
132
+ texts = [self._truncate_text(t) for t in texts]
133
+
134
+ # Process in smaller batches to avoid payload size issues
135
+ all_embeddings = []
136
+ for i in range(0, len(texts), self._batch_size):
137
+ batch = texts[i:i + self._batch_size]
138
+ try:
139
+ batch_embeddings = self._modal_instance.embed.remote(batch)
140
+ all_embeddings.extend(batch_embeddings)
141
+ except Exception as e:
142
+ logger.error(f"Error getting embeddings from Modal for batch {i//self._batch_size + 1}: {e}")
143
+ raise
144
+
145
+ return all_embeddings
146
+
147
+ async def _aget_text_embeddings(self, texts: List[str]) -> List[List[float]]:
148
+ """Get embeddings for multiple texts asynchronously with batching."""
149
+ # Truncate all texts
150
+ texts = [self._truncate_text(t) for t in texts]
151
+
152
+ # Process in smaller batches to avoid payload size issues
153
+ all_embeddings = []
154
+ for i in range(0, len(texts), self._batch_size):
155
+ batch = texts[i:i + self._batch_size]
156
+ try:
157
+ batch_embeddings = await self._modal_instance.embed.remote.aio(batch)
158
+ all_embeddings.extend(batch_embeddings)
159
+ except Exception as e:
160
+ logger.error(f"Error getting embeddings from Modal for batch {i//self._batch_size + 1}: {e}")
161
+ raise
162
+ 
+ # Return the collected batch embeddings (mirrors the sync variant above)
+ return all_embeddings
163
+
164
+ class NebiusEmbeddingWrapper(BaseEmbedding):
165
+ """
166
+ Wrapper for Nebius embeddings using OpenAI-compatible API.
167
+ Uses Qwen/Qwen3-Embedding-8B model (4096 dimensions).
168
+ """
169
+
170
+ _client: Optional[object] = PrivateAttr(default=None)
171
+ _model_name: str = PrivateAttr(default="Qwen/Qwen3-Embedding-8B")
172
+
173
+ def __init__(self, api_key: Optional[str] = None, model_name: str = "Qwen/Qwen3-Embedding-8B", **kwargs):
174
+ """Initialize Nebius embedding client."""
175
+ super().__init__(**kwargs)
176
+
177
+ # Get API key from environment if not provided
178
+ if not api_key:
179
+ api_key = os.getenv("NEBIUS_API_KEY")
180
+
181
+ if not api_key:
182
+ raise ValueError("NEBIUS_API_KEY not found")
183
+
184
+ try:
185
+ from openai import OpenAI
186
+ self._client = OpenAI(
187
+ base_url="https://api.tokenfactory.nebius.com/v1/",
188
+ api_key=api_key
189
+ )
190
+ self._model_name = model_name
191
+ logger.info(f"NebiusEmbeddingWrapper initialized with model: {model_name}")
192
+ except Exception as e:
193
+ logger.error(f"Failed to initialize Nebius embedding: {e}")
194
+ raise
195
+
196
+ @classmethod
197
+ def class_name(cls) -> str:
198
+ return "NebiusEmbeddingWrapper"
199
+
200
+ def _get_query_embedding(self, query: str) -> List[float]:
201
+ """Get query embedding."""
202
+ return self._get_text_embedding(query)
203
+
204
+ def _get_text_embedding(self, text: str) -> List[float]:
205
+ """Get text embedding."""
206
+ try:
207
+ response = self._client.embeddings.create(
208
+ model=self._model_name,
209
+ input=text
210
+ )
211
+ return response.data[0].embedding
212
+ except Exception as e:
213
+ logger.error(f"Error getting embedding from Nebius: {e}")
214
+ raise
215
+
216
+ def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
217
+ """Get embeddings for multiple texts."""
218
+ try:
219
+ response = self._client.embeddings.create(
220
+ model=self._model_name,
221
+ input=texts
222
+ )
223
+ # Sort by index to ensure correct order
224
+ sorted_data = sorted(response.data, key=lambda x: x.index)
225
+ return [item.embedding for item in sorted_data]
226
+ except Exception as e:
227
+ logger.error(f"Error getting batch embeddings from Nebius: {e}")
228
+ raise
229
+
230
+ async def _aget_query_embedding(self, query: str) -> List[float]:
231
+ """Get query embedding asynchronously."""
232
+ return self._get_query_embedding(query)
233
+
234
+ async def _aget_text_embedding(self, text: str) -> List[float]:
235
+ """Get text embedding asynchronously."""
236
+ return self._get_text_embedding(text)
237
+
238
+
239
+ class GeminiEmbeddingWrapper(BaseEmbedding):
240
+ """
241
+ Wrapper for Gemini embeddings using the new google-genai SDK.
242
+ Fallback embedding model.
243
+ """
244
+
245
+ _client: Optional[object] = PrivateAttr(default=None)
246
+ _model_name: str = PrivateAttr(default="models/gemini-embedding-001")
247
+
248
+ def __init__(self, api_key: Optional[str] = None, model_name: str = "models/gemini-embedding-001", **kwargs):
249
+ """Initialize Gemini embedding client."""
250
+ super().__init__(**kwargs)
251
+
252
+ # Use centralized config if no API key provided
253
+ if not api_key:
254
+ try:
255
+ from src.config import GeminiConfig
256
+ api_key = GeminiConfig.get_api_key()
257
+ except Exception:
258
+ # Fallback to environment variable
259
+ api_key = os.getenv("GEMINI_API_KEY")
260
+
261
+ if not api_key:
262
+ raise ValueError("GEMINI_API_KEY not found")
263
+
264
+ try:
265
+ from google import genai
266
+ self._client = genai.Client(api_key=api_key)
267
+ self._model_name = model_name
268
+ logger.info(f"GeminiEmbeddingWrapper initialized with model: {model_name}")
269
+ except Exception as e:
270
+ logger.error(f"Failed to initialize Gemini embedding: {e}")
271
+ raise
272
+
273
+ @classmethod
274
+ def class_name(cls) -> str:
275
+ return "GeminiEmbeddingWrapper"
276
+
277
+ def _get_query_embedding(self, query: str) -> List[float]:
278
+ """Get query embedding."""
279
+ return self._get_text_embedding(query)
280
+
281
+ def _get_text_embedding(self, text: str) -> List[float]:
282
+ """Get text embedding."""
283
+ try:
284
+ result = self._client.models.embed_content(
285
+ model=self._model_name,
286
+ contents=text
287
+ )
288
+ return result.embeddings[0].values
289
+ except Exception as e:
290
+ logger.error(f"Error getting embedding from Gemini: {e}")
291
+ raise
292
+
293
+ def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:
294
+ """Get embeddings for multiple texts."""
295
+ embeddings = []
296
+ for text in texts:
297
+ embeddings.append(self._get_text_embedding(text))
298
+ return embeddings
299
+
300
+ async def _aget_query_embedding(self, query: str) -> List[float]:
301
+ """Get query embedding asynchronously."""
302
+ return self._get_query_embedding(query)
303
+
304
+ async def _aget_text_embedding(self, text: str) -> List[float]:
305
+ """Get text embedding asynchronously."""
306
+ return self._get_text_embedding(text)
307
+
308
+
309
+ def get_embedding_model(prefer_modal: bool = True, force_gemini: bool = False) -> BaseEmbedding:
310
+ """
311
+ Get the best available embedding model.
312
+
313
+ Priority order:
314
+ 1. Modal (if prefer_modal=True and available)
315
+ 2. Provider-specific embedding (Nebius if AI_PROVIDER=nebius, Gemini otherwise)
316
+
317
+ Args:
318
+ prefer_modal: If True, try Modal first, then fallback to provider-specific
319
+ force_gemini: If True, skip Modal and use Gemini directly
320
+
321
+ Returns:
322
+ BaseEmbedding instance
323
+ """
324
+ if force_gemini:
325
+ logger.info("Using Gemini embedding (forced)")
326
+ return GeminiEmbeddingWrapper()
327
+
328
+ if prefer_modal:
329
+ try:
330
+ logger.info("Attempting to use Modal embedding (primary)")
331
+ return ModalEmbedding()
332
+ except Exception as e:
333
+ logger.warning(f"Modal embedding unavailable, falling back to provider-specific: {e}")
334
+
335
+ # Determine which provider-specific embedding to use
336
+ ai_provider = os.getenv("AI_PROVIDER", "gemini").lower()
337
+
338
+ if ai_provider == "nebius":
339
+ try:
340
+ logger.info("Using Nebius embedding (Qwen/Qwen3-Embedding-8B)")
341
+ return NebiusEmbeddingWrapper()
342
+ except Exception as e:
343
+ logger.warning(f"Nebius embedding unavailable, falling back to Gemini: {e}")
344
+
345
+ try:
346
+ logger.info("Using Gemini embedding (fallback)")
347
+ return GeminiEmbeddingWrapper()
348
+ except Exception as e:
349
+ logger.error(f"Failed to initialize any embedding model: {e}")
350
+ raise
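
A short sketch of the selection helper above in use; the sample text is arbitrary, and the vector length depends on which backend is actually active (the BGE model served via Modal is 768-dimensional, the Qwen3 and Gemini fallbacks differ):

```python
from src.search.embeddings import get_embedding_model

# Tries Modal first, then falls back to the provider named in AI_PROVIDER
embed_model = get_embedding_model(prefer_modal=True)

vector = embed_model.get_text_embedding(
    "def connect(): return MySQLdb.connect(host='localhost')"
)
print(type(embed_model).__name__, len(vector))
```
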
src/search/vector_store.py ADDED
@@ -0,0 +1,350 @@
1
+ """
2
+ Vector Store implementation using LlamaIndex and Chroma for semantic code search.
3
+ """
4
+
5
+ import os
6
+ import logging
7
+ from typing import List, Dict, Optional
8
+ from pathlib import Path
9
+
10
+ from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings, Document
11
+ from llama_index.vector_stores.chroma import ChromaVectorStore
12
+ import chromadb
13
+ import warnings
14
+
15
+ from .embeddings import get_embedding_model
16
+ from src.config import AIManager
17
+
18
+ # Suppress deprecation warnings
19
+ warnings.filterwarnings('ignore', category=DeprecationWarning, module='llama_index.llms.gemini')
20
+ warnings.filterwarnings('ignore', category=DeprecationWarning, module='llama_index.embeddings.gemini')
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class CodeSearchEngine:
26
+ """
27
+ Semantic code search engine using LlamaIndex + Chroma vector store.
28
+ Enables finding similar legacy patterns across large codebases.
29
+ """
30
+
31
+ def __init__(self, persist_dir: Optional[str] = None, use_modal: bool = True):
32
+ """
33
+ Initialize the code search engine.
34
+
35
+ Args:
36
+ persist_dir: Optional directory to persist Chroma database
37
+ use_modal: If True, use Modal embedding as primary (default: True)
38
+ """
39
+ self.persist_dir = persist_dir
40
+ self.index: Optional[VectorStoreIndex] = None
41
+ self.chroma_client = None
42
+ self.chroma_collection = None
43
+ self.use_modal = use_modal
44
+
45
+ # Configure embeddings (Modal primary, Gemini fallback)
46
+ try:
47
+ Settings.embed_model = get_embedding_model(prefer_modal=use_modal)
48
+ except Exception as e:
49
+ logger.warning(f"Failed to initialize preferred embedding, using Gemini: {e}")
50
+ Settings.embed_model = get_embedding_model(force_gemini=True)
51
+ self.use_modal = False
52
+
53
+ # Configure LLM using centralized AIManager
54
+ self.ai_manager = AIManager()
55
+
56
+ # Set up LlamaIndex LLM based on provider
57
+ if self.ai_manager.provider_name == "gemini":
58
+ from llama_index.llms.gemini import Gemini
59
+ Settings.llm = Gemini(
60
+ model=self.ai_manager.model_name,
61
+ api_key=os.getenv("GEMINI_API_KEY"),
62
+ temperature=0.1
63
+ )
64
+ elif self.ai_manager.provider_name in ["nebius", "openai"]:
65
+ from llama_index.llms.openai import OpenAI
66
+ if self.ai_manager.provider_name == "nebius":
67
+ # Use gpt-3.5-turbo as placeholder to pass LlamaIndex validation
68
+ # The actual model is passed via additional_kwargs
69
+ Settings.llm = OpenAI(
70
+ model="gpt-3.5-turbo",
71
+ api_key=os.getenv("NEBIUS_API_KEY"),
72
+ api_base="https://api.tokenfactory.nebius.com/v1/",
73
+ temperature=0.1,
74
+ additional_kwargs={"model": self.ai_manager.model_name}
75
+ )
76
+ else:
77
+ Settings.llm = OpenAI(
78
+ model=self.ai_manager.model_name,
79
+ api_key=os.getenv("OPENAI_API_KEY"),
80
+ temperature=0.1
81
+ )
82
+
83
+ embedding_type = "Modal (primary)" if self.use_modal else "Gemini (fallback)"
84
+ logger.info(f"CodeSearchEngine initialized with {embedding_type} embeddings and {self.ai_manager.provider_name} LLM")
85
+
86
+ def build_index(self, repo_path: str, file_extensions: Optional[List[str]] = None) -> VectorStoreIndex:
87
+ """
88
+ Build searchable index of codebase.
89
+
90
+ Args:
91
+ repo_path: Path to repository to index
92
+ file_extensions: Optional list of file extensions to include (e.g., ['.py', '.java'])
93
+
94
+ Returns:
95
+ VectorStoreIndex for querying
96
+ """
97
+ logger.info(f"Building code index for: {repo_path}")
98
+
99
+ # Initialize Chroma client
100
+ if self.persist_dir:
101
+ self.chroma_client = chromadb.PersistentClient(path=self.persist_dir)
102
+ else:
103
+ self.chroma_client = chromadb.EphemeralClient()
104
+
105
+ # Create or get collection
106
+ collection_name = "code_embeddings"
107
+ try:
108
+ self.chroma_collection = self.chroma_client.get_or_create_collection(collection_name)
109
+ except Exception as e:
110
+ logger.warning(f"Error with collection, creating new: {e}")
111
+ self.chroma_collection = self.chroma_client.create_collection(collection_name)
112
+
113
+ vector_store = ChromaVectorStore(chroma_collection=self.chroma_collection)
114
+
115
+ # Load documents from repository
116
+ documents = self._load_code_files(repo_path, file_extensions)
117
+
118
+ if not documents:
119
+ logger.warning(f"No code files found in {repo_path}")
120
+ return None
121
+
122
+ logger.info(f"Loaded {len(documents)} code files")
123
+
124
+ # Build index (using default text splitter instead of CodeSplitter to avoid tree-sitter dependency)
125
+ try:
126
+ self.index = VectorStoreIndex.from_documents(
127
+ documents,
128
+ vector_store=vector_store,
129
+ show_progress=True
130
+ )
131
+ logger.info("Code index built successfully")
132
+ except Exception as e:
133
+ if self.use_modal:
134
+ logger.warning(f"Modal embedding failed during indexing: {e}")
135
+ logger.info("Retrying with Gemini embeddings...")
136
+
137
+ # Switch to Gemini
138
+ Settings.embed_model = get_embedding_model(force_gemini=True)
139
+ self.use_modal = False
140
+
141
+ # Retry building index
142
+ self.index = VectorStoreIndex.from_documents(
143
+ documents,
144
+ vector_store=vector_store,
145
+ show_progress=True
146
+ )
147
+ logger.info("Code index built successfully with Gemini embeddings")
148
+ else:
149
+ raise
150
+
151
+ return self.index
152
+
153
+ def _load_code_files(self, repo_path: str, file_extensions: Optional[List[str]] = None) -> List[Document]:
154
+ """
155
+ Load code files from repository.
156
+
157
+ Args:
158
+ repo_path: Path to repository
159
+ file_extensions: Optional list of extensions to include
160
+
161
+ Returns:
162
+ List of Document objects
163
+ """
164
+ documents = []
165
+ repo_path = Path(repo_path)
166
+
167
+ # Default extensions if not specified
168
+ if file_extensions is None:
169
+ file_extensions = [
170
+ # Python
171
+ '.py', '.pyw', '.pyx',
172
+ # Java
173
+ '.java',
174
+ # JavaScript/TypeScript
175
+ '.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs',
176
+ # PHP
177
+ '.php', '.php3', '.php4', '.php5', '.phtml',
178
+ # Ruby
179
+ '.rb', '.rbw',
180
+ # Go
181
+ '.go',
182
+ # C/C++
183
+ '.c', '.cpp', '.cc', '.cxx', '.c++', '.h', '.hpp', '.hh', '.hxx', '.h++',
184
+ # C#
185
+ '.cs',
186
+ # Rust
187
+ '.rs',
188
+ # Kotlin
189
+ '.kt', '.kts',
190
+ # Swift
191
+ '.swift',
192
+ # Scala
193
+ '.scala', '.sc',
194
+ # R
195
+ '.r', '.R',
196
+ # Perl
197
+ '.pl', '.pm', '.t', '.pod',
198
+ # Shell
199
+ '.sh', '.bash', '.zsh', '.fish'
200
+ ]
201
+
202
+ # Walk through directory
203
+ for file_path in repo_path.rglob('*'):
204
+ if file_path.is_file() and file_path.suffix in file_extensions:
205
+ try:
206
+ # Skip hidden files and common non-code directories
207
+ if any(part.startswith('.') for part in file_path.parts):
208
+ continue
209
+ if any(part in ['node_modules', 'venv', '__pycache__', 'build', 'dist']
210
+ for part in file_path.parts):
211
+ continue
212
+
213
+ # Read file content
214
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
215
+ content = f.read()
216
+
217
+ # Create document with metadata
218
+ doc = Document(
219
+ text=content,
220
+ metadata={
221
+ 'file_path': str(file_path.relative_to(repo_path)),
222
+ 'file_name': file_path.name,
223
+ 'extension': file_path.suffix,
224
+ 'size': len(content)
225
+ }
226
+ )
227
+ documents.append(doc)
228
+
229
+ except Exception as e:
230
+ logger.warning(f"Error reading {file_path}: {e}")
231
+
232
+ return documents
233
+
234
+ def find_similar_patterns(self, pattern_query: str, top_k: int = 20) -> List[Dict]:
235
+ """
236
+ Find files with similar legacy patterns.
237
+
238
+ Args:
239
+ pattern_query: Natural language query describing the pattern
240
+ top_k: Number of results to return
241
+
242
+ Returns:
243
+ List of dictionaries with file paths and relevance scores
244
+ """
245
+ if not self.index:
246
+ raise ValueError("Index not built. Call build_index() first.")
247
+
248
+ logger.info(f"Searching for pattern: {pattern_query}")
249
+
250
+ # Create query engine
251
+ query_engine = self.index.as_query_engine(
252
+ similarity_top_k=top_k,
253
+ response_mode="tree_summarize"
254
+ )
255
+
256
+ # Execute query
257
+ response = query_engine.query(pattern_query)
258
+
259
+ # Extract source files and scores
260
+ results = []
261
+ for node in response.source_nodes:
262
+ results.append({
263
+ 'file_path': node.metadata.get('file_path', 'unknown'),
264
+ 'file_name': node.metadata.get('file_name', 'unknown'),
265
+ 'score': node.score,
266
+ 'text_snippet': node.text[:200] + '...' if len(node.text) > 200 else node.text
267
+ })
268
+
269
+ logger.info(f"Found {len(results)} matching files")
270
+ return results
271
+
272
+ def analyze_pattern_with_context(self, pattern_query: str, files: List[str]) -> str:
273
+ """
274
+ Deep analysis of legacy pattern with full context retrieval.
275
+
276
+ Args:
277
+ pattern_query: Description of the pattern to analyze
278
+ files: List of file paths to analyze
279
+
280
+ Returns:
281
+ Analysis result from Gemini
282
+ """
283
+ if not self.index:
284
+ raise ValueError("Index not built. Call build_index() first.")
285
+
286
+ logger.info(f"Analyzing pattern with context: {pattern_query}")
287
+
288
+ # Build enhanced query with file context
289
+ enhanced_query = f"""
290
+ Analyze the following legacy code pattern and provide:
291
+ 1. What the code currently does
292
+ 2. Why it's problematic (security, performance, maintainability)
293
+ 3. Modern equivalent (recommended library/pattern)
294
+ 4. Migration steps with risk assessment
295
+
296
+ Pattern to analyze: {pattern_query}
297
+ Files to focus on: {', '.join(files)}
298
+
299
+ Provide detailed analysis in JSON format with keys:
300
+ - analysis: Overall analysis
301
+ - issues: List of specific issues
302
+ - recommendation: Recommended modern approach
303
+ - steps: Migration steps
304
+ - risks: Risk assessment
305
+ """
306
+
307
+ # Create query engine with custom prompt
308
+ query_engine = self.index.as_query_engine(
309
+ similarity_top_k=10,
310
+ response_mode="compact"
311
+ )
312
+
313
+ # Execute analysis
314
+ response = query_engine.query(enhanced_query)
315
+
316
+ return response.response
317
+
318
+ def get_transformation_examples(self, pattern_type: str, top_k: int = 5) -> List[Dict]:
319
+ """
320
+ Retrieve examples of successful transformations for a pattern type.
321
+
322
+ Args:
323
+ pattern_type: Type of pattern (e.g., "MySQLdb to SQLAlchemy")
324
+ top_k: Number of examples to retrieve
325
+
326
+ Returns:
327
+ List of example transformations
328
+ """
329
+ if not self.index:
330
+ raise ValueError("Index not built. Call build_index() first.")
331
+
332
+ query = f"Find examples of code that was successfully transformed from {pattern_type}"
333
+
334
+ query_engine = self.index.as_query_engine(
335
+ similarity_top_k=top_k,
336
+ response_mode="compact"
337
+ )
338
+
339
+ response = query_engine.query(query)
340
+
341
+ # Extract examples from source nodes
342
+ examples = []
343
+ for node in response.source_nodes:
344
+ examples.append({
345
+ 'file_path': node.metadata.get('file_path', 'unknown'),
346
+ 'code_snippet': node.text,
347
+ 'score': node.score
348
+ })
349
+
350
+ return examples
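
An end-to-end sketch of the search engine above; the repository path, persist directory, and query string are placeholders:

```python
from src.search.vector_store import CodeSearchEngine

engine = CodeSearchEngine(persist_dir="./.code_index", use_modal=True)
index = engine.build_index("path/to/legacy-repo", file_extensions=[".py"])

if index is not None:
    hits = engine.find_similar_patterns(
        "raw MySQLdb connections with string-formatted SQL queries", top_k=5
    )
    for hit in hits:
        print(hit["score"], hit["file_path"])
```
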
src/ui/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """UI components for the Legacy Code Modernizer Agent."""
src/ui/app.py ADDED
@@ -0,0 +1,1045 @@
1
+ """Gradio UI for Legacy Code Modernizer Agent - Phase 5 Complete."""
2
+
3
+ import gradio as gr
4
+ import os
5
+ import asyncio
6
+ import logging
7
+ import zipfile
8
+ import tempfile
9
+ from dotenv import load_dotenv
10
+ from pathlib import Path
11
+
12
+ # Import orchestrator
13
+ import sys
14
+ import os
15
+ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(__file__))))
16
+ from src.workflow.orchestrator import ModernizationOrchestrator
17
+
18
+ # Load environment variables
19
+ load_dotenv()
20
+
21
+ # Configure logging with sensitive data redaction
22
+ class SensitiveDataFilter(logging.Filter):
23
+ """Filter to redact sensitive information from logs."""
24
+ def __init__(self):
25
+ super().__init__()
26
+ self.sensitive_patterns = []
27
+
28
+ # Collect sensitive values from environment
29
+ sensitive_keys = [
30
+ "GEMINI_API_KEY",
31
+ "NEBIUS_API_KEY",
32
+ "OPENAI_API_KEY",
33
+ "MODAL_TOKEN_ID",
34
+ "MODAL_TOKEN_SECRET",
35
+ "GITHUB_TOKEN"
36
+ ]
37
+
38
+ for key in sensitive_keys:
39
+ value = os.getenv(key)
40
+ if value and len(value) > 5: # Only redact if value is substantial
41
+ self.sensitive_patterns.append(value)
42
+
43
+ def filter(self, record):
44
+ msg = str(record.msg)
45
+ for sensitive_value in self.sensitive_patterns:
46
+ if sensitive_value in msg:
47
+ msg = msg.replace(sensitive_value, "[REDACTED]")
48
+ record.msg = msg
49
+ return True
50
+
51
+ # Initialize logging with redaction
52
+ logging.basicConfig(level=logging.INFO)
53
+ root_logger = logging.getLogger()
54
+ root_logger.addFilter(SensitiveDataFilter())
55
+ logger = logging.getLogger(__name__)
56
+
57
+ # Initialize orchestrator with intelligent pattern matching
58
+ orchestrator = ModernizationOrchestrator(use_intelligent_matcher=True)
59
+
60
+
61
+ # Supported file extensions for single file upload
62
+ SUPPORTED_EXTENSIONS = {
63
+ # Python
64
+ '.py', '.pyw', '.pyx',
65
+ # Java
66
+ '.java',
67
+ # JavaScript/TypeScript
68
+ '.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs'
69
+ }
70
+
71
+ # Language to file extension mapping
72
+ LANGUAGE_EXTENSIONS = {
73
+ 'python': ['.py', '.pyw', '.pyx'],
74
+ 'java': ['.java'],
75
+ 'javascript': ['.js', '.jsx', '.mjs', '.cjs'],
76
+ 'typescript': ['.ts', '.tsx']
77
+ }
78
+
79
+ # Target version options for each language (Updated November 2025)
80
+ TARGET_VERSIONS = {
81
+ 'python': ['Python 3.14', 'Python 3.13', 'Python 3.12', 'Python 3.11', 'Python 3.10'],
82
+ 'java': ['Java 25 LTS', 'Java 23', 'Java 21 LTS', 'Java 17 LTS'],
83
+ 'javascript': ['ES2025', 'ES2024', 'Node.js 25', 'Node.js 24 LTS', 'Node.js 22 LTS'],
84
+ 'typescript': ['TypeScript 5.9', 'TypeScript 5.8', 'TypeScript 5.7', 'TypeScript 5.6']
85
+ }
86
+
87
+ # Framework-specific versions (Updated November 2025)
88
+ FRAMEWORK_VERSIONS = [
89
+ 'React 19', 'React 18', 'React 18 (Hooks)', 'React 17',
90
+ 'Angular 21', 'Angular 20', 'Angular 19',
91
+ 'Vue 3.5', 'Vue 3.4', 'Vue 2.7',
92
+ 'Django 5.2 LTS', 'Django 5.1', 'Django 5.0',
93
+ 'Flask 3.1', 'Flask 3.0', 'Flask 2.3',
94
+ 'Spring Boot 4.0', 'Spring Boot 3.4', 'Spring Boot 3.3',
95
+ 'Laravel 12', 'Laravel 11',
96
+ 'Rails 8.1', 'Rails 8.0', 'Rails 7.2',
97
+ 'Express 5.1', 'Express 5.0', 'Express 4.21',
98
+ 'FastAPI 0.122', 'FastAPI 0.115',
99
+ 'Next.js 16', 'Next.js 15', 'Next.js 14'
100
+ ]
101
+
102
+ def detect_language_from_extension(file_ext):
103
+ """Detect language from file extension."""
104
+ for lang, exts in LANGUAGE_EXTENSIONS.items():
105
+ if file_ext in exts:
106
+ return lang
107
+ return None
108
+
109
+
110
+ def get_target_versions_for_language(language):
111
+ """Get appropriate target versions for a detected language."""
112
+ if not language:
113
+ # Return all options if language unknown
114
+ all_versions = []
115
+ for versions in TARGET_VERSIONS.values():
116
+ all_versions.extend(versions)
117
+ all_versions.extend(FRAMEWORK_VERSIONS)
118
+ return sorted(set(all_versions))
119
+
120
+ # Get language-specific versions
121
+ versions = list(TARGET_VERSIONS.get(language, []))  # copy so extend() below does not mutate TARGET_VERSIONS
122
+
123
+ # Add framework versions for web languages
124
+ if language in ['javascript', 'typescript']:
125
+ versions.extend([v for v in FRAMEWORK_VERSIONS if 'React' in v or 'Angular' in v or 'Vue' in v or 'Express' in v])
126
+ elif language == 'python':
127
+ versions.extend([v for v in FRAMEWORK_VERSIONS if 'Django' in v or 'Flask' in v or 'FastAPI' in v])
128
+ elif language == 'java':
129
+ versions.extend([v for v in FRAMEWORK_VERSIONS if 'Spring' in v])
130
+ elif language == 'php':
131
+ versions.extend([v for v in FRAMEWORK_VERSIONS if 'Laravel' in v])
132
+ elif language == 'ruby':
133
+ versions.extend([v for v in FRAMEWORK_VERSIONS if 'Rails' in v])
134
+
135
+ return versions if versions else get_target_versions_for_language(None)
136
+
137
+
138
+ def detect_languages_from_files(file_paths):
139
+ """
140
+ Detect languages from multiple files.
141
+
142
+ Args:
143
+ file_paths: List of file paths
144
+
145
+ Returns:
146
+ Dictionary with language counts and suggested target versions
147
+ """
148
+ language_counts = {}
149
+
150
+ for file_path in file_paths:
151
+ ext = Path(file_path).suffix.lower()
152
+ lang = detect_language_from_extension(ext)
153
+ if lang:
154
+ language_counts[lang] = language_counts.get(lang, 0) + 1
155
+
156
+ if not language_counts:
157
+ return None, []
158
+
159
+ # Get primary language (most files)
160
+ primary_language = max(language_counts.items(), key=lambda x: x[1])[0]
161
+
162
+ # Get suggested versions
163
+ suggested_versions = get_target_versions_for_language(primary_language)
164
+
165
+ return primary_language, suggested_versions
166
+
167
+
168
+ def validate_single_file(file_path):
169
+ """
170
+ Validate if a single file is supported for modernization.
171
+
172
+ Args:
173
+ file_path: Path to the uploaded file
174
+
175
+ Returns:
176
+ Tuple of (is_valid, message, file_info, suggested_versions)
177
+ """
178
+ if not file_path:
179
+ return False, "❌ No file uploaded", None, []
180
+
181
+ try:
182
+ file_name = Path(file_path).name
183
+ file_ext = Path(file_path).suffix.lower()
184
+ file_size = os.path.getsize(file_path)
185
+
186
+ # Check file extension
187
+ if file_ext not in SUPPORTED_EXTENSIONS:
188
+ supported_list = ', '.join(sorted(SUPPORTED_EXTENSIONS))
189
+ return False, f"❌ Unsupported file type: {file_ext}\n\n✅ Supported types:\n{supported_list}", None, []
190
+
191
+ # Check file size (max 10MB for single file)
192
+ max_size = 10 * 1024 * 1024 # 10MB
193
+ if file_size > max_size:
194
+ return False, f"❌ File too large: {file_size / 1024 / 1024:.2f} MB (max 10 MB)", None, []
195
+
196
+ # Read file to check if it's valid text
197
+ try:
198
+ with open(file_path, 'r', encoding='utf-8') as f:
199
+ content = f.read(1000) # Read first 1000 chars
200
+ line_count = len(content.split('\n'))
201
+ except UnicodeDecodeError:
202
+ return False, f"❌ File is not a valid text file (encoding error)", None, []
203
+
204
+ # Detect language and get suggested versions
205
+ language = detect_language_from_extension(file_ext)
206
+ suggested_versions = get_target_versions_for_language(language)
207
+
208
+ # Language name mapping
209
+ language_names = {
210
+ 'python': 'Python',
211
+ 'java': 'Java',
212
+ 'javascript': 'JavaScript',
213
+ 'typescript': 'TypeScript'
214
+ }
215
+
216
+ file_info = {
217
+ 'name': file_name,
218
+ 'extension': file_ext,
219
+ 'size': file_size,
220
+ 'path': file_path,
221
+ 'language': language
222
+ }
223
+
224
+ lang_display = language_names.get(language, 'Unknown')
225
+
226
+ message = f"""✅ File validated successfully!
227
+
228
+ 📄 File: {file_name}
229
+ 📊 Type: {file_ext} ({lang_display})
230
+ 💾 Size: {file_size / 1024:.2f} KB
231
+
232
+ 🎯 Suggested target versions updated in dropdown
233
+
234
+ ✨ Ready to modernize! Click 'Start Modernization' button."""
235
+
236
+ return True, message, file_info, suggested_versions
237
+
238
+ except Exception as e:
239
+ return False, f"❌ Error validating file: {str(e)}", None, []
240
+
241
+
242
+ def process_single_file(file_path):
243
+ """
244
+ Process single file upload by creating a temporary ZIP.
245
+
246
+ Args:
247
+ file_path: Path to the uploaded file
248
+
249
+ Returns:
250
+ Tuple of (status message, zip path, suggested_versions)
251
+ """
252
+ is_valid, message, file_info, suggested_versions = validate_single_file(file_path)
253
+
254
+ if not is_valid:
255
+ return message, None, []
256
+
257
+ try:
258
+ # Create a temporary ZIP containing the single file
259
+ import tempfile
260
+ import zipfile
261
+
262
+ zip_path = tempfile.NamedTemporaryFile(delete=False, suffix='.zip')
263
+ with zipfile.ZipFile(zip_path.name, 'w', zipfile.ZIP_DEFLATED) as zipf:
264
+ zipf.write(file_path, file_info['name'])
265
+
266
+ return message, zip_path.name, suggested_versions
267
+
268
+ except Exception as e:
269
+ return f"❌ Error processing file: {str(e)}", None, []
270
+
271
+
272
+ def detect_languages_from_zip(zip_path):
273
+ """
274
+ Detect languages from files in a ZIP archive.
275
+
276
+ Args:
277
+ zip_path: Path to ZIP file
278
+
279
+ Returns:
280
+ Tuple of (language_summary, suggested_versions)
281
+ """
282
+ try:
283
+ import zipfile
284
+
285
+ file_paths = []
286
+ with zipfile.ZipFile(zip_path, 'r') as zipf:
287
+ file_paths = [name for name in zipf.namelist() if not name.endswith('/')]
288
+
289
+ primary_language, suggested_versions = detect_languages_from_files(file_paths)
290
+
291
+ if not primary_language:
292
+ return "Multiple file types detected", []
293
+
294
+ language_names = {
295
+ 'python': 'Python',
296
+ 'java': 'Java',
297
+ 'javascript': 'JavaScript',
298
+ 'typescript': 'TypeScript'
299
+ }
300
+
301
+ return f"Primary language: {language_names.get(primary_language, 'Unknown')}", suggested_versions
302
+
303
+ except Exception as e:
304
+ logger.error(f"Error detecting languages from ZIP: {e}")
305
+ return "Could not detect language", []
306
+
307
+
308
+ def clone_github_repo(github_url):
309
+ """
310
+ Clone GitHub repository and show preview.
311
+
312
+ Args:
313
+ github_url: GitHub repository URL
314
+
315
+ Returns:
316
+ Tuple of (status message, ZIP path of the cloned repo, clone_status visibility update, suggested target versions)
317
+ """
318
+ if not github_url or not github_url.strip():
319
+ return "❌ Please enter a GitHub repository URL", None, gr.update(visible=True)
320
+
321
+ try:
322
+ import tempfile
323
+ import subprocess
324
+
325
+ # Clean URL (remove .git if present)
326
+ github_url = github_url.strip().rstrip('/')
327
+ if github_url.endswith('.git'):
328
+ github_url = github_url[:-4]
329
+
330
+ # Create temp directory for clone
331
+ temp_dir = tempfile.mkdtemp(prefix="github_clone_")
332
+
333
+ # Clone repository
334
+ result = subprocess.run(
335
+ ["git", "clone", "--depth", "1", github_url, temp_dir],
336
+ capture_output=True,
337
+ text=True,
338
+ timeout=300
339
+ )
340
+
341
+ if result.returncode != 0:
342
+ error_msg = result.stderr if result.stderr else "Unknown error"
343
+ return f"❌ Failed to clone repository:\n{error_msg}", None, gr.update(visible=True)
344
+
345
+ # Count files (only supported extensions)
346
+ code_extensions = {'.py', '.pyw', '.pyx', '.java', '.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs'}
347
+ file_count = 0
348
+ code_files = []
349
+
350
+ for root, dirs, files in os.walk(temp_dir):
351
+ # Skip .git directory
352
+ if '.git' in root:
353
+ continue
354
+ for file in files:
355
+ file_path = os.path.join(root, file)
356
+ rel_path = os.path.relpath(file_path, temp_dir)
357
+ ext = os.path.splitext(file)[1].lower()
358
+ if ext in code_extensions:
359
+ file_count += 1
360
+ code_files.append(rel_path)
361
+
362
+ # Create ZIP from cloned repo
363
+ import zipfile
364
+ zip_path = tempfile.NamedTemporaryFile(delete=False, suffix='.zip')
365
+ with zipfile.ZipFile(zip_path.name, 'w', zipfile.ZIP_DEFLATED) as zipf:
366
+ for root, dirs, files in os.walk(temp_dir):
367
+ # Skip .git directory
368
+ if '.git' in root:
369
+ continue
370
+ for file in files:
371
+ file_path = os.path.join(root, file)
372
+ arcname = os.path.relpath(file_path, temp_dir)
373
+ zipf.write(file_path, arcname)
374
+
375
+ # Detect languages
376
+ all_code_files = []
377
+ for root, dirs, files in os.walk(temp_dir):
378
+ if '.git' in root:
379
+ continue
380
+ for file in files:
381
+ ext = os.path.splitext(file)[1].lower()
382
+ if ext in SUPPORTED_EXTENSIONS:
383
+ all_code_files.append(os.path.join(root, file))
384
+
385
+ primary_language, suggested_versions = detect_languages_from_files(all_code_files)
386
+
387
+ language_names = {
388
+ 'python': 'Python',
389
+ 'java': 'Java',
390
+ 'javascript': 'JavaScript',
391
+ 'typescript': 'TypeScript'
392
+ }
393
+
394
+ # Generate preview message with all files
395
+ status = f"""✅ Repository cloned successfully!
396
+
397
+ 📁 Repository: {github_url.split('/')[-1]}
398
+ 📊 Code files found: {file_count}
399
+ 🔤 Primary language: {language_names.get(primary_language, 'Mixed')}
400
+
401
+ 📝 Loaded files ({file_count} total):
402
+ """
403
+ # Show all files, not just first 15
404
+ for f in code_files:
405
+ status += f" • {f}\n"
406
+
407
+ status += "\n🎯 Suggested target versions updated in dropdown"
408
+ status += "\n✨ Ready to modernize! Click 'Start Modernization' button above."
409
+
410
+ return status, zip_path.name, gr.update(visible=True), suggested_versions
411
+
412
+ except subprocess.TimeoutExpired:
413
+ return "❌ Clone timeout (>5 minutes). Repository might be too large.", None, gr.update(visible=True)
414
+ except Exception as e:
415
+ return f"❌ Error cloning from GitHub: {str(e)}", None, gr.update(visible=True)
416
+
417
+
418
+ def modernize_code(repo_file, target_version, create_pr, repo_url, github_token, cloned_repo_path, single_file_path, progress=gr.Progress()):
419
+ """
420
+ Main function to process uploaded repository.
421
+
422
+ Args:
423
+ repo_file: Uploaded ZIP file containing the repository
424
+ target_version: Target language/framework version
425
+ create_pr: Whether to create GitHub PR
426
+ repo_url: GitHub repository URL for PR
427
+ github_token: GitHub personal access token for PR creation
428
+ cloned_repo_path: Path to cloned repo ZIP (if using GitHub clone)
429
+ single_file_path: Path to single file ZIP (if using single file upload)
430
+ progress: Gradio progress tracker
431
+
432
+ Yields:
433
+ Tuples of (status message, modernized ZIP, test ZIP, report file) as Gradio updates
434
+ """
435
+ logger.info(f"modernize_code called with: repo_file={repo_file}, single_file_path={single_file_path}, cloned_repo_path={cloned_repo_path}")
436
+
437
+ # Priority: single file > cloned repo > uploaded file
438
+ if single_file_path:
439
+ logger.info(f"Single file path detected: {single_file_path}")
440
+ repo_file = type('obj', (object,), {'name': single_file_path})()
441
+ logger.info(f"Using single file path: {single_file_path}")
442
+ elif cloned_repo_path:
443
+ logger.info(f"Cloned repo path detected: {cloned_repo_path}")
444
+ repo_file = type('obj', (object,), {'name': cloned_repo_path})()
445
+ logger.info(f"Using cloned repo path: {cloned_repo_path}")
446
+ else:
447
+ logger.info("Using uploaded ZIP file")
448
+
449
+ # Check if any valid input source is provided
450
+ if repo_file is None:
451
+ logger.error("No input source provided")
452
+ return "❌ Please upload a repository ZIP file, single file, or clone from GitHub.", gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
453
+
454
+ logger.info(f"Processing with file: {repo_file.name}")
455
+
456
+ try:
457
+ file_path = repo_file.name
458
+ file_size = os.path.getsize(file_path)
459
+
460
+ # Initial status
461
+ status = f"""✅ Processing started!
462
+
463
+ 📁 File: {Path(file_path).name}
464
+ 📊 Size: {file_size / 1024:.2f} KB
465
+ 🎯 Target: {target_version}
466
+
467
+ """
468
+ progress(0.05, desc="Starting...")
469
+ yield status, gr.update(visible=False), gr.update(visible=False), gr.update(visible=False) # Hide download buttons initially
470
+
471
+ # Create a callback to update progress from orchestrator
472
+ current_status = [status] # Use list to allow modification in nested function
473
+
474
+ def progress_callback(phase, message):
475
+ """Callback to update progress from orchestrator."""
476
+ phase_progress = {
477
+ "Phase 1": 0.15,
478
+ "Phase 2": 0.30,
479
+ "Phase 3": 0.45,
480
+ "Phase 4": 0.65,
481
+ "Phase 5": 0.85
482
+ }
483
+ prog_value = phase_progress.get(phase, 0.5)
484
+ progress(prog_value, desc=f"{phase}: {message}")
485
+ current_status[0] += f"⏳ {phase}: {message}\n"
486
+
487
+ # Run orchestrator with callback
488
+ progress(0.1, desc="Initializing workflow...")
489
+
490
+ loop = asyncio.new_event_loop()
491
+ asyncio.set_event_loop(loop)
492
+
493
+ results = loop.run_until_complete(
494
+ orchestrator.modernize_repository(
495
+ repo_path=file_path,
496
+ target_version=target_version,
497
+ create_pr=create_pr,
498
+ repo_url=repo_url if create_pr else None,
499
+ github_token=github_token if github_token and github_token.strip() else None,
500
+ progress_callback=progress_callback
501
+ )
502
+ )
503
+
504
+ loop.close()
505
+
506
+ progress(0.95, desc="Preparing downloads...")
507
+ status = current_status[0]
508
+
509
+ # Prepare download files
510
+ modernized_zip = None
511
+ tests_zip = None
512
+ report_file = None
513
+
514
+ if results.get('output'):
515
+ import zipfile
516
+ import tempfile
517
+ import time
518
+
519
+ # Create timestamp for file naming
520
+ timestamp = time.strftime("%Y%m%d_%H%M%S")
521
+
522
+ output_dir = Path(results['output']['modernized_files'])
523
+
524
+ # Get list of files that were actually transformed in this run
525
+ transformed_files = []
526
+ if results.get('phases', {}).get('transformation'):
527
+ # Extract file paths from transformation results
528
+ for t in results.get('transformations', []):
529
+ if 'file_path' in t:
530
+ transformed_files.append(Path(t['file_path']).name)
531
+
532
+ # Create ZIP of modernized files with better naming - ONLY current run files
533
+ if output_dir.exists() and transformed_files:
534
+ modernized_zip = tempfile.NamedTemporaryFile(
535
+ delete=False,
536
+ suffix='.zip',
537
+ prefix=f'modernized_code_{timestamp}_'
538
+ )
539
+ with zipfile.ZipFile(modernized_zip.name, 'w', zipfile.ZIP_DEFLATED) as zipf:
540
+ # Only include files from current transformation
541
+ for file in output_dir.iterdir():
542
+ if file.is_file() and file.name in transformed_files:
543
+ zipf.write(file, file.name)
544
+ modernized_zip.close()
545
+ else:
546
+ modernized_zip = None
547
+
548
+ # Create ZIP of test files with better naming - ONLY current run files
549
+ tests_dir = Path(results['output']['test_files'])
550
+ if tests_dir.exists() and transformed_files:
551
+ tests_zip = tempfile.NamedTemporaryFile(
552
+ delete=False,
553
+ suffix='.zip',
554
+ prefix=f'test_files_{timestamp}_'
555
+ )
556
+ with zipfile.ZipFile(tests_zip.name, 'w', zipfile.ZIP_DEFLATED) as zipf:
557
+ # Only include test files from current transformation
558
+ for file in tests_dir.iterdir():
559
+ if file.is_file():
560
+ # Check if this test file corresponds to a transformed file
561
+ test_base = file.name.replace('test_', '')
562
+ if test_base in transformed_files:
563
+ zipf.write(file, file.name)
564
+ tests_zip.close()
565
+ else:
566
+ tests_zip = None
567
+
568
+ # Create report file with UTF-8 encoding and better naming
569
+ report_file = tempfile.NamedTemporaryFile(
570
+ delete=False,
571
+ suffix='.txt',
572
+ prefix=f'modernization_report_{timestamp}_',
573
+ mode='w',
574
+ encoding='utf-8'
575
+ )
576
+ report_content = orchestrator.generate_report(results)
577
+ report_file.write(report_content)
578
+ report_file.close()
579
+
580
+ # Generate final report
581
+ if results['success']:
582
+ status += "\n" + "=" * 60 + "\n"
583
+ status += "✅ MODERNIZATION COMPLETE!\n"
584
+ status += "=" * 60 + "\n\n"
585
+
586
+ stats = results.get('statistics', {})
587
+ status += f"📊 **Statistics:**\n"
588
+ status += f" • Total files: {stats.get('total_files', 0)}\n"
589
+ status += f" • Files modernized: {stats.get('files_modernized', 0)}\n"
590
+ status += f" • Tests generated: {stats.get('tests_generated', 0)}\n"
591
+ status += f" • Test pass rate: {stats.get('test_pass_rate', 0):.1f}%\n"
592
+ # Only show coverage if it's greater than 0
593
+ if stats.get('average_coverage', 0) > 0:
594
+ status += f" • Code coverage: {stats.get('average_coverage', 0):.1f}%\n"
595
+ status += "\n"
596
+
597
+ # Phase details
598
+ phases = results.get('phases', {})
599
+
600
+ if 'classification' in phases:
601
+ c = phases['classification']
602
+ status += f"📋 **Classification:**\n"
603
+ status += f" • High priority: {c.get('modernize_high', 0)} files\n"
604
+ status += f" • Low priority: {c.get('modernize_low', 0)} files\n"
605
+ status += f" • Skip: {c.get('skip', 0)} files\n\n"
606
+
607
+ if 'search' in phases:
608
+ s = phases['search']
609
+ status += f"🔍 **Semantic Search:**\n"
610
+ status += f" • Indexed files: {s.get('indexed_files', 0)}\n"
611
+ status += f" • Pattern groups: {s.get('pattern_groups', 0)}\n\n"
612
+
613
+ if 'validation' in phases:
614
+ v = phases['validation']
615
+ status += f"✅ **Validation:**\n"
616
+ status += f" • Tests run: {v.get('total_tests', 0)}\n"
617
+ status += f" • Tests passed: {v.get('tests_passed', 0)}\n"
618
+ status += f" • Tests failed: {v.get('tests_failed', 0)}\n"
619
+ status += f" • Pass rate: {v.get('pass_rate', 0):.1f}%\n"
620
+
621
+ # Show execution mode
622
+ exec_mode = v.get('execution_mode', 'unknown')
623
+ if exec_mode == 'modal':
624
+ status += f" • Execution: 🚀 Modal (cloud)\n\n"
625
+ elif exec_mode == 'local':
626
+ status += f" • Execution: 💻 Local\n\n"
627
+ else:
628
+ status += f"\n"
629
+
630
+ if 'github_pr' in phases:
631
+ pr = phases['github_pr']
632
+ if pr.get('success'):
633
+ status += f"🔗 **GitHub PR:**\n"
634
+ status += f" • PR URL: {pr.get('pr_url', 'N/A')}\n"
635
+ status += f" • PR Number: #{pr.get('pr_number', 0)}\n"
636
+ status += f" • Branch: {pr.get('branch', 'N/A')}\n\n"
637
+ else:
638
+ status += f"⚠️ **GitHub PR:** {pr.get('error', 'Failed')}\n\n"
639
+
640
+ if results.get('errors'):
641
+ status += f"⚠️ **Warnings:**\n"
642
+ for error in results['errors'][:5]:
643
+ status += f" • {error}\n"
644
+
645
+ # Add output locations
646
+ if results.get('output'):
647
+ status += f"\n📁 **Output Locations:**\n"
648
+ status += f" • Modernized files: {results['output']['modernized_files']}\n"
649
+ status += f" • Test files: {results['output']['test_files']}\n"
650
+ status += f" • Original files: {results['output']['original_files']}\n"
651
+
652
+ status += "\n" + "=" * 60 + "\n"
653
+ status += "🎉 Ready for review and deployment!\n"
654
+ status += "📥 Download files using the buttons below.\n"
655
+
656
+ else:
657
+ status += "\n❌ MODERNIZATION FAILED\n\n"
658
+ status += "Errors:\n"
659
+ for error in results.get('errors', []):
660
+ status += f" • {error}\n"
661
+
662
+ progress(1.0, desc="Complete!")
663
+
664
+ # Final yield with status and download files (make visible)
665
+ yield (
666
+ status,
667
+ gr.update(value=modernized_zip.name, visible=True) if modernized_zip else gr.update(visible=False),
668
+ gr.update(value=tests_zip.name, visible=True) if tests_zip else gr.update(visible=False),
669
+ gr.update(value=report_file.name, visible=True) if report_file else gr.update(visible=False)
670
+ )
671
+
672
+ except Exception as e:
673
+ logger.error(f"Error in modernize_code: {e}", exc_info=True)
674
+ progress(1.0, desc="Error occurred")
675
+ yield f"❌ Error: {str(e)}\n\nPlease check logs for details.", gr.update(visible=False), gr.update(visible=False), gr.update(visible=False)
676
+
677
+ # Custom CSS for better styling
678
+ custom_css = """
679
+ .gradio-container {
680
+ font-family: 'Inter', sans-serif;
681
+ }
682
+ .header {
683
+ text-align: center;
684
+ padding: 20px;
685
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
686
+ color: white;
687
+ border-radius: 10px;
688
+ margin-bottom: 20px;
689
+ }
690
+ /* Style token input to match other inputs */
691
+ .token-input input {
692
+ background-color: var(--input-background-fill) !important;
693
+ border: 1px solid var(--input-border-color) !important;
694
+ }
695
+ """
696
+
697
+ # Get execution mode info for display
698
+ from src.sandbox.config import EXECUTION_MODE, IS_HUGGINGFACE, MODAL_CONFIGURED
699
+
700
+ exec_mode_display = ""
701
+ if IS_HUGGINGFACE:
702
+ if MODAL_CONFIGURED:
703
+ exec_mode_display = "🚀 Running on Hugging Face Spaces with Modal (cloud execution)"
704
+ else:
705
+ exec_mode_display = "⚠️ Running on Hugging Face but Modal not configured - tests will fail!"
706
+ elif EXECUTION_MODE == "modal":
707
+ exec_mode_display = "🚀 Modal execution enabled (cloud)"
708
+ elif EXECUTION_MODE == "local":
709
+ exec_mode_display = "💻 Local execution mode"
710
+ else:
711
+ exec_mode_display = "" # Don't show anything for auto mode
712
+
713
+ # Build Gradio interface
714
+ with gr.Blocks(title="Legacy Code Modernizer") as app:
715
+ # Add custom CSS via HTML
716
+ gr.HTML(f"""
717
+ <style>
718
+ {custom_css}
719
+ </style>
720
+ <div class="header">
721
+ <h1>🤖 Legacy Code Modernizer</h1>
722
+ <p>AI-powered code modernization for Python, Java, and JavaScript/TypeScript</p>
723
+ <p style="font-size: 12px; opacity: 0.8; margin-top: 8px;">{exec_mode_display}</p>
724
+ </div>
725
+ """)
726
+
727
+ gr.Markdown("""
728
+ ### Modernization Workflow:
729
+ 1. **Discovery & Classification**: Analyze codebase structure and prioritize files
730
+ 2. **Semantic Search**: Group similar patterns using vector-based search
731
+ 3. **Code Transformation**: Apply AI-powered modernization patterns
732
+ 4. **Testing & Validation**: Generate tests and validate in secure sandbox
733
+ 5. **GitHub Integration**: Create pull requests with comprehensive documentation
734
+
735
+ **Powered by**: Google Gemini, Nebius AI, LlamaIndex, Chroma, Modal, MCP Protocol
736
+ """)
737
+
738
+ with gr.Row():
739
+ with gr.Column(scale=2):
740
+ # Input method selection
741
+ with gr.Tabs() as input_tabs:
742
+ with gr.Tab("📄 Single File"):
743
+ single_file_input = gr.File(
744
+ label="Upload Single Code File",
745
+ file_types=[
746
+ ".py", ".pyw", ".pyx",
747
+ ".java",
748
+ ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs"
749
+ ],
750
+ type="filepath"
751
+ )
752
+
753
+ file_status = gr.Textbox(
754
+ label="File Status",
755
+ lines=8,
756
+ interactive=False,
757
+ visible=True
758
+ )
759
+
760
+ single_file_path = gr.State(value=None)
761
+
762
+ gr.Markdown("""
763
+ **Supported Languages**:
764
+ - Python (.py, .pyw, .pyx) - pytest with coverage
765
+ - Java (.java) - Maven + JUnit 5 + JaCoCo
766
+ - JavaScript (.js, .jsx, .mjs, .cjs) - Jest with coverage
767
+ - TypeScript (.ts, .tsx) - Jest with coverage
768
+
769
+ **Max file size**: 10 MB per file
770
+
771
+ **Note**: All supported languages include code transformation, test generation, and secure Modal sandbox execution with automatic dependency management.
772
+ """)
773
+
774
+ with gr.Tab("📁 Upload ZIP"):
775
+ file_input = gr.File(
776
+ label="Upload Repository (.zip)",
777
+ file_types=[".zip"],
778
+ type="filepath"
779
+ )
780
+
781
+ with gr.Tab("🔗 Clone from GitHub"):
782
+ github_repo_url = gr.Textbox(
783
+ label="GitHub Repository URL",
784
+ placeholder="https://github.com/owner/repo",
785
+ info="Enter full GitHub URL to clone (without .git extension)"
786
+ )
787
+
788
+ clone_btn = gr.Button(
789
+ "📥 Load Repository",
790
+ variant="secondary",
791
+ size="sm"
792
+ )
793
+
794
+ clone_status = gr.Textbox(
795
+ label="Repository Files",
796
+ lines=15,
797
+ interactive=False,
798
+ visible=False
799
+ )
800
+
801
+ cloned_repo_path = gr.State(value=None)
802
+
803
+ gr.Markdown("**Note**: Requires git to be installed on your system")
804
+
805
+ # Build comprehensive target version list
806
+ all_target_versions = []
807
+ for versions in TARGET_VERSIONS.values():
808
+ all_target_versions.extend(versions)
809
+ all_target_versions.extend(FRAMEWORK_VERSIONS)
810
+ all_target_versions = sorted(set(all_target_versions))
811
+
812
+ target_version = gr.Dropdown(
813
+ choices=all_target_versions,
814
+ label="🎯 Target Version (auto-detected from files)",
815
+ value="Python 3.14",
816
+ info="Automatically updated based on uploaded files",
817
+ allow_custom_value=False
818
+ )
819
+
820
+ # Add option to select from full list
821
+ with gr.Accordion("📋 Browse All Versions", open=False):
822
+ gr.Markdown("""
823
+ **Auto-detection incorrect?** Select from the full list below:
824
+
825
+ **Python**: 3.14, 3.13, 3.12, 3.11, 3.10
826
+ **Java**: 25 LTS, 23, 21 LTS, 17 LTS
827
+ **JavaScript**: ES2025, ES2024, Node.js 25, 24 LTS, 22 LTS
828
+ **TypeScript**: 5.9, 5.8, 5.7, 5.6
829
+
830
+ **Frameworks**: React 19, Angular 21, Vue 3.5, Django 5.2 LTS, Spring Boot 4.0, Laravel 12, Rails 8.1, Next.js 16, FastAPI 0.122, and more
831
+
832
+ Simply select your desired version from the dropdown above.
833
+ """)
834
+
835
+ with gr.Accordion("⚙️ Advanced Options", open=False):
836
+ create_pr = gr.Checkbox(
837
+ label="Create GitHub PR",
838
+ value=False,
839
+ info="Automatically create pull request with modernized code"
840
+ )
841
+
842
+ repo_url = gr.Textbox(
843
+ label="GitHub Repository URL for PR",
844
+ placeholder="owner/repo (e.g., myorg/myproject)",
845
+ info="Required if creating PR"
846
+ )
847
+
848
+ github_token_input = gr.Textbox(
849
+ label="GitHub Personal Access Token",
850
+ placeholder="ghp_xxxxxxxxxxxxxxxxxxxx",
851
+ type="password",
852
+ info="Required for PR creation. Leave empty to use token from .env file",
853
+ container=True,
854
+ elem_classes=["token-input"]
855
+ )
856
+
857
+ process_btn = gr.Button(
858
+ "🚀 Start Modernization",
859
+ variant="primary",
860
+ size="lg"
861
+ )
862
+
863
+ with gr.Column(scale=3):
864
+ output = gr.Textbox(
865
+ label="📊 Status & Progress",
866
+ lines=25,
867
+ max_lines=35
868
+ )
869
+
870
+ # Download section (separate row, below main interface)
871
+ with gr.Row():
872
+ download_modernized = gr.File(
873
+ label="📦 Download Modernized Code",
874
+ visible=False
875
+ )
876
+ download_tests = gr.File(
877
+ label="🧪 Download Test Files",
878
+ visible=False
879
+ )
880
+ download_report = gr.File(
881
+ label="📄 Download Report",
882
+ visible=False
883
+ )
884
+
885
+ with gr.Accordion("📖 Features & Capabilities", open=False):
886
+ gr.Markdown("""
887
+ ### Core Features:
888
+
889
+ **🔍 Semantic Code Search**
890
+ - Vector-based similarity search using LlamaIndex and Chroma
891
+ - Automatic pattern grouping for efficient refactoring
892
+ - Bulk code transformation capabilities
893
+
894
+ **🤖 AI-Powered Analysis**
895
+ - Powered by Google Gemini and Nebius AI models
896
+ - Large context window for comprehensive code understanding
897
+ - Multi-language support (Python, Java, JavaScript, TypeScript)
898
+
899
+ **🧪 Automated Testing**
900
+ - Isolated test execution in Modal sandbox
901
+ - Secure environment with no network access
902
+ - Performance benchmarking and coverage reporting
903
+
904
+ **🔗 GitHub Integration**
905
+ - Automated pull request creation via MCP Protocol
906
+ - Comprehensive documentation generation
907
+ - Deployment checklists and rollback plans
908
+
909
+ **📊 Quality Assurance**
910
+ - High test pass rates with comprehensive coverage
911
+ - Behavioral equivalence testing
912
+ - Automated validation before deployment
913
+ """)
914
+
915
+ with gr.Accordion("🎯 Supported Languages & Versions", open=False):
916
+ gr.Markdown("""
917
+ ### Supported Languages (Updated November 2025):
918
+
919
+ **Python**
920
+ - Versions: 3.10, 3.11, 3.12, 3.13, 3.14
921
+ - Frameworks: Django 5.2 LTS, Flask 3.1, FastAPI 0.122
922
+ - Testing: pytest with coverage
923
+
924
+ **Java**
925
+ - Versions: Java 17 LTS, 21 LTS, 23, 25 LTS
926
+ - Frameworks: Spring Boot 4.0
927
+ - Testing: Maven + JUnit 5 + JaCoCo
928
+
929
+ **JavaScript**
930
+ - Standards: ES2024, ES2025
931
+ - Runtimes: Node.js 22 LTS, 24 LTS, 25
932
+ - Frameworks: React 19, Angular 21, Vue 3.5, Express 5.1, Next.js 16
933
+ - Testing: Jest with coverage
934
+
935
+ **TypeScript**
936
+ - Versions: 5.6, 5.7, 5.8, 5.9
937
+ - Frameworks: React 19, Angular 21, Vue 3.5, Next.js 16
938
+ - Testing: Jest with ts-jest
939
+ """)
940
+
941
+ # State for suggested versions
942
+ suggested_versions_state = gr.State(value=[])
943
+
944
+ # Event handlers
945
+ # Handle single file validation (automatic on upload)
946
+ def validate_and_show(file_path):
947
+ """Wrapper to validate file and show status."""
948
+ logger.info(f"validate_and_show called with file_path: {file_path}")
949
+ if not file_path:
950
+ logger.warning("No file path provided to validate_and_show")
951
+ return "📄 Upload a code file to get started", None, gr.update(), []
952
+
953
+ try:
954
+ message, zip_path, suggested_versions = process_single_file(file_path)
955
+ logger.info(f"Validation result: message='{message}', zip_path='{zip_path}', versions={len(suggested_versions)}")
956
+
957
+ # Update dropdown with suggested versions
958
+ if suggested_versions:
959
+ return message, zip_path, gr.update(choices=suggested_versions, value=suggested_versions[0]), suggested_versions
960
+ else:
961
+ return message, zip_path, gr.update(), []
962
+ except Exception as e:
963
+ logger.error(f"Error in validate_and_show: {e}", exc_info=True)
964
+ return f"❌ Error: {str(e)}", None, gr.update(), []
965
+
966
+ # Handle ZIP file upload
967
+ def handle_zip_upload(file_path):
968
+ """Handle ZIP file upload and detect languages."""
969
+ if not file_path:
970
+ return gr.update(), []
971
+
972
+ try:
973
+ lang_summary, suggested_versions = detect_languages_from_zip(file_path)
974
+ logger.info(f"ZIP upload: {lang_summary}, {len(suggested_versions)} versions")
975
+
976
+ if suggested_versions:
977
+ return gr.update(choices=suggested_versions, value=suggested_versions[0]), suggested_versions
978
+ else:
979
+ return gr.update(), []
980
+ except Exception as e:
981
+ logger.error(f"Error handling ZIP upload: {e}")
982
+ return gr.update(), []
983
+
984
+ # Auto-validate on file upload
985
+ single_file_input.change(
986
+ fn=validate_and_show,
987
+ inputs=[single_file_input],
988
+ outputs=[file_status, single_file_path, target_version, suggested_versions_state],
989
+ show_progress=True
990
+ )
991
+
992
+ # Auto-detect on ZIP upload
993
+ file_input.change(
994
+ fn=handle_zip_upload,
995
+ inputs=[file_input],
996
+ outputs=[target_version, suggested_versions_state],
997
+ show_progress=False
998
+ )
999
+
1000
+ # Handle GitHub clone button
1001
+ def handle_github_clone(github_url):
1002
+ """Wrapper for GitHub clone with version detection."""
1003
+ status, zip_path, visibility, suggested_versions = clone_github_repo(github_url)
1004
+
1005
+ if suggested_versions:
1006
+ return status, zip_path, visibility, gr.update(choices=suggested_versions, value=suggested_versions[0]), suggested_versions
1007
+ else:
1008
+ return status, zip_path, visibility, gr.update(), []
1009
+
1010
+ clone_btn.click(
1011
+ fn=handle_github_clone,
1012
+ inputs=[github_repo_url],
1013
+ outputs=[clone_status, cloned_repo_path, clone_status, target_version, suggested_versions_state],
1014
+ show_progress=True
1015
+ )
1016
+
1017
+ # Handle modernization
1018
+ process_btn.click(
1019
+ fn=modernize_code,
1020
+ inputs=[file_input, target_version, create_pr, repo_url, github_token_input, cloned_repo_path, single_file_path],
1021
+ outputs=[output, download_modernized, download_tests, download_report],
1022
+ show_progress="full"
1023
+ )
1024
+
1025
+ # Examples
1026
+ gr.Examples(
1027
+ examples=[
1028
+ [None, "Python 3.12", False, "", "", None, None],
1029
+ [None, "Java 21 LTS", False, "", "", None, None],
1030
+ [None, "React 18 (Hooks)", True, "myorg/myproject", "", None, None]
1031
+ ],
1032
+ inputs=[file_input, target_version, create_pr, repo_url, github_token_input, cloned_repo_path, single_file_path],
1033
+ label="📝 Example Configurations"
1034
+ )
1035
+
1036
+
1037
+
1038
+ if __name__ == "__main__":
1039
+ app.launch(
1040
+ server_name="0.0.0.0",
1041
+ server_port=7860,
1042
+ share=False,
1043
+ show_error=True,
1044
+ # custom_css is already injected via the gr.HTML block above (css is a gr.Blocks argument, not a launch() argument)
1045
+ )
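
A condensed, self-contained sketch of the redaction approach used by `SensitiveDataFilter` above. The `RedactingFilter` name, the fake key value, and the `my_module` logger are illustrative only, and the snippet is independent of app.py, so it runs without the Gradio UI or any real credentials:

```python
import logging
import os


class RedactingFilter(logging.Filter):
    """Minimal mirror of SensitiveDataFilter: rewrites known secret values in log messages."""

    def __init__(self, env_keys):
        super().__init__()
        # Collect non-trivial secret values from the environment, as SensitiveDataFilter does.
        self.secrets = [v for v in (os.getenv(k) for k in env_keys) if v and len(v) > 5]

    def filter(self, record):
        msg = str(record.msg)
        for secret in self.secrets:
            msg = msg.replace(secret, "[REDACTED]")
        record.msg = msg
        return True  # keep every record; only the text is rewritten


# Fake key so the sketch is self-contained.
os.environ["GEMINI_API_KEY"] = "fake-key-123456"

handler = logging.StreamHandler()
handler.addFilter(RedactingFilter(["GEMINI_API_KEY", "GITHUB_TOKEN"]))
logging.basicConfig(level=logging.INFO, handlers=[handler])

# The record propagates from a module-level logger and is scrubbed by the handler's filter.
logging.getLogger("my_module").info("calling Gemini with key fake-key-123456")
```

Attaching the filter to a handler rather than to the root logger matters: logger-level filters are not consulted for records that propagate up from child loggers such as `logging.getLogger(__name__)`, which is why the initialization above adds the filter to the root logger's handlers.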
src/utils/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Utility functions for file handling and processing."""
src/utils/file_handler.py ADDED
@@ -0,0 +1,166 @@
1
+ """File handling utilities for repository processing."""
2
+
3
+ import zipfile
4
+ import os
5
+ from pathlib import Path
6
+ from typing import List, Set
7
+ import shutil
8
+
9
+
10
+ class FileHandler:
11
+ """Handles file extraction and code file discovery."""
12
+
13
+ # Supported code file extensions
14
+ CODE_EXTENSIONS: Set[str] = {
15
+ '.py', '.java', '.js', '.ts', '.jsx', '.tsx',
16
+ '.php', '.rb', '.go', '.rs', '.cpp', '.c', '.h',
17
+ '.cs', '.swift', '.kt', '.scala', '.pl', '.r'
18
+ }
19
+
20
+ # Files/directories to exclude
21
+ EXCLUDE_PATTERNS: Set[str] = {
22
+ '__pycache__', '.git', '.svn', 'node_modules',
23
+ 'venv', 'env', '.venv', 'dist', 'build',
24
+ '.idea', '.vscode', '.pytest_cache', '.mypy_cache'
25
+ }
26
+
27
+ def __init__(self, upload_dir: str = "./uploads"):
28
+ """
29
+ Initialize file handler.
30
+
31
+ Args:
32
+ upload_dir: Directory to store uploaded and extracted files
33
+ """
34
+ self.upload_dir = Path(upload_dir)
35
+ self.upload_dir.mkdir(exist_ok=True, parents=True)
36
+
37
+ def extract_repo(self, zip_path: str) -> str:
38
+ """
39
+ Extract uploaded repository ZIP file.
40
+
41
+ Args:
42
+ zip_path: Path to the ZIP file
43
+
44
+ Returns:
45
+ Path to extracted directory
46
+
47
+ Raises:
48
+ ValueError: If file is not a valid ZIP
49
+ """
50
+ if not zipfile.is_zipfile(zip_path):
51
+ raise ValueError(f"File {zip_path} is not a valid ZIP file")
52
+
53
+ # Create unique extraction directory
54
+ extract_path = self.upload_dir / "extracted"
55
+
56
+ # Clean up previous extraction
57
+ if extract_path.exists():
58
+ shutil.rmtree(extract_path)
59
+
60
+ extract_path.mkdir(exist_ok=True, parents=True)
61
+
62
+ try:
63
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
64
+ zip_ref.extractall(extract_path)
65
+
66
+ return str(extract_path)
67
+
68
+ except Exception as e:
69
+ raise ValueError(f"Error extracting ZIP file: {e}")
70
+
71
+ def list_code_files(self, repo_path: str) -> List[str]:
72
+ """
73
+ List all code files in repository.
74
+
75
+ Args:
76
+ repo_path: Path to repository directory
77
+
78
+ Returns:
79
+ List of relative file paths
80
+ """
81
+ code_files = []
82
+ repo_path = Path(repo_path)
83
+
84
+ for root, dirs, files in os.walk(repo_path):
85
+ # Filter out excluded directories
86
+ dirs[:] = [d for d in dirs if d not in self.EXCLUDE_PATTERNS]
87
+
88
+ for filename in files:
89
+ file_path = Path(root) / filename
90
+
91
+ # Check if it's a code file
92
+ if file_path.suffix in self.CODE_EXTENSIONS:
93
+ # Get relative path
94
+ rel_path = file_path.relative_to(repo_path)
95
+ code_files.append(str(rel_path))
96
+
97
+ return sorted(code_files)
98
+
99
+ def read_file(self, file_path: str, max_size: int = 1024 * 1024) -> str:
100
+ """
101
+ Read file contents safely.
102
+
103
+ Args:
104
+ file_path: Path to file
105
+ max_size: Maximum file size in bytes (default 1MB)
106
+
107
+ Returns:
108
+ File contents as string
109
+
110
+ Raises:
111
+ ValueError: If file is too large or cannot be read
112
+ """
113
+ file_path = Path(file_path)
114
+
115
+ if not file_path.exists():
116
+ raise ValueError(f"File {file_path} does not exist")
117
+
118
+ file_size = file_path.stat().st_size
119
+ if file_size > max_size:
120
+ raise ValueError(
121
+ f"File {file_path} is too large ({file_size} bytes). "
122
+ f"Maximum size is {max_size} bytes."
123
+ )
124
+
125
+ try:
126
+ with open(file_path, 'r', encoding='utf-8') as f:
127
+ return f.read()
128
+ except UnicodeDecodeError:
129
+ # Try with different encoding
130
+ try:
131
+ with open(file_path, 'r', encoding='latin-1') as f:
132
+ return f.read()
133
+ except Exception as e:
134
+ raise ValueError(f"Cannot read file {file_path}: {e}")
135
+
136
+ def get_file_info(self, file_path: str) -> dict:
137
+ """
138
+ Get information about a file.
139
+
140
+ Args:
141
+ file_path: Path to file
142
+
143
+ Returns:
144
+ Dictionary with file information
145
+ """
146
+ file_path = Path(file_path)
147
+
148
+ if not file_path.exists():
149
+ return {"exists": False}
150
+
151
+ stat = file_path.stat()
152
+
153
+ return {
154
+ "exists": True,
155
+ "name": file_path.name,
156
+ "extension": file_path.suffix,
157
+ "size_bytes": stat.st_size,
158
+ "size_kb": round(stat.st_size / 1024, 2),
159
+ "is_code": file_path.suffix in self.CODE_EXTENSIONS
160
+ }
161
+
162
+ def cleanup(self):
163
+ """Clean up temporary files and directories."""
164
+ if self.upload_dir.exists():
165
+ shutil.rmtree(self.upload_dir)
166
+ self.upload_dir.mkdir(exist_ok=True, parents=True)
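
A minimal usage sketch for `FileHandler`, assuming it is run from the repository root so that `src.utils` is importable; the `legacy_project.zip` path is hypothetical:

```python
from src.utils.file_handler import FileHandler

fh = FileHandler(upload_dir="./uploads")              # ./uploads is created if missing

repo_dir = fh.extract_repo("legacy_project.zip")      # raises ValueError if the file is not a valid ZIP
code_files = fh.list_code_files(repo_dir)             # sorted relative paths; .git, venv, node_modules, ... are skipped

for rel_path in code_files[:5]:
    info = fh.get_file_info(f"{repo_dir}/{rel_path}")
    source = fh.read_file(f"{repo_dir}/{rel_path}")   # raises ValueError if missing or over the 1 MB default limit
    print(f"{rel_path}: {info['size_kb']} KB, {len(source.splitlines())} lines")

fh.cleanup()                                          # removes and recreates ./uploads
```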
src/workflow/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Workflow orchestration module."""
2
+
3
+ from src.workflow.orchestrator import ModernizationOrchestrator
4
+
5
+ __all__ = ['ModernizationOrchestrator']
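
The orchestrator exported here can also be driven without the Gradio front end. A minimal sketch mirroring the call made in app.py's `modernize_code`; the ZIP path is hypothetical, and the provider keys from `.env.example` (e.g. `GEMINI_API_KEY` or `NEBIUS_API_KEY`, plus Modal/GitHub credentials where used) are assumed to be configured:

```python
import asyncio

from src.workflow import ModernizationOrchestrator


async def main():
    orchestrator = ModernizationOrchestrator(use_intelligent_matcher=True)

    results = await orchestrator.modernize_repository(
        repo_path="legacy_project.zip",     # hypothetical ZIP, as produced by the upload helpers in app.py
        target_version="Python 3.14",
        create_pr=False,                    # set True and pass repo_url/github_token to open a PR
        progress_callback=lambda phase, msg: print(f"{phase}: {msg}"),
    )

    print("success:", results["success"])
    print("statistics:", results["statistics"])
    print("modernized files in:", results.get("output", {}).get("modernized_files"))


if __name__ == "__main__":
    asyncio.run(main())
```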
src/workflow/orchestrator.py ADDED
@@ -0,0 +1,732 @@
1
+ """
2
+ Workflow Orchestrator - Integrates all phases into complete pipeline.
3
+ Phase 5: Complete end-to-end workflow with all MCP integrations.
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ import asyncio
9
+ from typing import Dict, List, Optional
10
+ from pathlib import Path
11
+
12
+ # Phase 1-2: Classification
13
+ from src.agents.classifier import CodeClassifier
14
+ from src.agents.pattern_integration import PatternMatcherIntegration
15
+ from src.utils.file_handler import FileHandler
16
+
17
+ # Phase 3: Search
18
+ from src.search.vector_store import CodeSearchEngine
19
+
20
+ # Phase 4: Analysis & Transformation
21
+ from src.agents.analyzer import CodeAnalyzer
22
+ from src.agents.transformer import CodeTransformer
23
+
24
+ # Phase 5: Testing & GitHub
25
+ from src.agents.test_generator import CodeTestGenerator
26
+ from src.sandbox.validator import ModalSandboxValidator
27
+
28
+ # Lazy import to avoid circular dependency
29
+ GitHubMCPClient = None
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+
34
+ class ModernizationOrchestrator:
35
+ """
36
+ Orchestrates the complete code modernization workflow.
37
+ Integrates all 5 phases into a seamless pipeline.
38
+ """
39
+
40
+ def __init__(self, use_intelligent_matcher: bool = True):
41
+ """Initialize orchestrator with all components."""
42
+ logger.info("Initializing ModernizationOrchestrator")
43
+
44
+ # Phase 1-2 components
45
+ self.use_intelligent_matcher = use_intelligent_matcher
46
+ if use_intelligent_matcher:
47
+ self.pattern_integration = PatternMatcherIntegration(
48
+ use_intelligent_matcher=True,
49
+ cache_dir=".pattern_cache"
50
+ )
51
+ logger.info("Using IntelligentPatternMatcher")
52
+ else:
53
+ self.classifier = CodeClassifier()
54
+ logger.info("Using legacy CodeClassifier")
55
+
56
+ self.file_handler = FileHandler()
57
+
58
+ # Phase 3 components
59
+ self.search_engine = None # Initialized per repo
60
+
61
+ # Phase 4 components
62
+ self.analyzer = CodeAnalyzer()
63
+ self.transformer = CodeTransformer()
64
+
65
+ # Phase 5 components
66
+ self.test_generator = CodeTestGenerator()
67
+ self.validator = ModalSandboxValidator()
68
+
69
+ # Lazy load GitHub client to avoid circular import
70
+ self.github_client = None
71
+
72
+ logger.info("ModernizationOrchestrator initialized successfully")
73
+
74
+ async def modernize_repository(
75
+ self,
76
+ repo_path: str,
77
+ target_version: str = "Python 3.14",
78
+ create_pr: bool = False,
79
+ repo_url: Optional[str] = None,
80
+ github_token: Optional[str] = None,
81
+ progress_callback: Optional[callable] = None
82
+ ) -> Dict:
83
+ """
84
+ Complete modernization workflow for a repository.
85
+
86
+ Args:
87
+ repo_path: Path to repository (ZIP or directory)
88
+ target_version: Target language/framework version
89
+ create_pr: Whether to create GitHub PR
90
+ repo_url: GitHub repository URL (required if create_pr=True)
91
+ github_token: GitHub personal access token (optional, uses .env if not provided)
92
+ progress_callback: Optional callback function for progress updates
93
+
94
+ Returns:
95
+ Dictionary with complete modernization results
96
+ """
97
+ logger.info(f"Starting modernization for {repo_path}")
98
+
99
+ def update_progress(phase: str, message: str):
100
+ """Helper to call progress callback if provided."""
101
+ if progress_callback:
102
+ progress_callback(phase, message)
103
+
104
+ results = {
105
+ "success": False,
106
+ "phases": {},
107
+ "statistics": {},
108
+ "errors": []
109
+ }
110
+
111
+ try:
112
+ # Phase 1: Extract and discover files
113
+ logger.info("Phase 1: File discovery")
114
+ update_progress("Phase 1", "Extracting and discovering files...")
115
+
116
+ if repo_path.endswith('.zip'):
117
+ extract_path = self.file_handler.extract_repo(repo_path)
118
+ else:
119
+ extract_path = repo_path
120
+
121
+ files = self.file_handler.list_code_files(extract_path)
122
+ logger.info(f"Discovered {len(files)} code files")
123
+ update_progress("Phase 1", f"Discovered {len(files)} code files")
124
+
125
+ results['phases']['discovery'] = {
126
+ "files_found": len(files),
127
+ "repo_path": extract_path
128
+ }
129
+
130
+ # Phase 2: Classify files
131
+ logger.info("Phase 2: File classification")
132
+ update_progress("Phase 2", "Classifying files with AI pattern detection...")
133
+
134
+ # Read file contents for intelligent matching
135
+ file_contents = {}
136
+ if self.use_intelligent_matcher:
137
+ logger.info("Reading file contents for intelligent pattern matching...")
138
+ for file_path in files[:50]: # Limit to 50 files for demo
139
+ try:
140
+ full_path = os.path.join(extract_path, file_path)
141
+ content = self.file_handler.read_file(full_path)
142
+ if content:
143
+ file_contents[file_path] = content
144
+ except Exception as e:
145
+ logger.warning(f"Could not read {file_path}: {e}")
146
+
147
+ classifications = self.pattern_integration.classify_files(
148
+ list(file_contents.keys()),
149
+ file_contents
150
+ )
151
+
152
+ # Get detailed statistics
153
+ analyses = self.pattern_integration.pattern_matcher.analyze_batch(file_contents)
154
+ stats = self.pattern_integration.generate_statistics(analyses)
155
+
156
+ logger.info(f"Intelligent classification: {stats['modernize_high']} high, "
157
+ f"{stats['modernize_low']} low, {stats['skip']} skip")
158
+ logger.info(f"Detected {stats['patterns_detected']} patterns across {stats['total_files']} files")
159
+ else:
160
+ classifications = self.classifier.classify_files(files)
161
+ stats = None
162
+
163
+ modernize_high = [f for f, c in classifications.items() if c == 'modernize_high']
164
+ modernize_low = [f for f, c in classifications.items() if c == 'modernize_low']
165
+ skip_files = [f for f, c in classifications.items() if c == 'skip']
166
+
167
+ logger.info(f"Classification: {len(modernize_high)} high, {len(modernize_low)} low, {len(skip_files)} skip")
168
+
169
+ results['phases']['classification'] = {
170
+ "modernize_high": len(modernize_high),
171
+ "modernize_low": len(modernize_low),
172
+ "skip": len(skip_files),
173
+ "classifications": classifications,
174
+ "intelligent_stats": stats if self.use_intelligent_matcher else None
175
+ }
176
+
177
+ # Phase 3: Semantic search and pattern grouping
178
+ logger.info("Phase 3: Semantic search")
179
+ update_progress("Phase 3", "Building semantic index with LlamaIndex...")
180
+
181
+ self.search_engine = CodeSearchEngine(persist_dir=None)
182
+
183
+ # Build index for high-priority files
184
+ files_to_modernize = modernize_high + modernize_low
185
+ if files_to_modernize:
186
+ self.search_engine.build_index(extract_path) # Build index from repo
187
+
188
+ # Find pattern groups
189
+ pattern_groups = self._find_pattern_groups(files_to_modernize[:20])
190
+ logger.info(f"Found {len(pattern_groups)} pattern groups")
191
+
192
+ results['phases']['search'] = {
193
+ "indexed_files": min(len(files_to_modernize), 100),
194
+ "pattern_groups": len(pattern_groups)
195
+ }
196
+ else:
197
+ pattern_groups = []
198
+ results['phases']['search'] = {"message": "No files to modernize"}
199
+
200
+ # Phase 4: Analysis and transformation
201
+ logger.info("Phase 4: Code transformation")
202
+ update_progress("Phase 4", "Analyzing and transforming code...")
203
+
204
+ transformations = []
205
+
206
+ # Use intelligent pattern data if available
207
+ if self.use_intelligent_matcher and file_contents:
208
+ logger.info("Using intelligent pattern analysis for transformation")
209
+
210
+ # Get prioritized files from intelligent matcher
211
+ prioritized = self.pattern_integration.pattern_matcher.prioritize_files(analyses)
212
+
213
+ # Process top priority files
214
+ files_to_transform = [
215
+ (fp, analysis) for fp, analysis in prioritized
216
+ if analysis.requires_modernization
217
+ ][:10] # Limit to 10 files for demo
218
+
219
+ logger.info(f"Processing {len(files_to_transform)} high-priority files with detailed pattern data")
220
+
221
+ total_files = len(files_to_transform)
222
+ for idx, (file_path, file_analysis) in enumerate(files_to_transform, 1):
223
+ try:
224
+ update_progress("Phase 4", f"Transforming file {idx}/{total_files}: {Path(file_path).name}")
225
+
226
+ original_code = file_contents.get(file_path, "")
227
+ if not original_code:
228
+ continue
229
+
230
+ # Convert intelligent pattern analysis to transformation plan
231
+ transformation_plan = self.pattern_integration.get_transformation_plan(file_analysis)
232
+
233
+ # Transform using detailed pattern information
234
+ modernized_code = await self.transformer.transform_code(
235
+ file_path,
236
+ original_code,
237
+ transformation_plan
238
+ )
239
+
240
+ transformations.append({
241
+ "file_path": file_path,
242
+ "original_code": original_code,
243
+ "modernized_code": modernized_code,
244
+ "analysis": transformation_plan,
245
+ "patterns_addressed": [p['pattern'] for p in transformation_plan['steps']],
246
+ "pattern_details": file_analysis.patterns # Include detailed pattern info
247
+ })
248
+
249
+ except Exception as e:
250
+ logger.error(f"Error transforming {file_path}: {e}")
251
+ results['errors'].append(f"Transformation error for {file_path}: {e}")
252
+ else:
253
+ # Fallback to legacy pattern grouping
254
+ logger.info("Using legacy pattern grouping for transformation")
255
+
256
+ file_to_patterns = {}
257
+ for group in pattern_groups[:5]: # Limit to 5 groups for demo
258
+ for file_path in group['files'][:3]:
259
+ if file_path not in file_to_patterns:
260
+ file_to_patterns[file_path] = []
261
+ file_to_patterns[file_path].append(group['pattern_name'])
262
+
263
+ logger.info(f"Processing {len(file_to_patterns)} unique files")
264
+
265
+ total_files = len(file_to_patterns)
266
+ for idx, (file_path, patterns) in enumerate(file_to_patterns.items(), 1):
267
+ try:
268
+ update_progress("Phase 4", f"Transforming file {idx}/{total_files}: {Path(file_path).name}")
269
+
270
+ full_path = os.path.join(extract_path, file_path)
271
+ original_code = self.file_handler.read_file(full_path)
272
+
273
+ if not original_code:
274
+ continue
275
+
276
+ # Analyze patterns
277
+ combined_pattern = " AND ".join(patterns)
278
+ analysis = await self.analyzer.analyze_pattern(
279
+ [file_path],
280
+ combined_pattern,
281
+ {file_path: original_code}
282
+ )
283
+
284
+ # Transform file
285
+ modernized_code = await self.transformer.transform_code(
286
+ file_path,
287
+ original_code,
288
+ analysis
289
+ )
290
+
291
+ transformations.append({
292
+ "file_path": file_path,
293
+ "original_code": original_code,
294
+ "modernized_code": modernized_code,
295
+ "analysis": analysis,
296
+ "patterns_addressed": patterns
297
+ })
298
+
299
+ except Exception as e:
300
+ logger.error(f"Error transforming {file_path}: {e}")
301
+ results['errors'].append(f"Transformation error for {file_path}: {e}")
302
+
303
+ logger.info(f"Transformed {len(transformations)} files")
304
+
305
+ # Save transformed files to output directory
306
+ output_dir = Path("modernized_output")
307
+ output_dir.mkdir(exist_ok=True)
308
+
309
+ for t in transformations:
310
+ try:
311
+ # Create subdirectories if needed
312
+ output_file = output_dir / t['file_path']
313
+ output_file.parent.mkdir(parents=True, exist_ok=True)
314
+
315
+ # Save modernized code
316
+ output_file.write_text(t['modernized_code'])
317
+ logger.info(f"Saved: {output_file}")
318
+
319
+ # Also save original for comparison
320
+ original_file = output_dir / "original" / t['file_path']
321
+ original_file.parent.mkdir(parents=True, exist_ok=True)
322
+ original_file.write_text(t['original_code'])
323
+
324
+ except Exception as e:
325
+ logger.error(f"Error saving {t['file_path']}: {e}")
326
+
327
+ logger.info(f"Output saved to: {output_dir.absolute()}")
328
+
329
+ results['phases']['transformation'] = {
330
+ "files_transformed": len(transformations),
331
+ "output_directory": str(output_dir.absolute())
332
+ }
333
+
334
+ # Store transformations for zip file creation
335
+ results['transformations'] = transformations
336
+
337
+ # Phase 5: Test generation and validation
338
+ logger.info("Phase 5: Test generation and validation")
339
+ update_progress("Phase 5", "Generating tests and validating in Modal sandbox...")
340
+
341
+ validation_results = []
342
+
343
+ # Create tests directory
344
+ tests_dir = output_dir / "tests"
345
+ tests_dir.mkdir(exist_ok=True)
346
+
347
+ total_tests = min(len(transformations), 10)
348
+ for idx, t in enumerate(transformations[:10], 1): # Limit to 10 for demo
349
+ try:
350
+ # Update progress
351
+ update_progress("Phase 5", f"Testing file {idx}/{total_tests}: {Path(t['file_path']).name}")
352
+
353
+ # Generate tests
354
+ tests = self.test_generator.generate_tests(
355
+ t['original_code'],
356
+ t['modernized_code'],
357
+ t['file_path']
358
+ )
359
+
360
+ # Validate and auto-fix export issues
361
+ if tests:
362
+ from src.agents.code_validator import validate_and_fix_code
363
+
364
+ # Detect language from file extension
365
+ file_ext = Path(t['file_path']).suffix.lower()
366
+ language_map = {
367
+ '.ts': 'typescript',
368
+ '.js': 'javascript',
369
+ '.py': 'python',
370
+ '.java': 'java'
371
+ }
372
+ language = language_map.get(file_ext, 'unknown')
373
+
374
+ # Validate and fix
375
+ fixed_code, is_valid, issues = validate_and_fix_code(
376
+ t['modernized_code'],
377
+ tests,
378
+ language
379
+ )
380
+
381
+ if not is_valid:
382
+ logger.warning(f"Code validation issues for {t['file_path']}: {issues}")
383
+
384
+ if fixed_code != t['modernized_code']:
385
+ logger.info(f"Auto-fixed export issues in {t['file_path']}")
386
+ t['modernized_code'] = fixed_code
387
+
388
+ # Re-save the fixed source file
389
+ output_file = output_dir / Path(t['file_path']).name
390
+ output_file.write_text(fixed_code)
391
+
392
+ # Save test file
393
+ if tests:
394
+ test_file = tests_dir / f"test_{Path(t['file_path']).name}"
395
+ test_file.write_text(tests)
396
+ logger.info(f"Saved test: {test_file}")
397
+
398
+ # Validate in sandbox
399
+ validation = self.validator.validate_transformation(
400
+ t['original_code'],
401
+ t['modernized_code'],
402
+ tests,
403
+ file_path=t['file_path']
404
+ )
405
+
406
+ validation['file_path'] = t['file_path']
407
+ validation_results.append(validation)
408
+
409
+ except Exception as e:
410
+ logger.error(f"Error validating {t['file_path']}: {e}")
411
+ results['errors'].append(f"Validation error: {e}")
412
+
413
+ # Calculate aggregate test results
414
+ total_tests = sum(v.get('tests_run', 0) for v in validation_results)
415
+ total_passed = sum(v.get('tests_passed', 0) for v in validation_results)
416
+ # Fix: Only average coverage for files that have coverage data
417
+ coverage_values = [v.get('coverage_percent', 0) for v in validation_results if v.get('coverage_percent', 0) > 0]
418
+ avg_coverage = sum(coverage_values) / len(coverage_values) if coverage_values else 0.0
419
+
420
+ logger.info(f"Validation: {total_passed}/{total_tests} tests passed, {avg_coverage:.1f}% coverage")
421
+
422
+ results['phases']['validation'] = {
423
+ "files_validated": len(validation_results),
424
+ "total_tests": total_tests,
425
+ "tests_passed": total_passed,
426
+ "tests_failed": total_tests - total_passed,
427
+ "average_coverage": round(avg_coverage, 2),
428
+ "pass_rate": round(total_passed / max(total_tests, 1) * 100, 2)
429
+ }
430
+
431
+             # Phase 5b: GitHub PR creation (optional)
+             if create_pr and repo_url:
+                 logger.info("Phase 5b: Creating GitHub PR")
+
+                 # Lazy load GitHub client
+                 if self.github_client is None:
+                     from src.mcp.github_client import GitHubMCPClient
+                     self.github_client = GitHubMCPClient(github_token=github_token)
+
+                 # Prepare changed files
+                 changed_files = {
+                     t['file_path']: t['modernized_code']
+                     for t in transformations
+                 }
+
+                 # Generate PR summary
+                 pr_summary = self._generate_pr_summary(results, target_version)
+
+                 # Create PR
+                 pr_result = await self.github_client.create_pr(
+                     repo_url=repo_url,
+                     changed_files=changed_files,
+                     pr_summary=pr_summary,
+                     test_results=results['phases']['validation']
+                 )
+
+                 results['phases']['github_pr'] = pr_result
+                 logger.info(f"PR creation: {pr_result.get('success', False)}")
+
+             # Calculate final statistics
+             results['statistics'] = {
+                 "total_files": len(files),
+                 "files_modernized": len(transformations),
+                 "tests_generated": total_tests,
+                 "test_pass_rate": round(total_passed / max(total_tests, 1) * 100, 2),
+                 "average_coverage": round(avg_coverage, 2)
+             }
+
+             # Add output locations
+             results['output'] = {
+                 "modernized_files": str(output_dir.absolute()),
+                 "original_files": str((output_dir / "original").absolute()),
+                 "test_files": str((output_dir / "tests").absolute())
+             }
+
+             results['success'] = True
+             logger.info("Modernization workflow completed successfully")
+             logger.info(f"📁 Modernized files: {output_dir.absolute()}")
+             logger.info(f"📁 Test files: {output_dir / 'tests'}")
+
+         except Exception as e:
+             logger.error(f"Workflow error: {e}")
+             results['errors'].append(f"Workflow error: {e}")
+             results['success'] = False
+
+         return results
+
+     def _find_pattern_groups(self, files: List[str]) -> List[Dict]:
+         """
+         Find groups of files with similar legacy patterns.
+         Detects file languages and uses appropriate pattern queries.
+
+         Args:
+             files: List of file paths
+
+         Returns:
+             List of pattern group dictionaries
+         """
+         # Detect languages present in the files
+         languages = self._detect_languages_in_files(files)
+
+         # Build language-specific pattern queries
+         pattern_queries = self._get_pattern_queries_for_languages(languages)
+
+         groups = []
+
+         for query in pattern_queries:
+             try:
+                 similar_files = self.search_engine.find_similar_patterns(query, top_k=10)
+
+                 if similar_files:
+                     groups.append({
+                         "pattern_name": query,
+                         "files": [f['file_path'] for f in similar_files],
+                         "similarity_scores": [f['score'] for f in similar_files]
+                     })
+             except Exception as e:
+                 logger.error(f"Error searching for pattern '{query}': {e}")
+
+         return groups
+
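# Editor's sketch (not part of the committed file): the shape of a single pattern group
# as built above. The file names and scores are hypothetical; real values come from
# self.search_engine.find_similar_patterns(), whose hits carry 'file_path' and 'score'
# keys as used in the list comprehensions above.
example_group = {
    "pattern_name": "Files using deprecated urllib2 library",
    "files": ["src/http_client.py", "src/downloader.py"],
    "similarity_scores": [0.87, 0.74],
}
# 'files' and 'similarity_scores' are parallel lists, so consumers can re-pair and rank them:
ranked = sorted(zip(example_group["files"], example_group["similarity_scores"]),
                key=lambda pair: pair[1], reverse=True)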
+     def _detect_languages_in_files(self, files: List[str]) -> set:
+         """Detect programming languages from file extensions."""
+         extension_to_language = {
+             '.py': 'python',
+             '.java': 'java',
+             '.js': 'javascript',
+             '.ts': 'typescript',
+             '.jsx': 'javascript',
+             '.tsx': 'typescript',
+             '.cpp': 'cpp',
+             '.c': 'c',
+             '.h': 'c',
+             '.cs': 'csharp',
+             '.go': 'go',
+             '.rb': 'ruby',
+             '.php': 'php',
+             '.kt': 'kotlin',
+             '.scala': 'scala',
+             '.rs': 'rust',
+             '.swift': 'swift'
+         }
+
+         languages = set()
+         for file_path in files:
+             ext = Path(file_path).suffix.lower()
+             if ext in extension_to_language:
+                 languages.add(extension_to_language[ext])
+
+         return languages if languages else {'python'}  # Default to Python if no recognized extensions
+
+     def _get_pattern_queries_for_languages(self, languages: set) -> List[str]:
+         """Get pattern queries appropriate for the detected languages."""
+         # Common patterns for all languages
+         common_patterns = [
+             "Files with SQL injection vulnerabilities",
+             "Files with hardcoded credentials or secrets",
+             "Files with security vulnerabilities",
+             "Files with deprecated API usage"
+         ]
+
+         # Language-specific patterns
+         language_patterns = {
+             'python': [
+                 "Files using deprecated database libraries like MySQLdb",
+                 "Files using Python 2 print statements",
+                 "Files using deprecated urllib2 library",
+                 "Files missing type hints",
+                 "Files using old-style string formatting"
+             ],
+             'java': [
+                 "Files using deprecated Java APIs like Vector or Hashtable",
+                 "Files using raw JDBC without prepared statements",
+                 "Files missing try-with-resources for AutoCloseable",
+                 "Files using pre-Java 8 patterns without lambdas or streams",
+                 "Files using deprecated Date and Calendar APIs",
+                 "Files with missing null checks or Optional usage"
+             ],
+             'javascript': [
+                 "Files using var instead of let or const",
+                 "Files using callback patterns instead of Promises or async/await",
+                 "Files using jQuery for DOM manipulation",
+                 "Files with eval() usage",
+                 "Files using prototype-based inheritance"
+             ],
+             'typescript': [
+                 "Files with excessive any type usage",
+                 "Files missing strict null checks",
+                 "Files using old module syntax"
+             ],
+             'cpp': [
+                 "Files using raw pointers instead of smart pointers",
+                 "Files with manual memory management",
+                 "Files using C-style casts",
+                 "Files missing RAII patterns"
+             ],
+             'csharp': [
+                 "Files using deprecated .NET APIs",
+                 "Files missing async/await patterns",
+                 "Files using old collection types"
+             ],
+             'go': [
+                 "Files missing error handling",
+                 "Files with goroutine leaks",
+                 "Files missing context usage"
+             ],
+             'ruby': [
+                 "Files using deprecated Ruby syntax",
+                 "Files missing proper error handling"
+             ],
+             'php': [
+                 "Files using deprecated mysql_* functions",
+                 "Files missing prepared statements",
+                 "Files with register_globals usage"
+             ]
+         }
+
+         queries = common_patterns.copy()
+
+         for lang in languages:
+             if lang in language_patterns:
+                 queries.extend(language_patterns[lang])
+
+         return queries
+
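# Editor's sketch (not part of the committed file): how the two helpers above compose,
# using hypothetical file names and a subset of the full extension map. Extensions map
# to languages, and the query list is the common patterns plus the per-language patterns
# for every detected language; only the composition is illustrated here.
from pathlib import Path

files = ["app/server.py", "legacy/OrderDao.java", "web/index.jsx", "README.md"]
ext_to_lang = {".py": "python", ".java": "java", ".jsx": "javascript"}
languages = {ext_to_lang[e] for e in (Path(f).suffix.lower() for f in files) if e in ext_to_lang}
# languages == {"python", "java", "javascript"}; a list with no recognized extensions
# would fall back to {"python"}, as in _detect_languages_in_files above.

common = ["Files with SQL injection vulnerabilities", "Files with deprecated API usage"]
per_language = {
    "python": ["Files using Python 2 print statements"],
    "java": ["Files using raw JDBC without prepared statements"],
    "javascript": ["Files using var instead of let or const"],
}
queries = common.copy()
for lang in languages:
    queries.extend(per_language.get(lang, []))
# queries now holds 2 common + 3 language-specific prompts to feed the semantic search.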
+     def _generate_pr_summary(self, results: Dict, target_version: str) -> str:
+         """Generate PR summary from results."""
+         stats = results['statistics']
+
+         # Build coverage line only if coverage > 0
+         coverage_line = ""
+         if stats.get('average_coverage', 0) > 0:
+             coverage_line = f"**Code Coverage**: {stats['average_coverage']:.1f}%\n"
+
+         summary = f"""Automated migration to {target_version} with security fixes and performance improvements.
+
+ **Files Modernized**: {stats['files_modernized']} / {stats['total_files']}
+ **Tests Generated**: {stats['tests_generated']}
+ **Test Pass Rate**: {stats['test_pass_rate']:.1f}%
+ {coverage_line}
+ This PR includes:
+ - Syntax modernization to {target_version}
+ - Security vulnerability fixes
+ - Deprecated library replacements
+ - Comprehensive test suite
+ - Performance optimizations
+
+ All changes have been validated in an isolated sandbox environment.
+ """
+
+         return summary
+
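# Editor's sketch (not part of the committed file, hypothetical value): the coverage
# line above is built conditionally so that a run with no coverage data omits the
# bullet entirely instead of reporting "0.0%".
stats = {"average_coverage": 81.3}
coverage_line = ""
if stats.get("average_coverage", 0) > 0:
    coverage_line = f"**Code Coverage**: {stats['average_coverage']:.1f}%\n"
# coverage_line == "**Code Coverage**: 81.3%\n"; with average_coverage == 0 it stays "".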
+     def generate_report(self, results: Dict) -> str:
+         """
+         Generate human-readable report from results.
+
+         Args:
+             results: Workflow results dictionary
+
+         Returns:
+             Formatted report string
+         """
+         report = []
+         report.append("=" * 60)
+         report.append("LEGACY CODE MODERNIZATION REPORT")
+         report.append("=" * 60)
+         report.append("")
+
+         if results['success']:
+             report.append("✅ Status: SUCCESS")
+         else:
+             report.append("❌ Status: FAILED")
+
+         report.append("")
+         report.append("STATISTICS:")
+         report.append("-" * 60)
+
+         stats = results.get('statistics', {})
+         for key, value in stats.items():
+             # Skip average_coverage if it's 0
+             if key == 'average_coverage' and value == 0:
+                 continue
+             report.append(f" {key.replace('_', ' ').title()}: {value}")
+
+         # Add intelligent pattern statistics if available
+         classification_data = results.get('phases', {}).get('classification', {})
+         intelligent_stats = classification_data.get('intelligent_stats')
+         if intelligent_stats:
+             report.append("")
+             report.append("INTELLIGENT PATTERN ANALYSIS:")
+             report.append("-" * 60)
+             report.append(f" Patterns Detected: {intelligent_stats.get('patterns_detected', 0)}")
+             report.append(f" Average Modernization Score: {intelligent_stats.get('average_modernization_score', 0)}/100")
+             report.append(f" Total Estimated Effort: {intelligent_stats.get('total_estimated_effort_hours', 0)}h")
+
+             severity_counts = intelligent_stats.get('severity_counts', {})
+             if severity_counts:
+                 report.append(" Severity Breakdown:")
+                 for severity, count in severity_counts.items():
+                     if count > 0:
+                         report.append(f" {severity.upper()}: {count}")
+
+         report.append("")
+         report.append("PHASE RESULTS:")
+         report.append("-" * 60)
+
+         for phase, data in results.get('phases', {}).items():
+             report.append(f"\n {phase.upper()}:")
+             if isinstance(data, dict):
+                 for k, v in data.items():
+                     if k not in ['classifications', 'intelligent_stats']:  # Skip large data
+                         report.append(f" {k}: {v}")
+
+         # Add output locations
+         if results.get('output'):
+             report.append("")
+             report.append("OUTPUT LOCATIONS:")
+             report.append("-" * 60)
+             for key, path in results['output'].items():
+                 report.append(f" 📁 {key.replace('_', ' ').title()}: {path}")
+
+         if results.get('errors'):
+             report.append("")
+             report.append("ERRORS:")
+             report.append("-" * 60)
+             for error in results['errors']:
+                 report.append(f" ⚠️ {error}")
+
+         report.append("")
+         report.append("=" * 60)
+
+         return "\n".join(report)