Spaces:

Prithvigg
/

queryforge

Sleeping

File size: 4,645 Bytes

"""
Data models for the QueryForge SQL environment.

SQLAction    — the agent's submitted SQL query.
SQLObservation — task description + grading feedback returned after each step.
TaskSpec     — payload for registering a custom task via POST /tasks.
"""

from typing import Any, Dict, List, Optional

from openenv.core.env_server.types import Action, Observation
from pydantic import BaseModel, Field


class SQLAction(Action):
    """A single agent move: one SQL statement handed over for evaluation."""

    # Declared with the Ellipsis marker, i.e. no default value is supplied.
    sql: str = Field(
        default=...,
        description="The SQL query to submit for grading",
    )


class SQLObservation(Observation):
    """What the environment hands back after reset() or step().

    Every field carries a default, so an instance can be built with only the
    values that are known at that point.
    """

    # ── Which task is active ─────────────────────────────────────────────────
    task_id: str = Field("", description="Active task identifier")
    task_level: str = Field(
        "",
        description="Difficulty: easy | medium | hard | expert",
    )
    task_title: str = Field("", description="Human-readable task title")
    task_description: str = Field(
        "",
        description="Full task description: schema, broken query, error message, and goal",
    )

    # ── Grading outcome for the latest submission ────────────────────────────
    syntax_valid: bool = Field(
        False,
        description="True if the submitted query parsed without error",
    )
    execution_success: bool = Field(
        False,
        description="True if the query ran to completion in DuckDB",
    )
    # None is the default here; a string carries the runtime error text.
    execution_error: Optional[str] = Field(
        None,
        description="Runtime error message, if any",
    )
    rows_returned: int = Field(0, description="Number of rows the query returned")
    feedback: str = Field(
        "",
        description="Detailed grading feedback from DuckDB + AI judge",
    )
    hint: str = Field("", description="Actionable hint for the next attempt")

    # ── Running totals for the episode ───────────────────────────────────────
    attempt: int = Field(0, description="Number of queries submitted this episode")
    best_score: float = Field(
        0.0,
        description="Highest score achieved so far this episode",
    )


class TaskSpec(BaseModel):
    """
    Payload for registering a custom SQL task via POST /tasks
    or directly via REGISTRY.register(task_from_dict(spec.model_dump())).

    Required: id, title, schema_ddl, expected_rows
    Every other field has a default and may be omitted.
    """

    # ── Identity and presentation ────────────────────────────────────────────
    id: str = Field(
        ..., description="Unique task identifier, e.g. 'null_handling_task'"
    )
    # Free-form string (not validated against a fixed set); the description
    # lists the same labels SQLObservation.task_level reports, plus "custom".
    level: str = Field(
        default="custom",
        description="Difficulty label: easy | medium | hard | expert | custom",
    )
    # Required: declared with `...`, so a payload without a title is rejected.
    title: str = Field(..., description="Human-readable task title")
    description: str = Field(
        default="",
        description="Full task description shown to the agent (schema, goal, etc.)",
    )

    # ── Database fixture and the problem statement ───────────────────────────
    schema_ddl: str = Field(
        ...,
        description="CREATE TABLE + INSERT statements to seed the DuckDB test DB",
    )
    broken_query: str = Field(
        default="",
        description="The broken or slow query the agent must fix",
    )
    error_message: str = Field(
        default="",
        description="Error or performance warning shown to the agent alongside the task",
    )
    hint: str = Field(
        default="",
        description="Actionable hint surfaced in the observation after each wrong attempt",
    )

    # ── Grading inputs ───────────────────────────────────────────────────────
    # Each row is a mapping of column name -> value.
    expected_rows: List[Dict[str, Any]] = Field(
        ...,
        description=(
            "Exact rows the correct query must return. "
            "Used for deterministic row-match scoring."
        ),
    )
    # Defaults to None, i.e. no sort columns are specified.
    order_by: Optional[str] = Field(
        default=None,
        description="Comma-separated column names used to sort rows before comparison",
    )
    solution_query: str = Field(
        default="",
        description="Reference solution shown to the AI judge for quality scoring",
    )
    test_description: str = Field(
        default="Custom test case",
        description="One-line description of what the test case checks",
    )

    # ── Episode limits ───────────────────────────────────────────────────────
    # Constrained to the inclusive range 1..20 by the ge/le bounds.
    max_steps: int = Field(
        default=5, ge=1, le=20,
        description="Maximum number of step() calls allowed per episode",
    )