Upload folder using huggingface_hub

- __pycache__/complex_json_output.cpython-312.pyc +0 -0
- complex_json_output.py +337 -0
- pyproject.toml +14 -0
- train_complex_json_output.py +101 -0
__pycache__/complex_json_output.cpython-312.pyc
ADDED
Binary file (11.7 kB)
complex_json_output.py
ADDED
@@ -0,0 +1,337 @@
import json
import re

from datasets import load_dataset

import verifiers as vf


def load_environment(
    num_train_examples=7000,
    num_eval_examples=1000,
    **kwargs,
):
    """
    Environment for verifying complex JSON output from models.

    The task requires models to:
    1. Parse multi-question prompts
    2. Generate valid JSON responses
    3. Match the expected structure with correct keys and values

    Reward structure (multiplicative, to prevent local optima where only one
    term is satisfied):
    - If the JSON fails to parse: reward = 0
    - Otherwise:
      * key_accuracy = correct_keys / total_keys_in_response
      * value_accuracy = correct_values / total_values_in_response
      * final_reward = key_accuracy * value_accuracy

    This penalizes both missing keys/values AND adding extra incorrect ones.
    """

    # Load the dataset from the Hugging Face Hub
    dataset = load_dataset("Delta-Vector/Tauri-Complex-JSON-Formatting", split="train")

    # Map to the expected format; keep verification_info as a JSON string to avoid schema issues
    def format_example(example):
        return {
            "question": example["prompt"],
            "info": {"verification_info": example["verification_info"]},
        }

    dataset = dataset.map(format_example, remove_columns=dataset.column_names)

    # Split into train and eval
    train_dataset = dataset.select(range(num_train_examples))
    eval_dataset = dataset.select(range(num_train_examples, num_train_examples + num_eval_examples))
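    # Note: select() raises if the split holds fewer than
    # num_train_examples + num_eval_examples rows, so the defaults assume
    # the dataset contains at least 8000 examples.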

    # Custom extract function to pull JSON out of code blocks or raw text
    def extract_json_from_completion(completion):
        """Extract JSON from a completion, handling fenced code blocks."""
        if not completion:
            return ""

        # Get the last message content
        if isinstance(completion, list) and len(completion) > 0:
            content = completion[-1].get("content", "")
        else:
            content = str(completion)

        # Try to extract from code blocks first (```json ... ``` or ``` ... ```)
        code_block_pattern = r"```(?:json)?\s*\n(.*?)\n```"
        matches = re.findall(code_block_pattern, content, re.DOTALL)
        if matches:
            return matches[-1].strip()  # Return the last code block

        # Otherwise return the content as-is
        return content.strip()

    # Use a simple Parser with the custom extract function
    parser = vf.Parser(extract_fn=extract_json_from_completion)
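    # For example (illustrative message, not from the dataset):
    #   extract_json_from_completion(
    #       [{"role": "assistant", "content": 'Sure:\n```json\n{"a": 1}\n```'}]
    #   )
    # returns '{"a": 1}', while a completion with no code block is returned
    # stripped of surrounding whitespace.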

    # --- Helpers shared by the reward functions below ---

    def get_all_keys(d, prefix=""):
        """Collect every key in a nested dict as a dotted path (assumes keys contain no dots)."""
        keys = set()
        if isinstance(d, dict):
            for k, v in d.items():
                full_key = f"{prefix}.{k}" if prefix else k
                keys.add(full_key)
                keys.update(get_all_keys(v, full_key))
        return keys

    def get_value_at_path(d, path):
        """Get the value at a dotted key path like 'a.b.c', or None if absent."""
        current = d
        try:
            for key in path.split("."):
                current = current[key]
            return current
        except (KeyError, TypeError):
            return None

    def values_equal(a, b):
        """Compare values; Python's == already treats numerics like 25 and 25.0 as equal."""
        return a == b

    def get_all_values(d):
        """Collect all leaf values recursively (kept for debugging; the rewards below compare per key path instead)."""
        values = []
        if isinstance(d, dict):
            for v in d.values():
                if isinstance(v, dict):
                    values.extend(get_all_values(v))
                elif isinstance(v, list):
                    for item in v:
                        values.extend(get_all_values({"_": item}))
                else:
                    values.append(v)
        return values

    def multiplicative_reward(completion, info, **kwargs) -> float:
        """
        Multiplicative reward: key_accuracy * value_accuracy.

        Returns 0 if the JSON fails to parse.
        Otherwise:
        - key_accuracy = correct_keys / total_keys_in_response
        - value_accuracy = correct_values / total_values_in_response
        - final_reward = key_accuracy * value_accuracy

        This penalizes both missing correct items AND adding extra incorrect ones.
        """
        try:
            response = (parser.parse_answer(completion) or "").strip()

            # Check: non-empty, valid JSON
            if not response:
                return 0.0

            try:
                parsed_response = json.loads(response)
            except (json.JSONDecodeError, ValueError):
                return 0.0

            # Must be a dict
            if not isinstance(parsed_response, dict):
                return 0.0

            # Parse the ground truth from info
            verification_info = json.loads(info["verification_info"])
            ground_truth = verification_info["ground_truth"]

            ground_truth_keys = get_all_keys(ground_truth)
            response_keys = get_all_keys(parsed_response)

            # Key accuracy: correct keys over all keys the response produced
            if len(response_keys) == 0:
                key_accuracy = 0.0
            else:
                correct_keys = len(ground_truth_keys & response_keys)  # Intersection
                key_accuracy = correct_keys / len(response_keys)

            # Value accuracy: every response key is checked, so keys absent from
            # the ground truth count against the score. Note that a ground-truth
            # value of null is indistinguishable from a missing key here.
            total_values_checked = len(response_keys)

            if total_values_checked == 0:
                value_accuracy = 0.0
            else:
                correct_values = 0
                for key_path in response_keys:
                    response_val = get_value_at_path(parsed_response, key_path)
                    ground_truth_val = get_value_at_path(ground_truth, key_path)

                    # Count it if the key exists in the ground truth and the values match
                    if ground_truth_val is not None and values_equal(response_val, ground_truth_val):
                        correct_values += 1

                value_accuracy = correct_values / total_values_checked

            # Multiply the two accuracies together
            return key_accuracy * value_accuracy

        except (json.JSONDecodeError, ValueError, AttributeError, TypeError, KeyError):
            return 0.0
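    # Note that a response of just "{}" parses but contains no keys, so both
    # accuracies are 0 and the reward is 0: emitting an empty-but-valid JSON
    # object earns nothing.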

    def format_reward(completion, **kwargs) -> float:
        """
        Reward for valid JSON formatting.
        Returns 0.33 for a valid JSON dict, 0 otherwise.
        """
        try:
            response = (parser.parse_answer(completion) or "").strip()

            # Check that the response is not empty
            if not response:
                return 0.0

            # Try to parse as JSON
            parsed = json.loads(response)

            # Must be a dict (since the ground truth is always a dict)
            if not isinstance(parsed, dict):
                return 0.0

            return 0.33
        except (json.JSONDecodeError, ValueError, TypeError):
            return 0.0

    def keys_match_reward(completion, info, **kwargs) -> float:
        """
        Metric: key accuracy (correct_keys / total_keys_in_response).
        Returns the same key_accuracy used in multiplicative_reward.
        """
        try:
            response = (parser.parse_answer(completion) or "").strip()

            if not response:
                return 0.0

            parsed_response = json.loads(response)

            if not isinstance(parsed_response, dict):
                return 0.0

            # Parse the ground truth from info
            verification_info = json.loads(info["verification_info"])
            ground_truth = verification_info["ground_truth"]

            ground_truth_keys = get_all_keys(ground_truth)
            response_keys = get_all_keys(parsed_response)

            if len(response_keys) == 0:
                return 0.0

            correct_keys = len(ground_truth_keys & response_keys)
            return correct_keys / len(response_keys)

        except (json.JSONDecodeError, ValueError, AttributeError, TypeError, KeyError):
            return 0.0

    def values_match_reward(completion, info, **kwargs) -> float:
        """
        Metric: value accuracy (correct_values / total_values_in_response).
        Returns the same value_accuracy used in multiplicative_reward.
        """
        try:
            response = (parser.parse_answer(completion) or "").strip()

            if not response:
                return 0.0

            parsed_response = json.loads(response)

            if not isinstance(parsed_response, dict):
                return 0.0

            # Parse the ground truth from info
            verification_info = json.loads(info["verification_info"])
            ground_truth = verification_info["ground_truth"]

            response_keys = get_all_keys(parsed_response)

            if len(response_keys) == 0:
                return 0.0

            correct_values = 0
            for key_path in response_keys:
                response_val = get_value_at_path(parsed_response, key_path)
                ground_truth_val = get_value_at_path(ground_truth, key_path)

                if ground_truth_val is not None and values_equal(response_val, ground_truth_val):
                    correct_values += 1

            return correct_values / len(response_keys)

        except (json.JSONDecodeError, ValueError, AttributeError, TypeError, KeyError):
            return 0.0

    # Create the rubric. The multiplicative reward drives training; the other
    # functions are logged as metrics only (weight 0) for debugging.
    rubric = vf.Rubric(
        parser=parser,
        funcs=[
            multiplicative_reward,  # Main reward: key_acc * value_acc
            format_reward,          # Metric only (weight 0)
            keys_match_reward,      # Metric only (weight 0)
            values_match_reward,    # Metric only (weight 0)
        ],
        weights=[1.0, 0.0, 0.0, 0.0],  # Only multiplicative_reward counts
    )

    # Return a SingleTurnEnv since this is a one-shot task.
    # No system prompt: the dataset prompt speaks for itself.
    vf_env = vf.SingleTurnEnv(
        dataset=train_dataset,
        eval_dataset=eval_dataset,
        parser=parser,
        rubric=rubric,
    )

    return vf_env
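To make the scoring concrete outside the environment, here is a self-contained
sketch of the same key/value accuracy computation on a toy pair (standalone
Python; the data is illustrative and independent of the verifiers API):

    import json

    ground_truth = {"name": "Ada", "stats": {"age": 36}}
    response = json.loads('{"name": "Ada", "stats": {"age": 37}, "extra": 1}')

    def dotted_keys(d, prefix=""):
        out = set()
        if isinstance(d, dict):
            for k, v in d.items():
                p = f"{prefix}.{k}" if prefix else k
                out |= {p} | dotted_keys(v, p)
        return out

    def at(d, path):
        for k in path.split("."):
            if not isinstance(d, dict) or k not in d:
                return None
            d = d[k]
        return d

    gt, resp = dotted_keys(ground_truth), dotted_keys(response)
    key_acc = len(gt & resp) / len(resp)  # 3/4: "extra" is spurious
    val_acc = sum(
        at(ground_truth, p) is not None and at(ground_truth, p) == at(response, p)
        for p in resp
    ) / len(resp)  # 1/4: only "name" matches at its path
    print(key_acc * val_acc)  # 0.1875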

pyproject.toml
ADDED
@@ -0,0 +1,14 @@
[project]
name = "complex-json-output"
description = "Environment for verifying complex JSON output formatting and correctness"
tags = ["json", "instruction-following", "verifiable-reward", "train", "eval"]
version = "0.1.0"
requires-python = ">=3.10"
dependencies = [
    "verifiers>=0.1.5.post0",
    "datasets",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

train_complex_json_output.py
ADDED
@@ -0,0 +1,101 @@
import verifiers as vf

"""
# install
vf-install complex-json-output (-p /path/to/environments)

# quick eval
vf-eval complex-json-output (-m model_name in endpoints.py)

inference:
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5 vf-vllm --model Qwen/Qwen2.5-1.5B-Instruct \
    --data-parallel-size 6 --enforce-eager --disable-log-requests

training:
CUDA_VISIBLE_DEVICES=6,7 accelerate launch --num-processes 2 \
    --config-file configs/zero3.yaml examples/grpo/train_complex_json_output.py
"""

# Hyperparameters set below, listed here for quick reference
HPARAMS = [
    "per_device_train_batch_size",
    "num_generations",
    "gradient_accumulation_steps",
    "max_tokens",
    "max_seq_len",
    "max_prompt_length",
    "max_completion_length",
    "temperature",
    "learning_rate",
    "max_steps",
    "warmup_steps",
    "eval_steps",
    "save_steps",
    "beta",
    "loss_type",
]

# Load the environment
vf_env = vf.load_environment(
    env_id="complex-json-output",
    num_train_examples=8000,  # Use a subset for faster training
    num_eval_examples=50,
)

# Model configuration
model_name = "/raid/workspace/Mango/verifiers/MS3.2-0.35-Beta"
run_name = "complex-json-grpo_" + model_name.split("/")[-1].lower()

# Load model and tokenizer
model, tokenizer = vf.get_model_and_tokenizer(model_name)

# Training arguments
training_args = vf.grpo_defaults(run_name=run_name)

# Batch configuration
training_args.per_device_train_batch_size = 2
training_args.num_generations = 16
training_args.gradient_accumulation_steps = 2

# Generation parameters
training_args.max_tokens = 2048  # JSON output can be long
training_args.max_seq_len = 16000
training_args.max_prompt_length = 8192  # Allow long prompts (questions can be lengthy)
training_args.max_completion_length = 4096  # Allow long completions
training_args.temperature = 1.0  # Full diversity for exploration
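# Note: max_prompt_length + max_completion_length = 12288, which fits inside
# max_seq_len = 16000 with headroom to spare.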

# Training schedule
training_args.learning_rate = 5e-6
training_args.max_steps = 1000
training_args.warmup_steps = 15

# Evaluation (disabled; eval_steps is ignored while eval_strategy is "none")
training_args.eval_strategy = "none"
training_args.eval_steps = 50
training_args.per_device_eval_batch_size = 8

# Checkpointing
training_args.save_strategy = "steps"
training_args.save_steps = 100

# GRPO parameters
training_args.beta = 0.001  # Conservative KL penalty
training_args.loss_type = "dr_grpo"  # Recommended: no length bias

# Logging
training_args.logging_steps = 1
training_args.log_completions = True
training_args.num_completions_to_print = 3
training_args.report_to = "wandb"  # Log to Weights & Biases

# Create the trainer
trainer = vf.GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    env=vf_env,
    args=training_args,
    peft_config=vf.lora_defaults(r=8, alpha=16),  # Use LoRA for efficiency
)

# Train
trainer.train()
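A possible follow-up after training (a sketch; it assumes vf.GRPOTrainer
exposes the standard transformers Trainer saving API, which this script does
not confirm, and the output path is illustrative):

    # Persist the LoRA adapter and tokenizer for later use
    trainer.save_model("outputs/" + run_name)
    tokenizer.save_pretrained("outputs/" + run_name)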