import json
import random
import os

# --- generator configuration -------------------------------------------------
# Where the generated items are written, where the optional expanded noun
# vocabulary is read from, and how many items one run produces.
OUTPUT_FILE = "ecommerce_test_holdout.json"
NOUNS_FILE = "gen_nouns.json"
TEST_SIZE = 1000

# Built-in nouns per category; used when the expanded vocabulary is unavailable
# or lacks a category.
BASE_NOUNS = {
    "Toys": [
        "Doll", "Car", "Ball", "Puzzle", "Block", "Robot", "Teddy Bear",
        "Kite", "Game",
    ],
    "Stationery": [
        "Pen", "Pencil", "Notebook", "Paper", "Glue", "Eraser", "Ruler",
        "Marker",
    ],
    "Sports": [
        "Ball", "Racket", "Bat", "Helmet", "Jersey", "Net", "Gloves", "Mat",
    ],
    "Pet Supplies": [
        "Leash", "Collar", "Food", "Bowl", "Bed", "Toy", "Cage", "Tank",
    ],
    "Home & Kitchen": [
        "Pan", "Pot", "Knife", "Spoon", "Fork", "Plate", "Cup", "Towel",
    ],
    "Groceries": [
        "Bread", "Milk", "Rice", "Pasta", "Apple", "Banana", "Water", "Snack",
    ],
    "Electronics": [
        "Phone", "Laptop", "Mouse", "Keyboard", "Screen", "Cable", "Battery",
    ],
    "Clothing": [
        "Shirt", "Pants", "Shoe", "Hat", "Sock", "Jacket", "Coat", "Dress",
    ],
    "Books": [
        "Novel", "Textbook", "Guide", "Comic", "Journal", "Atlas", "Biography",
    ],
    "Beauty": [
        "Soap", "Cream", "Lotion", "Perfume", "Makeup", "Shampoo", "Brush",
    ],
    "Automotive": [
        "Tire", "Oil", "Wiper", "Mat", "Wax", "Cleaner", "Light", "Filter",
    ],
    "Accessories": [
        "Bag", "Wallet", "Watch", "Ring", "Glasses", "Belt", "Scarf",
    ],
}

# Category labels, in the same order as the keys of BASE_NOUNS.
CATEGORIES = list(BASE_NOUNS)

# Descriptive modifiers mixed into product names.
ADJECTIVES = [
    "Premium", "Deluxe", "Basic", "Vintage", "Modern", "Eco-friendly",
    "Handmade", "Digital", "Analog", "Portable", "Heavy-Duty", "Lightweight",
    "Compact", "Professional", "Essential", "Luxury", "Smart", "Wireless",
    "Ergonomic", "Durable", "Waterproof", "Adjustable", "Foldable",
    "Rechargeable", "Electric", "Solar-Powered", "Magnetic",
]

# Color words mixed into product names.
COLORS = [
    "Red", "Blue", "Green", "Black", "White", "Silver", "Gold", "Rose Gold",
    "Matte Black", "Navy", "Teal", "Pink", "Purple", "Yellow", "Orange",
    "Grey", "Transparent", "Multicolor",
]

# Material words mixed into product names.
# NOTE(review): "Silicon" may be intended as "Silicone" -- confirm against the
# vocabulary used elsewhere before changing it.
MATERIALS = [
    "Plastic", "Metal", "Wooden", "Leather", "Cotton", "Silk", "Ceramic",
    "Glass", "Stainless Steel", "Aluminum", "Carbon Fiber", "Silicon",
    "Bamboo", "Nylon", "Velvet",
]

def load_nouns():
    """Return the category -> noun-list vocabulary used to build product names.

    Tries to read NOUNS_FILE as JSON. Falls back to BASE_NOUNS (printing the
    reason) when the file is missing, cannot be read or decoded, is not valid
    JSON, or does not hold a JSON object.

    Returns:
        dict: the parsed contents of NOUNS_FILE, or BASE_NOUNS on fallback.
    """
    try:
        # Open directly instead of checking existence first: avoids the
        # check-then-open race and also covers unreadable paths.
        with open(NOUNS_FILE, 'r', encoding='utf-8') as f:
            print(f"Loading expanded vocabulary from {NOUNS_FILE}...")
            vocab = json.load(f)
    except FileNotFoundError:
        print("Expanded nouns file not found. Using basic hardcoded nouns.")
        return BASE_NOUNS
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Could not load {NOUNS_FILE} ({exc}). Using basic hardcoded nouns.")
        return BASE_NOUNS

    # Callers look categories up with .get(), so anything but a mapping is unusable.
    if not isinstance(vocab, dict):
        print(f"{NOUNS_FILE} does not contain a JSON object. Using basic hardcoded nouns.")
        return BASE_NOUNS
    return vocab

def _build_product_name(noun, rng):
    """Compose a product name around *noun* using one of four random templates.

    *rng* is anything exposing randint/choice/sample (the random module itself
    or a random.Random instance).
    """
    # we are intentionally creating messy, chaotic combinations here.
    # if the model memorized phrases, this will break it.
    structure = rng.randint(1, 4)

    if structure == 1:
        # maximalist: color + adj + mat + noun.
        return f"{rng.choice(COLORS)} {rng.choice(ADJECTIVES)} {rng.choice(MATERIALS)} {noun}"
    if structure == 2:
        # material first. weird english, but valid.
        return f"{rng.choice(MATERIALS)} {rng.choice(ADJECTIVES)} {noun}"
    if structure == 3:
        # double color with conjunction. sample() guarantees two *different*
        # colors, so we never emit e.g. "Red and Red Ball".
        first_color, second_color = rng.sample(COLORS, 2)
        return f"{first_color} and {second_color} {noun}"
    # standard adj + noun.
    return f"{rng.choice(ADJECTIVES)} {noun}"


def generate_test_data(seed=None):
    """Generate TEST_SIZE random product/category pairs and write them to OUTPUT_FILE.

    Each item is a dict with keys "product" (a randomly templated name) and
    "category" (one of CATEGORIES). The list is dumped as indented JSON.

    Args:
        seed: optional seed for a private random generator, making the output
            reproducible. When None (the default) the shared ``random`` module
            state is used, exactly as before.
    """
    vocab = load_nouns()
    rng = random if seed is None else random.Random(seed)
    test_items = []

    print(f"Generating {TEST_SIZE} unseen test items...")

    for _ in range(TEST_SIZE):
        cat = rng.choice(CATEGORIES)

        # if the category is missing *or its list is empty*, fall back to the
        # built-in nouns and finally to "Item" so choice() never sees an
        # empty sequence.
        noun_list = vocab.get(cat) or BASE_NOUNS.get(cat) or ["Item"]
        noun = rng.choice(noun_list)

        test_items.append({"product": _build_product_name(noun, rng), "category": cat})

    print(f"Saving to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(test_items, f, indent=4)
    print("Done.")

# Run the generator only when this file is executed as a script, not on import.
if __name__ == "__main__":
    generate_test_data()