"""Generate a holdout test set of synthetic e-commerce product names.

Each item pairs a randomly composed product name (built from colors,
adjectives, materials and a category-specific noun) with its category label.
The result is written to ``OUTPUT_FILE`` as a JSON list of
``{"product": ..., "category": ...}`` objects.
"""
import json
import os
import random

# config
OUTPUT_FILE = "ecommerce_test_holdout.json"
NOUNS_FILE = "gen_nouns.json"
TEST_SIZE = 1000

CATEGORIES = [
    "Toys", "Stationery", "Sports", "Pet Supplies", "Home & Kitchen",
    "Groceries", "Electronics", "Clothing", "Books", "Beauty",
    "Automotive", "Accessories"
]

# Built-in nouns per category; used when the expanded nouns file is missing
# or does not provide a usable list for a category.
BASE_NOUNS = {
    "Toys": ["Doll", "Car", "Ball", "Puzzle", "Block", "Robot", "Teddy Bear", "Kite", "Game"],
    "Stationery": ["Pen", "Pencil", "Notebook", "Paper", "Glue", "Eraser", "Ruler", "Marker"],
    "Sports": ["Ball", "Racket", "Bat", "Helmet", "Jersey", "Net", "Gloves", "Mat"],
    "Pet Supplies": ["Leash", "Collar", "Food", "Bowl", "Bed", "Toy", "Cage", "Tank"],
    "Home & Kitchen": ["Pan", "Pot", "Knife", "Spoon", "Fork", "Plate", "Cup", "Towel"],
    "Groceries": ["Bread", "Milk", "Rice", "Pasta", "Apple", "Banana", "Water", "Snack"],
    "Electronics": ["Phone", "Laptop", "Mouse", "Keyboard", "Screen", "Cable", "Battery"],
    "Clothing": ["Shirt", "Pants", "Shoe", "Hat", "Sock", "Jacket", "Coat", "Dress"],
    "Books": ["Novel", "Textbook", "Guide", "Comic", "Journal", "Atlas", "Biography"],
    "Beauty": ["Soap", "Cream", "Lotion", "Perfume", "Makeup", "Shampoo", "Brush"],
    "Automotive": ["Tire", "Oil", "Wiper", "Mat", "Wax", "Cleaner", "Light", "Filter"],
    "Accessories": ["Bag", "Wallet", "Watch", "Ring", "Glasses", "Belt", "Scarf"]
}

ADJECTIVES = [
    "Premium", "Deluxe", "Basic", "Vintage", "Modern", "Eco-friendly",
    "Handmade", "Digital", "Analog", "Portable", "Heavy-Duty", "Lightweight",
    "Compact", "Professional", "Essential", "Luxury", "Smart", "Wireless",
    "Ergonomic", "Durable", "Waterproof", "Adjustable", "Foldable",
    "Rechargeable", "Electric", "Solar-Powered", "Magnetic"
]

COLORS = [
    "Red", "Blue", "Green", "Black", "White", "Silver", "Gold", "Rose Gold",
    "Matte Black", "Navy", "Teal", "Pink", "Purple", "Yellow", "Orange",
    "Grey", "Transparent", "Multicolor"
]

MATERIALS = [
    "Plastic", "Metal", "Wooden", "Leather", "Cotton", "Silk", "Ceramic",
    "Glass", "Stainless Steel", "Aluminum", "Carbon Fiber", "Silicon",
    "Bamboo", "Nylon", "Velvet"
]


def load_nouns():
    """Return the category -> noun-list vocabulary used for generation.

    Tries to load the expanded list from ``NOUNS_FILE``. Falls back to
    ``BASE_NOUNS`` when the file is missing, cannot be read or parsed, or
    does not contain a JSON object at the top level.
    """
    if not os.path.exists(NOUNS_FILE):
        print("Expanded nouns file not found. Using basic hardcoded nouns.")
        return BASE_NOUNS

    print(f"Loading expanded vocabulary from {NOUNS_FILE}...")
    try:
        with open(NOUNS_FILE, 'r', encoding='utf-8') as f:
            vocab = json.load(f)
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Could not read {NOUNS_FILE} ({err}). Using basic hardcoded nouns.")
        return BASE_NOUNS

    if not isinstance(vocab, dict):
        # The generator looks nouns up by category name, so anything other
        # than a JSON object is unusable.
        print(f"{NOUNS_FILE} is not a JSON object. Using basic hardcoded nouns.")
        return BASE_NOUNS
    return vocab


def _nouns_for(vocab, cat):
    """Return a non-empty noun list for *cat*.

    Prefers the entry in *vocab*, then ``BASE_NOUNS``, then ``["Item"]``.
    An entry that is missing, empty, or not a list is skipped so that
    picking a noun can never fail on an empty sequence.
    """
    for source in (vocab, BASE_NOUNS):
        candidates = source.get(cat)
        if isinstance(candidates, list) and candidates:
            return candidates
    return ["Item"]


def _compose_name(noun, rng):
    """Build one product name around *noun* using one of four templates.

    We are intentionally creating messy, chaotic combinations here: if the
    model memorized phrases, this will break it. The order of the random
    draws inside each template is significant for seeded reproducibility.
    """
    structure = rng.randint(1, 4)
    if structure == 1:
        # maximalist: color + adj + mat + noun.
        return f"{rng.choice(COLORS)} {rng.choice(ADJECTIVES)} {rng.choice(MATERIALS)} {noun}"
    if structure == 2:
        # material first. weird english, but valid.
        return f"{rng.choice(MATERIALS)} {rng.choice(ADJECTIVES)} {noun}"
    if structure == 3:
        # double color with conjunction. confusing for naive models.
        return f"{rng.choice(COLORS)} and {rng.choice(COLORS)} {noun}"
    # standard adj + noun.
    return f"{rng.choice(ADJECTIVES)} {noun}"


def _build_items(vocab, size, rng):
    """Return *size* ``{"product", "category"}`` dicts drawn using *rng*."""
    items = []
    for _ in range(size):
        cat = rng.choice(CATEGORIES)
        noun = rng.choice(_nouns_for(vocab, cat))
        items.append({"product": _compose_name(noun, rng), "category": cat})
    return items


def generate_test_data(seed=None):
    """Generate ``TEST_SIZE`` random test items and write them to ``OUTPUT_FILE``.

    Args:
        seed: Optional seed for a private random generator, making the
            output reproducible. When ``None`` (the default) the shared
            module-level ``random`` state is used, so callers that seed it
            with ``random.seed(...)`` keep their existing behavior.
    """
    vocab = load_nouns()
    # The ``random`` module exposes the same choice/randint API as a
    # ``random.Random`` instance, so either can serve as the generator.
    rng = random if seed is None else random.Random(seed)

    print(f"Generating {TEST_SIZE} unseen test items...")
    test_items = _build_items(vocab, TEST_SIZE, rng)

    print(f"Saving to {OUTPUT_FILE}...")
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
        json.dump(test_items, f, indent=4)
    print("Done.")


if __name__ == "__main__":
    generate_test_data()