jamesaasher
/

quickdraw-text-diffusion

+# Text-Conditional Diffusion Configuration
+IMAGE_SIZE = 64
+BATCH_SIZE = 128  # 32 per GPU across 4 GPUs
+NUM_EPOCHS = 100  # Longer training for better text conditioning
+LEARNING_RATE = 1e-4
+TIMESTEPS = 1000
+# Model architecture
+CHANNELS = 256  # Maximum capacity for best feature learning
+TIME_DIM = 128
+TEXT_DIM = 512  # CLIP embedding dimension
+# Text encoder
+CLIP_MODEL = "openai/clip-vit-base-patch32"
+FREEZE_CLIP = True  # Freeze CLIP weights during training
+# Classifier-free guidance
+CFG_DROP_PROB = 0.15  # Increased to 15% for stronger conditioning contrast
+CFG_GUIDANCE_SCALE = 5.0  # Higher default guidance scale
+# Data
+DATASET_NAME = "Xenova/quickdraw-small"
+MAX_SAMPLES = None  # Set to None to use all samples, or specify a number
+NUM_CLASSES_FILTER = 5  # Small set for clear proof-of-concept
+NUM_CLASSES = None  # Will be set after loading dataset