Spaces:

Uday
/

ctm-energy-based-halting

Paused

App Files Files Community

Uday committed 16 days ago

Commit

00d1de8

1 Parent(s): 32e3089

Added model training artifact dashboard and saving artifacts

Browse files

Files changed (3) hide show

index.html +281 -12
tasks/image_classification/train_energy.py +31 -2
verify_dashboard.py +77 -0

index.html CHANGED Viewed

@@ -1,27 +1,296 @@
 <!DOCTYPE html>
-<html>
   <head>
-    <title>CTM Training Status</title>
     <style>
       body {
-        font-family: sans-serif;
         text-align: center;
-        padding: 50px;
       }
       h1 {
-        color: #333;
       }
-      p {
         color: #666;
       }
     </style>
   </head>
   <body>
-    <h1>Training in Progress</h1>
-    <p>
-      The Continuous Thought Machine energy-based halting experiment is
-      currently training.
-    </p>
-    <p>Please check the <strong>Logs</strong> tab for real-time updates.</p>
   </body>
 </html>

 <!DOCTYPE html>
+<html lang="en">
   <head>
+    <meta charset="UTF-8" />
+    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
+    <title>CTM Training Dashboard</title>
     <style>
+      :root {
+        --bg-color: #f4f4f9;
+        --card-bg: #ffffff;
+        --text-color: #333;
+        --accent-color: #4a90e2;
+        --success-color: #2ecc71;
+        --font-family: "Segoe UI", Tahoma, Geneva, Verdana, sans-serif;
+      }
       body {
+        font-family: var(--font-family);
+        background-color: var(--bg-color);
+        color: var(--text-color);
+        margin: 0;
+        padding: 20px;
+        line-height: 1.6;
+      }
+      .container {
+        max-width: 1200px;
+        margin: 0 auto;
+      }
+      header {
         text-align: center;
+        margin-bottom: 40px;
       }
       h1 {
+        color: var(--accent-color);
+        margin-bottom: 10px;
+      }
+      .status-card {
+        background: var(--card-bg);
+        padding: 20px;
+        border-radius: 8px;
+        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+        margin-bottom: 30px;
+        display: grid;
+        grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+        gap: 20px;
+        text-align: center;
       }
+      .metric {
+        display: flex;
+        flex-direction: column;
+      }
+      .metric-label {
+        font-size: 0.9em;
         color: #666;
       }
+      .metric-value {
+        font-size: 1.5em;
+        font-weight: bold;
+        color: var(--text-color);
+      }
+      .plots-grid {
+        display: grid;
+        grid-template-columns: repeat(auto-fit, minmax(500px, 1fr));
+        gap: 20px;
+        margin-bottom: 30px;
+      }
+      .plot-card {
+        background: var(--card-bg);
+        padding: 15px;
+        border-radius: 8px;
+        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+      }
+      .plot-card img {
+        width: 100%;
+        height: auto;
+        border-radius: 4px;
+      }
+      .artifacts-section {
+        background: var(--card-bg);
+        padding: 20px;
+        border-radius: 8px;
+        box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+        text-align: center;
+        margin-bottom: 30px;
+      }
+      .btn {
+        display: inline-block;
+        padding: 10px 20px;
+        background-color: var(--accent-color);
+        color: white;
+        text-decoration: none;
+        border-radius: 5px;
+        margin: 0 10px;
+        transition: background-color 0.3s;
+      }
+      .btn:hover {
+        background-color: #357abd;
+      }
+      .gallery {
+        display: grid;
+        grid-template-columns: repeat(auto-fill, minmax(200px, 1fr));
+        gap: 15px;
+        margin-top: 20px;
+      }
+      .gallery img {
+        width: 100%;
+        border-radius: 4px;
+        box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
+      }
+      footer {
+        text-align: center;
+        margin-top: 50px;
+        color: #888;
+        font-size: 0.9em;
+      }
+      #last-updated {
+        font-size: 0.8em;
+        color: #999;
+        margin-top: 5px;
+      }
     </style>
   </head>
   <body>
+    <div class="container">
+      <header>
+        <h1>CTM Training Dashboard</h1>
+        <p>Real-time monitoring of Energy-Based Halting Experiment</p>
+        <div id="last-updated">Waiting for data...</div>
+      </header>
+      <div class="status-card" id="metrics-container">
+        <div class="metric">
+          <span class="metric-label">Iteration</span>
+          <span class="metric-value" id="iter">--</span>
+        </div>
+        <div class="metric">
+          <span class="metric-label">Epoch</span>
+          <span class="metric-value" id="epoch">--</span>
+        </div>
+        <div class="metric">
+          <span class="metric-label">Train Loss</span>
+          <span class="metric-value" id="train-loss">--</span>
+        </div>
+        <div class="metric">
+          <span class="metric-label">Test Loss</span>
+          <span class="metric-value" id="test-loss">--</span>
+        </div>
+        <div class="metric">
+          <span class="metric-label">Train Acc</span>
+          <span class="metric-value" id="train-acc">--</span>
+        </div>
+        <div class="metric">
+          <span class="metric-label">Test Acc</span>
+          <span class="metric-value" id="test-acc">--</span>
+        </div>
+      </div>
+      <div class="plots-grid">
+        <div class="plot-card">
+          <h3>Loss History</h3>
+          <img
+            id="loss-plot"
+            src="logs/scratch/losses.png"
+            alt="Loss Plot"
+            onerror="this.src='https://via.placeholder.com/600x400?text=Waiting+for+Plots'"
+          />
+        </div>
+        <div class="plot-card">
+          <h3>Accuracy History</h3>
+          <img
+            id="acc-plot"
+            src="logs/scratch/accuracies.png"
+            alt="Accuracy Plot"
+            onerror="this.src='https://via.placeholder.com/600x400?text=Waiting+for+Plots'"
+          />
+        </div>
+      </div>
+      <div class="artifacts-section">
+        <h2>Artifacts & Downloads</h2>
+        <p>Download the latest model checkpoints and full logs.</p>
+        <a href="logs/scratch/artifacts.zip" class="btn"
+          >Download All Artifacts (.zip)</a
+        >
+        <a href="logs/scratch/checkpoint.pt" class="btn"
+          >Download Checkpoint (.pt)</a
+        >
+      </div>
+      <div class="artifacts-section">
+        <h2>Attention Visualization</h2>
+        <p>Latest generated attention maps from the model.</p>
+        <div class="gallery" id="gif-gallery">
+          <!-- GIFs will be injected here -->
+          <img
+            src="logs/scratch/0_attention.gif"
+            onerror="this.style.display='none'"
+            alt="Attention Map"
+          />
+        </div>
+      </div>
+    </div>
+    <footer>
+      <p>Continuous Thought Machine Experiment</p>
+    </footer>
+    <script>
+      const LOG_DIR = "logs/scratch";
+      async function updateDashboard() {
+        try {
+          // Fetch status.json
+          const response = await fetch(
+            `${LOG_DIR}/status.json?t=${new Date().getTime()}`
+          );
+          if (!response.ok) throw new Error("Status file not found");
+          const data = await response.json();
+          // Update Metrics
+          document.getElementById(
+            "iter"
+          ).textContent = `${data.iteration} / ${data.total_iterations}`;
+          document.getElementById("epoch").textContent = data.epoch;
+          document.getElementById("train-loss").textContent = parseFloat(
+            data.train_loss
+          ).toFixed(4);
+          document.getElementById("test-loss").textContent = parseFloat(
+            data.test_loss
+          ).toFixed(4);
+          // Handle Accuracy (could be array or float)
+          const formatAcc = (acc) => {
+            if (Array.isArray(acc)) {
+              return (acc[acc.length - 1] * 100).toFixed(2) + "%";
+            }
+            return (acc * 100).toFixed(2) + "%";
+          };
+          document.getElementById("train-acc").textContent = formatAcc(
+            data.train_accuracy
+          );
+          document.getElementById("test-acc").textContent = formatAcc(
+            data.test_accuracy
+          );
+          // Update Timestamp
+          document.getElementById(
+            "last-updated"
+          ).textContent = `Last updated: ${new Date().toLocaleTimeString()}`;
+          // Refresh Images
+          const timestamp = new Date().getTime();
+          document.getElementById(
+            "loss-plot"
+          ).src = `${LOG_DIR}/losses.png?t=${timestamp}`;
+          document.getElementById(
+            "acc-plot"
+          ).src = `${LOG_DIR}/accuracies.png?t=${timestamp}`;
+          // Refresh Gallery (simple approach: try to reload the known gif)
+          const gallery = document.getElementById("gif-gallery");
+          gallery.innerHTML = `<img src="${LOG_DIR}/0_attention.gif?t=${timestamp}" onerror="this.style.display='none'" alt="Attention Map">`;
+        } catch (error) {
+          console.log("Waiting for training to start...", error);
+          document.getElementById("last-updated").textContent =
+            "Waiting for training to start...";
+        }
+      }
+      // Update every 30 seconds
+      setInterval(updateDashboard, 30000);
+      // Initial call
+      updateDashboard();
+    </script>
   </body>
 </html>

tasks/image_classification/train_energy.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import argparse
 import os
 import random
 import matplotlib.pyplot as plt
 import numpy as np
@@ -292,7 +294,7 @@ if __name__=='__main__':
     elif args.model == 'ff':
         model = FFBaseline(
             d_model=args.d_model,
-            d_input=args.d_input,
             out_dims=args.out_dims,
             dropout=args.dropout,
         )
@@ -718,6 +720,27 @@ if __name__=='__main__':
             # Save model checkpoint (conditional metrics)
             # Save model checkpoint (conditional metrics)
             if (bi % args.save_every == 0 or bi == args.training_iterations - 1) and bi != start_iter:
                 if accelerator.is_main_process:
@@ -744,6 +767,12 @@ if __name__=='__main__':
                     accelerator.save(checkpoint_data, f'{args.log_dir}/checkpoint.pt')
                     # Push to Hub
                     if args.push_to_hub and args.hub_model_id:
                         if bi % (args.save_every * 5) == 0: # Upload less frequently
@@ -753,7 +782,7 @@ if __name__=='__main__':
                                     repo_id=args.hub_model_id,
                                     token=args.hub_token,
                                     commit_message=f"Training checkpoint {bi}",
-                                    ignore_patterns=["*.pt"],
                                 )
                             except Exception as e:
                                 print(f"Failed to upload to hub: {e}")

 import argparse
 import os
 import random
+import json
+import shutil
 import matplotlib.pyplot as plt
 import numpy as np
     elif args.model == 'ff':
         model = FFBaseline(
             d_model=args.d_model,
+            backbone_type=args.backbone_type,
             out_dims=args.out_dims,
             dropout=args.dropout,
         )
             # Save model checkpoint (conditional metrics)
+            # Save status.json for the dashboard
+            if (bi % args.track_every == 0 or bi == args.training_iterations - 1) and bi != start_iter:
+                status_data = {
+                    'iteration': bi,
+                    'total_iterations': args.training_iterations,
+                    'epoch': bi // len(trainloader),
+                    'train_loss': train_losses[-1] if train_losses else 0.0,
+                    'test_loss': test_losses[-1] if test_losses else 0.0,
+                    'train_accuracy': train_accuracies[-1] if train_accuracies else 0.0, # Might be array for CTM
+                    'test_accuracy': test_accuracies[-1] if test_accuracies else 0.0, # Might be array for CTM
+                    'learning_rate': current_lr,
+                }
+                # Handle numpy arrays for JSON serialization
+                def convert_to_serializable(obj):
+                    if isinstance(obj, np.ndarray):
+                        return obj.tolist()
+                    return obj
+                with open(f'{args.log_dir}/status.json', 'w') as f:
+                    json.dump(status_data, f, default=convert_to_serializable)
             # Save model checkpoint (conditional metrics)
             if (bi % args.save_every == 0 or bi == args.training_iterations - 1) and bi != start_iter:
                 if accelerator.is_main_process:
                     accelerator.save(checkpoint_data, f'{args.log_dir}/checkpoint.pt')
+                    # Zip artifacts
+                    try:
+                        shutil.make_archive(f'{args.log_dir}/artifacts', 'zip', args.log_dir)
+                    except Exception as e:
+                        print(f"Failed to zip artifacts: {e}")
                     # Push to Hub
                     if args.push_to_hub and args.hub_model_id:
                         if bi % (args.save_every * 5) == 0: # Upload less frequently
                                     repo_id=args.hub_model_id,
                                     token=args.hub_token,
                                     commit_message=f"Training checkpoint {bi}",
+                                    ignore_patterns=[], # Upload everything including .pt and .zip
                                 )
                             except Exception as e:
                                 print(f"Failed to upload to hub: {e}")

verify_dashboard.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import os
+import json
+import subprocess
+import time
+import shutil
+def verify():
+    print("Starting verification...")
+    # Clean up previous logs
+    if os.path.exists('logs/scratch'):
+        shutil.rmtree('logs/scratch')
+    # Run training script for a few iterations
+    # We use a small model (ff) and cifar10 for speed, with minimal iterations
+    cmd = [
+        "pixi", "run", "accelerate", "launch", "--cpu", "tasks/image_classification/train_energy.py",
+        "--model", "ff",
+        "--dataset", "cifar10",
+        "--batch_size", "4",
+        "--training_iterations", "5", # Run for 5 iterations
+        "--track_every", "2", # Track every 2 iterations to ensure we get logs
+        "--save_every", "2", # Save every 2 iterations
+        "--log_dir", "logs/scratch",
+        "--device", "-1" # Use CPU for verification to avoid GPU issues if any
+    ]
+    print(f"Running command: {' '.join(cmd)}")
+    try:
+        subprocess.run(cmd, check=True, capture_output=True)
+    except subprocess.CalledProcessError as e:
+        print("Training failed!")
+        print(e.stderr.decode())
+        return
+    print("Training finished. Checking files...")
+    # Check status.json
+    if os.path.exists('logs/scratch/status.json'):
+        print("[PASS] status.json exists")
+        with open('logs/scratch/status.json', 'r') as f:
+            data = json.load(f)
+            print(f"  - Iteration: {data.get('iteration')}")
+            print(f"  - Train Loss: {data.get('train_loss')}")
+    else:
+        print("[FAIL] status.json missing")
+    # Check artifacts.zip
+    if os.path.exists('logs/scratch/artifacts.zip'):
+        print("[PASS] artifacts.zip exists")
+    else:
+        print("[FAIL] artifacts.zip missing")
+    # Check plots
+    if os.path.exists('logs/scratch/losses.png'):
+        print("[PASS] losses.png exists")
+    else:
+        print("[FAIL] losses.png missing")
+    if os.path.exists('logs/scratch/accuracies.png'):
+        print("[PASS] accuracies.png exists")
+    else:
+        print("[FAIL] accuracies.png missing")
+    # Check index.html content (simple check)
+    if os.path.exists('index.html'):
+        with open('index.html', 'r') as f:
+            content = f.read()
+            if 'CTM Training Dashboard' in content and 'status.json' in content:
+                print("[PASS] index.html looks correct")
+            else:
+                print("[FAIL] index.html content incorrect")
+    else:
+        print("[FAIL] index.html missing")
+if __name__ == "__main__":
+    verify()