Fix: runtime error, workload was not healthy after 30 min
Browse files
- Dockerfile +1 -3
- README.md +14 -1
- entrypoint.sh +4 -0
Dockerfile
CHANGED
|
@@ -48,9 +48,7 @@ RUN mkdir -p /tmp/matplotlib /tmp/numba_cache && \
|
|
| 48 |
# Switch to the non-root user
|
| 49 |
USER user
|
| 50 |
|
| 51 |
-
#
|
| 52 |
-
# This writes to ~/.cache/huggingface/accelerate/default_config.yaml
|
| 53 |
-
RUN python -c "from accelerate.utils import write_basic_config; write_basic_config(mixed_precision='fp16')"
|
| 54 |
|
| 55 |
ENTRYPOINT ["/app/entrypoint.sh"]
|
| 56 |
CMD ["--energy_head_enabled", "--loss_type", "energy_contrastive", "--push_to_hub", "--hub_model_id", "Uday/ctm-energy-based-halting"]
|
|
|
|
| 48 |
# Switch to the non-root user
|
| 49 |
USER user
|
| 50 |
|
| 51 |
+
# Accelerate configuration is now handled in entrypoint.sh at runtime
|
|
|
|
|
|
|
| 52 |
|
| 53 |
ENTRYPOINT ["/app/entrypoint.sh"]
|
| 54 |
CMD ["--energy_head_enabled", "--loss_type", "energy_contrastive", "--push_to_hub", "--hub_model_id", "Uday/ctm-energy-based-halting"]
|
README.md
CHANGED
|
@@ -224,12 +224,25 @@ If you use this code or build upon CTM in your work, please cite:
|
|
| 224 |
```bibtex
|
| 225 |
@article{ctm2025,
|
| 226 |
title={The Continuous Thought Machine},
|
| 227 |
-
author={
|
| 228 |
journal={arXiv preprint arXiv:2505.05522},
|
| 229 |
year={2025}
|
| 230 |
}
|
| 231 |
```
|
| 232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
---
|
| 234 |
|
| 235 |
## 📝 License
|
|
|
|
| 224 |
```bibtex
|
| 225 |
@article{ctm2025,
|
| 226 |
title={The Continuous Thought Machine},
|
| 227 |
+
author={Darlow, Luke and Regan, Ciaran and Risi, Sebastian and Seely, Jeffrey and Jones, Llion},
|
| 228 |
journal={arXiv preprint arXiv:2505.05522},
|
| 229 |
year={2025}
|
| 230 |
}
|
| 231 |
```
|
| 232 |
|
| 233 |
+
### Energy-Based Halting Extension
|
| 234 |
+
|
| 235 |
+
This repository contains experimental extensions for Energy-Based Halting developed by **Uday Phalak**.
|
| 236 |
+
|
| 237 |
+
```bibtex
|
| 238 |
+
@misc{ctmenergy2025,
|
| 239 |
+
title={Energy-Based Halting for Continuous Thought Machines},
|
| 240 |
+
author={Phalak, Uday},
|
| 241 |
+
year={2025},
|
| 242 |
+
note={Experimental Extension}
|
| 243 |
+
}
|
| 244 |
+
```
|
| 245 |
+
|
| 246 |
---
|
| 247 |
|
| 248 |
## 📝 License
|
entrypoint.sh
CHANGED
|
@@ -15,5 +15,9 @@ if [ -n "$HF_TOKEN" ]; then
|
|
| 15 |
args+=("--hub_token" "$HF_TOKEN")
|
| 16 |
fi
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
# Run accelerate launch with the training script and arguments
|
| 19 |
exec accelerate launch tasks/image_classification/train_energy.py "${args[@]}"
|
|
|
|
| 15 |
args+=("--hub_token" "$HF_TOKEN")
|
| 16 |
fi
|
| 17 |
|
| 18 |
+
# Generate Accelerate config at runtime to detect GPUs correctly
|
| 19 |
+
# This writes to ~/.cache/huggingface/accelerate/default_config.yaml
|
| 20 |
+
python -c "from accelerate.utils import write_basic_config; write_basic_config(mixed_precision='fp16')"
|
| 21 |
+
|
| 22 |
# Run accelerate launch with the training script and arguments
|
| 23 |
exec accelerate launch tasks/image_classification/train_energy.py "${args[@]}"
|