ctm-energy-based-halting / entrypoint.sh
Uday's picture
Fix: runtime error, workload was not healthy after 30 min
6914bc9
raw
history blame
844 Bytes
#!/bin/bash
#
# Container entrypoint: normalizes the environment, then replaces this shell
# with `accelerate launch` running tasks/image_classification/train_energy.py.
#
# Arguments:
#   Everything passed to the container (CMD) is forwarded verbatim to the
#   training script.
# Environment:
#   OMP_NUM_THREADS  optional; rewritten to a positive integer when it is not
#                    already one (see the sanitizing step below).
#   HF_TOKEN         optional; when non-empty it is appended to the training
#                    arguments as `--hub_token <value>`.
set -e

# Collect arguments passed to the container (CMD)
args=("$@")

# Sanitize OMP_NUM_THREADS. Some platforms export a CPU quantity in millicores
# instead of a thread count (e.g. "3500m" from HF Spaces). Diagnostics go to
# stderr so they never mix with the training script's stdout.
omp_raw="${OMP_NUM_THREADS:-}"
if [[ "$omp_raw" =~ ^[0-9]+$ ]] && ((10#$omp_raw > 0)); then
  : # Already a positive integer: leave the caller's choice untouched.
elif [[ "$omp_raw" =~ ^([0-9]+)m$ ]]; then
  # Millicores -> whole cores, rounded down, but never fewer than one thread.
  # The 10# prefix forces base 10 so a leading zero is not parsed as octal.
  omp_threads=$((10#${BASH_REMATCH[1]} / 1000))
  ((omp_threads >= 1)) || omp_threads=1
  echo "WARNING: OMP_NUM_THREADS is '$omp_raw' (millicores). Using $omp_threads thread(s)." >&2
  export OMP_NUM_THREADS="$omp_threads"
elif [[ -z "$omp_raw" ]]; then
  echo "WARNING: OMP_NUM_THREADS is unset or empty. Defaulting to 1." >&2
  export OMP_NUM_THREADS=1
else
  echo "WARNING: OMP_NUM_THREADS is '$omp_raw', which is not a positive integer. Resetting to 1." >&2
  export OMP_NUM_THREADS=1
fi

# If HF_TOKEN is set, append it to the arguments.
# NOTE(review): a secret on the command line is visible to anything that can
# read the process list; prefer letting the training script read HF_TOKEN from
# the environment if it supports that -- confirm against train_energy.py.
if [[ -n "${HF_TOKEN:-}" ]]; then
  args+=("--hub_token" "$HF_TOKEN")
fi

# Generate Accelerate config at runtime to detect GPUs correctly
# This writes to ~/.cache/huggingface/accelerate/default_config.yaml
# Under `set -e`, a failure here aborts the entrypoint before launching.
python -c "from accelerate.utils import write_basic_config; write_basic_config(mixed_precision='fp16')"

# Run accelerate launch with the training script and arguments.
# `exec` replaces the shell, so the launcher receives container signals
# directly and its exit status becomes the container's exit status.
exec accelerate launch tasks/image_classification/train_energy.py "${args[@]}"