Model parallel (#538)

* model-parallel for single process
* fix device/device_map
* fix handling for device
src/axolotl/utils/bench.py

@@ -28,7 +28,7 @@ def gpu_memory_usage_smi(device=0):
 
 
 def log_gpu_memory_usage(log, msg, device):
-    if not torch.cuda.is_available():
+    if not torch.cuda.is_available() or device == "auto":
         return (0, 0, 0)
 
     usage, cache, misc = gpu_memory_usage_all(device)
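The bench.py change guards the memory probes against being handed the placement string "auto" instead of a concrete CUDA device: calls like torch.cuda.memory_allocated("auto") would raise, so the function now short-circuits to zeros. Below is a minimal sketch of the guarded pattern; the body of gpu_memory_usage_all and the log message format are simplified assumptions, not the repo's exact implementation.

```python
import torch


def gpu_memory_usage_all(device=0):
    # Simplified stand-in for the repo's helper: report allocated and
    # cached (reserved-but-unallocated) CUDA memory in GiB.
    gib = 1024**3
    usage = torch.cuda.memory_allocated(device) / gib
    reserved = torch.cuda.memory_reserved(device) / gib
    return usage, reserved - usage, 0.0


def log_gpu_memory_usage(log, msg, device):
    # "auto" is a device_map placement strategy, not a CUDA device index,
    # so skip probing entirely (same as when CUDA is unavailable).
    if not torch.cuda.is_available() or device == "auto":
        return (0, 0, 0)

    usage, cache, misc = gpu_memory_usage_all(device)
    log.info(f"GPU memory usage {msg}: {usage:.3f}GB ({cache:.3f}GB cache)")
    return usage, cache, misc
```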
src/axolotl/utils/config.py

@@ -25,7 +25,9 @@ def choose_device(cfg):
             return "cpu"
 
     cfg.device = get_device()
-    if cfg.
+    if cfg.world_size == 1:
+        cfg.device_map = "auto"
+    else:
         if cfg.device.startswith("cuda"):
             cfg.device_map = {"": cfg.local_rank}
         else:
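With a single process (world_size == 1), device_map is now set to "auto", which lets the Hugging Face loaders spread the model's layers across every visible GPU (naive model parallelism); the multi-process path still pins each rank's full copy to one device for DDP. A rough sketch of how the resulting device_map could feed into model loading, assuming a standard transformers AutoModelForCausalLM.from_pretrained call — the model name, the simplified cfg object, and the final non-CUDA branch are placeholders/assumptions, not part of this diff.

```python
from types import SimpleNamespace

from transformers import AutoModelForCausalLM


def resolve_device_map(cfg):
    # Mirrors the logic added in choose_device(): a single process gets
    # "auto" so layers can be sharded across all visible GPUs, while
    # multi-process DDP keeps each rank's full copy on its own device.
    if cfg.world_size == 1:
        cfg.device_map = "auto"
    else:
        if cfg.device.startswith("cuda"):
            cfg.device_map = {"": cfg.local_rank}
        else:
            # non-CUDA fallback; the diff cuts off here, so this branch is assumed
            cfg.device_map = {"": cfg.device}
    return cfg.device_map


cfg = SimpleNamespace(world_size=1, local_rank=0, device="cuda:0", device_map=None)

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-125m",                 # placeholder model name
    device_map=resolve_device_map(cfg),  # "auto" -> layers sharded across GPUs
)
```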