Training in progress - step 500
Browse files
- asr_config.py +2 -0
- asr_modeling.py +1 -7
- config.json +1 -0
- model.safetensors +1 -1
asr_config.py
CHANGED
|
@@ -26,6 +26,7 @@ class ASRConfig(transformers.PretrainedConfig):
|
|
| 26 |
projector_num_layers: int = 2, # Number of layers (for residual projector)
|
| 27 |
projector_dropout: float = 0.05, # Dropout rate for projector layers
|
| 28 |
projector_input_noise: float = 0.02, # Input noise for projector
|
|
|
|
| 29 |
inference_diversity_penalty: float = 0.0,
|
| 30 |
inference_warmup_tokens: int = 10,
|
| 31 |
max_new_tokens: Optional[int] = None,
|
|
@@ -72,6 +73,7 @@ class ASRConfig(transformers.PretrainedConfig):
|
|
| 72 |
self.projector_num_layers = projector_num_layers
|
| 73 |
self.projector_dropout = projector_dropout
|
| 74 |
self.projector_input_noise = projector_input_noise
|
|
|
|
| 75 |
self.inference_diversity_penalty = inference_diversity_penalty
|
| 76 |
self.inference_warmup_tokens = inference_warmup_tokens
|
| 77 |
if "audio_config" not in kwargs:
|
|
|
|
| 26 |
projector_num_layers: int = 2, # Number of layers (for residual projector)
|
| 27 |
projector_dropout: float = 0.05, # Dropout rate for projector layers
|
| 28 |
projector_input_noise: float = 0.02, # Input noise for projector
|
| 29 |
+
label_smoothing: float = 0.0, # Label smoothing for cross-entropy loss
|
| 30 |
inference_diversity_penalty: float = 0.0,
|
| 31 |
inference_warmup_tokens: int = 10,
|
| 32 |
max_new_tokens: Optional[int] = None,
|
|
|
|
| 73 |
self.projector_num_layers = projector_num_layers
|
| 74 |
self.projector_dropout = projector_dropout
|
| 75 |
self.projector_input_noise = projector_input_noise
|
| 76 |
+
self.label_smoothing = label_smoothing
|
| 77 |
self.inference_diversity_penalty = inference_diversity_penalty
|
| 78 |
self.inference_warmup_tokens = inference_warmup_tokens
|
| 79 |
if "audio_config" not in kwargs:
|
asr_modeling.py
CHANGED
|
@@ -118,10 +118,6 @@ class ASRModel(PreTrainedModel):
|
|
| 118 |
"low_cpu_mem_usage": True,
|
| 119 |
"dtype": dtype,
|
| 120 |
}
|
| 121 |
-
# Only use device_map="auto" when NOT loading from pretrained
|
| 122 |
-
# (avoids meta tensor conflicts during from_pretrained)
|
| 123 |
-
if not cls._is_loading_from_pretrained:
|
| 124 |
-
encoder_kwargs["device_map"] = "auto"
|
| 125 |
|
| 126 |
if "whisper" in config.audio_model_id.lower():
|
| 127 |
from transformers import WhisperModel
|
|
@@ -146,9 +142,6 @@ class ASRModel(PreTrainedModel):
|
|
| 146 |
"low_cpu_mem_usage": True,
|
| 147 |
"dtype": dtype,
|
| 148 |
}
|
| 149 |
-
# Only use device_map="auto" when NOT loading from pretrained
|
| 150 |
-
if not cls._is_loading_from_pretrained:
|
| 151 |
-
decoder_kwargs["device_map"] = "auto"
|
| 152 |
|
| 153 |
decoder = AutoModelForCausalLM.from_pretrained(config.text_model_id, **decoder_kwargs)
|
| 154 |
decoder.config.use_cache = getattr(config, "use_cache", True)
|
|
@@ -393,6 +386,7 @@ class ASRModel(PreTrainedModel):
|
|
| 393 |
shift_logits.view(-1, shift_logits.size(-1)),
|
| 394 |
shift_labels.view(-1),
|
| 395 |
ignore_index=-100,
|
|
|
|
| 396 |
)
|
| 397 |
|
| 398 |
return CausalLMOutputWithPast(
|
|
|
|
| 118 |
"low_cpu_mem_usage": True,
|
| 119 |
"dtype": dtype,
|
| 120 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
|
| 122 |
if "whisper" in config.audio_model_id.lower():
|
| 123 |
from transformers import WhisperModel
|
|
|
|
| 142 |
"low_cpu_mem_usage": True,
|
| 143 |
"dtype": dtype,
|
| 144 |
}
|
|
|
|
|
|
|
|
|
|
| 145 |
|
| 146 |
decoder = AutoModelForCausalLM.from_pretrained(config.text_model_id, **decoder_kwargs)
|
| 147 |
decoder.config.use_cache = getattr(config, "use_cache", True)
|
|
|
|
| 386 |
shift_logits.view(-1, shift_logits.size(-1)),
|
| 387 |
shift_labels.view(-1),
|
| 388 |
ignore_index=-100,
|
| 389 |
+
label_smoothing=getattr(self.config, "label_smoothing", 0.0),
|
| 390 |
)
|
| 391 |
|
| 392 |
return CausalLMOutputWithPast(
|
config.json
CHANGED
|
@@ -68,6 +68,7 @@
|
|
| 68 |
"encoder_dim": 1280,
|
| 69 |
"inference_diversity_penalty": 0.0,
|
| 70 |
"inference_warmup_tokens": 10,
|
|
|
|
| 71 |
"llm_dim": 2048,
|
| 72 |
"max_new_tokens": 128,
|
| 73 |
"min_new_tokens": 1,
|
|
|
|
| 68 |
"encoder_dim": 1280,
|
| 69 |
"inference_diversity_penalty": 0.0,
|
| 70 |
"inference_warmup_tokens": 10,
|
| 71 |
+
"label_smoothing": 0.1,
|
| 72 |
"llm_dim": 2048,
|
| 73 |
"max_new_tokens": 128,
|
| 74 |
"min_new_tokens": 1,
|
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 144762160
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3a25deecc1f7a0eef0322e88451ff27f6eb9ade7e853e21df8eb8afee152b736
|
| 3 |
size 144762160
|