File size: 2,123 Bytes
82e506e
989d8ec
82e506e
989d8ec
 
8953d03
989d8ec
 
 
 
 
82e506e
 
989d8ec
 
8953d03
 
 
 
 
 
 
 
 
 
 
989d8ec
 
82e506e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
989d8ec
82e506e
989d8ec
8953d03
82e506e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
{
  "add_prefix_space": false,
  "backend": "tokenizers",
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "effective_vocab_size": 114822,
  "eos_token": "</s>",
  "errors": "replace",
  "fix_mistral_regex": true,
  "is_local": false,
  "local_files_only": false,
  "model_max_length": 131072,
  "model_type": "byte_level_bpe",
  "no_audio_codec_tokens": true,
  "no_dense_timestamp_tokens": true,
  "open_formosa": {
    "required_special_token_count": 157,
    "required_special_tokens_present": true,
    "required_special_tokens_single_id": true,
    "standard_special_tokens": {
      "bos_token": "<s>",
      "eos_token": "</s>",
      "pad_token": "<pad>",
      "unk_token": "<unk>"
    }
  },
  "pad_token": "<pad>",
  "padding_side": "right",
  "rich_transcription": {
    "allow_non_speech_events": true,
    "compact_json": true,
    "default_format": "json_segments",
    "enabled": true,
    "include_content": true,
    "include_speaker": true,
    "include_start_end": true,
    "no_dense_timestamp_tokens": true,
    "timestamp_precision_digits": 2,
    "timestamp_unit": "seconds"
  },
  "special_tokens": [
    "<|pad|>",
    "<|bos|>",
    "<|eos|>",
    "<|unk|>",
    "<|system|>",
    "<|user_channel|>",
    "<|assistant_channel|>",
    "<|task:speech_to_text|>",
    "<|task:text_to_speech|>",
    "<|input_audio_start|>",
    "<|input_audio_end|>",
    "<|audio_ref_start|>",
    "<|audio_ref_end|>",
    "<|audio_start|>",
    "<|audio_end|>",
    "<|speech_start|>",
    "<|speech_end|>",
    "<|transcript_start|>",
    "<|transcript_end|>",
    "<|segment_start|>",
    "<|segment_end|>",
    "<|speaker|>",
    "<|start_time|>",
    "<|end_time|>",
    "<|duration|>",
    "<|content|>",
    "<|non_speech_event|>",
    "<|retrieval_result_start|>",
    "<|retrieval_result_end|>",
    "<|ocr_start|>",
    "<|ocr_end|>",
    "<|image_start|>",
    "<|image_end|>",
    "<|video_start|>",
    "<|video_end|>"
  ],
  "strict_no_dense_timestamp_tokens": true,
  "tokenizer_class": "GPT2Tokenizer",
  "truncation_side": "right",
  "unk_token": "<unk>",
  "vocab_size": 114688
}