File size: 2,250 Bytes
d21d362
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#include <cmath>
#include <exception>
#include <iomanip>      // std::fixed, std::setprecision
#include <iostream>
#include <string>
#include <vector>

// 自定义头文件
#include "wav.h"         // 包含 wav::WavReader 定义
#include "time_stamp.h"          // 包含 timestamp_t 定义
#include "vad_iterator.h"       // 包含 VadIterator 类声明


int main(int argc, char* argv[]) {
    if (argc < 3) {
        std::cerr << "Usage: " << argv[0] << " <model_absolute_path>" << " <audio_file_absolute_path>" << std::endl;
        return 1;
    }

    // 获取命令行传入的音频文件路径
    std::string model_path = argv[1];
    std::string wav_path = argv[2];

    // std::string model_path = "/Users/chenxiang/translator/Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx";
    // std::string wav_path = "/Users/chenxiang/translator/core/whisper_wrapper/bin/zh.wav";
    
    // Read the WAV file (expects 16000 Hz, mono, PCM).
    wav::WavReader wav_reader(wav_path); // File located in the "audio" folder.
    int numSamples = wav_reader.num_samples();
    std::vector<float> input_wav(static_cast<size_t>(numSamples));
    for (size_t i = 0; i < static_cast<size_t>(numSamples); i++) {
        input_wav[i] = static_cast<float>(*(wav_reader.data() + i));
    }
    // Initialize the VadIterator.
    VadIterator vad(model_path);

    // Process the audio.
    vad.process(input_wav);

    // Retrieve the speech timestamps (in samples).
    std::vector<timestamp_t> stamps = vad.get_speech_timestamps();

    // Convert timestamps to seconds and round to one decimal place (for 16000 Hz).
    const float sample_rate_float = 16000.0f;
    for (size_t i = 0; i < stamps.size(); i++) {
        float start_sec = std::rint((stamps[i].start / sample_rate_float) * 10.0f) / 10.0f;
        float end_sec = std::rint((stamps[i].end / sample_rate_float) * 10.0f) / 10.0f;
        std::cout << "Speech detected from "
            << std::fixed << std::setprecision(1) << start_sec
            << " s to "
            << std::fixed << std::setprecision(1) << end_sec
            << " s" 
            << " [ " << stamps[i].start << " " << stamps[i].end <<" ]"
            << std::endl;
    }

    // Optionally, reset the internal state.
    vad.reset();

    return 0;
}