#include #include #include #include // std::fixed, std::setprecision // 自定义头文件 #include "wav.h" // 包含 wav::WavReader 定义 #include "time_stamp.h" // 包含 timestamp_t 定义 #include "vad_iterator.h" // 包含 VadIterator 类声明 int main(int argc, char* argv[]) { if (argc < 3) { std::cerr << "Usage: " << argv[0] << " " << " " << std::endl; return 1; } // 获取命令行传入的音频文件路径 std::string model_path = argv[1]; std::string wav_path = argv[2]; // std::string model_path = "/Users/chenxiang/translator/Translator/moyoyo_asr_models/silero-vad/silero_vad.onnx"; // std::string wav_path = "/Users/chenxiang/translator/core/whisper_wrapper/bin/zh.wav"; // Read the WAV file (expects 16000 Hz, mono, PCM). wav::WavReader wav_reader(wav_path); // File located in the "audio" folder. int numSamples = wav_reader.num_samples(); std::vector input_wav(static_cast(numSamples)); for (size_t i = 0; i < static_cast(numSamples); i++) { input_wav[i] = static_cast(*(wav_reader.data() + i)); } // Initialize the VadIterator. VadIterator vad(model_path); // Process the audio. vad.process(input_wav); // Retrieve the speech timestamps (in samples). std::vector stamps = vad.get_speech_timestamps(); // Convert timestamps to seconds and round to one decimal place (for 16000 Hz). const float sample_rate_float = 16000.0f; for (size_t i = 0; i < stamps.size(); i++) { float start_sec = std::rint((stamps[i].start / sample_rate_float) * 10.0f) / 10.0f; float end_sec = std::rint((stamps[i].end / sample_rate_float) * 10.0f) / 10.0f; std::cout << "Speech detected from " << std::fixed << std::setprecision(1) << start_sec << " s to " << std::fixed << std::setprecision(1) << end_sec << " s" << " [ " << stamps[i].start << " " << stamps[i].end <<" ]" << std::endl; } // Optionally, reset the internal state. vad.reset(); return 0; }