摘要:
在上一篇文档中,我们分析了 unimrcp 中 VAD 算法的诸多弊端,那么有没有一种更好的算法可以取代它呢?目前主要有两种方式:1. GMM;2. DNN。
其中鼎鼎大名的 WebRTC VAD 就是采用 GMM 算法来完成语音活动检测(Voice Activity Detection,VAD)的。今天笔者重点介绍 WebRTC VAD 算法;在后面的文章中,
我们再剖析 DNN 在 VAD 中的应用。下面的章节将介绍 WebRTC VAD 的检测原理。
原理:
首先呢,我们要了解一下人声和乐器的频谱范围,下图是音频的频谱。
本图来源于网络
根据音频的频谱划分了 6 个子带:80Hz~250Hz、250Hz~500Hz、500Hz~1kHz、1kHz~2kHz、2kHz~3kHz、3kHz~4kHz,并分别计算出每个子带的特征。
步骤:
1. 准备工作
1.1 WebRTC的检测模式分为了4种:
0:Normal,1:Low Bitrate,2:Aggressive,3:Very Aggressive。其激进程度与数值大小相关,可以根据实际的使用场景在初始化的时候进行配置。
// Set aggressiveness mode int WebRtcVad_set_mode_core(VadInstT *self, int mode) { int return_value = 0; switch (mode) { case 0: // Quality mode. memcpy(self->over_hang_max_1, kOverHangMax1Q, sizeof(self->over_hang_max_1)); memcpy(self->over_hang_max_2, kOverHangMax2Q, sizeof(self->over_hang_max_2)); memcpy(self->individual, kLocalThresholdQ, sizeof(self->individual)); memcpy(self->total, kGlobalThresholdQ, sizeof(self->total)); break; case 1: // Low bitrate mode. memcpy(self->over_hang_max_1, kOverHangMax1LBR, sizeof(self->over_hang_max_1)); memcpy(self->over_hang_max_2, kOverHangMax2LBR, sizeof(self->over_hang_max_2)); memcpy(self->individual, kLocalThresholdLBR, sizeof(self->individual)); memcpy(self->total, kGlobalThresholdLBR, sizeof(self->total)); break; case 2: // Aggressive mode. memcpy(self->over_hang_max_1, kOverHangMax1AGG, sizeof(self->over_hang_max_1)); memcpy(self->over_hang_max_2, kOverHangMax2AGG, sizeof(self->over_hang_max_2)); memcpy(self->individual, kLocalThresholdAGG, sizeof(self->individual)); memcpy(self->total, kGlobalThresholdAGG, sizeof(self->total)); break; case 3: // Very aggressive mode. memcpy(self->over_hang_max_1, kOverHangMax1VAG, sizeof(self->over_hang_max_1)); memcpy(self->over_hang_max_2, kOverHangMax2VAG, sizeof(self->over_hang_max_2)); memcpy(self->individual, kLocalThresholdVAG, sizeof(self->individual)); memcpy(self->total, kGlobalThresholdVAG, sizeof(self->total)); break; default: return_value = -1; break; } return return_value; }