Jack O'Quin wrote:
> Susan Cragin <susancragin(a)earthlink.net> writes:
>> Hi. I'm a lurker from the Open-Source Speech Recognition Initiative.
> Welcome. Interesting topic.
>> We are starting to collect voice samples and need an audio program
>> that can segment speech by word, or attempt to, by recognizing all
>> the silences and placing markers at each location.
Actually, I happen to have done some research on this topic. I would have
been really happy to give you some Matlab and C code, but unfortunately
I am not allowed to publish it under any open-source license.
What I can do is give you some advice :) First, it is indeed a difficult
problem. The first question is: do you need real time? Should it be fast?
The second: are you allowed to have some training data (which would open
up a wide range of pattern-recognition schemes, i.e. supervised learning)?
A threshold on the energy works as long as your signal is clean and the
speaker talks at a more or less constant volume. One possibility is to
adapt the threshold dynamically, using more or less clever schemes such
as minimum statistics; a rough sketch of the basic idea is below.
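To make this concrete, here is a minimal Python sketch of an energy-based
detector with a slowly adapting noise-floor estimate. The file name, frame
length, smoothing factor, and margin are illustrative assumptions, and the
recursive floor update is only a crude stand-in for a real minimum-statistics
estimator.

import numpy as np
import soundfile as sf  # assumed WAV reader; any equivalent works

def energy_speech_frames(path="sample.wav", frame_ms=20, margin_db=10.0,
                         alpha=0.995):
    x, fs = sf.read(path)
    if x.ndim > 1:
        x = x.mean(axis=1)                    # mix down to mono
    n = int(fs * frame_ms / 1000)             # samples per frame
    frames = x[: len(x) // n * n].reshape(-1, n)
    # Short-time energy of each frame, in dB
    e_db = 10 * np.log10(np.mean(frames ** 2, axis=1) + 1e-12)
    noise_floor = e_db.min()                  # crude initial estimate (fine offline)
    speech = np.zeros(len(e_db), dtype=bool)
    for i, e in enumerate(e_db):
        if e > noise_floor + margin_db:
            speech[i] = True                  # clearly above the floor: speech
        else:
            # Update the floor only on silent frames so it tracks slow
            # changes in the background level.
            noise_floor = alpha * noise_floor + (1 - alpha) * e
    return speech, n / fs                     # per-frame flags, frame duration (s)

From the per-frame decisions you would then place word markers wherever a
silent run longer than some minimum duration separates two speech runs.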
One thing which becomes obvious when you work on this problem is that
detecting speech in general is really difficult, but detecting the voiced
parts (i.e. vowels and some consonants) is easier. Many techniques are
based on the pitch, using features such as the autocorrelation, the
cepstrum, etc.; a sketch of the autocorrelation approach follows.
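For illustration, here is a rough per-frame voiced/unvoiced test based on
the normalized autocorrelation peak in the usual pitch range. The 60-400 Hz
search range and the 0.3 threshold are assumptions for the example; real
detectors combine such a feature with energy, zero-crossing rate and
smoothing across frames.

import numpy as np

def is_voiced(frame, fs, threshold=0.3, f_lo=60.0, f_hi=400.0):
    """Very rough voiced/unvoiced decision for one frame of samples."""
    frame = frame - np.mean(frame)
    # Autocorrelation for non-negative lags only
    ac = np.correlate(frame, frame, mode="full")[len(frame) - 1:]
    if ac[0] <= 0:
        return False                      # silent frame
    ac = ac / ac[0]                       # normalize so lag 0 equals 1
    lag_lo = int(fs / f_hi)               # shortest lag = highest pitch
    lag_hi = min(int(fs / f_lo) + 1, len(ac))
    if lag_hi <= lag_lo + 1:
        return False                      # frame too short for this pitch range
    peak = ac[lag_lo:lag_hi].max()        # strongest periodicity in the pitch range
    return peak > threshold               # strong periodicity suggests voicing

The frame has to be at least a couple of pitch periods long (roughly
30-40 ms) for the peak to be meaningful.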
Here is a bibliography which may help:
Supervised techniques (i.e. they need a training set):
@INPROCEEDINGS{Basu2003,
  author = {Sumit Basu},
  title = {A linked-HMM model for robust voicing and speech detection},
  booktitle = {IEEE International Conference on Acoustics, Speech, and
    Signal Processing (ICASSP '03)},
  year = {2003},
  keywords = {VAD},
  abstract = {We present a novel method for simultaneous voicing and
    speech detection based on a linked-HMM architecture, with robust
    features that are independent of the signal energy. Because this
    approach models the change in dynamics between speech and non-speech
    regions, it is robust to low sampling rates, significant levels of
    additive noise, and large distances from the microphone. We
    demonstrate the performance of our method in a variety of testing
    conditions and also compare it to other methods reported in the
    literature.},
}
@INPROCEEDINGS{Enqing2002,
  author = {Dong Enqing and Liu Guizhong and Zhou Yatong and Zhang Xiaodi},
  title = {Applying Support Vector Machine to Voice Activity Detection},
  booktitle = {6th International Conference on Signal Processing
    Proceedings (ICSP'02)},
  year = {2002},
  keywords = {VAD},
  abstract = {A new voice activity detector (VAD) algorithm using
    support vector machines (SVM) is proposed in the paper, and the new
    VAD's effectiveness is validated. The sequential minimal optimization
    (SMO) algorithm for fast training of support vector machines is
    adopted. The proposed VAD algorithm via SVM (SVM-VAD) also uses the
    characteristic parameter set used by the G.729 Annex B (G.729B) VAD.
    A comparison of SVM-VAD with the G.729B VAD shows that applying SVM to
    VAD is effective. The newly proposed VAD algorithm is integrated into
    G.729B in place of the G.729B VAD; informal listening tests show that
    the integrated speech coding system is perceptually slightly better
    than with the original G.729B VAD.},
}
Real-time techniques:
@INPROCEEDINGS{Basu2001,
  author = {Sumit Basu and Brian Clarkson and Alex Pentland},
  title = {Smart headphones: enhancing auditory awareness through robust
    speech detection and source localization},
  booktitle = {Proceedings of the Int'l Conf. on Acoustics, Speech, and
    Signal Processing (ICASSP '01)},
  year = {2001},
  month = {May},
  keywords = {VAD, Ubiquitous},
  abstract = {We describe a method for enhancing auditory awareness by
    selectively passing speech sounds in the environment to the user. We
    develop a robust far-field speech detection algorithm for noisy
    environments and a source localization algorithm for flexible arrays.
    We then combine these methods to give a user control over the spatial
    regions from which speech will be passed through. Using this
    technique, we have implemented a ``smart headphones'' system in which
    a user can be listening to music over headphones and hear speech from
    specified directions mixed in. We show our preliminary results on the
    algorithms and describe initial user feedback about the system.},
}
@INPROCEEDINGS{Zhang2002,
  author = {Jianping Zhang and Wayne Ward and Bryan Pellom},
  title = {Phone Based Voice Activity Detection Using Online Bayesian
    Adaptation with Conjugate Normal Distributions},
  booktitle = {ICASSP02},
  year = {2002},
  organization = {IEEE},
  keywords = {VAD},
  abstract = {In this paper, we developed a highly efficient frame-level
    online adaptive voice activity detection (VAD) algorithm for the
    telephone-based CU Communicator spoken dialog system. The adaptive
    algorithm uses prior speaker and channel statistics as well as
    acoustic features of current sample frames to update model
    parameters. The algorithm achieved 0.05xRT in contrast to 0.7xRT for
    a compared VAD algorithm using 5-state HMMs. We detail the adaptive
    algorithm and address some real-time implementation issues.
    Experiments on live collected data show that there is a 23% error
    reduction compared with the G.729B VAD.},
}
@INPROCEEDINGS{Li2001,
  author = {Qi Li and Jinsong Zheng and Qiru Zhou and Chin-Hui Lee},
  title = {A Robust, Real-Time Endpoint Detector with Energy Normalization
    for ASR in Adverse Environments},
  booktitle = {ICASSP01},
  year = {2001},
  organization = {IEEE},
  keywords = {VAD},
  abstract = {When automatic speech recognition (ASR) is applied to
    hands-free or other adverse acoustic environments, endpoint detection
    and energy normalization can be crucial to the entire system. In low
    signal-to-noise ratio (SNR) situations, conventional approaches to
    endpointing and energy normalization often fail and ASR performance
    usually degrades dramatically. The goal of this paper is to find a
    fast, accurate, and robust endpointing algorithm for real-time ASR.
    We propose a novel approach using a special filter plus a 3-state
    decision logic for endpoint detection. The filter has been designed
    under several criteria to ensure the accuracy and robustness of
    detection. The detected endpoints are then applied to energy
    normalization simultaneously. Evaluation results show that the
    proposed algorithm significantly reduces the string error rates on 7
    out of 12 tested databases. The reduction rates even exceeded 50% on
    two of them. The algorithm only uses one-dimensional energy with a
    24-frame lookahead; therefore, it has low complexity and is suitable
    for real-time ASR.},
}
Non-real-time algorithms (i.e. they need several passes):
@ARTICLE{Spanias1999,
  author = {Sassan Ahmadi and Andreas S. Spanias},
  title = {Cepstrum-Based Pitch Detection Using a New Statistical V/UV
    Classification Algorithm},
  journal = {IEEE Transactions on Speech and Audio Processing},
  year = {1999},
  volume = {7},
  number = {3},
  pages = {333-338},
  month = {May},
  keywords = {none VAD},
  abstract = {An improved cepstrum-based voicing detection and pitch
    determination algorithm is presented. Voicing decisions are made
    using a multifeature voiced/unvoiced classification algorithm based
    on statistical analysis of the cepstral peak, zero-crossing rate, and
    energy of short-time segments of the speech signal. Pitch frequency
    information is extracted by a modified cepstrum-based method and then
    carefully refined using pitch tracking, correction, and smoothing
    algorithms. Performance analysis on a large database indicates
    considerable improvement relative to the conventional cepstrum
    method. The proposed algorithm is also shown to be robust to additive
    noise.},
}
Cheers,
David