From 5904ae746f226af77fabb8651d54e8d0f458923a Mon Sep 17 00:00:00 2001 From: drowe67 Date: Mon, 15 Oct 2012 20:44:16 +0000 Subject: [PATCH] optional use of different post proc for NLP pitch est git-svn-id: https://svn.code.sf.net/p/freetel/code@755 01035d8c-6547-0410-b346-abe4f91aad63 --- codec2-dev/src/c2sim.c | 52 ++++++++---- codec2-dev/src/codec2.c | 2 +- codec2-dev/src/nlp.c | 173 ++++++++++++++++++++++++++++++++++++++-- codec2-dev/src/nlp.h | 3 +- 4 files changed, 207 insertions(+), 23 deletions(-) diff --git a/codec2-dev/src/c2sim.c b/codec2-dev/src/c2sim.c index 0d9b41c4..2893b015 100644 --- a/codec2-dev/src/c2sim.c +++ b/codec2-dev/src/c2sim.c @@ -49,7 +49,7 @@ #include "ampexp.h" #include "phaseexp.h" -void synth_one_frame(kiss_fft_cfg fft_inv_cfg, short buf[], MODEL *model, float Sn_[], float Pn[]); +void synth_one_frame(kiss_fft_cfg fft_inv_cfg, short buf[], MODEL *model, float Sn_[], float Pn[], int prede, float *de_mem); void print_help(const struct option *long_options, int num_opts, char* argv[]); @@ -65,6 +65,7 @@ int main(int argc, char *argv[]) FILE *fin; /* input speech file */ short buf[N]; /* input/output buffer */ float Sn[M]; /* float input speech samples */ + float Sn_pre[M]; /* pre-emphasised input speech samples */ COMP Sw[FFT_ENC]; /* DFT of Sn[] */ kiss_fft_cfg fft_fwd_cfg; kiss_fft_cfg fft_inv_cfg; @@ -89,6 +90,8 @@ int main(int argc, char *argv[]) int lspres = 0; int lspdt = 0, lspdt_mode = LSPDT_ALL; int dt = 0, lspjvm = 0, lspanssi = 0, lspjnd = 0, lspmel = 0; + int prede = 0; + float pre_mem = 0.0, de_mem = 0.0; float ak[LPC_MAX]; COMP Sw_[FFT_ENC]; COMP Ew[FFT_ENC]; @@ -150,6 +153,7 @@ int main(int argc, char *argv[]) { "hi", no_argument, &hi, 1 }, { "simlpcpf", no_argument, &simlpcpf, 1 }, { "lpcpf", no_argument, &lpcpf, 1 }, + { "prede", no_argument, &prede, 1 }, { "dump_pitch_e", required_argument, &dump_pitch_e, 1 }, { "sq_pitch_e", no_argument, &scalar_quant_Wo_e, 1 }, { "vq_pitch_e", no_argument, &vector_quant_Wo_e, 1 }, @@ 
-162,8 +166,10 @@ int main(int argc, char *argv[]) }; int num_opts=sizeof(long_options)/sizeof(struct option); - for(i=0; i 32767.0) diff --git a/codec2-dev/src/codec2.c b/codec2-dev/src/codec2.c index 5fc2abd4..7130d67b 100644 --- a/codec2-dev/src/codec2.c +++ b/codec2-dev/src/codec2.c @@ -944,7 +944,7 @@ void analyse_one_frame(struct CODEC2 *c2, MODEL *model, short speech[]) /* Estimate pitch */ - nlp(c2->nlp,c2->Sn,N,M,P_MIN,P_MAX,&pitch,Sw, &c2->prev_Wo_enc); + nlp(c2->nlp,c2->Sn,N,M,P_MIN,P_MAX,&pitch,Sw, c2->W, &c2->prev_Wo_enc); model->Wo = TWO_PI/pitch; model->L = PI/model->Wo; diff --git a/codec2-dev/src/nlp.c b/codec2-dev/src/nlp.c index ef58c57e..4b089610 100644 --- a/codec2-dev/src/nlp.c +++ b/codec2-dev/src/nlp.c @@ -4,8 +4,8 @@ AUTHOR......: David Rowe DATE CREATED: 23/3/93 - Non Linear Pitch (NLP) estimation functions. - + Non Linear Pitch (NLP) estimation functions. + \*---------------------------------------------------------------------------*/ /* @@ -117,7 +117,8 @@ typedef struct { kiss_fft_cfg fft_cfg; /* kiss FFT config */ } NLP; -float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax); +float test_candidate_mbe(COMP Sw[], COMP W[], float f0); +float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COMP W[], float *prev_Wo); float post_process_sub_multiples(COMP Fw[], int pmin, int pmax, float gmax, int gmax_bin, float *prev_Wo); @@ -209,6 +210,7 @@ float nlp( int pmax, /* maximum pitch value */ float *pitch, /* estimated pitch period in samples */ COMP Sw[], /* Freq domain version of Sn[] */ + COMP W[], /* Freq domain window */ float *prev_Wo ) { @@ -281,8 +283,12 @@ float nlp( } } - best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, - prev_Wo); + //#define POST_PROCESS_MBE + #ifdef POST_PROCESS_MBE + best_f0 = post_process_mbe(Fw, pmin, pmax, gmax, Sw, W, prev_Wo); + #else + best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, prev_Wo); + #endif /* Shift samples in buffer to 
make room for new samples */ @@ -299,7 +305,7 @@ float nlp( post_process_sub_multiples() - Given the global maximma of Fw[] we search interger submultiples for + Given the global maxima of Fw[] we search integer submultiples for local maxima. If local maxima exist and they are above an experimentally derived threshold (OK a magic number I pulled out of the air) we choose the submultiple as the F0 estimate. @@ -372,3 +378,158 @@ float post_process_sub_multiples(COMP Fw[], return best_f0; } +/*---------------------------------------------------------------------------*\ + + post_process_mbe() + + Use the MBE pitch estimation algorithm to evaluate pitch candidates. This + works OK but the accuracy at low F0 is affected by NW, the analysis window + size used for the DFT of the input speech Sw[]. Also favours high F0 in + the presence of background noise which causes periodic artifacts in the + synthesised speech. + +\*---------------------------------------------------------------------------*/ + +float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COMP W[], float *prev_Wo) +{ + float candidate_f0; + float f0,best_f0; /* fundamental frequency */ + float e,e_min; /* MBE cost function */ + int i; + float e_hz[F0_MAX]; + int bin; + float f0_min, f0_max; + float f0_start, f0_end; + + f0_min = (float)SAMPLE_RATE/pmax; + f0_max = (float)SAMPLE_RATE/pmin; + + /* Now look for local maxima. 
Each local maximum is a candidate + that we test using the MBE pitch estimation algorithm */ + + for(i=0; i Fw[i-1].real) && (Fw[i].real > Fw[i+1].real)) { + + /* local maximum found, let's test if it's big enough */ + + if (Fw[i].real > T*gmax) { + + /* OK, sample MBE cost function over +/- 20Hz range in 2.5Hz steps */ + + candidate_f0 = (float)i*SAMPLE_RATE/(PE_FFT_SIZE*DEC); + f0_start = candidate_f0-20; + f0_end = candidate_f0+20; + if (f0_start < f0_min) f0_start = f0_min; + if (f0_end > f0_max) f0_end = f0_max; + + for(f0=f0_start; f0<=f0_end; f0+= 2.5) { + e = test_candidate_mbe(Sw, W, f0); + bin = floor(f0); assert((bin > 0) && (bin < F0_MAX)); + e_hz[bin] = e; + if (e < e_min) { + e_min = e; + best_f0 = f0; + } + } + + } + } + } + + /* finally sample MBE cost function around previous pitch estimate + (form of pitch tracking) */ + + candidate_f0 = *prev_Wo * SAMPLE_RATE/TWO_PI; + f0_start = candidate_f0-20; + f0_end = candidate_f0+20; + if (f0_start < f0_min) f0_start = f0_min; + if (f0_end > f0_max) f0_end = f0_max; + + for(f0=f0_start; f0<=f0_end; f0+= 2.5) { + e = test_candidate_mbe(Sw, W, f0); + bin = floor(f0); assert((bin > 0) && (bin < F0_MAX)); + e_hz[bin] = e; + if (e < e_min) { + e_min = e; + best_f0 = f0; + } + } + + #ifdef DUMP + dump_e(e_hz); + #endif + + return best_f0; +} + +/*---------------------------------------------------------------------------*\ + + test_candidate_mbe() + + Returns the error of the MBE cost function for the input f0. + + Note: I think a lot of the operations below can be simplified as + W[].imag = 0 and has been normalised such that den always equals 1. 
+ +\*---------------------------------------------------------------------------*/ + +float test_candidate_mbe( + COMP Sw[], + COMP W[], + float f0 +) +{ + COMP Sw_[FFT_ENC]; /* DFT of all voiced synthesised signal */ + int l,al,bl,m; /* loop variables */ + COMP Am; /* amplitude sample for this band */ + int offset; /* centers Hw[] about current harmonic */ + float den; /* denominator of Am expression */ + float error; /* accumulated error between original and synthesised */ + float Wo; /* current "test" fundamental freq. */ + int L; + + L = floor((SAMPLE_RATE/2.0)/f0); + Wo = f0*(2*PI/SAMPLE_RATE); + + error = 0.0; + + /* Just test across the harmonics in the first 1000 Hz (L/4) */ + + for(l=1; l