From: drowe67 Date: Tue, 8 Sep 2009 10:17:22 +0000 (+0000) Subject: added two forms of post processor for nlp X-Git-Url: http://git.whiteaudio.com/gitweb/?a=commitdiff_plain;h=c1dd6ed91f261316001a01022eed3508bd80fe76;p=freetel-svn-tracking.git added two forms of post processor for nlp git-svn-id: https://svn.code.sf.net/p/freetel/code@46 01035d8c-6547-0410-b346-abe4f91aad63 --- diff --git a/codec2/src/defines.h b/codec2/src/defines.h index c07e7a8f..0c367947 100644 --- a/codec2/src/defines.h +++ b/codec2/src/defines.h @@ -51,7 +51,7 @@ /* Encoder defines */ -#define NW 220 /* analysis window size */ +#define NW 280 /* analysis window size */ #define AW_ENC 512 /* maximum encoder analysis window size */ #define FFT_ENC 512 /* size of FFT used for encoder analysis */ diff --git a/codec2/src/nlp.c b/codec2/src/nlp.c index c63f9d4f..47f41f3a 100644 --- a/codec2/src/nlp.c +++ b/codec2/src/nlp.c @@ -45,6 +45,7 @@ #define PI 3.141592654 /* mathematical constant */ #define T 0.1 /* threshold for local minima candidate */ #define F0_MAX 500 +#define CNLP 0.5 /* post processor constant */ /*---------------------------------------------------------------------------*\ @@ -106,19 +107,39 @@ float nlp_fir[] = { }; float test_candidate_mbe(COMP Sw[], float f0); +float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax); +float post_process_sub_multiples(COMP Fw[], + int pmin, int pmax, float gmax, int gmax_bin); extern int frames; /*---------------------------------------------------------------------------*\ - void nlp() - - Determines the pitch in samples using the NLP algorithm. Returns the - fundamental in Hz. Note that the actual pitch estimate is for the - centre of the M sample Sn[] vector, not the current N sample input - vector. This is (I think) a delay of 2.5 frames with N=80 samples. - You should align further analysis using this pitch estimate to be - centred on the middle of Sn[]. + nlp() + Determines the pitch in samples using the Non Linear Pitch (NLP) + algorithm [1]. Returns the fundamental in Hz. Note that the actual + pitch estimate is for the centre of the M sample Sn[] vector, not + the current N sample input vector. This is (I think) a delay of 2.5 + frames with N=80 samples. You should align further analysis using + this pitch estimate to be centred on the middle of Sn[]. + + Two post processors have been tried, the MBE version (as discussed + in [1]), and a post processor that checks sub-multiples. Both + suffer occasional gross pitch errors (i.e. neither are perfect). In + the presence of background noise the sub-multiple algorithm tends + towards low F0 which leads to better sounding background noise than + the MBE post processor. + + A good way to test and develop the NLP pitch estimator is using the + tnlp (codec2/unittest) and the codec2/octave/plnlp.m Octave script. + + A pitch tracker searching a few frames forward and backward in time + would be a useful addition. + + References: + + [1] http://www.itr.unisa.edu.au/~steven/thesis/dgr.pdf Chapter 4 + \*---------------------------------------------------------------------------*/ float nlp( @@ -138,13 +159,9 @@ float nlp( static float mem_fir[NLP_NTAP];/* decimation FIR filter memory */ COMP Fw[PE_FFT_SIZE]; /* DFT of squared signal */ float gmax; - - float candidate_f0; - float f0,best_f0; /* fundamental frequency */ - float e,e_min; /* MBE cost function */ - int i,j; - float e_hz[F0_MAX]; - int bin; + int gmax_bin; + int i,j; + float best_f0; /* Square, notch filter at DC, and LP filter vector */ @@ -190,9 +207,119 @@ float nlp( for(i=PE_FFT_SIZE*DEC/pmax; i<=PE_FFT_SIZE*DEC/pmin; i++) { if (Fw[i].real > gmax) { gmax = Fw[i].real; + gmax_bin = i; } } + #ifdef POST_PROCESS_MBE + best_f0 = post_process_mbe(Fw, pmin, pmax, gmax); + #else + best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin); + #endif + + /* Shift samples in buffer to make room for new samples */ + + for(i=0; i= min_bin) { + + b = gmax_bin/mult; /* determine search interval */ + bmin = 0.8*b; + bmax = 1.2*b; + if (bmin < min_bin) + bmin = min_bin; + + lmax = 0; + for (b=bmin; b<=bmax; b++) /* look for maximum in interval */ + if (Fw[b].real > lmax) { + lmax = Fw[b].real; + lmax_bin = b; + } + + if (lmax > thresh) + if (lmax > Fw[lmax_bin-1].real && lmax > Fw[lmax_bin+1].real) { + cmax = lmax; + cmax_bin = lmax_bin; + } + + mult++; + } + + best_f0 = (float)cmax_bin*SAMPLE_RATE/(PE_FFT_SIZE*DEC); + + return best_f0; +} + +/*---------------------------------------------------------------------------*\ + + post_process_mbe() + + Use the MBE pitch estimation algorithm to evaluate pitch candidates. This + works OK but the accuracy at low F0 is affected by NW, the analysis window + size used for the DFT of the input speech Sw[]. Also favours high F0 in + the presence of background noise which causes periodic artifacts in the + synthesised speech. + +\*---------------------------------------------------------------------------*/ + +float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax) +{ + float candidate_f0; + float f0,best_f0; /* fundamental frequency */ + float e,e_min; /* MBE cost function */ + int i; + float e_hz[F0_MAX]; + int bin; + float f0_min, f0_max; + float f0_start, f0_end; + + f0_min = (float)SAMPLE_RATE/pmax; + f0_max = (float)SAMPLE_RATE/pmin; + /* Now look for local maxima. Each local maxima is a candidate that we test using the MBE pitch estimation algotithm */ @@ -210,15 +337,15 @@ float nlp( /* OK, sample MBE cost function over +/- 10Hz range in 2.5Hz steps */ candidate_f0 = (float)i*SAMPLE_RATE/(PE_FFT_SIZE*DEC); - if (frames == 29) - printf("candidate F0: %f\n", candidate_f0); - for(f0=candidate_f0-20; f0<=candidate_f0+20; f0+= 2.5) { + f0_start = candidate_f0-20; + f0_end = candidate_f0+20; + if (f0_start < f0_min) f0_start = f0_min; + if (f0_end > f0_max) f0_end = f0_max; + + for(f0=f0_start; f0<=f0_end; f0+= 2.5) { e = test_candidate_mbe(Sw, f0); bin = floor(f0); assert((bin > 0) && (bin < F0_MAX)); e_hz[bin] = e; - if (frames == 29) - printf("f0: %f e: %f e_min: %f best_f0: %f\n", - f0, e, e_min, best_f0); if (e < e_min) { e_min = e; best_f0 = f0; @@ -230,17 +357,9 @@ float nlp( } dump_e(e_hz); - /* Shift samples in buffer to make room for new samples */ - - for(i=0; i