From: drowe67 Date: Sun, 13 Sep 2009 06:27:53 +0000 (+0000) Subject: added some basic pitch tracking to nlp to help with errors in mmt1. Seems to work... X-Git-Url: http://git.whiteaudio.com/gitweb/?a=commitdiff_plain;h=28006535b7d264691f4e781d61449c3608fc9a39;p=freetel-svn-tracking.git added some basic pitch tracking to nlp to help with errors in mmt1. Seems to work OK and not upset other samples git-svn-id: https://svn.code.sf.net/p/freetel/code@61 01035d8c-6547-0410-b346-abe4f91aad63 --- diff --git a/codec2/README.txt b/codec2/README.txt index 2c75b568..a756cf51 100644 --- a/codec2/README.txt +++ b/codec2/README.txt @@ -214,10 +214,56 @@ Nope - I don't think it's possible to build a compatible codec without infringing on patents or access to commercial in confidence information. +[[hacking]] +Hacking +------- + +If you would like to work on the Codec2 code base here are some +notes: + +* src/code.sh will perform the several processing steps + required to output speech files at various processing steps, for + example: + + $ ./code.sh hts1a ++ +will produce hts1a_uq (unquantised, i.e. baseline sinusoidal model), +hts1a_phase0 (zero phase model), hts1a_lpc10 (10th order LPC model). + +* You can then listen to all of these samples (and the original) + using: + + $ ./listen.sh hts1a + +* Specific notes about LPC and Phase modelling are below. + +* There are some useful scripts in the scripts directory, for example + wav2raw.sh, raw2wav.sh, playraw.sh, menu.sh. Note that code.sh and + listen.sh are in the src directory as that's where they get used most + of the time. + [[lpc]] LPC Modelling Notes ------------------- +Linear Prediction Coefficient (LPC) modelling is used to model the +sine wave amplitudes { A }. The use of LPC in speech coding is +common, although the application of LPC modelling to frequency domain +coding is fairly novel. LPCs are mainly used for time domain codecs +like LPC-10 and CELP. 
+ +LPC modelling has a couple of advantages: + +* From time domain coding we know a lot about LPC, for example how to + quantise them efficiently using Line Spectrum Pairs (LSPs). + +* The number of amplitudes varies each frame as Wo and hence L vary. + This makes the { A } tricky to quantise and transmit. However it is + possible to convey the same information using a fixed number of + LPCs which makes the quantisation problem easier. + +To test LPC modelling: + $ ./sinedec ../raw/hts1a.raw hts1a.mdl --lpc 10 - hts1a_lpc10.raw The blog post [4] discusses why LPC modelling works so well when Am diff --git a/codec2/raw/forig_speex_8k.raw b/codec2/raw/forig_speex_8k.raw new file mode 100644 index 00000000..e95302ef Binary files /dev/null and b/codec2/raw/forig_speex_8k.raw differ diff --git a/codec2/src/code.sh b/codec2/src/code.sh index 7be303c5..9ab6b0a6 100644 --- a/codec2/src/code.sh +++ b/codec2/src/code.sh @@ -7,6 +7,6 @@ ../unittest/tnlp ../raw/$1.raw ../unittest/$1_nlp.p ../src/sinenc ../raw/$1.raw %1.mdl 300 ../unittest/$1_nlp.p ../src/sinedec ../raw/$1.raw %1.mdl -o $1_uq.raw -../src/sinedec ../raw/$1.raw %1.mdl --phase 0 -o $1_phase0.raw +../src/sinedec ../raw/$1.raw %1.mdl --phase 0 -o $1_phase0.raw --postfilter ../src/sinedec ../raw/$1.raw %1.mdl --lpc 10 -o $1_lpc10.raw diff --git a/codec2/src/defines.h b/codec2/src/defines.h index 255f0f0f..e9702f05 100644 --- a/codec2/src/defines.h +++ b/codec2/src/defines.h @@ -58,7 +58,7 @@ #define AW_DEC 160 /* number of samples in synthesis window */ #define FFT_DEC 512 /* number of points in DFT */ -#define TW 40 /* Trapezoidal UV synthesis window overlap */ +#define TW 40 /* Trapezoidal synthesis window overlap */ #define MAX_STR 256 /*---------------------------------------------------------------------------*\ diff --git a/codec2/src/listen1.sh b/codec2/src/listen1.sh index a9b156ce..c609b189 100755 --- a/codec2/src/listen1.sh +++ b/codec2/src/listen1.sh @@ -4,6 +4,6 @@ # # Run menu with common sample file 
options, headphone version -../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw $1_lpc10.raw -d /dev/dsp1 +../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw $1_lpc10.raw ../raw/$1_speex_8k.raw -d /dev/dsp1 diff --git a/codec2/src/nlp.c b/codec2/src/nlp.c index 8b81c479..d1077894 100644 --- a/codec2/src/nlp.c +++ b/codec2/src/nlp.c @@ -109,7 +109,8 @@ float nlp_fir[] = { float test_candidate_mbe(COMP Sw[], float f0); float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax); float post_process_sub_multiples(COMP Fw[], - int pmin, int pmax, float gmax, int gmax_bin); + int pmin, int pmax, float gmax, int gmax_bin, + float *prev_Wo); extern int frames; /*---------------------------------------------------------------------------*\ @@ -149,7 +150,8 @@ float nlp( int pmin, /* minimum pitch value */ int pmax, /* maximum pitch value */ float *pitch, /* estimated pitch period in samples */ - COMP Sw[] /* Freq domain version of Sn[] */ + COMP Sw[], /* Freq domain version of Sn[] */ + float *prev_Wo ) { static float sq[PMAX_M]; /* squared speech samples */ @@ -192,8 +194,9 @@ float nlp( Fw[i].real = 0.0; Fw[i].imag = 0.0; } - for(i=0; i= min_bin) { @@ -273,7 +278,15 @@ float post_process_sub_multiples(COMP Fw[], bmax = 1.2*b; if (bmin < min_bin) bmin = min_bin; - + + /* lower threshold to favour previous frames pitch estimate, + this is a form of pitch tracking */ + + if ((prev_f0_bin > bmin) && (prev_f0_bin < bmax)) + thresh = CNLP*0.5*gmax; + else + thresh = CNLP*gmax; + lmax = 0; for (b=bmin; b<=bmax; b++) /* look for maximum in interval */ if (Fw[b].real > lmax) { @@ -282,7 +295,7 @@ float post_process_sub_multiples(COMP Fw[], } if (lmax > thresh) - if (lmax > Fw[lmax_bin-1].real && lmax > Fw[lmax_bin+1].real) { + if ((lmax > Fw[lmax_bin-1].real) && (lmax > Fw[lmax_bin+1].real)) { cmax = lmax; cmax_bin = lmax_bin; } diff --git a/codec2/src/nlp.h b/codec2/src/nlp.h index b39c02e7..05000617 100644 --- a/codec2/src/nlp.h +++ b/codec2/src/nlp.h @@ -33,6 
+33,6 @@ #define NLP_NTAP 48 /* Decimation LPF order */ -float nlp(float Sn[], int n, int m, int pmin, int pmax, float *pitch, COMP Sw[]); +float nlp(float Sn[], int n, int m, int pmin, int pmax, float *pitch, COMP Sw[], float *prev_Wo); #endif diff --git a/codec2/src/phase.c b/codec2/src/phase.c index 25dcf946..e13e99c6 100644 --- a/codec2/src/phase.c +++ b/codec2/src/phase.c @@ -342,7 +342,6 @@ void phase_synth_zero_order( /* generate excitation */ if (m <= Lrand) { - b = floor(m*model.Wo*FFT_DEC/TWO_PI + 0.5); Ex[m].real = cos(ex_phase[0]*m); Ex[m].imag = sin(ex_phase[0]*m); diff --git a/codec2/src/quantise.c b/codec2/src/quantise.c index d00036b5..d855412f 100644 --- a/codec2/src/quantise.c +++ b/codec2/src/quantise.c @@ -321,6 +321,9 @@ void aks_to_M2( float Em; /* energy in band */ float Am; /* spectral amplitude sample */ float signal, noise; + float E1,Am1; + + Am1 = model->A[1]; r = TWO_PI/(FFT_DEC); @@ -359,17 +362,25 @@ void aks_to_M2( } *snr = 10.0*log10(signal/noise); - /* attenuate fundamental by 30dB if F0 < 150 Hz. LPC modelling often makes - big errors on 1st harmonic, which is usually at very low level due to - analog HPF. + /* + Attenuate fundamental by 30dB if F0 < 150 Hz and LPC modelling + error for A[1] is larger than 6dB. - Another option is to use a single bit to swith thos attenuation - in and out based on measured error an encoder. That way - non-HPF speech won't be impaired. - */ + LPC modelling often makes big errors on 1st harmonic, for example + when the fundamental has been removed by analog high pass + filtering before sampling. However on unfiltered speech from + high quality sources we would like to keep the fundamental to + maintain the speech quality. So we check the error in A[1] and + attenuate it if the error is large to avoid annoying low + frequency energy after LPC modelling. 
- if (model->Wo < PI*150.0/4000) { - model->A[1] *= 0.032; - } + This will require a single bit to quantise, on top of the other + spectral magnitude bits (i.e. LSP bits + 1 total). + */ + E1 = fabs(20.0*log10(Am1) - 20.0*log10(model->A[1])); + if (E1 > 6.0) + if (model->Wo < (PI*150.0/4000)) { + model->A[1] *= 0.032; + } } diff --git a/codec2/unittest/tnlp.c b/codec2/unittest/tnlp.c index 0a9f51ab..dec36bb6 100644 --- a/codec2/unittest/tnlp.c +++ b/codec2/unittest/tnlp.c @@ -80,6 +80,7 @@ char *argv[]; float pitch; int i; int dump; + float prev_Wo; if (argc < 3) { printf("\nusage: tnlp InputRawSpeechFile OutputPitchTextFile " @@ -109,6 +110,7 @@ char *argv[]; make_window(NW); frames = 0; + prev_Wo = 0; while(fread(buf,sizeof(short),N,fin)) { frames++; @@ -121,7 +123,8 @@ char *argv[]; dft_speech(); dump_Sn(Sn); dump_Sw(Sw); - nlp(Sn,N,M,PITCH_MIN,PITCH_MAX,&pitch,Sw); + nlp(Sn,N,M,PITCH_MIN,PITCH_MAX,&pitch,Sw,&prev_Wo); + prev_Wo = TWO_PI/pitch; fprintf(fout,"%f\n",pitch); }