From: drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
Date: Sun, 13 Sep 2009 06:27:53 +0000 (+0000)
Subject: added some basic pitch tracking to nlp to help with errors in mmt1.  Seems to work... 
X-Git-Url: http://git.whiteaudio.com/gitweb/?a=commitdiff_plain;h=28006535b7d264691f4e781d61449c3608fc9a39;p=freetel-svn-tracking.git

added some basic pitch tracking to nlp to help with errors in mmt1.  Seems to work OK and not upset other samples

git-svn-id: https://svn.code.sf.net/p/freetel/code@61 01035d8c-6547-0410-b346-abe4f91aad63
---

diff --git a/codec2/README.txt b/codec2/README.txt
index 2c75b568..a756cf51 100644
--- a/codec2/README.txt
+++ b/codec2/README.txt
@@ -214,10 +214,56 @@ Nope - I don't think it's possible to build a compatible codec without
 infringing on patents or access to commercial in confidence
 information.
 
+[[hacking]]
+Hacking
+-------
+
+If you would like to work on the Codec2 code base here are some
+notes:
+
+* src/code.sh runs the several processing steps required to
+  output speech files at various stages of the processing chain, for
+  example:
+
+  $ ./code.sh hts1a
++
+will produce hts1a_uq (unquantised, i.e. baseline sinusoidal model),
+hts1a_phase0 (zero phase model), hts1a_lpc10 (10th order LPC model).
+
+* You can then listen to all of these samples (and the original)
+  using:
+
+  $ ./listen.sh hts1a
+
+* Specific notes about LPC and Phase modelling are below.
+
+* There are some useful scripts in the scripts directory, for example
+  wav2raw.sh, raw2wav.sh, playraw.sh, menu.sh.  Note that code.sh and
+  listen.sh are in the src directory as that's where they get used most
+  of the time.
+
 [[lpc]]
 LPC Modelling Notes
 -------------------
 
+Linear Prediction Coefficient (LPC) modelling is used to model the
+sine wave amplitudes { A }.  The use of LPC in speech coding is
+common, although the application of LPC modelling to frequency domain
+coding is fairly novel.  LPCs are mainly used in time domain codecs
+like LPC-10 and CELP.
+
+LPC modelling has a couple of advantages:
+
+* From time domain coding we know a lot about LPC, for example how to
+  quantise them efficiently using Line Spectrum Pairs (LSPs).
+
+* The number of amplitudes varies each frame as Wo and hence L vary.
+  This makes the { A } tricky to quantise and transmit.  However it is
+  possible to convey the same information using a fixed number of
+  LPCs which makes the quantisation problem easier.
+
+To test LPC modelling:
+
   $ ./sinedec ../raw/hts1a.raw hts1a.mdl --lpc 10 - hts1a_lpc10.raw
 
 The blog post [4] discusses why LPC modelling works so well when Am
diff --git a/codec2/raw/forig_speex_8k.raw b/codec2/raw/forig_speex_8k.raw
new file mode 100644
index 00000000..e95302ef
Binary files /dev/null and b/codec2/raw/forig_speex_8k.raw differ
diff --git a/codec2/src/code.sh b/codec2/src/code.sh
index 7be303c5..9ab6b0a6 100644
--- a/codec2/src/code.sh
+++ b/codec2/src/code.sh
@@ -7,6 +7,6 @@
 ../unittest/tnlp ../raw/$1.raw ../unittest/$1_nlp.p
 ../src/sinenc ../raw/$1.raw %1.mdl 300 ../unittest/$1_nlp.p
 ../src/sinedec ../raw/$1.raw %1.mdl -o $1_uq.raw
-../src/sinedec ../raw/$1.raw %1.mdl --phase 0 -o $1_phase0.raw
+../src/sinedec ../raw/$1.raw %1.mdl --phase 0 -o $1_phase0.raw --postfilter
 ../src/sinedec ../raw/$1.raw %1.mdl --lpc 10 -o $1_lpc10.raw
 
diff --git a/codec2/src/defines.h b/codec2/src/defines.h
index 255f0f0f..e9702f05 100644
--- a/codec2/src/defines.h
+++ b/codec2/src/defines.h
@@ -58,7 +58,7 @@
 
 #define AW_DEC 160		/* number of samples in synthesis window */
 #define FFT_DEC 512	    	/* number of points in DFT */
-#define TW 40			/* Trapezoidal UV synthesis window overlap */
+#define TW 40			/* Trapezoidal synthesis window overlap */
 #define MAX_STR 256
 
 /*---------------------------------------------------------------------------*\
diff --git a/codec2/src/listen1.sh b/codec2/src/listen1.sh
index a9b156ce..c609b189 100755
--- a/codec2/src/listen1.sh
+++ b/codec2/src/listen1.sh
@@ -4,6 +4,6 @@
 #
 # Run menu with common sample file options, headphone version
 
-../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw $1_lpc10.raw -d /dev/dsp1
+../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw $1_lpc10.raw ../raw/$1_speex_8k.raw -d /dev/dsp1
 
 
diff --git a/codec2/src/nlp.c b/codec2/src/nlp.c
index 8b81c479..d1077894 100644
--- a/codec2/src/nlp.c
+++ b/codec2/src/nlp.c
@@ -109,7 +109,8 @@ float nlp_fir[] = {
 float test_candidate_mbe(COMP Sw[], float f0);
 float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax);
 float post_process_sub_multiples(COMP Fw[], 
-				 int pmin, int pmax, float gmax, int gmax_bin);
+				 int pmin, int pmax, float gmax, int gmax_bin,
+				 float *prev_Wo);
 extern int frames;
 
 /*---------------------------------------------------------------------------*\
@@ -149,7 +150,8 @@ float nlp(
   int    pmin,			/* minimum pitch value */
   int    pmax,			/* maximum pitch value */
   float *pitch,			/* estimated pitch period in samples */
-  COMP   Sw[]                   /* Freq domain version of Sn[] */
+  COMP   Sw[],                  /* Freq domain version of Sn[] */
+  float *prev_Wo
 )
 {
   static float sq[PMAX_M];	/* squared speech samples */
@@ -192,8 +194,9 @@ float nlp(
     Fw[i].real = 0.0;
     Fw[i].imag = 0.0;
   }
-  for(i=0; i<m/DEC; i++)
+  for(i=0; i<m/DEC; i++) {
     Fw[i].real = sq[i*DEC]*(0.5 - 0.5*cos(2*PI*i/(m/DEC-1)));
+  }
   dump_dec(Fw);
   four1(&Fw[-1].imag,PE_FFT_SIZE,1);
   for(i=0; i<PE_FFT_SIZE; i++)
@@ -215,7 +218,7 @@ float nlp(
   #ifdef POST_PROCESS_MBE
   best_f0 = post_process_mbe(Fw, pmin, pmax, gmax);
   #else
-  best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin);
+  best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, prev_Wo);
   #endif
 
   /* Shift samples in buffer to make room for new samples */
@@ -251,20 +254,22 @@ float nlp(
 \*---------------------------------------------------------------------------*/
 
 float post_process_sub_multiples(COMP Fw[], 
-				 int pmin, int pmax, float gmax, int gmax_bin)
+				 int pmin, int pmax, float gmax, int gmax_bin,
+				 float *prev_Wo)
 {
     int   min_bin, cmax_bin;
     int   mult;
     float thresh, best_f0;
     int   b, bmin, bmax, lmax_bin;
     float lmax, cmax;
+    int   prev_f0_bin;
 
     /* post process estimate by searching submultiples */
 
     mult = 2;
     min_bin = PE_FFT_SIZE*DEC/pmax;
-    thresh = CNLP*gmax;
     cmax_bin = gmax_bin;
+    prev_f0_bin = *prev_Wo*(4000.0/PI)*(PE_FFT_SIZE*DEC)/SAMPLE_RATE;
 
     while(gmax_bin/mult >= min_bin) {
 
@@ -273,7 +278,15 @@ float post_process_sub_multiples(COMP Fw[],
 	bmax = 1.2*b;
 	if (bmin < min_bin)
 	    bmin = min_bin;
-      
+
+	/* lower threshold to favour previous frame's pitch estimate,
+	    this is a form of pitch tracking */
+
+	if ((prev_f0_bin > bmin) && (prev_f0_bin < bmax))
+	    thresh = CNLP*0.5*gmax;
+	else
+	    thresh = CNLP*gmax;
+
 	lmax = 0;
 	for (b=bmin; b<=bmax; b++) 		/* look for maximum in interval */
 	    if (Fw[b].real > lmax) {
@@ -282,7 +295,7 @@ float post_process_sub_multiples(COMP Fw[],
 	    }
 
 	if (lmax > thresh)
-	    if (lmax > Fw[lmax_bin-1].real && lmax > Fw[lmax_bin+1].real) {
+	    if ((lmax > Fw[lmax_bin-1].real) && (lmax > Fw[lmax_bin+1].real)) {
 		cmax = lmax;
 		cmax_bin = lmax_bin;
 	    }
diff --git a/codec2/src/nlp.h b/codec2/src/nlp.h
index b39c02e7..05000617 100644
--- a/codec2/src/nlp.h
+++ b/codec2/src/nlp.h
@@ -33,6 +33,6 @@
 
 #define NLP_NTAP 48	 /* Decimation LPF order */
 
-float nlp(float Sn[], int n, int m, int pmin, int pmax, float *pitch, COMP  Sw[]);
+float nlp(float Sn[], int n, int m, int pmin, int pmax, float *pitch, COMP Sw[], float *prev_Wo);
 
 #endif
diff --git a/codec2/src/phase.c b/codec2/src/phase.c
index 25dcf946..e13e99c6 100644
--- a/codec2/src/phase.c
+++ b/codec2/src/phase.c
@@ -342,7 +342,6 @@ void phase_synth_zero_order(
     /* generate excitation */
 
     if (m <= Lrand) {
-	b = floor(m*model.Wo*FFT_DEC/TWO_PI + 0.5);
         Ex[m].real = cos(ex_phase[0]*m);
 	Ex[m].imag = sin(ex_phase[0]*m);
 
diff --git a/codec2/src/quantise.c b/codec2/src/quantise.c
index d00036b5..d855412f 100644
--- a/codec2/src/quantise.c
+++ b/codec2/src/quantise.c
@@ -321,6 +321,9 @@ void aks_to_M2(
   float Em;		/* energy in band */
   float Am;		/* spectral amplitude sample */
   float signal, noise;
+  float E1,Am1;
+
+  Am1 = model->A[1];
 
   r = TWO_PI/(FFT_DEC);
 
@@ -359,17 +362,25 @@ void aks_to_M2(
   }
   *snr = 10.0*log10(signal/noise);
 
-  /* attenuate fundamental by 30dB if F0 < 150 Hz.  LPC modelling often makes
-     big errors on 1st harmonic, which is usually at very low level due to
-     analog HPF.
+  /* 
+     Attenuate fundamental by 30dB if F0 < 150 Hz and LPC modelling
+     error for A[1] is larger than 6dB.
 
-     Another option is to use a single bit to swith thos attenuation
-     in and out based on measured error an encoder.  That way
-     non-HPF speech won't be impaired.
-   */
+     LPC modelling often makes big errors on 1st harmonic, for example
+     when the fundamental has been removed by analog high pass
+     filtering before sampling.  However on unfiltered speech from
+     high quality sources we would like to keep the fundamental to
+     maintain the speech quality.  So we check the error in A[1] and
+     attenuate it if the error is large to avoid annoying low
+     frequency energy after LPC modelling.
 
-  if (model->Wo < PI*150.0/4000) {
-      model->A[1] *= 0.032;
-  }
+     This will require a single bit to quantise, on top of the other
+     spectral magnitude bits (i.e. LSP bits + 1 total).
+   */
 
+  E1 = fabs(20.0*log10(Am1) - 20.0*log10(model->A[1]));
+  if (E1 > 6.0)
+      if (model->Wo < (PI*150.0/4000)) {
+	  model->A[1] *= 0.032;
+      }
 }
diff --git a/codec2/unittest/tnlp.c b/codec2/unittest/tnlp.c
index 0a9f51ab..dec36bb6 100644
--- a/codec2/unittest/tnlp.c
+++ b/codec2/unittest/tnlp.c
@@ -80,6 +80,7 @@ char *argv[];
     float pitch;
     int   i; 
     int   dump;
+    float prev_Wo;
     
     if (argc < 3) {
 	printf("\nusage: tnlp InputRawSpeechFile OutputPitchTextFile "
@@ -109,6 +110,7 @@ char *argv[];
     make_window(NW);
 
     frames = 0;
+    prev_Wo = 0;
     while(fread(buf,sizeof(short),N,fin)) {
       frames++;
 
@@ -121,7 +123,8 @@ char *argv[];
       dft_speech();
       dump_Sn(Sn); dump_Sw(Sw); 
 
-      nlp(Sn,N,M,PITCH_MIN,PITCH_MAX,&pitch,Sw);
+      nlp(Sn,N,M,PITCH_MIN,PITCH_MAX,&pitch,Sw,&prev_Wo);
+      prev_Wo = TWO_PI/pitch;
 
       fprintf(fout,"%f\n",pitch);
     }