Added some basic pitch tracking to nlp to help with errors in mmt1. Seems to work...

author drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>

Sun, 13 Sep 2009 06:27:53 +0000 (06:27 +0000)

committer drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>

Sun, 13 Sep 2009 06:27:53 +0000 (06:27 +0000)
author drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
Sun, 13 Sep 2009 06:27:53 +0000 (06:27 +0000)
committer drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
Sun, 13 Sep 2009 06:27:53 +0000 (06:27 +0000)
diff --git a/codec2/README.txt b/codec2/README.txt

index 2c75b5681e4608a0788e5ed0c989bd5112c204ee..a756cf51983955012d06944465600141492454a9 100644 (file)
--- a/codec2/README.txt
+++ b/codec2/README.txt
@@ -214,10 +214,56 @@ Nope - I don't think it's possible to build a compatible codec without
  infringing on patents or access to commercial in confidence
  information.
  
+[[hacking]]
+Hacking
+-------
+
+If you would like to work on the Codec2 code base here are some
+notes:
+
+* src/code.sh will perform the several processing steps
+  required to output speech files at various stages of processing,
+  for example:
+
+  $ ./code.sh hts1a
++
+will produce hts1a_uq (unquantised, i.e. baseline sinusoidal model),
+hts1a_phase0 (zero phase model), hts1a_lpc10 (10th order LPC model).
+
+* You can then listen to all of these samples (and the original)
+  using:
+
+  $ ./listen.sh hts1a
+
+* Specific notes about LPC and Phase modelling are below.
+
+* There are some useful scripts in the scripts directory, for example
+  wav2raw.sh, raw2wav.sh, playraw.sh, menu.sh.  Note that code.sh and
+  listen.sh are in the src directory as that's where they get used most
+  of the time.
+
  [[lpc]]
  LPC Modelling Notes
  -------------------
  
+Linear Prediction Coefficient (LPC) modelling is used to model the
+sine wave amplitudes { A }.  The use of LPC in speech coding is
+common, although the application of LPC modelling to frequency domain
+coding is fairly novel.  LPC is mainly used for time domain codecs
+like LPC-10 and CELP.
+
+LPC modelling has a couple of advantages:
+
+* From time domain coding we know a lot about LPC, for example how to
+  quantise them efficiently using Line Spectrum Pairs (LSPs).
+
+* The number of amplitudes varies each frame as Wo and hence L vary.
+  This makes the { A } tricky to quantise and transmit.  However it is
+  possible to convey the same information using a fixed number of
+  LPCs which makes the quantisation problem easier.
+
+To test LPC modelling:
+
    $ ./sinedec ../raw/hts1a.raw hts1a.mdl --lpc 10 - hts1a_lpc10.raw
  
  The blog post [4] discusses why LPC modelling works so well when Am
diff --git a/codec2/raw/forig_speex_8k.raw b/codec2/raw/forig_speex_8k.raw

new file mode 100644 (file)

index 0000000..e95302e

Binary files /dev/null and b/codec2/raw/forig_speex_8k.raw differ
diff --git a/codec2/src/code.sh b/codec2/src/code.sh

index 7be303c5e1e038d5411945d1007643652feb886b..9ab6b0a650d2a313fd81a2f2309705a7ed6d2fd0 100644 (file)
--- a/codec2/src/code.sh
+++ b/codec2/src/code.sh
@@ -7,6 +7,6 @@
  ../unittest/tnlp ../raw/$1.raw ../unittest/$1_nlp.p
  ../src/sinenc ../raw/$1.raw %1.mdl 300 ../unittest/$1_nlp.p
  ../src/sinedec ../raw/$1.raw %1.mdl -o $1_uq.raw
-../src/sinedec ../raw/$1.raw %1.mdl --phase 0 -o $1_phase0.raw
+../src/sinedec ../raw/$1.raw %1.mdl --phase 0 -o $1_phase0.raw --postfilter
  ../src/sinedec ../raw/$1.raw %1.mdl --lpc 10 -o $1_lpc10.raw
  
diff --git a/codec2/src/defines.h b/codec2/src/defines.h

index 255f0f0f0593fac8323dea3f5f42e7522ef90083..e9702f0595dae486fae3060be11581208aa02a28 100644 (file)
--- a/codec2/src/defines.h
+++ b/codec2/src/defines.h
@@ -58,7 +58,7 @@
  
  #define AW_DEC 160             /* number of samples in synthesis window */
  #define FFT_DEC 512            /* number of points in DFT */
-#define TW 40                  /* Trapezoidal UV synthesis window overlap */
+#define TW 40                  /* Trapezoidal synthesis window overlap */
  #define MAX_STR 256
  
  /*---------------------------------------------------------------------------*\
diff --git a/codec2/src/listen1.sh b/codec2/src/listen1.sh

index a9b156ce8ecd77b453f26af6137404e95b8d1c1f..c609b18900a0aee01d6b5bff127a40418af35131 100755 (executable)
--- a/codec2/src/listen1.sh
+++ b/codec2/src/listen1.sh
@@ -4,6 +4,6 @@
  #
  # Run menu with common sample file options, headphone version
  
-../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw $1_lpc10.raw -d /dev/dsp1
+../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw $1_lpc10.raw ../raw/$1_speex_8k.raw -d /dev/dsp1
  
  
diff --git a/codec2/src/nlp.c b/codec2/src/nlp.c

index 8b81c479f26be0cfb7cee8c89a46deaad55ecc5d..d1077894eef5f7f85a8303dfa8c054ffacb2879b 100644 (file)
--- a/codec2/src/nlp.c
+++ b/codec2/src/nlp.c
@@ -109,7 +109,8 @@ float nlp_fir[] = {
  float test_candidate_mbe(COMP Sw[], float f0);
  float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax);
  float post_process_sub_multiples(COMP Fw[], 
-                                int pmin, int pmax, float gmax, int gmax_bin);
+                                int pmin, int pmax, float gmax, int gmax_bin,
+                                float *prev_Wo);
  extern int frames;
  
  /*---------------------------------------------------------------------------*\
@@ -149,7 +150,8 @@ float nlp(
    int    pmin,                 /* minimum pitch value */
    int    pmax,                 /* maximum pitch value */
    float *pitch,                        /* estimated pitch period in samples */
-  COMP   Sw[]                   /* Freq domain version of Sn[] */
+  COMP   Sw[],                  /* Freq domain version of Sn[] */
+  float *prev_Wo
  )
  {
    static float sq[PMAX_M];     /* squared speech samples */
@@ -192,8 +194,9 @@ float nlp(
      Fw[i].real = 0.0;
      Fw[i].imag = 0.0;
    }
-  for(i=0; i<m/DEC; i++)
+  for(i=0; i<m/DEC; i++) {
      Fw[i].real = sq[i*DEC]*(0.5 - 0.5*cos(2*PI*i/(m/DEC-1)));
+  }
    dump_dec(Fw);
    four1(&Fw[-1].imag,PE_FFT_SIZE,1);
    for(i=0; i<PE_FFT_SIZE; i++)
@@ -215,7 +218,7 @@ float nlp(
    #ifdef POST_PROCESS_MBE
    best_f0 = post_process_mbe(Fw, pmin, pmax, gmax);
    #else
-  best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin);
+  best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, prev_Wo);
    #endif
  
    /* Shift samples in buffer to make room for new samples */
@@ -251,20 +254,22 @@ float nlp(
  \*---------------------------------------------------------------------------*/
  
  float post_process_sub_multiples(COMP Fw[], 
-                                int pmin, int pmax, float gmax, int gmax_bin)
+                                int pmin, int pmax, float gmax, int gmax_bin,
+                                float *prev_Wo)
  {
      int   min_bin, cmax_bin;
      int   mult;
      float thresh, best_f0;
      int   b, bmin, bmax, lmax_bin;
      float lmax, cmax;
+    int   prev_f0_bin;
  
      /* post process estimate by searching submultiples */
  
      mult = 2;
      min_bin = PE_FFT_SIZE*DEC/pmax;
-    thresh = CNLP*gmax;
      cmax_bin = gmax_bin;
+    prev_f0_bin = *prev_Wo*(4000.0/PI)*(PE_FFT_SIZE*DEC)/SAMPLE_RATE;
  
      while(gmax_bin/mult >= min_bin) {
  
@@ -273,7 +278,15 @@ float post_process_sub_multiples(COMP Fw[],
         bmax = 1.2*b;
         if (bmin < min_bin)
             bmin = min_bin;
-      
+
+       /* lower threshold to favour previous frame's pitch estimate,
+           this is a form of pitch tracking */
+
+       if ((prev_f0_bin > bmin) && (prev_f0_bin < bmax))
+           thresh = CNLP*0.5*gmax;
+       else
+           thresh = CNLP*gmax;
+
         lmax = 0;
         for (b=bmin; b<=bmax; b++)              /* look for maximum in interval */
             if (Fw[b].real > lmax) {
@@ -282,7 +295,7 @@ float post_process_sub_multiples(COMP Fw[],
             }
  
         if (lmax > thresh)
-           if (lmax > Fw[lmax_bin-1].real && lmax > Fw[lmax_bin+1].real) {
+           if ((lmax > Fw[lmax_bin-1].real) && (lmax > Fw[lmax_bin+1].real)) {
                 cmax = lmax;
                 cmax_bin = lmax_bin;
             }
diff --git a/codec2/src/nlp.h b/codec2/src/nlp.h

index b39c02e7373e35ff71142414bfb6920e0bb4cae8..05000617e284bcca10a581fde0c511a9902bae72 100644 (file)
--- a/codec2/src/nlp.h
+++ b/codec2/src/nlp.h
@@ -33,6 +33,6 @@
  
  #define NLP_NTAP 48     /* Decimation LPF order */
  
-float nlp(float Sn[], int n, int m, int pmin, int pmax, float *pitch, COMP  Sw[]);
+float nlp(float Sn[], int n, int m, int pmin, int pmax, float *pitch, COMP Sw[], float *prev_Wo);
  
  #endif
diff --git a/codec2/src/phase.c b/codec2/src/phase.c

index 25dcf946554b26d9147869d9fec5565ad55ea181..e13e99c613b4a0a2ee02152e67250e2032fe990c 100644 (file)
--- a/codec2/src/phase.c
+++ b/codec2/src/phase.c
@@ -342,7 +342,6 @@ void phase_synth_zero_order(
      /* generate excitation */
  
      if (m <= Lrand) {
-       b = floor(m*model.Wo*FFT_DEC/TWO_PI + 0.5);
          Ex[m].real = cos(ex_phase[0]*m);
         Ex[m].imag = sin(ex_phase[0]*m);
  
diff --git a/codec2/src/quantise.c b/codec2/src/quantise.c

index d00036b5a26206a7666156d918feae194a81ff2b..d855412fab118def1dd66fe8aae40f985b824e33 100644 (file)
--- a/codec2/src/quantise.c
+++ b/codec2/src/quantise.c
@@ -321,6 +321,9 @@ void aks_to_M2(
    float Em;            /* energy in band */
    float Am;            /* spectral amplitude sample */
    float signal, noise;
+  float E1,Am1;
+
+  Am1 = model->A[1];
  
    r = TWO_PI/(FFT_DEC);
  
@@ -359,17 +362,25 @@ void aks_to_M2(
    }
    *snr = 10.0*log10(signal/noise);
  
-  /* attenuate fundamental by 30dB if F0 < 150 Hz.  LPC modelling often makes
-     big errors on 1st harmonic, which is usually at very low level due to
-     analog HPF.
+  /* 
+     Attenuate fundamental by 30dB if F0 < 150 Hz and LPC modelling
+     error for A[1] is larger than 6dB.
  
-     Another option is to use a single bit to swith thos attenuation
-     in and out based on measured error an encoder.  That way
-     non-HPF speech won't be impaired.
-   */
+     LPC modelling often makes big errors on 1st harmonic, for example
+     when the fundamental has been removed by analog high pass
+     filtering before sampling.  However on unfiltered speech from
+     high quality sources we would like to keep the fundamental to
+     maintain the speech quality.  So we check the error in A[1] and
+     attenuate it if the error is large to avoid annoying low
+     frequency energy after LPC modelling.
  
-  if (model->Wo < PI*150.0/4000) {
-      model->A[1] *= 0.032;
-  }
+     This will require a single bit to quantise, on top of the other
+     spectral magnitude bits (i.e. LSP bits + 1 total).
+   */
  
+  E1 = fabs(20.0*log10(Am1) - 20.0*log10(model->A[1]));
+  if (E1 > 6.0)
+      if (model->Wo < (PI*150.0/4000)) {
+         model->A[1] *= 0.032;
+      }
  }
diff --git a/codec2/unittest/tnlp.c b/codec2/unittest/tnlp.c

index 0a9f51ab04f9dbad1fd1e2d2495ede207ef3d6fb..dec36bb6a3fe8d2e3d71b67d79c07d999e6fd54c 100644 (file)
--- a/codec2/unittest/tnlp.c
+++ b/codec2/unittest/tnlp.c
@@ -80,6 +80,7 @@ char *argv[];
      float pitch;
      int   i; 
      int   dump;
+    float prev_Wo;
      
      if (argc < 3) {
         printf("\nusage: tnlp InputRawSpeechFile OutputPitchTextFile "
@@ -109,6 +110,7 @@ char *argv[];
      make_window(NW);
  
      frames = 0;
+    prev_Wo = 0;
      while(fread(buf,sizeof(short),N,fin)) {
        frames++;
  
@@ -121,7 +123,8 @@ char *argv[];
        dft_speech();
        dump_Sn(Sn); dump_Sw(Sw); 
  
-      nlp(Sn,N,M,PITCH_MIN,PITCH_MAX,&pitch,Sw);
+      nlp(Sn,N,M,PITCH_MIN,PITCH_MAX,&pitch,Sw,&prev_Wo);
+      prev_Wo = TWO_PI/pitch;
  
        fprintf(fout,"%f\n",pitch);
      }
author	drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
	Sun, 13 Sep 2009 06:27:53 +0000 (06:27 +0000)
committer	drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
	Sun, 13 Sep 2009 06:27:53 +0000 (06:27 +0000)
codec2/README.txt		patch \| blob \| history
codec2/raw/forig_speex_8k.raw	[new file with mode: 0644]	patch \| blob
codec2/src/code.sh		patch \| blob \| history
codec2/src/defines.h		patch \| blob \| history
codec2/src/listen1.sh		patch \| blob \| history
codec2/src/nlp.c		patch \| blob \| history
codec2/src/nlp.h		patch \| blob \| history
codec2/src/phase.c		patch \| blob \| history
codec2/src/quantise.c		patch \| blob \| history
codec2/unittest/tnlp.c		patch \| blob \| history