optional use of different post proc for NLP pitch est

author drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>

Mon, 15 Oct 2012 20:44:16 +0000 (20:44 +0000)

committer drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>

Mon, 15 Oct 2012 20:44:16 +0000 (20:44 +0000)
author drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
Mon, 15 Oct 2012 20:44:16 +0000 (20:44 +0000)
committer drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
Mon, 15 Oct 2012 20:44:16 +0000 (20:44 +0000)
diff --git a/codec2-dev/src/c2sim.c b/codec2-dev/src/c2sim.c

index 0d9b41c46d00c69926c8f391e3340bcef72bf9ef..2893b015bcbeea95b9e36c2265a52c7215d37087 100644 (file)
--- a/codec2-dev/src/c2sim.c
+++ b/codec2-dev/src/c2sim.c
@@ -49,7 +49,7 @@
  #include "ampexp.h"
  #include "phaseexp.h"
  
-void synth_one_frame(kiss_fft_cfg fft_inv_cfg, short buf[], MODEL *model, float Sn_[], float Pn[]);
+void synth_one_frame(kiss_fft_cfg fft_inv_cfg, short buf[], MODEL *model, float Sn_[], float Pn[], int prede, float *de_mem);
  void print_help(const struct option *long_options, int num_opts, char* argv[]);
  
  
@@ -65,6 +65,7 @@ int main(int argc, char *argv[])
      FILE *fin;         /* input speech file                     */
      short buf[N];      /* input/output buffer                   */
      float Sn[M];       /* float input speech samples            */
+    float Sn_pre[M];   /* pre-emphasised input speech samples   */
      COMP  Sw[FFT_ENC]; /* DFT of Sn[]                           */
      kiss_fft_cfg  fft_fwd_cfg;
      kiss_fft_cfg  fft_inv_cfg;
@@ -89,6 +90,8 @@ int main(int argc, char *argv[])
      int lspres = 0;
      int lspdt = 0, lspdt_mode = LSPDT_ALL;
      int dt = 0, lspjvm = 0, lspanssi = 0, lspjnd = 0, lspmel = 0;
+    int prede = 0;
+    float pre_mem = 0.0, de_mem = 0.0;
      float ak[LPC_MAX];
      COMP  Sw_[FFT_ENC];
      COMP  Ew[FFT_ENC]; 
@@ -150,6 +153,7 @@ int main(int argc, char *argv[])
          { "hi", no_argument, &hi, 1 },
          { "simlpcpf", no_argument, &simlpcpf, 1 },
          { "lpcpf", no_argument, &lpcpf, 1 },
+        { "prede", no_argument, &prede, 1 },
          { "dump_pitch_e", required_argument, &dump_pitch_e, 1 },
          { "sq_pitch_e", no_argument, &scalar_quant_Wo_e, 1 },
          { "vq_pitch_e", no_argument, &vector_quant_Wo_e, 1 },
@@ -162,8 +166,10 @@ int main(int argc, char *argv[])
      };
      int num_opts=sizeof(long_options)/sizeof(struct option);
      
-    for(i=0; i<M; i++)
+    for(i=0; i<M; i++) {
         Sn[i] = 1.0;
+       Sn_pre[i] = 1.0;
+    }
      for(i=0; i<2*N; i++)
         Sn_[i] = 0;
  
@@ -354,21 +360,26 @@ int main(int argc, char *argv[])
  
         /* Read input speech */
  
-       for(i=0; i<M-N; i++)
+       for(i=0; i<M-N; i++) {
             Sn[i] = Sn[i+N];
+           Sn_pre[i] = Sn_pre[i+N];
+       }
         for(i=0; i<N; i++)
             Sn[i+M-N] = buf[i];
- 
+
+       pre_emp(&Sn_pre[M-N], &Sn[M-N], &pre_mem, N);
+       
+
         /*------------------------------------------------------------*\
  
                        Estimate Sinusoidal Model Parameters
  
         \*------------------------------------------------------------*/
  
-       nlp(nlp_states,Sn,N,M,P_MIN,P_MAX,&pitch,Sw,&prev_uq_Wo);
+       nlp(nlp_states,Sn,N,M,P_MIN,P_MAX,&pitch,Sw,W,&prev_uq_Wo);
         model.Wo = TWO_PI/pitch;
  
-       dft_speech(fft_fwd_cfg, Sw, Sn, w); 
+       dft_speech(fft_fwd_cfg, Sw, Sn, w);
         two_stage_pitch_refinement(&model, Sw);
         estimate_amplitudes(&model, Sw, W);
         uq_Wo = model.Wo;
@@ -415,8 +426,15 @@ int main(int argc, char *argv[])
  
             /* find aks here, these are overwritten if LPC modelling is enabled */
  
-           for(i=0; i<M; i++)
-               Wn[i] = Sn[i]*w[i];
+           if (prede) {
+               for(i=0; i<M; i++)
+                   Wn[i] = Sn_pre[i]*w[i];
+           }
+           else {
+           
+               for(i=0; i<M; i++)
+                   Wn[i] = Sn[i]*w[i];
+           }
             autocorrelate(Wn,Rk,M,order);
             levinson_durbin(Rk,ak,order);
  
@@ -452,8 +470,11 @@ int main(int argc, char *argv[])
         \*------------------------------------------------------------*/
  
         if (lpc_model) {
-
-           e = speech_to_uq_lsps(lsps, ak, Sn, w, order);
+           
+           if (prede)
+               e = speech_to_uq_lsps(lsps, ak, Sn_pre, w, order);
+           else
+               e = speech_to_uq_lsps(lsps, ak, Sn, w, order);
  
              #ifdef DUMP
             dump_ak(ak, LPC_ORD);
@@ -521,6 +542,7 @@ int main(int argc, char *argv[])
                 /*  multi-stage VQ from Anssi Ramo OH3GDD */
  
                 lspanssi_quantise(lsps, lsps_, LPC_ORD, 5);
+               bw_expand_lsps(lsps_, LPC_ORD);                     
                 lsp_to_lpc(lsps_, ak, LPC_ORD);
             }
  
@@ -761,7 +783,7 @@ int main(int argc, char *argv[])
                                            order);      
                 if (postfilt)
                     postfilter(&interp_model, &bg_est);
-               synth_one_frame(fft_inv_cfg, buf, &interp_model, Sn_, Pn);
+               synth_one_frame(fft_inv_cfg, buf, &interp_model, Sn_, Pn, prede, &de_mem);
                 //printf("  buf[0] %d\n", buf[0]);
                 if (fout != NULL) 
                     fwrite(buf,sizeof(short),N,fout);
@@ -772,7 +794,7 @@ int main(int argc, char *argv[])
                     phase_synth_zero_order(fft_fwd_cfg, &model, ak, ex_phase, order);   
                 if (postfilt)
                     postfilter(&model, &bg_est);
-               synth_one_frame(fft_inv_cfg, buf, &model, Sn_, Pn);
+               synth_one_frame(fft_inv_cfg, buf, &model, Sn_, Pn, prede, &de_mem);
                 //printf("  buf[0] %d\n", buf[0]);
                 if (fout != NULL) 
                     fwrite(buf,sizeof(short),N,fout);
@@ -796,7 +818,7 @@ int main(int argc, char *argv[])
  
             if (postfilt)
                 postfilter(&model, &bg_est);
-           synth_one_frame(fft_inv_cfg, buf, &model, Sn_, Pn);
+           synth_one_frame(fft_inv_cfg, buf, &model, Sn_, Pn, prede, &de_mem);
             if (fout != NULL) fwrite(buf,sizeof(short),N,fout);
         }
  
@@ -839,11 +861,13 @@ int main(int argc, char *argv[])
      return 0;
  }
  
-void synth_one_frame(kiss_fft_cfg fft_inv_cfg, short buf[], MODEL *model, float Sn_[], float Pn[])
+void synth_one_frame(kiss_fft_cfg fft_inv_cfg, short buf[], MODEL *model, float Sn_[], float Pn[], int prede, float *de_mem)
  {
      int     i;
  
      synthesise(fft_inv_cfg, Sn_, model, Pn, 1);
+    if (prede)
+        de_emp(Sn_, Sn_, de_mem, N);   
  
      for(i=0; i<N; i++) {
         if (Sn_[i] > 32767.0)
diff --git a/codec2-dev/src/codec2.c b/codec2-dev/src/codec2.c

index 5fc2abd4fdbacad003461c145d1586063838ee81..7130d67b9a0de7e00705ce4a95aa92fc94ba4584 100644 (file)
--- a/codec2-dev/src/codec2.c
+++ b/codec2-dev/src/codec2.c
@@ -944,7 +944,7 @@ void analyse_one_frame(struct CODEC2 *c2, MODEL *model, short speech[])
  
      /* Estimate pitch */
  
-    nlp(c2->nlp,c2->Sn,N,M,P_MIN,P_MAX,&pitch,Sw, &c2->prev_Wo_enc);
+    nlp(c2->nlp,c2->Sn,N,M,P_MIN,P_MAX,&pitch,Sw, c2->W, &c2->prev_Wo_enc);
      model->Wo = TWO_PI/pitch;
      model->L = PI/model->Wo;
  
diff --git a/codec2-dev/src/nlp.c b/codec2-dev/src/nlp.c

index ef58c57e8dc7227f06d9bde5d0ade3aedc34d781..4b089610fdf4ef8c5a38d6a0d3aab7a53ecd7ed9 100644 (file)
--- a/codec2-dev/src/nlp.c
+++ b/codec2-dev/src/nlp.c
@@ -4,8 +4,8 @@
    AUTHOR......: David Rowe                                      
    DATE CREATED: 23/3/93                                    
                                                           
-  Non Linear Pitch (NLP) estimation functions.                   
-                                                               
+  Non Linear Pitch (NLP) estimation functions.
+
  \*---------------------------------------------------------------------------*/
  
  /*
@@ -117,7 +117,8 @@ typedef struct {
      kiss_fft_cfg  fft_cfg;           /* kiss FFT config              */
  } NLP;
  
-float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax);
+float test_candidate_mbe(COMP Sw[], COMP W[], float f0);
+float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COMP W[], float *prev_Wo);
  float post_process_sub_multiples(COMP Fw[], 
                                  int pmin, int pmax, float gmax, int gmax_bin,
                                  float *prev_Wo);
@@ -209,6 +210,7 @@ float nlp(
    int    pmax,                 /* maximum pitch value */
    float *pitch,                        /* estimated pitch period in samples */
    COMP   Sw[],                  /* Freq domain version of Sn[] */
+  COMP   W[],                   /* Freq domain window */
    float *prev_Wo
  )
  {
@@ -281,8 +283,12 @@ float nlp(
         }
      }
  
-    best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, 
-                                        prev_Wo);
+    //#define POST_PROCESS_MBE
+    #ifdef POST_PROCESS_MBE
+    best_f0 = post_process_mbe(Fw, pmin, pmax, gmax, Sw, W, prev_Wo);
+    #else
+    best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, prev_Wo);
+    #endif
  
      /* Shift samples in buffer to make room for new samples */
  
@@ -299,7 +305,7 @@ float nlp(
                                                                               
    post_process_sub_multiples() 
                                                                             
-  Given the global maximma of Fw[] we search interger submultiples for
+  Given the global maximma of Fw[] we search integer submultiples for
    local maxima.  If local maxima exist and they are above an
    experimentally derived threshold (OK a magic number I pulled out of
    the air) we choose the submultiple as the F0 estimate.
@@ -372,3 +378,158 @@ float post_process_sub_multiples(COMP Fw[],
      return best_f0;
  }
    
+/*---------------------------------------------------------------------------*\
+                                                                             
+  post_process_mbe() 
+                                                                           
+  Use the MBE pitch estimation algorithm to evaluate pitch candidates.  This
+  works OK but the accuracy at low F0 is affected by NW, the analysis window
+  size used for the DFT of the input speech Sw[].  Also favours high F0 in
+  the presence of background noise which causes periodic artifacts in the
+  synthesised speech.
+
+\*---------------------------------------------------------------------------*/
+
+float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COMP W[], float *prev_Wo)
+{
+  float candidate_f0;
+  float f0,best_f0;            /* fundamental frequency */
+  float e,e_min;                /* MBE cost function */
+  int   i;
+  float e_hz[F0_MAX];
+  int   bin;
+  float f0_min, f0_max;
+  float f0_start, f0_end;
+
+  f0_min = (float)SAMPLE_RATE/pmax;
+  f0_max = (float)SAMPLE_RATE/pmin;
+
+  /* Now look for local maxima.  Each local maxima is a candidate
+     that we test using the MBE pitch estimation algotithm */
+
+  for(i=0; i<F0_MAX; i++)
+      e_hz[i] = -1;
+  e_min = 1E32;
+  best_f0 = 50;
+  for(i=PE_FFT_SIZE*DEC/pmax; i<=PE_FFT_SIZE*DEC/pmin; i++) {
+    if ((Fw[i].real > Fw[i-1].real) && (Fw[i].real > Fw[i+1].real)) {
+
+       /* local maxima found, lets test if it's big enough */
+
+       if (Fw[i].real > T*gmax) {
+
+           /* OK, sample MBE cost function over +/- 10Hz range in 2.5Hz steps */
+
+           candidate_f0 = (float)i*SAMPLE_RATE/(PE_FFT_SIZE*DEC);
+           f0_start = candidate_f0-20;
+           f0_end = candidate_f0+20;
+           if (f0_start < f0_min) f0_start = f0_min;
+           if (f0_end > f0_max) f0_end = f0_max;
+
+           for(f0=f0_start; f0<=f0_end; f0+= 2.5) {
+               e = test_candidate_mbe(Sw, W, f0);
+               bin = floor(f0); assert((bin > 0) && (bin < F0_MAX));
+               e_hz[bin] = e;
+               if (e < e_min) {
+                   e_min = e;
+                   best_f0 = f0;
+               }
+           }
+
+       }
+    }
+  }
+
+  /* finally sample MBE cost function around previous pitch estimate
+     (form of pitch tracking) */
+
+  candidate_f0 = *prev_Wo * SAMPLE_RATE/TWO_PI;
+  f0_start = candidate_f0-20;
+  f0_end = candidate_f0+20;
+  if (f0_start < f0_min) f0_start = f0_min;
+  if (f0_end > f0_max) f0_end = f0_max;
+
+  for(f0=f0_start; f0<=f0_end; f0+= 2.5) {
+      e = test_candidate_mbe(Sw, W, f0);
+      bin = floor(f0); assert((bin > 0) && (bin < F0_MAX));
+      e_hz[bin] = e;
+      if (e < e_min) {
+         e_min = e;
+         best_f0 = f0;
+      }
+  }
+
+  #ifdef DUMP
+  dump_e(e_hz);
+  #endif
+
+  return best_f0;
+}
+
+/*---------------------------------------------------------------------------*\
+                                                                             
+  test_candidate_mbe()          
+                                                                             
+  Returns the error of the MBE cost function for the input f0.  
+
+  Note: I think a lot of the operations below can be simplified as
+  W[].imag = 0 and has been normalised such that den always equals 1.
+                                                                             
+\*---------------------------------------------------------------------------*/
+
+float test_candidate_mbe(
+    COMP  Sw[],
+    COMP  W[],
+    float f0
+)
+{
+    COMP  Sw_[FFT_ENC];   /* DFT of all voiced synthesised signal */
+    int   l,al,bl,m;      /* loop variables */
+    COMP  Am;             /* amplitude sample for this band */
+    int   offset;         /* centers Hw[] about current harmonic */
+    float den;            /* denominator of Am expression */
+    float error;          /* accumulated error between originl and synthesised */
+    float Wo;             /* current "test" fundamental freq. */
+    int   L;
+    
+    L = floor((SAMPLE_RATE/2.0)/f0);
+    Wo = f0*(2*PI/SAMPLE_RATE);
+
+    error = 0.0;
+
+    /* Just test across the harmonics in the first 1000 Hz (L/4) */
+
+    for(l=1; l<L/4; l++) {
+       Am.real = 0.0;
+       Am.imag = 0.0;
+       den = 0.0;
+       al = ceil((l - 0.5)*Wo*FFT_ENC/TWO_PI);
+       bl = ceil((l + 0.5)*Wo*FFT_ENC/TWO_PI);
+
+       /* Estimate amplitude of harmonic assuming harmonic is totally voiced */
+
+       for(m=al; m<bl; m++) {
+           offset = FFT_ENC/2 + m - l*Wo*FFT_ENC/TWO_PI + 0.5;
+           Am.real += Sw[m].real*W[offset].real + Sw[m].imag*W[offset].imag;
+           Am.imag += Sw[m].imag*W[offset].real - Sw[m].real*W[offset].imag;
+           den += W[offset].real*W[offset].real + W[offset].imag*W[offset].imag;
+        }
+
+        Am.real = Am.real/den;
+        Am.imag = Am.imag/den;
+
+        /* Determine error between estimated harmonic and original */
+
+        for(m=al; m<bl; m++) {
+           offset = FFT_ENC/2 + m - l*Wo*FFT_ENC/TWO_PI + 0.5;
+           Sw_[m].real = Am.real*W[offset].real - Am.imag*W[offset].imag;
+           Sw_[m].imag = Am.real*W[offset].imag + Am.imag*W[offset].real;
+           error += (Sw[m].real - Sw_[m].real)*(Sw[m].real - Sw_[m].real);
+           error += (Sw[m].imag - Sw_[m].imag)*(Sw[m].imag - Sw_[m].imag);
+       }
+    }
+
+    return error;
+}
+
+
diff --git a/codec2-dev/src/nlp.h b/codec2-dev/src/nlp.h

index 88a3733dc249e86b228ae66b425916cb08204e1a..38265959024210e38e332afc315a20ae7cf8873c 100644 (file)
--- a/codec2-dev/src/nlp.h
+++ b/codec2-dev/src/nlp.h
@@ -33,7 +33,6 @@
  void *nlp_create();
  void nlp_destroy(void *nlp_state);
  float nlp(void *nlp_state, float Sn[], int n, int m, int pmin, int pmax, 
-         float *pitch, COMP Sw[], float *prev_Wo);
-float test_candidate_mbe(COMP Sw[], float f0, COMP Sw_[]);
+         float *pitch, COMP Sw[], COMP W[], float *prev_Wo);
  
  #endif
author	drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
	Mon, 15 Oct 2012 20:44:16 +0000 (20:44 +0000)
committer	drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
	Mon, 15 Oct 2012 20:44:16 +0000 (20:44 +0000)
codec2-dev/src/c2sim.c		patch \| blob \| history
codec2-dev/src/codec2.c		patch \| blob \| history
codec2-dev/src/nlp.c		patch \| blob \| history
codec2-dev/src/nlp.h		patch \| blob \| history