added some post processing to voicing estimator, which has reduced some of the voicin...

author drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>

Fri, 12 Nov 2010 07:49:46 +0000 (07:49 +0000)

committer drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>

Fri, 12 Nov 2010 07:49:46 +0000 (07:49 +0000)
author drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
Fri, 12 Nov 2010 07:49:46 +0000 (07:49 +0000)
committer drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
Fri, 12 Nov 2010 07:49:46 +0000 (07:49 +0000)
diff --git a/codec2/src/c2sim.c b/codec2/src/c2sim.c

index 950d1061bc67add20e054ddfe0e96a536d3b827d..4e54aaff4b24137a6951e093295fc4e5a8ceb542 100644 (file)
--- a/codec2/src/c2sim.c
+++ b/codec2/src/c2sim.c
@@ -145,7 +145,7 @@ int main(int argc, char *argv[])
    for(i=0; i<LPC_ORD; i++) {
        prev_lsps[i] = i*PI/(LPC_ORD+1);
    }
-  prev_e = 1;
+  e = prev_e = 1;
  
    nlp_states = nlp_create();
  
@@ -233,7 +233,8 @@ int main(int argc, char *argv[])
    sum_snr = 0;
    while(fread(buf,sizeof(short),N,fin)) {
      frames++;
-    
+    printf("frame: %d", frames);
+
      /* Read input speech */
  
      for(i=0; i<M-N; i++)
@@ -244,7 +245,6 @@ int main(int argc, char *argv[])
      /* Estimate pitch */
  
      nlp(nlp_states,Sn,N,M,P_MIN,P_MAX,&pitch,Sw,&prev_Wo);
-    prev_Wo = TWO_PI/pitch;
      model.Wo = TWO_PI/pitch;
  
      /* estimate model parameters */
@@ -282,7 +282,7 @@ int main(int argc, char *argv[])
         
         /* determine voicing */
  
-       snr = est_voicing_mbe(&model, Sw, W, Sw_, Ew);
+       snr = est_voicing_mbe(&model, Sw, W, Sw_, Ew, prev_Wo);
  #ifdef DUMP
         dump_Sw_(Sw_);
         dump_Ew(Ew);
@@ -383,6 +383,7 @@ int main(int argc, char *argv[])
         synth_one_frame(buf, &model, Sn_, Pn);
         if (fout != NULL) fwrite(buf,sizeof(short),N,fout);
      }
+    prev_Wo = TWO_PI/pitch;
    }
    fclose(fin);
  
diff --git a/codec2/src/codec2.c b/codec2/src/codec2.c

index e23da2cba75697d844ab1434ec899aedfeea9b5d..03a28ed0606a3c25c41a4d8eec20cd4328fb8e27 100644 (file)
--- a/codec2/src/codec2.c
+++ b/codec2/src/codec2.c
@@ -329,7 +329,6 @@ void analyse_one_frame(CODEC2 *c2, MODEL *model, short speech[])
      /* Estimate pitch */
  
      nlp(c2->nlp,c2->Sn,N,M,P_MIN,P_MAX,&pitch,Sw,&c2->prev_Wo);
-    c2->prev_Wo = TWO_PI/pitch;
      model->Wo = TWO_PI/pitch;
      model->L = PI/model->Wo;
  
@@ -338,5 +337,7 @@ void analyse_one_frame(CODEC2 *c2, MODEL *model, short speech[])
      dft_speech(Sw, c2->Sn, c2->w); 
      two_stage_pitch_refinement(model, Sw);
      estimate_amplitudes(model, Sw, c2->W);
-    est_voicing_mbe(model, Sw, c2->W, Sw_, Ew);
+    est_voicing_mbe(model, Sw, c2->W, Sw_, Ew, c2->prev_Wo);
+
+    c2->prev_Wo = model->Wo;
  }
diff --git a/codec2/src/sine.c b/codec2/src/sine.c

index 9a07a4c4e4e74033f52f09c953b6f57e360915d0..f58882eebcf77328b5bb3706c109d72d4a6d7072 100644 (file)
--- a/codec2/src/sine.c
+++ b/codec2/src/sine.c
@@ -363,8 +363,8 @@ float est_voicing_mbe(
      COMP   W[],
      COMP   Sw_[],         /* DFT of all voiced synthesised signal  */
                            /* useful for debugging/dump file        */
-    COMP   Ew[]           /* DFT of error                        */
-)
+    COMP   Ew[],          /* DFT of error                        */
+    float prev_Wo)
  {
      int   i,l,al,bl,m;    /* loop variables */
      COMP  Am;             /* amplitude sample for this band */
@@ -373,6 +373,8 @@ float est_voicing_mbe(
      float error;          /* accumulated error between originl and synthesised */
      float Wo;            
      float sig, snr;
+    float elow, ehigh, eratio;
+    float dF0, sixty;
  
      sig = 0.0;
      for(l=1; l<=model->L/4; l++) {
@@ -427,7 +429,58 @@ float est_voicing_mbe(
         model->voiced = 1;
      else
         model->voiced = 0;
-   
+ 
+    /* post processing, helps clean up some voicing errors ---------------------*/
+
+    /* 
+       Determine the ratio of low frequency to high frequency energy,
+       voiced speech tends to be dominated by low frequency energy,
+       unvoiced by high frequency. This measure can be used to
+       determine if we have made any gross errors.
+    */
+
+    elow = ehigh = 0.0;
+    for(l=1; l<=model->L/2; l++) {
+       elow += model->A[l]*model->A[l];
+    }
+    for(l=model->L/2; l<=model->L; l++) {
+       ehigh += model->A[l]*model->A[l];
+    }
+    eratio = 10.0*log10(elow/ehigh);
+    dF0 = 0.0;
+
+    /* Look for Type 1 errors, strongly V speech that has been
+       accidentally declared UV */
+
+    if (model->voiced == 0)
+       if (eratio > 10.0)
+           model->voiced = 1;
+
+    /* Look for Type 2 errors, strongly UV speech that has been
+       accidentally declared V */
+
+    if (model->voiced == 1) {
+       if (eratio < -10.0)
+           model->voiced = 0;
+
+       /* If pitch is jumping about it's likely this is UV */
+
+       dF0 = (model->Wo - prev_Wo)*FS/TWO_PI;
+       if (fabs(dF0) > 15.0) 
+           model->voiced = 0;
+
+       /* A common source of Type 2 errors is the pitch estimator
+          gives a low (50Hz) estimate for UV speech, which gives a
+          good match with noise due to the close harmonic spacing.
+          These errors are much more common than people with 50Hz
+          pitch, so we have just a small eratio threshold. */
+
+       sixty = 60.0*TWO_PI/FS;
+       if ((eratio < -4.0) && (model->Wo <= sixty))
+           model->voiced = 0;
+    }
+    printf(" v: %d snr: %f eratio: %3.2f %f\n", model->voiced, snr, eratio, dF0);
+
      return snr;
  }
  
diff --git a/codec2/src/sine.h b/codec2/src/sine.h

index 73d928fd0498ef484e3fa389f18d21f2c2d4ac0e..88eee37f7fa21efa59016d4a0532b1434dca630b 100644 (file)
--- a/codec2/src/sine.h
+++ b/codec2/src/sine.h
@@ -35,7 +35,7 @@ void make_analysis_window(float w[], COMP W[]);
  void dft_speech(COMP Sw[], float Sn[], float w[]);
  void two_stage_pitch_refinement(MODEL *model, COMP Sw[]);
  void estimate_amplitudes(MODEL *model, COMP Sw[], COMP W[]);
-float est_voicing_mbe(MODEL *model, COMP Sw[], COMP W[], COMP Sw_[],COMP Ew[]);
+float est_voicing_mbe(MODEL *model, COMP Sw[], COMP W[], COMP Sw_[],COMP Ew[], float prev_Wo);
  void make_synthesis_window(float Pn[]);
  void synthesise(float Sn_[], MODEL *model, float Pn[], int shift);
author	drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
	Fri, 12 Nov 2010 07:49:46 +0000 (07:49 +0000)
committer	drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
	Fri, 12 Nov 2010 07:49:46 +0000 (07:49 +0000)
codec2/src/c2sim.c		patch \| blob \| history
codec2/src/codec2.c		patch \| blob \| history
codec2/src/sine.c		patch \| blob \| history
codec2/src/sine.h		patch \| blob \| history