From ebd97e824de24a1f7b25f386a7a98f73a927697c Mon Sep 17 00:00:00 2001
From: drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
Date: Fri, 12 Nov 2010 07:49:46 +0000
Subject: [PATCH] added some post processing to voicing estimator, which has
 reduced some of the voicing errors and clicky artefacts

git-svn-id: https://svn.code.sf.net/p/freetel/code@220 01035d8c-6547-0410-b346-abe4f91aad63
---
 codec2/src/c2sim.c  |  9 ++++---
 codec2/src/codec2.c |  5 ++--
 codec2/src/sine.c   | 59 ++++++++++++++++++++++++++++++++++++++++++---
 codec2/src/sine.h   |  2 +-
 4 files changed, 65 insertions(+), 10 deletions(-)

diff --git a/codec2/src/c2sim.c b/codec2/src/c2sim.c
index 950d1061..4e54aaff 100644
--- a/codec2/src/c2sim.c
+++ b/codec2/src/c2sim.c
@@ -145,7 +145,7 @@ int main(int argc, char *argv[])
   for(i=0; i<LPC_ORD; i++) {
       prev_lsps[i] = i*PI/(LPC_ORD+1);
   }
-  prev_e = 1;
+  e = prev_e = 1;
 
   nlp_states = nlp_create();
 
@@ -233,7 +233,8 @@ int main(int argc, char *argv[])
   sum_snr = 0;
   while(fread(buf,sizeof(short),N,fin)) {
     frames++;
-    
+    printf("frame: %d", frames);
+
     /* Read input speech */
 
     for(i=0; i<M-N; i++)
@@ -244,7 +245,6 @@ int main(int argc, char *argv[])
     /* Estimate pitch */
 
     nlp(nlp_states,Sn,N,M,P_MIN,P_MAX,&pitch,Sw,&prev_Wo);
-    prev_Wo = TWO_PI/pitch;
     model.Wo = TWO_PI/pitch;
 
     /* estimate model parameters */
@@ -282,7 +282,7 @@ int main(int argc, char *argv[])
 	
 	/* determine voicing */
 
-	snr = est_voicing_mbe(&model, Sw, W, Sw_, Ew);
+	snr = est_voicing_mbe(&model, Sw, W, Sw_, Ew, prev_Wo);
 #ifdef DUMP
 	dump_Sw_(Sw_);
 	dump_Ew(Ew);
@@ -383,6 +383,7 @@ int main(int argc, char *argv[])
 	synth_one_frame(buf, &model, Sn_, Pn);
 	if (fout != NULL) fwrite(buf,sizeof(short),N,fout);
     }
+    prev_Wo = TWO_PI/pitch;
   }
   fclose(fin);
 
diff --git a/codec2/src/codec2.c b/codec2/src/codec2.c
index e23da2cb..03a28ed0 100644
--- a/codec2/src/codec2.c
+++ b/codec2/src/codec2.c
@@ -329,7 +329,6 @@ void analyse_one_frame(CODEC2 *c2, MODEL *model, short speech[])
     /* Estimate pitch */
 
     nlp(c2->nlp,c2->Sn,N,M,P_MIN,P_MAX,&pitch,Sw,&c2->prev_Wo);
-    c2->prev_Wo = TWO_PI/pitch;
     model->Wo = TWO_PI/pitch;
     model->L = PI/model->Wo;
 
@@ -338,5 +337,7 @@ void analyse_one_frame(CODEC2 *c2, MODEL *model, short speech[])
     dft_speech(Sw, c2->Sn, c2->w); 
     two_stage_pitch_refinement(model, Sw);
     estimate_amplitudes(model, Sw, c2->W);
-    est_voicing_mbe(model, Sw, c2->W, Sw_, Ew);
+    est_voicing_mbe(model, Sw, c2->W, Sw_, Ew, c2->prev_Wo);
+
+    c2->prev_Wo = model->Wo;
 }
diff --git a/codec2/src/sine.c b/codec2/src/sine.c
index 9a07a4c4..f58882ee 100644
--- a/codec2/src/sine.c
+++ b/codec2/src/sine.c
@@ -363,8 +363,8 @@ float est_voicing_mbe(
     COMP   W[],
     COMP   Sw_[],         /* DFT of all voiced synthesised signal  */
                           /* useful for debugging/dump file        */
-    COMP   Ew[]           /* DFT of error                        */
-)
+    COMP   Ew[],          /* DFT of error                        */
+    float prev_Wo)
 {
     int   i,l,al,bl,m;    /* loop variables */
     COMP  Am;             /* amplitude sample for this band */
@@ -373,6 +373,8 @@ float est_voicing_mbe(
     float error;          /* accumulated error between originl and synthesised */
     float Wo;            
     float sig, snr;
+    float elow, ehigh, eratio;
+    float dF0, sixty;
 
     sig = 0.0;
     for(l=1; l<=model->L/4; l++) {
@@ -427,7 +429,58 @@ float est_voicing_mbe(
 	model->voiced = 1;
     else
 	model->voiced = 0;
-   
+ 
+    /* post processing, helps clean up some voicing errors ---------------------*/
+
+    /* 
+       Determine the ratio of low frequency to high frequency energy,
+       voiced speech tends to be dominated by low frequency energy,
+       unvoiced by high frequency. This measure can be used to
+       determine if we have made any gross errors.
+    */
+
+    elow = ehigh = 0.0;
+    for(l=1; l<=model->L/2; l++) {
+	elow += model->A[l]*model->A[l];
+    }
+    for(l=model->L/2; l<=model->L; l++) {
+	ehigh += model->A[l]*model->A[l];
+    }
+    eratio = 10.0*log10(elow/ehigh);
+    dF0 = 0.0;
+
+    /* Look for Type 1 errors, strongly V speech that has been
+       accidentally declared UV */
+
+    if (model->voiced == 0)
+	if (eratio > 10.0)
+	    model->voiced = 1;
+
+    /* Look for Type 2 errors, strongly UV speech that has been
+       accidentally declared V */
+
+    if (model->voiced == 1) {
+	if (eratio < -10.0)
+	    model->voiced = 0;
+
+	/* If pitch is jumping about it's likely this is UV */
+
+	dF0 = (model->Wo - prev_Wo)*FS/TWO_PI;
+	if (fabs(dF0) > 15.0) 
+	    model->voiced = 0;
+
+	/* A common source of Type 2 errors is that the pitch estimator
+	   gives a low (50Hz) estimate for UV speech, which gives a
+	   good match with noise due to the close harmonic spacing.
+	   These errors are much more common than people with 50Hz
+	   pitch, so we have just a small eratio threshold. */
+
+	sixty = 60.0*TWO_PI/FS;
+	if ((eratio < -4.0) && (model->Wo <= sixty))
+	    model->voiced = 0;
+    }
+    printf(" v: %d snr: %f eratio: %3.2f %f\n", model->voiced, snr, eratio, dF0);
+
     return snr;
 }
 
diff --git a/codec2/src/sine.h b/codec2/src/sine.h
index 73d928fd..88eee37f 100644
--- a/codec2/src/sine.h
+++ b/codec2/src/sine.h
@@ -35,7 +35,7 @@ void make_analysis_window(float w[], COMP W[]);
 void dft_speech(COMP Sw[], float Sn[], float w[]);
 void two_stage_pitch_refinement(MODEL *model, COMP Sw[]);
 void estimate_amplitudes(MODEL *model, COMP Sw[], COMP W[]);
-float est_voicing_mbe(MODEL *model, COMP Sw[], COMP W[], COMP Sw_[],COMP Ew[]);
+float est_voicing_mbe(MODEL *model, COMP Sw[], COMP W[], COMP Sw_[],COMP Ew[], float prev_Wo);
 void make_synthesis_window(float Pn[]);
 void synthesise(float Sn_[], MODEL *model, float Pn[], int shift);
 
-- 
2.25.1