From: drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
Date: Fri, 9 Oct 2009 01:48:48 +0000 (+0000)
Subject: first working 10 to 20ms decimation algorithm, sounds quite close on _uq speech sampl... 
X-Git-Url: http://git.whiteaudio.com/gitweb/?a=commitdiff_plain;h=8cc4b12f7148026478b3d96c1c42906cf51ed064;p=freetel-svn-tracking.git

first working 10 to 20ms decimation algorithm, sounds quite close on _uq speech samples.  About to clean up

git-svn-id: https://svn.code.sf.net/p/freetel/code@74 01035d8c-6547-0410-b346-abe4f91aad63
---

diff --git a/codec2/src/defines.h b/codec2/src/defines.h
index e9702f05..eb8b8287 100644
--- a/codec2/src/defines.h
+++ b/codec2/src/defines.h
@@ -56,7 +56,7 @@
 
 /* Decoder defines */
 
-#define AW_DEC 160		/* number of samples in synthesis window */
+#define AW_DEC  160		/* number of samples in synthesis window */
 #define FFT_DEC 512	    	/* number of points in DFT */
 #define TW 40			/* Trapezoidal synthesis window overlap */
 #define MAX_STR 256
diff --git a/codec2/src/sinedec.c b/codec2/src/sinedec.c
index 0b7d83ba..815f6984 100644
--- a/codec2/src/sinedec.c
+++ b/codec2/src/sinedec.c
@@ -88,13 +88,28 @@ int main(int argc, char *argv[])
   
   int phase, phase_model;
   float ex_phase[1];
+  int voiced, voiced_1, voiced_2, voiced_synth;
 
   int   postfilt;
   float bg_est;
 
   int   hand_snr;
   FILE *fsnr;
-  
+
+  MODEL model_1, model_2, model_3, model_synth, model_a, model_b;
+  int transition;
+
+  int vf=0;
+
+  voiced_1 = voiced_2 = 0;
+  model_1.Wo = TWO_PI/P_MIN;
+  model_1.L = floor(PI/model_1.Wo);
+  for(i=1; i<=model_1.L; i++) {
+      model_1.A[i] = 0.0;
+      model_1.phi[i] = 0.0;
+  }
+  model_synth = model_3 = model_2 = model_1;
+
   if (argc < 3) {
     printf("usage: sinedec InputFile ModelFile [-o OutputFile] [-o lpc Order]\n");
     printf("       [--dump DumpFilePrefix]\n");
@@ -176,6 +191,8 @@ int main(int argc, char *argv[])
   bg_est = 0.0;
   postfilt = switch_present("--postfilter",argc,argv);
 
+  transition = 0;
+
   /* Initialise ------------------------------------------------------------*/
 
   init_decoder();
@@ -201,7 +218,7 @@ int main(int argc, char *argv[])
     dump_Sn(Sn);
     dft_speech(); dump_Sw(Sw);   
 
-    dump_model(&model);
+    //dump_model(&model);
 
     /* optional phase modelling - make sure this happens before LPC
        modelling of {Am} as first order model fit doesn't work well
@@ -234,7 +251,7 @@ int main(int argc, char *argv[])
 	    assert(order == PHASE_LPC_ORD);
 
 	dump_ak(ak_phase, PHASE_LPC_ORD);
-	snr = phase_model_first_order(ak_phase, H, &n_min, &min_Am);
+	snr = phase_model_first_order(ak_phase, H, &n_min, &min_Am, &voiced);
 
 	dump_snr(snr);
 	if (phase_model == 0) {
@@ -243,15 +260,15 @@ int main(int argc, char *argv[])
 	    	model.phi[i] = 0;
 	    if (hand_snr)
 		fscanf(fsnr,"%f\n",&snr);
-	    phase_synth_zero_order(snr, H, ex_phase);
+	    phase_synth_zero_order(voiced, H, ex_phase, voiced);
 	}
 
 	if (phase_model == 1) {
-	    phase_synth_first_order(snr, H, n_min, min_Am);
+	    phase_synth_first_order(voiced, H, n_min, min_Am, voiced);
         }
 
         if (postfilt)
-	    postfilter(&model, snr>2.0, &bg_est);
+	    postfilter(&model, voiced, &bg_est);
 
         dump_phase_(&model.phi[0]);
     }
@@ -264,11 +281,130 @@ int main(int argc, char *argv[])
         dump_quantised_model(&model);
     }
 
+ #define DEC
+ #ifdef DEC
+   /* Decimate to 20ms frame rate.  In the code we only send
+      odd frames to the receiver.  To simulate this on odd
+      frames the model parameters pass straight thru.  On even
+      frames we interpolate from adjacent odd frames.  A one
+      frame delay is required for the odd frames.
+   */
+
+    /* 
+       frames  Transmitted to Rx  Decimator output
+ 
+         0     n                  0.5*model(-3) + 0.5*model(-1)
+         1     y                  model(-1)
+         2     n                  0.5*model(-1) + 0.5*model(1)
+         3     y                  model(1)
+ 	 4     n                  0.5*model(1) + 0.5*model(3)
+	 5     y                  model(3)
+    */
+
+    /* 
+       TODO: 
+       [ ] Voicing decision
+       [ ] unvoiced
+           [ ] amplitudes
+           [ ] phases
+           [ ] Wo
+       [ ] unvoiced
+           [ ] amplitudes
+           [ ] phases
+               + OK to run zero phase model on 10ms rate, using info
+                 from adjacent 20 ms frames
+           [ ] Wo
+    */
+
+    dump_model(&model_2);
+
+    if (frames%2) {
+
+	/* odd frames use the original model parameters */
+
+	model_synth = model_2;
+	transition = 0;
+
+    }
+    else {
+	/* even frame so we need to synthesise the model parameters by
+	   interpolating between adjacent frames */
+
+	model_synth = model_2;
+        voiced_synth = voiced && voiced_2;
+	if (fabs(model_1.Wo - model_3.Wo) < 0.1*model_1.Wo) {
+	    /* If the Wo of adjacent frames is within 10% we synthesise a 
+	       continuous track through this frame by linear interpolation
+	       of the amplitudes and Wo.  This is typical of a strongly 
+	       voiced frame.
+	    */
+	    transition = 0;
+
+	    /* continuous track through this frame */
+	    #define T
+	    #ifdef T
+		model_synth.Wo = (model_1.Wo + model_3.Wo)/2.0;
+		if (model_1.L > model_3.L)
+		    model_synth.L = model_3.L;
+		else
+		    model_synth.L = model_1.L;
+	    #endif
+		for(i=1; i<=model_synth.L; i++) {
+		    model_synth.A[i] = (model_3.A[i] + model_1.A[i])/2.0;
+		    /* cheat on phases for now, these were constructed using
+		       LPC model from actual speech for this frame - fix later */
+		    model_synth.phi[i] = model_2.phi[i];
+		}
+		vf++;
+	}
+	else {
+	    /* 
+	       transition frame, adjacent frames have different Wo and
+	       L so set up two sets of model parameters based on
+	       previous and next frame.  We then synthesise both of
+	       them and add them together in the time domain.  Note
+	       the adjustments to phase to shift the timing of the
+	       model parameters forward or back N samples.  
+
+	       This case is typical of unvoiced speech or background noise
+	       or a voiced to unvoiced transition.
+	    */
+
+	    transition = 1;
+
+	    memcpy(&model_a, &model_3, sizeof(model));
+	    memcpy(&model_b, &model_1, sizeof(model));
+	    for(i=1; i<=model_a.L; i++) {
+		model_a.A[i] /= 2.0;
+		model_a.phi[i] += model_a.Wo*i*N;
+	    }
+	    for(i=1; i<=model_b.L; i++) {
+		model_b.A[i] /= 2.0;
+		model_b.phi[i] -= model_b.Wo*i*N;
+	    }
+	}
+    }
+
+    voiced_2 = voiced_1;
+    voiced_1 = voiced;
+
+    model_3 = model_2;
+    model_2 = model_1;
+    model_1 = model;
+    model = model_synth;
+#endif
+
     /* Synthesise speech */
 
     if (fout != NULL) {
 
-	synthesise_mixed(Pn,&model,Sn_);
+	if (transition) {
+	    synthesise_mixed(Pn,&model_a,Sn_,1);
+	    synthesise_mixed(Pn,&model_b,Sn_,0);
+	}
+	else {
+	    synthesise_mixed(Pn,&model,Sn_,1);
+	}
 
 	/* Save output speech to disk */
 
@@ -285,6 +421,7 @@ int main(int argc, char *argv[])
   }
 
   //printf("gmin = %f\n", get_gmin());
+  printf("vf = %d\n", vf);
   if (fout != NULL)
     fclose(fout);
 
diff --git a/codec2/src/synth.c b/codec2/src/synth.c
index 9f2d8c66..105acbf1 100644
--- a/codec2/src/synth.c
+++ b/codec2/src/synth.c
@@ -30,20 +30,23 @@
 #include "sine.h"
 
 void synthesise_mixed(
-  float Pn[],		/* time domain Parzen window */
+  float   Pn[],		/* time domain Parzen window */
   MODEL *model,		/* ptr to model parameters for this frame */
-  float Sn_[]		/* time domain synthesised signal */
+  float  Sn_[],		/* time domain synthesised signal */
+  int    shift          /* if non-zero update memories */
 )
 {
   int i,l,j,b;	        /* loop variables */
   COMP Sw_[FFT_DEC];	/* DFT of synthesised signal */
 
-  /* Update memories */
+  if (shift) {
+      /* Update memories */
 
-  for(i=0; i<N-1; i++) {
-    Sn_[i] = Sn_[i+N];
+      for(i=0; i<N-1; i++) {
+	  Sn_[i] = Sn_[i+N];
+      }
+      Sn_[N-1] = 0.0;
   }
-  Sn_[N-1] = 0.0;
 
   for(i=0; i<FFT_DEC; i++) {
     Sw_[i].real = 0.0;
@@ -67,10 +70,16 @@ void synthesise_mixed(
   /* Overlap add to previous samples */
 
   for(i=0; i<N-1; i++) {
-    Sn_[i] += Sw_[FFT_DEC-N+1+i].real*Pn[i];
+      Sn_[i] += Sw_[FFT_DEC-N+1+i].real*Pn[i];
   }
-  for(i=N-1,j=0; i<2*N; i++,j++)
-    Sn_[i] = Sw_[j].real*Pn[i];
+
+  if (shift)
+      for(i=N-1,j=0; i<2*N; i++,j++)
+	  Sn_[i] = Sw_[j].real*Pn[i];
+  else
+      for(i=N-1,j=0; i<2*N; i++,j++)
+	  Sn_[i] += Sw_[j].real*Pn[i];
+
 }
 
 /*---------------------------------------------------------------------------*\
@@ -98,7 +107,7 @@ void synthesise_mixed(
 
   Result: when tested was no difference in output speech quality.  The
   partial unvoiced sound when using zero phase model was found to be
-  due mis-laignment of teh LPC analysis window and accidental addition
+  due mis-alignment of the LPC analysis window and accidental addition
   of a random phase component.  So we are sticking with synthesise_mixed()
   above for now.
 
diff --git a/codec2/src/synth.h b/codec2/src/synth.h
index 7651d603..0ab8fb28 100644
--- a/codec2/src/synth.h
+++ b/codec2/src/synth.h
@@ -32,7 +32,7 @@
 
 #include "sine.h"
 
-void synthesise_mixed(float Pn[], MODEL *model, float Sn_[]);
+void synthesise_mixed(float Pn[], MODEL *model, float Sn_[], int shift);
 void synthesise_continuous_phase(float Pn[], MODEL *model, float Sn_[], 
 				 int voiced, float *Wo_prev, float phi_prev[]);
 #endif