From c1043232799b658e8d8411beadd3db3053f87042 Mon Sep 17 00:00:00 2001
From: drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
Date: Fri, 11 Sep 2009 00:49:35 +0000
Subject: [PATCH] fixed window alignment issues which improved  LPC modelling
 for Ams and phase models

git-svn-id: https://svn.code.sf.net/p/freetel/code@55 01035d8c-6547-0410-b346-abe4f91aad63
---
 codec2/README.txt     | 125 ++++++++++++++++++++++++++++++++----------
 codec2/TODO.txt       |  64 +++++++++++----------
 codec2/src/defines.h  |   3 +-
 codec2/src/globals.c  |   2 +-
 codec2/src/initenc.c  |  86 +++++++++++++++++++++++------
 codec2/src/listen.sh  |   4 +-
 codec2/src/nlp.c      |   2 +-
 codec2/src/quantise.c |   6 +-
 codec2/src/refine.c   |   4 +-
 codec2/src/sinedec.c  |  10 ++--
 10 files changed, 215 insertions(+), 91 deletions(-)

diff --git a/codec2/README.txt b/codec2/README.txt
index 7a86c5df..3dbdb084 100644
--- a/codec2/README.txt
+++ b/codec2/README.txt
@@ -10,9 +10,9 @@ Introduction
 Codec2 is a open source low bit rate speech codec designed for
 communications quality speech at around 2400 kbit/s.  Applications
 include low bandwidth HF/VHF digital radio.  It fills a gap in open
-source, free as in speech voice codecs beneath 5000 bit/s.
+source, free-as-in-speech voice codecs beneath 5000 bit/s.
 
-The motivations behind the project are summarised on this
+The motivations behind the project are summarised in this
 link:/blog/?p=128[blog post].
 
 [[status]]
@@ -77,6 +77,8 @@ Development Roadmap
       [ ] Frame rate/quantisation schemes for 2400 bit/s developed
       [ ] Refactor to develop a encoder/decoder functions
       [ ] Test phone call over LAN
+      [ ] Fixed point port
+      [ ] codec2-on-a-chip embedded DSP/CPU port
 
 [[howitworks]]
 How it Works
@@ -113,7 +115,7 @@ update rate).
 
 The speech quality of the basic harmonic sinusoidal model is pretty
 good, close to transparent.  It is also relatively robust to Wo
-estimation errors.  Unvoiced Speech (e.g. consonants) are well
+estimation errors.  Unvoiced speech (e.g. consonants) are well
 modelled by a bunch of harmonics with random phases.  Speech corrupted
 with background noise also sounds OK, the background noise doesn't
 introduce any grossly unpleasant artifacts.
@@ -143,9 +145,10 @@ The tough bits of this project are:
    unvoiced (constants).  The voicing model needs to be accurate (not
    introduce distortion), and relatively low bit rate.
 
-4. Quantisation of the amplitudes { A } to a small number of bits while
-   maintaining speech quality.  For example 30 bits/frame is 1500
-   bits/s, a large part of our 2400 bit/s "budget".
+4. Quantisation of the amplitudes { A } to a small number of bits
+   while maintaining speech quality.  For example 30 bits/frame at a
+   20ms frame rate is 30/0.02 = 1500 bits/s, a large part of our 2400
+   bit/s "budget".
 
 5. Performance with different speakers and background noise
    conditions.  This is where you come in - as codec2 develops please
@@ -154,6 +157,20 @@ The tough bits of this project are:
    algorithm.  This approach proved very powerful when developing
    link:oslec.html[Oslec].  One of the cool things about open source!
 
+[[help]]
+Can I help?
+-----------
+
+Maybe, check out the latest version of the
+http://freetel.svn.sourceforge.net/viewvc/freetel/codec2/TODO.txt?view=log[TODO]
+list and the development roadmap above and see if there is anything
+that interests you.
+
+I will happily accept sponsorship for this project.  For example
+research grants, or development contracts for companies interested in
+seeing an open source low bit rate speech codec.  One interesting
+project would be funding a real time port to a single DSP/CPU chip.
+
 [[patents]]
 Is it Patent Free?
 ------------------
@@ -167,11 +184,11 @@ compare.
 Proprietary codecs typically have small, novel parts of the algorithm
 protected by patents.  However the designers of these codecs rely
 heavily on large bodies of existing, public domain work.  The patents
-cover perhaps 5% of the codec algorithms.  The designers of
-proprietary codecs did not invent most of the algorithms they use in
-their codec. Typically, the patents just cover enough to make
-designing an interoperable codec very difficult.  These also tend to
-be the parts that make their codecs sound good.
+cover perhaps 5% of the codec algorithms.  Proprietary codec designers
+did not invent most of the algorithms they use in their
+codec. Typically, the patents just cover enough to make designing an
+interoperable codec very difficult.  These also tend to be the parts
+that make their codecs sound good.
 
 However there are many ways to make a codec sound good, so we simply
 need to choose and develop other methods.
@@ -195,21 +212,36 @@ recovered via RMS method (Section 5.1 of thesis).  Equal area model of
 LPC spectra versus harmonic seems to work remarkably well, especially
 compared to sampling.  SNRs up to 30dB on female frames.
 
-m=1 harmonic problem for males when LPC modelled. The amplitude of
-this harmonic is raised by as much as 30dB after LPC modelling as (I
-think) LPC spectra must have zero derivative at DC.  This means it's
-poor at modelling very low freq harmonics which unfortunately the ear
-is very sensitive to.  Consider automatic lowering for 20dB of this
-harmonic or maybe a few extra bits to quantise error.
+There is a problem with modelling the low order (e.g. m=1,
+i.e. fundamental) harmonics for males. The amplitude of the m=1
+harmonic is raised by as much as 30dB after LPC modelling as (I think)
+LPC spectra must have zero derivative at DC.  This means it's poor at
+modelling very low freq harmonics which unfortunately the ear is very
+sensitive to.  Consider automatic lowering for 20dB of this harmonic
+or maybe a few extra bits to quantise error.  Or maybe just don't
+synthesise anything beneath 200Hz.
 
 [[phase]]
 Phase Modelling Notes
 ---------------------
 
-Phase modelling makes no attempt to match harmonic phases at frame
-edges.  This area would be worth experimenting with, as it could cause
-roughness.  Then again it might be responsible for effective mixed
-voicing modelling.
+"Zero order" and "First order" phase models have been developed to
+synthesise phases of each harmonic at the decoder side.  These are
+described in source code of
+http://freetel.svn.sourceforge.net/viewvc/freetel/codec2/src/phase.c?view=log[phase.c].
+
+The zero phase model required just one voicing bit to be transmitted
+to the decoder, all other phase information is synthesised use a rule
+based model.  It seems to work OK for most speech samples.
+
+To determine voicing we attempt to fit a first order phase model, then
+measure SNR of the fit.  The frame is declared unvoiced if the SNR is
+beneath a threshold.
+
+Current phase modelling makes no attempt to match harmonic phases at
+frame edges.  This area would be worth experimenting with, as it could
+cause roughness.  Then again it might be responsible for effective
+mixed voicing modelling.
 
 Unvoiced speech can be represented well by random phases and a Wo
 estimate that jumps around randomly.  If Wo is small the number of
@@ -229,28 +261,38 @@ Zero Phase model (just one voicing bit/frame):
 
   $ ./sinedec ../raw/hts1a.raw hts1a.mdl --phase 0 - hts1a_phase0.raw
 
-First order Phase model (around 12 bits/frame bits for pulse position
-and phase):
+First order Phase model (when quantised around 13 bits/frame bits for
+pulse position, phase, and a voicing bit):
 
   $ ./sinedec ../raw/hts1a.raw hts1a.mdl --phase 1 - hts1a_phase1.raw
 
-
 [[octave]]
 Octave Scripts 
 --------------
 
-pl.m    - plot a segment from a raw file
+* pl.m    - plot a segment from a raw file
 
-pl2.m   - plot the same segments from two different files to compare
+* pl2.m   - plot the same segments from two different files to compare
 
-plamp.m - menu based GUI interface to "dump" files, move back and forward
+* plamp.m - menu based GUI interface to "dump" files, move back and forward
           through file examining time and frequency domain parameters, lpc 
           model etc
   
           $ ./sinedec ../raw/hts1a.raw hts1a.mdl --lpc 10 --dump hts1a
           $ cd ../octave
           $ octave
-          octave:1> plamp("../src/hts1",25)
+          octave:1> plamp("../src/hts1a",25)
+
+* plphase.m - similar to plamp.m but for analysing phase models
+
+          $ ./sinedec ../raw/hts1a.raw hts1a.mdl --phase [0|1] --dump hts1a_phase
+          $ cd ../octave
+          $ octave
+          octave:1> plphase("../src/hts1a_phase",25)
+
+* plpitch.m - plot two pitch contours (.p files) and compare
+
+* plnlp.m   - plots a bunch of NLP pitch estimator states.  link:/images/codec2/tnlp_screenshot.png[Screenshot]
 
 [[directories]]
 Directories
@@ -265,8 +307,31 @@ Directories
   unittest - Unit test source code
   wav      - speech files in wave file format
 
-[[refs]]
-References
+[[other]]
+Other Uses
+----------
+
+The DSP algorithms contained in codec2 may be useful for other DSP
+applications, for example:
+
+* The
+  http://freetel.svn.sourceforge.net/viewvc/freetel/codec2/src/nlp.c?view=log[nlp.c]
+  pitch estimator is a working, tested, pitch estimator for human
+  speech.  NLP is an open source pitch estimator presented in C code
+  complete with a GUI debugging tool (plnlp.m
+  link:/images/codec2/tnlp_screenshot.png[screenshot]).  It can be run
+  stand-alone using the
+  http://freetel.svn.sourceforge.net/viewvc/freetel/codec2/unittest/tnlp.c?view=log[tnlp.c]
+  unit test program. It could be applied to other speech coding
+  research.  Pitch estimation is a popular subject in academia,
+  however most pitch estimators are described in papers, with the fine
+  implementation details left out.
+
+* The basic analysis/synthesis framework could be used for high
+  quality speech synthesis.
+
+[[refs]] 
+References 
 ----------
 
 [1] http://perens.com/[Bruce Perens] introducing the
diff --git a/codec2/TODO.txt b/codec2/TODO.txt
index 098a9278..99058ee9 100644
--- a/codec2/TODO.txt
+++ b/codec2/TODO.txt
@@ -1,38 +1,46 @@
 TODO for codec2
 ---------------
 
-Planned tasks and half formed ideas for codec2.....
+[ ] Important Open Issues
+    [ ] Why zero phase model doesn'y work for mmt1
+    [ ] Pitch errors on mmt1
+        + we may need a tracker
+    [ ] removal of LPC modelling errors for males
+        + first few haromic energies get raised
+    [ ] conversion to 20ms frames
+        + without significnat distortion
 
-[X] Convert files from DOS to Unix
-[X] Get sinenc and sinedec to build and run under gcc
-[ ] refactor
-    [ ] each source file has it's own header
-    [ ] no globals
-    [ ] Consistent file headers
-    [X] GPL2 notice in each file
-[ ] Replace Numerical Recipes in C (NRC) four1.c and four1.h with Gnu
-    Science Lib (GSL) SL FFT as NRC code has restrictive licencing
-[ ] A way to handle m=1 harmonic for males when LPC modelling
-[ ] Is BW expansion and Rk noise floor required before LSP quant
-[ ] test split VQ to make sure no silly errors
-    + for example test MSE or index historgram for training data
+[ ] Planned tasks and half formed ideas for codec2.....
+    [X] Convert files from DOS to Unix
+    [X] Get sinenc and sinedec to build and run under gcc
+    [ ] refactor
+        [ ] each source file has it's own header
+        [ ] no globals
+        [ ] Consistent file headers
+        [X] GPL2 notice in each file
+    [ ] Replace Numerical Recipes in C (NRC) four1.c and four1.h with Gnu
+        Science Lib (GSL) SL FFT as NRC code has restrictive licencing
+    [ ] A way to handle m=1 harmonic for males when LPC modelling
+    [ ] Is BW expansion and Rk noise floor required before LSP quant
+    [ ] test split VQ to make sure no silly errors
+        + for example test MSE or index historgram for training data
 
-[ ] Go through papers referenced in thesis and credit various
-    techniques to papers.
+    [ ] Go through papers referenced in thesis and credit various
+        techniques to papers.
       + sure there was somthing about zero phase synthesis is those papers
 
-[ ] voicing errors can be clearly seen in synthesised speech using pl2.m
+    [ ] voicing errors can be clearly seen in synthesised speech using pl2.m
 
-[ ] Voicing improvement
-    + voicing tracker, if enery about the same in frame n-1,n,n+1, and n-1, n+1
-      are voiced then make frame n voiced.
+    [ ] Voicing improvement
+        + voicing tracker, if enery about the same in frame n-1,n,n+1, and n-1, n+1
+          are voiced then make frame n voiced.
 
-[ ] mmt1 with zero phase model
-    + male with lots of background noise
-    + zero order synth model sounds more "clicky" on mmt1, this suggests less
-      dispersion
-    + could be due to IRS filtering or background noise
-    + exploring this could lead to better understanding on phase, e.g. add
-      noise to clean samples and explore effect on zero phase model
-    + wrote plphase.m to start analysing this
+    [ ] mmt1 with zero phase model
+        + male with lots of background noise
+        + zero order synth model sounds more "clicky" on mmt1, this suggests less
+          dispersion
+        + could be due to IRS filtering or background noise
+        + exploring this could lead to better understanding on phase, e.g. add
+          noise to clean samples and explore effect on zero phase model
+        + wrote plphase.m to start analysing this
 
diff --git a/codec2/src/defines.h b/codec2/src/defines.h
index 0c367947..255f0f0f 100644
--- a/codec2/src/defines.h
+++ b/codec2/src/defines.h
@@ -51,8 +51,7 @@
 
 /* Encoder defines */
 
-#define NW      280             /* analysis window size */
-#define AW_ENC  512		/* maximum encoder analysis window size */
+#define NW      279             /* analysis window size */
 #define FFT_ENC 512		/* size of FFT used for encoder analysis */
 
 /* Decoder defines */
diff --git a/codec2/src/globals.c b/codec2/src/globals.c
index ad92851f..ce5ce8a9 100644
--- a/codec2/src/globals.c
+++ b/codec2/src/globals.c
@@ -38,7 +38,7 @@ float sig;		/* energy of current frame */
 
 /* Globals used in encoder */
 
-float w[AW_ENC];	/* time domain hamming window */
+float w[M];	        /* time domain hamming window */
 COMP W[FFT_ENC];	/* DFT of w[] */
 COMP Sw[FFT_ENC];	/* DFT of current frame */
 
diff --git a/codec2/src/initenc.c b/codec2/src/initenc.c
index 333c6a15..393d9d40 100644
--- a/codec2/src/initenc.c
+++ b/codec2/src/initenc.c
@@ -34,53 +34,105 @@ void init_encoder()
 
   frames = 0;
 
-  /* Initialise sample buffer memories */
+  /* Initialise sample buffer memories, 1 1 stop divide by zero errors
+     and zero energy frames at the start */
 
-  for(i=0; i<M+AW_ENC/2; i++)
+  for(i=0; i<M; i++)
     Sn[i] = 1.0;
 
 }
 
-float make_window(int Nw)
+float make_window()
 {
   float m;
   COMP  temp;
   int   i,j;
 
-  /* Generate Hamming window centered on analysis window */
+  /* 
+     Generate Hamming window centered on M-sample pitch analysis window
+  
+  0            M/2           M-1
+  |-------------|-------------|
+        |-------|-------|
+            NW samples
+
+     All our analysis/synthsis is centred on the M/2 sample.               
+  */
 
   m = 0.0;
-  for(i=0; i<AW_ENC/2-Nw/2; i++)
+  for(i=0; i<M/2-NW/2; i++)
     w[i] = 0.0;
-  for(i=AW_ENC/2-Nw/2,j=0; i<AW_ENC/2+Nw/2; i++,j++) {
-    w[i] = 0.5 - 0.5*cos(TWO_PI*j/(Nw-1));
+  for(i=M/2-NW/2,j=0; i<M/2+NW/2; i++,j++) {
+    w[i] = 0.5 - 0.5*cos(TWO_PI*j/(NW-1));
     m += w[i]*w[i];
   }
-  for(i=AW_ENC/2+Nw/2; i<AW_ENC; i++)
+  for(i=M/2+NW/2; i<M; i++)
     w[i] = 0.0;
-
-  /* Normalise - make amplitude estimation straight forward */
+ 
+  /* Normalise - makes freq domain amplitude estimation straight
+     forward */
 
   m = 1.0/sqrt(m*FFT_ENC);
-  for(i=0; i<AW_ENC; i++) {
+  for(i=0; i<M; i++) {
     w[i] *= m;
   }
 
-  /* Generate DFT of analysis window, used for voicing estimation */
+  /* 
+     Generate DFT of analysis window, used for later processing.  Note
+     we modulo FFT_ENC shift the time domain window w[], this makes the
+     imaginary part of the DFT W[] equal to zero as the shifted w[] is
+     even about the n=0 time axis if NW is odd.  Having the imag part
+     of the DFT W[] makes computation easier.
+
+     0                      FFT_ENC-1
+     |-------------------------|
+
+      ----\               /----
+           \             / 
+            \           /          <- shifted version of window w[n]
+             \         /
+              \       /
+               -------
+
+     |---------|     |---------|      
+       NW/2              NW/2
+  */
 
   for(i=0; i<FFT_ENC; i++) {
     W[i].real = 0.0;
     W[i].imag = 0.0;
   }
-  for(i=0; i<AW_ENC/2; i++)
-    W[i].real = w[i+AW_ENC/2];
-  for(i=FFT_ENC-AW_ENC/2; i<FFT_ENC; i++)
-    W[i].real = w[i-FFT_ENC+AW_ENC/2];
+  for(i=0; i<NW/2; i++)
+    W[i].real = w[i+M/2];
+  for(i=FFT_ENC-NW/2,j=M/2-NW/2; i<FFT_ENC; i++,j++)
+    W[i].real = w[j];
 
   four1(&W[-1].imag,FFT_ENC,-1);         /* "Numerical Recipes in C" FFT */
 
-  /* re-arrange so that W is symmetrical about FFT_ENC/2 */
+  /* 
+      Re-arrange W[] to be symmetrical about FFT_ENC/2.  Makes later 
+      analysis convenient.
+
+   Before:
+
+
+     0                 FFT_ENC-1
+     |----------|---------|
+     __                   _       
+       \                 /          
+        \_______________/      
+
+   After:
+
+     0                 FFT_ENC-1
+     |----------|---------|
+               ___                        
+              /   \                
+     ________/     \_______     
 
+  */
+       
+      
   for(i=0; i<FFT_ENC/2; i++) {
     temp.real = W[i].real;
     temp.imag = W[i].imag;
diff --git a/codec2/src/listen.sh b/codec2/src/listen.sh
index 281e68ea..54c2db1d 100755
--- a/codec2/src/listen.sh
+++ b/codec2/src/listen.sh
@@ -1,9 +1,9 @@
 #!/bin/sh
-# listen1.sh
+# listen.sh
 # David Rowe 10 Sep 2009
 #
 # Run menu with common sample file options, headphone version
 
-../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw -d /dev/dsp1
+../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw
 
 
diff --git a/codec2/src/nlp.c b/codec2/src/nlp.c
index ef555095..8b81c479 100644
--- a/codec2/src/nlp.c
+++ b/codec2/src/nlp.c
@@ -45,7 +45,7 @@
 #define PI          3.141592654	/* mathematical constant                */
 #define T           0.1         /* threshold for local minima candidate */
 #define F0_MAX      500
-#define CNLP        0.3		/* post processor constant              */
+#define CNLP        0.3	        /* post processor constant              */
 
 /*---------------------------------------------------------------------------*\
                                                                             
diff --git a/codec2/src/quantise.c b/codec2/src/quantise.c
index 633812a9..e24fe087 100644
--- a/codec2/src/quantise.c
+++ b/codec2/src/quantise.c
@@ -247,7 +247,7 @@ float lpc_model_amplitudes(
   float  ak[]                   /* output aks */
 )
 {
-  float Wn[AW_ENC];
+  float Wn[M];
   float R[MAX_ORDER+1];
   float E;
   int   i;
@@ -257,9 +257,9 @@ float lpc_model_amplitudes(
   int   index;
   float se;
 
-  for(i=0; i<AW_ENC; i++)
+  for(i=0; i<M; i++)
       Wn[i] = Sn[i]*w[i];
-  autocorrelate(Wn,R,AW_ENC,order);
+  autocorrelate(Wn,R,M,order);
 
   levinson_durbin(R,ak,order);
   E = 0.0;
diff --git a/codec2/src/refine.c b/codec2/src/refine.c
index 3eb62984..c627bb6a 100644
--- a/codec2/src/refine.c
+++ b/codec2/src/refine.c
@@ -56,12 +56,12 @@ void dft_speech()
   /* move 2nd half to start of FFT input vector */
 
   for(i=0; i<NW/2; i++)
-    Sw[i].real = Sn[i+M/2]*w[i+AW_ENC/2];
+    Sw[i].real = Sn[i+M/2]*w[i+M/2];
 
   /* move 1st half to end of FFT input vector */
 
   for(i=0; i<NW/2; i++)
-    Sw[FFT_ENC-NW/2+i].real = Sn[i+M/2-NW/2]*w[i+AW_ENC/2-NW/2];
+    Sw[FFT_ENC-NW/2+i].real = Sn[i+M/2-NW/2]*w[i+M/2-NW/2];
 
   four1(&Sw[-1].imag,FFT_ENC,-1);
 }
diff --git a/codec2/src/sinedec.c b/codec2/src/sinedec.c
index d2dee22f..2a3310cc 100644
--- a/codec2/src/sinedec.c
+++ b/codec2/src/sinedec.c
@@ -177,10 +177,10 @@ int main(int argc, char *argv[])
     /* Read input speech */
 
     fread(buf,sizeof(short),N,fin);
-    for(i=0; i<N+AW_ENC/2; i++)
+    for(i=0; i<M-N; i++)
       Sn[i] = Sn[i+N];
     for(i=0; i<N; i++)
-      Sn[i+N+AW_ENC/2] = buf[i];
+      Sn[i+M-N] = buf[i];
     dump_Sn(Sn);
     dft_speech(); dump_Sw(Sw);   
 
@@ -197,7 +197,7 @@ int main(int argc, char *argv[])
     /* optional phase modelling */
 
     if (phase) {
-	float Wn[AW_ENC];		/* windowed speech samples */
+	float Wn[M];		        /* windowed speech samples */
 	float Rk[PHASE_LPC_ORD+1];	/* autocorrelation coeffs  */
         COMP  H[MAX_AMP];               /* LPC freq domain samples */
 	float n_min;
@@ -209,9 +209,9 @@ int main(int argc, char *argv[])
 	    /* Determine LPC model using time domain LPC if we don't have
 	       any LPCs yet */
 
-	    for(i=0; i<AW_ENC; i++)
+	    for(i=0; i<M; i++)
 		Wn[i] = Sn[i]*w[i];
-	    autocorrelate(Wn,Rk,AW_ENC,PHASE_LPC_ORD);
+	    autocorrelate(Wn,Rk,M,PHASE_LPC_ORD);
 	    levinson_durbin(Rk,ak,PHASE_LPC_ORD);
 	}
 	else
-- 
2.25.1