SINEDEC_OBJ = sinedec.o globals.o initenc.o initdec.o four1.o synth.o \
quantise.o lpc.o dump.o refine.o ../speex/lsp.o \
../speex/quant_lsp.o ../speex/bits.o ../speex/lsp_tables_nb.o \
- ../speex/high_lsp_tables.o phase.o
+ ../speex/high_lsp_tables.o phase.o postfilter.o
all: sinenc sinedec
../src/sinenc ../raw/$1.raw %1.mdl 300 ../unittest/$1_nlp.p
../src/sinedec ../raw/$1.raw %1.mdl -o $1_uq.raw
../src/sinedec ../raw/$1.raw %1.mdl --phase 0 -o $1_phase0.raw
+../src/sinedec ../raw/$1.raw %1.mdl --lpc 10 -o $1_lpc10.raw
static FILE *fdec = NULL;
static FILE *fsnr = NULL;
static FILE *fak = NULL;
+static FILE *fbg = NULL;
static char prefix[MAX_STR];
fclose(fsnr);
if (fak != NULL)
fclose(fak);
+ if (fbg != NULL)
+ fclose(fbg);
}
void dump_Sn(float Sn[]) {
fprintf(fdec,"\n");
}
+/*
+  dump_bg()
+
+  Appends one tab-separated line (e, bg_est, percent_uv) to the text
+  file "<prefix>_bg.txt".  Does nothing unless dumping is enabled
+  (dumpon).  The file is opened lazily on the first call.
+*/
+void dump_bg(float e, float bg_est, float percent_uv) {
+    char s[MAX_STR];
+
+    if (!dumpon) return;
+
+    if (fbg == NULL) {
+        /* bounded write: prefix plus the "_bg.txt" suffix may not fit
+           in MAX_STR chars, so never write past the end of s[] (a
+           too-long name is truncated rather than overflowing) */
+        snprintf(s, sizeof(s), "%s_bg.txt", prefix);
+        fbg = fopen(s, "wt");
+        assert(fbg != NULL);
+    }
+
+    fprintf(fbg,"%f\t%f\t%f\n", e, bg_est, percent_uv);
+}
+
+
+
void dump_on(char filename_prefix[]);
void dump_off();
+
void dump_Sn(float Sn[]);
void dump_Sw(COMP Sw[]);
void dump_Sw_(COMP Sw_[]);
+
+/* amplitude modelling */
+
void dump_model(MODEL *m);
void dump_quantised_model(MODEL *m);
void dump_Pw(COMP Pw[]);
void dump_Fw(COMP Fw[]);
void dump_e(float e_hz[]);
+/* post filter */
+
+void dump_bg(float e, float bg_est, float percent_uv);
+
#endif
#
# Run menu with common sample file options, headphone version
-../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw
+../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw $1_lpc10.raw
#
# Run menu with common sample file options, headphone version
-../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw -d /dev/dsp1
+../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw $1_lpc10.raw -d /dev/dsp1
if (Lrand < 1) Lrand = 1;
if (Lrand > model.L) Lrand = model.L;
}
-
+
/* update excitation fundamental phase track */
ex_phase[0] += (*prev_Wo+model.Wo)*N/2.0;
/* generate excitation */
if (m <= Lrand) {
+ b = floor(m*model.Wo*FFT_DEC/TWO_PI + 0.5);
Ex[m].real = cos(ex_phase[0]*m);
Ex[m].imag = sin(ex_phase[0]*m);
"clicky"*/
//Ex[m].real = cos(ex_phase[0]*m + model.Wo*m*m*0.3);
//Ex[m].imag = sin(ex_phase[0]*m + model.Wo*m*m*0.3);
+
+ /* following is an experiment to use the phase of a glottal pulse
+ (see octave/glottal.m) in an attempt to make mmt1 and hts1 a little
+ less "clicky", i.e. disperse the pulse energy away from the point
+ of onset. Result was no difference in speech quality, in fact
+ no difference at all. Could be an implementation error I guess. */
+ //b = floor(m*model->Wo*FFT_DEC/TWO_PI + 0.5);
+ //Ex[m].real = cos(ex_phase[0]*m + glottal[b]);
+ //Ex[m].imag = sin(ex_phase[0]*m + glottal[b]);
+
}
else {
/* we probably don't need to LPC filter phase in unvoiced case,
--- /dev/null
+/*---------------------------------------------------------------------------*\
+
+ FILE........: postfilter.c
+ AUTHOR......: David Rowe
+ DATE CREATED: 13/09/09
+
+ Postfilter to improve sound quality for speech with high levels of
+ background noise. Unlike mixed-excitation models requires no bits
+ to be transmitted to handle background noise.
+
+\*---------------------------------------------------------------------------*/
+
+/*
+ Copyright (C) 2009 David Rowe
+
+ All rights reserved.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2, as
+ published by the Free Software Foundation. This program is
+ distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "postfilter.h"
+#include "dump.h"
+
+/*---------------------------------------------------------------------------*\
+
+ DEFINES
+
+\*---------------------------------------------------------------------------*/
+
+#define BG_THRESH 40.0 /* only frames with average energy (dB) below this update bg_est */
+#define BG_BETA 0.1 /* weight of the newest frame in the bg_est averaging filter */
+
+/*---------------------------------------------------------------------------*\
+
+ postfilter()
+
+ The post filter is designed to help with speech corrupted by
+ background noise. The zero phase model tends to make speech with
+ background noise sound "clicky". With high levels of background
+ noise the low level inter-formant parts of the spectrum will contain
+ noise rather than speech harmonics, so modelling them as voiced
+ (i.e. a continuous, non-random phase track) is inaccurate.
+
+ Some codecs (like MBE) have a mixed voicing model that breaks the
+ spectrum into voiced and unvoiced regions. Several bits/frame
+ (5-12) are required to transmit the frequency selective voicing
+ information. Mixed excitation also requires accurate voicing
+ estimation (parameter estimators always break occasionally under
+ exceptional condition).
+
+ In our case we use a post filter approach which requires no
+ additional bits to be transmitted. The decoder measures the average
+ level of the background noise during unvoiced frames. If a harmonic
+ is less than this level it is made unvoiced by randomising its
+ phase.
+
+ This idea is rather experimental. Some potential problems that may
+ happen:
+
+ 1/ If someone says "aaaaaaaahhhhhhhhh" will the background estimator track
+ up to speech level? This would be a bad thing.
+
+ 2/ If background noise suddenly disappears from the source speech does
+ the estimate drop quickly? What if noise suddenly re-appears?
+
+ 3/ Background noise with a non-flat spectrum. Current algorithm just
+ considers the spectrum as a whole, but this could be broken up into
+ bands, each with their own estimator.
+
+ 4/ Males and females with the same level of background noise. Check
+ performance the same. Changing Wo affects width of each band, may
+ affect bg energy estimates.
+
+ 5/ Not sure what happens during long periods of unvoiced speech
+ e.g. "sshhhhhhh"
+
+\*---------------------------------------------------------------------------*/
+
+/*
+  model  - harmonic model for the current frame; phi[m] is overwritten
+           for harmonics judged to be below the background level
+  voiced - non-zero if the current frame is voiced
+  bg_est - running background level estimate in dB, read and updated
+           here; the caller keeps it between frames
+*/
+void postfilter(
+    MODEL *model,
+    int    voiced,
+    float *bg_est
+)
+{
+    int   m, uv;
+    float e, energy;
+
+    /* determine average energy across spectrum */
+
+    energy = 0.0;
+    for(m=1; m<=model->L; m++) {
+        energy += model->A[m]*model->A[m];
+    }
+
+    e = 10.0*log10(energy/model->L);
+
+    /* If beneath threshold, update bg estimate.  The idea of the
+       threshold is to prevent updating during high level speech.
+
+       Frames with zero energy are skipped: for them e is -infinity
+       (or NaN when L is 0), and feeding that into the averaging
+       filter would pin bg_est at -infinity for the rest of the
+       run, permanently disabling the post filter. */
+
+    if ((energy > 0.0) && (e < BG_THRESH) && !voiced) {
+        *bg_est = *bg_est*(1.0 - BG_BETA) + e*BG_BETA;
+    }
+
+    /* now mess with phases during voiced frames to make any harmonics
+       less than our background estimate unvoiced.
+    */
+
+    uv = 0;
+    if (voiced) {
+        for(m=1; m<=model->L; m++) {
+            if (20.0*log10(model->A[m]) < *bg_est) {
+                model->phi[m] = TWO_PI*(float)rand()/RAND_MAX;
+                uv++;
+            }
+        }
+    }
+
+    /* percentage of harmonics made unvoiced; avoid 0/0 when L is 0 */
+
+    dump_bg(e, *bg_est, (model->L > 0) ? 100.0*uv/model->L : 0.0);
+
+}
--- /dev/null
+/*---------------------------------------------------------------------------*\
+
+ FILE........: postfilter.h
+ AUTHOR......: David Rowe
+ DATE CREATED: 13/09/09
+
+ Postfilter header file.
+
+\*---------------------------------------------------------------------------*/
+
+/*
+ Copyright (C) 2009 David Rowe
+
+ All rights reserved.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2, as
+ published by the Free Software Foundation. This program is
+ distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#ifndef __POSTFILTER__
+#define __POSTFILTER__
+
+#include "sine.h"
+
+/* Background noise post filter.  On unvoiced frames whose average
+   energy is below a threshold, updates the running estimate *bg_est.
+   On voiced frames, randomises model->phi[m] for every harmonic whose
+   level is below *bg_est.  The caller owns bg_est and must keep it
+   between calls. */
+void postfilter(MODEL *model, int voiced, float *bg_est);
+
+#endif
#include "phase.h"
#include "lpc.h"
#include "synth.h"
+#include "postfilter.h"
/*---------------------------------------------------------------------------*\
int phase, phase_model;
float prev_Wo, ex_phase;
+ int postfilt;
+ float bg_est;
+
if (argc < 3) {
printf("usage: sinedec InputFile ModelFile [-o OutputFile] [-o lpc Order]\n");
printf(" [--dump DumpFilePrefix]\n");
assert((phase_model == 0) || (phase_model == 1));
}
+ bg_est = 0.0;
+ postfilt = switch_present("--postfilter",argc,argv);
+
/* Initialise ------------------------------------------------------------*/
init_decoder();
dump_snr(snr);
if (phase_model == 0) {
/* just to make sure we are not cheating - kill all phases */
- for(i=0; i<MAX_AMP; i++)
- model.phi[i] = 0;
+ //for(i=0; i<MAX_AMP; i++)
+ // model.phi[i] = 0;
phase_synth_zero_order(snr, H, &prev_Wo, &ex_phase);
}
}
}
+ if (postfilt)
+ postfilter(&model, snr>2.0, &bg_est);
+
+
/* Synthesise speech */
if (fout != NULL) {
model.phi[m] = atan2(Sw[b].imag,Sw[b].real);
- /* construct all voiced model spectrum and estimate voicing */
+ #ifdef MBE_VOICING_NEEDED
+ /* construct all voiced model spectrum and estimate voicing using MBE model */
E = 0.0;
for(i=am; i<bm; i++) {
E = pow(Sw[i].real - Sw_[i].real, 2.0) + pow(Sw[i].imag - Sw_[i].imag, 2.0);
}
model.v[m] = E/den;
+ #endif
}
}
track. So in unvoiced frames or in cases where the fundamental
frequency varies by more than 20%, we don't add the small frequency
offset.
-
+
+ Result: when tested there was no difference in output speech quality. The
+ partial unvoiced sound when using the zero phase model was found to be
+ due to mis-alignment of the LPC analysis window and accidental addition
+ of a random phase component. So we are sticking with synthesise_mixed()
+ above for now.
+
\*---------------------------------------------------------------------------*/
void synthesise_continuous_phase(