infringing on patents or access to commercial in confidence
information.
+[[hacking]]
+Hacking
+-------
+
+If you would like to work on the Codec2 code base here are some
+notes:
+
+* src/code.sh will perform the several processing steps
+  required to output speech files at various stages of processing, for
+  example:
+
+ $ ./code.sh hts1a
++
+will produce hts1a_uq (unquantised, i.e. baseline sinusoidal model),
+hts1a_phase0 (zero phase model), hts1a_lpc10 (10th order LPC model).
+
+* You can then listen to all of these samples (and the original)
+ using:
+
+ $ ./listen.sh hts1a
+
+* Specific notes about LPC and Phase modelling are below.
+
+* There are some useful scripts in the scripts directory, for example
+ wav2raw.sh, raw2wav.sh, playraw.sh, menu.sh. Note that code.sh and
+  listen.sh are in the src directory as that's where they get used most
+ of the time.
+
[[lpc]]
LPC Modelling Notes
-------------------
+Linear Prediction Coefficient (LPC) modelling is used to model the
+sine wave amplitudes { A }. The use of LPC in speech coding is
+common, although the application of LPC modelling to frequency domain
+coding is fairly novel. LPC techniques are mainly used in time domain
+codecs like LPC-10 and CELP.
+
+LPC modelling has a couple of advantages:
+
+* From time domain coding we know a lot about LPC, for example how to
+ quantise them efficiently using Line Spectrum Pairs (LSPs).
+
+* The number of amplitudes varies each frame as Wo and hence L vary.
+ This makes the { A } tricky to quantise and transmit. However it is
+ possible to convey the same information using a fixed number of
+ LPCs which makes the quantisation problem easier.
+
+To test LPC modelling:
+
$ ./sinedec ../raw/hts1a.raw hts1a.mdl --lpc 10 - hts1a_lpc10.raw
The blog post [4] discusses why LPC modelling works so well when Am
../unittest/tnlp ../raw/$1.raw ../unittest/$1_nlp.p
../src/sinenc ../raw/$1.raw %1.mdl 300 ../unittest/$1_nlp.p
../src/sinedec ../raw/$1.raw %1.mdl -o $1_uq.raw
-../src/sinedec ../raw/$1.raw %1.mdl --phase 0 -o $1_phase0.raw
+../src/sinedec ../raw/$1.raw %1.mdl --phase 0 -o $1_phase0.raw --postfilter
../src/sinedec ../raw/$1.raw %1.mdl --lpc 10 -o $1_lpc10.raw
#define AW_DEC 160 /* number of samples in synthesis window */
#define FFT_DEC 512 /* number of points in DFT */
-#define TW 40 /* Trapezoidal UV synthesis window overlap */
+#define TW 40 /* Trapezoidal synthesis window overlap */
#define MAX_STR 256
/*---------------------------------------------------------------------------*\
#
# Run menu with common sample file options, headphone version
-../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw $1_lpc10.raw -d /dev/dsp1
+../script/menu.sh ../raw/$1.raw $1_uq.raw $1_phase0.raw $1_lpc10.raw ../raw/$1_speex_8k.raw -d /dev/dsp1
float test_candidate_mbe(COMP Sw[], float f0);
float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax);
float post_process_sub_multiples(COMP Fw[],
- int pmin, int pmax, float gmax, int gmax_bin);
+ int pmin, int pmax, float gmax, int gmax_bin,
+ float *prev_Wo);
extern int frames;
/*---------------------------------------------------------------------------*\
int pmin, /* minimum pitch value */
int pmax, /* maximum pitch value */
float *pitch, /* estimated pitch period in samples */
- COMP Sw[] /* Freq domain version of Sn[] */
+ COMP Sw[], /* Freq domain version of Sn[] */
+ float *prev_Wo
)
{
static float sq[PMAX_M]; /* squared speech samples */
Fw[i].real = 0.0;
Fw[i].imag = 0.0;
}
- for(i=0; i<m/DEC; i++)
+ for(i=0; i<m/DEC; i++) {
Fw[i].real = sq[i*DEC]*(0.5 - 0.5*cos(2*PI*i/(m/DEC-1)));
+ }
dump_dec(Fw);
four1(&Fw[-1].imag,PE_FFT_SIZE,1);
for(i=0; i<PE_FFT_SIZE; i++)
#ifdef POST_PROCESS_MBE
best_f0 = post_process_mbe(Fw, pmin, pmax, gmax);
#else
- best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin);
+ best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, prev_Wo);
#endif
/* Shift samples in buffer to make room for new samples */
\*---------------------------------------------------------------------------*/
float post_process_sub_multiples(COMP Fw[],
- int pmin, int pmax, float gmax, int gmax_bin)
+ int pmin, int pmax, float gmax, int gmax_bin,
+ float *prev_Wo)
{
int min_bin, cmax_bin;
int mult;
float thresh, best_f0;
int b, bmin, bmax, lmax_bin;
float lmax, cmax;
+ int prev_f0_bin;
/* post process estimate by searching submultiples */
mult = 2;
min_bin = PE_FFT_SIZE*DEC/pmax;
- thresh = CNLP*gmax;
cmax_bin = gmax_bin;
+ prev_f0_bin = *prev_Wo*(4000.0/PI)*(PE_FFT_SIZE*DEC)/SAMPLE_RATE;
while(gmax_bin/mult >= min_bin) {
bmax = 1.2*b;
if (bmin < min_bin)
bmin = min_bin;
-
+
+    /* lower threshold to favour the previous frame's pitch estimate;
+       this is a form of pitch tracking */
+
+ if ((prev_f0_bin > bmin) && (prev_f0_bin < bmax))
+ thresh = CNLP*0.5*gmax;
+ else
+ thresh = CNLP*gmax;
+
lmax = 0;
for (b=bmin; b<=bmax; b++) /* look for maximum in interval */
if (Fw[b].real > lmax) {
}
if (lmax > thresh)
- if (lmax > Fw[lmax_bin-1].real && lmax > Fw[lmax_bin+1].real) {
+ if ((lmax > Fw[lmax_bin-1].real) && (lmax > Fw[lmax_bin+1].real)) {
cmax = lmax;
cmax_bin = lmax_bin;
}
#define NLP_NTAP 48 /* Decimation LPF order */
-float nlp(float Sn[], int n, int m, int pmin, int pmax, float *pitch, COMP Sw[]);
+float nlp(float Sn[], int n, int m, int pmin, int pmax, float *pitch, COMP Sw[], float *prev_Wo);
#endif
/* generate excitation */
if (m <= Lrand) {
- b = floor(m*model.Wo*FFT_DEC/TWO_PI + 0.5);
Ex[m].real = cos(ex_phase[0]*m);
Ex[m].imag = sin(ex_phase[0]*m);
float Em; /* energy in band */
float Am; /* spectral amplitude sample */
float signal, noise;
+ float E1,Am1;
+
+ Am1 = model->A[1];
r = TWO_PI/(FFT_DEC);
}
*snr = 10.0*log10(signal/noise);
- /* attenuate fundamental by 30dB if F0 < 150 Hz. LPC modelling often makes
- big errors on 1st harmonic, which is usually at very low level due to
- analog HPF.
+ /*
+ Attenuate fundamental by 30dB if F0 < 150 Hz and LPC modelling
+ error for A[1] is larger than 6dB.
- Another option is to use a single bit to swith thos attenuation
- in and out based on measured error an encoder. That way
- non-HPF speech won't be impaired.
- */
+ LPC modelling often makes big errors on 1st harmonic, for example
+ when the fundamental has been removed by analog high pass
+ filtering before sampling. However on unfiltered speech from
+ high quality sources we would like to keep the fundamental to
+ maintain the speech quality. So we check the error in A[1] and
+ attenuate it if the error is large to avoid annoying low
+ frequency energy after LPC modelling.
- if (model->Wo < PI*150.0/4000) {
- model->A[1] *= 0.032;
- }
+ This will require a single bit to quantise, on top of the other
+ spectral magnitude bits (i.e. LSP bits + 1 total).
+ */
+ E1 = fabs(20.0*log10(Am1) - 20.0*log10(model->A[1]));
+ if (E1 > 6.0)
+ if (model->Wo < (PI*150.0/4000)) {
+ model->A[1] *= 0.032;
+ }
}
float pitch;
int i;
int dump;
+ float prev_Wo;
if (argc < 3) {
printf("\nusage: tnlp InputRawSpeechFile OutputPitchTextFile "
make_window(NW);
frames = 0;
+ prev_Wo = 0;
while(fread(buf,sizeof(short),N,fin)) {
frames++;
dft_speech();
dump_Sn(Sn); dump_Sw(Sw);
- nlp(Sn,N,M,PITCH_MIN,PITCH_MAX,&pitch,Sw);
+ nlp(Sn,N,M,PITCH_MIN,PITCH_MAX,&pitch,Sw,&prev_Wo);
+ prev_Wo = TWO_PI/pitch;
fprintf(fout,"%f\n",pitch);
}