From 322497ab9fa767bb9c7299e17b0d1108abf0cdce Mon Sep 17 00:00:00 2001 From: drowe67 Date: Sun, 4 Jun 2017 07:18:43 +0000 Subject: [PATCH] original phases working with 8000 Hz and 16000 Hz, sounds OK. Can see better ways to handle this in nlp.c, plenty of work for a future clean up git-svn-id: https://svn.code.sf.net/p/freetel/code@3153 01035d8c-6547-0410-b346-abe4f91aad63 --- codec2-dev/src/c2sim.c | 208 +++++++++++++++++-------------- codec2-dev/src/codec2.c | 7 +- codec2-dev/src/codec2_internal.h | 2 +- codec2-dev/src/defines.h | 2 +- codec2-dev/src/nlp.c | 167 ++++++++++++++++++++----- codec2-dev/src/nlp.h | 6 +- codec2-dev/src/os.h | 2 +- codec2-dev/src/sine.c | 1 + codec2-dev/unittest/tnlp.c | 101 +++++++-------- 9 files changed, 311 insertions(+), 185 deletions(-) diff --git a/codec2-dev/src/c2sim.c b/codec2-dev/src/c2sim.c index 35f8f609..af8eaecf 100644 --- a/codec2-dev/src/c2sim.c +++ b/codec2-dev/src/c2sim.c @@ -64,34 +64,10 @@ void print_help(const struct option *long_options, int num_opts, char* argv[]); int main(int argc, char *argv[]) { - C2CONST c2const = c2const_create(8000); - int n_samp = c2const.n_samp; - int m_pitch = c2const.m_pitch; - FILE *fout = NULL; /* output speech file */ - FILE *fin; /* input speech file */ - short buf[N_SAMP]; /* input/output buffer */ - float buf_float[N_SAMP]; - float Sn[m_pitch]; /* float input speech samples */ - float Sn_pre[m_pitch]; /* pre-emphasised input speech samples */ - COMP Sw[FFT_ENC]; /* DFT of Sn[] */ - codec2_fft_cfg fft_fwd_cfg; - codec2_fftr_cfg fftr_fwd_cfg; - codec2_fftr_cfg fftr_inv_cfg; - float w[m_pitch]; /* time domain hamming window */ - COMP W[FFT_ENC]; /* DFT of w[] */ - MODEL model; - float Pn[2*N_SAMP]; /* trapezoidal synthesis window */ - float Sn_[2*N_SAMP]; /* synthesised speech */ - int i,m; /* loop variable */ - int frames; - float prev_Wo, prev__Wo, prev_uq_Wo; - float pitch; - char out_file[MAX_STR]; - char ampexp_arg[MAX_STR]; - char phaseexp_arg[MAX_STR]; - float snr; - float 
sum_snr; + int Fs = 8000; + int set_fs; + int orderi; int lpc_model = 0, order = LPC_ORD; int lsp = 0, lspd = 0, lspvq = 0; @@ -101,63 +77,39 @@ int main(int argc, char *argv[]) int lspanssi = 0, #endif int prede = 0; - float pre_mem = 0.0, de_mem = 0.0; - float ak[order]; - // COMP Sw_[FFT_ENC]; - // COMP Ew[FFT_ENC]; - - int phase0 = 0; - float ex_phase[MAX_AMP+1]; - int postfilt; - float bg_est = 0.0; - int hand_voicing = 0, phaseexp = 0, ampexp = 0, hi = 0, simlpcpf = 0, lspmelread = 0; int lpcpf = 0; FILE *fvoicing = 0; FILE *flspmel = 0; - - MODEL prev_model; int dec; int decimate = 1; - float lsps[order]; - float e, prev_e; - int lsp_indexes[order]; - float lsps_[order]; - float Woe_[2]; - - float lsps_dec[4][LPC_ORD], e_dec[4], weight, weight_inc, ak_dec[4][LPC_ORD]; - MODEL model_dec[4], prev_model_dec; - float prev_lsps_dec[order], prev_e_dec; - - void *nlp_states; - float hpf_states[2]; + int amread, Woread; + int awread; + int hmread; + int phase0 = 0; int scalar_quant_Wo_e = 0; int scalar_quant_Wo_e_low = 0; int vector_quant_Wo_e = 0; int dump_pitch_e = 0; - FILE *fjvm = NULL; - #ifdef DUMP - int dump; - #endif - #if 0 - struct PEXP *pexp = NULL; - struct AEXP *aexp = NULL; - #endif float gain = 1.0; int bpf_en = 0; int bpfb_en = 0; - float bpf_buf[BPF_N+N_SAMP]; - float lspmelvq_mse = 0.0; - int amread, Woread; FILE *fam = NULL, *fWo = NULL; - int awread; FILE *faw = NULL; - int hmread; FILE *fhm = NULL; + FILE *fjvm = NULL; + #ifdef DUMP + int dump; + #endif + char ampexp_arg[MAX_STR]; + char phaseexp_arg[MAX_STR]; + char out_file[MAX_STR]; + FILE *fout = NULL; /* output speech file */ char* opt_string = "ho:"; struct option long_options[] = { + { "Fs", required_argument, &set_fs, 1 }, { "lpc", required_argument, &lpc_model, 1 }, { "lspjnd", no_argument, &lspjnd, 1 }, { "lspmel", no_argument, &lspmel, 1 }, @@ -200,36 +152,6 @@ int main(int argc, char *argv[]) { NULL, no_argument, NULL, 0 } }; int num_opts=sizeof(long_options)/sizeof(struct option); - 
COMP Aw[FFT_ENC]; - COMP H[MAX_AMP]; - - for(i=0; i order)) { fprintf(stderr, "Error in LPC order (4 to %d): %s\n", order, optarg); @@ -396,6 +324,7 @@ int main(int argc, char *argv[]) /* Input file */ + FILE *fin; /* input speech file */ if (strcmp(argv[optind], "-") == 0) fin = stdin; else if ((fin = fopen(argv[optind],"rb")) == NULL) { fprintf(stderr, "Error opening input speech file: %s: %s.\n", @@ -403,6 +332,93 @@ int main(int argc, char *argv[]) exit(1); } + C2CONST c2const = c2const_create(Fs); + int n_samp = c2const.n_samp; + int m_pitch = c2const.m_pitch; + + short buf[N_SAMP]; /* input/output buffer */ + float buf_float[N_SAMP]; + float Sn[m_pitch]; /* float input speech samples */ + float Sn_pre[m_pitch]; /* pre-emphasised input speech samples */ + COMP Sw[FFT_ENC]; /* DFT of Sn[] */ + codec2_fft_cfg fft_fwd_cfg; + codec2_fftr_cfg fftr_fwd_cfg; + codec2_fftr_cfg fftr_inv_cfg; + float w[m_pitch]; /* time domain hamming window */ + COMP W[FFT_ENC]; /* DFT of w[] */ + MODEL model; + float Pn[2*N_SAMP]; /* trapezoidal synthesis window */ + float Sn_[2*N_SAMP]; /* synthesised speech */ + int i,m; /* loop variable */ + int frames; + float prev_f0; + float pitch; + float snr; + float sum_snr; + + float pre_mem = 0.0, de_mem = 0.0; + float ak[order]; + // COMP Sw_[FFT_ENC]; + // COMP Ew[FFT_ENC]; + + float ex_phase[MAX_AMP+1]; + + float bg_est = 0.0; + + + MODEL prev_model; + float lsps[order]; + float e, prev_e; + int lsp_indexes[order]; + float lsps_[order]; + float Woe_[2]; + + float lsps_dec[4][LPC_ORD], e_dec[4], weight, weight_inc, ak_dec[4][LPC_ORD]; + MODEL model_dec[4], prev_model_dec; + float prev_lsps_dec[order], prev_e_dec; + + void *nlp_states; + float hpf_states[2]; + #if 0 + struct PEXP *pexp = NULL; + struct AEXP *aexp = NULL; + #endif + float bpf_buf[BPF_N+N_SAMP]; + float lspmelvq_mse = 0.0; + + COMP Aw[FFT_ENC]; + COMP H[MAX_AMP]; + + + for(i=0; ic2const, c2->Pn); c2->fftr_inv_cfg = codec2_fftr_alloc(FFT_DEC, 1, NULL, NULL); quantise_init(); 
- c2->prev_Wo_enc = 0.0; + c2->prev_f0_enc = 1/P_MAX_S; c2->bg_est = 0.0; c2->ex_phase = 0.0; @@ -163,7 +163,7 @@ struct CODEC2 * codec2_create(int mode) } c2->prev_e_dec = 1; - c2->nlp = nlp_create(m_pitch); + c2->nlp = nlp_create(&c2->c2const); if (c2->nlp == NULL) { return NULL; } @@ -2103,7 +2103,7 @@ void analyse_one_frame(struct CODEC2 *c2, MODEL *model, short speech[]) /* Estimate pitch */ - nlp(c2->nlp,c2->Sn,n_samp,c2->c2const.p_min,c2->c2const.p_max,&pitch,Sw, c2->W, &c2->prev_Wo_enc); + nlp(c2->nlp, c2->Sn, n_samp, &pitch, Sw, c2->W, &c2->prev_f0_enc); PROFILE_SAMPLE_AND_LOG(model_start, nlp_start, " nlp"); model->Wo = TWO_PI/pitch; @@ -2116,7 +2116,6 @@ void analyse_one_frame(struct CODEC2 *c2, MODEL *model, short speech[]) estimate_amplitudes(model, Sw, c2->W, 0); PROFILE_SAMPLE_AND_LOG(estamps, two_stage, " est_amps"); est_voicing_mbe(&c2->c2const, model, Sw, c2->W); - c2->prev_Wo_enc = model->Wo; PROFILE_SAMPLE_AND_LOG2(estamps, " est_voicing"); #ifdef DUMP dump_model(model); diff --git a/codec2-dev/src/codec2_internal.h b/codec2-dev/src/codec2_internal.h index 15115873..48fcb994 100644 --- a/codec2-dev/src/codec2_internal.h +++ b/codec2-dev/src/codec2_internal.h @@ -53,7 +53,7 @@ struct CODEC2 { float *Sn_; /* [2*n_samp] synthesised output speech */ float ex_phase; /* excitation model phase track */ float bg_est; /* background noise estimate for post filter */ - float prev_Wo_enc; /* previous frame's pitch estimate */ + float prev_f0_enc; /* previous frame's f0 estimate */ MODEL prev_model_dec; /* previous frame's model parameters */ float prev_lsps_dec[LPC_ORD]; /* previous frame's LSPs */ float prev_e_dec; /* previous frame's LPC energy */ diff --git a/codec2-dev/src/defines.h b/codec2-dev/src/defines.h index 968c0bbe..ec94ed8e 100644 --- a/codec2-dev/src/defines.h +++ b/codec2-dev/src/defines.h @@ -43,7 +43,7 @@ #define PI 3.141592654 /* mathematical constant */ #endif #define TWO_PI 6.283185307 /* mathematical constant */ -#define MAX_STR 256 /* 
maximum string size */ +#define MAX_STR 2048 /* maximum string size */ #define FFT_ENC 512 /* size of FFT used for encoder */ #define FFT_DEC 512 /* size of FFT used in decoder */ diff --git a/codec2-dev/src/nlp.c b/codec2-dev/src/nlp.c index 36037b43..8c8d5f1c 100644 --- a/codec2-dev/src/nlp.c +++ b/codec2-dev/src/nlp.c @@ -31,6 +31,7 @@ #include "codec2_fft.h" #undef PROFILE #include "machdep.h" +#include "os.h" #include #include @@ -42,7 +43,7 @@ \*---------------------------------------------------------------------------*/ -#define PMAX_M 600 /* maximum NLP analysis window size */ +#define PMAX_M 320 /* maximum NLP analysis window size */ #define COEFF 0.95 /* notch filter parameter */ #define PE_FFT_SIZE 512 /* DFT size for pitch estimation */ #define DEC 5 /* decimation factor */ @@ -54,7 +55,11 @@ #define NLP_NTAP 48 /* Decimation LPF order */ #undef POST_PROCESS_MBE /* choose post processor */ -//#undef DUMP +/* 8 to 16 kHz sample rate conversion */ + +#define FDMDV_OS 2 /* oversampling rate */ +#define FDMDV_OS_TAPS_16K 48 /* number of OS filter taps at 16kHz */ +#define FDMDV_OS_TAPS_8K (FDMDV_OS_TAPS_16K/FDMDV_OS) /* number of OS filter taps at 8kHz */ /*---------------------------------------------------------------------------*\ @@ -116,12 +121,15 @@ const float nlp_fir[] = { }; typedef struct { + int Fs; /* sample rate in Hz */ int m; float w[PMAX_M/DEC]; /* DFT window */ float sq[PMAX_M]; /* squared speech samples */ float mem_x,mem_y; /* memory for notch filter */ float mem_fir[NLP_NTAP]; /* decimation FIR filter memory */ - codec2_fft_cfg fft_cfg; /* kiss FFT config */ + codec2_fft_cfg fft_cfg; /* kiss FFT config */ + float *Sn16k; /* Fs=16kHz input speech vector */ + FILE *f; } NLP; #ifdef POST_PROCESS_MBE @@ -130,7 +138,8 @@ float post_process_mbe(COMP Fw[], int pmin, int pmax, float gmax, COMP Sw[], COM #endif float post_process_sub_multiples(COMP Fw[], int pmin, int pmax, float gmax, int gmax_bin, - float *prev_Wo); + float *prev_f0); +static 
void fdmdv_16_to_8(float out8k[], float in16k[], int n); /*---------------------------------------------------------------------------*\ @@ -140,20 +149,41 @@ float post_process_sub_multiples(COMP Fw[], \*---------------------------------------------------------------------------*/ -void *nlp_create( -int m /* analysis window size */ -) +void *nlp_create(C2CONST *c2const) { NLP *nlp; int i; - - assert(m <= PMAX_M); + int m = c2const->m_pitch; + int Fs = c2const->Fs; nlp = (NLP*)malloc(sizeof(NLP)); if (nlp == NULL) return NULL; + assert((Fs == 8000) || (Fs == 16000)); + nlp->Fs = Fs; + nlp->m = m; + + /* if running at 16kHz allocate storage for decimating filter memory */ + + if (Fs == 16000) { + nlp->Sn16k = (float*)malloc(sizeof(float)*(FDMDV_OS_TAPS_16K + c2const->n_samp)); + for(i=0; iSn16k[i] = 0.0; + } + if (nlp->Sn16k == NULL) { + free(nlp); + return NULL; + } + + /* most processing occurs at 8 kHz sample rate so halve m */ + + m /= 2; + } + + assert(m <= PMAX_M); + for(i=0; iw[i] = 0.5 - 0.5*cosf(2*PI*i/(m/DEC-1)); } @@ -186,6 +216,9 @@ void nlp_destroy(void *nlp_state) nlp = (NLP*)nlp_state; codec2_fft_free(nlp->fft_cfg); + if (nlp->Fs == 16000) { + free(nlp->Sn16k); + } free(nlp_state); } @@ -215,28 +248,26 @@ void nlp_destroy(void *nlp_state) References: - [1] http://www.itr.unisa.edu.au/~steven/thesis/dgr.pdf Chapter 4 + [1] http://rowetel.com/downloads/1997_rowe_phd_thesis.pdf Chapter 4 \*---------------------------------------------------------------------------*/ float nlp( void *nlp_state, - float Sn[], /* input speech vector */ - int n, /* frames shift (no. new samples in Sn[]) */ - int pmin, /* minimum pitch value */ - int pmax, /* maximum pitch value */ - float *pitch, /* estimated pitch period in samples */ - COMP Sw[], /* Freq domain version of Sn[] */ - COMP W[], /* Freq domain window */ - float *prev_Wo + float Sn[], /* input speech vector */ + int n, /* frames shift (no. 
new samples in Sn[]) */ + float *pitch, /* estimated pitch period in samples at current Fs */ + COMP Sw[], /* Freq domain version of Sn[] */ + COMP W[], /* Freq domain window */ + float *prev_f0 /* previous pitch f0 in Hz, memory for pitch tracking */ ) { NLP *nlp; - float notch; /* current notch filter output */ + float notch; /* current notch filter output */ COMP Fw[PE_FFT_SIZE]; /* DFT of squared signal (input/output) */ float gmax; int gmax_bin; - int m, i,j; + int m, i, j; float best_f0; PROFILE_VAR(start, tnotch, filter, peakpick, window, fft, magsq, shiftmem); @@ -244,12 +275,43 @@ float nlp( nlp = (NLP*)nlp_state; m = nlp->m; - PROFILE_SAMPLE(start); - /* Square, notch filter at DC, and LP filter vector */ - for(i=m-n; isq[i] = Sn[i]*Sn[i]; + /* If running at 16 kHz decimate to 8 kHz, as NLP was designed for + Fs = 8kHz. The decimating filter introduces about 3ms of delay, + that shouldn't be a problem as pitch changes slowly. */ + + if (nlp->Fs == 8000) { + /* Square latest input samples */ + + for(i=m-n; isq[i] = Sn[i]*Sn[i]; + } + } + else { + assert(nlp->Fs == 16000); + + /* re-sample at 8 kHz */ + + for(i=0; iSn16k[FDMDV_OS_TAPS_16K+i] = Sn[m-n+i]; + } + + m /= 2; n /= 2; + + float Sn8k[n]; + fdmdv_16_to_8(Sn8k, &nlp->Sn16k[FDMDV_OS_TAPS_16K], n); + + /* Square latest input samples */ + + for(i=m-n, j=0; isq[i] = Sn8k[j]*Sn8k[j]; + } + assert(j <= n); + } + //fprintf(stderr, "n: %d m: %d\n", n, m); + + PROFILE_SAMPLE(start); for(i=m-n; isq[i] - nlp->mem_x; @@ -309,6 +371,11 @@ float nlp( dump_Fw(Fw); #endif + /* todo: express everything in f0, as pitch in samples is dep on Fs */ + + int pmin = floor(SAMPLE_RATE*P_MIN_S); + int pmax = floor(SAMPLE_RATE*P_MAX_S); + /* find global peak */ gmax = 0.0; @@ -323,9 +390,9 @@ float nlp( PROFILE_SAMPLE_AND_LOG(peakpick, magsq, " peak pick"); #ifdef POST_PROCESS_MBE - best_f0 = post_process_mbe(Fw, pmin, pmax, gmax, Sw, W, prev_Wo); + best_f0 = post_process_mbe(Fw, pmin, pmax, gmax, Sw, W, prev_f0); #else -
best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, prev_Wo); + best_f0 = post_process_sub_multiples(Fw, pmin, pmax, gmax, gmax_bin, prev_f0); #endif PROFILE_SAMPLE_AND_LOG(shiftmem, peakpick, " post process"); @@ -335,14 +402,16 @@ float nlp( for(i=0; isq[i] = nlp->sq[i+n]; - /* return pitch and F0 estimate */ + /* return pitch period in samples and F0 estimate */ - *pitch = (float)SAMPLE_RATE/best_f0; + *pitch = (float)nlp->Fs/best_f0; PROFILE_SAMPLE_AND_LOG2(shiftmem, " shift mem"); PROFILE_SAMPLE_AND_LOG2(start, " nlp int"); + *prev_f0 = best_f0; + return(best_f0); } @@ -369,7 +438,7 @@ float nlp( float post_process_sub_multiples(COMP Fw[], int pmin, int pmax, float gmax, int gmax_bin, - float *prev_Wo) + float *prev_f0) { int min_bin, cmax_bin; int mult; @@ -383,7 +452,7 @@ float post_process_sub_multiples(COMP Fw[], mult = 2; min_bin = PE_FFT_SIZE*DEC/pmax; cmax_bin = gmax_bin; - prev_f0_bin = *prev_Wo*(4000.0/PI)*(PE_FFT_SIZE*DEC)/SAMPLE_RATE; + prev_f0_bin = *prev_f0*(PE_FFT_SIZE*DEC)/SAMPLE_RATE; while(gmax_bin/mult >= min_bin) { @@ -593,3 +662,41 @@ float test_candidate_mbe( } #endif + +/*---------------------------------------------------------------------------*\ + + FUNCTION....: fdmdv_16_to_8() + AUTHOR......: David Rowe + DATE CREATED: 9 May 2012 + + Changes the sample rate of a signal from 16 to 8 kHz. + + n is the number of samples at the 8 kHz rate, there are FDMDV_OS*n + samples at the 16 kHz rate. As above however a memory of + FDMDV_OS_TAPS_16K samples is reqd for in16k[] (see t16_8.c unit test as example). + + Low pass filter the 16 kHz signal at 4 kHz using the same filter as + the upsampler, then just output every FDMDV_OS-th filtered sample. + + Note: this function copied from fdmdv.c, included in nlp.c as a convenience + to avoid linking with another source file.
+ +\*---------------------------------------------------------------------------*/ + +static void fdmdv_16_to_8(float out8k[], float in16k[], int n) +{ + float acc; + int i,j,k; + + for(i=0, k=0; k. */ -#define N 80 /* frame size */ -#define M 320 /* pitch analysis window size */ -#define PITCH_MIN 20 -#define PITCH_MAX 160 -#define TNLP #include #include @@ -46,11 +41,11 @@ int frames; /*---------------------------------------------------------------------------*\ - switch_present() + switch_present() - Searches the command line arguments for a "switch". If the switch is - found, returns the command line argument where it ws found, else returns - NULL. + Searches the command line arguments for a "switch". If the switch is + found, returns the command line argument where it was found, else returns + NULL. \*---------------------------------------------------------------------------*/ @@ -74,30 +69,36 @@ int switch_present(sw,argc,argv) \*---------------------------------------------------------------------------*/ -int main(argc,argv) -int argc; -char *argv[]; +int main(int argc, char *argv[]) { + if (argc < 3) { + printf("\nusage: tnlp InputRawSpeechFile Outputf0PitchTextFile " + "[--dump DumpFile] [--Fs SampleRateHz]\n"); + exit(1); + } + + int Fs = 8000; + if (switch_present("--Fs",argc,argv)) { + Fs = atoi(argv[argc+1]); + } + + C2CONST c2const = c2const_create(Fs); + int n = c2const.n_samp; + int m = c2const.m_pitch; FILE *fin,*fout; - short buf[N]; - float Sn[M]; /* float input speech samples */ + short buf[n]; + float Sn[m]; /* float input speech samples */ kiss_fft_cfg fft_fwd_cfg; COMP Sw[FFT_ENC]; /* DFT of Sn[] */ - float w[M]; /* time domain hamming window */ + float w[m]; /* time domain hamming window */ COMP W[FFT_ENC]; /* DFT of w[] */ - float pitch; + float pitch_samples; int i; - float prev_Wo; + float f0, prev_f0; void *nlp_states; -#ifdef DUMP + #ifdef DUMP int dump; -#endif - - if (argc < 3) { - printf("\nusage: tnlp InputRawSpeechFile
OutputPitchTextFile " - "[--dump DumpFile]\n"); - exit(1); - } + #endif /* Input file */ @@ -113,46 +114,48 @@ char *argv[]; exit(1); } -#ifdef DUMP + #ifdef DUMP dump = switch_present("--dump",argc,argv); if (dump) dump_on(argv[dump+1]); -#else -/// TODO -/// #warning "Compile with -DDUMP if you expect to dump anything." -#endif + #else + /// TODO + /// #warning "Compile with -DDUMP if you expect to dump anything." + #endif - nlp_states = nlp_create(M); + for(i=0; i