for(i=0; i<LPC_ORD; i++) {
prev_lsps[i] = i*PI/(LPC_ORD+1);
}
- prev_e = 1;
+ e = prev_e = 1;
nlp_states = nlp_create();
sum_snr = 0;
while(fread(buf,sizeof(short),N,fin)) {
frames++;
-
+ printf("frame: %d", frames);
+
/* Read input speech */
for(i=0; i<M-N; i++)
/* Estimate pitch */
nlp(nlp_states,Sn,N,M,P_MIN,P_MAX,&pitch,Sw,&prev_Wo);
- prev_Wo = TWO_PI/pitch;
model.Wo = TWO_PI/pitch;
/* estimate model parameters */
/* determine voicing */
- snr = est_voicing_mbe(&model, Sw, W, Sw_, Ew);
+ snr = est_voicing_mbe(&model, Sw, W, Sw_, Ew, prev_Wo);
#ifdef DUMP
dump_Sw_(Sw_);
dump_Ew(Ew);
synth_one_frame(buf, &model, Sn_, Pn);
if (fout != NULL) fwrite(buf,sizeof(short),N,fout);
}
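+   /* update prev_Wo *after* voicing estimation, so est_voicing_mbe()
+      sees the previous frame's Wo */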
+ prev_Wo = TWO_PI/pitch;
}
fclose(fin);
/* Estimate pitch */
nlp(c2->nlp,c2->Sn,N,M,P_MIN,P_MAX,&pitch,Sw,&c2->prev_Wo);
- c2->prev_Wo = TWO_PI/pitch;
model->Wo = TWO_PI/pitch;
model->L = PI/model->Wo;
dft_speech(Sw, c2->Sn, c2->w);
two_stage_pitch_refinement(model, Sw);
estimate_amplitudes(model, Sw, c2->W);
- est_voicing_mbe(model, Sw, c2->W, Sw_, Ew);
+ est_voicing_mbe(model, Sw, c2->W, Sw_, Ew, c2->prev_Wo);
+
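+   /* remember this frame's Wo for the next frame's pitch-jump check */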
+ c2->prev_Wo = model->Wo;
}
COMP W[],
COMP Sw_[], /* DFT of all voiced synthesised signal */
/* useful for debugging/dump file */
- COMP Ew[] /* DFT of error */
-)
+ COMP  Ew[],    /* DFT of error                          */
+ float prev_Wo) /* Wo of previous frame                  */
{
int i,l,al,bl,m; /* loop variables */
COMP Am; /* amplitude sample for this band */
float error;  /* accumulated error between original and synthesised */
float Wo;
float sig, snr;
+ float elow, ehigh, eratio; /* low/high band energies and their ratio (dB) */
+ float dF0, sixty;          /* F0 change since last frame (Hz); 60 Hz in rads */
sig = 0.0;
for(l=1; l<=model->L/4; l++) {
model->voiced = 1;
else
model->voiced = 0;
-
+
+ /* post processing, helps clean up some voicing errors ---------------------*/
+
+ /*
+      Determine the ratio of low frequency to high frequency energy.
+      Voiced speech tends to be dominated by low frequency energy,
+      unvoiced by high frequency. This measure can be used to
+      determine if we have made any gross errors.
+ */
+
+  elow = ehigh = 1E-4; /* small floor avoids log10(0) if a band is empty */
+ for(l=1; l<=model->L/2; l++) {
+ elow += model->A[l]*model->A[l];
+ }
+ for(l=model->L/2; l<=model->L; l++) {
+ ehigh += model->A[l]*model->A[l];
+ }
+ eratio = 10.0*log10(elow/ehigh);
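+  /* eratio is in dB: positive when the low band dominates (voiced-like) */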
+ dF0 = 0.0;
+
+ /* Look for Type 1 errors, strongly V speech that has been
+ accidentally declared UV */
+
+ if (model->voiced == 0)
+ if (eratio > 10.0)
+ model->voiced = 1;
+
+ /* Look for Type 2 errors, strongly UV speech that has been
+ accidentally declared V */
+
+ if (model->voiced == 1) {
+ if (eratio < -10.0)
+ model->voiced = 0;
+
+      /* If the pitch is jumping about, it's likely this is UV */
+
+ dF0 = (model->Wo - prev_Wo)*FS/TWO_PI;
+ if (fabs(dF0) > 15.0)
+ model->voiced = 0;
+
+      /* A common source of Type 2 errors is the pitch estimator
+         giving a low (50Hz) estimate for UV speech, which gives a
+         good match with noise due to the close harmonic spacing.
+         These errors are much more common than speakers with 50Hz
+         pitch, so we use just a small eratio threshold. */
+
+ sixty = 60.0*TWO_PI/FS;
+ if ((eratio < -4.0) && (model->Wo <= sixty))
+ model->voiced = 0;
+ }
+ printf(" v: %d snr: %f eratio: %3.2f %f\n", model->voiced, snr, eratio, dF0);
+
return snr;
}
void dft_speech(COMP Sw[], float Sn[], float w[]);
void two_stage_pitch_refinement(MODEL *model, COMP Sw[]);
void estimate_amplitudes(MODEL *model, COMP Sw[], COMP W[]);
-float est_voicing_mbe(MODEL *model, COMP Sw[], COMP W[], COMP Sw_[],COMP Ew[]);
+float est_voicing_mbe(MODEL *model, COMP Sw[], COMP W[], COMP Sw_[],COMP Ew[], float prev_Wo);
void make_synthesis_window(float Pn[]);
void synthesise(float Sn_[], MODEL *model, float Pn[], int shift);