/* LPF cutoff 200Hz, so we can handle max +/- 200 Hz freq offset */
for(i=0; i<NPILOTLPF-nin; i++)
- pilot_lpf[i] = pilot_lpf[nin+i];
+ pilot_lpf[i] = pilot_lpf[nin+i];
for(i=NPILOTLPF-nin, j=NPILOTBASEBAND-nin; i<NPILOTLPF; i++,j++) {
- pilot_lpf[i].real = 0.0; pilot_lpf[i].imag = 0.0;
- for(k=0; k<NPILOTCOEFF; k++)
- pilot_lpf[i] = cadd(pilot_lpf[i], fcmult(pilot_coeff[k], pilot_baseband[j-NPILOTCOEFF+1+k]));
+ pilot_lpf[i].real = 0.0; pilot_lpf[i].imag = 0.0;
+
+ // STM32F4 hand optimized, this alone makes it go done from 1.6 to 1.17ms
+ // switching pilot_coeff to RAM (by removing const in pilot_coeff.h) would save
+ // another 0.11 ms at the expense of NPILOTCOEFF * 4 bytes == 120 bytes RAM
+
+ if (NPILOTCOEFF%5 == 0)
+ {
+ for(k=0; k<NPILOTCOEFF; k+=5)
+ {
+ COMP i0 = fcmult(pilot_coeff[k], pilot_baseband[j-NPILOTCOEFF+1+k]);
+ COMP i1 = fcmult(pilot_coeff[k+1], pilot_baseband[j-NPILOTCOEFF+1+k+1]);
+ COMP i2 = fcmult(pilot_coeff[k+2], pilot_baseband[j-NPILOTCOEFF+1+k+2]);
+ COMP i3 = fcmult(pilot_coeff[k+3], pilot_baseband[j-NPILOTCOEFF+1+k+3]);
+ COMP i4 = fcmult(pilot_coeff[k+4], pilot_baseband[j-NPILOTCOEFF+1+k+4]);
+
+ pilot_lpf[i] = cadd(cadd(cadd(cadd(cadd(pilot_lpf[i], i0),i1),i2),i3),i4);
+ }
+ }
+ else
+ {
+ for(k=0; k<NPILOTCOEFF; k++)
+ {
+ pilot_lpf[i] = cadd(pilot_lpf[i], fcmult(pilot_coeff[k], pilot_baseband[j-NPILOTCOEFF+1+k]));
+ }
+
+ }
}
/* We only need to do FFTs if we are out of sync. Making them optional saves CPU in sync, which is when
FUNCTION....: fir_filter2()
AUTHOR......: Danilo Beuche
- DATE CREATED: Auhust 2016
+ DATE CREATED: August 2016
Ths version submitted by Danilo for the STM32F4 platform. The idea
is to avoid reading the same value from the STM32F4 "slow" flash
\*---------------------------------------------------------------------------*/
-static void fir_filter2(float acc[2], float mem[], float coeff[], const unsigned int dec_rate) {
+static void fir_filter2(float acc[2], float mem[], const float coeff[], const unsigned int dec_rate) {
acc[0] = 0.0;
acc[1] = 0.0;
- float c,m1,m2,a1,a2;
+ float c1,c2,c3,c4,c5,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,a1,a2;
float* inpCmplx = &mem[0];
- float* coeffPtr = &coeff[0];
+ const float* coeffPtr = &coeff[0];
int m;
- for(m=0; m<NFILTER; m+=dec_rate) {
- c = *coeffPtr;
- m1 = inpCmplx[0];
- m2 = inpCmplx[1];
- a1 = c * m1;
- a2 = c * m2;
- acc[0] += a1;
- inpCmplx+= dec_rate*2;
- acc[1] += a2;
- coeffPtr+= dec_rate;
+ // this manual loop unrolling gives significant boost on STM32 machines
+ // reduction from avg 3.2ms to 2.4ms in tfdmv.c test
+ // 5 was the sweet spot, with 6 it took longer again
+ // and should not harm other, more powerful machines
+ // no significant difference in output, only rounding (which was to be expected)
+ // TODO: try to move coeffs to RAM and check if it makes a significant difference
+ if (NFILTER%(dec_rate*5) == 0) {
+ for(m=0; m<NFILTER; m+=dec_rate*5) {
+ c1 = *coeffPtr;
+
+ m1 = inpCmplx[0];
+ m2 = inpCmplx[1];
+
+ inpCmplx+= dec_rate*2;
+ coeffPtr+= dec_rate;
+
+ c2 = *coeffPtr;
+ m3 = inpCmplx[0];
+ m4 = inpCmplx[1];
+
+ inpCmplx+= dec_rate*2;
+ coeffPtr+= dec_rate;
+
+ c3 = *coeffPtr;
+ m5 = inpCmplx[0];
+ m6 = inpCmplx[1];
+
+ inpCmplx+= dec_rate*2;
+ coeffPtr+= dec_rate;
+
+ c4 = *coeffPtr;
+ m7 = inpCmplx[0];
+ m8 = inpCmplx[1];
+
+ inpCmplx+= dec_rate*2;
+ coeffPtr+= dec_rate;
+
+ c5 = *coeffPtr;
+ m9 = inpCmplx[0];
+ m10 = inpCmplx[1];
+
+ inpCmplx+= dec_rate*2;
+ coeffPtr+= dec_rate;
+
+ a1 = c1 * m1 + c2 * m3 + c3 * m5 + c4 * m7 + c5 * m9;
+ a2 = c1 * m2 + c2 * m4 + c3 * m6 + c4 * m8 + c5 * m10;
+ acc[0] += a1;
+ acc[1] += a2;
+ }
}
+ else
+ {
+ for(m=0; m<NFILTER; m+=dec_rate) {
+ c1 = *coeffPtr;
+
+ m1 = inpCmplx[0];
+ m2 = inpCmplx[1];
+
+ inpCmplx+= dec_rate*2;
+ coeffPtr+= dec_rate;
+ a1 = c1 * m1;
+ a2 = c1 * m2;
+ acc[0] += a1;
+ acc[1] += a2;
+ }
+ }
acc[0] *= dec_rate;
acc[1] *= dec_rate;
}
/* update memory of rx_fdm */
+#if 0
for(i=0; i<NFILTER+M-nin; i++)
rx_fdm_mem[i] = rx_fdm_mem[i+nin];
for(i=NFILTER+M-nin, k=0; i<NFILTER+M; i++, k++)
rx_fdm_mem[i] = rx_fdm[k];
-
+#else
+ // this gives only 40uS gain on STM32 but now that we have, we keep it
+ memmove(&rx_fdm_mem[0],&rx_fdm_mem[nin],(NFILTER+M-nin)*sizeof(COMP));
+ memcpy(&rx_fdm_mem[NFILTER+M-nin],&rx_fdm[0],nin*sizeof(COMP));
+#endif
for(c=0; c<Nc+1; c++) {
/*
This means winding phase(c) back from this point to ensure
phase continuity.
- */
+ */
//PROFILE_SAMPLE(windback_start);
windback_phase = -freq_pol[c]*NFILTER;
N=M/P;
for(i=0, k=0; i<nin; i+=N, k++) {
- #ifdef ORIG
- rx_filt[c][k].real = 0.0; rx_filt[c][k].imag = 0.0;
+#ifdef ORIG
+ rx_filt[c][k].real = 0.0; rx_filt[c][k].imag = 0.0;
- for(m=0; m<NFILTER; m++)
- rx_filt[c][k] = cadd(rx_filt[c][k], fcmult(gt_alpha5_root[m], rx_baseband[st+i+m]));
- #else
+ for(m=0; m<NFILTER; m++)
+ rx_filt[c][k] = cadd(rx_filt[c][k], fcmult(gt_alpha5_root[m], rx_baseband[st+i+m]));
+#else
// rx_filt[c][k].real = fir_filter(&rx_baseband[st+i].real, (float*)gt_alpha5_root, dec_rate);
// rx_filt[c][k].imag = fir_filter(&rx_baseband[st+i].imag, (float*)gt_alpha5_root, dec_rate);
- fir_filter2(&rx_filt[c][k].real,&rx_baseband[st+i].real, (float*)gt_alpha5_root, dec_rate);
- #endif
+ fir_filter2(&rx_filt[c][k].real,&rx_baseband[st+i].real, gt_alpha5_root, dec_rate);
+#endif
}
//PROFILE_SAMPLE_AND_LOG2(filter_start, " filter");
\*---------------------------------------------------------------------------*/
-void lpc_post_filter(kiss_fft_cfg fft_fwd_cfg, COMP Pw[], float ak[],
+void lpc_post_filter(kiss_fft_cfg fft_fwd_cfg, float Pw[], float ak[],
int order, int dump, float beta, float gamma, int bass_boost, float E)
{
int i;
max_Rw = 0.0; min_Rw = 1E32;
for(i=0; i<FFT_ENC/2; i++) {
- Rw[i] = sqrtf(Ww[i].real * Pw[i].real);
+ Rw[i] = sqrtf(Ww[i].real * Pw[i]);
if (Rw[i] > max_Rw)
max_Rw = Rw[i];
if (Rw[i] < min_Rw)
e_before = 1E-4;
for(i=0; i<FFT_ENC/2; i++)
- e_before += Pw[i].real;
+ e_before += Pw[i];
/* apply post filter and measure energy */
e_after = 1E-4;
for(i=0; i<FFT_ENC/2; i++) {
Pfw = powf(Rw[i], beta);
- Pw[i].real *= Pfw * Pfw;
- e_after += Pw[i].real;
+ Pw[i] *= Pfw * Pfw;
+ e_after += Pw[i];
}
gain = e_before/e_after;
gain *= E;
for(i=0; i<FFT_ENC/2; i++) {
- Pw[i].real *= gain;
+ Pw[i] *= gain;
}
if (bass_boost) {
/* add 3dB to first 1 kHz to account for LP effect of PF */
for(i=0; i<FFT_ENC/8; i++) {
- Pw[i].real *= 1.4*1.4;
+ Pw[i] *= 1.4*1.4;
}
}
COMP Aw[] /* output power spectrum */
)
{
- COMP a[FFT_ENC]; /* input to FFT for power spectrum */
- COMP Pw[FFT_ENC]; /* output power spectrum */
int i,m; /* loop variables */
int am,bm; /* limits of current band */
float r; /* no. rads/bin */
r = TWO_PI/(FFT_ENC);
/* Determine DFT of A(exp(jw)) --------------------------------------------*/
+ {
+ COMP a[FFT_ENC]; /* input to FFT for power spectrum */
- for(i=0; i<FFT_ENC; i++) {
- a[i].real = 0.0;
- a[i].imag = 0.0;
- Pw[i].real = 0.0;
- Pw[i].imag = 0.0;
- }
-
- for(i=0; i<=order; i++)
- a[i].real = ak[i];
- kiss_fft(fft_fwd_cfg, (kiss_fft_cpx *)a, (kiss_fft_cpx *)Aw);
+ for(i=0; i<FFT_ENC; i++) {
+ a[i].real = 0.0;
+ a[i].imag = 0.0;
+ }
+ for(i=0; i<=order; i++)
+ a[i].real = ak[i];
+ kiss_fft(fft_fwd_cfg, (kiss_fft_cpx *)a, (kiss_fft_cpx *)Aw);
+ }
PROFILE_SAMPLE_AND_LOG(tfft, tstart, " fft");
/* Determine power spectrum P(w) = E/(A(exp(jw))^2 ------------------------*/
+#ifndef ARM_MATH_CM4
+ float Pw[FFT_ENC];
for(i=0; i<FFT_ENC/2; i++) {
- Pw[i].real = 1.0/(Aw[i].real*Aw[i].real + Aw[i].imag*Aw[i].imag + 1E-6);
+ Pw[i] = 1.0/(Aw[i].real*Aw[i].real + Aw[i].imag*Aw[i].imag + 1E-6);
+ }
+#else
+ // this difference may seem strange, but the gcc for STM32F4 generates almost 5 times
+ // faster code with the two loops: 1120 ms -> 242 ms
+ // so please leave it as is or improve further
+ // since this code is called 4 times it results in almost 4ms gain (21ms -> 17ms per audio frame decode @ 1300 )
+
+ float Pw[FFT_ENC];
+ for(i=FFT_ENC/2; i<FFT_ENC; i++) {
+ Pw[i] = 0.0;
}
+ for(i=0; i<FFT_ENC/2; i++)
+ {
+ Pw[i] = Aw[i].real * Aw[i].real + Aw[i].imag * Aw[i].imag + 1E-6;
+ }
+ for(i=0; i<FFT_ENC/2; i++) {
+ Pw[i] = 1.0/(Pw[i]);
+ }
+#endif
+
PROFILE_SAMPLE_AND_LOG(tpw, tfft, " Pw");
if (pf)
lpc_post_filter(fft_fwd_cfg, Pw, ak, order, dump, beta, gamma, bass_boost, E);
else {
for(i=0; i<FFT_ENC; i++) {
- Pw[i].real *= E;
+ Pw[i] *= E;
}
}
Em = 0.0;
for(i=am; i<bm; i++)
- Em += Pw[i].real;
+ Em += Pw[i];
Am = sqrtf(Em);
signal += model->A[m]*model->A[m];