some more fine patches from Danilo and the mcHF team

author drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>

Thu, 15 Sep 2016 03:34:16 +0000 (03:34 +0000)

committer drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>

Thu, 15 Sep 2016 03:34:16 +0000 (03:34 +0000)
author drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
Thu, 15 Sep 2016 03:34:16 +0000 (03:34 +0000)
committer drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
Thu, 15 Sep 2016 03:34:16 +0000 (03:34 +0000)
diff --git a/codec2-dev/src/fdmdv.c b/codec2-dev/src/fdmdv.c

index 3b3d9fad3ee483ca0c8eec560c5cfeb72248afaf..64c615ddb6965b1d37e7f77918d342c7225165ca 100644 (file)
--- a/codec2-dev/src/fdmdv.c
+++ b/codec2-dev/src/fdmdv.c
@@ -726,11 +726,35 @@ void lpf_peak_pick(float *foff, float *max, COMP pilot_baseband[],
      /* LPF cutoff 200Hz, so we can handle max +/- 200 Hz freq offset */
  
      for(i=0; i<NPILOTLPF-nin; i++)
-       pilot_lpf[i] = pilot_lpf[nin+i];
+        pilot_lpf[i] = pilot_lpf[nin+i];
      for(i=NPILOTLPF-nin, j=NPILOTBASEBAND-nin; i<NPILOTLPF; i++,j++) {
-       pilot_lpf[i].real = 0.0; pilot_lpf[i].imag = 0.0;
-       for(k=0; k<NPILOTCOEFF; k++)
-           pilot_lpf[i] = cadd(pilot_lpf[i], fcmult(pilot_coeff[k], pilot_baseband[j-NPILOTCOEFF+1+k]));
+        pilot_lpf[i].real = 0.0; pilot_lpf[i].imag = 0.0;
+
+        // STM32F4 hand optimized, this alone makes it go done from 1.6 to 1.17ms
+        // switching pilot_coeff to RAM (by removing const in pilot_coeff.h) would save
+        // another 0.11 ms at the expense of NPILOTCOEFF * 4 bytes == 120 bytes RAM
+
+        if (NPILOTCOEFF%5 == 0)
+        {
+            for(k=0; k<NPILOTCOEFF; k+=5)
+            {
+                COMP i0 = fcmult(pilot_coeff[k], pilot_baseband[j-NPILOTCOEFF+1+k]);
+                COMP i1 = fcmult(pilot_coeff[k+1], pilot_baseband[j-NPILOTCOEFF+1+k+1]);
+                COMP i2 = fcmult(pilot_coeff[k+2], pilot_baseband[j-NPILOTCOEFF+1+k+2]);
+                COMP i3 = fcmult(pilot_coeff[k+3], pilot_baseband[j-NPILOTCOEFF+1+k+3]);
+                COMP i4 = fcmult(pilot_coeff[k+4], pilot_baseband[j-NPILOTCOEFF+1+k+4]);
+
+                pilot_lpf[i] = cadd(cadd(cadd(cadd(cadd(pilot_lpf[i], i0),i1),i2),i3),i4);
+            }
+        }
+        else
+        {
+            for(k=0; k<NPILOTCOEFF; k++)
+            {
+                pilot_lpf[i] = cadd(pilot_lpf[i], fcmult(pilot_coeff[k], pilot_baseband[j-NPILOTCOEFF+1+k]));
+            }
+
+        }
      }
  
      /* We only need to do FFTs if we are out of sync.  Making them optional saves CPU in sync, which is when
@@ -1008,7 +1032,7 @@ void rxdec_filter(COMP rx_fdm_filter[], COMP rx_fdm[], COMP rxdec_lpf_mem[], int
  
    FUNCTION....: fir_filter2()
    AUTHOR......: Danilo Beuche
-  DATE CREATED: Auhust 2016
+  DATE CREATED: August 2016
  
    Ths version submitted by Danilo for the STM32F4 platform.  The idea
    is to avoid reading the same value from the STM32F4 "slow" flash
@@ -1017,28 +1041,83 @@ void rxdec_filter(COMP rx_fdm_filter[], COMP rx_fdm[], COMP rxdec_lpf_mem[], int
  
  \*---------------------------------------------------------------------------*/
  
-static void fir_filter2(float acc[2], float mem[], float coeff[], const unsigned int dec_rate) {
+static void fir_filter2(float acc[2], float mem[], const float coeff[], const unsigned int dec_rate) {
      acc[0] = 0.0;
      acc[1] = 0.0;
  
-    float c,m1,m2,a1,a2;
+    float c1,c2,c3,c4,c5,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,a1,a2;
      float* inpCmplx = &mem[0];
-    float* coeffPtr = &coeff[0];
+    const float* coeffPtr = &coeff[0];
  
      int m;
  
-    for(m=0; m<NFILTER; m+=dec_rate) {
-        c = *coeffPtr;
-        m1 = inpCmplx[0];
-        m2 = inpCmplx[1];
-        a1 = c * m1;
-        a2 = c * m2;
-        acc[0] += a1;
-        inpCmplx+= dec_rate*2;
-        acc[1] += a2;
-        coeffPtr+= dec_rate;
+    // this manual loop unrolling gives significant boost on STM32 machines
+    // reduction from avg 3.2ms to 2.4ms in tfdmv.c test
+    // 5 was the sweet spot, with 6 it took longer again
+    // and should not harm other, more powerful machines
+    // no significant difference in output, only rounding (which was to be expected)
+    // TODO: try to move coeffs to RAM and check if it makes a significant difference
+    if (NFILTER%(dec_rate*5) == 0) {
+        for(m=0; m<NFILTER; m+=dec_rate*5) {
+            c1 = *coeffPtr;
+
+            m1 = inpCmplx[0];
+            m2 = inpCmplx[1];
+
+            inpCmplx+= dec_rate*2;
+            coeffPtr+= dec_rate;
+
+            c2 = *coeffPtr;
+            m3 = inpCmplx[0];
+            m4 = inpCmplx[1];
+
+            inpCmplx+= dec_rate*2;
+            coeffPtr+= dec_rate;
+
+            c3 = *coeffPtr;
+            m5 = inpCmplx[0];
+            m6 = inpCmplx[1];
+
+            inpCmplx+= dec_rate*2;
+            coeffPtr+= dec_rate;
+
+            c4 = *coeffPtr;
+            m7 = inpCmplx[0];
+            m8 = inpCmplx[1];
+
+            inpCmplx+= dec_rate*2;
+            coeffPtr+= dec_rate;
+
+            c5 = *coeffPtr;
+            m9 = inpCmplx[0];
+            m10 = inpCmplx[1];
+
+            inpCmplx+= dec_rate*2;
+            coeffPtr+= dec_rate;
+
+            a1 = c1 * m1 + c2 * m3 + c3 * m5 + c4 * m7 + c5 * m9;
+            a2 = c1 * m2 + c2 * m4 + c3 * m6 + c4 * m8 + c5 * m10;
+            acc[0] += a1;
+            acc[1] += a2;
+        }
      }
+    else
+    {
+        for(m=0; m<NFILTER; m+=dec_rate) {
+            c1 = *coeffPtr;
+
+            m1 = inpCmplx[0];
+            m2 = inpCmplx[1];
+
+            inpCmplx+= dec_rate*2;
+            coeffPtr+= dec_rate;
  
+            a1 = c1 * m1;
+            a2 = c1 * m2;
+            acc[0] += a1;
+            acc[1] += a2;
+        }
+    }
      acc[0] *= dec_rate;
      acc[1] *= dec_rate;
  }
@@ -1077,11 +1156,16 @@ void down_convert_and_rx_filter(COMP rx_filt[NC+1][P+1], int Nc, COMP rx_fdm[],
  
      /* update memory of rx_fdm */
  
+#if 0
      for(i=0; i<NFILTER+M-nin; i++)
          rx_fdm_mem[i] = rx_fdm_mem[i+nin];
      for(i=NFILTER+M-nin, k=0; i<NFILTER+M; i++, k++)
          rx_fdm_mem[i] = rx_fdm[k];
-
+#else
+    // this gives only 40uS gain on STM32 but now that we have, we keep it
+    memmove(&rx_fdm_mem[0],&rx_fdm_mem[nin],(NFILTER+M-nin)*sizeof(COMP));
+    memcpy(&rx_fdm_mem[NFILTER+M-nin],&rx_fdm[0],nin*sizeof(COMP));
+#endif
      for(c=0; c<Nc+1; c++) {
  
          /*
@@ -1095,7 +1179,7 @@ void down_convert_and_rx_filter(COMP rx_filt[NC+1][P+1], int Nc, COMP rx_fdm[],
  
            This means winding phase(c) back from this point to ensure
            phase continuity.
-        */
+         */
  
          //PROFILE_SAMPLE(windback_start);
          windback_phase           = -freq_pol[c]*NFILTER;
@@ -1126,16 +1210,16 @@ void down_convert_and_rx_filter(COMP rx_filt[NC+1][P+1], int Nc, COMP rx_fdm[],
  
          N=M/P;
          for(i=0, k=0; i<nin; i+=N, k++) {
-           #ifdef ORIG
-           rx_filt[c][k].real = 0.0; rx_filt[c][k].imag = 0.0;
+#ifdef ORIG
+            rx_filt[c][k].real = 0.0; rx_filt[c][k].imag = 0.0;
  
-           for(m=0; m<NFILTER; m++)
-               rx_filt[c][k] = cadd(rx_filt[c][k], fcmult(gt_alpha5_root[m], rx_baseband[st+i+m]));
-           #else
+            for(m=0; m<NFILTER; m++)
+                rx_filt[c][k] = cadd(rx_filt[c][k], fcmult(gt_alpha5_root[m], rx_baseband[st+i+m]));
+#else
              // rx_filt[c][k].real = fir_filter(&rx_baseband[st+i].real, (float*)gt_alpha5_root, dec_rate);
              // rx_filt[c][k].imag = fir_filter(&rx_baseband[st+i].imag, (float*)gt_alpha5_root, dec_rate);
-           fir_filter2(&rx_filt[c][k].real,&rx_baseband[st+i].real, (float*)gt_alpha5_root, dec_rate);
-           #endif
+            fir_filter2(&rx_filt[c][k].real,&rx_baseband[st+i].real, gt_alpha5_root, dec_rate);
+#endif
          }
          //PROFILE_SAMPLE_AND_LOG2(filter_start, "        filter");
  
diff --git a/codec2-dev/src/pilot_coeff.h b/codec2-dev/src/pilot_coeff.h

index b284af93bde128ba1073100096e703f2835eab64..85af69695e87476699d592f496094d2e77466d95 100644 (file)
--- a/codec2-dev/src/pilot_coeff.h
+++ b/codec2-dev/src/pilot_coeff.h
@@ -1,6 +1,13 @@
  /* Generated by pilot_coeff_file() Octave function */
  
+// const removed since this provides gain
+// on the STM32F4 platform
+
+#ifdef CORTEX_M4
+/* const */ float pilot_coeff[]={
+#else
  const float pilot_coeff[]={
+#endif
    0.00223001,
    0.00301037,
    0.00471258,
diff --git a/codec2-dev/src/quantise.c b/codec2-dev/src/quantise.c

index 9c08ef5e3ee68710b232c7ec675e576aefc799df..ea0d5603d5686272e3d2e8e2a5d57cf769d8c215 100644 (file)
--- a/codec2-dev/src/quantise.c
+++ b/codec2-dev/src/quantise.c
@@ -846,7 +846,7 @@ void force_min_lsp_dist(float lsp[], int order)
  
  \*---------------------------------------------------------------------------*/
  
-void lpc_post_filter(kiss_fft_cfg fft_fwd_cfg, COMP Pw[], float ak[],
+void lpc_post_filter(kiss_fft_cfg fft_fwd_cfg, float Pw[], float ak[],
                       int order, int dump, float beta, float gamma, int bass_boost, float E)
  {
      int   i;
@@ -888,7 +888,7 @@ void lpc_post_filter(kiss_fft_cfg fft_fwd_cfg, COMP Pw[], float ak[],
  
      max_Rw = 0.0; min_Rw = 1E32;
      for(i=0; i<FFT_ENC/2; i++) {
-       Rw[i] = sqrtf(Ww[i].real * Pw[i].real);
+       Rw[i] = sqrtf(Ww[i].real * Pw[i]);
         if (Rw[i] > max_Rw)
             max_Rw = Rw[i];
         if (Rw[i] < min_Rw)
@@ -909,7 +909,7 @@ void lpc_post_filter(kiss_fft_cfg fft_fwd_cfg, COMP Pw[], float ak[],
  
      e_before = 1E-4;
      for(i=0; i<FFT_ENC/2; i++)
-       e_before += Pw[i].real;
+       e_before += Pw[i];
  
      /* apply post filter and measure energy  */
  
@@ -922,8 +922,8 @@ void lpc_post_filter(kiss_fft_cfg fft_fwd_cfg, COMP Pw[], float ak[],
      e_after = 1E-4;
      for(i=0; i<FFT_ENC/2; i++) {
          Pfw = powf(Rw[i], beta);
-        Pw[i].real *= Pfw * Pfw;
-        e_after += Pw[i].real;
+        Pw[i] *= Pfw * Pfw;
+        e_after += Pw[i];
      }
      gain = e_before/e_after;
  
@@ -931,14 +931,14 @@ void lpc_post_filter(kiss_fft_cfg fft_fwd_cfg, COMP Pw[], float ak[],
  
      gain *= E;
      for(i=0; i<FFT_ENC/2; i++) {
-       Pw[i].real *= gain;
+       Pw[i] *= gain;
      }
  
      if (bass_boost) {
          /* add 3dB to first 1 kHz to account for LP effect of PF */
  
          for(i=0; i<FFT_ENC/8; i++) {
-            Pw[i].real *= 1.4*1.4;
+            Pw[i] *= 1.4*1.4;
          }
      }
  
@@ -972,8 +972,6 @@ void aks_to_M2(
    COMP          Aw[]         /* output power spectrum */
  )
  {
-  COMP a[FFT_ENC];     /* input to FFT for power spectrum */
-  COMP Pw[FFT_ENC];    /* output power spectrum */
    int i,m;             /* loop variables */
    int am,bm;           /* limits of current band */
    float r;             /* no. rads/bin */
@@ -987,33 +985,54 @@ void aks_to_M2(
    r = TWO_PI/(FFT_ENC);
  
    /* Determine DFT of A(exp(jw)) --------------------------------------------*/
+  {
+      COMP a[FFT_ENC];  /* input to FFT for power spectrum */
  
-  for(i=0; i<FFT_ENC; i++) {
-    a[i].real = 0.0;
-    a[i].imag = 0.0;
-    Pw[i].real = 0.0;
-    Pw[i].imag = 0.0;
-  }
-
-  for(i=0; i<=order; i++)
-    a[i].real = ak[i];
-  kiss_fft(fft_fwd_cfg, (kiss_fft_cpx *)a, (kiss_fft_cpx *)Aw);
+      for(i=0; i<FFT_ENC; i++) {
+          a[i].real = 0.0;
+          a[i].imag = 0.0;
+      }
  
+      for(i=0; i<=order; i++)
+          a[i].real = ak[i];
+      kiss_fft(fft_fwd_cfg, (kiss_fft_cpx *)a, (kiss_fft_cpx *)Aw);
+  }
    PROFILE_SAMPLE_AND_LOG(tfft, tstart, "      fft");
  
    /* Determine power spectrum P(w) = E/(A(exp(jw))^2 ------------------------*/
  
+#ifndef ARM_MATH_CM4
+  float Pw[FFT_ENC];
    for(i=0; i<FFT_ENC/2; i++) {
-    Pw[i].real = 1.0/(Aw[i].real*Aw[i].real + Aw[i].imag*Aw[i].imag + 1E-6);
+    Pw[i] = 1.0/(Aw[i].real*Aw[i].real + Aw[i].imag*Aw[i].imag + 1E-6);
+  }
+#else
+  // this difference may seem strange, but the gcc for STM32F4 generates almost 5 times
+  // faster code with the two loops: 1120 ms -> 242 ms
+  // so please leave it as is or improve further
+  // since this code is called 4 times it results in almost 4ms gain (21ms -> 17ms per audio frame decode @ 1300 )
+
+  float Pw[FFT_ENC];
+  for(i=FFT_ENC/2; i<FFT_ENC; i++) {
+    Pw[i] = 0.0;
    }
  
+  for(i=0; i<FFT_ENC/2; i++)
+  {
+      Pw[i] = Aw[i].real * Aw[i].real + Aw[i].imag * Aw[i].imag  + 1E-6;
+  }
+  for(i=0; i<FFT_ENC/2; i++) {
+      Pw[i] = 1.0/(Pw[i]);
+  }
+#endif
+
    PROFILE_SAMPLE_AND_LOG(tpw, tfft, "      Pw");
  
    if (pf)
        lpc_post_filter(fft_fwd_cfg, Pw, ak, order, dump, beta, gamma, bass_boost, E);
    else {
        for(i=0; i<FFT_ENC; i++) {
-          Pw[i].real *= E;
+          Pw[i] *= E;
        }
    }
  
@@ -1037,7 +1056,7 @@ void aks_to_M2(
        Em = 0.0;
  
        for(i=am; i<bm; i++)
-          Em += Pw[i].real;
+          Em += Pw[i];
        Am = sqrtf(Em);
  
        signal += model->A[m]*model->A[m];
author	drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
	Thu, 15 Sep 2016 03:34:16 +0000 (03:34 +0000)
committer	drowe67 <drowe67@01035d8c-6547-0410-b346-abe4f91aad63>
	Thu, 15 Sep 2016 03:34:16 +0000 (03:34 +0000)
codec2-dev/src/fdmdv.c		patch \| blob \| history
codec2-dev/src/pilot_coeff.h		patch \| blob \| history
codec2-dev/src/quantise.c		patch \| blob \| history