An AltiVec-enhanced IMDCT for liba52 (liba52/imdct.c)

It's nearly bit-perfect, I have a couple of lsb changed in a 128 frames sample. I can't hear the differences :-) patch by Romain Dolbeau <dolbeau@irisa.fr> git-svn-id: svn://svn.mplayerhq.hu/mplayer/trunk@9002 b3059339-0415-0410-9bf9-f77b7e298cf2
2025-02-19 12:23:17 +01:00 · 2003-01-18 19:28:29 +00:00 · 2003-01-18 19:28:29 +00:00 · 418e699d7e
commit 418e699d7e
parent 5ef9bfab29
3 changed files with 365 additions and 13 deletions
--- a/liba52/Makefile
+++ b/liba52/Makefile
@ -7,6 +7,9 @@ SRCS    = crc.c resample.c bit_allocate.c bitstream.c downmix.c imdct.c imdct_ml
 OBJS	= $(SRCS:.c=.o)

 CFLAGS  = $(MLIB_INC) $(OPTFLAGS) 
+ifeq ($(TARGET_ALTIVEC),yes)
+    CFLAGS+= -faltivec
+endif

 .SUFFIXES: .c .o

--- a/liba52/imdct.c
+++ b/liba52/imdct.c
@ -23,6 +23,7 @@
 * SSE optimizations from Michael Niedermayer (michaelni@gmx.at)
 * 3DNOW optimizations from Nick Kurshev <nickols_k@mail.ru>
 *   michael did port them from libac3 (untested, perhaps totally broken)
+ * AltiVec optimizations from Romain Dolbeau (romain@dolbeau.org)
 */

 #include "config.h"
@ -114,24 +115,24 @@ static float __attribute__((aligned(16))) *sseW[7]=
 	{NULL /*sseW0*/,sseW1,sseW2,sseW3,sseW4,sseW5,sseW6};
 static float __attribute__((aligned(16))) sseWindow[512];
 #else
-static complex_t buf[128];
+static complex_t  __attribute__((aligned(16))) buf[128];
 #endif

 /* Twiddle factor LUT */
-static complex_t w_1[1];
-static complex_t w_2[2];
-static complex_t w_4[4];
-static complex_t w_8[8];
-static complex_t w_16[16];
-static complex_t w_32[32];
-static complex_t w_64[64];
-static complex_t * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};
+static complex_t __attribute__((aligned(16))) w_1[1];
+static complex_t __attribute__((aligned(16))) w_2[2];
+static complex_t __attribute__((aligned(16))) w_4[4];
+static complex_t __attribute__((aligned(16))) w_8[8];
+static complex_t __attribute__((aligned(16))) w_16[16];
+static complex_t __attribute__((aligned(16))) w_32[32];
+static complex_t __attribute__((aligned(16))) w_64[64];
+static complex_t __attribute__((aligned(16))) * w[7] = {w_1, w_2, w_4, w_8, w_16, w_32, w_64};

 /* Twiddle factors for IMDCT */
-static sample_t xcos1[128];
-static sample_t xsin1[128];
-static sample_t xcos2[64];
-static sample_t xsin2[64];
+static sample_t __attribute__((aligned(16))) xcos1[128];
+static sample_t __attribute__((aligned(16))) xsin1[128];
+static sample_t __attribute__((aligned(16))) xcos2[64];
+static sample_t __attribute__((aligned(16))) xsin2[64];

 /* Windowing function for Modified DCT - Thank you acroread */
 sample_t imdct_window[] = {
@ -384,6 +385,343 @@ imdct_do_512(sample_t data[],sample_t delay[], sample_t bias)
    }
 }

+#ifdef HAVE_ALTIVEC
+
+// used to build registers permutation vectors (vcprm)
+// the 's' are for words in the _s_econd vector
+#define WORD_0 0x00,0x01,0x02,0x03
+#define WORD_1 0x04,0x05,0x06,0x07
+#define WORD_2 0x08,0x09,0x0a,0x0b
+#define WORD_3 0x0c,0x0d,0x0e,0x0f
+#define WORD_s0 0x10,0x11,0x12,0x13
+#define WORD_s1 0x14,0x15,0x16,0x17
+#define WORD_s2 0x18,0x19,0x1a,0x1b
+#define WORD_s3 0x1c,0x1d,0x1e,0x1f
+
+#define vcprm(a,b,c,d) (const vector unsigned char)(WORD_ ## a, WORD_ ## b, WORD_ ## c, WORD_ ## d)
+
+// vcprmle is used to keep the same index as in the SSE version.
+// it's the same as vcprm, with the index inversed
+// ('le' is Little Endian)
+#define vcprmle(a,b,c,d) vcprm(d,c,b,a)
+
+// used to build inverse/identity vectors (vcii)
+// n is _n_egative, p is _p_ositive
+#define FLOAT_n -1.
+#define FLOAT_p 1.
+
+#define vcii(a,b,c,d) (const vector float)(FLOAT_ ## a, FLOAT_ ## b, FLOAT_ ## c, FLOAT_ ## d)
+
+void
+imdct_do_512_altivec(sample_t data[],sample_t delay[], sample_t bias)
+{
+  int i;
+  int k;
+  int p,q;
+  int m;
+  int two_m;
+  int two_m_plus_one;
+
+  sample_t tmp_b_i;
+  sample_t tmp_b_r;
+  sample_t tmp_a_i;
+  sample_t tmp_a_r;
+
+  sample_t *data_ptr;
+  sample_t *delay_ptr;
+  sample_t *window_ptr;
+	
+  /* 512 IMDCT with source and dest data in 'data' */
+	
+  /* Pre IFFT complex multiply plus IFFT cmplx conjugate & reordering*/
+  for( i=0; i < 128; i++) {
+    /* z[i] = (X[256-2*i-1] + j * X[2*i]) * (xcos1[i] + j * xsin1[i]) ; */ 
+    int j= bit_reverse_512[i];
+    buf[i].real =         (data[256-2*j-1] * xcos1[j])  -  (data[2*j]       * xsin1[j]);
+    buf[i].imag = -1.0 * ((data[2*j]       * xcos1[j])  +  (data[256-2*j-1] * xsin1[j]));
+  }
+  
+  /* 1. iteration */
+  for(i = 0; i < 128; i += 2) {
+#if 0
+    tmp_a_r = buf[i].real;
+    tmp_a_i = buf[i].imag;
+    tmp_b_r = buf[i+1].real;
+    tmp_b_i = buf[i+1].imag;
+    buf[i].real = tmp_a_r + tmp_b_r;
+    buf[i].imag =  tmp_a_i + tmp_b_i;
+    buf[i+1].real = tmp_a_r - tmp_b_r;
+    buf[i+1].imag =  tmp_a_i - tmp_b_i;
+#else
+    vector float temp, bufv; 
+
+    bufv = vec_ld(i << 3, (float*)buf);
+    temp = vec_perm(bufv, bufv, vcprm(2,3,0,1));
+    bufv = vec_madd(bufv, vcii(p,p,n,n), temp);
+    vec_st(bufv, i << 3, (float*)buf);
+#endif
+  }
+        
+  /* 2. iteration */
+  // Note w[1]={{1,0}, {0,-1}}
+  for(i = 0; i < 128; i += 4) {
+#if 0
+    tmp_a_r = buf[i].real;
+    tmp_a_i = buf[i].imag;
+    tmp_b_r = buf[i+2].real;
+    tmp_b_i = buf[i+2].imag;
+    buf[i].real = tmp_a_r + tmp_b_r;
+    buf[i].imag =  tmp_a_i + tmp_b_i;
+    buf[i+2].real = tmp_a_r - tmp_b_r;
+    buf[i+2].imag =  tmp_a_i - tmp_b_i;
+    tmp_a_r = buf[i+1].real;
+    tmp_a_i = buf[i+1].imag;
+    /* WARNING: im <-> re here ! */
+    tmp_b_r = buf[i+3].imag;
+    tmp_b_i = buf[i+3].real;
+    buf[i+1].real = tmp_a_r + tmp_b_r;
+    buf[i+1].imag =  tmp_a_i - tmp_b_i;
+    buf[i+3].real = tmp_a_r - tmp_b_r;
+    buf[i+3].imag =  tmp_a_i + tmp_b_i;
+#else
+    vector float buf01, buf23, temp1, temp2;
+	
+    buf01 = vec_ld((i + 0) << 3, (float*)buf);
+    buf23 = vec_ld((i + 2) << 3, (float*)buf);
+    buf23 = vec_perm(buf23,buf23,vcprm(0,1,3,2));
+
+    temp1 = vec_madd(buf23, vcii(p,p,p,n), buf01);
+    temp2 = vec_madd(buf23, vcii(n,n,n,p), buf01);
+
+    vec_st(temp1, (i + 0) << 3, (float*)buf);
+    vec_st(temp2, (i + 2) << 3, (float*)buf);
+#endif
+  }
+
+  /* 3. iteration */
+  for(i = 0; i < 128; i += 8) {
+#if 0
+    tmp_a_r = buf[i].real;
+    tmp_a_i = buf[i].imag;
+    tmp_b_r = buf[i+4].real;
+    tmp_b_i = buf[i+4].imag;
+    buf[i].real = tmp_a_r + tmp_b_r;
+    buf[i].imag =  tmp_a_i + tmp_b_i;
+    buf[i+4].real = tmp_a_r - tmp_b_r;
+    buf[i+4].imag =  tmp_a_i - tmp_b_i;
+    tmp_a_r = buf[1+i].real;
+    tmp_a_i = buf[1+i].imag;
+    tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
+    tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
+    buf[1+i].real = tmp_a_r + tmp_b_r;
+    buf[1+i].imag =  tmp_a_i + tmp_b_i;
+    buf[i+5].real = tmp_a_r - tmp_b_r;
+    buf[i+5].imag =  tmp_a_i - tmp_b_i;
+    tmp_a_r = buf[i+2].real;
+    tmp_a_i = buf[i+2].imag;
+    /* WARNING re <-> im & sign */
+    tmp_b_r = buf[i+6].imag;
+    tmp_b_i = - buf[i+6].real;
+    buf[i+2].real = tmp_a_r + tmp_b_r;
+    buf[i+2].imag =  tmp_a_i + tmp_b_i;
+    buf[i+6].real = tmp_a_r - tmp_b_r;
+    buf[i+6].imag =  tmp_a_i - tmp_b_i;
+    tmp_a_r = buf[i+3].real;
+    tmp_a_i = buf[i+3].imag;
+    tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
+    tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
+    buf[i+3].real = tmp_a_r + tmp_b_r;
+    buf[i+3].imag =  tmp_a_i + tmp_b_i;
+    buf[i+7].real = tmp_a_r - tmp_b_r;
+    buf[i+7].imag =  tmp_a_i - tmp_b_i;
+#else
+    vector float buf01, buf23, buf45, buf67;
+
+    buf01 = vec_ld((i + 0) << 3, (float*)buf);
+    buf23 = vec_ld((i + 2) << 3, (float*)buf);
+
+    tmp_b_r = (buf[i+5].real + buf[i+5].imag) * w[2][1].real;
+    tmp_b_i = (buf[i+5].imag - buf[i+5].real) * w[2][1].real;
+    buf[i+5].real = tmp_b_r;
+    buf[i+5].imag = tmp_b_i;
+    tmp_b_r = (buf[i+7].real - buf[i+7].imag) * w[2][3].imag;
+    tmp_b_i = (buf[i+7].imag + buf[i+7].real) * w[2][3].imag;
+    buf[i+7].real = tmp_b_r;
+    buf[i+7].imag = tmp_b_i;
+
+    buf23 = vec_ld((i + 2) << 3, (float*)buf);
+    buf45 = vec_ld((i + 4) << 3, (float*)buf);
+    buf67 = vec_ld((i + 6) << 3, (float*)buf);
+    buf67 = vec_perm(buf67, buf67, vcprm(1,0,2,3));
+	
+    vec_st(vec_add(buf01, buf45), (i + 0) << 3, (float*)buf);
+    vec_st(vec_madd(buf67, vcii(p,n,p,p), buf23), (i + 2) << 3, (float*)buf);
+    vec_st(vec_sub(buf01, buf45), (i + 4) << 3, (float*)buf);
+    vec_st(vec_nmsub(buf67, vcii(p,n,p,p), buf23), (i + 6) << 3, (float*)buf);
+#endif
+  }
+    
+  /* 4-7. iterations */
+  for (m=3; m < 7; m++) {
+    two_m = (1 << m);
+
+    two_m_plus_one = two_m<<1;
+
+    for(i = 0; i < 128; i += two_m_plus_one) {
+      for(k = 0; k < two_m; k+=2) {
+#if 0
+        int p = k + i;
+        int q = p + two_m;
+        tmp_a_r = buf[p].real;
+        tmp_a_i = buf[p].imag;
+        tmp_b_r =
+          buf[q].real * w[m][k].real -
+          buf[q].imag * w[m][k].imag;
+        tmp_b_i =
+          buf[q].imag * w[m][k].real +
+          buf[q].real * w[m][k].imag;
+        buf[p].real = tmp_a_r + tmp_b_r;
+        buf[p].imag =  tmp_a_i + tmp_b_i;
+        buf[q].real = tmp_a_r - tmp_b_r;
+        buf[q].imag =  tmp_a_i - tmp_b_i;
+
+        tmp_a_r = buf[(p + 1)].real;
+        tmp_a_i = buf[(p + 1)].imag;
+        tmp_b_r =
+          buf[(q + 1)].real * w[m][(k + 1)].real -
+          buf[(q + 1)].imag * w[m][(k + 1)].imag;
+        tmp_b_i =
+          buf[(q + 1)].imag * w[m][(k + 1)].real +
+          buf[(q + 1)].real * w[m][(k + 1)].imag;
+        buf[(p + 1)].real = tmp_a_r + tmp_b_r;
+        buf[(p + 1)].imag =  tmp_a_i + tmp_b_i;
+        buf[(q + 1)].real = tmp_a_r - tmp_b_r;
+        buf[(q + 1)].imag =  tmp_a_i - tmp_b_i;
+#else
+        int p = k + i;
+        int q = p + two_m;
+        vector float vecp, vecq, vecw, temp1, temp2, temp3, temp4;
+        const vector float vczero = (const vector float)(0);
+        // first compute buf[q] and buf[q+1]
+        vecq = vec_ld(q << 3, (float*)buf);
+        vecw = vec_ld(0, (float*)&(w[m][k]));
+        temp1 = vec_madd(vecq, vecw, vczero);
+        temp2 = vec_perm(vecq, vecq, vcprm(1,0,3,2));
+        temp2 = vec_madd(temp2, vecw, vczero);
+        temp3 = vec_perm(temp1, temp2, vcprm(0,s0,2,s2));
+        temp4 = vec_perm(temp1, temp2, vcprm(1,s1,3,s3));
+        vecq = vec_madd(temp4, vcii(n,p,n,p), temp3);
+        // then butterfly with buf[p] and buf[p+1]
+        vecp = vec_ld(p << 3, (float*)buf);
+        
+        temp1 = vec_add(vecp, vecq);
+        temp2 = vec_sub(vecp, vecq);
+                
+        vec_st(temp1, p << 3, (float*)buf);
+        vec_st(temp2, q << 3, (float*)buf);
+#endif
+      }
+    }
+  }
+
+  /* Post IFFT complex multiply  plus IFFT complex conjugate*/
+  for( i=0; i < 128; i+=4) {
+    /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
+#if 0
+    tmp_a_r =        buf[(i + 0)].real;
+    tmp_a_i = -1.0 * buf[(i + 0)].imag;
+    buf[(i + 0)].real =
+      (tmp_a_r * xcos1[(i + 0)])  -  (tmp_a_i  * xsin1[(i + 0)]);
+    buf[(i + 0)].imag =
+      (tmp_a_r * xsin1[(i + 0)])  +  (tmp_a_i  * xcos1[(i + 0)]);
+
+    tmp_a_r =        buf[(i + 1)].real;
+    tmp_a_i = -1.0 * buf[(i + 1)].imag;
+    buf[(i + 1)].real =
+      (tmp_a_r * xcos1[(i + 1)])  -  (tmp_a_i  * xsin1[(i + 1)]);
+    buf[(i + 1)].imag =
+      (tmp_a_r * xsin1[(i + 1)])  +  (tmp_a_i  * xcos1[(i + 1)]);
+
+    tmp_a_r =        buf[(i + 2)].real;
+    tmp_a_i = -1.0 * buf[(i + 2)].imag;
+    buf[(i + 2)].real =
+      (tmp_a_r * xcos1[(i + 2)])  -  (tmp_a_i  * xsin1[(i + 2)]);
+    buf[(i + 2)].imag =
+      (tmp_a_r * xsin1[(i + 2)])  +  (tmp_a_i  * xcos1[(i + 2)]);
+
+    tmp_a_r =        buf[(i + 3)].real;
+    tmp_a_i = -1.0 * buf[(i + 3)].imag;
+    buf[(i + 3)].real =
+      (tmp_a_r * xcos1[(i + 3)])  -  (tmp_a_i  * xsin1[(i + 3)]);
+    buf[(i + 3)].imag =
+      (tmp_a_r * xsin1[(i + 3)])  +  (tmp_a_i  * xcos1[(i + 3)]);
+#else
+    vector float bufv_0, bufv_2, cosv, sinv, temp1, temp2;
+    vector float temp0022, temp1133, tempCS01;
+    const vector float vczero = (const vector float)(0);
+
+    bufv_0 = vec_ld((i + 0) << 3, (float*)buf);
+    bufv_2 = vec_ld((i + 2) << 3, (float*)buf);
+
+    cosv = vec_ld(i << 2, xcos1);
+    sinv = vec_ld(i << 2, xsin1);
+
+    temp0022 = vec_perm(bufv_0, bufv_0, vcprm(0,0,2,2));
+    temp1133 = vec_perm(bufv_0, bufv_0, vcprm(1,1,3,3));
+    tempCS01 = vec_perm(cosv, sinv, vcprm(0,s0,1,s1));
+    temp1 = vec_madd(temp0022, tempCS01, vczero);
+    tempCS01 = vec_perm(cosv, sinv, vcprm(s0,0,s1,1));
+    temp2 = vec_madd(temp1133, tempCS01, vczero);
+    bufv_0 = vec_madd(temp2, vcii(p,n,p,n), temp1);
+    
+    vec_st(bufv_0, (i + 0) << 3, (float*)buf);
+
+    /* idem with bufv_2 and high-order cosv/sinv */
+
+    temp0022 = vec_perm(bufv_2, bufv_2, vcprm(0,0,2,2));
+    temp1133 = vec_perm(bufv_2, bufv_2, vcprm(1,1,3,3));
+    tempCS01 = vec_perm(cosv, sinv, vcprm(2,s2,3,s3));
+    temp1 = vec_madd(temp0022, tempCS01, vczero);
+    tempCS01 = vec_perm(cosv, sinv, vcprm(s2,2,s3,3));
+    temp2 = vec_madd(temp1133, tempCS01, vczero);
+    bufv_2 = vec_madd(temp2, vcii(p,n,p,n), temp1);
+
+    vec_st(bufv_2, (i + 2) << 3, (float*)buf);
+    
+#endif
+  }
+  
+  data_ptr = data;
+  delay_ptr = delay;
+  window_ptr = imdct_window;
+
+  /* Window and convert to real valued signal */
+  for(i=0; i< 64; i++) { 
+    *data_ptr++   = -buf[64+i].imag   * *window_ptr++ + *delay_ptr++ + bias; 
+    *data_ptr++   =  buf[64-i-1].real * *window_ptr++ + *delay_ptr++ + bias; 
+  }
+    
+  for(i=0; i< 64; i++) { 
+    *data_ptr++  = -buf[i].real       * *window_ptr++ + *delay_ptr++ + bias; 
+    *data_ptr++  =  buf[128-i-1].imag * *window_ptr++ + *delay_ptr++ + bias; 
+  }
+    
+  /* The trailing edge of the window goes into the delay line */
+  delay_ptr = delay;
+
+  for(i=0; i< 64; i++) { 
+    *delay_ptr++  = -buf[64+i].real   * *--window_ptr; 
+    *delay_ptr++  =  buf[64-i-1].imag * *--window_ptr; 
+  }
+    
+  for(i=0; i<64; i++) {
+    *delay_ptr++  =  buf[i].imag       * *--window_ptr; 
+    *delay_ptr++  = -buf[128-i-1].real * *--window_ptr; 
+  }
+}
+#endif
+
+
 // Stuff below this line is borrowed from libac3
 #include "srfftp.h"
 #ifdef ARCH_X86
@ -965,6 +1303,14 @@ void imdct_init (uint32_t mm_accel)
 	}
 	else
 #endif // arch_x86
+#ifdef HAVE_ALTIVEC
+        if (mm_accel & MM_ACCEL_PPC_ALTIVEC)
+	{
+	  fprintf(stderr, "Using AltiVec optimized IMDCT transform\n");
+          imdct_512 = imdct_do_512_altivec;
+	}
+        else
+#endif
 	fprintf (stderr, "No accelerated IMDCT transform found\n");
 	imdct_256 = imdct_do_256;
    }
--- a/liba52/mm_accel.h
+++ b/liba52/mm_accel.h
@ -34,6 +34,9 @@
 #define MM_ACCEL_X86_MMXEXT	0x20000000
 #define MM_ACCEL_X86_SSE	0x10000000

+/* PPC accelerations */
+#define MM_ACCEL_PPC_ALTIVEC	0x00010000
+
 uint32_t mm_accel (void);

 #endif /* MM_ACCEL_H */