[x264-devel] Re: [PATCH] Altivec optimizations for quant4x4, quant4x4dc, quant8x8, sub8x8_dct8, sub16x16_dct8, pixel_sa8d_8x8

Guillaume POIRIER poirierg at gmail.com
Wed Sep 6 09:56:38 CEST 2006


Hello,

(quoting re-ordered for coherence)

On 9/5/06, Firas Al-Tahan <firearse at gmail.com> wrote:
> On 9/4/06, Guillaume POIRIER <gpoirier at mplayerhq.hu> wrote:

[..]

> > Please find in attachment the n+1 version of my Altivec patchset.
> > In today's menu:
> > I've benchmarked the different implementations of hadamard8x8 and
> > quant4x4 that I had on hand (as featured in rev.7 of my patchset, which
> > does not seem to have reached the ML...) to pick the fastest of each of
> > them.
> > Nothing too exciting as it's just a matter of squeezing 1-3 or 4 cycles
> > out of 200 or so... but while I was at it, I figured it wouldn't hurt to
> > measure the different implementations before arbitrarily discarding the
> > others.
> > It's interesting to note that on my G5, in the case of quant8x8, which
> > uses the macro defined for quant4x4, the implementations which use
> > shifts (158 cycles) or unroll the outer loop (160 cycles) are slower
> > than the implementation which uses plain and simple mults (157
> > cycles)... Well...
> >
> > As a free bonus, the attached patchset adds PMC support (i.e. hardware
> > performance counters) for G5 and G3/G4, taken from FFmpeg's code, but
> > it's a bit ugly as there's no START/STOP macro yet to make benchmarking
> > any easier.
> > This probably deserves to be put in a separate patch if it ever gets
> > merged.
> >
> > Last but not least, I've cleaned up my patchset so that it works better
> > with GCC3.3 (not yet complete though, quant.c needs some rework).

> Makes me wonder whether it was a good idea to sell the Dual G5 and get
> the Mac Pro! Keep up the great PPC/AltiVec work!

Thanks. A G5 probably won't beat a Woodcrest clock-for-clock, so if
you really are after every clock cycle, the Mac Pro is the way to go, all
the more so since it benefits from the extensive optimizations for x86
targets.

If you still have a G4 or G5 machine lying around, please test my
patchset and report back. I know they say "no news is good news", but
I wouldn't mind knowing whether there are areas that need work that I
don't know about.
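
Speaking of the PMC accessors quoted above: the patch only adds the raw
POWERPC_GET_PMCx macros, so here is a rough idea of what START/STOP-style
helpers layered on top of them could look like. This is only a sketch, the
PMC_BENCH_* names are made up, and it assumes the kernel has already
programmed PMC1 to count processor cycles:

/* hypothetical helpers built on the accessors added to ppccommon.h */
#define PMC_BENCH_DECLARE(id) \
    static POWERP_PMC_DATATYPE pmc_total_##id = 0, pmc_runs_##id = 0

#define PMC_BENCH_START(id) \
    POWERP_PMC_DATATYPE pmc_start_##id; \
    POWERPC_GET_PMC1(pmc_start_##id)

#define PMC_BENCH_STOP(id) do { \
    POWERP_PMC_DATATYPE pmc_stop_##id; \
    POWERPC_GET_PMC1(pmc_stop_##id); \
    pmc_total_##id += pmc_stop_##id - pmc_start_##id; \
    pmc_runs_##id++; \
} while (0)

You would then wrap the code under test with PMC_BENCH_START(foo); ...
PMC_BENCH_STOP(foo); and print pmc_total_foo / pmc_runs_foo at the end.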


Another day, another revision of my patchset. In today's menu:
improved all the quant routines to shave off yet another couple of
percent of CPU cycles, plus some more GCC3.3 fixes.
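
For anyone reviewing the new quant routines, the scalar code they vectorize
is the per-coefficient QUANT_ONE macro in common/quant.c; roughly (from
memory, not a verbatim copy), each coefficient is handled like this:

/* rough scalar equivalent of one quantized coefficient: scale by the
 * quant matrix entry, add the rounding offset f, shift by i_qbits,
 * and restore the sign */
static inline void quant_one_coef( int16_t *coef, int mf, int i_qbits, int f )
{
    if( *coef > 0 )
        *coef =   ( f + *coef * mf ) >> i_qbits;
    else
        *coef = - ( ( f - *coef * mf ) >> i_qbits );
}

The AltiVec versions do the same thing 16 coefficients at a time: take the
absolute value, multiply even/odd halves by the quant matrix, add f, shift
by i_qbits, then xor/add the sign mask back in.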

Get it while it's hot, and please test && review,

Guillaume
-- 
With DADVSI (http://en.wikipedia.org/wiki/DADVSI), France finally has
a lead on the USA in selling out individuals' rights to corporations!
Vive la France!
-------------- next part --------------
Index: encoder/macroblock.c
===================================================================
--- encoder/macroblock.c	(revision 558)
+++ encoder/macroblock.c	(working copy)
@@ -31,7 +31,7 @@
 /* def_quant4_mf only for probe_skip; actual encoding uses matrices from set.c */
 /* FIXME this seems to make better decisions with cqm=jvt, but could screw up
  * with general custom matrices. */
-static const int def_quant4_mf[6][4][4] =
+static const int def_quant4_mf[6][4][4] __attribute__((__aligned__(16))) =
 {
     { { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 },
       { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 } },
@@ -195,7 +195,7 @@
     int y = 4 * block_idx_y[idx];
     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
-    int16_t dct4x4[4][4];
+    DECLARE_ALIGNED( int16_t, dct4x4[4][4], 16 );
 
     if( h->mb.b_lossless )
     {
@@ -223,7 +223,7 @@
     int y = 8 * (idx>>1);
     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
-    int16_t dct8x8[8][8];
+    DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 );
 
     h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
 
@@ -242,7 +242,7 @@
     uint8_t  *p_src = h->mb.pic.p_fenc[0];
     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 
-    int16_t dct4x4[16+1][4][4];
+    DECLARE_ALIGNED( int16_t, dct4x4[16+1][4][4], 16 );
 
     int i;
 
@@ -305,8 +305,8 @@
         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
         int i_decimate_score = 0;
 
-        int16_t dct2x2[2][2];
-        int16_t dct4x4[4][4][4];
+        DECLARE_ALIGNED( int16_t, dct2x2[2][2] , 16 );
+        DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 );
 
         if( h->mb.b_lossless )
         {
@@ -494,7 +494,7 @@
         }
         else if( h->mb.b_transform_8x8 )
         {
-            int16_t dct8x8[4][8][8];
+            DECLARE_ALIGNED( int16_t, dct8x8[4][8][8], 16 );
             int nnz8x8[4] = {1,1,1,1};
             b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
             h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
@@ -539,7 +539,7 @@
         }
         else
         {
-            int16_t dct4x4[16][4][4];
+            DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
             int nnz8x8[4] = {1,1,1,1};
             h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
 
@@ -869,7 +869,7 @@
 
     if( h->mb.b_transform_8x8 )
     {
-        int16_t dct8x8[8][8];
+        DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 );
         h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
         quant_8x8( h, dct8x8, h->quant8_mf[CQM_8PY], i_qp, 0 );
         scan_zigzag_8x8full( h->dct.luma8x8[i8], dct8x8 );
@@ -888,7 +888,7 @@
     else
     {
         int i4;
-        int16_t dct4x4[4][4][4];
+        DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 );
         h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
         quant_4x4( h, dct4x4[0], h->quant4_mf[CQM_4PY], i_qp, 0 );
         quant_4x4( h, dct4x4[1], h->quant4_mf[CQM_4PY], i_qp, 0 );
@@ -919,7 +919,7 @@
 
     for( ch = 0; ch < 2; ch++ )
     {
-        int16_t dct4x4[4][4];
+        DECLARE_ALIGNED( int16_t, dct4x4[4][4], 16 );
         p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
         p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
 
Index: common/ppc/dct.h
===================================================================
--- common/ppc/dct.h	(revision 558)
+++ common/ppc/dct.h	(working copy)
@@ -5,6 +5,7 @@
  * $Id$
  *
  * Authors: Eric Petit <titer at m0k.org>
+ *          Guillaume Poirier <gpoirier at mplayerhq.hu>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -31,4 +32,9 @@
 void x264_sub16x16_dct_altivec( int16_t dct[16][4][4],
         uint8_t *pix1, uint8_t *pix2 );
 
+void x264_sub8x8_dct8_altivec( int16_t dct[8][8],
+        uint8_t *pix1, uint8_t *pix2 );
+void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8],
+        uint8_t *pix1, uint8_t *pix2 );
+
 #endif
Index: common/ppc/ppccommon.h
===================================================================
--- common/ppc/ppccommon.h	(revision 558)
+++ common/ppc/ppccommon.h	(working copy)
@@ -252,3 +252,54 @@
     dl     = vec_sub( temp1v, temp3v );   \
     p1    += i1;                          \
     p2    += i2
+
+/***********************************************************************
+ Accessors to read Performance Monitoring Counters (PMC)
+ **********************************************************************/
+#define POWERPC_NUM_PMC_ENABLED 8
+#define POWERPC_MODE_64BITS
+
+#ifndef POWERPC_MODE_64BITS
+#define POWERP_PMC_DATATYPE unsigned long
+#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 937" : "=r" (a))
+#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 938" : "=r" (a))
+#if (POWERPC_NUM_PMC_ENABLED > 2)
+#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 941" : "=r" (a))
+#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 942" : "=r" (a))
+#else
+#define POWERPC_GET_PMC3(a) do {} while (0)
+#define POWERPC_GET_PMC4(a) do {} while (0)
+#endif
+#if (POWERPC_NUM_PMC_ENABLED > 4)
+#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 929" : "=r" (a))
+#define POWERPC_GET_PMC6(a) asm volatile("mfspr %0, 930" : "=r" (a))
+#else
+#define POWERPC_GET_PMC5(a) do {} while (0)
+#define POWERPC_GET_PMC6(a) do {} while (0)
+#endif
+#else /* POWERPC_MODE_64BITS */
+#define POWERP_PMC_DATATYPE unsigned long long
+#define POWERPC_GET_PMC1(a) asm volatile("mfspr %0, 771" : "=r" (a))
+#define POWERPC_GET_PMC2(a) asm volatile("mfspr %0, 772" : "=r" (a))
+#if (POWERPC_NUM_PMC_ENABLED > 2)
+#define POWERPC_GET_PMC3(a) asm volatile("mfspr %0, 773" : "=r" (a))
+#define POWERPC_GET_PMC4(a) asm volatile("mfspr %0, 774" : "=r" (a))
+#else
+#define POWERPC_GET_PMC3(a) do {} while (0)
+#define POWERPC_GET_PMC4(a) do {} while (0)
+#endif
+#if (POWERPC_NUM_PMC_ENABLED > 4)
+#define POWERPC_GET_PMC5(a) asm volatile("mfspr %0, 775" : "=r" (a))
+#define POWERPC_GET_PMC6(a) asm volatile("mfspr %0, 776" : "=r" (a))
+#else
+#define POWERPC_GET_PMC5(a) do {} while (0)
+#define POWERPC_GET_PMC6(a) do {} while (0)
+#endif
+#if (POWERPC_NUM_PMC_ENABLED > 6)
+#define POWERPC_GET_PMC7(a) asm volatile("mfspr %0, 777" : "=r" (a))
+#define POWERPC_GET_PMC8(a) asm volatile("mfspr %0, 778" : "=r" (a))
+#else
+#define POWERPC_GET_PMC7(a) do {} while (0)
+#define POWERPC_GET_PMC8(a) do {} while (0)
+#endif
+#endif /* POWERPC_MODE_64BITS */
Index: common/ppc/pixel.c
===================================================================
--- common/ppc/pixel.c	(revision 558)
+++ common/ppc/pixel.c	(working copy)
@@ -5,6 +5,7 @@
  * $Id: pixel.c,v 1.1 2004/06/03 19:27:07 fenrir Exp $
  *
  * Authors: Eric Petit <titer at m0k.org>
+ *          Guillaume Poirier <gpoirier at mplayerhq.hu>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -28,6 +29,8 @@
 #include "common/common.h"
 #include "ppccommon.h"
 
+#include <stdio.h>
+
 /***********************************************************************
  * SAD routines
  **********************************************************************/
@@ -100,8 +103,7 @@
  * actually also calls vec_splat(0), but we already have a null vector.
  **********************************************************************/
 #define VEC_ABS(a) \
-    pix1v = vec_sub( zero_s16v, a ); \
-    a     = vec_max( a, pix1v ); \
+    a     = vec_max( a, vec_sub( zero_s16v, a ) );
 
 /***********************************************************************
  * VEC_ADD_ABS
@@ -1604,7 +1606,133 @@
     return sum;
 } 
 
+/**********************************************************************
+ * SA8D routines: sum of 8x8 Hadamard transformed differences
+ **********************************************************************/
+/* SA8D_1D unrolled by 8 in Altivec */
+#define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v ) \
+{ \
+    /* int    a0  =        SRC(0) + SRC(4) */\
+    vec_s16_t a0v = vec_add(sa8d0v, sa8d4v); \
+    /* int    a4  =        SRC(0) - SRC(4) */\
+    vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v); \
+    /* int    a1  =        SRC(1) + SRC(5) */\
+    vec_s16_t a1v = vec_add(sa8d1v, sa8d5v); \
+    /* int    a5  =        SRC(1) - SRC(5) */\
+    vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v); \
+    /* int    a2  =        SRC(2) + SRC(6) */\
+    vec_s16_t a2v = vec_add(sa8d2v, sa8d6v); \
+    /* int    a6  =        SRC(2) - SRC(6) */\
+    vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v); \
+    /* int    a3  =        SRC(3) + SRC(7) */\
+    vec_s16_t a3v = vec_add(sa8d3v, sa8d7v); \
+    /* int    a7  =        SRC(3) - SRC(7) */\
+    vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v); \
+\
+    /* int    b0  =         a0 + a2  */\
+    vec_s16_t b0v = vec_add(a0v, a2v); \
+    /* int    b2  =         a0 - a2; */\
+    vec_s16_t b2v = vec_sub(a0v, a2v); \
+    /* int    b1  =         a1 + a3; */\
+    vec_s16_t b1v = vec_add(a1v, a3v); \
+    /* int    b3  =         a1 - a3; */\
+    vec_s16_t b3v = vec_sub(a1v, a3v); \
+    /* int    b4  =         a4 + a6; */\
+    vec_s16_t b4v = vec_add(a4v, a6v); \
+    /* int    b6  =         a4 - a6; */\
+    vec_s16_t b6v = vec_sub(a4v, a6v); \
+    /* int    b5  =         a5 + a7; */\
+    vec_s16_t b5v = vec_add(a5v, a7v); \
+    /* int    b7  =         a5 - a7; */\
+    vec_s16_t b7v = vec_sub(a5v, a7v); \
+\
+    /* DST(0,        b0 + b1) */\
+    sa8d0v = vec_add(b0v, b1v); \
+    /* DST(1,        b0 - b1) */\
+    sa8d1v = vec_sub(b0v, b1v); \
+    /* DST(2,        b2 + b3) */\
+    sa8d2v = vec_add(b2v, b3v); \
+    /* DST(3,        b2 - b3) */\
+    sa8d3v = vec_sub(b2v, b3v); \
+    /* DST(4,        b4 + b5) */\
+    sa8d4v = vec_add(b4v, b5v); \
+    /* DST(5,        b4 - b5) */\
+    sa8d5v = vec_sub(b4v, b5v); \
+    /* DST(6,        b6 + b7) */\
+    sa8d6v = vec_add(b6v, b7v); \
+    /* DST(7,        b6 - b7) */\
+    sa8d7v = vec_sub(b6v, b7v); \
+}
 
+static int pixel_sa8d_8x8_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2, int i_pix2,
+                                  int i_width, int i_height )
+{
+    int32_t i_satd=0;
+    DECLARE_ALIGNED(int16_t, tab[8][8], 16);
+    PREP_DIFF;
+    
+    vec_s16_t diff0v, diff1v, diff2v, diff3v,
+        diff4v, diff5v, diff6v, diff7v;
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+    
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+
+    vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v,
+        sa8d4v, sa8d5v, sa8d6v, sa8d7v;
+
+    SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
+                    diff4v, diff5v, diff6v, diff7v);
+
+    VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
+                    diff4v, diff5v, diff6v, diff7v,
+                    sa8d0v, sa8d1v, sa8d2v, sa8d3v,
+                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );
+
+    SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
+                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );
+
+    /* accumulation of the absolute value of all elements of the resulting block */
+    vec_s16_t abs0v = VEC_ABS(sa8d0v);
+    vec_s16_t abs1v = VEC_ABS(sa8d1v);
+    vec_s16_t sum01v = vec_add(abs0v, abs1v);
+
+    vec_s16_t abs2v = VEC_ABS(sa8d2v);
+    vec_s16_t abs3v = VEC_ABS(sa8d3v);
+    vec_s16_t sum23v = vec_add(abs2v, abs3v);
+
+    vec_s16_t abs4v = VEC_ABS(sa8d4v);
+    vec_s16_t abs5v = VEC_ABS(sa8d5v);
+    vec_s16_t sum45v = vec_add(abs4v, abs5v);
+
+    vec_s16_t abs6v = VEC_ABS(sa8d6v);
+    vec_s16_t abs7v = VEC_ABS(sa8d7v);
+    vec_s16_t sum67v = vec_add(abs6v, abs7v);
+
+    vec_s16_t sum0123v = vec_add(sum01v, sum23v);
+    vec_s16_t sum4567v = vec_add(sum45v, sum67v);
+
+    vec_s32_t sumblocv;
+
+    sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov );
+    sumblocv = vec_sum4s(sum4567v, sumblocv );
+
+    sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov );
+
+    sumblocv = vec_splat(sumblocv, 3);
+
+    vec_ste(sumblocv, 0, &i_satd);
+
+    return (i_satd+2)>>2;
+}
+
+
 /****************************************************************************
  * x264_pixel_init:
  ****************************************************************************/
@@ -1634,4 +1762,6 @@
     pixf->satd[PIXEL_4x4]   = pixel_satd_4x4_altivec;
     
     pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_altivec;
+
+    pixf->sa8d[PIXEL_8x8]   = pixel_sa8d_8x8_altivec;
 }
Index: common/ppc/quant.c
===================================================================
--- common/ppc/quant.c	(revision 0)
+++ common/ppc/quant.c	(revision 0)
@@ -0,0 +1,187 @@
+/*****************************************************************************
+* quant.c: h264 encoder
+*****************************************************************************
+* Authors: Guillaume Poirier <poirierg at gmail.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+*****************************************************************************/
+
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "common/common.h"
+#include "ppccommon.h"
+#include "quant.h"            
+
+// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
+#define QUANT_16_U( dct0, dct1, quant_mf0, quant_mf1, quant_mf2, quant_mf3 )                              \
+temp1v = vec_ld((dct0), *dct);                                      \
+temp2v = vec_ld((dct1), *dct);                                      \
+mfvA = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((quant_mf0), *quant_mf), (vec_u32_t)vec_ld((quant_mf1), *quant_mf));    \
+mfvB = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((quant_mf2), *quant_mf), (vec_u32_t)vec_ld((quant_mf3), *quant_mf));    \
+mskA = vec_cmplt(temp1v, zerov);                                    \
+mskB = vec_cmplt(temp2v, zerov);                                    \
+coefvA = (vec_u16_t)vec_max(vec_sub(zerov, temp1v), temp1v);       \
+coefvB = (vec_u16_t)vec_max(vec_sub(zerov, temp2v), temp2v);       \
+multEvenvA = vec_mule(coefvA, mfvA);                                \
+multOddvA = vec_mulo(coefvA, mfvA);                                 \
+multEvenvB = vec_mule(coefvB, mfvB);                                \
+multOddvB = vec_mulo(coefvB, mfvB);                                 \
+multEvenvA = vec_add(multEvenvA, fV);                               \
+multOddvA = vec_add(multOddvA, fV);                                 \
+multEvenvB = vec_add(multEvenvB, fV);                               \
+multOddvB = vec_add(multOddvB, fV);                                 \
+multEvenvA = vec_sr(multEvenvA, i_qbitsv);                          \
+multOddvA = vec_sr(multOddvA, i_qbitsv);                            \
+multEvenvB = vec_sr(multEvenvB, i_qbitsv);                          \
+multOddvB = vec_sr(multOddvB, i_qbitsv);                            \
+temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
+temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
+temp1v = vec_xor(temp1v, mskA);                                     \
+temp2v = vec_xor(temp2v, mskB);                                     \
+temp1v = vec_add(temp1v, vec_and(mskA, one));                       \
+vec_st(temp1v, (dct0), dct);                                        \
+temp2v = vec_add(temp2v, vec_and(mskB, one));                       \
+vec_st(temp2v, (dct1), dct);
+                
+void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f ) {
+    vector bool short mskA;
+    vec_s32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_u32_t multEvenvA, multOddvA;
+    vec_u32_t mfvA;
+    vec_s16_t zerov, one;
+    vec_s32_t fV;
+
+    vector bool short mskB;
+    vec_u16_t coefvB;
+    vec_u32_t multEvenvB, multOddvB;
+    vec_u32_t mfvB;
+
+    vec_s16_t temp1v, temp2v;
+
+    vect_sint_u qbits_u;
+    qbits_u.s[0]=i_qbits;
+    i_qbitsv = vec_splat(qbits_u.v, 0);
+
+    vect_sint_u f_u;
+    f_u.s[0]=f;
+
+    fV = vec_splat(f_u.v, 0);
+
+    zerov = vec_splat_s16(0);
+    one = vec_splat_s16(1);
+    
+    QUANT_16_U( 0, 16, 0, 16, 32, 48 );
+}
+
+// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
+#define QUANT_16_U_DC( dct0, dct1 )                                       \
+temp1v = vec_ld((dct0), *dct);                                         \
+temp2v = vec_ld((dct1), *dct);                                         \
+mskA = vec_cmplt(temp1v, zerov);                                    \
+mskB = vec_cmplt(temp2v, zerov);                                    \
+coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v);       \
+coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v);       \
+multEvenvA = vec_mule(coefvA, mfv);                                 \
+multOddvA = vec_mulo(coefvA, mfv);                                  \
+multEvenvB = vec_mule(coefvB, mfv);                                 \
+multOddvB = vec_mulo(coefvB, mfv);                                  \
+multEvenvA = vec_add(multEvenvA, fV);                               \
+multOddvA = vec_add(multOddvA, fV);                                 \
+multEvenvB = vec_add(multEvenvB, fV);                               \
+multOddvB = vec_add(multOddvB, fV);                                 \
+multEvenvA = vec_sr(multEvenvA, i_qbitsv);                          \
+multOddvA = vec_sr(multOddvA, i_qbitsv);                            \
+multEvenvB = vec_sr(multEvenvB, i_qbitsv);                          \
+multOddvB = vec_sr(multOddvB, i_qbitsv);                            \
+temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
+temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
+temp1v = vec_xor(temp1v, mskA);                                     \
+temp2v = vec_xor(temp2v, mskB);                                     \
+temp1v = vec_add(temp1v, vec_and(mskA, one));                       \
+vec_st(temp1v, (dct0), dct);                                           \
+temp2v = vec_add(temp2v, vec_and(mskB, one));                       \
+vec_st(temp2v, (dct1), dct);
+
+
+void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f ) {
+    vector bool short mskA;
+    vec_s32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_u32_t multEvenvA, multOddvA;
+    vec_s16_t zerov, one;
+    vec_s32_t fV;
+
+    vector bool short mskB;
+    vec_u16_t coefvB;
+    vec_u32_t multEvenvB, multOddvB;
+
+    vec_s16_t temp1v, temp2v;
+
+    vec_u32_t mfv;
+    vect_int_u mf_u;
+    mf_u.s[0]=i_quant_mf;
+    mfv = vec_splat( mf_u.v, 0 );
+    mfv = vec_packs( mfv, mfv);
+
+    vect_sint_u qbits_u;
+    qbits_u.s[0]=i_qbits;
+    i_qbitsv = vec_splat(qbits_u.v, 0);
+
+    vect_sint_u f_u;
+    f_u.s[0]=f;
+    fV = vec_splat(f_u.v, 0);
+
+    zerov = vec_splat_s16(0);
+    one = vec_splat_s16(1);
+
+    QUANT_16_U_DC( 0, 16 );
+}
+
+
+void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f ) {
+    vector bool short mskA;
+    vec_s32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_s32_t multEvenvA, multOddvA, mfvA;
+    vec_s16_t zerov, one;
+    vec_s32_t fV;
+    
+    vector bool short mskB;
+    vec_u16_t coefvB;
+    vec_u32_t multEvenvB, multOddvB, mfvB;
+    
+    vec_s16_t temp1v, temp2v;
+    
+    vect_int_u qbits_u;
+    qbits_u.s[0]=i_qbits;
+    i_qbitsv = vec_splat(qbits_u.v, 0);
+
+    vect_sint_u f_u;
+    f_u.s[0]=f;
+    fV = vec_splat(f_u.v, 0);
+
+    zerov = vec_splat_s16(0);
+    one = vec_splat_s16(1);
+    
+    int i;
+
+    for ( i=0; i<4; i++ ) {
+      QUANT_16_U( i*2*16, i*2*16+16, i*4*16, i*4*16+16, i*4*16+32, i*4*16+48 );
+    }
+}
+
Index: common/ppc/dct.c
===================================================================
--- common/ppc/dct.c	(revision 558)
+++ common/ppc/dct.c	(working copy)
@@ -5,6 +5,7 @@
  * $Id$
  *
  * Authors: Eric Petit <titer at m0k.org>
+ *          Guillaume Poirier <gpoirier at mplayerhq.hu>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -250,3 +251,118 @@
     VEC_STORE8_L( temp6v, dct[15][2] );
     VEC_STORE8_L( temp7v, dct[15][3] );
 }
+
+/****************************************************************************
+* 8x8 transform:
+****************************************************************************/
+
+/* DCT8_1D unrolled by 8 in Altivec */
+#define DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v, dct4v, dct5v, dct6v, dct7v ) \
+{ \
+    /* int s07 = SRC(0) + SRC(7);         */ \
+    vec_s16_t s07v = vec_add( dct0v, dct7v); \
+    /* int s16 = SRC(1) + SRC(6);         */ \
+    vec_s16_t s16v = vec_add( dct1v, dct6v); \
+    /* int s25 = SRC(2) + SRC(5);         */ \
+    vec_s16_t s25v = vec_add( dct2v, dct5v); \
+    /* int s34 = SRC(3) + SRC(4);         */ \
+    vec_s16_t s34v = vec_add( dct3v, dct4v); \
+\
+    /* int a0 = s07 + s34;                */ \
+    vec_s16_t a0v = vec_add(s07v, s34v);     \
+    /* int a1 = s16 + s25;                */ \
+    vec_s16_t a1v = vec_add(s16v, s25v);     \
+    /* int a2 = s07 - s34;                */ \
+    vec_s16_t a2v = vec_sub(s07v, s34v);     \
+    /* int a3 = s16 - s25;                */ \
+    vec_s16_t a3v = vec_sub(s16v, s25v);     \
+\
+    /* int d07 = SRC(0) - SRC(7);         */ \
+    vec_s16_t d07v = vec_sub( dct0v, dct7v); \
+    /* int d16 = SRC(1) - SRC(6);         */ \
+    vec_s16_t d16v = vec_sub( dct1v, dct6v); \
+    /* int d25 = SRC(2) - SRC(5);         */ \
+    vec_s16_t d25v = vec_sub( dct2v, dct5v); \
+    /* int d34 = SRC(3) - SRC(4);         */ \
+    vec_s16_t d34v = vec_sub( dct3v, dct4v); \
+\
+    /* int a4 = d16 + d25 + (d07 + (d07>>1)); */ \
+    vec_s16_t a4v = vec_add( vec_add(d16v, d25v), vec_add(d07v, vec_sra(d07v, onev)) );\
+    /* int a5 = d07 - d34 - (d25 + (d25>>1)); */ \
+    vec_s16_t a5v = vec_sub( vec_sub(d07v, d34v), vec_add(d25v, vec_sra(d25v, onev)) );\
+    /* int a6 = d07 + d34 - (d16 + (d16>>1)); */ \
+    vec_s16_t a6v = vec_sub( vec_add(d07v, d34v), vec_add(d16v, vec_sra(d16v, onev)) );\
+    /* int a7 = d16 - d25 + (d34 + (d34>>1)); */ \
+    vec_s16_t a7v = vec_add( vec_sub(d16v, d25v), vec_add(d34v, vec_sra(d34v, onev)) );\
+\
+    /* DST(0) =  a0 + a1;                    */ \
+    dct0v = vec_add( a0v, a1v );                \
+    /* DST(1) =  a4 + (a7>>2);               */ \
+    dct1v = vec_add( a4v, vec_sra(a7v, twov) ); \
+    /* DST(2) =  a2 + (a3>>1);               */ \
+    dct2v = vec_add( a2v, vec_sra(a3v, onev) ); \
+    /* DST(3) =  a5 + (a6>>2);               */ \
+    dct3v = vec_add( a5v, vec_sra(a6v, twov) ); \
+    /* DST(4) =  a0 - a1;                    */ \
+    dct4v = vec_sub( a0v, a1v );                \
+    /* DST(5) =  a6 - (a5>>2);               */ \
+    dct5v = vec_sub( a6v, vec_sra(a5v, twov) ); \
+    /* DST(6) = (a2>>1) - a3 ;               */ \
+    dct6v = vec_sub( vec_sra(a2v, onev), a3v ); \
+    /* DST(7) = (a4>>2) - a7 ;               */ \
+    dct7v = vec_sub( vec_sra(a4v, twov), a7v ); \
+}
+
+
+void x264_sub8x8_dct8_altivec( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+{
+    vec_u16_t onev = vec_splat_u16(1);
+    vec_u16_t twov = vec_add( onev, onev );
+
+    PREP_DIFF;
+
+    vec_s16_t dct0v, dct1v, dct2v, dct3v,
+              dct4v, dct5v, dct6v, dct7v;
+
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
+
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );
+
+    DCT8_1D_ALTIVEC( dct0v, dct1v, dct2v, dct3v,
+                     dct4v, dct5v, dct6v, dct7v );
+
+    vec_s16_t dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
+        dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v;
+
+    VEC_TRANSPOSE_8(dct0v, dct1v, dct2v, dct3v,
+                    dct4v, dct5v, dct6v, dct7v,
+                    dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
+                    dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );
+
+    DCT8_1D_ALTIVEC( dct_tr0v, dct_tr1v, dct_tr2v, dct_tr3v,
+                     dct_tr4v, dct_tr5v, dct_tr6v, dct_tr7v );
+
+    vec_st( dct_tr0v,  0,  (signed short *)dct );
+    vec_st( dct_tr1v, 16,  (signed short *)dct );
+    vec_st( dct_tr2v, 32,  (signed short *)dct );
+    vec_st( dct_tr3v, 48,  (signed short *)dct );
+    
+    vec_st( dct_tr4v, 64,  (signed short *)dct );
+    vec_st( dct_tr5v, 80,  (signed short *)dct );
+    vec_st( dct_tr6v, 96,  (signed short *)dct );
+    vec_st( dct_tr7v, 112, (signed short *)dct );
+}
+
+void x264_sub16x16_dct8_altivec( int16_t dct[4][8][8], uint8_t *pix1, uint8_t *pix2 )
+{
+    x264_sub8x8_dct8_altivec( dct[0], &pix1[0],               &pix2[0] );
+    x264_sub8x8_dct8_altivec( dct[1], &pix1[8],               &pix2[8] );
+    x264_sub8x8_dct8_altivec( dct[2], &pix1[8*FENC_STRIDE+0], &pix2[8*FDEC_STRIDE+0] );
+    x264_sub8x8_dct8_altivec( dct[3], &pix1[8*FENC_STRIDE+8], &pix2[8*FDEC_STRIDE+8] );
+}
Index: common/ppc/quant.h
===================================================================
--- common/ppc/quant.h	(revision 0)
+++ common/ppc/quant.h	(revision 0)
@@ -0,0 +1,39 @@
+/*****************************************************************************
+* quant.h: h264 encoder library
+*****************************************************************************
+* Authors: Guillaume Poirier <poirierg at gmail.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+*****************************************************************************/
+
+#ifndef _PPC_QUANT_H
+#define _PPC_QUANT_H 1
+
+typedef union {
+  int s[4];
+  vector unsigned int v;
+} vect_int_u;
+
+typedef union {
+  signed int s[4];
+  vector signed int v;
+} vect_sint_u;
+
+
+void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f );
+void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f );
+
+void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f );
+#endif
Index: common/quant.c
===================================================================
--- common/quant.c	(revision 558)
+++ common/quant.c	(working copy)
@@ -25,6 +25,9 @@
 #ifdef HAVE_MMXEXT
 #include "i386/quant.h"
 #endif
+#ifdef ARCH_PPC
+#   include "ppc/quant.h"
+#endif
 
 #define QUANT_ONE( coef, mf ) \
 { \
@@ -271,4 +274,39 @@
         pf->dequant_8x8 = x264_dequant_8x8_mmx;
     }
 #endif  /* HAVE_MMXEXT */
+    
+#ifdef ARCH_PPC
+    if( cpu&X264_CPU_ALTIVEC )
+    {
+        /* determine the biggest coefficient in all quant8_mf tables */
+        for( i = 0; i < 2*6*8*8; i++ )
+        {
+            int q = h->quant8_mf[0][0][0][i];
+            if( maxQ8 < q )
+                maxQ8 = q;
+        }
+
+        for( i = 0; i < 4*6*4*4; i++ )
+        {
+            int q = h->quant4_mf[0][0][0][i];
+            if( maxQ4 < q )
+                maxQ4 = q;
+            if( maxQdc < q && i%16 == 0 )
+                maxQdc = q;
+        }
+
+        if( maxQ8 < (1<<16) )
+        {
+            pf->quant_8x8_core = x264_quant_8x8_altivec;
+        }
+        if( maxQ4 < (1<<16) )
+        {
+            pf->quant_4x4_core = x264_quant_4x4_altivec;
+        }
+        if( maxQdc < (1<<16) )
+        {
+           pf->quant_4x4_dc_core = x264_quant_4x4_dc_altivec;
+        }
+    }
+#endif
 }
Index: common/dct.c
===================================================================
--- common/dct.c	(revision 558)
+++ common/dct.c	(working copy)
@@ -437,6 +437,9 @@
         dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
         dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
         dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
+
+        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
+        dctf->sub16x16_dct8 = x264_sub16x16_dct8_altivec;
     }
 #endif
 }
Index: tools/checkasm.c
===================================================================
--- tools/checkasm.c	(revision 558)
+++ tools/checkasm.c	(working copy)
@@ -453,8 +453,9 @@
     x264_quant_function_t qf_c;
     x264_quant_function_t qf_ref;
     x264_quant_function_t qf_a;
-    int16_t dct1[64], dct2[64];
-    uint8_t cqm_buf[64];
+    int16_t dct1[64]    __attribute__((__aligned__(16)));
+    int16_t dct2[64]    __attribute__((__aligned__(16)));
+    uint8_t cqm_buf[64] __attribute__((__aligned__(16)));
     int ret = 0, ok, used_asm;
     int oks[2] = {1,1}, used_asms[2] = {0,0};
     int i, i_cqm;
Index: Makefile
===================================================================
--- Makefile	(revision 558)
+++ Makefile	(working copy)
@@ -43,7 +43,8 @@
 
 # AltiVec optims
 ifeq ($(ARCH),PPC)
-SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c
+SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \
+	common/ppc/quant.c
 endif
 
 # VIS optims

