[x264-devel] [PATCH] Altivec optimizations for quant4x4 and quant4x4dc

Guillaume POIRIER poirierg at gmail.com
Tue Aug 8 09:52:07 CEST 2006


Hello folks,
Please find in attachment a work-in-progress version of $subj.
Both work and pass regression tests. \o/
However, I think they can be slightly improved, as quant8x8, which
shares the same Altivec code as 4x4, doesn't seem to work with all
custom quant matrices.
Having a quick look at the different ASM routines for the x86 target, it
looks like I may need to write a version of the Altivec routines with
a higher level of precision...
I imagine that if some of the tests pass but not the whole lot, it's
likely to be an overflow problem, right?
Anyway, the Altivec quant8x8 routine is also present in the patch, but
it's not used.

To put it in a nutshell, the attached patch does work, but will probably
need a bit of work before inclusion; any comments welcome!

Guillaume
-- 
Conditional branch execution in SIMD vector processors is patented! :
http://www.freepatentsonline.com/4435758.html
All ASM SIMD code writers are outlaws!
-------------- next part --------------
Index: encoder/macroblock.c
===================================================================
--- encoder/macroblock.c	(revision 540)
+++ encoder/macroblock.c	(working copy)
@@ -31,7 +31,7 @@
 /* def_quant4_mf only for probe_skip; actual encoding uses matrices from set.c */
 /* FIXME this seems to make better decisions with cqm=jvt, but could screw up
  * with general custom matrices. */
-static const int def_quant4_mf[6][4][4] =
+static const int def_quant4_mf[6][4][4] __attribute__((__aligned__(16))) =
 {
     { { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 },
       { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 } },
@@ -195,7 +195,7 @@
     int y = 4 * block_idx_y[idx];
     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
-    int16_t dct4x4[4][4];
+    DECLARE_ALIGNED( int16_t, dct4x4[4][4], 16 );
 
     if( h->mb.b_lossless )
     {
@@ -223,7 +223,7 @@
     int y = 8 * (idx>>1);
     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
-    int16_t dct8x8[8][8];
+    DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 );
 
     h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
 
@@ -242,7 +242,7 @@
     uint8_t  *p_src = h->mb.pic.p_fenc[0];
     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 
-    int16_t dct4x4[16+1][4][4];
+    DECLARE_ALIGNED( int16_t, dct4x4[16+1][4][4], 16 );
 
     int i;
 
@@ -305,8 +305,8 @@
         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
         int i_decimate_score = 0;
 
-        int16_t dct2x2[2][2];
-        int16_t dct4x4[4][4][4];
+        DECLARE_ALIGNED( int16_t, dct2x2[2][2] , 16 );
+        DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 );
 
         if( h->mb.b_lossless )
         {
@@ -494,7 +494,7 @@
         }
         else if( h->mb.b_transform_8x8 )
         {
-            int16_t dct8x8[4][8][8];
+            DECLARE_ALIGNED( int16_t, dct8x8[4][8][8], 16 );
             int nnz8x8[4] = {1,1,1,1};
             b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
             h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
@@ -539,7 +539,7 @@
         }
         else
         {
-            int16_t dct4x4[16][4][4];
+            DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
             int nnz8x8[4] = {1,1,1,1};
             h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
 
@@ -869,7 +869,7 @@
 
     if( h->mb.b_transform_8x8 )
     {
-        int16_t dct8x8[8][8];
+        DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 );
         h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
         quant_8x8( h, dct8x8, h->quant8_mf[CQM_8PY], i_qp, 0 );
         scan_zigzag_8x8full( h->dct.luma8x8[i8], dct8x8 );
@@ -888,7 +888,7 @@
     else
     {
         int i4;
-        int16_t dct4x4[4][4][4];
+        DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 );
         h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
         quant_4x4( h, dct4x4[0], h->quant4_mf[CQM_4PY], i_qp, 0 );
         quant_4x4( h, dct4x4[1], h->quant4_mf[CQM_4PY], i_qp, 0 );
@@ -919,7 +919,7 @@
 
     for( ch = 0; ch < 2; ch++ )
     {
-        int16_t dct4x4[4][4];
+        DECLARE_ALIGNED( int16_t, dct4x4[4][4], 16 );
         p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
         p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
 
Index: common/ppc/quant.c
===================================================================
--- common/ppc/quant.c	(revision 0)
+++ common/ppc/quant.c	(revision 0)
@@ -0,0 +1,204 @@
+/*****************************************************************************
+* quant.c: h264 encoder
+*****************************************************************************
+* Authors: Guillaume Poirier <poirierg at gmail.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+*****************************************************************************/
+
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "common/common.h"
+#include "ppccommon.h"
+#include "quant.h"            
+
+// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
+#define QUANT_16_U( a, b, c, d, e, f )                              \
+temp1v = vec_ld((a), *dct);                                         \
+temp2v = vec_ld((d), *dct);                                         \
+mfvA = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((b), *quant_mf), (vec_u32_t)vec_ld((c), *quant_mf));    \
+mfvB = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((e), *quant_mf), (vec_u32_t)vec_ld((f), *quant_mf));    \
+mskA = vec_cmplt(temp1v, zerov);                                    \
+mskB = vec_cmplt(temp2v, zerov);                                    \
+coefvA = (vec_u16_t) vec_abs(vec_sub(zerov, temp1v), temp1v);       \
+coefvB = (vec_u16_t) vec_abs(vec_sub(zerov, temp2v), temp2v);       \
+multEvenvA = vec_mule(coefvA, mfvA);                                \
+multOddvA = vec_mulo(coefvA, mfvA);                                 \
+multEvenvB = vec_mule(coefvB, mfvB);                                \
+multOddvB = vec_mulo(coefvB, mfvB);                                 \
+multEvenvA = vec_add(multEvenvA, fV);                               \
+multOddvA = vec_add(multOddvA, fV);                                 \
+multEvenvB = vec_add(multEvenvB, fV);                               \
+multOddvB = vec_add(multOddvB, fV);                                 \
+multEvenvA = vec_sr(multEvenvA, i_qbitsv);                          \
+multOddvA = vec_sr(multOddvA, i_qbitsv);                            \
+multEvenvB = vec_sr(multEvenvB, i_qbitsv);                          \
+multOddvB = vec_sr(multOddvB, i_qbitsv);                            \
+temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
+temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
+temp1v = vec_xor(temp1v, mskA);                                     \
+temp2v = vec_xor(temp2v, mskB);                                     \
+temp1v = vec_add(temp1v, vec_and(mskA, one));                       \
+vec_st(temp1v, (a), dct);                                           \
+temp2v = vec_add(temp2v, vec_and(mskB, one));                       \
+vec_st(temp2v, (d), dct);
+                
+void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f ) {
+    vector bool short mskA;
+    vec_s32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_s32_t multEvenvA, multOddvA, mfvA;
+    vec_s16_t zerov, one;
+    vec_s32_t fV;
+
+    vector bool short mskB;
+    vec_u16_t coefvB;
+    vec_u32_t multEvenvB, multOddvB, mfvB;
+
+    vec_s16_t temp1v, temp2v;
+
+    union {
+        int s[4];
+        vector int v;
+    } qbits_u;
+    qbits_u.s[0]=i_qbits;
+    i_qbitsv = vec_splat(qbits_u.v, 0);
+
+    union {
+        signed int s[4];
+        signed vector int v;
+    } f_u;
+    f_u.s[0]=f;
+    fV = vec_splat(f_u.v, 0);
+
+    zerov = vec_splat_s16(0);
+    one = vec_splat_s16(1);
+    
+    QUANT_16_U( 0, 0, 16, 16, 32, 48 );
+}
+
+// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
+#define QUANT_16_U_DC( a, d )                                       \
+temp1v = vec_ld((a), *dct);                                         \
+temp2v = vec_ld((d), *dct);                                         \
+mskA = vec_cmplt(temp1v, zerov);                                    \
+mskB = vec_cmplt(temp2v, zerov);                                    \
+coefvA = (vec_u16_t) vec_abs(vec_sub(zerov, temp1v), temp1v);       \
+coefvB = (vec_u16_t) vec_abs(vec_sub(zerov, temp2v), temp2v);       \
+multEvenvA = vec_mule(coefvA, mfv);                                 \
+multOddvA = vec_mulo(coefvA, mfv);                                  \
+multEvenvB = vec_mule(coefvB, mfv);                                 \
+multOddvB = vec_mulo(coefvB, mfv);                                  \
+multEvenvA = vec_add(multEvenvA, fV);                               \
+multOddvA = vec_add(multOddvA, fV);                                 \
+multEvenvB = vec_add(multEvenvB, fV);                               \
+multOddvB = vec_add(multOddvB, fV);                                 \
+multEvenvA = vec_sr(multEvenvA, i_qbitsv);                          \
+multOddvA = vec_sr(multOddvA, i_qbitsv);                            \
+multEvenvB = vec_sr(multEvenvB, i_qbitsv);                          \
+multOddvB = vec_sr(multOddvB, i_qbitsv);                            \
+temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
+temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
+temp1v = vec_xor(temp1v, mskA);                                     \
+temp2v = vec_xor(temp2v, mskB);                                     \
+temp1v = vec_add(temp1v, vec_and(mskA, one));                       \
+vec_st(temp1v, (a), dct);                                           \
+temp2v = vec_add(temp2v, vec_and(mskB, one));                       \
+vec_st(temp2v, (d), dct);
+
+
+void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f ) {
+    vector bool short mskA;
+    vec_s32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_s32_t multEvenvA, multOddvA;
+    vec_s16_t zerov, one;
+    vec_s32_t fV;
+
+    vector bool short mskB;
+    vec_u16_t coefvB;
+    vec_u32_t multEvenvB, multOddvB;
+
+    vec_s16_t temp1v, temp2v;
+
+    vec_s32_t mfv;
+	union {
+        int s[4];
+        vector int v;
+    } mf_u;
+    mf_u.s[0]=i_quant_mf;
+    mfv = vec_splat( mf_u.v, 0 );
+    mfv = vec_packs( mfv, mfv);
+
+    union {
+        int s[4];
+        vector int v;
+    } qbits_u;
+    qbits_u.s[0]=i_qbits;
+    i_qbitsv = vec_splat(qbits_u.v, 0);
+
+    union {
+        signed int s[4];
+        signed vector int v;
+    } f_u;
+    f_u.s[0]=f;
+    fV = vec_splat(f_u.v, 0);
+
+    zerov = vec_splat_s16(0);
+    one = vec_splat_s16(1);
+
+    QUANT_16_U_DC( 0, 16 );
+}
+
+
+void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f )
+{
+    vector bool short mskA;
+    vec_s32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_s32_t multEvenvA, multOddvA, mfvA;
+    vec_s16_t zerov, one;
+    vec_s32_t fV;
+    
+    vector bool short mskB;
+    vec_u16_t coefvB;
+    vec_u32_t multEvenvB, multOddvB, mfvB;
+    
+    vec_s16_t temp1v, temp2v;
+    
+    union {
+        int s[4];
+        vector int v;
+    } qbits_u;
+    qbits_u.s[0]=i_qbits;
+    i_qbitsv = vec_splat(qbits_u.v, 0);
+
+    union {
+        signed int s[4];
+        signed vector int v;
+    } f_u;
+    f_u.s[0]=f;
+    fV = vec_splat(f_u.v, 0);
+
+    zerov = vec_splat_s16(0);
+    one = vec_splat_s16(1);
+    
+    int i;
+    for ( i=0; i<4; i++ ) {
+        QUANT_16_U( i*2*16, i*4*16, i*4*16+16, i*2*16+16, i*4*16+32, i*4*16+48 );
+    }
+}
Index: common/ppc/quant.h
===================================================================
--- common/ppc/quant.h	(revision 0)
+++ common/ppc/quant.h	(revision 0)
@@ -0,0 +1,28 @@
+/*****************************************************************************
+* quant.h: h264 encoder library
+*****************************************************************************
+* Authors: Guillaume Poirier <poirierg at gmail.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+*****************************************************************************/
+
+#ifndef _PPC_QUANT_H
+#define _PPC_QUANT_H 1
+
+void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f );
+void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f );
+
+void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f );
+#endif
Index: common/quant.c
===================================================================
--- common/quant.c	(revision 540)
+++ common/quant.c	(working copy)
@@ -25,6 +25,9 @@
 #ifdef HAVE_MMXEXT
 #include "i386/quant.h"
 #endif
+#ifdef ARCH_PPC
+#   include "ppc/quant.h"
+#endif
 
 #define QUANT_ONE( coef, mf ) \
 { \
@@ -271,4 +274,31 @@
         pf->dequant_8x8 = x264_dequant_8x8_mmx;
     }
 #endif  /* HAVE_MMXEXT */
+    
+#ifdef ARCH_PPC
+    if( cpu&X264_CPU_ALTIVEC )
+    {
+        for( i = 0; i < 4*6*4*4; i++ )
+        {
+            int q = h->quant4_mf[0][0][0][i];
+            if( maxQ4 < q )
+                maxQ4 = q;
+            if( maxQdc < q && i%16 == 0 )
+                maxQdc = q;
+        }
+
+        if( maxQ8 < (1<<16) )
+        {
+            //pf->quant_8x8_core = x264_quant_8x8_altivec;
+        }
+        if( maxQ4 < (1<<16) )
+        {
+            pf->quant_4x4_core = x264_quant_4x4_altivec;
+        }
+        if( maxQdc < (1<<15) )
+        {
+           pf->quant_4x4_dc_core = x264_quant_4x4_dc_altivec;
+        }
+    }
+#endif
 }
Index: common/macroblock.c
===================================================================
--- common/macroblock.c	(revision 540)
+++ common/macroblock.c	(working copy)
@@ -26,7 +26,7 @@
 
 #include "common.h"
 
-static const int dequant_mf[6][4][4] =
+static const int dequant_mf[6][4][4] __attribute__((__aligned__(16))) =
 {
     { {10, 13, 10, 13}, {13, 16, 13, 16}, {10, 13, 10, 13}, {13, 16, 13, 16} },
     { {11, 14, 11, 14}, {14, 18, 14, 18}, {11, 14, 11, 14}, {14, 18, 14, 18} },
Index: tools/checkasm.c
===================================================================
--- tools/checkasm.c	(revision 540)
+++ tools/checkasm.c	(working copy)
@@ -437,8 +437,9 @@
     x264_quant_function_t qf_c;
     x264_quant_function_t qf_ref;
     x264_quant_function_t qf_a;
-    int16_t dct1[64], dct2[64];
-    uint8_t cqm_buf[64];
+    int16_t dct1[64]    __attribute__((__aligned__(16)));
+    int16_t dct2[64]    __attribute__((__aligned__(16)));
+    uint8_t cqm_buf[64] __attribute__((__aligned__(16)));
     int ret = 0, ok, used_asm;
     int oks[2] = {1,1}, used_asms[2] = {0,0};
     int i, i_cqm;
Index: Makefile
===================================================================
--- Makefile	(revision 540)
+++ Makefile	(working copy)
@@ -43,7 +43,8 @@
 
 # AltiVec optims
 ifeq ($(ARCH),PPC)
-SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c
+SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \
+	common/ppc/quant.c
 endif
 
 # VIS optims


More information about the x264-devel mailing list