[x264-devel] x86: AVX2 high bit-depth quant

Mon May 20 23:06:49 CEST 2013

x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat May  4 18:48:58 2013 +0200| [db95d6af63bec7839b3d3e1f2eb67b8689dc8170] | committer: Jason Garrett-Glaser

x86: AVX2 high bit-depth quant

quant_4x4: 13->6 cycles
quant_4x4_dc: 14->8 cycles
quant_8x8: 47->24 cycles
quant_4x4x4: 48->25 cycles

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=db95d6af63bec7839b3d3e1f2eb67b8689dc8170
---

 common/quant.c         |    7 +++++++
 common/x86/quant-a.asm |   34 +++++++++++++++++++++++++++++-----
 2 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/common/quant.c b/common/quant.c
index cffd8e8..d4fd405 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -537,6 +537,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
             pf->dequant_8x8 = x264_dequant_8x8_xop;
         }
     }
+    if( cpu&X264_CPU_AVX2 )
+    {
+        pf->quant_4x4 = x264_quant_4x4_avx2;
+        pf->quant_4x4_dc = x264_quant_4x4_dc_avx2;
+        pf->quant_8x8 = x264_quant_8x8_avx2;
+        pf->quant_4x4x4 = x264_quant_4x4x4_avx2;
+    }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index fbe2d79..3f7e9b3 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -7,7 +7,7 @@
 ;*          Jason Garrett-Glaser <darkshikari at gmail.com>
 ;*          Christian Heine <sennindemokrit at gmx.net>
 ;*          Oskar Arvidsson <oskar at irock.se>
-;*          Henrik Gramner <hengar-6 at student.ltu.se>
+;*          Henrik Gramner <henrik at gramner.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -238,10 +238,10 @@ cextern popcnt_table
     mova [%1       ], m2
     mova [%1+mmsize], m3
     ACCUM      por, %5, 2, %4
-    ACCUM      por, %5, 3, %4+mmsize
+    por        m%5, m3
 %else ; !sse4
     QUANT_ONE_AC_MMX %1, %2, %3, %4, %5
-    QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, %4+mmsize, %5
+    QUANT_ONE_AC_MMX %1+mmsize, %2+mmsize, %3+mmsize, 1, %5
 %endif ; cpuflag
 %endmacro
 
@@ -279,8 +279,8 @@ cglobal quant_%1x%2, 3,3,8
 %endmacro
 
 %macro QUANT_4x4 2
-    QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, mmsize*0, %2
-    QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, mmsize*2, %2
+    QUANT_TWO_AC r0+%1+mmsize*0, r1+mmsize*0, r2+mmsize*0, 0, %2
+    QUANT_TWO_AC r0+%1+mmsize*2, r1+mmsize*2, r2+mmsize*2, 1, %2
 %endmacro
 
 %macro QUANT_4x4x4 0
@@ -324,6 +324,30 @@ QUANT_AC 4, 4
 QUANT_AC 8, 8
 QUANT_4x4x4
 
+INIT_YMM avx2
+QUANT_DC 4, 4
+QUANT_AC 4, 4
+QUANT_AC 8, 8
+
+INIT_YMM avx2                          ; NOTE(review): presumably selects 32-byte ymm regs (mmsize=32) for the code below - confirm in x86inc
+cglobal quant_4x4x4, 3,3,6             ; quantizes four consecutive 64-byte blocks at r0 in place; eax = 4-bit mask, bit i set if block i has a nonzero result
+    QUANT_TWO_AC r0,    r1, r2, 0, 4   ; block 0: m4 = OR of its two stored result vectors (assumes the sse4 path; ACCUM with arg 0 presumably initializes m4 - confirm)
+    QUANT_TWO_AC r0+64, r1, r2, 0, 5   ; block 1: same, into m5; r1/r2 are passed unchanged for every block
+    add       r0, 128                  ; step past blocks 0 and 1
+    packssdw  m4, m5                   ; per 128-bit lane: block 0 words | block 1 words; signed saturation keeps nonzero values nonzero
+    QUANT_TWO_AC r0,    r1, r2, 0, 5   ; block 2: m5 is free again after the pack above
+    QUANT_TWO_AC r0+64, r1, r2, 0, 1   ; block 3: accumulates into m1
+    packssdw  m5, m1                   ; per lane: block 2 words | block 3 words
+    packssdw  m4, m5                   ; per lane: one dword per block (0..3); low lane from the first 32 bytes of each block, high lane from the second
+    pxor      m3, m3                   ; m3 = 0 for the comparison
+    pcmpeqd   m4, m3                   ; dword = all-ones where that half-block is entirely zero
+    movmskps eax, m4                   ; eax bits 0-3: low-lane zero flags, bits 4-7: high-lane zero flags
+    mov      edx, eax
+    shr      eax, 4                    ; bring the high-lane flags down to bits 0-3
+    and      eax, edx                  ; bit i = both halves of block i are zero (bits 4-7 become 0 after the shift)
+    xor      eax, 0xf                  ; invert: bit i = block i contains at least one nonzero value
+    RET
+
 %endif ; HIGH_BIT_DEPTH
 
 %if HIGH_BIT_DEPTH == 0