[x265] [PATCH] asm: avx2 10bit code for scale2D_64to32

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Wed Apr 22 16:23:12 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1429712526 -19800
#      Wed Apr 22 19:52:06 2015 +0530
# Node ID eb7ba0bea5b6bac28ea39030062c31b5ed504487
# Parent  86268e498680951069c48b681eef830b0aa37873
asm: avx2 10bit code for scale2D_64to32

AVX2:
scale2D_64to32  17.07x   3873.16   44301.99
SSSE3:
scale2D_64to32  2.75x    14407.30  39553.04

diff -r 86268e498680 -r eb7ba0bea5b6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Wed Apr 22 19:52:06 2015 +0530
@@ -1196,6 +1196,7 @@
         p.dequant_normal  = x265_dequant_normal_avx2;
 
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
+        p.scale2D_64to32 = x265_scale2D_64to32_avx2;
         // p.weight_pp = x265_weight_pp_avx2; fails tests
 
         p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
diff -r 86268e498680 -r eb7ba0bea5b6 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/pixel-util8.asm	Wed Apr 22 19:52:06 2015 +0530
@@ -4022,6 +4022,50 @@
     RET
 %endif
 
+;-----------------------------------------------------------------
+; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
+;-----------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+cglobal scale2D_64to32, 3, 4, 5, dest, src, stride
+    mov         r3d,     32                         ; r3 = loop counter: 32 iterations, one dest row each
+    add         r2,      r2                         ; double stride (presumably pixels -> bytes, 16-bit pixels); full width since stride is intptr_t
+    mova        m4,      [pw_2000]                  ; pmulhrsw multiplier (defined elsewhere; presumably words of 0x2000 => (x + 2) >> 2)
+    ; per iteration: two source rows of 64 words are summed, adjacent words paired, scaled, and stored as 32 dest words
+.loop:
+    movu        m0,      [r1]                       ; row 0, words  0..15
+    movu        m1,      [r1 + 1 * mmsize]          ; row 0, words 16..31
+    movu        m2,      [r1 + r2]                  ; row 1, words  0..15
+    movu        m3,      [r1 + r2 + 1 * mmsize]     ; row 1, words 16..31
+
+    paddw       m0,      m2                         ; vertical sums of the two rows
+    paddw       m1,      m3
+    phaddw      m0,      m1                         ; add adjacent words; works per 128-bit lane, so halves of m0/m1 interleave
+
+    pmulhrsw    m0,      m4                         ; rounded scale of each 4-sample sum
+    vpermq      m0,      m0, q3120                  ; swap middle quadwords to undo the per-lane interleave of phaddw
+    movu        [r0],    m0                         ; dest words 0..15
+
+    movu        m0,      [r1 + 2 * mmsize]          ; row 0, words 32..47
+    movu        m1,      [r1 + 3 * mmsize]          ; row 0, words 48..63
+    movu        m2,      [r1 + r2 + 2 * mmsize]     ; row 1, words 32..47
+    movu        m3,      [r1 + r2 + 3 * mmsize]     ; row 1, words 48..63
+
+    paddw       m0,      m2                         ; same reduction for the right half of the rows
+    paddw       m1,      m3
+    phaddw      m0,      m1
+
+    pmulhrsw    m0,      m4
+    vpermq      m0,      m0, q3120
+    movu        [r0 + mmsize], m0                   ; dest words 16..31
+
+    add         r0,      64                         ; dest advances 64 bytes (the two stores above) per iteration
+    lea         r1,      [r1 + 2 * r2]              ; src advances two rows (r2 was doubled above)
+    dec         r3d
+    jnz         .loop
+    RET
+%else
+
 INIT_YMM avx2
 cglobal scale2D_64to32, 3, 5, 8, dest, src, stride
     mov         r3d,     16
@@ -4121,6 +4165,7 @@
     dec         r3d
     jnz         .loop
     RET
+%endif
 
 ;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_4x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);


More information about the x265-devel mailing list