[x265] [PATCH] asm: scale2D_64to32 avx2 code

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Mar 27 06:57:59 CET 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1427435844 -19800
#      Fri Mar 27 11:27:24 2015 +0530
# Branch stable
# Node ID 694ce8986d7e83c8fca7e7a421c29b21b423ffbf
# Parent  3d0f23cb0e58585e490362587022e67cfded143a
asm: scale2D_64to32 avx2 code

AVX2 (speedup, optimized time, C reference time):
scale2D_64to32      10.41x   3861.30         40192.99
scale2D_64to32      10.35x   3880.97         40175.66

SSSE3 (speedup, optimized time, C reference time):
scale2D_64to32      5.44x    7454.44         40576.51
scale2D_64to32      5.45x    7445.73         40613.14

diff -r 3d0f23cb0e58 -r 694ce8986d7e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Mar 26 15:09:51 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Fri Mar 27 11:27:24 2015 +0530
@@ -1447,6 +1447,8 @@
 #if X86_64
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.scale2D_64to32 = x265_scale2D_64to32_avx2;
+
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
         p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
         p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2;
diff -r 3d0f23cb0e58 -r 694ce8986d7e source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Thu Mar 26 15:09:51 2015 -0500
+++ b/source/common/x86/pixel-util.h	Fri Mar 27 11:27:24 2015 +0530
@@ -76,6 +76,7 @@
 void x265_scale1D_128to64_ssse3(pixel*, const pixel*, intptr_t);
 void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t);
 void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
+void x265_scale2D_64to32_avx2(pixel*, const pixel*, intptr_t);
 
 int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
 
diff -r 3d0f23cb0e58 -r 694ce8986d7e source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu Mar 26 15:09:51 2015 -0500
+++ b/source/common/x86/pixel-util8.asm	Fri Mar 27 11:27:24 2015 +0530
@@ -40,16 +40,17 @@
 ssim_c1:   times 4 dd 416          ; .01*.01*255*255*64
 ssim_c2:   times 4 dd 235963       ; .03*.03*255*255*64*63
 %endif
-mask_ff:   times 16 db 0xff
-           times 16 db 0
-deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
-deinterleave_word_shuf: db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
-hmul_16p:  times 16 db 1
-           times 8 db 1, -1
-hmulw_16p:  times 8 dw 1
-            times 4 dw 1, -1
-
-trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
+
+mask_ff:                times 16 db 0xff
+                        times 16 db 0
+deinterleave_shuf:      times  2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+deinterleave_word_shuf: times  2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
+hmul_16p:               times 16 db 1
+                        times  8 db 1, -1
+hmulw_16p:              times  8 dw 1
+                        times  4 dw 1, -1
+
+trans8_shuf:            dd 0, 4, 1, 5, 2, 6, 3, 7
 
 SECTION .text
 
@@ -3944,6 +3945,105 @@
     RET
 %endif
 
+INIT_YMM avx2
+cglobal scale2D_64to32, 3, 5, 8, dest, src, stride    ; 2:1 box downscale: 64x64 src (r1, stride r2) -> 32x32 dst (r0)
+    mov         r3d,     16                           ; 16 iterations x 4 src rows (2 dst rows) each = 64 src rows
+    mova        m7,      [deinterleave_shuf]          ; per-128-bit-lane: gather even bytes to low qword, odd to high
+.loop:
+    movu        m0,      [r1]                  ; i = src row 0, bytes 0..31
+    lea         r4,      [r1 + r2 * 2]         ; r4 = src + 2*stride (rows 2/3, for 2nd dst row)
+    psrlw       m1,      m0, 8                 ; j = row 0 odd bytes moved into even byte slots
+    movu        m2,      [r1 + r2]             ; k = src row 1, bytes 0..31
+    psrlw       m3,      m2, 8                 ; l = row 1 odd bytes in even slots
+
+    pxor        m4,      m0, m1                ; i^j : bit0 = carry lost by pavgb round-up
+    pxor        m5,      m2, m3                ; k^l
+    por         m4,      m5                    ; ij|kl : set where either horizontal avg rounded up
+
+    pavgb       m0,      m1                    ; s = (i+j+1)>>1 (valid in even byte slots only)
+    pavgb       m2,      m3                    ; t = (k+l+1)>>1
+    mova        m5,      m0
+    pavgb       m0,      m2                    ; (s+t+1)/2 : vertical average, rounds up again
+    pxor        m5,      m2                    ; s^t : bit0 = carry lost by the vertical pavgb
+    pand        m4,      m5                    ; (ij|kl)&st : both stages rounded up -> over-biased
+    pand        m4,      [pb_1]                ; keep only bit 0 as the correction mask
+    psubb       m0,      m4                    ; Result: 2x2 average with the double round-up removed
+
+    movu        m1,      [r1 + 32]             ; i = src row 0, bytes 32..63 (same trick, right half)
+    psrlw       m2,      m1, 8                 ; j
+    movu        m3,      [r1 + r2 + 32]        ; k = src row 1, bytes 32..63
+    psrlw       m4,      m3, 8                 ; l
+
+    pxor        m5,      m1, m2                ; i^j
+    pxor        m6,      m3, m4                ; k^l
+    por         m5,      m6                    ; ij|kl
+
+    pavgb       m1,      m2                    ; s
+    pavgb       m3,      m4                    ; t
+    mova        m6,      m1
+    pavgb       m1,      m3                    ; (s+t+1)/2
+    pxor        m6,      m3                    ; s^t
+    pand        m5,      m6                    ; (ij|kl)&st
+    pand        m5,      [pb_1]                ; rounding-correction mask
+    psubb       m1,      m5                    ; Result for bytes 32..63
+
+    pshufb      m0,      m0, m7                ; compact the valid even-slot averages into each lane's low qword
+    pshufb      m1,      m1, m7
+
+    punpcklqdq  m0,      m1                    ; per lane: [m0 low q, m1 low q]
+    vpermq      m0,      m0, 11011000b         ; 0xD8 reorders qwords to m0.q0,m0.q2,m1.q0,m1.q2 -> 32 dst pixels in order
+    movu        [r0],    m0
+
+    add         r0,      32                    ; dst is written linearly, 32 pixels per dst row
+
+    movu        m0,      [r4]                  ; i = src row 2, bytes 0..31 (second dst row, same algorithm)
+    psrlw       m1,      m0, 8                 ; j
+    movu        m2,      [r4 + r2]             ; k = src row 3
+    psrlw       m3,      m2, 8                 ; l
+
+    pxor        m4,      m0, m1                ; i^j
+    pxor        m5,      m2, m3                ; k^l
+    por         m4,      m5                    ; ij|kl
+
+    pavgb       m0,      m1                    ; s
+    pavgb       m2,      m3                    ; t
+    mova        m5,      m0
+    pavgb       m0,      m2                    ; (s+t+1)/2
+    pxor        m5,      m2                    ; s^t
+    pand        m4,      m5                    ; (ij|kl)&st
+    pand        m4,      [pb_1]                ; rounding-correction mask
+    psubb       m0,      m4                    ; Result for rows 2/3, bytes 0..31
+
+    movu        m1,      [r4 + 32]             ; i = src row 2, bytes 32..63
+    psrlw       m2,      m1, 8                 ; j
+    movu        m3,      [r4 + r2 + 32]        ; k = src row 3, bytes 32..63
+    psrlw       m4,      m3, 8                 ; l
+
+    pxor        m5,      m1, m2                ; i^j
+    pxor        m6,      m3, m4                ; k^l
+    por         m5,      m6                    ; ij|kl
+
+    pavgb       m1,      m2                    ; s
+    pavgb       m3,      m4                    ; t
+    mova        m6,      m1
+    pavgb       m1,      m3                    ; (s+t+1)/2
+    pxor        m6,      m3                    ; s^t
+    pand        m5,      m6                    ; (ij|kl)&st
+    pand        m5,      [pb_1]                ; rounding-correction mask
+    psubb       m1,      m5                    ; Result for rows 2/3, bytes 32..63
+
+    pshufb      m0,      m0, m7                ; deinterleave as above
+    pshufb      m1,      m1, m7
+
+    punpcklqdq  m0,      m1
+    vpermq      m0,      m0, 11011000b         ; restore source order across the two lanes
+    movu        [r0],    m0
+
+    lea         r1,      [r1 + 4 * r2]         ; advance src by the 4 rows consumed this iteration
+    add         r0,      32
+    dec         r3d
+    jnz         .loop
+    RET
 
 ;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_4x4(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);


More information about the x265-devel mailing list