[x265] [PATCH Review Only] weight_sp: avx2 asm code, improved from 7849.40 cycles to 4922.20 cycles over sse version

praveen at multicorewareinc.com praveen at multicorewareinc.com
Thu Oct 23 14:19:10 CEST 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1414061274 -19800
# Node ID a19f9a6e25da7f4b8b8d2f5105eaaa19df4a6529
# Parent  ce304756a6e469b94cceef930e62972bd2168e4f
weight_sp: avx2 asm code, improved from 7849.40 cycles to 4922.20 cycles over sse version

diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Oct 22 23:16:13 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp	Thu Oct 23 16:17:54 2014 +0530
@@ -1792,6 +1792,7 @@
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
 
         p.weight_pp = x265_weight_pp_avx2;
+        p.weight_sp = x265_weight_sp_avx2;
 
 #if X86_64
         p.dct[DCT_8x8] = x265_dct8_avx2;
diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Wed Oct 22 23:16:13 2014 -0500
+++ b/source/common/x86/pixel-util.h	Thu Oct 23 16:17:54 2014 +0530
@@ -60,6 +60,7 @@
 void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
 void x265_weight_pp_avx2(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
 void x265_weight_sp_sse4(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
+void x265_weight_sp_avx2(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
 
 void x265_pixel_ssim_4x4x2_core_mmx2(const uint8_t * pix1, intptr_t stride1,
                                      const uint8_t * pix2, intptr_t stride2, int sums[2][4]);
diff -r ce304756a6e4 -r a19f9a6e25da source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Oct 22 23:16:13 2014 -0500
+++ b/source/common/x86/pixel-util8.asm	Thu Oct 23 16:17:54 2014 +0530
@@ -1498,6 +1498,105 @@
 
     RET
 
+;-----------------------------------------------------------------
+; void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride,
+;                int width, int height, int w0, int round, int shift, int offset)
+;
+; dst[x] = saturate_u8((((src[x] + pw_2000) * w0 + round) >> shift) + offset)
+; Only the low 16 bits of w0 and round are used (they are fed to pmaddwd as words).
+;
+; r0 = src, r1 = dst, r2 = srcStride (doubled to bytes), r3 = dstStride,
+; r4d = width, r5d = rows left, r6d = columns left in the current row.
+; Each iteration loads 32 bytes of src, even when fewer than 16 columns remain.
+; Odd widths are not handled exactly: the last tail store always writes 2 pixels.
+;-----------------------------------------------------------------
+INIT_YMM avx2
+%if ARCH_X86_64
+cglobal weight_sp, 6, 7+2, 7
+    %define tmp_r0      r7
+    %define tmp_r1      r8
+%else ; ARCH_X86_64 = 0
+cglobal weight_sp, 6, 7, 7, 0-(2*4)
+    %define tmp_r0      [(rsp + 0 * 4)]
+    %define tmp_r1      [(rsp + 1 * 4)]
+%endif ; ARCH_X86_64
+
+    movd         xm0, r6m         ; m0 = [w0]
+    movd         xm1, r7m         ; m1 = [round]
+    punpcklwd    xm0, xm1
+    pshufd       xm0, xm0, 0      ; m0 = [w0 round]
+    vinserti128  m0,  m0, xm0, 1  ; copy to the upper lane; TODO: (pshufd + vinserti128) should be replaceable by vpbroadcastd m0, xm0, but that form failed to build, needs investigation
+    movd         xm1, r8m         ; m1 = [shift]
+    vpbroadcastd m2, r9m          ; m2 = [offset]
+    vpbroadcastw m3, [pw_1]
+    vpbroadcastw m4, [pw_2000]
+
+    add         r2, r2            ; srcStride in bytes (int16_t samples); full-width add keeps the intptr_t sign
+
+.loopH:
+    mov         r6d, r4d          ; r6d = columns left in this row
+
+    ; save old src and dst
+    mov         tmp_r0, r0
+    mov         tmp_r1, r1
+.loopW:
+    movu        m5, [r0]          ; 16 source samples
+    paddw       m5, m4
+
+    punpcklwd   m6, m5, m3        ; pair each sample with 1 so pmaddwd yields sample * w0 + round
+    pmaddwd     m6, m0
+    psrad       m6, xm1
+    paddd       m6, m2
+
+    punpckhwd   m5, m3
+    pmaddwd     m5, m0
+    psrad       m5, xm1
+    paddd       m5, m2
+
+    packssdw     m6, m5           ; unpack and pack are both per 128-bit lane, so sample order is restored
+    vextracti128 xm5, m6, 1
+    packuswb     xm6, xm5         ; xm6 = 16 output pixels
+
+    sub         r6d, 16
+    jl          .width8           ; fewer than 16 columns were left
+    movu        [r1], xm6         ; movu leaves the flags from sub intact
+    je          .nextH
+    add         r0, 32
+    add         r1, 16
+
+    jmp         .loopW
+
+.width8:
+    add         r6d, 16           ; r6d = columns left (below 16)
+    cmp         r6d, 8
+    jl          .width4
+    movq        [r1], xm6
+    je          .nextH
+    psrldq      xm6, 8            ; drop the 8 pixels already stored
+    sub         r6d, 8
+    add         r1, 8
+
+.width4:
+    cmp         r6d, 4
+    jl          .width2
+    movd        [r1], xm6
+    je          .nextH
+    add         r1, 4
+    pshufd      xm6, xm6, 1       ; bring the next dword down to the low position
+
+.width2:
+    pextrw      [r1], xm6, 0
+
+.nextH:
+    mov         r0, tmp_r0
+    mov         r1, tmp_r1
+    lea         r0, [r0 + r2]
+    lea         r1, [r1 + r3]
+
+    dec         r5d
+    jnz         .loopH
+    RET
+
 ;-----------------------------------------------------------------
 ; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
 ;-----------------------------------------------------------------


More information about the x265-devel mailing list