[x265] [PATCH] asm: avx2 code for weight_sp() for 8bpp

sumalatha at multicorewareinc.com sumalatha at multicorewareinc.com
Thu Apr 2 11:51:08 CEST 2015


# HG changeset patch
# User Sumalatha Polureddy
# Date 1427968258 -19800
#      Thu Apr 02 15:20:58 2015 +0530
# Node ID 7f976e1e89c5940a8bb2f5b965ebd9ed6e6948a6
# Parent  ac85c775620f1dcb0df056874633cbf916098bd2
asm: avx2 code for weight_sp() for 8bpp

sse4
weight_sp  16.40x   7768.71         127369.20

avx2
weight_sp  25.83x   4918.74         127040.17

diff -r ac85c775620f -r 7f976e1e89c5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Thu Apr 02 15:20:58 2015 +0530
@@ -1604,6 +1604,7 @@
 
         p.scale1D_128to64 = x265_scale1D_128to64_avx2;
         p.weight_pp = x265_weight_pp_avx2;
+        p.weight_sp = x265_weight_sp_avx2;
 
         // intra_pred functions
         p.cu[BLOCK_8x8].intra_pred[3] = x265_intra_pred_ang8_3_avx2;
diff -r ac85c775620f -r 7f976e1e89c5 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/pixel-util8.asm	Thu Apr 02 15:20:58 2015 +0530
@@ -1492,6 +1492,94 @@
     dec         r5d
     jnz         .loopH
     RET
+
+;-----------------------------------------------------------------------------------------------------------------------------
+; void weight_sp(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+;
+; Per sample: dst[x] = sat_u8((((src[x] + pw_2000) * w0 + round) >> shift) + offset)
+; 16 samples are computed per iteration; the row tail is stored as 8, 4 and/or 2 bytes.
+;
+; r0/r1 = current src/dst position, r7/r8 = start of current src/dst row
+; r2 = src stride in bytes, r3 = dst stride, r4d = width, r5d = rows left, r6d = columns left
+;-----------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_YMM avx2
+cglobal weight_sp, 6, 9, 7
+    mov             r7d, r7m            ; round
+    shl             r7d, 16
+    or              r7d, r6m            ; r7d = (round << 16) | w0, assumes w0 fits in 16 bits
+    movd            xm0, r7d            ; AVX2 vpbroadcastd cannot read a GPR, go through xmm
+    vpbroadcastd    m0, xm0             ; m0 = times 8 dw w0, round
+    movd            xm1, r8m            ; m1 = [shift]
+    vpbroadcastd    m2, r9m             ; m2 = times 8 dd offset
+    vpbroadcastw    m3, [pw_1]
+    vpbroadcastw    m4, [pw_2000]
+
+    add             r2, r2              ; 2 * srcstride (int16_t samples); 64-bit since stride is intptr_t
+
+    mov             r7, r0
+    mov             r8, r1
+.loopH:
+    mov             r6d, r4d            ; width
+
+    ; restore src and dst to the start of the current row
+    mov             r0, r7              ; src
+    mov             r1, r8              ; dst
+.loopW:
+    movu            m5, [r0]
+    paddw           m5, m4
+
+    punpcklwd       m6, m5, m3          ; pair every sample with 1 so pmaddwd yields x * w0 + round
+    pmaddwd         m6, m0
+    psrad           m6, xm1
+    paddd           m6, m2
+
+    punpckhwd       m5, m3
+    pmaddwd         m5, m0
+    psrad           m5, xm1
+    paddd           m5, m2
+
+    packssdw        m6, m5
+    packuswb        m6, m6
+    vpermq          m6, m6, 10001000b   ; gather both lanes' 8 bytes into xm6
+
+    sub             r6d, 16
+    jl              .width8
+    movu            [r1], xm6
+    je              .nextH              ; flags still from the sub above
+    add             r0, 32
+    add             r1, 16
+    jmp             .loopW
+
+.width8:
+    add             r6d, 16
+    cmp             r6d, 8
+    jl              .width4
+    movq            [r1], xm6
+    je              .nextH
+    psrldq          m6, 8
+    sub             r6d, 8
+    add             r1, 8
+
+.width4:
+    cmp             r6d, 4
+    jl              .width2
+    movd            [r1], xm6
+    je              .nextH
+    add             r1, 4
+    pshufd          m6, m6, 1
+
+.width2:
+    pextrw          [r1], xm6, 0
+
+.nextH:
+    lea             r7, [r7 + r2]
+    lea             r8, [r8 + r3]
+
+    dec             r5d
+    jnz             .loopH
+    RET
+%endif
 %endif  ; end of (HIGH_BIT_DEPTH == 0)
     
 
diff -r ac85c775620f -r 7f976e1e89c5 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/pixel.h	Thu Apr 02 15:20:58 2015 +0530
@@ -272,6 +272,7 @@
 int x265_psyCost_ss_16x16_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
 int x265_psyCost_ss_32x32_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
 int x265_psyCost_ss_64x64_avx2(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
+void x265_weight_sp_avx2(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
 
 #undef DECL_PIXELS
 #undef DECL_HEVC_SSD


More information about the x265-devel mailing list