[x265] [PATCH] asm: avx2 code for weight_pp() for 10 bpp

sumalatha at multicorewareinc.com sumalatha at multicorewareinc.com
Wed Jun 17 11:28:06 CEST 2015


# HG changeset patch
# User Sumalatha Polureddy<sumalatha at multicorewareinc.com>
# Date 1434533281 -19800
#      Wed Jun 17 14:58:01 2015 +0530
# Node ID 855feadb8cb588ec8333c2984cb6e0dbb4d09260
# Parent  be0ed447922cc81e809d296e75424bb71822aea7
asm: avx2 code for weight_pp() for 10 bpp

sse4
weight_pp  9.37x    6768.87         63435.43

avx2
weight_pp  16.45x   4187.86         68871.50

diff -r be0ed447922c -r 855feadb8cb5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jun 16 11:15:03 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jun 17 14:58:01 2015 +0530
@@ -1467,7 +1467,7 @@
 
         p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
         p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
-        // p.weight_pp = PFX(weight_pp_avx2); fails tests
+        p.weight_pp = PFX(weight_pp_avx2);
 
         p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
         p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
diff -r be0ed447922c -r 855feadb8cb5 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Jun 16 11:15:03 2015 +0530
+++ b/source/common/x86/pixel-util8.asm	Wed Jun 17 14:58:01 2015 +0530
@@ -1280,7 +1280,57 @@
 %endif  ; end of (HIGH_BIT_DEPTH == 0)
 
 
-
+%if HIGH_BIT_DEPTH
+INIT_YMM avx2
+cglobal weight_pp, 6, 7, 7        ; per 16-bit sample: [r1] = min(packus((([r0] * (w0<<4) + round) >> r7m) + r8m), [pw_1023])
+    shl          r5d, 4            ; r5d = w0<<4
+    mov          r6d, r6m          ; r6d = round
+    shl          r6d, 16
+    or           r6d, r5d          ; r6d = [round : w0<<4]; w0<<4 must be non-negative and fit in 16 bits, otherwise its upper bits corrupt round
+    movd         xm0, r6d          ; AVX2 vpbroadcastd only accepts an xmm/m32 source (GPR source is AVX-512), so stage the dword in xm0
+    vpbroadcastd m0, xm0           ; every dword of m0 = [w0<<4 (low word), round (high word)]
+
+    movd         xm1, r7m          ; xm1 = right-shift count used by psrad
+    vpbroadcastd m2, r8m           ; m2 = value added after the shift, in every dword
+    mova         m5, [pw_1]        ; multiplier paired with round in pmaddwd (pw_1 is defined elsewhere; presumably words of 1)
+    mova         m6, [pw_1023]     ; upper clamp applied with pminuw (defined elsewhere; presumably words of 1023)
+    add         r2d, r2d           ; r2/r3 arrive in 16-bit samples; double them to get bytes
+    add         r3d, r3d
+    sub          r2d, r3d          ; r2 = bytes to skip after each row (32-bit math: assumes r2 >= r3 and both fit in 31 bits)
+    shr          r3d, 5            ; r3 = 32-byte chunks per row; NOTE(review): assumes a non-zero multiple of 16 samples - confirm with callers
+
+.loopH:
+    mov          r5d, r3d          ; r5d = chunks left in this row (w0 is no longer needed)
+
+.loopW:
+    movu        m4, [r0]           ; load 16 source words
+    punpcklwd   m3, m4, m5         ; low 4 words of each 128-bit lane interleaved with pw_1
+    pmaddwd     m3, m0             ; src * (w0<<4) + [pw_1] * round, as 32-bit sums
+    psrad       m3, xm1            ; arithmetic shift right by r7m
+    paddd       m3, m2             ; add r8m
+
+    punpckhwd   m4, m5             ; same sequence for the high 4 words of each lane
+    pmaddwd     m4, m0
+    psrad       m4, xm1
+    paddd       m4, m2
+
+    packusdw    m3, m4             ; per-lane pack restores the order split by the per-lane unpacks; saturates to unsigned 16 bits
+    pminuw      m3, m6             ; clamp to [pw_1023]
+    movu        [r1], m3           ; store 16 result words
+
+    add         r0, 32
+    add         r1, 32
+
+    dec         r5d
+    jnz         .loopW
+
+    lea         r0, [r0 + r2]      ; step both pointers to the start of the next row (same r2 for source and destination)
+    lea         r1, [r1 + r2]
+
+    dec         r4d                ; r4d = rows remaining
+    jnz         .loopH
+    RET
+%else
 INIT_YMM avx2
 cglobal weight_pp, 6, 7, 6
 
@@ -1329,7 +1379,7 @@
     dec         r4d
     jnz         .loopH
     RET
-
+%endif
 ;-------------------------------------------------------------------------------------------------------------------------------------------------
 ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
 ;-------------------------------------------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list