[x265] [PATCH] asm: avx2 code for sign primitive: improve 204c->114c

Divya Manivannan divya at multicorewareinc.com
Fri Apr 24 10:27:44 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429863478 -19800
#      Fri Apr 24 13:47:58 2015 +0530
# Node ID deea3a0293187e142884b9aa2a719468f1ce5be6
# Parent  a35fafa25df2c82fec9e44d95f0a29ba835b48ea
asm: avx2 code for sign primitive: improve 204c->114c

diff -r a35fafa25df2 -r deea3a029318 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Apr 23 12:32:49 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Fri Apr 24 13:47:58 2015 +0530
@@ -1720,6 +1720,7 @@
         p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
         p.saoCuOrgE3[1] = x265_saoCuOrgE3_32_avx2;
         p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
+        p.sign = x265_calSign_avx2;
 
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
         p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
diff -r a35fafa25df2 -r deea3a029318 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Thu Apr 23 12:32:49 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Fri Apr 24 13:47:58 2015 +0530
@@ -30,6 +30,8 @@
 SECTION_RODATA 32
 pb_31:      times 32 db 31
 pb_15:      times 32 db 15
+pb_movemask_32:  times 32 db 0x00            ; 32 x 0x00 followed by 32 x 0xFF (64 bytes total);
+                 times 32 db 0xFF            ; a 32-byte load at offset (32 - n) yields n x 0x00 then (32 - n) x 0xFF
 
 SECTION .text
 cextern pb_1
@@ -911,3 +913,52 @@
 .end:
     RET
 
+INIT_YMM avx2                                   ; x86inc: use 256-bit ymm registers for m0..m5 (mmsize = 32)
+cglobal calSign, 4, 5, 6                        ; dst[x] = sign(src1[x] - src2[x]); r0 = dst, r1 = src1, r2 = src2, r3d = endX
+    vbroadcasti128  m0,     [pb_128]            ; m0 = sign-bit flip constant in every byte (assumes pb_128 is 16 x 0x80 - defined elsewhere)
+    mova            m1,     [pb_1]              ; m1 = 1 in every byte (assumes pb_1 is 32 aligned bytes of 1 - defined elsewhere)
+
+    sub             r1,     r0                  ; r1 = src1 - dst, so [r0 + r1] follows src1 while only r0 advances
+    sub             r2,     r0                  ; r2 = src2 - dst, same trick for src2
+
+    mov             r4d,    r3d                 ; keep endX; the tail length is derived from it later
+    shr             r3d,    5                   ; r3d = number of full 32-byte vectors (endX / 32)
+    jz              .next                       ; fewer than 32 elements: go straight to the tail
+.loop:
+    movu            m2,     [r0 + r1]            ; m2 = pRec[x]
+    movu            m3,     [r0 + r2]            ; m3 = pTmpU[x]
+    pxor            m4,     m2,     m0           ; flip sign bits so the signed pcmpgtb below orders bytes as unsigned
+    pxor            m3,     m0                   ; same bias for the second operand
+    pcmpgtb         m5,     m4,     m3           ; m5 = 0xFF where src1 > src2
+    pcmpgtb         m3,     m4                   ; m3 = 0xFF where src2 > src1
+    pand            m5,     m1                   ; m5 = +1 where src1 > src2, else 0
+    por             m5,     m3                   ; m5 = -1 (0xFF) where src1 < src2; 0 stays where equal
+    movu            [r0],   m5                   ; store 32 sign values to dst
+
+    add             r0,     mmsize               ; advance dst (and, via r1/r2, both sources) by one vector
+    dec             r3d                          ; one full vector done
+    jnz             .loop
+
+    ; process partial: the remaining (endX & 31) elements, if any
+.next:
+    and             r4d,    31                   ; r4d = leftover element count, 0..31 (upper half of r4 is zero)
+    jz              .end                         ; endX was a multiple of 32: nothing left
+
+    movu            m2,     [r0 + r1]            ; m2 = pRec[x]  (NOTE: full 32-byte load, reads past endX)
+    movu            m3,     [r0 + r2]            ; m3 = pTmpU[x] (NOTE: full 32-byte load, reads past endX)
+    pxor            m4,     m2,     m0           ; same sign computation as in .loop
+    pxor            m3,     m0
+    pcmpgtb         m5,     m4,     m3           ; m5 = 0xFF where src1 > src2
+    pcmpgtb         m3,     m4                   ; m3 = 0xFF where src2 > src1
+    pand            m5,     m1                   ; +1 lanes
+    por             m5,     m3                   ; -1 lanes merged in
+
+    lea             r3,     [pb_movemask_32 + 32] ; r3 -> start of the 0xFF half of the mask table
+    sub             r3,     r4                   ; step back r4 bytes into the 0x00 half
+    movu            m0,     [r3]                 ; m0 = r4 x 0x00 then (32 - r4) x 0xFF (m0 no longer holds the bias)
+    movu            m3,     [r0]                 ; m3 = current contents of dst
+    pblendvb        m5,     m5,     m3,     m0   ; mask 0x00 -> new sign value, mask 0xFF -> keep existing dst byte
+    movu            [r0],   m5                   ; full 32-byte store; bytes beyond endX are written back unchanged
+
+.end:
+    RET
diff -r a35fafa25df2 -r deea3a029318 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Thu Apr 23 12:32:49 2015 -0500
+++ b/source/common/x86/loopfilter.h	Fri Apr 24 13:47:58 2015 +0530
@@ -40,5 +40,6 @@
 void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
+void x265_calSign_avx2(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 
 #endif // ifndef X265_LOOPFILTER_H


More information about the x265-devel mailing list