[x265] [PATCH] asm: avx2 code for sign primitive: improve 204c->114c
Divya Manivannan
divya at multicorewareinc.com
Fri Apr 24 10:27:44 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429863478 -19800
# Fri Apr 24 13:47:58 2015 +0530
# Node ID deea3a0293187e142884b9aa2a719468f1ce5be6
# Parent a35fafa25df2c82fec9e44d95f0a29ba835b48ea
asm: avx2 code for sign primitive: improve 204c->114c
diff -r a35fafa25df2 -r deea3a029318 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Apr 23 12:32:49 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Fri Apr 24 13:47:58 2015 +0530
@@ -1720,6 +1720,7 @@
p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
p.saoCuOrgE3[1] = x265_saoCuOrgE3_32_avx2;
p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
+ p.sign = x265_calSign_avx2;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
diff -r a35fafa25df2 -r deea3a029318 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Thu Apr 23 12:32:49 2015 -0500
+++ b/source/common/x86/loopfilter.asm Fri Apr 24 13:47:58 2015 +0530
@@ -30,6 +30,8 @@
SECTION_RODATA 32
pb_31: times 32 db 31
pb_15: times 32 db 15
+pb_movemask_32: times 32 db 0x00 ; 64-byte table: 32 bytes of 0x00 then 32 bytes of 0xFF; calSign loads 32 bytes at (pb_movemask_32 + 32 - n)
+ times 32 db 0xFF ; so the loaded mask is 0x00 for its first n bytes and 0xFF for the remaining 32 - n
SECTION .text
cextern pb_1
@@ -911,3 +913,52 @@
.end:
RET
+INIT_YMM avx2
+cglobal calSign, 4, 5, 6 ; per the loopfilter.h prototype: r0 = dst, r1 = src1 (pRec), r2 = src2 (pTmpU), r3d = endX; writes dst[x] = +1 / -1 / 0 for pRec[x] >, <, == pTmpU[x]
+ vbroadcasti128 m0, [pb_128] ; m0 = 16-byte pb_128 copied to both lanes (presumably bytes of 0x80; defined outside this hunk)
+ mova m1, [pb_1] ; m1 = pb_1 (presumably bytes of 0x01; declared via cextern)
+
+ sub r1, r0 ; r1 = src1 - dst, so [r0 + r1] follows src1 while only r0 is advanced
+ sub r2, r0 ; r2 = src2 - dst, same trick for the second source
+
+ mov r4d, r3d ; keep endX for the tail computation
+ shr r3d, 5 ; r3d = number of full 32-byte vectors
+ jz .next ; fewer than 32 bytes: go straight to the tail
+.loop:
+ movu m2, [r0 + r1] ; m2 = pRec[x]
+ movu m3, [r0 + r2] ; m3 = pTmpU[x]
+ pxor m4, m2, m0 ; bias pRec by m0 so the signed byte compare below orders the raw byte values
+ pxor m3, m0 ; same bias for pTmpU
+ pcmpgtb m5, m4, m3 ; m5 = 0xFF where pRec > pTmpU
+ pcmpgtb m3, m4 ; m3 = 0xFF (-1) where pTmpU > pRec
+ pand m5, m1 ; reduce the 0xFF "greater" bytes to the m1 pattern (+1 if pb_1 is 0x01)
+ por m5, m3 ; merge: +1, -1, or 0 per byte
+ movu [r0], m5 ; store 32 results to dst
+
+ add r0, mmsize ; advance dst; both sources advance with it via the r1/r2 offsets
+ dec r3d ; one full vector done
+ jnz .loop
+
+ ; process partial (the endX % 32 trailing bytes)
+.next:
+ and r4d, 31 ; r4d = leftover byte count; the 32-bit write also clears the upper half of r4
+ jz .end ; endX was a multiple of 32: nothing left
+
+ movu m2, [r0 + r1] ; m2 = pRec[x] (NOTE: still a full 32-byte load, past the leftover count)
+ movu m3, [r0 + r2] ; m3 = pTmpU[x] (full 32-byte load as well)
+ pxor m4, m2, m0 ; same sign computation as in .loop
+ pxor m3, m0
+ pcmpgtb m5, m4, m3 ; m5 = 0xFF where pRec > pTmpU
+ pcmpgtb m3, m4 ; m3 = 0xFF (-1) where pTmpU > pRec
+ pand m5, m1
+ por m5, m3 ; m5 = 32 candidate results; only the first r4 are wanted
+
+ lea r3, [pb_movemask_32 + 32] ; address of the 0xFF half of the mask table
+ sub r3, r4 ; step back by the leftover count into the 0x00 half
+ movu m0, [r3] ; m0 = r4 bytes of 0x00 followed by (32 - r4) bytes of 0xFF
+ movu m3, [r0] ; m3 = current dst contents
+ pblendvb m5, m5, m3, m0 ; take old dst bytes where the mask is 0xFF, new results in the first r4 bytes
+ movu [r0], m5 ; full 32-byte store; bytes beyond the leftover count are written back unchanged
+
+.end:
+ RET
diff -r a35fafa25df2 -r deea3a029318 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Thu Apr 23 12:32:49 2015 -0500
+++ b/source/common/x86/loopfilter.h Fri Apr 24 13:47:58 2015 +0530
@@ -40,5 +40,6 @@
void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
+void x265_calSign_avx2(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
#endif // ifndef X265_LOOPFILTER_H
More information about the x265-devel
mailing list