[x265] [PATCH] asm: saoCuOrgE2[1] avx2 code: improve 449c->292c
Divya Manivannan
divya at multicorewareinc.com
Thu Apr 23 15:24:25 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429795103 -19800
# Thu Apr 23 18:48:23 2015 +0530
# Node ID d94c86306da98592882cf266535aa31bf82d5643
# Parent 8d9267b2128bb647557def1b893d1c1bbada4e37
asm: saoCuOrgE2[1] avx2 code: improve 449c->292c
diff -r 8d9267b2128b -r d94c86306da9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Apr 23 18:40:34 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Apr 23 18:48:23 2015 +0530
@@ -1708,6 +1708,7 @@
p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
p.saoCuOrgE2[0] = x265_saoCuOrgE2_avx2;
+ p.saoCuOrgE2[1] = x265_saoCuOrgE2_32_avx2;
p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
p.saoCuOrgE3[1] = x265_saoCuOrgE3_32_avx2;
p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
diff -r 8d9267b2128b -r d94c86306da9 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Thu Apr 23 18:40:34 2015 +0530
+++ b/source/common/x86/loopfilter.asm Thu Apr 23 18:48:23 2015 +0530
@@ -487,6 +487,55 @@
movu [r0], xm2
RET
+INIT_YMM avx2 ; saoCuOrgE2_32: per 32-pixel chunk, derive an edge type from rec[x] vs rec[x + stride + 1] plus buff1, add offsetEo[edgeType] to rec, and store the negated sign to bufft + 1
+cglobal saoCuOrgE2_32, 5, 6, 8, rec, bufft, buff1, offsetEo, lcuWidth ; r0 = rec, r1 = bufft, r2 = buff1, r3 = offsetEo, r4 = lcuWidth; stride is fetched by hand below
+ mov r5d, r5m ; r5 = stride, low 32 bits only. NOTE(review): presumably stride is non-negative and fits in 32 bits - confirm against callers
+ pxor m0, m0 ; m0 = 0
+ mova m6, [pb_2] ; m6 = 2 in every byte; full-width load, so pb_2 presumably provides 32 bytes - confirm
+ vbroadcasti128 m7, [pb_128] ; m7 = 16 bytes of pb_128 copied to both lanes; used as the sign-bit bias below (presumably 0x80 per byte)
+ vbroadcasti128 m5, [r3] ; m5 = offsetEo, 16 bytes read and copied to both lanes so the in-lane pshufb sees the same table
+ shr r4d, 5 ; r4d = lcuWidth / 32 = loop iterations; any remainder below 32 pixels is not processed
+ inc r1 ; r1 = bufft + 1, so the sign stores land one byte further along
+ ; ---- main loop: 32 pixels per iteration ----
+.loop:
+ movu m1, [r0] ; m1 = rec[x]
+ movu m2, [r0 + r5 + 1] ; m2 = rec[x + stride + 1]
+ pxor m3, m1, m7 ; m3 = rec[x] with the bias applied, so the signed byte compare orders pixels as unsigned
+ pxor m4, m2, m7 ; m4 = neighbour with the same bias
+ pcmpgtb m2, m3, m4 ; m2 = 0xFF where rec[x] > neighbour
+ pcmpgtb m4, m3 ; m4 = 0xFF (-1) where neighbour > rec[x]
+ pand m2, [pb_1] ; m2 = +1 where rec[x] > neighbour (pb_1 presumably holds 1 in every byte)
+ por m2, m4 ; m2 = sign(rec[x] - neighbour) in {-1, 0, +1}
+ movu m3, [r2] ; m3 = buff1
+ ; edgeType = buff1[x] + sign + 2
+ paddb m3, m2
+ paddb m3, m6 ; m3 = edgeType
+ ; per-byte table lookup: m4 = offsetEo[edgeType]
+ pshufb m4, m5, m3
+ ; store the negated sign to bufft + 1 + x
+ psubb m3, m0, m2
+ movu [r1], m3
+ ; widen pixels (zero-extend) and offsets (sign-extend) to 16 bits
+ pmovzxbw m2, xm1 ; m2 = pixels 0..15 as words
+ vextracti128 xm1, m1, 1
+ pmovzxbw m1, xm1 ; m1 = pixels 16..31 as words
+ pmovsxbw m3, xm4 ; m3 = offsets 0..15 as signed words
+ vextracti128 xm4, m4, 1
+ pmovsxbw m4, xm4 ; m4 = offsets 16..31 as signed words
+ ; add offsets, saturate back to bytes, restore pixel order
+ paddw m2, m3
+ paddw m1, m4
+ packuswb m2, m1 ; unsigned-saturating pack, done per 128-bit lane: qwords hold pixels 0-7, 16-23, 8-15, 24-31
+ vpermq m2, m2, 11011000b ; swap the two middle qwords so the pixels are back in 0..31 order
+ movu [r0], m2 ; write the 32 filtered pixels back over rec
+ ; advance all three pointers by one 32-byte chunk
+ add r0, 32
+ add r1, 32
+ add r2, 32
+ dec r4d ; NOTE(review): the count is tested after the body, so lcuWidth < 32 (count 0) would wrap instead of exiting - confirm callers guarantee >= 32
+ jnz .loop
+ RET
+
;=======================================================================================================
;void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX)
;=======================================================================================================
diff -r 8d9267b2128b -r d94c86306da9 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Thu Apr 23 18:40:34 2015 +0530
+++ b/source/common/x86/loopfilter.h Thu Apr 23 18:48:23 2015 +0530
@@ -33,6 +33,7 @@
void x265_saoCuOrgE1_2Rows_avx2(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE2_avx2(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
+void x265_saoCuOrgE2_32_avx2(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgE3_32_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
More information about the x265-devel
mailing list