[x265] [PATCH] asm: saoCuOrgB0 avx2 code: 23780c->18441c
Divya Manivannan
divya at multicorewareinc.com
Fri Apr 10 15:19:34 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1428671123 -19800
# Fri Apr 10 18:35:23 2015 +0530
# Node ID ea96d405de730e2dd0661c869c4c557ca877a9ec
# Parent 984e254f93f7cedc5a9b00851d2e14b49dc94e91
asm: saoCuOrgB0 avx2 code: 23780c->18441c
diff -r 984e254f93f7 -r ea96d405de73 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Apr 09 11:48:08 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Fri Apr 10 18:35:23 2015 +0530
@@ -1563,6 +1563,7 @@
p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
+ p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
diff -r 984e254f93f7 -r ea96d405de73 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Thu Apr 09 11:48:08 2015 -0500
+++ b/source/common/x86/loopfilter.asm Fri Apr 10 18:35:23 2015 +0530
@@ -28,8 +28,8 @@
%include "x86inc.asm"
SECTION_RODATA 32
-pb_31: times 16 db 31
-pb_15: times 16 db 15
+pb_31: times 32 db 31 ; 32 bytes: loaded as a full ymm register by saoCuOrgB0 avx2 (mova m7, [pb_31])
+pb_15: times 32 db 15 ; 32 bytes: used as a ymm memory operand of pcmpgtb in saoCuOrgB0 avx2
SECTION .text
cextern pb_1
@@ -584,6 +584,82 @@
jnz .loopH
RET
+INIT_YMM avx2
+cglobal saoCuOrgB0, 4, 7, 8
+ ; rec[x] = clip_u8(rec[x] + offsetBo[rec[x] >> 3]); r0 = rec, r1 = offsetBo, r2 = width, r3 = height, r4 = stride
+ ; each row is consumed as (width >> 4) 16-byte columns; rows go two per pass, an odd last row is handled alone
+ mov r3d, r3m
+ mov r4d, r4m
+ mova m7, [pb_31] ; m7 = 31 in every byte (5-bit index mask)
+ vbroadcasti128 m3, [r1 + 0] ; offset[0-15], copied to both lanes
+ vbroadcasti128 m4, [r1 + 16] ; offset[16-31], copied to both lanes
+ lea r6, [r4 * 2]
+ sub r6d, r2d ; r6 = 2 * stride - width: step from the end of a row to the next row pair
+ shr r2d, 4 ; r2 = 16-byte columns per row
+ mov r1d, r3d ; r1 = height, kept for the odd-row test
+ shr r3d, 1 ; r3 = number of row pairs; ZF = 1 when there are none
+ jz .oddRow ; height < 2: skip the pair loop, its counter would wrap around from 0
+.loopH:
+ mov r5d, r2d
+.loopW:
+ movu xm2, [r0] ; low lane = 16 pixels of the first row
+ vinserti128 m2, m2, [r0 + r4], 1 ; high lane = 16 pixels of the second row
+ psrlw m1, m2, 3
+ pand m1, m7 ; m1 = [index] = pixel >> 3 (mask clears bits shifted in from the neighbouring byte)
+ pcmpgtb m0, m1, [pb_15] ; m0 = [mask], set where index > 15
+
+ pshufb m6, m3, m1 ; candidate from offset[0-15]
+ pshufb m5, m4, m1 ; candidate from offset[16-31]
+ pblendvb m6, m6, m5, m0 ; NOTE: keep the 4-operand form; the 3-operand style hits a bug in the x264 macro
+
+ pmovzxbw m1, xm2 ; rec, first row -> 16 words
+ vextracti128 xm2, m2, 1
+ pmovzxbw m2, xm2 ; rec, second row -> 16 words
+ pmovsxbw m0, xm6 ; offset, first row (sign-extended)
+ vextracti128 xm6, m6, 1
+ pmovsxbw m6, xm6 ; offset, second row (sign-extended)
+
+ paddw m1, m0
+ paddw m2, m6
+ packuswb m1, m2 ; saturate to unsigned bytes; packs within each 128-bit lane
+ vpermq m1, m1, 11011000b ; qword order 0,2,1,3: low lane = first row, high lane = second row
+
+ movu [r0], xm1
+ vextracti128 [r0 + r4], m1, 1
+ add r0, 16
+ dec r5d
+ jnz .loopW
+
+ add r0, r6
+ dec r3d
+ jnz .loopH
+.oddRow:
+ test r1b, 1
+ jz .end ; even height: nothing left to do
+ mov r5d, r2d
+.loopW1:
+ movu xm2, [r0] ; m2 = [rec]
+ psrlw xm1, xm2, 3
+ pand xm1, xm7 ; m1 = [index]
+ pcmpgtb xm0, xm1, [pb_15] ; m0 = [mask]
+
+ pshufb xm6, xm3, xm1
+ pshufb xm5, xm4, xm1
+ pblendvb xm6, xm6, xm5, xm0 ; NOTE: keep the 4-operand form; the 3-operand style hits a bug in the x264 macro
+
+ pmovzxbw m1, xm2 ; rec
+ pmovsxbw m0, xm6 ; offset
+ paddw m1, m0
+ vextracti128 xm0, m1, 1
+ packuswb xm1, xm0 ; saturate the 16 sums back to unsigned bytes
+
+ movu [r0], xm1
+ add r0, 16
+ dec r5d
+ jnz .loopW1
+.end:
+ RET
+
;============================================================================================================
; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)
;============================================================================================================
diff -r 984e254f93f7 -r ea96d405de73 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Thu Apr 09 11:48:08 2015 -0500
+++ b/source/common/x86/loopfilter.h Fri Apr 10 18:35:23 2015 +0530
@@ -34,6 +34,7 @@
void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
+void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
#endif // ifndef X265_LOOPFILTER_H
diff -r 984e254f93f7 -r ea96d405de73 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Thu Apr 09 11:48:08 2015 -0500
+++ b/source/test/pixelharness.cpp Fri Apr 10 18:35:23 2015 +0530
@@ -1156,8 +1156,8 @@
for (int i = 0; i < ITERS; i++)
{
int width = 16 * (rand() % 4 + 1);
- int height = rand() % 64 +1;
- int stride = rand() % 65;
+ int height = rand() % 63 + 2;
+ int stride = width;
ref(ref_dest, psbuf1 + j, width, height, stride);
checked(opt, opt_dest, psbuf1 + j, width, height, stride);
More information about the x265-devel
mailing list