[x265] [PATCH 2 of 2] asm: asm code for deblocking filter horizontal and vertical
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Oct 9 14:47:08 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1444286180 -19800
# Thu Oct 08 12:06:20 2015 +0530
# Node ID 86627e458e6e2e357fe1746067392c6984b8915f
# Parent 38e4b94377fa6ffe57472c49ecff6c909ed4f6dc
asm: asm code for deblocking filter horizontal and vertical
diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Oct 06 14:19:56 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Oct 08 12:06:20 2015 +0530
@@ -2541,6 +2541,9 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
#if X86_64
+ p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
+ p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
+
p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Tue Oct 06 14:19:56 2015 +0530
+++ b/source/common/x86/const-a.asm Thu Oct 08 12:06:20 2015 +0530
@@ -67,6 +67,7 @@
;; 16-bit constants
+const pw_n1, times 16 dw -1
const pw_1, times 16 dw 1
const pw_2, times 16 dw 2
const pw_3, times 16 dw 3
diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Oct 06 14:19:56 2015 +0530
+++ b/source/common/x86/loopfilter.asm Thu Oct 08 12:06:20 2015 +0530
@@ -37,6 +37,7 @@
SECTION .text
cextern pb_1
+cextern pb_01
cextern pb_128
cextern pb_2
cextern pw_2
@@ -45,6 +46,8 @@
cextern pw_1
cextern hmul_16p
cextern pb_4
+cextern pw_4
+cextern pw_n1
;============================================================================================================
@@ -2231,6 +2234,248 @@
RET
%endif ; ARCH_X86_64
+%if ARCH_X86_64
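+;; argument registers on entry (both functions) -
+; r0 - src; taps are read from src[-4 * offset] .. src[3 * offset], src[-3 * offset] .. src[2 * offset] are rewritten
+; r1 - srcStep (overwritten by _H, which loads consecutive bytes, i.e. behaves as srcStep == 1)
+; r2 - offset (overwritten by _V, which uses fixed byte displacements, i.e. behaves as offset == 1)
+; r3 - tcP (clamp bound for the three pixels at negative offsets)
+; r4 - tcQ (clamp bound for the three pixels at src[0] and positive offsets)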
+
+INIT_XMM sse4
+cglobal pelFilterLumaStrong_H, 5,7,10 ; x86inc: 5 args, 7 GPRs (r0-r6), 10 XMM regs (m0-m9)
+ mov r1, r2 ; incoming srcStep is discarded; r1 is reused for the offset
+ neg r3d ; r3d = -tcP
+ neg r4d ; r4d = -tcQ
+ neg r1 ; r1 = -offset
+
+ lea r5, [r2 * 3] ; r5 = offset * 3
+ lea r6, [r1 * 3] ; r6 = -offset * 3
+
+ pmovzxbw m4, [r0] ; m4 = src[0], 8 bytes widened to words
+ pmovzxbw m3, [r0 + r1] ; m3 = src[-offset]
+ pmovzxbw m2, [r0 + r1 * 2] ; m2 = src[-offset * 2]
+ pmovzxbw m1, [r0 + r6] ; m1 = src[-offset * 3]
+ pmovzxbw m0, [r0 + r1 * 4] ; m0 = src[-offset * 4]
+ pmovzxbw m5, [r0 + r2] ; m5 = src[offset]
+ pmovzxbw m6, [r0 + r2 * 2] ; m6 = src[offset * 2]
+ pmovzxbw m7, [r0 + r5] ; m7 = src[offset * 3]
+
+ paddw m0, m0 ; 2*m0
+ mova m8, m2
+ paddw m8, m3 ; m2 + m3
+ paddw m8, m4 ; m2 + m3 + m4
+ mova m9, m8
+ paddw m9, m9 ; 2*m2 + 2*m3 + 2*m4
+ paddw m8, m1 ; m1 + m2 + m3 + m4 (unrounded sum for src[-offset * 2])
+ paddw m0, m8 ; 2*m0 + m1 + m2 + m3 + m4
+ paddw m9, m1 ; m1 + 2*m2 + 2*m3 + 2*m4
+ paddw m0, m1 ; 2*m0 + 2*m1 + m2 + m3 + m4
+ paddw m9, m5 ; m1 + 2*m2 + 2*m3 + 2*m4 + m5 (unrounded sum for src[-offset])
+ paddw m0, m1 ; 2*m0 + 3*m1 + m2 + m3 + m4 (unrounded sum for src[-offset * 3])
+
+ punpcklqdq m0, m9 ; keep only the low 4 words of each: m0 = sum for src[-offset*3] | sum for src[-offset]
+ punpcklqdq m1, m3 ; m1 = original src[-offset*3] | original src[-offset]
+
+ paddw m3, m4 ; m3 + m4
+ mova m9, m5
+ paddw m9, m6 ; m5 + m6
+ paddw m7, m7 ; 2*m7
+ paddw m9, m3 ; m3 + m4 + m5 + m6 (unrounded sum for src[offset])
+ mova m3, m9
+ paddw m3, m3 ; 2*m3 + 2*m4 + 2*m5 + 2*m6
+ paddw m7, m9 ; 2*m7 + m3 + m4 + m5 + m6
+ paddw m7, m6 ; m3 + m4 + m5 + 2*m6 + 2*m7
+ psubw m3, m6 ; 2*m3 + 2*m4 + 2*m5 + m6
+ paddw m7, m6 ; m3 + m4 + m5 + 3*m6 + 2*m7 (unrounded sum for src[offset * 2])
+ paddw m3, m2 ; m2 + 2*m3 + 2*m4 + 2*m5 + m6 (unrounded sum for src[0])
+
+ punpcklqdq m9, m8 ; m9 = sum for src[offset] | sum for src[-offset*2]
+ punpcklqdq m3, m7 ; m3 = sum for src[0] | sum for src[offset*2]
+ punpcklqdq m5, m2 ; m5 = original src[offset] | original src[-offset*2]
+ punpcklqdq m4, m6 ; m4 = original src[0] | original src[offset*2]
+
+ movd m7, r3d ; -tcP
+ movd m2, r4d ; -tcQ
+ pshufb m7, [pb_01] ; replicate -tcP; pb_01 is defined outside this patch, presumably the byte pattern 0,1 repeated (word broadcast) - confirm
+ pshufb m2, [pb_01] ; replicate -tcQ the same way
+ mova m6, m2
+ punpcklqdq m6, m7 ; m6 = -tcQ | -tcP, matching the Q | P halves of m9
+
+ paddw m0, [pw_4] ; rounding term for the >> 3 results
+ paddw m3, [pw_4]
+ paddw m9, [pw_2] ; rounding term for the >> 2 results
+
+ psraw m0, 3 ; filtered src[-offset*3] | src[-offset]
+ psraw m3, 3 ; filtered src[0] | src[offset*2]
+ psraw m9, 2 ; filtered src[offset] | src[-offset*2]
+
+ psubw m0, m1 ; delta = filtered - original
+ psubw m3, m4
+ psubw m9, m5
+
+ pmaxsw m0, m7 ; delta = max(delta, -tcP)
+ pmaxsw m3, m2 ; delta = max(delta, -tcQ)
+ pmaxsw m9, m6 ; per half: max(delta, -tcQ | -tcP)
+ psignw m7, [pw_n1] ; pw_n1 is all -1 words, so psignw negates: m7 = +tcP
+ psignw m2, [pw_n1] ; m2 = +tcQ
+ psignw m6, [pw_n1] ; m6 = +tcQ | +tcP
+ pminsw m0, m7 ; delta = min(delta, tcP)
+ pminsw m3, m2 ; delta = min(delta, tcQ)
+ pminsw m9, m6 ; per half: min(delta, tcQ | tcP)
+
+ paddw m0, m1 ; result = original + clamped delta
+ paddw m3, m4
+ paddw m9, m5
+ packuswb m0, m0 ; bytes 0-3: src[-offset*3], bytes 4-7: src[-offset]
+ packuswb m3, m9 ; dwords 0..3: src[0], src[offset*2], src[offset], src[-offset*2]
+
+ movd [r0 + r6], m0 ; store 4 pixels at src[-offset * 3]
+ pextrd [r0 + r1], m0, 1 ; store 4 pixels at src[-offset]
+ movd [r0], m3 ; store 4 pixels at src[0]
+ pextrd [r0 + r2 * 2], m3, 1 ; store 4 pixels at src[offset * 2]
+ pextrd [r0 + r2 * 1], m3, 2 ; store 4 pixels at src[offset]
+ pextrd [r0 + r1 * 2], m3, 3 ; store 4 pixels at src[-offset * 2]
+ RET
+
+INIT_XMM sse4
+cglobal pelFilterLumaStrong_V, 5,5,10 ; x86inc: 5 args, 5 GPRs (r0-r4), 10 XMM regs (m0-m9)
+ neg r3d ; r3d = -tcP
+ neg r4d ; r4d = -tcQ
+ lea r2, [r1 * 3] ; incoming offset is discarded; r2 = srcStep * 3 (address of row 3)
+
+ movh m0, [r0 - 4] ; row 0: src[-4] .. src[3]
+ movh m1, [r0 + r1 * 1 - 4] ; row 1: src[-4] .. src[3]
+ movh m2, [r0 + r1 * 2 - 4] ; row 2: src[-4] .. src[3]
+ movh m3, [r0 + r2 * 1 - 4] ; row 3: src[-4] .. src[3]
+
+ punpcklbw m0, m1 ; interleave bytes of rows 0 and 1
+ punpcklbw m2, m3 ; interleave bytes of rows 2 and 3
+ mova m4, m0
+ punpcklwd m0, m2 ; dword k = column k (src[k - 4]) for rows 0..3, k = 0..3
+ punpckhwd m4, m2 ; dword k = column k + 4 (src[k]) for rows 0..3, k = 0..3
+ mova m1, m0
+ mova m2, m0
+ mova m3, m0
+ pshufd m0, m0, 0 ; dword 0 = src[-4] column
+ pshufd m1, m1, 1 ; dword 0 = src[-3] column
+ pshufd m2, m2, 2 ; dword 0 = src[-2] column
+ pshufd m3, m3, 3 ; dword 0 = src[-1] column
+ mova m5, m4
+ mova m6, m4
+ mova m7, m4
+ pshufd m4, m4, 0 ; dword 0 = src[0] column
+ pshufd m5, m5, 1 ; dword 0 = src[1] column
+ pshufd m6, m6, 2 ; dword 0 = src[2] column
+ pshufd m7, m7, 3 ; dword 0 = src[3] column
+ pmovzxbw m0, m0 ; widen to words; only the low 4 words (rows 0..3) are used below
+ pmovzxbw m1, m1
+ pmovzxbw m2, m2
+ pmovzxbw m3, m3
+ pmovzxbw m4, m4
+ pmovzxbw m5, m5
+ pmovzxbw m6, m6
+ pmovzxbw m7, m7
+
+ paddw m0, m0 ; 2*m0 (from here on mN = column src[N - 4])
+ mova m8, m2
+ paddw m8, m3 ; m2 + m3
+ paddw m8, m4 ; m2 + m3 + m4
+ mova m9, m8
+ paddw m9, m9 ; 2*m2 + 2*m3 + 2*m4
+ paddw m8, m1 ; m1 + m2 + m3 + m4 (unrounded sum for src[-2])
+ paddw m0, m8 ; 2*m0 + m1 + m2 + m3 + m4
+ paddw m9, m1 ; m1 + 2*m2 + 2*m3 + 2*m4
+ paddw m0, m1 ; 2*m0 + 2*m1 + m2 + m3 + m4
+ paddw m9, m5 ; m1 + 2*m2 + 2*m3 + 2*m4 + m5 (unrounded sum for src[-1])
+ paddw m0, m1 ; 2*m0 + 3*m1 + m2 + m3 + m4 (unrounded sum for src[-3])
+
+ punpcklqdq m0, m9 ; m0 = sum for src[-3] | sum for src[-1]
+ punpcklqdq m1, m3 ; m1 = original src[-3] | original src[-1]
+
+ paddw m3, m4 ; m3 + m4
+ mova m9, m5
+ paddw m9, m6 ; m5 + m6
+ paddw m7, m7 ; 2*m7
+ paddw m9, m3 ; m3 + m4 + m5 + m6 (unrounded sum for src[1])
+ mova m3, m9
+ paddw m3, m3 ; 2*m3 + 2*m4 + 2*m5 + 2*m6
+ paddw m7, m9 ; 2*m7 + m3 + m4 + m5 + m6
+ paddw m7, m6 ; m3 + m4 + m5 + 2*m6 + 2*m7
+ psubw m3, m6 ; 2*m3 + 2*m4 + 2*m5 + m6
+ paddw m7, m6 ; m3 + m4 + m5 + 3*m6 + 2*m7 (unrounded sum for src[2])
+ paddw m3, m2 ; m2 + 2*m3 + 2*m4 + 2*m5 + m6 (unrounded sum for src[0])
+
+ punpcklqdq m9, m8 ; m9 = sum for src[1] | sum for src[-2]
+ punpcklqdq m3, m7 ; m3 = sum for src[0] | sum for src[2]
+ punpcklqdq m5, m2 ; m5 = original src[1] | original src[-2]
+ punpcklqdq m4, m6 ; m4 = original src[0] | original src[2]
+
+ movd m7, r3d ; -tcP
+ movd m2, r4d ; -tcQ
+ pshufb m7, [pb_01] ; replicate -tcP; pb_01 is defined outside this patch, presumably the byte pattern 0,1 repeated (word broadcast) - confirm
+ pshufb m2, [pb_01] ; replicate -tcQ the same way
+ mova m6, m2
+ punpcklqdq m6, m7 ; m6 = -tcQ | -tcP, matching the Q | P halves of m9
+
+ paddw m0, [pw_4] ; rounding term for the >> 3 results
+ paddw m3, [pw_4]
+ paddw m9, [pw_2] ; rounding term for the >> 2 results
+
+ psraw m0, 3 ; filtered src[-3] | src[-1]
+ psraw m3, 3 ; filtered src[0] | src[2]
+ psraw m9, 2 ; filtered src[1] | src[-2]
+
+ psubw m0, m1 ; delta = filtered - original
+ psubw m3, m4
+ psubw m9, m5
+
+ pmaxsw m0, m7 ; delta = max(delta, -tcP)
+ pmaxsw m3, m2 ; delta = max(delta, -tcQ)
+ pmaxsw m9, m6 ; per half: max(delta, -tcQ | -tcP)
+ psignw m7, [pw_n1] ; pw_n1 is all -1 words, so psignw negates: m7 = +tcP
+ psignw m2, [pw_n1] ; m2 = +tcQ
+ psignw m6, [pw_n1] ; m6 = +tcQ | +tcP
+ pminsw m0, m7 ; delta = min(delta, tcP)
+ pminsw m3, m2 ; delta = min(delta, tcQ)
+ pminsw m9, m6 ; per half: min(delta, tcQ | tcP)
+
+ paddw m0, m1 ; result = original + clamped delta
+ paddw m3, m4
+ paddw m9, m5
+ packuswb m0, m0 ; bytes 0-3: src[-3] column, bytes 4-7: src[-1] column
+ packuswb m3, m9 ; dwords 0..3: src[0], src[2], src[1], src[-2] columns
+
+ ; six output columns, 4 rows each; "col n" below means output column n = src[n - 3]
+ ; m0 - col 0 (low dword)
+ ; m3 - col 3 (low dword)
+ mova m1, m0
+ mova m2, m3
+ mova m4, m3
+ mova m5, m3
+ pshufd m1, m1, 1 ; col 2 (src[-1]) into the low dword
+ pshufd m2, m2, 1 ; col 5 (src[2]) into the low dword
+ pshufd m4, m4, 2 ; col 4 (src[1]) into the low dword
+ pshufd m5, m5, 3 ; col 1 (src[-2]) into the low dword
+
+ ; transpose the column-major results back to row order
+ punpcklbw m0, m5 ; per row: col 0, col 1
+ punpcklbw m1, m3 ; per row: col 2, col 3
+ punpcklbw m4, m2 ; per row: col 4, col 5 (word n = row n)
+ punpcklwd m0, m1 ; dword n = row n, cols 0..3
+
+ movd [r0 + r1 * 0 - 3], m0 ; row 0: src[-3] .. src[0]
+ pextrd [r0 + r1 * 1 - 3], m0, 1 ; row 1: src[-3] .. src[0]
+ pextrd [r0 + r1 * 2 - 3], m0, 2 ; row 2: src[-3] .. src[0]
+ pextrd [r0 + r2 * 1 - 3], m0, 3 ; row 3: src[-3] .. src[0]
+ pextrw [r0 + r1 * 0 + 1], m4, 0 ; row 0: src[1], src[2]
+ pextrw [r0 + r1 * 1 + 1], m4, 1 ; row 1: src[1], src[2]
+ pextrw [r0 + r1 * 2 + 1], m4, 2 ; row 2: src[1], src[2]
+ pextrw [r0 + r2 * 1 + 1], m4, 3 ; row 3: src[1], src[2]
+ RET
+%endif ; ARCH_X86_64
+
+
;void saoCuStatsE2_c(const int16_t *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
;{
diff -r 38e4b94377fa -r 86627e458e6e source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue Oct 06 14:19:56 2015 +0530
+++ b/source/common/x86/loopfilter.h Thu Oct 08 12:06:20 2015 +0530
@@ -46,4 +46,7 @@
DECL_SAO(sse4);
DECL_SAO(avx2);
+void PFX(pelFilterLumaStrong_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+void PFX(pelFilterLumaStrong_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+
#endif // ifndef X265_LOOPFILTER_H
diff -r 38e4b94377fa -r 86627e458e6e source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Oct 06 14:19:56 2015 +0530
+++ b/source/test/pixelharness.cpp Thu Oct 08 12:06:20 2015 +0530
@@ -1835,6 +1835,72 @@
return true;
}
+bool PixelHarness::check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt)
+{
+    // Filter taps sit `offset` (64) pixels apart; successive edge positions are adjacent (srcStep 1).
+    const intptr_t srcStep = 1;
+    const intptr_t offset = 64;
+
+    // The optimized routine works on its own copy so both versions start from identical pixels.
+    pixel pixel_test_buff1[TEST_CASES][BUFFSIZE];
+    for (int buf = 0; buf < TEST_CASES; buf++)
+        memcpy(pixel_test_buff1[buf], pixel_test_buff[buf], sizeof(pixel) * BUFFSIZE);
+
+    for (int iter = 0, shift = 0; iter < ITERS; iter++, shift += INCR)
+    {
+        // Draw order matters for reproducibility: tc, maskP, maskQ, then the buffer index.
+        const int32_t tc = rand() % PIXEL_MAX;
+        const int32_t maskP = (rand() % PIXEL_MAX) - 1;
+        const int32_t maskQ = (rand() % PIXEL_MAX) - 1;
+        const int32_t tcP = tc & maskP;
+        const int32_t tcQ = tc & maskQ;
+        const int index = rand() % 3;
+
+        // Start 4 * offset pixels into the buffer so the taps at negative offsets stay inside it.
+        pixel* refEdge = pixel_test_buff[index] + 4 * offset + shift;
+        pixel* optEdge = pixel_test_buff1[index] + 4 * offset + shift;
+        ref(refEdge, srcStep, offset, tcP, tcQ);
+        checked(opt, optEdge, srcStep, offset, tcP, tcQ);
+        if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel) * BUFFSIZE) != 0)
+            return false;
+        reportfail()
+    }
+    return true;
+}
+
+bool PixelHarness::check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt)
+{
+    // Filter taps are adjacent pixels (offset 1); successive edge positions are 64 pixels apart.
+    const intptr_t srcStep = 64;
+    const intptr_t offset = 1;
+
+    // The optimized routine works on its own copy so both versions start from identical pixels.
+    pixel pixel_test_buff1[TEST_CASES][BUFFSIZE];
+    for (int buf = 0; buf < TEST_CASES; buf++)
+        memcpy(pixel_test_buff1[buf], pixel_test_buff[buf], sizeof(pixel) * BUFFSIZE);
+
+    for (int iter = 0, shift = 0; iter < ITERS; iter++, shift += INCR)
+    {
+        // Draw order matters for reproducibility: tc, maskP, maskQ, then the buffer index.
+        const int32_t tc = rand() % PIXEL_MAX;
+        const int32_t maskP = (rand() % PIXEL_MAX) - 1;
+        const int32_t maskQ = (rand() % PIXEL_MAX) - 1;
+        const int32_t tcP = tc & maskP;
+        const int32_t tcQ = tc & maskQ;
+        const int index = rand() % 3;
+
+        // Start 4 pixels into the buffer so the taps at negative offsets stay inside it.
+        pixel* refEdge = pixel_test_buff[index] + 4 + shift;
+        pixel* optEdge = pixel_test_buff1[index] + 4 + shift;
+        ref(refEdge, srcStep, offset, tcP, tcQ);
+        checked(opt, optEdge, srcStep, offset, tcP, tcQ);
+        if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel) * BUFFSIZE) != 0)
+            return false;
+        reportfail()
+    }
+    return true;
+}
+
bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
if (opt.pu[part].satd)
@@ -2484,6 +2550,24 @@
}
}
+ if (opt.pelFilterLumaStrong[0])
+ {
+ if (!check_pelFilterLumaStrong_V(ref.pelFilterLumaStrong[0], opt.pelFilterLumaStrong[0]))
+ {
+ printf("pelFilterLumaStrong Vertical failed!\n");
+ return false;
+ }
+ }
+
+ if (opt.pelFilterLumaStrong[1])
+ {
+ if (!check_pelFilterLumaStrong_H(ref.pelFilterLumaStrong[1], opt.pelFilterLumaStrong[1]))
+ {
+ printf("pelFilterLumaStrong Horizontal failed!\n");
+ return false;
+ }
+ }
+
return true;
}
@@ -2973,4 +3057,20 @@
uint64_t dummy;
REPORT_SPEEDUP(opt.planeClipAndMax, ref.planeClipAndMax, pbuf1, 128, 63, 62, &dummy, 1, PIXEL_MAX - 1);
}
+
+ if (opt.pelFilterLumaStrong[0])
+ {
+ int32_t tcP = (rand() % PIXEL_MAX) - 1;
+ int32_t tcQ = (rand() % PIXEL_MAX) - 1;
+ HEADER0("pelFilterLumaStrong_Vertical");
+ REPORT_SPEEDUP(opt.pelFilterLumaStrong[0], ref.pelFilterLumaStrong[0], pbuf1, STRIDE, 1, tcP, tcQ);
+ }
+
+ if (opt.pelFilterLumaStrong[1])
+ {
+ int32_t tcP = (rand() % PIXEL_MAX) - 1;
+ int32_t tcQ = (rand() % PIXEL_MAX) - 1;
+ HEADER0("pelFilterLumaStrong_Horizontal");
+ REPORT_SPEEDUP(opt.pelFilterLumaStrong[1], ref.pelFilterLumaStrong[1], pbuf1, 1, STRIDE, tcP, tcQ);
+ }
}
diff -r 38e4b94377fa -r 86627e458e6e source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Oct 06 14:19:56 2015 +0530
+++ b/source/test/pixelharness.h Thu Oct 08 12:06:20 2015 +0530
@@ -119,6 +119,8 @@
bool check_costCoeffRemain(costCoeffRemain_t ref, costCoeffRemain_t opt);
bool check_costC1C2Flag(costC1C2Flag_t ref, costC1C2Flag_t opt);
bool check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt);
+ bool check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
+ bool check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
public:
More information about the x265-devel
mailing list