[x265] [PATCH 2 of 3] asm: asm code for pelFilterChroma_V/H
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Feb 26 10:11:30 CET 2016
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1456466679 -19800
# Fri Feb 26 11:34:39 2016 +0530
# Node ID 59d9eca3d144e71f11d509a5dd40b634bb9ab500
# Parent 5ff8ee940ad7f4d34b106ae4999b996245c87919
asm: asm code for pelFilterChroma_V/H
diff -r 5ff8ee940ad7 -r 59d9eca3d144 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Feb 26 11:33:33 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Feb 26 11:34:39 2016 +0530
@@ -2535,6 +2535,8 @@
#if X86_64
p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
+ p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
+ p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
// p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
diff -r 5ff8ee940ad7 -r 59d9eca3d144 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Fri Feb 26 11:33:33 2016 +0530
+++ b/source/common/x86/const-a.asm Fri Feb 26 11:34:39 2016 +0530
@@ -48,6 +48,8 @@
const pb_a1, times 16 db 0xa1
const pb_01, times 8 db 0, 1
+const pb_0123, times 4 db 0, 1 ; pshufb control: low 8 bytes replicate source word 0 four times
+ times 4 db 2, 3 ; high 8 bytes replicate source word 1 four times
const hsub_mul, times 16 db 1, -1
const pw_swap, times 2 db 6, 7, 4, 5, 2, 3, 0, 1
const pb_unpackbd1, times 2 db 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
@@ -66,6 +68,7 @@
times 12 db 0x00
const pb_000000000000000F, db 0xff
times 15 db 0x00
+const pb_shuf_off4, times 2 db 0, 4, 1, 5, 2, 6, 3, 7 ; pshufb control: interleave byte i with byte i + 4 (i = 0..3), pattern repeated in both halves
;; 16-bit constants
@@ -117,6 +120,8 @@
const hmul_16p, times 16 db 1
times 8 db 1, -1
const pw_exp2_0_15, dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768
+const pw_1_ffff, times 4 dw 1 ; low four words = +1
+ times 4 dw 0xFFFF ; high four words = -1; as a psignw operand this keeps the low half and negates the high half
;; 32-bit constants
diff -r 5ff8ee940ad7 -r 59d9eca3d144 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Fri Feb 26 11:33:33 2016 +0530
+++ b/source/common/x86/loopfilter.asm Fri Feb 26 11:34:39 2016 +0530
@@ -36,6 +36,7 @@
cextern pb_3
cextern pb_4
cextern pb_01
+cextern pb_0123
cextern pb_15
cextern pb_31
cextern pb_124
@@ -48,8 +49,8 @@
cextern pb_movemask
cextern pb_movemask_32
cextern hmul_16p
-
-
+cextern pw_1_ffff
+cextern pb_shuf_off4
;============================================================================================================
; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
;============================================================================================================
@@ -3987,3 +3988,108 @@
pextrw [r0 + r2 * 1 + 1], m4, 3
RET
%endif ; ARCH_X86_64
+
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal pelFilterChroma_H, 6,6,5 ; args per loopfilter.h prototype: r0 = src, r1 = srcStep, r2 = offset, r3 = tc, r4 = maskP, r5 = maskQ
+ mov r1, r2 ; srcStep is not used; r1 is reused for -offset
+ neg r3d ; r3d = -tc
+ neg r1 ; r1 = -offset
+
+ pmovzxbw m4, [r0] ; src[0], 8 bytes zero-extended to words (only the low 4 are stored back)
+ pmovzxbw m3, [r0 + r1] ; src[-offset]
+ pmovzxbw m0, [r0 + r2] ; src[offset]
+ pmovzxbw m2, [r0 + r1 * 2] ; src[-offset * 2]
+
+ psubw m1, m4, m3 ; src[0] - src[-offset]
+ psubw m2, m0 ; src[-offset * 2] - src[offset]
+ paddw m2, [pw_4] ; + 4 (rounding term for the >> 3 below)
+ psllw m1, 2 ; (src[0] - src[-offset]) * 4
+ paddw m1, m2 ; sum of both terms plus rounding
+ psraw m1, 3 ; unclipped delta
+
+ movd m0, r3d
+ pshufb m0, [pb_01] ; -tc replicated into every word
+
+ pmaxsw m1, m0 ; clamp delta from below at -tc
+ psignw m0, [pw_n1] ; flip sign -> +tc (pw_n1 is defined elsewhere; presumably all words -1)
+ pminsw m1, m0 ; delta, now clamped to [-tc, tc]
+ punpcklqdq m1, m1 ; duplicate the low 4 deltas into the high half
+
+ shl r5d, 16 ; maskQ moved to the upper 16 bits
+ or r5w, r4w ; low 16 bits = low 16 bits of maskP
+ punpcklqdq m3, m4 ; m3 = [src[-offset] x4 | src[0] x4]
+ mova m2, [pw_1_ffff] ; sign control: +1 for the low half, -1 for the high half
+
+ movd m0, r5d
+ pshufb m0, [pb_0123] ; low 4 words = maskP, high 4 words = maskQ (16 bits each)
+
+ pand m0, m1 ; (delta & maskP) (delta & maskQ)
+ psignw m0, m2 ; +(delta & maskP) in the low half, -(delta & maskQ) in the high half
+ paddw m3, m0 ; src[-offset] + (delta & maskP), src[0] - (delta & maskQ)
+
+ pxor m0, m0
+ pmaxsw m3, m0 ; clamp results to >= 0
+ pminsw m3, [pw_pixel_max] ; clamp results to pw_pixel_max (constant defined elsewhere)
+
+ packuswb m3, m3 ; words -> bytes: bytes 0-3 = new src[-offset], bytes 4-7 = new src[0]
+ movd [r0 + r1], m3 ; write 4 filtered pixels at src[-offset]
+ pextrd [r0], m3, 1 ; write 4 filtered pixels at src[0]
+ RET
+
+INIT_XMM sse4
+cglobal pelFilterChroma_V, 6,6,5 ; args per loopfilter.h prototype: r0 = src, r1 = srcStep, r2 = offset, r3 = tc, r4 = maskP, r5 = maskQ
+ neg r3d ; r3d = -tc
+ lea r2, [r1 * 3] ; r2 = 3 * srcStep; the offset argument is discarded, addressing below hard-codes an offset of 1
+
+ pmovzxbw m4, [r0 + r1 * 0 - 2] ; row 0: words 0..3 = src[-offset*2], src[-offset], src[0], src[offset] (pixels named m2 m3 m4 m5 in the comments below)
+ pmovzxbw m3, [r0 + r1 * 1 - 2] ; row 1
+ pmovzxbw m0, [r0 + r1 * 2 - 2] ; row 2
+ pmovzxbw m2, [r0 + r2 * 1 - 2] ; row 3
+
+ punpcklwd m4, m3 ; interleave rows 0 and 1 word-wise
+ punpcklwd m0, m2 ; interleave rows 2 and 3 word-wise
+ punpckldq m2, m4, m0 ; transposed: [src[-2] of rows 0-3 | src[-1] of rows 0-3]
+ punpckhdq m4, m0 ; transposed: [src[0] of rows 0-3 | src[1] of rows 0-3]
+ psrldq m3, m2, 8 ; m3 low half = src[-1] column
+ psrldq m0, m4, 8 ; m0 low half = src[1] column
+
+ psubw m1, m4, m3 ; src[0] - src[-1] (only the low 4 words are meaningful)
+ psubw m2, m0 ; src[-2] - src[1]
+ paddw m2, [pw_4] ; + 4 (rounding term for the >> 3 below)
+ psllw m1, 2 ; (src[0] - src[-1]) * 4
+ paddw m1, m2 ; sum of both terms plus rounding
+ psraw m1, 3 ; unclipped delta
+
+ movd m0, r3d
+ pshufb m0, [pb_01] ; -tc replicated into every word
+
+ pmaxsw m1, m0 ; clamp delta from below at -tc
+ psignw m0, [pw_n1] ; flip sign -> +tc (pw_n1 is defined elsewhere; presumably all words -1)
+ pminsw m1, m0 ; delta, now clamped to [-tc, tc]
+ punpcklqdq m1, m1 ; duplicate the low 4 deltas into the high half
+
+ shl r5d, 16 ; maskQ moved to the upper 16 bits
+ or r5w, r4w ; low 16 bits = low 16 bits of maskP
+ punpcklqdq m3, m4 ; m3 = [src[-1] x4 rows | src[0] x4 rows]
+ mova m2, [pw_1_ffff] ; sign control: +1 for the low half, -1 for the high half
+
+ movd m0, r5d
+ pshufb m0, [pb_0123] ; low 4 words = maskP, high 4 words = maskQ (16 bits each)
+
+ pand m0, m1 ; (delta & maskP) (delta & maskQ)
+ psignw m0, m2 ; +(delta & maskP) in the low half, -(delta & maskQ) in the high half
+ paddw m3, m0 ; src[-1] + (delta & maskP), src[0] - (delta & maskQ)
+
+ pxor m0, m0
+ pmaxsw m3, m0 ; clamp results to >= 0
+ pminsw m3, [pw_pixel_max] ; clamp results to pw_pixel_max (constant defined elsewhere)
+
+ packuswb m3, m3 ; words -> bytes: bytes 0-3 = new src[-1] per row, bytes 4-7 = new src[0] per row
+ pshufb m3, [pb_shuf_off4] ; re-interleave so word k = { src[-1], src[0] } of row k
+ pextrw [r0 + r1 * 0 - 1], m3, 0 ; row 0: write src[-1], src[0]
+ pextrw [r0 + r1 * 1 - 1], m3, 1 ; row 1
+ pextrw [r0 + r1 * 2 - 1], m3, 2 ; row 2
+ pextrw [r0 + r2 * 1 - 1], m3, 3 ; row 3
+ RET
+%endif ; ARCH_X86_64
diff -r 5ff8ee940ad7 -r 59d9eca3d144 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Fri Feb 26 11:33:33 2016 +0530
+++ b/source/common/x86/loopfilter.h Fri Feb 26 11:34:39 2016 +0530
@@ -48,5 +48,7 @@
void PFX(pelFilterLumaStrong_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
void PFX(pelFilterLumaStrong_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+void PFX(pelFilterChroma_V_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
+void PFX(pelFilterChroma_H_sse4)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ);
#endif // ifndef X265_LOOPFILTER_H
diff -r 5ff8ee940ad7 -r 59d9eca3d144 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Fri Feb 26 11:33:33 2016 +0530
+++ b/source/test/pixelharness.cpp Fri Feb 26 11:34:39 2016 +0530
@@ -1912,6 +1912,68 @@
return true;
}
+bool PixelHarness::check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t opt) // returns true when opt matches ref on every iteration
+{
+ intptr_t srcStep = 1, offset = 64; // layout passed to both implementations for the horizontal variant
+ int32_t maskP, maskQ, tc;
+ int j = 0; // running start position inside the buffers, advanced by INCR per iteration
+
+ pixel pixel_test_buff1[TEST_CASES][BUFFSIZE]; // private copy for opt so both sides start from identical data
+ for (int i = 0; i < TEST_CASES; i++)
+ memcpy(pixel_test_buff1[i], pixel_test_buff[i], sizeof(pixel)* BUFFSIZE);
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ tc = rand() % PIXEL_MAX; // in [0, PIXEL_MAX - 1]
+ maskP = (rand() % PIXEL_MAX) - 1; // in [-1, PIXEL_MAX - 2]
+ maskQ = (rand() % PIXEL_MAX) - 1; // in [-1, PIXEL_MAX - 2]
+
+ int index = rand() % 3; // choose one of the first three test buffers
+
+ ref(pixel_test_buff[index] + 4 * offset + j, srcStep, offset, tc, maskP, maskQ); // start 4 * offset in; presumably keeps accesses at negative offsets inside the buffer
+ checked(opt, pixel_test_buff1[index] + 4 * offset + j, srcStep, offset, tc, maskP, maskQ);
+
+ if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel)* BUFFSIZE)) // whole-buffer compare, so any differing byte fails the test
+ return false;
+
+ reportfail()
+ j += INCR;
+ }
+
+ return true;
+}
+
+bool PixelHarness::check_pelFilterChroma_V(pelFilterChroma_t ref, pelFilterChroma_t opt) // returns true when opt matches ref on every iteration
+{
+ intptr_t srcStep = 64, offset = 1; // layout passed to both implementations for the vertical variant
+ int32_t maskP, maskQ, tc;
+ int j = 0; // running start position inside the buffers, advanced by INCR per iteration
+
+ pixel pixel_test_buff1[TEST_CASES][BUFFSIZE]; // private copy for opt so both sides start from identical data
+ for (int i = 0; i < TEST_CASES; i++)
+ memcpy(pixel_test_buff1[i], pixel_test_buff[i], sizeof(pixel)* BUFFSIZE);
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ tc = rand() % PIXEL_MAX; // in [0, PIXEL_MAX - 1]
+ maskP = (rand() % PIXEL_MAX) - 1; // in [-1, PIXEL_MAX - 2]
+ maskQ = (rand() % PIXEL_MAX) - 1; // in [-1, PIXEL_MAX - 2]
+
+ int index = rand() % 3; // choose one of the first three test buffers
+
+ ref(pixel_test_buff[index] + 4 + j, srcStep, offset, tc, maskP, maskQ); // start 4 pixels in; presumably keeps accesses at negative offsets inside the buffer
+ checked(opt, pixel_test_buff1[index] + 4 + j, srcStep, offset, tc, maskP, maskQ);
+
+ if (memcmp(pixel_test_buff[index], pixel_test_buff1[index], sizeof(pixel)* BUFFSIZE)) // whole-buffer compare, so any differing byte fails the test
+ return false;
+
+ reportfail()
+ j += INCR;
+ }
+
+ return true;
+}
+
bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
if (opt.pu[part].satd)
@@ -2570,6 +2632,24 @@
}
}
+ if (opt.pelFilterChroma[0])
+ {
+ if (!check_pelFilterChroma_V(ref.pelFilterChroma[0], opt.pelFilterChroma[0]))
+ {
+ printf("pelFilterChroma Vertical failed!\n");
+ return false;
+ }
+ }
+
+ if (opt.pelFilterChroma[1])
+ {
+ if (!check_pelFilterChroma_H(ref.pelFilterChroma[1], opt.pelFilterChroma[1]))
+ {
+ printf("pelFilterChroma Horizontal failed!\n");
+ return false;
+ }
+ }
+
return true;
}
@@ -3069,4 +3149,22 @@
HEADER0("pelFilterLumaStrong_Horizontal");
REPORT_SPEEDUP(opt.pelFilterLumaStrong[1], ref.pelFilterLumaStrong[1], pbuf1, 1, STRIDE, tcP, tcQ);
}
+
+ if (opt.pelFilterChroma[0])
+ {
+ int32_t tc = (rand() % PIXEL_MAX);
+ int32_t maskP = (rand() % PIXEL_MAX) - 1;
+ int32_t maskQ = (rand() % PIXEL_MAX) - 1;
+ HEADER0("pelFilterChroma_Vertical");
+ REPORT_SPEEDUP(opt.pelFilterChroma[0], ref.pelFilterChroma[0], pbuf1, STRIDE, 1, tc, maskP, maskQ);
+ }
+
+ if (opt.pelFilterChroma[1])
+ {
+ int32_t tc = (rand() % PIXEL_MAX);
+ int32_t maskP = (rand() % PIXEL_MAX) - 1;
+ int32_t maskQ = (rand() % PIXEL_MAX) - 1;
+ HEADER0("pelFilterChroma_Horizontal");
+ REPORT_SPEEDUP(opt.pelFilterChroma[1], ref.pelFilterChroma[1], pbuf1, 1, STRIDE, tc, maskP, maskQ);
+ }
}
diff -r 5ff8ee940ad7 -r 59d9eca3d144 source/test/pixelharness.h
--- a/source/test/pixelharness.h Fri Feb 26 11:33:33 2016 +0530
+++ b/source/test/pixelharness.h Fri Feb 26 11:34:39 2016 +0530
@@ -123,6 +123,8 @@
bool check_planeClipAndMax(planeClipAndMax_t ref, planeClipAndMax_t opt);
bool check_pelFilterLumaStrong_V(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
bool check_pelFilterLumaStrong_H(pelFilterLumaStrong_t ref, pelFilterLumaStrong_t opt);
+ bool check_pelFilterChroma_V(pelFilterChroma_t ref, pelFilterChroma_t opt);
+ bool check_pelFilterChroma_H(pelFilterChroma_t ref, pelFilterChroma_t opt);
public:
More information about the x265-devel
mailing list