[x265] [PATCH] asm-ssse3: filter_p2s[12x16](10.27x), filter_p2s[24x32](10.30x), filter_p2s[48x64](9.60x)
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Thu Mar 5 13:07:14 CET 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1425556197 -19800
# Thu Mar 05 17:19:57 2015 +0530
# Node ID 38cb572c2927eee8039464ec462b874c0da20871
# Parent ea9bdb10353fcb06cea1045ba0186c22c448df63
asm-ssse3: filter_p2s[12x16](10.27x), filter_p2s[24x32](10.30x),
filter_p2s[48x64](9.60x)
diff -r ea9bdb10353f -r 38cb572c2927 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Mar 04 13:20:55 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Mar 05 17:19:57 2015 +0530
@@ -1278,7 +1278,9 @@
p.pu[LUMA_64x32].filter_p2s = x265_pixelToShort_64x32_ssse3;
p.pu[LUMA_64x48].filter_p2s = x265_pixelToShort_64x48_ssse3;
p.pu[LUMA_64x64].filter_p2s = x265_pixelToShort_64x64_ssse3;
-
+ p.pu[LUMA_12x16].filter_p2s = x265_pixelToShort_12x16_ssse3;
+ p.pu[LUMA_24x32].filter_p2s = x265_pixelToShort_24x32_ssse3;
+ p.pu[LUMA_48x64].filter_p2s = x265_pixelToShort_48x64_ssse3;
p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3;
p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3;
diff -r ea9bdb10353f -r 38cb572c2927 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Mar 04 13:20:55 2015 +0530
+++ b/source/common/x86/ipfilter8.asm Thu Mar 05 17:19:57 2015 +0530
@@ -5579,8 +5579,9 @@
FILTER_V4_W16n_H2 64, 48
FILTER_V4_W16n_H2 48, 64
FILTER_V4_W16n_H2 64, 16
-;-----------------------------------------------------------------------------
-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
;-----------------------------------------------------------------------------
%macro PIXEL_WH_4xN 2
INIT_XMM ssse3
@@ -5641,9 +5642,8 @@
PIXEL_WH_4xN 4, 4
PIXEL_WH_4xN 4, 8
PIXEL_WH_4xN 4, 16
-
-;-----------------------------------------------------------------------------
-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
;-----------------------------------------------------------------------------
%macro PIXEL_WH_8xN 2
INIT_XMM ssse3
@@ -5703,10 +5703,8 @@
PIXEL_WH_8xN 8, 4
PIXEL_WH_8xN 8, 16
PIXEL_WH_8xN 8, 32
-
-
-;-----------------------------------------------------------------------------
-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
;-----------------------------------------------------------------------------
%macro PIXEL_WH_16xN 2
INIT_XMM ssse3
@@ -5768,9 +5766,8 @@
PIXEL_WH_16xN 16, 12
PIXEL_WH_16xN 16, 32
PIXEL_WH_16xN 16, 64
-
-;-----------------------------------------------------------------------------
-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
;-----------------------------------------------------------------------------
%macro PIXEL_WH_32xN 2
INIT_XMM ssse3
@@ -5831,9 +5828,8 @@
PIXEL_WH_32xN 32, 16
PIXEL_WH_32xN 32, 24
PIXEL_WH_32xN 32, 64
-
-;-----------------------------------------------------------------------------
-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
;-----------------------------------------------------------------------------
%macro PIXEL_WH_64xN 2
INIT_XMM ssse3
@@ -5893,6 +5889,165 @@
PIXEL_WH_64xN 64, 16
PIXEL_WH_64xN 64, 32
PIXEL_WH_64xN 64, 48
+;-----------------------------------------------------------------------------
+; void pixelToShort_12x16(const pixel *src, intptr_t srcStride, int16_t *dst) - widen a 12x16 block of 8-bit pixels to 16-bit words
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal pixelToShort_12x16, 3, 4, 6 ; r0 = src, r1 = srcStride, r2 = dst (argument order), r3 = scratch row pointer
+
+ ; load the two constants that turn each pixel byte into one 16-bit result
+ mova m4, [pb_128] ; presumably sixteen bytes of 128 (defined outside this hunk) - TODO confirm
+ mova m5, [tab_c_64_n64] ; presumably (64, -64) byte pairs, i.e. result = pixel * 64 - 128 * 64 - TODO confirm
+
+%rep 4 ; 4 unrolled passes x 4 rows per pass = 16 rows
+
+ movu m0, [r0] ; row 0: loads 16 bytes although only 12 results per row are stored
+ punpcklbw m1, m0, m4 ; m1 = bytes 0-7 of the row interleaved with m4 bytes
+ punpckhbw m0, m4 ; m0 = bytes 8-15 of the row interleaved with m4 bytes
+ pmaddubsw m0, m5 ; each (pixel, m4 byte) pair collapses to one signed word
+ pmaddubsw m1, m5
+
+ movu m2, [r0 + r1] ; row 1, same treatment
+ punpcklbw m3, m2, m4
+ punpckhbw m2, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+
+ movu [r2 + FENC_STRIDE * 0], m1 ; row 0, columns 0-7 (16 bytes)
+ movu [r2 + FENC_STRIDE * 2], m3 ; row 1, columns 0-7; output rows are FENC_STRIDE * 2 bytes apart
+
+ movh [r2 + FENC_STRIDE * 0 + 16], m0 ; row 0, columns 8-11 (low 8 bytes of m0 only)
+ movh [r2 + FENC_STRIDE * 2 + 16], m2 ; row 1, columns 8-11
+
+ movu m0, [r0 + r1 * 2] ; row 2
+ punpcklbw m1, m0, m4
+ punpckhbw m0, m4
+ pmaddubsw m0, m5
+ pmaddubsw m1, m5
+
+ lea r3, [r0 + r1 * 2] ; r3 = address of row 2, so [r3 + r1] reaches row 3
+ movu m2, [r3 + r1] ; row 3
+ punpcklbw m3, m2, m4
+ punpckhbw m2, m4
+ pmaddubsw m2, m5
+ pmaddubsw m3, m5
+
+ movu [r2 + FENC_STRIDE * 4], m1 ; row 2, columns 0-7
+ movu [r2 + FENC_STRIDE * 6], m3 ; row 3, columns 0-7
+
+ movh [r2 + FENC_STRIDE * 4 + 16], m0 ; row 2, columns 8-11
+ movh [r2 + FENC_STRIDE * 6 + 16], m2 ; row 3, columns 8-11
+
+ lea r0, [r0 + r1 * 4] ; src advances by 4 rows
+ add r2, FENC_STRIDE * 8 ; dst advances by 4 output rows
+%endrep
+ RET
+
+;-----------------------------------------------------------------------------
+; void pixelToShort_24x32(const pixel *src, intptr_t srcStride, int16_t *dst) - widen a 24x32 block of 8-bit pixels to 16-bit words
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal pixelToShort_24x32, 3, 6, 6 ; r0 = src, r1 = srcStride, r2 = dst (argument order), r3-r5 = scratch
+
+ ; r3d = rows still to process (4 are consumed per .loopH pass)
+ mov r3d, 32
+
+ ; load the two constants that turn each pixel byte into one 16-bit result
+ mova m4, [pb_128] ; presumably sixteen bytes of 128 (defined outside this hunk) - TODO confirm
+ mova m5, [tab_c_64_n64] ; presumably (64, -64) byte pairs, i.e. result = pixel * 64 - 128 * 64 - TODO confirm
+
+.loopH:
+
+ xor r4d, r4d ; r4 = column offset in pixels, restarted for every group of 4 rows
+.loopW:
+ lea r5, [r0 + r4] ; r5 = address of the current 8-pixel column in row 0
+
+ movh m0, [r5] ; row 0: 8 pixels
+ punpcklbw m0, m4 ; interleave pixels with m4 bytes
+ pmaddubsw m0, m5 ; each (pixel, m4 byte) pair collapses to one signed word
+
+ movh m1, [r5 + r1] ; row 1
+ punpcklbw m1, m4
+ pmaddubsw m1, m5
+
+ movh m2, [r5 + r1 * 2] ; row 2
+ punpcklbw m2, m4
+ pmaddubsw m2, m5
+
+ lea r5, [r5 + r1 * 2] ; r5 = row 2 address, so [r5 + r1] reaches row 3
+ movh m3, [r5 + r1] ; row 3
+ punpcklbw m3, m4
+ pmaddubsw m3, m5
+
+ add r4, 8 ; advance one 8-pixel column; the stores compensate with - 16 bytes
+ cmp r4, 24 ; all 24 columns done? (the movu stores below leave EFLAGS untouched)
+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
+ ; no separate "je .nextH" is needed: .nextH is simply the fall-through path
+ jne .loopW
+
+.nextH:
+ lea r0, [r0 + r1 * 4] ; src advances by 4 rows
+ add r2, FENC_STRIDE * 8 ; dst advances by 4 output rows (FENC_STRIDE * 2 bytes each)
+
+ sub r3d, 4
+ jnz .loopH
+ RET
+
+;-----------------------------------------------------------------------------
+; void pixelToShort_48x64(const pixel *src, intptr_t srcStride, int16_t *dst) - widen a 48x64 block of 8-bit pixels to 16-bit words
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal pixelToShort_48x64, 3, 6, 6 ; r0 = src, r1 = srcStride, r2 = dst (argument order), r3-r5 = scratch
+
+ ; r3d = rows still to process (4 are consumed per .loopH pass)
+ mov r3d, 64
+
+ ; load the two constants that turn each pixel byte into one 16-bit result
+ mova m4, [pb_128] ; presumably sixteen bytes of 128 (defined outside this hunk) - TODO confirm
+ mova m5, [tab_c_64_n64] ; presumably (64, -64) byte pairs, i.e. result = pixel * 64 - 128 * 64 - TODO confirm
+
+.loopH:
+
+ xor r4d, r4d ; r4 = column offset in pixels, restarted for every group of 4 rows
+.loopW:
+ lea r5, [r0 + r4] ; r5 = address of the current 8-pixel column in row 0
+
+ movh m0, [r5] ; row 0: 8 pixels
+ punpcklbw m0, m4 ; interleave pixels with m4 bytes
+ pmaddubsw m0, m5 ; each (pixel, m4 byte) pair collapses to one signed word
+
+ movh m1, [r5 + r1] ; row 1
+ punpcklbw m1, m4
+ pmaddubsw m1, m5
+
+ movh m2, [r5 + r1 * 2] ; row 2
+ punpcklbw m2, m4
+ pmaddubsw m2, m5
+
+ lea r5, [r5 + r1 * 2] ; r5 = row 2 address, so [r5 + r1] reaches row 3
+ movh m3, [r5 + r1] ; row 3
+ punpcklbw m3, m4
+ pmaddubsw m3, m5
+
+ add r4, 8 ; advance one 8-pixel column; the stores compensate with - 16 bytes
+ cmp r4, 48 ; all 48 columns done? (the movu stores below leave EFLAGS untouched)
+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
+ ; no separate "je .nextH" is needed: .nextH is simply the fall-through path
+ jne .loopW
+
+.nextH:
+ lea r0, [r0 + r1 * 4] ; src advances by 4 rows
+ add r2, FENC_STRIDE * 8 ; dst advances by 4 output rows (FENC_STRIDE * 2 bytes each)
+
+ sub r3d, 4
+ jnz .loopH
+ RET
%macro PROCESS_LUMA_W4_4R 0
movd m0, [r0]
diff -r ea9bdb10353f -r 38cb572c2927 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Wed Mar 04 13:20:55 2015 +0530
+++ b/source/common/x86/ipfilter8.h Thu Mar 05 17:19:57 2015 +0530
@@ -642,6 +642,10 @@
void x265_pixelToShort_64x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
void x265_pixelToShort_64x48_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
void x265_pixelToShort_64x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_12x16_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_24x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_48x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+
#undef LUMA_FILTERS
#undef LUMA_SP_FILTERS
#undef LUMA_SS_FILTERS
diff -r ea9bdb10353f -r 38cb572c2927 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Wed Mar 04 13:20:55 2015 +0530
+++ b/source/test/ipfilterharness.cpp Thu Mar 05 17:19:57 2015 +0530
@@ -523,7 +523,7 @@
checked(opt, pixel_test_buff[index] + i, rand_srcStride, IPF_vec_output_s);
- if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(pixel)))
+ if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * sizeof(int16_t)))
return false;
reportfail();
More information about the x265-devel
mailing list