[x265] [PATCH 11 of 12] asm: interp_4tap_vert_ps_64xN and interp_4tap_vert_ps_48x64 sse2
dtyx265 at gmail.com
dtyx265 at gmail.com
Tue May 19 02:24:38 CEST 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1431994278 25200
# Node ID a1ae3a91f5e011753017db8579296b5702439579
# Parent 91010ea886c50f9802c1ab872bd2648041137e19
asm: interp_4tap_vert_ps_64xN and interp_4tap_vert_ps_48x64 sse2
Converted the vert_pp_64xN macro to also generate the ps primitives. This replaces the C code for ps with minimal impact on pp.
64-bit
./test/TestBench --testbench interp | grep vp | grep "\[48x"
chroma_vpp[48x64] 8.08x 100330.05 810881.69
chroma_vps[48x64] 7.20x 94220.81 678188.62
./test/TestBench --testbench interp | grep vp | grep "\[64x"
chroma_vpp[64x64] 8.11x 133558.36 1083062.38
chroma_vps[64x64] 7.33x 125722.91 922104.19
chroma_vpp[64x32] 8.07x 66936.26 540263.38
chroma_vps[64x32] 7.29x 62902.86 458592.06
chroma_vpp[64x48] 8.11x 100331.80 813477.25
chroma_vps[64x48] 7.32x 94151.30 689587.25
chroma_vpp[64x16] 8.17x 33602.11 274452.09
chroma_vps[64x16] 7.28x 31558.53 229885.62
diff -r 91010ea886c5 -r a1ae3a91f5e0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon May 18 17:07:23 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Mon May 18 17:11:18 2015 -0700
@@ -1572,6 +1572,11 @@
p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = x265_interp_4tap_vert_ps_32x24_sse2;
p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = x265_interp_4tap_vert_ps_32x32_sse2;
p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = x265_interp_4tap_vert_ps_32x64_sse2;
+ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = x265_interp_4tap_vert_ps_48x64_sse2;
+ p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = x265_interp_4tap_vert_ps_64x16_sse2;
+ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = x265_interp_4tap_vert_ps_64x32_sse2;
+ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = x265_interp_4tap_vert_ps_64x48_sse2;
+ p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = x265_interp_4tap_vert_ps_64x64_sse2;
#endif
ALL_LUMA_PU(luma_hpp, interp_8tap_horiz_pp, sse2);
diff -r 91010ea886c5 -r a1ae3a91f5e0 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon May 18 17:07:23 2015 -0700
+++ b/source/common/x86/ipfilter8.asm Mon May 18 17:11:18 2015 -0700
@@ -2638,17 +2638,21 @@
%endif
;-----------------------------------------------------------------------------
-; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro FILTER_V4_W16n_H2_sse2 2
+; void interp_4tap_vert_%1_%2x%3(pixel *src, intptr_t srcStride, pixel *dst (int16_t *dst for ps), intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V4_W16n_H2_sse2 3
INIT_XMM sse2
-cglobal interp_4tap_vert_pp_%1x%2, 4, 7, 11
-
+cglobal interp_4tap_vert_%1_%2x%3, 4, 7, 11
mov r4d, r4m
sub r0, r1
shl r4d, 5
pxor m9, m9
+
+%ifidn %1,pp
mova m7, [pw_32]
+%elifidn %1,ps
+ mova m7, [pw_2000]
+%endif
%ifdef PIC
lea r5, [tab_ChromaCoeffV]
@@ -2659,11 +2663,12 @@
mova m0, [tab_ChromaCoeffV + r4 + 16]
%endif
- mov r4d, %2/2
-
-.loop:
-
- mov r6d, %1/16
+ mov r4d, %3/2
+ lea r5, [3 * r1]
+
+.loop:
+
+ mov r6d, %2/16
.loopW:
@@ -2687,11 +2692,10 @@
pmaddwd m8, m1
packssdw m2, m8
- lea r5, [r0 + 2 * r1]
- movu m5, [r5]
- movu m6, [r5 + r1]
-
- punpckhbw m10, m5, m6
+ movu m5, [r0 + 2 * r1]
+ movu m6, [r0 + r5]
+
+ punpckhbw m10, m5, m6
movhlps m8, m10
punpcklbw m10, m9
punpcklbw m8, m9
@@ -2700,7 +2704,7 @@
packssdw m10, m8
paddw m2, m10
- punpcklbw m10, m5, m6
+ punpcklbw m10, m5, m6
movhlps m8, m10
punpcklbw m10, m9
punpcklbw m8, m9
@@ -2709,6 +2713,7 @@
packssdw m10, m8
paddw m4, m10
+%ifidn %1,pp
paddw m4, m7
psraw m4, 6
paddw m2, m7
@@ -2716,6 +2721,12 @@
packuswb m4, m2
movu [r2], m4
+%elifidn %1,ps
+ psubw m4, m7
+ psubw m2, m7
+ movu [r2], m4
+ movu [r2 + 16], m2
+%endif
punpcklbw m4, m3, m5
punpckhbw m3, m5
@@ -2734,7 +2745,7 @@
pmaddwd m8, m1
packssdw m3, m8
- movu m5, [r5 + 2 * r1]
+ movu m5, [r0 + 4 * r1]
punpcklbw m2, m6, m5
punpckhbw m6, m5
@@ -2756,33 +2767,53 @@
paddw m4, m2
paddw m3, m6
+%ifidn %1,pp
paddw m4, m7
psraw m4, 6
paddw m3, m7
psraw m3, 6
packuswb m4, m3
- movu [r2 + r3], m4
+ movu [r2 + r3], m4
+ add r2, 16
+%elifidn %1,ps
+ psubw m4, m7
+ psubw m3, m7
+
+ movu [r2 + 2 * r3], m4
+ movu [r2 + 2 * r3 + 16], m3
+ add r2, 32
+%endif
add r0, 16
- add r2, 16
dec r6d
jnz .loopW
- lea r0, [r0 + r1 * 2 - %1]
- lea r2, [r2 + r3 * 2 - %1]
+ lea r0, [r0 + r1 * 2 - %2]
+
+%ifidn %1,pp
+ lea r2, [r2 + r3 * 2 - %2]
+%elifidn %1,ps
+ lea r2, [r2 + r3 * 4 - (%2 * 2)]
+%endif
dec r4d
jnz .loop
RET
+
%endmacro
%if ARCH_X86_64
- FILTER_V4_W16n_H2_sse2 64, 64
- FILTER_V4_W16n_H2_sse2 64, 32
- FILTER_V4_W16n_H2_sse2 64, 48
- FILTER_V4_W16n_H2_sse2 48, 64
- FILTER_V4_W16n_H2_sse2 64, 16
+ FILTER_V4_W16n_H2_sse2 pp, 64, 64
+ FILTER_V4_W16n_H2_sse2 pp, 64, 32
+ FILTER_V4_W16n_H2_sse2 pp, 64, 48
+ FILTER_V4_W16n_H2_sse2 pp, 48, 64
+ FILTER_V4_W16n_H2_sse2 pp, 64, 16
+ FILTER_V4_W16n_H2_sse2 ps, 64, 64
+ FILTER_V4_W16n_H2_sse2 ps, 64, 32
+ FILTER_V4_W16n_H2_sse2 ps, 64, 48
+ FILTER_V4_W16n_H2_sse2 ps, 48, 64
+ FILTER_V4_W16n_H2_sse2 ps, 64, 16
%endif
%macro FILTER_H4_w2_2 3
diff -r 91010ea886c5 -r a1ae3a91f5e0 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Mon May 18 17:07:23 2015 -0700
+++ b/source/common/x86/ipfilter8.h Mon May 18 17:11:18 2015 -0700
@@ -981,6 +981,11 @@
void x265_interp_4tap_vert_ps_32x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
void x265_interp_4tap_vert_ps_32x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
void x265_interp_4tap_vert_ps_32x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_48x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_64x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
+void x265_interp_4tap_vert_ps_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx);
#endif
#undef LUMA_FILTERS
#undef LUMA_SP_FILTERS
More information about the x265-devel
mailing list