[x265] [PATCH] asm: interp_8tap_hv_pp_8x8 sse3
dtyx265 at gmail.com
dtyx265 at gmail.com
Thu Apr 30 04:41:10 CEST 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1430361608 25200
# Node ID f95cc094467c844c6607c67d330748d171d26483
# Parent 9a1b8b71bc997547044f42992e1eb7f3572f03f1
asm: interp_8tap_hv_pp_8x8 sse3
This replaces the C implementation.
64-bit
./test/TestBench --testbench interp | grep hv
luma_hv [ 8x8] 2.53x 14225.03 35970.65
32-bit
./test/TestBench --testbench interp | grep hv
luma_hv [ 8x8] 2.50x 14367.40 35917.48
diff -r 9a1b8b71bc99 -r f95cc094467c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Apr 29 08:23:45 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Wed Apr 29 19:40:08 2015 -0700
@@ -1347,6 +1347,7 @@
p.pu[LUMA_4x4].luma_hpp = x265_interp_8tap_horiz_pp_4x4_sse2;
ALL_LUMA_PU(luma_hps, interp_8tap_horiz_ps, sse2);
p.pu[LUMA_4x4].luma_hps = x265_interp_8tap_horiz_ps_4x4_sse2;
+ p.pu[LUMA_8x8].luma_hvpp = x265_interp_8tap_hv_pp_8x8_sse3;
//p.frameInitLowres = x265_frame_init_lowres_core_mmx2;
p.frameInitLowres = x265_frame_init_lowres_core_sse2;
diff -r 9a1b8b71bc99 -r f95cc094467c source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Apr 29 08:23:45 2015 -0700
+++ b/source/common/x86/ipfilter8.asm Wed Apr 29 19:40:08 2015 -0700
@@ -3464,6 +3464,78 @@
RET
;-----------------------------------------------------------------------------
+; void interp_8tap_hv_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+;-----------------------------------------------------------------------------
+INIT_XMM sse3
+cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16 ; 15*16 bytes of stack hold the 15-row intermediate buffer written below (see x86inc.asm for the negative stack-size form)
+ mov r4d, r4m ; r4 = idxX (5th argument in the prototype above)
+ mov r5d, r5m ; r5 = idxY (6th argument in the prototype above)
+ add r4d, r4d ; r4 = idxX * 2, so "r4 * 8" below is a byte offset of idxX * 16
+ pxor m6, m6 ; m6 = 0; presumably consumed by FILTER_H8_W8_sse2 (macro defined outside this hunk) - confirm
+
+%ifdef PIC
+ lea r6, [tabw_LumaCoeff] ; PIC build: take the table base first, then index it
+ mova m3, [r6 + r4 * 8] ; m3 = 16 bytes at tabw_LumaCoeff + idxX * 16 (presumably the horizontal taps as words)
+%else
+ mova m3, [tabw_LumaCoeff + r4 * 8] ; same load using direct table addressing
+%endif
+
+ ; move to row -3 (src -= 3 * srcStride)
+ lea r6, [r1 + r1 * 2]
+ sub r0, r6
+
+ mov r4, rsp ; r4 = write pointer into the stack buffer (r4 no longer holds idxX)
+ ; phase H: filter 15 source rows, storing 16 bytes per row at [r4] (presumably 8 output rows + 7 extra rows for the 8-tap vertical pass)
+%assign x 0 ;needed for FILTER_H8_W8_sse2 macro
+%assign y 1
+%rep 15
+ FILTER_H8_W8_sse2
+ psubw m1, [pw_2000] ; subtract the pw_2000 constant from every word of the row left in m1 by the macro
+ mova [r4], m1 ; store this row (16 bytes) into the stack buffer
+
+%if y < 15
+ add r0, r1 ; next source row
+ add r4, 16 ; next 16-byte slot in the stack buffer
+%endif
+%assign y y+1
+%endrep
+
+ ; ready for phase V
+ ; at this point all mN registers are free
+
+ ; load coeff table: r5 = tab_LumaCoeffV + idxY * 64
+ shl r5, 6
+ lea r6, [tab_LumaCoeffV]
+ lea r5, [r5 + r6]
+
+ ; load intermediate buffer (start of the rows stored on the stack above)
+ mov r0, rsp
+
+ ; register mapping
+ ; r0 - src (now the intermediate buffer on the stack, 16 bytes per row)
+ ; r5 - coeff
+
+ ; let's go: 4 iterations, each storing two 8-byte output rows
+%assign y 1
+%rep 4
+ FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0
+ FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
+ FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
+ FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
+ FILTER_HV8_END m3, m0, m4, m1
+
+ movh [r2], m3 ; low 8 bytes of m3 -> dst row
+ movhps [r2 + r3], m3 ; high 8 bytes of m3 -> dst row + dstStride
+
+%if y < 4
+ lea r0, [r0 + 16 * 2] ; advance two rows (2 * 16 bytes) in the intermediate buffer
+ lea r2, [r2 + r3 * 2] ; advance two rows in dst
+%endif
+%assign y y+1
+%endrep
+ RET
+
+;-----------------------------------------------------------------------------
;void interp_4tap_vert_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
;-----------------------------------------------------------------------------
INIT_XMM sse4
diff -r 9a1b8b71bc99 -r f95cc094467c source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Wed Apr 29 08:23:45 2015 -0700
+++ b/source/common/x86/ipfilter8.h Wed Apr 29 19:40:08 2015 -0700
@@ -900,6 +900,7 @@
void x265_interp_8tap_horiz_ps_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
void x265_interp_8tap_horiz_ps_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
void x265_interp_8tap_horiz_ps_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+void x265_interp_8tap_hv_pp_8x8_sse3(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int idxX, int idxY);
#undef LUMA_FILTERS
#undef LUMA_SP_FILTERS
#undef LUMA_SS_FILTERS
More information about the x265-devel
mailing list