[x265] [PATCH 1 of 4] asm: interp_8tap_hv_pp_8x8() for Interpolate_HV_8x8
Min Chen
chenm003 at 163.com
Mon Oct 28 15:24:33 CET 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1382970131 -28800
# Node ID 4a14bd24b6572ed1ffcf9dcfa9f4c841adc62211
# Parent ef2428fd32feddd60168f3430c50f4d7e6f02741
asm: interp_8tap_hv_pp_8x8() for Interpolate_HV_8x8
diff -r ef2428fd32fe -r 4a14bd24b657 source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp Mon Oct 28 00:08:06 2013 -0500
+++ b/source/common/ipfilter.cpp Mon Oct 28 22:22:11 2013 +0800
@@ -401,6 +401,17 @@
dst += dstStride;
}
}
+typedef void (*ipfilter_ps_t)(pixel *src, intptr_t srcStride, short *dst, intptr_t dstStride, int width, int height, const short *coeff);
+typedef void (*ipfilter_sp_t)(short *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, const short *coeff);
+
+template<int N, int width, int height> // C reference for p.luma_hvpp: horizontal pass (idxX) into a 16-bit buffer, then vertical pass (idxY) into dst
+void interp_hv_pp_c(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+{
+ short m_immedVals[(64 + 8) * (64 + 8)]; // intermediate samples; used below with a row pitch of `width`
+ filterHorizontal_ps_c<N>(src - 3 * srcStride, srcStride, m_immedVals, width, width, height + 7, g_lumaFilter[idxX]); // starts 3 rows above src and asks for height + 7 rows (3 = N/2 - 1 and 7 = N - 1 when N == 8)
+ filterVertical_sp_c<N>(m_immedVals + 3 * width, width, dst, dstStride, width, height, g_lumaFilter[idxY]); // begins at intermediate row 3, i.e. the row produced from src itself
+}
+
}
namespace x265 {
@@ -412,7 +423,8 @@
#define LUMA(W, H) \
p.luma_hpp[LUMA_ ## W ## x ## H] = interp_horiz_pp_c<8, W, H>;\
- p.luma_vpp[LUMA_ ## W ## x ## H] = interp_vert_pp_c<8, W, H>
+ p.luma_vpp[LUMA_ ## W ## x ## H] = interp_vert_pp_c<8, W, H>; \
+ p.luma_hvpp[LUMA_ ## W ## x ## H] = interp_hv_pp_c<8, W, H>;
void Setup_C_IPFilterPrimitives(EncoderPrimitives& p)
{
diff -r ef2428fd32fe -r 4a14bd24b657 source/common/primitives.h
--- a/source/common/primitives.h Mon Oct 28 00:08:06 2013 -0500
+++ b/source/common/primitives.h Mon Oct 28 22:22:11 2013 +0800
@@ -209,6 +209,7 @@
typedef void (*plane_copy_deinterleave_t)(pixel *dstu, intptr_t dstuStride, pixel *dstv, intptr_t dstvStride, pixel *src, intptr_t srcStride, int w, int h);
typedef void (*filter_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx);
+typedef void (*filter_hv_pp_t) (pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY);
/* Define a structure containing function pointers to optimized encoder
* primitives. Each pointer can reference either an assembly routine,
@@ -245,6 +246,7 @@
filter_pp_t luma_hpp[NUM_LUMA_PARTITIONS];
filter_pp_t chroma_vpp[NUM_CHROMA_PARTITIONS];
filter_pp_t luma_vpp[NUM_LUMA_PARTITIONS];
+ filter_hv_pp_t luma_hvpp[NUM_LUMA_PARTITIONS];
intra_dc_t intra_pred_dc;
intra_planar_t intra_pred_planar;
diff -r ef2428fd32fe -r 4a14bd24b657 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Oct 28 00:08:06 2013 -0500
+++ b/source/common/x86/asm-primitives.cpp Mon Oct 28 22:22:11 2013 +0800
@@ -278,6 +278,8 @@
p.sad_x4[LUMA_16x12] = x265_pixel_sad_x4_16x12_ssse3;
p.sad_x3[LUMA_16x32] = x265_pixel_sad_x3_16x32_ssse3;
p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_ssse3;
+
+ p.luma_hvpp[LUMA_8x8] = x265_interp_8tap_hv_pp_8x8_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
diff -r ef2428fd32fe -r 4a14bd24b657 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Mon Oct 28 00:08:06 2013 -0500
+++ b/source/common/x86/ipfilter8.asm Mon Oct 28 22:22:11 2013 +0800
@@ -35,7 +35,9 @@
db 4, 5, 6, 7, 8, 9, 10, 11, 5, 6, 7, 8, 9, 10, 11, 12
db 6, 7, 8, 9, 10, 11, 12, 13, 7, 8, 9, 10, 11, 12, 13, 14
-tab_c_512: times 8 dw 512
+tab_c_512: times 8 dw 512
+tab_c_8192: times 8 dw 8192
+tab_c_526336: times 4 dd 8192*64+2048
tab_ChromaCoeff: db 0, 64, 0, 0
db -2, 58, 10, -2
@@ -51,6 +53,25 @@
db -1, 4, -11, 40, 40, -11, 4, -1
db 0, 1, -5, 17, 58, -10, 4, -1
+tab_LumaCoeffV: times 4 dw 0, 0
+ times 4 dw 0, 64
+ times 4 dw 0, 0
+ times 4 dw 0, 0
+
+ times 4 dw -1, 4
+ times 4 dw -10, 58
+ times 4 dw 17, -5
+ times 4 dw 1, 0
+
+ times 4 dw -1, 4
+ times 4 dw -11, 40
+ times 4 dw 40, -11
+ times 4 dw 4, -1
+
+ times 4 dw 0, 1
+ times 4 dw -5, 17
+ times 4 dw 58, -10
+ times 4 dw 4, -1
SECTION .text
@@ -523,8 +544,8 @@
pmaddubsw %1, %5
phaddw %4, %1
phaddw %2, %4
+ %if %0 == 8
pmulhrsw %2, %6
- %if %0 == 8
packuswb %2, %2
movh %8, %2
%endif
@@ -623,3 +644,133 @@
IPFILTER_LUMA 48, 64
IPFILTER_LUMA 64, 16
IPFILTER_LUMA 16, 64
+
+
+;-----------------------------------------------------------------------------
+; Interpolate HV
+;-----------------------------------------------------------------------------
+%macro FILTER_HV8_START 7 ; (a1..a5: xmm regs, a6: first source row index, a7: coeff pair index) -> output row 0 sums in (a3 low, a5 high), output row 1 sums in (a4 low, a1 high), a2 = source row a6+2 untouched
+ mova %5, [r0 + (%6 + 0) * 16] ; source row a6 (8 words, 16 bytes per row)
+ mova %1, [r0 + (%6 + 1) * 16] ; source row a6+1
+ mova %2, [r0 + (%6 + 2) * 16] ; source row a6+2, kept for the next tap pair
+ punpcklwd %3, %5, %1 ; interleave words of rows 0 and 1, low four columns
+ punpckhwd %5, %1 ; same for the high four columns
+ pmaddwd %3, [r5 + (%7) * 16] ; R3 = L[0+1] -- Row 0 (R-numbers are the xmm registers used at the call site)
+ pmaddwd %5, [r5 + (%7) * 16] ; R0 = H[0+1]
+ punpcklwd %4, %1, %2 ; interleave words of rows 1 and 2, low four columns
+ punpckhwd %1, %2 ; same for the high four columns
+ pmaddwd %4, [r5 + (%7) * 16] ; R4 = L[1+2] -- Row 1
+ pmaddwd %1, [r5 + (%7) * 16] ; R1 = H[1+2]
+%endmacro ; FILTER_HV8_START
+
+%macro FILTER_HV8_MID 10 ; (newRow, prevRow, sum0L, sum1L, sum0H, sum1H, tmp, tmp, off_src, off_coeff): adds one more tap pair to both output rows; newRow receives source row off_src+1; prevRow and both tmp registers are clobbered
+ mova %8, [r0 + (%9 + 0) * 16] ; source row off_src
+ mova %1, [r0 + (%9 + 1) * 16] ; source row off_src+1, left intact in newRow
+ punpcklwd %7, %2, %8 ; interleave prevRow with row off_src, low four columns
+ punpckhwd %2, %8 ; same for the high four columns
+ pmaddwd %7, [r5 + %10 * 16]
+ pmaddwd %2, [r5 + %10 * 16]
+ paddd %3, %7 ; R3 = L[0+1+2+3] -- Row 0 (R-numbers are the xmm registers used at the call site)
+ paddd %5, %2 ; R0 = H[0+1+2+3]
+ punpcklwd %7, %8, %1 ; interleave row off_src with row off_src+1, low four columns
+ punpckhwd %8, %1 ; same for the high four columns
+ pmaddwd %7, [r5 + %10 * 16]
+ pmaddwd %8, [r5 + %10 * 16]
+ paddd %4, %7 ; R4 = L[1+2+3+4] -- Row 1
+ paddd %6, %8 ; R1 = H[1+2+3+4]
+%endmacro ; FILTER_HV8_MID
+
+; Round, shift and saturate the 32-bit sums of two output rows down to 8-bit pixels
+%macro FILTER_HV8_END 4 ; (row0 low, row0 high, row1 low, row1 high) -> packed bytes in arg 1 (row 0) and arg 3 (row 1)
+ paddd %1, [tab_c_526336] ; each dword of tab_c_526336 is 8192*64 + 2048
+ paddd %2, [tab_c_526336] ; NOTE(review): 8192*64 presumably restores the 8192 subtracted per sample in the horizontal pass (each vertical tap set sums to 64) -- confirm
+ paddd %3, [tab_c_526336] ; 2048 is half of 1 << 12, i.e. round-to-nearest for the shift below
+ paddd %4, [tab_c_526336]
+ psrad %1, 12
+ psrad %2, 12
+ psrad %3, 12
+ psrad %4, 12
+ packssdw %1, %2 ; eight signed words for row 0
+ packssdw %3, %4 ; eight signed words for row 1
+
+ ; TODO: would packing both rows with a single packuswb be better? Packing them separately keeps the dependency chains short
+ packuswb %1, %1 ; saturate to unsigned bytes; the low 8 bytes hold the row
+ packuswb %3, %3
+%endmacro ; FILTER_HV8_END
+
+;-----------------------------------------------------------------------------
+; void interp_8tap_hv_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int idxX, int idxY)
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal interp_8tap_hv_pp_8x8, 4, 7, 8, 0-15*16
+%define coef m7
+%define stk_buf rsp
+ ; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; the stack area holds 15 intermediate rows of 16 bytes
+ mov r4d, r4m ; r4 = idxX
+ mov r5d, r5m ; r5 = idxY
+ ; load the 8 horizontal taps (8 bytes per filter index) and duplicate them into both halves of coef
+%ifdef PIC
+ lea r6, [tab_LumaCoeff]
+ movh coef, [r6 + r4 * 8]
+%else
+ movh coef, [tab_LumaCoeff + r4 * 8]
+%endif
+ punpcklqdq coef, coef
+
+ ; move src up to row -3 (r6 = 3 * srcStride)
+ lea r6, [r1 + r1 * 2]
+ sub r0, r6
+
+ xor r6, r6 ; r6 = horizontal row counter
+ mov r4, rsp ; r4 = write pointer into the intermediate buffer
+ ; horizontal pass: 8 + 7 rows, one 16-byte row of 16-bit samples per iteration
+.loopH:
+ FILTER_H8_W8 m0, m1, m2, m3, coef, [tab_c_512], [r0 - 3] ; 7-argument form: the round/pack/store tail guarded by the 8-argument check is skipped, result stays as words in m1
+ psubw m1, [tab_c_8192] ; subtract 8192 from every 16-bit sample before storing
+ mova [r4], m1
+
+ add r0, r1
+ add r4, 16
+ inc r6
+ cmp r6, 8+7
+ jnz .loopH
+
+ ; ready for the vertical phase
+ ; from here on all mN registers are free (coef/m7 is reused as a temporary)
+
+ ; point r5 at the vertical coefficient set: idxY * 64 bytes (4 tap pairs x 16 bytes) into tab_LumaCoeffV
+ shl r5, 6
+ lea r6, [tab_LumaCoeffV]
+ lea r5, [r5 + r6]
+
+ ; load the intermediate buffer address
+ mov r0, stk_buf
+
+ ; register mapping
+ ; r0 - src (intermediate rows on the stack)
+ ; r5 - coeff
+ ; r6 - loop_i
+
+ ; let's go
+ xor r6, r6
+
+ ; TODO: this loop has more than 70 instructions, which is probably more than the Intel loop decode cache can hold
+.loopV:
+ ; each iteration filters two output rows from intermediate rows 0..8 relative to r0
+ FILTER_HV8_START m1, m2, m3, m4, m0, 0, 0
+ FILTER_HV8_MID m6, m2, m3, m4, m0, m1, m7, m5, 3, 1
+ FILTER_HV8_MID m5, m6, m3, m4, m0, m1, m7, m2, 5, 2
+ FILTER_HV8_MID m6, m5, m3, m4, m0, m1, m7, m2, 7, 3
+ FILTER_HV8_END m3, m0, m4, m1
+
+ movq [r2], m3 ; 8 pixels of the first output row
+ movq [r2 + r3], m4 ; 8 pixels of the second output row
+
+ lea r0, [r0 + 16 * 2] ; advance two intermediate rows
+ lea r2, [r2 + r3 * 2] ; advance two destination rows
+
+ inc r6
+ cmp r6, 8/2
+ jnz .loopV
+
+ RET
diff -r ef2428fd32fe -r 4a14bd24b657 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Mon Oct 28 00:08:06 2013 -0500
+++ b/source/common/x86/ipfilter8.h Mon Oct 28 22:22:11 2013 +0800
@@ -88,6 +88,8 @@
CHROMA_FILTERS(_sse4);
LUMA_FILTERS(_sse4);
+void x265_interp_8tap_hv_pp_8x8_ssse3(pixel * src, intptr_t srcStride, pixel * dst, intptr_t dstStride, int idxX, int idxY);
+
#undef SETUP_CHROMA_FUNC_DEF
#undef SETUP_LUMA_FUNC_DEF
#undef CHROMA_FILTERS
diff -r ef2428fd32fe -r 4a14bd24b657 source/test/ipfilterharness.cpp
--- a/source/test/ipfilterharness.cpp Mon Oct 28 00:08:06 2013 -0500
+++ b/source/test/ipfilterharness.cpp Mon Oct 28 22:22:11 2013 +0800
@@ -325,6 +325,40 @@
return true;
}
+bool IPFilterHarness::check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt)
+{ // Runs ref and opt on identical random parameters and returns false on the first output mismatch.
+ int rand_srcStride, rand_dstStride, rand_coeffIdxX, rand_coeffIdxY;
+
+ for (int i = 0; i <= 1000; i++)
+ {
+ rand_coeffIdxX = rand() % 4; // Horizontal luma filter index 0..3, so all four coefficient sets are exercised
+ rand_coeffIdxY = rand() % 4; // Vertical luma filter index 0..3, so all four coefficient sets are exercised
+
+ rand_srcStride = rand() % 100; // Randomly generated srcStride
+ rand_dstStride = rand() % 100; // Randomly generated dstStride
+
+ ref(pixel_buff + 3 * rand_srcStride, // 3 rows of headroom: the HV filter starts reading 3 rows above src
+ rand_srcStride,
+ IPF_C_output_p,
+ rand_dstStride,
+ rand_coeffIdxX,
+ rand_coeffIdxY
+ );
+ opt(pixel_buff + 3 * rand_srcStride, // NOTE(review): the asm also reads 3 pixels left of src -- confirm pixel_buff has a left margin
+ rand_srcStride,
+ IPF_vec_output_p,
+ rand_dstStride,
+ rand_coeffIdxX,
+ rand_coeffIdxY
+ );
+
+ if (memcmp(IPF_vec_output_p, IPF_C_output_p, ipf_t_size))
+ return false;
+ }
+
+ return true;
+}
+
+
bool IPFilterHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
for (int value = 0; value < NUM_IPFILTER_P_P; value++)
@@ -421,6 +455,18 @@
}
}
+ for (int value = 0; value < NUM_LUMA_PARTITIONS; value++)
+ {
+ if (opt.luma_hvpp[value])
+ {
+ if (!check_IPFilterLumaHV_primitive(ref.luma_hvpp[value], opt.luma_hvpp[value]))
+ {
+ printf("luma_hvpp[%s]", lumaPartStr[value]);
+ return false;
+ }
+ }
+ }
+
return true;
}
@@ -486,6 +532,7 @@
REPORT_SPEEDUP(opt.luma_hpp[value], ref.luma_hpp[value],
pixel_buff + srcStride, srcStride, IPF_vec_output_p, dstStride, 1);
}
+
if (opt.luma_vpp[value])
{
printf("luma_vpp[%s]\t", lumaPartStr[value]);
@@ -493,6 +540,13 @@
pixel_buff + maxVerticalfilterHalfDistance * srcStride, srcStride,
IPF_vec_output_p, dstStride, 1);
}
+
+ if (opt.luma_hvpp[value])
+ { // The HV filter reads from src - 3 * srcStride, so src must start 3 rows into pixel_buff (as in the correctness check)
+ printf("luma_hv [%s]\t", lumaPartStr[value]);
+ REPORT_SPEEDUP(opt.luma_hvpp[value], ref.luma_hvpp[value],
+ pixel_buff + 3 * srcStride, srcStride, IPF_vec_output_p, dstStride, 1, 3);
+ }
}
for (int value = 0; value < NUM_CHROMA_PARTITIONS; value++)
diff -r ef2428fd32fe -r 4a14bd24b657 source/test/ipfilterharness.h
--- a/source/test/ipfilterharness.h Mon Oct 28 00:08:06 2013 -0500
+++ b/source/test/ipfilterharness.h Mon Oct 28 22:22:11 2013 +0800
@@ -48,6 +48,7 @@
bool check_IPFilter_primitive(ipfilter_s2p_t ref, ipfilter_s2p_t opt);
bool check_IPFilterChroma_primitive(filter_pp_t ref, filter_pp_t opt);
bool check_IPFilterLuma_primitive(filter_pp_t ref, filter_pp_t opt);
+ bool check_IPFilterLumaHV_primitive(filter_hv_pp_t ref, filter_hv_pp_t opt);
public:
More information about the x265-devel
mailing list