[x265] [PATCH] asm: 10bpp code of pixel_sub for 16xN, 24x32, 32xN, 48x64 and 64xN
murugan at multicorewareinc.com
Fri Dec 6 13:53:54 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386334392 -19800
# Fri Dec 06 18:23:12 2013 +0530
# Node ID 9a9eaf4dadd7e191038e56341fac23aad60a10db
# Parent 6ed44381cb6c2fab0bc550170e5bfacc68aaaf4e
asm: 10bpp code of pixel_sub for 16xN, 24x32, 32xN, 48x64 and 64xN
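
For context, the primitive being accelerated computes a signed 16-bit residual block, dest = src0 - src1, with the signature quoted in the asm comments below. The following is a minimal C sketch, not x265's actual reference code: it assumes `pixel` is uint16_t at 10bpp (uint8_t otherwise) and that all strides are given in elements; the width/height parameters are added here only for illustration, since the real kernels are specialized per block size (e.g. x265_pixel_sub_ps_16x16).

#include <stdint.h>

#if HIGH_BIT_DEPTH
typedef uint16_t pixel;   /* 10bpp: samples are 16 bits wide */
#else
typedef uint8_t  pixel;   /* 8bpp: samples are bytes */
#endif

/* Hypothetical generic form of x265_pixel_sub_ps_WxH: subtract two pixel
 * blocks into a signed 16-bit residual block. */
static void pixel_sub_ps_c(int16_t *dest, intptr_t destride,
                           pixel *src0, pixel *src1,
                           intptr_t srcstride0, intptr_t srcstride1,
                           int width, int height)
{
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
            dest[x] = (int16_t)(src0[x] - src1[x]);   /* residual = src0 - src1 */

        dest += destride;
        src0 += srcstride0;
        src1 += srcstride1;
    }
}

This also explains the extra "add r4, r4" / "add r5, r5" in the HIGH_BIT_DEPTH paths below: the source strides arrive in pixels and must be doubled into byte offsets once each sample occupies two bytes.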
diff -r 6ed44381cb6c -r 9a9eaf4dadd7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Dec 05 16:05:53 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp Fri Dec 06 18:23:12 2013 +0530
@@ -300,9 +300,11 @@
p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
p.luma_copy_ps[LUMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu; \
- p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu; \
p.luma_add_ps[LUMA_ ## W ## x ## H] = x265_pixel_add_ps_ ## W ## x ## H ## cpu;
+#define SETUP_LUMA_SUB_FUNC_DEF(W, H, cpu) \
+ p.luma_sub_ps[LUMA_ ## W ## x ## H] = x265_pixel_sub_ps_ ## W ## x ## H ## cpu;
+
#define SETUP_LUMA_SP_FUNC_DEF(W, H, cpu) \
p.luma_vsp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_sp_ ## W ## x ## H ## cpu;
@@ -398,6 +400,33 @@
SETUP_LUMA_FUNC_DEF(64, 16, cpu); \
SETUP_LUMA_FUNC_DEF(16, 64, cpu);
+#define LUMA_PIXELSUB(cpu) \
+ SETUP_LUMA_SUB_FUNC_DEF(4, 4, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(8, 8, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(8, 4, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(4, 8, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(16, 16, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(16, 8, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(8, 16, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(16, 12, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(12, 16, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(16, 4, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(4, 16, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(32, 32, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(32, 16, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(16, 32, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(32, 24, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(24, 32, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(32, 8, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(8, 32, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(64, 64, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(64, 32, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(32, 64, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(64, 48, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(48, 64, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(64, 16, cpu); \
+ SETUP_LUMA_SUB_FUNC_DEF(16, 64, cpu);
+
#define LUMA_SP_FILTERS(cpu) \
SETUP_LUMA_SP_FUNC_DEF(4, 4, cpu); \
SETUP_LUMA_SP_FUNC_DEF(8, 8, cpu); \
@@ -632,20 +661,8 @@
p.cvt32to16_shr = x265_cvt32to16_shr_sse2;
p.cvt16to32_shl = x265_cvt16to32_shl_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_4x8] = x265_pixel_sub_ps_2x4_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_4x16] = x265_pixel_sub_ps_2x8_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_8x4] = x265_pixel_sub_ps_4x2_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_8x8] = x265_pixel_sub_ps_4x4_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_8x16] = x265_pixel_sub_ps_4x8_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_8x32] = x265_pixel_sub_ps_4x16_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_12x16] = x265_pixel_sub_ps_6x8_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_16x4] = x265_pixel_sub_ps_8x2_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_16x8] = x265_pixel_sub_ps_8x4_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_16x12] = x265_pixel_sub_ps_8x6_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_16x16] = x265_pixel_sub_ps_8x8_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_16x32] = x265_pixel_sub_ps_8x16_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_16x64] = x265_pixel_sub_ps_8x32_sse2;
- p.chroma[X265_CSP_I420].sub_ps[LUMA_24x32] = x265_pixel_sub_ps_12x16_sse2;
+ CHROMA_PIXELSUB_PS(_sse2);
+ LUMA_PIXELSUB(_sse2);
}
if (cpuMask & X265_CPU_SSSE3)
{
@@ -843,6 +860,7 @@
LUMA_SSE_SP(_sse4);
CHROMA_PIXELSUB_PS(_sse4);
+ LUMA_PIXELSUB(_sse4);
CHROMA_FILTERS(_sse4);
LUMA_FILTERS(_sse4);
diff -r 6ed44381cb6c -r 9a9eaf4dadd7 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Thu Dec 05 16:05:53 2013 -0600
+++ b/source/common/x86/pixel-util.h Fri Dec 06 18:23:12 2013 +0530
@@ -122,6 +122,7 @@
CHROMA_PIXELSUB_DEF(_sse4);
LUMA_PIXELSUB_DEF(_sse4);
CHROMA_PIXELSUB_DEF(_sse2);
+LUMA_PIXELSUB_DEF(_sse2);
#define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \
uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel * pix, intptr_t pixstride);
diff -r 6ed44381cb6c -r 9a9eaf4dadd7 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Dec 05 16:05:53 2013 -0600
+++ b/source/common/x86/pixel-util8.asm Fri Dec 06 18:23:12 2013 +0530
@@ -2937,22 +2937,63 @@
; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W16_H4 2
-INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
+ add r1, r1
+ mov r6d, %2/4
+ add r4, r4
+ add r5, r5
+.loop
+ movu m0, [r2]
+ movu m1, [r3]
+ movu m2, [r2 + 16]
+ movu m3, [r3 + 16]
+ psubw m0, m1
+ psubw m2, m3
+ movu m4, [r2 + r4]
+ movu m5, [r3 + r5]
+ movu m1, [r2 + r4 + 16]
+ movu m3, [r3 + r5 + 16]
+ psubw m4, m5
+ psubw m1, m3
+ lea r2, [r2 + 2 * r4]
+ lea r3, [r3 + 2 * r5]
+
+ movu [r0], m0
+ movu [r0 + 16], m2
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 16], m1
+
+ movu m0, [r2]
+ movu m1, [r3]
+ movu m2, [r2 + 16]
+ movu m3, [r3 + 16]
+ psubw m0, m1
+ psubw m2, m3
+ movu m4, [r2 + r4]
+ movu m5, [r3 + r5]
+ movu m1, [r2 + r4 + 16]
+ movu m3, [r3 + r5 + 16]
+ psubw m4, m5
+ psubw m1, m3
+ lea r0, [r0 + 2 * r1]
+
+ movu [r0], m0
+ movu [r0 + 16], m2
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 16], m1
+%else
cglobal pixel_sub_ps_%1x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
-
-add r1, r1
-mov r6d, %2/4
-pxor m6, m6
-
+ add r1, r1
+ mov r6d, %2/4
+ pxor m6, m6
.loop
-
movu m1, [r2]
pmovzxbw m0, m1
punpckhbw m1, m6
movu m3, [r3]
pmovzxbw m2, m3
punpckhbw m3, m6
-
psubw m0, m2
psubw m1, m3
@@ -2962,7 +3003,6 @@
movu m3, [r3 + r5]
pmovzxbw m2, m3
punpckhbw m3, m6
-
psubw m4, m2
psubw m5, m3
@@ -2977,62 +3017,96 @@
movu m3, [r3 + 2 * r5]
pmovzxbw m2, m3
punpckhbw m3, m6
-
lea r2, [r2 + 2 * r4]
lea r3, [r3 + 2 * r5]
-
psubw m0, m2
psubw m1, m3
-
movu m5, [r2 + r4]
pmovzxbw m4, m5
punpckhbw m5, m6
movu m3, [r3 + r5]
pmovzxbw m2, m3
punpckhbw m3, m6
-
psubw m4, m2
psubw m5, m3
movu [r0 + 2 * r1], m0
movu [r0 + 2 * r1 + 16], m1
-
lea r0, [r0 + 2 * r1]
-
movu [r0 + r1], m4
movu [r0 + r1 + 16], m5
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
-
+%endif
dec r6d
-
-jnz .loop
+ lea r2, [r2 + 2 * r4]
+ lea r3, [r3 + 2 * r5]
+ lea r0, [r0 + 2 * r1]
+ jnz .loop
RET
%endmacro
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
PIXELSUB_PS_W16_H4 16, 4
PIXELSUB_PS_W16_H4 16, 8
PIXELSUB_PS_W16_H4 16, 12
PIXELSUB_PS_W16_H4 16, 16
PIXELSUB_PS_W16_H4 16, 32
PIXELSUB_PS_W16_H4 16, 64
+%else
+INIT_XMM sse4
+PIXELSUB_PS_W16_H4 16, 4
+PIXELSUB_PS_W16_H4 16, 8
+PIXELSUB_PS_W16_H4 16, 12
+PIXELSUB_PS_W16_H4 16, 16
+PIXELSUB_PS_W16_H4 16, 32
+PIXELSUB_PS_W16_H4 16, 64
+%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W24_H2 2
-INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
+ add r1, r1
+ mov r6d, %2/2
+ add r4, r4
+ add r5, r5
+.loop
+ movu m0, [r2]
+ movu m1, [r3]
+ movu m2, [r2 + 16]
+ movu m3, [r3 + 16]
+ movu m4, [r2 + 32]
+ movu m5, [r3 + 32]
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+
+ movu [r0], m0
+ movu [r0 + 16], m2
+ movu [r0 + 32], m4
+
+ movu m0, [r2 + r4]
+ movu m1, [r3 + r5]
+ movu m2, [r2 + r4 + 16]
+ movu m3, [r3 + r5 + 16]
+ movu m4, [r2 + r4 + 32]
+ movu m5, [r3 + r5 + 32]
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m2
+ movu [r0 + r1 + 32], m4
+%else
cglobal pixel_sub_ps_%1x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
-
-add r1, r1
-mov r6d, %2/2
-pxor m6, m6
-
+ add r1, r1
+ mov r6d, %2/2
+ pxor m6, m6
.loop
-
movu m1, [r2]
pmovzxbw m0, m1
punpckhbw m1, m6
@@ -3043,7 +3117,6 @@
punpckhbw m4, m6
movh m5, [r3 + 16]
pmovzxbw m5, m5
-
psubw m0, m3
psubw m1, m4
psubw m2, m5
@@ -3062,7 +3135,6 @@
punpckhbw m4, m6
movh m5, [r3 + r5 + 16]
pmovzxbw m5, m5
-
psubw m0, m3
psubw m1, m4
psubw m2, m5
@@ -3070,54 +3142,93 @@
movu [r0 + r1], m0
movu [r0 + r1 + 16], m1
movu [r0 + r1 + 32], m2
-
+%endif
+
+ dec r6d
lea r2, [r2 + 2 * r4]
lea r3, [r3 + 2 * r5]
lea r0, [r0 + 2 * r1]
-
- dec r6d
-
-jnz .loop
+ jnz .loop
RET
%endmacro
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
PIXELSUB_PS_W24_H2 24, 32
+%else
+INIT_XMM sse4
+PIXELSUB_PS_W24_H2 24, 32
+%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W32_H2 2
-INIT_XMM sse4
-cglobal pixel_sub_ps_%1x%2, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
-
-add r1, r1
-mov r6d, %2/2
-
+cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
+ add r1, r1
+ mov r6d, %2/2
+%if HIGH_BIT_DEPTH
+ add r4, r4
+ add r5, r5
.loop
-
+ movu m0, [r2]
+ movu m1, [r3]
+ movu m2, [r2 + 16]
+ movu m3, [r3 + 16]
+ movu m4, [r2 + 32]
+ movu m5, [r3 + 32]
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ movu m3, [r2 + 48]
+ movu m5, [r3 + 48]
+ psubw m3, m5
+
+ movu [r0], m0
+ movu [r0 + 16], m2
+ movu [r0 + 32], m4
+ movu [r0 + 48], m3
+
+ movu m0, [r2 + r4]
+ movu m1, [r3 + r5]
+ movu m2, [r2 + r4 + 16]
+ movu m3, [r3 + r5 + 16]
+ movu m4, [r2 + r4 + 32]
+ movu m5, [r3 + r5 + 32]
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ movu m3, [r2 + r4 + 48]
+ movu m5, [r3 + r5 + 48]
+ psubw m3, m5
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m2
+ movu [r0 + r1 + 32], m4
+ movu [r0 + r1 + 48], m3
+%else
+.loop
movh m0, [r2]
movh m1, [r2 + 8]
movh m2, [r2 + 16]
- movh m3, [r2 + 24]
- movh m4, [r3]
- movh m5, [r3 + 8]
- movh m6, [r3 + 16]
- movh m7, [r3 + 24]
-
+ movh m3, [r3]
+ movh m4, [r3 + 8]
+ movh m5, [r3 + 16]
pmovzxbw m0, m0
pmovzxbw m1, m1
pmovzxbw m2, m2
pmovzxbw m3, m3
pmovzxbw m4, m4
pmovzxbw m5, m5
- pmovzxbw m6, m6
- pmovzxbw m7, m7
-
- psubw m0, m4
- psubw m1, m5
- psubw m2, m6
- psubw m3, m7
+ psubw m0, m3
+ psubw m1, m4
+ psubw m2, m5
+ movh m3, [r2 + 24]
+ movh m4, [r3 + 24]
+ pmovzxbw m4, m4
+ pmovzxbw m3, m3
+ psubw m3, m4
movu [r0], m0
movu [r0 + 16], m1
@@ -3127,61 +3238,126 @@
movh m0, [r2 + r4]
movh m1, [r2 + r4 + 8]
movh m2, [r2 + r4 + 16]
- movh m3, [r2 + r4 + 24]
- movh m4, [r3 + r5]
- movh m5, [r3 + r5 + 8]
- movh m6, [r3 + r5 + 16]
- movh m7, [r3 + r5 + 24]
-
+ movh m3, [r3 + r5]
+ movh m4, [r3 + r5 + 8]
+ movh m5, [r3 + r5 + 16]
pmovzxbw m0, m0
pmovzxbw m1, m1
pmovzxbw m2, m2
pmovzxbw m3, m3
pmovzxbw m4, m4
pmovzxbw m5, m5
- pmovzxbw m6, m6
- pmovzxbw m7, m7
-
- psubw m0, m4
- psubw m1, m5
- psubw m2, m6
- psubw m3, m7
+ psubw m0, m3
+ psubw m1, m4
+ psubw m2, m5
+ movh m3, [r2 + r4 + 24]
+ movh m4, [r3 + r5 + 24]
+ pmovzxbw m3, m3
+ pmovzxbw m4, m4
+ psubw m3, m4
movu [r0 + r1], m0
movu [r0 + r1 + 16], m1
movu [r0 + r1 + 32], m2
movu [r0 + r1 + 48], m3
-
+%endif
+ dec r6d
lea r2, [r2 + 2 * r4]
lea r3, [r3 + 2 * r5]
lea r0, [r0 + 2 * r1]
-
- dec r6d
-
-jnz .loop
-
-RET
+ jnz .loop
+ RET
%endmacro
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
PIXELSUB_PS_W32_H2 32, 8
PIXELSUB_PS_W32_H2 32, 16
PIXELSUB_PS_W32_H2 32, 24
PIXELSUB_PS_W32_H2 32, 32
PIXELSUB_PS_W32_H2 32, 64
+%else
+INIT_XMM sse4
+PIXELSUB_PS_W32_H2 32, 8
+PIXELSUB_PS_W32_H2 32, 16
+PIXELSUB_PS_W32_H2 32, 24
+PIXELSUB_PS_W32_H2 32, 32
+PIXELSUB_PS_W32_H2 32, 64
+%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W48_H2 2
-INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
+ add r1, r1
+ mov r6d, %2/2
+ add r4, r4
+ add r5, r5
+.loop
+ movu m0, [r2]
+ movu m1, [r3]
+ movu m2, [r2 + 16]
+ movu m3, [r3 + 16]
+ movu m4, [r2 + 32]
+ movu m5, [r3 + 32]
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+
+ movu [r0], m0
+ movu [r0 + 16], m2
+ movu [r0 + 32], m4
+
+ movu m0, [r2 + 48]
+ movu m1, [r3 + 48]
+ movu m2, [r2 + 64]
+ movu m3, [r3 + 64]
+ movu m4, [r2 + 80]
+ movu m5, [r3 + 80]
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+
+ movu [r0 + 48], m0
+ movu [r0 + 64], m2
+ movu [r0 + 80], m4
+
+ movu m0, [r2 + r4]
+ movu m1, [r3 + r5]
+ movu m2, [r2 + r4 + 16]
+ movu m3, [r3 + r5 + 16]
+ movu m4, [r2 + r4 + 32]
+ movu m5, [r3 + r5 + 32]
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m2
+ movu [r0 + r1 + 32], m4
+
+ movu m0, [r2 + r4 + 48]
+ movu m1, [r3 + r5 + 48]
+ movu m2, [r2 + r4 + 64]
+ movu m3, [r3 + r5 + 64]
+ movu m4, [r2 + r4 + 80]
+ movu m5, [r3 + r5 + 80]
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+
+ movu [r0 + r1 + 48], m0
+ movu [r0 + r1 + 64], m2
+ movu [r0 + r1 + 80], m4
+%else
+
cglobal pixel_sub_ps_%1x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
-
-add r1, r1
-mov r6d, %2/2
-pxor m6, m6
-
+ add r1, r1
+ mov r6d, %2/2
+ pxor m6, m6
.loop
-
movu m1, [r2]
pmovzxbw m0, m1
punpckhbw m1, m6
@@ -3191,7 +3367,6 @@
movu m5, [r2 + 16]
pmovzxbw m4, m5
punpckhbw m5, m6
-
psubw m0, m2
psubw m1, m3
@@ -3201,7 +3376,6 @@
movu m3, [r3 + 16]
pmovzxbw m2, m3
punpckhbw m3, m6
-
psubw m4, m2
psubw m5, m3
@@ -3214,7 +3388,6 @@
movu m3, [r3 + 32]
pmovzxbw m2, m3
punpckhbw m3, m6
-
psubw m0, m2
psubw m1, m3
@@ -3230,7 +3403,6 @@
movu m5, [r2 + r5 + 16]
pmovzxbw m4, m5
punpckhbw m5, m6
-
psubw m0, m2
psubw m1, m3
@@ -3240,7 +3412,6 @@
movu m3, [r3 + r4 + 16]
pmovzxbw m2, m3
punpckhbw m3, m6
-
psubw m4, m2
psubw m5, m3
@@ -3253,39 +3424,119 @@
movu m3, [r3 + r5 + 32]
pmovzxbw m2, m3
punpckhbw m3, m6
-
psubw m0, m2
psubw m1, m3
movu [r0 + r1 + 64], m0
movu [r0 + r1 + 80], m1
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
-
+%endif
dec r6d
-
-jnz .loop
+ lea r2, [r2 + 2 * r4]
+ lea r3, [r3 + 2 * r5]
+ lea r0, [r0 + 2 * r1]
+ jnz .loop
RET
%endmacro
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
PIXELSUB_PS_W48_H2 48, 64
+%else
+INIT_XMM sse4
+PIXELSUB_PS_W48_H2 48, 64
+%endif
;-----------------------------------------------------------------------------
; void pixel_sub_ps_c_%1x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
%macro PIXELSUB_PS_W64_H2 2
-INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+cglobal pixel_sub_ps_%1x%2, 6, 7, 6, dest, deststride, src0, src1, srcstride0, srcstride1
+ add r1, r1
+ mov r6d, %2/2
+ add r4, r4
+ add r5, r5
+.loop
+ movu m0, [r2]
+ movu m1, [r3]
+ movu m2, [r2 + 16]
+ movu m3, [r3 + 16]
+ movu m4, [r2 + 32]
+ movu m5, [r3 + 32]
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ movu m3, [r2 + 48]
+ movu m5, [r3 + 48]
+ psubw m3, m5
+
+ movu [r0], m0
+ movu [r0 + 16], m2
+ movu [r0 + 32], m4
+ movu [r0 + 48], m3
+
+ movu m0, [r2 + 64]
+ movu m1, [r3 + 64]
+ movu m2, [r2 + 80]
+ movu m3, [r3 + 80]
+ movu m4, [r2 + 96]
+ movu m5, [r3 + 96]
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ movu m3, [r2 + 112]
+ movu m5, [r3 + 112]
+ psubw m3, m5
+
+ movu [r0 + 64], m0
+ movu [r0 + 80], m2
+ movu [r0 + 96], m4
+ movu [r0 + 112], m3
+
+ movu m0, [r2 + r4]
+ movu m1, [r3 + r5]
+ movu m2, [r2 + r4 + 16]
+ movu m3, [r3 + r5 + 16]
+ movu m4, [r2 + r4 + 32]
+ movu m5, [r3 + r5 + 32]
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ movu m3, [r2 + r4 + 48]
+ movu m5, [r3 + r5 + 48]
+ psubw m3, m5
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m2
+ movu [r0 + r1 + 32], m4
+ movu [r0 + r1 + 48], m3
+
+ movu m0, [r2 + r4 + 64]
+ movu m1, [r3 + r5 + 64]
+ movu m2, [r2 + r4 + 80]
+ movu m3, [r3 + r5 + 80]
+ movu m4, [r2 + r4 + 96]
+ movu m5, [r3 + r5 + 96]
+ psubw m0, m1
+ psubw m2, m3
+ psubw m4, m5
+ movu m3, [r2 + r4 + 112]
+ movu m5, [r3 + r5 + 112]
+ psubw m3, m5
+
+ movu [r0 + r1 + 64], m0
+ movu [r0 + r1 + 80], m2
+ movu [r0 + r1 + 96], m4
+ movu [r0 + r1 + 112], m3
+
+%else
+
cglobal pixel_sub_ps_%1x%2, 6, 7, 7, dest, deststride, src0, src1, srcstride0, srcstride1
-
-add r1, r1
-mov r6d, %2/2
-pxor m6, m6
-
+ add r1, r1
+ mov r6d, %2/2
+ pxor m6, m6
.loop
-
movu m1, [r2]
pmovzxbw m0, m1
punpckhbw m1, m6
@@ -3295,7 +3546,6 @@
movu m5, [r2 + 16]
pmovzxbw m4, m5
punpckhbw m5, m6
-
psubw m0, m2
psubw m1, m3
@@ -3308,7 +3558,6 @@
movu m3, [r2 + 32]
pmovzxbw m2, m3
punpckhbw m3, m6
-
psubw m4, m0
psubw m5, m1
@@ -3321,7 +3570,6 @@
movu m1, [r2 + 48]
pmovzxbw m0, m1
punpckhbw m1, m6
-
psubw m2, m4
psubw m3, m5
@@ -3334,7 +3582,6 @@
movu m5, [r2 + r4]
pmovzxbw m4, m5
punpckhbw m5, m6
-
psubw m0, m2
psubw m1, m3
@@ -3347,7 +3594,6 @@
movu m3, [r2 + r4 + 16]
pmovzxbw m2, m3
punpckhbw m3, m6
-
psubw m4, m0
psubw m5, m1
@@ -3360,7 +3606,6 @@
movu m1, [r2 + r4 + 32]
pmovzxbw m0, m1
punpckhbw m1, m6
-
psubw m2, m4
psubw m3, m5
@@ -3373,7 +3618,6 @@
movu m5, [r2 + r4 + 48]
pmovzxbw m4, m5
punpckhbw m5, m6
-
psubw m0, m2
psubw m1, m3
@@ -3383,28 +3627,33 @@
movu m1, [r3 + r5 + 48]
pmovzxbw m0, m1
punpckhbw m1, m6
-
psubw m4, m0
psubw m5, m1
movu [r0 + r1 + 96], m4
movu [r0 + r1 + 112], m5
-
- lea r2, [r2 + 2 * r4]
- lea r3, [r3 + 2 * r5]
- lea r0, [r0 + 2 * r1]
-
+%endif
dec r6d
-
-jnz .loop
-
-RET
+ lea r2, [r2 + 2 * r4]
+ lea r3, [r3 + 2 * r5]
+ lea r0, [r0 + 2 * r1]
+ jnz .loop
+ RET
%endmacro
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
PIXELSUB_PS_W64_H2 64, 16
PIXELSUB_PS_W64_H2 64, 32
PIXELSUB_PS_W64_H2 64, 48
PIXELSUB_PS_W64_H2 64, 64
+%else
+INIT_XMM sse4
+PIXELSUB_PS_W64_H2 64, 16
+PIXELSUB_PS_W64_H2 64, 32
+PIXELSUB_PS_W64_H2 64, 48
+PIXELSUB_PS_W64_H2 64, 64
+%endif
;=============================================================================
; variance