[x265] [PATCH] asm: added code for blockcopy_ss and cleaned up asm primitives of blockcopy
murugan at multicorewareinc.com
Fri Mar 7 11:06:37 CET 2014
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1394185273 -19800
# Fri Mar 07 15:11:13 2014 +0530
# Node ID 2bf727dca27d6f69e96d4412850661cbe036cbef
# Parent 33b67a53b6deb19bd5b5142398f7c8c47ba3d2fa
asm: added code for blockcopy_ss and cleaned up asm primitives of blockcopy
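
For reference, blockcopy_ss is a plain strided copy of 16-bit coefficient blocks, with the signature shown in the asm comments below. A minimal C sketch of the intended behaviour (illustration only, not part of this patch; strides are counted in int16_t elements, which is why the asm doubles r1/r3 before use):

    /* hypothetical C reference for blockcopy_ss, not in this patch */
    static void blockcopy_ss_c(int16_t *dest, intptr_t deststride,
                               int16_t *src, intptr_t srcstride,
                               int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dest[x] = src[x];    /* copy one row of coefficients */

            dest += deststride;      /* advance by stride in elements */
            src += srcstride;
        }
    }
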
diff -r 33b67a53b6de -r 2bf727dca27d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Mar 06 21:27:55 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp Fri Mar 07 15:11:13 2014 +0530
@@ -400,8 +400,7 @@
p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu; \
p.luma_hps[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_ps_ ## W ## x ## H ## cpu; \
p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu; \
- p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu; \
- p.luma_copy_ps[LUMA_ ## W ## x ## H] = x265_blockcopy_ps_ ## W ## x ## H ## cpu;
+ p.luma_vps[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ps_ ## W ## x ## H ## cpu;
#endif
#define SETUP_LUMA_SUB_FUNC_DEF(W, H, cpu) \
@@ -414,38 +413,91 @@
#define SETUP_LUMA_SS_FUNC_DEF(W, H, cpu) \
p.luma_vss[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_ss_ ## W ## x ## H ## cpu;
-#define SETUP_LUMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \
- p.luma_copy_pp[LUMA_ ## W ## x ## H] = x265_blockcopy_pp_ ## W ## x ## H ## cpu;
+#define SETUP_LUMA_BLOCKCOPY(type, W, H, cpu) \
+ p.luma_copy_ ## type[LUMA_ ## W ## x ## H] = x265_blockcopy_ ## type ## _ ## W ## x ## H ## cpu;
-#define SETUP_CHROMA_FROM_LUMA(W1, H1, W2, H2, cpu) \
- p.chroma[X265_CSP_I420].copy_pp[LUMA_ ## W1 ## x ## H1] = x265_blockcopy_pp_ ## W2 ## x ## H2 ## cpu;
+#define SETUP_CHROMA_BLOCKCOPY(type, W, H, cpu) \
+ p.chroma[X265_CSP_I420].copy_ ## type[CHROMA_ ## W ## x ## H] = x265_blockcopy_ ## type ##_ ## W ## x ## H ## cpu;
-// For X265_CSP_I420 chroma width and height will be half of luma width and height
-#define CHROMA_BLOCKCOPY(cpu) \
- SETUP_CHROMA_FROM_LUMA(8, 8, 4, 4, cpu); \
- SETUP_CHROMA_FROM_LUMA(8, 4, 4, 2, cpu); \
- SETUP_CHROMA_FROM_LUMA(4, 8, 2, 4, cpu); \
- SETUP_CHROMA_FROM_LUMA(16, 16, 8, 8, cpu); \
- SETUP_CHROMA_FROM_LUMA(16, 8, 8, 4, cpu); \
- SETUP_CHROMA_FROM_LUMA(8, 16, 4, 8, cpu); \
- SETUP_CHROMA_FROM_LUMA(16, 12, 8, 6, cpu); \
- SETUP_CHROMA_FROM_LUMA(12, 16, 6, 8, cpu); \
- SETUP_CHROMA_FROM_LUMA(16, 4, 8, 2, cpu); \
- SETUP_CHROMA_FROM_LUMA(4, 16, 2, 8, cpu); \
- SETUP_CHROMA_FROM_LUMA(32, 32, 16, 16, cpu); \
- SETUP_CHROMA_FROM_LUMA(32, 16, 16, 8, cpu); \
- SETUP_CHROMA_FROM_LUMA(16, 32, 8, 16, cpu); \
- SETUP_CHROMA_FROM_LUMA(32, 24, 16, 12, cpu); \
- SETUP_CHROMA_FROM_LUMA(24, 32, 12, 16, cpu); \
- SETUP_CHROMA_FROM_LUMA(32, 8, 16, 4, cpu); \
- SETUP_CHROMA_FROM_LUMA(8, 32, 4, 16, cpu); \
- SETUP_CHROMA_FROM_LUMA(64, 64, 32, 32, cpu); \
- SETUP_CHROMA_FROM_LUMA(64, 32, 32, 16, cpu); \
- SETUP_CHROMA_FROM_LUMA(32, 64, 16, 32, cpu); \
- SETUP_CHROMA_FROM_LUMA(64, 48, 32, 24, cpu); \
- SETUP_CHROMA_FROM_LUMA(48, 64, 24, 32, cpu); \
- SETUP_CHROMA_FROM_LUMA(64, 16, 32, 8, cpu); \
- SETUP_CHROMA_FROM_LUMA(16, 64, 8, 32, cpu);
+#define CHROMA_BLOCKCOPY(type, cpu) \
+ SETUP_CHROMA_BLOCKCOPY(type, 2, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 2, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 4, 2, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 4, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 4, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 4, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 6, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 8, 2, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 8, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 8, 6, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 8, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 8, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 8, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 12, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 16, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 16, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 16, 12, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 16, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 16, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 24, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 32, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 32, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 32, 24, cpu); \
+ SETUP_CHROMA_BLOCKCOPY(type, 32, 32, cpu);
+
+#define LUMA_BLOCKCOPY(type, cpu) \
+ SETUP_LUMA_BLOCKCOPY(type, 4, 4, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 8, 8, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 8, 4, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 4, 8, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 16, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 16, 8, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 8, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 16, 12, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 12, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 16, 4, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 4, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 32, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 32, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 16, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 32, 24, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 24, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 32, 8, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 8, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 64, 64, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 64, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 32, 64, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 64, 48, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 48, 64, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 64, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY(type, 16, 64, cpu);
+
+#define SETUP_CHROMA_BLOCKCOPY_SP(W, H, cpu) \
+ p.chroma[X265_CSP_I420].copy_sp[CHROMA_ ## W ## x ## H] = x265_blockcopy_sp_ ## W ## x ## H ## cpu;
+
+#define CHROMA_BLOCKCOPY_SP(cpu) \
+ SETUP_CHROMA_BLOCKCOPY_SP(4, 2, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(4, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(4, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(4, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(8, 2, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(8, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(8, 6, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(8, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(8, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(8, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(12, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(16, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(16, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(16, 12, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(16, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(16, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(24, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(32, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(32, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(32, 24, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_SP(32, 32, cpu);
+
#define SETUP_CHROMA_LUMA(W1, H1, W2, H2, cpu) \
p.chroma[X265_CSP_I420].sub_ps[LUMA_ ## W1 ## x ## H1] = x265_pixel_sub_ps_ ## W2 ## x ## H2 ## cpu; \
@@ -585,33 +637,6 @@
SETUP_LUMA_SS_FUNC_DEF(64, 16, cpu); \
SETUP_LUMA_SS_FUNC_DEF(16, 64, cpu);
-#define LUMA_BLOCKCOPY(cpu) \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(4, 4, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8, 8, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8, 4, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(4, 8, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 8, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 12, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(12, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 4, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(4, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 24, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(24, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 8, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 64, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 64, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 48, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(48, 64, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu);
-
#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \
p.var[BLOCK_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
@@ -963,8 +988,8 @@
CHROMA_PIXELSUB_PS(_sse2);
LUMA_PIXELSUB(_sse2);
- CHROMA_BLOCKCOPY(_sse2);
- LUMA_BLOCKCOPY(_sse2);
+ CHROMA_BLOCKCOPY(ss, _sse2);
+ LUMA_BLOCKCOPY(ss, _sse2);
CHROMA_VERT_FILTERS(_sse2);
p.chroma_p2s[X265_CSP_I420] = x265_chroma_p2s_sse2;
@@ -1043,14 +1068,16 @@
for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
{
- p.luma_copy_ps[i] = (copy_ps_t)p.luma_copy_pp[i];
- p.luma_copy_sp[i] = (copy_sp_t)p.luma_copy_pp[i];
+ p.luma_copy_ps[i] = (copy_ps_t)p.luma_copy_ss[i];
+ p.luma_copy_sp[i] = (copy_sp_t)p.luma_copy_ss[i];
+ p.luma_copy_pp[i] = (copy_pp_t)p.luma_copy_ss[i];
}
for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
{
- p.chroma[X265_CSP_I420].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I420].copy_pp[i];
- p.chroma[X265_CSP_I420].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I420].copy_pp[i];
+ p.chroma[X265_CSP_I420].copy_ps[i] = (copy_ps_t)p.chroma[X265_CSP_I420].copy_ss[i];
+ p.chroma[X265_CSP_I420].copy_sp[i] = (copy_sp_t)p.chroma[X265_CSP_I420].copy_ss[i];
+ p.chroma[X265_CSP_I420].copy_pp[i] = (copy_pp_t)p.chroma[X265_CSP_I420].copy_ss[i];
}
#else // if HIGH_BIT_DEPTH
@@ -1096,8 +1123,12 @@
INIT2(sad_x4, _sse2);
HEVC_SATD(sse2);
- CHROMA_BLOCKCOPY(_sse2);
- LUMA_BLOCKCOPY(_sse2);
+ CHROMA_BLOCKCOPY(ss, _sse2);
+ CHROMA_BLOCKCOPY(pp, _sse2);
+ LUMA_BLOCKCOPY(ss, _sse2);
+ LUMA_BLOCKCOPY(pp, _sse2);
+ LUMA_BLOCKCOPY(sp, _sse2);
+ CHROMA_BLOCKCOPY_SP(_sse2);
CHROMA_SS_FILTERS_420(_sse2);
CHROMA_SS_FILTERS_444(_sse2);
@@ -1110,34 +1141,6 @@
// until all partitions are coded and commit smaller patches, easier to
// review.
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_4x2] = x265_blockcopy_sp_4x2_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_4x4] = x265_blockcopy_sp_4x4_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_4x8] = x265_blockcopy_sp_4x8_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_4x16] = x265_blockcopy_sp_4x16_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_8x2] = x265_blockcopy_sp_8x2_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_8x4] = x265_blockcopy_sp_8x4_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_8x6] = x265_blockcopy_sp_8x6_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_8x8] = x265_blockcopy_sp_8x8_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_8x16] = x265_blockcopy_sp_8x16_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_12x16] = x265_blockcopy_sp_12x16_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_16x4] = x265_blockcopy_sp_16x4_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_16x8] = x265_blockcopy_sp_16x8_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_16x12] = x265_blockcopy_sp_16x12_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_16x16] = x265_blockcopy_sp_16x16_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_16x32] = x265_blockcopy_sp_16x32_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_24x32] = x265_blockcopy_sp_24x32_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_32x8] = x265_blockcopy_sp_32x8_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_32x16] = x265_blockcopy_sp_32x16_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_32x24] = x265_blockcopy_sp_32x24_sse2;
- p.chroma[X265_CSP_I420].copy_sp[CHROMA_32x32] = x265_blockcopy_sp_32x32_sse2;
-
- p.luma_copy_sp[LUMA_32x64] = x265_blockcopy_sp_32x64_sse2;
- p.luma_copy_sp[LUMA_16x64] = x265_blockcopy_sp_16x64_sse2;
- p.luma_copy_sp[LUMA_48x64] = x265_blockcopy_sp_48x64_sse2;
- p.luma_copy_sp[LUMA_64x16] = x265_blockcopy_sp_64x16_sse2;
- p.luma_copy_sp[LUMA_64x32] = x265_blockcopy_sp_64x32_sse2;
- p.luma_copy_sp[LUMA_64x48] = x265_blockcopy_sp_64x48_sse2;
- p.luma_copy_sp[LUMA_64x64] = x265_blockcopy_sp_64x64_sse2;
p.blockfill_s[BLOCK_4x4] = x265_blockfill_s_4x4_sse2;
p.blockfill_s[BLOCK_8x8] = x265_blockfill_s_8x8_sse2;
p.blockfill_s[BLOCK_16x16] = x265_blockfill_s_16x16_sse2;
@@ -1227,9 +1230,12 @@
LUMA_SP_FILTERS(_sse4);
LUMA_FILTERS(_sse4);
ASSGN_SSE_SS(sse4);
+
p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x4] = x265_blockcopy_sp_2x4_sse4;
p.chroma[X265_CSP_I420].copy_sp[CHROMA_2x8] = x265_blockcopy_sp_2x8_sse4;
p.chroma[X265_CSP_I420].copy_sp[CHROMA_6x8] = x265_blockcopy_sp_6x8_sse4;
+ CHROMA_BLOCKCOPY(ps, _sse4);
+ LUMA_BLOCKCOPY(ps, _sse4);
p.chroma[X265_CSP_I420].filter_vsp[CHROMA_2x4] = x265_interp_4tap_vert_sp_2x4_sse4;
p.chroma[X265_CSP_I420].filter_vsp[CHROMA_2x8] = x265_interp_4tap_vert_sp_2x8_sse4;
@@ -1305,6 +1311,7 @@
p.chroma[X265_CSP_I444].copy_pp[i] = p.luma_copy_pp[i];
p.chroma[X265_CSP_I444].copy_ps[i] = p.luma_copy_ps[i];
p.chroma[X265_CSP_I444].copy_sp[i] = p.luma_copy_sp[i];
+ p.chroma[X265_CSP_I444].copy_ss[i] = p.luma_copy_ss[i];
p.chroma[X265_CSP_I444].add_ps[i] = p.luma_add_ps[i];
p.chroma[X265_CSP_I444].sub_ps[i] = p.luma_sub_ps[i];
p.chroma[X265_CSP_I444].addAvg[i] = p.luma_addAvg[i];
diff -r 33b67a53b6de -r 2bf727dca27d source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Mar 06 21:27:55 2014 -0600
+++ b/source/common/x86/blockcopy8.asm Fri Mar 07 15:11:13 2014 +0530
@@ -35,22 +35,7 @@
; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_pp_2x4, 4, 7, 0, dest, deststride, src, srcstride
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
- mov r4d, [r2]
- mov r5d, [r2 + r3]
- lea r2, [r2 + r3 * 2]
- mov r6d, [r2]
- mov r3d, [r2 + r3]
-
- mov [r0], r4d
- mov [r0 + r1], r5d
- lea r0, [r0 + 2 * r1]
- mov [r0], r6d
- mov [r0 + r1], r3d
-%else
+cglobal blockcopy_pp_2x4, 4, 7, 0
mov r4w, [r2]
mov r5w, [r2 + r3]
lea r2, [r2 + r3 * 2]
@@ -62,43 +47,13 @@
lea r0, [r0 + 2 * r1]
mov [r0], r6w
mov [r0 + r1], r3w
-%endif
RET
;-----------------------------------------------------------------------------
; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_pp_2x8, 4, 7, 0, dest, deststride, src, srcstride
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
- mov r4d, [r2]
- mov r5d, [r2 + r3]
- lea r2, [r2 + r3 * 2]
- mov r6d, [r2]
-
- mov [r0], r4d
- mov [r0 + r1], r5d
- lea r0, [r0 + 2 * r1]
- mov [r0], r6d
- mov r4d, [r2 + r3]
- mov [r0 + r1], r4d
-
- lea r2, [r2 + r3 * 2]
- lea r0, [r0 + 2 * r1]
- mov r4d, [r2]
- mov r5d, [r2 + r3]
- lea r2, [r2 + r3 * 2]
- mov r6d, [r2]
- mov r3d, [r2 + r3]
-
- mov [r0], r4d
- mov [r0 + r1], r5d
- lea r0, [r0 + 2 * r1]
- mov [r0], r6d
- mov [r0 + r1], r3d
-%else
+cglobal blockcopy_pp_2x8, 4, 7, 0
mov r4w, [r2]
mov r5w, [r2 + r3]
mov r6w, [r2 + 2 * r3]
@@ -130,51 +85,25 @@
mov r4w, [r2 + r3]
mov [r0 + r1], r4w
-%endif
RET
;-----------------------------------------------------------------------------
; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-%if HIGH_BIT_DEPTH
-cglobal blockcopy_pp_4x2, 4, 4, 2, dest, deststride, src, srcstride
- add r1, r1
- add r3, r3
- movh m0, [r2]
- movh m1, [r2 + r3]
- movh [r0], m0
- movh [r0 + r1], m1
-%else
-cglobal blockcopy_pp_4x2, 4, 6, 0, dest, deststride, src, srcstride
+cglobal blockcopy_pp_4x2, 4, 6, 0
mov r4d, [r2]
mov r5d, [r2 + r3]
mov [r0], r4d
mov [r0 + r1], r5d
-%endif
RET
;-----------------------------------------------------------------------------
; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_pp_4x4, 4, 4, 4, dest, deststride, src, srcstride
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
- movh m0, [r2]
- movh m1, [r2 + r3]
- lea r2, [r2 + r3 * 2]
- movh m2, [r2]
- movh m3, [r2 + r3]
-
- movh [r0], m0
- movh [r0 + r1], m1
- lea r0, [r0 + 2 * r1]
- movh [r0], m2
- movh [r0 + r1], m3
-%else
+cglobal blockcopy_pp_4x4, 4, 4, 4
movd m0, [r2]
movd m1, [r2 + r3]
movd m2, [r2 + 2 * r3]
@@ -186,7 +115,6 @@
movd [r0 + 2 * r1], m2
lea r1, [r1 + 2 * r1]
movd [r0 + r1], m3
-%endif
RET
;-----------------------------------------------------------------------------
@@ -194,45 +122,9 @@
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W4_H8 2
INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 4, dest, deststride, src, srcstride
+cglobal blockcopy_pp_%1x%2, 4, 5, 4
mov r4d, %2/8
-
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
-.loop
- movh m0, [r2]
- movh m1, [r2 + r3]
- lea r2, [r2 + r3 * 2]
- movh m2, [r2]
- movh m3, [r2 + r3]
-
- movh [r0], m0
- movh [r0 + r1], m1
- lea r0, [r0 + 2 * r1]
- movh [r0], m2
- movh [r0 + r1], m3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- movh m0, [r2]
- movh m1, [r2 + r3]
- lea r2, [r2 + r3 * 2]
- movh m2, [r2]
- movh m3, [r2 + r3]
-
- movh [r0], m0
- movh [r0 + r1], m1
- lea r0, [r0 + 2 * r1]
- movh [r0], m2
- movh [r0 + r1], m3
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
-
- dec r4d
- jnz .loop
-%else
-.loop
+.loop:
movd m0, [r2]
movd m1, [r2 + r3]
lea r2, [r2 + 2 * r3]
@@ -264,7 +156,6 @@
dec r4d
jnz .loop
-%endif
RET
%endmacro
@@ -275,58 +166,7 @@
; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-%if HIGH_BIT_DEPTH
-cglobal blockcopy_pp_6x8, 4, 4, 8, dest, deststride, src, srcstride
- add r1, r1
- add r3, r3
- movu m0, [r2]
- movu m1, [r2 + r3]
-
- pshufd m2, m0, 2
- pshufd m3, m1, 2
- movh [r0], m0
- movd [r0 + 8], m2
- movh [r0 + r1], m1
- movd [r0 + r1 + 8], m3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- movu m0, [r2]
- movu m1, [r2 + r3]
-
- pshufd m2, m0, 2
- pshufd m3, m1, 2
- movh [r0], m0
- movd [r0 + 8], m2
- movh [r0 + r1], m1
- movd [r0 + r1 + 8], m3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- movu m0, [r2]
- movu m1, [r2 + r3]
-
- pshufd m2, m0, 2
- pshufd m3, m1, 2
- movh [r0], m0
- movd [r0 + 8], m2
- movh [r0 + r1], m1
- movd [r0 + r1 + 8], m3
-
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- movu m0, [r2]
- movu m1, [r2 + r3]
-
- pshufd m2, m0, 2
- pshufd m3, m1, 2
- movh [r0], m0
- movd [r0 + 8], m2
- movh [r0 + r1], m1
- movd [r0 + r1 + 8], m3
- RET
-%else
-cglobal blockcopy_pp_6x8, 4, 7, 8, dest, deststride, src, srcstride
+cglobal blockcopy_pp_6x8, 4, 7, 8
movd m0, [r2]
movd m1, [r2 + r3]
@@ -386,50 +226,24 @@
mov r4w, [r2 + r3 + 4]
mov [r0 + r1 + 4], r4w
RET
-%endif
;-----------------------------------------------------------------------------
; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_pp_8x2, 4, 4, 2, dest, deststride, src, srcstride
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
- movu m0, [r2]
- movu m1, [r2 + r3]
-
- movu [r0], m0
- movu [r0 + r1], m1
-%else
+cglobal blockcopy_pp_8x2, 4, 4, 2
movh m0, [r2]
movh m1, [r2 + r3]
movh [r0], m0
movh [r0 + r1], m1
-%endif
RET
;-----------------------------------------------------------------------------
; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_pp_8x4, 4, 4, 4, dest, deststride, src, srcstride
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
- movu m0, [r2]
- movu m1, [r2 + r3]
- lea r2, [r2 + r3 * 2]
- movu m2, [r2]
- movu m3, [r2 + r3]
-
- movu [r0], m0
- movu [r0 + r1], m1
- lea r0, [r0 + 2 * r1]
- movu [r0], m2
- movu [r0 + r1], m3
-%else
+cglobal blockcopy_pp_8x4, 4, 4, 4
movh m0, [r2]
movh m1, [r2 + r3]
movh m2, [r2 + 2 * r3]
@@ -441,35 +255,13 @@
movh [r0 + 2 * r1], m2
lea r1, [r1 + 2 * r1]
movh [r0 + r1], m3
-%endif
RET
;-----------------------------------------------------------------------------
; void blockcopy_pp_8x6(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_pp_8x6, 4, 7, 6, dest, deststride, src, srcstride
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
- movu m0, [r2]
- movu m1, [r2 + r3]
- lea r2, [r2 + r3 * 2]
- movu m2, [r2]
- movu m3, [r2 + r3]
- lea r2, [r2 + r3 * 2]
- movu m4, [r2]
- movu m5, [r2 + r3]
-
- movu [r0], m0
- movu [r0 + r1], m1
- lea r0, [r0 + 2 * r1]
- movu [r0], m2
- movu [r0 + r1], m3
- lea r0, [r0 + 2 * r1]
- movu [r0], m4
- movu [r0 + r1], m5
-%else
+cglobal blockcopy_pp_8x6, 4, 7, 6
movh m0, [r2]
movh m1, [r2 + r3]
movh m2, [r2 + 2 * r3]
@@ -487,7 +279,6 @@
movh [r6 + 2 * r1], m4
lea r6, [r6 + 2 * r1]
movh [r6 + r1], m5
-%endif
RET
;-----------------------------------------------------------------------------
@@ -495,47 +286,10 @@
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W8_H8 2
INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
+cglobal blockcopy_pp_%1x%2, 4, 5, 6
mov r4d, %2/8
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
-.loop
- movu m0, [r2]
- movu m1, [r2 + r3]
- lea r2, [r2 + 2 * r3]
-
- movu m2, [r2]
- movu m3, [r2 + r3]
- lea r2, [r2 + 2 * r3]
-
- movu m4, [r2]
- movu m5, [r2 + r3]
-
- movu [r0], m0
- movu [r0 + r1], m1
- lea r0, [r0 + 2 * r1]
- movu [r0], m2
- movu [r0 + r1], m3
- lea r0, [r0 + 2 * r1]
-
- movu [r0], m4
- movu [r0 + r1], m5
-
- lea r2, [r2 + 2 * r3]
- movu m4, [r2]
- movu m5, [r2 + r3]
-
- lea r0, [r0 + 2 * r1]
- movu [r0], m4
- movu [r0 + r1], m5
-
- dec r4d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- jnz .loop
-%else
-.loop
+
+.loop:
movh m0, [r2]
movh m1, [r2 + r3]
lea r2, [r2 + 2 * r3]
@@ -565,7 +319,6 @@
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
jnz .loop
-%endif
RET
%endmacro
@@ -578,41 +331,10 @@
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W12_H4 2
INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 4, dest, deststride, src, srcstride
-
+cglobal blockcopy_pp_%1x%2, 4, 5, 4
mov r4d, %2/4
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
-.loop
- movu m0, [r2]
- movh m1, [r2 + 16]
- movu m2, [r2 + r3]
- movh m3, [r2 + r3 + 16]
- lea r2, [r2 + 2 * r3]
-
- movu [r0], m0
- movh [r0 + 16], m1
- movu [r0 + r1], m2
- movh [r0 + r1 + 16], m3
-
- lea r0, [r0 + 2 * r1]
- movu m0, [r2]
- movh m1, [r2 + 16]
- movu m2, [r2 + r3]
- movh m3, [r2 + r3 + 16]
-
- movu [r0], m0
- movh [r0 + 16], m1
- movu [r0 + r1], m2
- movh [r0 + r1 + 16], m3
-
- dec r4d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- jnz .loop
-%else
-.loop
+
+.loop:
movh m0, [r2]
movd m1, [r2 + 8]
movh m2, [r2 + r3]
@@ -639,7 +361,6 @@
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
jnz .loop
-%endif
RET
%endmacro
@@ -650,40 +371,10 @@
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H4 2
INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 4, dest, deststride, src, srcstride
+cglobal blockcopy_pp_%1x%2, 4, 5, 4
mov r4d, %2/4
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
-.loop
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + r3]
- movu m3, [r2 + r3 + 16]
- lea r2, [r2 + 2 * r3]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + r1], m2
- movu [r0 + r1 + 16], m3
-
- lea r0, [r0 + 2 * r1]
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + r3]
- movu m3, [r2 + r3 + 16]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + r1], m2
- movu [r0 + r1 + 16], m3
-
- dec r4d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- jnz .loop
-%else
-.loop
+
+.loop:
movu m0, [r2]
movu m1, [r2 + r3]
lea r2, [r2 + 2 * r3]
@@ -700,7 +391,7 @@
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
jnz .loop
-%endif
+
RET
%endmacro
@@ -712,62 +403,10 @@
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W16_H8 2
INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
+cglobal blockcopy_pp_%1x%2, 4, 5, 6
mov r4d, %2/8
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
-.loop
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + r3]
- movu m3, [r2 + r3 + 16]
- lea r2, [r2 + 2 * r3]
- movu m4, [r2]
- movu m5, [r2 + 16]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + r1], m2
- movu [r0 + r1 + 16], m3
- lea r0, [r0 + 2 * r1]
- movu [r0], m4
- movu [r0 + 16], m5
-
- movu m0, [r2 + r3]
- movu m1, [r2 + r3 + 16]
- lea r2, [r2 + 2 * r3]
- movu m2, [r2]
- movu m3, [r2 + 16]
- movu m4, [r2 + r3]
- movu m5, [r2 + r3 + 16]
- lea r2, [r2 + 2 * r3]
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m1
- lea r0, [r0 + 2 * r1]
- movu [r0], m2
- movu [r0 + 16], m3
- movu [r0 + r1], m4
- movu [r0 + r1 + 16], m5
- lea r0, [r0 + 2 * r1]
-
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + r3]
- movu m3, [r2 + r3 + 16]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + r1], m2
- movu [r0 + r1 + 16], m3
-
- dec r4d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- jnz .loop
-%else
-.loop
+
+.loop:
movu m0, [r2]
movu m1, [r2 + r3]
lea r2, [r2 + 2 * r3]
@@ -797,7 +436,6 @@
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
jnz .loop
-%endif
RET
%endmacro
@@ -811,12 +449,93 @@
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W24_H4 2
INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
+cglobal blockcopy_pp_%1x%2, 4, 5, 6
mov r4d, %2/4
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
-.loop
+
+.loop:
+ movu m0, [r2]
+ movh m1, [r2 + 16]
+ movu m2, [r2 + r3]
+ movh m3, [r2 + r3 + 16]
+ lea r2, [r2 + 2 * r3]
+ movu m4, [r2]
+ movh m5, [r2 + 16]
+
+ movu [r0], m0
+ movh [r0 + 16], m1
+ movu [r0 + r1], m2
+ movh [r0 + r1 + 16], m3
+ lea r0, [r0 + 2 * r1]
+ movu [r0], m4
+ movh [r0 + 16], m5
+
+ movu m0, [r2 + r3]
+ movh m1, [r2 + r3 + 16]
+ movu [r0 + r1], m0
+ movh [r0 + r1 + 16], m1
+
+ dec r4d
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ jnz .loop
+ RET
+%endmacro
+
+BLOCKCOPY_PP_W24_H4 24, 32
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W32_H4 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 5, 4
+ mov r4d, %2/4
+
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + r3]
+ movu m3, [r2 + r3 + 16]
+ lea r2, [r2 + 2 * r3]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 16], m3
+ lea r0, [r0 + 2 * r1]
+
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + r3]
+ movu m3, [r2 + r3 + 16]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 16], m3
+
+ dec r4d
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ jnz .loop
+ RET
+%endmacro
+
+BLOCKCOPY_PP_W32_H4 32, 8
+BLOCKCOPY_PP_W32_H4 32, 16
+BLOCKCOPY_PP_W32_H4 32, 24
+BLOCKCOPY_PP_W32_H4 32, 32
+BLOCKCOPY_PP_W32_H4 32, 64
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W48_H2 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 5, 6
+ mov r4d, %2/4
+
+.loop:
movu m0, [r2]
movu m1, [r2 + 16]
movu m2, [r2 + 32]
@@ -831,8 +550,8 @@
movu [r0 + r1], m3
movu [r0 + r1 + 16], m4
movu [r0 + r1 + 32], m5
-
lea r0, [r0 + 2 * r1]
+
movu m0, [r2]
movu m1, [r2 + 16]
movu m2, [r2 + 32]
@@ -847,251 +566,13 @@
movu [r0 + r1 + 16], m4
movu [r0 + r1 + 32], m5
- dec r4d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- jnz .loop
-%else
-.loop
- movu m0, [r2]
- movh m1, [r2 + 16]
- movu m2, [r2 + r3]
- movh m3, [r2 + r3 + 16]
- lea r2, [r2 + 2 * r3]
- movu m4, [r2]
- movh m5, [r2 + 16]
-
- movu [r0], m0
- movh [r0 + 16], m1
- movu [r0 + r1], m2
- movh [r0 + r1 + 16], m3
- lea r0, [r0 + 2 * r1]
- movu [r0], m4
- movh [r0 + 16], m5
-
- movu m0, [r2 + r3]
- movh m1, [r2 + r3 + 16]
- movu [r0 + r1], m0
- movh [r0 + r1 + 16], m1
-
dec r4d
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
jnz .loop
-%endif
RET
%endmacro
-BLOCKCOPY_PP_W24_H4 24, 32
-
-;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
-;-----------------------------------------------------------------------------
-%macro BLOCKCOPY_PP_W32_H4 2
-INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 4, dest, deststride, src, srcstride
- mov r4d, %2/4
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
-.loop
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + 32]
- movu m3, [r2 + 48]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
- movu [r0 + 48], m3
-
- movu m0, [r2 + r3]
- movu m1, [r2 + r3 + 16]
- movu m2, [r2 + r3 + 32]
- movu m3, [r2 + r3 + 48]
- lea r2, [r2 + 2 * r3]
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 + 32], m2
- movu [r0 + r1 + 48], m3
-
- lea r0, [r0 + 2 * r1]
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + 32]
- movu m3, [r2 + 48]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
- movu [r0 + 48], m3
-
- movu m0, [r2 + r3]
- movu m1, [r2 + r3 + 16]
- movu m2, [r2 + r3 + 32]
- movu m3, [r2 + r3 + 48]
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 + 32], m2
- movu [r0 + r1 + 48], m3
-
- dec r4d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- jnz .loop
-%else
-.loop
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + r3]
- movu m3, [r2 + r3 + 16]
- lea r2, [r2 + 2 * r3]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + r1], m2
- movu [r0 + r1 + 16], m3
- lea r0, [r0 + 2 * r1]
-
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + r3]
- movu m3, [r2 + r3 + 16]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + r1], m2
- movu [r0 + r1 + 16], m3
-
- dec r4d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- jnz .loop
-%endif
- RET
-%endmacro
-
-BLOCKCOPY_PP_W32_H4 32, 8
-BLOCKCOPY_PP_W32_H4 32, 16
-BLOCKCOPY_PP_W32_H4 32, 24
-BLOCKCOPY_PP_W32_H4 32, 32
-BLOCKCOPY_PP_W32_H4 32, 64
-
-;-----------------------------------------------------------------------------
-; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
-;-----------------------------------------------------------------------------
-%macro BLOCKCOPY_PP_W48_H2 2
-INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
- mov r4d, %2/4
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
-.loop
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + 32]
- movu m3, [r2 + 48]
- movu m4, [r2 + 64]
- movu m5, [r2 + 80]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
- movu [r0 + 48], m3
- movu [r0 + 64], m4
- movu [r0 + 80], m5
-
- movu m0, [r2 + r3]
- movu m1, [r2 + r3 + 16]
- movu m2, [r2 + r3 + 32]
- movu m3, [r2 + r3 + 48]
- movu m4, [r2 + r3 + 64]
- movu m5, [r2 + r3 + 80]
- lea r2, [r2 + 2 * r3]
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 + 32], m2
- movu [r0 + r1 + 48], m3
- movu [r0 + r1 + 64], m4
- movu [r0 + r1 + 80], m5
- lea r0, [r0 + 2 * r1]
-
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + 32]
- movu m3, [r2 + 48]
- movu m4, [r2 + 64]
- movu m5, [r2 + 80]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
- movu [r0 + 48], m3
- movu [r0 + 64], m4
- movu [r0 + 80], m5
-
- movu m0, [r2 + r3]
- movu m1, [r2 + r3 + 16]
- movu m2, [r2 + r3 + 32]
- movu m3, [r2 + r3 + 48]
- movu m4, [r2 + r3 + 64]
- movu m5, [r2 + r3 + 80]
-
- movu [r0 + r1], m0
- movu [r0 + r1 + 16], m1
- movu [r0 + r1 + 32], m2
- movu [r0 + r1 + 48], m3
- movu [r0 + r1 + 64], m4
- movu [r0 + r1 + 80], m5
-
- dec r4d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- jnz .loop
-%else
-.loop
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + 32]
- movu m3, [r2 + r3]
- movu m4, [r2 + r3 + 16]
- movu m5, [r2 + r3 + 32]
- lea r2, [r2 + 2 * r3]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
- movu [r0 + r1], m3
- movu [r0 + r1 + 16], m4
- movu [r0 + r1 + 32], m5
- lea r0, [r0 + 2 * r1]
-
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + 32]
- movu m3, [r2 + r3]
- movu m4, [r2 + r3 + 16]
- movu m5, [r2 + r3 + 32]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
- movu [r0 + r1], m3
- movu [r0 + r1 + 16], m4
- movu [r0 + r1 + 32], m5
-
- dec r4d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- jnz .loop
-%endif
-RET
-%endmacro
-
BLOCKCOPY_PP_W48_H2 48, 64
;-----------------------------------------------------------------------------
@@ -1099,96 +580,10 @@
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W64_H4 2
INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
+cglobal blockcopy_pp_%1x%2, 4, 5, 6
mov r4d, %2/4
-%if HIGH_BIT_DEPTH
- add r1, r1
- add r3, r3
-.loop
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + 32]
- movu m3, [r2 + 48]
- movu m4, [r2 + 64]
- movu m5, [r2 + 80]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
- movu [r0 + 48], m3
- movu [r0 + 64], m4
- movu [r0 + 80], m5
-
- movu m0, [r2 + 96]
- movu m1, [r2 + 112]
- movu m2, [r2 + r3]
- movu m3, [r2 + r3 + 16]
- movu m4, [r2 + r3 + 32]
- movu m5, [r2 + r3 + 48]
-
- movu [r0 + 96], m0
- movu [r0 + 112], m1
- movu [r0 + r1], m2
- movu [r0 + r1 + 16], m3
- movu [r0 + r1 + 32], m4
- movu [r0 + r1 + 48], m5
-
- movu m0, [r2 + r3 + 64]
- movu m1, [r2 + r3 + 80]
- movu m2, [r2 + r3 + 96]
- movu m3, [r2 + r3 + 112]
- lea r2, [r2 + 2 * r3]
-
- movu [r0 + r1 + 64], m0
- movu [r0 + r1 + 80], m1
- movu [r0 + r1 + 96], m2
- movu [r0 + r1 + 112], m3
-
- lea r0, [r0 + 2 * r1]
- movu m0, [r2]
- movu m1, [r2 + 16]
- movu m2, [r2 + 32]
- movu m3, [r2 + 48]
- movu m4, [r2 + 64]
- movu m5, [r2 + 80]
-
- movu [r0], m0
- movu [r0 + 16], m1
- movu [r0 + 32], m2
- movu [r0 + 48], m3
- movu [r0 + 64], m4
- movu [r0 + 80], m5
-
- movu m0, [r2 + 96]
- movu m1, [r2 + 112]
- movu m2, [r2 + r3]
- movu m3, [r2 + r3 + 16]
- movu m4, [r2 + r3 + 32]
- movu m5, [r2 + r3 + 48]
-
- movu [r0 + 96], m0
- movu [r0 + 112], m1
- movu [r0 + r1], m2
- movu [r0 + r1 + 16], m3
- movu [r0 + r1 + 32], m4
- movu [r0 + r1 + 48], m5
-
- movu m0, [r2 + r3 + 64]
- movu m1, [r2 + r3 + 80]
- movu m2, [r2 + r3 + 96]
- movu m3, [r2 + r3 + 112]
-
- movu [r0 + r1 + 64], m0
- movu [r0 + r1 + 80], m1
- movu [r0 + r1 + 96], m2
- movu [r0 + r1 + 112], m3
-
- dec r4d
- lea r0, [r0 + 2 * r1]
- lea r2, [r2 + 2 * r3]
- jnz .loop
-%else
-.loop
+
+.loop:
movu m0, [r2]
movu m1, [r2 + 16]
movu m2, [r2 + 32]
@@ -1233,7 +628,6 @@
lea r0, [r0 + 2 * r1]
lea r2, [r2 + 2 * r3]
jnz .loop
-%endif
RET
%endmacro
@@ -1419,7 +813,7 @@
add r3, r3
-.loop
+.loop:
movh m0, [r2]
movh m1, [r2 + r3]
movh m2, [r2 + 2 * r3]
@@ -1652,7 +1046,7 @@
add r3, r3
-.loop
+.loop:
movu m0, [r2]
movu m1, [r2 + r3]
movu m2, [r2 + 2 * r3]
@@ -1692,7 +1086,7 @@
%endmacro
BLOCKCOPY_SP_W8_H8 8, 16
-
+BLOCKCOPY_SP_W8_H8 8, 32
;-----------------------------------------------------------------------------
; void blockcopy_sp_%1x%2(pixel *dest, intptr_t destStride, int16_t *src, intptr_t srcStride)
@@ -1705,7 +1099,7 @@
add r3, r3
-.loop
+.loop:
movu m0, [r2]
movu m1, [r2 + 16]
movu m2, [r2 + r3]
@@ -1760,7 +1154,7 @@
add r3, r3
-.loop
+.loop:
movu m0, [r2]
movu m1, [r2 + 16]
movu m2, [r2 + r3]
@@ -1809,7 +1203,7 @@
add r3, r3
-.loop
+.loop:
movu m0, [r2]
movu m1, [r2 + 16]
movu m2, [r2 + 32]
@@ -1848,7 +1242,7 @@
add r3, r3
-.loop
+.loop:
movu m0, [r2]
movu m1, [r2 + 16]
movu m2, [r2 + 32]
@@ -1894,7 +1288,7 @@
add r3, r3
-.loop
+.loop:
movu m0, [r2]
movu m1, [r2 + 16]
movu m2, [r2 + 32]
@@ -1932,7 +1326,7 @@
add r3, r3
-.loop
+.loop:
movu m0, [r2]
movu m1, [r2 + 16]
movu m2, [r2 + 32]
@@ -2029,7 +1423,7 @@
pshuflw m0, m0, 0
pshufd m0, m0, 0
-.loop
+.loop:
movu [r0], m0
movu [r0 + 16], m0
@@ -2082,7 +1476,7 @@
pshuflw m0, m0, 0
pshufd m0, m0, 0
-.loop
+.loop:
movu [r0], m0
movu [r0 + 16], m0
movu [r0 + 32], m0
@@ -2257,7 +1651,7 @@
add r1, r1
mov r4d, %2/4
-.loop
+.loop:
movd m0, [r2]
pmovzxbw m0, m0
movh [r0], m0
@@ -2299,7 +1693,7 @@
add r1, r1
mov r4d, %2/4
-.loop
+.loop:
movh m0, [r2]
pmovzxbw m0, m0
movh [r0], m0
@@ -2431,7 +1825,7 @@
add r1, r1
mov r4d, %2/4
-.loop
+.loop:
movh m0, [r2]
pmovzxbw m0, m0
movu [r0], m0
@@ -2476,7 +1870,7 @@
mov r4d, %2/2
pxor m0, m0
-.loop
+.loop:
movu m1, [r2]
pmovzxbw m2, m1
movu [r0], m2
@@ -2549,7 +1943,7 @@
mov r4d, %2/4
pxor m0, m0
-.loop
+.loop:
movu m1, [r2]
pmovzxbw m2, m1
movu [r0], m2
@@ -2603,7 +1997,7 @@
mov r4d, %2/2
pxor m0, m0
-.loop
+.loop:
movu m1, [r2]
pmovzxbw m2, m1
movu [r0], m2
@@ -2646,7 +2040,7 @@
mov r4d, %2/2
pxor m0, m0
-.loop
+.loop:
movu m1, [r2]
pmovzxbw m2, m1
movu [r0], m2
@@ -2697,7 +2091,7 @@
mov r4d, %2/2
pxor m0, m0
-.loop
+.loop:
movu m1, [r2]
pmovzxbw m2, m1
movu [r0], m2
@@ -2756,7 +2150,7 @@
mov r4d, %2/2
pxor m0, m0
-.loop
+.loop:
movu m1, [r2]
pmovzxbw m2, m1
movu [r0], m2
@@ -2820,6 +2214,777 @@
BLOCKCOPY_PS_W64_H2 64, 64
;-----------------------------------------------------------------------------
+; void blockcopy_ss_2x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_2x4, 4, 6, 0
+ add r1, r1
+ add r3, r3
+
+ mov r4d, [r2]
+ mov r5d, [r2 + r3]
+ mov [r0], r4d
+ mov [r0 + r1], r5d
+
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + 2 * r1]
+
+ mov r4d, [r2]
+ mov r5d, [r2 + r3]
+ mov [r0], r4d
+ mov [r0 + r1], r5d
+
+ RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_2x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_2x8, 4, 6, 0
+ add r1, r1
+ add r3, r3
+
+ mov r4d, [r2]
+ mov r5d, [r2 + r3]
+ mov [r0], r4d
+ mov [r0 + r1], r5d
+
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + 2 * r1]
+
+ mov r4d, [r2]
+ mov r5d, [r2 + r3]
+ mov [r0], r4d
+ mov [r0 + r1], r5d
+
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + 2 * r1]
+
+ mov r4d, [r2]
+ mov r5d, [r2 + r3]
+ mov [r0], r4d
+ mov [r0 + r1], r5d
+
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + 2 * r1]
+
+ mov r4d, [r2]
+ mov r5d, [r2 + r3]
+ mov [r0], r4d
+ mov [r0 + r1], r5d
+
+ RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_4x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_4x2, 4, 4, 2
+ add r1, r1
+ add r3, r3
+
+ movh m0, [r2]
+ movh m1, [r2 + r3]
+
+ movh [r0], m0
+ movh [r0 + r1], m1
+
+ RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_4x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_4x4, 4, 4, 4
+ add r1, r1
+ add r3, r3
+ movh m0, [r2]
+ movh m1, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ movh m2, [r2]
+ movh m3, [r2 + r3]
+
+ movh [r0], m0
+ movh [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+ movh [r0], m2
+ movh [r0 + r1], m3
+ RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W4_H8 2
+INIT_XMM sse2
+cglobal blockcopy_ss_%1x%2, 4, 5, 4
+ mov r4d, %2/8
+ add r1, r1
+ add r3, r3
+.loop:
+ movh m0, [r2]
+ movh m1, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ movh m2, [r2]
+ movh m3, [r2 + r3]
+
+ movh [r0], m0
+ movh [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+ movh [r0], m2
+ movh [r0 + r1], m3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ movh m0, [r2]
+ movh m1, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ movh m2, [r2]
+ movh m3, [r2 + r3]
+
+ movh [r0], m0
+ movh [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+ movh [r0], m2
+ movh [r0 + r1], m3
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loop
+ RET
+%endmacro
+
+BLOCKCOPY_SS_W4_H8 4, 8
+BLOCKCOPY_SS_W4_H8 4, 16
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_6x8(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_6x8, 4, 4, 4
+ add r1, r1
+ add r3, r3
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ pshufd m2, m0, 2
+ pshufd m3, m1, 2
+ movh [r0], m0
+ movd [r0 + 8], m2
+ movh [r0 + r1], m1
+ movd [r0 + r1 + 8], m3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ pshufd m2, m0, 2
+ pshufd m3, m1, 2
+ movh [r0], m0
+ movd [r0 + 8], m2
+ movh [r0 + r1], m1
+ movd [r0 + r1 + 8], m3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ pshufd m2, m0, 2
+ pshufd m3, m1, 2
+ movh [r0], m0
+ movd [r0 + 8], m2
+ movh [r0 + r1], m1
+ movd [r0 + r1 + 8], m3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ pshufd m2, m0, 2
+ pshufd m3, m1, 2
+ movh [r0], m0
+ movd [r0 + 8], m2
+ movh [r0 + r1], m1
+ movd [r0 + r1 + 8], m3
+
+ RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_8x2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_8x2, 4, 4, 2
+ add r1, r1
+ add r3, r3
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+
+ movu [r0], m0
+ movu [r0 + r1], m1
+
+ RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_8x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_8x4, 4, 4, 4
+ add r1, r1
+ add r3, r3
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ movu m2, [r2]
+ movu m3, [r2 + r3]
+
+ movu [r0], m0
+ movu [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+ movu [r0], m2
+ movu [r0 + r1], m3
+ RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_8x6(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_ss_8x6, 4, 4, 4
+
+ add r1, r1
+ add r3, r3
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ movu m2, [r2]
+ movu m3, [r2 + r3]
+
+ movu [r0], m0
+ movu [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+ movu [r0], m2
+ movu [r0 + r1], m3
+
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + 2 * r1]
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ movu [r0], m0
+ movu [r0 + r1], m1
+ RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W8_H8 2
+INIT_XMM sse2
+cglobal blockcopy_ss_%1x%2, 4, 5, 4
+ mov r4d, %2/8
+ add r1, r1
+ add r3, r3
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ movu m2, [r2]
+ movu m3, [r2 + r3]
+
+ movu [r0], m0
+ movu [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+ movu [r0], m2
+ movu [r0 + r1], m3
+
+
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ movu m2, [r2]
+ movu m3, [r2 + r3]
+
+ movu [r0], m0
+ movu [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+ movu [r0], m2
+ movu [r0 + r1], m3
+
+ dec r4d
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ jnz .loop
+RET
+%endmacro
+
+BLOCKCOPY_SS_W8_H8 8, 8
+BLOCKCOPY_SS_W8_H8 8, 16
+BLOCKCOPY_SS_W8_H8 8, 32
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W12_H4 2
+INIT_XMM sse2
+cglobal blockcopy_ss_%1x%2, 4, 5, 4
+
+ mov r4d, %2/4
+ add r1, r1
+ add r3, r3
+.loop:
+ movu m0, [r2]
+ movh m1, [r2 + 16]
+ movu m2, [r2 + r3]
+ movh m3, [r2 + r3 + 16]
+ lea r2, [r2 + 2 * r3]
+
+ movu [r0], m0
+ movh [r0 + 16], m1
+ movu [r0 + r1], m2
+ movh [r0 + r1 + 16], m3
+
+ lea r0, [r0 + 2 * r1]
+ movu m0, [r2]
+ movh m1, [r2 + 16]
+ movu m2, [r2 + r3]
+ movh m3, [r2 + r3 + 16]
+
+ movu [r0], m0
+ movh [r0 + 16], m1
+ movu [r0 + r1], m2
+ movh [r0 + r1 + 16], m3
+
+ dec r4d
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ jnz .loop
+ RET
+%endmacro
+
+BLOCKCOPY_SS_W12_H4 12, 16
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_16x4(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W16_H4 2
+INIT_XMM sse2
+cglobal blockcopy_ss_%1x%2, 4, 5, 4
+ mov r4d, %2/4
+ add r1, r1
+ add r3, r3
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + r3]
+ movu m3, [r2 + r3 + 16]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 16], m3
+
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + r3]
+ movu m3, [r2 + r3 + 16]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 16], m3
+
+ dec r4d
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ jnz .loop
+ RET
+%endmacro
+
+BLOCKCOPY_SS_W16_H4 16, 4
+BLOCKCOPY_SS_W16_H4 16, 12
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W16_H8 2
+INIT_XMM sse2
+cglobal blockcopy_ss_%1x%2, 4, 5, 4
+ mov r4d, %2/8
+ add r1, r1
+ add r3, r3
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + r3]
+ movu m3, [r2 + r3 + 16]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 16], m3
+
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + r3]
+ movu m3, [r2 + r3 + 16]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 16], m3
+
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + r3]
+ movu m3, [r2 + r3 + 16]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 16], m3
+
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + r3]
+ movu m3, [r2 + r3 + 16]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 16], m3
+
+ dec r4d
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+ jnz .loop
+ RET
+%endmacro
+
+BLOCKCOPY_SS_W16_H8 16, 8
+BLOCKCOPY_SS_W16_H8 16, 16
+BLOCKCOPY_SS_W16_H8 16, 32
+BLOCKCOPY_SS_W16_H8 16, 64
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W24_H4 2
+INIT_XMM sse2
+cglobal blockcopy_ss_%1x%2, 4, 5, 6
+ mov r4d, %2/4
+ add r1, r1
+ add r3, r3
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + 32]
+ movu m3, [r2 + r3]
+ movu m4, [r2 + r3 + 16]
+ movu m5, [r2 + r3 + 32]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + 32], m2
+ movu [r0 + r1], m3
+ movu [r0 + r1 + 16], m4
+ movu [r0 + r1 + 32], m5
+
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + 32]
+ movu m3, [r2 + r3]
+ movu m4, [r2 + r3 + 16]
+ movu m5, [r2 + r3 + 32]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + 32], m2
+ movu [r0 + r1], m3
+ movu [r0 + r1 + 16], m4
+ movu [r0 + r1 + 32], m5
+
+ dec r4d
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+ jnz .loop
+ RET
+%endmacro
+
+BLOCKCOPY_SS_W24_H4 24, 32
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W32_H4 2
+INIT_XMM sse2
+cglobal blockcopy_ss_%1x%2, 4, 5, 4
+ mov r4d, %2/4
+ add r1, r1
+ add r3, r3
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + 32]
+ movu m3, [r2 + 48]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + 32], m2
+ movu [r0 + 48], m3
+
+ movu m0, [r2 + r3]
+ movu m1, [r2 + r3 + 16]
+ movu m2, [r2 + r3 + 32]
+ movu m3, [r2 + r3 + 48]
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m1
+ movu [r0 + r1 + 32], m2
+ movu [r0 + r1 + 48], m3
+
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + 32]
+ movu m3, [r2 + 48]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + 32], m2
+ movu [r0 + 48], m3
+
+ movu m0, [r2 + r3]
+ movu m1, [r2 + r3 + 16]
+ movu m2, [r2 + r3 + 32]
+ movu m3, [r2 + r3 + 48]
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m1
+ movu [r0 + r1 + 32], m2
+ movu [r0 + r1 + 48], m3
+
+ dec r4d
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+ jnz .loop
+ RET
+%endmacro
+
+BLOCKCOPY_SS_W32_H4 32, 8
+BLOCKCOPY_SS_W32_H4 32, 16
+BLOCKCOPY_SS_W32_H4 32, 24
+BLOCKCOPY_SS_W32_H4 32, 32
+BLOCKCOPY_SS_W32_H4 32, 64
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W48_H2 2
+INIT_XMM sse2
+cglobal blockcopy_ss_%1x%2, 4, 5, 6
+ mov r4d, %2/4
+ add r1, r1
+ add r3, r3
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + 32]
+ movu m3, [r2 + 48]
+ movu m4, [r2 + 64]
+ movu m5, [r2 + 80]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + 32], m2
+ movu [r0 + 48], m3
+ movu [r0 + 64], m4
+ movu [r0 + 80], m5
+
+ movu m0, [r2 + r3]
+ movu m1, [r2 + r3 + 16]
+ movu m2, [r2 + r3 + 32]
+ movu m3, [r2 + r3 + 48]
+ movu m4, [r2 + r3 + 64]
+ movu m5, [r2 + r3 + 80]
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m1
+ movu [r0 + r1 + 32], m2
+ movu [r0 + r1 + 48], m3
+ movu [r0 + r1 + 64], m4
+ movu [r0 + r1 + 80], m5
+
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + 32]
+ movu m3, [r2 + 48]
+ movu m4, [r2 + 64]
+ movu m5, [r2 + 80]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + 32], m2
+ movu [r0 + 48], m3
+ movu [r0 + 64], m4
+ movu [r0 + 80], m5
+
+ movu m0, [r2 + r3]
+ movu m1, [r2 + r3 + 16]
+ movu m2, [r2 + r3 + 32]
+ movu m3, [r2 + r3 + 48]
+ movu m4, [r2 + r3 + 64]
+ movu m5, [r2 + r3 + 80]
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m1
+ movu [r0 + r1 + 32], m2
+ movu [r0 + r1 + 48], m3
+ movu [r0 + r1 + 64], m4
+ movu [r0 + r1 + 80], m5
+
+ dec r4d
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+ jnz .loop
+RET
+%endmacro
+
+BLOCKCOPY_SS_W48_H2 48, 64
+
+;-----------------------------------------------------------------------------
+; void blockcopy_ss_%1x%2(int16_t *dest, intptr_t deststride, int16_t *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_SS_W64_H4 2
+INIT_XMM sse2
+cglobal blockcopy_ss_%1x%2, 4, 5, 6, dest, deststride, src, srcstride
+ mov r4d, %2/4
+ add r1, r1
+ add r3, r3
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + 32]
+ movu m3, [r2 + 48]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + 32], m2
+ movu [r0 + 48], m3
+
+ movu m0, [r2 + 64]
+ movu m1, [r2 + 80]
+ movu m2, [r2 + 96]
+ movu m3, [r2 + 112]
+
+ movu [r0 + 64], m0
+ movu [r0 + 80], m1
+ movu [r0 + 96], m2
+ movu [r0 + 112], m3
+
+ movu m0, [r2 + r3]
+ movu m1, [r2 + r3 + 16]
+ movu m2, [r2 + r3 + 32]
+ movu m3, [r2 + r3 + 48]
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m1
+ movu [r0 + r1 + 32], m2
+ movu [r0 + r1 + 48], m3
+
+ movu m0, [r2 + r3 + 64]
+ movu m1, [r2 + r3 + 80]
+ movu m2, [r2 + r3 + 96]
+ movu m3, [r2 + r3 + 112]
+
+ movu [r0 + r1 + 64], m0
+ movu [r0 + r1 + 80], m1
+ movu [r0 + r1 + 96], m2
+ movu [r0 + r1 + 112], m3
+
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + 32]
+ movu m3, [r2 + 48]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + 32], m2
+ movu [r0 + 48], m3
+
+ movu m0, [r2 + 64]
+ movu m1, [r2 + 80]
+ movu m2, [r2 + 96]
+ movu m3, [r2 + 112]
+
+ movu [r0 + 64], m0
+ movu [r0 + 80], m1
+ movu [r0 + 96], m2
+ movu [r0 + 112], m3
+
+ movu m0, [r2 + r3]
+ movu m1, [r2 + r3 + 16]
+ movu m2, [r2 + r3 + 32]
+ movu m3, [r2 + r3 + 48]
+
+ movu [r0 + r1], m0
+ movu [r0 + r1 + 16], m1
+ movu [r0 + r1 + 32], m2
+ movu [r0 + r1 + 48], m3
+
+ movu m0, [r2 + r3 + 64]
+ movu m1, [r2 + r3 + 80]
+ movu m2, [r2 + r3 + 96]
+ movu m3, [r2 + r3 + 112]
+
+ movu [r0 + r1 + 64], m0
+ movu [r0 + r1 + 80], m1
+ movu [r0 + r1 + 96], m2
+ movu [r0 + r1 + 112], m3
+
+ dec r4d
+ lea r2, [r2 + 2 * r3]
+ lea r0, [r0 + 2 * r1]
+ jnz .loop
+
+ RET
+%endmacro
+
+BLOCKCOPY_SS_W64_H4 64, 16
+BLOCKCOPY_SS_W64_H4 64, 32
+BLOCKCOPY_SS_W64_H4 64, 48
+BLOCKCOPY_SS_W64_H4 64, 64
+
+
+;-----------------------------------------------------------------------------
; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
;-----------------------------------------------------------------------------
INIT_XMM sse2
@@ -2915,10 +3080,10 @@
add r2d, r2d
mov r5d, r4d
shr r4d, 2
-.loop_row
+.loop_row:
mov r6d, r4d
-.loop_col
+.loop_col:
pmovsxwd m0, [r1]
pslld m0, shift
movu [r0], m0
diff -r 33b67a53b6de -r 2bf727dca27d source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Thu Mar 06 21:27:55 2014 -0600
+++ b/source/common/x86/blockcopy8.h Fri Mar 07 15:11:13 2014 +0530
@@ -27,131 +27,112 @@
void x265_cvt32to16_shr_sse2(int16_t * dst, int *src, intptr_t, int, int);
void x265_cvt16to32_shl_sse4(int32_t * dst, int16_t * src, intptr_t, int32_t, int32_t);
-#define SETUP_CHROMA_BLOCKCOPY_FUNC(W, H, cpu) \
+#define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
+ void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb);\
+ void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb);
+
+#define SETUP_BLOCKCOPY_PS(W, H, cpu) \
+ void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t * dst, intptr_t dstStride, pixel * src, intptr_t srcStride);
+
+#define SETUP_BLOCKCOPY_SP(W, H, cpu) \
void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb);
-#define CHROMA_BLOCKCOPY_DEF(cpu) \
- SETUP_CHROMA_BLOCKCOPY_FUNC(4, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(4, 2, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(2, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(8, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(8, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(4, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(8, 6, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(6, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(8, 2, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(2, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(16, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(16, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(8, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(16, 12, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(12, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(16, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(4, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(32, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(32, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(16, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(32, 24, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(24, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(32, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC(8, 32, cpu);
+#define SETUP_BLOCKCOPY_SS_PP(W, H, cpu) \
+ void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
+ void x265_blockcopy_ss_ ## W ## x ## H ## cpu(int16_t * a, intptr_t stridea, int16_t * b, intptr_t strideb);
-#define SETUP_LUMA_BLOCKCOPY_FUNC(W, H, cpu) \
- void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
- void x265_blockcopy_sp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, int16_t * b, intptr_t strideb);
+#define BLOCKCOPY_COMMON(cpu) \
+ SETUP_BLOCKCOPY_FUNC(4, 4, cpu); \
+ SETUP_BLOCKCOPY_FUNC(4, 2, cpu); \
+ SETUP_BLOCKCOPY_FUNC(8, 8, cpu); \
+ SETUP_BLOCKCOPY_FUNC(8, 4, cpu); \
+ SETUP_BLOCKCOPY_FUNC(4, 8, cpu); \
+ SETUP_BLOCKCOPY_FUNC(8, 6, cpu); \
+ SETUP_BLOCKCOPY_FUNC(8, 2, cpu); \
+ SETUP_BLOCKCOPY_FUNC(16, 16, cpu); \
+ SETUP_BLOCKCOPY_FUNC(16, 8, cpu); \
+ SETUP_BLOCKCOPY_FUNC(8, 16, cpu); \
+ SETUP_BLOCKCOPY_FUNC(16, 12, cpu); \
+ SETUP_BLOCKCOPY_FUNC(12, 16, cpu); \
+ SETUP_BLOCKCOPY_FUNC(16, 4, cpu); \
+ SETUP_BLOCKCOPY_FUNC(4, 16, cpu); \
+ SETUP_BLOCKCOPY_FUNC(32, 32, cpu); \
+ SETUP_BLOCKCOPY_FUNC(32, 16, cpu); \
+ SETUP_BLOCKCOPY_FUNC(16, 32, cpu); \
+ SETUP_BLOCKCOPY_FUNC(32, 24, cpu); \
+ SETUP_BLOCKCOPY_FUNC(24, 32, cpu); \
+ SETUP_BLOCKCOPY_FUNC(32, 8, cpu); \
+ SETUP_BLOCKCOPY_FUNC(8, 32, cpu); \
+ SETUP_BLOCKCOPY_FUNC(64, 64, cpu); \
+ SETUP_BLOCKCOPY_FUNC(64, 32, cpu); \
+ SETUP_BLOCKCOPY_FUNC(32, 64, cpu); \
+ SETUP_BLOCKCOPY_FUNC(64, 48, cpu); \
+ SETUP_BLOCKCOPY_FUNC(48, 64, cpu); \
+ SETUP_BLOCKCOPY_FUNC(64, 16, cpu); \
+ SETUP_BLOCKCOPY_FUNC(16, 64, cpu);
-#define LUMA_BLOCKCOPY_DEF(cpu) \
- SETUP_LUMA_BLOCKCOPY_FUNC(4, 4, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(8, 8, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(8, 4, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(4, 8, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(16, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(16, 8, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(8, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(16, 12, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(12, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(16, 4, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(4, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(32, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(32, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(16, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(32, 24, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(24, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(32, 8, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(8, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(64, 64, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(64, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(32, 64, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(64, 48, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(48, 64, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(64, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC(16, 64, cpu);
+#define BLOCKCOPY_SP(cpu) \
+ SETUP_BLOCKCOPY_SP(2, 4, cpu); \
+ SETUP_BLOCKCOPY_SP(2, 8, cpu); \
+ SETUP_BLOCKCOPY_SP(6, 8, cpu);
-CHROMA_BLOCKCOPY_DEF(_sse2);
-LUMA_BLOCKCOPY_DEF(_sse2);
+#define BLOCKCOPY_SS_PP(cpu) \
+ SETUP_BLOCKCOPY_SS_PP(2, 4, cpu); \
+ SETUP_BLOCKCOPY_SS_PP(2, 8, cpu); \
+ SETUP_BLOCKCOPY_SS_PP(6, 8, cpu);
-#define SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(W, H, cpu) \
- void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t * dst, intptr_t dstStride, pixel * src, intptr_t srcStride);
+#define BLOCKCOPY_PS(cpu) \
+ SETUP_BLOCKCOPY_PS(2, 4, cpu); \
+ SETUP_BLOCKCOPY_PS(2, 8, cpu); \
+ SETUP_BLOCKCOPY_PS(4, 2, cpu); \
+ SETUP_BLOCKCOPY_PS(4, 4, cpu); \
+ SETUP_BLOCKCOPY_PS(4, 8, cpu); \
+ SETUP_BLOCKCOPY_PS(4, 16, cpu); \
+ SETUP_BLOCKCOPY_PS(6, 8, cpu); \
+ SETUP_BLOCKCOPY_PS(8, 2, cpu); \
+ SETUP_BLOCKCOPY_PS(8, 4, cpu); \
+ SETUP_BLOCKCOPY_PS(8, 6, cpu); \
+ SETUP_BLOCKCOPY_PS(8, 8, cpu); \
+ SETUP_BLOCKCOPY_PS(8, 16, cpu); \
+ SETUP_BLOCKCOPY_PS(8, 32, cpu); \
+ SETUP_BLOCKCOPY_PS(12, 16, cpu); \
+ SETUP_BLOCKCOPY_PS(16, 4, cpu); \
+ SETUP_BLOCKCOPY_PS(16, 8, cpu); \
+ SETUP_BLOCKCOPY_PS(16, 12, cpu); \
+ SETUP_BLOCKCOPY_PS(16, 16, cpu); \
+ SETUP_BLOCKCOPY_PS(16, 32, cpu); \
+ SETUP_BLOCKCOPY_PS(24, 32, cpu); \
+ SETUP_BLOCKCOPY_PS(32, 8, cpu); \
+ SETUP_BLOCKCOPY_PS(32, 16, cpu); \
+ SETUP_BLOCKCOPY_PS(32, 24, cpu); \
+ SETUP_BLOCKCOPY_PS(32, 32, cpu); \
+ SETUP_BLOCKCOPY_PS(16, 64, cpu); \
+ SETUP_BLOCKCOPY_PS(32, 64, cpu); \
+ SETUP_BLOCKCOPY_PS(48, 64, cpu); \
+ SETUP_BLOCKCOPY_PS(64, 16, cpu); \
+ SETUP_BLOCKCOPY_PS(64, 32, cpu); \
+ SETUP_BLOCKCOPY_PS(64, 48, cpu); \
+ SETUP_BLOCKCOPY_PS(64, 64, cpu);
-#define CHROMA_BLOCKCOPY_DEF_SSE4(cpu) \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(2, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(2, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(4, 2, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(4, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(4, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(4, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(6, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(8, 2, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(8, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(8, 6, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(8, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(8, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(8, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(12, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(16, 4, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(16, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(16, 12, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(16, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(16, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(24, 32, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(32, 8, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(32, 16, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(32, 24, cpu); \
- SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4(32, 32, cpu);
-#define SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(W, H, cpu) \
- void x265_blockcopy_ps_ ## W ## x ## H ## cpu(int16_t * dst, intptr_t dstStride, pixel * src, intptr_t srcStride);
-
-#define LUMA_BLOCKCOPY_DEF_SSE4(cpu) \
- SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(16, 64, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(32, 64, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(48, 64, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 16, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 32, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 48, cpu); \
- SETUP_LUMA_BLOCKCOPY_FUNC_SSE4(64, 64, cpu);
-
-CHROMA_BLOCKCOPY_DEF_SSE4(_sse4);
-LUMA_BLOCKCOPY_DEF_SSE4(_sse4);
-
-void x265_blockcopy_sp_2x4_sse4(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb);
-void x265_blockcopy_sp_2x8_sse4(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb);
-void x265_blockcopy_sp_6x8_sse4(pixel *a, intptr_t stridea, int16_t *b, intptr_t strideb);
+BLOCKCOPY_COMMON(_sse2);
+BLOCKCOPY_SS_PP(_sse2);
+BLOCKCOPY_SP(_sse4);
+BLOCKCOPY_PS(_sse4);
void x265_blockfill_s_4x4_sse2(int16_t *dst, intptr_t dstride, int16_t val);
void x265_blockfill_s_8x8_sse2(int16_t *dst, intptr_t dstride, int16_t val);
void x265_blockfill_s_16x16_sse2(int16_t *dst, intptr_t dstride, int16_t val);
void x265_blockfill_s_32x32_sse2(int16_t *dst, intptr_t dstride, int16_t val);
-#undef SETUP_CHROMA_BLOCKCOPY_FUNC
-#undef SETUP_LUMA_BLOCK_FUNC
-#undef CHROMA_BLOCKCOPY_DEF
-#undef LUMA_BLOCKCOPY_DEF
-
-#undef SETUP_CHROMA_BLOCKCOPY_FUNC_SSE4
-#undef CHROMA_BLOCKCOPY_DEF_SSE4
-#undef SETUP_LUMA_BLOCKCOPY_FUNC_SSE4
-#undef LUMA_BLOCKCOPY_DEF_SSE4
+#undef BLOCKCOPY_COMMON
+#undef BLOCKCOPY_SS_PP
+#undef BLOCKCOPY_SP
+#undef BLOCKCOPY_PS
+#undef SETUP_BLOCKCOPY_PS
+#undef SETUP_BLOCKCOPY_SP
+#undef SETUP_BLOCKCOPY_SS_PP
+#undef SETUP_BLOCKCOPY_FUNC
#endif // ifndef X265_I386_PIXEL_H