[x265] [PATCH] asm code and test bench integration code for blockcopy_pp_c partitions
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Mon Nov 4 13:00:37 CET 2013
# HG changeset patch
# User Praveen Tiwari
# Date 1383566415 -19800
# Node ID aca0d5c6b9605e3c56c401711daa20a9630c728b
# Parent 37903c6fd1f90ec6bd166a116254d7cf29d4c90c
asm code and test bench integration code for blockcopy_pp_c partitions
diff -r 37903c6fd1f9 -r aca0d5c6b960 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Mon Nov 04 12:09:06 2013 +0530
+++ b/source/common/CMakeLists.txt Mon Nov 04 17:30:15 2013 +0530
@@ -122,7 +122,7 @@
if(ENABLE_PRIMITIVES_ASM)
set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h)
- set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm)
+ set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm blockcopy8.asm)
if (NOT X64)
set(A_SRCS ${A_SRCS} pixel-32.asm)
endif()
diff -r 37903c6fd1f9 -r aca0d5c6b960 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Nov 04 12:09:06 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Nov 04 17:30:15 2013 +0530
@@ -129,6 +129,9 @@
p.chroma_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu;\
p.chroma_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu;
+#define SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \
+ p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = x265_blockcopy_pp_## W ## x ## H ## cpu;
+
#define CHROMA_FILTERS(cpu) \
SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
SETUP_CHROMA_FUNC_DEF(4, 2, cpu); \
@@ -155,10 +158,38 @@
SETUP_CHROMA_FUNC_DEF(32, 8, cpu); \
SETUP_CHROMA_FUNC_DEF(8, 32, cpu);
+#define CHROMA_BLOCKCOPY(cpu) \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 2, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(2, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 6, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(6, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 2, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(2, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 12, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(12, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 24, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(24, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 32, cpu);
#define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu;\
- p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu
+ p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu;\
+
+#define SETUP_LUMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \
+ p.luma_copy_pp[LUMA_ ## W ## x ## H] = x265_blockcopy_pp_## W ## x ## H ## cpu;
#define LUMA_FILTERS(cpu) \
SETUP_LUMA_FUNC_DEF(4, 4, cpu); \
@@ -185,7 +216,34 @@
SETUP_LUMA_FUNC_DEF(64, 48, cpu); \
SETUP_LUMA_FUNC_DEF(48, 64, cpu); \
SETUP_LUMA_FUNC_DEF(64, 16, cpu); \
- SETUP_LUMA_FUNC_DEF(16, 64, cpu)
+ SETUP_LUMA_FUNC_DEF(16, 64, cpu);
+
+#define LUMA_BLOCKCOPY(cpu) \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(4, 4, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8, 8, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8, 4, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(4, 8, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 8, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 12, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(12, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 4, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(4, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 24, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(24, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 8, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 64, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 64, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 48, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(48, 64, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu);
using namespace x265;
@@ -265,6 +323,8 @@
INIT6(satd, _sse2);
HEVC_SATD(sse2);
+ CHROMA_BLOCKCOPY(_sse2);
+ LUMA_BLOCKCOPY(_sse2);
#if X86_64
p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
diff -r 37903c6fd1f9 -r aca0d5c6b960 source/common/x86/blockcopy8.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/blockcopy8.asm Mon Nov 04 17:30:15 2013 +0530
@@ -0,0 +1,798 @@
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Praveen Kumar Tiwari <praveen at multicorewareinc.com>
+;* Murugan Vairavel <murugan at multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing at multicorewareinc.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_2x4, 4, 7, 0, dest, deststride, src, srcstride
+
+mov r4w, [r2]
+mov r5w, [r2 + r3]
+mov r6w, [r2 + 2 * r3]
+lea r3, [r3 + r3 * 2]
+mov r3w, [r2 + r3]
+
+mov [r0], r4w
+mov [r0 + r1], r5w
+mov [r0 + 2 * r1], r6w
+lea r1, [r1 + 2 * r1]
+mov [r0 + r1], r3w
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_2x8, 4, 7, 0, dest, deststride, src, srcstride
+
+mov r4w, [r2]
+mov r5w, [r2 + r3]
+mov r6w, [r2 + 2 * r3]
+
+mov [r0], r4w
+mov [r0 + r1], r5w
+mov [r0 + 2 * r1], r6w
+
+lea r0, [r0 + 2 * r1]
+lea r2, [r2 + 2 * r3]
+
+mov r4w, [r2 + r3]
+mov r5w, [r2 + 2 * r3]
+
+mov [r0 + r1], r4w
+mov [r0 + 2 * r1], r5w
+
+lea r0, [r0 + 2 * r1]
+lea r2, [r2 + 2 * r3]
+
+mov r4w, [r2 + r3]
+mov r5w, [r2 + 2 * r3]
+
+mov [r0 + r1], r4w
+mov [r0 + 2 * r1], r5w
+
+lea r0, [r0 + 2 * r1]
+lea r2, [r2 + 2 * r3]
+
+mov r4w, [r2 + r3]
+mov [r0 + r1], r4w
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_4x2, 4, 6, 2, dest, deststride, src, srcstride
+
+mov r4d, [r2]
+mov r5d, [r2 + r3]
+
+mov [r0], r4d
+mov [r0 + r1], r5d
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_4x4, 4, 4, 4, dest, deststride, src, srcstride
+
+movd m0, [r2]
+movd m1, [r2 + r3]
+movd m2, [r2 + 2 * r3]
+lea r3, [r3 + r3 * 2]
+movd m3, [r2 + r3]
+
+movd [r0], m0
+movd [r0 + r1], m1
+movd [r0 + 2 * r1], m2
+lea r1, [r1 + 2 * r1]
+movd [r0 + r1], m3
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_4x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_4x8, 4, 6, 8, dest, deststride, src, srcstride
+
+movd m0, [r2]
+movd m1, [r2 + r3]
+movd m2, [r2 + 2 * r3]
+lea r4, [r2 + 2 * r3]
+movd m3, [r4 + r3]
+
+movd m4, [r4 + 2 * r3]
+lea r4, [r4 + 2 * r3]
+movd m5, [r4 + r3]
+movd m6, [r4 + 2 * r3]
+lea r4, [r4 + 2 * r3]
+movd m7, [r4 + r3]
+
+movd [r0], m0
+movd [r0 + r1], m1
+movd [r0 + 2 * r1], m2
+lea r5, [r0 + 2 * r1]
+movd [r5 + r1], m3
+
+movd [r5 + 2 * r1], m4
+lea r5, [r5 + 2 * r1]
+movd [r5 + r1], m5
+movd [r5 + 2 * r1], m6
+lea r5, [r5 + 2 * r1]
+movd [r5 + r1], m7
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W4_H8 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
+
+
+mov r4d, %2
+
+.loop:
+ movd m0, [r2]
+ movd m1, [r2 + r3]
+ movd m2, [r2 + 2 * r3]
+ lea r5, [r2 + 2 * r3]
+ movd m3, [r5 + r3]
+
+ movd m4, [r5 + 2 * r3]
+ lea r5, [r5 + 2 * r3]
+ movd m5, [r5 + r3]
+ movd m6, [r5 + 2 * r3]
+ lea r5, [r5 + 2 * r3]
+ movd m7, [r5 + r3]
+
+ movd [r0], m0
+ movd [r0 + r1], m1
+ movd [r0 + 2 * r1], m2
+ lea r6, [r0 + 2 * r1]
+ movd [r6 + r1], m3
+
+ movd [r6 + 2 * r1], m4
+ lea r6, [r6 + 2 * r1]
+ movd [r6 + r1], m5
+ movd [r6 + 2 * r1], m6
+ lea r6, [r6 + 2 * r1]
+ movd [r6 + r1], m7
+
+ lea r0, [r0 + 8 * r1]
+ lea r2, [r2 + 8 * r3]
+
+ sub r4d, 8
+ jnz .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W4_H8 4, 16
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_6x8, 4, 7, 8, dest, deststride, src, srcstride
+
+movd m0, [r2]
+movd m1, [r2 + r3]
+movd m2, [r2 + 2 * r3]
+lea r5, [r2 + 2 * r3]
+movd m3, [r5 + r3]
+
+movd m4, [r5 + 2 * r3]
+lea r5, [r5 + 2 * r3]
+movd m5, [r5 + r3]
+movd m6, [r5 + 2 * r3]
+lea r5, [r5 + 2 * r3]
+movd m7, [r5 + r3]
+
+movd [r0], m0
+movd [r0 + r1], m1
+movd [r0 + 2 * r1], m2
+lea r6, [r0 + 2 * r1]
+movd [r6 + r1], m3
+
+movd [r6 + 2 * r1], m4
+lea r6, [r6 + 2 * r1]
+movd [r6 + r1], m5
+movd [r6 + 2 * r1], m6
+lea r6, [r6 + 2 * r1]
+movd [r6 + r1], m7
+
+mov r4w, [r2 + 4]
+mov r5w, [r2 + r3 + 4]
+mov r6w, [r2 + 2 * r3 + 4]
+
+mov [r0 + 4], r4w
+mov [r0 + r1 + 4], r5w
+mov [r0 + 2 * r1 + 4], r6w
+
+lea r0, [r0 + 2 * r1]
+lea r2, [r2 + 2 * r3]
+
+mov r4w, [r2 + r3 + 4]
+mov r5w, [r2 + 2 * r3 + 4]
+
+mov [r0 + r1 + 4], r4w
+mov [r0 + 2 * r1 + 4], r5w
+
+lea r0, [r0 + 2 * r1]
+lea r2, [r2 + 2 * r3]
+
+mov r4w, [r2 + r3 + 4]
+mov r5w, [r2 + 2 * r3 + 4]
+
+mov [r0 + r1 + 4], r4w
+mov [r0 + 2 * r1 + 4], r5w
+
+lea r0, [r0 + 2 * r1]
+lea r2, [r2 + 2 * r3]
+
+mov r4w, [r2 + r3 + 4]
+mov [r0 + r1 + 4], r4w
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_8x2, 4, 4, 2, dest, deststride, src, srcstride
+
+movh m0, [r2]
+movh m1, [r2 + r3]
+
+movh [r0], m0
+movh [r0 + r1], m1
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_8x4, 4, 4, 4, dest, deststride, src, srcstride
+
+movh m0, [r2]
+movh m1, [r2 + r3]
+movh m2, [r2 + 2 * r3]
+lea r3, [r3 + r3 * 2]
+movh m3, [r2 + r3]
+
+movh [r0], m0
+movh [r0 + r1], m1
+movh [r0 + 2 * r1], m2
+lea r1, [r1 + 2 * r1]
+movh [r0 + r1], m3
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_8x6(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_8x6, 4, 7, 6, dest, deststride, src, srcstride
+
+movh m0, [r2]
+movh m1, [r2 + r3]
+movh m2, [r2 + 2 * r3]
+lea r5, [r2 + 2 * r3]
+movh m3, [r5 + r3]
+movh m4, [r5 + 2 * r3]
+lea r5, [r5 + 2 * r3]
+movh m5, [r5 + r3]
+
+movh [r0], m0
+movh [r0 + r1], m1
+movh [r0 + 2 * r1], m2
+lea r6, [r0 + 2 * r1]
+movh [r6 + r1], m3
+movh [r6 + 2 * r1], m4
+lea r6, [r6 + 2 * r1]
+movh [r6 + r1], m5
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_8x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_8x8, 4, 7, 8, dest, deststride, src, srcstride
+
+movh m0, [r2]
+movh m1, [r2 + r3]
+movh m2, [r2 + 2 * r3]
+lea r5, [r2 + 2 * r3]
+movh m3, [r5 + r3]
+
+movh m4, [r5 + 2 * r3]
+lea r5, [r5 + 2 * r3]
+movh m5, [r5 + r3]
+movh m6, [r5 + 2 * r3]
+lea r5, [r5 + 2 * r3]
+movh m7, [r5 + r3]
+
+movh [r0], m0
+movh [r0 + r1], m1
+movh [r0 + 2 * r1], m2
+lea r6, [r0 + 2 * r1]
+movh [r6 + r1], m3
+
+movh [r6 + 2 * r1], m4
+lea r6, [r6 + 2 * r1]
+movh [r6 + r1], m5
+movh [r6 + 2 * r1], m6
+lea r6, [r6 + 2 * r1]
+movh [r6 + r1], m7
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W8_H8 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
+
+
+mov r4d, %2
+
+.loop:
+ movh m0, [r2]
+ movh m1, [r2 + r3]
+ movh m2, [r2 + 2 * r3]
+ lea r5, [r2 + 2 * r3]
+ movh m3, [r5 + r3]
+
+ movh m4, [r5 + 2 * r3]
+ lea r5, [r5 + 2 * r3]
+ movh m5, [r5 + r3]
+ movh m6, [r5 + 2 * r3]
+ lea r5, [r5 + 2 * r3]
+ movh m7, [r5 + r3]
+
+ movh [r0], m0
+ movh [r0 + r1], m1
+ movh [r0 + 2 * r1], m2
+ lea r6, [r0 + 2 * r1]
+ movh [r6 + r1], m3
+
+ movh [r6 + 2 * r1], m4
+ lea r6, [r6 + 2 * r1]
+ movh [r6 + r1], m5
+ movh [r6 + 2 * r1], m6
+ lea r6, [r6 + 2 * r1]
+ movh [r6 + r1], m7
+
+ lea r0, [r0 + 8 * r1]
+ lea r2, [r2 + 8 * r3]
+
+ sub r4d, 8
+ jnz .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W8_H8 8, 16
+BLOCKCOPY_PP_W8_H8 8, 32
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W12_H4 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
+
+mov r4d, %2
+
+.loop:
+ movh m0, [r2]
+ movd m1, [r2 + 8]
+
+ movh m2, [r2 + r3]
+ movd m3, [r2 + r3 + 8]
+
+ movh m4, [r2 + 2 * r3]
+ movd m5, [r2 + 2 * r3 + 8]
+
+ lea r5, [r2 + 2 * r3]
+
+ movh m6, [r5 + r3]
+ movd m7, [r5 + r3 + 8]
+
+ movh [r0], m0
+ movd [r0 + 8], m1
+
+ movh [r0 + r1], m2
+ movd [r0 + r1 + 8], m3
+
+ movh [r0 + 2 * r1], m4
+ movd [r0 + 2 * r1 + 8], m5
+
+ lea r6, [r0 + 2 * r1]
+
+ movh [r6 + r1], m6
+ movd [r6 + r1 + 8], m7
+
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+
+ sub r4d, 4
+ jnz .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W12_H4 12, 16
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_16x4, 4, 4, 4, dest, deststride, src, srcstride
+
+movu m0, [r2]
+movu m1, [r2 + r3]
+movu m2, [r2 + 2 * r3]
+lea r3, [r3 + r3 * 2]
+movu m3, [r2 + r3]
+
+movu [r0], m0
+movu [r0 + r1], m1
+movu [r0 + 2 * r1], m2
+lea r1, [r1 + 2 * r1]
+movu [r0 + r1], m3
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_16x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_16x8, 4, 7, 8, dest, deststride, src, srcstride
+
+movu m0, [r2]
+movu m1, [r2 + r3]
+movu m2, [r2 + 2 * r3]
+lea r5, [r2 + 2 * r3]
+movu m3, [r5 + r3]
+
+movu m4, [r5 + 2 * r3]
+lea r5, [r5 + 2 * r3]
+movu m5, [r5 + r3]
+movu m6, [r5 + 2 * r3]
+lea r5, [r5 + 2 * r3]
+movu m7, [r5 + r3]
+
+movu [r0], m0
+movu [r0 + r1], m1
+movu [r0 + 2 * r1], m2
+lea r6, [r0 + 2 * r1]
+movu [r6 + r1], m3
+
+movu [r6 + 2 * r1], m4
+lea r6, [r6 + 2 * r1]
+movu [r6 + r1], m5
+movu [r6 + 2 * r1], m6
+lea r6, [r6 + 2 * r1]
+movu [r6 + r1], m7
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_16x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_16x12, 4, 7, 8, dest, deststride, src, srcstride
+
+movu m0, [r2]
+movu m1, [r2 + r3]
+movu m2, [r2 + 2 * r3]
+lea r5, [r2 + 2 * r3]
+movu m3, [r5 + r3]
+
+movu m4, [r5 + 2 * r3]
+lea r5, [r5 + 2 * r3]
+movu m5, [r5 + r3]
+movu m6, [r5 + 2 * r3]
+lea r5, [r5 + 2 * r3]
+movu m7, [r5 + r3]
+
+movu [r0], m0
+movu [r0 + r1], m1
+movu [r0 + 2 * r1], m2
+lea r6, [r0 + 2 * r1]
+movu [r6 + r1], m3
+
+movu [r6 + 2 * r1], m4
+lea r6, [r6 + 2 * r1]
+movu [r6 + r1], m5
+movu [r6 + 2 * r1], m6
+lea r6, [r6 + 2 * r1]
+movu [r6 + r1], m7
+
+lea r0, [r0 + 8 * r1]
+lea r2, [r2 + 8 * r3]
+
+movu m0, [r2]
+movu m1, [r2 + r3]
+movu m2, [r2 + 2 * r3]
+lea r3, [r3 + r3 * 2]
+movu m3, [r2 + r3]
+
+movu [r0], m0
+movu [r0 + r1], m1
+movu [r0 + 2 * r1], m2
+lea r1, [r1 + 2 * r1]
+movu [r0 + r1], m3
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W16_H8 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
+
+
+mov r4d, %2
+
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ movu m2, [r2 + 2 * r3]
+ lea r5, [r2 + 2 * r3]
+ movu m3, [r5 + r3]
+
+ movu m4, [r5 + 2 * r3]
+ lea r5, [r5 + 2 * r3]
+ movu m5, [r5 + r3]
+ movu m6, [r5 + 2 * r3]
+ lea r5, [r5 + 2 * r3]
+ movu m7, [r5 + r3]
+
+ movu [r0], m0
+ movu [r0 + r1], m1
+ movu [r0 + 2 * r1], m2
+ lea r6, [r0 + 2 * r1]
+ movu [r6 + r1], m3
+
+ movu [r6 + 2 * r1], m4
+ lea r6, [r6 + 2 * r1]
+ movu [r6 + r1], m5
+ movu [r6 + 2 * r1], m6
+ lea r6, [r6 + 2 * r1]
+ movu [r6 + r1], m7
+
+ lea r0, [r0 + 8 * r1]
+ lea r2, [r2 + 8 * r3]
+
+ sub r4d, 8
+ jnz .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W16_H8 16, 16
+BLOCKCOPY_PP_W16_H8 16, 32
+BLOCKCOPY_PP_W16_H8 16, 64
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W24_H32 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
+
+mov r4d, %2
+
+.loop:
+ movu m0, [r2]
+ movh m1, [r2 + 16]
+
+ movu m2, [r2 + r3]
+ movh m3, [r2 + r3 + 16]
+
+ movu m4, [r2 + 2 * r3]
+ movh m5, [r2 + 2 * r3 + 16]
+
+ lea r5, [r2 + 2 * r3]
+
+ movu m6, [r5 + r3]
+ movh m7, [r5 + r3 + 16]
+
+ movu [r0], m0
+ movh [r0 + 16], m1
+
+ movu [r0 + r1], m2
+ movh [r0 + r1 + 16], m3
+
+ movu [r0 + 2 * r1], m4
+ movh [r0 + 2 * r1 + 16], m5
+
+ lea r6, [r0 + 2 * r1]
+
+ movu [r6 + r1], m6
+ movh [r6 + r1 + 16], m7
+
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+
+ sub r4d, 4
+ jnz .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W24_H32 24, 32
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W32_H4 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
+
+mov r4d, %2
+
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+
+ movu m2, [r2 + r3]
+ movu m3, [r2 + r3 + 16]
+
+ movu m4, [r2 + 2 * r3]
+ movu m5, [r2 + 2 * r3 + 16]
+
+ lea r5, [r2 + 2 * r3]
+
+ movu m6, [r5 + r3]
+ movu m7, [r5 + r3 + 16]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+
+ movu [r0 + r1], m2
+ movu [r0 + r1 + 16], m3
+
+ movu [r0 + 2 * r1], m4
+ movu [r0 + 2 * r1 + 16], m5
+
+ lea r6, [r0 + 2 * r1]
+
+ movu [r6 + r1], m6
+ movu [r6 + r1 + 16], m7
+
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+
+ sub r4d, 4
+ jnz .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W32_H4 32, 8
+BLOCKCOPY_PP_W32_H4 32, 16
+BLOCKCOPY_PP_W32_H4 32, 24
+BLOCKCOPY_PP_W32_H4 32, 32
+BLOCKCOPY_PP_W32_H4 32, 64
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W48_H2 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 5, 8, dest, deststride, src, srcstride
+
+mov r4d, %2
+
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + 32]
+
+ movu m3, [r2 + r3]
+ movu m4, [r2 + r3 + 16]
+ movu m5, [r2 + r3 + 32]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + 32], m2
+
+ movu [r0 + r1], m3
+ movu [r0 + r1 + 16], m4
+ movu [r0 + r1 + 32], m5
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ sub r4d, 2
+ jnz .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W48_H2 48, 64
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W64_H2 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 5, 8, dest, deststride, src, srcstride
+
+mov r4d, %2
+
+.loop:
+ movu m0, [r2]
+ movu m1, [r2 + 16]
+ movu m2, [r2 + 32]
+ movu m3, [r2 + 48]
+
+ movu m4, [r2 + r3]
+ movu m5, [r2 + r3 + 16]
+ movu m6, [r2 + r3 + 32]
+ movu m7, [r2 + r3 + 48]
+
+ movu [r0], m0
+ movu [r0 + 16], m1
+ movu [r0 + 32], m2
+ movu [r0 + 48], m3
+
+ movu [r0 + r1], m4
+ movu [r0 + r1 + 16], m5
+ movu [r0 + r1 + 32], m6
+ movu [r0 + r1 + 48], m7
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ sub r4d, 2
+ jnz .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W64_H2 64, 16
+BLOCKCOPY_PP_W64_H2 64, 32
+BLOCKCOPY_PP_W64_H2 64, 48
+BLOCKCOPY_PP_W64_H2 64, 64
diff -r 37903c6fd1f9 -r aca0d5c6b960 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Nov 04 12:09:06 2013 +0530
+++ b/source/common/x86/pixel.h Mon Nov 04 17:30:15 2013 +0530
@@ -268,11 +268,79 @@
DECL_ADS(2, avx2)
DECL_ADS(1, avx2)
+
+
+#define SETUP_CHROMA_BLOCKCOPY_FUNC(W, H, cpu) \
+ void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb);\
+
+#define CHROMA_BLOCKCOPY_DEF(cpu) \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(4, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(4, 2, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(2, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(8, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(8, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(4, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(8, 6, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(6, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(8, 2, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(2, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(16, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(16, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(8, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(16, 12, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(12, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(16, 4, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(4, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(32, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(32, 16, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(16, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(32, 24, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(24, 32, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(32, 8, cpu); \
+ SETUP_CHROMA_BLOCKCOPY_FUNC(8, 32, cpu);
+
+#define SETUP_LUMA_BLOCKCOPY_FUNC(W, H, cpu) \
+ void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb);\
+
+#define LUMA_BLOCKCOPY_DEF(cpu) \
+ SETUP_LUMA_BLOCKCOPY_FUNC(4, 4, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(8, 8, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(8, 4, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(4, 8, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(16, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(16, 8, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(8, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(16, 12, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(12, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(16, 4, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(4, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(32, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(32, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(16, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(32, 24, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(24, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(32, 8, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(8, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(64, 64, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(64, 32, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(32, 64, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(64, 48, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(48, 64, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(64, 16, cpu); \
+ SETUP_LUMA_BLOCKCOPY_FUNC(16, 64, cpu);
+
+CHROMA_BLOCKCOPY_DEF(_sse2);
+LUMA_BLOCKCOPY_DEF(_sse2);
+
#undef DECL_PIXELS
#undef DECL_SUF
#undef DECL_HEVC_SSD
#undef DECL_X1
#undef DECL_X4
#undef DECL_ADS
+#undef SETUP_CHROMA_BLOCKCOPY_FUNC
+#undef SETUP_LUMA_BLOCKCOPY_FUNC
+#undef CHROMA_BLOCKCOPY_DEF
+#undef LUMA_BLOCKCOPY_DEF
#endif // ifndef X265_I386_PIXEL_H
diff -r 37903c6fd1f9 -r aca0d5c6b960 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Nov 04 12:09:06 2013 +0530
+++ b/source/test/pixelharness.cpp Mon Nov 04 17:30:15 2013 +0530
@@ -645,15 +645,6 @@
}
}
- if (opt.chroma_copy_pp[part])
- {
- if (!check_block_copy_pp(ref.chroma_copy_pp[part], opt.chroma_copy_pp[part]))
- {
- printf("chroma_copy_pp[%s] failed\n", chromaPartStr[part]);
- return false;
- }
- }
-
return true;
}
@@ -813,6 +804,17 @@
}
}
+ for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
+ {
+ if (opt.chroma_copy_pp[i])
+ {
+ if (!check_block_copy_pp(ref.chroma_copy_pp[i], opt.chroma_copy_pp[i]))
+ {
+ printf("chroma_copy_pp[%s] failed\n", chromaPartStr[i]);
+ return false;
+ }
+ }
+ }
return true;
}
More information about the x265-devel
mailing list