[x265] [PATCH] asm code and test bench integration code for blockcopy_pp_c partitions

praveen at multicorewareinc.com praveen at multicorewareinc.com
Mon Nov 4 13:00:37 CET 2013


# HG changeset patch
# User Praveen Tiwari
# Date 1383566415 -19800
# Node ID aca0d5c6b9605e3c56c401711daa20a9630c728b
# Parent  37903c6fd1f90ec6bd166a116254d7cf29d4c90c
asm code and test bench integration code for blockcopy_pp_c partitions

diff -r 37903c6fd1f9 -r aca0d5c6b960 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Mon Nov 04 12:09:06 2013 +0530
+++ b/source/common/CMakeLists.txt	Mon Nov 04 17:30:15 2013 +0530
@@ -122,7 +122,7 @@
 
 if(ENABLE_PRIMITIVES_ASM)
     set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h)
-    set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm)
+    set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm blockcopy8.asm)
     if (NOT X64)
         set(A_SRCS ${A_SRCS} pixel-32.asm)
     endif()
diff -r 37903c6fd1f9 -r aca0d5c6b960 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Nov 04 12:09:06 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Nov 04 17:30:15 2013 +0530
@@ -129,6 +129,9 @@
     p.chroma_hpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_horiz_pp_ ## W ## x ## H ## cpu;\
     p.chroma_vpp[CHROMA_ ## W ## x ## H] = x265_interp_4tap_vert_pp_ ## W ## x ## H ## cpu;
 
+#define SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \
+    p.chroma_copy_pp[CHROMA_ ## W ## x ## H] = x265_blockcopy_pp_## W ## x ## H ## cpu;
+
 #define CHROMA_FILTERS(cpu) \
     SETUP_CHROMA_FUNC_DEF(4, 4, cpu); \
     SETUP_CHROMA_FUNC_DEF(4, 2, cpu); \
@@ -155,10 +158,38 @@
     SETUP_CHROMA_FUNC_DEF(32, 8, cpu); \
     SETUP_CHROMA_FUNC_DEF(8, 32, cpu);
 
+#define CHROMA_BLOCKCOPY(cpu) \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 4, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 2, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(2, 4, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 4, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 6, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(6, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 2, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(2, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 12, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(12, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 4, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(4, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(16, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 24, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(24, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(32, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC_DEF(8, 32, cpu);
 
 #define SETUP_LUMA_FUNC_DEF(W, H, cpu) \
     p.luma_hpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_horiz_pp_ ## W ## x ## H ## cpu;\
-    p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu
+    p.luma_vpp[LUMA_ ## W ## x ## H] = x265_interp_8tap_vert_pp_ ## W ## x ## H ## cpu;\
+
+#define SETUP_LUMA_BLOCKCOPY_FUNC_DEF(W, H, cpu) \
+    p.luma_copy_pp[LUMA_ ## W ## x ## H] = x265_blockcopy_pp_## W ## x ## H ## cpu;
 
 #define LUMA_FILTERS(cpu) \
     SETUP_LUMA_FUNC_DEF(4,   4, cpu); \
@@ -185,7 +216,34 @@
     SETUP_LUMA_FUNC_DEF(64, 48, cpu); \
     SETUP_LUMA_FUNC_DEF(48, 64, cpu); \
     SETUP_LUMA_FUNC_DEF(64, 16, cpu); \
-    SETUP_LUMA_FUNC_DEF(16, 64, cpu)
+    SETUP_LUMA_FUNC_DEF(16, 64, cpu);
+
+#define LUMA_BLOCKCOPY(cpu) \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(4,   4, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8,   8, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8,   4, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(4,   8, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 16, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16,  8, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8,  16, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 12, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(12, 16, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16,  4, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(4,  16, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 32, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 16, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 32, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 24, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(24, 32, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32,  8, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(8,  32, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 64, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 32, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(32, 64, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 48, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(48, 64, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu);
 
 using namespace x265;
 
@@ -265,6 +323,8 @@
         INIT6(satd, _sse2);
         HEVC_SATD(sse2);
 
+        CHROMA_BLOCKCOPY(_sse2);
+        LUMA_BLOCKCOPY(_sse2);
 #if X86_64
         p.satd[LUMA_8x32] = x265_pixel_satd_8x32_sse2;
         p.satd[LUMA_16x4] = x265_pixel_satd_16x4_sse2;
diff -r 37903c6fd1f9 -r aca0d5c6b960 source/common/x86/blockcopy8.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/blockcopy8.asm	Mon Nov 04 17:30:15 2013 +0530
@@ -0,0 +1,798 @@
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Praveen Kumar Tiwari <praveen at multicorewareinc.com>
+;*          Murugan Vairavel <murugan at multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing at multicorewareinc.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_2x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_2x4, 4, 7, 0, dest, deststride, src, srcstride
+
+mov     r4w,     [r2]
+mov     r5w,     [r2 + r3]
+mov     r6w,     [r2 + 2 * r3]
+lea      r3,     [r3 + r3 * 2]
+mov      r3w,    [r2 + r3]
+
+mov     [r0],            r4w
+mov     [r0 + r1],       r5w
+mov     [r0 + 2 * r1],   r6w
+lea      r1,              [r1 + 2 * r1]
+mov     [r0 + r1],       r3w
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_2x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_2x8, 4, 7, 0, dest, deststride, src, srcstride
+
+mov     r4w,     [r2]
+mov     r5w,     [r2 + r3]
+mov     r6w,     [r2 + 2 * r3]
+
+mov     [r0],            r4w
+mov     [r0 + r1],       r5w
+mov     [r0 + 2 * r1],   r6w
+
+lea     r0,             [r0 + 2 * r1]
+lea     r2,             [r2 + 2 * r3]
+
+mov     r4w,             [r2 + r3]
+mov     r5w,             [r2 + 2 * r3]
+
+mov     [r0 + r1],       r4w
+mov     [r0 + 2 * r1],   r5w
+
+lea     r0,              [r0 + 2 * r1]
+lea     r2,              [r2 + 2 * r3]
+
+mov     r4w,             [r2 + r3]
+mov     r5w,             [r2 + 2 * r3]
+
+mov     [r0 + r1],       r4w
+mov     [r0 + 2 * r1],   r5w
+
+lea     r0,              [r0 + 2 * r1]
+lea     r2,              [r2 + 2 * r3]
+
+mov     r4w,             [r2 + r3]
+mov     [r0 + r1],       r4w
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_4x2, 4, 6, 2, dest, deststride, src, srcstride
+
+mov     r4d,     [r2]
+mov     r5d,     [r2 + r3]
+
+mov     [r0],            r4d
+mov     [r0 + r1],       r5d
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_4x4, 4, 4, 4, dest, deststride, src, srcstride
+
+movd     m0,     [r2]
+movd     m1,     [r2 + r3]
+movd     m2,     [r2 + 2 * r3]
+lea      r3,     [r3 + r3 * 2]
+movd     m3,     [r2 + r3]
+
+movd     [r0],            m0
+movd     [r0 + r1],       m1
+movd     [r0 + 2 * r1],   m2
+lea      r1,              [r1 + 2 * r1]
+movd     [r0 + r1],       m3
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_4x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_4x8, 4, 6, 8, dest, deststride, src, srcstride
+
+movd     m0,     [r2]
+movd     m1,     [r2 + r3]
+movd     m2,     [r2 + 2 * r3]
+lea      r4,     [r2 + 2 * r3]
+movd     m3,     [r4 + r3]
+
+movd     m4,     [r4 + 2 * r3]
+lea      r4,     [r4 + 2 * r3]
+movd     m5,     [r4 + r3]
+movd     m6,     [r4 + 2 * r3]
+lea      r4,     [r4 + 2 * r3]
+movd     m7,     [r4 + r3]
+
+movd     [r0],                m0
+movd     [r0 + r1],           m1
+movd     [r0 + 2 * r1],       m2
+lea      r5,                  [r0 + 2 * r1]
+movd     [r5 + r1],           m3
+
+movd     [r5 + 2 * r1],        m4
+lea      r5,                   [r5 + 2 * r1]
+movd     [r5 + r1],            m5
+movd     [r5 + 2 * r1],        m6
+lea      r5,                   [r5 + 2 * r1]
+movd     [r5 + r1],            m7
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W4_H8 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
+
+
+mov         r4d,       %2
+
+.loop
+     movd     m0,     [r2]
+     movd     m1,     [r2 + r3]
+     movd     m2,     [r2 + 2 * r3]
+     lea      r5,     [r2 + 2 * r3]
+     movd     m3,     [r5 + r3]
+
+     movd     m4,     [r5 + 2 * r3]
+     lea      r5,     [r5 + 2 * r3]
+     movd     m5,     [r5 + r3]
+     movd     m6,     [r5 + 2 * r3]
+     lea      r5,     [r5 + 2 * r3]
+     movd     m7,     [r5 + r3]
+
+     movd     [r0],                m0
+     movd     [r0 + r1],           m1
+     movd     [r0 + 2 * r1],       m2
+     lea      r6,                  [r0 + 2 * r1]
+     movd     [r6 + r1],           m3
+
+     movd     [r6 + 2 * r1],       m4
+     lea      r6,                  [r6 + 2 * r1]
+     movd     [r6 + r1],           m5
+     movd     [r6 + 2 * r1],       m6
+     lea      r6,                  [r6 + 2 * r1]
+     movd     [r6 + r1],           m7
+
+    lea         r0,           [r0 + 8 * r1]
+    lea         r2,           [r2 + 8 * r3]
+
+    sub         r4d,           8
+    jnz        .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W4_H8 4, 16
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_6x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_6x8, 4, 7, 8, dest, deststride, src, srcstride
+
+movd     m0,     [r2]
+movd     m1,     [r2 + r3]
+movd     m2,     [r2 + 2 * r3]
+lea      r5,     [r2 + 2 * r3]
+movd     m3,     [r5 + r3]
+
+movd     m4,     [r5 + 2 * r3]
+lea      r5,     [r5 + 2 * r3]
+movd     m5,     [r5 + r3]
+movd     m6,     [r5 + 2 * r3]
+lea      r5,     [r5 + 2 * r3]
+movd     m7,     [r5 + r3]
+
+movd     [r0],                m0
+movd     [r0 + r1],           m1
+movd     [r0 + 2 * r1],       m2
+lea      r6,                  [r0 + 2 * r1]
+movd     [r6 + r1],           m3
+
+movd     [r6 + 2 * r1],        m4
+lea      r6,                   [r6 + 2 * r1]
+movd     [r6 + r1],            m5
+movd     [r6 + 2 * r1],        m6
+lea      r6,                   [r6 + 2 * r1]
+movd     [r6 + r1],            m7
+
+mov     r4w,     [r2 + 4]
+mov     r5w,     [r2 + r3 + 4]
+mov     r6w,     [r2 + 2 * r3 + 4]
+
+mov     [r0 + 4],            r4w
+mov     [r0 + r1 + 4],       r5w
+mov     [r0 + 2 * r1 + 4],   r6w
+
+lea     r0,              [r0 + 2 * r1]
+lea     r2,              [r2 + 2 * r3]
+
+mov     r4w,             [r2 + r3 + 4]
+mov     r5w,             [r2 + 2 * r3 + 4]
+
+mov     [r0 + r1 + 4],       r4w
+mov     [r0 + 2 * r1 + 4],   r5w
+
+lea     r0,              [r0 + 2 * r1]
+lea     r2,              [r2 + 2 * r3]
+
+mov     r4w,             [r2 + r3 + 4]
+mov     r5w,             [r2 + 2 * r3 + 4]
+
+mov     [r0 + r1 + 4],       r4w
+mov     [r0 + 2 * r1 + 4],   r5w
+
+lea     r0,              [r0 + 2 * r1]
+lea     r2,              [r2 + 2 * r3]
+
+mov     r4w,             [r2 + r3 + 4]
+mov     [r0 + r1 + 4],       r4w
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_8x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_8x2, 4, 4, 2, dest, deststride, src, srcstride
+
+movh     m0,        [r2]
+movh     m1,        [r2 + r3]
+
+movh     [r0],       m0
+movh     [r0 + r1],  m1
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_8x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_8x4, 4, 4, 4, dest, deststride, src, srcstride
+
+movh     m0,     [r2]
+movh     m1,     [r2 + r3]
+movh     m2,     [r2 + 2 * r3]
+lea      r3,     [r3 + r3 * 2]
+movh     m3,     [r2 + r3]
+
+movh     [r0],            m0
+movh     [r0 + r1],       m1
+movh     [r0 + 2 * r1],   m2
+lea      r1,              [r1 + 2 * r1]
+movh     [r0 + r1],       m3
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_8x6(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_8x6, 4, 7, 6, dest, deststride, src, srcstride
+
+movh     m0,     [r2]
+movh     m1,     [r2 + r3]
+movh     m2,     [r2 + 2 * r3]
+lea      r5,     [r2 + 2 * r3]
+movh     m3,     [r5 + r3]
+movh     m4,     [r5 + 2 * r3]
+lea      r5,     [r5 + 2 * r3]
+movh     m5,     [r5 + r3]
+
+movh     [r0],            m0
+movh     [r0 + r1],       m1
+movh     [r0 + 2 * r1],   m2
+lea      r6,              [r0 + 2 * r1]
+movh     [r6 + r1],       m3
+movh     [r6 + 2 * r1],   m4
+lea      r6,              [r6 + 2 * r1]
+movh     [r6 + r1],       m5
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_8x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_8x8, 4, 7, 8, dest, deststride, src, srcstride
+
+movh     m0,     [r2]
+movh     m1,     [r2 + r3]
+movh     m2,     [r2 + 2 * r3]
+lea      r5,     [r2 + 2 * r3]
+movh     m3,     [r5 + r3]
+
+movh     m4,     [r5 + 2 * r3]
+lea      r5,     [r5 + 2 * r3]
+movh     m5,     [r5 + r3]
+movh     m6,     [r5 + 2 * r3]
+lea      r5,     [r5 + 2 * r3]
+movh     m7,     [r5 + r3]
+
+movh     [r0],                m0
+movh     [r0 + r1],           m1
+movh     [r0 + 2 * r1],       m2
+lea      r6,                  [r0 + 2 * r1]
+movh     [r6 + r1],           m3
+
+movh     [r6 + 2 * r1],        m4
+lea      r6,                   [r6 + 2 * r1]
+movh     [r6 + r1],            m5
+movh     [r6 + 2 * r1],        m6
+lea      r6,                   [r6 + 2 * r1]
+movh     [r6 + r1],            m7
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W8_H8 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
+
+
+mov         r4d,       %2
+
+.loop
+     movh     m0,     [r2]
+     movh     m1,     [r2 + r3]
+     movh     m2,     [r2 + 2 * r3]
+     lea      r5,     [r2 + 2 * r3]
+     movh     m3,     [r5 + r3]
+
+     movh     m4,     [r5 + 2 * r3]
+     lea      r5,     [r5 + 2 * r3]
+     movh     m5,     [r5 + r3]
+     movh     m6,     [r5 + 2 * r3]
+     lea      r5,     [r5 + 2 * r3]
+     movh     m7,     [r5 + r3]
+
+     movh     [r0],                m0
+     movh     [r0 + r1],           m1
+     movh     [r0 + 2 * r1],       m2
+     lea      r6,                  [r0 + 2 * r1]
+     movh     [r6 + r1],           m3
+
+     movh     [r6 + 2 * r1],        m4
+     lea      r6,                   [r6 + 2 * r1]
+     movh     [r6 + r1],            m5
+     movh     [r6 + 2 * r1],        m6
+     lea      r6,                   [r6 + 2 * r1]
+     movh     [r6 + r1],            m7
+
+     lea         r0,           [r0 + 8 * r1]
+     lea         r2,           [r2 + 8 * r3]
+
+     sub         r4d,           8
+     jnz        .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W8_H8 8, 16
+BLOCKCOPY_PP_W8_H8 8, 32
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W12_H4 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
+
+mov         r4d,       %2
+
+.loop
+      movh     m0,     [r2]
+      movd     m1,     [r2 + 8]
+
+      movh     m2,     [r2 + r3]
+      movd     m3,     [r2 + r3 + 8]
+
+      movh     m4,     [r2 + 2 * r3]
+      movd     m5,     [r2 + 2 * r3 + 8]
+
+      lea      r5,     [r2 + 2 * r3]
+
+      movh     m6,     [r5 + r3]
+      movd     m7,     [r5 + r3 + 8]
+
+      movh     [r0],                 m0
+      movd     [r0 + 8],             m1
+
+      movh     [r0 + r1],            m2
+      movd     [r0 + r1 + 8],        m3
+
+      movh     [r0 + 2 * r1],        m4
+      movd     [r0 + 2 * r1 + 8],    m5
+
+      lea      r6,                   [r0 + 2 * r1]
+
+      movh     [r6 + r1],            m6
+      movd     [r6 + r1 + 8],        m7
+
+      lea      r0,                   [r0 + 4 * r1]
+      lea      r2,                   [r2 + 4 * r3]
+
+      sub      r4d,                   4
+      jnz      .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W12_H4 12, 16
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_16x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_16x4, 4, 4, 4, dest, deststride, src, srcstride
+
+movu     m0,     [r2]
+movu     m1,     [r2 + r3]
+movu     m2,     [r2 + 2 * r3]
+lea      r3,     [r3 + r3 * 2]
+movu     m3,     [r2 + r3]
+
+movu     [r0],            m0
+movu     [r0 + r1],       m1
+movu     [r0 + 2 * r1],   m2
+lea      r1,              [r1 + 2 * r1]
+movu     [r0 + r1],       m3
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_16x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_16x8, 4, 7, 8, dest, deststride, src, srcstride
+
+movu     m0,     [r2]
+movu     m1,     [r2 + r3]
+movu     m2,     [r2 + 2 * r3]
+lea      r5,     [r2 + 2 * r3]
+movu     m3,     [r5 + r3]
+
+movu     m4,     [r5 + 2 * r3]
+lea      r5,     [r5 + 2 * r3]
+movu     m5,     [r5 + r3]
+movu     m6,     [r5 + 2 * r3]
+lea      r5,     [r5 + 2 * r3]
+movu     m7,     [r5 + r3]
+
+movu     [r0],                m0
+movu     [r0 + r1],           m1
+movu     [r0 + 2 * r1],       m2
+lea      r6,                  [r0 + 2 * r1]
+movu     [r6 + r1],           m3
+
+movu     [r6 + 2 * r1],        m4
+lea      r6,                   [r6 + 2 * r1]
+movu     [r6 + r1],            m5
+movu     [r6 + 2 * r1],        m6
+lea      r6,                   [r6 + 2 * r1]
+movu     [r6 + r1],            m7
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_16x12(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal blockcopy_pp_16x12, 4, 7, 8, dest, deststride, src, srcstride
+
+movu     m0,            [r2]
+movu     m1,            [r2 + r3]
+movu     m2,            [r2 + 2 * r3]
+lea      r5,            [r2 + 2 * r3]
+movu     m3,            [r5 + r3]
+
+movu     m4,            [r5 + 2 * r3]
+lea      r5,            [r5 + 2 * r3]
+movu     m5,            [r5 + r3]
+movu     m6,            [r5 + 2 * r3]
+lea      r5,            [r5 + 2 * r3]
+movu     m7,            [r5 + r3]
+
+movu     [r0],            m0
+movu     [r0 + r1],       m1
+movu     [r0 + 2 * r1],   m2
+lea      r6,              [r0 + 2 * r1]
+movu     [r6 + r1],       m3
+
+movu     [r6 + 2 * r1],   m4
+lea      r6,              [r6 + 2 * r1]
+movu     [r6 + r1],       m5
+movu     [r6 + 2 * r1],   m6
+lea      r6,              [r6 + 2 * r1]
+movu     [r6 + r1],       m7
+
+lea      r0,           [r0 + 8 * r1]
+lea      r2,           [r2 + 8 * r3]
+
+movu     m0,              [r2]
+movu     m1,              [r2 + r3]
+movu     m2,              [r2 + 2 * r3]
+lea      r3,              [r3 + r3 * 2]
+movu     m3,              [r2 + r3]
+
+movu     [r0],            m0
+movu     [r0 + r1],       m1
+movu     [r0 + 2 * r1],   m2
+lea      r1,              [r1 + 2 * r1]
+movu     [r0 + r1],       m3
+
+RET
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W16_H8 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
+
+
+mov      r4d,       %2
+
+.loop
+      movu     m0,     [r2]
+      movu     m1,     [r2 + r3]
+      movu     m2,     [r2 + 2 * r3]
+      lea      r5,     [r2 + 2 * r3]
+      movu     m3,     [r5 + r3]
+
+      movu     m4,     [r5 + 2 * r3]
+      lea      r5,     [r5 + 2 * r3]
+      movu     m5,     [r5 + r3]
+      movu     m6,     [r5 + 2 * r3]
+      lea      r5,     [r5 + 2 * r3]
+      movu     m7,     [r5 + r3]
+
+      movu     [r0],            m0
+      movu     [r0 + r1],       m1
+      movu     [r0 + 2 * r1],   m2
+      lea      r6,              [r0 + 2 * r1]
+      movu     [r6 + r1],       m3
+
+      movu     [r6 + 2 * r1],   m4
+      lea      r6,              [r6 + 2 * r1]
+      movu     [r6 + r1],       m5
+      movu     [r6 + 2 * r1],   m6
+      lea      r6,              [r6 + 2 * r1]
+      movu     [r6 + r1],       m7
+
+      lea      r0,           [r0 + 8 * r1]
+      lea      r2,           [r2 + 8 * r3]
+
+      sub      r4d,          8
+      jnz      .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W16_H8 16, 16
+BLOCKCOPY_PP_W16_H8 16, 32
+BLOCKCOPY_PP_W16_H8 16, 64
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W24_H32 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
+
+mov         r4d,       %2
+
+.loop
+      movu     m0,     [r2]
+      movh     m1,     [r2 + 16]
+
+      movu     m2,     [r2 + r3]
+      movh     m3,     [r2 + r3 + 16]
+
+      movu     m4,     [r2 + 2 * r3]
+      movh     m5,     [r2 + 2 * r3 + 16]
+
+      lea      r5,     [r2 + 2 * r3]
+
+      movu     m6,     [r5 + r3]
+      movh     m7,     [r5 + r3 + 16]
+
+      movu     [r0],                 m0
+      movh     [r0 + 16],            m1
+
+      movu     [r0 + r1],            m2
+      movh     [r0 + r1 + 16],       m3
+
+      movu     [r0 + 2 * r1],        m4
+      movh     [r0 + 2 * r1 + 16],   m5
+
+      lea      r6,                   [r0 + 2 * r1]
+
+      movu     [r6 + r1],            m6
+      movh     [r6 + r1 + 16],       m7
+
+      lea      r0,                   [r0 + 4 * r1]
+      lea      r2,                   [r2 + 4 * r3]
+
+      sub      r4d,                  4
+      jnz      .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W24_H32 24, 32
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W32_H4 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
+
+mov         r4d,       %2
+
+.loop
+      movu     m0,     [r2]
+      movu     m1,     [r2 + 16]
+
+      movu     m2,     [r2 + r3]
+      movu     m3,     [r2 + r3 + 16]
+
+      movu     m4,     [r2 + 2 * r3]
+      movu     m5,     [r2 + 2 * r3 + 16]
+
+      lea      r5,     [r2 + 2 * r3]
+
+      movu     m6,     [r5 + r3]
+      movu     m7,     [r5 + r3 + 16]
+
+      movu     [r0],                 m0
+      movu     [r0 + 16],            m1
+
+      movu     [r0 + r1],            m2
+      movu     [r0 + r1 + 16],       m3
+
+      movu     [r0 + 2 * r1],        m4
+      movu     [r0 + 2 * r1 + 16],   m5
+
+      lea      r6,                   [r0 + 2 * r1]
+
+      movu     [r6 + r1],            m6
+      movu     [r6 + r1 + 16],       m7
+
+      lea      r0,                   [r0 + 4 * r1]
+      lea      r2,                   [r2 + 4 * r3]
+
+      sub      r4d,                  4
+      jnz      .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W32_H4 32, 8
+BLOCKCOPY_PP_W32_H4 32, 16
+BLOCKCOPY_PP_W32_H4 32, 24
+BLOCKCOPY_PP_W32_H4 32, 32
+BLOCKCOPY_PP_W32_H4 32, 64
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W48_H2 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 5, 8, dest, deststride, src, srcstride
+
+mov         r4d,       %2
+
+.loop
+     movu     m0,     [r2]
+     movu     m1,     [r2 + 16]
+     movu     m2,     [r2 + 32]
+
+     movu     m3,     [r2 + r3]
+     movu     m4,     [r2 + r3 + 16]
+     movu     m5,     [r2 + r3 + 32]
+
+     movu     [r0],                 m0
+     movu     [r0 + 16],            m1
+     movu     [r0 + 32],            m2
+
+     movu     [r0 + r1],            m3
+     movu     [r0 + r1 + 16],       m4
+     movu     [r0 + r1 + 32],       m5
+
+     lea      r0,                   [r0 + 2 * r1]
+     lea      r2,                   [r2 + 2 * r3]
+
+     sub      r4d,                  2
+     jnz      .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W48_H2 48, 64
+
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
+;-----------------------------------------------------------------------------
+%macro BLOCKCOPY_PP_W64_H2 2
+INIT_XMM sse2
+cglobal blockcopy_pp_%1x%2, 4, 5, 8, dest, deststride, src, srcstride
+
+mov         r4d,       %2
+
+.loop
+     movu     m0,     [r2]
+     movu     m1,     [r2 + 16]
+     movu     m2,     [r2 + 32]
+     movu     m3,     [r2 + 48]
+
+     movu     m4,     [r2 + r3]
+     movu     m5,     [r2 + r3 + 16]
+     movu     m6,     [r2 + r3 + 32]
+     movu     m7,     [r2 + r3 + 48]
+
+     movu     [r0],                 m0
+     movu     [r0 + 16],            m1
+     movu     [r0 + 32],            m2
+     movu     [r0 + 48],            m3
+
+     movu     [r0 + r1],            m4
+     movu     [r0 + r1 + 16],       m5
+     movu     [r0 + r1 + 32],       m6
+     movu     [r0 + r1 + 48],       m7
+
+     lea      r0,                   [r0 + 2 * r1]
+     lea      r2,                   [r2 + 2 * r3]
+
+     sub      r4d,                  2
+     jnz      .loop
+
+RET
+%endmacro
+
+BLOCKCOPY_PP_W64_H2 64, 16
+BLOCKCOPY_PP_W64_H2 64, 32
+BLOCKCOPY_PP_W64_H2 64, 48
+BLOCKCOPY_PP_W64_H2 64, 64
diff -r 37903c6fd1f9 -r aca0d5c6b960 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Mon Nov 04 12:09:06 2013 +0530
+++ b/source/common/x86/pixel.h	Mon Nov 04 17:30:15 2013 +0530
@@ -268,11 +268,79 @@
 DECL_ADS(2, avx2)
 DECL_ADS(1, avx2)
 
+
+
+#define SETUP_CHROMA_BLOCKCOPY_FUNC(W, H, cpu) \
+    void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb);\
+
+#define CHROMA_BLOCKCOPY_DEF(cpu) \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(4, 4, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(4, 2, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(2, 4, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(8, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(8, 4, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(4, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(8, 6, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(6, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(8, 2, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(2, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(16, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(16, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(8, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(16, 12, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(12, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(16, 4, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(4, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(32, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(32, 16, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(16, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(32, 24, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(24, 32, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(32, 8, cpu); \
+    SETUP_CHROMA_BLOCKCOPY_FUNC(8, 32, cpu);
+
+#define SETUP_LUMA_BLOCKCOPY_FUNC(W, H, cpu) \
+    void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel *a, intptr_t stridea, pixel *b, intptr_t strideb);\
+
+#define LUMA_BLOCKCOPY_DEF(cpu) \
+    SETUP_LUMA_BLOCKCOPY_FUNC(4,   4, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(8,   8, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(8,   4, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(4,   8, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(16, 16, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(16,  8, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(8,  16, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(16, 12, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(12, 16, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(16,  4, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(4,  16, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(32, 32, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(32, 16, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(16, 32, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(32, 24, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(24, 32, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(32,  8, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(8,  32, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(64, 64, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(64, 32, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(32, 64, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(64, 48, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(48, 64, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(64, 16, cpu); \
+    SETUP_LUMA_BLOCKCOPY_FUNC(16, 64, cpu);
+
+CHROMA_BLOCKCOPY_DEF(_sse2);
+LUMA_BLOCKCOPY_DEF(_sse2);
+
 #undef DECL_PIXELS
 #undef DECL_SUF
 #undef DECL_HEVC_SSD
 #undef DECL_X1
 #undef DECL_X4
 #undef DECL_ADS
+#undef SETUP_CHROMA_BLOCKCOPY_FUNC
+#undef SETUP_LUMA_BLOCKCOPY_FUNC
+#undef CHROMA_BLOCKCOPY_DEF
+#undef LUMA_BLOCKCOPY_DEF
 
 #endif // ifndef X265_I386_PIXEL_H
diff -r 37903c6fd1f9 -r aca0d5c6b960 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Nov 04 12:09:06 2013 +0530
+++ b/source/test/pixelharness.cpp	Mon Nov 04 17:30:15 2013 +0530
@@ -645,15 +645,6 @@
         }
     }
 
-    if (opt.chroma_copy_pp[part])
-    {
-        if (!check_block_copy_pp(ref.chroma_copy_pp[part], opt.chroma_copy_pp[part]))
-        {
-            printf("chroma_copy_pp[%s] failed\n", chromaPartStr[part]);
-            return false;
-        }
-    }
-
     return true;
 }
 
@@ -813,6 +804,17 @@
         }
     }
 
+    for (int i = 0; i < NUM_CHROMA_PARTITIONS; i++)
+    {
+      if (opt.chroma_copy_pp[i])
+      {
+          if (!check_block_copy_pp(ref.chroma_copy_pp[i], opt.chroma_copy_pp[i]))
+          {
+              printf("chroma_copy_pp[%s] failed\n", chromaPartStr[i]);
+              return false;
+          }
+      }
+    }
     return true;
 }
 


More information about the x265-devel mailing list