[x265] [PATCH] 16bpp: assembly code for sad_NxN functions
dnyaneshwar at multicorewareinc.com
Wed Dec 4 07:23:27 CET 2013
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1386137638 -19800
# Wed Dec 04 11:43:58 2013 +0530
# Node ID 3e6159dfd6a1b59beedf430894b997e848e9e3b1
# Parent 55c0bf9d99661073a7acdb5749e2625379d8393a
16bpp: assembly code for sad_NxN functions
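
For reference, every routine in this patch computes the sum of absolute
differences (SAD) between a 16bpp encoder block and a reference block. A
minimal scalar sketch of the operation follows; the generic width/height
parameters and the sad_ref name are illustrative only, not part of the patch,
and strides are counted in pixels, matching the 2*stride byte addressing used
in the assembly below:

    #include <stdint.h>
    #include <stdlib.h>

    static int sad_ref(const uint16_t *pix1, intptr_t stride1,
                       const uint16_t *pix2, intptr_t stride2,
                       int width, int height)
    {
        int sum = 0;
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                sum += abs(pix1[x] - pix2[x]);
            pix1 += stride1; /* strides are in uint16_t samples */
            pix2 += stride2;
        }
        return sum;
    }

The assembly vectorizes this loop per block size, accumulating the absolute
differences in SIMD registers and reducing them to a scalar at the end of
each function.
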
diff -r 55c0bf9d9966 -r 3e6159dfd6a1 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Tue Dec 03 14:14:44 2013 -0600
+++ b/source/common/CMakeLists.txt Wed Dec 04 11:43:58 2013 +0530
@@ -118,9 +118,15 @@
if(ENABLE_PRIMITIVES_ASM)
set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h)
- set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm ssd-a.asm mc-a.asm
+ set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm
mc-a2.asm ipfilter8.asm pixel-util8.asm blockcopy8.asm intrapred8.asm
pixeladd8.asm dct8.asm)
+ if(HIGH_BIT_DEPTH)
+ set(A_SRCS ${A_SRCS} sad16-a.asm)
+ else()
+ set(A_SRCS ${A_SRCS} sad-a.asm)
+ endif()
+
if (NOT X64)
set(A_SRCS ${A_SRCS} pixel-32.asm)
endif()
diff -r 55c0bf9d9966 -r 3e6159dfd6a1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 03 14:14:44 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 04 11:43:58 2013 +0530
@@ -533,6 +533,27 @@
PIXEL_AVG(sse2);
PIXEL_AVG_W4(mmx2);
LUMA_VAR(_sse2);
+
+ INIT8(sad, _mmx2);
+ p.sad[LUMA_8x32] = x265_pixel_sad_8x32_sse2;
+ p.sad[LUMA_16x4] = x265_pixel_sad_16x4_sse2;
+ p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2;
+ p.sad[LUMA_16x32] = x265_pixel_sad_16x32_sse2;
+
+ p.sad[LUMA_32x8] = x265_pixel_sad_32x8_sse2;
+ p.sad[LUMA_32x16] = x265_pixel_sad_32x16_sse2;
+ p.sad[LUMA_32x24] = x265_pixel_sad_32x24_sse2;
+ p.sad[LUMA_32x32] = x265_pixel_sad_32x32_sse2;
+ p.sad[LUMA_32x64] = x265_pixel_sad_32x64_sse2;
+
+ p.sad[LUMA_64x16] = x265_pixel_sad_64x16_sse2;
+ p.sad[LUMA_64x32] = x265_pixel_sad_64x32_sse2;
+ p.sad[LUMA_64x48] = x265_pixel_sad_64x48_sse2;
+ p.sad[LUMA_64x64] = x265_pixel_sad_64x64_sse2;
+
+ p.sad[LUMA_48x64] = x265_pixel_sad_48x64_sse2;
+ p.sad[LUMA_24x32] = x265_pixel_sad_24x32_sse2;
+ p.sad[LUMA_12x16] = x265_pixel_sad_12x16_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
diff -r 55c0bf9d9966 -r 3e6159dfd6a1 source/common/x86/sad16-a.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/sad16-a.asm Wed Dec 04 11:43:58 2013 +0530
@@ -0,0 +1,772 @@
+;*****************************************************************************
+;* sad16-a.asm: x86 high depth sad functions
+;*****************************************************************************
+;* Copyright (C) 2010-2013 x264 project
+;*
+;* Authors: Oskar Arvidsson <oskar at irock.se>
+;* Henrik Gramner <henrik at gramner.com>
+;* Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing at x264.com.
+;*****************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION .text
+
+cextern pw_1
+
+;=============================================================================
+; SAD MMX
+;=============================================================================
+
+%macro SAD_INC_1x16P_MMX 0
+ movu m1, [r0+ 0]
+ movu m2, [r0+ 8]
+ movu m3, [r0+16]
+ movu m4, [r0+24]
+ psubw m1, [r2+ 0]
+ psubw m2, [r2+ 8]
+ psubw m3, [r2+16]
+ psubw m4, [r2+24]
+ ABSW2 m1, m2, m1, m2, m5, m6
+ ABSW2 m3, m4, m3, m4, m7, m5
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ paddw m1, m2
+ paddw m3, m4
+ paddw m0, m1
+ paddw m0, m3
+%endmacro
+
+%macro SAD_INC_2x8P_MMX 0
+ movu m1, [r0+0]
+ movu m2, [r0+8]
+ movu m3, [r0+2*r1+0]
+ movu m4, [r0+2*r1+8]
+ psubw m1, [r2+0]
+ psubw m2, [r2+8]
+ psubw m3, [r2+2*r3+0]
+ psubw m4, [r2+2*r3+8]
+ ABSW2 m1, m2, m1, m2, m5, m6
+ ABSW2 m3, m4, m3, m4, m7, m5
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ paddw m1, m2
+ paddw m3, m4
+ paddw m0, m1
+ paddw m0, m3
+%endmacro
+
+%macro SAD_INC_2x4P_MMX 0
+ movu m1, [r0]
+ movu m2, [r0+2*r1]
+ psubw m1, [r2]
+ psubw m2, [r2+2*r3]
+ ABSW2 m1, m2, m1, m2, m3, m4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ paddw m0, m1
+ paddw m0, m2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;-----------------------------------------------------------------------------
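+; Each SAD_INC_%3x%1P_MMX step accumulates 16-bit absolute differences into m0.
+; The final reduction uses HADDUW for 16x16, where a word lane can accumulate
+; up to 64 differences and exceed the signed 16-bit range at 10-bit depth, and
+; HADDW for the smaller sizes.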
+%macro SAD_MMX 3
+cglobal pixel_sad_%1x%2, 4,5-(%2&4/4)
+ pxor m0, m0
+%if %2 == 4
+ SAD_INC_%3x%1P_MMX
+ SAD_INC_%3x%1P_MMX
+%else
+ mov r4d, %2/%3
+.loop:
+ SAD_INC_%3x%1P_MMX
+ dec r4d
+ jg .loop
+%endif
+%if %1*%2 == 256
+ HADDUW m0, m1
+%else
+ HADDW m0, m1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_MMX mmx2
+SAD_MMX 16, 16, 1
+SAD_MMX 16, 8, 1
+SAD_MMX 8, 16, 2
+SAD_MMX 8, 8, 2
+SAD_MMX 8, 4, 2
+SAD_MMX 4, 8, 2
+SAD_MMX 4, 4, 2
+SAD_MMX 4, 16, 2
+INIT_MMX ssse3
+SAD_MMX 4, 8, 2
+SAD_MMX 4, 4, 2
+
+;=============================================================================
+; SAD XMM
+;=============================================================================
+
+%macro SAD_1x32 0
+ movu m1, [r2+ 0]
+ movu m2, [r2+16]
+ movu m3, [r2+32]
+ movu m4, [r2+48]
+ psubw m1, [r0+0]
+ psubw m2, [r0+16]
+ psubw m3, [r0+32]
+ psubw m4, [r0+48]
+ ABSW2 m1, m2, m1, m2, m5, m6
+ pmaddwd m1, [pw_1]
+ pmaddwd m2, [pw_1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ ABSW2 m3, m4, m3, m4, m7, m5
+ pmaddwd m3, [pw_1]
+ pmaddwd m4, [pw_1]
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+%endmacro
+
+%macro SAD_1x24 0
+ movu m1, [r2+ 0]
+ movu m2, [r2+16]
+ movu m3, [r2+32]
+ psubw m1, [r0+0]
+ psubw m2, [r0+16]
+ psubw m3, [r0+32]
+ ABSW2 m1, m2, m1, m2, m4, m6
+ pmaddwd m1, [pw_1]
+ pmaddwd m2, [pw_1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ pxor m4, m4
+ psubw m4, m3
+ pmaxsw m3, m4
+ pmaddwd m3, [pw_1]
+ paddd m1, m2
+ paddd m0, m1
+ paddd m0, m3
+%endmacro
+
+%macro SAD_1x48 0
+ movu m1, [r2+ 0]
+ movu m2, [r2+16]
+ movu m3, [r2+32]
+ movu m4, [r2+48]
+ psubw m1, [r0+0]
+ psubw m2, [r0+16]
+ psubw m3, [r0+32]
+ psubw m4, [r0+48]
+ ABSW2 m1, m2, m1, m2, m5, m6
+ pmaddwd m1, [pw_1]
+ pmaddwd m2, [pw_1]
+ ABSW2 m3, m4, m3, m4, m7, m5
+ pmaddwd m3, [pw_1]
+ pmaddwd m4, [pw_1]
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+ movu m1, [r2+64]
+ movu m2, [r2+80]
+ psubw m1, [r0+64]
+ psubw m2, [r0+80]
+ ABSW2 m1, m2, m1, m2, m3, m4
+ pmaddwd m1, [pw_1]
+ pmaddwd m2, [pw_1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ paddd m0, m1
+ paddd m0, m2
+%endmacro
+
+%macro SAD_1x64 0
+ movu m1, [r2+ 0]
+ movu m2, [r2+16]
+ movu m3, [r2+32]
+ movu m4, [r2+48]
+ psubw m1, [r0+0]
+ psubw m2, [r0+16]
+ psubw m3, [r0+32]
+ psubw m4, [r0+48]
+ ABSW2 m1, m2, m1, m2, m5, m6
+ pmaddwd m1, [pw_1]
+ pmaddwd m2, [pw_1]
+ ABSW2 m3, m4, m3, m4, m7, m5
+ pmaddwd m3, [pw_1]
+ pmaddwd m4, [pw_1]
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+ movu m1, [r2+64]
+ movu m2, [r2+80]
+ movu m3, [r2+96]
+ movu m4, [r2+112]
+ psubw m1, [r0+64]
+ psubw m2, [r0+80]
+ psubw m3, [r0+96]
+ psubw m4, [r0+112]
+ ABSW2 m1, m2, m1, m2, m5, m6
+ pmaddwd m1, [pw_1]
+ pmaddwd m2, [pw_1]
+ ABSW2 m3, m4, m3, m4, m7, m5
+ pmaddwd m3, [pw_1]
+ pmaddwd m4, [pw_1]
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+%endmacro
+
+%macro SAD_1x12 0
+ movu m1, [r2+0]
+ movh m2, [r2+16]
+ psubw m1, [r0+0]
+ movh m3, [r0+16]
+ psubw m2, m3
+ ABSW2 m1, m2, m1, m2, m4, m6
+ pmaddwd m1, [pw_1]
+ pmaddwd m2, [pw_1]
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ paddd m1, m2
+ paddd m0, m1
+%endmacro
+
+%macro SAD_INC_2ROW 1
+%if 2*%1 > mmsize
+ movu m1, [r2+ 0]
+ movu m2, [r2+16]
+ movu m3, [r2+2*r3+ 0]
+ movu m4, [r2+2*r3+16]
+ psubw m1, [r0+ 0]
+ psubw m2, [r0+16]
+ psubw m3, [r0+2*r1+ 0]
+ psubw m4, [r0+2*r1+16]
+ ABSW2 m1, m2, m1, m2, m5, m6
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ ABSW2 m3, m4, m3, m4, m7, m5
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+%else
+ movu m1, [r2]
+ movu m2, [r2+2*r3]
+ psubw m1, [r0]
+ psubw m2, [r0+2*r1]
+ ABSW2 m1, m2, m1, m2, m3, m4
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+ paddw m0, m1
+ paddw m0, m2
+%endif
+%endmacro
+
+;-----------------------------------------------------------------------------
+; int pixel_sad_NxM( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;-----------------------------------------------------------------------------
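+; SAD_INC_2ROW processes two rows per iteration (two 8-word loads per row for
+; width 16, one for width 8) and accumulates the absolute differences in m0;
+; HADDW then reduces the accumulator to the final SAD.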
+%macro SAD 2
+cglobal pixel_sad_%1x%2, 4,5-(%2&4/4),8*(%1/mmsize)
+ pxor m0, m0
+%if %2 == 4
+ SAD_INC_2ROW %1
+ SAD_INC_2ROW %1
+%else
+ mov r4d, %2/2
+.loop:
+ SAD_INC_2ROW %1
+ dec r4d
+ jg .loop
+%endif
+
+ HADDW m0, m1
+ movd eax, xm0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD 16, 4
+SAD 16, 8
+SAD 16, 12
+SAD 16, 16
+SAD 16, 32
+
+INIT_XMM sse2
+SAD 8, 4
+SAD 8, 8
+SAD 8, 16
+SAD 8, 32
+
+;------------------------------------------------------------------
+; int pixel_sad_32xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;------------------------------------------------------------------
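+; This and the 64/48/24/12 variants below widen each row's absolute differences
+; to dwords with pmaddwd against pw_1, so the dword accumulator in m0 cannot
+; overflow even for the largest blocks; HADDD produces the final sum.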
+%macro SAD_32 2
+cglobal pixel_sad_%1x%2, 4,5,8
+ pxor m0, m0
+ mov r4d, %2/4
+.loop:
+ SAD_1x32
+ SAD_1x32
+ SAD_1x32
+ SAD_1x32
+ dec r4d
+ jnz .loop
+
+ HADDD m0, m1
+ movd eax, xm0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD_32 32, 8
+SAD_32 32, 16
+SAD_32 32, 24
+SAD_32 32, 32
+SAD_32 32, 64
+
+;------------------------------------------------------------------
+; int pixel_sad_64xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;------------------------------------------------------------------
+%macro SAD_64 2
+cglobal pixel_sad_%1x%2, 4,5,8
+ pxor m0, m0
+ mov r4d, %2/4
+.loop:
+ SAD_1x64
+ SAD_1x64
+ SAD_1x64
+ SAD_1x64
+ dec r4d
+ jnz .loop
+
+ HADDD m0, m1
+ movd eax, xmm0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD_64 64, 16
+SAD_64 64, 32
+SAD_64 64, 48
+SAD_64 64, 64
+
+;------------------------------------------------------------------
+; int pixel_sad_48xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;------------------------------------------------------------------
+%macro SAD_48 2
+cglobal pixel_sad_%1x%2, 4,5,8
+ pxor m0, m0
+ mov r4d, %2/4
+.loop:
+ SAD_1x48
+ SAD_1x48
+ SAD_1x48
+ SAD_1x48
+ dec r4d
+ jnz .loop
+
+ HADDD m0, m1
+ movd eax, xmm0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD_48 48, 64
+
+;------------------------------------------------------------------
+; int pixel_sad_24xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;------------------------------------------------------------------
+%macro SAD_24 2
+cglobal pixel_sad_%1x%2, 4,5,8
+ pxor m0, m0
+ mov r4d, %2/4
+.loop:
+ SAD_1x24
+ SAD_1x24
+ SAD_1x24
+ SAD_1x24
+ dec r4d
+ jnz .loop
+
+ HADDD m0, m1
+ movd eax, xmm0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD_24 24, 32
+
+;------------------------------------------------------------------
+; int pixel_sad_12xN( uint16_t *, intptr_t, uint16_t *, intptr_t )
+;------------------------------------------------------------------
+%macro SAD_12 2
+cglobal pixel_sad_%1x%2, 4,5,8
+ pxor m0, m0
+ mov r4d, %2/4
+.loop:
+ SAD_1x12
+ SAD_1x12
+ SAD_1x12
+ SAD_1x12
+ dec r4d
+ jnz .loop
+
+ HADDD m0, m1
+ movd eax, xmm0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD_12 12, 16
+
+
+;=============================================================================
+; SAD x3/x4
+;=============================================================================
+
+%macro SAD_X3_INC_P 0
+ add r0, 4*FENC_STRIDE
+ lea r1, [r1+4*r4]
+ lea r2, [r2+4*r4]
+ lea r3, [r3+4*r4]
+%endmacro
+
+%macro SAD_X3_ONE_START 0
+ mova m3, [r0]
+ movu m0, [r1]
+ movu m1, [r2]
+ movu m2, [r3]
+ psubw m0, m3
+ psubw m1, m3
+ psubw m2, m3
+ ABSW2 m0, m1, m0, m1, m4, m5
+ ABSW m2, m2, m6
+%endmacro
+
+%macro SAD_X3_ONE 2
+ mova m6, [r0+%1]
+ movu m3, [r1+%2]
+ movu m4, [r2+%2]
+ movu m5, [r3+%2]
+ psubw m3, m6
+ psubw m4, m6
+ psubw m5, m6
+ ABSW2 m3, m4, m3, m4, m7, m6
+ ABSW m5, m5, m6
+ paddw m0, m3
+ paddw m1, m4
+ paddw m2, m5
+%endmacro
+
+%macro SAD_X3_END 2
+%if mmsize == 8 && %1*%2 == 256
+ HADDUW m0, m3
+ HADDUW m1, m4
+ HADDUW m2, m5
+%else
+ HADDW m0, m3
+ HADDW m1, m4
+ HADDW m2, m5
+%endif
+%if UNIX64
+ movd [r5+0], xm0
+ movd [r5+4], xm1
+ movd [r5+8], xm2
+%else
+ mov r0, r5mp
+ movd [r0+0], xm0
+ movd [r0+4], xm1
+ movd [r0+8], xm2
+%endif
+ RET
+%endmacro
+
+%macro SAD_X4_INC_P 0
+ add r0, 4*FENC_STRIDE
+ lea r1, [r1+4*r5]
+ lea r2, [r2+4*r5]
+ lea r3, [r3+4*r5]
+ lea r4, [r4+4*r5]
+%endmacro
+
+%macro SAD_X4_ONE_START 0
+ mova m4, [r0]
+ movu m0, [r1]
+ movu m1, [r2]
+ movu m2, [r3]
+ movu m3, [r4]
+ psubw m0, m4
+ psubw m1, m4
+ psubw m2, m4
+ psubw m3, m4
+ ABSW2 m0, m1, m0, m1, m5, m6
+ ABSW2 m2, m3, m2, m3, m4, m7
+%endmacro
+
+%macro SAD_X4_ONE 2
+ mova m4, [r0+%1]
+ movu m5, [r1+%2]
+ movu m6, [r2+%2]
+%if num_mmregs > 8
+ movu m7, [r3+%2]
+ movu m8, [r4+%2]
+ psubw m5, m4
+ psubw m6, m4
+ psubw m7, m4
+ psubw m8, m4
+ ABSW2 m5, m6, m5, m6, m9, m10
+ ABSW2 m7, m8, m7, m8, m9, m10
+ paddw m0, m5
+ paddw m1, m6
+ paddw m2, m7
+ paddw m3, m8
+%elif cpuflag(ssse3)
+ movu m7, [r3+%2]
+ psubw m5, m4
+ psubw m6, m4
+ psubw m7, m4
+ movu m4, [r4+%2]
+ pabsw m5, m5
+ psubw m4, [r0+%1]
+ pabsw m6, m6
+ pabsw m7, m7
+ pabsw m4, m4
+ paddw m0, m5
+ paddw m1, m6
+ paddw m2, m7
+ paddw m3, m4
+%else ; num_mmregs == 8 && !ssse3
+ psubw m5, m4
+ psubw m6, m4
+ ABSW m5, m5, m7
+ ABSW m6, m6, m7
+ paddw m0, m5
+ paddw m1, m6
+ movu m5, [r3+%2]
+ movu m6, [r4+%2]
+ psubw m5, m4
+ psubw m6, m4
+ ABSW2 m5, m6, m5, m6, m7, m4
+ paddw m2, m5
+ paddw m3, m6
+%endif
+%endmacro
+
+%macro SAD_X4_END 2
+%if mmsize == 8 && %1*%2 == 256
+ HADDUW m0, m4
+ HADDUW m1, m5
+ HADDUW m2, m6
+ HADDUW m3, m7
+%else
+ HADDW m0, m4
+ HADDW m1, m5
+ HADDW m2, m6
+ HADDW m3, m7
+%endif
+ mov r0, r6mp
+ movd [r0+ 0], xm0
+ movd [r0+ 4], xm1
+ movd [r0+ 8], xm2
+ movd [r0+12], xm3
+ RET
+%endmacro
+
+%macro SAD_X_2xNP 4
+ %assign x %3
+%rep %4
+ SAD_X%1_ONE x*mmsize, x*mmsize
+ SAD_X%1_ONE 2*FENC_STRIDE+x*mmsize, 2*%2+x*mmsize
+ %assign x x+1
+%endrep
+%endmacro
+
+%macro PIXEL_VSAD 0
+cglobal pixel_vsad, 3,3,8
+ mova m0, [r0]
+ mova m1, [r0+16]
+ mova m2, [r0+2*r1]
+ mova m3, [r0+2*r1+16]
+ lea r0, [r0+4*r1]
+ psubw m0, m2
+ psubw m1, m3
+ ABSW2 m0, m1, m0, m1, m4, m5
+ paddw m0, m1
+ sub r2d, 2
+ je .end
+.loop:
+ mova m4, [r0]
+ mova m5, [r0+16]
+ mova m6, [r0+2*r1]
+ mova m7, [r0+2*r1+16]
+ lea r0, [r0+4*r1]
+ psubw m2, m4
+ psubw m3, m5
+ psubw m4, m6
+ psubw m5, m7
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ ABSW m4, m4, m1
+ ABSW m5, m5, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ paddw m0, m5
+ mova m2, m6
+ mova m3, m7
+ sub r2d, 2
+ jg .loop
+.end:
+%if BIT_DEPTH == 9
+ HADDW m0, m1 ; max sum: 62(pixel diffs)*511(pixel_max)=31682
+%else
+ HADDUW m0, m1 ; max sum: 62(pixel diffs)*1023(pixel_max)=63426
+%endif
+ movd eax, m0
+ RET
+%endmacro
+INIT_XMM sse2
+PIXEL_VSAD
+INIT_XMM ssse3
+PIXEL_VSAD
+INIT_XMM xop
+PIXEL_VSAD
+
+INIT_YMM avx2
+cglobal pixel_vsad, 3,3
+ mova m0, [r0]
+ mova m1, [r0+2*r1]
+ lea r0, [r0+4*r1]
+ psubw m0, m1
+ pabsw m0, m0
+ sub r2d, 2
+ je .end
+.loop:
+ mova m2, [r0]
+ mova m3, [r0+2*r1]
+ lea r0, [r0+4*r1]
+ psubw m1, m2
+ psubw m2, m3
+ pabsw m1, m1
+ pabsw m2, m2
+ paddw m0, m1
+ paddw m0, m2
+ mova m1, m3
+ sub r2d, 2
+ jg .loop
+.end:
+%if BIT_DEPTH == 9
+ HADDW m0, m1
+%else
+ HADDUW m0, m1
+%endif
+ movd eax, xm0
+ RET
+
+;-----------------------------------------------------------------------------
+; void pixel_sad_xN_WxH( uint16_t *fenc, uint16_t *pix0, uint16_t *pix1,
+; uint16_t *pix2, intptr_t i_stride, int scores[3] )
+;-----------------------------------------------------------------------------
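+; Each SAD_X variant compares one encoder block (rows FENC_STRIDE apart in r0)
+; against three or four reference pointers that share a single stride, writing
+; the resulting SADs into the scores[] output array.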
+%macro SAD_X 3
+cglobal pixel_sad_x%1_%2x%3, 6,7,XMM_REGS
+ %assign regnum %1+1
+ %xdefine STRIDE r %+ regnum
+ mov r6, %3/2-1
+ SAD_X%1_ONE_START
+ SAD_X%1_ONE 2*FENC_STRIDE, 2*STRIDE
+ SAD_X_2xNP %1, STRIDE, 1, %2/(mmsize/2)-1
+.loop:
+ SAD_X%1_INC_P
+ SAD_X_2xNP %1, STRIDE, 0, %2/(mmsize/2)
+ dec r6
+ jg .loop
+%if %1 == 4
+ mov r6, r6m
+%endif
+ SAD_X%1_END %2, %3
+%endmacro
+
+INIT_MMX mmx2
+%define XMM_REGS 0
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
+SAD_X 3, 4, 8
+SAD_X 3, 4, 4
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+SAD_X 4, 4, 8
+SAD_X 4, 4, 4
+INIT_MMX ssse3
+SAD_X 3, 4, 8
+SAD_X 3, 4, 4
+SAD_X 4, 4, 8
+SAD_X 4, 4, 4
+INIT_XMM ssse3
+%define XMM_REGS 7
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
+%define XMM_REGS 9
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+INIT_XMM sse2
+%define XMM_REGS 8
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+SAD_X 3, 8, 16
+SAD_X 3, 8, 8
+SAD_X 3, 8, 4
+%define XMM_REGS 11
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+SAD_X 4, 8, 16
+SAD_X 4, 8, 8
+SAD_X 4, 8, 4
+INIT_YMM avx2
+%define XMM_REGS 7
+SAD_X 3, 16, 16
+SAD_X 3, 16, 8
+%define XMM_REGS 9
+SAD_X 4, 16, 16
+SAD_X 4, 16, 8
+