[x265] [PATCH 3 of 3] use x264 pixel average assembly routines for bidir and lowres QPEL
Steve Borho
steve at borho.org
Sat Oct 5 00:02:03 CEST 2013
# HG changeset patch
# User Steve Borho <steve at borho.org>
# Date 1380924067 18000
# Fri Oct 04 17:01:07 2013 -0500
# Node ID 84d0f4f907f75f32de74762eee0c28e76337da6f
# Parent 7976c35f5b769431f99db1a77151329be16934ba
use x264 pixel average assembly routines for bidir and lowres QPEL
This required adding a weight parameter and re-ordering the arguments to match
the x264 function signature. Bidir might eventually use the weighting feature,
so this didn't seem like a bad trade-off.
This commit naively pulls in all of mc-a.asm from x264 for just this one
set of assembly functions.
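For reference, a rough sketch of the C primitive with the new argument order
(the equal-weight loop body below is illustrative only, not copied from the
tree; the assembly treats a weight of 32 as the plain-average fast path and
anything else as weighted biprediction):

#include <cstdint>

typedef uint8_t pixel; // 8-bit build; HIGH_BIT_DEPTH builds use a 16-bit type

// dst/dstride first, then each source pointer followed immediately by its
// stride, then the weight -- the same order as the x264 assembly routines.
template<int lx, int ly>
void pixelavg_pp(pixel* dst, intptr_t dstride,
                 pixel* src0, intptr_t sstride0,
                 pixel* src1, intptr_t sstride1,
                 int /* weight, ignored by the C reference */)
{
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
            dst[x] = (pixel)((src0[x] + src1[x] + 1) >> 1); // equal-weight average

        src0 += sstride0;
        src1 += sstride1;
        dst += dstride;
    }
}

// Callers pass 32 (out of 64) for a plain bidir average, e.g.:
//   primitives.pixelavg_pp[partEnum](avg, roiWidth,
//                                    ref0, stride0, ref1, stride1, 32);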
diff -r 7976c35f5b76 -r 84d0f4f907f7 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Fri Oct 04 16:02:01 2013 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Fri Oct 04 17:01:07 2013 -0500
@@ -2428,7 +2428,7 @@
pixel avg[MAX_CU_SIZE * MAX_CU_SIZE];
int partEnum = PartitionFromSizes(roiWidth, roiHeight);
- primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, ref1, m_predYuv[0].getStride(), m_predYuv[1].getStride());
+ primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, m_predYuv[0].getStride(), ref1, m_predYuv[1].getStride(), 32);
int satdCost = primitives.satd[partEnum](pu, fenc->getStride(), avg, roiWidth);
bits[2] = bits[0] + bits[1] - mbBits[0] - mbBits[1] + mbBits[2];
@@ -2441,7 +2441,7 @@
intptr_t refStride = cu->getSlice()->m_mref[0][refIdx[0]]->lumaStride;
partEnum = PartitionFromSizes(roiWidth, roiHeight);
- primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, ref1, refStride, refStride);
+ primitives.pixelavg_pp[partEnum](avg, roiWidth, ref0, refStride, ref1, refStride, 32);
satdCost = primitives.satd[partEnum](pu, fenc->getStride(), avg, roiWidth);
diff -r 7976c35f5b76 -r 84d0f4f907f7 source/common/pixel.cpp
--- a/source/common/pixel.cpp Fri Oct 04 16:02:01 2013 -0500
+++ b/source/common/pixel.cpp Fri Oct 04 17:01:07 2013 -0500
@@ -580,7 +580,7 @@
}
template<int lx, int ly>
-void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, pixel* src1, intptr_t sstride0, intptr_t sstride1)
+void pixelavg_pp(pixel* dst, intptr_t dstride, pixel* src0, intptr_t sstride0, pixel* src1, intptr_t sstride1, int)
{
for (int y = 0; y < ly; y++)
{
diff -r 7976c35f5b76 -r 84d0f4f907f7 source/common/primitives.h
--- a/source/common/primitives.h Fri Oct 04 16:02:01 2013 -0500
+++ b/source/common/primitives.h Fri Oct 04 17:01:07 2013 -0500
@@ -196,7 +196,7 @@
typedef void (*pixelsub_sp_t)(int bx, int by, short *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
typedef void (*pixeladd_ss_t)(int bx, int by, short *dst, intptr_t dstride, short *src0, short *src1, intptr_t sstride0, intptr_t sstride1);
typedef void (*pixeladd_pp_t)(int bx, int by, pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
-typedef void (*pixelavg_pp_t)(pixel *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
+typedef void (*pixelavg_pp_t)(pixel *dst, intptr_t dstride, pixel *src0, intptr_t sstride0, pixel *src1, intptr_t sstride1, int weight);
typedef void (*blockfil_s_t)(short *dst, intptr_t dstride, short val);
typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter);
diff -r 7976c35f5b76 -r 84d0f4f907f7 source/common/x86/CMakeLists.txt
--- a/source/common/x86/CMakeLists.txt Fri Oct 04 16:02:01 2013 -0500
+++ b/source/common/x86/CMakeLists.txt Fri Oct 04 17:01:07 2013 -0500
@@ -5,7 +5,7 @@
add_definitions(-DHAVE_ALIGNED_STACK=0)
endif()
-set(ASMS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a2.asm ipfilter8.asm)
+set(ASMS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm)
if (X64)
add_definitions(-DARCH_X86_64=1)
else()
diff -r 7976c35f5b76 -r 84d0f4f907f7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Oct 04 16:02:01 2013 -0500
+++ b/source/common/x86/asm-primitives.cpp Fri Oct 04 17:01:07 2013 -0500
@@ -38,6 +38,19 @@
LOWRES(avx)
LOWRES(xop)
+#define DECL_SUF( func, args )\
+ void func##_mmx2 args;\
+ void func##_sse2 args;\
+ void func##_ssse3 args;
+DECL_SUF( x265_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x265_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x265_pixel_avg_8x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x265_pixel_avg_8x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x265_pixel_avg_8x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x265_pixel_avg_4x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x265_pixel_avg_4x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+DECL_SUF( x265_pixel_avg_4x4, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
+
void x265_filterHorizontal_p_p_4_sse4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff);
}
@@ -101,6 +114,13 @@
p.sse_##type[PARTITION_##width##x8] = (pixelcmp_t) x265_pixel_ssd_##width##x8_##suffix; \
p.sse_##type[PARTITION_##width##x4] = (pixelcmp_t) x265_pixel_ssd_##width##x4_##suffix; \
+#define PIXEL_AVE(cpu) \
+ p.pixelavg_pp[PARTITION_16x16] = x265_pixel_avg_16x16_##cpu; \
+ p.pixelavg_pp[PARTITION_16x8] = x265_pixel_avg_16x8_##cpu; \
+ p.pixelavg_pp[PARTITION_8x16] = x265_pixel_avg_8x16_##cpu; \
+ p.pixelavg_pp[PARTITION_8x8] = x265_pixel_avg_8x8_##cpu; \
+ p.pixelavg_pp[PARTITION_8x4] = x265_pixel_avg_8x4_##cpu;
+
void Setup_Assembly_Primitives(EncoderPrimitives &p, int cpuMask)
{
#if !HIGH_BIT_DEPTH
@@ -113,6 +133,9 @@
INIT8( satd, _mmx2 );
p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2;
+ p.pixelavg_pp[PARTITION_4x16] = x265_pixel_avg_4x16_mmx2;
+ p.pixelavg_pp[PARTITION_4x8] = x265_pixel_avg_4x8_mmx2;
+ p.pixelavg_pp[PARTITION_4x4] = x265_pixel_avg_4x4_mmx2;
p.sa8d[BLOCK_4x4] = x265_pixel_satd_4x4_mmx2;
@@ -184,6 +207,7 @@
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
INIT6( satd, _sse2 );
+ PIXEL_AVE(sse2);
p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
@@ -329,6 +353,7 @@
ASSGN_SSE(pp, 8, ssse3)
ASSGN_SSE(pp, 16, ssse3)
ASSGN_SSE(pp, 32, ssse3)
+ PIXEL_AVE(ssse3);
p.sse_pp[PARTITION_24x4] = cmp<24, 4, 8, 4, x265_pixel_ssd_8x4_ssse3>;
p.sse_pp[PARTITION_24x8] = cmp<24, 8, 8, 8, x265_pixel_ssd_8x8_ssse3>;
diff -r 7976c35f5b76 -r 84d0f4f907f7 source/common/x86/mc-a.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/mc-a.asm Fri Oct 04 17:01:07 2013 -0500
@@ -0,0 +1,2132 @@
+;*****************************************************************************
+;* mc-a.asm: x86 motion compensation
+;*****************************************************************************
+;* Copyright (C) 2003-2013 x264 project
+;*
+;* Authors: Loren Merritt <lorenm at u.washington.edu>
+;* Jason Garrett-Glaser <darkshikari at gmail.com>
+;* Laurent Aimar <fenrir at via.ecp.fr>
+;* Dylan Yudaken <dyudaken at gmail.com>
+;* Holger Lubitz <holger at lubitz.org>
+;* Min Chen <chenm001.163.com>
+;* Oskar Arvidsson <oskar at irock.se>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing at x264.com.
+;*****************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+
+ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
+ch_shuf_adj: times 8 db 0
+ times 8 db 2
+ times 8 db 4
+ times 8 db 6
+sq_1: times 1 dq 1
+
+SECTION .text
+
+cextern pb_0
+cextern pw_1
+cextern pw_4
+cextern pw_8
+cextern pw_32
+cextern pw_64
+cextern pw_512
+cextern pw_00ff
+cextern pw_pixel_max
+cextern sw_64
+cextern pd_32
+cextern deinterleave_shufd
+
+;=============================================================================
+; implicit weighted biprediction
+;=============================================================================
+; assumes log2_denom = 5, offset = 0, weight1 + weight2 = 64
+%if WIN64
+ DECLARE_REG_TMP 0,1,2,3,4,5,4,5
+ %macro AVG_START 0-1 0
+ PROLOGUE 6,7,%1
+ %endmacro
+%elif UNIX64
+ DECLARE_REG_TMP 0,1,2,3,4,5,7,8
+ %macro AVG_START 0-1 0
+ PROLOGUE 6,9,%1
+ %endmacro
+%else
+ DECLARE_REG_TMP 1,2,3,4,5,6,1,2
+ %macro AVG_START 0-1 0
+ PROLOGUE 0,7,%1
+ mov t0, r0m
+ mov t1, r1m
+ mov t2, r2m
+ mov t3, r3m
+ mov t4, r4m
+ mov t5, r5m
+ %endmacro
+%endif
+
+%macro AVG_END 0
+ lea t4, [t4+t5*2*SIZEOF_PIXEL]
+ lea t2, [t2+t3*2*SIZEOF_PIXEL]
+ lea t0, [t0+t1*2*SIZEOF_PIXEL]
+ sub eax, 2
+ jg .height_loop
+ RET
+%endmacro
+
+%if HIGH_BIT_DEPTH
+
+%macro BIWEIGHT_MMX 2
+ movh m0, %1
+ movh m1, %2
+ punpcklwd m0, m1
+ pmaddwd m0, m3
+ paddd m0, m4
+ psrad m0, 6
+%endmacro
+
+%macro BIWEIGHT_START_MMX 0
+ movzx t6d, word r6m
+ mov t7d, 64
+ sub t7d, t6d
+ shl t7d, 16
+ add t6d, t7d
+ movd m3, t6d
+ SPLATD m3, m3
+ mova m4, [pd_32]
+ pxor m5, m5
+%endmacro
+
+%else ;!HIGH_BIT_DEPTH
+%macro BIWEIGHT_MMX 2
+ movh m0, %1
+ movh m1, %2
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, m2
+ pmullw m1, m3
+ paddw m0, m1
+ paddw m0, m4
+ psraw m0, 6
+%endmacro
+
+%macro BIWEIGHT_START_MMX 0
+ movd m2, r6m
+ SPLATW m2, m2 ; weight_dst
+ mova m3, [pw_64]
+ psubw m3, m2 ; weight_src
+ mova m4, [pw_32] ; rounding
+ pxor m5, m5
+%endmacro
+%endif ;HIGH_BIT_DEPTH
+
+%macro BIWEIGHT_SSSE3 2
+ movh m0, %1
+ movh m1, %2
+ punpcklbw m0, m1
+ pmaddubsw m0, m3
+ pmulhrsw m0, m4
+%endmacro
+
+%macro BIWEIGHT_START_SSSE3 0
+ movzx t6d, byte r6m ; FIXME x86_64
+ mov t7d, 64
+ sub t7d, t6d
+ shl t7d, 8
+ add t6d, t7d
+ mova m4, [pw_512]
+ movd xm3, t6d
+%if cpuflag(avx2)
+ vpbroadcastw m3, xm3
+%else
+ SPLATW m3, m3 ; weight_dst,src
+%endif
+%endmacro
+
+%if HIGH_BIT_DEPTH
+%macro BIWEIGHT_ROW 4
+ BIWEIGHT [%2], [%3]
+%if %4==mmsize/4
+ packssdw m0, m0
+ CLIPW m0, m5, m7
+ movh [%1], m0
+%else
+ SWAP 0, 6
+ BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
+ packssdw m6, m0
+ CLIPW m6, m5, m7
+ mova [%1], m6
+%endif
+%endmacro
+
+%else ;!HIGH_BIT_DEPTH
+%macro BIWEIGHT_ROW 4
+ BIWEIGHT [%2], [%3]
+%if %4==mmsize/2
+ packuswb m0, m0
+ movh [%1], m0
+%else
+ SWAP 0, 6
+ BIWEIGHT [%2+mmsize/2], [%3+mmsize/2]
+ packuswb m6, m0
+ mova [%1], m6
+%endif
+%endmacro
+
+%endif ;HIGH_BIT_DEPTH
+
+;-----------------------------------------------------------------------------
+; int pixel_avg_weight_w16( pixel *dst, intptr_t, pixel *src1, intptr_t, pixel *src2, intptr_t, int i_weight )
+;-----------------------------------------------------------------------------
+%macro AVG_WEIGHT 1-2 0
+cglobal pixel_avg_weight_w%1
+ BIWEIGHT_START
+ AVG_START %2
+%if HIGH_BIT_DEPTH
+ mova m7, [pw_pixel_max]
+%endif
+.height_loop:
+%if mmsize==16 && %1==mmsize/(2*SIZEOF_PIXEL)
+ BIWEIGHT [t2], [t4]
+ SWAP 0, 6
+ BIWEIGHT [t2+SIZEOF_PIXEL*t3], [t4+SIZEOF_PIXEL*t5]
+%if HIGH_BIT_DEPTH
+ packssdw m6, m0
+ CLIPW m6, m5, m7
+%else ;!HIGH_BIT_DEPTH
+ packuswb m6, m0
+%endif ;HIGH_BIT_DEPTH
+ movlps [t0], m6
+ movhps [t0+SIZEOF_PIXEL*t1], m6
+%else
+%assign x 0
+%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
+ BIWEIGHT_ROW t0+x, t2+x, t4+x, %1
+ BIWEIGHT_ROW t0+x+SIZEOF_PIXEL*t1, t2+x+SIZEOF_PIXEL*t3, t4+x+SIZEOF_PIXEL*t5, %1
+%assign x x+mmsize
+%endrep
+%endif
+ AVG_END
+%endmacro
+
+%define BIWEIGHT BIWEIGHT_MMX
+%define BIWEIGHT_START BIWEIGHT_START_MMX
+INIT_MMX mmx2
+AVG_WEIGHT 4
+AVG_WEIGHT 8
+AVG_WEIGHT 16
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+AVG_WEIGHT 4, 8
+AVG_WEIGHT 8, 8
+AVG_WEIGHT 16, 8
+%else ;!HIGH_BIT_DEPTH
+INIT_XMM sse2
+AVG_WEIGHT 8, 7
+AVG_WEIGHT 16, 7
+%define BIWEIGHT BIWEIGHT_SSSE3
+%define BIWEIGHT_START BIWEIGHT_START_SSSE3
+INIT_MMX ssse3
+AVG_WEIGHT 4
+INIT_XMM ssse3
+AVG_WEIGHT 8, 7
+AVG_WEIGHT 16, 7
+
+INIT_YMM avx2
+cglobal pixel_avg_weight_w16
+ BIWEIGHT_START
+ AVG_START 5
+.height_loop:
+ movu xm0, [t2]
+ movu xm1, [t4]
+ vinserti128 m0, m0, [t2+t3], 1
+ vinserti128 m1, m1, [t4+t5], 1
+ SBUTTERFLY bw, 0, 1, 2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ packuswb m0, m1
+ mova [t0], xm0
+ vextracti128 [t0+t1], m0, 1
+ AVG_END
+%endif ;HIGH_BIT_DEPTH
+
+;=============================================================================
+; P frame explicit weighted prediction
+;=============================================================================
+
+%if HIGH_BIT_DEPTH
+; width
+%macro WEIGHT_START 1
+ mova m0, [r4+ 0] ; 1<<denom
+ mova m3, [r4+16]
+ movd m2, [r4+32] ; denom
+ mova m4, [pw_pixel_max]
+ paddw m2, [sq_1] ; denom+1
+%endmacro
+
+; src1, src2
+%macro WEIGHT 2
+ movh m5, [%1]
+ movh m6, [%2]
+ punpcklwd m5, m0
+ punpcklwd m6, m0
+ pmaddwd m5, m3
+ pmaddwd m6, m3
+ psrad m5, m2
+ psrad m6, m2
+ packssdw m5, m6
+%endmacro
+
+; src, dst, width
+%macro WEIGHT_TWO_ROW 4
+ %assign x 0
+%rep (%3+mmsize/2-1)/(mmsize/2)
+%if %3-x/2 <= 4 && mmsize == 16
+ WEIGHT %1+x, %1+r3+x
+ CLIPW m5, [pb_0], m4
+ movh [%2+x], m5
+ movhps [%2+r1+x], m5
+%else
+ WEIGHT %1+x, %1+x+mmsize/2
+ SWAP 5, 7
+ WEIGHT %1+r3+x, %1+r3+x+mmsize/2
+ CLIPW m5, [pb_0], m4
+ CLIPW m7, [pb_0], m4
+ mova [%2+x], m7
+ mova [%2+r1+x], m5
+%endif
+ %assign x x+mmsize
+%endrep
+%endmacro
+
+%else ; !HIGH_BIT_DEPTH
+
+%macro WEIGHT_START 1
+%if cpuflag(avx2)
+ vbroadcasti128 m3, [r4]
+ vbroadcasti128 m4, [r4+16]
+%else
+ mova m3, [r4]
+ mova m4, [r4+16]
+%if notcpuflag(ssse3)
+ movd m5, [r4+32]
+%endif
+%endif
+ pxor m2, m2
+%endmacro
+
+; src1, src2, dst1, dst2, fast
+%macro WEIGHT_ROWx2 5
+ movh m0, [%1 ]
+ movh m1, [%1+mmsize/2]
+ movh m6, [%2 ]
+ movh m7, [%2+mmsize/2]
+ punpcklbw m0, m2
+ punpcklbw m1, m2
+ punpcklbw m6, m2
+ punpcklbw m7, m2
+%if cpuflag(ssse3)
+%if %5==0
+ psllw m0, 7
+ psllw m1, 7
+ psllw m6, 7
+ psllw m7, 7
+%endif
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ pmulhrsw m6, m3
+ pmulhrsw m7, m3
+ paddw m0, m4
+ paddw m1, m4
+ paddw m6, m4
+ paddw m7, m4
+%else
+ pmullw m0, m3
+ pmullw m1, m3
+ pmullw m6, m3
+ pmullw m7, m3
+ paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
+ paddsw m1, m4
+ paddsw m6, m4
+ paddsw m7, m4
+ psraw m0, m5
+ psraw m1, m5
+ psraw m6, m5
+ psraw m7, m5
+%endif
+ packuswb m0, m1
+ packuswb m6, m7
+ mova [%3], m0
+ mova [%4], m6
+%endmacro
+
+; src1, src2, dst1, dst2, width, fast
+%macro WEIGHT_COL 6
+%if cpuflag(avx2)
+%if %5==16
+ movu xm0, [%1]
+ vinserti128 m0, m0, [%2], 1
+ punpckhbw m1, m0, m2
+ punpcklbw m0, m0, m2
+%if %6==0
+ psllw m0, 7
+ psllw m1, 7
+%endif
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m0, m4
+ paddw m1, m4
+ packuswb m0, m1
+ mova [%3], xm0
+ vextracti128 [%4], m0, 1
+%else
+ movq xm0, [%1]
+ vinserti128 m0, m0, [%2], 1
+ punpcklbw m0, m2
+%if %6==0
+ psllw m0, 7
+%endif
+ pmulhrsw m0, m3
+ paddw m0, m4
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+%if %5 == 8
+ movq [%3], xm0
+ movq [%4], xm1
+%else
+ movd [%3], xm0
+ movd [%4], xm1
+%endif
+%endif
+%else
+ movh m0, [%1]
+ movh m1, [%2]
+ punpcklbw m0, m2
+ punpcklbw m1, m2
+%if cpuflag(ssse3)
+%if %6==0
+ psllw m0, 7
+ psllw m1, 7
+%endif
+ pmulhrsw m0, m3
+ pmulhrsw m1, m3
+ paddw m0, m4
+ paddw m1, m4
+%else
+ pmullw m0, m3
+ pmullw m1, m3
+ paddsw m0, m4 ;1<<(denom-1)+(offset<<denom)
+ paddsw m1, m4
+ psraw m0, m5
+ psraw m1, m5
+%endif
+%if %5 == 8
+ packuswb m0, m1
+ movh [%3], m0
+ movhps [%4], m0
+%else
+ packuswb m0, m0
+ packuswb m1, m1
+ movd [%3], m0 ; width 2 can write garbage for the last 2 bytes
+ movd [%4], m1
+%endif
+%endif
+%endmacro
+; src, dst, width
+%macro WEIGHT_TWO_ROW 4
+%assign x 0
+%rep %3
+%if (%3-x) >= mmsize
+ WEIGHT_ROWx2 %1+x, %1+r3+x, %2+x, %2+r1+x, %4
+ %assign x (x+mmsize)
+%else
+ %assign w %3-x
+%if w == 20
+ %assign w 16
+%endif
+ WEIGHT_COL %1+x, %1+r3+x, %2+x, %2+r1+x, w, %4
+ %assign x (x+w)
+%endif
+%if x >= %3
+ %exitrep
+%endif
+%endrep
+%endmacro
+
+%endif ; HIGH_BIT_DEPTH
+
+;-----------------------------------------------------------------------------
+;void mc_weight_wX( pixel *dst, intptr_t i_dst_stride, pixel *src, intptr_t i_src_stride, weight_t *weight, int h )
+;-----------------------------------------------------------------------------
+
+%macro WEIGHTER 1
+cglobal mc_weight_w%1, 6,6,8
+ FIX_STRIDES r1, r3
+ WEIGHT_START %1
+%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
+ ; we can merge the shift step into the scale factor
+ ; if (m3<<7) doesn't overflow an int16_t
+ cmp byte [r4+1], 0
+ jz .fast
+%endif
+.loop:
+ WEIGHT_TWO_ROW r2, r0, %1, 0
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ sub r5d, 2
+ jg .loop
+ RET
+%if cpuflag(ssse3) && HIGH_BIT_DEPTH == 0
+.fast:
+ psllw m3, 7
+.fastloop:
+ WEIGHT_TWO_ROW r2, r0, %1, 1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ sub r5d, 2
+ jg .fastloop
+ RET
+%endif
+%endmacro
+
+INIT_MMX mmx2
+WEIGHTER 4
+WEIGHTER 8
+WEIGHTER 12
+WEIGHTER 16
+WEIGHTER 20
+INIT_XMM sse2
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
+%if HIGH_BIT_DEPTH
+WEIGHTER 12
+%else
+INIT_MMX ssse3
+WEIGHTER 4
+INIT_XMM ssse3
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
+INIT_YMM avx2
+WEIGHTER 8
+WEIGHTER 16
+WEIGHTER 20
+%endif
+
+%macro OFFSET_OP 7
+ mov%6 m0, [%1]
+ mov%6 m1, [%2]
+%if HIGH_BIT_DEPTH
+ p%5usw m0, m2
+ p%5usw m1, m2
+%ifidn %5,add
+ pminsw m0, m3
+ pminsw m1, m3
+%endif
+%else
+ p%5usb m0, m2
+ p%5usb m1, m2
+%endif
+ mov%7 [%3], m0
+ mov%7 [%4], m1
+%endmacro
+
+%macro OFFSET_TWO_ROW 4
+%assign x 0
+%rep %3
+%if (%3*SIZEOF_PIXEL-x) >= mmsize
+ OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, u, a
+ %assign x (x+mmsize)
+%else
+%if HIGH_BIT_DEPTH
+ OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, h, h
+%else
+ OFFSET_OP (%1+x), (%1+x+r3), (%2+x), (%2+x+r1), %4, d, d
+%endif
+ %exitrep
+%endif
+%if x >= %3*SIZEOF_PIXEL
+ %exitrep
+%endif
+%endrep
+%endmacro
+
+;-----------------------------------------------------------------------------
+;void mc_offset_wX( pixel *src, intptr_t i_src_stride, pixel *dst, intptr_t i_dst_stride, weight_t *w, int h )
+;-----------------------------------------------------------------------------
+%macro OFFSET 2
+cglobal mc_offset%2_w%1, 6,6
+ FIX_STRIDES r1, r3
+ mova m2, [r4]
+%if HIGH_BIT_DEPTH
+%ifidn %2,add
+ mova m3, [pw_pixel_max]
+%endif
+%endif
+.loop:
+ OFFSET_TWO_ROW r2, r0, %1, %2
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ sub r5d, 2
+ jg .loop
+ RET
+%endmacro
+
+%macro OFFSETPN 1
+ OFFSET %1, add
+ OFFSET %1, sub
+%endmacro
+INIT_MMX mmx2
+OFFSETPN 4
+OFFSETPN 8
+OFFSETPN 12
+OFFSETPN 16
+OFFSETPN 20
+INIT_XMM sse2
+OFFSETPN 12
+OFFSETPN 16
+OFFSETPN 20
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+OFFSETPN 8
+%endif
+
+
+;=============================================================================
+; pixel avg
+;=============================================================================
+
+;-----------------------------------------------------------------------------
+; void pixel_avg_4x4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
+; pixel *src2, intptr_t src2_stride, int weight );
+;-----------------------------------------------------------------------------
+%macro AVGH 2
+cglobal pixel_avg_%1x%2
+ mov eax, %2
+ cmp dword r6m, 32
+ jne pixel_avg_weight_w%1 %+ SUFFIX
+%if cpuflag(avx2) && %1 == 16 ; all AVX2 machines can do fast 16-byte unaligned loads
+ jmp pixel_avg_w%1_avx2
+%else
+%if mmsize == 16 && %1 == 16
+ test dword r4m, 15
+ jz pixel_avg_w%1_sse2
+%endif
+ jmp pixel_avg_w%1_mmx2
+%endif
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void pixel_avg_w4( pixel *dst, intptr_t dst_stride, pixel *src1, intptr_t src1_stride,
+; pixel *src2, intptr_t src2_stride, int height, int weight );
+;-----------------------------------------------------------------------------
+
+%macro AVG_FUNC 3
+cglobal pixel_avg_w%1
+ AVG_START
+.height_loop:
+%assign x 0
+%rep (%1*SIZEOF_PIXEL+mmsize-1)/mmsize
+ %2 m0, [t2+x]
+ %2 m1, [t2+x+SIZEOF_PIXEL*t3]
+%if HIGH_BIT_DEPTH
+ pavgw m0, [t4+x]
+ pavgw m1, [t4+x+SIZEOF_PIXEL*t5]
+%else ;!HIGH_BIT_DEPTH
+ pavgb m0, [t4+x]
+ pavgb m1, [t4+x+SIZEOF_PIXEL*t5]
+%endif
+ %3 [t0+x], m0
+ %3 [t0+x+SIZEOF_PIXEL*t1], m1
+%assign x x+mmsize
+%endrep
+ AVG_END
+%endmacro
+
+%if HIGH_BIT_DEPTH
+
+INIT_MMX mmx2
+AVG_FUNC 4, movq, movq
+AVGH 4, 16
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
+
+AVG_FUNC 8, movq, movq
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+
+AVG_FUNC 16, movq, movq
+AVGH 16, 16
+AVGH 16, 8
+
+INIT_XMM sse2
+AVG_FUNC 4, movq, movq
+AVGH 4, 16
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
+
+AVG_FUNC 8, movdqu, movdqa
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+
+AVG_FUNC 16, movdqu, movdqa
+AVGH 16, 16
+AVGH 16, 8
+
+%else ;!HIGH_BIT_DEPTH
+
+INIT_MMX mmx2
+AVG_FUNC 4, movd, movd
+AVGH 4, 16
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
+
+AVG_FUNC 8, movq, movq
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+
+AVG_FUNC 16, movq, movq
+AVGH 16, 16
+AVGH 16, 8
+
+INIT_XMM sse2
+AVG_FUNC 16, movdqu, movdqa
+AVGH 16, 16
+AVGH 16, 8
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+INIT_XMM ssse3
+AVGH 16, 16
+AVGH 16, 8
+AVGH 8, 16
+AVGH 8, 8
+AVGH 8, 4
+INIT_MMX ssse3
+AVGH 4, 16
+AVGH 4, 8
+AVGH 4, 4
+AVGH 4, 2
+INIT_XMM avx2
+AVG_FUNC 16, movdqu, movdqa
+AVGH 16, 16
+AVGH 16, 8
+
+%endif ;HIGH_BIT_DEPTH
+
+
+
+;=============================================================================
+; pixel avg2
+;=============================================================================
+
+%if HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void pixel_avg2_wN( uint16_t *dst, intptr_t dst_stride,
+; uint16_t *src1, intptr_t src_stride,
+; uint16_t *src2, int height );
+;-----------------------------------------------------------------------------
+%macro AVG2_W_ONE 1
+cglobal pixel_avg2_w%1, 6,7,4
+ sub r4, r2
+ lea r6, [r4+r3*2]
+.height_loop:
+ movu m0, [r2]
+ movu m1, [r2+r3*2]
+%if cpuflag(avx) || mmsize == 8
+ pavgw m0, [r2+r4]
+ pavgw m1, [r2+r6]
+%else
+ movu m2, [r2+r4]
+ movu m3, [r2+r6]
+ pavgw m0, m2
+ pavgw m1, m3
+%endif
+ mova [r0], m0
+ mova [r0+r1*2], m1
+ lea r2, [r2+r3*4]
+ lea r0, [r0+r1*4]
+ sub r5d, 2
+ jg .height_loop
+ RET
+%endmacro
+
+%macro AVG2_W_TWO 3
+cglobal pixel_avg2_w%1, 6,7,8
+ sub r4, r2
+ lea r6, [r4+r3*2]
+.height_loop:
+ movu m0, [r2]
+ %2 m1, [r2+mmsize]
+ movu m2, [r2+r3*2]
+ %2 m3, [r2+r3*2+mmsize]
+%if mmsize == 8
+ pavgw m0, [r2+r4]
+ pavgw m1, [r2+r4+mmsize]
+ pavgw m2, [r2+r6]
+ pavgw m3, [r2+r6+mmsize]
+%else
+ movu m4, [r2+r4]
+ %2 m5, [r2+r4+mmsize]
+ movu m6, [r2+r6]
+ %2 m7, [r2+r6+mmsize]
+ pavgw m0, m4
+ pavgw m1, m5
+ pavgw m2, m6
+ pavgw m3, m7
+%endif
+ mova [r0], m0
+ %3 [r0+mmsize], m1
+ mova [r0+r1*2], m2
+ %3 [r0+r1*2+mmsize], m3
+ lea r2, [r2+r3*4]
+ lea r0, [r0+r1*4]
+ sub r5d, 2
+ jg .height_loop
+ RET
+%endmacro
+
+INIT_MMX mmx2
+AVG2_W_ONE 4
+AVG2_W_TWO 8, movu, mova
+INIT_XMM sse2
+AVG2_W_ONE 8
+AVG2_W_TWO 10, movd, movd
+AVG2_W_TWO 16, movu, mova
+INIT_YMM avx2
+AVG2_W_ONE 16
+
+INIT_MMX
+cglobal pixel_avg2_w10_mmx2, 6,7
+ sub r4, r2
+ lea r6, [r4+r3*2]
+.height_loop:
+ movu m0, [r2+ 0]
+ movu m1, [r2+ 8]
+ movh m2, [r2+16]
+ movu m3, [r2+r3*2+ 0]
+ movu m4, [r2+r3*2+ 8]
+ movh m5, [r2+r3*2+16]
+ pavgw m0, [r2+r4+ 0]
+ pavgw m1, [r2+r4+ 8]
+ pavgw m2, [r2+r4+16]
+ pavgw m3, [r2+r6+ 0]
+ pavgw m4, [r2+r6+ 8]
+ pavgw m5, [r2+r6+16]
+ mova [r0+ 0], m0
+ mova [r0+ 8], m1
+ movh [r0+16], m2
+ mova [r0+r1*2+ 0], m3
+ mova [r0+r1*2+ 8], m4
+ movh [r0+r1*2+16], m5
+ lea r2, [r2+r3*2*2]
+ lea r0, [r0+r1*2*2]
+ sub r5d, 2
+ jg .height_loop
+ RET
+
+cglobal pixel_avg2_w16_mmx2, 6,7
+ sub r4, r2
+ lea r6, [r4+r3*2]
+.height_loop:
+ movu m0, [r2+ 0]
+ movu m1, [r2+ 8]
+ movu m2, [r2+16]
+ movu m3, [r2+24]
+ movu m4, [r2+r3*2+ 0]
+ movu m5, [r2+r3*2+ 8]
+ movu m6, [r2+r3*2+16]
+ movu m7, [r2+r3*2+24]
+ pavgw m0, [r2+r4+ 0]
+ pavgw m1, [r2+r4+ 8]
+ pavgw m2, [r2+r4+16]
+ pavgw m3, [r2+r4+24]
+ pavgw m4, [r2+r6+ 0]
+ pavgw m5, [r2+r6+ 8]
+ pavgw m6, [r2+r6+16]
+ pavgw m7, [r2+r6+24]
+ mova [r0+ 0], m0
+ mova [r0+ 8], m1
+ mova [r0+16], m2
+ mova [r0+24], m3
+ mova [r0+r1*2+ 0], m4
+ mova [r0+r1*2+ 8], m5
+ mova [r0+r1*2+16], m6
+ mova [r0+r1*2+24], m7
+ lea r2, [r2+r3*2*2]
+ lea r0, [r0+r1*2*2]
+ sub r5d, 2
+ jg .height_loop
+ RET
+
+cglobal pixel_avg2_w18_mmx2, 6,7
+ sub r4, r2
+.height_loop:
+ movu m0, [r2+ 0]
+ movu m1, [r2+ 8]
+ movu m2, [r2+16]
+ movu m3, [r2+24]
+ movh m4, [r2+32]
+ pavgw m0, [r2+r4+ 0]
+ pavgw m1, [r2+r4+ 8]
+ pavgw m2, [r2+r4+16]
+ pavgw m3, [r2+r4+24]
+ pavgw m4, [r2+r4+32]
+ mova [r0+ 0], m0
+ mova [r0+ 8], m1
+ mova [r0+16], m2
+ mova [r0+24], m3
+ movh [r0+32], m4
+ lea r2, [r2+r3*2]
+ lea r0, [r0+r1*2]
+ dec r5d
+ jg .height_loop
+ RET
+
+%macro PIXEL_AVG_W18 0
+cglobal pixel_avg2_w18, 6,7
+ sub r4, r2
+.height_loop:
+ movu m0, [r2+ 0]
+ movd xm2, [r2+32]
+%if mmsize == 32
+ pavgw m0, [r2+r4+ 0]
+ movd xm1, [r2+r4+32]
+ pavgw xm2, xm1
+%else
+ movu m1, [r2+16]
+ movu m3, [r2+r4+ 0]
+ movu m4, [r2+r4+16]
+ movd m5, [r2+r4+32]
+ pavgw m0, m3
+ pavgw m1, m4
+ pavgw m2, m5
+ mova [r0+16], m1
+%endif
+ mova [r0+ 0], m0
+ movd [r0+32], xm2
+ lea r2, [r2+r3*2]
+ lea r0, [r0+r1*2]
+ dec r5d
+ jg .height_loop
+ RET
+%endmacro
+
+INIT_XMM sse2
+PIXEL_AVG_W18
+INIT_YMM avx2
+PIXEL_AVG_W18
+
+%endif ; HIGH_BIT_DEPTH
+
+%if HIGH_BIT_DEPTH == 0
+;-----------------------------------------------------------------------------
+; void pixel_avg2_w4( uint8_t *dst, intptr_t dst_stride,
+; uint8_t *src1, intptr_t src_stride,
+; uint8_t *src2, int height );
+;-----------------------------------------------------------------------------
+%macro AVG2_W8 2
+cglobal pixel_avg2_w%1_mmx2, 6,7
+ sub r4, r2
+ lea r6, [r4+r3]
+.height_loop:
+ %2 mm0, [r2]
+ %2 mm1, [r2+r3]
+ pavgb mm0, [r2+r4]
+ pavgb mm1, [r2+r6]
+ lea r2, [r2+r3*2]
+ %2 [r0], mm0
+ %2 [r0+r1], mm1
+ lea r0, [r0+r1*2]
+ sub r5d, 2
+ jg .height_loop
+ RET
+%endmacro
+
+INIT_MMX
+AVG2_W8 4, movd
+AVG2_W8 8, movq
+
+%macro AVG2_W16 2
+cglobal pixel_avg2_w%1_mmx2, 6,7
+ sub r2, r4
+ lea r6, [r2+r3]
+.height_loop:
+ movq mm0, [r4]
+ %2 mm1, [r4+8]
+ movq mm2, [r4+r3]
+ %2 mm3, [r4+r3+8]
+ pavgb mm0, [r4+r2]
+ pavgb mm1, [r4+r2+8]
+ pavgb mm2, [r4+r6]
+ pavgb mm3, [r4+r6+8]
+ lea r4, [r4+r3*2]
+ movq [r0], mm0
+ %2 [r0+8], mm1
+ movq [r0+r1], mm2
+ %2 [r0+r1+8], mm3
+ lea r0, [r0+r1*2]
+ sub r5d, 2
+ jg .height_loop
+ RET
+%endmacro
+
+AVG2_W16 12, movd
+AVG2_W16 16, movq
+
+cglobal pixel_avg2_w20_mmx2, 6,7
+ sub r2, r4
+ lea r6, [r2+r3]
+.height_loop:
+ movq mm0, [r4]
+ movq mm1, [r4+8]
+ movd mm2, [r4+16]
+ movq mm3, [r4+r3]
+ movq mm4, [r4+r3+8]
+ movd mm5, [r4+r3+16]
+ pavgb mm0, [r4+r2]
+ pavgb mm1, [r4+r2+8]
+ pavgb mm2, [r4+r2+16]
+ pavgb mm3, [r4+r6]
+ pavgb mm4, [r4+r6+8]
+ pavgb mm5, [r4+r6+16]
+ lea r4, [r4+r3*2]
+ movq [r0], mm0
+ movq [r0+8], mm1
+ movd [r0+16], mm2
+ movq [r0+r1], mm3
+ movq [r0+r1+8], mm4
+ movd [r0+r1+16], mm5
+ lea r0, [r0+r1*2]
+ sub r5d, 2
+ jg .height_loop
+ RET
+
+INIT_XMM
+cglobal pixel_avg2_w16_sse2, 6,7
+ sub r4, r2
+ lea r6, [r4+r3]
+.height_loop:
+ movu m0, [r2]
+ movu m2, [r2+r3]
+ movu m1, [r2+r4]
+ movu m3, [r2+r6]
+ lea r2, [r2+r3*2]
+ pavgb m0, m1
+ pavgb m2, m3
+ mova [r0], m0
+ mova [r0+r1], m2
+ lea r0, [r0+r1*2]
+ sub r5d, 2
+ jg .height_loop
+ RET
+
+cglobal pixel_avg2_w20_sse2, 6,7
+ sub r2, r4
+ lea r6, [r2+r3]
+.height_loop:
+ movu m0, [r4]
+ movu m2, [r4+r3]
+ movu m1, [r4+r2]
+ movu m3, [r4+r6]
+ movd mm4, [r4+16]
+ movd mm5, [r4+r3+16]
+ pavgb m0, m1
+ pavgb m2, m3
+ pavgb mm4, [r4+r2+16]
+ pavgb mm5, [r4+r6+16]
+ lea r4, [r4+r3*2]
+ mova [r0], m0
+ mova [r0+r1], m2
+ movd [r0+16], mm4
+ movd [r0+r1+16], mm5
+ lea r0, [r0+r1*2]
+ sub r5d, 2
+ jg .height_loop
+ RET
+
+INIT_YMM avx2
+cglobal pixel_avg2_w20, 6,7
+ sub r2, r4
+ lea r6, [r2+r3]
+.height_loop:
+ movu m0, [r4]
+ movu m1, [r4+r3]
+ pavgb m0, [r4+r2]
+ pavgb m1, [r4+r6]
+ lea r4, [r4+r3*2]
+ mova [r0], m0
+ mova [r0+r1], m1
+ lea r0, [r0+r1*2]
+ sub r5d, 2
+ jg .height_loop
+ RET
+
+; Cacheline split code for processors with high latencies for loads
+; split over cache lines. See sad-a.asm for a more detailed explanation.
+; This particular instance is complicated by the fact that src1 and src2
+; can have different alignments. For simplicity and code size, only the
+; MMX cacheline workaround is used. As a result, in the case of SSE2
+; pixel_avg, the cacheline check function calls the SSE2 version if there
+; is no cacheline split, and the MMX workaround if there is.
+
+%macro INIT_SHIFT 2
+ and eax, 7
+ shl eax, 3
+ movd %1, [sw_64]
+ movd %2, eax
+ psubw %1, %2
+%endmacro
+
+%macro AVG_CACHELINE_START 0
+ %assign stack_offset 0
+ INIT_SHIFT mm6, mm7
+ mov eax, r4m
+ INIT_SHIFT mm4, mm5
+ PROLOGUE 6,6
+ and r2, ~7
+ and r4, ~7
+ sub r4, r2
+.height_loop:
+%endmacro
+
+%macro AVG_CACHELINE_LOOP 2
+ movq mm1, [r2+%1]
+ movq mm0, [r2+8+%1]
+ movq mm3, [r2+r4+%1]
+ movq mm2, [r2+r4+8+%1]
+ psrlq mm1, mm7
+ psllq mm0, mm6
+ psrlq mm3, mm5
+ psllq mm2, mm4
+ por mm0, mm1
+ por mm2, mm3
+ pavgb mm2, mm0
+ %2 [r0+%1], mm2
+%endmacro
+
+%macro AVG_CACHELINE_FUNC 2
+pixel_avg2_w%1_cache_mmx2:
+ AVG_CACHELINE_START
+ AVG_CACHELINE_LOOP 0, movq
+%if %1>8
+ AVG_CACHELINE_LOOP 8, movq
+%if %1>16
+ AVG_CACHELINE_LOOP 16, movd
+%endif
+%endif
+ add r2, r3
+ add r0, r1
+ dec r5d
+ jg .height_loop
+ RET
+%endmacro
+
+%macro AVG_CACHELINE_CHECK 3 ; width, cacheline, instruction set
+%if %1 == 12
+;w12 isn't needed because w16 is just as fast if there's no cacheline split
+%define cachesplit pixel_avg2_w16_cache_mmx2
+%else
+%define cachesplit pixel_avg2_w%1_cache_mmx2
+%endif
+cglobal pixel_avg2_w%1_cache%2_%3
+ mov eax, r2m
+ and eax, %2-1
+ cmp eax, (%2-%1-(%1 % 8))
+%if %1==12||%1==20
+ jbe pixel_avg2_w%1_%3
+%else
+ jb pixel_avg2_w%1_%3
+%endif
+%if 0 ; or %1==8 - but the extra branch seems too expensive
+ ja cachesplit
+%if ARCH_X86_64
+ test r4b, 1
+%else
+ test byte r4m, 1
+%endif
+ jz pixel_avg2_w%1_%3
+%else
+ or eax, r4m
+ and eax, 7
+ jz pixel_avg2_w%1_%3
+ mov eax, r2m
+%endif
+%if mmsize==16 || (%1==8 && %2==64)
+ AVG_CACHELINE_FUNC %1, %2
+%else
+ jmp cachesplit
+%endif
+%endmacro
+
+INIT_MMX
+AVG_CACHELINE_CHECK 8, 64, mmx2
+AVG_CACHELINE_CHECK 12, 64, mmx2
+%if ARCH_X86_64 == 0
+AVG_CACHELINE_CHECK 16, 64, mmx2
+AVG_CACHELINE_CHECK 20, 64, mmx2
+AVG_CACHELINE_CHECK 8, 32, mmx2
+AVG_CACHELINE_CHECK 12, 32, mmx2
+AVG_CACHELINE_CHECK 16, 32, mmx2
+AVG_CACHELINE_CHECK 20, 32, mmx2
+%endif
+INIT_XMM
+AVG_CACHELINE_CHECK 16, 64, sse2
+AVG_CACHELINE_CHECK 20, 64, sse2
+
+; computed jump assumes this loop is exactly 48 bytes
+%macro AVG16_CACHELINE_LOOP_SSSE3 2 ; alignment
+ALIGN 16
+avg_w16_align%1_%2_ssse3:
+%if %1==0 && %2==0
+ movdqa xmm1, [r2]
+ pavgb xmm1, [r2+r4]
+ add r2, r3
+%elif %1==0
+ movdqa xmm1, [r2+r4+16]
+ palignr xmm1, [r2+r4], %2
+ pavgb xmm1, [r2]
+ add r2, r3
+%elif %2&15==0
+ movdqa xmm1, [r2+16]
+ palignr xmm1, [r2], %1
+ pavgb xmm1, [r2+r4]
+ add r2, r3
+%else
+ movdqa xmm1, [r2+16]
+ movdqa xmm2, [r2+r4+16]
+ palignr xmm1, [r2], %1
+ palignr xmm2, [r2+r4], %2&15
+ add r2, r3
+ pavgb xmm1, xmm2
+%endif
+ movdqa [r0], xmm1
+ add r0, r1
+ dec r5d
+ jg avg_w16_align%1_%2_ssse3
+ ret
+%if %1==0
+ ; make sure the first ones don't end up short
+ ALIGN 16
+ times (48-($-avg_w16_align%1_%2_ssse3))>>4 nop
+%endif
+%endmacro
+
+cglobal pixel_avg2_w16_cache64_ssse3
+%if 0 ; seems both tests aren't worth it if src1%16==0 is optimized
+ mov eax, r2m
+ and eax, 0x3f
+ cmp eax, 0x30
+ jb x264_pixel_avg2_w16_sse2
+ or eax, r4m
+ and eax, 7
+ jz x264_pixel_avg2_w16_sse2
+%endif
+ PROLOGUE 6, 8
+ lea r6, [r4+r2]
+ and r4, ~0xf
+ and r6, 0x1f
+ and r2, ~0xf
+ lea r6, [r6*3] ;(offset + align*2)*3
+ sub r4, r2
+ shl r6, 4 ;jump = (offset + align*2)*48
+%define avg_w16_addr avg_w16_align1_1_ssse3-(avg_w16_align2_2_ssse3-avg_w16_align1_1_ssse3)
+%ifdef PIC
+ lea r7, [avg_w16_addr]
+ add r6, r7
+%else
+ lea r6, [avg_w16_addr + r6]
+%endif
+ TAIL_CALL r6, 1
+
+%assign j 0
+%assign k 1
+%rep 16
+AVG16_CACHELINE_LOOP_SSSE3 j, j
+AVG16_CACHELINE_LOOP_SSSE3 j, k
+%assign j j+1
+%assign k k+1
+%endrep
+%endif ; !HIGH_BIT_DEPTH
+
+;=============================================================================
+; pixel copy
+;=============================================================================
+
+%macro COPY1 2
+ movu m0, [r2]
+ movu m1, [r2+r3]
+ movu m2, [r2+r3*2]
+ movu m3, [r2+%2]
+ mova [r0], m0
+ mova [r0+r1], m1
+ mova [r0+r1*2], m2
+ mova [r0+%1], m3
+%endmacro
+
+%macro COPY2 2-4 0, 1
+ movu m0, [r2+%3*mmsize]
+ movu m1, [r2+%4*mmsize]
+ movu m2, [r2+r3+%3*mmsize]
+ movu m3, [r2+r3+%4*mmsize]
+ mova [r0+%3*mmsize], m0
+ mova [r0+%4*mmsize], m1
+ mova [r0+r1+%3*mmsize], m2
+ mova [r0+r1+%4*mmsize], m3
+ movu m0, [r2+r3*2+%3*mmsize]
+ movu m1, [r2+r3*2+%4*mmsize]
+ movu m2, [r2+%2+%3*mmsize]
+ movu m3, [r2+%2+%4*mmsize]
+ mova [r0+r1*2+%3*mmsize], m0
+ mova [r0+r1*2+%4*mmsize], m1
+ mova [r0+%1+%3*mmsize], m2
+ mova [r0+%1+%4*mmsize], m3
+%endmacro
+
+%macro COPY4 2
+ COPY2 %1, %2, 0, 1
+ COPY2 %1, %2, 2, 3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void mc_copy_w4( uint8_t *dst, intptr_t i_dst_stride,
+; uint8_t *src, intptr_t i_src_stride, int i_height )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal mc_copy_w4_mmx, 4,6
+ FIX_STRIDES r1, r3
+ cmp dword r4m, 4
+ lea r5, [r3*3]
+ lea r4, [r1*3]
+ je .end
+%if HIGH_BIT_DEPTH == 0
+ %define mova movd
+ %define movu movd
+%endif
+ COPY1 r4, r5
+ lea r2, [r2+r3*4]
+ lea r0, [r0+r1*4]
+.end:
+ COPY1 r4, r5
+ RET
+
+%macro MC_COPY 1
+%assign %%w %1*SIZEOF_PIXEL/mmsize
+%if %%w > 0
+cglobal mc_copy_w%1, 5,7
+ FIX_STRIDES r1, r3
+ lea r6, [r3*3]
+ lea r5, [r1*3]
+.height_loop:
+ COPY %+ %%w r5, r6
+ lea r2, [r2+r3*4]
+ lea r0, [r0+r1*4]
+ sub r4d, 4
+ jg .height_loop
+ RET
+%endif
+%endmacro
+
+INIT_MMX mmx
+MC_COPY 8
+MC_COPY 16
+INIT_XMM sse
+MC_COPY 8
+MC_COPY 16
+INIT_XMM aligned, sse
+MC_COPY 16
+%if HIGH_BIT_DEPTH
+INIT_YMM avx
+MC_COPY 16
+INIT_YMM aligned, avx
+MC_COPY 16
+%endif
+
+;=============================================================================
+; prefetch
+;=============================================================================
+; assumes 64 byte cachelines
+; FIXME doesn't cover all pixels in high depth and/or 4:4:4
+
+;-----------------------------------------------------------------------------
+; void prefetch_fenc( pixel *pix_y, intptr_t stride_y,
+; pixel *pix_uv, intptr_t stride_uv, int mb_x )
+;-----------------------------------------------------------------------------
+
+%macro PREFETCH_FENC 1
+%if ARCH_X86_64
+cglobal prefetch_fenc_%1, 5,5
+ FIX_STRIDES r1, r3
+ and r4d, 3
+ mov eax, r4d
+ imul r4d, r1d
+ lea r0, [r0+r4*4+64*SIZEOF_PIXEL]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+ lea r0, [r0+r1*2]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+
+ imul eax, r3d
+ lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
+ prefetcht0 [r2]
+ prefetcht0 [r2+r3]
+%ifidn %1, 422
+ lea r2, [r2+r3*2]
+ prefetcht0 [r2]
+ prefetcht0 [r2+r3]
+%endif
+ RET
+
+%else
+cglobal prefetch_fenc_%1, 0,3
+ mov r2, r4m
+ mov r1, r1m
+ mov r0, r0m
+ FIX_STRIDES r1
+ and r2, 3
+ imul r2, r1
+ lea r0, [r0+r2*4+64*SIZEOF_PIXEL]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+ lea r0, [r0+r1*2]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+
+ mov r2, r4m
+ mov r1, r3m
+ mov r0, r2m
+ FIX_STRIDES r1
+ and r2, 3
+ imul r2, r1
+ lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+%ifidn %1, 422
+ lea r0, [r0+r1*2]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+%endif
+ ret
+%endif ; ARCH_X86_64
+%endmacro
+
+INIT_MMX mmx2
+PREFETCH_FENC 420
+PREFETCH_FENC 422
+
+;-----------------------------------------------------------------------------
+; void prefetch_ref( pixel *pix, intptr_t stride, int parity )
+;-----------------------------------------------------------------------------
+INIT_MMX mmx2
+cglobal prefetch_ref, 3,3
+ FIX_STRIDES r1
+ dec r2d
+ and r2d, r1d
+ lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
+ lea r2, [r1*3]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+ prefetcht0 [r0+r1*2]
+ prefetcht0 [r0+r2]
+ lea r0, [r0+r1*4]
+ prefetcht0 [r0]
+ prefetcht0 [r0+r1]
+ prefetcht0 [r0+r1*2]
+ prefetcht0 [r0+r2]
+ RET
+
+
+
+;=============================================================================
+; chroma MC
+;=============================================================================
+
+%if ARCH_X86_64
+ DECLARE_REG_TMP 6,7,8
+%else
+ DECLARE_REG_TMP 0,1,2
+%endif
+
+%macro MC_CHROMA_START 1
+%if ARCH_X86_64
+ PROLOGUE 0,9,%1
+%else
+ PROLOGUE 0,6,%1
+%endif
+ movifnidn r3, r3mp
+ movifnidn r4d, r4m
+ movifnidn r5d, r5m
+ movifnidn t0d, r6m
+ mov t2d, t0d
+ mov t1d, r5d
+ sar t0d, 3
+ sar t1d, 3
+ imul t0d, r4d
+ lea t0d, [t0+t1*2]
+ FIX_STRIDES t0d
+ movsxdifnidn t0, t0d
+ add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride
+%endmacro
+
+%if HIGH_BIT_DEPTH
+%macro UNPACK_UNALIGNED 4
+ movu %1, [%4+0]
+ movu %2, [%4+4]
+ punpckhwd %3, %1, %2
+ punpcklwd %1, %2
+%if mmsize == 8
+ mova %2, %1
+ punpcklwd %1, %3
+ punpckhwd %2, %3
+%else
+ shufps %2, %1, %3, q3131
+ shufps %1, %3, q2020
+%endif
+%endmacro
+%else ; !HIGH_BIT_DEPTH
+%macro UNPACK_UNALIGNED 3
+%if mmsize == 8
+ punpcklwd %1, %3
+%else
+ movh %2, %3
+ punpcklwd %1, %2
+%endif
+%endmacro
+%endif ; HIGH_BIT_DEPTH
+
+;-----------------------------------------------------------------------------
+; void mc_chroma( uint8_t *dstu, uint8_t *dstv, intptr_t dst_stride,
+; uint8_t *src, intptr_t src_stride,
+; int dx, int dy,
+; int width, int height )
+;-----------------------------------------------------------------------------
+%macro MC_CHROMA 0
+cglobal mc_chroma
+ MC_CHROMA_START 0
+ FIX_STRIDES r4
+ and r5d, 7
+%if ARCH_X86_64
+ jz .mc1dy
+%endif
+ and t2d, 7
+%if ARCH_X86_64
+ jz .mc1dx
+%endif
+ shl r5d, 16
+ add t2d, r5d
+ mov t0d, t2d
+ shl t2d, 8
+ sub t2d, t0d
+ add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y)
+ cmp dword r7m, 4
+%if mmsize==8
+.skip_prologue:
+%else
+ jl mc_chroma_mmx2 %+ .skip_prologue
+ WIN64_SPILL_XMM 9
+%endif
+ movd m5, t2d
+ movifnidn r0, r0mp
+ movifnidn r1, r1mp
+ movifnidn r2d, r2m
+ movifnidn r5d, r8m
+ pxor m6, m6
+ punpcklbw m5, m6
+%if mmsize==8
+ pshufw m7, m5, q3232
+ pshufw m6, m5, q0000
+ pshufw m5, m5, q1111
+ jge .width4
+%else
+%if WIN64
+ cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
+%endif
+ pshufd m7, m5, q1111
+ punpcklwd m5, m5
+ pshufd m6, m5, q0000
+ pshufd m5, m5, q1111
+ jg .width8
+%endif
+%if HIGH_BIT_DEPTH
+ add r2, r2
+ UNPACK_UNALIGNED m0, m1, m2, r3
+%else
+ movu m0, [r3]
+ UNPACK_UNALIGNED m0, m1, [r3+2]
+ mova m1, m0
+ pand m0, [pw_00ff]
+ psrlw m1, 8
+%endif ; HIGH_BIT_DEPTH
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+ packssdw m0, m1
+ SWAP 3, 0
+ALIGN 4
+.loop2:
+%if HIGH_BIT_DEPTH
+ UNPACK_UNALIGNED m0, m1, m2, r3+r4
+ pmullw m3, m6
+%else ; !HIGH_BIT_DEPTH
+ movu m0, [r3+r4]
+ UNPACK_UNALIGNED m0, m1, [r3+r4+2]
+ pmullw m3, m6
+ mova m1, m0
+ pand m0, [pw_00ff]
+ psrlw m1, 8
+%endif ; HIGH_BIT_DEPTH
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+ mova m2, [pw_32]
+ packssdw m0, m1
+ paddw m2, m3
+ mova m3, m0
+ pmullw m0, m5
+ paddw m0, m2
+ psrlw m0, 6
+%if HIGH_BIT_DEPTH
+ movh [r0], m0
+%if mmsize == 8
+ psrlq m0, 32
+ movh [r1], m0
+%else
+ movhps [r1], m0
+%endif
+%else ; !HIGH_BIT_DEPTH
+ packuswb m0, m0
+ movd [r0], m0
+%if mmsize==8
+ psrlq m0, 16
+%else
+ psrldq m0, 4
+%endif
+ movd [r1], m0
+%endif ; HIGH_BIT_DEPTH
+ add r3, r4
+ add r0, r2
+ add r1, r2
+ dec r5d
+ jg .loop2
+ RET
+
+%if mmsize==8
+.width4:
+%if ARCH_X86_64
+ mov t0, r0
+ mov t1, r1
+ mov t2, r3
+%if WIN64
+ %define multy0 r4m
+%else
+ %define multy0 [rsp-8]
+%endif
+ mova multy0, m5
+%else
+ mov r3m, r3
+ %define multy0 r4m
+ mova multy0, m5
+%endif
+%else
+.width8:
+%if ARCH_X86_64
+ %define multy0 m8
+ SWAP 8, 5
+%else
+ %define multy0 r0m
+ mova multy0, m5
+%endif
+%endif
+ FIX_STRIDES r2
+.loopx:
+%if HIGH_BIT_DEPTH
+ UNPACK_UNALIGNED m0, m2, m4, r3
+ UNPACK_UNALIGNED m1, m3, m5, r3+mmsize
+%else
+ movu m0, [r3]
+ movu m1, [r3+mmsize/2]
+ UNPACK_UNALIGNED m0, m2, [r3+2]
+ UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
+ psrlw m2, m0, 8
+ psrlw m3, m1, 8
+ pand m0, [pw_00ff]
+ pand m1, [pw_00ff]
+%endif
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ pmaddwd m1, m7
+ pmaddwd m3, m7
+ packssdw m0, m2
+ packssdw m1, m3
+ SWAP 4, 0
+ SWAP 5, 1
+ add r3, r4
+ALIGN 4
+.loop4:
+%if HIGH_BIT_DEPTH
+ UNPACK_UNALIGNED m0, m1, m2, r3
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+ packssdw m0, m1
+ UNPACK_UNALIGNED m1, m2, m3, r3+mmsize
+ pmaddwd m1, m7
+ pmaddwd m2, m7
+ packssdw m1, m2
+%else ; !HIGH_BIT_DEPTH
+ movu m0, [r3]
+ movu m1, [r3+mmsize/2]
+ UNPACK_UNALIGNED m0, m2, [r3+2]
+ UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
+ psrlw m2, m0, 8
+ psrlw m3, m1, 8
+ pand m0, [pw_00ff]
+ pand m1, [pw_00ff]
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ pmaddwd m1, m7
+ pmaddwd m3, m7
+ packssdw m0, m2
+ packssdw m1, m3
+%endif ; HIGH_BIT_DEPTH
+ pmullw m4, m6
+ pmullw m5, m6
+ mova m2, [pw_32]
+ paddw m3, m2, m5
+ paddw m2, m4
+ mova m4, m0
+ mova m5, m1
+ pmullw m0, multy0
+ pmullw m1, multy0
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 6
+ psrlw m1, 6
+%if HIGH_BIT_DEPTH
+ movh [r0], m0
+ movh [r0+mmsize/2], m1
+%if mmsize==8
+ psrlq m0, 32
+ psrlq m1, 32
+ movh [r1], m0
+ movh [r1+mmsize/2], m1
+%else
+ movhps [r1], m0
+ movhps [r1+mmsize/2], m1
+%endif
+%else ; !HIGH_BIT_DEPTH
+ packuswb m0, m1
+%if mmsize==8
+ pshufw m1, m0, q0020
+ pshufw m0, m0, q0031
+ movd [r0], m1
+ movd [r1], m0
+%else
+ pshufd m0, m0, q3120
+ movq [r0], m0
+ movhps [r1], m0
+%endif
+%endif ; HIGH_BIT_DEPTH
+ add r3, r4
+ add r0, r2
+ add r1, r2
+ dec r5d
+ jg .loop4
+%if mmsize!=8
+ RET
+%else
+ sub dword r7m, 4
+ jg .width8
+ RET
+.width8:
+%if ARCH_X86_64
+ lea r3, [t2+8*SIZEOF_PIXEL]
+ lea r0, [t0+4*SIZEOF_PIXEL]
+ lea r1, [t1+4*SIZEOF_PIXEL]
+%else
+ mov r3, r3m
+ mov r0, r0m
+ mov r1, r1m
+ add r3, 8*SIZEOF_PIXEL
+ add r0, 4*SIZEOF_PIXEL
+ add r1, 4*SIZEOF_PIXEL
+%endif
+ mov r5d, r8m
+ jmp .loopx
+%endif
+
+%if ARCH_X86_64 ; too many regs for x86_32
+ RESET_MM_PERMUTATION
+%if WIN64
+ %assign stack_offset stack_offset - stack_size_padded
+ %assign stack_size_padded 0
+ %assign xmm_regs_used 0
+%endif
+.mc1dy:
+ and t2d, 7
+ movd m5, t2d
+ mov r6d, r4d ; pel_offset = dx ? 2 : src_stride
+ jmp .mc1d
+.mc1dx:
+ movd m5, r5d
+ mov r6d, 2*SIZEOF_PIXEL
+.mc1d:
+%if HIGH_BIT_DEPTH && mmsize == 16
+ WIN64_SPILL_XMM 8
+%endif
+ mova m4, [pw_8]
+ SPLATW m5, m5
+ psubw m4, m5
+ movifnidn r0, r0mp
+ movifnidn r1, r1mp
+ movifnidn r2d, r2m
+ FIX_STRIDES r2
+ movifnidn r5d, r8m
+ cmp dword r7m, 4
+ jg .mc1d_w8
+ mov r7, r2
+ mov r8, r4
+%if mmsize!=8
+ shr r5d, 1
+%endif
+.loop1d_w4:
+%if HIGH_BIT_DEPTH
+%if mmsize == 8
+ movq m0, [r3+0]
+ movq m2, [r3+8]
+ movq m1, [r3+r6+0]
+ movq m3, [r3+r6+8]
+%else
+ movu m0, [r3]
+ movu m1, [r3+r6]
+ add r3, r8
+ movu m2, [r3]
+ movu m3, [r3+r6]
+%endif
+ SBUTTERFLY wd, 0, 2, 6
+ SBUTTERFLY wd, 1, 3, 7
+ SBUTTERFLY wd, 0, 2, 6
+ SBUTTERFLY wd, 1, 3, 7
+%if mmsize == 16
+ SBUTTERFLY wd, 0, 2, 6
+ SBUTTERFLY wd, 1, 3, 7
+%endif
+%else ; !HIGH_BIT_DEPTH
+ movq m0, [r3]
+ movq m1, [r3+r6]
+%if mmsize!=8
+ add r3, r8
+ movhps m0, [r3]
+ movhps m1, [r3+r6]
+%endif
+ psrlw m2, m0, 8
+ psrlw m3, m1, 8
+ pand m0, [pw_00ff]
+ pand m1, [pw_00ff]
+%endif ; HIGH_BIT_DEPTH
+ pmullw m0, m4
+ pmullw m1, m5
+ pmullw m2, m4
+ pmullw m3, m5
+ paddw m0, [pw_4]
+ paddw m2, [pw_4]
+ paddw m0, m1
+ paddw m2, m3
+ psrlw m0, 3
+ psrlw m2, 3
+%if HIGH_BIT_DEPTH
+%if mmsize == 8
+ xchg r4, r8
+ xchg r2, r7
+%endif
+ movq [r0], m0
+ movq [r1], m2
+%if mmsize == 16
+ add r0, r7
+ add r1, r7
+ movhps [r0], m0
+ movhps [r1], m2
+%endif
+%else ; !HIGH_BIT_DEPTH
+ packuswb m0, m2
+%if mmsize==8
+ xchg r4, r8
+ xchg r2, r7
+ movd [r0], m0
+ psrlq m0, 32
+ movd [r1], m0
+%else
+ movhlps m1, m0
+ movd [r0], m0
+ movd [r1], m1
+ add r0, r7
+ add r1, r7
+ psrldq m0, 4
+ psrldq m1, 4
+ movd [r0], m0
+ movd [r1], m1
+%endif
+%endif ; HIGH_BIT_DEPTH
+ add r3, r4
+ add r0, r2
+ add r1, r2
+ dec r5d
+ jg .loop1d_w4
+ RET
+.mc1d_w8:
+ sub r2, 4*SIZEOF_PIXEL
+ sub r4, 8*SIZEOF_PIXEL
+ mov r7, 4*SIZEOF_PIXEL
+ mov r8, 8*SIZEOF_PIXEL
+%if mmsize==8
+ shl r5d, 1
+%endif
+ jmp .loop1d_w4
+%endif ; ARCH_X86_64
+%endmacro ; MC_CHROMA
+
+%macro MC_CHROMA_SSSE3 0
+cglobal mc_chroma
+%if cpuflag(avx2)
+ MC_CHROMA_START 9
+%else
+ MC_CHROMA_START 10
+%endif
+ and r5d, 7
+ and t2d, 7
+ mov t0d, r5d
+ shl t0d, 8
+ sub t0d, r5d
+ mov r5d, 8
+ add t0d, 8
+ sub r5d, t2d
+ imul t2d, t0d ; (x*255+8)*y
+ imul r5d, t0d ; (x*255+8)*(8-y)
+ movd xm6, t2d
+ movd xm7, r5d
+%if cpuflag(cache64)
+ mov t0d, r3d
+ and t0d, 7
+%ifdef PIC
+ lea t1, [ch_shuf_adj]
+ movddup xm5, [t1 + t0*4]
+%else
+ movddup xm5, [ch_shuf_adj + t0*4]
+%endif
+ paddb xm5, [ch_shuf]
+ and r3, ~7
+%else
+ mova m5, [ch_shuf]
+%endif
+ movifnidn r0, r0mp
+ movifnidn r1, r1mp
+ movifnidn r2d, r2m
+ movifnidn r5d, r8m
+%if cpuflag(avx2)
+ vpbroadcastw m6, xm6
+ vpbroadcastw m7, xm7
+%else
+ SPLATW m6, m6
+ SPLATW m7, m7
+%endif
+%if ARCH_X86_64
+ %define shiftround m8
+ mova m8, [pw_512]
+%else
+ %define shiftround [pw_512]
+%endif
+ cmp dword r7m, 4
+ jg .width8
+
+%if cpuflag(avx2)
+.loop4:
+ movu xm0, [r3]
+ movu xm1, [r3+r4]
+ vinserti128 m0, m0, [r3+r4], 1
+ vinserti128 m1, m1, [r3+r4*2], 1
+ pshufb m0, m5
+ pshufb m1, m5
+ pmaddubsw m0, m7
+ pmaddubsw m1, m6
+ paddw m0, m1
+ pmulhrsw m0, shiftround
+ packuswb m0, m0
+ vextracti128 xm1, m0, 1
+ movd [r0], xm0
+ movd [r0+r2], xm1
+ psrldq xm0, 4
+ psrldq xm1, 4
+ movd [r1], xm0
+ movd [r1+r2], xm1
+ lea r3, [r3+r4*2]
+ lea r0, [r0+r2*2]
+ lea r1, [r1+r2*2]
+ sub r5d, 2
+ jg .loop4
+ RET
+.width8:
+ movu xm0, [r3]
+ vinserti128 m0, m0, [r3+8], 1
+ pshufb m0, m5
+.loop8:
+ movu xm3, [r3+r4]
+ vinserti128 m3, m3, [r3+r4+8], 1
+ pshufb m3, m5
+ pmaddubsw m1, m0, m7
+ pmaddubsw m2, m3, m6
+ pmaddubsw m3, m3, m7
+
+ movu xm0, [r3+r4*2]
+ vinserti128 m0, m0, [r3+r4*2+8], 1
+ pshufb m0, m5
+ pmaddubsw m4, m0, m6
+
+ paddw m1, m2
+ paddw m3, m4
+ pmulhrsw m1, shiftround
+ pmulhrsw m3, shiftround
+ packuswb m1, m3
+ mova m2, [deinterleave_shufd]
+ vpermd m1, m2, m1
+ vextracti128 xm2, m1, 1
+ movq [r0], xm1
+ movhps [r1], xm1
+ movq [r0+r2], xm2
+ movhps [r1+r2], xm2
+%else
+ movu m0, [r3]
+ pshufb m0, xm5
+.loop4:
+ movu m1, [r3+r4]
+ pshufb m1, m5
+ movu m3, [r3+r4*2]
+ pshufb m3, m5
+ mova m4, m3
+ pmaddubsw m0, m7
+ pmaddubsw m2, m1, m7
+ pmaddubsw m1, m6
+ pmaddubsw m3, m6
+ paddw m1, m0
+ paddw m3, m2
+ pmulhrsw m1, shiftround
+ pmulhrsw m3, shiftround
+ mova m0, m4
+ packuswb m1, m3
+ movhlps m3, m1
+ movd [r0], xm1
+ movd [r0+r2], m3
+ psrldq m1, 4
+ psrldq m3, 4
+ movd [r1], m1
+ movd [r1+r2], m3
+ lea r3, [r3+r4*2]
+ lea r0, [r0+r2*2]
+ lea r1, [r1+r2*2]
+ sub r5d, 2
+ jg .loop4
+ RET
+.width8:
+ movu m0, [r3]
+ pshufb m0, m5
+ movu m1, [r3+8]
+ pshufb m1, m5
+%if ARCH_X86_64
+ SWAP 9, 6
+ %define mult1 m9
+%else
+ mova r0m, m6
+ %define mult1 r0m
+%endif
+.loop8:
+ movu m2, [r3+r4]
+ pshufb m2, m5
+ movu m3, [r3+r4+8]
+ pshufb m3, m5
+ mova m4, m2
+ mova m6, m3
+ pmaddubsw m0, m7
+ pmaddubsw m1, m7
+ pmaddubsw m2, mult1
+ pmaddubsw m3, mult1
+ paddw m0, m2
+ paddw m1, m3
+ pmulhrsw m0, shiftround ; x + 32 >> 6
+ pmulhrsw m1, shiftround
+ packuswb m0, m1
+ pshufd m0, m0, q3120
+ movq [r0], m0
+ movhps [r1], m0
+
+ movu m2, [r3+r4*2]
+ pshufb m2, m5
+ movu m3, [r3+r4*2+8]
+ pshufb m3, m5
+ mova m0, m2
+ mova m1, m3
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pmaddubsw m2, mult1
+ pmaddubsw m3, mult1
+ paddw m2, m4
+ paddw m3, m6
+ pmulhrsw m2, shiftround
+ pmulhrsw m3, shiftround
+ packuswb m2, m3
+ pshufd m2, m2, q3120
+ movq [r0+r2], m2
+ movhps [r1+r2], m2
+%endif
+ lea r3, [r3+r4*2]
+ lea r0, [r0+r2*2]
+ lea r1, [r1+r2*2]
+ sub r5d, 2
+ jg .loop8
+ RET
+%endmacro
+
+%if HIGH_BIT_DEPTH
+INIT_MMX mmx2
+MC_CHROMA
+INIT_XMM sse2
+MC_CHROMA
+INIT_XMM avx
+MC_CHROMA
+%else ; !HIGH_BIT_DEPTH
+INIT_MMX mmx2
+MC_CHROMA
+INIT_XMM sse2
+MC_CHROMA
+INIT_XMM ssse3
+MC_CHROMA_SSSE3
+INIT_XMM ssse3, cache64
+MC_CHROMA_SSSE3
+INIT_XMM avx
+MC_CHROMA_SSSE3 ; No known AVX CPU will trigger CPU_CACHELINE_64
+INIT_YMM avx2
+MC_CHROMA_SSSE3
+%endif ; HIGH_BIT_DEPTH
diff -r 7976c35f5b76 -r 84d0f4f907f7 source/encoder/motion.cpp
--- a/source/encoder/motion.cpp Fri Oct 04 16:02:01 2013 -0500
+++ b/source/encoder/motion.cpp Fri Oct 04 17:01:07 2013 -0500
@@ -1119,7 +1119,7 @@
pixel *frefB = ref->lowresPlane[hpelB] + blockOffset + (qmvB.x >> 2) + (qmvB.y >> 2) * ref->lumaStride;
// average nearest two HPEL pixels to generate H.264 style QPEL pixels
- primitives.pixelavg_pp[PARTITION_8x8](subpelbuf, FENC_STRIDE, frefA, frefB, ref->lumaStride, ref->lumaStride);
+ primitives.pixelavg_pp[PARTITION_8x8](subpelbuf, FENC_STRIDE, frefA, ref->lumaStride, frefB, ref->lumaStride, 32);
return cmp(fenc, FENC_STRIDE, subpelbuf, FENC_STRIDE);
}
else