[x265] [PATCH] Lookahead: asm primitive for downscale
Steve Borho
steve at borho.org
Fri Jul 26 21:37:42 CEST 2013
On Fri, Jul 26, 2013 at 4:31 PM, <gopu at multicorewareinc.com> wrote:
> # HG changeset patch
> # User ggopu at bitbucket.org
> # Date 1374873151 25200
> # Node ID 2454a81c67fa50b20a71c81a4a5b870eade71b77
> # Parent f2f70fa9b4f3f075629d02c35684d16bea67fee0
> Lookahead: asm primitive for downscale
>
> diff -r f2f70fa9b4f3 -r 2454a81c67fa source/common/pixel.cpp
> --- a/source/common/pixel.cpp Fri Jul 26 02:19:06 2013 -0500
> +++ b/source/common/pixel.cpp Fri Jul 26 14:12:31 2013 -0700
> @@ -598,6 +598,31 @@
> }
> }
>
> +void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
> +                             intptr_t src_stride, intptr_t dst_stride, int width, int height )
> +{
> + for( int y = 0; y < height; y++ )
> + {
> + pixel *src1 = src0+src_stride;
> + pixel *src2 = src1+src_stride;
> + for( int x = 0; x<width; x++ )
> + {
> + // slower than naive bilinear, but matches asm
> +#define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
> +            dst0[x] = FILTER(src0[2*x  ], src1[2*x  ], src0[2*x+1], src1[2*x+1]);
> +            dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
> +            dstv[x] = FILTER(src1[2*x  ], src2[2*x  ], src1[2*x+1], src2[2*x+1]);
> +            dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
> +#undef FILTER
> + }
> + src0 += src_stride*2;
> + dst0 += dst_stride;
> + dsth += dst_stride;
> + dstv += dst_stride;
> + dstc += dst_stride;
> + }
> +}
> +
> } // end anonymous namespace
>
> namespace x265 {
> @@ -806,5 +831,6 @@
>
> p.scale1D_128to64 = scale1D_128to64;
> p.scale2D_64to32 = scale2D_64to32;
> + p.frame_init_lowres_core = frame_init_lowres_core;
> }
> }
> diff -r f2f70fa9b4f3 -r 2454a81c67fa source/common/primitives.h
> --- a/source/common/primitives.h Fri Jul 26 02:19:06 2013 -0500
> +++ b/source/common/primitives.h Fri Jul 26 14:12:31 2013 -0700
> @@ -227,6 +227,8 @@
>                              int marginX, int marginY, int w, int roundw, int shiftw, int offsetw);
> typedef void (*weightpUni_t)(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int w0, int round, int shift, int offset);
> typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride);
> +typedef void (*downscale_t)( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
> +                             intptr_t src_stride, intptr_t dst_stride, int width, int height );
>
> /* Define a structure containing function pointers to optimized encoder
> * primitives. Each pointer can reference either an assembly routine,
> @@ -286,6 +288,7 @@
>
> scale_t scale1D_128to64;
> scale_t scale2D_64to32;
> + downscale_t frame_init_lowres_core;
> };
>
> /* This copy of the table is what gets used by the encoder.
> diff -r f2f70fa9b4f3 -r 2454a81c67fa source/common/x86/CMakeLists.txt
> --- a/source/common/x86/CMakeLists.txt Fri Jul 26 02:19:06 2013 -0500
> +++ b/source/common/x86/CMakeLists.txt Fri Jul 26 14:12:31 2013 -0700
> @@ -5,7 +5,7 @@
> add_definitions(-DHAVE_ALIGNED_STACK=0)
> endif()
>
> -set(ASMS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm)
> +set(ASMS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a2.asm)
> if (X64)
> add_definitions(-DARCH_X86_64=1)
> else()
> diff -r f2f70fa9b4f3 -r 2454a81c67fa source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Fri Jul 26 02:19:06 2013 -0500
> +++ b/source/common/x86/asm-primitives.cpp Fri Jul 26 14:12:31 2013 -0700
> @@ -165,6 +165,8 @@
>     p.satd[PARTITION_64x48] = cmp<64, 48, 16, 16, x265_pixel_satd_16x16_mmx2>;
>     p.satd[PARTITION_64x64] = cmp<64, 64, 16, 16, x265_pixel_satd_16x16_mmx2>;
>
> + p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2;
> +
> INIT2( sad, _sse2 );
> INIT2( sad_x3, _sse2 );
> INIT2( sad_x4, _sse2 );
> diff -r f2f70fa9b4f3 -r 2454a81c67fa source/common/x86/mc-a2.asm
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/source/common/x86/mc-a2.asm Fri Jul 26 14:12:31 2013 -0700
> @@ -0,0 +1,1790 @@
>
> +;*****************************************************************************
> +;* mc-a2.asm: x86 motion compensation
>
> +;*****************************************************************************
> +;* Copyright (C) 2005-2012 x264 project
>
This copyright date range is really old, but that is partially my fault, as I
haven't clearly documented which revision of x264 I drew the other imported
files from. I'll fix that in a follow-on commit.
> +;*
> +;* Authors: Loren Merritt <lorenm at u.washington.edu>
> +;* Jason Garrett-Glaser <darkshikari at gmail.com>
> +;* Holger Lubitz <holger at lubitz.org>
> +;* Mathieu Monnier <manao at melix.net>
> +;* Oskar Arvidsson <oskar at irock.se>
> +;*
> +;* This program is free software; you can redistribute it and/or modify
> +;* it under the terms of the GNU General Public License as published by
> +;* the Free Software Foundation; either version 2 of the License, or
> +;* (at your option) any later version.
> +;*
> +;* This program is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +;* GNU General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU General Public License
> +;* along with this program; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111,
> USA.
> +;*
> +;* This program is also available under a commercial proprietary license.
> +;* For more information, contact us at licensing at x264.com.
>
> +;*****************************************************************************
> +
> +%include "x86inc.asm"
> +%include "x86util.asm"
> +
> +SECTION_RODATA
> +
> +filt_mul20: times 16 db 20
> +filt_mul15: times 8 db 1, -5
> +filt_mul51: times 8 db -5, 1
> +hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
> +deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
> +%if HIGH_BIT_DEPTH
> +deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
> +deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
> +%else
> +deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
> +deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
> +%endif
> +
> +pd_16: times 4 dd 16
> +pd_0f: times 4 dd 0xffff
> +pf_inv256: times 8 dd 0.00390625
> +
> +pad10: times 8 dw 10*PIXEL_MAX
> +pad20: times 8 dw 20*PIXEL_MAX
> +pad30: times 8 dw 30*PIXEL_MAX
> +depad: times 4 dd 32*20*PIXEL_MAX + 512
> +
> +tap1: times 4 dw 1, -5
> +tap2: times 4 dw 20, 20
> +tap3: times 4 dw -5, 1
> +
> +SECTION .text
> +
> +cextern pb_0
> +cextern pw_1
> +cextern pw_16
> +cextern pw_32
> +cextern pw_00ff
> +cextern pw_3fff
> +cextern pw_pixel_max
> +cextern pd_ffff
> +
> +%macro LOAD_ADD 4
> + movh %4, %3
> + movh %1, %2
> + punpcklbw %4, m0
> + punpcklbw %1, m0
> + paddw %1, %4
> +%endmacro
> +
> +%macro LOAD_ADD_2 6
> + mova %5, %3
> + mova %1, %4
> + punpckhbw %6, %5, m0
> + punpcklbw %5, m0
> + punpckhbw %2, %1, m0
> + punpcklbw %1, m0
> + paddw %1, %5
> + paddw %2, %6
> +%endmacro
> +
> +%macro FILT_V2 6
> + psubw %1, %2 ; a-b
> + psubw %4, %5
> + psubw %2, %3 ; b-c
> + psubw %5, %6
> + psllw %2, 2
> + psllw %5, 2
> + psubw %1, %2 ; a-5*b+4*c
> + psllw %3, 4
> + psubw %4, %5
> + psllw %6, 4
> + paddw %1, %3 ; a-5*b+20*c
> + paddw %4, %6
> +%endmacro
> +
> +%macro FILT_H 3
> + psubw %1, %2 ; a-b
> + psraw %1, 2 ; (a-b)/4
> + psubw %1, %2 ; (a-b)/4-b
> + paddw %1, %3 ; (a-b)/4-b+c
> + psraw %1, 2 ; ((a-b)/4-b+c)/4
> + paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
> +%endmacro
> +
> +%macro FILT_H2 6
> + psubw %1, %2
> + psubw %4, %5
> + psraw %1, 2
> + psraw %4, 2
> + psubw %1, %2
> + psubw %4, %5
> + paddw %1, %3
> + paddw %4, %6
> + psraw %1, 2
> + psraw %4, 2
> + paddw %1, %3
> + paddw %4, %6
> +%endmacro
> +
> +%macro FILT_PACK 4-6 b
> + paddw %1, %4
> + paddw %2, %4
> +%if %0 == 6
> + psubusw %1, %6
> + psubusw %2, %6
> + psrlw %1, %3
> + psrlw %2, %3
> +%else
> + psraw %1, %3
> + psraw %2, %3
> +%endif
> +%ifnidn w, %5
> + packuswb %1, %2
> +%endif
> +%endmacro
> +
> +;The hpel_filter routines use non-temporal writes for output.
> +;The following defines may be uncommented for testing.
> +;Doing the hpel_filter temporal may be a win if the last level cache
> +;is big enough (preliminary benching suggests on the order of 4*
> framesize).
> +
> +;%define movntq movq
> +;%define movntps movaps
> +;%define sfence
> +
> +%if HIGH_BIT_DEPTH
>
> +;-----------------------------------------------------------------------------
> +; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf,
> intptr_t stride, intptr_t width );
>
> +;-----------------------------------------------------------------------------
> +%macro HPEL_FILTER 0
> +cglobal hpel_filter_v, 5,6,11
> + FIX_STRIDES r3, r4
> + lea r5, [r1+r3]
> + sub r1, r3
> + sub r1, r3
> +%if num_mmregs > 8
> + mova m8, [pad10]
> + mova m9, [pad20]
> + mova m10, [pad30]
> + %define s10 m8
> + %define s20 m9
> + %define s30 m10
> +%else
> + %define s10 [pad10]
> + %define s20 [pad20]
> + %define s30 [pad30]
> +%endif
> + add r0, r4
> + add r2, r4
> + neg r4
> + mova m7, [pw_pixel_max]
> + pxor m0, m0
> +.loop:
> + mova m1, [r1]
> + mova m2, [r1+r3]
> + mova m3, [r1+r3*2]
> + mova m4, [r1+mmsize]
> + mova m5, [r1+r3+mmsize]
> + mova m6, [r1+r3*2+mmsize]
> + paddw m1, [r5+r3*2]
> + paddw m2, [r5+r3]
> + paddw m3, [r5]
> + paddw m4, [r5+r3*2+mmsize]
> + paddw m5, [r5+r3+mmsize]
> + paddw m6, [r5+mmsize]
> + add r1, 2*mmsize
> + add r5, 2*mmsize
> + FILT_V2 m1, m2, m3, m4, m5, m6
> + mova m6, [pw_16]
> + psubw m1, s20
> + psubw m4, s20
> + mova [r2+r4], m1
> + mova [r2+r4+mmsize], m4
> + paddw m1, s30
> + paddw m4, s30
> + FILT_PACK m1, m4, 5, m6, w, s10
> + CLIPW m1, m0, m7
> + CLIPW m4, m0, m7
> + mova [r0+r4], m1
> + mova [r0+r4+mmsize], m4
> + add r4, 2*mmsize
> + jl .loop
> + REP_RET
> +
>
> +;-----------------------------------------------------------------------------
> +; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
>
> +;-----------------------------------------------------------------------------
> +cglobal hpel_filter_c, 3,3,10
> + add r2, r2
> + add r0, r2
> + add r1, r2
> + neg r2
> + mova m0, [tap1]
> + mova m7, [tap3]
> +%if num_mmregs > 8
> + mova m8, [tap2]
> + mova m9, [depad]
> + %define s1 m8
> + %define s2 m9
> +%else
> + %define s1 [tap2]
> + %define s2 [depad]
> +%endif
> +.loop:
> + movu m1, [r1+r2-4]
> + movu m2, [r1+r2-2]
> + mova m3, [r1+r2+0]
> + movu m4, [r1+r2+2]
> + movu m5, [r1+r2+4]
> + movu m6, [r1+r2+6]
> + pmaddwd m1, m0
> + pmaddwd m2, m0
> + pmaddwd m3, s1
> + pmaddwd m4, s1
> + pmaddwd m5, m7
> + pmaddwd m6, m7
> + paddd m1, s2
> + paddd m2, s2
> + paddd m3, m5
> + paddd m4, m6
> + paddd m1, m3
> + paddd m2, m4
> + psrad m1, 10
> + psrad m2, 10
> + pslld m2, 16
> + pand m1, [pd_0f]
> + por m1, m2
> + CLIPW m1, [pb_0], [pw_pixel_max]
> + mova [r0+r2], m1
> + add r2, mmsize
> + jl .loop
> + REP_RET
> +
>
> +;-----------------------------------------------------------------------------
> +; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
>
> +;-----------------------------------------------------------------------------
> +cglobal hpel_filter_h, 3,4,8
> + %define src r1+r2
> + add r2, r2
> + add r0, r2
> + add r1, r2
> + neg r2
> + mova m0, [pw_pixel_max]
> +.loop:
> + movu m1, [src-4]
> + movu m2, [src-2]
> + mova m3, [src+0]
> + movu m6, [src+2]
> + movu m4, [src+4]
> + movu m5, [src+6]
> + paddw m3, m6 ; c0
> + paddw m2, m4 ; b0
> + paddw m1, m5 ; a0
> +%if mmsize == 16
> + movu m4, [src-4+mmsize]
> + movu m5, [src-2+mmsize]
> +%endif
> + movu m7, [src+4+mmsize]
> + movu m6, [src+6+mmsize]
> + paddw m5, m7 ; b1
> + paddw m4, m6 ; a1
> + movu m7, [src+2+mmsize]
> + mova m6, [src+0+mmsize]
> + paddw m6, m7 ; c1
> + FILT_H2 m1, m2, m3, m4, m5, m6
> + mova m7, [pw_1]
> + pxor m2, m2
> + FILT_PACK m1, m4, 1, m7, w
> + CLIPW m1, m2, m0
> + CLIPW m4, m2, m0
> + mova [r0+r2], m1
> + mova [r0+r2+mmsize], m4
> + add r2, mmsize*2
> + jl .loop
> + REP_RET
> +%endmacro ; HPEL_FILTER
> +
> +INIT_MMX mmx2
> +HPEL_FILTER
> +INIT_XMM sse2
> +HPEL_FILTER
> +%endif ; HIGH_BIT_DEPTH
> +
> +%if HIGH_BIT_DEPTH == 0
> +%macro HPEL_V 1
>
> +;-----------------------------------------------------------------------------
> +; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t
> stride, intptr_t width );
>
> +;-----------------------------------------------------------------------------
> +cglobal hpel_filter_v, 5,6,%1
> + lea r5, [r1+r3]
> + sub r1, r3
> + sub r1, r3
> + add r0, r4
> + lea r2, [r2+r4*2]
> + neg r4
> +%if cpuflag(ssse3)
> + mova m0, [filt_mul15]
> +%else
> + pxor m0, m0
> +%endif
> +.loop:
> +%if cpuflag(ssse3)
> + mova m1, [r1]
> + mova m4, [r1+r3]
> + mova m2, [r5+r3*2]
> + mova m5, [r5+r3]
> + mova m3, [r1+r3*2]
> + mova m6, [r5]
> + SBUTTERFLY bw, 1, 4, 7
> + SBUTTERFLY bw, 2, 5, 7
> + SBUTTERFLY bw, 3, 6, 7
> + pmaddubsw m1, m0
> + pmaddubsw m4, m0
> + pmaddubsw m2, m0
> + pmaddubsw m5, m0
> + pmaddubsw m3, [filt_mul20]
> + pmaddubsw m6, [filt_mul20]
> + paddw m1, m2
> + paddw m4, m5
> + paddw m1, m3
> + paddw m4, m6
> +%else
> + LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
> + LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
> + LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
> + LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
> + FILT_V2 m1, m2, m3, m4, m5, m6
> +%endif
> + mova m7, [pw_16]
> + mova [r2+r4*2], m1
> + mova [r2+r4*2+mmsize], m4
> + FILT_PACK m1, m4, 5, m7
> + movnta [r0+r4], m1
> + add r1, mmsize
> + add r5, mmsize
> + add r4, mmsize
> + jl .loop
> + REP_RET
> +%endmacro
> +
>
> +;-----------------------------------------------------------------------------
> +; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
>
> +;-----------------------------------------------------------------------------
> +INIT_MMX
> +cglobal hpel_filter_c_mmx2, 3,3
> + add r0, r2
> + lea r1, [r1+r2*2]
> + neg r2
> + %define src r1+r2*2
> + movq m7, [pw_32]
> +.loop:
> + movq m1, [src-4]
> + movq m2, [src-2]
> + movq m3, [src ]
> + movq m4, [src+4]
> + movq m5, [src+6]
> + paddw m3, [src+2] ; c0
> + paddw m2, m4 ; b0
> + paddw m1, m5 ; a0
> + movq m6, [src+8]
> + paddw m4, [src+14] ; a1
> + paddw m5, [src+12] ; b1
> + paddw m6, [src+10] ; c1
> + FILT_H2 m1, m2, m3, m4, m5, m6
> + FILT_PACK m1, m4, 6, m7
> + movntq [r0+r2], m1
> + add r2, 8
> + jl .loop
> + REP_RET
> +
>
> +;-----------------------------------------------------------------------------
> +; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
>
> +;-----------------------------------------------------------------------------
> +cglobal hpel_filter_h_mmx2, 3,3
> + add r0, r2
> + add r1, r2
> + neg r2
> + %define src r1+r2
> + pxor m0, m0
> +.loop:
> + movd m1, [src-2]
> + movd m2, [src-1]
> + movd m3, [src ]
> + movd m6, [src+1]
> + movd m4, [src+2]
> + movd m5, [src+3]
> + punpcklbw m1, m0
> + punpcklbw m2, m0
> + punpcklbw m3, m0
> + punpcklbw m6, m0
> + punpcklbw m4, m0
> + punpcklbw m5, m0
> + paddw m3, m6 ; c0
> + paddw m2, m4 ; b0
> + paddw m1, m5 ; a0
> + movd m7, [src+7]
> + movd m6, [src+6]
> + punpcklbw m7, m0
> + punpcklbw m6, m0
> + paddw m4, m7 ; c1
> + paddw m5, m6 ; b1
> + movd m7, [src+5]
> + movd m6, [src+4]
> + punpcklbw m7, m0
> + punpcklbw m6, m0
> + paddw m6, m7 ; a1
> + movq m7, [pw_1]
> + FILT_H2 m1, m2, m3, m4, m5, m6
> + FILT_PACK m1, m4, 1, m7
> + movntq [r0+r2], m1
> + add r2, 8
> + jl .loop
> + REP_RET
> +
> +INIT_XMM
> +
> +%macro HPEL_C 0
>
> +;-----------------------------------------------------------------------------
> +; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
>
> +;-----------------------------------------------------------------------------
> +cglobal hpel_filter_c, 3,3,9
> + add r0, r2
> + lea r1, [r1+r2*2]
> + neg r2
> + %define src r1+r2*2
> +%ifnidn cpuname, sse2
> + mova m7, [pw_32]
> + %define tpw_32 m7
> +%elif ARCH_X86_64
> + mova m8, [pw_32]
> + %define tpw_32 m8
> +%else
> + %define tpw_32 [pw_32]
> +%endif
> +; This doesn't seem to be faster (with AVX) on Sandy Bridge or
> Bulldozer...
> +%if cpuflag(misalign)
> +.loop:
> + movu m4, [src-4]
> + movu m5, [src-2]
> + mova m6, [src]
> + movu m3, [src+12]
> + movu m2, [src+14]
> + mova m1, [src+16]
> + paddw m4, [src+6]
> + paddw m5, [src+4]
> + paddw m6, [src+2]
> + paddw m3, [src+22]
> + paddw m2, [src+20]
> + paddw m1, [src+18]
> + FILT_H2 m4, m5, m6, m3, m2, m1
> +%else
> + mova m0, [src-16]
> + mova m1, [src]
> +.loop:
> + mova m2, [src+16]
> + PALIGNR m4, m1, m0, 12, m7
> + PALIGNR m5, m1, m0, 14, m0
> + PALIGNR m0, m2, m1, 6, m7
> + paddw m4, m0
> + PALIGNR m0, m2, m1, 4, m7
> + paddw m5, m0
> + PALIGNR m6, m2, m1, 2, m7
> + paddw m6, m1
> + FILT_H m4, m5, m6
> +
> + mova m0, m2
> + mova m5, m2
> + PALIGNR m2, m1, 12, m7
> + PALIGNR m5, m1, 14, m1
> + mova m1, [src+32]
> + PALIGNR m3, m1, m0, 6, m7
> + paddw m3, m2
> + PALIGNR m6, m1, m0, 4, m7
> + paddw m5, m6
> + PALIGNR m6, m1, m0, 2, m7
> + paddw m6, m0
> + FILT_H m3, m5, m6
> +%endif
> + FILT_PACK m4, m3, 6, tpw_32
> + movntps [r0+r2], m4
> + add r2, 16
> + jl .loop
> + REP_RET
> +%endmacro
> +
>
> +;-----------------------------------------------------------------------------
> +; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
>
> +;-----------------------------------------------------------------------------
> +cglobal hpel_filter_h_sse2, 3,3,8
> + add r0, r2
> + add r1, r2
> + neg r2
> + %define src r1+r2
> + pxor m0, m0
> +.loop:
> + movh m1, [src-2]
> + movh m2, [src-1]
> + movh m3, [src ]
> + movh m4, [src+1]
> + movh m5, [src+2]
> + movh m6, [src+3]
> + punpcklbw m1, m0
> + punpcklbw m2, m0
> + punpcklbw m3, m0
> + punpcklbw m4, m0
> + punpcklbw m5, m0
> + punpcklbw m6, m0
> + paddw m3, m4 ; c0
> + paddw m2, m5 ; b0
> + paddw m1, m6 ; a0
> + movh m4, [src+6]
> + movh m5, [src+7]
> + movh m6, [src+10]
> + movh m7, [src+11]
> + punpcklbw m4, m0
> + punpcklbw m5, m0
> + punpcklbw m6, m0
> + punpcklbw m7, m0
> + paddw m5, m6 ; b1
> + paddw m4, m7 ; a1
> + movh m6, [src+8]
> + movh m7, [src+9]
> + punpcklbw m6, m0
> + punpcklbw m7, m0
> + paddw m6, m7 ; c1
> + mova m7, [pw_1] ; FIXME xmm8
> + FILT_H2 m1, m2, m3, m4, m5, m6
> + FILT_PACK m1, m4, 1, m7
> + movntps [r0+r2], m1
> + add r2, 16
> + jl .loop
> + REP_RET
> +
>
> +;-----------------------------------------------------------------------------
> +; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
>
> +;-----------------------------------------------------------------------------
> +%macro HPEL_H 0
> +cglobal hpel_filter_h, 3,3
> + add r0, r2
> + add r1, r2
> + neg r2
> + %define src r1+r2
> + mova m0, [src-16]
> + mova m1, [src]
> + mova m7, [pw_16]
> +.loop:
> + mova m2, [src+16]
> + ; Using unaligned loads instead of palignr is marginally slower on SB
> and significantly
> + ; slower on Bulldozer, despite their fast load units -- even though
> it would let us avoid
> + ; the repeated loads of constants for pmaddubsw.
> + palignr m3, m1, m0, 14
> + palignr m4, m1, m0, 15
> + palignr m0, m2, m1, 2
> + pmaddubsw m3, [filt_mul15]
> + pmaddubsw m4, [filt_mul15]
> + pmaddubsw m0, [filt_mul51]
> + palignr m5, m2, m1, 1
> + palignr m6, m2, m1, 3
> + paddw m3, m0
> + mova m0, m1
> + pmaddubsw m1, [filt_mul20]
> + pmaddubsw m5, [filt_mul20]
> + pmaddubsw m6, [filt_mul51]
> + paddw m3, m1
> + paddw m4, m5
> + paddw m4, m6
> + FILT_PACK m3, m4, 5, m7
> + pshufb m3, [hpel_shuf]
> + mova m1, m2
> + movntps [r0+r2], m3
> + add r2, 16
> + jl .loop
> + REP_RET
> +%endmacro
> +
> +INIT_MMX mmx2
> +HPEL_V 0
> +INIT_XMM sse2
> +HPEL_V 8
> +INIT_XMM sse2, misalign
> +HPEL_C
> +%if ARCH_X86_64 == 0
> +INIT_XMM sse2
> +HPEL_C
> +INIT_XMM ssse3
> +HPEL_C
> +HPEL_V 0
> +HPEL_H
> +INIT_XMM avx
> +HPEL_C
> +HPEL_V 0
> +HPEL_H
> +%endif
> +
> +%if ARCH_X86_64
> +%macro DO_FILT_V 5
> + ;The optimum prefetch distance is difficult to determine in checkasm:
> + ;any prefetch seems slower than not prefetching.
> + ;In real use, the prefetch seems to be a slight win.
> + ;+16 is picked somewhat arbitrarily here based on the fact that even
> one
> + ;loop iteration is going to take longer than the prefetch.
> + prefetcht0 [r1+r2*2+16]
> +%if cpuflag(ssse3)
> + mova m1, [r3]
> + mova m2, [r3+r2]
> + mova %3, [r3+r2*2]
> + mova m3, [r1]
> + mova %1, [r1+r2]
> + mova %2, [r1+r2*2]
> + punpckhbw m4, m1, m2
> + punpcklbw m1, m2
> + punpckhbw m2, %1, %2
> + punpcklbw %1, %2
> + punpckhbw %2, m3, %3
> + punpcklbw m3, %3
> +
> + pmaddubsw m1, m12
> + pmaddubsw m4, m12
> + pmaddubsw %1, m0
> + pmaddubsw m2, m0
> + pmaddubsw m3, m14
> + pmaddubsw %2, m14
> +
> + paddw m1, %1
> + paddw m4, m2
> + paddw m1, m3
> + paddw m4, %2
> +%else
> + LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
> + LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
> + LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
> + packuswb %3, %4
> + FILT_V2 m1, m2, m3, m4, m5, m6
> +%endif
> + add r3, 16
> + add r1, 16
> + mova %1, m1
> + mova %2, m4
> + FILT_PACK m1, m4, 5, m15
> + movntps [r8+r4+%5], m1
> +%endmacro
> +
> +%macro FILT_C 4
> + PALIGNR m1, %2, %1, 12, m2
> + PALIGNR m2, %2, %1, 14, %1
> + PALIGNR m3, %3, %2, 4, %1
> + PALIGNR m4, %3, %2, 2, %1
> + paddw m3, m2
> + mova %1, %3
> + PALIGNR %3, %2, 6, m2
> + paddw m4, %2
> + paddw %3, m1
> + FILT_H %3, m3, m4
> +%endmacro
> +
> +%macro DO_FILT_C 4
> + FILT_C %1, %2, %3, 6
> + FILT_C %2, %1, %4, 6
> + FILT_PACK %3, %4, 6, m15
> + movntps [r5+r4], %3
> +%endmacro
> +
> +%macro ADD8TO16 5
> + punpckhbw %3, %1, %5
> + punpcklbw %1, %5
> + punpcklbw %4, %2, %5
> + punpckhbw %2, %5
> + paddw %2, %3
> + paddw %1, %4
> +%endmacro
> +
> +%macro DO_FILT_H 3
> + PALIGNR m1, %2, %1, 14, m3
> + PALIGNR m2, %2, %1, 15, m3
> + PALIGNR m4, %3, %2, 1 , m3
> + PALIGNR m5, %3, %2, 2 , m3
> + PALIGNR m6, %3, %2, 3 , m3
> + mova %1, %2
> +%if cpuflag(ssse3)
> + pmaddubsw m1, m12
> + pmaddubsw m2, m12
> + pmaddubsw %2, m14
> + pmaddubsw m4, m14
> + pmaddubsw m5, m0
> + pmaddubsw m6, m0
> + paddw m1, %2
> + paddw m2, m4
> + paddw m1, m5
> + paddw m2, m6
> + FILT_PACK m1, m2, 5, m15
> + pshufb m1, [hpel_shuf]
> +%else ; ssse3, avx
> + ADD8TO16 m1, m6, m12, m3, m0 ; a
> + ADD8TO16 m2, m5, m12, m3, m0 ; b
> + ADD8TO16 %2, m4, m12, m3, m0 ; c
> + FILT_V2 m1, m2, %2, m6, m5, m4
> + FILT_PACK m1, m6, 5, m15
> +%endif
> + movntps [r0+r4], m1
> + mova %2, %3
> +%endmacro
> +
> +%macro HPEL 0
>
> +;-----------------------------------------------------------------------------
> +; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
> +; uint8_t *src, intptr_t stride, int width, int height )
>
> +;-----------------------------------------------------------------------------
> +cglobal hpel_filter, 7,9,16
> + mov r7, r3
> + sub r5d, 16
> + mov r8, r1
> + and r7, 15
> + sub r3, r7
> + add r0, r5
> + add r8, r5
> + add r7, r5
> + add r5, r2
> + mov r2, r4
> + neg r7
> + lea r1, [r3+r2]
> + sub r3, r2
> + sub r3, r2
> + mov r4, r7
> + mova m15, [pw_16]
> +%if cpuflag(ssse3)
> + mova m0, [filt_mul51]
> + mova m12, [filt_mul15]
> + mova m14, [filt_mul20]
> +%else
> + pxor m0, m0
> +%endif
> +;ALIGN 16
> +.loopy:
> +; first filter_v
> + DO_FILT_V m8, m7, m13, m12, 0
> +;ALIGN 16
> +.loopx:
> + DO_FILT_V m6, m5, m11, m12, 16
> +.lastx:
> + paddw m15, m15 ; pw_32
> + DO_FILT_C m9, m8, m7, m6
> + psrlw m15, 1 ; pw_16
> + movdqa m7, m5
> + DO_FILT_H m10, m13, m11
> + add r4, 16
> + jl .loopx
> + cmp r4, 16
> + jl .lastx
> +; setup regs for next y
> + sub r4, r7
> + sub r4, r2
> + sub r1, r4
> + sub r3, r4
> + add r0, r2
> + add r8, r2
> + add r5, r2
> + mov r4, r7
> + sub r6d, 1
> + jg .loopy
> + sfence
> + RET
> +%endmacro
> +
> +INIT_XMM sse2
> +HPEL
> +INIT_XMM ssse3
> +HPEL
> +INIT_XMM avx
> +HPEL
> +%endif ; ARCH_X86_64
> +
> +%undef movntq
> +%undef movntps
> +%undef sfence
> +%endif ; !HIGH_BIT_DEPTH
> +
>
> +;-----------------------------------------------------------------------------
> +; void plane_copy_core( pixel *dst, intptr_t i_dst,
> +; pixel *src, intptr_t i_src, int w, int h )
>
> +;-----------------------------------------------------------------------------
> +; assumes i_dst and w are multiples of 16, and i_dst>w
> +INIT_MMX
> +cglobal plane_copy_core_mmx2, 6,7
> + FIX_STRIDES r1, r3, r4d
> +%if HIGH_BIT_DEPTH == 0
> + movsxdifnidn r4, r4d
> +%endif
> + sub r1, r4
> + sub r3, r4
> +.loopy:
> + lea r6d, [r4-63]
> +.loopx:
> + prefetchnta [r2+256]
> + movq m0, [r2 ]
> + movq m1, [r2+ 8]
> + movntq [r0 ], m0
> + movntq [r0+ 8], m1
> + movq m2, [r2+16]
> + movq m3, [r2+24]
> + movntq [r0+16], m2
> + movntq [r0+24], m3
> + movq m4, [r2+32]
> + movq m5, [r2+40]
> + movntq [r0+32], m4
> + movntq [r0+40], m5
> + movq m6, [r2+48]
> + movq m7, [r2+56]
> + movntq [r0+48], m6
> + movntq [r0+56], m7
> + add r2, 64
> + add r0, 64
> + sub r6d, 64
> + jg .loopx
> + prefetchnta [r2+256]
> + add r6d, 63
> + jle .end16
> +.loop16:
> + movq m0, [r2 ]
> + movq m1, [r2+8]
> + movntq [r0 ], m0
> + movntq [r0+8], m1
> + add r2, 16
> + add r0, 16
> + sub r6d, 16
> + jg .loop16
> +.end16:
> + add r0, r1
> + add r2, r3
> + dec r5d
> + jg .loopy
> + sfence
> + emms
> + RET
> +
> +
> +%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
> +%if HIGH_BIT_DEPTH
> +%assign x 0
> +%rep 16/mmsize
> + mov%4 m0, [%2+(x/2)*mmsize]
> + mov%4 m1, [%3+(x/2)*mmsize]
> + punpckhwd m2, m0, m1
> + punpcklwd m0, m1
> + mov%5a [%1+(x+0)*mmsize], m0
> + mov%5a [%1+(x+1)*mmsize], m2
> + %assign x (x+2)
> +%endrep
> +%else
> + movq m0, [%2]
> +%if mmsize==16
> +%ifidn %4, a
> + punpcklbw m0, [%3]
> +%else
> + movq m1, [%3]
> + punpcklbw m0, m1
> +%endif
> + mov%5a [%1], m0
> +%else
> + movq m1, [%3]
> + punpckhbw m2, m0, m1
> + punpcklbw m0, m1
> + mov%5a [%1+0], m0
> + mov%5a [%1+8], m2
> +%endif
> +%endif ; HIGH_BIT_DEPTH
> +%endmacro
> +
> +%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant,
> is aligned
> +%if HIGH_BIT_DEPTH
> +%assign n 0
> +%rep 16/mmsize
> + mova m0, [%3+(n+0)*mmsize]
> + mova m1, [%3+(n+1)*mmsize]
> + psrld m2, m0, 16
> + psrld m3, m1, 16
> + pand m0, %5
> + pand m1, %5
> + packssdw m0, m1
> + packssdw m2, m3
> + mov%6 [%1+(n/2)*mmsize], m0
> + mov%6 [%2+(n/2)*mmsize], m2
> + %assign n (n+2)
> +%endrep
> +%else ; !HIGH_BIT_DEPTH
> +%if mmsize==16
> + mova m0, [%3]
> +%if cpuflag(ssse3)
> + pshufb m0, %5
> +%else
> + mova m1, m0
> + pand m0, %5
> + psrlw m1, 8
> + packuswb m0, m1
> +%endif
> +%if %4
> + mova [%1], m0
> +%else
> + movq [%1], m0
> + movhps [%2], m0
> +%endif
> +%else
> + mova m0, [%3]
> + mova m1, [%3+8]
> + mova m2, m0
> + mova m3, m1
> + pand m0, %5
> + pand m1, %5
> + psrlw m2, 8
> + psrlw m3, 8
> + packuswb m0, m1
> + packuswb m2, m3
> + mova [%1], m0
> + mova [%2], m2
> +%endif ; mmsize == 16
> +%endif ; HIGH_BIT_DEPTH
> +%endmacro
> +
> +%macro PLANE_INTERLEAVE 0
>
> +;-----------------------------------------------------------------------------
> +; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst,
> +; uint8_t *srcu, intptr_t i_srcu,
> +; uint8_t *srcv, intptr_t i_srcv, int w,
> int h )
>
> +;-----------------------------------------------------------------------------
> +; assumes i_dst and w are multiples of 16, and i_dst>2*w
> +cglobal plane_copy_interleave_core, 6,9
> + mov r6d, r6m
> +%if HIGH_BIT_DEPTH
> + FIX_STRIDES r1, r3, r5, r6d
> + movifnidn r1mp, r1
> + movifnidn r3mp, r3
> + mov r6m, r6d
> +%endif
> + lea r0, [r0+r6*2]
> + add r2, r6
> + add r4, r6
> +%if ARCH_X86_64
> + DECLARE_REG_TMP 7,8
> +%else
> + DECLARE_REG_TMP 1,3
> +%endif
> + mov t1, r1
> + shr t1, SIZEOF_PIXEL
> + sub t1, r6
> + mov t0d, r7m
> +.loopy:
> + mov r6d, r6m
> + neg r6
> +.prefetch:
> + prefetchnta [r2+r6]
> + prefetchnta [r4+r6]
> + add r6, 64
> + jl .prefetch
> + mov r6d, r6m
> + neg r6
> +.loopx:
> + INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL,
> r4+r6+0*SIZEOF_PIXEL, u, nt
> + INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL,
> r4+r6+8*SIZEOF_PIXEL, u, nt
> + add r6, 16*SIZEOF_PIXEL
> + jl .loopx
> +.pad:
> +%assign n 0
> +%rep SIZEOF_PIXEL
> +%if mmsize==8
> + movntq [r0+r6*2+(n+ 0)], m0
> + movntq [r0+r6*2+(n+ 8)], m0
> + movntq [r0+r6*2+(n+16)], m0
> + movntq [r0+r6*2+(n+24)], m0
> +%else
> + movntdq [r0+r6*2+(n+ 0)], m0
> + movntdq [r0+r6*2+(n+16)], m0
> +%endif
> + %assign n n+32
> +%endrep
> + add r6, 16*SIZEOF_PIXEL
> + cmp r6, t1
> + jl .pad
> + add r0, r1mp
> + add r2, r3mp
> + add r4, r5
> + dec t0d
> + jg .loopy
> + sfence
> + emms
> + RET
> +
>
> +;-----------------------------------------------------------------------------
> +; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t
> *srcu, uint8_t *srcv, int height )
>
> +;-----------------------------------------------------------------------------
> +cglobal store_interleave_chroma, 5,5
> + FIX_STRIDES r1
> +.loop:
> + INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
> + INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
> + add r2, FDEC_STRIDEB*2
> + add r3, FDEC_STRIDEB*2
> + lea r0, [r0+r1*2]
> + sub r4d, 2
> + jg .loop
> + REP_RET
> +%endmacro ; PLANE_INTERLEAVE
> +
> +%macro DEINTERLEAVE_START 0
> +%if HIGH_BIT_DEPTH
> + mova m4, [pd_ffff]
> +%elif cpuflag(ssse3)
> + mova m4, [deinterleave_shuf]
> +%else
> + mova m4, [pw_00ff]
> +%endif ; HIGH_BIT_DEPTH
> +%endmacro
> +
> +%macro PLANE_DEINTERLEAVE 0
>
> +;-----------------------------------------------------------------------------
> +; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
> +; pixel *dstv, intptr_t i_dstv,
> +; pixel *src, intptr_t i_src, int w, int h
> )
>
> +;-----------------------------------------------------------------------------
> +cglobal plane_copy_deinterleave, 6,7
> + DEINTERLEAVE_START
> + mov r6d, r6m
> + FIX_STRIDES r1, r3, r5, r6d
> +%if HIGH_BIT_DEPTH
> + mov r6m, r6d
> +%endif
> + add r0, r6
> + add r2, r6
> + lea r4, [r4+r6*2]
> +.loopy:
> + mov r6d, r6m
> + neg r6
> +.loopx:
> + DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+
> 0*SIZEOF_PIXEL, 0, m4, u
> + DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL,
> r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
> + add r6, 16*SIZEOF_PIXEL
> + jl .loopx
> + add r0, r1
> + add r2, r3
> + add r4, r5
> + dec dword r7m
> + jg .loopy
> + REP_RET
> +
>
> +;-----------------------------------------------------------------------------
> +; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t
> i_src, int height )
>
> +;-----------------------------------------------------------------------------
> +cglobal load_deinterleave_chroma_fenc, 4,4
> + DEINTERLEAVE_START
> + FIX_STRIDES r2
> +.loop:
> + DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
> + DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
> + add r0, FENC_STRIDEB*2
> + lea r1, [r1+r2*2]
> + sub r3d, 2
> + jg .loop
> + REP_RET
> +
>
> +;-----------------------------------------------------------------------------
> +; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t
> i_src, int height )
>
> +;-----------------------------------------------------------------------------
> +cglobal load_deinterleave_chroma_fdec, 4,4
> + DEINTERLEAVE_START
> + FIX_STRIDES r2
> +.loop:
> + DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
> + DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
> + add r0, FDEC_STRIDEB*2
> + lea r1, [r1+r2*2]
> + sub r3d, 2
> + jg .loop
> + REP_RET
> +%endmacro ; PLANE_DEINTERLEAVE
> +
> +%if HIGH_BIT_DEPTH
> +INIT_MMX mmx2
> +PLANE_INTERLEAVE
> +INIT_MMX mmx
> +PLANE_DEINTERLEAVE
> +INIT_XMM sse2
> +PLANE_INTERLEAVE
> +PLANE_DEINTERLEAVE
> +INIT_XMM avx
> +PLANE_INTERLEAVE
> +PLANE_DEINTERLEAVE
> +%else
> +INIT_MMX mmx2
> +PLANE_INTERLEAVE
> +INIT_MMX mmx
> +PLANE_DEINTERLEAVE
> +INIT_XMM sse2
> +PLANE_INTERLEAVE
> +PLANE_DEINTERLEAVE
> +INIT_XMM ssse3
> +PLANE_DEINTERLEAVE
> +%endif
> +
> +; These functions are not general-use; not only do the SSE ones require
> aligned input,
> +; but they also will fail if given a non-mod16 size.
> +; memzero SSE will fail for non-mod128.
> +
>
> +;-----------------------------------------------------------------------------
> +; void *memcpy_aligned( void *dst, const void *src, size_t n );
>
> +;-----------------------------------------------------------------------------
> +INIT_MMX
> +cglobal memcpy_aligned_mmx, 3,3
> + test r2d, 16
> + jz .copy32start
> + movq mm0, [r1 + r2 - 16]
> + movq mm1, [r1 + r2 - 8]
> + movq [r0 + r2 - 16], mm0
> + movq [r0 + r2 - 8], mm1
> + sub r2d, 16
> +.copy32start
> + test r2d, r2d
> + jz .ret
> +.copy32:
> + movq mm0, [r1 + r2 - 32]
> + movq mm1, [r1 + r2 - 24]
> + movq mm2, [r1 + r2 - 16]
> + movq mm3, [r1 + r2 - 8]
> + movq [r0 + r2 - 32], mm0
> + movq [r0 + r2 - 24], mm1
> + movq [r0 + r2 - 16], mm2
> + movq [r0 + r2 - 8], mm3
> + sub r2d, 32
> + jg .copy32
> +.ret
> + REP_RET
> +
>
> +;-----------------------------------------------------------------------------
> +; void *memcpy_aligned( void *dst, const void *src, size_t n );
>
> +;-----------------------------------------------------------------------------
> +cglobal memcpy_aligned_sse2, 3,3
> + test r2d, 16
> + jz .copy32
> + movdqa xmm0, [r1 + r2 - 16]
> + movdqa [r0 + r2 - 16], xmm0
> + sub r2d, 16
> +.copy32:
> + test r2d, 32
> + jz .copy64start
> + movdqa xmm0, [r1 + r2 - 32]
> + movdqa [r0 + r2 - 32], xmm0
> + movdqa xmm1, [r1 + r2 - 16]
> + movdqa [r0 + r2 - 16], xmm1
> + sub r2d, 32
> +.copy64start
> + test r2d, r2d
> + jz .ret
> +.copy64:
> + movdqa xmm0, [r1 + r2 - 64]
> + movdqa [r0 + r2 - 64], xmm0
> + movdqa xmm1, [r1 + r2 - 48]
> + movdqa [r0 + r2 - 48], xmm1
> + movdqa xmm2, [r1 + r2 - 32]
> + movdqa [r0 + r2 - 32], xmm2
> + movdqa xmm3, [r1 + r2 - 16]
> + movdqa [r0 + r2 - 16], xmm3
> + sub r2d, 64
> + jg .copy64
> +.ret:
> + REP_RET
> +
>
> +;-----------------------------------------------------------------------------
> +; void *memzero_aligned( void *dst, size_t n );
>
> +;-----------------------------------------------------------------------------
> +%macro MEMZERO 0
> +cglobal memzero_aligned, 2,2
> + add r0, r1
> + neg r1
> + pxor m0, m0
> +.loop:
> +%assign i 0
> +%rep 8
> + mova [r0 + r1 + i], m0
> +%assign i i+mmsize
> +%endrep
> + add r1, mmsize*8
> + jl .loop
> + REP_RET
> +%endmacro
> +
> +INIT_MMX mmx
> +MEMZERO
> +INIT_XMM sse2
> +MEMZERO
> +
> +
> +
> +%if HIGH_BIT_DEPTH == 0
>
> +;-----------------------------------------------------------------------------
> +; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
>
> +;-----------------------------------------------------------------------------
> +INIT_XMM
> +cglobal integral_init4h_sse4, 3,4
> + lea r3, [r0+r2*2]
> + add r1, r2
> + neg r2
> + pxor m4, m4
> +.loop:
> + movdqa m0, [r1+r2]
> + movdqa m1, [r1+r2+16]
> + palignr m1, m0, 8
> + mpsadbw m0, m4, 0
> + mpsadbw m1, m4, 0
> + paddw m0, [r0+r2*2]
> + paddw m1, [r0+r2*2+16]
> + movdqa [r3+r2*2 ], m0
> + movdqa [r3+r2*2+16], m1
> + add r2, 16
> + jl .loop
> + REP_RET
> +
> +%macro INTEGRAL_INIT8H 0
> +cglobal integral_init8h, 3,4
> + lea r3, [r0+r2*2]
> + add r1, r2
> + neg r2
> + pxor m4, m4
> +.loop:
> + movdqa m0, [r1+r2]
> + movdqa m1, [r1+r2+16]
> + palignr m1, m0, 8
> + mpsadbw m2, m0, m4, 4
> + mpsadbw m3, m1, m4, 4
> + mpsadbw m0, m4, 0
> + mpsadbw m1, m4, 0
> + paddw m0, [r0+r2*2]
> + paddw m1, [r0+r2*2+16]
> + paddw m0, m2
> + paddw m1, m3
> + movdqa [r3+r2*2 ], m0
> + movdqa [r3+r2*2+16], m1
> + add r2, 16
> + jl .loop
> + REP_RET
> +%endmacro
> +
> +INIT_XMM sse4
> +INTEGRAL_INIT8H
> +INIT_XMM avx
> +INTEGRAL_INIT8H
> +%endif ; !HIGH_BIT_DEPTH
> +
> +%macro INTEGRAL_INIT_8V 0
>
> +;-----------------------------------------------------------------------------
> +; void integral_init8v( uint16_t *sum8, intptr_t stride )
>
> +;-----------------------------------------------------------------------------
> +cglobal integral_init8v, 3,3
> + shl r1, 1
> + add r0, r1
> + lea r2, [r0+r1*8]
> + neg r1
> +.loop:
> + mova m0, [r2+r1]
> + mova m1, [r2+r1+mmsize]
> + psubw m0, [r0+r1]
> + psubw m1, [r0+r1+mmsize]
> + mova [r0+r1], m0
> + mova [r0+r1+mmsize], m1
> + add r1, 2*mmsize
> + jl .loop
> + REP_RET
> +%endmacro
> +
> +INIT_MMX mmx
> +INTEGRAL_INIT_8V
> +INIT_XMM sse2
> +INTEGRAL_INIT_8V
> +
>
> +;-----------------------------------------------------------------------------
> +; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
>
> +;-----------------------------------------------------------------------------
> +INIT_MMX
> +cglobal integral_init4v_mmx, 3,5
> + shl r2, 1
> + lea r3, [r0+r2*4]
> + lea r4, [r0+r2*8]
> + mova m0, [r0+r2]
> + mova m4, [r4+r2]
> +.loop:
> + mova m1, m4
> + psubw m1, m0
> + mova m4, [r4+r2-8]
> + mova m0, [r0+r2-8]
> + paddw m1, m4
> + mova m3, [r3+r2-8]
> + psubw m1, m0
> + psubw m3, m0
> + mova [r0+r2-8], m1
> + mova [r1+r2-8], m3
> + sub r2, 8
> + jge .loop
> + REP_RET
> +
> +INIT_XMM
> +cglobal integral_init4v_sse2, 3,5
> + shl r2, 1
> + add r0, r2
> + add r1, r2
> + lea r3, [r0+r2*4]
> + lea r4, [r0+r2*8]
> + neg r2
> +.loop:
> + mova m0, [r0+r2]
> + mova m1, [r4+r2]
> + mova m2, m0
> + mova m4, m1
> + shufpd m0, [r0+r2+16], 1
> + shufpd m1, [r4+r2+16], 1
> + paddw m0, m2
> + paddw m1, m4
> + mova m3, [r3+r2]
> + psubw m1, m0
> + psubw m3, m2
> + mova [r0+r2], m1
> + mova [r1+r2], m3
> + add r2, 16
> + jl .loop
> + REP_RET
> +
> +cglobal integral_init4v_ssse3, 3,5
> + shl r2, 1
> + add r0, r2
> + add r1, r2
> + lea r3, [r0+r2*4]
> + lea r4, [r0+r2*8]
> + neg r2
> +.loop:
> + mova m2, [r0+r2]
> + mova m0, [r0+r2+16]
> + mova m4, [r4+r2]
> + mova m1, [r4+r2+16]
> + palignr m0, m2, 8
> + palignr m1, m4, 8
> + paddw m0, m2
> + paddw m1, m4
> + mova m3, [r3+r2]
> + psubw m1, m0
> + psubw m3, m2
> + mova [r0+r2], m1
> + mova [r1+r2], m3
> + add r2, 16
> + jl .loop
> + REP_RET
> +
> +%macro FILT8x4 7
> + mova %3, [r0+%7]
> + mova %4, [r0+r5+%7]
> + pavgb %3, %4
> + pavgb %4, [r0+r5*2+%7]
> + PALIGNR %1, %3, 1, m6
> + PALIGNR %2, %4, 1, m6
> +%if cpuflag(xop)
> + pavgb %1, %3
> + pavgb %2, %4
> +%else
> + pavgb %1, %3
> + pavgb %2, %4
> + psrlw %5, %1, 8
> + psrlw %6, %2, 8
> + pand %1, m7
> + pand %2, m7
> +%endif
> +%endmacro
> +
> +%macro FILT16x2 4
> + mova m3, [r0+%4+mmsize]
> + mova m2, [r0+%4]
> + pavgb m3, [r0+%4+r5+mmsize]
> + pavgb m2, [r0+%4+r5]
> + PALIGNR %1, m3, 1, m6
> + pavgb %1, m3
> + PALIGNR m3, m2, 1, m6
> + pavgb m3, m2
> +%if cpuflag(xop)
> + vpperm m5, m3, %1, m7
> + vpperm m3, m3, %1, m6
> +%else
> + psrlw m5, m3, 8
> + psrlw m4, %1, 8
> + pand m3, m7
> + pand %1, m7
> + packuswb m3, %1
> + packuswb m5, m4
> +%endif
> + mova [%2], m3
> + mova [%3], m5
> + mova %1, m2
> +%endmacro
> +
> +%macro FILT8x2U 3
> + mova m3, [r0+%3+8]
> + mova m2, [r0+%3]
> + pavgb m3, [r0+%3+r5+8]
> + pavgb m2, [r0+%3+r5]
> + mova m1, [r0+%3+9]
> + mova m0, [r0+%3+1]
> + pavgb m1, [r0+%3+r5+9]
> + pavgb m0, [r0+%3+r5+1]
> + pavgb m1, m3
> + pavgb m0, m2
> + psrlw m3, m1, 8
> + psrlw m2, m0, 8
> + pand m1, m7
> + pand m0, m7
> + packuswb m0, m1
> + packuswb m2, m3
> + mova [%1], m0
> + mova [%2], m2
> +%endmacro
> +
> +%macro FILT8xU 3
> + mova m3, [r0+%3+8]
> + mova m2, [r0+%3]
> + pavgw m3, [r0+%3+r5+8]
> + pavgw m2, [r0+%3+r5]
> + movu m1, [r0+%3+10]
> + movu m0, [r0+%3+2]
> + pavgw m1, [r0+%3+r5+10]
> + pavgw m0, [r0+%3+r5+2]
> + pavgw m1, m3
> + pavgw m0, m2
> + psrld m3, m1, 16
> + psrld m2, m0, 16
> + pand m1, m7
> + pand m0, m7
> + packssdw m0, m1
> + packssdw m2, m3
> + movu [%1], m0
> + mova [%2], m2
> +%endmacro
> +
> +%macro FILT8xA 4
> + mova m3, [r0+%4+mmsize]
> + mova m2, [r0+%4]
> + pavgw m3, [r0+%4+r5+mmsize]
> + pavgw m2, [r0+%4+r5]
> + PALIGNR %1, m3, 2, m6
> + pavgw %1, m3
> + PALIGNR m3, m2, 2, m6
> + pavgw m3, m2
> +%if cpuflag(xop)
> + vpperm m5, m3, %1, m7
> + vpperm m3, m3, %1, m6
> +%else
> + psrld m5, m3, 16
> + psrld m4, %1, 16
> + pand m3, m7
> + pand %1, m7
> + packssdw m3, %1
> + packssdw m5, m4
> +%endif
> + mova [%2], m3
> + mova [%3], m5
> + mova %1, m2
> +%endmacro
> +
>
> +;-----------------------------------------------------------------------------
> +; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t
> *dsth, uint8_t *dstv, uint8_t *dstc,
> +; intptr_t src_stride, intptr_t dst_stride,
> int width, int height )
>
> +;-----------------------------------------------------------------------------
> +%macro FRAME_INIT_LOWRES 0
> +cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for
> HIGH_BIT_DEPTH, 12 otherwise
> +%if HIGH_BIT_DEPTH
> + shl dword r6m, 1
> + FIX_STRIDES r5
> + shl dword r7m, 1
> +%endif
> + ; src += 2*(height-1)*stride + 2*width
> + mov r6d, r8m
> + dec r6d
> + imul r6d, r5d
> + add r6d, r7m
> + lea r0, [r0+r6*2]
> + ; dst += (height-1)*stride + width
> + mov r6d, r8m
> + dec r6d
> + imul r6d, r6m
> + add r6d, r7m
> + add r1, r6
> + add r2, r6
> + add r3, r6
> + add r4, r6
> + ; gap = stride - width
> + mov r6d, r6m
> + sub r6d, r7m
> + PUSH r6
> + %define dst_gap [rsp+gprsize]
> + mov r6d, r5d
> + sub r6d, r7m
> + shl r6d, 1
> + PUSH r6
> + %define src_gap [rsp]
> +%if HIGH_BIT_DEPTH
> +%if cpuflag(xop)
> + mova m6, [deinterleave_shuf32a]
> + mova m7, [deinterleave_shuf32b]
> +%else
> + pcmpeqw m7, m7
> + psrld m7, 16
> +%endif
> +.vloop:
> + mov r6d, r7m
> +%ifnidn cpuname, mmx2
> + mova m0, [r0]
> + mova m1, [r0+r5]
> + pavgw m0, m1
> + pavgw m1, [r0+r5*2]
> +%endif
> +.hloop:
> + sub r0, mmsize*2
> + sub r1, mmsize
> + sub r2, mmsize
> + sub r3, mmsize
> + sub r4, mmsize
> +%ifidn cpuname, mmx2
> + FILT8xU r1, r2, 0
> + FILT8xU r3, r4, r5
> +%else
> + FILT8xA m0, r1, r2, 0
> + FILT8xA m1, r3, r4, r5
> +%endif
> + sub r6d, mmsize
> + jg .hloop
> +%else ; !HIGH_BIT_DEPTH
> +%if mmsize == 16
> + ; adjust for the odd end case
> + mov r6d, r7m
> + and r6d, 8
> + sub r1, r6
> + sub r2, r6
> + sub r3, r6
> + sub r4, r6
> + add dst_gap, r6d
> +%endif ; mmsize
> +%if cpuflag(xop)
> + mova m6, [deinterleave_shuf32a]
> + mova m7, [deinterleave_shuf32b]
> +%else
> + pcmpeqb m7, m7
> + psrlw m7, 8
> +%endif
> +.vloop:
> + mov r6d, r7m
> +%ifnidn cpuname, mmx2
> + mova m0, [r0]
> + mova m1, [r0+r5]
> + pavgb m0, m1
> + pavgb m1, [r0+r5*2]
> +%endif
> +%if mmsize == 16
> + test r6d, 8
> + jz .hloop
> + sub r0, 16
> + FILT8x4 m0, m1, m2, m3, m4, m5, 0
> +%if cpuflag(xop)
> + mova m4, m0
> + vpperm m0, m4, m1, m6
> + vpperm m1, m4, m1, m7
> + movq [r1], m0
> + movq [r2], m1
> + movhps [r3], m0
> + movhps [r4], m1
> +%else
> + packuswb m0, m4
> + packuswb m1, m5
> + movq [r1], m0
> + movhps [r2], m0
> + movq [r3], m1
> + movhps [r4], m1
> +%endif
> + mova m0, m2
> + mova m1, m3
> + sub r6d, 8
> + jz .skip
> +%endif ; mmsize
> +.hloop:
> + sub r0, mmsize*2
> + sub r1, mmsize
> + sub r2, mmsize
> + sub r3, mmsize
> + sub r4, mmsize
> +%ifdef m8
> + FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
> + mova m8, m0
> + mova m9, m1
> + FILT8x4 m2, m3, m0, m1, m4, m5, 0
> +%if cpuflag(xop)
> + vpperm m4, m2, m8, m7
> + vpperm m2, m2, m8, m6
> + vpperm m5, m3, m9, m7
> + vpperm m3, m3, m9, m6
> +%else
> + packuswb m2, m8
> + packuswb m3, m9
> + packuswb m4, m10
> + packuswb m5, m11
> +%endif
> + mova [r1], m2
> + mova [r2], m4
> + mova [r3], m3
> + mova [r4], m5
> +%elifidn cpuname, mmx2
> + FILT8x2U r1, r2, 0
> + FILT8x2U r3, r4, r5
> +%else
> + FILT16x2 m0, r1, r2, 0
> + FILT16x2 m1, r3, r4, r5
> +%endif
> + sub r6d, mmsize
> + jg .hloop
> +%endif ; HIGH_BIT_DEPTH
> +.skip:
> + mov r6, dst_gap
> + sub r0, src_gap
> + sub r1, r6
> + sub r2, r6
> + sub r3, r6
> + sub r4, r6
> + dec dword r8m
> + jg .vloop
> + ADD rsp, 2*gprsize
> + emms
> + RET
> +%endmacro ; FRAME_INIT_LOWRES
> +
> +INIT_MMX mmx2
> +FRAME_INIT_LOWRES
> +%if ARCH_X86_64 == 0
> +INIT_MMX cache32, mmx2
> +FRAME_INIT_LOWRES
> +%endif
> +INIT_XMM sse2
> +FRAME_INIT_LOWRES
> +INIT_XMM ssse3
> +FRAME_INIT_LOWRES
> +INIT_XMM avx
> +FRAME_INIT_LOWRES
> +INIT_XMM xop
> +FRAME_INIT_LOWRES
> +
>
> +;-----------------------------------------------------------------------------
> +; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t
> *intra_costs,
> +; uint16_t *inter_costs, uint16_t
> *inv_qscales, float *fps_factor, int len )
>
> +;-----------------------------------------------------------------------------
> +%macro MBTREE 0
> +cglobal mbtree_propagate_cost, 7,7,7
> + add r6d, r6d
> + lea r0, [r0+r6*2]
> + add r1, r6
> + add r2, r6
> + add r3, r6
> + add r4, r6
> + neg r6
> + pxor xmm4, xmm4
> + movss xmm6, [r5]
> + shufps xmm6, xmm6, 0
> + mulps xmm6, [pf_inv256]
> + movdqa xmm5, [pw_3fff]
> +.loop:
> + movq xmm2, [r2+r6] ; intra
> + movq xmm0, [r4+r6] ; invq
> + movq xmm3, [r3+r6] ; inter
> + movq xmm1, [r1+r6] ; prop
> + punpcklwd xmm2, xmm4
> + punpcklwd xmm0, xmm4
> + pmaddwd xmm0, xmm2
> + pand xmm3, xmm5
> + punpcklwd xmm1, xmm4
> + punpcklwd xmm3, xmm4
> +%if cpuflag(fma4)
> + cvtdq2ps xmm0, xmm0
> + cvtdq2ps xmm1, xmm1
> + vfmaddps xmm0, xmm0, xmm6, xmm1
> + cvtdq2ps xmm1, xmm2
> + psubd xmm2, xmm3
> + cvtdq2ps xmm2, xmm2
> + rcpps xmm3, xmm1
> + mulps xmm1, xmm3
> + mulps xmm0, xmm2
> + addps xmm2, xmm3, xmm3
> + vfnmaddps xmm3, xmm1, xmm3, xmm2
> + mulps xmm0, xmm3
> +%else
> + cvtdq2ps xmm0, xmm0
> + mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
> + cvtdq2ps xmm1, xmm1 ; prop
> + addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8)
> + cvtdq2ps xmm1, xmm2 ; intra
> + psubd xmm2, xmm3 ; intra - inter
> + cvtdq2ps xmm2, xmm2 ; intra - inter
> + rcpps xmm3, xmm1 ; 1 / intra 1st approximation
> + mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
> + mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
> + mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) *
> (intra - inter)
> + addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
> + subps xmm3, xmm1 ; 2nd approximation for 1/intra
> + mulps xmm0, xmm3 ; / intra
> +%endif
> + cvtps2dq xmm0, xmm0
> + movdqa [r0+r6*2], xmm0
> + add r6, 8
> + jl .loop
> + REP_RET
> +%endmacro
> +
> +INIT_XMM sse2
> +MBTREE
> +; Bulldozer only has a 128-bit float unit, so the AVX version of this
> function is actually slower.
> +INIT_XMM fma4
> +MBTREE
> +
> +%macro INT16_TO_FLOAT 1
> + vpunpckhwd xmm4, xmm%1, xmm7
> + vpunpcklwd xmm%1, xmm7
> + vinsertf128 ymm%1, ymm%1, xmm4, 1
> + vcvtdq2ps ymm%1, ymm%1
> +%endmacro
> +
> +; FIXME: align loads/stores to 16 bytes
> +INIT_YMM avx
> +cglobal mbtree_propagate_cost, 7,7,8
> + add r6d, r6d
> + lea r0, [r0+r6*2]
> + add r1, r6
> + add r2, r6
> + add r3, r6
> + add r4, r6
> + neg r6
> + vmovdqa xmm5, [pw_3fff]
> + vbroadcastss ymm6, [r5]
> + vmulps ymm6, ymm6, [pf_inv256]
> + vpxor xmm7, xmm7
> +.loop:
> + vmovdqu xmm0, [r2+r6] ; intra
> + vmovdqu xmm1, [r4+r6] ; invq
> + vmovdqu xmm2, [r1+r6] ; prop
> + vpand xmm3, xmm5, [r3+r6] ; inter
> + INT16_TO_FLOAT 0
> + INT16_TO_FLOAT 1
> + INT16_TO_FLOAT 2
> + INT16_TO_FLOAT 3
> + vmulps ymm1, ymm1, ymm0
> + vsubps ymm4, ymm0, ymm3
> + vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8
> + vaddps ymm1, ymm1, ymm2 ; prop + (intra*invq*fps_factor>>8)
> + vrcpps ymm3, ymm0 ; 1 / intra 1st approximation
> + vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx)
> + vmulps ymm2, ymm2, ymm3 ; intra * (1/intra 1st approx)^2
> + vmulps ymm1, ymm1, ymm4 ; (prop +
> (intra*invq*fps_factor>>8)) * (intra - inter)
> + vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx)
> + vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra
> + vmulps ymm1, ymm1, ymm3 ; / intra
> + vcvtps2dq ymm1, ymm1
> + vmovdqu [r0+r6*2], ymm1
> + add r6, 16
> + jl .loop
> + vzeroupper
> + RET
> diff -r f2f70fa9b4f3 -r 2454a81c67fa source/common/x86/pixel.h
> --- a/source/common/x86/pixel.h Fri Jul 26 02:19:06 2013 -0500
> +++ b/source/common/x86/pixel.h Fri Jul 26 14:12:31 2013 -0700
> @@ -202,6 +202,9 @@
> uint64_t x265_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
> uint64_t x265_pixel_sa8d_satd_16x16_avx2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
>
> +void x265_frame_init_lowres_core_mmx2( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
> +                                        intptr_t src_stride, intptr_t dst_stride, int width, int height );
> +
> #define DECL_SSD(width,suffix)\
> int x265_pixel_ssd_##width##x64_##suffix( pixel *, intptr_t, pixel *, intptr_t ); \
> int x265_pixel_ssd_##width##x48_##suffix( pixel *, intptr_t, pixel *, intptr_t ); \
> diff -r f2f70fa9b4f3 -r 2454a81c67fa source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Fri Jul 26 02:19:06 2013 -0500
> +++ b/source/test/pixelharness.cpp Fri Jul 26 14:12:31 2013 -0700
> @@ -54,11 +54,13 @@
> {
>     pbuf1 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);
>     pbuf2 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);
> +   pbuf3 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);
> +   pbuf4 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);
>
>     sbuf1 = (short*)TestHarness::alignedMalloc(sizeof(short), 64 * 64 * 32, 32);
>     sbuf2 = (short*)TestHarness::alignedMalloc(sizeof(short), 64 * 64 * 32, 32);
>
> - if (!pbuf1 || !pbuf2)
> + if (!pbuf1 || !pbuf2 || !pbuf3 || !pbuf4 )
> {
> fprintf(stderr, "malloc failed, unable to initiate tests!\n");
> exit(1);
> @@ -69,6 +71,8 @@
> //Generate the Random Buffer for Testing
> pbuf1[i] = rand() & PIXEL_MAX;
> pbuf2[i] = rand() & PIXEL_MAX;
> + pbuf3[i] = rand() & PIXEL_MAX;
> + pbuf4[i] = rand() & PIXEL_MAX;
>
> sbuf1[i] = rand() & PIXEL_MAX;
> sbuf2[i] = rand() & PIXEL_MAX;
> @@ -79,6 +83,8 @@
> {
> TestHarness::alignedFree(pbuf1);
> TestHarness::alignedFree(pbuf2);
> + TestHarness::alignedFree(pbuf3);
> + TestHarness::alignedFree(pbuf4);
> TestHarness::alignedFree(sbuf1);
> TestHarness::alignedFree(sbuf2);
> }
> @@ -423,6 +429,46 @@
> return true;
> }
>
> +bool PixelHarness::check_downscale_t(x265::downscale_t ref, x265::downscale_t opt)
> +{
> + ALIGN_VAR_16(pixel, ref_dest0[64 * 64]);
>
You only need to declare the first array as aligned. So long as each array's
size is a multiple of 16 bytes, the arrays that follow it will automatically
be aligned as well.
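Something like this is all you would need (just a sketch, and it assumes the
compiler keeps these locals contiguous in declaration order; each 64 * 64
pixel array is a multiple of 16 bytes in both the 8bpp and 16bpp builds, so
only the first declaration needs the attribute):

    ALIGN_VAR_16(pixel, ref_dest0[64 * 64]); // force 16-byte alignment once
    pixel opt_dest0[64 * 64];                // these stay 16-byte aligned
    pixel ref_desth[64 * 64];
    pixel opt_desth[64 * 64];
    pixel ref_destv[64 * 64];
    pixel opt_destv[64 * 64];
    pixel ref_destc[64 * 64];
    pixel opt_destc[64 * 64];
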
> + ALIGN_VAR_16(pixel, opt_dest0[64 * 64]);
> +
> + ALIGN_VAR_16(pixel, ref_desth[64 * 64]);
> + ALIGN_VAR_16(pixel, opt_desth[64 * 64]);
> +
> + ALIGN_VAR_16(pixel, ref_destv[64 * 64]);
> + ALIGN_VAR_16(pixel, opt_destv[64 * 64]);
> +
> + ALIGN_VAR_16(pixel, ref_destc[64 * 64]);
> + ALIGN_VAR_16(pixel, opt_destc[64 * 64]);
> +
> + int bx = 64;
> + int by = 64;
> + int j = 0;
> + for (int i = 0; i <= 100; i++)
> + {
> +        ref(pbuf2 + j, ref_dest0, ref_desth, ref_destv, ref_destc, 64, 64, bx, by);
> +        opt(pbuf2 + j, opt_dest0, opt_desth, opt_destv, opt_destc, 64, 64, bx, by);
> +
> +
> + if (memcmp(ref_dest0, opt_dest0, 64 * 64 * sizeof(pixel)))
> + return false;
>
There are tabs here, and extra blank lines. I'll clean these up before
pushing, but you need to fix whatever editor is adding them.
> + if (memcmp(ref_desth, opt_desth, 64 * 64 * sizeof(pixel)))
> + return false;
> + if (memcmp(ref_destv, opt_destv, 64 * 64 * sizeof(pixel)))
> + return false;
> + if (memcmp(ref_destc, opt_destc, 64 * 64 * sizeof(pixel)))
> + return false;
> +
> + j += 4;
> + bx = 8 * ((rand() & 7) + 1);
> + by = 8 * ((rand() & 7) + 1);
>
The ASM functions above MMX cannot handle addresses and sizes with odd
alignments like these; I'll fix that in a follow-up commit.
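Something along these lines would probably be enough to keep the harness
within what the SIMD paths expect (untested sketch; I haven't checked the
exact constraints of every code path, so treat the step sizes as a guess):

    j  += 16;                      // keep the src pointer 16-byte aligned
    bx  = 16 * ((rand() & 3) + 1); // width stays a multiple of 16
    by  =  8 * ((rand() & 7) + 1); // height can remain a multiple of 8
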
> + }
> +
> + return true;
> +}
> +
> bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
> {
> for (uint16_t curpar = 0; curpar < NUM_PARTITIONS; curpar++)
> @@ -600,6 +646,14 @@
> }
> }
>
> + if (opt.frame_init_lowres_core)
> + {
> +        if (!check_downscale_t(ref.frame_init_lowres_core, opt.frame_init_lowres_core))
> + {
> + printf("downscale failed!\n");
> + return false;
> + }
> + }
> return true;
> }
>
> @@ -726,4 +780,10 @@
> printf("pixel_pp add");
>         REPORT_SPEEDUP(opt.pixeladd_pp, ref.pixeladd_pp, 64, 64, pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
> }
> +
> + if (opt.frame_init_lowres_core)
> + {
> + printf("downscale");
> +        REPORT_SPEEDUP(opt.frame_init_lowres_core, ref.frame_init_lowres_core, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
> + }
> }
> diff -r f2f70fa9b4f3 -r 2454a81c67fa source/test/pixelharness.h
> --- a/source/test/pixelharness.h Fri Jul 26 02:19:06 2013 -0500
> +++ b/source/test/pixelharness.h Fri Jul 26 14:12:31 2013 -0700
> @@ -31,8 +31,8 @@
> {
> protected:
>
> - pixel *pbuf1, *pbuf2;
> -
> + pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4;
> +
> short *sbuf1, *sbuf2;
>
> bool check_pixelcmp(x265::pixelcmp_t ref, x265::pixelcmp_t opt);
> @@ -50,6 +50,7 @@
>     bool check_pixelsub_sp(x265::pixelsub_sp_t ref, x265::pixelsub_sp_t opt);
>     bool check_pixeladd_ss(x265::pixeladd_ss_t ref, x265::pixeladd_ss_t opt);
>     bool check_pixeladd_pp(x265::pixeladd_pp_t ref, x265::pixeladd_pp_t opt);
> + bool check_downscale_t(x265::downscale_t ref, x265::downscale_t opt);
>
> public:
>
>
--
Steve Borho