[x265] [PATCH] Lookahead: asm primitive for downscale
gopu at multicorewareinc.com
Fri Jul 26 23:31:41 CEST 2013
# HG changeset patch
# User ggopu at bitbucket.org
# Date 1374873151 25200
# Node ID 2454a81c67fa50b20a71c81a4a5b870eade71b77
# Parent f2f70fa9b4f3f075629d02c35684d16bea67fee0
Lookahead: asm primitive for downscale
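
This patch adds a C reference frame_init_lowres_core() to pixel.cpp, a downscale_t
entry in EncoderPrimitives, mc-a2.asm imported from x264 (which carries the
MMX2/SSE2/SSSE3/AVX/XOP assembly for frame_init_lowres_core), the MMX2 binding in
asm-primitives.cpp, and a correctness check plus speed test in the pixel harness.
The primitive builds the four half-resolution planes (integer-pel and H/V/HV
half-pel) that the lookahead uses. The reference filter rounds in two stages so it
bit-matches the pavg-based assembly; for example:

    FILTER(100, 102, 98, 104) = (((100+102+1)>>1) + ((98+104+1)>>1) + 1) >> 1
                              = (101 + 101 + 1) >> 1
                              = 101
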
diff -r f2f70fa9b4f3 -r 2454a81c67fa source/common/pixel.cpp
--- a/source/common/pixel.cpp Fri Jul 26 02:19:06 2013 -0500
+++ b/source/common/pixel.cpp Fri Jul 26 14:12:31 2013 -0700
@@ -598,6 +598,31 @@
}
}
+void frame_init_lowres_core( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
+ intptr_t src_stride, intptr_t dst_stride, int width, int height )
+{
+ for( int y = 0; y < height; y++ )
+ {
+ pixel *src1 = src0+src_stride;
+ pixel *src2 = src1+src_stride;
+ for( int x = 0; x<width; x++ )
+ {
+ // slower than naive bilinear, but matches asm
+#define FILTER(a,b,c,d) ((((a+b+1)>>1)+((c+d+1)>>1)+1)>>1)
+ dst0[x] = FILTER(src0[2*x ], src1[2*x ], src0[2*x+1], src1[2*x+1]);
+ dsth[x] = FILTER(src0[2*x+1], src1[2*x+1], src0[2*x+2], src1[2*x+2]);
+ dstv[x] = FILTER(src1[2*x ], src2[2*x ], src1[2*x+1], src2[2*x+1]);
+ dstc[x] = FILTER(src1[2*x+1], src2[2*x+1], src1[2*x+2], src2[2*x+2]);
+#undef FILTER
+ }
+ src0 += src_stride*2;
+ dst0 += dst_stride;
+ dsth += dst_stride;
+ dstv += dst_stride;
+ dstc += dst_stride;
+ }
+}
+
} // end anonymous namespace
namespace x265 {
@@ -806,5 +831,6 @@
p.scale1D_128to64 = scale1D_128to64;
p.scale2D_64to32 = scale2D_64to32;
+ p.frame_init_lowres_core = frame_init_lowres_core;
}
}
diff -r f2f70fa9b4f3 -r 2454a81c67fa source/common/primitives.h
--- a/source/common/primitives.h Fri Jul 26 02:19:06 2013 -0500
+++ b/source/common/primitives.h Fri Jul 26 14:12:31 2013 -0700
@@ -227,6 +227,8 @@
int marginX, int marginY, int w, int roundw, int shiftw, int offsetw);
typedef void (*weightpUni_t)(short *src, pixel *dst, int srcStride, int dstStride, int width, int height, int w0, int round, int shift, int offset);
typedef void (*scale_t)(pixel *dst, pixel *src, intptr_t stride);
+typedef void (*downscale_t)( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
+ intptr_t src_stride, intptr_t dst_stride, int width, int height );
/* Define a structure containing function pointers to optimized encoder
* primitives. Each pointer can reference either an assembly routine,
@@ -286,6 +288,7 @@
scale_t scale1D_128to64;
scale_t scale2D_64to32;
+ downscale_t frame_init_lowres_core;
};
/* This copy of the table is what gets used by the encoder.
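
[Usage sketch, not part of the patch; the function and variable names below are
hypothetical.] A lookahead-side caller is expected to hand the primitive the
full-resolution luma plane and four half-resolution destination planes:

    // assumes #include "primitives.h"; names are illustrative only
    void initLowresPlanes(const x265::EncoderPrimitives& p,
                          pixel* lumaPlane, intptr_t lumaStride,
                          pixel* lowres[4], intptr_t lowresStride,
                          int lowresWidth, int lowresHeight)
    {
        // lowres[0..3]: integer-pel, H half-pel, V half-pel, HV half-pel planes
        p.frame_init_lowres_core(lumaPlane, lowres[0], lowres[1], lowres[2], lowres[3],
                                 lumaStride, lowresStride, lowresWidth, lowresHeight);
    }
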
diff -r f2f70fa9b4f3 -r 2454a81c67fa source/common/x86/CMakeLists.txt
--- a/source/common/x86/CMakeLists.txt Fri Jul 26 02:19:06 2013 -0500
+++ b/source/common/x86/CMakeLists.txt Fri Jul 26 14:12:31 2013 -0700
@@ -5,7 +5,7 @@
add_definitions(-DHAVE_ALIGNED_STACK=0)
endif()
-set(ASMS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm)
+set(ASMS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a2.asm)
if (X64)
add_definitions(-DARCH_X86_64=1)
else()
diff -r f2f70fa9b4f3 -r 2454a81c67fa source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Jul 26 02:19:06 2013 -0500
+++ b/source/common/x86/asm-primitives.cpp Fri Jul 26 14:12:31 2013 -0700
@@ -165,6 +165,8 @@
p.satd[PARTITION_64x48] = cmp<64, 48, 16, 16, x265_pixel_satd_16x16_mmx2>;
p.satd[PARTITION_64x64] = cmp<64, 64, 16, 16, x265_pixel_satd_16x16_mmx2>;
+ p.frame_init_lowres_core = x265_frame_init_lowres_core_mmx2;
+
INIT2( sad, _sse2 );
INIT2( sad_x3, _sse2 );
INIT2( sad_x4, _sse2 );
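
[Note, not part of the patch: mc-a2.asm below also assembles SSE2/SSSE3/AVX/XOP
variants of frame_init_lowres_core, but only the MMX2 symbol is declared and bound
here. A later change could bind the wider versions once their prototypes are added
to pixel.h, along the lines of:]

    // hypothetical follow-up inside the SSE2 setup block; needs a matching
    // x265_frame_init_lowres_core_sse2 declaration in pixel.h
    p.frame_init_lowres_core = x265_frame_init_lowres_core_sse2;
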
diff -r f2f70fa9b4f3 -r 2454a81c67fa source/common/x86/mc-a2.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/mc-a2.asm Fri Jul 26 14:12:31 2013 -0700
@@ -0,0 +1,1790 @@
+;*****************************************************************************
+;* mc-a2.asm: x86 motion compensation
+;*****************************************************************************
+;* Copyright (C) 2005-2012 x264 project
+;*
+;* Authors: Loren Merritt <lorenm at u.washington.edu>
+;* Jason Garrett-Glaser <darkshikari at gmail.com>
+;* Holger Lubitz <holger at lubitz.org>
+;* Mathieu Monnier <manao at melix.net>
+;* Oskar Arvidsson <oskar at irock.se>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing at x264.com.
+;*****************************************************************************
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA
+
+filt_mul20: times 16 db 20
+filt_mul15: times 8 db 1, -5
+filt_mul51: times 8 db -5, 1
+hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
+deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
+%if HIGH_BIT_DEPTH
+deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
+deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
+%else
+deinterleave_shuf32a: db 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30
+deinterleave_shuf32b: db 1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31
+%endif
+
+pd_16: times 4 dd 16
+pd_0f: times 4 dd 0xffff
+pf_inv256: times 8 dd 0.00390625
+
+pad10: times 8 dw 10*PIXEL_MAX
+pad20: times 8 dw 20*PIXEL_MAX
+pad30: times 8 dw 30*PIXEL_MAX
+depad: times 4 dd 32*20*PIXEL_MAX + 512
+
+tap1: times 4 dw 1, -5
+tap2: times 4 dw 20, 20
+tap3: times 4 dw -5, 1
+
+SECTION .text
+
+cextern pb_0
+cextern pw_1
+cextern pw_16
+cextern pw_32
+cextern pw_00ff
+cextern pw_3fff
+cextern pw_pixel_max
+cextern pd_ffff
+
+%macro LOAD_ADD 4
+ movh %4, %3
+ movh %1, %2
+ punpcklbw %4, m0
+ punpcklbw %1, m0
+ paddw %1, %4
+%endmacro
+
+%macro LOAD_ADD_2 6
+ mova %5, %3
+ mova %1, %4
+ punpckhbw %6, %5, m0
+ punpcklbw %5, m0
+ punpckhbw %2, %1, m0
+ punpcklbw %1, m0
+ paddw %1, %5
+ paddw %2, %6
+%endmacro
+
+%macro FILT_V2 6
+ psubw %1, %2 ; a-b
+ psubw %4, %5
+ psubw %2, %3 ; b-c
+ psubw %5, %6
+ psllw %2, 2
+ psllw %5, 2
+ psubw %1, %2 ; a-5*b+4*c
+ psllw %3, 4
+ psubw %4, %5
+ psllw %6, 4
+ paddw %1, %3 ; a-5*b+20*c
+ paddw %4, %6
+%endmacro
+
+%macro FILT_H 3
+ psubw %1, %2 ; a-b
+ psraw %1, 2 ; (a-b)/4
+ psubw %1, %2 ; (a-b)/4-b
+ paddw %1, %3 ; (a-b)/4-b+c
+ psraw %1, 2 ; ((a-b)/4-b+c)/4
+ paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+%endmacro
+
+%macro FILT_H2 6
+ psubw %1, %2
+ psubw %4, %5
+ psraw %1, 2
+ psraw %4, 2
+ psubw %1, %2
+ psubw %4, %5
+ paddw %1, %3
+ paddw %4, %6
+ psraw %1, 2
+ psraw %4, 2
+ paddw %1, %3
+ paddw %4, %6
+%endmacro
+
+%macro FILT_PACK 4-6 b
+ paddw %1, %4
+ paddw %2, %4
+%if %0 == 6
+ psubusw %1, %6
+ psubusw %2, %6
+ psrlw %1, %3
+ psrlw %2, %3
+%else
+ psraw %1, %3
+ psraw %2, %3
+%endif
+%ifnidn w, %5
+ packuswb %1, %2
+%endif
+%endmacro
+
+;The hpel_filter routines use non-temporal writes for output.
+;The following defines may be uncommented for testing.
+;Doing the hpel_filter temporal may be a win if the last level cache
+;is big enough (preliminary benching suggests on the order of 4* framesize).
+
+;%define movntq movq
+;%define movntps movaps
+;%define sfence
+
+%if HIGH_BIT_DEPTH
+;-----------------------------------------------------------------------------
+; void hpel_filter_v( uint16_t *dst, uint16_t *src, int16_t *buf, intptr_t stride, intptr_t width );
+;-----------------------------------------------------------------------------
+%macro HPEL_FILTER 0
+cglobal hpel_filter_v, 5,6,11
+ FIX_STRIDES r3, r4
+ lea r5, [r1+r3]
+ sub r1, r3
+ sub r1, r3
+%if num_mmregs > 8
+ mova m8, [pad10]
+ mova m9, [pad20]
+ mova m10, [pad30]
+ %define s10 m8
+ %define s20 m9
+ %define s30 m10
+%else
+ %define s10 [pad10]
+ %define s20 [pad20]
+ %define s30 [pad30]
+%endif
+ add r0, r4
+ add r2, r4
+ neg r4
+ mova m7, [pw_pixel_max]
+ pxor m0, m0
+.loop:
+ mova m1, [r1]
+ mova m2, [r1+r3]
+ mova m3, [r1+r3*2]
+ mova m4, [r1+mmsize]
+ mova m5, [r1+r3+mmsize]
+ mova m6, [r1+r3*2+mmsize]
+ paddw m1, [r5+r3*2]
+ paddw m2, [r5+r3]
+ paddw m3, [r5]
+ paddw m4, [r5+r3*2+mmsize]
+ paddw m5, [r5+r3+mmsize]
+ paddw m6, [r5+mmsize]
+ add r1, 2*mmsize
+ add r5, 2*mmsize
+ FILT_V2 m1, m2, m3, m4, m5, m6
+ mova m6, [pw_16]
+ psubw m1, s20
+ psubw m4, s20
+ mova [r2+r4], m1
+ mova [r2+r4+mmsize], m4
+ paddw m1, s30
+ paddw m4, s30
+ FILT_PACK m1, m4, 5, m6, w, s10
+ CLIPW m1, m0, m7
+ CLIPW m4, m0, m7
+ mova [r0+r4], m1
+ mova [r0+r4+mmsize], m4
+ add r4, 2*mmsize
+ jl .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void hpel_filter_c( uint16_t *dst, int16_t *buf, intptr_t width );
+;-----------------------------------------------------------------------------
+cglobal hpel_filter_c, 3,3,10
+ add r2, r2
+ add r0, r2
+ add r1, r2
+ neg r2
+ mova m0, [tap1]
+ mova m7, [tap3]
+%if num_mmregs > 8
+ mova m8, [tap2]
+ mova m9, [depad]
+ %define s1 m8
+ %define s2 m9
+%else
+ %define s1 [tap2]
+ %define s2 [depad]
+%endif
+.loop:
+ movu m1, [r1+r2-4]
+ movu m2, [r1+r2-2]
+ mova m3, [r1+r2+0]
+ movu m4, [r1+r2+2]
+ movu m5, [r1+r2+4]
+ movu m6, [r1+r2+6]
+ pmaddwd m1, m0
+ pmaddwd m2, m0
+ pmaddwd m3, s1
+ pmaddwd m4, s1
+ pmaddwd m5, m7
+ pmaddwd m6, m7
+ paddd m1, s2
+ paddd m2, s2
+ paddd m3, m5
+ paddd m4, m6
+ paddd m1, m3
+ paddd m2, m4
+ psrad m1, 10
+ psrad m2, 10
+ pslld m2, 16
+ pand m1, [pd_0f]
+ por m1, m2
+ CLIPW m1, [pb_0], [pw_pixel_max]
+ mova [r0+r2], m1
+ add r2, mmsize
+ jl .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void hpel_filter_h( uint16_t *dst, uint16_t *src, intptr_t width );
+;-----------------------------------------------------------------------------
+cglobal hpel_filter_h, 3,4,8
+ %define src r1+r2
+ add r2, r2
+ add r0, r2
+ add r1, r2
+ neg r2
+ mova m0, [pw_pixel_max]
+.loop:
+ movu m1, [src-4]
+ movu m2, [src-2]
+ mova m3, [src+0]
+ movu m6, [src+2]
+ movu m4, [src+4]
+ movu m5, [src+6]
+ paddw m3, m6 ; c0
+ paddw m2, m4 ; b0
+ paddw m1, m5 ; a0
+%if mmsize == 16
+ movu m4, [src-4+mmsize]
+ movu m5, [src-2+mmsize]
+%endif
+ movu m7, [src+4+mmsize]
+ movu m6, [src+6+mmsize]
+ paddw m5, m7 ; b1
+ paddw m4, m6 ; a1
+ movu m7, [src+2+mmsize]
+ mova m6, [src+0+mmsize]
+ paddw m6, m7 ; c1
+ FILT_H2 m1, m2, m3, m4, m5, m6
+ mova m7, [pw_1]
+ pxor m2, m2
+ FILT_PACK m1, m4, 1, m7, w
+ CLIPW m1, m2, m0
+ CLIPW m4, m2, m0
+ mova [r0+r2], m1
+ mova [r0+r2+mmsize], m4
+ add r2, mmsize*2
+ jl .loop
+ REP_RET
+%endmacro ; HPEL_FILTER
+
+INIT_MMX mmx2
+HPEL_FILTER
+INIT_XMM sse2
+HPEL_FILTER
+%endif ; HIGH_BIT_DEPTH
+
+%if HIGH_BIT_DEPTH == 0
+%macro HPEL_V 1
+;-----------------------------------------------------------------------------
+; void hpel_filter_v( uint8_t *dst, uint8_t *src, int16_t *buf, intptr_t stride, intptr_t width );
+;-----------------------------------------------------------------------------
+cglobal hpel_filter_v, 5,6,%1
+ lea r5, [r1+r3]
+ sub r1, r3
+ sub r1, r3
+ add r0, r4
+ lea r2, [r2+r4*2]
+ neg r4
+%if cpuflag(ssse3)
+ mova m0, [filt_mul15]
+%else
+ pxor m0, m0
+%endif
+.loop:
+%if cpuflag(ssse3)
+ mova m1, [r1]
+ mova m4, [r1+r3]
+ mova m2, [r5+r3*2]
+ mova m5, [r5+r3]
+ mova m3, [r1+r3*2]
+ mova m6, [r5]
+ SBUTTERFLY bw, 1, 4, 7
+ SBUTTERFLY bw, 2, 5, 7
+ SBUTTERFLY bw, 3, 6, 7
+ pmaddubsw m1, m0
+ pmaddubsw m4, m0
+ pmaddubsw m2, m0
+ pmaddubsw m5, m0
+ pmaddubsw m3, [filt_mul20]
+ pmaddubsw m6, [filt_mul20]
+ paddw m1, m2
+ paddw m4, m5
+ paddw m1, m3
+ paddw m4, m6
+%else
+ LOAD_ADD_2 m1, m4, [r1 ], [r5+r3*2], m6, m7 ; a0 / a1
+ LOAD_ADD_2 m2, m5, [r1+r3 ], [r5+r3 ], m6, m7 ; b0 / b1
+ LOAD_ADD m3, [r1+r3*2], [r5 ], m7 ; c0
+ LOAD_ADD m6, [r1+r3*2+mmsize/2], [r5+mmsize/2], m7 ; c1
+ FILT_V2 m1, m2, m3, m4, m5, m6
+%endif
+ mova m7, [pw_16]
+ mova [r2+r4*2], m1
+ mova [r2+r4*2+mmsize], m4
+ FILT_PACK m1, m4, 5, m7
+ movnta [r0+r4], m1
+ add r1, mmsize
+ add r5, mmsize
+ add r4, mmsize
+ jl .loop
+ REP_RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal hpel_filter_c_mmx2, 3,3
+ add r0, r2
+ lea r1, [r1+r2*2]
+ neg r2
+ %define src r1+r2*2
+ movq m7, [pw_32]
+.loop:
+ movq m1, [src-4]
+ movq m2, [src-2]
+ movq m3, [src ]
+ movq m4, [src+4]
+ movq m5, [src+6]
+ paddw m3, [src+2] ; c0
+ paddw m2, m4 ; b0
+ paddw m1, m5 ; a0
+ movq m6, [src+8]
+ paddw m4, [src+14] ; a1
+ paddw m5, [src+12] ; b1
+ paddw m6, [src+10] ; c1
+ FILT_H2 m1, m2, m3, m4, m5, m6
+ FILT_PACK m1, m4, 6, m7
+ movntq [r0+r2], m1
+ add r2, 8
+ jl .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
+;-----------------------------------------------------------------------------
+cglobal hpel_filter_h_mmx2, 3,3
+ add r0, r2
+ add r1, r2
+ neg r2
+ %define src r1+r2
+ pxor m0, m0
+.loop:
+ movd m1, [src-2]
+ movd m2, [src-1]
+ movd m3, [src ]
+ movd m6, [src+1]
+ movd m4, [src+2]
+ movd m5, [src+3]
+ punpcklbw m1, m0
+ punpcklbw m2, m0
+ punpcklbw m3, m0
+ punpcklbw m6, m0
+ punpcklbw m4, m0
+ punpcklbw m5, m0
+ paddw m3, m6 ; c0
+ paddw m2, m4 ; b0
+ paddw m1, m5 ; a0
+ movd m7, [src+7]
+ movd m6, [src+6]
+ punpcklbw m7, m0
+ punpcklbw m6, m0
+ paddw m4, m7 ; c1
+ paddw m5, m6 ; b1
+ movd m7, [src+5]
+ movd m6, [src+4]
+ punpcklbw m7, m0
+ punpcklbw m6, m0
+ paddw m6, m7 ; a1
+ movq m7, [pw_1]
+ FILT_H2 m1, m2, m3, m4, m5, m6
+ FILT_PACK m1, m4, 1, m7
+ movntq [r0+r2], m1
+ add r2, 8
+ jl .loop
+ REP_RET
+
+INIT_XMM
+
+%macro HPEL_C 0
+;-----------------------------------------------------------------------------
+; void hpel_filter_c( uint8_t *dst, int16_t *buf, intptr_t width );
+;-----------------------------------------------------------------------------
+cglobal hpel_filter_c, 3,3,9
+ add r0, r2
+ lea r1, [r1+r2*2]
+ neg r2
+ %define src r1+r2*2
+%ifnidn cpuname, sse2
+ mova m7, [pw_32]
+ %define tpw_32 m7
+%elif ARCH_X86_64
+ mova m8, [pw_32]
+ %define tpw_32 m8
+%else
+ %define tpw_32 [pw_32]
+%endif
+; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
+%if cpuflag(misalign)
+.loop:
+ movu m4, [src-4]
+ movu m5, [src-2]
+ mova m6, [src]
+ movu m3, [src+12]
+ movu m2, [src+14]
+ mova m1, [src+16]
+ paddw m4, [src+6]
+ paddw m5, [src+4]
+ paddw m6, [src+2]
+ paddw m3, [src+22]
+ paddw m2, [src+20]
+ paddw m1, [src+18]
+ FILT_H2 m4, m5, m6, m3, m2, m1
+%else
+ mova m0, [src-16]
+ mova m1, [src]
+.loop:
+ mova m2, [src+16]
+ PALIGNR m4, m1, m0, 12, m7
+ PALIGNR m5, m1, m0, 14, m0
+ PALIGNR m0, m2, m1, 6, m7
+ paddw m4, m0
+ PALIGNR m0, m2, m1, 4, m7
+ paddw m5, m0
+ PALIGNR m6, m2, m1, 2, m7
+ paddw m6, m1
+ FILT_H m4, m5, m6
+
+ mova m0, m2
+ mova m5, m2
+ PALIGNR m2, m1, 12, m7
+ PALIGNR m5, m1, 14, m1
+ mova m1, [src+32]
+ PALIGNR m3, m1, m0, 6, m7
+ paddw m3, m2
+ PALIGNR m6, m1, m0, 4, m7
+ paddw m5, m6
+ PALIGNR m6, m1, m0, 2, m7
+ paddw m6, m0
+ FILT_H m3, m5, m6
+%endif
+ FILT_PACK m4, m3, 6, tpw_32
+ movntps [r0+r2], m4
+ add r2, 16
+ jl .loop
+ REP_RET
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
+;-----------------------------------------------------------------------------
+cglobal hpel_filter_h_sse2, 3,3,8
+ add r0, r2
+ add r1, r2
+ neg r2
+ %define src r1+r2
+ pxor m0, m0
+.loop:
+ movh m1, [src-2]
+ movh m2, [src-1]
+ movh m3, [src ]
+ movh m4, [src+1]
+ movh m5, [src+2]
+ movh m6, [src+3]
+ punpcklbw m1, m0
+ punpcklbw m2, m0
+ punpcklbw m3, m0
+ punpcklbw m4, m0
+ punpcklbw m5, m0
+ punpcklbw m6, m0
+ paddw m3, m4 ; c0
+ paddw m2, m5 ; b0
+ paddw m1, m6 ; a0
+ movh m4, [src+6]
+ movh m5, [src+7]
+ movh m6, [src+10]
+ movh m7, [src+11]
+ punpcklbw m4, m0
+ punpcklbw m5, m0
+ punpcklbw m6, m0
+ punpcklbw m7, m0
+ paddw m5, m6 ; b1
+ paddw m4, m7 ; a1
+ movh m6, [src+8]
+ movh m7, [src+9]
+ punpcklbw m6, m0
+ punpcklbw m7, m0
+ paddw m6, m7 ; c1
+ mova m7, [pw_1] ; FIXME xmm8
+ FILT_H2 m1, m2, m3, m4, m5, m6
+ FILT_PACK m1, m4, 1, m7
+ movntps [r0+r2], m1
+ add r2, 16
+ jl .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void hpel_filter_h( uint8_t *dst, uint8_t *src, intptr_t width );
+;-----------------------------------------------------------------------------
+%macro HPEL_H 0
+cglobal hpel_filter_h, 3,3
+ add r0, r2
+ add r1, r2
+ neg r2
+ %define src r1+r2
+ mova m0, [src-16]
+ mova m1, [src]
+ mova m7, [pw_16]
+.loop:
+ mova m2, [src+16]
+ ; Using unaligned loads instead of palignr is marginally slower on SB and significantly
+ ; slower on Bulldozer, despite their fast load units -- even though it would let us avoid
+ ; the repeated loads of constants for pmaddubsw.
+ palignr m3, m1, m0, 14
+ palignr m4, m1, m0, 15
+ palignr m0, m2, m1, 2
+ pmaddubsw m3, [filt_mul15]
+ pmaddubsw m4, [filt_mul15]
+ pmaddubsw m0, [filt_mul51]
+ palignr m5, m2, m1, 1
+ palignr m6, m2, m1, 3
+ paddw m3, m0
+ mova m0, m1
+ pmaddubsw m1, [filt_mul20]
+ pmaddubsw m5, [filt_mul20]
+ pmaddubsw m6, [filt_mul51]
+ paddw m3, m1
+ paddw m4, m5
+ paddw m4, m6
+ FILT_PACK m3, m4, 5, m7
+ pshufb m3, [hpel_shuf]
+ mova m1, m2
+ movntps [r0+r2], m3
+ add r2, 16
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx2
+HPEL_V 0
+INIT_XMM sse2
+HPEL_V 8
+INIT_XMM sse2, misalign
+HPEL_C
+%if ARCH_X86_64 == 0
+INIT_XMM sse2
+HPEL_C
+INIT_XMM ssse3
+HPEL_C
+HPEL_V 0
+HPEL_H
+INIT_XMM avx
+HPEL_C
+HPEL_V 0
+HPEL_H
+%endif
+
+%if ARCH_X86_64
+%macro DO_FILT_V 5
+ ;The optimum prefetch distance is difficult to determine in checkasm:
+ ;any prefetch seems slower than not prefetching.
+ ;In real use, the prefetch seems to be a slight win.
+ ;+16 is picked somewhat arbitrarily here based on the fact that even one
+ ;loop iteration is going to take longer than the prefetch.
+ prefetcht0 [r1+r2*2+16]
+%if cpuflag(ssse3)
+ mova m1, [r3]
+ mova m2, [r3+r2]
+ mova %3, [r3+r2*2]
+ mova m3, [r1]
+ mova %1, [r1+r2]
+ mova %2, [r1+r2*2]
+ punpckhbw m4, m1, m2
+ punpcklbw m1, m2
+ punpckhbw m2, %1, %2
+ punpcklbw %1, %2
+ punpckhbw %2, m3, %3
+ punpcklbw m3, %3
+
+ pmaddubsw m1, m12
+ pmaddubsw m4, m12
+ pmaddubsw %1, m0
+ pmaddubsw m2, m0
+ pmaddubsw m3, m14
+ pmaddubsw %2, m14
+
+ paddw m1, %1
+ paddw m4, m2
+ paddw m1, m3
+ paddw m4, %2
+%else
+ LOAD_ADD_2 m1, m4, [r3 ], [r1+r2*2], m2, m5 ; a0 / a1
+ LOAD_ADD_2 m2, m5, [r3+r2 ], [r1+r2 ], m3, m6 ; b0 / b1
+ LOAD_ADD_2 m3, m6, [r3+r2*2], [r1 ], %3, %4 ; c0 / c1
+ packuswb %3, %4
+ FILT_V2 m1, m2, m3, m4, m5, m6
+%endif
+ add r3, 16
+ add r1, 16
+ mova %1, m1
+ mova %2, m4
+ FILT_PACK m1, m4, 5, m15
+ movntps [r8+r4+%5], m1
+%endmacro
+
+%macro FILT_C 4
+ PALIGNR m1, %2, %1, 12, m2
+ PALIGNR m2, %2, %1, 14, %1
+ PALIGNR m3, %3, %2, 4, %1
+ PALIGNR m4, %3, %2, 2, %1
+ paddw m3, m2
+ mova %1, %3
+ PALIGNR %3, %2, 6, m2
+ paddw m4, %2
+ paddw %3, m1
+ FILT_H %3, m3, m4
+%endmacro
+
+%macro DO_FILT_C 4
+ FILT_C %1, %2, %3, 6
+ FILT_C %2, %1, %4, 6
+ FILT_PACK %3, %4, 6, m15
+ movntps [r5+r4], %3
+%endmacro
+
+%macro ADD8TO16 5
+ punpckhbw %3, %1, %5
+ punpcklbw %1, %5
+ punpcklbw %4, %2, %5
+ punpckhbw %2, %5
+ paddw %2, %3
+ paddw %1, %4
+%endmacro
+
+%macro DO_FILT_H 3
+ PALIGNR m1, %2, %1, 14, m3
+ PALIGNR m2, %2, %1, 15, m3
+ PALIGNR m4, %3, %2, 1 , m3
+ PALIGNR m5, %3, %2, 2 , m3
+ PALIGNR m6, %3, %2, 3 , m3
+ mova %1, %2
+%if cpuflag(ssse3)
+ pmaddubsw m1, m12
+ pmaddubsw m2, m12
+ pmaddubsw %2, m14
+ pmaddubsw m4, m14
+ pmaddubsw m5, m0
+ pmaddubsw m6, m0
+ paddw m1, %2
+ paddw m2, m4
+ paddw m1, m5
+ paddw m2, m6
+ FILT_PACK m1, m2, 5, m15
+ pshufb m1, [hpel_shuf]
+%else ; ssse3, avx
+ ADD8TO16 m1, m6, m12, m3, m0 ; a
+ ADD8TO16 m2, m5, m12, m3, m0 ; b
+ ADD8TO16 %2, m4, m12, m3, m0 ; c
+ FILT_V2 m1, m2, %2, m6, m5, m4
+ FILT_PACK m1, m6, 5, m15
+%endif
+ movntps [r0+r4], m1
+ mova %2, %3
+%endmacro
+
+%macro HPEL 0
+;-----------------------------------------------------------------------------
+; void hpel_filter( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+; uint8_t *src, intptr_t stride, int width, int height )
+;-----------------------------------------------------------------------------
+cglobal hpel_filter, 7,9,16
+ mov r7, r3
+ sub r5d, 16
+ mov r8, r1
+ and r7, 15
+ sub r3, r7
+ add r0, r5
+ add r8, r5
+ add r7, r5
+ add r5, r2
+ mov r2, r4
+ neg r7
+ lea r1, [r3+r2]
+ sub r3, r2
+ sub r3, r2
+ mov r4, r7
+ mova m15, [pw_16]
+%if cpuflag(ssse3)
+ mova m0, [filt_mul51]
+ mova m12, [filt_mul15]
+ mova m14, [filt_mul20]
+%else
+ pxor m0, m0
+%endif
+;ALIGN 16
+.loopy:
+; first filter_v
+ DO_FILT_V m8, m7, m13, m12, 0
+;ALIGN 16
+.loopx:
+ DO_FILT_V m6, m5, m11, m12, 16
+.lastx:
+ paddw m15, m15 ; pw_32
+ DO_FILT_C m9, m8, m7, m6
+ psrlw m15, 1 ; pw_16
+ movdqa m7, m5
+ DO_FILT_H m10, m13, m11
+ add r4, 16
+ jl .loopx
+ cmp r4, 16
+ jl .lastx
+; setup regs for next y
+ sub r4, r7
+ sub r4, r2
+ sub r1, r4
+ sub r3, r4
+ add r0, r2
+ add r8, r2
+ add r5, r2
+ mov r4, r7
+ sub r6d, 1
+ jg .loopy
+ sfence
+ RET
+%endmacro
+
+INIT_XMM sse2
+HPEL
+INIT_XMM ssse3
+HPEL
+INIT_XMM avx
+HPEL
+%endif ; ARCH_X86_64
+
+%undef movntq
+%undef movntps
+%undef sfence
+%endif ; !HIGH_BIT_DEPTH
+
+;-----------------------------------------------------------------------------
+; void plane_copy_core( pixel *dst, intptr_t i_dst,
+; pixel *src, intptr_t i_src, int w, int h )
+;-----------------------------------------------------------------------------
+; assumes i_dst and w are multiples of 16, and i_dst>w
+INIT_MMX
+cglobal plane_copy_core_mmx2, 6,7
+ FIX_STRIDES r1, r3, r4d
+%if HIGH_BIT_DEPTH == 0
+ movsxdifnidn r4, r4d
+%endif
+ sub r1, r4
+ sub r3, r4
+.loopy:
+ lea r6d, [r4-63]
+.loopx:
+ prefetchnta [r2+256]
+ movq m0, [r2 ]
+ movq m1, [r2+ 8]
+ movntq [r0 ], m0
+ movntq [r0+ 8], m1
+ movq m2, [r2+16]
+ movq m3, [r2+24]
+ movntq [r0+16], m2
+ movntq [r0+24], m3
+ movq m4, [r2+32]
+ movq m5, [r2+40]
+ movntq [r0+32], m4
+ movntq [r0+40], m5
+ movq m6, [r2+48]
+ movq m7, [r2+56]
+ movntq [r0+48], m6
+ movntq [r0+56], m7
+ add r2, 64
+ add r0, 64
+ sub r6d, 64
+ jg .loopx
+ prefetchnta [r2+256]
+ add r6d, 63
+ jle .end16
+.loop16:
+ movq m0, [r2 ]
+ movq m1, [r2+8]
+ movntq [r0 ], m0
+ movntq [r0+8], m1
+ add r2, 16
+ add r0, 16
+ sub r6d, 16
+ jg .loop16
+.end16:
+ add r0, r1
+ add r2, r3
+ dec r5d
+ jg .loopy
+ sfence
+ emms
+ RET
+
+
+%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
+%if HIGH_BIT_DEPTH
+%assign x 0
+%rep 16/mmsize
+ mov%4 m0, [%2+(x/2)*mmsize]
+ mov%4 m1, [%3+(x/2)*mmsize]
+ punpckhwd m2, m0, m1
+ punpcklwd m0, m1
+ mov%5a [%1+(x+0)*mmsize], m0
+ mov%5a [%1+(x+1)*mmsize], m2
+ %assign x (x+2)
+%endrep
+%else
+ movq m0, [%2]
+%if mmsize==16
+%ifidn %4, a
+ punpcklbw m0, [%3]
+%else
+ movq m1, [%3]
+ punpcklbw m0, m1
+%endif
+ mov%5a [%1], m0
+%else
+ movq m1, [%3]
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ mov%5a [%1+0], m0
+ mov%5a [%1+8], m2
+%endif
+%endif ; HIGH_BIT_DEPTH
+%endmacro
+
+%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, shuffle constant, is aligned
+%if HIGH_BIT_DEPTH
+%assign n 0
+%rep 16/mmsize
+ mova m0, [%3+(n+0)*mmsize]
+ mova m1, [%3+(n+1)*mmsize]
+ psrld m2, m0, 16
+ psrld m3, m1, 16
+ pand m0, %5
+ pand m1, %5
+ packssdw m0, m1
+ packssdw m2, m3
+ mov%6 [%1+(n/2)*mmsize], m0
+ mov%6 [%2+(n/2)*mmsize], m2
+ %assign n (n+2)
+%endrep
+%else ; !HIGH_BIT_DEPTH
+%if mmsize==16
+ mova m0, [%3]
+%if cpuflag(ssse3)
+ pshufb m0, %5
+%else
+ mova m1, m0
+ pand m0, %5
+ psrlw m1, 8
+ packuswb m0, m1
+%endif
+%if %4
+ mova [%1], m0
+%else
+ movq [%1], m0
+ movhps [%2], m0
+%endif
+%else
+ mova m0, [%3]
+ mova m1, [%3+8]
+ mova m2, m0
+ mova m3, m1
+ pand m0, %5
+ pand m1, %5
+ psrlw m2, 8
+ psrlw m3, 8
+ packuswb m0, m1
+ packuswb m2, m3
+ mova [%1], m0
+ mova [%2], m2
+%endif ; mmsize == 16
+%endif ; HIGH_BIT_DEPTH
+%endmacro
+
+%macro PLANE_INTERLEAVE 0
+;-----------------------------------------------------------------------------
+; void plane_copy_interleave_core( uint8_t *dst, intptr_t i_dst,
+; uint8_t *srcu, intptr_t i_srcu,
+; uint8_t *srcv, intptr_t i_srcv, int w, int h )
+;-----------------------------------------------------------------------------
+; assumes i_dst and w are multiples of 16, and i_dst>2*w
+cglobal plane_copy_interleave_core, 6,9
+ mov r6d, r6m
+%if HIGH_BIT_DEPTH
+ FIX_STRIDES r1, r3, r5, r6d
+ movifnidn r1mp, r1
+ movifnidn r3mp, r3
+ mov r6m, r6d
+%endif
+ lea r0, [r0+r6*2]
+ add r2, r6
+ add r4, r6
+%if ARCH_X86_64
+ DECLARE_REG_TMP 7,8
+%else
+ DECLARE_REG_TMP 1,3
+%endif
+ mov t1, r1
+ shr t1, SIZEOF_PIXEL
+ sub t1, r6
+ mov t0d, r7m
+.loopy:
+ mov r6d, r6m
+ neg r6
+.prefetch:
+ prefetchnta [r2+r6]
+ prefetchnta [r4+r6]
+ add r6, 64
+ jl .prefetch
+ mov r6d, r6m
+ neg r6
+.loopx:
+ INTERLEAVE r0+r6*2+ 0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6+0*SIZEOF_PIXEL, u, nt
+ INTERLEAVE r0+r6*2+16*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6+8*SIZEOF_PIXEL, u, nt
+ add r6, 16*SIZEOF_PIXEL
+ jl .loopx
+.pad:
+%assign n 0
+%rep SIZEOF_PIXEL
+%if mmsize==8
+ movntq [r0+r6*2+(n+ 0)], m0
+ movntq [r0+r6*2+(n+ 8)], m0
+ movntq [r0+r6*2+(n+16)], m0
+ movntq [r0+r6*2+(n+24)], m0
+%else
+ movntdq [r0+r6*2+(n+ 0)], m0
+ movntdq [r0+r6*2+(n+16)], m0
+%endif
+ %assign n n+32
+%endrep
+ add r6, 16*SIZEOF_PIXEL
+ cmp r6, t1
+ jl .pad
+ add r0, r1mp
+ add r2, r3mp
+ add r4, r5
+ dec t0d
+ jg .loopy
+ sfence
+ emms
+ RET
+
+;-----------------------------------------------------------------------------
+; void store_interleave_chroma( uint8_t *dst, intptr_t i_dst, uint8_t *srcu, uint8_t *srcv, int height )
+;-----------------------------------------------------------------------------
+cglobal store_interleave_chroma, 5,5
+ FIX_STRIDES r1
+.loop:
+ INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
+ INTERLEAVE r0+r1, r2+FDEC_STRIDEB, r3+FDEC_STRIDEB, a
+ add r2, FDEC_STRIDEB*2
+ add r3, FDEC_STRIDEB*2
+ lea r0, [r0+r1*2]
+ sub r4d, 2
+ jg .loop
+ REP_RET
+%endmacro ; PLANE_INTERLEAVE
+
+%macro DEINTERLEAVE_START 0
+%if HIGH_BIT_DEPTH
+ mova m4, [pd_ffff]
+%elif cpuflag(ssse3)
+ mova m4, [deinterleave_shuf]
+%else
+ mova m4, [pw_00ff]
+%endif ; HIGH_BIT_DEPTH
+%endmacro
+
+%macro PLANE_DEINTERLEAVE 0
+;-----------------------------------------------------------------------------
+; void plane_copy_deinterleave( pixel *dstu, intptr_t i_dstu,
+; pixel *dstv, intptr_t i_dstv,
+; pixel *src, intptr_t i_src, int w, int h )
+;-----------------------------------------------------------------------------
+cglobal plane_copy_deinterleave, 6,7
+ DEINTERLEAVE_START
+ mov r6d, r6m
+ FIX_STRIDES r1, r3, r5, r6d
+%if HIGH_BIT_DEPTH
+ mov r6m, r6d
+%endif
+ add r0, r6
+ add r2, r6
+ lea r4, [r4+r6*2]
+.loopy:
+ mov r6d, r6m
+ neg r6
+.loopx:
+ DEINTERLEAVE r0+r6+0*SIZEOF_PIXEL, r2+r6+0*SIZEOF_PIXEL, r4+r6*2+ 0*SIZEOF_PIXEL, 0, m4, u
+ DEINTERLEAVE r0+r6+8*SIZEOF_PIXEL, r2+r6+8*SIZEOF_PIXEL, r4+r6*2+16*SIZEOF_PIXEL, 0, m4, u
+ add r6, 16*SIZEOF_PIXEL
+ jl .loopx
+ add r0, r1
+ add r2, r3
+ add r4, r5
+ dec dword r7m
+ jg .loopy
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, intptr_t i_src, int height )
+;-----------------------------------------------------------------------------
+cglobal load_deinterleave_chroma_fenc, 4,4
+ DEINTERLEAVE_START
+ FIX_STRIDES r2
+.loop:
+ DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
+ DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
+ add r0, FENC_STRIDEB*2
+ lea r1, [r1+r2*2]
+ sub r3d, 2
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, intptr_t i_src, int height )
+;-----------------------------------------------------------------------------
+cglobal load_deinterleave_chroma_fdec, 4,4
+ DEINTERLEAVE_START
+ FIX_STRIDES r2
+.loop:
+ DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
+ DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
+ add r0, FDEC_STRIDEB*2
+ lea r1, [r1+r2*2]
+ sub r3d, 2
+ jg .loop
+ REP_RET
+%endmacro ; PLANE_DEINTERLEAVE
+
+%if HIGH_BIT_DEPTH
+INIT_MMX mmx2
+PLANE_INTERLEAVE
+INIT_MMX mmx
+PLANE_DEINTERLEAVE
+INIT_XMM sse2
+PLANE_INTERLEAVE
+PLANE_DEINTERLEAVE
+INIT_XMM avx
+PLANE_INTERLEAVE
+PLANE_DEINTERLEAVE
+%else
+INIT_MMX mmx2
+PLANE_INTERLEAVE
+INIT_MMX mmx
+PLANE_DEINTERLEAVE
+INIT_XMM sse2
+PLANE_INTERLEAVE
+PLANE_DEINTERLEAVE
+INIT_XMM ssse3
+PLANE_DEINTERLEAVE
+%endif
+
+; These functions are not general-use; not only do the SSE ones require aligned input,
+; but they also will fail if given a non-mod16 size.
+; memzero SSE will fail for non-mod128.
+
+;-----------------------------------------------------------------------------
+; void *memcpy_aligned( void *dst, const void *src, size_t n );
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal memcpy_aligned_mmx, 3,3
+ test r2d, 16
+ jz .copy32start
+ movq mm0, [r1 + r2 - 16]
+ movq mm1, [r1 + r2 - 8]
+ movq [r0 + r2 - 16], mm0
+ movq [r0 + r2 - 8], mm1
+ sub r2d, 16
+.copy32start
+ test r2d, r2d
+ jz .ret
+.copy32:
+ movq mm0, [r1 + r2 - 32]
+ movq mm1, [r1 + r2 - 24]
+ movq mm2, [r1 + r2 - 16]
+ movq mm3, [r1 + r2 - 8]
+ movq [r0 + r2 - 32], mm0
+ movq [r0 + r2 - 24], mm1
+ movq [r0 + r2 - 16], mm2
+ movq [r0 + r2 - 8], mm3
+ sub r2d, 32
+ jg .copy32
+.ret
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void *memcpy_aligned( void *dst, const void *src, size_t n );
+;-----------------------------------------------------------------------------
+cglobal memcpy_aligned_sse2, 3,3
+ test r2d, 16
+ jz .copy32
+ movdqa xmm0, [r1 + r2 - 16]
+ movdqa [r0 + r2 - 16], xmm0
+ sub r2d, 16
+.copy32:
+ test r2d, 32
+ jz .copy64start
+ movdqa xmm0, [r1 + r2 - 32]
+ movdqa [r0 + r2 - 32], xmm0
+ movdqa xmm1, [r1 + r2 - 16]
+ movdqa [r0 + r2 - 16], xmm1
+ sub r2d, 32
+.copy64start
+ test r2d, r2d
+ jz .ret
+.copy64:
+ movdqa xmm0, [r1 + r2 - 64]
+ movdqa [r0 + r2 - 64], xmm0
+ movdqa xmm1, [r1 + r2 - 48]
+ movdqa [r0 + r2 - 48], xmm1
+ movdqa xmm2, [r1 + r2 - 32]
+ movdqa [r0 + r2 - 32], xmm2
+ movdqa xmm3, [r1 + r2 - 16]
+ movdqa [r0 + r2 - 16], xmm3
+ sub r2d, 64
+ jg .copy64
+.ret:
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void *memzero_aligned( void *dst, size_t n );
+;-----------------------------------------------------------------------------
+%macro MEMZERO 0
+cglobal memzero_aligned, 2,2
+ add r0, r1
+ neg r1
+ pxor m0, m0
+.loop:
+%assign i 0
+%rep 8
+ mova [r0 + r1 + i], m0
+%assign i i+mmsize
+%endrep
+ add r1, mmsize*8
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+MEMZERO
+INIT_XMM sse2
+MEMZERO
+
+
+
+%if HIGH_BIT_DEPTH == 0
+;-----------------------------------------------------------------------------
+; void integral_init4h( uint16_t *sum, uint8_t *pix, intptr_t stride )
+;-----------------------------------------------------------------------------
+INIT_XMM
+cglobal integral_init4h_sse4, 3,4
+ lea r3, [r0+r2*2]
+ add r1, r2
+ neg r2
+ pxor m4, m4
+.loop:
+ movdqa m0, [r1+r2]
+ movdqa m1, [r1+r2+16]
+ palignr m1, m0, 8
+ mpsadbw m0, m4, 0
+ mpsadbw m1, m4, 0
+ paddw m0, [r0+r2*2]
+ paddw m1, [r0+r2*2+16]
+ movdqa [r3+r2*2 ], m0
+ movdqa [r3+r2*2+16], m1
+ add r2, 16
+ jl .loop
+ REP_RET
+
+%macro INTEGRAL_INIT8H 0
+cglobal integral_init8h, 3,4
+ lea r3, [r0+r2*2]
+ add r1, r2
+ neg r2
+ pxor m4, m4
+.loop:
+ movdqa m0, [r1+r2]
+ movdqa m1, [r1+r2+16]
+ palignr m1, m0, 8
+ mpsadbw m2, m0, m4, 4
+ mpsadbw m3, m1, m4, 4
+ mpsadbw m0, m4, 0
+ mpsadbw m1, m4, 0
+ paddw m0, [r0+r2*2]
+ paddw m1, [r0+r2*2+16]
+ paddw m0, m2
+ paddw m1, m3
+ movdqa [r3+r2*2 ], m0
+ movdqa [r3+r2*2+16], m1
+ add r2, 16
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse4
+INTEGRAL_INIT8H
+INIT_XMM avx
+INTEGRAL_INIT8H
+%endif ; !HIGH_BIT_DEPTH
+
+%macro INTEGRAL_INIT_8V 0
+;-----------------------------------------------------------------------------
+; void integral_init8v( uint16_t *sum8, intptr_t stride )
+;-----------------------------------------------------------------------------
+cglobal integral_init8v, 3,3
+ shl r1, 1
+ add r0, r1
+ lea r2, [r0+r1*8]
+ neg r1
+.loop:
+ mova m0, [r2+r1]
+ mova m1, [r2+r1+mmsize]
+ psubw m0, [r0+r1]
+ psubw m1, [r0+r1+mmsize]
+ mova [r0+r1], m0
+ mova [r0+r1+mmsize], m1
+ add r1, 2*mmsize
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_MMX mmx
+INTEGRAL_INIT_8V
+INIT_XMM sse2
+INTEGRAL_INIT_8V
+
+;-----------------------------------------------------------------------------
+; void integral_init4v( uint16_t *sum8, uint16_t *sum4, intptr_t stride )
+;-----------------------------------------------------------------------------
+INIT_MMX
+cglobal integral_init4v_mmx, 3,5
+ shl r2, 1
+ lea r3, [r0+r2*4]
+ lea r4, [r0+r2*8]
+ mova m0, [r0+r2]
+ mova m4, [r4+r2]
+.loop:
+ mova m1, m4
+ psubw m1, m0
+ mova m4, [r4+r2-8]
+ mova m0, [r0+r2-8]
+ paddw m1, m4
+ mova m3, [r3+r2-8]
+ psubw m1, m0
+ psubw m3, m0
+ mova [r0+r2-8], m1
+ mova [r1+r2-8], m3
+ sub r2, 8
+ jge .loop
+ REP_RET
+
+INIT_XMM
+cglobal integral_init4v_sse2, 3,5
+ shl r2, 1
+ add r0, r2
+ add r1, r2
+ lea r3, [r0+r2*4]
+ lea r4, [r0+r2*8]
+ neg r2
+.loop:
+ mova m0, [r0+r2]
+ mova m1, [r4+r2]
+ mova m2, m0
+ mova m4, m1
+ shufpd m0, [r0+r2+16], 1
+ shufpd m1, [r4+r2+16], 1
+ paddw m0, m2
+ paddw m1, m4
+ mova m3, [r3+r2]
+ psubw m1, m0
+ psubw m3, m2
+ mova [r0+r2], m1
+ mova [r1+r2], m3
+ add r2, 16
+ jl .loop
+ REP_RET
+
+cglobal integral_init4v_ssse3, 3,5
+ shl r2, 1
+ add r0, r2
+ add r1, r2
+ lea r3, [r0+r2*4]
+ lea r4, [r0+r2*8]
+ neg r2
+.loop:
+ mova m2, [r0+r2]
+ mova m0, [r0+r2+16]
+ mova m4, [r4+r2]
+ mova m1, [r4+r2+16]
+ palignr m0, m2, 8
+ palignr m1, m4, 8
+ paddw m0, m2
+ paddw m1, m4
+ mova m3, [r3+r2]
+ psubw m1, m0
+ psubw m3, m2
+ mova [r0+r2], m1
+ mova [r1+r2], m3
+ add r2, 16
+ jl .loop
+ REP_RET
+
+%macro FILT8x4 7
+ mova %3, [r0+%7]
+ mova %4, [r0+r5+%7]
+ pavgb %3, %4
+ pavgb %4, [r0+r5*2+%7]
+ PALIGNR %1, %3, 1, m6
+ PALIGNR %2, %4, 1, m6
+%if cpuflag(xop)
+ pavgb %1, %3
+ pavgb %2, %4
+%else
+ pavgb %1, %3
+ pavgb %2, %4
+ psrlw %5, %1, 8
+ psrlw %6, %2, 8
+ pand %1, m7
+ pand %2, m7
+%endif
+%endmacro
+
+%macro FILT16x2 4
+ mova m3, [r0+%4+mmsize]
+ mova m2, [r0+%4]
+ pavgb m3, [r0+%4+r5+mmsize]
+ pavgb m2, [r0+%4+r5]
+ PALIGNR %1, m3, 1, m6
+ pavgb %1, m3
+ PALIGNR m3, m2, 1, m6
+ pavgb m3, m2
+%if cpuflag(xop)
+ vpperm m5, m3, %1, m7
+ vpperm m3, m3, %1, m6
+%else
+ psrlw m5, m3, 8
+ psrlw m4, %1, 8
+ pand m3, m7
+ pand %1, m7
+ packuswb m3, %1
+ packuswb m5, m4
+%endif
+ mova [%2], m3
+ mova [%3], m5
+ mova %1, m2
+%endmacro
+
+%macro FILT8x2U 3
+ mova m3, [r0+%3+8]
+ mova m2, [r0+%3]
+ pavgb m3, [r0+%3+r5+8]
+ pavgb m2, [r0+%3+r5]
+ mova m1, [r0+%3+9]
+ mova m0, [r0+%3+1]
+ pavgb m1, [r0+%3+r5+9]
+ pavgb m0, [r0+%3+r5+1]
+ pavgb m1, m3
+ pavgb m0, m2
+ psrlw m3, m1, 8
+ psrlw m2, m0, 8
+ pand m1, m7
+ pand m0, m7
+ packuswb m0, m1
+ packuswb m2, m3
+ mova [%1], m0
+ mova [%2], m2
+%endmacro
+
+%macro FILT8xU 3
+ mova m3, [r0+%3+8]
+ mova m2, [r0+%3]
+ pavgw m3, [r0+%3+r5+8]
+ pavgw m2, [r0+%3+r5]
+ movu m1, [r0+%3+10]
+ movu m0, [r0+%3+2]
+ pavgw m1, [r0+%3+r5+10]
+ pavgw m0, [r0+%3+r5+2]
+ pavgw m1, m3
+ pavgw m0, m2
+ psrld m3, m1, 16
+ psrld m2, m0, 16
+ pand m1, m7
+ pand m0, m7
+ packssdw m0, m1
+ packssdw m2, m3
+ movu [%1], m0
+ mova [%2], m2
+%endmacro
+
+%macro FILT8xA 4
+ mova m3, [r0+%4+mmsize]
+ mova m2, [r0+%4]
+ pavgw m3, [r0+%4+r5+mmsize]
+ pavgw m2, [r0+%4+r5]
+ PALIGNR %1, m3, 2, m6
+ pavgw %1, m3
+ PALIGNR m3, m2, 2, m6
+ pavgw m3, m2
+%if cpuflag(xop)
+ vpperm m5, m3, %1, m7
+ vpperm m3, m3, %1, m6
+%else
+ psrld m5, m3, 16
+ psrld m4, %1, 16
+ pand m3, m7
+ pand %1, m7
+ packssdw m3, %1
+ packssdw m5, m4
+%endif
+ mova [%2], m3
+ mova [%3], m5
+ mova %1, m2
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+; intptr_t src_stride, intptr_t dst_stride, int width, int height )
+;-----------------------------------------------------------------------------
+%macro FRAME_INIT_LOWRES 0
+cglobal frame_init_lowres_core, 6,7,(12-4*(BIT_DEPTH/9)) ; 8 for HIGH_BIT_DEPTH, 12 otherwise
+%if HIGH_BIT_DEPTH
+ shl dword r6m, 1
+ FIX_STRIDES r5
+ shl dword r7m, 1
+%endif
+ ; src += 2*(height-1)*stride + 2*width
+ mov r6d, r8m
+ dec r6d
+ imul r6d, r5d
+ add r6d, r7m
+ lea r0, [r0+r6*2]
+ ; dst += (height-1)*stride + width
+ mov r6d, r8m
+ dec r6d
+ imul r6d, r6m
+ add r6d, r7m
+ add r1, r6
+ add r2, r6
+ add r3, r6
+ add r4, r6
+ ; gap = stride - width
+ mov r6d, r6m
+ sub r6d, r7m
+ PUSH r6
+ %define dst_gap [rsp+gprsize]
+ mov r6d, r5d
+ sub r6d, r7m
+ shl r6d, 1
+ PUSH r6
+ %define src_gap [rsp]
+%if HIGH_BIT_DEPTH
+%if cpuflag(xop)
+ mova m6, [deinterleave_shuf32a]
+ mova m7, [deinterleave_shuf32b]
+%else
+ pcmpeqw m7, m7
+ psrld m7, 16
+%endif
+.vloop:
+ mov r6d, r7m
+%ifnidn cpuname, mmx2
+ mova m0, [r0]
+ mova m1, [r0+r5]
+ pavgw m0, m1
+ pavgw m1, [r0+r5*2]
+%endif
+.hloop:
+ sub r0, mmsize*2
+ sub r1, mmsize
+ sub r2, mmsize
+ sub r3, mmsize
+ sub r4, mmsize
+%ifidn cpuname, mmx2
+ FILT8xU r1, r2, 0
+ FILT8xU r3, r4, r5
+%else
+ FILT8xA m0, r1, r2, 0
+ FILT8xA m1, r3, r4, r5
+%endif
+ sub r6d, mmsize
+ jg .hloop
+%else ; !HIGH_BIT_DEPTH
+%if mmsize == 16
+ ; adjust for the odd end case
+ mov r6d, r7m
+ and r6d, 8
+ sub r1, r6
+ sub r2, r6
+ sub r3, r6
+ sub r4, r6
+ add dst_gap, r6d
+%endif ; mmsize
+%if cpuflag(xop)
+ mova m6, [deinterleave_shuf32a]
+ mova m7, [deinterleave_shuf32b]
+%else
+ pcmpeqb m7, m7
+ psrlw m7, 8
+%endif
+.vloop:
+ mov r6d, r7m
+%ifnidn cpuname, mmx2
+ mova m0, [r0]
+ mova m1, [r0+r5]
+ pavgb m0, m1
+ pavgb m1, [r0+r5*2]
+%endif
+%if mmsize == 16
+ test r6d, 8
+ jz .hloop
+ sub r0, 16
+ FILT8x4 m0, m1, m2, m3, m4, m5, 0
+%if cpuflag(xop)
+ mova m4, m0
+ vpperm m0, m4, m1, m6
+ vpperm m1, m4, m1, m7
+ movq [r1], m0
+ movq [r2], m1
+ movhps [r3], m0
+ movhps [r4], m1
+%else
+ packuswb m0, m4
+ packuswb m1, m5
+ movq [r1], m0
+ movhps [r2], m0
+ movq [r3], m1
+ movhps [r4], m1
+%endif
+ mova m0, m2
+ mova m1, m3
+ sub r6d, 8
+ jz .skip
+%endif ; mmsize
+.hloop:
+ sub r0, mmsize*2
+ sub r1, mmsize
+ sub r2, mmsize
+ sub r3, mmsize
+ sub r4, mmsize
+%ifdef m8
+ FILT8x4 m0, m1, m2, m3, m10, m11, mmsize
+ mova m8, m0
+ mova m9, m1
+ FILT8x4 m2, m3, m0, m1, m4, m5, 0
+%if cpuflag(xop)
+ vpperm m4, m2, m8, m7
+ vpperm m2, m2, m8, m6
+ vpperm m5, m3, m9, m7
+ vpperm m3, m3, m9, m6
+%else
+ packuswb m2, m8
+ packuswb m3, m9
+ packuswb m4, m10
+ packuswb m5, m11
+%endif
+ mova [r1], m2
+ mova [r2], m4
+ mova [r3], m3
+ mova [r4], m5
+%elifidn cpuname, mmx2
+ FILT8x2U r1, r2, 0
+ FILT8x2U r3, r4, r5
+%else
+ FILT16x2 m0, r1, r2, 0
+ FILT16x2 m1, r3, r4, r5
+%endif
+ sub r6d, mmsize
+ jg .hloop
+%endif ; HIGH_BIT_DEPTH
+.skip:
+ mov r6, dst_gap
+ sub r0, src_gap
+ sub r1, r6
+ sub r2, r6
+ sub r3, r6
+ sub r4, r6
+ dec dword r8m
+ jg .vloop
+ ADD rsp, 2*gprsize
+ emms
+ RET
+%endmacro ; FRAME_INIT_LOWRES
+
+INIT_MMX mmx2
+FRAME_INIT_LOWRES
+%if ARCH_X86_64 == 0
+INIT_MMX cache32, mmx2
+FRAME_INIT_LOWRES
+%endif
+INIT_XMM sse2
+FRAME_INIT_LOWRES
+INIT_XMM ssse3
+FRAME_INIT_LOWRES
+INIT_XMM avx
+FRAME_INIT_LOWRES
+INIT_XMM xop
+FRAME_INIT_LOWRES
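+
+; Note: pavgb/pavgw round as (a+b+1)>>1, so the two-stage averages in
+; FRAME_INIT_LOWRES are bit-exact with the C reference FILTER() in pixel.cpp:
+; FILTER(a,b,c,d) = (((a+b+1)>>1) + ((c+d+1)>>1) + 1) >> 1 = pavg(pavg(a,b), pavg(c,d))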
+
+;-----------------------------------------------------------------------------
+; void mbtree_propagate_cost( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
+; uint16_t *inter_costs, uint16_t *inv_qscales, float *fps_factor, int len )
+;-----------------------------------------------------------------------------
+%macro MBTREE 0
+cglobal mbtree_propagate_cost, 7,7,7
+ add r6d, r6d
+ lea r0, [r0+r6*2]
+ add r1, r6
+ add r2, r6
+ add r3, r6
+ add r4, r6
+ neg r6
+ pxor xmm4, xmm4
+ movss xmm6, [r5]
+ shufps xmm6, xmm6, 0
+ mulps xmm6, [pf_inv256]
+ movdqa xmm5, [pw_3fff]
+.loop:
+ movq xmm2, [r2+r6] ; intra
+ movq xmm0, [r4+r6] ; invq
+ movq xmm3, [r3+r6] ; inter
+ movq xmm1, [r1+r6] ; prop
+ punpcklwd xmm2, xmm4
+ punpcklwd xmm0, xmm4
+ pmaddwd xmm0, xmm2
+ pand xmm3, xmm5
+ punpcklwd xmm1, xmm4
+ punpcklwd xmm3, xmm4
+%if cpuflag(fma4)
+ cvtdq2ps xmm0, xmm0
+ cvtdq2ps xmm1, xmm1
+ vfmaddps xmm0, xmm0, xmm6, xmm1
+ cvtdq2ps xmm1, xmm2
+ psubd xmm2, xmm3
+ cvtdq2ps xmm2, xmm2
+ rcpps xmm3, xmm1
+ mulps xmm1, xmm3
+ mulps xmm0, xmm2
+ addps xmm2, xmm3, xmm3
+ vfnmaddps xmm3, xmm1, xmm3, xmm2
+ mulps xmm0, xmm3
+%else
+ cvtdq2ps xmm0, xmm0
+ mulps xmm0, xmm6 ; intra*invq*fps_factor>>8
+ cvtdq2ps xmm1, xmm1 ; prop
+ addps xmm0, xmm1 ; prop + (intra*invq*fps_factor>>8)
+ cvtdq2ps xmm1, xmm2 ; intra
+ psubd xmm2, xmm3 ; intra - inter
+ cvtdq2ps xmm2, xmm2 ; intra - inter
+ rcpps xmm3, xmm1 ; 1 / intra 1st approximation
+ mulps xmm1, xmm3 ; intra * (1/intra 1st approx)
+ mulps xmm1, xmm3 ; intra * (1/intra 1st approx)^2
+ mulps xmm0, xmm2 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+ addps xmm3, xmm3 ; 2 * (1/intra 1st approx)
+ subps xmm3, xmm1 ; 2nd approximation for 1/intra
+ mulps xmm0, xmm3 ; / intra
+%endif
+ cvtps2dq xmm0, xmm0
+ movdqa [r0+r6*2], xmm0
+ add r6, 8
+ jl .loop
+ REP_RET
+%endmacro
+
+INIT_XMM sse2
+MBTREE
+; Bulldozer only has a 128-bit float unit, so the AVX version of this function is actually slower.
+INIT_XMM fma4
+MBTREE
+
+%macro INT16_TO_FLOAT 1
+ vpunpckhwd xmm4, xmm%1, xmm7
+ vpunpcklwd xmm%1, xmm7
+ vinsertf128 ymm%1, ymm%1, xmm4, 1
+ vcvtdq2ps ymm%1, ymm%1
+%endmacro
+
+; FIXME: align loads/stores to 16 bytes
+INIT_YMM avx
+cglobal mbtree_propagate_cost, 7,7,8
+ add r6d, r6d
+ lea r0, [r0+r6*2]
+ add r1, r6
+ add r2, r6
+ add r3, r6
+ add r4, r6
+ neg r6
+ vmovdqa xmm5, [pw_3fff]
+ vbroadcastss ymm6, [r5]
+ vmulps ymm6, ymm6, [pf_inv256]
+ vpxor xmm7, xmm7
+.loop:
+ vmovdqu xmm0, [r2+r6] ; intra
+ vmovdqu xmm1, [r4+r6] ; invq
+ vmovdqu xmm2, [r1+r6] ; prop
+ vpand xmm3, xmm5, [r3+r6] ; inter
+ INT16_TO_FLOAT 0
+ INT16_TO_FLOAT 1
+ INT16_TO_FLOAT 2
+ INT16_TO_FLOAT 3
+ vmulps ymm1, ymm1, ymm0
+ vsubps ymm4, ymm0, ymm3
+ vmulps ymm1, ymm1, ymm6 ; intra*invq*fps_factor>>8
+ vaddps ymm1, ymm1, ymm2 ; prop + (intra*invq*fps_factor>>8)
+ vrcpps ymm3, ymm0 ; 1 / intra 1st approximation
+ vmulps ymm2, ymm0, ymm3 ; intra * (1/intra 1st approx)
+ vmulps ymm2, ymm2, ymm3 ; intra * (1/intra 1st approx)^2
+ vmulps ymm1, ymm1, ymm4 ; (prop + (intra*invq*fps_factor>>8)) * (intra - inter)
+ vaddps ymm3, ymm3, ymm3 ; 2 * (1/intra 1st approx)
+ vsubps ymm3, ymm3, ymm2 ; 2nd approximation for 1/intra
+ vmulps ymm1, ymm1, ymm3 ; / intra
+ vcvtps2dq ymm1, ymm1
+ vmovdqu [r0+r6*2], ymm1
+ add r6, 16
+ jl .loop
+ vzeroupper
+ RET
diff -r f2f70fa9b4f3 -r 2454a81c67fa source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Fri Jul 26 02:19:06 2013 -0500
+++ b/source/common/x86/pixel.h Fri Jul 26 14:12:31 2013 -0700
@@ -202,6 +202,9 @@
uint64_t x265_pixel_sa8d_satd_16x16_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
uint64_t x265_pixel_sa8d_satd_16x16_avx2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2 );
+void x265_frame_init_lowres_core_mmx2( pixel *src0, pixel *dst0, pixel *dsth, pixel *dstv, pixel *dstc,
+ intptr_t src_stride, intptr_t dst_stride, int width, int height );
+
#define DECL_SSD(width,suffix)\
int x265_pixel_ssd_##width##x64_##suffix( pixel *, intptr_t, pixel *, intptr_t ); \
int x265_pixel_ssd_##width##x48_##suffix( pixel *, intptr_t, pixel *, intptr_t ); \
diff -r f2f70fa9b4f3 -r 2454a81c67fa source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Fri Jul 26 02:19:06 2013 -0500
+++ b/source/test/pixelharness.cpp Fri Jul 26 14:12:31 2013 -0700
@@ -54,11 +54,13 @@
{
pbuf1 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);
pbuf2 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);
+ pbuf3 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);
+ pbuf4 = (pixel*)TestHarness::alignedMalloc(sizeof(pixel), 64 * 64 * 32, 32);
sbuf1 = (short*)TestHarness::alignedMalloc(sizeof(short), 64 * 64 * 32, 32);
sbuf2 = (short*)TestHarness::alignedMalloc(sizeof(short), 64 * 64 * 32, 32);
- if (!pbuf1 || !pbuf2)
+ if (!pbuf1 || !pbuf2 || !pbuf3 || !pbuf4 )
{
fprintf(stderr, "malloc failed, unable to initiate tests!\n");
exit(1);
@@ -69,6 +71,8 @@
//Generate the Random Buffer for Testing
pbuf1[i] = rand() & PIXEL_MAX;
pbuf2[i] = rand() & PIXEL_MAX;
+ pbuf3[i] = rand() & PIXEL_MAX;
+ pbuf4[i] = rand() & PIXEL_MAX;
sbuf1[i] = rand() & PIXEL_MAX;
sbuf2[i] = rand() & PIXEL_MAX;
@@ -79,6 +83,8 @@
{
TestHarness::alignedFree(pbuf1);
TestHarness::alignedFree(pbuf2);
+ TestHarness::alignedFree(pbuf3);
+ TestHarness::alignedFree(pbuf4);
TestHarness::alignedFree(sbuf1);
TestHarness::alignedFree(sbuf2);
}
@@ -423,6 +429,46 @@
return true;
}
+bool PixelHarness::check_downscale_t(x265::downscale_t ref, x265::downscale_t opt)
+{
+ ALIGN_VAR_16(pixel, ref_dest0[64 * 64]);
+ ALIGN_VAR_16(pixel, opt_dest0[64 * 64]);
+
+ ALIGN_VAR_16(pixel, ref_desth[64 * 64]);
+ ALIGN_VAR_16(pixel, opt_desth[64 * 64]);
+
+ ALIGN_VAR_16(pixel, ref_destv[64 * 64]);
+ ALIGN_VAR_16(pixel, opt_destv[64 * 64]);
+
+ ALIGN_VAR_16(pixel, ref_destc[64 * 64]);
+ ALIGN_VAR_16(pixel, opt_destc[64 * 64]);
+
+ int bx = 64;
+ int by = 64;
+ int j = 0;
+ for (int i = 0; i <= 100; i++)
+ {
+ ref(pbuf2 + j, ref_dest0, ref_desth, ref_destv, ref_destc, 64, 64, bx, by);
+ opt(pbuf2 + j, opt_dest0, opt_desth, opt_destv, opt_destc, 64, 64, bx, by);
+
+
+ if (memcmp(ref_dest0, opt_dest0, 64 * 64 * sizeof(pixel)))
+ return false;
+ if (memcmp(ref_desth, opt_desth, 64 * 64 * sizeof(pixel)))
+ return false;
+ if (memcmp(ref_destv, opt_destv, 64 * 64 * sizeof(pixel)))
+ return false;
+ if (memcmp(ref_destc, opt_destc, 64 * 64 * sizeof(pixel)))
+ return false;
+
+ j += 4;
+ bx = 8 * ((rand() & 7) + 1);
+ by = 8 * ((rand() & 7) + 1);
+ }
+
+ return true;
+}
+
bool PixelHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
for (uint16_t curpar = 0; curpar < NUM_PARTITIONS; curpar++)
@@ -600,6 +646,14 @@
}
}
+ if (opt.frame_init_lowres_core)
+ {
+ if (!check_downscale_t(ref.frame_init_lowres_core, opt.frame_init_lowres_core))
+ {
+ printf("downscale failed!\n");
+ return false;
+ }
+ }
return true;
}
@@ -726,4 +780,10 @@
printf("pixel_pp add");
REPORT_SPEEDUP(opt.pixeladd_pp, ref.pixeladd_pp, 64, 64, pbuf1, FENC_STRIDE, pbuf2, pbuf1, STRIDE, STRIDE);
}
+
+ if (opt.frame_init_lowres_core)
+ {
+ printf("downscale");
+ REPORT_SPEEDUP(opt.frame_init_lowres_core, ref.frame_init_lowres_core, pbuf2, pbuf1, pbuf2, pbuf3, pbuf4, 64, 64, 64, 64);
+ }
}
diff -r f2f70fa9b4f3 -r 2454a81c67fa source/test/pixelharness.h
--- a/source/test/pixelharness.h Fri Jul 26 02:19:06 2013 -0500
+++ b/source/test/pixelharness.h Fri Jul 26 14:12:31 2013 -0700
@@ -31,8 +31,8 @@
{
protected:
- pixel *pbuf1, *pbuf2;
-
+ pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4;
+
short *sbuf1, *sbuf2;
bool check_pixelcmp(x265::pixelcmp_t ref, x265::pixelcmp_t opt);
@@ -50,6 +50,7 @@
bool check_pixelsub_sp(x265::pixelsub_sp_t ref, x265::pixelsub_sp_t opt);
bool check_pixeladd_ss(x265::pixeladd_ss_t ref, x265::pixeladd_ss_t opt);
bool check_pixeladd_pp(x265::pixeladd_pp_t ref, x265::pixeladd_pp_t opt);
+ bool check_downscale_t(x265::downscale_t ref, x265::downscale_t opt);
public:
-------------- next part --------------
A non-text attachment was scrubbed...
Name: x265_public.patch
Type: text/x-patch
Size: 55815 bytes
Desc: not available
URL: <http://mailman.videolan.org/private/x265-devel/attachments/20130726/76baf7f8/attachment-0001.bin>