[x264-devel] [Git][videolan/x264][master] 2 commits: x86: Fix exhaustive search ME asm
Anton Mitrofanov
gitlab at videolan.org
Sun Oct 25 18:31:38 CET 2020
Anton Mitrofanov pushed to branch master at VideoLAN / x264
Commits:
58dcf16f by Anton Mitrofanov at 2020-10-25T18:22:37+01:00
x86: Fix exhaustive search ME asm
- - - - -
be3c1492 by Anton Mitrofanov at 2020-10-25T18:22:37+01:00
x86: Remove workaround for nasm on macho64
- - - - -
9 changed files:
- common/bitstream.c
- common/pixel.c
- common/quant.c
- common/x86/pixel-a.asm
- common/x86/x86inc.asm
- common/x86/x86util.asm
- encoder/cabac.c
- encoder/rdo.c
- tools/checkasm.c
Changes:
=====================================
common/bitstream.c
=====================================
@@ -109,7 +109,7 @@ void x264_bitstream_init( uint32_t cpu, x264_bitstream_function_t *pf )
pf->nal_escape = nal_escape_c;
#if HAVE_MMX
-#if ARCH_X86_64 && !defined( __MACH__ )
+#if ARCH_X86_64
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_sse2;
pf->cabac_block_residual_rd_internal = x264_cabac_block_residual_rd_internal_sse2;
pf->cabac_block_residual_8x8_rd_internal = x264_cabac_block_residual_8x8_rd_internal_sse2;
@@ -122,7 +122,7 @@ void x264_bitstream_init( uint32_t cpu, x264_bitstream_function_t *pf )
if( cpu&X264_CPU_SSE2_IS_FAST )
pf->nal_escape = x264_nal_escape_sse2;
}
-#if ARCH_X86_64 && !defined( __MACH__ )
+#if ARCH_X86_64
if( cpu&X264_CPU_LZCNT )
{
pf->cabac_block_residual_internal = x264_cabac_block_residual_internal_lzcnt;
=====================================
common/pixel.c
=====================================
@@ -888,7 +888,6 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
INIT7( satd_x4, _mmx2 );
INIT4( hadamard_ac, _mmx2 );
INIT8( ssd, _mmx2 );
- INIT_ADS( _mmx2 );
pixf->intra_sad_x3_4x4 = intra_sad_x3_4x4_mmx2;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
@@ -961,9 +960,7 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
INIT7( sad, _ssse3 );
INIT7( sad_x3, _ssse3 );
INIT7( sad_x4, _ssse3 );
-#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _ssse3 );
-#endif
INIT6( satd, _ssse3 );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_ssse3;
@@ -1003,9 +1000,7 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_AVX )
{
INIT5_NAME( sad_aligned, sad, _ssse3 ); /* AVX-capable CPUs doesn't benefit from an aligned version */
-#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx );
-#endif
INIT6( satd, _avx );
pixf->satd[PIXEL_4x16] = x264_pixel_satd_4x16_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
@@ -1043,6 +1038,7 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
INIT2_NAME( sad_aligned, sad, _avx2 );
INIT2( sad_x3, _avx2 );
INIT2( sad_x4, _avx2 );
+ INIT_ADS( _avx2 );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
@@ -1201,9 +1197,7 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x9_8x8 = x264_intra_sa8d_x9_8x8_ssse3;
#endif
}
-#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _ssse3 );
-#endif
if( cpu&X264_CPU_SLOW_ATOM )
{
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3_atom;
@@ -1286,9 +1280,7 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
INIT8( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
-#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx );
-#endif
INIT4( hadamard_ac, _avx );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
@@ -1341,9 +1333,7 @@ void x264_pixel_init( uint32_t cpu, x264_pixel_function_t *pixf )
INIT2( sad_x4, _avx2 );
INIT4( satd, _avx2 );
INIT2( hadamard_ac, _avx2 );
-#if ARCH_X86 || !defined( __MACH__ )
INIT_ADS( _avx2 );
-#endif
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx2;
=====================================
common/quant.c
=====================================
@@ -648,7 +648,6 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
pf->decimate_score16 = x264_decimate_score16_ssse3;
pf->decimate_score64 = x264_decimate_score64_ssse3;
INIT_TRELLIS( ssse3 );
-#if ARCH_X86 || !defined( __MACH__ )
pf->coeff_level_run4 = x264_coeff_level_run4_ssse3;
pf->coeff_level_run8 = x264_coeff_level_run8_ssse3;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3;
@@ -660,7 +659,6 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_ssse3_lzcnt;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_ssse3_lzcnt;
}
-#endif
}
if( cpu&X264_CPU_SSE4 )
@@ -711,10 +709,8 @@ void x264_quant_init( x264_t *h, uint32_t cpu, x264_quant_function_t *pf )
pf->decimate_score64 = x264_decimate_score64_avx2;
pf->denoise_dct = x264_denoise_dct_avx2;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_avx2;
-#if ARCH_X86 || !defined( __MACH__ )
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_avx2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_avx2;
-#endif
}
if( cpu&X264_CPU_AVX512 )
{
=====================================
common/x86/pixel-a.asm
=====================================
@@ -5220,13 +5220,13 @@ ASD8
shl r2d, 1
%endmacro
-%macro ADS_END 1 ; unroll_size
- add r1, 8*%1
- add r3, 8*%1
- add r6, 4*%1
- sub r0d, 4*%1
- jg .loop
- WIN64_RESTORE_XMM
+%macro ADS_END 1-2 .loop ; unroll_size, loop_label
+ add r1, 2*%1
+ add r3, 2*%1
+ add r6, %1
+ sub r0d, %1
+ jg %2
+ WIN64_RESTORE_XMM_INTERNAL
%if mmsize==32
vzeroupper
%endif
@@ -5243,105 +5243,220 @@ ASD8
; int pixel_ads4( int enc_dc[4], uint16_t *sums, int delta,
; uint16_t *cost_mvx, int16_t *mvs, int width, int thresh )
;-----------------------------------------------------------------------------
-INIT_MMX mmx2
-cglobal pixel_ads4, 5,7
- mova m6, [r0]
- mova m4, [r0+8]
- pshufw m7, m6, 0
- pshufw m6, m6, q2222
- pshufw m5, m4, 0
- pshufw m4, m4, q2222
+%if HIGH_BIT_DEPTH
+
+%macro ADS_XMM 0
+%if ARCH_X86_64
+cglobal pixel_ads4, 5,7,9
+%else
+cglobal pixel_ads4, 5,7,8
+%endif
+%if mmsize >= 32
+ vpbroadcastd m7, [r0+ 0]
+ vpbroadcastd m6, [r0+ 4]
+ vpbroadcastd m5, [r0+ 8]
+ vpbroadcastd m4, [r0+12]
+%else
+ mova m4, [r0]
+ pshufd m7, m4, 0
+ pshufd m6, m4, q1111
+ pshufd m5, m4, q2222
+ pshufd m4, m4, q3333
+%endif
+%if ARCH_X86_64
+ SPLATD m8, r6m
+%endif
ADS_START
.loop:
- movu m0, [r1]
- movu m1, [r1+16]
- psubw m0, m7
- psubw m1, m6
- ABSW m0, m0, m2
- ABSW m1, m1, m3
- movu m2, [r1+r2]
- movu m3, [r1+r2+16]
- psubw m2, m5
- psubw m3, m4
- paddw m0, m1
- ABSW m2, m2, m1
- ABSW m3, m3, m1
- paddw m0, m2
- paddw m0, m3
- pshufw m1, r6m, 0
- paddusw m0, [r3]
- psubusw m1, m0
- packsswb m1, m1
+%if cpuflag(avx)
+ pmovzxwd m0, [r1]
+ pmovzxwd m1, [r1+16]
+%else
+ movh m0, [r1]
+ movh m1, [r1+16]
+ pxor m3, m3
+ punpcklwd m0, m3
+ punpcklwd m1, m3
+%endif
+ psubd m0, m7
+ psubd m1, m6
+ ABSD m0, m0, m2
+ ABSD m1, m1, m3
+%if cpuflag(avx)
+ pmovzxwd m2, [r1+r2]
+ pmovzxwd m3, [r1+r2+16]
+ paddd m0, m1
+%else
+ movh m2, [r1+r2]
+ movh m3, [r1+r2+16]
+ paddd m0, m1
+ pxor m1, m1
+ punpcklwd m2, m1
+ punpcklwd m3, m1
+%endif
+ psubd m2, m5
+ psubd m3, m4
+ ABSD m2, m2, m1
+ ABSD m3, m3, m1
+ paddd m0, m2
+ paddd m0, m3
+%if cpuflag(avx)
+ pmovzxwd m1, [r3]
+%else
+ movh m1, [r3]
+ pxor m3, m3
+ punpcklwd m1, m3
+%endif
+ paddd m0, m1
+%if ARCH_X86_64
+ psubd m1, m8, m0
+%else
+ SPLATD m1, r6m
+ psubd m1, m0
+%endif
+ packssdw m1, m1
+%if mmsize == 32
+ vpermq m1, m1, q3120
+ packuswb m1, m1
+ movq [r6], xm1
+%else
+ packuswb m1, m1
movd [r6], m1
- ADS_END 1
+%endif
+ ADS_END mmsize/4
-cglobal pixel_ads2, 5,7
- mova m6, [r0]
- pshufw m5, r6m, 0
- pshufw m7, m6, 0
- pshufw m6, m6, q2222
+cglobal pixel_ads2, 5,7,8
+%if mmsize >= 32
+ vpbroadcastd m7, [r0+0]
+ vpbroadcastd m6, [r0+4]
+ vpbroadcastd m5, r6m
+%else
+ movq m6, [r0]
+ movd m5, r6m
+ pshufd m7, m6, 0
+ pshufd m6, m6, q1111
+ pshufd m5, m5, 0
+%endif
+ pxor m4, m4
ADS_START
.loop:
- movu m0, [r1]
- movu m1, [r1+r2]
- psubw m0, m7
- psubw m1, m6
- ABSW m0, m0, m2
- ABSW m1, m1, m3
- paddw m0, m1
- paddusw m0, [r3]
- mova m4, m5
- psubusw m4, m0
- packsswb m4, m4
- movd [r6], m4
- ADS_END 1
-
-cglobal pixel_ads1, 5,7
- pshufw m7, [r0], 0
- pshufw m6, r6m, 0
+%if cpuflag(avx)
+ pmovzxwd m0, [r1]
+ pmovzxwd m1, [r1+r2]
+ pmovzxwd m2, [r3]
+%else
+ movh m0, [r1]
+ movh m1, [r1+r2]
+ movh m2, [r3]
+ punpcklwd m0, m4
+ punpcklwd m1, m4
+ punpcklwd m2, m4
+%endif
+ psubd m0, m7
+ psubd m1, m6
+ ABSD m0, m0, m3
+ ABSD m1, m1, m3
+ paddd m0, m1
+ paddd m0, m2
+ psubd m1, m5, m0
+ packssdw m1, m1
+%if mmsize == 32
+ vpermq m1, m1, q3120
+ packuswb m1, m1
+ movq [r6], xm1
+%else
+ packuswb m1, m1
+ movd [r6], m1
+%endif
+ ADS_END mmsize/4
+
+cglobal pixel_ads1, 5,7,8
+%if mmsize >= 32
+ vpbroadcastd m7, [r0]
+ vpbroadcastd m6, r6m
+%else
+ movd m7, [r0]
+ movd m6, r6m
+ pshufd m7, m7, 0
+ pshufd m6, m6, 0
+%endif
+ pxor m5, m5
ADS_START
.loop:
- movu m0, [r1]
- movu m1, [r1+8]
- psubw m0, m7
- psubw m1, m7
- ABSW m0, m0, m2
- ABSW m1, m1, m3
- paddusw m0, [r3]
- paddusw m1, [r3+8]
- mova m4, m6
- mova m5, m6
- psubusw m4, m0
- psubusw m5, m1
- packsswb m4, m5
- mova [r6], m4
- ADS_END 2
+ movu m1, [r1]
+ movu m3, [r3]
+ punpcklwd m0, m1, m5
+ punpckhwd m1, m5
+ punpcklwd m2, m3, m5
+ punpckhwd m3, m5
+ psubd m0, m7
+ psubd m1, m7
+ ABSD m0, m0, m4
+ ABSD m1, m1, m4
+ paddd m0, m2
+ paddd m1, m3
+ psubd m2, m6, m0
+ psubd m3, m6, m1
+ packssdw m2, m3
+ packuswb m2, m2
+%if mmsize == 32
+ vpermq m2, m2, q3120
+ mova [r6], xm2
+%else
+ movq [r6], m2
+%endif
+ ADS_END mmsize/2
+%endmacro
+
+INIT_XMM sse2
+ADS_XMM
+INIT_XMM ssse3
+ADS_XMM
+INIT_XMM avx
+ADS_XMM
+INIT_YMM avx2
+ADS_XMM
+
+%else ; !HIGH_BIT_DEPTH
%macro ADS_XMM 0
-%if mmsize==32
+%if ARCH_X86_64 && mmsize == 16
+cglobal pixel_ads4, 5,7,12
+%elif ARCH_X86_64 && mmsize != 8
+cglobal pixel_ads4, 5,7,9
+%else
cglobal pixel_ads4, 5,7,8
+%endif
+ test dword r6m, 0xffff0000
+%if mmsize >= 32
vpbroadcastw m7, [r0+ 0]
vpbroadcastw m6, [r0+ 4]
vpbroadcastw m5, [r0+ 8]
vpbroadcastw m4, [r0+12]
-%else
-cglobal pixel_ads4, 5,7,12
- mova m4, [r0]
- pshuflw m7, m4, q0000
- pshuflw m6, m4, q2222
- pshufhw m5, m4, q0000
- pshufhw m4, m4, q2222
+%elif mmsize == 16
+ mova m4, [r0]
+ pshuflw m7, m4, 0
+ pshuflw m6, m4, q2222
+ pshufhw m5, m4, 0
+ pshufhw m4, m4, q2222
punpcklqdq m7, m7
punpcklqdq m6, m6
punpckhqdq m5, m5
punpckhqdq m4, m4
+%else
+ mova m6, [r0]
+ mova m4, [r0+8]
+ pshufw m7, m6, 0
+ pshufw m6, m6, q2222
+ pshufw m5, m4, 0
+ pshufw m4, m4, q2222
%endif
-%if ARCH_X86_64 && mmsize == 16
- movd m8, r6m
- SPLATW m8, m8
+ jnz .nz
ADS_START
+%if ARCH_X86_64 && mmsize == 16
movu m10, [r1]
movu m11, [r1+r2]
+ SPLATW m8, r6m
.loop:
psubw m0, m10, m7
movu m10, [r1+16]
@@ -5360,7 +5475,9 @@ cglobal pixel_ads4, 5,7,12
paddusw m0, m9
psubusw m1, m8, m0
%else
- ADS_START
+%if ARCH_X86_64 && mmsize != 8
+ SPLATW m8, r6m
+%endif
.loop:
movu m0, [r1]
movu m1, [r1+16]
@@ -5378,81 +5495,196 @@ cglobal pixel_ads4, 5,7,12
paddw m0, m2
paddw m0, m3
movu m2, [r3]
-%if mmsize==32
- vpbroadcastw m1, r6m
+%if ARCH_X86_64 && mmsize != 8
+ mova m1, m8
%else
- movd m1, r6m
- pshuflw m1, m1, 0
- punpcklqdq m1, m1
+ SPLATW m1, r6m
%endif
paddusw m0, m2
psubusw m1, m0
%endif ; ARCH
packsswb m1, m1
-%if mmsize==32
+%if mmsize == 32
vpermq m1, m1, q3120
mova [r6], xm1
%else
movh [r6], m1
%endif
- ADS_END mmsize/8
+ ADS_END mmsize/2
+.nz:
+ ADS_START
+%if ARCH_X86_64 && mmsize == 16
+ movu m10, [r1]
+ movu m11, [r1+r2]
+ SPLATD m8, r6m
+.loop_nz:
+ psubw m0, m10, m7
+ movu m10, [r1+16]
+ psubw m1, m10, m6
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ psubw m2, m11, m5
+ movu m11, [r1+r2+16]
+ paddw m0, m1
+ psubw m3, m11, m4
+ movu m9, [r3]
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ paddw m0, m2
+ paddw m0, m3
+ pxor m3, m3
+ mova m2, m0
+ mova m1, m9
+ punpcklwd m0, m3
+ punpcklwd m9, m3
+ punpckhwd m2, m3
+ punpckhwd m1, m3
+ paddd m0, m9
+ paddd m2, m1
+ psubd m1, m8, m0
+ psubd m3, m8, m2
+ packssdw m1, m3
+ packuswb m1, m1
+%else
+%if ARCH_X86_64 && mmsize != 8
+ SPLATD m8, r6m
+%endif
+.loop_nz:
+ movu m0, [r1]
+ movu m1, [r1+16]
+ psubw m0, m7
+ psubw m1, m6
+ ABSW m0, m0, m2
+ ABSW m1, m1, m3
+ movu m2, [r1+r2]
+ movu m3, [r1+r2+16]
+ psubw m2, m5
+ psubw m3, m4
+ paddw m0, m1
+ ABSW m2, m2, m1
+ ABSW m3, m3, m1
+ paddw m0, m2
+ paddw m0, m3
+%if mmsize == 32
+ movu m1, [r3]
+%else
+ movh m1, [r3]
+%endif
+ pxor m3, m3
+ mova m2, m0
+ punpcklwd m0, m3
+ punpcklwd m1, m3
+ punpckhwd m2, m3
+ paddd m0, m1
+%if mmsize == 32
+ movu m1, [r3]
+ punpckhwd m1, m3
+%else
+ movh m1, [r3+mmsize/2]
+ punpcklwd m1, m3
+%endif
+ paddd m2, m1
+%if ARCH_X86_64 && mmsize != 8
+ mova m1, m8
+%else
+ SPLATD m1, r6m
+%endif
+ mova m3, m1
+ psubd m1, m0
+ psubd m3, m2
+ packssdw m1, m3
+ packuswb m1, m1
+%endif ; ARCH
+%if mmsize == 32
+ vpermq m1, m1, q3120
+ mova [r6], xm1
+%else
+ movh [r6], m1
+%endif
+ ADS_END mmsize/2, .loop_nz
cglobal pixel_ads2, 5,7,8
-%if mmsize==32
+ test dword r6m, 0xffff0000
+%if mmsize >= 32
vpbroadcastw m7, [r0+0]
vpbroadcastw m6, [r0+4]
- vpbroadcastw m5, r6m
-%else
- movq m6, [r0]
- movd m5, r6m
- pshuflw m7, m6, 0
- pshuflw m6, m6, q2222
- pshuflw m5, m5, 0
+%elif mmsize == 16
+ movq m6, [r0]
+ pshuflw m7, m6, 0
+ pshuflw m6, m6, q2222
punpcklqdq m7, m7
punpcklqdq m6, m6
- punpcklqdq m5, m5
+%else
+ mova m6, [r0]
+ pshufw m7, m6, 0
+ pshufw m6, m6, q2222
%endif
+ jnz .nz
ADS_START
+ SPLATW m5, r6m
.loop:
movu m0, [r1]
movu m1, [r1+r2]
+ movu m2, [r3]
psubw m0, m7
psubw m1, m6
- movu m4, [r3]
- ABSW m0, m0, m2
- ABSW m1, m1, m3
+ ABSW m0, m0, m3
+ ABSW m1, m1, m4
paddw m0, m1
- paddusw m0, m4
+ paddusw m0, m2
psubusw m1, m5, m0
packsswb m1, m1
-%if mmsize==32
+%if mmsize == 32
vpermq m1, m1, q3120
mova [r6], xm1
%else
movh [r6], m1
%endif
- ADS_END mmsize/8
-
-cglobal pixel_ads1, 5,7,8
-%if mmsize==32
- vpbroadcastw m7, [r0]
- vpbroadcastw m6, r6m
+ ADS_END mmsize/2
+.nz:
+ ADS_START
+ SPLATD m5, r6m
+ pxor m4, m4
+.loop_nz:
+ movu m0, [r1]
+ movu m1, [r1+r2]
+ movu m2, [r3]
+ psubw m0, m7
+ psubw m1, m6
+ ABSW m0, m0, m3
+ ABSW m1, m1, m3
+ paddw m0, m1
+ punpckhwd m3, m2, m4
+ punpckhwd m1, m0, m4
+ punpcklwd m2, m4
+ punpcklwd m0, m4
+ paddd m1, m3
+ paddd m0, m2
+ psubd m3, m5, m1
+ psubd m2, m5, m0
+ packssdw m2, m3
+ packuswb m2, m2
+%if mmsize == 32
+ vpermq m2, m2, q3120
+ mova [r6], xm2
%else
- movd m7, [r0]
- movd m6, r6m
- pshuflw m7, m7, 0
- pshuflw m6, m6, 0
- punpcklqdq m7, m7
- punpcklqdq m6, m6
+ movh [r6], m2
%endif
+ ADS_END mmsize/2, .loop_nz
+
+cglobal pixel_ads1, 5,7,8
+ test dword r6m, 0xffff0000
+ SPLATW m7, [r0]
+ jnz .nz
ADS_START
+ SPLATW m6, r6m
.loop:
movu m0, [r1]
movu m1, [r1+mmsize]
- psubw m0, m7
- psubw m1, m7
movu m2, [r3]
movu m3, [r3+mmsize]
+ psubw m0, m7
+ psubw m1, m7
ABSW m0, m0, m4
ABSW m1, m1, m5
paddusw m0, m2
@@ -5460,13 +5692,52 @@ cglobal pixel_ads1, 5,7,8
psubusw m4, m6, m0
psubusw m5, m6, m1
packsswb m4, m5
-%if mmsize==32
+%if mmsize == 32
vpermq m4, m4, q3120
%endif
mova [r6], m4
- ADS_END mmsize/4
+ ADS_END mmsize
+.nz:
+ ADS_START
+ SPLATD m6, r6m
+ pxor m5, m5
+.loop_nz:
+ movu m0, [r1]
+ movu m1, [r1+mmsize]
+ movu m2, [r3]
+ psubw m0, m7
+ psubw m1, m7
+ ABSW m0, m0, m3
+ ABSW m1, m1, m4
+ punpckhwd m3, m2, m5
+ punpckhwd m4, m0, m5
+ punpcklwd m2, m5
+ punpcklwd m0, m5
+ paddd m4, m3
+ paddd m0, m2
+ psubd m3, m6, m4
+ movu m4, [r3+mmsize]
+ psubd m2, m6, m0
+ packssdw m2, m3
+ punpckhwd m0, m1, m5
+ punpckhwd m3, m4, m5
+ punpcklwd m1, m5
+ punpcklwd m4, m5
+ paddd m0, m3
+ paddd m1, m4
+ psubd m3, m6, m0
+ psubd m4, m6, m1
+ packssdw m4, m3
+ packuswb m2, m4
+%if mmsize == 32
+ vpermq m2, m2, q3120
+%endif
+ mova [r6], m2
+ ADS_END mmsize, .loop_nz
%endmacro
+INIT_MMX mmx2
+ADS_XMM
INIT_XMM sse2
ADS_XMM
INIT_XMM ssse3
@@ -5476,6 +5747,8 @@ ADS_XMM
INIT_YMM avx2
ADS_XMM
+%endif ; HIGH_BIT_DEPTH
+
; int pixel_ads_mvs( int16_t *mvs, uint8_t *masks, int width )
; {
; int nmv=0, i, j;
@@ -5521,7 +5794,7 @@ ALIGN 16
test r2, r2
%else
mov r3, r2
- add r3d, [r6+r1+4]
+ or r3d, [r6+r1+4]
%endif
jz .loopi0
xor r3d, r3d
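
[Editor's note: illustrative sketch, not part of the patch.] The rewritten ADS kernels above are easier to follow against a scalar model. The sketch below is a simplified C model only, not the actual x264 reference in common/pixel.c; the sums[] offsets (i+8, i+delta, i+delta+8) are assumed purely for illustration. It shows the bug the first commit addresses: with 16-bit saturating adds the running total can clamp at 0xffff before the threshold compare, so the patched asm tests "dword r6m, 0xffff0000" and takes the 32-bit (.loop_nz) path whenever thresh needs more than 16 bits.

#include <stdint.h>
#include <stdlib.h>

/* Simplified ADS model (layout assumed for illustration): accumulate
 * |dc - sums| terms plus the MV cost and keep candidate positions whose
 * total stays strictly below thresh. */
static int ads_model( const int dc[4], const uint16_t *sums, int delta,
                      const uint16_t *cost_mvx, int16_t *mvs,
                      int width, int thresh )
{
    int nmv = 0;
    for( int i = 0; i < width; i++ )
    {
        int ads = abs( dc[0] - sums[i] )
                + abs( dc[1] - sums[i+8] )
                + abs( dc[2] - sums[i+delta] )
                + abs( dc[3] - sums[i+delta+8] )
                + cost_mvx[i];
        if( ads < thresh )
            mvs[nmv++] = i;
    }
    return nmv;
}
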
=====================================
common/x86/x86inc.asm
=====================================
@@ -664,6 +664,8 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
%if WIN64 == 0
%macro WIN64_SPILL_XMM 1
%endmacro
+ %macro WIN64_RESTORE_XMM_INTERNAL 0
+ %endmacro
%macro WIN64_RESTORE_XMM 0
%endmacro
%macro WIN64_PUSH_XMM 0
=====================================
common/x86/x86util.asm
=====================================
@@ -286,18 +286,51 @@
%if cpuflag(avx2) && %3 == 0
vpbroadcastw %1, %2
%else
- PSHUFLW %1, %2, (%3)*q1111
-%if mmsize == 16
+ %define %%s %2
+%ifid %2
+ %define %%s xmm%2
+%elif %3 == 0
+ movd xmm%1, %2
+ %define %%s xmm%1
+%endif
+ PSHUFLW xmm%1, %%s, (%3)*q1111
+%if mmsize >= 32
+ vpbroadcastq %1, xmm%1
+%elif mmsize == 16
punpcklqdq %1, %1
%endif
%endif
%endmacro
%imacro SPLATD 2-3 0
-%if mmsize == 16
- pshufd %1, %2, (%3)*q1111
+%if cpuflag(avx2) && %3 == 0
+ vpbroadcastd %1, %2
+%else
+ %define %%s %2
+%ifid %2
+ %define %%s xmm%2
+%elif %3 == 0
+ movd xmm%1, %2
+ %define %%s xmm%1
+%endif
+%if mmsize == 8 && %3 == 0
+%ifidn %1, %%s
+ punpckldq %1, %1
%else
- pshufw %1, %2, (%3)*q0101 + ((%3)+1)*q1010
+ pshufw %1, %%s, q1010
+%endif
+%elif mmsize == 8 && %3 == 1
+%ifidn %1, %%s
+ punpckhdq %1, %1
+%else
+ pshufw %1, %%s, q3232
+%endif
+%else
+ pshufd xmm%1, %%s, (%3)*q1111
+%endif
+%if mmsize >= 32
+ vpbroadcastq %1, xmm%1
+%endif
%endif
%endmacro
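
[Editor's note: illustrative sketch, not part of the patch.] The reworked SPLATW/SPLATD macros broadcast a word/dword from either a register or a memory operand into the full XMM/YMM register. A rough C intrinsics analogue of the dword case is sketched below; the function names are invented for illustration and the AVX2 function requires compiling with AVX2 enabled.

#include <immintrin.h>

/* SSE2-style path: movd the scalar into the low lane, then pshufd with an
 * all-zero shuffle immediate to replicate it across the four dword lanes. */
static __m128i splatd_sse2( const int *p )
{
    __m128i v = _mm_cvtsi32_si128( *p );   /* movd   xmm, [p]        */
    return _mm_shuffle_epi32( v, 0x00 );   /* pshufd xmm, xmm, 0     */
}

/* AVX2-style path: a single broadcast fills all eight dword lanes, which is
 * what the cpuflag(avx2) fast path of the macro relies on. */
static __m256i splatd_avx2( const int *p )
{
    return _mm256_set1_epi32( *p );        /* vpbroadcastd ymm, [p]  */
}
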
=====================================
encoder/cabac.c
=====================================
@@ -754,7 +754,7 @@ void x264_cabac_block_residual_c( x264_t *h, x264_cabac_t *cb, int ctx_block_cat
static ALWAYS_INLINE void cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
-#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
+#if ARCH_X86_64 && HAVE_MMX
h->bsf.cabac_block_residual_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_c( h, cb, ctx_block_cat, l );
@@ -868,7 +868,7 @@ void x264_cabac_block_residual_rd_c( x264_t *h, x264_cabac_t *cb, int ctx_block_
static ALWAYS_INLINE void cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
-#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
+#if ARCH_X86_64 && HAVE_MMX
h->bsf.cabac_block_residual_8x8_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_8x8_rd_c( h, cb, ctx_block_cat, l );
@@ -876,7 +876,7 @@ static ALWAYS_INLINE void cabac_block_residual_8x8( x264_t *h, x264_cabac_t *cb,
}
static ALWAYS_INLINE void cabac_block_residual( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
-#if ARCH_X86_64 && HAVE_MMX && !defined( __MACH__ )
+#if ARCH_X86_64 && HAVE_MMX
h->bsf.cabac_block_residual_rd_internal( l, MB_INTERLACED, ctx_block_cat, cb );
#else
x264_cabac_block_residual_rd_c( h, cb, ctx_block_cat, l );
=====================================
encoder/rdo.c
=====================================
@@ -704,7 +704,7 @@ int quant_trellis_cabac( x264_t *h, dctcoef *dct,
return !!dct[0];
}
-#if HAVE_MMX && ARCH_X86_64 && !defined( __MACH__ )
+#if HAVE_MMX && ARCH_X86_64
uint64_t level_state0;
memcpy( &level_state0, cabac_state, sizeof(uint64_t) );
uint16_t level_state1;
=====================================
tools/checkasm.c
=====================================
@@ -60,6 +60,9 @@ static int quiet = 0;
#define MAX_FUNCS 1000 // just has to be big enough to hold all the existing functions
#define MAX_CPUS 30 // number of different combinations of cpu flags
+// RAND_MAX is guaranteed to be at least 32767, to get 30 bits of random data, we'll call rand() twice
+#define rand30() (((rand() & 0x7fff) << 15) + (rand() & 0x7fff))
+
typedef struct
{
void *pointer; // just for detecting duplicates
@@ -799,7 +802,7 @@ static int check_pixel( uint32_t cpu_ref, uint32_t cpu_new )
ok = 1; used_asm = 0;
for( int i = 0; i < 32; i++ )
- cost_mv[i] = i*10;
+ cost_mv[i] = rand30() & 0xffff;
for( int i = 0; i < 100 && ok; i++ )
if( pixel_asm.ads[i&3] != pixel_ref.ads[i&3] )
{
@@ -808,18 +811,36 @@ static int check_pixel( uint32_t cpu_ref, uint32_t cpu_new )
ALIGNED_16( int16_t mvs_a[48] );
ALIGNED_16( int16_t mvs_c[48] );
int mvn_a, mvn_c;
- int thresh = rand() & 0x3fff;
- set_func_name( "esa_ads" );
- for( int j = 0; j < 72; j++ )
- sums[j] = rand() & 0x3fff;
- for( int j = 0; j < 4; j++ )
- dc[j] = rand() & 0x3fff;
+ int thresh = (rand() % 257) * PIXEL_MAX + (rand30() & 0xffff);
+ set_func_name( "esa_ads_%s", pixel_names[i&3] );
+ if( i < 40 )
+ {
+ for( int j = 0; j < 72; j++ )
+ sums[j] = (rand() % 9) * 8 * PIXEL_MAX;
+ for( int j = 0; j < 4; j++ )
+ dc[j] = (rand() % 9) * 8 * PIXEL_MAX;
+ }
+ else
+ {
+#if BIT_DEPTH + 6 > 15
+ for( int j = 0; j < 72; j++ )
+ sums[j] = rand30() & ((1 << (BIT_DEPTH + 6))-1);
+ for( int j = 0; j < 4; j++ )
+ dc[j] = rand30() & ((1 << (BIT_DEPTH + 6))-1);
+#else
+ for( int j = 0; j < 72; j++ )
+ sums[j] = rand() & ((1 << (BIT_DEPTH + 6))-1);
+ for( int j = 0; j < 4; j++ )
+ dc[j] = rand() & ((1 << (BIT_DEPTH + 6))-1);
+#endif
+ }
used_asm = 1;
mvn_c = call_c( pixel_c.ads[i&3], dc, sums, 32, cost_mv, mvs_c, 28, thresh );
mvn_a = call_a( pixel_asm.ads[i&3], dc, sums, 32, cost_mv, mvs_a, 28, thresh );
if( mvn_c != mvn_a || memcmp( mvs_c, mvs_a, mvn_c*sizeof(*mvs_c) ) )
{
ok = 0;
+ printf( "thresh: %d\n", thresh );
printf( "c%d: ", i&3 );
for( int j = 0; j < mvn_c; j++ )
printf( "%d ", mvs_c[j] );
@@ -1721,7 +1742,7 @@ static int check_mc( uint32_t cpu_ref, uint32_t cpu_new )
x264_emms();
for( int i = 0; i < 10; i++ )
{
- float fps_factor = (rand()&65535) / 65535.0f;
+ float fps_factor = (rand30()&65535) / 65535.0f;
set_func_name( "mbtree_propagate_cost" );
int16_t *dsta = (int16_t*)buf3;
int16_t *dstc = dsta+400;
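
[Editor's note: illustrative sketch, not part of the patch.] The new rand30() macro works around rand() only guaranteeing 15 random bits (RAND_MAX >= 32767), so the test vectors can cover full 16-bit and wider ranges. A standalone C sketch of the same idea:

#include <stdio.h>
#include <stdlib.h>

/* Glue two 15-bit rand() results into one 30-bit value, mirroring the
 * rand30() macro added to tools/checkasm.c. */
static unsigned rand30_func( void )
{
    return ((unsigned)( rand() & 0x7fff ) << 15) + ( rand() & 0x7fff );
}

int main( void )
{
    srand( 0 );
    /* e.g. draw full 16-bit MV costs, as the updated ADS test now does */
    for( int i = 0; i < 4; i++ )
        printf( "%u\n", rand30_func() & 0xffff );
    return 0;
}
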
View it on GitLab: https://code.videolan.org/videolan/x264/-/compare/37329c4f103327b6d306c8148c79d9658419231b...be3c1492db255a3b3db7196eae0abe804d094985