[x264-devel] x86: Faster pixel_ssd_nv12
Henrik Gramner
git at videolan.org
Tue Sep 20 20:57:52 CEST 2016
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Sep 17 14:45:08 2016 +0200| [9521b278adb92081f052c1b7bfc4b95651d88b07] | committer: Anton Mitrofanov
x86: Faster pixel_ssd_nv12
Also drop the MMX2 version to simplify things.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9521b278adb92081f052c1b7bfc4b95651d88b07
---
common/pixel.c | 2 -
common/x86/pixel-a.asm | 109 ++++++++++++++++++++-----------------------------
common/x86/pixel.h | 3 --
3 files changed, 45 insertions(+), 69 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index 3963af7..4db4b46 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -885,7 +885,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT8( ssd, _mmx2 );
INIT_ADS( _mmx2 );
- pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
#if ARCH_X86
@@ -1071,7 +1070,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
- pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
#if ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmx2;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2;
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index cf60618..cff22ee 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -43,6 +43,9 @@ mask_ff: times 16 db 0xff
mask_ac4: times 2 dw 0, -1, -1, -1, 0, -1, -1, -1
mask_ac4b: times 2 dw 0, -1, 0, -1, -1, -1, -1, -1
mask_ac8: times 2 dw 0, -1, -1, -1, -1, -1, -1, -1
+%if HIGH_BIT_DEPTH
+ssd_nv12_shuf: db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15
+%endif
%if BIT_DEPTH == 10
ssim_c1: times 4 dd 6697.7856 ; .01*.01*1023*1023*64
ssim_c2: times 4 dd 3797644.4352 ; .03*.03*1023*1023*64*63
@@ -531,8 +534,8 @@ SSD 16, 8
;
; 2 * mmsize/32 * (2^32 - 1) / (2^BIT_DEPTH - 1)^2
;
-; For 10-bit MMX this means width >= 16416 and for XMM >= 32832. At sane
-; distortion levels it will take much more than that though.
+; For 10-bit XMM this means width >= 32832. At sane distortion levels
+; it will take much more than that though.
;-----------------------------------------------------------------------------
%if HIGH_BIT_DEPTH
%macro SSD_NV12 0
@@ -541,13 +544,14 @@ cglobal pixel_ssd_nv12_core, 6,7,7
FIX_STRIDES r1, r3
add r0, r4
add r2, r4
- xor r6, r6
+ neg r4
pxor m4, m4
pxor m5, m5
- pxor m6, m6
+%if mmsize == 32
+ vbroadcasti128 m6, [ssd_nv12_shuf]
+%endif
.loopy:
mov r6, r4
- neg r6
pxor m2, m2
pxor m3, m3
.loopx:
@@ -555,11 +559,11 @@ cglobal pixel_ssd_nv12_core, 6,7,7
mova m1, [r0+r6+mmsize]
psubw m0, [r2+r6]
psubw m1, [r2+r6+mmsize]
- PSHUFLW m0, m0, q3120
- PSHUFLW m1, m1, q3120
-%if mmsize >= 16
- pshufhw m0, m0, q3120
- pshufhw m1, m1, q3120
+%if mmsize == 32
+ pshufb m0, m6
+ pshufb m1, m6
+%else
+ SBUTTERFLY wd, 0, 1, 6
%endif
%if cpuflag(xop)
pmadcswd m2, m0, m0, m2
@@ -577,59 +581,30 @@ cglobal pixel_ssd_nv12_core, 6,7,7
psubd m3, m1
.no_overread:
%endif
-%if mmsize >= 16 ; using HADDD would remove the mmsize/32 part from the
- ; equation above, putting the width limit at 8208
- punpckhdq m0, m2, m6
- punpckhdq m1, m3, m6
- punpckldq m2, m6
- punpckldq m3, m6
- paddq m3, m2
- paddq m1, m0
- paddq m4, m3
- paddq m4, m1
-%else ; unfortunately paddq is sse2
- ; emulate 48 bit precision for mmx2 instead
- mova m0, m2
- mova m1, m3
- punpcklwd m2, m6
- punpcklwd m3, m6
- punpckhwd m0, m6
- punpckhwd m1, m6
- paddd m3, m2
- paddd m1, m0
- paddd m4, m3
- paddd m5, m1
-%endif
+ punpckhdq m0, m2, m5 ; using HADDD would remove the mmsize/32 part from the
+ punpckhdq m1, m3, m5 ; equation above, putting the width limit at 8208
+ punpckldq m2, m5
+ punpckldq m3, m5
+ paddq m0, m1
+ paddq m2, m3
+ paddq m4, m0
+ paddq m4, m2
add r0, r1
add r2, r3
dec r5d
jg .loopy
- mov r3, r6m
- mov r4, r7m
+ mov r0, r6m
+ mov r1, r7m
%if mmsize == 32
vextracti128 xm0, m4, 1
paddq xm4, xm0
%endif
-%if mmsize >= 16
- movq [r3], xm4
- movhps [r4], xm4
-%else ; fixup for mmx2
- SBUTTERFLY dq, 4, 5, 0
- mova m0, m4
- psrld m4, 16
- paddd m5, m4
- pslld m0, 16
- SBUTTERFLY dq, 0, 5, 4
- psrlq m0, 16
- psrlq m5, 16
- movq [r3], m0
- movq [r4], m5
-%endif
+ movq [r0], xm4
+ movhps [r1], xm4
RET
%endmacro ; SSD_NV12
-%endif ; HIGH_BIT_DEPTH
-%if HIGH_BIT_DEPTH == 0
+%else ; !HIGH_BIT_DEPTH
;-----------------------------------------------------------------------------
; void pixel_ssd_nv12_core( uint8_t *pixuv1, intptr_t stride1, uint8_t *pixuv2, intptr_t stride2,
; int width, int height, uint64_t *ssd_u, uint64_t *ssd_v )
@@ -643,12 +618,12 @@ cglobal pixel_ssd_nv12_core, 6,7
add r4d, r4d
add r0, r4
add r2, r4
+ neg r4
pxor m3, m3
pxor m4, m4
mova m5, [pw_00ff]
.loopy:
mov r6, r4
- neg r6
.loopx:
%if mmsize == 32 ; only 16-byte alignment is guaranteed
movu m2, [r0+r6]
@@ -686,21 +661,27 @@ cglobal pixel_ssd_nv12_core, 6,7
add r2, r3
dec r5d
jg .loopy
- mov r3, r6m
- mov r4, r7m
- HADDD m3, m0
- HADDD m4, m0
- pxor xm0, xm0
- punpckldq xm3, xm0
- punpckldq xm4, xm0
- movq [r3], xm3
- movq [r4], xm4
+ mov r0, r6m
+ mov r1, r7m
+%if cpuflag(ssse3)
+ phaddd m3, m4
+%else
+ SBUTTERFLY qdq, 3, 4, 0
+ paddd m3, m4
+%endif
+%if mmsize == 32
+ vextracti128 xm4, m3, 1
+ paddd xm3, xm4
+%endif
+ psllq xm4, xm3, 32
+ paddd xm3, xm4
+ psrlq xm3, 32
+ movq [r0], xm3
+ movhps [r1], xm3
RET
%endmacro ; SSD_NV12
%endif ; !HIGH_BIT_DEPTH
-INIT_MMX mmx2
-SSD_NV12
INIT_XMM sse2
SSD_NV12
INIT_XMM avx
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 7f57788..a8ed389 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -145,9 +145,6 @@ int x264_intra_sad_x9_8x8_sse4 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, u
int x264_intra_sad_x9_8x8_avx ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
int x264_intra_sad_x9_8x8_avx2 ( uint8_t *, uint8_t *, uint8_t *, uint16_t *, uint16_t * );
-void x264_pixel_ssd_nv12_core_mmx2( pixel *pixuv1, intptr_t stride1,
- pixel *pixuv2, intptr_t stride2, int width,
- int height, uint64_t *ssd_u, uint64_t *ssd_v );
void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
More information about the x264-devel mailing list