[x264-devel] x86: XOP pixel_ssd_nv12_core
James Almer
git at videolan.org
Wed Apr 23 00:41:00 CEST 2014
x264 | branch: master | James Almer <jamrial at gmail.com> | Wed Apr 9 03:33:05 2014 -0300| [be0469715a683c2556d57f3105bd38dd70caa1bf] | committer: Jason Garrett-Glaser
x86: XOP pixel_ssd_nv12_core
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=be0469715a683c2556d57f3105bd38dd70caa1bf
---
common/pixel.c | 2 ++
common/x86/pixel-a.asm | 14 +++++++++++++-
common/x86/pixel.h | 3 +++
3 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/common/pixel.c b/common/pixel.c
index 735c835..63e3f65 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1021,6 +1021,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
if( cpu&X264_CPU_XOP )
{
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
pixf->vsad = x264_pixel_vsad_xop;
@@ -1313,6 +1314,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_xop;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_xop;
pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_xop;
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_xop;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_xop;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_xop;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_xop;
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 6b2ab2e..b5f1ba8 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -561,10 +561,15 @@ cglobal pixel_ssd_nv12_core, 6,7,7
pshufhw m0, m0, q3120
pshufhw m1, m1, q3120
%endif
+%if cpuflag(xop)
+ pmadcswd m2, m0, m0, m2
+ pmadcswd m3, m1, m1, m3
+%else
pmaddwd m0, m0
pmaddwd m1, m1
paddd m2, m0
paddd m3, m1
+%endif
add r6, 2*mmsize
jl .loopx
%if mmsize == 32 ; avx2 may overread by 32 bytes, that has to be handled
@@ -657,10 +662,15 @@ cglobal pixel_ssd_nv12_core, 6,7
por m0, m1
psrlw m2, m0, 8
pand m0, m5
+%if cpuflag(xop)
+ pmadcswd m4, m2, m2, m4
+ pmadcswd m3, m0, m0, m3
+%else
pmaddwd m2, m2
pmaddwd m0, m0
- paddd m3, m0
paddd m4, m2
+ paddd m3, m0
+%endif
add r6, mmsize
jl .loopx
%if mmsize == 32 ; avx2 may overread by 16 bytes, that has to be handled
@@ -695,6 +705,8 @@ INIT_XMM sse2
SSD_NV12
INIT_XMM avx
SSD_NV12
+INIT_XMM xop
+SSD_NV12
INIT_YMM avx2
SSD_NV12
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index aeea3e2..4568da6 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -153,6 +153,9 @@ void x264_pixel_ssd_nv12_core_sse2( pixel *pixuv1, intptr_t stride1,
void x264_pixel_ssd_nv12_core_avx ( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
+void x264_pixel_ssd_nv12_core_xop ( pixel *pixuv1, intptr_t stride1,
+ pixel *pixuv2, intptr_t stride2, int width,
+ int height, uint64_t *ssd_u, uint64_t *ssd_v );
void x264_pixel_ssd_nv12_core_avx2( pixel *pixuv1, intptr_t stride1,
pixel *pixuv2, intptr_t stride2, int width,
int height, uint64_t *ssd_u, uint64_t *ssd_v );
More information about the x264-devel mailing list