[x264-devel] High bit depth intra_sad_x3_4x4
Cristian Militaru
git at videolan.org
Mon Jan 16 02:11:58 CET 2012
x264 | branch: master | Cristian Militaru <cristipiticul at yahoo.com> | Wed Jan 4 12:38:08 2012 -0800| [c4b54c83629bb92af6c4836a8859e9432dc7333a] | committer: Jason Garrett-Glaser
High bit depth intra_sad_x3_4x4
From Google Code-In.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=c4b54c83629bb92af6c4836a8859e9432dc7333a
---
common/pixel.c | 4 ++-
common/x86/pixel.h | 3 ++
common/x86/sad16-a.asm | 65 +++++++++++++++++++++++++++++++++++++++++++++++-
3 files changed, 70 insertions(+), 2 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index 350057e..ca10c7d 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -552,7 +552,6 @@ INTRA_MBCMP( sad, 8x8, dc, h, v, c, _mmx2, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _mmx2, _mmx2 )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _sse2, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _sse2, _sse2 )
-INTRA_MBCMP( sad, 4x4, v, h, dc, , _ssse3, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c, _ssse3, _sse2 )
INTRA_MBCMP( sad, 16x16, v, h, dc, , _ssse3, _sse2 )
#else
@@ -860,6 +859,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse2;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
@@ -908,6 +908,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT4( hadamard_ac, _ssse3 );
}
pixf->vsad = x264_pixel_vsad_ssse3;
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
@@ -931,6 +932,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
INIT4( hadamard_ac, _avx );
}
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_avx;
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 863c312..497901b 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -98,6 +98,9 @@ DECL_PIXELS( uint64_t, hadamard_ac, xop, ( pixel *pix, int i_stride ))
void x264_intra_satd_x3_4x4_mmx2 ( pixel *, pixel *, int * );
void x264_intra_sad_x3_4x4_mmx2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_4x4_sse2 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_4x4_ssse3 ( pixel *, pixel *, int * );
+void x264_intra_sad_x3_4x4_avx ( pixel *, pixel *, int * );
void x264_intra_satd_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmx2 ( pixel *, pixel *, int * );
diff --git a/common/x86/sad16-a.asm b/common/x86/sad16-a.asm
index 606ea6f..ee35bbb 100644
--- a/common/x86/sad16-a.asm
+++ b/common/x86/sad16-a.asm
@@ -29,6 +29,7 @@
SECTION .text
cextern pw_1
+cextern pw_4
cextern pw_8
;=============================================================================
@@ -472,7 +473,69 @@ SAD_X 4, 8, 8
SAD_X 4, 8, 4
;-----------------------------------------------------------------------------
-; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3]);
+; void intra_sad_x3_4x4( uint16_t *fenc, uint16_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+
+%macro INTRA_SAD_X3_4x4 0 ; emits intra_sad_x3_4x4 for the active INIT_XMM target; stores SAD costs res[0]=V, res[1]=H, res[2]=DC
+cglobal intra_sad_x3_4x4, 3,3,7 ; r0 = fenc, r1 = fdec, r2 = res (per the prototype above); body uses m0-m6. NOTE(review): 3,3,7 presumably = args, GPRs, XMM regs - confirm in x86inc
+ movq m0, [r1-1*FDEC_STRIDEB] ; low qword = the 4 pixels in the row above fdec (A B C D)
+ movq m1, [r0+0*FENC_STRIDEB] ; fenc row 0
+ movq m2, [r0+2*FENC_STRIDEB] ; fenc row 2
+ pshuflw m6, m0, q1032 ; top row with its two word pairs swapped (C D A B)
+ paddw m6, m0 ; pairwise sums A+C, B+D
+ pshuflw m5, m6, q2301 ; swap adjacent words of the partial sums
+ paddw m6, m5 ; every low word now holds A+B+C+D
+ punpcklqdq m6, m6 ; A+B+C+D in all 8 words
+ punpcklqdq m0, m0 ; top row duplicated into both halves, to line up with two fenc rows per register
+ movhps m1, [r0+1*FENC_STRIDEB] ; m1 = fenc rows 0 (low) | 1 (high)
+ movhps m2, [r0+3*FENC_STRIDEB] ; m2 = fenc rows 2 (low) | 3 (high)
+ psubw m3, m1, m0 ; rows 0|1 - top row
+ psubw m0, m2 ; top row - rows 2|3 (sign is irrelevant, absolute value is taken next)
+ ABSW m3, m3, m5 ; per-word absolute value; third operand looks like a scratch register - confirm against the ABSW macro
+ ABSW m0, m0, m5
+ paddw m0, m3 ; per-column absolute differences of all four rows
+ HADDW m0, m5 ; presumably a horizontal word sum using m5 as scratch - HADDW is defined elsewhere
+ movd [r2], m0 ; res[0] = V prediction cost
+ movd m3, [r1+0*FDEC_STRIDEB-4] ; 4 bytes ending just left of fdec row 0: word 1 = left neighbour E
+ movhps m3, [r1+1*FDEC_STRIDEB-8] ; 8 bytes ending just left of row 1: top word of the high half = left neighbour F
+ movd m4, [r1+2*FDEC_STRIDEB-4] ; same for row 2: word 1 = left neighbour G
+ movhps m4, [r1+3*FDEC_STRIDEB-8] ; same for row 3: top word of the high half = left neighbour H
+ pshufhw m3, m3, q3333 ; broadcast F across the high half
+ pshufhw m4, m4, q3333 ; broadcast H across the high half
+ pshuflw m3, m3, q1111 ; broadcast E across the low half -> FF FF EE EE
+ pshuflw m4, m4, q1111 ; broadcast G across the low half -> HH HH GG GG
+ paddw m5, m3, m4 ; E+G (low half) | F+H (high half)
+ pshufd m0, m5, q1032 ; same sums with the halves swapped: F+H | E+G
+ paddw m5, m6 ; add the top-row sum A+B+C+D
+ paddw m5, m0 ; every word = A+B+C+D+E+F+G+H
+ paddw m5, [pw_4] ; rounding term before the shift (pw_4 is presumably a vector of word 4s - defined elsewhere)
+ psrlw m5, 3 ; divide by 8 -> DC predictor in every word
+ psubw m6, m5, m2 ; DC - rows 2|3
+ psubw m5, m1 ; DC - rows 0|1
+ psubw m1, m3 ; rows 0|1 - left neighbours E|F
+ psubw m2, m4 ; rows 2|3 - left neighbours G|H
+ ABSW m5, m5, m0 ; absolute values of the DC differences (m0 is free again after the V cost was stored)
+ ABSW m6, m6, m0
+ ABSW m1, m1, m0 ; absolute values of the H differences
+ ABSW m2, m2, m0
+ paddw m5, m6 ; DC: combine both row pairs
+ paddw m1, m2 ; H: combine both row pairs
+ HADDW m5, m0
+ HADDW m1, m2
+ movd [r2+8], m5 ; res[2] = DC prediction cost
+ movd [r2+4], m1 ; res[1] = H prediction cost
+ RET
+%endmacro
+
+INIT_XMM sse2 ; NOTE(review): presumably selects the instruction set / name suffix for the next expansion - confirm in x86inc
+INTRA_SAD_X3_4x4 ; sse2 variant (pixel.h declares x264_intra_sad_x3_4x4_sse2)
+INIT_XMM ssse3
+INTRA_SAD_X3_4x4 ; ssse3 variant (pixel.h declares x264_intra_sad_x3_4x4_ssse3)
+INIT_XMM avx
+INTRA_SAD_X3_4x4 ; avx variant (pixel.h declares x264_intra_sad_x3_4x4_avx)
+
+;-----------------------------------------------------------------------------
+; void intra_sad_x3_8x8( pixel *fenc, pixel edge[36], int res[3] );
;-----------------------------------------------------------------------------
;m0 = DC
More information about the x264-devel
mailing list