[x264-devel] Enable some existing asm functions that were missing function pointers

Wed Aug 10 05:15:14 CEST 2011

x264 | branch: master | Loren Merritt <pengvado at akuvian.org> | Wed Aug  3 14:58:50 2011 +0000| [a718aad0045b2930d871fd7b6bf33fc0192d526f] | committer: Jason Garrett-Glaser

Enable some existing asm functions that were missing function pointers
pixel_ads1_avx, predict_8x8_hd_avx
High bit depth mc_copy_w8_sse2, denoise_dct_avx, prefetch_fenc/ref, and several pixel*sse4.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=a718aad0045b2930d871fd7b6bf33fc0192d526f
---

 common/pixel.c         |   33 +++++++++++++++++++++++++++++----
 common/quant.c         |    4 ++++
 common/x86/mc-a.asm    |   23 ++++++++++++++---------
 common/x86/mc-c.c      |   13 +++++++++----
 common/x86/predict-c.c |    1 +
 5 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/common/pixel.c b/common/pixel.c
index 40395b2..57b5d58 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -546,6 +546,7 @@ INTRA_MBCMP( sad, 16,  v, h, dc,  , _mmx2 )
 INTRA_MBCMP(satd, 16,  v, h, dc,  , _mmx2 )
 INTRA_MBCMP( sad,  8, dc, h,  v, c, _sse2 )
 INTRA_MBCMP( sad, 16,  v, h, dc,  , _sse2 )
+INTRA_MBCMP( sad,  4,  v, h, dc,  , _ssse3 )
 INTRA_MBCMP( sad,  8, dc, h,  v, c, _ssse3 )
 INTRA_MBCMP( sad, 16,  v, h, dc,  , _ssse3 )
 #endif
@@ -873,10 +874,35 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
 
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
         pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_ssse3;
+        pixf->intra_sad_x3_4x4    = x264_intra_sad_x3_4x4_ssse3;
         pixf->intra_sad_x3_8x8    = x264_intra_sad_x3_8x8_ssse3;
         pixf->intra_sad_x3_8x8c   = x264_intra_sad_x3_8x8c_ssse3;
         pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16_ssse3;
     }
+    if( cpu&X264_CPU_SSE4 )
+    {
+        if( !(cpu&X264_CPU_STACK_MOD4) )
+        {
+            INIT4( hadamard_ac, _sse4 );
+        }
+        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
+        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_sse4;
+    }
+    if( cpu&X264_CPU_AVX )
+    {
+        INIT_ADS( _avx );
+        if( !(cpu&X264_CPU_STACK_MOD4) )
+        {
+            INIT4( hadamard_ac, _avx );
+        }
+        pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
+        pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_avx;
+        pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
+        pixf->var[PIXEL_8x8]   = x264_pixel_var_8x8_avx;
+        pixf->ssd_nv12_core    = x264_pixel_ssd_nv12_core_avx;
+        pixf->ssim_4x4x2_core  = x264_pixel_ssim_4x4x2_core_avx;
+        pixf->ssim_end4        = x264_pixel_ssim_end4_avx;
+    }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
@@ -1038,6 +1064,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
 #endif
         pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
+        if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+            pixf->intra_sad_x3_8x8  = x264_intra_sad_x3_8x8_ssse3;
         if( cpu&X264_CPU_CACHELINE_64 )
         {
             INIT2( sad, _cache64_ssse3 );
@@ -1062,8 +1090,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
         pixf->sa8d[PIXEL_8x8]  = x264_pixel_sa8d_8x8_sse4;
         pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse4;
-        /* Slower on Conroe, so only enable under SSE4 */
-        pixf->intra_sad_x3_8x8  = x264_intra_sad_x3_8x8_ssse3;
     }
 
     if( cpu&X264_CPU_AVX )
@@ -1071,8 +1097,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         INIT7( satd, _avx );
         INIT7( satd_x3, _avx );
         INIT7( satd_x4, _avx );
-        pixf->ads[PIXEL_16x16] = x264_pixel_ads4_avx;
-        pixf->ads[PIXEL_16x8]  = x264_pixel_ads2_avx;
+        INIT_ADS( _avx );
         if( !(cpu&X264_CPU_STACK_MOD4) )
         {
             INIT4( hadamard_ac, _avx );
diff --git a/common/quant.c b/common/quant.c
index b0537ce..40ac1f1 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -428,6 +428,10 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->quant_4x4 = x264_quant_4x4_sse4;
         pf->quant_8x8 = x264_quant_8x8_sse4;
     }
+    if( cpu&X264_CPU_AVX )
+    {
+        pf->denoise_dct = x264_denoise_dct_avx;
+    }
 #endif // HAVE_MMX
 #else // !HIGH_BIT_DEPTH
 #if HAVE_MMX
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 403059b..93c5687 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -1291,19 +1291,21 @@ MC_COPY 16
 ;=============================================================================
 ; prefetch
 ;=============================================================================
-; FIXME assumes 64 byte cachelines
+; assumes 64 byte cachelines
+; FIXME doesn't cover all pixels in high depth and/or 4:4:4
 
 ;-----------------------------------------------------------------------------
-; void prefetch_fenc( uint8_t *pix_y, int stride_y,
-;                     uint8_t *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( pixel *pix_y, int stride_y,
+;                     pixel *pix_uv, int stride_uv, int mb_x )
 ;-----------------------------------------------------------------------------
 INIT_MMX
 %ifdef ARCH_X86_64
 cglobal prefetch_fenc_mmx2, 5,5
+    FIX_STRIDES r1d, r3d
     and    r4d, 3
     mov    eax, r4d
     imul   r4d, r1d
-    lea    r0,  [r0+r4*4+64]
+    lea    r0,  [r0+r4*4+64*SIZEOF_PIXEL]
     prefetcht0  [r0]
     prefetcht0  [r0+r1]
     lea    r0,  [r0+r1*2]
@@ -1311,7 +1313,7 @@ cglobal prefetch_fenc_mmx2, 5,5
     prefetcht0  [r0+r1]
 
     imul   eax, r3d
-    lea    r2,  [r2+rax*2+64]
+    lea    r2,  [r2+rax*2+64*SIZEOF_PIXEL]
     prefetcht0  [r2]
     prefetcht0  [r2+r3]
     RET
@@ -1321,9 +1323,10 @@ cglobal prefetch_fenc_mmx2, 0,3
     mov    r2, r4m
     mov    r1, r1m
     mov    r0, r0m
+    FIX_STRIDES r1
     and    r2, 3
     imul   r2, r1
-    lea    r0, [r0+r2*4+64]
+    lea    r0, [r0+r2*4+64*SIZEOF_PIXEL]
     prefetcht0 [r0]
     prefetcht0 [r0+r1]
     lea    r0, [r0+r1*2]
@@ -1333,21 +1336,23 @@ cglobal prefetch_fenc_mmx2, 0,3
     mov    r2, r4m
     mov    r1, r3m
     mov    r0, r2m
+    FIX_STRIDES r1
     and    r2, 3
     imul   r2, r1
-    lea    r0, [r0+r2*2+64]
+    lea    r0, [r0+r2*2+64*SIZEOF_PIXEL]
     prefetcht0 [r0]
     prefetcht0 [r0+r1]
     ret
 %endif ; ARCH_X86_64
 
 ;-----------------------------------------------------------------------------
-; void prefetch_ref( uint8_t *pix, int stride, int parity )
+; void prefetch_ref( pixel *pix, int stride, int parity )
 ;-----------------------------------------------------------------------------
 cglobal prefetch_ref_mmx2, 3,3
+    FIX_STRIDES r1d
     dec    r2d
     and    r2d, r1d
-    lea    r0,  [r0+r2*8+64]
+    lea    r0,  [r0+r2*8+64*SIZEOF_PIXEL]
     lea    r2,  [r1*3]
     prefetcht0  [r0]
     prefetcht0  [r0+r1]
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 81dd1ca..2c028e7 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -85,8 +85,8 @@ void x264_mc_copy_w8_sse2( pixel *, int, pixel *, int, int );
 void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int );
 void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
 void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
-void x264_prefetch_fenc_mmx2( uint8_t *, int, uint8_t *, int, int );
-void x264_prefetch_ref_mmx2( uint8_t *, int, int );
+void x264_prefetch_fenc_mmx2( pixel *, int, pixel *, int, int );
+void x264_prefetch_ref_mmx2( pixel *, int, int );
 void x264_plane_copy_core_mmx2( pixel *, int, pixel *, int, int w, int h);
 void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h );
 void x264_plane_copy_interleave_core_mmx2( pixel *dst, int i_dst,
@@ -225,7 +225,11 @@ static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, int, pixel *, int,
 };
 
 MC_COPY_WTAB(mmx,mmx,mmx,mmx)
+#if HIGH_BIT_DEPTH
+MC_COPY_WTAB(sse2,mmx,sse2,sse2)
+#else
 MC_COPY_WTAB(sse2,mmx,mmx,sse2)
+#endif
 
 #define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
     static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, int, pixel *, int, const x264_weight_t *, int ) =\
@@ -510,6 +514,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( !(cpu&X264_CPU_MMX2) )
         return;
 
+    pf->prefetch_fenc = x264_prefetch_fenc_mmx2;
+    pf->prefetch_ref  = x264_prefetch_ref_mmx2;
+
     pf->plane_copy = x264_plane_copy_mmx2;
     pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
     pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmx2;
@@ -605,8 +612,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( !(cpu&X264_CPU_STACK_MOD4) )
         pf->mc_chroma = x264_mc_chroma_avx;
 #else // !HIGH_BIT_DEPTH
-    pf->prefetch_fenc = x264_prefetch_fenc_mmx2;
-    pf->prefetch_ref  = x264_prefetch_ref_mmx2;
 
 #if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
     if( cpu&X264_CPU_CACHELINE_32 )
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 1db0925..96d0e23 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -566,6 +566,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
     pf[I_PRED_8x8_DDR]  = x264_predict_8x8_ddr_avx;
     pf[I_PRED_8x8_VL]   = x264_predict_8x8_vl_avx;
     pf[I_PRED_8x8_VR]   = x264_predict_8x8_vr_avx;
+    pf[I_PRED_8x8_HD]   = x264_predict_8x8_hd_avx;
 #endif // HIGH_BIT_DEPTH
 }