[x264-devel] commit: Add support for SSE4a (Phenom) LZCNT instruction (Jason Garrett-Glaser)
git version control
git at videolan.org
Wed Dec 31 14:28:01 CET 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Tue Dec 30 20:47:45 2008 -0500| [00cef64dd3fff5d4b5b9b0e63314c11bfb7d33e0] | committer: Jason Garrett-Glaser
Add support for SSE4a (Phenom) LZCNT instruction
Significantly speeds up coeff_last and coeff_level_run on Phenom CPUs for faster CAVLC and CABAC.
Also a small tweak to coeff_level_run asm.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=00cef64dd3fff5d4b5b9b0e63314c11bfb7d33e0
---
 common/cpu.c           |    2 +
 common/quant.c         |   13 ++++++++
 common/x86/quant-a.asm |   74 ++++++++++++++++++++++++++++++++++++-----------
 common/x86/quant.h     |    7 ++++
 tools/checkasm.c       |   14 ++++++++-
 x264.h                 |    1 +
 6 files changed, 92 insertions(+), 19 deletions(-)
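For context: bsr yields the index of the highest set bit, while lzcnt
yields the number of leading zeros, so for nonzero 32-bit x the two are
related by bsr(x) == lzcnt(x) ^ 31 (likewise ^ 63 for 64-bit operands).
That identity is what lets the patch below swap one instruction for the
other inside a macro. A minimal sketch of the equivalence, using GCC's
__builtin_clz as a stand-in for lzcnt semantics (illustrative, not x264
code):

    #include <assert.h>
    #include <stdint.h>

    /* Index of the highest set bit of a nonzero value, two ways:
       the bsr way, and the lzcnt-plus-xor way this patch uses. */
    static int highbit_bsr_style( uint32_t x )
    {
        return 31 - __builtin_clz( x );   /* what bsr computes */
    }

    static int highbit_lzcnt_style( uint32_t x )
    {
        return __builtin_clz( x ) ^ 31;   /* lzcnt r,x ; xor r,31 */
    }

    int main( void )
    {
        for( uint32_t x = 1; x < 1<<20; x++ )
            assert( highbit_bsr_style( x ) == highbit_lzcnt_style( x ) );
        return 0;
    }

Runtime detection is required rather than just emitting lzcnt
unconditionally: the lzcnt encoding is a rep-prefixed bsr, so CPUs
without the instruction silently execute it as plain bsr and return
wrong values instead of faulting.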
diff --git a/common/cpu.c b/common/cpu.c
index d8ed4d3..56c88cb 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -54,6 +54,7 @@ const x264_cpu_name_t x264_cpu_names[] = {
{"Cache32", X264_CPU_CACHELINE_32},
{"Cache64", X264_CPU_CACHELINE_64},
{"SSEMisalign", X264_CPU_SSE_MISALIGN},
+ {"LZCNT", X264_CPU_LZCNT},
{"Slow_mod4_stack", X264_CPU_STACK_MOD4},
{"", 0},
};
@@ -117,6 +118,7 @@ uint32_t x264_cpu_detect( void )
{
cpu |= X264_CPU_SSE2_IS_FAST;
cpu |= X264_CPU_SSE_MISALIGN;
+ cpu |= X264_CPU_LZCNT;
x264_cpu_mask_misalign_sse();
}
else
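The hunk above hooks the new flag into the existing AMD extended-CPUID
branch (the one that already sets SSE_MISALIGN). For reference, LZCNT
availability is advertised by the ABM bit, CPUID leaf 0x80000001, ECX
bit 5; a minimal standalone detection sketch (assuming GCC's <cpuid.h>,
not the x264 code path):

    #include <cpuid.h>

    /* Nonzero if the CPU reports ABM/LZCNT: CPUID.80000001H:ECX[5]. */
    static int have_lzcnt( void )
    {
        unsigned eax, ebx, ecx, edx;
        if( !__get_cpuid( 0x80000001, &eax, &ebx, &ecx, &edx ) )
            return 0;  /* extended leaf not supported */
        return (ecx >> 5) & 1;
    }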
diff --git a/common/quant.c b/common/quant.c
index fa38360..ac798a2 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -352,6 +352,11 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
#endif
pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext;
pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext;
+ if( cpu&X264_CPU_LZCNT )
+ {
+ pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmxext_lzcnt;
+ pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmxext_lzcnt;
+ }
}
if( cpu&X264_CPU_SSE2 )
@@ -376,6 +381,14 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2;
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
+ if( cpu&X264_CPU_LZCNT )
+ {
+ pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
+ pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
+ pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_sse2_lzcnt;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2_lzcnt;
+ }
}
if( cpu&X264_CPU_SSSE3 )
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 688f8b4..d2290a7 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -688,37 +688,53 @@ DECIMATE8x8 ssse3
or %1, %3
%endmacro
+%macro LAST_X86 3
+ bsr %1, %2
+%endmacro
+
+%macro LAST_SSE4A 3
+ lzcnt %1, %2
+ xor %1, %3
+%endmacro
+
+%macro COEFF_LAST4 1
%ifdef ARCH_X86_64
-cglobal x264_coeff_last4_mmxext, 1,1
- bsr rax, [r0]
+cglobal x264_coeff_last4_%1, 1,1
+ LAST rax, [r0], 0x3f
shr eax, 4
RET
%else
-cglobal x264_coeff_last4_mmxext, 0,3
+cglobal x264_coeff_last4_%1, 0,3
mov edx, r0m
mov eax, [edx+4]
xor ecx, ecx
test eax, eax
cmovz eax, [edx]
setnz cl
- bsr eax, eax
+ LAST eax, eax, 0x1f
shr eax, 4
lea eax, [eax+ecx*2]
RET
%endif
+%endmacro
+
+%define LAST LAST_X86
+COEFF_LAST4 mmxext
+%define LAST LAST_SSE4A
+COEFF_LAST4 mmxext_lzcnt
%macro COEFF_LAST 1
cglobal x264_coeff_last15_%1, 1,3
LAST_MASK r1d, r0-2, r2d
xor r1d, 0xffff
- bsr eax, r1d
+ LAST eax, r1d, 0x1f
dec eax
RET
cglobal x264_coeff_last16_%1, 1,3
LAST_MASK r1d, r0, r2d
xor r1d, 0xffff
- bsr eax, r1d
+ LAST eax, r1d, 0x1f
RET
%ifndef ARCH_X86_64
@@ -738,17 +754,18 @@ cglobal x264_coeff_last16_%1, 1,3
not r1d
xor r2d, -1
jne .secondhalf
- bsr eax, r1d
+ LAST eax, r1d, 0x1f
RET
.secondhalf:
- bsr eax, r2d
+ LAST eax, r2d, 0x1f
add eax, 32
RET
%endif
%endmacro
%ifdef ARCH_X86_64
- cglobal x264_coeff_last64_sse2, 1,4
+%macro COEFF_LAST64 1
+ cglobal x264_coeff_last64_%1, 1,4
LAST_MASK_SSE2 r1d, r0
LAST_MASK_SSE2 r2d, r0+32
LAST_MASK_SSE2 r3d, r0+64
@@ -760,16 +777,25 @@ cglobal x264_coeff_last16_%1, 1,3
shl r3, 32
or r1, r3
not r1
- bsr rax, r1
+ LAST rax, r1, 0x3f
RET
+%endmacro
+
+%define LAST LAST_X86
+COEFF_LAST64 sse2
+%define LAST LAST_SSE4A
+COEFF_LAST64 sse2_lzcnt
%endif
+%define LAST LAST_X86
%ifndef ARCH_X86_64
%define LAST_MASK LAST_MASK_MMX
COEFF_LAST mmxext
%endif
%define LAST_MASK LAST_MASK_SSE2
COEFF_LAST sse2
+%define LAST LAST_SSE4A
+COEFF_LAST sse2_lzcnt
;-----------------------------------------------------------------------------
; int x264_coeff_level_run( int16_t *dct, x264_run_level_t *runlevel )
@@ -783,6 +809,15 @@ COEFF_LAST sse2
pmovmskb %1, mm0
%endmacro
+%macro LZCOUNT_X86 3
+ bsr %1, %2
+ xor %1, %3
+%endmacro
+
+%macro LZCOUNT_SSE4A 3
+ lzcnt %1, %2
+%endmacro
+
; t6 = eax for return, t3 = ecx for shift, t[01] = r[01] for x86_64 args
%ifdef ARCH_X86_64
DECLARE_REG_TMP 0,1,2,3,4,5,6
@@ -794,21 +829,18 @@ COEFF_LAST sse2
cglobal x264_coeff_level_run%2_%1,0,7
movifnidn t0d, r0m
movifnidn t1d, r1m
- LAST_MASK t2d, t0-(%2&1)*2, t4d
- not t2d
- shl t2d, 32-((%2+1)&~1)
+ LAST_MASK t5d, t0-(%2&1)*2, t4d
+ not t5d
+ shl t5d, 32-((%2+1)&~1)
mov t4d, %2-1
- mov t5d, t2d
- bsr t3d, t2d
+ LZCOUNT t3d, t5d, 0x1f
xor t6d, t6d
shl t5d, 1
- xor t3d, 0x1f
sub t4d, t3d
shl t5d, t3b
mov [t1], t4d
.loop:
- bsr t3d, t5d
- xor t3d, 0x1f
+ LZCOUNT t3d, t5d, 0x1f
mov t2w, [t0+t4*2]
mov [t1+t6 +36], t3b
mov [t1+t6*2+ 4], t2w
@@ -820,6 +852,7 @@ cglobal x264_coeff_level_run%2_%1,0,7
RET
%endmacro
+%define LZCOUNT LZCOUNT_X86
%ifndef ARCH_X86_64
%define LAST_MASK LAST_MASK_MMX
COEFF_LEVELRUN mmxext, 15
@@ -830,3 +863,8 @@ COEFF_LEVELRUN mmxext, 4
%define LAST_MASK LAST_MASK_SSE2
COEFF_LEVELRUN sse2, 15
COEFF_LEVELRUN sse2, 16
+%define LZCOUNT LZCOUNT_SSE4A
+COEFF_LEVELRUN sse2_lzcnt, 15
+COEFF_LEVELRUN sse2_lzcnt, 16
+%define LAST_MASK LAST_MASK4_MMX
+COEFF_LEVELRUN mmxext_lzcnt, 4
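For readers following the asm, here is a plausible scalar reference for
what these kernels compute; the struct layout matches the store offsets
in COEFF_LEVELRUN above (last at 0, level[] at 4, run[] at 36), but the
names are illustrative rather than x264's exact C fallback:

    #include <stdint.h>

    /* Illustrative stand-in for x264_run_level_t. */
    typedef struct
    {
        int     last;       /* index of the last nonzero coefficient */
        int16_t level[16];  /* nonzero levels, scanned last-to-first  */
        uint8_t run[16];    /* zeros preceding each stored level      */
    } run_level_t;

    /* Index of the last nonzero coefficient, or -1 if all are zero. */
    static int coeff_last_ref( const int16_t *dct, int count )
    {
        int i = count - 1;
        while( i >= 0 && !dct[i] )
            i--;
        return i;
    }

    /* Levels and zero-runs, scanning backward from the last nonzero
       coefficient; returns the number of nonzero levels. */
    static int coeff_level_run_ref( const int16_t *dct, int count,
                                    run_level_t *rl )
    {
        int i = rl->last = coeff_last_ref( dct, count );
        int total = 0;
        while( i >= 0 )
        {
            int run = 0;
            rl->level[total] = dct[i--];
            while( i >= 0 && !dct[i] )
            {
                run++;
                i--;
            }
            rl->run[total++] = run;
        }
        return total;
    }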
diff --git a/common/x86/quant.h b/common/x86/quant.h
index 46186ce..878699f 100644
--- a/common/x86/quant.h
+++ b/common/x86/quant.h
@@ -64,10 +64,17 @@ int x264_coeff_last64_mmxext( int16_t *dct );
int x264_coeff_last15_sse2( int16_t *dct );
int x264_coeff_last16_sse2( int16_t *dct );
int x264_coeff_last64_sse2( int16_t *dct );
+int x264_coeff_last4_mmxext_lzcnt( int16_t *dct );
+int x264_coeff_last15_sse2_lzcnt( int16_t *dct );
+int x264_coeff_last16_sse2_lzcnt( int16_t *dct );
+int x264_coeff_last64_sse2_lzcnt( int16_t *dct );
int x264_coeff_level_run16_mmxext( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run16_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run16_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_mmxext( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run15_sse2( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run15_sse2_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
int x264_coeff_level_run4_mmxext( int16_t *dct, x264_run_level_t *runlevel );
+int x264_coeff_level_run4_mmxext_lzcnt( int16_t *dct, x264_run_level_t *runlevel );
#endif
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 8785a7a..aeaf5fb 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -156,7 +156,8 @@ static void print_bench(void)
b->cpu&X264_CPU_MMX ? "mmx" : "c",
b->cpu&X264_CPU_CACHELINE_32 ? "_c32" :
b->cpu&X264_CPU_CACHELINE_64 ? "_c64" :
- b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" : "",
+ b->cpu&X264_CPU_SSE_MISALIGN ? "_misalign" :
+ b->cpu&X264_CPU_LZCNT ? "_lzcnt" : "",
((int64_t)10*b->cycles/b->den - nop_time)/4 );
}
}
@@ -1392,6 +1393,11 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_CACHELINE_32, "MMX Cache32" );
cpu1 &= ~X264_CPU_CACHELINE_32;
#endif
+ if( x264_cpu_detect() & X264_CPU_LZCNT )
+ {
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "MMX_LZCNT" );
+ cpu1 &= ~X264_CPU_LZCNT;
+ }
}
if( x264_cpu_detect() & X264_CPU_SSE2 )
{
@@ -1405,6 +1411,12 @@ static int check_all_flags( void )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE_MISALIGN, "SSE_Misalign" );
cpu1 &= ~X264_CPU_SSE_MISALIGN;
}
+ if( x264_cpu_detect() & X264_CPU_LZCNT )
+ {
+ cpu1 &= ~X264_CPU_CACHELINE_64;
+ ret |= add_flags( &cpu0, &cpu1, X264_CPU_LZCNT, "SSE_LZCNT" );
+ cpu1 &= ~X264_CPU_LZCNT;
+ }
if( x264_cpu_detect() & X264_CPU_SSE3 )
ret |= add_flags( &cpu0, &cpu1, X264_CPU_SSE3 | X264_CPU_CACHELINE_64, "SSE3" );
if( x264_cpu_detect() & X264_CPU_SSSE3 )
diff --git a/x264.h b/x264.h
index 51be79e..8c517b1 100644
--- a/x264.h
+++ b/x264.h
@@ -62,6 +62,7 @@ typedef struct x264_t x264_t;
#define X264_CPU_SSE4 0x002000 /* SSE4.1 */
#define X264_CPU_SSE42 0x004000 /* SSE4.2 */
#define X264_CPU_SSE_MISALIGN 0x008000 /* Phenom support for misaligned SSE instruction arguments */
+#define X264_CPU_LZCNT 0x010000 /* Phenom support for "leading zero count" instruction. */
/* Analyse flags
*/