[x264-devel] [PATCH] x264_coeff_level_run4/8/15/16_neon
George Stephanos
gaf.stephanos at gmail.com
Wed Feb 8 23:45:33 CET 2012
---
common/arm/quant-a.S | 104 ++++++++++++++++++++++++++++++++++++++++++++++++
common/arm/quant.h | 5 ++
common/bitstream.h | 2 +-
common/quant.c | 4 ++
common/x86/quant-a.asm | 8 +--
tools/checkasm.c | 25 ++++++++++-
6 files changed, 139 insertions(+), 9 deletions(-)
diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index 8e675bd..1c14c86 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -353,3 +353,107 @@ function x264_coeff_last64_neon
movlt r0, #0
bx lr
.endfunc
+
+.macro COEFF_LEVEL_RUN size // emits x264_coeff_level_run<size>_neon: scans coefficients from last nonzero down to index 0
+function x264_coeff_level_run\size\()_neon // in: r0 = int16 coefficients, r1 = output (last, mask, level[], run[]); out: r0 = number of levels stored
+ push {r4-r7,lr} // r4-r7 are used as scratch below
+.if \size == 4
+ vld1.16 {d16}, [r0,:64] // load 4 coefficients (64-bit alignment hint)
+ vqmovn.u16 d18, q8 // narrow to bytes; d17 is not loaded, only the low 4 bytes of d18 are read later
+.endif
+
+.if \size == 8
+ vld1.64 {d16,d17}, [r0,:128] // load 8 coefficients (128-bit alignment hint)
+ vqmovn.u16 d18, q8 // unsigned saturating narrow: a nonzero halfword stays a nonzero byte
+.endif
+
+.if \size == 16 || \size == 15
+ vld1.64 {d16-d19}, [r0] // NOTE(review): loads 16 coefficients for size 15 as well - confirm the 16th is safe to read and cannot leak into mask
+ vqmovn.u16 d16, q8 // bytes for coefficients 0-7
+ vqmovn.u16 d17, q9 // bytes for coefficients 8-15
+.endif
+
+ movrel r2, pmovmskb_byte // table defined outside this hunk; presumably per-byte bit weights 1,2,4,...,128 - TODO confirm
+
+.if \size == 16 || \size == 15
+ vld1.64 {d0, d1}, [r2] // 16 table bytes
+ vtst.8 q8, q8 // each byte becomes 0xff if nonzero, else 0
+ vand q8, q0 // keep only the table value for nonzero positions
+.else
+
+ vld1.64 {d0}, [r2] // 8 table bytes
+ vtst.8 d18, d18 // each byte becomes 0xff if nonzero, else 0
+ vand d18, d0 // keep only the table value for nonzero positions
+.endif
+
+.if \size == 4
+ vmov.32 r2, d18[0] // the 4 per-coefficient bytes
+ add r2, r2, r2, ror #16 // horizontal byte sum, step 1
+ add r2, r2, r2, ror #24 // step 2: low byte now holds the sum of all 4 bytes
+.endif
+.if \size == 8
+ vmov.i8 d19, #0 // zero operand for the pairwise adds
+ vpadd.u8 d18, d19 // three pairwise adds fold 8 bytes into byte 0
+ vpadd.u8 d18, d19
+ vpadd.u8 d18, d19
+ vmov.32 r2, d18[0] // r2 = sum of the 8 bytes
+.endif
+.if \size == 16 || \size == 15
+ vmov.i8 d19, #0 // zero operand for the pairwise adds
+ vpadd.u8 d16, d19 // fold bytes for coefficients 0-7 into byte 0 of d16
+ vpadd.u8 d16, d19
+ vpadd.u8 d16, d19
+
+ vpadd.u8 d17, d19 // fold bytes for coefficients 8-15 into byte 0 of d17
+ vpadd.u8 d17, d19
+ vpadd.u8 d17, d19
+
+ vmov.32 r2, d16[0] // low half sum
+ vmov.32 r3, d17[0] // high half sum
+
+ orr r2, r2, r3, lsl #8 // combine the two halves into one 16-bit value
+.endif
+
+.if \size == 4
+ and r2, #0xff // discard the rotated partial sums left in the upper bytes
+.endif
+
+ str r2, [r1,#4] // mask
+
+ orr r2, #0xf0000000 // sentinel bits so the clz results below stay bounded
+ ror r2, #\size // mask bit size-1 moves to bit 31; sentinels land just below mask bit 0
+ mov r4, #\size-1 // highest coefficient index
+
+ clz r3, r2 // zero positions above the last nonzero coefficient
+
+ add r2, r2 // shift left by one; with the lsl below this also drops the last-nonzero bit
+ sub r4, r3 // r4 = index of the last nonzero coefficient
+ lsl r2, r3 // remaining mask bits now start at bit 31
+
+ str r4, [r1] // last
+ add r1, #8 // r1 -> level array (32-bit entries)
+ add r6, r1, #64 // r6 -> run array (bytes), 64 bytes past the start of level
+ lsl r4, #1 // index -> byte offset into the int16 coefficients
+ mov r7, #0 // number of levels stored so far
+1:
+ clz r3, r2 // run = zero positions before the next set mask bit
+ ldrsh r5, [r0, r4] // sign-extended coefficient at the current position
+ strb r3, [r6], #1 // store run, advance
+ add r3, #1 // step = run + 1
+ str r5, [r1], #4 // store level as 32 bits, advance
+ lsl r2, r3 // drop the run and the next set bit from the mask
+ subs r4, r4, r3, lsl #1 // move down by step coefficients; sets flags for the branch
+ add r7, #1 // count this level (flags still from subs)
+ bge 1b // continue while the offset is still >= 0
+
+ mov r0, r7 // return the number of levels
+ pop {r4-r7,pc} // restore registers and return
+ bx lr // NOTE(review): unreachable, the pop above already loads pc
+.endfunc
+.endm
+
+COEFF_LEVEL_RUN 4 // instantiate the 4, 8, 15 and 16 coefficient variants
+COEFF_LEVEL_RUN 8
+COEFF_LEVEL_RUN 15
+COEFF_LEVEL_RUN 16
+
diff --git a/common/arm/quant.h b/common/arm/quant.h
index e6fc343..a548d15 100644
--- a/common/arm/quant.h
+++ b/common/arm/quant.h
@@ -42,4 +42,9 @@ int x264_coeff_last15_neon( int16_t * );
int x264_coeff_last16_neon( int16_t * );
int x264_coeff_last64_neon( int16_t * );
+int x264_coeff_level_run4_neon( int16_t *, x264_run_level_t * );  // coefficients in, last/mask/level/run out; returns level count
+int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * );  // second argument was missing: the asm writes through r1
+int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * );
+int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * );
+
#endif
diff --git a/common/bitstream.h b/common/bitstream.h
index f407e1d..094d5ca 100644
--- a/common/bitstream.h
+++ b/common/bitstream.h
@@ -57,7 +57,7 @@ typedef struct
{
int last;
int mask;
- dctcoef level[16];
+ int32_t level[16];
uint8_t run[16];
} x264_run_level_t;
diff --git a/common/quant.c b/common/quant.c
index bdf3a0f..5a19d73 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -706,6 +706,10 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_neon;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_neon;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
+ pf->coeff_level_run4 = x264_coeff_level_run4_neon;
+ pf->coeff_level_run8 = x264_coeff_level_run8_neon;
+ pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_neon;
+ pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_neon;
}
#endif
#endif // HIGH_BIT_DEPTH
diff --git a/common/x86/quant-a.asm b/common/x86/quant-a.asm
index 24f024f..0425172 100644
--- a/common/x86/quant-a.asm
+++ b/common/x86/quant-a.asm
@@ -1366,13 +1366,11 @@ cglobal coeff_level_run%1,0,7
LZCOUNT t3d, t5d, 0x1f
%if HIGH_BIT_DEPTH
mov t2d, [t0+t4*4]
- mov [t1+t6+8+16*4], t3b
- mov [t1+t6*4+ 8], t2d
%else
- mov t2w, [t0+t4*2]
- mov [t1+t6+8+16*2], t3b
- mov [t1+t6*2+ 8], t2w
+ movsx t2, word [t0+t4*2]
%endif
+ mov [t1+t6+8+16*4], t3b
+ mov [t1+t6*4+ 8], t2d
inc t3d
shl t5d, t3b
inc t6d
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 88c508c..ae7750f 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -2022,22 +2022,41 @@ static int check_quant( int cpu_ref, int cpu_new )
x264_run_level_t runlevel_c, runlevel_a; \
int nnz = 0; \
int max = rand() & (size-1); \
- memset( dct1, 0, size*sizeof(dctcoef) ); \
+ memset( dct1, 0, size*sizeof(int32_t) ); \
memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
for( int idx = ac; idx < max; idx++ ) \
- nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
+ nnz |= dct1[idx] = !(rand()&1) + (!(rand()&15))*rand(); \
if( !nnz ) \
dct1[ac] = 1; \
int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
runlevel_c.mask != runlevel_a.mask || \
- memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \
+ memcmp(runlevel_c.level, runlevel_a.level, sizeof(int32_t)*result_c) || \
memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
{ \
ok = 0; \
fprintf( stderr, #name ": [FAILED]\n" ); \
+ fprintf( stderr, "\ndctcoef: " ); \
+ for (int i = 0; i < size; i++) fprintf( stderr, "%04x ", (dct1+ac)[i] ); \
+ fprintf( stderr, "\n\nresult_a: %d\n", result_a ); \
+ fprintf( stderr, "runlevel_a.last: %d\n", runlevel_a.last ); \
+ fprintf( stderr, "runlevel_a.mask: %d\n", runlevel_a.mask ); \
+ fprintf( stderr, "runlevel_a.level: " ); \
+ for (int i = 0; i < 16; i++) fprintf( stderr, "%08x ", runlevel_a.level[i] ); \
+ fprintf( stderr, "\n" ); \
+ fprintf( stderr, "runlevel_a.run: " ); \
+ for (int i = 0; i < 16; i++) fprintf( stderr, "%02x ", runlevel_a.run[i] ); \
+ fprintf( stderr, "\n\nresult_c: %d\n", result_c ); \
+ fprintf( stderr, "runlevel_c.last: %d\n", runlevel_c.last ); \
+ fprintf( stderr, "runlevel_c.mask: %d\n", runlevel_c.mask ); \
+ fprintf( stderr, "runlevel_c.level: " ); \
+ for (int i = 0; i < 16; i++) fprintf( stderr, "%08x ", runlevel_c.level[i] ); \
+ fprintf( stderr, "\n" ); \
+ fprintf( stderr, "runlevel_c.run: " ); \
+ for (int i = 0; i < 16; i++) fprintf( stderr, "%02x ", runlevel_c.run[i] ); \
+ fprintf( stderr, "\n\n" ); \
break; \
} \
} \
--
1.7.4.1
More information about the x264-devel mailing list