[x265] [PATCH 4 of 4] asm: assembly code for dequant_normal
Min Chen
chenm003 at 163.com
Mon Nov 25 07:38:02 CET 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1385360399 -28800
# Node ID 67e8ecb2b0e553a5259f0f4967fc9572360ef093
# Parent 9c7142ced7c412bd502b3921c8023da4dbee3f76
asm: assembly code for dequant_normal
diff -r 9c7142ced7c4 -r 67e8ecb2b0e5 source/common/dct.cpp
--- a/source/common/dct.cpp Mon Nov 25 12:03:42 2013 +0800
+++ b/source/common/dct.cpp Mon Nov 25 14:19:59 2013 +0800
@@ -720,8 +720,11 @@
void dequant_normal_c(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
{
- static const int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 };
assert(num <= 32 * 32);
+ // NOTE: maximum of scale is (72 * 256)
+ assert(scale < 32768);
+ assert((num % 8) == 0);
+ assert(shift <= 6);
int add, coeffQ;
diff -r 9c7142ced7c4 -r 67e8ecb2b0e5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Nov 25 12:03:42 2013 +0800
+++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 14:19:59 2013 +0800
@@ -663,6 +663,7 @@
p.intra_pred_dc[BLOCK_8x8] = x265_intra_pred_dc8_sse4;
p.intra_pred_dc[BLOCK_16x16] = x265_intra_pred_dc16_sse4;
p.intra_pred_dc[BLOCK_32x32] = x265_intra_pred_dc32_sse4;
+ p.dequant_normal = x265_dequant_normal_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
diff -r 9c7142ced7c4 -r 67e8ecb2b0e5 source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm Mon Nov 25 12:03:42 2013 +0800
+++ b/source/common/x86/pixel-util.asm Mon Nov 25 14:19:59 2013 +0800
@@ -32,6 +32,8 @@
SECTION .text
+cextern pw_1
+
;-----------------------------------------------------------------------------
; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
;-----------------------------------------------------------------------------
@@ -670,3 +672,49 @@
movd eax, m7
RET
+
+
+;-----------------------------------------------------------------------------
+; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
+; coef[i] = clip16((clip16(quantCoef[i]) * scale + (1 << (shift - 1))) >> shift)
+; num: non-zero multiple of 8; scale and the rounding add must each fit a signed word
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal dequant_normal, 5,5,5       ; load all 5 args; only m0-m4 are touched
+    movd        m1, r3d             ; m1 = word [scale]
+    movd        m0, r4d             ; m0 = shift
+    xor         r3d, r3d
+    dec         r4d
+    bts         r3d, r4d            ; r3d = add = 1 << (shift - 1)
+    movd        m2, r3d
+    punpcklwd   m1, m2
+    pshufd      m1, m1, 0           ; m1 = dword [add scale]
+    mova        m2, [pw_1]
+
+    ; r0 = quantCoef, r1 = coef, r2d = coefficients left to process
+    ; m0 = shift
+    ; m1 = scale (low word) and add (high word) in every dword
+    ; m2 = word [1]
+.loop:
+    movu        m3, [r0]
+    movu        m4, [r0 + 16]
+    packssdw    m3, m4              ; m3 = clipQCoef
+    punpckhwd   m4, m3, m2          ; interleave with 1 so pmaddwd adds 1 * add
+    punpcklwd   m3, m2
+    pmaddwd     m3, m1              ; m3 = dword (clipQCoef * scale + add)
+    pmaddwd     m4, m1
+    psrad       m3, m0
+    psrad       m4, m0
+    packssdw    m3, m3              ; OPT_ME: store must be 32 bits
+    pmovsxwd    m3, m3
+    packssdw    m4, m4
+    pmovsxwd    m4, m4
+    movu        [r1], m3
+    movu        [r1 + 16], m4
+
+    add         r0, 32
+    add         r1, 32
+
+    sub         r2d, 8              ; 8 coefficients per iteration
+    jnz         .loop
+    RET
diff -r 9c7142ced7c4 -r 67e8ecb2b0e5 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Nov 25 12:03:42 2013 +0800
+++ b/source/common/x86/pixel.h Mon Nov 25 14:19:59 2013 +0800
@@ -379,5 +379,6 @@
int x265_pixel_ssd_64x32_sse4(pixel *, intptr_t, pixel *, intptr_t);
int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
+void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
#endif // ifndef X265_I386_PIXEL_H
More information about the x265-devel
mailing list