[x265] [PATCH 4 of 4] asm: assembly code for dequant_normal

Min Chen chenm003 at 163.com
Mon Nov 25 07:38:02 CET 2013


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1385360399 -28800
# Node ID 67e8ecb2b0e553a5259f0f4967fc9572360ef093
# Parent  9c7142ced7c412bd502b3921c8023da4dbee3f76
asm: assembly code for dequant_normal

diff -r 9c7142ced7c4 -r 67e8ecb2b0e5 source/common/dct.cpp
--- a/source/common/dct.cpp	Mon Nov 25 12:03:42 2013 +0800
+++ b/source/common/dct.cpp	Mon Nov 25 14:19:59 2013 +0800
@@ -720,8 +720,11 @@
 
 void dequant_normal_c(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
 {
-    static const int invQuantScales[6] = { 40, 45, 51, 57, 64, 72 };
     assert(num <= 32 * 32);
+    // NOTE: maximum of scale is (72 * 256)
+    assert(scale < 32768);
+    assert((num % 8) == 0);
+    assert(shift <= 6);
 
     int add, coeffQ;
 
diff -r 9c7142ced7c4 -r 67e8ecb2b0e5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Nov 25 12:03:42 2013 +0800
+++ b/source/common/x86/asm-primitives.cpp	Mon Nov 25 14:19:59 2013 +0800
@@ -663,6 +663,7 @@
         p.intra_pred_dc[BLOCK_8x8] = x265_intra_pred_dc8_sse4;
         p.intra_pred_dc[BLOCK_16x16] = x265_intra_pred_dc16_sse4;
         p.intra_pred_dc[BLOCK_32x32] = x265_intra_pred_dc32_sse4;
+        p.dequant_normal = x265_dequant_normal_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 9c7142ced7c4 -r 67e8ecb2b0e5 source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm	Mon Nov 25 12:03:42 2013 +0800
+++ b/source/common/x86/pixel-util.asm	Mon Nov 25 14:19:59 2013 +0800
@@ -32,6 +32,8 @@
 
 SECTION .text
 
+cextern pw_1
+
 ;-----------------------------------------------------------------------------
 ; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
 ;-----------------------------------------------------------------------------
@@ -670,3 +672,49 @@
     movd        eax, m7
 
     RET
+
+
+;-----------------------------------------------------------------------------
+; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal dequant_normal, 2,5,5      ; prologue loads r0/r1; only m0-m4 are used
+    movd        m1, r3m             ; m1 = word [scale] (pmaddwd treats it as a signed word)
+    mov         r4d, r4m
+    movd        m0, r4d             ; m0 = shift (count for psrad)
+    xor         r3d, r3d
+    dec         r4d
+    bts         r3d, r4d            ; r3d = 1 << (shift - 1) = rounding add
+    movd        m2, r3d
+    punpcklwd   m1, m2              ; low dword of m1 = word pair [scale, add]
+    pshufd      m1, m1, 0           ; m1 = dword [add scale]
+    mova        m2, [pw_1]
+    ; r1 (coef) needs no reload: cglobal already loaded the first two arguments
+    mov         r2d, r2m            ; r2d = num, consumed 8 coefficients per iteration
+
+    ; m0 = shift
+    ; m1 = dword [add scale]
+    ; m2 = word [1]
+.loop:
+    movu        m3, [r0]            ; quantCoef[0..3] as dwords
+    movu        m4, [r0 + 16]       ; quantCoef[4..7] as dwords
+    packssdw    m3, m4              ; m3 = clipQCoef
+    punpckhwd   m4, m3, m2          ; m4 = word pairs [clipQCoef, 1] for coefficients 4..7
+    punpcklwd   m3, m2              ; m3 = word pairs [clipQCoef, 1] for coefficients 0..3
+    pmaddwd     m3, m1              ; m3 = dword (clipQCoef * scale + add)
+    pmaddwd     m4, m1
+    psrad       m3, m0              ; arithmetic >> shift
+    psrad       m4, m0
+    packssdw    m3, m3              ; OPT_ME: store must be 32 bits
+    pmovsxwd    m3, m3              ; saturated words sign-extended back to dwords
+    packssdw    m4, m4
+    pmovsxwd    m4, m4
+    movu        [r1], m3
+    movu        [r1 + 16], m4
+
+    add         r0, 32              ; advance both pointers by 8 dwords
+    add         r1, 32
+
+    sub         r2d, 8
+    jnz        .loop                ; exits only when r2d hits 0; C reference asserts num % 8 == 0
+    RET
diff -r 9c7142ced7c4 -r 67e8ecb2b0e5 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Mon Nov 25 12:03:42 2013 +0800
+++ b/source/common/x86/pixel.h	Mon Nov 25 14:19:59 2013 +0800
@@ -379,5 +379,6 @@
 int x265_pixel_ssd_64x32_sse4(pixel *, intptr_t, pixel *, intptr_t);
 int x265_pixel_ssd_64x48_sse4(pixel *, intptr_t, pixel *, intptr_t);
 int x265_pixel_ssd_64x64_sse4(pixel *, intptr_t, pixel *, intptr_t);
+void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
 
 #endif // ifndef X265_I386_PIXEL_H



More information about the x265-devel mailing list