[x265] [PATCH] asm: assembly code for quant

Min Chen chenm003 at 163.com
Tue Nov 19 10:18:04 CET 2013


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1384852672 -28800
# Node ID e91e72c53c15b9e0c3e78b4268aa7b35149ac86d
# Parent  2f5f538d2cbca3b46e8d27d860e9787cc19f406f
asm: assembly code for quant

diff -r 2f5f538d2cbc -r e91e72c53c15 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Nov 18 16:44:31 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp	Tue Nov 19 17:17:52 2013 +0800
@@ -633,6 +633,7 @@
         p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse4;
         p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
         p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
+        p.quant = x265_quant_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 2f5f538d2cbc -r e91e72c53c15 source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm	Mon Nov 18 16:44:31 2013 -0600
+++ b/source/common/x86/pixel-util.asm	Tue Nov 19 17:17:52 2013 +0800
@@ -26,9 +26,12 @@
 
 SECTION_RODATA 32
 
+c_d_4:          dd 4, 4, 4, 4   ; quant: step added to the lane indices after each group of 4 coefficients
+c_d_1234:       dd 1, 2, 3, 4   ; quant: initial 1-based lane indices of the first 4 coefficients
+
+
 SECTION .text
 
-
 ;-----------------------------------------------------------------------------
 ; void cvt32to16_shr(short *dst, int *src, intptr_t stride, int shift, int size)
 ;-----------------------------------------------------------------------------
@@ -549,3 +552,121 @@
 
     jnz        .loop
     RET
+
+; quant: quantizes coef[] 8 values per loop pass, writes deltaU[] and qCoef[], stores the index of the last nonzero level (-1 if none) to *lastPos, returns the sum of the unsigned levels
+;-----------------------------------------------------------------------------
+; uint32_t quant(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+%if ARCH_X86_64 == 1
+cglobal quant, 5,6,11
+  %define addVec    m8
+  %define qbits     m9
+  %define qbits8    m10
+%else
+cglobal quant, 5,6,8, 0-(3*mmsize)
+  %define addVec    [rsp + 0 * mmsize]
+  %define qbits     [rsp + 1 * mmsize]
+  %define qbits8    [rsp + 2 * mmsize]
+%endif
+
+    ; fill qbits (shift count for the psrad/pslld by qbits below)
+    movd        m0, r4d         ; r4d = qBits
+    mova        qbits, m0
+
+    ; fill qbits-8 (shift count used to produce deltaU)
+    sub         r4d, 8
+    movd        m0, r4d
+    mova        qbits8, m0
+
+    ; fill offset: broadcast the rounding term 'add' to all four lanes
+    mov         r4d, r5m        ; r5m = add
+    movd        m0, r4d
+    pshufd      m0, m0, 0
+    mova        addVec, m0
+
+    mov         r4d, r6m        ; r6m = numCoeff
+    shr         r4d, 3          ; r4d = numCoeff / 8 = loop count; the count is only tested after the body, so numCoeff < 8 is not handled
+    pxor        m7, m7          ; m7 = acSum4 (per-lane running sum of level1)
+    mova        m6, [c_d_1234]  ; m6 = last4 (1-based index of the coefficient currently in each lane)
+    pxor        m5, m5          ; m5 = per-lane 1-based index of the last nonzero level1 (0 = none yet)
+    mova        m4, [c_d_4]     ; m4 = [4 4 4 4] (index step per group of 4)
+.loop:
+    ; first 4 coeff of this pass
+    movu        m0, [r0]        ; m0 = level (coef)
+    pxor        m1, m1
+    pcmpgtd     m1, m0          ; m1 = sign mask (-1 where level < 0, else 0)
+    movu        m2, [r1]        ; m2 = quantCoeff
+    pabsd       m0, m0          ; m0 = abs(level)
+    pmulld      m0, m2          ; m0 = tmpLevel = abs(level) * quantCoeff
+    paddd       m2, m0, addVec  ; m2 = tmpLevel + add
+    psrad       m2, qbits       ; m2 = level1 = (tmpLevel + add) >> qBits
+    paddd       m7, m2          ; acSum4 += level1 (before sign and clamp)
+    pslld       m3, m2, qbits   ; m3 = level1 << qBits
+    psubd       m0, m3          ; m0 = tmpLevel - (level1 << qBits)
+    psrad       m0, qbits8      ; m0 = deltaU = m0 >> (qBits - 8)
+    movu        [r2], m0
+
+    pxor        m0, m0
+    pcmpeqd     m0, m2          ; m0 = mask4 (-1 where level1 == 0)
+    pand        m5, m0          ; keep the previous index only in lanes whose level1 is zero
+    pandn       m0, m6          ; m0 = current index in lanes whose level1 is nonzero
+    por         m5, m0          ; m5 = updated last-nonzero index per lane
+    paddd       m6, m4          ; advance lane indices to the next group of 4
+
+    pxor        m2, m1          ; reapply the sign: (level1 ^ sign) - sign
+    psubd       m2, m1
+    packssdw    m2, m2          ; saturate to the signed 16-bit range
+    pmovsxwd    m2, m2          ; sign-extend back to 32-bit for the store
+    movu        [r3], m2
+
+    ; second 4 coeff of this pass (same steps at byte offset 16)
+    movu        m0, [r0 + 16]   ; m0 = level (coef)
+    pxor        m1, m1
+    pcmpgtd     m1, m0          ; m1 = sign mask (-1 where level < 0, else 0)
+    movu        m2, [r1 + 16]   ; m2 = quantCoeff
+    pabsd       m0, m0          ; m0 = abs(level)
+    pmulld      m0, m2          ; m0 = tmpLevel = abs(level) * quantCoeff
+    paddd       m2, m0, addVec  ; m2 = tmpLevel + add
+    psrad       m2, qbits       ; m2 = level1 = (tmpLevel + add) >> qBits
+    paddd       m7, m2          ; acSum4 += level1 (before sign and clamp)
+    pslld       m3, m2, qbits   ; m3 = level1 << qBits
+    psubd       m0, m3          ; m0 = tmpLevel - (level1 << qBits)
+    psrad       m0, qbits8      ; m0 = deltaU = m0 >> (qBits - 8)
+    movu        [r2 + 16], m0
+
+    pxor        m0, m0
+    pcmpeqd     m0, m2          ; m0 = mask4 (-1 where level1 == 0)
+    pand        m5, m0          ; keep the previous index only in lanes whose level1 is zero
+    pandn       m0, m6          ; m0 = current index in lanes whose level1 is nonzero
+    por         m5, m0          ; m5 = updated last-nonzero index per lane
+    paddd       m6, m4          ; advance lane indices to the next group of 4
+
+    pxor        m2, m1          ; reapply the sign: (level1 ^ sign) - sign
+    psubd       m2, m1
+    packssdw    m2, m2          ; saturate to the signed 16-bit range
+    pmovsxwd    m2, m2          ; sign-extend back to 32-bit for the store
+    movu        [r3 + 16], m2
+
+    add         r0, 32          ; advance all four pointers by 8 int32 elements
+    add         r1, 32
+    add         r2, 32
+    add         r3, 32
+
+    dec         r4d
+    jnz        .loop
+
+    movhlps     m4, m5          ; m4 low qword = m5 high qword (m4 is no longer needed as the step)
+    pmaxud      m4, m5          ; m4[0] = max(lane0, lane2), m4[1] = max(lane1, lane3)
+    pshufd      m5, m4, 1       ; m5[0] = m4[1]
+    pmaxud      m4, m5          ; m4[0] = unsigned max over all four lanes
+
+    mov         r4, r7m         ; r4 = lastPos
+    movd        [r4], m4
+    dec         dword [r4]      ; 1-based -> 0-based index; becomes -1 when every level1 was zero
+
+    phaddd      m7, m7          ; two horizontal adds fold the four acSum4 lanes into lane 0
+    phaddd      m7, m7
+    movd        eax, m7         ; return value = total acSum
+
+    RET
diff -r 2f5f538d2cbc -r e91e72c53c15 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Mon Nov 18 16:44:31 2013 -0600
+++ b/source/common/x86/pixel.h	Tue Nov 19 17:17:52 2013 +0800
@@ -364,5 +364,6 @@
 void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
 void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
 void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
+uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
 
 #endif // ifndef X265_I386_PIXEL_H
diff -r 2f5f538d2cbc -r e91e72c53c15 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp	Mon Nov 18 16:44:31 2013 -0600
+++ b/source/test/mbdstharness.cpp	Tue Nov 19 17:17:52 2013 +0800
@@ -30,6 +30,7 @@
 #include <stdio.h>
 
 using namespace x265;
+#define ITERS  100
 
 struct DctConf_t
 {
@@ -245,7 +246,7 @@
         mintbuf2[i] = rand() & PIXEL_MAX;
     }
 
-    for (int i = 0; i <= 5; i++)
+    for (int i = 0; i <= ITERS; i++)
     {
         int width = (rand() % 4 + 1) * 4;
 
@@ -282,10 +283,10 @@
         j += 16;
 
 #if _DEBUG
-        memset(mintbuf3, 0, mem_cmp_size);
-        memset(mintbuf4, 0, mem_cmp_size);
-        memset(mintbuf5, 0, mem_cmp_size);
-        memset(mintbuf6, 0, mem_cmp_size);
+        memset(mintbuf3, 0xCD, mem_cmp_size);
+        memset(mintbuf4, 0xCD, mem_cmp_size);
+        memset(mintbuf5, 0xCD, mem_cmp_size);
+        memset(mintbuf6, 0xCD, mem_cmp_size);
 #endif
     }
 



More information about the x265-devel mailing list