[x265] [PATCH] asm: 10bpp code for calcrecon_4x4 and 8x8

murugan at multicorewareinc.com murugan at multicorewareinc.com
Wed Dec 11 09:03:33 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386748798 -19800
#      Wed Dec 11 13:29:58 2013 +0530
# Node ID 15d12e33cbf8e2766aeb2b79fed578323a66a93f
# Parent  1686a3f8b3d04a33dfff3da8ca55c0acc1684bc1
asm: 10bpp code for calcrecon_4x4 and 8x8

diff -r 1686a3f8b3d0 -r 15d12e33cbf8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Dec 11 12:46:46 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Dec 11 13:29:58 2013 +0530
@@ -676,6 +676,9 @@
         p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
         p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse2;
         p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse2;
+
+        p.calcrecon[BLOCK_4x4] = x265_calcRecons4_sse2;
+        p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
diff -r 1686a3f8b3d0 -r 15d12e33cbf8 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Dec 11 12:46:46 2013 +0530
+++ b/source/common/x86/pixel-util8.asm	Wed Dec 11 13:29:58 2013 +0530
@@ -55,12 +55,80 @@
 cextern pw_1
 cextern pw_00ff
 cextern pw_2000
+cextern pw_pixel_max
 
 ;-----------------------------------------------------------------------------
 ; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
 cglobal calcRecons4
+%if HIGH_BIT_DEPTH                       ; 4x4 path for HIGH_BIT_DEPTH builds: samples are processed as 16-bit words (paddw/CLIPW below)
+%if ARCH_X86_64 == 1
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8    ; x86inc: t0..t8 alias r0..r8 (args per the prototype comment: t0=pred, t1=residual, t2=recon, t3=reconqt, t4=reconipred, t5=stride)
+    PROLOGUE 6,9,6                       ; 6 args loaded into registers, 9 GPRs, 6 XMM registers (m0..m5 are used)
+%else
+    DECLARE_REG_TMP 0,1,2,3,4,5          ; only t0..t5 are registers here; t6/t7 are defined below as the stack slots r6m/r7m
+    PROLOGUE 6,7,6                       ; r6 is the one spare GPR: first a scratch for strideqt (t6d), then the loop counter (t8d)
+    %define t6      r6m
+    %define t6d     r6d
+    %define t7      r7m
+    %define t8d     r6d
+%endif
+
+    mov         t6d, r6m                 ; fetch the 7th argument (strideqt in the prototype comment)
+%if ARCH_X86_64 == 0
+    add         t6d, t6d                 ; double it -- presumably element stride -> byte stride for the int16_t reconqt buffer
+    mov         r6m, t6d                 ; write the doubled value back to its stack slot, because r6d is overwritten by the loop counter below
+%else
+    mov         r5d, r5m                 ; 32-bit reload of stride; a 32-bit write clears the upper half of r5
+    mov         r7d, r7m                 ; fetch the 8th argument (strideipred in the prototype comment)
+    add         t6d, t6d                 ; t6 = 2 * strideqt
+    add         t7, t7                   ; t7 = 2 * strideipred (note: on the x86-32 path t7 is NOT doubled; see the paired adds in the loop)
+%endif
+
+    pxor        m4, m4                   ; m4 = 0, lower bound passed to CLIPW
+    mova        m5, [pw_pixel_max]       ; m5 = upper bound passed to CLIPW (constant is declared cextern, defined elsewhere)
+    add         t5, t5                   ; t5 = 2 * stride; used for pred, residual and recon alike
+    mov         t8d, 4/2                 ; loop count: 4 rows, two rows handled per iteration
+.loop:                                   ; each iteration processes rows n and n+1
+    movh        m0, [t0]                 ; low 8 bytes  <- pred row n
+    movh        m1, [t0 + t5]            ; low 8 bytes  <- pred row n+1
+    punpcklqdq  m0, m1                   ; m0 = pred row n (low qword) | pred row n+1 (high qword)
+    movh        m2, [t1]                 ; residual row n
+    movh        m3, [t1 + t5]            ; residual row n+1 (stepped with the same t5 as pred)
+    punpcklqdq  m2, m3                   ; m2 = residual row n | residual row n+1
+    paddw       m0, m2                   ; word-wise pred + residual
+    CLIPW       m0, m4, m5               ; clamp each word using bounds m4 (0) and m5 (pw_pixel_max); CLIPW is a macro defined outside this hunk
+
+    ; store recon[] and recipred[] (named reconipred in the prototype comment)
+    movh        [t2], m0                 ; recon row n
+    movh        [t4], m0                 ; reconipred row n
+%if ARCH_X86_64 == 0
+    add         t4, t7                   ; t7 is the undoubled stack value on this path, so it is added twice ...
+    add         t4, t7                   ; ... to advance t4 by 2 * strideipred (one row)
+    movhps      [t4], m0                 ; reconipred row n+1 (high qword of m0)
+    add         t4, t7                   ; advance one more row ...
+    add         t4, t7                   ; ... so t4 now points at row n+2
+%else
+    movhps      [t4 + t7], m0            ; reconipred row n+1 (high qword of m0)
+    lea         t4, [t4 + t7 * 2]        ; advance t4 by two rows
+%endif
+    movhps      [t2 + t5], m0            ; recon row n+1
+
+    ; store recqt[] (named reconqt in the prototype comment)
+    movh        [t3], m0                 ; reconqt row n
+    add         t3, t6                   ; next reconqt row (t6 holds the doubled strideqt: register on x86-64, stack slot on x86-32)
+    movhps      [t3], m0                 ; reconqt row n+1
+    add         t3, t6                   ; next reconqt row
+
+    lea         t0, [t0 + t5 * 2]        ; pred     += 2 rows
+    lea         t1, [t1 + t5 * 2]        ; residual += 2 rows
+    lea         t2, [t2 + t5 * 2]        ; recon    += 2 rows
+
+    dec         t8d                      ; one row pair done
+    jnz        .loop
+
+%else          ;HIGH_BIT_DEPTH
 %if ARCH_X86_64 == 1
     DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
     PROLOGUE 6,9,4
@@ -118,11 +186,79 @@
 
     dec         t8d
     jnz        .loop
+%endif          ;HIGH_BIT_DEPTH
     RET
 
 
 INIT_XMM sse2
 cglobal calcRecons8
+%if HIGH_BIT_DEPTH                       ; 8x8 path for HIGH_BIT_DEPTH builds: each row is one 16-byte vector of 16-bit words
+%if ARCH_X86_64 == 1
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8    ; x86inc: t0..t8 alias r0..r8 (NOTE(review): argument order assumed to match the calcrecon prototype comment above calcRecons4 -- t0=pred, t1=residual, t2=recon, t3=reconqt, t4=reconipred, t5=stride)
+    PROLOGUE 6,9,6                       ; 6 args loaded into registers, 9 GPRs, 6 XMM registers (m0..m5 are used)
+%else
+    DECLARE_REG_TMP 0,1,2,3,4,5          ; only t0..t5 are registers here; t6/t7 are defined below as the stack slots r6m/r7m
+    PROLOGUE 6,7,6                       ; r6 is the one spare GPR: first a scratch for the 7th argument (t6d), then the loop counter (t8d)
+    %define t6      r6m
+    %define t6d     r6d
+    %define t7      r7m
+    %define t8d     r6d
+%endif
+
+    mov         t6d, r6m                 ; fetch the 7th argument (presumably strideqt)
+%if ARCH_X86_64 == 0
+    add         t6d, t6d                 ; double it -- presumably element stride -> byte stride for 16-bit data
+    mov         r6m, t6d                 ; write the doubled value back to its stack slot, because r6d is overwritten by the loop counter below
+%else
+    mov         r5d, r5m                 ; 32-bit reload of the 6th argument; a 32-bit write clears the upper half of r5
+    mov         r7d, r7m                 ; fetch the 8th argument (presumably strideipred)
+    add         t6d, t6d                 ; t6 = 2 * (7th argument)
+    add         t7, t7                   ; t7 = 2 * (8th argument) (note: on the x86-32 path t7 is NOT doubled; see the paired adds in the loop)
+%endif
+
+    pxor        m4, m4                   ; m4 = 0, lower bound passed to CLIPW
+    mova        m5, [pw_pixel_max]       ; m5 = upper bound passed to CLIPW (constant is defined outside this hunk)
+    add         t5, t5                   ; t5 = 2 * stride; used for the t0, t1 and t2 buffers alike
+    mov         t8d, 8/2                 ; loop count: 8 rows, two rows handled per iteration
+.loop:                                   ; each iteration processes rows n and n+1
+    movu        m0, [t0]                 ; 16 bytes from t0 row n (unaligned load)
+    movu        m1, [t0 + t5]            ; 16 bytes from t0 row n+1
+    movu        m2, [t1]                 ; 16 bytes from t1 row n
+    movu        m3, [t1 + t5]            ; 16 bytes from t1 row n+1 (stepped with the same t5 as t0)
+    paddw       m0, m2                   ; word-wise sum, row n
+    paddw       m1, m3                   ; word-wise sum, row n+1
+    CLIPW       m0, m4, m5               ; clamp row n using bounds m4 (0) and m5 (pw_pixel_max); CLIPW is a macro defined outside this hunk
+    CLIPW       m1, m4, m5               ; clamp row n+1 likewise
+
+    ; store recon[] and recipred[] (t2 and t4 destinations)
+    movu        [t2], m0                 ; t2 row n
+    movu        [t2 + t5], m1            ; t2 row n+1
+    movu        [t4], m0                 ; t4 row n
+%if ARCH_X86_64 == 0
+    add         t4, t7                   ; t7 is the undoubled stack value on this path, so it is added twice ...
+    add         t4, t7                   ; ... to advance t4 by one row
+    movu        [t4], m1                 ; t4 row n+1
+    add         t4, t7                   ; advance one more row ...
+    add         t4, t7                   ; ... so t4 now points at row n+2
+%else
+    movu        [t4 + t7], m1            ; t4 row n+1
+    lea         t4, [t4 + t7 * 2]        ; advance t4 by two rows
+%endif
+
+    ; store recqt[] (t3 destination)
+    movu        [t3], m0                 ; t3 row n
+    add         t3, t6                   ; next t3 row (t6 holds the doubled 7th argument: register on x86-64, stack slot on x86-32)
+    movu        [t3], m1                 ; t3 row n+1
+    add         t3, t6                   ; next t3 row
+
+    lea         t0, [t0 + t5 * 2]        ; t0 += 2 rows
+    lea         t1, [t1 + t5 * 2]        ; t1 += 2 rows
+    lea         t2, [t2 + t5 * 2]        ; t2 += 2 rows
+
+    dec         t8d                      ; one row pair done
+    jnz        .loop
+%else          ;HIGH_BIT_DEPTH
+
 %if ARCH_X86_64 == 1
     DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
     PROLOGUE 6,9,5
@@ -185,6 +321,7 @@
 
     dec         t8d
     jnz        .loop
+%endif          ;HIGH_BIT_DEPTH
     RET
 
 


More information about the x265-devel mailing list