[x265] [PATCH] asm: 10bpp code for calcrecon_16x16 and 32x32

murugan at multicorewareinc.com murugan at multicorewareinc.com
Wed Dec 11 09:52:47 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386751746 -19800
#      Wed Dec 11 14:19:06 2013 +0530
# Node ID 2e141f382fa809330244989c2822412e62b6015d
# Parent  15d12e33cbf8e2766aeb2b79fed578323a66a93f
asm: 10bpp code for calcrecon_16x16 and 32x32

diff -r 15d12e33cbf8 -r 2e141f382fa8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Dec 11 13:29:58 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Dec 11 14:19:06 2013 +0530
@@ -679,6 +679,8 @@
 
         p.calcrecon[BLOCK_4x4] = x265_calcRecons4_sse2;
         p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;
+        p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2;
+        p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
diff -r 15d12e33cbf8 -r 2e141f382fa8 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Wed Dec 11 13:29:58 2013 +0530
+++ b/source/common/x86/pixel-util.h	Wed Dec 11 14:19:06 2013 +0530
@@ -26,6 +26,8 @@
 
 void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
 void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
 void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
 void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
 
diff -r 15d12e33cbf8 -r 2e141f382fa8 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Dec 11 13:29:58 2013 +0530
+++ b/source/common/x86/pixel-util8.asm	Wed Dec 11 14:19:06 2013 +0530
@@ -325,6 +325,97 @@
     RET
 
 
+
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+cglobal calcRecons16             ; calcRecons16(pred, residual, recon, reconqt, reconipred, stride, strideqt, strideipred) -- 10bpp: pixel is 16-bit
+%if ARCH_X86_64 == 1
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+    PROLOGUE 6,9,6
+%else
+    DECLARE_REG_TMP 0,1,2,3,4,5
+    PROLOGUE 6,7,6
+    %define t6      r6m          ; x86-32: too few registers; strideqt stays in its stack arg slot
+    %define t6d     r6d
+    %define t7      r7m          ; x86-32: strideipred re-read from its stack arg slot on each use
+    %define t8d     r6d          ; r6d is free again after the spill below; reused as loop counter
+%endif
+
+    mov         t6d, r6m         ; t6d = strideqt (in int16_t elements)
+%if ARCH_X86_64 == 0
+    add         t6d, t6d         ; strideqt -> bytes (sizeof(int16_t) == 2)
+    mov         r6m, t6d         ; spill back: t6 aliases r6m from here on
+%else
+    mov         r5d, r5m         ; stride is a 32-bit int arg; reload to clear upper half of r5
+    mov         r7d, r7m         ; same for strideipred (32-bit mov zero-extends into r7)
+    add         t6d, t6d         ; strideqt -> bytes
+    add         t7, t7           ; strideipred -> bytes (10bpp pixel is 16-bit)
+%endif
+
+    pxor        m4, m4           ; clip low bound = 0
+    mova        m5, [pw_pixel_max] ; clip high bound for the configured bit depth
+    add         t5, t5           ; stride -> bytes
+    mov         t8d, 16/2        ; two rows per iteration -> 8 iterations
+.loop:
+    movu        m0, [t0]         ; pred[0..7]   (row 0)
+    movu        m1, [t0 + 16]    ; pred[8..15]
+    movu        m2, [t1]         ; residual[0..7]
+    movu        m3, [t1 + 16]    ; residual[8..15]
+    paddw       m0, m2           ; recon = pred + residual
+    paddw       m1, m3
+    CLIPW       m0, m4, m5       ; clamp to [0, pixel_max]
+    CLIPW       m1, m4, m5
+
+    ; store recon[] and recipred[]
+    movu        [t2], m0
+    movu        [t2 + 16], m1
+    movu        [t4], m0
+    movu        [t4 + 16], m1
+%if ARCH_X86_64 == 0
+    add         t4, t7           ; advance reconipred one row: r7m holds strideipred in
+    add         t4, t7           ; elements, so adding it twice == one row in bytes
+%endif
+
+    ; store recqt[]
+    movu        [t3], m0
+    movu        [t3 + 16], m1
+    add         t3, t6           ; next reconqt row (t6 already in bytes)
+
+    movu        m0, [t0 + t5]    ; repeat for row 1 of the pair
+    movu        m1, [t0 + t5 + 16]
+    movu        m2, [t1 + t5]
+    movu        m3, [t1 + t5 + 16]
+    paddw       m0, m2
+    paddw       m1, m3
+    CLIPW       m0, m4, m5
+    CLIPW       m1, m4, m5
+
+    ; store recon[] and recipred[]
+    movu        [t2 + t5], m0
+    movu        [t2 + t5 + 16], m1
+%if ARCH_X86_64 == 0
+    movu        [t4], m0         ; t4 was already advanced to this row above
+    movu        [t4 + 16], m1
+    add         t4, t7
+    add         t4, t7
+%else
+    movu        [t4 + t7], m0    ; x86-64 keeps t4 at row 0 and indexes row 1 by t7
+    movu        [t4 + t7 + 16], m1
+    lea         t4, [t4 + t7 * 2] ; skip the two rows just written
+%endif
+
+    ; store recqt[]
+    movu        [t3], m0
+    movu        [t3 + 16], m1
+    add         t3, t6
+
+    lea         t0, [t0 + t5 * 2] ; advance pred/residual/recon by two rows
+    lea         t1, [t1 + t5 * 2]
+    lea         t2, [t2 + t5 * 2]
+
+    dec         t8d
+    jnz        .loop
+%else          ;HIGH_BIT_DEPTH
 INIT_XMM sse4
 cglobal calcRecons16
 %if ARCH_X86_64 == 1
@@ -377,9 +468,143 @@
 
     dec         t8d
     jnz        .loop
+%endif          ;HIGH_BIT_DEPTH
     RET
 
-
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+cglobal calcRecons32             ; calcRecons32(pred, residual, recon, reconqt, reconipred, stride, strideqt, strideipred) -- 10bpp: 32 pixels = 64 bytes/row
+%if ARCH_X86_64 == 1
+    DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+    PROLOGUE 6,9,6
+%else
+    DECLARE_REG_TMP 0,1,2,3,4,5
+    PROLOGUE 6,7,6
+    %define t6      r6m          ; x86-32: too few registers; strideqt stays in its stack arg slot
+    %define t6d     r6d
+    %define t7      r7m          ; x86-32: strideipred re-read from its stack arg slot on each use
+    %define t8d     r6d          ; r6d is free again after the spill below; reused as loop counter
+%endif
+
+    mov         t6d, r6m         ; t6d = strideqt (in int16_t elements)
+%if ARCH_X86_64 == 0
+    add         t6d, t6d         ; strideqt -> bytes (sizeof(int16_t) == 2)
+    mov         r6m, t6d         ; spill back: t6 aliases r6m from here on
+%else
+    mov         r5d, r5m         ; stride is a 32-bit int arg; reload to clear upper half of r5
+    mov         r7d, r7m         ; same for strideipred (32-bit mov zero-extends into r7)
+    add         t6d, t6d         ; strideqt -> bytes
+    add         t7, t7           ; strideipred -> bytes (10bpp pixel is 16-bit)
+%endif
+
+    pxor        m4, m4           ; clip low bound = 0
+    mova        m5, [pw_pixel_max] ; clip high bound for the configured bit depth
+    add         t5, t5           ; stride -> bytes
+    mov         t8d, 32/2        ; two rows per iteration -> 16 iterations
+.loop:
+
+    movu        m0, [t0]         ; row 0, pixels 0..15: pred
+    movu        m1, [t0 + 16]
+    movu        m2, [t1]         ; residual
+    movu        m3, [t1 + 16]
+    paddw       m0, m2           ; recon = pred + residual
+    paddw       m1, m3
+    CLIPW       m0, m4, m5       ; clamp to [0, pixel_max]
+    CLIPW       m1, m4, m5
+
+    ; store recon[] and recipred[]
+    movu        [t2], m0
+    movu        [t2 + 16], m1
+    movu        [t4], m0
+    movu        [t4 + 16], m1
+
+    ; store recqt[]
+    movu        [t3], m0
+    movu        [t3 + 16], m1
+
+    movu        m0, [t0 + 32]    ; row 0, pixels 16..31
+    movu        m1, [t0 + 48]
+    movu        m2, [t1 + 32]
+    movu        m3, [t1 + 48]
+    paddw       m0, m2
+    paddw       m1, m3
+    CLIPW       m0, m4, m5
+    CLIPW       m1, m4, m5
+
+    ; store recon[] and recipred[]
+    movu        [t2 + 32], m0
+    movu        [t2 + 48], m1
+    movu        [t4 + 32], m0
+    movu        [t4 + 48], m1
+%if ARCH_X86_64 == 0
+    add         t4, t7           ; advance reconipred one row: r7m holds strideipred in
+    add         t4, t7           ; elements, so adding it twice == one row in bytes
+%endif
+
+    ; store recqt[]
+    movu        [t3 + 32], m0
+    movu        [t3 + 48], m1
+    add         t3, t6           ; next reconqt row (t6 already in bytes)
+
+    movu        m0, [t0 + t5]    ; row 1, pixels 0..15
+    movu        m1, [t0 + t5 + 16]
+    movu        m2, [t1 + t5]
+    movu        m3, [t1 + t5 + 16]
+    paddw       m0, m2
+    paddw       m1, m3
+    CLIPW       m0, m4, m5
+    CLIPW       m1, m4, m5
+
+    ; store recon[] and recipred[]
+    movu        [t2 + t5], m0
+    movu        [t2 + t5 + 16], m1
+%if ARCH_X86_64 == 0
+    movu        [t4], m0         ; t4 was already advanced to this row above
+    movu        [t4 + 16], m1
+%else
+    movu        [t4 + t7], m0    ; x86-64 keeps t4 at row 0 and indexes row 1 by t7
+    movu        [t4 + t7 + 16], m1
+%endif
+
+    ; store recqt[]
+    movu        [t3], m0
+    movu        [t3 + 16], m1
+
+    movu        m0, [t0 + t5 + 32] ; row 1, pixels 16..31
+    movu        m1, [t0 + t5 + 48]
+    movu        m2, [t1 + t5 + 32]
+    movu        m3, [t1 + t5 + 48]
+    paddw       m0, m2
+    paddw       m1, m3
+    CLIPW       m0, m4, m5
+    CLIPW       m1, m4, m5
+
+    ; store recon[] and recipred[]
+    movu        [t2 + t5 + 32], m0
+    movu        [t2 + t5 + 48], m1
+%if ARCH_X86_64 == 0
+    movu        [t4 + 32], m0
+    movu        [t4 + 48], m1
+    add         t4, t7
+    add         t4, t7
+%else
+    movu        [t4 + t7 + 32], m0
+    movu        [t4 + t7 + 48], m1
+    lea         t4, [t4 + t7 * 2] ; skip the two rows just written
+%endif
+
+    ; store recqt[]
+    movu        [t3 + 32], m0
+    movu        [t3 + 48], m1
+    add         t3, t6
+
+    lea         t0, [t0 + t5 * 2] ; advance pred/residual/recon by two rows
+    lea         t1, [t1 + t5 * 2]
+    lea         t2, [t2 + t5 * 2]
+
+    dec         t8d
+    jnz        .loop
+%else          ;HIGH_BIT_DEPTH
 INIT_XMM sse4
 cglobal calcRecons32
 %if ARCH_X86_64 == 1
@@ -446,6 +671,7 @@
 
     dec         t8d
     jnz        .loop
+%endif          ;HIGH_BIT_DEPTH
     RET
 
 


More information about the x265-devel mailing list