[x265] [PATCH] asm: 10bpp code for calcresidual_4x4 and 8x8

murugan at multicorewareinc.com murugan at multicorewareinc.com
Tue Dec 10 13:52:14 CET 2013


# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386679907 -19800
#      Tue Dec 10 18:21:47 2013 +0530
# Node ID 1e14d4cc6f85b76a14713db5ef6526e71d5016c4
# Parent  682981f97057b0e66cc9fca638a9eb81938b3444
asm: 10bpp code for calcresidual_4x4 and 8x8

diff -r 682981f97057 -r 1e14d4cc6f85 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Dec 10 16:46:51 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Dec 10 18:21:47 2013 +0530
@@ -666,6 +666,9 @@
 
         CHROMA_BLOCKCOPY(_sse2);
         LUMA_BLOCKCOPY(_sse2);
+
+        p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
+        p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
     }
     if (cpuMask & X265_CPU_SSSE3)
     {
diff -r 682981f97057 -r 1e14d4cc6f85 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Dec 10 16:46:51 2013 +0530
+++ b/source/common/x86/pixel-util8.asm	Tue Dec 10 18:21:47 2013 +0530
@@ -316,6 +316,37 @@
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse2
+%if HIGH_BIT_DEPTH
+cglobal getResidual4, 4,4,4                ; HIGH_BIT_DEPTH 4x4: residual = fenc - pred on 16-bit words
+    add      r3,    r3                      ; r3 = stride * 2, used below as the row step for all three pointers
+    ; r0 = fenc, r1 = pred, r2 = residual, r3 = stride (argument order of the prototype above, x86inc numbering)
+    ; row 0-1
+    movh         m0, [r0]                   ; fenc row 0 (low 64 bits = four words)
+    movh         m1, [r0 + r3]              ; fenc row 1
+    movh         m2, [r1]                   ; pred row 0
+    movh         m3, [r1 + r3]              ; pred row 1
+    punpcklqdq   m0, m1                     ; m0 = fenc row 0 (low qword) | fenc row 1 (high qword)
+    punpcklqdq   m2, m3                     ; m2 = pred row 0 (low qword) | pred row 1 (high qword)
+    psubw        m0, m2                     ; two rows of residuals in a single word-wise subtract
+
+    movlps       [r2], m0                   ; store residual row 0 from the low qword
+    movhps       [r2 + r3], m0              ; store residual row 1 from the high qword
+    lea          r0, [r0 + r3 * 2]          ; advance fenc by two rows
+    lea          r1, [r1 + r3 * 2]          ; advance pred by two rows
+    lea          r2, [r2 + r3 * 2]          ; advance residual by two rows
+
+    ; row 2-3
+    movh         m0, [r0]                   ; fenc row 2
+    movh         m1, [r0 + r3]              ; fenc row 3
+    movh         m2, [r1]                   ; pred row 2
+    movh         m3, [r1 + r3]              ; pred row 3
+    punpcklqdq   m0, m1                     ; pack both fenc rows into one register
+    punpcklqdq   m2, m3                     ; pack both pred rows into one register
+    psubw        m0, m2                     ; residuals for rows 2 and 3
+
+    movlps      [r2], m0                    ; store residual row 2
+    movhps      [r2 + r3], m0               ; store residual row 3 (no pointer advance needed after the last pair)
+%else
 cglobal getResidual4, 4,4,5
     pxor        m0, m0
 
@@ -347,11 +378,34 @@
     psubw       m1, m3
     movlps      [r2], m1
     movhps      [r2 + r3 * 2], m1
-
+%endif
     RET
 
 
 INIT_XMM sse2
+%if HIGH_BIT_DEPTH
+cglobal getResidual8, 4,4,4                ; HIGH_BIT_DEPTH 8x8: residual = fenc - pred on 16-bit words; uses m0-m3 only
+    add      r3,    r3                      ; r3 = stride * 2, used below as the row step for all three pointers
+    ; r0 = fenc, r1 = pred, r2 = residual, r3 = stride (argument order of the prototype above, x86inc numbering)
+%assign x 0
+%rep 8/2
+    ; two rows per iteration: rows 2*x and 2*x+1 (registers kept within the 4 XMM regs declared in cglobal)
+    movu        m0, [r0]                    ; fenc, even row (eight words)
+    movu        m1, [r0 + r3]               ; fenc, odd row
+    movu        m2, [r1]                    ; pred, even row
+    movu        m3, [r1 + r3]               ; pred, odd row
+    psubw       m0, m2                      ; residual, even row
+    psubw       m1, m3                      ; residual, odd row
+    movu        [r2], m0
+    movu        [r2 + r3], m1
+%assign x x+1
+%if (x != 4)                               ; skip the pointer advance after the final row pair
+    lea         r0, [r0 + r3 * 2]
+    lea         r1, [r1 + r3 * 2]
+    lea         r2, [r2 + r3 * 2]
+%endif
+%endrep
+%else
 cglobal getResidual8, 4,4,5
     pxor        m0, m0
 
@@ -377,6 +431,7 @@
     lea         r2, [r2 + r3 * 4]
 %endif
 %endrep
+%endif
     RET
 
 


More information about the x265-devel mailing list