[x265] [PATCH Review Only] ASM routine for interp_8tap_vert_pp_8xN function, (N=4, 8, 16, 32)

nabajit at multicorewareinc.com nabajit at multicorewareinc.com
Tue Oct 29 14:54:19 CET 2013


# HG changeset patch
# User Nabajit Deka
# Date 1383054838 -19800
#      Tue Oct 29 19:23:58 2013 +0530
# Node ID 2c8344b73d62b97347109946266d7f2e0200daa6
# Parent  358400cb0c67c22a1f387f3af86966447ba884fa
ASM routine for interp_8tap_vert_pp_8xN function, (N=4,8,16,32)

diff -r 358400cb0c67 -r 2c8344b73d62 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Tue Oct 29 11:33:01 2013 +0530
+++ b/source/common/x86/ipfilter8.asm	Tue Oct 29 19:23:58 2013 +0530
@@ -73,6 +73,26 @@
                 times 4 dw 58, -10
                 times 4 dw 4, -1
 
+tab_LumaCoeffVL:times 8 db 0, 0                             ; coeffIdx 0, taps 0-1 (16 bytes per row, 64 per coeffIdx)
+                times 8 db 0, 64                            ; coeffIdx 0, taps 2-3 (read at +16)
+                times 8 db 0, 0                             ; coeffIdx 0, taps 4-5 (read at +32)
+                times 8 db 0, 0                             ; coeffIdx 0, taps 6-7 (read at +48)
+
+                times 8 db -1, 4                            ; coeffIdx 1, taps 0-1
+                times 8 db -10, 58                          ; coeffIdx 1, taps 2-3
+                times 8 db 17, -5                           ; coeffIdx 1, taps 4-5
+                times 8 db 1, 0                             ; coeffIdx 1, taps 6-7
+
+                times 8 db -1, 4                            ; coeffIdx 2, taps 0-1
+                times 8 db -11, 40                          ; coeffIdx 2, taps 2-3
+                times 8 db 40, -11                          ; coeffIdx 2, taps 4-5
+                times 8 db 4, -1                            ; coeffIdx 2, taps 6-7
+
+                times 8 db 0, 1                             ; coeffIdx 3, taps 0-1
+                times 8 db -5, 17                           ; coeffIdx 3, taps 2-3
+                times 8 db 58, -10                          ; coeffIdx 3, taps 4-5
+                times 8 db 4, -1                            ; coeffIdx 3, taps 6-7
+
 SECTION .text
 
 %macro FILTER_H4_w2_2 3
@@ -898,3 +918,122 @@
     jnz         .loopH
 
     RET
+
+;Vertical 8-tap luma filter, 8 pixels wide, 4 output rows per expansion. In: r0=src, r1=srcStride, r2=dst, r3=dstStride, r6=coefficient set. Reads 11 source rows; clobbers m0-m2, m4-m7 and r5; leaves r0 advanced by 8*r1.
+%macro FILTER_VL_W8_4R 0
+    movq        m0,        [r0]                             ; source row 0 (8 pixels)
+    movq        m1,        [r0 + r1]                        ; source row 1
+    punpcklbw   m0,        m1                               ; interleave rows 0/1 byte-wise to pair them with a tap pair
+    pmaddubsw   m7,        m0,        [r6]                  ;m7 = [0+1]  (output row 0, taps 0-1)
+
+    movq        m0,        [r0 + 2 * r1]                    ; source row 2
+    punpcklbw   m1,        m0
+    pmaddubsw   m6,        m1,        [r6]                  ;m6 = [1+2]  (output row 1, taps 0-1)
+
+    lea         r0,        [r0 + 2 * r1]                    ; r0 -> source row 2
+    movq        m1,        [r0 + r1]                        ; source row 3
+    punpcklbw   m0,        m1
+    pmaddubsw   m5,        m0,        [r6]                  ;m5 = [2+3]  (output row 2, taps 0-1)
+    pmaddubsw   m0,        [r6 + 16]                        ; NOTE(review): SSE memory operand needs 16-byte alignment - confirm table alignment
+    paddw       m7,        m0                               ;m7 = [0+1+2+3]
+
+    movq        m0,        [r0 + 2 * r1]                    ; source row 4
+    punpcklbw   m1,        m0
+    pmaddubsw   m4,        m1,        [r6]                  ;m4 = [3+4]  (output row 3, taps 0-1)
+    pmaddubsw   m1,        [r6 + 16]
+    paddw       m6,        m1                               ;m6 = [1+2+3+4]
+
+    lea         r0,        [r0 + 2 * r1]                    ; r0 -> source row 4
+    movq        m1,        [r0 + r1]                        ; source row 5
+    punpcklbw   m0,        m1
+    pmaddubsw   m2,        m0,        [r6 + 16]             ; m2 is the scratch product for the later output row
+    pmaddubsw   m0,        [r6 + 32]
+    paddw       m7,        m0                               ;m7 = [0+1+2+3+4+5]
+    paddw       m5,        m2                               ;m5 = [2+3+4+5]
+
+    movq        m0,        [r0 + 2 * r1]                    ; source row 6
+    punpcklbw   m1,        m0
+    pmaddubsw   m2,        m1,        [r6 + 16]
+    pmaddubsw   m1,        [r6 + 32]
+    paddw       m6,        m1                               ;m6 = [1+2+3+4+5+6]
+    paddw       m4,        m2                               ;m4 = [3+4+5+6]
+
+    lea         r0,        [r0 + 2 * r1]                    ; r0 -> source row 6
+    movq        m1,        [r0 + r1]                        ; source row 7
+    punpcklbw   m0,        m1
+    pmaddubsw   m2,        m0,        [r6 + 32]
+    pmaddubsw   m0,        [r6 + 48]
+    paddw       m7,        m0                               ;m7 = [0+1+2+3+4+5+6+7]  (output row 0 complete)
+    paddw       m5,        m2                               ;m5 = [2+3+4+5+6+7]
+
+    movq        m0,        [r0 + 2 * r1]                    ; source row 8
+    punpcklbw   m1,        m0
+    pmaddubsw   m2,        m1,        [r6 + 32]
+    pmaddubsw   m1,        [r6 + 48]
+    paddw       m6,        m1                               ;m6 = [1+2+3+4+5+6+7+8]  (output row 1 complete)
+    paddw       m4,        m2                               ;m4 = [3+4+5+6+7+8]
+
+    lea         r0,        [r0 + 2 * r1]                    ; r0 -> source row 8 (net advance: 8 rows)
+    movq        m1,        [r0 + r1]                        ; source row 9
+    punpcklbw   m0,        m1
+    pmaddubsw   m0,        [r6 + 48]
+    paddw       m5,        m0                               ;m5 = [2+3+4+5+6+7+8+9]  (output row 2 complete)
+
+    movq        m0,        [r0 + 2 * r1]                    ; source row 10
+    punpcklbw   m1,        m0
+    pmaddubsw   m1,        [r6 + 48]
+    paddw       m4,        m1                              ;m4 = [3+4+5+6+7+8+9+10]  (output row 3 complete)
+
+    pmulhrsw    m7,        [tab_c_512]                      ; NOTE(review): tab_c_512 is defined outside this hunk; presumably words of 512, i.e. (x + 32) >> 6 - confirm
+    pmulhrsw    m6,        [tab_c_512]
+    pmulhrsw    m5,        [tab_c_512]
+    pmulhrsw    m4,        [tab_c_512]
+
+    packuswb    m7,        m6                               ; words -> bytes with unsigned saturation: m7 = out row 0 | out row 1
+    packuswb    m5,        m4                               ; m5 = out row 2 | out row 3
+
+    movlps      [r2],           m7                          ; dst row 0 (low 8 bytes)
+    movhps      [r2 + r3],      m7                          ; dst row 1 (high 8 bytes)
+    movlps      [r2 + 2 * r3],  m5                          ; dst row 2
+    lea         r5,             [r3 + 2 * r3]               ; r5 = 3 * dstStride (r5 is overwritten here)
+    movhps      [r2 + r5],      m5                          ; dst row 3
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_V_LUMA_8xN 2
+INIT_XMM sse4
+cglobal interp_8tap_vert_pp_%1x%2, 4, 7, 8                  ; 8 XMM regs: FILTER_VL_W8_4R writes m7, so it must be declared
+    mov         r4d,       r4m                              ; r4 = coeffIdx (32-bit load, so the shift/index below is well defined)
+    lea         r5,        [r1 + 2 * r1]
+    sub         r0,        r5                               ; src -= 3 * srcStride: start at the first tap row
+
+    shl         r4,        6                                ; each coeffIdx owns 64 bytes of tab_LumaCoeffVL
+%ifdef PIC
+    lea         r5,        [tab_LumaCoeffVL]
+    lea         r6,        [r5 + r4]                        ; r6 = coefficient set for this coeffIdx
+%else
+    lea         r6,        [tab_LumaCoeffVL + r4]           ; r6 = coefficient set for this coeffIdx
+%endif
+
+    ; r4 = rows still to produce; the loop consumes 4 per pass, so %2 must be a multiple of 4
+    mov         r4d,       %2
+
+.loopH:
+    FILTER_VL_W8_4R                                         ; writes 4 dst rows, leaves r0 advanced by 8 rows, clobbers r5
+
+    lea         r5,        [4 * r1]
+    sub         r0,        r5                               ; undo 4 of the 8 rows: net src += 4 * srcStride
+    lea         r5,        [4 * r3]
+    add         r2,        r5                               ; dst += 4 * dstStride
+
+    sub         r4,        4
+    jnz         .loopH
+    RET
+%endmacro
+
+    FILTER_V_LUMA_8xN 8,4
+    FILTER_V_LUMA_8xN 8,8
+    FILTER_V_LUMA_8xN 8,16
+    FILTER_V_LUMA_8xN 8,32


More information about the x265-devel mailing list