[x265] [PATCH 1 of 2] asm: routines for luma vss filter functions for all block sizes

nabajit at multicorewareinc.com nabajit at multicorewareinc.com
Fri Nov 15 13:18:37 CET 2013


# HG changeset patch
# User Nabajit Deka
# Date 1384517817 -19800
#      Fri Nov 15 17:46:57 2013 +0530
# Node ID 351229c80f52d580d24853f64f79e42d47617f87
# Parent  b918110fd337178a1cf3616989c65a1e0ed14776
asm: routines for luma vss filter functions for all block sizes.

diff -r b918110fd337 -r 351229c80f52 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Fri Nov 15 17:45:00 2013 +0530
+++ b/source/common/x86/ipfilter8.asm	Fri Nov 15 17:46:57 2013 +0530
@@ -5101,3 +5101,147 @@
 FILTER_VER_CHROMA_SS_W8_H2 8, 8
 FILTER_VER_CHROMA_SS_W8_H2 8, 16
 FILTER_VER_CHROMA_SS_W8_H2 8, 32
+
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_SS 2                    ; %1 = block width, %2 = block height (in int16 samples)
+INIT_XMM sse2
+cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-1
+
+    add        r1d, r1d                        ; srcStride: samples -> bytes (2 bytes per int16)
+    add        r3d, r3d                        ; dstStride: samples -> bytes
+    lea        r5, [r1 + 2 * r1]               ; r5 = 3 * srcStride (bytes)
+    sub        r0, r5                          ; start 3 rows above src: top row of the 8-row window
+    shl        r4d, 6                          ; coeffIdx * 64: each filter uses four 16-byte coefficient rows
+
+%ifdef PIC
+    lea        r5, [tab_LumaCoeffV]            ; PIC: take the table address into a register first
+    lea        r6, [r5 + r4]                   ; r6 = coefficient rows for coeffIdx
+%else
+    lea        r6, [tab_LumaCoeffV + r4]       ; r6 = coefficient rows for coeffIdx
+%endif
+
+    mov        byte [rsp], %2/4                ; row-group counter kept on the stack; 4 output rows per .loopH pass
+.loopH
+    mov        r4d, (%1/4)                     ; column-group counter; 4 output columns per .loopW pass
+.loopW
+    movq       m0, [r0]                        ; input row 0 (4 samples)
+    movq       m1, [r0 + r1]                   ; input row 1
+    punpcklwd  m0, m1                          ; m0 = rows 0,1 interleaved word by word
+    pmaddwd    m0, [r6 + 0 *16]                ; Row1  = rows[0,1] * coeff row 0
+
+    movq       m4, [r0 + 2 * r1]               ; input row 2
+    punpcklwd  m1, m4                          ; m1 = rows 1,2 interleaved
+    pmaddwd    m1, [r6 + 0 *16]                ; Row2  = rows[1,2] * coeff row 0
+
+    lea        r0, [r0 + 2 * r1]               ; advance src by two rows
+    movq       m5, [r0 + r1]                   ; input row 3
+    punpcklwd  m4, m5                          ; m4 = rows 2,3 interleaved
+    pmaddwd    m2, m4, [r6 + 0 *16]            ; Row3  = rows[2,3] * coeff row 0
+    pmaddwd    m4, [r6 + 1 * 16]               ; rows[2,3] * coeff row 1
+    paddd      m0, m4                          ; Row1 += rows[2,3] term (rows 0..3 done)
+
+    movq       m4, [r0 + 2 * r1]               ; input row 4
+    punpcklwd  m5, m4                          ; m5 = rows 3,4 interleaved
+    pmaddwd    m3, m5, [r6 + 0 *16]            ; Row4  = rows[3,4] * coeff row 0
+    pmaddwd    m5, [r6 + 1 * 16]               ; rows[3,4] * coeff row 1
+    paddd      m1, m5                          ; Row2 += rows[3,4] term (rows 1..4 done)
+
+    lea        r0, [r0 + 2 * r1]               ; advance src by two rows
+    movq       m5, [r0 + r1]                   ; input row 5
+    punpcklwd  m4, m5                          ; m4 = rows 4,5 interleaved
+    pmaddwd    m6, m4, [r6 + 1 * 16]           ; m6 = scratch: rows[4,5] * coeff row 1
+    paddd      m2, m6                          ; Row3 += rows[4,5] term (rows 2..5 done)
+    pmaddwd    m4, [r6 + 2 * 16]               ; rows[4,5] * coeff row 2
+    paddd      m0, m4                          ; Row1 += rows[4,5] term (rows 0..5 done)
+
+    movq       m4, [r0 + 2 * r1]               ; input row 6
+    punpcklwd  m5, m4                          ; m5 = rows 5,6 interleaved
+    pmaddwd    m6, m5, [r6 + 1 * 16]           ; rows[5,6] * coeff row 1
+    paddd      m3, m6                          ; Row4 += rows[5,6] term (rows 3..6 done)
+    pmaddwd    m5, [r6 + 2 * 16]               ; rows[5,6] * coeff row 2
+    paddd      m1, m5                          ; Row2 += rows[5,6] term (rows 1..6 done)
+
+    lea        r0, [r0 + 2 * r1]               ; advance src by two rows
+    movq       m5, [r0 + r1]                   ; input row 7
+    punpcklwd  m4, m5                          ; m4 = rows 6,7 interleaved
+    pmaddwd    m6, m4, [r6 + 2 * 16]           ; rows[6,7] * coeff row 2
+    paddd      m2, m6                          ; Row3 += rows[6,7] term (rows 2..7 done)
+    pmaddwd    m4, [r6 + 3 * 16]               ; rows[6,7] * coeff row 3
+    paddd      m0, m4                          ; Row1 complete: all 8 rows 0..7 accumulated
+    psrad      m0, 6                           ; arithmetic >> 6; no rounding offset is added here
+
+    movq       m4, [r0 + 2 * r1]               ; input row 8
+    punpcklwd  m5, m4                          ; m5 = rows 7,8 interleaved
+    pmaddwd    m6, m5, [r6 + 2 * 16]           ; rows[7,8] * coeff row 2
+    paddd      m3, m6                          ; Row4 += rows[7,8] term (rows 3..8 done)
+    pmaddwd    m5, [r6 + 3 * 16]               ; rows[7,8] * coeff row 3
+    paddd      m1, m5                          ; Row2 complete: rows 1..8 accumulated
+    psrad      m1, 6                           ; arithmetic >> 6
+
+    packssdw   m0, m1                          ; saturate to int16: low half = Row1, high half = Row2
+
+    movlps     [r2], m0                        ; store Row1 (4 samples)
+    movhps     [r2 + r3], m0                   ; store Row2 one dst row below
+
+    lea        r0, [r0 + 2 * r1]               ; advance src by two rows
+    movq       m5, [r0 + r1]                   ; input row 9
+    punpcklwd  m4, m5                          ; m4 = rows 8,9 interleaved
+    pmaddwd    m4, [r6 + 3 * 16]               ; rows[8,9] * coeff row 3
+    paddd      m2, m4                          ; Row3 complete: rows 2..9 accumulated
+    psrad      m2, 6                           ; arithmetic >> 6
+
+    movq       m4, [r0 + 2 * r1]               ; input row 10
+    punpcklwd  m5, m4                          ; m5 = rows 9,10 interleaved
+    pmaddwd    m5, [r6 + 3 * 16]               ; rows[9,10] * coeff row 3
+    paddd      m3, m5                          ; Row4 complete: rows 3..10 accumulated
+    psrad      m3, 6                           ; arithmetic >> 6
+
+    packssdw   m2, m3                          ; saturate to int16: low half = Row3, high half = Row4
+
+    movlps     [r2 + 2 * r3], m2               ; store Row3 two dst rows below Row1
+    lea        r5, [r3 + 2 * r3]               ; r5 = 3 * dstStride (bytes)
+    movhps     [r2 + r5], m2                   ; store Row4 three dst rows below Row1
+
+    lea        r5, [8 * r1 - 2 * 4]            ; src moved down 8 rows in this pass ...
+    sub        r0, r5                          ; ... rewind them and step 4 samples (8 bytes) right
+    add        r2, 2 * 4                       ; dst: next group of 4 columns
+
+    dec        r4d
+    jnz        .loopW                          ; until every 4-column group of this row band is done
+
+    lea        r0, [r0 + 4 * r1 - 2 * %1]      ; src: down 4 rows, back to the block's first column
+    lea        r2, [r2 + 4 * r3 - 2 * %1]      ; dst: down 4 rows, back to the block's first column
+
+    dec        byte [rsp]
+    jnz        .loopH                          ; until every 4-row band of the block is done
+
+    RET
+%endmacro
+
+    FILTER_VER_LUMA_SS 4, 4                    ; args are width, height -> emits interp_8tap_vert_ss_<width>x<height>
+    FILTER_VER_LUMA_SS 8, 8
+    FILTER_VER_LUMA_SS 8, 4
+    FILTER_VER_LUMA_SS 4, 8
+    FILTER_VER_LUMA_SS 16, 16
+    FILTER_VER_LUMA_SS 16, 8
+    FILTER_VER_LUMA_SS 8, 16
+    FILTER_VER_LUMA_SS 16, 12
+    FILTER_VER_LUMA_SS 12, 16
+    FILTER_VER_LUMA_SS 16, 4
+    FILTER_VER_LUMA_SS 4, 16
+    FILTER_VER_LUMA_SS 32, 32
+    FILTER_VER_LUMA_SS 32, 16
+    FILTER_VER_LUMA_SS 16, 32
+    FILTER_VER_LUMA_SS 32, 24
+    FILTER_VER_LUMA_SS 24, 32
+    FILTER_VER_LUMA_SS 32, 8
+    FILTER_VER_LUMA_SS 8, 32
+    FILTER_VER_LUMA_SS 64, 64
+    FILTER_VER_LUMA_SS 64, 32
+    FILTER_VER_LUMA_SS 32, 64
+    FILTER_VER_LUMA_SS 64, 48
+    FILTER_VER_LUMA_SS 48, 64
+    FILTER_VER_LUMA_SS 64, 16
+    FILTER_VER_LUMA_SS 16, 64


More information about the x265-devel mailing list