[x265] [PATCH 1 of 2] asm: routines for chroma vss filter functions for all block sizes

nabajit at multicorewareinc.com nabajit at multicorewareinc.com
Fri Nov 15 12:41:07 CET 2013


# HG changeset patch
# User Nabajit Deka
# Date 1384515384 -19800
#      Fri Nov 15 17:06:24 2013 +0530
# Node ID 9842bb0aab4c3b3a5b241d77cf6436d8bd7e717f
# Parent  7d42727cd87856e593f294ececcac218110d388a
asm: routines for chroma vss filter functions for all block sizes

diff -r 7d42727cd878 -r 9842bb0aab4c source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Fri Nov 15 17:00:39 2013 +0530
+++ b/source/common/x86/ipfilter8.asm	Fri Nov 15 17:06:24 2013 +0530
@@ -4802,21 +4802,302 @@
     punpcklbw  m2, m3
     punpcklbw  m5, m2
 
-    pmaddubsw   m5, m0
-
-    phaddw      m4, m5
-
-    psubw       m4, m1
-
-    movd        [r2 + 2 * r3], m4
-    lea         r6, [r2 + 2 * r3]
-    pshufd      m4 , m4 ,2
-    movd        [r6 + r3], m4
-
-    lea         r0, [r0 + 4 * r1]
-    lea         r2, [r2 + 4 * r3]
-
-    dec         r4d
+    pmaddubsw  m5, m0
+
+    phaddw     m4, m5
+
+    psubw      m4, m1
+
+    movd       [r2 + 2 * r3], m4
+    lea        r6, [r2 + 2 * r3]
+    pshufd     m4 , m4 ,2
+    movd       [r6 + r3], m4
+
+    lea        r0, [r0 + 4 * r1]
+    lea        r2, [r2 + 4 * r3]
+
+    dec        r4d
     jnz        .loop
 
 RET
+
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SS 2                  ; %1 = block width, %2 = block height; processed in 4x4 tiles
+INIT_XMM sse2
+cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-1
+
+    add       r1d, r1d                         ; srcStride: int16_t elements -> bytes
+    add       r3d, r3d                         ; dstStride: int16_t elements -> bytes
+    sub       r0, r1                           ; start reading one row above the block
+    shl       r4d, 5                           ; coeffIdx * 32 = byte offset into tab_ChromaCoeffV
+
+%ifdef PIC
+    lea       r5, [tab_ChromaCoeffV]
+    lea       r6, [r5 + r4]                    ; r6 = coefficient set selected by coeffIdx
+%else
+    lea       r6, [tab_ChromaCoeffV + r4]      ; r6 = coefficient set selected by coeffIdx
+%endif
+
+    mov       byte [rsp], %2/4                 ; outer counter (4-row bands left) lives in a stack byte
+
+.loopH:
+    mov       r4d, (%1/4)                      ; inner counter: 4-column tiles per band
+.loopW:
+    PROCESS_CHROMA_SP_W4_4R
+
+    psrad     m0, 6                            ; m0..m3: 32-bit results from the macro above, scaled by >> 6
+    psrad     m1, 6
+    psrad     m2, 6
+    psrad     m3, 6
+
+    packssdw  m0, m1                           ; m0 = [row 0 | row 1] as saturated int16
+    packssdw  m2, m3                           ; m2 = [row 2 | row 3] as saturated int16
+
+    movlps    [r2], m0                         ; row 0 (4 x int16)
+    movhps    [r2 + r3], m0                    ; row 1
+    movlps    [r2 + 2 * r3], m2                ; row 2
+    lea       r5, [r3 + 2 * r3]                ; r5 = 3 * dstStride (bytes)
+    movhps    [r2 + r5], m2                    ; row 3
+
+    lea       r5, [4 * r1 - 2 * 4]
+    sub       r0, r5                           ; src: back up 4 rows, step right 4 int16 columns
+    add       r2, 2 * 4                        ; dst: step right 4 int16 columns
+
+    dec       r4d
+    jnz       .loopW
+
+    lea       r0, [r0 + 4 * r1 - 2 * %1]       ; src: down 4 rows, back to the first column
+    lea       r2, [r2 + 4 * r3 - 2 * %1]       ; dst: down 4 rows, back to the first column
+
+    dec       byte [rsp]
+    jnz       .loopH
+
+    RET
+%endmacro
+
+    FILTER_VER_CHROMA_SS 4, 4
+    FILTER_VER_CHROMA_SS 4, 8
+    FILTER_VER_CHROMA_SS 16, 16
+    FILTER_VER_CHROMA_SS 16, 8
+    FILTER_VER_CHROMA_SS 16, 12
+    FILTER_VER_CHROMA_SS 12, 16
+    FILTER_VER_CHROMA_SS 16, 4
+    FILTER_VER_CHROMA_SS 4, 16
+    FILTER_VER_CHROMA_SS 32, 32
+    FILTER_VER_CHROMA_SS 32, 16
+    FILTER_VER_CHROMA_SS 16, 32
+    FILTER_VER_CHROMA_SS 32, 24
+    FILTER_VER_CHROMA_SS 24, 32
+    FILTER_VER_CHROMA_SS 32, 8
+
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SS_W2_4R 2            ; %1 = block width (2), %2 = block height; 4 rows per pass
+INIT_XMM sse2
+cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 5
+
+    add       r1d, r1d                         ; srcStride: int16_t elements -> bytes
+    add       r3d, r3d                         ; dstStride: int16_t elements -> bytes
+    sub       r0, r1                           ; start reading one row above the block
+    shl       r4d, 5                           ; coeffIdx * 32 = byte offset into tab_ChromaCoeffV
+
+%ifdef PIC
+    lea       r5, [tab_ChromaCoeffV]
+    lea       r6, [r5 + r4]                    ; r6 = coefficient set selected by coeffIdx
+%else
+    lea       r6, [tab_ChromaCoeffV + r4]      ; r6 = coefficient set selected by coeffIdx
+%endif
+
+    mov       r4d, (%2/4)                      ; loop counter: 4-row bands left
+
+.loopH:
+    PROCESS_CHROMA_SP_W2_4R
+
+    psrad     m0, 6                            ; m0, m2: 32-bit results from the macro above, scaled by >> 6
+    psrad     m2, 6
+
+    packssdw  m0, m0                           ; dword 0 = row 0, dword 1 = row 1 (2 x int16 each)
+    packssdw  m2, m2                           ; dword 0 = row 2, dword 1 = row 3 (2 x int16 each)
+
+    movd      [r2], m0                         ; row 0
+    pshufd    m0, m0, 1                        ; bring dword 1 down to lane 0
+    movd      [r2 + r3], m0                    ; row 1
+    lea       r2, [r2 + 2 * r3]                ; dst: down 2 rows
+    movd      [r2], m2                         ; row 2
+    pshufd    m2, m2, 1                        ; bring dword 1 down to lane 0
+    movd      [r2 + r3], m2                    ; row 3
+
+    lea       r2, [r2 + 2 * r3]                ; dst: down 2 more rows; r0 is not touched here (presumably advanced by the macro)
+
+    dec       r4d
+    jnz       .loopH
+
+    RET
+%endmacro
+
+FILTER_VER_CHROMA_SS_W2_4R 2, 4
+FILTER_VER_CHROMA_SS_W2_4R 2, 8
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal interp_4tap_vert_ss_4x2, 5, 6, 4
+
+    add        r1d, r1d                        ;srcStride: int16_t elements -> bytes
+    add        r3d, r3d                        ;dstStride: int16_t elements -> bytes
+    sub        r0, r1                          ;start one row above the block; rows below are numbered from here
+    shl        r4d, 5                          ;coeffIdx * 32 = byte offset into tab_ChromaCoeffV
+
+%ifdef PIC
+    lea        r5, [tab_ChromaCoeffV]
+    lea        r5, [r5 + r4]                   ;r5 = coefficient set selected by coeffIdx
+%else
+    lea        r5, [tab_ChromaCoeffV + r4]     ;r5 = coefficient set selected by coeffIdx
+%endif
+
+    movq       m0, [r0]                        ;row 0 (4 x int16)
+    movq       m1, [r0 + r1]                   ;row 1
+    punpcklwd  m0, m1                          ;m0=[0 1]
+    pmaddwd    m0, [r5 + 0 *16]                ;m0=[0+1]  Row1
+
+    movq       m2, [r0 + 2 * r1]               ;row 2
+    punpcklwd  m1, m2                          ;m1=[1 2]
+    pmaddwd    m1, [r5 + 0 *16]                ;m1=[1+2]  Row2
+
+    lea        r0, [r0 + 2 * r1]               ;r0 now points at row 2
+    movq       m3, [r0 + r1]                   ;row 3
+    punpcklwd  m2, m3                          ;m2=[2 3]
+    pmaddwd    m2, [r5 + 1 * 16]               ;m2=[2+3], second 16-byte row of the coefficient set
+    paddd      m0, m2                          ;m0=[0+1+2+3]  Row1 done
+    psrad      m0, 6                           ;scale 32-bit sums by >> 6
+
+    movq       m2, [r0 + 2 * r1]               ;row 4
+    punpcklwd  m3, m2                          ;m3=[3 4]
+    pmaddwd    m3, [r5 + 1 * 16]               ;m3=[3+4], second 16-byte row of the coefficient set
+    paddd      m1, m3                          ;m1=[1+2+3+4]  Row2 done
+    psrad      m1, 6                           ;scale 32-bit sums by >> 6
+
+    packssdw   m0, m1                          ;m0 = [Row1 | Row2] as saturated int16
+
+    movlps     [r2], m0                        ;store output Row1 (4 x int16)
+    movhps     [r2 + r3], m0                   ;store output Row2
+
+    RET
+
+;---------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;---------------------------------------------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal interp_4tap_vert_ss_6x8, 5, 7, 6
+
+    add       r1d, r1d                         ; srcStride: int16_t elements -> bytes
+    add       r3d, r3d                         ; dstStride: int16_t elements -> bytes
+    sub       r0, r1                           ; start reading one row above the block
+    shl       r4d, 5                           ; coeffIdx * 32 = byte offset into tab_ChromaCoeffV
+
+%ifdef PIC
+    lea       r5, [tab_ChromaCoeffV]
+    lea       r6, [r5 + r4]                    ; r6 = coefficient set selected by coeffIdx
+%else
+    lea       r6, [tab_ChromaCoeffV + r4]      ; r6 = coefficient set selected by coeffIdx
+%endif
+
+    mov       r4d, 8/4                         ; loop counter: two 4-row bands cover the 8 rows
+
+.loopH:
+    PROCESS_CHROMA_SP_W4_4R
+
+    psrad     m0, 6                            ; columns 0-3: scale the macro's 32-bit results by >> 6
+    psrad     m1, 6
+    psrad     m2, 6
+    psrad     m3, 6
+
+    packssdw  m0, m1                           ; m0 = [row 0 | row 1] as saturated int16
+    packssdw  m2, m3                           ; m2 = [row 2 | row 3] as saturated int16
+
+    movlps    [r2], m0                         ; row 0, columns 0-3
+    movhps    [r2 + r3], m0                    ; row 1
+    movlps    [r2 + 2 * r3], m2                ; row 2
+    lea       r5, [r3 + 2 * r3]                ; r5 = 3 * dstStride (bytes)
+    movhps    [r2 + r5], m2                    ; row 3
+
+    lea       r5, [4 * r1 - 2 * 4]
+    sub       r0, r5                           ; src: back up 4 rows, step right 4 int16 columns
+    add       r2, 2 * 4                        ; dst: step right 4 int16 columns
+
+    PROCESS_CHROMA_SP_W2_4R
+
+    psrad     m0, 6                            ; columns 4-5: scale the macro's 32-bit results by >> 6
+    psrad     m2, 6
+
+    packssdw  m0, m0                           ; dword 0 = row 0, dword 1 = row 1 (2 x int16 each)
+    packssdw  m2, m2                           ; dword 0 = row 2, dword 1 = row 3 (2 x int16 each)
+
+    movd      [r2], m0                         ; row 0, columns 4-5
+    pshufd    m0, m0, 1                        ; bring dword 1 down to lane 0
+    movd      [r2 + r3], m0                    ; row 1
+    lea       r2, [r2 + 2 * r3]                ; dst: down 2 rows
+    movd      [r2], m2                         ; row 2
+    pshufd    m2, m2, 1                        ; bring dword 1 down to lane 0
+    movd      [r2 + r3], m2                    ; row 3
+
+    sub       r0, 2 * 4                        ; src: back to the first column (rows presumably advanced by the macro)
+    lea       r2, [r2 + 2 * r3 - 2 * 4]        ; dst: down 2 more rows, back to the first column
+
+    dec       r4d
+    jnz       .loopH
+
+    RET
+
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_SS_W8_H2 2            ; %1 = block width (8), %2 = block height; 2 rows per pass
+INIT_XMM sse2
+cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7
+
+    add       r1d, r1d                         ; srcStride: int16_t elements -> bytes
+    add       r3d, r3d                         ; dstStride: int16_t elements -> bytes
+    sub       r0, r1                           ; start reading one row above the block
+    shl       r4d, 5                           ; coeffIdx * 32 = byte offset into tab_ChromaCoeffV
+
+%ifdef PIC
+    lea       r5, [tab_ChromaCoeffV]
+    lea       r5, [r5 + r4]                    ; r5 = coefficient set selected by coeffIdx
+%else
+    lea       r5, [tab_ChromaCoeffV + r4]      ; r5 = coefficient set selected by coeffIdx
+%endif
+
+    mov       r4d, %2/2                        ; loop counter: 2-row bands left
+.loopH:
+    PROCESS_CHROMA_SP_W8_2R
+
+    psrad     m0, 6                            ; m0..m3: 32-bit results from the macro above, scaled by >> 6
+    psrad     m1, 6
+    psrad     m2, 6
+    psrad     m3, 6
+
+    packssdw  m0, m1                           ; m0 = row 0 as 8 saturated int16
+    packssdw  m2, m3                           ; m2 = row 1 as 8 saturated int16
+
+    movu      [r2], m0                         ; store row 0 (16 bytes, unaligned store)
+    movu      [r2 + r3], m2                    ; store row 1
+
+    lea       r2, [r2 + 2 * r3]                ; dst: down 2 rows; r0 is not touched here (presumably advanced by the macro)
+
+    dec       r4d
+    jnz       .loopH
+
+    RET
+%endmacro
+
+FILTER_VER_CHROMA_SS_W8_H2 8, 2
+FILTER_VER_CHROMA_SS_W8_H2 8, 4
+FILTER_VER_CHROMA_SS_W8_H2 8, 6
+FILTER_VER_CHROMA_SS_W8_H2 8, 8
+FILTER_VER_CHROMA_SS_W8_H2 8, 16
+FILTER_VER_CHROMA_SS_W8_H2 8, 32


More information about the x265-devel mailing list