[x265] [PATCH] asm: interp_4tap_horiz_pp sse3

dtyx265 at gmail.com dtyx265 at gmail.com
Mon May 11 20:51:25 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1431368383 25200
# Node ID f43b8e01ab507ac36825128322e02a1e06b7cd01
# Parent  3700169eb622204e7476d8b56772771b4f4e52c1
asm: interp_4tap_horiz_pp sse3

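Reduce code size by generating the 2xN and 4xN functions from macros
(FILTER_H4_W2xN_sse3, FILTER_H4_W4xN_sse3) and by instantiating the wider
block sizes with IPFILTER_CHROMA_sse3 instead of IPFILTER_CHROMA_W_sse3.
Move the sse4 macro closer to the sse4 code.
There are no changes to functionality or performance.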

diff -r 3700169eb622 -r f43b8e01ab50 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Thu May 07 10:18:17 2015 +0800
+++ b/source/common/x86/ipfilter8.asm	Mon May 11 11:19:43 2015 -0700
@@ -330,80 +330,38 @@
 %endmacro
 
 ;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_2xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_H4_W2xN_sse3 1
 INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_2x4, 4, 6, 6, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m5,         [pw_32]
-
-%ifdef PIC
-    lea         r5,          [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
+cglobal interp_4tap_horiz_pp_2x%1, 4, 6, 6, src, srcstride, dst, dststride
+    mov         r4d,    r4m
+    mova        m5,     [pw_32]
+
+%ifdef PIC
+    lea         r5,     [tabw_ChromaCoeff]
+    movddup     m4,     [r5 + r4 * 8]
+%else
+    movddup     m4,     [tabw_ChromaCoeff + r4 * 8]
+%endif
+
+%assign x 1
+%rep %1/2
     FILTER_H4_w2_2_sse2
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-    FILTER_H4_w2_2_sse2
-
-    RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m5,         [pw_32]
-
-%ifdef PIC
-    lea         r5,          [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 4
-    FILTER_H4_w2_2_sse2
-%if x < 4
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
+%if x < %1/2
+    lea         srcq,   [srcq + srcstrideq * 2]
+    lea         dstq,   [dstq + dststrideq * 2]
 %endif
 %assign x x+1
 %endrep
 
     RET
 
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_2x16, 4, 6, 6, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m5,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 8
-    FILTER_H4_w2_2_sse2
-%if x < 8
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-%endif
-%assign x x+1
-%endrep
-
-    RET
+%endmacro
+
+    FILTER_H4_W2xN_sse3 4
+    FILTER_H4_W2xN_sse3 8
+    FILTER_H4_W2xN_sse3 16
 
 %macro FILTER_H4_w4_2_sse2 0
     pxor        m5, m5
@@ -447,142 +405,40 @@
 %endmacro
 
 ;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_H4_W4xN_sse3 1
 INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x2, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
+cglobal interp_4tap_horiz_pp_4x%1, 4, 6, 8, src, srcstride, dst, dststride
+    mov         r4d,    r4m
+    mova        m7,     [pw_32]
+
+%ifdef PIC
+    lea         r5,     [tabw_ChromaCoeff]
+    movddup     m4,     [r5 + r4 * 8]
+%else
+    movddup     m4,     [tabw_ChromaCoeff + r4 * 8]
+%endif
+
+%assign x 1
+%rep %1/2
     FILTER_H4_w4_2_sse2
-
-    RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x4, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-    FILTER_H4_w4_2_sse2
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-    FILTER_H4_w4_2_sse2
-
-    RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x8, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 4
-    FILTER_H4_w4_2_sse2
-%if x < 4
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
+%if x < %1/2
+    lea         srcq,   [srcq + srcstrideq * 2]
+    lea         dstq,   [dstq + dststrideq * 2]
 %endif
 %assign x x+1
 %endrep
 
     RET
 
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x16, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 8
-    FILTER_H4_w4_2_sse2
-%if x < 8
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-%endif
-%assign x x+1
-%endrep
-
-    RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x32, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,          [tabw_ChromaCoeff]
-    movddup     m4,       [r5 + r4 * 8]
-%else
-    movddup     m4,       [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 16
-    FILTER_H4_w4_2_sse2
-%if x < 16
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-%endif
-%assign x x+1
-%endrep
-
-    RET
-
-%macro FILTER_H4_w2_2 3
-    movh        %2, [srcq - 1]
-    pshufb      %2, %2, Tm0
-    movh        %1, [srcq + srcstrideq - 1]
-    pshufb      %1, %1, Tm0
-    punpcklqdq  %2, %1
-    pmaddubsw   %2, coef2
-    phaddw      %2, %2
-    pmulhrsw    %2, %3
-    packuswb    %2, %2
-    movd        r4, %2
-    mov         [dstq], r4w
-    shr         r4, 16
-    mov         [dstq + dststrideq], r4w
-%endmacro
+%endmacro
+
+    FILTER_H4_W4xN_sse3 2
+    FILTER_H4_W4xN_sse3 4
+    FILTER_H4_W4xN_sse3 8
+    FILTER_H4_W4xN_sse3 16
+    FILTER_H4_W4xN_sse3 32
 
 %macro FILTER_H4_w6_sse2 0
     pxor        m4, m4
@@ -762,58 +618,28 @@
     IPFILTER_CHROMA_sse3 8,  64
     IPFILTER_CHROMA_sse3 12, 32
 
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-%macro IPFILTER_CHROMA_W_sse3 2
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_%1x%2, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,         r4m
-    mova        m7,         [pw_32]
-    pxor        m4,         m4
-%ifdef PIC
-    lea         r5,          [tabw_ChromaCoeff]
-    movddup     m6,       [r5 + r4 * 8]
-%else
-    movddup     m6,       [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep %2
-    FILTER_H4_w%1_sse2
-%if x < %2
-    add         srcq,        srcstrideq
-    add         dstq,        dststrideq
-%endif
-%assign x x+1
-%endrep
-
-    RET
-
-%endmacro
-
-    IPFILTER_CHROMA_W_sse3 16,  4
-    IPFILTER_CHROMA_W_sse3 16,  8
-    IPFILTER_CHROMA_W_sse3 16, 12
-    IPFILTER_CHROMA_W_sse3 16, 16
-    IPFILTER_CHROMA_W_sse3 16, 32
-    IPFILTER_CHROMA_W_sse3 32,  8
-    IPFILTER_CHROMA_W_sse3 32, 16
-    IPFILTER_CHROMA_W_sse3 32, 24
-    IPFILTER_CHROMA_W_sse3 24, 32
-    IPFILTER_CHROMA_W_sse3 32, 32
-
-    IPFILTER_CHROMA_W_sse3 16, 24
-    IPFILTER_CHROMA_W_sse3 16, 64
-    IPFILTER_CHROMA_W_sse3 32, 48
-    IPFILTER_CHROMA_W_sse3 24, 64
-    IPFILTER_CHROMA_W_sse3 32, 64
-
-    IPFILTER_CHROMA_W_sse3 64, 64
-    IPFILTER_CHROMA_W_sse3 64, 32
-    IPFILTER_CHROMA_W_sse3 64, 48
-    IPFILTER_CHROMA_W_sse3 48, 64
-    IPFILTER_CHROMA_W_sse3 64, 16
+    IPFILTER_CHROMA_sse3 16,  4
+    IPFILTER_CHROMA_sse3 16,  8
+    IPFILTER_CHROMA_sse3 16, 12
+    IPFILTER_CHROMA_sse3 16, 16
+    IPFILTER_CHROMA_sse3 16, 32
+    IPFILTER_CHROMA_sse3 32,  8
+    IPFILTER_CHROMA_sse3 32, 16
+    IPFILTER_CHROMA_sse3 32, 24
+    IPFILTER_CHROMA_sse3 24, 32
+    IPFILTER_CHROMA_sse3 32, 32
+
+    IPFILTER_CHROMA_sse3 16, 24
+    IPFILTER_CHROMA_sse3 16, 64
+    IPFILTER_CHROMA_sse3 32, 48
+    IPFILTER_CHROMA_sse3 24, 64
+    IPFILTER_CHROMA_sse3 32, 64
+
+    IPFILTER_CHROMA_sse3 64, 64
+    IPFILTER_CHROMA_sse3 64, 32
+    IPFILTER_CHROMA_sse3 64, 48
+    IPFILTER_CHROMA_sse3 48, 64
+    IPFILTER_CHROMA_sse3 64, 16
 
 %macro FILTER_H8_W8_sse2 0
     movh        m1, [r0 + x - 3]
@@ -1876,6 +1702,22 @@
     FILTER_V4_W8_H8_H16_H32_sse2 8, 64
 %endif
 
+%macro FILTER_H4_w2_2 3
+    movh        %2, [srcq - 1]
+    pshufb      %2, %2, Tm0
+    movh        %1, [srcq + srcstrideq - 1]
+    pshufb      %1, %1, Tm0
+    punpcklqdq  %2, %1
+    pmaddubsw   %2, coef2
+    phaddw      %2, %2
+    pmulhrsw    %2, %3
+    packuswb    %2, %2
+    movd        r4, %2
+    mov         [dstq], r4w
+    shr         r4, 16
+    mov         [dstq + dststrideq], r4w
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list