[x265] [PATCH Review Only] filterHorizontal_p_p_4, 32x32 asm code

praveen at multicorewareinc.com praveen at multicorewareinc.com
Tue Oct 8 09:23:58 CEST 2013


# HG changeset patch
# User Praveen Tiwari
# Date 1381217024 -19800
# Node ID 2b2fc4a46c7dcf8720b1b9872c0f3b86c048ffcd
# Parent  e5b94aa4444dc927ef38236365cbe3bd757e9eb4
filterHorizontal_p_p_4, 32x32 asm code

diff -r e5b94aa4444d -r 2b2fc4a46c7d source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Tue Oct 08 12:45:35 2013 +0530
+++ b/source/common/x86/ipfilter8.asm	Tue Oct 08 12:53:44 2013 +0530
@@ -448,3 +448,85 @@
     FILTER_H4_w24   x0, x1, x2, x3
     movh        [dstq + 16],    x1
     RET
+
+    SECTION_RODATA 32                                               ; constants used by filterHorizontal_p_p_4 below
+tab_Tm:     db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6       ; pshufb mask (loaded as Tm0): 4-byte windows starting at bytes 0..3
+            db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10      ; pshufb mask (loaded as Tm1): 4-byte windows starting at bytes 4..7
+
+tab_c_512:  times 8 dw 512                                          ; pmulhrsw multiplier: (x*512 + 0x4000) >> 15 == (x + 32) >> 6
+
+SECTION .text
+
+%macro FILTER_H4_w32 4 ; %1 = load/scratch, %2 = result, %3 = rounding multiplier, %4 = scratch; uses srcq, dstq, Tm0, Tm1, coef2
+    movu        %1, [srcq - 1]              ; 16 source bytes, starting one pixel left of output 0
+    pshufb      %2, %1, Tm0                 ; gather the 4-byte tap windows for outputs 0..3
+    pmaddubsw   %2, coef2                   ; unsigned pixel * signed tap, adjacent products summed into words
+    pshufb      %1, %1, Tm1                 ; gather the 4-byte tap windows for outputs 4..7
+    pmaddubsw   %1, coef2
+    phaddw      %2, %1                      ; add the two partial sums per output: %2 = 8 filtered words (outputs 0..7)
+    movu        %1, [srcq - 1 + 8]          ; same sequence for outputs 8..15, accumulated in %4
+    pshufb      %4, %1, Tm0
+    pmaddubsw   %4, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %4, %1
+    pmulhrsw    %2, %3                      ; callers pass 512 per word in %3, giving (x + 32) >> 6
+    pmulhrsw    %4, %3
+    packuswb    %2, %4                      ; saturate words to unsigned bytes: outputs 0..15
+    movu        [dstq], %2                  ; store the left 16 pixels of the row
+    movu        %1, [srcq - 1 + 16]         ; second half: outputs 16..23 accumulated in %2
+    pshufb      %2, %1, Tm0
+    pmaddubsw   %2, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %2, %1
+    movu        %1, [srcq - 1 + 24]         ; outputs 24..31 accumulated in %4; this load covers bytes srcq+23 .. srcq+38
+    pshufb      %4, %1, Tm0
+    pmaddubsw   %4, coef2
+    pshufb      %1, %1, Tm1
+    pmaddubsw   %1, coef2
+    phaddw      %4, %1
+    pmulhrsw    %2, %3
+    pmulhrsw    %4, %3
+    packuswb    %2, %4                      ; outputs 16..31 are left in %2; the macro does NOT store them
+%endmacro
+
+%macro FILTER_H4_w32_CALL 0 ; filter one 32-pixel row, then step srcq/dstq to the next row
+    FILTER_H4_w32   x0, x1, x2, x3          ; stores outputs 0..15 itself and leaves outputs 16..31 in x1
+
+    movu        [dstq + 16],      x1        ; store the right 16 pixels of the row
+
+    add         srcq,        srcstrideq     ; advance each pointer by its own stride
+    add         dstq,        dststrideq
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
+; Filters a fixed 32x32 block: width and height are never read. Only the first four 16-bit taps at coeff are used.
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal filterHorizontal_p_p_4, 4, 5, 7, src, srcstride, dst, dststride
+%define coef2       m6
+%define Tm0         m5
+%define Tm1         m4
+%define x3          m3
+%define x2          m2
+%define x1          m1
+%define x0          m0
+
+    mov         r4,         r6m             ; r4 = coeff (seventh argument)
+    movh        coef2,      [r4]            ; 8-byte load: exactly the four taps, nothing past them is read
+    packsswb    coef2,      coef2           ; narrow the taps to signed bytes, the form pmaddubsw consumes
+    pshufd      coef2,      coef2,      0   ; broadcast the four tap bytes into every dword
+
+    mova        x2,         [tab_c_512]     ; rounding multiplier handed to pmulhrsw inside the macro
+    mova        Tm0,        [tab_Tm]        ; shuffle mask for outputs 0..3 of each group of eight
+    mova        Tm1,        [tab_Tm + 16]   ; shuffle mask for outputs 4..7 of each group of eight
+; Rows 0..30 are filtered with a pointer advance; row 31 below is filtered without one.
+ %rep 31
+ FILTER_H4_w32_CALL
+ %endrep
+
+    FILTER_H4_w32   x0, x1, x2, x3
+    movu        [dstq + 16],    x1          ; right 16 pixels of the final row
+    RET


More information about the x265-devel mailing list