[x265] [PATCH 1 of 4] asm : Modifications for luma_hps and chroma_hps(extra rows)

nabajit at multicorewareinc.com nabajit at multicorewareinc.com
Wed Dec 4 13:49:48 CET 2013


# HG changeset patch
# User Nabajit Deka
# Date 1386159854 -19800
#      Wed Dec 04 17:54:14 2013 +0530
# Node ID 9440e424c637a46e15a96c03739d645e1dbf8b56
# Parent  9b062eb8124e9fb12bc16e32eab524ba080cf258
asm : Modifications for luma_hps and chroma_hps(extra rows)

diff -r 9b062eb8124e -r 9440e424c637 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Dec 04 14:54:59 2013 +0550
+++ b/source/common/x86/ipfilter8.asm	Wed Dec 04 17:54:14 2013 +0530
@@ -618,71 +618,75 @@
     phaddw      %2, %2
 %endmacro
 
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst (int16_t * for ps), intptr_t dstStride, int coeffIdx, int isRowExt (ps only))
+;---------------------------------------------------------------------------------------------------------------------------------------------------------
 %macro IPFILTER_LUMA 3
 INIT_XMM sse4
-cglobal interp_8tap_horiz_%3_%1x%2, 4, 6, 5
-
-mov         r4d, r4m
+cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 6
+; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride; XMM count is 6 because m0-m5 are handed to FILTER_H8_W8
+    mov       r4d, r4m                  ; r4 = coeffIdx
 
 %ifdef PIC
-lea         r5, [tab_LumaCoeff]
-movh        m3, [r5 + r4 * 8]
+    lea       r6, [tab_LumaCoeff]       ; table base in r6; r5 is not written before the r5m test in the ps path
+    movh      m3, [r6 + r4 * 8]         ; low 8 bytes of m3 = tab_LumaCoeff entry number coeffIdx
 %else
-movh        m3, [tab_LumaCoeff + r4 * 8]
+    movh      m3, [tab_LumaCoeff + r4 * 8]
 %endif
-
+    punpcklqdq  m3, m3                  ; same 8 bytes in both halves of m3
+; m2 = pmulhrsw multiplier in the pp build, word offset subtracted from every result in the ps build
+%ifidn %3, pp
+    mova      m2, [tab_c_512]
+%else
+    mova      m2, [pw_2000]
+%endif
+
+    mov       r4d, %2                   ; r4 = number of rows to produce
 %ifidn %3, ps
-    add     r3d, r3d
+    add       r3, r3                    ; ps rows hold 16-bit samples: byte stride = 2 * dstStride
+    cmp       r5m, byte 0               ; isRowExt == 0: filter the block rows only
+    je        .loopH
+    lea       r6, [r1 + 2 * r1]         ; r6 = 3 * srcStride
+    sub       r0, r6                    ; start 3 rows above the block ...
+    add       r4d, 7                    ; ... and produce 7 additional rows
 %endif
 
-punpcklqdq  m3, m3
-%ifidn %3, pp 
-    mova    m2, [tab_c_512]
-%else
-    mova    m2, [pw_2000]
-%endif
-
-mov         r4, %2
-
-.loop
-    xor     r5, r5
+.loopH:
+    xor       r5, r5                    ; r5 = column offset inside the current row
 %rep %1 / 8
   %ifidn %3, pp 
     FILTER_H8_W8  m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
   %else
     FILTER_H8_W8  m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
-    psubw   m1, m2
-    movu    [r2 + 2 * r5], m1
+    psubw     m1, m2
+    movu      [r2 + 2 * r5], m1         ; 8 result words for columns r5 .. r5 + 7
   %endif
-    add     r5, 8
+    add       r5, 8
 %endrep
 
 %rep (%1 % 8) / 4
     FILTER_H8_W4  m0, m1
   %ifidn %3, pp 
-    pmulhrsw    m1, m2
-    packuswb    m1, m1
-    movd        [r2 + r5], m1
+    pmulhrsw  m1, m2
+    packuswb  m1, m1
+    movd      [r2 + r5], m1             ; 4 result bytes
   %else
-    psubw       m1, m2
-    movh        [r2 + 2 * r5], m1
+    psubw     m1, m2
+    movh      [r2 + 2 * r5], m1         ; 4 result words
   %endif
 %endrep
 
-    add    r0, r1
-    add    r2, r3
-
-    dec    r4d
-    jnz   .loop
+    add       r0, r1                    ; next source row
+    add       r2, r3                    ; next destination row
+
+    dec       r4d
+    jnz       .loopH
     RET
 %endmacro
 
-;-------------------------------------------------------------------------------------------------------------
+;--------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
+;--------------------------------------------------------------------------------------------------------------
     IPFILTER_LUMA 4, 4, pp
     IPFILTER_LUMA 8, 8, pp
     IPFILTER_LUMA 8, 4, pp
@@ -709,9 +713,9 @@
     IPFILTER_LUMA 64, 16, pp
     IPFILTER_LUMA 16, 64, pp
 
-;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-------------------------------------------------------------------------------------------------------------
+;------------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;------------------------------------------------------------------------------------------------------------------------------
     IPFILTER_LUMA 4, 4, ps
     IPFILTER_LUMA 8, 8, ps
     IPFILTER_LUMA 8, 4, ps
@@ -4287,53 +4291,51 @@
 FILTER_VER_CHROMA_SP_W8_H2 8, 16
 FILTER_VER_CHROMA_SP_W8_H2 8, 32
 
-%macro PROCESS_CHROMA_W2 3
-    movh        %2, [srcq]
-    pshufb      %2, %2, Tm0
-    movh        %1, [srcq + srcstrideq]
-    pshufb      %1, %1, Tm0
-    punpcklqdq  %2, %1
-    pmaddubsw   %2, coef2
-    phaddw      %2, %2
-    psubw       %2, %3
-    movd        [dstq], %2
-    pshufd      %2, %2, 1
-    movd        [dstq + dststrideq], %2
-%endmacro
-
-;---------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;---------------------------------------------------------------------------------------------------------------
+;-----------------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_2x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-----------------------------------------------------------------------------------------------------------------------------
 %macro FILTER_HORIZ_CHROMA_2xN 2
 INIT_XMM sse4
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
-%define coef2    m5
-%define Tm0      m4
-%define Tm1      m3
-%define t2       m2
-%define t1       m1
-%define t0       m0
-
-    dec     srcq
-    mov     r4d, r4m
-    add     dststrided, dststrided
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
+%define coef2  m3
+%define Tm0    m2
+%define t1     m1
+%define t0     m0
+; coef2 = filter taps, Tm0 = pshufb control, t1 = word offset subtracted from each result, t0 = working register
+    dec        srcq                          ; src -= 1: loads start one byte left of src
+    mov        r4d, r4m                      ; r4 = coeffIdx
+    add        dststrideq, dststrideq        ; 16-bit output: byte stride = 2 * dstStride, full width as dstStride is an intptr_t
 
 %ifdef PIC
-    lea     r5, [tab_ChromaCoeff]
-    movd    coef2, [r5 + r4 * 4]
+    lea        r6, [tab_ChromaCoeff]         ; table base in r6; r5 is not written before the r5m test below
+    movd       coef2, [r6 + r4 * 4]          ; 4 bytes of tab_ChromaCoeff at index coeffIdx
 %else
-    movd    coef2, [tab_ChromaCoeff + r4 * 4]
+    movd       coef2, [tab_ChromaCoeff + r4 * 4]
 %endif
 
-    pshufd  coef2, coef2, 0
-    mova    t2, [pw_2000]
-    mova    Tm0, [tab_Tm]
-
-%rep %2/2
-    PROCESS_CHROMA_W2  t0, t1, t2
-    lea     srcq, [srcq + srcstrideq * 2]
-    lea     dstq, [dstq + dststrideq * 2]
-%endrep
+    pshufd     coef2, coef2, 0               ; repeat those 4 bytes in every dword of coef2
+    mova       t1, [pw_2000]
+    mova       Tm0, [tab_Tm]
+
+    mov        r4d, %2                       ; r4 = number of rows to produce
+    cmp        r5m, byte 0                   ; isRowExt == 0: filter the block rows only
+    je         .loopH
+    sub        srcq, srcstrideq              ; otherwise start one row above the block ...
+    add        r4d, 3                        ; ... and produce 3 additional rows
+; one output row per iteration
+.loopH:
+    movh       t0, [srcq]
+    pshufb     t0, t0, Tm0
+    pmaddubsw  t0, coef2
+    phaddw     t0, t0
+    psubw      t0, t1
+    movd       [dstq], t0                    ; store 2 result words
+
+    lea        srcq, [srcq + srcstrideq]
+    lea        dstq, [dstq + dststrideq]
+
+    dec        r4d
+    jnz        .loopH
 
     RET
 %endmacro
@@ -4341,53 +4343,51 @@
 FILTER_HORIZ_CHROMA_2xN 2, 4
 FILTER_HORIZ_CHROMA_2xN 2, 8
 
-%macro PROCESS_CHROMA_W4 3
-    movh        %2, [srcq]
-    pshufb      %2, %2, Tm0
-    pmaddubsw   %2, coef2
-    movh        %1, [srcq + srcstrideq]
-    pshufb      %1, %1, Tm0
-    pmaddubsw   %1, coef2
-    phaddw      %2, %1
-    psubw       %2, %3
-    movlps      [dstq], %2
-    movhps      [dstq + dststrideq], %2
-%endmacro
-
-;---------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;---------------------------------------------------------------------------------------------------------------
+;-----------------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-----------------------------------------------------------------------------------------------------------------------------
 %macro FILTER_HORIZ_CHROMA_4xN 2
 INIT_XMM sse4
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
-%define coef2    m5
-%define Tm0      m4
-%define Tm1      m3
-%define t2       m2
-%define t1       m1
-%define t0       m0
-
-    dec     srcq
-    mov     r4d, r4m
-    add     dststrided, dststrided
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 4, src, srcstride, dst, dststride
+%define coef2  m3
+%define Tm0    m2
+%define t1     m1
+%define t0     m0
+; coef2 = filter taps, Tm0 = pshufb control, t1 = word offset subtracted from each result, t0 = working register
+    dec        srcq                          ; src -= 1: loads start one byte left of src
+    mov        r4d, r4m                      ; r4 = coeffIdx
+    add        dststrideq, dststrideq        ; 16-bit output: byte stride = 2 * dstStride, full width as dstStride is an intptr_t
 
 %ifdef PIC
-    lea     r5, [tab_ChromaCoeff]
-    movd    coef2, [r5 + r4 * 4]
+    lea        r6, [tab_ChromaCoeff]         ; table base in r6; r5 is not written before the r5m test below
+    movd       coef2, [r6 + r4 * 4]          ; 4 bytes of tab_ChromaCoeff at index coeffIdx
 %else
-    movd    coef2, [tab_ChromaCoeff + r4 * 4]
+    movd       coef2, [tab_ChromaCoeff + r4 * 4]
 %endif
 
-    pshufd  coef2, coef2, 0
-    mova    t2, [pw_2000]
-    mova    Tm0, [tab_Tm]
-
-%rep %2/2
-    PROCESS_CHROMA_W4  t0, t1, t2
-    lea     srcq, [srcq + srcstrideq * 2]
-    lea     dstq, [dstq + dststrideq * 2]
-%endrep
-
+    pshufd     coef2, coef2, 0               ; repeat those 4 bytes in every dword of coef2
+    mova       t1, [pw_2000]
+    mova       Tm0, [tab_Tm]
+
+    mov        r4d, %2                       ; r4 = number of rows to produce
+    cmp        r5m, byte 0                   ; isRowExt == 0: filter the block rows only
+    je         .loopH
+    sub        srcq, srcstrideq              ; otherwise start one row above the block ...
+    add        r4d, 3                        ; ... and produce 3 additional rows
+; one output row per iteration
+.loopH:
+    movh       t0, [srcq]
+    pshufb     t0, t0, Tm0
+    pmaddubsw  t0, coef2
+    phaddw     t0, t0
+    psubw      t0, t1
+    movlps     [dstq], t0                    ; store 4 result words
+
+    lea        srcq, [srcq + srcstrideq]
+    lea        dstq, [dstq + dststrideq]
+
+    dec        r4d
+    jnz        .loopH
     RET
 %endmacro
 
@@ -4426,12 +4426,12 @@
     movh       [dstq + 16], %1
 %endmacro
 
-;---------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;---------------------------------------------------------------------------------------------------------------
+;-----------------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_6x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-----------------------------------------------------------------------------------------------------------------------------
 %macro FILTER_HORIZ_CHROMA 2
 INIT_XMM sse4
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
 %define coef2    m5
 %define Tm0      m4
 %define Tm1      m3
@@ -4444,8 +4444,8 @@
     add     dststrided, dststrided
 
 %ifdef PIC
-    lea     r5, [tab_ChromaCoeff]
-    movd    coef2, [r5 + r4 * 4]
+    lea     r6, [tab_ChromaCoeff]
+    movd    coef2, [r6 + r4 * 4]
 %else
     movd    coef2, [tab_ChromaCoeff + r4 * 4]
 %endif
@@ -4455,14 +4455,19 @@
     mova    Tm0, [tab_Tm]
     mova    Tm1, [tab_Tm + 16]
 
-    mov     r5d, %2
-.loop
+    mov     r4d, %2
+    cmp     r5m, byte 0
+    je      .loopH
+    sub     srcq, srcstrideq
+    add     r4d, 3
+
+.loopH
     PROCESS_CHROMA_W%1  t0, t1, t2
     add     srcq, srcstrideq
     add     dstq, dststrideq
 
-    dec     r5d
-    jnz     .loop
+    dec     r4d
+    jnz     .loopH
 
     RET
 %endmacro
@@ -4481,12 +4486,12 @@
     movu        [dstq], %2
 %endmacro
 
-;---------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;---------------------------------------------------------------------------------------------------------------
+;-----------------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-----------------------------------------------------------------------------------------------------------------------------
 %macro FILTER_HORIZ_CHROMA_8xN 2
 INIT_XMM sse4
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 6, src, srcstride, dst, dststride
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 6, src, srcstride, dst, dststride
 %define coef2    m5
 %define Tm0      m4
 %define Tm1      m3
@@ -4499,8 +4504,8 @@
     add     dststrided, dststrided
 
 %ifdef PIC
-    lea     r5, [tab_ChromaCoeff]
-    movd    coef2, [r5 + r4 * 4]
+    lea     r6, [tab_ChromaCoeff]
+    movd    coef2, [r6 + r4 * 4]
 %else
     movd    coef2, [tab_ChromaCoeff + r4 * 4]
 %endif
@@ -4510,14 +4515,19 @@
     mova    Tm0, [tab_Tm]
     mova    Tm1, [tab_Tm + 16]
 
-    mov     r5d, %2
-.loop
+    mov     r4d, %2
+    cmp     r5m, byte 0
+    je      .loopH
+    sub     srcq, srcstrideq
+    add     r4d, 3
+
+.loopH
     PROCESS_CHROMA_W8  t0, t1, t2
     add     srcq, srcstrideq
     add     dstq, dststrideq
 
-    dec     r5d
-    jnz     .loop
+    dec     r4d
+    jnz     .loopH
 
     RET
 %endmacro
@@ -4610,12 +4620,12 @@
     movu        [dstq + 48], %4
 %endmacro
 
-;---------------------------------------------------------------------------------------------------------------
-; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
-;---------------------------------------------------------------------------------------------------------------
+;------------------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;------------------------------------------------------------------------------------------------------------------------------
 %macro FILTER_HORIZ_CHROMA_WxN 2
 INIT_XMM sse4
-cglobal interp_4tap_horiz_ps_%1x%2, 4, 6, 7, src, srcstride, dst, dststride
+cglobal interp_4tap_horiz_ps_%1x%2, 4, 7, 7, src, srcstride, dst, dststride
 %define coef2    m6
 %define Tm0      m5
 %define Tm1      m4
@@ -4629,26 +4639,30 @@
     add     dststrided, dststrided
 
 %ifdef PIC
-    lea     r5, [tab_ChromaCoeff]
-    movd    coef2, [r5 + r4 * 4]
+    lea     r6, [tab_ChromaCoeff]
+    movd    coef2, [r6 + r4 * 4]
 %else
     movd    coef2, [tab_ChromaCoeff + r4 * 4]
 %endif
 
-    mov     r5d, %2
-
     pshufd  coef2, coef2, 0
     mova    t2, [pw_2000]
     mova    Tm0, [tab_Tm]
     mova    Tm1, [tab_Tm + 16]
 
-.loop
+    mov     r4d, %2
+    cmp     r5m, byte 0
+    je      .loopH
+    sub     srcq, srcstrideq
+    add     r4d, 3
+
+.loopH
     PROCESS_CHROMA_W%1   t0, t1, t2, t3
     add     srcq, srcstrideq
     add     dstq, dststrideq
 
-    dec     r5d
-    jnz     .loop
+    dec     r4d
+    jnz     .loopH
 
     RET
 %endmacro


More information about the x265-devel mailing list