[x265] [PATCH 1 of 2] asm : Add new file for 10bpp asm filter functions

nabajit at multicorewareinc.com nabajit at multicorewareinc.com
Tue Feb 25 12:39:40 CET 2014


# HG changeset patch
# User Nabajit Deka
# Date 1393328083 -19800
#      Tue Feb 25 17:04:43 2014 +0530
# Node ID c9236d867a07b18d0e28bd39528a02bf03cf4eda
# Parent  a36a669d09e89332dd91817afdf139853ba3ad03
asm : Add new file for 10bpp asm filter functions.

diff -r a36a669d09e8 -r c9236d867a07 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Tue Feb 25 02:22:06 2014 -0600
+++ b/source/common/CMakeLists.txt	Tue Feb 25 17:04:43 2014 +0530
@@ -104,12 +104,12 @@
     set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
     set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h)
     set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm
-               mc-a2.asm ipfilter8.asm pixel-util8.asm blockcopy8.asm
+               mc-a2.asm pixel-util8.asm blockcopy8.asm
                pixeladd8.asm dct8.asm)
     if(HIGH_BIT_DEPTH)
-        set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm)
+        set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm)
     else()
-        set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm)
+        set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm ipfilter8.asm)
     endif()
 
     if(NOT X64)
diff -r a36a669d09e8 -r c9236d867a07 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Tue Feb 25 02:22:06 2014 -0600
+++ b/source/common/x86/const-a.asm	Tue Feb 25 17:04:43 2014 +0530
@@ -84,6 +84,7 @@
 const pd_1024,     times 4 dd 1024
 const pd_2048,     times 4 dd 2048
 const pd_ffff,     times 4 dd 0xffff
+const pd_n32768,   times 4 dd 0xffff8000
 const pw_ff00,     times 8 dw 0xff00
 
 const multi_2Row,  dw 1, 2, 3, 4, 1, 2, 3, 4
diff -r a36a669d09e8 -r c9236d867a07 source/common/x86/ipfilter16.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/ipfilter16.asm	Tue Feb 25 17:04:43 2014 +0530
@@ -0,0 +1,723 @@
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Nabajit Deka <nabajit at multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing at multicorewareinc.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+
+tab_LumaCoeff:   dw   0, 0,  0,  64,  0,   0,  0,  0      ; coeffIdx 0 (one row = 8 word taps = 16 bytes)
+                 dw  -1, 4, -10, 58,  17, -5,  1,  0      ; coeffIdx 1
+                 dw  -1, 4, -11, 40,  40, -11, 4, -1      ; coeffIdx 2
+                 dw   0, 1, -5,  17,  58, -10, 4, -1      ; coeffIdx 3
+
+SECTION .text
+
+cextern pd_32
+cextern pw_pixel_max
+cextern pd_n32768
+
+;------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_W4 3                  ; 8-tap horizontal luma filter, 4 outputs per row; %1 = width, %2 = height, %3 = pp or ps
+INIT_XMM sse4
+cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
+
+    mov         r4d, r4m                     ; r4d = coeffIdx
+    sub         r0, 6                        ; back src up by 3 samples (2 bytes each)
+    shl         r4d, 4                       ; coeffIdx * 16 = byte offset of its tab_LumaCoeff row
+    add         r1, r1                       ; srcStride *= 2: samples -> bytes
+    add         r3, r3                       ; dstStride *= 2: samples -> bytes
+
+%ifdef PIC
+    lea         r6, [tab_LumaCoeff]          ; PIC: take the table base in a register, then index it
+    mova        m0, [r6 + r4]                ; m0 = the 8 filter taps
+%else
+    mova        m0, [tab_LumaCoeff + r4]     ; m0 = the 8 filter taps
+%endif
+
+%ifidn %3, pp
+    mova        m1, [pd_32]                  ; m1 = rounding term for the >> 6 below
+    pxor        m6, m6                       ; m6 = 0, CLIPW lower bound
+    mova        m7, [pw_pixel_max]           ; m7 = CLIPW upper bound
+%else
+    mova        m1, [pd_n32768]              ; m1 = -32768, added before the >> 2 below
+%endif
+
+    mov         r4d, %2                      ; r4d = number of rows to filter
+%ifidn %3, ps
+    cmp         r5m, byte 0                  ; isRowExt == 0?
+    je          .loopH                       ; yes: filter just the %2 block rows
+    lea         r6, [r1 + 2 * r1]            ; r6 = 3 * srcStride (bytes)
+    sub         r0, r6                       ; no: start 3 rows higher ...
+    add         r4d, 7                       ; ... and filter 7 additional rows
+%endif
+
+.loopH:
+    movu        m2, [r0]                     ; m2 = src[0-7]
+    movu        m3, [r0 + 16]                ; m3 = src[8-15]
+
+    pmaddwd     m4, m2, m0                   ; output 0: taps * src[0-7], summed in pairs
+    palignr     m5, m3, m2, 2                ; m5 = src[1-8]
+    pmaddwd     m5, m0
+    phaddd      m4, m5
+
+    palignr     m5, m3, m2, 4                ; m5 = src[2-9]
+    pmaddwd     m5, m0
+    palignr     m3, m2, 6                    ; m3 = src[3-10]
+    pmaddwd     m3, m0
+    phaddd      m5, m3
+
+    phaddd      m4, m5                       ; m4 = 32-bit filter sums for outputs 0-3
+    paddd       m4, m1
+%ifidn %3, pp
+    psrad       m4, 6
+    packusdw    m4, m4                       ; dwords -> words, unsigned saturation
+    CLIPW       m4, m6, m7                   ; clamp to [0, pw_pixel_max]
+%else
+    psrad       m4, 2
+    packssdw    m4, m4                       ; dwords -> words, signed saturation
+%endif
+
+    movh        [r2], m4                     ; store 4 output samples (low 8 bytes)
+
+    add         r0, r1                       ; next source row
+    add         r2, r3                       ; next destination row
+
+    dec         r4d
+    jnz         .loopH
+    RET
+%endmacro
+
+;------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W4 4, 4, pp
+FILTER_HOR_LUMA_W4 4, 8, pp
+FILTER_HOR_LUMA_W4 4, 16, pp
+
+;---------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;---------------------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W4 4, 4, ps
+FILTER_HOR_LUMA_W4 4, 8, ps
+FILTER_HOR_LUMA_W4 4, 16, ps
+
+;------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_W8 3                  ; 8-tap horizontal luma filter, 8 outputs per row; %1 = width, %2 = height, %3 = pp or ps
+INIT_XMM sse4
+cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
+
+    add         r1, r1                       ; srcStride *= 2: samples -> bytes
+    add         r3, r3                       ; dstStride *= 2: samples -> bytes
+    mov         r4d, r4m                     ; r4d = coeffIdx
+    sub         r0, 6                        ; back src up by 3 samples (2 bytes each)
+    shl         r4d, 4                       ; coeffIdx * 16 = byte offset of its tab_LumaCoeff row
+
+%ifdef PIC
+    lea         r6, [tab_LumaCoeff]          ; PIC: take the table base in a register, then index it
+    mova        m0, [r6 + r4]                ; m0 = the 8 filter taps
+%else
+    mova        m0, [tab_LumaCoeff + r4]     ; m0 = the 8 filter taps
+%endif
+
+%ifidn %3, pp
+    mova        m1, [pd_32]                  ; m1 = rounding term for the >> 6 below
+    pxor        m7, m7                       ; m7 = 0, CLIPW lower bound
+%else
+    mova        m1, [pd_n32768]              ; m1 = -32768, added before the >> 2 below
+%endif
+
+    mov         r4d, %2                      ; r4d = number of rows to filter
+%ifidn %3, ps
+    cmp         r5m, byte 0                  ; isRowExt == 0?
+    je          .loopH                       ; yes: filter just the %2 block rows
+    lea         r6, [r1 + 2 * r1]            ; r6 = 3 * srcStride (bytes)
+    sub         r0, r6                       ; no: start 3 rows higher ...
+    add         r4d, 7                       ; ... and filter 7 additional rows
+%endif
+
+.loopH:
+    movu        m2, [r0]                     ; m2 = src[0-7]
+    movu        m3, [r0 + 16]                ; m3 = src[8-15]
+
+    pmaddwd     m4, m2, m0                   ; output 0: taps * src[0-7], summed in pairs
+    palignr     m5, m3, m2, 2                ; m5 = src[1-8]
+    pmaddwd     m5, m0
+    phaddd      m4, m5
+
+    palignr     m5, m3, m2, 4                ; m5 = src[2-9]
+    pmaddwd     m5, m0
+    palignr     m6, m3, m2, 6                ; m6 = src[3-10]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+    phaddd      m4, m5                       ; m4 = 32-bit filter sums for outputs 0-3
+    paddd       m4, m1
+
+    palignr     m5, m3, m2, 8                ; m5 = src[4-11]
+    pmaddwd     m5, m0
+    palignr     m6, m3, m2, 10               ; m6 = src[5-12]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+
+    palignr     m6, m3, m2, 12               ; m6 = src[6-13]
+    pmaddwd     m6, m0
+    palignr     m3, m2, 14                   ; m3 = src[7-14]
+    pmaddwd     m3, m0
+    phaddd      m6, m3
+    phaddd      m5, m6                       ; m5 = 32-bit filter sums for outputs 4-7
+    paddd       m5, m1
+%ifidn %3, pp
+    psrad       m4, 6
+    psrad       m5, 6
+    packusdw    m4, m5                       ; dwords -> words, unsigned saturation
+    CLIPW       m4, m7, [pw_pixel_max]       ; clamp to [0, pw_pixel_max]
+%else
+    psrad       m4, 2
+    psrad       m5, 2
+    packssdw    m4, m5                       ; dwords -> words, signed saturation
+%endif
+
+    movu        [r2], m4                     ; store 8 output samples
+
+    add         r0, r1                       ; next source row
+    add         r2, r3                       ; next destination row
+
+    dec         r4d
+    jnz         .loopH
+    RET
+%endmacro
+
+;------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W8 8, 4, pp
+FILTER_HOR_LUMA_W8 8, 8, pp
+FILTER_HOR_LUMA_W8 8, 16, pp
+FILTER_HOR_LUMA_W8 8, 32, pp
+
+;---------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;---------------------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W8 8, 4, ps
+FILTER_HOR_LUMA_W8 8, 8, ps
+FILTER_HOR_LUMA_W8 8, 16, ps
+FILTER_HOR_LUMA_W8 8, 32, ps
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_W12 3                 ; 8-tap horizontal luma filter, 12 outputs per row (8 + 4); %1 = width, %2 = height, %3 = pp or ps
+INIT_XMM sse4
+cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
+
+    add         r1, r1                       ; srcStride *= 2: samples -> bytes
+    add         r3, r3                       ; dstStride *= 2: samples -> bytes
+    mov         r4d, r4m                     ; r4d = coeffIdx
+    sub         r0, 6                        ; back src up by 3 samples (2 bytes each)
+    shl         r4d, 4                       ; coeffIdx * 16 = byte offset of its tab_LumaCoeff row
+
+%ifdef PIC
+    lea         r6, [tab_LumaCoeff]          ; PIC: take the table base in a register, then index it
+    mova        m0, [r6 + r4]                ; m0 = the 8 filter taps
+%else
+    mova        m0, [tab_LumaCoeff + r4]     ; m0 = the 8 filter taps
+%endif
+%ifidn %3, pp
+    mova        m1, [pd_32]                  ; m1 = rounding term for the >> 6 below
+%else
+    mova        m1, [pd_n32768]              ; m1 = -32768, added before the >> 2 below
+%endif
+
+    mov         r4d, %2                      ; r4d = number of rows to filter
+%ifidn %3, ps
+    cmp         r5m, byte 0                  ; isRowExt == 0?
+    je          .loopH                       ; yes: filter just the %2 block rows
+    lea         r6, [r1 + 2 * r1]            ; r6 = 3 * srcStride (bytes)
+    sub         r0, r6                       ; no: start 3 rows higher ...
+    add         r4d, 7                       ; ... and filter 7 additional rows
+%endif
+
+.loopH:
+    movu        m2, [r0]                     ; m2 = src[0-7]
+    movu        m3, [r0 + 16]                ; m3 = src[8-15]
+
+    pmaddwd     m4, m2, m0                   ; output 0: taps * src[0-7], summed in pairs
+    palignr     m5, m3, m2, 2                ; m5 = src[1-8]
+    pmaddwd     m5, m0
+    phaddd      m4, m5
+
+    palignr     m5, m3, m2, 4                ; m5 = src[2-9]
+    pmaddwd     m5, m0
+    palignr     m6, m3, m2, 6                ; m6 = src[3-10]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+    phaddd      m4, m5                       ; m4 = 32-bit filter sums for outputs 0-3
+    paddd       m4, m1
+
+    palignr     m5, m3, m2, 8                ; m5 = src[4-11]
+    pmaddwd     m5, m0
+    palignr     m6, m3, m2, 10               ; m6 = src[5-12]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+
+    palignr     m6, m3, m2, 12               ; m6 = src[6-13]
+    pmaddwd     m6, m0
+    palignr     m7, m3, m2, 14               ; m7 = src[7-14] (m3 stays intact for outputs 8-11)
+    pmaddwd     m7, m0
+    phaddd      m6, m7
+    phaddd      m5, m6                       ; m5 = 32-bit filter sums for outputs 4-7
+    paddd       m5, m1
+%ifidn %3, pp
+    psrad       m4, 6
+    psrad       m5, 6
+    packusdw    m4, m5                       ; dwords -> words, unsigned saturation
+    pxor        m5, m5                       ; m5 = 0, CLIPW lower bound
+    CLIPW       m4, m5, [pw_pixel_max]       ; clamp to [0, pw_pixel_max]
+%else
+    psrad       m4, 2
+    psrad       m5, 2
+    packssdw    m4, m5                       ; dwords -> words, signed saturation
+%endif
+
+    movu        [r2], m4                     ; store outputs 0-7
+
+    movu        m2, [r0 + 32]                ; m2 = src[16-23]
+
+    pmaddwd     m4, m3, m0                   ; m3 = src[8-15]
+    palignr     m5, m2, m3, 2                ; m5 = src[9-16]
+    pmaddwd     m5, m0
+    phaddd      m4, m5
+
+    palignr     m5, m2, m3, 4                ; m5 = src[10-17]
+    pmaddwd     m5, m0
+    palignr     m2, m3, 6                    ; m2 = src[11-18]
+    pmaddwd     m2, m0
+    phaddd      m5, m2
+    phaddd      m4, m5                       ; m4 = 32-bit filter sums for outputs 8-11
+    paddd       m4, m1
+%ifidn %3, pp
+    psrad       m4, 6
+    packusdw    m4, m4                       ; dwords -> words, unsigned saturation
+    pxor        m5, m5                       ; m5 = 0, CLIPW lower bound
+    CLIPW       m4, m5, [pw_pixel_max]       ; clamp to [0, pw_pixel_max]
+%else
+    psrad       m4, 2
+    packssdw    m4, m4                       ; dwords -> words, signed saturation
+%endif
+
+    movh        [r2 + 16], m4                ; store outputs 8-11 (low 8 bytes)
+
+    add         r0, r1                       ; next source row
+    add         r2, r3                       ; next destination row
+
+    dec         r4d
+    jnz         .loopH
+    RET
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W12 12, 16, pp
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W12 12, 16, ps
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_W16 3                 ; 8-tap horizontal luma filter, %1 / 16 chunks of 16 outputs per row; %1 = width, %2 = height, %3 = pp or ps
+INIT_XMM sse4
+cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
+
+    add         r1, r1                       ; srcStride *= 2: samples -> bytes
+    add         r3, r3                       ; dstStride *= 2: samples -> bytes
+    mov         r4d, r4m                     ; r4d = coeffIdx
+    sub         r0, 6                        ; back src up by 3 samples (2 bytes each)
+    shl         r4d, 4                       ; coeffIdx * 16 = byte offset of its tab_LumaCoeff row
+
+%ifdef PIC
+    lea         r6, [tab_LumaCoeff]          ; PIC: take the table base in a register, then index it
+    mova        m0, [r6 + r4]                ; m0 = the 8 filter taps
+%else
+    mova        m0, [tab_LumaCoeff + r4]     ; m0 = the 8 filter taps
+%endif
+
+%ifidn %3, pp
+    mova        m1, [pd_32]                  ; m1 = rounding term for the >> 6 below
+%else
+    mova        m1, [pd_n32768]              ; m1 = -32768, added before the >> 2 below
+%endif
+
+    mov         r4d, %2                      ; r4d = number of rows to filter
+%ifidn %3, ps
+    cmp         r5m, byte 0                  ; isRowExt == 0?
+    je          .loopH                       ; yes: filter just the %2 block rows
+    lea         r6, [r1 + 2 * r1]            ; r6 = 3 * srcStride (bytes)
+    sub         r0, r6                       ; no: start 3 rows higher ...
+    add         r4d, 7                       ; ... and filter 7 additional rows
+%endif
+
+.loopH:
+%assign x 0
+%rep %1 / 16
+    movu        m2, [r0 + x]                 ; m2 = src[0-7] (indices relative to byte offset x of this chunk)
+    movu        m3, [r0 + 16 + x]            ; m3 = src[8-15]
+
+    pmaddwd     m4, m2, m0                   ; output 0: taps * src[0-7], summed in pairs
+    palignr     m5, m3, m2, 2                ; m5 = src[1-8]
+    pmaddwd     m5, m0
+    phaddd      m4, m5
+
+    palignr     m5, m3, m2, 4                ; m5 = src[2-9]
+    pmaddwd     m5, m0
+    palignr     m6, m3, m2, 6                ; m6 = src[3-10]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+    phaddd      m4, m5                       ; m4 = 32-bit filter sums for outputs 0-3
+    paddd       m4, m1
+
+    palignr     m5, m3, m2, 8                ; m5 = src[4-11]
+    pmaddwd     m5, m0
+    palignr     m6, m3, m2, 10               ; m6 = src[5-12]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+
+    palignr     m6, m3, m2, 12               ; m6 = src[6-13]
+    pmaddwd     m6, m0
+    palignr     m7, m3, m2, 14               ; m7 = src[7-14] (m3 stays intact for outputs 8-15)
+    pmaddwd     m7, m0
+    phaddd      m6, m7
+    phaddd      m5, m6                       ; m5 = 32-bit filter sums for outputs 4-7
+    paddd       m5, m1
+%ifidn %3, pp
+    psrad       m4, 6
+    psrad       m5, 6
+    packusdw    m4, m5                       ; dwords -> words, unsigned saturation
+    pxor        m5, m5                       ; m5 = 0, CLIPW lower bound
+    CLIPW       m4, m5, [pw_pixel_max]       ; clamp to [0, pw_pixel_max]
+%else
+    psrad       m4, 2
+    psrad       m5, 2
+    packssdw    m4, m5                       ; dwords -> words, signed saturation
+%endif
+    movu        [r2 + x], m4                 ; store outputs 0-7 of this chunk
+
+    movu        m2, [r0 + 32 + x]            ; m2 = src[16-23]
+
+    pmaddwd     m4, m3, m0                   ; m3 = src[8-15]
+    palignr     m5, m2, m3, 2                ; m5 = src[9-16]
+    pmaddwd     m5, m0
+    phaddd      m4, m5
+
+    palignr     m5, m2, m3, 4                ; m5 = src[10-17]
+    pmaddwd     m5, m0
+    palignr     m6, m2, m3, 6                ; m6 = src[11-18]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+    phaddd      m4, m5                       ; m4 = 32-bit filter sums for outputs 8-11
+    paddd       m4, m1
+
+    palignr     m5, m2, m3, 8                ; m5 = src[12-19]
+    pmaddwd     m5, m0
+    palignr     m6, m2, m3, 10               ; m6 = src[13-20]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+
+    palignr     m6, m2, m3, 12               ; m6 = src[14-21]
+    pmaddwd     m6, m0
+    palignr     m2, m3, 14                   ; m2 = src[15-22]
+    pmaddwd     m2, m0
+    phaddd      m6, m2
+    phaddd      m5, m6                       ; m5 = 32-bit filter sums for outputs 12-15
+    paddd       m5, m1
+%ifidn %3, pp
+    psrad       m4, 6
+    psrad       m5, 6
+    packusdw    m4, m5                       ; dwords -> words, unsigned saturation
+    pxor        m5, m5                       ; m5 = 0, CLIPW lower bound
+    CLIPW       m4, m5, [pw_pixel_max]       ; clamp to [0, pw_pixel_max]
+%else
+    psrad       m4, 2
+    psrad       m5, 2
+    packssdw    m4, m5                       ; dwords -> words, signed saturation
+%endif
+    movu        [r2 + 16 + x], m4            ; store outputs 8-15 of this chunk
+
+%assign x x+32
+%endrep
+
+    add         r0, r1                       ; next source row
+    add         r2, r3                       ; next destination row
+
+    dec         r4d
+    jnz         .loopH
+    RET
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W16 16, 4, pp
+FILTER_HOR_LUMA_W16 16, 8, pp
+FILTER_HOR_LUMA_W16 16, 12, pp
+FILTER_HOR_LUMA_W16 16, 16, pp
+FILTER_HOR_LUMA_W16 16, 32, pp
+FILTER_HOR_LUMA_W16 16, 64, pp
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W16 16, 4, ps
+FILTER_HOR_LUMA_W16 16, 8, ps
+FILTER_HOR_LUMA_W16 16, 12, ps
+FILTER_HOR_LUMA_W16 16, 16, ps
+FILTER_HOR_LUMA_W16 16, 32, ps
+FILTER_HOR_LUMA_W16 16, 64, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W16 32, 8, pp
+FILTER_HOR_LUMA_W16 32, 16, pp
+FILTER_HOR_LUMA_W16 32, 24, pp
+FILTER_HOR_LUMA_W16 32, 32, pp
+FILTER_HOR_LUMA_W16 32, 64, pp
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_32x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W16 32, 8, ps
+FILTER_HOR_LUMA_W16 32, 16, ps
+FILTER_HOR_LUMA_W16 32, 24, ps
+FILTER_HOR_LUMA_W16 32, 32, ps
+FILTER_HOR_LUMA_W16 32, 64, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_48x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W16 48, 64, pp
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_48x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W16 48, 64, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_64x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W16 64, 16, pp
+FILTER_HOR_LUMA_W16 64, 32, pp
+FILTER_HOR_LUMA_W16 64, 48, pp
+FILTER_HOR_LUMA_W16 64, 64, pp
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_64x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W16 64, 16, ps
+FILTER_HOR_LUMA_W16 64, 32, ps
+FILTER_HOR_LUMA_W16 64, 48, ps
+FILTER_HOR_LUMA_W16 64, 64, ps
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+%macro FILTER_HOR_LUMA_W24 3                 ; 8-tap horizontal luma filter, 24 outputs per row (3 x 8); %1 = width, %2 = height, %3 = pp or ps
+INIT_XMM sse4
+cglobal interp_8tap_horiz_%3_%1x%2, 4, 7, 8
+
+    add         r1, r1                       ; srcStride *= 2: samples -> bytes
+    add         r3, r3                       ; dstStride *= 2: samples -> bytes
+    mov         r4d, r4m                     ; r4d = coeffIdx
+    sub         r0, 6                        ; back src up by 3 samples (2 bytes each)
+    shl         r4d, 4                       ; coeffIdx * 16 = byte offset of its tab_LumaCoeff row
+
+%ifdef PIC
+    lea         r6, [tab_LumaCoeff]          ; PIC: take the table base in a register, then index it
+    mova        m0, [r6 + r4]                ; m0 = the 8 filter taps
+%else
+    mova        m0, [tab_LumaCoeff + r4]     ; m0 = the 8 filter taps
+%endif
+%ifidn %3, pp
+    mova        m1, [pd_32]                  ; m1 = rounding term for the >> 6 below
+%else
+    mova        m1, [pd_n32768]              ; m1 = -32768, added before the >> 2 below
+%endif
+
+    mov         r4d, %2                      ; r4d = number of rows to filter
+%ifidn %3, ps
+    cmp         r5m, byte 0                  ; isRowExt == 0?
+    je          .loopH                       ; yes: filter just the %2 block rows
+    lea         r6, [r1 + 2 * r1]            ; r6 = 3 * srcStride (bytes)
+    sub         r0, r6                       ; no: start 3 rows higher ...
+    add         r4d, 7                       ; ... and filter 7 additional rows
+%endif
+
+.loopH:
+    movu        m2, [r0]                     ; m2 = src[0-7]
+    movu        m3, [r0 + 16]                ; m3 = src[8-15]
+
+    pmaddwd     m4, m2, m0                   ; output 0: taps * src[0-7], summed in pairs
+    palignr     m5, m3, m2, 2                ; m5 = src[1-8]
+    pmaddwd     m5, m0
+    phaddd      m4, m5
+
+    palignr     m5, m3, m2, 4                ; m5 = src[2-9]
+    pmaddwd     m5, m0
+    palignr     m6, m3, m2, 6                ; m6 = src[3-10]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+    phaddd      m4, m5                       ; m4 = 32-bit filter sums for outputs 0-3
+    paddd       m4, m1
+
+    palignr     m5, m3, m2, 8                ; m5 = src[4-11]
+    pmaddwd     m5, m0
+    palignr     m6, m3, m2, 10               ; m6 = src[5-12]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+
+    palignr     m6, m3, m2, 12               ; m6 = src[6-13]
+    pmaddwd     m6, m0
+    palignr     m7, m3, m2, 14               ; m7 = src[7-14] (m3 stays intact for outputs 8-15)
+    pmaddwd     m7, m0
+    phaddd      m6, m7
+    phaddd      m5, m6                       ; m5 = 32-bit filter sums for outputs 4-7
+    paddd       m5, m1
+%ifidn %3, pp
+    psrad       m4, 6
+    psrad       m5, 6
+    packusdw    m4, m5                       ; dwords -> words, unsigned saturation
+    pxor        m5, m5                       ; m5 = 0, CLIPW lower bound
+    CLIPW       m4, m5, [pw_pixel_max]       ; clamp to [0, pw_pixel_max]
+%else
+    psrad       m4, 2
+    psrad       m5, 2
+    packssdw    m4, m5                       ; dwords -> words, signed saturation
+%endif
+    movu        [r2], m4                     ; store outputs 0-7
+
+    movu        m2, [r0 + 32]                ; m2 = src[16-23]
+
+    pmaddwd     m4, m3, m0                   ; m3 = src[8-15]
+    palignr     m5, m2, m3, 2                ; m5 = src[9-16]
+    pmaddwd     m5, m0
+    phaddd      m4, m5
+
+    palignr     m5, m2, m3, 4                ; m5 = src[10-17]
+    pmaddwd     m5, m0
+    palignr     m6, m2, m3, 6                ; m6 = src[11-18]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+    phaddd      m4, m5                       ; m4 = 32-bit filter sums for outputs 8-11
+    paddd       m4, m1
+
+    palignr     m5, m2, m3, 8                ; m5 = src[12-19]
+    pmaddwd     m5, m0
+    palignr     m6, m2, m3, 10               ; m6 = src[13-20]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+
+    palignr     m6, m2, m3, 12               ; m6 = src[14-21]
+    pmaddwd     m6, m0
+    palignr     m7, m2, m3, 14               ; m7 = src[15-22] (m2 stays intact for outputs 16-23)
+    pmaddwd     m7, m0
+    phaddd      m6, m7
+    phaddd      m5, m6                       ; m5 = 32-bit filter sums for outputs 12-15
+    paddd       m5, m1
+%ifidn %3, pp
+    psrad       m4, 6
+    psrad       m5, 6
+    packusdw    m4, m5                       ; dwords -> words, unsigned saturation
+    pxor        m5, m5                       ; m5 = 0, CLIPW lower bound
+    CLIPW       m4, m5, [pw_pixel_max]       ; clamp to [0, pw_pixel_max]
+%else
+    psrad       m4, 2
+    psrad       m5, 2
+    packssdw    m4, m5                       ; dwords -> words, signed saturation
+%endif
+    movu        [r2 + 16], m4                ; store outputs 8-15
+
+    movu        m3, [r0 + 48]                ; m3 = src[24-31]
+
+    pmaddwd     m4, m2, m0                   ; m2 = src[16-23]
+    palignr     m5, m3, m2, 2                ; m5 = src[17-24]
+    pmaddwd     m5, m0
+    phaddd      m4, m5
+
+    palignr     m5, m3, m2, 4                ; m5 = src[18-25]
+    pmaddwd     m5, m0
+    palignr     m6, m3, m2, 6                ; m6 = src[19-26]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+    phaddd      m4, m5                       ; m4 = 32-bit filter sums for outputs 16-19
+    paddd       m4, m1
+
+    palignr     m5, m3, m2, 8                ; m5 = src[20-27]
+    pmaddwd     m5, m0
+    palignr     m6, m3, m2, 10               ; m6 = src[21-28]
+    pmaddwd     m6, m0
+    phaddd      m5, m6
+
+    palignr     m6, m3, m2, 12               ; m6 = src[22-29]
+    pmaddwd     m6, m0
+    palignr     m7, m3, m2, 14               ; m7 = src[23-30]
+    pmaddwd     m7, m0
+    phaddd      m6, m7
+    phaddd      m5, m6                       ; m5 = 32-bit filter sums for outputs 20-23
+    paddd       m5, m1
+%ifidn %3, pp
+    psrad       m4, 6
+    psrad       m5, 6
+    packusdw    m4, m5                       ; dwords -> words, unsigned saturation
+    pxor        m5, m5                       ; m5 = 0, CLIPW lower bound
+    CLIPW       m4, m5, [pw_pixel_max]       ; clamp to [0, pw_pixel_max]
+%else
+    psrad       m4, 2
+    psrad       m5, 2
+    packssdw    m4, m5                       ; dwords -> words, signed saturation
+%endif
+    movu        [r2 + 32], m4                ; store outputs 16-23
+
+    add         r0, r1                       ; next source row
+    add         r2, r3                       ; next destination row
+
+    dec         r4d
+    jnz         .loopH
+    RET
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W24 24, 32, pp
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_24x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+FILTER_HOR_LUMA_W24 24, 32, ps


More information about the x265-devel mailing list