[x265] [PATCH 1 of 2] asm: routines for luma vps filter functions for all block sizes

Tue Nov 5 08:49:27 CET 2013

# HG changeset patch
# User Nabajit Deka
# Date 1383636969 -19800
#      Tue Nov 05 13:06:09 2013 +0530
# Node ID ecf2873ed6be542e54cc56047bfeff64c72000c4
# Parent  b1bee3614505ed6eb6ac610a0f023e1af5b42c9c
asm: routines for luma vps filter functions for all block sizes.

diff -r b1bee3614505 -r ecf2873ed6be source/common/x86/ipfilter8.asm

--- a/source/common/x86/ipfilter8.asm	Tue Nov 05 12:50:01 2013 +0530
+++ b/source/common/x86/ipfilter8.asm	Tue Nov 05 13:06:09 2013 +0530
@@ -78,25 +78,25 @@
                 times 4 dw 58, -10
                 times 4 dw 4, -1
 
-tab_LumaCoeffVerLuma: times 8 db 0, 0
-                      times 8 db 0, 64
-                      times 8 db 0, 0
-                      times 8 db 0, 0
-
-                      times 8 db -1, 4
-                      times 8 db -10, 58
-                      times 8 db 17, -5
-                      times 8 db 1, 0
-
-                      times 8 db -1, 4
-                      times 8 db -11, 40
-                      times 8 db 40, -11
-                      times 8 db 4, -1
-
-                      times 8 db 0, 1
-                      times 8 db -5, 17
-                      times 8 db 58, -10
-                      times 8 db 4, -1
+tab_LumaCoeffVer: times 8 db 0, 0         ; 8-tap vertical luma coefficients; 4 rows x 16 bytes = 64 bytes per coeffIdx (callers index with coeffIdx << 6)
+                  times 8 db 0, 64        ; coeffIdx 0 taps: 0, 0, 0, 64, 0, 0, 0, 0
+                  times 8 db 0, 0         ; each row repeats one signed byte pair (two adjacent taps) 8 times to fill a 16-byte vector
+                  times 8 db 0, 0         ; NOTE(review): presumably consumed pairwise by PROCESS_LUMA_W4_4R/W8_4R (not visible here) - confirm
+
+                  times 8 db -1, 4        ; coeffIdx 1 taps: -1, 4, -10, 58, 17, -5, 1, 0
+                  times 8 db -10, 58
+                  times 8 db 17, -5
+                  times 8 db 1, 0
+
+                  times 8 db -1, 4        ; coeffIdx 2 taps: -1, 4, -11, 40, 40, -11, 4, -1
+                  times 8 db -11, 40
+                  times 8 db 40, -11
+                  times 8 db 4, -1
+
+                  times 8 db 0, 1         ; coeffIdx 3 taps: 0, 1, -5, 17, 58, -10, 4, -1
+                  times 8 db -5, 17
+                  times 8 db 58, -10
+                  times 8 db 4, -1
 
 tab_c_128:      times 16 db 0x80
 tab_c_64_n64:   times 8 db 64, -64
@@ -2263,30 +2263,39 @@
 %endmacro
 
 ;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_pp_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_8tap_vert_%3_4x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_LUMA_4xN 2
+%macro FILTER_VER_LUMA_4xN 3
 INIT_XMM sse4
-cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 6
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 6
     lea       r5, [r1 + 2 * r1]
     sub       r0, r5
     shl       r4d, 6
+%ifidn %3, ps
+    add       r3d, r3d
+%endif
 
 %ifdef PIC
-    lea       r5, [tab_LumaCoeffVerLuma]
+    lea       r5, [tab_LumaCoeffVer]
     lea       r6, [r5 + r4]
 %else
-    lea       r6, [tab_LumaCoeffVerLuma + r4]
+    lea       r6, [tab_LumaCoeffVer + r4]
 %endif
 
-    mova      m5, [tab_c_512]
+%ifidn %3, pp 
+    mova      m3, [tab_c_512]
+%else
+    mova      m3, [tab_c_8192]
+%endif
+
     mov       r4d, %2/4
 
 .loopH
     PROCESS_LUMA_W4_4R
 
-    pmulhrsw  m7, m5
-    pmulhrsw  m6, m5
+%ifidn %3, pp
+    pmulhrsw  m7, m3
+    pmulhrsw  m6, m3
 
     packuswb  m7, m7
     packuswb  m6, m6
@@ -2298,6 +2307,16 @@
     pshufd    m6, m6, 1
     lea       r5, [r3 + 2 * r3]
     movd      [r2 + r5], m6
+%else
+    psubw  m7, m3
+    psubw  m6, m3
+
+    movlps    [r2], m7
+    movhps    [r2 + r3], m7
+    movlps    [r2 + 2 * r3], m6
+    lea       r5, [r3 + 2 * r3]
+    movhps    [r2 + r5], m6
+%endif
 
     lea       r5, [4 * r1]
     sub       r0, r5
@@ -2312,41 +2331,66 @@
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4,4
+FILTER_VER_LUMA_4xN 4, 4, pp
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4,8
+FILTER_VER_LUMA_4xN 4, 8, pp
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_4xN 4,16
+FILTER_VER_LUMA_4xN 4, 16, pp
 
 ;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_pp_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_8tap_vert_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_LUMA_8xN 2
+FILTER_VER_LUMA_4xN 4, 4, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_4x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_4xN 4, 8, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_4x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_4xN 4, 16, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_%3_8x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_8xN 3
 INIT_XMM sse4
-cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
     lea       r5, [r1 + 2 * r1]
     sub       r0, r5
     shl       r4d, 6
 
+%ifidn %3, ps
+    add       r3d, r3d
+%endif
+
 %ifdef PIC
-    lea       r5, [tab_LumaCoeffVerLuma]
+    lea       r5, [tab_LumaCoeffVer]
     lea       r6, [r5 + r4]
 %else
-    lea       r6, [tab_LumaCoeffVerLuma + r4]
+    lea       r6, [tab_LumaCoeffVer + r4]
 %endif
 
+ %ifidn %3, pp
     mova      m3, [tab_c_512]
+%else
+    mova      m3, [tab_c_8192]
+%endif
+
     mov       r4d, %2/4
 
 .loopH
     PROCESS_LUMA_W8_4R
 
+%ifidn %3, pp 
     pmulhrsw  m7, m3
     pmulhrsw  m6, m3
     pmulhrsw  m5, m3
@@ -2360,6 +2404,18 @@
     movlps    [r2 + 2 * r3], m5
     lea       r5, [r3 + 2 * r3]
     movhps    [r2 + r5], m5
+%else
+    psubw  m7, m3
+    psubw  m6, m3
+    psubw  m5, m3
+    psubw  m4, m3
+
+    movu    [r2], m7
+    movu    [r2 + r3], m6
+    movu    [r2 + 2 * r3], m5
+    lea     r5, [r3 + 2 * r3]
+    movu    [r2 + r5], m4
+%endif
 
     lea       r5, [4 * r1]
     sub       r0, r5
@@ -2374,46 +2430,75 @@
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_8x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8,4
+FILTER_VER_LUMA_8xN 8, 4, pp
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_8x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8,8
+FILTER_VER_LUMA_8xN 8, 8, pp
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_8x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8,16
+FILTER_VER_LUMA_8xN 8, 16, pp
 
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_8x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_8xN 8,32
+FILTER_VER_LUMA_8xN 8, 32, pp
 
 ;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_pp_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_8tap_vert_ps_8x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_LUMA_12xN 2
+FILTER_VER_LUMA_8xN 8, 4, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_8x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_8xN 8, 8, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_8x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_8xN 8, 16, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_ps_8x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+FILTER_VER_LUMA_8xN 8, 32, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_%3_12x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA_12xN 3
 INIT_XMM sse4
-cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8
     lea       r5, [r1 + 2 * r1]
     sub       r0, r5
     shl       r4d, 6
+%ifidn %3, ps
+    add       r3d, r3d
+%endif
 
 %ifdef PIC
-    lea       r5, [tab_LumaCoeffVerLuma]
+    lea       r5, [tab_LumaCoeffVer]
     lea       r6, [r5 + r4]
 %else
-    lea       r6, [tab_LumaCoeffVerLuma + r4]
+    lea       r6, [tab_LumaCoeffVer + r4]
 %endif
 
+ %ifidn %3, pp
     mova      m3, [tab_c_512]
+%else
+    mova      m3, [tab_c_8192]
+%endif
+
     mov       r4d, %2/4
 
 .loopH
     PROCESS_LUMA_W8_4R
 
+%ifidn %3, pp 
     pmulhrsw  m7, m3
     pmulhrsw  m6, m3
     pmulhrsw  m5, m3
@@ -2427,13 +2512,30 @@
     movlps    [r2 + 2 * r3], m5
     lea       r5, [r3 + 2 * r3]
     movhps    [r2 + r5], m5
+%else
+    psubw  m7, m3
+    psubw  m6, m3
+    psubw  m5, m3
+    psubw  m4, m3
+
+    movu    [r2], m7
+    movu    [r2 + r3], m6
+    movu    [r2 + 2 * r3], m5
+    lea     r5, [r3 + 2 * r3]
+    movu    [r2 + r5], m4
+%endif
 
     lea       r5, [8 * r1 - 8]
     sub       r0, r5
+%ifidn %3, pp 
     add       r2, 8
+%else
+    add       r2, 16
+%endif
 
     PROCESS_LUMA_W4_4R
 
+%ifidn %3, pp 
     pmulhrsw  m7, m3
     pmulhrsw  m6, m3
 
@@ -2447,10 +2549,24 @@
     pshufd    m6, m6, 1
     lea       r5, [r3 + 2 * r3]
     movd      [r2 + r5], m6
+%else
+    psubw  m7, m3
+    psubw  m6, m3
+
+    movlps    [r2], m7
+    movhps    [r2 + r3], m7
+    movlps    [r2 + 2 * r3], m6
+    lea       r5, [r3 + 2 * r3]
+    movhps    [r2 + r5], m6
+%endif
 
     lea       r5, [4 * r1 + 8]
     sub       r0, r5
+%ifidn %3, pp 
     lea       r2, [r2 + 4 * r3 - 8]
+%else
+    lea       r2, [r2 + 4 * r3 - 16]
+%endif
 
     dec       r4d
     jnz       .loopH
@@ -2461,34 +2577,45 @@
 ;-------------------------------------------------------------------------------------------------------------
 ; void interp_8tap_vert_pp_12x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
-FILTER_VER_LUMA_12xN 12, 16
+FILTER_VER_LUMA_12xN 12, 16, pp
 
 ;-------------------------------------------------------------------------------------------------------------
-; void interp_8tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+; void interp_8tap_vert_ps_12x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------
-%macro FILTER_VER_LUMA 2
+FILTER_VER_LUMA_12xN 12, 16, ps
+
+;-------------------------------------------------------------------------------------------------------------
+; void interp_8tap_vert_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_LUMA 3
 INIT_XMM sse4
-cglobal interp_8tap_vert_pp_%1x%2, 5, 7, 8 ,0-1
+cglobal interp_8tap_vert_%3_%1x%2, 5, 7, 8 ,0-1
     lea       r5, [r1 + 2 * r1]
     sub       r0, r5
     shl       r4d, 6
+%ifidn %3, ps
+    add       r3d, r3d
+%endif
 
 %ifdef PIC
-    lea       r5, [tab_LumaCoeffVerLuma]
+    lea       r5, [tab_LumaCoeffVer]
     lea       r6, [r5 + r4]
 %else
-    lea       r6, [tab_LumaCoeffVerLuma + r4]
+    lea       r6, [tab_LumaCoeffVer + r4]
 %endif
 
+%ifidn %3, pp
     mova      m3, [tab_c_512]
+%else
+    mova      m3, [tab_c_8192]
+%endif
     mov       byte [rsp], %2/4
 
-
 .loopH
     mov       r4d, (%1/8)
 .loopW
     PROCESS_LUMA_W8_4R
-
+%ifidn %3, pp 
     pmulhrsw  m7, m3
     pmulhrsw  m6, m3
     pmulhrsw  m5, m3
@@ -2502,15 +2629,35 @@
     movlps    [r2 + 2 * r3], m5
     lea       r5, [r3 + 2 * r3]
     movhps    [r2 + r5], m5
+%else
+    psubw  m7, m3
+    psubw  m6, m3
+    psubw  m5, m3
+    psubw  m4, m3
+
+    movu    [r2], m7
+    movu    [r2 + r3], m6
+    movu    [r2 + 2 * r3], m5
+    lea     r5, [r3 + 2 * r3]
+    movu    [r2 + r5], m4
+%endif
 
     lea       r5, [8 * r1 - 8]
     sub       r0, r5
+%ifidn %3, pp 
     add       r2, 8
+%else
+    add       r2, 16
+%endif
     dec       r4d
     jnz      .loopW
 
     lea       r0, [r0 + 4 * r1 - %1]
+%ifidn %3, pp 
     lea       r2, [r2 + 4 * r3 - %1]
+%else
+    lea       r2, [r2 + 4 * r3 - 2 * %1]
+%endif
 
     dec       byte [rsp]
     jnz       .loopH
@@ -2518,24 +2665,41 @@
     RET
 %endmacro
 
-FILTER_VER_LUMA 16, 4
-FILTER_VER_LUMA 16, 8
-FILTER_VER_LUMA 16, 12
-FILTER_VER_LUMA 16, 16
-FILTER_VER_LUMA 16, 32
-FILTER_VER_LUMA 16, 64
-FILTER_VER_LUMA 24, 32
-FILTER_VER_LUMA 32, 8
-FILTER_VER_LUMA 32, 16
-FILTER_VER_LUMA 32, 24
-FILTER_VER_LUMA 32, 32
-FILTER_VER_LUMA 32, 64
-FILTER_VER_LUMA 48, 64
-FILTER_VER_LUMA 64, 16
-FILTER_VER_LUMA 64, 32
-FILTER_VER_LUMA 64, 48
-FILTER_VER_LUMA 64, 64
-
+FILTER_VER_LUMA 16, 4, pp
+FILTER_VER_LUMA 16, 8, pp
+FILTER_VER_LUMA 16, 12, pp
+FILTER_VER_LUMA 16, 16, pp
+FILTER_VER_LUMA 16, 32, pp
+FILTER_VER_LUMA 16, 64, pp
+FILTER_VER_LUMA 24, 32, pp
+FILTER_VER_LUMA 32, 8, pp
+FILTER_VER_LUMA 32, 16, pp
+FILTER_VER_LUMA 32, 24, pp
+FILTER_VER_LUMA 32, 32, pp
+FILTER_VER_LUMA 32, 64, pp
+FILTER_VER_LUMA 48, 64, pp
+FILTER_VER_LUMA 64, 16, pp
+FILTER_VER_LUMA 64, 32, pp
+FILTER_VER_LUMA 64, 48, pp
+FILTER_VER_LUMA 64, 64, pp
+
+FILTER_VER_LUMA 16, 4, ps
+FILTER_VER_LUMA 16, 8, ps
+FILTER_VER_LUMA 16, 12, ps
+FILTER_VER_LUMA 16, 16, ps
+FILTER_VER_LUMA 16, 32, ps
+FILTER_VER_LUMA 16, 64, ps
+FILTER_VER_LUMA 24, 32, ps
+FILTER_VER_LUMA 32, 8, ps
+FILTER_VER_LUMA 32, 16, ps
+FILTER_VER_LUMA 32, 24, ps
+FILTER_VER_LUMA 32, 32, ps
+FILTER_VER_LUMA 32, 64, ps
+FILTER_VER_LUMA 48, 64, ps
+FILTER_VER_LUMA 64, 16, ps
+FILTER_VER_LUMA 64, 32, ps
+FILTER_VER_LUMA 64, 48, ps
+FILTER_VER_LUMA 64, 64, ps
 
 ; TODO: combin of U and V is more performance, but need more register
 ; TODO: use two path for height alignment to 4 and otherwise may improvement 10% performance, but code is more complex, so I disable it