[x265] [PATCH] asm: avx2 version convert_p2s[48x64], 4069c -> 3043c

Min Chen chenm003 at 163.com
Wed Apr 8 11:40:17 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1428486011 -28800
# Node ID 2a20a2592efbc0639034b8acdd9577c6657129c4
# Parent  3e416dec8024b8339b18568cf65e48eb3448bed1
asm: avx2 version convert_p2s[48x64], 4069c -> 3043c
---
 source/common/x86/asm-primitives.cpp |    1 +
 source/common/x86/ipfilter8.asm      |  130 +++++++++++++++++++++++++++-------
 source/common/x86/ipfilter8.h        |    1 +
 3 files changed, 106 insertions(+), 26 deletions(-)

diff -r 3e416dec8024 -r 2a20a2592efb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Wed Apr 08 17:40:11 2015 +0800
@@ -2050,6 +2050,7 @@
         p.pu[LUMA_64x32].convert_p2s = x265_filterPixelToShort_64x32_avx2;
         p.pu[LUMA_64x48].convert_p2s = x265_filterPixelToShort_64x48_avx2;
         p.pu[LUMA_64x64].convert_p2s = x265_filterPixelToShort_64x64_avx2;
+        p.pu[LUMA_48x64].convert_p2s = x265_filterPixelToShort_48x64_avx2;
 
         if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
             p.findPosLast = x265_findPosLast_x64;
diff -r 3e416dec8024 -r 2a20a2592efb source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/x86/ipfilter8.asm	Wed Apr 08 17:40:11 2015 +0800
@@ -105,7 +105,7 @@
                   times 4 dw -4, 54
                   times 4 dw 16, -2
 
-                  times 4 dw -6, 46 
+                  times 4 dw -6, 46
                   times 4 dw 28, -4
 
                   times 4 dw -4, 36
@@ -130,7 +130,7 @@
                   times 8 dw -4, 54
                   times 8 dw 16, -2
 
-                  times 8 dw -6, 46 
+                  times 8 dw -6, 46
                   times 8 dw 28, -4
 
                   times 8 dw -4, 36
@@ -918,7 +918,7 @@
 %endif
     punpcklqdq  m3, m3
 
-%ifidn %3, pp 
+%ifidn %3, pp
     mova      m2, [pw_512]
 %else
     mova      m2, [pw_2000]
@@ -937,7 +937,7 @@
 .loopH:
     xor       r5, r5
 %rep %1 / 8
-  %ifidn %3, pp 
+  %ifidn %3, pp
     FILTER_H8_W8  m0, m1, m4, m5, m3, m2, [r0 - 3 + r5], [r2 + r5]
   %else
     FILTER_H8_W8  m0, m1, m4, m5, m3, UNUSED, [r0 - 3 + r5]
@@ -949,7 +949,7 @@
 
 %rep (%1 % 8) / 4
     FILTER_H8_W4  m0, m1
-  %ifidn %3, pp 
+  %ifidn %3, pp
     pmulhrsw  m1, m2
     packuswb  m1, m1
     movd      [r2 + r5], m1
@@ -1306,7 +1306,7 @@
     pmaddubsw         m5,         m1
     paddw             m4,         m5
     pmaddwd           m4,         m7
-    vbroadcasti128    m5,         [r0 + 8]                    ; second 8 elements in Row0 
+    vbroadcasti128    m5,         [r0 + 8]                    ; second 8 elements in Row0
     pshufb            m6,         m5,     m3
     pshufb            m5,         [tab_Tm]
     pmaddubsw         m5,         m0
@@ -1322,7 +1322,7 @@
     pmaddubsw         m5,         m1
     paddw             m2,         m5
     pmaddwd           m2,         m7
-    vbroadcasti128    m5,         [r0 + r1 + 8]                    ; second 8 elements in Row0 
+    vbroadcasti128    m5,         [r0 + r1 + 8]                    ; second 8 elements in Row0
     pshufb            m6,         m5,     m3
     pshufb            m5,         [tab_Tm]
     pmaddubsw         m5,         m0
@@ -1617,7 +1617,7 @@
     jnz               .loop
     RET
 
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_pp_4x4, 4,6,6
     mov             r4d, r4m
 
@@ -1665,7 +1665,7 @@
     pextrd            [r2+r0],      xm3,     3
     RET
 
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_pp_2x4, 4, 6, 3
     mov               r4d,           r4m
 
@@ -1698,7 +1698,7 @@
     pextrw            [r2 + r4],     xm1,         3
     RET
 
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6
     mov               r4d,           r4m
 
@@ -1941,7 +1941,7 @@
 
     IPFILTER_LUMA_AVX2 16, 4
     IPFILTER_LUMA_AVX2 16, 8
-    IPFILTER_LUMA_AVX2 16, 12 
+    IPFILTER_LUMA_AVX2 16, 12
     IPFILTER_LUMA_AVX2 16, 16
     IPFILTER_LUMA_AVX2 16, 32
     IPFILTER_LUMA_AVX2 16, 64
@@ -2230,7 +2230,7 @@
     pshufb                      m4,                m1
     pmaddubsw                   m4,                m0
     phaddw                      m4,                m4                           ; DWORD [R3D R3C R2D R2C R3B R3A R2B R2A]
-    phaddw                      m3,                m4 
+    phaddw                      m3,                m4
 
     vpermd                      m3,                m5,            m3            ; m5 don't broken in above
     psubw                       m3,                m2
@@ -2312,7 +2312,7 @@
     lea                         r2,         [r2 + r3 * 2]                   ; first loop dst ->5th row(i.e 4)
     sub                         r5d,        2
     jg                         .loop
-    jz                         .end             
+    jz                         .end
 
     ; last row
     movu                        xm1,        [r0]
@@ -8830,6 +8830,84 @@
     jnz         .loop
     RET
 
+;-----------------------------------------------------------------------------
+; void filterPixelToShort(const pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal filterPixelToShort_48x64, 3,7,4
+    mov         r3d, r3m                        ; r3d = dstStride (4th argument; only its low 32 bits are read)
+    add         r3d, r3d                        ; r3 = 2 * dstStride (dst holds int16_t, so presumably the row pitch in bytes)
+    lea         r5, [r1 * 3]                    ; r5 = 3 * srcStride, offset of the 4th source row in a group
+    lea         r6, [r3 * 3]                    ; r6 = 3 * (2 * dstStride), offset of the 4th destination row in a group
+
+    ; loop counter: 64 rows, 4 rows handled per iteration
+    mov         r4d, 64/4
+
+    ; broadcast one dword of pw_2000 to all of m3; it is subtracted from every output word
+    vpbroadcastd m3, [pw_2000]                  ; NOTE(review): pw_2000 is defined outside this hunk, presumably words of 0x2000 - confirm
+
+    ; one 4-row group per iteration, not unrolled further (stated as the best choice for 48x64)
+.loop:
+    pmovzxbw    m0, [r0 + 0 * mmsize/2]         ; row 0: zero-extend source bytes to words, first third of the row
+    pmovzxbw    m1, [r0 + 1 * mmsize/2]         ; row 0: second third
+    pmovzxbw    m2, [r0 + 2 * mmsize/2]         ; row 0: last third (3 x 16 = 48 pixels, assuming mmsize = 32 under INIT_YMM)
+    psllw       m0, 6                           ; word = pixel << 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psubw       m0, m3                          ; word = (pixel << 6) - constant in m3
+    psubw       m1, m3
+    psubw       m2, m3
+    movu        [r2 + 0 * mmsize], m0           ; unaligned stores: 3 * mmsize bytes per destination row
+    movu        [r2 + 1 * mmsize], m1
+    movu        [r2 + 2 * mmsize], m2
+
+    pmovzxbw    m0, [r0 + r1 + 0 * mmsize/2]    ; row 1: source at r0 + srcStride
+    pmovzxbw    m1, [r0 + r1 + 1 * mmsize/2]
+    pmovzxbw    m2, [r0 + r1 + 2 * mmsize/2]
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psubw       m0, m3
+    psubw       m1, m3
+    psubw       m2, m3
+    movu        [r2 + r3 + 0 * mmsize], m0      ; row 1: destination at r2 + r3
+    movu        [r2 + r3 + 1 * mmsize], m1
+    movu        [r2 + r3 + 2 * mmsize], m2
+
+    pmovzxbw    m0, [r0 + r1 * 2 + 0 * mmsize/2] ; row 2: source at r0 + 2 * srcStride
+    pmovzxbw    m1, [r0 + r1 * 2 + 1 * mmsize/2]
+    pmovzxbw    m2, [r0 + r1 * 2 + 2 * mmsize/2]
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psubw       m0, m3
+    psubw       m1, m3
+    psubw       m2, m3
+    movu        [r2 + r3 * 2 + 0 * mmsize], m0  ; row 2: destination at r2 + 2 * r3
+    movu        [r2 + r3 * 2 + 1 * mmsize], m1
+    movu        [r2 + r3 * 2 + 2 * mmsize], m2
+
+    pmovzxbw    m0, [r0 + r5 + 0 * mmsize/2]    ; row 3: source at r0 + 3 * srcStride (r5)
+    pmovzxbw    m1, [r0 + r5 + 1 * mmsize/2]
+    pmovzxbw    m2, [r0 + r5 + 2 * mmsize/2]
+    psllw       m0, 6
+    psllw       m1, 6
+    psllw       m2, 6
+    psubw       m0, m3
+    psubw       m1, m3
+    psubw       m2, m3
+    movu        [r2 + r6 + 0 * mmsize], m0      ; row 3: destination at r2 + 3 * r3 (r6)
+    movu        [r2 + r6 + 1 * mmsize], m1
+    movu        [r2 + r6 + 2 * mmsize], m2
+
+    lea         r0, [r0 + r1 * 4]               ; advance source pointer by 4 rows
+    lea         r2, [r2 + r3 * 4]               ; advance destination pointer by 4 rows
+
+    dec         r4d                             ; one 4-row group done
+    jnz        .loop                            ; repeat until all 64/4 groups are processed
+    RET
+
+
 %macro PROCESS_LUMA_W4_4R 0
     movd        m0, [r0]
     movd        m1, [r0 + r1]
@@ -9775,7 +9853,7 @@
 
     lea       r5, [8 * r1 - 8]
     sub       r0, r5
-%ifidn %3,pp 
+%ifidn %3,pp
     add       r2, 8
 %else
     add       r2, 16
@@ -12337,7 +12415,7 @@
 
 FILTER_VER_LUMA_AVX2_32x16 pp
 FILTER_VER_LUMA_AVX2_32x16 ps
- 
+
 %macro FILTER_VER_LUMA_AVX2_32x24 1
 INIT_YMM avx2
 %if ARCH_X86_64 == 1
@@ -19531,7 +19609,7 @@
 ;-----------------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_ps_32x32(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------;
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_ps_32x32, 4,7,6
     mov             r4d, r4m
     mov             r5d, r5m
@@ -19598,7 +19676,7 @@
 ;-----------------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_ps_16x16(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------;
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_ps_16x16, 4,7,6
     mov             r4d, r4m
     mov             r5d, r5m
@@ -19652,7 +19730,7 @@
 ; void interp_4tap_horiz_ps_16xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------
 %macro IPFILTER_CHROMA_PS_16xN_AVX2 2
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6
     mov                    r4d,        r4m
     mov                    r5d,        r5m
@@ -19713,7 +19791,7 @@
 ; void interp_4tap_horiz_ps_32xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------
 %macro IPFILTER_CHROMA_PS_32xN_AVX2 2
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6
     mov                r4d,          r4m
     mov                r5d,          r5m
@@ -19786,7 +19864,7 @@
 ;-----------------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_ps_4x4, 4,7,5
     mov             r4d, r4m
     mov             r5d, r5m
@@ -19940,7 +20018,7 @@
 ; void interp_4tap_horiz_ps_4xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------;
 %macro IPFILTER_CHROMA_PS_4xN_AVX2 2
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_ps_%1x%2, 4,7,5
     mov             r4d, r4m
     mov             r5d, r5m
@@ -20033,7 +20111,7 @@
 ;-----------------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_ps_8x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------;
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_ps_8x8, 4,7,6
     mov             r4d, r4m
     mov             r5d, r5m
@@ -20104,7 +20182,7 @@
 .end
    RET
 
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_pp_4x2, 4,6,4
     mov             r4d, r4m
 %ifdef PIC
@@ -20281,7 +20359,7 @@
 ; void interp_4tap_horiz_pp_4xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx
 ;-------------------------------------------------------------------------------------------------------------
 %macro IPFILTER_CHROMA_PP_4xN_AVX2 2
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_pp_%1x%2, 4,6,6
     mov             r4d, r4m
 
@@ -20914,7 +20992,7 @@
 ; void interp_4tap_horiz_ps_8xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------
 %macro IPFILTER_CHROMA_PS_8xN_AVX2 1
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_ps_8x%1, 4,7,6
     mov                r4d,             r4m
     mov                r5d,             r5m
@@ -21249,7 +21327,7 @@
 ;-----------------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_ps_6x8(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------;
-INIT_YMM avx2 
+INIT_YMM avx2
 cglobal interp_4tap_horiz_ps_6x8, 4,7,6
     mov                r4d,            r4m
     mov                r5d,            r5m
diff -r 3e416dec8024 -r 2a20a2592efb source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/x86/ipfilter8.h	Wed Apr 08 17:40:11 2015 +0800
@@ -684,6 +684,7 @@
 void x265_filterPixelToShort_64x32_avx2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
 void x265_filterPixelToShort_64x48_avx2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
 void x265_filterPixelToShort_64x64_avx2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
+void x265_filterPixelToShort_48x64_avx2(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride);
 #undef LUMA_FILTERS
 #undef LUMA_SP_FILTERS
 #undef LUMA_SS_FILTERS



More information about the x265-devel mailing list