[x265] [PATCH] asm: filter_vpp, filter_vps for 4x32 in avx2

Divya Manivannan divya at multicorewareinc.com
Thu May 14 07:18:51 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1431579857 -19800
#      Thu May 14 10:34:17 2015 +0530
# Node ID 68ebb4bdda79543be758a31e4308e3f4e23ff274
# Parent  479087422e29a672d6e9bc8d0cd2a65649d71fe2
asm: filter_vpp, filter_vps for 4x32 in avx2

filter_vpp[4x32]: 1564c->1172c
filter_vps[4x32]: 1283c->1035c

diff -r 479087422e29 -r 68ebb4bdda79 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp	Thu May 14 10:34:17 2015 +0530
@@ -2683,6 +2683,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vps = x265_interp_4tap_vert_ps_2x4_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = x265_interp_4tap_vert_ps_16x24_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vps = x265_interp_4tap_vert_ps_2x16_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vps = x265_interp_4tap_vert_ps_4x32_avx2;
 
         //i444 for chroma_vps
         p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
@@ -2729,6 +2730,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vpp = x265_interp_4tap_vert_pp_2x4_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = x265_interp_4tap_vert_pp_16x24_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vpp = x265_interp_4tap_vert_pp_2x16_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vpp = x265_interp_4tap_vert_pp_4x32_avx2;
 
         //i444 for chroma_vpp
         p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
diff -r 479087422e29 -r 68ebb4bdda79 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed May 13 16:52:59 2015 -0700
+++ b/source/common/x86/ipfilter8.asm	Thu May 14 10:34:17 2015 +0530
@@ -5698,10 +5698,10 @@
     FILTER_VER_CHROMA_AVX2_4x8 pp
     FILTER_VER_CHROMA_AVX2_4x8 ps
 
-%macro FILTER_VER_CHROMA_AVX2_4x16 1
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_4tap_vert_%1_4x16, 4, 6, 9
+%macro FILTER_VER_CHROMA_AVX2_4xN 2
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_4x%2, 4, 6, 12
     mov             r4d, r4m
     shl             r4d, 6
     sub             r0, r1
@@ -5714,7 +5714,16 @@
 %endif
 
     lea             r4, [r1 * 3]
-
+    mova            m10, [r5]
+    mova            m11, [r5 + mmsize]
+%ifidn %1,pp
+    mova            m9, [pw_512]
+%else
+    add             r3d, r3d
+    mova            m9, [pw_2000]
+%endif
+    lea             r5, [r3 * 3]
+%rep %2 / 16
     movd            xm1, [r0]
     pinsrd          xm1, [r0 + r1], 1
     pinsrd          xm1, [r0 + r1 * 2], 2
@@ -5762,29 +5771,27 @@
     pshufb          m6, m6, m5
     pshufb          m7, m7, m5
     pshufb          m8, m8, m5
-    pmaddubsw       m0, [r5]
-    pmaddubsw       m6, [r5]
-    pmaddubsw       m7, [r5]
-    pmaddubsw       m8, [r5]
-    pmaddubsw       m1, [r5 + mmsize]
-    pmaddubsw       m2, [r5 + mmsize]
-    pmaddubsw       m3, [r5 + mmsize]
-    pmaddubsw       m4, [r5 + mmsize]
+    pmaddubsw       m0, m10
+    pmaddubsw       m6, m10
+    pmaddubsw       m7, m10
+    pmaddubsw       m8, m10
+    pmaddubsw       m1, m11
+    pmaddubsw       m2, m11
+    pmaddubsw       m3, m11
+    pmaddubsw       m4, m11
     paddw           m0, m1                                  ; m0 = WORD ROW[3 2 1 0]
     paddw           m6, m2                                  ; m6 = WORD ROW[7 6 5 4]
     paddw           m7, m3                                  ; m7 = WORD ROW[11 10 9 8]
     paddw           m8, m4                                  ; m8 = WORD ROW[15 14 13 12]
 %ifidn %1,pp
-    mova            m5, [pw_512]
-    pmulhrsw        m0, m5
-    pmulhrsw        m6, m5
-    pmulhrsw        m7, m5
-    pmulhrsw        m8, m5
+    pmulhrsw        m0, m9
+    pmulhrsw        m6, m9
+    pmulhrsw        m7, m9
+    pmulhrsw        m8, m9
     packuswb        m0, m6
     packuswb        m7, m8
     vextracti128    xm1, m0, 1
     vextracti128    xm2, m7, 1
-    lea             r5, [r3 * 3]
     movd            [r2], xm0
     pextrd          [r2 + r3], xm0, 1
     movd            [r2 + r3 * 2], xm1
@@ -5805,17 +5812,14 @@
     pextrd          [r2 + r3 * 2], xm2, 2
     pextrd          [r2 + r5], xm2, 3
 %else
-    add             r3d, r3d
-    mova            m5, [pw_2000]
-    psubw           m0, m5
-    psubw           m6, m5
-    psubw           m7, m5
-    psubw           m8, m5
+    psubw           m0, m9
+    psubw           m6, m9
+    psubw           m7, m9
+    psubw           m8, m9
     vextracti128    xm1, m0, 1
     vextracti128    xm2, m6, 1
     vextracti128    xm3, m7, 1
     vextracti128    xm4, m8, 1
-    lea             r5, [r3 * 3]
     movq            [r2], xm0
     movhps          [r2 + r3], xm0
     movq            [r2 + r3 * 2], xm1
@@ -5836,12 +5840,16 @@
     movq            [r2 + r3 * 2], xm4
     movhps          [r2 + r5], xm4
 %endif
-    RET
-%endif
-%endmacro
-
-    FILTER_VER_CHROMA_AVX2_4x16 pp
-    FILTER_VER_CHROMA_AVX2_4x16 ps
+    lea             r2, [r2 + r3 * 4]
+%endrep
+    RET
+%endif
+%endmacro
+
+    FILTER_VER_CHROMA_AVX2_4xN pp, 16
+    FILTER_VER_CHROMA_AVX2_4xN ps, 16
+    FILTER_VER_CHROMA_AVX2_4xN pp, 32
+    FILTER_VER_CHROMA_AVX2_4xN ps, 32
 
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_vert_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)


More information about the x265-devel mailing list