[x265] [PATCH] asm: filter_vsp[4x4], filter_vss[4x4] in avx2: 407c->198c, 361c->180c

Divya Manivannan divya at multicorewareinc.com
Thu Mar 12 07:03:21 CET 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1426140136 -19800
#      Thu Mar 12 11:32:16 2015 +0530
# Node ID ed3549b49cc488315da7d4709d6932e7244e5b33
# Parent  b931c50d55011a1ddc08f0a230b9632fcb4674d7
asm: filter_vsp[4x4], filter_vss[4x4] in avx2: 407c->198c, 361c->180c

diff -r b931c50d5501 -r ed3549b49cc4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Mar 11 21:58:02 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Thu Mar 12 11:32:16 2015 +0530
@@ -1621,6 +1621,10 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = x265_interp_4tap_vert_ps_32x24_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = x265_interp_4tap_vert_ps_32x16_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = x265_interp_4tap_vert_ps_32x8_avx2;
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vsp = x265_interp_4tap_vert_sp_4x4_avx2;
+
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vss = x265_interp_4tap_vert_ss_4x4_avx2;
     }
 #endif
 }
diff -r b931c50d5501 -r ed3549b49cc4 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Mar 11 21:58:02 2015 -0500
+++ b/source/common/x86/ipfilter8.asm	Thu Mar 12 11:32:16 2015 +0530
@@ -120,6 +120,31 @@
                   times 4 dw -2, 10
                   times 4 dw 58, -2
 
+ALIGN 32
+pw_ChromaCoeffV:  times 8 dw 0, 64                  ; 4-tap coefficients as word pairs for pmaddwd; each index is a 32-byte row of (c0, c1) followed by a 32-byte row of (c2, c3), i.e. 64 bytes per index. Index 0: (c0, c1)
+                  times 8 dw 0, 0                   ; index 0: (c2, c3)
+
+                  times 8 dw -2, 58                 ; index 1: (c0, c1)
+                  times 8 dw 10, -2                 ; index 1: (c2, c3)
+
+                  times 8 dw -4, 54                 ; index 2: (c0, c1)
+                  times 8 dw 16, -2                 ; index 2: (c2, c3)
+
+                  times 8 dw -6, 46                 ; index 3: (c0, c1)
+                  times 8 dw 28, -4                 ; index 3: (c2, c3)
+
+                  times 8 dw -4, 36                 ; index 4: (c0, c1)
+                  times 8 dw 36, -4                 ; index 4: (c2, c3)
+
+                  times 8 dw -4, 28                 ; index 5: (c0, c1)
+                  times 8 dw 46, -6                 ; index 5: (c2, c3)
+
+                  times 8 dw -2, 16                 ; index 6: (c0, c1)
+                  times 8 dw 54, -4                 ; index 6: (c2, c3)
+
+                  times 8 dw -2, 10                 ; index 7: (c0, c1)
+                  times 8 dw 58, -2                 ; index 7: (c2, c3)
+
 tab_LumaCoeff:   db   0, 0,  0,  64,  0,   0,  0,  0
                  db  -1, 4, -10, 58,  17, -5,  1,  0
                  db  -1, 4, -11, 40,  40, -11, 4, -1
@@ -11621,6 +11646,82 @@
     FILTER_VER_CHROMA_SS 48, 64
     FILTER_VER_CHROMA_SS 64, 16
 
+%macro FILTER_VER_CHROMA_S_AVX2_4x4 1               ; argument is sp or ss; emits the AVX2 4-tap vertical filter for one 4x4 block read from 16-bit source samples
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_4x4, 4, 6, 7
+    mov             r4d, r4m                        ; r4d = fifth argument; presumably coeffIdx as in the ss prototype comment that follows this macro - confirm
+    add             r1d, r1d                        ; r1 = source stride doubled, i.e. in bytes for 16-bit samples (presumably passed in samples - confirm)
+    shl             r4d, 6                          ; r4 = index * 64: each pw_ChromaCoeffV entry is two 32-byte rows
+    sub             r0, r1                          ; start reading one source row above the row r0 pointed at
+
+%ifdef PIC
+    lea             r5, [pw_ChromaCoeffV]           ; r5 -> base of the coefficient table
+    add             r5, r4                          ; r5 -> the two coefficient rows of the selected index
+%else
+    lea             r5, [pw_ChromaCoeffV + r4]      ; same address formed in a single lea when not building PIC
+%endif
+
+    lea             r4, [r1 * 3]                    ; r4 = 3 * source stride in bytes (the table offset in r4 is no longer needed)
+%ifidn %1,sp
+    mova            m6, [pd_526336]                 ; constant added to every sum before the >> 12; NOTE(review): full-ymm load, assumes pd_526336 (defined elsewhere) provides 32 suitably aligned bytes - confirm
+%else
+    add             r3d, r3d                        ; ss stores 16-bit results: destination stride doubled to bytes (presumably passed in samples - confirm)
+%endif
+
+    movq            xm0, [r0]                       ; src row 0 (four 16-bit samples)
+    movq            xm1, [r0 + r1]                  ; src row 1
+    punpcklwd       xm0, xm1                        ; interleave src rows 0 and 1 so pmaddwd applies two taps per column
+    movq            xm2, [r0 + r1 * 2]              ; src row 2
+    punpcklwd       xm1, xm2                        ; interleave src rows 1 and 2
+    vinserti128     m0, m0, xm1, 1                  ; m0 = [2 1 1 0]: low lane = rows 0/1 interleaved, high lane = rows 1/2 interleaved
+    pmaddwd         m0, [r5]                        ; taps (c0, c1): partial sums for out row 0 (low lane) and out row 1 (high lane)
+    movq            xm3, [r0 + r4]                  ; src row 3
+    punpcklwd       xm2, xm3                        ; interleave src rows 2 and 3
+    lea             r0, [r0 + 4 * r1]               ; advance the source pointer by four rows
+    movq            xm4, [r0]                       ; src row 4
+    punpcklwd       xm3, xm4                        ; interleave src rows 3 and 4
+    vinserti128     m2, m2, xm3, 1                  ; m2 = [4 3 3 2]: low lane = rows 2/3 interleaved, high lane = rows 3/4 interleaved
+    pmaddwd         m5, m2, [r5 + 1 * mmsize]       ; taps (c2, c3) for out rows 0 and 1
+    pmaddwd         m2, [r5]                        ; taps (c0, c1) for out rows 2 and 3
+    paddd           m0, m5                          ; m0 = complete 4-tap sums of out rows 0 and 1
+    movq            xm3, [r0 + r1]                  ; src row 5
+    punpcklwd       xm4, xm3                        ; interleave src rows 4 and 5
+    movq            xm1, [r0 + r1 * 2]              ; src row 6
+    punpcklwd       xm3, xm1                        ; interleave src rows 5 and 6
+    vinserti128     m4, m4, xm3, 1                  ; m4 = [6 5 5 4]: low lane = rows 4/5 interleaved, high lane = rows 5/6 interleaved
+    pmaddwd         m4, [r5 + 1 * mmsize]           ; taps (c2, c3) for out rows 2 and 3
+    paddd           m2, m4                          ; m2 = complete 4-tap sums of out rows 2 and 3
+
+%ifidn %1,sp
+    paddd           m0, m6                          ; sp: add the pd_526336 constant to out rows 0 and 1 ...
+    paddd           m2, m6                          ; ... and to out rows 2 and 3
+    psrad           m0, 12                          ; sp: arithmetic shift right by 12
+    psrad           m2, 12
+%else
+    psrad           m0, 6                           ; ss: arithmetic shift right by 6, nothing added beforehand
+    psrad           m2, 6
+%endif
+    packssdw        m0, m2                          ; in-lane pack to signed-saturated words: low lane = out rows 0 and 2, high lane = out rows 1 and 3
+    vextracti128    xm2, m0, 1                      ; xm2 = out rows 1 and 3; xm0 keeps out rows 0 and 2
+    lea             r4, [r3 * 3]                    ; r4 = 3 * destination stride
+
+%ifidn %1,sp
+    packuswb        xm0, xm2                        ; unsigned-saturate to bytes; dwords of xm0 now hold out rows 0, 2, 1, 3
+    movd            [r2], xm0                       ; store out row 0 (4 bytes)
+    pextrd          [r2 + r3], xm0, 2               ; store out row 1 (dword 2)
+    pextrd          [r2 + r3 * 2], xm0, 1           ; store out row 2 (dword 1)
+    pextrd          [r2 + r4], xm0, 3               ; store out row 3 (dword 3)
+%else
+    movq            [r2], xm0                       ; store out row 0 (four words, low half of xm0)
+    movq            [r2 + r3], xm2                  ; store out row 1 (low half of xm2)
+    movhps          [r2 + r3 * 2], xm0              ; store out row 2 (high half of xm0)
+    movhps          [r2 + r4], xm2                  ; store out row 3 (high half of xm2)
+%endif
+    RET
+%endmacro
+
+FILTER_VER_CHROMA_S_AVX2_4x4 sp
+FILTER_VER_CHROMA_S_AVX2_4x4 ss
 
 ;---------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vertical_ss_%1x%2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
diff -r b931c50d5501 -r ed3549b49cc4 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Wed Mar 11 21:58:02 2015 -0500
+++ b/source/common/x86/ipfilter8.h	Thu Mar 12 11:32:16 2015 +0530
@@ -576,8 +576,10 @@
 CHROMA_420_FILTERS(_avx2);
 CHROMA_420_SP_FILTERS(_sse2);
 CHROMA_420_SP_FILTERS_SSE4(_sse4);
+CHROMA_420_SP_FILTERS_SSE4(_avx2);
 CHROMA_420_SS_FILTERS(_sse2);
 CHROMA_420_SS_FILTERS_SSE4(_sse4);
+CHROMA_420_SS_FILTERS(_avx2);
 
 CHROMA_422_FILTERS(_sse4);
 CHROMA_422_FILTERS(_avx2);


More information about the x265-devel mailing list