[x265] [PATCH Review only] asm-avx2: filter_vpp[6x8], filter_vps[6x8]: 415c->396c, 364c->351c

Divya Manivannan divya at multicorewareinc.com
Fri Mar 6 06:38:17 CET 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1425620240 -19800
#      Fri Mar 06 11:07:20 2015 +0530
# Node ID 1738a545c24bee479c763625eee6592ac0e963b2
# Parent  45deb0125890ab0211d0e256c74d91ede78d12d3
asm-avx2: filter_vpp[6x8], filter_vps[6x8]: 415c->396c, 364c->351c

diff -r 45deb0125890 -r 1738a545c24b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Mar 05 20:39:08 2015 -0600
+++ b/source/common/x86/asm-primitives.cpp	Fri Mar 06 11:07:20 2015 +0530
@@ -1576,6 +1576,7 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vpp = x265_interp_4tap_vert_pp_2x4_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vpp = x265_interp_4tap_vert_pp_4x2_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vpp = x265_interp_4tap_vert_pp_4x8_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vpp = x265_interp_4tap_vert_pp_6x8_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vpp = x265_interp_4tap_vert_pp_8x2_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vpp = x265_interp_4tap_vert_pp_8x4_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vpp = x265_interp_4tap_vert_pp_8x6_avx2;
@@ -1592,6 +1593,7 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vps = x265_interp_4tap_vert_ps_4x2_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].filter_vps = x265_interp_4tap_vert_ps_6x8_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].filter_vps = x265_interp_4tap_vert_ps_8x2_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vps = x265_interp_4tap_vert_ps_8x4_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vps = x265_interp_4tap_vert_ps_8x6_avx2;
diff -r 45deb0125890 -r 1738a545c24b source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Thu Mar 05 20:39:08 2015 -0600
+++ b/source/common/x86/ipfilter8.asm	Fri Mar 06 11:07:20 2015 +0530
@@ -4652,6 +4652,86 @@
 FILTER_VER_CHROMA_AVX2_8x2 pp
 FILTER_VER_CHROMA_AVX2_8x2 ps
 
+%macro FILTER_VER_CHROMA_AVX2_6x8 1 ; %1 = pp or ps; emits interp_4tap_vert_%1_6x8: 8 output rows, 6 samples stored per row
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_6x8, 4, 6, 7 ; x86inc: 4 args loaded into r0-r3, 6 GPRs, 7 vector regs (m0-m6); presumably r0=src, r1=srcStride, r2=dst, r3=dstStride as in the pp_6x8 prototype comment later in this file
+    mov             r4d, r4m                        ; fifth argument is read from its argument slot (coeffIdx per that prototype)
+    shl             r4d, 6                          ; r4 = coeffIdx * 64, used as a byte offset into the table (implies 64-byte entries - confirm against tab_ChromaCoeffVer_32)
+
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeffVer_32]     ; PIC build: take the table address first, then add the offset separately
+    add             r5, r4                          ; r5 = address of the selected coefficient entry
+%else
+    lea             r5, [tab_ChromaCoeffVer_32 + r4] ; non-PIC build: table address + offset in a single lea
+%endif
+
+    lea             r4, [r1 * 3]                    ; r4 = 3 * r1 (source stride); presumably consumed by PROCESS_CHROMA_AVX2_W8_8R - confirm
+    sub             r0, r1                          ; step the source pointer back by one stride so filtering starts one row above the block
+    PROCESS_CHROMA_AVX2_W8_8R                       ; defined elsewhere; judging by the row comments below it leaves word results for 8 rows in m5, m2, m1, m4 (two rows per register)
+%ifidn %1,pp
+    lea             r4, [r3 * 3]                    ; r4 = 3 * dst stride, offset of rows 3 and 7 within each group of four
+    mova            m3, [pw_512]                    ; pmulhrsw multiplier; if pw_512 holds words of 512 this is a rounded >> 6 (TODO confirm constant)
+    pmulhrsw        m5, m3                          ; m5 = word: row 0, row 1
+    pmulhrsw        m2, m3                          ; m2 = word: row 2, row 3
+    pmulhrsw        m1, m3                          ; m1 = word: row 4, row 5
+    pmulhrsw        m4, m3                          ; m4 = word: row 6, row 7
+    packuswb        m5, m2                          ; saturate to unsigned bytes; per 128-bit lane: low 8 bytes from m5, high 8 from m2 -> lanes hold rows 0|2 and 1|3
+    packuswb        m1, m4                          ; same layout for the second group: lanes hold rows 4|6 and 5|7
+    vextracti128    xm2, m5, 1                      ; xm2 = upper lane = rows 1|3 (xm5 keeps rows 0|2)
+    vextracti128    xm4, m1, 1                      ; xm4 = upper lane = rows 5|7 (xm1 keeps rows 4|6)
+    movd            [r2], xm5                       ; row 0, bytes 0-3
+    pextrw          [r2 + 4], xm5, 2                ; row 0, bytes 4-5 (6 bytes written in total)
+    movd            [r2 + r3], xm2                  ; row 1, bytes 0-3
+    pextrw          [r2 + r3 + 4], xm2, 2           ; row 1, bytes 4-5
+    pextrd          [r2 + r3 * 2], xm5, 2           ; row 2, bytes 0-3 (dword 2 = upper half of xm5)
+    pextrw          [r2 + r3 * 2 + 4], xm5, 6       ; row 2, bytes 4-5
+    pextrd          [r2 + r4], xm2, 2               ; row 3, bytes 0-3
+    pextrw          [r2 + r4 + 4], xm2, 6           ; row 3, bytes 4-5
+    lea             r2, [r2 + r3 * 4]               ; advance dst by four rows
+    movd            [r2], xm1                       ; row 4, bytes 0-3
+    pextrw          [r2 + 4], xm1, 2                ; row 4, bytes 4-5
+    movd            [r2 + r3], xm4                  ; row 5, bytes 0-3
+    pextrw          [r2 + r3 + 4], xm4, 2           ; row 5, bytes 4-5
+    pextrd          [r2 + r3 * 2], xm1, 2           ; row 6, bytes 0-3
+    pextrw          [r2 + r3 * 2 + 4], xm1, 6       ; row 6, bytes 4-5
+    pextrd          [r2 + r4], xm4, 2               ; row 7, bytes 0-3
+    pextrw          [r2 + r4 + 4], xm4, 6           ; row 7, bytes 4-5
+%else
+    add             r3d, r3d                        ; double the dst stride; the stores below write 2 bytes per sample (presumably stride arrives in 16-bit units - confirm)
+    vbroadcasti128  m3, [pw_2000]                   ; replicate the 128-bit constant into both lanes; subtracted from every row below (presumably words of 0x2000 - confirm)
+    lea             r4, [r3 * 3]                    ; r4 = 3 * doubled dst stride, offset of rows 3 and 7
+    psubw           m5, m3                          ; m5 = word: row 0, row 1
+    psubw           m2, m3                          ; m2 = word: row 2, row 3
+    psubw           m1, m3                          ; m1 = word: row 4, row 5
+    psubw           m4, m3                          ; m4 = word: row 6, row 7
+    vextracti128    xm6, m5, 1                      ; xm6 = row 1 (xm5 keeps row 0)
+    vextracti128    xm3, m2, 1                      ; xm3 = row 3; overwrites the constant, which is not read again
+    vextracti128    xm0, m1, 1                      ; xm0 = row 5 (xm1 keeps row 4)
+    movq            [r2], xm5                       ; row 0, words 0-3
+    pextrd          [r2 + 8], xm5, 2                ; row 0, words 4-5 (12 bytes written in total)
+    movq            [r2 + r3], xm6                  ; row 1, words 0-3
+    pextrd          [r2 + r3 + 8], xm6, 2           ; row 1, words 4-5
+    movq            [r2 + r3 * 2], xm2              ; row 2, words 0-3
+    pextrd          [r2 + r3 * 2 + 8], xm2, 2       ; row 2, words 4-5
+    movq            [r2 + r4], xm3                  ; row 3, words 0-3
+    pextrd          [r2 + r4 + 8], xm3, 2           ; row 3, words 4-5
+    lea             r2, [r2 + r3 * 4]               ; advance dst by four rows
+    movq            [r2], xm1                       ; row 4, words 0-3
+    pextrd          [r2 + 8], xm1, 2                ; row 4, words 4-5
+    movq            [r2 + r3], xm0                  ; row 5, words 0-3
+    pextrd          [r2 + r3 + 8], xm0, 2           ; row 5, words 4-5
+    movq            [r2 + r3 * 2], xm4              ; row 6, words 0-3
+    pextrd          [r2 + r3 * 2 + 8], xm4, 2       ; row 6, words 4-5
+    vextracti128    xm4, m4, 1                      ; xm4 = row 7; safe to overwrite because row 6 has just been stored
+    movq            [r2 + r4], xm4                  ; row 7, words 0-3
+    pextrd          [r2 + r4 + 8], xm4, 2           ; row 7, words 4-5
+%endif
+    RET
+%endmacro
+
+FILTER_VER_CHROMA_AVX2_6x8 pp
+FILTER_VER_CHROMA_AVX2_6x8 ps
+
 ;-----------------------------------------------------------------------------
 ;void interp_4tap_vert_pp_6x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list