[x265] [PATCH] asm: filter_hpp[2x8] in avx2: 301c->249c

Divya Manivannan divya at multicorewareinc.com
Mon Mar 23 05:41:32 CET 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1427085460 -19800
#      Mon Mar 23 10:07:40 2015 +0530
# Node ID 1ac750487fb5ad497f95510327ebac46ff5e3467
# Parent  cc496665280f9e9e4776327e9a1cc1b2eeffecbc
asm: filter_hpp[2x8] in avx2: 301c->249c

diff -r cc496665280f -r 1ac750487fb5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sun Mar 22 22:16:45 2015 -0400
+++ b/source/common/x86/asm-primitives.cpp	Mon Mar 23 10:07:40 2015 +0530
@@ -1650,6 +1650,7 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = x265_interp_4tap_horiz_pp_16x16_avx2;
 
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hpp = x265_interp_4tap_horiz_pp_2x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hpp = x265_interp_4tap_horiz_pp_2x8_avx2;
 
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_hpp = x265_interp_4tap_horiz_pp_4x2_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_hpp = x265_interp_4tap_horiz_pp_4x8_avx2;
diff -r cc496665280f -r 1ac750487fb5 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Sun Mar 22 22:16:45 2015 -0400
+++ b/source/common/x86/ipfilter8.asm	Mon Mar 23 10:07:40 2015 +0530
@@ -1695,6 +1695,56 @@
     pextrw            [r2 + r4],     xm1,         3
     RET
 
+INIT_YMM avx2 ; m# are used as 256-bit registers below, xm# as their low 128 bits
+cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6    ; 4-tap horizontal filter writing 8 rows of 2 bytes; r0/r1 = source base/row step, r2/r3 = destination base/row step
+    mov               r4d,           r4m    ; r4 = coefficient-set index (presumably the 5th C argument -- confirm against the prototype)
+
+%ifdef PIC
+    lea               r5,            [tab_ChromaCoeff]    ; PIC: an indexed access needs the table base in a register first
+    vpbroadcastd      m0,            [r5 + r4 * 4]    ; m0 = the selected 4 coefficient bytes repeated in every dword
+%else
+    vpbroadcastd      m0,            [tab_ChromaCoeff + r4 * 4]    ; same load, addressing the table directly
+%endif
+
+    mova              m4,            [interp4_hpp_shuf]    ; byte-shuffle control (table defined elsewhere in the file)
+    mova              m5,            [pw_1]    ; presumably words of 1, so pmaddwd below just sums adjacent word pairs -- confirm
+    dec               r0    ; start reading one byte to the left of the passed source pointer
+    lea               r4,            [r1 * 3]    ; r4 = 3 * source row step (the coefficient index is no longer needed)
+    movq              xm1,           [r0]    ; row 0 -> low 8 bytes
+    movhps            xm1,           [r0 + r1]    ; row 1 -> high 8 bytes
+    movq              xm2,           [r0 + r1 * 2]    ; row 2
+    movhps            xm2,           [r0 + r4]    ; row 3
+    vinserti128       m1,            m1,          xm2,          1    ; m1 = rows 0,1 (low lane) | rows 2,3 (high lane)
+    lea               r0,            [r0 + r1 * 4]    ; advance source by 4 rows
+    movq              xm3,           [r0]    ; row 4
+    movhps            xm3,           [r0 + r1]    ; row 5
+    movq              xm2,           [r0 + r1 * 2]    ; row 6
+    movhps            xm2,           [r0 + r4]    ; row 7
+    vinserti128       m3,            m3,          xm2,          1    ; m3 = rows 4,5 (low lane) | rows 6,7 (high lane)
+
+    pshufb            m1,            m4    ; rearrange source bytes for the multiply; NOTE(review): presumably gathers the 4-byte window of each output pixel -- check interp4_hpp_shuf
+    pshufb            m3,            m4
+    pmaddubsw         m1,            m0    ; unsigned source bytes * signed coefficient bytes, adjacent products summed into words
+    pmaddubsw         m3,            m0
+    pmaddwd           m1,            m5    ; combine adjacent word pairs into dwords
+    pmaddwd           m3,            m5
+    packssdw          m1,            m3    ; per-lane pack to words: low lane = m1.low, m3.low; high lane = m1.high, m3.high
+    pmulhrsw          m1,            [pw_512]    ; rounding scale; if pw_512 holds words of 512 this equals (x + 32) >> 6 -- confirm
+    vextracti128      xm2,           m1,          1    ; xm2 = high lane of m1
+    packuswb          xm1,           xm2    ; saturate to unsigned bytes; xm1 words are now ordered m1.low, m3.low, m1.high, m3.high
+
+    lea               r4,            [r3 * 3]    ; r4 = 3 * destination row step
+    pextrw            [r2],          xm1,         0    ; each pextrw stores one 2-byte output row; words 0,1 derive from rows 0,1
+    pextrw            [r2 + r3],     xm1,         1
+    pextrw            [r2 + r3 * 2], xm1,         4    ; words 4,5 derive from the high lane of the original m1 (rows 2,3)
+    pextrw            [r2 + r4],     xm1,         5
+    lea               r2,            [r2 + r3 * 4]    ; advance destination by 4 rows
+    pextrw            [r2],          xm1,         2    ; words 2,3 derive from the low lane of m3 (rows 4,5)
+    pextrw            [r2 + r3],     xm1,         3
+    pextrw            [r2 + r3 * 2], xm1,         6    ; words 6,7 derive from the high lane of m3 (rows 6,7)
+    pextrw            [r2 + r4],     xm1,         7
+    RET
+
 INIT_YMM avx2
 cglobal interp_4tap_horiz_pp_32x32, 4,6,7
     mov             r4d, r4m


More information about the x265-devel mailing list