[x265] [PATCH] asm: chroma_hps[2x8] avx2 - improved 502c->378c

aasaipriya at multicorewareinc.com aasaipriya at multicorewareinc.com
Wed Mar 25 05:24:19 CET 2015


# HG changeset patch
# User Aasaipriya Chandran <aasaipriya at multicorewareinc.com>
# Date 1427257444 -19800
#      Wed Mar 25 09:54:04 2015 +0530
# Node ID 1763037ac15965d9379bc15f795c34702bc39803
# Parent  ba00598f3ad32867e9a12898ace89e3210b75061
asm: chroma_hps[2x8] avx2 - improved 502c->378c

diff -r ba00598f3ad3 -r 1763037ac159 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Mar 25 09:50:37 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Mar 25 09:54:04 2015 +0530
@@ -1738,6 +1738,7 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = x265_interp_4tap_horiz_ps_32x24_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = x265_interp_4tap_horiz_ps_32x8_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_hps = x265_interp_4tap_horiz_ps_2x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].filter_hps = x265_interp_4tap_horiz_ps_2x8_avx2;
 
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
 
diff -r ba00598f3ad3 -r 1763037ac159 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Mar 25 09:50:37 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Mar 25 09:54:04 2015 +0530
@@ -20287,3 +20287,80 @@
     pextrd             [r2 + r3 * 2],  xm1,         2
 .end
     RET
+
+INIT_YMM avx2                                                               ; mN = ymmN (256-bit), xmN = its low 128 bits
+cglobal interp_4tap_horiz_ps_2x8, 4, 7, 7                                   ; r0 = src, r1 = src stride, r2 = dst, r3 = dst stride; 7 GPRs, 7 vector regs
+    mov               r4d,           r4m                                    ; r4 = 5th argument: entry index into tab_ChromaCoeff
+    mov               r5d,           r5m                                    ; r5 = 6th argument: non-zero enables the 3 extra rows (presumably isRowExt)
+    add               r3d,           r3d                                    ; r3 = 2 * dst stride; presumably converts 16-bit elements to bytes
+    ; m0 = the 4-byte tab_ChromaCoeff entry selected by r4, broadcast to every dword
+%ifdef PIC
+    lea               r6,            [tab_ChromaCoeff]
+    vpbroadcastd      m0,            [r6 + r4 * 4]
+%else
+    vpbroadcastd      m0,            [tab_ChromaCoeff + r4 * 4]
+%endif
+    vbroadcasti128    m6,            [pw_2000]                              ; m6 = constant subtracted from every result word below
+    test              r5d,            r5d
+    jz                .label                                                ; r5 == 0: start at the row r0 points to
+    sub               r0,             r1                                    ; r5 != 0: start one source row higher
+    ; load 8 source rows of 8 bytes each, starting one byte to the left of r0
+.label:
+    mova              m4,            [interp4_hps_shuf]                     ; m4 = byte-shuffle mask (table not visible here; presumably builds the 4-tap windows)
+    mova              m5,            [pw_1]                                 ; m5 = pmaddwd multiplier; presumably all-ones words, so word pairs are simply summed
+    dec               r0                                                    ; back up one byte
+    lea               r4,            [r1 * 3]                               ; r4 = 3 * src stride (coefficient index is no longer needed)
+    movq              xm1,           [r0]                                   ; loaded row 0 (one row above the block when r5 != 0)
+    movhps            xm1,           [r0 + r1]                              ; loaded row 1 -> high qword
+    movq              xm2,           [r0 + r1 * 2]                          ; loaded row 2
+    movhps            xm2,           [r0 + r4]                              ; loaded row 3 -> high qword
+    vinserti128       m1,            m1,          xm2,          1           ; m1 = rows 0-1 (low lane), rows 2-3 (high lane)
+    lea               r0,            [r0 + r1 * 4]                          ; advance 4 source rows
+    movq              xm3,           [r0]                                   ; loaded row 4
+    movhps            xm3,           [r0 + r1]                              ; loaded row 5 -> high qword
+    movq              xm2,           [r0 + r1 * 2]                          ; loaded row 6
+    movhps            xm2,           [r0 + r4]                              ; loaded row 7 -> high qword
+    vinserti128       m3,            m3,          xm2,          1           ; m3 = rows 4-5 (low lane), rows 6-7 (high lane)
+    ; filter: shuffle bytes, multiply by coefficients, sum to dwords, pack to words, subtract m6
+    pshufb            m1,            m4
+    pshufb            m3,            m4
+    pmaddubsw         m1,            m0                                     ; unsigned source bytes * signed coefficient bytes, adjacent pairs summed to words
+    pmaddubsw         m3,            m0
+    pmaddwd           m1,            m5                                     ; word pairs * m5, summed to dwords
+    pmaddwd           m3,            m5
+    packssdw          m1,            m3                                     ; per lane: low qword from m1, high qword from m3; dwords saturated to words
+    psubw             m1,            m6                                     ; subtract the pw_2000 constant from each word
+    ; store 8 rows, one dword (two 16-bit results) per row
+    lea               r4,            [r3 * 3]                               ; r4 = 3 * r3, offset of the 4th row in each group of four dst rows
+    vextracti128      xm2,           m1,          1                         ; xm2 = high lane of m1
+    ; in xm1 and xm2, dwords 0-1 came from m1 (first four rows) and dwords 2-3 from m3 (last four rows)
+    movd              [r2],          xm1                                    ; dst row 0
+    pextrd            [r2 + r3],     xm1,         1                         ; dst row 1
+    movd              [r2 + r3 * 2], xm2                                    ; dst row 2
+    pextrd            [r2 + r4],     xm2,         1                         ; dst row 3
+    lea               r2,            [r2 + r3 * 4]                          ; advance 4 dst rows
+    pextrd            [r2],          xm1,         2                         ; dst row 4
+    pextrd            [r2 + r3],     xm1,         3                         ; dst row 5
+    pextrd            [r2 + r3 * 2], xm2,         2                         ; dst row 6
+    pextrd            [r2 + r4],     xm2,         3                         ; dst row 7
+    test              r5d,            r5d
+    jz                .end                                                  ; r5 == 0: only the 8 rows above are produced
+    ; r5 != 0: filter and store 3 more rows (loaded rows 8-10 -> dst rows 8-10)
+    lea               r0,            [r0 + r1 * 4]                          ; r0 -> loaded row 8
+    lea               r2,            [r2 + r3 * 4]                          ; r2 -> dst row 8
+    movq              xm1,           [r0]                                   ; loaded row 8
+    movhps            xm1,           [r0 + r1]                              ; loaded row 9 -> high qword
+    movq              xm2,           [r0 + r1 * 2]                          ; loaded row 10 (movq zeroes the high qword)
+    vinserti128       m1,            m1,          xm2,          1           ; m1 = rows 8-9 (low lane), row 10 (high lane)
+    pshufb            m1,            m4
+    pmaddubsw         m1,            m0
+    pmaddwd           m1,            m5
+    packssdw          m1,            m1                                     ; both sources are m1, so each lane's low qword holds that lane's packed results
+    psubw             m1,            m6
+    vextracti128      xm2,           m1,          1                         ; xm2 = high lane (row 10)
+    ; three dword stores for the extra rows
+    movd              [r2],          xm1                                    ; dst row 8
+    pextrd            [r2 + r3],     xm1,         1                         ; dst row 9
+    movd              [r2 + r3 * 2], xm2                                    ; dst row 10
+.end:
+    RET


More information about the x265-devel mailing list