[x265] [PATCH] asm: chroma_hps[2x16] for i422 - improved 796c->559c

aasaipriya at multicorewareinc.com aasaipriya at multicorewareinc.com
Mon May 4 06:48:34 CEST 2015


# HG changeset patch
# User Aasaipriya Chandran <aasaipriya at multicorewareinc.com>
# Date 1430714560 -19800
#      Mon May 04 10:12:40 2015 +0530
# Node ID 6eff41530ea88e669d740c23eb03b5ee83a71553
# Parent  9cbbdcc6f04120f395b2cf95532cb0dc86b93f06
asm: chroma_hps[2x16] for i422 - improved 796c->559c

diff -r 9cbbdcc6f041 -r 6eff41530ea8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon May 04 10:09:46 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon May 04 10:12:40 2015 +0530
@@ -2443,6 +2443,7 @@
 
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].filter_hps = x265_interp_4tap_horiz_ps_2x8_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hps = x265_interp_4tap_horiz_ps_24x64_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_hps = x265_interp_4tap_horiz_ps_2x16_avx2;
 
         //i444 chroma_hps
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = x265_interp_4tap_horiz_ps_64x32_avx2;
diff -r 9cbbdcc6f041 -r 6eff41530ea8 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Mon May 04 10:09:46 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Mon May 04 10:12:40 2015 +0530
@@ -291,6 +291,9 @@
 
 const interp8_hps_shuf,     dd 0, 4, 1, 5, 2, 6, 3, 7
 
+ALIGN 32                                                                    ; 32-byte alignment: the table is loaded with an aligned ymm mova
+interp4_hps_shuf: times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 8, 9, 10, 11, 9, 10, 11, 12 ; pshufb mask, same in both lanes: for each 8-byte row select bytes 0-3 and bytes 1-4
+
 SECTION .text
 
 cextern pb_128
@@ -24217,3 +24220,118 @@
     dec                r6d
     jnz                .loop
     RET
+
+INIT_YMM avx2                                                               ; x86inc: assemble what follows for AVX2, m# = ymm#, xm# = xmm#
+cglobal interp_4tap_horiz_ps_2x16, 4, 7, 7                                  ; x86inc: 4 args loaded into r0-r3, 7 GPRs and 7 vector regs used
+    mov               r4d,           r4m                                    ; r4 = 5th argument, index of the coefficient set in tab_ChromaCoeff
+    mov               r5d,           r5m                                    ; r5 = 6th argument, non-zero enables the row-extension path
+    add               r3d,           r3d                                    ; r3 *= 2 (presumably dst stride in 16-bit units -> bytes; confirm)
+; m0 = the 4-byte entry tab_ChromaCoeff[r4], repeated in every dword
+%ifdef PIC
+    lea               r6,            [tab_ChromaCoeff]                      ; PIC build: table base goes through r6 before indexing
+    vpbroadcastd      m0,            [r6 + r4 * 4]
+%else
+    vpbroadcastd      m0,            [tab_ChromaCoeff + r4 * 4]
+%endif
+    vbroadcasti128    m6,            [pw_2000]                              ; m6 = 16-byte pw_2000 constant (defined elsewhere) in both lanes
+    test              r5d,            r5d
+    jz                .label                                                ; r5 == 0: start at the row r0 points to
+    sub               r0,             r1                                    ; r5 != 0: start one source row earlier
+
+.label
+    mova              m4,            [interp4_hps_shuf]                     ; m4 = shuffle mask: per row, bytes 0-3 and bytes 1-4
+    mova              m5,            [pw_1]                                 ; m5 = pw_1 (defined elsewhere; presumably words of 1 - confirm)
+    dec               r0                                                    ; r0 -= 1 byte, so each 4-byte window starts one byte to the left
+    lea               r4,            [r1 * 3]                               ; r4 = 3 * src stride
+    movq              xm1,           [r0]                                   ; row 0 (8 source bytes)
+    movhps            xm1,           [r0 + r1]                              ; row 1
+    movq              xm2,           [r0 + r1 * 2]                          ; row 2
+    movhps            xm2,           [r0 + r4]                              ; row 3
+    vinserti128       m1,            m1,           xm2,          1          ; m1 = rows 0,1 (low lane) | rows 2,3 (high lane)
+    lea               r0,            [r0 + r1 * 4]                          ; r0 -> row 4
+    movq              xm3,           [r0]                                   ; row 4
+    movhps            xm3,           [r0 + r1]                              ; row 5
+    movq              xm2,           [r0 + r1 * 2]                          ; row 6
+    movhps            xm2,           [r0 + r4]                              ; row 7
+    vinserti128       m3,            m3,           xm2,          1          ; m3 = rows 4,5 (low lane) | rows 6,7 (high lane)
+; filter 8 rows: build the two 4-byte windows per row, multiply by the taps, reduce to one value per pixel
+    pshufb            m1,            m4
+    pshufb            m3,            m4
+    pmaddubsw         m1,            m0                                     ; unsigned source byte * signed tap, adjacent pairs added -> 2 words per pixel
+    pmaddubsw         m3,            m0
+    pmaddwd           m1,            m5                                     ; multiply words by m5 and add each pair -> 1 dword per pixel
+    pmaddwd           m3,            m5
+    packssdw          m1,            m3                                     ; back to words: low lane = rows 0,1,4,5, high lane = rows 2,3,6,7
+    psubw             m1,            m6                                     ; subtract the m6 constant from every output word
+; store 8 rows, one dword (two output words) per row
+    lea               r4,            [r3 * 3]                               ; r4 = 3 * r3 (dst row step)
+    vextracti128      xm2,           m1,           1                        ; xm2 = high lane (rows 2,3,6,7)
+
+    movd              [r2],          xm1                                    ; row 0
+    pextrd            [r2 + r3],     xm1,          1                        ; row 1
+    movd              [r2 + r3 * 2], xm2                                    ; row 2
+    pextrd            [r2 + r4],     xm2,          1                        ; row 3
+    lea               r2,            [r2 + r3 * 4]                          ; r2 -> row 4
+    pextrd            [r2],          xm1,          2                        ; row 4
+    pextrd            [r2 + r3],     xm1,          3                        ; row 5
+    pextrd            [r2 + r3 * 2], xm2,          2                        ; row 6
+    pextrd            [r2 + r4],     xm2,          3                        ; row 7
+; second group of 8 rows (rows 8-15): same load / filter / store sequence as above
+    lea               r0,            [r0 + r1 * 4]                          ; r0 -> row 8
+    lea               r2,            [r2 + r3 * 4]                          ; r2 -> row 8
+    lea               r4,            [r1 * 3]                               ; r4 = 3 * src stride again (it held the dst multiple)
+    movq              xm1,           [r0]                                   ; row 8
+    movhps            xm1,           [r0 + r1]                              ; row 9
+    movq              xm2,           [r0 + r1 * 2]                          ; row 10
+    movhps            xm2,           [r0 + r4]                              ; row 11
+    vinserti128       m1,            m1,          xm2,           1          ; m1 = rows 8,9 | rows 10,11
+    lea               r0,            [r0 + r1 * 4]                          ; r0 -> row 12
+    movq              xm3,           [r0]                                   ; row 12
+    movhps            xm3,           [r0 + r1]                              ; row 13
+    movq              xm2,           [r0 + r1 * 2]                          ; row 14
+    movhps            xm2,           [r0 + r4]                              ; row 15
+    vinserti128       m3,            m3,          xm2,           1          ; m3 = rows 12,13 | rows 14,15
+
+    pshufb            m1,            m4
+    pshufb            m3,            m4
+    pmaddubsw         m1,            m0
+    pmaddubsw         m3,            m0
+    pmaddwd           m1,            m5
+    pmaddwd           m3,            m5
+    packssdw          m1,            m3                                     ; low lane = rows 8,9,12,13, high lane = rows 10,11,14,15
+    psubw             m1,            m6
+
+    lea               r4,            [r3 * 3]                               ; r4 = 3 * r3 (dst row step)
+    vextracti128      xm2,           m1,           1                        ; xm2 = high lane (rows 10,11,14,15)
+
+    movd              [r2],          xm1                                    ; row 8
+    pextrd            [r2 + r3],     xm1,          1                        ; row 9
+    movd              [r2 + r3 * 2], xm2                                    ; row 10
+    pextrd            [r2 + r4],     xm2,          1                        ; row 11
+    lea               r2,            [r2 + r3 * 4]                          ; r2 -> row 12
+    pextrd            [r2],          xm1,          2                        ; row 12
+    pextrd            [r2 + r3],     xm1,          3                        ; row 13
+    pextrd            [r2 + r3 * 2], xm2,          2                        ; row 14
+    pextrd            [r2 + r4],     xm2,          3                        ; row 15
+; without the extension flag the 16 rows written above are the complete output
+    test              r5d,            r5d
+    jz                .end
+; extension path: three more rows (16-18) are filtered and stored
+    lea               r0,            [r0 + r1 * 4]                          ; r0 -> row 16
+    lea               r2,            [r2 + r3 * 4]                          ; r2 -> row 16
+    movq              xm1,           [r0]                                   ; row 16
+    movhps            xm1,           [r0 + r1]                              ; row 17
+    movq              xm2,           [r0 + r1 * 2]                          ; row 18 (upper half of xm2 is zeroed by movq)
+    vinserti128       m1,            m1,          xm2,           1          ; m1 = rows 16,17 | row 18, zero
+    pshufb            m1,            m4
+    pmaddubsw         m1,            m0
+    pmaddwd           m1,            m5
+    packssdw          m1,            m1                                     ; low lane dwords 0,1 = rows 16,17; high lane dword 0 = row 18
+    psubw             m1,            m6
+    vextracti128      xm2,           m1,           1                        ; xm2 = high lane (row 18)
+
+    movd              [r2],          xm1                                    ; row 16
+    pextrd            [r2 + r3],     xm1,          1                        ; row 17
+    movd              [r2 + r3 * 2], xm2                                    ; row 18
+.end
+    RET


More information about the x265-devel mailing list