[x265] [PATCH] asm: chroma_hps[16x4, 16x8, 16x12, 16x32] for i420 avx2 - improved 743c->468c, 1065c->681c, 1399c->894c, 2961c->1844c

aasaipriya at multicorewareinc.com aasaipriya at multicorewareinc.com
Thu Mar 12 09:25:31 CET 2015


# HG changeset patch
# User Aasaipriya Chandran <aasaipriya at multicorewareinc.com>
# Date 1426148724 -19800
#      Thu Mar 12 13:55:24 2015 +0530
# Node ID d78714dc5cb537d480550ba507c320f9e9a0cc9d
# Parent  9e8b95e8d6ba9e036f9a1c8f0e09d478149f1859
asm: chroma_hps[16x4, 16x8, 16x12, 16x32] for i420 avx2 - improved 743c->468c, 1065c->681c, 1399c->894c, 2961c->1844c

diff -r 9e8b95e8d6ba -r d78714dc5cb5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Mar 12 12:18:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Mar 12 13:55:24 2015 +0530
@@ -1591,6 +1591,11 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_hps = x265_interp_4tap_horiz_ps_4x4_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hps = x265_interp_4tap_horiz_ps_8x8_avx2;
 
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hps = x265_interp_4tap_horiz_ps_16x32_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hps = x265_interp_4tap_horiz_ps_16x12_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hps = x265_interp_4tap_horiz_ps_16x8_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hps = x265_interp_4tap_horiz_ps_16x4_avx2;
+
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
 
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
diff -r 9e8b95e8d6ba -r d78714dc5cb5 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Thu Mar 12 12:18:51 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Thu Mar 12 13:55:24 2015 +0530
@@ -14769,6 +14769,66 @@
     jnz                .loop
    RET
 
+;-----------------------------------------------------------------------------------------------------------------------------
+; void interp_4tap_horiz_ps_16xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;-----------------------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_CHROMA_PS_16xN_AVX2 2
+; args: width (used only to build the symbol name; each row always yields 16 outputs), height in rows
+INIT_YMM avx2
+cglobal interp_4tap_horiz_ps_%1x%2, 4,7,6
+    mov                    r4d,        r4m                      ; r4d = coeffIdx
+    mov                    r5d,        r5m                      ; r5d = isRowExt
+    add                    r3,         r3                       ; dstStride: int16_t units -> bytes (full width, it is an intptr_t)
+
+%ifdef PIC
+    lea                    r6,         [tab_ChromaCoeff]        ; PIC: take the table base first, then index it
+    vpbroadcastd           m0,         [r6 + r4 * 4]            ; 4 bytes per coeffIdx entry, broadcast to every dword
+%else
+    vpbroadcastd           m0,         [tab_ChromaCoeff + r4 * 4]
+%endif
+
+    vbroadcasti128         m2,         [pw_1]
+    vbroadcasti128         m5,         [pw_2000]                ; defined elsewhere; presumably the ps offset 0x2000 per word - confirm
+    mova                   m1,         [tab_Tm]                 ; aligned load: tab_Tm must be 32-byte aligned
+
+    ; register map
+    ; m0 - interpolate coeff (broadcast dword)    m1 - shuffle order table (tab_Tm)
+    ; m2 - constant word 1 (pw_1)                 m5 - pw_2000, subtracted from every output word
+    ; m3, m4 - per-row scratch                    r6d - rows left to process
+    mov                    r6d,        %2
+    dec                    r0                                   ; loads start at src[-1]
+    test                   r5d,        r5d
+    je                     .loop                                ; isRowExt == 0: filter exactly the block rows
+    sub                    r0 ,        r1                       ; isRowExt != 0: start one row above the block ...
+    add                    r6d ,       3                        ; ... and filter 3 extra rows in total
+
+.loop:
+    ; One output row: two 8-output halves built from the 16 bytes at r0 and at r0 + 8
+    vbroadcasti128         m3,         [r0]                     ; same 16 source bytes in both lanes
+    pshufb                 m3,         m1                       ; arrange filter windows per tab_Tm (layout defined elsewhere)
+    pmaddubsw              m3,         m0                       ; u8 pixel * s8 coeff, adjacent pairs summed -> words
+    pmaddwd                m3,         m2                       ; * 1, adjacent word pairs summed -> dwords
+    vbroadcasti128         m4,         [r0 + 8]
+    pshufb                 m4,         m1
+    pmaddubsw              m4,         m0
+    pmaddwd                m4,         m2
+
+    packssdw               m3,         m4                       ; dwords -> signed-saturated words, packed per 128-bit lane
+    psubw                  m3,         m5
+    vpermq                 m3,         m3,          11011000b   ; qword order 0,2,1,3: undo the lane interleave left by packssdw
+    movu                   [r2],       m3                       ; store 16 int16_t (32 bytes), unaligned store
+
+    add                    r2,         r3                       ; next dst row
+    add                    r0,         r1                       ; next src row
+    dec                    r6d
+    jnz                    .loop
+    RET
+%endmacro
+
+    IPFILTER_CHROMA_PS_16xN_AVX2 16, 4                          ; emits interp_4tap_horiz_ps_16x4
+    IPFILTER_CHROMA_PS_16xN_AVX2 16, 8                          ; emits interp_4tap_horiz_ps_16x8
+    IPFILTER_CHROMA_PS_16xN_AVX2 16, 12                         ; emits interp_4tap_horiz_ps_16x12
+    IPFILTER_CHROMA_PS_16xN_AVX2 16, 32                         ; emits interp_4tap_horiz_ps_16x32
 ;-----------------------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_horiz_ps_4x4(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 ;-----------------------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list