[x265] [PATCH] asm: interp_4tap_vert_X[64xN] avx2 10bit code for i444

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Wed Jun 10 13:18:49 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1433928862 -19800
#      Wed Jun 10 15:04:22 2015 +0530
# Node ID e62e1cdeb0f4d11a52c9d71bf2f33d798e6a27f0
# Parent  6245476add8f0562e3ccb657f572ff94fe96adf0
asm: interp_4tap_vert_X[64xN] avx2 10bit code for i444

avx2 (columns: speedup over C, optimized time, C reference time):
chroma_vpp[64x64]       14.09x   35866.48        505498.13
chroma_vps[64x64]       13.36x   31091.82        415364.22
chroma_vsp[64x64]       31.60x   35634.32        1126102.00
chroma_vss[64x64]       14.87x   26751.72        397682.13

sse4 (columns: speedup over C, optimized time, C reference time):
chroma_vpp[64x64]       6.35x    77950.40        495373.38
chroma_vps[64x64]       5.99x    70017.03        419411.72
chroma_vsp[64x64]       14.06x   77960.27        1096133.63
chroma_vss[64x64]       6.53x    62690.69        409574.47

diff -r 6245476add8f -r e62e1cdeb0f4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jun 10 15:04:22 2015 +0530
@@ -1877,6 +1877,22 @@
         p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = x265_interp_4tap_vert_ps_48x64_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = x265_interp_4tap_vert_ss_48x64_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = x265_interp_4tap_vert_sp_48x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = x265_interp_4tap_vert_pp_64x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = x265_interp_4tap_vert_pp_64x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = x265_interp_4tap_vert_pp_64x48_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = x265_interp_4tap_vert_pp_64x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = x265_interp_4tap_vert_ps_64x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = x265_interp_4tap_vert_ps_64x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = x265_interp_4tap_vert_ps_64x48_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = x265_interp_4tap_vert_ps_64x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = x265_interp_4tap_vert_ss_64x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = x265_interp_4tap_vert_ss_64x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = x265_interp_4tap_vert_ss_64x48_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = x265_interp_4tap_vert_ss_64x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vsp = x265_interp_4tap_vert_sp_64x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = x265_interp_4tap_vert_sp_64x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = x265_interp_4tap_vert_sp_64x48_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = x265_interp_4tap_vert_sp_64x64_avx2;
 
         if (cpuMask & X265_CPU_BMI2)
             p.scanPosLast = x265_scanPosLast_avx2_bmi2;
diff -r 6245476add8f -r e62e1cdeb0f4 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/ipfilter16.asm	Wed Jun 10 15:04:22 2015 +0530
@@ -5132,6 +5132,137 @@
     FILTER_VER_CHROMA_W16_32xN_avx2 48, sp, 15
     FILTER_VER_CHROMA_W16_32xN_avx2 64, sp, 15
 
+;-----------------------------------------------------------------------------------------------------------------
+; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------------------------------------------
+%macro FILTER_VER_CHROMA_W16_64xN_avx2 3 ; %1 = block height, %2 = variant (pp/ps/sp/ss), %3 = vector register count
+INIT_YMM avx2
+cglobal interp_4tap_vert_%2_64x%1, 5, 7, %3
+    add       r1d, r1d                          ; r1 = srcStride in bytes (samples are 16 bits wide)
+    add       r3d, r3d                          ; r3 = dstStride in bytes
+    sub       r0, r1                            ; r0 = src - 1 row: the 4-tap window starts one row above the output row
+    shl       r4d, 6                            ; coeffIdx * 64 = byte offset into tab_ChromaCoeffV (2 * mmsize read per index)
+
+%ifdef PIC
+    lea       r5, [tab_ChromaCoeffV]
+    lea       r5, [r5 + r4]                     ; r5 = coefficient rows for this coeffIdx
+%else
+    lea       r5, [tab_ChromaCoeffV + r4]
+%endif
+    mov       r4d, %1/2                         ; r4 = loop counter; each .loopH pass writes two output rows
+
+%ifidn %2, pp
+    mova      m7, [tab_c_32]                    ; m7 = constant added before the >> 6 below
+%elifidn %2, sp
+    mova      m7, [pd_524800]                   ; m7 = constant added before the >> 10 below
+%elifidn %2, ps
+    mova      m7, [tab_c_n32768]                ; m7 = constant added before the >> 2 below
+%endif
+
+.loopH:
+    lea        r6, [r0 + 2 * r1]                ; r6 = source row 2; loop-invariant across the 4 column groups, so computed once
+%assign x 0
+%rep 4
+    movu       m1, [r0 + x]                     ; source row 0, column group at byte offset x
+    movu       m3, [r0 + r1 + x]                ; source row 1
+    movu       m5, [r5 + 0 * mmsize]            ; first coefficient row (applied to the first row pair)
+    punpcklwd  m0, m1, m3                       ; interleave rows 0/1 so pmaddwd sums one tap pair per dword
+    pmaddwd    m0, m5                           ; m0 = output row A partial sum (low halves)
+    punpckhwd  m1, m3
+    pmaddwd    m1, m5                           ; m1 = output row A partial sum (high halves)
+
+    movu       m4, [r0 + 2 * r1 + x]            ; source row 2
+    punpcklwd  m2, m3, m4                       ; rows 1/2 start output row B
+    pmaddwd    m2, m5                           ; m2 = output row B partial sum (low halves)
+    punpckhwd  m3, m4
+    pmaddwd    m3, m5                           ; m3 = output row B partial sum (high halves)
+
+    movu       m5, [r6 + r1 + x]                ; source row 3 (m5 no longer holds coefficients)
+    punpcklwd  m6, m4, m5                       ; rows 2/3 complete output row A
+    pmaddwd    m6, [r5 + 1 * mmsize]            ; second coefficient row (applied to the second row pair)
+    paddd      m0, m6
+    punpckhwd  m4, m5
+    pmaddwd    m4, [r5 + 1 * mmsize]
+    paddd      m1, m4
+
+    movu       m4, [r6 + 2 * r1 + x]            ; source row 4
+    punpcklwd  m6, m5, m4                       ; rows 3/4 complete output row B
+    pmaddwd    m6, [r5 + 1 * mmsize]
+    paddd      m2, m6
+    punpckhwd  m5, m4
+    pmaddwd    m5, [r5 + 1 * mmsize]
+    paddd      m3, m5
+
+%ifidn %2, ss
+    psrad     m0, 6                             ; ss: plain >> 6, no constant added (m7 is not loaded for ss)
+    psrad     m1, 6
+    psrad     m2, 6
+    psrad     m3, 6
+
+    packssdw  m0, m1                            ; narrow 32-bit sums to 16-bit with signed saturation
+    packssdw  m2, m3
+%elifidn %2, ps
+    paddd     m0, m7                            ; ps: add tab_c_n32768 constant, then >> 2
+    paddd     m1, m7
+    paddd     m2, m7
+    paddd     m3, m7
+    psrad     m0, 2
+    psrad     m1, 2
+    psrad     m2, 2
+    psrad     m3, 2
+
+    packssdw  m0, m1
+    packssdw  m2, m3
+%else
+    paddd     m0, m7                            ; pp/sp: add the variant's constant before shifting
+    paddd     m1, m7
+    paddd     m2, m7
+    paddd     m3, m7
+%ifidn %2, pp
+    psrad     m0, 6                             ; pp: >> 6
+    psrad     m1, 6
+    psrad     m2, 6
+    psrad     m3, 6
+%else
+    psrad     m0, 10                            ; sp: >> 10
+    psrad     m1, 10
+    psrad     m2, 10
+    psrad     m3, 10
+%endif
+    packssdw  m0, m1
+    packssdw  m2, m3
+    pxor      m5, m5                            ; m5 = 0, passed to CLIPW2 together with pw_pixel_max
+    CLIPW2    m0, m2, m5, [pw_pixel_max]        ; NOTE(review): presumably clamps both rows to [0, pixel max] - CLIPW2 is defined elsewhere
+%endif
+
+    movu      [r2 + x], m0                      ; store output row A for this column group
+    movu      [r2 + r3 + x], m2                 ; store output row B
+%assign x x+mmsize
+%endrep
+
+    lea       r2, [r2 + 2 * r3]                 ; advance dst by two rows
+    lea       r0, [r0 + 2 * r1]                 ; advance src by two rows
+    dec       r4d
+    jnz       .loopH
+    RET
+%endmacro
+    FILTER_VER_CHROMA_W16_64xN_avx2 16, ss, 7
+    FILTER_VER_CHROMA_W16_64xN_avx2 32, ss, 7
+    FILTER_VER_CHROMA_W16_64xN_avx2 48, ss, 7
+    FILTER_VER_CHROMA_W16_64xN_avx2 64, ss, 7
+    FILTER_VER_CHROMA_W16_64xN_avx2 16, sp, 8
+    FILTER_VER_CHROMA_W16_64xN_avx2 32, sp, 8
+    FILTER_VER_CHROMA_W16_64xN_avx2 48, sp, 8
+    FILTER_VER_CHROMA_W16_64xN_avx2 64, sp, 8
+    FILTER_VER_CHROMA_W16_64xN_avx2 16, ps, 8
+    FILTER_VER_CHROMA_W16_64xN_avx2 32, ps, 8
+    FILTER_VER_CHROMA_W16_64xN_avx2 48, ps, 8
+    FILTER_VER_CHROMA_W16_64xN_avx2 64, ps, 8
+    FILTER_VER_CHROMA_W16_64xN_avx2 16, pp, 8
+    FILTER_VER_CHROMA_W16_64xN_avx2 32, pp, 8
+    FILTER_VER_CHROMA_W16_64xN_avx2 48, pp, 8
+    FILTER_VER_CHROMA_W16_64xN_avx2 64, pp, 8
+
 ;-----------------------------------------------------------------------------------------------------------------
 ; void interp_4tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list