[x265] [PATCH] asm-avx2: filter_vpp[4x2], filter_vps[4x2]: improve 142c->130c, 126c->121c

Divya Manivannan divya at multicorewareinc.com
Wed Mar 4 07:19:46 CET 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1425449960 -19800
#      Wed Mar 04 11:49:20 2015 +0530
# Node ID 526974a41de7f30f53375a9583ddb3320384cd7f
# Parent  018e8bbaa854b1a4bd82b3a2e23f7775a77da5cc
asm-avx2: filter_vpp[4x2], filter_vps[4x2]: improve 142c->130c, 126c->121c

diff -r 018e8bbaa854 -r 526974a41de7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Mar 04 11:49:20 2015 +0530
@@ -1623,12 +1623,14 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = x265_interp_4tap_vert_pp_8x8_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vpp = x265_interp_4tap_vert_pp_2x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vpp = x265_interp_4tap_vert_pp_4x2_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vpp = x265_interp_4tap_vert_pp_4x8_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vpp = x265_interp_4tap_vert_pp_8x4_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = x265_interp_4tap_vert_pp_8x16_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = x265_interp_4tap_vert_pp_16x8_avx2;
 
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].filter_vps = x265_interp_4tap_vert_ps_2x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].filter_vps = x265_interp_4tap_vert_ps_4x2_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_vps = x265_interp_4tap_vert_ps_8x4_avx2;
diff -r 018e8bbaa854 -r 526974a41de7 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Mar 04 11:49:20 2015 +0530
@@ -2482,6 +2482,55 @@
 
 RET
 
+%macro FILTER_VER_CHROMA_AVX2_4x2 1                        ; %1 = variant suffix; instantiated below with pp and ps
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_4x2, 4, 6, 4                   ; NOTE(review): args presumably match interp_4tap_vert_pp_4x4: r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride, r4m = coeffIdx - confirm
+    mov             r4d, r4m                                ; fetch the 5th argument
+    shl             r4d, 5                                  ; scale by 32: byte offset into tab_ChromaCoeff_V (32 bytes are read per entry below)
+    sub             r0, r1                                  ; step back by r1 so the first row loaded is one stride before the passed pointer
+
+%ifdef PIC
+    lea             r5, [tab_ChromaCoeff_V]                 ; PIC: take the table base first, then add the offset separately
+    add             r5, r4
+%else
+    lea             r5, [tab_ChromaCoeff_V + r4]            ; non-PIC: base + offset in a single lea
+%endif
+
+    lea             r4, [r1 * 3]                            ; r4 = 3 * r1, offset of the 4th row loaded (row 3)
+
+    movd            xm1, [r0]                               ; 4 bytes of row 0
+    movd            xm2, [r0 + r1]                          ; 4 bytes of row 1
+    punpcklbw       xm1, xm2                                ; interleave bytes of rows 0 and 1
+    movd            xm3, [r0 + r1 * 2]                      ; 4 bytes of row 2
+    punpcklbw       xm2, xm3                                ; interleave bytes of rows 1 and 2
+    movlhps         xm1, xm2                                ; xm1 = low qword: rows 0/1, high qword: rows 1/2
+    movd            xm0, [r0 + r4]                          ; 4 bytes of row 3
+    punpcklbw       xm3, xm0                                ; interleave bytes of rows 2 and 3
+    movd            xm2, [r0 + r1 * 4]                      ; 4 bytes of row 4
+    punpcklbw       xm0, xm2                                ; interleave bytes of rows 3 and 4
+    movlhps         xm3, xm0                                ; xm3 = low qword: rows 2/3, high qword: rows 3/4
+    vinserti128     m1, m1, xm3, 1                          ; m1 = row[x x x 4 3 2 1 0]: low lane = row pairs 0/1, 1/2; high lane = row pairs 2/3, 3/4
+
+    pmaddubsw       m1, [r5]                                ; multiply byte pairs by the 32 coefficient bytes at r5 and add each pair into a word (table layout not visible here - presumably taps 0/1 in the low lane, taps 2/3 in the high lane)
+    vextracti128    xm3, m1, 1                              ; xm3 = high lane partial sums
+    paddw           xm1, xm3                                ; add lanes: low qword = 4 words for the first output row, high qword = 4 words for the second
+%ifidn %1,pp
+    pmulhrsw        xm1, [pw_512]                           ; rounded high multiply by pw_512 (presumably words of 512, i.e. a rounded shift right by 6 - confirm)
+    packuswb        xm1, xm1                                ; saturate words to unsigned bytes: bytes 0-3 = first row, bytes 4-7 = second row
+    movd            [r2], xm1                               ; store 4 bytes of the first output row
+    pextrd          [r2 + r3], xm1, 1                       ; store 4 bytes of the second output row, r3 bytes further on
+%else
+    add             r3d, r3d                                ; double r3 with a 32-bit add (presumably converts a stride in 16-bit elements to bytes - confirm)
+    psubw           xm1, [pw_2000]                          ; subtract the pw_2000 constant from every word (value defined outside this hunk)
+    movq            [r2], xm1                               ; store 4 words of the first output row
+    movhps          [r2 + r3], xm1                          ; store 4 words of the second output row
+%endif
+    RET
+%endmacro
+
+FILTER_VER_CHROMA_AVX2_4x2 pp                               ; emits interp_4tap_vert_pp_4x2
+FILTER_VER_CHROMA_AVX2_4x2 ps                               ; emits interp_4tap_vert_ps_4x2
+
 ;-----------------------------------------------------------------------------
 ; void interp_4tap_vert_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
 ;-----------------------------------------------------------------------------
@@ -6044,8 +6093,7 @@
     paddw           m0, m4
     paddw           m0, m2                                  ; m0 = WORD ROW[3 2 1 0]
 
-    vbroadcasti128  m3, [pw_2000]
-    psubw           m0, m3
+    psubw           m0, [pw_2000]
     vextracti128    xm2, m0, 1
     lea             r5, [r3 * 3]
     movq            [r2], xm0


More information about the x265-devel mailing list