[x265] [PATCH] asm: filter_vsp, filter_vss for 16x24 in avx2

Divya Manivannan divya at multicorewareinc.com
Tue Apr 28 11:17:46 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1430211431 -19800
#      Tue Apr 28 14:27:11 2015 +0530
# Node ID b143e26c2846d917c0e911708907b0e3e85a368c
# Parent  a5a8d76ef3df4c85510cce653d52f6dba9307ff2
asm: filter_vsp, filter_vss for 16x24 in avx2

Performance (cycles, before -> after):
filter_vsp[16x24]: 4357c -> 2865c
filter_vss[16x24]: 3545c -> 3171c

diff -r a5a8d76ef3df -r b143e26c2846 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Apr 28 13:30:29 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Apr 28 14:27:11 2015 +0530
@@ -2286,6 +2286,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vss = x265_interp_4tap_vert_ss_8x12_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_vss = x265_interp_4tap_vert_ss_6x16_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vss = x265_interp_4tap_vert_ss_2x16_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vss = x265_interp_4tap_vert_ss_16x24_avx2;
 
         //i444 for chroma_vss
         p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vss = x265_interp_4tap_vert_ss_4x4_avx2;
@@ -2473,6 +2474,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_vsp = x265_interp_4tap_vert_sp_8x12_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].filter_vsp = x265_interp_4tap_vert_sp_6x16_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].filter_vsp = x265_interp_4tap_vert_sp_2x16_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vsp = x265_interp_4tap_vert_sp_16x24_avx2;
 
         //i444 for chroma_vsp
         p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vsp = x265_interp_4tap_vert_sp_4x4_avx2;
diff -r a5a8d76ef3df -r b143e26c2846 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Tue Apr 28 13:30:29 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Tue Apr 28 14:27:11 2015 +0530
@@ -17422,10 +17422,10 @@
     FILTER_VER_CHROMA_S_AVX2_8xN ss, 32
     FILTER_VER_CHROMA_S_AVX2_8xN ss, 64
 
-%macro FILTER_VER_CHROMA_S_AVX2_32x24 1
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_4tap_vert_%1_32x24, 4, 10, 10
+%macro FILTER_VER_CHROMA_S_AVX2_Nx24 2
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_%2x24, 4, 10, 10
     mov             r4d, r4m
     shl             r4d, 6
     add             r1d, r1d
@@ -17445,7 +17445,7 @@
     add             r3d, r3d
 %endif
     lea             r6, [r3 * 3]
-    mov             r9d, 4
+    mov             r9d, %2 / 8
 .loopW:
     PROCESS_CHROMA_S_AVX2_W8_16R %1
 %ifidn %1,sp
@@ -17457,13 +17457,13 @@
     dec             r9d
     jnz             .loopW
 %ifidn %1,sp
-    lea             r2, [r8 + r3 * 4 - 24]
-%else
-    lea             r2, [r8 + r3 * 4 - 48]
-%endif
-    lea             r0, [r7 - 48]
+    lea             r2, [r8 + r3 * 4 - %2 + 8]
+%else
+    lea             r2, [r8 + r3 * 4 - 2 * %2 + 16]
+%endif
+    lea             r0, [r7 - 2 * %2 + 16]
     mova            m7, m9
-    mov             r9d, 4
+    mov             r9d, %2 / 8
 .loop:
     PROCESS_CHROMA_S_AVX2_W8_8R %1
 %ifidn %1,sp
@@ -17478,8 +17478,10 @@
 %endif
 %endmacro
 
-    FILTER_VER_CHROMA_S_AVX2_32x24 sp
-    FILTER_VER_CHROMA_S_AVX2_32x24 ss
+    FILTER_VER_CHROMA_S_AVX2_Nx24 sp, 32
+    FILTER_VER_CHROMA_S_AVX2_Nx24 sp, 16
+    FILTER_VER_CHROMA_S_AVX2_Nx24 ss, 32
+    FILTER_VER_CHROMA_S_AVX2_Nx24 ss, 16
 
 %macro FILTER_VER_CHROMA_S_AVX2_2x8 1
 INIT_YMM avx2


More information about the x265-devel mailing list