[x265] [PATCH] asm: filter_vsp, filter_vss for 64xN, 48x64 in avx2

Divya Manivannan divya at multicorewareinc.com
Wed Apr 29 08:24:48 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1430288213 -19800
#      Wed Apr 29 11:46:53 2015 +0530
# Node ID 41a94b5fe6a8b2d0d964955971377731a48439ac
# Parent  861c8a143802759b1c960289058bb3165481f6cd
asm: filter_vsp, filter_vss for 64xN, 48x64 in avx2

filter_vsp: 64x64 48832c->33182c, 64x32 22838c->15159c, 64x48 35532c->22386c, 48x64 33320c->22436c, 64x16 11928c->7625c
filter_vss: 64x64 38361c->33126c, 64x32 17764c->15819c, 64x48 29908c->24571c, 48x64 26276c->24565c, 64x16 9161c->8253c

diff -r 861c8a143802 -r 41a94b5fe6a8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Apr 29 10:03:13 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Apr 29 11:46:53 2015 +0530
@@ -2298,6 +2298,7 @@
         p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vss = x265_interp_4tap_vert_ss_8x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = x265_interp_4tap_vert_ss_16x16_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = x265_interp_4tap_vert_ss_32x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = x265_interp_4tap_vert_ss_64x64_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vss = x265_interp_4tap_vert_ss_8x4_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vss = x265_interp_4tap_vert_ss_4x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = x265_interp_4tap_vert_ss_16x8_avx2;
@@ -2312,6 +2313,12 @@
         p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vss = x265_interp_4tap_vert_ss_24x32_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = x265_interp_4tap_vert_ss_32x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = x265_interp_4tap_vert_ss_8x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = x265_interp_4tap_vert_ss_64x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = x265_interp_4tap_vert_ss_32x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = x265_interp_4tap_vert_ss_64x48_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = x265_interp_4tap_vert_ss_48x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = x265_interp_4tap_vert_ss_64x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = x265_interp_4tap_vert_ss_16x64_avx2;
 
         p.pu[LUMA_16x16].luma_hvpp = x265_interp_8tap_hv_pp_16x16_avx2;
 
@@ -2489,6 +2496,7 @@
         p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vsp = x265_interp_4tap_vert_sp_8x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vsp = x265_interp_4tap_vert_sp_16x16_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = x265_interp_4tap_vert_sp_32x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = x265_interp_4tap_vert_sp_64x64_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vsp = x265_interp_4tap_vert_sp_8x4_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vsp = x265_interp_4tap_vert_sp_4x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vsp = x265_interp_4tap_vert_sp_16x8_avx2;
@@ -2503,6 +2511,12 @@
         p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = x265_interp_4tap_vert_sp_24x32_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = x265_interp_4tap_vert_sp_32x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vsp = x265_interp_4tap_vert_sp_8x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = x265_interp_4tap_vert_sp_64x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = x265_interp_4tap_vert_sp_32x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = x265_interp_4tap_vert_sp_64x48_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = x265_interp_4tap_vert_sp_48x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vsp = x265_interp_4tap_vert_sp_64x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vsp = x265_interp_4tap_vert_sp_16x64_avx2;
 
         //i422 for chroma_vps
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_avx2;
diff -r 861c8a143802 -r 41a94b5fe6a8 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Apr 29 10:03:13 2015 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Apr 29 11:46:53 2015 +0530
@@ -15938,8 +15938,10 @@
 
     FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 16
     FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 32
+    FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 64
     FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 16
     FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 32
+    FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 64
 
 %macro FILTER_VER_CHROMA_S_AVX2_NxN 3
 INIT_YMM avx2
@@ -16002,6 +16004,14 @@
     FILTER_VER_CHROMA_S_AVX2_NxN 16, 64, ss
     FILTER_VER_CHROMA_S_AVX2_NxN 24, 64, ss
     FILTER_VER_CHROMA_S_AVX2_NxN 32, 64, ss
+    FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, sp
+    FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, sp
+    FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, sp
+    FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, sp
+    FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, ss
+    FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, ss
+    FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, ss
+    FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, ss
 
 %macro PROCESS_CHROMA_S_AVX2_W8_4R 1
     movu            xm0, [r0]                       ; m0 = row 0
diff -r 861c8a143802 -r 41a94b5fe6a8 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Wed Apr 29 10:03:13 2015 +0530
+++ b/source/common/x86/ipfilter8.h	Wed Apr 29 11:46:53 2015 +0530
@@ -746,6 +746,8 @@
 CHROMA_444_SP_FILTERS(_sse4);
 CHROMA_444_SS_FILTERS(_sse2);
 CHROMA_444_FILTERS(_avx2);
+CHROMA_444_SP_FILTERS(_avx2);
+CHROMA_444_SS_FILTERS(_avx2);
 
 #undef SETUP_CHROMA_FUNC_DEF
 #undef SETUP_CHROMA_SP_FUNC_DEF


More information about the x265-devel mailing list