[x265-commits] [x265] api: allow libx265 to forward x265_api_get() calls

Steve Borho steve at borho.org
Wed Apr 29 18:15:05 CEST 2015


details:   http://hg.videolan.org/x265/rev/521643054a55
branches:  
changeset: 10321:521643054a55
user:      Steve Borho <steve at borho.org>
date:      Tue Apr 28 15:05:54 2015 -0500
description:
api: allow libx265 to forward x265_api_get() calls

By adding dynamic binding to x265_api_get() within libx265 itself, we remove the
need for a shim library. Now any libx265 library can forward requests for APIs
supporting a different bit depth.  The name of the library to load is hard-coded
as libx265_main or libx265_main10 depending on the requested bit depth (making it
simple to add libx265_main12 in the future).
Subject: [x265] cli: add -P short option for --profile

details:   http://hg.videolan.org/x265/rev/efaca34116cd
branches:  
changeset: 10322:efaca34116cd
user:      Steve Borho <steve at borho.org>
date:      Tue Apr 28 14:18:13 2015 -0500
description:
cli: add -P short option for --profile
Subject: [x265] asm: filter_vsp, filter_vss for 2x4 in avx2

details:   http://hg.videolan.org/x265/rev/a733d7da0571
branches:  
changeset: 10323:a733d7da0571
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Wed Apr 29 10:03:13 2015 +0530
description:
asm: filter_vsp, filter_vss for 2x4 in avx2
Subject: [x265] asm: filter_vsp, filter_vss for 64xN, 48x64 in avx2

details:   http://hg.videolan.org/x265/rev/ed2a2ce5ae2e
branches:  
changeset: 10324:ed2a2ce5ae2e
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Wed Apr 29 11:46:53 2015 +0530
description:
asm: filter_vsp, filter_vss for 64xN, 48x64 in avx2

filter_vsp[64x64, 64x32, 64x48, 48x64, 64x16]: 48832c->33182c, 22838c->15159c, 35532c->22386c, 33320c->22436c, 11928c->7625c
filter_vss[64x64, 64x32, 64x48, 48x64, 64x16]: 38361c->33126c, 17764c->15819c, 29908c->24571c, 26276c->24565c, 9161c->8253c
Subject: [x265] asm: filter_vpp, filter_vps for 16x64 in avx2

details:   http://hg.videolan.org/x265/rev/41dd3f329cc5
branches:  
changeset: 10325:41dd3f329cc5
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Wed Apr 29 13:38:02 2015 +0530
description:
asm: filter_vpp, filter_vps for 16x64 in avx2

filter_vpp[16x64]: 4281c->3767c
filter_vps[16x64]: 4295c->3396c
Subject: [x265] asm: filter_vpp, filter_vps for 8x64 in avx2

details:   http://hg.videolan.org/x265/rev/fc8ac7c1c890
branches:  
changeset: 10326:fc8ac7c1c890
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Wed Apr 29 15:03:59 2015 +0530
description:
asm: filter_vpp, filter_vps for 8x64 in avx2

filter_vpp[8x64]: 2083c->1820c
filter_vps[8x64]: 2215c->1722c
Subject: [x265] asm: filter_vpp, filter_vps for 32x64, 32x48 in avx2

details:   http://hg.videolan.org/x265/rev/af75df7046e7
branches:  
changeset: 10327:af75df7046e7
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Wed Apr 29 15:38:36 2015 +0530
description:
asm: filter_vpp, filter_vps for 32x64, 32x48 in avx2

filter_vpp[32x64, 32x48]: 7487c->4072c, 5689c->3038c
filter_vps[32x64, 32x48]: 8026c->5078c, 6166c->3874c
Subject: [x265] api: document x265_api_get()'s ability to forward API requests

details:   http://hg.videolan.org/x265/rev/74d7fe7a81ad
branches:  
changeset: 10328:74d7fe7a81ad
user:      Steve Borho <steve at borho.org>
date:      Wed Apr 29 11:08:44 2015 -0500
description:
api: document x265_api_get()'s ability to forward API requests

diffstat:

 doc/reST/api.rst                     |  27 ++++++++++++++---
 doc/reST/cli.rst                     |   2 +-
 source/CMakeLists.txt                |   4 ++
 source/common/ipfilter.cpp           |   1 +
 source/common/x86/asm-primitives.cpp |  28 +++++++++++++++++++
 source/common/x86/ipfilter8.asm      |  48 ++++++++++++++++++++++----------
 source/common/x86/ipfilter8.h        |   4 ++
 source/encoder/api.cpp               |  53 ++++++++++++++++++++++++++++++++++++
 source/x265.h                        |  10 +++++-
 source/x265cli.h                     |   6 ++--
 10 files changed, 157 insertions(+), 26 deletions(-)

diffs (truncated from 433 to 300 lines):

diff -r c4d9ee2cef03 -r 74d7fe7a81ad doc/reST/api.rst
--- a/doc/reST/api.rst	Tue Apr 28 14:34:45 2015 -0500
+++ b/doc/reST/api.rst	Wed Apr 29 11:08:44 2015 -0500
@@ -375,8 +375,25 @@ the encoder to use (8 or 10), and if tha
 API for bitDepth=0, which returns the system default libx265.
 
 Note that using this multi-library API in your application is only the
-first step. Next your application must dynamically link to libx265 and
-then you must build and install a multi-lib configuration of libx265,
-which includes 8bpp and 16bpp builds of libx265 and a shim library which
-forwards x265_api_get() calls to the appropriate library using dynamic
-loading and binding.
+first step.  Your application must link to one build of libx265
+(statically or dynamically) and this linked version of libx265 will
+support one bit-depth (8 or 10 bits). If you request a different
+bit-depth, the linked libx265 will attempt to dynamically bind a shared
+library libx265 with a name appropriate for the requested bit-depth:
+
+    8-bit:  libx265_main.dll
+    10-bit: libx265_main10.dll
+
+    (the shared library extension is obviously platform specific. On
+    Linux it is .so while on Mac it is .dylib)
+
+For example on Windows, one could package together an x265.exe
+statically linked against the 8bpp libx265 together with a
+libx265_main10.dll in the same folder, and this executable would be able
+to encode 10bit bitstreams by specifying -P main10 on the command line.
+
+On Linux, x265 packagers could install 8bpp static and shared libraries
+under the name libx265 (so all applications link against 8bpp libx265)
+and then also install libx265_main10.so (symlinked to its numbered solib).
+Thus applications which use x265_api_get() will be able to generate main
+or main10 bitstreams.
diff -r c4d9ee2cef03 -r 74d7fe7a81ad doc/reST/cli.rst
--- a/doc/reST/cli.rst	Tue Apr 28 14:34:45 2015 -0500
+++ b/doc/reST/cli.rst	Wed Apr 29 11:08:44 2015 -0500
@@ -422,7 +422,7 @@ frame counts) are only applicable to the
 Profile, Level, Tier
 ====================
 
-.. option:: --profile <string>
+.. option:: --profile, -P <string>
 
 	Enforce the requirements of the specified profile, ensuring the
 	output stream will be decodable by a decoder which supports that
diff -r c4d9ee2cef03 -r 74d7fe7a81ad source/CMakeLists.txt
--- a/source/CMakeLists.txt	Tue Apr 28 14:34:45 2015 -0500
+++ b/source/CMakeLists.txt	Wed Apr 29 11:08:44 2015 -0500
@@ -65,6 +65,10 @@ if(UNIX)
     if(LIBRT)
         list(APPEND PLATFORM_LIBS rt)
     endif()
+    find_library(LIBDL dl)
+    if(LIBDL)
+        list(APPEND PLATFORM_LIBS dl)
+    endif()
     find_package(Numa)
     if(NUMA_FOUND)
         link_directories(${NUMA_LIBRARY_DIR})
diff -r c4d9ee2cef03 -r 74d7fe7a81ad source/common/ipfilter.cpp
--- a/source/common/ipfilter.cpp	Tue Apr 28 14:34:45 2015 -0500
+++ b/source/common/ipfilter.cpp	Wed Apr 29 11:08:44 2015 -0500
@@ -463,6 +463,7 @@ void setupFilterPrimitives_c(EncoderPrim
 
     CHROMA_422(4, 8);
     CHROMA_422(4, 4);
+    CHROMA_422(2, 4);
     CHROMA_422(2, 8);
     CHROMA_422(8,  16);
     CHROMA_422(8,  8);
diff -r c4d9ee2cef03 -r 74d7fe7a81ad source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Apr 28 14:34:45 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Wed Apr 29 11:08:44 2015 -0500
@@ -2291,12 +2291,14 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vss = x265_interp_4tap_vert_ss_16x24_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vss = x265_interp_4tap_vert_ss_12x32_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vss = x265_interp_4tap_vert_ss_4x32_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vss = x265_interp_4tap_vert_ss_2x4_avx2;
 
         //i444 for chroma_vss
         p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vss = x265_interp_4tap_vert_ss_4x4_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vss = x265_interp_4tap_vert_ss_8x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = x265_interp_4tap_vert_ss_16x16_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = x265_interp_4tap_vert_ss_32x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = x265_interp_4tap_vert_ss_64x64_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vss = x265_interp_4tap_vert_ss_8x4_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vss = x265_interp_4tap_vert_ss_4x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = x265_interp_4tap_vert_ss_16x8_avx2;
@@ -2311,6 +2313,12 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vss = x265_interp_4tap_vert_ss_24x32_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = x265_interp_4tap_vert_ss_32x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vss = x265_interp_4tap_vert_ss_8x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = x265_interp_4tap_vert_ss_64x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = x265_interp_4tap_vert_ss_32x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = x265_interp_4tap_vert_ss_64x48_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = x265_interp_4tap_vert_ss_48x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = x265_interp_4tap_vert_ss_64x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = x265_interp_4tap_vert_ss_16x64_avx2;
 
         p.pu[LUMA_16x16].luma_hvpp = x265_interp_8tap_hv_pp_16x16_avx2;
 
@@ -2481,12 +2489,14 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vsp = x265_interp_4tap_vert_sp_16x24_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].filter_vsp = x265_interp_4tap_vert_sp_12x32_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].filter_vsp = x265_interp_4tap_vert_sp_4x32_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_2x4].filter_vsp = x265_interp_4tap_vert_sp_2x4_avx2;
 
         //i444 for chroma_vsp
         p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vsp = x265_interp_4tap_vert_sp_4x4_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vsp = x265_interp_4tap_vert_sp_8x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vsp = x265_interp_4tap_vert_sp_16x16_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = x265_interp_4tap_vert_sp_32x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = x265_interp_4tap_vert_sp_64x64_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_vsp = x265_interp_4tap_vert_sp_8x4_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_4x8].filter_vsp = x265_interp_4tap_vert_sp_4x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vsp = x265_interp_4tap_vert_sp_16x8_avx2;
@@ -2501,6 +2511,12 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vsp = x265_interp_4tap_vert_sp_24x32_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = x265_interp_4tap_vert_sp_32x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vsp = x265_interp_4tap_vert_sp_8x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = x265_interp_4tap_vert_sp_64x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = x265_interp_4tap_vert_sp_32x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = x265_interp_4tap_vert_sp_64x48_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = x265_interp_4tap_vert_sp_48x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vsp = x265_interp_4tap_vert_sp_64x16_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vsp = x265_interp_4tap_vert_sp_16x64_avx2;
 
         //i422 for chroma_vps
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vps = x265_interp_4tap_vert_ps_4x8_avx2;
@@ -2516,6 +2532,10 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vps = x265_interp_4tap_vert_ps_8x4_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = x265_interp_4tap_vert_ps_16x8_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = x265_interp_4tap_vert_ps_32x16_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vps = x265_interp_4tap_vert_ps_16x64_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vps = x265_interp_4tap_vert_ps_8x64_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = x265_interp_4tap_vert_ps_32x64_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = x265_interp_4tap_vert_ps_32x48_avx2;
 
         //i444 for chroma_vps
         p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vps = x265_interp_4tap_vert_ps_4x4_avx2;
@@ -2536,6 +2556,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vps = x265_interp_4tap_vert_ps_24x32_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = x265_interp_4tap_vert_ps_32x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vps = x265_interp_4tap_vert_ps_8x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vps = x265_interp_4tap_vert_ps_16x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = x265_interp_4tap_vert_ps_32x64_avx2;
 
         //i422 for chroma_vpp
         p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].filter_vpp = x265_interp_4tap_vert_pp_4x8_avx2;
@@ -2551,6 +2573,10 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_vpp = x265_interp_4tap_vert_pp_8x4_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = x265_interp_4tap_vert_pp_16x8_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = x265_interp_4tap_vert_pp_32x16_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = x265_interp_4tap_vert_pp_16x64_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vpp = x265_interp_4tap_vert_pp_8x64_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = x265_interp_4tap_vert_pp_32x64_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = x265_interp_4tap_vert_pp_32x48_avx2;
 
         //i444 for chroma_vpp
         p.chroma[X265_CSP_I444].pu[LUMA_4x4].filter_vpp = x265_interp_4tap_vert_pp_4x4_avx2;
@@ -2571,6 +2597,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_vpp = x265_interp_4tap_vert_pp_24x32_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = x265_interp_4tap_vert_pp_32x8_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vpp = x265_interp_4tap_vert_pp_8x32_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = x265_interp_4tap_vert_pp_16x64_avx2;
+        p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = x265_interp_4tap_vert_pp_32x64_avx2;
 
         if (cpuMask & X265_CPU_BMI2)
             p.scanPosLast = x265_scanPosLast_avx2_bmi2;
diff -r c4d9ee2cef03 -r 74d7fe7a81ad source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Tue Apr 28 14:34:45 2015 -0500
+++ b/source/common/x86/ipfilter8.asm	Wed Apr 29 11:08:44 2015 -0500
@@ -5556,9 +5556,9 @@ cglobal interp_4tap_vert_%1_8x16, 4, 7, 
     FILTER_VER_CHROMA_AVX2_8x16 pp
     FILTER_VER_CHROMA_AVX2_8x16 ps
 
-%macro FILTER_VER_CHROMA_AVX2_8x32 1
-INIT_YMM avx2
-cglobal interp_4tap_vert_%1_8x32, 4, 7, 8
+%macro FILTER_VER_CHROMA_AVX2_8xN 2
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_8x%2, 4, 7, 8
     mov             r4d, r4m
     shl             r4d, 6
 
@@ -5578,15 +5578,17 @@ cglobal interp_4tap_vert_%1_8x32, 4, 7, 
     mova            m7, [pw_2000]
 %endif
     lea             r6, [r3 * 3]
-%rep 2
+%rep %2 / 16
     PROCESS_CHROMA_AVX2_W8_16R %1
     lea             r2, [r2 + r3 * 4]
 %endrep
     RET
 %endmacro
 
-    FILTER_VER_CHROMA_AVX2_8x32 pp
-    FILTER_VER_CHROMA_AVX2_8x32 ps
+    FILTER_VER_CHROMA_AVX2_8xN pp, 32
+    FILTER_VER_CHROMA_AVX2_8xN ps, 32
+    FILTER_VER_CHROMA_AVX2_8xN pp, 64
+    FILTER_VER_CHROMA_AVX2_8xN ps, 64
 
 %macro PROCESS_CHROMA_AVX2_W8_4R 0
     movq            xm1, [r0]                       ; m1 = row 0
@@ -6714,10 +6716,10 @@ cglobal interp_4tap_vert_%1_16x12, 4, 6,
     FILTER_VER_CHROMA_AVX2_16x12 pp
     FILTER_VER_CHROMA_AVX2_16x12 ps
 
-%macro FILTER_VER_CHROMA_AVX2_16x32 1
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal interp_4tap_vert_%1_16x32, 4, 8, 8
+%macro FILTER_VER_CHROMA_AVX2_16xN 2
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal interp_4tap_vert_%1_16x%2, 4, 8, 8
     mov             r4d, r4m
     shl             r4d, 6
 
@@ -6737,7 +6739,7 @@ cglobal interp_4tap_vert_%1_16x32, 4, 8,
     mova            m7, [pw_2000]
 %endif
     lea             r6, [r3 * 3]
-    mov             r7d, 2
+    mov             r7d, %2 / 16
 .loopH:
     movu            xm0, [r0]
     vinserti128     m0, m0, [r0 + r1 * 2], 1
@@ -7004,8 +7006,10 @@ cglobal interp_4tap_vert_%1_16x32, 4, 8,
 %endif
 %endmacro
 
-    FILTER_VER_CHROMA_AVX2_16x32 pp
-    FILTER_VER_CHROMA_AVX2_16x32 ps
+    FILTER_VER_CHROMA_AVX2_16xN pp, 32
+    FILTER_VER_CHROMA_AVX2_16xN ps, 32
+    FILTER_VER_CHROMA_AVX2_16xN pp, 64
+    FILTER_VER_CHROMA_AVX2_16xN ps, 64
 
 %macro FILTER_VER_CHROMA_AVX2_24x32 1
 INIT_YMM avx2
@@ -8092,8 +8096,8 @@ cglobal interp_4tap_vert_pp_%1x%2, 4, 6,
     FILTER_V4_W32 32, 64
 
 %macro FILTER_VER_CHROMA_AVX2_32xN 2
-INIT_YMM avx2
-%if ARCH_X86_64 == 1
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
 cglobal interp_4tap_vert_%1_32x%2, 4, 7, 13
     mov             r4d, r4m
     shl             r4d, 6
@@ -8223,10 +8227,14 @@ cglobal interp_4tap_vert_%1_32x%2, 4, 7,
 %endif
 %endmacro
 
+    FILTER_VER_CHROMA_AVX2_32xN pp, 64
+    FILTER_VER_CHROMA_AVX2_32xN pp, 48
     FILTER_VER_CHROMA_AVX2_32xN pp, 32
     FILTER_VER_CHROMA_AVX2_32xN pp, 24
     FILTER_VER_CHROMA_AVX2_32xN pp, 16
     FILTER_VER_CHROMA_AVX2_32xN pp, 8
+    FILTER_VER_CHROMA_AVX2_32xN ps, 64
+    FILTER_VER_CHROMA_AVX2_32xN ps, 48
     FILTER_VER_CHROMA_AVX2_32xN ps, 32
     FILTER_VER_CHROMA_AVX2_32xN ps, 24
     FILTER_VER_CHROMA_AVX2_32xN ps, 16
@@ -15938,8 +15946,10 @@ cglobal interp_4tap_vert_%1_%2x16, 4, 10
 
     FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 16
     FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 32
+    FILTER_VER_CHROMA_S_AVX2_Nx16 sp, 64
     FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 16
     FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 32
+    FILTER_VER_CHROMA_S_AVX2_Nx16 ss, 64
 
 %macro FILTER_VER_CHROMA_S_AVX2_NxN 3
 INIT_YMM avx2
@@ -16002,6 +16012,14 @@ cglobal interp_4tap_vert_%3_%1x%2, 4, 11
     FILTER_VER_CHROMA_S_AVX2_NxN 16, 64, ss
     FILTER_VER_CHROMA_S_AVX2_NxN 24, 64, ss
     FILTER_VER_CHROMA_S_AVX2_NxN 32, 64, ss
+    FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, sp
+    FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, sp
+    FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, sp
+    FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, sp
+    FILTER_VER_CHROMA_S_AVX2_NxN 64, 64, ss
+    FILTER_VER_CHROMA_S_AVX2_NxN 64, 32, ss
+    FILTER_VER_CHROMA_S_AVX2_NxN 64, 48, ss
+    FILTER_VER_CHROMA_S_AVX2_NxN 48, 64, ss
 
 %macro PROCESS_CHROMA_S_AVX2_W8_4R 1
     movu            xm0, [r0]                       ; m0 = row 0
diff -r c4d9ee2cef03 -r 74d7fe7a81ad source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h	Tue Apr 28 14:34:45 2015 -0500
+++ b/source/common/x86/ipfilter8.h	Wed Apr 29 11:08:44 2015 -0500
@@ -739,11 +739,15 @@ CHROMA_422_SS_FILTERS_SSE4(_avx2);


More information about the x265-commits mailing list