[x265-commits] [x265] cmake: introduce fprofile options

Steve Borho steve at borho.org
Mon Mar 30 22:19:40 CEST 2015


details:   http://hg.videolan.org/x265/rev/65d004d54895
branches:  stable
changeset: 9937:65d004d54895
user:      Steve Borho <steve at borho.org>
date:      Sun Mar 29 12:01:11 2015 -0500
description:
cmake: introduce fprofile options

These are hidden from showing up in the GUI, since they come as 'some assembly
required'. The test-harness repo has a script which will drive this process:

1) build with -fprofile-generate
2) run some representative test encodes
3) build with -fprofile-use

By using generic cmake options, we can add support for MSVC and other compilers.
Subject: [x265] cmake: add option for -march=native

details:   http://hg.videolan.org/x265/rev/e5495a8cd07c
branches:  stable
changeset: 9938:e5495a8cd07c
user:      Steve Borho <steve at borho.org>
date:      Sun Mar 29 13:24:10 2015 -0500
description:
cmake: add option for -march=native
Subject: [x265] doc: fix typo in --lookahead-slices

details:   http://hg.videolan.org/x265/rev/66eff66ec406
branches:  stable
changeset: 9939:66eff66ec406
user:      Steve Borho <steve at borho.org>
date:      Mon Mar 30 10:26:31 2015 -0500
description:
doc: fix typo in --lookahead-slices
Subject: [x265] cli: save and restore console title before and after encoding

details:   http://hg.videolan.org/x265/rev/64191dc206b8
branches:  
changeset: 9940:64191dc206b8
user:      Xinyue Lu <i at 7086.in>
date:      Mon Mar 30 04:12:50 2015 -0700
description:
cli: save and restore console title before and after encoding
Subject: [x265] cmake: avoid CMP0054 warnings in main script

details:   http://hg.videolan.org/x265/rev/64697f08a047
branches:  stable
changeset: 9941:64697f08a047
user:      Steve Borho <steve at borho.org>
date:      Mon Mar 30 12:43:39 2015 -0500
description:
cmake: avoid CMP0054 warnings in main script

cmake 3.2 will warn about these even though we say we want the old behavior.
Note that if you enable the testbench build, CMP0054 warnings get triggered from
cmake scripts distributed by cmake itself.
Subject: [x265] asm: replace movd+pshufd+vinserti128 with vpbroadcastd instruction

details:   http://hg.videolan.org/x265/rev/8f761004c163
branches:  
changeset: 9942:8f761004c163
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Mon Mar 30 11:27:36 2015 +0530
description:
asm: replace movd+pshufd+vinserti128 with vpbroadcastd instruction
Subject: [x265] cli: prevent Windows from falling into standby

details:   http://hg.videolan.org/x265/rev/0e77bd541284
branches:  
changeset: 9943:0e77bd541284
user:      Xinyue Lu <i at 7086.in>
date:      Mon Mar 30 04:42:47 2015 -0700
description:
cli: prevent Windows from falling into standby
Subject: [x265] Merge with stable

details:   http://hg.videolan.org/x265/rev/45bac4cace5e
branches:  
changeset: 9944:45bac4cace5e
user:      Steve Borho <steve at borho.org>
date:      Mon Mar 30 12:44:33 2015 -0500
description:
Merge with stable

diffstat:

 doc/reST/cli.rst                     |    2 +-
 source/CMakeLists.txt                |   23 +-
 source/common/pixel.cpp              |    2 +-
 source/common/primitives.h           |    7 +-
 source/common/x86/asm-primitives.cpp |    5 +
 source/common/x86/intrapred.h        |    3 +
 source/common/x86/intrapred8.asm     |  990 +++++++++++++++++++++++++++++++++++
 source/common/x86/pixel-util.h       |    5 +-
 source/common/x86/pixel-util8.asm    |  125 +++-
 source/encoder/search.cpp            |   25 +-
 source/test/pixelharness.cpp         |   33 +-
 source/test/pixelharness.h           |    3 +-
 source/x265.cpp                      |    9 +
 13 files changed, 1185 insertions(+), 47 deletions(-)

diffs (truncated from 1469 to 300 lines):

diff -r 6cd3938d8683 -r 45bac4cace5e doc/reST/cli.rst
--- a/doc/reST/cli.rst	Fri Mar 27 22:59:16 2015 -0500
+++ b/doc/reST/cli.rst	Mon Mar 30 12:44:33 2015 -0500
@@ -976,7 +976,7 @@ Slice decision options
 
 	The encoder may internally lower the number of slices to ensure
 	each slice codes at least 10 16x16 rows of lowres blocks. If slices
-	are used in lookahead, the are logged in the list of tools as
+	are used in lookahead, they are logged in the list of tools as
 	*lslices*.
 	
 	**Values:** 0 - disabled (default). 1 is the same as 0. Max 16
diff -r 6cd3938d8683 -r 45bac4cace5e source/CMakeLists.txt
--- a/source/CMakeLists.txt	Fri Mar 27 22:59:16 2015 -0500
+++ b/source/CMakeLists.txt	Mon Mar 30 12:44:33 2015 -0500
@@ -23,6 +23,11 @@ include(CheckFunctionExists)
 include(CheckSymbolExists)
 include(CheckCXXCompilerFlag)
 
+option(FPROFILE_GENERATE "Compile executable to generate usage data" OFF)
+option(FPROFILE_USE "Compile executable using generated usage data" OFF)
+option(NATIVE_BUILD "Target the build CPU" OFF)
+mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
+
 # X265_BUILD must be incremented each time the public API is changed
 set(X265_BUILD 50)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
@@ -88,13 +93,13 @@ if (APPLE)
   add_definitions(-DMACOS)
 endif()
 
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
     set(CLANG 1)
 endif()
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
     set(INTEL_CXX 1)
 endif()
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
     set(GCC 1)
 endif()
 
@@ -141,12 +146,22 @@ if(GCC)
     if(ENABLE_PIC)
          add_definitions(-fPIC)
     endif(ENABLE_PIC)
-    if(X86 AND NOT X64)
+    if(NATIVE_BUILD)
+        add_definitions(-march=native)
+    elseif(X86 AND NOT X64)
         add_definitions(-march=i686)
     endif()
     if(ARM)
         add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
     endif()
+    if(FPROFILE_GENERATE)
+        add_definitions(-fprofile-generate)
+        list(APPEND LINKER_OPTIONS "-fprofile-generate")
+    endif(FPROFILE_GENERATE)
+    if(FPROFILE_USE)
+        add_definitions(-fprofile-use)
+        list(APPEND LINKER_OPTIONS "-fprofile-use")
+    endif(FPROFILE_USE)
     check_cxx_compiler_flag(-Wno-narrowing CC_HAS_NO_NARROWING) 
     check_cxx_compiler_flag(-Wno-array-bounds CC_HAS_NO_ARRAY_BOUNDS) 
     if (CC_HAS_NO_ARRAY_BOUNDS)
diff -r 6cd3938d8683 -r 45bac4cace5e source/common/pixel.cpp
--- a/source/common/pixel.cpp	Fri Mar 27 22:59:16 2015 -0500
+++ b/source/common/pixel.cpp	Mon Mar 30 12:44:33 2015 -0500
@@ -582,7 +582,7 @@ void pixelavg_pp(pixel* dst, intptr_t ds
     }
 }
 
-void scale1D_128to64(pixel *dst, const pixel *src, intptr_t /*stride*/)
+void scale1D_128to64(pixel *dst, const pixel *src)
 {
     int x;
     const pixel* src1 = src;
diff -r 6cd3938d8683 -r 45bac4cace5e source/common/primitives.h
--- a/source/common/primitives.h	Fri Mar 27 22:59:16 2015 -0500
+++ b/source/common/primitives.h	Mon Mar 30 12:44:33 2015 -0500
@@ -140,7 +140,8 @@ typedef void (*dequant_normal_t)(const i
 typedef int(*count_nonzero_t)(const int16_t* quantCoeff);
 typedef void (*weightp_pp_t)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
 typedef void (*weightp_sp_t)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
-typedef void (*scale_t)(pixel* dst, const pixel* src, intptr_t stride);
+typedef void (*scale1D_t)(pixel* dst, const pixel* src);
+typedef void (*scale2D_t)(pixel* dst, const pixel* src, intptr_t stride);
 typedef void (*downscale_t)(const pixel* src0, pixel* dstf, pixel* dsth, pixel* dstv, pixel* dstc,
                             intptr_t src_stride, intptr_t dst_stride, int width, int height);
 typedef void (*extendCURowBorder_t)(pixel* txt, intptr_t stride, int width, int height, int marginX);
@@ -266,8 +267,8 @@ struct EncoderPrimitives
     dequant_scaling_t     dequant_scaling;
     dequant_normal_t      dequant_normal;
     denoiseDct_t          denoiseDct;
-    scale_t               scale1D_128to64;
-    scale_t               scale2D_64to32;
+    scale1D_t             scale1D_128to64;
+    scale2D_t             scale2D_64to32;
 
     ssim_4x4x2_core_t     ssim_4x4x2_core;
     ssim_end4_t           ssim_end_4;
diff -r 6cd3938d8683 -r 45bac4cace5e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Mar 27 22:59:16 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Mon Mar 30 12:44:33 2015 -0500
@@ -1447,6 +1447,8 @@ void setupAssemblyPrimitives(EncoderPrim
 #if X86_64
     if (cpuMask & X265_CPU_AVX2)
     {
+        p.scale2D_64to32 = x265_scale2D_64to32_avx2;
+
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
         p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
         p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2;
@@ -1642,6 +1644,9 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_32x32].intra_pred[30] = x265_intra_pred_ang32_30_avx2;
         p.cu[BLOCK_32x32].intra_pred[31] = x265_intra_pred_ang32_31_avx2;
         p.cu[BLOCK_32x32].intra_pred[32] = x265_intra_pred_ang32_32_avx2;
+        p.cu[BLOCK_32x32].intra_pred[33] = x265_intra_pred_ang32_33_avx2;
+        p.cu[BLOCK_32x32].intra_pred[25] = x265_intra_pred_ang32_25_avx2;
+        p.cu[BLOCK_32x32].intra_pred[24] = x265_intra_pred_ang32_24_avx2;
 
         // copy_sp primitives
         p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
diff -r 6cd3938d8683 -r 45bac4cace5e source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Fri Mar 27 22:59:16 2015 -0500
+++ b/source/common/x86/intrapred.h	Mon Mar 30 12:44:33 2015 -0500
@@ -212,6 +212,9 @@ void x265_intra_pred_ang32_29_avx2(pixel
 void x265_intra_pred_ang32_30_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_31_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang32_33_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang32_25_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang32_24_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
 void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
diff -r 6cd3938d8683 -r 45bac4cace5e source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Fri Mar 27 22:59:16 2015 -0500
+++ b/source/common/x86/intrapred8.asm	Mon Mar 30 12:44:33 2015 -0500
@@ -376,6 +376,77 @@ c_ang32_mode_32:   db 11, 21, 11, 21, 11
                    db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
                    db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
 
+
+ALIGN 32
+c_ang32_mode_33:   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
+                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
+                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
+                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
+                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
+                   db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
+                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
+                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
+                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
+                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
+                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
+                   db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
+                   db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
+                   db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
+                   db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
+                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
+                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
+
+
+
+ALIGN 32
+c_ang32_mode_25:   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+                   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
+                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
+                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
+                   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+                   db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+                   db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+                   db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
+                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+                   db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
+                   db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
+
+
+
+ALIGN 32
+c_ang32_mode_24:   db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
+                   db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
+                   db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
+                   db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+                   db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
+                   db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
+                   db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
+                   db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+                   db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
+                   db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
+                   db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
+                   db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
+                   db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
+                   db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
+                   db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
+                   db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
+                   db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
+
 ALIGN 32
 ;; (blkSize - 1 - x)
 pw_planar4_0:         dw 3,  2,  1,  0,  3,  2,  1,  0
@@ -13514,5 +13585,924 @@ cglobal intra_pred_ang32_32, 3, 5, 11
     vpermq            m6, m6, 11011000b
     movu              [r0 + r3], m6
     RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang32_33, 3, 5, 11
+    mova              m0, [pw_1024]
+    mova              m1, [intra_pred_shuff_0_8]
+    lea               r3, [3 * r1]
+    lea               r4, [c_ang32_mode_33]
+
+    ;row [0]
+    vbroadcasti128    m2, [r2 + 1]
+    pshufb            m2, m1
+    vbroadcasti128    m3, [r2 + 9]
+    pshufb            m3, m1
+    vbroadcasti128    m4, [r2 + 17]
+    pshufb            m4, m1
+    vbroadcasti128    m5, [r2 + 25]
+    pshufb            m5, m1
+
+    mova              m10, [r4 + 0 * mmsize]
+    vperm2i128        m6, m2, m3, 00100000b
+    pmaddubsw         m6, m10
+    pmulhrsw          m6, m0
+    vperm2i128        m7, m4, m5, 00100000b
+    pmaddubsw         m7, m10
+    pmulhrsw          m7, m0
+    packuswb          m6, m7
+    vpermq            m6, m6, 11011000b
+    movu              [r0], m6
+
+    ;row [1]
+    vbroadcasti128    m2, [r2 + 2]
+    pshufb            m2, m1
+    vbroadcasti128    m3, [r2 + 10]
+    pshufb            m3, m1
+    vbroadcasti128    m4, [r2 + 18]
+    pshufb            m4, m1
+    vbroadcasti128    m5, [r2 + 26]
+    pshufb            m5, m1
+
+    mova              m10, [r4 + 1 * mmsize]
+    vperm2i128        m6, m2, m3, 00100000b
+    pmaddubsw         m6, m10
+    pmulhrsw          m6, m0
+    vperm2i128        m7, m4, m5, 00100000b
+    pmaddubsw         m7, m10
+    pmulhrsw          m7, m0
+    packuswb          m6, m7
+    vpermq            m6, m6, 11011000b
+    movu              [r0 + r1], m6
+
+    ;row [2]
+    vbroadcasti128    m2, [r2 + 3]
+    pshufb            m2, m1
+    vbroadcasti128    m3, [r2 + 11]
+    pshufb            m3, m1
+    vbroadcasti128    m4, [r2 + 19]
+    pshufb            m4, m1
+    vbroadcasti128    m5, [r2 + 27]
+    pshufb            m5, m1
+
+    mova              m10, [r4 + 2 * mmsize]
+    vperm2i128        m6, m2, m3, 00100000b
+    pmaddubsw         m6, m10
+    pmulhrsw          m6, m0
+    vperm2i128        m7, m4, m5, 00100000b
+    pmaddubsw         m7, m10
+    pmulhrsw          m7, m0
+    packuswb          m6, m7
+    vpermq            m6, m6, 11011000b
+    movu              [r0 + 2 * r1], m6
+
+    ;row [3]
+    vbroadcasti128    m2, [r2 + 4]
+    pshufb            m2, m1
+    vbroadcasti128    m3, [r2 + 12]
+    pshufb            m3, m1


More information about the x265-commits mailing list