[x265-commits] [x265] cmake: introduce fprofile options
Steve Borho
steve at borho.org
Mon Mar 30 22:19:40 CEST 2015
details: http://hg.videolan.org/x265/rev/65d004d54895
branches: stable
changeset: 9937:65d004d54895
user: Steve Borho <steve at borho.org>
date: Sun Mar 29 12:01:11 2015 -0500
description:
cmake: introduce fprofile options
These are hidden from showing up in the GUI, since they come as 'some assembly
required'. The test-harness repo has a script which will drive this process:
1) build with -fprofile-generate
2) run some representative test encodes
3) build with -fprofile-use
By using generic cmake options, we can add support for MSVC and other compilers.
Subject: [x265] cmake: add option for -march=native
details: http://hg.videolan.org/x265/rev/e5495a8cd07c
branches: stable
changeset: 9938:e5495a8cd07c
user: Steve Borho <steve at borho.org>
date: Sun Mar 29 13:24:10 2015 -0500
description:
cmake: add option for -march=native
Subject: [x265] doc: fix typo in --lookahead-slices
details: http://hg.videolan.org/x265/rev/66eff66ec406
branches: stable
changeset: 9939:66eff66ec406
user: Steve Borho <steve at borho.org>
date: Mon Mar 30 10:26:31 2015 -0500
description:
doc: fix typo in --lookahead-slices
Subject: [x265] cli: save and restore console title before and after encoding
details: http://hg.videolan.org/x265/rev/64191dc206b8
branches:
changeset: 9940:64191dc206b8
user: Xinyue Lu <i at 7086.in>
date: Mon Mar 30 04:12:50 2015 -0700
description:
cli: save and restore console title before and after encoding
Subject: [x265] cmake: avoid CMP0054 warnings in main script
details: http://hg.videolan.org/x265/rev/64697f08a047
branches: stable
changeset: 9941:64697f08a047
user: Steve Borho <steve at borho.org>
date: Mon Mar 30 12:43:39 2015 -0500
description:
cmake: avoid CMP0054 warnings in main script
cmake 3.2 will warn about these even though we say we want the old behavior.
Note that if you enable the testbench build, CMP0054 warnings get triggered from
cmake scripts distributed by cmake itself.
Subject: [x265] asm: replace movd+pshufd+vinserti128 with vpbroadcastd instruction
details: http://hg.videolan.org/x265/rev/8f761004c163
branches:
changeset: 9942:8f761004c163
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Mon Mar 30 11:27:36 2015 +0530
description:
asm: replace movd+pshufd+vinserti128 with vpbroadcastd instruction
Subject: [x265] cli: prevent Windows from falling into standby
details: http://hg.videolan.org/x265/rev/0e77bd541284
branches:
changeset: 9943:0e77bd541284
user: Xinyue Lu <i at 7086.in>
date: Mon Mar 30 04:42:47 2015 -0700
description:
cli: prevent Windows from falling into standby
Subject: [x265] Merge with stable
details: http://hg.videolan.org/x265/rev/45bac4cace5e
branches:
changeset: 9944:45bac4cace5e
user: Steve Borho <steve at borho.org>
date: Mon Mar 30 12:44:33 2015 -0500
description:
Merge with stable
diffstat:
doc/reST/cli.rst | 2 +-
source/CMakeLists.txt | 23 +-
source/common/pixel.cpp | 2 +-
source/common/primitives.h | 7 +-
source/common/x86/asm-primitives.cpp | 5 +
source/common/x86/intrapred.h | 3 +
source/common/x86/intrapred8.asm | 990 +++++++++++++++++++++++++++++++++++
source/common/x86/pixel-util.h | 5 +-
source/common/x86/pixel-util8.asm | 125 +++-
source/encoder/search.cpp | 25 +-
source/test/pixelharness.cpp | 33 +-
source/test/pixelharness.h | 3 +-
source/x265.cpp | 9 +
13 files changed, 1185 insertions(+), 47 deletions(-)
diffs (truncated from 1469 to 300 lines):
diff -r 6cd3938d8683 -r 45bac4cace5e doc/reST/cli.rst
--- a/doc/reST/cli.rst Fri Mar 27 22:59:16 2015 -0500
+++ b/doc/reST/cli.rst Mon Mar 30 12:44:33 2015 -0500
@@ -976,7 +976,7 @@ Slice decision options
The encoder may internally lower the number of slices to ensure
each slice codes at least 10 16x16 rows of lowres blocks. If slices
- are used in lookahead, the are logged in the list of tools as
+ are used in lookahead, they are logged in the list of tools as
*lslices*.
**Values:** 0 - disabled (default). 1 is the same as 0. Max 16
diff -r 6cd3938d8683 -r 45bac4cace5e source/CMakeLists.txt
--- a/source/CMakeLists.txt Fri Mar 27 22:59:16 2015 -0500
+++ b/source/CMakeLists.txt Mon Mar 30 12:44:33 2015 -0500
@@ -23,6 +23,11 @@ include(CheckFunctionExists)
include(CheckSymbolExists)
include(CheckCXXCompilerFlag)
+option(FPROFILE_GENERATE "Compile executable to generate usage data" OFF)
+option(FPROFILE_USE "Compile executable using generated usage data" OFF)
+option(NATIVE_BUILD "Target the build CPU" OFF)
+mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
+
# X265_BUILD must be incremented each time the public API is changed
set(X265_BUILD 50)
configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
@@ -88,13 +93,13 @@ if (APPLE)
add_definitions(-DMACOS)
endif()
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Clang")
set(CLANG 1)
endif()
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel")
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
set(INTEL_CXX 1)
endif()
-if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
+if(${CMAKE_CXX_COMPILER_ID} STREQUAL "GNU")
set(GCC 1)
endif()
@@ -141,12 +146,22 @@ if(GCC)
if(ENABLE_PIC)
add_definitions(-fPIC)
endif(ENABLE_PIC)
- if(X86 AND NOT X64)
+ if(NATIVE_BUILD)
+ add_definitions(-march=native)
+ elseif(X86 AND NOT X64)
add_definitions(-march=i686)
endif()
if(ARM)
add_definitions(-march=armv6 -mfloat-abi=hard -mfpu=vfp)
endif()
+ if(FPROFILE_GENERATE)
+ add_definitions(-fprofile-generate)
+ list(APPEND LINKER_OPTIONS "-fprofile-generate")
+ endif(FPROFILE_GENERATE)
+ if(FPROFILE_USE)
+ add_definitions(-fprofile-use)
+ list(APPEND LINKER_OPTIONS "-fprofile-use")
+ endif(FPROFILE_USE)
check_cxx_compiler_flag(-Wno-narrowing CC_HAS_NO_NARROWING)
check_cxx_compiler_flag(-Wno-array-bounds CC_HAS_NO_ARRAY_BOUNDS)
if (CC_HAS_NO_ARRAY_BOUNDS)
diff -r 6cd3938d8683 -r 45bac4cace5e source/common/pixel.cpp
--- a/source/common/pixel.cpp Fri Mar 27 22:59:16 2015 -0500
+++ b/source/common/pixel.cpp Mon Mar 30 12:44:33 2015 -0500
@@ -582,7 +582,7 @@ void pixelavg_pp(pixel* dst, intptr_t ds
}
}
-void scale1D_128to64(pixel *dst, const pixel *src, intptr_t /*stride*/)
+void scale1D_128to64(pixel *dst, const pixel *src)
{
int x;
const pixel* src1 = src;
diff -r 6cd3938d8683 -r 45bac4cace5e source/common/primitives.h
--- a/source/common/primitives.h Fri Mar 27 22:59:16 2015 -0500
+++ b/source/common/primitives.h Mon Mar 30 12:44:33 2015 -0500
@@ -140,7 +140,8 @@ typedef void (*dequant_normal_t)(const i
typedef int(*count_nonzero_t)(const int16_t* quantCoeff);
typedef void (*weightp_pp_t)(const pixel* src, pixel* dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset);
typedef void (*weightp_sp_t)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
-typedef void (*scale_t)(pixel* dst, const pixel* src, intptr_t stride);
+typedef void (*scale1D_t)(pixel* dst, const pixel* src);
+typedef void (*scale2D_t)(pixel* dst, const pixel* src, intptr_t stride);
typedef void (*downscale_t)(const pixel* src0, pixel* dstf, pixel* dsth, pixel* dstv, pixel* dstc,
intptr_t src_stride, intptr_t dst_stride, int width, int height);
typedef void (*extendCURowBorder_t)(pixel* txt, intptr_t stride, int width, int height, int marginX);
@@ -266,8 +267,8 @@ struct EncoderPrimitives
dequant_scaling_t dequant_scaling;
dequant_normal_t dequant_normal;
denoiseDct_t denoiseDct;
- scale_t scale1D_128to64;
- scale_t scale2D_64to32;
+ scale1D_t scale1D_128to64;
+ scale2D_t scale2D_64to32;
ssim_4x4x2_core_t ssim_4x4x2_core;
ssim_end4_t ssim_end_4;
diff -r 6cd3938d8683 -r 45bac4cace5e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Mar 27 22:59:16 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Mon Mar 30 12:44:33 2015 -0500
@@ -1447,6 +1447,8 @@ void setupAssemblyPrimitives(EncoderPrim
#if X86_64
if (cpuMask & X265_CPU_AVX2)
{
+ p.scale2D_64to32 = x265_scale2D_64to32_avx2;
+
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2;
@@ -1642,6 +1644,9 @@ void setupAssemblyPrimitives(EncoderPrim
p.cu[BLOCK_32x32].intra_pred[30] = x265_intra_pred_ang32_30_avx2;
p.cu[BLOCK_32x32].intra_pred[31] = x265_intra_pred_ang32_31_avx2;
p.cu[BLOCK_32x32].intra_pred[32] = x265_intra_pred_ang32_32_avx2;
+ p.cu[BLOCK_32x32].intra_pred[33] = x265_intra_pred_ang32_33_avx2;
+ p.cu[BLOCK_32x32].intra_pred[25] = x265_intra_pred_ang32_25_avx2;
+ p.cu[BLOCK_32x32].intra_pred[24] = x265_intra_pred_ang32_24_avx2;
// copy_sp primitives
p.cu[BLOCK_16x16].copy_sp = x265_blockcopy_sp_16x16_avx2;
diff -r 6cd3938d8683 -r 45bac4cace5e source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Fri Mar 27 22:59:16 2015 -0500
+++ b/source/common/x86/intrapred.h Mon Mar 30 12:44:33 2015 -0500
@@ -212,6 +212,9 @@ void x265_intra_pred_ang32_29_avx2(pixel
void x265_intra_pred_ang32_30_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang32_31_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_intra_pred_ang32_32_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang32_33_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang32_25_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang32_24_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
void x265_all_angs_pred_4x4_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
void x265_all_angs_pred_8x8_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
void x265_all_angs_pred_16x16_sse4(pixel *dest, pixel *refPix, pixel *filtPix, int bLuma);
diff -r 6cd3938d8683 -r 45bac4cace5e source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Fri Mar 27 22:59:16 2015 -0500
+++ b/source/common/x86/intrapred8.asm Mon Mar 30 12:44:33 2015 -0500
@@ -376,6 +376,77 @@ c_ang32_mode_32: db 11, 21, 11, 21, 11
db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11
db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
+
+ALIGN 32
+c_ang32_mode_33: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
+ db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+ db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
+ db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+ db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+ db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
+ db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
+ db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
+ db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+ db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
+ db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
+ db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
+ db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
+ db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+ db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
+ db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+ db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+ db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
+ db 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10
+ db 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30
+ db 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+ db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18
+ db 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
+ db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
+ db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
+
+
+
+ALIGN 32
+c_ang32_mode_25: db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+ db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+ db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+ db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
+ db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+ db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
+ db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
+ db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28
+ db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+ db 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
+ db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ db 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
+ db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8
+ db 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
+ db 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
+
+
+
+ALIGN 32
+c_ang32_mode_24: db 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22
+ db 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12
+ db 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2
+ db 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24
+ db 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14
+ db 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4
+ db 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26
+ db 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+ db 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 21, 11, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6
+ db 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1, 31, 1
+ db 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 4, 28, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23, 9, 23
+ db 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 14, 18, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13, 19, 13
+ db 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 24, 8, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3, 29, 3
+ db 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 2, 30, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25, 7, 25
+ db 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15, 17, 15
+ db 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 22, 10, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5, 27, 5
+ db 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0, 32, 0
+
ALIGN 32
;; (blkSize - 1 - x)
pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0
@@ -13514,5 +13585,924 @@ cglobal intra_pred_ang32_32, 3, 5, 11
vpermq m6, m6, 11011000b
movu [r0 + r3], m6
RET
+
+INIT_YMM avx2
+cglobal intra_pred_ang32_33, 3, 5, 11
+ mova m0, [pw_1024]
+ mova m1, [intra_pred_shuff_0_8]
+ lea r3, [3 * r1]
+ lea r4, [c_ang32_mode_33]
+
+ ;row [0]
+ vbroadcasti128 m2, [r2 + 1]
+ pshufb m2, m1
+ vbroadcasti128 m3, [r2 + 9]
+ pshufb m3, m1
+ vbroadcasti128 m4, [r2 + 17]
+ pshufb m4, m1
+ vbroadcasti128 m5, [r2 + 25]
+ pshufb m5, m1
+
+ mova m10, [r4 + 0 * mmsize]
+ vperm2i128 m6, m2, m3, 00100000b
+ pmaddubsw m6, m10
+ pmulhrsw m6, m0
+ vperm2i128 m7, m4, m5, 00100000b
+ pmaddubsw m7, m10
+ pmulhrsw m7, m0
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ movu [r0], m6
+
+ ;row [1]
+ vbroadcasti128 m2, [r2 + 2]
+ pshufb m2, m1
+ vbroadcasti128 m3, [r2 + 10]
+ pshufb m3, m1
+ vbroadcasti128 m4, [r2 + 18]
+ pshufb m4, m1
+ vbroadcasti128 m5, [r2 + 26]
+ pshufb m5, m1
+
+ mova m10, [r4 + 1 * mmsize]
+ vperm2i128 m6, m2, m3, 00100000b
+ pmaddubsw m6, m10
+ pmulhrsw m6, m0
+ vperm2i128 m7, m4, m5, 00100000b
+ pmaddubsw m7, m10
+ pmulhrsw m7, m0
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ movu [r0 + r1], m6
+
+ ;row [2]
+ vbroadcasti128 m2, [r2 + 3]
+ pshufb m2, m1
+ vbroadcasti128 m3, [r2 + 11]
+ pshufb m3, m1
+ vbroadcasti128 m4, [r2 + 19]
+ pshufb m4, m1
+ vbroadcasti128 m5, [r2 + 27]
+ pshufb m5, m1
+
+ mova m10, [r4 + 2 * mmsize]
+ vperm2i128 m6, m2, m3, 00100000b
+ pmaddubsw m6, m10
+ pmulhrsw m6, m0
+ vperm2i128 m7, m4, m5, 00100000b
+ pmaddubsw m7, m10
+ pmulhrsw m7, m0
+ packuswb m6, m7
+ vpermq m6, m6, 11011000b
+ movu [r0 + 2 * r1], m6
+
+ ;row [3]
+ vbroadcasti128 m2, [r2 + 4]
+ pshufb m2, m1
+ vbroadcasti128 m3, [r2 + 12]
+ pshufb m3, m1
More information about the x265-commits
mailing list