[x265-commits] [x265] asm: new algorithm for intra_ang_32 modes 3 & 33, improve...
Dnyaneshwar G
dnyaneshwar at multicorewareinc.com
Wed Jul 22 19:57:30 CEST 2015
details: http://hg.videolan.org/x265/rev/f60490f762b2
branches:
changeset: 10835:f60490f762b2
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Mon Jul 13 12:16:57 2015 +0530
description:
asm: new algorithm for intra_ang_32 modes 3 & 33, over 50% improvement over the previous asm
Subject: [x265] stats: average luma and chroma distortion per frame
details: http://hg.videolan.org/x265/rev/cb9ef421e875
branches:
changeset: 10836:cb9ef421e875
user: Divya Manivannan <divya at multicorewareinc.com>
date: Mon Jul 13 16:00:36 2015 +0530
description:
stats: average luma and chroma distortion per frame
Subject: [x265] Fixed POWER CPU architecture detection
details: http://hg.videolan.org/x265/rev/304baf3d1028
branches:
changeset: 10837:304baf3d1028
user: Peter Kovář <peter.kovar at reflexion.tv>
date: Fri Jul 03 06:29:07 2015 -0400
description:
Fixed POWER CPU architecture detection
Subject: [x265] asm: apply new algorithm on upShift_8_sse4
details: http://hg.videolan.org/x265/rev/46152345eb6f
branches:
changeset: 10838:46152345eb6f
user: Min Chen <chenm003 at 163.com>
date: Mon Jul 20 17:18:54 2015 -0700
description:
asm: apply new algorithm on upShift_8_sse4
Subject: [x265] aq: new auto variance mode with biasing to dark scenes
details: http://hg.videolan.org/x265/rev/26829cf58345
branches:
changeset: 10839:26829cf58345
user: Santhoshini Sekar <santhoshini at multicorewareinc.com>
date: Wed Jun 17 12:02:33 2015 +0530
description:
aq: new auto variance mode with biasing to dark scenes
Subject: [x265] asm: fix Main12 Assembly error and disable fault functions, now we work with assembly up to AVX
details: http://hg.videolan.org/x265/rev/9ef62719aa2d
branches:
changeset: 10840:9ef62719aa2d
user: Min Chen <chenm003 at 163.com>
date: Tue Jul 21 14:30:11 2015 -0700
description:
asm: fix Main12 Assembly error and disable fault functions, now we work with assembly up to AVX
Subject: [x265] asm: disable Main12 fault functions on AVX2
details: http://hg.videolan.org/x265/rev/29d3289189e7
branches:
changeset: 10841:29d3289189e7
user: Min Chen <chenm003 at 163.com>
date: Tue Jul 21 14:30:14 2015 -0700
description:
asm: disable Main12 fault functions on AVX2
Subject: [x265] asm: fix Main12 fault on intra_dc_avx2
details: http://hg.videolan.org/x265/rev/ff162d8ce551
branches:
changeset: 10842:ff162d8ce551
user: Min Chen <chenm003 at 163.com>
date: Tue Jul 21 14:30:16 2015 -0700
description:
asm: fix Main12 fault on intra_dc_avx2
Subject: [x265] asm: fix Main12 fault on saoCuOrgB0_avx2
details: http://hg.videolan.org/x265/rev/daf94621d40c
branches:
changeset: 10843:daf94621d40c
user: Min Chen <chenm003 at 163.com>
date: Tue Jul 21 14:30:19 2015 -0700
description:
asm: fix Main12 fault on saoCuOrgB0_avx2
Subject: [x265] slicetype: nit
details: http://hg.videolan.org/x265/rev/42bc8575020b
branches:
changeset: 10844:42bc8575020b
user: Steve Borho <steve at borho.org>
date: Wed Jul 22 12:56:34 2015 -0500
description:
slicetype: nit
diffstat:
doc/reST/cli.rst | 3 +-
source/CMakeLists.txt | 8 +-
source/common/framedata.h | 5 +
source/common/param.cpp | 2 +-
source/common/x86/asm-primitives.cpp | 34 +
source/common/x86/const-a.asm | 1 +
source/common/x86/intrapred16.asm | 74 +-
source/common/x86/intrapred8.asm | 880 ++++++++---------------
source/common/x86/ipfilter16.asm | 1257 ++++++++++++++++-----------------
source/common/x86/loopfilter.asm | 54 +-
source/common/x86/mc-a.asm | 176 ++--
source/common/x86/pixel-a.asm | 92 +-
source/common/x86/pixel-util8.asm | 24 +-
source/encoder/encoder.cpp | 2 +
source/encoder/frameencoder.cpp | 18 +-
source/encoder/search.cpp | 27 +-
source/encoder/search.h | 10 +
source/encoder/slicetype.cpp | 19 +-
source/test/pixelharness.cpp | 3 +-
source/x265-extras.cpp | 2 +
source/x265.h | 3 +
source/x265cli.h | 2 +-
22 files changed, 1234 insertions(+), 1462 deletions(-)
diffs (truncated from 4830 to 300 lines):
diff -r b2ba7df1fc69 -r 42bc8575020b doc/reST/cli.rst
--- a/doc/reST/cli.rst Thu Jul 16 19:36:35 2015 -0700
+++ b/doc/reST/cli.rst Wed Jul 22 12:56:34 2015 -0500
@@ -1233,7 +1233,7 @@ Quality, rate control and rate distortio
ignored. Slower presets will generally achieve better compression
efficiency (and generate smaller bitstreams). Default disabled.
-.. option:: --aq-mode <0|1|2>
+.. option:: --aq-mode <0|1|2|3>
Adaptive Quantization operating mode. Raise or lower per-block
quantization based on complexity analysis of the source image. The
@@ -1244,6 +1244,7 @@ Quality, rate control and rate distortio
0. disabled
1. AQ enabled **(default)**
2. AQ enabled with auto-variance
+ 3. AQ enabled with auto-variance and bias to dark scenes
.. option:: --aq-strength <float>
diff -r b2ba7df1fc69 -r 42bc8575020b source/CMakeLists.txt
--- a/source/CMakeLists.txt Thu Jul 16 19:36:35 2015 -0700
+++ b/source/CMakeLists.txt Wed Jul 22 12:56:34 2015 -0500
@@ -30,7 +30,7 @@ option(STATIC_LINK_CRT "Statically link
mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
# X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 64)
+set(X265_BUILD 66)
configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
"${PROJECT_BINARY_DIR}/x265.def")
configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -42,6 +42,8 @@ SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_
string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
set(X86_ALIASES x86 i386 i686 x86_64 amd64)
list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
+set(POWER_ALIASES ppc64 ppc64le)
+list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
message(STATUS "Detected x86 target processor")
set(X86 1)
@@ -50,6 +52,10 @@ if("${SYSPROC}" STREQUAL "" OR X86MATCH
set(X64 1)
add_definitions(-DX86_64=1)
endif()
+elseif(POWERMATCH GREATER "-1")
+ message(STATUS "Detected POWER target processor")
+ set(POWER 1)
+ add_definitions(-DX265_ARCH_POWER=1)
elseif(${SYSPROC} STREQUAL "armv6l")
message(STATUS "Detected ARM target processor")
set(ARM 1)
diff -r b2ba7df1fc69 -r 42bc8575020b source/common/framedata.h
--- a/source/common/framedata.h Thu Jul 16 19:36:35 2015 -0700
+++ b/source/common/framedata.h Wed Jul 22 12:56:34 2015 -0500
@@ -52,6 +52,8 @@ struct FrameStats
double percent8x8Intra;
double percent8x8Inter;
double percent8x8Skip;
+ double avgLumaDistortion;
+ double avgChromaDistortion;
double percentIntraNxN;
double percentSkipCu[NUM_CU_DEPTH];
double percentMergeCu[NUM_CU_DEPTH];
@@ -60,6 +62,9 @@ struct FrameStats
uint64_t cntIntraNxN;
uint64_t totalCu;
+ uint64_t totalCtu;
+ uint64_t lumaDistortion;
+ uint64_t chromaDistortion;
uint64_t cntSkipCu[NUM_CU_DEPTH];
uint64_t cntMergeCu[NUM_CU_DEPTH];
uint64_t cntInter[NUM_CU_DEPTH];
diff -r b2ba7df1fc69 -r 42bc8575020b source/common/param.cpp
--- a/source/common/param.cpp Thu Jul 16 19:36:35 2015 -0700
+++ b/source/common/param.cpp Wed Jul 22 12:56:34 2015 -0500
@@ -1086,7 +1086,7 @@ int x265_check_params(x265_param* param)
"Lookahead depth must be less than 256");
CHECK(param->lookaheadSlices > 16 || param->lookaheadSlices < 0,
"Lookahead slices must between 0 and 16");
- CHECK(param->rc.aqMode < X265_AQ_NONE || X265_AQ_AUTO_VARIANCE < param->rc.aqMode,
+ CHECK(param->rc.aqMode < X265_AQ_NONE || X265_AQ_AUTO_VARIANCE_BIASED < param->rc.aqMode,
"Aq-Mode is out of range");
CHECK(param->rc.aqStrength < 0 || param->rc.aqStrength > 3,
"Aq-Strength is out of range");
diff -r b2ba7df1fc69 -r 42bc8575020b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jul 16 19:36:35 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Wed Jul 22 12:56:34 2015 -0500
@@ -1043,7 +1043,9 @@ void setupAssemblyPrimitives(EncoderPrim
// p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
ALL_LUMA_PU(satd, pixel_satd, ssse3);
+#if X265_DEPTH <= 10
ASSIGN_SA8D(ssse3);
+#endif
INTRA_ANG_SSSE3(ssse3);
p.dst4x4 = PFX(dst4_ssse3);
@@ -1126,14 +1128,18 @@ void setupAssemblyPrimitives(EncoderPrim
// p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sse4); fails tests
ALL_LUMA_PU(satd, pixel_satd, sse4);
+#if X265_DEPTH <= 10
ASSIGN_SA8D(sse4);
+#endif
p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
+#if X265_DEPTH <= 10
ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
+#endif
ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
INTRA_ANG_SSE4_COMMON(sse4);
INTRA_ANG_SSE4_HIGH(sse4);
@@ -1147,7 +1153,9 @@ void setupAssemblyPrimitives(EncoderPrim
// TODO: check POPCNT flag!
ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
+#if X265_DEPTH <= 10
ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
+#endif
ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
@@ -1184,7 +1192,9 @@ void setupAssemblyPrimitives(EncoderPrim
p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);
+#if X265_DEPTH <= 10
ASSIGN_SA8D(avx);
+#endif
p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);
p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);
@@ -1292,7 +1302,9 @@ void setupAssemblyPrimitives(EncoderPrim
{
//p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_xop); this one is broken
ALL_LUMA_PU(satd, pixel_satd, xop);
+#if X265_DEPTH <= 10
ASSIGN_SA8D(xop);
+#endif
LUMA_VAR(xop);
p.frameInitLowres = PFX(frame_init_lowres_core_xop);
}
@@ -1464,12 +1476,14 @@ void setupAssemblyPrimitives(EncoderPrim
p.cu[BLOCK_32x32].psy_cost_ss = PFX(psyCost_ss_32x32_avx2);
p.cu[BLOCK_64x64].psy_cost_ss = PFX(psyCost_ss_64x64_avx2);
p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
+#if X265_DEPTH <= 10
p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
+#endif
p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
@@ -1527,15 +1541,19 @@ void setupAssemblyPrimitives(EncoderPrim
p.quant = PFX(quant_avx2);
p.nquant = PFX(nquant_avx2);
+#if X265_DEPTH <= 10
p.dequant_normal = PFX(dequant_normal_avx2);
p.dequant_scaling = PFX(dequant_scaling_avx2);
+#endif
p.dst4x4 = PFX(dst4_avx2);
p.idst4x4 = PFX(idst4_avx2);
p.denoiseDct = PFX(denoise_dct_avx2);
p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
+#if X265_DEPTH <= 10
p.weight_pp = PFX(weight_pp_avx2);
+#endif
p.weight_sp = PFX(weight_sp_avx2);
p.sign = PFX(calSign_avx2);
p.planecopy_cp = PFX(upShift_8_avx2);
@@ -1562,15 +1580,21 @@ void setupAssemblyPrimitives(EncoderPrim
p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx2);
p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2);
+#if X265_DEPTH <= 10
ALL_LUMA_TU_S(dct, dct, avx2);
ALL_LUMA_TU_S(idct, idct, avx2);
+#endif
ALL_LUMA_CU_S(transpose, transpose, avx2);
ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2);
ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, avx2);
+#if X265_DEPTH <= 10
ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
+#endif
ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
+#if X265_DEPTH <= 10
p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_avx2); // since ALL_LUMA_PU didn't declare 4x4 size, calling separately luma_vsp function to use
+#endif
p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
@@ -1593,6 +1617,7 @@ void setupAssemblyPrimitives(EncoderPrim
p.pu[LUMA_16x12].sad = PFX(pixel_sad_16x12_avx2);
p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx2);
p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx2);
+#if X265_DEPTH <= 10
p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx2);
p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx2);
p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx2);
@@ -1604,6 +1629,7 @@ void setupAssemblyPrimitives(EncoderPrim
p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx2);
p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx2);
p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx2);
+#endif
p.pu[LUMA_16x4].sad_x3 = PFX(pixel_sad_x3_16x4_avx2);
p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx2);
@@ -1678,6 +1704,7 @@ void setupAssemblyPrimitives(EncoderPrim
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
+#if X265_DEPTH <= 10
p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_avx2);
@@ -1703,6 +1730,7 @@ void setupAssemblyPrimitives(EncoderPrim
p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx2);
p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx2);
p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_avx2);
+#endif
p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_avx2);
p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_avx2);
@@ -1730,6 +1758,7 @@ void setupAssemblyPrimitives(EncoderPrim
p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_avx2);
p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx2);
+#if X265_DEPTH <= 10
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx2);
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx2);
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx2);
@@ -2127,11 +2156,15 @@ void setupAssemblyPrimitives(EncoderPrim
p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx2);
p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx2);
p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
+#endif
p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
+#if X265_DEPTH <= 10
+ // TODO: depends on hps and vsp
ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu); // calling luma_hvpp for all sizes
p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>; // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4]
+#endif
if (cpuMask & X265_CPU_BMI2)
p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
@@ -2958,6 +2991,7 @@ void setupAssemblyPrimitives(EncoderPrim
p.cu[BLOCK_32x32].intra_pred[22] = PFX(intra_pred_ang32_22_avx2);
p.cu[BLOCK_32x32].intra_pred[21] = PFX(intra_pred_ang32_21_avx2);
p.cu[BLOCK_32x32].intra_pred[18] = PFX(intra_pred_ang32_18_avx2);
+ p.cu[BLOCK_32x32].intra_pred[3] = PFX(intra_pred_ang32_3_avx2);
// all_angs primitives
p.cu[BLOCK_4x4].intra_pred_allangs = PFX(all_angs_pred_4x4_avx2);
diff -r b2ba7df1fc69 -r 42bc8575020b source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Thu Jul 16 19:36:35 2015 -0700
+++ b/source/common/x86/const-a.asm Wed Jul 22 12:56:34 2015 -0500
@@ -79,6 +79,7 @@ const pw_257, times 16 dw
const pw_512, times 16 dw 512
const pw_1023, times 16 dw 1023
const pw_1024, times 16 dw 1024
+const pw_2048, times 16 dw 2048
const pw_4096, times 16 dw 4096
const pw_8192, times 8 dw 8192
const pw_00ff, times 16 dw 0x00ff
diff -r b2ba7df1fc69 -r 42bc8575020b source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Thu Jul 16 19:36:35 2015 -0700
+++ b/source/common/x86/intrapred16.asm Wed Jul 22 12:56:34 2015 -0500
@@ -473,14 +473,14 @@ cglobal intra_pred_dc16, 3, 9, 4
add r1d, r1d
movu m0, [r2 + 66]
movu m2, [r2 + 2]
- paddw m0, m2
+ paddw m0, m2 ; dynamic range 13 bits
vextracti128 xm1, m0, 1
- paddw xm0, xm1
+ paddw xm0, xm1 ; dynamic range 14 bits
movhlps xm1, xm0
- paddw xm0, xm1
- phaddw xm0, xm0
+ paddw xm0, xm1 ; dynamic range 15 bits
pmaddwd xm0, [pw_1]
+ phaddd xm0, xm0
paddd xm0, [pd_16]
More information about the x265-commits
mailing list