[x265-commits] [x265] asm: new algorithm for intra_ang_32 modes 3 & 33, improve...

Wed Jul 22 19:57:30 CEST 2015

details:   http://hg.videolan.org/x265/rev/f60490f762b2
branches:  
changeset: 10835:f60490f762b2
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Mon Jul 13 12:16:57 2015 +0530
description:
asm: new algorithm for intra_ang_32 modes 3 & 33, improved over 50% than previous asm
Subject: [x265] stats: average luma and chroma distortion per frame

details:   http://hg.videolan.org/x265/rev/cb9ef421e875
branches:  
changeset: 10836:cb9ef421e875
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Mon Jul 13 16:00:36 2015 +0530
description:
stats: average luma and chroma distortion per frame
Subject: [x265] Fixed POWER CPU architecture detection

details:   http://hg.videolan.org/x265/rev/304baf3d1028
branches:  
changeset: 10837:304baf3d1028
user:      Peter Kovář <peter.kovar at reflexion.tv>
date:      Fri Jul 03 06:29:07 2015 -0400
description:
Fixed POWER CPU architecture detection
Subject: [x265] asm: apply new algorithm on upShift_8_sse4

details:   http://hg.videolan.org/x265/rev/46152345eb6f
branches:  
changeset: 10838:46152345eb6f
user:      Min Chen <chenm003 at 163.com>
date:      Mon Jul 20 17:18:54 2015 -0700
description:
asm: apply new algorithm on upShift_8_sse4
Subject: [x265] aq: new auto variance mode with biasing to dark scenes

details:   http://hg.videolan.org/x265/rev/26829cf58345
branches:  
changeset: 10839:26829cf58345
user:      Santhoshini Sekar <santhoshini at multicorewareinc.com>
date:      Wed Jun 17 12:02:33 2015 +0530
description:
aq: new auto variance mode with biasing to dark scenes
Subject: [x265] asm: fix Main12 Assembly error and disable fault functions, now we are work with assembly up to AVX

details:   http://hg.videolan.org/x265/rev/9ef62719aa2d
branches:  
changeset: 10840:9ef62719aa2d
user:      Min Chen <chenm003 at 163.com>
date:      Tue Jul 21 14:30:11 2015 -0700
description:
asm: fix Main12 Assembly error and disable fault functions, now we are work with assembly up to AVX
Subject: [x265] asm: disable Main12 fault functions on AVX2

details:   http://hg.videolan.org/x265/rev/29d3289189e7
branches:  
changeset: 10841:29d3289189e7
user:      Min Chen <chenm003 at 163.com>
date:      Tue Jul 21 14:30:14 2015 -0700
description:
asm: disable Main12 fault functions on AVX2
Subject: [x265] asm: fix Main12 fault on intra_dc_avx2

details:   http://hg.videolan.org/x265/rev/ff162d8ce551
branches:  
changeset: 10842:ff162d8ce551
user:      Min Chen <chenm003 at 163.com>
date:      Tue Jul 21 14:30:16 2015 -0700
description:
asm: fix Main12 fault on intra_dc_avx2
Subject: [x265] asm: fix Main12 fault on saoCuOrgB0_avx2

details:   http://hg.videolan.org/x265/rev/daf94621d40c
branches:  
changeset: 10843:daf94621d40c
user:      Min Chen <chenm003 at 163.com>
date:      Tue Jul 21 14:30:19 2015 -0700
description:
asm: fix Main12 fault on saoCuOrgB0_avx2
Subject: [x265] slicetype: nit

details:   http://hg.videolan.org/x265/rev/42bc8575020b
branches:  
changeset: 10844:42bc8575020b
user:      Steve Borho <steve at borho.org>
date:      Wed Jul 22 12:56:34 2015 -0500
description:
slicetype: nit

diffstat:

 doc/reST/cli.rst                     |     3 +-
 source/CMakeLists.txt                |     8 +-
 source/common/framedata.h            |     5 +
 source/common/param.cpp              |     2 +-
 source/common/x86/asm-primitives.cpp |    34 +
 source/common/x86/const-a.asm        |     1 +
 source/common/x86/intrapred16.asm    |    74 +-
 source/common/x86/intrapred8.asm     |   880 ++++++++---------------
 source/common/x86/ipfilter16.asm     |  1257 ++++++++++++++++-----------------
 source/common/x86/loopfilter.asm     |    54 +-
 source/common/x86/mc-a.asm           |   176 ++--
 source/common/x86/pixel-a.asm        |    92 +-
 source/common/x86/pixel-util8.asm    |    24 +-
 source/encoder/encoder.cpp           |     2 +
 source/encoder/frameencoder.cpp      |    18 +-
 source/encoder/search.cpp            |    27 +-
 source/encoder/search.h              |    10 +
 source/encoder/slicetype.cpp         |    19 +-
 source/test/pixelharness.cpp         |     3 +-
 source/x265-extras.cpp               |     2 +
 source/x265.h                        |     3 +
 source/x265cli.h                     |     2 +-
 22 files changed, 1234 insertions(+), 1462 deletions(-)

diffs (truncated from 4830 to 300 lines):

diff -r b2ba7df1fc69 -r 42bc8575020b doc/reST/cli.rst

--- a/doc/reST/cli.rst	Thu Jul 16 19:36:35 2015 -0700
+++ b/doc/reST/cli.rst	Wed Jul 22 12:56:34 2015 -0500
@@ -1233,7 +1233,7 @@ Quality, rate control and rate distortio
 	ignored. Slower presets will generally achieve better compression
 	efficiency (and generate smaller bitstreams). Default disabled.
 
-.. option:: --aq-mode <0|1|2>
+.. option:: --aq-mode <0|1|2|3>
 
 	Adaptive Quantization operating mode. Raise or lower per-block
 	quantization based on complexity analysis of the source image. The
@@ -1244,6 +1244,7 @@ Quality, rate control and rate distortio
 	0. disabled
 	1. AQ enabled **(default)**
 	2. AQ enabled with auto-variance
+	3. AQ enabled with auto-variance and bias to dark scenes
 
 .. option:: --aq-strength <float>
 
diff -r b2ba7df1fc69 -r 42bc8575020b source/CMakeLists.txt
--- a/source/CMakeLists.txt	Thu Jul 16 19:36:35 2015 -0700
+++ b/source/CMakeLists.txt	Wed Jul 22 12:56:34 2015 -0500
@@ -30,7 +30,7 @@ option(STATIC_LINK_CRT "Statically link 
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 64)
+set(X265_BUILD 66)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -42,6 +42,8 @@ SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_
 string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC)
 set(X86_ALIASES x86 i386 i686 x86_64 amd64)
 list(FIND X86_ALIASES "${SYSPROC}" X86MATCH)
+set(POWER_ALIASES ppc64 ppc64le)
+list(FIND POWER_ALIASES "${SYSPROC}" POWERMATCH)
 if("${SYSPROC}" STREQUAL "" OR X86MATCH GREATER "-1")
     message(STATUS "Detected x86 target processor")
     set(X86 1)
@@ -50,6 +52,10 @@ if("${SYSPROC}" STREQUAL "" OR X86MATCH 
         set(X64 1)
         add_definitions(-DX86_64=1)
     endif()
+elseif(POWERMATCH GREATER "-1")
+    message(STATUS "Detected POWER target processor")
+    set(POWER 1)
+    add_definitions(-DX265_ARCH_POWER=1)
 elseif(${SYSPROC} STREQUAL "armv6l")
     message(STATUS "Detected ARM target processor")
     set(ARM 1)
diff -r b2ba7df1fc69 -r 42bc8575020b source/common/framedata.h
--- a/source/common/framedata.h	Thu Jul 16 19:36:35 2015 -0700
+++ b/source/common/framedata.h	Wed Jul 22 12:56:34 2015 -0500
@@ -52,6 +52,8 @@ struct FrameStats
     double      percent8x8Intra;
     double      percent8x8Inter;
     double      percent8x8Skip;
+    double      avgLumaDistortion;
+    double      avgChromaDistortion;
     double      percentIntraNxN;
     double      percentSkipCu[NUM_CU_DEPTH];
     double      percentMergeCu[NUM_CU_DEPTH];
@@ -60,6 +62,9 @@ struct FrameStats
 
     uint64_t    cntIntraNxN;
     uint64_t    totalCu;
+    uint64_t    totalCtu;
+    uint64_t    lumaDistortion;
+    uint64_t    chromaDistortion;
     uint64_t    cntSkipCu[NUM_CU_DEPTH];
     uint64_t    cntMergeCu[NUM_CU_DEPTH];
     uint64_t    cntInter[NUM_CU_DEPTH];
diff -r b2ba7df1fc69 -r 42bc8575020b source/common/param.cpp
--- a/source/common/param.cpp	Thu Jul 16 19:36:35 2015 -0700
+++ b/source/common/param.cpp	Wed Jul 22 12:56:34 2015 -0500
@@ -1086,7 +1086,7 @@ int x265_check_params(x265_param* param)
           "Lookahead depth must be less than 256");
     CHECK(param->lookaheadSlices > 16 || param->lookaheadSlices < 0,
           "Lookahead slices must between 0 and 16");
-    CHECK(param->rc.aqMode < X265_AQ_NONE || X265_AQ_AUTO_VARIANCE < param->rc.aqMode,
+    CHECK(param->rc.aqMode < X265_AQ_NONE || X265_AQ_AUTO_VARIANCE_BIASED < param->rc.aqMode,
           "Aq-Mode is out of range");
     CHECK(param->rc.aqStrength < 0 || param->rc.aqStrength > 3,
           "Aq-Strength is out of range");
diff -r b2ba7df1fc69 -r 42bc8575020b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Jul 16 19:36:35 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp	Wed Jul 22 12:56:34 2015 -0500
@@ -1043,7 +1043,9 @@ void setupAssemblyPrimitives(EncoderPrim
 
         // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_ssse3); this one is broken
         ALL_LUMA_PU(satd, pixel_satd, ssse3);
+#if X265_DEPTH <= 10
         ASSIGN_SA8D(ssse3);
+#endif
         INTRA_ANG_SSSE3(ssse3);
 
         p.dst4x4 = PFX(dst4_ssse3);
@@ -1126,14 +1128,18 @@ void setupAssemblyPrimitives(EncoderPrim
 
         // p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sse4); fails tests
         ALL_LUMA_PU(satd, pixel_satd, sse4);
+#if X265_DEPTH <= 10
         ASSIGN_SA8D(sse4);
+#endif
 
         p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
         p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
         p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
         p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
 
+#if X265_DEPTH <= 10
         ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
+#endif
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
         INTRA_ANG_SSE4_COMMON(sse4);
         INTRA_ANG_SSE4_HIGH(sse4);
@@ -1147,7 +1153,9 @@ void setupAssemblyPrimitives(EncoderPrim
 
         // TODO: check POPCNT flag!
         ALL_LUMA_TU_S(copy_cnt, copy_cnt_, sse4);
+#if X265_DEPTH <= 10
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
+#endif
         ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
 
         p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].p2s = PFX(filterPixelToShort_2x4_sse4);
@@ -1184,7 +1192,9 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);
+#if X265_DEPTH <= 10
         ASSIGN_SA8D(avx);
+#endif
         p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);
         p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);
         p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);
@@ -1292,7 +1302,9 @@ void setupAssemblyPrimitives(EncoderPrim
     {
         //p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_xop); this one is broken
         ALL_LUMA_PU(satd, pixel_satd, xop);
+#if X265_DEPTH <= 10
         ASSIGN_SA8D(xop);
+#endif
         LUMA_VAR(xop);
         p.frameInitLowres = PFX(frame_init_lowres_core_xop);
     }
@@ -1464,12 +1476,14 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_32x32].psy_cost_ss = PFX(psyCost_ss_32x32_avx2);
         p.cu[BLOCK_64x64].psy_cost_ss = PFX(psyCost_ss_64x64_avx2);
         p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_pp_4x4_avx2);
+#if X265_DEPTH <= 10
         p.cu[BLOCK_8x8].psy_cost_pp = PFX(psyCost_pp_8x8_avx2);
         p.cu[BLOCK_16x16].psy_cost_pp = PFX(psyCost_pp_16x16_avx2);
         p.cu[BLOCK_32x32].psy_cost_pp = PFX(psyCost_pp_32x32_avx2);
         p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
         p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
         p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
+#endif
 
         p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
         p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
@@ -1527,15 +1541,19 @@ void setupAssemblyPrimitives(EncoderPrim
 
         p.quant = PFX(quant_avx2);
         p.nquant = PFX(nquant_avx2);
+#if X265_DEPTH <= 10
         p.dequant_normal  = PFX(dequant_normal_avx2);
         p.dequant_scaling = PFX(dequant_scaling_avx2);
+#endif
         p.dst4x4 = PFX(dst4_avx2);
         p.idst4x4 = PFX(idst4_avx2);
         p.denoiseDct = PFX(denoise_dct_avx2);
 
         p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
         p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
+#if X265_DEPTH <= 10
         p.weight_pp = PFX(weight_pp_avx2);
+#endif
         p.weight_sp = PFX(weight_sp_avx2);
         p.sign = PFX(calSign_avx2);
         p.planecopy_cp = PFX(upShift_8_avx2);
@@ -1562,15 +1580,21 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16_avx2);
         p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2);
 
+#if X265_DEPTH <= 10
         ALL_LUMA_TU_S(dct, dct, avx2);
         ALL_LUMA_TU_S(idct, idct, avx2);
+#endif
         ALL_LUMA_CU_S(transpose, transpose, avx2);
 
         ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2);
         ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, avx2);
+#if X265_DEPTH <= 10
         ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, avx2);
+#endif
         ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, avx2);
+#if X265_DEPTH <= 10
         p.pu[LUMA_4x4].luma_vsp = PFX(interp_8tap_vert_sp_4x4_avx2);               // since ALL_LUMA_PU didn't declare 4x4 size, calling separately luma_vsp function to use 
+#endif
 
         p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_avx2);
         p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2);
@@ -1593,6 +1617,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_16x12].sad = PFX(pixel_sad_16x12_avx2);
         p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx2);
         p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx2);
+#if X265_DEPTH <= 10
         p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx2);
         p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx2);
         p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx2);
@@ -1604,6 +1629,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx2);
         p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx2);
         p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx2);
+#endif
 
         p.pu[LUMA_16x4].sad_x3 = PFX(pixel_sad_x3_16x4_avx2);
         p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx2);
@@ -1678,6 +1704,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx2);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx2);
 
+#if X265_DEPTH <= 10
         p.pu[LUMA_4x4].luma_hps = PFX(interp_8tap_horiz_ps_4x4_avx2);
         p.pu[LUMA_4x8].luma_hps = PFX(interp_8tap_horiz_ps_4x8_avx2);
         p.pu[LUMA_4x16].luma_hps = PFX(interp_8tap_horiz_ps_4x16_avx2);
@@ -1703,6 +1730,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx2);
         p.pu[LUMA_24x32].luma_hps = PFX(interp_8tap_horiz_ps_24x32_avx2);
         p.pu[LUMA_12x16].luma_hps = PFX(interp_8tap_horiz_ps_12x16_avx2);
+#endif
 
         p.pu[LUMA_4x4].luma_hpp = PFX(interp_8tap_horiz_pp_4x4_avx2);
         p.pu[LUMA_4x8].luma_hpp = PFX(interp_8tap_horiz_pp_4x8_avx2);
@@ -1730,6 +1758,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_24x32].luma_hpp = PFX(interp_8tap_horiz_pp_24x32_avx2);
         p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx2);
 
+#if X265_DEPTH <= 10
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hps = PFX(interp_4tap_horiz_ps_8x8_avx2);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hps = PFX(interp_4tap_horiz_ps_8x4_avx2);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx2);
@@ -2127,11 +2156,15 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx2);
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx2);
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
+#endif
 
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
 
+#if X265_DEPTH <= 10
+        // TODO: depends on hps and vsp
         ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);                        // calling luma_hvpp for all sizes
         p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;             // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4] 
+#endif
 
         if (cpuMask & X265_CPU_BMI2)
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
@@ -2958,6 +2991,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.cu[BLOCK_32x32].intra_pred[22] = PFX(intra_pred_ang32_22_avx2);
         p.cu[BLOCK_32x32].intra_pred[21] = PFX(intra_pred_ang32_21_avx2);
         p.cu[BLOCK_32x32].intra_pred[18] = PFX(intra_pred_ang32_18_avx2);
+        p.cu[BLOCK_32x32].intra_pred[3]  = PFX(intra_pred_ang32_3_avx2);
 
         // all_angs primitives
         p.cu[BLOCK_4x4].intra_pred_allangs = PFX(all_angs_pred_4x4_avx2);
diff -r b2ba7df1fc69 -r 42bc8575020b source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Thu Jul 16 19:36:35 2015 -0700
+++ b/source/common/x86/const-a.asm	Wed Jul 22 12:56:34 2015 -0500
@@ -79,6 +79,7 @@ const pw_257,               times 16 dw 
 const pw_512,               times 16 dw 512
 const pw_1023,              times 16 dw 1023
 const pw_1024,              times 16 dw 1024
+const pw_2048,              times 16 dw 2048
 const pw_4096,              times 16 dw 4096
 const pw_8192,              times  8 dw 8192
 const pw_00ff,              times 16 dw 0x00ff
diff -r b2ba7df1fc69 -r 42bc8575020b source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Thu Jul 16 19:36:35 2015 -0700
+++ b/source/common/x86/intrapred16.asm	Wed Jul 22 12:56:34 2015 -0500
@@ -473,14 +473,14 @@ cglobal intra_pred_dc16, 3, 9, 4
     add             r1d,                 r1d
     movu            m0,                  [r2 + 66]
     movu            m2,                  [r2 +  2]
-    paddw           m0,                  m2
+    paddw           m0,                  m2                 ; dynamic range 13 bits
 
     vextracti128    xm1,                 m0, 1
-    paddw           xm0,                 xm1
+    paddw           xm0,                 xm1                ; dynamic range 14 bits
     movhlps         xm1,                 xm0
-    paddw           xm0,                 xm1
-    phaddw          xm0,                 xm0
+    paddw           xm0,                 xm1                ; dynamic range 15 bits
     pmaddwd         xm0,                 [pw_1]
+    phaddd          xm0,                 xm0
     paddd           xm0,                 [pd_16]