[x265-commits] [x265] asm: avx2 code for pixelavg_pp 32xN & 64xN, improved over...
Dnyaneshwar G
dnyaneshwar at multicorewareinc.com
Tue Aug 4 20:32:52 CEST 2015
details: http://hg.videolan.org/x265/rev/3b8d33994918
branches:
changeset: 10865:3b8d33994918
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Mon Aug 03 15:40:50 2015 +0530
description:
asm: avx2 code for pixelavg_pp 32xN & 64xN, over 40% faster than SSE
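The new mc-a.asm kernels further down are built on pavgb, which computes a per-byte rounded average. As a point of reference, here is a minimal scalar sketch of the operation (the trailing int argument visible in the pixelavg_pp signature quoted in the diff is not read by these kernels and is dropped here; the typedef and template names are illustrative only):

    #include <cstdint>
    #include <cstddef>

    typedef uint8_t pixel;   // 8-bit build assumed

    // Rounded average of two WxH prediction blocks, matching pavgb semantics:
    // dst[x] = (src0[x] + src1[x] + 1) >> 1
    template<int W, int H>
    void pixelavg_pp_c(pixel* dst, intptr_t dstride,
                       const pixel* src0, intptr_t sstride0,
                       const pixel* src1, intptr_t sstride1)
    {
        for (int y = 0; y < H; y++)
        {
            for (int x = 0; x < W; x++)
                dst[x] = (pixel)((src0[x] + src1[x] + 1) >> 1);

            dst  += dstride;
            src0 += sstride0;
            src1 += sstride1;
        }
    }

The AVX2 versions process two 32-byte rows per iteration, which is where the reported gain over the SSE code comes from.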
Subject: [x265] asm: disabled AVX primitives having less than 3% speedup over SSE
details: http://hg.videolan.org/x265/rev/76fe4b09c5c2
branches:
changeset: 10866:76fe4b09c5c2
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Tue Aug 04 11:59:48 2015 +0530
description:
asm: disabled AVX primitives having less than 3% speedup over SSE
these AVX primitives are no faster, and in some cases slower, than the SSE primitives
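The mechanism behind this commit, as a minimal sketch with hypothetical names (the real EncoderPrimitives table in asm-primitives.cpp is far larger): setupAssemblyPrimitives() fills a table of function pointers, and entries are overwritten in order of increasing instruction-set level, so fencing an AVX assignment with #if 0 simply leaves the previously installed SSE pointer in place.

    #include <cstdint>

    typedef int (*satd_t)(const uint8_t* pix1, intptr_t stride1,
                          const uint8_t* pix2, intptr_t stride2);

    // Stubs standing in for the real assembly entry points.
    static int satd_8x8_sse4(const uint8_t*, intptr_t, const uint8_t*, intptr_t) { return 0; }
    static int satd_8x8_avx (const uint8_t*, intptr_t, const uint8_t*, intptr_t) { return 0; }

    struct Primitives { satd_t satd_8x8; };

    enum { CPU_SSE4 = 1 << 0, CPU_AVX = 1 << 1 };

    void setupPrimitives(Primitives& p, uint32_t cpuMask)
    {
        if (cpuMask & CPU_SSE4)
            p.satd_8x8 = satd_8x8_sse4;  // baseline installed first
        if (cpuMask & CPU_AVX)
            p.satd_8x8 = satd_8x8_avx;   // later flags overwrite earlier ones;
                                         // removing (or #if 0-ing) this line
                                         // keeps the faster SSE4 pointer
    }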
Subject: [x265] vui: tweak the string name of ARIB STD-B67, to avoid possible future conflicts
details: http://hg.videolan.org/x265/rev/3fa7f6838098
branches:
changeset: 10867:3fa7f6838098
user: Steve Borho <steve at borho.org>
date: Mon Aug 03 14:56:21 2015 -0500
description:
vui: tweak the string name of ARIB STD-B67, to avoid possible future conflicts
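For context, a hedged usage sketch through the public libx265 API (x265_param_alloc / x265_param_default / x265_param_parse / x265_param_free are from x265.h; the option string "transfer" is assumed to map to vui.transferCharacteristics the same way the --transfer CLI option does):

    #include <cstdio>
    #include "x265.h"

    int main()
    {
        x265_param* param = x265_param_alloc();
        x265_param_default(param);

        // Equivalent to --transfer arib-std-b67 on the x265 command line;
        // after this change the old string "std-b67" is presumably no longer accepted.
        if (x265_param_parse(param, "transfer", "arib-std-b67") != 0)
            fprintf(stderr, "transfer name not recognised by this libx265\n");

        x265_param_free(param);
        return 0;
    }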
diffstat:
doc/reST/cli.rst | 2 +-
source/common/param.cpp | 2 +-
source/common/x86/asm-primitives.cpp | 108 ++++++++++++++++++++++++-------
source/common/x86/mc-a.asm | 118 ++++++++++++++++++++++++++++++----
source/x265.h | 2 +-
source/x265cli.h | 2 +-
6 files changed, 191 insertions(+), 43 deletions(-)
diffs (truncated from 341 to 300 lines):
diff -r d5278c76d341 -r 3fa7f6838098 doc/reST/cli.rst
--- a/doc/reST/cli.rst Mon Aug 03 10:18:46 2015 -0500
+++ b/doc/reST/cli.rst Mon Aug 03 14:56:21 2015 -0500
@@ -1583,7 +1583,7 @@ VUI fields must be manually specified.
15. bt2020-12
16. smpte-st-2084
17. smpte-st-428
- 18. std-b67
+ 18. arib-std-b67
.. option:: --colormatrix <integer|string>
diff -r d5278c76d341 -r 3fa7f6838098 source/common/param.cpp
--- a/source/common/param.cpp Mon Aug 03 10:18:46 2015 -0500
+++ b/source/common/param.cpp Mon Aug 03 14:56:21 2015 -0500
@@ -1118,7 +1118,7 @@ int x265_check_params(x265_param* param)
|| param->vui.transferCharacteristics == 3,
"Transfer Characteristics must be undef, bt709, bt470m, bt470bg,"
" smpte170m, smpte240m, linear, log100, log316, iec61966-2-4, bt1361e,"
- " iec61966-2-1, bt2020-10, bt2020-12, smpte-st-2084, smpte-st-428 or std-b67");
+ " iec61966-2-1, bt2020-10, bt2020-12, smpte-st-2084, smpte-st-428 or arib-std-b67");
CHECK(param->vui.matrixCoeffs < 0
|| param->vui.matrixCoeffs > 10
|| param->vui.matrixCoeffs == 3,
diff -r d5278c76d341 -r 3fa7f6838098 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Aug 03 10:18:46 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Mon Aug 03 14:56:21 2015 -0500
@@ -2556,7 +2556,6 @@ void setupAssemblyPrimitives(EncoderPrim
}
if (cpuMask & X265_CPU_AVX)
{
- p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_avx);
@@ -2571,28 +2570,41 @@ void setupAssemblyPrimitives(EncoderPrim
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_avx);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
- ALL_LUMA_PU(satd, pixel_satd, avx);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_avx);
+
+ p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx);
+ p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx);
+ p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx);
+ p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx);
+ p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx);
+ p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx);
+ p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_avx);
+ p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx);
+ p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx);
+ p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx);
+ p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_avx);
+
+ p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_avx);
+ p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx);
+ p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_avx);
+ p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx);
+ p.pu[LUMA_8x32].satd = PFX(pixel_satd_8x32_avx);
+ p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx);
+ p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx);
+ p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx);
+ p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_avx);
+
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_avx);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_avx);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = PFX(pixel_satd_16x4_avx);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_avx);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx);
@@ -2601,22 +2613,24 @@ void setupAssemblyPrimitives(EncoderPrim
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sa8d = PFX(pixel_sa8d_32x32_avx);
p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sa8d = PFX(pixel_sa8d_16x16_avx);
p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sa8d = PFX(pixel_satd_4x4_avx);
- ASSIGN_SSE_PP(avx);
+
+ p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx);
+ p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx);
+
p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = PFX(pixel_ssd_8x8_avx);
- ASSIGN_SSE_SS(avx);
- LUMA_VAR(avx);
-
- p.pu[LUMA_12x16].sad_x3 = PFX(pixel_sad_x3_12x16_avx);
- p.pu[LUMA_16x4].sad_x3 = PFX(pixel_sad_x3_16x4_avx);
- HEVC_SAD_X3(avx);
-
- p.pu[LUMA_12x16].sad_x4 = PFX(pixel_sad_x4_12x16_avx);
+
+ p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx);
+
p.pu[LUMA_16x4].sad_x4 = PFX(pixel_sad_x4_16x4_avx);
- HEVC_SAD_X4(avx);
-
- p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx);
- p.ssim_end_4 = PFX(pixel_ssim_end4_avx);
+ p.pu[LUMA_16x8].sad_x4 = PFX(pixel_sad_x4_16x8_avx);
+ p.pu[LUMA_16x12].sad_x4 = PFX(pixel_sad_x4_16x12_avx);
+ p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_avx);
+ p.pu[LUMA_16x32].sad_x4 = PFX(pixel_sad_x4_16x32_avx);
+ p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx);
+ p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx);
+ p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx);
+ p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx);
+ p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx);
p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_avx);
p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx);
@@ -2653,6 +2667,50 @@ void setupAssemblyPrimitives(EncoderPrim
p.pu[LUMA_48x64].copy_pp = PFX(blockcopy_pp_48x64_avx);
p.frameInitLowres = PFX(frame_init_lowres_core_avx);
+
+ /* The following primitives have been disabled since performance compared to SSE4.2 is negligible/negative */
+#if 0
+ p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_avx);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx);
+ p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx);
+ p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_avx);
+ p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx);
+ p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_avx);
+ p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_avx);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_avx);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_avx);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_avx);
+ p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sa8d = PFX(pixel_satd_4x4_avx);
+
+ p.cu[BLOCK_8x8].sse_pp = PFX(pixel_ssd_8x8_avx);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = PFX(pixel_ssd_8x16_avx);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = PFX(pixel_ssd_16x32_avx);
+ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_ssd_32x64_avx);
+ ASSIGN_SSE_SS(avx);
+ p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx);
+ p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx);
+ p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx)
+
+ p.pu[LUMA_12x16].sad_x3 = PFX(pixel_sad_x3_12x16_avx);
+ p.pu[LUMA_16x4].sad_x3 = PFX(pixel_sad_x3_16x4_avx);
+ HEVC_SAD_X3(avx);
+
+ p.pu[LUMA_12x16].sad_x4 = PFX(pixel_sad_x4_12x16_avx);
+ p.pu[LUMA_16x64].sad_x4 = PFX(pixel_sad_x4_16x64_avx);
+ p.pu[LUMA_24x32].sad_x4 = PFX(pixel_sad_x4_24x32_avx);
+ p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx);
+ p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx);
+ p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx);
+ p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx);
+ p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx)
+
+ p.ssim_4x4x2_core = PFX(pixel_ssim_4x4x2_core_avx);
+ p.ssim_end_4 = PFX(pixel_ssim_end4_avx);
+#endif
}
if (cpuMask & X265_CPU_XOP)
{
diff -r d5278c76d341 -r 3fa7f6838098 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Mon Aug 03 10:18:46 2015 -0500
+++ b/source/common/x86/mc-a.asm Mon Aug 03 14:56:21 2015 -0500
@@ -4300,24 +4300,12 @@ AVGH 4, 16
AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
+
INIT_XMM avx2
; TODO: active AVX2 after debug
;AVG_FUNC 24, movdqu, movdqa
;AVGH 24, 32
-AVG_FUNC 64, movdqu, movdqa
-AVGH 64, 64
-AVGH 64, 48
-AVGH 64, 32
-AVGH 64, 16
-
-AVG_FUNC 32, movdqu, movdqa
-AVGH 32, 64
-AVGH 32, 32
-AVGH 32, 24
-AVGH 32, 16
-AVGH 32, 8
-
AVG_FUNC 16, movdqu, movdqa
AVGH 16, 64
AVGH 16, 32
@@ -4328,7 +4316,109 @@ AVGH 16, 4
%endif ;HIGH_BIT_DEPTH
-
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64 && BIT_DEPTH == 8
+INIT_YMM avx2
+cglobal pixel_avg_8x32
+%rep 4
+ movu m0, [r2]
+ movu m2, [r2 + r3]
+ movu m1, [r4]
+ movu m3, [r4 + r5]
+ pavgb m0, m1
+ pavgb m2, m3
+ movu [r0], m0
+ movu [r0 + r1], m2
+
+ lea r2, [r2 + r3 * 2]
+ lea r4, [r4 + r5 * 2]
+ lea r0, [r0 + r1 * 2]
+%endrep
+ ret
+
+cglobal pixel_avg_16x64_8bit
+%rep 8
+ movu m0, [r2]
+ movu m2, [r2 + mmsize]
+ movu m1, [r4]
+ movu m3, [r4 + mmsize]
+ pavgb m0, m1
+ pavgb m2, m3
+ movu [r0], m0
+ movu [r0 + mmsize], m2
+
+ movu m0, [r2 + r3]
+ movu m2, [r2 + r3 + mmsize]
+ movu m1, [r4 + r5]
+ movu m3, [r4 + r5 + mmsize]
+ pavgb m0, m1
+ pavgb m2, m3
+ movu [r0 + r1], m0
+ movu [r0 + r1 + mmsize], m2
+
+ lea r2, [r2 + r3 * 2]
+ lea r4, [r4 + r5 * 2]
+ lea r0, [r0 + r1 * 2]
+%endrep
+ ret
+
+cglobal pixel_avg_32x8, 6,6,4
+ call pixel_avg_8x32
+ RET
+
+cglobal pixel_avg_32x16, 6,6,4
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ RET
+
+cglobal pixel_avg_32x24, 6,6,4
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ RET
+
+cglobal pixel_avg_32x32, 6,6,4
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ RET
+
+cglobal pixel_avg_32x64, 6,6,4
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ call pixel_avg_8x32
+ RET
+
+cglobal pixel_avg_64x16, 6,6,4
+ call pixel_avg_16x64_8bit
+ RET
+
+cglobal pixel_avg_64x32, 6,6,4
+ call pixel_avg_16x64_8bit
+ call pixel_avg_16x64_8bit
+ RET
More information about the x265-commits mailing list