[x265-commits] [x265] slicetype: fix build with detailed stats enabled
Steve Borho
steve at borho.org
Mon May 11 23:44:17 CEST 2015
details: http://hg.videolan.org/x265/rev/c7ac09d16802
branches: stable
changeset: 10401:c7ac09d16802
user: Steve Borho <steve at borho.org>
date: Mon May 11 08:15:56 2015 -0500
description:
slicetype: fix build with detailed stats enabled
Subject: [x265] asm: avx2 code for sad[16x64] for 10 bpp (3060->1353)
details: http://hg.videolan.org/x265/rev/7971adc03a16
branches:
changeset: 10402:7971adc03a16
user: Sumalatha Polureddy
date: Mon May 11 10:42:13 2015 +0530
description:
asm: avx2 code for sad[16x64] for 10 bpp (3060->1353)
sse2:
sad[16x64] 3.08x 3060.55 9412.37
avx2:
sad[16x64] 6.85x 1353.36 9263.85
Subject: [x265] asm: avx2 code for sad[32x8] for 10 bpp(833 -> 304)
details: http://hg.videolan.org/x265/rev/0f8f6f0a419a
branches:
changeset: 10403:0f8f6f0a419a
user: Sumalatha Polureddy
date: Mon May 11 10:49:02 2015 +0530
description:
asm: avx2 code for sad[32x8] for 10 bpp(833 -> 304)
sse2:
sad[ 32x8] 2.86x 833.73 2380.88
avx2:
sad[ 32x8] 7.31x 304.54 2225.90
Subject: [x265] asm: avx2 code for sad[32x16],[32x24],[32x32] for 10 bpp
details: http://hg.videolan.org/x265/rev/47a2e12c2c15
branches:
changeset: 10404:47a2e12c2c15
user: Sumalatha Polureddy
date: Mon May 11 11:37:56 2015 +0530
description:
asm: avx2 code for sad[32x16],[32x24],[32x32] for 10 bpp
sse2:
sad[32x16] 3.01x 1595.11 4794.98
sad[32x24] 2.98x 2362.68 7051.16
sad[32x32] 2.97x 3128.34 9278.31
avx2:
sad[32x16] 7.66x 603.61 4621.37
sad[32x24] 6.98x 1003.75 7006.95
sad[32x32] 7.05x 1340.97 9452.61
Subject: [x265] cli: allow --output-depth to select output|internal bit depth
details: http://hg.videolan.org/x265/rev/dadd6c419e55
branches: stable
changeset: 10405:dadd6c419e55
user: Steve Borho <steve at borho.org>
date: Mon May 11 08:50:41 2015 -0500
description:
cli: allow --output-depth to select output|internal bit depth
Subject: [x265] Merge with stable
details: http://hg.videolan.org/x265/rev/aacc4361aca7
branches:
changeset: 10406:aacc4361aca7
user: Steve Borho <steve at borho.org>
date: Mon May 11 10:28:11 2015 -0500
description:
Merge with stable
Subject: [x265] asm: interp_4tap_horiz_pp sse3
details: http://hg.videolan.org/x265/rev/5f9e5e9d4444
branches:
changeset: 10407:5f9e5e9d4444
user: David T Yuen <dtyx265 at gmail.com>
date: Mon May 11 11:19:43 2015 -0700
description:
asm: interp_4tap_horiz_pp sse3
Reduce code size with macros
move sse4 macro closer to sse4 code
There are no changes to functionality or performance
diffstat:
doc/reST/cli.rst | 10 +
source/common/x86/asm-primitives.cpp | 5 +
source/common/x86/ipfilter8.asm | 334 +++++++++-------------------------
source/common/x86/sad16-a.asm | 313 ++++++++++++++++++++++++++++++++
source/encoder/slicetype.cpp | 5 +-
source/x265.cpp | 65 ++++--
source/x265cli.h | 4 +-
7 files changed, 464 insertions(+), 272 deletions(-)
diffs (truncated from 966 to 300 lines):
diff -r b642b3d8cc1e -r 5f9e5e9d4444 doc/reST/cli.rst
--- a/doc/reST/cli.rst Sat May 09 12:33:18 2015 -0500
+++ b/doc/reST/cli.rst Mon May 11 11:19:43 2015 -0700
@@ -419,6 +419,16 @@ frame counts) are only applicable to the
**CLI ONLY**
+.. option:: --output-depth, -D 8|10
+
+ Bitdepth of output HEVC bitstream, which is also the internal bit
+ depth of the encoder. If the requested bit depth is not the bit
+ depth of the linked libx265, it will attempt to bind libx265_main
+ for an 8bit encoder, or libx265_main10 for a 10bit encoder, with the
+ same API version as the linked libx265.
+
+ **CLI ONLY**
+
Profile, Level, Tier
====================
diff -r b642b3d8cc1e -r 5f9e5e9d4444 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Sat May 09 12:33:18 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Mon May 11 11:19:43 2015 -0700
@@ -1264,6 +1264,11 @@ void setupAssemblyPrimitives(EncoderPrim
p.pu[LUMA_16x12].sad = x265_pixel_sad_16x12_avx2;
p.pu[LUMA_16x16].sad = x265_pixel_sad_16x16_avx2;
p.pu[LUMA_16x32].sad = x265_pixel_sad_16x32_avx2;
+ p.pu[LUMA_16x64].sad = x265_pixel_sad_16x64_avx2;
+ p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_avx2;
+ p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_avx2;
+ p.pu[LUMA_32x24].sad = x265_pixel_sad_32x24_avx2;
+ p.pu[LUMA_32x32].sad = x265_pixel_sad_32x32_avx2;
p.pu[LUMA_16x4].convert_p2s = x265_filterPixelToShort_16x4_avx2;
p.pu[LUMA_16x8].convert_p2s = x265_filterPixelToShort_16x8_avx2;
diff -r b642b3d8cc1e -r 5f9e5e9d4444 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Sat May 09 12:33:18 2015 -0500
+++ b/source/common/x86/ipfilter8.asm Mon May 11 11:19:43 2015 -0700
@@ -330,80 +330,38 @@ cextern pw_2000
%endmacro
;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_2xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_H4_W2xN_sse3 1
INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_2x4, 4, 6, 6, src, srcstride, dst, dststride
- mov r4d, r4m
- mova m5, [pw_32]
-
-%ifdef PIC
- lea r5, [tabw_ChromaCoeff]
- movddup m4, [r5 + r4 * 8]
-%else
- movddup m4, [tabw_ChromaCoeff + r4 * 8]
-%endif
-
+cglobal interp_4tap_horiz_pp_2x%1, 4, 6, 6, src, srcstride, dst, dststride
+ mov r4d, r4m
+ mova m5, [pw_32]
+
+%ifdef PIC
+ lea r5, [tabw_ChromaCoeff]
+ movddup m4, [r5 + r4 * 8]
+%else
+ movddup m4, [tabw_ChromaCoeff + r4 * 8]
+%endif
+
+%assign x 1
+%rep %1/2
FILTER_H4_w2_2_sse2
- lea srcq, [srcq + srcstrideq * 2]
- lea dstq, [dstq + dststrideq * 2]
- FILTER_H4_w2_2_sse2
-
- RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6, src, srcstride, dst, dststride
- mov r4d, r4m
- mova m5, [pw_32]
-
-%ifdef PIC
- lea r5, [tabw_ChromaCoeff]
- movddup m4, [r5 + r4 * 8]
-%else
- movddup m4, [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 4
- FILTER_H4_w2_2_sse2
-%if x < 4
- lea srcq, [srcq + srcstrideq * 2]
- lea dstq, [dstq + dststrideq * 2]
+%if x < %1/2
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
%endif
%assign x x+1
%endrep
RET
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_2x16, 4, 6, 6, src, srcstride, dst, dststride
- mov r4d, r4m
- mova m5, [pw_32]
-
-%ifdef PIC
- lea r5, [tabw_ChromaCoeff]
- movddup m4, [r5 + r4 * 8]
-%else
- movddup m4, [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 8
- FILTER_H4_w2_2_sse2
-%if x < 8
- lea srcq, [srcq + srcstrideq * 2]
- lea dstq, [dstq + dststrideq * 2]
-%endif
-%assign x x+1
-%endrep
-
- RET
+%endmacro
+
+ FILTER_H4_W2xN_sse3 4
+ FILTER_H4_W2xN_sse3 8
+ FILTER_H4_W2xN_sse3 16
%macro FILTER_H4_w4_2_sse2 0
pxor m5, m5
@@ -447,142 +405,40 @@ cglobal interp_4tap_horiz_pp_2x16, 4, 6,
%endmacro
;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_H4_W4xN_sse3 1
INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x2, 4, 6, 8, src, srcstride, dst, dststride
- mov r4d, r4m
- mova m7, [pw_32]
-
-%ifdef PIC
- lea r5, [tabw_ChromaCoeff]
- movddup m4, [r5 + r4 * 8]
-%else
- movddup m4, [tabw_ChromaCoeff + r4 * 8]
-%endif
-
+cglobal interp_4tap_horiz_pp_4x%1, 4, 6, 8, src, srcstride, dst, dststride
+ mov r4d, r4m
+ mova m7, [pw_32]
+
+%ifdef PIC
+ lea r5, [tabw_ChromaCoeff]
+ movddup m4, [r5 + r4 * 8]
+%else
+ movddup m4, [tabw_ChromaCoeff + r4 * 8]
+%endif
+
+%assign x 1
+%rep %1/2
FILTER_H4_w4_2_sse2
-
- RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x4, 4, 6, 8, src, srcstride, dst, dststride
- mov r4d, r4m
- mova m7, [pw_32]
-
-%ifdef PIC
- lea r5, [tabw_ChromaCoeff]
- movddup m4, [r5 + r4 * 8]
-%else
- movddup m4, [tabw_ChromaCoeff + r4 * 8]
-%endif
-
- FILTER_H4_w4_2_sse2
- lea srcq, [srcq + srcstrideq * 2]
- lea dstq, [dstq + dststrideq * 2]
- FILTER_H4_w4_2_sse2
-
- RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x8, 4, 6, 8, src, srcstride, dst, dststride
- mov r4d, r4m
- mova m7, [pw_32]
-
-%ifdef PIC
- lea r5, [tabw_ChromaCoeff]
- movddup m4, [r5 + r4 * 8]
-%else
- movddup m4, [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 4
- FILTER_H4_w4_2_sse2
-%if x < 4
- lea srcq, [srcq + srcstrideq * 2]
- lea dstq, [dstq + dststrideq * 2]
+%if x < %1/2
+ lea srcq, [srcq + srcstrideq * 2]
+ lea dstq, [dstq + dststrideq * 2]
%endif
%assign x x+1
%endrep
RET
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x16, 4, 6, 8, src, srcstride, dst, dststride
- mov r4d, r4m
- mova m7, [pw_32]
-
-%ifdef PIC
- lea r5, [tabw_ChromaCoeff]
- movddup m4, [r5 + r4 * 8]
-%else
- movddup m4, [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 8
- FILTER_H4_w4_2_sse2
-%if x < 8
- lea srcq, [srcq + srcstrideq * 2]
- lea dstq, [dstq + dststrideq * 2]
-%endif
-%assign x x+1
-%endrep
-
- RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x32, 4, 6, 8, src, srcstride, dst, dststride
- mov r4d, r4m
- mova m7, [pw_32]
-
-%ifdef PIC
- lea r5, [tabw_ChromaCoeff]
- movddup m4, [r5 + r4 * 8]
-%else
- movddup m4, [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 16
- FILTER_H4_w4_2_sse2
-%if x < 16
- lea srcq, [srcq + srcstrideq * 2]
- lea dstq, [dstq + dststrideq * 2]
-%endif
-%assign x x+1
-%endrep
-
- RET
-
-%macro FILTER_H4_w2_2 3
- movh %2, [srcq - 1]
- pshufb %2, %2, Tm0
- movh %1, [srcq + srcstrideq - 1]
- pshufb %1, %1, Tm0
- punpcklqdq %2, %1
- pmaddubsw %2, coef2
- phaddw %2, %2
- pmulhrsw %2, %3
- packuswb %2, %2
- movd r4, %2
- mov [dstq], r4w
- shr r4, 16
More information about the x265-commits
mailing list