[x265-commits] [x265] slicetype: fix build with detailed stats enabled

Steve Borho steve at borho.org
Mon May 11 23:44:17 CEST 2015


details:   http://hg.videolan.org/x265/rev/c7ac09d16802
branches:  stable
changeset: 10401:c7ac09d16802
user:      Steve Borho <steve at borho.org>
date:      Mon May 11 08:15:56 2015 -0500
description:
slicetype: fix build with detailed stats enabled
Subject: [x265] asm: avx2 code for sad[16x64] for 10 bpp (3060->1353)

details:   http://hg.videolan.org/x265/rev/7971adc03a16
branches:  
changeset: 10402:7971adc03a16
user:      Sumalatha Polureddy
date:      Mon May 11 10:42:13 2015 +0530
description:
asm: avx2 code for sad[16x64] for 10 bpp (3060->1353)

sse2:
sad[16x64]  3.08x    3060.55         9412.37
avx2:
sad[16x64]  6.85x    1353.36         9263.85
Subject: [x265] asm: avx2 code for sad[32x8] for 10 bpp(833 -> 304)

details:   http://hg.videolan.org/x265/rev/0f8f6f0a419a
branches:  
changeset: 10403:0f8f6f0a419a
user:      Sumalatha Polureddy
date:      Mon May 11 10:49:02 2015 +0530
description:
asm: avx2 code for sad[32x8] for 10 bpp(833 -> 304)

sse2:
sad[ 32x8]  2.86x    833.73          2380.88

avx2:
sad[ 32x8]  7.31x    304.54          2225.90
Subject: [x265] asm: avx2 code for sad[32x16],[32x24],[32x32] for 10 bpp

details:   http://hg.videolan.org/x265/rev/47a2e12c2c15
branches:  
changeset: 10404:47a2e12c2c15
user:      Sumalatha Polureddy
date:      Mon May 11 11:37:56 2015 +0530
description:
asm: avx2 code for sad[32x16],[32x24],[32x32] for 10 bpp

sse2:
sad[32x16]  3.01x    1595.11         4794.98
sad[32x24]  2.98x    2362.68         7051.16
sad[32x32]  2.97x    3128.34         9278.31

avx2:
sad[32x16]  7.66x    603.61          4621.37
sad[32x24]  6.98x    1003.75         7006.95
sad[32x32]  7.05x    1340.97         9452.61
Subject: [x265] cli: allow --output-depth to select output|internal bit depth

details:   http://hg.videolan.org/x265/rev/dadd6c419e55
branches:  stable
changeset: 10405:dadd6c419e55
user:      Steve Borho <steve at borho.org>
date:      Mon May 11 08:50:41 2015 -0500
description:
cli: allow --output-depth to select output|internal bit depth
Subject: [x265] Merge with stable

details:   http://hg.videolan.org/x265/rev/aacc4361aca7
branches:  
changeset: 10406:aacc4361aca7
user:      Steve Borho <steve at borho.org>
date:      Mon May 11 10:28:11 2015 -0500
description:
Merge with stable
Subject: [x265] asm: interp_4tap_horiz_pp sse3

details:   http://hg.videolan.org/x265/rev/5f9e5e9d4444
branches:  
changeset: 10407:5f9e5e9d4444
user:      David T Yuen <dtyx265 at gmail.com>
date:      Mon May 11 11:19:43 2015 -0700
description:
asm: interp_4tap_horiz_pp sse3

Reduce code size with macros
move sse4 macro closer to sse4 code
There are no changes to functionality or performance

diffstat:

 doc/reST/cli.rst                     |   10 +
 source/common/x86/asm-primitives.cpp |    5 +
 source/common/x86/ipfilter8.asm      |  334 +++++++++-------------------------
 source/common/x86/sad16-a.asm        |  313 ++++++++++++++++++++++++++++++++
 source/encoder/slicetype.cpp         |    5 +-
 source/x265.cpp                      |   65 ++++--
 source/x265cli.h                     |    4 +-
 7 files changed, 464 insertions(+), 272 deletions(-)

diffs (truncated from 966 to 300 lines):

diff -r b642b3d8cc1e -r 5f9e5e9d4444 doc/reST/cli.rst
--- a/doc/reST/cli.rst	Sat May 09 12:33:18 2015 -0500
+++ b/doc/reST/cli.rst	Mon May 11 11:19:43 2015 -0700
@@ -419,6 +419,16 @@ frame counts) are only applicable to the
 
 	**CLI ONLY**
 
+.. option:: --output-depth, -D 8|10
+
+	Bit depth of the output HEVC bitstream, which is also the internal bit
+	depth of the encoder. If the requested bit depth is not the bit
+	depth of the linked libx265, the CLI will attempt to bind libx265_main
+	for an 8-bit encoder, or libx265_main10 for a 10-bit encoder, with the
+	same API version as the linked libx265.
+
+	**CLI ONLY**
+
 Profile, Level, Tier
 ====================
 
diff -r b642b3d8cc1e -r 5f9e5e9d4444 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sat May 09 12:33:18 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Mon May 11 11:19:43 2015 -0700
@@ -1264,6 +1264,11 @@ void setupAssemblyPrimitives(EncoderPrim
         p.pu[LUMA_16x12].sad = x265_pixel_sad_16x12_avx2;
         p.pu[LUMA_16x16].sad = x265_pixel_sad_16x16_avx2;
         p.pu[LUMA_16x32].sad = x265_pixel_sad_16x32_avx2;
+        p.pu[LUMA_16x64].sad = x265_pixel_sad_16x64_avx2;
+        p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_avx2;
+        p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_avx2;
+        p.pu[LUMA_32x24].sad = x265_pixel_sad_32x24_avx2;
+        p.pu[LUMA_32x32].sad = x265_pixel_sad_32x32_avx2;
 
         p.pu[LUMA_16x4].convert_p2s = x265_filterPixelToShort_16x4_avx2;
         p.pu[LUMA_16x8].convert_p2s = x265_filterPixelToShort_16x8_avx2;
diff -r b642b3d8cc1e -r 5f9e5e9d4444 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Sat May 09 12:33:18 2015 -0500
+++ b/source/common/x86/ipfilter8.asm	Mon May 11 11:19:43 2015 -0700
@@ -330,80 +330,38 @@ cextern pw_2000
 %endmacro
 
 ;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_2xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_H4_W2xN_sse3 1
 INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_2x4, 4, 6, 6, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m5,         [pw_32]
-
-%ifdef PIC
-    lea         r5,          [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
+cglobal interp_4tap_horiz_pp_2x%1, 4, 6, 6, src, srcstride, dst, dststride
+    mov         r4d,    r4m
+    mova        m5,     [pw_32]
+
+%ifdef PIC
+    lea         r5,     [tabw_ChromaCoeff]
+    movddup     m4,     [r5 + r4 * 8]
+%else
+    movddup     m4,     [tabw_ChromaCoeff + r4 * 8]
+%endif
+
+%assign x 1
+%rep %1/2
     FILTER_H4_w2_2_sse2
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-    FILTER_H4_w2_2_sse2
-
-    RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_2x8, 4, 6, 6, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m5,         [pw_32]
-
-%ifdef PIC
-    lea         r5,          [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 4
-    FILTER_H4_w2_2_sse2
-%if x < 4
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
+%if x < %1/2
+    lea         srcq,   [srcq + srcstrideq * 2]
+    lea         dstq,   [dstq + dststrideq * 2]
 %endif
 %assign x x+1
 %endrep
 
     RET
 
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_2x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_2x16, 4, 6, 6, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m5,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 8
-    FILTER_H4_w2_2_sse2
-%if x < 8
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-%endif
-%assign x x+1
-%endrep
-
-    RET
+%endmacro
+
+    FILTER_H4_W2xN_sse3 4
+    FILTER_H4_W2xN_sse3 8
+    FILTER_H4_W2xN_sse3 16
 
 %macro FILTER_H4_w4_2_sse2 0
     pxor        m5, m5
@@ -447,142 +405,40 @@ cglobal interp_4tap_horiz_pp_2x16, 4, 6,
 %endmacro
 
 ;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
+; void interp_4tap_horiz_pp_4xN(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;-----------------------------------------------------------------------------
+%macro FILTER_H4_W4xN_sse3 1
 INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x2, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
+cglobal interp_4tap_horiz_pp_4x%1, 4, 6, 8, src, srcstride, dst, dststride
+    mov         r4d,    r4m
+    mova        m7,     [pw_32]
+
+%ifdef PIC
+    lea         r5,     [tabw_ChromaCoeff]
+    movddup     m4,     [r5 + r4 * 8]
+%else
+    movddup     m4,     [tabw_ChromaCoeff + r4 * 8]
+%endif
+
+%assign x 1
+%rep %1/2
     FILTER_H4_w4_2_sse2
-
-    RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x4, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-    FILTER_H4_w4_2_sse2
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-    FILTER_H4_w4_2_sse2
-
-    RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x8(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x8, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 4
-    FILTER_H4_w4_2_sse2
-%if x < 4
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
+%if x < %1/2
+    lea         srcq,   [srcq + srcstrideq * 2]
+    lea         dstq,   [dstq + dststrideq * 2]
 %endif
 %assign x x+1
 %endrep
 
     RET
 
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x16(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x16, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,         [tabw_ChromaCoeff]
-    movddup     m4,         [r5 + r4 * 8]
-%else
-    movddup     m4,         [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 8
-    FILTER_H4_w4_2_sse2
-%if x < 8
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-%endif
-%assign x x+1
-%endrep
-
-    RET
-
-;-----------------------------------------------------------------------------
-; void interp_4tap_horiz_pp_4x32(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;-----------------------------------------------------------------------------
-INIT_XMM sse3
-cglobal interp_4tap_horiz_pp_4x32, 4, 6, 8, src, srcstride, dst, dststride
-    mov         r4d,        r4m
-    mova        m7,         [pw_32]
-
-%ifdef PIC
-    lea         r5,          [tabw_ChromaCoeff]
-    movddup     m4,       [r5 + r4 * 8]
-%else
-    movddup     m4,       [tabw_ChromaCoeff + r4 * 8]
-%endif
-
-%assign x 1
-%rep 16
-    FILTER_H4_w4_2_sse2
-%if x < 16
-    lea         srcq,       [srcq + srcstrideq * 2]
-    lea         dstq,       [dstq + dststrideq * 2]
-%endif
-%assign x x+1
-%endrep
-
-    RET
-
-%macro FILTER_H4_w2_2 3
-    movh        %2, [srcq - 1]
-    pshufb      %2, %2, Tm0
-    movh        %1, [srcq + srcstrideq - 1]
-    pshufb      %1, %1, Tm0
-    punpcklqdq  %2, %1
-    pmaddubsw   %2, coef2
-    phaddw      %2, %2
-    pmulhrsw    %2, %3
-    packuswb    %2, %2
-    movd        r4, %2
-    mov         [dstq], r4w
-    shr         r4, 16


More information about the x265-commits mailing list