[x265-commits] [x265] asm: sse4 code for saoCuStatsBO, improved 185378c->131279c

Dnyaneshwar G dnyaneshwar at multicorewareinc.com
Mon Jul 13 18:34:48 CEST 2015


details:   http://hg.videolan.org/x265/rev/2d15249c39ba
branches:  
changeset: 10801:2d15249c39ba
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Tue Jul 07 11:14:35 2015 +0530
description:
asm: sse4 code for saoCuStatsBO, improved 185378c->131279c
Subject: [x265] asm: sse4 code for saoCuStatsE0, improved 250341c->147284c

details:   http://hg.videolan.org/x265/rev/3ce49b558052
branches:  
changeset: 10802:3ce49b558052
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Tue Jul 07 12:17:08 2015 +0530
description:
asm: sse4 code for saoCuStatsE0, improved 250341c->147284c
Subject: [x265] asm: sse4 code for saoCuStatsE1, improved 320369c->151086c

details:   http://hg.videolan.org/x265/rev/dd6945c3044e
branches:  
changeset: 10803:dd6945c3044e
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Tue Jul 07 12:29:32 2015 +0530
description:
asm: sse4 code for saoCuStatsE1, improved 320369c->151086c
Subject: [x265] asm: frameInitLowres avx2 code for 8bpp and 10bpp

details:   http://hg.videolan.org/x265/rev/448a0901a74c
branches:  
changeset: 10804:448a0901a74c
user:      Rajesh Paulraj<rajesh at multicorewareinc.com>
date:      Thu Jul 09 17:51:09 2015 +0530
description:
asm: frameInitLowres avx2 code for 8bpp and 10bpp

8bpp:
avx2: downscale  30.38x   22659.94        688378.63
avx : downscale  18.92x   33242.29        628884.06

10bpp:
avx2: downscale  13.48x   51288.90        691165.69
avx : downscale  10.84x   64374.10        697631.81
Subject: [x265] cmake: remove empty file set

details:   http://hg.videolan.org/x265/rev/11911e0e5949
branches:  
changeset: 10805:11911e0e5949
user:      Deepthi Nandakumar <deepthi at multicorewareinc.com>
date:      Mon Jul 13 11:05:25 2015 +0530
description:
cmake: remove empty file set
Subject: [x265] asm: fix linux build error- cannot override register size

details:   http://hg.videolan.org/x265/rev/26b7dc40afa6
branches:  
changeset: 10806:26b7dc40afa6
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Mon Jul 13 12:47:50 2015 +0530
description:
asm: fix linux build error- cannot override register size
Subject: [x265] log-level: remove X265_LOG_FRAME, frame level logging is supported in CSV only

details:   http://hg.videolan.org/x265/rev/63fe043f739c
branches:  
changeset: 10807:63fe043f739c
user:      Deepthi Nandakumar <deepthi at multicorewareinc.com>
date:      Mon Jul 13 14:44:57 2015 +0530
description:
log-level: remove X265_LOG_FRAME, frame level logging is supported in CSV only
Subject: [x265] stats: count of each CU partition per frame

details:   http://hg.videolan.org/x265/rev/3f029010cf52
branches:  
changeset: 10808:3f029010cf52
user:      Divya Manivannan <divya at multicorewareinc.com>
date:      Wed Jul 01 14:36:18 2015 +0530
description:
stats: count of each CU partition per frame
Subject: [x265] asm: fix saoCuStatE1 testbench failure

details:   http://hg.videolan.org/x265/rev/426169ca6c76
branches:  
changeset: 10809:426169ca6c76
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Mon Jul 13 16:53:29 2015 +0530
description:
asm: fix saoCuStatE1 testbench failure
Subject: [x265] param: follow-up cleanups from --log-level frame removal

details:   http://hg.videolan.org/x265/rev/39f5205c28d0
branches:  
changeset: 10810:39f5205c28d0
user:      Steve Borho <steve at borho.org>
date:      Mon Jul 13 10:47:22 2015 -0500
description:
param: follow-up cleanups from --log-level frame removal

diffstat:

 doc/reST/api.rst                     |    4 -
 doc/reST/cli.rst                     |    7 +-
 source/CMakeLists.txt                |    7 +-
 source/common/framedata.h            |   22 ++
 source/common/param.h                |    2 +-
 source/common/x86/asm-primitives.cpp |    7 +
 source/common/x86/loopfilter.asm     |  295 +++++++++++++++++++++++++++++++++++
 source/common/x86/loopfilter.h       |    3 +
 source/common/x86/mc-a2.asm          |   32 ++-
 source/common/x86/mc.h               |    1 +
 source/common/x86/x86util.asm        |    8 +-
 source/encoder/encoder.cpp           |   12 +
 source/encoder/frameencoder.cpp      |  102 ++++++-----
 source/encoder/frameencoder.h        |    6 +-
 source/test/pixelharness.cpp         |  168 +++++++++++++++++++
 source/test/pixelharness.h           |    3 +
 source/x265-extras.cpp               |   70 ++++++++-
 source/x265-extras.h                 |    2 +-
 source/x265.cpp                      |   30 +-
 source/x265.h                        |   18 +-
 20 files changed, 695 insertions(+), 104 deletions(-)

diffs (truncated from 1255 to 300 lines):

diff -r ae2e79a13089 -r 39f5205c28d0 doc/reST/api.rst
--- a/doc/reST/api.rst	Sun Jul 12 17:23:57 2015 -0500
+++ b/doc/reST/api.rst	Mon Jul 13 10:47:22 2015 -0500
@@ -339,10 +339,6 @@ statistics from the encoder::
 Cleanup
 =======
 
-	/* x265_encoder_log:
-	 *       This function is now deprecated */
-	void x265_encoder_log(x265_encoder *encoder, int argc, char **argv);
-
 Finally, the encoder must be closed in order to free all of its
 resources. An encoder that has been flushed cannot be restarted and
 reused. Once **x265_encoder_close()** has been called, the encoder
diff -r ae2e79a13089 -r 39f5205c28d0 doc/reST/cli.rst
--- a/doc/reST/cli.rst	Sun Jul 12 17:23:57 2015 -0500
+++ b/doc/reST/cli.rst	Mon Jul 13 10:47:22 2015 -0500
@@ -28,7 +28,7 @@ consider this an error and abort.
 
 Generally, when an option expects a string value from a list of strings
 the user may specify the integer ordinal of the value they desire. ie:
-:option:`--log-level` 4 is equivalent to :option:`--log-level` debug.
+:option:`--log-level` 3 is equivalent to :option:`--log-level` debug.
 
 Executable Options
 ==================
@@ -68,9 +68,8 @@ Logging/Statistic Options
 	0. error
 	1. warning
 	2. info **(default)**
-	3. frame
-	4. debug
-	5. full
+	3. debug
+	4. full
 
 .. option:: --no-progress
 
diff -r ae2e79a13089 -r 39f5205c28d0 source/CMakeLists.txt
--- a/source/CMakeLists.txt	Sun Jul 12 17:23:57 2015 -0500
+++ b/source/CMakeLists.txt	Mon Jul 13 10:47:22 2015 -0500
@@ -30,7 +30,7 @@ option(STATIC_LINK_CRT "Statically link 
 mark_as_advanced(FPROFILE_USE FPROFILE_GENERATE NATIVE_BUILD)
 
 # X265_BUILD must be incremented each time the public API is changed
-set(X265_BUILD 63)
+set(X265_BUILD 64)
 configure_file("${PROJECT_SOURCE_DIR}/x265.def.in"
                "${PROJECT_BINARY_DIR}/x265.def")
 configure_file("${PROJECT_SOURCE_DIR}/x265_config.h.in"
@@ -513,7 +513,6 @@ if(ENABLE_CLI)
                           output/raw.cpp)               # muxers
     source_group(input FILES ${InputFiles})
     source_group(output FILES ${OutputFiles})
-    source_group(filters FILES ${FilterFiles})
 
     check_include_files(getopt.h HAVE_GETOPT_H)
     if(NOT HAVE_GETOPT_H)
@@ -529,11 +528,11 @@ if(ENABLE_CLI)
 
     if(XCODE)
         # Xcode seems unable to link the CLI with libs, so link as one targget
-        add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT}
+        add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT}
                        x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp
                        $<TARGET_OBJECTS:encoder> $<TARGET_OBJECTS:common> ${YASM_OBJS} ${YASM_SRCS})
     else()
-        add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${FilterFiles} ${GETOPT} ${X265_RC_FILE}
+        add_executable(cli ../COPYING ${InputFiles} ${OutputFiles} ${GETOPT} ${X265_RC_FILE}
                        ${ExportDefs} x265.cpp x265.h x265cli.h x265-extras.h x265-extras.cpp)
         if(WIN32 OR NOT ENABLE_SHARED OR INTEL_CXX)
             # The CLI cannot link to the shared library on Windows, it
diff -r ae2e79a13089 -r 39f5205c28d0 source/common/framedata.h
--- a/source/common/framedata.h	Sun Jul 12 17:23:57 2015 -0500
+++ b/source/common/framedata.h	Mon Jul 13 10:47:22 2015 -0500
@@ -34,6 +34,9 @@ namespace X265_NS {
 class PicYuv;
 class JobProvider;
 
+#define INTER_MODES 4 // 2Nx2N, 2NxN, Nx2N, AMP modes
+#define INTRA_MODES 3 // DC, Planar, Angular modes
+
 /* Current frame stats for 2 pass */
 struct FrameStats
 {
@@ -49,6 +52,25 @@ struct FrameStats
     double      percent8x8Intra;
     double      percent8x8Inter;
     double      percent8x8Skip;
+    double      percentIntraNxN;
+    double      percentSkipCu[NUM_CU_DEPTH];
+    double      percentMergeCu[NUM_CU_DEPTH];
+    double      percentIntraDistribution[NUM_CU_DEPTH][INTRA_MODES];
+    double      percentInterDistribution[NUM_CU_DEPTH][3];           // 2Nx2N, RECT, AMP modes percentage
+
+    uint64_t    cntIntraNxN;
+    uint64_t    totalCu;
+    uint64_t    cntSkipCu[NUM_CU_DEPTH];
+    uint64_t    cntMergeCu[NUM_CU_DEPTH];
+    uint64_t    cntInter[NUM_CU_DEPTH];
+    uint64_t    cntIntra[NUM_CU_DEPTH];
+    uint64_t    cuInterDistribution[NUM_CU_DEPTH][INTER_MODES];
+    uint64_t    cuIntraDistribution[NUM_CU_DEPTH][INTRA_MODES];
+
+    FrameStats()
+    {
+        memset(this, 0, sizeof(FrameStats));
+    }
 };
 
 /* Per-frame data that is used during encodes and referenced while the picture
diff -r ae2e79a13089 -r 39f5205c28d0 source/common/param.h
--- a/source/common/param.h	Sun Jul 12 17:23:57 2015 -0500
+++ b/source/common/param.h	Mon Jul 13 10:47:22 2015 -0500
@@ -41,7 +41,7 @@ void  getParamAspectRatio(x265_param *p,
 bool  parseLambdaFile(x265_param *param);
 
 /* this table is kept internal to avoid confusion, since log level indices start at -1 */
-static const char * const logLevelNames[] = { "none", "error", "warning", "info", "frame", "debug", "full", 0 };
+static const char * const logLevelNames[] = { "none", "error", "warning", "info", "debug", "full", 0 };
 
 #if EXPORT_C_API
 #define PARAM_NS
diff -r ae2e79a13089 -r 39f5205c28d0 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sun Jul 12 17:23:57 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Mon Jul 13 10:47:22 2015 -0500
@@ -2116,6 +2116,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx2);
         p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx2);
 
+        p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
+
         ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);                        // calling luma_hvpp for all sizes
         p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;             // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4] 
 
@@ -2497,6 +2499,9 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
 
 #if X86_64
+        p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
+        p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
+        p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
         p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
         p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
 
@@ -3558,6 +3563,8 @@ void setupAssemblyPrimitives(EncoderPrim
         p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx2);
         p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx2);
 
+        p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
+
         if (cpuMask & X265_CPU_BMI2)
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
     }
diff -r ae2e79a13089 -r 39f5205c28d0 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Sun Jul 12 17:23:57 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Mon Jul 13 10:47:22 2015 -0500
@@ -29,6 +29,7 @@
 
 SECTION_RODATA 32
 pb_31:      times 32 db 31
+pb_124:     times 32 db 124
 pb_15:      times 32 db 15
 pb_movemask_32:  times 32 db 0x00
                  times 32 db 0xFF
@@ -41,6 +42,8 @@ cextern pw_2
 cextern pw_1023
 cextern pb_movemask
 cextern pw_1
+cextern hmul_16p
+cextern pb_4
 
 
 ;============================================================================================================
@@ -1984,3 +1987,295 @@ cglobal calSign, 4, 5, 6
 .end:
     RET
 %endif
+
+;--------------------------------------------------------------------------------------------------------------------------
+; saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+;--------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse4                       ; for each pixel of an endX x endY block: count[classIdx]++, stats[classIdx] += fenc - rec
+cglobal saoCuStatsBO, 7,12,6        ; r0=fenc r1=rec r2=stride r3=endX r4=endY r5=stats r6=count
+    mova        m3, [hmul_16p + 16] ; pmaddubsw multiplier; presumably +1/-1 byte pairs - confirm against hmul_16p
+    mova        m4, [pb_124]        ; 124 (0x7C) in every byte
+    mova        m5, [pb_4]          ; presumably 4 in every byte - confirm against pb_4
+    xor         r7d, r7d            ; r7 is used below as the per-pixel byte offset into stats/count
+
+.loopH:                             ; one pass per row
+    mov         r10, r0             ; r10 = fenc cursor within the current row
+    mov         r11, r1             ; r11 = rec cursor within the current row
+    mov         r9d, r3d            ; r9d = pixels left in this row (starts at endX)
+.loopL:                             ; handles up to 16 pixels per pass
+    movu        m1, [r11]           ; 16 bytes of rec (unaligned load)
+    movu        m0, [r10]           ; 16 bytes of fenc (unaligned load)
+
+    punpckhbw   m2, m0, m1          ; interleave fenc/rec bytes of pixels 8-15
+    punpcklbw   m0, m1              ; interleave fenc/rec bytes of pixels 0-7
+    psrlw       m1, 1               ; rec[x] >> boShift (word shift; bits leaking across bytes are masked off below)
+    pmaddubsw   m2, m3              ; 16-bit per-pixel results for pixels 8-15 (fenc - rec, given the multiplier above)
+    pmaddubsw   m0, m3              ; same for pixels 0-7
+    pand        m1, m4              ; (rec >> 1) & 124 == (rec >> 3) * 4 for 8-bit rec
+    paddb       m1, m5              ; add pb_4 per byte; with pb_4 = 4 this is the byte offset of int32 entry 1 + (rec >> 3)
+
+%assign x 0                         ; x = lane index within the 16-pixel group
+%rep 16                             ; fully unrolled: one copy of the body per lane
+    pextrb      r7d, m1, x          ; r7 = byte offset for this pixel (zero-extended)
+
+%if (x < 8)
+    pextrw      r8d, m0, (x % 8)    ; lanes 0-7 come from the low-half products
+%else
+    pextrw      r8d, m2, (x % 8)    ; lanes 8-15 come from the high-half products
+%endif
+    movsx       r8d, r8w            ; sign-extend the 16-bit difference to 32 bits
+    inc         dword  [r6 + r7]    ; count[classIdx]++
+    add         [r5 + r7], r8d      ; stats[classIdx] += (fenc[x] - rec[x]);
+    dec         r9d                 ; one pixel of the row consumed
+    jz          .next               ; row done once endX pixels have been processed
+%assign x x+1
+%endrep
+
+    add         r10, 16             ; more pixels remain: step both cursors to the next 16-pixel group
+    add         r11, 16
+    jmp         .loopL
+
+.next:
+    add         r0, r2              ; advance fenc to the next row (stride in r2)
+    add         r1, r2              ; advance rec to the next row
+    dec         r4d                 ; rows remaining (starts at endY)
+    jnz         .loopH
+    RET
+%endif
+
+;-----------------------------------------------------------------------------------------------------------------------
+; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+;-----------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsE0, 5,8,8, 0-32
+    mov         r3d, r3m
+
+    ; clear internal temporary buffer
+    pxor        m0, m0
+    mova        [rsp], m0
+    mova        [rsp + mmsize], m0
+    mova        m4, [pb_128]
+    mova        m5, [hmul_16p + 16]
+    mova        m6, [pb_2]
+    xor         r7d, r7d
+
+.loopH:
+    mov         r5d, r3d
+
+    ; calculate signLeft
+    mov         r7b, [r1]
+    sub         r7b, [r1 - 1]
+    seta        r7b
+    setb        r6b
+    sub         r7b, r6b
+    neg         r7b
+    pinsrb      m0, r7d, 15
+
+.loopL:
+    movu        m7, [r1]
+    movu        m2, [r1 + 1]
+
+    pxor        m1, m7, m4
+    pxor        m3, m2, m4
+    pcmpgtb     m2, m1, m3
+    pcmpgtb     m3, m1
+    pand        m2, [pb_1]
+    por         m2, m3              ; signRight
+
+    palignr     m3, m2, m0, 15
+    psignb      m3, m4              ; signLeft
+
+    mova        m0, m2
+    paddb       m2, m3
+    paddb       m2, m6              ; edgeType
+
+    ; stats[edgeType]
+    movu        m3, [r0]            ; fenc[0-15]
+    punpckhbw   m1, m3, m7
+    punpcklbw   m3, m7
+    pmaddubsw   m1, m5
+    pmaddubsw   m3, m5
+
+%assign x 0
+%rep 16
+    pextrb      r7d, m2, x
+
+%if (x < 8)
+    pextrw      r6d, m3, (x % 8)
+%else
+    pextrw      r6d, m1, (x % 8)
+%endif
+    movsx       r6d, r6w
+    inc         word [rsp + r7 * 2]             ; tmp_count[edgeType]++


More information about the x265-commits mailing list