[x265-commits] [x265] api: nits

Steve Borho steve at borho.org
Tue Jun 30 18:38:47 CEST 2015


details:   http://hg.videolan.org/x265/rev/2e0d851a4952
branches:  
changeset: 10731:2e0d851a4952
user:      Steve Borho <steve at borho.org>
date:      Mon Jun 29 15:20:39 2015 -0500
description:
api: nits
Subject: [x265] asm: new algorithm x265_count_nonzero_4x4_avx2, Issue #152

details:   http://hg.videolan.org/x265/rev/483c85f83f07
branches:  
changeset: 10732:483c85f83f07
user:      Min Chen <chenm003 at 163.com>
date:      Mon Jun 29 12:18:38 2015 -0700
description:
asm: new algorithm x265_count_nonzero_4x4_avx2, Issue #152
Subject: [x265] asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)

details:   http://hg.videolan.org/x265/rev/99ec5b951233
branches:  
changeset: 10733:99ec5b951233
user:      Rajesh Paulraj<rajesh at multicorewareinc.com>
date:      Fri Jun 26 15:11:17 2015 +0530
description:
asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)

avx2:
planecopy_cp  19.36x   5685.80         110052.08

sse4:
planecopy_cp  9.65x    10660.20        102850.27
Subject: [x265] asm: intra_filter 10bpp sse4 code

details:   http://hg.videolan.org/x265/rev/b13019448940
branches:  
changeset: 10734:b13019448940
user:      Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date:      Mon Jun 29 17:19:07 2015 +0530
description:
asm: intra_filter 10bpp sse4 code

Performance improved over C code:
intra_filter_32x32 7.46x    525.64          3922.56
intra_filter_16x16 6.53x    289.11          1886.86
intra_filter_8x8   5.60x    170.75          956.81
intra_filter_4x4   3.05x    121.20          369.74
Subject: [x265] cli: fix multilib CPU detect, use detected cpuid from param

details:   http://hg.videolan.org/x265/rev/d9731802f1b7
branches:  
changeset: 10735:d9731802f1b7
user:      Steve Borho <steve at borho.org>
date:      Tue Jun 30 10:58:56 2015 -0500
description:
cli: fix multilib CPU detect, use detected cpuid from param

If the linked library was built without assembly, param->cpuid will be 0 and
the help will then correctly show that no assembly will be used.
Subject: [x265] primitives: remove cpuid parameter from x265_setup_primitives()

details:   http://hg.videolan.org/x265/rev/38168ee95560
branches:  
changeset: 10736:38168ee95560
user:      Steve Borho <steve at borho.org>
date:      Tue Jun 30 11:27:46 2015 -0500
description:
primitives: remove cpuid parameter from x265_setup_primitives()
Subject: [x265] cli: split x265_report_simd() from x265_setup_primitives()

details:   http://hg.videolan.org/x265/rev/61df434bea40
branches:  
changeset: 10737:61df434bea40
user:      Steve Borho <steve at borho.org>
date:      Tue Jun 30 11:36:55 2015 -0500
description:
cli: split x265_report_simd() from x265_setup_primitives()

The CLI shouldn't be calling into x265_setup_primitives() when all it wants to
do is report the SIMD architectures that the encoder will use.

diffstat:

 source/common/common.h               |    6 +-
 source/common/primitives.cpp         |   55 ++--
 source/common/x86/asm-primitives.cpp |    6 +
 source/common/x86/intrapred16.asm    |  413 +++++++++++++++++++++++++++++++++++
 source/common/x86/pixel-a.asm        |   90 +++++++
 source/common/x86/pixel-util8.asm    |   19 +-
 source/common/x86/pixel.h            |    1 +
 source/encoder/api.cpp               |    2 +-
 source/x265.cpp                      |    2 +-
 source/x265.h                        |    4 +-
 10 files changed, 553 insertions(+), 45 deletions(-)

diffs (truncated from 742 to 300 lines):

diff -r c0fc87075c75 -r 61df434bea40 source/common/common.h
--- a/source/common/common.h	Mon Jun 29 11:47:39 2015 -0500
+++ b/source/common/common.h	Tue Jun 30 11:36:55 2015 -0500
@@ -409,7 +409,7 @@ enum SignificanceMapContextType
 /* located in pixel.cpp */
 void extendPicBorder(pixel* recon, intptr_t stride, int width, int height, int marginX, int marginY);
 
-/* outside x265 namespace, but prefixed. defined in common.cpp */
+/* located in common.cpp */
 int64_t  x265_mdate(void);
 #define  x265_log(param, ...) general_log(param, "x265", __VA_ARGS__)
 void     general_log(const x265_param* param, const char* caller, int level, const char* fmt, ...);
@@ -424,7 +424,9 @@ void*    x265_malloc(size_t size);
 void     x265_free(void *ptr);
 char*    x265_slurp_file(const char *filename);
 
-void     x265_setup_primitives(x265_param* param, int cpu); /* primitives.cpp */
+/* located in primitives.cpp */
+void     x265_setup_primitives(x265_param* param);
+void     x265_report_simd(x265_param* param);
 }
 
 #include "constants.h"
diff -r c0fc87075c75 -r 61df434bea40 source/common/primitives.cpp
--- a/source/common/primitives.cpp	Mon Jun 29 11:47:39 2015 -0500
+++ b/source/common/primitives.cpp	Tue Jun 30 11:36:55 2015 -0500
@@ -188,36 +188,12 @@ void setupAliasPrimitives(EncoderPrimiti
     p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sse_pp = NULL;
 }
 
-/* cpuid >= 0 - force CPU type
- * cpuid < 0  - auto-detect if uninitialized */
-void x265_setup_primitives(x265_param *param, int cpuid)
+void x265_report_simd(x265_param* param)
 {
-    if (cpuid < 0)
-        cpuid = X265_NS::cpu_detect();
-
-    // initialize global variables
-    if (!primitives.pu[0].sad)
-    {
-        setupCPrimitives(primitives);
-
-        /* We do not want the encoder to use the un-optimized intra all-angles
-         * C references. It is better to call the individual angle functions
-         * instead. We must check for NULL before using this primitive */
-        for (int i = 0; i < NUM_TR_SIZE; i++)
-            primitives.cu[i].intra_pred_allangs = NULL;
-
-#if ENABLE_ASSEMBLY
-        setupInstrinsicPrimitives(primitives, cpuid);
-        setupAssemblyPrimitives(primitives, cpuid);
-#else
-        x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
-#endif
-
-        setupAliasPrimitives(primitives);
-    }
-
     if (param->logLevel >= X265_LOG_INFO)
     {
+        int cpuid = param->cpuid;
+
         char buf[1000];
         char *p = buf + sprintf(buf, "using cpu capabilities:");
         char *none = p;
@@ -248,6 +224,31 @@ void x265_setup_primitives(x265_param *p
         x265_log(param, X265_LOG_INFO, "%s\n", buf);
     }
 }
+
+void x265_setup_primitives(x265_param *param)
+{
+    if (!primitives.pu[0].sad)
+    {
+        setupCPrimitives(primitives);
+
+        /* We do not want the encoder to use the un-optimized intra all-angles
+         * C references. It is better to call the individual angle functions
+         * instead. We must check for NULL before using this primitive */
+        for (int i = 0; i < NUM_TR_SIZE; i++)
+            primitives.cu[i].intra_pred_allangs = NULL;
+
+#if ENABLE_ASSEMBLY
+        setupInstrinsicPrimitives(primitives, param->cpuid);
+        setupAssemblyPrimitives(primitives, param->cpuid);
+#else
+        x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
+#endif
+
+        setupAliasPrimitives(primitives);
+    }
+
+    x265_report_simd(param);
+}
 }
 
 #if ENABLE_ASSEMBLY
diff -r c0fc87075c75 -r 61df434bea40 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jun 29 11:47:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Tue Jun 30 11:36:55 2015 -0500
@@ -1120,6 +1120,11 @@ void setupAssemblyPrimitives(EncoderPrim
         ALL_LUMA_PU(satd, pixel_satd, sse4);
         ASSIGN_SA8D(sse4);
 
+        p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
+        p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
+        p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
+        p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
+
         ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
         ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
         INTRA_ANG_SSE4_COMMON(sse4);
@@ -1518,6 +1523,7 @@ void setupAssemblyPrimitives(EncoderPrim
         p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
         p.weight_pp = PFX(weight_pp_avx2);
         p.sign = PFX(calSign_avx2);
+        p.planecopy_cp = PFX(upShift_8_avx2);
 
         p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
         p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
diff -r c0fc87075c75 -r 61df434bea40 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Mon Jun 29 11:47:39 2015 -0500
+++ b/source/common/x86/intrapred16.asm	Tue Jun 30 11:36:55 2015 -0500
@@ -75,6 +75,9 @@ const pw_swap16,            times 2 db 1
 const pw_ang16_13,                  db 14, 15,  8,  9,  0,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
 const pw_ang16_16,                  db  0,  0,  0,  0,  0,  0, 10, 11,  8,  9,  6,  7,  2,  3,  0,  1
 
+intra_filter4_shuf0:                db  2,  3,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10 ,11, 12, 13
+intra_filter4_shuf1:                db 14, 15,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10 ,11, 12, 13
+
 ;; (blkSize - 1 - x)
 pw_planar4_0:                       dw  3,  2,  1,  0,  3,  2,  1,  0
 
@@ -21634,3 +21637,413 @@ cglobal intra_pred_ang32_33, 3,7,8
     dec    r4
     jnz    .loop
     RET
+
+;-----------------------------------------------------------------------------------
+; void intra_filter_NxN(const pixel* references, pixel* filtered)
+;-----------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_filter_4x4, 2,4,5
+    mov             r2w, word [r0 + 16]             ; topLast
+    mov             r3w, word [r0 + 32]             ; LeftLast
+
+    ; filtering top
+    movu            m0, [r0 +  0]
+    movu            m1, [r0 + 16]
+    movu            m2, [r0 + 32]
+
+    pshufb          m4, m0, [intra_filter4_shuf0]   ; [6 5 4 3 2 1 0 1] samples[i - 1]
+    palignr         m3, m1, m0, 4
+    pshufb          m3, [intra_filter4_shuf1]       ; [8 7 6 5 4 3 2 9] samples[i + 1]
+
+    psllw           m0, 1
+    paddw           m4, m3
+    paddw           m0, m4
+    paddw           m0, [pw_2]
+    psrlw           m0, 2
+
+    ; filtering left
+    palignr         m4, m1, m1, 14
+    pinsrw          m4, [r0], 1
+    palignr         m3, m2, m1, 4
+    pshufb          m3, [intra_filter4_shuf1]
+
+    psllw           m1, 1
+    paddw           m4, m3
+    paddw           m1, m4
+    paddw           m1, [pw_2]
+    psrlw           m1, 2
+
+    movu            [r1], m0
+    movu            [r1 + 16], m1
+    mov             [r1 + 16], r2w                  ; topLast
+    mov             [r1 + 32], r3w                  ; LeftLast
+    RET
+
+INIT_XMM sse4
+cglobal intra_filter_8x8, 2,4,6
+    mov             r2w, word [r0 + 32]             ; topLast
+    mov             r3w, word [r0 + 64]             ; LeftLast
+
+    ; filtering top
+    movu            m0, [r0]
+    movu            m1, [r0 + 16]
+    movu            m2, [r0 + 32]
+
+    pshufb          m4, m0, [intra_filter4_shuf0]
+    palignr         m5, m1, m0, 2
+    pinsrw          m5, [r0 + 34], 0
+
+    palignr         m3, m1, m0, 14
+    psllw           m0, 1
+    paddw           m4, m5
+    paddw           m0, m4
+    paddw           m0, [pw_2]
+    psrlw           m0, 2
+
+    palignr         m4, m2, m1, 2
+    psllw           m1, 1
+    paddw           m4, m3
+    paddw           m1, m4
+    paddw           m1, [pw_2]
+    psrlw           m1, 2
+    movu            [r1], m0
+    movu            [r1 + 16], m1
+
+    ; filtering left
+    movu            m1, [r0 + 48]
+    movu            m0, [r0 + 64]
+
+    palignr         m4, m2, m2, 14
+    pinsrw          m4, [r0], 1
+    palignr         m5, m1, m2, 2
+
+    palignr         m3, m1, m2, 14
+    palignr         m0, m1, 2
+
+    psllw           m2, 1
+    paddw           m4, m5
+    paddw           m2, m4
+    paddw           m2, [pw_2]
+    psrlw           m2, 2
+
+    psllw           m1, 1
+    paddw           m0, m3
+    paddw           m1, m0
+    paddw           m1, [pw_2]
+    psrlw           m1, 2
+
+    movu            [r1 + 32], m2
+    movu            [r1 + 48], m1
+    mov             [r1 + 32], r2w                  ; topLast
+    mov             [r1 + 64], r3w                  ; LeftLast
+    RET
+
+INIT_XMM sse4
+cglobal intra_filter_16x16, 2,4,6
+    mov             r2w, word [r0 +  64]            ; topLast
+    mov             r3w, word [r0 + 128]            ; LeftLast
+
+    ; filtering top
+    movu            m0, [r0]
+    movu            m1, [r0 + 16]
+    movu            m2, [r0 + 32]
+
+    pshufb          m4, m0, [intra_filter4_shuf0]
+    palignr         m5, m1, m0, 2
+    pinsrw          m5, [r0 + 66], 0
+
+    palignr         m3, m1, m0, 14
+    psllw           m0, 1
+    paddw           m4, m5
+    paddw           m0, m4
+    paddw           m0, [pw_2]
+    psrlw           m0, 2
+
+    palignr         m4, m2, m1, 2
+    psllw           m5, m1, 1
+    paddw           m4, m3
+    paddw           m5, m4
+    paddw           m5, [pw_2]
+    psrlw           m5, 2
+    movu            [r1], m0
+    movu            [r1 + 16], m5
+
+    movu            m0, [r0 + 48]
+    movu            m5, [r0 + 64]
+
+    palignr         m3, m2, m1, 14
+    palignr         m4, m0, m2, 2
+
+    psllw           m1, m2, 1
+    paddw           m3, m4
+    paddw           m1, m3
+    paddw           m1, [pw_2]
+    psrlw           m1, 2
+
+    palignr         m3, m0, m2, 14
+    palignr         m4, m5, m0, 2
+
+    psllw           m0, 1
+    paddw           m4, m3
+    paddw           m0, m4
+    paddw           m0, [pw_2]
+    psrlw           m0, 2
+    movu            [r1 + 32], m1
+    movu            [r1 + 48], m0
+
+    ; filtering left
+    movu            m1, [r0 + 80]
+    movu            m2, [r0 + 96]
+
+    palignr         m4, m5, m5, 14
+    pinsrw          m4, [r0], 1
+    palignr         m0, m1, m5, 2
+


More information about the x265-commits mailing list