[x265-commits] [x265] api: nits
Steve Borho
steve at borho.org
Tue Jun 30 18:38:47 CEST 2015
details: http://hg.videolan.org/x265/rev/2e0d851a4952
branches:
changeset: 10731:2e0d851a4952
user: Steve Borho <steve at borho.org>
date: Mon Jun 29 15:20:39 2015 -0500
description:
api: nits
Subject: [x265] asm: new algorithm x265_count_nonzero_4x4_avx2, Issue #152
details: http://hg.videolan.org/x265/rev/483c85f83f07
branches:
changeset: 10732:483c85f83f07
user: Min Chen <chenm003 at 163.com>
date: Mon Jun 29 12:18:38 2015 -0700
description:
asm: new algorithm x265_count_nonzero_4x4_avx2, Issue #152
Subject: [x265] asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)
details: http://hg.videolan.org/x265/rev/99ec5b951233
branches:
changeset: 10733:99ec5b951233
user: Rajesh Paulraj <rajesh at multicorewareinc.com>
date: Fri Jun 26 15:11:17 2015 +0530
description:
asm: avx2 10bit code for planecopy_cp(10660.20 -> 5685.80)
avx2:
planecopy_cp 19.36x 5685.80 110052.08
sse4:
planecopy_cp 9.65x 10660.20 102850.27
Subject: [x265] asm: intra_filter 10bpp sse4 code
details: http://hg.videolan.org/x265/rev/b13019448940
branches:
changeset: 10734:b13019448940
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Mon Jun 29 17:19:07 2015 +0530
description:
asm: intra_filter 10bpp sse4 code
Performance improved over C code:
intra_filter_32x32 7.46x 525.64 3922.56
intra_filter_16x16 6.53x 289.11 1886.86
intra_filter_8x8 5.60x 170.75 956.81
intra_filter_4x4 3.05x 121.20 369.74
Subject: [x265] cli: fix multilib CPU detect, use detected cpuid from param
details: http://hg.videolan.org/x265/rev/d9731802f1b7
branches:
changeset: 10735:d9731802f1b7
user: Steve Borho <steve at borho.org>
date: Tue Jun 30 10:58:56 2015 -0500
description:
cli: fix multilib CPU detect, use detected cpuid from param
If the linked library was built without assembly, param->cpuid will be 0 and
the help will then properly show that no assembly will be used.
Subject: [x265] primitives: remove cpuid parameter from x265_setup_primitives()
details: http://hg.videolan.org/x265/rev/38168ee95560
branches:
changeset: 10736:38168ee95560
user: Steve Borho <steve at borho.org>
date: Tue Jun 30 11:27:46 2015 -0500
description:
primitives: remove cpuid parameter from x265_setup_primitives()
Subject: [x265] cli: split x265_report_simd() from x265_setup_primitives()
details: http://hg.videolan.org/x265/rev/61df434bea40
branches:
changeset: 10737:61df434bea40
user: Steve Borho <steve at borho.org>
date: Tue Jun 30 11:36:55 2015 -0500
description:
cli: split x265_report_simd() from x265_setup_primitives()
The CLI shouldn't be calling into x265_setup_primitives() when all it wants to
do is report the SIMD architectures that the encoder will use.
diffstat:
source/common/common.h | 6 +-
source/common/primitives.cpp | 55 ++--
source/common/x86/asm-primitives.cpp | 6 +
source/common/x86/intrapred16.asm | 413 +++++++++++++++++++++++++++++++++++
source/common/x86/pixel-a.asm | 90 +++++++
source/common/x86/pixel-util8.asm | 19 +-
source/common/x86/pixel.h | 1 +
source/encoder/api.cpp | 2 +-
source/x265.cpp | 2 +-
source/x265.h | 4 +-
10 files changed, 553 insertions(+), 45 deletions(-)
diffs (truncated from 742 to 300 lines):
diff -r c0fc87075c75 -r 61df434bea40 source/common/common.h
--- a/source/common/common.h Mon Jun 29 11:47:39 2015 -0500
+++ b/source/common/common.h Tue Jun 30 11:36:55 2015 -0500
@@ -409,7 +409,7 @@ enum SignificanceMapContextType
/* located in pixel.cpp */
void extendPicBorder(pixel* recon, intptr_t stride, int width, int height, int marginX, int marginY);
-/* outside x265 namespace, but prefixed. defined in common.cpp */
+/* located in common.cpp */
int64_t x265_mdate(void);
#define x265_log(param, ...) general_log(param, "x265", __VA_ARGS__)
void general_log(const x265_param* param, const char* caller, int level, const char* fmt, ...);
@@ -424,7 +424,9 @@ void* x265_malloc(size_t size);
void x265_free(void *ptr);
char* x265_slurp_file(const char *filename);
-void x265_setup_primitives(x265_param* param, int cpu); /* primitives.cpp */
+/* located in primitives.cpp */
+void x265_setup_primitives(x265_param* param);
+void x265_report_simd(x265_param* param);
}
#include "constants.h"
diff -r c0fc87075c75 -r 61df434bea40 source/common/primitives.cpp
--- a/source/common/primitives.cpp Mon Jun 29 11:47:39 2015 -0500
+++ b/source/common/primitives.cpp Tue Jun 30 11:36:55 2015 -0500
@@ -188,36 +188,12 @@ void setupAliasPrimitives(EncoderPrimiti
p.chroma[X265_CSP_I422].cu[BLOCK_422_2x4].sse_pp = NULL;
}
-/* cpuid >= 0 - force CPU type
- * cpuid < 0 - auto-detect if uninitialized */
-void x265_setup_primitives(x265_param *param, int cpuid)
+void x265_report_simd(x265_param* param)
{
- if (cpuid < 0)
- cpuid = X265_NS::cpu_detect();
-
- // initialize global variables
- if (!primitives.pu[0].sad)
- {
- setupCPrimitives(primitives);
-
- /* We do not want the encoder to use the un-optimized intra all-angles
- * C references. It is better to call the individual angle functions
- * instead. We must check for NULL before using this primitive */
- for (int i = 0; i < NUM_TR_SIZE; i++)
- primitives.cu[i].intra_pred_allangs = NULL;
-
-#if ENABLE_ASSEMBLY
- setupInstrinsicPrimitives(primitives, cpuid);
- setupAssemblyPrimitives(primitives, cpuid);
-#else
- x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
-#endif
-
- setupAliasPrimitives(primitives);
- }
-
if (param->logLevel >= X265_LOG_INFO)
{
+ int cpuid = param->cpuid;
+
char buf[1000];
char *p = buf + sprintf(buf, "using cpu capabilities:");
char *none = p;
@@ -248,6 +224,31 @@ void x265_setup_primitives(x265_param *p
x265_log(param, X265_LOG_INFO, "%s\n", buf);
}
}
+
+void x265_setup_primitives(x265_param *param)
+{
+ if (!primitives.pu[0].sad)
+ {
+ setupCPrimitives(primitives);
+
+ /* We do not want the encoder to use the un-optimized intra all-angles
+ * C references. It is better to call the individual angle functions
+ * instead. We must check for NULL before using this primitive */
+ for (int i = 0; i < NUM_TR_SIZE; i++)
+ primitives.cu[i].intra_pred_allangs = NULL;
+
+#if ENABLE_ASSEMBLY
+ setupInstrinsicPrimitives(primitives, param->cpuid);
+ setupAssemblyPrimitives(primitives, param->cpuid);
+#else
+ x265_log(param, X265_LOG_WARNING, "Assembly not supported in this binary\n");
+#endif
+
+ setupAliasPrimitives(primitives);
+ }
+
+ x265_report_simd(param);
+}
}
#if ENABLE_ASSEMBLY
diff -r c0fc87075c75 -r 61df434bea40 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jun 29 11:47:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Tue Jun 30 11:36:55 2015 -0500
@@ -1120,6 +1120,11 @@ void setupAssemblyPrimitives(EncoderPrim
ALL_LUMA_PU(satd, pixel_satd, sse4);
ASSIGN_SA8D(sse4);
+ p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_sse4);
+ p.cu[BLOCK_8x8].intra_filter = PFX(intra_filter_8x8_sse4);
+ p.cu[BLOCK_16x16].intra_filter = PFX(intra_filter_16x16_sse4);
+ p.cu[BLOCK_32x32].intra_filter = PFX(intra_filter_32x32_sse4);
+
ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse4);
ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse4);
INTRA_ANG_SSE4_COMMON(sse4);
@@ -1518,6 +1523,7 @@ void setupAssemblyPrimitives(EncoderPrim
p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
p.weight_pp = PFX(weight_pp_avx2);
p.sign = PFX(calSign_avx2);
+ p.planecopy_cp = PFX(upShift_8_avx2);
p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
diff -r c0fc87075c75 -r 61df434bea40 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Mon Jun 29 11:47:39 2015 -0500
+++ b/source/common/x86/intrapred16.asm Tue Jun 30 11:36:55 2015 -0500
@@ -75,6 +75,9 @@ const pw_swap16, times 2 db 1
const pw_ang16_13, db 14, 15, 8, 9, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
const pw_ang16_16, db 0, 0, 0, 0, 0, 0, 10, 11, 8, 9, 6, 7, 2, 3, 0, 1
+intra_filter4_shuf0: db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
+intra_filter4_shuf1: db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
+
;; (blkSize - 1 - x)
pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0
@@ -21634,3 +21637,413 @@ cglobal intra_pred_ang32_33, 3,7,8
dec r4
jnz .loop
RET
+
+;-----------------------------------------------------------------------------------
+; void intra_filter_NxN(const pixel* references, pixel* filtered)
+;-----------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_filter_4x4, 2,4,5
+ mov r2w, word [r0 + 16] ; topLast
+ mov r3w, word [r0 + 32] ; LeftLast
+
+ ; filtering top
+ movu m0, [r0 + 0]
+ movu m1, [r0 + 16]
+ movu m2, [r0 + 32]
+
+ pshufb m4, m0, [intra_filter4_shuf0] ; [6 5 4 3 2 1 0 1] samples[i - 1]
+ palignr m3, m1, m0, 4
+ pshufb m3, [intra_filter4_shuf1] ; [8 7 6 5 4 3 2 9] samples[i + 1]
+
+ psllw m0, 1
+ paddw m4, m3
+ paddw m0, m4
+ paddw m0, [pw_2]
+ psrlw m0, 2
+
+ ; filtering left
+ palignr m4, m1, m1, 14
+ pinsrw m4, [r0], 1
+ palignr m3, m2, m1, 4
+ pshufb m3, [intra_filter4_shuf1]
+
+ psllw m1, 1
+ paddw m4, m3
+ paddw m1, m4
+ paddw m1, [pw_2]
+ psrlw m1, 2
+
+ movu [r1], m0
+ movu [r1 + 16], m1
+ mov [r1 + 16], r2w ; topLast
+ mov [r1 + 32], r3w ; LeftLast
+ RET
+
+INIT_XMM sse4
+cglobal intra_filter_8x8, 2,4,6
+ mov r2w, word [r0 + 32] ; topLast
+ mov r3w, word [r0 + 64] ; LeftLast
+
+ ; filtering top
+ movu m0, [r0]
+ movu m1, [r0 + 16]
+ movu m2, [r0 + 32]
+
+ pshufb m4, m0, [intra_filter4_shuf0]
+ palignr m5, m1, m0, 2
+ pinsrw m5, [r0 + 34], 0
+
+ palignr m3, m1, m0, 14
+ psllw m0, 1
+ paddw m4, m5
+ paddw m0, m4
+ paddw m0, [pw_2]
+ psrlw m0, 2
+
+ palignr m4, m2, m1, 2
+ psllw m1, 1
+ paddw m4, m3
+ paddw m1, m4
+ paddw m1, [pw_2]
+ psrlw m1, 2
+ movu [r1], m0
+ movu [r1 + 16], m1
+
+ ; filtering left
+ movu m1, [r0 + 48]
+ movu m0, [r0 + 64]
+
+ palignr m4, m2, m2, 14
+ pinsrw m4, [r0], 1
+ palignr m5, m1, m2, 2
+
+ palignr m3, m1, m2, 14
+ palignr m0, m1, 2
+
+ psllw m2, 1
+ paddw m4, m5
+ paddw m2, m4
+ paddw m2, [pw_2]
+ psrlw m2, 2
+
+ psllw m1, 1
+ paddw m0, m3
+ paddw m1, m0
+ paddw m1, [pw_2]
+ psrlw m1, 2
+
+ movu [r1 + 32], m2
+ movu [r1 + 48], m1
+ mov [r1 + 32], r2w ; topLast
+ mov [r1 + 64], r3w ; LeftLast
+ RET
+
+INIT_XMM sse4
+cglobal intra_filter_16x16, 2,4,6
+ mov r2w, word [r0 + 64] ; topLast
+ mov r3w, word [r0 + 128] ; LeftLast
+
+ ; filtering top
+ movu m0, [r0]
+ movu m1, [r0 + 16]
+ movu m2, [r0 + 32]
+
+ pshufb m4, m0, [intra_filter4_shuf0]
+ palignr m5, m1, m0, 2
+ pinsrw m5, [r0 + 66], 0
+
+ palignr m3, m1, m0, 14
+ psllw m0, 1
+ paddw m4, m5
+ paddw m0, m4
+ paddw m0, [pw_2]
+ psrlw m0, 2
+
+ palignr m4, m2, m1, 2
+ psllw m5, m1, 1
+ paddw m4, m3
+ paddw m5, m4
+ paddw m5, [pw_2]
+ psrlw m5, 2
+ movu [r1], m0
+ movu [r1 + 16], m5
+
+ movu m0, [r0 + 48]
+ movu m5, [r0 + 64]
+
+ palignr m3, m2, m1, 14
+ palignr m4, m0, m2, 2
+
+ psllw m1, m2, 1
+ paddw m3, m4
+ paddw m1, m3
+ paddw m1, [pw_2]
+ psrlw m1, 2
+
+ palignr m3, m0, m2, 14
+ palignr m4, m5, m0, 2
+
+ psllw m0, 1
+ paddw m4, m3
+ paddw m0, m4
+ paddw m0, [pw_2]
+ psrlw m0, 2
+ movu [r1 + 32], m1
+ movu [r1 + 48], m0
+
+ ; filtering left
+ movu m1, [r0 + 80]
+ movu m2, [r0 + 96]
+
+ palignr m4, m5, m5, 14
+ pinsrw m4, [r0], 1
+ palignr m0, m1, m5, 2
+
More information about the x265-commits
mailing list