[x265-commits] [x265] doc: update limit-refs documentation
Deepthi Nandakumar
deepthi at multicorewareinc.com
Fri Jul 17 05:17:10 CEST 2015
details: http://hg.videolan.org/x265/rev/3c7fef832387
branches:
changeset: 10829:3c7fef832387
user: Deepthi Nandakumar <deepthi at multicorewareinc.com>
date: Thu Jul 16 19:03:03 2015 +0530
description:
doc: update limit-refs documentation
Subject: [x265] stats: fix skip cu count for 2 pass
details: http://hg.videolan.org/x265/rev/269f46d0b34e
branches:
changeset: 10830:269f46d0b34e
user: Divya Manivannan <divya at multicorewareinc.com>
date: Wed Jul 15 16:18:53 2015 +0530
description:
stats: fix skip cu count for 2 pass
Subject: [x265] asm: fix pixel_var_sse2 in Main12
details: http://hg.videolan.org/x265/rev/22bad629bba9
branches:
changeset: 10831:22bad629bba9
user: Min Chen <chenm003 at 163.com>
date: Wed Jul 15 17:09:52 2015 -0700
description:
asm: fix pixel_var_sse2 in Main12
Subject: [x265] asm: disable error SSE2 functions for Main12
details: http://hg.videolan.org/x265/rev/d5ac612bb5bc
branches:
changeset: 10832:d5ac612bb5bc
user: Min Chen <chenm003 at 163.com>
date: Wed Jul 15 17:09:54 2015 -0700
description:
asm: disable error SSE2 functions for Main12
Subject: [x265] asm: fix pixel_satd_4x16_sse2 overflow in Main12
details: http://hg.videolan.org/x265/rev/7e3315972ad2
branches:
changeset: 10833:7e3315972ad2
user: Min Chen <chenm003 at 163.com>
date: Wed Jul 15 17:09:57 2015 -0700
description:
asm: fix pixel_satd_4x16_sse2 overflow in Main12
Subject: [x265] asm: rewrite partial process code in upShift_8_avx2 to avoid Mac crash bug
details: http://hg.videolan.org/x265/rev/b2ba7df1fc69
branches:
changeset: 10834:b2ba7df1fc69
user: Min Chen <chenm003 at 163.com>
date: Thu Jul 16 19:36:35 2015 -0700
description:
asm: rewrite partial process code in upShift_8_avx2 to avoid Mac crash bug
diffstat:
doc/reST/cli.rst | 3 +-
source/common/x86/asm-primitives.cpp | 4 +
source/common/x86/pixel-a.asm | 28 +-
source/common/x86/pixel-util8.asm | 468 ++++++++++++++++++++++++++--------
source/encoder/frameencoder.cpp | 2 +-
5 files changed, 375 insertions(+), 130 deletions(-)
diffs (truncated from 630 to 300 lines):
diff -r 8efce8620ae2 -r b2ba7df1fc69 doc/reST/cli.rst
--- a/doc/reST/cli.rst Tue Jul 14 16:29:46 2015 -0700
+++ b/doc/reST/cli.rst Thu Jul 16 19:36:35 2015 -0700
@@ -675,8 +675,7 @@ the prediction quad-tree.
(within your decoder level limits) if you enable one or
both of these flags.
- This feature is EXPERIMENTAL and currently only functional at RD
- levels 0 through 4
+ This feature is EXPERIMENTAL and functional at all RD levels.
.. option:: --rect, --no-rect
diff -r 8efce8620ae2 -r b2ba7df1fc69 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 14 16:29:46 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Thu Jul 16 19:36:35 2015 -0700
@@ -917,7 +917,9 @@ void setupAssemblyPrimitives(EncoderPrim
p.pu[LUMA_4x4].satd = p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_mmx2);
ALL_LUMA_PU(satd, pixel_satd, sse2);
+#if X265_DEPTH <= 10
ASSIGN_SA8D(sse2);
+#endif /* X265_DEPTH <= 10 */
LUMA_PIXELSUB(sse2);
CHROMA_420_PIXELSUB_PS(sse2);
CHROMA_422_PIXELSUB_PS(sse2);
@@ -958,7 +960,9 @@ void setupAssemblyPrimitives(EncoderPrim
ALL_LUMA_TU_S(calcresidual, getResidual, sse2);
ALL_LUMA_TU_S(transpose, transpose, sse2);
+#if X265_DEPTH <= 10
ALL_LUMA_TU_S(intra_pred[PLANAR_IDX], intra_pred_planar, sse2);
+#endif /* X265_DEPTH <= 10 */
ALL_LUMA_TU_S(intra_pred[DC_IDX], intra_pred_dc, sse2);
p.cu[BLOCK_4x4].intra_pred[2] = PFX(intra_pred_ang4_2_sse2);
diff -r 8efce8620ae2 -r b2ba7df1fc69 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Jul 14 16:29:46 2015 -0700
+++ b/source/common/x86/pixel-a.asm Thu Jul 16 19:36:35 2015 -0700
@@ -643,7 +643,7 @@ cglobal pixel_satd_4x16, 4, 6, 8
lea r0, [r0+r1*2*SIZEOF_PIXEL]
lea r2, [r2+r3*2*SIZEOF_PIXEL]
SATD_4x8_SSE vertical, 1, add
- HADDW m7, m1
+ HADDUW m7, m1
movd eax, m7
RET
@@ -7394,7 +7394,7 @@ cglobal upShift_8, 6,7,3
;---------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_YMM avx2
-cglobal upShift_8, 6,7,4
+cglobal upShift_8, 6,7,3
movd xm2, r6m
add r3d, r3d
dec r5d
@@ -7420,29 +7420,25 @@ cglobal upShift_8, 6,7,4
jg .loopH
; processing last row of every frame [To handle width which not a multiple of 32]
- lea r3, [pb_movemask + 16]
- mov r5d, 15
- and r5d, r4d
- sub r3, r5
- pmovsxbw m3, [r3]
+ mov r1d, 15
+ and r1d, r4d
+ sub r1, mmsize/2
; NOTE: Width MUST BE more than or equal to 16
- shr r4d, 4
-.loopW2:
+ shr r4d, 4 ; log2(mmsize)
+.loopW16:
pmovzxbw m0,[r0]
psllw m0, xm2
movu [r2], m0
add r0, mmsize/2
add r2, mmsize
dec r4d
- jg .loopW2
-
-.nextW2:
- ; process partial of 16
- pmovzxbw m0,[r0]
+ jg .loopW16
+
+ ; Mac OS X can't read beyond array bound, so rollback some bytes
+ pmovzxbw m0,[r0 + r1]
psllw m0, xm2
- vpblendvb m0, m0, [r2], m3
- movu [r2], m0
+ movu [r2 + r1 * 2], m0
RET
%endif
diff -r 8efce8620ae2 -r b2ba7df1fc69 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Jul 14 16:29:46 2015 -0700
+++ b/source/common/x86/pixel-util8.asm Thu Jul 16 19:36:35 2015 -0700
@@ -5779,6 +5779,19 @@ cglobal pixel_sub_ps_64x64, 6, 7, 8, des
RET
%endmacro
+%macro VAR_END_12bit 2
+ HADDD m5, m1
+ HADDD m6, m1
+%if ARCH_X86_64
+ punpckldq m5, m6
+ movq rax, m5
+%else
+ movd eax, m5
+ movd edx, m6
+%endif
+ RET
+%endmacro
+
%macro VAR_CORE 0
paddw m5, m0
paddw m5, m3
@@ -5794,9 +5807,9 @@ cglobal pixel_sub_ps_64x64, 6, 7, 8, des
paddd m6, m4
%endmacro
-%macro VAR_2ROW 3
+%macro VAR_2ROW 2
mov r2d, %2
-.loop%3:
+%%loop:
%if HIGH_BIT_DEPTH
movu m0, [r0]
movu m1, [r0+mmsize]
@@ -5820,7 +5833,7 @@ cglobal pixel_sub_ps_64x64, 6, 7, 8, des
%endif ; !HIGH_BIT_DEPTH
VAR_CORE
dec r2d
- jg .loop%3
+ jg %%loop
%endmacro
;-----------------------------------------------------------------------------
@@ -5830,23 +5843,361 @@ INIT_MMX mmx2
cglobal pixel_var_16x16, 2,3
FIX_STRIDES r1
VAR_START 0
- VAR_2ROW 8*SIZEOF_PIXEL, 16, 1
+ VAR_2ROW 8*SIZEOF_PIXEL, 16
VAR_END 16, 16
cglobal pixel_var_8x8, 2,3
FIX_STRIDES r1
VAR_START 0
- VAR_2ROW r1, 4, 1
+ VAR_2ROW r1, 4
VAR_END 8, 8
%if HIGH_BIT_DEPTH
%macro VAR 0
+
+%if BIT_DEPTH <= 10
cglobal pixel_var_16x16, 2,3,8
FIX_STRIDES r1
VAR_START 0
- VAR_2ROW r1, 8, 1
+ VAR_2ROW r1, 8
VAR_END 16, 16
+cglobal pixel_var_32x32, 2,6,8
+ FIX_STRIDES r1
+ mov r3, r0
+ VAR_START 0
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r4d, m5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ lea r0, [r3 + 32]
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ VAR_END 32, 32
+
+cglobal pixel_var_64x64, 2,6,8
+ FIX_STRIDES r1
+ mov r3, r0
+ VAR_START 0
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r4d, m5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ lea r0, [r3 + 32]
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ lea r0, [r3 + 64]
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ lea r0, [r3 + 96]
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ HADDW m5, m2
+ movd r5d, m5
+ add r4, r5
+ pxor m5, m5
+ VAR_2ROW r1, 8
+ VAR_END 64, 64
+
+%else ; BIT_DEPTH <= 10
+
+cglobal pixel_var_16x16, 2,3,8
+ FIX_STRIDES r1
+ VAR_START 0
+ VAR_2ROW r1, 4
+ HADDUWD m5, m1
+ mova m7, m5
+ pxor m5, m5
+ VAR_2ROW r1, 4
+ HADDUWD m5, m1
+ paddd m5, m7
+ VAR_END_12bit 16, 16
+
+cglobal pixel_var_32x32, 2,6,8
+ FIX_STRIDES r1
+ mov r3, r0
+ VAR_START 0
+
+ VAR_2ROW r1, 4
+ HADDUWD m5, m1
+ mova m7, m5
+
+ pxor m5, m5
+ VAR_2ROW r1, 4
+ HADDUWD m5, m1
+ paddd m7, m5
More information about the x265-commits
mailing list