[x265-commits] [x265] asm: intra_filter4x4 avx2 code, improved 8bit: 141c->118c...
Dnyaneshwar G
dnyaneshwar at multicorewareinc.com
Thu Jul 2 17:55:48 CEST 2015
details: http://hg.videolan.org/x265/rev/1e41b3bcd911
branches:
changeset: 10743:1e41b3bcd911
user: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
date: Tue Jun 30 16:52:40 2015 +0530
description:
asm: intra_filter4x4 avx2 code, improved 8bit: 141c->118c, 10bit: 121c->88c
Subject: [x265] asm: pixelavg_pp[8xN] sse2 code for 10bpp
details: http://hg.videolan.org/x265/rev/86ab0e0e5c1a
branches:
changeset: 10744:86ab0e0e5c1a
user: Rajesh Paulraj <rajesh at multicorewareinc.com>
date: Wed Jul 01 12:56:19 2015 +0530
description:
asm: pixelavg_pp[8xN] sse2 code for 10bpp
avg_pp[ 8x4] 5.12x 125.34 641.61
avg_pp[ 8x8] 5.95x 202.53 1205.34
avg_pp[ 8x16] 6.94x 334.54 2322.57
avg_pp[ 8x32] 8.15x 589.39 4806.23
Subject: [x265] testbench: fix bug in plane_copy_cp, detect outside bound now
details: http://hg.videolan.org/x265/rev/027ae320032e
branches:
changeset: 10745:027ae320032e
user: Min Chen <chenm003 at 163.com>
date: Wed Jul 01 17:05:48 2015 -0700
description:
testbench: fix bug in plane_copy_cp, detect outside bound now
Subject: [x265] asm: fix buffer overwrite bug in upShift_8_avx2
details: http://hg.videolan.org/x265/rev/76a314f91799
branches:
changeset: 10746:76a314f91799
user: Min Chen <chenm003 at 163.com>
date: Wed Jul 01 17:05:52 2015 -0700
description:
asm: fix buffer overwrite bug in upShift_8_avx2
diffstat:
source/common/x86/asm-primitives.cpp | 4 +
source/common/x86/intrapred16.asm | 27 ++++++++
source/common/x86/intrapred8.asm | 34 ++++++++++-
source/common/x86/mc-a.asm | 86 ++++++++++++++++++++++++++-
source/common/x86/pixel-a.asm | 107 ++++++++++++----------------------
source/test/pixelharness.cpp | 6 +-
6 files changed, 184 insertions(+), 80 deletions(-)
diffs (truncated from 383 to 300 lines):
diff -r 68d089360477 -r 76a314f91799 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jul 01 14:51:36 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Jul 01 17:05:52 2015 -0700
@@ -1290,6 +1290,8 @@ void setupAssemblyPrimitives(EncoderPrim
}
if (cpuMask & X265_CPU_AVX2)
{
+ p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
+
p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
@@ -2619,6 +2621,8 @@ void setupAssemblyPrimitives(EncoderPrim
#if X86_64
if (cpuMask & X265_CPU_AVX2)
{
+ p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
+
p.planecopy_sp = PFX(downShift_16_avx2);
p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
diff -r 68d089360477 -r 76a314f91799 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Wed Jul 01 14:51:36 2015 +0530
+++ b/source/common/x86/intrapred16.asm Wed Jul 01 17:05:52 2015 -0700
@@ -77,6 +77,7 @@ const pw_ang16_16, db
intra_filter4_shuf0: db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
intra_filter4_shuf1: db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
+intra_filter4_shuf2: times 2 db 4, 5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
;; (blkSize - 1 - x)
pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0
@@ -22047,3 +22048,29 @@ cglobal intra_filter_32x32, 2,4,6
mov [r1 + 128], r2w ; topLast
mov [r1 + 256], r3w ; LeftLast
RET
+
+INIT_YMM avx2
+cglobal intra_filter_4x4, 2,4,4
+ mov r2w, word [r0 + 16] ; topLast
+ mov r3w, word [r0 + 32] ; LeftLast
+
+ ; filtering top
+ movu m0, [r0]
+ vpbroadcastw m2, xm0
+ movu m1, [r0 + 16]
+
+ palignr m3, m0, m2, 14 ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
+ pshufb m3, [intra_filter4_shuf2] ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
+ palignr m1, m0, 4 ; [9 8 7 6 5 4 3 2]
+ palignr m1, m1, 14 ; [9 8 7 6 5 4 3 2]
+
+ psllw m0, 1
+ paddw m3, m1
+ paddw m0, m3
+ paddw m0, [pw_2]
+ psrlw m0, 2
+
+ movu [r1], m0
+ mov [r1 + 16], r2w ; topLast
+ mov [r1 + 32], r3w ; LeftLast
+ RET
diff -r 68d089360477 -r 76a314f91799 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Wed Jul 01 14:51:36 2015 +0530
+++ b/source/common/x86/intrapred8.asm Wed Jul 01 17:05:52 2015 -0700
@@ -30,8 +30,9 @@ SECTION_RODATA 32
intra_pred_shuff_0_8: times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
intra_pred_shuff_15_0: times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-intra_filter4_shuf0: db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
-intra_filter4_shuf1: db 14,15,0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
+intra_filter4_shuf0: times 2 db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
+intra_filter4_shuf1: times 2 db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13
+intra_filter4_shuf2: times 2 db 4, 5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
pb_0_8 times 8 db 0, 8
pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8
@@ -18690,3 +18691,32 @@ cglobal intra_filter_32x32, 2,4,6
mov [r1 + 64], r2b ; topLast
mov [r1 + 128], r3b ; LeftLast
RET
+
+INIT_YMM avx2
+cglobal intra_filter_4x4, 2,4,4
+ mov r2b, byte [r0 + 8] ; topLast
+ mov r3b, byte [r0 + 16] ; LeftLast
+
+ ; filtering top
+ pmovzxbw m0, [r0]
+ vpbroadcastw m2, xm0
+ pmovzxbw m1, [r0 + 8]
+
+ palignr m3, m0, m2, 14 ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0]
+ pshufb m3, [intra_filter4_shuf2] ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]
+ palignr m1, m0, 4 ; [9 8 7 6 5 4 3 2]
+ palignr m1, m1, 14 ; [9 8 7 6 5 4 3 2]
+
+ psllw m0, 1
+ paddw m3, m1
+ paddw m0, m3
+ paddw m0, [pw_2]
+ psrlw m0, 2
+
+ packuswb m0, m0
+ vpermq m0, m0, 10001000b
+
+ movu [r1], xm0
+ mov [r1 + 8], r2b ; topLast
+ mov [r1 + 16], r3b ; LeftLast
+ RET
diff -r 68d089360477 -r 76a314f91799 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Wed Jul 01 14:51:36 2015 +0530
+++ b/source/common/x86/mc-a.asm Wed Jul 01 17:05:52 2015 -0700
@@ -4009,6 +4009,87 @@ cglobal pixel_avg_w%1
AVG_END
%endmacro
+%macro pixel_avg_W8 0
+ movu m0, [r2]
+ movu m1, [r4]
+ pavgw m0, m1
+ movu [r0], m0
+ movu m2, [r2 + r3]
+ movu m3, [r4 + r5]
+ pavgw m2, m3
+ movu [r0 + r1], m2
+
+ movu m0, [r2 + r3 * 2]
+ movu m1, [r4 + r5 * 2]
+ pavgw m0, m1
+ movu [r0 + r1 * 2], m0
+ movu m2, [r2 + r6]
+ movu m3, [r4 + r7]
+ pavgw m2, m3
+ movu [r0 + r8], m2
+
+ lea r0, [r0 + 4 * r1]
+ lea r2, [r2 + 4 * r3]
+ lea r4, [r4 + 4 * r5]
+%endmacro
+
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal pixel_avg_8x4, 6,9,4
+ add r1d, r1d
+ add r3d, r3d
+ add r5d, r5d
+ lea r6, [r3 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+ pixel_avg_W8
+ RET
+
+cglobal pixel_avg_8x8, 6,9,4
+ add r1d, r1d
+ add r3d, r3d
+ add r5d, r5d
+ lea r6, [r3 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+ pixel_avg_W8
+ pixel_avg_W8
+ RET
+
+cglobal pixel_avg_8x16, 6,10,4
+ add r1d, r1d
+ add r3d, r3d
+ add r5d, r5d
+ lea r6, [r3 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+ mov r9d, 4
+.loop
+ pixel_avg_W8
+ dec r9d
+ jnz .loop
+ RET
+
+cglobal pixel_avg_8x32, 6,10,4
+ add r1d, r1d
+ add r3d, r3d
+ add r5d, r5d
+ lea r6, [r3 * 3]
+ lea r7, [r5 * 3]
+ lea r8, [r1 * 3]
+ mov r9d, 8
+.loop
+ pixel_avg_W8
+ dec r9d
+ jnz .loop
+ RET
+%endif
+%endif
+
%if HIGH_BIT_DEPTH
INIT_MMX mmx2
@@ -4060,11 +4141,6 @@ AVGH 4, 8
AVGH 4, 4
AVGH 4, 2
-AVG_FUNC 8, movdqu, movdqa
-AVGH 8, 32
-AVGH 8, 16
-AVGH 8, 8
-AVGH 8, 4
AVG_FUNC 16, movdqu, movdqa
AVGH 16, 64
diff -r 68d089360477 -r 76a314f91799 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Wed Jul 01 14:51:36 2015 +0530
+++ b/source/common/x86/pixel-a.asm Wed Jul 01 17:05:52 2015 -0700
@@ -69,6 +69,7 @@ cextern pd_1
cextern popcnt_table
cextern pd_2
cextern hmul_16p
+cextern pb_movemask
;=============================================================================
; SATD
@@ -7299,10 +7300,9 @@ cglobal downShift_16, 6,7,3
;void planecopy_cp(uint8_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift)
;---------------------------------------------------------------------------------------------------------------------
INIT_XMM sse4
-cglobal upShift_8, 7,7,3
-
- movd m2, r6d ; m0 = shift
- add r3, r3
+cglobal upShift_8, 6,7,3
+ movd m2, r6m ; m0 = shift
+ add r3d, r3d
dec r5d
.loopH:
@@ -7393,88 +7393,55 @@ cglobal upShift_8, 7,7,3
;---------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_YMM avx2
-cglobal upShift_8, 7,8,3
- movd xm2, r6d
- add r3, r3
+cglobal upShift_8, 6,7,4
+ movd xm2, r6m
+ add r3d, r3d
+ dec r5d
.loopH:
- xor r7, r7
- mov r6d, r4d
+ xor r6, r6
.loopW:
- pmovzxbw m0,[r0 + r7]
- pmovzxbw m1,[r0 + r7 + 16]
+ pmovzxbw m0,[r0 + r6]
+ pmovzxbw m1,[r0 + r6 + mmsize/2]
psllw m0, xm2
psllw m1, xm2
- movu [r2 + r7 * 2], m0
- movu [r2 + r7 * 2 + 32], m1
-
- add r7d, 32
- sub r6d, 32
- jg .loopW
+ movu [r2 + r6 * 2], m0
+ movu [r2 + r6 * 2 + mmsize], m1
+
+ add r6d, mmsize
+ cmp r6d, r4d
+ jl .loopW
; move to next row
add r0, r1
add r2, r3
dec r5d
- jnz .loopH
-
-;processing last row of every frame [To handle width which not a multiple of 16]
-
-.loop16:
+ jg .loopH
+
+ ; processing last row of every frame [To handle width which not a multiple of 32]
+ lea r3, [pb_movemask + 16]
+ mov r5d, 15
+ and r5d, r4d
+ sub r3, r5
+ pmovsxbw m3, [r3]
+
+ ; NOTE: Width MUST BE more than or equal to 16
+ shr r4d, 4
+.loopW2:
pmovzxbw m0,[r0]
psllw m0, xm2
movu [r2], m0
-
- add r0, mmsize
More information about the x265-commits
mailing list