[x265] [PATCH] asm: intra_filter4x4 avx2 code, improved 8bit: 141c->118c, 10bit: 121c->88c
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue Jun 30 14:02:14 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1435663360 -19800
# Tue Jun 30 16:52:40 2015 +0530
# Node ID 9340454d3b551f57ba9ce6a3f77fade041975e62
# Parent b1301944894051b9641006797e4d6253b277f3e4
asm: intra_filter4x4 avx2 code, improved 8bit: 141c->118c, 10bit: 121c->88c
diff -r b13019448940 -r 9340454d3b55 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jun 29 17:19:07 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jun 30 16:52:40 2015 +0530
@@ -1290,6 +1290,8 @@
}
if (cpuMask & X265_CPU_AVX2)
{
+ p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
+
p.saoCuOrgE0 = PFX(saoCuOrgE0_avx2);
p.saoCuOrgE1 = PFX(saoCuOrgE1_avx2);
p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_avx2);
@@ -2619,6 +2621,8 @@
#if X86_64
if (cpuMask & X265_CPU_AVX2)
{
+ p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
+
p.planecopy_sp = PFX(downShift_16_avx2);
p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
diff -r b13019448940 -r 9340454d3b55 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Mon Jun 29 17:19:07 2015 +0530
+++ b/source/common/x86/intrapred16.asm Tue Jun 30 16:52:40 2015 +0530
@@ -77,6 +77,7 @@
intra_filter4_shuf0: db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
intra_filter4_shuf1: db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
+intra_filter4_shuf2: times 2 db 4, 5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ; pshufb mask, same 16 bytes for each 128-bit lane: picks words 2,0,2,3,4,5,6,7
;; (blkSize - 1 - x)
pw_planar4_0: dw 3, 2, 1, 0, 3, 2, 1, 0
@@ -22047,3 +22048,29 @@
mov [r1 + 128], r2w ; topLast
mov [r1 + 256], r3w ; LeftLast
RET
+
+INIT_YMM avx2 ; m0-m3 below are 256-bit ymm registers; samples are 16-bit words in this routine
+cglobal intra_filter_4x4, 2,4,4 ; 2 args: r0 = input samples s[], r1 = filtered output; r2/r3 are scratch GPRs, m0-m3 are used
+ mov r2w, word [r0 + 16] ; topLast = s[8], saved to be stored back unfiltered
+ mov r3w, word [r0 + 32] ; LeftLast = s[16], saved to be stored back unfiltered
+
+ ; filtering words 0..15 in one pass: out[i] = (2*s[i] + prev + next + 2) >> 2 (word lists below are high word first, low lane then high lane)
+ movu m0, [r0] ; m0 = s[0..15]: low lane s[0..7], high lane s[8..15]
+ vpbroadcastw m2, xm0 ; m2 = s[0] replicated into every word
+ movu m1, [r0 + 16] ; m1 = s[8..23]; NOTE(review): this load reaches past s[16] - confirm the caller's buffer is large enough
+
+ palignr m3, m0, m2, 14 ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0] each lane shifted up one word, s[0] inserted at the bottom
+ pshufb m3, [intra_filter4_shuf2] ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]; word 0 takes s[1] and word 9 takes s[0]
+ palignr m1, m0, 4 ; [9 8 7 6 5 4 3 2] [17 16 15 14 13 12 11 10] m1:m0 shifted down two words per lane
+ palignr m1, m1, 14 ; [8 7 6 5 4 3 2 9] [16 15 14 13 12 11 10 17] samples[i + 1]; rotate up one word so word 0 takes s[9]
+
+ psllw m0, 1 ; 2 * s[i]
+ paddw m3, m1 ; prev + next
+ paddw m0, m3 ; 2 * s[i] + prev + next
+ paddw m0, [pw_2] ; + 2 (rounding term before the shift)
+ psrlw m0, 2 ; >> 2
+
+ movu [r1], m0 ; store out[0..15]; word 8 is overwritten just below
+ mov [r1 + 16], r2w ; topLast: out[8] = s[8] (replaces the vector result for word 8)
+ mov [r1 + 32], r3w ; LeftLast: out[16] = s[16] (word 16 is outside the vector store)
+ RET
diff -r b13019448940 -r 9340454d3b55 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Mon Jun 29 17:19:07 2015 +0530
+++ b/source/common/x86/intrapred8.asm Tue Jun 30 16:52:40 2015 +0530
@@ -30,8 +30,9 @@
intra_pred_shuff_0_8: times 2 db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
intra_pred_shuff_15_0: times 2 db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-intra_filter4_shuf0: db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
-intra_filter4_shuf1: db 14,15,0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 ,11, 12, 13
+intra_filter4_shuf0: times 2 db 2, 3, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 ; byte pairs index words 1,0,1,2,3,4,5,6; 16-byte pattern emitted twice (32 bytes)
+intra_filter4_shuf1: times 2 db 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 ; byte pairs index words 7,0,1,2,3,4,5,6; 16-byte pattern emitted twice (32 bytes)
+intra_filter4_shuf2: times 2 db 4, 5, 0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 ; pshufb mask, same 16 bytes for each 128-bit lane: picks words 2,0,2,3,4,5,6,7
pb_0_8 times 8 db 0, 8
pb_unpackbw1 times 2 db 1, 8, 2, 8, 3, 8, 4, 8
@@ -18690,3 +18691,32 @@
mov [r1 + 64], r2b ; topLast
mov [r1 + 128], r3b ; LeftLast
RET
+
+INIT_YMM avx2 ; m0-m3 below are 256-bit ymm registers; samples are bytes in memory, widened to words for the arithmetic
+cglobal intra_filter_4x4, 2,4,4 ; 2 args: r0 = input samples s[], r1 = filtered output; r2/r3 are scratch GPRs, m0-m3 are used
+ mov r2b, byte [r0 + 8] ; topLast = s[8], saved to be stored back unfiltered
+ mov r3b, byte [r0 + 16] ; LeftLast = s[16], saved to be stored back unfiltered
+
+ ; filtering words 0..15 in one pass: out[i] = (2*s[i] + prev + next + 2) >> 2 (word lists below are high word first, low lane then high lane)
+ pmovzxbw m0, [r0] ; m0 = s[0..15] zero-extended to words: low lane s[0..7], high lane s[8..15]
+ vpbroadcastw m2, xm0 ; m2 = s[0] replicated into every word
+ pmovzxbw m1, [r0 + 8] ; m1 = s[8..23] as words; NOTE(review): this load reaches past s[16] - confirm the caller's buffer is large enough
+
+ palignr m3, m0, m2, 14 ; [6 5 4 3 2 1 0 0] [14 13 12 11 10 9 8 0] each lane shifted up one word, s[0] inserted at the bottom
+ pshufb m3, [intra_filter4_shuf2] ; [6 5 4 3 2 1 0 1] [14 13 12 11 10 9 0 9] samples[i - 1]; word 0 takes s[1] and word 9 takes s[0]
+ palignr m1, m0, 4 ; [9 8 7 6 5 4 3 2] [17 16 15 14 13 12 11 10] m1:m0 shifted down two words per lane
+ palignr m1, m1, 14 ; [8 7 6 5 4 3 2 9] [16 15 14 13 12 11 10 17] samples[i + 1]; rotate up one word so word 0 takes s[9]
+
+ psllw m0, 1 ; 2 * s[i]
+ paddw m3, m1 ; prev + next
+ paddw m0, m3 ; 2 * s[i] + prev + next
+ paddw m0, [pw_2] ; + 2 (rounding term before the shift)
+ psrlw m0, 2 ; >> 2
+
+ packuswb m0, m0 ; words -> bytes per lane: low lane = out[0..7] twice, high lane = out[8..15] twice
+ vpermq m0, m0, 10001000b ; qwords 0,2,0,2: low 128 bits become out[0..7] followed by out[8..15]
+
+ movu [r1], xm0 ; store out[0..15] (16 bytes); byte 8 is overwritten just below
+ mov [r1 + 8], r2b ; topLast: out[8] = s[8] (replaces the vector result for byte 8)
+ mov [r1 + 16], r3b ; LeftLast: out[16] = s[16] (byte 16 is outside the vector store)
+ RET
More information about the x265-devel
mailing list