[x265] [PATCH 3 of 3] asm: asm code for pelFilterLumaStrong_V/H & pelFilterChroma_V/H for main10 & main12
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Feb 26 10:11:31 CET 2016
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1456466696 -19800
# Fri Feb 26 11:34:56 2016 +0530
# Node ID d7d0c03b5e6e7fd0258d609ad5e9f4d7c0a40390
# Parent 59d9eca3d144e71f11d509a5dd40b634bb9ab500
asm: asm code for pelFilterLumaStrong_V/H & pelFilterChroma_V/H for main10 & main12
diff -r 59d9eca3d144 -r d7d0c03b5e6e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Feb 26 11:34:39 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Feb 26 11:34:56 2016 +0530
@@ -1101,6 +1101,11 @@
}
if (cpuMask & X265_CPU_SSE4)
{
+ p.pelFilterLumaStrong[0] = PFX(pelFilterLumaStrong_V_sse4);
+ p.pelFilterLumaStrong[1] = PFX(pelFilterLumaStrong_H_sse4);
+ p.pelFilterChroma[0] = PFX(pelFilterChroma_V_sse4);
+ p.pelFilterChroma[1] = PFX(pelFilterChroma_H_sse4);
+
p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
p.saoCuOrgE1 = PFX(saoCuOrgE1_sse4);
p.saoCuOrgE1_2Rows = PFX(saoCuOrgE1_2Rows_sse4);
diff -r 59d9eca3d144 -r d7d0c03b5e6e source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Fri Feb 26 11:34:39 2016 +0530
+++ b/source/common/x86/const-a.asm Fri Feb 26 11:34:56 2016 +0530
@@ -69,6 +69,7 @@
const pb_000000000000000F, db 0xff
times 15 db 0x00
const pb_shuf_off4, times 2 db 0, 4, 1, 5, 2, 6, 3, 7
+const pw_shuf_off4, times 1 db 0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15
;; 16-bit constants
diff -r 59d9eca3d144 -r d7d0c03b5e6e source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Fri Feb 26 11:34:39 2016 +0530
+++ b/source/common/x86/loopfilter.asm Fri Feb 26 11:34:56 2016 +0530
@@ -51,6 +51,8 @@
cextern hmul_16p
cextern pw_1_ffff
cextern pb_shuf_off4
+cextern pw_shuf_off4
+
;============================================================================================================
; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
;============================================================================================================
@@ -3758,6 +3760,9 @@
INIT_XMM sse4
cglobal pelFilterLumaStrong_H, 5,7,10
+%if HIGH_BIT_DEPTH
+ add r2d, r2d
+%endif
mov r1, r2
neg r3d
neg r4d
@@ -3766,6 +3771,16 @@
lea r5, [r2 * 3]
lea r6, [r1 * 3]
+%if HIGH_BIT_DEPTH
+ movu m4, [r0] ; src[0]
+ movu m3, [r0 + r1] ; src[-offset]
+ movu m2, [r0 + r1 * 2] ; src[-offset * 2]
+ movu m1, [r0 + r6] ; src[-offset * 3]
+ movu m0, [r0 + r1 * 4] ; src[-offset * 4]
+ movu m5, [r0 + r2] ; src[offset]
+ movu m6, [r0 + r2 * 2] ; src[offset * 2]
+ movu m7, [r0 + r5] ; src[offset * 3]
+%else
pmovzxbw m4, [r0] ; src[0]
pmovzxbw m3, [r0 + r1] ; src[-offset]
pmovzxbw m2, [r0 + r1 * 2] ; src[-offset * 2]
@@ -3774,6 +3789,7 @@
pmovzxbw m5, [r0 + r2] ; src[offset]
pmovzxbw m6, [r0 + r2 * 2] ; src[offset * 2]
pmovzxbw m7, [r0 + r5] ; src[offset * 3]
+%endif
paddw m0, m0 ; m0*2
mova m8, m2
@@ -3841,6 +3857,15 @@
paddw m0, m1
paddw m3, m4
paddw m9, m5
+
+%if HIGH_BIT_DEPTH
+ movh [r0 + r6], m0
+ movhps [r0 + r1], m0
+ movh [r0], m3
+ movhps [r0 + r2 * 2], m3
+ movh [r0 + r2 * 1], m9
+ movhps [r0 + r1 * 2], m9
+%else
packuswb m0, m0
packuswb m3, m9
@@ -3850,14 +3875,41 @@
pextrd [r0 + r2 * 2], m3, 1
pextrd [r0 + r2 * 1], m3, 2
pextrd [r0 + r1 * 2], m3, 3
+%endif
RET
INIT_XMM sse4
cglobal pelFilterLumaStrong_V, 5,5,10
+%if HIGH_BIT_DEPTH
+ add r1d, r1d
+%endif
neg r3d
neg r4d
lea r2, [r1 * 3]
+%if HIGH_BIT_DEPTH
+ movu m0, [r0 - 8] ; src[-offset * 4] row 0
+ movu m1, [r0 + r1 * 1 - 8] ; src[-offset * 4] row 1
+ movu m2, [r0 + r1 * 2 - 8] ; src[-offset * 4] row 2
+ movu m3, [r0 + r2 * 1 - 8] ; src[-offset * 4] row 3
+
+ punpckhwd m4, m0, m1 ; [m4 m4 m5 m5 m6 m6 m7 m7]
+ punpcklwd m0, m1 ; [m0 m0 m1 m1 m2 m2 m3 m3]
+
+ punpckhwd m5, m2, m3 ; [m4 m4 m5 m5 m6 m6 m7 m7]
+ punpcklwd m2, m3 ; [m0 m0 m1 m1 m2 m2 m3 m3]
+
+ punpckhdq m3, m0, m2 ; [m2 m2 m2 m2 m3 m3 m3 m3]
+ punpckldq m0, m2 ; [m0 m0 m0 m0 m1 m1 m1 m1]
+ psrldq m1, m0, 8 ; [m1 m1 m1 m1 x x x x]
+ mova m2, m3 ; [m2 m2 m2 m2 x x x x]
+ punpckhqdq m3, m3 ; [m3 m3 m3 m3 x x x x]
+
+ punpckhdq m6, m4, m5 ; [m6 m6 m6 m6 m7 m7 m7 m7]
+ punpckldq m4, m5 ; [m4 m4 m4 m4 m5 m5 m5 m5]
+ psrldq m7, m6, 8
+ psrldq m5, m4, 8
+%else
movh m0, [r0 - 4] ; src[-offset * 4] row 0
movh m1, [r0 + r1 * 1 - 4] ; src[-offset * 4] row 1
movh m2, [r0 + r1 * 2 - 4] ; src[-offset * 4] row 2
@@ -3890,6 +3942,7 @@
pmovzxbw m5, m5
pmovzxbw m6, m6
pmovzxbw m7, m7
+%endif
paddw m0, m0 ; m0*2
mova m8, m2
@@ -3957,6 +4010,35 @@
paddw m0, m1
paddw m3, m4
paddw m9, m5
+
+%if HIGH_BIT_DEPTH
+ ; 4x6 output rows -
+ ; m0 - col 0
+ ; m3 - col 3
+
+ psrldq m1, m0, 8
+ psrldq m2, m3, 8
+
+ mova m4, m9
+ psrldq m5, m9, 8
+
+ ; transpose 4x6 to 6x4
+ punpcklwd m0, m5
+ punpcklwd m1, m3
+ punpcklwd m4, m2
+
+ punpckldq m9, m0, m1
+ punpckhdq m0, m1
+
+ movh [r0 + r1 * 0 - 6], m9
+ movhps [r0 + r1 * 1 - 6], m9
+ movh [r0 + r1 * 2 - 6], m0
+ movhps [r0 + r2 * 1 - 6], m0
+ pextrd [r0 + r1 * 0 + 2], m4, 0
+ pextrd [r0 + r1 * 1 + 2], m4, 1
+ pextrd [r0 + r1 * 2 + 2], m4, 2
+ pextrd [r0 + r2 * 1 + 2], m4, 3
+%else
packuswb m0, m0
packuswb m3, m9
@@ -3986,20 +4068,31 @@
pextrw [r0 + r1 * 1 + 1], m4, 1
pextrw [r0 + r1 * 2 + 1], m4, 2
pextrw [r0 + r2 * 1 + 1], m4, 3
+%endif
RET
%endif ; ARCH_X86_64
%if ARCH_X86_64
INIT_XMM sse4
cglobal pelFilterChroma_H, 6,6,5
+%if HIGH_BIT_DEPTH
+ add r2d, r2d
+%endif
mov r1, r2
neg r3d
neg r1
+%if HIGH_BIT_DEPTH
+ movu m4, [r0] ; src[0]
+ movu m3, [r0 + r1] ; src[-offset]
+ movu m0, [r0 + r2] ; src[offset]
+ movu m2, [r0 + r1 * 2] ; src[-offset * 2]
+%else
pmovzxbw m4, [r0] ; src[0]
pmovzxbw m3, [r0 + r1] ; src[-offset]
pmovzxbw m0, [r0 + r2] ; src[offset]
pmovzxbw m2, [r0 + r1 * 2] ; src[-offset * 2]
+%endif
psubw m1, m4, m3 ; m4 - m3
psubw m2, m0 ; m2 - m5
@@ -4032,21 +4125,35 @@
pmaxsw m3, m0
pminsw m3, [pw_pixel_max]
+%if HIGH_BIT_DEPTH
+ movh [r0 + r1], m3
+ movhps [r0], m3
+%else
packuswb m3, m3
movd [r0 + r1], m3
pextrd [r0], m3, 1
+%endif
RET
INIT_XMM sse4
cglobal pelFilterChroma_V, 6,6,5
+%if HIGH_BIT_DEPTH
+ add r1d, r1d
+%endif
neg r3d
lea r2, [r1 * 3]
+%if HIGH_BIT_DEPTH
+ movu m4, [r0 + r1 * 0 - 4] ; src[-offset*2, -offset, 0, offset] [m2 m3 m4 m5]
+ movu m3, [r0 + r1 * 1 - 4]
+ movu m0, [r0 + r1 * 2 - 4]
+ movu m2, [r0 + r2 * 1 - 4]
+%else
pmovzxbw m4, [r0 + r1 * 0 - 2] ; src[-offset*2, -offset, 0, offset] [m2 m3 m4 m5]
pmovzxbw m3, [r0 + r1 * 1 - 2]
pmovzxbw m0, [r0 + r1 * 2 - 2]
pmovzxbw m2, [r0 + r2 * 1 - 2]
-
+%endif
punpcklwd m4, m3
punpcklwd m0, m2
punpckldq m2, m4, m0 ; [m2 m2 m2 m2 m3 m3 m3 m3]
@@ -4085,11 +4192,19 @@
pmaxsw m3, m0
pminsw m3, [pw_pixel_max]
+%if HIGH_BIT_DEPTH
+ pshufb m3, [pw_shuf_off4]
+ pextrd [r0 + r1 * 0 - 2], m3, 0
+ pextrd [r0 + r1 * 1 - 2], m3, 1
+ pextrd [r0 + r1 * 2 - 2], m3, 2
+ pextrd [r0 + r2 * 1 - 2], m3, 3
+%else
packuswb m3, m3
pshufb m3, [pb_shuf_off4]
pextrw [r0 + r1 * 0 - 1], m3, 0
pextrw [r0 + r1 * 1 - 1], m3, 1
pextrw [r0 + r1 * 2 - 1], m3, 2
pextrw [r0 + r2 * 1 - 1], m3, 3
+%endif
RET
%endif ; ARCH_X86_64
More information about the x265-devel
mailing list