[x265] [PATCH] asm: 16bpp support for quant and dequant_normal
Murugan Vairavel
murugan at multicorewareinc.com
Fri Feb 14 14:33:55 CET 2014
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1392384257 -19800
# Fri Feb 14 18:54:17 2014 +0530
# Node ID 75e6e510877b685611c7083eca5433283c49f5a3
# Parent 423c7ff885b34a88628d32e4c26532f7664a93f7
asm: 16bpp support for quant and dequant_normal
diff -r 423c7ff885b3 -r 75e6e510877b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Feb 14 18:52:13 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Feb 14 18:54:17 2014 +0530
@@ -743,6 +743,9 @@
}
if (cpuMask & X265_CPU_SSE4)
{
+
+ p.quant = x265_quant_sse4;
+ p.dequant_normal = x265_dequant_normal_sse4;
p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4;
diff -r 423c7ff885b3 -r 75e6e510877b source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Feb 14 18:52:13 2014 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Feb 14 18:54:17 2014 +0530
@@ -1071,18 +1071,18 @@
mova m4, [c_d_4] ; m4 = [4 4 4 4]
.loop:
; 4 coeff
- movu m0, [r0] ; m1 = level
+ movu m0, [r0] ; m0 = level
pxor m1, m1
- pcmpgtd m1, m0 ; m2 = sign
- movu m2, [r1] ; m3 = qcoeff
+ pcmpgtd m1, m0 ; m1 = sign
+ movu m2, [r1] ; m2 = qcoeff
pabsd m0, m0
- pmulld m0, m2 ; m1 = tmpLevel1
+ pmulld m0, m2 ; m0 = tmpLevel1
paddd m2, m0, addVec
- psrad m2, qbits ; m3 = level1
+ psrad m2, qbits ; m2 = level1
paddd m7, m2
pslld m3, m2, qbits
psubd m0, m3
- psrad m0, qbits8 ; m1 = deltaU1
+ psrad m0, qbits8 ; m0 = deltaU1
movu [r2], m0
pxor m0, m0
@@ -1099,18 +1099,18 @@
movu [r3], m2
; 4 coeff
- movu m0, [r0 + 16] ; m1 = level
+ movu m0, [r0 + 16] ; m0 = level
pxor m1, m1
- pcmpgtd m1, m0 ; m2 = sign
- movu m2, [r1 + 16] ; m3 = qcoeff
+ pcmpgtd m1, m0 ; m1 = sign
+ movu m2, [r1 + 16] ; m2 = qcoeff
pabsd m0, m0
- pmulld m0, m2 ; m1 = tmpLevel1
+ pmulld m0, m2 ; m0 = tmpLevel1
paddd m2, m0, addVec
- psrad m2, qbits ; m3 = level1
+ psrad m2, qbits ; m2 = level1
paddd m7, m2
pslld m3, m2, qbits
psubd m0, m3
- psrad m0, qbits8 ; m1 = deltaU1
+ psrad m0, qbits8 ; m0 = deltaU1
movu [r2 + 16], m0
pxor m0, m0
@@ -1154,14 +1154,18 @@
; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal dequant_normal, 2,5,8
- movd m1, r3m ; m1 = word [scale]
+cglobal dequant_normal, 4,6,5
+ movd m1, r3 ; m1 = word [scale]
+ cmp r3d, 255
+ jle .skip
+ psrld m1, 2
+.skip:
mov r4d, r4m
movd m0, r4d ; m0 = shift
- xor r3d, r3d
+ xor r5d, r5d
dec r4d
- bts r3d, r4d
- movd m2, r3d
+ bts r5d, r4d
+ movd m2, r5d
punpcklwd m1, m2
pshufd m1, m1, 0 ; m1 = dword [add scale]
mova m2, [pw_1]
@@ -1174,6 +1178,10 @@
movu m3, [r0]
movu m4, [r0 + 16]
packssdw m3, m4 ; m3 = clipQCoef
+ cmp r3d, 255
+ jle .skip1
+ psllw m3, 2
+.skip1:
punpckhwd m4, m3, m2
punpcklwd m3, m2
pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
More information about the x265-devel mailing list