[x265] [PATCH] asm: 16bpp code for quant and dequant_normal
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Mon Feb 17 13:29:55 CET 2014
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1392639418 -19800
# Mon Feb 17 17:46:58 2014 +0530
# Node ID 85691d6c02e6f7323194ab1b054149dbf940a0d3
# Parent 85be97320422ca9682272a81f31733b3884efa02
asm: 16bpp code for quant and dequant_normal
diff -r 85be97320422 -r 85691d6c02e6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Feb 17 17:46:12 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Feb 17 17:46:58 2014 +0530
@@ -825,8 +825,10 @@
}
if (cpuMask & X265_CPU_SSE4)
{
+
+ p.quant = x265_quant_sse4;
+ p.dequant_normal = x265_dequant_normal_sse4;
p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
-
p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4;
p.intra_pred[BLOCK_8x8][0] = x265_intra_pred_planar8_sse4;
p.intra_pred[BLOCK_16x16][0] = x265_intra_pred_planar16_sse4;
diff -r 85be97320422 -r 85691d6c02e6 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Feb 17 17:46:12 2014 +0530
+++ b/source/common/x86/pixel-util8.asm Mon Feb 17 17:46:58 2014 +0530
@@ -1071,20 +1071,19 @@
mova m4, [c_d_4] ; m4 = [4 4 4 4]
.loop:
; 4 coeff
- movu m0, [r0] ; m1 = level
+ movu m0, [r0] ; m0 = level
pxor m1, m1
- pcmpgtd m1, m0 ; m2 = sign
- movu m2, [r1] ; m3 = qcoeff
+ pcmpgtd m1, m0 ; m1 = sign
+ movu m2, [r1] ; m2 = qcoeff
pabsd m0, m0
- pmulld m0, m2 ; m1 = tmpLevel1
+ pmulld m0, m2 ; m0 = tmpLevel1
paddd m2, m0, addVec
- psrad m2, qbits ; m3 = level1
+ psrad m2, qbits ; m2 = level1
paddd m7, m2
pslld m3, m2, qbits
psubd m0, m3
- psrad m0, qbits8 ; m1 = deltaU1
+ psrad m0, qbits8 ; m0 = deltaU1
movu [r2], m0
-
pxor m0, m0
pcmpeqd m0, m2 ; m0 = mask4
pand m5, m0
@@ -1097,22 +1096,20 @@
packssdw m2, m2
pmovsxwd m2, m2
movu [r3], m2
-
; 4 coeff
- movu m0, [r0 + 16] ; m1 = level
+ movu m0, [r0 + 16] ; m0 = level
pxor m1, m1
- pcmpgtd m1, m0 ; m2 = sign
- movu m2, [r1 + 16] ; m3 = qcoeff
+ pcmpgtd m1, m0 ; m1 = sign
+ movu m2, [r1 + 16] ; m2 = qcoeff
pabsd m0, m0
- pmulld m0, m2 ; m1 = tmpLevel1
+ pmulld m0, m2 ; m0 = tmpLevel1
paddd m2, m0, addVec
- psrad m2, qbits ; m3 = level1
+ psrad m2, qbits ; m2 = level1
paddd m7, m2
pslld m3, m2, qbits
psubd m0, m3
- psrad m0, qbits8 ; m1 = deltaU1
+ psrad m0, qbits8 ; m0 = deltaU1
movu [r2 + 16], m0
-
pxor m0, m0
pcmpeqd m0, m2 ; m0 = mask4
pand m5, m0
@@ -1154,8 +1151,11 @@
; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal dequant_normal, 2,5,8
- movd m1, r3m ; m1 = word [scale]
+cglobal dequant_normal, 4,5,5
+ movd m1, r3 ; m1 = word [scale]
+ cmp r3d, 255
+ jle .skip
+ psrld m1, 2
mov r4d, r4m
movd m0, r4d ; m0 = shift
xor r3d, r3d
@@ -1174,6 +1174,45 @@
movu m3, [r0]
movu m4, [r0 + 16]
packssdw m3, m4 ; m3 = clipQCoef
+ psllw m3, 2
+ punpckhwd m4, m3, m2
+ punpcklwd m3, m2
+ pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
+ pmaddwd m4, m1
+ psrad m3, m0
+ psrad m4, m0
+ packssdw m3, m3 ; OPT_ME: store must be 32 bits
+ pmovsxwd m3, m3
+ packssdw m4, m4
+ pmovsxwd m4, m4
+ movu [r1], m3
+ movu [r1 + 16], m4
+
+ add r0, 32
+ add r1, 32
+
+ sub r2d, 8
+ jnz .loop
+ jz .end
+
+.skip:
+ mov r4d, r4m
+ movd m0, r4d ; m0 = shift
+ xor r3d, r3d
+ dec r4d
+ bts r3d, r4d
+ movd m2, r3d
+ punpcklwd m1, m2
+ pshufd m1, m1, 0 ; m1 = dword [add scale]
+ mova m2, [pw_1]
+ mov r2d, r2m
+ ; m0 = shift
; m1 = dword [add scale] (word pair consumed by pmaddwd)
+ ; m2 = word [1]
+.sloop:
+ movu m3, [r0]
+ movu m4, [r0 + 16]
+ packssdw m3, m4 ; m3 = clipQCoef
punpckhwd m4, m3, m2
punpcklwd m3, m2
pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
@@ -1191,7 +1230,8 @@
add r1, 32
sub r2d, 8
- jnz .loop
+ jnz .sloop
+.end:
RET
;-----------------------------------------------------------------------------------------------------------------------------------------------
More information about the x265-devel mailing list