[x265] asm: fix dequant_normal
Satoshi Nakagawa
nakagawa424 at oki.com
Sat Aug 30 06:39:07 CEST 2014
# HG changeset patch
# User Satoshi Nakagawa <nakagawa424 at oki.com>
# Date 1409373356 -32400
# Sat Aug 30 13:35:56 2014 +0900
# Node ID 9b5f0c75d052e963b0a413f341a74036141b3675
# Parent 4e2d9ac6d489e82e70544d626c89964ee653c452
asm: fix dequant_normal
diff -r 4e2d9ac6d489 -r 9b5f0c75d052 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Aug 29 11:12:49 2014 +0200
+++ b/source/common/x86/pixel-util8.asm Sat Aug 30 13:35:56 2014 +0900
@@ -1005,23 +1005,21 @@
; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
;-----------------------------------------------------------------------------
INIT_XMM sse4
-cglobal dequant_normal, 4,5,5
- movd m1, r3 ; m1 = word [scale]
+cglobal dequant_normal, 5,5,5
cmp r3d, 32767
+ movd m1, r3 ; m1 = word [scale]
+ mova m2, [pw_1]
jle .skip
-
psrld m1, 2
- mov r4d, r4m
+ sub r4d, 2
+.skip:
movd m0, r4d ; m0 = shift
xor r3d, r3d
dec r4d
bts r3d, r4d
- movd m2, r3d
- punpcklwd m1, m2
+ movd m3, r3d
+ punpcklwd m1, m3
pshufd m1, m1, 0 ; m1 = dword [add scale]
- mova m2, [pw_1]
- mov r2d, r2m
-
; m0 = shift
; m1 = scale
; m2 = word [1]
@@ -1029,45 +1027,6 @@
movu m3, [r0]
movu m4, [r0 + 16]
packssdw m3, m4 ; m3 = clipQCoef
- psllw m3, 2
- punpckhwd m4, m3, m2
- punpcklwd m3, m2
- pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
- pmaddwd m4, m1
- psrad m3, m0
- psrad m4, m0
- packssdw m3, m3 ; OPT_ME: store must be 32 bits
- pmovsxwd m3, m3
- packssdw m4, m4
- pmovsxwd m4, m4
- movu [r1], m3
- movu [r1 + 16], m4
-
- add r0, 32
- add r1, 32
-
- sub r2d, 8
- jnz .loop
- jz .end
-
-.skip:
- mov r4d, r4m
- movd m0, r4d ; m0 = shift
- xor r3d, r3d
- dec r4d
- bts r3d, r4d
- movd m2, r3d
- punpcklwd m1, m2
- pshufd m1, m1, 0 ; m1 = dword [add scale]
- mova m2, [pw_1]
- mov r2d, r2m
- ; m0 = shift
- ; m1 = scale
- ; m2 = word [1]
-.sloop:
- movu m3, [r0]
- movu m4, [r0 + 16]
- packssdw m3, m4 ; m3 = clipQCoef
punpckhwd m4, m3, m2
punpcklwd m3, m2
pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
@@ -1085,8 +1044,7 @@
add r1, 32
sub r2d, 8
- jnz .sloop
-.end:
+ jnz .loop
RET
More information about the x265-devel mailing list