[x265] asm: fix dequant_normal
Steve Borho
steve at borho.org
Sat Aug 30 09:43:25 CEST 2014
On 08/30, Satoshi Nakagawa wrote:
> > How about removing the '#if...'?
> > The asm code didn't check it.
>
> Added '%if...' to the asm code :)
>
>
> # HG changeset patch
> # User Satoshi Nakagawa <nakagawa424 at oki.com>
> # Date 1409378187 -32400
> # Sat Aug 30 14:56:27 2014 +0900
> # Node ID c4f15840feb443f8c38ba58b52ef5ba6d518e626
> # Parent 4e2d9ac6d489e82e70544d626c89964ee653c452
> asm: fix dequant_normal
Queued for stable, thanks
> diff -r 4e2d9ac6d489 -r c4f15840feb4 source/common/dct.cpp
> --- a/source/common/dct.cpp Fri Aug 29 11:12:49 2014 +0200
> +++ b/source/common/dct.cpp Sat Aug 30 14:56:27 2014 +0900
> @@ -720,7 +720,9 @@
>
> void dequant_normal_c(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
> {
> -#if !HIGH_BIT_DEPTH
> +#if HIGH_BIT_DEPTH
> + X265_CHECK(scale < 32768 || ((scale & 3) == 0 && shift > 2), "dequant invalid scale %d\n", scale);
> +#else
> // NOTE: maximum of scale is (72 * 256)
> X265_CHECK(scale < 32768, "dequant invalid scale %d\n", scale);
> #endif
> diff -r 4e2d9ac6d489 -r c4f15840feb4 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Fri Aug 29 11:12:49 2014 +0200
> +++ b/source/common/x86/pixel-util8.asm Sat Aug 30 14:56:27 2014 +0900
> @@ -1005,23 +1005,23 @@
> ; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse4
> -cglobal dequant_normal, 4,5,5
> - movd m1, r3 ; m1 = word [scale]
> +cglobal dequant_normal, 5,5,5
> + movd m1, r3 ; m1 = word [scale]
> + mova m2, [pw_1]
> +%if HIGH_BIT_DEPTH
> cmp r3d, 32767
> jle .skip
> -
> psrld m1, 2
> - mov r4d, r4m
> + sub r4d, 2
> +.skip:
> +%endif
> movd m0, r4d ; m0 = shift
> xor r3d, r3d
> dec r4d
> bts r3d, r4d
> - movd m2, r3d
> - punpcklwd m1, m2
> + movd m3, r3d
> + punpcklwd m1, m3
> pshufd m1, m1, 0 ; m1 = dword [add scale]
> - mova m2, [pw_1]
> - mov r2d, r2m
> -
> ; m0 = shift
> ; m1 = scale
> ; m2 = word [1]
> @@ -1029,45 +1029,6 @@
> movu m3, [r0]
> movu m4, [r0 + 16]
> packssdw m3, m4 ; m3 = clipQCoef
> - psllw m3, 2
> - punpckhwd m4, m3, m2
> - punpcklwd m3, m2
> - pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
> - pmaddwd m4, m1
> - psrad m3, m0
> - psrad m4, m0
> - packssdw m3, m3 ; OPT_ME: store must be 32 bits
> - pmovsxwd m3, m3
> - packssdw m4, m4
> - pmovsxwd m4, m4
> - movu [r1], m3
> - movu [r1 + 16], m4
> -
> - add r0, 32
> - add r1, 32
> -
> - sub r2d, 8
> - jnz .loop
> - jz .end
> -
> -.skip:
> - mov r4d, r4m
> - movd m0, r4d ; m0 = shift
> - xor r3d, r3d
> - dec r4d
> - bts r3d, r4d
> - movd m2, r3d
> - punpcklwd m1, m2
> - pshufd m1, m1, 0 ; m1 = dword [add scale]
> - mova m2, [pw_1]
> - mov r2d, r2m
> - ; m0 = shift
> - ; m1 = scale
> - ; m2 = word [1]
> -.sloop:
> - movu m3, [r0]
> - movu m4, [r0 + 16]
> - packssdw m3, m4 ; m3 = clipQCoef
> punpckhwd m4, m3, m2
> punpcklwd m3, m2
> pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
> @@ -1085,8 +1046,7 @@
> add r1, 32
>
> sub r2d, 8
> - jnz .sloop
> -.end:
> + jnz .loop
> RET
>
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list