[x265] [PATCH] asm: fix broken on weight_sp and weight_pp on 8bpp mode
Min Chen
chenm003 at 163.com
Mon Jan 19 18:33:59 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1421688831 -28800
# Node ID bbb1616fe28efe00ced6cbb9daa8925a0e11b7a3
# Parent 34a64bf50c17e63edbcf84fc29c654920034cded
asm: fix broken on weight_sp and weight_pp on 8bpp mode
---
source/common/x86/pixel-util8.asm | 152 +++++++++++++++++++++++++++++++++++++
1 files changed, 152 insertions(+), 0 deletions(-)
diff -r 34a64bf50c17 -r bbb1616fe28e source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Jan 20 01:19:23 2015 +0800
+++ b/source/common/x86/pixel-util8.asm Tue Jan 20 01:33:51 2015 +0800
@@ -857,6 +857,7 @@
;-----------------------------------------------------------------------------------------------------------------------------------------------
;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
;-----------------------------------------------------------------------------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal weight_pp, 4,7,7
%define correction (14 - BIT_DEPTH)
@@ -931,6 +932,72 @@
jnz .loopH
RET
+%else ; end of (HIGH_BIT_DEPTH == 1)
+
+INIT_XMM sse4
+cglobal weight_pp, 6,7,6
+ shl r5d, 6 ; m0 = [w0<<6]
+ mov r6d, r6m
+ shl r6d, 16
+ or r6d, r5d ; assuming both (w0<<6) and round are using maximum of 16 bits each.
+ movd m0, r6d
+ pshufd m0, m0, 0 ; m0 = [w0<<6, round]
+ movd m1, r7m
+ movd m2, r8m
+ pshufd m2, m2, 0
+ mova m5, [pw_1]
+ sub r2d, r3d
+ shr r3d, 4
+
+.loopH:
+ mov r5d, r3d
+
+.loopW:
+ pmovzxbw m4, [r0]
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m0
+ psrad m3, m1
+ paddd m3, m2
+
+ punpckhwd m4, m5
+ pmaddwd m4, m0
+ psrad m4, m1
+ paddd m4, m2
+
+ packssdw m3, m4
+ packuswb m3, m3
+ movh [r1], m3
+
+ pmovzxbw m4, [r0 + 8]
+ punpcklwd m3, m4, m5
+ pmaddwd m3, m0
+ psrad m3, m1
+ paddd m3, m2
+
+ punpckhwd m4, m5
+ pmaddwd m4, m0
+ psrad m4, m1
+ paddd m4, m2
+
+ packssdw m3, m4
+ packuswb m3, m3
+ movh [r1 + 8], m3
+
+ add r0, 16
+ add r1, 16
+
+ dec r5d
+ jnz .loopW
+
+ lea r0, [r0 + r2]
+ lea r1, [r1 + r2]
+
+ dec r4d
+ jnz .loopH
+ RET
+%endif ; end of (HIGH_BIT_DEPTH == 0)
+
+
INIT_YMM avx2
cglobal weight_pp, 6, 7, 6
@@ -985,6 +1052,7 @@
;-------------------------------------------------------------------------------------------------------------------------------------------------
;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
;-------------------------------------------------------------------------------------------------------------------------------------------------
+%if HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal weight_sp, 6,7,8
%if BIT_DEPTH == 10
@@ -1064,6 +1132,90 @@
jnz .loopH
RET
+%else ; end of (HIGH_BIT_DEPTH == 1)
+
+INIT_XMM sse4
+%if ARCH_X86_64
+cglobal weight_sp, 6, 7+2, 7
+ %define tmp_r0 r7
+ %define tmp_r1 r8
+%else ; ARCH_X86_64 = 0
+cglobal weight_sp, 6, 7, 7, 0-(2*4)
+ %define tmp_r0 [(rsp + 0 * 4)]
+ %define tmp_r1 [(rsp + 1 * 4)]
+%endif ; ARCH_X86_64
+
+ movd m0, r6m ; m0 = [w0]
+
+ movd m1, r7m ; m1 = [round]
+ punpcklwd m0, m1
+ pshufd m0, m0, 0 ; m0 = [w0 round]
+
+ movd m1, r8m ; m1 = [shift]
+
+ movd m2, r9m
+ pshufd m2, m2, 0 ; m2 =[offset]
+
+ mova m3, [pw_1]
+ mova m4, [pw_2000]
+
+ add r2d, r2d
+
+.loopH:
+ mov r6d, r4d
+
+ ; save old src and dst
+ mov tmp_r0, r0
+ mov tmp_r1, r1
+.loopW:
+ movu m5, [r0]
+ paddw m5, m4
+
+ punpcklwd m6,m5, m3
+ pmaddwd m6, m0
+ psrad m6, m1
+ paddd m6, m2
+
+ punpckhwd m5, m3
+ pmaddwd m5, m0
+ psrad m5, m1
+ paddd m5, m2
+
+ packssdw m6, m5
+ packuswb m6, m6
+
+ sub r6d, 8
+ jl .width4
+ movh [r1], m6
+ je .nextH
+ add r0, 16
+ add r1, 8
+
+ jmp .loopW
+
+.width4:
+ cmp r6d, -4
+ jl .width2
+ movd [r1], m6
+ je .nextH
+ add r1, 4
+ pshufd m6, m6, 1
+
+.width2:
+ pextrw [r1], m6, 0
+
+.nextH:
+ mov r0, tmp_r0
+ mov r1, tmp_r1
+ lea r0, [r0 + r2]
+ lea r1, [r1 + r3]
+
+ dec r5d
+ jnz .loopH
+ RET
+%endif ; end of (HIGH_BIT_DEPTH == 0)
+
+
;-----------------------------------------------------------------
; void transpose_4x4(pixel *dst, pixel *src, intptr_t stride)
;-----------------------------------------------------------------
More information about the x265-devel
mailing list