[x265] [PATCH 1 of 3] asm : routine for weight_sp()
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Mon Nov 25 11:38:24 CET 2013
# HG changeset patch
# User Nabajit Deka
# Date 1385375693 -19800
# Mon Nov 25 16:04:53 2013 +0530
# Node ID 4a5ad44661863551a57ab5a2d38f9e91e4297b7c
# Parent 92969306ae85ed2c506d53d709e02f3d98b895f7
asm : routine for weight_sp().
diff -r 92969306ae85 -r 4a5ad4466186 source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm Mon Nov 25 15:46:49 2013 +0530
+++ b/source/common/x86/pixel-util.asm Mon Nov 25 16:04:53 2013 +0530
@@ -31,6 +31,7 @@
c_d_1234: dd 1, 2, 3, 4
tab_c_1: times 8 dw 1
+tab_c_8192: times 8 dw 8192 ; eight words of 8192; weight_sp adds this to every 16-bit source sample before weighting
SECTION .text
@@ -751,3 +752,101 @@
jnz .loopH
RET
+
+;-------------------------------------------------------------------------------------------------------------------------------------------------
+;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+;
+; Weighted prediction from 16-bit intermediate samples to 8-bit pixels:
+;     dst[x] = clip_u8((((src[x] + 8192) * w0 + round) >> shift) + offset)
+; The +8192 is a wrapping 16-bit add, w0 and round are used as signed 16-bit
+; values (pmaddwd), and the clip comes from packssdw followed by packuswb.
+;
+; Each row is processed 8 samples at a time, then a 4-pixel and a 2-pixel
+; tail; the tail always stores in units of 2 pixels. Every iteration loads
+; 16 bytes from src, whatever the remaining width.
+;
+; Registers: r0 = src, r1 = dst, r2 = srcStride in bytes, r3 = dstStride,
+;            r4d = width, r5d = rows left, r6d = samples left in the row,
+;            tmp_r0/tmp_r1 = start of the current src/dst row.
+;            m0 = [w0 round] pairs, m1 = shift, m2 = offset, m3 = 1s,
+;            m4 = 8192s, m5/m6 = working data.
+;-------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+%if ARCH_X86_64
+cglobal weight_sp, 6, 7+2, 7        ; m0-m6 are used: declare 7 so Win64 preserves xmm6
+    %define tmp_r0 r7
+    %define tmp_r1 r8
+%else ; ARCH_X86_64 = 0
+cglobal weight_sp, 6, 7, 7, 0-(2*4) ; 8 bytes of stack for the two saved row pointers
+    %define tmp_r0 [(rsp + 0 * 4)]
+    %define tmp_r1 [(rsp + 1 * 4)]
+%endif ; ARCH_X86_64
+
+    movd        m0, r6m             ; m0 = [w0]
+    movd        m1, r7m             ; m1 = [round]
+    punpcklwd   m0, m1              ; interleave the low words of w0 and round
+    pshufd      m0, m0, 0           ; m0 = [w0 round] in every dword
+
+    movd        m1, r8m             ; m1 = [shift], used as the psrad count
+
+    movd        m2, r9m
+    pshufd      m2, m2, 0           ; m2 = [offset] in every dword
+
+    mova        m3, [tab_c_1]       ; m3 = 1s, multiplied by round in pmaddwd
+    mova        m4, [tab_c_8192]    ; m4 = 8192s, bias added to every sample
+
+    add         r2, r2              ; srcStride in bytes; full-width add keeps a negative stride intact
+
+.loopH:
+    mov         r6d, r4d            ; r6d = samples left in this row
+
+    ; save old src and dst
+    mov         tmp_r0, r0
+    mov         tmp_r1, r1
+.loopW:
+    movu        m5, [r0]            ; 8 source samples
+    paddw       m5, m4              ; src + 8192
+
+    punpcklwd   m6, m5, m3          ; low 4 samples as (sample, 1) pairs
+    pmaddwd     m6, m0              ; sample * w0 + 1 * round
+    psrad       m6, m1              ; >> shift
+    paddd       m6, m2              ; + offset
+
+    punpckhwd   m5, m3              ; high 4 samples, same steps
+    pmaddwd     m5, m0
+    psrad       m5, m1
+    paddd       m5, m2
+
+    packssdw    m6, m5              ; 8 results, saturated to int16
+    packuswb    m6, m6              ; saturated to uint8
+
+    sub         r6d, 8
+    jl          .width4             ; fewer than 8 samples were left
+    movh        [r1], m6            ; store 8 pixels (the store leaves the flags of sub intact)
+    je          .nextH              ; exactly 8 were left: row done
+    add         r0, 16
+    add         r1, 8
+
+    jmp         .loopW
+
+.width4:
+    cmp         r6d, -4
+    jl          .width2             ; fewer than 4 samples were left
+    movd        [r1], m6            ; store 4 pixels
+    je          .nextH              ; exactly 4 were left: row done
+    add         r1, 4
+    pshufd      m6, m6, 1           ; move pixels 4..7 down to the low dword
+
+.width2:
+    pextrw      [r1], m6, 0         ; store 2 pixels
+
+.nextH:
+    mov         r0, tmp_r0
+    mov         r1, tmp_r1
+    lea         r0, [r0 + r2]       ; next src row
+    lea         r1, [r1 + r3]       ; next dst row
+
+    dec         r5d
+    jnz         .loopH
+
+    RET
More information about the x265-devel
mailing list