[x265] [PATCH 1 of 3] asm : routine for weight_sp()

nabajit at multicorewareinc.com nabajit at multicorewareinc.com
Mon Nov 25 11:38:24 CET 2013


# HG changeset patch
# User Nabajit Deka
# Date 1385375693 -19800
#      Mon Nov 25 16:04:53 2013 +0530
# Node ID 4a5ad44661863551a57ab5a2d38f9e91e4297b7c
# Parent  92969306ae85ed2c506d53d709e02f3d98b895f7
asm : routine for weight_sp().

diff -r 92969306ae85 -r 4a5ad4466186 source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm	Mon Nov 25 15:46:49 2013 +0530
+++ b/source/common/x86/pixel-util.asm	Mon Nov 25 16:04:53 2013 +0530
@@ -31,6 +31,7 @@
 c_d_1234:       dd 1, 2, 3, 4
 
 tab_c_1:        times 8 dw 1
+tab_c_8192:     times 8 dw 8192         ; added to every source word by weight_sp
 
 
 SECTION .text
@@ -751,3 +752,87 @@
     jnz         .loopH
 
     RET
+
+;-------------------------------------------------------------------------------------------------------------------------------------------------
+;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
+; Per sample: dst = sat_u8((((int16_t)(src + 8192) * w0 + round) >> shift) + offset); w0 and round are used as 16-bit words, dst gets one byte.
+; r0 = src, r1 = dst, r2 = srcStride (doubled below, i.e. counted in int16_t), r3 = dstStride, r4d = width, r5d = height, r6d = columns left.
+; Every pass loads 8 source samples; 8, 4 or 2 result bytes are stored, so the tail of a row is handled in 2-pixel units.
+;-------------------------------------------------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+%if ARCH_X86_64
+cglobal weight_sp, 6, 7+2, 7    ; m0-m6 are all written, so 7 XMM registers must be declared (xmm6 is callee-saved on Win64)
+    %define tmp_r0      r7
+    %define tmp_r1      r8
+%else ; ARCH_X86_64 = 0
+cglobal weight_sp, 6, 7, 7, 0-(2*4)
+    %define tmp_r0      [(rsp + 0 * 4)]
+    %define tmp_r1      [(rsp + 1 * 4)]
+%endif ; ARCH_X86_64
+
+    movd        m0, r6m         ; m0 = [w0]
+    movd        m1, r7m         ; m1 = [round]
+    punpcklwd   m0, m1
+    pshufd      m0, m0, 0       ; m0 = [w0 round] word pair in every dword
+    movd        m1, r8m         ; m1 = [shift], count operand for psrad
+    movd        m2, r9m
+    pshufd      m2, m2, 0       ; m2 = [offset] in every dword
+
+    mova        m3, [tab_c_1]       ; m3 = 8 x 1, the multiplier paired with round in pmaddwd
+    mova        m4, [tab_c_8192]    ; m4 = 8 x 8192, added to each source word
+
+    add         r2, r2          ; full-width add keeps the sign of the intptr_t stride (r2d would zero bits 63:32 on x86-64)
+
+.loopH:
+    mov         r6d, r4d        ; r6d = width
+
+    ; save old src and dst
+    mov         tmp_r0, r0
+    mov         tmp_r1, r1
+.loopW:
+    movu        m5, [r0]        ; 8 source words
+    paddw       m5, m4
+
+    punpcklwd   m6, m5, m3      ; low 4 samples -> (sample, 1) word pairs
+    pmaddwd     m6, m0          ; sample * w0 + 1 * round
+    psrad       m6, m1
+    paddd       m6, m2
+
+    punpckhwd   m5, m3          ; same for the high 4 samples
+    pmaddwd     m5, m0
+    psrad       m5, m1
+    paddd       m5, m2
+
+    packssdw    m6, m5          ; 8 dwords -> 8 words, signed saturation
+    packuswb    m6, m6          ; 8 words -> 8 bytes, unsigned saturation
+
+    sub         r6d, 8
+    jl          .width4         ; fewer than 8 columns were left
+    movh        [r1], m6        ; store 8 bytes; the flags from sub are still intact
+    je          .nextH          ; exactly 8 were left: row finished
+    add         r0, 16
+    add         r1, 8
+
+    jmp         .loopW
+
+.width4:
+    cmp         r6d, -4
+    jl          .width2         ; fewer than 4 columns were left
+    movd        [r1], m6        ; store 4 bytes
+    je          .nextH          ; exactly 4 were left: row finished
+    add         r1, 4
+    pshufd      m6, m6, 1       ; bring result bytes 4..7 down to dword 0
+
+.width2:
+    pextrw      [r1], m6, 0     ; store 2 bytes
+
+.nextH:
+    mov         r0, tmp_r0      ; back to the start of the row ...
+    mov         r1, tmp_r1
+    lea         r0, [r0 + r2]   ; ... then step one row down
+    lea         r1, [r1 + r3]
+
+    dec         r5d
+    jnz         .loopH
+
+    RET


More information about the x265-devel mailing list