[x265] [PATCH REVIEW Only ] chroma 4XN block, coeffIdex insted of coeff pointer

chen chenm003 at 163.com
Sat Oct 12 05:34:43 CEST 2013


x86 addressing only allows an index scale factor of 1, 2, 4, or 8, so the operand [tab_coeff + height * 16] in your instruction is invalid (it cannot be encoded).
 
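We can replace it with the following (note: this requires each tab_coeff row to be stored as 8 bytes rather than 16, so that index * 8 selects the right row):
movq x, [tab_coeff + height * 8]
punpcklqdq x, x  (or pshufd x, x, 01000100b, which is faster on some CPUs)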

At 2013-10-12 01:51:57,"Praveen Tiwari" <praveen at multicorewareinc.com> wrote:

I forgot to change the line  mova        coef2,       [tab_coeff + 16]  (I was only testing with coeffIdx 1). I will make it work for an arbitrary index, like  mova    coef2,       [tab_coeff + height * 16]. Please ignore this.


Regards,
Praveen



On Fri, Oct 11, 2013 at 10:20 PM, <praveen at multicorewareinc.com> wrote:
# HG changeset patch
# User Praveen Tiwari
# Date 1381510220 -19800
# Node ID 5a9160e8b0bdc3117c2417bc29453077488efd8e
# Parent  c6d89dc62e191f56f63dbcb1781a6494da50a70d
chroma 4XN block, coeffIdex insted of coeff pointer

diff -r c6d89dc62e19 -r 5a9160e8b0bd source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm   Fri Oct 11 01:47:53 2013 -0500
+++ b/source/common/x86/ipfilter8.asm   Fri Oct 11 22:20:20 2013 +0530
@@ -26,107 +26,58 @@
 %include "x86inc.asm"
 %include "x86util.asm"

-%if ARCH_X86_64 == 0
-
 SECTION_RODATA 32
-tab_leftmask:   db -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0
-
 tab_Tm:     db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-            db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10

 tab_c_512:  times 8 dw 512

+tab_coeff:    db  0, 64, 0, 0, 0, 64, 0, 0, 0, 64, 0, 0, 0, 64, 0, 0
+              db -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2, -2, 58, 10, -2
+              db -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2, -4, 54, 16, -2
+              db -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4, -6, 46, 28, -4
+              db -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4, -4, 36, 36, -4
+              db -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6, -4, 28, 46, -6
+              db -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4, -2, 16, 54, -4
+              db -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2, -2, 10, 58, -2
+
 SECTION .text

-%macro FILTER_H4 3
-    movu        %1, [src + col - 1]
-    pshufb      %2, %1, Tm4
+%macro FILTER_H4_w4 3
+    movu        %1, [srcq - 1]
+    pshufb      %2, %1, Tm0
     pmaddubsw   %2, coef2
-    pshufb      %1, %1, Tm5
-    pmaddubsw   %1, coef2
     phaddw      %2, %1
     pmulhrsw    %2, %3
     packuswb    %2, %2
 %endmacro

+%macro FILTER_H4_w4_CALL 0
+    FILTER_H4_w4   x0, x1, x2
+
+    movd        [dstq],      x1
+
+    add         srcq,        srcstrideq
+    add         dstq,        dststrideq
+%endmacro
+
 ;-----------------------------------------------------------------------------
-; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
+; void interp_4tap_horiz_pp_w4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int height, int coeffIdx)
 ;-----------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal filterHorizontal_p_p_4, 0, 7, 8
-%define src         r0
-%define dst         r1
-%define row         r2
-%define col         r3
-%define width       r4
-%define widthleft   r5
-%define mask_offset r6
-%define coef2       m7
-%define x3          m6
-%define Tm5         m5
-%define Tm4         m4
-%define x2          m3
-%define x1          m2
-%define x0          m1
-%define leftmask    m0
-%define tmp         r0
-%define tmp1        r1
-
-    mov         tmp,        r6m
-    movu        coef2,      [tmp]
-    packsswb    coef2,      coef2
-    pshufd      coef2,      coef2,      0
+cglobal interp_4tap_horiz_pp_w4, 6, 6, 5, src, srcstride, dst, dststride, height, coeffIdx
+%define coef2       m4
+%define Tm0         m3
+%define x2          m2
+%define x1          m1
+%define x0          m0

-    mova        x3,         [tab_c_512]
+    mova        coef2,       [tab_coeff + 16]
+    mova        x2,          [tab_c_512]
+    mova        Tm0,         [tab_Tm]

-    mov         width,      r4m
-    mov         widthleft,  width
-    and         width,      ~7
-    and         widthleft,  7
-    mov         mask_offset,  widthleft
-    neg         mask_offset
+.loop
+FILTER_H4_w4_CALL
+dec          r4d
+jnz         .loop
+RET

-    movq        leftmask,   [tab_leftmask + (7 + mask_offset)]
-    mova        Tm4,        [tab_Tm]
-    mova        Tm5,        [tab_Tm + 16]
-
-    mov         src,        r0m
-    mov         dst,        r2m
-    mov         row,        r5m
-
-_loop_row:
-    xor         col,        col
-
-_loop_col:
-    FILTER_H4   x0, x1, x3
-    movh        [dst + col], x1
-
-    add         col,         8
-
-    cmp         col,        width
-    jl         _loop_col
-
-_end_col:
-    test        widthleft,  widthleft
-    jz          _next_row
-
-    movq        x2, [dst + col]
-    FILTER_H4   x0, x1, x3
-    pblendvb    x2, x2, x1, leftmask
-    movh        [dst + col], x2
-
-_next_row:
-    add         src,        r1m
-    add         dst,        r3m
-    dec         row
-
-    test        row,        row
-    jz          _end_row
-
-    jmp         _loop_row
-
-_end_row:
-
-    RET
-
-%endif  ; ARCH_X86_64 == 0


-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131012/84deac46/attachment-0001.html>


More information about the x265-devel mailing list