[x265] [PATCH] asm: improve filterHorizontal_p_p_4 by reordering intermediate data

Min Chen chenm003 at 163.com
Fri Oct 11 08:52:07 CEST 2013


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1381474318 -28800
# Node ID a4037c2f8e046838452f1d8d139f694944a519c2
# Parent  067790d03bad0758fa3523288f008c9c76dd726e
asm: improve filterHorizontal_p_p_4 by reordering intermediate data

1. replace phaddw with paddw
2. use an extra load operation to break the data dependency and reduce the table size

diff -r 067790d03bad -r a4037c2f8e04 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Fri Oct 11 13:51:46 2013 +0800
+++ b/source/common/x86/ipfilter8.asm	Fri Oct 11 14:51:58 2013 +0800
@@ -31,22 +31,22 @@
 SECTION_RODATA 32
 tab_leftmask:   db -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0
 
-tab_Tm:     db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-            db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
+tab_Tm:     db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
 
 tab_c_512:  times 8 dw 512
 
 SECTION .text
 
-%macro FILTER_H4 3
+%macro FILTER_H4 2
     movu        %1, [src + col - 1]
-    pshufb      %2, %1, Tm4
-    pmaddubsw   %2, coef2
-    pshufb      %1, %1, Tm5
+    pshufb      %1, Tm4
     pmaddubsw   %1, coef2
-    phaddw      %2, %1
-    pmulhrsw    %2, %3
-    packuswb    %2, %2
+    movu        %2, [src + col + 1]
+    pshufb      %2, Tm4
+    pmaddubsw   %2, coef3
+    paddw       %1, %2
+    pmulhrsw    %1, c512
+    packuswb    %1, %1
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -62,9 +62,9 @@
 %define widthleft   r5
 %define mask_offset r6
 %define coef2       m7
-%define x3          m6
-%define Tm5         m5
-%define Tm4         m4
+%define coef3       m6
+%define Tm4         m5
+%define c512        m4
 %define x2          m3
 %define x1          m2
 %define x0          m1
@@ -73,11 +73,12 @@
 %define tmp1        r1
  
     mov         tmp,        r6m
-    movu        coef2,      [tmp]
+    movd        coef2,      [tmp    ]
+    movd        coef3,      [tmp + 4]
+    pshufd      coef2,      coef2,  0
+    pshufd      coef3,      coef3,  0
     packsswb    coef2,      coef2
-    pshufd      coef2,      coef2,      0
-
-    mova        x3,         [tab_c_512]
+    packsswb    coef3,      coef3
 
     mov         width,      r4m
     mov         widthleft,  width
@@ -88,7 +89,7 @@
 
     movq        leftmask,   [tab_leftmask + (7 + mask_offset)]
     mova        Tm4,        [tab_Tm]
-    mova        Tm5,        [tab_Tm + 16]
+    mova        c512,       [tab_c_512]
 
     mov         src,        r0m
     mov         dst,        r2m
@@ -101,8 +102,8 @@
     cmp         col,        width
     jge         _end_col
 
-    FILTER_H4   x0, x1, x3
-    movh        [dst + col], x1
+    FILTER_H4   x0, x1
+    movh        [dst + col], x0
 
     add         col,         8
     jmp         _loop_col
@@ -112,8 +113,8 @@
     jz          _next_row
 
     movq        x2, [dst + col]
-    FILTER_H4   x0, x1, x3
-    pblendvb    x2, x2, x1, leftmask
+    FILTER_H4   x0, x1
+    pblendvb    x2, x2, x0, leftmask
     movh        [dst + col], x2
 
 _next_row:



More information about the x265-devel mailing list