[x265] [PATCH 3 of 3] asm: rewrite interpolate hps for widths [32, 48, 64], ~20% faster

Min Chen chenm003 at 163.com
Tue Apr 12 19:31:02 CEST 2016


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1460482251 18000
# Node ID f74e220607e15ea4c00645e59055996767303aaa
# Parent  37e80d50caf51a74e85c83f24317935171a5d375
asm: rewrite interpolate hps width of [32,48,64], improve ~20%
OLD:
  luma_hps[32x32]         6.32x    16429.69        103771.02
  luma_hps[32x16]         6.04x    10121.56        61140.21
  luma_hps[32x64]         6.47x    30813.70        199438.95
  luma_hps[32x24]         6.23x    13277.26        82747.75
  luma_hps[48x64]         6.13x    46002.25        282176.44
  luma_hps[64x64]         6.15x    61393.88        377670.03
  luma_hps[64x32]         6.79x    33001.77        224096.58
  luma_hps[64x48]         6.21x    47242.66        293529.16
  luma_hps[64x16]         6.51x    19207.61        125016.56

NEW:
  luma_hps[32x32]         7.66x    13404.22        102730.96
  luma_hps[32x16]         7.32x    8355.57         61133.25
  luma_hps[32x64]         7.68x    24496.17        188086.11
  luma_hps[32x24]         8.00x    10879.09        87077.93
  luma_hps[48x64]         7.62x    37094.37        282758.94
  luma_hps[64x64]         7.82x    48535.86        379390.78
  luma_hps[64x32]         7.91x    26512.17        209755.50
  luma_hps[64x48]         8.06x    37020.63        298498.28
  luma_hps[64x16]         7.95x    15479.03        123132.41
---
 source/common/x86/ipfilter16.asm |  100 +++++++++++++++++++-------------------
 1 files changed, 50 insertions(+), 50 deletions(-)

diff -r 37e80d50caf5 -r f74e220607e1 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm	Tue Apr 12 12:30:48 2016 -0500
+++ b/source/common/x86/ipfilter16.asm	Tue Apr 12 12:30:51 2016 -0500
@@ -116,6 +116,7 @@
                   dw  -1, 4, -11, 40,  40, -11, 4, -1
                   dw   0, 1, -5,  17,  58, -10, 4, -1
 
+ALIGN 32
 tab_LumaCoeffV:   times 4 dw 0, 0
                   times 4 dw 0, 64
                   times 4 dw 0, 0
@@ -161,9 +162,8 @@
 const interp8_hpp_shuf,     db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
                             db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
 
-const pb_shuf,  db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
-                db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13
-
+const interp8_hpp_shuf_new, db 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9
+                            db 4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
 
 SECTION .text
 cextern pd_8
@@ -10407,7 +10407,7 @@
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10475,7 +10475,7 @@
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10536,16 +10536,16 @@
     add                 r3d, r3d
     mov                 r4d, r4m
     mov                 r5d, r5m
-    shl                 r4d, 4
-%ifdef PIC
-    lea                 r6, [tab_LumaCoeff]
-    vpbroadcastq        m0, [r6 + r4]
-    vpbroadcastq        m1, [r6 + r4 + 8]
-%else
-    vpbroadcastq        m0, [tab_LumaCoeff + r4]
-    vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
-%endif
-    mova                m3, [pb_shuf]
+    shl                 r4d, 6
+%ifdef PIC
+    lea                 r6, [tab_LumaCoeffV]
+    movu                m0, [r6 + r4]
+    movu                m1, [r6 + r4 + mmsize]
+%else
+    movu                m0, [tab_LumaCoeffV + r4]
+    movu                m1, [tab_LumaCoeffV + r4 + mmsize]
+%endif
+    mova                m3, [interp8_hpp_shuf_new]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10554,7 +10554,7 @@
     sub                 r0, 6
     test                r5d, r5d
     mov                 r4d, %2
-    jz                  .loop0
+    jz                 .loop0
     lea                 r6, [r1*3]
     sub                 r0, r6
     add                 r4d, 7
@@ -10563,64 +10563,64 @@
 %assign x 0
 %rep %1/16
     vbroadcasti128      m4, [r0 + x]
-    vbroadcasti128      m5, [r0 + 8 + x]
+    vbroadcasti128      m5, [r0 + 4 * SIZEOF_PIXEL + x]
     pshufb              m4, m3
-    pshufb              m7, m5, m3
+    pshufb              m5, m3
 
     pmaddwd             m4, m0
-    pmaddwd             m7, m1
+    pmaddwd             m7, m5, m1
     paddd               m4, m7
+    vextracti128        xm7, m4, 1
+    paddd               xm4, xm7
+    paddd               xm4, xm2
+    psrad               xm4, INTERP_SHIFT_PS
 
     vbroadcasti128      m6, [r0 + 16 + x]
+    pshufb              m6, m3
+
+    pmaddwd             m5, m0
+    pmaddwd             m7, m6, m1
+    paddd               m5, m7
+    vextracti128        xm7, m5, 1
+    paddd               xm5, xm7
+    paddd               xm5, xm2
+    psrad               xm5, INTERP_SHIFT_PS
+
+    packssdw            xm4, xm5
+    movu                [r2 + x], xm4
+
+    vbroadcasti128      m5, [r0 + 24 + x]
     pshufb              m5, m3
-    pshufb              m7, m6, m3
+
+    pmaddwd             m6, m0
+    pmaddwd             m7, m5, m1
+    paddd               m6, m7
+    vextracti128        xm7, m6, 1
+    paddd               xm6, xm7
+    paddd               xm6, xm2
+    psrad               xm6, INTERP_SHIFT_PS
+
+    vbroadcasti128      m7, [r0 + 32 + x]
+    pshufb              m7, m3
 
     pmaddwd             m5, m0
     pmaddwd             m7, m1
     paddd               m5, m7
-
-    phaddd              m4, m5
-    vpermq              m4, m4, q3120
-    paddd               m4, m2
-    vextracti128        xm5,m4, 1
-    psrad               xm4, INTERP_SHIFT_PS
+    vextracti128        xm7, m5, 1
+    paddd               xm5, xm7
+    paddd               xm5, xm2
     psrad               xm5, INTERP_SHIFT_PS
-    packssdw            xm4, xm5
-
-    movu                [r2 + x], xm4
-
-    vbroadcasti128      m5, [r0 + 24 + x]
-    pshufb              m6, m3
-    pshufb              m7, m5, m3
-
-    pmaddwd             m6, m0
-    pmaddwd             m7, m1
-    paddd               m6, m7
-
-    vbroadcasti128      m7, [r0 + 32 + x]
-    pshufb              m5, m3
-    pshufb              m7, m3
-
-    pmaddwd             m5, m0
-    pmaddwd             m7, m1
-    paddd               m5, m7
-
-    phaddd              m6, m5
-    vpermq              m6, m6, q3120
-    paddd               m6, m2
-    vextracti128        xm5,m6, 1
-    psrad               xm6, INTERP_SHIFT_PS
-    psrad               xm5, INTERP_SHIFT_PS
+
     packssdw            xm6, xm5
-
     movu                [r2 + 16 + x], xm6
-    %assign x x+32
-    %endrep
+
+%assign x x+32
+%endrep
 
     add                 r2, r3
     add                 r0, r1
     dec                 r4d
-    jnz                 .loop0
+    jnz                .loop0
     RET
 %endif
 %endmacro
@@ -10656,7 +10656,7 @@
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10749,7 +10749,7 @@
     vpbroadcastq        m0, [tab_LumaCoeff + r4]
     vpbroadcastq        m1, [tab_LumaCoeff + r4 + 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10824,7 +10824,7 @@
 %else
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10883,7 +10883,7 @@
 %else
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -10956,7 +10956,7 @@
 %else
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -11038,7 +11038,7 @@
 %else
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -11103,7 +11103,7 @@
 %else
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -11204,7 +11204,7 @@
 %else
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -11357,7 +11357,7 @@
 %else
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map
@@ -11477,7 +11477,7 @@
 %else
     vpbroadcastq        m0, [tab_ChromaCoeff + r4 * 8]
 %endif
-    mova                m3, [pb_shuf]
+    mova                m3, [interp8_hpp_shuf]
     vbroadcasti128      m2, [INTERP_OFFSET_PS]
 
     ; register map



More information about the x265-devel mailing list