[x265] [PATCH] asm: improve avx2 code for add_ps[32x32] (1428 -> 1312)

Sumalatha Polureddy sumalatha at multicorewareinc.com
Tue Apr 7 11:09:08 CEST 2015


# HG changeset patch
# User Sumalatha Polureddy
# Date 1428395385 -19800
#      Tue Apr 07 13:59:45 2015 +0530
# Node ID f0de9cd5399a5b49293d328ff56476ddc16066e9
# Parent  095ed87526e5964ad45949ec81903704451d79f8
asm: improve avx2 code for add_ps[32x32] (1428 -> 1312)

diff -r 095ed87526e5 -r f0de9cd5399a source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm	Tue Apr 07 09:42:31 2015 +0530
+++ b/source/common/x86/pixeladd8.asm	Tue Apr 07 13:59:45 2015 +0530
@@ -570,10 +570,14 @@
     jnz         .loop
     RET
 
+%if ARCH_X86_64
 INIT_YMM avx2
-cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_32x%2, 6, 10, 8, dest, destride, src0, scr1, srcStride0, srcStride1
     mov         r6d,        %2/4
     add         r5,         r5
+    lea         r7,         [r4 * 3]
+    lea         r8,         [r5 * 3]
+    lea         r9,         [r1 * 3]
 .loop:
     pmovzxbw    m0,         [r2]                ; first half of row 0 of src0
     pmovzxbw    m1,         [r2 + 16]           ; second half of row 0 of src0
@@ -597,40 +601,37 @@
     vpermq      m0, m0, 11011000b
     movu        [r0 + r1],      m0              ; row 1 of dst
 
-    lea         r2,         [r2 + r4 * 2]
-    lea         r3,         [r3 + r5 * 2]
-    lea         r0,         [r0 + r1 * 2]
-
-    pmovzxbw    m0,         [r2]                ; first half of row 2 of src0
-    pmovzxbw    m1,         [r2 + 16]           ; second half of row 2 of src0
-    movu        m2,         [r3]                ; first half of row 2 of src1
-    movu        m3,         [r3 + 32]           ; second half of row 2 of src1
+    pmovzxbw    m0,         [r2 + r4 * 2]       ; first half of row 2 of src0
+    pmovzxbw    m1,         [r2 + r4 * 2 + 16]  ; second half of row 2 of src0
+    movu        m2,         [r3 + r5 * 2]       ; first half of row 2 of src1
+    movu        m3,         [r3 + + r5 * 2 + 32]; second half of row 2 of src1
 
     paddw       m0,         m2
     paddw       m1,         m3
     packuswb    m0,         m1
     vpermq      m0, m0, 11011000b
-    movu        [r0],      m0                   ; row 2 of dst
+    movu        [r0 + r1 * 2],      m0          ; row 2 of dst
 
-    pmovzxbw    m0,         [r2 + r4]           ; first half of row 3 of src0
-    pmovzxbw    m1,         [r2 + r4 + 16]      ; second half of row 3 of src0
-    movu        m2,         [r3 + r5]           ; first half of row 3 of src1
-    movu        m3,         [r3 + r5 + 32]      ; second half of row 3 of src1
+    pmovzxbw    m0,         [r2 + r7]           ; first half of row 3 of src0
+    pmovzxbw    m1,         [r2 + r7 + 16]      ; second half of row 3 of src0
+    movu        m2,         [r3 + r8]           ; first half of row 3 of src1
+    movu        m3,         [r3 + r8 + 32]      ; second half of row 3 of src1
 
     paddw       m0,         m2
     paddw       m1,         m3
     packuswb    m0,         m1
     vpermq      m0, m0, 11011000b
-    movu        [r0 + r1],      m0              ; row 3 of dst
+    movu        [r0 + r9],      m0              ; row 3 of dst
 
-    lea         r2,         [r2 + r4 * 2]
-    lea         r3,         [r3 + r5 * 2]
-    lea         r0,         [r0 + r1 * 2]
+    lea         r2,         [r2 + r4 * 4]
+    lea         r3,         [r3 + r5 * 4]
+    lea         r0,         [r0 + r1 * 4]
 
     dec         r6d
     jnz         .loop
     RET
 %endif
+%endif
 %endmacro
 
 PIXEL_ADD_PS_W32_H2 32, 32


More information about the x265-devel mailing list