[x265] [PATCH] asm: avx2 code for pixelavg_pp 32xN & 64xN, over 40% faster than SSE

dnyaneshwar at multicorewareinc.com
Mon Aug 3 12:15:17 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1438596650 -19800
#      Mon Aug 03 15:40:50 2015 +0530
# Node ID 43fe4ec1c13a2514030010c2cd699382b67f65cb
# Parent  a3b72e2a25a7fc544b1b76e872eda012035bf4ac
asm: avx2 code for pixelavg_pp 32xN & 64xN, over 40% faster than SSE
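
Background for reviewers: pixelavg_pp writes the rounded average of two
prediction blocks, i.e. each output pixel is (src0 + src1 + 1) >> 1, which is
exactly what pavgb computes per unsigned byte. A minimal scalar sketch of the
operation these kernels vectorise (illustrative only, not the actual x265 C
reference; the trailing int parameter of the primitive is ignored here):

    #include <stdint.h>

    /* Rounded average of two 8-bit blocks; strides are in pixels. */
    static void avg_pp_sketch(uint8_t* dst, intptr_t dstride,
                              const uint8_t* src0, intptr_t sstride0,
                              const uint8_t* src1, intptr_t sstride1,
                              int width, int height)
    {
        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
                dst[x] = (uint8_t)((src0[x] + src1[x] + 1) >> 1);

            dst  += dstride;
            src0 += sstride0;
            src1 += sstride1;
        }
    }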

diff -r a3b72e2a25a7 -r 43fe4ec1c13a source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Mon Aug 03 10:28:34 2015 +0530
+++ b/source/common/x86/mc-a.asm	Mon Aug 03 15:40:50 2015 +0530
@@ -4300,24 +4300,12 @@
 AVGH  4,  8
 AVGH  4,  4
 AVGH  4,  2
+
 INIT_XMM avx2
 ; TODO: activate AVX2 after debug
 ;AVG_FUNC 24, movdqu, movdqa
 ;AVGH 24, 32
 
-AVG_FUNC 64, movdqu, movdqa
-AVGH 64, 64
-AVGH 64, 48
-AVGH 64, 32
-AVGH 64, 16
-
-AVG_FUNC 32, movdqu, movdqa
-AVGH 32, 64
-AVGH 32, 32
-AVGH 32, 24
-AVGH 32, 16
-AVGH 32, 8
-
 AVG_FUNC 16, movdqu, movdqa
 AVGH 16, 64
 AVGH 16, 32
@@ -4328,7 +4316,109 @@
 
 %endif ;HIGH_BIT_DEPTH
 
-
+;-------------------------------------------------------------------------------------------------------------------------------
+;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
+;-------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64 && BIT_DEPTH == 8
+INIT_YMM avx2
+cglobal pixel_avg_8x32
+%rep 4
+    movu        m0, [r2]
+    movu        m2, [r2 + r3]
+    movu        m1, [r4]
+    movu        m3, [r4 + r5]
+    pavgb       m0, m1
+    pavgb       m2, m3
+    movu        [r0], m0
+    movu        [r0 + r1], m2
+
+    lea         r2, [r2 + r3 * 2]
+    lea         r4, [r4 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+%endrep
+    ret
+
+cglobal pixel_avg_16x64_8bit
+%rep 8
+    movu        m0, [r2]
+    movu        m2, [r2 + mmsize]
+    movu        m1, [r4]
+    movu        m3, [r4 + mmsize]
+    pavgb       m0, m1
+    pavgb       m2, m3
+    movu        [r0], m0
+    movu        [r0 + mmsize], m2
+
+    movu        m0, [r2 + r3]
+    movu        m2, [r2 + r3 + mmsize]
+    movu        m1, [r4 + r5]
+    movu        m3, [r4 + r5 + mmsize]
+    pavgb       m0, m1
+    pavgb       m2, m3
+    movu        [r0 + r1], m0
+    movu        [r0 + r1 + mmsize], m2
+
+    lea         r2, [r2 + r3 * 2]
+    lea         r4, [r4 + r5 * 2]
+    lea         r0, [r0 + r1 * 2]
+%endrep
+    ret
+
+cglobal pixel_avg_32x8, 6,6,4
+    call pixel_avg_8x32
+    RET
+
+cglobal pixel_avg_32x16, 6,6,4
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    RET
+
+cglobal pixel_avg_32x24, 6,6,4
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    RET
+
+cglobal pixel_avg_32x32, 6,6,4
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    RET
+
+cglobal pixel_avg_32x64, 6,6,4
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    call pixel_avg_8x32
+    RET
+
+cglobal pixel_avg_64x16, 6,6,4
+    call pixel_avg_16x64_8bit
+    RET
+
+cglobal pixel_avg_64x32, 6,6,4
+    call pixel_avg_16x64_8bit
+    call pixel_avg_16x64_8bit
+    RET
+
+cglobal pixel_avg_64x48, 6,6,4
+    call pixel_avg_16x64_8bit
+    call pixel_avg_16x64_8bit
+    call pixel_avg_16x64_8bit
+    RET
+
+cglobal pixel_avg_64x64, 6,6,4
+    call pixel_avg_16x64_8bit
+    call pixel_avg_16x64_8bit
+    call pixel_avg_16x64_8bit
+    call pixel_avg_16x64_8bit
+    RET
+%endif
 
 ;=============================================================================
 ; pixel avg2
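
A note on how the new kernels are composed: each 32xN entry point calls the
shared 8-row helper (labelled pixel_avg_8x32 above) N/8 times; the helper
averages 8 rows of 32 pixels, one ymm register per row, and leaves r0/r2/r4
pointing at the next group of rows, so the callers need no pointer fix-up
between calls. The 64xN entry points do the same with the 16-row, 64-pixel
helper (two ymm loads/stores per row), called N/16 times. Roughly, in C
(illustrative sketch only, names not taken from the sources):

    #include <stdint.h>

    /* Width-32 rounded average; each outer iteration mirrors one
     * "call pixel_avg_8x32": 8 rows written, pointers advanced in place. */
    static void pixel_avg_32xN_sketch(int N, uint8_t* dst, intptr_t dstride,
                                      const uint8_t* s0, intptr_t sstride0,
                                      const uint8_t* s1, intptr_t sstride1)
    {
        for (int i = 0; i < N / 8; i++)
            for (int y = 0; y < 8; y++, dst += dstride, s0 += sstride0, s1 += sstride1)
                for (int x = 0; x < 32; x++)
                    dst[x] = (uint8_t)((s0[x] + s1[x] + 1) >> 1);
    }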

