[x265] [PATCH] asm: pixelavg_pp[48x64] avx2 8bpp code

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Fri Sep 11 07:13:11 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1441881122 -19800
#      Thu Sep 10 16:02:02 2015 +0530
# Node ID 6bbfbfc5611e804938a49bb83689c7d81d1844c1
# Parent  365f7ed4d89628d49cd6af8d81d4edc01f73ffad
asm: pixelavg_pp[48x64] avx2 8bpp code

avx2:
avg_pp[48x64]  31.62x  1687.89         53372.89

sse2:
avg_pp[48x64]  21.47x   2519.02         54079.22

diff -r 365f7ed4d896 -r 6bbfbfc5611e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Sep 10 16:02:02 2015 +0530
@@ -2826,6 +2826,7 @@
         p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx2);
         p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx2);
         p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx2);
+        p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx2);
         p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx2);
         p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx2);
         p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx2);
diff -r 365f7ed4d896 -r 6bbfbfc5611e source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/x86/mc-a.asm	Thu Sep 10 16:02:02 2015 +0530
@@ -4418,6 +4418,37 @@
     call pixel_avg_16x64_8bit
     call pixel_avg_16x64_8bit
     RET
+
+cglobal pixel_avg_48x64, 6,7,4          ; 48x64 byte-wise average of two blocks: r0/r1 = dst ptr/stride, r2/r3 = src A ptr/stride, r4/r5 = src B ptr/stride
+   mov          r6d, 4                  ; r6d = pass counter: 4 passes x 16 rows per pass = 64 rows
+.loop:
+%rep 8                                  ; unrolled 8x, two rows per copy => 16 rows per pass
+    movu        m0, [r2]                ; row 0, src A: first mmsize bytes (unaligned load)
+    movu        xm2, [r2 + mmsize]      ; row 0, src A: tail after the first mmsize bytes (presumably 16 bytes -> 48 total; confirm mmsize = 32)
+    movu        m1, [r4]                ; row 0, src B: first mmsize bytes
+    movu        xm3, [r4 + mmsize]      ; row 0, src B: tail
+    pavgb       m0, m1                  ; per-byte unsigned average with rounding: (a + b + 1) >> 1
+    pavgb       xm2, xm3                ; same for the tail columns
+    movu        [r0], m0                ; stores are issued only after all four loads of this row
+    movu        [r0 + mmsize], xm2
+
+    movu        m0, [r2 + r3]           ; row 1: same sequence, one stride further down each buffer
+    movu        xm2, [r2 + r3 + mmsize]
+    movu        m1, [r4 + r5]
+    movu        xm3, [r4 + r5 + mmsize]
+    pavgb       m0, m1
+    pavgb       xm2, xm3
+    movu        [r0 + r1], m0
+    movu        [r0 + r1 + mmsize], xm2
+
+    lea         r2, [r2 + r3 * 2]       ; advance src A by two rows (lea leaves flags untouched)
+    lea         r4, [r4 + r5 * 2]       ; advance src B by two rows
+    lea         r0, [r0 + r1 * 2]       ; advance dst by two rows
+%endrep
+
+    dec         r6d                     ; one 16-row pass finished
+    jnz         .loop                   ; repeat until all 4 passes are done
+    RET                                 ; all 64 rows written
 %endif
 
 ;=============================================================================


More information about the x265-devel mailing list