[x265] [PATCH] asm: avx2 code for addavg[24x64] for 10 bpp (5584 -> 4052)

sumalatha at multicorewareinc.com sumalatha at multicorewareinc.com
Tue May 26 06:57:58 CEST 2015


# HG changeset patch
# User Sumalatha Polureddy
# Date 1432616269 -19800
#      Tue May 26 10:27:49 2015 +0530
# Node ID 9162e9b4edc45c9333889dbd0b8d90374dc32921
# Parent  1404801f5874bb421b57b1e46c84ad888f5250a2
asm: avx2 code for addavg[24x64] for 10 bpp (5584 -> 4052)

sse4:
[i422]  addAvg[24x64]  10.09x   5584.70         56358.54
avx2:
[i422]  addAvg[24x64]  13.89x   4052.17         56287.46

diff -r 1404801f5874 -r 9162e9b4edc4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon May 25 15:50:39 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue May 26 10:27:49 2015 +0530
@@ -1249,6 +1249,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = x265_addAvg_16x24_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg = x265_addAvg_16x8_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg = x265_addAvg_8x64_avx2;
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = x265_addAvg_24x64_avx2;
 
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
         p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
diff -r 1404801f5874 -r 9162e9b4edc4 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Mon May 25 15:50:39 2015 +0530
+++ b/source/common/x86/mc-a.asm	Tue May 26 10:27:49 2015 +0530
@@ -1304,6 +1304,62 @@
     jnz         .loop
     RET
 
+cglobal addAvg_24x64, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride ; 24x64 addAvg (10 bpp build per the commit message): r0=pSrc0, r1=pSrc1, r2=pDst, r3=iStride0, r4=iStride1, r5=iDstStride
+    mova        m4,              [pw_512]            ; m4 = pw_512 constant (defined outside this hunk; presumably words of 512), added after the scaling step
+    mova        m5,              [pw_1023]           ; m5 = pw_1023 constant, used as the upper clamp (presumably the 10-bit pixel maximum)
+    paddw       m3,              m4,  m4             ; m3 = m4 + m4 per word: the pmulhrsw multiplier (1024 if pw_512 holds 512)
+    pxor        m1,              m1                  ; m1 = 0, used as the lower clamp
+    add         r3,              r3                  ; double iStride0 (presumably samples -> bytes for 16-bit data; TODO confirm with caller)
+    add         r4,              r4                  ; double iStride1, same reason
+    add         r5,              r5                  ; double iDstStride, same reason
+
+    mov         r6d,             32                  ; loop counter: 32 iterations x 2 rows per iteration = 64 rows
+
+.loop:
+    movu        m0,              [r0]                ; row 0, first vector of src0 (m# presumably ymm = 16 words; INIT_YMM is not visible in this hunk)
+    movu        m2,              [r1]                ; row 0, first vector of src1
+    paddw       m0,              m2                  ; src0 + src1
+    pmulhrsw    m0,              m3                  ; rounded (x * m3) >> 15; equals (x + 16) >> 5 when m3 words are 1024
+    paddw       m0,              m4                  ; add the m4 bias
+    pmaxsw      m0,              m1                  ; clamp below at 0
+    pminsw      m0,              m5                  ; clamp above at m5
+    movu        [r2],            m0                  ; store row 0, first vector
+
+    movu        xm0,             [r0 + 32]           ; row 0, remaining 8 words at byte offset 32 (128-bit access)
+    movu        xm2,             [r1 + 32]
+    paddw       xm0,             xm2                 ; same add / scale / bias / clamp sequence on the 128-bit tail
+    pmulhrsw    xm0,             xm3
+    paddw       xm0,             xm4
+    pmaxsw      xm0,             xm1
+    pminsw      xm0,             xm5
+    movu        [r2 + 32],       xm0                 ; store row 0 tail
+
+    movu        m0,              [r0 + r3]           ; row 1 (one stride further), first vector of src0
+    movu        m2,              [r1 + r4]           ; row 1, first vector of src1
+    paddw       m0,              m2
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m1
+    pminsw      m0,              m5
+    movu        [r2 + r5],       m0                  ; store row 1, first vector
+
+    movu        xm2,             [r0 + r3 + 32]      ; row 1 tail; register roles are swapped here (xm2 accumulates, xm0 is the second operand)
+    movu        xm0,             [r1 + r4 + 32]
+    paddw       xm2,             xm0
+    pmulhrsw    xm2,             xm3
+    paddw       xm2,             xm4
+    pmaxsw      xm2,             xm1
+    pminsw      xm2,             xm5
+    movu        [r2 + r5 + 32],  xm2                 ; store row 1 tail
+
+    lea         r2,              [r2 + 2 * r5]       ; advance dst by the two rows just written
+    lea         r0,              [r0 + 2 * r3]       ; advance src0 by two rows
+    lea         r1,              [r1 + 2 * r4]       ; advance src1 by two rows
+
+    dec         r6d                                  ; one fewer row pair remaining
+    jnz         .loop                                ; repeat until all 32 row pairs are done
+    RET
+
 %macro ADDAVG_W32_H2_AVX2 1
 cglobal addAvg_32x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
     mova        m4,              [pw_512]


More information about the x265-devel mailing list