[x265] [PATCH] asm: addAvg high_bit_depth avx2 asm for chroma sizes width >= 8, reused code from luma

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Thu May 14 14:03:06 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1431603734 -19800
#      Thu May 14 17:12:14 2015 +0530
# Node ID c237e9a1fd58fb95d6cab19385fb971989907739
# Parent  cd88d1705bcb952e2dd2e0f753a510a86909560f
asm: addAvg high_bit_depth avx2 asm for chroma sizes width >= 8, reused code from luma

diff -r cd88d1705bcb -r c237e9a1fd58 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu May 14 15:39:53 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu May 14 17:12:14 2015 +0530
@@ -1204,6 +1204,23 @@
         p.pu[LUMA_64x48].addAvg = x265_addAvg_64x48_avx2;
         p.pu[LUMA_64x64].addAvg = x265_addAvg_64x64_avx2;
 
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg   = x265_addAvg_8x2_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg   = x265_addAvg_8x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg   = x265_addAvg_8x6_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg   = x265_addAvg_8x8_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg  = x265_addAvg_8x16_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg  = x265_addAvg_8x32_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = x265_addAvg_12x16_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg  = x265_addAvg_16x4_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg  = x265_addAvg_16x8_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = x265_addAvg_16x12_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = x265_addAvg_16x16_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = x265_addAvg_16x32_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg  = x265_addAvg_32x8_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = x265_addAvg_32x16_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = x265_addAvg_32x24_avx2;
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = x265_addAvg_32x32_avx2;
+
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
         p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_avx2;
         p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_avx2;
diff -r cd88d1705bcb -r c237e9a1fd58 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Thu May 14 15:39:53 2015 +0530
+++ b/source/common/x86/mc-a.asm	Thu May 14 17:12:14 2015 +0530
@@ -1022,6 +1022,83 @@
 ; avx2 asm for addAvg high_bit_depth
 ;------------------------------------------------------------------------------
 INIT_YMM avx2
+cglobal addAvg_8x2, 6,6,2, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride ; x86inc: 6 args in r0..r5 (in the order named), 6 GPRs, vector regs m0-m1; handles two rows in one ymm pass
+    movu        xm0,         [r0]                     ; low lane of m0  = 16 bytes (8 words) at pSrc0, row 0
+    vinserti128 m0,          m0, [r0 + r3 * 2], 1     ; high lane of m0 = 16 bytes at pSrc0 + 2*iStride0, row 1 (x2 presumably turns a 16-bit-element stride into bytes - confirm)
+    movu        xm1,         [r1]                     ; low lane of m1  = 16 bytes at pSrc1, row 0
+    vinserti128 m1,          m1, [r1 + r4 * 2], 1     ; high lane of m1 = 16 bytes at pSrc1 + 2*iStride1, row 1

+    paddw       m0,          m1                       ; per-word sum src0 + src1 for both rows
+    pxor        m1,          m1                       ; m1 = 0, reused below as the lower clamp bound
+    pmulhrsw    m0,          [pw_1024]                ; per-word rounded (x * c) >> 15; with c = 1024 this is (x + 16) >> 5 - assumes pw_1024 holds 1024 in every word (defined outside this hunk)
+    paddw       m0,          [pw_512]                 ; add the per-word constant at pw_512 (presumably 512; defined outside this hunk)
+    pmaxsw      m0,          m1                       ; clamp below at 0 (signed word max against zero)
+    pminsw      m0,          [pw_1023]                ; clamp above at the words in pw_1023 (presumably 1023, i.e. a 10-bit pixel maximum - confirm)
+    vextracti128 xm1,        m0, 1                    ; xm1 = high lane = finished row 1
+    movu        [r2],        xm0                      ; store row 0 at pDst
+    movu        [r2 + r5 * 2], xm1                    ; store row 1 at pDst + 2*iDstStride
+    RET
+
+cglobal addAvg_8x6, 6,6,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride ; x86inc: 6 args in r0..r5 (in the order named), 6 GPRs, vector regs m0-m5; six rows done as three unrolled two-row passes
+    mova        m4,          [pw_512]                 ; m4 = constant added after the rounding multiply (presumably 512 per word; defined outside this hunk)
+    mova        m5,          [pw_1023]                ; m5 = upper clamp bound (presumably 1023 per word - confirm)
+    mova        m3,          [pw_1024]                ; m3 = pmulhrsw multiplier (presumably 1024 per word, giving a rounded >> 5)
+    pxor        m1,          m1                       ; m1 = 0, the lower clamp bound; kept live for all three passes
+    add         r3d,         r3d                      ; iStride0 *= 2 (presumably elements -> bytes). NOTE(review): 32-bit add zero-extends into r3 on x86-64, so this assumes a non-negative stride that fits in 32 bits - confirm
+    add         r4d,         r4d                      ; iStride1 *= 2, same assumption as above
+    add         r5d,         r5d                      ; iDstStride *= 2, same assumption as above

+    movu        xm0,         [r0]                     ; pass 1: low lane  = 16 bytes at pSrc0 (first row of the pair)
+    vinserti128 m0,          m0, [r0 + r3], 1         ; high lane = 16 bytes one (doubled) stride further, second row of the pair
+    movu        xm2,         [r1]                     ; same two rows from pSrc1 into m2
+    vinserti128 m2,          m2, [r1 + r4], 1

+    paddw       m0,          m2                       ; per-word sum src0 + src1
+    pmulhrsw    m0,          m3                       ; per-word rounded (x * m3) >> 15
+    paddw       m0,          m4                       ; add the constant held in m4
+    pmaxsw      m0,          m1                       ; clamp below at 0
+    pminsw      m0,          m5                       ; clamp above at m5
+    vextracti128 xm2,        m0, 1                    ; xm2 = high lane = second row of the pair
+    movu        [r2],        xm0                      ; store first row of the pair at pDst
+    movu        [r2 + r5],   xm2                      ; store second row one (doubled) dst stride further

+    lea         r2,          [r2 + 2 * r5]            ; advance pDst by two rows
+    lea         r0,          [r0 + 2 * r3]            ; advance pSrc0 by two rows
+    lea         r1,          [r1 + 2 * r4]            ; advance pSrc1 by two rows

+    movu        xm0,         [r0]                     ; pass 2: identical sequence on the next two rows
+    vinserti128 m0,          m0, [r0 + r3], 1
+    movu        xm2,         [r1]
+    vinserti128 m2,          m2, [r1 + r4], 1

+    paddw       m0,          m2                       ; sum, rounded multiply, add constant, clamp to [0, m5] as in pass 1
+    pmulhrsw    m0,          m3
+    paddw       m0,          m4
+    pmaxsw      m0,          m1
+    pminsw      m0,          m5
+    vextracti128 xm2,        m0, 1
+    movu        [r2],        xm0
+    movu        [r2 + r5],   xm2

+    lea         r2,          [r2 + 2 * r5]            ; advance all three pointers by two rows again
+    lea         r0,          [r0 + 2 * r3]
+    lea         r1,          [r1 + 2 * r4]

+    movu        xm0,         [r0]                     ; pass 3: identical sequence on the last two rows
+    vinserti128 m0,          m0, [r0 + r3], 1
+    movu        xm2,         [r1]
+    vinserti128 m2,          m2, [r1 + r4], 1

+    paddw       m0,          m2                       ; sum, rounded multiply, add constant, clamp to [0, m5] as in pass 1
+    pmulhrsw    m0,          m3
+    paddw       m0,          m4
+    pmaxsw      m0,          m1
+    pminsw      m0,          m5
+    vextracti128 xm2,        m0, 1
+    movu        [r2],        xm0
+    movu        [r2 + r5],   xm2                      ; no pointer advance needed after the final pair
+    RET
+
 %macro ADDAVG_W8_H4_AVX2 1
 cglobal addAvg_8x%1, 6,7,6, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
     mova        m4,          [pw_512]


More information about the x265-devel mailing list