[x265] [PATCH 2 of 7] asm: AVX2 asm for intra_ang_32 mode 10, 816c->452c

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Wed Aug 26 12:24:31 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1440480212 -19800
#      Tue Aug 25 10:53:32 2015 +0530
# Node ID 0409b136c208cb944fb76bfd400e76ba43e330a8
# Parent  38a0e6b5f22302fb076913077d464b902c9cf63e
asm: AVX2 asm for intra_ang_32 mode 10, 816c->452c

diff -r 38a0e6b5f223 -r 0409b136c208 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Aug 24 18:25:53 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Aug 25 10:53:32 2015 +0530
@@ -2998,6 +2998,7 @@
         p.cu[BLOCK_32x32].intra_pred[7]  = PFX(intra_pred_ang32_7_avx2);
         p.cu[BLOCK_32x32].intra_pred[8]  = PFX(intra_pred_ang32_8_avx2);
         p.cu[BLOCK_32x32].intra_pred[9]  = PFX(intra_pred_ang32_9_avx2);
+        p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2);
         p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
         p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
         p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r 38a0e6b5f223 -r 0409b136c208 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Mon Aug 24 18:25:53 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Tue Aug 25 10:53:32 2015 +0530
@@ -531,6 +531,7 @@
 %endrep
 
 SECTION .text
+cextern pb_1
 cextern pw_2
 cextern pw_3
 cextern pw_4
@@ -13893,6 +13894,132 @@
     movu                [r0 + r4], m1
     RET
 
+cglobal intra_pred_ang32_10, 5,5,4  ; x86inc: 5 args, 5 GPRs, 4 SIMD regs (m0-m3); every stored row is one source byte replicated across the register
+    pxor                m0, m0  ; m0 = pshufb index vector, starts at 0 in every byte
+    mova                m1, [pb_1]  ; m1 = per-row index increment; pb_1 presumably holds 0x01 in every byte of a full register (defined elsewhere) - confirm
+    lea                 r4, [r1 * 3]  ; r4 = 3 * r1, offset of the 4th row in each group; overwrites the 5th argument register
+
+    vbroadcasti128      m2, [r2 + mmsize*2 + 1]  ; m2 = 16 source bytes copied into both 128-bit lanes; NOTE(review): presumably r0 = dst, r1 = stride, r2 = reference pixels - confirm against the C prototype
+
+    pshufb              m3, m2, m0  ; pshufb selects per 128-bit lane, so a uniform index replicates source byte [index] across all of m3
+    movu                [r0], m3  ; row 0
+    paddb               m0, m1  ; advance index to the next source byte
+    pshufb              m3, m2, m0
+    movu                [r0 + r1], m3  ; row 1
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1 * 2], m3  ; row 2
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r4], m3  ; row 3
+
+    lea                 r0, [r0 + r1 * 4]  ; move the output pointer down 4 rows
+
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0], m3  ; row 4
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1], m3  ; row 5
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1 * 2], m3  ; row 6
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r4], m3  ; row 7
+
+    lea                 r0, [r0 + r1 * 4]  ; move the output pointer down 4 rows
+
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0], m3  ; row 8
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1], m3  ; row 9
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1 * 2], m3  ; row 10
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r4], m3  ; row 11
+
+    lea                 r0, [r0 + r1 * 4]  ; move the output pointer down 4 rows
+
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0], m3  ; row 12
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1], m3  ; row 13
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1 * 2], m3  ; row 14
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r4], m3  ; row 15 (index 15 = last byte of the first 16-byte load)
+
+    lea                 r0, [r0 + r1 * 4]  ; move the output pointer down 4 rows
+    pxor                m0, m0  ; restart the index at 0 for the freshly loaded source bytes
+    vbroadcasti128      m2, [r2 + mmsize*2 + mmsize/2 + 1]  ; next 16 source bytes; mmsize/2 matches the 16 bytes consumed above only if mmsize == 32 (INIT_YMM not visible in this hunk - confirm)
+
+    pshufb              m3, m2, m0
+    movu                [r0], m3  ; row 16
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1], m3  ; row 17
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1 * 2], m3  ; row 18
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r4], m3  ; row 19
+
+    lea                 r0, [r0 + r1 * 4]  ; move the output pointer down 4 rows
+
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0], m3  ; row 20
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1], m3  ; row 21
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1 * 2], m3  ; row 22
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r4], m3  ; row 23
+
+    lea                 r0, [r0 + r1 * 4]  ; move the output pointer down 4 rows
+
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0], m3  ; row 24
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1], m3  ; row 25
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1 * 2], m3  ; row 26
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r4], m3  ; row 27
+
+    lea                 r0, [r0 + r1 * 4]  ; move the output pointer down 4 rows
+
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0], m3  ; row 28
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1], m3  ; row 29
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r1 * 2], m3  ; row 30
+    paddb               m0, m1
+    pshufb              m3, m2, m0
+    movu                [r0 + r4], m3  ; row 31 (last of the 32 rows written)
+    RET
+
 %endif  ; ARCH_X86_64
 ;-----------------------------------------------------------------------------------------
 ; end of intra_pred_ang32 angular modes avx2 asm


More information about the x265-devel mailing list