[x265] [PATCH 2 of 5] asm: AVX2 asm for intra_ang_32 mode 10, 816c->452c

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Aug 18 06:11:36 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1439557064 -19800
#      Fri Aug 14 18:27:44 2015 +0530
# Node ID 8752daab2f07711c556dfffa9a733b7278484479
# Parent  5ed23f786ea8f98e003189a537f960e4ff16201f
asm: AVX2 asm for intra_ang_32 mode 10, 816c->452c
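
Mode 10 is the pure horizontal angular mode: every row of the 32x32 block is
filled with the left-neighbour sample of that row (the HEVC mode-10 boundary
filter applies only to blocks smaller than 32x32, so no filtering path is
needed here). A minimal C sketch of the behaviour being vectorised below; the
function name and the neighbour layout (left samples starting at
srcPix[2*size + 1]) are inferred from the offsets used in the asm, not taken
from the x265 headers:

    #include <stdint.h>
    #include <stddef.h>

    typedef uint8_t pixel;

    /* Reference sketch of 32x32 intra mode 10 (horizontal) prediction. */
    static void intra_pred_ang32_10_ref(pixel *dst, ptrdiff_t dstStride,
                                        const pixel *srcPix)
    {
        const int N = 32;
        const pixel *left = srcPix + 2 * N + 1;    /* left column, rows 0..31 */

        for (int y = 0; y < N; y++)
            for (int x = 0; x < N; x++)            /* broadcast left[y] across the row */
                dst[y * dstStride + x] = left[y];
    }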

diff -r 5ed23f786ea8 -r 8752daab2f07 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Aug 14 11:28:37 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Aug 14 18:27:44 2015 +0530
@@ -3026,6 +3026,7 @@
         p.cu[BLOCK_32x32].intra_pred[7]  = PFX(intra_pred_ang32_7_avx2);
         p.cu[BLOCK_32x32].intra_pred[8]  = PFX(intra_pred_ang32_8_avx2);
         p.cu[BLOCK_32x32].intra_pred[9]  = PFX(intra_pred_ang32_9_avx2);
+        p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx2);
         p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_34_avx2);
         p.cu[BLOCK_32x32].intra_pred[2] = PFX(intra_pred_ang32_2_avx2);
         p.cu[BLOCK_32x32].intra_pred[26] = PFX(intra_pred_ang32_26_avx2);
diff -r 5ed23f786ea8 -r 8752daab2f07 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Fri Aug 14 11:28:37 2015 +0530
+++ b/source/common/x86/intrapred8.asm	Fri Aug 14 18:27:44 2015 +0530
@@ -462,6 +462,7 @@
 %endrep
 
 SECTION .text
+cextern pb_1
 cextern pw_2
 cextern pw_3
 cextern pw_4
@@ -13500,6 +13501,132 @@
     call ang32_mode_9_27_avx2
     RET
 
+cglobal intra_pred_ang32_10, 5,5,4              ; mode 10 (pure horizontal): r0 = dst, r1 = dstStride, r2 = srcPix
+    pxor            m0, m0                      ; m0 = per-byte row index (0..15 within each half)
+    mova            m1, [pb_1]                  ; all-ones byte vector used to step the row index
+    lea             r4, [r1 * 3]                ; r4 = 3 * dstStride
+
+    vbroadcasti128  m2, [r2 + mmsize*2 + 1]     ; left neighbours for rows 0-15, duplicated into both lanes
+
+    pshufb          m3, m2, m0                  ; replicate this row's left sample across all 32 bytes
+    movu            [r0], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1 * 2], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r4], m3
+
+    lea             r0, [r0 + r1 * 4]
+
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1 * 2], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r4], m3
+
+    lea             r0, [r0 + r1 * 4]
+
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1 * 2], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r4], m3
+
+    lea             r0, [r0 + r1 * 4]
+
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1 * 2], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r4], m3
+
+    lea             r0, [r0 + r1 * 4]
+    pxor            m0, m0                              ; restart the row index for the lower half
+    vbroadcasti128  m2, [r2 + mmsize*2 + mmsize/2 + 1]  ; left neighbours for rows 16-31, duplicated into both lanes
+
+    pshufb          m3, m2, m0
+    movu            [r0], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1 * 2], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r4], m3
+
+    lea             r0, [r0 + r1 * 4]
+
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1 * 2], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r4], m3
+
+    lea             r0, [r0 + r1 * 4]
+
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1 * 2], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r4], m3
+
+    lea             r0, [r0 + r1 * 4]
+
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r1 * 2], m3
+    paddb           m0, m1
+    pshufb          m3, m2, m0
+    movu            [r0 + r4], m3
+    RET
+
 %endif  ; ARCH_X86_64
 ;-----------------------------------------------------------------------------------------
 ; end of intra_pred_ang32 angular modes avx2 asm
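
A note for readers following the asm: vbroadcasti128 places the same 16
left-neighbour bytes in both 128-bit lanes of m2, and pshufb with an
all-equal index register then replicates that byte of each lane across the
whole register, so a single paddb by pb_1 advances the output by one row.
A hedged intrinsics sketch of that inner pattern (names are illustrative,
not the x265 source):

    #include <immintrin.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Same idea as the rows 0-15 half above: duplicate 16 reference bytes
     * into both lanes, then select one of them per row with pshufb. */
    static void horiz_rows_16(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *left16)
    {
        __m256i ref = _mm256_broadcastsi128_si256(
                          _mm_loadu_si128((const __m128i *)left16));
        __m256i idx = _mm256_setzero_si256();      /* pxor  m0, m0 */
        __m256i one = _mm256_set1_epi8(1);         /* [pb_1]       */

        for (int y = 0; y < 16; y++)
        {
            /* pshufb works per 128-bit lane, but both lanes hold the same
             * data, so an all-'y' index yields left16[y] in all 32 bytes. */
            __m256i row = _mm256_shuffle_epi8(ref, idx);
            _mm256_storeu_si256((__m256i *)(dst + y * stride), row);
            idx = _mm256_add_epi8(idx, one);       /* step to the next row */
        }
    }

The routine above follows this pattern twice, once per 16-row half, resetting
the index register and reloading m2 from the next 16 left samples in between.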

