[x265] [PATCH] asm: 10bpp avx2 code for intra_pred_ang32x32 mode 11 & 25

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Jun 12 06:22:35 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1434000876 -19800
#      Thu Jun 11 11:04:36 2015 +0530
# Node ID 091940c84ef56d1373d8aa476e8d3f1618436567
# Parent  4d2da861ec98105cfa4bf118235678b6491a1c93
asm: 10bpp avx2 code for intra_pred_ang32x32 mode 11 & 25

performance improvement over SSE:
intra_ang_32x32[11]    8256c->4236c, 48%
intra_ang_32x32[25]    5646c->2755c, 51%

diff -r 4d2da861ec98 -r 091940c84ef5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp	Thu Jun 11 11:04:36 2015 +0530
@@ -1263,6 +1263,8 @@
         p.cu[BLOCK_32x32].intra_pred[8]     = x265_intra_pred_ang32_8_avx2;
         p.cu[BLOCK_32x32].intra_pred[9]     = x265_intra_pred_ang32_9_avx2;
         p.cu[BLOCK_32x32].intra_pred[10]    = x265_intra_pred_ang32_10_avx2;
+        p.cu[BLOCK_32x32].intra_pred[11]    = x265_intra_pred_ang32_11_avx2;
+        p.cu[BLOCK_32x32].intra_pred[25]    = x265_intra_pred_ang32_25_avx2;
         p.cu[BLOCK_32x32].intra_pred[26]    = x265_intra_pred_ang32_26_avx2;
         p.cu[BLOCK_32x32].intra_pred[27]    = x265_intra_pred_ang32_27_avx2;
         p.cu[BLOCK_32x32].intra_pred[28]    = x265_intra_pred_ang32_28_avx2;
diff -r 4d2da861ec98 -r 091940c84ef5 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/x86/intrapred.h	Thu Jun 11 11:04:36 2015 +0530
@@ -284,6 +284,7 @@
 void x265_intra_pred_ang32_8_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_9_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_10_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
+void x265_intra_pred_ang32_11_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_26_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_27_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
 void x265_intra_pred_ang32_28_avx2(pixel* dst, intptr_t dstStride, const pixel* srcPix, int dirMode, int bFilter);
diff -r 4d2da861ec98 -r 091940c84ef5 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/x86/intrapred16.asm	Thu Jun 11 11:04:36 2015 +0530
@@ -14643,6 +14643,80 @@
     movu        [r0 + r2],          m0
     movu        [r0 + r2 + 32],     m1
     RET
+
+cglobal intra_pred_ang32_11, 3,8,12, 0-8  ; 3 args, 8 GPRs, 12 vector regs, stack-size argument 0-8 (x86inc cglobal; sign semantics defined there -- confirm)
+    movzx       r5d,        word [r2 + 128]  ; [0] r5 = word currently stored at r2 + 128
+    movzx       r6d,        word [r2]  ; r6 = word at r2 + 0
+    mov         [rsp],      r5w  ; keep the old [r2 + 128] word in the stack slot at rsp
+    mov         [r2 + 128], r6w  ; temporarily overwrite [r2 + 128] with the word from [r2]
+
+    movzx       r5d,        word [r2 + 126]  ; [16] r5 = word currently stored at r2 + 126
+    movzx       r6d,        word [r2 + 32]  ; r6 = word at r2 + 32
+    mov         [rsp + 4],  r5w  ; keep the old [r2 + 126] word in the stack slot at rsp + 4
+    mov         [r2 + 126], r6w  ; temporarily overwrite [r2 + 126] with the word from [r2 + 32]
+
+    add         r2,         128  ; r2 = entry r2 + 128, i.e. the first patched word
+    xor         r6d,        r6d  ; r6 = 0 (intra_pred_ang32_25 passes 1; presumably the helper's mode flag -- confirm)
+    lea         r3,         [ang_table_avx2 + 16 * 32]  ; r3 = ang_table_avx2 + 512 bytes (table layout is not visible here)
+    add         r1d,        r1d  ; r1 *= 2 (presumably stride in pixels -> bytes for 16-bit samples)
+    lea         r4,         [r1 * 3]  ; r4 = 3 * r1
+    lea         r7,         [r0 + 8 * r1]  ; r7 = r0 + 8 * r1
+
+    call        ang16_mode_11_25  ; 1st of four calls; helper is defined outside this chunk
+
+    sub         r2,         2  ; step the source pointer back by one word
+    lea         r0,         [r0 + 32]  ; advance r0 by 32 bytes (16 words)
+
+    call        ang16_mode_11_25  ; 2nd call
+
+    add         r2,         34  ; move the source pointer forward by 34 bytes (17 words)
+    lea         r0,         [r7 + 8 * r1]  ; r0 = r7 + 8 * r1 (entry r0 + 16 * r1 if the helper preserved r7 -- confirm)
+
+    call        ang16_mode_11_25  ; 3rd call
+
+    sub         r2,         2  ; step the source pointer back by one word
+    lea         r0,         [r0 + 32]  ; advance r0 by 32 bytes (16 words)
+
+    call        ang16_mode_11_25  ; 4th call
+
+    mov         r6d,        [rsp]  ; reload the word saved from entry r2 + 128
+    mov         [r2 - 30], r6w  ; restore it: net r2 offset is +158 here (if the helper leaves r2 alone), so r2 - 30 = entry + 128
+    mov         r6d,       [rsp + 4]  ; reload the word saved from entry r2 + 126
+    mov         [r2 - 32], r6w  ; restore it: r2 - 32 = entry + 126 under the same assumption
+    RET
+
+cglobal intra_pred_ang32_25, 3,7,12, 0-4  ; 3 args, 7 GPRs, 12 vector regs, stack-size argument 0-4 (x86inc cglobal; sign semantics defined there -- confirm)
+    xor         r6d,        r6d  ; r6 = 0 ...
+    inc         r6d  ; ... then r6 = 1 (intra_pred_ang32_11 passes 0; presumably the helper's mode flag -- confirm)
+    lea         r3,         [ang_table_avx2 + 16 * 32]  ; r3 = ang_table_avx2 + 512 bytes (table layout is not visible here)
+    add         r1d,        r1d  ; r1 *= 2 (presumably stride in pixels -> bytes for 16-bit samples)
+
+    movzx       r4d,        word [r2 - 2]  ; r4 = word currently stored just below r2 (r4 is only a temporary here)
+    movzx       r5d,        word [r2 + 160]     ; [16] r5 = word at r2 + 160 (r5 is only a temporary here)
+    mov         [rsp],      r4w  ; keep the old [r2 - 2] word in the stack slot at rsp
+    mov         [r2 - 2],   r5w  ; temporarily overwrite [r2 - 2] with the word from [r2 + 160]
+
+    lea         r4,         [r1 * 3]  ; r4 = 3 * r1 (replaces the temporary loaded above)
+    lea         r5,         [r0 + 32]  ; r5 = r0 + 32 bytes, kept for the third call
+
+    call        ang16_mode_11_25  ; 1st of four calls; helper is defined outside this chunk
+
+    sub         r2,         2  ; step the source pointer back by one word; r0 is not reset here
+
+    call        ang16_mode_11_25  ; 2nd call
+
+    add         r2,         34  ; move the source pointer forward by 34 bytes (17 words)
+    mov         r0,         r5  ; r0 = entry r0 + 32 bytes (if the helper preserved r5 -- confirm)
+
+    call        ang16_mode_11_25  ; 3rd call
+
+    sub         r2,         2  ; step the source pointer back by one word; r0 is not reset here
+
+    call        ang16_mode_11_25  ; 4th call
+
+    mov         r5d,        [rsp]  ; reload the word saved from entry r2 - 2
+    mov         [r2 - 32],  r5w  ; restore it: net r2 offset is +30 here (if the helper leaves r2 alone), so r2 - 32 = entry - 2
+    RET
 ;-------------------------------------------------------------------------------------------------------
 ; end of avx2 code for intra_pred_ang32 mode 2 to 34
 ;-------------------------------------------------------------------------------------------------------


More information about the x265-devel mailing list