[x265] [PATCH] assembly code for intra_pred_planar_16x16 for 10 and 12-bit
dnyaneshwar at multicorewareinc.com
Tue Dec 10 14:04:41 CET 2013
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1386680027 -19800
# Tue Dec 10 18:23:47 2013 +0530
# Node ID 981a0e6d10fb3df403329664d5e4efdee0578a9c
# Parent 7af37d60e4437602cde5ab17357812733741ac1d
assembly code for intra_pred_planar_16x16 for 10 and 12-bit
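
For reference, a scalar form of the planar prediction that this patch vectorizes is sketched below. This is only an illustrative sketch of the standard HEVC planar formula, not the exact x265 C primitive: the function name planar16_ref and the pixel typedef are invented for the example, and the buffer layout (corner pixel already skipped, left[16] = bottomLeft, above[16] = topRight) is inferred from the offsets used in the assembly.

#include <cstdint>
#include <cstddef>

typedef uint16_t pixel;   /* 10/12-bit samples held in 16-bit words */

/* Hypothetical scalar reference for 16x16 planar intra prediction. */
static void planar16_ref(pixel* dst, intptr_t dstStride,
                         const pixel* left, const pixel* above)
{
    const int size = 16;
    const int topRight   = above[size];   /* above[16] */
    const int bottomLeft = left[size];    /* left[16]  */

    for (int y = 0; y < size; y++)
        for (int x = 0; x < size; x++)
        {
            int hor = (size - 1 - x) * left[y]  + (x + 1) * topRight;
            int ver = (size - 1 - y) * above[x] + (y + 1) * bottomLeft;
            dst[y * dstStride + x] = (pixel)((hor + ver + size) >> 5);
        }
}

With 10-bit input the per-pixel sum before the >> 5 stays within 16*1023 + 16*1023 + 16 = 32752, so it fits in signed 16-bit words and the 10-bit path below can work entirely on packed words (psllw/psraw). With 12-bit input the sum can reach roughly 131056, which is presumably why the 12-bit path widens to dwords (psrad/packusdw) instead.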
diff -r 7af37d60e443 -r 981a0e6d10fb source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 10 17:10:30 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 10 18:23:47 2013 +0530
@@ -721,6 +721,7 @@
{
p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4;
p.intra_pred[BLOCK_8x8][0] = x265_intra_pred_planar8_sse4;
+ p.intra_pred[BLOCK_16x16][0] = x265_intra_pred_planar16_sse4;
p.intra_pred[BLOCK_4x4][1] = x265_intra_pred_dc4_sse4;
p.intra_pred[BLOCK_8x8][1] = x265_intra_pred_dc8_sse4;
diff -r 7af37d60e443 -r 981a0e6d10fb source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Tue Dec 10 17:10:30 2013 +0530
+++ b/source/common/x86/intrapred16.asm Tue Dec 10 18:23:47 2013 +0530
@@ -3,6 +3,7 @@
;*
;* Authors: Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
;* Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
+;* Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -34,6 +35,7 @@
%assign x x+1
%endrep
+const pw_unpack0wd, times 4 db 0,1,8,8
SECTION .text
@@ -43,6 +45,7 @@
cextern pd_32
cextern pw_4096
cextern multiL
+cextern multiH
cextern multi_2Row
@@ -542,6 +545,188 @@
RET
+;-----------------------------------------------------------------------------------------------------------
+; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
+;-----------------------------------------------------------------------------------------------------------
+INIT_XMM sse4
+%if (BIT_DEPTH == 12)
+
+%if (ARCH_X86_64 == 1)
+cglobal intra_pred_planar16, 4,7,8+3
+ %define bottomRow0 m7
+ %define bottomRow1 m8
+ %define bottomRow2 m9
+ %define bottomRow3 m10
+%else
+cglobal intra_pred_planar16, 4,7,8, 0-3*mmsize
+ %define bottomRow0 [rsp + 0*mmsize]
+ %define bottomRow1 [rsp + 1*mmsize]
+ %define bottomRow2 [rsp + 2*mmsize]
+ %define bottomRow3 m7
+%endif
+
+ add r2, 2
+ add r3, 2
+ add r1, r1
+
+ pxor m0, m0
+
+ ; bottomRow
+ movzx r4d, word [r2 + 16*2]
+ movd m1, r4d
+ pshufd m1, m1, 0 ; m1 = bottomLeft
+ movu m2, [r3]
+ pmovzxwd m3, m2
+ punpckhwd m2, m0
+ psubd m4, m1, m3
+ mova bottomRow0, m4
+ psubd m4, m1, m2
+ mova bottomRow1, m4
+ movu m2, [r3 + 16]
+ pmovzxwd m3, m2
+ punpckhwd m2, m0
+ psubd m4, m1, m3
+ mova bottomRow2, m4
+ psubd m1, m2
+ mova bottomRow3, m1
+
+ ; topRow
+ pmovzxwd m0, [r3 + 0*8]
+ pslld m0, 4
+ pmovzxwd m1, [r3 + 1*8]
+ pslld m1, 4
+ pmovzxwd m2, [r3 + 2*8]
+ pslld m2, 4
+ pmovzxwd m3, [r3 + 3*8]
+ pslld m3, 4
+
+ xor r6, r6
+.loopH:
+ movzx r4d, word [r2 + r6*2]
+ movzx r5d, word [r3 + 16*2] ; r5 = topRight
+ sub r5d, r4d
+ movd m5, r5d
+ pshuflw m5, m5, 0
+ pmullw m5, [multiL]
+ pmovsxwd m5, m5 ; m5 = rightCol
+ add r4d, r4d
+ lea r4d, [r4d * 8 + 16]
+ movd m4, r4d
+ pshufd m4, m4, 0 ; m4 = horPred
+ paddd m4, m5
+ pshufd m6, m5, 0xFF ; m6 = 4 * (topRight - left[y]), step between 4-pixel groups
+
+ ; 0-3
+ paddd m0, bottomRow0
+ paddd m5, m0, m4
+ psrad m5, 5
+ packusdw m5, m5
+ movh [r0 + 0*8], m5
+
+ ; 4-7
+ paddd m4, m6
+ paddd m1, bottomRow1
+ paddd m5, m1, m4
+ psrad m5, 5
+ packusdw m5, m5
+ movh [r0 + 1*8], m5
+
+ ; 8-11
+ paddd m4, m6
+ paddd m2, bottomRow2
+ paddd m5, m2, m4
+ psrad m5, 5
+ packusdw m5, m5
+ movh [r0 + 2*8], m5
+
+ ; 12-15
+ paddd m4, m6
+ paddd m3, bottomRow3
+ paddd m5, m3, m4
+ psrad m5, 5
+ packusdw m5, m5
+ movh [r0 + 3*8], m5
+
+ add r0, r1
+ inc r6d
+ cmp r6d, 16
+ jnz .loopH
+
+ RET
+%else ; BIT_DEPTH == 10
+INIT_XMM sse4
+cglobal intra_pred_planar16, 4,6,7
+ add r2, 2
+ add r3, 2
+ add r1, r1
+
+ movu m1, [r3] ; topRow[0-7]
+ movu m2, [r3 + 16] ; topRow[8-15]
+
+ movd m3, [r2 + 32]
+ pshuflw m3, m3, 0
+ pshufd m3, m3, 0 ; m3 = bottomLeft = left[16]
+ movzx r4d, word [r3 + 32] ; topRight = above[16]
+
+ psubw m4, m3, m1 ; v_bottomRow[0]
+ psubw m3, m2 ; v_bottomRow[1]
+
+ psllw m1, 4
+ psllw m2, 4
+
+%macro PRED_PLANAR_ROW16 1
+ movzx r5d, word [r2 + %1 * 2]
+ add r5d, r5d
+ lea r5d, [r5d * 8 + 16]
+ movd m5, r5d
+ pshuflw m5, m5, 0
+ pshufd m5, m5, 0 ; horPred
+
+ movzx r5d, word [r2 + %1 * 2]
+ mov r3d, r4d
+ sub r3d, r5d
+ movd m0, r3d
+ pshuflw m0, m0, 0
+ pshufd m0, m0, 0
+
+ pmullw m6, m0, [multiL]
+ paddw m6, m5
+ paddw m1, m4
+ paddw m6, m1
+ psraw m6, 5
+
+ pmullw m0, m0, [multiH]
+ paddw m5, m0
+ paddw m2, m3
+ paddw m5, m2
+ psraw m5, 5
+
+ movu [r0], m6
+ movu [r0 + 16], m5
+ add r0, r1
+%endmacro
+
+ PRED_PLANAR_ROW16 0
+ PRED_PLANAR_ROW16 1
+ PRED_PLANAR_ROW16 2
+ PRED_PLANAR_ROW16 3
+ PRED_PLANAR_ROW16 4
+ PRED_PLANAR_ROW16 5
+ PRED_PLANAR_ROW16 6
+ PRED_PLANAR_ROW16 7
+ PRED_PLANAR_ROW16 8
+ PRED_PLANAR_ROW16 9
+ PRED_PLANAR_ROW16 10
+ PRED_PLANAR_ROW16 11
+ PRED_PLANAR_ROW16 12
+ PRED_PLANAR_ROW16 13
+ PRED_PLANAR_ROW16 14
+ PRED_PLANAR_ROW16 15
+%undef PRED_PLANAR_ROW16
+
+ RET
+%endif
+
;-----------------------------------------------------------------------------
; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
;-----------------------------------------------------------------------------