[x265] [PATCH] assembly code for intra_pred_planar_16x16 for 10 and 12-bit
Dnyaneshwar Gorade
dnyaneshwar at multicorewareinc.com
Tue Dec 10 14:21:44 CET 2013
Please ignore this patch.
---
Dnyaneshwar G
On Tue, Dec 10, 2013 at 6:34 PM, <dnyaneshwar at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> # Date 1386680027 -19800
> # Tue Dec 10 18:23:47 2013 +0530
> # Node ID 981a0e6d10fb3df403329664d5e4efdee0578a9c
> # Parent 7af37d60e4437602cde5ab17357812733741ac1d
> assembly code for intra_pred_planar_16x16 for 10 and 12-bit
>
> diff -r 7af37d60e443 -r 981a0e6d10fb source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Tue Dec 10 17:10:30 2013 +0530
> +++ b/source/common/x86/asm-primitives.cpp Tue Dec 10 18:23:47 2013 +0530
> @@ -721,6 +721,7 @@
> {
> p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4;
> p.intra_pred[BLOCK_8x8][0] = x265_intra_pred_planar8_sse4;
> + p.intra_pred[BLOCK_16x16][0] = x265_intra_pred_planar16_sse4;
>
> p.intra_pred[BLOCK_4x4][1] = x265_intra_pred_dc4_sse4;
> p.intra_pred[BLOCK_8x8][1] = x265_intra_pred_dc8_sse4;
> diff -r 7af37d60e443 -r 981a0e6d10fb source/common/x86/intrapred16.asm
> --- a/source/common/x86/intrapred16.asm Tue Dec 10 17:10:30 2013 +0530
> +++ b/source/common/x86/intrapred16.asm Tue Dec 10 18:23:47 2013 +0530
> @@ -3,6 +3,7 @@
> ;*
> ;* Authors: Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> ;* Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
> +;* Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
> ;*
> ;* This program is free software; you can redistribute it and/or modify
> ;* it under the terms of the GNU General Public License as published by
> @@ -34,6 +35,7 @@
> %assign x x+1
> %endrep
>
> +const pw_unpack0wd, times 4 db 0,1,8,8 ; NOTE(review): no instruction visible in this patch references pw_unpack0wd - confirm it is needed
>
> SECTION .text
>
> @@ -43,6 +45,7 @@
> cextern pd_32
> cextern pw_4096
> cextern multiL
> +cextern multiH
> cextern multi_2Row
>
>
> @@ -542,6 +545,188 @@
> RET
>
>
>
> +;-----------------------------------------------------------------------------------------------------------
> +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
>
> +;-----------------------------------------------------------------------------------------------------------
> +INIT_XMM sse4 ; intra_pred_planar16: 16x16 planar prediction; one of two bodies is assembled depending on BIT_DEPTH
> +%if (BIT_DEPTH == 12)
> +
> +%if (ARCH_X86_64 == 1)
> +cglobal intra_pred_planar16, 4,7,8+3 ; dword-arithmetic variant; requests 8+3 XMM regs so bottomRow0-3 can stay in m7-m10
> + %define bottomRow0 m7
> + %define bottomRow1 m8
> + %define bottomRow2 m9
> + %define bottomRow3 m10
> +%else
> +cglobal intra_pred_planar16, 4,7,8, 0-3*mmsize ; only m0-m7 here: three bottomRow vectors live at [rsp], the fourth in m7
> + %define bottomRow0 [rsp + 0*mmsize]
> + %define bottomRow1 [rsp + 1*mmsize]
> + %define bottomRow2 [rsp + 2*mmsize]
> + %define bottomRow3 m7
> +%endif
> +
> + add r2, 2 ; r2 = left (per the prototype comment); step past its first 16-bit entry
> + add r3, 2 ; r3 = above; step past its first 16-bit entry
> + add r1, r1 ; r1 *= 2 (presumably converts dstStride from 16-bit pixels to bytes - confirm)
> +
> + pxor m0, m0 ; m0 = 0, interleaved below to zero-extend the high four words
> +
> + ; bottomRow
> + movzx r4d, word [r2 + 16*2] ; r4d = left[16], indexed from the adjusted r2
> + movd m1, r4d
> + pshufd m1, m1, 0 ; m1 = bottomLeft
> + movu m2, [r3] ; above[0..7] as words
> + pmovzxwd m3, m2 ; m3 = above[0..3] as dwords
> + punpckhwd m2, m0 ; m2 = above[4..7] as dwords
> + psubd m4, m1, m3 ; bottomLeft - above[0..3]
> + mova bottomRow0, m4
> + psubd m4, m1, m2 ; bottomLeft - above[4..7]
> + mova bottomRow1, m4
> + movu m2, [r3 + 16] ; above[8..15] as words
> + pmovzxwd m3, m2 ; m3 = above[8..11] as dwords
> + punpckhwd m2, m0 ; m2 = above[12..15] as dwords
> + psubd m4, m1, m3 ; bottomLeft - above[8..11]
> + mova bottomRow2, m4
> + psubd m1, m2 ; bottomLeft - above[12..15]; m1 is consumed here
> + mova bottomRow3, m1
> +
> + ; topRow
> + pmovzxwd m0, [r3 + 0*8] ; m0..m3 = above[0..15] as dwords, four per register
> + pslld m0, 4 ; each scaled by 16
> + pmovzxwd m1, [r3 + 1*8]
> + pslld m1, 4
> + pmovzxwd m2, [r3 + 2*8]
> + pslld m2, 4
> + pmovzxwd m3, [r3 + 3*8]
> + pslld m3, 4
> +
> + xor r6, r6 ; r6 = row counter, runs 0..15
> +.loopH:
> + movzx r4d, word [r2 + r6*2] ; r4d = left[row]
> + movzx r5d, word [r3 + 16*2] ; r5 = topRight (reloaded every row: the sub below overwrites r5d)
> + sub r5d, r4d ; r5d = topRight - left[row]
> + movd m5, r5d
> + pshuflw m5, m5, 0 ; copy the difference into the low four words
> + pmullw m5, [multiL] ; per-column weights; multiL is defined elsewhere (presumably words 1,2,3,... - confirm)
> + pmovsxwd m5, m5 ; m5 = rightCol
> + add r4d, r4d ; r4d = 2 * left[row]
> + lea r4d, [r4d * 8 + 16] ; r4d = 16 * left[row] + 16
> + movd m4, r4d
> + pshufd m4, m4, 0 ; m4 = horPred
> + paddd m4, m5 ; horizontal term for columns 0-3
> + pshufd m6, m5, 0xFF ; m6 = dword 3 of rightCol in every lane (originally noted as [4 4 4 4])
> +
> + ; 0-3
> + paddd m0, bottomRow0 ; m0 is a running sum: gains bottomRow0 once per row
> + paddd m5, m0, m4 ; vertical + horizontal terms
> + psrad m5, 5
> + packusdw m5, m5 ; dwords -> words with unsigned saturation
> + movh [r0 + 0*8], m5 ; store 4 pixels (8 bytes)
> +
> + ; 4-7
> + paddd m4, m6 ; advance the horizontal term by m6 for the next four columns
> + paddd m1, bottomRow1
> + paddd m5, m1, m4
> + psrad m5, 5
> + packusdw m5, m5
> + movh [r0 + 1*8], m5
> +
> + ; 8-11
> + paddd m4, m6
> + paddd m2, bottomRow2
> + paddd m5, m2, m4
> + psrad m5, 5
> + packusdw m5, m5
> + movh [r0 + 2*8], m5
> +
> + ; 12-15
> + paddd m4, m6
> + paddd m3, bottomRow3
> + paddd m5, m3, m4
> + psrad m5, 5
> + packusdw m5, m5
> + movh [r0 + 3*8], m5
> +
> + add r0, r1 ; advance dst by one row
> + inc r6d
> + cmp r6d, 16
> + jnz .loopH ; loop until 16 rows are written
> +
> + RET
> +%else ; BIT_DEPTH != 12: 16-bit word arithmetic variant (written for 10-bit)
> +INIT_XMM sse4
> +cglobal intra_pred_planar16, 4,6,7
> + add r2, 2 ; r2 = left (per the prototype comment); step past its first 16-bit entry
> + add r3, 2 ; r3 = above; step past its first 16-bit entry
> + add r1, r1 ; r1 *= 2 (presumably converts dstStride from 16-bit pixels to bytes - confirm)
> +
> + movu m1, [r3] ; topRow[0-7]
> + movu m2, [r3 + 16] ; topRow[8-15]
> +
> + movd m3, [r2 + 32] ; 4-byte load; only the low word (left[16]) survives the broadcast below
> + pshuflw m3, m3, 0
> + pshufd m3, m3, 0 ; m3 = left[16] in all eight words
> + movzx r4d, word [r3 + 32] ; topRight = above[16]
> +
> + psubw m4, m3, m1 ; v_bottomRow[0]
> + psubw m3, m2 ; v_bottomRow[1]
> +
> + psllw m1, 4 ; topRow[0-7] * 16
> + psllw m2, 4 ; topRow[8-15] * 16
> +
> +%macro PRED_PLANAR_ROW16 1
> + movzx r5d, word [r2 + %1 * 2] ; r5d = left[row]; the macro argument is the row index
> + add r5d, r5d ; r5d = 2 * left[row]
> + lea r5d, [r5d * 8 + 16] ; r5d = 16 * left[row] + 16
> + movd m5, r5d
> + pshuflw m5, m5, 0
> + pshufd m5, m5, 0 ; horPred
> +
> + movzx r5d, word [r2 + %1 * 2] ; reload left[row]; r5d was overwritten above
> + mov r3d, r4d ; r3 reused as scratch: nothing after this point reads memory through r3
> + sub r3d, r5d ; r3d = topRight - left[row]
> + movd m0, r3d
> + pshuflw m0, m0, 0
> + pshufd m0, m0, 0 ; difference in all eight words
> +
> + pmullw m6, m0, [multiL] ; weights for columns 0-7; multiL is defined elsewhere
> + paddw m6, m5
> + paddw m1, m4 ; m1 is a running sum: gains v_bottomRow[0] once per row
> + paddw m6, m1
> + psraw m6, 5
> +
> + pmullw m0, m0, [multiH] ; weights for columns 8-15; multiH is defined elsewhere
> + paddw m5, m0
> + paddw m2, m3 ; m2 is a running sum: gains v_bottomRow[1] once per row
> + paddw m5, m2
> + psraw m5, 5
> +
> + movu [r0], m6 ; pixels 0-7
> + movu [r0 + 16], m5 ; pixels 8-15
> + add r0, r1 ; advance dst by one row
> +%endmacro
> +
> + PRED_PLANAR_ROW16 0 ; sixteen rows, fully unrolled
> + PRED_PLANAR_ROW16 1
> + PRED_PLANAR_ROW16 2
> + PRED_PLANAR_ROW16 3
> + PRED_PLANAR_ROW16 4
> + PRED_PLANAR_ROW16 5
> + PRED_PLANAR_ROW16 6
> + PRED_PLANAR_ROW16 7
> + PRED_PLANAR_ROW16 8
> + PRED_PLANAR_ROW16 9
> + PRED_PLANAR_ROW16 10
> + PRED_PLANAR_ROW16 11
> + PRED_PLANAR_ROW16 12
> + PRED_PLANAR_ROW16 13
> + PRED_PLANAR_ROW16 14
> + PRED_PLANAR_ROW16 15
> +%undef PRED_PLANAR_ROW16 ; NOTE(review): NASM documents %unmacro for multi-line macros and %undef for single-line ones - confirm this removes the macro
> +
> + RET
> +%endif
> +
>
> ;-----------------------------------------------------------------------------
> ; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
>
> ;-----------------------------------------------------------------------------
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131210/b0b37a83/attachment-0001.html>
More information about the x265-devel
mailing list