[x265] [PATCH] assembly code for intra_pred_planar_16x16 for 10 and 12-bit

Dnyaneshwar Gorade dnyaneshwar at multicorewareinc.com
Tue Dec 10 14:21:44 CET 2013


Please ignore this patch.

---
Dnyaneshwar G


On Tue, Dec 10, 2013 at 6:34 PM, <dnyaneshwar at multicorewareinc.com> wrote:

> # HG changeset patch
> # User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> # Date 1386680027 -19800
> #      Tue Dec 10 18:23:47 2013 +0530
> # Node ID 981a0e6d10fb3df403329664d5e4efdee0578a9c
> # Parent  7af37d60e4437602cde5ab17357812733741ac1d
> assembly code for intra_pred_planar_16x16 for 10 and 12-bit
>
> diff -r 7af37d60e443 -r 981a0e6d10fb source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp      Tue Dec 10 17:10:30 2013 +0530
> +++ b/source/common/x86/asm-primitives.cpp      Tue Dec 10 18:23:47 2013 +0530
> @@ -721,6 +721,7 @@
>      {
>          p.intra_pred[BLOCK_4x4][0] = x265_intra_pred_planar4_sse4;
>          p.intra_pred[BLOCK_8x8][0] = x265_intra_pred_planar8_sse4;
> +        p.intra_pred[BLOCK_16x16][0] = x265_intra_pred_planar16_sse4;
>
>          p.intra_pred[BLOCK_4x4][1] = x265_intra_pred_dc4_sse4;
>          p.intra_pred[BLOCK_8x8][1] = x265_intra_pred_dc8_sse4;
> diff -r 7af37d60e443 -r 981a0e6d10fb source/common/x86/intrapred16.asm
> --- a/source/common/x86/intrapred16.asm Tue Dec 10 17:10:30 2013 +0530
> +++ b/source/common/x86/intrapred16.asm Tue Dec 10 18:23:47 2013 +0530
> @@ -3,6 +3,7 @@
>  ;*
>  ;* Authors: Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
>  ;*          Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
> +;*          Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
>  ;*
>  ;* This program is free software; you can redistribute it and/or modify
>  ;* it under the terms of the GNU General Public License as published by
> @@ -34,6 +35,7 @@
>  %assign x x+1
>  %endrep
>
> +const pw_unpack0wd, times 4 db 0,1,8,8
>
>  SECTION .text
>
> @@ -43,6 +45,7 @@
>  cextern pd_32
>  cextern pw_4096
>  cextern multiL
> +cextern multiH
>  cextern multi_2Row
>
>
> @@ -542,6 +545,188 @@
>      RET
>
>
>
> +;-----------------------------------------------------------------------------------------------------------
> +; void intra_pred_planar(pixel* dst, intptr_t dstStride, pixel* left, pixel* above, int dirMode, int filter)
>
> +;-----------------------------------------------------------------------------------------------------------
> +INIT_XMM sse4
> +%if (BIT_DEPTH == 12)
> +
> +%if (ARCH_X86_64 == 1)
> +cglobal intra_pred_planar16, 4,7,8+3 ; x86-64 build: bottomRow0-3 are kept in registers m7-m10
> +    %define bottomRow0  m7
> +    %define bottomRow1  m8
> +    %define bottomRow2  m9
> +    %define bottomRow3  m10
> +%else
> +cglobal intra_pred_planar16, 4,7,8, 0-3*mmsize ; other builds: bottomRow0-2 live in 3*mmsize of stack, bottomRow3 in m7
> +    %define bottomRow0  [rsp + 0*mmsize]
> +    %define bottomRow1  [rsp + 1*mmsize]
> +    %define bottomRow2  [rsp + 2*mmsize]
> +    %define bottomRow3  m7
> +%endif
> +
> +    add             r2, 2                   ; step past the first 16-bit entry of left (r2)
> +    add             r3, 2                   ; step past the first 16-bit entry of above (r3)
> +    add             r1, r1                  ; double dstStride (presumably pixels -> bytes, 2 bytes per pixel; TODO confirm)
> +
> +    pxor            m0, m0                  ; zero register used by punpckhwd to zero-extend words
> +
> +    ; bottomRow: (bottomLeft - top[x]) as dwords for x = 0..15, added to the top accumulators once per row
> +    movzx           r4d, word [r2 + 16*2]   ; word 16 past the adjusted left pointer
> +    movd            m1, r4d
> +    pshufd          m1, m1, 0               ; m1 = bottomLeft in all four dword lanes
> +    movu            m2, [r3]                ; top[0..7] as words
> +    pmovzxwd        m3, m2                  ; top[0..3] widened to dwords
> +    punpckhwd       m2, m0                  ; top[4..7] widened to dwords
> +    psubd           m4, m1, m3
> +    mova            bottomRow0, m4          ; bottomLeft - top[0..3]
> +    psubd           m4, m1, m2
> +    mova            bottomRow1, m4          ; bottomLeft - top[4..7]
> +    movu            m2, [r3 + 16]           ; top[8..15] as words
> +    pmovzxwd        m3, m2                  ; top[8..11] widened to dwords
> +    punpckhwd       m2, m0                  ; top[12..15] widened to dwords
> +    psubd           m4, m1, m3
> +    mova            bottomRow2, m4          ; bottomLeft - top[8..11]
> +    psubd           m1, m2
> +    mova            bottomRow3, m1          ; bottomLeft - top[12..15] (m1 is free after this)
> +
> +    ; topRow: m0-m3 = top[0..15] * 16 as dwords; they act as running accumulators across rows
> +    pmovzxwd        m0, [r3 + 0*8]
> +    pslld           m0, 4
> +    pmovzxwd        m1, [r3 + 1*8]
> +    pslld           m1, 4
> +    pmovzxwd        m2, [r3 + 2*8]
> +    pslld           m2, 4
> +    pmovzxwd        m3, [r3 + 3*8]
> +    pslld           m3, 4
> +
> +    xor             r6, r6                  ; r6 = row counter y, starts at 0
> +.loopH: ; one iteration writes one row of 16 pixels
> +    movzx           r4d, word [r2 + r6*2]       ; r4d = left[y]
> +    movzx           r5d, word [r3 + 16*2]       ; r5 = topRight (reloaded every row because r5d is overwritten next)
> +    sub             r5d, r4d                    ; r5d = topRight - left[y]
> +    movd            m5, r5d
> +    pshuflw         m5, m5, 0                   ; replicate the difference into the four low words
> +    pmullw          m5, [multiL]                ; scale by the external multiL word table (presumably 1,2,3,4,...; TODO confirm)
> +    pmovsxwd        m5, m5                      ; m5 = rightCol
> +    add             r4d, r4d
> +    lea             r4d, [r4d * 8 + 16]         ; r4d = left[y] * 16 + 16 (the +16 rounds the >> 5 below)
> +    movd            m4, r4d
> +    pshufd          m4, m4, 0                   ; m4 = horPred
> +    paddd           m4, m5                      ; m4 = horizontal term for columns 0-3
> +    pshufd          m6, m5, 0xFF                ; m6 = lane 3 of rightCol in every lane (step between 4-column groups)
> +
> +    ; 0-3
> +    paddd           m0, bottomRow0              ; advance the vertical accumulator by one row
> +    paddd           m5, m0, m4                  ; vertical + horizontal terms
> +    psrad           m5, 5
> +    packusdw        m5, m5                      ; dwords -> unsigned words with saturation
> +    movh            [r0 + 0*8], m5              ; store 4 pixels (8 bytes)
> +
> +    ; 4-7
> +    paddd           m4, m6                      ; horizontal term for columns 4-7
> +    paddd           m1, bottomRow1
> +    paddd           m5, m1, m4
> +    psrad           m5, 5
> +    packusdw        m5, m5
> +    movh            [r0 + 1*8], m5
> +
> +    ; 8-11
> +    paddd           m4, m6                      ; horizontal term for columns 8-11
> +    paddd           m2, bottomRow2
> +    paddd           m5, m2, m4
> +    psrad           m5, 5
> +    packusdw        m5, m5
> +    movh            [r0 + 2*8], m5
> +
> +    ; 12-15
> +    paddd           m4, m6                      ; horizontal term for columns 12-15
> +    paddd           m3, bottomRow3
> +    paddd           m5, m3, m4
> +    psrad           m5, 5
> +    packusdw        m5, m5
> +    movh            [r0 + 3*8], m5
> +
> +    add             r0, r1                      ; advance dst by the doubled stride
> +    inc             r6d
> +    cmp             r6d, 16
> +    jnz            .loopH                       ; repeat until 16 rows are written
> +
> +    RET
> +%else ; BIT-DEPTH == 10
> +INIT_XMM sse4
> +cglobal intra_pred_planar16, 4,6,7 ; word-arithmetic variant: whole rows are computed in 16-bit lanes
> +    add             r2,         2           ; step past the first 16-bit entry of left (r2)
> +    add             r3,         2           ; step past the first 16-bit entry of above (r3)
> +    add             r1,         r1          ; double dstStride (presumably pixels -> bytes; TODO confirm)
> +
> +    movu            m1,         [r3]        ; topRow[0-7]
> +    movu            m2,         [r3 + 16]   ; topRow[8-15]
> +
> +    movd            m3,         [r2 + 32]   ; low word = entry 16 past the adjusted left pointer (bottomLeft)
> +    pshuflw         m3,         m3, 0
> +    pshufd          m3,         m3, 0       ; bottomLeft broadcast to all eight words
> +    movzx           r4d, word   [r3 + 32]   ; topRight (word 16 past the adjusted above pointer)
> +
> +    psubw           m4,         m3, m1      ; v_bottomRow[0] = bottomLeft - topRow[0-7]
> +    psubw           m3,         m2          ; v_bottomRow[1] = bottomLeft - topRow[8-15]
> +
> +    psllw           m1,         4           ; topRow[0-7] * 16, running accumulator across rows
> +    psllw           m2,         4           ; topRow[8-15] * 16, running accumulator across rows
> +
> +%macro PRED_PLANAR_ROW16 1 ; %1 = row index; writes one 16-pixel row, advances r0; clobbers r3, r5, m0, m5, m6
> +    movzx           r5d, word   [r2 + %1 * 2]   ; r5d = left[%1]
> +    add             r5d,        r5d
> +    lea             r5d,        [r5d * 8 + 16]  ; r5d = left * 16 + 16 (the +16 rounds the >> 5 below)
> +    movd            m5,         r5d
> +    pshuflw         m5,         m5, 0
> +    pshufd          m5,         m5, 0       ; horPred
> +
> +    movzx           r5d, word   [r2 + %1 * 2]   ; reload left[%1]; r5d was overwritten by the lea above
> +    mov             r3d,        r4d         ; r3 is scratch here: the top row is already in m1/m2
> +    sub             r3d,        r5d         ; r3d = topRight - left[%1]
> +    movd            m0,         r3d
> +    pshuflw         m0,         m0, 0
> +    pshufd          m0,         m0, 0       ; difference broadcast to all eight words
> +
> +    pmullw          m6,         m0, [multiL]    ; scale by external multiL table (presumably weights for columns 0-7; TODO confirm)
> +    paddw           m6,         m5          ; + horPred
> +    paddw           m1,         m4          ; advance the vertical accumulator for columns 0-7 by one row
> +    paddw           m6,         m1          ; + vertical term
> +    psraw           m6,         5
> +
> +    pmullw          m0,         m0, [multiH]    ; scale by external multiH table (presumably weights for columns 8-15; TODO confirm)
> +    paddw           m5,         m0          ; horPred + scaled difference
> +    paddw           m2,         m3          ; advance the vertical accumulator for columns 8-15 by one row
> +    paddw           m5,         m2          ; + vertical term
> +    psraw           m5,         5
> +
> +    movu            [r0],       m6          ; pixels 0-7 of this row
> +    movu            [r0 + 16],  m5          ; pixels 8-15 of this row
> +    add             r0,         r1          ; advance dst by the doubled stride
> +%endmacro
> +
> +    PRED_PLANAR_ROW16 0
> +    PRED_PLANAR_ROW16 1
> +    PRED_PLANAR_ROW16 2
> +    PRED_PLANAR_ROW16 3
> +    PRED_PLANAR_ROW16 4
> +    PRED_PLANAR_ROW16 5
> +    PRED_PLANAR_ROW16 6
> +    PRED_PLANAR_ROW16 7
> +    PRED_PLANAR_ROW16 8
> +    PRED_PLANAR_ROW16 9
> +    PRED_PLANAR_ROW16 10
> +    PRED_PLANAR_ROW16 11
> +    PRED_PLANAR_ROW16 12
> +    PRED_PLANAR_ROW16 13
> +    PRED_PLANAR_ROW16 14
> +    PRED_PLANAR_ROW16 15
> +%undef PRED_PLANAR_ROW16 ; NOTE(review): %undef targets single-line macros; a %macro may need %unmacro -- confirm
> +
> +    RET
> +%endif
> +
>
>  ;-----------------------------------------------------------------------------
>  ; void intraPredAng(pixel* dst, intptr_t dstStride, pixel *refLeft, pixel *refAbove, int dirMode, int bFilter)
>
>  ;-----------------------------------------------------------------------------
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131210/b0b37a83/attachment-0001.html>


More information about the x265-devel mailing list