[x265] [PATCH] arm: Implement planecopy_cp NEON

Ramya Sriraman ramya at multicorewareinc.com
Fri Mar 4 10:29:40 CET 2016


Thanks for the improvements, Min. Please find the modified patch below.

# HG changeset patch
# User Ramya Sriraman<ramya at multicorewareinc.com>
# Date 1456985538 -19800
#      Thu Mar 03 11:42:18 2016 +0530
# Node ID 75a3948f28b6bd8f2b3536cf18e17cc8573be444
# Parent  9cc9920bf82be1b43efd2a3628e28a3a78ab3b2f
arm: Implement planecopy_cp NEON

diff -r 9cc9920bf82b -r 75a3948f28b6 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp    Wed Mar 02 17:26:11 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp    Thu Mar 03 11:42:18 2016 +0530
@@ -33,6 +33,7 @@
 #include "blockcopy8.h"
 #include "pixel.h"
 #include "pixel-util.h"
+#include "ipfilter8.h"
 }

 namespace X265_NS {
@@ -142,6 +143,9 @@
         p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon);
         p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);

+        // planecopy
+        p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
+
         // sad
         p.pu[LUMA_8x4].sad    = PFX(pixel_sad_8x4_neon);
         p.pu[LUMA_8x8].sad    = PFX(pixel_sad_8x8_neon);
diff -r 9cc9920bf82b -r 75a3948f28b6 source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S    Wed Mar 02 17:26:11 2016 +0530
+++ b/source/common/arm/pixel-util.S    Thu Mar 03 11:42:18 2016 +0530
@@ -626,3 +626,55 @@
     pop             {r4, r5}
     bx              lr
 endfunc
+
+function x265_pixel_planecopy_cp_neon       // byte-wise shifted plane copy; pixel.h prototype: (src, srcStride, dst, dstStride, width, height, shift)
+    push            {r4, r5, r6, r7}        // 16 bytes pushed, so the stack arguments now start at sp + 16
+    ldr             r4, [sp, #4 * 4]        // r4 = first stack argument (width in the pixel.h prototype)
+    ldr             r5, [sp, #4 * 4 + 4]    // r5 = second stack argument (height)
+    ldr             r12, [sp, #4 * 4 + 8]   // r12 = third stack argument (shift)
+    vdup.8          q2, r12                 // replicate the shift count into all 16 byte lanes of q2
+    sub             r5, #1                  // r5 = height - 1: the last row is handled separately below
+
+.loop_h:                                     // row loop; r0/r2 presumably hold src/dst and r1/r3 the strides (AAPCS register args, confirm)
+    mov             r6, r0                  // r6 = read cursor for the current row
+    mov             r12, r2                 // r12 = write cursor (the shift count is already held in q2)
+    eor             r7, r7                  // r7 = 0: bytes processed so far in this row
+.loop_w:
+    vld1.u8         {q0}, [r6]!             // load 16 bytes and post-increment the read cursor
+    vshl.u8         q0, q0, q2              // shift every byte lane by the count held in q2
+    vst1.u8         {q0}, [r12]!            // store 16 bytes and post-increment the write cursor
+
+    add             r7, #16
+    cmp             r7, r4
+    blt             .loop_w                 // repeat while r7 < r4; whole 16-byte steps can touch up to 15 bytes beyond width
+
+    add             r0, r1                  // advance the row base pointers by r1 and r3
+    add             r2, r3
+
+    subs             r5, #1
+    bgt             .loop_h                 // NOTE(review): bottom-tested, so the body runs once even when height - 1 is 0; confirm callers pass height >= 2
+
+// handle last row
+    mov             r5, r4
+    lsr             r5, #3                  // r5 = width / 8 = number of full 8-byte chunks in the last row
+
+.loopW8:
+    vld1.u8         d0, [r0]!               // 8 bytes per step here, presumably so the final row is not overrun (confirm)
+    vshl.u8         d0, d0, d4              // d4 is the low half of q2, i.e. the same shift counts
+    vst1.u8         d0, [r2]!
+    subs            r4, r4, #8              // r4 = bytes of the row not yet covered
+    subs            r5, #1
+    bgt             .loopW8                 // bottom-tested as well: one chunk is processed even if width / 8 is 0
+
+    mov             r5,#8
+    sub             r5, r4                  // r5 = 8 - leftover byte count
+    sub             r0, r5                  // step both pointers back so the final 8-byte block ends exactly at width (for width >= 8)
+    sub             r2, r5
+    vld1.u8         d0, [r0]                // overlapping tail block: recomputes some bytes already written above
+    vshl.u8         d0, d0, d4
+    vst1.u8         d0, [r2]
+
+    pop             {r4, r5, r6, r7}        // restore the registers saved on entry
+    bx              lr
+endfunc
+
diff -r 9cc9920bf82b -r 75a3948f28b6 source/common/arm/pixel.h
--- a/source/common/arm/pixel.h    Wed Mar 02 17:26:11 2016 +0530
+++ b/source/common/arm/pixel.h    Thu Mar 03 11:42:18 2016 +0530
@@ -163,4 +163,6 @@
 void x265_pixel_add_ps_16x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
 void x265_pixel_add_ps_32x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
 void x265_pixel_add_ps_64x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 #endif // ifndef X265_I386_PIXEL_ARM_H



Thank you
Regards
Ramya

On Fri, Mar 4, 2016 at 2:18 PM, Ramya Sriraman <ramya at multicorewareinc.com>
wrote:

> # HG changeset patch
> # User Ramya Sriraman<ramya at multicorewareinc.com>
> # Date 1456985538 -19800
> #      Thu Mar 03 11:42:18 2016 +0530
> # Node ID 299caedec2f38b9d9b658aace5c74ace36b6b324
> # Parent  9cc9920bf82be1b43efd2a3628e28a3a78ab3b2f
> arm: Implement planecopy_cp NEON
>
> diff -r 9cc9920bf82b -r 299caedec2f3 source/common/arm/asm-primitives.cpp
> --- a/source/common/arm/asm-primitives.cpp    Wed Mar 02 17:26:11 2016
> +0530
> +++ b/source/common/arm/asm-primitives.cpp    Thu Mar 03 11:42:18 2016
> +0530
> @@ -33,6 +33,7 @@
>  #include "blockcopy8.h"
>  #include "pixel.h"
>  #include "pixel-util.h"
> +#include "ipfilter8.h"
>  }
>
>  namespace X265_NS {
> @@ -142,6 +143,9 @@
>          p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon);
>          p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);
>
> +        // planecopy
> +        p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
> +
>          // sad
>          p.pu[LUMA_8x4].sad    = PFX(pixel_sad_8x4_neon);
>          p.pu[LUMA_8x8].sad    = PFX(pixel_sad_8x8_neon);
> diff -r 9cc9920bf82b -r 299caedec2f3 source/common/arm/pixel-util.S
> --- a/source/common/arm/pixel-util.S    Wed Mar 02 17:26:11 2016 +0530
> +++ b/source/common/arm/pixel-util.S    Thu Mar 03 11:42:18 2016 +0530
> @@ -626,3 +626,57 @@
>      pop             {r4, r5}
>      bx              lr
>  endfunc
> +
> +function x265_pixel_planecopy_cp_neon
> +    push            {r4, r5, r6, r7}
> +    ldr             r4, [sp, #4 * 4]
> +    ldr             r5, [sp, #4 * 4 + 4]
> +    ldr             r12, [sp, #4 * 4 + 8]
> +    vdup.8          q2, r12
> +    sub             r5, #1
> +
> +.loop_h:
> +    mov             r6, r0
> +    mov             r12, r2
> +    eor             r7, r7
> +.loop_w:
> +    vld1.u8         {q0}, [r6]
> +    vshl.u8         q0, q0, q2
> +    vst1.u8         {q0}, [r12]
> +
> +    add             r12, #16
> +    add             r6, #16
> +    add             r7, #16
> +    cmp             r7, r4
> +    blt             .loop_w
> +
>
> +    add             r0, r1
> +    add             r2, r3
> +
> +    subs             r5, #1
> +    bgt             .loop_h
> +
> +// handle last row
> +    mov             r5, r4
> +    lsr             r5, #3
> +
> +.loopW8:
> +    vld1.u8         d0, [r0]!
> +    vshl.u8         d0, d0, d4
> +    vst1.u8         d0, [r2]!
> +    subs            r4, r4, #8
> +    subs            r5, #1
> +    bgt             .loopW8
> +
> +    mov             r5,#8
> +    sub             r5, r4
> +    sub             r0, r5
> +    sub             r2, r5
> +    vld1.u8         d0, [r0]
> +    vshl.u8         d0, d0, d4
> +    vst1.u8         d0, [r2]
> +
> +    pop             {r4, r5, r6, r7}
> +    bx              lr
> +endfunc
> +
> diff -r 9cc9920bf82b -r 299caedec2f3 source/common/arm/pixel.h
> --- a/source/common/arm/pixel.h    Wed Mar 02 17:26:11 2016 +0530
> +++ b/source/common/arm/pixel.h    Thu Mar 03 11:42:18 2016 +0530
> @@ -163,4 +163,6 @@
>  void x265_pixel_add_ps_16x16_neon(pixel* a, intptr_t dstride, const
> pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
>  void x265_pixel_add_ps_32x32_neon(pixel* a, intptr_t dstride, const
> pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
>  void x265_pixel_add_ps_64x64_neon(pixel* a, intptr_t dstride, const
> pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
> +
> +void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride,
> pixel* dst, intptr_t dstStride, int width, int height, int shift);
>  #endif // ifndef X265_I386_PIXEL_ARM_H
>
>
>
> Thank you
> Regards
> Ramya
>
> On Fri, Mar 4, 2016 at 11:42 AM, Ramya Sriraman <
> ramya at multicorewareinc.com> wrote:
>
>> Hi Min,
>> I made the #12 -> #4*3 correction.
>> R0 is kept constant because if I keep adding the number of bytes loaded
>> (by folding the increment into vld1.u8), then at the end of the loop, when
>> I add r1, it will be r0+number_of_bytes+r1 and not the intended r0+r1.
>> Also, this is basically an upShift primitive, so it might be useful for
>> the 8-bit build as well.
>> I will mail the modified patch to the mailing list based on your
>> response.
>>
>>
>>
>> Thank you
>> Regards
>> Ramya
>>
>> On Fri, Mar 4, 2016 at 11:41 AM, Min Chen <min.chen at multicorewareinc.com>
>> wrote:
>>
>>>
>>>
>>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160304/f38628cd/attachment.html>


More information about the x265-devel mailing list