[x265] [PATCH] arm: Implement planecopy_cp NEON
chen
chenm003 at 163.com
Fri Mar 4 17:40:29 CET 2016
this version looks good, thanks
At 2016-03-04 17:29:40,"Ramya Sriraman" <ramya at multicorewareinc.com> wrote:
Thanks for the improvements, Min. Please find the modified patch below.
# HG changeset patch
# User Ramya Sriraman<ramya at multicorewareinc.com>
# Date 1456985538 -19800
# Thu Mar 03 11:42:18 2016 +0530
# Node ID 75a3948f28b6bd8f2b3536cf18e17cc8573be444
# Parent 9cc9920bf82be1b43efd2a3628e28a3a78ab3b2f
arm: Implement planecopy_cp NEON
diff -r 9cc9920bf82b -r 75a3948f28b6 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Wed Mar 02 17:26:11 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Thu Mar 03 11:42:18 2016 +0530
@@ -33,6 +33,7 @@
#include "blockcopy8.h"
#include "pixel.h"
#include "pixel-util.h"
+#include "ipfilter8.h"
}
namespace X265_NS {
@@ -142,6 +143,9 @@
p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon);
p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);
+ // planecopy
+ p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
+
// sad
p.pu[LUMA_8x4].sad = PFX(pixel_sad_8x4_neon);
p.pu[LUMA_8x8].sad = PFX(pixel_sad_8x8_neon);
diff -r 9cc9920bf82b -r 75a3948f28b6 source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S Wed Mar 02 17:26:11 2016 +0530
+++ b/source/common/arm/pixel-util.S Thu Mar 03 11:42:18 2016 +0530
@@ -626,3 +626,55 @@
pop {r4, r5}
bx lr
endfunc
+
+function x265_pixel_planecopy_cp_neon // copy a plane row by row, shifting every byte by the shift argument; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (per the prototype in pixel.h)
+ push {r4, r5, r6, r7} // save the four scratch registers used below (16 bytes of stack)
+ ldr r4, [sp, #4 * 4] // r4 = width: first stack argument, located above the 4 pushed registers
+ ldr r5, [sp, #4 * 4 + 4] // r5 = height
+ ldr r12, [sp, #4 * 4 + 8] // r12 = shift
+ vdup.8 q2, r12 // q2 = shift count replicated into every byte lane
+ sub r5, #1 // r5 = height - 1: the final row is handled separately below
+
+.loop_h: // one iteration per row, for every row except the last
+ mov r6, r0 // r6 = read pointer inside the current source row
+ mov r12, r2 // r12 = write pointer inside the current destination row
+ eor r7, r7 // r7 = 0: number of bytes processed so far in this row
+.loop_w: // 16 bytes per iteration
+ vld1.u8 {q0}, [r6]! // load 16 source bytes, post-incrementing r6
+ vshl.u8 q0, q0, q2 // shift each byte lane by the count held in q2
+ vst1.u8 {q0}, [r12]! // store 16 result bytes, post-incrementing r12
+
+ add r7, #16
+ cmp r7, r4
+ blt .loop_w // repeat while r7 < width; whole 16-byte steps, so the last step can reach past width when width is not a multiple of 16
+
+ add r0, r1 // advance src to the next row
+ add r2, r3 // advance dst to the next row
+
+ subs r5, #1
+ bgt .loop_h // loop while rows remain; NOTE(review): the body has already run once, even when height is 1
+
+// handle last row: 8-byte chunks, then one final 8-byte chunk that ends exactly at width
+ mov r5, r4
+ lsr r5, #3 // r5 = width / 8 = number of whole 8-byte chunks
+
+.loopW8:
+ vld1.u8 d0, [r0]! // load 8 source bytes, post-incrementing r0
+ vshl.u8 d0, d0, d4 // d4 is the low half of q2, so it holds the same per-lane shift count
+ vst1.u8 d0, [r2]! // store 8 result bytes, post-incrementing r2
+ subs r4, r4, #8 // r4 = bytes of the row not yet covered by whole chunks
+ subs r5, #1
+ bgt .loopW8 // flags come from the r5 decrement just above
+
+ mov r5,#8
+ sub r5, r4 // r5 = 8 - leftover byte count
+ sub r0, r5 // step back so the final chunk covers source bytes width-8 .. width-1
+ sub r2, r5 // same adjustment for the destination pointer
+ vld1.u8 d0, [r0] // final chunk; overlaps bytes already processed (the entire previous chunk when width is a multiple of 8)
+ vshl.u8 d0, d0, d4
+ vst1.u8 d0, [r2]
+
+ pop {r4, r5, r6, r7} // restore the registers saved on entry
+ bx lr // return to the caller
+endfunc
+
diff -r 9cc9920bf82b -r 75a3948f28b6 source/common/arm/pixel.h
--- a/source/common/arm/pixel.h Wed Mar 02 17:26:11 2016 +0530
+++ b/source/common/arm/pixel.h Thu Mar 03 11:42:18 2016 +0530
@@ -163,4 +163,6 @@
void x265_pixel_add_ps_16x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_add_ps_32x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_add_ps_64x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
#endif // ifndef X265_I386_PIXEL_ARM_H
Thank you
Regards
Ramya
On Fri, Mar 4, 2016 at 2:18 PM, Ramya Sriraman <ramya at multicorewareinc.com> wrote:
# HG changeset patch
# User Ramya Sriraman<ramya at multicorewareinc.com>
# Date 1456985538 -19800
# Thu Mar 03 11:42:18 2016 +0530
# Node ID 299caedec2f38b9d9b658aace5c74ace36b6b324
# Parent 9cc9920bf82be1b43efd2a3628e28a3a78ab3b2f
arm: Implement planecopy_cp NEON
diff -r 9cc9920bf82b -r 299caedec2f3 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Wed Mar 02 17:26:11 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Thu Mar 03 11:42:18 2016 +0530
@@ -33,6 +33,7 @@
#include "blockcopy8.h"
#include "pixel.h"
#include "pixel-util.h"
+#include "ipfilter8.h"
}
namespace X265_NS {
@@ -142,6 +143,9 @@
p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_neon);
p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_neon);
+ // planecopy
+ p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
+
// sad
p.pu[LUMA_8x4].sad = PFX(pixel_sad_8x4_neon);
p.pu[LUMA_8x8].sad = PFX(pixel_sad_8x8_neon);
diff -r 9cc9920bf82b -r 299caedec2f3 source/common/arm/pixel-util.S
--- a/source/common/arm/pixel-util.S Wed Mar 02 17:26:11 2016 +0530
+++ b/source/common/arm/pixel-util.S Thu Mar 03 11:42:18 2016 +0530
@@ -626,3 +626,57 @@
pop {r4, r5}
bx lr
endfunc
+
+function x265_pixel_planecopy_cp_neon // copy a plane row by row, shifting every byte by the shift argument; r0 = src, r1 = srcStride, r2 = dst, r3 = dstStride (per the prototype in pixel.h)
+ push {r4, r5, r6, r7} // save the four scratch registers used below (16 bytes of stack)
+ ldr r4, [sp, #4 * 4] // r4 = width: first stack argument, located above the 4 pushed registers
+ ldr r5, [sp, #4 * 4 + 4] // r5 = height
+ ldr r12, [sp, #4 * 4 + 8] // r12 = shift
+ vdup.8 q2, r12 // q2 = shift count replicated into every byte lane
+ sub r5, #1 // r5 = height - 1: the final row is handled separately below
+
+.loop_h: // one iteration per row, for every row except the last
+ mov r6, r0 // r6 = read pointer inside the current source row
+ mov r12, r2 // r12 = write pointer inside the current destination row
+ eor r7, r7 // r7 = 0: number of bytes processed so far in this row
+.loop_w: // 16 bytes per iteration
+ vld1.u8 {q0}, [r6] // load 16 source bytes (no writeback; r6 is advanced explicitly below)
+ vshl.u8 q0, q0, q2 // shift each byte lane by the count held in q2
+ vst1.u8 {q0}, [r12] // store 16 result bytes (no writeback; r12 is advanced explicitly below)
+
+ add r12, #16 // advance the destination pointer by one 16-byte step
+ add r6, #16 // advance the source pointer by one 16-byte step
+ add r7, #16
+ cmp r7, r4
+ blt .loop_w // repeat while r7 < width; whole 16-byte steps, so the last step can reach past width when width is not a multiple of 16
+
+ add r0, r1 // advance src to the next row
+ add r2, r3 // advance dst to the next row
+
+ subs r5, #1
+ bgt .loop_h // loop while rows remain; NOTE(review): the body has already run once, even when height is 1
+
+// handle last row: 8-byte chunks, then one final 8-byte chunk that ends exactly at width
+ mov r5, r4
+ lsr r5, #3 // r5 = width / 8 = number of whole 8-byte chunks
+
+.loopW8:
+ vld1.u8 d0, [r0]! // load 8 source bytes, post-incrementing r0
+ vshl.u8 d0, d0, d4 // d4 is the low half of q2, so it holds the same per-lane shift count
+ vst1.u8 d0, [r2]! // store 8 result bytes, post-incrementing r2
+ subs r4, r4, #8 // r4 = bytes of the row not yet covered by whole chunks
+ subs r5, #1
+ bgt .loopW8 // flags come from the r5 decrement just above
+
+ mov r5,#8
+ sub r5, r4 // r5 = 8 - leftover byte count
+ sub r0, r5 // step back so the final chunk covers source bytes width-8 .. width-1
+ sub r2, r5 // same adjustment for the destination pointer
+ vld1.u8 d0, [r0] // final chunk; overlaps bytes already processed (the entire previous chunk when width is a multiple of 8)
+ vshl.u8 d0, d0, d4
+ vst1.u8 d0, [r2]
+
+ pop {r4, r5, r6, r7} // restore the registers saved on entry
+ bx lr // return to the caller
+endfunc
+
diff -r 9cc9920bf82b -r 299caedec2f3 source/common/arm/pixel.h
--- a/source/common/arm/pixel.h Wed Mar 02 17:26:11 2016 +0530
+++ b/source/common/arm/pixel.h Thu Mar 03 11:42:18 2016 +0530
@@ -163,4 +163,6 @@
void x265_pixel_add_ps_16x16_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_add_ps_32x32_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_add_ps_64x64_neon(pixel* a, intptr_t dstride, const pixel* b0, const int16_t* b1, intptr_t sstride0, intptr_t sstride1);
+
+void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
#endif // ifndef X265_I386_PIXEL_ARM_H
Thank you
Regards
Ramya
On Fri, Mar 4, 2016 at 11:42 AM, Ramya Sriraman <ramya at multicorewareinc.com> wrote:
Hi min,
I made the #12 -> #4*3 correction.
R0 is kept constant within the row because, if I advanced it by the number of bytes loaded (by combining the increment with vld1.u8), then at the end of the loop, when I add r1, it would become r0 + number_of_bytes + r1 and not the intended r0 + r1.
Also, this is basically an upShift primitive, so it might be useful for the 8-bit build as well.
I will mail the patch with modification to mailing list based on your response.
Thank you
Regards
Ramya
On Fri, Mar 4, 2016 at 11:41 AM, Min Chen <min.chen at multicorewareinc.com> wrote:
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20160305/7d1e04c3/attachment-0001.html>
More information about the x265-devel
mailing list