[x265] [PATCH] arm: Implement pixel_ssd_s ARM NEON asm
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Thu Feb 25 09:28:26 CET 2016
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1456136894 -19800
# Mon Feb 22 15:58:14 2016 +0530
# Node ID ed3dd1a26cb5801e306db8f1d4a52cd1f4d6620b
# Parent 4a1b8f3c0c7385ff19fd61133e0af4464510e9aa
arm: Implement pixel_ssd_s ARM NEON asm
diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Thu Feb 25 12:15:51 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Mon Feb 22 15:58:14 2016 +0530
@@ -42,6 +42,12 @@
{
if (cpuMask & X265_CPU_NEON)
{
+ // ssd_s
+ p.cu[BLOCK_4x4].ssd_s = PFX(pixel_ssd_s_4x4_neon);
+ p.cu[BLOCK_8x8].ssd_s = PFX(pixel_ssd_s_8x8_neon);
+ p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16x16_neon);
+ p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32x32_neon);
+
// sse_ss
p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_neon);
p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_neon);
diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/pixel.h
--- a/source/common/arm/pixel.h Thu Feb 25 12:15:51 2016 +0530
+++ b/source/common/arm/pixel.h Mon Feb 22 15:58:14 2016 +0530
@@ -123,6 +123,12 @@
sse_t x265_pixel_sse_ss_32x32_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
sse_t x265_pixel_sse_ss_64x64_neon(const int16_t* pix1, intptr_t stride_pix1, const int16_t* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_ssd_s_4x4_neon(const int16_t* a, intptr_t dstride); // ssd_s NEON kernels: a = int16 block, dstride = row stride; the 4x4..32x32 bodies are added to ssd-a.S by this patch
+sse_t x265_pixel_ssd_s_8x8_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_16x16_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_32x32_neon(const int16_t* a, intptr_t dstride);
+sse_t x265_pixel_ssd_s_64x64_neon(const int16_t* a, intptr_t dstride); // NOTE(review): the hunks shown in this patch neither define nor register a 64x64 variant - confirm this prototype is intended
+
void x265_pixel_sub_ps_4x4_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_sub_ps_8x8_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
void x265_pixel_sub_ps_16x16_neon(int16_t* a, intptr_t dstride, const pixel* b0, const pixel* b1, intptr_t sstride0, intptr_t sstride1);
diff -r 4a1b8f3c0c73 -r ed3dd1a26cb5 source/common/arm/ssd-a.S
--- a/source/common/arm/ssd-a.S Thu Feb 25 12:15:51 2016 +0530
+++ b/source/common/arm/ssd-a.S Mon Feb 22 15:58:14 2016 +0530
@@ -371,4 +371,99 @@
bx lr
endfunc
+function x265_pixel_ssd_s_4x4_neon @ sum of squares over a 4x4 block of int16: r0 = a, r1 = dstride (per the pixel.h prototype); total is left in r0
+ add r1, r1 @ double r1 so the row step is in bytes (2 bytes per int16)
+ vld1.s16 {d4}, [r0], r1 @ row 0: four int16 values, then step r0 to the next row
+ vld1.s16 {d5}, [r0], r1 @ row 1
+ vld1.s16 {d6}, [r0], r1 @ row 2
+ vld1.s16 {d7}, [r0] @ row 3: last row, so no pointer step
+ vmull.s16 q0, d4, d4 @ q0 = row 0 squared, widened to four 32-bit lanes
+ vmull.s16 q1, d5, d5 @ q1 = row 1 squared (second, independent accumulator)
+ vmlal.s16 q0, d6, d6 @ q0 += row 2 squared
+ vmlal.s16 q1, d7, d7 @ q1 += row 3 squared
+ vadd.s32 q0, q1 @ merge the two accumulators
+ vadd.s32 d0, d0, d1 @ fold four lanes down to two
+ vpadd.s32 d0, d0, d0 @ pairwise add: both lanes of d0 now hold the total
+ vmov.32 r0, d0[0] @ move the 32-bit total to r0
+ bx lr
+endfunc
+function x265_pixel_ssd_s_8x8_neon @ sum of squares over an 8x8 block of int16: r0 = a, r1 = dstride (per the pixel.h prototype); total is left in r0
+ add r1, r1 @ double r1 so the row step is in bytes (2 bytes per int16)
+ vld1.s16 {q8}, [r0], r1 @ row 0: eight int16 values, then step r0 to the next row
+ vld1.s16 {q9}, [r0], r1 @ row 1
+ vmull.s16 q0, d16, d16 @ q0 = squares of row 0 low half; vmull initialises the accumulator without a separate clear
+ vmull.s16 q1, d17, d17 @ q1 = squares of row 0 high half
+ vmlal.s16 q0, d18, d18 @ q0 += squares of row 1 low half
+ vmlal.s16 q1, d19, d19 @ q1 += squares of row 1 high half
+.rept 3 @ three more row pairs: rows 2 through 7
+ vld1.s16 {q8}, [r0], r1 @ even row of the pair
+ vld1.s16 {q9}, [r0], r1 @ odd row of the pair
+ vmlal.s16 q0, d16, d16 @ low halves accumulate into q0
+ vmlal.s16 q1, d17, d17 @ high halves accumulate into q1
+ vmlal.s16 q0, d18, d18
+ vmlal.s16 q1, d19, d19
+.endr
+ vadd.s32 q0, q1 @ merge the two accumulators
+ vadd.s32 d0, d0, d1 @ fold four lanes down to two
+ vpadd.s32 d0, d0, d0 @ pairwise add: both lanes of d0 now hold the total
+ vmov.32 r0, d0[0] @ move the 32-bit total to r0
+ bx lr
+endfunc
+
+function x265_pixel_ssd_s_16x16_neon @ sum of squares over a 16x16 block of int16: r0 = a, r1 = dstride (per the pixel.h prototype); total is left in r0
+ add r1, r1 @ double r1 so the row step is in bytes (2 bytes per int16)
+ mov r12, #4 @ loop counter: 4 iterations of 4 rows each = 16 rows
+ veor.u8 q0, q0 @ clear accumulator q0
+ veor.u8 q1, q1 @ clear accumulator q1
+
+.loop_ssd_s_16:
+ subs r12, #1 @ decrement early; the flags set here are consumed by the bne at the bottom of the loop
+.rept 2 @ two row pairs per iteration
+ vld1.s16 {q8-q9}, [r0], r1 @ one full row: 16 int16 = 32 bytes, then step r0 to the next row
+ vld1.s16 {q10-q11}, [r0], r1 @ the following row
+ vmlal.s16 q0, d16, d16 @ squares alternate between q0 and q1 accumulators
+ vmlal.s16 q1, d17, d17
+ vmlal.s16 q0, d18, d18
+ vmlal.s16 q1, d19, d19
+ vmlal.s16 q0, d20, d20
+ vmlal.s16 q1, d21, d21
+ vmlal.s16 q0, d22, d22
+ vmlal.s16 q1, d23, d23
+.endr
+ bne .loop_ssd_s_16 @ repeat until r12 reached zero in the subs above
+ vadd.s32 q0, q1 @ merge the two accumulators
+ vadd.s32 d0, d0, d1 @ fold four lanes down to two
+ vpadd.s32 d0, d0, d0 @ pairwise add: both lanes of d0 now hold the total
+ vmov.32 r0, d0[0] @ move the 32-bit total to r0
+ bx lr
+endfunc
+
+function x265_pixel_ssd_s_32x32_neon @ sum of squares over a 32x32 block of int16: r0 = a, r1 = dstride (per the pixel.h prototype); total is left in r0
+ add r1, r1 @ double r1 so the row step is in bytes (2 bytes per int16)
+ sub r1, #32 @ the first load of each row already advances r0 by 32 bytes, so the second load only adds the remainder
+ mov r12, #8 @ loop counter: 8 iterations of 4 rows each = 32 rows
+ veor.u8 q0, q0 @ clear accumulator q0
+ veor.u8 q1, q1 @ clear accumulator q1
+
+.loop_ssd_s_32:
+ subs r12, #1 @ decrement early; the flags set here are consumed by the bne at the bottom of the loop
+.rept 4 @ four rows per iteration
+ vld1.s16 {q8-q9}, [r0]! @ first half of the row: 16 int16 = 32 bytes, r0 advances by the bytes loaded
+ vld1.s16 {q10-q11}, [r0], r1 @ second half of the row, then r0 += r1 lands on the start of the next row
+ vmlal.s16 q0, d16, d16 @ squares alternate between q0 and q1 accumulators
+ vmlal.s16 q1, d17, d17
+ vmlal.s16 q0, d18, d18
+ vmlal.s16 q1, d19, d19
+ vmlal.s16 q0, d20, d20
+ vmlal.s16 q1, d21, d21
+ vmlal.s16 q0, d22, d22
+ vmlal.s16 q1, d23, d23
+.endr
+ bne .loop_ssd_s_32 @ repeat until r12 reached zero in the subs above
+ vadd.s32 q0, q1 @ merge the two accumulators
+ vadd.s32 d0, d0, d1 @ fold four lanes down to two
+ vpadd.s32 d0, d0, d0 @ pairwise add: both lanes of d0 now hold the total
+ vmov.32 r0, d0[0] @ move the 32-bit total to r0
+ bx lr
+endfunc
More information about the x265-devel
mailing list