[x265] [PATCH] arm: Implement pixel_sse_pp ARM NEON asm
dnyaneshwar at multicorewareinc.com
Thu Feb 18 12:32:52 CET 2016
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1455794242 -19800
# Thu Feb 18 16:47:22 2016 +0530
# Node ID 5e4593ef30cc4bccc5eec2a0109b8dff397e5c93
# Parent b31fa1a4ef43697e163d17dda0f4650de45d6ff9
arm: Implement pixel_sse_pp ARM NEON asm
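
pixel_sse_pp computes the sum of squared differences between a source block
and a reference block, with both strides given in pixels; the NEON kernels
added here return the same value as the existing C primitive. A minimal
scalar sketch of what they compute, assuming an 8-bit pixel type and
templated block dimensions (the names below are illustrative, not the
actual C primitive):

    #include <stdint.h>
    #include <stddef.h>

    typedef uint8_t  pixel;    // 8-bit build; HIGH_BIT_DEPTH builds use a 16-bit pixel
    typedef uint32_t sse_t;    // enough headroom for a 64x64 block at 8-bit depth

    // Scalar reference for sse_pp: sum of squared pixel differences,
    // strides in pixels.
    template<int lx, int ly>
    sse_t sse_pp_ref(const pixel* pix1, intptr_t stride_pix1,
                     const pixel* pix2, intptr_t stride_pix2)
    {
        sse_t sum = 0;
        for (int y = 0; y < ly; y++)
        {
            for (int x = 0; x < lx; x++)
            {
                int d = pix1[x] - pix2[x];
                sum += d * d;
            }
            pix1 += stride_pix1;
            pix2 += stride_pix2;
        }
        return sum;
    }
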
diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Thu Feb 18 16:37:01 2016 +0530
+++ b/source/common/CMakeLists.txt Thu Feb 18 16:47:22 2016 +0530
@@ -89,7 +89,7 @@
set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
# add ARM assembly/intrinsic files here
- set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S)
+ set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S)
set(VEC_PRIMITIVES)
set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/arm/asm-primitives.cpp
--- a/source/common/arm/asm-primitives.cpp Thu Feb 18 16:37:01 2016 +0530
+++ b/source/common/arm/asm-primitives.cpp Thu Feb 18 16:47:22 2016 +0530
@@ -42,6 +42,13 @@
{
if (cpuMask & X265_CPU_NEON)
{
+ // sse_pp
+ p.cu[BLOCK_4x4].sse_pp = PFX(pixel_sse_pp_4x4_neon);
+ p.cu[BLOCK_8x8].sse_pp = PFX(pixel_sse_pp_8x8_neon);
+ p.cu[BLOCK_16x16].sse_pp = PFX(pixel_sse_pp_16x16_neon);
+ p.cu[BLOCK_32x32].sse_pp = PFX(pixel_sse_pp_32x32_neon);
+ p.cu[BLOCK_64x64].sse_pp = PFX(pixel_sse_pp_64x64_neon);
+
// pixel_var
p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_neon);
p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/arm/pixel.h
--- a/source/common/arm/pixel.h Thu Feb 18 16:37:01 2016 +0530
+++ b/source/common/arm/pixel.h Thu Feb 18 16:47:22 2016 +0530
@@ -111,4 +111,10 @@
void x265_sad_x4_64x48_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
void x265_sad_x4_64x64_neon(const pixel* fenc, const pixel* fref0, const pixel* fref1, const pixel* fref2, const pixel* fref3, intptr_t frefstride, int32_t* res);
+sse_t x265_pixel_sse_pp_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+sse_t x265_pixel_sse_pp_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+
#endif // ifndef X265_I386_PIXEL_ARM_H
diff -r b31fa1a4ef43 -r 5e4593ef30cc source/common/arm/ssd-a.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/arm/ssd-a.S Thu Feb 18 16:47:22 2016 +0530
@@ -0,0 +1,196 @@
+/*****************************************************************************
+ * Copyright (C) 2016 x265 project
+ *
+ * Authors: Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+
+.align 4
+
+
+.text
+
+
+function x265_pixel_sse_pp_4x4_neon
+ vld1.32 {d16[]}, [r0], r1
+ vld1.32 {d17[]}, [r2], r3
+ vsubl.u8 q2, d16, d17
+ vld1.32 {d16[]}, [r0], r1
+ vmull.s16 q0, d4, d4
+ vld1.32 {d17[]}, [r2], r3
+
+ vsubl.u8 q2, d16, d17
+ vld1.32 {d16[]}, [r0], r1
+ vmlal.s16 q0, d4, d4
+ vld1.32 {d17[]}, [r2], r3
+
+ vsubl.u8 q2, d16, d17
+ vld1.32 {d16[]}, [r0], r1
+ vmlal.s16 q0, d4, d4
+ vld1.32 {d17[]}, [r2], r3
+
+ vsubl.u8 q2, d16, d17
+ vmlal.s16 q0, d4, d4
+ vadd.s32 d0, d0, d1
+ vpadd.s32 d0, d0, d0
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_pixel_sse_pp_8x8_neon
+ vld1.64 {d16}, [r0], r1
+ vld1.64 {d17}, [r2], r3
+ vsubl.u8 q2, d16, d17
+ vld1.64 {d16}, [r0], r1
+ vmull.s16 q0, d4, d4
+ vmlal.s16 q0, d5, d5
+ vld1.64 {d17}, [r2], r3
+
+.rept 6
+ vsubl.u8 q2, d16, d17
+ vld1.64 {d16}, [r0], r1
+ vmlal.s16 q0, d4, d4
+ vmlal.s16 q0, d5, d5
+ vld1.64 {d17}, [r2], r3
+.endr
+ vsubl.u8 q2, d16, d17
+ vmlal.s16 q0, d4, d4
+ vmlal.s16 q0, d5, d5
+ vadd.s32 d0, d0, d1
+ vpadd.s32 d0, d0, d0
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_pixel_sse_pp_16x16_neon
+ vld1.64 {d16-d17}, [r0], r1
+ vld1.64 {d18-d19}, [r2], r3
+ vsubl.u8 q2, d16, d18
+ vsubl.u8 q3, d17, d19
+ vld1.64 {d16-d17}, [r0], r1
+ vmull.s16 q0, d4, d4
+ vmlal.s16 q0, d5, d5
+ vld1.64 {d18-d19}, [r2], r3
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q0, d7, d7
+
+.rept 14
+ vsubl.u8 q2, d16, d18
+ vsubl.u8 q3, d17, d19
+ vld1.64 {d16-d17}, [r0], r1
+ vmlal.s16 q0, d4, d4
+ vmlal.s16 q0, d5, d5
+ vld1.64 {d18-d19}, [r2], r3
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q0, d7, d7
+.endr
+ vsubl.u8 q2, d16, d18
+ vsubl.u8 q3, d17, d19
+ vmlal.s16 q0, d4, d4
+ vmlal.s16 q0, d5, d5
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q0, d7, d7
+ vadd.s32 d0, d0, d1
+ vpadd.s32 d0, d0, d0
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_pixel_sse_pp_32x32_neon
+ mov r12, #8
+ veor.u8 q0, q0
+ veor.u8 q1, q1
+
+.loop_sse_pp_32:
+ subs r12, #1
+.rept 4
+ vld1.64 {q8-q9}, [r0], r1
+ vld1.64 {q10-q11}, [r2], r3
+ vsubl.u8 q2, d16, d20
+ vsubl.u8 q3, d17, d21
+ vsubl.u8 q12, d18, d22
+ vsubl.u8 q13, d19, d23
+ vmlal.s16 q0, d4, d4
+ vmlal.s16 q1, d5, d5
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q1, d7, d7
+ vmlal.s16 q0, d24, d24
+ vmlal.s16 q1, d25, d25
+ vmlal.s16 q0, d26, d26
+ vmlal.s16 q1, d27, d27
+.endr
+ bne .loop_sse_pp_32
+ vadd.s32 q0, q1
+ vadd.s32 d0, d0, d1
+ vpadd.s32 d0, d0, d0
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
+
+function x265_pixel_sse_pp_64x64_neon
+ sub r1, #32
+ sub r3, #32
+ mov r12, #16
+ veor.u8 q0, q0
+ veor.u8 q1, q1
+
+.loop_sse_pp_64:
+ subs r12, #1
+.rept 4
+ vld1.64 {q8-q9}, [r0]!
+ vld1.64 {q10-q11}, [r2]!
+ vsubl.u8 q2, d16, d20
+ vsubl.u8 q3, d17, d21
+ vsubl.u8 q12, d18, d22
+ vsubl.u8 q13, d19, d23
+ vmlal.s16 q0, d4, d4
+ vmlal.s16 q1, d5, d5
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q1, d7, d7
+ vmlal.s16 q0, d24, d24
+ vmlal.s16 q1, d25, d25
+ vmlal.s16 q0, d26, d26
+ vmlal.s16 q1, d27, d27
+
+ vld1.64 {q8-q9}, [r0], r1
+ vld1.64 {q10-q11}, [r2], r3
+ vsubl.u8 q2, d16, d20
+ vsubl.u8 q3, d17, d21
+ vsubl.u8 q12, d18, d22
+ vsubl.u8 q13, d19, d23
+ vmlal.s16 q0, d4, d4
+ vmlal.s16 q1, d5, d5
+ vmlal.s16 q0, d6, d6
+ vmlal.s16 q1, d7, d7
+ vmlal.s16 q0, d24, d24
+ vmlal.s16 q1, d25, d25
+ vmlal.s16 q0, d26, d26
+ vmlal.s16 q1, d27, d27
+.endr
+ bne .loop_sse_pp_64
+ vadd.s32 q0, q1
+ vadd.s32 d0, d0, d1
+ vpadd.s32 d0, d0, d0
+ vmov.32 r0, d0[0]
+ bx lr
+endfunc
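
A note on the scheme shared by these kernels, plus a hedged usage sketch:
each row of both blocks is loaded, vsubl.u8 widens the byte differences to
signed 16-bit, and vmlal.s16 accumulates their squares into 32-bit lanes
(q0, plus q1 in the 32x32/64x64 loops); the closing vadd.s32/vpadd.s32 pair
folds the lanes into the scalar returned in r0. The worst case, 64x64, sums
4096 squared differences of at most 255^2 = 65025, under 2^28 in total, so
the 32-bit accumulators cannot overflow at 8-bit depth. Once
setupAssemblyPrimitives() installs the pointers above, callers reach these
kernels through the primitives table rather than the PFX names; a
hypothetical call site (buffer and stride names are placeholders, strides
are in pixels):

    // SSE distortion of one 16x16 block via the primitives table
    sse_t dist = primitives.cu[BLOCK_16x16].sse_pp(fenc, fencStride, recon, reconStride);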