[vlc-devel] [PATCH 01/23] contrib: libmpeg2 simplistic NEON acceleration for motion compensation

Rémi Denis-Courmont remi at remlab.net
Sat Oct 3 19:52:08 CEST 2009


---
 extras/contrib/src/Makefile                       |    2 +
 extras/contrib/src/Patches/libmpeg2-mc-neon.patch |  385 +++++++++++++++++++++
 2 files changed, 387 insertions(+), 0 deletions(-)
 create mode 100644 extras/contrib/src/Patches/libmpeg2-mc-neon.patch

diff --git a/extras/contrib/src/Makefile b/extras/contrib/src/Makefile
index 54b8a0f..e208993 100644
--- a/extras/contrib/src/Makefile
+++ b/extras/contrib/src/Makefile
@@ -543,6 +543,8 @@ libmpeg2-$(LIBMPEG2_VERSION).tar.gz:
 
 libmpeg2: libmpeg2-$(LIBMPEG2_VERSION).tar.gz
 	$(EXTRACT_GZ)
+	cd libmpeg2 && patch -p0 < ../Patches/libmpeg2-mc-neon.patch
+	cd libmpeg2 && ./bootstrap
 
 .mpeg2: libmpeg2
 	(cd $<; $(HOSTCC) ./configure $(HOSTCONF) --prefix=$(PREFIX) --without-x --disable-sdl && cd libmpeg2 && make && make install && cd ../include && make && make install)
diff --git a/extras/contrib/src/Patches/libmpeg2-mc-neon.patch b/extras/contrib/src/Patches/libmpeg2-mc-neon.patch
new file mode 100644
index 0000000..5d87d0a
--- /dev/null
+++ b/extras/contrib/src/Patches/libmpeg2-mc-neon.patch
@@ -0,0 +1,385 @@
+Index: include/mpeg2.h
+===================================================================
+--- include/mpeg2.h	(révision 1193)
++++ include/mpeg2.h	(copie de travail)
+@@ -164,6 +164,7 @@
+ #define MPEG2_ACCEL_SPARC_VIS 1
+ #define MPEG2_ACCEL_SPARC_VIS2 2
+ #define MPEG2_ACCEL_ARM 1
++#define MPEG2_ACCEL_ARM_NEON 2
+ #define MPEG2_ACCEL_DETECT 0x80000000
+ 
+ uint32_t mpeg2_accel (uint32_t accel);
+Index: libmpeg2/motion_comp_neon.c
+===================================================================
+--- libmpeg2/motion_comp_neon.c	(révision 0)
++++ libmpeg2/motion_comp_neon.c	(révision 0)
+@@ -0,0 +1,302 @@
++/*
++ * motion_comp_neon.c
++ * Copyright (C) 2009 Rémi Denis-Courmont
++ *
++ * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
++ * See http://libmpeg2.sourceforge.net/ for updates.
++ *
++ * mpeg2dec is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 2 of the License, or
++ * (at your option) any later version.
++ *
++ * mpeg2dec is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License along
++ * with mpeg2dec; if not, write to the Free Software Foundation, Inc.,
++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
++ */
++
++#include "config.h"
++
++#if defined(ARCH_ARM_NEON)
++
++#include <stdint.h>
++#include <string.h>
++
++#include "mpeg2.h"
++#include "attributes.h"
++#include "mpeg2_internal.h"
++
++/* dest = ref: copy 16 bytes per row for `height` rows (height must be >= 1) */
++static void MC_put_o_16_neon (uint8_t * dest, const uint8_t * ref,
++			      const int stride, int height)
++{
++    do {  /* body runs before the --height test */
++	memcpy (dest, ref, 16);
++	ref += stride;  /* step both pointers to the next row */
++	dest += stride;
++    } while (--height);
++}
++/* dest = ref: copy 8 bytes per row for `height` rows (height must be >= 1) */
++static void MC_put_o_8_neon (uint8_t * dest, const uint8_t * ref,
++			     const int stride, int height)
++{
++    do {  /* body runs before the --height test */
++	memcpy (dest, ref, 8);
++	ref += stride;  /* step both pointers to the next row */
++	dest += stride;
++    } while (--height);
++}
++
++/* dest = (src1 + src2 + 1) / 2 per byte, 16 bytes per row; height must be >= 1 */
++static void MC_avg_1_16_neon (uint8_t * dest, const uint8_t * src1,
++			      const uint8_t * src2,
++			      const int stride, unsigned height)
++{
++    do {
++	asm volatile (
++	    "vld1.u8 {q0}, [%[src1]]\n"  /* q0 = 16 bytes at src1 */
++	    "vld1.u8 {q1}, [%[src2]]\n"  /* q1 = 16 bytes at src2 */
++	    "vrhadd.u8 q0, q0, q1\n"  /* q0 = rounding halving add, per byte */
++	    /* XXX: three cycles stall */
++	    "vst1.u8 {q0}, [%[dest]]\n"  /* store the 16 results at dest */
++	    :  /* no output operands; the store is covered by "memory" */
++	    : [dest]"r"(dest), [src1]"r"(src1), [src2]"r"(src2)
++	    : "memory", "q0", "q1");
++	src1 += stride;  /* step all three pointers to the next row */
++	src2 += stride;
++	dest += stride;
++    } while (--height);
++}
++/* dest = (src1 + src2 + 1) / 2 per byte, 8 bytes per row; height must be >= 1 */
++static void MC_avg_1_8_neon (uint8_t * dest, const uint8_t * src1,
++			     const uint8_t * src2,
++			     const int stride, unsigned height)
++{
++    do {
++	asm volatile (
++	    "vld1.u8 {d0}, [%[src1]]\n"  /* d0 = 8 bytes at src1 */
++	    "vld1.u8 {d1}, [%[src2]]\n"  /* d1 = 8 bytes at src2 */
++	    "vrhadd.u8 d0, d0, d1\n"  /* d0 = rounding halving add, per byte */
++	    "vst1.u8 {d0}, [%[dest]]\n"  /* store the 8 results at dest */
++	    :  /* no output operands; the store is covered by "memory" */
++	    : [dest]"r"(dest), [src1]"r"(src1), [src2]"r"(src2)
++	    : "memory", "q0");  /* q0 aliases d0 and d1 */
++	
++	src1 += stride;  /* step all three pointers to the next row */
++	src2 += stride;
++	dest += stride;
++    } while (--height);
++}
++
++/* dest = (dest + ((src1 + src2 + 1) / 2) + 1) / 2 per byte, 16 bytes per row */
++static void MC_avg_2_16_neon (uint8_t * dest, const uint8_t * src1,
++			      const uint8_t * src2,
++			      const int stride, unsigned height)
++{
++    do {  /* body runs before the --height test: height must be >= 1 */
++	asm volatile (
++	    "vld1.u8 {q0}, [%[src1]]\n"  /* q0 = 16 bytes at src1 */
++	    "vld1.u8 {q1}, [%[src2]]\n"  /* q1 = 16 bytes at src2 */
++	    "vrhadd.u8 q0, q0, q1\n"  /* q0 = rounded average of the sources */
++	    "vld1.u8 {q2}, [%[dest]]\n"  /* q2 = current 16 bytes at dest */
++	    /* XXX: one cycle stall */
++	    "vrhadd.u8 q0, q0, q2\n"  /* q0 = rounded average with dest */
++	    /* XXX: three cycles stall */
++	    "vst1.u8 {q0}, [%[dest]]\n"  /* write the result back to dest */
++	    :  /* no output operands; the store is covered by "memory" */
++	    : [dest]"r"(dest), [src1]"r"(src1), [src2]"r"(src2)
++	    : "memory", "q0", "q1", "q2");
++	src1 += stride;  /* step all three pointers to the next row */
++	src2 += stride;
++	dest += stride;
++    } while (--height);
++}
++/* dest = (dest + ((src1 + src2 + 1) / 2) + 1) / 2 per byte, 8 bytes per row */
++static void MC_avg_2_8_neon (uint8_t * dest, const uint8_t * src1,
++			     const uint8_t * src2,
++			     const int stride, unsigned height)
++{
++    do {  /* body runs before the --height test: height must be >= 1 */
++	asm volatile (
++	    "vld1.u8 {d0}, [%[src1]]\n"  /* d0 = 8 bytes at src1 */
++	    "vld1.u8 {d1}, [%[src2]]\n"  /* d1 = 8 bytes at src2 */
++	    "vrhadd.u8 d0, d0, d1\n"  /* d0 = rounded average of the sources */
++	    "vld1.u8 {d2}, [%[dest]]\n"  /* d2 = current 8 bytes at dest */
++	    "vrhadd.u8 d0, d0, d2\n"  /* d0 = rounded average with dest */
++	    "vst1.u8 {d0}, [%[dest]]\n"  /* write the result back to dest */
++	    :  /* no output operands; the store is covered by "memory" */
++	    : [dest]"r"(dest), [src1]"r"(src1), [src2]"r"(src2)
++	    : "memory", "q0", "d2");  /* q0 aliases d0 and d1 */
++	src1 += stride;  /* step all three pointers to the next row */
++	src2 += stride;
++	dest += stride;
++    } while (--height);
++}
++/* dest = (dest + ref + 1) / 2, 16 bytes per row: dest is passed as src1. */
++static void MC_avg_o_16_neon (uint8_t * dest, const uint8_t * ref,
++			      const int stride, int height)
++{
++    MC_avg_1_16_neon (dest, dest, ref, stride, height);
++}
++/* dest = (dest + ref + 1) / 2, 8 bytes per row: dest is passed as src1. */
++static void MC_avg_o_8_neon (uint8_t * dest, const uint8_t * ref,
++			     const int stride, int height)
++{
++    MC_avg_1_8_neon (dest, dest, ref, stride, height);
++}
++/* dest[i] = (ref[i] + ref[i + 1] + 1) / 2 for the 16 bytes of each row. */
++static void MC_put_x_16_neon (uint8_t * dest, const uint8_t * ref,
++			      const int stride, int height)
++{
++    MC_avg_1_16_neon (dest, ref, ref + 1, stride, height);
++}
++/* dest[i] = (ref[i] + ref[i + 1] + 1) / 2 for the 8 bytes of each row. */
++static void MC_put_x_8_neon (uint8_t * dest, const uint8_t * ref,
++			     const int stride, int height)
++{
++    MC_avg_1_8_neon (dest, ref, ref + 1, stride, height);
++}
++/* dest = avg(dest, avg(ref[i], ref[i + 1])), avg(a, b) = (a + b + 1) / 2; 16 wide. */
++static void MC_avg_x_16_neon (uint8_t * dest, const uint8_t * ref,
++			      const int stride, int height)
++{
++    MC_avg_2_16_neon (dest, ref, ref + 1, stride, height);
++}
++/* dest = avg(dest, avg(ref[i], ref[i + 1])), avg(a, b) = (a + b + 1) / 2; 8 wide. */
++static void MC_avg_x_8_neon (uint8_t * dest, const uint8_t * ref,
++			     const int stride, int height)
++{
++    MC_avg_2_8_neon (dest, ref, ref + 1, stride, height);
++}
++/* dest[i] = (ref[i] + ref[i + stride] + 1) / 2: each row averaged with the next. */
++static void MC_put_y_16_neon (uint8_t * dest, const uint8_t * ref,
++			      const int stride, int height)
++{
++    MC_avg_1_16_neon (dest, ref, ref + stride, stride, height);
++}
++static void MC_put_y_8_neon (uint8_t * dest, const uint8_t * ref,
++			     const int stride, int height)
++{  /* dest[i] = (ref[i] + ref[i + stride] + 1) / 2, 8 bytes per row */
++    MC_avg_1_8_neon (dest, ref, ref + stride, stride, height);
++}
++/* dest = avg(dest, avg(ref[i], ref[i + stride])), avg(a, b) = (a + b + 1) / 2; 16 wide. */
++static void MC_avg_y_16_neon (uint8_t * dest, const uint8_t * ref,
++			      const int stride, int height)
++{
++    MC_avg_2_16_neon (dest, ref, ref + stride, stride, height);
++}
++/* dest = avg(dest, avg(ref[i], ref[i + stride])), avg(a, b) = (a + b + 1) / 2; 8 wide. */
++static void MC_avg_y_8_neon (uint8_t * dest, const uint8_t * ref,
++			     const int stride, int height)
++{
++    MC_avg_2_8_neon (dest, ref, ref + stride, stride, height);
++}
++/* dest = avg(avg(ref, ref+1), avg(ref+stride, ref+stride+1)), 16 bytes per row. */
++static void MC_put_xy_16_neon (uint8_t * dest, const uint8_t * ref,
++			       const int stride, int height)
++{
++    do {  /* body runs before the --height test: height must be >= 1 */
++	asm volatile (
++	    "vld1.u8 {q0}, [%[ref]]\n"  /* q0 = current row */
++	    "vld1.u8 {q1}, [%[refx]]\n"  /* q1 = current row, one byte right */
++	    "vrhadd.u8 q0, q0, q1\n"  /* q0 = rounded average of the two */
++	    "vld1.u8 {q2}, [%[refy]]\n"  /* q2 = next row */
++	    "vld1.u8 {q3}, [%[refxy]]\n"  /* q3 = next row, one byte right */
++	    "vrhadd.u8 q2, q2, q3\n"  /* q2 = rounded average of the two */
++	    /* XXX: three cycles stall */
++	    "vrhadd.u8 q0, q0, q2\n"  /* NOTE(review): can be 1 above (a+b+c+d+2)/4; confirm OK */
++	    /* XXX: three cycles stall */
++	    "vst1.u8 {q0}, [%[dest]]\n"  /* store the 16 results at dest */
++	    :  /* no output operands; the store is covered by "memory" */
++	    : [dest]"r"(dest), [ref]"r"(ref), [refx]"r"(ref + 1),
++		       [refy]"r"(ref + stride), [refxy]"r"(ref + stride + 1)
++	    : "memory", "q0", "q1", "q2", "q3");
++	ref += stride;  /* step both pointers to the next row */
++	dest += stride;
++    } while (--height);
++}
++/* dest = avg(avg(ref, ref+1), avg(ref+stride, ref+stride+1)), 8 bytes per row. */
++static void MC_put_xy_8_neon (uint8_t * dest, const uint8_t * ref,
++			      const int stride, int height)
++{
++    do {  /* body runs before the --height test: height must be >= 1 */
++	asm volatile (
++	    "vld1.u8 {d0}, [%[ref]]\n"  /* d0 = current row */
++	    "vld1.u8 {d1}, [%[refx]]\n"  /* d1 = current row, one byte right */
++	    "vrhadd.u8 d0, d0, d1\n"  /* d0 = rounded average of the two */
++	    "vld1.u8 {d2}, [%[refy]]\n"  /* d2 = next row */
++	    "vld1.u8 {d3}, [%[refxy]]\n"  /* d3 = next row, one byte right */
++	    "vrhadd.u8 d2, d2, d3\n"  /* d2 = rounded average of the two */
++	    /* XXX: three cycles stall */
++	    "vrhadd.u8 d0, d0, d2\n"  /* NOTE(review): can be 1 above (a+b+c+d+2)/4; confirm OK */
++	    /* XXX: three cycles stall */
++	    "vst1.u8 {d0}, [%[dest]]\n"  /* store the 8 results at dest */
++	    :  /* no output operands; the store is covered by "memory" */
++	    : [dest]"r"(dest), [ref]"r"(ref), [refx]"r"(ref + 1),
++		       [refy]"r"(ref + stride), [refxy]"r"(ref + stride + 1)
++	    : "memory", "q0", "q1");  /* q0/q1 alias d0-d3 */
++	ref += stride;  /* step both pointers to the next row */
++	dest += stride;
++    } while (--height);
++}
++/* dest = avg(dest, avg(avg(ref, ref+1), avg(ref+stride, ref+stride+1))); 16 wide. */
++static void MC_avg_xy_16_neon (uint8_t * dest, const uint8_t * ref,
++			       const int stride, int height)
++{
++    do {  /* body runs before the --height test: height must be >= 1 */
++ 	asm volatile (
++	    "vld1.u8 {q0}, [%[ref]]\n"  /* q0 = current row */
++	    "vld1.u8 {q1}, [%[refx]]\n"  /* q1 = current row, one byte right */
++	    "vrhadd.u8 q0, q0, q1\n"  /* q0 = rounded average of the two */
++	    "vld1.u8 {q2}, [%[refy]]\n"  /* q2 = next row */
++	    "vld1.u8 {q3}, [%[refxy]]\n"  /* q3 = next row, one byte right */
++	    "vrhadd.u8 q2, q2, q3\n"  /* q2 = rounded average of the two */
++	    "vld1.u8 {q4}, [%[dest]]\n"  /* q4 = current 16 bytes at dest */
++	    /* XXX: one cycle stall */
++	    "vrhadd.u8 q0, q0, q2\n"  /* NOTE(review): can be 1 above (a+b+c+d+2)/4; confirm OK */
++	    /* XXX: three cycles stall */
++	    "vrhadd.u8 q0, q4, q0\n"  /* q0 = rounded average with dest */
++	    "vst1.u8 {q0}, [%[dest]]\n"  /* write the result back to dest */
++	    :  /* no output operands; the store is covered by "memory" */
++	    : [dest]"r"(dest), [ref]"r"(ref), [refx]"r"(ref + 1),
++		       [refy]"r"(ref + stride), [refxy]"r"(ref + stride + 1)
++	    : "memory", "q0", "q1", "q2", "q3", "q4");
++	ref += stride;  /* step both pointers to the next row */
++	dest += stride;
++    } while (--height);
++}
++/* dest = avg(dest, avg(avg(ref, ref+1), avg(ref+stride, ref+stride+1))); 8 wide. */
++static void MC_avg_xy_8_neon (uint8_t * dest, const uint8_t * ref,
++			      const int stride, int height)
++{
++    do {  /* body runs before the --height test: height must be >= 1 */
++	asm volatile (
++	    "vld1.u8 {d0}, [%[ref]]\n"  /* d0 = current row */
++	    "vld1.u8 {d1}, [%[refx]]\n"  /* d1 = current row, one byte right */
++	    "vrhadd.u8 d0, d0, d1\n"  /* d0 = rounded average of the two */
++	    "vld1.u8 {d2}, [%[refy]]\n"  /* d2 = next row */
++	    "vld1.u8 {d3}, [%[refxy]]\n"  /* d3 = next row, one byte right */
++	    "vrhadd.u8 d2, d2, d3\n"  /* d2 = rounded average of the two */
++	    "vld1.u8 {d4}, [%[dest]]\n"  /* d4 = current 8 bytes at dest */
++	    /* XXX: one cycle stall */
++	    "vrhadd.u8 d0, d0, d2\n"  /* NOTE(review): can be 1 above (a+b+c+d+2)/4; confirm OK */
++	    /* XXX: three cycles stall */
++	    "vrhadd.u8 d0, d4, d0\n"  /* d0 = rounded average with dest */
++	    "vst1.u8 {d0}, [%[dest]]\n"  /* write the result back to dest */
++	    :  /* no output operands; the store is covered by "memory" */
++	    : [dest]"r"(dest), [ref]"r"(ref), [refx]"r"(ref + 1),
++		       [refy]"r"(ref + stride), [refxy]"r"(ref + stride + 1)
++	    : "memory", "q0", "q1", "d4");  /* q0/q1 alias d0-d3 */
++	ref += stride;  /* step both pointers to the next row */
++	dest += stride;
++    } while (--height);
++}
++/* Presumably defines mpeg2_mc_neon from the MC_*_neon functions (macro not shown; verify). */
++MPEG2_MC_EXTERN (neon)
++
++#endif /* ARCH_ARM_NEON */
+
+Modification de propriétés sur libmpeg2/motion_comp_neon.c
+___________________________________________________________________
+Ajouté : svn:eol-style
+   + native
+
+Index: libmpeg2/mpeg2_internal.h
+===================================================================
+--- libmpeg2/mpeg2_internal.h	(révision 1193)
++++ libmpeg2/mpeg2_internal.h	(copie de travail)
+@@ -313,5 +313,6 @@
+ extern mpeg2_mc_t mpeg2_mc_alpha;
+ extern mpeg2_mc_t mpeg2_mc_vis;
+ extern mpeg2_mc_t mpeg2_mc_arm;
++extern mpeg2_mc_t mpeg2_mc_neon;
+ 
+ #endif /* LIBMPEG2_MPEG2_INTERNAL_H */
+Index: libmpeg2/motion_comp.c
+===================================================================
+--- libmpeg2/motion_comp.c	(révision 1193)
++++ libmpeg2/motion_comp.c	(copie de travail)
+@@ -58,6 +58,11 @@
+     else
+ #endif
+ #ifdef ARCH_ARM
++#ifdef ARCH_ARM_NEON
++    if (accel & MPEG2_ACCEL_ARM_NEON)
++        mpeg2_mc = mpeg2_mc_neon;
++    else
++#endif
+     if (accel & MPEG2_ACCEL_ARM) {
+ 	mpeg2_mc = mpeg2_mc_arm;
+     } else
+Index: libmpeg2/Makefile.am
+===================================================================
+--- libmpeg2/Makefile.am	(révision 1193)
++++ libmpeg2/Makefile.am	(copie de travail)
+@@ -14,7 +14,7 @@
+ 			  motion_comp_vis.c motion_comp_arm.c \
+ 			  cpu_accel.c cpu_state.c
+ if ARCH_ARM
+-libmpeg2arch_la_SOURCES += motion_comp_arm_s.S
++libmpeg2arch_la_SOURCES += motion_comp_arm_s.S motion_comp_neon.c
+ endif
+ libmpeg2arch_la_CFLAGS = $(OPT_CFLAGS) $(ARCH_OPT_CFLAGS) $(LIBMPEG2_CFLAGS)
+ 
+Index: configure.ac
+===================================================================
+--- configure.ac	(révision 1193)
++++ configure.ac	(copie de travail)
+@@ -103,7 +103,14 @@
+ 	AC_DEFINE([ARCH_ALPHA],,[alpha architecture]);;
+     arm*)
+ 	arm_conditional=:
+-	AC_DEFINE([ARCH_ARM],,[ARM architecture]);;
++	AC_DEFINE([ARCH_ARM],,[ARM architecture])
++        AC_MSG_CHECKING([if inline ARM Advanced SIMD assembly is supported])
++        AC_TRY_COMPILE([],
++           [asm ("vqmovun.s64 d0, q1":::"d0");],
++           [AC_DEFINE([ARCH_ARM_NEON],, [ARM Advanced SIMD assembly])
++            AC_MSG_RESULT(yes)],
++           [AC_MSG_RESULT(no)])
++        ;;
+     esac
+ elif test x"$CC" = x"tendracc"; then
+     dnl TenDRA portability checking compiler
-- 
1.6.4.3




More information about the vlc-devel mailing list