[vlc-devel] [PATCH-fixed] Added assembly (SSE2, SSE4.1) processing functions to the adjust filter
Martin Briza
Gamajun at seznam.cz
Thu Aug 25 00:11:59 CEST 2011
From: Martin Briza <xbriza00 at stud.fit.vutbr.cz>
---
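Notes: both the SSE2 and the SSE4.1 paths vectorize the same 8.8 fixed-point
per-pixel math as the existing C routines. Per U/V pair, the clip variant
computes the equivalent of the sketch below (a hypothetical helper for
illustration, mirroring the PLANAR_WRITE_UV_CLIP macro; clip_uint8() stands
in for VLC's clip_uint8_vlc()):

    static inline void adjust_uv_clip( uint8_t *pu, uint8_t *pv,
                                       int i_sin, int i_cos, int i_sat,
                                       int i_x, int i_y )
    {
        int u = *pu, v = *pv;
        /* rotate in the U/V plane (hue), recenter, scale (saturation),
         * then bias back to the unsigned midpoint 128 */
        *pu = clip_uint8( ((((u * i_cos + v * i_sin - i_x) >> 8)
                             * i_sat) >> 8) + 128 );
        *pv = clip_uint8( ((((v * i_cos - u * i_sin - i_y) >> 8)
                             * i_sat) >> 8) + 128 );
    }

The non-clip variants are used when i_sat <= 256, where the result stays
within [0,255], so the final clip can be skipped.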
modules/video_filter/adjust.c | 35 ++-
modules/video_filter/adjust_sat_hue.c | 926 +++++++++++++++++++++++++++++++++
modules/video_filter/adjust_sat_hue.h | 34 ++
3 files changed, 991 insertions(+), 4 deletions(-)
diff --git a/modules/video_filter/adjust.c b/modules/video_filter/adjust.c
index c2f6649..a741d24 100644
--- a/modules/video_filter/adjust.c
+++ b/modules/video_filter/adjust.c
@@ -166,15 +166,42 @@ static int Create( vlc_object_t *p_this )
CASE_PLANAR_YUV
/* Planar YUV */
p_filter->pf_video_filter = FilterPlanar;
- p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_C;
- p_sys->pf_process_sat_hue = planar_sat_hue_C;
+#ifdef CAN_COMPILE_SSE4_1
+    if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
+    {
+        p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_SSE41;
+        p_sys->pf_process_sat_hue = planar_sat_hue_SSE2;
+    }
+    else
+#endif
+#ifdef CAN_COMPILE_SSE2
+    if (vlc_CPU() & CPU_CAPABILITY_SSE2)
+    {
+        p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_C;
+        p_sys->pf_process_sat_hue = planar_sat_hue_SSE2;
+    }
+    else
+#endif
+    {
+        p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_C;
+        p_sys->pf_process_sat_hue = planar_sat_hue_C;
+    }
break;
CASE_PACKED_YUV_422
/* Packed YUV 4:2:2 */
p_filter->pf_video_filter = FilterPacked;
- p_sys->pf_process_sat_hue_clip = packed_sat_hue_clip_C;
- p_sys->pf_process_sat_hue = packed_sat_hue_C;
+#ifdef CAN_COMPILE_SSE4_1
+    if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
+    {
+        p_sys->pf_process_sat_hue_clip = packed_sat_hue_clip_SSE41;
+        p_sys->pf_process_sat_hue = packed_sat_hue_SSE41;
+    }
+    else
+#endif
+    {
+        p_sys->pf_process_sat_hue_clip = packed_sat_hue_clip_C;
+        p_sys->pf_process_sat_hue = packed_sat_hue_C;
+    }
break;
default:
diff --git a/modules/video_filter/adjust_sat_hue.c b/modules/video_filter/adjust_sat_hue.c
index cbc6f13..75c783f 100644
--- a/modules/video_filter/adjust_sat_hue.c
+++ b/modules/video_filter/adjust_sat_hue.c
@@ -66,10 +66,936 @@
#define ADJUST_4_TIMES(x) x; x; x; x
#define ADJUST_8_TIMES(x) x; x; x; x; x; x; x; x
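+
+/* On Win64, xmm6-xmm15 are callee-saved, so the routines below spill and
+ * restore the entire xmm register file around their inline assembly. */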
+#ifdef _WIN64
+#define STORE_XMM_REGISTERS \
+ static uint64_t xmm_temporary_storage[32]; \
+ __asm__ volatile( \
+ "movdqa %%xmm0, (%[x])\n" \
+ "movdqa %%xmm1, 16(%[x])\n" \
+ "movdqa %%xmm2, 32(%[x])\n" \
+ "movdqa %%xmm3, 48(%[x])\n" \
+ "movdqa %%xmm4, 64(%[x])\n" \
+ "movdqa %%xmm5, 80(%[x])\n" \
+ "movdqa %%xmm6, 96(%[x])\n" \
+ "movdqa %%xmm7, 112(%[x])\n" \
+ "movdqa %%xmm8, 128(%[x])\n" \
+ "movdqa %%xmm9, 144(%[x])\n" \
+ "movdqa %%xmm10, 160(%[x])\n" \
+ "movdqa %%xmm11, 176(%[x])\n" \
+ "movdqa %%xmm12, 192(%[x])\n" \
+ "movdqa %%xmm13, 208(%[x])\n" \
+ "movdqa %%xmm14, 224(%[x])\n" \
+ "movdqa %%xmm15, 240(%[x])\n" \
+ : \
+ : [x] "r" (xmm_temporary_storage) \
+ : "memory" \
+ )
+#else
+#define STORE_XMM_REGISTERS
+#endif
+
+#ifdef _WIN64
+#define RESTORE_XMM_REGISTERS \
+ __asm__ volatile( \
+ "movdqa (%[x]), %%xmm0\n" \
+ "movdqa 16(%[x]), %%xmm1\n" \
+ "movdqa 32(%[x]), %%xmm2\n" \
+ "movdqa 48(%[x]), %%xmm3\n" \
+ "movdqa 64(%[x]), %%xmm4\n" \
+ "movdqa 80(%[x]), %%xmm5\n" \
+ "movdqa 96(%[x]), %%xmm6\n" \
+ "movdqa 112(%[x]), %%xmm7\n" \
+ "movdqa 128(%[x]), %%xmm8\n" \
+ "movdqa 144(%[x]), %%xmm9\n" \
+ "movdqa 160(%[x]), %%xmm10\n" \
+ "movdqa 176(%[x]), %%xmm11\n" \
+ "movdqa 192(%[x]), %%xmm12\n" \
+ "movdqa 208(%[x]), %%xmm13\n" \
+ "movdqa 224(%[x]), %%xmm14\n" \
+ "movdqa 240(%[x]), %%xmm15\n" \
+ : \
+ : [x] "r" (xmm_temporary_storage) \
+ : "memory" \
+ )
+#else
+#define RESTORE_XMM_REGISTERS
+#endif
+
/*****************************************************************************
* Hue and saturation adjusting routines
*****************************************************************************/
+#ifdef CAN_COMPILE_SSE4_1
+int planar_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+ int i_sin, int i_cos, int i_sat, int i_x,
+ int i_y )
+{
+ uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+ uint8_t *p_out, *p_out_v;
+
+ p_in = p_pic->p[U_PLANE].p_pixels;
+ p_in_v = p_pic->p[V_PLANE].p_pixels;
+ p_in_end = p_in + p_pic->p[U_PLANE].i_visible_lines
+ * p_pic->p[U_PLANE].i_pitch - 8;
+
+ p_out = p_outpic->p[U_PLANE].p_pixels;
+ p_out_v = p_outpic->p[V_PLANE].p_pixels;
+
+ uint8_t i_u, i_v;
+
+ STORE_XMM_REGISTERS;
+
+#if defined(__x86_64__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm8\n"
+ "movd %[sin], %%xmm9\n"
+ "movd %[x], %%xmm10\n"
+ "movd %[y], %%xmm11\n"
+ "movd %[sat], %%xmm12\n"
+ "pshufd $0, %%xmm8, %%xmm8\n"
+ "pshufd $0, %%xmm9, %%xmm9\n"
+ "pshufd $0, %%xmm10, %%xmm10\n"
+ "pshufd $0, %%xmm11, %%xmm11\n"
+ "pshufd $0, %%xmm12, %%xmm12\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#elif defined (__i386__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm4\n"
+ "movd %[sin], %%xmm5\n"
+ "movd %[sat], %%xmm6\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "pshufd $0, %%xmm6, %%xmm6\n"
+ :
+ : [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#endif
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + p_pic->p[U_PLANE].i_visible_pitch - 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ /* Do 8 pixels at a time */
+ ADJUST_2_TIMES(
+ __asm__ __volatile__(
+ "movd (%[in]), %%xmm0\n"
+ "movd (%[in_v]), %%xmm1\n"
+ "punpcklbw %%xmm7, %%xmm0\n"
+ "punpcklwd %%xmm7, %%xmm0\n"
+ "punpcklbw %%xmm7, %%xmm1\n"
+ "punpcklwd %%xmm7, %%xmm1\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v)
+ : "memory" );
+#if defined(__x86_64__)
+ __asm__ __volatile__(
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "pmulld %%xmm8, %%xmm0\n"
+ "pmulld %%xmm9, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm10, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm12, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
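+                    /* xmm1 = 128 in every dword, built without a memory load */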
+ "paddd %%xmm1, %%xmm0\n"
+ /* out_u stays in xmm0 */
+
+ "pmulld %%xmm8, %%xmm3\n"
+ "pmulld %%xmm9, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm11, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm12, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+ /* out_v stays in xmm3 */
+ :
+ : );
+#elif defined (__i386__)
+ __asm__ __volatile__(
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "movd %[x], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm0\n"
+ "pmulld %%xmm5, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm7, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm6, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ /* out_u stays in xmm0 */
+
+ "movd %[y], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm3\n"
+ "pmulld %%xmm5, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm7, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm6, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ /* out_v stays in xmm3 */
+ "pxor %%xmm7, %%xmm7\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y)
+ : "memory" );
+#endif
+ __asm__ __volatile__(
+                    /* pack and saturate (if there is something to saturate)
+                     * and store in destination */
+ "packusdw %%xmm7, %%xmm0\n"
+ "packuswb %%xmm7, %%xmm0\n"
+ "movd %%xmm0, (%[out])\n"
+ "packusdw %%xmm7, %%xmm3\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ "movd %%xmm3, (%[out_v])\n"
+ :
+ : [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "eax", "memory" );
+ p_in += 4;
+ p_in_v += 4;
+ p_out += 4;
+ p_out_v += 4;
+ );
+ }
+
+ p_line_end += 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ PLANAR_WRITE_UV_CLIP();
+ }
+
+ p_in += p_pic->p[U_PLANE].i_pitch
+ - p_pic->p[U_PLANE].i_visible_pitch;
+ p_in_v += p_pic->p[V_PLANE].i_pitch
+ - p_pic->p[V_PLANE].i_visible_pitch;
+ p_out += p_outpic->p[U_PLANE].i_pitch
+ - p_outpic->p[U_PLANE].i_visible_pitch;
+ p_out_v += p_outpic->p[V_PLANE].i_pitch
+ - p_outpic->p[V_PLANE].i_visible_pitch;
+ }
+
+ RESTORE_XMM_REGISTERS;
+
+ return VLC_SUCCESS;
+}
+#endif
+
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic, int i_sin, int i_cos,
+                               int i_sat, int i_x, int i_y )
+{
+ uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+ uint8_t *p_out, *p_out_v;
+
+ int i_y_offset, i_u_offset, i_v_offset;
+ int i_visible_lines, i_pitch, i_visible_pitch;
+
+
+ if ( GetPackedYuvOffsets( p_pic->format.i_chroma, &i_y_offset,
+ &i_u_offset, &i_v_offset ) != VLC_SUCCESS )
+ return VLC_EGENERIC;
+
+ i_visible_lines = p_pic->p->i_visible_lines;
+ i_pitch = p_pic->p->i_pitch;
+ i_visible_pitch = p_pic->p->i_visible_pitch;
+
+ p_in = p_pic->p->p_pixels + i_u_offset;
+ p_in_v = p_pic->p->p_pixels + i_v_offset;
+ p_in_end = p_in + i_visible_lines * i_pitch - 8 * 4;
+
+ p_out = p_outpic->p->p_pixels + i_u_offset;
+ p_out_v = p_outpic->p->p_pixels + i_v_offset;
+
+ uint8_t i_u, i_v;
+
+ STORE_XMM_REGISTERS;
+
+#if defined(__x86_64__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm8\n"
+ "movd %[sin], %%xmm9\n"
+ "movd %[x], %%xmm10\n"
+ "movd %[y], %%xmm11\n"
+ "movd %[sat], %%xmm12\n"
+ "pshufd $0, %%xmm8, %%xmm8\n"
+ "pshufd $0, %%xmm9, %%xmm9\n"
+ "pshufd $0, %%xmm10, %%xmm10\n"
+ "pshufd $0, %%xmm11, %%xmm11\n"
+ "pshufd $0, %%xmm12, %%xmm12\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#elif defined (__i386__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm4\n"
+ "movd %[sin], %%xmm5\n"
+ "movd %[sat], %%xmm6\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "pshufd $0, %%xmm6, %%xmm6\n"
+ :
+ : [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#endif
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + p_pic->p->i_visible_pitch - 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ ADJUST_2_TIMES(
+ /* Do 8 pixels at a time */
+ __asm__ __volatile__ (
+ "movdqu (%[in]), %%xmm0\n"
+ "movdqu (%[in_v]), %%xmm1\n"
+                    "pcmpeqd %%xmm2, %%xmm2\n"    // 0xFFFFFFFF
+                    "psrld $24, %%xmm2\n"         // 0x000000FF
+                    "pand %%xmm2, %%xmm0\n"       // mask out unnecessary data
+                    "pand %%xmm2, %%xmm1\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v)
+ : "memory" );
+
+#if defined(__x86_64__)
+ __asm__ __volatile__ (
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "pmulld %%xmm8, %%xmm0\n"
+ "pmulld %%xmm9, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm10, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm12, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ // out_u stays in xmm0
+
+ "pmulld %%xmm8, %%xmm3\n"
+ "pmulld %%xmm9, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm11, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm12, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+ // out_v stays in xmm3
+ :
+ : );
+#elif defined (__i386__)
+ __asm__ __volatile__ (
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "movd %[x], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm0\n"
+ "pmulld %%xmm5, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm7, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm6, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ // out_u stays in xmm0
+
+
+ "movd %[y], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm3\n"
+ "pmulld %%xmm5, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm7, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm6, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ // out_v stays in xmm3
+ "pxor %%xmm7, %%xmm7\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y)
+ : "memory");
+#endif
+ __asm__ __volatile__ (
+ "packusdw %%xmm7, %%xmm0\n"
+ "packuswb %%xmm7, %%xmm0\n"
+ "punpcklbw %%xmm7, %%xmm0\n"
+ "punpcklwd %%xmm7, %%xmm0\n"
+ "packusdw %%xmm7, %%xmm3\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ "punpcklbw %%xmm7, %%xmm3\n"
+ "punpcklwd %%xmm7, %%xmm3\n"
+
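+                    /* merge into the packed output: the 0xFFFFFF00 mask
+                     * keeps the other component bytes already stored,
+                     * por inserts the recomputed chroma byte */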
+ "movdqu (%[out]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n"
+ "pslld $8, %%xmm2\n"
+ "pand %%xmm1, %%xmm2\n"
+ "por %%xmm0, %%xmm2\n"
+ "movdqu %%xmm2, (%[out])\n"
+
+ "movdqu (%[out_v]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n"
+ "pslld $8, %%xmm2\n"
+ "pand %%xmm1, %%xmm2\n"
+ "por %%xmm3, %%xmm2\n"
+ "movdqu %%xmm2, (%[out_v])\n"
+ :
+ : [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "memory" );
+ p_in += 4;
+ p_in_v += 4;
+ p_out += 4;
+ p_out_v += 4;
+ );
+ }
+
+ p_line_end += 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ PACKED_WRITE_UV_CLIP();
+ }
+
+ p_in += p_pic->p->i_pitch - p_pic->p->i_visible_pitch;
+ p_in_v += p_pic->p->i_pitch - p_pic->p->i_visible_pitch;
+ p_out += p_pic->p->i_pitch - p_pic->p->i_visible_pitch;
+ p_out_v += p_pic->p->i_pitch - p_pic->p->i_visible_pitch;
+ }
+
+ RESTORE_XMM_REGISTERS;
+
+ return VLC_SUCCESS;
+}
+#endif
+
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_SSE41( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+                          int i_cos, int i_sat, int i_x, int i_y )
+{
+ uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+ uint8_t *p_out, *p_out_v;
+
+ int i_y_offset, i_u_offset, i_v_offset;
+ int i_visible_lines, i_pitch, i_visible_pitch;
+
+ if ( GetPackedYuvOffsets( p_pic->format.i_chroma, &i_y_offset,
+ &i_u_offset, &i_v_offset ) != VLC_SUCCESS )
+ return VLC_EGENERIC;
+
+ i_visible_lines = p_pic->p->i_visible_lines;
+ i_pitch = p_pic->p->i_pitch;
+ i_visible_pitch = p_pic->p->i_visible_pitch;
+
+ p_in = p_pic->p->p_pixels + i_u_offset;
+ p_in_v = p_pic->p->p_pixels + i_v_offset;
+ p_in_end = p_in + i_visible_lines * i_pitch - 8 * 4;
+
+ p_out = p_outpic->p->p_pixels + i_u_offset;
+ p_out_v = p_outpic->p->p_pixels + i_v_offset;
+
+ uint8_t i_u, i_v;
+
+ STORE_XMM_REGISTERS;
+
+#if defined(__x86_64__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm8\n"
+ "movd %[sin], %%xmm9\n"
+ "movd %[x], %%xmm10\n"
+ "movd %[y], %%xmm11\n"
+ "movd %[sat], %%xmm12\n"
+ "pshufd $0, %%xmm8, %%xmm8\n"
+ "pshufd $0, %%xmm9, %%xmm9\n"
+ "pshufd $0, %%xmm10, %%xmm10\n"
+ "pshufd $0, %%xmm11, %%xmm11\n"
+ "pshufd $0, %%xmm12, %%xmm12\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#elif defined (__i386__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm4\n"
+ "movd %[sin], %%xmm5\n"
+ "movd %[sat], %%xmm6\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "pshufd $0, %%xmm6, %%xmm6\n"
+ :
+ : [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#endif
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + i_visible_pitch - 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ ADJUST_2_TIMES(
+ /* Do 8 pixels at a time */
+ __asm__ __volatile__(
+ "movdqu (%[in]), %%xmm0\n"
+ "movdqu (%[in_v]), %%xmm1\n"
+                    "pcmpeqd %%xmm2, %%xmm2\n"    // 0xFFFFFFFF
+                    "psrld $24, %%xmm2\n"         // 0x000000FF
+                    "pand %%xmm2, %%xmm0\n"       // mask out unnecessary data
+                    "pand %%xmm2, %%xmm1\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v)
+ : "memory" );
+#if defined(__x86_64__)
+ __asm__ __volatile__(
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "pmulld %%xmm8, %%xmm0\n"
+ "pmulld %%xmm9, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm10, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm12, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ // out_u stays in xmm0
+
+ "pmulld %%xmm8, %%xmm3\n"
+ "pmulld %%xmm9, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm11, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm12, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+ // out_v stays in xmm3
+ :
+ : );
+#elif defined (__i386__)
+ __asm__ __volatile__(
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "movd %[x], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm0\n"
+ "pmulld %%xmm5, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm7, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm6, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ // out_u stays in xmm0
+
+
+ "movd %[y], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm3\n"
+ "pmulld %%xmm5, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm7, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm6, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ // out_v stays in xmm3
+ "pxor %%xmm7, %%xmm7\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y)
+ : "memory" );
+#endif
+ __asm__ __volatile__(
+                    "pcmpeqd %%xmm2, %%xmm2\n"    // 0xFFFFFFFF
+                    "psrld $24, %%xmm2\n"         // 0x000000FF
+                    "pand %%xmm2, %%xmm0\n"       // mask out unnecessary data
+                    "pand %%xmm2, %%xmm3\n"
+
+ "movdqu (%[out]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n"
+ "pslld $8, %%xmm2\n"
+ "pand %%xmm1, %%xmm2\n"
+ "por %%xmm0, %%xmm2\n"
+ "movdqu %%xmm2, (%[out])\n"
+
+ "movdqu (%[out_v]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n"
+ "pslld $8, %%xmm2\n"
+ "pand %%xmm1, %%xmm2\n"
+ "por %%xmm3, %%xmm2\n"
+ "movdqu %%xmm2, (%[out_v])\n"
+ :
+ : [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "memory" );
+ p_in += 4;
+ p_in_v += 4;
+ p_out += 4;
+ p_out_v += 4;
+ );
+ }
+
+ p_line_end += 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ PACKED_WRITE_UV();
+ }
+
+ p_in += i_pitch - i_visible_pitch;
+ p_in_v += i_pitch - i_visible_pitch;
+ p_out += i_pitch - i_visible_pitch;
+ p_out_v += i_pitch - i_visible_pitch;
+ }
+
+ RESTORE_XMM_REGISTERS;
+
+ return VLC_SUCCESS;
+}
+#endif
+
+#ifdef CAN_COMPILE_SSE2
+int planar_sat_hue_SSE2( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+                         int i_cos, int i_sat, int i_x, int i_y )
+{
+ uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+ uint8_t *p_out, *p_out_v;
+
+ p_in = p_pic->p[U_PLANE].p_pixels;
+ p_in_v = p_pic->p[V_PLANE].p_pixels;
+ p_in_end = p_in + p_pic->p[U_PLANE].i_visible_lines
+ * p_pic->p[U_PLANE].i_pitch - 8;
+
+ p_out = p_outpic->p[U_PLANE].p_pixels;
+ p_out_v = p_outpic->p[V_PLANE].p_pixels;
+
+ uint8_t i_u, i_v;
+
+ STORE_XMM_REGISTERS;
+
+ __asm__ volatile(
+#if defined(__i386__)
+ "pxor %%xmm0, %%xmm0\n"
+ "movd %[cos], %%xmm3\n"
+ "movd %[sin], %%xmm4\n"
+ "pslld $16, %%xmm4\n"
+ "pslld $16, %%xmm3\n"
+ "psrld $16, %%xmm3\n"
+ "pshufd $0, %%xmm3, %%xmm3\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "por %%xmm4, %%xmm3\n"
+ "movd %[sat], %%xmm4\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "pcmpeqb %%xmm6, %%xmm6\n"
+ "psrlw $15, %%xmm6\n"
+ "psllw $7, %%xmm6\n"
+#elif defined(__x86_64__)
+ "pxor %%xmm0, %%xmm0\n"
+ "movd %[cos], %%xmm6\n"
+ "movd %[sin], %%xmm7\n"
+ "pslld $16, %%xmm7\n"
+ "pslld $16, %%xmm6\n"
+ "psrld $16, %%xmm6\n"
+ "pshufd $0, %%xmm6, %%xmm6\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+ "por %%xmm7, %%xmm6\n"
+ "movd %[sat], %%xmm7\n"
+ "movd %[x], %%xmm10\n"
+ "movd %[y], %%xmm11\n"
+ "pshufd $0, %%xmm10, %%xmm10\n"
+ "pshufd $0, %%xmm11, %%xmm11\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pcmpeqb %%xmm12, %%xmm12\n"
+ "pcmpeqb %%xmm9, %%xmm9\n"
+ "pcmpeqb %%xmm13, %%xmm13\n"
+ "psrlw $15, %%xmm9\n"
+ "psllw $7, %%xmm9\n"
+ "psrlw $15, %%xmm13\n"
+ "mov $0x8000, %%eax\n"
+ "movd %%eax, %%xmm14\n"
+ "pshufd $0, %%xmm14, %%xmm14\n"
+ "mov $0x80008000, %%eax\n"
+ "movd %%eax, %%xmm15\n"
+ "pshufd $0, %%xmm15, %%xmm15\n"
+#endif
+ :
+ :
+ [x] "r" ( i_x ),
+ [y] "r" ( i_y ),
+ [sat] "r" ( i_sat * 0x10001 ),
+ [sin] "r" ( i_sin ),
+ [cos] "r" ( i_cos )
+ : "eax", "memory" );
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + p_pic->p[U_PLANE].i_visible_pitch - 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ /* Do 8 pixels at a time */
+ __asm__ __volatile__ (
+ "movd (%[in]), %%xmm1\n"
+ "movd (%[in_v]), %%xmm2\n"
+ "punpcklbw %%xmm0, %%xmm1\n"
+ "punpcklbw %%xmm0, %%xmm2\n"
+ "punpcklwd %%xmm2, %%xmm1\n"
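+                /* U and V are interleaved as 16-bit words, so pmaddwd
+                 * against the packed (cos|sin) multiplier computes
+                 * u*cos + v*sin for each pixel in one instruction */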
+ /////////////////////////////////////////
+ "movd 4(%[in]), %%xmm2\n"
+ "movd 4(%[in_v]), %%xmm5\n"
+ "punpcklbw %%xmm0, %%xmm2\n"
+ "punpcklbw %%xmm0, %%xmm5\n"
+ "punpcklwd %%xmm5, %%xmm2\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v)
+ : "memory" );
+#ifdef __i386__
+ __asm__ __volatile__ (
+ "pmaddwd %%xmm3, %%xmm1\n"
+ "pmaddwd %%xmm3, %%xmm2\n"
+ "movd %[x], %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "pslld $8, %%xmm1\n"
+ "psrld $16, %%xmm1\n"
+ "pslld $8, %%xmm2\n"
+ "psrld $16, %%xmm2\n"
+ "mov $0x8000, %%eax\n"
+ "movd %%eax, %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "packssdw %%xmm2, %%xmm1\n"
+ "pshuflw $0, %%xmm5, %%xmm5\n"
+ "pshufhw $0, %%xmm5, %%xmm5\n"
+ "paddw %%xmm5, %%xmm1\n"
+ "pmullw %%xmm4, %%xmm1\n"
+ "psraw $8, %%xmm1\n"
+ "paddw %%xmm6, %%xmm1\n"
+ /* store U channel */
+ "packuswb %%xmm0, %%xmm1\n"
+ "movq %%xmm1, (%[out])\n"
+ :
+ : [x] "r" (i_x),
+ [out] "r" (p_out)
+ : "eax", "memory" );
+ __asm__ __volatile__ (
+ /////////////////////////////////////////
+ "pcmpeqb %%xmm6, %%xmm6\n"
+ "movq (%[in]), %%xmm5\n"
+ "movd (%[in_v]), %%xmm1\n"
+ "movd 4(%[in_v]), %%xmm2\n"
+ "punpcklbw %%xmm0, %%xmm5\n"
+ "pandn %%xmm6, %%xmm5\n"
+ "punpcklbw %%xmm0, %%xmm1\n"
+ "punpcklbw %%xmm0, %%xmm2\n"
+ "paddw %%xmm6, %%xmm5\n"
+ "psrlw $15, %%xmm6\n"
+ "punpcklwd %%xmm5, %%xmm1\n"
+ "punpckhqdq %%xmm5, %%xmm5\n"
+ "punpcklwd %%xmm5, %%xmm2\n"
+
+ "psllw $7, %%xmm6\n"
+ "pmaddwd %%xmm3, %%xmm1\n"
+ "pmaddwd %%xmm3, %%xmm2\n"
+ "movd %[y], %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "pslld $8, %%xmm1\n"
+ "psrld $16, %%xmm1\n"
+ "pslld $8, %%xmm2\n"
+ "psrld $16, %%xmm2\n"
+
+ "mov $0x8000, %%eax\n"
+ "movd %%eax, %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "packssdw %%xmm2, %%xmm1\n"
+ "pshuflw $0, %%xmm5, %%xmm5\n"
+ "pshufhw $0, %%xmm5, %%xmm5\n"
+ "paddw %%xmm5, %%xmm1\n"
+ "pmullw %%xmm4, %%xmm1\n"
+ "psraw $8, %%xmm1\n"
+ "paddw %%xmm6, %%xmm1\n"
+ "packuswb %%xmm0, %%xmm1\n"
+ "movq %%xmm1, (%[out_v])\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v),
+ [y] "r" (i_y),
+ [out_v] "r" (p_out_v)
+ : "eax", "memory" );
+#elif defined(__x86_64__)
+ __asm__ __volatile__ (
+ /////////////////////////////////////////
+ "movq (%[in]), %%xmm5\n"
+ "movd (%[in_v]), %%xmm3\n"
+ "movd 4(%[in_v]), %%xmm4\n"
+ "punpcklbw %%xmm0, %%xmm5\n"
+                "pandn %%xmm12, %%xmm5\n"         // invert U (to be subtracted)
+ "punpcklbw %%xmm0, %%xmm3\n"
+ "paddw %%xmm13, %%xmm5\n" // add 1
+ "punpcklbw %%xmm0, %%xmm4\n"
+ "punpcklwd %%xmm5, %%xmm3\n"
+ "punpckhqdq %%xmm5, %%xmm5\n"
+ "punpcklwd %%xmm5, %%xmm4\n"
+ /////////////////////////////////////////
+ "pmaddwd %%xmm6, %%xmm1\n"
+ "pmaddwd %%xmm6, %%xmm2\n"
+ "pmaddwd %%xmm6, %%xmm3\n"
+ "pmaddwd %%xmm6, %%xmm4\n"
+ "psubd %%xmm10, %%xmm1\n"
+ "psubd %%xmm10, %%xmm2\n"
+ "psubd %%xmm11, %%xmm3\n"
+ "psubd %%xmm11, %%xmm4\n"
+ "pslld $8, %%xmm1\n"
+ "pslld $8, %%xmm2\n"
+ "pslld $8, %%xmm3\n"
+ "pslld $8, %%xmm4\n"
+ "psrld $16, %%xmm1\n"
+ "psrld $16, %%xmm2\n"
+ "psrld $16, %%xmm3\n"
+ "psrld $16, %%xmm4\n"
+ "psubd %%xmm14, %%xmm1\n"
+ "psubd %%xmm14, %%xmm2\n"
+ "psubd %%xmm14, %%xmm3\n"
+ "psubd %%xmm14, %%xmm4\n"
+ "packssdw %%xmm2, %%xmm1\n"
+ "packssdw %%xmm4, %%xmm3\n"
+ "paddw %%xmm15, %%xmm1\n"
+ "paddw %%xmm15, %%xmm3\n"
+ "pmullw %%xmm7, %%xmm1\n"
+ "pmullw %%xmm7, %%xmm3\n"
+ "psraw $8, %%xmm1\n"
+ "psraw $8, %%xmm3\n"
+ "paddw %%xmm9, %%xmm1\n"
+ "paddw %%xmm9, %%xmm3\n"
+ "packuswb %%xmm0, %%xmm1\n"
+ "packuswb %%xmm0, %%xmm3\n"
+ "movq %%xmm1, (%[out])\n"
+ "movq %%xmm3, (%[out_v])\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v),
+ [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "eax", "memory" );
+#endif
+ p_in += 8;
+ p_in_v += 8;
+ p_out += 8;
+ p_out_v += 8;
+ }
+
+ p_line_end += 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ PLANAR_WRITE_UV();
+ }
+
+ p_in += p_pic->p[U_PLANE].i_pitch
+ - p_pic->p[U_PLANE].i_visible_pitch;
+ p_in_v += p_pic->p[V_PLANE].i_pitch
+ - p_pic->p[V_PLANE].i_visible_pitch;
+ p_out += p_outpic->p[U_PLANE].i_pitch
+ - p_outpic->p[U_PLANE].i_visible_pitch;
+ p_out_v += p_outpic->p[V_PLANE].i_pitch
+ - p_outpic->p[V_PLANE].i_visible_pitch;
+ }
+
+ RESTORE_XMM_REGISTERS;
+
+ return VLC_SUCCESS;
+}
+#endif
+
int planar_sat_hue_clip_C( picture_t * p_pic, picture_t * p_outpic, int i_sin, int i_cos,
int i_sat, int i_x, int i_y )
{
diff --git a/modules/video_filter/adjust_sat_hue.h b/modules/video_filter/adjust_sat_hue.h
index d850dec..cef13ec 100644
--- a/modules/video_filter/adjust_sat_hue.h
+++ b/modules/video_filter/adjust_sat_hue.h
@@ -39,6 +39,40 @@
*/
/**
+ * SSE4.1 version function for planar format, i_sat > 256
+ */
+#ifdef CAN_COMPILE_SSE4_1
+int planar_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+ int i_sin, int i_cos, int i_sat, int i_x,
+ int i_y );
+#endif
+
+/**
+ * SSE4.1 version function for packed format, i_sat > 256
+ */
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+ int i_sin, int i_cos, int i_sat, int i_x,
+ int i_y );
+#endif
+
+/**
+ * SSE4.1 version function for packed format, i_sat <= 256
+ */
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_SSE41( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+                          int i_cos, int i_sat, int i_x, int i_y );
+#endif
+
+/**
+ * SSE2 version function for planar format, i_sat <= 256
+ */
+#ifdef CAN_COMPILE_SSE2
+int planar_sat_hue_SSE2( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+                         int i_cos, int i_sat, int i_x, int i_y );
+#endif
+
+/**
* Basic C compiler generated function for planar format, i_sat > 256
*/
int planar_sat_hue_clip_C( picture_t * p_pic, picture_t * p_outpic,
--
1.7.1