[vlc-devel] [PATCH 2/2] Added assembly (SSE2, SSE4.1) processing functions to the adjust filter
Martin Briza
Gamajun at seznam.cz
Tue Aug 16 16:57:06 CEST 2011
From: Martin Briza <xbriza00 at stud.fit.vutbr.cz>
---
modules/video_filter/adjust.c | 31 ++-
modules/video_filter/adjust_sat_hue.c | 823 +++++++++++++++++++++++++++++++++
modules/video_filter/adjust_sat_hue.h | 34 ++
3 files changed, 884 insertions(+), 4 deletions(-)
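
Reviewer note (commentary only, not part of the diff): every SIMD routine in
this patch vectorizes the same 8-bit fixed-point transform that the existing C
paths apply to each U/V pair. A minimal scalar sketch of that math, assuming a
local clip helper in place of VLC's clip_uint8_vlc() (names here are
illustrative, not part of the patch):

    #include <stdint.h>

    /* Stand-in for VLC's clip_uint8_vlc(). */
    static inline uint8_t clip_u8( int i )
    {
        return i < 0 ? 0 : i > 255 ? 255 : (uint8_t)i;
    }

    /* One U/V pair: rotate the chroma vector by the hue angle (i_sin and
     * i_cos are 8-bit fixed point; i_x and i_y fold the 128 chroma bias
     * into the rotation), scale by i_sat (also 8-bit fixed point), then
     * re-add the 128 bias before packing back to bytes. */
    static inline void adjust_uv_clip( uint8_t *pi_u, uint8_t *pi_v,
                                       int i_sin, int i_cos, int i_sat,
                                       int i_x, int i_y )
    {
        int i_u = *pi_u, i_v = *pi_v;
        *pi_u = clip_u8( (( ( i_u * i_cos + i_v * i_sin - i_x ) >> 8 )
                            * i_sat >> 8 ) + 128 );
        *pi_v = clip_u8( (( ( i_v * i_cos - i_u * i_sin - i_y ) >> 8 )
                            * i_sat >> 8 ) + 128 );
    }

The SSE4.1 routines compute this on four chroma pairs per instruction group
(run twice per loop iteration via ADJUST_2_TIMES) and saturate with
packusdw/packuswb rather than an explicit clip. The clip variants are only
dispatched when i_sat > 256, where the scaled result can leave [0,255]; they
rely on pmulld and packusdw, which exist only from SSE4.1 on, while the
non-clip planar path needs only SSE2 (pmaddwd/pmullw/packuswb). That is why
the adjust.c dispatch below pairs the C clip routine with the SSE2 non-clip
one on plain SSE2 machines.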
diff --git a/modules/video_filter/adjust.c b/modules/video_filter/adjust.c
index dcfb46b..056af0f 100644
--- a/modules/video_filter/adjust.c
+++ b/modules/video_filter/adjust.c
@@ -167,15 +167,38 @@ static int Create( vlc_object_t *p_this )
CASE_PLANAR_YUV
/* Planar YUV */
p_filter->pf_video_filter = FilterPlanar;
- p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_C;
- p_sys->pf_process_sat_hue = planar_sat_hue_C;
+
+ if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
+ {
+ p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_SSE41;
+ p_sys->pf_process_sat_hue = planar_sat_hue_SSE2;
+ }
+ else if (vlc_CPU() & CPU_CAPABILITY_SSE2)
+ {
+ p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_C;
+ p_sys->pf_process_sat_hue = planar_sat_hue_SSE2;
+ }
+ else
+ {
+ p_sys->pf_process_sat_hue_clip = planar_sat_hue_clip_C;
+ p_sys->pf_process_sat_hue = planar_sat_hue_C;
+ }
break;
CASE_PACKED_YUV_422
/* Packed YUV 4:2:2 */
p_filter->pf_video_filter = FilterPacked;
- p_sys->pf_process_sat_hue_clip = packed_sat_hue_clip_C;
- p_sys->pf_process_sat_hue = packed_sat_hue_C;
+
+ if (vlc_CPU() & CPU_CAPABILITY_SSE4_1)
+ {
+ p_sys->pf_process_sat_hue_clip = packed_sat_hue_clip_SSE41;
+ p_sys->pf_process_sat_hue = packed_sat_hue_SSE41;
+ }
+ else
+ {
+ p_sys->pf_process_sat_hue_clip = packed_sat_hue_clip_C;
+ p_sys->pf_process_sat_hue = packed_sat_hue_C;
+ }
break;
default:
diff --git a/modules/video_filter/adjust_sat_hue.c b/modules/video_filter/adjust_sat_hue.c
index 45598d4..4a63eb1 100644
--- a/modules/video_filter/adjust_sat_hue.c
+++ b/modules/video_filter/adjust_sat_hue.c
@@ -70,6 +70,829 @@
* Hue and saturation adjusting routines
*****************************************************************************/
+#ifdef CAN_COMPILE_SSE4_1
+int planar_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+ int i_sin, int i_cos, int i_sat, int i_x,
+ int i_y )
+{
+ uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+ uint8_t *p_out, *p_out_v;
+
+ p_in = p_pic->p[U_PLANE].p_pixels;
+ p_in_v = p_pic->p[V_PLANE].p_pixels;
+ p_in_end = p_in + p_pic->p[U_PLANE].i_visible_lines
+ * p_pic->p[U_PLANE].i_pitch - 8;
+
+ p_out = p_outpic->p[U_PLANE].p_pixels;
+ p_out_v = p_outpic->p[V_PLANE].p_pixels;
+
+ uint8_t i_u, i_v;
+
+#if defined(__x86_64__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm8\n"
+ "movd %[sin], %%xmm9\n"
+ "movd %[x], %%xmm10\n"
+ "movd %[y], %%xmm11\n"
+ "movd %[sat], %%xmm12\n"
+ "pshufd $0, %%xmm8, %%xmm8\n"
+ "pshufd $0, %%xmm9, %%xmm9\n"
+ "pshufd $0, %%xmm10, %%xmm10\n"
+ "pshufd $0, %%xmm11, %%xmm11\n"
+ "pshufd $0, %%xmm12, %%xmm12\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#elif defined (__i386__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm4\n"
+ "movd %[sin], %%xmm5\n"
+ "movd %[sat], %%xmm6\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "pshufd $0, %%xmm6, %%xmm6\n"
+ :
+ : [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#endif
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + p_pic->p[U_PLANE].i_visible_pitch - 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ /* Do 8 pixels at a time */
+ ADJUST_2_TIMES(
+ __asm__ __volatile__(
+ "movd (%[in]), %%xmm0\n"
+ "movd (%[in_v]), %%xmm1\n"
+ "punpcklbw %%xmm7, %%xmm0\n"
+ "punpcklwd %%xmm7, %%xmm0\n"
+ "punpcklbw %%xmm7, %%xmm1\n"
+ "punpcklwd %%xmm7, %%xmm1\n"
+#if defined(__x86_64__)
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "pmulld %%xmm8, %%xmm0\n"
+ "pmulld %%xmm9, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm10, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm12, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ /* out_u stays in xmm0 */
+
+ "pmulld %%xmm8, %%xmm3\n"
+ "pmulld %%xmm9, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm11, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm12, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ /* out_v stays in xmm3 */
+#elif defined (__i386__)
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "movd %[x], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm0\n"
+ "pmulld %%xmm5, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm7, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm6, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ /* out_u stays in xmm0 */
+
+
+ "movd %[y], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm3\n"
+ "pmulld %%xmm5, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm7, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm6, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ /* out_v stays in xmm3 */
+ "pxor %%xmm7, %%xmm7\n"
+#endif
+ /* pack and saturate (if there is something to saturate) and
+ * store in destination */
+ "packusdw %%xmm7, %%xmm0\n"
+ "packuswb %%xmm7, %%xmm0\n"
+ "movd %%xmm0, (%[out])\n"
+ "packusdw %%xmm7, %%xmm3\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ "movd %%xmm3, (%[out_v])\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v),
+ [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat),
+ [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "eax", "memory" );
+ p_in += 4;
+ p_in_v += 4;
+ p_out += 4;
+ p_out_v += 4; );
+ }
+
+ p_line_end += 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ PLANAR_WRITE_UV_CLIP();
+ }
+
+ p_in += p_pic->p[U_PLANE].i_pitch
+ - p_pic->p[U_PLANE].i_visible_pitch;
+ p_in_v += p_pic->p[V_PLANE].i_pitch
+ - p_pic->p[V_PLANE].i_visible_pitch;
+ p_out += p_outpic->p[U_PLANE].i_pitch
+ - p_outpic->p[U_PLANE].i_visible_pitch;
+ p_out_v += p_outpic->p[V_PLANE].i_pitch
+ - p_outpic->p[V_PLANE].i_visible_pitch;
+ }
+
+ return VLC_SUCCESS;
+}
+#endif
+
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic, int i_sin, int i_cos,
+ int i_sat, int i_x, int i_y )
+{
+ uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+ uint8_t *p_out, *p_out_v;
+
+ int i_y_offset, i_u_offset, i_v_offset;
+ int i_visible_lines, i_pitch, i_visible_pitch;
+
+
+ if ( GetPackedYuvOffsets( p_pic->format.i_chroma, &i_y_offset,
+ &i_u_offset, &i_v_offset ) != VLC_SUCCESS )
+ return VLC_EGENERIC;
+
+ i_visible_lines = p_pic->p->i_visible_lines;
+ i_pitch = p_pic->p->i_pitch;
+ i_visible_pitch = p_pic->p->i_visible_pitch;
+
+ p_in = p_pic->p->p_pixels + i_u_offset;
+ p_in_v = p_pic->p->p_pixels + i_v_offset;
+ p_in_end = p_in + i_visible_lines * i_pitch - 8 * 4;
+
+ p_out = p_outpic->p->p_pixels + i_u_offset;
+ p_out_v = p_outpic->p->p_pixels + i_v_offset;
+
+ uint8_t i_u, i_v;
+
+#if defined(__x86_64__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm8\n"
+ "movd %[sin], %%xmm9\n"
+ "movd %[x], %%xmm10\n"
+ "movd %[y], %%xmm11\n"
+ "movd %[sat], %%xmm12\n"
+ "pshufd $0, %%xmm8, %%xmm8\n"
+ "pshufd $0, %%xmm9, %%xmm9\n"
+ "pshufd $0, %%xmm10, %%xmm10\n"
+ "pshufd $0, %%xmm11, %%xmm11\n"
+ "pshufd $0, %%xmm12, %%xmm12\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#elif defined (__i386__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm4\n"
+ "movd %[sin], %%xmm5\n"
+ "movd %[sat], %%xmm6\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "pshufd $0, %%xmm6, %%xmm6\n"
+ :
+ : [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#endif
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + p_pic->p->i_visible_pitch - 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ ADJUST_2_TIMES(
+ /* Do 8 pixels at a time */
+ __asm__ __volatile__ (
+ "movdqu (%[in]), %%xmm0\n"
+ "movdqu (%[in_v]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n" // 0xFFFFFFFF
+ "psrld $24, %%xmm2\n" // 0x000000FF
+ "pand %%xmm2, %%xmm0\n" // mask out unnecessary data
+ "pand %%xmm2, %%xmm1\n"
+#if defined(__x86_64__)
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "pmulld %%xmm8, %%xmm0\n"
+ "pmulld %%xmm9, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm10, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm12, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ // out_u stays in xmm0
+
+ "pmulld %%xmm8, %%xmm3\n"
+ "pmulld %%xmm9, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm11, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm12, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ // out_v stays in xmm3
+#elif defined (__i386__)
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "movd %[x], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm0\n"
+ "pmulld %%xmm5, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm7, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm6, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ // out_u stays in xmm0
+
+
+ "movd %[y], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm3\n"
+ "pmulld %%xmm5, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm7, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm6, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ // out_v stays in xmm3
+ "pxor %%xmm7, %%xmm7\n"
+#endif
+ "packusdw %%xmm7, %%xmm0\n"
+ "packuswb %%xmm7, %%xmm0\n"
+ "punpcklbw %%xmm7, %%xmm0\n"
+ "punpcklwd %%xmm7, %%xmm0\n"
+ "packusdw %%xmm7, %%xmm3\n"
+ "packuswb %%xmm7, %%xmm3\n"
+ "punpcklbw %%xmm7, %%xmm3\n"
+ "punpcklwd %%xmm7, %%xmm3\n"
+
+ "movdqu (%[out]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n"
+ "pslld $8, %%xmm2\n"
+ "pand %%xmm1, %%xmm2\n"
+ "por %%xmm0, %%xmm2\n"
+ "movdqu %%xmm2, (%[out])\n"
+
+ "movdqu (%[out_v]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n"
+ "pslld $8, %%xmm2\n"
+ "pand %%xmm1, %%xmm2\n"
+ "por %%xmm3, %%xmm2\n"
+ "movdqu %%xmm2, (%[out_v])\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v),
+ [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat),
+ [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "eax", "memory" );
+ p_in += 4;
+ p_in_v += 4;
+ p_out += 4;
+ p_out_v += 4;
+ );
+ }
+
+ p_line_end += 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ PACKED_WRITE_UV_CLIP();
+ }
+
+ p_in += p_pic->p->i_pitch - p_pic->p->i_visible_pitch;
+ p_in_v += p_pic->p->i_pitch - p_pic->p->i_visible_pitch;
+ p_out += p_pic->p->i_pitch - p_pic->p->i_visible_pitch;
+ p_out_v += p_pic->p->i_pitch - p_pic->p->i_visible_pitch;
+ }
+
+ return VLC_SUCCESS;
+}
+#endif
+
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_SSE41( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+ int i_cos, int i_sat, int i_x, int i_y )
+{
+ uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+ uint8_t *p_out, *p_out_v;
+
+ int i_y_offset, i_u_offset, i_v_offset;
+ int i_visible_lines, i_pitch, i_visible_pitch;
+
+ if ( GetPackedYuvOffsets( p_pic->format.i_chroma, &i_y_offset,
+ &i_u_offset, &i_v_offset ) != VLC_SUCCESS )
+ return VLC_EGENERIC;
+
+ i_visible_lines = p_pic->p->i_visible_lines;
+ i_pitch = p_pic->p->i_pitch;
+ i_visible_pitch = p_pic->p->i_visible_pitch;
+
+ p_in = p_pic->p->p_pixels + i_u_offset;
+ p_in_v = p_pic->p->p_pixels + i_v_offset;
+ p_in_end = p_in + i_visible_lines * i_pitch - 8 * 4;
+
+ p_out = p_outpic->p->p_pixels + i_u_offset;
+ p_out_v = p_outpic->p->p_pixels + i_v_offset;
+
+ uint8_t i_u, i_v;
+
+#if defined(__x86_64__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm8\n"
+ "movd %[sin], %%xmm9\n"
+ "movd %[x], %%xmm10\n"
+ "movd %[y], %%xmm11\n"
+ "movd %[sat], %%xmm12\n"
+ "pshufd $0, %%xmm8, %%xmm8\n"
+ "pshufd $0, %%xmm9, %%xmm9\n"
+ "pshufd $0, %%xmm10, %%xmm10\n"
+ "pshufd $0, %%xmm11, %%xmm11\n"
+ "pshufd $0, %%xmm12, %%xmm12\n"
+ :
+ : [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#elif defined (__i386__)
+ __asm__ volatile(
+ "pxor %%xmm7, %%xmm7\n"
+ "movd %[cos], %%xmm4\n"
+ "movd %[sin], %%xmm5\n"
+ "movd %[sat], %%xmm6\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "pshufd $0, %%xmm6, %%xmm6\n"
+ :
+ : [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat)
+ : "eax", "memory" );
+#endif
+
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + i_visible_pitch - 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ ADJUST_2_TIMES(
+ /* Do 8 pixels at a time */
+ __asm__ volatile(
+ "movdqu (%[in]), %%xmm0\n"
+ "movdqu (%[in_v]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n" // 0xFFFFFFFF
+ "psrld $24, %%xmm2\n" // 0x000000FF
+ "pand %%xmm2, %%xmm0\n" // mask out unnecessary data
+ "pand %%xmm2, %%xmm1\n"
+#if defined(__x86_64__)
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "pmulld %%xmm8, %%xmm0\n"
+ "pmulld %%xmm9, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm10, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm12, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ // out_u stays in xmm0
+
+ "pmulld %%xmm8, %%xmm3\n"
+ "pmulld %%xmm9, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm11, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm12, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ // out_v stays in xmm3
+#elif defined (__i386__)
+
+ "movdqu %%xmm0, %%xmm2\n"
+ "movdqu %%xmm1, %%xmm3\n"
+
+ "movd %[x], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm0\n"
+ "pmulld %%xmm5, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ "psubd %%xmm7, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+ "pmulld %%xmm6, %%xmm0\n"
+ "psrad $8, %%xmm0\n"
+
+ "pcmpeqd %%xmm1, %%xmm1\n"
+ "psrld $31, %%xmm1\n"
+ "pslld $7, %%xmm1\n"
+ "paddd %%xmm1, %%xmm0\n"
+ // out_u stays in xmm0
+
+
+ "movd %[y], %%xmm7\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pmulld %%xmm4, %%xmm3\n"
+ "pmulld %%xmm5, %%xmm2\n"
+ "psubd %%xmm2, %%xmm3\n"
+ "psubd %%xmm7, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+ "pmulld %%xmm6, %%xmm3\n"
+ "psrad $8, %%xmm3\n"
+
+ "paddd %%xmm1, %%xmm3\n"
+
+ // out_v stays in xmm3
+ "pxor %%xmm7, %%xmm7\n"
+#endif
+
+ "pcmpeqd %%xmm2, %%xmm2\n" // 0xFFFFFFFF
+ "psrld $24, %%xmm2\n" // 0x000000FF
+ "pand %%xmm2, %%xmm0\n" // mask out unnecessary data
+ "pand %%xmm2, %%xmm3\n"
+
+ "movdqu (%[out]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n"
+ "pslld $8, %%xmm2\n"
+ "pand %%xmm1, %%xmm2\n"
+ "por %%xmm0, %%xmm2\n"
+ "movdqu %%xmm2, (%[out])\n"
+
+ "movdqu (%[out_v]), %%xmm1\n"
+ "pcmpeqd %%xmm2, %%xmm2\n"
+ "pslld $8, %%xmm2\n"
+ "pand %%xmm1, %%xmm2\n"
+ "por %%xmm3, %%xmm2\n"
+ "movdqu %%xmm2, (%[out_v])\n"
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v),
+ [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat),
+ [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "eax", "memory" );
+ p_in += 4;
+ p_in_v += 4;
+ p_out += 4;
+ p_out_v += 4;
+ );
+ }
+
+ p_line_end += 8 * 4;
+
+ for( ; p_in < p_line_end ; )
+ {
+ PACKED_WRITE_UV();
+ }
+
+ p_in += i_pitch - i_visible_pitch;
+ p_in_v += i_pitch - i_visible_pitch;
+ p_out += i_pitch - i_visible_pitch;
+ p_out_v += i_pitch - i_visible_pitch;
+ }
+
+ return VLC_SUCCESS;
+}
+#endif
+
+#ifdef CAN_COMPILE_SSE2
+int planar_sat_hue_SSE2( picture_t * p_pic, picture_t * p_outpic, int i_sin, int i_cos,
+ int i_sat, int i_x, int i_y )
+{
+ uint8_t *p_in, *p_in_v, *p_in_end, *p_line_end;
+ uint8_t *p_out, *p_out_v;
+
+ p_in = p_pic->p[U_PLANE].p_pixels;
+ p_in_v = p_pic->p[V_PLANE].p_pixels;
+ p_in_end = p_in + p_pic->p[U_PLANE].i_visible_lines
+ * p_pic->p[U_PLANE].i_pitch - 8;
+
+ p_out = p_outpic->p[U_PLANE].p_pixels;
+ p_out_v = p_outpic->p[V_PLANE].p_pixels;
+
+ uint8_t i_u, i_v;
+ __asm__ volatile(
+#if !defined(__x86_64__)
+ "pxor %%xmm0, %%xmm0\n"
+ "movd %[cos], %%xmm3\n"
+ "movd %[sin], %%xmm4\n"
+ "pslld $16, %%xmm4\n"
+ "pslld $16, %%xmm3\n"
+ "psrld $16, %%xmm3\n"
+ "pshufd $0, %%xmm3, %%xmm3\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "por %%xmm4, %%xmm3\n"
+ "movd %[sat], %%xmm4\n"
+ "pshufd $0, %%xmm4, %%xmm4\n"
+ "pcmpeqb %%xmm6, %%xmm6\n"
+ "psrlw $15, %%xmm6\n"
+ "psllw $7, %%xmm6\n"
+#else
+ "pxor %%xmm0, %%xmm0\n"
+ "movd %[cos], %%xmm6\n"
+ "movd %[sin], %%xmm7\n"
+ "pslld $16, %%xmm7\n"
+ "pslld $16, %%xmm6\n"
+ "psrld $16, %%xmm6\n"
+ "pshufd $0, %%xmm6, %%xmm6\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+ "por %%xmm7, %%xmm6\n"
+ "movd %[sat], %%xmm7\n"
+ "movd %[x], %%xmm10\n"
+ "movd %[y], %%xmm11\n"
+ "pshufd $0, %%xmm10, %%xmm10\n"
+ "pshufd $0, %%xmm11, %%xmm11\n"
+ "pshufd $0, %%xmm7, %%xmm7\n"
+
+ "pcmpeqb %%xmm12, %%xmm12\n"
+ "pcmpeqb %%xmm9, %%xmm9\n"
+ "pcmpeqb %%xmm13, %%xmm13\n"
+ "psrlw $15, %%xmm9\n"
+ "psllw $7, %%xmm9\n"
+ "psrlw $15, %%xmm13\n"
+ "mov $0x8000, %%eax\n"
+ "movd %%eax, %%xmm14\n"
+ "pshufd $0, %%xmm14, %%xmm14\n"
+ "mov $0x80008000, %%eax\n"
+ "movd %%eax, %%xmm15\n"
+ "pshufd $0, %%xmm15, %%xmm15\n"
+#endif
+ :
+ :
+ [x] "r" ( i_x ),
+ [y] "r" ( i_y ),
+ [sat] "r" ( i_sat * 0x10001 ),
+ [sin] "r" ( i_sin ),
+ [cos] "r" ( i_cos )
+ : "eax", "memory" );
+ for( ; p_in < p_in_end ; )
+ {
+ p_line_end = p_in + p_pic->p[U_PLANE].i_visible_pitch - 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ /* Do 8 pixels at a time */
+ __asm__ __volatile__ (
+ "movd (%[in]), %%xmm1\n"
+ "movd (%[in_v]), %%xmm2\n"
+ "punpcklbw %%xmm0, %%xmm1\n"
+ "punpcklbw %%xmm0, %%xmm2\n"
+ "punpcklwd %%xmm2, %%xmm1\n"
+ /////////////////////////////////////////
+ "movd 4(%[in]), %%xmm2\n"
+ "movd 4(%[in_v]), %%xmm5\n"
+ "punpcklbw %%xmm0, %%xmm2\n"
+ "punpcklbw %%xmm0, %%xmm5\n"
+ "punpcklwd %%xmm5, %%xmm2\n"
+ /////////////////////////////////////////
+#ifdef __i386__
+ "pmaddwd %%xmm3, %%xmm1\n"
+ "pmaddwd %%xmm3, %%xmm2\n"
+ "movd %[x], %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "pslld $8, %%xmm1\n"
+ "psrld $16, %%xmm1\n"
+ "pslld $8, %%xmm2\n"
+ "psrld $16, %%xmm2\n"
+ "mov $0x8000, %%eax\n"
+ "movd %%eax, %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "packssdw %%xmm2, %%xmm1\n"
+ "pshuflw $0, %%xmm5, %%xmm5\n"
+ "pshufhw $0, %%xmm5, %%xmm5\n"
+ "paddw %%xmm5, %%xmm1\n"
+ "pmullw %%xmm4, %%xmm1\n"
+ "psraw $8, %%xmm1\n"
+ "paddw %%xmm6, %%xmm1\n"
+ "packuswb %%xmm0, %%xmm1\n"
+ "movq %%xmm1, (%[out])\n"
+ "pcmpeqb %%xmm6, %%xmm6\n"
+ "movq (%[in]), %%xmm5\n"
+ "movd (%[in_v]), %%xmm1\n"
+ "movd 4(%[in_v]), %%xmm2\n"
+ "punpcklbw %%xmm0, %%xmm5\n"
+ "pandn %%xmm6, %%xmm5\n"
+ "punpcklbw %%xmm0, %%xmm1\n"
+ "punpcklbw %%xmm0, %%xmm2\n"
+ "paddw %%xmm6, %%xmm5\n"
+ "psrlw $15, %%xmm6\n"
+ "punpcklwd %%xmm5, %%xmm1\n"
+ "punpckhqdq %%xmm5, %%xmm5\n"
+ "punpcklwd %%xmm5, %%xmm2\n"
+
+ "psllw $7, %%xmm6\n"
+ "pmaddwd %%xmm3, %%xmm1\n"
+ "pmaddwd %%xmm3, %%xmm2\n"
+ "movd %[y], %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "pslld $8, %%xmm1\n"
+ "psrld $16, %%xmm1\n"
+ "pslld $8, %%xmm2\n"
+ "psrld $16, %%xmm2\n"
+
+ "mov $0x8000, %%eax\n"
+ "movd %%eax, %%xmm5\n"
+ "pshufd $0, %%xmm5, %%xmm5\n"
+ "psubd %%xmm5, %%xmm2\n"
+ "psubd %%xmm5, %%xmm1\n"
+ "packssdw %%xmm2, %%xmm1\n"
+ "pshuflw $0, %%xmm5, %%xmm5\n"
+ "pshufhw $0, %%xmm5, %%xmm5\n"
+ "paddw %%xmm5, %%xmm1\n"
+ "pmullw %%xmm4, %%xmm1\n"
+ "psraw $8, %%xmm1\n"
+ "paddw %%xmm6, %%xmm1\n"
+ "packuswb %%xmm0, %%xmm1\n"
+ "movq %%xmm1, (%[out_v])\n"
+#elif defined(__x86_64__)
+ /////////////////////////////////////////
+ "movq (%[in]), %%xmm5\n"
+ "movd (%[in_v]), %%xmm3\n"
+ "movd 4(%[in_v]), %%xmm4\n"
+ "punpcklbw %%xmm0, %%xmm5\n"
+ "pandn %%xmm12, %%xmm5\n" // invert U (to be subtracted)
+ "punpcklbw %%xmm0, %%xmm3\n"
+ "paddw %%xmm13, %%xmm5\n" // add 1
+ "punpcklbw %%xmm0, %%xmm4\n"
+ "punpcklwd %%xmm5, %%xmm3\n"
+ "punpckhqdq %%xmm5, %%xmm5\n"
+ "punpcklwd %%xmm5, %%xmm4\n"
+ /////////////////////////////////////////
+ "pmaddwd %%xmm6, %%xmm1\n"
+ "pmaddwd %%xmm6, %%xmm2\n"
+ "pmaddwd %%xmm6, %%xmm3\n"
+ "pmaddwd %%xmm6, %%xmm4\n"
+ "psubd %%xmm10, %%xmm1\n"
+ "psubd %%xmm10, %%xmm2\n"
+ "psubd %%xmm11, %%xmm3\n"
+ "psubd %%xmm11, %%xmm4\n"
+ "pslld $8, %%xmm1\n"
+ "pslld $8, %%xmm2\n"
+ "pslld $8, %%xmm3\n"
+ "pslld $8, %%xmm4\n"
+ "psrld $16, %%xmm1\n"
+ "psrld $16, %%xmm2\n"
+ "psrld $16, %%xmm3\n"
+ "psrld $16, %%xmm4\n"
+ "psubd %%xmm14, %%xmm1\n"
+ "psubd %%xmm14, %%xmm2\n"
+ "psubd %%xmm14, %%xmm3\n"
+ "psubd %%xmm14, %%xmm4\n"
+ "packssdw %%xmm2, %%xmm1\n"
+ "packssdw %%xmm4, %%xmm3\n"
+ "paddw %%xmm15, %%xmm1\n"
+ "paddw %%xmm15, %%xmm3\n"
+ "pmullw %%xmm7, %%xmm1\n"
+ "pmullw %%xmm7, %%xmm3\n"
+ "psraw $8, %%xmm1\n"
+ "psraw $8, %%xmm3\n"
+ "paddw %%xmm9, %%xmm1\n"
+ "paddw %%xmm9, %%xmm3\n"
+ "packuswb %%xmm0, %%xmm1\n"
+ "packuswb %%xmm0, %%xmm3\n"
+ "movq %%xmm1, (%[out])\n"
+ "movq %%xmm3, (%[out_v])\n"
+#endif
+ :
+ : [in] "r" (p_in),
+ [in_v] "r" (p_in_v),
+ [x] "r" (i_x),
+ [y] "r" (i_y),
+ [cos] "r" (i_cos),
+ [sin] "r" (i_sin),
+ [sat] "r" (i_sat),
+ [out] "r" (p_out),
+ [out_v] "r" (p_out_v)
+ : "eax", "memory" );
+ }
+
+ p_line_end += 8;
+
+ for( ; p_in < p_line_end ; )
+ {
+ PLANAR_WRITE_UV();
+ }
+
+ p_in += p_pic->p[U_PLANE].i_pitch
+ - p_pic->p[U_PLANE].i_visible_pitch;
+ p_in_v += p_pic->p[V_PLANE].i_pitch
+ - p_pic->p[V_PLANE].i_visible_pitch;
+ p_out += p_outpic->p[U_PLANE].i_pitch
+ - p_outpic->p[U_PLANE].i_visible_pitch;
+ p_out_v += p_outpic->p[V_PLANE].i_pitch
+ - p_outpic->p[V_PLANE].i_visible_pitch;
+ }
+
+ return VLC_SUCCESS;
+}
+#endif
+
int planar_sat_hue_clip_C( picture_t * p_pic, picture_t * p_outpic, int i_sin, int i_cos,
int i_sat, int i_x, int i_y )
{
diff --git a/modules/video_filter/adjust_sat_hue.h b/modules/video_filter/adjust_sat_hue.h
index 1e177fe..1ac51c2 100644
--- a/modules/video_filter/adjust_sat_hue.h
+++ b/modules/video_filter/adjust_sat_hue.h
@@ -39,6 +39,40 @@
*/
/**
+ * SSE4.1 version of the function for planar format, i_sat > 256
+ */
+#ifdef CAN_COMPILE_SSE4_1
+int planar_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+ int i_sin, int i_cos, int i_sat, int i_x,
+ int i_y );
+#endif
+
+/**
+ * SSE4.1 version of the function for packed format, i_sat > 256
+ */
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_clip_SSE41( picture_t * p_pic, picture_t * p_outpic,
+ int i_sin, int i_cos, int i_sat, int i_x,
+ int i_y );
+#endif
+
+/**
+ * SSE4.1 version of the function for packed format, i_sat <= 256
+ */
+#ifdef CAN_COMPILE_SSE4_1
+int packed_sat_hue_SSE41( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+ int i_cos, int i_sat, int i_x, int i_y );
+#endif
+
+/**
+ * SSE2 version of the function for planar format, i_sat <= 256
+ */
+#ifdef CAN_COMPILE_SSE2
+int planar_sat_hue_SSE2( picture_t * p_pic, picture_t * p_outpic, int i_sin,
+ int i_cos, int i_sat, int i_x, int i_y );
+#endif
+
+/**
* Basic C compiler generated function for planar format, i_sat > 256
*/
int planar_sat_hue_clip_C( picture_t * p_pic, picture_t * p_outpic,
--
1.7.1