[x264-devel] x86: AVX2 predict_8x8c_p/predict_8x16c_p
Henrik Gramner
git at videolan.org
Tue Apr 23 23:37:09 CEST 2013
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Tue Apr 16 23:27:22 2013 +0200| [dcad117131f0e0b5032bf5ca8c27def7fcdce17f] | committer: Jason Garrett-Glaser
x86: AVX2 predict_8x8c_p/predict_8x16c_p
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=dcad117131f0e0b5032bf5ca8c27def7fcdce17f
---
common/x86/predict-a.asm | 95 ++++++++++++++++++++++++++--------------------
common/x86/predict-c.c | 24 ++++++++----
common/x86/predict.h | 6 ++-
3 files changed, 74 insertions(+), 51 deletions(-)
diff --git a/common/x86/predict-a.asm b/common/x86/predict-a.asm
index ecf1cd5..5ccccb5 100644
--- a/common/x86/predict-a.asm
+++ b/common/x86/predict-a.asm
@@ -32,9 +32,9 @@
SECTION_RODATA 32
pw_0to15: dw 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
-pw_43210123: dw -3, -2, -1, 0, 1, 2, 3, 4
-pw_m3: times 8 dw -3
-pw_m7: times 8 dw -7
+pw_43210123: times 2 dw -3, -2, -1, 0, 1, 2, 3, 4
+pw_m3: times 16 dw -3
+pw_m7: times 16 dw -7
pb_00s_ff: times 8 db 0
pb_0s_ff: times 7 db 0
db 0xff
@@ -1122,17 +1122,12 @@ PREDICT_CHROMA_P_MMX 8
PREDICT_CHROMA_P_MMX 16
%endif ; !ARCH_X86_64 && !HIGH_BIT_DEPTH
-%macro PREDICT_CHROMA_P_XMM 1
+%macro PREDICT_CHROMA_P 1
%if HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2,7
- movd m0, r1m
- movd m2, r2m
- movd m4, r3m
+ LOAD_PLANE_ARGS
mova m3, [pw_pixel_max]
pxor m1, m1
- SPLATW m0, m0, 0
- SPLATW m2, m2, 0
- SPLATW m4, m4, 0
pmullw m2, [pw_43210123] ; b
%if %1 == 16
pmullw m5, m4, [pw_m7] ; c
@@ -1140,59 +1135,77 @@ cglobal predict_8x%1c_p_core, 1,2,7
pmullw m5, m4, [pw_m3]
%endif
paddw m5, [pw_16]
- mov r1d, %1
+%if mmsize == 32
+ mova xm6, xm4
+ paddw m4, m4
+ paddw m5, m6
+%endif
+ mov r1d, %1/(mmsize/16)
.loop:
paddsw m6, m2, m5
paddsw m6, m0
psraw m6, 5
CLIPW m6, m1, m3
- mova [r0], m6
paddw m5, m4
+%if mmsize == 32
+ vextracti128 [r0], m6, 1
+ mova [r0+FDEC_STRIDEB], xm6
+ add r0, 2*FDEC_STRIDEB
+%else
+ mova [r0], m6
add r0, FDEC_STRIDEB
+%endif
dec r1d
jg .loop
RET
%else ; !HIGH_BIT_DEPTH
cglobal predict_8x%1c_p_core, 1,2
- movd m0, r1m
- movd m2, r2m
- movd m4, r3m
- SPLATW m0, m0, 0
- SPLATW m2, m2, 0
- SPLATW m4, m4, 0
+ LOAD_PLANE_ARGS
+%if mmsize == 32
+ vbroadcasti128 m1, [pw_0to15] ; 0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
+ pmullw m2, m1
+ mova xm1, xm4 ; zero upper half
+ paddsw m4, m4
+ paddsw m0, m1
+%else
pmullw m2, [pw_0to15]
- paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
- paddsw m3, m0, m4
+%endif
+ paddsw m0, m2 ; m0 = {i+0*b, i+1*b, i+2*b, i+3*b, i+4*b, i+5*b, i+6*b, i+7*b}
+ paddsw m1, m0, m4
paddsw m4, m4
- mov r1d, %1/4
+ mov r1d, %1/(mmsize/8)
.loop:
- paddsw m1, m3, m4
- paddsw m5, m0, m4
- psraw m3, 5
- psraw m0, 5
- packuswb m0, m3
- movq [r0+FDEC_STRIDE*0], m0
- movhps [r0+FDEC_STRIDE*1], m0
- paddsw m0, m5, m4
- paddsw m3, m1, m4
- psraw m5, 5
- psraw m1, 5
- packuswb m5, m1
- movq [r0+FDEC_STRIDE*2], m5
- movhps [r0+FDEC_STRIDE*3], m5
- add r0, FDEC_STRIDE*4
+ psraw m2, m0, 5
+ psraw m3, m1, 5
+ paddsw m0, m4
+ paddsw m1, m4
+ packuswb m2, m3
+%if mmsize == 32
+ movq [r0+FDEC_STRIDE*1], xm2
+ movhps [r0+FDEC_STRIDE*3], xm2
+ vextracti128 xm2, m2, 1
+ movq [r0+FDEC_STRIDE*0], xm2
+ movhps [r0+FDEC_STRIDE*2], xm2
+%else
+ movq [r0+FDEC_STRIDE*0], xm2
+ movhps [r0+FDEC_STRIDE*1], xm2
+%endif
+ add r0, FDEC_STRIDE*mmsize/8
dec r1d
jg .loop
RET
%endif ; HIGH_BIT_DEPTH
-%endmacro ; PREDICT_CHROMA_P_XMM
+%endmacro ; PREDICT_CHROMA_P
INIT_XMM sse2
-PREDICT_CHROMA_P_XMM 8
-PREDICT_CHROMA_P_XMM 16
+PREDICT_CHROMA_P 8
+PREDICT_CHROMA_P 16
INIT_XMM avx
-PREDICT_CHROMA_P_XMM 8
-PREDICT_CHROMA_P_XMM 16
+PREDICT_CHROMA_P 8
+PREDICT_CHROMA_P 16
+INIT_YMM avx2
+PREDICT_CHROMA_P 8
+PREDICT_CHROMA_P 16
;-----------------------------------------------------------------------------
; void predict_16x16_p_core( uint8_t *src, int i00, int b, int c )
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 1091908..eccf86b 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -115,9 +115,6 @@ static void x264_predict_8x16c_p_##name( uint16_t *src )\
PREDICT_8x16C_P_CORE \
x264_predict_8x16c_p_core_##name( src, a, b, c );\
}
-
-PREDICT_8x16_P(sse2)
-PREDICT_8x16_P(avx)
#else
#define PREDICT_8x16_P(name)\
static void x264_predict_8x16c_p_##name( uint8_t *src )\
@@ -129,9 +126,10 @@ static void x264_predict_8x16c_p_##name( uint8_t *src )\
#ifndef ARCH_X86_64
PREDICT_8x16_P(mmx2)
#endif
+#endif
PREDICT_8x16_P(sse2)
PREDICT_8x16_P(avx)
-#endif
+PREDICT_8x16_P(avx2)
#if HAVE_X86_INLINE_ASM
#if HIGH_BIT_DEPTH
@@ -225,9 +223,9 @@ static void x264_predict_8x8c_p_##name( uint8_t *src )\
x264_predict_8x8c_p_core_##name( src, i00, b, c );\
}
#ifndef ARCH_X86_64
-PREDICT_8x8_P( mmx2 )
+PREDICT_8x8_P(mmx2)
#endif
-PREDICT_8x8_P( sse2 )
+PREDICT_8x8_P(sse2)
#endif //!HIGH_BIT_DEPTH
@@ -264,7 +262,6 @@ static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\
}
PREDICT_8x8_P2(sse2, sse2)
-PREDICT_8x8_P2( avx, avx)
#else //!HIGH_BIT_DEPTH
#define PREDICT_8x8_P2(cpu1, cpu2)\
@@ -289,8 +286,9 @@ static void x264_predict_8x8c_p_ ## cpu1( pixel *src )\
}
PREDICT_8x8_P2(ssse3, sse2)
-PREDICT_8x8_P2( avx, avx)
#endif
+PREDICT_8x8_P2( avx, avx)
+PREDICT_8x8_P2( avx2, avx2)
#endif
#if ARCH_X86_64 && !HIGH_BIT_DEPTH
@@ -439,6 +437,11 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx;
#endif
#endif // HIGH_BIT_DEPTH
+
+ if( cpu&X264_CPU_AVX2 )
+ {
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_avx2;
+ }
}
void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] )
@@ -485,6 +488,11 @@ void x264_predict_8x16c_init_mmx( int cpu, x264_predict_t pf[7] )
return;
pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx;
#endif // HIGH_BIT_DEPTH
+
+ if( cpu&X264_CPU_AVX2 )
+ {
+ pf[I_PRED_CHROMA_P] = x264_predict_8x16c_p_avx2;
+ }
}
void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
diff --git a/common/x86/predict.h b/common/x86/predict.h
index c196497..7691c09 100644
--- a/common/x86/predict.h
+++ b/common/x86/predict.h
@@ -65,10 +65,12 @@ void x264_predict_8x16c_h_ssse3( uint8_t *src );
void x264_predict_8x16c_h_avx2( uint16_t *src );
void x264_predict_8x16c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x16c_p_core_sse2( pixel *src, int i00, int b, int c );
-void x264_predict_8x16c_p_core_avx( pixel *src, int i00, int b, int c );
+void x264_predict_8x16c_p_core_avx ( pixel *src, int i00, int b, int c );
+void x264_predict_8x16c_p_core_avx2( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_mmx2( uint8_t *src, int i00, int b, int c );
void x264_predict_8x8c_p_core_sse2( pixel *src, int i00, int b, int c );
-void x264_predict_8x8c_p_core_avx( pixel *src, int i00, int b, int c );
+void x264_predict_8x8c_p_core_avx ( pixel *src, int i00, int b, int c );
+void x264_predict_8x8c_p_core_avx2( pixel *src, int i00, int b, int c );
void x264_predict_8x8c_dc_mmx2( pixel *src );
void x264_predict_8x8c_dc_sse2( uint16_t *src );
void x264_predict_8x8c_dc_top_mmx2( uint8_t *src );
More information about the x264-devel mailing list