[x265] [PATCH] asm: avx2 8bpp code for filter_p2s[4x4](2.26x), filter_p2s[4x8](3.01x), filter_p2s[4x16](3.00x)
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Tue Mar 10 15:05:25 CET 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1425996251 -19800
# Tue Mar 10 19:34:11 2015 +0530
# Node ID fdfd37fe64245837628ae0445749811a281e3aae
# Parent 2dc6b50681ccc8b3a5123ea02728786de9aca7a4
asm: avx2 8bpp code for filter_p2s[4x4](2.26x), filter_p2s[4x8](3.01x),
filter_p2s[4x16](3.00x)
diff -r 2dc6b50681cc -r fdfd37fe6424 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Mar 10 18:41:56 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Mar 10 19:34:11 2015 +0530
@@ -1479,6 +1479,10 @@
p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2;
p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_avx2;
+ p.pu[LUMA_4x4].filter_p2s = x265_pixelToShort_4x4_avx2;
+ p.pu[LUMA_4x8].filter_p2s = x265_pixelToShort_4x8_avx2;
+ p.pu[LUMA_4x16].filter_p2s = x265_pixelToShort_4x16_avx2;
+
p.scale1D_128to64 = x265_scale1D_128to64_avx2;
p.weight_pp = x265_weight_pp_avx2;
diff -r 2dc6b50681cc -r fdfd37fe6424 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Tue Mar 10 18:41:56 2015 +0530
+++ b/source/common/x86/ipfilter8.asm Tue Mar 10 19:34:11 2015 +0530
@@ -6027,6 +6027,51 @@
PIXEL_WH_4xN 4, 4
PIXEL_WH_4xN 4, 8
PIXEL_WH_4xN 4, 16
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+; %1 = block height; 4 rows per %rep pass; dst rows are FENC_STRIDE int16 apart
+;-----------------------------------------------------------------------------
+%macro P2S_H_4xN_avx2 1
+INIT_YMM avx2
+cglobal pixelToShort_4x%1, 3, 4, 6
+
+    ; NOTE(review): presumably pb_128 holds bytes of 128 and tab_c_64_n64 holds
+    ; (64, -64) byte pairs, so pmaddubsw gives pixel * 64 - 8192 - confirm in tables
+    vbroadcasti128  m4, [pb_128]
+    vbroadcasti128  m5, [tab_c_64_n64]
+    lea             r3, [r1 * 3]            ; r3 = 3 * srcStride, invariant across passes
+
+%rep %1 / 4
+    ; load four rows of 4 pixels each
+    movd            xm0, [r0]
+    movd            xm1, [r0 + r1]
+    movd            xm2, [r0 + r1 * 2]
+    movd            xm3, [r0 + r3]
+
+    punpckldq       m0, m0, m1              ; rows 0 and 1 side by side
+    punpckldq       m2, m2, m3              ; rows 2 and 3 side by side
+    punpcklbw       m0, m4                  ; interleave each pixel with a byte of m4
+    punpcklbw       m2, m4
+    vinserti128     m2, m0, xm2, 1          ; low lane = rows 0-1, high lane = rows 2-3
+    pmaddubsw       m2, m5
+
+    movq            [r2 + FENC_STRIDE * 0], xm2
+    movhps          [r2 + FENC_STRIDE * 2], xm2
+    vextracti128    xm2, m2, 1
+    movq            [r2 + FENC_STRIDE * 4], xm2
+    movhps          [r2 + FENC_STRIDE * 6], xm2
+
+    lea             r0, [r0 + r1 * 4]
+    add             r2, FENC_STRIDE * 8
+
+%endrep
+    RET
+%endmacro
+P2S_H_4xN_avx2 4
+P2S_H_4xN_avx2 8
+P2S_H_4xN_avx2 16
+
;-----------------------------------------------------------------------------
; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
;-----------------------------------------------------------------------------
@@ -6528,7 +6573,7 @@
movu [r2 + FENC_STRIDE * 6 + 64], m7
movu [r2 + FENC_STRIDE * 6 + 80], m6
- lea r0, [r0 + r1 * 4]
+ lea r0, [r0 + r1 * 4]
add r2, FENC_STRIDE * 8
%endrep
RET
diff -r 2dc6b50681cc -r fdfd37fe6424 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Tue Mar 10 18:41:56 2015 +0530
+++ b/source/common/x86/ipfilter8.h Tue Mar 10 19:34:11 2015 +0530
@@ -646,6 +646,10 @@
void x265_pixelToShort_24x32_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
void x265_pixelToShort_48x64_ssse3(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_4x4_avx2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_4x8_avx2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_4x16_avx2(const pixel* src, intptr_t srcStride, int16_t* dst);
+
#undef LUMA_FILTERS
#undef LUMA_SP_FILTERS
#undef LUMA_SS_FILTERS
More information about the x265-devel
mailing list