[x265] [PATCH] x86: split ipfilter8 kernels into two different source file
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Feb 20 10:05:57 CET 2018
# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1516343663 -19800
# Fri Jan 19 12:04:23 2018 +0530
# Node ID 55a15ecc1110f206199db1b0f997272b5f7ddc82
# Parent 52782aeb20818273cbf749d221647a254b26c4a4
x86: split ipfilter8 kernels into two different source file
This patch implements infrastructure to split the ipfilter8 asm source file into two
different files in order to avoid long build times. It moves the interp_8tap_horizontal
kernels to the newly created file.
diff -r 52782aeb2081 -r 55a15ecc1110 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt Fri Feb 16 11:40:59 2018 +0530
+++ b/source/common/CMakeLists.txt Fri Jan 19 12:04:23 2018 +0530
@@ -56,17 +56,15 @@
endif()
set(VEC_PRIMITIVES vec/vec-primitives.cpp ${PRIMITIVES})
source_group(Intrinsics FILES ${VEC_PRIMITIVES})
-
- set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h seaintegral.h)
+ set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h h-ipfilter8.h loopfilter.h seaintegral.h)
set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm
mc-a2.asm pixel-util8.asm blockcopy8.asm
pixeladd8.asm dct8.asm seaintegral.asm)
if(HIGH_BIT_DEPTH)
set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm loopfilter.asm)
else()
- set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm ipfilter8.asm loopfilter.asm)
+ set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm h-ipfilter8.asm ipfilter8.asm loopfilter.asm)
endif()
-
if(NOT X64)
set(A_SRCS ${A_SRCS} pixel-32.asm)
endif()
diff -r 52782aeb2081 -r 55a15ecc1110 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Feb 16 11:40:59 2018 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Jan 19 12:04:23 2018 +0530
@@ -115,8 +115,8 @@
#include "intrapred.h"
#include "dct8.h"
#include "seaintegral.h"
+#include "h-ipfilter8.h"
}
-
#define ALL_LUMA_CU_TYPED(prim, fncdef, fname, cpu) \
p.cu[BLOCK_8x8].prim = fncdef PFX(fname ## _8x8_ ## cpu); \
p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
diff -r 52782aeb2081 -r 55a15ecc1110 source/common/x86/h-ipfilter8.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/h-ipfilter8.asm Fri Jan 19 12:04:23 2018 +0530
@@ -0,0 +1,267 @@
+;*****************************************************************************
+;* Copyright (C) 2013-2017 MulticoreWare, Inc
+;*
+;* Authors: Min Chen <chenm003 at 163.com>
+;* Nabajit Deka <nabajit at multicorewareinc.com>
+;* Praveen Kumar Tiwari <praveen at multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at license @ x265.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+
+const h_tabw_LumaCoeff, dw 0, 0, 0, 64, 0, 0, 0, 0
+ dw -1, 4, -10, 58, 17, -5, 1, 0
+ dw -1, 4, -11, 40, 40, -11, 4, -1
+ dw 0, 1, -5, 17, 58, -10, 4, -1
+
+SECTION .text
+
+cextern pw_32
+cextern pw_2000
+
+%macro FILTER_H8_W8_sse2 0
+ movh m1, [r0 + x - 3]
+ movh m4, [r0 + x - 2]
+ punpcklbw m1, m6
+ punpcklbw m4, m6
+ movh m5, [r0 + x - 1]
+ movh m0, [r0 + x]
+ punpcklbw m5, m6
+ punpcklbw m0, m6
+ pmaddwd m1, m3
+ pmaddwd m4, m3
+ pmaddwd m5, m3
+ pmaddwd m0, m3
+ packssdw m1, m4
+ packssdw m5, m0
+ pshuflw m4, m1, q2301
+ pshufhw m4, m4, q2301
+ pshuflw m0, m5, q2301
+ pshufhw m0, m0, q2301
+ paddw m1, m4
+ paddw m5, m0
+ psrldq m1, 2
+ psrldq m5, 2
+ pshufd m1, m1, q3120
+ pshufd m5, m5, q3120
+ punpcklqdq m1, m5
+ movh m7, [r0 + x + 1]
+ movh m4, [r0 + x + 2]
+ punpcklbw m7, m6
+ punpcklbw m4, m6
+ movh m5, [r0 + x + 3]
+ movh m0, [r0 + x + 4]
+ punpcklbw m5, m6
+ punpcklbw m0, m6
+ pmaddwd m7, m3
+ pmaddwd m4, m3
+ pmaddwd m5, m3
+ pmaddwd m0, m3
+ packssdw m7, m4
+ packssdw m5, m0
+ pshuflw m4, m7, q2301
+ pshufhw m4, m4, q2301
+ pshuflw m0, m5, q2301
+ pshufhw m0, m0, q2301
+ paddw m7, m4
+ paddw m5, m0
+ psrldq m7, 2
+ psrldq m5, 2
+ pshufd m7, m7, q3120
+ pshufd m5, m5, q3120
+ punpcklqdq m7, m5
+ pshuflw m4, m1, q2301
+ pshufhw m4, m4, q2301
+ pshuflw m0, m7, q2301
+ pshufhw m0, m0, q2301
+ paddw m1, m4
+ paddw m7, m0
+ psrldq m1, 2
+ psrldq m7, 2
+ pshufd m1, m1, q3120
+ pshufd m7, m7, q3120
+ punpcklqdq m1, m7
+%endmacro
+
+%macro FILTER_H8_W4_sse2 0
+ movh m1, [r0 + x - 3]
+ movh m0, [r0 + x - 2]
+ punpcklbw m1, m6
+ punpcklbw m0, m6
+ movh m4, [r0 + x - 1]
+ movh m5, [r0 + x]
+ punpcklbw m4, m6
+ punpcklbw m5, m6
+ pmaddwd m1, m3
+ pmaddwd m0, m3
+ pmaddwd m4, m3
+ pmaddwd m5, m3
+ packssdw m1, m0
+ packssdw m4, m5
+ pshuflw m0, m1, q2301
+ pshufhw m0, m0, q2301
+ pshuflw m5, m4, q2301
+ pshufhw m5, m5, q2301
+ paddw m1, m0
+ paddw m4, m5
+ psrldq m1, 2
+ psrldq m4, 2
+ pshufd m1, m1, q3120
+ pshufd m4, m4, q3120
+ punpcklqdq m1, m4
+ pshuflw m0, m1, q2301
+ pshufhw m0, m0, q2301
+ paddw m1, m0
+ psrldq m1, 2
+ pshufd m1, m1, q3120
+%endmacro
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+%macro IPFILTER_LUMA_sse2 3
+INIT_XMM sse2
+cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8
+ mov r4d, r4m
+ add r4d, r4d
+ pxor m6, m6
+
+%ifidn %3, ps
+ add r3d, r3d
+ cmp r5m, byte 0
+%endif
+
+%ifdef PIC
+ lea r5, [h_tabw_LumaCoeff]
+ movu m3, [r5 + r4 * 8]
+%else
+ movu m3, [h_tabw_LumaCoeff + r4 * 8]
+%endif
+
+ mov r4d, %2
+
+%ifidn %3, pp
+ mova m2, [pw_32]
+%else
+ mova m2, [pw_2000]
+ je .loopH
+ lea r5, [r1 + 2 * r1]
+ sub r0, r5
+ add r4d, 7
+%endif
+
+.loopH:
+%assign x 0
+%rep %1 / 8
+ FILTER_H8_W8_sse2
+ %ifidn %3, pp
+ paddw m1, m2
+ psraw m1, 6
+ packuswb m1, m1
+ movh [r2 + x], m1
+ %else
+ psubw m1, m2
+ movu [r2 + 2 * x], m1
+ %endif
+%assign x x+8
+%endrep
+
+%rep (%1 % 8) / 4
+ FILTER_H8_W4_sse2
+ %ifidn %3, pp
+ paddw m1, m2
+ psraw m1, 6
+ packuswb m1, m1
+ movd [r2 + x], m1
+ %else
+ psubw m1, m2
+ movh [r2 + 2 * x], m1
+ %endif
+%endrep
+
+ add r0, r1
+ add r2, r3
+
+ dec r4d
+ jnz .loopH
+ RET
+
+%endmacro
+
+;--------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
+;--------------------------------------------------------------------------------------------------------------
+ IPFILTER_LUMA_sse2 4, 4, pp
+ IPFILTER_LUMA_sse2 4, 8, pp
+ IPFILTER_LUMA_sse2 8, 4, pp
+ IPFILTER_LUMA_sse2 8, 8, pp
+ IPFILTER_LUMA_sse2 16, 16, pp
+ IPFILTER_LUMA_sse2 16, 8, pp
+ IPFILTER_LUMA_sse2 8, 16, pp
+ IPFILTER_LUMA_sse2 16, 12, pp
+ IPFILTER_LUMA_sse2 12, 16, pp
+ IPFILTER_LUMA_sse2 16, 4, pp
+ IPFILTER_LUMA_sse2 4, 16, pp
+ IPFILTER_LUMA_sse2 32, 32, pp
+ IPFILTER_LUMA_sse2 32, 16, pp
+ IPFILTER_LUMA_sse2 16, 32, pp
+ IPFILTER_LUMA_sse2 32, 24, pp
+ IPFILTER_LUMA_sse2 24, 32, pp
+ IPFILTER_LUMA_sse2 32, 8, pp
+ IPFILTER_LUMA_sse2 8, 32, pp
+ IPFILTER_LUMA_sse2 64, 64, pp
+ IPFILTER_LUMA_sse2 64, 32, pp
+ IPFILTER_LUMA_sse2 32, 64, pp
+ IPFILTER_LUMA_sse2 64, 48, pp
+ IPFILTER_LUMA_sse2 48, 64, pp
+ IPFILTER_LUMA_sse2 64, 16, pp
+ IPFILTER_LUMA_sse2 16, 64, pp
+
+;----------------------------------------------------------------------------------------------------------------------------
+; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
+;----------------------------------------------------------------------------------------------------------------------------
+ IPFILTER_LUMA_sse2 4, 4, ps
+ IPFILTER_LUMA_sse2 8, 8, ps
+ IPFILTER_LUMA_sse2 8, 4, ps
+ IPFILTER_LUMA_sse2 4, 8, ps
+ IPFILTER_LUMA_sse2 16, 16, ps
+ IPFILTER_LUMA_sse2 16, 8, ps
+ IPFILTER_LUMA_sse2 8, 16, ps
+ IPFILTER_LUMA_sse2 16, 12, ps
+ IPFILTER_LUMA_sse2 12, 16, ps
+ IPFILTER_LUMA_sse2 16, 4, ps
+ IPFILTER_LUMA_sse2 4, 16, ps
+ IPFILTER_LUMA_sse2 32, 32, ps
+ IPFILTER_LUMA_sse2 32, 16, ps
+ IPFILTER_LUMA_sse2 16, 32, ps
+ IPFILTER_LUMA_sse2 32, 24, ps
+ IPFILTER_LUMA_sse2 24, 32, ps
+ IPFILTER_LUMA_sse2 32, 8, ps
+ IPFILTER_LUMA_sse2 8, 32, ps
+ IPFILTER_LUMA_sse2 64, 64, ps
+ IPFILTER_LUMA_sse2 64, 32, ps
+ IPFILTER_LUMA_sse2 32, 64, ps
+ IPFILTER_LUMA_sse2 64, 48, ps
+ IPFILTER_LUMA_sse2 48, 64, ps
+ IPFILTER_LUMA_sse2 64, 16, ps
+ IPFILTER_LUMA_sse2 16, 64, ps
+
diff -r 52782aeb2081 -r 55a15ecc1110 source/common/x86/h-ipfilter8.h
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/h-ipfilter8.h Fri Jan 19 12:04:23 2018 +0530
@@ -0,0 +1,39 @@
+/*****************************************************************************
+* Copyright (C) 2013-2017 MulticoreWare, Inc
+*
+* Authors: Steve Borho <steve at borho.org>
+* Praveen Kumar Tiwari <praveen at multicorewareinc.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#ifndef X265_H_IPFILTER8_H
+#define X265_H_IPFILTER8_H
+
+
+#define SETUP_H_FUNC_DEF(cpu) \
+ FUNCDEF_PU(void, interp_8tap_horiz_pp, cpu, const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx); \
+ FUNCDEF_PU(void, interp_8tap_horiz_ps, cpu, const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt);
+
+SETUP_H_FUNC_DEF(sse2);
+SETUP_H_FUNC_DEF(ssse3);
+SETUP_H_FUNC_DEF(sse3);
+SETUP_H_FUNC_DEF(sse4);
+SETUP_H_FUNC_DEF(avx2);
+
+#endif // ifndef X265_H_IPFILTER8_H
diff -r 52782aeb2081 -r 55a15ecc1110 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Fri Feb 16 11:40:59 2018 +0530
+++ b/source/common/x86/ipfilter8.asm Fri Jan 19 12:04:23 2018 +0530
@@ -855,137 +855,6 @@
psrldq m1, 2
pshufd m1, m1, q3120
%endmacro
-
-;----------------------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_%3_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-;----------------------------------------------------------------------------------------------------------------------------
-%macro IPFILTER_LUMA_sse2 3
-INIT_XMM sse2
-cglobal interp_8tap_horiz_%3_%1x%2, 4,6,8
- mov r4d, r4m
- add r4d, r4d
- pxor m6, m6
-
-%ifidn %3, ps
- add r3d, r3d
- cmp r5m, byte 0
-%endif
-
-%ifdef PIC
- lea r5, [tabw_LumaCoeff]
- movu m3, [r5 + r4 * 8]
-%else
- movu m3, [tabw_LumaCoeff + r4 * 8]
-%endif
-
- mov r4d, %2
-
-%ifidn %3, pp
- mova m2, [pw_32]
-%else
- mova m2, [pw_2000]
- je .loopH
- lea r5, [r1 + 2 * r1]
- sub r0, r5
- add r4d, 7
-%endif
-
-.loopH:
-%assign x 0
-%rep %1 / 8
- FILTER_H8_W8_sse2
- %ifidn %3, pp
- paddw m1, m2
- psraw m1, 6
- packuswb m1, m1
- movh [r2 + x], m1
- %else
- psubw m1, m2
- movu [r2 + 2 * x], m1
- %endif
-%assign x x+8
-%endrep
-
-%rep (%1 % 8) / 4
- FILTER_H8_W4_sse2
- %ifidn %3, pp
- paddw m1, m2
- psraw m1, 6
- packuswb m1, m1
- movd [r2 + x], m1
- %else
- psubw m1, m2
- movh [r2 + 2 * x], m1
- %endif
-%endrep
-
- add r0, r1
- add r2, r3
-
- dec r4d
- jnz .loopH
- RET
-
-%endmacro
-
-;--------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_pp_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
-;--------------------------------------------------------------------------------------------------------------
- IPFILTER_LUMA_sse2 4, 4, pp
- IPFILTER_LUMA_sse2 4, 8, pp
- IPFILTER_LUMA_sse2 8, 4, pp
- IPFILTER_LUMA_sse2 8, 8, pp
- IPFILTER_LUMA_sse2 16, 16, pp
- IPFILTER_LUMA_sse2 16, 8, pp
- IPFILTER_LUMA_sse2 8, 16, pp
- IPFILTER_LUMA_sse2 16, 12, pp
- IPFILTER_LUMA_sse2 12, 16, pp
- IPFILTER_LUMA_sse2 16, 4, pp
- IPFILTER_LUMA_sse2 4, 16, pp
- IPFILTER_LUMA_sse2 32, 32, pp
- IPFILTER_LUMA_sse2 32, 16, pp
- IPFILTER_LUMA_sse2 16, 32, pp
- IPFILTER_LUMA_sse2 32, 24, pp
- IPFILTER_LUMA_sse2 24, 32, pp
- IPFILTER_LUMA_sse2 32, 8, pp
- IPFILTER_LUMA_sse2 8, 32, pp
- IPFILTER_LUMA_sse2 64, 64, pp
- IPFILTER_LUMA_sse2 64, 32, pp
- IPFILTER_LUMA_sse2 32, 64, pp
- IPFILTER_LUMA_sse2 64, 48, pp
- IPFILTER_LUMA_sse2 48, 64, pp
- IPFILTER_LUMA_sse2 64, 16, pp
- IPFILTER_LUMA_sse2 16, 64, pp
-
-;----------------------------------------------------------------------------------------------------------------------------
-; void interp_8tap_horiz_ps_%1x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx, int isRowExt)
-;----------------------------------------------------------------------------------------------------------------------------
- IPFILTER_LUMA_sse2 4, 4, ps
- IPFILTER_LUMA_sse2 8, 8, ps
- IPFILTER_LUMA_sse2 8, 4, ps
- IPFILTER_LUMA_sse2 4, 8, ps
- IPFILTER_LUMA_sse2 16, 16, ps
- IPFILTER_LUMA_sse2 16, 8, ps
- IPFILTER_LUMA_sse2 8, 16, ps
- IPFILTER_LUMA_sse2 16, 12, ps
- IPFILTER_LUMA_sse2 12, 16, ps
- IPFILTER_LUMA_sse2 16, 4, ps
- IPFILTER_LUMA_sse2 4, 16, ps
- IPFILTER_LUMA_sse2 32, 32, ps
- IPFILTER_LUMA_sse2 32, 16, ps
- IPFILTER_LUMA_sse2 16, 32, ps
- IPFILTER_LUMA_sse2 32, 24, ps
- IPFILTER_LUMA_sse2 24, 32, ps
- IPFILTER_LUMA_sse2 32, 8, ps
- IPFILTER_LUMA_sse2 8, 32, ps
- IPFILTER_LUMA_sse2 64, 64, ps
- IPFILTER_LUMA_sse2 64, 32, ps
- IPFILTER_LUMA_sse2 32, 64, ps
- IPFILTER_LUMA_sse2 64, 48, ps
- IPFILTER_LUMA_sse2 48, 64, ps
- IPFILTER_LUMA_sse2 64, 16, ps
- IPFILTER_LUMA_sse2 16, 64, ps
-
%macro PROCESS_LUMA_W4_4R_sse2 0
movd m2, [r0]
movd m7, [r0 + r1]
More information about the x265-devel
mailing list