[x265] Fwd: [PATCH] asm code for filterHorizontal_p_p 4 tap filter
Praveen Tiwari
praveen at multicorewareinc.com
Thu Sep 26 16:19:03 CEST 2013
---------- Forwarded message ----------
From: <praveen at multicorewareinc.com>
Date: Thu, Sep 26, 2013 at 7:40 PM
Subject: [PATCH] asm code for filterHorizontal_p_p 4 tap filter
To: x265-devel at videolan.org
# HG changeset patch
# User praveen Tiwari
# Date 1380204623 -19800
# Node ID a31b81b707066aaf0ed42d5a2b453b5c86b9f797
# Parent 0dbfb0bbca1a1b714aa48db7eaae3f2f9ab713ec
asm code for filterHorizontal_p_p 4 tap filter.
diff -r 0dbfb0bbca1a -r a31b81b70706 source/common/x86/CMakeLists.txt
--- a/source/common/x86/CMakeLists.txt Wed Sep 25 14:34:49 2013 +0530
+++ b/source/common/x86/CMakeLists.txt Thu Sep 26 19:40:23 2013 +0530
@@ -5,7 +5,7 @@
add_definitions(-DHAVE_ALIGNED_STACK=0)
endif()
-set(ASMS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a2.asm)
+set(ASMS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a2.asm
ipfilter8.asm)
if (X64)
add_definitions(-DARCH_X86_64=1)
else()
diff -r 0dbfb0bbca1a -r a31b81b70706 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Sep 25 14:34:49 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Sep 26 19:40:23 2013 +0530
@@ -37,6 +37,9 @@
LOWRES(ssse3)
LOWRES(avx)
LOWRES(xop)
+
+extern "C" void x265_filterHorizontal_p_p_4_sse4(pixel *src, intptr_t
srcStride, pixel *dst, intptr_t dstStride, int width, int height, short
const *coeff);
+
}
bool hasXOP(void); // instr_detect.cpp
@@ -615,6 +618,7 @@
p.sa8d_inter[PARTITION_64x4] = p.satd[PARTITION_64x4];
p.sa8d_inter[PARTITION_64x12] = p.satd[PARTITION_64x12];
}
+ p.ipfilter_pp[FILTER_H_P_P_4] = x265_filterHorizontal_p_p_4_sse4;
}
}
diff -r 0dbfb0bbca1a -r a31b81b70706 source/common/x86/ipfilter8.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/ipfilter8.asm Thu Sep 26 19:40:23 2013 +0530
@@ -0,0 +1,134 @@
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Min Chen <chenm003 at 163.com>
+;* Nabajit Deka <nabajit at multicorewareinc.com>
+;* Praveen Kumar Tiwari <praveen at multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing at multicorewareinc.com.
+;*****************************************************************************/
+
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+%if ARCH_X86_64 == 0
+
+INIT_XMM sse4
+
+SECTION_RODATA 32
+tab_leftmask: db -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0 ; 7 x 0xFF then 9 x 0x00; an 8-byte load at offset (7 - n) yields n leading 0xFF bytes
+
+tab_Tm: db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 ; pshufb indices: overlapping 4-byte windows starting at source bytes 0, 1, 2, 3
+ db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 ; pshufb indices: overlapping 4-byte windows starting at source bytes 4, 5, 6, 7
+
+tab_c_512: times 8 dw 512 ; eight words of 512, loaded into x3 and used as the pmulhrsw multiplier
+
+SECTION .text
+
+; FILTER_H4 scratch, result, round
+;   Filters 8 horizontally adjacent output pixels starting at column `col`.
+;   %1 : XMM scratch register; receives the 16 source bytes and is clobbered.
+;   %2 : XMM result register; on exit its low 8 bytes hold the 8 filtered,
+;        unsigned-saturated output pixels (packuswb duplicates them in the
+;        high 8 bytes).
+;   %3 : XMM register holding the pmulhrsw multiplier.
+;   Relies on the caller's definitions of src, col, Tm4, Tm5 and coef2.
+%macro FILTER_H4 3
+    movu        %1, [src + col - 1]     ; 16 source bytes, first tap at col - 1
+
+    ; Gather the 4-byte tap windows: pixels 0..3 into %2, pixels 4..7 into %1.
+    pshufb      %2, %1, Tm4
+    pshufb      %1, %1, Tm5
+
+    ; unsigned pixel * signed tap, adjacent products summed into words
+    pmaddubsw   %2, coef2
+    pmaddubsw   %1, coef2
+
+    phaddw      %2, %1                  ; add the two partial sums of each pixel
+    pmulhrsw    %2, %3                  ; scale with rounding by the multiplier in %3
+    packuswb    %2, %2                  ; saturate the words to unsigned bytes
+%endmacro
+
+;-----------------------------------------------------------------------------
+; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
+;
+; Horizontal 4-tap filter on byte pixels. For each of `height` rows, every
+; output byte dst[col] is computed from the four source bytes starting at
+; src[col - 1] and the four 16-bit taps in `coeff` (narrowed to signed bytes),
+; scaled with rounding by pmulhrsw against 512 and saturated to 0..255.
+;
+; Stack arguments (read through rNm):
+;   r0m = src, r1m = srcStride, r2m = dst, r3m = dstStride,
+;   r4m = width, r5m = height, r6m = coeff
+;
+; Memory access notes:
+;   * every FILTER_H4 step loads 16 bytes from src + col - 1;
+;   * when width is not a multiple of 8 the tail reads and rewrites 8 bytes
+;     at dst + col, but only the first (width & 7) bytes receive new values;
+;     the remaining bytes are written back unchanged.
+;-----------------------------------------------------------------------------
+cglobal filterHorizontal_p_p_4, 0, 7, 8
+%define src         r0
+%define dst         r1
+%define row         r2
+%define col         r3
+%define width       r4
+%define widthleft   r5
+%define mask_offset r6
+%define coef2       m7
+%define x3          m6
+%define Tm5         m5
+%define Tm4         m4
+%define x2          m3
+%define x1          m2
+%define x0          m1
+%define leftmask    m0
+%define tmp         r0                      ; aliases src; only live before src is loaded
+
+    ; Note from the patch author: x86inc-based automatic parameter loading is
+    ; not used as of now because it was giving build errors, so every argument
+    ; is fetched from its stack slot (rNm) instead.
+    mov         tmp, r6m                    ; tmp = coeff
+    movh        coef2, [tmp]                ; load exactly the four 16-bit taps (8 bytes)
+    packsswb    coef2, coef2                ; narrow the taps to signed bytes
+    pshufd      coef2, coef2, 0             ; broadcast the 4 tap bytes to every dword
+
+    mova        x3, [tab_c_512]             ; pmulhrsw by 512 == (v + 32) >> 6
+
+    mov         width, r4m
+    mov         widthleft, width
+    and         width, ~7                   ; columns handled in full groups of 8
+    and         widthleft, 7                ; leftover columns (0..7)
+    mov         mask_offset, widthleft
+    neg         mask_offset
+
+    ; leftmask = `widthleft` bytes of 0xFF followed by zeros
+    movq        leftmask, [tab_leftmask + (7 + mask_offset)]
+    mova        Tm4, [tab_Tm]
+    mova        Tm5, [tab_Tm + 16]
+
+    mov         src, r0m
+    mov         dst, r2m
+    mov         row, r5m
+
+    ; Note from the patch author: the loop conditions are tested at the start
+    ; of each loop to satisfy the test bench requirements, so a zero row count
+    ; or a width below 8 skips the corresponding loop body entirely.
+.loop_row:
+    test        row, row
+    jz          .end_row
+
+    xor         col, col
+
+.loop_col:
+    cmp         col, width
+    jge         .end_col
+
+    FILTER_H4   x0, x1, x3
+    movh        [dst + col], x1             ; store 8 filtered pixels
+    add         col, 8
+
+    jmp         .loop_col
+
+.end_col:
+    test        widthleft, widthleft
+    jz          .next_row
+
+    ; Partial group: merge the new pixels into the existing destination bytes
+    ; so that bytes past `width` are written back with their old values.
+    movq        x2, [dst + col]
+    FILTER_H4   x0, x1, x3
+    pblendvb    x2, x2, x1, leftmask
+    movh        [dst + col], x2
+
+.next_row:
+    add         src, r1m                    ; src += srcStride
+    add         dst, r3m                    ; dst += dstStride
+    dec         row
+
+    jmp         .loop_row
+
+.end_row:
+    RET
+
+%endif ; ARCH_X86_64 == 0
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20130926/87160e25/attachment.html>
More information about the x265-devel
mailing list