[x265] [PATCH] asm-sse2: 16bpp code for filter_p2s[4x4](2.47x), filter_p2s[4x8](2.78x),
rajesh at multicorewareinc.com
Wed Mar 4 09:05:36 CET 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1425456296 -19800
# Wed Mar 04 13:34:56 2015 +0530
# Node ID 1f1589d50a59ebcda55559a323b606c73a9b315f
# Parent 94991f753feae850b6edd371481e199f76243af3
asm-sse2: 16bpp code for filter_p2s[4x4](2.47x), filter_p2s[4x8](2.78x),
filter_p2s[4x16](2.95x), filter_p2s[8x4](4.07x), filter_p2s[8x8](4.54x),
filter_p2s[8x16](4.40x), filter_p2s[8x32](4.67x), filter_p2s[16x4](6.19x),
filter_p2s[16x8](7.51x), filter_p2s[16x12](7.35x), filter_p2s[16x16](7.30x),
filter_p2s[16x32](7.29x), filter_p2s[16x64](7.22x), filter_p2s[32x8](6.26x),
filter_p2s[32x16](6.92x), filter_p2s[32x24](6.18x), filter_p2s[32x32](6.91x),
filter_p2s[32x64](6.69x), filter_p2s[64x16](6.13x), filter_p2s[64x32](8.10x),
filter_p2s[64x48](5.89x), filter_p2s[64x64](8.03x), filter_p2s[12x16](7.93x),
filter_p2s[24x32](10.31x), filter_p2s[48x64](6.73x)
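
For reference, every kernel in this patch performs the same per-sample transform: shift the 16bpp sample left by 4 (psllw) and add the -8192 constant loaded from tab_c_n8192 (paddw), writing each output row FENC_STRIDE int16_t elements after the previous one, which matches the generic conversion of 10-bit samples into the 14-bit signed intermediate used by the interpolation filters. A minimal scalar sketch of that behaviour is below; the function name is illustrative and the FENC_STRIDE value of 64 is an assumption about x265's fixed fenc stride, not something introduced by this patch.

#include <cstdint>

typedef uint16_t pixel;   // 16bpp build: one sample per uint16_t

// Hypothetical scalar reference for the SSE2 kernels in this patch:
// dst[x] = (src[x] << 4) - 8192, with output rows spaced FENC_STRIDE
// int16_t elements apart (the assembly advances the destination by
// FENC_STRIDE * 2 bytes per row of 16-bit samples).
template<int W, int H>
static void pixelToShort_ref(const pixel* src, intptr_t srcStride, int16_t* dst)
{
    const intptr_t FENC_STRIDE = 64;   // assumed fixed destination stride
    for (int y = 0; y < H; y++)
    {
        for (int x = 0; x < W; x++)
            dst[x] = (int16_t)((src[x] << 4) - 8192);
        src += srcStride;              // srcStride is in pixel units
        dst += FENC_STRIDE;
    }
}
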
diff -r 94991f753fea -r 1f1589d50a59 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Mar 04 12:08:08 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Mar 04 13:34:56 2015 +0530
@@ -855,7 +855,32 @@
PIXEL_AVG_W4(mmx2);
LUMA_VAR(sse2);
- p.luma_p2s = x265_luma_p2s_sse2;
+ p.pu[LUMA_4x4].filter_p2s = x265_pixelToShort_4x4_sse2;
+ p.pu[LUMA_4x8].filter_p2s = x265_pixelToShort_4x8_sse2;
+ p.pu[LUMA_4x16].filter_p2s = x265_pixelToShort_4x16_sse2;
+ p.pu[LUMA_8x4].filter_p2s = x265_pixelToShort_8x4_sse2;
+ p.pu[LUMA_8x8].filter_p2s = x265_pixelToShort_8x8_sse2;
+ p.pu[LUMA_8x16].filter_p2s = x265_pixelToShort_8x16_sse2;
+ p.pu[LUMA_8x32].filter_p2s = x265_pixelToShort_8x32_sse2;
+ p.pu[LUMA_16x4].filter_p2s = x265_pixelToShort_16x4_sse2;
+ p.pu[LUMA_16x8].filter_p2s = x265_pixelToShort_16x8_sse2;
+ p.pu[LUMA_16x12].filter_p2s = x265_pixelToShort_16x12_sse2;
+ p.pu[LUMA_16x16].filter_p2s = x265_pixelToShort_16x16_sse2;
+ p.pu[LUMA_16x32].filter_p2s = x265_pixelToShort_16x32_sse2;
+ p.pu[LUMA_16x64].filter_p2s = x265_pixelToShort_16x64_sse2;
+ p.pu[LUMA_32x8].filter_p2s = x265_pixelToShort_32x8_sse2;
+ p.pu[LUMA_32x16].filter_p2s = x265_pixelToShort_32x16_sse2;
+ p.pu[LUMA_32x24].filter_p2s = x265_pixelToShort_32x24_sse2;
+ p.pu[LUMA_32x32].filter_p2s = x265_pixelToShort_32x32_sse2;
+ p.pu[LUMA_32x64].filter_p2s = x265_pixelToShort_32x64_sse2;
+ p.pu[LUMA_64x16].filter_p2s = x265_pixelToShort_64x16_sse2;
+ p.pu[LUMA_64x32].filter_p2s = x265_pixelToShort_64x32_sse2;
+ p.pu[LUMA_64x48].filter_p2s = x265_pixelToShort_64x48_sse2;
+ p.pu[LUMA_64x64].filter_p2s = x265_pixelToShort_64x64_sse2;
+ p.pu[LUMA_12x16].filter_p2s = x265_pixelToShort_12x16_sse2;
+ p.pu[LUMA_24x32].filter_p2s = x265_pixelToShort_24x32_sse2;
+ p.pu[LUMA_48x64].filter_p2s = x265_pixelToShort_48x64_sse2;
+
p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_sse2;
p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_sse2;
diff -r 94991f753fea -r 1f1589d50a59 source/common/x86/ipfilter16.asm
--- a/source/common/x86/ipfilter16.asm Wed Mar 04 12:08:08 2015 +0530
+++ b/source/common/x86/ipfilter16.asm Wed Mar 04 13:34:56 2015 +0530
@@ -3,6 +3,7 @@
;*
;* Authors: Nabajit Deka <nabajit at multicorewareinc.com>
;* Murugan Vairavel <murugan at multicorewareinc.com>
+;* Rajesh Paulraj <rajesh at multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -5525,65 +5526,497 @@
FILTER_VER_LUMA_SS 64, 16
FILTER_VER_LUMA_SS 16, 64
-;--------------------------------------------------------------------------------------------------
-; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
-;--------------------------------------------------------------------------------------------------
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+%macro P2S_H_4xN 1
INIT_XMM sse2
-cglobal luma_p2s, 3, 7, 5
-
+cglobal pixelToShort_4x%1, 3, 6, 5
add r1, r1
- ; load width and height
- mov r3d, r3m
- mov r4d, r4m
+ ; load height
+ mov r3d, %1
; load constant
mova m4, [tab_c_n8192]
.loopH:
-
- xor r5d, r5d
+ xor r4d, r4d
.loopW:
- lea r6, [r0 + r5 * 2]
-
- movu m0, [r6]
+ lea r5, [r0 + r4 * 2]
+
+ movu m0, [r5]
psllw m0, 4
paddw m0, m4
- movu m1, [r6 + r1]
+ movu m1, [r5 + r1]
psllw m1, 4
paddw m1, m4
- movu m2, [r6 + r1 * 2]
+ movu m2, [r5 + r1 * 2]
psllw m2, 4
paddw m2, m4
- lea r6, [r6 + r1 * 2]
- movu m3, [r6 + r1]
+ lea r5, [r5 + r1 * 2]
+ movu m3, [r5 + r1]
psllw m3, 4
paddw m3, m4
- add r5, 8
- cmp r5, r3
+ add r4, 8
+ cmp r4, 4
jg .width4
- movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
- movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
- movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
- movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
je .nextH
- jmp .loopW
+ jnz .loopW
.width4:
- movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
- movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
- movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
- movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
+ movh [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
+ movh [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
+ movh [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
+ movh [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
.nextH:
lea r0, [r0 + r1 * 4]
add r2, FENC_STRIDE * 8
- sub r4d, 4
+ sub r3d, 4
jnz .loopH
RET
+%endmacro
+P2S_H_4xN 4
+P2S_H_4xN 8
+P2S_H_4xN 16
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+%macro P2S_H_8xN 1
+INIT_XMM sse2
+cglobal pixelToShort_8x%1, 3, 6, 5
+
+ add r1, r1
+
+ ; load height
+ mov r3d, %1
+
+ ; load constant
+ mova m4, [tab_c_n8192]
+
+.loopH:
+ xor r4d, r4d
+.loopW:
+ lea r5, [r0 + r4 * 2]
+
+ movu m0, [r5]
+ psllw m0, 4
+ paddw m0, m4
+
+ movu m1, [r5 + r1]
+ psllw m1, 4
+ paddw m1, m4
+
+ movu m2, [r5 + r1 * 2]
+ psllw m2, 4
+ paddw m2, m4
+
+ lea r5, [r5 + r1 * 2]
+ movu m3, [r5 + r1]
+ psllw m3, 4
+ paddw m3, m4
+
+ add r4, 8
+ cmp r4, 8
+
+ movu [r2 + FENC_STRIDE * 0], m0
+ movu [r2 + FENC_STRIDE * 2], m1
+ movu [r2 + FENC_STRIDE * 4], m2
+ movu [r2 + FENC_STRIDE * 6], m3
+
+ je .nextH
+ jnz .loopW
+
+.nextH:
+ lea r0, [r0 + r1 * 4]
+ add r2, FENC_STRIDE * 8
+
+ sub r3d, 4
+ jnz .loopH
+
+ RET
+%endmacro
+P2S_H_8xN 8
+P2S_H_8xN 4
+P2S_H_8xN 16
+P2S_H_8xN 32
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+%macro P2S_H_16xN 1
+INIT_XMM sse2
+cglobal pixelToShort_16x%1, 3, 6, 5
+
+ add r1, r1
+
+ ; load height
+ mov r3d, %1
+
+ ; load constant
+ mova m4, [tab_c_n8192]
+
+.loopH:
+ xor r4d, r4d
+.loopW:
+ lea r5, [r0 + r4 * 2]
+
+ movu m0, [r5]
+ psllw m0, 4
+ paddw m0, m4
+
+ movu m1, [r5 + r1]
+ psllw m1, 4
+ paddw m1, m4
+
+ movu m2, [r5 + r1 * 2]
+ psllw m2, 4
+ paddw m2, m4
+
+ lea r5, [r5 + r1 * 2]
+ movu m3, [r5 + r1]
+ psllw m3, 4
+ paddw m3, m4
+
+ add r4, 8
+ cmp r4, 16
+
+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
+ je .nextH
+ jnz .loopW
+
+.nextH:
+ lea r0, [r0 + r1 * 4]
+ add r2, FENC_STRIDE * 8
+
+ sub r3d, 4
+ jnz .loopH
+
+ RET
+%endmacro
+P2S_H_16xN 4
+P2S_H_16xN 8
+P2S_H_16xN 12
+P2S_H_16xN 16
+P2S_H_16xN 32
+P2S_H_16xN 64
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+%macro P2S_H_32xN 1
+INIT_XMM sse2
+cglobal pixelToShort_32x%1, 3, 6, 5
+
+ add r1, r1
+
+ ; load height
+ mov r3d, %1
+
+ ; load constant
+ mova m4, [tab_c_n8192]
+
+.loopH:
+ xor r4d, r4d
+.loopW:
+ lea r5, [r0 + r4 * 2]
+
+ movu m0, [r5]
+ psllw m0, 4
+ paddw m0, m4
+
+ movu m1, [r5 + r1]
+ psllw m1, 4
+ paddw m1, m4
+
+ movu m2, [r5 + r1 * 2]
+ psllw m2, 4
+ paddw m2, m4
+
+ lea r5, [r5 + r1 * 2]
+ movu m3, [r5 + r1]
+ psllw m3, 4
+ paddw m3, m4
+
+ add r4, 8
+ cmp r4, 32
+
+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
+
+ je .nextH
+ jnz .loopW
+
+.nextH:
+ lea r0, [r0 + r1 * 4]
+ add r2, FENC_STRIDE * 8
+
+ sub r3d, 4
+ jnz .loopH
+
+ RET
+%endmacro
+P2S_H_32xN 8
+P2S_H_32xN 16
+P2S_H_32xN 24
+P2S_H_32xN 32
+P2S_H_32xN 64
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+%macro P2S_H_64xN 1
+INIT_XMM sse2
+cglobal pixelToShort_64x%1, 3, 6, 5
+
+ add r1, r1
+
+ ; load height
+ mov r3d, %1
+
+ ; load constant
+ mova m4, [tab_c_n8192]
+
+.loopH:
+ xor r4d, r4d
+.loopW:
+ lea r5, [r0 + r4 * 2]
+
+ movu m0, [r5]
+ psllw m0, 4
+ paddw m0, m4
+
+ movu m1, [r5 + r1]
+ psllw m1, 4
+ paddw m1, m4
+
+ movu m2, [r5 + r1 * 2]
+ psllw m2, 4
+ paddw m2, m4
+
+ lea r5, [r5 + r1 * 2]
+ movu m3, [r5 + r1]
+ psllw m3, 4
+ paddw m3, m4
+
+ add r4, 8
+ cmp r4, 64
+ jg .width4
+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
+ je .nextH
+ jnz .loopW
+
+.width4:
+ movh [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
+ movh [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
+ movh [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
+ movh [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
+
+.nextH:
+ lea r0, [r0 + r1 * 4]
+ add r2, FENC_STRIDE * 8
+
+ sub r3d, 4
+ jnz .loopH
+
+ RET
+%endmacro
+P2S_H_64xN 16
+P2S_H_64xN 32
+P2S_H_64xN 48
+P2S_H_64xN 64
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pixelToShort_12x16, 3, 6, 5
+
+ add r1, r1
+
+ ;load height
+ mov r3d, 16
+
+ ; load constant
+ mova m4, [tab_c_n8192]
+
+.loopH:
+
+ xor r4d, r4d
+.loopW:
+ lea r5, [r0 + r4 * 2]
+
+ movu m0, [r5]
+ psllw m0, 4
+ paddw m0, m4
+
+ movu m1, [r5 + r1]
+ psllw m1, 4
+ paddw m1, m4
+
+ movu m2, [r5 + r1 * 2]
+ psllw m2, 4
+ paddw m2, m4
+
+ lea r5, [r5 + r1 * 2]
+ movu m3, [r5 + r1]
+ psllw m3, 4
+ paddw m3, m4
+
+ add r4, 8
+ cmp r4, 12
+ jg .width4
+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
+ je .nextH
+ jnz .loopW
+
+.width4:
+ movh [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
+ movh [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
+ movh [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
+ movh [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
+
+.nextH:
+ lea r0, [r0 + r1 * 4]
+ add r2, FENC_STRIDE * 8
+
+ sub r3d, 4
+ jnz .loopH
+
+ RET
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pixelToShort_24x32, 3, 6, 5
+
+ add r1, r1
+
+ ; load height
+ mov r3d, 32
+
+ ; load constant
+ mova m4, [tab_c_n8192]
+
+.loopH:
+
+ xor r4d, r4d
+.loopW:
+ lea r5, [r0 + r4 * 2]
+
+ movu m0, [r5]
+ psllw m0, 4
+ paddw m0, m4
+
+ movu m1, [r5 + r1]
+ psllw m1, 4
+ paddw m1, m4
+
+ movu m2, [r5 + r1 * 2]
+ psllw m2, 4
+ paddw m2, m4
+
+ lea r5, [r5 + r1 * 2]
+ movu m3, [r5 + r1]
+ psllw m3, 4
+ paddw m3, m4
+
+ add r4, 8
+ cmp r4, 24
+
+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
+ je .nextH
+ jnz .loopW
+
+.nextH:
+ lea r0, [r0 + r1 * 4]
+ add r2, FENC_STRIDE * 8
+
+ sub r3d, 4
+ jnz .loopH
+
+ RET
+
+;-----------------------------------------------------------------------------
+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2
+cglobal pixelToShort_48x64, 3, 6, 5
+
+ add r1, r1
+
+ ; load height
+ mov r3d, 64
+
+ ; load constant
+ mova m4, [tab_c_n8192]
+
+.loopH:
+
+ xor r4d, r4d
+.loopW:
+ lea r5, [r0 + r4 * 2]
+
+ movu m0, [r5]
+ psllw m0, 4
+ paddw m0, m4
+
+ movu m1, [r5 + r1]
+ psllw m1, 4
+ paddw m1, m4
+
+ movu m2, [r5 + r1 * 2]
+ psllw m2, 4
+ paddw m2, m4
+
+ lea r5, [r5 + r1 * 2]
+ movu m3, [r5 + r1]
+ psllw m3, 4
+ paddw m3, m4
+
+ add r4, 8
+ cmp r4, 48
+
+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0
+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1
+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2
+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3
+ je .nextH
+ jnz .loopW
+
+.nextH:
+ lea r0, [r0 + r1 * 4]
+ add r2, FENC_STRIDE * 8
+
+ sub r3d, 4
+ jnz .loopH
+
+ RET
+
diff -r 94991f753fea -r 1f1589d50a59 source/common/x86/ipfilter8.h
--- a/source/common/x86/ipfilter8.h Wed Mar 04 12:08:08 2015 +0530
+++ b/source/common/x86/ipfilter8.h Wed Mar 04 13:34:56 2015 +0530
@@ -290,7 +290,31 @@
SETUP_CHROMA_420_HORIZ_FUNC_DEF(16, 64, cpu)
void x265_chroma_p2s_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
-void x265_luma_p2s_sse2(const pixel* src, intptr_t srcStride, int16_t* dst, int width, int height);
+void x265_pixelToShort_4x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_4x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_4x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_8x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_8x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_8x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_8x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_16x4_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_16x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_16x12_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_16x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_16x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_16x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_32x8_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_32x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_32x24_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_32x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_32x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_64x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_64x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_64x48_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_64x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_12x16_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_24x32_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
+void x265_pixelToShort_48x64_sse2(const pixel* src, intptr_t srcStride, int16_t* dst);
CHROMA_420_VERT_FILTERS(_sse2);
CHROMA_420_HORIZ_FILTERS(_sse4);
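
With the per-PU table wired up above, callers select the block-size-specific kernel once at setup time instead of passing width and height on every call. A hedged usage sketch follows: 'p' stands for the EncoderPrimitives table populated in asm-primitives.cpp, LUMA_16x16 is one of the partition enums used above, and the buffers are assumed to be caller-allocated, so the snippet is illustrative rather than standalone-compilable.

// Illustrative dispatch through the table set up in this patch: the
// 16x16 entry resolves to x265_pixelToShort_16x16_sse2 when the SSE2
// assignments above are active.
void convertFencBlock(const pixel* src, intptr_t srcStride, int16_t* dst)
{
    p.pu[LUMA_16x16].filter_p2s(src, srcStride, dst);
}
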