<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div><br> </div><pre><br>At 2015-03-03 23:02:35,rajesh@multicorewareinc.com wrote:
># HG changeset patch
># User Rajesh Paulraj<rajesh@multicorewareinc.com>
># Date 1425392157 -19800
># Tue Mar 03 19:45:57 2015 +0530
># Node ID ec91dcde21d99e2f4bae41d655dc40365b07265a
># Parent 018e8bbaa854b1a4bd82b3a2e23f7775a77da5cc
>asm-ssse3: filter_p2s[12x16](9.66x), filter_p2s[24x32](10.30x),
> filter_p2s[48x64](7.79x)
>
>diff -r 018e8bbaa854 -r ec91dcde21d9 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Fri Feb 27 11:46:09 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp Tue Mar 03 19:45:57 2015 +0530
>@@ -1272,6 +1272,9 @@
> p.pu[LUMA_64x32].filter_p2s = x265_pixelToShort_64x32_ssse3;
> p.pu[LUMA_64x48].filter_p2s = x265_pixelToShort_64x48_ssse3;
> p.pu[LUMA_64x64].filter_p2s = x265_pixelToShort_64x64_ssse3;
>+ p.pu[LUMA_12x16].filter_p2s = x265_pixelToShort_12x16_ssse3;
>+ p.pu[LUMA_24x32].filter_p2s = x265_pixelToShort_24x32_ssse3;
>+ p.pu[LUMA_48x64].filter_p2s = x265_pixelToShort_48x64_ssse3;
>
> p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3;
> p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3;
>diff -r 018e8bbaa854 -r ec91dcde21d9 source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm Fri Feb 27 11:46:09 2015 +0530
>+++ b/source/common/x86/ipfilter8.asm Tue Mar 03 19:45:57 2015 +0530
>@@ -5416,8 +5416,9 @@
> FILTER_V4_W16n_H2 64, 48
> FILTER_V4_W16n_H2 48, 64
> FILTER_V4_W16n_H2 64, 16
>-;-----------------------------------------------------------------------------
>-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
>+
>+;-----------------------------------------------------------------------------
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
> ;-----------------------------------------------------------------------------
> %macro PIXEL_WH_4xN 2
> INIT_XMM ssse3
>@@ -5480,7 +5481,7 @@
> PIXEL_WH_4xN 4, 16
>
> ;-----------------------------------------------------------------------------
>-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
> ;-----------------------------------------------------------------------------
> %macro PIXEL_WH_8xN 2
> INIT_XMM ssse3
>@@ -5541,9 +5542,8 @@
> PIXEL_WH_8xN 8, 16
> PIXEL_WH_8xN 8, 32
>
>-
>-;-----------------------------------------------------------------------------
>-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
>+;-----------------------------------------------------------------------------
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
> ;-----------------------------------------------------------------------------
> %macro PIXEL_WH_16xN 2
> INIT_XMM ssse3
>@@ -5607,7 +5607,7 @@
> PIXEL_WH_16xN 16, 64
>
> ;-----------------------------------------------------------------------------
>-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
> ;-----------------------------------------------------------------------------
> %macro PIXEL_WH_32xN 2
> INIT_XMM ssse3
>@@ -5670,7 +5670,7 @@
> PIXEL_WH_32xN 32, 64
>
> ;-----------------------------------------------------------------------------
>-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, int height)
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
> ;-----------------------------------------------------------------------------
> %macro PIXEL_WH_64xN 2
> INIT_XMM ssse3
>@@ -5731,6 +5731,172 @@
> PIXEL_WH_64xN 64, 32
> PIXEL_WH_64xN 64, 48
>
>+;-----------------------------------------------------------------------------
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
>+;-----------------------------------------------------------------------------
>+INIT_XMM ssse3
>+cglobal pixelToShort_12x16, 3, 4, 6
>+
>+ ; load constant
>+ mova m4, [pb_128]
>+ mova m5, [tab_c_64_n64]
>+
>+%rep 4
>+
>+ movu m0, [r0]
>+ movu m1, m0
>+ punpcklbw m1, m0, m4
>+ punpckhbw m0, m4
>+ pmaddubsw m0, m5
>+ pmaddubsw m1, m5
>+
>+ movu m2, [r0 + r1]
>+ movu m3, m2
>+ punpcklbw m3, m2, m4
>+ punpckhbw m2, m4
>+ pmaddubsw m2, m5
>+ pmaddubsw m3, m5
>+
>+ movu [r2 + FENC_STRIDE * 0], m1
>+ movu [r2 + FENC_STRIDE * 2], m3
>+
>+ movh [r2 + FENC_STRIDE * 0 + 16], m0
>+ movh [r2 + FENC_STRIDE * 2 + 16], m2
>+
>+ movu m0, [r0 + r1 * 2]
>+ movu m1, m0
>+ punpcklbw m1, m0, m4
>+ punpckhbw m0, m4
>+ pmaddubsw m0, m5
>+ pmaddubsw m1, m5
>+
>+ lea r3, [r0 + r1 * 2]
>+ movu m2, [r3 + r1]
>+ movu m3, m2
>+ punpcklbw m3, m2, m4
>+ punpckhbw m2, m4
>+ pmaddubsw m2, m5
>+ pmaddubsw m3, m5
>+
>+ movu [r2 + FENC_STRIDE * 4], m1
>+ movu [r2 + FENC_STRIDE * 6], m3
>+
>+ movh [r2 + FENC_STRIDE * 4 + 16], m0
>+ movh [r2 + FENC_STRIDE * 6 + 16], m2
>+
>+ lea r0, [r0 + r1 * 4]
>+ add r2, FENC_STRIDE * 8
>+%endrep
>+ RET
>+
>+;-----------------------------------------------------------------------------
>+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst)
>+;-----------------------------------------------------------------------------
>+INIT_XMM ssse3
>+cglobal pixelToShort_24x32, 3, 7, 6
>+
>+ ; load width and height
>+ mov r3d, 24
>+ mov r4d, 32
>+
>+ ; load constant
>+ mova m4, [pb_128]
>+ mova m5, [tab_c_64_n64]
>+
>+.loopH:
>+
>+ xor r5d, r5d
>+.loopW:
>+ lea r6, [r0 + r5]
>+
>+ movh m0, [r6]
>+ punpcklbw m0, m4
>+ pmaddubsw m0, m5
>+
>+ movh m1, [r6 + r1]
>+ punpcklbw m1, m4
>+ pmaddubsw m1, m5
>+
>+ movh m2, [r6 + r1 * 2]
>+ punpcklbw m2, m4
>+ pmaddubsw m2, m5
>+
>+ lea r6, [r6 + r1 * 2]
>+ movh m3, [r6 + r1]
>+ punpcklbw m3, m4
>+ pmaddubsw m3, m5
>+
>+ add r5, 8
>+ cmp r5, r3
r3 is a constant (24); why spend a register on it?<br><br>>+ movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0
>+ movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1
>+ movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2
>+ movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3
>+ je .nextH
>+ jmp .loopW
Why not a single jnz .loopW here instead of the je .nextH / jmp .loopW pair?<br><br>>+
>+.nextH:
>+ lea r0, [r0 + r1 * 4]
>+ add r2, FENC_STRIDE * 8
>+
>+ sub r4d, 4
>+ jnz .loopH
>+ RET
>+
</pre></div>