[x265] [PATCH Review only] asm: code for pixel_var_8xN

Steve Borho steve at borho.org
Mon Nov 25 21:28:37 CET 2013


I just checked, and ratecontrol.cpp uses var only for block sizes 8x8 and 16x16.  All the other block sizes are unused.

We should probably define only square block sizes for this primitive.

On Nov 25, 2013, at 2:07 PM, Steve Borho <steve at borho.org> wrote:

> 
> On Nov 25, 2013, at 7:38 AM, murugan at multicorewareinc.com wrote:
> 
>> # HG changeset patch
>> # User Murugan Vairavel <murugan at multicorewareinc.com>
>> # Date 1385386658 -19800
>> #      Mon Nov 25 19:07:38 2013 +0530
>> # Node ID deb2fc2dcaf24a86132ebfe0fbaac4859611c92f
>> # Parent  43da6ca15a61e18d033931ca58940d6794f6f8f8
>> asm: code for pixel_var_8xN
> 
> I'm not sure the encoder uses any variance block measurements other than 8x8.
> 
>> 
>> diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/pixel.cpp
>> --- a/source/common/pixel.cpp	Mon Nov 25 18:46:28 2013 +0530
>> +++ b/source/common/pixel.cpp	Mon Nov 25 19:07:38 2013 +0530
>> @@ -968,8 +968,11 @@
>>    p.ssim_4x4x2_core = ssim_4x4x2_core;
>>    p.ssim_end_4 = ssim_end_4;
>> 
>> -    p.var[LUMA_16x16] = pixel_var<16, 16>;
>> +    p.var[LUMA_8x4] = pixel_var<8, 4>;
>>    p.var[LUMA_8x8] = pixel_var<8, 8>;
>> +    p.var[LUMA_8x16] = pixel_var<8, 16>;
>> +    p.var[LUMA_8x32] = pixel_var<8, 32>;
>> +
>>    p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma;
>> }
>> }
>> diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/asm-primitives.cpp
>> --- a/source/common/x86/asm-primitives.cpp	Mon Nov 25 18:46:28 2013 +0530
>> +++ b/source/common/x86/asm-primitives.cpp	Mon Nov 25 19:07:38 2013 +0530
>> @@ -412,6 +412,15 @@
>>    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \
>>    SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu);
>> 
>> +#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \
>> +    p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu;
>> +
>> +#define LUMA_VAR(cpu) \
>> +    SETUP_PIXEL_VAR_DEF(8,   4, cpu); \
>> +    SETUP_PIXEL_VAR_DEF(8,   8, cpu); \
>> +    SETUP_PIXEL_VAR_DEF(8,  16, cpu); \
>> +    SETUP_PIXEL_VAR_DEF(8,  32, cpu);
>> +
>> namespace x265 {
>> // private x265 namespace
>> 
>> @@ -442,6 +451,8 @@
>>        PIXEL_AVG(sse2);
>>        PIXEL_AVG_W4(mmx2);
>> 
>> +        LUMA_VAR(_sse2);
>> +
>>        p.sad[LUMA_8x32]  = x265_pixel_sad_8x32_sse2;
>>        p.sad[LUMA_16x4]  = x265_pixel_sad_16x4_sse2;
>>        p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2;
>> diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel-a.asm
>> --- a/source/common/x86/pixel-a.asm	Mon Nov 25 18:46:28 2013 +0530
>> +++ b/source/common/x86/pixel-a.asm	Mon Nov 25 19:07:38 2013 +0530
>> @@ -1301,6 +1301,106 @@
>> 
>> %if HIGH_BIT_DEPTH == 0
>> %macro VAR 0
>> +cglobal pixel_var_8x4, 2,3,8
>> +    VAR_START 1
>> +    lea       r2,    [r1 * 3]
>> +    movh      m0,    [r0]
>> +    movh      m3,    [r0 + r1]
>> +    movhps    m0,    [r0 + r1 * 2]
>> +    movhps    m3,    [r0 + r2]
>> +    DEINTB    1, 0, 4, 3, 7
>> +    lea       r0,    [r0 + r1 * 4]
>> +    VAR_CORE
>> +    VAR_END 8, 4
>> +
>> +cglobal pixel_var_8x8, 2,3,8
>> +    VAR_START 1
>> +    lea       r2,    [r1 * 3]
>> +    movh      m0,    [r0]
>> +    movh      m3,    [r0 + r1]
>> +    movhps    m0,    [r0 + r1 * 2]
>> +    movhps    m3,    [r0 + r2]
>> +    DEINTB    1, 0, 4, 3, 7
>> +    lea       r0,    [r0 + r1 * 4]
>> +    VAR_CORE
>> +    movh      m0,    [r0]
>> +    movh      m3,    [r0 + r1]
>> +    movhps    m0,    [r0 + r1 * 2]
>> +    movhps    m3,    [r0 + r2]
>> +    DEINTB    1, 0, 4, 3, 7
>> +    VAR_CORE
>> +    VAR_END 8, 8
>> +
>> +
>> +cglobal pixel_var_8x16, 2,4,8
>> +    VAR_START 1
>> +    lea       r2,    [r1 * 3]
>> +    movh      m0,    [r0]
>> +    movh      m3,    [r0 + r1]
>> +    movhps    m0,    [r0 + r1 * 2]
>> +    movhps    m3,    [r0 + r2]
>> +    DEINTB    1, 0, 4, 3, 7
>> +    lea       r0,    [r0 + r1 * 4]
>> +    VAR_CORE
>> +    movh      m0,    [r0]
>> +    movh      m3,    [r0 + r1]
>> +    movhps    m0,    [r0 + r1 * 2]
>> +    movhps    m3,    [r0 + r2]
>> +    DEINTB    1, 0, 4, 3, 7
>> +    lea       r0,    [r0 + r1 * 4]
>> +    VAR_CORE
>> +    movh      m0,    [r0]
>> +    movh      m3,    [r0 + r1]
>> +    movhps    m0,    [r0 + r1 * 2]
>> +    movhps    m3,    [r0 + r2]
>> +    DEINTB    1, 0, 4, 3, 7
>> +    lea       r0,    [r0 + r1 * 4]
>> +    VAR_CORE
>> +    movh      m0,    [r0]
>> +    movh      m3,    [r0 + r1]
>> +    movhps    m0,    [r0 + r1 * 2]
>> +    movhps    m3,    [r0 + r2]
>> +    DEINTB    1, 0, 4, 3, 7
>> +    VAR_CORE
>> +    VAR_END 8, 16
>> +
>> +cglobal pixel_var_8x32, 2,4,8
>> +    VAR_START 1
>> +    mov       r2d,   2
>> +    lea       r3,    [r1 * 3]
>> +.loop:
>> +    movh      m0,    [r0]
>> +    movh      m3,    [r0 + r1]
>> +    movhps    m0,    [r0 + r1 * 2]
>> +    movhps    m3,    [r0 + r3]
>> +    DEINTB    1, 0, 4, 3, 7
>> +    lea       r0,    [r0 + r1 * 4]
>> +    VAR_CORE
>> +    movh      m0,    [r0]
>> +    movh      m3,    [r0 + r1]
>> +    movhps    m0,    [r0 + r1 * 2]
>> +    movhps    m3,    [r0 + r3]
>> +    DEINTB    1, 0, 4, 3, 7
>> +    lea       r0,    [r0 + r1 * 4]
>> +    VAR_CORE
>> +    movh      m0,    [r0]
>> +    movh      m3,    [r0 + r1]
>> +    movhps    m0,    [r0 + r1 * 2]
>> +    movhps    m3,    [r0 + r3]
>> +    DEINTB    1, 0, 4, 3, 7
>> +    lea       r0,    [r0 + r1 * 4]
>> +    VAR_CORE
>> +    movh      m0,    [r0]
>> +    movh      m3,    [r0 + r1]
>> +    movhps    m0,    [r0 + r1 * 2]
>> +    movhps    m3,    [r0 + r3]
>> +    DEINTB    1, 0, 4, 3, 7
>> +    lea       r0,    [r0 + r1 * 4]
>> +    VAR_CORE
>> +    dec    r2d
>> +    jnz    .loop
>> +    VAR_END 8, 32
>> +
>> cglobal pixel_var_16x16, 2,3,8
>>    VAR_START 1
>>    mov      r2d, 8
>> @@ -1313,38 +1413,6 @@
>>    dec r2d
>>    jg .loop
>>    VAR_END 16, 16
>> -
>> -cglobal pixel_var_8x8, 2,4,8
>> -    VAR_START 1
>> -    mov      r2d, 2
>> -    lea       r3, [r1*3]
>> -.loop:
>> -    movh      m0, [r0]
>> -    movh      m3, [r0+r1]
>> -    movhps    m0, [r0+r1*2]
>> -    movhps    m3, [r0+r3]
>> -    DEINTB    1, 0, 4, 3, 7
>> -    lea       r0, [r0+r1*4]
>> -    VAR_CORE
>> -    dec r2d
>> -    jg .loop
>> -    VAR_END 8, 8
>> -
>> -cglobal pixel_var_8x16, 2,4,8
>> -    VAR_START 1
>> -    mov      r2d, 4
>> -    lea       r3, [r1*3]
>> -.loop:
>> -    movh      m0, [r0]
>> -    movh      m3, [r0+r1]
>> -    movhps    m0, [r0+r1*2]
>> -    movhps    m3, [r0+r3]
>> -    DEINTB    1, 0, 4, 3, 7
>> -    lea       r0, [r0+r1*4]
>> -    VAR_CORE
>> -    dec r2d
>> -    jg .loop
>> -    VAR_END 8, 16
>> %endmacro ; VAR
>> 
>> INIT_XMM sse2
>> diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel.h
>> --- a/source/common/x86/pixel.h	Mon Nov 25 18:46:28 2013 +0530
>> +++ b/source/common/x86/pixel.h	Mon Nov 25 19:07:38 2013 +0530
>> @@ -347,6 +347,17 @@
>> CHROMA_PIXELSUB_DEF(_sse4);
>> LUMA_PIXELSUB_DEF(_sse4);
>> 
>> +#define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \
>> +    uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel *pix, intptr_t pixstride);
>> +
>> +#define LUMA_PIXELVAR_DEF(cpu) \
>> +    SETUP_LUMA_PIXELVAR_FUNC(8,   4, cpu); \
>> +    SETUP_LUMA_PIXELVAR_FUNC(8,   8, cpu); \
>> +    SETUP_LUMA_PIXELVAR_FUNC(8,  16, cpu); \
>> +    SETUP_LUMA_PIXELVAR_FUNC(8,  32, cpu);
>> +
>> +LUMA_PIXELVAR_DEF(_sse2);
>> +
>> #undef DECL_PIXELS
>> #undef DECL_SUF
>> #undef DECL_HEVC_SSD
>> @@ -357,6 +368,8 @@
>> #undef SETUP_LUMA_PIXELSUB_PS_FUNC
>> #undef CHROMA_PIXELSUB_DEF
>> #undef LUMA_PIXELSUB_DEF
>> +#undef LUMA_PIXELVAR_DEF
>> +#undef SETUP_LUMA_PIXELVAR_FUNC
>> 
>> void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
>> void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
>> _______________________________________________
>> x265-devel mailing list
>> x265-devel at videolan.org
>> https://mailman.videolan.org/listinfo/x265-devel
> 

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 842 bytes
Desc: Message signed with OpenPGP using GPGMail
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20131125/e8759240/attachment.sig>


More information about the x265-devel mailing list