[x265] [PATCH Only Review, don't merge] Assembly routine for filterHorizontal_p_p() for 4 tap filter

Jason Garrett-Glaser jason at x264.com
Thu Sep 19 21:27:35 CEST 2013


> +%if ARCH_X86_64 == 0
> +
> +SECTION_RODATA 32
> +tab_leftmask:   dq 0x0000000000000000
> +                dq 0x00000000000000FF
> +                dq 0x000000000000FFFF
> +                dq 0x0000000000FFFFFF
> +                dq 0x00000000FFFFFFFF
> +                dq 0x000000FFFFFFFFFF
> +                dq 0x0000FFFFFFFFFFFF
> +                dq 0x00FFFFFFFFFFFFFF

This can probably be replaced by an unaligned load from a single table
of db 0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1
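Something like this (untested sketch; the -1 bytes go first here so the
low widthleft bytes end up set, matching how leftmask feeds the blend
below, and the index is negated since an addressing mode can't subtract
a register):

tab_leftmask:   times 8 db -1
                times 8 db 0

    neg         widthleft
    movq        leftmask,   [tab_leftmask + 8 + widthleft]
    neg         widthleft   ; still needed for the tail check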

> +INIT_XMM sse4

These generally go above the cglobal for each function, not at the
start of the file.
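i.e. each function gets its own:

INIT_XMM sse4
cglobal filterHorizontal_p_p_4, 0, 7, 8
    ...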

> +%macro FILTER_H4 2
> +    movu        %1, [src + col]
> +    pshufb      %2, %1, Tm4
> +    pmaddubsw   %2, coef2
> +    pshufb      %1, %1, Tm5
> +    pmaddubsw   %1, coef2
> +    phaddw      %2, %1
> +    paddw       %2, sumOffset
> +    psraw       %2, headRoom
> +    packuswb    %2, %1
> +%endmacro

phaddw is pretty slow (~3 uops) and should be avoided if you can, but
if there's no good solution, then using it is okay. Personally I'd try
to restructure the shuffles so that you don't have to add adjacent
values, but if you can't, that's okay.

paddw/psraw can probably be replaced by pmulhrsw with a magic
constant: e.g. ((x + 32) >> 6) is the same as pmulhrsw(x, 512). This
tends to be faster in my experience.
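Here headRoom is 6 and the offset is 32, so it maps directly onto that
(sketch; assumes a pw_512 constant, times 8 dw 512, aligned in the data
section; add one if there isn't one already):

    ; replaces paddw %2, sumOffset / psraw %2, headRoom
    pmulhrsw    %2, [pw_512]    ; == (x + 32) >> 6

which also lets you drop sumOffset and its setup code entirely.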

> +;-----------------------------------------------------------------------------
> +; void filterHorizontal_p_p_4(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, short const *coeff)
> +;-----------------------------------------------------------------------------
> +cglobal filterHorizontal_p_p_4, 0, 7, 8
> +%define headRoom    14-8
> +%define offset      (1 << (headRoom - 1))
> +%define offset2     (offset | (offset << 16))
> +%define src         r0
> +%define dst         r1
> +%define row         r2
> +%define col         r3
> +%define width       r4
> +%define widthleft   r5
> +%define coef2       m7
> +%define sumOffset   m6
> +%define Tm5         m5
> +%define Tm4         m4
> +%define x2          m3
> +%define x1          m2
> +%define x0          m1
> +%define leftmask    m0
> +%define tmp         r0
> +%define tmp1        r1

x86inc named parameters are probably useful here.
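Something like this, perhaps (sketch; the first two numbers, i.e. how
many arguments get loaded and how many gprs are used, would need tuning
for the x86_32 path):

cglobal filterHorizontal_p_p_4, 4, 7, 8, src, srcStride, dst, dstStride, width, height, coeff

which gives you named registers plus widthm/heightm/coeffm for the
stack arguments, instead of the %define block and raw r4m/r5m/r6m.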

> +
> +    mov         tmp,        r6m
> +    movu        coef2,      [tmp]
> +    packuswb    coef2,      coef2
> +    pshufd      coef2,      coef2,      0

Is there some reason the input coefficients are stored as 16-bit
instead of 8-bit?  This step seems like it's going to be repeated a
lot.

> +
> +    mov         tmp,        offset2
> +    movd        sumOffset,  tmp
> +    pshufd      sumOffset,  sumOffset,  0

You can movd directly from memory; going through a register is much
slower, especially on AMD machines.
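e.g. (assuming there's a pw_32 constant, times 8 dw 32, in the data
section; its first dword is exactly 0x00200020):

    movd        sumOffset,  [pw_32]
    pshufd      sumOffset,  sumOffset,  0

or simpler still, mova the whole thing and drop the pshufd:

    mova        sumOffset,  [pw_32]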

> +    mov         width,      r4m
> +    mov         widthleft,  width
> +    and         width,      ~7
> +    and         widthleft,  7

mov width, r4m
mov widthleft, 7
and widthleft, width
and width, ~7

(probably a lower latency instruction sequence)

> +    movq        leftmask,   [tab_leftmask + widthleft * 8]
> +    mova        Tm4,        [tab_Tm]
> +    mova        Tm5,        [tab_Tm + 16]
> +
> +    mov         src,        r0m
> +    mov         dst,        r2m
> +    mov         row,        r5m

A lot of these can probably be auto-loaded during the function
prologue, unless there's a reason not to.

> +_loop_row:
> +    test        row,        row
> +    jz          _end_row

This condition should be at the end.

> +    xor         col,        col
> +_loop_col:
> +    cmp         col,        width
> +    jge         _end_col

Same as above.

> +    FILTER_H4   x0, x1
> +    movq        [dst + col], x1

nit: personally I'd do "movh" here.

> +    add         col,        8
> +    jmp         _loop_col
> +
> +_end_col:
> +    test        widthleft,  widthleft
> +    jz          _next_row
> +
> +    movq        x2, [dst + col]
> +    FILTER_H4   x0, x1
> +    pblendvb    x2, x2, x1, leftmask

Doesn't pblendvb require some weird thing (in non-AVX) where the xmm0
register is hardcoded?
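For reference, the SSE4.1 form only has the implicit-mask version:

    ; non-AVX: the blend mask is always xmm0
    pblendvb    x2, x1          ; x2 = (xmm0 byte MSB set) ? x1 : x2

leftmask happens to be %defined as m0 above, so this may work out, but
relying on that implicitly is fragile.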

> +    movq        [dst + col], x2
> +
> +_next_row:
> +    add         src,        r1m
> +    add         dst,        r3m
> +    dec         row
> +    jmp         _loop_row

As before, loop conditions should generally be at the end, not the
start; it avoids a wasteful jump.
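e.g. for the row loop (sketch; assumes height is always at least 1,
otherwise keep a single check before entering the loop):

_next_row:
    add         src,        r1m
    add         dst,        r3m
    dec         row
    jnz         _loop_row
_end_row:

and likewise a cmp col, width / jl _loop_col at the bottom of the
column loop (with the caveat that width can be 0 after the and ~7, so
that loop may still need an entry check).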

Jason

