[vlc-devel] [PATCH 01/16] block cipher: improve performance
Jean-Baptiste Kempf
jb at videolan.org
Mon Jul 6 18:08:39 CEST 2015
LGTM.
Does this add a special requirement to the input, like alignment?
Does this add latency?
On 26 Jun, glenvt18 wrote :
> 1. Use a dvbcsa_bs_word_t x 8 scratch buffer between the output of sbox and the input of
> the permute logic. It reduces the dependency between sbox output and permute logic input.
> 2x speed-up of CSA on x86 with both uint64 and sse2.
>
> 2. Lookup 4 values at a time (a, b, c, d) - reduces dependency on a read value.
>
> 3. Use separate scratch buffers for reading and writing.
>
> Changes 2 and 3 add about 5% depending on the architecture.
> ---
> src/dvbcsa_bs_block.c | 96 +++++++++++++++++++++++++++++++++------------------
> 1 file changed, 63 insertions(+), 33 deletions(-)
>
> diff --git a/src/dvbcsa_bs_block.c b/src/dvbcsa_bs_block.c
> index 58f6ca9..e4f532c 100644
> --- a/src/dvbcsa_bs_block.c
> +++ b/src/dvbcsa_bs_block.c
> @@ -31,32 +31,48 @@
> DVBCSA_INLINE static inline void
> dvbcsa_bs_block_decrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_t *r)
> {
> - int i, j, g;
> + dvbcsa_bs_word_t scratch1[8];
> + dvbcsa_bs_word_t scratch2[8];
> + int i, j, g;
>
> r += 8 * 56;
>
> - // loop over kk[55]..kk[0]
> + /* loop over kk[55]..kk[0] */
> for (i = 55; i >= 0; i--)
> {
> dvbcsa_bs_word_t *r6_N = r + 8 * 6;
>
> - r -= 8; /* virtual shift of registers */
> + r -= 8; /* virtual shift of registers */
>
> for (g = 0; g < 8; g++)
> - {
> - union {
> - dvbcsa_bs_word_t so;
> - uint8_t si[BS_BATCH_BYTES];
> - } u;
> + scratch1[g] = BS_XOR(block[i], r6_N[g]);
> +
> + /* sbox */
> + {
> + uint8_t *p1, *p2;
> + uint8_t a, b, c, d;
> +
> + p1 = (uint8_t *)scratch1;
> + p2 = (uint8_t *)scratch2;
> + for (j = 0; j < BS_BATCH_BYTES * 8; j += 4)
> + {
> + a = p1[j + 0];
> + b = p1[j + 1];
> + c = p1[j + 2];
> + d = p1[j + 3];
> + p2[j + 0] = dvbcsa_block_sbox[a];
> + p2[j + 1] = dvbcsa_block_sbox[b];
> + p2[j + 2] = dvbcsa_block_sbox[c];
> + p2[j + 3] = dvbcsa_block_sbox[d];
> + }
> + }
>
> - u.so = BS_XOR(block[i], r6_N[g]);
> -
> - for (j = 0; j < BS_BATCH_BYTES; j++)
> - u.si[j] = dvbcsa_block_sbox[u.si[j]];
> -
> - dvbcsa_bs_word_t sbox_out = u.so;
> + for (g = 0; g < 8; g++)
> + {
> + dvbcsa_bs_word_t sbox_out = scratch2[g];
> + dvbcsa_bs_word_t w;
>
> - // bit permutation
> + /* bit permutation */
>
> dvbcsa_bs_word_t in = BS_OR(
> BS_OR(
> @@ -67,8 +83,7 @@ dvbcsa_bs_block_decrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_
> BS_OR( BS_SHR (BS_AND (sbox_out, BS_VAL8(40)), 6),
> BS_SHR (BS_AND (sbox_out, BS_VAL8(80)), 4)));
>
> - dvbcsa_bs_word_t w = BS_XOR(r[8 * 8 + g], sbox_out);
> -
> + w = BS_XOR(r[8 * 8 + g], sbox_out);
> r[8 * 0 + g] = w;
> BS_XOREQ(r[8 * 2 + g], w);
> BS_XOREQ(r[8 * 3 + g], w);
> @@ -116,30 +131,46 @@ void dvbcsa_bs_block_decrypt_batch(const struct dvbcsa_bs_key_s *key,
> DVBCSA_INLINE static inline void
> dvbcsa_bs_block_encrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_t *r)
> {
> - int i, j, g;
> + dvbcsa_bs_word_t scratch1[8];
> + dvbcsa_bs_word_t scratch2[8];
> + int i, j, g;
>
> - // loop over kk[55]..kk[0]
> + /* loop over kk[55]..kk[0] */
> for (i = 0; i < 56; i++)
> {
> dvbcsa_bs_word_t *r7_N = r + 8 * 7;
>
> - r += 8; /* virtual shift of registers */
> + r += 8; /* virtual shift of registers */
>
> for (g = 0; g < 8; g++)
> - {
> - union {
> - dvbcsa_bs_word_t so;
> - uint8_t si[BS_BATCH_BYTES];
> - } u;
> + scratch1[g] = BS_XOR(block[i], r7_N[g]);
> +
> + /* sbox */
> + {
> + uint8_t *p1, *p2;
> + uint8_t a, b, c, d;
> +
> + p1 = (uint8_t *)scratch1;
> + p2 = (uint8_t *)scratch2;
> + for (j = 0; j < BS_BATCH_BYTES * 8; j += 4)
> + {
> + a = p1[j + 0];
> + b = p1[j + 1];
> + c = p1[j + 2];
> + d = p1[j + 3];
> + p2[j + 0] = dvbcsa_block_sbox[a];
> + p2[j + 1] = dvbcsa_block_sbox[b];
> + p2[j + 2] = dvbcsa_block_sbox[c];
> + p2[j + 3] = dvbcsa_block_sbox[d];
> + }
> + }
>
> - u.so = BS_XOR(block[i], r7_N[g]);
> -
> - for (j = 0; j < BS_BATCH_BYTES; j++)
> - u.si[j] = dvbcsa_block_sbox[u.si[j]];
> -
> - dvbcsa_bs_word_t sbox_out = u.so;
> + for (g = 0; g < 8; g++)
> + {
> + dvbcsa_bs_word_t sbox_out = scratch2[g];
> + dvbcsa_bs_word_t w = r[-8 * 1 + g];
>
> - // bit permutation
> + /* bit permutation */
>
> dvbcsa_bs_word_t in = BS_OR(
> BS_OR(
> @@ -150,7 +181,6 @@ dvbcsa_bs_block_encrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_
> BS_OR( BS_SHR (BS_AND (sbox_out, BS_VAL8(40)), 6),
> BS_SHR (BS_AND (sbox_out, BS_VAL8(80)), 4)));
>
> - dvbcsa_bs_word_t w = r[-8 * 1 + g];
>
> r[8 * 7 + g] = BS_XOR(w, sbox_out);
> BS_XOREQ(r[8 * 1 + g], w);
> --
> 1.9.1
>
> _______________________________________________
> vlc-devel mailing list
> To unsubscribe or modify your subscription options:
> https://mailman.videolan.org/listinfo/vlc-devel
--
With my kindest regards,
--
Jean-Baptiste Kempf
http://www.jbkempf.com/ - +33 672 704 734
Sent from my Electronic Device
More information about the vlc-devel mailing list