[vlc-devel] [PATCH 07/16] block cipher: use one lookup table for sbox and permutation

Serg Chernyavskiy glenvt18 at gmail.com
Thu Jul 30 14:14:11 CEST 2015


Please review.

2015-06-26 14:20 GMT+03:00 glenvt18 <glenvt18 at gmail.com>:
> ---
>  src/dvbcsa_bs_block.c | 269 +++++++++++++++++++++++++++++++++++---------------
>  1 file changed, 187 insertions(+), 82 deletions(-)
>
> diff --git a/src/dvbcsa_bs_block.c b/src/dvbcsa_bs_block.c
> index e4f532c..9bf36b1 100644
> --- a/src/dvbcsa_bs_block.c
> +++ b/src/dvbcsa_bs_block.c
> @@ -26,14 +26,115 @@
>  #include "dvbcsa/dvbcsa.h"
>  #include "dvbcsa_bs.h"
>
> -#define BS_XOREQ(a, b) do { dvbcsa_bs_word_t *_t = &(a); *_t = BS_XOR(*_t, (b)); } while (0)
> +/* SIMD targets which support loading streams of two interleaved bytes */
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> +
> +/*
> +  sbox + permute table - both values in one look-up
> +
> +  low byte:  sbox output
> +  high byte: permuted sbox output
> +*/
> +static const uint16_t dvbcsa_block_sbox_perm[256] =
> +  {
> +    0xd43a, 0xd9ea, 0x5168, 0xfdfe, 0xc633, 0x5be9, 0x1888, 0x941a,
> +    0x8a83, 0xbbcf, 0x4be1, 0xf77f, 0xdcba, 0xc9e2, 0x5438, 0x8412,
> +    0x59e8, 0xe227, 0x4361, 0x2e95, 0x300c, 0xe436, 0x6be5, 0x4570,
> +    0xc8a2, 0xa006, 0x8882, 0x757c, 0xa617, 0xcaa3, 0xe026, 0x1349,
> +    0xfcbe, 0xd57a, 0x736d, 0xa347, 0x0bc1, 0x0751, 0xba8f, 0xcff3,
> +    0x39cc, 0x975b, 0xe367, 0x7ebd, 0x3bcd, 0x1418, 0x1008, 0x1bc9,
> +    0xffff, 0x5369, 0xfbef, 0x8203, 0xb14e, 0x1148, 0x914a, 0x2884,
> +    0xf63f, 0x6cb4, 0x0410, 0x2004, 0x3ddc, 0x6ff5, 0x355c, 0xa9c6,
> +    0xa416, 0xdaab, 0x78ac, 0x314c, 0x4ff1, 0xd16a, 0xf22f, 0x743c,
> +    0xd63b, 0x2dd4, 0x2fd5, 0x2c94, 0x0dd0, 0x29c4, 0xc363, 0xc162,
> +    0x4771, 0x4aa1, 0x5ff9, 0xb34f, 0xf02e, 0xd8aa, 0x2bc5, 0xa556,
> +    0xcbe3, 0x5639, 0x8e93, 0xb9ce, 0x6365, 0x6164, 0x69e4, 0x1558,
> +    0x716c, 0x1619, 0x8142, 0x5779, 0x3fdd, 0xf9ee, 0xac96, 0xedf6,
> +    0x988a, 0x79ec, 0xb41e, 0x2a85, 0x8753, 0x2345, 0xbdde, 0xdebb,
> +    0xf57e, 0x900a, 0x9c9a, 0x8613, 0xd02a, 0x3e9d, 0x89c2, 0xb55e,
> +    0x955a, 0xb61f, 0xc432, 0x6635, 0x3c9c, 0x58a8, 0xc773, 0x4430,
> +    0x5229, 0x763d, 0xebe7, 0x8c92, 0xaa87, 0x961b, 0xd22b, 0x934b,
> +    0x6aa5, 0xa757, 0xae97, 0x0140, 0x2615, 0xe9e6, 0x7cbc, 0xb00e,
> +    0xdbeb, 0x8bc3, 0x6434, 0x722d, 0x5cb8, 0x2144, 0x6225, 0x68a4,
> +    0x341c, 0xabc7, 0xc223, 0x7bed, 0x0c90, 0xf16e, 0x0550, 0x0000,
> +    0x1e99, 0xbc9e, 0x334d, 0x1fd9, 0x9dda, 0x3a8d, 0xf36f, 0xb75f,
> +    0xf43e, 0xafd7, 0x4221, 0x6574, 0xa886, 0xbfdf, 0xd36b, 0x2205,
> +    0xb88e, 0x375d, 0xe637, 0x0611, 0x8dd2, 0x5028, 0x6775, 0xadd6,
> +    0xeaa7, 0xe777, 0x6024, 0xfebf, 0x4df0, 0x4cb0, 0x8002, 0xeeb7,
> +    0x5df8, 0x7dfc, 0x0a81, 0x1209, 0x4eb1, 0x0201, 0xe576, 0x0e91,
> +    0x777d, 0xb20f, 0x19c8, 0x48a0, 0xcdf2, 0x9bcb, 0x5578, 0x4160,
> +    0x0fd1, 0xeff7, 0x49e0, 0x6eb5, 0x1c98, 0xc022, 0xceb3, 0x4020,
> +    0x361d, 0xe8a6, 0x9fdb, 0xd77b, 0x1759, 0xbe9f, 0xf8ae, 0x4631,
> +    0xdffb, 0x8fd3, 0xecb6, 0x99ca, 0x8343, 0xc572, 0xa207, 0x6df4,
> +    0x1dd8, 0x0341, 0x2414, 0x2755, 0x320d, 0x2554, 0x9a8b, 0x5eb9,
> +    0x7aad, 0xa146, 0x920b, 0xfaaf, 0x0880, 0x8552, 0x702c, 0xddfa,
> +    0x388c, 0x1a89, 0xe166, 0x7ffd, 0xccb2, 0x5aa9, 0x9e9b, 0x09c0,
> +  };
> +
> +#define BLOCK_SBOX_PERMUTE(in_buf, out_buf) \
> +    { \
> +    uint8_t *src = (uint8_t *)in_buf; \
> +    uint16_t *dst = (uint16_t *)out_buf; \
> +    uint8_t a, b, c, d; \
> +    int j; \
> +    for (j = 0; j < BS_BATCH_BYTES * 8; j += 4) \
> +      { \
> +        a = src[j + 0]; \
> +        b = src[j + 1]; \
> +        c = src[j + 2]; \
> +        d = src[j + 3]; \
> +        dst[j + 0] = dvbcsa_block_sbox_perm[a]; \
> +        dst[j + 1] = dvbcsa_block_sbox_perm[b]; \
> +        dst[j + 2] = dvbcsa_block_sbox_perm[c]; \
> +        dst[j + 3] = dvbcsa_block_sbox_perm[d]; \
> +      } \
> +    }
> +
> +#else
> +
> +#define BLOCK_SBOX(in_buf, out_buf) \
> +    { \
> +    uint8_t *src = (uint8_t *)in_buf; \
> +    uint8_t *dst = (uint8_t *)out_buf; \
> +    uint8_t a, b, c, d; \
> +    int j; \
> +    for (j = 0; j < BS_BATCH_BYTES * 8; j += 4) \
> +      { \
> +        a = src[j + 0]; \
> +        b = src[j + 1]; \
> +        c = src[j + 2]; \
> +        d = src[j + 3]; \
> +        dst[j + 0] = dvbcsa_block_sbox[a]; \
> +        dst[j + 1] = dvbcsa_block_sbox[b]; \
> +        dst[j + 2] = dvbcsa_block_sbox[c]; \
> +        dst[j + 3] = dvbcsa_block_sbox[d]; \
> +      } \
> +    }
> +
> +#define BLOCK_PERMUTE_LOGIC(in, out) \
> +    { \
> +    out = BS_OR( \
> +        BS_OR( \
> +              BS_OR (BS_SHL (BS_AND (in, BS_VAL8(29)), 1), \
> +                     BS_SHL (BS_AND (in, BS_VAL8(02)), 6)), \
> +              BS_OR (BS_SHL (BS_AND (in, BS_VAL8(04)), 3), \
> +                     BS_SHR (BS_AND (in, BS_VAL8(10)), 2))), \
> +        BS_OR(       BS_SHR (BS_AND (in, BS_VAL8(40)), 6), \
> +                     BS_SHR (BS_AND (in, BS_VAL8(80)), 4))); \
> +    }
> +
> +#endif
>
>  DVBCSA_INLINE static inline void
>  dvbcsa_bs_block_decrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_t *r)
>  {
>    dvbcsa_bs_word_t scratch1[8];
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> +  dvbcsa_bs_word_t scratch2[8 * 2];
> +#else
>    dvbcsa_bs_word_t scratch2[8];
> -  int i, j, g;
> +#endif
> +  int i, g;
>
>    r += 8 * 56;
>
> @@ -47,48 +148,48 @@ dvbcsa_bs_block_decrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_
>        for (g = 0; g < 8; g++)
>          scratch1[g] = BS_XOR(block[i], r6_N[g]);
>
> -      /* sbox */
> -      {
> -      uint8_t *p1, *p2;
> -      uint8_t a, b, c, d;
> -
> -      p1 = (uint8_t *)scratch1;
> -      p2 = (uint8_t *)scratch2;
> -      for (j = 0; j < BS_BATCH_BYTES * 8; j += 4)
> -        {
> -          a = p1[j + 0];
> -          b = p1[j + 1];
> -          c = p1[j + 2];
> -          d = p1[j + 3];
> -          p2[j + 0] = dvbcsa_block_sbox[a];
> -          p2[j + 1] = dvbcsa_block_sbox[b];
> -          p2[j + 2] = dvbcsa_block_sbox[c];
> -          p2[j + 3] = dvbcsa_block_sbox[d];
> -        }
> -      }
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> +      /* sbox + bit permutation */
> +      BLOCK_SBOX_PERMUTE(scratch1, scratch2);
> +#else
> +      /* only sbox */
> +      BLOCK_SBOX(scratch1, scratch2);
> +#endif
>
>        for (g = 0; g < 8; g++)
>          {
> -          dvbcsa_bs_word_t sbox_out = scratch2[g];
> -          dvbcsa_bs_word_t w;
> -
> -          /* bit permutation */
> -
> -         dvbcsa_bs_word_t in = BS_OR(
> -                                     BS_OR(
> -                                           BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(29)), 1),
> -                                                  BS_SHL (BS_AND (sbox_out, BS_VAL8(02)), 6)),
> -                                           BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(04)), 3),
> -                                                  BS_SHR (BS_AND (sbox_out, BS_VAL8(10)), 2))),
> -                                     BS_OR(       BS_SHR (BS_AND (sbox_out, BS_VAL8(40)), 6),
> -                                                  BS_SHR (BS_AND (sbox_out, BS_VAL8(80)), 4)));
> -
> -          w = BS_XOR(r[8 * 8 + g], sbox_out);
> -         r[8 * 0 + g] = w;
> -         BS_XOREQ(r[8 * 2 + g], w);
> -         BS_XOREQ(r[8 * 3 + g], w);
> -         BS_XOREQ(r[8 * 4 + g], w);
> -         BS_XOREQ(r[8 * 6 + g], in);
> +          dvbcsa_bs_word_t sbox_out, perm_out, w, tmp1, tmp2, tmp3, tmp4;
> +
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> +          BS_LOAD_DEINTERLEAVE_8(scratch2 + g * 2, sbox_out, perm_out);
> +#else
> +          sbox_out = scratch2[g];
> +          BLOCK_PERMUTE_LOGIC(sbox_out, perm_out);
> +#endif
> +          /*
> +              w = r[8 * 8 + g] ^ sbox_out;
> +              r[8 * 0 + g] = w;
> +              r[8 * 2 + g] ^= w;
> +              r[8 * 3 + g] ^= w;
> +              r[8 * 4 + g] ^= w;
> +              r[8 * 6 + g] ^= perm_out;
> +          */
> +
> +          w = r[8 * 8 + g];
> +          tmp1 = r[8 * 2 + g];
> +          tmp2 = r[8 * 3 + g];
> +          tmp3 = r[8 * 4 + g];
> +          w = BS_XOR(w, sbox_out);
> +          tmp4 = r[8 * 6 + g];
> +          tmp1 = BS_XOR(tmp1, w);
> +          tmp2 = BS_XOR(tmp2, w);
> +          tmp3 = BS_XOR(tmp3, w);
> +          r[8 * 0 + g] = w;
> +          r[8 * 2 + g] = tmp1;
> +          tmp4 = BS_XOR(tmp4, perm_out);
> +          r[8 * 3 + g] = tmp2;
> +          r[8 * 4 + g] = tmp3;
> +          r[8 * 6 + g] = tmp4;
>         }
>      }
>  }
> @@ -132,8 +233,12 @@ DVBCSA_INLINE static inline void
>  dvbcsa_bs_block_encrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_t *r)
>  {
>    dvbcsa_bs_word_t scratch1[8];
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> +  dvbcsa_bs_word_t scratch2[8 * 2];
> +#else
>    dvbcsa_bs_word_t scratch2[8];
> -  int i, j, g;
> +#endif
> +  int i, g;
>
>    /* loop over kk[55]..kk[0] */
>    for (i = 0; i < 56; i++)
> @@ -143,50 +248,50 @@ dvbcsa_bs_block_encrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_
>        r += 8;   /* virtual shift of registers */
>
>        for (g = 0; g < 8; g++)
> -          scratch1[g] = BS_XOR(block[i], r7_N[g]);
> -
> -      /* sbox */
> -      {
> -      uint8_t *p1, *p2;
> -      uint8_t a, b, c, d;
> +         scratch1[g] = BS_XOR(block[i], r7_N[g]);
>
> -      p1 = (uint8_t *)scratch1;
> -      p2 = (uint8_t *)scratch2;
> -      for (j = 0; j < BS_BATCH_BYTES * 8; j += 4)
> -        {
> -          a = p1[j + 0];
> -          b = p1[j + 1];
> -          c = p1[j + 2];
> -          d = p1[j + 3];
> -          p2[j + 0] = dvbcsa_block_sbox[a];
> -          p2[j + 1] = dvbcsa_block_sbox[b];
> -          p2[j + 2] = dvbcsa_block_sbox[c];
> -          p2[j + 3] = dvbcsa_block_sbox[d];
> -        }
> -      }
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> +      /* sbox + bit permutation */
> +      BLOCK_SBOX_PERMUTE(scratch1, scratch2);
> +#else
> +      /* only sbox */
> +      BLOCK_SBOX(scratch1, scratch2);
> +#endif
>
>        for (g = 0; g < 8; g++)
>          {
> -          dvbcsa_bs_word_t sbox_out = scratch2[g];
> -          dvbcsa_bs_word_t w = r[-8 * 1 + g];
> -
> -          /* bit permutation */
> -
> -         dvbcsa_bs_word_t in = BS_OR(
> -                                     BS_OR(
> -                                           BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(29)), 1),
> -                                                  BS_SHL (BS_AND (sbox_out, BS_VAL8(02)), 6)),
> -                                           BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(04)), 3),
> -                                                  BS_SHR (BS_AND (sbox_out, BS_VAL8(10)), 2))),
> -                                     BS_OR(       BS_SHR (BS_AND (sbox_out, BS_VAL8(40)), 6),
> -                                                  BS_SHR (BS_AND (sbox_out, BS_VAL8(80)), 4)));
> -
> -
> -         r[8 * 7 + g] = BS_XOR(w, sbox_out);
> -         BS_XOREQ(r[8 * 1 + g], w);
> -         BS_XOREQ(r[8 * 2 + g], w);
> -         BS_XOREQ(r[8 * 3 + g], w);
> -         BS_XOREQ(r[8 * 5 + g], in);
> +          dvbcsa_bs_word_t sbox_out, perm_out, w, tmp1, tmp2, tmp3, tmp4;
> +
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> +          BS_LOAD_DEINTERLEAVE_8(scratch2 + g * 2, sbox_out, perm_out);
> +#else
> +          sbox_out = scratch2[g];
> +          BLOCK_PERMUTE_LOGIC(sbox_out, perm_out);
> +#endif
> +          /*
> +              w = r[-8 * 1 + g];
> +              r[8 * 7 + g] = w ^ sbox_out;
> +              r[8 * 1 + g] ^= w;
> +              r[8 * 2 + g] ^= w;
> +              r[8 * 3 + g] ^= w;
> +              r[8 * 5 + g] ^= perm_out;
> +          */
> +
> +          w = r[-8 * 1 + g];
> +          tmp1 = r[8 * 1 + g];
> +          tmp2 = r[8 * 2 + g];
> +          tmp3 = r[8 * 3 + g];
> +          sbox_out = BS_XOR(sbox_out, w);
> +          tmp4 = r[8 * 5 + g];
> +          tmp1 = BS_XOR(tmp1, w);
> +          tmp2 = BS_XOR(tmp2, w);
> +          tmp3 = BS_XOR(tmp3, w);
> +          r[8 * 7 + g] = sbox_out;
> +          r[8 * 1 + g] = tmp1;
> +          tmp4 = BS_XOR(tmp4, perm_out);
> +          r[8 * 2 + g] = tmp2;
> +          r[8 * 3 + g] = tmp3;
> +          r[8 * 5 + g] = tmp4;
>         }
>      }
>  }
> --
> 1.9.1
>



More information about the vlc-devel mailing list