[vlc-devel] [PATCH 07/16] block cipher: use one lookup table for sbox and permutation
Serg Chernyavskiy
glenvt18 at gmail.com
Thu Jul 30 14:14:11 CEST 2015
Please review.
2015-06-26 14:20 GMT+03:00 glenvt18 <glenvt18 at gmail.com>:
> ---
> src/dvbcsa_bs_block.c | 269 +++++++++++++++++++++++++++++++++++---------------
> 1 file changed, 187 insertions(+), 82 deletions(-)
>
> diff --git a/src/dvbcsa_bs_block.c b/src/dvbcsa_bs_block.c
> index e4f532c..9bf36b1 100644
> --- a/src/dvbcsa_bs_block.c
> +++ b/src/dvbcsa_bs_block.c
> @@ -26,14 +26,115 @@
> #include "dvbcsa/dvbcsa.h"
> #include "dvbcsa_bs.h"
>
> -#define BS_XOREQ(a, b) do { dvbcsa_bs_word_t *_t = &(a); *_t = BS_XOR(*_t, (b)); } while (0)
> +/* SIMD targets which support loading streams of two interleaved bytes */
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> +
> +/*
> + sbox + permute table - both values at one look-up
> +
> + LSB (low byte): sbox output
> + MSB (high byte): bit-permuted sbox output
> +*/
> +static const uint16_t dvbcsa_block_sbox_perm[256] =
> + {
> + 0xd43a, 0xd9ea, 0x5168, 0xfdfe, 0xc633, 0x5be9, 0x1888, 0x941a,
> + 0x8a83, 0xbbcf, 0x4be1, 0xf77f, 0xdcba, 0xc9e2, 0x5438, 0x8412,
> + 0x59e8, 0xe227, 0x4361, 0x2e95, 0x300c, 0xe436, 0x6be5, 0x4570,
> + 0xc8a2, 0xa006, 0x8882, 0x757c, 0xa617, 0xcaa3, 0xe026, 0x1349,
> + 0xfcbe, 0xd57a, 0x736d, 0xa347, 0x0bc1, 0x0751, 0xba8f, 0xcff3,
> + 0x39cc, 0x975b, 0xe367, 0x7ebd, 0x3bcd, 0x1418, 0x1008, 0x1bc9,
> + 0xffff, 0x5369, 0xfbef, 0x8203, 0xb14e, 0x1148, 0x914a, 0x2884,
> + 0xf63f, 0x6cb4, 0x0410, 0x2004, 0x3ddc, 0x6ff5, 0x355c, 0xa9c6,
> + 0xa416, 0xdaab, 0x78ac, 0x314c, 0x4ff1, 0xd16a, 0xf22f, 0x743c,
> + 0xd63b, 0x2dd4, 0x2fd5, 0x2c94, 0x0dd0, 0x29c4, 0xc363, 0xc162,
> + 0x4771, 0x4aa1, 0x5ff9, 0xb34f, 0xf02e, 0xd8aa, 0x2bc5, 0xa556,
> + 0xcbe3, 0x5639, 0x8e93, 0xb9ce, 0x6365, 0x6164, 0x69e4, 0x1558,
> + 0x716c, 0x1619, 0x8142, 0x5779, 0x3fdd, 0xf9ee, 0xac96, 0xedf6,
> + 0x988a, 0x79ec, 0xb41e, 0x2a85, 0x8753, 0x2345, 0xbdde, 0xdebb,
> + 0xf57e, 0x900a, 0x9c9a, 0x8613, 0xd02a, 0x3e9d, 0x89c2, 0xb55e,
> + 0x955a, 0xb61f, 0xc432, 0x6635, 0x3c9c, 0x58a8, 0xc773, 0x4430,
> + 0x5229, 0x763d, 0xebe7, 0x8c92, 0xaa87, 0x961b, 0xd22b, 0x934b,
> + 0x6aa5, 0xa757, 0xae97, 0x0140, 0x2615, 0xe9e6, 0x7cbc, 0xb00e,
> + 0xdbeb, 0x8bc3, 0x6434, 0x722d, 0x5cb8, 0x2144, 0x6225, 0x68a4,
> + 0x341c, 0xabc7, 0xc223, 0x7bed, 0x0c90, 0xf16e, 0x0550, 0x0000,
> + 0x1e99, 0xbc9e, 0x334d, 0x1fd9, 0x9dda, 0x3a8d, 0xf36f, 0xb75f,
> + 0xf43e, 0xafd7, 0x4221, 0x6574, 0xa886, 0xbfdf, 0xd36b, 0x2205,
> + 0xb88e, 0x375d, 0xe637, 0x0611, 0x8dd2, 0x5028, 0x6775, 0xadd6,
> + 0xeaa7, 0xe777, 0x6024, 0xfebf, 0x4df0, 0x4cb0, 0x8002, 0xeeb7,
> + 0x5df8, 0x7dfc, 0x0a81, 0x1209, 0x4eb1, 0x0201, 0xe576, 0x0e91,
> + 0x777d, 0xb20f, 0x19c8, 0x48a0, 0xcdf2, 0x9bcb, 0x5578, 0x4160,
> + 0x0fd1, 0xeff7, 0x49e0, 0x6eb5, 0x1c98, 0xc022, 0xceb3, 0x4020,
> + 0x361d, 0xe8a6, 0x9fdb, 0xd77b, 0x1759, 0xbe9f, 0xf8ae, 0x4631,
> + 0xdffb, 0x8fd3, 0xecb6, 0x99ca, 0x8343, 0xc572, 0xa207, 0x6df4,
> + 0x1dd8, 0x0341, 0x2414, 0x2755, 0x320d, 0x2554, 0x9a8b, 0x5eb9,
> + 0x7aad, 0xa146, 0x920b, 0xfaaf, 0x0880, 0x8552, 0x702c, 0xddfa,
> + 0x388c, 0x1a89, 0xe166, 0x7ffd, 0xccb2, 0x5aa9, 0x9e9b, 0x09c0,
> + };
> +
> +#define BLOCK_SBOX_PERMUTE(in_buf, out_buf) \
> + { \
> + uint8_t *src = (uint8_t *)in_buf; \
> + uint16_t *dst = (uint16_t *)out_buf; \
> + uint8_t a, b, c, d; \
> + int j; \
> + for (j = 0; j < BS_BATCH_BYTES * 8; j += 4) \
> + { \
> + a = src[j + 0]; \
> + b = src[j + 1]; \
> + c = src[j + 2]; \
> + d = src[j + 3]; \
> + dst[j + 0] = dvbcsa_block_sbox_perm[a]; \
> + dst[j + 1] = dvbcsa_block_sbox_perm[b]; \
> + dst[j + 2] = dvbcsa_block_sbox_perm[c]; \
> + dst[j + 3] = dvbcsa_block_sbox_perm[d]; \
> + } \
> + }
> +
> +#else
> +
> +#define BLOCK_SBOX(in_buf, out_buf) \
> + { \
> + uint8_t *src = (uint8_t *)in_buf; \
> + uint8_t *dst = (uint8_t *)out_buf; \
> + uint8_t a, b, c, d; \
> + int j; \
> + for (j = 0; j < BS_BATCH_BYTES * 8; j += 4) \
> + { \
> + a = src[j + 0]; \
> + b = src[j + 1]; \
> + c = src[j + 2]; \
> + d = src[j + 3]; \
> + dst[j + 0] = dvbcsa_block_sbox[a]; \
> + dst[j + 1] = dvbcsa_block_sbox[b]; \
> + dst[j + 2] = dvbcsa_block_sbox[c]; \
> + dst[j + 3] = dvbcsa_block_sbox[d]; \
> + } \
> + }
> +
> +#define BLOCK_PERMUTE_LOGIC(in, out) \
> + { \
> + out = BS_OR( \
> + BS_OR( \
> + BS_OR (BS_SHL (BS_AND (in, BS_VAL8(29)), 1), \
> + BS_SHL (BS_AND (in, BS_VAL8(02)), 6)), \
> + BS_OR (BS_SHL (BS_AND (in, BS_VAL8(04)), 3), \
> + BS_SHR (BS_AND (in, BS_VAL8(10)), 2))), \
> + BS_OR( BS_SHR (BS_AND (in, BS_VAL8(40)), 6), \
> + BS_SHR (BS_AND (in, BS_VAL8(80)), 4))); \
> + }
> +
> +#endif
>
> DVBCSA_INLINE static inline void
> dvbcsa_bs_block_decrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_t *r)
> {
> dvbcsa_bs_word_t scratch1[8];
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> + dvbcsa_bs_word_t scratch2[8 * 2];
> +#else
> dvbcsa_bs_word_t scratch2[8];
> - int i, j, g;
> +#endif
> + int i, g;
>
> r += 8 * 56;
>
> @@ -47,48 +148,48 @@ dvbcsa_bs_block_decrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_
> for (g = 0; g < 8; g++)
> scratch1[g] = BS_XOR(block[i], r6_N[g]);
>
> - /* sbox */
> - {
> - uint8_t *p1, *p2;
> - uint8_t a, b, c, d;
> -
> - p1 = (uint8_t *)scratch1;
> - p2 = (uint8_t *)scratch2;
> - for (j = 0; j < BS_BATCH_BYTES * 8; j += 4)
> - {
> - a = p1[j + 0];
> - b = p1[j + 1];
> - c = p1[j + 2];
> - d = p1[j + 3];
> - p2[j + 0] = dvbcsa_block_sbox[a];
> - p2[j + 1] = dvbcsa_block_sbox[b];
> - p2[j + 2] = dvbcsa_block_sbox[c];
> - p2[j + 3] = dvbcsa_block_sbox[d];
> - }
> - }
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> + /* sbox + bit permutation */
> + BLOCK_SBOX_PERMUTE(scratch1, scratch2);
> +#else
> + /* only sbox */
> + BLOCK_SBOX(scratch1, scratch2);
> +#endif
>
> for (g = 0; g < 8; g++)
> {
> - dvbcsa_bs_word_t sbox_out = scratch2[g];
> - dvbcsa_bs_word_t w;
> -
> - /* bit permutation */
> -
> - dvbcsa_bs_word_t in = BS_OR(
> - BS_OR(
> - BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(29)), 1),
> - BS_SHL (BS_AND (sbox_out, BS_VAL8(02)), 6)),
> - BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(04)), 3),
> - BS_SHR (BS_AND (sbox_out, BS_VAL8(10)), 2))),
> - BS_OR( BS_SHR (BS_AND (sbox_out, BS_VAL8(40)), 6),
> - BS_SHR (BS_AND (sbox_out, BS_VAL8(80)), 4)));
> -
> - w = BS_XOR(r[8 * 8 + g], sbox_out);
> - r[8 * 0 + g] = w;
> - BS_XOREQ(r[8 * 2 + g], w);
> - BS_XOREQ(r[8 * 3 + g], w);
> - BS_XOREQ(r[8 * 4 + g], w);
> - BS_XOREQ(r[8 * 6 + g], in);
> + dvbcsa_bs_word_t sbox_out, perm_out, w, tmp1, tmp2, tmp3, tmp4;
> +
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> + BS_LOAD_DEINTERLEAVE_8(scratch2 + g * 2, sbox_out, perm_out);
> +#else
> + sbox_out = scratch2[g];
> + BLOCK_PERMUTE_LOGIC(sbox_out, perm_out);
> +#endif
> + /*
> + w = r[8 * 8 + g] ^ sbox_out;
> + r[8 * 0 + g] = w;
> + r[8 * 2 + g] ^= w;
> + r[8 * 3 + g] ^= w;
> + r[8 * 4 + g] ^= w;
> + r[8 * 6 + g] ^= perm_out;
> + */
> +
> + w = r[8 * 8 + g];
> + tmp1 = r[8 * 2 + g];
> + tmp2 = r[8 * 3 + g];
> + tmp3 = r[8 * 4 + g];
> + w = BS_XOR(w, sbox_out);
> + tmp4 = r[8 * 6 + g];
> + tmp1 = BS_XOR(tmp1, w);
> + tmp2 = BS_XOR(tmp2, w);
> + tmp3 = BS_XOR(tmp3, w);
> + r[8 * 0 + g] = w;
> + r[8 * 2 + g] = tmp1;
> + tmp4 = BS_XOR(tmp4, perm_out);
> + r[8 * 3 + g] = tmp2;
> + r[8 * 4 + g] = tmp3;
> + r[8 * 6 + g] = tmp4;
> }
> }
> }
> @@ -132,8 +233,12 @@ DVBCSA_INLINE static inline void
> dvbcsa_bs_block_encrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_t *r)
> {
> dvbcsa_bs_word_t scratch1[8];
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> + dvbcsa_bs_word_t scratch2[8 * 2];
> +#else
> dvbcsa_bs_word_t scratch2[8];
> - int i, j, g;
> +#endif
> + int i, g;
>
> /* loop over kk[55]..kk[0] */
> for (i = 0; i < 56; i++)
> @@ -143,50 +248,50 @@ dvbcsa_bs_block_encrypt_register (const dvbcsa_bs_word_t *block, dvbcsa_bs_word_
> r += 8; /* virtual shift of registers */
>
> for (g = 0; g < 8; g++)
> - scratch1[g] = BS_XOR(block[i], r7_N[g]);
> -
> - /* sbox */
> - {
> - uint8_t *p1, *p2;
> - uint8_t a, b, c, d;
> + scratch1[g] = BS_XOR(block[i], r7_N[g]);
>
> - p1 = (uint8_t *)scratch1;
> - p2 = (uint8_t *)scratch2;
> - for (j = 0; j < BS_BATCH_BYTES * 8; j += 4)
> - {
> - a = p1[j + 0];
> - b = p1[j + 1];
> - c = p1[j + 2];
> - d = p1[j + 3];
> - p2[j + 0] = dvbcsa_block_sbox[a];
> - p2[j + 1] = dvbcsa_block_sbox[b];
> - p2[j + 2] = dvbcsa_block_sbox[c];
> - p2[j + 3] = dvbcsa_block_sbox[d];
> - }
> - }
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> + /* sbox + bit permutation */
> + BLOCK_SBOX_PERMUTE(scratch1, scratch2);
> +#else
> + /* only sbox */
> + BLOCK_SBOX(scratch1, scratch2);
> +#endif
>
> for (g = 0; g < 8; g++)
> {
> - dvbcsa_bs_word_t sbox_out = scratch2[g];
> - dvbcsa_bs_word_t w = r[-8 * 1 + g];
> -
> - /* bit permutation */
> -
> - dvbcsa_bs_word_t in = BS_OR(
> - BS_OR(
> - BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(29)), 1),
> - BS_SHL (BS_AND (sbox_out, BS_VAL8(02)), 6)),
> - BS_OR (BS_SHL (BS_AND (sbox_out, BS_VAL8(04)), 3),
> - BS_SHR (BS_AND (sbox_out, BS_VAL8(10)), 2))),
> - BS_OR( BS_SHR (BS_AND (sbox_out, BS_VAL8(40)), 6),
> - BS_SHR (BS_AND (sbox_out, BS_VAL8(80)), 4)));
> -
> -
> - r[8 * 7 + g] = BS_XOR(w, sbox_out);
> - BS_XOREQ(r[8 * 1 + g], w);
> - BS_XOREQ(r[8 * 2 + g], w);
> - BS_XOREQ(r[8 * 3 + g], w);
> - BS_XOREQ(r[8 * 5 + g], in);
> + dvbcsa_bs_word_t sbox_out, perm_out, w, tmp1, tmp2, tmp3, tmp4;
> +
> +#ifdef BS_LOAD_DEINTERLEAVE_8
> + BS_LOAD_DEINTERLEAVE_8(scratch2 + g * 2, sbox_out, perm_out);
> +#else
> + sbox_out = scratch2[g];
> + BLOCK_PERMUTE_LOGIC(sbox_out, perm_out);
> +#endif
> + /*
> + w = r[-8 * 1 + g];
> + r[8 * 7 + g] = w ^ sbox_out;
> + r[8 * 1 + g] ^= w;
> + r[8 * 2 + g] ^= w;
> + r[8 * 3 + g] ^= w;
> + r[8 * 5 + g] ^= perm_out;
> + */
> +
> + w = r[-8 * 1 + g];
> + tmp1 = r[8 * 1 + g];
> + tmp2 = r[8 * 2 + g];
> + tmp3 = r[8 * 3 + g];
> + sbox_out = BS_XOR(sbox_out, w);
> + tmp4 = r[8 * 5 + g];
> + tmp1 = BS_XOR(tmp1, w);
> + tmp2 = BS_XOR(tmp2, w);
> + tmp3 = BS_XOR(tmp3, w);
> + r[8 * 7 + g] = sbox_out;
> + r[8 * 1 + g] = tmp1;
> + tmp4 = BS_XOR(tmp4, perm_out);
> + r[8 * 2 + g] = tmp2;
> + r[8 * 3 + g] = tmp3;
> + r[8 * 5 + g] = tmp4;
> }
> }
> }
> --
> 1.9.1
>
More information about the vlc-devel
mailing list