[vlc-devel] [PATCH 02/16] stream cipher: refactoring

glenvt18 glenvt18 at gmail.com
Fri Jun 26 13:19:56 CEST 2015


1. Move the stream cipher kernel code into a separate file that is included twice
(with and without DVBCSA_BS_STREAM_KERNEL_INIT defined) instead of duplicating it.
2. Store the stream cipher registers in a structure passed as an argument.
3. Use virtual shift registers for A and B to avoid copying on each round.
---
 src/Makefile.am                 |   3 +-
 src/dvbcsa_bs_stream.c          | 410 +++-------------------------------------
 src/dvbcsa_bs_stream_kernel.h   |  23 +++
 src/dvbcsa_bs_stream_kernel.inc | 259 +++++++++++++++++++++++++
 4 files changed, 315 insertions(+), 380 deletions(-)
 create mode 100644 src/dvbcsa_bs_stream_kernel.h
 create mode 100644 src/dvbcsa_bs_stream_kernel.inc

diff --git a/src/Makefile.am b/src/Makefile.am
index d01c14e..dec4f55 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -7,7 +7,8 @@ libdvbcsa_la_SOURCES = dvbcsa_algo.c dvbcsa_block.c dvbcsa_bs_algo.c	\
 	dvbcsa_bs_block.c dvbcsa_bs_key.c dvbcsa_bs_stream.c		\
 	dvbcsa_stream.c dvbcsa_bs.h dvbcsa_pv.h dvbcsa_bs_uint64.h	\
 	dvbcsa_bs_uint32.h dvbcsa_bs_mmx.h dvbcsa_bs_sse.h		\
-	dvbcsa_bs_altivec.h dvbcsa_bs_transpose.c dvbcsa_key.c
+	dvbcsa_bs_altivec.h dvbcsa_bs_transpose.c dvbcsa_key.c	\
+	dvbcsa_bs_stream_kernel.inc dvbcsa_bs_stream_kernel.h
 
 if TRANSPOSE_128
 libdvbcsa_la_SOURCES += dvbcsa_bs_transpose128.c
diff --git a/src/dvbcsa_bs_stream.c b/src/dvbcsa_bs_stream.c
index 7cb7f09..86e8a6f 100644
--- a/src/dvbcsa_bs_stream.c
+++ b/src/dvbcsa_bs_stream.c
@@ -25,411 +25,63 @@
 
 #include "dvbcsa/dvbcsa.h"
 #include "dvbcsa_bs.h"
+#include "dvbcsa_bs_stream_kernel.h"
 
-static void DVBCSA_INLINE inline
-dvbcsa_bs_stream_sbox1(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
-		       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
-		       dvbcsa_bs_word_t fe,
-		       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
-{
-  dvbcsa_bs_word_t tmp0, tmp1, tmp2, tmp3;
-
-  tmp0 = BS_XOR (fa, BS_XOR (fb, BS_NOT (BS_OR (BS_XOR (BS_OR (fa, fb), fc), BS_XOR (fc, fd)))));
-  tmp1 = BS_XOR (BS_OR (fa, fb), BS_NOT (BS_AND (fc, BS_OR (fa, BS_XOR (fb, fd)))));
-  tmp2 = BS_XOR (fa, BS_XOR (BS_AND (fb, fd), BS_OR (BS_AND (fa, fd), fc)));
-  tmp3 = BS_XOR (BS_AND (fa, fc), BS_XOR (fa, BS_OR (BS_AND (fa, fb), fd)));
-
-  *sa = BS_XOR (tmp0, BS_AND (fe, tmp1));
-  *sb = BS_XOR (tmp2, BS_AND (fe, tmp3));
-}
-
-static void DVBCSA_INLINE inline
-dvbcsa_bs_stream_sbox2(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
-		       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
-		       dvbcsa_bs_word_t fe,
-		       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
-{
-  dvbcsa_bs_word_t tmp0, tmp1, tmp2, tmp3;
-
-  tmp0 = BS_XOR (fa, BS_XOR (BS_AND (fb, BS_OR (fc, fd)), BS_XOR (fc, BS_NOT (fd))));
-  tmp1 = BS_OR (BS_AND (fa, BS_XOR (fb, fd)), BS_AND (BS_OR (fa, fb), fc));
-  tmp2 = BS_XOR (BS_AND (fb, fd), BS_OR (BS_AND (fa, fd), BS_XOR (fb, BS_NOT (fc))));
-  tmp3 = BS_OR (BS_AND (fa, fd), BS_XOR (fa, BS_XOR (fb, BS_AND (fc, fd))));
-
-  *sa = BS_XOR (tmp0, BS_AND (fe, tmp1));
-  *sb = BS_XOR (tmp2, BS_AND (fe, tmp3));
-}
-
-static void DVBCSA_INLINE inline
-dvbcsa_bs_stream_sbox3(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
-		       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
-		       dvbcsa_bs_word_t fe,
-		       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
-{
-  dvbcsa_bs_word_t tmp0, tmp1, tmp2;
-
-  tmp0 = BS_XOR (fa, BS_XOR (fb, BS_XOR (BS_AND (fc, BS_OR (fa, fd)), fd)));
-  tmp1 = BS_XOR (BS_AND (fa, fc), BS_OR (BS_XOR (fa, fd), BS_XOR (BS_OR (fb, fc), BS_NOT (fd))));
-  tmp2 = BS_XOR (fa, BS_XOR (BS_AND (BS_XOR (fb, fc), fd), fc));
-
-  *sa = BS_XOR (tmp0, BS_AND (BS_NOT (fe), tmp1));
-  *sb = BS_XOR (tmp2, fe);
-}
-
-static void DVBCSA_INLINE inline
-dvbcsa_bs_stream_sbox4(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
-		       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
-		       dvbcsa_bs_word_t fe,
-		       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
-{
-  dvbcsa_bs_word_t tmp0, tmp1, tmp2;
-
-  tmp0 = BS_XOR (fa, BS_OR (BS_AND (fc, BS_XOR (fa, fd)), BS_XOR (fb, BS_OR (fc, BS_NOT (fd)))));
-  tmp1 = BS_XOR (BS_AND (fa, fb), BS_XOR (fb, BS_XOR (BS_AND (BS_OR (fa, fc), fd), fc)));
-  tmp2 = BS_XOR (fa, BS_OR (BS_AND (fb, fc), BS_XOR (BS_OR (BS_AND (fa, BS_XOR (fb, fd)), fc), fd)));
-
-  *sa = BS_XOR (tmp0, BS_AND (fe, BS_XOR (tmp1, tmp0)));
-  *sb = BS_XOR (BS_XOR (*sa, tmp2), fe);
-}
-
-static void DVBCSA_INLINE inline
-dvbcsa_bs_stream_sbox5(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
-		       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
-		       dvbcsa_bs_word_t fe,
-		       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
-{
-  dvbcsa_bs_word_t tmp0, tmp1, tmp2, tmp3;
-
-  tmp0 = BS_OR (BS_XOR (BS_AND (fa, BS_OR (fb, fc)), fb), BS_XOR (BS_OR (BS_XOR (fa, fc), fd), BS_VAL8(ff)));
-  tmp1 = BS_XOR (fb, BS_AND (BS_XOR (fc, fd), BS_XOR (fc, BS_OR (fb, BS_XOR (fa, fd)))));
-  tmp2 = BS_XOR (BS_AND (fa, fc), BS_XOR (fb, BS_AND (BS_OR (fb, BS_XOR (fa, fc)), fd)));
-  tmp3 = BS_OR (BS_AND (BS_XOR (fa, fb), BS_XOR (fc, BS_VAL8(ff))), fd);
-
-  *sa = BS_XOR (tmp0, BS_AND (fe, tmp1));
-  *sb = BS_XOR (tmp2, BS_AND (fe, tmp3));
-}
-
-static void DVBCSA_INLINE inline
-dvbcsa_bs_stream_sbox6(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
-		       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
-		       dvbcsa_bs_word_t fe,
-		       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
-{
-  dvbcsa_bs_word_t tmp0, tmp1, tmp2, tmp3;
-
-  tmp0 = BS_XOR (BS_AND (BS_AND (fa, fc), fd), BS_XOR (BS_AND (fb, BS_OR (fa, fd)), fc));
-  tmp1 = BS_NOT (BS_AND (BS_XOR (fa, fc), fd));
-  tmp2 = BS_XOR (BS_AND (fa, BS_OR (fb, fc)), BS_XOR (fb, BS_OR (BS_AND (fb, fc), fd)));
-  tmp3 = BS_AND (fc, BS_XOR (BS_AND (fa, BS_XOR (fb, fd)), BS_OR (fb, fd)));
-
-  *sa = BS_XOR (tmp0, BS_AND (fe, tmp1));
-  *sb = BS_XOR (tmp2, BS_AND (fe, tmp3));
-}
-
-static void DVBCSA_INLINE inline
-dvbcsa_bs_stream_sbox7(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
-		       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
-		       dvbcsa_bs_word_t fe,
-		       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
-{
-  dvbcsa_bs_word_t tmp0, tmp1, tmp2, tmp3;
-
-  tmp0 = BS_XOR (fb, BS_OR (BS_AND (fc, fd), BS_XOR (fa, BS_XOR (fc, fd))));
-  tmp1 = BS_AND (BS_OR (fb, fd), BS_OR (BS_AND (fa, fc), BS_XOR (fb, BS_XOR (fc, fd))));
-  tmp2 = BS_XOR (BS_OR (fa, fb), BS_XOR (BS_AND (fc, BS_OR (fb, fd)), fd));
-  tmp3 = BS_OR (fd, BS_XOR (BS_AND (fa, fc), BS_VAL8(ff)));
-
-  *sa = BS_XOR (tmp0, BS_AND (fe, tmp1));
-  *sb = BS_XOR (tmp2, BS_AND (fe, tmp3));
-}
+#define DVBCSA_BS_STREAM_KERNEL_INIT 
+#include "dvbcsa_bs_stream_kernel.inc"
+#undef DVBCSA_BS_STREAM_KERNEL_INIT 
+#include "dvbcsa_bs_stream_kernel.inc"
 
 void
 dvbcsa_bs_stream_cipher_batch(const struct dvbcsa_bs_key_s *key,
 			      const struct dvbcsa_bs_batch_s *pcks,
 			      unsigned int maxlen)
 {
-  dvbcsa_bs_word_t A[10][4];
-  dvbcsa_bs_word_t B[10][4];
-  dvbcsa_bs_word_t X[4];
-  dvbcsa_bs_word_t Y[4];
-  dvbcsa_bs_word_t Z[4];
-  dvbcsa_bs_word_t D[4];
-  dvbcsa_bs_word_t E[4];
-  dvbcsa_bs_word_t F[4];
-  dvbcsa_bs_word_t p;
-  dvbcsa_bs_word_t q;
-  dvbcsa_bs_word_t r;
-  dvbcsa_bs_word_t in1[4];
-  dvbcsa_bs_word_t in2[4];
-  dvbcsa_bs_word_t extra_B[4];
-  dvbcsa_bs_word_t s1a, s1b, s2a, s2b, s3a, s3b, s4a, s4b, s5a, s5b, s6a, s6b, s7a, s7b;
-  dvbcsa_bs_word_t next_E[4];
-  dvbcsa_bs_word_t tmp0, tmp1, tmp3, tmp4;
-  dvbcsa_bs_word_t sb[64];
-  int h, i, j, k, b;
+  struct dvbcsa_bs_stream_regs_s regs __attribute__ ((aligned (BS_BATCH_BYTES)));
 
-  dvbcsa_bs_stream_transpose_in(pcks, sb);
+  int i, b;
+  unsigned int h;
+
+  dvbcsa_bs_stream_transpose_in(pcks, regs.sb);
 
   for (b = 0; b < 4; b++)
     {
       for (i = 0; i < 8; i++)
 	{
-	  A[i][b] = key->stream[b + i * 4];
-	  B[i][b] = key->stream[b + i * 4 + 32];
+	  regs.A[32 + i][b] = key->stream[b + i * 4];
+	  regs.B[32 + i][b] = key->stream[b + i * 4 + 32];
 	}
 
       // all other regs = 0
-      A[8][b] = BS_VAL8(00);
-      A[9][b] = BS_VAL8(00);
-      B[8][b] = BS_VAL8(00);
-      B[9][b] = BS_VAL8(00);
-
-      X[b] = BS_VAL8(00);
-      Y[b] = BS_VAL8(00);
-      Z[b] = BS_VAL8(00);
-      D[b] = BS_VAL8(00);
-      E[b] = BS_VAL8(00);
-      F[b] = BS_VAL8(00);
+      regs.A[32 + 8][b] = BS_VAL8(00);
+      regs.A[32 + 9][b] = BS_VAL8(00);
+      regs.B[32 + 8][b] = BS_VAL8(00);
+      regs.B[32 + 9][b] = BS_VAL8(00);
+
+      regs.X[b] = BS_VAL8(00);
+      regs.Y[b] = BS_VAL8(00);
+      regs.Z[b] = BS_VAL8(00);
+      regs.D[b] = BS_VAL8(00);
+      regs.E[b] = BS_VAL8(00);
+      regs.F[b] = BS_VAL8(00);
     }
 
-  p = BS_VAL8(00);
-  q = BS_VAL8(00);
-  r = BS_VAL8(00);
+  regs.p = BS_VAL8(00);
+  regs.q = BS_VAL8(00);
+  regs.r = BS_VAL8(00);
 
   /* Stream INIT */
 
-  for (i = 0; i < 8; i++)
-    {
-
-      for (b = 0; b < 4; b++)
-	{
-	  in1[b] = sb[8 * i + 4 + b];
-	  in2[b] = sb[8 * i + b];
-	}
-
-      for (j = 0; j < 4; j++)
-	{
-	  dvbcsa_bs_stream_sbox1(A[0][2], A[5][1], A[6][3], A[8][0], A[3][0], &s1a, &s1b);
-	  dvbcsa_bs_stream_sbox2(A[2][2], A[5][3], A[6][0], A[8][1], A[1][1], &s2a, &s2b);
-	  dvbcsa_bs_stream_sbox3(A[1][0], A[4][1], A[4][3], A[5][2], A[0][3], &s3a, &s3b);
-	  dvbcsa_bs_stream_sbox4(A[0][1], A[1][3], A[3][2], A[7][0], A[2][3], &s4a, &s4b);
-	  dvbcsa_bs_stream_sbox5(A[3][3], A[5][0], A[7][1], A[8][2], A[4][2], &s5a, &s5b);
-	  dvbcsa_bs_stream_sbox6(A[3][1], A[4][0], A[6][2], A[8][3], A[2][1], &s6a, &s6b);
-	  dvbcsa_bs_stream_sbox7(A[2][0], A[6][1], A[7][2], A[7][3], A[1][2], &s7a, &s7b);
-
-	  extra_B[3] = BS_XOR (BS_XOR (BS_XOR (B[2][0], B[5][1]), B[6][2]), B[8][3]);
-	  extra_B[2] = BS_XOR (BS_XOR (BS_XOR (B[5][0], B[7][1]), B[2][3]), B[3][2]);
-	  extra_B[1] = BS_XOR (BS_XOR (BS_XOR (B[4][3], B[7][2]), B[3][0]), B[4][1]);
-	  extra_B[0] = BS_XOR (BS_XOR (BS_XOR (B[8][2], B[5][3]), B[2][1]), B[7][0]);
-
-	  for (b = 0; b < 4; b++)
-	    {
-	      dvbcsa_bs_word_t	A_next;
-
-	      A_next = BS_XOR (A[9][b], X[b]);
-	      A_next = BS_XOR (BS_XOR (A_next, D[b]), ((j % 2) ? in2[b] : in1[b]));
+  dvbcsa_bs_stream_cipher_kernel_init(&regs);
 
-	      for (k = 9; k > 0; k--)
-		A[k][b] = A[k - 1][b];
-
-	      A[0][b] = A_next;
-	    }
-
-	  dvbcsa_bs_word_t	B_next[4];
-
-	  for (b = 0; b < 4; b++)
-	    {
-	      B_next[b] = BS_XOR (BS_XOR (B[6][b], B[9][b]), Y[b]);
-	      B_next[b] = BS_XOR (B_next[b], ((j % 2) ? in1[b] : in2[b]));
-	    }
-
-	  tmp3 = B_next[3];
-	  B_next[3] = BS_XOR (B_next[3], BS_AND (BS_XOR (B_next[3], B_next[2]), p));
-	  B_next[2] = BS_XOR (B_next[2], BS_AND (BS_XOR (B_next[2], B_next[1]), p));
-	  B_next[1] = BS_XOR (B_next[1], BS_AND (BS_XOR (B_next[1], B_next[0]), p));
-	  B_next[0] = BS_XOR (B_next[0], BS_AND (BS_XOR (B_next[0], tmp3), p));
-
-	  for (b = 0; b < 4; b++)
-	    {
-	      for (k = 9; k > 0; k--)
-		B[k][b] = B[k - 1][b];
-
-	      B[0][b] = B_next[b];
-	    }
-
-	  for (b = 0; b < 4; b++)
-	    D[b] = BS_XOR (BS_XOR (E[b], Z[b]), extra_B[b]);
-
-	  for (b = 0; b < 4; b++)
-	    next_E[b] = F[b];
-
-	  tmp0 = BS_XOR (Z[0], E[0]);
-	  tmp1 = BS_AND (Z[0], E[0]);
-	  F[0] = BS_XOR (E[0], BS_AND (q, BS_XOR (Z[0], r)));
-	  tmp3 = BS_AND (tmp0, r);
-	  tmp4 = BS_OR (tmp1, tmp3);
-
-	  tmp0 = BS_XOR (Z[1], E[1]);
-	  tmp1 = BS_AND (Z[1], E[1]);
-	  F[1] = BS_XOR (E[1], BS_AND (q, BS_XOR (Z[1], tmp4)));
-	  tmp3 = BS_AND (tmp0, tmp4);
-	  tmp4 = BS_OR (tmp1, tmp3);
-
-	  tmp0 = BS_XOR (Z[2], E[2]);
-	  tmp1 = BS_AND (Z[2], E[2]);
-	  F[2] = BS_XOR (E[2], BS_AND (q, BS_XOR (Z[2], tmp4)));
-	  tmp3 = BS_AND (tmp0, tmp4);
-	  tmp4 = BS_OR (tmp1, tmp3);
-
-	  tmp0 = BS_XOR (Z[3], E[3]);
-	  tmp1 = BS_AND (Z[3], E[3]);
-	  F[3] = BS_XOR (E[3], BS_AND (q, BS_XOR (Z[3], tmp4)));
-	  tmp3 = BS_AND (tmp0, tmp4);
-	  r = BS_XOR (r, BS_AND (q, BS_XOR (BS_OR (tmp1, tmp3), r)));	// ultimate carry
-
-	  for (b = 0; b < 4; b++)
-	    E[b] = next_E[b];
-
-	  X[0] = s1a;
-	  X[1] = s2a;
-	  X[2] = s3b;
-	  X[3] = s4b;
-	  Y[0] = s3a;
-	  Y[1] = s4a;
-	  Y[2] = s5b;
-	  Y[3] = s6b;
-	  Z[0] = s5a;
-	  Z[1] = s6a;
-	  Z[2] = s1b;
-	  Z[3] = s2b;
-	  p = s7a;
-	  q = s7b;
-
-	}
-
-    }
 
   /* Stream GEN */
 
-  for (h = 8; h < maxlen; h++)
+  for (h = 8; h < maxlen; h += 8)
     {
-      dvbcsa_bs_word_t cb[8];
-
-      for (j = 0; j < 4; j++)
-	{
-	  dvbcsa_bs_stream_sbox1(A[0][2], A[5][1], A[6][3], A[8][0], A[3][0], &s1a, &s1b);
-	  dvbcsa_bs_stream_sbox2(A[2][2], A[5][3], A[6][0], A[8][1], A[1][1], &s2a, &s2b);
-	  dvbcsa_bs_stream_sbox3(A[1][0], A[4][1], A[4][3], A[5][2], A[0][3], &s3a, &s3b);
-	  dvbcsa_bs_stream_sbox4(A[0][1], A[1][3], A[3][2], A[7][0], A[2][3], &s4a, &s4b);
-	  dvbcsa_bs_stream_sbox5(A[3][3], A[5][0], A[7][1], A[8][2], A[4][2], &s5a, &s5b);
-	  dvbcsa_bs_stream_sbox6(A[3][1], A[4][0], A[6][2], A[8][3], A[2][1], &s6a, &s6b);
-	  dvbcsa_bs_stream_sbox7(A[2][0], A[6][1], A[7][2], A[7][3], A[1][2], &s7a, &s7b);
-
-	  // use 4x4 xor to produce extra nibble for T3
-
-	  extra_B[3] = BS_XOR (BS_XOR (BS_XOR (B[2][0], B[5][1]), B[6][2]), B[8][3]);
-	  extra_B[2] = BS_XOR (BS_XOR (BS_XOR (B[5][0], B[7][1]), B[2][3]), B[3][2]);
-	  extra_B[1] = BS_XOR (BS_XOR (BS_XOR (B[4][3], B[7][2]), B[3][0]), B[4][1]);
-	  extra_B[0] = BS_XOR (BS_XOR (BS_XOR (B[8][2], B[5][3]), B[2][1]), B[7][0]);
-
-	  // T1 = xor all inputs
-	  // in1, in2, D are only used in T1 during initialisation, not generation
-	  for (b = 0; b < 4; b++)
-	    {
-	      dvbcsa_bs_word_t	A_next;
-
-	      A_next = BS_XOR (A[9][b], X[b]);
-
-	      for (k = 9; k > 0; k--)
-		A[k][b] = A[k - 1][b];
-
-	      A[0][b] = A_next;
-	    }
-
-	  dvbcsa_bs_word_t	B_next[4];
-
-	  // T2 =  xor all inputs
-	  // in1, in2 are only used in T1 during initialisation, not generation
-	  // if p=0, use this, if p=1, rotate the result left
-	  for (b = 0; b < 4; b++)
-	    B_next[b] = BS_XOR (BS_XOR (B[6][b], B[9][b]), Y[b]);
-
-	  // if p=1, rotate left (yes, this is what we're doing)
-	  tmp3 = B_next[3];
-	  B_next[3] = BS_XOR (B_next[3], BS_AND (BS_XOR (B_next[3], B_next[2]), p));
-	  B_next[2] = BS_XOR (B_next[2], BS_AND (BS_XOR (B_next[2], B_next[1]), p));
-	  B_next[1] = BS_XOR (B_next[1], BS_AND (BS_XOR (B_next[1], B_next[0]), p));
-	  B_next[0] = BS_XOR (B_next[0], BS_AND (BS_XOR (B_next[0], tmp3), p));
-
-	  for (b = 0; b < 4; b++)
-	    {
-	      for (k = 9; k > 0; k--)
-		B[k][b] = B[k - 1][b];
-
-	      B[0][b] = B_next[b];
-	    }
-
-	  // T3 = xor all inputs
-	  for (b = 0; b < 4; b++)
-	    D[b] = BS_XOR (BS_XOR (E[b], Z[b]), extra_B[b]);
-
-	  // T4 = sum, carry of Z + E + r
-	  for (b = 0; b < 4; b++)
-	    next_E[b] = F[b];
-
-	  tmp0 = BS_XOR (Z[0], E[0]);
-	  tmp1 = BS_AND (Z[0], E[0]);
-	  F[0] = BS_XOR (E[0], BS_AND (q, BS_XOR (Z[0], r)));
-	  tmp3 = BS_AND (tmp0, r);
-	  tmp4 = BS_OR (tmp1, tmp3);
-
-	  tmp0 = BS_XOR (Z[1], E[1]);
-	  tmp1 = BS_AND (Z[1], E[1]);
-	  F[1] = BS_XOR (E[1], BS_AND (q, BS_XOR (Z[1], tmp4)));
-	  tmp3 = BS_AND (tmp0, tmp4);
-	  tmp4 = BS_OR (tmp1, tmp3);
-
-	  tmp0 = BS_XOR (Z[2], E[2]);
-	  tmp1 = BS_AND (Z[2], E[2]);
-	  F[2] = BS_XOR (E[2], BS_AND (q, BS_XOR (Z[2], tmp4)));
-	  tmp3 = BS_AND (tmp0, tmp4);
-	  tmp4 = BS_OR (tmp1, tmp3);
-
-	  tmp0 = BS_XOR (Z[3], E[3]);
-	  tmp1 = BS_AND (Z[3], E[3]);
-	  F[3] = BS_XOR (E[3], BS_AND (q, BS_XOR (Z[3], tmp4)));
-	  tmp3 = BS_AND (tmp0, tmp4);
-	  r = BS_XOR (r, BS_AND (q, BS_XOR (BS_OR (tmp1, tmp3), r)));	// ultimate carry
-
-	  for (b = 0; b < 4; b++)
-	    E[b] = next_E[b];
-
-	  X[0] = s1a;
-	  X[1] = s2a;
-	  X[2] = s3b;
-	  X[3] = s4b;
-	  Y[0] = s3a;
-	  Y[1] = s4a;
-	  Y[2] = s5b;
-	  Y[3] = s6b;
-	  Z[0] = s5a;
-	  Z[1] = s6a;
-	  Z[2] = s1b;
-	  Z[3] = s2b;
-
-	  p = s7a;
-	  q = s7b;
-
-	  // require 4 loops per output byte
-	  // 2 output bits are a function of the 4 bits of D
-	  // xor 2 by 2
-	  cb[7 - 2 * j] = BS_XOR (D[2], D[3]);
-	  cb[6 - 2 * j] = BS_XOR (D[0], D[1]);
-	}				// EXTERNAL LOOP
-
-      ////////////////////////////////////////////////////////////////////////////////
-
-      dvbcsa_bs_stream_transpose_out(pcks, h, cb);
+      dvbcsa_bs_stream_cipher_kernel(&regs);
+      for (i = 0; i < 8; i++)
+          dvbcsa_bs_stream_transpose_out(pcks, h + i, regs.cb + i * 8);
 
     }
 
diff --git a/src/dvbcsa_bs_stream_kernel.h b/src/dvbcsa_bs_stream_kernel.h
new file mode 100644
index 0000000..b582028
--- /dev/null
+++ b/src/dvbcsa_bs_stream_kernel.h
@@ -0,0 +1,23 @@
+#ifndef DVBCSA_BS_STREAM_KERNEL_H_
+#define DVBCSA_BS_STREAM_KERNEL_H_
+
+#include "dvbcsa_bs.h"
+
+struct dvbcsa_bs_stream_regs_s {  /* register state handed to the stream cipher kernels */
+  dvbcsa_bs_word_t A[32 + 10][4];  /* rows 32..41: 10-row state on kernel entry; rows 0..31: filled one per round */
+  dvbcsa_bs_word_t B[32 + 10][4];  /* same layout as A */
+  dvbcsa_bs_word_t X[4];  /* s1a, s2a, s3b, s4b of the previous round; xored into the new A row */
+  dvbcsa_bs_word_t Y[4];  /* s3a, s4a, s5b, s6b of the previous round; xored into the new B row */
+  dvbcsa_bs_word_t Z[4];  /* s5a, s6a, s1b, s2b of the previous round; used for D, F and r */
+  dvbcsa_bs_word_t D[4];  /* E ^ Z ^ extra_B; source of the two words stored in cb each round */
+  dvbcsa_bs_word_t E[4];  /* takes the previous round's F */
+  dvbcsa_bs_word_t F[4];  /* recomputed each round from E, Z, q and the carry chain starting at r */
+  dvbcsa_bs_word_t sb[64];  /* filled by dvbcsa_bs_stream_transpose_in(); read only by the init kernel */
+  dvbcsa_bs_word_t cb[64];  /* kernel output: 8 words per i, passed on as cb + i * 8 */
+  dvbcsa_bs_word_t p;  /* s7a of the previous round; gates the rotation of the new B row */
+  dvbcsa_bs_word_t q;  /* s7b of the previous round; gates the F and r updates */
+  dvbcsa_bs_word_t r;  /* carry kept between rounds */
+};
+
+#endif
+
diff --git a/src/dvbcsa_bs_stream_kernel.inc b/src/dvbcsa_bs_stream_kernel.inc
new file mode 100644
index 0000000..fcb0232
--- /dev/null
+++ b/src/dvbcsa_bs_stream_kernel.inc
@@ -0,0 +1,259 @@
+#ifdef DVBCSA_BS_STREAM_KERNEL_INIT
+
+static void DVBCSA_INLINE inline
+dvbcsa_bs_stream_sbox1(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
+                       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
+                       dvbcsa_bs_word_t fe,
+                       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
+{
+  dvbcsa_bs_word_t tmp0, tmp1, tmp2, tmp3;
+  /* S-box 1 as word-wide logic: tmp0..tmp3 depend on fa..fd only, fe is applied last. */
+  tmp0 = BS_XOR (fa, BS_XOR (fb, BS_NOT (BS_OR (BS_XOR (BS_OR (fa, fb), fc), BS_XOR (fc, fd)))));
+  tmp1 = BS_XOR (BS_OR (fa, fb), BS_NOT (BS_AND (fc, BS_OR (fa, BS_XOR (fb, fd)))));
+  tmp2 = BS_XOR (fa, BS_XOR (BS_AND (fb, fd), BS_OR (BS_AND (fa, fd), fc)));
+  tmp3 = BS_XOR (BS_AND (fa, fc), BS_XOR (fa, BS_OR (BS_AND (fa, fb), fd)));
+  /* Outputs: *sa = tmp0 ^ (fe & tmp1), *sb = tmp2 ^ (fe & tmp3) (reading the BS_* macros by name). */
+  *sa = BS_XOR (tmp0, BS_AND (fe, tmp1));
+  *sb = BS_XOR (tmp2, BS_AND (fe, tmp3));
+}
+
+static void DVBCSA_INLINE inline
+dvbcsa_bs_stream_sbox2(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
+                       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
+                       dvbcsa_bs_word_t fe,
+                       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
+{
+  dvbcsa_bs_word_t tmp0, tmp1, tmp2, tmp3;
+  /* S-box 2 as word-wide logic: tmp0..tmp3 depend on fa..fd only, fe is applied last. */
+  tmp0 = BS_XOR (fa, BS_XOR (BS_AND (fb, BS_OR (fc, fd)), BS_XOR (fc, BS_NOT (fd))));
+  tmp1 = BS_OR (BS_AND (fa, BS_XOR (fb, fd)), BS_AND (BS_OR (fa, fb), fc));
+  tmp2 = BS_XOR (BS_AND (fb, fd), BS_OR (BS_AND (fa, fd), BS_XOR (fb, BS_NOT (fc))));
+  tmp3 = BS_OR (BS_AND (fa, fd), BS_XOR (fa, BS_XOR (fb, BS_AND (fc, fd))));
+  /* Outputs: *sa = tmp0 ^ (fe & tmp1), *sb = tmp2 ^ (fe & tmp3) (reading the BS_* macros by name). */
+  *sa = BS_XOR (tmp0, BS_AND (fe, tmp1));
+  *sb = BS_XOR (tmp2, BS_AND (fe, tmp3));
+}
+
+static void DVBCSA_INLINE inline
+dvbcsa_bs_stream_sbox3(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
+                       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
+                       dvbcsa_bs_word_t fe,
+                       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
+{
+  dvbcsa_bs_word_t tmp0, tmp1, tmp2;
+  /* S-box 3 as word-wide logic: tmp0..tmp2 depend on fa..fd only, fe is applied last. */
+  tmp0 = BS_XOR (fa, BS_XOR (fb, BS_XOR (BS_AND (fc, BS_OR (fa, fd)), fd)));
+  tmp1 = BS_XOR (BS_AND (fa, fc), BS_OR (BS_XOR (fa, fd), BS_XOR (BS_OR (fb, fc), BS_NOT (fd))));
+  tmp2 = BS_XOR (fa, BS_XOR (BS_AND (BS_XOR (fb, fc), fd), fc));
+  /* Outputs: *sa = tmp0 ^ (~fe & tmp1), *sb = tmp2 ^ fe (reading the BS_* macros by name). */
+  *sa = BS_XOR (tmp0, BS_AND (BS_NOT (fe), tmp1));
+  *sb = BS_XOR (tmp2, fe);
+}
+
+static void DVBCSA_INLINE inline
+dvbcsa_bs_stream_sbox4(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
+                       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
+                       dvbcsa_bs_word_t fe,
+                       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
+{
+  dvbcsa_bs_word_t tmp0, tmp1, tmp2;
+  /* S-box 4 as word-wide logic: tmp0..tmp2 depend on fa..fd only, fe is applied last. */
+  tmp0 = BS_XOR (fa, BS_OR (BS_AND (fc, BS_XOR (fa, fd)), BS_XOR (fb, BS_OR (fc, BS_NOT (fd)))));
+  tmp1 = BS_XOR (BS_AND (fa, fb), BS_XOR (fb, BS_XOR (BS_AND (BS_OR (fa, fc), fd), fc)));
+  tmp2 = BS_XOR (fa, BS_OR (BS_AND (fb, fc), BS_XOR (BS_OR (BS_AND (fa, BS_XOR (fb, fd)), fc), fd)));
+  /* *sa = tmp0 ^ (fe & (tmp1 ^ tmp0)); *sb reads the *sa just stored, so the order of these two lines matters. */
+  *sa = BS_XOR (tmp0, BS_AND (fe, BS_XOR (tmp1, tmp0)));
+  *sb = BS_XOR (BS_XOR (*sa, tmp2), fe);
+}
+
+static void DVBCSA_INLINE inline
+dvbcsa_bs_stream_sbox5(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
+                       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
+                       dvbcsa_bs_word_t fe,
+                       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
+{
+  dvbcsa_bs_word_t tmp0, tmp1, tmp2, tmp3;
+  /* S-box 5 as word-wide logic: tmp0..tmp3 depend on fa..fd only, fe is applied last. */
+  tmp0 = BS_OR (BS_XOR (BS_AND (fa, BS_OR (fb, fc)), fb), BS_XOR (BS_OR (BS_XOR (fa, fc), fd), BS_VAL8(ff)));
+  tmp1 = BS_XOR (fb, BS_AND (BS_XOR (fc, fd), BS_XOR (fc, BS_OR (fb, BS_XOR (fa, fd)))));
+  tmp2 = BS_XOR (BS_AND (fa, fc), BS_XOR (fb, BS_AND (BS_OR (fb, BS_XOR (fa, fc)), fd)));
+  tmp3 = BS_OR (BS_AND (BS_XOR (fa, fb), BS_XOR (fc, BS_VAL8(ff))), fd);
+  /* Outputs: *sa = tmp0 ^ (fe & tmp1), *sb = tmp2 ^ (fe & tmp3) (reading the BS_* macros by name). */
+  *sa = BS_XOR (tmp0, BS_AND (fe, tmp1));
+  *sb = BS_XOR (tmp2, BS_AND (fe, tmp3));
+}
+
+static void DVBCSA_INLINE inline
+dvbcsa_bs_stream_sbox6(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
+                       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
+                       dvbcsa_bs_word_t fe,
+                       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
+{
+  dvbcsa_bs_word_t tmp0, tmp1, tmp2, tmp3;
+  /* S-box 6 as word-wide logic: tmp0..tmp3 depend on fa..fd only, fe is applied last. */
+  tmp0 = BS_XOR (BS_AND (BS_AND (fa, fc), fd), BS_XOR (BS_AND (fb, BS_OR (fa, fd)), fc));
+  tmp1 = BS_NOT (BS_AND (BS_XOR (fa, fc), fd));
+  tmp2 = BS_XOR (BS_AND (fa, BS_OR (fb, fc)), BS_XOR (fb, BS_OR (BS_AND (fb, fc), fd)));
+  tmp3 = BS_AND (fc, BS_XOR (BS_AND (fa, BS_XOR (fb, fd)), BS_OR (fb, fd)));
+  /* Outputs: *sa = tmp0 ^ (fe & tmp1), *sb = tmp2 ^ (fe & tmp3) (reading the BS_* macros by name). */
+  *sa = BS_XOR (tmp0, BS_AND (fe, tmp1));
+  *sb = BS_XOR (tmp2, BS_AND (fe, tmp3));
+}
+
+static void DVBCSA_INLINE inline
+dvbcsa_bs_stream_sbox7(dvbcsa_bs_word_t fa, dvbcsa_bs_word_t fb,
+                       dvbcsa_bs_word_t fc, dvbcsa_bs_word_t fd,
+                       dvbcsa_bs_word_t fe,
+                       dvbcsa_bs_word_t *sa, dvbcsa_bs_word_t *sb)
+{
+  dvbcsa_bs_word_t tmp0, tmp1, tmp2, tmp3;
+  /* S-box 7 as word-wide logic: tmp0..tmp3 depend on fa..fd only, fe is applied last. */
+  tmp0 = BS_XOR (fb, BS_OR (BS_AND (fc, fd), BS_XOR (fa, BS_XOR (fc, fd))));
+  tmp1 = BS_AND (BS_OR (fb, fd), BS_OR (BS_AND (fa, fc), BS_XOR (fb, BS_XOR (fc, fd))));
+  tmp2 = BS_XOR (BS_OR (fa, fb), BS_XOR (BS_AND (fc, BS_OR (fb, fd)), fd));
+  tmp3 = BS_OR (fd, BS_XOR (BS_AND (fa, fc), BS_VAL8(ff)));
+  /* Outputs: *sa = tmp0 ^ (fe & tmp1), *sb = tmp2 ^ (fe & tmp3) (reading the BS_* macros by name). */
+  *sa = BS_XOR (tmp0, BS_AND (fe, tmp1));
+  *sb = BS_XOR (tmp2, BS_AND (fe, tmp3));
+}
+
+static void
+dvbcsa_bs_stream_cipher_kernel_init(struct dvbcsa_bs_stream_regs_s *regs)
+
+#else
+
+static void
+dvbcsa_bs_stream_cipher_kernel(struct dvbcsa_bs_stream_regs_s *regs)
+
+#endif
+/* Shared body: runs 8 x 4 rounds on *regs, storing two words into regs->cb per round; the INIT build also feeds regs->sb and regs->D into A and B. */
+{
+  dvbcsa_bs_word_t extra_B[4];
+  dvbcsa_bs_word_t s1a, s1b, s2a, s2b, s3a, s3b, s4a, s4b, s5a, s5b, s6a, s6b, s7a, s7b;
+  dvbcsa_bs_word_t next_E[4];
+  dvbcsa_bs_word_t tmp0, tmp1, tmp3, tmp4;
+  dvbcsa_bs_word_t (*A)[4], (*B)[4];
+  int i, j, b;
+  /* The 10-row state starts at rows 32..41; each round writes one new row just below the base pointer. */
+  A = regs->A + 32;
+  B = regs->B + 32;
+
+  for (i = 0; i < 8; i++)
+    {
+      for (j = 0; j < 4; j++)
+        {
+          dvbcsa_bs_stream_sbox1(A[0][2], A[5][1], A[6][3], A[8][0], A[3][0], &s1a, &s1b);
+          dvbcsa_bs_stream_sbox2(A[2][2], A[5][3], A[6][0], A[8][1], A[1][1], &s2a, &s2b);
+          dvbcsa_bs_stream_sbox3(A[1][0], A[4][1], A[4][3], A[5][2], A[0][3], &s3a, &s3b);
+          dvbcsa_bs_stream_sbox4(A[0][1], A[1][3], A[3][2], A[7][0], A[2][3], &s4a, &s4b);
+          dvbcsa_bs_stream_sbox5(A[3][3], A[5][0], A[7][1], A[8][2], A[4][2], &s5a, &s5b);
+          dvbcsa_bs_stream_sbox6(A[3][1], A[4][0], A[6][2], A[8][3], A[2][1], &s6a, &s6b);
+          dvbcsa_bs_stream_sbox7(A[2][0], A[6][1], A[7][2], A[7][3], A[1][2], &s7a, &s7b);
+
+          // use 4x4 xor to produce extra nibble for T3
+
+          extra_B[3] = BS_XOR (BS_XOR (BS_XOR (B[2][0], B[5][1]), B[6][2]), B[8][3]);
+          extra_B[2] = BS_XOR (BS_XOR (BS_XOR (B[5][0], B[7][1]), B[2][3]), B[3][2]);
+          extra_B[1] = BS_XOR (BS_XOR (BS_XOR (B[4][3], B[7][2]), B[3][0]), B[4][1]);
+          extra_B[0] = BS_XOR (BS_XOR (BS_XOR (B[8][2], B[5][3]), B[2][1]), B[7][0]);
+
+          // T1 = xor all inputs
+          // in1, in2, D are only used in T1 during initialisation, not generation
+          for (b = 0; b < 4; b++)
+            {
+              A[-1][b] = BS_XOR (A[9][b], regs->X[b]);
+#ifdef DVBCSA_BS_STREAM_KERNEL_INIT
+              // in1 is sb[8 * i + 4 + b], in2 is sb[8 * i + b]: even j feeds in1 into A, odd j feeds in2
+              A[-1][b] = BS_XOR (BS_XOR (A[-1][b], regs->D[b]), ((j % 2) ? regs->sb[8 * i + b] : regs->sb[8 * i + 4 + b]));
+#endif
+            }
+
+          // T2 =  xor all inputs
+          // in1, in2 are only used in T2 during initialisation, not generation
+          // if p=0, use this, if p=1, rotate the result left
+          for (b = 0; b < 4; b++)
+            {
+              B[-1][b] = BS_XOR (BS_XOR (B[6][b], B[9][b]), regs->Y[b]);
+#ifdef DVBCSA_BS_STREAM_KERNEL_INIT
+              // B takes the other word: even j feeds in2 (sb[8 * i + b]), odd j feeds in1 (sb[8 * i + 4 + b])
+              B[-1][b] = BS_XOR (B[-1][b], ((j % 2) ? regs->sb[8 * i + 4 + b]: regs->sb[8 * i + b]));
+#endif
+            }
+
+          // if p=1, rotate left (yes, this is what we're doing)
+          tmp3 = B[-1][3];
+          B[-1][3] = BS_XOR (B[-1][3], BS_AND (BS_XOR (B[-1][3], B[-1][2]), regs->p));
+          B[-1][2] = BS_XOR (B[-1][2], BS_AND (BS_XOR (B[-1][2], B[-1][1]), regs->p));
+          B[-1][1] = BS_XOR (B[-1][1], BS_AND (BS_XOR (B[-1][1], B[-1][0]), regs->p));
+          B[-1][0] = BS_XOR (B[-1][0], BS_AND (BS_XOR (B[-1][0], tmp3), regs->p));
+
+          // T3 = xor all inputs
+          for (b = 0; b < 4; b++)
+            regs->D[b] = BS_XOR (BS_XOR (regs->E[b], regs->Z[b]), extra_B[b]);
+
+          // T4 = sum, carry of Z + E + r
+          for (b = 0; b < 4; b++)
+            next_E[b] = regs->F[b];
+
+          tmp0 = BS_XOR (regs->Z[0], regs->E[0]);
+          tmp1 = BS_AND (regs->Z[0], regs->E[0]);
+          regs->F[0] = BS_XOR (regs->E[0], BS_AND (regs->q, BS_XOR (regs->Z[0], regs->r)));
+          tmp3 = BS_AND (tmp0, regs->r);
+          tmp4 = BS_OR (tmp1, tmp3);
+
+          tmp0 = BS_XOR (regs->Z[1], regs->E[1]);
+          tmp1 = BS_AND (regs->Z[1], regs->E[1]);
+          regs->F[1] = BS_XOR (regs->E[1], BS_AND (regs->q, BS_XOR (regs->Z[1], tmp4)));
+          tmp3 = BS_AND (tmp0, tmp4);
+          tmp4 = BS_OR (tmp1, tmp3);
+
+          tmp0 = BS_XOR (regs->Z[2], regs->E[2]);
+          tmp1 = BS_AND (regs->Z[2], regs->E[2]);
+          regs->F[2] = BS_XOR (regs->E[2], BS_AND (regs->q, BS_XOR (regs->Z[2], tmp4)));
+          tmp3 = BS_AND (tmp0, tmp4);
+          tmp4 = BS_OR (tmp1, tmp3);
+
+          tmp0 = BS_XOR (regs->Z[3], regs->E[3]);
+          tmp1 = BS_AND (regs->Z[3], regs->E[3]);
+          regs->F[3] = BS_XOR (regs->E[3], BS_AND (regs->q, BS_XOR (regs->Z[3], tmp4)));
+          tmp3 = BS_AND (tmp0, tmp4);
+          regs->r = BS_XOR (regs->r, BS_AND (regs->q, BS_XOR (BS_OR (tmp1, tmp3), regs->r)));   // ultimate carry
+
+          for (b = 0; b < 4; b++)
+            regs->E[b] = next_E[b];
+
+          A--;  // virtual shift: the row just written at A[-1] becomes A[0]
+          B--;  // same for B; no rows are copied inside the round
+
+          regs->X[0] = s1a;
+          regs->X[1] = s2a;
+          regs->X[2] = s3b;
+          regs->X[3] = s4b;
+          regs->Y[0] = s3a;
+          regs->Y[1] = s4a;
+          regs->Y[2] = s5b;
+          regs->Y[3] = s6b;
+          regs->Z[0] = s5a;
+          regs->Z[1] = s6a;
+          regs->Z[2] = s1b;
+          regs->Z[3] = s2b;
+
+          regs->p = s7a;
+          regs->q = s7b;
+
+          // require 4 loops per output byte
+          // 2 output bits are a function of the 4 bits of D
+          // xor 2 by 2
+          regs->cb[i * 8 + 7 - 2 * j] = BS_XOR (regs->D[2], regs->D[3]);
+          regs->cb[i * 8 + 6 - 2 * j] = BS_XOR (regs->D[0], regs->D[1]);
+        }  // INTERNAL LOOP
+    }   // EXTERNAL LOOP
+    /* After 32 rounds the live state sits in rows 0..9; copy it back to rows 32..41 so the next call starts from the top again. */
+    for (i = 0; i < 10; i++)
+        for (b = 0; b < 4; b++)
+            regs->A[32 + i][b] = regs->A[i][b];
+    for (i = 0; i < 10; i++)
+        for (b = 0; b < 4; b++)
+            regs->B[32 + i][b] = regs->B[i][b];
+}
+
-- 
1.9.1




More information about the vlc-devel mailing list