lilcrypto/impl_poly1305.c

247 lines
8.2 KiB
C
Raw Normal View History

2024-05-31 12:59:58 +02:00
/*
* Copyright (c) 2024 Lucas Gabriel Vuotto <lucas@lgv5.net>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include "internal.h"
#include "util.h"
/*
* Poly1305 implementation.
*
* Poly1305 originally designed by Daniel J. Bernstein, "The Poly1305-AES
* message-authentication code", https://cr.yp.to/mac/poly1305-20050329.pdf .
*
* This implementation is written from scratch, but consulting poly1305-donna
* by Andrew Moon, https://github.com/floodyberry/poly1305-donna, released
* under MIT license. Similarities are to be expected.
*/
/*
* Copyright 2011-2016 Andrew Moon <liquidsun@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/*
* To ease reduction modulo p = 2^130 - 5, work in base 2^130, as 2^130 = 5 mod
* p, allowing for easier operations. 2^130 splits evenly into 5 limbs of 26
* bits.
*
* Addition is performed limb-wise:
*
* h = h4 h3 h2 h1 h0
* c = c4 c3 c2 c1 c0
* -----------------------------------
* h+c = h4+c4 h3+c3 h2+c2 h1+c1 h0+c0
*
* Carry won't be propagated at this step.
*
* Considering h = h + c, multiplication is performed as school multiplication
* / long multiplication:
*
* h = h4 h3 h2 h1 h0
* r = r4 r3 r2 r1 r0
* -----------------------------------------------------------
* h4*r0 h3*r0 h2*r0 h1*r0 h0*r0
* h4*r1 h3*r1 h2*r1 h1*r1 h0*r1
* h4*r2 h3*r2 h2*r2 h1*r2 h0*r2
* h4*r3 h3*r3 h2*r3 h1*r3 h0*r3
* h4*r4 h3*r4 h2*r4 h1*r4 h0*r4
*
* Each hn*rn fits in 53 bits. Carry won't be propagated at this step. Partial
* reduction modulo p starts here:
*
* 2^130
* h = | h4 h3 h2 h1 h0
* r = | r4 r3 r2 r1 r0
* ------------------------------|------------------------------
* | h4*r0 h3*r0 h2*r0 h1*r0 h0*r0
* h4*r1 | h3*r1 h2*r1 h1*r1 h0*r1
* h4*r2 h3*r2 | h2*r2 h1*r2 h0*r2
* h4*r3 h3*r3 h2*r3 | h1*r3 h0*r3
* h4*r4 h3*r4 h2*r4 h1*r4 | h0*r4
*
* 2^130
* h = | h4 h3 h2 h1 h0
* r = | r4 r3 r2 r1 r0
* --------|--------------------------------------
* | h4*r0 h3*r0 h2*r0 h1*r0 h0*r0
* | h3*r1 h2*r1 h1*r1 h0*r1 5*h4*r1
* | h2*r2 h1*r2 h0*r2 5*h4*r2 5*h3*r2
* | h1*r3 h0*r3 5*h4*r3 5*h3*r3 5*h2*r3
* | h0*r4 5*h4*r4 5*h3*r4 5*h2*r4 5*h1*r4
* --------|--------------------------------------
* h*r = | t4 t3 t2 t1 t0
*
* All the carry propagations are performed after this step. h0 is set to the
* low 26 bits of t0; h1 thru h4 are set to tn + (tn-1 >> 26) to propagate the
* carry. t4 might overflow so it needs to be backpropagated to h0 and h1. h1
* won't carry into h2: given the highest possible h, c, and r,
*
* h = 0xffffffffffffffffffffffffffffffff
* c = 0x1ffffffffffffffffffffffffffffffff
* r = 0x0ffffffc0ffffffc0ffffffc0fffffff
*
* the limbs and t4 before h0 and h1 second propagation are
*
* h4 = 0x257ffff
* h3 = 0x3a95fff
* h2 = 0x3fea57f
* h1 = 0x3fffa70
* h0 = 0x2000002
* t4 = 0x77fffffa57ffff
*
* which becomes
*
* h4 = 0x257ffff
* h3 = 0x3a95fff
* h2 = 0x3fea57f
* h1 = 0x3fffa95
* h0 = 0x3fffff8
*
* To perform the final reduction modulo p, observe that each hn is bound by
* 2^26, which means that h is bound by 2^130. Define minusp = 2^130 - p = 5.
* - If h < p, minusp + h < 2^130.
* - If h >= p, then h = p + k with k in {0,1,2,3,4}, and minusp + h =
* 2^130 - p + p + k = 2^130 + k >= 2^130, and both minusp + h = k mod 2^130
* and h = k mod p for all possible values of k.
*
* To avoid information leaking via side channels, define g = minusp + h, and
* select g if bit 130 is set, h otherwise. In particular, define a 32-bit
* mask = ~(g >> 130) + 1.
* - If bit 130 of g is 1, mask = ~1 + 1 = 0xffffffff.
* - If bit 130 of g is 0, mask = ~0 + 1 = 0.
* Then perform (h & ~mask) | (g & mask).
*/
void
poly1305_block(struct poly1305_state *state, uint32_t hibit)
2024-05-31 12:59:58 +02:00
{
uint64_t h0, h1, h2, h3, h4, t0, t1, t2, t3, t4;
uint32_t r0, r1, r2, r3, r4, x1, x2, x3, x4;
h0 = state->h0;
h1 = state->h1;
h2 = state->h2;
h3 = state->h3;
h4 = state->h4;
r0 = state->r0;
r1 = state->r1;
r2 = state->r2;
r3 = state->r3;
r4 = state->r4;
x1 = state->x1;
x2 = state->x2;
x3 = state->x3;
x4 = state->x4;
t0 = load32le(&state->b[0]);
t1 = load32le(&state->b[4]);
t2 = load32le(&state->b[8]);
t3 = load32le(&state->b[12]);
2024-05-31 12:59:58 +02:00
t4 = hibit;
h0 += t0 & 0x3ffffff;
h1 += ((t1 << 6) | (t0 >> 26)) & 0x3ffffff;
h2 += ((t2 << 12) | (t1 >> 20)) & 0x3ffffff;
h3 += ((t3 << 18) | (t2 >> 14)) & 0x3ffffff;
h4 += (t4 << 24) | (t3 >> 8);
t0 = h0 * r0 + h4 * x1 + h3 * x2 + h2 * x3 + h1 * x4;
t1 = h1 * r0 + h0 * r1 + h4 * x2 + h3 * x3 + h2 * x4;
t2 = h2 * r0 + h1 * r1 + h0 * r2 + h4 * x3 + h3 * x4;
t3 = h3 * r0 + h2 * r1 + h1 * r2 + h0 * r3 + h4 * x4;
t4 = h4 * r0 + h3 * r1 + h2 * r2 + h1 * r3 + h0 * r4;
h0 = t0 & 0x3ffffff;
t1 += t0 >> 26;
h1 = t1 & 0x3ffffff;
t2 += t1 >> 26;
h2 = t2 & 0x3ffffff;
t3 += t2 >> 26;
h3 = t3 & 0x3ffffff;
t4 += t3 >> 26;
h4 = t4 & 0x3ffffff;
h0 += 5 * (t4 >> 26);
h1 += h0 >> 26;
h0 &= 0x3ffffff;
state->h0 = h0;
state->h1 = h1;
state->h2 = h2;
state->h3 = h3;
state->h4 = h4;
2024-05-31 12:59:58 +02:00
}
void
poly1305_reduce(struct poly1305_state *state,
uint32_t a[POLY1305_TAGLEN_WORDS])
2024-05-31 12:59:58 +02:00
{
uint64_t t0, t1, t2, t3, t4, g0, g1, g2, g3, g4;
uint32_t mask;
t0 = (state->h0 | (state->h1 << 26)) & 0xffffffff;
t1 = ((state->h1 >> 6) | (state->h2 << 20)) & 0xffffffff;
t2 = ((state->h2 >> 12) | (state->h3 << 14)) & 0xffffffff;
t3 = ((state->h3 >> 18) | (state->h4 << 8)) & 0xffffffff;
t4 = state->h4 >> 24;
2024-05-31 12:59:58 +02:00
g0 = t0 + 5;
g1 = t1 + (g0 >> 32);
g2 = t2 + (g1 >> 32);
g3 = t3 + (g2 >> 32);
g4 = t4 + (g3 >> 32);
2024-05-31 12:59:58 +02:00
mask = ~(g4 >> 2) + 1;
2024-05-31 12:59:58 +02:00
/*
* In the case that t4 t3 t2 t1 t0 > 2^130 - 5, g0 thru g3 will
* overflow 32 bits. Given that mask is 32-bits wide, AND-ing it here
* will perform the required clamping.
*/
2024-05-31 12:59:58 +02:00
t0 = (t0 & ~mask) | (g0 & mask);
t1 = (t1 & ~mask) | (g1 & mask);
t2 = (t2 & ~mask) | (g2 & mask);
t3 = (t3 & ~mask) | (g3 & mask);
t0 += state->s0;
t1 += state->s1 + (t0 >> 32);
t2 += state->s2 + (t1 >> 32);
t3 += state->s3 + (t2 >> 32);
2024-05-31 12:59:58 +02:00
a[0] = t0 & 0xffffffff;
a[1] = t1 & 0xffffffff;
a[2] = t2 & 0xffffffff;
a[3] = t3 & 0xffffffff;
}