1
0
mirror of https://git.zx2c4.com/wireguard-nt synced 2024-09-22 06:11:35 +00:00
wireguard-nt/driver/crypto.c
Jason A. Donenfeld f970d33898 driver: ioctl: do not return zero psks
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
2021-08-10 00:27:36 +02:00

2889 lines
85 KiB
C

/* SPDX-License-Identifier: GPL-2.0
*
* Copyright (C) 2015-2021 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*/
#include "crypto.h"
#include "arithmetic.h"
#include "memory.h"
#pragma warning(disable : 4244) /* '=': conversion from 'UINT32' to 'UINT8', possible loss of data */
#pragma warning(disable : 4267) /* '=': conversion from 'SIZE_T' to 'ULONG', possible loss of data */
#pragma warning(disable : 4242) /* '=': conversion from 'SIZE_T' to 'UINT32', possible loss of data */
#pragma warning(disable : 6385) /* Reading invalid data from '<COMPLEX_EXPR>': the readable size is '_Old_5`32' \
bytes, but '56' bytes may be read. */
#pragma warning(disable : 26451) /* Arithmetic overflow: Using operator '*' on a 4 byte value and then casting the \
result to a 8 byte value. Cast the value to the wider type before calling operator \
'*' to avoid overflow (io.2). */
#ifdef ALLOC_PRAGMA
# pragma alloc_text(INIT, CryptoDriverEntry)
#endif
#if defined(_M_AMD64)
# include <intrin.h>
/* Feature bits this driver may use; assigned in CryptoDriverEntry() and read by SimdGet() and ChaCha20(). */
static CPU_FEATURE CpuFeatures;
/* CPUID feature-flag bit positions, named CPUID_<leaf>_<register>_<feature>_BIT. */
# define CPUID_1_ECX_SSSE3_BIT 9
# define CPUID_1_ECX_SSE3_BIT 0
# define CPUID_1_EDX_SSE2_BIT 26
# define CPUID_1_EDX_SSE_BIT 25
# define CPUID_1_ECX_AVX_BIT 28
# define CPUID_1_ECX_OSXSAVE_BIT 27
# define CPUID_70_EBX_AVX2_BIT 5
# define CPUID_70_EBX_AVX512F_BIT 16
# define CPUID_70_EBX_AVX512IFMA_BIT 21
# define CPUID_70_EBX_AVX512VL_BIT 31
/* Indices into the int[4] register array filled in by __cpuid(). */
# define WORD_EAX 0
# define WORD_EBX 1
# define WORD_ECX 2
# define WORD_EDX 3
/* One CPUID bit that must be set for a given driver feature to be usable. */
typedef struct _CPUID_BIT_INFO
{
/* CPUID leaf to query. */
BYTE Leaf;
/* Which output register holds the bit (one of the WORD_* indices). */
BYTE Word;
/* Bit position inside that register. */
BYTE Bitno;
/* Feature that gets disabled when the bit is absent. */
CPU_FEATURE RequiredBy;
} CPUID_BIT_INFO;
/* CPUID requirements per feature. CPU_FEATURE_SSSE3 needs all four of SSE, SSE2, SSE3 and SSSE3.
 * Entries are grouped by leaf, so CryptoDriverEntry() re-issues CPUID only when the leaf changes.
 */
static CONST CPUID_BIT_INFO CpuidBitInfo[] = {
{ 1, WORD_EDX, CPUID_1_EDX_SSE_BIT, CPU_FEATURE_SSSE3 },
{ 1, WORD_EDX, CPUID_1_EDX_SSE2_BIT, CPU_FEATURE_SSSE3 },
{ 1, WORD_ECX, CPUID_1_ECX_SSE3_BIT, CPU_FEATURE_SSSE3 },
{ 1, WORD_ECX, CPUID_1_ECX_SSSE3_BIT, CPU_FEATURE_SSSE3 },
{ 1, WORD_ECX, CPUID_1_ECX_AVX_BIT, CPU_FEATURE_AVX },
{ 7, WORD_EBX, CPUID_70_EBX_AVX2_BIT, CPU_FEATURE_AVX2 },
{ 7, WORD_EBX, CPUID_70_EBX_AVX512F_BIT, CPU_FEATURE_AVX512F },
{ 7, WORD_EBX, CPUID_70_EBX_AVX512IFMA_BIT, CPU_FEATURE_AVX512IFMA },
{ 7, WORD_EBX, CPUID_70_EBX_AVX512VL_BIT, CPU_FEATURE_AVX512VL },
};
/* Probes the CPU and the OS-enabled extended state, then stores the usable feature set in CpuFeatures.
 * Returns without assigning CpuFeatures when RtlGetVersion() fails or the major version is below 10.
 */
VOID CryptoDriverEntry(VOID)
{
/* It's not like it's exactly hard or complicated to support Windows 7, 8, or 8.1 kernels here,
* but it also means more testing, and given how poorly suited those old network stacks are for
* high speed networking, it's simpler to just fall back to the slow implementations, and concern
* ourselves with Windows 10 (and later, given the recent bout of Start Menu meddling).
*/
RTL_OSVERSIONINFOW OsVersionInfo = { .dwOSVersionInfoSize = sizeof(OsVersionInfo) };
if (!NT_SUCCESS(RtlGetVersion(&OsVersionInfo)) || OsVersionInfo.dwMajorVersion < 10)
return;
/* Work with a "disabled" mask: every bit that is not one of the known features starts out disabled. */
CPU_FEATURE DisabledCpuFeatures =
~(CPU_FEATURE_SSSE3 | CPU_FEATURE_AVX | CPU_FEATURE_AVX2 | CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512VL |
CPU_FEATURE_AVX512IFMA);
int CpuInfo[4], InfoType, MaxInfoType;
BOOLEAN IsIntel, IsSkylakeX, HasOSXSAVE;
/* Leaf 0: highest supported leaf in EAX, vendor string in EBX/EDX/ECX ("GenuineIntel"). */
__cpuid(CpuInfo, InfoType = 0);
MaxInfoType = CpuInfo[WORD_EAX];
IsIntel = CpuInfo[WORD_EBX] == 0x756e6547 && CpuInfo[WORD_EDX] == 0x49656e69 && CpuInfo[WORD_ECX] == 0x6c65746e;
/* Leaf 1: the mask keeps the extended-model, family and model fields of EAX and ignores the stepping. */
__cpuid(CpuInfo, InfoType = 1);
IsSkylakeX = IsIntel && (CpuInfo[WORD_EAX] & 0xf0ff0) == 0x50650;
HasOSXSAVE = !!(CpuInfo[WORD_ECX] & (1 << CPUID_1_ECX_OSXSAVE_BIT));
/* Disable every feature whose required CPUID bit is missing or whose leaf is not supported. */
for (ULONG i = 0; i < ARRAYSIZE(CpuidBitInfo); ++i)
{
if (CpuidBitInfo[i].Leaf != InfoType)
__cpuid(CpuInfo, InfoType = CpuidBitInfo[i].Leaf);
if (CpuidBitInfo[i].Leaf > MaxInfoType || !(CpuInfo[CpuidBitInfo[i].Word] & (1UL << CpuidBitInfo[i].Bitno)))
DisabledCpuFeatures |= CpuidBitInfo[i].RequiredBy;
}
/* A state component counts only if both the kernel and XCR0 report it; XCR0 is read only when OSXSAVE is set. */
ULONG64 FeatureMask = RtlGetEnabledExtendedFeatures((ULONG64)(-1)) & (ULONG64)(HasOSXSAVE ? _xgetbv(0) : 0);
if ((FeatureMask & (XSTATE_MASK_GSSE | XSTATE_MASK_AVX)) != (XSTATE_MASK_GSSE | XSTATE_MASK_AVX))
DisabledCpuFeatures |= CPU_FEATURE_AVX | CPU_FEATURE_AVX2;
if ((FeatureMask & (XSTATE_MASK_GSSE | XSTATE_MASK_AVX | XSTATE_MASK_AVX512)) !=
(XSTATE_MASK_GSSE | XSTATE_MASK_AVX | XSTATE_MASK_AVX512))
DisabledCpuFeatures |= CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512VL | CPU_FEATURE_AVX512IFMA;
/* Cascade: losing a lower tier disables every tier above it (SSSE3 -> AVX -> AVX2 -> AVX512F -> VL/IFMA). */
if (DisabledCpuFeatures & CPU_FEATURE_SSSE3)
DisabledCpuFeatures |= CPU_FEATURE_AVX;
if (DisabledCpuFeatures & CPU_FEATURE_AVX)
DisabledCpuFeatures |= CPU_FEATURE_AVX2;
if (DisabledCpuFeatures & CPU_FEATURE_AVX2)
DisabledCpuFeatures |= CPU_FEATURE_AVX512F;
if (DisabledCpuFeatures & CPU_FEATURE_AVX512F)
DisabledCpuFeatures |= CPU_FEATURE_AVX512VL;
if (DisabledCpuFeatures & CPU_FEATURE_AVX512F)
DisabledCpuFeatures |= CPU_FEATURE_AVX512IFMA;
/* AVX512F downclocks too much on Skylake X, but VL is fine. */
/* This runs after the cascade on purpose, so it removes AVX512F without also removing VL/IFMA. */
if (IsSkylakeX)
DisabledCpuFeatures |= CPU_FEATURE_AVX512F;
CpuFeatures = ~DisabledCpuFeatures;
}
/* Prepares State for SIMD use: copies the global feature set and saves the extended processor state
 * required by the widest available tier. If a save fails, that tier's features are removed from
 * State->CpuFeatures and the next narrower tier is tried. HasSavedXState records whether a save
 * succeeded, which SimdPut() uses to decide whether to restore.
 */
_Use_decl_annotations_
VOID
SimdGet(SIMD_STATE *State)
{
State->HasSavedXState = FALSE;
State->CpuFeatures = CpuFeatures;
/* AVX-512 tier: needs both the AVX and AVX-512 state components saved. */
if (CpuFeatures & (CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512VL | CPU_FEATURE_AVX512IFMA))
{
State->HasSavedXState =
NT_SUCCESS(KeSaveExtendedProcessorState(XSTATE_MASK_AVX | XSTATE_MASK_AVX512, &State->XState));
if (State->HasSavedXState)
return;
State->CpuFeatures &= ~(CPU_FEATURE_AVX512F | CPU_FEATURE_AVX512VL | CPU_FEATURE_AVX512IFMA);
}
/* AVX/AVX2 tier: reached when AVX-512 is unavailable or its save failed. */
if (CpuFeatures & (CPU_FEATURE_AVX2 | CPU_FEATURE_AVX))
{
State->HasSavedXState = NT_SUCCESS(KeSaveExtendedProcessorState(XSTATE_MASK_AVX, &State->XState));
if (State->HasSavedXState)
return;
State->CpuFeatures &= ~(CPU_FEATURE_AVX2 | CPU_FEATURE_AVX);
}
/* Sometimes State->XState isn't initialized, because of HasSavedXState, but analysis doesn't know that. */
_Analysis_assume_((RtlFillMemory(State, sizeof(*State), 'A'), TRUE));
/* We don't need to save the state for SSSE3 on recent Windows. */
}
/* Releases a SIMD_STATE obtained from SimdGet(). When extended state was saved it is restored and the
 * whole structure is wiped; otherwise only the feature mask is cleared.
 */
_Use_decl_annotations_
VOID
SimdPut(SIMD_STATE *State)
{
    if (State->HasSavedXState)
    {
        KeRestoreExtendedProcessorState(&State->XState);
        RtlSecureZeroMemory(State, sizeof(*State));
        return;
    }

    /* Nothing was saved, so there is nothing to restore. */
    State->CpuFeatures = 0;
}
#else
/* Non-AMD64 builds perform no CPU feature detection; this entry point is intentionally empty. */
VOID CryptoDriverEntry(VOID) {}
#endif
/* Rotates Word left by Shift bits. Only the low five bits of Shift are used. */
static inline UINT32
Rol32(_In_ UINT32 Word, _In_ LONG Shift)
{
    CONST LONG Up = Shift & 31;
    CONST LONG Down = (-Shift) & 31;

    return (Word << Up) | (Word >> Down);
}
/* Rotates Word right by Shift bits. Only the low five bits of Shift are used. */
static inline UINT32
Ror32(_In_ UINT32 Word, _In_ LONG Shift)
{
    CONST LONG Down = Shift & 31;
    CONST LONG Up = (-Shift) & 31;

    return (Word >> Down) | (Word << Up);
}
/* Rotates Word left by Shift bits. Only the low six bits of Shift are used. */
static inline UINT64
Rol64(_In_ UINT64 Word, _In_ LONG Shift)
{
    CONST LONG Up = Shift & 63;
    CONST LONG Down = (-Shift) & 63;

    return (Word << Up) | (Word >> Down);
}
/* Pointer variants of the LeNNToCpu() conversions: dereference X, then convert the value. */
#define Le16ToCpup(X) Le16ToCpu(*(X))
#define Le32ToCpup(X) Le32ToCpu(*(X))
#define Le64ToCpup(X) Le64ToCpu(*(X))
/* Reads a little-endian 32-bit value from A, which need not be aligned. */
static inline UINT32
GetUnalignedLe32(_In_reads_bytes_(4) CONST UINT8 *A)
{
    UINT32 Raw;

    /* Copy byte-wise so no alignment is assumed for the source pointer. */
    RtlCopyMemory(&Raw, A, sizeof(Raw));
    return Le32ToCpu(Raw);
}
/* Reads a little-endian 64-bit value from A, which need not be aligned. */
static inline UINT64
GetUnalignedLe64(_In_reads_bytes_(8) CONST UINT8 *A)
{
    UINT64 Raw;

    /* Copy byte-wise so no alignment is assumed for the source pointer. */
    RtlCopyMemory(&Raw, A, sizeof(Raw));
    return Le64ToCpu(Raw);
}
/* Stores S at D as a little-endian 32-bit value; D need not be aligned. */
static inline VOID
PutUnalignedLe32(_In_ UINT32 S, _Out_writes_bytes_all_(4) UINT8 *D)
{
    CONST UINT32 Wire = CpuToLe32(S);

    RtlCopyMemory(D, &Wire, sizeof(Wire));
}
/* Converts Words 32-bit values in Buf from CPU byte order to little-endian, in place. */
static inline VOID
CpuToLe32Array(_Inout_updates_(Words) UINT32 *Buf, _In_ SIZE_T Words)
{
    SIZE_T Idx;

    for (Idx = 0; Idx < Words; ++Idx)
        Buf[Idx] = CpuToLe32(Buf[Idx]);
}
/* Converts Words 32-bit values in Buf from little-endian to CPU byte order, in place. */
static inline VOID
Le32ToCpuArray(_Inout_updates_(Words) UINT32 *Buf, _In_ SIZE_T Words)
{
    SIZE_T Idx;

    for (Idx = 0; Idx < Words; ++Idx)
        Buf[Idx] = Le32ToCpu(Buf[Idx]);
}
/* Writes Src1 XOR Src2 to Dst for Len bytes, front to back, one byte at a time. */
static VOID
XorCpy(
    _Out_writes_bytes_all_(Len) UINT8 *Dst,
    _In_reads_bytes_(Len) CONST UINT8 *Src1,
    _In_reads_bytes_(Len) CONST UINT8 *Src2,
    _In_ SIZE_T Len)
{
    CONST UINT8 *CONST End = Src1 + Len;

    while (Src1 != End)
        *Dst++ = *Src1++ ^ *Src2++;
}
/* ChaCha quarter round on the four words X[A], X[B], X[C], X[D]: add, xor, rotate-left by 16, 12, 8 and 7. */
#define QUARTER_ROUND(X, A, B, C, D) \
    (X[A] += X[B], \
     X[D] = Rol32((X[D] ^ X[A]), 16), \
     X[C] += X[D], \
     X[B] = Rol32((X[B] ^ X[C]), 12), \
     X[A] += X[B], \
     X[D] = Rol32((X[D] ^ X[A]), 8), \
     X[C] += X[D], \
     X[B] = Rol32((X[B] ^ X[C]), 7))
/* Index of row i, column j in the 4x4 state stored as a flat 16-word array.
 * Arguments are parenthesized so that expression arguments such as C(r + 1, c) expand correctly.
 */
#define C(i, j) ((i) * 4 + (j))
/* One column round followed by one diagonal round. */
#define DOUBLE_ROUND(X) \
    (/* Column Round */ \
     QUARTER_ROUND(X, C(0, 0), C(1, 0), C(2, 0), C(3, 0)), \
     QUARTER_ROUND(X, C(0, 1), C(1, 1), C(2, 1), C(3, 1)), \
     QUARTER_ROUND(X, C(0, 2), C(1, 2), C(2, 2), C(3, 2)), \
     QUARTER_ROUND(X, C(0, 3), C(1, 3), C(2, 3), C(3, 3)), /* Diagonal Round */ \
     QUARTER_ROUND(X, C(0, 0), C(1, 1), C(2, 2), C(3, 3)), \
     QUARTER_ROUND(X, C(0, 1), C(1, 2), C(2, 3), C(3, 0)), \
     QUARTER_ROUND(X, C(0, 2), C(1, 3), C(2, 0), C(3, 1)), \
     QUARTER_ROUND(X, C(0, 3), C(1, 0), C(2, 1), C(3, 2)))
/* Ten double rounds, i.e. the twenty rounds of ChaCha20, applied in place to X. */
#define TWENTY_ROUNDS(X) \
    (DOUBLE_ROUND(X), \
     DOUBLE_ROUND(X), \
     DOUBLE_ROUND(X), \
     DOUBLE_ROUND(X), \
     DOUBLE_ROUND(X), \
     DOUBLE_ROUND(X), \
     DOUBLE_ROUND(X), \
     DOUBLE_ROUND(X), \
     DOUBLE_ROUND(X), \
     DOUBLE_ROUND(X))
/* ChaCha20/HChaCha20 sizes. Values are in bytes, except the *_WORDS entries which count UINT32 words. */
enum CHACHA20_LENGTHS
{
CHACHA20_NONCE_SIZE = 16,
CHACHA20_KEY_SIZE = 32,
CHACHA20_KEY_WORDS = CHACHA20_KEY_SIZE / sizeof(UINT32),
CHACHA20_BLOCK_SIZE = 64,
CHACHA20_BLOCK_WORDS = CHACHA20_BLOCK_SIZE / sizeof(UINT32),
HCHACHA20_NONCE_SIZE = CHACHA20_NONCE_SIZE,
HCHACHA20_KEY_SIZE = CHACHA20_KEY_SIZE
};
enum CHACHA20_CONSTANTS
{
/* expand 32-byte k */
CHACHA20_CONSTANT_EXPA = 0x61707865U,
CHACHA20_CONSTANT_ND_3 = 0x3320646eU,
CHACHA20_CONSTANT_2_BY = 0x79622d32U,
CHACHA20_CONSTANT_TE_K = 0x6b206574U
};
/* ChaCha20 state, accessible either as the flat 16-word State array or through its named parts. */
typedef struct _CHACHA20_CTX
{
union
{
UINT32 State[16];
struct
{
/* Words 0-3: fixed constants. */
UINT32 Constant[4];
/* Words 4-11: key. */
UINT32 Key[8];
/* Words 12-15: ChaCha20Init() zeroes [0..1] and stores the 64-bit nonce in [2..3]; [0] is advanced per block. */
UINT32 Counter[4];
};
};
} CHACHA20_CTX;
/* Initializes Ctx with the ChaCha20 constants, the 32-byte Key (read as little-endian words),
 * a zero block counter and the 64-bit Nonce split across the last two state words.
 */
static VOID
ChaCha20Init(_Out_ CHACHA20_CTX *Ctx, _In_ CONST UINT8 Key[CHACHA20_KEY_SIZE], _In_ CONST UINT64 Nonce)
{
    ULONG Word;

    Ctx->Constant[0] = CHACHA20_CONSTANT_EXPA;
    Ctx->Constant[1] = CHACHA20_CONSTANT_ND_3;
    Ctx->Constant[2] = CHACHA20_CONSTANT_2_BY;
    Ctx->Constant[3] = CHACHA20_CONSTANT_TE_K;

    for (Word = 0; Word < 8; ++Word)
        Ctx->Key[Word] = GetUnalignedLe32(Key + 4 * Word);

    /* Counter[0..1] is the block counter, Counter[2..3] the nonce (low word first). */
    Ctx->Counter[0] = 0;
    Ctx->Counter[1] = 0;
    Ctx->Counter[2] = Nonce & 0xffffffffU;
    Ctx->Counter[3] = Nonce >> 32;
}
#if defined(_M_AMD64)
/* ChaCha20 bulk routines, one per instruction-set tier. Only the prototypes are visible in this file;
 * the definitions live elsewhere (presumably assembly - confirm against the build). Each takes the
 * key and counter words of a CHACHA20_CTX and processes Len bytes from Src into Dst.
 * NOTE(review): ChaCha20() advances Ctx->Counter[0] itself after the call, which suggests these
 * routines do not write the counter back - confirm.
 */
VOID
ChaCha20ALU(
_Out_writes_bytes_all_(Len) UINT8 *Dst,
_In_reads_bytes_(Len) CONST UINT8 *Src,
_In_ SIZE_T Len,
_In_ CONST UINT32 Key[8],
_In_ CONST UINT32 Counter[4]);
VOID
ChaCha20SSSE3(
_Out_writes_bytes_all_(Len) UINT8 *Dst,
_In_reads_bytes_(Len) CONST UINT8 *Src,
_In_ SIZE_T Len,
_In_ CONST UINT32 Key[8],
_In_ CONST UINT32 Counter[4]);
VOID
ChaCha20AVX2(
_Out_writes_bytes_all_(Len) UINT8 *Dst,
_In_reads_bytes_(Len) CONST UINT8 *Src,
_In_ SIZE_T Len,
_In_ CONST UINT32 Key[8],
_In_ CONST UINT32 Counter[4]);
VOID
ChaCha20AVX512(
_Out_writes_bytes_all_(Len) UINT8 *Dst,
_In_reads_bytes_(Len) CONST UINT8 *Src,
_In_ SIZE_T Len,
_In_ CONST UINT32 Key[8],
_In_ CONST UINT32 Counter[4]);
VOID
ChaCha20AVX512VL(
_Out_writes_bytes_all_(Len) UINT8 *Dst,
_In_reads_bytes_(Len) CONST UINT8 *Src,
_In_ SIZE_T Len,
_In_ CONST UINT32 Key[8],
_In_ CONST UINT32 Counter[4]);
/* XORs Len bytes of In with the ChaCha20 keystream into Out, choosing the widest implementation
 * allowed, then advances the block counter by the number of 64-byte blocks consumed (rounded up).
 * With Simd == NULL only the SSSE3 and ALU paths are eligible, based on the global CpuFeatures.
 * A zero Len is a no-op.
 */
static VOID
ChaCha20(
    _Inout_ CHACHA20_CTX *Ctx,
    _Out_writes_bytes_all_(Len) UINT8 *Out,
    _In_reads_bytes_(Len) CONST UINT8 *In,
    _In_ UINT32 Len,
    _In_opt_ CONST SIMD_STATE *Simd)
{
    /* Without a SIMD_STATE, only SSSE3 may be taken from the global feature set. */
    CONST CPU_FEATURE Usable = Simd ? Simd->CpuFeatures : (CpuFeatures & CPU_FEATURE_SSSE3);

    if (Len == 0)
        return;

    if (Usable & CPU_FEATURE_AVX512F)
        ChaCha20AVX512(Out, In, Len, Ctx->Key, Ctx->Counter);
    else if (Usable & CPU_FEATURE_AVX512VL)
        ChaCha20AVX512VL(Out, In, Len, Ctx->Key, Ctx->Counter);
    else if (Usable & CPU_FEATURE_AVX2)
        ChaCha20AVX2(Out, In, Len, Ctx->Key, Ctx->Counter);
    else if (Usable & CPU_FEATURE_SSSE3)
        ChaCha20SSSE3(Out, In, Len, Ctx->Key, Ctx->Counter);
    else
        ChaCha20ALU(Out, In, Len, Ctx->Key, Ctx->Counter);

    Ctx->Counter[0] += (Len + 63) / 64;
}
/* Produces one 64-byte keystream block in Stream by running ChaCha20() over a block of zeros. */
static VOID
ChaCha20Block(
    _Inout_ CHACHA20_CTX *Ctx,
    _Out_writes_all_(CHACHA20_BLOCK_WORDS) UINT32 Stream[CHACHA20_BLOCK_WORDS],
    _In_opt_ CONST SIMD_STATE *Simd)
{
    static CONST UINT32 Zeros[CHACHA20_BLOCK_WORDS] = { 0 };
    UINT8 *CONST Keystream = (UINT8 *)Stream;

    /* XOR with zero leaves the raw keystream. */
    ChaCha20(Ctx, Keystream, (CONST UINT8 *)Zeros, sizeof(Zeros), Simd);
}
#else
/* Portable ChaCha20 block function: writes one 64-byte keystream block (little-endian words)
 * to Stream and advances the block counter by one. Simd is unused in this implementation.
 */
static VOID
ChaCha20Block(
    _Inout_ CHACHA20_CTX *Ctx,
    _Out_writes_all_(CHACHA20_BLOCK_WORDS) UINT32 Stream[CHACHA20_BLOCK_WORDS],
    _In_opt_ CONST SIMD_STATE *Simd)
{
    UINT32 Work[CHACHA20_BLOCK_WORDS];
    ULONG Idx;

    /* Run the rounds on a working copy, then add the original state back in. */
    RtlCopyMemory(Work, Ctx->State, sizeof(Work));
    TWENTY_ROUNDS(Work);
    for (Idx = 0; Idx < CHACHA20_BLOCK_WORDS; ++Idx)
        Stream[Idx] = CpuToLe32(Work[Idx] + Ctx->State[Idx]);

    Ctx->Counter[0] += 1;
}
/* Portable ChaCha20: XORs Len bytes of In with the keystream into Out, one 64-byte block at a time.
 * A trailing partial block still consumes a whole keystream block.
 */
static VOID
ChaCha20(
    _Inout_ CHACHA20_CTX *Ctx,
    _Out_writes_bytes_all_(Len) UINT8 *Out,
    _In_reads_bytes_(Len) CONST UINT8 *In,
    _In_ UINT32 Len,
    _In_opt_ CONST SIMD_STATE *Simd)
{
    UINT32 Keystream[CHACHA20_BLOCK_WORDS];

    while (Len)
    {
        CONST UINT32 Chunk = Len < CHACHA20_BLOCK_SIZE ? Len : CHACHA20_BLOCK_SIZE;

        ChaCha20Block(Ctx, Keystream, Simd);
        XorCpy(Out, In, (UINT8 *)Keystream, Chunk);
        Out += Chunk;
        In += Chunk;
        Len -= Chunk;
    }
}
#endif
/* HChaCha20: derives an 8-word subkey from Key and a 16-byte Nonce. The state is the four constants,
 * the key and the nonce; after twenty rounds the first and last four state words form the output,
 * with no feed-forward addition of the input state. DerivedKey is left in CPU word order.
 */
static VOID
HChaCha20(
_Out_writes_all_(CHACHA20_KEY_WORDS) UINT32 DerivedKey[CHACHA20_KEY_WORDS],
_In_ CONST UINT8 Nonce[HCHACHA20_NONCE_SIZE],
_In_ CONST UINT8 Key[HCHACHA20_KEY_SIZE])
{
UINT32 X[] = { CHACHA20_CONSTANT_EXPA, CHACHA20_CONSTANT_ND_3, CHACHA20_CONSTANT_2_BY,
CHACHA20_CONSTANT_TE_K, GetUnalignedLe32(Key + 0), GetUnalignedLe32(Key + 4),
GetUnalignedLe32(Key + 8), GetUnalignedLe32(Key + 12), GetUnalignedLe32(Key + 16),
GetUnalignedLe32(Key + 20), GetUnalignedLe32(Key + 24), GetUnalignedLe32(Key + 28),
GetUnalignedLe32(Nonce + 0), GetUnalignedLe32(Nonce + 4), GetUnalignedLe32(Nonce + 8),
GetUnalignedLe32(Nonce + 12) };
TWENTY_ROUNDS(X);
/* Output words 0-3 and 12-15 of the permuted state. */
RtlCopyMemory(DerivedKey + 0, X + 0, sizeof(UINT32) * 4);
RtlCopyMemory(DerivedKey + 4, X + 12, sizeof(UINT32) * 4);
}
/* Poly1305 sizes in bytes. */
enum POLY1305_LENGTHS
{
POLY1305_BLOCK_SIZE = 16,
POLY1305_KEY_SIZE = 32,
POLY1305_MAC_SIZE = 16
};
#if defined(_M_AMD64)
/* Poly1305 accumulator/key state shared with the external AMD64 routines. The three members are
 * alternative layouts over the same storage.
 * NOTE(review): the member names suggest radix 2^64, 2^26 and 2^44 representations used by the
 * ALU, AVX/AVX2 and AVX512IFMA implementations respectively - confirm against those routines,
 * which define the exact field meaning.
 */
typedef union _POLY1305_INTERNAL
{
struct
{
UINT64 H[3];
UINT64 R[2];
} Base264;
struct
{
UINT32 H[5];
UINT32 IsBase226;
UINT64 R[2];
UINT64 Pad;
struct
{
UINT32 R2, R1, R4, R3;
} RP[9];
} Base226;
struct
{
UINT64 H[3];
UINT64 S[2];
UINT64 R[3];
struct
{
UINT32 R1, R3, R2, R4;
} RP[4];
} Base244;
} POLY1305_INTERNAL;
/* Poly1305 primitives, one per instruction-set tier. Only the prototypes are visible in this file;
 * the definitions live elsewhere (presumably assembly - confirm against the build).
 * Init sets up Ctx from the key, Blocks absorbs Len bytes of input with the given PadBit, and Emit
 * writes the 16-byte tag using the four Nonce words.
 */
VOID
Poly1305InitALU(_Out_ POLY1305_INTERNAL *Ctx, _In_ CONST UINT8 Key[POLY1305_BLOCK_SIZE]);
VOID
Poly1305InitAVX512IFMA(_Out_ POLY1305_INTERNAL *Ctx, _In_ CONST UINT8 Key[POLY1305_BLOCK_SIZE]);
VOID
Poly1305BlocksALU(
_Inout_ POLY1305_INTERNAL *Ctx,
_In_reads_bytes_(Len) CONST UINT8 *In,
_In_ CONST SIZE_T Len,
_In_ CONST UINT32 PadBit);
VOID
Poly1305BlocksAVX(
_Inout_ POLY1305_INTERNAL *Ctx,
_In_reads_bytes_(Len) CONST UINT8 *In,
_In_ CONST SIZE_T Len,
_In_ CONST UINT32 PadBit);
VOID
Poly1305BlocksAVX2(
_Inout_ POLY1305_INTERNAL *Ctx,
_In_reads_bytes_(Len) CONST UINT8 *In,
_In_ CONST SIZE_T Len,
_In_ CONST UINT32 PadBit);
VOID
Poly1305BlocksAVX512IFMA(
_Inout_ POLY1305_INTERNAL *Ctx,
_In_reads_bytes_(Len) CONST UINT8 *In,
_In_ CONST SIZE_T Len,
_In_ CONST UINT32 PadBit);
VOID
Poly1305EmitALU(
_In_ CONST POLY1305_INTERNAL *Ctx,
_Out_writes_bytes_all_(POLY1305_MAC_SIZE) UINT8 Mac[POLY1305_MAC_SIZE],
_In_ CONST UINT32 Nonce[4]);
VOID
Poly1305EmitAVX512IFMA(
_In_ CONST POLY1305_INTERNAL *Ctx,
_Out_writes_bytes_all_(POLY1305_MAC_SIZE) UINT8 Mac[POLY1305_MAC_SIZE],
_In_ CONST UINT32 Nonce[4]);
/* Initializes St from the first 16 key bytes, using the AVX512IFMA routine when Simd permits it
 * and the ALU routine otherwise (including when Simd is NULL).
 */
static VOID
Poly1305InitCore(_Out_ POLY1305_INTERNAL *St, _In_ CONST UINT8 Key[16], _In_opt_ CONST SIMD_STATE *Simd)
{
    if (!Simd || !(Simd->CpuFeatures & CPU_FEATURE_AVX512IFMA))
    {
        Poly1305InitALU(St, Key);
        return;
    }

    Poly1305InitAVX512IFMA(St, Key);
}
/* Absorbs Len bytes of Input into St with the given PadBit, dispatching to the widest routine
 * that Simd permits: AVX512IFMA, then AVX2, then AVX, falling back to ALU (also when Simd is NULL).
 */
static VOID
Poly1305BlocksCore(
    _Inout_ POLY1305_INTERNAL *St,
    _In_reads_bytes_(Len) CONST UINT8 *Input,
    _In_ SIZE_T Len,
    _In_ CONST UINT32 PadBit,
    _In_opt_ CONST SIMD_STATE *Simd)
{
    if (!Simd)
    {
        Poly1305BlocksALU(St, Input, Len, PadBit);
        return;
    }

    if (Simd->CpuFeatures & CPU_FEATURE_AVX512IFMA)
        Poly1305BlocksAVX512IFMA(St, Input, Len, PadBit);
    else if (Simd->CpuFeatures & CPU_FEATURE_AVX2)
        Poly1305BlocksAVX2(St, Input, Len, PadBit);
    else if (Simd->CpuFeatures & CPU_FEATURE_AVX)
        Poly1305BlocksAVX(St, Input, Len, PadBit);
    else
        Poly1305BlocksALU(St, Input, Len, PadBit);
}
/* Writes the final 16-byte tag for St and Nonce into Mac, using the AVX512IFMA routine when Simd
 * permits it and the ALU routine otherwise (including when Simd is NULL).
 */
static VOID
Poly1305EmitCore(
    _In_ CONST POLY1305_INTERNAL *St,
    _Out_writes_bytes_all_(16) UINT8 Mac[16],
    _In_ CONST UINT32 Nonce[4],
    _In_opt_ CONST SIMD_STATE *Simd)
{
    if (!Simd || !(Simd->CpuFeatures & CPU_FEATURE_AVX512IFMA))
    {
        Poly1305EmitALU(St, Mac, Nonce);
        return;
    }

    Poly1305EmitAVX512IFMA(St, Mac, Nonce);
}
#else
/* Portable Poly1305 state, held as 26-bit limbs in 32-bit words. */
typedef struct _POLY1305_INTERNAL
{
/* Accumulator h, five limbs. */
UINT32 H[5];
/* Clamped key r, five limbs. */
UINT32 R[5];
/* Precomputed 5 * R[1..4]. */
UINT32 S[4];
} POLY1305_INTERNAL;
/* Portable Poly1305 init: loads the first 16 key bytes as five clamped 26-bit limbs of r,
 * precomputes S = 5 * R[1..4] and zeroes the accumulator. Simd is unused here.
 */
static VOID
Poly1305InitCore(_Out_ POLY1305_INTERNAL *St, _In_ CONST UINT8 Key[16], _In_opt_ CONST SIMD_STATE *Simd)
{
/* r &= 0xffffffc0ffffffc0ffffffc0fffffff */
/* The overlapping unaligned reads at byte offsets 0, 3, 6, 9, 12 plus the shifts slice out 26-bit limbs;
 * the masks apply the clamp per limb.
 */
St->R[0] = (GetUnalignedLe32(&Key[0])) & 0x3ffffff;
St->R[1] = (GetUnalignedLe32(&Key[3]) >> 2) & 0x3ffff03;
St->R[2] = (GetUnalignedLe32(&Key[6]) >> 4) & 0x3ffc0ff;
St->R[3] = (GetUnalignedLe32(&Key[9]) >> 6) & 0x3f03fff;
St->R[4] = (GetUnalignedLe32(&Key[12]) >> 8) & 0x00fffff;
/* s = 5*r */
St->S[0] = St->R[1] * 5;
St->S[1] = St->R[2] * 5;
St->S[2] = St->R[3] * 5;
St->S[3] = St->R[4] * 5;
/* h = 0 */
St->H[0] = 0;
St->H[1] = 0;
St->H[2] = 0;
St->H[3] = 0;
St->H[4] = 0;
}
/* Portable Poly1305 block function: for each full 16-byte block of Input, computes
 * h = (h + block) * r with a partial reduction. Any trailing bytes short of a full block are
 * ignored (the loop runs only while Len >= POLY1305_BLOCK_SIZE). PadBit lands at bit 24 of the
 * top limb, i.e. it supplies the 2^128 bit added to each block. Simd is unused here.
 */
static VOID
Poly1305BlocksCore(
_Inout_ POLY1305_INTERNAL *St,
_In_reads_bytes_(Len) CONST UINT8 *Input,
_In_ SIZE_T Len,
_In_ CONST UINT32 PadBit,
_In_opt_ CONST SIMD_STATE *Simd)
{
CONST UINT32 Hibit = PadBit << 24;
UINT32 R0, R1, R2, R3, R4;
UINT32 S1, S2, S3, S4;
UINT32 H0, H1, H2, H3, H4;
UINT64 D0, D1, D2, D3, D4;
UINT32 C;
/* Work on local copies of the state; only H is written back at the end. */
R0 = St->R[0];
R1 = St->R[1];
R2 = St->R[2];
R3 = St->R[3];
R4 = St->R[4];
S1 = St->S[0];
S2 = St->S[1];
S3 = St->S[2];
S4 = St->S[3];
H0 = St->H[0];
H1 = St->H[1];
H2 = St->H[2];
H3 = St->H[3];
H4 = St->H[4];
while (Len >= POLY1305_BLOCK_SIZE)
{
/* h += m[i] */
H0 += (GetUnalignedLe32(&Input[0])) & 0x3ffffff;
H1 += (GetUnalignedLe32(&Input[3]) >> 2) & 0x3ffffff;
H2 += (GetUnalignedLe32(&Input[6]) >> 4) & 0x3ffffff;
H3 += (GetUnalignedLe32(&Input[9]) >> 6) & 0x3ffffff;
H4 += (GetUnalignedLe32(&Input[12]) >> 8) | Hibit;
/* h *= r */
/* Limb products that would overflow past limb 4 are folded back in through the 5*r values S1..S4. */
D0 = ((UINT64)H0 * R0) + ((UINT64)H1 * S4) + ((UINT64)H2 * S3) + ((UINT64)H3 * S2) + ((UINT64)H4 * S1);
D1 = ((UINT64)H0 * R1) + ((UINT64)H1 * R0) + ((UINT64)H2 * S4) + ((UINT64)H3 * S3) + ((UINT64)H4 * S2);
D2 = ((UINT64)H0 * R2) + ((UINT64)H1 * R1) + ((UINT64)H2 * R0) + ((UINT64)H3 * S4) + ((UINT64)H4 * S3);
D3 = ((UINT64)H0 * R3) + ((UINT64)H1 * R2) + ((UINT64)H2 * R1) + ((UINT64)H3 * R0) + ((UINT64)H4 * S4);
D4 = ((UINT64)H0 * R4) + ((UINT64)H1 * R3) + ((UINT64)H2 * R2) + ((UINT64)H3 * R1) + ((UINT64)H4 * R0);
/* (partial) h %= p */
/* Carry chain D0 -> D4, then the top carry wraps into H0 multiplied by 5. */
C = (UINT32)(D0 >> 26);
H0 = (UINT32)D0 & 0x3ffffff;
D1 += C;
C = (UINT32)(D1 >> 26);
H1 = (UINT32)D1 & 0x3ffffff;
D2 += C;
C = (UINT32)(D2 >> 26);
H2 = (UINT32)D2 & 0x3ffffff;
D3 += C;
C = (UINT32)(D3 >> 26);
H3 = (UINT32)D3 & 0x3ffffff;
D4 += C;
C = (UINT32)(D4 >> 26);
H4 = (UINT32)D4 & 0x3ffffff;
H0 += C * 5;
C = (H0 >> 26);
H0 = H0 & 0x3ffffff;
H1 += C;
Input += POLY1305_BLOCK_SIZE;
Len -= POLY1305_BLOCK_SIZE;
}
St->H[0] = H0;
St->H[1] = H1;
St->H[2] = H2;
St->H[3] = H3;
St->H[4] = H4;
}
/* Portable Poly1305 finalization: fully reduces the accumulator modulo p, adds the four Nonce
 * words modulo 2^128 and writes the 16-byte little-endian tag to Mac. St is not modified.
 * Simd is unused here.
 */
static VOID
Poly1305EmitCore(
_In_ CONST POLY1305_INTERNAL *St,
_Out_writes_bytes_all_(16) UINT8 Mac[16],
_In_ CONST UINT32 Nonce[4],
_In_opt_ CONST SIMD_STATE *Simd)
{
UINT32 H0, H1, H2, H3, H4, C;
UINT32 G0, G1, G2, G3, G4;
UINT64 F;
UINT32 Mask;
/* fully carry h */
H0 = St->H[0];
H1 = St->H[1];
H2 = St->H[2];
H3 = St->H[3];
H4 = St->H[4];
C = H1 >> 26;
H1 = H1 & 0x3ffffff;
H2 += C;
C = H2 >> 26;
H2 = H2 & 0x3ffffff;
H3 += C;
C = H3 >> 26;
H3 = H3 & 0x3ffffff;
H4 += C;
C = H4 >> 26;
H4 = H4 & 0x3ffffff;
H0 += C * 5;
C = H0 >> 26;
H0 = H0 & 0x3ffffff;
H1 += C;
/* compute h + -p */
G0 = H0 + 5;
C = G0 >> 26;
G0 &= 0x3ffffff;
G1 = H1 + C;
C = G1 >> 26;
G1 &= 0x3ffffff;
G2 = H2 + C;
C = G2 >> 26;
G2 &= 0x3ffffff;
G3 = H3 + C;
C = G3 >> 26;
G3 &= 0x3ffffff;
G4 = H4 + C - (1UL << 26);
/* select h if h < p, or h + -p if h >= p */
/* Branch-free select: Mask is all ones when the subtraction did not borrow (top bit of G4 clear), else zero. */
Mask = (G4 >> ((sizeof(UINT32) * 8) - 1)) - 1;
G0 &= Mask;
G1 &= Mask;
G2 &= Mask;
G3 &= Mask;
G4 &= Mask;
Mask = ~Mask;
H0 = (H0 & Mask) | G0;
H1 = (H1 & Mask) | G1;
H2 = (H2 & Mask) | G2;
H3 = (H3 & Mask) | G3;
H4 = (H4 & Mask) | G4;
/* h = h % (2^128) */
/* Repack the five 26-bit limbs into four 32-bit words. */
H0 = ((H0) | (H1 << 26)) & 0xffffffff;
H1 = ((H1 >> 6) | (H2 << 20)) & 0xffffffff;
H2 = ((H2 >> 12) | (H3 << 14)) & 0xffffffff;
H3 = ((H3 >> 18) | (H4 << 8)) & 0xffffffff;
/* mac = (h + nonce) % (2^128) */
F = (UINT64)H0 + Nonce[0];
H0 = (UINT32)F;
F = (UINT64)H1 + Nonce[1] + (F >> 32);
H1 = (UINT32)F;
F = (UINT64)H2 + Nonce[2] + (F >> 32);
H2 = (UINT32)F;
F = (UINT64)H3 + Nonce[3] + (F >> 32);
H3 = (UINT32)F;
PutUnalignedLe32(H0, &Mac[0]);
PutUnalignedLe32(H1, &Mac[4]);
PutUnalignedLe32(H2, &Mac[8]);
PutUnalignedLe32(H3, &Mac[12]);
}
#endif
/* Streaming Poly1305 context used by Poly1305Init/Update/Final. */
typedef struct _POLY1305_CTX
{
/* Accumulator and key state for the selected implementation. */
POLY1305_INTERNAL State;
/* Key bytes 16..31 as four little-endian words, added to the accumulator when the tag is emitted. */
UINT32 Nonce[4];
/* Buffer for a partial block carried between Poly1305Update() calls. */
UINT8 Data[POLY1305_BLOCK_SIZE];
/* Number of valid bytes currently in Data. */
SIZE_T Num;
/* SIMD state passed to the core routines; may be NULL. */
CONST SIMD_STATE *Simd;
} POLY1305_CTX;
/* Starts a Poly1305 computation: key bytes 0..15 initialize the core state, key bytes 16..31 are
 * kept as the four Nonce words, and the partial-block buffer is marked empty.
 */
static VOID
Poly1305Init(_Out_ POLY1305_CTX *Ctx, _In_ CONST UINT8 Key[POLY1305_KEY_SIZE], _In_opt_ CONST SIMD_STATE *Simd)
{
    ULONG Word;

    for (Word = 0; Word < 4; ++Word)
        Ctx->Nonce[Word] = GetUnalignedLe32(&Key[16 + 4 * Word]);

    Ctx->Simd = Simd;
    Poly1305InitCore(&Ctx->State, Key, Simd);
    Ctx->Num = 0;
}
/* Absorbs Len bytes of Input. Bytes that do not complete a 16-byte block are buffered in Ctx->Data
 * and combined with the start of the next call's input. A zero Len is a no-op.
 */
static VOID
Poly1305Update(_Inout_ POLY1305_CTX *Ctx, _In_reads_bytes_(Len) CONST UINT8 *Input, _In_ SIZE_T Len)
{
CONST SIZE_T Num = Ctx->Num;
SIZE_T Rem;
if (Num)
{
/* A partial block is pending: top it up first. */
Rem = POLY1305_BLOCK_SIZE - Num;
if (Len < Rem)
{
/* Still not a full block; just buffer and wait for more input. */
RtlCopyMemory(Ctx->Data + Num, Input, Len);
Ctx->Num = Num + Len;
return;
}
RtlCopyMemory(Ctx->Data + Num, Input, Rem);
Poly1305BlocksCore(&Ctx->State, Ctx->Data, POLY1305_BLOCK_SIZE, 1, Ctx->Simd);
Input += Rem;
Len -= Rem;
}
/* Process all whole blocks directly from Input, then buffer the tail. */
Rem = Len % POLY1305_BLOCK_SIZE;
Len -= Rem;
if (Len >= POLY1305_BLOCK_SIZE)
{
Poly1305BlocksCore(&Ctx->State, Input, Len, 1, Ctx->Simd);
Input += Len;
}
if (Rem)
RtlCopyMemory(Ctx->Data, Input, Rem);
Ctx->Num = Rem;
}
/* Finishes the computation: a pending partial block is terminated with a 0x01 byte, zero-filled
 * to 16 bytes and absorbed with PadBit 0; then the tag is written to Mac and the context is wiped.
 */
static VOID
Poly1305Final(_Inout_ POLY1305_CTX *Ctx, _Out_writes_bytes_all_(16) UINT8 Mac[POLY1305_MAC_SIZE])
{
    CONST SIZE_T Pending = Ctx->Num;
    SIZE_T Idx;

    if (Pending != 0)
    {
        Ctx->Data[Pending] = 1;
        for (Idx = Pending + 1; Idx < POLY1305_BLOCK_SIZE; ++Idx)
            Ctx->Data[Idx] = 0;
        Poly1305BlocksCore(&Ctx->State, Ctx->Data, POLY1305_BLOCK_SIZE, 0, Ctx->Simd);
    }

    Poly1305EmitCore(&Ctx->State, Mac, Ctx->Nonce, Ctx->Simd);
    RtlSecureZeroMemory(Ctx, sizeof(*Ctx));
}
/* Zero bytes fed to Poly1305 to pad the AD and ciphertext up to a multiple of 16 bytes. */
static CONST UINT8 Pad0[16] = { 0 };
/* ChaCha20-Poly1305 AEAD encryption over contiguous buffers, without SIMD. Writes SrcLen bytes of
 * ciphertext to Dst followed by the 16-byte tag at Dst + SrcLen, so Dst must hold
 * SrcLen + POLY1305_MAC_SIZE bytes. Key-derived temporaries are wiped before returning.
 */
_Use_decl_annotations_
VOID
ChaCha20Poly1305Encrypt(
UINT8 *Dst,
CONST UINT8 *Src,
CONST SIZE_T SrcLen,
CONST UINT8 *Ad,
CONST SIZE_T AdLen,
CONST UINT64 Nonce,
CONST UINT8 Key[CHACHA20POLY1305_KEY_SIZE])
{
POLY1305_CTX Poly1305State;
CHACHA20_CTX ChaCha20State;
union
{
UINT8 Block0[POLY1305_KEY_SIZE];
UINT64 Lens[2];
} B = { { 0 } };
/* The first keystream block (counter 0), taken over zeros, yields the one-time Poly1305 key. */
ChaCha20Init(&ChaCha20State, Key, Nonce);
ChaCha20(&ChaCha20State, B.Block0, B.Block0, sizeof(B.Block0), NULL);
Poly1305Init(&Poly1305State, B.Block0, NULL);
/* MAC input: AD, zero padding to 16, ciphertext, zero padding to 16, then both lengths as LE64. */
Poly1305Update(&Poly1305State, Ad, AdLen);
Poly1305Update(&Poly1305State, Pad0, (0x10 - AdLen) & 0xf);
ChaCha20(&ChaCha20State, Dst, Src, SrcLen, NULL);
Poly1305Update(&Poly1305State, Dst, SrcLen);
Poly1305Update(&Poly1305State, Pad0, (0x10 - SrcLen) & 0xf);
B.Lens[0] = CpuToLe64(AdLen);
B.Lens[1] = CpuToLe64(SrcLen);
Poly1305Update(&Poly1305State, (UINT8 *)B.Lens, sizeof(B.Lens));
Poly1305Final(&Poly1305State, Dst + SrcLen);
RtlSecureZeroMemory(&ChaCha20State, sizeof(ChaCha20State));
RtlSecureZeroMemory(&B, sizeof(B));
}
/* ChaCha20-Poly1305 AEAD decryption over contiguous buffers, without SIMD. Src holds the ciphertext
 * followed by the 16-byte tag (SrcLen covers both). Returns FALSE when SrcLen is shorter than a tag
 * or the tag does not match; Dst is written only after the tag has been verified.
 */
_Use_decl_annotations_
BOOLEAN
ChaCha20Poly1305Decrypt(
UINT8 *Dst,
CONST UINT8 *Src,
CONST SIZE_T SrcLen,
CONST UINT8 *Ad,
CONST SIZE_T AdLen,
CONST UINT64 Nonce,
CONST UINT8 Key[CHACHA20POLY1305_KEY_SIZE])
{
POLY1305_CTX Poly1305State;
CHACHA20_CTX ChaCha20State;
BOOLEAN Ret;
SIZE_T DstLen;
union
{
UINT8 Block0[POLY1305_KEY_SIZE];
UINT8 Mac[POLY1305_MAC_SIZE];
UINT64 Lens[2];
} B = { { 0 } };
if (SrcLen < POLY1305_MAC_SIZE)
return FALSE;
/* The first keystream block (counter 0), taken over zeros, yields the one-time Poly1305 key. */
ChaCha20Init(&ChaCha20State, Key, Nonce);
ChaCha20(&ChaCha20State, B.Block0, B.Block0, sizeof(B.Block0), NULL);
Poly1305Init(&Poly1305State, B.Block0, NULL);
Poly1305Update(&Poly1305State, Ad, AdLen);
Poly1305Update(&Poly1305State, Pad0, (0x10 - AdLen) & 0xf);
/* DstLen is the ciphertext length without the trailing tag. */
DstLen = SrcLen - POLY1305_MAC_SIZE;
Poly1305Update(&Poly1305State, Src, DstLen);
Poly1305Update(&Poly1305State, Pad0, (0x10 - DstLen) & 0xf);
B.Lens[0] = CpuToLe64(AdLen);
B.Lens[1] = CpuToLe64(DstLen);
Poly1305Update(&Poly1305State, (UINT8 *)B.Lens, sizeof(B.Lens));
Poly1305Final(&Poly1305State, B.Mac);
/* Compare the computed tag with the received one; decrypt only on a match. */
Ret = CryptoEqualMemory16(B.Mac, Src + DstLen);
if (Ret)
ChaCha20(&ChaCha20State, Dst, Src, DstLen, NULL);
RtlSecureZeroMemory(&ChaCha20State, sizeof(ChaCha20State));
RtlSecureZeroMemory(&B, sizeof(B));
return Ret;
}
/* ChaCha20-Poly1305 AEAD decryption of data held in an MDL chain.
 *
 * Src/SrcOffset locate the ciphertext, which is followed by the 16-byte tag; SrcLen covers both.
 * Plaintext (SrcLen - POLY1305_MAC_SIZE bytes) is written contiguously to Dst. Authentication and
 * decryption happen in a single pass, so Dst is written BEFORE the tag is checked: when this
 * returns FALSE the contents of Dst are unauthenticated and must be discarded by the caller.
 *
 * Returns TRUE only if the tag matches. Returns FALSE if SrcLen is shorter than a tag, the chain is
 * too short for SrcOffset/SrcLen, an MDL cannot be mapped, or the tag cannot be read or does not match.
 * All key-derived temporaries are wiped on every path that created them.
 */
_Use_decl_annotations_
BOOLEAN
ChaCha20Poly1305DecryptMdl(
    UINT8 *Dst,
    MDL *Src,
    CONST ULONG SrcLen,
    CONST ULONG SrcOffset,
    CONST UINT8 *Ad,
    CONST SIZE_T AdLen,
    CONST UINT64 Nonce,
    CONST UINT8 Key[CHACHA20POLY1305_KEY_SIZE],
    CONST SIMD_STATE *Simd)
{
    POLY1305_CTX Poly1305State;
    CHACHA20_CTX ChaCha20State;
    UINT8 *SrcBuf;
    ULONG Len, LenMdl, OffsetMdl = SrcOffset, Leftover = 0, Total, Remaining;
    MDL *Mdl = Src;
    BOOLEAN Ret = FALSE;
    union
    {
        UINT32 Stream[CHACHA20_BLOCK_WORDS];
        UINT8 Block0[POLY1305_KEY_SIZE];
        UINT8 Mac[POLY1305_MAC_SIZE * 2];
        UINT64 Lens[2];
    } B = { { 0 } };

    if (SrcLen < POLY1305_MAC_SIZE)
        return FALSE;
    /* Computed only after the length check so the subtraction cannot wrap. */
    Total = SrcLen - POLY1305_MAC_SIZE;
    Remaining = Total;

    /* The first keystream block (counter 0), taken over zeros, yields the one-time Poly1305 key. */
    ChaCha20Init(&ChaCha20State, Key, Nonce);
    ChaCha20(&ChaCha20State, B.Block0, B.Block0, sizeof(B.Block0), Simd);
    Poly1305Init(&Poly1305State, B.Block0, Simd);

    if (AdLen)
    {
        Poly1305Update(&Poly1305State, Ad, AdLen);
        if (AdLen & 0xf)
            Poly1305Update(&Poly1305State, Pad0, 0x10 - (AdLen & 0xf));
    }

    /* Skip whole MDLs covered by the offset. Stop at the end of the chain instead of
     * dereferencing NULL; the loop below then fails cleanly.
     */
    while (Mdl && OffsetMdl >= MmGetMdlByteCount(Mdl))
    {
        OffsetMdl -= MmGetMdlByteCount(Mdl);
        Mdl = Mdl->Next;
    }

    for (;;)
    {
        if (!Mdl)
            goto out;
        Len = LenMdl = min(MmGetMdlByteCount(Mdl) - OffsetMdl, Remaining);
        SrcBuf = MmGetSystemAddressForMdlSafe(Mdl, NormalPagePriority | MdlMappingNoExecute | MdlMappingNoWrite);
        if (!SrcBuf)
            goto out;
        SrcBuf += OffsetMdl;

        /* Potential TOCTOU? We read the bytes from SrcBuf for Poly1305 here, and later below
         * we decrypt those bytes with ChaCha20. If a user on the same physical machine can
         * access these pages, I fear it might be possible to sneak in a buffer that isn't
         * actually authenticated.
         */
        Poly1305Update(&Poly1305State, SrcBuf, LenMdl);

        /* First use up keystream left over from a partial block in the previous MDL. */
        if (Leftover != 0)
        {
            ULONG l = min(Len, Leftover);
            XorCpy(Dst, SrcBuf, ((UINT8 *)B.Stream) + (CHACHA20_BLOCK_SIZE - Leftover), l);
            Leftover -= l;
            SrcBuf += l;
            Dst += l;
            Len -= l;
        }
        /* Then process as many whole blocks as possible in one call. */
        if (Len >= CHACHA20_BLOCK_SIZE)
        {
            ULONG l = ALIGN_DOWN_BY_T(ULONG, Len, CHACHA20_BLOCK_SIZE);
            ChaCha20(&ChaCha20State, Dst, SrcBuf, l, Simd);
            SrcBuf += l;
            Dst += l;
            Len -= l;
        }
        /* Finally handle a trailing partial block, keeping the unused keystream for the next MDL. */
        if (Len)
        {
            ChaCha20Block(&ChaCha20State, B.Stream, Simd);
            XorCpy(Dst, SrcBuf, (UINT8 *)B.Stream, Len);
            Leftover = CHACHA20_BLOCK_SIZE - Len;
            Dst += Len;
        }

        Remaining -= LenMdl;
        if (!Remaining)
        {
            /* Leave Mdl/OffsetMdl pointing just past the ciphertext, where the tag starts. */
            OffsetMdl += LenMdl;
            break;
        }
        Mdl = Mdl->Next;
        OffsetMdl = 0;
    }

    Poly1305Update(&Poly1305State, Pad0, (0x10 - Total) & 0xf);
    B.Lens[0] = CpuToLe64(AdLen);
    B.Lens[1] = CpuToLe64(Total);
    Poly1305Update(&Poly1305State, (UINT8 *)B.Lens, sizeof(B.Lens));
    /* First half of B.Mac: computed tag. Second half: tag received in the MDL chain. */
    Poly1305Final(&Poly1305State, B.Mac);
    if (!NT_SUCCESS(MemCopyFromMdl(B.Mac + POLY1305_MAC_SIZE, Mdl, OffsetMdl, POLY1305_MAC_SIZE)))
        goto out;
    Ret = CryptoEqualMemory16(B.Mac, B.Mac + POLY1305_MAC_SIZE);

out:
    /* Poly1305Final() already wipes its context; failure paths that bail out earlier do not,
     * so wipe it here as well to avoid leaving the one-time key on the stack.
     */
    RtlSecureZeroMemory(&Poly1305State, sizeof(Poly1305State));
    RtlSecureZeroMemory(&ChaCha20State, sizeof(ChaCha20State));
    RtlSecureZeroMemory(&B, sizeof(B));
    return Ret;
}
/* ChaCha20-Poly1305 AEAD encryption of data held in an MDL chain.
 *
 * Reads SrcLen plaintext bytes starting SrcOffset bytes into the chain at Src, writes the
 * ciphertext contiguously to Dst and appends the 16-byte tag, so Dst must hold
 * SrcLen + POLY1305_MAC_SIZE bytes.
 *
 * Returns TRUE on success. Returns FALSE if the chain is too short for SrcOffset/SrcLen or an MDL
 * cannot be mapped; Dst may then hold partial ciphertext and no tag. All key-derived temporaries
 * are wiped on both the success and the failure paths.
 */
_Use_decl_annotations_
BOOLEAN
ChaCha20Poly1305EncryptMdl(
    UINT8 *Dst,
    MDL *Src,
    CONST ULONG SrcLen,
    CONST ULONG SrcOffset,
    CONST UINT8 *Ad,
    CONST SIZE_T AdLen,
    CONST UINT64 Nonce,
    CONST UINT8 Key[CHACHA20POLY1305_KEY_SIZE],
    CONST SIMD_STATE *Simd)
{
    POLY1305_CTX Poly1305State;
    CHACHA20_CTX ChaCha20State;
    UINT8 *SrcBuf;
    MDL *Mdl = Src;
    ULONG Len, LenMdl, OffsetMdl = SrcOffset, Leftover = 0;
    BOOLEAN Ret = FALSE;
    union
    {
        UINT32 Stream[CHACHA20_BLOCK_WORDS];
        UINT8 Block0[POLY1305_KEY_SIZE];
        UINT64 Lens[2];
    } B = { { 0 } };

    /* The first keystream block (counter 0), taken over zeros, yields the one-time Poly1305 key. */
    ChaCha20Init(&ChaCha20State, Key, Nonce);
    ChaCha20(&ChaCha20State, B.Block0, B.Block0, sizeof(B.Block0), Simd);
    Poly1305Init(&Poly1305State, B.Block0, Simd);

    if (AdLen)
    {
        Poly1305Update(&Poly1305State, Ad, AdLen);
        if (AdLen & 0xf)
            Poly1305Update(&Poly1305State, Pad0, 0x10 - (AdLen & 0xf));
    }

    /* Skip whole MDLs covered by the offset. Stop at the end of the chain instead of
     * dereferencing NULL; the loop below then fails cleanly if data is still expected.
     */
    while (Mdl && OffsetMdl >= MmGetMdlByteCount(Mdl))
    {
        OffsetMdl -= MmGetMdlByteCount(Mdl);
        Mdl = Mdl->Next;
    }

    for (ULONG Remaining = SrcLen; Remaining; Remaining -= LenMdl)
    {
        if (!Mdl)
            goto out;
        Len = LenMdl = min(MmGetMdlByteCount(Mdl) - OffsetMdl, Remaining);
        SrcBuf = MmGetSystemAddressForMdlSafe(Mdl, NormalPagePriority | MdlMappingNoExecute | MdlMappingNoWrite);
        if (!SrcBuf)
            goto out;
        SrcBuf += OffsetMdl;

        /* First use up keystream left over from a partial block in the previous MDL. */
        if (Leftover != 0)
        {
            ULONG l = min(Len, Leftover);
            XorCpy(Dst, SrcBuf, ((UINT8 *)B.Stream) + (CHACHA20_BLOCK_SIZE - Leftover), l);
            Leftover -= l;
            SrcBuf += l;
            Dst += l;
            Len -= l;
        }
        /* Then process as many whole blocks as possible in one call. */
        if (Len >= CHACHA20_BLOCK_SIZE)
        {
            ULONG l = ALIGN_DOWN_BY_T(ULONG, Len, CHACHA20_BLOCK_SIZE);
            ChaCha20(&ChaCha20State, Dst, SrcBuf, l, Simd);
            SrcBuf += l;
            Dst += l;
            Len -= l;
        }
        /* Finally handle a trailing partial block, keeping the unused keystream for the next MDL. */
        if (Len)
        {
            ChaCha20Block(&ChaCha20State, B.Stream, Simd);
            XorCpy(Dst, SrcBuf, (UINT8 *)B.Stream, Len);
            Leftover = CHACHA20_BLOCK_SIZE - Len;
            Dst += Len;
        }

        /* Dst has advanced by exactly LenMdl; authenticate the ciphertext just produced. */
        _Analysis_assume_((RtlFillMemory(Dst - LenMdl, LenMdl, 'A'), TRUE));
        Poly1305Update(&Poly1305State, Dst - LenMdl, LenMdl);
        Mdl = Mdl->Next;
        OffsetMdl = 0;
    }

    Poly1305Update(&Poly1305State, Pad0, (0x10 - SrcLen) & 0xf);
    B.Lens[0] = CpuToLe64(AdLen);
    B.Lens[1] = CpuToLe64(SrcLen);
    Poly1305Update(&Poly1305State, (UINT8 *)B.Lens, sizeof(B.Lens));
    Poly1305Final(&Poly1305State, Dst);
    Ret = TRUE;

out:
    /* Shared cleanup: the failure paths must also wipe the cipher state, the one-time
     * Poly1305 key and any keystream left in B.
     */
    RtlSecureZeroMemory(&Poly1305State, sizeof(Poly1305State));
    RtlSecureZeroMemory(&ChaCha20State, sizeof(ChaCha20State));
    RtlSecureZeroMemory(&B, sizeof(B));
    return Ret;
}
/* XChaCha20-Poly1305 encryption: derives a subkey with HChaCha20 from Key and the first 16 nonce
 * bytes, then calls ChaCha20Poly1305Encrypt() with that subkey and the 64-bit little-endian value
 * read from nonce bytes 16..23. The derived key is wiped before returning.
 */
_Use_decl_annotations_
VOID
XChaCha20Poly1305Encrypt(
UINT8 *Dst,
CONST UINT8 *Src,
CONST SIZE_T SrcLen,
CONST UINT8 *Ad,
CONST SIZE_T AdLen,
CONST UINT8 Nonce[XCHACHA20POLY1305_NONCE_SIZE],
CONST UINT8 Key[CHACHA20POLY1305_KEY_SIZE])
{
UINT32 DerivedKey[CHACHA20_KEY_WORDS];
HChaCha20(DerivedKey, Nonce, Key);
/* HChaCha20 yields CPU-order words; convert to little-endian so the array can be passed as key bytes. */
CpuToLe32Array(DerivedKey, ARRAYSIZE(DerivedKey));
ChaCha20Poly1305Encrypt(Dst, Src, SrcLen, Ad, AdLen, GetUnalignedLe64(Nonce + 16), (UINT8 *)DerivedKey);
RtlSecureZeroMemory(DerivedKey, CHACHA20POLY1305_KEY_SIZE);
}
/* XChaCha20-Poly1305 decryption: derives a subkey with HChaCha20 from Key and the first 16 nonce
 * bytes, then calls ChaCha20Poly1305Decrypt() with that subkey and the 64-bit little-endian value
 * read from nonce bytes 16..23. Returns that call's result; the derived key is wiped before returning.
 */
_Use_decl_annotations_
BOOLEAN
XChaCha20Poly1305Decrypt(
UINT8 *Dst,
CONST UINT8 *Src,
CONST SIZE_T SrcLen,
CONST UINT8 *Ad,
CONST SIZE_T AdLen,
CONST UINT8 Nonce[XCHACHA20POLY1305_NONCE_SIZE],
CONST UINT8 Key[CHACHA20POLY1305_KEY_SIZE])
{
BOOLEAN Ret;
UINT32 DerivedKey[CHACHA20_KEY_WORDS];
HChaCha20(DerivedKey, Nonce, Key);
/* HChaCha20 yields CPU-order words; convert to little-endian so the array can be passed as key bytes. */
CpuToLe32Array(DerivedKey, ARRAYSIZE(DerivedKey));
Ret = ChaCha20Poly1305Decrypt(Dst, Src, SrcLen, Ad, AdLen, GetUnalignedLe64(Nonce + 16), (UINT8 *)DerivedKey);
RtlSecureZeroMemory(DerivedKey, CHACHA20POLY1305_KEY_SIZE);
return Ret;
}
/* BLAKE2s initialization vector: seeds State->H and the second half of the compression work vector. */
static CONST UINT32 Blake2sIv[8] = { 0x6A09E667UL, 0xBB67AE85UL, 0x3C6EF372UL, 0xA54FF53AUL,
0x510E527FUL, 0x9B05688CUL, 0x1F83D9ABUL, 0x5BE0CD19UL };
/* BLAKE2s message schedule: row R gives the order in which the 16 message words are used in round R. */
static CONST UINT8 Blake2sSigma[10][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
};
/* Sets the first finalization flag word to all ones, marking the next compressed block as the last. */
static inline VOID
Blake2sSetLastblock(_Out_ BLAKE2S_STATE *State)
{
    State->F[0] = ~(UINT32)0;
}
/* Adds Inc to the two-word counter T, carrying from T[0] into T[1] when T[0] wraps. */
static inline VOID
Blake2sIncrementCounter(_Inout_ BLAKE2S_STATE *State, _In_ CONST UINT32 Inc)
{
    State->T[0] += Inc;
    /* After the addition, a low word smaller than Inc means it wrapped around. */
    if (State->T[0] < Inc)
        State->T[1] += 1;
}
/* Resets State to all zeroes and loads the chaining words H with Blake2sIv,
 * with the parameter word Param XORed into H[0].
 */
static inline VOID
Blake2sInitParam(_Out_ BLAKE2S_STATE *State, _In_ CONST UINT32 Param)
{
    LONG Word = 8;

    RtlZeroMemory(State, sizeof(*State));
    while (Word-- > 0)
        State->H[Word] = Blake2sIv[Word];
    State->H[0] ^= Param;
}
/* Starts an unkeyed hash that will produce OutLen bytes of output. */
_Use_decl_annotations_
VOID
Blake2sInit(BLAKE2S_STATE *State, CONST SIZE_T OutLen)
{
    /* The requested digest length sits in the low byte of the parameter word. */
    CONST SIZE_T ParamWord = 0x01010000 | OutLen;

    Blake2sInitParam(State, ParamWord);
    State->OutLen = OutLen;
}
/* Starts a keyed hash producing OutLen bytes: the parameter word carries both
 * lengths, and the key, zero-padded to one full block, is absorbed as the
 * first input block.
 */
_Use_decl_annotations_
VOID
Blake2sInitKey(BLAKE2S_STATE *State, CONST SIZE_T OutLen, CONST UINT8 *Key, CONST SIZE_T KeyLen)
{
    UINT8 KeyBlock[BLAKE2S_BLOCK_SIZE];

    Blake2sInitParam(State, 0x01010000 | (KeyLen << 8) | OutLen);
    State->OutLen = OutLen;

    RtlZeroMemory(KeyBlock, sizeof(KeyBlock));
    RtlCopyMemory(KeyBlock, Key, KeyLen);
    Blake2sUpdate(State, KeyBlock, sizeof(KeyBlock));

    /* The padded key copy is secret material; wipe it before leaving. */
    RtlSecureZeroMemory(KeyBlock, sizeof(KeyBlock));
}
/* Compresses Nblocks consecutive BLAKE2S_BLOCK_SIZE-byte blocks starting at
 * Block into the chaining words State->H. Before each block the counter
 * State->T is advanced by Inc. Callers in this file pass BLAKE2S_BLOCK_SIZE
 * for full blocks (Blake2sUpdate) and State->BufLen for the zero-padded last
 * block (Blake2sFinal).
 */
static inline VOID
Blake2sCompress(
_Inout_ BLAKE2S_STATE *State,
_In_reads_bytes_(BLAKE2S_BLOCK_SIZE *Nblocks) CONST UINT8 *Block,
_In_ SIZE_T Nblocks,
_In_ CONST UINT32 Inc)
{
UINT32 M[16];
UINT32 V[16];
LONG i;
while (Nblocks > 0)
{
Blake2sIncrementCounter(State, Inc);
/* Load the block as sixteen little-endian 32-bit message words. */
RtlCopyMemory(M, Block, BLAKE2S_BLOCK_SIZE);
Le32ToCpuArray(M, ARRAYSIZE(M));
/* Working vector: V[0..7] = chaining words, V[8..11] = IV,
 * V[12..13] = IV ^ counter, V[14..15] = IV ^ finalization flags.
 */
RtlCopyMemory(V, State->H, 32);
V[8] = Blake2sIv[0];
V[9] = Blake2sIv[1];
V[10] = Blake2sIv[2];
V[11] = Blake2sIv[3];
V[12] = Blake2sIv[4] ^ State->T[0];
V[13] = Blake2sIv[5] ^ State->T[1];
V[14] = Blake2sIv[6] ^ State->F[0];
V[15] = Blake2sIv[7] ^ State->F[1];
/* G mixes two message words, selected by Blake2sSigma for round R and
 * position i, into the four state words A, B, C, D.
 */
#define G(R, i, A, B, C, D) \
do \
{ \
A += B + M[Blake2sSigma[R][2 * i + 0]]; \
D = Ror32(D ^ A, 16); \
C += D; \
B = Ror32(B ^ C, 12); \
A += B + M[Blake2sSigma[R][2 * i + 1]]; \
D = Ror32(D ^ A, 8); \
C += D; \
B = Ror32(B ^ C, 7); \
} while (0)
/* One round: G over the four columns of V viewed as a 4x4 matrix, then G
 * over its four diagonals.
 */
#define ROUND(R) \
do \
{ \
G(R, 0, V[0], V[4], V[8], V[12]); \
G(R, 1, V[1], V[5], V[9], V[13]); \
G(R, 2, V[2], V[6], V[10], V[14]); \
G(R, 3, V[3], V[7], V[11], V[15]); \
G(R, 4, V[0], V[5], V[10], V[15]); \
G(R, 5, V[1], V[6], V[11], V[12]); \
G(R, 6, V[2], V[7], V[8], V[13]); \
G(R, 7, V[3], V[4], V[9], V[14]); \
} while (0)
/* Ten rounds, one per row of Blake2sSigma. */
ROUND(0);
ROUND(1);
ROUND(2);
ROUND(3);
ROUND(4);
ROUND(5);
ROUND(6);
ROUND(7);
ROUND(8);
ROUND(9);
#undef G
#undef ROUND
/* Fold both halves of the working vector back into the chaining words. */
for (i = 0; i < 8; ++i)
State->H[i] ^= V[i] ^ V[i + 8];
Block += BLAKE2S_BLOCK_SIZE;
--Nblocks;
}
}
/* Absorbs InLen bytes from In. Input is compressed only once more data is
 * known to follow it, so the most recent (possibly full) block always stays
 * in State->Buf for Blake2sFinal to process as the last block.
 */
_Use_decl_annotations_
VOID
Blake2sUpdate(BLAKE2S_STATE *State, CONST UINT8 *In, SIZE_T InLen)
{
    SIZE_T Room;

    if (InLen == 0)
        return;

    /* Free space left in the internal buffer. */
    Room = BLAKE2S_BLOCK_SIZE - State->BufLen;
    if (InLen > Room)
    {
        /* Top up the buffer, compress it, and continue with the rest of the input. */
        RtlCopyMemory(State->Buf + State->BufLen, In, Room);
        Blake2sCompress(State, State->Buf, 1, BLAKE2S_BLOCK_SIZE);
        State->BufLen = 0;
        In += Room;
        InLen -= Room;
    }
    if (InLen > BLAKE2S_BLOCK_SIZE)
    {
        /* Compress straight from the caller's memory, holding back one block
         * (full or partial) so that something is left to buffer below.
         */
        CONST SIZE_T DirectBlocks = DIV_ROUND_UP(InLen, BLAKE2S_BLOCK_SIZE) - 1;
        CONST SIZE_T DirectBytes = BLAKE2S_BLOCK_SIZE * DirectBlocks;

        Blake2sCompress(State, In, DirectBlocks, BLAKE2S_BLOCK_SIZE);
        In += DirectBytes;
        InLen -= DirectBytes;
    }
    RtlCopyMemory(State->Buf + State->BufLen, In, InLen);
    State->BufLen += InLen;
}
/* Finishes the hash: compresses the zero-padded buffer as the last block,
 * writes State->OutLen bytes of digest to Out, and wipes State. State must be
 * initialized again before it can be reused.
 */
_Use_decl_annotations_
VOID
Blake2sFinal(BLAKE2S_STATE *State, UINT8 *Out)
{
    CONST SIZE_T PadLen = BLAKE2S_BLOCK_SIZE - State->BufLen;

    Blake2sSetLastblock(State);
    /* Zero-fill the unused tail of the buffer. */
    RtlZeroMemory(State->Buf + State->BufLen, PadLen);
    /* Only the bytes actually buffered count towards the message length counter. */
    Blake2sCompress(State, State->Buf, 1, State->BufLen);

    /* The digest is the little-endian encoding of the chaining words. */
    CpuToLe32Array(State->H, ARRAYSIZE(State->H));
    RtlCopyMemory(Out, State->H, State->OutLen);

    RtlSecureZeroMemory(State, sizeof(*State));
}
/* One-shot hash of InLen bytes at In into OutLen bytes at Out. A non-zero
 * KeyLen selects the keyed variant using KeyLen bytes from Key.
 */
_Use_decl_annotations_
VOID
Blake2s(UINT8 *Out, CONST UINT8 *In, CONST UINT8 *Key, CONST SIZE_T OutLen, CONST SIZE_T InLen, CONST SIZE_T KeyLen)
{
    BLAKE2S_STATE Hasher;

    if (KeyLen == 0)
        Blake2sInit(&Hasher, OutLen);
    else
        Blake2sInitKey(&Hasher, OutLen, Key, KeyLen);

    Blake2sUpdate(&Hasher, In, InLen);
    Blake2sFinal(&Hasher, Out);
}
/* HMAC built on Blake2s with a BLAKE2S_HASH_SIZE-byte digest:
 *   Out = H((K ^ opad) || H((K ^ ipad) || In))
 * where K is Key zero-padded to one block, or its hash when Key is longer
 * than a block, ipad is 0x36 repeated and opad is 0x5c repeated.
 */
_Use_decl_annotations_
VOID
Blake2s256Hmac(UINT8 *Out, CONST UINT8 *In, CONST UINT8 *Key, CONST SIZE_T InLen, CONST SIZE_T KeyLen)
{
    BLAKE2S_STATE Hasher;
    __declspec(align(4)) UINT8 PadKey[BLAKE2S_BLOCK_SIZE] = { 0 };
    __declspec(align(4)) UINT8 Digest[BLAKE2S_HASH_SIZE];
    LONG Pos;

    /* Bring the key to block size: copy short keys, hash long ones. */
    if (KeyLen <= BLAKE2S_BLOCK_SIZE)
        RtlCopyMemory(PadKey, Key, KeyLen);
    else
    {
        Blake2sInit(&Hasher, BLAKE2S_HASH_SIZE);
        Blake2sUpdate(&Hasher, Key, KeyLen);
        Blake2sFinal(&Hasher, PadKey);
    }

    /* Inner hash over (K ^ ipad) || In. */
    for (Pos = 0; Pos < BLAKE2S_BLOCK_SIZE; ++Pos)
        PadKey[Pos] ^= 0x36;
    Blake2sInit(&Hasher, BLAKE2S_HASH_SIZE);
    Blake2sUpdate(&Hasher, PadKey, BLAKE2S_BLOCK_SIZE);
    Blake2sUpdate(&Hasher, In, InLen);
    Blake2sFinal(&Hasher, Digest);

    /* Turn K ^ ipad into K ^ opad in place, then hash (K ^ opad) || inner digest. */
    for (Pos = 0; Pos < BLAKE2S_BLOCK_SIZE; ++Pos)
        PadKey[Pos] ^= 0x36 ^ 0x5c;
    Blake2sInit(&Hasher, BLAKE2S_HASH_SIZE);
    Blake2sUpdate(&Hasher, PadKey, BLAKE2S_BLOCK_SIZE);
    Blake2sUpdate(&Hasher, Digest, BLAKE2S_HASH_SIZE);
    Blake2sFinal(&Hasher, Digest);

    RtlCopyMemory(Out, Digest, BLAKE2S_HASH_SIZE);

    /* Both temporaries are key-dependent; wipe them. */
    RtlSecureZeroMemory(PadKey, BLAKE2S_BLOCK_SIZE);
    RtlSecureZeroMemory(Digest, BLAKE2S_HASH_SIZE);
}
/* One SipHash round over the four 64-bit state words V0..V3, which must be
 * declared in the enclosing scope (see PREAMBLE).
 */
#define SIPROUND \
do \
{ \
V0 += V1; \
V1 = Rol64(V1, 13); \
V1 ^= V0; \
V0 = Rol64(V0, 32); \
V2 += V3; \
V3 = Rol64(V3, 16); \
V3 ^= V2; \
V0 += V3; \
V3 = Rol64(V3, 21); \
V3 ^= V0; \
V2 += V1; \
V1 = Rol64(V1, 17); \
V1 ^= V2; \
V2 = Rol64(V2, 32); \
} while (0)
/* Declares the state words V0..V3, initialized to the four constants XORed
 * with the two key words, and the final word B with Len in its top byte.
 * Requires a pointer named Key in scope. Because it ends in statements, any
 * further declarations have to come before it or rely on mixed declarations.
 */
#define PREAMBLE(Len) \
UINT64 V0 = 0x736f6d6570736575ULL; \
UINT64 V1 = 0x646f72616e646f6dULL; \
UINT64 V2 = 0x6c7967656e657261ULL; \
UINT64 V3 = 0x7465646279746573ULL; \
UINT64 B = ((UINT64)(Len)) << 56; \
V3 ^= Key->Key[1]; \
V2 ^= Key->Key[0]; \
V1 ^= Key->Key[1]; \
V0 ^= Key->Key[0];
/* Absorbs the final word B with two rounds, XORs 0xff into V2, runs four
 * finalization rounds and returns the XOR of all four state words. It contains
 * the return statement, so it must be the last thing in the function body.
 */
#define POSTAMBLE \
V3 ^= B; \
SIPROUND; \
SIPROUND; \
V0 ^= B; \
V2 ^= 0xff; \
SIPROUND; \
SIPROUND; \
SIPROUND; \
SIPROUND; \
return (V0 ^ V1) ^ (V2 ^ V3);
/* SipHash-2-4 of Len bytes at Data under Key: two SIPROUNDs per absorbed
 * 64-bit word, four in the finalization performed by POSTAMBLE.
 * NOTE(review): Data is read through UINT64_LE/UINT32_LE/UINT16_LE pointers;
 * whether unaligned input is acceptable depends on the Le*ToCpup helpers,
 * which are not visible here - confirm against callers.
 */
_Use_decl_annotations_
UINT64
Siphash(CONST VOID *Data, SIZE_T Len, CONST SIPHASH_KEY *Key)
{
/* End is where the whole 8-byte words stop; Left is the number of trailing bytes (0..7). */
CONST UINT8 *End = (CONST UINT8 *)Data + Len - (Len % sizeof(UINT64));
CONST UINT8 Left = Len & (sizeof(UINT64) - 1);
UINT64 M;
PREAMBLE(Len)
/* Absorb every complete little-endian 64-bit word. */
for (; Data != End; Data = (CONST UINT8 *)Data + sizeof(UINT64))
{
M = Le64ToCpup((CONST UINT64_LE *)Data);
V3 ^= M;
SIPROUND;
SIPROUND;
V0 ^= M;
}
/* Pack the trailing bytes into the low bytes of B (its top byte already holds
 * Len from PREAMBLE). After the loop Data == End, so the 32- and 16-bit loads
 * below read the first 4 or 2 trailing bytes in one go.
 */
switch (Left)
{
case 7:
B |= ((UINT64)End[6]) << 48;
/* fallthrough */;
case 6:
B |= ((UINT64)End[5]) << 40;
/* fallthrough */;
case 5:
B |= ((UINT64)End[4]) << 32;
/* fallthrough */;
case 4:
B |= Le32ToCpup((CONST UINT32_LE *)Data);
break;
case 3:
B |= ((UINT64)End[2]) << 16;
/* fallthrough */;
case 2:
B |= Le16ToCpup((CONST UINT16_LE *)Data);
break;
case 1:
B |= End[0];
}
POSTAMBLE
}
/* Siphash specialised for an input of exactly one 64-bit word (length 8):
 * First is absorbed as-is with two rounds, then POSTAMBLE finalizes and returns.
 */
_Use_decl_annotations_
UINT64
Siphash1u64(CONST UINT64 First, CONST SIPHASH_KEY *Key)
{
PREAMBLE(8)
V3 ^= First;
SIPROUND;
SIPROUND;
V0 ^= First;
POSTAMBLE
}
/* Siphash specialised for an input of exactly two 64-bit words (length 16). */
_Use_decl_annotations_
UINT64
Siphash2u64(CONST UINT64 First, CONST UINT64 Second, CONST SIPHASH_KEY *Key)
{
    CONST UINT64 Words[2] = { First, Second };
    SIZE_T Idx;
    PREAMBLE(16)
    /* Absorb each word in order, two rounds apiece. */
    for (Idx = 0; Idx < ARRAYSIZE(Words); ++Idx)
    {
        V3 ^= Words[Idx];
        SIPROUND;
        SIPROUND;
        V0 ^= Words[Idx];
    }
    POSTAMBLE
}
/* Siphash specialised for an input of exactly three 64-bit words (length 24). */
_Use_decl_annotations_
UINT64
Siphash3u64(CONST UINT64 First, CONST UINT64 Second, CONST UINT64 Third, CONST SIPHASH_KEY *Key)
{
    CONST UINT64 Words[3] = { First, Second, Third };
    SIZE_T Idx;
    PREAMBLE(24)
    /* Absorb each word in order, two rounds apiece. */
    for (Idx = 0; Idx < ARRAYSIZE(Words); ++Idx)
    {
        V3 ^= Words[Idx];
        SIPROUND;
        SIPROUND;
        V0 ^= Words[Idx];
    }
    POSTAMBLE
}
/* Siphash specialised for an input of exactly four 64-bit words (length 32). */
_Use_decl_annotations_
UINT64
Siphash4u64(CONST UINT64 First, CONST UINT64 Second, CONST UINT64 Third, CONST UINT64 Forth, CONST SIPHASH_KEY *Key)
{
    CONST UINT64 Words[4] = { First, Second, Third, Forth };
    SIZE_T Idx;
    PREAMBLE(32)
    /* Absorb each word in order, two rounds apiece. */
    for (Idx = 0; Idx < ARRAYSIZE(Words); ++Idx)
    {
        V3 ^= Words[Idx];
        SIPROUND;
        SIPROUND;
        V0 ^= Words[Idx];
    }
    POSTAMBLE
}
/* Siphash specialised for an input of exactly one 32-bit word (length 4).
 * Four bytes do not fill a 64-bit word, so First goes into the low half of the
 * final word B, next to the length byte, and is absorbed by POSTAMBLE.
 */
_Use_decl_annotations_
UINT64
Siphash1u32(CONST UINT32 First, CONST SIPHASH_KEY *Key)
{
PREAMBLE(4)
B |= First;
POSTAMBLE
}
/* Siphash specialised for an input of exactly three 32-bit words (length 12). */
_Use_decl_annotations_
UINT64
Siphash3u32(CONST UINT32 First, CONST UINT32 Second, CONST UINT32 Third, CONST SIPHASH_KEY *Key)
{
    /* First and Second together form one full 64-bit word, First in the low half. */
    CONST UINT64 FullWord = First | ((UINT64)Second << 32);
    PREAMBLE(12)
    V3 ^= FullWord;
    SIPROUND;
    SIPROUND;
    V0 ^= FullWord;
    /* Third is the 4-byte remainder and goes into the final word. */
    B |= Third;
    POSTAMBLE
}
#if BITS_PER_POINTER == 64
/* Note that on 64-bit, we make HalfSiphash1-3 actually be Siphash1-3, for
* performance reasons. On 32-bit, below, we actually implement HalfSiphash1-3.
*/
# define HSIPROUND SIPROUND
# define HPREAMBLE(Len) PREAMBLE(Len)
# define HPOSTAMBLE \
V3 ^= B; \
HSIPROUND; \
V0 ^= B; \
V2 ^= 0xff; \
HSIPROUND; \
HSIPROUND; \
HSIPROUND; \
return (UINT32)((V0 ^ V1) ^ (V2 ^ V3));
/* 64-bit build of Hsiphash: hashes Len bytes at Data under Key with one round
 * per absorbed 64-bit word and the 1+3 round finalization of HPOSTAMBLE,
 * returning the low 32 bits.
 * NOTE(review): Data is read through UINT64_LE/UINT32_LE/UINT16_LE pointers;
 * alignment requirements depend on the Le*ToCpup helpers, not visible here.
 */
_Use_decl_annotations_
UINT32
Hsiphash(CONST VOID *Data, SIZE_T Len, CONST HSIPHASH_KEY *Key)
{
/* End is where the whole 8-byte words stop; Left is the number of trailing bytes (0..7). */
CONST UINT8 *End = (CONST UINT8 *)Data + Len - (Len % sizeof(UINT64));
CONST UINT8 Left = Len & (sizeof(UINT64) - 1);
UINT64 M;
HPREAMBLE(Len)
for (; Data != End; Data = (CONST UINT8 *)Data + sizeof(UINT64))
{
M = Le64ToCpup((CONST UINT64_LE *)Data);
V3 ^= M;
HSIPROUND;
V0 ^= M;
}
/* Pack the trailing bytes into the low bytes of B. After the loop
 * Data == End, so the 32- and 16-bit loads read the first trailing bytes.
 */
switch (Left)
{
case 7:
B |= ((UINT64)End[6]) << 48;
/* fallthrough */;
case 6:
B |= ((UINT64)End[5]) << 40;
/* fallthrough */;
case 5:
B |= ((UINT64)End[4]) << 32;
/* fallthrough */;
case 4:
B |= Le32ToCpup((CONST UINT32_LE *)Data);
break;
case 3:
B |= ((UINT64)End[2]) << 16;
/* fallthrough */;
case 2:
B |= Le16ToCpup((CONST UINT16_LE *)Data);
break;
case 1:
B |= End[0];
}
HPOSTAMBLE
}
/* 64-bit build: Hsiphash of exactly one 32-bit word (length 4). The word does
 * not fill a 64-bit lane, so it goes into the low half of the final word B.
 */
_Use_decl_annotations_
UINT32
Hsiphash1u32(CONST UINT32 First, CONST HSIPHASH_KEY *Key)
{
HPREAMBLE(4)
B |= First;
HPOSTAMBLE
}
/* 64-bit build: Hsiphash of exactly two 32-bit words (length 8), absorbed as
 * a single 64-bit word with First in the low half.
 */
_Use_decl_annotations_
UINT32
Hsiphash2u32(CONST UINT32 First, CONST UINT32 Second, CONST HSIPHASH_KEY *Key)
{
    CONST UINT64 FullWord = First | ((UINT64)Second << 32);
    HPREAMBLE(8)
    V3 ^= FullWord;
    HSIPROUND;
    V0 ^= FullWord;
    HPOSTAMBLE
}
/* 64-bit build: Hsiphash of exactly three 32-bit words (length 12). */
_Use_decl_annotations_
UINT32
Hsiphash3u32(CONST UINT32 First, CONST UINT32 Second, CONST UINT32 Third, CONST HSIPHASH_KEY *Key)
{
    /* First and Second form one full 64-bit word, First in the low half. */
    CONST UINT64 FullWord = First | ((UINT64)Second << 32);
    HPREAMBLE(12)
    V3 ^= FullWord;
    HSIPROUND;
    V0 ^= FullWord;
    /* Third is the 4-byte remainder and goes into the final word. */
    B |= Third;
    HPOSTAMBLE
}
/* 64-bit build: Hsiphash of exactly four 32-bit words (length 16), absorbed
 * as two 64-bit words.
 */
_Use_decl_annotations_
UINT32
Hsiphash4u32(CONST UINT32 First, CONST UINT32 Second, CONST UINT32 Third, CONST UINT32 Forth, CONST HSIPHASH_KEY *Key)
{
    /* Each pair is packed with the earlier argument in the low half. */
    CONST UINT64 LowPair = First | ((UINT64)Second << 32);
    CONST UINT64 HighPair = Third | ((UINT64)Forth << 32);
    HPREAMBLE(16)
    V3 ^= LowPair;
    HSIPROUND;
    V0 ^= LowPair;
    V3 ^= HighPair;
    HSIPROUND;
    V0 ^= HighPair;
    HPOSTAMBLE
}
#else
/* One HalfSipHash round over the four 32-bit state words V0..V3, which must
 * be declared in the enclosing scope (see HPREAMBLE).
 */
# define HSIPROUND \
do \
{ \
V0 += V1; \
V1 = Rol32(V1, 5); \
V1 ^= V0; \
V0 = Rol32(V0, 16); \
V2 += V3; \
V3 = Rol32(V3, 8); \
V3 ^= V2; \
V0 += V3; \
V3 = Rol32(V3, 7); \
V3 ^= V0; \
V2 += V1; \
V1 = Rol32(V1, 13); \
V1 ^= V2; \
V2 = Rol32(V2, 16); \
} while (0)
/* Declares the 32-bit state words V0..V3, keyed from the two key words, and
 * the final word B with Len in its top byte. Requires a pointer named Key in
 * scope.
 */
# define HPREAMBLE(Len) \
UINT32 V0 = 0; \
UINT32 V1 = 0; \
UINT32 V2 = 0x6c796765U; \
UINT32 V3 = 0x74656462U; \
UINT32 B = ((UINT32)(Len)) << 24; \
V3 ^= Key->Key[1]; \
V2 ^= Key->Key[0]; \
V1 ^= Key->Key[1]; \
V0 ^= Key->Key[0];
/* Absorbs B with one round, XORs 0xff into V2, runs three finalization rounds
 * and returns V1 ^ V3. Contains the return statement, so it must end the
 * function body.
 */
# define HPOSTAMBLE \
V3 ^= B; \
HSIPROUND; \
V0 ^= B; \
V2 ^= 0xff; \
HSIPROUND; \
HSIPROUND; \
HSIPROUND; \
return V1 ^ V3;
/* 32-bit build of Hsiphash: hashes Len bytes at Data under Key, one round per
 * absorbed 32-bit word, finalized by HPOSTAMBLE.
 * NOTE(review): Data is read through UINT32_LE/UINT16_LE pointers; alignment
 * requirements depend on the Le*ToCpup helpers, not visible here.
 */
_Use_decl_annotations_
UINT32
Hsiphash(CONST VOID *Data, SIZE_T Len, CONST HSIPHASH_KEY *Key)
{
/* End is where the whole 4-byte words stop; Left is the number of trailing bytes (0..3). */
CONST UINT8 *End = (CONST UINT8 *)Data + Len - (Len % sizeof(UINT32));
CONST UINT8 Left = Len & (sizeof(UINT32) - 1);
UINT32 M;
HPREAMBLE(Len)
for (; Data != End; Data = (CONST UINT8 *)Data + sizeof(UINT32))
{
M = Le32ToCpup((CONST UINT32_LE *)Data);
V3 ^= M;
HSIPROUND;
V0 ^= M;
}
/* Pack the trailing bytes into the low bytes of B. After the loop
 * Data == End, so the 16-bit load reads the first two trailing bytes.
 */
switch (Left)
{
case 3:
B |= ((UINT32)End[2]) << 16;
/* fallthrough */;
case 2:
B |= Le16ToCpup((CONST UINT16_LE *)Data);
break;
case 1:
B |= End[0];
}
HPOSTAMBLE
}
/* 32-bit build: Hsiphash of exactly one 32-bit word (length 4), absorbed with
 * one round before HPOSTAMBLE finalizes and returns.
 */
_Use_decl_annotations_
UINT32
Hsiphash1u32(CONST UINT32 First, CONST HSIPHASH_KEY *Key)
{
HPREAMBLE(4)
V3 ^= First;
HSIPROUND;
V0 ^= First;
HPOSTAMBLE
}
/* 32-bit build: Hsiphash of exactly two 32-bit words (length 8). */
_Use_decl_annotations_
UINT32
Hsiphash2u32(CONST UINT32 First, CONST UINT32 Second, CONST HSIPHASH_KEY *Key)
{
    CONST UINT32 Words[2] = { First, Second };
    SIZE_T Idx;
    HPREAMBLE(8)
    /* Absorb each word in order, one round apiece. */
    for (Idx = 0; Idx < ARRAYSIZE(Words); ++Idx)
    {
        V3 ^= Words[Idx];
        HSIPROUND;
        V0 ^= Words[Idx];
    }
    HPOSTAMBLE
}
/* 32-bit build: Hsiphash of exactly three 32-bit words (length 12). */
_Use_decl_annotations_
UINT32
Hsiphash3u32(CONST UINT32 First, CONST UINT32 Second, CONST UINT32 Third, CONST HSIPHASH_KEY *Key)
{
    CONST UINT32 Words[3] = { First, Second, Third };
    SIZE_T Idx;
    HPREAMBLE(12)
    /* Absorb each word in order, one round apiece. */
    for (Idx = 0; Idx < ARRAYSIZE(Words); ++Idx)
    {
        V3 ^= Words[Idx];
        HSIPROUND;
        V0 ^= Words[Idx];
    }
    HPOSTAMBLE
}
/* 32-bit build: Hsiphash of exactly four 32-bit words (length 16). */
_Use_decl_annotations_
UINT32
Hsiphash4u32(CONST UINT32 First, CONST UINT32 Second, CONST UINT32 Third, CONST UINT32 Forth, CONST HSIPHASH_KEY *Key)
{
    CONST UINT32 Words[4] = { First, Second, Third, Forth };
    SIZE_T Idx;
    HPREAMBLE(16)
    /* Absorb each word in order, one round apiece. */
    for (Idx = 0; Idx < ARRAYSIZE(Words); ++Idx)
    {
        V3 ^= Words[Idx];
        HSIPROUND;
        V0 ^= Words[Idx];
    }
    HPOSTAMBLE
}
#endif
/* Below here is fiat's implementation of x25519.
*
* Copyright (C) 2015-2016 The fiat-crypto Authors.
* Copyright (C) 2018-2021 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
*
* This is a machine-generated formally verified implementation of Curve25519
* ECDH from: <https://github.com/mit-plv/fiat-crypto>. Though originally
* machine generated, it has been tweaked to be suitable for use in the kernel.
* It is optimized for 32-bit machines and machines that cannot work efficiently
* with 128-bit integer types.
*/
/* Fe means field element. Here the field is \Z/(2^255-19). An element t,
 * entries t[0]...t[9], represents the integer t[0]+2^26 t[1]+2^51 t[2]+2^77
 * t[3]+2^102 t[4]+...+2^230 t[9].
 * The limb positions advance by 26 and 25 bits alternately, which is where the
 * exponents 26, 51, 77, 102, ... come from.
 * Fe limbs are bounded by 1.125*2^26,1.125*2^25,1.125*2^26,1.125*2^25,etc.
 * Multiplication and carrying produce Fe from FeLoose.
 */
typedef struct Fe
{
UINT32 V[10];
} Fe;
/* FeLoose limbs are bounded by 3.375*2^26,3.375*2^25,3.375*2^26,3.375*2^25,etc
 * Addition and subtraction produce FeLoose from (Fe, Fe).
 * The layout is identical to Fe; the separate type only records the weaker
 * limb bounds.
 */
typedef struct FeLoose
{
UINT32 V[10];
} FeLoose;
/* Unpacks the 32-byte little-endian string S into ten limbs H[0..9] of
 * alternating 26 and 25 bits. Bit 255 of the input is dropped. The trailing
 * comment on each line tallies how many bits that limb receives.
 */
static inline VOID
FeFrombytesImpl(_Out_writes_all_(10) UINT32 H[10], _In_reads_bytes_(32) CONST UINT8 *S)
{
/* Ignores top bit of s. */
UINT32 A0 = GetUnalignedLe32(S);
UINT32 A1 = GetUnalignedLe32(S + 4);
UINT32 A2 = GetUnalignedLe32(S + 8);
UINT32 A3 = GetUnalignedLe32(S + 12);
UINT32 A4 = GetUnalignedLe32(S + 16);
UINT32 A5 = GetUnalignedLe32(S + 20);
UINT32 A6 = GetUnalignedLe32(S + 24);
UINT32 A7 = GetUnalignedLe32(S + 28);
H[0] = A0 & ((1 << 26) - 1); /* 26 used, 32-26 left. 26 */
H[1] = (A0 >> 26) | ((A1 & ((1 << 19) - 1)) << 6); /* (32-26) + 19 = 6+19 = 25 */
H[2] = (A1 >> 19) | ((A2 & ((1 << 13) - 1)) << 13); /* (32-19) + 13 = 13+13 = 26 */
H[3] = (A2 >> 13) | ((A3 & ((1 << 6) - 1)) << 19); /* (32-13) + 6 = 19+ 6 = 25 */
H[4] = (A3 >> 6); /* (32- 6) = 26 */
H[5] = A4 & ((1 << 25) - 1); /* 25 */
H[6] = (A4 >> 25) | ((A5 & ((1 << 19) - 1)) << 7); /* (32-25) + 19 = 7+19 = 26 */
H[7] = (A5 >> 19) | ((A6 & ((1 << 12) - 1)) << 13); /* (32-19) + 12 = 13+12 = 25 */
H[8] = (A6 >> 12) | ((A7 & ((1 << 6) - 1)) << 20); /* (32-12) + 6 = 20+ 6 = 26 */
H[9] = (A7 >> 6) & ((1 << 25) - 1); /* 25 */
}
/* Typed wrapper: decodes the 32 bytes at S into the field element H. */
static inline VOID
FeFrombytes(_Out_ Fe *H, _In_reads_bytes_(32) CONST UINT8 *S)
{
FeFrombytesImpl(H->V, S);
}
/* Adds A + B + C, stores the low 25 bits of the sum in *Low and returns bit 25
 * as the carry. Only 26 bits of the sum are ever looked at, so 32-bit
 * arithmetic is wide enough.
 */
static inline UINT8 /*bool*/
AddcarryxU25(_In_ CONST UINT8 /*bool*/ C, _In_ CONST UINT32 A, _In_ CONST UINT32 B, _Out_ UINT32 *Low)
{
    CONST UINT32 Sum = A + B + C;

    *Low = Sum & 0x1ffffffU;
    return (UINT8)((Sum >> 25) & 1);
}
/* Adds A + B + C, stores the low 26 bits of the sum in *Low and returns bit 26
 * as the carry. Only 27 bits of the sum are ever looked at, so 32-bit
 * arithmetic is wide enough.
 */
static inline UINT8 /*bool*/
AddcarryxU26(_In_ CONST UINT8 /*bool*/ C, _In_ CONST UINT32 A, _In_ CONST UINT32 B, _Out_ UINT32 *Low)
{
    CONST UINT32 Sum = A + B + C;

    *Low = Sum & 0x3ffffffU;
    return (UINT8)((Sum >> 26) & 1);
}
/* Computes A - B - C in 32-bit wrapping arithmetic, stores the low 25 bits of
 * the difference in *Low and returns the top bit (bit 31) as the borrow.
 */
static inline UINT8 /*bool*/
SubborrowU25(_In_ CONST UINT8 /*bool*/ C, _In_ CONST UINT32 A, _In_ CONST UINT32 B, _Out_ UINT32 *Low)
{
    CONST UINT32 Diff = A - B - C;

    *Low = Diff & 0x1ffffffU;
    return (UINT8)(Diff >> 31);
}
/* Computes A - B - C in 32-bit wrapping arithmetic, stores the low 26 bits of
 * the difference in *Low and returns the top bit (bit 31) as the borrow.
 */
static inline UINT8 /*bool*/
SubborrowU26(_In_ CONST UINT8 /*bool*/ C, _In_ CONST UINT32 A, _In_ CONST UINT32 B, _Out_ UINT32 *Low)
{
    CONST UINT32 Diff = A - B - C;

    *Low = Diff & 0x3ffffffU;
    return (UINT8)(Diff >> 31);
}
/* Select by mask: returns Nz when T is non-zero and Z when T is zero, using
 * bitwise operations on a full-width mask instead of an if/else.
 */
static inline UINT32
Cmovznz32(_In_ UINT32 T, _In_ CONST UINT32 Z, _In_ CONST UINT32 Nz)
{
    /* 0 - 1 wraps to all ones; 0 - 0 stays zero. */
    CONST UINT32 Mask = (UINT32)0 - (UINT32)(T != 0);

    return (Nz & Mask) | (Z & ~Mask);
}
/* Conditional subtraction of the modulus, written without branching on the
 * limb values:
 *   1. Subtract the limb constants 0x3ffffed, 0x1ffffff, 0x3ffffff, ... (the
 *      26/25-bit limbs of 2^255-19) from In1 with a borrow chain.
 *   2. Turn the final borrow X48 into an all-ones or all-zero mask X49.
 *   3. Add the same constants, ANDed with the mask, back with a carry chain,
 *      so that the subtraction is undone exactly when it borrowed.
 * The carry out of the last addition is discarded.
 */
static inline VOID
FeFreeze(_Out_writes_all_(10) UINT32 Out[10], _In_reads_(10) CONST UINT32 In1[10])
{
CONST UINT32 X17 = In1[9];
CONST UINT32 X18 = In1[8];
CONST UINT32 X16 = In1[7];
CONST UINT32 X14 = In1[6];
CONST UINT32 X12 = In1[5];
CONST UINT32 X10 = In1[4];
CONST UINT32 X8 = In1[3];
CONST UINT32 X6 = In1[2];
CONST UINT32 X4 = In1[1];
CONST UINT32 X2 = In1[0];
/* Step 1: borrow chain, limb 0 first. */
UINT32 X20;
UINT8 /*bool*/ X21 = SubborrowU26(0x0, X2, 0x3ffffed, &X20);
UINT32 X23;
UINT8 /*bool*/ X24 = SubborrowU25(X21, X4, 0x1ffffff, &X23);
UINT32 X26;
UINT8 /*bool*/ X27 = SubborrowU26(X24, X6, 0x3ffffff, &X26);
UINT32 X29;
UINT8 /*bool*/ X30 = SubborrowU25(X27, X8, 0x1ffffff, &X29);
UINT32 X32;
UINT8 /*bool*/ X33 = SubborrowU26(X30, X10, 0x3ffffff, &X32);
UINT32 X35;
UINT8 /*bool*/ X36 = SubborrowU25(X33, X12, 0x1ffffff, &X35);
UINT32 X38;
UINT8 /*bool*/ X39 = SubborrowU26(X36, X14, 0x3ffffff, &X38);
UINT32 X41;
UINT8 /*bool*/ X42 = SubborrowU25(X39, X16, 0x1ffffff, &X41);
UINT32 X44;
UINT8 /*bool*/ X45 = SubborrowU26(X42, X18, 0x3ffffff, &X44);
UINT32 X47;
UINT8 /*bool*/ X48 = SubborrowU25(X45, X17, 0x1ffffff, &X47);
/* Step 2: X49 is 0xffffffff if the subtraction borrowed, 0 otherwise. */
UINT32 X49 = Cmovznz32(X48, 0x0, 0xffffffff);
/* Step 3: masked add-back with carry chain. */
UINT32 X50 = (X49 & 0x3ffffed);
UINT32 X52;
UINT8 /*bool*/ X53 = AddcarryxU26(0x0, X20, X50, &X52);
UINT32 X54 = (X49 & 0x1ffffff);
UINT32 X56;
UINT8 /*bool*/ X57 = AddcarryxU25(X53, X23, X54, &X56);
UINT32 X58 = (X49 & 0x3ffffff);
UINT32 X60;
UINT8 /*bool*/ X61 = AddcarryxU26(X57, X26, X58, &X60);
UINT32 X62 = (X49 & 0x1ffffff);
UINT32 X64;
UINT8 /*bool*/ X65 = AddcarryxU25(X61, X29, X62, &X64);
UINT32 X66 = (X49 & 0x3ffffff);
UINT32 X68;
UINT8 /*bool*/ X69 = AddcarryxU26(X65, X32, X66, &X68);
UINT32 X70 = (X49 & 0x1ffffff);
UINT32 X72;
UINT8 /*bool*/ X73 = AddcarryxU25(X69, X35, X70, &X72);
UINT32 X74 = (X49 & 0x3ffffff);
UINT32 X76;
UINT8 /*bool*/ X77 = AddcarryxU26(X73, X38, X74, &X76);
UINT32 X78 = (X49 & 0x1ffffff);
UINT32 X80;
UINT8 /*bool*/ X81 = AddcarryxU25(X77, X41, X78, &X80);
UINT32 X82 = (X49 & 0x3ffffff);
UINT32 X84;
UINT8 /*bool*/ X85 = AddcarryxU26(X81, X44, X82, &X84);
UINT32 X86 = (X49 & 0x1ffffff);
UINT32 X88;
AddcarryxU25(X85, X47, X86, &X88);
Out[0] = X52;
Out[1] = X56;
Out[2] = X60;
Out[3] = X64;
Out[4] = X68;
Out[5] = X72;
Out[6] = X76;
Out[7] = X80;
Out[8] = X84;
Out[9] = X88;
}
/* Serializes F into 32 bytes: the limbs are first passed through FeFreeze,
 * then packed back to back, least significant byte first. Each assignment
 * keeps only the low 8 bits of its right-hand side; the lines with an OR
 * straddle two adjacent limbs.
 */
static inline VOID
FeTobytes(_Out_writes_bytes_all_(32) UINT8 S[32], _In_ CONST Fe *F)
{
UINT32 H[10];
FeFreeze(H, F->V);
S[0] = H[0] >> 0;
S[1] = H[0] >> 8;
S[2] = H[0] >> 16;
S[3] = (H[0] >> 24) | (H[1] << 2);
S[4] = H[1] >> 6;
S[5] = H[1] >> 14;
S[6] = (H[1] >> 22) | (H[2] << 3);
S[7] = H[2] >> 5;
S[8] = H[2] >> 13;
S[9] = (H[2] >> 21) | (H[3] << 5);
S[10] = H[3] >> 3;
S[11] = H[3] >> 11;
S[12] = (H[3] >> 19) | (H[4] << 6);
S[13] = H[4] >> 2;
S[14] = H[4] >> 10;
S[15] = H[4] >> 18;
S[16] = H[5] >> 0;
S[17] = H[5] >> 8;
S[18] = H[5] >> 16;
S[19] = (H[5] >> 24) | (H[6] << 1);
S[20] = H[6] >> 7;
S[21] = H[6] >> 15;
S[22] = (H[6] >> 23) | (H[7] << 3);
S[23] = H[7] >> 5;
S[24] = H[7] >> 13;
S[25] = (H[7] >> 21) | (H[8] << 4);
S[26] = H[8] >> 4;
S[27] = H[8] >> 12;
S[28] = (H[8] >> 20) | (H[9] << 6);
S[29] = H[9] >> 2;
S[30] = H[9] >> 10;
S[31] = H[9] >> 18;
}
/* h = f
 * Copies all ten limbs; uses a move so that H and F may be the same object.
 */
static inline VOID
FeCopy(_Out_ Fe *H, _In_ CONST Fe *F)
{
    RtlMoveMemory(H->V, F->V, sizeof(H->V));
}
/* h = f, where the destination is the loosely bounded type. The limb layout
 * of Fe and FeLoose is the same, so this is a plain copy of ten limbs.
 */
static inline VOID
FeCopyLt(_Out_ FeLoose *H, _In_ CONST Fe *F)
{
    RtlMoveMemory(H->V, F->V, sizeof(H->V));
}
/* h = 0
 * Clears all ten limbs.
 */
static inline VOID
Fe0(_Out_ Fe *H)
{
    RtlZeroMemory(H->V, sizeof(H->V));
}
/* h = 1
 * Clears all ten limbs, then sets the least significant one to 1.
 */
static inline VOID
Fe1(_Out_ Fe *H)
{
    RtlZeroMemory(H->V, sizeof(H->V));
    H->V[0] = 1;
}
/* Limb-wise addition Out[i] = In1[i] + In2[i], with no carry propagation.
 * Every input limb is loaded before the first store, which keeps the result
 * well-defined when Out is the same array as In1 or In2.
 */
static VOID
FeAddImpl(_Out_writes_all_(10) UINT32 Out[10], _In_reads_(10) CONST UINT32 In1[10], _In_reads_(10) CONST UINT32 In2[10])
{
    CONST UINT32 F0 = In1[0];
    CONST UINT32 F1 = In1[1];
    CONST UINT32 F2 = In1[2];
    CONST UINT32 F3 = In1[3];
    CONST UINT32 F4 = In1[4];
    CONST UINT32 F5 = In1[5];
    CONST UINT32 F6 = In1[6];
    CONST UINT32 F7 = In1[7];
    CONST UINT32 F8 = In1[8];
    CONST UINT32 F9 = In1[9];
    CONST UINT32 G0 = In2[0];
    CONST UINT32 G1 = In2[1];
    CONST UINT32 G2 = In2[2];
    CONST UINT32 G3 = In2[3];
    CONST UINT32 G4 = In2[4];
    CONST UINT32 G5 = In2[5];
    CONST UINT32 G6 = In2[6];
    CONST UINT32 G7 = In2[7];
    CONST UINT32 G8 = In2[8];
    CONST UINT32 G9 = In2[9];

    Out[0] = F0 + G0;
    Out[1] = F1 + G1;
    Out[2] = F2 + G2;
    Out[3] = F3 + G3;
    Out[4] = F4 + G4;
    Out[5] = F5 + G5;
    Out[6] = F6 + G6;
    Out[7] = F7 + G7;
    Out[8] = F8 + G8;
    Out[9] = F9 + G9;
}
/* h = f + g
 * Can overlap h with f or g.
 * The sum is not carried, so the result has the looser FeLoose limb bounds.
 */
static inline VOID
FeAdd(_Out_ FeLoose *H, _In_ CONST Fe *F, _In_ CONST Fe *G)
{
FeAddImpl(H->V, F->V, G->V);
}
/* Limb-wise subtraction Out[i] = (Bias[i] + In1[i]) - In2[i], with no carry
 * propagation. The per-limb bias is exactly twice the limb constants that
 * FeFreeze subtracts (0x3ffffed, 0x1ffffff, 0x3ffffff, ...), so it leaves the
 * value unchanged modulo 2^255-19; presumably it is there to keep limbs from
 * wrapping below zero for inputs within the Fe bounds.
 * Every input limb is loaded before the first store, which keeps the result
 * well-defined when Out is the same array as In1 or In2.
 */
static VOID
FeSubImpl(_Out_writes_all_(10) UINT32 Out[10], _In_reads_(10) CONST UINT32 In1[10], _In_reads_(10) CONST UINT32 In2[10])
{
    CONST UINT32 F0 = In1[0];
    CONST UINT32 F1 = In1[1];
    CONST UINT32 F2 = In1[2];
    CONST UINT32 F3 = In1[3];
    CONST UINT32 F4 = In1[4];
    CONST UINT32 F5 = In1[5];
    CONST UINT32 F6 = In1[6];
    CONST UINT32 F7 = In1[7];
    CONST UINT32 F8 = In1[8];
    CONST UINT32 F9 = In1[9];
    CONST UINT32 G0 = In2[0];
    CONST UINT32 G1 = In2[1];
    CONST UINT32 G2 = In2[2];
    CONST UINT32 G3 = In2[3];
    CONST UINT32 G4 = In2[4];
    CONST UINT32 G5 = In2[5];
    CONST UINT32 G6 = In2[6];
    CONST UINT32 G7 = In2[7];
    CONST UINT32 G8 = In2[8];
    CONST UINT32 G9 = In2[9];

    Out[0] = (0x7ffffda + F0) - G0;
    Out[1] = (0x3fffffe + F1) - G1;
    Out[2] = (0x7fffffe + F2) - G2;
    Out[3] = (0x3fffffe + F3) - G3;
    Out[4] = (0x7fffffe + F4) - G4;
    Out[5] = (0x3fffffe + F5) - G5;
    Out[6] = (0x7fffffe + F6) - G6;
    Out[7] = (0x3fffffe + F7) - G7;
    Out[8] = (0x7fffffe + F8) - G8;
    Out[9] = (0x3fffffe + F9) - G9;
}
/* h = f - g
 * Can overlap h with f or g.
 * The difference is not carried, so the result has the looser FeLoose limb
 * bounds.
 */
static inline VOID
FeSub(_Out_ FeLoose *H, _In_ CONST Fe *F, _In_ CONST Fe *G)
{
FeSubImpl(H->V, F->V, G->V);
}
/* Out = In1 * In2 in the ten-limb representation, in three phases:
 *   1. X40..X58: the nineteen columns of the schoolbook product, accumulated
 *      in 64 bits. Products of two odd-indexed limbs are doubled (the 0x2
 *      factors), which accounts for the alternating 26/25-bit limb widths.
 *   2. X59..X85: the high columns X50..X58 are folded onto X40..X48 with a
 *      factor of 19, computed as (x << 4) + (x << 1) + x.
 *   3. X86..X120: a carry chain with alternating 26/25-bit shifts; the carry
 *      out of limb 9 is multiplied by 19 (0x13) and added to limb 0, followed
 *      by a short second carry through limbs 0 and 1. The last carry is added
 *      into Out[2] without further propagation.
 * All inputs are read before any output is written.
 */
static VOID
FeMulImpl(_Out_writes_all_(10) UINT32 Out[10], _In_reads_(10) CONST UINT32 In1[10], _In_reads_(10) CONST UINT32 In2[10])
{
CONST UINT32 X20 = In1[9];
CONST UINT32 X21 = In1[8];
CONST UINT32 X19 = In1[7];
CONST UINT32 X17 = In1[6];
CONST UINT32 X15 = In1[5];
CONST UINT32 X13 = In1[4];
CONST UINT32 X11 = In1[3];
CONST UINT32 X9 = In1[2];
CONST UINT32 X7 = In1[1];
CONST UINT32 X5 = In1[0];
CONST UINT32 X38 = In2[9];
CONST UINT32 X39 = In2[8];
CONST UINT32 X37 = In2[7];
CONST UINT32 X35 = In2[6];
CONST UINT32 X33 = In2[5];
CONST UINT32 X31 = In2[4];
CONST UINT32 X29 = In2[3];
CONST UINT32 X27 = In2[2];
CONST UINT32 X25 = In2[1];
CONST UINT32 X23 = In2[0];
/* Phase 1: product columns, lowest first. */
UINT64 X40 = ((UINT64)X23 * X5);
UINT64 X41 = (((UINT64)X23 * X7) + ((UINT64)X25 * X5));
UINT64 X42 = ((((UINT64)(0x2 * X25) * X7) + ((UINT64)X23 * X9)) + ((UINT64)X27 * X5));
UINT64 X43 = (((((UINT64)X25 * X9) + ((UINT64)X27 * X7)) + ((UINT64)X23 * X11)) + ((UINT64)X29 * X5));
UINT64 X44 =
(((((UINT64)X27 * X9) + (0x2 * (((UINT64)X25 * X11) + ((UINT64)X29 * X7)))) + ((UINT64)X23 * X13)) +
((UINT64)X31 * X5));
UINT64 X45 =
(((((((UINT64)X27 * X11) + ((UINT64)X29 * X9)) + ((UINT64)X25 * X13)) + ((UINT64)X31 * X7)) +
((UINT64)X23 * X15)) +
((UINT64)X33 * X5));
UINT64 X46 =
(((((0x2 * ((((UINT64)X29 * X11) + ((UINT64)X25 * X15)) + ((UINT64)X33 * X7))) + ((UINT64)X27 * X13)) +
((UINT64)X31 * X9)) +
((UINT64)X23 * X17)) +
((UINT64)X35 * X5));
UINT64 X47 =
(((((((((UINT64)X29 * X13) + ((UINT64)X31 * X11)) + ((UINT64)X27 * X15)) + ((UINT64)X33 * X9)) +
((UINT64)X25 * X17)) +
((UINT64)X35 * X7)) +
((UINT64)X23 * X19)) +
((UINT64)X37 * X5));
UINT64 X48 =
(((((((UINT64)X31 * X13) +
(0x2 * (((((UINT64)X29 * X15) + ((UINT64)X33 * X11)) + ((UINT64)X25 * X19)) + ((UINT64)X37 * X7)))) +
((UINT64)X27 * X17)) +
((UINT64)X35 * X9)) +
((UINT64)X23 * X21)) +
((UINT64)X39 * X5));
UINT64 X49 =
(((((((((((UINT64)X31 * X15) + ((UINT64)X33 * X13)) + ((UINT64)X29 * X17)) + ((UINT64)X35 * X11)) +
((UINT64)X27 * X19)) +
((UINT64)X37 * X9)) +
((UINT64)X25 * X21)) +
((UINT64)X39 * X7)) +
((UINT64)X23 * X20)) +
((UINT64)X38 * X5));
UINT64 X50 =
(((((0x2 * ((((((UINT64)X33 * X15) + ((UINT64)X29 * X19)) + ((UINT64)X37 * X11)) + ((UINT64)X25 * X20)) +
((UINT64)X38 * X7))) +
((UINT64)X31 * X17)) +
((UINT64)X35 * X13)) +
((UINT64)X27 * X21)) +
((UINT64)X39 * X9));
UINT64 X51 =
(((((((((UINT64)X33 * X17) + ((UINT64)X35 * X15)) + ((UINT64)X31 * X19)) + ((UINT64)X37 * X13)) +
((UINT64)X29 * X21)) +
((UINT64)X39 * X11)) +
((UINT64)X27 * X20)) +
((UINT64)X38 * X9));
UINT64 X52 =
(((((UINT64)X35 * X17) +
(0x2 * (((((UINT64)X33 * X19) + ((UINT64)X37 * X15)) + ((UINT64)X29 * X20)) + ((UINT64)X38 * X11)))) +
((UINT64)X31 * X21)) +
((UINT64)X39 * X13));
UINT64 X53 =
(((((((UINT64)X35 * X19) + ((UINT64)X37 * X17)) + ((UINT64)X33 * X21)) + ((UINT64)X39 * X15)) +
((UINT64)X31 * X20)) +
((UINT64)X38 * X13));
UINT64 X54 =
(((0x2 * ((((UINT64)X37 * X19) + ((UINT64)X33 * X20)) + ((UINT64)X38 * X15))) + ((UINT64)X35 * X21)) +
((UINT64)X39 * X17));
UINT64 X55 = (((((UINT64)X37 * X21) + ((UINT64)X39 * X19)) + ((UINT64)X35 * X20)) + ((UINT64)X38 * X17));
UINT64 X56 = (((UINT64)X39 * X21) + (0x2 * (((UINT64)X37 * X20) + ((UINT64)X38 * X19))));
UINT64 X57 = (((UINT64)X39 * X20) + ((UINT64)X38 * X21));
UINT64 X58 = ((UINT64)(0x2 * X38) * X20);
/* Phase 2: fold columns 10..18 onto columns 0..8 with a factor of 19. */
UINT64 X59 = (X48 + (X58 << 0x4));
UINT64 X60 = (X59 + (X58 << 0x1));
UINT64 X61 = (X60 + X58);
UINT64 X62 = (X47 + (X57 << 0x4));
UINT64 X63 = (X62 + (X57 << 0x1));
UINT64 X64 = (X63 + X57);
UINT64 X65 = (X46 + (X56 << 0x4));
UINT64 X66 = (X65 + (X56 << 0x1));
UINT64 X67 = (X66 + X56);
UINT64 X68 = (X45 + (X55 << 0x4));
UINT64 X69 = (X68 + (X55 << 0x1));
UINT64 X70 = (X69 + X55);
UINT64 X71 = (X44 + (X54 << 0x4));
UINT64 X72 = (X71 + (X54 << 0x1));
UINT64 X73 = (X72 + X54);
UINT64 X74 = (X43 + (X53 << 0x4));
UINT64 X75 = (X74 + (X53 << 0x1));
UINT64 X76 = (X75 + X53);
UINT64 X77 = (X42 + (X52 << 0x4));
UINT64 X78 = (X77 + (X52 << 0x1));
UINT64 X79 = (X78 + X52);
UINT64 X80 = (X41 + (X51 << 0x4));
UINT64 X81 = (X80 + (X51 << 0x1));
UINT64 X82 = (X81 + X51);
UINT64 X83 = (X40 + (X50 << 0x4));
UINT64 X84 = (X83 + (X50 << 0x1));
UINT64 X85 = (X84 + X50);
/* Phase 3: carry chain, 26 bits (0x1a) for even limbs and 25 (0x19) for odd. */
UINT64 X86 = (X85 >> 0x1a);
UINT32 X87 = ((UINT32)X85 & 0x3ffffff);
UINT64 X88 = (X86 + X82);
UINT64 X89 = (X88 >> 0x19);
UINT32 X90 = ((UINT32)X88 & 0x1ffffff);
UINT64 X91 = (X89 + X79);
UINT64 X92 = (X91 >> 0x1a);
UINT32 X93 = ((UINT32)X91 & 0x3ffffff);
UINT64 X94 = (X92 + X76);
UINT64 X95 = (X94 >> 0x19);
UINT32 X96 = ((UINT32)X94 & 0x1ffffff);
UINT64 X97 = (X95 + X73);
UINT64 X98 = (X97 >> 0x1a);
UINT32 X99 = ((UINT32)X97 & 0x3ffffff);
UINT64 X100 = (X98 + X70);
UINT64 X101 = (X100 >> 0x19);
UINT32 X102 = ((UINT32)X100 & 0x1ffffff);
UINT64 X103 = (X101 + X67);
UINT64 X104 = (X103 >> 0x1a);
UINT32 X105 = ((UINT32)X103 & 0x3ffffff);
UINT64 X106 = (X104 + X64);
UINT64 X107 = (X106 >> 0x19);
UINT32 X108 = ((UINT32)X106 & 0x1ffffff);
UINT64 X109 = (X107 + X61);
UINT64 X110 = (X109 >> 0x1a);
UINT32 X111 = ((UINT32)X109 & 0x3ffffff);
UINT64 X112 = (X110 + X49);
UINT64 X113 = (X112 >> 0x19);
UINT32 X114 = ((UINT32)X112 & 0x1ffffff);
/* The carry out of limb 9 wraps around to limb 0 with a factor of 19. */
UINT64 X115 = (X87 + (0x13 * X113));
UINT32 X116 = (UINT32)(X115 >> 0x1a);
UINT32 X117 = ((UINT32)X115 & 0x3ffffff);
UINT32 X118 = (X116 + X90);
UINT32 X119 = (X118 >> 0x19);
UINT32 X120 = (X118 & 0x1ffffff);
Out[0] = X117;
Out[1] = X120;
Out[2] = (X119 + X93);
Out[3] = X96;
Out[4] = X99;
Out[5] = X102;
Out[6] = X105;
Out[7] = X108;
Out[8] = X111;
Out[9] = X114;
}
/* H = F * G where both operands are tight (Fe); the result is tight. */
static inline VOID
FeMulTtt(_Out_ Fe *H, _In_ CONST Fe *F, _In_ CONST Fe *G)
{
FeMulImpl(H->V, F->V, G->V);
}
/* H = F * G where F is loose (FeLoose) and G is tight (Fe); the result is tight. */
static inline VOID
FeMulTlt(_Out_ Fe *H, _In_ CONST FeLoose *F, _In_ CONST Fe *G)
{
FeMulImpl(H->V, F->V, G->V);
}
/* H = F * G where both operands are loose (FeLoose); the result is tight. */
static inline VOID
FeMulTll(_Out_ Fe *H, _In_ CONST FeLoose *F, _In_ CONST FeLoose *G)
{
FeMulImpl(H->V, F->V, G->V);
}
/* Out = In1 * In1 in the ten-limb representation. Same three phases as a
 * general multiplication, with the symmetric cross terms merged:
 *   1. X19..X37: the nineteen product columns in 64 bits; cross terms carry a
 *      factor of 2, and terms pairing two odd-indexed limbs are doubled again.
 *   2. X38..X64: the high columns X29..X37 are folded onto X19..X27 with a
 *      factor of 19, computed as (x << 4) + (x << 1) + x.
 *   3. X65..X99: a carry chain with alternating 26/25-bit shifts; the carry
 *      out of limb 9 is multiplied by 19 (0x13) and added to limb 0, followed
 *      by a short second carry through limbs 0 and 1. The last carry is added
 *      into Out[2] without further propagation.
 * All inputs are read before any output is written.
 */
static VOID
FeSqrImpl(_Out_writes_all_(10) UINT32 Out[10], _In_reads_(10) CONST UINT32 In1[10])
{
CONST UINT32 X17 = In1[9];
CONST UINT32 X18 = In1[8];
CONST UINT32 X16 = In1[7];
CONST UINT32 X14 = In1[6];
CONST UINT32 X12 = In1[5];
CONST UINT32 X10 = In1[4];
CONST UINT32 X8 = In1[3];
CONST UINT32 X6 = In1[2];
CONST UINT32 X4 = In1[1];
CONST UINT32 X2 = In1[0];
/* Phase 1: product columns, lowest first. */
UINT64 X19 = ((UINT64)X2 * X2);
UINT64 X20 = ((UINT64)(0x2 * X2) * X4);
UINT64 X21 = (0x2 * (((UINT64)X4 * X4) + ((UINT64)X2 * X6)));
UINT64 X22 = (0x2 * (((UINT64)X4 * X6) + ((UINT64)X2 * X8)));
UINT64 X23 = ((((UINT64)X6 * X6) + ((UINT64)(0x4 * X4) * X8)) + ((UINT64)(0x2 * X2) * X10));
UINT64 X24 = (0x2 * ((((UINT64)X6 * X8) + ((UINT64)X4 * X10)) + ((UINT64)X2 * X12)));
UINT64 X25 = (0x2 * (((((UINT64)X8 * X8) + ((UINT64)X6 * X10)) + ((UINT64)X2 * X14)) + ((UINT64)(0x2 * X4) * X12)));
UINT64 X26 = (0x2 * (((((UINT64)X8 * X10) + ((UINT64)X6 * X12)) + ((UINT64)X4 * X14)) + ((UINT64)X2 * X16)));
UINT64 X27 =
(((UINT64)X10 * X10) +
(0x2 * ((((UINT64)X6 * X14) + ((UINT64)X2 * X18)) + (0x2 * (((UINT64)X4 * X16) + ((UINT64)X8 * X12))))));
UINT64 X28 =
(0x2 * ((((((UINT64)X10 * X12) + ((UINT64)X8 * X14)) + ((UINT64)X6 * X16)) + ((UINT64)X4 * X18)) +
((UINT64)X2 * X17)));
UINT64 X29 =
(0x2 * (((((UINT64)X12 * X12) + ((UINT64)X10 * X14)) + ((UINT64)X6 * X18)) +
(0x2 * (((UINT64)X8 * X16) + ((UINT64)X4 * X17)))));
UINT64 X30 = (0x2 * (((((UINT64)X12 * X14) + ((UINT64)X10 * X16)) + ((UINT64)X8 * X18)) + ((UINT64)X6 * X17)));
UINT64 X31 =
(((UINT64)X14 * X14) + (0x2 * (((UINT64)X10 * X18) + (0x2 * (((UINT64)X12 * X16) + ((UINT64)X8 * X17))))));
UINT64 X32 = (0x2 * ((((UINT64)X14 * X16) + ((UINT64)X12 * X18)) + ((UINT64)X10 * X17)));
UINT64 X33 = (0x2 * ((((UINT64)X16 * X16) + ((UINT64)X14 * X18)) + ((UINT64)(0x2 * X12) * X17)));
UINT64 X34 = (0x2 * (((UINT64)X16 * X18) + ((UINT64)X14 * X17)));
UINT64 X35 = (((UINT64)X18 * X18) + ((UINT64)(0x4 * X16) * X17));
UINT64 X36 = ((UINT64)(0x2 * X18) * X17);
UINT64 X37 = ((UINT64)(0x2 * X17) * X17);
/* Phase 2: fold columns 10..18 onto columns 0..8 with a factor of 19. */
UINT64 X38 = (X27 + (X37 << 0x4));
UINT64 X39 = (X38 + (X37 << 0x1));
UINT64 X40 = (X39 + X37);
UINT64 X41 = (X26 + (X36 << 0x4));
UINT64 X42 = (X41 + (X36 << 0x1));
UINT64 X43 = (X42 + X36);
UINT64 X44 = (X25 + (X35 << 0x4));
UINT64 X45 = (X44 + (X35 << 0x1));
UINT64 X46 = (X45 + X35);
UINT64 X47 = (X24 + (X34 << 0x4));
UINT64 X48 = (X47 + (X34 << 0x1));
UINT64 X49 = (X48 + X34);
UINT64 X50 = (X23 + (X33 << 0x4));
UINT64 X51 = (X50 + (X33 << 0x1));
UINT64 X52 = (X51 + X33);
UINT64 X53 = (X22 + (X32 << 0x4));
UINT64 X54 = (X53 + (X32 << 0x1));
UINT64 X55 = (X54 + X32);
UINT64 X56 = (X21 + (X31 << 0x4));
UINT64 X57 = (X56 + (X31 << 0x1));
UINT64 X58 = (X57 + X31);
UINT64 X59 = (X20 + (X30 << 0x4));
UINT64 X60 = (X59 + (X30 << 0x1));
UINT64 X61 = (X60 + X30);
UINT64 X62 = (X19 + (X29 << 0x4));
UINT64 X63 = (X62 + (X29 << 0x1));
UINT64 X64 = (X63 + X29);
/* Phase 3: carry chain, 26 bits (0x1a) for even limbs and 25 (0x19) for odd. */
UINT64 X65 = (X64 >> 0x1a);
UINT32 X66 = ((UINT32)X64 & 0x3ffffff);
UINT64 X67 = (X65 + X61);
UINT64 X68 = (X67 >> 0x19);
UINT32 X69 = ((UINT32)X67 & 0x1ffffff);
UINT64 X70 = (X68 + X58);
UINT64 X71 = (X70 >> 0x1a);
UINT32 X72 = ((UINT32)X70 & 0x3ffffff);
UINT64 X73 = (X71 + X55);
UINT64 X74 = (X73 >> 0x19);
UINT32 X75 = ((UINT32)X73 & 0x1ffffff);
UINT64 X76 = (X74 + X52);
UINT64 X77 = (X76 >> 0x1a);
UINT32 X78 = ((UINT32)X76 & 0x3ffffff);
UINT64 X79 = (X77 + X49);
UINT64 X80 = (X79 >> 0x19);
UINT32 X81 = ((UINT32)X79 & 0x1ffffff);
UINT64 X82 = (X80 + X46);
UINT64 X83 = (X82 >> 0x1a);
UINT32 X84 = ((UINT32)X82 & 0x3ffffff);
UINT64 X85 = (X83 + X43);
UINT64 X86 = (X85 >> 0x19);
UINT32 X87 = ((UINT32)X85 & 0x1ffffff);
UINT64 X88 = (X86 + X40);
UINT64 X89 = (X88 >> 0x1a);
UINT32 X90 = ((UINT32)X88 & 0x3ffffff);
UINT64 X91 = (X89 + X28);
UINT64 X92 = (X91 >> 0x19);
UINT32 X93 = ((UINT32)X91 & 0x1ffffff);
/* The carry out of limb 9 wraps around to limb 0 with a factor of 19. */
UINT64 X94 = (X66 + (0x13 * X92));
UINT32 X95 = (UINT32)(X94 >> 0x1a);
UINT32 X96 = ((UINT32)X94 & 0x3ffffff);
UINT32 X97 = (X95 + X69);
UINT32 X98 = (X97 >> 0x19);
UINT32 X99 = (X97 & 0x1ffffff);
Out[0] = X96;
Out[1] = X99;
Out[2] = (X98 + X72);
Out[3] = X75;
Out[4] = X78;
Out[5] = X81;
Out[6] = X84;
Out[7] = X87;
Out[8] = X90;
Out[9] = X93;
}
/* H = F^2 where F is loose (FeLoose); the result is tight. */
static inline VOID
FeSqTl(_Out_ Fe *H, _In_ CONST FeLoose *F)
{
FeSqrImpl(H->V, F->V);
}
/* H = F^2 where F is tight (Fe); the result is tight. */
static inline VOID
FeSqTt(_Out_ Fe *H, _In_ CONST Fe *F)
{
FeSqrImpl(H->V, F->V);
}
/* Raise Z to the fixed power 2^255 - 21 and store the result in Out.
 *
 * With the Curve25519 field prime p = 2^255 - 19 that exponent equals p - 2,
 * so by Fermat's little theorem the result is the multiplicative inverse of
 * Z; for Z = 0 the result is 0.
 *
 * The sequence of operations is fixed: every loop bound is a constant and
 * nothing here branches on the value of Z. Each "FeSqTt(...); for (i = 1;
 * i < N; ++i) FeSqTt(...)" pair performs N squarings in total, because the
 * loop counter starts at 1.
 *
 * The per-line exponent notes assume that FeSq* squares and FeMul*
 * multiplies, as their names indicate. T0..T3 are scratch values and are
 * not cleared before returning.
 */
static inline VOID
FeLooseInvert(_Out_ Fe *Out, _In_ CONST FeLoose *Z)
{
Fe T0;
Fe T1;
Fe T2;
Fe T3;
LONG i;
FeSqTl(&T0, Z); /* T0 = Z^2 */
FeSqTt(&T1, &T0); /* T1 = Z^4 */
for (i = 1; i < 2; ++i)
FeSqTt(&T1, &T1); /* runs once: T1 = Z^8 */
FeMulTlt(&T1, Z, &T1); /* T1 = Z^9 */
FeMulTtt(&T0, &T0, &T1); /* T0 = Z^11, kept for the final multiply */
FeSqTt(&T2, &T0); /* T2 = Z^22 */
FeMulTtt(&T1, &T1, &T2); /* T1 = Z^31 = Z^(2^5 - 1) */
FeSqTt(&T2, &T1);
for (i = 1; i < 5; ++i)
FeSqTt(&T2, &T2); /* 5 squarings: T2 = Z^(2^10 - 2^5) */
FeMulTtt(&T1, &T2, &T1); /* T1 = Z^(2^10 - 1) */
FeSqTt(&T2, &T1);
for (i = 1; i < 10; ++i)
FeSqTt(&T2, &T2); /* 10 squarings: T2 = Z^(2^20 - 2^10) */
FeMulTtt(&T2, &T2, &T1); /* T2 = Z^(2^20 - 1) */
FeSqTt(&T3, &T2);
for (i = 1; i < 20; ++i)
FeSqTt(&T3, &T3); /* 20 squarings: T3 = Z^(2^40 - 2^20) */
FeMulTtt(&T2, &T3, &T2); /* T2 = Z^(2^40 - 1) */
FeSqTt(&T2, &T2);
for (i = 1; i < 10; ++i)
FeSqTt(&T2, &T2); /* 10 squarings: T2 = Z^(2^50 - 2^10) */
FeMulTtt(&T1, &T2, &T1); /* T1 = Z^(2^50 - 1) */
FeSqTt(&T2, &T1);
for (i = 1; i < 50; ++i)
FeSqTt(&T2, &T2); /* 50 squarings: T2 = Z^(2^100 - 2^50) */
FeMulTtt(&T2, &T2, &T1); /* T2 = Z^(2^100 - 1) */
FeSqTt(&T3, &T2);
for (i = 1; i < 100; ++i)
FeSqTt(&T3, &T3); /* 100 squarings: T3 = Z^(2^200 - 2^100) */
FeMulTtt(&T2, &T3, &T2); /* T2 = Z^(2^200 - 1) */
FeSqTt(&T2, &T2);
for (i = 1; i < 50; ++i)
FeSqTt(&T2, &T2); /* 50 squarings: T2 = Z^(2^250 - 2^50) */
FeMulTtt(&T1, &T2, &T1); /* T1 = Z^(2^250 - 1) */
FeSqTt(&T1, &T1);
for (i = 1; i < 5; ++i)
FeSqTt(&T1, &T1); /* 5 squarings: T1 = Z^(2^255 - 2^5) */
FeMulTtt(Out, &T1, &T0); /* Out = Z^(2^255 - 32 + 11) = Z^(2^255 - 21) */
}
/* Inversion entry point for an Fe input: Z is first copied into a local
 * FeLoose with FeCopyLt, and that copy is handed to FeLooseInvert, which
 * writes the result to Out. Because the input is copied before Out is
 * written, callers may pass the same object for Out and Z (Curve25519 does:
 * FeInvert(&Z2, &Z2)). The local copy is not cleared before returning.
 */
static inline VOID
FeInvert(_Out_ Fe *Out, _In_ CONST Fe *Z)
{
FeLoose l;
FeCopyLt(&l, Z);
FeLooseInvert(Out, &l);
}
/* Conditional swap of two field elements.
 *
 * B == 1: the limbs of F and G are exchanged.
 * B == 0: F and G are left untouched.
 *
 * Precondition: B is 0 or 1. The choice is applied through a bit mask, so no
 * branch in this function depends on B.
 */
static inline VOID
FeCswap(_Inout_ Fe *F, _Inout_ Fe *G, _In_ UINT32 B)
{
    /* 0 - 1 wraps to all ones, 0 - 0 stays zero. */
    CONST UINT32 Mask = 0 - B;
    LONG Limb;

    for (Limb = 0; Limb < 10; ++Limb)
    {
        /* Bits in which the two limbs differ, or zero when not swapping. */
        CONST UINT32 Delta = (F->V[Limb] ^ G->V[Limb]) & Mask;

        F->V[Limb] ^= Delta;
        G->V[Limb] ^= Delta;
    }
}
/* Multiply a ten-limb field element by the constant 121666 and carry-reduce.
 *
 * NOTE: based on fiat-crypto fe_mul, edited for in2=121666, 0, 0. Nine of the
 * ten limbs of that second operand are zero, so every partial product that
 * involves them is zero; only the ten products 121666 * In1[i] are computed
 * here, followed by the same carry chain as the generic multiply.
 *
 * Limb layout: even limbs keep 26 bits (mask 0x3ffffff), odd limbs keep 25
 * bits (mask 0x1ffffff). The carry out of limb 9 is folded back into limb 0
 * multiplied by 0x13 (19). Out[2] receives one extra carry without being
 * masked again.
 *
 * All of In1 is read before the first store to Out.
 */
static inline VOID
FeMul121666Impl(_Out_writes_all_(10) UINT32 Out[10], _In_reads_(10) CONST UINT32 In1[10])
{
    CONST UINT64 K = 121666;

    /* 64-bit products of each input limb with the constant. */
    CONST UINT64 P0 = K * In1[0];
    CONST UINT64 P1 = K * In1[1];
    CONST UINT64 P2 = K * In1[2];
    CONST UINT64 P3 = K * In1[3];
    CONST UINT64 P4 = K * In1[4];
    CONST UINT64 P5 = K * In1[5];
    CONST UINT64 P6 = K * In1[6];
    CONST UINT64 P7 = K * In1[7];
    CONST UINT64 P8 = K * In1[8];
    CONST UINT64 P9 = K * In1[9];

    /* Ripple the carries upwards: shift by 26 out of even limbs and by 25
     * out of odd limbs. */
    CONST UINT64 C0 = P0;
    CONST UINT64 C1 = P1 + (C0 >> 0x1a);
    CONST UINT64 C2 = P2 + (C1 >> 0x19);
    CONST UINT64 C3 = P3 + (C2 >> 0x1a);
    CONST UINT64 C4 = P4 + (C3 >> 0x19);
    CONST UINT64 C5 = P5 + (C4 >> 0x1a);
    CONST UINT64 C6 = P6 + (C5 >> 0x19);
    CONST UINT64 C7 = P7 + (C6 >> 0x1a);
    CONST UINT64 C8 = P8 + (C7 >> 0x19);
    CONST UINT64 C9 = P9 + (C8 >> 0x1a);

    /* Fold the overflow of the top limb back into limb 0 (times 19), then
     * let the resulting carry run through limb 1 into limb 2. */
    CONST UINT64 Wrap0 = ((UINT32)C0 & 0x3ffffff) + (0x13 * (C9 >> 0x19));
    CONST UINT32 Wrap1 = (UINT32)(Wrap0 >> 0x1a) + ((UINT32)C1 & 0x1ffffff);

    Out[0] = ((UINT32)Wrap0 & 0x3ffffff);
    Out[1] = (Wrap1 & 0x1ffffff);
    Out[2] = ((Wrap1 >> 0x19) + ((UINT32)C2 & 0x3ffffff));
    Out[3] = ((UINT32)C3 & 0x1ffffff);
    Out[4] = ((UINT32)C4 & 0x3ffffff);
    Out[5] = ((UINT32)C5 & 0x1ffffff);
    Out[6] = ((UINT32)C6 & 0x3ffffff);
    Out[7] = ((UINT32)C7 & 0x1ffffff);
    Out[8] = ((UINT32)C8 & 0x3ffffff);
    Out[9] = ((UINT32)C9 & 0x1ffffff);
}
/* Typed front end for FeMul121666Impl: multiplies the FeLoose input F by the
 * constant 121666 and stores the carry-reduced result in the Fe output H.
 * Only the limb arrays (the V members) are passed on.
 */
static inline VOID
FeMul121666(_Out_ Fe *H, _In_ CONST FeLoose *F)
{
FeMul121666Impl(H->V, F->V);
}
/* Curve25519 scalar multiplication on x-coordinates.
 *
 * Out    - receives the encoded x-coordinate of the result (written once, at
 *          the end, by FeTobytes).
 * Scalar - secret scalar; it is copied to a local buffer and clamped with
 *          Curve25519ClampSecret, the caller's copy is not modified.
 * Point  - encoded x-coordinate of the input point (decoded by FeFrombytes).
 *
 * Returns !CryptoIsZero32(Out), i.e. FALSE when CryptoIsZero32 reports the
 * output as zero (presumably an all-zero 32-byte result) and TRUE otherwise.
 *
 * The ladder runs a fixed 255 iterations and selects between the two working
 * points with FeCswap, so no branch depends on the scalar bits. Every local
 * that holds scalar- or result-derived data is wiped before returning,
 * including the per-step scratch values. Temporaries inside the called
 * helpers are outside the scope of this function.
 */
_Use_decl_annotations_
BOOLEAN
Curve25519(
    UINT8 Out[CURVE25519_KEY_SIZE],
    CONST UINT8 Scalar[CURVE25519_KEY_SIZE],
    CONST UINT8 Point[CURVE25519_KEY_SIZE])
{
    Fe X1, X2, Z2, X3, Z3;
    FeLoose X2l, Z2l, X3l;
    /* Ladder-step scratch values. They are declared at function scope so
     * that they can be wiped with the other locals: after the final step
     * X2 = Tmp1 * Tmp0 and Z2 = Tmp1l * Tmp0l, so leaving them behind would
     * undo the wiping of X2 and Z2.
     */
    Fe Tmp0, Tmp1;
    FeLoose Tmp0l, Tmp1l;
    UINT32 Swap = 0;
    LONG Pos;
    UINT8 E[32];

    RtlCopyMemory(E, Scalar, sizeof(E));
    Curve25519ClampSecret(E);
    /* The following implementation was transcribed to Coq and proven to
     * correspond to unary scalar multiplication in affine coordinates given
     * that x1 != 0 is the x coordinate of some point on the curve. It was
     * also checked in Coq that doing a ladderstep with x1 = x3 = 0 gives
     * z2' = z3' = 0, and z2 = z3 = 0 gives z2' = z3' = 0. The statement was
     * quantified over the underlying field, so it applies to Curve25519
     * itself and the quadratic twist of Curve25519. It was not proven in
     * Coq that prime-field arithmetic correctly simulates extension-field
     * arithmetic on prime-field values. The decoding of the byte array
     * representation of e was not considered.
     *
     * Specification of Montgomery curves in affine coordinates:
     * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Spec/MontgomeryCurve.v#L27>
     *
     * Proof that these form a group that is isomorphic to a Weierstrass
     * curve:
     * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/AffineProofs.v#L35>
     *
     * Coq transcription and correctness proof of the loop
     * (where scalarbits=255):
     * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L118>
     * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L278>
     * preconditions: 0 <= e < 2^255 (not necessarily e < order),
     * fe_invert(0) = 0
     */
    FeFrombytes(&X1, Point);
    Fe1(&X2);
    Fe0(&Z2);
    FeCopy(&X3, &X1);
    Fe1(&Z3);
    for (Pos = 254; Pos >= 0; --Pos)
    {
        /* loop invariant as of right before the test, for the case
         * where x1 != 0:
         *   pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3
         *   is nonzero
         *   let r := e >> (pos+1) in the following equalities of
         *   projective points:
         *   to_xz (r*P)     === if swap then (x3, z3) else (x2, z2)
         *   to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3)
         *   x1 is the nonzero x coordinate of the nonzero
         *   point (r*P-(r+1)*P)
         */
        UINT32 B = 1 & (E[Pos / 8] >> (Pos & 7));
        Swap ^= B;
        FeCswap(&X2, &X3, Swap);
        FeCswap(&Z2, &Z3, Swap);
        Swap = B;
        /* Coq transcription of ladderstep formula (called from
         * transcribed loop):
         * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZ.v#L89>
         * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L131>
         * x1 != 0
         * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L217>
         * x1 = 0
         * <https://github.com/mit-plv/fiat-crypto/blob/2456d821825521f7e03e65882cc3521795b0320f/src/Curves/Montgomery/XZProofs.v#L147>
         */
        FeSub(&Tmp0l, &X3, &Z3);
        FeSub(&Tmp1l, &X2, &Z2);
        FeAdd(&X2l, &X2, &Z2);
        FeAdd(&Z2l, &X3, &Z3);
        FeMulTll(&Z3, &Tmp0l, &X2l);
        FeMulTll(&Z2, &Z2l, &Tmp1l);
        FeSqTl(&Tmp0, &Tmp1l);
        FeSqTl(&Tmp1, &X2l);
        FeAdd(&X3l, &Z3, &Z2);
        FeSub(&Z2l, &Z3, &Z2);
        FeMulTtt(&X2, &Tmp1, &Tmp0);
        FeSub(&Tmp1l, &Tmp1, &Tmp0);
        FeSqTl(&Z2, &Z2l);
        FeMul121666(&Z3, &Tmp1l);
        FeSqTl(&X3, &X3l);
        FeAdd(&Tmp0l, &Tmp0, &Z3);
        FeMulTtt(&Z3, &X1, &Z2);
        FeMulTll(&Z2, &Tmp1l, &Tmp0l);
    }
    /* here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3)
     * else (x2, z2)
     */
    FeCswap(&X2, &X3, Swap);
    FeCswap(&Z2, &Z3, Swap);
    /* Convert from projective (X2 : Z2) to the affine x = X2 / Z2. */
    FeInvert(&Z2, &Z2);
    FeMulTtt(&X2, &X2, &Z2);
    FeTobytes(Out, &X2);
    RtlSecureZeroMemory(&X1, sizeof(X1));
    RtlSecureZeroMemory(&X2, sizeof(X2));
    RtlSecureZeroMemory(&Z2, sizeof(Z2));
    RtlSecureZeroMemory(&X3, sizeof(X3));
    RtlSecureZeroMemory(&Z3, sizeof(Z3));
    RtlSecureZeroMemory(&X2l, sizeof(X2l));
    RtlSecureZeroMemory(&Z2l, sizeof(Z2l));
    RtlSecureZeroMemory(&X3l, sizeof(X3l));
    RtlSecureZeroMemory(&Tmp0, sizeof(Tmp0));
    RtlSecureZeroMemory(&Tmp1, sizeof(Tmp1));
    RtlSecureZeroMemory(&Tmp0l, sizeof(Tmp0l));
    RtlSecureZeroMemory(&Tmp1l, sizeof(Tmp1l));
    RtlSecureZeroMemory(&E, sizeof(E));
    return !CryptoIsZero32(Out);
}
#ifdef DBG
# include "selftest/chacha20poly1305.c"
# ifdef ALLOC_PRAGMA
# pragma alloc_text(INIT, CryptoSelftest)
# endif
/* Debug-build self-test driver.
 *
 * Runs ChaCha20Poly1305Selftest once for every subset of the CPU feature
 * mask found in Simd.CpuFeatures after SimdGet, starting with the empty set
 * and ending with the full set. Each failing combination is logged with its
 * mask. Returns TRUE only when every combination passed.
 */
_Use_decl_annotations_
BOOLEAN CryptoSelftest(VOID)
{
BOOLEAN Success = TRUE;
SIMD_STATE Simd;
SimdGet(&Simd);
/* Remember the complete feature mask, then start from the empty subset. */
ULONG FullSet = (ULONG)Simd.CpuFeatures;
Simd.CpuFeatures = 0;
do
{
if (!ChaCha20Poly1305Selftest(&Simd))
{
LogDebug("chacha20poly1305 self-test combination 0x%lx: FAIL", Simd.CpuFeatures);
Success = FALSE;
}
/* Step to the next subset of FullSet: subtracting FullSet and masking
 * with it counts upwards using only the bits set in FullSet. After the
 * full set itself the value wraps to 0, which ends the loop. When FullSet
 * is 0 the body runs exactly once.
 */
Simd.CpuFeatures = ((ULONG)Simd.CpuFeatures - FullSet) & FullSet;
} while (Simd.CpuFeatures);
/* Simd.CpuFeatures is 0 at this point, not the mask obtained from SimdGet. */
SimdPut(&Simd);
if (Success)
LogDebug("crypto self-tests: pass");
return Success;
}
#endif