// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2022-2024 Jason A. Donenfeld <[email protected]>. All Rights Reserved.
 */

#include <linux/linkage.h>
#include <asm/frame.h>

/*
 * The ChaCha constant "expand 32-byte k", stored as a single 128-bit value.
 * On little-endian x86 its bytes in memory spell the ASCII string, i.e. the
 * four 32-bit words 0x61707865, 0x3320646e, 0x79622d32, 0x6b206574.
 * It is 16-byte aligned because it is loaded with movaps.
 */
.section	.rodata, "a"
.align 16
CONSTANTS:	.octa 0x6b20657479622d323320646e61707865
.text

/*
 * Very basic SSE2 implementation of ChaCha20. Produces a given positive number
 * of blocks of output with a nonce of 0, taking an input key and 8-byte
 * counter. Importantly does not spill to the stack. Its arguments are:
 *
 *	rdi: output bytes
 *	rsi: 32-byte key input
 *	rdx: 8-byte counter input/output
 *	rcx: number of 64-byte blocks to write to output
 *
 * The block count is decremented and tested only after a block has been
 * written, so rcx must be non-zero on entry.
 *
 * On return the 8-byte counter at (rdx) has been incremented once per block
 * produced, rdi points just past the last block written and rcx is zero.
 *
 * Clobbers: rax, rcx, rdi, xmm0-xmm9 and the flags. The key and the output
 * buffer are accessed with unaligned moves (movups), so neither needs any
 * particular alignment. No stack memory is touched and nothing is called.
 */
SYM_FUNC_START(__arch_chacha20_blocks_nostack)

.set	output,		%rdi
.set	key,		%rsi
.set	counter,	%rdx
.set	nblocks,	%rcx
.set	i,		%al
/* xmm registers are *not* callee-save. */
.set	temp,		%xmm0
.set	state0,		%xmm1
.set	state1,		%xmm2
.set	state2,		%xmm3
.set	state3,		%xmm4
.set	copy0,		%xmm5
.set	copy1,		%xmm6
.set	copy2,		%xmm7
.set	copy3,		%xmm8
.set	one,		%xmm9

	/*
	 * copy0..copy3 hold the four 128-bit rows of the initial state for the
	 * current block; state0..state3 are the working rows that get permuted.
	 */

	/* copy0 = "expand 32-byte k" */
	movaps		CONSTANTS(%rip),copy0
	/* copy1,copy2 = key */
	movups		0x00(key),copy1
	movups		0x10(key),copy2
	/* copy3 = counter || zero nonce (movq clears the upper 64 bits) */
	movq		0x00(counter),copy3
	/* one = 1 || 0, so that paddq bumps only the counter half of copy3 */
	movq		$1,%rax
	movq		%rax,one

.Lblock:
	/* state0,state1,state2,state3 = copy0,copy1,copy2,copy3 */
	movdqa		copy0,state0
	movdqa		copy1,state1
	movdqa		copy2,state2
	movdqa		copy3,state3

	/*
	 * 10 iterations of a double round (column round + diagonal round),
	 * i.e. 20 rounds. i lives in %al; the 1 that was in %rax has already
	 * been copied into `one`.
	 */
	movb		$10,i
.Lpermute:
	/*
	 * Each rotl32(x, n) below is built as (x << n) | (x >> (32 - n)),
	 * using temp for the left-shifted half.
	 */

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$16,temp
	psrld		$16,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$12,temp
	psrld		$20,state1
	por		temp,state1

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$8,temp
	psrld		$24,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$7,temp
	psrld		$25,state1
	por		temp,state1

	/*
	 * Rotate rows 1-3 by one, two and three lanes so that the diagonals
	 * of the state line up as columns for the second half of the round.
	 */
	/* state1[0,1,2,3] = state1[1,2,3,0] */
	pshufd		$0x39,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[3,0,1,2] */
	pshufd		$0x93,state3,state3

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 16) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$16,temp
	psrld		$16,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 12) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$12,temp
	psrld		$20,state1
	por		temp,state1

	/* state0 += state1, state3 = rotl32(state3 ^ state0, 8) */
	paddd		state1,state0
	pxor		state0,state3
	movdqa		state3,temp
	pslld		$8,temp
	psrld		$24,state3
	por		temp,state3

	/* state2 += state3, state1 = rotl32(state1 ^ state2, 7) */
	paddd		state3,state2
	pxor		state2,state1
	movdqa		state1,temp
	pslld		$7,temp
	psrld		$25,state1
	por		temp,state1

	/* Undo the lane rotation so the rows are back in column order. */
	/* state1[0,1,2,3] = state1[3,0,1,2] */
	pshufd		$0x93,state1,state1
	/* state2[0,1,2,3] = state2[2,3,0,1] */
	pshufd		$0x4e,state2,state2
	/* state3[0,1,2,3] = state3[1,2,3,0] */
	pshufd		$0x39,state3,state3

	decb		i
	jnz		.Lpermute

	/* output0 = state0 + copy0 */
	paddd		copy0,state0
	movups		state0,0x00(output)
	/* output1 = state1 + copy1 */
	paddd		copy1,state1
	movups		state1,0x10(output)
	/* output2 = state2 + copy2 */
	paddd		copy2,state2
	movups		state2,0x20(output)
	/* output3 = state3 + copy3 */
	paddd		copy3,state3
	movups		state3,0x30(output)

	/* ++copy3.counter */
	paddq		one,copy3

	/* output += 64, --nblocks */
	addq		$64,output
	decq		nblocks
	jnz		.Lblock

	/* counter = copy3.counter */
	movq		copy3,0x00(counter)

	/*
	 * Zero out the potentially sensitive regs, in case nothing uses these
	 * again. copy0 (the constant), copy3 (counter and zero nonce) and one
	 * are left as they are; they hold neither key nor keystream material.
	 */
	pxor		state0,state0
	pxor		state1,state1
	pxor		state2,state2
	pxor		state3,state3
	pxor		copy1,copy1
	pxor		copy2,copy2
	pxor		temp,temp

	ret
SYM_FUNC_END(__arch_chacha20_blocks_nostack)