Adding in curl and openssl repos

2025-08-14 12:09:30 -04:00
parent af2117b574
commit 0ace93e303
21174 changed files with 3607720 additions and 2 deletions

8 file diffs suppressed because they are too large

View File

@@ -0,0 +1,594 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2022, Hongren (Zenithal) Zheng <i@zenithal.me>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
################################################################################
# Utility functions to help with keeping track of which registers to stack/
# unstack when entering / exiting routines.
################################################################################
{
# Callee-saved registers
my @callee_saved = map("x$_",(2,8,9,18..27));
# Caller-saved registers
my @caller_saved = map("x$_",(1,5..7,10..17,28..31));
my @must_save;
sub use_reg {
my $reg = shift;
if (grep(/^$reg$/, @callee_saved)) {
push(@must_save, $reg);
} elsif (!grep(/^$reg$/, @caller_saved)) {
# Register is not usable!
die("Unusable register ".$reg);
}
return $reg;
}
sub use_regs {
return map(use_reg("x$_"), @_);
}
sub save_regs {
my $ret = '';
my $stack_reservation = ($#must_save + 1) * 8;
my $stack_offset = $stack_reservation;
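# The RISC-V psABI requires the stack pointer to stay 16-byte aligned, so
# round the reservation up when an odd number of registers is saved.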
if ($stack_reservation % 16) {
$stack_reservation += 8;
}
$ret.=" addi sp,sp,-$stack_reservation\n";
foreach (@must_save) {
$stack_offset -= 8;
$ret.=" sd $_,$stack_offset(sp)\n";
}
return $ret;
}
sub load_regs {
my $ret = '';
my $stack_reservation = ($#must_save + 1) * 8;
my $stack_offset = $stack_reservation;
if ($stack_reservation % 16) {
$stack_reservation += 8;
}
foreach (@must_save) {
$stack_offset -= 8;
$ret.=" ld $_,$stack_offset(sp)\n";
}
$ret.=" addi sp,sp,$stack_reservation\n";
return $ret;
}
sub clear_regs {
@must_save = ();
}
}
################################################################################
# Register assignment for rv64i_zkne_encrypt and rv64i_zknd_decrypt
################################################################################
# Registers to hold AES state (called s0-s3 or y0-y3 elsewhere)
my ($Q0,$Q1,$Q2,$Q3) = use_regs(6..9);
# Function arguments (x10-x12 are a0-a2 in the ABI)
# Input block pointer, output block pointer, key pointer
my ($INP,$OUTP,$KEYP) = use_regs(10..12);
# Temporaries
my ($T0,$T1) = use_regs(13..14);
# Loop counter
my ($loopcntr) = use_regs(30);
################################################################################
# void rv64i_zkne_encrypt(const unsigned char *in, unsigned char *out,
# const AES_KEY *key);
################################################################################
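# Note: OpenSSL's AES_KEY stores the expanded round keys from offset 0
# (16 bytes per round key) and the round count as a 32-bit integer at byte
# offset 240, which is why the code below reads `240($KEYP)`.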
my $code .= <<___;
.text
.balign 16
.globl rv64i_zkne_encrypt
.type rv64i_zkne_encrypt,\@function
rv64i_zkne_encrypt:
___
$code .= save_regs();
$code .= <<___;
# Load input to block cipher
ld $Q0,0($INP)
ld $Q1,8($INP)
# Load key
ld $T0,0($KEYP)
ld $T1,8($KEYP)
# Load number of rounds
lwu $loopcntr,240($KEYP)
# initial transformation
xor $Q0,$Q0,$T0
xor $Q1,$Q1,$T1
# The main loop only executes the first N-1 rounds.
add $loopcntr,$loopcntr,-1
# Do Nr - 1 rounds (final round is special)
1:
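# Each aes64esm takes the full 128-bit state from its two source registers
# and produces one 64-bit half of the next round state; swapping the source
# operands yields the other half.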
@{[aes64esm $Q2,$Q0,$Q1]}
@{[aes64esm $Q3,$Q1,$Q0]}
# Update key ptr to point to next key in schedule
add $KEYP,$KEYP,16
# Grab next key in schedule
ld $T0,0($KEYP)
ld $T1,8($KEYP)
xor $Q0,$Q2,$T0
xor $Q1,$Q3,$T1
add $loopcntr,$loopcntr,-1
bgtz $loopcntr,1b
# final round
@{[aes64es $Q2,$Q0,$Q1]}
@{[aes64es $Q3,$Q1,$Q0]}
# since not added 16 before
ld $T0,16($KEYP)
ld $T1,24($KEYP)
xor $Q0,$Q2,$T0
xor $Q1,$Q3,$T1
sd $Q0,0($OUTP)
sd $Q1,8($OUTP)
# Pop registers and return
___
$code .= load_regs();
$code .= <<___;
ret
___
################################################################################
# void rv64i_zknd_decrypt(const unsigned char *in, unsigned char *out,
# const AES_KEY *key);
################################################################################
$code .= <<___;
.text
.balign 16
.globl rv64i_zknd_decrypt
.type rv64i_zknd_decrypt,\@function
rv64i_zknd_decrypt:
___
$code .= save_regs();
$code .= <<___;
# Load input to block cipher
ld $Q0,0($INP)
ld $Q1,8($INP)
# Load number of rounds
lwu $loopcntr,240($KEYP)
# Load the last key
slli $T0,$loopcntr,4
add $KEYP,$KEYP,$T0
ld $T0,0($KEYP)
ld $T1,8($KEYP)
xor $Q0,$Q0,$T0
xor $Q1,$Q1,$T1
# The main loop only executes the first N-1 rounds.
add $loopcntr,$loopcntr,-1
# Do Nr - 1 rounds (final round is special)
1:
@{[aes64dsm $Q2,$Q0,$Q1]}
@{[aes64dsm $Q3,$Q1,$Q0]}
# Update key ptr to point to next key in schedule
add $KEYP,$KEYP,-16
# Grab next key in schedule
ld $T0,0($KEYP)
ld $T1,8($KEYP)
xor $Q0,$Q2,$T0
xor $Q1,$Q3,$T1
add $loopcntr,$loopcntr,-1
bgtz $loopcntr,1b
# final round
@{[aes64ds $Q2,$Q0,$Q1]}
@{[aes64ds $Q3,$Q1,$Q0]}
add $KEYP,$KEYP,-16
ld $T0,0($KEYP)
ld $T1,8($KEYP)
xor $Q0,$Q2,$T0
xor $Q1,$Q3,$T1
sd $Q0,0($OUTP)
sd $Q1,8($OUTP)
# Pop registers and return
___
$code .= load_regs();
$code .= <<___;
ret
___
clear_regs();
################################################################################
# Register assignment for rv64i_zkn[e/d]_set_[en/de]crypt_key
################################################################################
# Function arguments (x10-x12 are a0-a2 in the ABI)
# Pointer to user key, number of bits in key, key pointer
my ($UKEY,$BITS,$KEYP) = use_regs(10..12);
# Temporaries
my ($T0,$T1,$T2,$T3,$T4) = use_regs(6..8,13..14);
################################################################################
# utility functions for rv64i_zkne_set_encrypt_key
################################################################################
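# The key expansion below follows the usual AES schedule: aes64ks1i applies
# SubWord/rotation and the round constant selected by its immediate to the
# previous key words, and aes64ks2 XOR-accumulates key words to form the
# next 16-byte round key, which is then stored into the schedule.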
sub ke128enc {
my $rnum = 0;
my $ret = '';
$ret .= <<___;
ld $T0,0($UKEY)
ld $T1,8($UKEY)
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
while($rnum < 10) {
$ret .= <<___;
@{[aes64ks1i $T2,$T1,$rnum]}
@{[aes64ks2 $T0,$T2,$T0]}
@{[aes64ks2 $T1,$T0,$T1]}
add $KEYP,$KEYP,16
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
$rnum++;
}
return $ret;
}
sub ke192enc {
my $rnum = 0;
my $ret = '';
$ret .= <<___;
ld $T0,0($UKEY)
ld $T1,8($UKEY)
ld $T2,16($UKEY)
sd $T0,0($KEYP)
sd $T1,8($KEYP)
sd $T2,16($KEYP)
___
while($rnum < 8) {
$ret .= <<___;
@{[aes64ks1i $T3,$T2,$rnum]}
@{[aes64ks2 $T0,$T3,$T0]}
@{[aes64ks2 $T1,$T0,$T1]}
___
if ($rnum != 7) {
# Note: the initial store plus 8 iterations of 24 bytes each would write
# (8+1)*24 = 216 bytes, but the AES-192 key schedule is only
# (12+1)*16 = 208 bytes, so the last 8 bytes can be dropped.
$ret .= <<___;
@{[aes64ks2 $T2,$T1,$T2]}
___
}
$ret .= <<___;
add $KEYP,$KEYP,24
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
if ($rnum != 7) {
$ret .= <<___;
sd $T2,16($KEYP)
___
}
$rnum++;
}
return $ret;
}
sub ke256enc {
my $rnum = 0;
my $ret = '';
$ret .= <<___;
ld $T0,0($UKEY)
ld $T1,8($UKEY)
ld $T2,16($UKEY)
ld $T3,24($UKEY)
sd $T0,0($KEYP)
sd $T1,8($KEYP)
sd $T2,16($KEYP)
sd $T3,24($KEYP)
___
while($rnum < 7) {
$ret .= <<___;
@{[aes64ks1i $T4,$T3,$rnum]}
@{[aes64ks2 $T0,$T4,$T0]}
@{[aes64ks2 $T1,$T0,$T1]}
add $KEYP,$KEYP,32
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
if ($rnum != 6) {
# Note: the initial store plus 7 iterations of 32 bytes each would write
# (7+1)*32 = 256 bytes, but the AES-256 key schedule is only
# (14+1)*16 = 240 bytes, so the last 16 bytes can be dropped.
$ret .= <<___;
@{[aes64ks1i $T4,$T1,0xA]}
@{[aes64ks2 $T2,$T4,$T2]}
@{[aes64ks2 $T3,$T2,$T3]}
sd $T2,16($KEYP)
sd $T3,24($KEYP)
___
}
$rnum++;
}
return $ret;
}
################################################################################
# void rv64i_zkne_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
################################################################################
sub AES_set_common {
my ($ke128, $ke192, $ke256) = @_;
my $ret = '';
$ret .= <<___;
bnez $UKEY,1f # if (!userKey || !key) return -1;
bnez $KEYP,1f
li a0,-1
ret
1:
# Determine number of rounds from key size in bits
li $T0,128
bne $BITS,$T0,1f
li $T1,10 # key->rounds = 10 if bits == 128
sw $T1,240($KEYP) # store key->rounds
$ke128
j 4f
1:
li $T0,192
bne $BITS,$T0,2f
li $T1,12 # key->rounds = 12 if bits == 192
sw $T1,240($KEYP) # store key->rounds
$ke192
j 4f
2:
li $T1,14 # key->rounds = 14 if bits == 256
li $T0,256
beq $BITS,$T0,3f
li a0,-2 # If bits != 128, 192, or 256, return -2
j 5f
3:
sw $T1,240($KEYP) # store key->rounds
$ke256
4: # return 0
li a0,0
5: # return a0
___
return $ret;
}
$code .= <<___;
.text
.balign 16
.globl rv64i_zkne_set_encrypt_key
.type rv64i_zkne_set_encrypt_key,\@function
rv64i_zkne_set_encrypt_key:
___
$code .= save_regs();
$code .= AES_set_common(ke128enc(), ke192enc(),ke256enc());
$code .= load_regs();
$code .= <<___;
ret
___
################################################################################
# utility functions for rv64i_zknd_set_decrypt_key
################################################################################
sub ke128dec {
my $rnum = 0;
my $ret = '';
$ret .= <<___;
ld $T0,0($UKEY)
ld $T1,8($UKEY)
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
while($rnum < 10) {
$ret .= <<___;
@{[aes64ks1i $T2,$T1,$rnum]}
@{[aes64ks2 $T0,$T2,$T0]}
@{[aes64ks2 $T1,$T0,$T1]}
add $KEYP,$KEYP,16
___
# Round keys [1:N-1] need the aes64im (InvMixColumns) transform.
# aes64dsm performs InvSubBytes before InvMixColumns, whereas a
# straightforward inversion of the cipher would do InvMixColumns first.
# The order is swapped so decryption can share a datapath with encryption
# (which does SubBytes before MixColumns); to compensate, InvMixColumns
# must be applied to these round keys instead (the equivalent inverse
# cipher construction).
if ($rnum < 9) {
$ret .= <<___;
@{[aes64im $T2,$T0]}
sd $T2,0($KEYP)
@{[aes64im $T2,$T1]}
sd $T2,8($KEYP)
___
} else {
$ret .= <<___;
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
}
$rnum++;
}
return $ret;
}
sub ke192dec {
my $rnum = 0;
my $ret = '';
$ret .= <<___;
ld $T0,0($UKEY)
ld $T1,8($UKEY)
ld $T2,16($UKEY)
sd $T0,0($KEYP)
sd $T1,8($KEYP)
@{[aes64im $T3,$T2]}
sd $T3,16($KEYP)
___
while($rnum < 8) {
$ret .= <<___;
@{[aes64ks1i $T3,$T2,$rnum]}
@{[aes64ks2 $T0,$T3,$T0]}
@{[aes64ks2 $T1,$T0,$T1]}
add $KEYP,$KEYP,24
___
if ($rnum < 7) {
$ret .= <<___;
@{[aes64im $T3,$T0]}
sd $T3,0($KEYP)
@{[aes64im $T3,$T1]}
sd $T3,8($KEYP)
# see the corresponding note in ke192enc
@{[aes64ks2 $T2,$T1,$T2]}
@{[aes64im $T3,$T2]}
sd $T3,16($KEYP)
___
} else { # rnum == 7
$ret .= <<___;
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
}
$rnum++;
}
return $ret;
}
sub ke256dec {
my $rnum = 0;
my $ret = '';
$ret .= <<___;
ld $T0,0($UKEY)
ld $T1,8($UKEY)
ld $T2,16($UKEY)
ld $T3,24($UKEY)
sd $T0,0($KEYP)
sd $T1,8($KEYP)
@{[aes64im $T4,$T2]}
sd $T4,16($KEYP)
@{[aes64im $T4,$T3]}
sd $T4,24($KEYP)
___
while($rnum < 7) {
$ret .= <<___;
@{[aes64ks1i $T4,$T3,$rnum]}
@{[aes64ks2 $T0,$T4,$T0]}
@{[aes64ks2 $T1,$T0,$T1]}
add $KEYP,$KEYP,32
___
if ($rnum < 6) {
$ret .= <<___;
@{[aes64ks1i $T4,$T1,0xA]}
@{[aes64ks2 $T2,$T4,$T2]}
@{[aes64ks2 $T3,$T2,$T3]}
@{[aes64im $T4,$T0]}
sd $T4,0($KEYP)
@{[aes64im $T4,$T1]}
sd $T4,8($KEYP)
@{[aes64im $T4,$T2]}
sd $T4,16($KEYP)
@{[aes64im $T4,$T3]}
sd $T4,24($KEYP)
___
} else {
$ret .= <<___;
sd $T0,0($KEYP)
sd $T1,8($KEYP)
# the last two 8-byte words (T2/T3) are dropped; see the note in ke256enc
___
}
$rnum++;
}
return $ret;
}
################################################################################
# void rv64i_zknd_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
################################################################################
$code .= <<___;
.text
.balign 16
.globl rv64i_zknd_set_decrypt_key
.type rv64i_zknd_set_decrypt_key,\@function
rv64i_zknd_set_decrypt_key:
___
$code .= save_regs();
$code .= AES_set_common(ke128dec(), ke192dec(),ke256dec());
$code .= load_regs();
$code .= <<___;
ret
___
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,710 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Bit-manipulation extension ('Zvbb')
# - RISC-V Vector GCM/GMAC extension ('Zvkg')
# - RISC-V Vector AES block cipher extension ('Zvkned')
# - RISC-V Zicclsm(Main memory supports misaligned loads/stores)
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
my $code=<<___;
.text
___
{
################################################################################
# void rv64i_zvbb_zvkg_zvkned_aes_xts_encrypt(const unsigned char *in,
# unsigned char *out, size_t length,
# const AES_KEY *key1,
# const AES_KEY *key2,
# const unsigned char iv[16])
my ($INPUT, $OUTPUT, $LENGTH, $KEY1, $KEY2, $IV) = ("a0", "a1", "a2", "a3", "a4", "a5");
my ($TAIL_LENGTH) = ("a6");
my ($VL) = ("a7");
my ($T0, $T1, $T2) = ("t0", "t1", "t2");
my ($STORE_LEN32) = ("t3");
my ($LEN32) = ("t4");
my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7,
$V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15,
$V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23,
$V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31,
) = map("v$_",(0..31));
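# compute_xts_iv0: derive the initial XTS tweak by running the 16-byte IV
# through a full AES encryption with KEY2 (vaesz / vaesem / vaesef).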
sub compute_xts_iv0 {
my $code=<<___;
# Load number of rounds
lwu $T0, 240($KEY2)
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V28, $IV]}
@{[vle32_v $V29, $KEY2]}
@{[vaesz_vs $V28, $V29]}
addi $T0, $T0, -1
addi $KEY2, $KEY2, 16
1:
@{[vle32_v $V29, $KEY2]}
@{[vaesem_vs $V28, $V29]}
addi $T0, $T0, -1
addi $KEY2, $KEY2, 16
bnez $T0, 1b
@{[vle32_v $V29, $KEY2]}
@{[vaesef_vs $V28, $V29]}
___
return $code;
}
# prepare the input data (v24), IV (v28), bit-reversed IV (v16) and the bit-reversed IV multiplier (v20)
sub init_first_round {
my $code=<<___;
# load input
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
@{[vle32_v $V24, $INPUT]}
li $T0, 5
# Use the simplified initialization below when there is at most one block.
blt $LEN32, $T0, 1f
# Note: we use `vgmul` for GF(2^128) multiplication. `vgmul` expects the
# coefficients in the opposite bit order, so the data must be reversed
# with `vbrev8` whenever `vgmul` is used.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vbrev8_v $V0, $V28]}
@{[vsetvli "zero", $LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V16, 0]}
# v16: [r-IV0, r-IV0, ...]
@{[vaesz_vs $V16, $V0]}
# Prepare GF(2^128) multiplier [1, x, x^2, x^3, ...] in v8.
slli $T0, $LEN32, 2
@{[vsetvli "zero", $T0, "e32", "m1", "ta", "ma"]}
# v2: [`1`, `1`, `1`, `1`, ...]
@{[vmv_v_i $V2, 1]}
# v3: [`0`, `1`, `2`, `3`, ...]
@{[vid_v $V3]}
@{[vsetvli "zero", $T0, "e64", "m2", "ta", "ma"]}
# v4: [`1`, 0, `1`, 0, `1`, 0, `1`, 0, ...]
@{[vzext_vf2 $V4, $V2]}
# v6: [`0`, 0, `1`, 0, `2`, 0, `3`, 0, ...]
@{[vzext_vf2 $V6, $V3]}
slli $T0, $LEN32, 1
@{[vsetvli "zero", $T0, "e32", "m2", "ta", "ma"]}
# v8: [1<<0=1, 0, 0, 0, 1<<1=x, 0, 0, 0, 1<<2=x^2, 0, 0, 0, ...]
@{[vwsll_vv $V8, $V4, $V6]}
# Compute [r-IV0*1, r-IV0*x, r-IV0*x^2, r-IV0*x^3, ...] in v16
@{[vsetvli "zero", $LEN32, "e32", "m4", "ta", "ma"]}
@{[vbrev8_v $V8, $V8]}
@{[vgmul_vv $V16, $V8]}
# Compute [IV0*1, IV0*x, IV0*x^2, IV0*x^3, ...] in v28.
# Reverse the bits order back.
@{[vbrev8_v $V28, $V16]}
# Prepare the x^n multiplier in v20, where `n` is the number of aes-xts
# blocks in one LMUL=4 register group:
#   n = (VLEN*LMUL)/(32*4) = (VLEN*4)/(32*4) = VLEN/32
# A vsetvli with `e32, m1` returns exactly this value as VLMAX.
@{[vsetvli $T0, "zero", "e32", "m1", "ta", "ma"]}
li $T1, 1
sll $T0, $T1, $T0
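# $T0 now holds 1<<n, i.e. the GF(2^128) polynomial x^n as an integer;
# it is bit-reversed below so it can be fed to vgmul.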
@{[vsetivli "zero", 2, "e64", "m1", "ta", "ma"]}
@{[vmv_v_i $V0, 0]}
@{[vsetivli "zero", 1, "e64", "m1", "tu", "ma"]}
@{[vmv_v_x $V0, $T0]}
@{[vsetivli "zero", 2, "e64", "m1", "ta", "ma"]}
@{[vbrev8_v $V0, $V0]}
@{[vsetvli "zero", $LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V20, 0]}
@{[vaesz_vs $V20, $V0]}
j 2f
1:
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vbrev8_v $V16, $V28]}
2:
___
return $code;
}
# prepare xts enc last block's input(v24) and iv(v28)
sub handle_xts_enc_last_block {
my $code=<<___;
bnez $TAIL_LENGTH, 1f
ret
1:
# slidedown second to last block
addi $VL, $VL, -4
@{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
# ciphertext
@{[vslidedown_vx $V24, $V24, $VL]}
# multiplier
@{[vslidedown_vx $V16, $V16, $VL]}
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vmv_v_v $V25, $V24]}
# load last block into v24
# note: the last block must be loaded before the second-to-last block is
# stored, so that in-place (in == out) operation works correctly.
@{[vsetvli "zero", $TAIL_LENGTH, "e8", "m1", "tu", "ma"]}
@{[vle8_v $V24, $INPUT]}
# setup `x` multiplier with byte-reversed order
# 0b00000010 => 0b01000000 (0x40)
li $T0, 0x40
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vmv_v_i $V28, 0]}
@{[vsetivli "zero", 1, "e8", "m1", "tu", "ma"]}
@{[vmv_v_x $V28, $T0]}
# compute IV for last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vgmul_vv $V16, $V28]}
@{[vbrev8_v $V28, $V16]}
# store second to last block
@{[vsetvli "zero", $TAIL_LENGTH, "e8", "m1", "ta", "ma"]}
@{[vse8_v $V25, $OUTPUT]}
___
return $code;
}
# prepare the xts-dec second-to-last block's input (v24) and IV (v29),
# and the last block's IV (v28)
sub handle_xts_dec_last_block {
my $code=<<___;
bnez $TAIL_LENGTH, 1f
ret
1:
# load second to last block's ciphertext
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V24, $INPUT]}
addi $INPUT, $INPUT, 16
# setup `x` multiplier with byte-reversed order
# 0b00000010 => 0b01000000 (0x40)
li $T0, 0x40
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vmv_v_i $V20, 0]}
@{[vsetivli "zero", 1, "e8", "m1", "tu", "ma"]}
@{[vmv_v_x $V20, $T0]}
beqz $LENGTH, 1f
# slidedown third to last block
addi $VL, $VL, -4
@{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
# multiplier
@{[vslidedown_vx $V16, $V16, $VL]}
# compute IV for last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vgmul_vv $V16, $V20]}
@{[vbrev8_v $V28, $V16]}
# compute IV for second to last block
@{[vgmul_vv $V16, $V20]}
@{[vbrev8_v $V29, $V16]}
j 2f
1:
# compute IV for second to last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vgmul_vv $V16, $V20]}
@{[vbrev8_v $V29, $V16]}
2:
___
return $code;
}
# Load all 11 round keys to v1-v11 registers.
sub aes_128_load_key {
my $code=<<___;
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V2, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V3, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V4, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V5, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V6, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V7, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V8, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V9, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V10, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V11, $KEY1]}
___
return $code;
}
# Load all 15 round keys to v1-v15 registers.
sub aes_256_load_key {
my $code=<<___;
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V2, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V3, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V4, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V5, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V6, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V7, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V8, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V9, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V10, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V11, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V12, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V13, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V14, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V15, $KEY1]}
___
return $code;
}
# aes-128 enc with round keys v1-v11
sub aes_128_enc {
my $code=<<___;
@{[vaesz_vs $V24, $V1]}
@{[vaesem_vs $V24, $V2]}
@{[vaesem_vs $V24, $V3]}
@{[vaesem_vs $V24, $V4]}
@{[vaesem_vs $V24, $V5]}
@{[vaesem_vs $V24, $V6]}
@{[vaesem_vs $V24, $V7]}
@{[vaesem_vs $V24, $V8]}
@{[vaesem_vs $V24, $V9]}
@{[vaesem_vs $V24, $V10]}
@{[vaesef_vs $V24, $V11]}
___
return $code;
}
# aes-128 dec with round keys v1-v11
sub aes_128_dec {
my $code=<<___;
@{[vaesz_vs $V24, $V11]}
@{[vaesdm_vs $V24, $V10]}
@{[vaesdm_vs $V24, $V9]}
@{[vaesdm_vs $V24, $V8]}
@{[vaesdm_vs $V24, $V7]}
@{[vaesdm_vs $V24, $V6]}
@{[vaesdm_vs $V24, $V5]}
@{[vaesdm_vs $V24, $V4]}
@{[vaesdm_vs $V24, $V3]}
@{[vaesdm_vs $V24, $V2]}
@{[vaesdf_vs $V24, $V1]}
___
return $code;
}
# aes-256 enc with round keys v1-v15
sub aes_256_enc {
my $code=<<___;
@{[vaesz_vs $V24, $V1]}
@{[vaesem_vs $V24, $V2]}
@{[vaesem_vs $V24, $V3]}
@{[vaesem_vs $V24, $V4]}
@{[vaesem_vs $V24, $V5]}
@{[vaesem_vs $V24, $V6]}
@{[vaesem_vs $V24, $V7]}
@{[vaesem_vs $V24, $V8]}
@{[vaesem_vs $V24, $V9]}
@{[vaesem_vs $V24, $V10]}
@{[vaesem_vs $V24, $V11]}
@{[vaesem_vs $V24, $V12]}
@{[vaesem_vs $V24, $V13]}
@{[vaesem_vs $V24, $V14]}
@{[vaesef_vs $V24, $V15]}
___
return $code;
}
# aes-256 dec with round keys v1-v15
sub aes_256_dec {
my $code=<<___;
@{[vaesz_vs $V24, $V15]}
@{[vaesdm_vs $V24, $V14]}
@{[vaesdm_vs $V24, $V13]}
@{[vaesdm_vs $V24, $V12]}
@{[vaesdm_vs $V24, $V11]}
@{[vaesdm_vs $V24, $V10]}
@{[vaesdm_vs $V24, $V9]}
@{[vaesdm_vs $V24, $V8]}
@{[vaesdm_vs $V24, $V7]}
@{[vaesdm_vs $V24, $V6]}
@{[vaesdm_vs $V24, $V5]}
@{[vaesdm_vs $V24, $V4]}
@{[vaesdm_vs $V24, $V3]}
@{[vaesdm_vs $V24, $V2]}
@{[vaesdf_vs $V24, $V1]}
___
return $code;
}
$code .= <<___;
.p2align 3
.globl rv64i_zvbb_zvkg_zvkned_aes_xts_encrypt
.type rv64i_zvbb_zvkg_zvkned_aes_xts_encrypt,\@function
rv64i_zvbb_zvkg_zvkned_aes_xts_encrypt:
@{[compute_xts_iv0]}
# aes block size is 16
andi $TAIL_LENGTH, $LENGTH, 15
mv $STORE_LEN32, $LENGTH
beqz $TAIL_LENGTH, 1f
sub $LENGTH, $LENGTH, $TAIL_LENGTH
addi $STORE_LEN32, $LENGTH, -16
1:
# Convert `LENGTH` from a byte count into a count of e32 elements.
srli $LEN32, $LENGTH, 2
srli $STORE_LEN32, $STORE_LEN32, 2
# Load number of rounds
lwu $T0, 240($KEY1)
li $T1, 14
li $T2, 10
beq $T0, $T1, aes_xts_enc_256
beq $T0, $T2, aes_xts_enc_128
.size rv64i_zvbb_zvkg_zvkned_aes_xts_encrypt,.-rv64i_zvbb_zvkg_zvkned_aes_xts_encrypt
___
$code .= <<___;
.p2align 3
aes_xts_enc_128:
@{[init_first_round]}
@{[aes_128_load_key]}
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
j 1f
.Lenc_blocks_128:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
# load plaintext into v24
@{[vle32_v $V24, $INPUT]}
# update iv
@{[vgmul_vv $V16, $V20]}
# reverse the iv's bits order back
@{[vbrev8_v $V28, $V16]}
1:
@{[vxor_vv $V24, $V24, $V28]}
slli $T0, $VL, 2
sub $LEN32, $LEN32, $VL
add $INPUT, $INPUT, $T0
@{[aes_128_enc]}
@{[vxor_vv $V24, $V24, $V28]}
# store ciphertext
@{[vsetvli "zero", $STORE_LEN32, "e32", "m4", "ta", "ma"]}
@{[vse32_v $V24, $OUTPUT]}
add $OUTPUT, $OUTPUT, $T0
sub $STORE_LEN32, $STORE_LEN32, $VL
bnez $LEN32, .Lenc_blocks_128
@{[handle_xts_enc_last_block]}
# xts last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vxor_vv $V24, $V24, $V28]}
@{[aes_128_enc]}
@{[vxor_vv $V24, $V24, $V28]}
# store last block ciphertext
addi $OUTPUT, $OUTPUT, -16
@{[vse32_v $V24, $OUTPUT]}
ret
.size aes_xts_enc_128,.-aes_xts_enc_128
___
$code .= <<___;
.p2align 3
aes_xts_enc_256:
@{[init_first_round]}
@{[aes_256_load_key]}
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
j 1f
.Lenc_blocks_256:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
# load plaintext into v24
@{[vle32_v $V24, $INPUT]}
# update iv
@{[vgmul_vv $V16, $V20]}
# reverse the iv's bits order back
@{[vbrev8_v $V28, $V16]}
1:
@{[vxor_vv $V24, $V24, $V28]}
slli $T0, $VL, 2
sub $LEN32, $LEN32, $VL
add $INPUT, $INPUT, $T0
@{[aes_256_enc]}
@{[vxor_vv $V24, $V24, $V28]}
# store ciphertext
@{[vsetvli "zero", $STORE_LEN32, "e32", "m4", "ta", "ma"]}
@{[vse32_v $V24, $OUTPUT]}
add $OUTPUT, $OUTPUT, $T0
sub $STORE_LEN32, $STORE_LEN32, $VL
bnez $LEN32, .Lenc_blocks_256
@{[handle_xts_enc_last_block]}
# xts last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vxor_vv $V24, $V24, $V28]}
@{[aes_256_enc]}
@{[vxor_vv $V24, $V24, $V28]}
# store last block ciphertext
addi $OUTPUT, $OUTPUT, -16
@{[vse32_v $V24, $OUTPUT]}
ret
.size aes_xts_enc_256,.-aes_xts_enc_256
___
################################################################################
# void rv64i_zvbb_zvkg_zvkned_aes_xts_decrypt(const unsigned char *in,
# unsigned char *out, size_t length,
# const AES_KEY *key1,
# const AES_KEY *key2,
# const unsigned char iv[16])
$code .= <<___;
.p2align 3
.globl rv64i_zvbb_zvkg_zvkned_aes_xts_decrypt
.type rv64i_zvbb_zvkg_zvkned_aes_xts_decrypt,\@function
rv64i_zvbb_zvkg_zvkned_aes_xts_decrypt:
@{[compute_xts_iv0]}
# aes block size is 16
andi $TAIL_LENGTH, $LENGTH, 15
beqz $TAIL_LENGTH, 1f
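# With a partial tail, the last full block is processed together with the
# tail (ciphertext stealing), so exclude it from the bulk loop length.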
sub $LENGTH, $LENGTH, $TAIL_LENGTH
addi $LENGTH, $LENGTH, -16
1:
# Convert `LENGTH` from a byte count into a count of e32 elements.
srli $LEN32, $LENGTH, 2
# Load number of rounds
lwu $T0, 240($KEY1)
li $T1, 14
li $T2, 10
beq $T0, $T1, aes_xts_dec_256
beq $T0, $T2, aes_xts_dec_128
.size rv64i_zvbb_zvkg_zvkned_aes_xts_decrypt,.-rv64i_zvbb_zvkg_zvkned_aes_xts_decrypt
___
$code .= <<___;
.p2align 3
aes_xts_dec_128:
@{[init_first_round]}
@{[aes_128_load_key]}
beqz $LEN32, 2f
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
j 1f
.Ldec_blocks_128:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
# load ciphertext into v24
@{[vle32_v $V24, $INPUT]}
# update iv
@{[vgmul_vv $V16, $V20]}
# reverse the iv's bits order back
@{[vbrev8_v $V28, $V16]}
1:
@{[vxor_vv $V24, $V24, $V28]}
slli $T0, $VL, 2
sub $LEN32, $LEN32, $VL
add $INPUT, $INPUT, $T0
@{[aes_128_dec]}
@{[vxor_vv $V24, $V24, $V28]}
# store plaintext
@{[vse32_v $V24, $OUTPUT]}
add $OUTPUT, $OUTPUT, $T0
bnez $LEN32, .Ldec_blocks_128
2:
@{[handle_xts_dec_last_block]}
## xts second to last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vxor_vv $V24, $V24, $V29]}
@{[aes_128_dec]}
@{[vxor_vv $V24, $V24, $V29]}
@{[vmv_v_v $V25, $V24]}
# load last block ciphertext
@{[vsetvli "zero", $TAIL_LENGTH, "e8", "m1", "tu", "ma"]}
@{[vle8_v $V24, $INPUT]}
# store second to last block plaintext
addi $T0, $OUTPUT, 16
@{[vse8_v $V25, $T0]}
## xts last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vxor_vv $V24, $V24, $V28]}
@{[aes_128_dec]}
@{[vxor_vv $V24, $V24, $V28]}
# store second to last block plaintext
@{[vse32_v $V24, $OUTPUT]}
ret
.size aes_xts_dec_128,.-aes_xts_dec_128
___
$code .= <<___;
.p2align 3
aes_xts_dec_256:
@{[init_first_round]}
@{[aes_256_load_key]}
beqz $LEN32, 2f
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
j 1f
.Ldec_blocks_256:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
# load ciphertext into v24
@{[vle32_v $V24, $INPUT]}
# update iv
@{[vgmul_vv $V16, $V20]}
# reverse the iv's bits order back
@{[vbrev8_v $V28, $V16]}
1:
@{[vxor_vv $V24, $V24, $V28]}
slli $T0, $VL, 2
sub $LEN32, $LEN32, $VL
add $INPUT, $INPUT, $T0
@{[aes_256_dec]}
@{[vxor_vv $V24, $V24, $V28]}
# store plaintext
@{[vse32_v $V24, $OUTPUT]}
add $OUTPUT, $OUTPUT, $T0
bnez $LEN32, .Ldec_blocks_256
2:
@{[handle_xts_dec_last_block]}
## xts second to last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vxor_vv $V24, $V24, $V29]}
@{[aes_256_dec]}
@{[vxor_vv $V24, $V24, $V29]}
@{[vmv_v_v $V25, $V24]}
# load last block ciphertext
@{[vsetvli "zero", $TAIL_LENGTH, "e8", "m1", "tu", "ma"]}
@{[vle8_v $V24, $INPUT]}
# store second to last block plaintext
addi $T0, $OUTPUT, 16
@{[vse8_v $V25, $T0]}
## xts last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vxor_vv $V24, $V24, $V28]}
@{[aes_256_dec]}
@{[vxor_vv $V24, $V24, $V28]}
# store second to last block plaintext
@{[vse32_v $V24, $OUTPUT]}
ret
.size aes_xts_dec_256,.-aes_xts_dec_256
___
}
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,376 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector AES block cipher extension ('Zvkned')
# - RISC-V Zicclsm(Main memory supports misaligned loads/stores)
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
my $code=<<___;
.text
___
################################################################################
# void rv64i_zvkb_zvkned_ctr32_encrypt_blocks(const unsigned char *in,
# unsigned char *out, size_t blocks,
# const void *key,
# const unsigned char ivec[16]);
{
my ($INP, $OUTP, $BLOCK_NUM, $KEYP, $IVP) = ("a0", "a1", "a2", "a3", "a4");
my ($T0, $T1, $T2, $T3) = ("t0", "t1", "t2", "t3");
my ($VL) = ("t4");
my ($LEN32) = ("t5");
my ($CTR) = ("t6");
my ($MASK) = ("v0");
my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7,
$V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15,
$V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23,
$V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31,
) = map("v$_",(0..31));
# Prepare the AES ctr input data into v16.
sub init_aes_ctr_input {
my $code=<<___;
# Set up the mask in v0.
# The mask pattern selects every fourth element (the counter word of each
# 128-bit block):
# mask v0: [000100010001....]
# Note:
# We could set up the mask just for the required element count instead of
# for VLMAX.
li $T0, 0b10001000
@{[vsetvli $T2, "zero", "e8", "m1", "ta", "ma"]}
@{[vmv_v_x $MASK, $T0]}
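# Each mask byte is 0b10001000, so mask bits 3, 7, 11, ... are set,
# selecting the big-endian counter word of every 128-bit block.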
# Load IV.
# v31:[IV0, IV1, IV2, big-endian count]
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V31, $IVP]}
# Convert the big-endian counter into little-endian.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "mu"]}
@{[vrev8_v $V31, $V31, $MASK]}
# Splat the IV to v16
@{[vsetvli "zero", $LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V16, 0]}
@{[vaesz_vs $V16, $V31]}
# Prepare the ctr pattern into v20
# v20: [x, x, x, 0, x, x, x, 1, x, x, x, 2, ...]
@{[viota_m $V20, $MASK, $MASK]}
# v16:[IV0, IV1, IV2, count+0, IV0, IV1, IV2, count+1, ...]
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "mu"]}
@{[vadd_vv $V16, $V16, $V20, $MASK]}
___
return $code;
}
$code .= <<___;
.p2align 3
.globl rv64i_zvkb_zvkned_ctr32_encrypt_blocks
.type rv64i_zvkb_zvkned_ctr32_encrypt_blocks,\@function
rv64i_zvkb_zvkned_ctr32_encrypt_blocks:
beqz $BLOCK_NUM, 1f
# Load number of rounds
lwu $T0, 240($KEYP)
li $T1, 14
li $T2, 12
li $T3, 10
slli $LEN32, $BLOCK_NUM, 2
beq $T0, $T1, ctr32_encrypt_blocks_256
beq $T0, $T2, ctr32_encrypt_blocks_192
beq $T0, $T3, ctr32_encrypt_blocks_128
1:
ret
.size rv64i_zvkb_zvkned_ctr32_encrypt_blocks,.-rv64i_zvkb_zvkned_ctr32_encrypt_blocks
___
$code .= <<___;
.p2align 3
ctr32_encrypt_blocks_128:
# Load all 11 round keys to v1-v11 registers.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V2, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V3, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V4, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V5, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V6, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V7, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V8, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V9, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V10, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V11, $KEYP]}
@{[init_aes_ctr_input]}
##### AES body
j 2f
1:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "mu"]}
# Increase ctr in v16.
@{[vadd_vx $V16, $V16, $CTR, $MASK]}
2:
# Load plaintext into v20
@{[vle32_v $V20, $INP]}
slli $T0, $VL, 2
srli $CTR, $VL, 2
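# $CTR = VL/4 = number of 128-bit blocks processed this pass; it is the
# amount added to each counter word on the next iteration.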
sub $LEN32, $LEN32, $VL
add $INP, $INP, $T0
# Prepare the AES ctr input into v24.
# The ctr data uses big-endian form.
@{[vmv_v_v $V24, $V16]}
@{[vrev8_v $V24, $V24, $MASK]}
@{[vaesz_vs $V24, $V1]}
@{[vaesem_vs $V24, $V2]}
@{[vaesem_vs $V24, $V3]}
@{[vaesem_vs $V24, $V4]}
@{[vaesem_vs $V24, $V5]}
@{[vaesem_vs $V24, $V6]}
@{[vaesem_vs $V24, $V7]}
@{[vaesem_vs $V24, $V8]}
@{[vaesem_vs $V24, $V9]}
@{[vaesem_vs $V24, $V10]}
@{[vaesef_vs $V24, $V11]}
# ciphertext
@{[vxor_vv $V24, $V24, $V20]}
# Store the ciphertext.
@{[vse32_v $V24, $OUTP]}
add $OUTP, $OUTP, $T0
bnez $LEN32, 1b
ret
.size ctr32_encrypt_blocks_128,.-ctr32_encrypt_blocks_128
___
$code .= <<___;
.p2align 3
ctr32_encrypt_blocks_192:
# Load all 13 round keys to v1-v13 registers.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V2, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V3, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V4, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V5, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V6, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V7, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V8, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V9, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V10, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V11, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V12, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V13, $KEYP]}
@{[init_aes_ctr_input]}
##### AES body
j 2f
1:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "mu"]}
# Increase ctr in v16.
@{[vadd_vx $V16, $V16, $CTR, $MASK]}
2:
# Load plaintext into v20
@{[vle32_v $V20, $INP]}
slli $T0, $VL, 2
srli $CTR, $VL, 2
sub $LEN32, $LEN32, $VL
add $INP, $INP, $T0
# Prepare the AES ctr input into v24.
# The ctr data uses big-endian form.
@{[vmv_v_v $V24, $V16]}
@{[vrev8_v $V24, $V24, $MASK]}
@{[vaesz_vs $V24, $V1]}
@{[vaesem_vs $V24, $V2]}
@{[vaesem_vs $V24, $V3]}
@{[vaesem_vs $V24, $V4]}
@{[vaesem_vs $V24, $V5]}
@{[vaesem_vs $V24, $V6]}
@{[vaesem_vs $V24, $V7]}
@{[vaesem_vs $V24, $V8]}
@{[vaesem_vs $V24, $V9]}
@{[vaesem_vs $V24, $V10]}
@{[vaesem_vs $V24, $V11]}
@{[vaesem_vs $V24, $V12]}
@{[vaesef_vs $V24, $V13]}
# ciphertext
@{[vxor_vv $V24, $V24, $V20]}
# Store the ciphertext.
@{[vse32_v $V24, $OUTP]}
add $OUTP, $OUTP, $T0
bnez $LEN32, 1b
ret
.size ctr32_encrypt_blocks_192,.-ctr32_encrypt_blocks_192
___
$code .= <<___;
.p2align 3
ctr32_encrypt_blocks_256:
# Load all 15 round keys to v1-v15 registers.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V2, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V3, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V4, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V5, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V6, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V7, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V8, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V9, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V10, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V11, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V12, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V13, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V14, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V15, $KEYP]}
@{[init_aes_ctr_input]}
##### AES body
j 2f
1:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "mu"]}
# Increase ctr in v16.
@{[vadd_vx $V16, $V16, $CTR, $MASK]}
2:
# Load plaintext into v20
@{[vle32_v $V20, $INP]}
slli $T0, $VL, 2
srli $CTR, $VL, 2
sub $LEN32, $LEN32, $VL
add $INP, $INP, $T0
# Prepare the AES ctr input into v24.
# The ctr data uses big-endian form.
@{[vmv_v_v $V24, $V16]}
@{[vrev8_v $V24, $V24, $MASK]}
@{[vaesz_vs $V24, $V1]}
@{[vaesem_vs $V24, $V2]}
@{[vaesem_vs $V24, $V3]}
@{[vaesem_vs $V24, $V4]}
@{[vaesem_vs $V24, $V5]}
@{[vaesem_vs $V24, $V6]}
@{[vaesem_vs $V24, $V7]}
@{[vaesem_vs $V24, $V8]}
@{[vaesem_vs $V24, $V9]}
@{[vaesem_vs $V24, $V10]}
@{[vaesem_vs $V24, $V11]}
@{[vaesem_vs $V24, $V12]}
@{[vaesem_vs $V24, $V13]}
@{[vaesem_vs $V24, $V14]}
@{[vaesef_vs $V24, $V15]}
# ciphertext
@{[vxor_vv $V24, $V24, $V20]}
# Store the ciphertext.
@{[vse32_v $V24, $OUTP]}
add $OUTP, $OUTP, $T0
bnez $LEN32, 1b
ret
.size ctr32_encrypt_blocks_256,.-ctr32_encrypt_blocks_256
___
}
print $code;
close STDOUT or die "error closing STDOUT: $!";

12 file diffs suppressed because they are too large

View File

@@ -0,0 +1,931 @@
#! /usr/bin/env perl
# Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under 2-clause BSD license. October 2012.
# All rights reserved.
# ====================================================================
######################################################################
# AES for SPARC T4.
#
# AES round instructions complete in 3 cycles and can be issued every
# cycle. It means that round calculations should take 4*rounds cycles,
# because any given round instruction depends on result of *both*
# previous instructions:
#
# |0 |1 |2 |3 |4
# |01|01|01|
# |23|23|23|
# |01|01|...
# |23|...
#
# Provided that fxor [with IV] takes 3 cycles to complete, critical
# path length for CBC encrypt would be 3+4*rounds, or in other words
# it should process one byte in at least (3+4*rounds)/16 cycles. This
# estimate doesn't account for "collateral" instructions, such as
# fetching input from memory, xor-ing it with zero-round key and
# storing the result. Yet, *measured* performance [for data aligned
# at 64-bit boundary!] deviates from this equation by less than 0.5%:
#
# 128-bit key 192- 256-
# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90
# (*) numbers after slash are for
# misaligned data.
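#
# For example, with a 128-bit key (10 rounds) the estimate gives
# (3+4*10)/16 = 43/16 ~= 2.69 cycles per processed byte, in line with the
# measured 2.70.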
#
# Out-of-order execution logic managed to fully overlap "collateral"
# instructions with those on critical path. Amazing!
#
# As with Intel AES-NI, question is if it's possible to improve
# performance of parallelizable modes by interleaving round
# instructions. Provided round instruction latency and throughput
# optimal interleave factor is 2. But can we expect 2x performance
# improvement? Well, as round instructions can be issued one per
# cycle, they don't saturate the 2-way issue pipeline and therefore
# there is room for "collateral" calculations... Yet, 2x speed-up
# over CBC encrypt remains unattainable:
#
# 128-bit key 192- 256-
# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61
# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61
# (*) numbers after slash are for
# misaligned data.
#
# Estimates based on amount of instructions under assumption that
# round instructions are not pairable with any other instruction
# suggest that latter is the actual case and pipeline runs
# underutilized. It should be noted that T4 out-of-order execution
# logic is so capable that performance gain from 2x interleave is
# not even impressive, ~7-13% over non-interleaved code, largest
# for 256-bit keys.
# To anchor to something else, software implementation processes
# one byte in 29 cycles with 128-bit key on same processor. Intel
# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
# in 0.93, naturally with AES-NI.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";
$output = pop and open STDOUT,">$output";
$::evp=1; # if $evp is set to 0, script generates module with
# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
# points. These however are not fully compatible with openssl/aes.h,
# because they expect AES_KEY to be aligned at 64-bit boundary. When
# used through EVP, alignment is arranged at EVP layer. Second thing
# that is arranged by EVP is at least 32-bit alignment of IV.
######################################################################
# single-round subroutines
#
{
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.text
.globl aes_t4_encrypt
.align 32
aes_t4_encrypt:
andcc $inp, 7, %g1 ! is input aligned?
andn $inp, 7, $inp
ldx [$key + 0], %g4
ldx [$key + 8], %g5
ldx [$inp + 0], %o4
bz,pt %icc, 1f
ldx [$inp + 8], %o5
ldx [$inp + 16], $inp
sll %g1, 3, %g1
sub %g0, %g1, %o3
sllx %o4, %g1, %o4
sllx %o5, %g1, %g1
srlx %o5, %o3, %o5
srlx $inp, %o3, %o3
or %o5, %o4, %o4
or %o3, %g1, %o5
1:
ld [$key + 240], $rounds
ldd [$key + 16], %f12
ldd [$key + 24], %f14
xor %g4, %o4, %o4
xor %g5, %o5, %o5
movxtod %o4, %f0
movxtod %o5, %f2
srl $rounds, 1, $rounds
ldd [$key + 32], %f16
sub $rounds, 1, $rounds
ldd [$key + 40], %f18
add $key, 48, $key
.Lenc:
aes_eround01 %f12, %f0, %f2, %f4
aes_eround23 %f14, %f0, %f2, %f2
ldd [$key + 0], %f12
ldd [$key + 8], %f14
sub $rounds,1,$rounds
aes_eround01 %f16, %f4, %f2, %f0
aes_eround23 %f18, %f4, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
brnz,pt $rounds, .Lenc
add $key, 32, $key
andcc $out, 7, $tmp ! is output aligned?
aes_eround01 %f12, %f0, %f2, %f4
aes_eround23 %f14, %f0, %f2, %f2
aes_eround01_l %f16, %f4, %f2, %f0
aes_eround23_l %f18, %f4, %f2, %f2
bnz,pn %icc, 2f
nop
std %f0, [$out + 0]
retl
std %f2, [$out + 8]
2: alignaddrl $out, %g0, $out
mov 0xff, $mask
srl $mask, $tmp, $mask
faligndata %f0, %f0, %f4
faligndata %f0, %f2, %f6
faligndata %f2, %f2, %f8
stda %f4, [$out + $mask]0xc0 ! partial store
std %f6, [$out + 8]
add $out, 16, $out
orn %g0, $mask, $mask
retl
stda %f8, [$out + $mask]0xc0 ! partial store
.type aes_t4_encrypt,#function
.size aes_t4_encrypt,.-aes_t4_encrypt
.globl aes_t4_decrypt
.align 32
aes_t4_decrypt:
andcc $inp, 7, %g1 ! is input aligned?
andn $inp, 7, $inp
ldx [$key + 0], %g4
ldx [$key + 8], %g5
ldx [$inp + 0], %o4
bz,pt %icc, 1f
ldx [$inp + 8], %o5
ldx [$inp + 16], $inp
sll %g1, 3, %g1
sub %g0, %g1, %o3
sllx %o4, %g1, %o4
sllx %o5, %g1, %g1
srlx %o5, %o3, %o5
srlx $inp, %o3, %o3
or %o5, %o4, %o4
or %o3, %g1, %o5
1:
ld [$key + 240], $rounds
ldd [$key + 16], %f12
ldd [$key + 24], %f14
xor %g4, %o4, %o4
xor %g5, %o5, %o5
movxtod %o4, %f0
movxtod %o5, %f2
srl $rounds, 1, $rounds
ldd [$key + 32], %f16
sub $rounds, 1, $rounds
ldd [$key + 40], %f18
add $key, 48, $key
.Ldec:
aes_dround01 %f12, %f0, %f2, %f4
aes_dround23 %f14, %f0, %f2, %f2
ldd [$key + 0], %f12
ldd [$key + 8], %f14
sub $rounds,1,$rounds
aes_dround01 %f16, %f4, %f2, %f0
aes_dround23 %f18, %f4, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
brnz,pt $rounds, .Ldec
add $key, 32, $key
andcc $out, 7, $tmp ! is output aligned?
aes_dround01 %f12, %f0, %f2, %f4
aes_dround23 %f14, %f0, %f2, %f2
aes_dround01_l %f16, %f4, %f2, %f0
aes_dround23_l %f18, %f4, %f2, %f2
bnz,pn %icc, 2f
nop
std %f0, [$out + 0]
retl
std %f2, [$out + 8]
2: alignaddrl $out, %g0, $out
mov 0xff, $mask
srl $mask, $tmp, $mask
faligndata %f0, %f0, %f4
faligndata %f0, %f2, %f6
faligndata %f2, %f2, %f8
stda %f4, [$out + $mask]0xc0 ! partial store
std %f6, [$out + 8]
add $out, 16, $out
orn %g0, $mask, $mask
retl
stda %f8, [$out + $mask]0xc0 ! partial store
.type aes_t4_decrypt,#function
.size aes_t4_decrypt,.-aes_t4_decrypt
___
}
######################################################################
# key setup subroutines
#
{
my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
$code.=<<___;
.globl aes_t4_set_encrypt_key
.align 32
aes_t4_set_encrypt_key:
.Lset_encrypt_key:
and $inp, 7, $tmp
alignaddr $inp, %g0, $inp
cmp $bits, 192
ldd [$inp + 0], %f0
bl,pt %icc,.L128
ldd [$inp + 8], %f2
be,pt %icc,.L192
ldd [$inp + 16], %f4
brz,pt $tmp, .L256aligned
ldd [$inp + 24], %f6
ldd [$inp + 32], %f8
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
faligndata %f4, %f6, %f4
faligndata %f6, %f8, %f6
.L256aligned:
___
for ($i=0; $i<6; $i++) {
$code.=<<___;
std %f0, [$out + `32*$i+0`]
aes_kexpand1 %f0, %f6, $i, %f0
std %f2, [$out + `32*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `32*$i+16`]
aes_kexpand0 %f4, %f2, %f4
std %f6, [$out + `32*$i+24`]
aes_kexpand2 %f6, %f4, %f6
___
}
$code.=<<___;
std %f0, [$out + `32*$i+0`]
aes_kexpand1 %f0, %f6, $i, %f0
std %f2, [$out + `32*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `32*$i+16`]
std %f6, [$out + `32*$i+24`]
std %f0, [$out + `32*$i+32`]
std %f2, [$out + `32*$i+40`]
mov 14, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.align 16
.L192:
brz,pt $tmp, .L192aligned
nop
ldd [$inp + 24], %f6
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
faligndata %f4, %f6, %f4
.L192aligned:
___
for ($i=0; $i<7; $i++) {
$code.=<<___;
std %f0, [$out + `24*$i+0`]
aes_kexpand1 %f0, %f4, $i, %f0
std %f2, [$out + `24*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `24*$i+16`]
aes_kexpand2 %f4, %f2, %f4
___
}
$code.=<<___;
std %f0, [$out + `24*$i+0`]
aes_kexpand1 %f0, %f4, $i, %f0
std %f2, [$out + `24*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `24*$i+16`]
std %f0, [$out + `24*$i+24`]
std %f2, [$out + `24*$i+32`]
mov 12, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.align 16
.L128:
brz,pt $tmp, .L128aligned
nop
ldd [$inp + 16], %f4
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
.L128aligned:
___
for ($i=0; $i<10; $i++) {
$code.=<<___;
std %f0, [$out + `16*$i+0`]
aes_kexpand1 %f0, %f2, $i, %f0
std %f2, [$out + `16*$i+8`]
aes_kexpand2 %f2, %f0, %f2
___
}
$code.=<<___;
std %f0, [$out + `16*$i+0`]
std %f2, [$out + `16*$i+8`]
mov 10, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.type aes_t4_set_encrypt_key,#function
.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
.globl aes_t4_set_decrypt_key
.align 32
aes_t4_set_decrypt_key:
mov %o7, %o5
call .Lset_encrypt_key
nop
mov %o5, %o7
sll $tmp, 4, $inp ! $tmp is number of rounds
add $tmp, 2, $tmp
add $out, $inp, $inp ! $inp=$out+16*rounds
srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
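	! Build the decryption schedule by reversing the order of the
	! encryption round keys in place, swapping two 16-byte round keys
	! from each end of the schedule per iteration.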
.Lkey_flip:
ldd [$out + 0], %f0
ldd [$out + 8], %f2
ldd [$out + 16], %f4
ldd [$out + 24], %f6
ldd [$inp + 0], %f8
ldd [$inp + 8], %f10
ldd [$inp - 16], %f12
ldd [$inp - 8], %f14
sub $tmp, 1, $tmp
std %f0, [$inp + 0]
std %f2, [$inp + 8]
std %f4, [$inp - 16]
std %f6, [$inp - 8]
std %f8, [$out + 0]
std %f10, [$out + 8]
std %f12, [$out + 16]
std %f14, [$out + 24]
add $out, 32, $out
brnz $tmp, .Lkey_flip
sub $inp, 32, $inp
retl
xor %o0, %o0, %o0
.type aes_t4_set_decrypt_key,#function
.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
___
}
{{{
my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
$code.=<<___;
.align 32
_aes128_encrypt_1x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f48, %f0, %f2, %f4
aes_eround23 %f50, %f0, %f2, %f2
aes_eround01_l %f52, %f4, %f2, %f0
retl
aes_eround23_l %f54, %f4, %f2, %f2
.type _aes128_encrypt_1x,#function
.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
.align 32
_aes128_encrypt_2x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f48, %f0, %f2, %f8
aes_eround23 %f50, %f0, %f2, %f2
aes_eround01 %f48, %f4, %f6, %f10
aes_eround23 %f50, %f4, %f6, %f6
aes_eround01_l %f52, %f8, %f2, %f0
aes_eround23_l %f54, %f8, %f2, %f2
aes_eround01_l %f52, %f10, %f6, %f4
retl
aes_eround23_l %f54, %f10, %f6, %f6
.type _aes128_encrypt_2x,#function
.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
.align 32
_aes128_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<22;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes128_loadkey,#function
.size _aes128_loadkey,.-_aes128_loadkey
_aes128_load_enckey=_aes128_loadkey
_aes128_load_deckey=_aes128_loadkey
___
&alg_cbc_encrypt_implement("aes",128);
if ($::evp) {
&alg_ctr32_implement("aes",128);
&alg_xts_implement("aes",128,"en");
&alg_xts_implement("aes",128,"de");
}
&alg_cbc_decrypt_implement("aes",128);
$code.=<<___;
.align 32
_aes128_decrypt_1x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f48, %f0, %f2, %f4
aes_dround23 %f50, %f0, %f2, %f2
aes_dround01_l %f52, %f4, %f2, %f0
retl
aes_dround23_l %f54, %f4, %f2, %f2
.type _aes128_decrypt_1x,#function
.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
.align 32
_aes128_decrypt_2x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f48, %f0, %f2, %f8
aes_dround23 %f50, %f0, %f2, %f2
aes_dround01 %f48, %f4, %f6, %f10
aes_dround23 %f50, %f4, %f6, %f6
aes_dround01_l %f52, %f8, %f2, %f0
aes_dround23_l %f54, %f8, %f2, %f2
aes_dround01_l %f52, %f10, %f6, %f4
retl
aes_dround23_l %f54, %f10, %f6, %f6
.type _aes128_decrypt_2x,#function
.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
___
$code.=<<___;
.align 32
_aes192_encrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f56, %f0, %f2, %f4
aes_eround23 %f58, %f0, %f2, %f2
aes_eround01_l %f60, %f4, %f2, %f0
retl
aes_eround23_l %f62, %f4, %f2, %f2
.type _aes192_encrypt_1x,#function
.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
.align 32
_aes192_encrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f56, %f0, %f2, %f8
aes_eround23 %f58, %f0, %f2, %f2
aes_eround01 %f56, %f4, %f6, %f10
aes_eround23 %f58, %f4, %f6, %f6
aes_eround01_l %f60, %f8, %f2, %f0
aes_eround23_l %f62, %f8, %f2, %f2
aes_eround01_l %f60, %f10, %f6, %f4
retl
aes_eround23_l %f62, %f10, %f6, %f6
.type _aes192_encrypt_2x,#function
.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
.align 32
_aes256_encrypt_1x:
aes_eround01 %f16, %f0, %f2, %f4
aes_eround23 %f18, %f0, %f2, %f2
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_eround01 %f20, %f4, %f2, %f0
aes_eround23 %f22, %f4, %f2, %f2
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f16, %f0, %f2, %f4
aes_eround23 %f18, %f0, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_eround01_l %f20, %f4, %f2, %f0
aes_eround23_l %f22, %f4, %f2, %f2
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_encrypt_1x,#function
.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
.align 32
_aes256_encrypt_2x:
aes_eround01 %f16, %f0, %f2, %f8
aes_eround23 %f18, %f0, %f2, %f2
aes_eround01 %f16, %f4, %f6, %f10
aes_eround23 %f18, %f4, %f6, %f6
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_eround01 %f20, %f8, %f2, %f0
aes_eround23 %f22, %f8, %f2, %f2
aes_eround01 %f20, %f10, %f6, %f4
aes_eround23 %f22, %f10, %f6, %f6
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f16, %f0, %f2, %f8
aes_eround23 %f18, %f0, %f2, %f2
aes_eround01 %f16, %f4, %f6, %f10
aes_eround23 %f18, %f4, %f6, %f6
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_eround01_l %f20, %f8, %f2, %f0
aes_eround23_l %f22, %f8, %f2, %f2
aes_eround01_l %f20, %f10, %f6, %f4
aes_eround23_l %f22, %f10, %f6, %f6
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_encrypt_2x,#function
.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
.align 32
_aes192_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<26;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes192_loadkey,#function
.size _aes192_loadkey,.-_aes192_loadkey
_aes256_loadkey=_aes192_loadkey
_aes192_load_enckey=_aes192_loadkey
_aes192_load_deckey=_aes192_loadkey
_aes256_load_enckey=_aes192_loadkey
_aes256_load_deckey=_aes192_loadkey
___
&alg_cbc_encrypt_implement("aes",256);
&alg_cbc_encrypt_implement("aes",192);
if ($::evp) {
&alg_ctr32_implement("aes",256);
&alg_xts_implement("aes",256,"en");
&alg_xts_implement("aes",256,"de");
&alg_ctr32_implement("aes",192);
}
&alg_cbc_decrypt_implement("aes",192);
&alg_cbc_decrypt_implement("aes",256);
$code.=<<___;
.align 32
_aes256_decrypt_1x:
aes_dround01 %f16, %f0, %f2, %f4
aes_dround23 %f18, %f0, %f2, %f2
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_dround01 %f20, %f4, %f2, %f0
aes_dround23 %f22, %f4, %f2, %f2
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f16, %f0, %f2, %f4
aes_dround23 %f18, %f0, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_dround01_l %f20, %f4, %f2, %f0
aes_dround23_l %f22, %f4, %f2, %f2
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_decrypt_1x,#function
.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
.align 32
_aes256_decrypt_2x:
aes_dround01 %f16, %f0, %f2, %f8
aes_dround23 %f18, %f0, %f2, %f2
aes_dround01 %f16, %f4, %f6, %f10
aes_dround23 %f18, %f4, %f6, %f6
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_dround01 %f20, %f8, %f2, %f0
aes_dround23 %f22, %f8, %f2, %f2
aes_dround01 %f20, %f10, %f6, %f4
aes_dround23 %f22, %f10, %f6, %f6
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f16, %f0, %f2, %f8
aes_dround23 %f18, %f0, %f2, %f2
aes_dround01 %f16, %f4, %f6, %f10
aes_dround23 %f18, %f4, %f6, %f6
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_dround01_l %f20, %f8, %f2, %f0
aes_dround23_l %f22, %f8, %f2, %f2
aes_dround01_l %f20, %f10, %f6, %f4
aes_dround23_l %f22, %f10, %f6, %f6
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_decrypt_2x,#function
.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
.align 32
_aes192_decrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f4
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01_l %f60, %f4, %f2, %f0
retl
aes_dround23_l %f62, %f4, %f2, %f2
.type _aes192_decrypt_1x,#function
.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
.align 32
_aes192_decrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f8
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01 %f56, %f4, %f6, %f10
aes_dround23 %f58, %f4, %f6, %f6
aes_dround01_l %f60, %f8, %f2, %f0
aes_dround23_l %f62, %f8, %f2, %f2
aes_dround01_l %f60, %f10, %f6, %f4
retl
aes_dround23_l %f62, %f10, %f6, %f6
.type _aes192_decrypt_2x,#function
.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
___
}}}
if (!$::evp) {
$code.=<<___;
.global AES_encrypt
AES_encrypt=aes_t4_encrypt
.global AES_decrypt
AES_decrypt=aes_t4_decrypt
.global AES_set_encrypt_key
.align 32
AES_set_encrypt_key:
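! Arguments follow AES_set_encrypt_key(userKey, bits, key), i.e. per the
! SPARC calling convention %o0 = userKey, %o1 = bits, %o2 = AES_KEY *key.
! Returns -1 when a pointer is NULL or the key structure is misaligned and
! -2 when the bit length is unsupported, matching the error codes of the
! generic AES_set_encrypt_key; otherwise it tail-calls aes_t4_set_encrypt_key.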
andcc %o2, 7, %g0 ! check alignment
bnz,a,pn %icc, 1f
mov -1, %o0
brz,a,pn %o0, 1f
mov -1, %o0
brz,a,pn %o2, 1f
mov -1, %o0
andncc %o1, 0x1c0, %g0
bnz,a,pn %icc, 1f
mov -2, %o0
cmp %o1, 128
bl,a,pn %icc, 1f
mov -2, %o0
b aes_t4_set_encrypt_key
nop
1: retl
nop
.type AES_set_encrypt_key,#function
.size AES_set_encrypt_key,.-AES_set_encrypt_key
.global AES_set_decrypt_key
.align 32
AES_set_decrypt_key:
andcc %o2, 7, %g0 ! check alignment
bnz,a,pn %icc, 1f
mov -1, %o0
brz,a,pn %o0, 1f
mov -1, %o0
brz,a,pn %o2, 1f
mov -1, %o0
andncc %o1, 0x1c0, %g0
bnz,a,pn %icc, 1f
mov -2, %o0
cmp %o1, 128
bl,a,pn %icc, 1f
mov -2, %o0
b aes_t4_set_decrypt_key
nop
1: retl
nop
.type AES_set_decrypt_key,#function
.size AES_set_decrypt_key,.-AES_set_decrypt_key
___
my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
$code.=<<___;
.globl AES_cbc_encrypt
.align 32
AES_cbc_encrypt:
ld [$key + 240], %g1
nop
brz $enc, .Lcbc_decrypt
cmp %g1, 12
bl,pt %icc, aes128_t4_cbc_encrypt
nop
be,pn %icc, aes192_t4_cbc_encrypt
nop
ba aes256_t4_cbc_encrypt
nop
.Lcbc_decrypt:
bl,pt %icc, aes128_t4_cbc_decrypt
nop
be,pn %icc, aes192_t4_cbc_decrypt
nop
ba aes256_t4_cbc_decrypt
nop
.type AES_cbc_encrypt,#function
.size AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
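# The AES_cbc_encrypt glue above dispatches on AES_KEY->rounds, loaded from
# [$key + 240]: fewer than 12 rounds selects the 128-bit routines, exactly 12
# the 192-bit ones, anything larger the 256-bit ones. A minimal Perl sketch of
# that rule (illustration only, never executed by this generator; the helper
# name is hypothetical):
#
#   sub t4_cbc_routine {
#       my ($rounds, $enc) = @_;    # rounds is 10, 12 or 14
#       my $bits = $rounds < 12 ? 128 : $rounds == 12 ? 192 : 256;
#       return sprintf("aes%d_t4_cbc_%s", $bits, $enc ? "encrypt" : "decrypt");
#   }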
$code.=<<___;
.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
.align 4
___
&emit_assembler();
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,914 @@
#! /usr/bin/env perl
# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as a 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (it doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original, nor does it make assumptions
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
#               aes-586.pl      vpaes-x86.pl
#
# Core 2(**)    28.1/41.4/18.3  21.9/25.2(***)
# Nehalem       27.9/40.4/18.1  10.2/11.9
# Atom          70.7/92.1/60.1  61.1/75.4(***)
# Silvermont    45.4/62.9/24.1  49.2/61.1(***)
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
# majority of contemporary cores share cache, slower code path
# is common place. In other words "with-hyper-threading-off"
# results are presented mostly for reference purposes.
#
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) The less impressive improvement on Core 2 and Atom is due to slow
# pshufb, yet it is still a respectable +28%/64% improvement on Core 2
# and +15% on Atom (as implied, over the "hyper-threading-safe"
# code path).
#
# <appro@openssl.org>
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output = pop and open STDOUT,">$output";
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
$PREFIX="vpaes";
my ($round, $base, $magic, $key, $const, $inp, $out)=
("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");
&set_label("_vpaes_consts",64);
$k_inv=-0x30; # inv, inva
&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
$k_s0F=-0x10; # s0F
&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
$k_ipt=0x00; # input transform (lo, hi)
&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
$k_sb1=0x20; # sb1u, sb1t
&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40; # sb2u, sb2t
&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60; # sbou, sbot
&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
$k_mc_forward=0x80; # mc_forward
&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
$k_mc_backward=0xc0; # mc_backward
&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
$k_sr=0x100; # sr
&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
$k_rcon=0x140; # rcon
&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
$k_s63=0x150; # s63: all equal to 0x63 transformed
&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
$k_opt=0x160; # output transform
&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
## Decryption stuff
## Key schedule constants
##
$k_dksd=0x1a0; # decryption key schedule: invskew x*D
&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0; # decryption key schedule: invskew x*B
&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200; # decryption key schedule: invskew x*9
&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
##
## Decryption stuff
## Round function constants
##
$k_dipt=0x220; # decryption input transform
&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0; # decryption sbox final output
&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align (64);
&function_begin_B("_vpaes_preheat");
&add ($const,&DWP(0,"esp"));
&movdqa ("xmm7",&QWP($k_inv,$const));
&movdqa ("xmm6",&QWP($k_s0F,$const));
&ret ();
&function_end_B("_vpaes_preheat");
##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm6-%xmm7 as in _vpaes_preheat
## (%edx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
&function_begin_B("_vpaes_encrypt_core");
&mov ($magic,16);
&mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6")
&movdqa ("xmm2",&QWP($k_ipt,$const));
&pandn ("xmm1","xmm0");
&pand ("xmm0","xmm6");
&movdqu ("xmm5",&QWP(0,$key));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
&pxor ("xmm2","xmm5");
&psrld ("xmm1",4);
&add ($key,16);
&pshufb ("xmm0","xmm1");
&lea ($base,&DWP($k_mc_backward,$const));
&pxor ("xmm0","xmm2");
&jmp (&label("enc_entry"));
&set_label("enc_loop",16);
# middle of middle round
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
&pshufb ("xmm4","xmm2"); # 4 = sb1u
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
&pxor ("xmm0","xmm4"); # 0 = A
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
&pshufb ("xmm5","xmm2"); # 4 = sb2u
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
&pshufb ("xmm2","xmm3"); # 2 = sb2t
&movdqa ("xmm3","xmm0"); # 3 = A
&pxor ("xmm2","xmm5"); # 2 = 2A
&pshufb ("xmm0","xmm1"); # 0 = B
&add ($key,16); # next key
&pxor ("xmm0","xmm2"); # 0 = 2A+B
&pshufb ("xmm3","xmm4"); # 3 = D
&add ($magic,16); # next mc
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
&and ($magic,0x30); # ... mod 4
&sub ($round,1); # nr--
&pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
&set_label("enc_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm6"); # 0 = k
&pshufb ("xmm5","xmm0"); # 2 = a/k
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&movdqu ("xmm5",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("enc_loop"));
# middle of last round
&movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
&movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm1");
&ret ();
&function_end_B("_vpaes_encrypt_core");
##
## Decryption core
##
## Same API as encryption core.
##
&function_begin_B("_vpaes_decrypt_core");
&lea ($base,&DWP($k_dsbd,$const));
&mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6");
&movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
&pandn ("xmm1","xmm0");
&mov ($magic,$round);
&psrld ("xmm1",4)
&movdqu ("xmm5",&QWP(0,$key));
&shl ($magic,4);
&pand ("xmm0","xmm6");
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
&xor ($magic,0x30);
&pshufb ("xmm0","xmm1");
&and ($magic,0x30);
&pxor ("xmm2","xmm5");
&movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
&pxor ("xmm0","xmm2");
&add ($key,16);
&lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
&jmp (&label("dec_entry"));
&set_label("dec_loop",16);
##
## Inverse mix columns
##
&movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
&movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t
&pshufb ("xmm4","xmm2"); # 4 = sb9u
&pshufb ("xmm1","xmm3"); # 0 = sb9t
&pxor ("xmm0","xmm4");
&movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
&pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x10,$base)); # 0 : sbdt
&pshufb ("xmm4","xmm2"); # 4 = sbdu
&pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbdt
&pxor ("xmm0","xmm4"); # 4 = ch
&movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
&pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt
&pshufb ("xmm4","xmm2"); # 4 = sbbu
&pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbbt
&pxor ("xmm0","xmm4"); # 4 = ch
&movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
&pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x50,$base)); # 0 : sbet
&pshufb ("xmm4","xmm2"); # 4 = sbeu
&pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbet
&pxor ("xmm0","xmm4"); # 4 = ch
&add ($key,16); # next round key
&palignr("xmm5","xmm5",12);
&pxor ("xmm0","xmm1"); # 0 = ch
&sub ($round,1); # nr--
&set_label("dec_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&pand ("xmm0","xmm6"); # 0 = k
&psrld ("xmm1",4); # 1 = i
&pshufb ("xmm2","xmm0"); # 2 = a/k
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&movdqu ("xmm0",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("dec_loop"));
# middle of last round
&movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm0"); # 4 = sb1u + k
&movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
&movdqa ("xmm2",&QWP(0,$magic));
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_decrypt_core");
########################################################
## ##
## AES key schedule ##
## ##
########################################################
&function_begin_B("_vpaes_schedule_core");
&add ($const,&DWP(0,"esp"));
&movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
&movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
# input transform
&movdqa ("xmm3","xmm0");
&lea ($base,&DWP($k_ipt,$const));
&movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
&call ("_vpaes_schedule_transform");
&movdqa ("xmm7","xmm0");
&test ($out,$out);
&jnz (&label("schedule_am_decrypting"));
# encrypting, output zeroth round key after transform
&movdqu (&QWP(0,$key),"xmm0");
&jmp (&label("schedule_go"));
&set_label("schedule_am_decrypting");
# decrypting, output zeroth round key after shiftrows
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&movdqu (&QWP(0,$key),"xmm3");
&xor ($magic,0x30);
&set_label("schedule_go");
&cmp ($round,192);
&ja (&label("schedule_256"));
&je (&label("schedule_192"));
# 128: fall through
##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
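## A quick count: the zeroth (transformed) key is stored before schedule_go,
## the loop below runs 10 times, and the last pass branches to
## schedule_mangle_last instead of _vpaes_schedule_mangle, so 1 + 9 + 1 = 11
## round keys are written -- exactly what AES-128 needs.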
&set_label("schedule_128");
&mov ($round,10);
&set_label("loop_schedule_128");
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # write output
&jmp (&label("loop_schedule_128"));
##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
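## With $round = 4 below, each of the first three passes through the loop
## calls _vpaes_schedule_mangle three times, the fourth calls it twice before
## branching to schedule_mangle_last, and the zeroth key was already stored
## before schedule_go: 1 + 3*3 + 2 + 1 = 13 round keys, as AES-192 requires.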
&set_label("schedule_192",16);
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&movdqa ("xmm6","xmm0"); # save short part
&pxor ("xmm4","xmm4"); # clear 4
&movhlps("xmm6","xmm4"); # clobber low side with zeros
&mov ($round,4);
&set_label("loop_schedule_192");
&call ("_vpaes_schedule_round");
&palignr("xmm0","xmm6",8);
&call ("_vpaes_schedule_mangle"); # save key n
&call ("_vpaes_schedule_192_smear");
&call ("_vpaes_schedule_mangle"); # save key n+1
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # save key n+2
&call ("_vpaes_schedule_192_smear");
&jmp (&label("loop_schedule_192"));
##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
&call ("_vpaes_schedule_mangle"); # output low result
&movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
# high round
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
&movdqa (&QWP(20,"esp"),"xmm7");
&movdqa ("xmm7","xmm6");
&call ("_vpaes_schedule_low_round");
&movdqa ("xmm7",&QWP(20,"esp"));
&jmp (&label("loop_schedule_256"));
##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
# schedule last round key from xmm0
&lea ($base,&DWP($k_deskew,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_last_dec"));
# encrypting
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm0","xmm1"); # output permute
&lea ($base,&DWP($k_opt,$const)); # prepare to output transform
&add ($key,32);
&set_label("schedule_mangle_last_dec");
&add ($key,-16);
&pxor ("xmm0",&QWP($k_s63,$const));
&call ("_vpaes_schedule_transform"); # output transform
&movdqu (&QWP(0,$key),"xmm0"); # save last key
# cleanup
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&ret ();
&function_end_B("_vpaes_schedule_core");
##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## %xmm13: 0
##
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
##
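## The pshufd immediates below encode four dword selectors, two bits each and
## low dword first: 0x80 = (0,0,0,2) replicates the zero dwords and pulls up
## "c", giving c 0 0 0, while 0xFE = (2,3,3,3) turns b a x y into b b b a.
## The subsequent xors accumulate these into the outputs listed above.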
&function_begin_B("_vpaes_schedule_192_smear");
&pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0
&pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
&pxor ("xmm6","xmm1"); # -> c+d c 0 0
&pxor ("xmm1","xmm1");
&pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
&movdqa ("xmm0","xmm6");
&movhlps("xmm6","xmm1"); # clobber low side with zeros
&ret ();
&function_end_B("_vpaes_schedule_192_smear");
##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm5.
##
&function_begin_B("_vpaes_schedule_round");
# extract rcon from xmm8
&movdqa ("xmm2",&QWP(8,"esp")); # xmm8
&pxor ("xmm1","xmm1");
&palignr("xmm1","xmm2",15);
&palignr("xmm2","xmm2",15);
&pxor ("xmm7","xmm1");
# rotate
&pshufd ("xmm0","xmm0",0xFF);
&palignr("xmm0","xmm0",1);
# fall through...
&movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
# smear xmm7
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",4);
&pxor ("xmm7","xmm1");
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",8);
&pxor ("xmm7","xmm1");
&pxor ("xmm7",&QWP($k_s63,$const));
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
&movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pshufb ("xmm2","xmm0"); # 2 = a/k
&pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm5"); # 3 : 1/i
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm5"); # 4 : 1/j
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm5"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm5"); # 3 : 1/jak
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&pxor ("xmm3","xmm1"); # 3 = jo
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = sbox output
# add in smeared stuff
&pxor ("xmm0","xmm7");
&movdqa ("xmm7","xmm0");
&ret ();
&function_end_B("_vpaes_schedule_round");
##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%ebx)
##
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
&movdqa ("xmm2",&QWP($k_s0F,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4);
&pand ("xmm0","xmm2");
&movdqa ("xmm2",&QWP(0,$base));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP(16,$base));
&pshufb ("xmm0","xmm1");
&pxor ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_schedule_transform");
##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%edx), and increments or decrements it
## Keeps track of round number mod 4 in %ecx
## Preserves xmm0
## Clobbers xmm1-xmm5
##
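## On the encrypt path this amounts to three rotate-and-xor steps: writing
## r() for the byte rotation encoded by $k_mc_forward, the code below forms
## r(x) ^ r(r(x)) ^ r(r(r(x))) of the key after the $k_s63 xor, which is the
## multiplication by the circulant (0,1,1,1) mentioned above.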
&function_begin_B("_vpaes_schedule_mangle");
&movdqa ("xmm4","xmm0"); # save xmm0 for later
&movdqa ("xmm5",&QWP($k_mc_forward,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_dec"));
# encrypting
&add ($key,16);
&pxor ("xmm4",&QWP($k_s63,$const));
&pshufb ("xmm4","xmm5");
&movdqa ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&jmp (&label("schedule_mangle_both"));
&set_label("schedule_mangle_dec",16);
# inverse mix columns
&movdqa ("xmm2",&QWP($k_s0F,$const));
&lea ($inp,&DWP($k_dksd,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm4");
&psrld ("xmm1",4); # 1 = hi
&pand ("xmm4","xmm2"); # 4 = lo
&movdqa ("xmm2",&QWP(0,$inp));
&pshufb ("xmm2","xmm4");
&movdqa ("xmm3",&QWP(0x10,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x20,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x30,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x40,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x50,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x60,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x70,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&add ($key,-16);
&set_label("schedule_mangle_both");
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&add ($magic,-16);
&and ($magic,0x30);
&movdqu (&QWP(0,$key),"xmm3");
&ret ();
&function_end_B("_vpaes_schedule_mangle");
#
# Interface to OpenSSL
#
&function_begin("${PREFIX}_set_encrypt_key");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&mov ($magic,0x30);
&mov ($out,0);
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");
&function_begin("${PREFIX}_set_decrypt_key");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&shl ($base,4);
&lea ($key,&DWP(16,$key,$base));
&mov ($out,1);
&mov ($magic,$round);
&shr ($magic,1);
&and ($magic,32);
&xor ($magic,32); # nbist==192?0:32;
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");
&function_begin("${PREFIX}_encrypt");
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_encrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");
&function_begin("${PREFIX}_decrypt");
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_decrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");
&function_begin("${PREFIX}_cbc_encrypt");
&mov ($inp,&wparam(0)); # inp
&mov ($out,&wparam(1)); # out
&mov ($round,&wparam(2)); # len
&mov ($key,&wparam(3)); # key
&sub ($round,16);
&jc (&label("cbc_abort"));
&lea ($base,&DWP(-56,"esp"));
&mov ($const,&wparam(4)); # ivp
&and ($base,-16);
&mov ($magic,&wparam(5)); # enc
&xchg ($base,"esp"); # alloca
&movdqu ("xmm1",&QWP(0,$const)); # load IV
&sub ($out,$inp);
&mov (&DWP(48,"esp"),$base);
&mov (&DWP(0,"esp"),$out); # save out
&mov (&DWP(4,"esp"),$key) # save key
&mov (&DWP(8,"esp"),$const); # save ivp
&mov ($out,$round); # $out works as $len
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&cmp ($magic,0);
&je (&label("cbc_dec_loop"));
&jmp (&label("cbc_enc_loop"));
&set_label("cbc_enc_loop",16);
&movdqu ("xmm0",&QWP(0,$inp)); # load input
&pxor ("xmm0","xmm1"); # inp^=iv
&call ("_vpaes_encrypt_core");
&mov ($base,&DWP(0,"esp")); # restore out
&mov ($key,&DWP(4,"esp")); # restore key
&movdqa ("xmm1","xmm0");
&movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
&lea ($inp,&DWP(16,$inp));
&sub ($out,16);
&jnc (&label("cbc_enc_loop"));
&jmp (&label("cbc_done"));
&set_label("cbc_dec_loop",16);
&movdqu ("xmm0",&QWP(0,$inp)); # load input
&movdqa (&QWP(16,"esp"),"xmm1"); # save IV
&movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
&call ("_vpaes_decrypt_core");
&mov ($base,&DWP(0,"esp")); # restore out
&mov ($key,&DWP(4,"esp")); # restore key
&pxor ("xmm0",&QWP(16,"esp")); # out^=iv
&movdqa ("xmm1",&QWP(32,"esp")); # load next IV
&movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
&lea ($inp,&DWP(16,$inp));
&sub ($out,16);
&jnc (&label("cbc_dec_loop"));
&set_label("cbc_done");
&mov ($base,&DWP(8,"esp")); # restore ivp
&mov ("esp",&DWP(48,"esp"));
&movdqu (&QWP(0,$base),"xmm1"); # write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
