Adding in curl and openssl repos

2025-08-14 12:09:30 -04:00
parent af2117b574
commit 0ace93e303
21174 changed files with 3607720 additions and 2 deletions

8 file diffs suppressed because they are too large

View File

@@ -0,0 +1,594 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2022, Hongren (Zenithal) Zheng <i@zenithal.me>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
################################################################################
# Utility functions to help with keeping track of which registers to stack/
# unstack when entering / exiting routines.
################################################################################
{
# Callee-saved registers
my @callee_saved = map("x$_",(2,8,9,18..27));
# Caller-saved registers
my @caller_saved = map("x$_",(1,5..7,10..17,28..31));
my @must_save;
sub use_reg {
my $reg = shift;
if (grep(/^$reg$/, @callee_saved)) {
push(@must_save, $reg);
} elsif (!grep(/^$reg$/, @caller_saved)) {
# Register is not usable!
die("Unusable register ".$reg);
}
return $reg;
}
sub use_regs {
return map(use_reg("x$_"), @_);
}
sub save_regs {
my $ret = '';
my $stack_reservation = ($#must_save + 1) * 8;
my $stack_offset = $stack_reservation;
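# The RISC-V psABI requires the stack pointer to stay 16-byte aligned, so
# round the reservation up when an odd number of registers is saved.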
if ($stack_reservation % 16) {
$stack_reservation += 8;
}
$ret.=" addi sp,sp,-$stack_reservation\n";
foreach (@must_save) {
$stack_offset -= 8;
$ret.=" sd $_,$stack_offset(sp)\n";
}
return $ret;
}
sub load_regs {
my $ret = '';
my $stack_reservation = ($#must_save + 1) * 8;
my $stack_offset = $stack_reservation;
if ($stack_reservation % 16) {
$stack_reservation += 8;
}
foreach (@must_save) {
$stack_offset -= 8;
$ret.=" ld $_,$stack_offset(sp)\n";
}
$ret.=" addi sp,sp,$stack_reservation\n";
return $ret;
}
sub clear_regs {
@must_save = ();
}
}
################################################################################
# Register assignment for rv64i_zkne_encrypt and rv64i_zknd_decrypt
################################################################################
# Registers to hold AES state (called s0-s3 or y0-y3 elsewhere)
my ($Q0,$Q1,$Q2,$Q3) = use_regs(6..9);
# Function arguments (x10-x12 are a0-a2 in the ABI)
# Input block pointer, output block pointer, key pointer
my ($INP,$OUTP,$KEYP) = use_regs(10..12);
# Temporaries
my ($T0,$T1) = use_regs(13..14);
# Loop counter
my ($loopcntr) = use_regs(30);
################################################################################
# void rv64i_zkne_encrypt(const unsigned char *in, unsigned char *out,
# const AES_KEY *key);
################################################################################
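# Note: OpenSSL's AES_KEY stores the expanded round keys from offset 0
# (16 bytes per round key) and the round count as a 32-bit integer at byte
# offset 240, which is why the code below reads `240($KEYP)`.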
my $code .= <<___;
.text
.balign 16
.globl rv64i_zkne_encrypt
.type rv64i_zkne_encrypt,\@function
rv64i_zkne_encrypt:
___
$code .= save_regs();
$code .= <<___;
# Load input to block cipher
ld $Q0,0($INP)
ld $Q1,8($INP)
# Load key
ld $T0,0($KEYP)
ld $T1,8($KEYP)
# Load number of rounds
lwu $loopcntr,240($KEYP)
# initial transformation
xor $Q0,$Q0,$T0
xor $Q1,$Q1,$T1
# The main loop only executes the first N-1 rounds.
add $loopcntr,$loopcntr,-1
# Do Nr - 1 rounds (final round is special)
1:
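# Each aes64esm takes the full 128-bit state from its two source registers
# and produces one 64-bit half of the next round state; swapping the source
# operands yields the other half.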
@{[aes64esm $Q2,$Q0,$Q1]}
@{[aes64esm $Q3,$Q1,$Q0]}
# Update key ptr to point to next key in schedule
add $KEYP,$KEYP,16
# Grab next key in schedule
ld $T0,0($KEYP)
ld $T1,8($KEYP)
xor $Q0,$Q2,$T0
xor $Q1,$Q3,$T1
add $loopcntr,$loopcntr,-1
bgtz $loopcntr,1b
# final round
@{[aes64es $Q2,$Q0,$Q1]}
@{[aes64es $Q3,$Q1,$Q0]}
# since not added 16 before
ld $T0,16($KEYP)
ld $T1,24($KEYP)
xor $Q0,$Q2,$T0
xor $Q1,$Q3,$T1
sd $Q0,0($OUTP)
sd $Q1,8($OUTP)
# Pop registers and return
___
$code .= load_regs();
$code .= <<___;
ret
___
################################################################################
# void rv64i_zknd_decrypt(const unsigned char *in, unsigned char *out,
# const AES_KEY *key);
################################################################################
$code .= <<___;
.text
.balign 16
.globl rv64i_zknd_decrypt
.type rv64i_zknd_decrypt,\@function
rv64i_zknd_decrypt:
___
$code .= save_regs();
$code .= <<___;
# Load input to block cipher
ld $Q0,0($INP)
ld $Q1,8($INP)
# Load number of rounds
lwu $loopcntr,240($KEYP)
# Load the last key
slli $T0,$loopcntr,4
add $KEYP,$KEYP,$T0
ld $T0,0($KEYP)
ld $T1,8($KEYP)
xor $Q0,$Q0,$T0
xor $Q1,$Q1,$T1
# The main loop only executes the first N-1 rounds.
add $loopcntr,$loopcntr,-1
# Do Nr - 1 rounds (final round is special)
1:
@{[aes64dsm $Q2,$Q0,$Q1]}
@{[aes64dsm $Q3,$Q1,$Q0]}
# Update key ptr to point to next key in schedule
add $KEYP,$KEYP,-16
# Grab next key in schedule
ld $T0,0($KEYP)
ld $T1,8($KEYP)
xor $Q0,$Q2,$T0
xor $Q1,$Q3,$T1
add $loopcntr,$loopcntr,-1
bgtz $loopcntr,1b
# final round
@{[aes64ds $Q2,$Q0,$Q1]}
@{[aes64ds $Q3,$Q1,$Q0]}
add $KEYP,$KEYP,-16
ld $T0,0($KEYP)
ld $T1,8($KEYP)
xor $Q0,$Q2,$T0
xor $Q1,$Q3,$T1
sd $Q0,0($OUTP)
sd $Q1,8($OUTP)
# Pop registers and return
___
$code .= load_regs();
$code .= <<___;
ret
___
clear_regs();
################################################################################
# Register assignment for rv64i_zkn[e/d]_set_[en/de]crypt_key
################################################################################
# Function arguments (x10-x12 are a0-a2 in the ABI)
# Pointer to user key, number of bits in key, key pointer
my ($UKEY,$BITS,$KEYP) = use_regs(10..12);
# Temporaries
my ($T0,$T1,$T2,$T3,$T4) = use_regs(6..8,13..14);
################################################################################
# utility functions for rv64i_zkne_set_encrypt_key
################################################################################
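# The key expansion below follows the usual AES schedule: aes64ks1i applies
# SubWord/rotation and the round constant selected by its immediate to the
# previous key words, and aes64ks2 XOR-accumulates key words to form the
# next 16-byte round key, which is then stored into the schedule.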
sub ke128enc {
my $rnum = 0;
my $ret = '';
$ret .= <<___;
ld $T0,0($UKEY)
ld $T1,8($UKEY)
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
while($rnum < 10) {
$ret .= <<___;
@{[aes64ks1i $T2,$T1,$rnum]}
@{[aes64ks2 $T0,$T2,$T0]}
@{[aes64ks2 $T1,$T0,$T1]}
add $KEYP,$KEYP,16
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
$rnum++;
}
return $ret;
}
sub ke192enc {
my $rnum = 0;
my $ret = '';
$ret .= <<___;
ld $T0,0($UKEY)
ld $T1,8($UKEY)
ld $T2,16($UKEY)
sd $T0,0($KEYP)
sd $T1,8($KEYP)
sd $T2,16($KEYP)
___
while($rnum < 8) {
$ret .= <<___;
@{[aes64ks1i $T3,$T2,$rnum]}
@{[aes64ks2 $T0,$T3,$T0]}
@{[aes64ks2 $T1,$T0,$T1]}
___
if ($rnum != 7) {
# Note: the initial store plus 8 iterations of 24 bytes each would write
# (8+1)*24 = 216 bytes, but the AES-192 key schedule is only
# (12+1)*16 = 208 bytes, so the last 8 bytes can be dropped.
$ret .= <<___;
@{[aes64ks2 $T2,$T1,$T2]}
___
}
$ret .= <<___;
add $KEYP,$KEYP,24
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
if ($rnum != 7) {
$ret .= <<___;
sd $T2,16($KEYP)
___
}
$rnum++;
}
return $ret;
}
sub ke256enc {
my $rnum = 0;
my $ret = '';
$ret .= <<___;
ld $T0,0($UKEY)
ld $T1,8($UKEY)
ld $T2,16($UKEY)
ld $T3,24($UKEY)
sd $T0,0($KEYP)
sd $T1,8($KEYP)
sd $T2,16($KEYP)
sd $T3,24($KEYP)
___
while($rnum < 7) {
$ret .= <<___;
@{[aes64ks1i $T4,$T3,$rnum]}
@{[aes64ks2 $T0,$T4,$T0]}
@{[aes64ks2 $T1,$T0,$T1]}
add $KEYP,$KEYP,32
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
if ($rnum != 6) {
# Note: the initial store plus 7 iterations of 32 bytes each would write
# (7+1)*32 = 256 bytes, but the AES-256 key schedule is only
# (14+1)*16 = 240 bytes, so the last 16 bytes can be dropped.
$ret .= <<___;
@{[aes64ks1i $T4,$T1,0xA]}
@{[aes64ks2 $T2,$T4,$T2]}
@{[aes64ks2 $T3,$T2,$T3]}
sd $T2,16($KEYP)
sd $T3,24($KEYP)
___
}
$rnum++;
}
return $ret;
}
################################################################################
# void rv64i_zkne_set_encrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
################################################################################
sub AES_set_common {
my ($ke128, $ke192, $ke256) = @_;
my $ret = '';
$ret .= <<___;
bnez $UKEY,1f # if (!userKey || !key) return -1;
bnez $KEYP,1f
li a0,-1
ret
1:
# Determine number of rounds from key size in bits
li $T0,128
bne $BITS,$T0,1f
li $T1,10 # key->rounds = 10 if bits == 128
sw $T1,240($KEYP) # store key->rounds
$ke128
j 4f
1:
li $T0,192
bne $BITS,$T0,2f
li $T1,12 # key->rounds = 12 if bits == 192
sw $T1,240($KEYP) # store key->rounds
$ke192
j 4f
2:
li $T1,14 # key->rounds = 14 if bits == 256
li $T0,256
beq $BITS,$T0,3f
li a0,-2 # If bits != 128, 192, or 256, return -2
j 5f
3:
sw $T1,240($KEYP) # store key->rounds
$ke256
4: # return 0
li a0,0
5: # return a0
___
return $ret;
}
$code .= <<___;
.text
.balign 16
.globl rv64i_zkne_set_encrypt_key
.type rv64i_zkne_set_encrypt_key,\@function
rv64i_zkne_set_encrypt_key:
___
$code .= save_regs();
$code .= AES_set_common(ke128enc(), ke192enc(),ke256enc());
$code .= load_regs();
$code .= <<___;
ret
___
################################################################################
# utility functions for rv64i_zknd_set_decrypt_key
################################################################################
sub ke128dec {
my $rnum = 0;
my $ret = '';
$ret .= <<___;
ld $T0,0($UKEY)
ld $T1,8($UKEY)
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
while($rnum < 10) {
$ret .= <<___;
@{[aes64ks1i $T2,$T1,$rnum]}
@{[aes64ks2 $T0,$T2,$T0]}
@{[aes64ks2 $T1,$T0,$T1]}
add $KEYP,$KEYP,16
___
# Round keys [1:N-1] need the aes64im (InvMixColumns) transform.
# aes64dsm performs InvSubBytes before InvMixColumns, whereas a
# straightforward inversion of the cipher would do InvMixColumns first.
# The order is swapped so decryption can share a datapath with encryption
# (which does SubBytes before MixColumns); to compensate, InvMixColumns
# must be applied to these round keys instead (the equivalent inverse
# cipher construction).
if ($rnum < 9) {
$ret .= <<___;
@{[aes64im $T2,$T0]}
sd $T2,0($KEYP)
@{[aes64im $T2,$T1]}
sd $T2,8($KEYP)
___
} else {
$ret .= <<___;
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
}
$rnum++;
}
return $ret;
}
sub ke192dec {
my $rnum = 0;
my $ret = '';
$ret .= <<___;
ld $T0,0($UKEY)
ld $T1,8($UKEY)
ld $T2,16($UKEY)
sd $T0,0($KEYP)
sd $T1,8($KEYP)
@{[aes64im $T3,$T2]}
sd $T3,16($KEYP)
___
while($rnum < 8) {
$ret .= <<___;
@{[aes64ks1i $T3,$T2,$rnum]}
@{[aes64ks2 $T0,$T3,$T0]}
@{[aes64ks2 $T1,$T0,$T1]}
add $KEYP,$KEYP,24
___
if ($rnum < 7) {
$ret .= <<___;
@{[aes64im $T3,$T0]}
sd $T3,0($KEYP)
@{[aes64im $T3,$T1]}
sd $T3,8($KEYP)
# see the corresponding note in ke192enc
@{[aes64ks2 $T2,$T1,$T2]}
@{[aes64im $T3,$T2]}
sd $T3,16($KEYP)
___
} else { # rnum == 7
$ret .= <<___;
sd $T0,0($KEYP)
sd $T1,8($KEYP)
___
}
$rnum++;
}
return $ret;
}
sub ke256dec {
my $rnum = 0;
my $ret = '';
$ret .= <<___;
ld $T0,0($UKEY)
ld $T1,8($UKEY)
ld $T2,16($UKEY)
ld $T3,24($UKEY)
sd $T0,0($KEYP)
sd $T1,8($KEYP)
@{[aes64im $T4,$T2]}
sd $T4,16($KEYP)
@{[aes64im $T4,$T3]}
sd $T4,24($KEYP)
___
while($rnum < 7) {
$ret .= <<___;
@{[aes64ks1i $T4,$T3,$rnum]}
@{[aes64ks2 $T0,$T4,$T0]}
@{[aes64ks2 $T1,$T0,$T1]}
add $KEYP,$KEYP,32
___
if ($rnum < 6) {
$ret .= <<___;
@{[aes64ks1i $T4,$T1,0xA]}
@{[aes64ks2 $T2,$T4,$T2]}
@{[aes64ks2 $T3,$T2,$T3]}
@{[aes64im $T4,$T0]}
sd $T4,0($KEYP)
@{[aes64im $T4,$T1]}
sd $T4,8($KEYP)
@{[aes64im $T4,$T2]}
sd $T4,16($KEYP)
@{[aes64im $T4,$T3]}
sd $T4,24($KEYP)
___
} else {
$ret .= <<___;
sd $T0,0($KEYP)
sd $T1,8($KEYP)
# the last two 8-byte words (T2/T3) are dropped; see the note in ke256enc
___
}
$rnum++;
}
return $ret;
}
################################################################################
# void rv64i_zknd_set_decrypt_key(const unsigned char *userKey, const int bits,
# AES_KEY *key)
################################################################################
$code .= <<___;
.text
.balign 16
.globl rv64i_zknd_set_decrypt_key
.type rv64i_zknd_set_decrypt_key,\@function
rv64i_zknd_set_decrypt_key:
___
$code .= save_regs();
$code .= AES_set_common(ke128dec(), ke192dec(),ke256dec());
$code .= load_regs();
$code .= <<___;
ret
___
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,710 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Bit-manipulation extension ('Zvbb')
# - RISC-V Vector GCM/GMAC extension ('Zvkg')
# - RISC-V Vector AES block cipher extension ('Zvkned')
# - RISC-V Zicclsm(Main memory supports misaligned loads/stores)
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
my $code=<<___;
.text
___
{
################################################################################
# void rv64i_zvbb_zvkg_zvkned_aes_xts_encrypt(const unsigned char *in,
# unsigned char *out, size_t length,
# const AES_KEY *key1,
# const AES_KEY *key2,
# const unsigned char iv[16])
my ($INPUT, $OUTPUT, $LENGTH, $KEY1, $KEY2, $IV) = ("a0", "a1", "a2", "a3", "a4", "a5");
my ($TAIL_LENGTH) = ("a6");
my ($VL) = ("a7");
my ($T0, $T1, $T2) = ("t0", "t1", "t2");
my ($STORE_LEN32) = ("t3");
my ($LEN32) = ("t4");
my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7,
$V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15,
$V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23,
$V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31,
) = map("v$_",(0..31));
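# compute_xts_iv0: derive the initial XTS tweak by running the 16-byte IV
# through a full AES encryption with KEY2 (vaesz / vaesem / vaesef).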
sub compute_xts_iv0 {
my $code=<<___;
# Load number of rounds
lwu $T0, 240($KEY2)
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V28, $IV]}
@{[vle32_v $V29, $KEY2]}
@{[vaesz_vs $V28, $V29]}
addi $T0, $T0, -1
addi $KEY2, $KEY2, 16
1:
@{[vle32_v $V29, $KEY2]}
@{[vaesem_vs $V28, $V29]}
addi $T0, $T0, -1
addi $KEY2, $KEY2, 16
bnez $T0, 1b
@{[vle32_v $V29, $KEY2]}
@{[vaesef_vs $V28, $V29]}
___
return $code;
}
# prepare the input data (v24), IV (v28), bit-reversed IV (v16) and the bit-reversed IV multiplier (v20)
sub init_first_round {
my $code=<<___;
# load input
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
@{[vle32_v $V24, $INPUT]}
li $T0, 5
# Use the simplified initialization below when there is at most one block.
blt $LEN32, $T0, 1f
# Note: we use `vgmul` for GF(2^128) multiplication. `vgmul` expects the
# coefficients in the opposite bit order, so the data must be reversed
# with `vbrev8` whenever `vgmul` is used.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vbrev8_v $V0, $V28]}
@{[vsetvli "zero", $LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V16, 0]}
# v16: [r-IV0, r-IV0, ...]
@{[vaesz_vs $V16, $V0]}
# Prepare GF(2^128) multiplier [1, x, x^2, x^3, ...] in v8.
slli $T0, $LEN32, 2
@{[vsetvli "zero", $T0, "e32", "m1", "ta", "ma"]}
# v2: [`1`, `1`, `1`, `1`, ...]
@{[vmv_v_i $V2, 1]}
# v3: [`0`, `1`, `2`, `3`, ...]
@{[vid_v $V3]}
@{[vsetvli "zero", $T0, "e64", "m2", "ta", "ma"]}
# v4: [`1`, 0, `1`, 0, `1`, 0, `1`, 0, ...]
@{[vzext_vf2 $V4, $V2]}
# v6: [`0`, 0, `1`, 0, `2`, 0, `3`, 0, ...]
@{[vzext_vf2 $V6, $V3]}
slli $T0, $LEN32, 1
@{[vsetvli "zero", $T0, "e32", "m2", "ta", "ma"]}
# v8: [1<<0=1, 0, 0, 0, 1<<1=x, 0, 0, 0, 1<<2=x^2, 0, 0, 0, ...]
@{[vwsll_vv $V8, $V4, $V6]}
# Compute [r-IV0*1, r-IV0*x, r-IV0*x^2, r-IV0*x^3, ...] in v16
@{[vsetvli "zero", $LEN32, "e32", "m4", "ta", "ma"]}
@{[vbrev8_v $V8, $V8]}
@{[vgmul_vv $V16, $V8]}
# Compute [IV0*1, IV0*x, IV0*x^2, IV0*x^3, ...] in v28.
# Reverse the bits order back.
@{[vbrev8_v $V28, $V16]}
# Prepare the x^n multiplier in v20, where `n` is the number of aes-xts
# blocks in one LMUL=4 register group:
#   n = (VLEN*LMUL)/(32*4) = (VLEN*4)/(32*4) = VLEN/32
# A vsetvli with `e32, m1` returns exactly this value as VLMAX.
@{[vsetvli $T0, "zero", "e32", "m1", "ta", "ma"]}
li $T1, 1
sll $T0, $T1, $T0
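# $T0 now holds 1<<n, i.e. the GF(2^128) polynomial x^n as an integer;
# it is bit-reversed below so it can be fed to vgmul.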
@{[vsetivli "zero", 2, "e64", "m1", "ta", "ma"]}
@{[vmv_v_i $V0, 0]}
@{[vsetivli "zero", 1, "e64", "m1", "tu", "ma"]}
@{[vmv_v_x $V0, $T0]}
@{[vsetivli "zero", 2, "e64", "m1", "ta", "ma"]}
@{[vbrev8_v $V0, $V0]}
@{[vsetvli "zero", $LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V20, 0]}
@{[vaesz_vs $V20, $V0]}
j 2f
1:
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vbrev8_v $V16, $V28]}
2:
___
return $code;
}
# prepare xts enc last block's input(v24) and iv(v28)
sub handle_xts_enc_last_block {
my $code=<<___;
bnez $TAIL_LENGTH, 1f
ret
1:
# slidedown second to last block
addi $VL, $VL, -4
@{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
# ciphertext
@{[vslidedown_vx $V24, $V24, $VL]}
# multiplier
@{[vslidedown_vx $V16, $V16, $VL]}
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vmv_v_v $V25, $V24]}
# load last block into v24
# note: the last block must be loaded before the second-to-last block is
# stored, so that in-place (in == out) operation works correctly.
@{[vsetvli "zero", $TAIL_LENGTH, "e8", "m1", "tu", "ma"]}
@{[vle8_v $V24, $INPUT]}
# setup `x` multiplier with byte-reversed order
# 0b00000010 => 0b01000000 (0x40)
li $T0, 0x40
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vmv_v_i $V28, 0]}
@{[vsetivli "zero", 1, "e8", "m1", "tu", "ma"]}
@{[vmv_v_x $V28, $T0]}
# compute IV for last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vgmul_vv $V16, $V28]}
@{[vbrev8_v $V28, $V16]}
# store second to last block
@{[vsetvli "zero", $TAIL_LENGTH, "e8", "m1", "ta", "ma"]}
@{[vse8_v $V25, $OUTPUT]}
___
return $code;
}
# prepare the xts-dec second-to-last block's input (v24) and IV (v29),
# and the last block's IV (v28)
sub handle_xts_dec_last_block {
my $code=<<___;
bnez $TAIL_LENGTH, 1f
ret
1:
# load second to last block's ciphertext
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V24, $INPUT]}
addi $INPUT, $INPUT, 16
# setup `x` multiplier with byte-reversed order
# 0b00000010 => 0b01000000 (0x40)
li $T0, 0x40
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vmv_v_i $V20, 0]}
@{[vsetivli "zero", 1, "e8", "m1", "tu", "ma"]}
@{[vmv_v_x $V20, $T0]}
beqz $LENGTH, 1f
# slidedown third to last block
addi $VL, $VL, -4
@{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
# multiplier
@{[vslidedown_vx $V16, $V16, $VL]}
# compute IV for last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vgmul_vv $V16, $V20]}
@{[vbrev8_v $V28, $V16]}
# compute IV for second to last block
@{[vgmul_vv $V16, $V20]}
@{[vbrev8_v $V29, $V16]}
j 2f
1:
# compute IV for second to last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vgmul_vv $V16, $V20]}
@{[vbrev8_v $V29, $V16]}
2:
___
return $code;
}
# Load all 11 round keys to v1-v11 registers.
sub aes_128_load_key {
my $code=<<___;
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V2, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V3, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V4, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V5, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V6, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V7, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V8, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V9, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V10, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V11, $KEY1]}
___
return $code;
}
# Load all 15 round keys to v1-v15 registers.
sub aes_256_load_key {
my $code=<<___;
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V2, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V3, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V4, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V5, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V6, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V7, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V8, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V9, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V10, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V11, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V12, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V13, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V14, $KEY1]}
addi $KEY1, $KEY1, 16
@{[vle32_v $V15, $KEY1]}
___
return $code;
}
# aes-128 enc with round keys v1-v11
sub aes_128_enc {
my $code=<<___;
@{[vaesz_vs $V24, $V1]}
@{[vaesem_vs $V24, $V2]}
@{[vaesem_vs $V24, $V3]}
@{[vaesem_vs $V24, $V4]}
@{[vaesem_vs $V24, $V5]}
@{[vaesem_vs $V24, $V6]}
@{[vaesem_vs $V24, $V7]}
@{[vaesem_vs $V24, $V8]}
@{[vaesem_vs $V24, $V9]}
@{[vaesem_vs $V24, $V10]}
@{[vaesef_vs $V24, $V11]}
___
return $code;
}
# aes-128 dec with round keys v1-v11
sub aes_128_dec {
my $code=<<___;
@{[vaesz_vs $V24, $V11]}
@{[vaesdm_vs $V24, $V10]}
@{[vaesdm_vs $V24, $V9]}
@{[vaesdm_vs $V24, $V8]}
@{[vaesdm_vs $V24, $V7]}
@{[vaesdm_vs $V24, $V6]}
@{[vaesdm_vs $V24, $V5]}
@{[vaesdm_vs $V24, $V4]}
@{[vaesdm_vs $V24, $V3]}
@{[vaesdm_vs $V24, $V2]}
@{[vaesdf_vs $V24, $V1]}
___
return $code;
}
# aes-256 enc with round keys v1-v15
sub aes_256_enc {
my $code=<<___;
@{[vaesz_vs $V24, $V1]}
@{[vaesem_vs $V24, $V2]}
@{[vaesem_vs $V24, $V3]}
@{[vaesem_vs $V24, $V4]}
@{[vaesem_vs $V24, $V5]}
@{[vaesem_vs $V24, $V6]}
@{[vaesem_vs $V24, $V7]}
@{[vaesem_vs $V24, $V8]}
@{[vaesem_vs $V24, $V9]}
@{[vaesem_vs $V24, $V10]}
@{[vaesem_vs $V24, $V11]}
@{[vaesem_vs $V24, $V12]}
@{[vaesem_vs $V24, $V13]}
@{[vaesem_vs $V24, $V14]}
@{[vaesef_vs $V24, $V15]}
___
return $code;
}
# aes-256 dec with round keys v1-v15
sub aes_256_dec {
my $code=<<___;
@{[vaesz_vs $V24, $V15]}
@{[vaesdm_vs $V24, $V14]}
@{[vaesdm_vs $V24, $V13]}
@{[vaesdm_vs $V24, $V12]}
@{[vaesdm_vs $V24, $V11]}
@{[vaesdm_vs $V24, $V10]}
@{[vaesdm_vs $V24, $V9]}
@{[vaesdm_vs $V24, $V8]}
@{[vaesdm_vs $V24, $V7]}
@{[vaesdm_vs $V24, $V6]}
@{[vaesdm_vs $V24, $V5]}
@{[vaesdm_vs $V24, $V4]}
@{[vaesdm_vs $V24, $V3]}
@{[vaesdm_vs $V24, $V2]}
@{[vaesdf_vs $V24, $V1]}
___
return $code;
}
$code .= <<___;
.p2align 3
.globl rv64i_zvbb_zvkg_zvkned_aes_xts_encrypt
.type rv64i_zvbb_zvkg_zvkned_aes_xts_encrypt,\@function
rv64i_zvbb_zvkg_zvkned_aes_xts_encrypt:
@{[compute_xts_iv0]}
# aes block size is 16
andi $TAIL_LENGTH, $LENGTH, 15
mv $STORE_LEN32, $LENGTH
beqz $TAIL_LENGTH, 1f
sub $LENGTH, $LENGTH, $TAIL_LENGTH
addi $STORE_LEN32, $LENGTH, -16
1:
# Convert `LENGTH` from a byte count into a count of e32 elements.
srli $LEN32, $LENGTH, 2
srli $STORE_LEN32, $STORE_LEN32, 2
# Load number of rounds
lwu $T0, 240($KEY1)
li $T1, 14
li $T2, 10
beq $T0, $T1, aes_xts_enc_256
beq $T0, $T2, aes_xts_enc_128
.size rv64i_zvbb_zvkg_zvkned_aes_xts_encrypt,.-rv64i_zvbb_zvkg_zvkned_aes_xts_encrypt
___
$code .= <<___;
.p2align 3
aes_xts_enc_128:
@{[init_first_round]}
@{[aes_128_load_key]}
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
j 1f
.Lenc_blocks_128:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
# load plaintext into v24
@{[vle32_v $V24, $INPUT]}
# update iv
@{[vgmul_vv $V16, $V20]}
# reverse the iv's bits order back
@{[vbrev8_v $V28, $V16]}
1:
@{[vxor_vv $V24, $V24, $V28]}
slli $T0, $VL, 2
sub $LEN32, $LEN32, $VL
add $INPUT, $INPUT, $T0
@{[aes_128_enc]}
@{[vxor_vv $V24, $V24, $V28]}
# store ciphertext
@{[vsetvli "zero", $STORE_LEN32, "e32", "m4", "ta", "ma"]}
@{[vse32_v $V24, $OUTPUT]}
add $OUTPUT, $OUTPUT, $T0
sub $STORE_LEN32, $STORE_LEN32, $VL
bnez $LEN32, .Lenc_blocks_128
@{[handle_xts_enc_last_block]}
# xts last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vxor_vv $V24, $V24, $V28]}
@{[aes_128_enc]}
@{[vxor_vv $V24, $V24, $V28]}
# store last block ciphertext
addi $OUTPUT, $OUTPUT, -16
@{[vse32_v $V24, $OUTPUT]}
ret
.size aes_xts_enc_128,.-aes_xts_enc_128
___
$code .= <<___;
.p2align 3
aes_xts_enc_256:
@{[init_first_round]}
@{[aes_256_load_key]}
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
j 1f
.Lenc_blocks_256:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
# load plaintext into v24
@{[vle32_v $V24, $INPUT]}
# update iv
@{[vgmul_vv $V16, $V20]}
# reverse the iv's bits order back
@{[vbrev8_v $V28, $V16]}
1:
@{[vxor_vv $V24, $V24, $V28]}
slli $T0, $VL, 2
sub $LEN32, $LEN32, $VL
add $INPUT, $INPUT, $T0
@{[aes_256_enc]}
@{[vxor_vv $V24, $V24, $V28]}
# store ciphertext
@{[vsetvli "zero", $STORE_LEN32, "e32", "m4", "ta", "ma"]}
@{[vse32_v $V24, $OUTPUT]}
add $OUTPUT, $OUTPUT, $T0
sub $STORE_LEN32, $STORE_LEN32, $VL
bnez $LEN32, .Lenc_blocks_256
@{[handle_xts_enc_last_block]}
# xts last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vxor_vv $V24, $V24, $V28]}
@{[aes_256_enc]}
@{[vxor_vv $V24, $V24, $V28]}
# store last block ciphertext
addi $OUTPUT, $OUTPUT, -16
@{[vse32_v $V24, $OUTPUT]}
ret
.size aes_xts_enc_256,.-aes_xts_enc_256
___
################################################################################
# void rv64i_zvbb_zvkg_zvkned_aes_xts_decrypt(const unsigned char *in,
# unsigned char *out, size_t length,
# const AES_KEY *key1,
# const AES_KEY *key2,
# const unsigned char iv[16])
$code .= <<___;
.p2align 3
.globl rv64i_zvbb_zvkg_zvkned_aes_xts_decrypt
.type rv64i_zvbb_zvkg_zvkned_aes_xts_decrypt,\@function
rv64i_zvbb_zvkg_zvkned_aes_xts_decrypt:
@{[compute_xts_iv0]}
# aes block size is 16
andi $TAIL_LENGTH, $LENGTH, 15
beqz $TAIL_LENGTH, 1f
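# With a partial tail, the last full block is processed together with the
# tail (ciphertext stealing), so exclude it from the bulk loop length.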
sub $LENGTH, $LENGTH, $TAIL_LENGTH
addi $LENGTH, $LENGTH, -16
1:
# Convert `LENGTH` from a byte count into a count of e32 elements.
srli $LEN32, $LENGTH, 2
# Load number of rounds
lwu $T0, 240($KEY1)
li $T1, 14
li $T2, 10
beq $T0, $T1, aes_xts_dec_256
beq $T0, $T2, aes_xts_dec_128
.size rv64i_zvbb_zvkg_zvkned_aes_xts_decrypt,.-rv64i_zvbb_zvkg_zvkned_aes_xts_decrypt
___
$code .= <<___;
.p2align 3
aes_xts_dec_128:
@{[init_first_round]}
@{[aes_128_load_key]}
beqz $LEN32, 2f
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
j 1f
.Ldec_blocks_128:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
# load ciphertext into v24
@{[vle32_v $V24, $INPUT]}
# update iv
@{[vgmul_vv $V16, $V20]}
# reverse the iv's bits order back
@{[vbrev8_v $V28, $V16]}
1:
@{[vxor_vv $V24, $V24, $V28]}
slli $T0, $VL, 2
sub $LEN32, $LEN32, $VL
add $INPUT, $INPUT, $T0
@{[aes_128_dec]}
@{[vxor_vv $V24, $V24, $V28]}
# store plaintext
@{[vse32_v $V24, $OUTPUT]}
add $OUTPUT, $OUTPUT, $T0
bnez $LEN32, .Ldec_blocks_128
2:
@{[handle_xts_dec_last_block]}
## xts second to last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vxor_vv $V24, $V24, $V29]}
@{[aes_128_dec]}
@{[vxor_vv $V24, $V24, $V29]}
@{[vmv_v_v $V25, $V24]}
# load last block ciphertext
@{[vsetvli "zero", $TAIL_LENGTH, "e8", "m1", "tu", "ma"]}
@{[vle8_v $V24, $INPUT]}
# store second to last block plaintext
addi $T0, $OUTPUT, 16
@{[vse8_v $V25, $T0]}
## xts last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vxor_vv $V24, $V24, $V28]}
@{[aes_128_dec]}
@{[vxor_vv $V24, $V24, $V28]}
# store second to last block plaintext
@{[vse32_v $V24, $OUTPUT]}
ret
.size aes_xts_dec_128,.-aes_xts_dec_128
___
$code .= <<___;
.p2align 3
aes_xts_dec_256:
@{[init_first_round]}
@{[aes_256_load_key]}
beqz $LEN32, 2f
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
j 1f
.Ldec_blocks_256:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "ma"]}
# load ciphertext into v24
@{[vle32_v $V24, $INPUT]}
# update iv
@{[vgmul_vv $V16, $V20]}
# reverse the iv's bits order back
@{[vbrev8_v $V28, $V16]}
1:
@{[vxor_vv $V24, $V24, $V28]}
slli $T0, $VL, 2
sub $LEN32, $LEN32, $VL
add $INPUT, $INPUT, $T0
@{[aes_256_dec]}
@{[vxor_vv $V24, $V24, $V28]}
# store plaintext
@{[vse32_v $V24, $OUTPUT]}
add $OUTPUT, $OUTPUT, $T0
bnez $LEN32, .Ldec_blocks_256
2:
@{[handle_xts_dec_last_block]}
## xts second to last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vxor_vv $V24, $V24, $V29]}
@{[aes_256_dec]}
@{[vxor_vv $V24, $V24, $V29]}
@{[vmv_v_v $V25, $V24]}
# load last block ciphertext
@{[vsetvli "zero", $TAIL_LENGTH, "e8", "m1", "tu", "ma"]}
@{[vle8_v $V24, $INPUT]}
# store second to last block plaintext
addi $T0, $OUTPUT, 16
@{[vse8_v $V25, $T0]}
## xts last block
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vxor_vv $V24, $V24, $V28]}
@{[aes_256_dec]}
@{[vxor_vv $V24, $V24, $V28]}
# store second to last block plaintext
@{[vse32_v $V24, $OUTPUT]}
ret
.size aes_xts_dec_256,.-aes_xts_dec_256
___
}
print $code;
close STDOUT or die "error closing STDOUT: $!";

View File

@@ -0,0 +1,376 @@
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector AES block cipher extension ('Zvkned')
# - RISC-V Zicclsm(Main memory supports misaligned loads/stores)
use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$output and open STDOUT,">$output";
my $code=<<___;
.text
___
################################################################################
# void rv64i_zvkb_zvkned_ctr32_encrypt_blocks(const unsigned char *in,
# unsigned char *out, size_t blocks,
# const void *key,
# const unsigned char ivec[16]);
{
my ($INP, $OUTP, $BLOCK_NUM, $KEYP, $IVP) = ("a0", "a1", "a2", "a3", "a4");
my ($T0, $T1, $T2, $T3) = ("t0", "t1", "t2", "t3");
my ($VL) = ("t4");
my ($LEN32) = ("t5");
my ($CTR) = ("t6");
my ($MASK) = ("v0");
my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7,
$V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15,
$V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23,
$V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31,
) = map("v$_",(0..31));
# Prepare the AES ctr input data into v16.
sub init_aes_ctr_input {
my $code=<<___;
# Set up the mask in v0.
# The mask pattern selects every fourth element (the counter word of each
# 128-bit block):
# mask v0: [000100010001....]
# Note:
# We could set up the mask just for the required element count instead of
# for VLMAX.
li $T0, 0b10001000
@{[vsetvli $T2, "zero", "e8", "m1", "ta", "ma"]}
@{[vmv_v_x $MASK, $T0]}
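# Each mask byte is 0b10001000, so mask bits 3, 7, 11, ... are set,
# selecting the big-endian counter word of every 128-bit block.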
# Load IV.
# v31:[IV0, IV1, IV2, big-endian count]
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V31, $IVP]}
# Convert the big-endian counter into little-endian.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "mu"]}
@{[vrev8_v $V31, $V31, $MASK]}
# Splat the IV to v16
@{[vsetvli "zero", $LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V16, 0]}
@{[vaesz_vs $V16, $V31]}
# Prepare the ctr pattern into v20
# v20: [x, x, x, 0, x, x, x, 1, x, x, x, 2, ...]
@{[viota_m $V20, $MASK, $MASK]}
# v16:[IV0, IV1, IV2, count+0, IV0, IV1, IV2, count+1, ...]
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "mu"]}
@{[vadd_vv $V16, $V16, $V20, $MASK]}
___
return $code;
}
$code .= <<___;
.p2align 3
.globl rv64i_zvkb_zvkned_ctr32_encrypt_blocks
.type rv64i_zvkb_zvkned_ctr32_encrypt_blocks,\@function
rv64i_zvkb_zvkned_ctr32_encrypt_blocks:
beqz $BLOCK_NUM, 1f
# Load number of rounds
lwu $T0, 240($KEYP)
li $T1, 14
li $T2, 12
li $T3, 10
slli $LEN32, $BLOCK_NUM, 2
beq $T0, $T1, ctr32_encrypt_blocks_256
beq $T0, $T2, ctr32_encrypt_blocks_192
beq $T0, $T3, ctr32_encrypt_blocks_128
1:
ret
.size rv64i_zvkb_zvkned_ctr32_encrypt_blocks,.-rv64i_zvkb_zvkned_ctr32_encrypt_blocks
___
$code .= <<___;
.p2align 3
ctr32_encrypt_blocks_128:
# Load all 11 round keys to v1-v11 registers.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V2, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V3, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V4, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V5, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V6, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V7, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V8, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V9, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V10, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V11, $KEYP]}
@{[init_aes_ctr_input]}
##### AES body
j 2f
1:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "mu"]}
# Increase ctr in v16.
@{[vadd_vx $V16, $V16, $CTR, $MASK]}
2:
# Load plaintext into v20
@{[vle32_v $V20, $INP]}
slli $T0, $VL, 2
srli $CTR, $VL, 2
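# $CTR = VL/4 = number of 128-bit blocks processed this pass; it is the
# amount added to each counter word on the next iteration.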
sub $LEN32, $LEN32, $VL
add $INP, $INP, $T0
# Prepare the AES ctr input into v24.
# The ctr data uses big-endian form.
@{[vmv_v_v $V24, $V16]}
@{[vrev8_v $V24, $V24, $MASK]}
@{[vaesz_vs $V24, $V1]}
@{[vaesem_vs $V24, $V2]}
@{[vaesem_vs $V24, $V3]}
@{[vaesem_vs $V24, $V4]}
@{[vaesem_vs $V24, $V5]}
@{[vaesem_vs $V24, $V6]}
@{[vaesem_vs $V24, $V7]}
@{[vaesem_vs $V24, $V8]}
@{[vaesem_vs $V24, $V9]}
@{[vaesem_vs $V24, $V10]}
@{[vaesef_vs $V24, $V11]}
# ciphertext
@{[vxor_vv $V24, $V24, $V20]}
# Store the ciphertext.
@{[vse32_v $V24, $OUTP]}
add $OUTP, $OUTP, $T0
bnez $LEN32, 1b
ret
.size ctr32_encrypt_blocks_128,.-ctr32_encrypt_blocks_128
___
$code .= <<___;
.p2align 3
ctr32_encrypt_blocks_192:
# Load all 13 round keys to v1-v13 registers.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V2, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V3, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V4, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V5, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V6, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V7, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V8, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V9, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V10, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V11, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V12, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V13, $KEYP]}
@{[init_aes_ctr_input]}
##### AES body
j 2f
1:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "mu"]}
# Increase ctr in v16.
@{[vadd_vx $V16, $V16, $CTR, $MASK]}
2:
# Load plaintext into v20
@{[vle32_v $V20, $INP]}
slli $T0, $VL, 2
srli $CTR, $VL, 2
sub $LEN32, $LEN32, $VL
add $INP, $INP, $T0
# Prepare the AES ctr input into v24.
# The ctr data uses big-endian form.
@{[vmv_v_v $V24, $V16]}
@{[vrev8_v $V24, $V24, $MASK]}
@{[vaesz_vs $V24, $V1]}
@{[vaesem_vs $V24, $V2]}
@{[vaesem_vs $V24, $V3]}
@{[vaesem_vs $V24, $V4]}
@{[vaesem_vs $V24, $V5]}
@{[vaesem_vs $V24, $V6]}
@{[vaesem_vs $V24, $V7]}
@{[vaesem_vs $V24, $V8]}
@{[vaesem_vs $V24, $V9]}
@{[vaesem_vs $V24, $V10]}
@{[vaesem_vs $V24, $V11]}
@{[vaesem_vs $V24, $V12]}
@{[vaesef_vs $V24, $V13]}
# ciphertext
@{[vxor_vv $V24, $V24, $V20]}
# Store the ciphertext.
@{[vse32_v $V24, $OUTP]}
add $OUTP, $OUTP, $T0
bnez $LEN32, 1b
ret
.size ctr32_encrypt_blocks_192,.-ctr32_encrypt_blocks_192
___
$code .= <<___;
.p2align 3
ctr32_encrypt_blocks_256:
# Load all 15 round keys to v1-v15 registers.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V2, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V3, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V4, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V5, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V6, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V7, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V8, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V9, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V10, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V11, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V12, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V13, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V14, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V15, $KEYP]}
@{[init_aes_ctr_input]}
##### AES body
j 2f
1:
@{[vsetvli $VL, $LEN32, "e32", "m4", "ta", "mu"]}
# Increase ctr in v16.
@{[vadd_vx $V16, $V16, $CTR, $MASK]}
2:
# Load plaintext into v20
@{[vle32_v $V20, $INP]}
slli $T0, $VL, 2
srli $CTR, $VL, 2
sub $LEN32, $LEN32, $VL
add $INP, $INP, $T0
# Prepare the AES ctr input into v24.
# The ctr data uses big-endian form.
@{[vmv_v_v $V24, $V16]}
@{[vrev8_v $V24, $V24, $MASK]}
@{[vaesz_vs $V24, $V1]}
@{[vaesem_vs $V24, $V2]}
@{[vaesem_vs $V24, $V3]}
@{[vaesem_vs $V24, $V4]}
@{[vaesem_vs $V24, $V5]}
@{[vaesem_vs $V24, $V6]}
@{[vaesem_vs $V24, $V7]}
@{[vaesem_vs $V24, $V8]}
@{[vaesem_vs $V24, $V9]}
@{[vaesem_vs $V24, $V10]}
@{[vaesem_vs $V24, $V11]}
@{[vaesem_vs $V24, $V12]}
@{[vaesem_vs $V24, $V13]}
@{[vaesem_vs $V24, $V14]}
@{[vaesef_vs $V24, $V15]}
# ciphertext
@{[vxor_vv $V24, $V24, $V20]}
# Store the ciphertext.
@{[vse32_v $V24, $OUTP]}
add $OUTP, $OUTP, $T0
bnez $LEN32, 1b
ret
.size ctr32_encrypt_blocks_256,.-ctr32_encrypt_blocks_256
___
}
print $code;
close STDOUT or die "error closing STDOUT: $!";

12 file diffs suppressed because they are too large

View File

@@ -0,0 +1,931 @@
#! /usr/bin/env perl
# Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under 2-clause BSD license. October 2012.
# All rights reserved.
# ====================================================================
######################################################################
# AES for SPARC T4.
#
# AES round instructions complete in 3 cycles and can be issued every
# cycle. It means that round calculations should take 4*rounds cycles,
# because any given round instruction depends on result of *both*
# previous instructions:
#
# |0 |1 |2 |3 |4
# |01|01|01|
# |23|23|23|
# |01|01|...
# |23|...
#
# Provided that fxor [with IV] takes 3 cycles to complete, critical
# path length for CBC encrypt would be 3+4*rounds, or in other words
# it should process one byte in at least (3+4*rounds)/16 cycles. This
# estimate doesn't account for "collateral" instructions, such as
# fetching input from memory, xor-ing it with zero-round key and
# storing the result. Yet, *measured* performance [for data aligned
# at 64-bit boundary!] deviates from this equation by less than 0.5%:
#
# 128-bit key 192- 256-
# CBC encrypt 2.70/2.90(*) 3.20/3.40 3.70/3.90
# (*) numbers after slash are for
# misaligned data.
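#
# For example, with a 128-bit key (10 rounds) the estimate gives
# (3+4*10)/16 = 43/16 ~= 2.69 cycles per processed byte, in line with the
# measured 2.70.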
#
# Out-of-order execution logic managed to fully overlap "collateral"
# instructions with those on critical path. Amazing!
#
# As with Intel AES-NI, question is if it's possible to improve
# performance of parallelizable modes by interleaving round
# instructions. Provided round instruction latency and throughput
# optimal interleave factor is 2. But can we expect 2x performance
# improvement? Well, as round instructions can be issued one per
# cycle, they don't saturate the 2-way issue pipeline and therefore
# there is room for "collateral" calculations... Yet, 2x speed-up
# over CBC encrypt remains unattainable:
#
# 128-bit key 192- 256-
# CBC decrypt 1.64/2.11 1.89/2.37 2.23/2.61
# CTR 1.64/2.08(*) 1.89/2.33 2.23/2.61
# (*) numbers after slash are for
# misaligned data.
#
# Estimates based on amount of instructions under assumption that
# round instructions are not pairable with any other instruction
# suggest that latter is the actual case and pipeline runs
# underutilized. It should be noted that T4 out-of-order execution
# logic is so capable that performance gain from 2x interleave is
# not even impressive, ~7-13% over non-interleaved code, largest
# for 256-bit keys.
# To anchor to something else, software implementation processes
# one byte in 29 cycles with 128-bit key on same processor. Intel
# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
# in 0.93, naturally with AES-NI.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";
$output = pop and open STDOUT,">$output";
$::evp=1; # if $evp is set to 0, script generates module with
# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
# points. These however are not fully compatible with openssl/aes.h,
# because they expect AES_KEY to be aligned at 64-bit boundary. When
# used through EVP, alignment is arranged at EVP layer. Second thing
# that is arranged by EVP is at least 32-bit alignment of IV.
######################################################################
# single-round subroutines
#
{
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
$code.=<<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.text
.globl aes_t4_encrypt
.align 32
aes_t4_encrypt:
andcc $inp, 7, %g1 ! is input aligned?
andn $inp, 7, $inp
ldx [$key + 0], %g4
ldx [$key + 8], %g5
ldx [$inp + 0], %o4
bz,pt %icc, 1f
ldx [$inp + 8], %o5
ldx [$inp + 16], $inp
sll %g1, 3, %g1
sub %g0, %g1, %o3
sllx %o4, %g1, %o4
sllx %o5, %g1, %g1
srlx %o5, %o3, %o5
srlx $inp, %o3, %o3
or %o5, %o4, %o4
or %o3, %g1, %o5
1:
ld [$key + 240], $rounds
ldd [$key + 16], %f12
ldd [$key + 24], %f14
xor %g4, %o4, %o4
xor %g5, %o5, %o5
movxtod %o4, %f0
movxtod %o5, %f2
srl $rounds, 1, $rounds
ldd [$key + 32], %f16
sub $rounds, 1, $rounds
ldd [$key + 40], %f18
add $key, 48, $key
.Lenc:
aes_eround01 %f12, %f0, %f2, %f4
aes_eround23 %f14, %f0, %f2, %f2
ldd [$key + 0], %f12
ldd [$key + 8], %f14
sub $rounds,1,$rounds
aes_eround01 %f16, %f4, %f2, %f0
aes_eround23 %f18, %f4, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
brnz,pt $rounds, .Lenc
add $key, 32, $key
andcc $out, 7, $tmp ! is output aligned?
aes_eround01 %f12, %f0, %f2, %f4
aes_eround23 %f14, %f0, %f2, %f2
aes_eround01_l %f16, %f4, %f2, %f0
aes_eround23_l %f18, %f4, %f2, %f2
bnz,pn %icc, 2f
nop
std %f0, [$out + 0]
retl
std %f2, [$out + 8]
2: alignaddrl $out, %g0, $out
mov 0xff, $mask
srl $mask, $tmp, $mask
faligndata %f0, %f0, %f4
faligndata %f0, %f2, %f6
faligndata %f2, %f2, %f8
stda %f4, [$out + $mask]0xc0 ! partial store
std %f6, [$out + 8]
add $out, 16, $out
orn %g0, $mask, $mask
retl
stda %f8, [$out + $mask]0xc0 ! partial store
.type aes_t4_encrypt,#function
.size aes_t4_encrypt,.-aes_t4_encrypt
.globl aes_t4_decrypt
.align 32
aes_t4_decrypt:
andcc $inp, 7, %g1 ! is input aligned?
andn $inp, 7, $inp
ldx [$key + 0], %g4
ldx [$key + 8], %g5
ldx [$inp + 0], %o4
bz,pt %icc, 1f
ldx [$inp + 8], %o5
ldx [$inp + 16], $inp
sll %g1, 3, %g1
sub %g0, %g1, %o3
sllx %o4, %g1, %o4
sllx %o5, %g1, %g1
srlx %o5, %o3, %o5
srlx $inp, %o3, %o3
or %o5, %o4, %o4
or %o3, %g1, %o5
1:
ld [$key + 240], $rounds
ldd [$key + 16], %f12
ldd [$key + 24], %f14
xor %g4, %o4, %o4
xor %g5, %o5, %o5
movxtod %o4, %f0
movxtod %o5, %f2
srl $rounds, 1, $rounds
ldd [$key + 32], %f16
sub $rounds, 1, $rounds
ldd [$key + 40], %f18
add $key, 48, $key
.Ldec:
aes_dround01 %f12, %f0, %f2, %f4
aes_dround23 %f14, %f0, %f2, %f2
ldd [$key + 0], %f12
ldd [$key + 8], %f14
sub $rounds,1,$rounds
aes_dround01 %f16, %f4, %f2, %f0
aes_dround23 %f18, %f4, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
brnz,pt $rounds, .Ldec
add $key, 32, $key
andcc $out, 7, $tmp ! is output aligned?
aes_dround01 %f12, %f0, %f2, %f4
aes_dround23 %f14, %f0, %f2, %f2
aes_dround01_l %f16, %f4, %f2, %f0
aes_dround23_l %f18, %f4, %f2, %f2
bnz,pn %icc, 2f
nop
std %f0, [$out + 0]
retl
std %f2, [$out + 8]
2: alignaddrl $out, %g0, $out
mov 0xff, $mask
srl $mask, $tmp, $mask
faligndata %f0, %f0, %f4
faligndata %f0, %f2, %f6
faligndata %f2, %f2, %f8
stda %f4, [$out + $mask]0xc0 ! partial store
std %f6, [$out + 8]
add $out, 16, $out
orn %g0, $mask, $mask
retl
stda %f8, [$out + $mask]0xc0 ! partial store
.type aes_t4_decrypt,#function
.size aes_t4_decrypt,.-aes_t4_decrypt
___
}
######################################################################
# key setup subroutines
#
{
my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
$code.=<<___;
.globl aes_t4_set_encrypt_key
.align 32
aes_t4_set_encrypt_key:
.Lset_encrypt_key:
and $inp, 7, $tmp
alignaddr $inp, %g0, $inp
cmp $bits, 192
ldd [$inp + 0], %f0
bl,pt %icc,.L128
ldd [$inp + 8], %f2
be,pt %icc,.L192
ldd [$inp + 16], %f4
brz,pt $tmp, .L256aligned
ldd [$inp + 24], %f6
ldd [$inp + 32], %f8
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
faligndata %f4, %f6, %f4
faligndata %f6, %f8, %f6
.L256aligned:
___
for ($i=0; $i<6; $i++) {
$code.=<<___;
std %f0, [$out + `32*$i+0`]
aes_kexpand1 %f0, %f6, $i, %f0
std %f2, [$out + `32*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `32*$i+16`]
aes_kexpand0 %f4, %f2, %f4
std %f6, [$out + `32*$i+24`]
aes_kexpand2 %f6, %f4, %f6
___
}
$code.=<<___;
std %f0, [$out + `32*$i+0`]
aes_kexpand1 %f0, %f6, $i, %f0
std %f2, [$out + `32*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `32*$i+16`]
std %f6, [$out + `32*$i+24`]
std %f0, [$out + `32*$i+32`]
std %f2, [$out + `32*$i+40`]
mov 14, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.align 16
.L192:
brz,pt $tmp, .L192aligned
nop
ldd [$inp + 24], %f6
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
faligndata %f4, %f6, %f4
.L192aligned:
___
for ($i=0; $i<7; $i++) {
$code.=<<___;
std %f0, [$out + `24*$i+0`]
aes_kexpand1 %f0, %f4, $i, %f0
std %f2, [$out + `24*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `24*$i+16`]
aes_kexpand2 %f4, %f2, %f4
___
}
$code.=<<___;
std %f0, [$out + `24*$i+0`]
aes_kexpand1 %f0, %f4, $i, %f0
std %f2, [$out + `24*$i+8`]
aes_kexpand2 %f2, %f0, %f2
std %f4, [$out + `24*$i+16`]
std %f0, [$out + `24*$i+24`]
std %f2, [$out + `24*$i+32`]
mov 12, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.align 16
.L128:
brz,pt $tmp, .L128aligned
nop
ldd [$inp + 16], %f4
faligndata %f0, %f2, %f0
faligndata %f2, %f4, %f2
.L128aligned:
___
for ($i=0; $i<10; $i++) {
$code.=<<___;
std %f0, [$out + `16*$i+0`]
aes_kexpand1 %f0, %f2, $i, %f0
std %f2, [$out + `16*$i+8`]
aes_kexpand2 %f2, %f0, %f2
___
}
$code.=<<___;
std %f0, [$out + `16*$i+0`]
std %f2, [$out + `16*$i+8`]
mov 10, $tmp
st $tmp, [$out + 240]
retl
xor %o0, %o0, %o0
.type aes_t4_set_encrypt_key,#function
.size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
.globl aes_t4_set_decrypt_key
.align 32
aes_t4_set_decrypt_key:
mov %o7, %o5
call .Lset_encrypt_key
nop
mov %o5, %o7
sll $tmp, 4, $inp ! $tmp is number of rounds
add $tmp, 2, $tmp
add $out, $inp, $inp ! $inp=$out+16*rounds
srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
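	! Build the decryption schedule by reversing the order of the
	! encryption round keys in place, swapping two 16-byte round keys
	! from each end of the schedule per iteration.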
.Lkey_flip:
ldd [$out + 0], %f0
ldd [$out + 8], %f2
ldd [$out + 16], %f4
ldd [$out + 24], %f6
ldd [$inp + 0], %f8
ldd [$inp + 8], %f10
ldd [$inp - 16], %f12
ldd [$inp - 8], %f14
sub $tmp, 1, $tmp
std %f0, [$inp + 0]
std %f2, [$inp + 8]
std %f4, [$inp - 16]
std %f6, [$inp - 8]
std %f8, [$out + 0]
std %f10, [$out + 8]
std %f12, [$out + 16]
std %f14, [$out + 24]
add $out, 32, $out
brnz $tmp, .Lkey_flip
sub $inp, 32, $inp
retl
xor %o0, %o0, %o0
.type aes_t4_set_decrypt_key,#function
.size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
___
}
{{{
my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
$code.=<<___;
.align 32
_aes128_encrypt_1x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f48, %f0, %f2, %f4
aes_eround23 %f50, %f0, %f2, %f2
aes_eround01_l %f52, %f4, %f2, %f0
retl
aes_eround23_l %f54, %f4, %f2, %f2
.type _aes128_encrypt_1x,#function
.size _aes128_encrypt_1x,.-_aes128_encrypt_1x
.align 32
_aes128_encrypt_2x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f48, %f0, %f2, %f8
aes_eround23 %f50, %f0, %f2, %f2
aes_eround01 %f48, %f4, %f6, %f10
aes_eround23 %f50, %f4, %f6, %f6
aes_eround01_l %f52, %f8, %f2, %f0
aes_eround23_l %f54, %f8, %f2, %f2
aes_eround01_l %f52, %f10, %f6, %f4
retl
aes_eround23_l %f54, %f10, %f6, %f6
.type _aes128_encrypt_2x,#function
.size _aes128_encrypt_2x,.-_aes128_encrypt_2x
.align 32
_aes128_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<22;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes128_loadkey,#function
.size _aes128_loadkey,.-_aes128_loadkey
_aes128_load_enckey=_aes128_loadkey
_aes128_load_deckey=_aes128_loadkey
___
&alg_cbc_encrypt_implement("aes",128);
if ($::evp) {
&alg_ctr32_implement("aes",128);
&alg_xts_implement("aes",128,"en");
&alg_xts_implement("aes",128,"de");
}
&alg_cbc_decrypt_implement("aes",128);
$code.=<<___;
.align 32
_aes128_decrypt_1x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f48, %f0, %f2, %f4
aes_dround23 %f50, %f0, %f2, %f2
aes_dround01_l %f52, %f4, %f2, %f0
retl
aes_dround23_l %f54, %f4, %f2, %f2
.type _aes128_decrypt_1x,#function
.size _aes128_decrypt_1x,.-_aes128_decrypt_1x
.align 32
_aes128_decrypt_2x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f48, %f0, %f2, %f8
aes_dround23 %f50, %f0, %f2, %f2
aes_dround01 %f48, %f4, %f6, %f10
aes_dround23 %f50, %f4, %f6, %f6
aes_dround01_l %f52, %f8, %f2, %f0
aes_dround23_l %f54, %f8, %f2, %f2
aes_dround01_l %f52, %f10, %f6, %f4
retl
aes_dround23_l %f54, %f10, %f6, %f6
.type _aes128_decrypt_2x,#function
.size _aes128_decrypt_2x,.-_aes128_decrypt_2x
___
$code.=<<___;
.align 32
_aes192_encrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f56, %f0, %f2, %f4
aes_eround23 %f58, %f0, %f2, %f2
aes_eround01_l %f60, %f4, %f2, %f0
retl
aes_eround23_l %f62, %f4, %f2, %f2
.type _aes192_encrypt_1x,#function
.size _aes192_encrypt_1x,.-_aes192_encrypt_1x
.align 32
_aes192_encrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f56, %f0, %f2, %f8
aes_eround23 %f58, %f0, %f2, %f2
aes_eround01 %f56, %f4, %f6, %f10
aes_eround23 %f58, %f4, %f6, %f6
aes_eround01_l %f60, %f8, %f2, %f0
aes_eround23_l %f62, %f8, %f2, %f2
aes_eround01_l %f60, %f10, %f6, %f4
retl
aes_eround23_l %f62, %f10, %f6, %f6
.type _aes192_encrypt_2x,#function
.size _aes192_encrypt_2x,.-_aes192_encrypt_2x
.align 32
_aes256_encrypt_1x:
aes_eround01 %f16, %f0, %f2, %f4
aes_eround23 %f18, %f0, %f2, %f2
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_eround01 %f20, %f4, %f2, %f0
aes_eround23 %f22, %f4, %f2, %f2
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_eround01 %f16, %f0, %f2, %f4
aes_eround23 %f18, %f0, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_eround01_l %f20, %f4, %f2, %f0
aes_eround23_l %f22, %f4, %f2, %f2
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_encrypt_1x,#function
.size _aes256_encrypt_1x,.-_aes256_encrypt_1x
.align 32
_aes256_encrypt_2x:
aes_eround01 %f16, %f0, %f2, %f8
aes_eround23 %f18, %f0, %f2, %f2
aes_eround01 %f16, %f4, %f6, %f10
aes_eround23 %f18, %f4, %f6, %f6
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_eround01 %f20, %f8, %f2, %f0
aes_eround23 %f22, %f8, %f2, %f2
aes_eround01 %f20, %f10, %f6, %f4
aes_eround23 %f22, %f10, %f6, %f6
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_eround01 %f16, %f0, %f2, %f8
aes_eround23 %f18, %f0, %f2, %f2
aes_eround01 %f16, %f4, %f6, %f10
aes_eround23 %f18, %f4, %f6, %f6
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_eround01_l %f20, %f8, %f2, %f0
aes_eround23_l %f22, %f8, %f2, %f2
aes_eround01_l %f20, %f10, %f6, %f4
aes_eround23_l %f22, %f10, %f6, %f6
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_encrypt_2x,#function
.size _aes256_encrypt_2x,.-_aes256_encrypt_2x
.align 32
_aes192_loadkey:
ldx [$key + 0], %g4
ldx [$key + 8], %g5
___
for ($i=2; $i<26;$i++) { # load key schedule
$code.=<<___;
ldd [$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
retl
nop
.type _aes192_loadkey,#function
.size _aes192_loadkey,.-_aes192_loadkey
_aes256_loadkey=_aes192_loadkey
_aes192_load_enckey=_aes192_loadkey
_aes192_load_deckey=_aes192_loadkey
_aes256_load_enckey=_aes192_loadkey
_aes256_load_deckey=_aes192_loadkey
___
&alg_cbc_encrypt_implement("aes",256);
&alg_cbc_encrypt_implement("aes",192);
if ($::evp) {
&alg_ctr32_implement("aes",256);
&alg_xts_implement("aes",256,"en");
&alg_xts_implement("aes",256,"de");
&alg_ctr32_implement("aes",192);
}
&alg_cbc_decrypt_implement("aes",192);
&alg_cbc_decrypt_implement("aes",256);
$code.=<<___;
.align 32
_aes256_decrypt_1x:
aes_dround01 %f16, %f0, %f2, %f4
aes_dround23 %f18, %f0, %f2, %f2
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_dround01 %f20, %f4, %f2, %f0
aes_dround23 %f22, %f4, %f2, %f2
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f16, %f0, %f2, %f4
aes_dround23 %f18, %f0, %f2, %f2
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_dround01_l %f20, %f4, %f2, %f0
aes_dround23_l %f22, %f4, %f2, %f2
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_decrypt_1x,#function
.size _aes256_decrypt_1x,.-_aes256_decrypt_1x
.align 32
_aes256_decrypt_2x:
aes_dround01 %f16, %f0, %f2, %f8
aes_dround23 %f18, %f0, %f2, %f2
aes_dround01 %f16, %f4, %f6, %f10
aes_dround23 %f18, %f4, %f6, %f6
ldd [$key + 208], %f16
ldd [$key + 216], %f18
aes_dround01 %f20, %f8, %f2, %f0
aes_dround23 %f22, %f8, %f2, %f2
aes_dround01 %f20, %f10, %f6, %f4
aes_dround23 %f22, %f10, %f6, %f6
ldd [$key + 224], %f20
ldd [$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f16, %f0, %f2, %f8
aes_dround23 %f18, %f0, %f2, %f2
aes_dround01 %f16, %f4, %f6, %f10
aes_dround23 %f18, %f4, %f6, %f6
ldd [$key + 16], %f16
ldd [$key + 24], %f18
aes_dround01_l %f20, %f8, %f2, %f0
aes_dround23_l %f22, %f8, %f2, %f2
aes_dround01_l %f20, %f10, %f6, %f4
aes_dround23_l %f22, %f10, %f6, %f6
ldd [$key + 32], %f20
retl
ldd [$key + 40], %f22
.type _aes256_decrypt_2x,#function
.size _aes256_decrypt_2x,.-_aes256_decrypt_2x
.align 32
_aes192_decrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f4
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01_l %f60, %f4, %f2, %f0
retl
aes_dround23_l %f62, %f4, %f2, %f2
.type _aes192_decrypt_1x,#function
.size _aes192_decrypt_1x,.-_aes192_decrypt_1x
.align 32
_aes192_decrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
aes_dround01 %f56, %f0, %f2, %f8
aes_dround23 %f58, %f0, %f2, %f2
aes_dround01 %f56, %f4, %f6, %f10
aes_dround23 %f58, %f4, %f6, %f6
aes_dround01_l %f60, %f8, %f2, %f0
aes_dround23_l %f62, %f8, %f2, %f2
aes_dround01_l %f60, %f10, %f6, %f4
retl
aes_dround23_l %f62, %f10, %f6, %f6
.type _aes192_decrypt_2x,#function
.size _aes192_decrypt_2x,.-_aes192_decrypt_2x
___
}}}
if (!$::evp) {
$code.=<<___;
.global AES_encrypt
AES_encrypt=aes_t4_encrypt
.global AES_decrypt
AES_decrypt=aes_t4_decrypt
.global AES_set_encrypt_key
.align 32
AES_set_encrypt_key:
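! Arguments follow AES_set_encrypt_key(userKey, bits, key), i.e. per the
! SPARC calling convention %o0 = userKey, %o1 = bits, %o2 = AES_KEY *key.
! Returns -1 when a pointer is NULL or the key structure is misaligned and
! -2 when the bit length is unsupported, matching the error codes of the
! generic AES_set_encrypt_key; otherwise it tail-calls aes_t4_set_encrypt_key.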
andcc %o2, 7, %g0 ! check alignment
bnz,a,pn %icc, 1f
mov -1, %o0
brz,a,pn %o0, 1f
mov -1, %o0
brz,a,pn %o2, 1f
mov -1, %o0
andncc %o1, 0x1c0, %g0
bnz,a,pn %icc, 1f
mov -2, %o0
cmp %o1, 128
bl,a,pn %icc, 1f
mov -2, %o0
b aes_t4_set_encrypt_key
nop
1: retl
nop
.type AES_set_encrypt_key,#function
.size AES_set_encrypt_key,.-AES_set_encrypt_key
.global AES_set_decrypt_key
.align 32
AES_set_decrypt_key:
andcc %o2, 7, %g0 ! check alignment
bnz,a,pn %icc, 1f
mov -1, %o0
brz,a,pn %o0, 1f
mov -1, %o0
brz,a,pn %o2, 1f
mov -1, %o0
andncc %o1, 0x1c0, %g0
bnz,a,pn %icc, 1f
mov -2, %o0
cmp %o1, 128
bl,a,pn %icc, 1f
mov -2, %o0
b aes_t4_set_decrypt_key
nop
1: retl
nop
.type AES_set_decrypt_key,#function
.size AES_set_decrypt_key,.-AES_set_decrypt_key
___
my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
$code.=<<___;
.globl AES_cbc_encrypt
.align 32
AES_cbc_encrypt:
ld [$key + 240], %g1
nop
brz $enc, .Lcbc_decrypt
cmp %g1, 12
bl,pt %icc, aes128_t4_cbc_encrypt
nop
be,pn %icc, aes192_t4_cbc_encrypt
nop
ba aes256_t4_cbc_encrypt
nop
.Lcbc_decrypt:
bl,pt %icc, aes128_t4_cbc_decrypt
nop
be,pn %icc, aes192_t4_cbc_decrypt
nop
ba aes256_t4_cbc_decrypt
nop
.type AES_cbc_encrypt,#function
.size AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
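# The AES_cbc_encrypt glue above dispatches on AES_KEY->rounds, loaded from
# [$key + 240]: fewer than 12 rounds selects the 128-bit routines, exactly 12
# the 192-bit ones, anything larger the 256-bit ones. A minimal Perl sketch of
# that rule (illustration only, never executed by this generator; the helper
# name is hypothetical):
#
#   sub t4_cbc_routine {
#       my ($rounds, $enc) = @_;    # rounds is 10, 12 or 14
#       my $bits = $rounds < 12 ? 128 : $rounds == 12 ? 192 : 256;
#       return sprintf("aes%d_t4_cbc_%s", $bits, $enc ? "encrypt" : "decrypt");
#   }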
$code.=<<___;
.asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
.align 4
___
&emit_assembler();
close STDOUT or die "error closing STDOUT: $!";


@@ -0,0 +1,914 @@
#! /usr/bin/env perl
# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
## version 0.1
##
## By Mike Hamburg (Stanford University), 2009
## Public domain.
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
# September 2011.
#
# Port vpaes-x86_64.pl as a 32-bit "almost" drop-in replacement for
# aes-586.pl. "Almost" refers to the fact that AES_cbc_encrypt
# doesn't handle partial vectors (it doesn't have to if called from
# EVP only). "Drop-in" implies that this module doesn't share key
# schedule structure with the original, nor does it make assumptions
# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
# byte processed with 128-bit key, and vpaes-x86.pl column - [also
# large-block CBC] encrypt/decrypt.
#
#               aes-586.pl      vpaes-x86.pl
#
# Core 2(**)    28.1/41.4/18.3  21.9/25.2(***)
# Nehalem       27.9/40.4/18.1  10.2/11.9
# Atom          70.7/92.1/60.1  61.1/75.4(***)
# Silvermont    45.4/62.9/24.1  49.2/61.1(***)
#
# (*) "Hyper-threading" in the context refers rather to cache shared
# among multiple cores, than to specifically Intel HTT. As vast
# majority of contemporary cores share cache, slower code path
# is common place. In other words "with-hyper-threading-off"
# results are presented mostly for reference purposes.
#
# (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
#
# (***) The less impressive improvement on Core 2 and Atom is due to slow
# pshufb, yet it is still a respectable +28%/64% improvement on Core 2
# and +15% on Atom (as implied, over the "hyper-threading-safe"
# code path).
#
# <appro@openssl.org>
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output = pop and open STDOUT,">$output";
&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
$PREFIX="vpaes";
my ($round, $base, $magic, $key, $const, $inp, $out)=
("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
&static_label("_vpaes_consts");
&static_label("_vpaes_schedule_low_round");
&set_label("_vpaes_consts",64);
$k_inv=-0x30; # inv, inva
&data_word(0x0D080180,0x0E05060F,0x0A0B0C02,0x04070309);
&data_word(0x0F0B0780,0x01040A06,0x02050809,0x030D0E0C);
$k_s0F=-0x10; # s0F
&data_word(0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F,0x0F0F0F0F);
$k_ipt=0x00; # input transform (lo, hi)
&data_word(0x5A2A7000,0xC2B2E898,0x52227808,0xCABAE090);
&data_word(0x317C4D00,0x4C01307D,0xB0FDCC81,0xCD80B1FC);
$k_sb1=0x20; # sb1u, sb1t
&data_word(0xCB503E00,0xB19BE18F,0x142AF544,0xA5DF7A6E);
&data_word(0xFAE22300,0x3618D415,0x0D2ED9EF,0x3BF7CCC1);
$k_sb2=0x40; # sb2u, sb2t
&data_word(0x0B712400,0xE27A93C6,0xBC982FCD,0x5EB7E955);
&data_word(0x0AE12900,0x69EB8840,0xAB82234A,0xC2A163C8);
$k_sbo=0x60; # sbou, sbot
&data_word(0x6FBDC700,0xD0D26D17,0xC502A878,0x15AABF7A);
&data_word(0x5FBB6A00,0xCFE474A5,0x412B35FA,0x8E1E90D1);
$k_mc_forward=0x80; # mc_forward
&data_word(0x00030201,0x04070605,0x080B0A09,0x0C0F0E0D);
&data_word(0x04070605,0x080B0A09,0x0C0F0E0D,0x00030201);
&data_word(0x080B0A09,0x0C0F0E0D,0x00030201,0x04070605);
&data_word(0x0C0F0E0D,0x00030201,0x04070605,0x080B0A09);
$k_mc_backward=0xc0; # mc_backward
&data_word(0x02010003,0x06050407,0x0A09080B,0x0E0D0C0F);
&data_word(0x0E0D0C0F,0x02010003,0x06050407,0x0A09080B);
&data_word(0x0A09080B,0x0E0D0C0F,0x02010003,0x06050407);
&data_word(0x06050407,0x0A09080B,0x0E0D0C0F,0x02010003);
$k_sr=0x100; # sr
&data_word(0x03020100,0x07060504,0x0B0A0908,0x0F0E0D0C);
&data_word(0x0F0A0500,0x030E0904,0x07020D08,0x0B06010C);
&data_word(0x0B020900,0x0F060D04,0x030A0108,0x070E050C);
&data_word(0x070A0D00,0x0B0E0104,0x0F020508,0x0306090C);
$k_rcon=0x140; # rcon
&data_word(0xAF9DEEB6,0x1F8391B9,0x4D7C7D81,0x702A9808);
$k_s63=0x150; # s63: all equal to 0x63 transformed
&data_word(0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B,0x5B5B5B5B);
$k_opt=0x160; # output transform
&data_word(0xD6B66000,0xFF9F4929,0xDEBE6808,0xF7974121);
&data_word(0x50BCEC00,0x01EDBD51,0xB05C0CE0,0xE10D5DB1);
$k_deskew=0x180; # deskew tables: inverts the sbox's "skew"
&data_word(0x47A4E300,0x07E4A340,0x5DBEF91A,0x1DFEB95A);
&data_word(0x83EA6900,0x5F36B5DC,0xF49D1E77,0x2841C2AB);
##
## Decryption stuff
## Key schedule constants
##
$k_dksd=0x1a0; # decryption key schedule: invskew x*D
&data_word(0xA3E44700,0xFEB91A5D,0x5A1DBEF9,0x0740E3A4);
&data_word(0xB5368300,0x41C277F4,0xAB289D1E,0x5FDC69EA);
$k_dksb=0x1c0; # decryption key schedule: invskew x*B
&data_word(0x8550D500,0x9A4FCA1F,0x1CC94C99,0x03D65386);
&data_word(0xB6FC4A00,0x115BEDA7,0x7E3482C8,0xD993256F);
$k_dkse=0x1e0; # decryption key schedule: invskew x*E + 0x63
&data_word(0x1FC9D600,0xD5031CCA,0x994F5086,0x53859A4C);
&data_word(0x4FDC7BE8,0xA2319605,0x20B31487,0xCD5EF96A);
$k_dks9=0x200; # decryption key schedule: invskew x*9
&data_word(0x7ED9A700,0xB6116FC8,0x82255BFC,0x4AED9334);
&data_word(0x27143300,0x45765162,0xE9DAFDCE,0x8BB89FAC);
##
## Decryption stuff
## Round function constants
##
$k_dipt=0x220; # decryption input transform
&data_word(0x0B545F00,0x0F505B04,0x114E451A,0x154A411E);
&data_word(0x60056500,0x86E383E6,0xF491F194,0x12771772);
$k_dsb9=0x240; # decryption sbox output *9*u, *9*t
&data_word(0x9A86D600,0x851C0353,0x4F994CC9,0xCAD51F50);
&data_word(0xECD74900,0xC03B1789,0xB2FBA565,0x725E2C9E);
$k_dsbd=0x260; # decryption sbox output *D*u, *D*t
&data_word(0xE6B1A200,0x7D57CCDF,0x882A4439,0xF56E9B13);
&data_word(0x24C6CB00,0x3CE2FAF7,0x15DEEFD3,0x2931180D);
$k_dsbb=0x280; # decryption sbox output *B*u, *B*t
&data_word(0x96B44200,0xD0226492,0xB0F2D404,0x602646F6);
&data_word(0xCD596700,0xC19498A6,0x3255AA6B,0xF3FF0C3E);
$k_dsbe=0x2a0; # decryption sbox output *E*u, *E*t
&data_word(0x26D4D000,0x46F29296,0x64B4F6B0,0x22426004);
&data_word(0xFFAAC100,0x0C55A6CD,0x98593E32,0x9467F36B);
$k_dsbo=0x2c0; # decryption sbox final output
&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align (64);
&function_begin_B("_vpaes_preheat");
&add ($const,&DWP(0,"esp"));
&movdqa ("xmm7",&QWP($k_inv,$const));
&movdqa ("xmm6",&QWP($k_s0F,$const));
&ret ();
&function_end_B("_vpaes_preheat");
##
## _aes_encrypt_core
##
## AES-encrypt %xmm0.
##
## Inputs:
## %xmm0 = input
## %xmm6-%xmm7 as in _vpaes_preheat
## (%edx) = scheduled keys
##
## Output in %xmm0
## Clobbers %xmm1-%xmm5, %eax, %ebx, %ecx, %edx
##
##
&function_begin_B("_vpaes_encrypt_core");
&mov ($magic,16);
&mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6")
&movdqa ("xmm2",&QWP($k_ipt,$const));
&pandn ("xmm1","xmm0");
&pand ("xmm0","xmm6");
&movdqu ("xmm5",&QWP(0,$key));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_ipt+16,$const));
&pxor ("xmm2","xmm5");
&psrld ("xmm1",4);
&add ($key,16);
&pshufb ("xmm0","xmm1");
&lea ($base,&DWP($k_mc_backward,$const));
&pxor ("xmm0","xmm2");
&jmp (&label("enc_entry"));
&set_label("enc_loop",16);
# middle of middle round
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sb1u
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
&pshufb ("xmm4","xmm2"); # 4 = sb1u
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&movdqa ("xmm5",&QWP($k_sb2,$const)); # 4 : sb2u
&pxor ("xmm0","xmm4"); # 0 = A
&movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
&pshufb ("xmm5","xmm2"); # 4 = sb2u
&movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
&movdqa ("xmm4",&QWP(0,$base,$magic)); # .Lk_mc_backward[]
&pshufb ("xmm2","xmm3"); # 2 = sb2t
&movdqa ("xmm3","xmm0"); # 3 = A
&pxor ("xmm2","xmm5"); # 2 = 2A
&pshufb ("xmm0","xmm1"); # 0 = B
&add ($key,16); # next key
&pxor ("xmm0","xmm2"); # 0 = 2A+B
&pshufb ("xmm3","xmm4"); # 3 = D
&add ($magic,16); # next mc
&pxor ("xmm3","xmm0"); # 3 = 2A+B+D
&pshufb ("xmm0","xmm1"); # 0 = 2B+C
&and ($magic,0x30); # ... mod 4
&sub ($round,1); # nr--
&pxor ("xmm0","xmm3"); # 0 = 2A+3B+C+D
&set_label("enc_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm6"); # 0 = k
&pshufb ("xmm5","xmm0"); # 2 = a/k
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pxor ("xmm3","xmm5"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pxor ("xmm4","xmm5"); # 4 = jak = 1/j + a/k
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&movdqu ("xmm5",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("enc_loop"));
# middle of last round
&movdqa ("xmm4",&QWP($k_sbo,$const)); # 3 : sbou .Lk_sbo
&movdqa ("xmm0",&QWP($k_sbo+16,$const));# 3 : sbot .Lk_sbo+16
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm5"); # 4 = sb1u + k
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&movdqa ("xmm1",&QWP(0x40,$base,$magic));# .Lk_sr[]
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm1");
&ret ();
&function_end_B("_vpaes_encrypt_core");
##
## Decryption core
##
## Same API as encryption core.
##
&function_begin_B("_vpaes_decrypt_core");
&lea ($base,&DWP($k_dsbd,$const));
&mov ($round,&DWP(240,$key));
&movdqa ("xmm1","xmm6");
&movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
&pandn ("xmm1","xmm0");
&mov ($magic,$round);
&psrld ("xmm1",4)
&movdqu ("xmm5",&QWP(0,$key));
&shl ($magic,4);
&pand ("xmm0","xmm6");
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
&xor ($magic,0x30);
&pshufb ("xmm0","xmm1");
&and ($magic,0x30);
&pxor ("xmm2","xmm5");
&movdqa ("xmm5",&QWP($k_mc_forward+48,$const));
&pxor ("xmm0","xmm2");
&add ($key,16);
&lea ($magic,&DWP($k_sr-$k_dsbd,$base,$magic));
&jmp (&label("dec_entry"));
&set_label("dec_loop",16);
##
## Inverse mix columns
##
&movdqa ("xmm4",&QWP(-0x20,$base)); # 4 : sb9u
&movdqa ("xmm1",&QWP(-0x10,$base)); # 0 : sb9t
&pshufb ("xmm4","xmm2"); # 4 = sb9u
&pshufb ("xmm1","xmm3"); # 0 = sb9t
&pxor ("xmm0","xmm4");
&movdqa ("xmm4",&QWP(0,$base)); # 4 : sbdu
&pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x10,$base)); # 0 : sbdt
&pshufb ("xmm4","xmm2"); # 4 = sbdu
&pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbdt
&pxor ("xmm0","xmm4"); # 4 = ch
&movdqa ("xmm4",&QWP(0x20,$base)); # 4 : sbbu
&pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x30,$base)); # 0 : sbbt
&pshufb ("xmm4","xmm2"); # 4 = sbbu
&pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbbt
&pxor ("xmm0","xmm4"); # 4 = ch
&movdqa ("xmm4",&QWP(0x40,$base)); # 4 : sbeu
&pxor ("xmm0","xmm1"); # 0 = ch
&movdqa ("xmm1",&QWP(0x50,$base)); # 0 : sbet
&pshufb ("xmm4","xmm2"); # 4 = sbeu
&pshufb ("xmm0","xmm5"); # MC ch
&pshufb ("xmm1","xmm3"); # 0 = sbet
&pxor ("xmm0","xmm4"); # 4 = ch
&add ($key,16); # next round key
&palignr("xmm5","xmm5",12);
&pxor ("xmm0","xmm1"); # 0 = ch
&sub ($round,1); # nr--
&set_label("dec_entry");
# top of round
&movdqa ("xmm1","xmm6"); # 1 : i
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pandn ("xmm1","xmm0"); # 1 = i<<4
&pand ("xmm0","xmm6"); # 0 = k
&psrld ("xmm1",4); # 1 = i
&pshufb ("xmm2","xmm0"); # 2 = a/k
&movdqa ("xmm3","xmm7"); # 3 : 1/i
&pxor ("xmm0","xmm1"); # 0 = j
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&movdqa ("xmm4","xmm7"); # 4 : 1/j
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm7"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&movdqa ("xmm3","xmm7"); # 3 : 1/jak
&pxor ("xmm2","xmm0"); # 2 = io
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&movdqu ("xmm0",&QWP(0,$key));
&pxor ("xmm3","xmm1"); # 3 = jo
&jnz (&label("dec_loop"));
# middle of last round
&movdqa ("xmm4",&QWP(0x60,$base)); # 3 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&pxor ("xmm4","xmm0"); # 4 = sb1u + k
&movdqa ("xmm0",&QWP(0x70,$base)); # 0 : sbot
&movdqa ("xmm2",&QWP(0,$magic));
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = A
&pshufb ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_decrypt_core");
########################################################
## ##
## AES key schedule ##
## ##
########################################################
&function_begin_B("_vpaes_schedule_core");
&add ($const,&DWP(0,"esp"));
&movdqu ("xmm0",&QWP(0,$inp)); # load key (unaligned)
&movdqa ("xmm2",&QWP($k_rcon,$const)); # load rcon
# input transform
&movdqa ("xmm3","xmm0");
&lea ($base,&DWP($k_ipt,$const));
&movdqa (&QWP(4,"esp"),"xmm2"); # xmm8
&call ("_vpaes_schedule_transform");
&movdqa ("xmm7","xmm0");
&test ($out,$out);
&jnz (&label("schedule_am_decrypting"));
# encrypting, output zeroth round key after transform
&movdqu (&QWP(0,$key),"xmm0");
&jmp (&label("schedule_go"));
&set_label("schedule_am_decrypting");
# decrypting, output zeroth round key after shiftrows
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&movdqu (&QWP(0,$key),"xmm3");
&xor ($magic,0x30);
&set_label("schedule_go");
&cmp ($round,192);
&ja (&label("schedule_256"));
&je (&label("schedule_192"));
# 128: fall through
##
## .schedule_128
##
## 128-bit specific part of key schedule.
##
## This schedule is really simple, because all its parts
## are accomplished by the subroutines.
##
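## A quick count: the zeroth (transformed) key is stored before schedule_go,
## the loop below runs 10 times, and the last pass branches to
## schedule_mangle_last instead of _vpaes_schedule_mangle, so 1 + 9 + 1 = 11
## round keys are written -- exactly what AES-128 needs.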
&set_label("schedule_128");
&mov ($round,10);
&set_label("loop_schedule_128");
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # write output
&jmp (&label("loop_schedule_128"));
##
## .aes_schedule_192
##
## 192-bit specific part of key schedule.
##
## The main body of this schedule is the same as the 128-bit
## schedule, but with more smearing. The long, high side is
## stored in %xmm7 as before, and the short, low side is in
## the high bits of %xmm6.
##
## This schedule is somewhat nastier, however, because each
## round produces 192 bits of key material, or 1.5 round keys.
## Therefore, on each cycle we do 2 rounds and produce 3 round
## keys.
##
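## With $round = 4 below, each of the first three passes through the loop
## calls _vpaes_schedule_mangle three times, the fourth calls it twice before
## branching to schedule_mangle_last, and the zeroth key was already stored
## before schedule_go: 1 + 3*3 + 2 + 1 = 13 round keys, as AES-192 requires.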
&set_label("schedule_192",16);
&movdqu ("xmm0",&QWP(8,$inp)); # load key part 2 (very unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&movdqa ("xmm6","xmm0"); # save short part
&pxor ("xmm4","xmm4"); # clear 4
&movhlps("xmm6","xmm4"); # clobber low side with zeros
&mov ($round,4);
&set_label("loop_schedule_192");
&call ("_vpaes_schedule_round");
&palignr("xmm0","xmm6",8);
&call ("_vpaes_schedule_mangle"); # save key n
&call ("_vpaes_schedule_192_smear");
&call ("_vpaes_schedule_mangle"); # save key n+1
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle"); # save key n+2
&call ("_vpaes_schedule_192_smear");
&jmp (&label("loop_schedule_192"));
##
## .aes_schedule_256
##
## 256-bit specific part of key schedule.
##
## The structure here is very similar to the 128-bit
## schedule, but with an additional "low side" in
## %xmm6. The low side's rounds are the same as the
## high side's, except no rcon and no rotation.
##
&set_label("schedule_256",16);
&movdqu ("xmm0",&QWP(16,$inp)); # load key part 2 (unaligned)
&call ("_vpaes_schedule_transform"); # input transform
&mov ($round,7);
&set_label("loop_schedule_256");
&call ("_vpaes_schedule_mangle"); # output low result
&movdqa ("xmm6","xmm0"); # save cur_lo in xmm6
# high round
&call ("_vpaes_schedule_round");
&dec ($round);
&jz (&label("schedule_mangle_last"));
&call ("_vpaes_schedule_mangle");
# low round. swap xmm7 and xmm6
&pshufd ("xmm0","xmm0",0xFF);
&movdqa (&QWP(20,"esp"),"xmm7");
&movdqa ("xmm7","xmm6");
&call ("_vpaes_schedule_low_round");
&movdqa ("xmm7",&QWP(20,"esp"));
&jmp (&label("loop_schedule_256"));
##
## .aes_schedule_mangle_last
##
## Mangler for last round of key schedule
## Mangles %xmm0
## when encrypting, outputs out(%xmm0) ^ 63
## when decrypting, outputs unskew(%xmm0)
##
## Always called right before return... jumps to cleanup and exits
##
&set_label("schedule_mangle_last",16);
# schedule last round key from xmm0
&lea ($base,&DWP($k_deskew,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_last_dec"));
# encrypting
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm0","xmm1"); # output permute
&lea ($base,&DWP($k_opt,$const)); # prepare to output transform
&add ($key,32);
&set_label("schedule_mangle_last_dec");
&add ($key,-16);
&pxor ("xmm0",&QWP($k_s63,$const));
&call ("_vpaes_schedule_transform"); # output transform
&movdqu (&QWP(0,$key),"xmm0"); # save last key
# cleanup
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&ret ();
&function_end_B("_vpaes_schedule_core");
##
## .aes_schedule_192_smear
##
## Smear the short, low side in the 192-bit key schedule.
##
## Inputs:
## %xmm7: high side, b a x y
## %xmm6: low side, d c 0 0
## %xmm13: 0
##
## Outputs:
## %xmm6: b+c+d b+c 0 0
## %xmm0: b+c+d b+c b a
##
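## The pshufd immediates below encode four dword selectors, two bits each and
## low dword first: 0x80 = (0,0,0,2) replicates the zero dwords and pulls up
## "c", giving c 0 0 0, while 0xFE = (2,3,3,3) turns b a x y into b b b a.
## The subsequent xors accumulate these into the outputs listed above.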
&function_begin_B("_vpaes_schedule_192_smear");
&pshufd ("xmm1","xmm6",0x80); # d c 0 0 -> c 0 0 0
&pshufd ("xmm0","xmm7",0xFE); # b a _ _ -> b b b a
&pxor ("xmm6","xmm1"); # -> c+d c 0 0
&pxor ("xmm1","xmm1");
&pxor ("xmm6","xmm0"); # -> b+c+d b+c b a
&movdqa ("xmm0","xmm6");
&movhlps("xmm6","xmm1"); # clobber low side with zeros
&ret ();
&function_end_B("_vpaes_schedule_192_smear");
##
## .aes_schedule_round
##
## Runs one main round of the key schedule on %xmm0, %xmm7
##
## Specifically, runs subbytes on the high dword of %xmm0
## then rotates it by one byte and xors into the low dword of
## %xmm7.
##
## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
## next rcon.
##
## Smears the dwords of %xmm7 by xoring the low into the
## second low, result into third, result into highest.
##
## Returns results in %xmm7 = %xmm0.
## Clobbers %xmm1-%xmm5.
##
&function_begin_B("_vpaes_schedule_round");
# extract rcon from xmm8
&movdqa ("xmm2",&QWP(8,"esp")); # xmm8
&pxor ("xmm1","xmm1");
&palignr("xmm1","xmm2",15);
&palignr("xmm2","xmm2",15);
&pxor ("xmm7","xmm1");
# rotate
&pshufd ("xmm0","xmm0",0xFF);
&palignr("xmm0","xmm0",1);
# fall through...
&movdqa (&QWP(8,"esp"),"xmm2"); # xmm8
# low round: same as high round, but no rotation and no rcon.
&set_label("_vpaes_schedule_low_round");
# smear xmm7
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",4);
&pxor ("xmm7","xmm1");
&movdqa ("xmm1","xmm7");
&pslldq ("xmm7",8);
&pxor ("xmm7","xmm1");
&pxor ("xmm7",&QWP($k_s63,$const));
# subbyte
&movdqa ("xmm4",&QWP($k_s0F,$const));
&movdqa ("xmm5",&QWP($k_inv,$const)); # 4 : 1/j
&movdqa ("xmm1","xmm4");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4); # 1 = i
&pand ("xmm0","xmm4"); # 0 = k
&movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
&pshufb ("xmm2","xmm0"); # 2 = a/k
&pxor ("xmm0","xmm1"); # 0 = j
&movdqa ("xmm3","xmm5"); # 3 : 1/i
&pshufb ("xmm3","xmm1"); # 3 = 1/i
&pxor ("xmm3","xmm2"); # 3 = iak = 1/i + a/k
&movdqa ("xmm4","xmm5"); # 4 : 1/j
&pshufb ("xmm4","xmm0"); # 4 = 1/j
&pxor ("xmm4","xmm2"); # 4 = jak = 1/j + a/k
&movdqa ("xmm2","xmm5"); # 2 : 1/iak
&pshufb ("xmm2","xmm3"); # 2 = 1/iak
&pxor ("xmm2","xmm0"); # 2 = io
&movdqa ("xmm3","xmm5"); # 3 : 1/jak
&pshufb ("xmm3","xmm4"); # 3 = 1/jak
&pxor ("xmm3","xmm1"); # 3 = jo
&movdqa ("xmm4",&QWP($k_sb1,$const)); # 4 : sbou
&pshufb ("xmm4","xmm2"); # 4 = sbou
&movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sbot
&pshufb ("xmm0","xmm3"); # 0 = sb1t
&pxor ("xmm0","xmm4"); # 0 = sbox output
# add in smeared stuff
&pxor ("xmm0","xmm7");
&movdqa ("xmm7","xmm0");
&ret ();
&function_end_B("_vpaes_schedule_round");
##
## .aes_schedule_transform
##
## Linear-transform %xmm0 according to tables at (%ebx)
##
## Output in %xmm0
## Clobbers %xmm1, %xmm2
##
&function_begin_B("_vpaes_schedule_transform");
&movdqa ("xmm2",&QWP($k_s0F,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm0");
&psrld ("xmm1",4);
&pand ("xmm0","xmm2");
&movdqa ("xmm2",&QWP(0,$base));
&pshufb ("xmm2","xmm0");
&movdqa ("xmm0",&QWP(16,$base));
&pshufb ("xmm0","xmm1");
&pxor ("xmm0","xmm2");
&ret ();
&function_end_B("_vpaes_schedule_transform");
##
## .aes_schedule_mangle
##
## Mangle xmm0 from (basis-transformed) standard version
## to our version.
##
## On encrypt,
## xor with 0x63
## multiply by circulant 0,1,1,1
## apply shiftrows transform
##
## On decrypt,
## xor with 0x63
## multiply by "inverse mixcolumns" circulant E,B,D,9
## deskew
## apply shiftrows transform
##
##
## Writes out to (%edx), and increments or decrements it
## Keeps track of round number mod 4 in %ecx
## Preserves xmm0
## Clobbers xmm1-xmm5
##
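## On the encrypt path this amounts to three rotate-and-xor steps: writing
## r() for the byte rotation encoded by $k_mc_forward, the code below forms
## r(x) ^ r(r(x)) ^ r(r(r(x))) of the key after the $k_s63 xor, which is the
## multiplication by the circulant (0,1,1,1) mentioned above.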
&function_begin_B("_vpaes_schedule_mangle");
&movdqa ("xmm4","xmm0"); # save xmm0 for later
&movdqa ("xmm5",&QWP($k_mc_forward,$const));
&test ($out,$out);
&jnz (&label("schedule_mangle_dec"));
# encrypting
&add ($key,16);
&pxor ("xmm4",&QWP($k_s63,$const));
&pshufb ("xmm4","xmm5");
&movdqa ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&pshufb ("xmm4","xmm5");
&pxor ("xmm3","xmm4");
&jmp (&label("schedule_mangle_both"));
&set_label("schedule_mangle_dec",16);
# inverse mix columns
&movdqa ("xmm2",&QWP($k_s0F,$const));
&lea ($inp,&DWP($k_dksd,$const));
&movdqa ("xmm1","xmm2");
&pandn ("xmm1","xmm4");
&psrld ("xmm1",4); # 1 = hi
&pand ("xmm4","xmm2"); # 4 = lo
&movdqa ("xmm2",&QWP(0,$inp));
&pshufb ("xmm2","xmm4");
&movdqa ("xmm3",&QWP(0x10,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x20,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x30,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x40,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x50,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&pshufb ("xmm3","xmm5");
&movdqa ("xmm2",&QWP(0x60,$inp));
&pshufb ("xmm2","xmm4");
&pxor ("xmm2","xmm3");
&movdqa ("xmm3",&QWP(0x70,$inp));
&pshufb ("xmm3","xmm1");
&pxor ("xmm3","xmm2");
&add ($key,-16);
&set_label("schedule_mangle_both");
&movdqa ("xmm1",&QWP($k_sr,$const,$magic));
&pshufb ("xmm3","xmm1");
&add ($magic,-16);
&and ($magic,0x30);
&movdqu (&QWP(0,$key),"xmm3");
&ret ();
&function_end_B("_vpaes_schedule_mangle");
#
# Interface to OpenSSL
#
&function_begin("${PREFIX}_set_encrypt_key");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&mov ($magic,0x30);
&mov ($out,0);
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("${PREFIX}_set_encrypt_key");
&function_begin("${PREFIX}_set_decrypt_key");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($round,&wparam(1)); # bits
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&mov ($base,$round);
&shr ($base,5);
&add ($base,5);
&mov (&DWP(240,$key),$base); # AES_KEY->rounds = nbits/32+5;
&shl ($base,4);
&lea ($key,&DWP(16,$key,$base));
&mov ($out,1);
&mov ($magic,$round);
&shr ($magic,1);
&and ($magic,32);
&xor ($magic,32); # nbist==192?0:32;
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_schedule_core");
&set_label("pic_point");
&mov ("esp",&DWP(48,"esp"));
&xor ("eax","eax");
&function_end("${PREFIX}_set_decrypt_key");
&function_begin("${PREFIX}_encrypt");
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_encrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_encrypt");
&function_begin("${PREFIX}_decrypt");
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&mov ($inp,&wparam(0)); # inp
&lea ($base,&DWP(-56,"esp"));
&mov ($out,&wparam(1)); # out
&and ($base,-16);
&mov ($key,&wparam(2)); # key
&xchg ($base,"esp"); # alloca
&mov (&DWP(48,"esp"),$base);
&movdqu ("xmm0",&QWP(0,$inp));
&call ("_vpaes_decrypt_core");
&movdqu (&QWP(0,$out),"xmm0");
&mov ("esp",&DWP(48,"esp"));
&function_end("${PREFIX}_decrypt");
&function_begin("${PREFIX}_cbc_encrypt");
&mov ($inp,&wparam(0)); # inp
&mov ($out,&wparam(1)); # out
&mov ($round,&wparam(2)); # len
&mov ($key,&wparam(3)); # key
&sub ($round,16);
&jc (&label("cbc_abort"));
&lea ($base,&DWP(-56,"esp"));
&mov ($const,&wparam(4)); # ivp
&and ($base,-16);
&mov ($magic,&wparam(5)); # enc
&xchg ($base,"esp"); # alloca
&movdqu ("xmm1",&QWP(0,$const)); # load IV
&sub ($out,$inp);
&mov (&DWP(48,"esp"),$base);
&mov (&DWP(0,"esp"),$out); # save out
&mov (&DWP(4,"esp"),$key) # save key
&mov (&DWP(8,"esp"),$const); # save ivp
&mov ($out,$round); # $out works as $len
&lea ($const,&DWP(&label("_vpaes_consts")."+0x30-".&label("pic_point")));
&call ("_vpaes_preheat");
&set_label("pic_point");
&cmp ($magic,0);
&je (&label("cbc_dec_loop"));
&jmp (&label("cbc_enc_loop"));
&set_label("cbc_enc_loop",16);
&movdqu ("xmm0",&QWP(0,$inp)); # load input
&pxor ("xmm0","xmm1"); # inp^=iv
&call ("_vpaes_encrypt_core");
&mov ($base,&DWP(0,"esp")); # restore out
&mov ($key,&DWP(4,"esp")); # restore key
&movdqa ("xmm1","xmm0");
&movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
&lea ($inp,&DWP(16,$inp));
&sub ($out,16);
&jnc (&label("cbc_enc_loop"));
&jmp (&label("cbc_done"));
&set_label("cbc_dec_loop",16);
&movdqu ("xmm0",&QWP(0,$inp)); # load input
&movdqa (&QWP(16,"esp"),"xmm1"); # save IV
&movdqa (&QWP(32,"esp"),"xmm0"); # save future IV
&call ("_vpaes_decrypt_core");
&mov ($base,&DWP(0,"esp")); # restore out
&mov ($key,&DWP(4,"esp")); # restore key
&pxor ("xmm0",&QWP(16,"esp")); # out^=iv
&movdqa ("xmm1",&QWP(32,"esp")); # load next IV
&movdqu (&QWP(0,$base,$inp),"xmm0"); # write output
&lea ($inp,&DWP(16,$inp));
&sub ($out,16);
&jnc (&label("cbc_dec_loop"));
&set_label("cbc_done");
&mov ($base,&DWP(8,"esp")); # restore ivp
&mov ("esp",&DWP(48,"esp"));
&movdqu (&QWP(0,$base),"xmm1"); # write IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
&asm_finish();
close STDOUT or die "error closing STDOUT: $!";
