diff --git a/CMakeLists.txt b/CMakeLists.txt index 6737c79..be17645 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,12 +18,17 @@ pico_enable_stdio_uart(${PROJECT} 0) pico_enable_stdio_usb(${PROJECT} 1) target_sources(${PROJECT} PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/src/ascon-xofa/hash.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/ascon-xofa/permutations.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/util.c ${CMAKE_CURRENT_SOURCE_DIR}/src/rorand.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/rourand.c ${CMAKE_CURRENT_SOURCE_DIR}/src/main.c ) target_include_directories(${PROJECT} PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/src/ascon-xofa/ ${CMAKE_CURRENT_SOURCE_DIR}/src/ ) diff --git a/src/ascon-xofa/LICENSE b/src/ascon-xofa/LICENSE new file mode 100644 index 0000000..3bbbc1e --- /dev/null +++ b/src/ascon-xofa/LICENSE @@ -0,0 +1,116 @@ +CC0 1.0 Universal + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator and +subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for the +purpose of contributing to a commons of creative, cultural and scientific +works ("Commons") that the public can reliably and without fear of later +claims of infringement build upon, modify, incorporate in other works, reuse +and redistribute as freely as possible in any form whatsoever and for any +purposes, including without limitation commercial purposes. These owners may +contribute to the Commons to promote the ideal of a free culture and the +further production of creative, cultural and scientific works, or to gain +reputation or greater distribution for their Work in part through the use and +efforts of others. 
+ +For these and/or other purposes and motivations, and without any expectation +of additional consideration or compensation, the person associating CC0 with a +Work (the "Affirmer"), to the extent that he or she is an owner of Copyright +and Related Rights in the Work, voluntarily elects to apply CC0 to the Work +and publicly distribute the Work under its terms, with knowledge of his or her +Copyright and Related Rights in the Work and the meaning and intended legal +effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not limited +to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, communicate, + and translate a Work; + + ii. moral rights retained by the original author(s) and/or performer(s); + + iii. publicity and privacy rights pertaining to a person's image or likeness + depicted in a Work; + + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + + v. rights protecting the extraction, dissemination, use and reuse of data in + a Work; + + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation thereof, + including any amended or successor version of such directive); and + + vii. other similar, equivalent or corresponding rights throughout the world + based on applicable law or treaty, and any national implementations thereof. + +2. Waiver. 
To the greatest extent permitted by, but not in contravention of, +applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and +unconditionally waives, abandons, and surrenders all of Affirmer's Copyright +and Related Rights and associated claims and causes of action, whether now +known or unknown (including existing as well as future claims and causes of +action), in the Work (i) in all territories worldwide, (ii) for the maximum +duration provided by applicable law or treaty (including future time +extensions), (iii) in any current or future medium and for any number of +copies, and (iv) for any purpose whatsoever, including without limitation +commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes +the Waiver for the benefit of each member of the public at large and to the +detriment of Affirmer's heirs and successors, fully intending that such Waiver +shall not be subject to revocation, rescission, cancellation, termination, or +any other legal or equitable action to disrupt the quiet enjoyment of the Work +by the public as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason be +judged legally invalid or ineffective under applicable law, then the Waiver +shall be preserved to the maximum extent permitted taking into account +Affirmer's express Statement of Purpose. 
In addition, to the extent the Waiver +is so judged Affirmer hereby grants to each affected person a royalty-free, +non transferable, non sublicensable, non exclusive, irrevocable and +unconditional license to exercise Affirmer's Copyright and Related Rights in +the Work (i) in all territories worldwide, (ii) for the maximum duration +provided by applicable law or treaty (including future time extensions), (iii) +in any current or future medium and for any number of copies, and (iv) for any +purpose whatsoever, including without limitation commercial, advertising or +promotional purposes (the "License"). The License shall be deemed effective as +of the date CC0 was applied by Affirmer to the Work. Should any part of the +License for any reason be judged legally invalid or ineffective under +applicable law, such partial invalidity or ineffectiveness shall not +invalidate the remainder of the License, and in such case Affirmer hereby +affirms that he or she will not (i) exercise any of his or her remaining +Copyright and Related Rights in the Work or (ii) assert any associated claims +and causes of action with respect to the Work, in either case contrary to +Affirmer's express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + + b. Affirmer offers the Work as-is and makes no representations or warranties + of any kind concerning the Work, express, implied, statutory or otherwise, + including without limitation warranties of title, merchantability, fitness + for a particular purpose, non infringement, or the absence of latent or + other defects, accuracy, or the present or absence of errors, whether or not + discoverable, all to the greatest extent permissible under applicable law. + + c. 
Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without limitation + any person's Copyright and Related Rights in the Work. Further, Affirmer + disclaims responsibility for obtaining any necessary consents, permissions + or other rights required for any use of the Work. + + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to this + CC0 or use of the Work. + +For more information, please see +<http://creativecommons.org/publicdomain/zero/1.0/> \ No newline at end of file diff --git a/src/ascon-xofa/README.md b/src/ascon-xofa/README.md new file mode 100644 index 0000000..125080e --- /dev/null +++ b/src/ascon-xofa/README.md @@ -0,0 +1,519 @@ +# Reference, highly optimized, masked C and ASM implementations of Ascon + +Ascon is a family of lightweight cryptographic algorithms and consists of: +- Authenticated encryption schemes with associated data (AEAD) +- Hash functions (HASH) and extendible output functions (XOF) +- Pseudo-random functions (PRF) and message authentication codes (MAC) + +All implementations use the "ECRYPT Benchmarking of Cryptographic Systems (eBACS)" interface: + +- https://bench.cr.yp.to/call-aead.html for AEAD (Ascon-128, Ascon-128a, Ascon-80pq) +- https://bench.cr.yp.to/call-hash.html for HASH and XOF (Ascon-Hash, Ascon-Hasha, Ascon-Xof, Ascon-Xofa) +- https://nacl.cr.yp.to/auth.html for PRF and MAC (Ascon-Mac, Ascon-Prf, Ascon-PrfShort) + +For more information on Ascon visit: https://ascon.iaik.tugraz.at/ + + +## TL;DR + +If you do not know where to start, use the reference implementations (self-contained, portable, very fast): + +- `crypto_aead/ascon128v12/ref` +- `crypto_aead/ascon128av12/ref` +- `crypto_hash/asconxofv12/ref` +- `crypto_hash/asconxofav12/ref` + + +## Algorithms + +This repository contains implementations of the following 10 Ascon v1.2 algorithms: + +- `crypto_aead/ascon128v12`: Ascon-128 +- 
`crypto_aead/ascon128av12`: Ascon-128a +- `crypto_aead/ascon80pqv12`: Ascon-80pq +- `crypto_hash/asconhashv12`: Ascon-Hash +- `crypto_hash/asconhashav12`: Ascon-Hasha +- `crypto_hash/asconxofv12`: Ascon-Xof +- `crypto_hash/asconxofav12`: Ascon-Xofa +- `crypto_auth/asconmacv12`: Ascon-Mac +- `crypto_auth/asconprfv12`: Ascon-Prf +- `crypto_auth/asconprfsv12`: Ascon-PrfShort + +We also provide two combined algorithm implementations supporting both AEAD and +hashing: + +- `crypto_aead_hash/asconv12`: Ascon-128 combined with Ascon-Hash +- `crypto_aead_hash/asconav12`: Ascon-128a combined with Ascon-Hasha + +The following algorithms demonstrate the performance improvement of Ascon on +32-bit platforms without bit interleaving overhead. Bit interleaving could be +performed externally on the host side or using a dedicated instruction (e.g. +using the ARM Custom Datapath Extension). Note that a similar performance +improvement could be achieved using funnel shift instructions (available on some +32-bit RISC-V extensions). 
+ +- `crypto_aead/ascon128bi32v12`: Ascon-128 (+17% on ARM1176JZF-S) +- `crypto_aead/ascon128abi32v12`: Ascon-128a (+23% on ARM1176JZF-S) +- `crypto_hash/asconhashbi32v12`: Ascon-Hash (+5% on ARM1176JZF-S) +- `crypto_hash/asconhashabi32v12`: Ascon-Hasha (+8% on ARM1176JZF-S) +- `crypto_aead_hash/asconbi32v12`: Ascon-128 combined with Ascon-Hash +- `crypto_aead_hash/asconabi32v12`: Ascon-128a combined with Ascon-Hasha + + +## Implementations + +For most algorithms, we provide the following pure C implementations: + +- `ref`: reference implementation +- `opt64`: 64-bit speed-optimized +- `opt32`: 32-bit speed-optimized +- `opt64_lowsize`: 64-bit size-optimized +- `opt32_lowsize`: 32-bit size-optimized +- `bi32`: 32-bit speed-optimized bit-interleaved +- `bi32_lowreg`: 32-bit speed-optimized bit-interleaved (low register usage) +- `bi32_lowsize`: 32-bit size-optimized bit-interleaved +- `esp32`: 32-bit ESP32 optimized +- `opt8`: 8-bit size- and speed-optimized +- `bi8`: 8-bit optimized bit-interleaved + +the following C with inline or partial ASM implementations: + +- `avx512`: 320-bit speed-optimized AVX512 +- `neon`: 64-bit speed-optimized ARM NEON +- `armv6`: 32-bit speed-optimized ARMv6 +- `armv6m`: 32-bit speed-optimized ARMv6-M +- `armv7m`: 32-bit speed-optimized ARMv7-M +- `armv6_lowsize`: 32-bit size-optimized ARMv6 +- `armv6m_lowsize`: 32-bit size-optimized ARMv6-M +- `armv7m_lowsize`: 32-bit size-optimized ARMv7-M +- `armv7m_small`: 32-bit small speed-optimized ARMv7-M +- `bi32_armv6`: 32-bit speed-optimized bit-interleaved ARMv6 +- `bi32_armv6m`: 32-bit speed-optimized bit-interleaved ARMv6-M +- `bi32_armv7m`: 32-bit speed-optimized bit-interleaved ARMv7-M +- `bi32_armv7m_small`: 32-bit small bit-interleaved ARMv7-M +- `avr`: 8-bit size- and speed-optimized AVR +- `avr_lowsize`: 8-bit size-optimized AVR + +the following ASM implementations: + +- `asm_esp32`: 32-bit optimized ESP32 using funnel-shift instructions +- `asm_rv32i`: 32-bit optimized RV32I using 
the base instruction set +- `asm_rv32b`: 32-bit optimized RV32B using bitmanip base (Zbb) +- `asm_fsr_rv32b`: 32-bit optimized funnel-shift RV32B using bitmanip base and bitmanip ternary (ZbbZbt) +- `asm_bi32_rv32b`: 32-bit optimized bit-interleaved RV32B using bitmanip base and bitmanip permutations (ZbbZbp) + +and the following high-level masked (shared) C with inline ASM implementations: + +- `protected_bi32_armv6`: 32-bit masked bit-interleaved ARMv6 +- `protected_bi32_armv6_leveled`: 32-bit masked and leveled bit-interleaved ARMv6 + +The masked C implementations can be used as a starting point to generate +device-specific C/ASM implementations. Note that the masked C implementations +require a minimum amount of ASM instructions. Otherwise, the compiler may +heavily optimize the code and even combine shares. Obviously, the output +generated is very sensitive to compiler and environment changes and any +generated output needs to be security evaluated. A preliminary evaluation of +these implementations has been performed on some +[ChipWhisperer](https://www.newae.com/chipwhisperer) devices. 
The setup and +preliminary results can be found at: https://github.com/ascon/simpleserial-ascon + + +# Performance results on different CPUs in cycles per byte + +## Ascon-128a + +| Message Length in Bytes | 1 | 8 | 16 | 32 | 64 | 1536 | long | +|:-------------------------|-----:|-----:|-----:|-----:|-----:|-----:|-----:| +| AMD EPYC 7742\* | | | | | 7.4 | 4.4 | 4.2 | +| AMD Ryzen 9 5950X\* | | | | | 8.1 | 5.3 | 5.2 | +| Apple M1 (ARMv8)\* | | | | | 9.4 | 6.3 | 6.3 | +| Cortex-A72 (ARMv8)\* | | | | | 10.9 | 7.2 | 7.0 | +| Intel Xeon E5-2609 v4\* | | | | | 11.3 | 7.4 | 7.2 | +| Intel Core i5-6300U | 365 | 47 | 31 | 19 | 13.5 | 8.0 | 7.8 | +| Intel Core i5-4200U | 519 | 67 | 44 | 27 | 18.8 | 11.0 | 10.6 | +| Cortex-A9 (ARMv7)\* | | | | | 42.8 | 24.6 | 24.0 | +| Cortex-A7 (NEON) | 2204 | 226 | 132 | 82 | 55.9 | 31.7 | 30.7 | +| Cortex-A7 (ARMv7)\* | | | | | 55.5 | 38.2 | 37.5 | +| ARM1176JZF-S (ARMv6) | 1908 | 235 | 156 | 99 | 70.4 | 43.0 | 42.9 | + + +## Ascon-128 and Ascon-80pq + +| Message Length in Bytes | 1 | 8 | 16 | 32 | 64 | 1536 | long | +|:-------------------------|-----:|-----:|-----:|-----:|-----:|-----:|-----:| +| AMD EPYC 7742\* | | | | | 8.1 | 6.6 | 6.5 | +| AMD Ryzen 9 5950X\* | | | | | 11.0 | 8.2 | 8.1 | +| Apple M1 (ARMv8)\* | | | | | 12.5 | 9.5 | 9.3 | +| Cortex-A72 (ARMv8)\* | | | | | 13.8 | 10.7 | 10.5 | +| Intel Xeon E5-2609 v4\* | | | | | 14.9 | 10.8 | 10.6 | +| Intel Core i5-6300U | 367 | 58 | 35 | 23 | 17.6 | 11.9 | 11.4 | +| Intel Core i5-4200U | 521 | 81 | 49 | 32 | 23.9 | 16.2 | 15.8 | +| Cortex-A9 (ARMv7)\* | | | | | 51.7 | 34.1 | 33.3 | +| Cortex-A7 (NEON) | 2182 | 249 | 148 | 97 | 71.7 | 47.5 | 46.5 | +| Cortex-A7 (ARMv7)\* | | | | | 69.6 | 52.0 | 51.6 | +| ARM1176JZF-S (ARMv6) | 1921 | 277 | 167 | 112 | 83.7 | 57.2 | 56.8 | + + +## Ascon-Hasha and Ascon-Xofa + +| Message Length in Bytes | 1 | 8 | 16 | 32 | 64 | 1536 | long | +|:-------------------------|-----:|-----:|-----:|-----:|------:|-----:|-----:| +| AMD EPYC 7742\* | | | | | | | | 
+| AMD Ryzen 7 1700\* | | | | | 22.0 | 12.1 | 11.7 | +| Apple M1 (ARMv8)\* | | | | | | | | +| Cortex-A72 (ARMv8)\* | | | | | 22.2 | 14.5 | 14.2 | +| Intel Xeon E5-2609 v4\* | | | | | 23.3 | 14.4 | 14.0 | +| Intel Core i5-6300U | 550 | 83 | 49 | 33 | 23.7 | 15.6 | 15.5 | +| Intel Core i5-4200U | 749 | 112 | 67 | 44 | 31.8 | 20.8 | 20.7 | +| Cortex-A9 (ARMv7)\* | | | | | 87.5 | 45.6 | 44.0 | +| Cortex-A7 (ARMv7)\* | | | | | 102.3 | 63.5 | 61.8 | +| ARM1176JZF-S (ARMv6) | 2390 | 356 | 211 | 138 | 100.7 | 65.7 | 65.3 | + + +## Ascon-Hash and Ascon-Xof + +| Message Length in Bytes | 1 | 8 | 16 | 32 | 64 | 1536 | long | +|:-------------------------|-----:|-----:|-----:|-----:|------:|-----:|-----:| +| AMD EPYC 7742\* | | | | | 21.1 | 13.3 | 12.4 | +| AMD Ryzen 9 5950X\* | | | | | 24.1 | 16.1 | 15.8 | +| Apple M1 (ARMv8)\* | | | | | 29.2 | 19.6 | 18.5 | +| Cortex-A72 (ARMv8)\* | | | | | 30.5 | 20.5 | 20.0 | +| Intel Xeon E5-2609 v4\* | | | | | 31.9 | 21.4 | 21.2 | +| Intel Core i5-6300U | 747 | 114 | 69 | 46 | 34.2 | 23.2 | 23.1 | +| Intel Core i5-4200U | 998 | 153 | 92 | 61 | 45.5 | 30.9 | 30.7 | +| Cortex-A9 (ARMv7)\* | | | | | 95.8 | 55.5 | 53.9 | +| Cortex-A7 (ARMv7)\* | | | | | 138.1 | 89.9 | 88.8 | +| ARM1176JZF-S (ARMv6) | 3051 | 462 | 277 | 184 | 137.3 | 92.6 | 92.2 | + + +## Ascon-Mac and Ascon-Prf + +| Message Length in Bytes | 1 | 8 | 16 | 32 | 64 | 1536 | long | +|:-------------------------|-----:|-----:|-----:|-----:|-----:|-----:|-----:| +| Intel Core i5-6300U | 369 | 46 | 24 | 18 | 11.7 | 6.4 | 6.3 | +| Intel Core i5-4200U | 506 | 63 | 32 | 24 | 16.2 | 8.8 | 8.7 | +| ARM1176JZF-S (ARMv6) | 1769 | 223 | 117 | 85 | 57.5 | 31.9 | 31.6 | + + +## Ascon-PrfShort + +| Message Length in Bytes | 1 | 8 | 16 | 32 | 64 | 1536 | long | +|:-------------------------|-----:|-----:|-----:|-----:|-----:|-----:|-----:| +| Intel Core i5-6300U | 185 | 23 | 12 | - | - | - | - | +| Intel Core i5-4200U | 257 | 33 | 17 | - | - | - | - | +| ARM1176JZF-S (ARMv6) | 1057 | 132 | 69 | - 
| - | - | - | + +\* Results taken from eBACS: http://bench.cr.yp.to/ + + +# Build and test + +Build and test all Ascon C targets using release flags (-O2 -fomit-frame-pointer -march=native -mtune=native): + +``` +mkdir build && cd build +cmake .. +cmake --build . +ctest +``` + + +Build and test all Ascon C targets on Windows: + +``` +mkdir build && cd build +cmake .. +cmake --build . --config Release +ctest -C Release +``` + + +Build and test all Ascon C targets using debug flags (with NIST defined flags and sanitizers): + +``` +mkdir build && cd build +cmake .. -DCMAKE_BUILD_TYPE=Debug +cmake --build . +ctest +``` + +Manually set the compiler and/or release flags (e.g. to disable -march=native -mtune=native). + +``` +mkdir build && cd build +cmake .. -DCMAKE_C_COMPILER=clang -DREL_FLAGS="-O2;-fomit-frame-pointer" +cmake --build . +ctest +``` + +Build and run only specific algorithms, implementations and tests: + +``` +mkdir build && cd build +cmake .. -DALG_LIST="ascon128;asconhash" -DIMPL_LIST="opt64;bi32" -DTEST_LIST="genkat" +cmake --build . +ctest +``` + +Note that cmake stores variables in a cache. Therefore, variables can be set +one-by-one, unset using e.g. `cmake . -UIMPL_LIST` and shown using `cmake . -L`: + +``` +mkdir build && cd build +cmake .. +cmake . -DALG_LIST="ascon128;asconhash" +cmake . -DIMPL_LIST="opt64;bi32" +cmake . -DTEST_LIST="genkat" +cmake . -L +cmake --build . +ctest +``` + +Cross compile and test with custom emulator using e.g. `qemu-arm`: + +``` +mkdir build && cd build +cmake .. -DCMAKE_C_COMPILER="arm-linux-gnueabi-gcc" \ + -DREL_FLAGS="-O2;-fomit-frame-pointer;-march=armv7;-mtune=cortex-m4" \ + -DEMULATOR="qemu-arm;-L;/usr/arm-linux-gnueabi" \ + -DALG_LIST="ascon128;ascon128a" -DIMPL_LIST="armv7m;bi32_armv7m" +cmake --build . +ctest +``` + +or using Intel SDE (use full path to `sde` or add to path variable): + +``` +mkdir build && cd build +cmake .. 
-DCMAKE_C_COMPILER=gcc -DIMPL_LIST=avx512 -DEMULATOR="sde;--" \ + -DREL_FLAGS="-O2;-fomit-frame-pointer;-march=icelake-client" +cmake --build . +ctest +``` + + +# Build and benchmark: + +Build the getcycles test: + +``` +mkdir build && cd build +cmake .. -DALG_LIST="ascon128;asconhash" -DIMPL_LIST="opt32;opt32_lowsize" -DTEST_LIST="getcycles" +cmake --build . +``` + +Get the CPU cycle performance: + +``` +./getcycles_crypto_aead_ascon128v12_opt32 +./getcycles_crypto_aead_ascon128v12_opt32_lowsize +./getcycles_crypto_hash_asconhashv12_opt32 +./getcycles_crypto_hash_asconhashv12_opt32_lowsize +``` + +Get the implementation size: + +``` +size -t libcrypto_aead_ascon128v12_opt32.a +size -t libcrypto_aead_ascon128v12_opt32_lowsize.a +size -t libcrypto_hash_asconhashv12_opt32.a +size -t libcrypto_hash_asconhashv12_opt32_lowsize.a +``` + + +# Manually build and run a single Ascon target: + +Build example for AEAD algorithms: + +``` +gcc -march=native -O3 -Icrypto_aead/ascon128v12/opt64 crypto_aead/ascon128v12/opt64/*.c -Itests tests/genkat_aead.c -o genkat +gcc -march=native -O3 -Icrypto_aead/ascon128v12/opt64 crypto_aead/ascon128v12/opt64/*.c -DCRYPTO_AEAD -Itests tests/getcycles.c -o getcycles +``` + +Build example for HASH algorithms: + +``` +gcc -march=native -O3 -Icrypto_hash/asconhashv12/opt64 crypto_hash/asconhashv12/opt64/*.c -Itests tests/genkat_hash.c -o genkat +gcc -march=native -O3 -Icrypto_hash/asconhashv12/opt64 crypto_hash/asconhashv12/opt64/*.c -DCRYPTO_HASH -Itests tests/getcycles.c -o getcycles +``` + +Generate KATs and get CPU cycles: + +``` +./genkat +./getcycles +``` + + +## Manually build and run an RV32 target: + + +Setup: + +``` +sudo apt install gcc-riscv64-unknown-elf picolibc-riscv64-unknown-elf qemu-system-misc +``` + +Example to build, run and test an AEAD/HASH algorithm using `gcc`, `picolibc` and `qemu`: + +``` +riscv64-unknown-elf-gcc -O2 -march=rv32i -mabi=ilp32 --specs=picolibc.specs --oslib=semihost --crt0=hosted -Ttests/rv32.ld \ + 
-Icrypto_aead/ascon128v12/asm_rv32i crypto_aead/ascon128v12/asm_rv32i/*.[cS] -Itests tests/genkat_aead.c -o genkat +qemu-system-riscv32 -semihosting-config enable=on -monitor none -serial none -nographic -machine virt,accel=tcg -cpu rv32 -bios none -kernel genkat +diff LWC_AEAD_KAT_128_128.txt crypto_aead/ascon128v12/LWC_AEAD_KAT_128_128.txt +``` + +``` +riscv64-unknown-elf-gcc -O2 -march=rv32i -mabi=ilp32 --specs=picolibc.specs --oslib=semihost --crt0=hosted -Ttests/rv32.ld \ + -Icrypto_hash/asconhashv12/opt32 crypto_hash/asconhashv12/opt32/*.[cS] -Itests tests/genkat_hash.c -o genkat +qemu-system-riscv32 -semihosting-config enable=on -monitor none -serial none -nographic -machine virt,accel=tcg -cpu rv32 -bios none -kernel genkat +diff LWC_HASH_KAT_256.txt crypto_hash/asconhashv12/LWC_HASH_KAT_256.txt +``` + + +## Manually build and run an AVR target: + +Example to build, run and test an AEAD algorithm using `avr-gcc`, `avr-libc` and `simavr`. + +Setup: + +``` +sudo apt install gcc-avr avr-libc simavr +git clone https://github.com/JohannCahier/avr_uart.git +``` + +Single test vector using `demo` and performance measurement using `getcycles`: + +``` +avr-gcc -mmcu=atmega128 -std=c99 -Os -Icrypto_aead/ascon128v12/opt8 crypto_aead/ascon128v12/opt8/*.[cS] \ + -DAVR_UART -Iavr_uart avr_uart/avr_uart.c -Wno-incompatible-pointer-types -Wno-cpp \ + -DCRYPTO_AEAD -Itests tests/demo.c -o demo +simavr -m atmega128 ./demo +``` +``` +avr-gcc -mmcu=atmega128 -std=c99 -Os -Icrypto_aead/ascon128v12/opt8 crypto_aead/ascon128v12/opt8/*.[cS] \ + -DAVR_UART -Iavr_uart avr_uart/avr_uart.c -Wno-incompatible-pointer-types -Wno-cpp \ + -DCRYPTO_AEAD -Itests tests/getcycles.c -o getcycles +simavr -t -m atmega128 ./getcycles +``` + +Generate all test vectors for AEAD/HASH and write result to a file. 
Press Ctrl-C to quit `simavr` after about a minute: + +``` +avr-gcc -mmcu=atmega128 -std=c99 -Os -Icrypto_aead/ascon128v12/opt8 crypto_aead/ascon128v12/opt8/*.[cS] \ + -DAVR_UART -Iavr_uart avr_uart/avr_uart.c -Wno-incompatible-pointer-types -Wno-cpp \ + -Itests tests/genkat_aead.c -o genkat_aead +echo "Press Ctrl-C to quit simavr after about a minute" +simavr -t -m atmega128 ./genkat_aead 2> LWC_AEAD_KAT_128_128.txt +sed -i -e 's/\x1b\[[0-9;]*m//g' -e 's/\.\.$//' LWC_AEAD_KAT_128_128.txt +diff LWC_AEAD_KAT_128_128.txt crypto_aead/ascon128v12/LWC_AEAD_KAT_128_128.txt +``` + +``` +avr-gcc -mmcu=atmega128 -std=c99 -Os -Icrypto_hash/asconhashv12/opt8 crypto_hash/asconhashv12/opt8/*.[cS] \ + -DAVR_UART -Iavr_uart avr_uart/avr_uart.c -Wno-incompatible-pointer-types -Wno-cpp \ + -Itests tests/genkat_hash.c -o genkat_hash +echo "Press Ctrl-C to quit simavr after about a minute" +simavr -t -m atmega128 ./genkat_hash 2> LWC_HASH_KAT_256.txt +sed -i -e 's/\x1b\[[0-9;]*m//g' -e 's/\.\.$//' LWC_HASH_KAT_256.txt +diff LWC_HASH_KAT_256.txt crypto_hash/asconhashv12/LWC_HASH_KAT_256.txt +``` + + +# Benchmarking + +## Hints to get more reliable getcycles results on Intel/AMD CPUs: + +* Determine the processor base frequency (also called design frequency): + - e.g. using the Intel/AMD website + - or using `lscpu` listed under model name + +* Disable turbo boost (this should lock the frequency to the next value + below the processor base frequency): + ``` + echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo + ``` + +* If the above does not work, manually set the frequency using e.g. `cpufreq-set`. + +* Determine the actual frequency (under load): + - e.g. 
by watching the frequency using `lscpu` or `cpufreq-info` + +* Determine the scaling factor between the actual and base frequency: + - factor = actual frequency / base frequency + +* Run a getcycles program using the frequency factor and watch the results: + ``` + while true; do ./getcycles_crypto_aead_ascon128v12_opt64 $factor; done + ``` + +* Run the `benchmark-getcycles.sh` script with the frequency factor and a + specific algorithm to benchmark all corresponding getcycles implementations: + ``` + scripts/benchmark-getcycles.sh $factor ascon128 + ``` + + +## Hints to activate the performance monitor unit (PMU) on ARM CPUs: + +* First try to install `linux-tools` and see if it works. + +* On many ARM platforms, the PMU has to be enabled using a kernel module: + - Source code for Armv6 (32-bit): + + - Source code for Armv7 (32-bit): + + - Source code for Armv8/Aarch64 (64-bit): + + +* Steps to compile the kernel module on the raspberry pi: + - Find out the kernel version using `uname -a` + - Download the kernel header files, e.g. 
`raspberrypi-kernel-header` + - Download the source code for the Armv6 kernel module + - Build, install and load the kernel module + + +## Benchmark Ascon v1.2 using supercop + +Download supercop according to the website: http://bench.cr.yp.to/supercop.html + +To test only Ascon, just run the following commands: + +``` +./do-part init +./do-part crypto_aead ascon128v12 +./do-part crypto_aead ascon128av12 +./do-part crypto_aead ascon80pqv12 +./do-part crypto_hash asconhashv12 +./do-part crypto_hash asconxofv12 +``` + +Show the cycles/Byte for a 1536 Byte long message: + +``` +cat bench/*/data | grep '_cycles 1536 ' | awk '{printf "%.1f\t%s\t%s\n", $9/$8, +$6, $7}' | sort -nr +``` + + +## Evaluate and optimize Ascon on constrained devices: + +* The ascon-c code allows setting the compile-time parameters `ASCON_INLINE_MODE` + (IM), `ASCON_INLINE_PERM` (IP), `ASCON_UNROLL_LOOPS` (UL), `ASCON_INLINE_BI` + (IB), via command line or in the `crypto_*/ascon*/*/config.h` files. +* Use the `benchmark-config.sh` script to evaluate all combinations of these + parameters for a given list of Ascon implementations. The script is called + with an output file, frequency factor, the algorithm, and the list of + implementations to test: + ``` + scripts/benchmark-config.sh results-config.md $factor ascon128 ref opt64 opt64_lowsize + ``` +* The `results-config.md` file then contains a markup table with size and cycles + for each implementation and parameter set to evaluate several time-area + trade-offs. 
+* The `benchmark-all.sh` and `benchmark-size.sh` scripts provide a time/size + and size-only table of all currently compiled implementations: + ``` + scripts/benchmark-all.sh results-all.md + scripts/benchmark-size.sh results-size.md + ``` diff --git a/src/ascon-xofa/api.h b/src/ascon-xofa/api.h new file mode 100644 index 0000000..6f9efc3 --- /dev/null +++ b/src/ascon-xofa/api.h @@ -0,0 +1,4 @@ +#define CRYPTO_VERSION "1.2.7" +#define CRYPTO_BYTES 32 +#define ASCON_HASH_BYTES 0 /* XOF */ +#define ASCON_HASH_ROUNDS 8 diff --git a/src/ascon-xofa/architectures b/src/ascon-xofa/architectures new file mode 100644 index 0000000..a07c7a4 --- /dev/null +++ b/src/ascon-xofa/architectures @@ -0,0 +1,3 @@ +aarch64 +armeabi +arm diff --git a/src/ascon-xofa/ascon.h b/src/ascon-xofa/ascon.h new file mode 100644 index 0000000..c2ee57b --- /dev/null +++ b/src/ascon-xofa/ascon.h @@ -0,0 +1,53 @@ +#ifndef ASCON_H_ +#define ASCON_H_ + +#include <stdint.h> + +#include "api.h" +#include "config.h" +/* Ascon state: five 64-bit words (x), also addressable as 32-bit halves (w) or single bytes (b). */ +typedef union { + uint64_t x[5]; + uint32_t w[5][2]; + uint8_t b[5][8]; +} ascon_state_t; + +#ifdef ASCON_AEAD_RATE +/* Key size in 64-bit words, rounded up; parenthesised so the macro is safe inside larger expressions. */ +#define ASCON_KEYWORDS ((CRYPTO_KEYBYTES + 7) / 8) +/* Key storage with the same word / half-word / byte views as the state. */ +typedef union { + uint64_t x[ASCON_KEYWORDS]; + uint32_t w[ASCON_KEYWORDS][2]; + uint8_t b[ASCON_KEYWORDS][8]; +} ascon_key_t; + +#if !ASCON_INLINE_MODE +/* AEAD entry points; only declared here when the mode is not inlined. */ +void ascon_loadkey(ascon_key_t* key, const uint8_t* k); +void ascon_initaead(ascon_state_t* s, const ascon_key_t* key, + const uint8_t* npub); +void ascon_adata(ascon_state_t* s, const uint8_t* ad, uint64_t adlen); +void ascon_encrypt(ascon_state_t* s, uint8_t* c, const uint8_t* m, + uint64_t mlen); +void ascon_decrypt(ascon_state_t* s, uint8_t* m, const uint8_t* c, + uint64_t clen); +void ascon_final(ascon_state_t* s, const ascon_key_t* k); + +#endif + +#endif + +#ifdef ASCON_HASH_BYTES + +#if !ASCON_INLINE_MODE +/* Hash/XOF entry points; only declared here when the mode is not inlined. */ +void ascon_inithash(ascon_state_t* s); +void ascon_absorb(ascon_state_t* s, const uint8_t* in, uint64_t inlen); +void ascon_squeeze(ascon_state_t* s, uint8_t* out, 
uint64_t outlen); + +#endif + +#endif + +#endif /* ASCON_H_ */ diff --git a/src/ascon-xofa/bendian.h b/src/ascon-xofa/bendian.h new file mode 100644 index 0000000..4691995 --- /dev/null +++ b/src/ascon-xofa/bendian.h @@ -0,0 +1,39 @@ +#ifndef ENDIAN_H_ +#define ENDIAN_H_ + +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ + +/* macros for big endian machines */ +#ifdef PRAGMA_ENDIAN +#pragma message("Using macros for big endian machines") +#endif +#define U64BIG(x) (x) +#define U32BIG(x) (x) +#define U16BIG(x) (x) + +#elif defined(_MSC_VER) || \ + (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + +/* macros for little endian machines */ +#ifdef PRAGMA_ENDIAN +#pragma message("Using macros for little endian machines") +#endif +#define U64BIG(x) \ + (((0x00000000000000FFULL & (x)) << 56) | \ + ((0x000000000000FF00ULL & (x)) << 40) | \ + ((0x0000000000FF0000ULL & (x)) << 24) | \ + ((0x00000000FF000000ULL & (x)) << 8) | \ + ((0x000000FF00000000ULL & (x)) >> 8) | \ + ((0x0000FF0000000000ULL & (x)) >> 24) | \ + ((0x00FF000000000000ULL & (x)) >> 40) | \ + ((0xFF00000000000000ULL & (x)) >> 56)) +#define U32BIG(x) \ + (((0x000000FF & (x)) << 24) | ((0x0000FF00 & (x)) << 8) | \ + ((0x00FF0000 & (x)) >> 8) | ((0xFF000000 & (x)) >> 24)) +#define U16BIG(x) (((0x00FF & (x)) << 8) | ((0xFF00 & (x)) >> 8)) + +#else +#error "Ascon byte order macros not defined in bendian.h" +#endif + +#endif /* ENDIAN_H_ */ diff --git a/src/ascon-xofa/config.h b/src/ascon-xofa/config.h new file mode 100644 index 0000000..66a1156 --- /dev/null +++ b/src/ascon-xofa/config.h @@ -0,0 +1,19 @@ +#ifndef CONFIG_H_ +#define CONFIG_H_ + +/* inline the ascon mode */ +#ifndef ASCON_INLINE_MODE +#define ASCON_INLINE_MODE 0 +#endif + +/* inline all permutations */ +#ifndef ASCON_INLINE_PERM +#define ASCON_INLINE_PERM 1 +#endif + +/* unroll permutation loops */ +#ifndef ASCON_UNROLL_LOOPS +#define ASCON_UNROLL_LOOPS 1 +#endif + +#endif /* CONFIG_H_ */ diff --git 
a/src/ascon-xofa/constants.h b/src/ascon-xofa/constants.h new file mode 100644 index 0000000..80eac8d --- /dev/null +++ b/src/ascon-xofa/constants.h @@ -0,0 +1,90 @@ +#ifndef CONSTANTS_H_ +#define CONSTANTS_H_ + +#include <stdint.h> + +#define ASCON_128_KEYBYTES 16 +#define ASCON_128A_KEYBYTES 16 +#define ASCON_80PQ_KEYBYTES 20 + +#define ASCON_128_RATE 8 +#define ASCON_128A_RATE 16 +#define ASCON_HASH_RATE 8 +#define ASCON_PRF_IN_RATE 32 +#define ASCON_PRFA_IN_RATE 40 +#define ASCON_PRF_OUT_RATE 16 + +#define ASCON_128_PA_ROUNDS 12 +#define ASCON_128_PB_ROUNDS 6 +#define ASCON_128A_PA_ROUNDS 12 +#define ASCON_128A_PB_ROUNDS 8 + +#define ASCON_HASH_PA_ROUNDS 12 +#define ASCON_HASH_PB_ROUNDS 12 +#define ASCON_HASHA_PA_ROUNDS 12 +#define ASCON_HASHA_PB_ROUNDS 8 + +#define ASCON_PRF_PA_ROUNDS 12 +#define ASCON_PRF_PB_ROUNDS 12 +#define ASCON_PRFA_PA_ROUNDS 12 +#define ASCON_PRFA_PB_ROUNDS 8 + +#define ASCON_128_IV 0x80400c0600000000ull +#define ASCON_128A_IV 0x80800c0800000000ull +#define ASCON_80PQ_IV 0xa0400c0600000000ull + +#define ASCON_HASH_IV 0x00400c0000000100ull +#define ASCON_HASHA_IV 0x00400c0400000100ull +#define ASCON_XOF_IV 0x00400c0000000000ull +#define ASCON_XOFA_IV 0x00400c0400000000ull +/* *_IV0..*_IV4: five-word initial states, one set per hash/XOF variant (hash.c copies the selected set into the state). */ +#define ASCON_HASH_IV0 0xee9398aadb67f03dull +#define ASCON_HASH_IV1 0x8bb21831c60f1002ull +#define ASCON_HASH_IV2 0xb48a92db98d5da62ull +#define ASCON_HASH_IV3 0x43189921b8f8e3e8ull +#define ASCON_HASH_IV4 0x348fa5c9d525e140ull + +#define ASCON_HASHA_IV0 0x01470194fc6528a6ull +#define ASCON_HASHA_IV1 0x738ec38ac0adffa7ull +#define ASCON_HASHA_IV2 0x2ec8e3296c76384cull +#define ASCON_HASHA_IV3 0xd6f6a54d7f52377dull +#define ASCON_HASHA_IV4 0xa13c42a223be8d87ull + +#define ASCON_XOF_IV0 0xb57e273b814cd416ull +#define ASCON_XOF_IV1 0x2b51042562ae2420ull +#define ASCON_XOF_IV2 0x66a3a7768ddf2218ull +#define ASCON_XOF_IV3 0x5aad0a7a8153650cull +#define ASCON_XOF_IV4 0x4f3e0e32539493b6ull + +#define ASCON_XOFA_IV0 0x44906568b77b9832ull +#define ASCON_XOFA_IV1 0xcd8d6cae53455532ull 
+#define ASCON_XOFA_IV2 0xf7b5212756422129ull +#define ASCON_XOFA_IV3 0x246885e1de0d225bull +#define ASCON_XOFA_IV4 0xa8cb5ce33449973full + +#define ASCON_MAC_IV 0x80808c0000000080ull +#define ASCON_MACA_IV 0x80808c0400000080ull +#define ASCON_PRF_IV 0x80808c0000000000ull +#define ASCON_PRFA_IV 0x80808c0400000000ull +#define ASCON_PRFS_IV 0x80004c8000000000ull +/* RC0..RCb: twelve one-byte constants, presumably one per permutation round (their use is outside this chunk). */ +#define RC0 0xf0 +#define RC1 0xe1 +#define RC2 0xd2 +#define RC3 0xc3 +#define RC4 0xb4 +#define RC5 0xa5 +#define RC6 0x96 +#define RC7 0x87 +#define RC8 0x78 +#define RC9 0x69 +#define RCa 0x5a +#define RCb 0x4b + +#define RC(i) (i) + +#define START(n) ((3 + (n)) << 4 | (12 - (n))) +#define INC -0x0f +#define END 0x3c + +#endif /* CONSTANTS_H_ */ diff --git a/src/ascon-xofa/forceinline.h b/src/ascon-xofa/forceinline.h new file mode 100644 index 0000000..e66c1eb --- /dev/null +++ b/src/ascon-xofa/forceinline.h @@ -0,0 +1,23 @@ +#ifndef FORCEINLINE_H_ +#define FORCEINLINE_H_ + +/* define forceinline macro: pick the strongest "always inline" spelling the compiler offers, falling back to plain inline */ +#ifdef _MSC_VER +#define forceinline __forceinline +#elif defined(__GNUC__) +#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +#define forceinline inline __attribute__((__always_inline__)) +#else +#define forceinline static inline +#endif +#elif defined(__clang__) /* Clang predefines __clang__ (lowercase); __CLANG__ is never defined */ +#if __has_attribute(__always_inline__) +#define forceinline inline __attribute__((__always_inline__)) +#else +#define forceinline inline +#endif +#else +#define forceinline inline +#endif + +#endif /* FORCEINLINE_H_ */ diff --git a/src/ascon-xofa/goal-constbranch b/src/ascon-xofa/goal-constbranch new file mode 100644 index 0000000..1a9c048 --- /dev/null +++ b/src/ascon-xofa/goal-constbranch @@ -0,0 +1 @@ +Branches reviewed 2020-11-13 by Martin Schläffer. diff --git a/src/ascon-xofa/goal-constindex b/src/ascon-xofa/goal-constindex new file mode 100644 index 0000000..316d11d --- /dev/null +++ b/src/ascon-xofa/goal-constindex @@ -0,0 +1 @@ +Addresses reviewed 2020-11-13 by Martin Schläffer. 
diff --git a/src/ascon-xofa/hash.c b/src/ascon-xofa/hash.c new file mode 100644 index 0000000..54bc8e4 --- /dev/null +++ b/src/ascon-xofa/hash.c @@ -0,0 +1,89 @@ +#include "api.h" +#include "ascon.h" +/*#include "crypto_hash.h"*/ +#include "permutations.h" +#include "printstate.h" + +#if !ASCON_INLINE_MODE +#undef forceinline +#define forceinline /* empty: the mode functions below are then emitted as ordinary functions */ +#endif + +#ifdef ASCON_HASH_BYTES + +forceinline void ascon_inithash(ascon_state_t* s) { + int i; + /* initialize: the ASCON_PRINT_STATE branch only derives the state from the IV for tracing; the state is always (re)loaded from the precomputed iv[] table below */ +#ifdef ASCON_PRINT_STATE +#if ASCON_HASH_BYTES == 32 && ASCON_HASH_ROUNDS == 12 + s->x[0] = ASCON_HASH_IV; +#elif ASCON_HASH_BYTES == 32 && ASCON_HASH_ROUNDS == 8 + s->x[0] = ASCON_HASHA_IV; +#elif ASCON_HASH_BYTES == 0 && ASCON_HASH_ROUNDS == 12 + s->x[0] = ASCON_XOF_IV; +#elif ASCON_HASH_BYTES == 0 && ASCON_HASH_ROUNDS == 8 + s->x[0] = ASCON_XOFA_IV; +#endif + for (i = 1; i < 5; ++i) s->x[i] = 0; + printstate("initial value", s); + P(s, 12); +#endif +#if ASCON_HASH_BYTES == 32 && ASCON_HASH_ROUNDS == 12 + const uint64_t iv[5] = {ASCON_HASH_IV0, ASCON_HASH_IV1, ASCON_HASH_IV2, + ASCON_HASH_IV3, ASCON_HASH_IV4}; +#elif ASCON_HASH_BYTES == 32 && ASCON_HASH_ROUNDS == 8 + const uint64_t iv[5] = {ASCON_HASHA_IV0, ASCON_HASHA_IV1, ASCON_HASHA_IV2, + ASCON_HASHA_IV3, ASCON_HASHA_IV4}; +#elif ASCON_HASH_BYTES == 0 && ASCON_HASH_ROUNDS == 12 + const uint64_t iv[5] = {ASCON_XOF_IV0, ASCON_XOF_IV1, ASCON_XOF_IV2, + ASCON_XOF_IV3, ASCON_XOF_IV4}; +#elif ASCON_HASH_BYTES == 0 && ASCON_HASH_ROUNDS == 8 + const uint64_t iv[5] = {ASCON_XOFA_IV0, ASCON_XOFA_IV1, ASCON_XOFA_IV2, + ASCON_XOFA_IV3, ASCON_XOFA_IV4}; +#endif + for (i = 0; i < 5; ++i) s->x[i] = (iv[i]); + printstate("initialization", s); +} + +forceinline void ascon_absorb(ascon_state_t* s, const uint8_t* in, + uint64_t inlen) { + /* absorb full plaintext blocks */ + while (inlen >= ASCON_HASH_RATE) { + s->x[0] ^= LOAD(in, 8); + printstate("absorb plaintext", s); + P(s, ASCON_HASH_ROUNDS); + in += ASCON_HASH_RATE; + inlen -= ASCON_HASH_RATE; + } + /* absorb
final plaintext block */ + s->x[0] ^= LOADBYTES(in, inlen); + s->x[0] ^= PAD(inlen); /* the padded block is XORed in but not permuted here; ascon_squeeze begins with a permutation */ + printstate("pad plaintext", s); +} + +forceinline void ascon_squeeze(ascon_state_t* s, uint8_t* out, + uint64_t outlen) { + /* squeeze full output blocks */ + P(s, 12); /* runs on every call, before the first output word is taken */ + while (outlen > ASCON_HASH_RATE) { /* strict greater-than: the last (up to 8) bytes are written by STOREBYTES below */ + STORE(out, s->x[0], 8); + printstate("squeeze output", s); + P(s, ASCON_HASH_ROUNDS); + out += ASCON_HASH_RATE; + outlen -= ASCON_HASH_RATE; + } + /* squeeze final output block */ + STOREBYTES(out, s->x[0], outlen); + printstate("squeeze output", s); +} + +/*int crypto_hash(unsigned char* out, const unsigned char* in, + unsigned long long inlen) { + ascon_state_t s; + ascon_inithash(&s); + ascon_absorb(&s, in, inlen); + ascon_squeeze(&s, out, CRYPTO_BYTES); + return 0; +}*/ + +#endif diff --git a/src/ascon-xofa/implementors b/src/ascon-xofa/implementors new file mode 100644 index 0000000..b110c1a --- /dev/null +++ b/src/ascon-xofa/implementors @@ -0,0 +1,2 @@ +Christoph Dobraunig +Martin Schläffer diff --git a/src/ascon-xofa/permutations.c b/src/ascon-xofa/permutations.c new file mode 100644 index 0000000..02bbadb --- /dev/null +++ b/src/ascon-xofa/permutations.c @@ -0,0 +1,29 @@ +#include "permutations.h" + +#if !ASCON_INLINE_PERM && ASCON_UNROLL_LOOPS /* out-of-line wrappers for the non-inlined, unrolled configuration */ + +void P12(ascon_state_t* s) { P12ROUNDS(s); } + +#endif + +#if ((defined(ASCON_AEAD_RATE) && ASCON_AEAD_RATE == 16) || \ + (defined(ASCON_HASH_ROUNDS) && ASCON_HASH_ROUNDS == 8) || \ + (defined(ASCON_PRF_ROUNDS) && ASCON_PRF_ROUNDS == 8)) && \ + !ASCON_INLINE_PERM && ASCON_UNROLL_LOOPS + +void P8(ascon_state_t* s) { P8ROUNDS(s); } + +#endif + +#if (defined(ASCON_AEAD_RATE) && ASCON_AEAD_RATE == 8) && \ + !ASCON_INLINE_PERM && ASCON_UNROLL_LOOPS + +void P6(ascon_state_t* s) { P6ROUNDS(s); } + +#endif + +#if !ASCON_INLINE_PERM && !ASCON_UNROLL_LOOPS /* out-of-line looped permutation */ + +void P(ascon_state_t* s, int nr) { PROUNDS(s, nr); } + +#endif diff --git a/src/ascon-xofa/permutations.h b/src/ascon-xofa/permutations.h new file mode 100644 index
0000000..cc1b4af --- /dev/null +++ b/src/ascon-xofa/permutations.h @@ -0,0 +1,78 @@ +#ifndef PERMUTATIONS_H_ +#define PERMUTATIONS_H_ + +#include <stdint.h> + +#include "api.h" +#include "ascon.h" +#include "config.h" +#include "constants.h" +#include "printstate.h" +#include "round.h" + +forceinline void P12ROUNDS(ascon_state_t* s) { /* all twelve rounds, RC0 through RCb */ + ROUND(s, RC0); + ROUND(s, RC1); + ROUND(s, RC2); + ROUND(s, RC3); + ROUND(s, RC4); + ROUND(s, RC5); + ROUND(s, RC6); + ROUND(s, RC7); + ROUND(s, RC8); + ROUND(s, RC9); + ROUND(s, RCa); + ROUND(s, RCb); +} + +forceinline void P8ROUNDS(ascon_state_t* s) { /* the last eight rounds, RC4 through RCb */ + ROUND(s, RC4); + ROUND(s, RC5); + ROUND(s, RC6); + ROUND(s, RC7); + ROUND(s, RC8); + ROUND(s, RC9); + ROUND(s, RCa); + ROUND(s, RCb); +} + +forceinline void P6ROUNDS(ascon_state_t* s) { /* the last six rounds, RC6 through RCb */ + ROUND(s, RC6); + ROUND(s, RC7); + ROUND(s, RC8); + ROUND(s, RC9); + ROUND(s, RCa); + ROUND(s, RCb); +} + +#if ASCON_INLINE_PERM && ASCON_UNROLL_LOOPS + +forceinline void P(ascon_state_t* s, int nr) { /* dispatch on the round count; any nr other than 12, 8 or 6 does nothing */ + if (nr == 12) P12ROUNDS(s); + if (nr == 8) P8ROUNDS(s); + if (nr == 6) P6ROUNDS(s); +} + +#elif !ASCON_INLINE_PERM && ASCON_UNROLL_LOOPS + +void P12(ascon_state_t* s); +void P8(ascon_state_t* s); +void P6(ascon_state_t* s); + +forceinline void P(ascon_state_t* s, int nr) { /* same dispatch, through the out-of-line wrappers; other nr values do nothing */ + if (nr == 12) P12(s); + if (nr == 8) P8(s); + if (nr == 6) P6(s); +} + +#elif ASCON_INLINE_PERM && !ASCON_UNROLL_LOOPS + +forceinline void P(ascon_state_t* s, int nr) { PROUNDS(s, nr); } + +#else /* !ASCON_INLINE_PERM && !ASCON_UNROLL_LOOPS */ + +void P(ascon_state_t* s, int nr); + +#endif + +#endif /* PERMUTATIONS_H_ */ diff --git a/src/ascon-xofa/printstate.c b/src/ascon-xofa/printstate.c new file mode 100644 index 0000000..a99cbb1 --- /dev/null +++ b/src/ascon-xofa/printstate.c @@ -0,0 +1,41 @@ +#ifdef ASCON_PRINT_STATE + +#include "printstate.h" + +#include <inttypes.h> +#include <stdio.h> +#include <string.h> + +#ifndef WORDTOU64 +#define WORDTOU64 /* fallback: expands to nothing, so WORDTOU64(x) is just (x) */ +#endif + +#ifndef U64BIG +#define U64BIG /* fallback: expands to nothing, so U64BIG(x) is just (x) */ +#endif + +void printword(const char* text, const uint64_t x) { +
printf("%s=%016" PRIx64, text, U64BIG(WORDTOU64(x))); +} + +void printstate(const char* text, const ascon_state_t* s) { + int i; + printf("%s:", text); + for (i = strlen(text); i < 17; ++i) printf(" "); /* pad the label out to 17 columns */ + printword(" x0", s->x[0]); + printword(" x1", s->x[1]); + printword(" x2", s->x[2]); + printword(" x3", s->x[3]); + printword(" x4", s->x[4]); +#ifdef ASCON_PRINT_BI + printf(" "); + printf(" x0=%08x_%08x", s->w[0][1], s->w[0][0]); + printf(" x1=%08x_%08x", s->w[1][1], s->w[1][0]); + printf(" x2=%08x_%08x", s->w[2][1], s->w[2][0]); + printf(" x3=%08x_%08x", s->w[3][1], s->w[3][0]); + printf(" x4=%08x_%08x", s->w[4][1], s->w[4][0]); +#endif + printf("\n"); +} + +#endif diff --git a/src/ascon-xofa/printstate.h b/src/ascon-xofa/printstate.h new file mode 100644 index 0000000..40b1f9c --- /dev/null +++ b/src/ascon-xofa/printstate.h @@ -0,0 +1,24 @@ +#ifndef PRINTSTATE_H_ +#define PRINTSTATE_H_ + +#ifdef ASCON_PRINT_STATE + +#include "ascon.h" +#include "word.h" + +void printword(const char* text, const uint64_t x); +void printstate(const char* text, const ascon_state_t* s); + +#else /* tracing disabled: both calls compile to empty statements */ + +#define printword(text, w) \ + do { \ + } while (0) + +#define printstate(text, s) \ + do { \ + } while (0) + +#endif + +#endif /* PRINTSTATE_H_ */ diff --git a/src/ascon-xofa/round.h b/src/ascon-xofa/round.h new file mode 100644 index 0000000..dcbaf33 --- /dev/null +++ b/src/ascon-xofa/round.h @@ -0,0 +1,350 @@ +#ifndef ROUND_H_ +#define ROUND_H_ + +#include "ascon.h" +#include "constants.h" +#include "forceinline.h" +#include "printstate.h" +#include "word.h" + +forceinline void ROUND_LOOP(ascon_state_t* s, uint32_t C) { + uint32_t tmp0, tmp1; /* C is the current round constant: the loop subtracts 15 after each round and exits when it reaches 60 (cf. INC and END in constants.h) */ + __asm__ __volatile__( + "@.syntax_unified\n\t" + "rbegin_%=:;\n\t" + "eor %[x2_l], %[x2_l], %[tmp1]\n\t" + "push {%[tmp1]}\n\t" /* park the round constant on the stack; the register is reused as scratch until the pop near the end of the round */ + "eor %[x0_l], %[x0_l], %[x4_l]\n\t" + "eor %[x4_l], %[x4_l], %[x3_l]\n\t" + "eor %[x2_l], %[x2_l], %[x1_l]\n\t" + "mvn %[tmp0], %[x0_l]\n\t" + "orr %[tmp0], %[tmp0], %[x4_l]\n\t" + "movs %[tmp1], %[x2_l]\n\t" + "bic
%[tmp1], %[tmp1], %[x1_l]\n\t" + "eor %[x0_l], %[x0_l], %[tmp1]\n\t" + "mvn %[tmp1], %[x4_l]\n\t" + "orr %[tmp1], %[tmp1], %[x3_l]\n\t" + "eor %[x2_l], %[x2_l], %[tmp1]\n\t" + "movs %[tmp1], %[x1_l]\n\t" + "bic %[tmp1], %[tmp1], %[x0_l]\n\t" + "eor %[x4_l], %[x4_l], %[tmp1]\n\t" + "movs %[tmp1], %[x3_l]\n\t" + "and %[tmp1], %[tmp1], %[x2_l]\n\t" + "eor %[tmp1], %[x1_l], %[tmp1]\n\t" + "eor %[tmp0], %[x3_l], %[tmp0]\n\t" + "eor %[tmp0], %[tmp0], %[x2_l]\n\t" + "eor %[tmp1], %[tmp1], %[x0_l]\n\t" + "eor %[x0_l], %[x0_l], %[x4_l]\n\t" + "movs %[x1_l], %[x0_h]\n\t" + "movs %[x3_l], %[x1_h]\n\t" + "movs %[x0_h], %[x2_l]\n\t" + "movs %[x1_h], %[x0_l]\n\t" + "movs %[x0_l], %[x2_h]\n\t" + "movs %[x2_l], %[x3_h]\n\t" + "movs %[tmp2], %[x4_h]\n\t" + "movs %[x2_h], %[tmp0]\n\t" + "movs %[x3_h], %[x4_l]\n\t" + "eor %[x1_l], %[x1_l], %[tmp2]\n\t" + "eor %[tmp2], %[tmp2], %[x2_l]\n\t" + "eor %[x0_l], %[x0_l], %[x3_l]\n\t" + "mvn %[tmp0], %[x1_l]\n\t" + "orr %[tmp0], %[tmp0], %[tmp2]\n\t" + "movs %[x4_l], %[x0_l]\n\t" + "bic %[x4_l], %[x4_l], %[x3_l]\n\t" + "eor %[x1_l], %[x1_l], %[x4_l]\n\t" + "mvn %[x4_l], %[tmp2]\n\t" + "orr %[x4_l], %[x4_l], %[x2_l]\n\t" + "eor %[x0_l], %[x0_l], %[x4_l]\n\t" + "movs %[x4_l], %[x3_l]\n\t" + "bic %[x4_l], %[x4_l], %[x1_l]\n\t" + "eor %[tmp2], %[tmp2], %[x4_l]\n\t" + "movs %[x4_l], %[x2_l]\n\t" + "and %[x4_l], %[x4_l], %[x0_l]\n\t" + "eor %[x3_l], %[x3_l], %[x4_l]\n\t" + "eor %[x2_l], %[x2_l], %[tmp0]\n\t" + "eor %[x2_l], %[x2_l], %[x0_l]\n\t" + "eor %[x3_l], %[x3_l], %[x1_l]\n\t" + "eor %[x1_l], %[x1_l], %[tmp2]\n\t" + "movs %[x4_h], %[x2_l]\n\t" + "movs %[x2_l], %[x0_h]\n\t" + "movs %[x0_h], %[x1_l]\n\t" /* from here on: lsr/lsl pairs whose shift amounts sum to 32; presumably 64-bit rotations assembled from the 32-bit halves (TODO confirm against the Ascon linear layer) */ + "lsr %[x4_l], %[x0_l], #6\n\t" + "lsl %[x1_l], %[x2_l], #26\n\t" + "lsr %[tmp0], %[x2_l], #6\n\t" + "eor %[x4_l], %[x4_l], %[x1_l]\n\t" + "lsl %[x1_l], %[x0_l], #26\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "lsr %[x1_l], %[x0_l], #1\n\t" + "eor %[x4_l], %[x4_l], %[x1_l]\n\t" + "lsl %[x1_l], %[x2_l], #31\n\t" + "eor %[x4_l], %[x4_l],
%[x1_l]\n\t" + "lsr %[x1_l], %[x2_l], #1\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "lsl %[x1_l], %[x0_l], #31\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "eor %[x0_l], %[x0_l], %[x4_l]\n\t" + "eor %[x2_l], %[x2_l], %[tmp0]\n\t" + "lsl %[x4_l], %[x3_l], #3\n\t" + "lsr %[x1_l], %[tmp1], #29\n\t" + "lsl %[tmp0], %[tmp1], #3\n\t" + "eor %[x4_l], %[x4_l], %[x1_l]\n\t" + "lsr %[x1_l], %[x3_l], #29\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "lsl %[x1_l], %[x3_l], #25\n\t" + "eor %[x4_l], %[x4_l], %[x1_l]\n\t" + "lsr %[x1_l], %[tmp1], #7\n\t" + "eor %[x4_l], %[x4_l], %[x1_l]\n\t" + "lsl %[x1_l], %[tmp1], #25\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "lsr %[x1_l], %[x3_l], #7\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "eor %[x3_l], %[x3_l], %[x4_l]\n\t" + "eor %[tmp1], %[tmp1], %[tmp0]\n\t" + "movs %[x4_l], %[x3_h]\n\t" + "movs %[x3_h], %[tmp1]\n\t" + "lsl %[tmp1], %[tmp2], #23\n\t" + "lsr %[x1_l], %[x4_l], #9\n\t" + "lsl %[tmp0], %[x4_l], #23\n\t" + "eor %[tmp1], %[tmp1], %[x1_l]\n\t" + "lsr %[x1_l], %[tmp2], #9\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "lsr %[x1_l], %[tmp2], #7\n\t" + "eor %[tmp1], %[tmp1], %[x1_l]\n\t" + "lsl %[x1_l], %[x4_l], #25\n\t" + "eor %[tmp1], %[tmp1], %[x1_l]\n\t" + "lsr %[x1_l], %[x4_l], #7\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "lsl %[x1_l], %[tmp2], #25\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "eor %[tmp2], %[tmp2], %[tmp1]\n\t" + "eor %[x4_l], %[x4_l], %[tmp0]\n\t" + "movs %[x1_l], %[x3_h]\n\t" + "movs %[tmp1], %[x4_h]\n\t" + "movs %[x4_h], %[tmp2]\n\t" + "movs %[x3_h], %[x3_l]\n\t" + "movs %[x3_l], %[x2_h]\n\t" + "movs %[x2_h], %[x0_l]\n\t" + "lsr %[tmp2], %[tmp1], #17\n\t" + "lsl %[x0_l], %[x3_l], #15\n\t" + "lsr %[tmp0], %[x3_l], #17\n\t" + "eor %[tmp2], %[tmp2], %[x0_l]\n\t" + "lsl %[x0_l], %[tmp1], #15\n\t" + "eor %[tmp0], %[tmp0], %[x0_l]\n\t" + "lsr %[x0_l], %[tmp1], #10\n\t" + "eor %[tmp2], %[tmp2], %[x0_l]\n\t" + "lsl %[x0_l], %[x3_l], #22\n\t" + "eor %[tmp2], %[tmp2], %[x0_l]\n\t" + "lsr %[x0_l], %[x3_l],
#10\n\t" + "eor %[tmp0], %[tmp0], %[x0_l]\n\t" + "lsl %[x0_l], %[tmp1], #22\n\t" + "eor %[tmp0], %[tmp0], %[x0_l]\n\t" + "eor %[tmp1], %[tmp1], %[tmp2]\n\t" + "eor %[x3_l], %[x3_l], %[tmp0]\n\t" + "movs %[tmp0], %[x0_h]\n\t" + "movs %[x0_l], %[x1_h]\n\t" + "movs %[x0_h], %[x4_l]\n\t" + "movs %[x1_h], %[x3_h]\n\t" + "movs %[x3_h], %[tmp1]\n\t" + "lsr %[x4_l], %[tmp0], #28\n\t" + "lsl %[tmp1], %[x0_l], #4\n\t" + "lsr %[tmp2], %[x0_l], #28\n\t" + "eor %[x4_l], %[x4_l], %[tmp1]\n\t" + "lsl %[tmp1], %[tmp0], #4\n\t" + "eor %[tmp2], %[tmp2], %[tmp1]\n\t" + "lsr %[tmp1], %[tmp0], #19\n\t" + "eor %[x4_l], %[x4_l], %[tmp1]\n\t" + "lsl %[tmp1], %[x0_l], #13\n\t" + "eor %[x4_l], %[x4_l], %[tmp1]\n\t" + "lsr %[tmp1], %[x0_l], #19\n\t" + "eor %[tmp2], %[tmp2], %[tmp1]\n\t" + "lsl %[tmp1], %[tmp0], #13\n\t" + "eor %[tmp2], %[tmp2], %[tmp1]\n\t" + "pop {%[tmp1]}\n\t" /* bring the round constant back for the loop bookkeeping below */ + "eor %[tmp0], %[tmp0], %[x4_l]\n\t" + "eor %[x0_l], %[x0_l], %[tmp2]\n\t" + "movs %[x4_l], %[x0_h]\n\t" + "movs %[x0_h], %[tmp0]\n\t" + "sub %[tmp1], %[tmp1], #15\n\t" + "cmp %[tmp1], #60\n\t" + "beq rend_%=\n\t" + "b rbegin_%=\n\t" + "rend_%=:;\n\t" + : + [x0_l] "+l"(s->w[0][0]), [x0_h] "+h"(s->w[0][1]), [x1_l] "+l"(s->w[1][0]), + [x1_h] "+h"(s->w[1][1]), [x2_l] "+l"(s->w[2][0]), [x2_h] "+h"(s->w[2][1]), + [x3_l] "+l"(s->w[3][0]), [x3_h] "+h"(s->w[3][1]), [x4_l] "+l"(s->w[4][0]), + [x4_h] "+h"(s->w[4][1]), [tmp1] "+l"(C), [tmp0] "=l"(tmp0), + [tmp2] "=l"(tmp1) /* naming trap: asm operand tmp1 is the parameter C, asm operand tmp2 is the C local tmp1 */ + : + :); /* NOTE(review): the clobber list is empty although the body uses push/pop and flag-setting instructions; confirm no cc or memory clobber is needed with the toolchain in use */ +} + +forceinline void ROUND(ascon_state_t* s, uint32_t C) { + uint32_t tmp0, tmp1, tmp2; /* one round with round constant C; the body appears to mirror the ROUND_LOOP body without the loop bookkeeping */ + __asm__ __volatile__( + "@.syntax_unified\n\t" + "movs %[tmp0], %[C]\n\t" + "eor %[x2_l], %[x2_l], %[tmp0]\n\t" + "eor %[x0_l], %[x0_l], %[x4_l]\n\t" + "eor %[x4_l], %[x4_l], %[x3_l]\n\t" + "eor %[x2_l], %[x2_l], %[x1_l]\n\t" + "mvn %[tmp0], %[x0_l]\n\t" + "orr %[tmp0], %[tmp0], %[x4_l]\n\t" + "movs %[tmp1], %[x2_l]\n\t" + "bic %[tmp1], %[tmp1], %[x1_l]\n\t" + "eor %[x0_l], %[x0_l], %[tmp1]\n\t" + "mvn %[tmp1], %[x4_l]\n\t" + "orr %[tmp1], %[tmp1],
%[x3_l]\n\t" + "eor %[x2_l], %[x2_l], %[tmp1]\n\t" + "movs %[tmp1], %[x1_l]\n\t" + "bic %[tmp1], %[tmp1], %[x0_l]\n\t" + "eor %[x4_l], %[x4_l], %[tmp1]\n\t" + "movs %[tmp1], %[x3_l]\n\t" + "and %[tmp1], %[tmp1], %[x2_l]\n\t" + "eor %[tmp1], %[x1_l], %[tmp1]\n\t" + "eor %[tmp0], %[x3_l], %[tmp0]\n\t" + "eor %[tmp0], %[tmp0], %[x2_l]\n\t" + "eor %[tmp1], %[tmp1], %[x0_l]\n\t" + "eor %[x0_l], %[x0_l], %[x4_l]\n\t" + "movs %[x1_l], %[x0_h]\n\t" + "movs %[x3_l], %[x1_h]\n\t" + "movs %[x0_h], %[x2_l]\n\t" + "movs %[x1_h], %[x0_l]\n\t" + "movs %[x0_l], %[x2_h]\n\t" + "movs %[x2_l], %[x3_h]\n\t" + "movs %[tmp2], %[x4_h]\n\t" + "movs %[x2_h], %[tmp0]\n\t" + "movs %[x3_h], %[x4_l]\n\t" + "eor %[x1_l], %[x1_l], %[tmp2]\n\t" + "eor %[tmp2], %[tmp2], %[x2_l]\n\t" + "eor %[x0_l], %[x0_l], %[x3_l]\n\t" + "mvn %[tmp0], %[x1_l]\n\t" + "orr %[tmp0], %[tmp0], %[tmp2]\n\t" + "movs %[x4_l], %[x0_l]\n\t" + "bic %[x4_l], %[x4_l], %[x3_l]\n\t" + "eor %[x1_l], %[x1_l], %[x4_l]\n\t" + "mvn %[x4_l], %[tmp2]\n\t" + "orr %[x4_l], %[x4_l], %[x2_l]\n\t" + "eor %[x0_l], %[x0_l], %[x4_l]\n\t" + "movs %[x4_l], %[x3_l]\n\t" + "bic %[x4_l], %[x4_l], %[x1_l]\n\t" + "eor %[tmp2], %[tmp2], %[x4_l]\n\t" + "movs %[x4_l], %[x2_l]\n\t" + "and %[x4_l], %[x4_l], %[x0_l]\n\t" + "eor %[x3_l], %[x3_l], %[x4_l]\n\t" + "eor %[x2_l], %[x2_l], %[tmp0]\n\t" + "eor %[x2_l], %[x2_l], %[x0_l]\n\t" + "eor %[x3_l], %[x3_l], %[x1_l]\n\t" + "eor %[x1_l], %[x1_l], %[tmp2]\n\t" + "movs %[x4_h], %[x2_l]\n\t" + "movs %[x2_l], %[x0_h]\n\t" + "movs %[x0_h], %[x1_l]\n\t" + "lsr %[x4_l], %[x0_l], #6\n\t" + "lsl %[x1_l], %[x2_l], #26\n\t" + "lsr %[tmp0], %[x2_l], #6\n\t" + "eor %[x4_l], %[x4_l], %[x1_l]\n\t" + "lsl %[x1_l], %[x0_l], #26\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "lsr %[x1_l], %[x0_l], #1\n\t" + "eor %[x4_l], %[x4_l], %[x1_l]\n\t" + "lsl %[x1_l], %[x2_l], #31\n\t" + "eor %[x4_l], %[x4_l], %[x1_l]\n\t" + "lsr %[x1_l], %[x2_l], #1\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "lsl %[x1_l], %[x0_l], #31\n\t" + "eor
%[tmp0], %[tmp0], %[x1_l]\n\t" + "eor %[x0_l], %[x0_l], %[x4_l]\n\t" + "eor %[x2_l], %[x2_l], %[tmp0]\n\t" + "lsl %[x4_l], %[x3_l], #3\n\t" + "lsr %[x1_l], %[tmp1], #29\n\t" + "lsl %[tmp0], %[tmp1], #3\n\t" + "eor %[x4_l], %[x4_l], %[x1_l]\n\t" + "lsr %[x1_l], %[x3_l], #29\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "lsl %[x1_l], %[x3_l], #25\n\t" + "eor %[x4_l], %[x4_l], %[x1_l]\n\t" + "lsr %[x1_l], %[tmp1], #7\n\t" + "eor %[x4_l], %[x4_l], %[x1_l]\n\t" + "lsl %[x1_l], %[tmp1], #25\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "lsr %[x1_l], %[x3_l], #7\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "eor %[x3_l], %[x3_l], %[x4_l]\n\t" + "eor %[tmp1], %[tmp1], %[tmp0]\n\t" + "movs %[x4_l], %[x3_h]\n\t" + "movs %[x3_h], %[tmp1]\n\t" + "lsl %[tmp1], %[tmp2], #23\n\t" + "lsr %[x1_l], %[x4_l], #9\n\t" + "lsl %[tmp0], %[x4_l], #23\n\t" + "eor %[tmp1], %[tmp1], %[x1_l]\n\t" + "lsr %[x1_l], %[tmp2], #9\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "lsr %[x1_l], %[tmp2], #7\n\t" + "eor %[tmp1], %[tmp1], %[x1_l]\n\t" + "lsl %[x1_l], %[x4_l], #25\n\t" + "eor %[tmp1], %[tmp1], %[x1_l]\n\t" + "lsr %[x1_l], %[x4_l], #7\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "lsl %[x1_l], %[tmp2], #25\n\t" + "eor %[tmp0], %[tmp0], %[x1_l]\n\t" + "eor %[tmp2], %[tmp2], %[tmp1]\n\t" + "eor %[x4_l], %[x4_l], %[tmp0]\n\t" + "movs %[x1_l], %[x3_h]\n\t" + "movs %[tmp1], %[x4_h]\n\t" + "movs %[x4_h], %[tmp2]\n\t" + "movs %[x3_h], %[x3_l]\n\t" + "movs %[x3_l], %[x2_h]\n\t" + "movs %[x2_h], %[x0_l]\n\t" + "lsr %[tmp2], %[tmp1], #17\n\t" + "lsl %[x0_l], %[x3_l], #15\n\t" + "lsr %[tmp0], %[x3_l], #17\n\t" + "eor %[tmp2], %[tmp2], %[x0_l]\n\t" + "lsl %[x0_l], %[tmp1], #15\n\t" + "eor %[tmp0], %[tmp0], %[x0_l]\n\t" + "lsr %[x0_l], %[tmp1], #10\n\t" + "eor %[tmp2], %[tmp2], %[x0_l]\n\t" + "lsl %[x0_l], %[x3_l], #22\n\t" + "eor %[tmp2], %[tmp2], %[x0_l]\n\t" + "lsr %[x0_l], %[x3_l], #10\n\t" + "eor %[tmp0], %[tmp0], %[x0_l]\n\t" + "lsl %[x0_l], %[tmp1], #22\n\t" + "eor %[tmp0], %[tmp0], %[x0_l]\n\t" + "eor
%[tmp1], %[tmp1], %[tmp2]\n\t" + "eor %[x3_l], %[x3_l], %[tmp0]\n\t" + "movs %[tmp0], %[x0_h]\n\t" + "movs %[x0_l], %[x1_h]\n\t" + "movs %[x0_h], %[x4_l]\n\t" + "movs %[x1_h], %[x3_h]\n\t" + "movs %[x3_h], %[tmp1]\n\t" + "lsr %[x4_l], %[tmp0], #28\n\t" + "lsl %[tmp1], %[x0_l], #4\n\t" + "lsr %[tmp2], %[x0_l], #28\n\t" + "eor %[x4_l], %[x4_l], %[tmp1]\n\t" + "lsl %[tmp1], %[tmp0], #4\n\t" + "eor %[tmp2], %[tmp2], %[tmp1]\n\t" + "lsr %[tmp1], %[tmp0], #19\n\t" + "eor %[x4_l], %[x4_l], %[tmp1]\n\t" + "lsl %[tmp1], %[x0_l], #13\n\t" + "eor %[x4_l], %[x4_l], %[tmp1]\n\t" + "lsr %[tmp1], %[x0_l], #19\n\t" + "eor %[tmp2], %[tmp2], %[tmp1]\n\t" + "lsl %[tmp1], %[tmp0], #13\n\t" + "eor %[tmp2], %[tmp2], %[tmp1]\n\t" + "eor %[tmp0], %[tmp0], %[x4_l]\n\t" + "eor %[x0_l], %[x0_l], %[tmp2]\n\t" + "movs %[x4_l], %[x0_h]\n\t" + "movs %[x0_h], %[tmp0]\n\t" + : + [x0_l] "+l"(s->w[0][0]), [x0_h] "+h"(s->w[0][1]), [x1_l] "+l"(s->w[1][0]), + [x1_h] "+h"(s->w[1][1]), [x2_l] "+l"(s->w[2][0]), [x2_h] "+h"(s->w[2][1]), + [x3_l] "+l"(s->w[3][0]), [x3_h] "+h"(s->w[3][1]), [x4_l] "+l"(s->w[4][0]), + [x4_h] "+h"(s->w[4][1]), [tmp0] "=l"(tmp0), [tmp1] "=l"(tmp1), + [tmp2] "=l"(tmp2) + : [C] "ri"(C) + :); + printstate(" round output", s); +} + +forceinline void PROUNDS(ascon_state_t* s, int nr) { ROUND_LOOP(s, START(nr)); } /* looped variant: START(nr) supplies the first round constant of an nr-round run */ + +#endif /* ROUND_H_ */ diff --git a/src/ascon-xofa/word.h b/src/ascon-xofa/word.h new file mode 100644 index 0000000..e8949db --- /dev/null +++ b/src/ascon-xofa/word.h @@ -0,0 +1,69 @@ +#ifndef WORD_H_ +#define WORD_H_ + +#include <stdint.h> +#include <string.h> + +#include "bendian.h" +#include "forceinline.h" + +typedef union { + uint64_t x; + uint32_t w[2]; + uint8_t b[8]; +} word_t; + +#define U64TOWORD(x) U64BIG(x) +#define WORDTOU64(x) U64BIG(x) + +forceinline uint64_t ROR(uint64_t x, int n) { return x >> n | x << (-n & 63); } /* rotate right by n bits; only valid for 0 <= n <= 63 */ + +forceinline uint64_t KEYROT(uint64_t lo2hi, uint64_t hi2lo) { + return lo2hi << 32 | hi2lo >> 32; +} + +forceinline int NOTZERO(uint64_t a, uint64_t b) { +
uint64_t result = a | b; + result |= result >> 32; + result |= result >> 16; + result |= result >> 8; + return ((((int)(result & 0xff) - 1) >> 8) & 1) - 1; +} + +forceinline uint64_t PAD(int i) { return 0x80ull << (56 - 8 * i); } + +forceinline uint64_t PRFS_MLEN(uint64_t len) { return len << 51; } + +forceinline uint64_t CLEAR(uint64_t w, int n) { + /* undefined for n == 0 */ + uint64_t mask = ~0ull >> (8 * n); + return w & mask; +} + +forceinline uint64_t MASK(int n) { + /* undefined for n == 0 */ + return ~0ull >> (64 - 8 * n); +} + +forceinline uint64_t LOAD(const uint8_t* bytes, int n) { + uint64_t x = *(uint64_t*)bytes & MASK(n); + return U64TOWORD(x); +} + +forceinline void STORE(uint8_t* bytes, uint64_t w, int n) { + *(uint64_t*)bytes &= ~MASK(n); + *(uint64_t*)bytes |= WORDTOU64(w); +} + +forceinline uint64_t LOADBYTES(const uint8_t* bytes, int n) { + uint64_t x = 0; + memcpy(&x, bytes, n); + return U64TOWORD(x); +} + +forceinline void STOREBYTES(uint8_t* bytes, uint64_t w, int n) { + uint64_t x = WORDTOU64(w); + memcpy(bytes, &x, n); +} + +#endif /* WORD_H_ */ diff --git a/src/main.c b/src/main.c index b660510..bd1fbc8 100644 --- a/src/main.c +++ b/src/main.c @@ -11,11 +11,13 @@ #include "rorand.h" +#include "rourand.h" #include "util.h" #define DO_TIME_BENCH 0 #define DATA_TOTAL 1024*1024 +#define USE_URANDOM 1 #if DO_TIME_BENCH static uint8_t time_bench[128*1024]; @@ -50,13 +52,23 @@ int main() { iprintf("rorand_init() returned %d\n", d); panic("can't init rorand"); } + struct rourand_state* ur = rourand_init(rorand_get, 0); + if (!ur) { + panic("Can't init rourand"); + } + +#if USE_URANDOM +#define rand_get(dst, size) rourand_get(ur, dst, size) +#else +#define rand_get(dst, size) rorand_get(dst, (size)*CHAR_BIT) +#endif #if DO_TIME_BENCH memset(time_bench, 0, sizeof(time_bench)); iprintf("[---] throughput benchmark start\n"); absolute_time_t ta = get_absolute_time(); - rorand_get(time_bench, count_of(time_bench)*CHAR_BIT); + rand_get(time_bench, 
count_of(time_bench)); absolute_time_t tb = get_absolute_time(); int64_t dt_us = absolute_time_diff_us(ta, tb); @@ -72,10 +84,11 @@ int main() { memset(data, 0, sizeof(data)); const uintptr_t total = DATA_TOTAL; for (uintptr_t off = 0; off < total; off += count_of(data)) { - rorand_get(data, count_of(data)*CHAR_BIT); + rand_get(data, count_of(data)); hexdump(NULL, off, data, sizeof(data)); } #endif + rourand_free(ur); iprintf("done\n"); while(1); diff --git a/src/rourand.c b/src/rourand.c new file mode 100644 index 0000000..e72099e --- /dev/null +++ b/src/rourand.c @@ -0,0 +1,71 @@ + +#include +#include +#include +#include + +#include "ascon.h" +#include "rourand.h" + + +struct rourand_state { + ascon_state_t ascon; + uint8_t buf[16]; + rourand_get_fn rand; + size_t bpos; + size_t rate; + size_t rcount; +}; + + +struct rourand_state* rourand_init(rourand_get_fn rawrand, int rate) { + struct rourand_state* r = calloc(1, sizeof(struct rourand_state)); + if (!r) return NULL; + + if (rate == 0) rate = 16; + + ascon_inithash(&r->ascon); + r->rand = rawrand; + r->bpos = 0; + r->rate = rate; + r->rcount = 0; + + rawrand(r->buf, sizeof(r->buf)*CHAR_BIT); + ascon_absorb(&r->ascon, r->buf, sizeof(r->buf)); + rawrand(r->buf, sizeof(r->buf)*CHAR_BIT); + ascon_absorb(&r->ascon, r->buf, sizeof(r->buf)); + + return r; +} +void rourand_free(struct rourand_state* st) { + if (st) { + explicit_bzero(st, sizeof(*st)); + free(st); + } +} + +void rourand_get(struct rourand_state* st, void* dst_, size_t nbytes) { + uint8_t* dst = (uint8_t*)dst_; + + while (nbytes > 0) { + if (st->bpos == 0) { + ++st->rcount; + if (st->rcount == st->rate) { + st->rand(st->buf, sizeof(st->buf)*CHAR_BIT); + ascon_absorb(&st->ascon, st->buf, sizeof(st->buf)); + st->rcount = 0; + } + ascon_squeeze(&st->ascon, st->buf, sizeof(st->buf)); + st->bpos = sizeof(st->buf); + } + + size_t todo = nbytes; + if (todo > st->bpos) todo = st->bpos; + + memcpy(dst, &st->buf[sizeof(st->buf) - st->bpos], todo); + nbytes -= 
todo; + st->bpos -= todo; + dst += todo; + } +} + diff --git a/src/rourand.h b/src/rourand.h new file mode 100644 index 0000000..c9ac6bd --- /dev/null +++ b/src/rourand.h @@ -0,0 +1,24 @@ + +#ifndef ROURAND_H_ +#define ROURAND_H_ + +#include + + +struct rourand_state; + +typedef void (*rourand_get_fn)(void* dst, size_t nbits); + +struct rourand_state* rourand_init(rourand_get_fn rawrand, int rate); +void rourand_free(struct rourand_state* st); + +void rourand_get(struct rourand_state* st, void* dst, size_t nbytes); + +static inline uint32_t rourand_get32(struct rourand_state* st) { + uint32_t r = 0; + rourand_get(st, &r, sizeof(r)); + return r; +} + +#endif +