diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6737c79..be17645 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -18,12 +18,17 @@ pico_enable_stdio_uart(${PROJECT} 0)
 pico_enable_stdio_usb(${PROJECT} 1)
 
 target_sources(${PROJECT} PUBLIC
+	${CMAKE_CURRENT_SOURCE_DIR}/src/ascon-xofa/hash.c
+	${CMAKE_CURRENT_SOURCE_DIR}/src/ascon-xofa/permutations.c
+
 	${CMAKE_CURRENT_SOURCE_DIR}/src/util.c
 	${CMAKE_CURRENT_SOURCE_DIR}/src/rorand.c
+	${CMAKE_CURRENT_SOURCE_DIR}/src/rourand.c
 	${CMAKE_CURRENT_SOURCE_DIR}/src/main.c
 )
 
 target_include_directories(${PROJECT} PUBLIC
+	${CMAKE_CURRENT_SOURCE_DIR}/src/ascon-xofa/
 	${CMAKE_CURRENT_SOURCE_DIR}/src/
 )
 
diff --git a/src/ascon-xofa/LICENSE b/src/ascon-xofa/LICENSE
new file mode 100644
index 0000000..3bbbc1e
--- /dev/null
+++ b/src/ascon-xofa/LICENSE
@@ -0,0 +1,116 @@
+CC0 1.0 Universal
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator and
+subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for the
+purpose of contributing to a commons of creative, cultural and scientific
+works ("Commons") that the public can reliably and without fear of later
+claims of infringement build upon, modify, incorporate in other works, reuse
+and redistribute as freely as possible in any form whatsoever and for any
+purposes, including without limitation commercial purposes. These owners may
+contribute to the Commons to promote the ideal of a free culture and the
+further production of creative, cultural and scientific works, or to gain
+reputation or greater distribution for their Work in part through the use and
+efforts of others.
+
+For these and/or other purposes and motivations, and without any expectation
+of additional consideration or compensation, the person associating CC0 with a
+Work (the "Affirmer"), to the extent that he or she is an owner of Copyright
+and Related Rights in the Work, voluntarily elects to apply CC0 to the Work
+and publicly distribute the Work under its terms, with knowledge of his or her
+Copyright and Related Rights in the Work and the meaning and intended legal
+effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not limited
+to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display, communicate,
+  and translate a Work;
+
+  ii. moral rights retained by the original author(s) and/or performer(s);
+
+  iii. publicity and privacy rights pertaining to a person's image or likeness
+  depicted in a Work;
+
+  iv. rights protecting against unfair competition in regards to a Work,
+  subject to the limitations in paragraph 4(a), below;
+
+  v. rights protecting the extraction, dissemination, use and reuse of data in
+  a Work;
+
+  vi. database rights (such as those arising under Directive 96/9/EC of the
+  European Parliament and of the Council of 11 March 1996 on the legal
+  protection of databases, and under any national implementation thereof,
+  including any amended or successor version of such directive); and
+
+  vii. other similar, equivalent or corresponding rights throughout the world
+  based on applicable law or treaty, and any national implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention of,
+applicable law, Affirmer hereby overtly, fully, permanently, irrevocably and
+unconditionally waives, abandons, and surrenders all of Affirmer's Copyright
+and Related Rights and associated claims and causes of action, whether now
+known or unknown (including existing as well as future claims and causes of
+action), in the Work (i) in all territories worldwide, (ii) for the maximum
+duration provided by applicable law or treaty (including future time
+extensions), (iii) in any current or future medium and for any number of
+copies, and (iv) for any purpose whatsoever, including without limitation
+commercial, advertising or promotional purposes (the "Waiver"). Affirmer makes
+the Waiver for the benefit of each member of the public at large and to the
+detriment of Affirmer's heirs and successors, fully intending that such Waiver
+shall not be subject to revocation, rescission, cancellation, termination, or
+any other legal or equitable action to disrupt the quiet enjoyment of the Work
+by the public as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason be
+judged legally invalid or ineffective under applicable law, then the Waiver
+shall be preserved to the maximum extent permitted taking into account
+Affirmer's express Statement of Purpose. In addition, to the extent the Waiver
+is so judged Affirmer hereby grants to each affected person a royalty-free,
+non transferable, non sublicensable, non exclusive, irrevocable and
+unconditional license to exercise Affirmer's Copyright and Related Rights in
+the Work (i) in all territories worldwide, (ii) for the maximum duration
+provided by applicable law or treaty (including future time extensions), (iii)
+in any current or future medium and for any number of copies, and (iv) for any
+purpose whatsoever, including without limitation commercial, advertising or
+promotional purposes (the "License"). The License shall be deemed effective as
+of the date CC0 was applied by Affirmer to the Work. Should any part of the
+License for any reason be judged legally invalid or ineffective under
+applicable law, such partial invalidity or ineffectiveness shall not
+invalidate the remainder of the License, and in such case Affirmer hereby
+affirms that he or she will not (i) exercise any of his or her remaining
+Copyright and Related Rights in the Work or (ii) assert any associated claims
+and causes of action with respect to the Work, in either case contrary to
+Affirmer's express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+  a. No trademark or patent rights held by Affirmer are waived, abandoned,
+  surrendered, licensed or otherwise affected by this document.
+
+  b. Affirmer offers the Work as-is and makes no representations or warranties
+  of any kind concerning the Work, express, implied, statutory or otherwise,
+  including without limitation warranties of title, merchantability, fitness
+  for a particular purpose, non infringement, or the absence of latent or
+  other defects, accuracy, or the present or absence of errors, whether or not
+  discoverable, all to the greatest extent permissible under applicable law.
+
+  c. Affirmer disclaims responsibility for clearing rights of other persons
+  that may apply to the Work or any use thereof, including without limitation
+  any person's Copyright and Related Rights in the Work. Further, Affirmer
+  disclaims responsibility for obtaining any necessary consents, permissions
+  or other rights required for any use of the Work.
+
+  d. Affirmer understands and acknowledges that Creative Commons is not a
+  party to this document and has no duty or obligation with respect to this
+  CC0 or use of the Work.
+
+For more information, please see
+<http://creativecommons.org/publicdomain/zero/1.0/>
\ No newline at end of file
diff --git a/src/ascon-xofa/README.md b/src/ascon-xofa/README.md
new file mode 100644
index 0000000..125080e
--- /dev/null
+++ b/src/ascon-xofa/README.md
@@ -0,0 +1,519 @@
+# Reference, highly optimized, masked C and ASM implementations of Ascon
+
+Ascon is a family of lightweight cryptographic algorithms and consists of:
+- Authenticated encryption schemes with associated data (AEAD)
+- Hash functions (HASH) and extendible output functions (XOF)
+- Pseudo-random functions (PRF) and message authentication codes (MAC)
+
+All implementations use the "ECRYPT Benchmarking of Cryptographic Systems (eBACS)" interface:
+
+- https://bench.cr.yp.to/call-aead.html for AEAD (Ascon-128, Ascon-128a, Ascon-80pq)
+- https://bench.cr.yp.to/call-hash.html for HASH and XOF (Ascon-Hash, Ascon-Hasha, Ascon-Xof, Ascon-Xofa)
+- https://nacl.cr.yp.to/auth.html for PRF and MAC (Ascon-Mac, Ascon-Prf, Ascon-PrfShort)
+
+For more information on Ascon visit: https://ascon.iaik.tugraz.at/
+
+
+## TL;DR
+
+If you do not know where to start, use the reference implementations (self-contained, portable, very fast):
+
+- `crypto_aead/ascon128v12/ref`
+- `crypto_aead/ascon128av12/ref`
+- `crypto_hash/asconxofv12/ref`
+- `crypto_hash/asconxofav12/ref`
+
+
+## Algorithms
+
+This repository contains implementations of the following 10 Ascon v1.2 algorithms:
+
+- `crypto_aead/ascon128v12`: Ascon-128
+- `crypto_aead/ascon128av12`: Ascon-128a
+- `crypto_aead/ascon80pqv12`: Ascon-80pq
+- `crypto_hash/asconhashv12`: Ascon-Hash
+- `crypto_hash/asconhashav12`: Ascon-Hasha
+- `crypto_hash/asconxofv12`: Ascon-Xof
+- `crypto_hash/asconxofav12`: Ascon-Xofa
+- `crypto_auth/asconmacv12`: Ascon-Mac
+- `crypto_auth/asconprfv12`: Ascon-Prf
+- `crypto_auth/asconprfsv12`: Ascon-PrfShort
+
+We also provide two combined algorithm implementations supporting both AEAD and
+hashing:
+
+- `crypto_aead_hash/asconv12`: Ascon-128 combined with Ascon-Hash
+- `crypto_aead_hash/asconav12`: Ascon-128a combined with Ascon-Hasha
+
+The following algorithms demonstrate the performance improvement of Ascon on
+32-bit platforms without bit interleaving overhead. Bit interleaving could be
+performed externally on the host side or using a dedicated instruction (e.g.
+using the ARM Custom Datapath Extension). Note that a similar performance
+improvement could be achieved using funnel shift instructions (available on some
+32-bit RISC-V extensions).
+
+- `crypto_aead/ascon128bi32v12`: Ascon-128 (+17% on ARM1176JZF-S)
+- `crypto_aead/ascon128abi32v12`: Ascon-128a (+23% on ARM1176JZF-S)
+- `crypto_hash/asconhashbi32v12`: Ascon-Hash (+5% on ARM1176JZF-S)
+- `crypto_hash/asconhashabi32v12`: Ascon-Hasha (+8% on ARM1176JZF-S)
+- `crypto_aead_hash/asconbi32v12`: Ascon-128 combined with Ascon-Hash
+- `crypto_aead_hash/asconabi32v12`: Ascon-128a combined with Ascon-Hasha
+
+
+## Implementations
+
+For most algorithms, we provide the following pure C implementations:
+
+- `ref`: reference implementation
+- `opt64`: 64-bit speed-optimized
+- `opt32`: 32-bit speed-optimized
+- `opt64_lowsize`: 64-bit size-optimized
+- `opt32_lowsize`: 32-bit size-optimized
+- `bi32`: 32-bit speed-optimized bit-interleaved
+- `bi32_lowreg`: 32-bit speed-optimized bit-interleaved (low register usage)
+- `bi32_lowsize`: 32-bit size-optimized bit-interleaved
+- `esp32`: 32-bit ESP32 optimized
+- `opt8`: 8-bit size- and speed-optimized
+- `bi8`: 8-bit optimized bit-interleaved
+
+the following C with inline or partial ASM implementations:
+
+- `avx512`: 320-bit speed-optimized AVX512
+- `neon`: 64-bit speed-optimized ARM NEON
+- `armv6`: 32-bit speed-optimized ARMv6
+- `armv6m`: 32-bit speed-optimized ARMv6-M
+- `armv7m`: 32-bit speed-optimized ARMv7-M
+- `armv6_lowsize`: 32-bit size-optimized ARMv6
+- `armv6m_lowsize`: 32-bit size-optimized ARMv6-M
+- `armv7m_lowsize`: 32-bit size-optimized ARMv7-M
+- `armv7m_small`: 32-bit small speed-optimized ARMv7-M
+- `bi32_armv6`: 32-bit speed-optimized bit-interleaved ARMv6
+- `bi32_armv6m`: 32-bit speed-optimized bit-interleaved ARMv6-M
+- `bi32_armv7m`: 32-bit speed-optimized bit-interleaved ARMv7-M
+- `bi32_armv7m_small`: 32-bit small bit-interleaved ARMv7-M
+- `avr`: 8-bit size- and speed-optimized AVR
+- `avr_lowsize`: 8-bit size-optimized AVR
+
+the following ASM implementations:
+
+- `asm_esp32`: 32-bit optimized ESP32 using funnel-shift instructions
+- `asm_rv32i`: 32-bit optimized RV32I using the base instruction set
+- `asm_rv32b`: 32-bit optimized RV32B using bitmanip base (Zbb)
+- `asm_fsr_rv32b`: 32-bit optimized funnel-shift RV32B using bitmanip base and bitmanip ternary (ZbbZbt)
+- `asm_bi32_rv32b`: 32-bit optimized bit-interleaved RV32B using bitmanip base and bitmanip permutations (ZbbZbp)
+
+and the following high-level masked (shared) C with inline ASM implementations:
+
+- `protected_bi32_armv6`: 32-bit masked bit-interleaved ARMv6
+- `protected_bi32_armv6_leveled`: 32-bit masked and leveled bit-interleaved ARMv6
+
+The masked C implementations can be used as a starting point to generate
+device specific C/ASM implementations. Note that the masked C implementations
+require a minimum amount of ASM instructions. Otherwise, the compiler may
+heavily optimize the code and even combine shares. Obviously, the output
+generated is very sensitive to compiler and environment changes and any
+generated output needs to be security evaluated. A preliminary evaluation of
+these implementations has been performed on some
+[ChipWhisperer](https://www.newae.com/chipwhisperer) devices. The setup and
+preliminary results can be found at: https://github.com/ascon/simpleserial-ascon
+
+
+# Performance results on different CPUs in cycles per byte
+
+## Ascon-128a
+
+| Message Length in Bytes  |    1 |    8 |   16 |   32 |   64 | 1536 | long |
+|:-------------------------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
+| AMD EPYC 7742\*          |      |      |      |      |  7.4 |  4.4 |  4.2 |
+| AMD Ryzen 9 5950X\*      |      |      |      |      |  8.1 |  5.3 |  5.2 |
+| Apple M1 (ARMv8)\*       |      |      |      |      |  9.4 |  6.3 |  6.3 |
+| Cortex-A72 (ARMv8)\*     |      |      |      |      | 10.9 |  7.2 |  7.0 |
+| Intel Xeon E5-2609 v4\*  |      |      |      |      | 11.3 |  7.4 |  7.2 |
+| Intel Core i5-6300U      |  365 |   47 |   31 |   19 | 13.5 |  8.0 |  7.8 |
+| Intel Core i5-4200U      |  519 |   67 |   44 |   27 | 18.8 | 11.0 | 10.6 |
+| Cortex-A9 (ARMv7)\*      |      |      |      |      | 42.8 | 24.6 | 24.0 |
+| Cortex-A7 (NEON)         | 2204 |  226 |  132 |   82 | 55.9 | 31.7 | 30.7 |
+| Cortex-A7 (ARMv7)\*      |      |      |      |      | 55.5 | 38.2 | 37.5 |
+| ARM1176JZF-S (ARMv6)     | 1908 |  235 |  156 |   99 | 70.4 | 43.0 | 42.9 |
+
+
+## Ascon-128 and Ascon-80pq
+
+| Message Length in Bytes  |    1 |    8 |   16 |   32 |   64 | 1536 | long |
+|:-------------------------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
+| AMD EPYC 7742\*          |      |      |      |      |  8.1 |  6.6 |  6.5 |
+| AMD Ryzen 9 5950X\*      |      |      |      |      | 11.0 |  8.2 |  8.1 |
+| Apple M1 (ARMv8)\*       |      |      |      |      | 12.5 |  9.5 |  9.3 |
+| Cortex-A72 (ARMv8)\*     |      |      |      |      | 13.8 | 10.7 | 10.5 |
+| Intel Xeon E5-2609 v4\*  |      |      |      |      | 14.9 | 10.8 | 10.6 |
+| Intel Core i5-6300U      |  367 |   58 |   35 |   23 | 17.6 | 11.9 | 11.4 |
+| Intel Core i5-4200U      |  521 |   81 |   49 |   32 | 23.9 | 16.2 | 15.8 |
+| Cortex-A9 (ARMv7)\*      |      |      |      |      | 51.7 | 34.1 | 33.3 |
+| Cortex-A7 (NEON)         | 2182 |  249 |  148 |   97 | 71.7 | 47.5 | 46.5 |
+| Cortex-A7 (ARMv7)\*      |      |      |      |      | 69.6 | 52.0 | 51.6 |
+| ARM1176JZF-S (ARMv6)     | 1921 |  277 |  167 |  112 | 83.7 | 57.2 | 56.8 |
+
+
+## Ascon-Hasha and Ascon-Xofa
+
+| Message Length in Bytes  |    1 |    8 |   16 |   32 |    64 | 1536 | long |
+|:-------------------------|-----:|-----:|-----:|-----:|------:|-----:|-----:|
+| AMD EPYC 7742\*          |      |      |      |      |       |      |      |
+| AMD Ryzen 7 1700\*       |      |      |      |      |  22.0 | 12.1 | 11.7 |
+| Apple M1 (ARMv8)\*       |      |      |      |      |       |      |      |
+| Cortex-A72 (ARMv8)\*     |      |      |      |      |  22.2 | 14.5 | 14.2 |
+| Intel Xeon E5-2609 v4\*  |      |      |      |      |  23.3 | 14.4 | 14.0 |
+| Intel Core i5-6300U      |  550 |   83 |   49 |   33 |  23.7 | 15.6 | 15.5 |
+| Intel Core i5-4200U      |  749 |  112 |   67 |   44 |  31.8 | 20.8 | 20.7 |
+| Cortex-A9 (ARMv7)\*      |      |      |      |      |  87.5 | 45.6 | 44.0 |
+| Cortex-A7 (ARMv7)\*      |      |      |      |      | 102.3 | 63.5 | 61.8 |
+| ARM1176JZF-S (ARMv6)     | 2390 |  356 |  211 |  138 | 100.7 | 65.7 | 65.3 |
+
+
+## Ascon-Hash and Ascon-Xof
+
+| Message Length in Bytes  |    1 |    8 |   16 |   32 |    64 | 1536 | long |
+|:-------------------------|-----:|-----:|-----:|-----:|------:|-----:|-----:|
+| AMD EPYC 7742\*          |      |      |      |      |  21.1 | 13.3 | 12.4 |
+| AMD Ryzen 9 5950X\*      |      |      |      |      |  24.1 | 16.1 | 15.8 |
+| Apple M1 (ARMv8)\*       |      |      |      |      |  29.2 | 19.6 | 18.5 |
+| Cortex-A72 (ARMv8)\*     |      |      |      |      |  30.5 | 20.5 | 20.0 |
+| Intel Xeon E5-2609 v4\*  |      |      |      |      |  31.9 | 21.4 | 21.2 |
+| Intel Core i5-6300U      |  747 |  114 |   69 |   46 |  34.2 | 23.2 | 23.1 |
+| Intel Core i5-4200U      |  998 |  153 |   92 |   61 |  45.5 | 30.9 | 30.7 |
+| Cortex-A9 (ARMv7)\*      |      |      |      |      |  95.8 | 55.5 | 53.9 |
+| Cortex-A7 (ARMv7)\*      |      |      |      |      | 138.1 | 89.9 | 88.8 |
+| ARM1176JZF-S (ARMv6)     | 3051 |  462 |  277 |  184 | 137.3 | 92.6 | 92.2 |
+
+
+## Ascon-Mac and Ascon-Prf
+
+| Message Length in Bytes  |    1 |    8 |   16 |   32 |   64 | 1536 | long |
+|:-------------------------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
+| Intel Core i5-6300U      |  369 |   46 |   24 |   18 | 11.7 |  6.4 |  6.3 |
+| Intel Core i5-4200U      |  506 |   63 |   32 |   24 | 16.2 |  8.8 |  8.7 |
+| ARM1176JZF-S (ARMv6)     | 1769 |  223 |  117 |   85 | 57.5 | 31.9 | 31.6 |
+
+
+## Ascon-PrfShort
+
+| Message Length in Bytes  |    1 |    8 |   16 |   32 |   64 | 1536 | long |
+|:-------------------------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|
+| Intel Core i5-6300U      |  185 |   23 |   12 |    - |    - |    - |    - |
+| Intel Core i5-4200U      |  257 |   33 |   17 |    - |    - |    - |    - |
+| ARM1176JZF-S (ARMv6)     | 1057 |  132 |   69 |    - |    - |    - |    - |
+
+\* Results taken from eBACS: http://bench.cr.yp.to/
+
+
+# Build and test
+
+Build and test all Ascon C targets using release flags (-O2 -fomit-frame-pointer -march=native -mtune=native):
+
+```
+mkdir build && cd build
+cmake ..
+cmake --build .
+ctest
+```
+
+
+Build and test all Ascon C targets on Windows:
+
+```
+mkdir build && cd build
+cmake ..
+cmake --build . --config Release
+ctest -C Release
+```
+
+
+Build and test all Ascon C targets using debug flags (with NIST defined flags and sanitizers):
+
+```
+mkdir build && cd build
+cmake .. -DCMAKE_BUILD_TYPE=Debug
+cmake --build .
+ctest
+```
+
+Manually set the compiler and/or release flags (e.g. to disable -march=native -mtune=native).
+
+```
+mkdir build && cd build
+cmake .. -DCMAKE_C_COMPILER=clang -DREL_FLAGS="-O2;-fomit-frame-pointer"
+cmake --build .
+ctest
+```
+
+Build and run only specific algorithms, implementations and tests:
+
+```
+mkdir build && cd build
+cmake .. -DALG_LIST="ascon128;asconhash" -DIMPL_LIST="opt64;bi32" -DTEST_LIST="genkat"
+cmake --build .
+ctest
+```
+
+Note that cmake stores variables in a cache. Therefore, variables can be set
+one-by-one, unset using e.g. `cmake . -UIMPL_LIST` and shown using `cmake . -L`:
+
+```
+mkdir build && cd build
+cmake ..
+cmake . -DALG_LIST="ascon128;asconhash"
+cmake . -DIMPL_LIST="opt64;bi32"
+cmake . -DTEST_LIST="genkat"
+cmake . -L
+cmake --build .
+ctest
+```
+
+Cross compile and test with custom emulator using e.g. `qemu-arm`:
+
+```
+mkdir build && cd build
+cmake .. -DCMAKE_C_COMPILER="arm-linux-gnueabi-gcc" \
+         -DREL_FLAGS="-O2;-fomit-frame-pointer;-march=armv7;-mtune=cortex-m4" \
+         -DEMULATOR="qemu-arm;-L;/usr/arm-linux-gnueabi" \
+         -DALG_LIST="ascon128;ascon128a" -DIMPL_LIST="armv7m;bi32_armv7m"
+cmake --build .
+ctest
+```
+
+or using Intel SDE (use full path to `sde` or add to path variable):
+
+```
+mkdir build && cd build
+cmake .. -DCMAKE_C_COMPILER=gcc -DIMPL_LIST=avx512 -DEMULATOR="sde;--" \
+         -DREL_FLAGS="-O2;-fomit-frame-pointer;-march=icelake-client"
+cmake --build .
+ctest
+```
+
+
+# Build and benchmark:
+
+Build the getcycles test:
+
+```
+mkdir build && cd build
+cmake .. -DALG_LIST="ascon128;asconhash" -DIMPL_LIST="opt32;opt32_lowsize" -DTEST_LIST="getcycles"
+cmake --build .
+```
+
+Get the CPU cycle performance:
+
+```
+./getcycles_crypto_aead_ascon128v12_opt32
+./getcycles_crypto_aead_ascon128v12_opt32_lowsize
+./getcycles_crypto_hash_asconhashv12_opt32
+./getcycles_crypto_hash_asconhashv12_opt32_lowsize
+```
+
+Get the implementation size:
+
+```
+size -t libcrypto_aead_ascon128v12_opt32.a
+size -t libcrypto_aead_ascon128v12_opt32_lowsize.a
+size -t libcrypto_hash_asconhashv12_opt32.a
+size -t libcrypto_hash_asconhashv12_opt32_lowsize.a
+```
+
+
+# Manually build and run a single Ascon target:
+
+Build example for AEAD algorithms:
+
+```
+gcc -march=native -O3 -Icrypto_aead/ascon128v12/opt64 crypto_aead/ascon128v12/opt64/*.c -Itests tests/genkat_aead.c -o genkat
+gcc -march=native -O3 -Icrypto_aead/ascon128v12/opt64 crypto_aead/ascon128v12/opt64/*.c -DCRYPTO_AEAD -Itests tests/getcycles.c -o getcycles
+```
+
+Build example for HASH algorithms:
+
+```
+gcc -march=native -O3 -Icrypto_hash/asconhashv12/opt64 crypto_hash/asconhashv12/opt64/*.c -Itests tests/genkat_hash.c -o genkat
+gcc -march=native -O3 -Icrypto_hash/asconhashv12/opt64 crypto_hash/asconhashv12/opt64/*.c -DCRYPTO_HASH -Itests tests/getcycles.c -o getcycles
+```
+
+Generate KATs and get CPU cycles:
+
+```
+./genkat
+./getcycles
+```
+
+
+## Manually build and run an RV32 target:
+
+
+Setup:
+
+```
+sudo apt install gcc-riscv64-unknown-elf picolibc-riscv64-unknown-elf qemu-system-misc
+```
+
+Example to build, run and test an AEAD/HASH algorithm using `gcc`, `picolibc` and `qemu`:
+
+```
+riscv64-unknown-elf-gcc -O2 -march=rv32i -mabi=ilp32 --specs=picolibc.specs --oslib=semihost --crt0=hosted -Ttests/rv32.ld \
+    -Icrypto_aead/ascon128v12/asm_rv32i crypto_aead/ascon128v12/asm_rv32i/*.[cS] -Itests tests/genkat_aead.c -o genkat
+qemu-system-riscv32 -semihosting-config enable=on -monitor none -serial none -nographic -machine virt,accel=tcg -cpu rv32 -bios none -kernel genkat
+diff LWC_AEAD_KAT_128_128.txt crypto_aead/ascon128v12/LWC_AEAD_KAT_128_128.txt
+```
+
+```
+riscv64-unknown-elf-gcc -O2 -march=rv32i -mabi=ilp32 --specs=picolibc.specs --oslib=semihost --crt0=hosted -Ttests/rv32.ld \
+    -Icrypto_hash/asconhashv12/opt32 crypto_hash/asconhashv12/opt32/*.[cS] -Itests tests/genkat_hash.c -o genkat
+qemu-system-riscv32 -semihosting-config enable=on -monitor none -serial none -nographic -machine virt,accel=tcg -cpu rv32 -bios none -kernel genkat
+diff LWC_HASH_KAT_256.txt crypto_hash/asconhashv12/LWC_HASH_KAT_256.txt
+```
+
+
+## Manually build and run an AVR target:
+
+Example to build, run and test an AEAD algorithm using `avr-gcc`, `avr-libc` and `simavr`.
+
+Setup:
+
+```
+sudo apt install gcc-avr avr-libc simavr
+git clone https://github.com/JohannCahier/avr_uart.git
+```
+
+Single test vector using `demo` and performance measurement using `getcycles`:
+
+```
+avr-gcc -mmcu=atmega128 -std=c99 -Os -Icrypto_aead/ascon128v12/opt8 crypto_aead/ascon128v12/opt8/*.[cS] \
+    -DAVR_UART -Iavr_uart avr_uart/avr_uart.c -Wno-incompatible-pointer-types -Wno-cpp \
+    -DCRYPTO_AEAD -Itests tests/demo.c -o demo
+simavr -m atmega128 ./demo
+```
+```
+avr-gcc -mmcu=atmega128 -std=c99 -Os -Icrypto_aead/ascon128v12/opt8 crypto_aead/ascon128v12/opt8/*.[cS] \
+    -DAVR_UART -Iavr_uart avr_uart/avr_uart.c -Wno-incompatible-pointer-types -Wno-cpp \
+    -DCRYPTO_AEAD -Itests tests/getcycles.c -o getcycles
+simavr -t -m atmega128 ./getcycles
+```
+
+Generate all test vectors for AEAD/HASH and write result to a file. Press Ctrl-C to quit `simavr` after about a minute:
+
+```
+avr-gcc -mmcu=atmega128 -std=c99 -Os -Icrypto_aead/ascon128v12/opt8 crypto_aead/ascon128v12/opt8/*.[cS] \
+    -DAVR_UART -Iavr_uart avr_uart/avr_uart.c -Wno-incompatible-pointer-types -Wno-cpp \
+    -Itests tests/genkat_aead.c -o genkat_aead
+echo "Press Ctrl-C to quit simavr after about a minute"
+simavr -t -m atmega128 ./genkat_aead 2> LWC_AEAD_KAT_128_128.txt
+sed -i -e 's/\x1b\[[0-9;]*m//g' -e 's/\.\.$//' LWC_AEAD_KAT_128_128.txt
+diff LWC_AEAD_KAT_128_128.txt crypto_aead/ascon128v12/LWC_AEAD_KAT_128_128.txt
+```
+
+```
+avr-gcc -mmcu=atmega128 -std=c99 -Os -Icrypto_hash/asconhashv12/opt8 crypto_hash/asconhashv12/opt8/*.[cS] \
+    -DAVR_UART -Iavr_uart avr_uart/avr_uart.c -Wno-incompatible-pointer-types -Wno-cpp \
+    -Itests tests/genkat_hash.c -o genkat_hash
+echo "Press Ctrl-C to quit simavr after about a minute"
+simavr -t -m atmega128 ./genkat_hash 2> LWC_HASH_KAT_256.txt
+sed -i -e 's/\x1b\[[0-9;]*m//g' -e 's/\.\.$//' LWC_HASH_KAT_256.txt
+diff LWC_HASH_KAT_256.txt crypto_hash/asconhashv12/LWC_HASH_KAT_256.txt
+```
+
+
+# Benchmarking
+
+## Hints to get more reliable getcycles results on Intel/AMD CPUs:
+
+* Determine the processor base frequency (also called design frequency):
+  - e.g. using the Intel/AMD website
+  - or using `lscpu` listed under model name
+
+* Disable turbo boost (this should lock the frequency to the next value
+  below the processor base frequency):
+  ```
+  echo 1 | sudo tee /sys/devices/system/cpu/intel_pstate/no_turbo
+  ```
+
+* If the above does not work, manually set the frequency using e.g. `cpufreq-set`.
+
+* Determine the actual frequency (under load):
+  - e.g. by watching the frequency using `lscpu` or `cpufreq-info`
+
+* Determine the scaling factor between the actual and base frequency:
+  - factor = actual frequency / base frequency
+
+* Run a getcycles program using the frequency factor and watch the results:
+  ```
+  while true; do ./getcycles_crypto_aead_ascon128v12_opt64 $factor; done
+  ```
+
+* Run the `benchmark-getcycles.sh` script with the frequency factor and a
+  specific algorithm to benchmark all corresponding getcycles implementations:
+  ```
+  scripts/benchmark-getcycles.sh $factor ascon128
+  ```
+
+
+## Hints to activate the performance monitor unit (PMU) on ARM CPUs:
+
+* First try to install `linux-tools` and see if it works.
+
+* On many ARM platforms, the PMU has to be enabled using a kernel module:
+  - Source code for Armv6 (32-bit):
+    <http://sandsoftwaresound.net/raspberry-pi/raspberry-pi-gen-1/performance-counter-kernel-module/>
+  - Source code for Armv7 (32-bit):
+    <https://github.com/thoughtpolice/enable_arm_pmu>
+  - Source code for Armv8/Aarch64 (64-bit):
+    <https://github.com/rdolbeau/enable_arm_pmu>
+
+* Steps to compile the kernel module on the raspberry pi:
+  - Find out the kernel version using `uname -a`
+  - Download the kernel header files, e.g. `raspberrypi-kernel-headers`
+  - Download the source code for the Armv6 kernel module
+  - Build, install and load the kernel module
+
+
+## Benchmark Ascon v1.2 using supercop
+
+Download supercop according to the website: http://bench.cr.yp.to/supercop.html
+
+To test only Ascon, just run the following commands:
+
+```
+./do-part init
+./do-part crypto_aead ascon128v12
+./do-part crypto_aead ascon128av12
+./do-part crypto_aead ascon80pqv12
+./do-part crypto_hash asconhashv12
+./do-part crypto_hash asconxofv12
+```
+
+Show the cycles/Byte for a 1536 Byte long message:
+
+```
+cat bench/*/data | grep '_cycles 1536 ' | awk '{printf "%.1f\t%s\t%s\n", $9/$8,
+$6, $7}' | sort -nr
+```
+
+
+## Evaluate and optimize Ascon on constrained devices:
+
+* The ascon-c code allows setting compile-time parameters `ASCON_INLINE_MODE`
+  (IM), `ASCON_INLINE_PERM` (IP), `ASCON_UNROLL_LOOPS` (UL), `ASCON_INLINE_BI`
+  (IB), via command line or in the `crypto_*/ascon*/*/config.h` files.
+* Use the `benchmark-config.sh` script to evaluate all combinations of these
+  parameters for a given list of Ascon implementations. The script is called
+  with an output file, frequency factor, the algorithm, and the list of
+  implementations to test:
+  ```
+  scripts/benchmark-config.sh results-config.md $factor ascon128 ref opt64 opt64_lowsize
+  ```
+* The `results-config.md` file then contains a markup table with size and cycles
+  for each implementation and parameter set to evaluate several time-area
+  trade-offs.
+* The `benchmark-all.sh` and `benchmark-size.sh` scripts provide a time/size
+  and size-only table of all currently compiled implementations:
+  ```
+  scripts/benchmark-all.sh results-all.md
+  scripts/benchmark-size.sh results-size.md
+  ```
diff --git a/src/ascon-xofa/api.h b/src/ascon-xofa/api.h
new file mode 100644
index 0000000..6f9efc3
--- /dev/null
+++ b/src/ascon-xofa/api.h
@@ -0,0 +1,4 @@
+#define CRYPTO_VERSION "1.2.7"
+#define CRYPTO_BYTES 32
+#define ASCON_HASH_BYTES 0 /* XOF */
+#define ASCON_HASH_ROUNDS 8
diff --git a/src/ascon-xofa/architectures b/src/ascon-xofa/architectures
new file mode 100644
index 0000000..a07c7a4
--- /dev/null
+++ b/src/ascon-xofa/architectures
@@ -0,0 +1,3 @@
+aarch64
+armeabi
+arm
diff --git a/src/ascon-xofa/ascon.h b/src/ascon-xofa/ascon.h
new file mode 100644
index 0000000..c2ee57b
--- /dev/null
+++ b/src/ascon-xofa/ascon.h
@@ -0,0 +1,53 @@
+#ifndef ASCON_H_
+#define ASCON_H_
+
+#include <stdint.h>
+
+#include "api.h"
+#include "config.h"
+
+typedef union { /* Ascon state: 5 x 64 bits, with three overlapping views of the same storage */
+  uint64_t x[5]; /* view as five 64-bit words */
+  uint32_t w[5][2]; /* view as two 32-bit halves per 64-bit word */
+  uint8_t b[5][8]; /* view as eight bytes per 64-bit word */
+} ascon_state_t;
+
+#ifdef ASCON_AEAD_RATE
+
+#define ASCON_KEYWORDS ((CRYPTO_KEYBYTES + 7) / 8) /* key bytes rounded up to whole 64-bit words; parenthesized so it is safe inside larger expressions */
+
+typedef union { /* key storage: ASCON_KEYWORDS 64-bit words, with three overlapping views */
+  uint64_t x[ASCON_KEYWORDS]; /* view as 64-bit words */
+  uint32_t w[ASCON_KEYWORDS][2]; /* view as two 32-bit halves per 64-bit word */
+  uint8_t b[ASCON_KEYWORDS][8]; /* view as eight bytes per 64-bit word */
+} ascon_key_t;
+
+#if !ASCON_INLINE_MODE
+
+void ascon_loadkey(ascon_key_t* key, const uint8_t* k);
+void ascon_initaead(ascon_state_t* s, const ascon_key_t* key,
+                    const uint8_t* npub);
+void ascon_adata(ascon_state_t* s, const uint8_t* ad, uint64_t adlen);
+void ascon_encrypt(ascon_state_t* s, uint8_t* c, const uint8_t* m,
+                   uint64_t mlen);
+void ascon_decrypt(ascon_state_t* s, uint8_t* m, const uint8_t* c,
+                   uint64_t clen);
+void ascon_final(ascon_state_t* s, const ascon_key_t* k);
+
+#endif
+
+#endif
+
+#ifdef ASCON_HASH_BYTES
+
+#if !ASCON_INLINE_MODE
+
+void ascon_inithash(ascon_state_t* s);
+void ascon_absorb(ascon_state_t* s, const uint8_t* in, uint64_t inlen);
+void ascon_squeeze(ascon_state_t* s, uint8_t* out, uint64_t outlen);
+
+#endif
+
+#endif
+
+#endif /* ASCON_H_ */
diff --git a/src/ascon-xofa/bendian.h b/src/ascon-xofa/bendian.h
new file mode 100644
index 0000000..4691995
--- /dev/null
+++ b/src/ascon-xofa/bendian.h
@@ -0,0 +1,39 @@
+#ifndef ENDIAN_H_
+#define ENDIAN_H_
+
+#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
+
+/* macros for big endian machines: the value is passed through unchanged (no byte swap) */
+#ifdef PRAGMA_ENDIAN
+#pragma message("Using macros for big endian machines")
+#endif
+#define U64BIG(x) (x)
+#define U32BIG(x) (x)
+#define U16BIG(x) (x)
+
+#elif defined(_MSC_VER) || \
+    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+
+/* macros for little endian machines (also selected whenever _MSC_VER is defined): reverse the byte order of a 64/32/16-bit value; x is evaluated several times, so avoid side effects */
+#ifdef PRAGMA_ENDIAN
+#pragma message("Using macros for little endian machines")
+#endif
+#define U64BIG(x)                          \
+  (((0x00000000000000FFULL & (x)) << 56) | \
+   ((0x000000000000FF00ULL & (x)) << 40) | \
+   ((0x0000000000FF0000ULL & (x)) << 24) | \
+   ((0x00000000FF000000ULL & (x)) << 8) |  \
+   ((0x000000FF00000000ULL & (x)) >> 8) |  \
+   ((0x0000FF0000000000ULL & (x)) >> 24) | \
+   ((0x00FF000000000000ULL & (x)) >> 40) | \
+   ((0xFF00000000000000ULL & (x)) >> 56))
+#define U32BIG(x)                                           \
+  (((0x000000FF & (x)) << 24) | ((0x0000FF00 & (x)) << 8) | \
+   ((0x00FF0000 & (x)) >> 8) | ((0xFF000000 & (x)) >> 24))
+#define U16BIG(x) (((0x00FF & (x)) << 8) | ((0xFF00 & (x)) >> 8))
+
+#else
+#error "Ascon byte order macros not defined in bendian.h"
+#endif
+
+#endif /* ENDIAN_H_ */
diff --git a/src/ascon-xofa/config.h b/src/ascon-xofa/config.h
new file mode 100644
index 0000000..66a1156
--- /dev/null
+++ b/src/ascon-xofa/config.h
@@ -0,0 +1,19 @@
+#ifndef CONFIG_H_
+#define CONFIG_H_
+
+/* 0: hash.c emits init/absorb/squeeze as ordinary functions (prototypes in ascon.h) */
+#ifndef ASCON_INLINE_MODE
+#define ASCON_INLINE_MODE 0
+#endif
+
+/* 1: P() expands the rounds in place; 0: P() calls the functions in permutations.c */
+#ifndef ASCON_INLINE_PERM
+#define ASCON_INLINE_PERM 1
+#endif
+
+/* 1: use the unrolled P12ROUNDS/P8ROUNDS/P6ROUNDS; 0: use the looped PROUNDS */
+#ifndef ASCON_UNROLL_LOOPS
+#define ASCON_UNROLL_LOOPS 1
+#endif
+
+#endif /* CONFIG_H_ */
diff --git a/src/ascon-xofa/constants.h b/src/ascon-xofa/constants.h
new file mode 100644
index 0000000..80eac8d
--- /dev/null
+++ b/src/ascon-xofa/constants.h
@@ -0,0 +1,90 @@
+#ifndef CONSTANTS_H_
+#define CONSTANTS_H_
+
+#include <stdint.h>
+
+#define ASCON_128_KEYBYTES 16
+#define ASCON_128A_KEYBYTES 16
+#define ASCON_80PQ_KEYBYTES 20
+
+#define ASCON_128_RATE 8
+#define ASCON_128A_RATE 16
+#define ASCON_HASH_RATE 8 /* bytes moved per permutation call in hash.c */
+#define ASCON_PRF_IN_RATE 32
+#define ASCON_PRFA_IN_RATE 40
+#define ASCON_PRF_OUT_RATE 16
+
+#define ASCON_128_PA_ROUNDS 12
+#define ASCON_128_PB_ROUNDS 6
+#define ASCON_128A_PA_ROUNDS 12
+#define ASCON_128A_PB_ROUNDS 8
+
+#define ASCON_HASH_PA_ROUNDS 12
+#define ASCON_HASH_PB_ROUNDS 12
+#define ASCON_HASHA_PA_ROUNDS 12
+#define ASCON_HASHA_PB_ROUNDS 8
+
+#define ASCON_PRF_PA_ROUNDS 12
+#define ASCON_PRF_PB_ROUNDS 12
+#define ASCON_PRFA_PA_ROUNDS 12
+#define ASCON_PRFA_PB_ROUNDS 8
+
+#define ASCON_128_IV 0x80400c0600000000ull
+#define ASCON_128A_IV 0x80800c0800000000ull
+#define ASCON_80PQ_IV 0xa0400c0600000000ull
+
+#define ASCON_HASH_IV 0x00400c0000000100ull /* x[0] seeds, used by ascon_inithash only under ASCON_PRINT_STATE */
+#define ASCON_HASHA_IV 0x00400c0400000100ull
+#define ASCON_XOF_IV 0x00400c0000000000ull
+#define ASCON_XOFA_IV 0x00400c0400000000ull
+
+#define ASCON_HASH_IV0 0xee9398aadb67f03dull /* IV0..IV4: start state for ASCON_HASH_BYTES 32, 12 rounds */
+#define ASCON_HASH_IV1 0x8bb21831c60f1002ull
+#define ASCON_HASH_IV2 0xb48a92db98d5da62ull
+#define ASCON_HASH_IV3 0x43189921b8f8e3e8ull
+#define ASCON_HASH_IV4 0x348fa5c9d525e140ull
+
+#define ASCON_HASHA_IV0 0x01470194fc6528a6ull /* start state for ASCON_HASH_BYTES 32, 8 rounds */
+#define ASCON_HASHA_IV1 0x738ec38ac0adffa7ull
+#define ASCON_HASHA_IV2 0x2ec8e3296c76384cull
+#define ASCON_HASHA_IV3 0xd6f6a54d7f52377dull
+#define ASCON_HASHA_IV4 0xa13c42a223be8d87ull
+
+#define ASCON_XOF_IV0 0xb57e273b814cd416ull /* start state for ASCON_HASH_BYTES 0, 12 rounds */
+#define ASCON_XOF_IV1 0x2b51042562ae2420ull
+#define ASCON_XOF_IV2 0x66a3a7768ddf2218ull
+#define ASCON_XOF_IV3 0x5aad0a7a8153650cull
+#define ASCON_XOF_IV4 0x4f3e0e32539493b6ull
+
+#define ASCON_XOFA_IV0 0x44906568b77b9832ull /* start state for ASCON_HASH_BYTES 0, 8 rounds */
+#define ASCON_XOFA_IV1 0xcd8d6cae53455532ull
+#define ASCON_XOFA_IV2 0xf7b5212756422129ull
+#define ASCON_XOFA_IV3 0x246885e1de0d225bull
+#define ASCON_XOFA_IV4 0xa8cb5ce33449973full
+
+#define ASCON_MAC_IV 0x80808c0000000080ull
+#define ASCON_MACA_IV 0x80808c0400000080ull
+#define ASCON_PRF_IV 0x80808c0000000000ull
+#define ASCON_PRFA_IV 0x80808c0400000000ull
+#define ASCON_PRFS_IV 0x80004c8000000000ull
+
+#define RC0 0xf0 /* round constants: each one is the previous one minus 0x0f */
+#define RC1 0xe1
+#define RC2 0xd2
+#define RC3 0xc3
+#define RC4 0xb4
+#define RC5 0xa5
+#define RC6 0x96
+#define RC7 0x87
+#define RC8 0x78
+#define RC9 0x69
+#define RCa 0x5a
+#define RCb 0x4b
+
+#define RC(i) (i)
+
+#define START(n) ((3 + (n)) << 4 | (12 - (n))) /* START(12) == RC0, START(8) == RC4, START(6) == RC6 */
+#define INC -0x0f /* step from one round constant to the next */
+#define END 0x3c /* RCb + INC; ROUND_LOOP stops when its counter reaches this value */
+
+#endif /* CONSTANTS_H_ */
diff --git a/src/ascon-xofa/forceinline.h b/src/ascon-xofa/forceinline.h
new file mode 100644
index 0000000..e66c1eb
--- /dev/null
+++ b/src/ascon-xofa/forceinline.h
@@ -0,0 +1,23 @@
+#ifndef FORCEINLINE_H_
+#define FORCEINLINE_H_
+
+/* forceinline: strongest "always inline" spelling the compiler offers, plain inline otherwise */
+#ifdef _MSC_VER
+#define forceinline __forceinline
+#elif defined(__GNUC__)
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
+#define forceinline inline __attribute__((__always_inline__))
+#else
+#define forceinline static inline
+#endif
+#elif defined(__clang__) /* was __CLANG__, which no compiler predefines */
+#if __has_attribute(__always_inline__)
+#define forceinline inline __attribute__((__always_inline__))
+#else
+#define forceinline inline
+#endif
+#else
+#define forceinline inline
+#endif
+
+#endif /* FORCEINLINE_H_ */
diff --git a/src/ascon-xofa/goal-constbranch b/src/ascon-xofa/goal-constbranch
new file mode 100644
index 0000000..1a9c048
--- /dev/null
+++ b/src/ascon-xofa/goal-constbranch
@@ -0,0 +1 @@
+Branches reviewed 2020-11-13 by Martin Schläffer.
diff --git a/src/ascon-xofa/goal-constindex b/src/ascon-xofa/goal-constindex
new file mode 100644
index 0000000..316d11d
--- /dev/null
+++ b/src/ascon-xofa/goal-constindex
@@ -0,0 +1 @@
+Addresses reviewed 2020-11-13 by Martin Schläffer.
diff --git a/src/ascon-xofa/hash.c b/src/ascon-xofa/hash.c
new file mode 100644
index 0000000..54bc8e4
--- /dev/null
+++ b/src/ascon-xofa/hash.c
@@ -0,0 +1,89 @@
+#include "api.h"
+#include "ascon.h"
+/*#include "crypto_hash.h"*/
+#include "permutations.h"
+#include "printstate.h"
+
+#if !ASCON_INLINE_MODE
+#undef forceinline
+#define forceinline
+#endif
+
+#ifdef ASCON_HASH_BYTES
+
+forceinline void ascon_inithash(ascon_state_t* s) {
+  int k;
+  /* debug builds first run and print the IV permutation; then load the start state */
+#ifdef ASCON_PRINT_STATE
+#if ASCON_HASH_BYTES == 32 && ASCON_HASH_ROUNDS == 12
+  s->x[0] = ASCON_HASH_IV;
+#elif ASCON_HASH_BYTES == 32 && ASCON_HASH_ROUNDS == 8
+  s->x[0] = ASCON_HASHA_IV;
+#elif ASCON_HASH_BYTES == 0 && ASCON_HASH_ROUNDS == 12
+  s->x[0] = ASCON_XOF_IV;
+#elif ASCON_HASH_BYTES == 0 && ASCON_HASH_ROUNDS == 8
+  s->x[0] = ASCON_XOFA_IV;
+#endif
+  for (k = 4; k > 0; --k) s->x[k] = 0;
+  printstate("initial value", s);
+  P(s, 12);
+#endif
+#if ASCON_HASH_BYTES == 32 && ASCON_HASH_ROUNDS == 12
+  const uint64_t init[5] = {ASCON_HASH_IV0, ASCON_HASH_IV1, ASCON_HASH_IV2,
+                            ASCON_HASH_IV3, ASCON_HASH_IV4};
+#elif ASCON_HASH_BYTES == 32 && ASCON_HASH_ROUNDS == 8
+  const uint64_t init[5] = {ASCON_HASHA_IV0, ASCON_HASHA_IV1, ASCON_HASHA_IV2,
+                            ASCON_HASHA_IV3, ASCON_HASHA_IV4};
+#elif ASCON_HASH_BYTES == 0 && ASCON_HASH_ROUNDS == 12
+  const uint64_t init[5] = {ASCON_XOF_IV0, ASCON_XOF_IV1, ASCON_XOF_IV2,
+                            ASCON_XOF_IV3, ASCON_XOF_IV4};
+#elif ASCON_HASH_BYTES == 0 && ASCON_HASH_ROUNDS == 8
+  const uint64_t init[5] = {ASCON_XOFA_IV0, ASCON_XOFA_IV1, ASCON_XOFA_IV2,
+                            ASCON_XOFA_IV3, ASCON_XOFA_IV4};
+#endif
+  for (k = 0; k != 5; ++k) s->x[k] = init[k];
+  printstate("initialization", s);
+}
+
+/* XOR inlen bytes of in into x[0], running P after every full block; the */
+/* padded tail is XORed in last and is NOT followed by a permutation here. */
+forceinline void ascon_absorb(ascon_state_t* s, const uint8_t* in,
+                              uint64_t inlen) {
+  /* full ASCON_HASH_RATE-byte blocks */
+  for (; inlen >= ASCON_HASH_RATE;
+       in += ASCON_HASH_RATE, inlen -= ASCON_HASH_RATE) {
+    s->x[0] ^= LOAD(in, 8);
+    printstate("absorb plaintext", s);
+    P(s, ASCON_HASH_ROUNDS);
+  }
+  /* the 0..7 leftover bytes together with the padding word */
+  s->x[0] ^= LOADBYTES(in, inlen) ^ PAD(inlen);
+  printstate("pad plaintext", s);
+}
+
+/* Write outlen bytes of output to out; every call begins with P(s, 12). */
+forceinline void ascon_squeeze(ascon_state_t* s, uint8_t* out,
+                               uint64_t outlen) {
+  P(s, 12);
+  /* every block except the last is followed by another permutation */
+  for (; outlen > ASCON_HASH_RATE;
+       out += ASCON_HASH_RATE, outlen -= ASCON_HASH_RATE) {
+    STORE(out, s->x[0], 8);
+    printstate("squeeze output", s);
+    P(s, ASCON_HASH_ROUNDS);
+  }
+  /* last block: whatever is left, at most ASCON_HASH_RATE bytes */
+  STOREBYTES(out, s->x[0], outlen);
+  printstate("squeeze output", s);
+}
+
+/*int crypto_hash(unsigned char* out, const unsigned char* in,
+                unsigned long long inlen) {
+  ascon_state_t s;
+  ascon_inithash(&s);
+  ascon_absorb(&s, in, inlen);
+  ascon_squeeze(&s, out, CRYPTO_BYTES);
+  return 0;
+}*/
+
+#endif
diff --git a/src/ascon-xofa/implementors b/src/ascon-xofa/implementors
new file mode 100644
index 0000000..b110c1a
--- /dev/null
+++ b/src/ascon-xofa/implementors
@@ -0,0 +1,2 @@
+Christoph Dobraunig
+Martin Schläffer
diff --git a/src/ascon-xofa/permutations.c b/src/ascon-xofa/permutations.c
new file mode 100644
index 0000000..02bbadb
--- /dev/null
+++ b/src/ascon-xofa/permutations.c
@@ -0,0 +1,29 @@
+#include "permutations.h"
+
+#if !ASCON_INLINE_PERM && ASCON_UNROLL_LOOPS
+/* Out-of-line 12-round permutation; permutations.h's P() calls it for nr == 12. */
+void P12(ascon_state_t* s) { P12ROUNDS(s); }
+
+#endif
+
+#if ((defined(ASCON_AEAD_RATE) && ASCON_AEAD_RATE == 16) ||    \
+     (defined(ASCON_HASH_ROUNDS) && ASCON_HASH_ROUNDS == 8) || \
+     (defined(ASCON_PRF_ROUNDS) && ASCON_PRF_ROUNDS == 8)) &&  \
+    !ASCON_INLINE_PERM && ASCON_UNROLL_LOOPS
+/* Out-of-line 8-round permutation, built only for configurations selecting 8 rounds. */
+void P8(ascon_state_t* s) { P8ROUNDS(s); }
+
+#endif
+
+#if (defined(ASCON_AEAD_RATE) && ASCON_AEAD_RATE == 8) && \
+    !ASCON_INLINE_PERM && ASCON_UNROLL_LOOPS
+/* Out-of-line 6-round permutation, built only when ASCON_AEAD_RATE is 8. */
+void P6(ascon_state_t* s) { P6ROUNDS(s); }
+
+#endif
+
+#if !ASCON_INLINE_PERM && !ASCON_UNROLL_LOOPS
+/* Looped permutation: PROUNDS derives the first round constant from nr. */
+void P(ascon_state_t* s, int nr) { PROUNDS(s, nr); }
+
+#endif
diff --git a/src/ascon-xofa/permutations.h b/src/ascon-xofa/permutations.h
new file mode 100644
index 0000000..cc1b4af
--- /dev/null
+++ b/src/ascon-xofa/permutations.h
@@ -0,0 +1,78 @@
+#ifndef PERMUTATIONS_H_
+#define PERMUTATIONS_H_
+
+#include <stdint.h>
+
+#include "api.h"
+#include "ascon.h"
+#include "config.h"
+#include "constants.h"
+#include "printstate.h"
+#include "round.h"
+
+forceinline void P12ROUNDS(ascon_state_t* s) { /* 12 rounds, constants RC0..RCb in order */
+  ROUND(s, RC0);
+  ROUND(s, RC1);
+  ROUND(s, RC2);
+  ROUND(s, RC3);
+  ROUND(s, RC4);
+  ROUND(s, RC5);
+  ROUND(s, RC6);
+  ROUND(s, RC7);
+  ROUND(s, RC8);
+  ROUND(s, RC9);
+  ROUND(s, RCa);
+  ROUND(s, RCb);
+}
+
+forceinline void P8ROUNDS(ascon_state_t* s) { /* 8 rounds: the last eight constants, RC4..RCb */
+  ROUND(s, RC4);
+  ROUND(s, RC5);
+  ROUND(s, RC6);
+  ROUND(s, RC7);
+  ROUND(s, RC8);
+  ROUND(s, RC9);
+  ROUND(s, RCa);
+  ROUND(s, RCb);
+}
+
+forceinline void P6ROUNDS(ascon_state_t* s) { /* 6 rounds: the last six constants, RC6..RCb */
+  ROUND(s, RC6);
+  ROUND(s, RC7);
+  ROUND(s, RC8);
+  ROUND(s, RC9);
+  ROUND(s, RCa);
+  ROUND(s, RCb);
+}
+
+#if ASCON_INLINE_PERM && ASCON_UNROLL_LOOPS
+
+forceinline void P(ascon_state_t* s, int nr) { /* nr: 12, 8 or 6; otherwise no-op */
+  if (nr == 12) P12ROUNDS(s);
+  else if (nr == 8) P8ROUNDS(s);
+  else if (nr == 6) P6ROUNDS(s);
+}
+
+#elif !ASCON_INLINE_PERM && ASCON_UNROLL_LOOPS
+
+void P12(ascon_state_t* s);
+void P8(ascon_state_t* s);
+void P6(ascon_state_t* s);
+
+forceinline void P(ascon_state_t* s, int nr) { /* nr: 12, 8 or 6; otherwise no-op */
+  if (nr == 12) P12(s);
+  else if (nr == 8) P8(s);
+  else if (nr == 6) P6(s);
+}
+
+#elif ASCON_INLINE_PERM && !ASCON_UNROLL_LOOPS
+
+forceinline void P(ascon_state_t* s, int nr) { PROUNDS(s, nr); } /* looped rounds, inlined */
+
+#else /* !ASCON_INLINE_PERM && !ASCON_UNROLL_LOOPS */
+
+void P(ascon_state_t* s, int nr);
+
+#endif
+
+#endif /* PERMUTATIONS_H_ */
diff --git a/src/ascon-xofa/printstate.c b/src/ascon-xofa/printstate.c
new file mode 100644
index 0000000..a99cbb1
--- /dev/null
+++ b/src/ascon-xofa/printstate.c
@@ -0,0 +1,41 @@
+#ifdef ASCON_PRINT_STATE
+
+#include "printstate.h"
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+
+#ifndef WORDTOU64
+#define WORDTOU64
+#endif
+
+#ifndef U64BIG
+#define U64BIG
+#endif
+
+void printword(const char* text, const uint64_t x) { /* prints "<text>=<16 hex digits>" */
+  printf("%s=%016" PRIx64, text, U64BIG(WORDTOU64(x)));
+}
+
+/* Print "<text>:" padded with spaces up to 17 characters, then x0..x4 in hex. */
+void printstate(const char* text, const ascon_state_t* s) {
+  size_t i; /* size_t, not int: this is compared against strlen() */
+  printf("%s:", text);
+  for (i = strlen(text); i < 17; ++i) printf(" ");
+  printword(" x0", s->x[0]);
+  printword(" x1", s->x[1]);
+  printword(" x2", s->x[2]);
+  printword(" x3", s->x[3]);
+  printword(" x4", s->x[4]);
+#ifdef ASCON_PRINT_BI /* PRIx32: uint32_t is not unsigned int on every libc */
+  printf("  x0=%08" PRIx32 "_%08" PRIx32, s->w[0][1], s->w[0][0]);
+  printf(" x1=%08" PRIx32 "_%08" PRIx32, s->w[1][1], s->w[1][0]);
+  printf(" x2=%08" PRIx32 "_%08" PRIx32, s->w[2][1], s->w[2][0]);
+  printf(" x3=%08" PRIx32 "_%08" PRIx32, s->w[3][1], s->w[3][0]);
+  printf(" x4=%08" PRIx32 "_%08" PRIx32, s->w[4][1], s->w[4][0]);
+#endif
+  printf("\n");
+}
+
+#endif
diff --git a/src/ascon-xofa/printstate.h b/src/ascon-xofa/printstate.h
new file mode 100644
index 0000000..40b1f9c
--- /dev/null
+++ b/src/ascon-xofa/printstate.h
@@ -0,0 +1,24 @@
+#ifndef PRINTSTATE_H_
+#define PRINTSTATE_H_
+
+#ifdef ASCON_PRINT_STATE /* debug builds: real printers, defined in printstate.c */
+
+#include "ascon.h"
+#include "word.h"
+
+void printword(const char* text, const uint64_t x);
+void printstate(const char* text, const ascon_state_t* s);
+
+#else /* otherwise both calls expand to an empty statement */
+
+#define printword(text, w) \
+  do {                     \
+  } while (0)
+
+#define printstate(text, s) \
+  do {                      \
+  } while (0)
+
+#endif /* ASCON_PRINT_STATE */
+
+#endif /* PRINTSTATE_H_ */
diff --git a/src/ascon-xofa/round.h b/src/ascon-xofa/round.h
new file mode 100644
index 0000000..dcbaf33
--- /dev/null
+++ b/src/ascon-xofa/round.h
@@ -0,0 +1,350 @@
+#ifndef ROUND_H_
+#define ROUND_H_
+
+#include "ascon.h"
+#include "constants.h"
+#include "forceinline.h"
+#include "printstate.h"
+#include "word.h"
+
+/* Looped Ascon rounds: C is the first round constant; it steps by -15 until it reaches 60. */
+forceinline void ROUND_LOOP(ascon_state_t* s, uint32_t C) {
+  uint32_t tmp0, tmp1;
+  __asm__ __volatile__(
+      "@.syntax_unified\n\t"
+      "rbegin_%=:;\n\t"
+      "eor %[x2_l], %[x2_l], %[tmp1]\n\t"
+      "push  {%[tmp1]}\n\t" /* park the round constant; tmp1 is scratch from here */
+      "eor %[x0_l], %[x0_l], %[x4_l]\n\t"
+      "eor %[x4_l], %[x4_l], %[x3_l]\n\t"
+      "eor %[x2_l], %[x2_l], %[x1_l]\n\t"
+      "mvn %[tmp0], %[x0_l]\n\t"
+      "orr %[tmp0], %[tmp0], %[x4_l]\n\t"
+      "movs %[tmp1], %[x2_l]\n\t"
+      "bic %[tmp1], %[tmp1], %[x1_l]\n\t"
+      "eor %[x0_l], %[x0_l], %[tmp1]\n\t"
+      "mvn %[tmp1], %[x4_l]\n\t"
+      "orr %[tmp1], %[tmp1], %[x3_l]\n\t"
+      "eor %[x2_l], %[x2_l], %[tmp1]\n\t"
+      "movs %[tmp1], %[x1_l]\n\t"
+      "bic %[tmp1], %[tmp1], %[x0_l]\n\t"
+      "eor %[x4_l], %[x4_l], %[tmp1]\n\t"
+      "movs %[tmp1], %[x3_l]\n\t"
+      "and %[tmp1], %[tmp1], %[x2_l]\n\t"
+      "eor %[tmp1], %[x1_l], %[tmp1]\n\t"
+      "eor %[tmp0], %[x3_l], %[tmp0]\n\t"
+      "eor %[tmp0], %[tmp0], %[x2_l]\n\t"
+      "eor %[tmp1], %[tmp1], %[x0_l]\n\t"
+      "eor %[x0_l], %[x0_l], %[x4_l]\n\t"
+      "movs %[x1_l], %[x0_h]\n\t"
+      "movs %[x3_l], %[x1_h]\n\t"
+      "movs %[x0_h], %[x2_l]\n\t"
+      "movs %[x1_h], %[x0_l]\n\t"
+      "movs %[x0_l], %[x2_h]\n\t"
+      "movs %[x2_l], %[x3_h]\n\t"
+      "movs %[tmp2], %[x4_h]\n\t"
+      "movs %[x2_h], %[tmp0]\n\t"
+      "movs %[x3_h], %[x4_l]\n\t"
+      "eor %[x1_l], %[x1_l], %[tmp2]\n\t"
+      "eor %[tmp2], %[tmp2], %[x2_l]\n\t"
+      "eor %[x0_l], %[x0_l], %[x3_l]\n\t"
+      "mvn %[tmp0], %[x1_l]\n\t"
+      "orr %[tmp0], %[tmp0], %[tmp2]\n\t"
+      "movs %[x4_l], %[x0_l]\n\t"
+      "bic %[x4_l], %[x4_l], %[x3_l]\n\t"
+      "eor %[x1_l], %[x1_l], %[x4_l]\n\t"
+      "mvn %[x4_l], %[tmp2]\n\t"
+      "orr %[x4_l], %[x4_l], %[x2_l]\n\t"
+      "eor %[x0_l], %[x0_l], %[x4_l]\n\t"
+      "movs %[x4_l], %[x3_l]\n\t"
+      "bic %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "eor %[tmp2], %[tmp2], %[x4_l]\n\t"
+      "movs %[x4_l], %[x2_l]\n\t"
+      "and %[x4_l], %[x4_l], %[x0_l]\n\t"
+      "eor %[x3_l], %[x3_l], %[x4_l]\n\t"
+      "eor %[x2_l], %[x2_l], %[tmp0]\n\t"
+      "eor %[x2_l], %[x2_l], %[x0_l]\n\t"
+      "eor %[x3_l], %[x3_l], %[x1_l]\n\t"
+      "eor %[x1_l], %[x1_l], %[tmp2]\n\t"
+      "movs %[x4_h], %[x2_l]\n\t"
+      "movs %[x2_l], %[x0_h]\n\t"
+      "movs %[x0_h], %[x1_l]\n\t"
+      "lsr %[x4_l], %[x0_l], #6\n\t"
+      "lsl %[x1_l], %[x2_l], #26\n\t"
+      "lsr %[tmp0], %[x2_l], #6\n\t"
+      "eor %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "lsl %[x1_l], %[x0_l], #26\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "lsr %[x1_l], %[x0_l], #1\n\t"
+      "eor %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "lsl %[x1_l], %[x2_l], #31\n\t"
+      "eor %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "lsr %[x1_l], %[x2_l], #1\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "lsl %[x1_l], %[x0_l], #31\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "eor %[x0_l], %[x0_l], %[x4_l]\n\t"
+      "eor %[x2_l], %[x2_l], %[tmp0]\n\t"
+      "lsl %[x4_l], %[x3_l], #3\n\t"
+      "lsr %[x1_l], %[tmp1], #29\n\t"
+      "lsl %[tmp0], %[tmp1], #3\n\t"
+      "eor %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "lsr %[x1_l], %[x3_l], #29\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "lsl %[x1_l], %[x3_l], #25\n\t"
+      "eor %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "lsr %[x1_l], %[tmp1], #7\n\t"
+      "eor %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "lsl %[x1_l], %[tmp1], #25\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "lsr %[x1_l], %[x3_l], #7\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "eor %[x3_l], %[x3_l], %[x4_l]\n\t"
+      "eor %[tmp1], %[tmp1], %[tmp0]\n\t"
+      "movs %[x4_l], %[x3_h]\n\t"
+      "movs %[x3_h], %[tmp1]\n\t"
+      "lsl %[tmp1], %[tmp2], #23\n\t"
+      "lsr %[x1_l], %[x4_l], #9\n\t"
+      "lsl %[tmp0], %[x4_l], #23\n\t"
+      "eor %[tmp1], %[tmp1], %[x1_l]\n\t"
+      "lsr %[x1_l], %[tmp2], #9\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "lsr %[x1_l], %[tmp2], #7\n\t"
+      "eor %[tmp1], %[tmp1], %[x1_l]\n\t"
+      "lsl %[x1_l], %[x4_l], #25\n\t"
+      "eor %[tmp1], %[tmp1], %[x1_l]\n\t"
+      "lsr %[x1_l], %[x4_l], #7\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "lsl %[x1_l], %[tmp2], #25\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "eor %[tmp2], %[tmp2], %[tmp1]\n\t"
+      "eor %[x4_l], %[x4_l], %[tmp0]\n\t"
+      "movs %[x1_l], %[x3_h]\n\t"
+      "movs %[tmp1], %[x4_h]\n\t"
+      "movs %[x4_h], %[tmp2]\n\t"
+      "movs %[x3_h], %[x3_l]\n\t"
+      "movs %[x3_l], %[x2_h]\n\t"
+      "movs %[x2_h], %[x0_l]\n\t"
+      "lsr %[tmp2], %[tmp1], #17\n\t"
+      "lsl %[x0_l], %[x3_l], #15\n\t"
+      "lsr %[tmp0], %[x3_l], #17\n\t"
+      "eor %[tmp2], %[tmp2], %[x0_l]\n\t"
+      "lsl %[x0_l], %[tmp1], #15\n\t"
+      "eor %[tmp0], %[tmp0], %[x0_l]\n\t"
+      "lsr %[x0_l], %[tmp1], #10\n\t"
+      "eor %[tmp2], %[tmp2], %[x0_l]\n\t"
+      "lsl %[x0_l], %[x3_l], #22\n\t"
+      "eor %[tmp2], %[tmp2], %[x0_l]\n\t"
+      "lsr %[x0_l], %[x3_l], #10\n\t"
+      "eor %[tmp0], %[tmp0], %[x0_l]\n\t"
+      "lsl %[x0_l], %[tmp1], #22\n\t"
+      "eor %[tmp0], %[tmp0], %[x0_l]\n\t"
+      "eor %[tmp1], %[tmp1], %[tmp2]\n\t"
+      "eor %[x3_l], %[x3_l], %[tmp0]\n\t"
+      "movs %[tmp0], %[x0_h]\n\t"
+      "movs %[x0_l], %[x1_h]\n\t"
+      "movs %[x0_h], %[x4_l]\n\t"
+      "movs %[x1_h], %[x3_h]\n\t"
+      "movs %[x3_h], %[tmp1]\n\t"
+      "lsr %[x4_l], %[tmp0], #28\n\t"
+      "lsl %[tmp1], %[x0_l], #4\n\t"
+      "lsr %[tmp2], %[x0_l], #28\n\t"
+      "eor %[x4_l], %[x4_l], %[tmp1]\n\t"
+      "lsl %[tmp1], %[tmp0], #4\n\t"
+      "eor %[tmp2], %[tmp2], %[tmp1]\n\t"
+      "lsr %[tmp1], %[tmp0], #19\n\t"
+      "eor %[x4_l], %[x4_l], %[tmp1]\n\t"
+      "lsl %[tmp1], %[x0_l], #13\n\t"
+      "eor %[x4_l], %[x4_l], %[tmp1]\n\t"
+      "lsr %[tmp1], %[x0_l], #19\n\t"
+      "eor %[tmp2], %[tmp2], %[tmp1]\n\t"
+      "lsl %[tmp1], %[tmp0], #13\n\t"
+      "eor %[tmp2], %[tmp2], %[tmp1]\n\t"
+      "pop  {%[tmp1]}\n\t" /* bring the round constant back */
+      "eor %[tmp0], %[tmp0], %[x4_l]\n\t"
+      "eor %[x0_l], %[x0_l], %[tmp2]\n\t"
+      "movs %[x4_l], %[x0_h]\n\t"
+      "movs %[x0_h], %[tmp0]\n\t"
+      "sub %[tmp1], %[tmp1], #15\n\t" /* next round constant */
+      "cmp %[tmp1], #60\n\t" /* 60 == END: all rounds done */
+      "beq rend_%=\n\t"
+      "b rbegin_%=\n\t"
+      "rend_%=:;\n\t"
+      :
+      [x0_l] "+l"(s->w[0][0]), [x0_h] "+h"(s->w[0][1]), [x1_l] "+l"(s->w[1][0]),
+      [x1_h] "+h"(s->w[1][1]), [x2_l] "+l"(s->w[2][0]), [x2_h] "+h"(s->w[2][1]),
+      [x3_l] "+l"(s->w[3][0]), [x3_h] "+h"(s->w[3][1]), [x4_l] "+l"(s->w[4][0]),
+      [x4_h] "+h"(s->w[4][1]), [tmp1] "+l"(C), [tmp0] "=l"(tmp0),
+      [tmp2] "=l"(tmp1)
+      : /* no inputs */ : "cc"); /* movs/cmp/sub change the condition flags */
+}
+
+/* One Ascon round on s; the round constant C is XORed into the low word of x2 first. */
+forceinline void ROUND(ascon_state_t* s, uint32_t C) {
+  uint32_t tmp0, tmp1, tmp2;
+  __asm__ __volatile__(
+      "@.syntax_unified\n\t"
+      "movs %[tmp0],  %[C]\n\t"
+      "eor %[x2_l], %[x2_l], %[tmp0]\n\t"
+      "eor %[x0_l], %[x0_l], %[x4_l]\n\t"
+      "eor %[x4_l], %[x4_l], %[x3_l]\n\t"
+      "eor %[x2_l], %[x2_l], %[x1_l]\n\t"
+      "mvn %[tmp0], %[x0_l]\n\t"
+      "orr %[tmp0], %[tmp0], %[x4_l]\n\t"
+      "movs %[tmp1], %[x2_l]\n\t"
+      "bic %[tmp1], %[tmp1], %[x1_l]\n\t"
+      "eor %[x0_l], %[x0_l], %[tmp1]\n\t"
+      "mvn %[tmp1], %[x4_l]\n\t"
+      "orr %[tmp1], %[tmp1], %[x3_l]\n\t"
+      "eor %[x2_l], %[x2_l], %[tmp1]\n\t"
+      "movs %[tmp1], %[x1_l]\n\t"
+      "bic %[tmp1], %[tmp1], %[x0_l]\n\t"
+      "eor %[x4_l], %[x4_l], %[tmp1]\n\t"
+      "movs %[tmp1], %[x3_l]\n\t"
+      "and %[tmp1], %[tmp1], %[x2_l]\n\t"
+      "eor %[tmp1], %[x1_l], %[tmp1]\n\t"
+      "eor %[tmp0], %[x3_l], %[tmp0]\n\t"
+      "eor %[tmp0], %[tmp0], %[x2_l]\n\t"
+      "eor %[tmp1], %[tmp1], %[x0_l]\n\t"
+      "eor %[x0_l], %[x0_l], %[x4_l]\n\t"
+      "movs %[x1_l], %[x0_h]\n\t"
+      "movs %[x3_l], %[x1_h]\n\t"
+      "movs %[x0_h], %[x2_l]\n\t"
+      "movs %[x1_h], %[x0_l]\n\t"
+      "movs %[x0_l], %[x2_h]\n\t"
+      "movs %[x2_l], %[x3_h]\n\t"
+      "movs %[tmp2], %[x4_h]\n\t"
+      "movs %[x2_h], %[tmp0]\n\t"
+      "movs %[x3_h], %[x4_l]\n\t"
+      "eor %[x1_l], %[x1_l], %[tmp2]\n\t"
+      "eor %[tmp2], %[tmp2], %[x2_l]\n\t"
+      "eor %[x0_l], %[x0_l], %[x3_l]\n\t"
+      "mvn %[tmp0], %[x1_l]\n\t"
+      "orr %[tmp0], %[tmp0], %[tmp2]\n\t"
+      "movs %[x4_l], %[x0_l]\n\t"
+      "bic %[x4_l], %[x4_l], %[x3_l]\n\t"
+      "eor %[x1_l], %[x1_l], %[x4_l]\n\t"
+      "mvn %[x4_l], %[tmp2]\n\t"
+      "orr %[x4_l], %[x4_l], %[x2_l]\n\t"
+      "eor %[x0_l], %[x0_l], %[x4_l]\n\t"
+      "movs %[x4_l], %[x3_l]\n\t"
+      "bic %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "eor %[tmp2], %[tmp2], %[x4_l]\n\t"
+      "movs %[x4_l], %[x2_l]\n\t"
+      "and %[x4_l], %[x4_l], %[x0_l]\n\t"
+      "eor %[x3_l], %[x3_l], %[x4_l]\n\t"
+      "eor %[x2_l], %[x2_l], %[tmp0]\n\t"
+      "eor %[x2_l], %[x2_l], %[x0_l]\n\t"
+      "eor %[x3_l], %[x3_l], %[x1_l]\n\t"
+      "eor %[x1_l], %[x1_l], %[tmp2]\n\t"
+      "movs %[x4_h], %[x2_l]\n\t"
+      "movs %[x2_l], %[x0_h]\n\t"
+      "movs %[x0_h], %[x1_l]\n\t"
+      "lsr %[x4_l], %[x0_l], #6\n\t"
+      "lsl %[x1_l], %[x2_l], #26\n\t"
+      "lsr %[tmp0], %[x2_l], #6\n\t"
+      "eor %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "lsl %[x1_l], %[x0_l], #26\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "lsr %[x1_l], %[x0_l], #1\n\t"
+      "eor %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "lsl %[x1_l], %[x2_l], #31\n\t"
+      "eor %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "lsr %[x1_l], %[x2_l], #1\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "lsl %[x1_l], %[x0_l], #31\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "eor %[x0_l], %[x0_l], %[x4_l]\n\t"
+      "eor %[x2_l], %[x2_l], %[tmp0]\n\t"
+      "lsl %[x4_l], %[x3_l], #3\n\t"
+      "lsr %[x1_l], %[tmp1], #29\n\t"
+      "lsl %[tmp0], %[tmp1], #3\n\t"
+      "eor %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "lsr %[x1_l], %[x3_l], #29\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "lsl %[x1_l], %[x3_l], #25\n\t"
+      "eor %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "lsr %[x1_l], %[tmp1], #7\n\t"
+      "eor %[x4_l], %[x4_l], %[x1_l]\n\t"
+      "lsl %[x1_l], %[tmp1], #25\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "lsr %[x1_l], %[x3_l], #7\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "eor %[x3_l], %[x3_l], %[x4_l]\n\t"
+      "eor %[tmp1], %[tmp1], %[tmp0]\n\t"
+      "movs %[x4_l], %[x3_h]\n\t"
+      "movs %[x3_h], %[tmp1]\n\t"
+      "lsl %[tmp1], %[tmp2], #23\n\t"
+      "lsr %[x1_l], %[x4_l], #9\n\t"
+      "lsl %[tmp0], %[x4_l], #23\n\t"
+      "eor %[tmp1], %[tmp1], %[x1_l]\n\t"
+      "lsr %[x1_l], %[tmp2], #9\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "lsr %[x1_l], %[tmp2], #7\n\t"
+      "eor %[tmp1], %[tmp1], %[x1_l]\n\t"
+      "lsl %[x1_l], %[x4_l], #25\n\t"
+      "eor %[tmp1], %[tmp1], %[x1_l]\n\t"
+      "lsr %[x1_l], %[x4_l], #7\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "lsl %[x1_l], %[tmp2], #25\n\t"
+      "eor %[tmp0], %[tmp0], %[x1_l]\n\t"
+      "eor %[tmp2], %[tmp2], %[tmp1]\n\t"
+      "eor %[x4_l], %[x4_l], %[tmp0]\n\t"
+      "movs %[x1_l], %[x3_h]\n\t"
+      "movs %[tmp1], %[x4_h]\n\t"
+      "movs %[x4_h], %[tmp2]\n\t"
+      "movs %[x3_h], %[x3_l]\n\t"
+      "movs %[x3_l], %[x2_h]\n\t"
+      "movs %[x2_h], %[x0_l]\n\t"
+      "lsr %[tmp2], %[tmp1], #17\n\t"
+      "lsl %[x0_l], %[x3_l], #15\n\t"
+      "lsr %[tmp0], %[x3_l], #17\n\t"
+      "eor %[tmp2], %[tmp2], %[x0_l]\n\t"
+      "lsl %[x0_l], %[tmp1], #15\n\t"
+      "eor %[tmp0], %[tmp0], %[x0_l]\n\t"
+      "lsr %[x0_l], %[tmp1], #10\n\t"
+      "eor %[tmp2], %[tmp2], %[x0_l]\n\t"
+      "lsl %[x0_l], %[x3_l], #22\n\t"
+      "eor %[tmp2], %[tmp2], %[x0_l]\n\t"
+      "lsr %[x0_l], %[x3_l], #10\n\t"
+      "eor %[tmp0], %[tmp0], %[x0_l]\n\t"
+      "lsl %[x0_l], %[tmp1], #22\n\t"
+      "eor %[tmp0], %[tmp0], %[x0_l]\n\t"
+      "eor %[tmp1], %[tmp1], %[tmp2]\n\t"
+      "eor %[x3_l], %[x3_l], %[tmp0]\n\t"
+      "movs %[tmp0], %[x0_h]\n\t"
+      "movs %[x0_l], %[x1_h]\n\t"
+      "movs %[x0_h], %[x4_l]\n\t"
+      "movs %[x1_h], %[x3_h]\n\t"
+      "movs %[x3_h], %[tmp1]\n\t"
+      "lsr %[x4_l], %[tmp0], #28\n\t"
+      "lsl %[tmp1], %[x0_l], #4\n\t"
+      "lsr %[tmp2], %[x0_l], #28\n\t"
+      "eor %[x4_l], %[x4_l], %[tmp1]\n\t"
+      "lsl %[tmp1], %[tmp0], #4\n\t"
+      "eor %[tmp2], %[tmp2], %[tmp1]\n\t"
+      "lsr %[tmp1], %[tmp0], #19\n\t"
+      "eor %[x4_l], %[x4_l], %[tmp1]\n\t"
+      "lsl %[tmp1], %[x0_l], #13\n\t"
+      "eor %[x4_l], %[x4_l], %[tmp1]\n\t"
+      "lsr %[tmp1], %[x0_l], #19\n\t"
+      "eor %[tmp2], %[tmp2], %[tmp1]\n\t"
+      "lsl %[tmp1], %[tmp0], #13\n\t"
+      "eor %[tmp2], %[tmp2], %[tmp1]\n\t"
+      "eor %[tmp0], %[tmp0], %[x4_l]\n\t"
+      "eor %[x0_l], %[x0_l], %[tmp2]\n\t"
+      "movs %[x4_l], %[x0_h]\n\t"
+      "movs %[x0_h], %[tmp0]\n\t"
+      :
+      [x0_l] "+l"(s->w[0][0]), [x0_h] "+h"(s->w[0][1]), [x1_l] "+l"(s->w[1][0]),
+      [x1_h] "+h"(s->w[1][1]), [x2_l] "+l"(s->w[2][0]), [x2_h] "+h"(s->w[2][1]),
+      [x3_l] "+l"(s->w[3][0]), [x3_h] "+h"(s->w[3][1]), [x4_l] "+l"(s->w[4][0]),
+      [x4_h] "+h"(s->w[4][1]), [tmp0] "=l"(tmp0), [tmp1] "=l"(tmp1),
+      [tmp2] "=l"(tmp2)
+      : [C] "ri"(C) : "cc"); /* "cc": movs changes the condition flags */
+  printstate(" round output", s);
+}
+/* Looped permutation: START(nr) == 60 + 15 * nr, so ROUND_LOOP runs nr rounds (nr in 1..12). */
+forceinline void PROUNDS(ascon_state_t* s, int nr) { ROUND_LOOP(s, START(nr)); }
+
+#endif /* ROUND_H_ */
diff --git a/src/ascon-xofa/word.h b/src/ascon-xofa/word.h
new file mode 100644
index 0000000..e8949db
--- /dev/null
+++ b/src/ascon-xofa/word.h
@@ -0,0 +1,92 @@
+#ifndef WORD_H_
+#define WORD_H_
+
+#include <stdint.h>
+#include <string.h>
+
+#include "bendian.h"
+#include "forceinline.h"
+
+/* One 64-bit word, also addressable as two 32-bit halves or eight bytes. */
+typedef union {
+  uint64_t x;
+  uint32_t w[2];
+  uint8_t b[8];
+} word_t;
+
+/* Both conversions are the same U64BIG byte swap (identity on big endian). */
+#define U64TOWORD(x) U64BIG(x)
+#define WORDTOU64(x) U64BIG(x)
+
+/* Rotate x right by n bits, n in 0..63. */
+forceinline uint64_t ROR(uint64_t x, int n) { return x >> n | x << (-n & 63); }
+
+/* Low half of lo2hi moved to the high half, OR high half of hi2lo moved low. */
+forceinline uint64_t KEYROT(uint64_t lo2hi, uint64_t hi2lo) {
+  return lo2hi << 32 | hi2lo >> 32;
+}
+
+/* Branch-free zero test: 0 when a and b are both zero, -1 otherwise.
+ * Relies on >> of a negative int being an arithmetic shift. */
+forceinline int NOTZERO(uint64_t a, uint64_t b) {
+  uint64_t result = a | b;
+  /* OR all eight bytes down into the lowest byte */
+  result |= result >> 32;
+  result |= result >> 16;
+  result |= result >> 8;
+  return ((((int)(result & 0xff) - 1) >> 8) & 1) - 1;
+}
+
+/* Word with 0x80 in byte i counted from the most significant end, i in 0..7. */
+forceinline uint64_t PAD(int i) { return 0x80ull << (56 - 8 * i); }
+
+/* len shifted left by 51 bits. */
+forceinline uint64_t PRFS_MLEN(uint64_t len) { return len << 51; }
+
+/* Zero the n most significant bytes of w. */
+forceinline uint64_t CLEAR(uint64_t w, int n) {
+  /* undefined for n == 8 (shift by 64); n == 0 returns w unchanged */
+  uint64_t mask = ~0ull >> (8 * n);
+  return w & mask;
+}
+
+/* All-ones in the n least significant bytes, n in 1..8. */
+forceinline uint64_t MASK(int n) {
+  /* undefined for n == 0 (shift by 64) */
+  return ~0ull >> (64 - 8 * n);
+}
+
+/* Read 8 bytes from bytes (always 8, whatever n is), keep the MASK(n) part
+ * and return it as a state word. */
+forceinline uint64_t LOAD(const uint8_t* bytes, int n) {
+  uint64_t x;
+  /* memcpy, not a uint64_t* cast: bytes need not be 8-byte aligned */
+  memcpy(&x, bytes, sizeof(x));
+  x &= MASK(n);
+  return U64TOWORD(x);
+}
+
+/* Read-modify-write of the 8 bytes at bytes: clear the MASK(n) part, then OR
+ * in w converted back to buffer byte order. */
+forceinline void STORE(uint8_t* bytes, uint64_t w, int n) {
+  uint64_t x;
+  memcpy(&x, bytes, sizeof(x));
+  x &= ~MASK(n);
+  x |= WORDTOU64(w);
+  memcpy(bytes, &x, sizeof(x));
+}
+
+/* Read exactly n (0..8) bytes; the bytes not read count as zero. */
+forceinline uint64_t LOADBYTES(const uint8_t* bytes, int n) {
+  uint64_t x = 0;
+  memcpy(&x, bytes, n);
+  return U64TOWORD(x);
+}
+
+/* Write exactly n (0..8) bytes of w, starting from its most significant byte. */
+forceinline void STOREBYTES(uint8_t* bytes, uint64_t w, int n) {
+  uint64_t x = WORDTOU64(w);
+  memcpy(bytes, &x, n);
+}
+
+#endif /* WORD_H_ */
diff --git a/src/main.c b/src/main.c
index b660510..bd1fbc8 100644
--- a/src/main.c
+++ b/src/main.c
@@ -11,11 +11,13 @@
 
 
 #include "rorand.h"
+#include "rourand.h"
 #include "util.h"
 
 
 #define DO_TIME_BENCH 0
 #define DATA_TOTAL 1024*1024
+#define USE_URANDOM 1
 
 #if DO_TIME_BENCH
 static uint8_t time_bench[128*1024];
@@ -50,13 +52,23 @@ int main() {
 		iprintf("rorand_init() returned %d\n", d);
 		panic("can't init rorand");
 	}
+	struct rourand_state* ur = rourand_init(rorand_get, 0);
+	if (!ur) {
+		panic("Can't init rourand");
+	}
+
+#if USE_URANDOM
+#define rand_get(dst, size) rourand_get(ur, dst, size)
+#else
+#define rand_get(dst, size) rorand_get(dst, (size)*CHAR_BIT)
+#endif
 
 #if DO_TIME_BENCH
 	memset(time_bench, 0, sizeof(time_bench));
 	iprintf("[---] throughput benchmark start\n");
 
 	absolute_time_t ta = get_absolute_time();
-	rorand_get(time_bench, count_of(time_bench)*CHAR_BIT);
+	rand_get(time_bench, count_of(time_bench));
 	absolute_time_t tb = get_absolute_time();
 
 	int64_t dt_us = absolute_time_diff_us(ta, tb);
@@ -72,10 +84,11 @@ int main() {
 	memset(data, 0, sizeof(data));
 	const uintptr_t total = DATA_TOTAL;
 	for (uintptr_t off = 0; off < total; off += count_of(data)) {
-		rorand_get(data, count_of(data)*CHAR_BIT);
+		rand_get(data, count_of(data));
 		hexdump(NULL, off, data, sizeof(data));
 	}
 #endif
+	rourand_free(ur);
 
 	iprintf("done\n");
 	while(1);
diff --git a/src/rourand.c b/src/rourand.c
new file mode 100644
index 0000000..e72099e
--- /dev/null
+++ b/src/rourand.c
@@ -0,0 +1,71 @@
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+
+#include "ascon.h"
+#include "rourand.h"
+
+
+struct rourand_state {
+	ascon_state_t ascon; /* sponge state that absorbs entropy and is squeezed for output */
+	uint8_t buf[16]; /* block buffer: raw entropy while (re)seeding, generator output otherwise */
+	rourand_get_fn rand; /* raw entropy source; its size argument is a bit count */
+	size_t bpos; /* bytes of buf not handed out yet (they sit at the end of buf) */
+	size_t rate; /* a reseed happens on every rate-th buffer refill */
+	size_t rcount; /* buffer refills since the last reseed (or since init) */
+};
+
+
+/* Create a generator seeded from rawrand that reseeds on every rate-th block
+ * (0 picks 16). Returns NULL for a NULL rawrand, a negative rate, or OOM. */
+struct rourand_state* rourand_init(rourand_get_fn rawrand, int rate) {
+	if (!rawrand || rate < 0) return NULL;
+	struct rourand_state* r = calloc(1, sizeof *r);
+	if (!r) return NULL;
+
+	ascon_inithash(&r->ascon);
+	r->rand = rawrand;
+	r->rate = rate ? (size_t)rate : 16;
+	/* bpos and rcount start at 0 courtesy of calloc */
+
+	/* seed with two buffers of raw entropy; rawrand takes a bit count */
+	for (int i = 0; i < 2; ++i) {
+		rawrand(r->buf, sizeof(r->buf)*CHAR_BIT);
+		ascon_absorb(&r->ascon, r->buf, sizeof(r->buf));
+	}
+	return r;
+}
+/* Wipe the generator state and release it; a NULL st is ignored. */
+void rourand_free(struct rourand_state* st) {
+	if (!st) return;
+	explicit_bzero(st, sizeof(*st));
+	free(st);
+}
+
+/* Copy nbytes of generator output to dst_, refilling the block buffer (and
+ * reseeding on every rate-th refill) whenever it runs dry. */
+void rourand_get(struct rourand_state* st, void* dst_, size_t nbytes) {
+	const size_t blk = sizeof(st->buf);
+	for (uint8_t* out = dst_; nbytes != 0;) {
+		if (!st->bpos) {
+			/* buffer used up: mix in raw entropy when due, then refill it */
+			if (++st->rcount == st->rate) {
+				st->rand(st->buf, blk*CHAR_BIT);
+				ascon_absorb(&st->ascon, st->buf, blk);
+				st->rcount = 0;
+			}
+			ascon_squeeze(&st->ascon, st->buf, blk);
+			st->bpos = blk;
+		}
+
+		/* bpos counts the unread bytes, which sit at the tail of buf */
+		size_t chunk = (nbytes < st->bpos) ? nbytes : st->bpos;
+		memcpy(out, st->buf + (blk - st->bpos), chunk);
+		out += chunk;
+		nbytes -= chunk;
+		st->bpos -= chunk;
+	}
+}
+
diff --git a/src/rourand.h b/src/rourand.h
new file mode 100644
index 0000000..c9ac6bd
--- /dev/null
+++ b/src/rourand.h
@@ -0,0 +1,24 @@
+
+#ifndef ROURAND_H_
+#define ROURAND_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+struct rourand_state;
+/* Raw entropy callback: the second argument is a bit count (rourand.c passes bytes*CHAR_BIT). */
+typedef void (*rourand_get_fn)(void* dst, size_t nbits);
+/* rate 0 selects the default of 16; returns NULL when the allocation fails. */
+struct rourand_state* rourand_init(rourand_get_fn rawrand, int rate);
+void rourand_free(struct rourand_state* st);
+/* Writes nbytes bytes of generator output to dst. */
+void rourand_get(struct rourand_state* st, void* dst, size_t nbytes);
+
+static inline uint32_t rourand_get32(struct rourand_state* st) { /* next 4 output bytes */
+	uint32_t word = 0;
+	rourand_get(st, &word, sizeof word);
+	return word;
+}
+
+#endif
+