From ad55537bf35e332503b950d6951c41bcb8a7954f Mon Sep 17 00:00:00 2001
From: graham sanderson <graham.sanderson@raspberrypi.com>
Date: Mon, 13 Sep 2021 13:29:01 -0500
Subject: [PATCH] B1 changes

---
 CMakeLists.txt                  |   25 +-
 LICENSE.TXT                     |    3 +-
 README.md                       |    5 +-
 bootrom/async_task.c            |    4 +-
 bootrom/async_task.h            |    4 +-
 bootrom/bit_functions.S         |  330 +++++
 bootrom/bootrom_main.c          |   14 +-
 bootrom/bootrom_misc.S          |    2 -
 bootrom/bootrom_rt0.S           |  266 ++--
 bootrom/info_uf2.txt            |    2 +-
 bootrom/mufplib-double.S        | 2326 +++++++++++++++++++++++++++++++
 bootrom/mufplib.S               |  401 +++---
 bootrom/program_flash_generic.c |    7 +-
 bootrom/usb_boot_device.c       |  168 +--
 generator/main.c                |    6 +
 pico_sdk                        |    2 +-
 test/CMakeLists.txt             |   35 +
 test/bit_functions_test.c       |  199 +++
 test/mem_functions_test.c       |  289 ++++
 test/tc_rom_double.c            |  527 +++++++
 test/tc_rom_float.c             |  506 +++++++
 test/tictoc.h                   |   15 +
 usb_device_tiny/runtime.c       |   16 +-
 usb_device_tiny/runtime.h       |    9 +-
 usb_device_tiny/scsi_ir.h       |    2 +-
 25 files changed, 4689 insertions(+), 474 deletions(-)
 create mode 100644 bootrom/bit_functions.S
 create mode 100644 bootrom/mufplib-double.S
 create mode 100644 test/CMakeLists.txt
 create mode 100644 test/bit_functions_test.c
 create mode 100644 test/mem_functions_test.c
 create mode 100644 test/tc_rom_double.c
 create mode 100644 test/tc_rom_float.c
 create mode 100644 test/tictoc.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4d63dcb..8ad451f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,12 @@ include(pico_sdk/pico_sdk_init.cmake)
 
 project(pico_bootrom)
 
-set(PICO_BARE_METAL 1)
+if (NOT INCLUDE_TESTS)
+    set(PICO_BARE_METAL 1)
+else()
+    # SDK versions conflict with code to be tested
+    set(SKIP_PICO_BIT_OPS 1)
+endif()
 
 # Add pico targets to the build
 pico_sdk_init()
@@ -32,6 +37,7 @@ set(GENERATED_H ${CMAKE_CURRENT_BINARY_DIR}/generated.h)
 # order matches original makefile
 add_executable(bootrom
         bootrom/bootrom_rt0.S
+        bootrom/bit_functions.S
         bootrom/bootrom_main.c
         bootrom/bootrom_misc.S
         bootrom/program_flash_generic.c
@@ -39,6 +45,7 @@ add_executable(bootrom
         bootrom/virtual_disk.c
         bootrom/async_task.c
         bootrom/mufplib.S
+        bootrom/mufplib-double.S
         usb_device_tiny/runtime.c
         usb_device_tiny/usb_device.c
         usb_device_tiny/usb_msc.c
@@ -104,13 +111,23 @@ target_compile_definitions(bootrom PRIVATE
         BOOTROM_ONLY_SIZE_HACKS
         USE_16BIT_ROM_FUNCS
         USE_BOOTROM_GPIO
+
+        # presumably selects the hardware divider (original comment was truncated to "# for") - confirm
         USE_HW_DIV
-        )
+
+        USE_POPCOUNT32
+        USE_CLZ32
+        USE_CTZ32
+        USE_REVERSE32
+)
 
 target_link_options(bootrom PRIVATE "LINKER:--script=${CMAKE_CURRENT_LIST_DIR}/bootrom/bootrom.ld")
 set_target_properties(bootrom PROPERTIES LINK_DEPENDS ${CMAKE_CURRENT_LIST_DIR}/bootrom/bootrom.ld)
 target_link_libraries(bootrom PRIVATE
         hardware_resets
+        hardware_regs
+        hardware_structs
+        pico_platform
         hardware_sync
         boot_uf2_headers
         boot_picoboot_headers
@@ -128,4 +145,8 @@ pico_add_map_output(bootrom)
 pico_add_hex_output(bootrom)
 pico_add_h32_output(bootrom)
 
+if (INCLUDE_TESTS)
+    add_subdirectory(test)
+endif()
+
 
diff --git a/LICENSE.TXT b/LICENSE.TXT
index 6f0b605..d4cccb7 100644
--- a/LICENSE.TXT
+++ b/LICENSE.TXT
@@ -1,4 +1,5 @@
-NOTE: this license applies to the contents of this repository, EXCLUDING the contents of the files bootrom/mufplib.S which is licensed separately:
+NOTE: this license applies to the contents of this repository, EXCLUDING the contents of the files bootrom/mufplib.S
+and bootrom/mufplib-double.S, which are licensed separately:
 
 Copyright 2020 (c) 2020 Raspberry Pi (Trading) Ltd.
 
diff --git a/README.md b/README.md
index e74306f..54f2b0a 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,6 @@
-This is the B0 version of the RP2040 bootrom.
+This is the B1 version of the RP2040 bootrom.
 
-The version on the chip was built in _Release_ mode using an _arm-cortex_m0-eabi_ version of
-[crosstool-NG](https://crosstool-ng.github.io/crostool-NG) with GCC version 7.2.0.
+The version on the chip was built in _Debug_ mode using GCC 9.3.1 (GNU Arm Embedded Toolchain 9-2020-q2-update).
 
 Note the GIT revision info (included in the bootrom) on chip does not match the GIT revision of this
 branch. Additionally, certain `DMB` instructions are encoded as `DMB SY` vs `DMB ISH` when building
diff --git a/bootrom/async_task.c b/bootrom/async_task.c
index b00ed1a..662261a 100644
--- a/bootrom/async_task.c
+++ b/bootrom/async_task.c
@@ -296,8 +296,8 @@ void __attribute__((noreturn)) async_task_worker() {
             execute_task(&virtual_disk_queue, &_worker_task);
         }
 #ifdef USE_PICOBOOT
-        else if (dequeue_task(&rpiboot_queue, &_worker_task)) {
-            execute_task(&rpiboot_queue, &_worker_task);
+        else if (dequeue_task(&picoboot_queue, &_worker_task)) {
+            execute_task(&picoboot_queue, &_worker_task);
         }
 #endif
         else {
diff --git a/bootrom/async_task.h b/bootrom/async_task.h
index 9248f16..06fb958 100644
--- a/bootrom/async_task.h
+++ b/bootrom/async_task.h
@@ -47,7 +47,7 @@ struct async_task {
     uint32_t erase_size;
     uint8_t *data;
     uint32_t data_length;
-    uint32_t rpiboot_user_token;
+    uint32_t picoboot_user_token;
     uint8_t type;
     uint8_t exclusive_param;
     // an identifier for the logical source of the task
@@ -79,7 +79,7 @@ void __attribute__((noreturn)) async_task_worker();
 void reset_task(struct async_task *task);
 
 extern struct async_task_queue virtual_disk_queue;
-extern struct async_task_queue rpiboot_queue;
+extern struct async_task_queue picoboot_queue;
 
 static inline void async_disable_queue(struct async_task_queue *queue, bool disable) {
     queue->disable = disable;
diff --git a/bootrom/bit_functions.S b/bootrom/bit_functions.S
new file mode 100644
index 0000000..52c1283
--- /dev/null
+++ b/bootrom/bit_functions.S
@@ -0,0 +1,330 @@
+/**
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+// these are available and tested for all input values, though can certainly be omitted for space...
+
+// these numbers are for uniform input (average over uniform distribution of 32 bit input) cycle counts
+// NAME       | TABLE | NON TABLE | BUILTIN | DESCRIPTION
+// -----------|-------|-----------|---------|-------------
+// CLZ32      |    13 |       9.6 |      20 | count of leading zeroes
+// CTZ32      |    12 |        11 |      20 | count of trailing zeroes
+// REVERSE32  |    21 |        22 |       - | bit reverse
+// POPCOUNT32 |    18 |        20 |      22 | count of 1 bits
+//
+
+#define NEW_CLZ32 1
+#define NEW_CTZ32 1
+#define NEW_REVERSE32 1
+#define NEW_POPCOUNT32 1
+
+.cpu cortex-m0
+.thumb
+.section .text.bit_functions, "ax"
+
+#ifdef USE_POPCOUNT32
+.align 2
+.global popcount32
+.type popcount32,%function
+.thumb_func
+popcount32: // in: r0 = value; out: r0 = number of 1 bits; the NEW path below uses r1-r3 as scratch
+#if NEW_POPCOUNT32
+    ldr r1,=#0x49249249 // mask with every third bit set (bits 0, 3, ..., 30)
+    lsr r2,r0,#1
+    and r2,r1 // r2 = middle bit of each 3-bit group, moved down to the group base
+    lsr r3,r0,#2
+    and r3,r1 // r3 = top bit of each 3-bit group, moved down to the group base
+    and r0,r1 // r0 = bottom bit of each 3-bit group
+    add r0,r2
+    add r0,r3      @ xx0xx0xx0xx0xx0xx0xx0xx0xx0xx0xx
+    lsr r1,r0,#3 // line each group up with its lower neighbour
+    add r0,r1      @ xx***xxx***xxx***xxx***xxx***xxx
+    ldr r1,=#0xC71C71C7 // keep every other 3-bit field (the fields marked xxx above)
+    and r0,r1      @ xx000xxx000xxx000xxx000xxx000xxx
+    lsr r1,r0,#6 // line each kept field up with the kept field 6 bits below it
+    add r0,r1      @ **00xxxx00****00xxxx00****00xxxx
+    ldr r1,=#0x04004004 // multiplier = 2^2 + 2^14 + 2^26
+    mul r0,r1 // bits 26..31 of the product = sum of the fields at bits 24, 12 and 0
+    lsr r0,#26 // r0 = total count
+    bx lr
+#else
+    adr r1, popcount8_table // table variant: add up popcount8_table[byte] for each of the four bytes
+    uxtb r2, r0
+    ldrb r3, [r1, r2] // r3 = count for byte 0
+    lsr r0, r0, #8
+    uxtb r2, r0
+    ldrb r2, [r1, r2]
+    add r3, r2 // + count for byte 1
+    lsr r0, r0, #8
+    uxtb r2, r0
+    ldrb r2, [r1, r2]
+    add r3, r2 // + count for byte 2
+    lsr r0, r0, #8
+    ldrb r0, [r1, r0] // only byte 3 is left in r0, so no uxtb is needed
+    add r0, r3
+    bx lr
+#endif
+#endif
+
+#ifdef USE_REVERSE32
+.align 2
+.global reverse32
+.type reverse32,%function
+.thumb_func
+reverse32: // in: r0 = value; out: r0 = value with its 32 bits in reverse order; the NEW path uses r1-r3 as scratch
+#if NEW_REVERSE32
+  	ldr	r2, =#0xcccccccc // mask for the upper 2 bits of every nibble
+  	lsr r1, r2, #1 // r1 = 0x66666666
+  	eor r1, r2 // r1 = 0xaaaaaaaa (odd bits), derived without a second literal load
+  	lsl	r3, r0, #1
+  	and	r3, r1 // r3 = even bits moved up by one
+  	and r0, r1
+  	lsr r0, #1 // r0 = odd bits moved down by one
+  	orr	r0, r3 // adjacent bits swapped
+  	lsl	r3, r0, #2
+  	and	r3, r2 // r3 = lower bit pairs moved up by two
+  	and	r0, r2
+  	lsr	r0, r0, #2 // r0 = upper bit pairs moved down by two
+  	orr	r3, r0 // adjacent bit pairs swapped (result now in r3)
+  	ldr	r2, =#0xf0f0f0f0 // mask for the high nibble of every byte
+  	lsl	r0, r3, #4
+  	and	r0, r2 // r0 = low nibbles moved up by four
+  	and	r3, r2
+  	lsr	r3, r3, #4 // r3 = high nibbles moved down by four
+  	orr	r0, r3 // nibbles swapped: each byte is now bit-reversed
+  	rev	r0, r0 // reverse the byte order to finish the 32-bit reversal
+  	bx	lr
+#else
+    adr r3, reverse8_table // table variant: look each byte up in reverse8_table and reassemble in opposite order
+    uxtb r1, r0
+    lsr r0, #8
+    ldrb r2, [r3, r1] // r2 = reversed byte 0
+    uxtb r1, r0
+    rev16 r2, r2 // move it up one byte to make room below
+    ldrb r1, [r3, r1]
+    orr r2, r1 // merge reversed byte 1
+    lsr r0, #8
+    uxtb r1, r0
+    ldrb r1, [r3, r1]
+    rev r2, r2 // byte-reverse the partial result: the two low bytes move to the top
+    orr r2, r1 // merge reversed byte 2
+    lsr r1, r0, #8
+    ldrb r0, [r3, r1] // r0 = reversed byte 3
+    rev16 r2, r2 // swap bytes within each halfword to free the low byte
+    orr r0, r2
+    bx lr
+#endif
+#endif
+
+#ifdef USE_CLZ32
+.global clz32
+.type clz32,%function
+.thumb_func
+clz32: // in: r0 = value; out: r0 = count of leading zero bits (32 when r0 is 0); uses r1, r3 (and r2 in the table variant)
+#if NEW_CLZ32
+    adr r3, clz6_table // clz6_table[i] = leading zeros of i seen as a 6-bit value
+    lsr r1, r0, #16 // r1 = upper halfword; flags from the shift drive the branch
+    bne clz32_0_15_n // upper halfword non-zero: result is in 0..15
+//clz32_16_31_n:
+    lsr r1, r0, #10 // bits 16..31 are zero here, so r1 = bits 10..15
+    bne clz32_16_21_l
+//clz32_22_31_n:
+    lsr r1, r0, #4 // bits 10..31 are zero here, so r1 = bits 4..9
+    bne clz32_22_27_l
+//clz32_28_31_l:
+    ldrb r0, [r3, r0] // only bits 0..3 can be set; r0 = 0 gives table value 6, so the result is 32
+    add r0, #28 - 2 // - 2 since we're using a 4 bit not 6 bit index
+    bx lr
+clz32_16_21_l: // highest set bit is in bits 10..15
+    ldrb r0, [r3, r1]
+    add r0, #16
+    bx lr
+clz32_22_27_l: // highest set bit is in bits 4..9
+    ldrb r0, [r3, r1]
+    add r0, #22
+    bx lr
+clz32_0_15_n: // r1 = upper halfword (bits 16..31 of the input), known non-zero
+    lsr r0, r1, #10 // r0 = bits 26..31 of the input
+    bne clz32_0_5_l
+//clz32_6_15_n:
+    lsr r0, r1, #4 // bits 26..31 are zero here, so r0 = bits 20..25 of the input
+    bne clz32_6_11_l
+//clz32_12_15_l:
+    ldrb r0, [r3, r1] // r1 = bits 16..19 of the input (non-zero)
+    add r0, #12 - 2 // - 2 since we're using a 4 bit not 6 bit index
+    bx lr
+clz32_0_5_l: // highest set bit is in bits 26..31: the table value is the answer
+    ldrb r0, [r3, r0]
+    bx lr
+clz32_6_11_l: // highest set bit is in bits 20..25
+    ldrb r0, [r3, r0]
+    add r0, #6
+    bx lr
+
+#else
+
+    adr r3, clz8_table // 8-bit table variant: narrow down to the highest non-zero byte
+    mov r2, #24 // r2 = leading zeros contributed by the bytes above the one looked up
+    lsr r1, r0, #16
+    beq 1f
+    sub r2, #16 // upper halfword non-zero: continue with it
+    mov r0, r1
+1:
+    lsr r1, r0, #8
+    beq 1f
+    sub r2, #8 // upper byte of the remaining halfword non-zero: continue with it
+    mov r0, r1
+1:
+    ldrb r0, [r3, r0] // r0 is now a single byte value
+    add r0, r2
+    bx lr
+#endif
+#endif
+
+#ifdef USE_CTZ32
+.global ctz32
+.type ctz32,%function
+.thumb_func
+ctz32: // in: r0 = value; out: r0 = count of trailing zero bits (32 when r0 is 0); uses r1, r3 (and r2 in the table variant)
+#if NEW_CTZ32
+    adr r3, ctz6_table // ctz6_table[i] = trailing zeros of 6-bit i, with entry 0 = 6
+    lsl r1, r0, #16 // r1 = bits 0..15 moved to the top; flags from the shift drive the branch
+    beq ctz32_16_31_n // low halfword is zero: result is 16 or more
+ctz32_0_15_n:
+    lsl r0, r1, #6 // r0 = input << 22: only bits 0..9 of the input remain
+    beq ctz32_10_15_l
+//ctz32_0_9_n:
+    lsl r1, r0, #6 // r1 = input << 28: only bits 0..3 of the input remain
+    beq ctz32_4_9_l
+//ctz32_0_3_l:
+    lsr r1, #28 // r1 = bits 0..3 of the input (non-zero)
+    add r1, #16 // table entries 16..31 hold the trailing zero count of the low 4 index bits
+    ldrb r0, [r3, r1]
+    bx lr
+ctz32_10_15_l: // bits 0..9 are zero, lowest set bit is in bits 10..15
+    lsr r1, #26 // r1 = bits 10..15 of the input
+    ldrb r0, [r3, r1]
+    add r0, #10
+    bx lr
+ctz32_4_9_l: // bits 0..3 are zero, lowest set bit is in bits 4..9
+    lsr r0, #26 // r0 = bits 4..9 of the input
+    ldrb r0, [r3, r0]
+    add r0, #4
+    bx lr
+ctz32_16_31_n: // bits 0..15 of the input are zero
+    lsl r1, r0, #6 // r1 = input << 6: only bits 0..25 of the input remain
+    beq ctz32_26_31_l
+//ctz32_16_25_n:
+    lsl r0, r1, #6 // r0 = input << 12: only bits 0..19 of the input remain
+    beq ctz32_20_25_l
+//ctz32_16_19_l:
+    lsr r0, #28 // r0 = bits 16..19 of the input (non-zero)
+    add r0, #16 // table entries 16..31 hold the trailing zero count of the low 4 index bits
+    ldrb r0, [r3, r0]
+    add r0, #16
+    bx lr
+ctz32_26_31_l: // bits 0..25 are zero; r0 is still the original input
+    lsr r0, #26 // r0 = bits 26..31; a zero input indexes entry 0 (= 6), giving 32
+    ldrb r0, [r3, r0]
+    add r0, #26
+    bx lr
+ctz32_20_25_l: // bits 0..19 are zero, lowest set bit is in bits 20..25
+    lsr r1, #26 // r1 = bits 20..25 of the input
+    ldrb r0, [r3, r1]
+    add r0, #20
+    bx lr
+#else
+    adr r3, ctz8_table // 8-bit table variant: narrow down to the lowest non-zero byte
+    mov r2, #0 // r2 = trailing zeros contributed by the bytes below the one looked up
+    lsl r1, r0, #16
+    bne 1f
+    add r2, #16 // low halfword is zero: continue with the upper halfword
+    lsr r0, r0, #16
+1:
+    lsl r1, r0, #24
+    bne 1f
+    add r2, #8 // low byte is zero: bring the next byte down
+    rev16 r0, r0
+1:
+    uxtb r0, r0
+    ldrb r0, [r3, r0]
+    add r0, r2
+    bx lr
+#endif
+#endif
+
+rt0_literals:
+.ltorg
+
+.align 2
+#ifdef USE_POPCOUNT32
+#if !NEW_POPCOUNT32
+.global popcount8_table
+popcount8_table:
+  .byte 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05
+  .byte 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06
+  .byte 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06
+  .byte 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07
+  .byte 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06
+  .byte 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07
+  .byte 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07
+  .byte 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08
+#endif
+#endif
+
+#ifdef USE_REVERSE32
+#if !NEW_REVERSE32
+.global reverse8_table
+reverse8_table:
+  .byte 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8
+  .byte 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc
+  .byte 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa
+  .byte 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe
+  .byte 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9
+  .byte 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd
+  .byte 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb
+  .byte 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
+#endif
+#endif
+
+#ifdef USE_CLZ32
+#if NEW_CLZ32
+.global clz6_table
+clz6_table:
+  .byte 0x08-2, 0x07-2, 0x06-2, 0x06-2, 0x05-2, 0x05-2, 0x05-2, 0x05-2, 0x04-2, 0x04-2, 0x04-2, 0x04-2, 0x04-2, 0x04-2, 0x04-2, 0x04-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2, 0x03-2
+  .byte 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2, 0x02-2
+#else
+.global clz8_table
+clz8_table:
+  .byte 0x08, 0x07, 0x06, 0x06, 0x05, 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
+  .byte 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
+  .byte 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
+  .byte 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
+  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
+#endif
+#endif
+
+#ifdef USE_CTZ32
+#if NEW_CTZ32
+.global ctz6_table
+ctz6_table: // entry i = trailing zero count of 6-bit index i (entry 0 = 6); .global is inside the guard so it is only declared when defined
+  .byte 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
+  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
+#else
+.global ctz8_table
+ctz8_table: // entry i = trailing zero count of 8-bit index i (entry 0 = 8)
+  .byte 0x08, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
+  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
+  .byte 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
+  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
+  .byte 0x07, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
+  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
+  .byte 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
+  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
+#endif
+#endif
diff --git a/bootrom/bootrom_main.c b/bootrom/bootrom_main.c
index 956e00b..fad61b7 100644
--- a/bootrom/bootrom_main.c
+++ b/bootrom/bootrom_main.c
@@ -179,7 +179,9 @@ static void _usb_clock_setup() {
     while (!(clocks_hw->clk[clk_sys].selected & 0x2u));
 }
 
-static __noinline __attribute__((noreturn)) void _usb_boot_actual(uint32_t _usb_activity_gpio_pin_mask,
+void __noinline __attribute__((noreturn)) async_task_worker_thunk();
+
+static __noinline __attribute__((noreturn)) void _usb_boot(uint32_t _usb_activity_gpio_pin_mask,
                                                                   uint32_t disable_interface_mask) {
     reset_block(RESETS_RESET_USBCTRL_BITS);
     if (!running_on_fpga())
@@ -206,14 +208,8 @@ static __noinline __attribute__((noreturn)) void _usb_boot_actual(uint32_t _usb_
     usb_boot_device_init(disable_interface_mask);
 
     // worker to run tasks on this thread (never returns); Note: USB code is IRQ driven
-    async_task_worker();
-}
-
-static void __attribute__((noreturn)) _usb_boot(uint32_t _usb_activity_gpio_pin_mask,
-                                                uint32_t _disable_interface_mask) {
-    static uint32_t usb_boot_stack[300]; // this will go in bss
-    __asm volatile ("msr MSP,%0"::"r" (&((uint8_t *) usb_boot_stack)[sizeof(usb_boot_stack)]) : );
-    _usb_boot_actual(_usb_activity_gpio_pin_mask, _disable_interface_mask);
+    // this thunk switches stack into USB DPRAM then calls async_task_worker
+    async_task_worker_thunk();
 }
 
 static void __attribute__((noreturn)) _usb_boot_reboot_wrapper() {
diff --git a/bootrom/bootrom_misc.S b/bootrom/bootrom_misc.S
index 560374d..1c92c29 100644
--- a/bootrom/bootrom_misc.S
+++ b/bootrom/bootrom_misc.S
@@ -7,8 +7,6 @@
 .cpu cortex-m0
 .thumb
 
-// Stolen from Graham
-
 .section .text.crc, "ax"
 
 // Each byte digested MSB-first
diff --git a/bootrom/bootrom_rt0.S b/bootrom/bootrom_rt0.S
index db980a9..c290e12 100644
--- a/bootrom/bootrom_rt0.S
+++ b/bootrom/bootrom_rt0.S
@@ -28,23 +28,6 @@
 #include "hardware/regs/m0plus.h"
 #include "git_info.h"
 
-// these are available and tested for all input values, though can certainly be omitted for space...
-
-// these numbers are for uniform input (average over uniform distribution of 32 bit input) cycle counts
-// NAME       | TABLE | BUILTIN/NON-TABLE | DESCRIPTION
-// -----------|-------|-------------------|-------------
-// CLZ32      |    15 |                22 | count of leading zeroes
-// CLT32      |    14 |                22 | count of trailing zeroes
-// REVERSE32  |    13 |                20 | bit reverse
-// POPCOUNT32 |    20 |                23 | count of 1 bits
-//
-// Note that 16 bit and 8 bit versions will skew more heavily in favor of the tables...
-
-#define USE_CLZ32
-#define USE_CTZ32
-#define USE_POPCOUNT32
-#define USE_REVERSE32
-
 .cpu cortex-m0
 .thumb
 .section .vectors
@@ -64,7 +47,7 @@ _magic:
 # compatibility version (change if function table is incompatible, or functions are backwards incompatible)
 .byte 1
 # ROM version
-.byte 1
+.byte 2
 
 .global _well_known
 _well_known:
@@ -217,21 +200,22 @@ data_table:
     .hword software_git_revision
     .byte 'C', 'R'
     .hword copyright
-    .byte 'P', '8'
-    .hword popcount8_table
-    .byte 'R', '8'
-    .hword reverse8_table
-    .byte 'L', '8'
-    .hword clz8_table
-    .byte 'T', '8'
-    .hword ctz8_table
     .byte 'S', 'F'
     .hword soft_float_table
+    .byte 'S', 'D'
+    .hword soft_double_table
+    .byte 'F', 'Z'
+    .hword soft_float_table_size
     // expose library start and end to facilitate users copying into RAM
     .byte 'F, 'S'
     .hword mufp_lib_start
     .byte 'F, 'E'
     .hword mufp_lib_end
+    // expose double-precision library start and end to facilitate users copying into RAM
+    .byte 'D', 'S'
+    .hword mufp_lib_double_start
+    .byte 'D', 'E'
+    .hword mufp_lib_double_end
     .hword 0
 
 // ----------------------------------------------------------------------------
@@ -408,163 +392,21 @@ debug_trampoline_end:
     bkpt #0
     b debug_trampoline
 
-#ifdef USE_POPCOUNT32
-.align 2
-.global popcount32
-.type popcount32,%function
-.thumb_func
-popcount32:
-    adr r1, popcount8_table
-    uxtb r2, r0
-    ldrb r3, [r1, r2]
-    lsr r0, r0, #8
-    uxtb r2, r0
-    ldrb r2, [r1, r2]
-    add r3, r2
-    lsr r0, r0, #8
-    uxtb r2, r0
-    ldrb r2, [r1, r2]
-    add r3, r2
-    lsr r0, r0, #8
-    ldrb r0, [r1, r0]
-    add r0, r3
-    bx lr
-#endif
+    .byte 0x11, 0x38, 0xc0, 0x7a, 0x00, 0xbd, 0x00, 0xb5
+    .byte 0x42, 0x40, 0x00, 0x2a, 0x00, 0xf0, 0x02, 0xf8
+    .byte 0xf6, 0xd2, 0x8e, 0x46, 0x70, 0x46, 0x00, 0x47
+zphd:
 
-#ifdef USE_REVERSE32
+soft_float_table_size:
+.byte (soft_float_table_end - soft_float_table) / 4
 .align 2
-.global reverse32
-.type reverse32,%function
-.thumb_func
-reverse32:
-    adr r3, reverse8_table
-    uxtb r1, r0
-    lsr r0, #8
-    ldrb r2, [r3, r1]
-    uxtb r1, r0
-    rev16 r2, r2
-    ldrb r1, [r3, r1]
-    orr r2, r1
-    lsr r0, #8
-    uxtb r1, r0
-    ldrb r1, [r3, r1]
-    rev r2, r2
-    orr r2, r1
-    lsr r1, r0, #8
-    ldrb r0, [r3, r1]
-    rev16 r2, r2
-    orr r0, r2
-    bx lr
-#endif
-
-#ifdef USE_CLZ32
-.global clz32
-.type clz32,%function
-.thumb_func
-clz32:
-    adr r3, clz8_table
-    mov r2, #24
-    lsr r1, r0, #16
-    beq 1f
-    sub r2, #16
-    mov r0, r1
-1:
-    lsr r1, r0, #8
-    beq 1f
-    sub r2, #8
-    mov r0, r1
-1:
-    ldrb r0, [r3, r0]
-    add r0, r2
-    bx lr
-#endif
-
-#ifdef USE_CTZ32
-.global ctz32
-.type ctz32,%function
-.thumb_func
-.global clz32
-.type clz32,%function
-.thumb_func
-ctz32:
-    adr r3, ctz8_table
-    mov r2, #0
-    lsl r1, r0, #16
-    bne 1f
-    add r2, #16
-    lsr r0, r0, #16
-1:
-    lsl r1, r0, #24
-    bne 1f
-    add r2, #8
-    rev16 r0, r0
-1:
-    uxtb r0, r0
-    ldrb r0, [r3, r0]
-    add r0, r2
-    bx lr
-#endif
-
-rt0_literals:
-.ltorg
-
-//.section .rodata.keep
-
-#ifdef USE_POPCOUNT32
-popcount8_table:
-  .byte 0x00, 0x01, 0x01, 0x02, 0x01, 0x02, 0x02, 0x03, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05
-  .byte 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06
-  .byte 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06
-  .byte 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07
-  .byte 0x01, 0x02, 0x02, 0x03, 0x02, 0x03, 0x03, 0x04, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06
-  .byte 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07
-  .byte 0x02, 0x03, 0x03, 0x04, 0x03, 0x04, 0x04, 0x05, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07
-  .byte 0x03, 0x04, 0x04, 0x05, 0x04, 0x05, 0x05, 0x06, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x04, 0x05, 0x05, 0x06, 0x05, 0x06, 0x06, 0x07, 0x05, 0x06, 0x06, 0x07, 0x06, 0x07, 0x07, 0x08
-#endif
-
-#ifdef USE_REVERSE32
-reverse8_table:
-  .byte 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8
-  .byte 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc
-  .byte 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa
-  .byte 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe
-  .byte 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9
-  .byte 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd
-  .byte 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb
-  .byte 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff
-#endif
-
-#ifdef USE_CLZ32
-clz8_table:
-  .byte 0x08, 0x07, 0x06, 0x06, 0x05, 0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03
-  .byte 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02
-  .byte 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
-  .byte 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01
-  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
-#endif
-
-#ifdef USE_CTZ32
-ctz8_table:
-  .byte 0x08, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
-  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
-  .byte 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
-  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
-  .byte 0x07, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
-  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
-  .byte 0x06, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
-  .byte 0x05, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x04, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00, 0x03, 0x00, 0x01, 0x00, 0x02, 0x00, 0x01, 0x00
-#endif
-
 soft_float_table:
     .word mufp_fadd
     .word mufp_fsub
     .word mufp_fmul
     .word mufp_fdiv
-    .word mufp_fcmp
-    .word mufp_fcmp_flags
+    .word mufp_fcmp_fast
+    .word mufp_fcmp_fast_flags
     .word mufp_fsqrt
     .word mufp_float2int
     .word mufp_float2fix
@@ -577,7 +419,75 @@ soft_float_table:
     .word mufp_fcos
     .word mufp_fsin
     .word mufp_ftan
-    .word mufp_fatan2
+    .word _dead // todo use this for storage?
     .word mufp_fexp
     .word mufp_fln
-soft_float_table_end:
\ No newline at end of file
+
+    .word mufp_fcmp_combined
+    .word mufp_fatan2
+    .word mufp_int642float
+    .word mufp_fix642float
+    .word mufp_uint642float
+    .word mufp_ufix642float
+    .word mufp_float2int64
+    .word mufp_float2fix64
+    .word mufp_float2uint64
+    .word mufp_float2ufix64
+    .word mufp_float2double
+soft_float_table_end:
+
+soft_double_table:
+    .word mufp_dadd
+    .word mufp_dsub
+    .word mufp_dmul
+    .word mufp_ddiv
+    .word mufp_dcmp_fast
+    .word mufp_dcmp_fast_flags
+    .word mufp_dsqrt
+    .word mufp_double2int
+    .word mufp_double2fix
+    .word mufp_double2uint
+    .word mufp_double2ufix
+    .word mufp_int2double
+    .word mufp_fix2double
+    .word mufp_uint2double
+    .word mufp_ufix2double
+    .word mufp_dcos
+    .word mufp_dsin
+    .word mufp_dtan
+    .word _dead
+    .word mufp_dexp
+    .word mufp_dln
+
+    .word mufp_dcmp_combined
+    .word mufp_datan2
+    .word mufp_int642double
+    .word mufp_fix642double
+    .word mufp_uint642double
+    .word mufp_ufix642double
+    .word mufp_double2int64
+    .word mufp_double2fix64
+    .word mufp_double2uint64
+    .word mufp_double2ufix64
+    .word mufp_double2float
+soft_double_table_end:
+.if (soft_double_table_end - soft_double_table) != (soft_float_table_end - soft_float_table) // assembler-time check: labels are invisible to the C preprocessor (they evaluate to 0 inside #if, so a #if test could never fail)
+.error "FLOAT and DOUBLE table size mismatch" // both tables must have the same number of .word entries
+.endif
+
+#define USB_BOOT_STACK_SIZE 300
+
+// switch MSP to the dedicated usb_boot_stack and call async_task_worker (NOTE(review): nothing is cleared in this thunk - presumably USB SRAM, aka .bss and stack, is cleared by the caller; confirm)
+.global async_task_worker_thunk
+.thumb_func
+async_task_worker_thunk:
+    // set stack: MSP = usb_boot_stack_end, the address just past the reserved stack space
+    ldr r0, =usb_boot_stack_end
+    msr MSP, r0
+    bl async_task_worker // NOTE(review): no code follows this call - assumes async_task_worker never returns; confirm
+
+.section .bss
+.align 2
+usb_boot_stack:
+.space USB_BOOT_STACK_SIZE * 4
+usb_boot_stack_end:
diff --git a/bootrom/info_uf2.txt b/bootrom/info_uf2.txt
index 6f9c1d7..bdceef9 100644
--- a/bootrom/info_uf2.txt
+++ b/bootrom/info_uf2.txt
@@ -1,3 +1,3 @@
-UF2 Bootloader v1.0
+UF2 Bootloader v2.0
 Model: Raspberry Pi RP2
 Board-ID: RPI-RP2
diff --git a/bootrom/mufplib-double.S b/bootrom/mufplib-double.S
new file mode 100644
index 0000000..e876d28
--- /dev/null
+++ b/bootrom/mufplib-double.S
@@ -0,0 +1,2326 @@
+/**
+ * Copyright (c) 2020 Mark Owen https://www.quinapalus.com .
+ *
+ * Raspberry Pi (Trading) Ltd (Licensor) hereby grants to you a non-exclusive license to use the software solely on a
+ * Raspberry Pi RP2040 device. No other use is permitted under the terms of this license.
+ *
+ * This software is also available from the copyright owner under GPLv2 licence.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE LICENSOR AND COPYRIGHT OWNER "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE LICENSOR OR COPYRIGHT OWNER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+.syntax unified
+.cpu cortex-m0plus
+.thumb
+
+@ exported symbols
+
+.global mufp_lib_double_start
+.global mufp_dadd
+.global mufp_dsub
+.global mufp_dmul
+.global mufp_ddiv
+.global mufp_dsqrt
+.global mufp_dcos
+.global mufp_dsin
+.global mufp_dtan
+.global mufp_datan2
+.global mufp_dexp
+.global mufp_dln
+.global mufp_dcmp_combined
+.global mufp_dcmp_fast
+.global mufp_dcmp_fast_flags
+
+.global mufp_float2int64
+.global mufp_float2fix64
+.global mufp_float2uint64
+.global mufp_float2ufix64
+
+.global mufp_double2int
+.global mufp_double2fix
+.global mufp_double2uint
+.global mufp_double2ufix
+.global mufp_double2int64
+.global mufp_double2fix64
+.global mufp_double2uint64
+.global mufp_double2ufix64
+
+.global mufp_int2double
+.global mufp_fix2double
+.global mufp_uint2double
+.global mufp_ufix2double
+.global mufp_int642double
+.global mufp_fix642double
+.global mufp_uint642double
+.global mufp_ufix642double
+
+.global mufp_double2float
+.global mufp_float2double
+
+.global mufp_lib_double_end
+
+#ifndef USE_HW_DIV
+.equ use_hw_div,0
+#else
+.equ use_hw_div,1
+#endif
+
+.equ IOPORT       ,0xd0000000
+.equ DIV_UDIVIDEND,0x00000060
+.equ DIV_UDIVISOR ,0x00000064
+.equ DIV_QUOTIENT ,0x00000070
+.equ DIV_CSR      ,0x00000078
+
+mufp_lib_double_start:
+
+@ Notation:
+@ rx:ry means the concatenation of rx and ry with rx having the less significant bits
+
+.equ debug,0
+.macro mdump k
+.if debug
+ push {r0-r3}
+ push {r14}
+ push {r0-r3}
+ bl osp
+ movs r0,#\k
+ bl o1ch
+ pop {r0-r3}
+ bl dump
+ bl osp
+ bl osp
+ ldr r0,[r13]
+ bl o8hex         @ r14
+ bl onl
+ pop {r0}
+ mov r14,r0
+ pop {r0-r3}
+.endif
+.endm
+ 
+
+@ IEEE double in ra:rb ->
+@ mantissa in ra:rb 12Q52 (53 significant bits) with implied 1 set
+@ exponent in re (a zero or 0x7ff exponent field is remapped at the first local label below)
+@ sign in rs
+@ trashes rt
+.macro mdunpack ra,rb,re,rs,rt
+ lsrs \re,\rb,#20 @ extract sign and exponent
+ subs \rs,\re,#1  @ (sign:exponent)-1 ...
+ lsls \rs,#20     @ ... realigned with the top 12 bits of rb
+ subs \rb,\rs     @ clear sign and exponent in mantissa; insert implied 1
+ lsrs \rs,\re,#11 @ sign
+ lsls \re,#21
+ lsrs \re,#21     @ exponent
+ beq l\@_1        @ zero exponent?
+ adds \rt,\re,#1  @ exponent+1 reaches bit 11 only when exponent is 0x7ff
+ lsrs \rt,#11
+ beq l\@_2        @ exponent != 0x7ff? then done
+l\@_1:            @ exponent field is 0 or 0x7ff: the original mantissa bits are discarded
+ movs \ra,#0
+ movs \rb,#1
+ lsls \rb,#20     @ ra:rb = 1<<52, i.e. mantissa exactly 1.0
+ subs \re,#128
+ lsls \re,#12     @ exponent=(field-128)<<12: -0x80000 for field 0, +0x77f000 for field 0x7ff
+l\@_2:
+.endm
+
+@ IEEE double in ra:rb ->
+@ signed (twos complement) mantissa in ra:rb 12Q52 (53 significant bits) with implied 1
+@ exponent in re
+@ trashes rt0 and rt1
+@ +zero, +denormal -> exponent=-0x80000
+@ -zero, -denormal -> exponent=-0x80000
+@ +Inf, +NaN -> exponent=+0x77f000
+@ -Inf, -NaN -> exponent=+0x77e000
+.macro mdunpacks ra,rb,re,rt0,rt1
+ lsrs \re,\rb,#20    @ extract sign and exponent
+ lsrs \rt1,\rb,#31   @ sign only
+ subs \rt0,\re,#1    @ (sign:exponent)-1 ...
+ lsls \rt0,#20       @ ... realigned with the top 12 bits of rb
+ subs \rb,\rt0       @ clear sign and exponent in mantissa; insert implied 1
+ lsls \re,#21        @ sign bit shifted out into carry; exponent left in top 11 bits
+ bcc l\@_1           @ skip on positive
+ mvns \rb,\rb        @ negate mantissa
+ rsbs \ra,#0         @ low word = -low word; carry set only when the low word was zero
+ bcc l\@_1           @ low word nonzero: inverted high word is already correct
+ adds \rb,#1         @ low word zero: propagate the +1 into the high word
+l\@_1:
+ lsrs \re,#21        @ exponent back down to bits 10..0
+ beq l\@_2           @ zero exponent?
+ adds \rt0,\re,#1    @ exponent+1 reaches bit 11 only when exponent is 0x7ff
+ lsrs \rt0,#11
+ beq l\@_3           @ exponent != 0x7ff? then done
+ subs \re,\rt1       @ exponent 0x7ff: subtract the sign so -Inf/-NaN end up one below +Inf/+NaN
+l\@_2:               @ exponent field is 0 or 0x7ff: the original mantissa bits are discarded
+ movs \ra,#0
+ lsls \rt1,#1        @ +ve: 0  -ve: 2
+ adds \rb,\rt1,#1    @ +ve: 1  -ve: 3
+ lsls \rb,#30        @ create +/-1 mantissa
+ asrs \rb,#10        @ high word 0x00100000 (+1.0) or 0xfff00000 (-1.0)
+ subs \re,#128
+ lsls \re,#12        @ exponent=(value-128)<<12, giving the constants listed in the header
+l\@_3:
+.endm
+
+.align 2
+.thumb_func
+mufp_dsub:     @ double subtract r0:r1 - r2:r3, done as an add with the second argument negated
+ push {r4-r7,r14} @ same frame as mufp_dadd, whose code pops it on return
+ movs r4,#1
+ lsls r4,#31   @ r4=0x80000000, the sign bit of a high word
+ eors r3,r4    @ flip sign on second argument
+ b da_entry    @ continue in dadd
+
+.align 2
+.thumb_func
+mufp_dadd:
+ push {r4-r7,r14}
+da_entry:
+ mdunpacks r0,r1,r4,r6,r7
+ mdunpacks r2,r3,r5,r6,r7
+ subs r7,r5,r4    @ ye-xe
+ subs r6,r4,r5    @ xe-ye
+ bmi da_ygtx
+@ here xe>=ye: need to shift y down r6 places
+ mov r12,r4       @ save exponent
+ cmp r6,#32
+ bge da_xrgty     @ xe rather greater than ye?
+ adds r7,#32
+ movs r4,r2
+ lsls r4,r4,r7    @ rounding bit + sticky bits
+da_xgty0:
+ movs r5,r3
+ lsls r5,r5,r7
+ lsrs r2,r6
+ asrs r3,r6
+ orrs r2,r5
+da_add:
+ adds r0,r2
+ adcs r1,r3
+da_pack:
+@ here unnormalised signed result (possibly 0) is in r0:r1 with exponent r12, rounding + sticky bits in r4
+@ Note that if a large normalisation shift is required then the arguments were close in magnitude and so we
+@ cannot have gone via the xrgty/yrgtx paths. There will therefore always be enough high bits in r4
+@ to provide a correct continuation of the exact result.
+@ now pack result back up
+ lsrs r3,r1,#31   @ get sign bit
+ beq 1f           @ skip on positive
+ mvns r1,r1       @ negate mantissa
+ mvns r0,r0
+ movs r2,#0
+ rsbs r4,#0
+ adcs r0,r2
+ adcs r1,r2
+1:
+ mov r2,r12       @ get exponent
+ lsrs r5,r1,#21
+ bne da_0         @ shift down required?
+ lsrs r5,r1,#20
+ bne da_1         @ normalised?
+ cmp r0,#0
+ beq da_5         @ could mantissa be zero?
+da_2:
+ adds r4,r4
+ adcs r0,r0
+ adcs r1,r1
+ subs r2,#1       @ adjust exponent
+ lsrs r5,r1,#20
+ beq da_2
+da_1:
+ lsls r4,#1       @ check rounding bit
+ bcc da_3
+da_4:
+ adds r0,#1       @ round up
+ bcc 2f
+ adds r1,#1
+2:
+ cmp r4,#0        @ sticky bits zero?
+ bne da_3
+ lsrs r0,#1       @ round to even
+ lsls r0,#1
+da_3:
+ subs r2,#1
+ bmi da_6
+ adds r4,r2,#2                 @ check if exponent is overflowing
+ lsrs r4,#11
+ bne da_7
+ lsls r2,#20                   @ pack exponent and sign
+ add r1,r2
+ lsls r3,#31
+ add r1,r3
+ pop {r4-r7,r15}
+
+da_7:
+@ here exponent overflow: return signed infinity
+ lsls r1,r3,#31
+ ldr r3,=#0x7ff00000
+ orrs r1,r3
+ b 1f
+da_6:
+@ here exponent underflow: return signed zero
+ lsls r1,r3,#31
+1:
+ movs r0,#0
+ pop {r4-r7,r15}
+
+da_5:
+@ here mantissa could be zero
+ cmp r1,#0
+ bne da_2
+ cmp r4,#0
+ bne da_2
+@ inputs must have been of identical magnitude and opposite sign, so return +0
+ pop {r4-r7,r15}
+
+da_0:
+@ here a shift down by one place is required for normalisation
+ adds r2,#1        @ adjust exponent
+ lsls r6,r0,#31    @ save rounding bit
+ lsrs r0,#1
+ lsls r5,r1,#31
+ orrs r0,r5
+ lsrs r1,#1
+ cmp r6,#0
+ beq da_3
+ b da_4
+
+da_xrgty:         @ xe>ye and shift>=32 places
+ cmp r6,#60
+ bge da_xmgty     @ xe much greater than ye?
+ subs r6,#32
+ adds r7,#64
+
+ movs r4,r2
+ lsls r4,r4,r7    @ these would be shifted off the bottom of the sticky bits
+ beq 1f
+ movs r4,#1
+1:
+ lsrs r2,r2,r6
+ orrs r4,r2
+ movs r2,r3
+ lsls r3,r3,r7
+ orrs r4,r3
+ asrs r3,r2,#31   @ propagate sign bit
+ b da_xgty0
+
+da_ygtx:
+@ here ye>xe: need to shift x down r7 places
+ mov r12,r5       @ save exponent
+ cmp r7,#32
+ bge da_yrgtx     @ ye rather greater than xe?
+ adds r6,#32
+ movs r4,r0
+ lsls r4,r4,r6    @ rounding bit + sticky bits
+da_ygtx0:
+ movs r5,r1
+ lsls r5,r5,r6
+ lsrs r0,r7
+ asrs r1,r7
+ orrs r0,r5
+ b da_add
+
+da_yrgtx:
+ cmp r7,#60
+ bge da_ymgtx     @ ye much greater than xe?
+ subs r7,#32
+ adds r6,#64
+
+ movs r4,r0
+ lsls r4,r4,r6    @ these would be shifted off the bottom of the sticky bits
+ beq 1f
+ movs r4,#1
+1:
+ lsrs r0,r0,r7
+ orrs r4,r0
+ movs r0,r1
+ lsls r1,r1,r6
+ orrs r4,r1
+ asrs r1,r0,#31   @ propagate sign bit
+ b da_ygtx0
+
+da_ymgtx:         @ result is just y
+ movs r0,r2
+ movs r1,r3
+da_xmgty:         @ result is just x
+ movs r4,#0       @ clear sticky bits
+ b da_pack
+
+.ltorg
+
+@ equivalent of UMULL
+@ needs five temporary registers
+@ can have rt3==rx, in which case rx trashed
+@ can have rt4==ry, in which case ry trashed
+@ can have rzl==rx
+@ can have rzh==ry
+@ can have rzl,rzh==rt3,rt4
+.macro mul32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
+                      @   t0   t1   t2   t3   t4
+                      @                  (x)  (y)
+ uxth \rt0,\rx        @   xl
+ uxth \rt1,\ry        @        yl
+ muls \rt0,\rt1       @  xlyl=L
+ lsrs \rt2,\rx,#16    @             xh
+ muls \rt1,\rt2       @       xhyl=M0
+ lsrs \rt4,\ry,#16    @                       yh
+ muls \rt2,\rt4       @           xhyh=H
+ uxth \rt3,\rx        @                   xl
+ muls \rt3,\rt4       @                  xlyh=M1
+ adds \rt1,\rt3       @      M0+M1=M
+ bcc l\@_1            @ addition of the two cross terms can overflow, so add carry into H
+ movs \rt3,#1         @                   1
+ lsls \rt3,#16        @                0x10000
+ adds \rt2,\rt3       @             H'
+l\@_1:
+                      @   t0   t1   t2   t3   t4
+                      @                 (zl) (zh)
+ lsls \rzl,\rt1,#16   @                  ML
+ lsrs \rzh,\rt1,#16   @                       MH
+ adds \rzl,\rt0       @                  ZL
+ adcs \rzh,\rt2       @                       ZH
+.endm
+
+@ SUMULL: x signed, y unsigned
+@ in table below ¯ means signed variable
+@ needs five temporary registers
+@ can have rt3==rx, in which case rx trashed
+@ can have rt4==ry, in which case ry trashed
+@ can have rzl==rx
+@ can have rzh==ry
+@ can have rzl,rzh==rt3,rt4
+.macro muls32_32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
+                      @   t0   t1   t2   t3   t4
+                      @                 ¯(x)  (y)
+ uxth \rt0,\rx        @   xl
+ uxth \rt1,\ry        @        yl
+ muls \rt0,\rt1       @  xlyl=L
+ asrs \rt2,\rx,#16    @            ¯xh
+ muls \rt1,\rt2       @      ¯xhyl=M0
+ lsrs \rt4,\ry,#16    @                       yh
+ muls \rt2,\rt4       @          ¯xhyh=H
+ uxth \rt3,\rx        @                   xl
+ muls \rt3,\rt4       @                 xlyh=M1
+ asrs \rt4,\rt1,#31   @                      M0sx   (M1 sign extension is zero)
+ adds \rt1,\rt3       @      M0+M1=M 
+ movs \rt3,#0         @                    0
+ adcs \rt4,\rt3       @                      ¯Msx
+ lsls \rt4,#16        @                    ¯Msx<<16
+ adds \rt2,\rt4       @             H'
+
+                      @   t0   t1   t2   t3   t4
+                      @                 (zl) (zh)
+ lsls \rzl,\rt1,#16   @                  M~
+ lsrs \rzh,\rt1,#16   @                       M~
+ adds \rzl,\rt0       @                  ZL
+ adcs \rzh,\rt2       @                      ¯ZH
+.endm
+
+@ SSMULL: x signed, y signed; 64-bit product returned in rzl (low word) and rzh (high word)
+@ in table below ¯ means signed variable
+@ needs five temporary registers
+@ can have rt3==rx, in which case rx trashed
+@ can have rt4==ry, in which case ry trashed
+@ can have rzl==rx
+@ can have rzh==ry
+@ can have rzl,rzh==rt3,rt4
+.macro muls32_s32_64 rx,ry,rzl,rzh,rt0,rt1,rt2,rt3,rt4
+                      @   t0   t1   t2   t3   t4
+                      @                 ¯(x) ¯(y)
+ uxth \rt0,\rx        @   xl
+ uxth \rt1,\ry        @        yl
+ muls \rt0,\rt1       @  xlyl=L
+ asrs \rt2,\rx,#16    @            ¯xh
+ muls \rt1,\rt2       @      ¯xhyl=M0
+ asrs \rt4,\ry,#16    @                      ¯yh
+ muls \rt2,\rt4       @          ¯xhyh=H
+ uxth \rt3,\rx        @                   xl
+ muls \rt3,\rt4       @                ¯xlyh=M1
+ adds \rt1,\rt3       @     ¯M0+M1=M (both cross terms are signed; V records signed overflow of the sum)
+ asrs \rt3,\rt1,#31   @                  Msx
+ bvc l\@_1            @ V is still the one from the adds above (asrs does not change V): no overflow, Msx is right
+ mvns \rt3,\rt3       @                 ¯Msx        flip sign extension bits if overflow
+l\@_1:
+ lsls \rt3,#16        @                    ¯Msx<<16
+ adds \rt2,\rt3       @             H'
+
+                      @   t0   t1   t2   t3   t4
+                      @                 (zl) (zh)
+ lsls \rzl,\rt1,#16   @                  M~
+ lsrs \rzh,\rt1,#16   @                       M~
+ adds \rzl,\rt0       @                  ZL
+ adcs \rzh,\rt2       @                      ¯ZH
+.endm
+
+@ unsigned 32-bit square, rx*rx -> rzl (low word), rzh (high word); can have rt2==rx, in which case rx trashed
+@ can have rzl==rx
+@ can have rzh==rt1
+.macro square32_64 rx,rzl,rzh,rt0,rt1,rt2
+                      @   t0   t1   t2   zl   zh
+ uxth \rt0,\rx        @   xl
+ muls \rt0,\rt0       @ xlxl=L
+ uxth \rt1,\rx        @        xl
+ lsrs \rt2,\rx,#16    @             xh
+ muls \rt1,\rt2       @      xlxh=M (cross term, needed twice)
+ muls \rt2,\rt2       @           xhxh=H
+ lsls \rzl,\rt1,#17   @                  ML   low word of 2*M<<16
+ lsrs \rzh,\rt1,#15   @                       MH   high word of 2*M<<16
+ adds \rzl,\rt0       @                  ZL
+ adcs \rzh,\rt2       @                       ZH
+.endm
+
+.align 2
+.thumb_func
+mufp_dmul:            @ double multiply: r0:r1 * r2:r3 -> r0:r1
+ push {r4-r7,r14}
+ mdunpack r0,r1,r4,r6,r5
+ mov r12,r4
+ mdunpack r2,r3,r4,r7,r5
+ eors r7,r6           @ sign of result
+ add r4,r12           @ exponent of result
+ push {r0-r2,r4,r7}   @ stacked lowest address first: XL,XH,YL,exponent,sign (popped in that order below)
+
+@ accumulate full product in r12:r5:r6:r7
+ mul32_32_64 r0,r2, r0,r5, r4,r6,r7,r0,r5    @ XL*YL
+ mov r12,r0                                  @ save LL bits
+
+ mul32_32_64 r1,r3, r6,r7, r0,r2,r4,r6,r7    @ XH*YH
+
+ pop {r0}                                    @ XL
+ mul32_32_64 r0,r3, r0,r3, r1,r2,r4,r0,r3    @ XL*YH
+ adds r5,r0
+ adcs r6,r3
+ movs r0,#0
+ adcs r7,r0                                  @ propagate carry into the top word
+
+ pop {r1,r2}                                 @ XH,YL
+ mul32_32_64 r1,r2, r1,r2, r0,r3,r4, r1,r2   @ XH*YL
+ adds r5,r1
+ adcs r6,r2
+ movs r0,#0
+ adcs r7,r0                                  @ propagate carry into the top word
+
+@ here r5:r6:r7 holds the product [1..4) in Q(104-32)=Q72, with extra LSBs in r12
+ pop {r3,r4}       @ exponent in r3, sign in r4
+ lsls r1,r7,#11
+ lsrs r2,r6,#21
+ orrs r1,r2
+ lsls r0,r6,#11
+ lsrs r2,r5,#21
+ orrs r0,r2
+ lsls r5,#11       @ now r5:r0:r1 Q83=Q(51+32), extra LSBs in r12
+ lsrs r2,r1,#20
+ bne 1f            @ skip if in range [2..4)
+ adds r5,r5        @ shift up so always [2..4) Q83, i.e. [1..2) Q84=Q(52+32)
+ adcs r0,r0
+ adcs r1,r1
+ subs r3,#1        @ correct exponent
+1:
+ ldr r6,=#0x3ff
+ subs r3,r6        @ correct for exponent bias
+ lsls r6,#1        @ 0x7fe
+ cmp r3,r6
+ bhs dm_0          @ exponent over- or underflow
+ lsls r5,#1        @ rounding bit to carry
+ bcc 1f            @ result is correctly rounded
+ adds r0,#1
+ movs r6,#0
+ adcs r1,r6        @ round up
+ mov r6,r12        @ remaining sticky bits
+ orrs r5,r6
+ bne 1f            @ some sticky bits set?
+ lsrs r0,#1
+ lsls r0,#1        @ round to even
+1:
+ lsls r3,#20
+ adds r1,r3        @ pack exponent (added on top of the implied 1 at bit 20)
+dm_2:
+ lsls r4,#31
+ add r1,r4         @ pack sign
+ pop {r4-r7,r15}
+
+@ here for exponent over- or underflow
+dm_0:
+ bge dm_1          @ overflow?
+ adds r3,#1        @ would-be zero exponent?
+ bne 1f
+ adds r0,#1
+ bne 1f            @ all-ones mantissa?
+ adds r1,#1
+ lsrs r7,r1,#21    @ did the increment carry out of the mantissa high word into bit 21?
+ beq 1f
+ lsrs r1,#1        @ halve: for r1=0x200000 this leaves 0x100000, i.e. exponent field 1 with zero mantissa
+ b dm_2
+1:
+ lsls r1,r4,#31    @ underflow: return zero carrying the result sign
+ movs r0,#0
+ pop {r4-r7,r15}
+
+@ here for exponent overflow
+dm_1:
+ adds r6,#1        @ 0x7ff
+ lsls r1,r6,#20    @ all-ones exponent field
+ movs r0,#0        @ zero mantissa: infinity; the sign is added at dm_2
+ b dm_2
+
+.ltorg
+
+@ Approach to division y/x is as follows.
+@
+@ First generate u1, an approximation to 1/x to about 29 bits. Multiply this by the top
+@ 32 bits of y to generate a0, a first approximation to the result (good to 28 bits or so).
+@ Calculate the exact remainder r0=y-a0*x, which will be about 0. Calculate a correction
+@ d0=r0*u1, and then write a1=a0+d0. If near a rounding boundary, compute the exact
+@ remainder r1=y-a1*x (which can be done using r0 as a basis) to determine whether to
+@ round up or down.
+@
+@ The calculation of 1/x is as given in dreciptest.c. That code verifies exhaustively
+@ that | u1*x-1 | < 10*2^-32.
+@
+@ More precisely:
+@
+@ x0=(q16)x;
+@ x1=(q30)x;
+@ y0=(q31)y;
+@ u0=(q15~)"(0xffffffffU/(unsigned int)roundq(x/x_ulp))/powq(2,16)"(x0); // q15 approximation to 1/x; "~" denotes rounding rather than truncation
+@ v=(q30)(u0*x1-1);
+@ u1=(q30)u0-(q30~)(u0*v);
+@
+@ a0=(q30)(u1*y0);
+@ r0=(q82)y-a0*x;
+@ r0x=(q57)r0;
+@ d0=r0x*u1;
+@ a1=d0+a0;
+@
+@ Error analysis
+@
+@ Use Greek letters to represent the errors introduced by rounding and truncation.
+@
+@               r₀ = y - a₀x
+@                  = y - [ u₁ ( y - α ) - β ] x    where 0 ≤ α < 2^-31, 0 ≤ β < 2^-30
+@                  = y ( 1 - u₁x ) + ( u₁α + β ) x
+@
+@     Hence
+@
+@       | r₀ / x | < 2 * 10*2^-32 + 2^-31 + 2^-30
+@                  = 26*2^-32
+@
+@               r₁ = y - a₁x
+@                  = y - a₀x - d₀x
+@                  = r₀ - d₀x
+@                  = r₀ - u₁ ( r₀ - γ ) x    where 0 ≤ γ < 2^-57
+@                  = r₀ ( 1 - u₁x ) + u₁γx
+@
+@     Hence
+@
+@       | r₁ / x | < 26*2^-32 * 10*2^-32 + 2^-57
+@                  = (260+128)*2^-64
+@                  < 2^-55
+@
+@ Empirically it seems to be nearly twice as good as this.
+@
+@ To determine correctly whether the exact remainder calculation can be skipped we need a result
+@ accurate to < 0.25ulp. In the case where x>y the quotient will be shifted up one place for normalisation
+@ and so 1ulp is 2^-53 and so the calculation above suffices.
+
+.align 2
+.thumb_func
+mufp_ddiv:
+ push {r4-r7,r14}
+ddiv0:                         @ entry point from dtan
+ mdunpack r2,r3,r4,r7,r6  @ unpack divisor
+
+.if use_hw_div
+
+ movs r5,#IOPORT>>24
+ lsls r5,#24
+ movs r6,#0
+ mvns r6,r6
+ str r6,[r5,#DIV_UDIVIDEND]
+ lsrs r6,r3,#4           @ x0=(q16)x
+ str r6,[r5,#DIV_UDIVISOR]
+@ if there are not enough cycles from now to the read of the quotient for
+@ the divider to do its stuff we need a busy-wait here
+
+.endif
+
+@ unpack dividend by hand to save on register use
+ lsrs r6,r1,#31
+ adds r6,r7
+ mov r12,r6              @ result sign in r12b0; r12b1 trashed
+ lsls r1,#1
+ lsrs r7,r1,#21          @ exponent
+ beq 1f                  @ zero exponent?
+ adds r6,r7,#1
+ lsrs r6,#11
+ beq 2f                  @ exponent != 0x7ff? then done
+1:
+ movs r0,#0
+ movs r1,#0
+ subs r7,#64             @ less drastic fiddling of exponents to get 0/0, Inf/Inf correct
+ lsls r7,#12
+2:
+ subs r6,r7,r4
+ lsls r6,#2
+ add r12,r12,r6          @ (signed) exponent in r12[31..8]
+ subs r7,#1              @ implied 1
+ lsls r7,#21
+ subs r1,r7
+ lsrs r1,#1
+
+.if use_hw_div
+
+ ldr r6,[r5,#DIV_QUOTIENT]
+ adds r6,#1
+ lsrs r6,#1
+
+.else
+
+@ this is not beautiful; could be replaced by better code that uses knowledge of divisor range
+ push {r0-r3}
+ movs r0,#0
+ mvns r0,r0
+ lsrs r1,r3,#4           @ x0=(q16)x
+ bl __aeabi_uidiv              @ !!! this could (but apparently does not) trash R12
+ adds r6,r0,#1
+ lsrs r6,#1
+ pop {r0-r3}
+
+.endif
+
+@ here
+@ r0:r1 y mantissa
+@ r2:r3 x mantissa
+@ r6    u0, first approximation to 1/x Q15
+@ r12: result sign, exponent
+
+ lsls r4,r3,#10
+ lsrs r5,r2,#22
+ orrs r5,r4              @ x1=(q30)x
+ muls r5,r6              @ u0*x1 Q45
+ asrs r5,#15             @ v=u0*x1-1 Q30
+ muls r5,r6              @ u0*v Q45
+ asrs r5,#14
+ adds r5,#1
+ asrs r5,#1              @ round u0*v to Q30
+ lsls r6,#15
+ subs r6,r5              @ u1 Q30
+
+@ here
+@ r0:r1 y mantissa
+@ r2:r3 x mantissa
+@ r6    u1, second approximation to 1/x Q30
+@ r12: result sign, exponent
+
+ push {r2,r3}
+ lsls r4,r1,#11
+ lsrs r5,r0,#21
+ orrs r4,r5              @ y0=(q31)y
+ mul32_32_64 r4,r6, r4,r5, r2,r3,r7,r4,r5  @ y0*u1 Q61
+ adds r4,r4
+ adcs r5,r5              @ a0=(q30)(y0*u1)
+
+@ here
+@ r0:r1 y mantissa
+@ r5    a0, first approximation to y/x Q30
+@ r6    u1, second approximation to 1/x Q30
+@ r12   result sign, exponent
+
+ ldr r2,[r13,#0]         @ xL
+ mul32_32_64 r2,r5, r2,r3, r1,r4,r7,r2,r3  @ xL*a0
+ ldr r4,[r13,#4]         @ xH
+ muls r4,r5              @ xH*a0
+ adds r3,r4              @ r2:r3 now x*a0 Q82
+ lsrs r2,#25
+ lsls r1,r3,#7
+ orrs r2,r1              @ r2 now x*a0 Q57; r7:r2 is x*a0 Q89
+ lsls r4,r0,#5           @ y Q57
+ subs r0,r4,r2           @ r0x=y-x*a0 Q57 (signed)
+
+@ here
+@ r0  r0x Q57
+@ r5  a0, first approximation to y/x Q30
+@ r4  yL  Q57
+@ r6  u1 Q30
+@ r12 result sign, exponent
+
+ muls32_32_64 r0,r6, r7,r6, r1,r2,r3, r7,r6   @ r7:r6 r0x*u1 Q87
+ asrs r3,r6,#25
+ adds r5,r3
+ lsls r3,r6,#7           @ r3:r5 a1 Q62 (but bottom 7 bits are zero so 55 bits of precision after binary point)
+@ here we could recover another 7 bits of precision (but not accuracy) from the top of r7
+@ but these bits are thrown away in the rounding and conversion to Q52 below
+
+@ here
+@ r3:r5  a1 Q62 candidate quotient [0.5,2) or so
+@ r4     yL Q57
+@ r12    result sign, exponent
+
+ movs r6,#0
+ adds r3,#128            @ for initial rounding to Q53
+ adcs r5,r5,r6
+ lsrs  r1,r5,#30
+ bne dd_0
+@ here candidate quotient a1 is in range [0.5,1)
+@ so 30 significant bits in r5
+
+ lsls r4,#1              @ y now Q58
+ lsrs r1,r5,#9           @ to Q52
+ lsls r0,r5,#23
+ lsrs r3,#9              @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
+ orrs r0,r3
+ bcs dd_1
+ b dd_2
+dd_0:
+@ here candidate quotient a1 is in range [1,2)
+@ so 31 significant bits in r5
+
+ movs r2,#4
+ add r12,r12,r2          @ fix exponent; r3:r5 now effectively Q61
+ adds r3,#128            @ complete rounding to Q53
+ adcs r5,r5,r6
+ lsrs r1,r5,#10
+ lsls r0,r5,#22
+ lsrs r3,#10             @ 0.5ulp-significance bit in carry: if this is 1 we may need to correct result
+ orrs r0,r3
+ bcc dd_2
+dd_1:
+
+@ here
+@ r0:r1  rounded result Q53 [0.5,1) or Q52 [1,2), but may not be correctly rounded-to-nearest
+@ r4     yL Q58 or Q57
+@ r12    result sign, exponent
+@ carry set
+
+ adcs r0,r0,r0
+ adcs r1,r1,r1            @ z Q53 with 1 in LSB
+ lsls r4,#16              @ Q105-32=Q73
+ ldr r2,[r13,#0]          @ xL Q52
+ ldr r3,[r13,#4]          @ xH Q20
+
+ movs r5,r1               @ zH Q21
+ muls r5,r2               @ zH*xL Q73
+ subs r4,r5
+ muls r3,r0               @ zL*xH Q73
+ subs r4,r3
+ mul32_32_64 r2,r0, r2,r3, r5,r6,r7,r2,r3  @ xL*zL
+ rsbs r2,#0               @ borrow from low half?
+ sbcs r4,r3               @ y-xz Q73 (remainder bits 52..73)
+
+ cmp r4,#0
+
+ bmi 1f
+ movs r2,#0               @ round up
+ adds r0,#1
+ adcs r1,r2
+1:
+ lsrs r0,#1               @ shift back down to Q52
+ lsls r2,r1,#31
+ orrs r0,r2
+ lsrs r1,#1
+dd_2:
+ add r13,#8
+ mov r2,r12
+ lsls r7,r2,#31           @ result sign
+ asrs r2,#2               @ result exponent
+ ldr r3,=#0x3fd
+ adds r2,r3
+ ldr r3,=#0x7fe
+ cmp r2,r3
+ bhs dd_3                 @ over- or underflow?
+ lsls r2,#20
+ adds r1,r2               @ pack exponent
+dd_5:
+ adds r1,r7               @ pack sign
+ pop {r4-r7,r15}
+
+dd_3:
+ movs r0,#0
+ cmp r2,#0
+ bgt dd_4                 @ overflow?
+ movs r1,r7
+ pop {r4-r7,r15}
+
+dd_4:
+ adds r3,#1               @ 0x7ff
+ lsls r1,r3,#20
+ b dd_5
+
+/*
+Approach to square root x=sqrt(y) is as follows.
+
+First generate a3, an approximation to 1/sqrt(y) to about 30 bits. Multiply this by y
+to give a4~sqrt(y) to about 28 bits and a remainder r4=y-a4^2. Then, because
+d sqrt(y) / dy = 1 / (2 sqrt(y)) let d4=r4*a3/2 and then the value a5=a4+d4 is
+a better approximation to sqrt(y). If this is near a rounding boundary we
+compute an exact remainder y-a5*a5 to decide whether to round up or down.
+
+The calculation of a3 and a4 is as given in dsqrttest.c. That code verifies exhaustively
+that | 1 - a3a4 | < 10*2^-32, | r4 | < 40*2^-32 and | r4/y | < 20*2^-32.
+
+More precisely, with "y" representing y truncated to 30 binary places:
+
+u=(q3)y;                          // 24-entry table
+a0=(q8~)"1/sqrtq(x+x_ulp/2)"(u);  // first approximation from table
+p0=(q16)(a0*a0) * (q16)y;
+r0=(q20)(p0-1);
+dy0=(q15)(r0*a0);                 // Newton-Raphson correction term
+a1=(q16)a0-dy0/2;                 // good to ~9 bits
+
+p1=(q19)(a1*a1)*(q19)y;
+r1=(q23)(p1-1);
+dy1=(q15~)(r1*a1);                // second Newton-Raphson correction
+a2x=(q16)a1-dy1/2;                // good to ~16 bits
+a2=a2x-a2x/1t16;                  // prevent overflow of a2*a2 in 32 bits
+
+p2=(a2*a2)*(q30)y;                // Q62
+r2=(q36)(p2-1+1t-31);
+dy2=(q30)(r2*a2);                 // Q52->Q30
+a3=(q31)a2-dy2/2;                 // good to about 30 bits
+a4=(q30)(a3*(q30)y+1t-31);        // good to about 28 bits
+
+Error analysis
+
+          r₄ = y - a₄²
+          d₄ = 1/2 a₃r₄
+          a₅ = a₄ + d₄
+          r₅ = y - a₅²
+             = y - ( a₄ + d₄ )²
+             = y - a₄² - a₃a₄r₄ - 1/4 a₃²r₄²
+             = r₄ - a₃a₄r₄ - 1/4 a₃²r₄²
+
+      | r₅ | < | r₄ | | 1 - a₃a₄ | + 1/4 r₄²
+
+          a₅ = √y √( 1 - r₅/y )
+             = √y ( 1 - 1/2 r₅/y + ... )
+
+So to first order (second order being very tiny)
+
+     √y - a₅ = 1/2 r₅/y
+
+and
+
+ | √y - a₅ | < 1/2 ( | r₄/y | | 1 - a₃a₄ | + 1/4 r₄²/y )
+
+From dsqrttest.c (conservatively):
+
+             < 1/2 ( 20*2^-32 * 10*2^-32 + 1/4 * 40*2^-32*20*2^-32 )
+             = 1/2 ( 200 + 200 ) * 2^-64
+             < 2^-56
+
+Empirically we see about 1ulp worst-case error including rounding at Q57.
+
+To determine correctly whether the exact remainder calculation can be skipped we need a result
+accurate to < 0.25ulp at Q52, or 2^-54.
+*/
+
+dq_2:               @ from mufp_dsqrt: sign clear and exponent field 0 or 0x7ff; flags are from its cmp r2,r3
+ bge dq_3           @ +Inf? (any input with exponent field 0x7ff lands here, so NaN mantissas too)
+ movs r1,#0         @ exponent field 0: return +0
+ b dq_4
+
+dq_0:               @ from mufp_dsqrt: sign bit set; r2 holds the high word shifted left by one
+ lsrs r1,#31
+ lsls r1,#31        @ preserve sign bit
+ lsrs r2,#21        @ extract exponent
+ beq dq_4           @ -0? return it (taken for any negative input with a zero exponent field)
+ asrs r1,#11        @ make -Inf
+ b dq_4
+
+dq_3:
+ ldr r1,=#0x7ff
+ lsls r1,#20        @ return +Inf
+dq_4:
+ movs r0,#0         @ low word of every special-case result is zero
+dq_1:
+ bx r14             @ plain return: mufp_dsqrt branches here before it pushes anything
+
+.align 2
+.thumb_func
+mufp_dsqrt:
+ lsls r2,r1,#1
+ bcs dq_0           @ negative?
+ lsrs r2,#21        @ extract exponent
+ subs r2,#1
+ ldr r3,=#0x7fe
+ cmp r2,r3
+ bhs dq_2           @ catches 0 and +Inf
+ push {r4-r7,r14}
+ lsls r4,r2,#20
+ subs r1,r4         @ insert implied 1
+ lsrs r2,#1
+ bcc 1f             @ even exponent? skip
+ adds r0,r0,r0      @ odd exponent: shift up mantissa
+ adcs r1,r1,r1
+1:
+ lsrs r3,#2
+ adds r2,r3
+ lsls r2,#20
+ mov r12,r2         @ save result exponent
+
+@ here
+@ r0:r1  y mantissa Q52 [1,4)
+@ r12    result exponent
+
+ adr r4,drsqrtapp-8 @ first eight table entries are never accessed because of the mantissa's leading 1
+ lsrs r2,r1,#17    @ y Q3
+ ldrb r2,[r4,r2]   @ initial approximation to reciprocal square root a0 Q8
+ lsrs r3,r1,#4     @ first Newton-Raphson iteration
+ muls r3,r2
+ muls r3,r2        @  i32 p0=a0*a0*(y>>14);          // Q32
+ asrs r3,r3,#12    @  i32 r0=p0>>12;                 // Q20
+ muls r3,r2
+ asrs r3,#13       @  i32 dy0=(r0*a0)>>13;           // Q15
+ lsls r2,#8
+ subs r2,r3        @  i32 a1=(a0<<8)-dy0;         // Q16
+
+ movs r3,r2
+ muls r3,r3
+ lsrs r3,#13
+ lsrs r4,r1,#1
+ muls r3,r4        @  i32 p1=((a1*a1)>>11)*(y>>11);  // Q19*Q19=Q38
+ asrs r3,#15       @  i32 r1=p1>>15;                 // Q23
+ muls r3,r2
+ asrs r3,#23
+ adds r3,#1
+ asrs r3,#1        @  i32 dy1=(r1*a1+(1<<23))>>24;   // Q23*Q16=Q39; Q15
+ subs r2,r3        @  i32 a2=a1-dy1;                 // Q16
+ lsrs r3,r2,#16
+ subs r2,r3        @  if(a2>=0x10000) a2=0xffff; to prevent overflow of a2*a2
+
+@ here
+@ r0:r1 y mantissa
+@ r2    a2 ~ 1/sqrt(y) Q16
+@ r12   result exponent
+
+ movs r3,r2
+ muls r3,r3
+ lsls r1,#10
+ lsrs r4,r0,#22
+ orrs r1,r4        @ y Q30
+ mul32_32_64 r1,r3, r4,r3, r5,r6,r7,r4,r3   @  i64 p2=(ui64)(a2*a2)*(ui64)y;  // Q62 r4:r3
+ lsls r5,r3,#6
+ lsrs r4,#26
+ orrs r4,r5
+ adds r4,#0x20     @  i32 r2=(p2>>26)+0x20;          // Q36 r4
+ uxth r5,r4
+ muls r5,r2
+ asrs r4,#16
+ muls r4,r2
+ lsrs r5,#16
+ adds r4,r5
+ asrs r4,#6        @ i32 dy2=((i64)r2*(i64)a2)>>22; // Q36*Q16=Q52; Q30
+ lsls r2,#15
+ subs r2,r4
+
+@ here
+@ r0    y low bits
+@ r1    y Q30
+@ r2    a3 ~ 1/sqrt(y) Q31
+@ r12   result exponent
+
+ mul32_32_64 r2,r1, r3,r4, r5,r6,r7,r3,r4
+ adds r3,r3,r3
+ adcs r4,r4,r4
+ adds r3,r3,r3
+ movs r3,#0
+ adcs r3,r4        @ ui32 a4=((ui64)a3*(ui64)y+(1U<<31))>>31; // Q30
+
+@ here
+@ r0    y low bits
+@ r1    y Q30
+@ r2    a3 Q31 ~ 1/sqrt(y)
+@ r3    a4 Q30 ~ sqrt(y)
+@ r12   result exponent
+
+ square32_64 r3, r4,r5, r6,r5,r7
+ lsls r6,r0,#8
+ lsrs r7,r1,#2
+ subs r6,r4
+ sbcs r7,r5        @ r4=(q60)y-a4*a4
+
+@ by exhaustive testing, r4 = fffffffc0e134fdc .. 00000003c2bf539c Q60
+
+ lsls r5,r7,#29
+ lsrs r6,#3
+ adcs r6,r5        @ r4 Q57 with rounding
+ muls32_32_64 r6,r2, r6,r2, r4,r5,r7,r6,r2    @ d4=a3*r4/2 Q89
+@ r4+d4 is correct to 1ULP at Q57, tested on ~9bn cases including all extreme values of r4 for each possible y Q30
+
+ adds r2,#8
+ asrs r2,#5        @ d4 Q52, rounded to Q53 with spare bit in carry
+
+@ here
+@ r0    y low bits
+@ r1    y Q30
+@ r2    d4 Q52, rounded to Q53
+@ C flag contains d4_b53
+@ r3    a4 Q30
+
+ bcs dq_5
+
+ lsrs r5,r3,#10    @ a4 Q52
+ lsls r4,r3,#22
+
+ asrs r1,r2,#31
+ adds r0,r2,r4
+ adcs r1,r5        @ a4+d4
+
+ add r1,r12        @ pack exponent
+ pop {r4-r7,r15}
+
+.ltorg
+ 
+
+@ round(sqrt(2^22./[68:8:252]))
+drsqrtapp:
+.byte 0xf8,0xeb,0xdf,0xd6,0xcd,0xc5,0xbe,0xb8
+.byte 0xb2,0xad,0xa8,0xa4,0xa0,0x9c,0x99,0x95
+.byte 0x92,0x8f,0x8d,0x8a,0x88,0x85,0x83,0x81
+
+dq_5:
+@ here we are near a rounding boundary, C is set
+ adcs r2,r2,r2     @ d4 Q53+1ulp
+ lsrs r5,r3,#9
+ lsls r4,r3,#23    @ r4:r5 a4 Q53
+ asrs r1,r2,#31
+ adds r4,r2,r4
+ adcs r5,r1        @ r4:r5 a5=a4+d4 Q53+1ulp
+ movs r3,r5
+ muls r3,r4
+ square32_64 r4,r1,r2,r6,r2,r7
+ adds r2,r3
+ adds r2,r3        @ r1:r2 a5^2 Q106
+ lsls r0,#22       @ y Q84
+
+ rsbs r1,#0
+ sbcs r0,r2        @ remainder y-a5^2
+ bmi 1f            @ y<a5^2: no need to increment a5
+ movs r3,#0
+ adds r4,#1
+ adcs r5,r3        @ bump a5 if over rounding boundary
+1:
+ lsrs r0,r4,#1
+ lsrs r1,r5,#1
+ lsls r5,#31
+ orrs r0,r5
+ add r1,r12
+ pop {r4-r7,r15}
+
+.thumb_func
+mufp_dcmp_fast:                @ entry that skips the NaN/denormal flushing done in mufp_dcmp_combined
+.thumb_func
+mufp_dcmp_fast_flags:          @ same address as mufp_dcmp_fast
+ push {r4,r6,r7,r14}           @ same register list as mufp_dcmp_combined so the shared pop matches
+ b dcmp_fast_entry
+
+@ compare r0:r1 against r2:r3, returning -1/0/1 for <, =, >
+@ also set flags accordingly
+.thumb_func
+mufp_dcmp_combined:
+ push {r4,r6,r7,r14}
+ ldr r7,=#0x7ff                @ flush NaNs and denormals
+ lsls r4,r1,#1                 @ drop sign bit of x
+ lsrs r4,#21                   @ r4=exponent field of x
+ beq 1f                        @ exponent 0 (zero or denormal): flush
+ cmp r4,r7
+ bne 2f                        @ exponent not 0x7ff: leave x untouched
+1:
+ movs r0,#0                    @ clear every mantissa bit of x, keeping sign and exponent
+ lsrs r1,#20
+ lsls r1,#20
+2:
+ lsls r4,r3,#1                 @ same treatment for y in r2:r3
+ lsrs r4,#21
+ beq 1f
+ cmp r4,r7
+ bne 2f
+1:
+ movs r2,#0
+ lsrs r3,#20
+ lsls r3,#20
+2:
+dcmp_fast_entry:
+ movs r6,#1                    @ r6=provisional result +1
+ eors r3,r1
+ bmi 4f                        @ opposite signs? then can proceed on basis of sign of x
+ eors r3,r1                    @ restore r3
+ bpl 1f
+ rsbs r6,#0                    @ negative? flip comparison
+1:
+ cmp r1,r3                     @ high words first (signed compare)
+ bne 1f
+ cmp r0,r2                     @ high words equal: decide on low words (unsigned)
+ bhi 2f                        @ keep r6
+ blo 3f                        @ negate r6
+5:
+ movs r6,#0                    @ equal? result is 0
+1:
+ bgt 2f                        @ from bne: keep r6 if r1>r3; after label 5 Z is set so this falls through
+3:
+ rsbs r6,#0                    @ negate provisional result (0 stays 0)
+2:
+ subs r0,r6,#0                 @ copy and set flags
+ pop {r4,r6,r7,r15}
+4:
+ orrs r3,r1                    @ make -0==+0
+ adds r3,r3                    @ shift out the sign bit
+ orrs r3,r0
+ orrs r3,r2
+ beq 5b                        @ no bits set apart from signs: treat as equal
+ cmp r1,#0
+ bge 2b                        @ x has sign bit clear (so y has it set): return +1
+ b 3b                          @ x has sign bit set: return -1
+
+
+@ "scientific" functions start here
+
+.thumb_func
+push_r8_r11:                   @ helper: push r8-r11 onto the stack; overwrites r4-r7
+ mov r4,r8                     @ copy the high registers into low registers first
+ mov r5,r9
+ mov r6,r10
+ mov r7,r11
+ push {r4-r7}                  @ push the four copies
+ bx r14
+
+.thumb_func
+pop_r8_r11:                    @ helper: inverse of push_r8_r11; overwrites r4-r7
+ pop {r4-r7}                   @ fetch the four saved words
+ mov r8,r4                     @ and move them back into r8-r11
+ mov r9,r5
+ mov r10,r6
+ mov r11,r7
+ bx r14
+
+@ double-length CORDIC rotation step
+
+@ r0:r1   ω
+@ r6      32-i (complementary shift)
+@ r7      i (shift)
+@ r8:r9   x
+@ r10:r11 y
+@ r12     coefficient pointer
+
+@ an option in rotation mode would be to compute the sequence of σ values
+@ in one pass, rotate the initial vector by the residual ω and then run a
+@ second pass to compute the final x and y. This would relieve pressure
+@ on registers and hence possibly be faster. The same trick does not work
+@ in vectoring mode (but perhaps one could work to single precision in
+@ a first pass and then double precision in a second pass?).
+
+.thumb_func
+dcordic_vec_step:              @ vectoring mode: direction chosen from sign of y
+ mov r2,r12                    @ coefficient pointer
+ ldmia r2!,{r3,r4}             @ r3:r4=dω for this step; pointer advances by 8
+ mov r12,r2
+ mov r2,r11                    @ high word of y
+ cmp r2,#0
+ blt 1f                        @ y<0
+ b 2f                          @ y>=0
+
+.thumb_func
+dcordic_rot_step:              @ rotation mode: direction chosen from sign of ω
+ mov r2,r12                    @ coefficient pointer
+ ldmia r2!,{r3,r4}             @ r3:r4=dω for this step; pointer advances by 8
+ mov r12,r2
+ cmp r1,#0                     @ high word of ω
+ bge 1f                        @ ω>=0
+2:
+@ ω<0 / y>=0
+@ ω+=dω
+@ x+=y>>i, y-=x>>i
+ adds r0,r3
+ adcs r1,r4
+
+ mov r3,r11
+ asrs r3,r7                    @ high word of y>>i (arithmetic)
+ mov r4,r11
+ lsls r4,r6                    @ bits of high word that move into the low word
+ mov r2,r10
+ lsrs r2,r7                    @ low word shifted; last bit shifted out is left in carry
+ orrs r2,r4 @ r2:r3 y>>i, rounding in carry
+ mov r4,r8
+ mov r5,r9   @ r4:r5 x
+ adcs r2,r4
+ adcs r3,r5  @ r2:r3 x+(y>>i)
+ mov r8,r2
+ mov r9,r3
+
+ mov r3,r5                     @ r4:r5 still hold x from before the update
+ lsls r3,r6
+ asrs r5,r7
+ lsrs r4,r7
+ orrs r4,r3 @ r4:r5 x>>i, rounding in carry
+ mov r2,r10
+ mov r3,r11
+ sbcs r2,r4
+ sbcs r3,r5  @ r2:r3 y-(x>>i)
+ mov r10,r2
+ mov r11,r3
+ bx r14
+
+
+@ ω>=0 / y<0
+@ ω-=dω
+@ x-=y>>i, y+=x>>i
+1:
+ subs r0,r3
+ sbcs r1,r4
+
+ mov r3,r9
+ asrs r3,r7                    @ high word of x>>i (arithmetic)
+ mov r4,r9
+ lsls r4,r6                    @ bits of high word that move into the low word
+ mov r2,r8
+ lsrs r2,r7                    @ low word shifted; last bit shifted out is left in carry
+ orrs r2,r4 @ r2:r3 x>>i, rounding in carry
+ mov r4,r10
+ mov r5,r11  @ r4:r5 y
+ adcs r2,r4
+ adcs r3,r5  @ r2:r3 y+(x>>i)
+ mov r10,r2
+ mov r11,r3
+
+ mov r3,r5                     @ r4:r5 still hold y from before the update
+ lsls r3,r6
+ asrs r5,r7
+ lsrs r4,r7
+ orrs r4,r3 @ r4:r5 y>>i, rounding in carry
+ mov r2,r8
+ mov r3,r9
+ sbcs r2,r4
+ sbcs r3,r5  @ r2:r3 x-(y>>i)
+ mov r8,r2
+ mov r9,r3
+ bx r14
+
+ret_dzero:                     @ shared exit: return r0:r1=0 (branched to before anything is pushed)
+ movs r0,#0
+ movs r1,#0
+ bx r14
+
+@ convert packed double in r0:r1 to signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point], with rounding towards -Inf
+@ fixed-point versions only work with reasonable values in r2 because of the way dunpacks works
+
+.thumb_func
+mufp_double2int:
+ movs r2,#0                    @ and fall through
+.thumb_func
+mufp_double2fix:
+ push {r14}
+ adds r2,#32                   @ ask the 64-bit routine for 32 extra places after the point ...
+ bl mufp_double2fix64
+ movs r0,r1                    @ ... so the 32-bit result is the high word of its answer
+ pop {r15}
+
+.thumb_func
+mufp_double2uint:
+ movs r2,#0                    @ and fall through
+.thumb_func
+mufp_double2ufix:
+ push {r14}
+ adds r2,#32                   @ ask the 64-bit routine for 32 extra places after the point ...
+ bl mufp_double2ufix64
+ movs r0,r1                    @ ... so the 32-bit result is the high word of its answer
+ pop {r15}
+
+.thumb_func
+mufp_float2int64:
+ movs r1,#0                    @ and fall through
+.thumb_func
+mufp_float2fix64:
+ push {r14}
+ bl f2fix                      @ r0:r1=result, r3=sign extension bits
+ b d2f64_a                     @ share the clamp-and-return tail of mufp_double2fix64
+
+.thumb_func
+mufp_float2uint64:
+ movs r1,#0                    @ and fall through
+.thumb_func
+mufp_float2ufix64:
+ asrs r3,r0,#23                @ negative? return 0
+ bmi ret_dzero                 @ taken whenever the sign bit is set, so -0 also returns 0
+@ and fall through
+
+@ convert float in r0 to signed fixed point in r0:r1:r3, r1 places after point, rounding towards -Inf
+@ result clamped so that r3 can only be 0 or -1
+@ trashes r12
+.thumb_func
+f2fix:
+ push {r4,r14}                 @ matches the pops in the d2fix_a tail that this routine branches to
+ mov r12,r1                    @ binary point position, consumed at d2fix_a
+ asrs r3,r0,#31                @ r3=sign across all bits
+ lsls r0,#1                    @ discard sign bit
+ lsrs r2,r0,#24                @ r2=exponent field
+ beq 1f                        @ zero? (exponent field 0, so denormals also return 0)
+ cmp r2,#0xff                  @ Inf? (exponent field 0xff, so NaN takes this path too)
+ beq 2f
+ subs r1,r2,#1
+ subs r2,#0x7f                 @ remove exponent bias
+ lsls r1,#24
+ subs r0,r1                    @ insert implied 1
+ eors r0,r3
+ subs r0,r3                    @ top two's complement
+ asrs r1,r0,#4                 @ convert to double format
+ lsls r0,#28
+ b d2fix_a
+1:
+ movs r0,#0
+ movs r1,r0
+ movs r3,r0
+ pop {r4,r15}
+2:
+ mvns r0,r3                    @ return max/min value
+ mvns r1,r3                    @ r0:r1=~sign: all ones for +Inf, zero for -Inf; r3 keeps the sign
+ pop {r4,r15}
+
+.thumb_func
+mufp_double2int64:
+ movs r2,#0                    @ and fall through
+.thumb_func
+mufp_double2fix64:
+ push {r14}
+ bl d2fix                      @ r0:r1=result, r3=sign extension bits
+d2f64_a:                       @ also entered from mufp_float2fix64 with the same stack layout
+ asrs r2,r1,#31                @ sign of the 64-bit result
+ cmp r2,r3
+ bne 1f                        @ sign extension bits fail to match sign of result?
+ pop {r15}
+1:
+ mvns r0,r3                    @ low word: all ones if positive, zero if negative
+ movs r1,#1
+ lsls r1,#31                   @ 0x80000000
+ eors r1,r1,r0                 @ generate extreme fixed-point values
+ pop {r15}
+
+.thumb_func
+mufp_double2uint64:
+ movs r2,#0                    @ and fall through
+.thumb_func
+mufp_double2ufix64:
+ asrs r3,r1,#20                @ negative? return 0
+ bmi ret_dzero                 @ taken whenever the sign bit is set, so -0 also returns 0
+@ and fall through
+
+@ convert double in r0:r1 to signed fixed point in r0:r1:r3, r2 places after point, rounding towards -Inf
+@ result clamped so that r3 can only be 0 or -1
+@ trashes r12
+.thumb_func
+d2fix:
+ push {r4,r14}
+ mov r12,r2                    @ binary point position, consumed at d2fix_a
+ bl dunpacks
+ asrs r4,r2,#16                @ presumably dunpacks flags zero with a very negative exponent - TODO confirm against mdunpacks
+ adds r4,#1
+ bge 1f                        @ exponent not below -65536: keep mantissa as is
+ movs r1,#0                    @ -0 -> +0
+1:
+ asrs r3,r1,#31                @ r3=sign extension of mantissa
+d2fix_a:
+@ here
+@ r0:r1 two's complement mantissa
+@ r2    unbiased exponent
+@ r3    mantissa sign extension bits
+ add r2,r12                    @ exponent plus offset for required binary point position
+ subs r2,#52                   @ required shift
+ bmi 1f                        @ shift down?
+@ here a shift up by r2 places
+ cmp r2,#12                    @ will clamp?
+ bge 2f
+ movs r4,r0
+ lsls r1,r2
+ lsls r0,r2
+ rsbs r2,#0
+ adds r2,#32                   @ complementary shift
+ lsrs r4,r2                    @ bits of low word that move into the high word
+ orrs r1,r4
+ pop {r4,r15}
+2:
+ mvns r0,r3
+ mvns r1,r3                    @ overflow: clamp to extreme fixed-point values
+ pop {r4,r15}
+1:
+@ here a shift down by -r2 places
+ adds r2,#32
+ bmi 1f                        @ long shift?
+ mov r4,r1
+ lsls r4,r2                    @ bits of high word that move into the low word
+ rsbs r2,#0
+ adds r2,#32                   @ complementary shift
+ asrs r1,r2
+ lsrs r0,r2
+ orrs r0,r4
+ pop {r4,r15}
+1:
+@ here a long shift down
+ movs r0,r1
+ asrs r1,#31                   @ shift down 32 places
+ adds r2,#32
+ bmi 1f                        @ very long shift?
+ rsbs r2,#0
+ adds r2,#32
+ asrs r0,r2
+ pop {r4,r15}
+1:
+ movs r0,r3                    @ result very near zero: use sign extension bits
+ movs r1,r3
+ pop {r4,r15}
+
+@ float <-> double conversions
+.thumb_func
+mufp_float2double:             @ float in r0 -> double in r0:r1; uses r2,r3 as scratch
+ lsrs r3,r0,#31                @ sign bit
+ lsls r3,#31
+ lsls r1,r0,#1
+ lsrs r2,r1,#24                @ exponent
+ beq 1f                        @ zero? (exponent field 0, so denormals also give signed zero)
+ cmp r2,#0xff                  @ Inf? (exponent field 0xff, so NaN also gives signed infinity)
+ beq 2f
+ lsrs r1,#4                    @ exponent and top 20 bits of mantissa
+ ldr r2,=#(0x3ff-0x7f)<<20     @ difference in exponent offsets
+ adds r1,r2
+ orrs r1,r3
+ lsls r0,#29                   @ bottom 3 bits of mantissa
+ bx r14
+1:
+ movs r1,r3                    @ return signed zero
+3:
+ movs r0,#0
+ bx r14
+2:
+ ldr r1,=#0x7ff00000           @ return signed infinity
+ adds r1,r3
+ b 3b
+
+.thumb_func
+mufp_double2float:             @ double in r0:r1 -> float in r0, round to nearest even; uses r2,r3 as scratch
+ lsls r2,r1,#1
+ lsrs r2,#21                   @ exponent
+ ldr r3,=#0x3ff-0x7f
+ subs r2,r3                    @ fix exponent bias
+ ble 1f                        @ underflow or zero
+ cmp r2,#0xff
+ bge 2f                        @ overflow or infinity
+ lsls r2,#23                   @ position exponent of result
+ lsrs r3,r1,#31
+ lsls r3,#31
+ orrs r2,r3                    @ insert sign
+ lsls r3,r0,#3                 @ rounding bits
+ lsrs r0,#29                   @ top 3 bits of low word become bottom 3 bits of mantissa
+ lsls r1,#12                   @ strip sign and exponent from high word
+ lsrs r1,#9
+ orrs r0,r1                    @ assemble mantissa
+ orrs r0,r2                    @ insert exponent and sign
+ lsls r3,#1                    @ carry=first discarded bit, Z=all later bits zero
+ bcc 3f                        @ no rounding
+ beq 4f                        @ all sticky bits 0?
+5:
+ adds r0,#1                    @ round up (a mantissa carry propagates into the exponent field)
+3:
+ bx r14
+4:
+ lsrs r3,r0,#1                 @ odd? then round up
+ bcs 5b
+ bx r14
+1:
+ beq 6f                        @ check case where value is just less than smallest normal
+7:
+ lsrs r0,r1,#31                @ return signed zero
+ lsls r0,#31
+ bx r14
+6:
+ lsls r2,r1,#12                @ 20 1:s at top of mantissa?
+ asrs r2,#12
+ adds r2,#1
+ bne 7b
+ lsrs r2,r0,#29                @ and 3 more 1:s?
+ cmp r2,#7
+ bne 7b
+ movs r2,#1                    @ return smallest normal with correct sign
+ b 8f
+2:
+ movs r2,#0xff
+8:
+ lsrs r0,r1,#31                @ return sign with exponent field r2 (0xff=infinity, 1=smallest normal)
+ lsls r0,#8
+ adds r0,r2
+ lsls r0,#23
+ bx r14
+
+@ convert signed/unsigned 32/64-bit integer/fixed-point value in r0:r1 [with r2 places after point] to packed double in r0:r1, with rounding
+
+.thumb_func
+mufp_uint2double:
+ movs r1,#0                    @ and fall through
+.thumb_func
+mufp_ufix2double:
+ movs r2,r1                    @ places after point move to r2 as the 64-bit routine expects
+ movs r1,#0                    @ zero extend to 64 bits
+ b mufp_ufix642double
+
+.thumb_func
+mufp_int2double:
+ movs r1,#0                    @ and fall through
+.thumb_func
+mufp_fix2double:
+ movs r2,r1                    @ places after point move to r2 as the 64-bit routine expects
+ asrs r1,r0,#31                @ sign extend
+ b mufp_fix642double
+
+.thumb_func
+mufp_uint642double:
+ movs r2,#0                    @ and fall through
+.thumb_func
+mufp_ufix642double:
+ movs r3,#0                    @ sign=positive
+ b uf2d
+
+.thumb_func
+mufp_int642double:
+ movs r2,#0                    @ and fall through
+.thumb_func
+mufp_fix642double:
+ asrs r3,r1,#31                @ sign bit across all bits
+ eors r0,r3                    @ take absolute value of r0:r1 ...
+ eors r1,r3
+ subs r0,r3                    @ ... by conditional complement and increment
+ sbcs r1,r3
+uf2d:
+ push {r4,r5,r14}
+ ldr r4,=#0x432
+ subs r2,r4,r2                 @ form biased exponent
+@ here
+@ r0:r1 unnormalised mantissa
+@ r2 -Q (will become exponent)
+@ r3 sign across all bits
+ cmp r1,#0
+ bne 1f                        @ short normalising shift?
+ movs r1,r0                    @ high word is zero: move up 32 places
+ beq 2f                        @ zero? return it
+ movs r0,#0
+ subs r2,#32                   @ fix exponent
+1:
+ asrs r4,r1,#21                @ r4=bits above b20 of high word; carry=b20
+ bne 3f                        @ will need shift down (and rounding?)
+ bcs 4f                        @ normalised already?
+5:
+ subs r2,#1
+ adds r0,r0                    @ shift up
+ adcs r1,r1
+ lsrs r4,r1,#21                @ carry=b20 of high word
+ bcc 5b                        @ repeat until the leading 1 reaches b20
+4:
+ ldr r4,=#0x7fe
+ cmp r2,r4
+ bhs 6f                        @ over/underflow? return signed zero/infinity
+7:
+ lsls r2,#20                   @ pack and return
+ adds r1,r2                    @ the leading 1 of the mantissa adds 1 to the exponent field
+ lsls r3,#31
+ adds r1,r3                    @ insert sign
+2:
+ pop {r4,r5,r15}
+6:                             @ return signed zero/infinity according to unclamped exponent in r2
+ mvns r2,r2
+ lsrs r2,#21                   @ 0x7ff if r2 was non-negative, 0 if it was negative
+ movs r0,#0
+ movs r1,#0
+ b 7b
+
+3:
+@ here we need to shift down to normalise and possibly round
+ bmi 1f                        @ already normalised to Q63?
+2:
+ subs r2,#1
+ adds r0,r0                    @ shift up
+ adcs r1,r1
+ bpl 2b
+1:
+@ here we have a 1 in b63 of r0:r1
+ adds r2,#11                   @ correct exponent for subsequent shift down
+ lsls r4,r0,#21                @ save bits for rounding
+ lsrs r0,#11
+ lsls r5,r1,#21
+ orrs r0,r5
+ lsrs r1,#11
+ lsls r4,#1                    @ carry=first discarded bit, Z=all later bits zero
+ beq 1f                        @ sticky bits are zero?
+8:
+ movs r4,#0
+ adcs r0,r4                    @ add the carry as the rounding increment
+ adcs r1,r4
+ b 4b
+1:
+ bcc 4b                        @ sticky bits are zero but not on rounding boundary
+ lsrs r4,r0,#1                 @ increment if odd (force round to even)
+ b 8b
+
+
+.ltorg
+
+.thumb_func
+dunpacks:                      @ unpack double in r0:r1; callers use r0:r1 as mantissa and r2 as exponent afterwards
+ mdunpacks r0,r1,r2,r3,r4      @ macro defined elsewhere; presumably r3,r4 are scratch - TODO confirm
+ ldr r3,=#0x3ff
+ subs r2,r3                    @ exponent without offset
+ bx r14
+
+@ r0:r1  signed mantissa Q52
+@ r2     unbiased exponent < 10 (i.e., |x|<2^10)
+@ r4     pointer to:
+@          - divisor reciprocal approximation r=1/d Q15
+@          - divisor d Q62  0..20
+@          - divisor d Q62 21..41
+@          - divisor d Q62 42..62
+@ returns:
+@ r0:r1  reduced result y Q62, -0.6 d < y < 0.6 d (better in practice)
+@ r2     quotient q (number of reductions)
+@ if exponent >=10, x is first replaced by ±2^10 (sign of mantissa) at label 4 and then reduced as usual; trashes r3-r7
+@ designed to work for 0.5<d<2, in particular d=ln2 (~0.7) and d=π/2 (~1.6)
+.thumb_func
+dreduce:
+ adds r2,#2     @ e+2
+ bmi 1f         @ |x|<0.25, too small to need adjustment
+ cmp r2,#12
+ bge 4f         @ e>=10: clamp first
+2:
+ movs r5,#17
+ subs r5,r2     @ 15-e
+ movs r3,r1     @ Q20
+ asrs r3,r5     @ x Q5
+ adds r2,#8     @ e+10
+ adds r5,#7     @ 22-e = 32-(e+10)
+ movs r6,r0
+ lsrs r6,r5     @ bits of low word that move into the high word
+ lsls r0,r2
+ lsls r1,r2
+ orrs r1,r6     @ r0:r1 x Q62
+ ldmia r4,{r4-r7} @ r4=r, r5/r6/r7=low/middle/high 21-bit pieces of d
+ muls r3,r4     @ rx Q20
+ asrs r2,r3,#20
+ movs r3,#0
+ adcs r2,r3     @ rx Q0 rounded = q; for e.g. r=1.5 |q|<1.5*2^10
+ muls r5,r2     @ qd in pieces: L Q62
+ muls r6,r2     @               M Q41
+ muls r7,r2     @               H Q20
+ lsls r7,#10    @ H aligned to the high word
+ asrs r4,r6,#11 @ part of M belonging to the high word
+ lsls r6,#21    @ part of M belonging to the low word
+ adds r6,r5
+ adcs r7,r4
+ asrs r5,#31    @ sign extension of L
+ adds r7,r5     @ r6:r7 qd Q62
+ subs r0,r6
+ sbcs r1,r7     @ remainder Q62
+ bx r14
+4:
+ movs r2,#12                   @ overflow: clamp to +/-1024
+ movs r0,#0
+ asrs r1,#31                   @ 0 or -1
+ lsls r1,#1
+ adds r1,#1                    @ +1 or -1
+ lsls r1,#20                   @ mantissa of +/-1.0 Q52
+ b 2b
+
+1:
+ lsls r1,#8
+ lsrs r3,r0,#24
+ orrs r1,r3
+ lsls r0,#8     @ r0:r1 Q60, to be shifted down -r2 places
+ rsbs r3,r2,#0
+ adds r2,#32    @ shift down in r3, complementary shift in r2
+ bmi 1f         @ long shift?
+2:
+ movs r4,r1
+ asrs r1,r3
+ lsls r4,r2
+ lsrs r0,r3
+ orrs r0,r4
+ movs r2,#0     @ rounding
+ adcs r0,r2
+ adcs r1,r2     @ returns with r2=0, i.e. q=0
+ bx r14
+
+1:
+ movs r0,r1     @ down 32 places
+ asrs r1,#31
+ subs r3,#32
+ adds r2,#32
+ bpl 2b
+ movs r0,#0     @ very long shift? return 0
+ movs r1,#0
+ movs r2,#0
+ bx r14
+
+.thumb_func
+mufp_dtan:                     @ double in r0:r1 -> tan, computed as sin/cos
+ push {r4-r7,r14}
+ bl push_r8_r11
+ bl dsincos
+ mov r12,r0                    @ save ε
+ bl dcos_finish                @ r0:r1=cos θ as a packed double
+ push {r0,r1}
+ mov r0,r12                    @ restore ε (only r0 is read by dsin_finish)
+ bl dsin_finish                @ r0:r1=sin θ as a packed double
+ pop {r2,r3}                   @ r2:r3=cos θ
+ bl pop_r8_r11
+ b ddiv0                       @ compute sin θ/cos θ
+
+.thumb_func
+mufp_dcos:                     @ double in r0:r1 -> cos
+ push {r4-r7,r14}
+ bl push_r8_r11
+ bl dsincos
+ bl dcos_finish
+ b 1f                          @ shared epilogue below
+
+.thumb_func
+mufp_dsin:                     @ double in r0:r1 -> sin
+ push {r4-r7,r14}
+ bl push_r8_r11
+ bl dsincos
+ bl dsin_finish
+1:
+ bl pop_r8_r11                 @ restore r8-r11 (overwrites r4-r7, which the next pop reloads)
+ pop {r4-r7,r15}
+
+
+@ unpack double θ in r0:r1, range reduce and calculate ε, cos α and sin α such that
+@ θ=α+ε and |ε|≤2^-32
+@ on return:
+@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
+@ r8:r9   cos α Q62
+@ r10:r11 sin α Q62
+.thumb_func
+dsincos:
+ push {r14}
+ bl dunpacks
+ adr r4,dreddata0              @ reduction data for d=π/2
+ bl dreduce                    @ r0:r1=remainder, r2=quadrant count q
+
+ movs r4,#0
+ ldr r5,=#0x9df04dbb @ this value compensates for the non-unity scaling of the CORDIC rotations
+ ldr r6,=#0x36f656c5           @ r5:r6=starting vector length
+ lsls r2,#31                   @ carry=b1 of q
+ bcc 1f
+@ quadrant 2 or 3
+ mvns r6,r6                    @ negate r5:r6
+ rsbs r5,r5,#0
+ adcs r6,r4
+1:
+ lsls r2,#1                    @ carry=b0 of q
+ bcs 1f
+@ even quadrant
+ mov r10,r4                    @ y=0
+ mov r11,r4
+ mov r8,r5                     @ x=starting length
+ mov r9,r6
+ b 2f
+1:
+@ odd quadrant
+ mov r8,r4                     @ x=0
+ mov r9,r4
+ mov r10,r5                    @ y=starting length
+ mov r11,r6
+2:
+ adr r4,dtab_cc
+ mov r12,r4                    @ coefficient pointer for dcordic_rot_step
+ movs r7,#1                    @ shift i
+ movs r6,#31                   @ complementary shift 32-i
+1:
+ bl dcordic_rot_step
+ adds r7,#1
+ subs r6,#1
+ cmp r7,#33
+ bne 1b                        @ 32 steps, i=1..32
+ pop {r15}
+
+dcos_finish:
+@ here
+@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
+@ r8:r9   cos α Q62
+@ r10:r11 sin α Q62
+@ and we wish to calculate cos θ=cos(α+ε)~cos α - ε sin α
+ mov r1,r11                    @ high word of sin α
+@ mov r2,r10
+@ lsrs r2,#31
+@ adds r1,r2                    @ rounding improves accuracy very slightly
+ muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
+@ r2:r3   ε sin α Q(62+62-32)=Q92
+ mov r0,r8
+ mov r1,r9                     @ r0:r1=cos α
+ lsls r5,r3,#2                 @ shift r2:r3 down 30 places: Q92 -> Q62
+ asrs r3,r3,#30
+ lsrs r2,r2,#30
+ orrs r2,r5
+ sbcs r0,r2                    @ include rounding
+ sbcs r1,r3
+ movs r2,#62
+ b mufp_fix642double           @ tail call: pack Q62 value and return to our caller
+
+dsin_finish:
+@ here
+@ r0:r1   ε (residual ω, where θ=α+ε) Q62, |ε|≤2^-32 (so fits in r0)
+@ r8:r9   cos α Q62
+@ r10:r11 sin α Q62
+@ and we wish to calculate sin θ=sin(α+ε)~sin α + ε cos α
+ mov r1,r9                     @ high word of cos α
+ muls32_s32_64 r0,r1, r2,r3, r4,r5,r6,r2,r3
+@ r2:r3   ε cos α Q(62+62-32)=Q92
+ mov r0,r10
+ mov r1,r11                    @ r0:r1=sin α
+ lsls r5,r3,#2                 @ shift r2:r3 down 30 places: Q92 -> Q62
+ asrs r3,r3,#30
+ lsrs r2,r2,#30
+ orrs r2,r5
+ adcs r0,r2 @ include rounding
+ adcs r1,r3
+ movs r2,#62
+ b mufp_fix642double           @ tail call: pack Q62 value and return to our caller
+
+.ltorg
+.align 2
+dreddata0:
+.word 0x0000517d    @ 2/π Q15
+.word 0x0014611A    @ π/2 Q62=6487ED5110B4611A split into 21-bit pieces
+.word 0x000A8885
+.word 0x001921FB
+
+.thumb_func
+mufp_datan2:
+@ r0:r1 y
+@ r2:r3 x
+ push {r4-r7,r14}
+ bl push_r8_r11
+ ldr r5,=#0x7ff00000           @ exponent field mask
+ movs r4,r1
+ ands r4,r5                    @ y==0?
+ beq 1f
+ cmp r4,r5                     @ or Inf/NaN?
+ bne 2f
+1:
+ lsrs r1,#20                   @ flush
+ lsls r1,#20
+ movs r0,#0
+2:
+ movs r4,r3
+ ands r4,r5                    @ x==0?
+ beq 1f
+ cmp r4,r5                     @ or Inf/NaN?
+ bne 2f
+1:
+ lsrs r3,#20                   @ flush
+ lsls r3,#20
+ movs r2,#0
+2:
+ movs r6,#0                    @ quadrant offset
+ lsls r5,#11                   @ constant 0x80000000
+ cmp r3,#0
+ bpl 1f                        @ skip if x positive
+ movs r6,#2
+ eors r3,r5                    @ flip signs of both x and y
+ eors r1,r5
+ bmi 1f                        @ quadrant offset=+2 if y was positive
+ rsbs r6,#0                    @ quadrant offset=-2 if y was negative
+1:
+@ now in quadrant 0 or 3
+ adds r7,r1,r5                 @ r7=-r1
+ bpl 1f
+@ y>=0: in quadrant 0
+ cmp r1,r3
+ ble 2f                        @ y<~x so 0≤θ<~π/4: skip
+ adds r6,#1
+ eors r1,r5                    @ negate x
+ b 3f                          @ and exchange x and y = rotate by -π/2
+1:
+ cmp r3,r7
+ bge 2f                        @ -y<~x so -π/4<~θ≤0: skip
+ subs r6,#1
+ eors r3,r5                    @ negate y and ...
+3:
+ movs r7,r0                    @ exchange x and y
+ movs r0,r2
+ movs r2,r7
+ movs r7,r1
+ movs r1,r3
+ movs r3,r7
+2:
+@ here -π/4<~θ<~π/4
+@ r6 has quadrant offset
+ push {r6}
+ cmp r2,#0
+ bne 1f                        @ x low word non-zero: cannot be 0 or Inf after flushing
+ cmp r3,#0
+ beq 10f                       @ x==0 going into division?
+ lsls r4,r3,#1
+ asrs r4,#21                   @ exponent field sign-extended: -1 when it is all ones
+ adds r4,#1
+ bne 1f                        @ x==Inf going into division?
+ lsls r4,r1,#1
+ asrs r4,#21
+ adds r4,#1                    @ y also ±Inf?
+ bne 10f                       @ finite y over infinite x: angle is 0
+ subs r1,#1                    @ make them both just finite
+ subs r3,#1
+ b 1f
+
+10:
+ movs r0,#0                    @ angle within the octant is 0
+ movs r1,#0
+ b 12f
+
+1:
+ bl mufp_ddiv
+ movs r2,#62
+ bl mufp_double2fix64
+@ r0:r1 y/x
+ mov r10,r0
+ mov r11,r1
+ movs r0,#0                    @ ω=0
+ movs r1,#0
+ mov r8,r0
+ movs r2,#1
+ lsls r2,#30
+ mov r9,r2                     @ x=1
+
+ adr r4,dtab_cc
+ mov r12,r4                    @ coefficient pointer for dcordic_vec_step
+ movs r7,#1                    @ shift i
+ movs r6,#31                   @ complementary shift 32-i
+1:
+ bl dcordic_vec_step
+ adds r7,#1
+ subs r6,#1
+ cmp r7,#33
+ bne 1b                        @ 32 steps, i=1..32
+@ r0:r1   atan(y/x) Q62
+@ r8:r9   x residual Q62
+@ r10:r11 y residual Q62
+ mov r2,r9                     @ high word of x residual
+ mov r3,r10                    @ low word of y residual
+ subs r2,#12                   @ this makes atan(0)==0
+@ the following is basically a division residual y/x ~ atan(residual y/x)
+ movs r4,#1
+ lsls r4,#29                   @ weight of the first quotient bit
+ movs r7,#0
+2:
+ lsrs r2,#1
+ movs r3,r3                    @ preserve carry
+ bmi 1f                        @ residual negative: add instead of subtract
+ sbcs r3,r2
+ adds r0,r4
+ adcs r1,r7
+ lsrs r4,#1
+ bne 2b
+ b 3f
+1:
+ adcs r3,r2
+ subs r0,r4
+ sbcs r1,r7
+ lsrs r4,#1
+ bne 2b
+3:
+ lsls r6,r1,#31                @ shift r0:r1 down one place
+ asrs r1,#1
+ lsrs r0,#1
+ orrs r0,r6                    @ Q61
+
+12:
+ pop {r6}                      @ quadrant offset
+
+ cmp r6,#0
+ beq 1f                        @ offset 0: nothing to add
+ ldr r4,=#0x885A308D           @ π/2 Q61
+ ldr r5,=#0x3243F6A8
+ bpl 2f                        @ flags still from the cmp above
+ mvns r4,r4                    @ negative quadrant offset
+ mvns r5,r5
+2:
+ lsls r6,#31                   @ non-zero if offset is odd
+ bne 2f                        @ skip if quadrant offset is ±1
+ adds r0,r4                    @ offset ±2: add the constant twice
+ adcs r1,r5
+2:
+ adds r0,r4
+ adcs r1,r5
+1:
+ movs r2,#61
+ bl mufp_fix642double
+
+ bl pop_r8_r11
+ pop {r4-r7,r15}
+
+.ltorg
+
+dtab_cc:
+.word 0x61bb4f69, 0x1dac6705 @ atan 2^-1 Q62
+.word 0x96406eb1, 0x0fadbafc @ atan 2^-2 Q62
+.word 0xab0bdb72, 0x07f56ea6 @ atan 2^-3 Q62
+.word 0xe59fbd39, 0x03feab76 @ atan 2^-4 Q62
+.word 0xba97624b, 0x01ffd55b @ atan 2^-5 Q62
+.word 0xdddb94d6, 0x00fffaaa @ atan 2^-6 Q62
+.word 0x56eeea5d, 0x007fff55 @ atan 2^-7 Q62
+.word 0xaab7776e, 0x003fffea @ atan 2^-8 Q62
+.word 0x5555bbbc, 0x001ffffd @ atan 2^-9 Q62
+.word 0xaaaaadde, 0x000fffff @ atan 2^-10 Q62
+.word 0xf555556f, 0x0007ffff @ atan 2^-11 Q62
+.word 0xfeaaaaab, 0x0003ffff @ atan 2^-12 Q62
+.word 0xffd55555, 0x0001ffff @ atan 2^-13 Q62
+.word 0xfffaaaab, 0x0000ffff @ atan 2^-14 Q62
+.word 0xffff5555, 0x00007fff @ atan 2^-15 Q62
+.word 0xffffeaab, 0x00003fff @ atan 2^-16 Q62
+.word 0xfffffd55, 0x00001fff @ atan 2^-17 Q62
+.word 0xffffffab, 0x00000fff @ atan 2^-18 Q62
+.word 0xfffffff5, 0x000007ff @ atan 2^-19 Q62
+.word 0xffffffff, 0x000003ff @ atan 2^-20 Q62
+.word 0x00000000, 0x00000200 @ atan 2^-21 Q62 @ consider optimising these
+.word 0x00000000, 0x00000100 @ atan 2^-22 Q62
+.word 0x00000000, 0x00000080 @ atan 2^-23 Q62
+.word 0x00000000, 0x00000040 @ atan 2^-24 Q62
+.word 0x00000000, 0x00000020 @ atan 2^-25 Q62
+.word 0x00000000, 0x00000010 @ atan 2^-26 Q62
+.word 0x00000000, 0x00000008 @ atan 2^-27 Q62
+.word 0x00000000, 0x00000004 @ atan 2^-28 Q62
+.word 0x00000000, 0x00000002 @ atan 2^-29 Q62
+.word 0x00000000, 0x00000001 @ atan 2^-30 Q62
+.word 0x80000000, 0x00000000 @ atan 2^-31 Q62
+.word 0x40000000, 0x00000000 @ atan 2^-32 Q62
+
+.thumb_func
+mufp_dexp:                     @ double in r0:r1 -> exp, as a packed double in r0:r1
+ push {r4-r7,r14}
+ bl dunpacks
+ adr r4,dreddata1              @ reduction data for d=ln2
+ bl dreduce                    @ r0:r1=remainder Q62, r2=q
+ cmp r1,#0
+ bge 1f
+ ldr r4,=#0xF473DE6B           @ remainder negative: add ln2 and decrement q
+ ldr r5,=#0x2C5C85FD           @ ln2 Q62
+ adds r0,r4
+ adcs r1,r5
+ subs r2,#1
+1:
+ push {r2}                     @ save q, used as power of 2 at the end
+ movs r7,#1                    @ shift
+ adr r6,dtab_exp
+ movs r2,#0
+ movs r3,#1
+ lsls r3,#30                   @ x=1 Q62
+
+3:
+ ldmia r6!,{r4,r5}             @ r4:r5=table entry log(1+2^-i)
+ mov r12,r6                    @ stash table pointer, r6 is reused below
+ subs r0,r4                    @ trial subtraction from the argument
+ sbcs r1,r5
+ bmi 1f                        @ went negative: undo and skip this factor
+
+ rsbs r6,r7,#0
+ adds r6,#32                   @ complementary shift
+ movs r5,r3
+ asrs r5,r7
+ movs r4,r3
+ lsls r4,r6
+ movs r6,r2
+ lsrs r6,r7                    @ rounding bit in carry
+ orrs r4,r6
+ adcs r2,r4
+ adcs r3,r5                    @ x+=x>>i
+ b 2f
+
+1:
+ adds r0,r4                    @ restore argument
+ adcs r1,r5
+2:
+ mov r6,r12                    @ recover table pointer
+ adds r7,#1
+ cmp r7,#33
+ bne 3b                        @ 32 steps, i=1..32
+
+@ here
+@ r0:r1   ε (residual x, where x=a+ε) Q62, |ε|≤2^-32 (so fits in r0)
+@ r2:r3   exp a Q62
+@ and we wish to calculate exp x=exp a exp ε~(exp a)(1+ε)
+ muls32_32_64 r0,r3, r4,r1, r5,r6,r7,r4,r1
+@ r4:r1 ε exp a Q(62+62-32)=Q92
+ lsrs r4,#30                   @ shift r4:r1 down 30 places into r0:r1: Q92 -> Q62
+ lsls r0,r1,#2
+ orrs r0,r4
+ asrs r1,#30
+ adds r0,r2
+ adcs r1,r3
+
+ pop {r2}                      @ q
+ rsbs r2,#0
+ adds r2,#62                   @ places after point=62-q, which scales the result by 2^q
+ bl mufp_fix642double                 @ in principle we can pack faster than this because we know the exponent
+ pop {r4-r7,r15}
+
+.ltorg
+
+.thumb_func
+mufp_dln:                      @ double in r0:r1 -> natural log, as a packed double in r0:r1
+ push {r4-r7,r14}
+ lsls r7,r1,#1
+ bcs 5f                        @ <0 ...
+ asrs r7,#21
+ beq 5f                        @ ... or =0? return -Inf
+ adds r7,#1
+ beq 6f                        @ Inf/NaN? return +Inf
+ bl dunpacks
+ push {r2}                     @ save unbiased exponent
+ lsls r1,#9
+ lsrs r2,r0,#23
+ orrs r1,r2
+ lsls r0,#9
+@ r0:r1 m Q61 = m/2 Q62 0.5≤m/2<1
+
+ movs r7,#1                    @ shift
+ adr r6,dtab_exp
+ mov r12,r6                    @ table pointer lives in r12, r6 is reused below
+ movs r2,#0
+ movs r3,#0                    @ y=0 Q62
+
+3:
+ rsbs r6,r7,#0
+ adds r6,#32                   @ complementary shift
+ movs r5,r1
+ asrs r5,r7
+ movs r4,r1
+ lsls r4,r6
+ movs r6,r0
+ lsrs r6,r7
+ orrs r4,r6                    @ x>>i, rounding bit in carry
+ adcs r4,r0
+ adcs r5,r1                    @ x+(x>>i)
+
+ lsrs r6,r5,#30
+ bne 1f                        @ x+(x>>i)>1?
+ movs r0,r4
+ movs r1,r5                    @ x+=x>>i
+ mov r6,r12
+ ldmia r6!,{r4,r5}             @ r4:r5=table entry log(1+2^-i)
+ subs r2,r4
+ sbcs r3,r5                    @ y-=log(1+2^-i)
+
+1:
+ movs r4,#8
+ add r12,r4                    @ advance table pointer on every step, used or not
+ adds r7,#1
+ cmp r7,#33
+ bne 3b                        @ 32 steps, i=1..32
+@ here:
+@ r0:r1 residual x, nearly 1 Q62
+@ r2:r3 y ~ ln m/2 = ln m - ln2 Q62
+@ result is y + ln2 + ln x ~ y + ln2 + (x-1)
+ lsls r1,#2
+ asrs r1,#2                    @ x-1
+ adds r2,r0
+ adcs r3,r1
+
+ pop {r7}
+@ here:
+@ r2:r3 ln m/2 = ln m - ln2 Q62
+@ r7    unbiased exponent
+
+ adr r4,dreddata1+4            @ skip the reciprocal word: point at the three pieces of ln2
+ ldmia r4,{r0,r1,r4}           @ r0/r1/r4=low/middle/high 21-bit pieces
+ adds r7,#1                    @ exponent+1 supplies the +ln2 owed by y
+ muls r0,r7                    @ Q62
+ muls r1,r7                    @ Q41
+ muls r4,r7                    @ Q20
+ lsls r7,r1,#21                @ part of middle product belonging to the low word
+ asrs r1,#11                   @ part of middle product belonging to the high word
+ asrs r5,r1,#31                @ used as sign extension of the low product - assumes both products share a sign, TODO confirm
+ adds r0,r7
+ adcs r1,r5
+ lsls r7,r4,#10                @ high product aligned to the high word
+ asrs r4,#22                   @ and its overflow into a third word
+ asrs r5,r1,#31
+ adds r1,r7
+ adcs r4,r5
+@ r0:r1:r4 exponent*ln2 Q62
+ asrs r5,r3,#31                @ sign extension of y
+ adds r0,r2
+ adcs r1,r3
+ adcs r4,r5
+@ r0:r1:r4 result Q62
+ movs r2,#62
+1:
+ asrs r5,r1,#31
+ cmp r4,r5
+ beq 2f                        @ r4 a sign extension of r1?
+ lsrs r0,#4                    @ no: shift down 4 places and try again
+ lsls r6,r1,#28
+ orrs r0,r6
+ lsrs r1,#4
+ lsls r6,r4,#28
+ orrs r1,r6
+ asrs r4,#4
+ subs r2,#4                    @ four fewer places after the point
+ b 1b
+2:
+ bl mufp_fix642double
+ pop {r4-r7,r15}
+
+5:
+ ldr r1,=#0xfff00000           @ -Inf
+ movs r0,#0
+ pop {r4-r7,r15}
+
+6:
+ ldr r1,=#0x7ff00000           @ +Inf
+ movs r0,#0
+ pop {r4-r7,r15}
+
+
+.ltorg
+
+.align 2
+dreddata1:
+.word 0x0000B8AA    @ 1/ln 2 Q15
+.word 0x0013DE6B    @ ln2 Q62=2C5C85FDF473DE6B split into 21-bit pieces
+.word 0x000FEFA3
+.word 0x000B1721
+
+dtab_exp:
+.word 0xbf984bf3, 0x19f323ec   @ log 1+2^-1 Q62
+.word 0xcd4d10d6, 0x0e47fbe3   @ log 1+2^-2 Q62
+.word 0x8abcb97a, 0x0789c1db   @ log 1+2^-3 Q62
+.word 0x022c54cc, 0x03e14618   @ log 1+2^-4 Q62
+.word 0xe7833005, 0x01f829b0   @ log 1+2^-5 Q62
+.word 0x87e01f1e, 0x00fe0545   @ log 1+2^-6 Q62
+.word 0xac419e24, 0x007f80a9   @ log 1+2^-7 Q62
+.word 0x45621781, 0x003fe015   @ log 1+2^-8 Q62
+.word 0xa9ab10e6, 0x001ff802   @ log 1+2^-9 Q62
+.word 0x55455888, 0x000ffe00   @ log 1+2^-10 Q62
+.word 0x0aa9aac4, 0x0007ff80   @ log 1+2^-11 Q62
+.word 0x01554556, 0x0003ffe0   @ log 1+2^-12 Q62
+.word 0x002aa9ab, 0x0001fff8   @ log 1+2^-13 Q62
+.word 0x00055545, 0x0000fffe   @ log 1+2^-14 Q62
+.word 0x8000aaaa, 0x00007fff   @ log 1+2^-15 Q62
+.word 0xe0001555, 0x00003fff   @ log 1+2^-16 Q62
+.word 0xf80002ab, 0x00001fff   @ log 1+2^-17 Q62
+.word 0xfe000055, 0x00000fff   @ log 1+2^-18 Q62
+.word 0xff80000b, 0x000007ff   @ log 1+2^-19 Q62
+.word 0xffe00001, 0x000003ff   @ log 1+2^-20 Q62
+.word 0xfff80000, 0x000001ff   @ log 1+2^-21 Q62
+.word 0xfffe0000, 0x000000ff   @ log 1+2^-22 Q62
+.word 0xffff8000, 0x0000007f   @ log 1+2^-23 Q62
+.word 0xffffe000, 0x0000003f   @ log 1+2^-24 Q62
+.word 0xfffff800, 0x0000001f   @ log 1+2^-25 Q62
+.word 0xfffffe00, 0x0000000f   @ log 1+2^-26 Q62
+.word 0xffffff80, 0x00000007   @ log 1+2^-27 Q62
+.word 0xffffffe0, 0x00000003   @ log 1+2^-28 Q62
+.word 0xfffffff8, 0x00000001   @ log 1+2^-29 Q62
+.word 0xfffffffe, 0x00000000   @ log 1+2^-30 Q62
+.word 0x80000000, 0x00000000   @ log 1+2^-31 Q62
+.word 0x40000000, 0x00000000   @ log 1+2^-32 Q62
+
+mufp_lib_double_end:
diff --git a/bootrom/mufplib.S b/bootrom/mufplib.S
index 2aba2e2..bd52b84 100644
--- a/bootrom/mufplib.S
+++ b/bootrom/mufplib.S
@@ -26,8 +26,9 @@
 .global mufp_fsub
 .global mufp_fmul
 .global mufp_fdiv
-.global mufp_fcmp
-.global mufp_fcmp_flags
+.global mufp_fcmp_combined
+.global mufp_fcmp_fast
+.global mufp_fcmp_fast_flags
 .global mufp_fsqrt
 .global mufp_float2int
 .global mufp_float2fix
@@ -37,6 +38,10 @@
 .global mufp_fix2float
 .global mufp_uint2float
 .global mufp_ufix2float
+.global mufp_int642float
+.global mufp_fix642float
+.global mufp_uint642float
+.global mufp_ufix642float
 .global mufp_fcos
 .global mufp_fsin
 .global mufp_ftan
@@ -158,31 +163,6 @@ packx0:
  lsls r0,#10
  b 9b
 
-@ unpack two arguments (r0,r1) and shift one down to have common exponent; note that arguments are exchanged
-unpackxyalign:
- push {r14}
- bl unpackx
- bl xchxy
- bl unpackx
- lsls r0,r0,#6  @ Q29
- lsls r1,r1,#6  @ Q29
- subs r4,r2,r3  @ calculate shift
- bge 1f         @ x>=y?
- mov r2,r3      @ no: take common exponent from y
- rsbs r4,#0     @ make shift positive
- asrs r0,r4
- cmp r4,#32
- bhs 2f
- pop {r15}
-1:
- asrs r1,r4
- cmp r4,#32
- bhs 2f
- pop {r15}
-2:
- movs r0,#0    @ large shift, so result is zero
- pop {r15}
-
 .align 2
 .ltorg
 
@@ -213,40 +193,56 @@ mul0:
  bx r14
 
 .thumb_func
-mufp_fcmp:
- movs r2,#1      @ initialise result
- lsls r3,r2,#31  @ r3=0x80000000
- tst r0,r3       @ check sign of first argument
+mufp_fcmp_combined:
+@ *****
+@ WARNING: this code is required by the wrapper functions to preserve R3
+@ *****
+ lsls r2,r0,#1
+ lsrs r2,#24
  beq 1f
- subs r0,r3,r0   @ convert to 2's complement form for direct comparison
+ cmp r2,#0xff
+ bne 2f
 1:
- tst r1,r3       @ repeat for second argument
- beq 2f
- subs r1,r3,r1
+ lsrs r0,#23     @ clear mantissa if NaN or denormal
+ lsls r0,#23
 2:
- subs r0,r1     @ perform comparison
- beq 4f         @ equal? return 0
- bgt 3f         @ r0>r1? return +1
- rsbs r2,#0     @ r0<r1: return -1
-3:
- mov r0,r2
-4:
- bx r14
-
-.thumb_func
-mufp_fcmp_flags:
- movs r2, #1
- lsls r2,r2,#31  @ r3=0x80000000
- tst r0,r2       @ check sign of first argument
+ lsls r2,r1,#1
+ lsrs r2,#24
  beq 1f
- subs r0,r2,r0   @ convert to 2's complement form for direct comparison
+ cmp r2,#0xff
+ bne 2f
 1:
- tst r1,r2       @ repeat for second argument
- beq 2f
- subs r1,r2,r1
+ lsrs r1,#23     @ clear mantissa if NaN or denormal
+ lsls r1,#23
 2:
- subs r0,r1     @ perform comparison
+.thumb_func
+mufp_fcmp_fast_flags:
+.thumb_func
+mufp_fcmp_fast:
+ movs r2,#1      @ initialise result
+ eors r1,r0
+ bmi 4f          @ opposite signs? then can proceed on basis of sign of x
+ eors r1,r0      @ restore y
+ bpl 1f
+ rsbs r2,#0      @ both negative? flip comparison
+1:
+ cmp r0,r1
+ bgt 2f
+ blt 3f
+5:
+ movs r2,#0
+3:
+ rsbs r2,#0
+2:
+ subs r0,r2,#0
  bx r14
+4:
+ orrs r1,r0
+ adds r1,r1
+ beq 5b
+ cmp r0,#0
+ bge 2b
+ b 3b
 
 @ convert float to signed int, rounding towards -Inf, clamping
 .thumb_func
@@ -258,6 +254,9 @@ mufp_float2int:
 mufp_float2fix:
  push {r4,r14}
  bl unpackx
+ movs r3,r2
+ adds r3,#130
+ bmi 6f          @ -0?
  add r2,r1       @ incorporate binary point position into exponent
  subs r2,#23     @ r2 is now amount of left shift required
  blt 1f          @ requires right shift?
@@ -279,6 +278,9 @@ mufp_float2fix:
 2:
  asrs r0,r0,r2
  pop {r4,r15}
+6:
+ movs r0,#0
+ pop {r4,r15}
 
 @ unsigned version
 .thumb_func
@@ -300,6 +302,56 @@ mufp_float2ufix:
  lsls r0,r0,r2   @ result fits, left shifted
  pop {r4,r15}
 
+
+@ convert uint64 to float, rounding
+.thumb_func
+mufp_uint642float:
+ movs r2,#0       @ fall through
+
+@ convert unsigned 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
+.thumb_func
+mufp_ufix642float:
+ push {r4,r5,r14}
+ cmp r1,#0
+ bpl 3f          @ positive? we can use signed code
+ lsls r5,r1,#31  @ contribution to sticky bits
+ orrs r5,r0      @ together with the whole low word
+ lsrs r0,r1,#1   @ mantissa=high word shifted down one place so its top bit is clear
+ subs r2,#1      @ account for that one-place shift
+ b 4f
+
+@ convert int64 to float, rounding
+.thumb_func
+mufp_int642float:
+ movs r2,#0       @ fall through
+
+@ convert signed 64-bit fix to float, rounding; number of r0:r1 bits after point in r2
+.thumb_func
+mufp_fix642float:
+ push {r4,r5,r14}
+3:
+ movs r5,r0
+ orrs r5,r1
+ beq ret_pop45   @ zero? return +0
+ asrs r5,r1,#31  @ sign bits
+2:
+ asrs r4,r1,#24  @ try shifting 7 bits at a time
+ cmp r4,r5
+ bne 1f          @ next shift will overflow?
+ lsls r1,#7
+ lsrs r4,r0,#25  @ top 7 bits of low word move into the high word
+ orrs r1,r4
+ lsls r0,#7
+ adds r2,#7      @ seven more places after the point
+ b 2b
+1:
+ movs r5,r0      @ low word becomes the sticky bits
+ movs r0,r1      @ high word becomes the mantissa
+4:
+ rsbs r2,#0
+ adds r2,#32+29  @ presumably the exponent in the form packx expects - TODO confirm against packx
+ b packret
+
 @ convert signed int to float, rounding
 .thumb_func
 mufp_int2float:
@@ -316,6 +368,7 @@ packretns:       @ pack and return, sticky bits=0
  movs r5,#0
 packret:         @ common return point: "pack and return"
  bl packx
+ret_pop45:
  pop {r4,r5,r15}
 
 
@@ -344,14 +397,12 @@ mufp_ufix2float:
 @ r1: y
 @ r2: z/omega
 @ r3: coefficient pointer
-@ r4,r8: m
+@ r4,r12: m
 @ r5: i (shift)
 
 cordic_start: @ initialisation
- mov r7,r8
- push {r7}
  movs r5,#0   @ initial shift=0
- mov r8,r4
+ mov r12,r4
  b 5f
 
 cordic_vstep: @ one step of algorithm in vector mode
@@ -370,12 +421,11 @@ cordic_rstep: @ one step of algorithm in rotation mode
  adds r1,r6   @ positive rotation: y=y+(x>>i)
  subs r2,r4   @ accumulate angle
 2:
- mov r4,r8
+ mov r4,r12
  muls r7,r4   @ apply sign from m
  subs r0,r7   @ finish rotation: x=x{+/-}(y>>i)
 5:
- ldr r4,[r3]  @ fetch next angle from table
- adds r3,#4   @ bump pointer
+ ldmia r3!,{r4} @ fetch next angle from table and bump pointer
  lsrs r4,#1   @ repeated angle?
  bcs 3f
  adds r5,#1   @ adjust shift if not
@@ -399,15 +449,13 @@ cordic_rot:
  asrs r2,#3
  muls r6,r2        @ all remaining CORDIC steps in a multiplication
  muls r7,r2
- mov r4,r8
+ mov r4,r12
  muls r7,r4
  asrs r6,#12
  asrs r7,#12
  subs r0,r7        @ x=x{+/-}(yz>>k)
  adds r1,r6        @ y=y+(xz>>k)
 cordic_exit:
- pop {r7}
- mov r8,r7
  pop {r6,r7,r15}
 
 @ CORDIC vector mode
@@ -583,7 +631,41 @@ mufp_fln:
 .thumb_func
 mufp_fatan2:
  push {r4,r5,r14}
- bl unpackxyalign        @ convert to fixed point (ensure common exponent, which is discarded)
+
+@ unpack arguments and shift one down to have common exponent
+ bl unpackx
+ bl xchxy
+ bl unpackx
+ lsls r0,r0,#5  @ Q28
+ lsls r1,r1,#5  @ Q28
+ adds r4,r2,r3  @ this is -760 if both arguments are 0 and at least -380-126=-506 otherwise
+ asrs r4,#9
+ adds r4,#1
+ bmi 2f         @ force y to 0 proper, so result will be zero
+ subs r4,r2,r3  @ calculate shift
+ bge 1f         @ ex>=ey?
+ rsbs r4,#0     @ make shift positive
+ asrs r0,r4
+ cmp r4,#28
+ blo 3f
+ asrs r0,#31
+ b 3f
+1:
+ asrs r1,r4
+ cmp r4,#28
+ blo 3f
+2:
+@ here |x|>>|y| or both x and y are ±0
+ cmp r0,#0
+ bge 4f         @ x positive, return signed 0
+ ldr r0,pi_q29  @ x negative, return +/- pi
+ asrs r1,#31
+ eors r0,r1
+ b 7f
+4:
+ asrs r0,r1,#31
+ b 7f
+3:
  movs r2,#0              @ initial angle
  cmp r0,#0               @ x negative
  bge 5f
@@ -595,6 +677,8 @@ mufp_fatan2:
  movs r4,#1              @ m=1
  bl cordic_vec           @ also produces magnitude (with scaling factor 1.646760119), which is discarded
  mov r0,r2               @ result here is -pi/2..3pi/2 Q29
+@ asrs r2,#29
+@ subs r0,r2
  ldr r2,pi_q29           @ pi Q29
  adds r4,r0,r2           @ attempt to fix -3pi/2..-pi case
  bcs 6f                  @ -pi/2..0? leave result as is
@@ -603,6 +687,7 @@ mufp_fatan2:
  subs r0,r4,r2           @ >pi: take off 2pi
 6:
  subs r0,#1              @ fiddle factor so atan2(0,1)==0
+7:
  movs r2,#0              @ exponent for pack
  b packretns
 
@@ -802,11 +887,12 @@ fa_ye255:
 .align 2
 .thumb_func
 mufp_fmul:
- push {r4,r5,r6,r7,r14}
- mov r6,r0
- eors r6,r1     @ sign of result
- lsrs r6,#31
- lsls r6,#31
+ push {r7,r14}
+ mov r2,r0
+ eors r2,r1       @ sign of result
+ lsrs r2,#31
+ lsls r2,#31
+ mov r14,r2
  lsls r0,#1
  lsls r1,#1
  lsrs r2,r0,#24 @ xe
@@ -821,69 +907,74 @@ fm_xe:
 fm_ye:
  adds r7,r2,r3  @ exponent of result (will possibly be incremented)
  subs r7,#128   @ adjust bias for packing
- movs r2,#1
- lsls r2,#23
  lsls r0,#8     @ x mantissa
- lsls r1,#8
- lsrs r0,#9     @ y mantissa
+ lsls r1,#8       @ y mantissa
+ lsrs r0,#9
  lsrs r1,#9
- adds r0,r2     @ implied 1s
- adds r1,r2
 
- uxth r3,r0      @ Q23
- lsrs r4,r1,#16  @ Q7
- muls r3,r4      @ L*H, Q30
- lsrs r4,r0,#16  @ Q7
- uxth r5,r1      @ Q23
- muls r4,r5      @ H*L, Q30
- adds r3,r4      @ sum of middle partial products Q30
- uxth r4,r0
- muls r4,r5      @ L*L, Q46
- uxth r5,r4      @ initialise sticky bits from low half of low partial product
- lsrs r4,#16     @ Q30
- adds r3,r4      @ add high half of low partial product to sum of middle partial products Q30
-                 @ (cannot generate carry by limits on input arguments)
- lsrs r0,#16     @ Q7
- lsrs r1,#16     @ Q7
- muls r0,r1      @ H*H, Q14
- lsls r0,#9      @ high partial product Q23
- lsrs r1,r3,#7   @ middle partial products Q23
- adds r0,r1      @ result Q23
- lsrs r1,r0,#24
+ adds r2,r0,r1    @ for later
+ mov r12,r2
+ lsrs r2,r0,#7    @ x[22..7] Q16
+ lsrs r3,r1,#7    @ y[22..7] Q16
+ muls r2,r2,r3    @ result [45..14] Q32: never an overestimate and worst case error is 2*(2^7-1)*(2^23-2^7)+(2^7-1)^2 = 2130690049 < 2^31
+ muls r0,r0,r1    @ result [31..0] Q46
+ lsrs r2,#18      @ result [45..32] Q14
+ bcc 1f
+ cmp r0,#0
+ bmi 1f
+ adds r2,#1       @ fix error in r2
+1:
+ lsls r3,r0,#9    @ bits off bottom of result
+ lsrs r0,#23      @ Q23
+ lsls r2,#9
+ adds r0,r2       @ cut'n'shut
+ add r0,r12       @ implied 1*(x+y) to compensate for no insertion of implied 1s
+@ result-1 in r3:r0 Q23+32, i.e., in range [0,3)
+
+ lsrs r1,r0,#23
  bne fm_0        @ branch if we need to shift down one place
 @ here 1<=result<2
  cmp r7,#254
  bhs fm_3a       @ catches both underflow and overflow
- lsls r3,#26     @ sticky bits at top of R3, rounding bit in carry
+ lsls r3,#1       @ sticky bits at top of R3, rounding bit in carry
  bcc fm_1        @ no rounding
- adds r0,#1      @ assume we will round up
- orrs r5,r3      @ union of sticky bits
  beq fm_2        @ rounding tie?
+ adds r0,#1       @ round up
 fm_1:
+ adds r7,#1       @ for implied 1
  lsls r7,#23     @ pack result
  add r0,r7
- add r0,r6
- pop {r4,r5,r6,r7,r15}
+ add r0,r14
+ pop {r7,r15}
 fm_2:            @ rounding tie
+ adds r0,#1
+fm_3:
  lsrs r0,#1
  lsls r0,#1      @ clear bottom bit
  b fm_1
 
-@ here 2<=result<4
+@ here 1<=result-1<3
 fm_0:
  adds r7,#1      @ increment exponent
  cmp r7,#254
  bhs fm_3b       @ catches both underflow and overflow
  lsrs r0,#1      @ shift mantissa down
- bcc fm_1        @ no rounding
+ bcc fm_1a        @ no rounding
  adds r0,#1      @ assume we will round up
- lsls r3,#25     @ sticky bits
- orrs r5,r3
- beq fm_2        @ rounding tie?
- lsls r7,#23     @ pack result
+ cmp r3,#0        @ sticky bits
+ beq fm_3c        @ rounding tie?
+fm_1a:
+ adds r7,r7
+ adds r7,#1       @ for implied 1
+ lsls r7,#22      @ pack result
  add r0,r7
- add r0,r6
- pop {r4,r5,r6,r7,r15}
+ add r0,r14
+ pop {r7,r15}
+
+fm_3c:
+ lsrs r0,#1
+ lsls r0,#1       @ clear bottom bit
+ b fm_1a
 
 fm_xe0:
  subs r2,#16
@@ -900,31 +991,38 @@ fm_ye255:
 fm_3b:
  bge fm_4        @ branch on overflow
 @ trap case where result is denormal 0x007fffff + 0.5ulp or more
- lsrs r0,#1
+ adds r7,#1      @ exponent=-1?
+ bne fm_5
+@ corrected mantissa will be >= 3.FFFFFC (0x1fffffe Q23)
+@ so r0 >= 2.FFFFFC (0x17ffffe Q23)
+ adds r0,#2
+ lsrs r0,#23
+ cmp r0,#3
+ bne fm_5
  b fm_6
 
 fm_3a:
  bge fm_4        @ branch on overflow
 @ trap case where result is denormal 0x007fffff + 0.5ulp or more
-fm_6:
  adds r7,#1      @ exponent=-1?
  bne fm_5
- adds r0,#1      @ mantissa=0xffffff?
- lsrs r0,#24
+ adds r0,#1      @ mantissa=0xffffff (i.e., r0=0x7fffff)?
+ lsrs r0,#23
  beq fm_5
+fm_6:
  movs r0,#1      @ return smallest normal
  lsls r0,#23
- orrs r0,r6
- pop {r4,r5,r6,r7,r15}
+ add r0,r14
+ pop {r7,r15}
 
 fm_5:
- movs r0,r6
- pop {r4,r5,r6,r7,r15}
+ mov r0,r14
+ pop {r7,r15}
 fm_4:
  movs r0,#0xff
  lsls r0,#23
- orrs r0,r6
- pop {r4,r5,r6,r7,r15}
+ add r0,r14
+ pop {r7,r15}
 
 @ This version of the division algorithm uses external divider hardware to estimate the
 @ reciprocal of the divisor to about 14 bits; then a multiplication step to get a first
@@ -984,7 +1082,7 @@ fdiv_n:
  lsls r6,#31       @ sign of result in bit 31, other bits clear
 
 .if use_hw_div
-@ the above code takes long enough to guaratee the result is ready
+@ the above code takes long enough to guarantee the result is ready
  ldr r5,[r5,#DIV_QUOTIENT]
 .endif
 
@@ -1092,17 +1190,8 @@ retinf:
  bne retzero      @ exponent <-1? return 0
 @ here exponent is exactly -1
  lsrs r1,r4,#25
- bcs 11f          @ mantissa is 01000000?
- adds r1,r4,#1
- lsrs r1,#25
- bcc retzero
-@ here mantissa of result is 00ffffff
-@ bl dump
- lsls r1,r2,#9
- bne retzero      @ and was it a division by a power of 2?
-
+ bcc retzero      @ mantissa is not 01000000?
 @ return minimum normal
-11:
  movs r0,#1
  lsls r0,#23
  orrs r0,r6
@@ -1192,54 +1281,28 @@ mufp_fsqrt:
 sq_3:
  lsls r2,#23      @ pack exponent
  adds r0,r2,r3
+sq_6:
  pop {r4}
  bx r14
 
 sq_0:
- beq sq_4       @ -0: return it
-@ here negative and not -0: return an infinity
-sq_1:
- movs r0,#0xff
+ lsrs r1,#24
+ beq sq_2         @ -0: return it
+@ here negative and not -0: return -Inf
+ asrs r0,#31
+sq_5:
  lsls r0,#23
- pop {r4}
- bx r14
+ b sq_6
+sq_1:             @ +Inf
+ lsrs r0,#23
+ b sq_5
 sq_2:
- movs r0,#0
-sq_4:
- pop {r4}
- bx r14
+ lsrs r0,#31
+ lsls r0,#31
+ b sq_6
 
 @ round(sqrt(2^22./[72:16:248]))
 rsqrtapp:
 .byte 0xf1,0xda,0xc9,0xbb, 0xb0,0xa6,0x9e,0x97, 0x91,0x8b,0x86,0x82
 
 mufp_lib_end:
-
-// dump:
-//  push {r0-r7,r14}
-//  ldr r0,[r13,#0]
-//  bl o8hex
-//  bl osp
-//  ldr r0,[r13,#4]
-//  bl o8hex
-//  bl osp
-//  ldr r0,[r13,#8]
-//  bl o8hex
-//  bl osp
-//  ldr r0,[r13,#12]
-//  bl o8hex
-//  bl osp
-//  bl osp
-//  ldr r0,[r13,#16]
-//  bl o8hex
-//  bl osp
-//  ldr r0,[r13,#20]
-//  bl o8hex
-//  bl osp
-//  ldr r0,[r13,#24]
-//  bl o8hex
-//  bl osp
-//  ldr r0,[r13,#28]
-//  bl o8hex
-//  bl onl
-//  pop {r0-r7,r15}
diff --git a/bootrom/program_flash_generic.c b/bootrom/program_flash_generic.c
index a53e706..78510d5 100644
--- a/bootrom/program_flash_generic.c
+++ b/bootrom/program_flash_generic.c
@@ -228,10 +228,15 @@ void __noinline flash_exit_xip() {
         flash_cs_force(OUTOVER_LOW);
     }
 
-    // Restore IO/pad controls, and send 0xff, 0xff
+    // Restore IO/pad controls, and send 0xff, 0xff. Put pullup on IO2/IO3 as
+    // these may be used as WPn/HOLDn at this point, and we are now starting
+    // to issue serial commands.
 
     qspi_sd_padctrl->sd0 = padctrl_save;
     qspi_sd_padctrl->sd1 = padctrl_save;
+    padctrl_save = (padctrl_save
+        & ~PADS_QSPI_GPIO_QSPI_SD0_PDE_BITS
+    ) | PADS_QSPI_GPIO_QSPI_SD0_PUE_BITS;
     qspi_sd_padctrl->sd2 = padctrl_save;
     qspi_sd_padctrl->sd3 = padctrl_save;
 
diff --git a/bootrom/usb_boot_device.c b/bootrom/usb_boot_device.c
index a4b7ccb..97bccfa 100644
--- a/bootrom/usb_boot_device.c
+++ b/bootrom/usb_boot_device.c
@@ -124,8 +124,8 @@ static_assert(sizeof(boot_device_config) == sizeof(struct usb_configuration_desc
 static struct usb_interface msd_interface;
 
 #ifdef USE_PICOBOOT
-static struct usb_endpoint rpiboot_in, rpiboot_out;
-static struct usb_interface rpiboot_interface;
+static struct usb_endpoint picoboot_in, picoboot_out;
+static struct usb_interface picoboot_interface;
 #endif
 
 static const struct usb_device_descriptor boot_device_descriptor = {
@@ -174,60 +174,60 @@ const char *_get_descriptor_string(uint index) {
 
 #ifdef USE_PICOBOOT
 
-__rom_function_static_impl(void, _rpiboot_cmd_packet)(struct usb_endpoint *ep);
+__rom_function_static_impl(void, _picoboot_cmd_packet)(struct usb_endpoint *ep);
 
-static const struct usb_transfer_type _rpiboot_cmd_transfer_type = {
-        .on_packet = __rom_function_ref(_rpiboot_cmd_packet),
+static const struct usb_transfer_type _picoboot_cmd_transfer_type = {
+        .on_packet = __rom_function_ref(_picoboot_cmd_packet),
         .initial_packet_count = 1,
 };
 
-struct picoboot_cmd_status _rpiboot_current_cmd_status;
+struct picoboot_cmd_status _picoboot_current_cmd_status;
 
-static void _rpiboot_reset() {
+static void _picoboot_reset() {
     usb_debug("PICOBOOT RESET\n");
-    usb_soft_reset_endpoint(&rpiboot_out);
-    usb_soft_reset_endpoint(&rpiboot_in);
-    if (_rpiboot_current_cmd_status.bInProgress) {
+    usb_soft_reset_endpoint(&picoboot_out);
+    usb_soft_reset_endpoint(&picoboot_in);
+    if (_picoboot_current_cmd_status.bInProgress) {
         printf("command in progress so aborting flash\n");
         flash_abort();
     }
-    memset0(&_rpiboot_current_cmd_status, sizeof(_rpiboot_current_cmd_status));
+    memset0(&_picoboot_current_cmd_status, sizeof(_picoboot_current_cmd_status));
     // reset queue (note this also clears exclusive access)
     reset_queue(&virtual_disk_queue);
-    reset_queue(&rpiboot_queue);
+    reset_queue(&picoboot_queue);
 }
 
-struct async_task_queue rpiboot_queue;
+struct async_task_queue picoboot_queue;
 
-static void _tf_rpiboot_wait_command(__unused struct usb_endpoint *ep, __unused struct usb_transfer *transfer) {
-    usb_debug("_tf_rpiboot_wait_command\n");
+static void _tf_picoboot_wait_command(__unused struct usb_endpoint *ep, __unused struct usb_transfer *transfer) {
+    usb_debug("_tf_picoboot_wait_command\n");
     // todo check this at the end of an OUT ACK
-    usb_start_default_transfer_if_not_already_running_or_halted(&rpiboot_out);
+    usb_start_default_transfer_if_not_already_running_or_halted(&picoboot_out);
 }
 
-static void _rpiboot_ack() {
+static void _picoboot_ack() {
     static struct usb_transfer _ack_transfer;
-    _rpiboot_current_cmd_status.bInProgress = false;
-    usb_start_empty_transfer((_rpiboot_current_cmd_status.bCmdId & 0x80u) ? &rpiboot_out : &rpiboot_in, &_ack_transfer,
-                             _tf_rpiboot_wait_command);
+    _picoboot_current_cmd_status.bInProgress = false;
+    usb_start_empty_transfer((_picoboot_current_cmd_status.bCmdId & 0x80u) ? &picoboot_out : &picoboot_in, &_ack_transfer,
+                             _tf_picoboot_wait_command);
 }
 
-#define _tf_ack ((usb_transfer_completed_func)_rpiboot_ack)
+#define _tf_ack ((usb_transfer_completed_func)_picoboot_ack)
 
-static bool _rpiboot_setup_request_handler(__unused struct usb_interface *interface, struct usb_setup_packet *setup) {
+static bool _picoboot_setup_request_handler(__unused struct usb_interface *interface, struct usb_setup_packet *setup) {
     setup = __builtin_assume_aligned(setup, 4);
     if (USB_REQ_TYPE_TYPE_VENDOR == (setup->bmRequestType & USB_REQ_TYPE_TYPE_MASK)) {
         if (setup->bmRequestType & USB_DIR_IN) {
-            if (setup->bRequest == PICOBOOT_IF_CMD_STATUS && setup->wLength == sizeof(_rpiboot_current_cmd_status)) {
+            if (setup->bRequest == PICOBOOT_IF_CMD_STATUS && setup->wLength == sizeof(_picoboot_current_cmd_status)) {
                 uint8_t *buffer = usb_get_single_packet_response_buffer(usb_get_control_in_endpoint(),
-                                                                        sizeof(_rpiboot_current_cmd_status));
-                memcpy(buffer, &_rpiboot_current_cmd_status, sizeof(_rpiboot_current_cmd_status));
+                                                                        sizeof(_picoboot_current_cmd_status));
+                memcpy(buffer, &_picoboot_current_cmd_status, sizeof(_picoboot_current_cmd_status));
                 usb_start_single_buffer_control_in_transfer();
                 return true;
             }
         } else {
             if (setup->bRequest == PICOBOOT_IF_RESET) {
-                _rpiboot_reset();
+                _picoboot_reset();
                 usb_start_empty_control_in_transfer_null_completion();
                 return true;
             }
@@ -236,51 +236,51 @@ static bool _rpiboot_setup_request_handler(__unused struct usb_interface *interf
     return false;
 }
 
-static struct rpiboot_stream_transfer {
+static struct picoboot_stream_transfer {
     struct usb_stream_transfer stream;
     struct async_task task;
-} _rpiboot_stream_transfer;
+} _picoboot_stream_transfer;
 
 static void _atc_ack(struct async_task *task) {
-    if (task->rpiboot_user_token == _rpiboot_stream_transfer.task.rpiboot_user_token) {
+    if (task->picoboot_user_token == _picoboot_stream_transfer.task.picoboot_user_token) {
         usb_warn("atc_ack\n");
-        _rpiboot_ack();
+        _picoboot_ack();
     } else {
-        usb_warn("atc for wrong rpiboot token %08x != %08x\n", (uint) task->rpiboot_user_token,
-                 (uint) _rpiboot_stream_transfer.task.rpiboot_user_token);
+        usb_warn("atc for wrong picoboot token %08x != %08x\n", (uint) task->picoboot_user_token,
+                 (uint) _picoboot_stream_transfer.task.picoboot_user_token);
     }
 }
 
 static void _set_cmd_status(uint32_t status) {
-    _rpiboot_current_cmd_status.dStatusCode = status;
+    _picoboot_current_cmd_status.dStatusCode = status;
 }
 
 static void _atc_chunk_task_done(struct async_task *task) {
-    if (task->rpiboot_user_token == _rpiboot_stream_transfer.task.rpiboot_user_token) {
+    if (task->picoboot_user_token == _picoboot_stream_transfer.task.picoboot_user_token) {
         // save away result
         _set_cmd_status(task->result);
         if (task->result) {
-            usb_halt_endpoint(_rpiboot_stream_transfer.stream.ep);
-            _rpiboot_current_cmd_status.bInProgress = false;
+            usb_halt_endpoint(_picoboot_stream_transfer.stream.ep);
+            _picoboot_current_cmd_status.bInProgress = false;
         }
         // we update the position of the original task which will be submitted again in on_stream_chunk
-        _rpiboot_stream_transfer.task.transfer_addr += task->data_length;
-        usb_stream_chunk_done(&_rpiboot_stream_transfer.stream);
+        _picoboot_stream_transfer.task.transfer_addr += task->data_length;
+        usb_stream_chunk_done(&_picoboot_stream_transfer.stream);
     }
 }
 
-__rom_function_static_impl(bool, _rpiboot_on_stream_chunk)(uint32_t chunk_len __comma_removed_for_space(
+__rom_function_static_impl(bool, _picoboot_on_stream_chunk)(uint32_t chunk_len __comma_removed_for_space(
         struct usb_stream_transfer *transfer)) {
-    assert(transfer == &_rpiboot_stream_transfer.stream);
+    assert(transfer == &_picoboot_stream_transfer.stream);
     assert(chunk_len <= FLASH_PAGE_SIZE);
-    _rpiboot_stream_transfer.task.data_length = chunk_len;
-    queue_task(&rpiboot_queue, &_rpiboot_stream_transfer.task, _atc_chunk_task_done);
+    _picoboot_stream_transfer.task.data_length = chunk_len;
+    queue_task(&picoboot_queue, &_picoboot_stream_transfer.task, _atc_chunk_task_done);
     // for subsequent tasks, check the mutation source
-    _rpiboot_stream_transfer.task.check_last_mutation_source = true;
+    _picoboot_stream_transfer.task.check_last_mutation_source = true;
     return true;
 }
 
-static void _rpiboot_cmd_packet_internal(struct usb_endpoint *ep) {
+static void _picoboot_cmd_packet_internal(struct usb_endpoint *ep) {
     struct usb_buffer *buffer = usb_current_out_packet_buffer(ep);
     uint len = buffer->data_len;
 
@@ -288,16 +288,16 @@ static void _rpiboot_cmd_packet_internal(struct usb_endpoint *ep) {
     if (len == sizeof(struct picoboot_cmd) && cmd->dMagic == PICOBOOT_MAGIC) {
         // pre-init even if we don't need it
         static uint32_t real_token;
-        reset_task(&_rpiboot_stream_transfer.task);
-        _rpiboot_stream_transfer.task.token = --real_token; // we go backwards to disambiguate with MSC tasks
-        _rpiboot_stream_transfer.task.rpiboot_user_token = cmd->dToken;
-        _rpiboot_current_cmd_status.bCmdId = cmd->bCmdId;
-        _rpiboot_current_cmd_status.dToken = cmd->dToken;
-        _rpiboot_current_cmd_status.bInProgress = false;
+        reset_task(&_picoboot_stream_transfer.task);
+        _picoboot_stream_transfer.task.token = --real_token; // we go backwards to disambiguate with MSC tasks
+        _picoboot_stream_transfer.task.picoboot_user_token = cmd->dToken;
+        _picoboot_current_cmd_status.bCmdId = cmd->bCmdId;
+        _picoboot_current_cmd_status.dToken = cmd->dToken;
+        _picoboot_current_cmd_status.bInProgress = false;
         _set_cmd_status(PICOBOOT_UNKNOWN_CMD);
-        _rpiboot_stream_transfer.task.transfer_addr = _rpiboot_stream_transfer.task.erase_addr = cmd->range_cmd.dAddr;
-        _rpiboot_stream_transfer.task.erase_size = cmd->range_cmd.dSize;
-        _rpiboot_stream_transfer.task.exclusive_param = cmd->exclusive_cmd.bExclusive;
+        _picoboot_stream_transfer.task.transfer_addr = _picoboot_stream_transfer.task.erase_addr = cmd->range_cmd.dAddr;
+        _picoboot_stream_transfer.task.erase_size = cmd->range_cmd.dSize;
+        _picoboot_stream_transfer.task.exclusive_param = cmd->exclusive_cmd.bExclusive;
         static_assert(
                 offsetof(struct picoboot_cmd, range_cmd.dAddr) == offsetof(struct picoboot_cmd, address_only_cmd.dAddr),
                 ""); // we want transfer_addr == exec_cmd.addr also
@@ -337,33 +337,33 @@ static void _rpiboot_cmd_packet_internal(struct usb_endpoint *ep) {
                 }
                 if (cmd->bCmdId == PC_REBOOT) {
                     safe_reboot(cmd->reboot_cmd.dPC, cmd->reboot_cmd.dSP, cmd->reboot_cmd.dDelayMS);
-                    return _rpiboot_ack();
+                    return _picoboot_ack();
                 }
                 if (type) {
-                    _rpiboot_stream_transfer.task.type = type;
-                    _rpiboot_stream_transfer.task.source = TASK_SOURCE_PICOBOOT;
-                    _rpiboot_current_cmd_status.bInProgress = true;
+                    _picoboot_stream_transfer.task.type = type;
+                    _picoboot_stream_transfer.task.source = TASK_SOURCE_PICOBOOT;
+                    _picoboot_current_cmd_status.bInProgress = true;
                     if (cmd->dTransferLength) {
                         static uint8_t _buffer[FLASH_PAGE_SIZE];
-                        static const struct usb_stream_transfer_funcs _rpiboot_stream_funcs = {
+                        static const struct usb_stream_transfer_funcs _picoboot_stream_funcs = {
                                 .on_packet_complete = usb_stream_noop_on_packet_complete,
-                                .on_chunk = __rom_function_ref(_rpiboot_on_stream_chunk)
+                                .on_chunk = __rom_function_ref(_picoboot_on_stream_chunk)
                         };
 
-                        _rpiboot_stream_transfer.task.data = _buffer;
-                        usb_stream_setup_transfer(&_rpiboot_stream_transfer.stream,
-                                                  &_rpiboot_stream_funcs, _buffer, FLASH_PAGE_SIZE,
+                        _picoboot_stream_transfer.task.data = _buffer;
+                        usb_stream_setup_transfer(&_picoboot_stream_transfer.stream,
+                                                  &_picoboot_stream_funcs, _buffer, FLASH_PAGE_SIZE,
                                                   cmd->dTransferLength,
                                                   _tf_ack);
                         if (type & AT_WRITE) {
-                            _rpiboot_stream_transfer.stream.ep = &rpiboot_out;
-                            return usb_chain_transfer(&rpiboot_out, &_rpiboot_stream_transfer.stream.core);
+                            _picoboot_stream_transfer.stream.ep = &picoboot_out;
+                            return usb_chain_transfer(&picoboot_out, &_picoboot_stream_transfer.stream.core);
                         } else {
-                            _rpiboot_stream_transfer.stream.ep = &rpiboot_in;
-                            return usb_start_transfer(&rpiboot_in, &_rpiboot_stream_transfer.stream.core);
+                            _picoboot_stream_transfer.stream.ep = &picoboot_in;
+                            return usb_start_transfer(&picoboot_in, &_picoboot_stream_transfer.stream.core);
                         }
                     }
-                    return queue_task(&rpiboot_queue, &_rpiboot_stream_transfer.task, _atc_ack);
+                    return queue_task(&picoboot_queue, &_picoboot_stream_transfer.task, _atc_ack);
                 }
                 _set_cmd_status(PICOBOOT_INVALID_TRANSFER_LENGTH);
             } else {
@@ -371,12 +371,12 @@ static void _rpiboot_cmd_packet_internal(struct usb_endpoint *ep) {
             }
         }
     }
-    usb_halt_endpoint(&rpiboot_in);
-    usb_halt_endpoint(&rpiboot_out);
+    usb_halt_endpoint(&picoboot_in);
+    usb_halt_endpoint(&picoboot_out);
 }
 
-__rom_function_static_impl(void, _rpiboot_cmd_packet)(struct usb_endpoint *ep) {
-    _rpiboot_cmd_packet_internal(ep);
+__rom_function_static_impl(void, _picoboot_cmd_packet)(struct usb_endpoint *ep) {
+    _picoboot_cmd_packet_internal(ep);
     usb_packet_done(ep);
 }
 
@@ -390,7 +390,7 @@ static void _usb_boot_on_configure(struct usb_device *device, bool configured) {
 #endif
     msc_on_configure(device, configured);
 #ifdef USE_PICOBOOT
-    if (configured) _rpiboot_reset();
+    if (configured) _picoboot_reset();
 #endif
 }
 
@@ -431,7 +431,7 @@ void usb_boot_device_init(uint32_t _usb_disable_interface_mask) {
     _write_six_msb_hex_chars(serial_number_string + 6, software_git_revision);
 
     const struct boot_device_config *config_desc = &boot_device_config;
-    uint rpiboot_interface_num = 1;
+    uint picoboot_interface_num = 1;
 #ifdef USB_BOOT_WITH_SUBSET_OF_INTERFACES
     // if we are disabling interfaces
     if (usb_disable_interface_mask) {
@@ -441,7 +441,7 @@ void usb_boot_device_init(uint32_t _usb_disable_interface_mask) {
         static_assert(sizeof(_single_interface_config) ==
                       sizeof(struct usb_configuration_descriptor) + sizeof(struct usb_simple_interface_descriptor), "");
         if (usb_disable_interface_mask & 1u) {
-            rpiboot_interface_num = 0;
+            picoboot_interface_num = 0;
             memcpy(&_single_interface_config.interface_desc[0], &boot_device_config.interface_desc[1],
                    sizeof(struct usb_simple_interface_descriptor));
         }
@@ -461,23 +461,23 @@ void usb_boot_device_init(uint32_t _usb_disable_interface_mask) {
     }
 #ifdef USE_PICOBOOT
     if (!(usb_disable_interface_mask & 2u)) {
-        static struct usb_endpoint *const rpiboot_endpoints[] = {
-                &rpiboot_out,
-                &rpiboot_in,
+        static struct usb_endpoint *const picoboot_endpoints[] = {
+                &picoboot_out,
+                &picoboot_in,
         };
-        usb_interface_init(&rpiboot_interface, &config_desc->interface_desc[rpiboot_interface_num].desc,
-                           rpiboot_endpoints, count_of(rpiboot_endpoints), true);
-        static struct usb_transfer _rpiboot_cmd_transfer;
-        _rpiboot_cmd_transfer.type = &_rpiboot_cmd_transfer_type;
-        usb_set_default_transfer(&rpiboot_out, &_rpiboot_cmd_transfer);
-        rpiboot_interface.setup_request_handler = _rpiboot_setup_request_handler;
+        usb_interface_init(&picoboot_interface, &config_desc->interface_desc[picoboot_interface_num].desc,
+                           picoboot_endpoints, count_of(picoboot_endpoints), true);
+        static struct usb_transfer _picoboot_cmd_transfer;
+        _picoboot_cmd_transfer.type = &_picoboot_cmd_transfer_type;
+        usb_set_default_transfer(&picoboot_out, &_picoboot_cmd_transfer);
+        picoboot_interface.setup_request_handler = _picoboot_setup_request_handler;
     }
 #endif
 
     static struct usb_interface *const boot_device_interfaces[] = {
             &msd_interface,
 #ifdef USE_PICOBOOT
-            &rpiboot_interface,
+            &picoboot_interface,
 #endif
     };
     static_assert(count_of(boot_device_interfaces) == BOOT_DEVICE_NUM_INTERFACES, "");
diff --git a/generator/main.c b/generator/main.c
index 24ae81d..a68861a 100644
--- a/generator/main.c
+++ b/generator/main.c
@@ -1,3 +1,9 @@
+/**
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
 #include <stdio.h>
 #include <stdbool.h>
 #include <stdint.h>
diff --git a/pico_sdk b/pico_sdk
index bfcbefa..26653ea 160000
--- a/pico_sdk
+++ b/pico_sdk
@@ -1 +1 @@
-Subproject commit bfcbefafc5d2a210551a4d9d80b4303d4ae0adf7
+Subproject commit 26653ea81e340cacee55025d110c3e014a252a87
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
new file mode 100644
index 0000000..9235e4d
--- /dev/null
+++ b/test/CMakeLists.txt
@@ -0,0 +1,35 @@
+# Test for the bit manipulation routines; the routines themselves are assembled
+# straight from the bootrom source file.
+add_executable(bit_functions_test
+        bit_functions_test.c
+        ../bootrom/bit_functions.S)
+target_link_libraries(bit_functions_test PRIVATE pico_stdlib)
+# Preprocessor switches applied to this target's sources (C test and assembly).
+target_compile_definitions(bit_functions_test PRIVATE
+        NEW_BIT_FUNCTIONS=1
+        USE_CLZ32
+        USE_CTZ32
+        USE_POPCOUNT32
+        USE_REVERSE32)
+pico_add_extra_outputs(bit_functions_test)
+
+# Test built from mem_functions_test.c together with the bootrom's bootrom_misc.S.
+add_executable(mem_functions_test
+        mem_functions_test.c
+        ../bootrom/bootrom_misc.S)
+target_link_libraries(mem_functions_test PRIVATE pico_stdlib)
+pico_add_extra_outputs(mem_functions_test)
+
+# The float and double ROM test programs are configured identically, so both
+# are set up by one loop; each is a single source file named after its target.
+foreach(rom_test IN ITEMS tc_rom_float tc_rom_double)
+    add_executable(${rom_test} ${rom_test}.c)
+
+    # Use the compiler's float and double support for the test's own C
+    # arithmetic: it must not be redirected through the bootrom.
+    pico_set_float_implementation(${rom_test} compiler)
+    pico_set_double_implementation(${rom_test} compiler)
+
+    target_link_libraries(${rom_test} PRIVATE pico_stdlib)
+    pico_add_extra_outputs(${rom_test})
+endforeach()
diff --git a/test/bit_functions_test.c b/test/bit_functions_test.c
new file mode 100644
index 0000000..1449e0b
--- /dev/null
+++ b/test/bit_functions_test.c
@@ -0,0 +1,199 @@
+/**
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "pico/stdlib.h"
+#include "pico/bootrom.h"
+#include "tictoc.h"
+
+extern uint32_t popcount32(uint32_t v);
+extern uint32_t reverse32(uint32_t v);
+extern uint32_t clz32(uint32_t v);
+extern uint32_t ctz32(uint32_t v);
+
+#define ASSERT(x) if (!(x)) { panic("ASSERT: %s l %d: %s\n" , __FILE__, __LINE__, #x); } /* #x is passed as an argument so a '%' in the expression is never parsed as a format */
+
+static bool roughly_ascii(uint x) {
+    return !(x < 32 || x >= 127); // true only for values in the range 32..126
+}
+
+static int check_table(uint16_t *table, uint align_mask, uint align_value) {
+    static uint16_t entries[256];
+    uint32_t (*table_lookup)(uint16_t *, uint32_t) = (uint32_t (*)(uint16_t *, uint32_t))(uint32_t)(*(uint16_t *)0x18);
+    int n = 0;
+    while (*table && n != count_of(entries)) {
+        printf("Checking %c%c %d %d\n", table[0], table[0]>>8u, align_mask, align_value);
+        for(int i=0; i<n;i++) {
+            ASSERT(entries[i] != table[0]); // disallow duplicate table codes
+        }
+        ASSERT(roughly_ascii(table[0] & 0xffu));
+        ASSERT(roughly_ascii(table[0] >> 8u));
+        if ((table[0] & 0xffu) != 'F' || (table[0] >> 8u) != 'Z') // this is byte aligned, so no check
+            ASSERT(align_value == (table[1] & align_mask)); // check alignment
+        entries[n++] = table[0];
+        ASSERT(table_lookup(table, table[0]) == table[1]); // check that looking up the value works correctly
+        table += 2;
+    }
+    ASSERT(n); // table should not be empty
+    ASSERT(n != count_of(entries)); // table should terminate relatively quickly
+    ASSERT(0 == table_lookup(table, rom_table_code('Z','Z')));
+    return 0;
+}
+
+// slightly slower than ours, but known to be good
+static uint32_t __noinline bit_reverse(uint32_t i) {
+    i = ((i >> 1u) & 0x55555555u) | ((i & 0x55555555u) << 1u); // swap adjacent bits
+    i = ((i >> 2u) & 0x33333333u) | ((i & 0x33333333u) << 2u); // swap adjacent bit pairs
+    i = ((i >> 4u) & 0x0f0f0f0fu) | ((i & 0x0f0f0f0fu) << 4u); // swap the nibbles of each byte
+    return __bswap32(i); // reversing the byte order completes the 32-bit reversal
+}
+
+void check_popcount32(uint32_t i, uint32_t o) {
+    ASSERT(o == __builtin_popcount(i));
+}
+
+void check_reverse32(uint32_t i, uint32_t o) {
+    ASSERT(o == bit_reverse(i));
+}
+
+void check_clz32(uint32_t i, uint32_t o) {
+    if (i && o != __builtin_clz(i)) {
+        panic("INPUT %08x EXPECTED %08x GOT %08x\n", i, __builtin_clz(i), o);
+    }
+}
+
+void check_ctz32(uint32_t i, uint32_t o) {
+    if (i && o != __builtin_ctz(i)) { // input 0 is not checked here
+        panic("INPUT %08x EXPECTED %08x GOT %08x\n", i, __builtin_ctz(i), o); // report the ctz (not clz) reference value
+    }
+}
+
+static int check_bit_function_internal(uint32_t i, uint32_t (*test_fn)(uint32_t), void (*check_fn)(uint32_t, uint32_t))
+{
+    uint32_t v = test_fn(i);
+    if (check_fn) {
+        check_fn(i, v);
+    }
+    return 0;
+}
+
+static int check_bit_function(uint32_t i, uint32_t (*test_fn)(uint32_t), void (*check_fn)(uint32_t, uint32_t), bool bitreverse_input) {
+    check_bit_function_internal(i, test_fn, check_fn);
+    check_bit_function_internal(i ^ 0xffffffffu, test_fn, check_fn);
+    if (bitreverse_input) {
+        uint32_t rev = bit_reverse(i);
+        check_bit_function_internal(rev, test_fn, check_fn);
+        check_bit_function_internal(rev ^ 0xffffffffu, test_fn, check_fn);
+    }
+    return 0;
+}
+
+__attribute__((naked)) uint32_t empty_fn() {
+    asm ("bx lr");
+}
+
+// note this is not quite exhaustive; it does not test input 0
+#define run_exhaustive(name, func, alternate_func) \
+{ \
+    printf("Exhaustive %s...\n", name); \
+    absolute_time_t t = get_absolute_time(); \
+    for(uint32_t i=0xffffffff; i>0; i--) { \
+        if (func(i) != alternate_func(i)) { \
+            panic("%s failed at %08x\n", name, i); \
+        } \
+        if (!(i&0xfffff)) { \
+            int64_t elapsed = time_diff(t, get_absolute_time()); \
+            int64_t expected = 4096 * elapsed / (4096 - (i>>20)); \
+            int32_t remaining_secs = (expected - elapsed) / 1000000; \
+            printf("\r%d %ds    ", (i>>20), remaining_secs); \
+        } \
+    } \
+    printf("\n"); \
+}
+
+uint32_t __42(uint32_t x) {
+    return 42;
+}
+
+int __time_critical_func(main)()
+{
+    setup_default_uart();
+
+    srand(0xf005ba11);
+
+    uint16_t *func_table = (uint16_t *)(uint32_t)*(uint16_t *) 0x14;
+    uint16_t *data_table = (uint16_t *)(uint32_t)*(uint16_t *) 0x16;
+
+    check_table(func_table, 1, 1); // odd values only
+    // todo this is not true - we should check they are naturally aligned
+    // convention is for these to be word aligned
+    check_table(data_table, 3, 0);
+
+    extern uint32_t __clzsi2(uint32_t);
+    extern uint32_t __ctzsi2(uint32_t);
+    extern uint32_t __popcountsi2(uint32_t);
+    static struct {
+        const char *name;
+        uint32_t (*func)(uint32_t);
+        void (*check)(uint32_t, uint32_t);
+    } tests[] = {
+            { "empty", empty_fn, NULL, }, // entry 0 is the call-overhead baseline subtracted in the timing report
+            { "popcount32", popcount32, check_popcount32, },
+            { "reverse32", reverse32, check_reverse32, },
+            { "clz32", clz32, check_clz32, },
+            { "ctz32", ctz32, check_ctz32, },
+            { "__clzsi2", __clzsi2, NULL, },
+            { "__ctzsi2", __ctzsi2, NULL, },
+            { "__popcountsi2", __popcountsi2, NULL, },
+    };
+    int n_checks = 1 << 16;
+    for (int t = 1; t<5;t++) { // entries 1..4 are the functions under test that have a checker
+        printf("Testing %s...\n", tests[t].name);
+        for(int i = 0; i <= n_checks; i++) {
+            check_bit_function(i, tests[t].func, tests[t].check, true);
+            check_bit_function(-i, tests[t].func, tests[t].check, true);
+            check_bit_function((uint) rand(), tests[t].func, tests[t].check, true);
+        }
+    }
+
+    ASSERT(clz32(0) == 32);
+    ASSERT(clz32(-1) == 0);
+    ASSERT(ctz32(0) == 32);
+    ASSERT(ctz32(-1) == 0);
+
+    tictoc_init(); // timing
+    n_checks = 1 << 13; // about the max we can test before rollover
+    uint32_t *rands = (uint32_t *)calloc(n_checks, sizeof(uint32_t));
+    ASSERT(rands); // the timing loop below dereferences this buffer
+    for(int i=0;i<n_checks;i++) rands[i] = (uint32_t)rand();
+    uint32_t times[count_of(tests)];
+    for (int t = 0; t<count_of(tests);t++) {
+        printf("Timing %s...\n", tests[t].name);
+        uint32_t cycle = cyc();
+        for(int i = 0; i < n_checks; i++) { // rands[] holds exactly n_checks entries
+            check_bit_function(i, tests[t].func, NULL, false);
+            check_bit_function(-i, tests[t].func, NULL, false);
+            check_bit_function(rands[i], tests[t].func, NULL, false); // a different random input each iteration
+        }
+        times[t] = cyc() - cycle;
+        printf("   %08x\n", (int)times[t]);
+    }
+    for (int t = 0; t<count_of(tests);t++) {
+        int32_t delta = (times[t] - times[0])>>8u;
+        double clocks = delta / (6.0 * n_checks); // 3 check_bit_function calls x 2 invocations each per iteration
+        printf("%s %2.2g cycles\n", tests[t].name, clocks);
+    }
+    free(rands);
+
+#if 0
+    run_exhaustive("clz32", clz32, __clzsi2);
+    run_exhaustive("ctz32", ctz32, __ctzsi2);
+    run_exhaustive("popcount32", popcount32, __popcountsi2);
+    run_exhaustive("reverse32", reverse32, bit_reverse);
+#endif
+    printf("Done\n");
+}
diff --git a/test/mem_functions_test.c b/test/mem_functions_test.c
new file mode 100644
index 0000000..d35b28e
--- /dev/null
+++ b/test/mem_functions_test.c
@@ -0,0 +1,289 @@
+/**
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <string.h>
+#include <stdio.h>
+#include "pico/stdlib.h"
+#include "tictoc.h"
+
+#define ASSERT(x) if (!(x)) { panic("ASSERT: %s l %d: %s\n" , __FILE__, __LINE__, #x); } /* #x is passed as an argument so a '%' in the expression is never parsed as a format */
+
+typedef void *(*memcpy_func)(void *a, void *b, uint len);
+memcpy_func memcpy_general, memcpy_44;
+
+__attribute__((naked)) void *memcpy_slow(void *a, void *b, uint c) {
+    asm (
+    "cmp r2, #0\n"
+    "beq 1f\n"
+    "mov ip, r0\n"
+    "2:\n"
+    "sub r2, #1\n"
+    "ldrb r3, [r1, r2]\n"
+    "strb r3, [r0, r2]\n"
+    "bne 2b\n"
+    "mov r0, ip\n"
+    "1:\n"
+    "bx lr\n"
+    );
+}
+
+__attribute__((naked)) void *memset_slow(void *a, int v, uint c) {
+    asm(
+    "cmp r2, #0\n"
+    "beq 1f\n"
+    "mov ip, r0\n"
+    "2: \n"
+    "sub r2, #1\n"
+    "strb r1, [r0, r2] \n"
+    "bne 2b\n"
+    "mov r0, ip\n"
+    "1:\n"
+    "bx lr \n"
+    );
+}
+
+
+#define TIC t1=cyc();
+#define TOC(x) t1=cyc()-t1; x=t1>>8u; x-=3; // timing overhead
+
+void * __noinline tictoc_memcpy_slow(uint8_t *a, uint8_t *b, uint c, uint32_t *t) {
+uint t1 = 0;
+TIC;
+void *rc = memcpy_slow(a, b, c);
+TOC(*t);
+return rc;
+}
+
+void * __noinline tictoc_memcpy_general(uint8_t *a, uint8_t *b, uint c, uint32_t *t) {
+uint t1 = 0;
+TIC;
+void *rc = memcpy_general(a, b, c);
+TOC(*t);
+return rc;
+}
+
+void * __noinline tictoc_memcpy(uint8_t *a, uint8_t *b, uint c, uint32_t *t) {
+uint t1 = 0;
+TIC;
+void *rc = memcpy(a, b, c);
+TOC(*t);
+return rc;
+}
+
+static inline uint64_t rotl(const uint64_t x, int k) {
+    return (x >> (64 - k)) | (x << k); // rotate x left by k bits; xrand() only passes 23 and 45
+}
+
+static uint64_t s[4] = {0x5a5a5a5a5a5a5a5au, 0xf005ba11deadbeefu, 0xb007c0d3caf3bab3u, 0x0123456789abcdefu};
+
+uint64_t xrand(void) {
+    const uint64_t result = rotl(s[0] + s[3], 23) + s[0];
+
+    const uint64_t t = s[1] << 17;
+
+    s[2] ^= s[0];
+    s[3] ^= s[1];
+    s[1] ^= s[2];
+    s[0] ^= s[3];
+
+    s[2] ^= t;
+
+    s[3] = rotl(s[3], 45);
+
+    return result;
+}
+
+void *__memcpy(void *a, void *b, uint len);
+void *__memcpy_44(void *a, void *b, uint len);
+
+static int check_memcpy() {
+    printf("------------------- MEMCPY ---------------------\n");
+    int ret = 0;
+    memcpy_general = __memcpy;
+    memcpy_44 = __memcpy_44;
+    if (!memcpy_general || !memcpy_44) {
+        printf("Doh\n");
+        ret = 1;
+    }
+
+    static uint8_t source[1024];
+    static uint8_t desta[1024];
+    static uint8_t destb[1024];
+    static uint8_t destc[1024];
+    for(int len = 0 ; !ret && len < 1020; len++) {
+        for(int i = 0; i<len + 4;i++) {
+            source[i] = xrand();
+        }
+        for(int src_off = 0; !ret && src_off < 4; src_off++) {
+            for(int dst_off = 0; !ret && dst_off < 4; dst_off++) {
+                memcpy_func mc;
+                for(int v = 0; v < 3; v++) {
+                    switch (v) {
+                        case 0:
+                            mc = memcpy_slow;
+                            break;
+                        case 1:
+                            mc = memcpy_general;
+                            break;
+                        default:
+                            if (src_off || dst_off) {
+                                continue;
+                            }
+                            mc = memcpy_44;
+                    }
+                    memset(desta, 0, len);
+                    memset(destb, 0, len);
+                    void *a = memcpy(desta + dst_off, source + src_off, len);
+//                    if (v == 2 && dst_off == 0 && len == 4) {
+//                        __breakpoint();
+//                    }
+//
+                    void *b = mc(destb + dst_off, source + src_off, len);
+                    b += desta - destb;
+                    uint x = memcmp(desta, destb, 1024);
+                    if (a != b || x) {
+                        printf("Failed v %d +%d->+%d len = %d (%p/%p/%d)\n", v, src_off, dst_off, len, a, b, x);
+                        mc(destb + dst_off, source + src_off, len); // for debugging
+                        ret = 1;
+                    }
+                }
+            }
+        }
+    }
+    tictoc_init();
+
+    for(int len = 0 ; !ret && len < 128; len++)
+    {
+        for(int src_off = 0; !ret && src_off < 4; src_off++)
+        {
+            for(int dst_off = 0; !ret && dst_off < 4; dst_off++)
+            {
+                uint32_t ta, tb, tc;
+                void *a = tictoc_memcpy_slow(desta + dst_off, source + src_off, len, &ta);
+                void *b = tictoc_memcpy_general(destb + dst_off, source + src_off, len, &tb);
+                __unused void *c = tictoc_memcpy(destc + dst_off, source + src_off, len, &tc);
+                b += desta - destb;
+                uint x = memcmp(desta, destb, 1024);
+                if (a != b || x)
+                {
+                    printf("Failed +%d->+%d len = %d (%p/%p/%d)\n", src_off, dst_off, len, a, b, x);
+                    ret = 1;
+                } else {
+                    printf("+%d->+%d len = %d\t%d\t%d\t%d\n", src_off, dst_off, len, (int)ta, (int)tb, (int)tc);
+                }
+            }
+        }
+    }
+    return ret;
+}
+
+typedef void *(*memset_func)(void *a, int c, uint len);
+memset_func memset_general, memset_4;
+
+void *__memset(void *a, int c, uint len);
+void *__memset_4(void *a, int c, uint len);
+
+void * __noinline tictoc_memset_slow(uint8_t *a, int b, uint c, uint32_t *t) {
+uint t1 = 0;
+TIC;
+void *rc = memset_slow(a, b, c);
+TOC(*t);
+return rc;
+}
+
+void * __noinline tictoc_memset_general(uint8_t *a, int b, uint c, uint32_t *t) {
+uint t1 = 0;
+TIC;
+void *rc = memset_general(a, b, c);
+TOC(*t);
+return rc;
+}
+
+void * __noinline tictoc_memset(uint8_t *a, int b, uint c, uint32_t *t) {
+uint t1 = 0;
+TIC;
+void *rc = memset(a, b, c);
+TOC(*t);
+return rc;
+}
+
+static int check_memset() {
+    printf("------------------- MEMSET ---------------------\n");
+    int ret = 0;
+    memset_general = __memset;
+    memset_4 = __memset_4;
+    if (!memset_general || !memset_4) {
+        printf("Doh\n");
+        ret = 1;
+    }
+
+    static uint8_t desta[1024];
+    static uint8_t destb[1024];
+    static uint8_t destc[1024];
+    for(int len = 0 ; !ret && len < 1020; len++) {
+        for(int dst_off = 0; !ret && dst_off < 4; dst_off++) {
+            memset_func ms;
+            for(int v = 0; v < 3; v++) {
+                switch (v) {
+                    case 0:
+                        ms = memset_slow;
+                        break;
+                    case 1:
+                        ms = memset_general;
+                        break;
+                    default:
+                        if (dst_off) {
+                            continue;
+                        }
+                        ms = memset_4;
+                }
+                memset(desta, 0, len);
+                memset(destb, 0, len);
+                int c = (1u + xrand()) & 0xfeu;
+                void *a = memset(desta + dst_off, c, len);
+                void *b = ms(destb + dst_off, c, len);
+                b += desta - destb;
+                uint x = memcmp(desta, destb, 1024);
+                if (a != b || x) {
+                    printf("Failed v %d +%d len = %d (%p/%p/%d)\n", v, dst_off, len, a, b, x);
+                    ms(destb + dst_off, c, len); // for debugging
+                    ret = 1;
+                }
+            }
+        }
+    }
+    *(volatile unsigned int *)0xe000e010=5; // enable SYSTICK at core clock
+
+    for(int len = 0 ; !ret && len < 128; len++)
+    {
+        for(int dst_off = 0; !ret && dst_off < 4; dst_off++)
+        {
+            int c = (1 + xrand()) & 0xfeu;
+            uint32_t ta, tb, tc;
+            void *a = tictoc_memset_slow(desta + dst_off, c, len, &ta);
+            void *b = tictoc_memset_general(destb + dst_off, c, len, &tb);
+            __unused void *_c = tictoc_memset(destc + dst_off, c, len, &tc);
+            b += desta - destb;
+            uint x = memcmp(desta, destb, 1024);
+            if (a != b || x)
+            {
+                printf("Failed +%d len = %d (%p/%p/%d)\n", dst_off, len, a, b, x);
+                ret = 1;
+            } else {
+                printf("+%d len = %d\t%d\t%d\t%d\n", dst_off, len, (int)ta, (int)tb, (int)tc);
+            }
+        }
+    }
+    return ret;
+}
+
+int main() {
+    setup_default_uart();
+    ASSERT(!check_memcpy());
+    ASSERT(!check_memset());
+    printf("OK\n");
+    return 0;
+}
\ No newline at end of file
diff --git a/test/tc_rom_double.c b/test/tc_rom_double.c
new file mode 100644
index 0000000..48a634c
--- /dev/null
+++ b/test/tc_rom_double.c
@@ -0,0 +1,527 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "pico/stdlib.h"
+#include "pico/bootrom.h"
+
+// NOTE THIS IS JUST A SMOKE TEST OF ALL DOUBLE FUNCTIONS, NOT AN EXHAUSTIVE CORRECTNESS TEST
+
+#define ASSERT(x) if (!(x)) { panic("ASSERT: %s l %d: %s\n" , __FILE__, __LINE__, #x); } /* #x is passed as an argument so a '%' in the expression is never parsed as a format */
+
+typedef union {
+    double d;
+    uint64_t l;
+} double_value;
+
+double_value least_neg_norm = { .l =  0x8010000000000000ll };
+double_value most_neg_denorm = { .l = 0x800fffffffffffffll };
+double_value random_neg_denorm = { .l = 0x800123456789abcdll };
+double_value least_neg_denorm = { .l = 0x8000000000000001ll };
+double_value least_neg_denorm2 = { .l = 0x8000000000000002ll };
+double_value minus_zero = { .l = 0x8000000000000000ll };
+double_value zero = { .l = 0x0000000000000000ll };
+double_value least_pos_denorm = { .l = 0x0000000000000001ll };
+double_value least_pos_denorm2 = { .l = 0x0000000000000002ll };
+double_value random_pos_denorm = { .l = 0x000123456789abcdll };
+double_value most_pos_denorm = { .l = 0x000fffffffffffffll };
+double_value least_pos_norm = { .l = 0x0010000000000000ll };
+
+typedef int (*i3func)(int, int);
+
+struct mufp_funcs { // one 32-bit function-pointer slot per entry; main() locates the table via rom_data_lookup('S','D')
+    double (*mufp_dadd)(double, double);
+    double (*mufp_dsub)(double, double);
+    double (*mufp_dmul)(double, double);
+    double (*mufp_ddiv)(double, double);
+    int (*mufp_dcmp_fast)(double, double);
+    void (*mufp_dcmp_fast_flags)(double, double);
+    double (*mufp_dsqrt)(double);
+    int (*mufp_double2int)(double);
+    int (*mufp_double2fix)(double, int);
+    uint (*mufp_double2uint)(double);
+    uint (*mufp_double2ufix)(double, int);
+    double (*mufp_int2double)(int);
+    double (*mufp_fix2double)(int, int);
+    double (*mufp_uint2double)(uint);
+    double (*mufp_ufix2double)(uint, int);
+    double (*mufp_dcos)(double);
+    double (*mufp_dsin)(double);
+    double (*mufp_dtan)(double);
+    // since float doesn't use this slot, we don't either
+    uint32_t _not_fatan2; // double (*mufp_datan2)(double, double);
+    double (*mufp_dexp)(double);
+    double (*mufp_dln)(double);
+
+    // these are in rom version 2
+    int (*mufp_dcmp)(double, double);
+    double (*mufp_datan2)(double, double);
+    double (*mufp_int642double)(int64_t);
+    double (*mufp_fix642double)(int64_t, int);
+    double (*mufp_uint642double)(uint64_t); // unsigned input, matching the uint642double() reference helper
+    double (*mufp_ufix642double)(uint64_t, int); // unsigned input, matching ufix642double()
+    int64_t (*mufp_double2int64)(double);
+    int64_t (*mufp_double2fix64)(double, int);
+    uint64_t (*mufp_double2uint64)(double); // unsigned result, matching double2uint64()
+    uint64_t (*mufp_double2ufix64)(double, int); // unsigned result, matching double2ufix64()
+
+    float (*mufp_double2float)(double);
+} *mufp_funcs;
+
+double __noinline dadd(double a, double b) {
+    return a + b;
+}
+
+double __noinline dsub(double a, double b) {
+    return a - b;
+}
+
+double __noinline dmul(double a, double b) {
+    return a * b;
+}
+
+double __noinline ddiv(double a, double b) {
+    return a / b;
+}
+
+int __noinline dcmp_fast(double a, double b) {
+    return a < b ? - 1 : (a > b ? 1 : 0);
+}
+
+static double flush(double x) {
+    double_value val = { .d = x };
+    if (val.l >= zero.l && val.l <= most_pos_denorm.l) x = 0;
+    if (val.l >= minus_zero.l && val.l <= most_neg_denorm.l) x = 0;
+    return x;
+}
+
+int __noinline dcmp(double a, double b) {
+    return dcmp_fast(flush(a), flush(b));
+}
+
+double __noinline dsqrt(double a) {
+    return sqrt(a); // double-precision reference; sqrtf() would round the result to float precision
+}
+
+int __noinline double2int(double a) {
+    return (int)a;
+}
+
+int64_t __noinline double2int64(double a) {
+    return (int64_t)a;
+}
+
+int __noinline double2fix(double a, int b) {
+    return (int)(a * pow(2.0, b));
+}
+
+int64_t __noinline double2fix64(double a, int b) {
+    return (int64_t)(a * pow(2.0, b)); // pow() like double2fix(); powf() limits 2^b to float range
+}
+
+uint __noinline double2uint(double a) {
+    // we do this which seems more useful... a wrapper for casting can choose to call double2int instead and cast that as uint if it wants
+    return a < 0 ? 0 : (uint) a;
+}
+
+uint64_t __noinline double2uint64(double a) {
+    // we do this which seems more useful... a wrapper for casting can choose to call double2int instead and cast that as uint if it wants
+    return a < 0 ? 0 : (uint64_t) a;
+}
+
+uint __noinline double2ufix(double a, int b) {
+    if (a < 0) return 0;
+    return (uint)(a * pow(2.0, b));
+}
+
+uint64_t __noinline double2ufix64(double a, int b) {
+    if (a < 0) return 0; // negative inputs clamp to zero, as in double2ufix()
+    return (uint64_t)(a * pow(2.0, b)); // pow() like double2ufix(); powf() limits 2^b to float range
+}
+
+double int2double(int a) {
+    return (double)a;
+}
+
+double int642double(int64_t a) {
+    return (double)a;
+}
+
+double __noinline fix2double(int a, int b) {
+    return ((double)a) / pow(2.0, b);
+}
+
+double __noinline fix642double(int64_t a, int b) {
+    return ((double)a) / pow(2.0, b); // pow() like fix2double(); powf() limits 2^b to float range
+}
+
+double uint2double(uint a) {
+    return (double)a;
+}
+
+double uint642double(uint64_t a) {
+    return (double)a;
+}
+
+double ufix2double(uint a, int b) {
+    return ((double)a) / pow(2.0, b);
+}
+
+double ufix642double(uint64_t a, int b) {
+    return ((double)a) / pow(2.0, b); // pow() like ufix2double(); powf() limits 2^b to float range
+}
+
+float double2float(double a) {
+    return (float)a;
+}
+
+double __noinline dcos(double a) {
+    return cos(a);
+}
+
+double __noinline dsin(double a) {
+    return sin(a);
+}
+
+double __noinline dtan(double a) {
+    return tan(a);
+}
+
+double __noinline datan2(double a, double b) {
+    return atan2(a, b);
+}
+
+double __noinline dexp(double a) {
+    return exp(a);
+}
+
+double __noinline dln(double a) {
+    return log(a);
+}
+
+// yuk our ee_printf crashes on infinities and doubles
+#define safe_for_print(x) (float)((x) != (x) ? -12301.0 : ((x) == infinity() ? -12302.0 : ((x) == -infinity() ? -12303.0 : (x))))
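+// want typeof, but don't want to change build to use new C version at this point. NOTE(review): abs() in check_double looks like the integer abs(), which would make the tolerance effectively |a-b| < 1 - confirm whether fabs() was intended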
+#define check_double(a, b) ({if (!(a == b || abs((a)-(b)) < 1e-11f)) printf("%f != %f %s\n", safe_for_print(a), safe_for_print(b), __STRING((b))); ASSERT(a == b || abs((a)-(b)) < 1e-11f); })
+#define check_int(a, b) ({if ((a)!=(b)) printf("%d != %d %s\n", a, b, __STRING((b))); ASSERT((a) == (b)); })
+#define check_uint(a, b) ({if ((a)!=(b)) printf("%u != %u %s\n", a, b, __STRING((b))); ASSERT((a) == (b)); })
+#define check_int64(a, b) ({if ((a)!=(b)) printf("%08x%08x != %08x%08x %s\n", (int)(a>>32), (int)a, (int)(b>>32), (int)b, __STRING((b))); ASSERT((a) == (b)); })
+#define check_uint64(a, b) ({if ((a)!=(b)) printf("%08x%08x != %08x%08x %s\n", (int)(a>>32), (int)a, (int)(b>>32), (int)b, __STRING((b))); ASSERT((a) == (b)); })
+
+#define check_double_fn1(fn, a) check_double(fn(a), mufp_funcs->mufp_##fn(a))
+#define check_double_fn2(fn, a, b) check_double(fn(a, b), mufp_funcs->mufp_##fn(a, b))
+#define check_int_fn1(fn, a) check_int(fn(a), mufp_funcs->mufp_##fn(a))
+#define check_int64_fn1(fn, a) check_int64(fn(a), mufp_funcs->mufp_##fn(a))
+#define check_int_fn2(fn, a, b) check_int(fn(a, b), mufp_funcs->mufp_##fn(a, b))
+#define check_int64_fn2(fn, a, b) check_int64(fn(a, b), mufp_funcs->mufp_##fn(a, b))
+#define check_uint_fn1(fn, a) check_uint(fn(a), mufp_funcs->mufp_##fn(a))
+#define check_uint64_fn1(fn, a) check_uint64(fn(a), mufp_funcs->mufp_##fn(a))
+#define check_uint_fn2(fn, a, b) check_uint(fn(a, b), mufp_funcs->mufp_##fn(a, b))
+#define check_uint64_fn2(fn, a, b) check_uint64(fn(a, b), mufp_funcs->mufp_##fn(a, b))
+
+int __attribute__((naked)) dcmp_from_dcmp_flags(double a, double b, int (*dmcp_flags)(double, double)) { // turns the condition flags left by dmcp_flags(a, b) into -1 / 0 / 1
+    asm(
+        "push {r4, r5, lr}\n" // three words pushed, so the caller's stack arguments now start at sp+0xc
+        "mov r4, #1\n" // provisional result: 1
+        "ldr r5, [sp, #0xc]\n" // dcmp_flags param (a and b presumably occupy r0-r3 under the ARM calling convention - confirm)
+        "blx r5\n" // call it; only the condition flags it leaves behind are examined
+        "bge 1f\n" // GE: keep the provisional 1
+        "neg r4, r4\n" // LT: result becomes -1
+        "1:\n"
+        "bne 1f\n" // NE: keep whatever was chosen above
+        "sub r4, r4\n" // EQ: result becomes 0
+        "1:\n"
+        "mov r0, r4\n" // return the result in r0
+        "pop {r4, r5, pc}\n" // restore r4/r5 and return via the saved lr
+    );
+}
+
+#define check_dcmp_flags(a,b) check_int(dcmp(a, b), dcmp_from_dcmp_flags(a, b, mufp_funcs->mufp_dcmp)) // dcmp is dcmp_flags now
+#define check_dcmp_fast_flags(a,b) check_int(dcmp_fast(a, b), dcmp_from_dcmp_flags(a, b, mufp_funcs->mufp_dcmp_fast_flags))
+
+int main()
+{
+    setup_default_uart();
+    int rom_version = *(uint8_t*)0x13;
+    printf("ROM VERSION %d\n", rom_version);
+    if (rom_version == 1) {
+        printf("ROM VERSION 1 HAS NO DOUBLE, SKIPPING\n");
+        exit(0);
+    }
+
+    srand(0xf005ba11);
+    mufp_funcs = (struct mufp_funcs *)rom_data_lookup(rom_table_code('S','D'));
+    ASSERT(mufp_funcs);
+
+    uint8_t *func_count = (uint8_t *)rom_data_lookup(rom_table_code('F','Z'));
+    assert(func_count);
+    assert(*func_count == sizeof(struct mufp_funcs) / 4);
+    for(int i=0; i<sizeof(struct mufp_funcs); i+=4) {
+        uint32_t fp = *(uint32_t*)(((uint8_t*)mufp_funcs) + i);
+        ASSERT(fp);
+        ASSERT(fp & 1u); // thumb bit!
+        ASSERT(fp < 16 * 1024); // in ROM!
+    }
+
+    // very simple sanity tests
+    check_double_fn2(dadd, 1.3, -5.0);
+
+    check_double_fn2(dsub, 1000.75, 998.6);
+
+    check_double_fn2(dmul, 1.75, 31.4);
+
+    check_double_fn2(ddiv, 2314.6, -.37);
+    check_double_fn2(ddiv, 234.6, -10000.37);
+    check_double_fn2(ddiv, 2314.6, infinity());
+    check_double_fn2(ddiv, 2314.6, -infinity());
+
+    if (rom_version > 1) {
+        // todo check denormals
+        check_int_fn2(dcmp, -3.0, 7.3);
+        check_int_fn2(dcmp, 3.0, -7.3);
+        check_int_fn2(dcmp, 3.0, 3.0);
+        check_int_fn2(dcmp, 3.0, -infinity());
+        check_int_fn2(dcmp, 3.0, infinity());
+
+        check_int_fn2(dcmp, least_neg_denorm.d, most_neg_denorm.d);
+        check_int_fn2(dcmp, most_neg_denorm.d, least_neg_denorm.d);
+        check_int_fn2(dcmp, least_neg_denorm.d, least_neg_denorm.d);
+        check_int_fn2(dcmp, least_neg_norm.d, least_neg_denorm.d);
+        check_int_fn2(dcmp, least_neg_denorm.d, least_neg_denorm2.d);
+        check_int_fn2(dcmp, least_neg_denorm.d, least_pos_denorm.d);
+        check_int_fn2(dcmp, least_pos_denorm.d, most_pos_denorm.d);
+        check_int_fn2(dcmp, most_pos_denorm.d, least_pos_denorm.d);
+        check_int_fn2(dcmp, least_pos_denorm.d, least_pos_denorm.d);
+        check_int_fn2(dcmp, least_pos_denorm.d, least_pos_denorm.d);
+        check_int_fn2(dcmp, least_pos_norm.d, least_pos_denorm.d);
+        check_int_fn2(dcmp, least_pos_denorm.d, least_pos_denorm2.d);
+        check_int_fn2(dcmp, least_pos_denorm.d, least_neg_denorm.d);
+    }
+
+    check_int_fn2(dcmp_fast, -3.0, 7.3);
+    check_int_fn2(dcmp_fast, 3.0, -7.3);
+    check_int_fn2(dcmp_fast, 3.0, 3.0);
+    check_int_fn2(dcmp_fast, 3.0, -infinity());
+    check_int_fn2(dcmp_fast, 3.0, infinity());
+
+    check_int_fn2(dcmp_fast, least_neg_denorm.d, most_neg_denorm.d);
+    check_int_fn2(dcmp_fast, most_neg_denorm.d, least_neg_denorm.d);
+    check_int_fn2(dcmp_fast, least_neg_denorm.d, least_neg_denorm.d);
+    check_int_fn2(dcmp_fast, least_neg_norm.d, least_neg_denorm.d);
+    check_int_fn2(dcmp_fast, least_neg_denorm.d, least_neg_denorm2.d);
+    check_int_fn2(dcmp_fast, least_neg_denorm.d, least_pos_denorm.d);
+    check_int_fn2(dcmp_fast, least_pos_denorm.d, most_pos_denorm.d);
+    check_int_fn2(dcmp_fast, most_pos_denorm.d, least_pos_denorm.d);
+    check_int_fn2(dcmp_fast, least_pos_denorm.d, least_pos_denorm.d);
+    check_int_fn2(dcmp_fast, least_pos_denorm.d, least_pos_denorm.d);
+    check_int_fn2(dcmp_fast, least_pos_norm.d, least_pos_denorm.d);
+    check_int_fn2(dcmp_fast, least_pos_denorm.d, least_pos_denorm2.d);
+    check_int_fn2(dcmp_fast, least_pos_denorm.d, least_neg_denorm.d);
+
+
+    if (rom_version > 1) {
+        // todo check denormals
+        check_dcmp_flags(-3.0, 7.3);
+        check_dcmp_flags(3.0, -7.3);
+        check_dcmp_flags(3.0, 3.0);
+        check_dcmp_flags(3.0, -infinity());
+        check_dcmp_flags(3.0, infinity());
+
+        check_dcmp_flags( least_neg_denorm.d, most_neg_denorm.d);
+        check_dcmp_flags( most_neg_denorm.d, least_neg_denorm.d);
+        check_dcmp_flags( least_neg_denorm.d, least_neg_denorm.d);
+        check_dcmp_flags( least_neg_norm.d, least_neg_denorm.d);
+        check_dcmp_flags( least_neg_denorm.d, least_neg_denorm2.d);
+        check_dcmp_flags( least_neg_denorm.d, least_pos_denorm.d);
+        check_dcmp_flags( least_pos_denorm.d, most_pos_denorm.d);
+        check_dcmp_flags( most_pos_denorm.d, least_pos_denorm.d);
+        check_dcmp_flags( least_pos_denorm.d, least_pos_denorm.d);
+        check_dcmp_flags( least_pos_denorm.d, least_pos_denorm.d);
+        check_dcmp_flags( least_pos_norm.d, least_pos_denorm.d);
+        check_dcmp_flags( least_pos_denorm.d, least_pos_denorm2.d);
+        check_dcmp_flags( least_pos_denorm.d, least_neg_denorm.d);
+    }
+
+    check_dcmp_fast_flags(-3.0, 7.3);
+    check_dcmp_fast_flags(3.0, -7.3);
+    check_dcmp_fast_flags(3.0, 3.0);
+    check_dcmp_fast_flags(3.0, -infinity());
+    check_dcmp_fast_flags(3.0, infinity());
+
+    check_dcmp_fast_flags( least_neg_denorm.d, most_neg_denorm.d);
+    check_dcmp_fast_flags( most_neg_denorm.d, least_neg_denorm.d);
+    check_dcmp_fast_flags( least_neg_denorm.d, least_neg_denorm.d);
+    check_dcmp_fast_flags( least_neg_norm.d, least_neg_denorm.d);
+    check_dcmp_fast_flags( least_neg_denorm.d, least_neg_denorm2.d);
+    check_dcmp_fast_flags( least_neg_denorm.d, least_pos_denorm.d);
+    check_dcmp_fast_flags( least_pos_denorm.d, most_pos_denorm.d);
+    check_dcmp_fast_flags( most_pos_denorm.d, least_pos_denorm.d);
+    check_dcmp_fast_flags( least_pos_denorm.d, least_pos_denorm.d);
+    check_dcmp_fast_flags( least_pos_denorm.d, least_pos_denorm.d);
+    check_dcmp_fast_flags( least_pos_norm.d, least_pos_denorm.d);
+    check_dcmp_fast_flags( least_pos_denorm.d, least_pos_denorm2.d);
+    check_dcmp_fast_flags( least_pos_denorm.d, least_neg_denorm.d);
+
+    check_double_fn1(dsqrt, 3.0);
+
+    // we are returning INFINITE not NAN as we don't support NANs
+//    check_double_fn1(fsqrt, -3.0);
+    // todo right now qsqrt and fsqrt return opposite signed infinity
+#if 0
+    ASSERT(infinity() == mufp_funcs->mufp_dsqrt(-3.0));
+#else
+    ASSERT(-infinity() == mufp_funcs->mufp_dsqrt(-3.0));
+#endif
+
+    check_int_fn1(double2int, 3.0);
+    check_int_fn1(double2int, 123456000000.0);
+    check_int_fn1(double2int, -3.0);
+    check_int_fn1(double2int, -123456000000.0);
+    check_int_fn1(double2int, infinity());
+    check_int_fn1(double2int, -infinity());
+
+    check_int_fn2(double2fix, 3.0, 3);
+    check_int_fn2(double2fix, 31.0, -3);
+    check_int_fn2(double2fix, -3.0, 3);
+    // todo JURY IS OUT ON THIS ONE
+    //check_int_fn2(double2fix, -31.0, -3);
+
+    check_uint_fn1(double2uint, 3.0);
+    check_uint_fn1(double2uint, 123456000000.0);
+    check_uint_fn1(double2uint, -3.0);
+    check_uint_fn1(double2uint, -123456000000.0);
+
+    check_uint_fn2(double2ufix, 3.0, 3);
+    check_uint_fn2(double2ufix, 3.0, -3);
+
+    check_double_fn1(int2double, 3);
+    check_double_fn1(int2double, INT32_MAX);
+    check_double_fn1(int2double, INT32_MIN);
+    check_double_fn1(int2double, -3);
+
+    check_double_fn2(fix2double, 3, 3);
+    check_double_fn2(fix2double, 3, -3);
+    check_double_fn2(fix2double, -3, 3);
+    check_double_fn2(fix2double, -3, -3);
+
+    check_double_fn1(uint2double, 3);
+    check_double_fn1(uint2double, UINT32_MAX);
+
+//    double (*mufp_ufix2double)(uint, int);
+
+    check_double_fn1(dcos, 0.0);
+    check_double_fn1(dcos, 2.7);
+    check_double_fn1(dcos, -32.7);
+
+    check_double_fn1(dsin, 0.0);
+    check_double_fn1(dsin, 2.7);
+    check_double_fn1(dsin, -32.7);
+
+    check_double_fn1(dtan, 0.0);
+    check_double_fn1(dtan, 2.7);
+    check_double_fn1(dtan, -32.7);
+
+    if (rom_version > 1) {
+        check_double_fn2(datan2, 3.0, 4.0);
+        check_double_fn2(datan2, -3.0, 4.0);
+        check_double_fn2(datan2, 4.0, -.31);
+        check_double_fn2(datan2, -3.0, -.17);
+    }
+
+    check_double_fn1(dexp, 0.0);
+    check_double_fn1(dexp, 2.7);
+    check_double_fn1(dexp, -32.7);
+
+    check_double_fn1(dln, 0.3);
+    check_double_fn1(dln, 1.0);
+    check_double_fn1(dln, 2.7);
+
+    // we are returning -INFINITE as we don't support NANs
+//    check_double_fn1(fln, -32.7);
+    ASSERT(-INFINITY == mufp_funcs->mufp_dln(-32.7));
+
+    check_int64_fn1(double2int64, 3.0);
+    check_int64_fn1(double2int64, 123456000000.0);
+    check_int64_fn1(double2int64, 12345678912345.0);
+    check_int64_fn1(double2int64, -3.0);
+    check_int64_fn1(double2int64, -123456000000.0);
+    check_int64_fn1(double2int64, -12345678912345.0);
+
+    // seems like gcc is wrong on this one
+//        check_int64_fn1(double2int64, INFINITY);
+    // so
+    ASSERT(INT64_MAX == mufp_funcs->mufp_double2int64(INFINITY));
+
+    // seems like gcc is wrong on this one
+//        check_int64_fn1(double2int64, -INFINITY);
+    // so
+    ASSERT( INT64_MIN == mufp_funcs->mufp_double2int64(-INFINITY));
+
+
+    check_int64_fn2(double2fix64, 3.0, 3);
+    check_int64_fn2(double2fix64, 31.0, -3);
+    check_int64_fn2(double2fix64, -3.0, 3);
+    // todo JURY IS OUT ON THIS ONE
+    //check_int64_fn2(double2fix64, -31.0, -3);
+
+    check_uint64_fn1(double2uint64, 3.0);
+    check_uint64_fn1(double2uint64, 123456000000.0);
+    check_uint64_fn1(double2uint64, 12345678912345.0);
+    check_uint64_fn1(double2uint64, -3.0);
+    check_uint64_fn1(double2uint64, -123456000000.0);
+    check_uint64_fn1(double2uint64, -12345678912345.0);
+
+    check_uint64_fn2(double2ufix64, 3.0, 3);
+    check_uint64_fn2(double2ufix64, 3.0, 43);
+    check_uint64_fn2(double2ufix64, 3.0, -3);
+    check_uint64_fn2(double2ufix64, 3.0, -43);
+
+#define LARGE 0x1234567800000000ll
+    check_double_fn1(int642double, 3);
+    check_double_fn1(int642double, LARGE);
+    check_double_fn1(int642double, INT32_MAX);
+    check_double_fn1(int642double, INT64_MAX);
+    check_double_fn1(int642double, INT32_MIN);
+    check_double_fn1(int642double, INT64_MIN);
+    check_double_fn1(int642double, -3);
+    check_double_fn1(int642double, -LARGE);
+
+    check_double_fn2(fix642double, 3, 3);
+    check_double_fn2(fix642double, 3, -3);
+    check_double_fn2(fix642double, -3, 3);
+    check_double_fn2(fix642double, -3, -3);
+
+    check_double_fn2(fix642double, LARGE, 3);
+    check_double_fn2(fix642double, LARGE, -3);
+    check_double_fn2(fix642double, -LARGE, 3);
+    check_double_fn2(fix642double, -LARGE, -3);
+
+    check_double_fn1(uint642double, 3);
+    check_double_fn1(uint642double, UINT32_MAX);
+    check_double_fn1(uint642double, UINT64_MAX);
+    check_double_fn1(uint642double, LARGE);
+
+    check_double_fn1(double2float, 3.0f);
+    check_double_fn1(double2float, 3.0e12f);
+    check_double_fn1(double2float, -3.0e12f);
+    check_double_fn1(double2float, -3.13159275412321412565856845745);
+    check_double_fn1(double2float, 3.13159275412321412565856845745);
+    check_double_fn1(double2float, -INFINITY);
+    check_double_fn1(double2float, INFINITY);
+    check_double_fn1(double2float, most_pos_denorm.d);
+    check_double_fn1(double2float, least_pos_denorm.d);
+    check_double_fn1(double2float, least_pos_norm.d);
+    check_double_fn1(double2float, least_neg_denorm.d);
+    check_double_fn1(double2float, least_neg_norm.d);
+    check_double_fn1(double2float, most_neg_denorm.d);
+    check_double_fn1(double2float, 0.f);
+    check_double_fn1(double2float, -0.f);
+
+    check_double_fn1(double2float, 3.0e-58);
+    check_double_fn1(double2float, 3.0e58);
+
+    check_double_fn1(double2float, 3.0e68);
+    check_double_fn1(double2float, -3.0e68);
+
+    printf("DOUBLE OK\n");
+	return 0;
+}
diff --git a/test/tc_rom_float.c b/test/tc_rom_float.c
new file mode 100644
index 0000000..1f76a7e
--- /dev/null
+++ b/test/tc_rom_float.c
@@ -0,0 +1,506 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include "pico/stdlib.h"
+#include "pico/bootrom.h"
+
+// NOTE THIS IS JUST A SMOKE TEST OF ALL FLOAT FUNCTIONS, NOT AN EXHAUSTIVE CORRECTNESS TEST
+
+#define ASSERT(x) if (!(x)) { panic("ASSERT: %s l %d: " #x "\n" , __FILE__, __LINE__); }
+
+typedef union { // lets a float be built from, or inspected as, its raw 32-bit pattern
+    float f;    // value view
+    uint32_t i; // bit-pattern view
+} float_value;
+
+float_value least_neg_norm = { .i = 0x80800000 }; // sign bit set, exponent field 1, mantissa 0
+float_value most_neg_denorm = { .i = 0x807fffff }; // sign bit set, exponent field 0, mantissa all ones
+float_value random_neg_denorm = { .i = 0x80123456 }; // sign bit set, exponent field 0, arbitrary mantissa
+float_value least_neg_denorm = { .i = 0x80000001 }; // sign bit set, exponent field 0, mantissa 1
+float_value minus_zero = { .i = 0x80000000 }; // only the sign bit set
+float_value zero = { .i = 0x00000000 }; // all bits clear
+float_value least_pos_denorm = { .i = 0x00000001 }; // exponent field 0, mantissa 1
+float_value random_pos_denorm = { .i = 0x00123456 }; // exponent field 0, arbitrary mantissa
+float_value most_pos_denorm = { .i = 0x007fffff }; // exponent field 0, mantissa all ones
+float_value least_pos_norm = { .i = 0x00800000 }; // exponent field 1, mantissa 0
+
+typedef int (*i3func)(int, int); // NOTE(review): not referenced anywhere else in this file
+
+struct mufp_funcs { // view of the ROM float function table that main() looks up with rom_table_code('S','F')
+    float (*mufp_fadd)(float, float);
+    float (*mufp_fsub)(float, float);
+    float (*mufp_fmul)(float, float);
+    float (*mufp_fdiv)(float, float);
+    int (*mufp_fcmp_fast)(float, float);
+    void (*mufp_fcmp_fast_flags)(float, float); // result is delivered in the CPU flags, see fcmp_from_fcmp_flags
+    float (*mufp_fsqrt)(float);
+    int (*mufp_float2int)(float);
+    int (*mufp_float2fix)(float, int);
+    uint (*mufp_float2uint)(float);
+    uint (*mufp_float2ufix)(float, int);
+    float (*mufp_int2float)(int);
+    float (*mufp_fix2float)(int, int);
+    float (*mufp_uint2float)(uint);
+    float (*mufp_ufix2float)(uint, int);
+    float (*mufp_fcos)(float);
+    float (*mufp_fsin)(float);
+    float (*mufp_ftan)(float);
+    uint32_t _broken_fatan2; //    float (*mufp_fatan2)(float, float);
+    float (*mufp_fexp)(float);
+    float (*mufp_fln)(float);
+
+    // these are in rom version 2 (11 entries; main() excludes them when validating a version 1 table)
+    int (*mufp_fcmp)(float, float);
+    float (*mufp_fatan2)(float, float);
+    float (*mufp_int642float)(int64_t);
+    float (*mufp_fix642float)(int64_t, int);
+    float (*mufp_uint642float)(uint64_t);       // unsigned operand, matching the uint642float reference helper
+    float (*mufp_ufix642float)(uint64_t, int);  // unsigned operand, matching the ufix642float reference helper
+    int64_t (*mufp_float2int64)(float);
+    int64_t (*mufp_float2fix64)(float, int);
+    uint64_t (*mufp_float2uint64)(float);       // unsigned result, matching the float2uint64 reference helper
+    uint64_t (*mufp_float2ufix64)(float, int);  // unsigned result, matching the float2ufix64 reference helper
+    double (*mufp_float2double)(float);
+
+} *mufp_funcs;
+
+float __noinline fadd(float a, float b) { // C reference that main() compares against mufp_fadd
+    return a + b;
+}
+
+float __noinline fsub(float a, float b) { // C reference that main() compares against mufp_fsub
+    return a - b;
+}
+
+float __noinline fmul(float a, float b) { // C reference that main() compares against mufp_fmul
+    return a * b;
+}
+
+float __noinline fdiv(float a, float b) { // C reference that main() compares against mufp_fdiv
+    return a / b;
+}
+
+float flush(float x) { // returns 0 when x is a zero or a denormal of either sign, otherwise x unchanged
+    const float_value bits = { .f = x };
+    const int pos_tiny = bits.i >= zero.i && bits.i <= most_pos_denorm.i;       // +0 up to the largest positive denormal
+    const int neg_tiny = bits.i >= minus_zero.i && bits.i <= most_neg_denorm.i; // -0 up to the largest-magnitude negative denormal
+    return (pos_tiny || neg_tiny) ? 0 : x;
+}
+
+int __noinline fcmp_fast(float a, float b) { // three-way compare: -1 if a < b, 1 if a > b, otherwise 0
+    return (a > b) - (a < b);
+}
+
+int __noinline fcmp(float a, float b) { // same three-way compare as fcmp_fast, applied after both operands go through flush()
+    return fcmp_fast(flush(a), flush(b));
+}
+
+float __noinline fsqrt(float a) { // C reference for mufp_fsqrt, via libm sqrtf
+    return sqrtf(a);
+}
+
+int __noinline float2int(float a) { // C reference for mufp_float2int: a plain cast
+    return (int)a;
+}
+
+int64_t __noinline float2int64(float a) { // C reference for mufp_float2int64: a plain cast
+    return (int64_t)a;
+}
+
+int __noinline float2fix(float a, int b) { // C reference for mufp_float2fix: scale a by 2^b, then cast to int
+    return (int)(a * powf(2.f, b));
+}
+
+int64_t __noinline float2fix64(float a, int b) { // C reference for mufp_float2fix64: scale a by 2^b, then cast to int64_t
+    return (int64_t)(a * powf(2.f, b));
+}
+
+uint __noinline float2uint(float a) {
+    if (a < 0) return 0; // negatives clamp to zero, which seems more useful; a casting wrapper can call float2int and cast that to uint instead
+    return (uint) a;
+}
+
+uint64_t __noinline float2uint64(float a) {
+    if (a < 0) return 0; // negatives clamp to zero, which seems more useful; a casting wrapper can go through the signed conversion and cast instead
+    return (uint64_t) a;
+}
+
+
+uint __noinline float2ufix(float a, int b) {
+    // negative inputs give zero; anything else is scaled by 2^b and cast to uint
+    return a < 0 ? 0 : (uint)(a * powf(2.f, b));
+}
+
+uint64_t __noinline float2ufix64(float a, int b) {
+    // negative inputs give zero; anything else is scaled by 2^b and cast to uint64_t
+    return a < 0 ? 0 : (uint64_t)(a * powf(2.f, b));
+}
+
+float int2float(int a) { // C reference for mufp_int2float: a plain cast
+    return (float)a;
+}
+
+float int642float(int64_t a) { // C reference for mufp_int642float: a plain cast
+    return (float)a;
+}
+
+float __noinline fix2float(int a, int b) { // C reference for mufp_fix2float: a converted to float, divided by 2^b
+    return ((float)a) / powf(2.f, b);
+}
+
+float __noinline fix642float(int64_t a, int b) { // C reference for mufp_fix642float: a converted to float, divided by 2^b
+    return ((float)a) / powf(2.f, b);
+}
+
+float uint2float(uint a) { // C reference for mufp_uint2float: a plain cast
+    return (float)a;
+}
+
+float uint642float(uint64_t a) { // C reference for mufp_uint642float: a plain cast
+    return (float)a;
+}
+
+float ufix2float(uint a, int b) { // a converted to float, divided by 2^b; NOTE(review): main() has no active check that calls this
+    return ((float)a) / powf(2.f, b);
+}
+
+float ufix642float(uint64_t a, int b) { // a converted to float, divided by 2^b; NOTE(review): main() has no check that calls this
+    return ((float)a) / powf(2.f, b);
+}
+
+double float2double(float a) { // C reference for mufp_float2double: a plain widening cast
+    return (double)a;
+}
+
+float __noinline fcos(float a) { // C reference for mufp_fcos, via libm cosf
+    return cosf(a);
+}
+
+float __noinline fsin(float a) { // C reference for mufp_fsin, via libm sinf
+    return sinf(a);
+}
+
+float __noinline ftan(float a) { // C reference for mufp_ftan, via libm tanf
+    return tanf(a);
+}
+
+float __noinline fatan2(float a, float b) { // C reference for mufp_fatan2, via libm atan2f with the arguments in the same order
+    return atan2f(a, b);
+}
+
+float __noinline fexp(float a) { // C reference for mufp_fexp, via libm expf
+    return expf(a);
+}
+
+float __noinline fln(float a) { // C reference for mufp_fln, via libm logf (natural logarithm)
+    return logf(a);
+}
+
+// yuk our ee_printf crashes on infinities and NaNs, so those are swapped for sentinel values before printing
+#define safe_for_print(x) ((x) != (x) ? -12301.f : ((x) == INFINITY ? -12302.f : ((x) == -INFINITY ? -12303.f : (x))))
+// want typeof, but don't want to change build to use new C version at this point (so a and b are evaluated more than once)
+#define check_float(a, b) ({if (!((a) == (b) || fabsf((a)-(b)) < 1e-6f)) printf("%f != %f %s\n", safe_for_print(a), safe_for_print(b), __STRING((b))); ASSERT((a) == (b) || fabsf((a)-(b)) < 1e-6f); })
+#define check_int(a, b) ({if ((a)!=(b)) printf("%d != %d %s\n", (a), (b), __STRING((b))); ASSERT((a) == (b)); })
+#define check_uint(a, b) ({if ((a)!=(b)) printf("%u != %u %s\n", (a), (b), __STRING((b))); ASSERT((a) == (b)); })
+#define check_int64(a, b) ({if ((a)!=(b)) printf("%08x%08x != %08x%08x %s\n", (unsigned)((a)>>32), (unsigned)(a), (unsigned)((b)>>32), (unsigned)(b), __STRING((b))); ASSERT((a) == (b)); })
+#define check_uint64(a, b) ({if ((a)!=(b)) printf("%08x%08x != %08x%08x %s\n", (unsigned)((a)>>32), (unsigned)(a), (unsigned)((b)>>32), (unsigned)(b), __STRING((b))); ASSERT((a) == (b)); })
+
+#define check_float_fn1(fn, a) check_float(fn(a), mufp_funcs->mufp_##fn(a)) // each *_fnN helper pairs the C reference fn(...) with the table entry mufp_<fn>(...)
+#define check_float_fn2(fn, a, b) check_float(fn(a, b), mufp_funcs->mufp_##fn(a, b))
+#define check_int_fn1(fn, a) check_int(fn(a), mufp_funcs->mufp_##fn(a))
+#define check_int64_fn1(fn, a) check_int64(fn(a), mufp_funcs->mufp_##fn(a))
+#define check_int_fn2(fn, a, b) check_int(fn(a, b), mufp_funcs->mufp_##fn(a, b))
+#define check_int64_fn2(fn, a, b) check_int64(fn(a, b), mufp_funcs->mufp_##fn(a, b))
+#define check_uint_fn1(fn, a) check_uint(fn(a), mufp_funcs->mufp_##fn(a))
+#define check_uint64_fn1(fn, a) check_uint64(fn(a), mufp_funcs->mufp_##fn(a))
+#define check_uint_fn2(fn, a, b) check_uint(fn(a, b), mufp_funcs->mufp_##fn(a, b))
+#define check_uint64_fn2(fn, a, b) check_uint64(fn(a, b), mufp_funcs->mufp_##fn(a, b))
+
+int __attribute__((naked)) fcmp_from_fcmp_flags(float a, float b, int (*fmcp_flags)(float, float)) { // turns a flags-returning compare into -1 / 0 / 1
+    asm(
+        "push {r4, lr}\n"   // save r4 and the return address
+        "mov r4, #1\n"      // provisional result: 1
+        "blx r2\n"          // call through r2 (presumably the function-pointer argument under the usual ARM calling convention; a and b are left untouched for it)
+        "bge 1f\n"          // flags say >= : keep the provisional result for now
+        "neg r4, r4\n"      // otherwise the result becomes -1
+        "1:\n"
+        "bne 1f\n"          // flags say not-equal: result is final
+        "sub r4, r4\n"      // otherwise the result becomes 0
+        "1:\n"
+        "mov r0, r4\n"      // return value
+        "pop {r4, pc}\n"    // restore r4 and return
+    );
+}
+
+#define check_fcmp_flags(a,b) check_int(fcmp(a, b), fcmp_from_fcmp_flags(a, b, mufp_funcs->mufp_fcmp)) // f_cmp is f_cmp_flags now
+#define check_fcmp_fast_flags(a,b) check_int(fcmp_fast(a, b), fcmp_from_fcmp_flags(a, b, (int (*)(float, float))mufp_funcs->mufp_fcmp_fast_flags)) // table entry is declared void-returning, so cast it to the helper's pointer type
+
+int main() // smoke-tests the ROM float table against the C reference helpers; the first failed check panics via ASSERT
+{
+    setup_default_uart();
+    srand(0xf005ba11);
+    mufp_funcs = (struct mufp_funcs *)rom_data_lookup(rom_table_code('S','F'));
+    ASSERT(mufp_funcs);
+
+    int rom_version = *(uint8_t*)0x13; // version byte is read straight from address 0x13
+    printf("ROM VERSION %d\n", rom_version);
+    if (rom_version > 1) {
+        uint8_t *func_count = (uint8_t *)rom_data_lookup(rom_table_code('F','Z'));
+        ASSERT(func_count); // ASSERT rather than assert, so the check survives NDEBUG builds
+        ASSERT(*func_count == sizeof(struct mufp_funcs) / 4);
+    }
+    int valid_funcs_size = sizeof(struct mufp_funcs);
+    if (rom_version == 1) {
+        valid_funcs_size -= 11 * 4; // drop the trailing 11 entries that struct mufp_funcs marks as rom version 2
+    }
+    for(int i=0; i<valid_funcs_size; i+=4) {
+        uint32_t fp = *(uint32_t*)(((uint8_t*)mufp_funcs) + i);
+        ASSERT(fp);
+        ASSERT(fp & 1u); // thumb bit!
+        ASSERT(fp < 16 * 1024); // in ROM!
+    }
+
+    // very simple sanity tests
+    check_float_fn2(fadd, 1.3f, -5.0f);
+
+    check_float_fn2(fsub, 1000.75f, 998.6f);
+
+    check_float_fn2(fmul, 1.75f, 31.4f);
+
+    check_float_fn2(fdiv, 2314.6f, -.37f);
+    check_float_fn2(fdiv, 234.6f, -10000.37f);
+    check_float_fn2(fdiv, 2314.6f, INFINITY);
+    check_float_fn2(fdiv, 2314.6f, -INFINITY);
+
+    if (rom_version > 1) {
+        check_int_fn2(fcmp, -3.0f, 7.3f);
+        check_int_fn2(fcmp, 3.0f, -7.3f);
+        check_int_fn2(fcmp, 3.0f, 3.0f);
+        check_int_fn2(fcmp, 3.0f, -INFINITY);
+        check_int_fn2(fcmp, 3.0f, INFINITY);
+
+        check_int_fn2(fcmp, least_neg_denorm.f, most_neg_denorm.f);
+        check_int_fn2(fcmp, most_neg_denorm.f, least_neg_denorm.f);
+        check_int_fn2(fcmp, least_neg_denorm.f, least_neg_denorm.f);
+        check_int_fn2(fcmp, least_neg_norm.f, least_neg_denorm.f);
+        check_int_fn2(fcmp, least_pos_denorm.f, most_pos_denorm.f);
+        check_int_fn2(fcmp, most_pos_denorm.f, least_pos_denorm.f);
+        check_int_fn2(fcmp, least_pos_denorm.f, least_pos_denorm.f);
+        check_int_fn2(fcmp, least_pos_denorm.f, least_pos_denorm.f);
+        check_int_fn2(fcmp, least_pos_norm.f, least_pos_denorm.f);
+
+    }
+
+    check_int_fn2(fcmp_fast, -3.0f, 7.3f);
+    check_int_fn2(fcmp_fast, 3.0f, -7.3f);
+    check_int_fn2(fcmp_fast, 3.0f, 3.0f);
+    check_int_fn2(fcmp_fast, 3.0f, -INFINITY);
+    check_int_fn2(fcmp_fast, 3.0f, INFINITY);
+    check_int_fn2(fcmp_fast, minus_zero.f, zero.f);
+
+    check_int_fn2(fcmp_fast, least_neg_denorm.f, most_neg_denorm.f);
+    check_int_fn2(fcmp_fast, most_neg_denorm.f, least_neg_denorm.f);
+    check_int_fn2(fcmp_fast, least_neg_denorm.f, least_neg_denorm.f);
+    check_int_fn2(fcmp_fast, least_neg_norm.f, least_neg_denorm.f);
+    check_int_fn2(fcmp_fast, least_pos_denorm.f, most_pos_denorm.f);
+    check_int_fn2(fcmp_fast, most_pos_denorm.f, least_pos_denorm.f);
+    check_int_fn2(fcmp_fast, least_pos_denorm.f, least_pos_denorm.f);
+    check_int_fn2(fcmp_fast, least_pos_denorm.f, least_pos_denorm.f);
+    check_int_fn2(fcmp_fast, least_pos_norm.f, least_pos_denorm.f);
+
+    if (rom_version > 1) {
+        check_fcmp_flags(-3.0f, 7.3f);
+        check_fcmp_flags(3.0f, -7.3f);
+        check_fcmp_flags(3.0f, 3.0f);
+        check_fcmp_flags(3.0f, -INFINITY);
+        check_fcmp_flags(3.0f, INFINITY);
+
+        check_fcmp_flags(least_neg_denorm.f, most_neg_denorm.f);
+        check_fcmp_flags(most_neg_denorm.f, least_neg_denorm.f);
+        check_fcmp_flags(least_neg_denorm.f, least_neg_denorm.f);
+        check_fcmp_flags(least_neg_norm.f, least_neg_denorm.f);
+        check_fcmp_flags(least_pos_denorm.f, most_pos_denorm.f);
+        check_fcmp_flags(most_pos_denorm.f, least_pos_denorm.f);
+        check_fcmp_flags(least_pos_denorm.f, least_pos_denorm.f);
+        check_fcmp_flags(least_pos_denorm.f, least_pos_denorm.f);
+        check_fcmp_flags(least_pos_norm.f, least_pos_denorm.f);
+    }
+
+    check_fcmp_fast_flags(-3.0f, 7.3f);
+    check_fcmp_fast_flags(3.0f, -7.3f);
+    check_fcmp_fast_flags(3.0f, 3.0f);
+    check_fcmp_fast_flags(3.0f, -INFINITY);
+    check_fcmp_fast_flags(3.0f, INFINITY);
+
+    check_fcmp_fast_flags(least_neg_denorm.f, most_neg_denorm.f);
+    check_fcmp_fast_flags(most_neg_denorm.f, least_neg_denorm.f);
+    check_fcmp_fast_flags(least_neg_denorm.f, least_neg_denorm.f);
+    check_fcmp_fast_flags(least_neg_norm.f, least_neg_denorm.f);
+    check_fcmp_fast_flags(least_pos_denorm.f, most_pos_denorm.f);
+    check_fcmp_fast_flags(most_pos_denorm.f, least_pos_denorm.f);
+    check_fcmp_fast_flags(least_pos_denorm.f, least_pos_denorm.f);
+    check_fcmp_fast_flags(least_pos_denorm.f, least_pos_denorm.f);
+    check_fcmp_fast_flags(least_pos_norm.f, least_pos_denorm.f);
+
+    check_float_fn1(fsqrt, 3.0f);
+
+    // we are returning INFINITE not NAN as we don't support NANs
+//    check_float_fn1(fsqrt, -3.0f);
+    if (rom_version == 1)
+    {
+        // is buggy in rom version 1
+        ASSERT(INFINITY == mufp_funcs->mufp_fsqrt(-3.0f));
+    } else {
+        ASSERT(-INFINITY == mufp_funcs->mufp_fsqrt(-3.0f));
+    }
+
+    check_int_fn1(float2int, 3.f);
+    check_int_fn1(float2int, 123456000000.f);
+    check_int_fn1(float2int, -3.f);
+    check_int_fn1(float2int, -123456000000.f);
+    check_int_fn1(float2int, INFINITY);
+    check_int_fn1(float2int, -INFINITY);
+
+    check_int_fn2(float2fix, 3.f, 3);
+    check_int_fn2(float2fix, 31.f, -3);
+    check_int_fn2(float2fix, -3.f, 3);
+    // todo JURY IS OUT ON THIS ONE
+    //check_int_fn2(float2fix, -31.f, -3);
+
+    check_uint_fn1(float2uint, 3.f);
+    check_uint_fn1(float2uint, 123456000000.f);
+    check_uint_fn1(float2uint, -3.f);
+    check_uint_fn1(float2uint, -123456000000.f);
+
+    check_uint_fn2(float2ufix, 3.f, 3);
+    check_uint_fn2(float2ufix, 3.f, -3);
+
+    check_float_fn1(int2float, 3);
+    check_float_fn1(int2float, INT32_MAX);
+    check_float_fn1(int2float, INT32_MIN);
+    check_float_fn1(int2float, -3);
+
+    check_float_fn2(fix2float, 3, 3);
+    check_float_fn2(fix2float, 3, -3);
+    check_float_fn2(fix2float, -3, 3);
+    check_float_fn2(fix2float, -3, -3);
+
+    check_float_fn1(uint2float, 3);
+    check_float_fn1(uint2float, UINT32_MAX);
+
+//    float (*mufp_ufix2float)(uint, int);
+
+    check_float_fn1(fcos, 0.0f);
+    check_float_fn1(fcos, 2.7f);
+    check_float_fn1(fcos, -32.7f);
+
+    check_float_fn1(fsin, 0.0f);
+    check_float_fn1(fsin, 2.7f);
+    check_float_fn1(fsin, -32.7f);
+
+    check_float_fn1(ftan, 0.0f);
+    check_float_fn1(ftan, 2.7f);
+    check_float_fn1(ftan, -32.7f);
+
+    if (rom_version > 1) {
+        // broken on rom v 1
+        // todo check broken range
+        check_float_fn2(fatan2, 3.0f, 4.0f);
+        check_float_fn2(fatan2, -3.0f, 4.0f);
+        check_float_fn2(fatan2, 4.0f, -.31f);
+        check_float_fn2(fatan2, -3.0f, -.17f);
+    }
+
+    check_float_fn1(fexp, 0.0f);
+    check_float_fn1(fexp, 2.7f);
+    check_float_fn1(fexp, -32.7f);
+
+    check_float_fn1(fln, 0.3f);
+    check_float_fn1(fln, 1.0f);
+    check_float_fn1(fln, 2.7f);
+    // we are returning -INFINITE as we don't support NANs
+//    check_float_fn1(fln, -32.7f);
+    ASSERT(-INFINITY == mufp_funcs->mufp_fln(-32.7f));
+
+    if (rom_version > 1) {
+        check_int64_fn1(float2int64, 3.f);
+        check_int64_fn1(float2int64, 123456000000.f);
+        check_int64_fn1(float2int64, 12345678912345.f);
+        check_int64_fn1(float2int64, -3.f);
+        check_int64_fn1(float2int64, -123456000000.f);
+        check_int64_fn1(float2int64, -12345678912345.f);
+
+        // seems like gcc is wrong on this one
+//        check_int64_fn1(float2int64, INFINITY);
+        // so
+        ASSERT(INT64_MAX == mufp_funcs->mufp_float2int64(INFINITY));
+
+        // seems like gcc is wrong on this one
+//        check_int64_fn1(float2int64, -INFINITY);
+        // so
+        ASSERT( INT64_MIN == mufp_funcs->mufp_float2int64(-INFINITY));
+
+
+        check_int64_fn2(float2fix64, 3.f, 3);
+        check_int64_fn2(float2fix64, 31.f, -3);
+        check_int64_fn2(float2fix64, -3.f, 3);
+        // todo JURY IS OUT ON THIS ONE
+        //check_int64_fn2(float2fix64, -31.f, -3);
+
+        check_uint64_fn1(float2uint64, 3.f);
+        check_uint64_fn1(float2uint64, 123456000000.f);
+        check_uint64_fn1(float2uint64, 12345678912345.f);
+        check_uint64_fn1(float2uint64, -3.f);
+        check_uint64_fn1(float2uint64, -123456000000.f);
+        check_uint64_fn1(float2uint64, -12345678912345.f);
+
+        check_uint64_fn2(float2ufix64, 3.f, 3);
+        check_uint64_fn2(float2ufix64, 3.f, 43);
+        check_uint64_fn2(float2ufix64, 3.f, -3);
+        check_uint64_fn2(float2ufix64, 3.f, -43);
+
+#define LARGE 0x1234567800000000ll
+        check_float_fn1(int642float, 3);
+        check_float_fn1(int642float, LARGE);
+        check_float_fn1(int642float, INT32_MAX);
+        check_float_fn1(int642float, INT64_MAX);
+        check_float_fn1(int642float, INT32_MIN);
+        check_float_fn1(int642float, INT64_MIN);
+        check_float_fn1(int642float, -3);
+        check_float_fn1(int642float, -LARGE);
+
+        check_float_fn2(fix642float, 3, 3);
+        check_float_fn2(fix642float, 3, -3);
+        check_float_fn2(fix642float, -3, 3);
+        check_float_fn2(fix642float, -3, -3);
+
+        check_float_fn2(fix642float, LARGE, 3);
+        check_float_fn2(fix642float, LARGE, -3);
+        check_float_fn2(fix642float, -LARGE, 3);
+        check_float_fn2(fix642float, -LARGE, -3);
+
+        check_float_fn1(uint642float, 3);
+        check_float_fn1(uint642float, UINT32_MAX);
+        check_float_fn1(uint642float, UINT64_MAX);
+        check_float_fn1(uint642float, LARGE);
+
+        check_float_fn1(float2double, 3.0f);
+        check_float_fn1(float2double, 3.0e12f);
+        check_float_fn1(float2double, -3.0e12f);
+        check_float_fn1(float2double, -3.131592754123214125f);
+        check_float_fn1(float2double, 3.131592754123214125f);
+        check_float_fn1(float2double, -INFINITY);
+        check_float_fn1(float2double, INFINITY);
+        check_float_fn1(float2double, most_pos_denorm.f);
+        check_float_fn1(float2double, least_pos_denorm.f);
+        check_float_fn1(float2double, least_pos_norm.f);
+        check_float_fn1(float2double, least_neg_denorm.f);
+        check_float_fn1(float2double, least_neg_norm.f);
+        check_float_fn1(float2double, most_neg_denorm.f);
+        check_float_fn1(float2double, 0.f);
+        check_float_fn1(float2double, -0.f);
+    }
+
+    printf("FLOAT OK\n");
+    return 0;
+}
diff --git a/test/tictoc.h b/test/tictoc.h
new file mode 100644
index 0000000..00d432d
--- /dev/null
+++ b/test/tictoc.h
@@ -0,0 +1,15 @@
+/**
+ * Copyright (c) 2020 Raspberry Pi (Trading) Ltd.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+static inline void tictoc_init(void) { // (void) makes this a real prototype; writes 5 to the register at 0xe000e010
+    *(volatile unsigned int *)0xe000e010=5; // enable SYSTICK at core clock; NOTE(review): no reload value is written here - confirm it is set up elsewhere
+}
+
+static inline unsigned int cyc(void) { // (void) makes this a real prototype; returns a timestamp derived from the register at 0xe000e018
+    return (~*(volatile unsigned int *)0xe000e018)<<8; // complement then shift left by 8; presumably turns the 24-bit SysTick down-counter into an up-counting value in the top bits - confirm
+}
+
+
diff --git a/usb_device_tiny/runtime.c b/usb_device_tiny/runtime.c
index 5f320f1..b62191d 100644
--- a/usb_device_tiny/runtime.c
+++ b/usb_device_tiny/runtime.c
@@ -5,8 +5,6 @@
  */
 
 #include "hardware/regs/m0plus.h"
-#include "hardware/regs/sysinfo.h"
-#include "hardware/regs/tbman.h"
 #include "hardware/structs/clocks.h"
 #include "hardware/structs/psm.h"
 #include "hardware/structs/sio.h"
@@ -30,7 +28,7 @@ asm (
 ".hword _dead + 1\n" // should not be called
 ".hword impl_usb_transfer_current_packet_only + 1\n"
 #ifdef USE_PICOBOOT
-".hword impl_rpiboot_cmd_packet + 1\n"
+".hword impl_picoboot_cmd_packet + 1\n"
 #else
 ".hword _dead + 1\n" // should not be called
 #endif
@@ -38,20 +36,14 @@ asm (
 ".hword impl_usb_stream_packet_packet_handler + 1\n"
 ".hword impl_msc_on_sector_stream_chunk + 1\n"
 #ifdef USE_PICOBOOT
-".hword impl_rpiboot_on_stream_chunk + 1\n"
+".hword impl_picoboot_on_stream_chunk + 1\n"
 #endif
 );
 #endif
 
-void *_memcpy(void *dest, const void *src, uint n) {
-    for (uint i = 0; i < n; i++) {
-        ((uint8_t *) dest)[i] = ((const uint8_t *) src)[i];
-    }
-    return dest;
-}
-
 void memset0(void *dest, uint n) {
-    while (n) ((uint8_t *) dest)[--n] = 0;
+    extern void __memset(void *dest, int c, uint n);
+    __memset(dest, 0, n);
 }
 
 volatile bool rebooting;
diff --git a/usb_device_tiny/runtime.h b/usb_device_tiny/runtime.h
index af69aa6..384a643 100644
--- a/usb_device_tiny/runtime.h
+++ b/usb_device_tiny/runtime.h
@@ -36,13 +36,13 @@
 
 #define ROM_FUNC_usb_transfer_current_packet_only 1
 #ifdef USE_PICOBOOT
-#define ROM_FUNC_rpiboot_cmd_packet 2
+#define ROM_FUNC_picoboot_cmd_packet 2
 #endif
 #define ROM_FUNC_msc_cmd_packet 3
 #define ROM_FUNC_usb_stream_packet_packet_handler 4
 #define ROM_FUNC_msc_on_sector_stream_chunk 5
 #ifdef USE_PICOBOOT
-#define ROM_FUNC_rpiboot_on_stream_chunk 6
+#define ROM_FUNC_picoboot_on_stream_chunk 6
 #endif
 
 extern uint8_t _rom_functions[];
@@ -56,11 +56,8 @@ typedef unsigned int uint;
 
 #define count_of(a) (sizeof(a)/sizeof((a)[0]))
 
-extern void *_memcpy(void *dest, const void *src, uint n);
 extern void *__memcpy(void *dest, const void *src, uint n);
-// c version
-//#define memcpy _memcpy
-// Rom good version
+// Rom version
 #define memcpy __memcpy
 extern void memset0(void *dest, uint count);
 void interrupt_enable(uint int_num, bool enable);
diff --git a/usb_device_tiny/scsi_ir.h b/usb_device_tiny/scsi_ir.h
index de6d1cc..7433bfb 100644
--- a/usb_device_tiny/scsi_ir.h
+++ b/usb_device_tiny/scsi_ir.h
@@ -32,7 +32,7 @@ static const struct scsi_inquiry_response scsi_ir = {
         .additional_length = sizeof(struct scsi_inquiry_response) - 4,
         .vendor  = "RPI     ",
         .product = "RP2             ",
-        .version = "1   ",
+        .version = "2   ",
 };
 #endif
 #endif
\ No newline at end of file