Diffstat (limited to 'lib')
-rw-r--r--  lib/darwin/execmem.c            | 432
-rw-r--r--  lib/darwin/inject.c             |  44
-rw-r--r--  lib/darwin/mach-decls.h         |  22
-rw-r--r--  lib/darwin/stop-other-threads.c | 163
-rw-r--r--  lib/darwin/substrate-compat.c   |   2
-rw-r--r--  lib/execmem.h                   |  21
-rw-r--r--  lib/hook-functions.c            |  75
-rw-r--r--  lib/stop-other-threads.h        |   9
-rw-r--r--  lib/substitute.h                |   9
9 files changed, 466 insertions(+), 311 deletions(-)
diff --git a/lib/darwin/execmem.c b/lib/darwin/execmem.c
index 76f0643..3048cd8 100644
--- a/lib/darwin/execmem.c
+++ b/lib/darwin/execmem.c
@@ -1,60 +1,39 @@
+/* define to avoid error that ucontext is "deprecated" (it's unavoidable with
+ * sigaction!) */
+#define _XOPEN_SOURCE 700
+#define _DARWIN_C_SOURCE
+#include "cbit/htab.h"
#include "execmem.h"
-#include "darwin/manual-syscall.h"
+/* #include "darwin/manual-syscall.h" */
+#include "darwin/mach-decls.h"
#include "substitute.h"
+#include "substitute-internal.h"
#include <mach/mach.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <errno.h>
#include <stdio.h>
+#include <stdlib.h>
+#include <ucontext.h>
+#include <signal.h>
-int execmem_write(void *dest, const void *src, size_t len) {
- /* Use vm_region to determine the original protection, so we can mprotect
- * it back afterwards. (Note: PROT_* are equal to VM_PROT_*.) */
- vm_address_t region = (vm_address_t) dest;
- vm_size_t region_len = 0;
- struct vm_region_submap_short_info_64 info;
- mach_msg_type_number_t info_count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
- natural_t max_depth = 99999;
- kern_return_t kr = vm_region_recurse_64(mach_task_self(), &region, &region_len,
- &max_depth,
- (vm_region_recurse_info_t) &info,
- &info_count);
- if (kr) {
- /* Weird; this probably means the region doesn't exist, but we should
- * have already read from the memory in order to generate the patch. */
- errno = 0;
- return SUBSTITUTE_ERR_VM;
- }
+#define port_hash(portp) (*(portp))
+#define port_eq(port1p, port2p) (*(port1p) == *(port2p))
+#define port_null(portp) (*(portp) == MACH_PORT_NULL)
+DECL_STATIC_HTAB_KEY(mach_port_t, mach_port_t, port_hash, port_eq, port_null, 0);
+struct empty {};
+DECL_HTAB(mach_port_set, mach_port_t, struct empty);
- uintptr_t lopage = (uintptr_t) dest & ~PAGE_MASK;
- uintptr_t hipage = ((uintptr_t) dest + len + PAGE_MASK) & ~PAGE_MASK;
-
- /* We do the syscall manually just in case the user is trying to write to
+ /* ORPHAN: We do the syscall manually just in case the user is trying to write to
* the mprotect syscall stub itself, or one of the functions it calls.
* (Obviously, it will still break if the user targets some libsubstitute
* function within the same page as this one, though.) */
- int ret = manual_syscall(SYS_mprotect, lopage, hipage - lopage,
- PROT_READ | PROT_WRITE, 0);
- if (ret) {
- errno = ret;
- return SUBSTITUTE_ERR_VM;
- }
- /* volatile to avoid compiler transformation to call to memcpy */
- volatile uint8_t *d8 = dest;
- const uint8_t *s8 = src;
- while (len--)
- *d8++ = *s8++;
-
- int oldprot = info.protection & (PROT_READ | PROT_WRITE | PROT_EXEC);
- ret = manual_syscall(SYS_mprotect, lopage, hipage - lopage,
- oldprot, 0);
- if (ret) {
- errno = ret;
- return SUBSTITUTE_ERR_VM;
- }
- return SUBSTITUTE_OK;
-}
+/* This should only run on the main thread, so just use globals. */
+static HTAB_STORAGE(mach_port_set) g_suspended_ports;
+static struct sigaction old_segv, old_bus;
+static execmem_pc_patch_callback g_pc_patch_callback;
+static void *g_pc_patch_callback_ctx;
int execmem_alloc_unsealed(uintptr_t hint, void **page_p, size_t *size_p) {
*size_p = PAGE_SIZE;
@@ -74,3 +53,368 @@ int execmem_seal(void *page) {
void execmem_free(void *page) {
munmap(page, PAGE_SIZE);
}
+
+#if defined(__x86_64__)
+ typedef struct __darwin_x86_thread_state64 native_thread_state;
+ #define NATIVE_THREAD_STATE_FLAVOR x86_THREAD_STATE64
+#elif defined(__i386__)
+ typedef struct __darwin_i386_thread_state native_thread_state;
+ #define NATIVE_THREAD_STATE_FLAVOR x86_THREAD_STATE32
+#elif defined(__arm__)
+ typedef struct __darwin_arm_thread_state native_thread_state;
+ #define NATIVE_THREAD_STATE_FLAVOR ARM_THREAD_STATE
+#elif defined(__arm64__)
+ typedef struct __darwin_arm_thread_state64 native_thread_state;
+ #define NATIVE_THREAD_STATE_FLAVOR ARM_THREAD_STATE64
+#else
+ #error ?
+#endif
+
+/* returns whether it changed */
+static bool apply_one_pcp_with_state(native_thread_state *state,
+ execmem_pc_patch_callback callback,
+ void *ctx) {
+
+ uintptr_t *pcp;
+#if defined(__x86_64__)
+ pcp = (uintptr_t *) &state->__rip;
+#elif defined(__i386__)
+ pcp = (uintptr_t *) &state->__eip;
+#elif defined(__arm__) || defined(__arm64__)
+ pcp = (uintptr_t *) &state->__pc;
+#endif
+ uintptr_t old = *pcp;
+#ifdef __arm__
+ /* thumb */
+ if (state->__cpsr & 0x20)
+ old |= 1;
+#endif
+ uintptr_t new = callback(ctx, old);
+ bool changed = new != old;
+ *pcp = new;
+#ifdef __arm__
+ *pcp &= ~1;
+ state->__cpsr = (state->__cpsr & ~0x20) | ((new & 1) * 0x20);
+#endif
+ return changed;
+}
+
+static int apply_one_pcp(mach_port_t thread, execmem_pc_patch_callback callback,
+ void *ctx) {
+ native_thread_state state;
+ mach_msg_type_number_t real_cnt = sizeof(state) / sizeof(int);
+ mach_msg_type_number_t cnt = real_cnt;
+ kern_return_t kr = thread_get_state(thread, NATIVE_THREAD_STATE_FLAVOR,
+ (thread_state_t) &state, &cnt);
+ if (kr == KERN_TERMINATED)
+ return SUBSTITUTE_OK;
+ if (kr || cnt != real_cnt)
+ return SUBSTITUTE_ERR_ADJUSTING_THREADS;
+
+ if (apply_one_pcp_with_state(&state, callback, ctx)) {
+ kr = thread_set_state(thread, NATIVE_THREAD_STATE_FLAVOR,
+ (thread_state_t) &state, real_cnt);
+ if (kr)
+ return SUBSTITUTE_ERR_ADJUSTING_THREADS;
+ }
+ return SUBSTITUTE_OK;
+}
+
+static void resume_other_threads();
+
+static int stop_other_threads() {
+ /* pthread_main should have already been checked. */
+
+ int ret;
+ mach_port_t self = mach_thread_self();
+
+ /* The following shenanigans are for catching any new threads that are
+ * created while we're looping, without suspending anything twice. Keep
+ * looping until only threads we already suspended before this loop are
+ * there. */
+ HTAB_STORAGE_INIT(&g_suspended_ports, mach_port_set);
+ struct htab_mach_port_set *suspended_set = &g_suspended_ports.h;
+
+ bool got_new;
+ do {
+ got_new = false;
+
+ thread_act_port_array_t ports;
+ mach_msg_type_number_t nports;
+
+ kern_return_t kr = task_threads(mach_task_self(), &ports, &nports);
+ if (kr) { /* ouch */
+ ret = SUBSTITUTE_ERR_ADJUSTING_THREADS;
+ goto fail;
+ }
+
+ for (mach_msg_type_number_t i = 0; i < nports; i++) {
+ mach_port_t port = ports[i];
+ struct htab_bucket_mach_port_set *bucket;
+ if (port == self ||
+ (bucket = htab_setbucket_mach_port_set(suspended_set, &port),
+ bucket->key)) {
+ /* already suspended, ignore */
+ mach_port_deallocate(mach_task_self(), port);
+ } else {
+ got_new = true;
+ kr = thread_suspend(port);
+ if (kr == KERN_TERMINATED) {
+ /* too late */
+ mach_port_deallocate(mach_task_self(), port);
+ } else if (kr) {
+ ret = SUBSTITUTE_ERR_ADJUSTING_THREADS;
+ for (; i < nports; i++)
+ mach_port_deallocate(mach_task_self(), ports[i]);
+ vm_deallocate(mach_task_self(), (vm_address_t) ports,
+ nports * sizeof(*ports));
+ goto fail;
+ }
+ bucket->key = port;
+ }
+ }
+ vm_deallocate(mach_task_self(), (vm_address_t) ports,
+ nports * sizeof(*ports));
+ } while(got_new);
+
+ /* Success - keep the set around for when we're done. */
+ return SUBSTITUTE_OK;
+
+fail:
+ resume_other_threads();
+ return ret;
+}
+
+static void resume_other_threads() {
+ struct htab_mach_port_set *suspended_set = &g_suspended_ports.h;
+ HTAB_FOREACH(suspended_set, mach_port_t *threadp,
+ UNUSED struct empty *_,
+ mach_port_set) {
+ thread_resume(*threadp);
+ mach_port_deallocate(mach_task_self(), *threadp);
+ }
+ htab_free_storage_mach_port_set(suspended_set);
+}
+
+static void segfault_handler(UNUSED int sig, UNUSED siginfo_t *info,
+ void *uap_) {
+ /* We didn't catch it before it segfaulted so have to fix it up here. */
+ ucontext_t *uap = uap_;
+ apply_one_pcp_with_state(&uap->uc_mcontext->__ss, g_pc_patch_callback,
+ g_pc_patch_callback_ctx);
+ /* just let it continue, whatever */
+}
+
+static int init_pc_patch(execmem_pc_patch_callback callback, void *ctx) {
+ g_pc_patch_callback = callback;
+ g_pc_patch_callback_ctx = ctx;
+ int ret;
+ if ((ret = stop_other_threads()))
+ return ret;
+
+ struct sigaction sa;
+ memset(&sa, 0, sizeof(sa));
+ sa.sa_sigaction = segfault_handler;
+ sigfillset(&sa.sa_mask);
+ sa.sa_flags = SA_RESTART | SA_NODEFER | SA_SIGINFO;
+
+ if (sigaction(SIGSEGV, &sa, &old_segv))
+ return SUBSTITUTE_ERR_ADJUSTING_THREADS;
+ if (sigaction(SIGBUS, &sa, &old_bus)) {
+ sigaction(SIGSEGV, &old_segv, NULL);
+ return SUBSTITUTE_ERR_ADJUSTING_THREADS;
+ }
+ return SUBSTITUTE_OK;
+}
+
+static int run_pc_patch() {
+ int ret;
+
+ struct htab_mach_port_set *suspended_set = &g_suspended_ports.h;
+ HTAB_FOREACH(suspended_set, mach_port_t *threadp,
+ UNUSED struct empty *_,
+ mach_port_set) {
+ if ((ret = apply_one_pcp(*threadp, g_pc_patch_callback,
+ g_pc_patch_callback_ctx)))
+ return ret;
+ }
+
+ return SUBSTITUTE_OK;
+}
+
+static int finish_pc_patch() {
+ if (sigaction(SIGBUS, &old_bus, NULL) ||
+ sigaction(SIGSEGV, &old_segv, NULL))
+ return SUBSTITUTE_ERR_ADJUSTING_THREADS;
+
+ resume_other_threads();
+ return SUBSTITUTE_OK;
+}
+
+static int compare_dsts(const void *a, const void *b) {
+ void *dst_a = ((struct execmem_foreign_write *) a)->dst;
+ void *dst_b = ((struct execmem_foreign_write *) b)->dst;
+ return dst_a < dst_b ? -1 : dst_a > dst_b ? 1 : 0;
+}
+
+static kern_return_t get_page_prot(uintptr_t ptr, vm_prot_t *prot,
+ vm_inherit_t *inherit) {
+
+ vm_address_t region = (vm_address_t) ptr;
+ vm_size_t region_len = 0;
+ struct vm_region_submap_short_info_64 info;
+ mach_msg_type_number_t info_count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
+ natural_t max_depth = 99999;
+ kern_return_t kr = vm_region_recurse_64(mach_task_self(), &region, &region_len,
+ &max_depth,
+ (vm_region_recurse_info_t) &info,
+ &info_count);
+ *prot = info.protection & (PROT_READ | PROT_WRITE | PROT_EXEC);
+ *inherit = info.inheritance;
+ return kr;
+}
+
+static void manual_memcpy(void *restrict dest, const void *src, size_t len) {
+ /* volatile to avoid compiler transformation to call to memcpy */
+ volatile uint8_t *d8 = dest;
+ const uint8_t *s8 = src;
+ while (len--)
+ *d8++ = *s8++;
+}
+
+int execmem_foreign_write_with_pc_patch(struct execmem_foreign_write *writes,
+ size_t nwrites,
+ execmem_pc_patch_callback callback,
+ void *callback_ctx) {
+ int ret;
+
+ qsort(writes, nwrites, sizeof(*writes), compare_dsts);
+
+ size_t last;
+ for (size_t first = 0; first < nwrites; first = last + 1) {
+ const struct execmem_foreign_write *first_write = &writes[first];
+ uintptr_t page_start = (uintptr_t) first_write->dst & ~PAGE_MASK;
+ uintptr_t page_end = ((uintptr_t) first_write->dst +
+ first_write->len - 1) & ~PAGE_MASK;
+
+ last = first;
+ while (last + 1 < nwrites) {
+ const struct execmem_foreign_write *write = &writes[last + 1];
+ uintptr_t this_start = (uintptr_t) write->dst & ~PAGE_MASK;
+ uintptr_t this_end = ((uintptr_t) write->dst +
+ write->len - 1) & ~PAGE_MASK;
+ if (page_start <= this_start && this_start <= page_end) {
+ if (this_end > page_end)
+ page_end = this_end;
+ } else if (page_start <= this_end && this_end <= page_end) {
+ if (this_start < page_start)
+ page_start = this_start;
+ } else {
+ break;
+ }
+ last++;
+ }
+ size_t len = page_end - page_start + PAGE_SIZE;
+
+ vm_prot_t prot;
+ vm_inherit_t inherit;
+ /* Assume that a single patch region will be pages of all the same
+ * protection, since the alternative is probably someone doing
+ * something wrong. */
+ kern_return_t kr = get_page_prot(page_start, &prot, &inherit);
+ if (kr) {
+ /* Weird; this probably means the region doesn't exist, but we should
+ * have already read from the memory in order to generate the patch. */
+ return SUBSTITUTE_ERR_VM;
+ }
+ /* Instead of trying to set the existing region to write, which may
+ * fail due to max_protection, we make a fresh copy and remap it over
+ * the original. */
+ void *new = mmap(NULL, len, PROT_READ | PROT_WRITE,
+ MAP_ANON | MAP_SHARED, -1, 0);
+ if (new == MAP_FAILED)
+ return SUBSTITUTE_ERR_VM;
+ /* Ideally, if the original page wasn't mapped anywhere else, no actual
+ * copy will take place: new will be CoW, then we unmap the original so
+ * new becomes the sole owner before actually writing. Though, for all
+ * I know, these trips through the VM system could be slower than just
+ * memcpying a page or two... */
+ kr = vm_copy(mach_task_self(), page_start, len, (vm_address_t) new);
+ if (kr) {
+ ret = SUBSTITUTE_ERR_VM;
+ goto fail_unmap;
+ }
+ if (callback) {
+ /* Set the segfault handler - stopping all other threads before
+ * doing so in case they were using it for something (this
+ * happens). One might think the latter makes segfaults
+ * impossible, but we can't prevent injectors from making new
+ * threads that might run during this process. Hopefully no
+ * *injected* threads try to use segfault handlers for something!
+ */
+ if ((ret = init_pc_patch(callback, callback_ctx)))
+ goto fail_unmap;
+ }
+ /* Disable access to the page so anyone trying to execute there
+ * will segfault. */
+ if (mmap((void *) page_start, len, PROT_NONE,
+ MAP_ANON | MAP_SHARED | MAP_FIXED, -1, 0) == MAP_FAILED) {
+ ret = SUBSTITUTE_ERR_VM;
+ goto fail_unmap;
+ }
+ /* Write patches to the copy. */
+ for (size_t i = first; i <= last; i++) {
+ struct execmem_foreign_write *write = &writes[i];
+ ptrdiff_t off = (uintptr_t) write->dst - page_start;
+ manual_memcpy(new + off, write->src, write->len);
+ }
+ if (callback) {
+ /* Actually run the callback for any threads which are paused at an
+ * affected PC, or are running and don't get scheduled by the
+ * kernel in time to segfault. Any thread which moves to an
+ * affected PC *after* run_pc_patch() is assumed to do so by
+ * calling the function in question, so they can't get past the
+ * first instruction and it doesn't matter whether or not they're
+ * patched. (A call instruction within the affected region would
+ * break this assumption, as then a thread could move to an
+ * affected PC by returning.) */
+ if ((ret = run_pc_patch()))
+ goto fail_unmap;
+ }
+
+ /* Protect new like the original, and move it into place. */
+ vm_address_t target = page_start;
+ if (mprotect(new, len, prot)) {
+ ret = SUBSTITUTE_ERR_VM;
+ goto fail_unmap;
+ }
+ vm_prot_t c, m;
+ kr = vm_remap(mach_task_self(), &target, len, 0, VM_FLAGS_OVERWRITE,
+ mach_task_self(), (vm_address_t) new, /*copy*/ FALSE,
+ &c, &m, inherit);
+ if (kr) {
+ ret = SUBSTITUTE_ERR_VM;
+ goto fail_unmap;
+ }
+ /* ignore errors... */
+ munmap(new, len);
+ if (callback) {
+ /* Other threads are no longer in danger of segfaulting, so put
+ * back the old segfault handler. */
+ if ((ret = finish_pc_patch()))
+ return ret;
+ }
+
+ continue;
+
+ fail_unmap:
+ /* This is probably useless, since the original page is gone
+ * forever (intentionally, see above). May as well arrange the
+ * deck chairs, though. */
+ munmap(new, len);
+ return ret;
+ }
+
+ /* Shockingly, we made it out! */
+ return SUBSTITUTE_OK;
+}
+
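A minimal standalone sketch of the copy-and-remap trick used by the new execmem_foreign_write_with_pc_patch above: duplicate the target page into a fresh writable mapping, edit the copy, then vm_remap() it over the original with VM_FLAGS_OVERWRITE so the original mapping's max_protection never has to allow writes. This is not part of the patch; the helper name and the fixed r-x protection are assumptions for illustration, and error handling is reduced to the minimum.

    #include <mach/mach.h>
    #include <sys/mman.h>
    #include <string.h>

    /* Patch `patch_len` bytes at offset `off` inside the page at `page`. */
    static kern_return_t overwrite_ro_page(void *page, size_t off,
                                           const void *patch, size_t patch_len) {
        /* fresh anonymous page we are allowed to write to */
        void *copy = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
                          MAP_ANON | MAP_SHARED, -1, 0);
        if (copy == MAP_FAILED)
            return KERN_FAILURE;
        /* duplicate the original contents into the copy (CoW where possible) */
        kern_return_t kr = vm_copy(mach_task_self(), (vm_address_t) page,
                                   PAGE_SIZE, (vm_address_t) copy);
        if (kr) {
            munmap(copy, PAGE_SIZE);
            return kr;
        }
        memcpy((char *) copy + off, patch, patch_len);      /* edit the copy */
        mprotect(copy, PAGE_SIZE, PROT_READ | PROT_EXEC);   /* match the original */
        /* move the copy into place over the original address */
        vm_address_t target = (vm_address_t) page;
        vm_prot_t cur, max;
        kr = vm_remap(mach_task_self(), &target, PAGE_SIZE, 0, VM_FLAGS_OVERWRITE,
                      mach_task_self(), (vm_address_t) copy, /*copy*/ FALSE,
                      &cur, &max, VM_INHERIT_DEFAULT);
        munmap(copy, PAGE_SIZE);
        return kr;
    }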
diff --git a/lib/darwin/inject.c b/lib/darwin/inject.c
index 6ec07fa..6e0c50e 100644
--- a/lib/darwin/inject.c
+++ b/lib/darwin/inject.c
@@ -646,10 +646,14 @@ int substitute_dlopen_in_pid(int pid, const char *filename, int options,
goto fail;
union {
- struct _x86_thread_state_32 x32;
- struct _x86_thread_state_64 x64;
- struct _arm_thread_state_32 a32;
- struct _arm_thread_state_64 a64;
+#if defined(__x86_64__) || defined(__i386__)
+ struct __darwin_i386_thread_state x32;
+ struct __darwin_x86_thread_state64 x64;
+#endif
+#if defined(__arm__) || defined(__arm64__)
+ struct __darwin_arm_thread_state a32;
+ struct __darwin_arm_thread_state64 a64;
+#endif
} u;
size_t state_size;
thread_state_flavor_t flavor;
@@ -658,34 +662,34 @@ int substitute_dlopen_in_pid(int pid, const char *filename, int options,
switch (cputype) {
#if defined(__x86_64__) || defined(__i386__)
case CPU_TYPE_X86_64:
- u.x64.rsp = target_stack_top;
- u.x64.rdi = target_stack_top;
- u.x64.rip = target_code_page + (inject_start_x86_64 - inject_page_start);
+ u.x64.__rsp = target_stack_top;
+ u.x64.__rdi = target_stack_top;
+ u.x64.__rip = target_code_page + (inject_start_x86_64 - inject_page_start);
state_size = sizeof(u.x64);
- flavor = _x86_thread_state_64_flavor;
+ flavor = x86_THREAD_STATE64;
break;
case CPU_TYPE_I386:
- u.x32.esp = target_stack_top;
- u.x32.ecx = target_stack_top;
- u.x32.eip = target_code_page + (inject_start_i386 - inject_page_start);
+ u.x32.__esp = target_stack_top;
+ u.x32.__ecx = target_stack_top;
+ u.x32.__eip = target_code_page + (inject_start_i386 - inject_page_start);
state_size = sizeof(u.x32);
- flavor = _x86_thread_state_32_flavor;
+ flavor = x86_THREAD_STATE32;
break;
#endif
#if defined(__arm__) || defined(__arm64__)
case CPU_TYPE_ARM:
- u.a32.sp = target_stack_top;
- u.a32.r[0] = target_stack_top;
- u.a32.pc = target_code_page + (inject_start_arm - inject_page_start);
+ u.a32.__sp = target_stack_top;
+ u.a32.__r[0] = target_stack_top;
+ u.a32.__pc = target_code_page + (inject_start_arm - inject_page_start);
state_size = sizeof(u.a32);
- flavor = _arm_thread_state_32_flavor;
+ flavor = ARM_THREAD_STATE;
break;
case CPU_TYPE_ARM64:
- u.a64.sp = target_stack_top;
- u.a64.x[0] = target_stack_top;
- u.a64.pc = target_code_page + (inject_start_arm64 - inject_page_start);
+ u.a64.__sp = target_stack_top;
+ u.a64.__x[0] = target_stack_top;
+ u.a64.__pc = target_code_page + (inject_start_arm64 - inject_page_start);
state_size = sizeof(u.a64);
- flavor = _arm_thread_state_64_flavor;
+ flavor = ARM_THREAD_STATE64;
break;
#endif
default:
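For reference, a tiny sketch (not from the patch) of the same official flavors and double-underscore field names used in the read direction, fetching a thread's PC on x86_64; errors are collapsed to returning 0.

    #include <mach/mach.h>
    #include <stdint.h>

    #if defined(__x86_64__)
    static uint64_t thread_pc(thread_act_t thread) {
        x86_thread_state64_t state;
        mach_msg_type_number_t cnt = x86_THREAD_STATE64_COUNT;
        if (thread_get_state(thread, x86_THREAD_STATE64,
                             (thread_state_t) &state, &cnt) != KERN_SUCCESS)
            return 0;
        return state.__rip;   /* __DARWIN_UNIX03 spelling, as in the hunk above */
    }
    #endif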
diff --git a/lib/darwin/mach-decls.h b/lib/darwin/mach-decls.h
index 29ea908..b1c7af6 100644
--- a/lib/darwin/mach-decls.h
+++ b/lib/darwin/mach-decls.h
@@ -1,26 +1,6 @@
#pragma once
#include <stdint.h>
-
-struct _x86_thread_state_32 {
- uint32_t eax, ebx, ecx, edx, edi, esi, ebp, esp;
- uint32_t ss, eflags, eip, cs, ds, es, fs, gs;
-};
-#define _x86_thread_state_32_flavor 1
-struct _x86_thread_state_64 {
- uint64_t rax, rbx, rcx, rdx, rdi, rsi, rbp, rsp;
- uint64_t r8, r9, r10, r11, r12, r13, r14, r15;
- uint64_t rip, rflags, cs, fs, gs;
-};
-#define _x86_thread_state_64_flavor 4
-struct _arm_thread_state_32 {
- uint32_t r[13], sp, lr, pc, cpsr;
-};
-#define _arm_thread_state_32_flavor 9
-struct _arm_thread_state_64 {
- uint64_t x[29], fp, lr, sp, pc;
- uint32_t cpsr, pad;
-};
-#define _arm_thread_state_64_flavor 6
+#include <mach/mach.h>
kern_return_t mach_vm_read_overwrite(vm_map_t, mach_vm_address_t, mach_vm_size_t, mach_vm_address_t, mach_vm_size_t *);
kern_return_t mach_vm_remap(vm_map_t, mach_vm_address_t *, mach_vm_size_t, mach_vm_offset_t, int, vm_map_t, mach_vm_address_t, boolean_t, vm_prot_t *, vm_prot_t *, vm_inherit_t);
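The two prototypes kept in mach-decls.h are the mach_vm_* variants with 64-bit-clean address types. A hypothetical helper (illustration only; the task port is assumed to come from task_for_pid() or similar elsewhere) might use the first one like this:

    #include "darwin/mach-decls.h"
    #include <mach/mach.h>
    #include <stdint.h>

    static kern_return_t read_remote_u64(mach_port_t task, mach_vm_address_t addr,
                                         uint64_t *out) {
        mach_vm_size_t nread = 0;
        kern_return_t kr = mach_vm_read_overwrite(task, addr, sizeof(*out),
                                                  (mach_vm_address_t) (uintptr_t) out,
                                                  &nread);
        if (kr)
            return kr;
        return nread == sizeof(*out) ? KERN_SUCCESS : KERN_FAILURE;
    }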
diff --git a/lib/darwin/stop-other-threads.c b/lib/darwin/stop-other-threads.c
deleted file mode 100644
index ff239f3..0000000
--- a/lib/darwin/stop-other-threads.c
+++ /dev/null
@@ -1,163 +0,0 @@
-#include "substitute.h"
-#include "substitute-internal.h"
-#include "darwin/mach-decls.h"
-#include "stop-other-threads.h"
-#include "cbit/htab.h"
-#include <pthread.h>
-#include <mach/mach.h>
-
-#define port_hash(portp) (*(portp))
-#define port_eq(port1p, port2p) (*(port1p) == *(port2p))
-#define port_null(portp) (*(portp) == MACH_PORT_NULL)
-DECL_STATIC_HTAB_KEY(mach_port_t, mach_port_t, port_hash, port_eq, port_null, 0);
-struct empty {};
-DECL_HTAB(mach_port_set, mach_port_t, struct empty);
-
-static bool apply_one_pcp(mach_port_t thread,
- uintptr_t (*callback)(void *ctx, uintptr_t pc),
- void *ctx) {
- int flavor;
-#if defined(__x86_64__)
- struct _x86_thread_state_64 state;
- flavor = _x86_thread_state_64_flavor;
-#elif defined(__i386__)
- struct _x86_thread_state_32 state;
- flavor = _x86_thread_state_32_flavor;
-#elif defined(__arm__)
- struct _arm_thread_state_32 state;
- flavor = _arm_thread_state_32_flavor;
-#elif defined(__arm64__)
- struct _arm_thread_state_64 state;
- flavor = _arm_thread_state_64_flavor;
-#else
- #error ?
-#endif
-
- mach_msg_type_number_t real_cnt = sizeof(state) / sizeof(int);
- mach_msg_type_number_t cnt = real_cnt;
- kern_return_t kr = thread_get_state(thread, flavor, (thread_state_t) &state, &cnt);
- if (kr || cnt != real_cnt)
- return false;
-
- uintptr_t *pcp;
-#if defined(__x86_64__)
- pcp = (uintptr_t *) &state.rip;
-#elif defined(__i386__)
- pcp = (uintptr_t *) &state.eip;
-#elif defined(__arm__) || defined(__arm64__)
- pcp = (uintptr_t *) &state.pc;
-#endif
- uintptr_t old = *pcp;
-#ifdef __arm__
- /* thumb */
- if (state.cpsr & 0x20)
- old |= 1;
-#endif
- uintptr_t new = callback(ctx, *pcp);
- if (new != old) {
- *pcp = new;
-#ifdef __arm__
- *pcp &= ~1;
- state.cpsr = (state.cpsr & ~0x20) | ((new & 1) * 0x20);
-#endif
- kr = thread_set_state(thread, flavor, (thread_state_t) &state, real_cnt);
- if (kr)
- return false;
- }
- return true;
-}
-
-int stop_other_threads(void **token_ptr) {
- if (!pthread_main_np())
- return SUBSTITUTE_ERR_NOT_ON_MAIN_THREAD;
-
- int ret;
- mach_port_t self = mach_thread_self();
-
- /* The following shenanigans are for catching any new threads that are
- * created while we're looping, without suspending anything twice. Keep
- * looping until only threads we already suspended before this loop are
- * there. */
- HTAB_STORAGE(mach_port_set) *hs = malloc(sizeof(*hs));
- HTAB_STORAGE_INIT(hs, mach_port_set);
- struct htab_mach_port_set *suspended_set = &hs->h;
-
- thread_act_array_t ports = 0;
- mach_msg_type_number_t nports = 0;
-
- bool got_new = true;
- while (got_new) {
- got_new = false;
-
- kern_return_t kr = task_threads(mach_task_self(), &ports, &nports);
- if (kr) { /* ouch */
- ret = SUBSTITUTE_ERR_ADJUSTING_THREADS;
- goto fail;
- }
-
- for (mach_msg_type_number_t i = 0; i < nports; i++) {
- mach_port_t port = ports[i];
- struct htab_bucket_mach_port_set *bucket;
- if (port == self ||
- (bucket = htab_setbucket_mach_port_set(suspended_set, &port),
- bucket->key)) {
- /* already suspended, ignore */
- mach_port_deallocate(mach_task_self(), port);
- } else {
- got_new = true;
- kr = thread_suspend(port);
- if (kr == KERN_TERMINATED) {
- /* too late */
- mach_port_deallocate(mach_task_self(), port);
- } else if (kr) {
- ret = SUBSTITUTE_ERR_ADJUSTING_THREADS;
- for (; i < nports; i++)
- mach_port_deallocate(mach_task_self(), ports[i]);
- vm_deallocate(mach_task_self(), (vm_address_t) ports,
- nports * sizeof(*ports));
- goto fail;
- }
- bucket->key = port;
- }
- }
- vm_deallocate(mach_task_self(), (vm_address_t) ports,
- nports * sizeof(*ports));
- }
-
- /* Success - keep the set around for when we're done. */
- *token_ptr = suspended_set;
- return SUBSTITUTE_OK;
-
-fail:
- resume_other_threads(suspended_set);
- return ret;
-}
-
-int apply_pc_patch_callback(void *token,
- uintptr_t (*pc_patch_callback)(void *ctx, uintptr_t pc),
- void *ctx) {
- struct htab_mach_port_set *suspended_set = token;
- int ret = SUBSTITUTE_OK;
- HTAB_FOREACH(suspended_set, mach_port_t *threadp,
- UNUSED struct empty *_,
- mach_port_set) {
- if (!apply_one_pcp(*threadp, pc_patch_callback, ctx)) {
- ret = SUBSTITUTE_ERR_ADJUSTING_THREADS;
- break;
- }
- }
- return ret;
-}
-
-int resume_other_threads(void *token) {
- struct htab_mach_port_set *suspended_set = token;
- HTAB_FOREACH(suspended_set, mach_port_t *threadp,
- UNUSED struct empty *_,
- mach_port_set) {
- thread_resume(*threadp);
- mach_port_deallocate(mach_task_self(), *threadp);
- }
- htab_free_storage_mach_port_set(suspended_set);
- free(suspended_set);
- return SUBSTITUTE_OK; /* eh */
-}
diff --git a/lib/darwin/substrate-compat.c b/lib/darwin/substrate-compat.c
index 2746795..2cdcf6f 100644
--- a/lib/darwin/substrate-compat.c
+++ b/lib/darwin/substrate-compat.c
@@ -43,7 +43,7 @@ EXPORT
void SubHookFunction(void *symbol, void *replace, void **result) __asm__("SubHookFunction");
void SubHookFunction(void *symbol, void *replace, void **result) {
struct substitute_function_hook hook = {symbol, replace, result};
- int ret = substitute_hook_functions(&hook, 1, SUBSTITUTE_DONT_STOP_THREADS);
+ int ret = substitute_hook_functions(&hook, 1, SUBSTITUTE_NO_THREAD_SAFETY);
if (ret) {
panic("SubHookFunction: substitute_hook_functions returned %s\n",
substitute_strerror(ret));
diff --git a/lib/execmem.h b/lib/execmem.h
index b4860e9..895769d 100644
--- a/lib/execmem.h
+++ b/lib/execmem.h
@@ -1,9 +1,22 @@
#pragma once
-#include <stdlib.h>
-/* Write to a foreign page which is already RX / with unknown permissions. */
-int execmem_write(void *dest, const void *src, size_t len);
-
+#include <sys/types.h>
/* For allocating trampolines - this is just a mmap wrapper. */
int execmem_alloc_unsealed(uintptr_t hint, void **page_p, size_t *size_p);
int execmem_seal(void *page);
void execmem_free(void *page);
+
+/* Write to foreign pages which are already RX or have unknown permissions.
+ * If callback is not NULL, run it on all other threads 'atomically', in the
+ * sense that it will be called on any thread which executed any of the old
+ * instructions in the write region.
+ * Oh, and it might mutate writes (to sort it). */
+struct execmem_foreign_write {
+ void *dst;
+ const void *src;
+ size_t len;
+};
+typedef uintptr_t (*execmem_pc_patch_callback)(void *ctx, uintptr_t pc);
+int execmem_foreign_write_with_pc_patch(struct execmem_foreign_write *writes,
+ size_t nwrites,
+ execmem_pc_patch_callback callback,
+ void *callback_ctx);
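A hypothetical caller of the new interface (illustration only; patch_pc() and the byte buffers are made up): batch two patches in one call and let the callback move any thread whose PC is parked inside the old bytes.

    #include "execmem.h"
    #include <stdint.h>
    #include <stddef.h>

    /* Assumed to map an old PC inside a patched region to a safe PC, e.g. the
     * equivalent offset inside a trampoline. */
    extern uintptr_t patch_pc(void *ctx, uintptr_t pc);

    static int patch_two_sites(void *site_a, const void *bytes_a, size_t len_a,
                               void *site_b, const void *bytes_b, size_t len_b) {
        struct execmem_foreign_write writes[] = {
            { .dst = site_a, .src = bytes_a, .len = len_a },
            { .dst = site_b, .src = bytes_b, .len = len_b },
        };
        /* note: the array may be reordered in place (it is sorted by dst) */
        return execmem_foreign_write_with_pc_patch(writes, 2, patch_pc, NULL);
    }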
diff --git a/lib/hook-functions.c b/lib/hook-functions.c
index 7db06d4..5d1f1d5 100644
--- a/lib/hook-functions.c
+++ b/lib/hook-functions.c
@@ -4,8 +4,8 @@
#include "jump-dis.h"
#include "transform-dis.h"
#include "execmem.h"
-#include "stop-other-threads.h"
#include stringify(TARGET_DIR/jump-patch.h)
+#include <pthread.h>
struct hook_internal {
int offset_by_pcdiff[MAX_JUMP_PATCH_SIZE + 1];
@@ -16,6 +16,7 @@ struct hook_internal {
/* page allocated with execmem_alloc_unsealed - only if we had to allocate
* one when processing this hook */
void *trampoline_page;
+ struct arch_dis_ctx arch_dis_ctx;
};
struct pc_callback_info {
@@ -125,22 +126,21 @@ skip_after:;
EXPORT
int substitute_hook_functions(const struct substitute_function_hook *hooks,
size_t nhooks, int options) {
- struct hook_internal *his = malloc(nhooks * sizeof(*his));
+ bool thread_safe = !(options & SUBSTITUTE_NO_THREAD_SAFETY);
+ if (thread_safe && !pthread_main_np())
+ return SUBSTITUTE_ERR_NOT_ON_MAIN_THREAD;
+
+ struct execmem_foreign_write *fws;
+ struct hook_internal *his = malloc(nhooks * sizeof(*his) +
+ nhooks * sizeof(*fws));
if (!his)
return SUBSTITUTE_ERR_OOM;
+ fws = (void *) (his + nhooks);
for (size_t i = 0; i < nhooks; i++)
his[i].trampoline_page = NULL;
int ret = SUBSTITUTE_OK;
- ssize_t emw_finished_i = -1;
- bool stopped = false;
- void *stop_token;
- if (!(options & SUBSTITUTE_DONT_STOP_THREADS)) {
- if ((ret = stop_other_threads(&stop_token)))
- goto end;
- stopped = true;
- }
void *trampoline_ptr = NULL;
size_t trampoline_size_left = 0;
@@ -160,6 +160,7 @@ int substitute_hook_functions(const struct substitute_function_hook *hooks,
}
#endif
hi->code = code;
+ hi->arch_dis_ctx = arch;
uintptr_t pc_patch_start = (uintptr_t) code;
int patch_size;
bool need_intro_trampoline;
@@ -201,6 +202,7 @@ int substitute_hook_functions(const struct substitute_function_hook *hooks,
}
hi->outro_trampoline = trampoline_ptr;
+ *(void **) hook->old_ptr = hi->outro_trampoline;
uintptr_t dpc = pc_patch_end;
#ifdef __arm__
if (arch.pc_low_bit) {
@@ -229,49 +231,34 @@ int substitute_hook_functions(const struct substitute_function_hook *hooks,
/* Now commit. */
for (size_t i = 0; i < nhooks; i++) {
- const struct substitute_function_hook *hook = &hooks[i];
struct hook_internal *hi = &his[i];
- emw_finished_i = (ssize_t) i;
- if ((ret = execmem_write(hi->code, hi->jump_patch, hi->jump_patch_size))) {
- /* User is probably screwed, since this probably means a failure to
- * re-protect exec, thanks to code signing, so now the function is
- * permanently inaccessible. */
- goto end;
- }
- if (hook->old_ptr)
- *(void **) hook->old_ptr = hi->outro_trampoline;
+ void *page = hi->trampoline_page;
+ if (page)
+ execmem_seal(page);
+ fws[i].dst = hi->code;
+ fws[i].src = hi->jump_patch;
+ fws[i].len = hi->jump_patch_size;
}
- /* *sigh of relief* now we can rewrite the PCs. */
- if (stopped) {
- struct pc_callback_info info = {his, nhooks, false};
- if ((ret = apply_pc_patch_callback(stop_token, pc_callback, &info)))
- goto end;
- if (info.encountered_bad_pc) {
- ret = SUBSTITUTE_ERR_UNEXPECTED_PC_ON_OTHER_THREAD;
- goto end;
- }
+ struct pc_callback_info info = {his, nhooks, false};
+ if ((ret = execmem_foreign_write_with_pc_patch(
+ fws, nhooks, thread_safe ? pc_callback : NULL, &info))) {
+ /* Too late to free the trampolines. Chances are this is fatal anyway. */
+ goto end_dont_free;
+ }
+ if (info.encountered_bad_pc) {
+ ret = SUBSTITUTE_ERR_UNEXPECTED_PC_ON_OTHER_THREAD;
+ goto end_dont_free;
}
end:
+ /* if we failed, get rid of the trampolines. */
for (size_t i = 0; i < nhooks; i++) {
void *page = his[i].trampoline_page;
- if (page) {
- /* if we failed, get rid of the trampolines. if we succeeded, make
- * them executable */
- if (ret && (ssize_t) i >= emw_finished_i) {
- execmem_free(page);
- } else {
- /* we already patched them all, too late to go back.. */
- ret = execmem_seal(page);
- }
- }
- }
- if (stopped) {
- int r2 = resume_other_threads(stop_token);
- if (!ret)
- ret = r2;
+ if (page)
+ execmem_free(page);
}
+end_dont_free:
free(his);
return ret;
}
diff --git a/lib/stop-other-threads.h b/lib/stop-other-threads.h
deleted file mode 100644
index 1f6e639..0000000
--- a/lib/stop-other-threads.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#pragma once
-#include <stdint.h>
-
-/* Stop the world; return token to be used for applying PC patches and resuming. */
-int stop_other_threads(void **token_ptr);
-int apply_pc_patch_callback(void *token,
- uintptr_t (*pc_patch_callback)(void *ctx, uintptr_t pc),
- void *ctx);
-int resume_other_threads(void *token);
diff --git a/lib/substitute.h b/lib/substitute.h
index 8764bcf..2045c3d 100644
--- a/lib/substitute.h
+++ b/lib/substitute.h
@@ -37,9 +37,8 @@ enum {
/* out of memory */
SUBSTITUTE_ERR_OOM,
- /* substitute_hook_functions: mmap or mprotect failure other than ENOMEM
- * (preserved in errno on return)
- * substitute_hook_functions: vm_region failure (errno = 0)
+ /* substitute_hook_functions: mmap, mprotect, vm_copy, or
+ * vm_remap failure
* substitute_hook_objc_message: vm_remap failure
* Most likely to come up with substitute_hook_functions if the kernel is
* preventing pages from being marked executable. */
@@ -48,7 +47,7 @@ enum {
/* substitute_hook_functions: not on the main thread (so stopping all other
* threads would be unsafe, as concurrent attempts to do the same from
* other threads would result in deadlock), and you did not pass
- * SUBSTITUTE_DONT_STOP_THREADS */
+ * SUBSTITUTE_NO_THREAD_SAFETY */
SUBSTITUTE_ERR_NOT_ON_MAIN_THREAD,
/* substitute_hook_functions: when trying to patch the PC of other threads
@@ -86,7 +85,7 @@ const char *substitute_strerror(int err);
/* substitute_hook_functions options */
enum {
- SUBSTITUTE_DONT_STOP_THREADS = 1,
+ SUBSTITUTE_NO_THREAD_SAFETY = 1,
};
/* TODO doc */
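A usage sketch of the renamed flag (not from the patch): hook getpid() from the main thread using the default, thread-safe path; SUBSTITUTE_NO_THREAD_SAFETY would only be passed when the caller knows no other thread can be executing the patched bytes.

    #include "substitute.h"
    #include <stdio.h>
    #include <unistd.h>

    static pid_t (*orig_getpid)(void);

    static pid_t my_getpid(void) {
        return orig_getpid();   /* call through the generated trampoline */
    }

    static void install_hook(void) {
        struct substitute_function_hook hook = {
            (void *) getpid, (void *) my_getpid, (void **) &orig_getpid
        };
        /* options = 0: stop/patch other threads; must run on the main thread.
         * Pass SUBSTITUTE_NO_THREAD_SAFETY to skip that requirement. */
        int ret = substitute_hook_functions(&hook, 1, 0);
        if (ret)
            fprintf(stderr, "hook failed: %s\n", substitute_strerror(ret));
    }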