diff options
Diffstat (limited to '')
-rw-r--r-- | Makefile | 3 | ||||
-rw-r--r-- | lib/darwin/execmem.c | 432 | ||||
-rw-r--r-- | lib/darwin/inject.c | 44 | ||||
-rw-r--r-- | lib/darwin/mach-decls.h | 22 | ||||
-rw-r--r-- | lib/darwin/stop-other-threads.c | 163 | ||||
-rw-r--r-- | lib/darwin/substrate-compat.c | 2 | ||||
-rw-r--r-- | lib/execmem.h | 21 | ||||
-rw-r--r-- | lib/hook-functions.c | 75 | ||||
-rw-r--r-- | lib/stop-other-threads.h | 9 | ||||
-rw-r--r-- | lib/substitute.h | 9 | ||||
-rw-r--r-- | test/test-execmem.c | 11 | ||||
-rw-r--r-- | test/test-pc-patch.c (renamed from test/test-stop-threads.c) | 22 |
12 files changed, 488 insertions, 325 deletions
@@ -69,7 +69,6 @@ LIB_OBJS := \ out/darwin/objc.o \ out/darwin/read.o \ out/darwin/substrate-compat.o \ - out/darwin/stop-other-threads.o \ out/darwin/execmem.o \ out/darwin/unrestrict.o \ out/jump-dis.o \ @@ -134,7 +133,7 @@ $(eval $(call define_test,imp-forwarding,imp-forwarding,$(CC) -std=c11 -framewor $(eval $(call define_test,objc-hook,objc-hook,$(CC) -std=c11 -framework Foundation -lsubstitute)) $(eval $(call define_test,interpose,interpose,$(CC) -std=c11 -lsubstitute)) $(eval $(call define_test,inject,inject,$(CC) -std=c11 -lsubstitute out/darwin/inject.o out/darwin/read.o)) -$(eval $(call define_test,stop-threads,stop-threads,$(CC) -std=c11 out/darwin/stop-other-threads.o)) +$(eval $(call define_test,pc-patch,pc-patch,$(CC) -std=c11 out/darwin/execmem.o)) $(eval $(call define_test,execmem,execmem,$(CC) -std=c11 out/darwin/execmem.o -segprot __TEST rwx rx)) $(eval $(call define_test,hook-functions,hook-functions,$(CC) -std=c11 -lsubstitute)) $(eval $(call define_test,posixspawn-hook,posixspawn-hook,$(CC) -std=c11)) diff --git a/lib/darwin/execmem.c b/lib/darwin/execmem.c index 76f0643..3048cd8 100644 --- a/lib/darwin/execmem.c +++ b/lib/darwin/execmem.c @@ -1,60 +1,39 @@ +/* define to avoid error that ucontext is "deprecated" (it's unavoidable with + * sigaction!) */ +#define _XOPEN_SOURCE 700 +#define _DARWIN_C_SOURCE +#include "cbit/htab.h" #include "execmem.h" -#include "darwin/manual-syscall.h" +/* #include "darwin/manual-syscall.h" */ +#include "darwin/mach-decls.h" #include "substitute.h" +#include "substitute-internal.h" #include <mach/mach.h> #include <sys/mman.h> #include <sys/syscall.h> #include <errno.h> #include <stdio.h> +#include <stdlib.h> +#include <ucontext.h> +#include <signal.h> -int execmem_write(void *dest, const void *src, size_t len) { - /* Use vm_region to determine the original protection, so we can mprotect - * it back afterwards. (Note: PROT_* are equal to VM_PROT_*.) 
*/ - vm_address_t region = (vm_address_t) dest; - vm_size_t region_len = 0; - struct vm_region_submap_short_info_64 info; - mach_msg_type_number_t info_count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; - natural_t max_depth = 99999; - kern_return_t kr = vm_region_recurse_64(mach_task_self(), &region, &region_len, - &max_depth, - (vm_region_recurse_info_t) &info, - &info_count); - if (kr) { - /* Weird; this probably means the region doesn't exist, but we should - * have already read from the memory in order to generate the patch. */ - errno = 0; - return SUBSTITUTE_ERR_VM; - } +#define port_hash(portp) (*(portp)) +#define port_eq(port1p, port2p) (*(port1p) == *(port2p)) +#define port_null(portp) (*(portp) == MACH_PORT_NULL) +DECL_STATIC_HTAB_KEY(mach_port_t, mach_port_t, port_hash, port_eq, port_null, 0); +struct empty {}; +DECL_HTAB(mach_port_set, mach_port_t, struct empty); - uintptr_t lopage = (uintptr_t) dest & ~PAGE_MASK; - uintptr_t hipage = ((uintptr_t) dest + len + PAGE_MASK) & ~PAGE_MASK; - - /* We do the syscall manually just in case the user is trying to write to + /* ORPHAN: We do the syscall manually just in case the user is trying to write to * the mprotect syscall stub itself, or one of the functions it calls. * (Obviously, it will still break if the user targets some libsubstitute * function within the same page as this one, though.) */ - int ret = manual_syscall(SYS_mprotect, lopage, hipage - lopage, - PROT_READ | PROT_WRITE, 0); - if (ret) { - errno = ret; - return SUBSTITUTE_ERR_VM; - } - /* volatile to avoid compiler transformation to call to memcpy */ - volatile uint8_t *d8 = dest; - const uint8_t *s8 = src; - while (len--) - *d8++ = *s8++; - - int oldprot = info.protection & (PROT_READ | PROT_WRITE | PROT_EXEC); - ret = manual_syscall(SYS_mprotect, lopage, hipage - lopage, - oldprot, 0); - if (ret) { - errno = ret; - return SUBSTITUTE_ERR_VM; - } - return SUBSTITUTE_OK; -} +/* This should only run on the main thread, so just use globals. 
*/ +static HTAB_STORAGE(mach_port_set) g_suspended_ports; +static struct sigaction old_segv, old_bus; +static execmem_pc_patch_callback g_pc_patch_callback; +static void *g_pc_patch_callback_ctx; int execmem_alloc_unsealed(uintptr_t hint, void **page_p, size_t *size_p) { *size_p = PAGE_SIZE; @@ -74,3 +53,368 @@ int execmem_seal(void *page) { void execmem_free(void *page) { munmap(page, PAGE_SIZE); } + +#if defined(__x86_64__) + typedef struct __darwin_x86_thread_state64 native_thread_state; + #define NATIVE_THREAD_STATE_FLAVOR x86_THREAD_STATE64 +#elif defined(__i386__) + typedef struct __darwin_i386_thread_state native_thread_state; + #define NATIVE_THREAD_STATE_FLAVOR x86_THREAD_STATE32 +#elif defined(__arm__) + typedef struct __darwin_arm_thread_state native_thread_state; + #define NATIVE_THREAD_STATE_FLAVOR ARM_THREAD_STATE +#elif defined(__arm64__) + typedef struct __darwin_arm_thread_state64 native_thread_state; + #define NATIVE_THREAD_STATE_FLAVOR ARM_THREAD_STATE64 +#else + #error ? 
+#endif + +/* returns whether it changed */ +static bool apply_one_pcp_with_state(native_thread_state *state, + execmem_pc_patch_callback callback, + void *ctx) { + + uintptr_t *pcp; +#if defined(__x86_64__) + pcp = (uintptr_t *) &state->__rip; +#elif defined(__i386__) + pcp = (uintptr_t *) &state->__eip; +#elif defined(__arm__) || defined(__arm64__) + pcp = (uintptr_t *) &state->__pc; +#endif + uintptr_t old = *pcp; +#ifdef __arm__ + /* thumb */ + if (state.cpsr & 0x20) + old |= 1; +#endif + uintptr_t new = callback(ctx, *pcp); + bool changed = new != old; + *pcp = new; +#ifdef __arm__ + *pcp &= ~1; + state.cpsr = (state.cpsr & ~0x20) | ((new & 1) * 0x20); +#endif + return changed; +} + +static int apply_one_pcp(mach_port_t thread, execmem_pc_patch_callback callback, + void *ctx) { + native_thread_state state; + mach_msg_type_number_t real_cnt = sizeof(state) / sizeof(int); + mach_msg_type_number_t cnt = real_cnt; + kern_return_t kr = thread_get_state(thread, NATIVE_THREAD_STATE_FLAVOR, + (thread_state_t) &state, &cnt); + if (kr == KERN_TERMINATED) + return SUBSTITUTE_OK; + if (kr || cnt != real_cnt) + return SUBSTITUTE_ERR_ADJUSTING_THREADS;; + + if (apply_one_pcp_with_state(&state, callback, ctx)) { + kr = thread_set_state(thread, NATIVE_THREAD_STATE_FLAVOR, + (thread_state_t) &state, real_cnt); + if (kr) + return SUBSTITUTE_ERR_ADJUSTING_THREADS; + } + return SUBSTITUTE_OK; +} + +static void resume_other_threads(); + +static int stop_other_threads() { + /* pthread_main should have already been checked. */ + + int ret; + mach_port_t self = mach_thread_self(); + + /* The following shenanigans are for catching any new threads that are + * created while we're looping, without suspending anything twice. Keep + * looping until only threads we already suspended before this loop are + * there. 
*/ + HTAB_STORAGE_INIT(&g_suspended_ports, mach_port_set); + struct htab_mach_port_set *suspended_set = &g_suspended_ports.h; + + bool got_new; + do { + got_new = false; + + thread_act_port_array_t ports; + mach_msg_type_number_t nports; + + kern_return_t kr = task_threads(mach_task_self(), &ports, &nports); + if (kr) { /* ouch */ + ret = SUBSTITUTE_ERR_ADJUSTING_THREADS; + goto fail; + } + + for (mach_msg_type_number_t i = 0; i < nports; i++) { + mach_port_t port = ports[i]; + struct htab_bucket_mach_port_set *bucket; + if (port == self || + (bucket = htab_setbucket_mach_port_set(suspended_set, &port), + bucket->key)) { + /* already suspended, ignore */ + mach_port_deallocate(mach_task_self(), port); + } else { + got_new = true; + kr = thread_suspend(port); + if (kr == KERN_TERMINATED) { + /* too late */ + mach_port_deallocate(mach_task_self(), port); + } else if (kr) { + ret = SUBSTITUTE_ERR_ADJUSTING_THREADS; + for (; i < nports; i++) + mach_port_deallocate(mach_task_self(), ports[i]); + vm_deallocate(mach_task_self(), (vm_address_t) ports, + nports * sizeof(*ports)); + goto fail; + } + bucket->key = port; + } + } + vm_deallocate(mach_task_self(), (vm_address_t) ports, + nports * sizeof(*ports)); + } while(got_new); + + /* Success - keep the set around for when we're done. */ + return SUBSTITUTE_OK; + +fail: + resume_other_threads(); + return ret; +} + +static void resume_other_threads() { + struct htab_mach_port_set *suspended_set = &g_suspended_ports.h; + HTAB_FOREACH(suspended_set, mach_port_t *threadp, + UNUSED struct empty *_, + mach_port_set) { + thread_resume(*threadp); + mach_port_deallocate(mach_task_self(), *threadp); + } + htab_free_storage_mach_port_set(suspended_set); +} + +static void segfault_handler(UNUSED int sig, UNUSED siginfo_t *info, + void *uap_) { + /* We didn't catch it before it segfaulted so have to fix it up here. 
*/ + ucontext_t *uap = uap_; + apply_one_pcp_with_state(&uap->uc_mcontext->__ss, g_pc_patch_callback, + g_pc_patch_callback_ctx); + /* just let it continue, whatever */ +} + +static int init_pc_patch(execmem_pc_patch_callback callback, void *ctx) { + g_pc_patch_callback = callback; + g_pc_patch_callback_ctx = ctx; + int ret; + if ((ret = stop_other_threads())) + return ret; + + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = segfault_handler; + sigfillset(&sa.sa_mask); + sa.sa_flags = SA_RESTART | SA_NODEFER | SA_SIGINFO; + + if (sigaction(SIGSEGV, &sa, &old_segv)) + return SUBSTITUTE_ERR_ADJUSTING_THREADS; + if (sigaction(SIGBUS, &sa, &old_bus)) { + sigaction(SIGSEGV, &old_segv, NULL); + return SUBSTITUTE_ERR_ADJUSTING_THREADS; + } + return SUBSTITUTE_OK; +} + +static int run_pc_patch() { + int ret; + + struct htab_mach_port_set *suspended_set = &g_suspended_ports.h; + HTAB_FOREACH(suspended_set, mach_port_t *threadp, + UNUSED struct empty *_, + mach_port_set) { + if ((ret = apply_one_pcp(*threadp, g_pc_patch_callback, + g_pc_patch_callback_ctx))) + return ret; + } + + return SUBSTITUTE_OK; +} + +static int finish_pc_patch() { + if (sigaction(SIGBUS, &old_bus, NULL) || + sigaction(SIGSEGV, &old_segv, NULL)) + return SUBSTITUTE_ERR_ADJUSTING_THREADS; + + resume_other_threads(); + return SUBSTITUTE_OK; +} + +static int compare_dsts(const void *a, const void *b) { + void *dst_a = ((struct execmem_foreign_write *) a)->dst; + void *dst_b = ((struct execmem_foreign_write *) b)->dst; + return dst_a < dst_b ? -1 : dst_a > dst_b ? 
1 : 0; +} + +static kern_return_t get_page_prot(uintptr_t ptr, vm_prot_t *prot, + vm_inherit_t *inherit) { + + vm_address_t region = (vm_address_t) ptr; + vm_size_t region_len = 0; + struct vm_region_submap_short_info_64 info; + mach_msg_type_number_t info_count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64; + natural_t max_depth = 99999; + kern_return_t kr = vm_region_recurse_64(mach_task_self(), &region, &region_len, + &max_depth, + (vm_region_recurse_info_t) &info, + &info_count); + *prot = info.protection & (PROT_READ | PROT_WRITE | PROT_EXEC); + *inherit = info.inheritance; + return kr; +} + +static void manual_memcpy(void *restrict dest, const void *src, size_t len) { + /* volatile to avoid compiler transformation to call to memcpy */ + volatile uint8_t *d8 = dest; + const uint8_t *s8 = src; + while (len--) + *d8++ = *s8++; +} + +int execmem_foreign_write_with_pc_patch(struct execmem_foreign_write *writes, + size_t nwrites, + execmem_pc_patch_callback callback, + void *callback_ctx) { + int ret; + + qsort(writes, nwrites, sizeof(*writes), compare_dsts); + + size_t last; + for (size_t first = 0; first < nwrites; first = last + 1) { + const struct execmem_foreign_write *first_write = &writes[first]; + uintptr_t page_start = (uintptr_t) first_write->dst & ~PAGE_MASK; + uintptr_t page_end = ((uintptr_t) first_write->dst + + first_write->len - 1) & ~PAGE_MASK; + + last = first; + while (last + 1 < nwrites) { + uintptr_t this_start = (uintptr_t) first_write->dst & ~PAGE_MASK; + uintptr_t this_end = ((uintptr_t) first_write->dst + + first_write->len - 1) & ~PAGE_MASK; + if (page_start <= this_start && this_start <= page_end) { + if (this_end > page_end) + page_end = this_end; + } else if (page_start <= this_end && this_end <= page_end) { + if (this_start < page_start) + page_start = this_start; + } else { + break; + } + last++; + } + size_t len = page_end - page_start + PAGE_SIZE; + + vm_prot_t prot; + vm_inherit_t inherit; + /* Assume that a single patch region will be pages of 
all the same + * protection, since the alternative is probably someone doing + * something wrong. */ + kern_return_t kr = get_page_prot(page_start, &prot, &inherit); + if (kr) { + /* Weird; this probably means the region doesn't exist, but we should + * have already read from the memory in order to generate the patch. */ + return SUBSTITUTE_ERR_VM; + } + /* Instead of trying to set the existing region to write, which may + * fail due to max_protection, we make a fresh copy and remap it over + * the original. */ + void *new = mmap(NULL, len, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_SHARED, -1, 0); + if (new == MAP_FAILED) + return SUBSTITUTE_ERR_VM; + /* Ideally, if the original page wasn't mapped anywhere else, no actual + * copy will take place: new will be CoW, then we unmap the original so + * new becomes the sole owner before actually writing. Though, for all + * I know, these trips through the VM system could be slower than just + * memcpying a page or two... */ + kr = vm_copy(mach_task_self(), page_start, len, (vm_address_t) new); + if (kr) { + ret = SUBSTITUTE_ERR_VM; + goto fail_unmap; + } + if (callback) { + /* Set the segfault handler - stopping all other threads before + * doing so in case they were using it for something (this + * happens). One might think the latter makes segfaults + * impossible, but we can't prevent injectors from making new + * threads that might run during this process. Hopefully no + * *injected* threads try to use segfault handlers for something! + */ + if ((ret = init_pc_patch(callback, callback_ctx))) + goto fail_unmap; + } + /* Disable access to the page so anyone trying to execute there + * will segfault. */ + if (mmap(NULL, len, PROT_NONE, MAP_ANON | MAP_SHARED, -1, 0) + == MAP_FAILED) { + ret = SUBSTITUTE_ERR_VM; + goto fail_unmap; + } + /* Write patches to the copy. 
*/ + for (size_t i = first; i <= last; i++) { + struct execmem_foreign_write *write = &writes[i]; + ptrdiff_t off = (uintptr_t) write->dst - page_start; + manual_memcpy(new + off, write->src, write->len); + } + if (callback) { + /* Actually run the callback for any threads which are paused at an + * affected PC, or are running and don't get scheduled by the + * kernel in time to segfault. Any thread which moves to an + * affected PC *after* run_pc_patch() is assumed to do so by + * calling the function in question, so they can't get past the + * first instruction and it doesn't matter whether or not they're + * patched. (A call instruction within the affected region would + * break this assumption, as then a thread could move to an + * affected PC by returning. */ + if ((ret = run_pc_patch())) + goto fail_unmap; + } + + /* Protect new like the original, and move it into place. */ + vm_address_t target = page_start; + if (mprotect(new, len, prot)) { + ret = SUBSTITUTE_ERR_VM; + goto fail_unmap; + } + vm_prot_t c, m; + kr = vm_remap(mach_task_self(), &target, len, 0, VM_FLAGS_OVERWRITE, + mach_task_self(), (vm_address_t) new, /*copy*/ FALSE, + &c, &m, inherit); + if (kr) { + ret = SUBSTITUTE_ERR_VM; + goto fail_unmap; + } + /* ignore errors... */ + munmap(new, len); + if (callback) { + /* Other threads are no longer in danger of segfaulting, so put + * back the old setfault handler. */ + if ((ret = finish_pc_patch())) + return ret; + } + + continue; + + fail_unmap: + /* This is probably useless, since the original page is gone + * forever (intentionally, see above). May as well arrange the + * deck chairs, though. */ + munmap(new, PAGE_SIZE); + return ret; + } + + /* Shockingly, we made it out! 
*/ + return SUBSTITUTE_OK; +} + diff --git a/lib/darwin/inject.c b/lib/darwin/inject.c index 6ec07fa..6e0c50e 100644 --- a/lib/darwin/inject.c +++ b/lib/darwin/inject.c @@ -646,10 +646,14 @@ int substitute_dlopen_in_pid(int pid, const char *filename, int options, goto fail; union { - struct _x86_thread_state_32 x32; - struct _x86_thread_state_64 x64; - struct _arm_thread_state_32 a32; - struct _arm_thread_state_64 a64; +#if defined(__x86_64__) || defined(__i386__) + struct __darwin_i386_thread_state x32; + struct __darwin_x86_thread_state64 x64; +#endif +#if defined(__arm__) || defined(__arm64__) + struct __darwin_arm_thread_state a32; + struct __darwin_arm_thread_state64 a64; +#endif } u; size_t state_size; thread_state_flavor_t flavor; @@ -658,34 +662,34 @@ int substitute_dlopen_in_pid(int pid, const char *filename, int options, switch (cputype) { #if defined(__x86_64__) || defined(__i386__) case CPU_TYPE_X86_64: - u.x64.rsp = target_stack_top; - u.x64.rdi = target_stack_top; - u.x64.rip = target_code_page + (inject_start_x86_64 - inject_page_start); + u.x64.__rsp = target_stack_top; + u.x64.__rdi = target_stack_top; + u.x64.__rip = target_code_page + (inject_start_x86_64 - inject_page_start); state_size = sizeof(u.x64); - flavor = _x86_thread_state_64_flavor; + flavor = x86_THREAD_STATE64; break; case CPU_TYPE_I386: - u.x32.esp = target_stack_top; - u.x32.ecx = target_stack_top; - u.x32.eip = target_code_page + (inject_start_i386 - inject_page_start); + u.x32.__esp = target_stack_top; + u.x32.__ecx = target_stack_top; + u.x32.__eip = target_code_page + (inject_start_i386 - inject_page_start); state_size = sizeof(u.x32); - flavor = _x86_thread_state_32_flavor; + flavor = x86_THREAD_STATE32; break; #endif #if defined(__arm__) || defined(__arm64__) case CPU_TYPE_ARM: - u.a32.sp = target_stack_top; - u.a32.r[0] = target_stack_top; - u.a32.pc = target_code_page + (inject_start_arm - inject_page_start); + u.a32.__sp = target_stack_top; + u.a32.__r[0] = 
target_stack_top; + u.a32.__pc = target_code_page + (inject_start_arm - inject_page_start); state_size = sizeof(u.a32); - flavor = _arm_thread_state_32_flavor; + flavor = ARM_THREAD_STATE; break; case CPU_TYPE_ARM64: - u.a64.sp = target_stack_top; - u.a64.x[0] = target_stack_top; - u.a64.pc = target_code_page + (inject_start_arm64 - inject_page_start); + u.a64.__sp = target_stack_top; + u.a64.__x[0] = target_stack_top; + u.a64.__pc = target_code_page + (inject_start_arm64 - inject_page_start); state_size = sizeof(u.a64); - flavor = _arm_thread_state_64_flavor; + flavor = ARM_THREAD_STATE64; break; #endif default: diff --git a/lib/darwin/mach-decls.h b/lib/darwin/mach-decls.h index 29ea908..b1c7af6 100644 --- a/lib/darwin/mach-decls.h +++ b/lib/darwin/mach-decls.h @@ -1,26 +1,6 @@ #pragma once #include <stdint.h> - -struct _x86_thread_state_32 { - uint32_t eax, ebx, ecx, edx, edi, esi, ebp, esp; - uint32_t ss, eflags, eip, cs, ds, es, fs, gs; -}; -#define _x86_thread_state_32_flavor 1 -struct _x86_thread_state_64 { - uint64_t rax, rbx, rcx, rdx, rdi, rsi, rbp, rsp; - uint64_t r8, r9, r10, r11, r12, r13, r14, r15; - uint64_t rip, rflags, cs, fs, gs; -}; -#define _x86_thread_state_64_flavor 4 -struct _arm_thread_state_32 { - uint32_t r[13], sp, lr, pc, cpsr; -}; -#define _arm_thread_state_32_flavor 9 -struct _arm_thread_state_64 { - uint64_t x[29], fp, lr, sp, pc; - uint32_t cpsr, pad; -}; -#define _arm_thread_state_64_flavor 6 +#include <mach/mach.h> kern_return_t mach_vm_read_overwrite(vm_map_t, mach_vm_address_t, mach_vm_size_t, mach_vm_address_t, mach_vm_size_t *); kern_return_t mach_vm_remap(vm_map_t, mach_vm_address_t *, mach_vm_size_t, mach_vm_offset_t, int, vm_map_t, mach_vm_address_t, boolean_t, vm_prot_t *, vm_prot_t *, vm_inherit_t); diff --git a/lib/darwin/stop-other-threads.c b/lib/darwin/stop-other-threads.c deleted file mode 100644 index ff239f3..0000000 --- a/lib/darwin/stop-other-threads.c +++ /dev/null @@ -1,163 +0,0 @@ -#include "substitute.h" 
-#include "substitute-internal.h" -#include "darwin/mach-decls.h" -#include "stop-other-threads.h" -#include "cbit/htab.h" -#include <pthread.h> -#include <mach/mach.h> - -#define port_hash(portp) (*(portp)) -#define port_eq(port1p, port2p) (*(port1p) == *(port2p)) -#define port_null(portp) (*(portp) == MACH_PORT_NULL) -DECL_STATIC_HTAB_KEY(mach_port_t, mach_port_t, port_hash, port_eq, port_null, 0); -struct empty {}; -DECL_HTAB(mach_port_set, mach_port_t, struct empty); - -static bool apply_one_pcp(mach_port_t thread, - uintptr_t (*callback)(void *ctx, uintptr_t pc), - void *ctx) { - int flavor; -#if defined(__x86_64__) - struct _x86_thread_state_64 state; - flavor = _x86_thread_state_64_flavor; -#elif defined(__i386__) - struct _x86_thread_state_32 state; - flavor = _x86_thread_state_32_flavor; -#elif defined(__arm__) - struct _arm_thread_state_32 state; - flavor = _arm_thread_state_32_flavor; -#elif defined(__arm64__) - struct _arm_thread_state_64 state; - flavor = _arm_thread_state_64_flavor; -#else - #error ? 
-#endif - - mach_msg_type_number_t real_cnt = sizeof(state) / sizeof(int); - mach_msg_type_number_t cnt = real_cnt; - kern_return_t kr = thread_get_state(thread, flavor, (thread_state_t) &state, &cnt); - if (kr || cnt != real_cnt) - return false; - - uintptr_t *pcp; -#if defined(__x86_64__) - pcp = (uintptr_t *) &state.rip; -#elif defined(__i386__) - pcp = (uintptr_t *) &state.eip; -#elif defined(__arm__) || defined(__arm64__) - pcp = (uintptr_t *) &state.pc; -#endif - uintptr_t old = *pcp; -#ifdef __arm__ - /* thumb */ - if (state.cpsr & 0x20) - old |= 1; -#endif - uintptr_t new = callback(ctx, *pcp); - if (new != old) { - *pcp = new; -#ifdef __arm__ - *pcp &= ~1; - state.cpsr = (state.cpsr & ~0x20) | ((new & 1) * 0x20); -#endif - kr = thread_set_state(thread, flavor, (thread_state_t) &state, real_cnt); - if (kr) - return false; - } - return true; -} - -int stop_other_threads(void **token_ptr) { - if (!pthread_main_np()) - return SUBSTITUTE_ERR_NOT_ON_MAIN_THREAD; - - int ret; - mach_port_t self = mach_thread_self(); - - /* The following shenanigans are for catching any new threads that are - * created while we're looping, without suspending anything twice. Keep - * looping until only threads we already suspended before this loop are - * there. 
*/ - HTAB_STORAGE(mach_port_set) *hs = malloc(sizeof(*hs)); - HTAB_STORAGE_INIT(hs, mach_port_set); - struct htab_mach_port_set *suspended_set = &hs->h; - - thread_act_array_t ports = 0; - mach_msg_type_number_t nports = 0; - - bool got_new = true; - while (got_new) { - got_new = false; - - kern_return_t kr = task_threads(mach_task_self(), &ports, &nports); - if (kr) { /* ouch */ - ret = SUBSTITUTE_ERR_ADJUSTING_THREADS; - goto fail; - } - - for (mach_msg_type_number_t i = 0; i < nports; i++) { - mach_port_t port = ports[i]; - struct htab_bucket_mach_port_set *bucket; - if (port == self || - (bucket = htab_setbucket_mach_port_set(suspended_set, &port), - bucket->key)) { - /* already suspended, ignore */ - mach_port_deallocate(mach_task_self(), port); - } else { - got_new = true; - kr = thread_suspend(port); - if (kr == KERN_TERMINATED) { - /* too late */ - mach_port_deallocate(mach_task_self(), port); - } else if (kr) { - ret = SUBSTITUTE_ERR_ADJUSTING_THREADS; - for (; i < nports; i++) - mach_port_deallocate(mach_task_self(), ports[i]); - vm_deallocate(mach_task_self(), (vm_address_t) ports, - nports * sizeof(*ports)); - goto fail; - } - bucket->key = port; - } - } - vm_deallocate(mach_task_self(), (vm_address_t) ports, - nports * sizeof(*ports)); - } - - /* Success - keep the set around for when we're done. 
*/ - *token_ptr = suspended_set; - return SUBSTITUTE_OK; - -fail: - resume_other_threads(suspended_set); - return ret; -} - -int apply_pc_patch_callback(void *token, - uintptr_t (*pc_patch_callback)(void *ctx, uintptr_t pc), - void *ctx) { - struct htab_mach_port_set *suspended_set = token; - int ret = SUBSTITUTE_OK; - HTAB_FOREACH(suspended_set, mach_port_t *threadp, - UNUSED struct empty *_, - mach_port_set) { - if (!apply_one_pcp(*threadp, pc_patch_callback, ctx)) { - ret = SUBSTITUTE_ERR_ADJUSTING_THREADS; - break; - } - } - return ret; -} - -int resume_other_threads(void *token) { - struct htab_mach_port_set *suspended_set = token; - HTAB_FOREACH(suspended_set, mach_port_t *threadp, - UNUSED struct empty *_, - mach_port_set) { - thread_resume(*threadp); - mach_port_deallocate(mach_task_self(), *threadp); - } - htab_free_storage_mach_port_set(suspended_set); - free(suspended_set); - return SUBSTITUTE_OK; /* eh */ -} diff --git a/lib/darwin/substrate-compat.c b/lib/darwin/substrate-compat.c index 2746795..2cdcf6f 100644 --- a/lib/darwin/substrate-compat.c +++ b/lib/darwin/substrate-compat.c @@ -43,7 +43,7 @@ EXPORT void SubHookFunction(void *symbol, void *replace, void **result) __asm__("SubHookFunction"); void SubHookFunction(void *symbol, void *replace, void **result) { struct substitute_function_hook hook = {symbol, replace, result}; - int ret = substitute_hook_functions(&hook, 1, SUBSTITUTE_DONT_STOP_THREADS); + int ret = substitute_hook_functions(&hook, 1, SUBSTITUTE_NO_THREAD_SAFETY); if (ret) { panic("SubHookFunction: substitute_hook_functions returned %s\n", substitute_strerror(ret)); diff --git a/lib/execmem.h b/lib/execmem.h index b4860e9..895769d 100644 --- a/lib/execmem.h +++ b/lib/execmem.h @@ -1,9 +1,22 @@ #pragma once -#include <stdlib.h> -/* Write to a foreign page which is already RX / with unknown permissions. 
*/ -int execmem_write(void *dest, const void *src, size_t len); - +#include <sys/types.h> /* For allocating trampolines - this is just a mmap wrapper. */ int execmem_alloc_unsealed(uintptr_t hint, void **page_p, size_t *size_p); int execmem_seal(void *page); void execmem_free(void *page); + +/* Write to foreign pages which are already RX or have unknown permissions. + * If callback is not NULL, run it on all other threads 'atomically', in the + * sense that it will be called on any thread which executed any of the old + * instructions in the write region. + * Oh, and it might mutate writes (to sort it). */ +struct execmem_foreign_write { + void *dst; + const void *src; + size_t len; +}; +typedef uintptr_t (*execmem_pc_patch_callback)(void *ctx, uintptr_t pc); +int execmem_foreign_write_with_pc_patch(struct execmem_foreign_write *writes, + size_t nwrites, + execmem_pc_patch_callback callback, + void *callback_ctx); diff --git a/lib/hook-functions.c b/lib/hook-functions.c index 7db06d4..5d1f1d5 100644 --- a/lib/hook-functions.c +++ b/lib/hook-functions.c @@ -4,8 +4,8 @@ #include "jump-dis.h" #include "transform-dis.h" #include "execmem.h" -#include "stop-other-threads.h" #include stringify(TARGET_DIR/jump-patch.h) +#include <pthread.h> struct hook_internal { int offset_by_pcdiff[MAX_JUMP_PATCH_SIZE + 1]; @@ -16,6 +16,7 @@ struct hook_internal { /* page allocated with execmem_alloc_unsealed - only if we had to allocate * one when processing this hook */ void *trampoline_page; + struct arch_dis_ctx arch_dis_ctx; }; struct pc_callback_info { @@ -125,22 +126,21 @@ skip_after:; EXPORT int substitute_hook_functions(const struct substitute_function_hook *hooks, size_t nhooks, int options) { - struct hook_internal *his = malloc(nhooks * sizeof(*his)); + bool thread_safe = !(options & SUBSTITUTE_NO_THREAD_SAFETY); + if (thread_safe && !pthread_main_np()) + return SUBSTITUTE_ERR_NOT_ON_MAIN_THREAD; + + struct execmem_foreign_write *fws; + struct hook_internal *his = 
malloc(nhooks * sizeof(*his) + + nhooks + sizeof(*fws)); if (!his) return SUBSTITUTE_ERR_OOM; + fws = (void *) (his + nhooks); for (size_t i = 0; i < nhooks; i++) his[i].trampoline_page = NULL; int ret = SUBSTITUTE_OK; - ssize_t emw_finished_i = -1; - bool stopped = false; - void *stop_token; - if (!(options & SUBSTITUTE_DONT_STOP_THREADS)) { - if ((ret = stop_other_threads(&stop_token))) - goto end; - stopped = true; - } void *trampoline_ptr = NULL; size_t trampoline_size_left = 0; @@ -160,6 +160,7 @@ int substitute_hook_functions(const struct substitute_function_hook *hooks, } #endif hi->code = code; + hi->arch_dis_ctx = arch; uintptr_t pc_patch_start = (uintptr_t) code; int patch_size; bool need_intro_trampoline; @@ -201,6 +202,7 @@ int substitute_hook_functions(const struct substitute_function_hook *hooks, } hi->outro_trampoline = trampoline_ptr; + *(void **) hook->old_ptr = hi->outro_trampoline; uintptr_t dpc = pc_patch_end; #ifdef __arm__ if (arch.pc_low_bit) { @@ -229,49 +231,34 @@ int substitute_hook_functions(const struct substitute_function_hook *hooks, /* Now commit. */ for (size_t i = 0; i < nhooks; i++) { - const struct substitute_function_hook *hook = &hooks[i]; struct hook_internal *hi = &his[i]; - emw_finished_i = (ssize_t) i; - if ((ret = execmem_write(hi->code, hi->jump_patch, hi->jump_patch_size))) { - /* User is probably screwed, since this probably means a failure to - * re-protect exec, thanks to code signing, so now the function is - * permanently inaccessible. */ - goto end; - } - if (hook->old_ptr) - *(void **) hook->old_ptr = hi->outro_trampoline; + void *page = hi->trampoline_page; + if (page) + execmem_seal(page); + fws[i].dst = hi->code; + fws[i].src = hi->jump_patch; + fws[i].len = hi->jump_patch_size; } - /* *sigh of relief* now we can rewrite the PCs. 
*/ - if (stopped) { - struct pc_callback_info info = {his, nhooks, false}; - if ((ret = apply_pc_patch_callback(stop_token, pc_callback, &info))) - goto end; - if (info.encountered_bad_pc) { - ret = SUBSTITUTE_ERR_UNEXPECTED_PC_ON_OTHER_THREAD; - goto end; - } + struct pc_callback_info info = {his, nhooks, false}; + if ((ret = execmem_foreign_write_with_pc_patch( + fws, nhooks, thread_safe ? pc_callback : NULL, &info))) { + /* Too late to free the trampolines. Chances are this is fatal anyway. */ + goto end_dont_free; + } + if (info.encountered_bad_pc) { + ret = SUBSTITUTE_ERR_UNEXPECTED_PC_ON_OTHER_THREAD; + goto end_dont_free; } end: + /* if we failed, get rid of the trampolines. */ for (size_t i = 0; i < nhooks; i++) { void *page = his[i].trampoline_page; - if (page) { - /* if we failed, get rid of the trampolines. if we succeeded, make - * them executable */ - if (ret && (ssize_t) i >= emw_finished_i) { - execmem_free(page); - } else { - /* we already patched them all, too late to go back.. */ - ret = execmem_seal(page); - } - } - } - if (stopped) { - int r2 = resume_other_threads(stop_token); - if (!ret) - ret = r2; + if (page) + execmem_free(page); } +end_dont_free: free(his); return ret; } diff --git a/lib/stop-other-threads.h b/lib/stop-other-threads.h deleted file mode 100644 index 1f6e639..0000000 --- a/lib/stop-other-threads.h +++ /dev/null @@ -1,9 +0,0 @@ -#pragma once -#include <stdint.h> - -/* Stop the world; return token to be used for applying PC patches and resuming. 
*/ -int stop_other_threads(void **token_ptr); -int apply_pc_patch_callback(void *token, - uintptr_t (*pc_patch_callback)(void *ctx, uintptr_t pc), - void *ctx); -int resume_other_threads(void *token); diff --git a/lib/substitute.h b/lib/substitute.h index 8764bcf..2045c3d 100644 --- a/lib/substitute.h +++ b/lib/substitute.h @@ -37,9 +37,8 @@ enum { /* out of memory */ SUBSTITUTE_ERR_OOM, - /* substitute_hook_functions: mmap or mprotect failure other than ENOMEM - * (preserved in errno on return) - * substitute_hook_functions: vm_region failure (errno = 0) + /* substitute_hook_functions: mmap, mprotect, vm_copy, or + * vm_remap failure * substitute_hook_objc_message: vm_remap failure * Most likely to come up with substitute_hook_functions if the kernel is * preventing pages from being marked executable. */ @@ -48,7 +47,7 @@ enum { /* substitute_hook_functions: not on the main thread (so stopping all other * threads would be unsafe, as concurrent attempts to do the same from * other threads would result in deadlock), and you did not pass - * SUBSTITUTE_DONT_STOP_THREADS */ + * SUBSTITUTE_NO_THREAD_SAFETY */ SUBSTITUTE_ERR_NOT_ON_MAIN_THREAD, /* substitute_hook_functions: when trying to patch the PC of other threads @@ -86,7 +85,7 @@ const char *substitute_strerror(int err); /* substitute_hook_functions options */ enum { - SUBSTITUTE_DONT_STOP_THREADS = 1, + SUBSTITUTE_NO_THREAD_SAFETY = 1, }; /* TODO doc */ diff --git a/test/test-execmem.c b/test/test-execmem.c index 5ec64e9..9125d68 100644 --- a/test/test-execmem.c +++ b/test/test-execmem.c @@ -25,13 +25,18 @@ int test(size_t a) { return 1000; } +static int ewrite(void *dst, const void *src, size_t len) { + struct execmem_foreign_write w = {dst, src, len}; + return execmem_foreign_write_with_pc_patch(&w, 1, NULL, NULL); +} + int main() { printf("this should be 5: %d\n", test(0)); - printf("=> %d\n", execmem_write(test, other, OTHER_SIZE)); + printf("=> %d\n", ewrite(test, other, OTHER_SIZE)); printf(" %s\n", 
strerror(errno)); printf("this should be 6: %d\n", test(0)); - printf("=> %d\n", execmem_write(hcreate, other, OTHER_SIZE)); + printf("=> %d\n", ewrite(hcreate, other, OTHER_SIZE)); printf(" %s\n", strerror(errno)); - printf("modified shared cache func: %d\n", hcreate(0)); + printf("modified shared cache func should be 6: %d\n", hcreate(0)); } diff --git a/test/test-stop-threads.c b/test/test-pc-patch.c index d53d8cd..5ae3570 100644 --- a/test/test-stop-threads.c +++ b/test/test-pc-patch.c @@ -1,7 +1,8 @@ #include "substitute-internal.h" -#include "stop-other-threads.h" +#include "execmem.h" #include <stdio.h> #include <unistd.h> +#include <stdlib.h> #include <pthread.h> #include <assert.h> /* printf without taking any locks - because they might be taken at stop time */ @@ -32,14 +33,17 @@ int main() { for (long i = 0; i < 10; i++) pthread_create(&pts[i], NULL, some_thread, (void *) i); sleep(1); - void *stop_token; - ulprintf("stopping\n"); - assert(!stop_other_threads(&stop_token)); - ulprintf("stopped\n"); - assert(!apply_pc_patch_callback(stop_token, patch_callback, NULL)); - ulprintf("resuming\n"); - assert(!resume_other_threads(stop_token)); - ulprintf("resumed\n"); + char *foo = malloc(0x10000); + static char bar[16]; + struct execmem_foreign_write writes[] = { + {foo, bar, 5}, + {foo + 7, bar + 7, 3}, + }; + int ret = execmem_foreign_write_with_pc_patch(writes, + sizeof(writes)/sizeof(*writes), + patch_callback, + NULL); + ulprintf("==> %d\n", ret); void *out; for (long i = 0; i < 10; i++) assert(!pthread_join(pts[i], &out)); |