From eb93cee2a22cde812ccd6b9bd418d36185c058f5 Mon Sep 17 00:00:00 2001 From: comex Date: Sun, 8 Feb 2015 23:45:24 -0500 Subject: Refactor disassembly so x86 works, and add x86 transform-dis. This patch is a monolithic mess, because I was too lazy to do the refactor first (that would require some stash fun, since I wasn't actually sure before doing x86 transform-dis what would be needed). Anyway, the resulting code should be cleaner - less duplication. This breaks ARM/ARM64. --- lib/arm/arch-dis.h | 60 +++++++ lib/arm/arch-transform-dis.inc.h | 195 +++++++++++++++++++++ lib/arm/dis-arm-multi.inc.h | 16 -- lib/arm/dis-main.inc.h | 16 ++ lib/arm/misc.h | 58 +------ lib/arm/transform-dis-arm-multi.inc.h | 195 --------------------- lib/arm64/arch-dis.h | 37 ++++ lib/arm64/arch-transform-dis.inc.h | 52 ++++++ lib/arm64/dis-arm64.inc.h | 69 -------- lib/arm64/dis-main.inc.h | 69 ++++++++ lib/arm64/misc.h | 35 +--- lib/arm64/transform-dis-arm64.inc.h | 52 ------ lib/dis.h | 26 +++ lib/hook-functions.c | 4 +- lib/jump-dis.c | 46 +++-- lib/jump-dis.h | 1 + lib/substitute-internal.h | 16 +- lib/transform-dis.c | 55 +++--- lib/transform-dis.h | 5 +- lib/x86/arch-dis.h | 10 ++ lib/x86/arch-transform-dis.inc.h | 58 +++++++ lib/x86/dis-main.inc.h | 312 ++++++++++++++++++++++++++++++++++ lib/x86/dis-x86.inc.h | 305 --------------------------------- lib/x86/jump-patch.h | 21 ++- lib/x86/misc.h | 12 +- 25 files changed, 913 insertions(+), 812 deletions(-) create mode 100644 lib/arm/arch-dis.h create mode 100644 lib/arm/arch-transform-dis.inc.h delete mode 100644 lib/arm/dis-arm-multi.inc.h create mode 100644 lib/arm/dis-main.inc.h delete mode 100644 lib/arm/transform-dis-arm-multi.inc.h create mode 100644 lib/arm64/arch-dis.h create mode 100644 lib/arm64/arch-transform-dis.inc.h delete mode 100644 lib/arm64/dis-arm64.inc.h create mode 100644 lib/arm64/dis-main.inc.h delete mode 100644 lib/arm64/transform-dis-arm64.inc.h create mode 100644 lib/x86/arch-dis.h create mode 100644 
lib/x86/arch-transform-dis.inc.h create mode 100644 lib/x86/dis-main.inc.h delete mode 100644 lib/x86/dis-x86.inc.h (limited to 'lib') diff --git a/lib/arm/arch-dis.h b/lib/arm/arch-dis.h new file mode 100644 index 0000000..c64ff2e --- /dev/null +++ b/lib/arm/arch-dis.h @@ -0,0 +1,60 @@ +#pragma once +#define MIN_INSN_SIZE 2 +/* each input instruction might turn into: + * - 2 bytes for Bcc, if in IT + * then ONE of: + * - 2/4 bytes for just the instruction + * - 2+8 bytes for branch (which in *valid* code rules out IT but whatever) + * - up to 7 4-byte insns for pcrel (if dest=pc, and while these can be subject + * to IT, there can only reasonably be two per block, and if there are both + * then that's an unconditional exit - but we don't enforce any of this + * currently) + * - up to 7 4-byte insns for similar moves to PC that fall under 'data' + * the maximum number of possible inputs is 4, plus 4 extras if the last one + * was an IT (but in that case it can't be one of the above cases) + * while this looks huge, it's overly conservative and doesn't matter much, + * since only the actually used space will be taken up in the final output + */ +#define TD_MAX_REWRITTEN_SIZE (7*4*7 + 4) /* 196 */ + +struct arch_pcrel_info { + unsigned reg; + enum pcrel_load_mode lm; +}; + +struct arch_dis_ctx { + /* thumb? */ + bool pc_low_bit; + /* if thumb, IT cond for the next 5 instructions + * (5 because we still advance after IT) */ + uint8_t it_conds[5]; + /* for transform_dis - did we add space for a Bccrel? 
*/ + uint8_t bccrel_bits; + void *bccrel_p; +}; + +static inline void arch_dis_ctx_init(struct arch_dis_ctx *ctx) { + ctx->pc_low_bit = false; + ctx->bccrel_p = NULL; + memset(ctx->it_conds, 0xe, 5); +} + +static inline void advance_it_cond(struct arch_dis_ctx *ctx) { + ctx->it_conds[0] = ctx->it_conds[1]; + ctx->it_conds[1] = ctx->it_conds[2]; + ctx->it_conds[2] = ctx->it_conds[3]; + ctx->it_conds[3] = ctx->it_conds[4]; + ctx->it_conds[4] = 0xe; +} + +#define DFLAG_IS_LDRD_STRD (1 << 16) + +/* Types of conditionals for 'branch' */ +/* a regular old branch-with-condition */ +#define CC_ARMCC (CC_CONDITIONAL | 0x400) +/* already in an IT block - in transform_dis this will be rewritten to a branch + * anyway, so it can be treated as unconditional; in jump_dis we have to know + * to keep going */ +#define CC_ALREADY_IN_IT (CC_CONDITIONAL | 0x800) +/* CBZ/CBNZ is rewritten */ +#define CC_CBXZ (CC_CONDITIONAL | 0xc00) diff --git a/lib/arm/arch-transform-dis.inc.h b/lib/arm/arch-transform-dis.inc.h new file mode 100644 index 0000000..6e91ff5 --- /dev/null +++ b/lib/arm/arch-transform-dis.inc.h @@ -0,0 +1,195 @@ +/* TODO fix BL incl MOV LR, PC */ +#include "arm/assemble.h" + +static struct assemble_ctx tdctx_to_actx(const struct transform_dis_ctx *ctx) { + int cond; + if (ctx->arch.pc_low_bit) { + cond = ctx->op >> 28; + if (cond == 0xf) + cond = 0xe; + } else { + cond = 0; + } + return (struct assemble_ctx) { + ctx->rewritten_ptr_ptr, + ctx->arch.pc_low_bit, + cond + }; + +} + +static int invert_arm_cond(int cc) { + if (cc >= 0xe) + __builtin_abort(); + return cc ^ 1; +} + +static NOINLINE UNUSED void transform_dis_data(struct transform_dis_ctx *ctx, + unsigned o0, unsigned o1, unsigned o2, unsigned o3, unsigned out_mask) { +#ifdef TRANSFORM_DIS_VERBOSE + printf("transform_dis_data: (%p) %x %x %x %x out_mask=%x\n", (void *) ctx->pc, + o0, o1, o2, o3, out_mask); +#endif + /* We only care if at least one op is PC, so quickly test that. 
*/ + if (((o0 | o1 | o2 | o3) & 15) != 15) + return; + unsigned *newval = ctx->newval; + newval[0] = o0; + newval[1] = o1; + newval[2] = o2; + newval[3] = o3; + + void **codep = ctx->rewritten_ptr_ptr; + struct assemble_ctx actx = tdctx_to_actx(ctx); + + /* A few cases: + * 1. Move to PC that does not read PC. Probably fine. + * 2. Move to PC that does read PC, e.g. 'ldrls pc, [pc, r0, lsl #2]'. + * This is different from #4 mainly in that we can't need to do + * something like pop {temp, pc}. Not terribly plausible (only likely + * in non-position-independent code in ARM mode, and I can't get it to + * happen in the first 8 bytes then), but we may as well handle it. + * 3. Read of PC that does not read the register(s) it writes, e.g. adr r3, + * X. In this case we can use that register as a temporary. + * 4. Read of PC that does, or doesn't have any output register, e.g. add + * r3, pc. In this case we use the stack because reliably finding a + * free register would be work, and might not even be possible (thumb + * mov r9, r0; mov r12, r1; ) + * the out register is always first. + */ + uint16_t in_regs = 0; + int out_reg = -1; + for (int i = 0; i < 4; i++) { + if (out_mask & 1 << i) + out_reg = newval[i]; + else if (newval[i] != null_op) + in_regs |= 1 << newval[i]; + } + if (out_mask & DFLAG_IS_LDRD_STRD) + in_regs |= 1 << (newval[0] + 1); + uint32_t pc = ctx->pc + (ctx->arch.pc_low_bit ? 
4 : 8); + int scratch = __builtin_ctz(~(in_regs | (1 << out_reg))); + +#ifdef TRANSFORM_DIS_VERBOSE + printf("transform_dis_data: in_regs=%x out_reg=%d pc=%x scratch=%d\n", + in_regs, out_reg, pc, scratch); +#endif + + if (out_reg == 15) { + if (in_regs & 1 << 15) + return; /* case 1 */ + /* case 2 */ + PUSHone(actx, scratch); + PUSHone(actx, scratch); + MOVW_MOVT(actx, scratch, pc); + for (int i = 0; i < 4; i++) + if (newval[i] == 15) + newval[i] = scratch; + ctx->write_newop_here = *codep; *codep += ctx->op_size; + STRri(actx, scratch, 13, 4); + POPmulti(actx, 1 << scratch | 1 << 15); + if (actx.cond != 0xe) + transform_dis_ret(ctx); + } else { + if (out_reg != -1 && !(in_regs & 1 << out_reg)) { + /* case 3 - ignore scratch */ + MOVW_MOVT(actx, out_reg, pc); + for (int i = 0; i < 4; i++) + if (newval[i] == 15) + newval[i] = out_reg; + ctx->write_newop_here = *codep; *codep += ctx->op_size; + } else { + /* case 4 */ + PUSHone(actx, scratch); + MOVW_MOVT(actx, scratch, pc); + for (int i = 0; i < 4; i++) + if (newval[i] == 15) + newval[i] = scratch; + ctx->write_newop_here = *codep; *codep += ctx->op_size; + POPone(actx, scratch); + } + } + ctx->modify = true; +#ifdef TRANSFORM_DIS_VERBOSE + printf("transform_dis_data: => %x %x %x %x\n", + newval[0], newval[1], newval[2], newval[3]); +#endif +} + +static NOINLINE UNUSED void transform_dis_pcrel(struct transform_dis_ctx *ctx, + uintptr_t dpc, unsigned reg, enum pcrel_load_mode load_mode) { +#ifdef TRANSFORM_DIS_VERBOSE + printf("transform_dis_pcrel: (%p) dpc=%p reg=%x mode=%d\n", (void *) ctx->pc, + (void *) dpc, reg, load_mode); +#endif + ctx->write_newop_here = NULL; + struct assemble_ctx actx = tdctx_to_actx(ctx); + if (reg == 15) { + int scratch = 0; + PUSHone(actx, scratch); + PUSHone(actx, scratch); + MOVW_MOVT(actx, scratch, dpc); + if (load_mode != PLM_ADR) + LDRxi(actx, scratch, scratch, 0, load_mode); + STRri(actx, scratch, 13, 4); + POPmulti(actx, 1 << scratch | 1 << 15); + transform_dis_ret(ctx); + } else 
{ + MOVW_MOVT(actx, reg, dpc); + if (load_mode != PLM_ADR) + LDRxi(actx, reg, reg, 0, load_mode); + } +} + +static NOINLINE UNUSED void transform_dis_branch(struct transform_dis_ctx *ctx, + uintptr_t dpc, int cc) { +#ifdef TRANSFORM_DIS_VERBOSE + printf("transform_dis (%p): branch => %p\n", (void *) ctx->pc, (void *) dpc); +#endif + if (dpc >= ctx->pc_patch_start && dpc < ctx->pc_patch_end) { + /* don't support this for now */ + /* making the simplifying assumption here that functions will not try + * to branch into the middle of an IT block, which is the case where + * pc_patch_end changes to include additional instructions (as opposed + * to include the end of a partially included instruction, which is + * common) */ + ctx->err = SUBSTITUTE_ERR_FUNC_BAD_INSN_AT_START; + return; + } + struct assemble_ctx actx = tdctx_to_actx(ctx); + ctx->write_newop_here = NULL; + if ((cc & CC_ARMCC) == CC_ARMCC) { + actx.cond = invert_arm_cond(cc & 0xf); + Bccrel(actx, 2+8); + } else if ((cc & CC_CBXZ) == CC_CBXZ) { + ctx->modify = true; + ctx->newval[0] = ctx->pc + 2+8; + ctx->newval[1] = 1; /* do invert */ + void **codep = ctx->rewritten_ptr_ptr; + ctx->write_newop_here = *codep; *codep += 2; + } + actx.cond = 0xe; + LDR_PC(actx, dpc | ctx->arch.pc_low_bit); +} + +static void transform_dis_pre_dis(struct transform_dis_ctx *ctx) { + /* for simplicity we turn IT into a series of branches for each + * instruction, so... 
*/ + if (ctx->arch.it_conds[0] != 0xe) { + ctx->arch.bccrel_bits = invert_arm_cond(ctx->arch.it_conds[0]); + ctx->arch.bccrel_p = *ctx->rewritten_ptr_ptr; + *ctx->rewritten_ptr_ptr += 2; + } else { + ctx->arch.bccrel_p = NULL; + } +} + +static void transform_dis_post_dis(struct transform_dis_ctx *ctx) { + if (ctx->arch.bccrel_p) { + struct assemble_ctx actx = {&ctx->arch.bccrel_p, + /*thumb*/ true, + ctx->arch.bccrel_bits}; + Bccrel(actx, *ctx->rewritten_ptr_ptr - ctx->arch.bccrel_p); + } + ctx->force_keep_transforming = ctx->arch.it_conds[0] != 0xe; +} diff --git a/lib/arm/dis-arm-multi.inc.h b/lib/arm/dis-arm-multi.inc.h deleted file mode 100644 index bf2767e..0000000 --- a/lib/arm/dis-arm-multi.inc.h +++ /dev/null @@ -1,16 +0,0 @@ -#include "dis-thumb.inc.h" -#include "dis-thumb2.inc.h" -#include "dis-arm.inc.h" - -static INLINE void P(dis)(tdis_ctx ctx) { - if (ctx->arch.pc_low_bit) { - uint16_t op = *(uint16_t *) ctx->ptr; - bool is_32 = (op >> 13 & 7) == 7 && (op >> 11 & 3) != 0; - if (is_32) - return P(dis_thumb2)(ctx); - else - return P(dis_thumb)(ctx); - } else { - return P(dis_arm)(ctx); - } -} diff --git a/lib/arm/dis-main.inc.h b/lib/arm/dis-main.inc.h new file mode 100644 index 0000000..bf2767e --- /dev/null +++ b/lib/arm/dis-main.inc.h @@ -0,0 +1,16 @@ +#include "dis-thumb.inc.h" +#include "dis-thumb2.inc.h" +#include "dis-arm.inc.h" + +static INLINE void P(dis)(tdis_ctx ctx) { + if (ctx->arch.pc_low_bit) { + uint16_t op = *(uint16_t *) ctx->ptr; + bool is_32 = (op >> 13 & 7) == 7 && (op >> 11 & 3) != 0; + if (is_32) + return P(dis_thumb2)(ctx); + else + return P(dis_thumb)(ctx); + } else { + return P(dis_arm)(ctx); + } +} diff --git a/lib/arm/misc.h b/lib/arm/misc.h index ef11a05..c18367d 100644 --- a/lib/arm/misc.h +++ b/lib/arm/misc.h @@ -1,59 +1,3 @@ #pragma once +#define TARGET_POINTER_SIZE 4 #define TARGET_DIS_SUPPORTED -#define TARGET_DIS_HEADER "arm/dis-arm-multi.inc.h" -#define TARGET_JUMP_PATCH_HDR "arm/jump-patch.h" -#define 
TARGET_TRANSFORM_DIS_HEADER "arm/transform-dis-arm-multi.inc.h" -#define MIN_INSN_SIZE 2 -/* each input instruction might turn into: - * - 2 bytes for Bcc, if in IT - * then ONE of: - * - 2/4 bytes for just the instruction - * - 2+8 bytes for branch (which in *valid* code rules out IT but whatever) - * - up to 7 4-byte insns for pcrel (if dest=pc, and while these can be subject - * to IT, there can only reasonably be two per block, and if there are both - * then that's an unconditional exit - but we don't enforce any of this - * currently) - * - up to 7 4-byte insns for similar moves to PC that fall under 'data' - * the maximum number of possible inputs is 4, plus 4 extras if the last one - * was an IT (but in that case it can't be one of the above cases) - * while this looks huge, it's overly conservative and doesn't matter much, - * since only the actually used space will be taken up in the final output - */ -#define TD_MAX_REWRITTEN_SIZE (7*4*7 + 4) /* 196 */ - -struct arch_dis_ctx { - /* thumb? */ - bool pc_low_bit; - /* if thumb, IT cond for the next 5 instructions - * (5 because we still advance after IT) */ - uint8_t it_conds[5]; - /* for transform_dis - did we add space for a Bccrel? 
*/ - uint8_t bccrel_bits; - void *bccrel_p; -}; - -static inline void arch_dis_ctx_init(struct arch_dis_ctx *ctx) { - ctx->pc_low_bit = false; - ctx->bccrel_p = NULL; - memset(ctx->it_conds, 0xe, 5); -} - -static inline void advance_it_cond(struct arch_dis_ctx *ctx) { - ctx->it_conds[0] = ctx->it_conds[1]; - ctx->it_conds[1] = ctx->it_conds[2]; - ctx->it_conds[2] = ctx->it_conds[3]; - ctx->it_conds[3] = ctx->it_conds[4]; - ctx->it_conds[4] = 0xe; -} - -#define DFLAG_IS_LDRD_STRD (1 << 16) - -/* Types of conditionals for 'branch' */ -/* a regular old branch-with-condition */ -#define CC_ARMCC (CC_CONDITIONAL | 0x400) -/* already in an IT block - in transform_dis this will be rewritten to a branch - * anyway, so it can be treated as unconditional; in jump_dis we have to know - * to keep going */ -#define CC_ALREADY_IN_IT (CC_CONDITIONAL | 0x800) -/* CBZ/CBNZ is rewritten */ -#define CC_CBXZ (CC_CONDITIONAL | 0xc00) diff --git a/lib/arm/transform-dis-arm-multi.inc.h b/lib/arm/transform-dis-arm-multi.inc.h deleted file mode 100644 index 6e91ff5..0000000 --- a/lib/arm/transform-dis-arm-multi.inc.h +++ /dev/null @@ -1,195 +0,0 @@ -/* TODO fix BL incl MOV LR, PC */ -#include "arm/assemble.h" - -static struct assemble_ctx tdctx_to_actx(const struct transform_dis_ctx *ctx) { - int cond; - if (ctx->arch.pc_low_bit) { - cond = ctx->op >> 28; - if (cond == 0xf) - cond = 0xe; - } else { - cond = 0; - } - return (struct assemble_ctx) { - ctx->rewritten_ptr_ptr, - ctx->arch.pc_low_bit, - cond - }; - -} - -static int invert_arm_cond(int cc) { - if (cc >= 0xe) - __builtin_abort(); - return cc ^ 1; -} - -static NOINLINE UNUSED void transform_dis_data(struct transform_dis_ctx *ctx, - unsigned o0, unsigned o1, unsigned o2, unsigned o3, unsigned out_mask) { -#ifdef TRANSFORM_DIS_VERBOSE - printf("transform_dis_data: (%p) %x %x %x %x out_mask=%x\n", (void *) ctx->pc, - o0, o1, o2, o3, out_mask); -#endif - /* We only care if at least one op is PC, so quickly test that. 
*/ - if (((o0 | o1 | o2 | o3) & 15) != 15) - return; - unsigned *newval = ctx->newval; - newval[0] = o0; - newval[1] = o1; - newval[2] = o2; - newval[3] = o3; - - void **codep = ctx->rewritten_ptr_ptr; - struct assemble_ctx actx = tdctx_to_actx(ctx); - - /* A few cases: - * 1. Move to PC that does not read PC. Probably fine. - * 2. Move to PC that does read PC, e.g. 'ldrls pc, [pc, r0, lsl #2]'. - * This is different from #4 mainly in that we can't need to do - * something like pop {temp, pc}. Not terribly plausible (only likely - * in non-position-independent code in ARM mode, and I can't get it to - * happen in the first 8 bytes then), but we may as well handle it. - * 3. Read of PC that does not read the register(s) it writes, e.g. adr r3, - * X. In this case we can use that register as a temporary. - * 4. Read of PC that does, or doesn't have any output register, e.g. add - * r3, pc. In this case we use the stack because reliably finding a - * free register would be work, and might not even be possible (thumb - * mov r9, r0; mov r12, r1; ) - * the out register is always first. - */ - uint16_t in_regs = 0; - int out_reg = -1; - for (int i = 0; i < 4; i++) { - if (out_mask & 1 << i) - out_reg = newval[i]; - else if (newval[i] != null_op) - in_regs |= 1 << newval[i]; - } - if (out_mask & DFLAG_IS_LDRD_STRD) - in_regs |= 1 << (newval[0] + 1); - uint32_t pc = ctx->pc + (ctx->arch.pc_low_bit ? 
4 : 8); - int scratch = __builtin_ctz(~(in_regs | (1 << out_reg))); - -#ifdef TRANSFORM_DIS_VERBOSE - printf("transform_dis_data: in_regs=%x out_reg=%d pc=%x scratch=%d\n", - in_regs, out_reg, pc, scratch); -#endif - - if (out_reg == 15) { - if (in_regs & 1 << 15) - return; /* case 1 */ - /* case 2 */ - PUSHone(actx, scratch); - PUSHone(actx, scratch); - MOVW_MOVT(actx, scratch, pc); - for (int i = 0; i < 4; i++) - if (newval[i] == 15) - newval[i] = scratch; - ctx->write_newop_here = *codep; *codep += ctx->op_size; - STRri(actx, scratch, 13, 4); - POPmulti(actx, 1 << scratch | 1 << 15); - if (actx.cond != 0xe) - transform_dis_ret(ctx); - } else { - if (out_reg != -1 && !(in_regs & 1 << out_reg)) { - /* case 3 - ignore scratch */ - MOVW_MOVT(actx, out_reg, pc); - for (int i = 0; i < 4; i++) - if (newval[i] == 15) - newval[i] = out_reg; - ctx->write_newop_here = *codep; *codep += ctx->op_size; - } else { - /* case 4 */ - PUSHone(actx, scratch); - MOVW_MOVT(actx, scratch, pc); - for (int i = 0; i < 4; i++) - if (newval[i] == 15) - newval[i] = scratch; - ctx->write_newop_here = *codep; *codep += ctx->op_size; - POPone(actx, scratch); - } - } - ctx->modify = true; -#ifdef TRANSFORM_DIS_VERBOSE - printf("transform_dis_data: => %x %x %x %x\n", - newval[0], newval[1], newval[2], newval[3]); -#endif -} - -static NOINLINE UNUSED void transform_dis_pcrel(struct transform_dis_ctx *ctx, - uintptr_t dpc, unsigned reg, enum pcrel_load_mode load_mode) { -#ifdef TRANSFORM_DIS_VERBOSE - printf("transform_dis_pcrel: (%p) dpc=%p reg=%x mode=%d\n", (void *) ctx->pc, - (void *) dpc, reg, load_mode); -#endif - ctx->write_newop_here = NULL; - struct assemble_ctx actx = tdctx_to_actx(ctx); - if (reg == 15) { - int scratch = 0; - PUSHone(actx, scratch); - PUSHone(actx, scratch); - MOVW_MOVT(actx, scratch, dpc); - if (load_mode != PLM_ADR) - LDRxi(actx, scratch, scratch, 0, load_mode); - STRri(actx, scratch, 13, 4); - POPmulti(actx, 1 << scratch | 1 << 15); - transform_dis_ret(ctx); - } else 
{ - MOVW_MOVT(actx, reg, dpc); - if (load_mode != PLM_ADR) - LDRxi(actx, reg, reg, 0, load_mode); - } -} - -static NOINLINE UNUSED void transform_dis_branch(struct transform_dis_ctx *ctx, - uintptr_t dpc, int cc) { -#ifdef TRANSFORM_DIS_VERBOSE - printf("transform_dis (%p): branch => %p\n", (void *) ctx->pc, (void *) dpc); -#endif - if (dpc >= ctx->pc_patch_start && dpc < ctx->pc_patch_end) { - /* don't support this for now */ - /* making the simplifying assumption here that functions will not try - * to branch into the middle of an IT block, which is the case where - * pc_patch_end changes to include additional instructions (as opposed - * to include the end of a partially included instruction, which is - * common) */ - ctx->err = SUBSTITUTE_ERR_FUNC_BAD_INSN_AT_START; - return; - } - struct assemble_ctx actx = tdctx_to_actx(ctx); - ctx->write_newop_here = NULL; - if ((cc & CC_ARMCC) == CC_ARMCC) { - actx.cond = invert_arm_cond(cc & 0xf); - Bccrel(actx, 2+8); - } else if ((cc & CC_CBXZ) == CC_CBXZ) { - ctx->modify = true; - ctx->newval[0] = ctx->pc + 2+8; - ctx->newval[1] = 1; /* do invert */ - void **codep = ctx->rewritten_ptr_ptr; - ctx->write_newop_here = *codep; *codep += 2; - } - actx.cond = 0xe; - LDR_PC(actx, dpc | ctx->arch.pc_low_bit); -} - -static void transform_dis_pre_dis(struct transform_dis_ctx *ctx) { - /* for simplicity we turn IT into a series of branches for each - * instruction, so... 
*/ - if (ctx->arch.it_conds[0] != 0xe) { - ctx->arch.bccrel_bits = invert_arm_cond(ctx->arch.it_conds[0]); - ctx->arch.bccrel_p = *ctx->rewritten_ptr_ptr; - *ctx->rewritten_ptr_ptr += 2; - } else { - ctx->arch.bccrel_p = NULL; - } -} - -static void transform_dis_post_dis(struct transform_dis_ctx *ctx) { - if (ctx->arch.bccrel_p) { - struct assemble_ctx actx = {&ctx->arch.bccrel_p, - /*thumb*/ true, - ctx->arch.bccrel_bits}; - Bccrel(actx, *ctx->rewritten_ptr_ptr - ctx->arch.bccrel_p); - } - ctx->force_keep_transforming = ctx->arch.it_conds[0] != 0xe; -} diff --git a/lib/arm64/arch-dis.h b/lib/arm64/arch-dis.h new file mode 100644 index 0000000..f91328b --- /dev/null +++ b/lib/arm64/arch-dis.h @@ -0,0 +1,37 @@ +#pragma once +#define MIN_INSN_SIZE 4 +#define TD_MAX_REWRITTEN_SIZE (7 * 2 * 4) /* also conservative */ + +struct arch_pcrel_info { + unsigned reg; + enum pcrel_load_mode lm; +}; + +struct arch_dis_ctx { + /* For transform_dis only - used to get temporary registers. We assume + * that we can use any caller-saved or IP register which was not written, + * so r9-r18. + * This is a massive overestimate: we just OR in each instruction's bits + * 4:0 (Rd for data, Rt for loads, most common), 14:10 (Rt2 for load-pair + * instructions), and 20:16 (Rs for store-exclusive instructions). It + * would be easy to restrict the latter two to the few instructions that + * actually use them, but with 10 available registers, and a patch of at + * most 3 instructions (and none of the instructions that require a temp + * use Rt2/Rs or could read their Rd, so the third doesn't count), we won't + * run out even with the dumbest possible thing. 
*/ + uint32_t regs_possibly_written; +}; + +static inline void arch_dis_ctx_init(struct arch_dis_ctx *ctx) { + ctx->regs_possibly_written = 0; +} + +static inline int arm64_get_unwritten_temp_reg(struct arch_dis_ctx *ctx) { + uint32_t avail = ~ctx->regs_possibly_written & ((1 << 19) - (1 << 9)); + if (!avail) + __builtin_abort(); + return 31 - __builtin_clz(avail); +} + +#define CC_ARMCC (CC_CONDITIONAL | 0x400) +#define CC_XBXZ (CC_CONDITIONAL | 0x800) diff --git a/lib/arm64/arch-transform-dis.inc.h b/lib/arm64/arch-transform-dis.inc.h new file mode 100644 index 0000000..d8f831d --- /dev/null +++ b/lib/arm64/arch-transform-dis.inc.h @@ -0,0 +1,52 @@ +#include "arm64/assemble.h" + +static NOINLINE UNUSED +void transform_dis_pcrel(struct transform_dis_ctx *ctx, uint_tptr dpc, unsigned reg, + enum pcrel_load_mode load_mode) { + ctx->write_newop_here = NULL; + void **codep = ctx->rewritten_ptr_ptr; + if (load_mode >= PLM_U32_SIMD) { + int reg = arm64_get_unwritten_temp_reg(&ctx->arch); + MOVi64(codep, 0, dpc); + LDRxi(codep, reg, 0, 0, true, load_mode); + } else { + MOVi64(codep, reg, dpc); + LDRxi(codep, reg, reg, 0, true, load_mode); + } +} + +static NOINLINE UNUSED +void transform_dis_branch(struct transform_dis_ctx *ctx, uint_tptr dpc, int cc) { + /* TODO fix BL */ +#ifdef TRANSFORM_DIS_VERBOSE + printf("transform_dis (%p): branch => %p\n", (void *) ctx->pc, (void *) dpc); +#endif + if (dpc >= ctx->pc_patch_start && dpc < ctx->pc_patch_end) { + ctx->err = SUBSTITUTE_ERR_FUNC_BAD_INSN_AT_START; + return; + } + ctx->write_newop_here = NULL; + int mov_br_size = size_of_MOVi64(dpc) + 4; + + void **codep = ctx->rewritten_ptr_ptr; + if ((cc & CC_ARMCC) == CC_ARMCC) { + int icc = (cc & 0xf) ^ 1; + Bccrel(codep, icc, 4 + mov_br_size); + } else if ((cc & CC_XBXZ) == CC_XBXZ) { + ctx->modify = true; + ctx->newval[0] = ctx->pc + 4 + mov_br_size; + ctx->newval[1] = 1; /* do invert */ + ctx->write_newop_here = *codep; *codep += 4; + } + int reg = 
arm64_get_unwritten_temp_reg(&ctx->arch); + MOVi64(codep, reg, dpc); + BR(codep, reg); +} + +static void transform_dis_pre_dis(UNUSED struct transform_dis_ctx *ctx) {} +static void transform_dis_post_dis(struct transform_dis_ctx *ctx) { + uint32_t op = ctx->op; + ctx->arch.regs_possibly_written |= op & 31; + ctx->arch.regs_possibly_written |= op >> 10 & 31; + ctx->arch.regs_possibly_written |= op >> 16 & 31; +} diff --git a/lib/arm64/dis-arm64.inc.h b/lib/arm64/dis-arm64.inc.h deleted file mode 100644 index 04349f2..0000000 --- a/lib/arm64/dis-arm64.inc.h +++ /dev/null @@ -1,69 +0,0 @@ -static INLINE void P(adrlabel_label_unk_Xd_1_ADR)(tdis_ctx ctx, struct bitslice Xd, struct bitslice label) { - return P(pcrel)(ctx, ctx->pc + sext(bs_get(label, ctx->op), 22), - bs_get(Xd, ctx->op), PLM_ADR); -} -static INLINE void P(adrplabel_label_unk_Xd_1_ADRP)(tdis_ctx ctx, struct bitslice Xd, struct bitslice label) { - return P(pcrel)(ctx, ctx->pc + (sext(bs_get(label, ctx->op), 22) << 12), - bs_get(Xd, ctx->op), PLM_ADR); -} -static INLINE void P(am_b_target_addr_B_1_B)(tdis_ctx ctx, struct bitslice addr) { - return P(branch)(ctx, ctx->pc + sext(bs_get(addr, ctx->op), 26) * 4, - /*cc*/ 0); -} -static INLINE void P(am_bl_target_addr_1_BL)(tdis_ctx ctx, struct bitslice addr) { - return P(branch)(ctx, ctx->pc + sext(bs_get(addr, ctx->op), 26) * 4, - /*cc*/ 0); -} -static INLINE void P(ccode_cond_am_brcond_target_B_1_Bcc)(tdis_ctx ctx, struct bitslice cond, struct bitslice target) { - int bits = bs_get(cond, ctx->op); - /* Bcc with AL/NV (which is actually just another AL) is useless but possible. */ - int cc = bits >= 0xe ? 
0 : (CC_ARMCC | bits); - return P(branch)(ctx, ctx->pc + sext(bs_get(target, ctx->op), 19) * 4, cc); -} -static INLINE void P(am_tbrcond_target_B_4_TBNZW)(tdis_ctx ctx, struct bitslice target) { - P(branch)(ctx, ctx->pc + sext(bs_get(target, ctx->op), 14) * 4, CC_XBXZ); - if (TDIS_CTX_MODIFY(ctx)) { - /* ditto CBNZ on ARM */ - int new_target = (TDIS_CTX_NEWVAL(ctx, 0) - ctx->pc) / 4; - unsigned new = bs_set(target, new_target, ctx->op); - if (TDIS_CTX_NEWVAL(ctx, 1)) - new ^= 1 << 24; - TDIS_CTX_SET_NEWOP(ctx, new); - } -} -static INLINE void P(am_brcond_target_B_4_CBNZW)(tdis_ctx ctx, struct bitslice target) { - /* both have the same bit to control Z/NZ */ - return P(am_tbrcond_target_B_4_TBNZW)(ctx, target); -} -static INLINE void P(am_ldrlit_label_unk_Rt_6_LDRDl)(tdis_ctx ctx, struct bitslice Rt, struct bitslice label) { - enum pcrel_load_mode mode; - if ((ctx->op >> 26) & 1) { - switch (ctx->op >> 30) { - case 0: mode = PLM_U32_SIMD; break; - case 1: mode = PLM_U64_SIMD; break; - case 2: mode = PLM_U128_SIMD; break; - default: __builtin_abort(); - } - } else { - switch (ctx->op >> 30) { - case 0: mode = PLM_U32; break; - case 1: mode = PLM_U64; break; - case 2: mode = PLM_S32; break; - default: __builtin_abort(); - } - } - return P(pcrel)(ctx, ctx->pc + sext(bs_get(label, ctx->op), 19) * 4, - bs_get(Rt, ctx->op), mode); -} -static INLINE void P(GPR64_Rn_1_RET)(tdis_ctx ctx, UNUSED struct bitslice Rn) { - return P(ret)(ctx); -} - -static INLINE void P(dis)(tdis_ctx ctx) { - uint32_t op = ctx->op = *(uint32_t *) ctx->ptr; - ctx->op_size = 4; - /* clang doesn't realize that this is unreachable and generates code like - * "and ecx, 0x1f; cmp ecx, 0x1f; ja abort". Yeah, nice job there. 
*/ - #include "../generated/generic-dis-arm64.inc.h" - __builtin_abort(); -} diff --git a/lib/arm64/dis-main.inc.h b/lib/arm64/dis-main.inc.h new file mode 100644 index 0000000..04349f2 --- /dev/null +++ b/lib/arm64/dis-main.inc.h @@ -0,0 +1,69 @@ +static INLINE void P(adrlabel_label_unk_Xd_1_ADR)(tdis_ctx ctx, struct bitslice Xd, struct bitslice label) { + return P(pcrel)(ctx, ctx->pc + sext(bs_get(label, ctx->op), 22), + bs_get(Xd, ctx->op), PLM_ADR); +} +static INLINE void P(adrplabel_label_unk_Xd_1_ADRP)(tdis_ctx ctx, struct bitslice Xd, struct bitslice label) { + return P(pcrel)(ctx, ctx->pc + (sext(bs_get(label, ctx->op), 22) << 12), + bs_get(Xd, ctx->op), PLM_ADR); +} +static INLINE void P(am_b_target_addr_B_1_B)(tdis_ctx ctx, struct bitslice addr) { + return P(branch)(ctx, ctx->pc + sext(bs_get(addr, ctx->op), 26) * 4, + /*cc*/ 0); +} +static INLINE void P(am_bl_target_addr_1_BL)(tdis_ctx ctx, struct bitslice addr) { + return P(branch)(ctx, ctx->pc + sext(bs_get(addr, ctx->op), 26) * 4, + /*cc*/ 0); +} +static INLINE void P(ccode_cond_am_brcond_target_B_1_Bcc)(tdis_ctx ctx, struct bitslice cond, struct bitslice target) { + int bits = bs_get(cond, ctx->op); + /* Bcc with AL/NV (which is actually just another AL) is useless but possible. */ + int cc = bits >= 0xe ? 
0 : (CC_ARMCC | bits); + return P(branch)(ctx, ctx->pc + sext(bs_get(target, ctx->op), 19) * 4, cc); +} +static INLINE void P(am_tbrcond_target_B_4_TBNZW)(tdis_ctx ctx, struct bitslice target) { + P(branch)(ctx, ctx->pc + sext(bs_get(target, ctx->op), 14) * 4, CC_XBXZ); + if (TDIS_CTX_MODIFY(ctx)) { + /* ditto CBNZ on ARM */ + int new_target = (TDIS_CTX_NEWVAL(ctx, 0) - ctx->pc) / 4; + unsigned new = bs_set(target, new_target, ctx->op); + if (TDIS_CTX_NEWVAL(ctx, 1)) + new ^= 1 << 24; + TDIS_CTX_SET_NEWOP(ctx, new); + } +} +static INLINE void P(am_brcond_target_B_4_CBNZW)(tdis_ctx ctx, struct bitslice target) { + /* both have the same bit to control Z/NZ */ + return P(am_tbrcond_target_B_4_TBNZW)(ctx, target); +} +static INLINE void P(am_ldrlit_label_unk_Rt_6_LDRDl)(tdis_ctx ctx, struct bitslice Rt, struct bitslice label) { + enum pcrel_load_mode mode; + if ((ctx->op >> 26) & 1) { + switch (ctx->op >> 30) { + case 0: mode = PLM_U32_SIMD; break; + case 1: mode = PLM_U64_SIMD; break; + case 2: mode = PLM_U128_SIMD; break; + default: __builtin_abort(); + } + } else { + switch (ctx->op >> 30) { + case 0: mode = PLM_U32; break; + case 1: mode = PLM_U64; break; + case 2: mode = PLM_S32; break; + default: __builtin_abort(); + } + } + return P(pcrel)(ctx, ctx->pc + sext(bs_get(label, ctx->op), 19) * 4, + bs_get(Rt, ctx->op), mode); +} +static INLINE void P(GPR64_Rn_1_RET)(tdis_ctx ctx, UNUSED struct bitslice Rn) { + return P(ret)(ctx); +} + +static INLINE void P(dis)(tdis_ctx ctx) { + uint32_t op = ctx->op = *(uint32_t *) ctx->ptr; + ctx->op_size = 4; + /* clang doesn't realize that this is unreachable and generates code like + * "and ecx, 0x1f; cmp ecx, 0x1f; ja abort". Yeah, nice job there. 
*/ + #include "../generated/generic-dis-arm64.inc.h" + __builtin_abort(); +} diff --git a/lib/arm64/misc.h b/lib/arm64/misc.h index f5a6154..066e9d5 100644 --- a/lib/arm64/misc.h +++ b/lib/arm64/misc.h @@ -1,36 +1,3 @@ #pragma once +#define TARGET_POINTER_SIZE 8 #define TARGET_DIS_SUPPORTED -#define TARGET_DIS_HEADER "arm64/dis-arm64.inc.h" -#define TARGET_JUMP_PATCH_HDR "arm64/jump-patch.h" -#define TARGET_TRANSFORM_DIS_HEADER "arm64/transform-dis-arm64.inc.h" -#define MIN_INSN_SIZE 4 -#define TD_MAX_REWRITTEN_SIZE (7 * 2 * 4) /* also conservative */ - -struct arch_dis_ctx { - /* For transform_dis only - used to get temporary registers. We assume - * that we can use any caller-saved or IP register which was not written, - * so r9-r18. - * This is a massive overestimate: we just OR in each instruction's bits - * 4:0 (Rd for data, Rt for loads, most common), 14:10 (Rt2 for load-pair - * instructions), and 20:16 (Rs for store-exclusive insturctions). It - * would be easy to restrict the latter two to the few instructions that - * actually use them, but with 10 available registers, and a patch of at - * most 3 instructions (and none of the instructions that require a temp - * use Rt2/Rs or could read their Rd, so the third doesn't count), we won't - * run out even with the dumbest possible thing. 
*/ - uint32_t regs_possibly_written; -}; - -static inline void arch_dis_ctx_init(struct arch_dis_ctx *ctx) { - ctx->regs_possibly_written = 0; -} - -static inline int arm64_get_unwritten_temp_reg(struct arch_dis_ctx *ctx) { - uint32_t avail = ~ctx->regs_possibly_written & ((1 << 19) - (1 << 9)); - if (!avail) - __builtin_abort(); - return 31 - __builtin_clz(avail); -} - -#define CC_ARMCC (CC_CONDITIONAL | 0x400) -#define CC_XBXZ (CC_CONDITIONAL | 0x800) diff --git a/lib/arm64/transform-dis-arm64.inc.h b/lib/arm64/transform-dis-arm64.inc.h deleted file mode 100644 index 792b835..0000000 --- a/lib/arm64/transform-dis-arm64.inc.h +++ /dev/null @@ -1,52 +0,0 @@ -#include "arm64/assemble.h" - -static NOINLINE UNUSED -void transform_dis_pcrel(struct transform_dis_ctx *ctx, uintptr_t dpc, unsigned reg, - enum pcrel_load_mode load_mode) { - ctx->write_newop_here = NULL; - void **codep = ctx->rewritten_ptr_ptr; - if (load_mode >= PLM_U32_SIMD) { - int reg = arm64_get_unwritten_temp_reg(&ctx->arch); - MOVi64(codep, 0, dpc); - LDRxi(codep, reg, 0, 0, true, load_mode); - } else { - MOVi64(codep, reg, dpc); - LDRxi(codep, reg, reg, 0, true, load_mode); - } -} - -static NOINLINE UNUSED -void transform_dis_branch(struct transform_dis_ctx *ctx, uintptr_t dpc, int cc) { - /* TODO fix BL */ -#ifdef TRANSFORM_DIS_VERBOSE - printf("transform_dis (%p): branch => %p\n", (void *) ctx->pc, (void *) dpc); -#endif - if (dpc >= ctx->pc_patch_start && dpc < ctx->pc_patch_end) { - ctx->err = SUBSTITUTE_ERR_FUNC_BAD_INSN_AT_START; - return; - } - ctx->write_newop_here = NULL; - int mov_br_size = size_of_MOVi64(dpc) + 4; - - void **codep = ctx->rewritten_ptr_ptr; - if ((cc & CC_ARMCC) == CC_ARMCC) { - int icc = (cc & 0xf) ^ 1; - Bccrel(codep, icc, 4 + mov_br_size); - } else if ((cc & CC_XBXZ) == CC_XBXZ) { - ctx->modify = true; - ctx->newval[0] = ctx->pc + 4 + mov_br_size; - ctx->newval[1] = 1; /* do invert */ - ctx->write_newop_here = *codep; *codep += 4; - } - int reg = 
arm64_get_unwritten_temp_reg(&ctx->arch); - MOVi64(codep, reg, dpc); - BR(codep, reg); -} - -static void transform_dis_pre_dis(UNUSED struct transform_dis_ctx *ctx) {} -static void transform_dis_post_dis(struct transform_dis_ctx *ctx) { - uint32_t op = ctx->op; - ctx->arch.regs_possibly_written |= op & 31; - ctx->arch.regs_possibly_written |= op >> 10 & 31; - ctx->arch.regs_possibly_written |= op >> 16 & 31; -} diff --git a/lib/dis.h b/lib/dis.h index 6b189e0..7455749 100644 --- a/lib/dis.h +++ b/lib/dis.h @@ -111,6 +111,11 @@ static const unsigned null_op = -0x100; #error "no disassembler for the target architecture yet" #endif +static inline void op64(void **codep, uint64_t op) { + *(uint64_t *) *codep = op; + *codep += 8; +} + static inline void op32(void **codep, uint32_t op) { *(uint32_t *) *codep = op; *codep += 4; @@ -121,5 +126,26 @@ static inline void op16(void **codep, uint16_t op) { *codep += 2; } +static inline void op8(void **codep, uint8_t op) { + *(uint8_t *) *codep = op; + (*codep)++; +} + #define CC_CONDITIONAL 0x100 #define CC_CALL 0x200 + +struct dis_ctx_base { + uint_tptr pc; + const void *ptr; +#if defined(TARGET_x86_64) || defined(TARGET_i386) + uint8_t newop[32]; +#else + uint8_t newop[4]; + uint32_t op; +#endif + uint32_t newval[4]; + bool modify; + int op_size, newop_size; +}; + +#include stringify(TARGET_DIR/arch-dis.h) diff --git a/lib/hook-functions.c b/lib/hook-functions.c index e0516cb..953683b 100644 --- a/lib/hook-functions.c +++ b/lib/hook-functions.c @@ -5,7 +5,7 @@ #include "transform-dis.h" #include "execmem.h" #include "stop-other-threads.h" -#include TARGET_JUMP_PATCH_HDR +#include stringify(TARGET_DIR/jump-patch.h) struct hook_internal { int offset_by_pcdiff[MAX_JUMP_PATCH_SIZE + 1]; @@ -168,7 +168,7 @@ int substitute_hook_functions(const struct substitute_function_hook *hooks, &hi->trampoline_page, arch))) goto end; - uintptr_t pc_patch_end = pc_patch_start + patch_size; + uint_tptr pc_patch_end = pc_patch_start + patch_size; 
/* Generate the rewritten start of the function for the outro * trampoline (complaining if any bad instructions are found) * (on arm64, this modifies regs_possibly_written, which is used by the diff --git a/lib/jump-dis.c b/lib/jump-dis.c index 3e29bf7..528cfc2 100644 --- a/lib/jump-dis.c +++ b/lib/jump-dis.c @@ -1,5 +1,6 @@ #include "substitute-internal.h" #ifdef TARGET_DIS_SUPPORTED +#define DIS_MAY_MODIFY 0 #include "dis.h" #include #include @@ -22,15 +23,14 @@ struct jump_dis_ctx { bool bad_insn; bool continue_after_this_insn; - uintptr_t pc; - uintptr_t pc_patch_start; - uintptr_t pc_patch_end; - unsigned op; - const void *ptr; - int op_size; + struct dis_ctx_base base; + + uint_tptr pc_patch_start; + uint_tptr pc_patch_end; + uint8_t seen_mask[JUMP_ANALYSIS_MAX_INSNS / 8]; /* queue of instructions to visit */ - uintptr_t *queue; + uint_tptr *queue; size_t queue_write_off; size_t queue_read_off; size_t queue_size; @@ -43,12 +43,8 @@ struct jump_dis_ctx { #define P(x) jump_dis_##x #define tdis_ctx struct jump_dis_ctx * -#define TDIS_CTX_MODIFY(ctx) 0 -#define TDIS_CTX_NEWVAL(ctx, n) 0 -#define TDIS_CTX_NEWOP(ctx) 0 -#define TDIS_CTX_SET_NEWOP(ctx, new) ((void) 0) -static void jump_dis_add_to_queue(struct jump_dis_ctx *ctx, uintptr_t pc) { +static void jump_dis_add_to_queue(struct jump_dis_ctx *ctx, uint_tptr pc) { size_t diff = (pc - ctx->pc_patch_start) / MIN_INSN_SIZE; if (diff >= JUMP_ANALYSIS_MAX_INSNS) { #ifdef JUMP_DIS_VERBOSE @@ -89,8 +85,8 @@ void jump_dis_data(UNUSED struct jump_dis_ctx *ctx, } static INLINE UNUSED -void jump_dis_pcrel(struct jump_dis_ctx *ctx, uintptr_t dpc, - UNUSED unsigned reg, UNUSED bool is_load) { +void jump_dis_pcrel(struct jump_dis_ctx *ctx, uint_tptr dpc, + UNUSED struct arch_pcrel_info info) { ctx->bad_insn = dpc >= ctx->pc_patch_start && dpc < ctx->pc_patch_end; } @@ -100,7 +96,7 @@ void jump_dis_ret(struct jump_dis_ctx *ctx) { } static NOINLINE UNUSED -void jump_dis_branch(struct jump_dis_ctx *ctx, uintptr_t dpc, bool 
conditional) { +void jump_dis_branch(struct jump_dis_ctx *ctx, uint_tptr dpc, int cc) { if (dpc >= ctx->pc_patch_start && dpc < ctx->pc_patch_end) { ctx->bad_insn = true; return; @@ -109,7 +105,7 @@ void jump_dis_branch(struct jump_dis_ctx *ctx, uintptr_t dpc, bool conditional) printf("jump-dis: enqueueing %llx\n", (unsigned long long) dpc); #endif jump_dis_add_to_queue(ctx, dpc); - ctx->continue_after_this_insn = conditional; + ctx->continue_after_this_insn = cc & (CC_CONDITIONAL | CC_CALL); } static INLINE UNUSED @@ -127,25 +123,25 @@ void jump_dis_thumb_it(UNUSED struct jump_dis_ctx *ctx) { static void jump_dis_dis(struct jump_dis_ctx *ctx); -bool jump_dis_main(void *code_ptr, uintptr_t pc_patch_start, uintptr_t pc_patch_end, +bool jump_dis_main(void *code_ptr, uint_tptr pc_patch_start, uint_tptr pc_patch_end, struct arch_dis_ctx initial_dis_ctx) { bool ret; struct jump_dis_ctx ctx; memset(&ctx, 0, sizeof(ctx)); ctx.pc_patch_start = pc_patch_start; ctx.pc_patch_end = pc_patch_end; - ctx.pc = pc_patch_end; + ctx.base.pc = pc_patch_end; ctx.arch = initial_dis_ctx; while (1) { ctx.bad_insn = false; ctx.continue_after_this_insn = true; - ctx.ptr = code_ptr + (ctx.pc - pc_patch_start); + ctx.base.ptr = code_ptr + (ctx.base.pc - pc_patch_start); jump_dis_dis(&ctx); #ifdef JUMP_DIS_VERBOSE printf("jump-dis: pc=%llx op=%08x size=%x bad=%d continue_after=%d\n", - (unsigned long long) ctx.pc, - ctx.op, - ctx.op_size, + (unsigned long long) ctx.base.pc, + ctx.base.op, + ctx.base.op_size, ctx.bad_insn, ctx.continue_after_this_insn); #endif @@ -154,12 +150,12 @@ bool jump_dis_main(void *code_ptr, uintptr_t pc_patch_start, uintptr_t pc_patch_ goto fail; } if (ctx.continue_after_this_insn) - jump_dis_add_to_queue(&ctx, ctx.pc + ctx.op_size); + jump_dis_add_to_queue(&ctx, ctx.base.pc + ctx.base.op_size); /* get next address */ if (ctx.queue_read_off == ctx.queue_write_off) break; - ctx.pc = ctx.queue[ctx.queue_read_off]; + ctx.base.pc = ctx.queue[ctx.queue_read_off]; 
ctx.queue_read_off = (ctx.queue_read_off + 1) % ctx.queue_size; ctx.queue_count--; } @@ -170,5 +166,5 @@ fail: return ret; } -#include TARGET_DIS_HEADER +#include stringify(TARGET_DIR/dis-main.inc.h) #endif /* TARGET_DIS_SUPPORTED */ diff --git a/lib/jump-dis.h b/lib/jump-dis.h index 575a84d..fccd1a6 100644 --- a/lib/jump-dis.h +++ b/lib/jump-dis.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include stringify(TARGET_DIR/arch-dis.h) bool jump_dis_main(void *code_ptr, uintptr_t pc_patch_start, uintptr_t pc_patch_end, struct arch_dis_ctx initial_dis_ctx); diff --git a/lib/substitute-internal.h b/lib/substitute-internal.h index 17ad6ec..9a91516 100644 --- a/lib/substitute-internal.h +++ b/lib/substitute-internal.h @@ -49,12 +49,22 @@ typedef struct section section_x; #endif #if defined(TARGET_arm) - #include "arm/misc.h" + #define TARGET_DIR arm #elif defined(TARGET_arm64) - #include "arm64/misc.h" + #define TARGET_DIR arm64 #elif defined(TARGET_x86_64) || defined(TARGET_i386) - #include "x86/misc.h" + #define TARGET_DIR x86 #endif +#define stringify_(x) #x +#define stringify(x) stringify_(x) +#include stringify(TARGET_DIR/misc.h) + +#if TARGET_POINTER_SIZE == 8 + typedef uint64_t uint_tptr; +#elif TARGET_POINTER_SIZE == 4 + typedef uint32_t uint_tptr; +#endif + #ifdef __APPLE__ /* This could graduate to a public API but is not yet. 
Needs more diff --git a/lib/transform-dis.c b/lib/transform-dis.c index 867a981..8f89fb3 100644 --- a/lib/transform-dis.c +++ b/lib/transform-dis.c @@ -1,5 +1,6 @@ #include "substitute-internal.h" #ifdef TARGET_DIS_SUPPORTED +#define DIS_MAY_MODIFY 1 #include "substitute.h" #include "dis.h" @@ -13,21 +14,15 @@ struct transform_dis_ctx { /* outputs */ bool modify; int err; + struct dis_ctx_base base; - uintptr_t pc_patch_start; + uint_tptr pc_patch_start; /* this is only tentative - it will be updated to include parts of * instructions poking out, and instructions forced to be transformed by IT */ - uintptr_t pc_patch_end; - uintptr_t pc; - int op_size; - unsigned op; - unsigned newop; - unsigned newval[4]; - + uint_tptr pc_patch_end; /* for IT - eww */ bool force_keep_transforming; - const void *ptr; void **rewritten_ptr_ptr; void *write_newop_here; @@ -35,10 +30,6 @@ struct transform_dis_ctx { }; #define tdis_ctx struct transform_dis_ctx * -#define TDIS_CTX_MODIFY(ctx) ((ctx)->modify) -#define TDIS_CTX_NEWVAL(ctx, n) ((ctx)->newval[n]) -#define TDIS_CTX_NEWOP(ctx) ((ctx)->newop) -#define TDIS_CTX_SET_NEWOP(ctx, new) ((ctx)->newop = (new)) /* largely similar to jump_dis */ @@ -46,14 +37,14 @@ static INLINE UNUSED void transform_dis_ret(struct transform_dis_ctx *ctx) { /* ret is okay if it's at the end of the required patch (past the original * patch size is good too) */ - if (ctx->pc + ctx->op_size < ctx->pc_patch_end) + if (ctx->base.pc + ctx->base.op_size < ctx->pc_patch_end) ctx->err = SUBSTITUTE_ERR_FUNC_TOO_SHORT; } static INLINE UNUSED void transform_dis_unidentified(UNUSED struct transform_dis_ctx *ctx) { #ifdef TRANSFORM_DIS_VERBOSE - printf("transform_dis (%p): unidentified\n", (void *) ctx->pc); + printf("transform_dis (%p): unidentified\n", (void *) ctx->base.pc); #endif /* this isn't exhaustive, so unidentified is fine */ } @@ -74,15 +65,15 @@ static void transform_dis_post_dis(struct transform_dis_ctx *ctx); int transform_dis_main(const void *restrict 
code_ptr, void **restrict rewritten_ptr_ptr, - uintptr_t pc_patch_start, - uintptr_t *pc_patch_end_p, + uint_tptr pc_patch_start, + uint_tptr *pc_patch_end_p, struct arch_dis_ctx *arch_ctx_p, int *offset_by_pcdiff) { struct transform_dis_ctx ctx; memset(&ctx, 0, sizeof(ctx)); ctx.pc_patch_start = pc_patch_start; ctx.pc_patch_end = *pc_patch_end_p; - ctx.pc = pc_patch_start; + ctx.base.pc = pc_patch_start; ctx.arch = *arch_ctx_p; /* data is written to rewritten both by this function directly and, in case * additional scaffolding is needed, by arch-specific transform_dis_* */ @@ -90,10 +81,10 @@ int transform_dis_main(const void *restrict code_ptr, void *rewritten_start = *rewritten_ptr_ptr; int written_pcdiff = 0; offset_by_pcdiff[written_pcdiff++] = 0; - while (ctx.pc < ctx.pc_patch_end && !ctx.force_keep_transforming) { - ctx.modify = false; + while (ctx.base.pc < ctx.pc_patch_end && !ctx.force_keep_transforming) { + ctx.base.modify = false; ctx.err = 0; - ctx.ptr = code_ptr + (ctx.pc - pc_patch_start); + ctx.base.ptr = code_ptr + (ctx.base.pc - pc_patch_start); transform_dis_pre_dis(&ctx); @@ -105,33 +96,29 @@ int transform_dis_main(const void *restrict code_ptr, if (ctx.err) return ctx.err; if (ctx.write_newop_here != NULL) { - if (!ctx.modify) - ctx.newop = ctx.op; - if (ctx.op_size == 4) - *(uint32_t *) ctx.write_newop_here = ctx.newop; - else if (ctx.op_size == 2) - *(uint16_t *) ctx.write_newop_here = ctx.newop; + if (ctx.base.modify) + memcpy(ctx.write_newop_here, ctx.base.newop, ctx.base.newop_size); else - __builtin_abort(); + memcpy(ctx.write_newop_here, ctx.base.ptr, ctx.base.op_size); if (*rewritten_ptr_ptr == rewritten_ptr) - *rewritten_ptr_ptr += ctx.op_size; + *rewritten_ptr_ptr += ctx.base.op_size; } - ctx.pc += ctx.op_size; + ctx.base.pc += ctx.base.op_size; transform_dis_post_dis(&ctx); - int pcdiff = ctx.pc - ctx.pc_patch_start; + int pcdiff = ctx.base.pc - ctx.pc_patch_start; while (written_pcdiff < pcdiff) offset_by_pcdiff[written_pcdiff++] = 
-1; offset_by_pcdiff[written_pcdiff++] = (int) (*rewritten_ptr_ptr - rewritten_start); } - *pc_patch_end_p = ctx.pc; + *pc_patch_end_p = ctx.base.pc; *arch_ctx_p = ctx.arch; return SUBSTITUTE_OK; } -#include TARGET_TRANSFORM_DIS_HEADER -#include TARGET_DIS_HEADER +#include stringify(TARGET_DIR/arch-transform-dis.inc.h) +#include stringify(TARGET_DIR/dis-main.inc.h) #endif /* TARGET_DIS_SUPPORTED */ diff --git a/lib/transform-dis.h b/lib/transform-dis.h index 70fe57a..c1de937 100644 --- a/lib/transform-dis.h +++ b/lib/transform-dis.h @@ -1,10 +1,11 @@ #pragma once #include #include +#include stringify(TARGET_DIR/arch-dis.h) int transform_dis_main(const void *restrict code_ptr, void **restrict rewritten_ptr_ptr, - uintptr_t pc_patch_start, - uintptr_t *pc_patch_end_p, + uint_tptr pc_patch_start, + uint_tptr *pc_patch_end_p, struct arch_dis_ctx *arch_ctx_p, int *offset_by_pcdiff); diff --git a/lib/x86/arch-dis.h b/lib/x86/arch-dis.h new file mode 100644 index 0000000..6447f38 --- /dev/null +++ b/lib/x86/arch-dis.h @@ -0,0 +1,10 @@ +#pragma once +#define MIN_INSN_SIZE 1 +#define TD_MAX_REWRITTEN_SIZE 100 /* XXX */ + +struct arch_pcrel_info { + int reg; +}; + +struct arch_dis_ctx {}; +static inline void arch_dis_ctx_init(UNUSED struct arch_dis_ctx *ctx) {} diff --git a/lib/x86/arch-transform-dis.inc.h b/lib/x86/arch-transform-dis.inc.h new file mode 100644 index 0000000..bb86cf9 --- /dev/null +++ b/lib/x86/arch-transform-dis.inc.h @@ -0,0 +1,58 @@ +/* Pretty trivial, but in its own file to match the other architectures. */ +#include "x86/jump-patch.h" + +static void transform_dis_pcrel(struct transform_dis_ctx *ctx, uint64_t dpc, + struct arch_pcrel_info info) { + /* push %reg; mov $dpc, %reg; [original insn, addressing via (%reg) instead of the rip-relative operand]; pop %reg */ + /* reg is rcx, or rax if the instruction might be using rcx. */ + int rax = info.reg == 1; + void *code = *ctx->rewritten_ptr_ptr; + /* push */ + op8(&code, rax ? 0x50 : 0x51); + /* mov */ +#ifdef TARGET_x86_64 + op8(&code, 0x48); + op8(&code, rax ?
0xb8 : 0xb9); + op64(&code, dpc); +#else + op8(&code, rax ? 0xb8 : 0xb9); + op32(&code, dpc); +#endif + ctx->write_newop_here = code; + code += ctx->base.op_size; + /* pop */ + op8(&code, rax ? 0x58 : 0x59); + *ctx->rewritten_ptr_ptr = code; + ctx->base.newop[0] = rax ? 0 : 1; + ctx->base.modify = true; +} + +static void transform_dis_branch(struct transform_dis_ctx *ctx, uint_tptr dpc, + int cc) { + if (dpc >= ctx->pc_patch_start && dpc < ctx->pc_patch_end) { + ctx->err = SUBSTITUTE_ERR_FUNC_BAD_INSN_AT_START; + return; + } + void *code = *ctx->rewritten_ptr_ptr; + + ctx->write_newop_here = code; + code += ctx->base.op_size; + + struct arch_dis_ctx arch; + uintptr_t source = (uintptr_t) code + 2; + int size = jump_patch_size(source, dpc, arch, true); + /* if not taken, jmp past the big jump - this is a bit suboptimal but not that bad */ + op8(&code, 0xeb); + op8(&code, size); + make_jump_patch(&code, source, dpc, arch); + + *ctx->rewritten_ptr_ptr = code; + ctx->base.newop[0] = 2; + ctx->base.modify = true; + + if (!cc) + transform_dis_ret(ctx); +} + +static void transform_dis_pre_dis(UNUSED struct transform_dis_ctx *ctx) {} +static void transform_dis_post_dis(UNUSED struct transform_dis_ctx *ctx) {} diff --git a/lib/x86/dis-main.inc.h b/lib/x86/dis-main.inc.h new file mode 100644 index 0000000..45a0947 --- /dev/null +++ b/lib/x86/dis-main.inc.h @@ -0,0 +1,312 @@ +/* +random notes: + +REX: 0100wrxb + +prefixes REX opc ModR/M SIB displacement immediate + +1A/C: modrm stuff +i64: 32 only +o64: 64 only + +CDEGMNPQRSUVW: modrm +EMQW: modrm w/ address +IJO: immediate +L: 8-bit immediate + +VEX last byte 1:0: {none, 66, f3, f2} + +*/ + + +/* This is probably not the most efficient implementation, but hopefully good + * enough... 
*/ + +#define REP4(x) x, x, x, x +#define REP8(x) REP4(x), REP4(x) +#define REP16(x) REP8(x), REP8(x) +#define I_8 0x01 +#define I_16 0x02 +#define I_24 0x03 +#define I_32 0x04 +#define I_v 0x05 +#define I_z 0x06 +#define I_p 0x07 +#define I_IMM_MASK 0x07 +#define I_MOD 0x08 +#define I_ADDR 0x10 +#define I_MODA (I_MOD|I_ADDR) +/* mutually exclusive types */ +#define I_PFX 0x20 /* prefix */ +#define I_JMP 0x40 /* execution does not continue after this */ +#define I_SPEC 0x60 /* special case */ +#define I_TYPE_MASK 0x60 +#define I_JIMM_ONLY 0x80 /* imm is jump offset */ +#define I_JIMM (0x80|I_JMP) +#define I_BAD 0x80 +#ifdef TARGET_x86_64 +#define if64(_64, _32) _64 +#else +#define if64(_64, _32) _32 +#endif +#define i64(x) if64(I_BAD, x) +#define o64(x) if64(x, I_BAD) + +static const uint8_t onebyte_bits[] = { +/*00*/ REP4(I_MODA), I_8, I_z, i64(0), i64(0), REP4(I_MODA), I_8, I_z, i64(0), I_SPEC, +/*10*/ REP4(I_MODA), I_8, I_z, i64(0), i64(0), REP4(I_MODA), I_8, I_z, i64(0), i64(0), +/*20*/ REP4(I_MODA), I_8, I_z, I_PFX, i64(0), REP4(I_MODA), I_8, I_z, I_PFX, i64(0), +/*30*/ REP4(I_MODA), I_8, I_z, I_PFX, i64(0), REP4(I_MODA), I_8, I_z, I_PFX, i64(0), +/*40*/ REP16(if64(I_PFX, 0)), +/*50*/ REP16(0), +/*60*/ i64(0), i64(0), i64(I_MOD), I_MODA, I_PFX, I_PFX, I_PFX, I_PFX, + /*68*/ I_z, I_MODA|I_z, I_8, I_MODA|I_8, REP4(0), +/*70*/ REP16(I_8|I_JIMM), +/*80*/ I_MODA|I_8, I_MODA|I_v, i64(I_MODA|I_8), I_MODA, I_MODA, I_MODA, I_MODA, I_MODA, + /*88*/ REP4(I_MODA), I_MODA, I_MOD, I_MODA, if64(I_PFX, I_MODA), +/*90*/ REP8(0), 0, 0, i64(I_p), 0, 0, 0, 0, 0, +/*A0*/ I_8, I_v, I_8, I_v, REP4(0), I_8, I_z, 0, 0, 0, 0, 0, 0, +/*B0*/ REP8(I_8), REP8(I_v), +/*C0*/ I_MODA|I_8, I_MODA|I_8, I_16|I_JMP, I_JMP, + /*C4*/ if64(I_PFX, I_MODA), if64(I_PFX, I_MODA), I_MODA|I_8, I_MODA|I_8, + /*C8*/ I_24, 0, I_16|I_JMP, I_JMP, 0, I_8, i64(0), I_JMP, +/*D0*/ REP4(I_MODA), i64(I_8), i64(I_8), I_BAD, 0, REP8(I_SPEC), + /* don't treat ljmp as a jump for now */ +/*E0*/ REP4(I_8|I_JIMM), 
REP4(I_8), + /*E8*/ I_z|I_JIMM_ONLY, I_z|I_JIMM, i64(I_p), I_8|I_JIMM, 0, 0, 0, 0, +/*F0*/ I_PFX, I_BAD, I_PFX, I_PFX, 0, 0, I_MODA, I_MODA, + /*F8*/ 0, 0, 0, 0, 0, 0, I_MODA, I_SPEC, +}; +_Static_assert(sizeof(onebyte_bits) == 256, "onebyte_bits"); + +/* Note: + *All* currently defined 0f 38 opcodes are I_MODA. Assuming that any + unknown such opcodes are also I_MODA is probably better than generic + unknown. + Similarly, all defined 0f 3a opcodes are I_MODA|I_8. +*/ + +static const uint8_t _0f_bits[] = { +/*00*/ I_MODA, I_MODA, 0, 0, I_BAD, o64(0), 0, o64(0), + /*08*/ 0, 0, I_BAD, 0, 0, I_MODA, 0, 0, +/*10*/ REP8(I_MODA), I_MODA, I_BAD, I_BAD, I_BAD, I_BAD, I_BAD, I_BAD, I_MODA, +/*20*/ REP4(I_MOD), REP4(I_BAD), REP8(I_MODA), +/*30*/ 0, 0, 0, 0, 0, 0, I_BAD, 0, I_MODA, I_BAD, I_MODA|I_8, I_BAD, REP4(I_BAD), +/*40*/ REP16(I_MODA), +/*50*/ I_MOD, I_MODA, I_MODA, I_MODA, REP4(I_MODA), REP8(I_MODA), +/*60*/ REP16(I_MODA), +/*70*/ I_MODA, I_MOD|I_8, I_MOD|I_8, I_MOD|I_8, I_MODA, I_MODA, I_MODA, 0, + /*78*/ I_MODA, I_MODA, I_BAD, I_BAD, REP4(I_MODA), +/*80*/ REP16(I_z), +/*90*/ REP16(I_MODA), +/*Ax*/ 0, 0, 0, 0, 0, 0, I_BAD, I_BAD, + /*A8*/ 0, 0, 0, I_MODA, I_MODA|I_8, I_MODA, I_MODA, I_MODA, +/*B0*/ REP8(I_MODA), I_MODA, 0, I_MODA|I_8, I_MODA, REP4(I_MODA), +/*C0*/ I_MODA, I_MODA, I_MODA|I_8, I_MODA, I_MODA|I_8, I_MOD|I_8, I_MODA|I_8, I_MODA|I_z, + /*C8*/ REP8(0), +/*D0*/ REP4(I_MODA), I_MODA, I_MODA, I_MODA, I_MOD, REP8(I_MODA), +/*E0*/ REP16(I_MODA), +/*F0*/ REP4(I_MODA), I_MODA, I_MODA, I_MODA, I_MOD, + /*F8*/ REP4(I_MODA), I_MODA, I_MODA, I_MODA, I_BAD, +}; +_Static_assert(sizeof(_0f_bits) == 256, "_0f_bits"); + +static void P(dis)(tdis_ctx ctx) { + const uint8_t *orig = ctx->base.ptr; + const uint8_t *ptr = ctx->base.ptr; + + int opnd_size = 4; + int mod, rm = 0; +restart:; + uint8_t byte1 = *ptr++; + uint8_t bits = onebyte_bits[byte1]; + /* printf("b1=%x bytes=%x\n", byte1, bits); */ + if ((bits & I_TYPE_MASK) == I_SPEC) { + if (byte1 == 0x0f) { + uint8_t byte2 
= *ptr++; + bits = _0f_bits[byte2]; + } else if ((byte1 & 0xf8) == 0xd8) { + /* ESC */ + ptr++; + bits = I_MODA; + } else if (byte1 == 0xff) { + uint8_t modrm = *ptr; + if (modrm >> 6 == 3) { + int subop = modrm >> 3 & 7; + if (subop == 4 || subop == 5) /* JMP */ + bits = I_JMP | I_MODA; + else + bits = I_MODA; + } + } else { + __builtin_abort(); + } + } +got_bits: UNUSED + if (bits == I_BAD) + return P(bad)(ctx); + if ((bits & I_TYPE_MASK) == I_PFX) { + if (byte1 == 0x66) { + opnd_size = 2; + goto restart; +#ifdef TARGET_x86_64 + } else if ((byte1 & 0xf0) == 0x40) { /* REX */ + if (byte1 & 8) /* W */ + opnd_size = 8; + if (byte1 & 1) /* B */ + rm = 8; + goto restart; + } else if (byte1 == 0xc4) { /* VEX 3 */ + uint8_t byte2 = *ptr++; + if (!(byte2 & 0x20)) /* VEX.~B */ + rm = 8; + UNUSED uint8_t byte3 = *ptr++; + ptr++; + int map = byte2 & 0x1f; + switch (map) { + case 1: + bits = _0f_bits[byte2]; + break; + case 2: + bits = _0f_bits[0x38]; + break; + case 3: + bits = _0f_bits[0x3a]; + break; + default: + bits = I_BAD; + break; + } + goto got_bits; + } else if (byte1 == 0xc5) { /* VEX 2 */ + uint8_t byte2 = *ptr++; + bits = _0f_bits[byte2]; + goto got_bits; + } else if (byte1 == 0x8f) { /* XOP (AMD only) */ + uint8_t byte2 = *ptr; + /* could be modrm */ + if ((byte2 >> 3 & 7) == 0) + goto modrm; + ptr++; /* ok, definitely XOP */ + if (!(byte2 & 0x20)) /* VEX.~B */ + rm = 8; + int map = byte2 & 0x1f; + switch (map) { + case 8: + bits = I_MODA|I_8; + break; + case 9: + bits = I_MODA; + break; + case 10: + bits = I_MODA|I_32; + break; + default: + bits = I_BAD; + break; + } + goto got_bits; +#endif + } else { + /* other prefix we don't care about */ + goto restart; + } + } + UNUSED int modrm_off = ptr - orig; + UNUSED uint8_t modrm; + if (bits & I_MOD) { + modrm: UNUSED; + modrm = *ptr++; + mod = modrm >> 6; + rm |= modrm & 7; + if (rm == 4) { + /* sib */ + ptr++; + } + /* displacement */ +#ifdef TARGET_x86_64 + if (mod == 0 && rm == 5) + ptr += 4; +#endif + else if 
(mod == 1) + ptr++; + else if (mod == 2) + ptr += 4; + } + + int imm_off = ptr - orig; + + /* disp */ + int imm_bits = bits & I_IMM_MASK; + int imm_size; + if (imm_bits <= I_32) + imm_size = imm_bits; + else if (imm_bits == I_v) + imm_size = opnd_size; + else if (imm_bits == I_z) + imm_size = opnd_size == 2 ? 2 : 4; + else if (imm_bits == I_p) + imm_size = opnd_size == 2 ? 4 : 6; + else /* because GCC is stupid */ + __builtin_abort(); + ptr += imm_size; + + ctx->base.ptr = ptr; + ctx->base.newop_size = ctx->base.op_size = ptr - orig; + /* printf("bits=%x\n", bits); */ + + if (bits & I_JIMM_ONLY) { + int32_t imm; + const void *imm_ptr = orig + imm_off; + switch (imm_size) { + case 1: imm = *(int8_t *) imm_ptr; break; + case 2: imm = *(int16_t *) imm_ptr; break; + case 4: imm = *(int32_t *) imm_ptr; break; + default: __builtin_abort(); + } + + bool cond = (byte1 & 0xf0) != 0xe0; + bool call = !(bits & I_JMP); + P(branch)(ctx, ctx->base.pc + ctx->base.op_size + imm, + cond * CC_CONDITIONAL | call * CC_CALL); + if (DIS_MAY_MODIFY && ctx->base.modify) { + /* newval[0] should be the new immediate */ + int32_t new_imm = ctx->base.newval[0]; + uint8_t *new_op = ctx->base.newop; + memcpy(new_op, orig, ctx->base.op_size); + uint8_t *new_imm_ptr = new_op + imm_off; + switch (imm_size) { + case 1: *(int8_t *) new_imm_ptr = new_imm; break; + case 2: *(int16_t *) new_imm_ptr = new_imm; break; + case 4: *(int32_t *) new_imm_ptr = new_imm; break; + } + } +#ifdef TARGET_x86_64 + } else if ((bits & I_MODA) == I_MODA && mod == 0 && rm == 5) { + int32_t disp = *(int32_t *) (orig + modrm_off + 1); + /* unlike ARM, we can always switch to non-pcrel without making the + * instruction from scratch, so we don't have 'reg' and 'lm' */ + struct arch_pcrel_info info = {modrm >> 3 & 7}; + P(pcrel)(ctx, ctx->base.pc + ctx->base.op_size + disp, info); + if (DIS_MAY_MODIFY && ctx->base.modify) { + uint8_t *new_op = ctx->base.newop; + memcpy(new_op, orig, ctx->base.op_size); + /* newval[0] should 
be the new register, which should be one that + * fits in r/m directly since that's all I need; + * displacement is removed */ + uint8_t *new_modrm_ptr = new_op + modrm_off; + + *new_modrm_ptr = (*new_modrm_ptr & ~0xc7) | + 0 << 6 | + ctx->base.newval[0]; + memmove(new_modrm_ptr + 1, new_modrm_ptr + 5, + ctx->base.op_size - modrm_off - 1); + ctx->base.newop_size -= 4; + } +#endif + } else if ((bits & I_TYPE_MASK) == I_JMP) { + P(ret)(ctx); + } else { + P(unidentified)(ctx); + } +} diff --git a/lib/x86/dis-x86.inc.h b/lib/x86/dis-x86.inc.h deleted file mode 100644 index e0259ea..0000000 --- a/lib/x86/dis-x86.inc.h +++ /dev/null @@ -1,305 +0,0 @@ -/* -random notes: - -REX: 0100wrxb - -prefixes REX opc ModR/M SIB displacement immediate - -1A/C: modrm stuff -i64: 32 only -o64: 64 only - -CDEGMNPQRSUVW: modrm -EMQW: modrm w/ address -IJO: immediate -L: 8-bit immediate - -VEX last byte 1:0: {none, 66, f3, f2} - -*/ - - -/* This is probably not the most efficient implementation, but hopefully good - * enough... 
*/ - -#define REP4(x) x, x, x, x -#define REP8(x) REP4(x), REP4(x) -#define REP16(x) REP8(x), REP8(x) -#define I_8 0x01 -#define I_16 0x02 -#define I_24 0x03 -#define I_32 0x04 -#define I_v 0x05 -#define I_z 0x06 -#define I_p 0x07 -#define I_IMM_MASK 0x07 -#define I_MOD 0x08 -#define I_ADDR 0x10 -#define I_MODA (I_MOD|I_ADDR) -/* mutually exclusive types */ -#define I_PFX 0x20 /* prefix */ -#define I_JMP 0x40 /* execution does not continue after this */ -#define I_SPEC 0x60 /* special case */ -#define I_TYPE_MASK 0x60 -#define I_JIMM (0x80|I_JMP) /* imm is jump offset */ -#define I_BAD 0x80 -#ifdef TARGET_x86_64 -#define if64(_64, _32) _64 -#else -#define if64(_64, _32) _32 -#endif -#define i64(x) if64(I_BAD, x) -#define o64(x) if64(x, I_BAD) - -static const uint8_t onebyte_bits[] = { -/*00*/ REP4(I_MODA), I_8, I_z, i64(0), i64(0), REP4(I_MODA), I_8, I_z, i64(0), I_SPEC, -/*10*/ REP4(I_MODA), I_8, I_z, i64(0), i64(0), REP4(I_MODA), I_8, I_z, i64(0), i64(0), -/*20*/ REP4(I_MODA), I_8, I_z, I_PFX, i64(0), REP4(I_MODA), I_8, I_z, I_PFX, i64(0), -/*30*/ REP4(I_MODA), I_8, I_z, I_PFX, i64(0), REP4(I_MODA), I_8, I_z, I_PFX, i64(0), -/*40*/ REP16(if64(I_PFX, 0)), -/*50*/ REP16(0), -/*60*/ i64(0), i64(0), i64(I_MOD), I_MODA, I_PFX, I_PFX, I_PFX, I_PFX, - /*68*/ I_z, I_MODA|I_z, I_8, I_MODA|I_8, REP4(0), -/*70*/ REP16(I_8|I_JIMM), -/*80*/ I_MODA|I_8, I_MODA|I_v, i64(I_MODA|I_8), I_MODA, I_MODA, I_MODA, I_MODA, I_MODA, - /*88*/ REP4(I_MODA), I_MODA, I_MOD, I_MODA, if64(I_PFX, I_MODA), -/*90*/ REP8(0), 0, 0, i64(I_p), 0, 0, 0, 0, 0, -/*A0*/ I_8, I_v, I_8, I_v, REP4(0), I_8, I_z, 0, 0, 0, 0, 0, 0, -/*B0*/ REP8(I_8), REP8(I_v), -/*C0*/ I_MODA|I_8, I_MODA|I_8, I_16|I_JMP, I_JMP, - /*C4*/ if64(I_PFX, I_MODA), if64(I_PFX, I_MODA), I_MODA|I_8, I_MODA|I_8, - /*C8*/ I_24, 0, I_16|I_JMP, I_JMP, 0, I_8, i64(0), I_JMP, -/*D0*/ REP4(I_MODA), i64(I_8), i64(I_8), I_BAD, 0, REP8(I_SPEC), - /* don't treat ljmp as a jump for now */ -/*E0*/ REP4(I_8|I_JIMM), REP4(I_8), - /*E8*/ 
(I_z|I_JIMM)&~I_JMP, I_z|I_JIMM, i64(I_p), I_8|I_JIMM, 0, 0, 0, 0, -/*F0*/ I_PFX, I_BAD, I_PFX, I_PFX, 0, 0, I_MODA, I_MODA, - /*F8*/ 0, 0, 0, 0, 0, 0, I_MODA, I_SPEC, -}; -_Static_assert(sizeof(onebyte_bits) == 256, "onebyte_bits"); - -/* Note: - *All* currently defined 0f 38 opcodes are I_MODA. Assuming that any - unknown such opcodes are also I_MODA is probably better than generic - unknown. - Similarly, all defined 0f 3a opcodes are I_MODA|I_8. -*/ - -static const uint8_t _0f_bits[] = { -/*00*/ I_MODA, I_MODA, 0, 0, I_BAD, o64(0), 0, o64(0), - /*08*/ 0, 0, I_BAD, 0, 0, I_MODA, 0, 0, -/*10*/ REP8(I_MODA), I_MODA, I_BAD, I_BAD, I_BAD, I_BAD, I_BAD, I_BAD, I_MODA, -/*20*/ REP4(I_MOD), REP4(I_BAD), REP8(I_MODA), -/*30*/ 0, 0, 0, 0, 0, 0, I_BAD, 0, I_MODA, I_BAD, I_MODA|I_8, I_BAD, REP4(I_BAD), -/*40*/ REP16(I_MODA), -/*50*/ I_MOD, I_MODA, I_MODA, I_MODA, REP4(I_MODA), REP8(I_MODA), -/*60*/ REP16(I_MODA), -/*70*/ I_MODA, I_MOD|I_8, I_MOD|I_8, I_MOD|I_8, I_MODA, I_MODA, I_MODA, 0, - /*78*/ I_MODA, I_MODA, I_BAD, I_BAD, REP4(I_MODA), -/*80*/ REP16(I_z), -/*90*/ REP16(I_MODA), -/*Ax*/ 0, 0, 0, 0, 0, 0, I_BAD, I_BAD, - /*A8*/ 0, 0, 0, I_MODA, I_MODA|I_8, I_MODA, I_MODA, I_MODA, -/*B0*/ REP8(I_MODA), I_MODA, 0, I_MODA|I_8, I_MODA, REP4(I_MODA), -/*C0*/ I_MODA, I_MODA, I_MODA|I_8, I_MODA, I_MODA|I_8, I_MOD|I_8, I_MODA|I_8, I_MODA|I_z, - /*C8*/ REP8(0), -/*D0*/ REP4(I_MODA), I_MODA, I_MODA, I_MODA, I_MOD, REP8(I_MODA), -/*E0*/ REP16(I_MODA), -/*F0*/ REP4(I_MODA), I_MODA, I_MODA, I_MODA, I_MOD, - /*F8*/ REP4(I_MODA), I_MODA, I_MODA, I_MODA, I_BAD, -}; -_Static_assert(sizeof(_0f_bits) == 256, "_0f_bits"); - -static void P(dis)(tdis_ctx ctx) { - const uint8_t *orig = ctx->ptr; - const uint8_t *ptr = ctx->ptr; - - int opnd_size = 4; - int mod, rm = 0; -restart:; - uint8_t byte1 = *ptr++; - uint8_t bits = onebyte_bits[byte1]; - /* printf("b1=%x bytes=%x\n", byte1, bits); */ - if ((bits & I_TYPE_MASK) == I_SPEC) { - if (byte1 == 0x0f) { - uint8_t byte2 = *ptr++; - bits = 
_0f_bits[byte2]; - } else if ((byte1 & 0xf8) == 0xd8) { - /* ESC */ - ptr++; - bits = I_MODA; - } else if (byte1 == 0xff) { - uint8_t modrm = *ptr; - if (modrm >> 6 == 3) { - int subop = modrm >> 3 & 7; - if (subop == 4 || subop == 5) /* JMP */ - bits = I_JMP | I_MODA; - else - bits = I_MODA; - } - } else { - __builtin_abort(); - } - } -got_bits: UNUSED - if (bits == I_BAD) - return P(bad)(ctx); - if ((bits & I_TYPE_MASK) == I_PFX) { - if (byte1 == 0x66) { - opnd_size = 2; - goto restart; -#ifdef TARGET_x86_64 - } else if ((byte1 & 0xf0) == 0x40) { /* REX */ - if (byte1 & 8) /* W */ - opnd_size = 8; - if (byte1 & 1) /* B */ - rm = 8; - goto restart; - } else if (byte1 == 0xc4) { /* VEX 3 */ - uint8_t byte2 = *ptr++; - if (!(byte2 & 0x20)) /* VEX.~B */ - rm = 8; - UNUSED uint8_t byte3 = *ptr++; - ptr++; - int map = byte2 & 0x1f; - switch (map) { - case 1: - bits = _0f_bits[byte2]; - break; - case 2: - bits = _0f_bits[0x38]; - break; - case 3: - bits = _0f_bits[0x3a]; - break; - default: - bits = I_BAD; - break; - } - goto got_bits; - } else if (byte1 == 0xc5) { /* VEX 2 */ - uint8_t byte2 = *ptr++; - bits = _0f_bits[byte2]; - goto got_bits; - } else if (byte1 == 0x8f) { /* XOP (AMD only) */ - uint8_t byte2 = *ptr; - /* could be modrm */ - if ((byte2 >> 3 & 7) == 0) - goto modrm; - ptr++; /* ok, definitely XOP */ - if (!(byte2 & 0x20)) /* VEX.~B */ - rm = 8; - int map = byte2 & 0x1f; - switch (map) { - case 8: - bits = I_MODA|I_8; - break; - case 9: - bits = I_MODA; - break; - case 10: - bits = I_MODA|I_32; - break; - default: - bits = I_BAD; - break; - } - goto got_bits; -#endif - } else { - /* other prefix we don't care about */ - goto restart; - } - } - UNUSED int modrm_off = ptr - orig; - if (bits & I_MOD) { - modrm: UNUSED; - uint8_t modrm = *ptr++; - mod = modrm >> 6; - rm |= modrm & 7; - if (rm == 4) { - /* sib */ - ptr++; - } - /* displacement */ -#ifdef TARGET_x86_64 - if (mod == 0 && rm == 5) - ptr += 4; -#endif - else if (mod == 1) - ptr++; - else if (mod 
== 2) - ptr += 4; - } - - int imm_off = ptr - orig; - - /* disp */ - int imm_bits = bits & I_IMM_MASK; - int imm_size; - if (imm_bits <= I_32) - imm_size = imm_bits; - else if (imm_bits == I_v) - imm_size = opnd_size; - else if (imm_bits == I_z) - imm_size = opnd_size == 2 ? 2 : 4; - else if (imm_bits == I_p) - imm_size = opnd_size == 2 ? 4 : 6; - else /* because GCC is stupid */ - __builtin_abort(); - ptr += imm_size; - - ctx->ptr = ptr; - ctx->op_size = ptr - orig; - /* printf("bits=%x\n", bits); */ - - if ((bits & I_JIMM) == I_JIMM) { - int32_t imm; - const void *imm_ptr = orig + imm_off; - switch (imm_size) { - case 1: imm = *(int8_t *) imm_ptr; break; - case 2: imm = *(int16_t *) imm_ptr; break; - case 4: imm = *(int32_t *) imm_ptr; break; - default: __builtin_abort(); - } - - bool cond = (byte1 & 0xf0) != 0xe0; - bool call = !(bits & I_JMP); - P(branch)(ctx, ctx->pc + ctx->op_size + imm, - cond * CC_CONDITIONAL | call * CC_CALL); - if (TDIS_CTX_MODIFY(ctx)) { - /* newval[0] should be the new immediate */ - int32_t new_imm = TDIS_CTX_NEWVAL(ctx, 0); - uint8_t *new_op = TDIS_CTX_NEWOP(ctx); - memcpy(new_op, orig, ctx->op_size); - uint8_t *new_imm_ptr = new_op + imm_off; - switch (imm_size) { - case 1: *(int8_t *) new_imm_ptr = new_imm; break; - case 2: *(int16_t *) new_imm_ptr = new_imm; break; - case 4: *(int32_t *) new_imm_ptr = new_imm; break; - } - } -#ifdef TARGET_x86_64 - } else if ((bits & I_MODA) == I_MODA && mod == 0 && rm == 5) { - int32_t disp = *(int32_t *) (orig + modrm_off + 1); - /* unlike ARM, we can always switch to non-pcrel without making the - * instruction from scratch, so we don't have 'reg' and 'lm' */ - P(pcrel)(ctx, ctx->pc + ctx->op_size + disp); - if (TDIS_CTX_MODIFY(ctx)) { - uint8_t *new_op = TDIS_CTX_NEWOP(ctx); - memcpy(new_op, orig, ctx->op_size); - /* newval[0] should be the new register, which should be one that - * fits in r/m directly since that's all I need; - * newval[1] should be the new displacement */ - uint8_t 
*new_modrm_ptr = new_op + modrm_off; - - *new_modrm_ptr = (*new_modrm_ptr & ~0xc7) | 4 << 6 | TDIS_CTX_NEWVAL(ctx, 0); - *(uint32_t *) (new_modrm_ptr + 1) = TDIS_CTX_NEWVAL(ctx, 1); - } -#endif - } else if ((bits & I_TYPE_MASK) == I_JMP) { - P(ret)(ctx); - } else { - P(unidentified)(ctx); - } -} diff --git a/lib/x86/jump-patch.h b/lib/x86/jump-patch.h index efd4825..4c0172d 100644 --- a/lib/x86/jump-patch.h +++ b/lib/x86/jump-patch.h @@ -1,5 +1,6 @@ #pragma once #define MAX_JUMP_PATCH_SIZE 5 +#include "dis.h" static inline int jump_patch_size(uintptr_t pc, uintptr_t dpc, UNUSED struct arch_dis_ctx arch, @@ -12,21 +13,19 @@ static inline int jump_patch_size(uintptr_t pc, uintptr_t dpc, return force ? (2+4+8) : -1; } -static inline void make_jump_patch(void **codep, UNUSED uintptr_t pc, - uintptr_t dpc, +static inline void make_jump_patch(void **codep, uintptr_t pc, uintptr_t dpc, UNUSED struct arch_dis_ctx arch) { uintptr_t diff = pc - (dpc + 5); - uint8_t *code = *codep; + void *code = *codep; if (diff == (uintptr_t) (int32_t) diff) { - *(uint8_t *) code = 0xe9; - *(uint32_t *) (code + 1) = diff; - *codep = code + 5; + op8(&code, 0xe9); + op32(&code, diff); } else { /* jmpq *(%rip) */ - *code++ = 0xff; - *code++ = 0x25; - *(uint32_t *) code = 0; code += 4; - *(uint64_t *) code = dpc; code += 8; - *codep = code; + op8(&code, 0xff); + op8(&code, 0x25); + op32(&code, 0); + op64(&code, dpc); } + *codep = code; } diff --git a/lib/x86/misc.h b/lib/x86/misc.h index c8eee19..e04f1f4 100644 --- a/lib/x86/misc.h +++ b/lib/x86/misc.h @@ -1,9 +1,7 @@ #pragma once +#ifdef TARGET_x86_64 +#define TARGET_POINTER_SIZE 8 +#else +#define TARGET_POINTER_SIZE 4 +#endif #define TARGET_DIS_SUPPORTED -#define TARGET_DIS_HEADER "x86/dis-x86.inc.h" -#define TARGET_JUMP_PATCH_HDR "x86/jump-patch.h" -#define MIN_INSN_SIZE 1 -#define TD_MAX_REWRITTEN_SIZE 100 /* XXX */ - -struct arch_dis_ctx {}; -static inline void arch_dis_ctx_init(UNUSED struct arch_dis_ctx *ctx) {} -- cgit v1.2.3