#!/usr/bin/env python
#
# arm-as-to-ios    Modify ARM assembly code for the iOS assembler
#
# Copyright (c) 2012 Psellos   http://psellos.com/
# Licensed under the MIT License:
#     http://www.opensource.org/licenses/mit-license.php
#
# Resources for running OCaml on iOS: http://psellos.com/ocaml/
#
import sys
import re

VERSION = '1.4.0'

# Character classes for expression lexing.
#
g_ccid0 = '[$.A-Z_a-z\x80-\xff]'    # Beginning of id
g_ccid = '[$.0-9A-Z_a-z\x80-\xff]'  # Later in id

def ccc(cc):
    # Complement the class
    if cc[1] == '^':
        return cc[0] + cc[2:]
    return cc[0] + '^' + cc[1:]

def ccce(cc):
    # Complement the class, include EOL
    return '(?:' + ccc(cc) + '|$)'

# Prefixes for pooled symbol labels and jump table base labels. They're
# in the space of Linux assembler local symbols. Later rules will
# modify them to the Loc() form.
#
g_poolpfx = '.LP'
g_basepfx = '.LB'

def exists(p, l):
    for l1 in l:
        if p(l1):
            return True
    return False

def forall(p, l):
    for l1 in l:
        if not p(l1):
            return False
    return True

def add_prefix(instrs):
    # Add compatibility macros for all systems, plus hardware
    # definitions and compatibility macros for iOS.
    #
    # All systems:
    #
    #     Glo()      cpp macro for making global symbols (xxx vs _xxx)
    #     Loc()      cpp macro for making local symbols (.Lxxx vs Lxxx)
    #     .funtype   Expands to .thumb_func for iOS armv7 (null for armv6)
    #                Expands to .type %function for others
    #
    # iOS:
    #
    #     .machine armv6/armv7
    #     .thumb (for armv7)
    #     cbz        Expands to cmp/beq for armv6 (Thumb-only instr)
    #     .type      Not supported by Apple assembler
    #     .size      Not supported by Apple assembler
    #
    defre = '#[ \t]*if.*def.*SYS'  # Add new defs near first existing ones
    skipre = '$|\.syntax[ \t]'     # Skip comment lines (and .syntax)
    for i in range(len(instrs)):
        if re.match(defre, instrs[i][1]):
            break
    else:
        i = 0
    for i in range(i, len(instrs)):
        if not re.match(skipre, instrs[i][1]):
            break
    instrs[i:0] = [
        ('', '', '\n'),
        ('/* Apple compatibility macros */', '', '\n'),
        ('', '#if defined(SYS_macosx)', '\n'),
        ('', '#define Glo(s) _##s', '\n'),
        ('', '#define Loc(s) L##s', '\n'),
        ('', '#if defined(MODEL_armv6)', '\n'),
        (' ', '.machine armv6', '\n'),
        (' ', '.macro .funtype', '\n'),
        (' ', '.endm', '\n'),
        (' ', '.macro cbz', '\n'),
        (' ', 'cmp $0, #0', '\n'),
        (' ', 'beq $1', '\n'),
        (' ', '.endm', '\n'),
        ('', '#else', '\n'),
        (' ', '.machine armv7', '\n'),
        (' ', '.thumb', '\n'),
        (' ', '.macro .funtype', '\n'),
        (' ', '.thumb_func $0', '\n'),
        (' ', '.endm', '\n'),
        ('', '#endif', '\n'),
        (' ', '.macro .type', '\n'),
        (' ', '.endm', '\n'),
        (' ', '.macro .size', '\n'),
        (' ', '.endm', '\n'),
        ('', '#else', '\n'),
        ('', '#define Glo(s) s', '\n'),
        ('', '#define Loc(s) .L##s', '\n'),
        (' ', '.macro .funtype symbol', '\n'),
        (' ', '.type \\symbol, %function', '\n'),
        (' ', '.endm', '\n'),
        ('', '#endif', '\n'),
        ('/* End Apple compatibility macros */', '', '\n'),
        ('', '', '\n')
    ]
    return instrs

# Regular expression for modified ldr lines
#
g_ldre = '(ldr[ \t][^,]*,[ \t]*)=(([^ \t\n@,/]|/(?!\*))*)(.*)'

def explicit_address_loads(instrs):
    # Linux assemblers allow the following:
    #
    #     ldr rM, =symbol
    #
    # which loads rM with [mov] (immediately) if possible, or creates an
    # entry in memory for the symbol value and loads it PC-relatively
    # with [ldr].
    #
    # The Apple assembler doesn't seem to support this notation. If the
    # value is a suitable constant, it emits a valid [mov]. Otherwise
    # it seems to emit an invalid [ldr] that always generates an error.
    # (At least I have not been able to make it work). So, change uses
    # of =symbol to explicit PC-relative loads.
    #
    # This requires a pool containing the addresses to be loaded. For
    # now, we just keep track of it ourselves and emit it into the text
    # segment at the end of the file.
    #
    syms = {}
    result = []

    def repl1((syms, result), (a, b, c)):
        global g_poolpfx
        global g_ldre
        (b1, b2, b3) = parse_iparts(b)
        mo = re.match(g_ldre, b3, re.DOTALL)
        if mo:
            if mo.group(2) not in syms:
                syms[mo.group(2)] = len(syms)
            psym = mo.group(2)
            if psym[0:2] == '.L':
                psym = psym[2:]
            newb3 = mo.group(1) + g_poolpfx + psym + mo.group(4)
            result.append((a, b1 + b2 + newb3, c))
        else:
            result.append((a, b, c))
        return (syms, result)

    def pool1(result, s):
        global g_poolpfx
        psym = s
        if psym[0:2] == '.L':
            psym = psym[2:]
        result.append(('', g_poolpfx + psym + ':', '\n'))
        result.append((' ', '.long ' + s, '\n'))
        return result

    reduce(repl1, instrs, (syms, result))
    if len(syms) > 0:
        result.append(('', '', '\n'))
        result.append(('/* Pool of addresses loaded into registers */', '', '\n'))
        result.append(('', '', '\n'))
        result.append((' ', '.text', '\n'))
        result.append((' ', '.align 2', '\n'))
        reduce(pool1, sorted(syms, key=syms.get), result)
    return result
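
# For illustration (the symbol name below is hypothetical),
# explicit_address_loads() rewrites
#
#     ldr r3, =caml_young_ptr
#
# as a PC-relative load from a pooled address,
#
#     ldr r3, .LPcaml_young_ptr
#
# and emits at the end of the text segment
#
# .LPcaml_young_ptr:
#     .long caml_young_ptr
#
# Later rules convert the .LP label to the Loc() form, as noted above.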

def global_symbols(instrs):
    # The form of a global symbol differs between Linux assemblers and
    # the Apple assembler:
    #
    #     Linux:  xxx
    #     Apple:  _xxx
    #
    # Change occurrences of global symbols to use the Glo() cpp macro
    # defined in our prefix.
    #
    # We consider a symbol to be global if:
    #
    #     a. It appears in a .globl declaration; or
    #     b. It is referenced, has global form, and is not defined
    #
    glosyms = set()
    refsyms = set()
    defsyms = set()
    result = []

    def findglo1 (glosyms, (a, b, c)):
        if re.match('#', b):
            # Preprocessor line; nothing to do
            return glosyms
        (b1, b2, b3) = parse_iparts(b)
        mo = re.match('(\.globl)' + ccce(g_ccid), b3)
        if mo:
            tokens = parse_expr(b3[len(mo.group(1)):])
            if forall(lambda t: token_type(t) in ['space', 'id', ','], tokens):
                for t in tokens:
                    if token_type(t) == 'id':
                        glosyms.add(t)
        return glosyms

    def findref1 ((refsyms, skipct), (a, b, c)):
        def looksglobal(s):
            if re.match('(r|a|v|p|c|cr|f|s|d|q|mvax|wcgr)[0-9]+$', s, re.I):
                return False   # numbered registers
            if re.match('(wr|sb|sl|fp|ip|sp|lr|pc)$', s, re.I):
                return False   # named registers
            if re.match('(fpsid|fpscr|fpexc|mvfr1|mvfr0)$', s, re.I):
                return False   # more named registers
            if re.match('(mvf|mvd|mvfx|mvdx|dspsc)$', s, re.I):
                return False   # even more named registers
            if re.match('(wcid|wcon|wcssf|wcasf|acc)$', s, re.I):
                return False   # even more named registers
            if re.match('\.$|\.L|[0-9]|#', s):
                return False   # dot, local symbol, or number
            if re.match('(asl|lsl|lsr|asr|ror|rrx)$', s, re.I):
                return False   # shift names
            return True

        if re.match('#', b):
            # Preprocessor line; nothing to do
            return (refsyms, skipct)
        # Track nesting of .macro/.endm. For now, we don't look for
        # global syms in macro defs. (Avoiding scoping probs etc.)
        #
        if skipct > 0 and re.match('\.(endm|endmacro)' + ccce(g_ccid), b):
            return (refsyms, skipct - 1)
        if re.match('\.macro' + ccce(g_ccid), b):
            return (refsyms, skipct + 1)
        if skipct > 0:
            return (refsyms, skipct)
        if re.match('\.(type|size|syntax|arch|fpu)' + ccce(g_ccid), b):
            return (refsyms, skipct)
        (b1, b2, b3) = parse_iparts(b)
        rtokens = parse_rexpr(b3)
        if len(rtokens) > 1 and rtokens[1] == '.req':
            # .req has atypical syntax; no symbol refs there anyway
            return (refsyms, skipct)
        for t in rtokens[1:]:
            if token_type(t) == 'id' and looksglobal(t):
                refsyms.add(t)
        return (refsyms, skipct)

    def finddef1(defsyms, (a, b, c)):
        if re.match('#', b):
            # Preprocessor line
            return defsyms
        (b1, b2, b3) = parse_iparts(b)
        rtokens = parse_rexpr(b3)
        if b1 != '':
            defsyms.add(b1)
        if len(rtokens) > 1 and rtokens[1] == '.req':
            defsyms.add(rtokens[0])
        return defsyms

    def repl1((glosyms, result), (a, b, c)):
        if re.match('#', b):
            # Preprocessor line
            result.append((a, b, c))
            return (glosyms, result)
        toglo = lambda s: 'Glo(' + s + ')'
        (b1, b2, b3) = parse_iparts(b)
        tokens = parse_expr(b3)
        if b1 in glosyms:
            b1 = toglo(b1)
        for i in range(len(tokens)):
            if token_type(tokens[i]) == 'id' and tokens[i] in glosyms:
                tokens[i] = toglo(tokens[i])
        result.append((a, b1 + b2 + ''.join(tokens), c))
        return (glosyms, result)

    reduce(findglo1, instrs, glosyms)
    reduce(findref1, instrs, (refsyms, 0))
    reduce(finddef1, instrs, defsyms)
    glosyms |= (refsyms - defsyms)
    reduce(repl1, instrs, (glosyms, result))
    return result
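
# For illustration (the symbol name is hypothetical): if caml_allocN
# appears in a .globl declaration, or is referenced but never defined
# here, then
#
#     bl caml_allocN
#
# becomes
#
#     bl Glo(caml_allocN)
#
# which the cpp prefix expands to _caml_allocN on iOS and leaves as
# caml_allocN elsewhere.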

def local_symbols(instrs):
    # The form of a local symbol differs between Linux assemblers and
    # the Apple assembler:
    #
    #     Linux:  .Lxxx
    #     Apple:  Lxxx
    #
    # Change occurrences of local symbols to use the Loc() cpp macro
    # defined in our prefix.
    #
    lsyms = set()
    result = []

    def find1 (lsyms, (a, b, c)):
        mo = re.match('(\.L[^ \t:]*)[ \t]*:', b)
        if mo:
            lsyms.add(mo.group(1))
        return lsyms

    def repl1((lsyms, result), (a, b, c)):
        matches = list(re.finditer('\.L[^ \t@:,+*/\-()]+', b))
        if matches != []:
            matches.reverse()
            newb = b
            for mo in matches:
                if mo.group() in lsyms:
                    newb = newb[0:mo.start()] + \
                           'Loc(' + mo.group()[2:] + ')' + \
                           newb[mo.end():]
            result.append((a, newb, c))
        else:
            result.append((a, b, c))
        return (lsyms, result)

    reduce(find1, instrs, lsyms)
    reduce(repl1, instrs, (lsyms, result))
    return result
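
# For illustration (the label is hypothetical): a local label defined as
# .L102 is rewritten wherever it appears, so
#
# .L102:
#     ...
#     beq .L102
#
# becomes
#
# Loc(102):
#     ...
#     beq Loc(102)
#
# which the cpp prefix expands to L102 on iOS and .L102 elsewhere.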

def funtypes(instrs):
    # Linux assemblers accept declarations like this:
    #
    #     .type symbol, %function
    #
    # For Thumb functions, the Apple assembler wants to see:
    #
    #     .thumb_func symbol
    #
    # Handle this by converting declarations to this:
    #
    #     .funtype symbol
    #
    # Our prefix defines an appropriate .funtype macro for each
    # environment.
    #
    result = []

    def repl1(result, (a, b, c)):
        mo = re.match('.type[ \t]+([^ \t,]*),[ \t]*%function', b)
        if mo:
            result.append((a, '.funtype ' + mo.group(1), c))
        else:
            result.append((a, b, c))
        return result

    reduce(repl1, instrs, result)
    return result

def jump_tables(instrs):
    # Jump tables for Linux assemblers often look like this:
    #
    #     tbh [pc, rM, lsl #1]
    #     .short (.Labc-.)/2+0
    #     .short (.Ldef-.)/2+1
    #     .short (.Lghi-.)/2+2
    #
    # The Apple assembler disagrees about the meaning of this code,
    # producing jump tables that don't work. Convert to the following:
    #
    #     tbh [pc, rM, lsl #1]
    # .LBxxx:
    #     .short (.Labc-.LBxxx)/2
    #     .short (.Ldef-.LBxxx)/2
    #     .short (.Lghi-.LBxxx)/2
    #
    # In fact we just convert sequences of .short pseudo-ops of the
    # right form. There's no requirement that they follow a tbh
    # instruction.
    #
    baselabs = []
    result = []

    def short_match(seq, op):
        # Determine whether the op is a .short of the form that needs to
        # be converted: .short (symbol-.)/2+k. If so, return a pair
        # containing the symbol and the value of k. If not, return
        # None. The short can only be converted if there were at least
        # k other .shorts in sequence before the current one. A summary
        # of the previous .shorts is in seq.
        #
        # (A real parser would do a better job, but this was quick to
        # get working.)
        #
        sp = '([ \t]|/\*.*?\*/)*'            # space
        sp1 = '([ \t]|/\*.*?\*/)+'           # at least 1 space
        spe = '([ \t]|/\*.*?\*/|@[^\n]*)*$'  # end-of-instr space
        expr_re0 = (
            '\.short' + sp + '\(' + sp +        # .short (
            '([^ \t+\-*/@()]+)' + sp +          # symbol
            '-' + sp + '\.' + sp + '\)' + sp +  # -.)
            '/' + sp + '2' + spe                # /2 END
        )
        expr_re1 = (
            '\.short' + sp + '\(' + sp +        # .short (
            '([^ \t+\-*/@()]+)' + sp +          # symbol
            '-' + sp + '\.' + sp + '\)' + sp +  # -.)
            '/' + sp + '2' + sp +               # /2
            '\+' + sp +                         # +
            '((0[xX])?[0-9]+)' + spe            # k END
        )
        expr_re2 = (
            '\.short' + sp1 +                   # .short
            '((0[xX])?[0-9]+)' + sp +           # k
            '\+' + sp + '\(' + sp +             # +(
            '([^ \t+\-*/@()]+)' + sp +          # symbol
            '-' + sp + '\.' + sp + '\)' + sp +  # -.)
            '/' + sp + '2' + spe                # /2 END
        )
        mo = re.match(expr_re0, op)
        if mo:
            return(mo.group(3), 0)
        mo = re.match(expr_re1, op)
        if mo:
            k = int(mo.group(11), 0)
            if k > len(seq):
                return None
            return (mo.group(3), k)
        mo = re.match(expr_re2, op)
        if mo:
            k = int(mo.group(2), 0)
            if k > len(seq):
                return None
            return (mo.group(7), k)
        return None

    def conv1 ((baselabs, shortseq, label, result), (a, b, c)):
        # Convert current instr (a,b,c) if it's a .short of the right
        # form that spans a previous sequence of .shorts.
        #
        (b1, b2, b3) = parse_iparts(b)
        if b3 == '':
            # No operation: just note label if present.
            result.append((a, b, c))
            if re.match('\.L.', b1):
                return (baselabs, shortseq, b1, result)
            return (baselabs, shortseq, label, result)
        if not re.match('.short[ \t]+[^ \t@]', b3):
            # Not a .short: clear shortseq and label
            result.append((a, b, c))
            return (baselabs, [], '', result)
        # We have a .short: figure out the label if any
        if re.match('\.L', b1):
            sl = b1
        else:
            sl = label
        mpair = short_match(shortseq, b3)
        if not mpair:
            # A .short, but not of right form
            shortseq.append((len(result), sl))
            result.append((a, b, c))
            return (baselabs, shortseq, '', result)
        # OK, we have a .short to convert!
        (sym, k) = mpair
        shortseq.append((len(result), sl))
        # Figure out base label (create one if necessary).
        bx = len(shortseq) - 1 - k
        bl = shortseq[bx][1]
        if bl == '':
            bl = g_basepfx + str(shortseq[bx][0])
            shortseq[bx] = (shortseq[bx][0], bl)
            baselabs.append(shortseq[bx])
        op = '.short\t(' + sym + '-' + bl + ')/2'
        result.append ((a, b1 + b2 + op, c))
        return (baselabs, shortseq, '', result)

    # Convert, accumulate result and new labels.
    reduce(conv1, instrs, (baselabs, [], '', result))

    # Add labels created here to the instruction stream.
    baselabs.reverse()
    for (ix, lab) in baselabs:
        result[ix:0] = [('', lab + ':', '\n')]

    # That does it
    return result
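
# Note, for illustration: short_match() accepts the constant on either
# side of the expression, so (with a hypothetical label) both
#
#     .short (.Labc-.)/2+2
#     .short 2+(.Labc-.)/2
#
# are recognized and converted to the (.Labc-.LBxxx)/2 form shown in
# the comment above, provided at least two other .shorts come earlier
# in the same sequence.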

def dot_relative(instrs):
    # The Apple assembler (or possibly the linker) has trouble with code
    # that looks like this:
    #
    #     .word .Label - . + 0x80000000
    #     .word 0x1966
    # .Label:
    #     .word 0x1967
    #
    # One way to describe the problem is that the assembler marks the
    # first .word for relocation when in fact it's an assembly-time
    # constant. Translate to the following form, which doesn't generate
    # a relocation marking:
    #
    # DR0 = .Label - . + 0x80000000
    #     .word DR0
    #     .word 0x1966
    # .Label:
    #     .word 0x1967
    #
    prefix = 'DR'
    pseudos = '(\.byte|\.short|\.word|\.long|\.quad)'
    result = []

    def tok_ok(t):
        return t in ['.', '+', '-', '(', ')'] or \
               token_type(t) in ['space', 'locid', 'number']

    def dotrel_match(expr):
        # Determine whether the expression is one that needs to be
        # translated.
        tokens = parse_expr(expr)
        return forall(tok_ok, tokens) and \
               exists(lambda t: token_type(t) == 'locid', tokens) and \
               exists(lambda t: token_type(t) == 'number', tokens) and \
               exists(lambda t: t == '-', tokens) and \
               exists(lambda t: t == '.', tokens)

    def conv1(result, (a, b, c)):
        if re.match('#', b):
            # Preprocessor line
            result.append((a, b, c))
        else:
            (b1, b2, b3) = parse_iparts(b)
            mo = re.match(pseudos + ccce(g_ccid), b3)
            if mo:
                p = mo.group(1)
                expr = b3[len(p):]
                if dotrel_match(expr):
                    sym = prefix + str(len(result))
                    instr = sym + ' =' + expr
                    result.append(('', instr, '\n'))
                    result.append((a, b1 + b2 + p + ' ' + sym, c))
                else:
                    result.append((a, b, c))
            else:
                result.append((a, b, c))
        return result

    reduce(conv1, instrs, result)
    return result

def read_input():
    # Concatenate all the input files into a string.
    #
    def fnl(s):
        if s == '' or s[-1] == '\n':
            return s
        else:
            return s + '\n'

    if len(sys.argv) < 2:
        return fnl(sys.stdin.read())
    else:
        input = ""
        for f in sys.argv[1:]:
            try:
                fd = open(f)
                input = input + fnl(fd.read())
                fd.close()
            except:
                sys.stderr.write('arm-as-to-ios: cannot open ' + f + '\n')
        return input

def parse_instrs(s):
    # Parse the string into assembly instructions, also noting C
    # preprocessor lines. Each instruction is represented as a triple:
    # (space/comments, instruction, end). The end is either ';' or
    # '\n'.
    #
    def goodmo(mo):
        if mo == None:
            # Should never happen
            sys.stderr.write('arm-as-to-ios: internal parsing error\n')
            sys.exit(1)

    cpp_re = '([ \t]*)(#([^\n]*\\\\\n)*[^\n]*[^\\\\\n])\n'
    comment_re = '[ \t]*#[^\n]*'
    instr_re = (
        '(([ \t]|/\*.*?\*/|@[^\n]*)*)'  # Spaces & comments
        '(([ \t]|/\*.*?\*/|[^;\n])*)'   # "Instruction"
        '([;\n])'                       # End
    )
    instrs = []
    while s != '':
        if re.match('[ \t]*#[ \t]*(if|ifdef|elif|else|endif|define)', s):
            mo = re.match(cpp_re, s)
            goodmo(mo)
            instrs.append((mo.group(1), mo.group(2), '\n'))
        elif re.match('[ \t]*#', s):
            mo = re.match(comment_re, s)
            goodmo(mo)
            instrs.append((mo.group(0), '', '\n'))
        else:
            mo = re.match(instr_re, s, re.DOTALL)
            goodmo(mo)
            instrs.append((mo.group(1), mo.group(3), mo.group(5)))
        s = s[len(mo.group(0)):]
    return instrs

def parse_iparts(i):
    # Parse an instruction into smaller parts, returning a triple of
    # strings (label, colon, operation). The colon part also contains
    # any surrounding spaces and comments (making the label and the
    # operation cleaner to process).
    #
    # (Caller warrants that the given string doesn't start with space or
    # a comment. This is true for strings returned by the instruction
    # parser.)
    #
    lab_re = (
        '([^ \t:/@]+)'                  # Label
        '(([ \t]|/\*.*?\*/|@[^\n]*)*)'  # Spaces & comments
        ':'                             # Colon
        '(([ \t]|/\*.*?\*/|@[^\n]*)*)'  # Spaces & comments
        '([^\n]*)'                      # Operation
    )
    if len(i) > 0 and i[0] == '#':
        # C preprocessor line; treat as operation.
        return ('', '', i)
    mo = re.match(lab_re, i)
    if mo:
        return (mo.group(1), mo.group(2) + ':' + mo.group(4), mo.group(6))
    # No label, just an operation
    return ('', '', i)
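
# For illustration (hypothetical input): parse_instrs() represents the
# line "foo: mov r0, #1 @ set flag\n" as the triple
#
#     ('', 'foo: mov r0, #1 @ set flag', '\n')
#
# and parse_iparts() splits that instruction field into
#
#     ('foo', ': ', 'mov r0, #1 @ set flag')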

def parse_expr(s):
    # Parse a string into a sequence of tokens. A segment of white
    # space (including comments) is treated as a token, so that the
    # tokens can be reassembled into the string again.
    #
    result = []
    while s != '':
        mo = re.match('([ \t]|/\*.*?\*/|@.*)+', s)
        if not mo:
            # Glo(...) and Loc(...) are single tokens
            mo = re.match('(Glo|Loc)\([^()]*\)', s)
        if not mo:
            mo = re.match('"([^\\\\"]|\\\\.)*"', s)
        if not mo:
            mo = re.match(g_ccid0 + g_ccid + '*', s)
        if not mo:
            mo = re.match('[0-9]+[bf]', s)
        if not mo:
            mo = re.match('0[Xx][0-9a-fA-F]+|[0-9]+', s)
        if not mo:
            mo = re.match('.', s)
        result.append(mo.group(0))
        s = s[len(mo.group(0)):]
    return result

def parse_rexpr(s):
    # Like parse_expr(), but return only "real" tokens, not the
    # intervening space.
    #
    return filter(lambda t: token_type(t) != 'space', parse_expr(s))

def token_type(t):
    # Determine the type of a token. Caller warrants that it was
    # returned by parse_expr() or parse_rexpr().
    #
    if re.match('[ \t]|/\*|@', t):
        return 'space'
    if re.match('Glo\(', t):
        return 'gloid'
    if re.match('Loc\(', t):
        return 'locid'
    if re.match('"', t):
        return 'string'
    if re.match(g_ccid0, t):
        return 'id'
    if re.match('[0-9]+[bf]', t):
        return 'label'
    if re.match('[0-9]', t):
        return 'number'
    return t  # Sui generis

def debug_parse(a, b, c):
    # Show results of instruction stream parse.
    #
    (b1, b2, b3) = parse_iparts(b)
    newb = '{' + b1 + '}' + '{' + b2 + '}' + '{' + b3 + '}'
    sys.stdout.write('{' + a + '}' + newb + c)

def main():
    instrs = parse_instrs(read_input())
    instrs = explicit_address_loads(instrs)
    instrs = funtypes(instrs)
    instrs = jump_tables(instrs)
    instrs = global_symbols(instrs)
    instrs = local_symbols(instrs)
    instrs = dot_relative(instrs)
    instrs = add_prefix(instrs)
    for (a, b, c) in instrs:
        sys.stdout.write(a + b + c)

main()
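
# Typical use (illustrative): the script reads Linux-style ARM assembly
# from the named files (or stdin) and writes the converted source to
# stdout. The result still contains cpp directives, so it should be run
# through the C preprocessor before assembly, with SYS_macosx and one of
# MODEL_armv6/MODEL_armv7 defined when targeting iOS, e.g. by handing it
# to the iOS compiler driver as a .S file (file names are hypothetical):
#
#     arm-as-to-ios arm.s > arm.ios.S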