#!/usr/bin/awk -f
#
# Core of the assembler: opcode/register tables, the expression evaluator,
# the byte/word/modR/M emitters and the rules that run before a source line
# is parsed.
#
# Globals shared with the rest of the file:
#   of       output mode: "lst" (listing, default), "hex" or "sym"
#   pos      offset of the line currently being assembled
#   hex      hex digits emitted so far for the current line
#   errors   number of errors reported through err()
#   sym      labels defined during this run
#   prevsym  labels loaded from "<name> sym <value>" input lines

BEGIN {
    # default to a listing when the caller did not select an output mode
    if (!of) of = "lst"
    pos = 0
    errors = 0

    # lookup table for ord()
    for (i = 32; i <= 128; i++) {
        t = sprintf("%c", i)
        _ord_[t] = i
    }

    # 8-bit general purpose registers
    r8["al"] = 0; r8["cl"] = 1; r8["dl"] = 2; r8["bl"] = 3
    r8["ah"] = 4; r8["ch"] = 5; r8["dh"] = 6; r8["bh"] = 7

    # 16-bit general purpose registers
    r16["ax"] = 0; r16["cx"] = 1; r16["dx"] = 2; r16["bx"] = 3
    r16["sp"] = 4; r16["bp"] = 5; r16["si"] = 6; r16["di"] = 7

    # segment registers
    sreg["es"] = 0; sreg["cs"] = 1; sreg["ss"] = 2; sreg["ds"] = 3

    # indirect access register combinations (both spellings of each pair,
    # because expr() concatenates registers in the order they are written)
    modstr["bxsi"] = 0; modstr["sibx"] = 0
    modstr["bxdi"] = 1; modstr["dibx"] = 1
    modstr["bpsi"] = 2; modstr["sibp"] = 2
    modstr["bpdi"] = 3; modstr["dibp"] = 3
    modstr["si"] = 4
    modstr["di"] = 5
    modstr["bp"] = 6
    modstr["bx"] = 7

    # ALU operations
    alu["add"] = 0; alu["or"]  = 1; alu["adc"] = 2; alu["sbb"] = 3
    alu["and"] = 4; alu["sub"] = 5; alu["xor"] = 6; alu["cmp"] = 7

    # near conditional jumps (0x70 + condition code)
    ops_rel8["jo"]   = 112 + 0
    ops_rel8["jno"]  = 112 + 1
    ops_rel8["jb"]   = 112 + 2;  ops_rel8["jc"]   = 112 + 2;  ops_rel8["jnae"] = 112 + 2
    ops_rel8["jae"]  = 112 + 3;  ops_rel8["jnb"]  = 112 + 3;  ops_rel8["jnc"]  = 112 + 3
    ops_rel8["je"]   = 112 + 4;  ops_rel8["jz"]   = 112 + 4
    ops_rel8["jne"]  = 112 + 5;  ops_rel8["jnz"]  = 112 + 5
    ops_rel8["jbe"]  = 112 + 6;  ops_rel8["jna"]  = 112 + 6
    ops_rel8["ja"]   = 112 + 7;  ops_rel8["jnbe"] = 112 + 7
    ops_rel8["js"]   = 112 + 8
    ops_rel8["jns"]  = 112 + 9
    ops_rel8["jp"]   = 112 + 10; ops_rel8["jpe"]  = 112 + 10
    ops_rel8["jnp"]  = 112 + 11; ops_rel8["jpo"]  = 112 + 11
    ops_rel8["jl"]   = 112 + 12; ops_rel8["jnge"] = 112 + 12
    ops_rel8["jge"]  = 112 + 13; ops_rel8["jnl"]  = 112 + 13
    # "jbl" was a misspelling of "jnl"; kept so existing sources still assemble
    ops_rel8["jbl"]  = 112 + 13
    ops_rel8["jle"]  = 112 + 14; ops_rel8["jng"]  = 112 + 14
    ops_rel8["jg"]   = 112 + 15; ops_rel8["jnle"] = 112 + 15
    ops_rel8["loopne"] = 224;    ops_rel8["loopnz"] = 224
    ops_rel8["loope"]  = 225;    ops_rel8["loopz"]  = 225
    ops_rel8["loop"]   = 226
    ops_rel8["jcxz"]   = 227
    ops_rel8["jmp"]    = 235

    # single-byte opcodes without operands
    ops_sb["nop"]   = 128 + 16
    ops_sb["movsb"] = 164
    ops_sb["movsw"] = 165
    ops_sb["stosb"] = 170
    ops_sb["stosw"] = 171
    ops_sb["lodsb"] = 172
    ops_sb["lodsw"] = 173
    ops_sb["ret"]   = 195
    ops_sb["retf"]  = 203
    ops_sb["hlt"]   = 244

    # prefix instructions
    prefix["es"]   = 38     # 26
    prefix["cs"]   = 46     # 2E
    prefix["ss"]   = 54     # 36
    prefix["ds"]   = 62     # 3E (was 64, which is the opcode of "inc ax")
    prefix["lock"] = 240    # F0
    # F3 is REP/REPE/REPZ and F2 is REPNE/REPNZ
    prefix["rep"]   = 243
    prefix["repe"]  = 243
    prefix["repz"]  = 243
    prefix["repne"] = 242
    prefix["repnz"] = 242
}

# error string to insert into listing
# Counts the error in every output mode; the text is only shown in "lst" mode.
function err(str) {
    errors++
    if (of == "lst") printf("**** %s\n", str)
}

# submit an assembling result to output
# sets the label of the current line to off and, in "lst" mode, prints the
# listing line; afterwards advances pos by the emitted bytes and clears hex
function submit(off) {
    if (of == "lst") printf("%04X %-18s %s\n", off, hex, $0)
    if (of == "hex" && hex) printf("%s", hex)
    if (label) {
        if (of == "sym") printf("%s\tsym\t%d\n", label, off)
        if (label in prevsym && prevsym[label] != off) {
            err("label " label " different during second pass, was " prevsym[label] ", now " off)
        }
        sym[label] = off
    }
    # two hex digits per emitted byte
    pos = pos + length(hex) / 2
    hex = ""
}

# convert a numeric literal to a number
# accepts decimal, a 0x prefix, or a trailing h (hex digits in either case);
# hex is converted by hand so the result does not depend on how the awk
# implementation's int() treats hexadecimal strings
function _parse_num(s,    digits, i, v) {
    s = tolower(s)
    if (s ~ /^0x[0-9a-f]+$/) {
        digits = substr(s, 3)
    } else if (s ~ /^[0-9][0-9a-f]*h$/) {
        digits = substr(s, 1, length(s) - 1)
    } else {
        return int(s)
    }
    v = 0
    for (i = 1; i <= length(digits); i++) {
        v = v * 16 + index("0123456789abcdef", substr(digits, i, 1)) - 1
    }
    return v
}

# evaluate an expression made of terms joined by + and -
# returns the numeric value (also left in the global val)
# globals set:
#   ecrit: value known after first pass, 1=yes, 0=no
#   eregs: concatenated list of registers that add to this expr (for modrm)
function expr(str,    parts, n, idx, term, sign) {
    val = 0
    ecrit = 1
    eregs = ""
    # turn every subtraction into the addition of a negated term
    gsub(/-/, "+-", str)
    n = split(str, parts, "+")
    # walk the terms left to right so eregs follows the written order
    for (idx = 1; idx <= n; idx++) {
        term = parts[idx]
        # a leading sign leaves an empty first term after the rewrite above
        if (term == "" && idx == 1) continue
        sign = 1
        if (substr(term, 1, 1) == "-") {
            sign = -1
            term = substr(term, 2)
        }
        # local labels are qualified with the last non-local label
        if (substr(term, 1, 1) == ".") term = plabel term
        if (term in r8 || term in r16 || term in sreg) {
            if (sign > 0) {
                eregs = eregs term
            } else {
                err("Registers cannot be subtractive in expressions")
            }
        } else if (term ~ /^[0-9]/) {
            val = val + sign * _parse_num(term)
        } else if (term in sym) {
            val = val + sign * sym[term]
        } else if (term in prevsym) {
            # only known from the previous pass, so the value may still move
            val = val + sign * prevsym[term]
            ecrit = 0
        } else {
            err("Undefined label " term)
            ecrit = 0
        }
    }
    return val
}

# evaluate an expression that must not contain registers
function imm(str) {
    val = expr(str)
    if (eregs) err("Registers not allowed here")
    return val
}

# evaluate an expression whose value must already be final in this pass
function crit(str) {
    val = imm(str)
    if (!ecrit) err("Labels from below not allowed here")
    return val
}

# append one byte (0..255) to hex
# an out-of-range value is reported and reduced modulo 256, so hex keeps
# exactly two digits per byte and pos stays correct
function push_byte(val) {
    #print("; pushb " val)
    if (val < 0 || val >= 256) {
        err("Value " val " does not fit in byte")
        val = val % 256
        if (val < 0) val = val + 256
    }
    hex = hex sprintf("%02X", val)
}

# append one byte given as a signed value; negatives use two's complement
function push_signed_byte(val) {
    # anything below -128 would silently wrap to an unrelated value
    if (val < -128) err("Value " val " does not fit in signed byte")
    while (val < 0) val = val + 256
    push_byte(val)
}

# append a 16-bit word, low byte first; negatives use two's complement
function push_word(val) {
    # values outside one full 16-bit wrap would produce more than 4 digits
    if (val <= -65536 || val >= 65536) {
        err("Value " val " does not fit in word")
        val = val % 65536
    }
    while (val < 0) val = val + 65536
    t = sprintf("%04X", val)
    hex = hex substr(t, 3) substr(t, 1, 2)
}

# emit a modR/M byte plus displacement for operand str
# spare goes into the middle (reg) field
# rs is the register set (r8, r16) that can show up in str
function push_modrm(str, spare, rs) {
    mod = 0
    rm = 0
    if (str in rs) {
        mod = 3
        rm = rs[str]
    } else if (substr(str, 1, 1) == "[") {
        gsub(/^\[|\]$/, "", str)
        disp = expr(str)
        # a displacement that is non-zero or not yet final gets 16 bits
        if (!ecrit || disp) {
            mod = 2
        }
        if (!eregs) {
            # plain address: mod=0 rm=6 followed by a 16-bit displacement
            mod = 0
            rm = 6
        } else if (eregs in modstr) {
            rm = modstr[eregs]
            # [BP] is unencodable, this combination is read as [0000]
            # so we upgrade [BP] to [BP+00]
            if (mod == 0 && rm == 6) {
                mod = 1
            }
        } else {
            err("Bad modR/M")
        }
    } else {
        # neither a register of this set nor a memory reference; without
        # this the byte below would silently encode [bx+si]
        err("Bad modR/M")
    }
    #print("; modR/M:", mod, spare, rm)
    push_byte(mod * 64 + spare * 8 + rm)
    if (mod == 1) {
        push_byte(disp)
    } else if (mod == 2 || (mod == 0 && rm == 6)) {
        push_word(disp)
    }
}

# common encoding: two operands, one is modrm, other is register via spare field
# last two bits of opcode specify width (byte/word) and whether modrm operand is first or second
# emits nothing when no operand combination matches
function push_op_modrm(opcode) {
    if (rm1 && byteop && op2 in r8) {
        push_byte(opcode)
        push_modrm(op1, r8[op2], r8)
    } else if (rm1 && wordop && op2 in r16) {
        push_byte(opcode + 1)
        push_modrm(op1, r16[op2], r16)
    } else if (rm2 && byteop && op1 in r8) {
        push_byte(opcode + 2)
        push_modrm(op2, r8[op1], r8)
    } else if (rm2 && wordop && op1 in r16) {
        push_byte(opcode + 2 + 1)
        push_modrm(op2, r16[op1], r16)
    }
}

# common encoding: one operand encoded as modrm with fixed spare field
# operand can be byte or word, encoded in last bit of opcode
# returns 1 for a byte operation, 2 for a word operation, 0 if nothing was emitted
function push_op_fixed_spare(opcode, spare) {
    if (byteop) {
        push_byte(opcode)
        push_modrm(op1, spare, r8)
        return 1
    } else if (wordop) {
        push_byte(opcode + 1)
        push_modrm(op1, spare, r16)
        return 2
    }
    return 0
}

# don't process empty lines or comment lines
/^( |\t)*(;|%)/ || /^( |\t)*$/ {
    if (of == "lst") printf("%24s%s\n", "", $0)
    next
}

# load symbols from previous pass
$2 == "sym" {
    prevsym[$1] = int($3)
    #printf("; %s (%s=%X)\n", $0,$1,prevsym[$1])
    next
}

# Start parsing the line
# and set up per-line vars
{
    label = $1
    gsub(/:$/, "", label)
    opn = 2
    split("", b, ":")
    byteop = 1
    wordop = 1
    dwordop = 1
}
# no label on line! fixup
/^ / || /^\t/ {
    label = ""
    opn = 1
}

# Split the statement into op and its operands and narrow down the
# operand size flags.
{
    # implement local labels
    if (substr(label, 1, 1) == ".") {
        label = plabel label
    } else if (label) {
        plabel = label
    }

    op = $(opn)
    # a prefix is emitted right away; the real opcode is the next field
    if (op in prefix) {
        push_byte(prefix[op])
        opn = opn + 1
        op = $(opn)
    }

    # take note if we got an instruction size specifier
    if ($(opn + 1) == "byte" || $(opn + 1) == "short") {
        wordop = 0
        dwordop = 0
        opn++
    } else if ($(opn + 1) == "word" || $(opn + 1) == "near") {
        byteop = 0
        dwordop = 0
        opn++
    } else if ($(opn + 1) == "dword" || $(opn + 1) == "far") {
        byteop = 0
        wordop = 0
        opn++
    }

    # collect operands; a quoted string may span several fields and is
    # re-joined with single spaces
    split("", a, ":")
    c = 0
    for (i = opn + 1; i <= NF; i++) {
        if (substr($(i), 1, 1) == ";") break
        a[++c] = $(i)
        if (substr($(i), 1, 1) == "\"") {
            j = index(substr($(i), 2), "\"")
            if (j) {
                # the string opens and closes inside this one field; do not
                # pull the following operands into it
                a[c] = substr($(i), 1, j + 1)
            } else {
                # bounded by NF rather than by the field's truth value, so a
                # field reading "0" inside the string does not end the scan
                while (i < NF) {
                    i++
                    j = index($(i), "\"")
                    if (j) {
                        a[c] = a[c] " " substr($(i), 1, j)
                        break
                    }
                    a[c] = a[c] " " $(i)
                }
                if (!j) err("Unterminated string")
            }
        } else {
            gsub(/,$/, "", a[c])
        }
    }
    op1 = a[1]
    op2 = a[2]

    # pre-estimate if operand could be encoded as modrm
    rm1 = (op1 in r16) || (op1 in r8) || substr(op1, 1, 1) == "["
    rm2 = (op2 in r16) || (op2 in r8) || substr(op2, 1, 1) == "["

    # if byte register in operands, it can't be a word or dword operation
    if (op1 in r8 || op2 in r8) {
        wordop = 0
        dwordop = 0
    }
    # if word or segment register in operands, it can't be a byte or dword operation
    if (op1 in r16 || op2 in r16 || op1 in sreg || op2 in sreg) {
        byteop = 0
        dwordop = 0
    }
}

# the source line is parsed by here:
# - op: opcode name
# - a: array of operands, starting with 1
# - c: number of operands
# - byteop, wordop, dwordop: test before encoding, all 1 per default

# pseudo-opcodes
op == "cpu" { next }

op == "org" {
    pos = crit(a[1])
    submit(pos)
    next
}

op == "equ" {
    val = crit(a[1])
    submit(val)
    next
}

# NOTE(review): in the reviewed copy the text between "for(j=2;j" in this rule
# and the first branch of the mov rule was missing. The loop bound/body below,
# the non-string branch and the mov rule header are a minimal reconstruction;
# confirm them against the original and restore any rules that stood between
# db and mov.
op == "db" {
    for (i = 1; i <= c; i++) {
        if (substr(a[i], 1, 1) == "\"") {
            # characters between the opening and the closing quote
            for (j = 2; j < length(a[i]); j++) {
                push_byte(_ord_[substr(a[i], j, 1)])
            }
        } else {
            push_byte(imm(a[i]))
        }
    }
}

op == "mov" {
    # modrm <-> reg
    if (rm1 && op2 in r8) {
        push_byte(136) # 88
        push_modrm(op1, r8[op2], r8)
    } else if (rm1 && op2 in r16) {
        push_byte(137) # 89
        push_modrm(op1, r16[op2], r16)
    } else if (op1 in r8 && rm2) {
        push_byte(138)
        push_modrm(op2, r8[op1], r8)
    } else if (op1 in r16 && rm2) {
        push_byte(139)
        push_modrm(op2, r16[op1], r16)
    # modrm <-> sreg
    } else if (rm1 && op2 in sreg) {
        push_byte(140)
        push_modrm(op1, sreg[op2], r16)
    } else if (rm2 && op1 in sreg) {
        push_byte(142)
        push_modrm(op2, sreg[op1], r16)
    # reg <- imm
    } else if (op1 in r8) {
        push_byte(176 + r8[op1])
        push_byte(imm(op2))
    } else if (op1 in r16) {
        push_byte(184 + r16[op1])
        push_word(imm(op2))
    # modrm <- imm: opcode, modR/M of the destination, then the immediate.
    # The destination is op1 (rm1 is only the "could be modrm" flag).
    # Without a size specifier both flags are set and byte wins, as before.
    } else if (byteop && rm1) {
        push_byte(198) # C6 /0 ib
        push_modrm(op1, 0, r8)
        push_byte(imm(op2))
    } else if (wordop && rm1) {
        push_byte(199) # C7 /0 iw
        push_modrm(op1, 0, r16)
        push_word(imm(op2))
    }
}

op == "push" && op1 in r16 {
    push_byte(80 + r16[op1])
}

op == "pop" && op1 in r16 {
    push_byte(88 + r16[op1])
}

# one rule for both forms, so "int 3" is not encoded twice
op == "int" {
    if (op1 == "3") {
        # CC breakpoint
        push_byte(204)
    } else {
        # CD
        push_byte(205)
        push_byte(imm(op1))
    }
}

# jmp: "short" (or "byte") forces the 2-byte form, "near" (or "word") the
# 3-byte form; otherwise the short form is used when the target is already
# final in this pass and within -128..127
op == "jmp" {
    val = imm(op1) - (pos + 2)
    if (byteop && (!wordop || (ecrit && val >= -128 && val <= 127))) {
        if (val < -128 || val > 127) err("Jump out of range")
        push_byte(235)
        push_signed_byte(val)
    } else {
        push_byte(233)
        # rel16 counts from the end of the 3-byte instruction
        push_word(val - 1)
    }
    submit(pos)
    next
}

op == "call" && wordop {
    push_byte(232)
    push_word(imm(op1) - (pos + 3))
}

op == "neg" {
    push_op_fixed_spare(246, 3)
}

# opcodes with rel8 encoding
op in ops_rel8 && byteop && c == 1 {
    val = imm(op1) - (pos + 2)
    # an unchecked distance would wrap and jump somewhere else
    if (val < -128 || val > 127) err("Jump out of range")
    push_byte(ops_rel8[op])
    push_signed_byte(val)
}

# opcodes without arguments
op in ops_sb {
    push_byte(ops_sb[op])
}

# everything that was not submitted by a rule above ends up here
{
    # a line carrying only a label has no opcode and is not an error
    if (!hex && op != "") err("no encoding found")
    submit(pos)
}

END {
    if (of == "hex") printf("\n")
    if (errors) exit(1)
}