diff --git a/configure.py b/configure.py index f3f11d84..a260f713 100644 --- a/configure.py +++ b/configure.py @@ -167,6 +167,7 @@ ] if args.debug: config.ldflags.append("-g") # Or -gdwarf-2 for Wii linkers + config.ldflags.append("-sym full") if args.map: config.ldflags.append("-mapunused") # config.ldflags.append("-listclosure") # For Wii linkers @@ -208,7 +209,7 @@ # Debug flags if args.debug: # Or -sym dwarf-2 for Wii compilers - cflags_base.extend(["-sym on", "-DDEBUG=1"]) + cflags_base.extend(["-sym full", "-DDEBUG=1"]) else: cflags_base.append("-DNDEBUG=1") diff --git a/ghidra_scripts/README.md b/ghidra_scripts/README.md new file mode 100644 index 00000000..bf455297 --- /dev/null +++ b/ghidra_scripts/README.md @@ -0,0 +1,83 @@ +# Ghidra Importer + +## What do you get from using the importer? + +`bfbb_import` is a script that takes basic symbols from the original game (in symbols.txt), and more detailed symbols from the reverse engineered code we can compile so far, and imports them into Ghidra for easier reverse engineering. + +Results of running the import: + +* Full parameter type, return type information, parameter names, global variable types etc. are imported for the contents of cpp files listed as `Matching` in `configure.py`: + +* ![test](gimport/function_with_return.png) + +* All struct types referenced in `Matching` files are imported: + +* ![test](gimport/struct_import.png) + +* Name and parameter types but _not_ return types are imported for other name mangled functions in `symbols.txt`: + +* ![test](gimport/function_with_paramn.png) + +* All other remaining symbols from `symbols.txt` are annotated in some way in the main Ghidra listing via labels. + +## Import Instructions + +### Step 1: Install Ghidra + +Download and "install" a recent version of Ghidra from https://github.com/NationalSecurityAgency/ghidra/releases. "Install" here just means unzipping the folder; there is no global install process for Ghidra.
+ +Note: You may need to install the JDK if you don't have it already. You will be prompted for this when running Ghidra if you don't have it. + +### Step 2: Install the DOL Extension + +Ghidra can't understand Gamecube DOL files out of the box. Install the Ghidra Gamecube loader from https://github.com/Cuyler36/Ghidra-GameCube-Loader/releases. + +### Step 3: Import the DOL + +Open Ghidra and choose `File > Import File...`, selecting the DOL file you put in `bfbb/orig/GQPE78/sys/main.dol` when setting up the repo. + +Open up the imported file and ***allow analysis to run when prompted***. This importer script expects the functions to already be created by analysis. + +### Step 4: Install Ghidrathon + +We need to give Ghidra the ability to run Python 3 code; we do this with the Ghidrathon extension. Download Ghidrathon from the releases page: https://github.com/mandiant/Ghidrathon/releases + +Follow the installation instructions on that page. You probably don't need to create a venv in this case, but you do need to run `ghidrathon_configure.py`. + +### Step 5: Install Importer Script Dependencies + +The importer script has a single additional Python package dependency on `elftools` to parse the elf file. Install it with the following command: + +```bash +pip install pyelftools +``` + +### Step 6: Add Script Directory + +In Ghidra, select `Window > Script Manager` to open the script manager. This is what we will use to run the script. + +In the script manager, at the top right, click the "Manage Script Directories" button: ![image](manage_script_directories.png) + +Click `+` at the top right of the script manager, and add `bfbb/ghidra_scripts` to the list of script directories. + +### Step 7: Run the Importer + +In the Script Manager, you should now be able to filter for `bfbb_import.py`. Select it and run it through the context menu or the run button at the top of the Script Manager.
+ +Importing will take as long as a clean build does because we temporarily have to make a debug build of the executable to get the parameter names and other info from already reverse engineered functions (the script will restore your previous build settings after doing so). + +### Step 8: (Optionally) Change Additional Files to Matching + +The importer script only imports types referenced in files linked into the final DOL file the build generates. To generate matching DOLs, the build normally only links compilation units which are 100% matching. + +If you're working on a cpp file with structures you want to import into Ghidra, you're not bound by this limitation! As long as enough contents are defined in the file you're working on for it to link you can import things from it. + +Temporarily change the file in question to "Matching" in `configure.py`, and re-run the importer. Note that if you build with the file changed to Matching when it is not a 100% match yet, this will give you a "not matching" error at the end of the build. That's expected: The import will still be able to import the symbols correctly regardless because it uses the memory mapping in symbols.txt. + +### Step 9: Enjoy The Results + +Most functions should now have name / parameter info rather than just being FUN_xxxxxxxx. No more having to look stuff up in symbols.txt!
def demangle(mangled_name: str, resolve_ud) -> Tuple[str, List[GType]]:
    """Split a mangled symbol into a readable name and its parameter types.

    Args:
        mangled_name: The symbol text, e.g. ``name__<qualifiers><signature>``.
        resolve_ud: Callable invoked with an identifier (the last component
            of a qualified name, or a length-prefixed struct name) and
            expected to return the matching user-defined type.

    Returns:
        ``(name, params)`` where ``name`` has its namespaces joined with
        ``::`` and ``params`` is a list of parsed types.  When the symbol is
        namespace-qualified and ``resolve_ud`` returns a truthy type for the
        innermost namespace, a pointer to that type is prepended to
        ``params``.  Returns ``None`` (despite the annotation) when there is
        no ``__`` separator after the first character, when the symbol is
        listed in ``SPECIAL_IGNORE``, or when nothing follows the namespace
        qualifiers.

    An unrecognised type code prints a message and terminates the
    interpreter through ``exit(0)``.
    """
    # Cut off name
    index = mangled_name.find("__", 1)  # 1 instead of 0 to skip __ in names like __ct
    if index == -1:
        # Not a mangled function
        return None
    # Don't know how to demangle some things
    if mangled_name in SPECIAL_IGNORE:
        return None
    name = mangled_name[:index]  # Name part only
    without_name = mangled_name[index+2:]  # Cut off name

    # Cut off namespacing bits: either "Q<n>" followed by n length-prefixed
    # components, or a single length-prefixed component.
    namespaces = []
    while len(without_name) > 0 and (without_name[0] == "Q" or str.isdigit(without_name[0])):
        if without_name[0] == "Q":
            # Only one digit is read for the component count.
            qualification_count = int(without_name[1])
            without_name = without_name[2:]
            for i in range(qualification_count):
                (namespace_len_text, rest) = re.match(r"^(\d+)(.*)", without_name).groups()
                namespace_len = int(namespace_len_text)
                namespaces.append(rest[:namespace_len])
                without_name = rest[namespace_len:]
        else:
            (len_str, rest) = re.match(r"^(\d+)(.*)", without_name).groups()
            namespace_len = int(len_str)
            namespaces.append(rest[:namespace_len])
            without_name = rest[namespace_len:]
    # Type of the innermost namespace; later used as the implicit first parameter.
    this_type = resolve_ud(namespaces[-1]) if namespaces else None

    # Namespaced global variable, not a function
    if len(without_name) == 0:
        return None

    # Handle special names
    if name.startswith("__"):
        if name in SPECIAL_NAME_TO_OPERATOR:
            name = f"operator{SPECIAL_NAME_TO_OPERATOR[name]}"
        elif name == "__ct":
            # Constructor: named after the innermost namespace.
            name = namespaces[-1]
        elif name == "__dt":
            # Destructor.
            name = f"~{namespaces[-1]}"

    # Add namespaces to name
    name = "::".join(namespaces + [name])

    # C -> Const method.
    is_const = without_name[0] == "C"
    if is_const:
        without_name = without_name[1:]

    # F -> function, no F -> method.
    is_member = without_name[0] != "F"
    whole_text = without_name if is_member else without_name[1:]

    # Easier to handle this here
    if whole_text == "v":
        return (name, [])

    """
    Ann_    Array
    P       pointer
    C       constant
    Qn      qualified name, n parts

    b       bool
    c       char
    s       short
    i       int
    l       long
    x       long long
    f       float
    d       double
    e       vararg
    nn      struct
    """
    def parse_type(text: str) -> Tuple[GType, str]:
        # Consume one type from the front of `text`; return (type, remaining text).
        if text.startswith("A"):
            (dim, rest) = re.match(r"^A([0-9]+)_(.*)", text).groups()
            (type, rest) = parse_type(rest)
            array_type = GArrayType()
            count = DwarfSubscriptDataItem()
            count.highBound.isConstant = True
            # NOTE(review): the high bound is stored as dim + 1 — confirm this is intended.
            count.highBound.constant = int(dim) + 1
            element_type = DwarfSubscriptDataItem()
            # Sets a `type` attribute on the item to carry the element type.
            element_type.type = type
            array_type.subscripts = [count, element_type]
            return (array_type, rest)
        elif text.startswith("F"):
            # Function type: parameter types are parsed only to skip over
            # them, up to the terminating "_".
            text = text[1:]
            if text.startswith("v"):
                text = text[1:]
            else:
                while text and not text.startswith("_"):
                    (param_type, text) = parse_type(text)
            assert text.startswith("_"), f"Expect _ after function type in {mangled_name}"
            # TODO: Actually handle function type
            return (GPointerType(GFundType(DW_FT.void)), text[1:])
        elif text.startswith("Q"):
            # Qualified name: only the last component is resolved.
            qualification_count = int(text[1])
            text = text[2:]
            parts = []
            for i in range(qualification_count):
                (namespace_len_text, rest) = re.match(r"^(\d+)(.*)", text).groups()
                namespace_len = int(namespace_len_text)
                parts.append(rest[:namespace_len])
                text = rest[namespace_len:]
            return (resolve_ud(parts[-1]), text)
        elif text.startswith("Pv"):
            # Pointer to void is special
            return (GPointerType(GFundType(DW_FT.void)), text[2:])
        elif text.startswith("PCv"):
            # Pointer to const void: constness dropped.
            return (GPointerType(GFundType(DW_FT.void)), text[3:])
        elif text.startswith("P") or text.startswith("R"):
            # Both "P" and "R" prefixes become a GPointerType.
            (type, rest) = parse_type(text[1:])
            pointer_type = GPointerType(type)
            return (pointer_type, rest)
        elif text.startswith("C"):
            # Constness ignored here
            return parse_type(text[1:])
        elif text.startswith("b"):
            return (GFundType(DW_FT.bool), text[1:])
        elif text.startswith("c"):
            return (GFundType(DW_FT.S8), text[1:])
        elif text.startswith("s"):
            return (GFundType(DW_FT.S16), text[1:])
        elif text.startswith("i"):
            return (GFundType(DW_FT.S32), text[1:])
        elif text.startswith("l"):
            return (GFundType(DW_FT.SLong), text[1:])
        elif text.startswith("x"):
            return (GFundType(DW_FT.S64), text[1:])
        elif text.startswith("f"):
            return (GFundType(DW_FT.F32), text[1:])
        elif text.startswith("d"):
            return (GFundType(DW_FT.F64), text[1:])
        elif text.startswith("Uc"):
            return (GFundType(DW_FT.U8), text[2:])
        elif text.startswith("Us"):
            return (GFundType(DW_FT.U16), text[2:])
        elif text.startswith("Ui"):
            return (GFundType(DW_FT.U32), text[2:])
        elif text.startswith("Ul"):
            return (GFundType(DW_FT.ULong), text[2:])
        else:
            # Handle struct
            if match := re.match(r"^(\d+)(.*)", text):
                (ident_len_text, rest) = match.groups()
                ident_len = int(ident_len_text)
                ident = rest[:ident_len]
                rest = rest[ident_len:]
                return (resolve_ud(ident), rest)
            else:
                # NOTE(review): this ends the whole interpreter with exit
                # status 0 rather than reporting an error to the caller.
                print("Unexpected mangle:", text, mangled_name)
                exit(0)

    result = []
    if this_type:
        # Implicit first parameter: pointer to the innermost namespace's type.
        result.append(GPointerType(this_type))
    while whole_text:
        # End of empty arg list, or variable args
        if whole_text.startswith("v") or whole_text.startswith("e"):
            return (name, result)
        (type, whole_text) = parse_type(whole_text)
        result.append(type)
    return (name, result)
STRING = 0x8 + + +class DW_AT: + bit_offset = "DW_AT_bit_offset" + bit_size = "DW_AT_bit_size" + byte_size = "DW_AT_byte_size" + common_reference = "DW_AT_common_reference" + comp_dir = "DW_AT_comp_dir" + const_value = "DW_AT_const_value" + containing_type = "DW_AT_containing_type" + default_value = "DW_AT_default_value" + discr = "DW_AT_discr" + discr_value = "DW_AT_discr_value" + element_list = "DW_AT_element_list" + friends = "DW_AT_friends" + fund_type = "DW_AT_fund_type" + high_pc = "DW_AT_high_pc" + inline = "DW_AT_inline" + is_optional = "DW_AT_is_optional" + language = "DW_AT_language" + location = "DW_AT_location" + low_pc = "DW_AT_low_pc" + lower_bound = "DW_AT_lower_bound" + member = "DW_AT_member" + mod_fund_type = "DW_AT_mod_fund_type" + mod_u_d_type = "DW_AT_mod_u_d_type" + name = "DW_AT_name" + ordering = "DW_AT_ordering" + private = "DW_AT_private" + producer = "DW_AT_producer" + program = "DW_AT_program" + protected = "DW_AT_protected" + prototyped = "DW_AT_prototyped" + public = "DW_AT_public" + pure_virtual = "DW_AT_pure_virtual" + return_addr = "DW_AT_return_addr" + sibling = "DW_AT_sibling" + specification = "DW_AT_specification" + start_scope = "DW_AT_start_scope" + stride_size = "DW_AT_stride_size" + string_length = "DW_AT_string_length" + stmt_list = "DW_AT_stmt_list" + subscr_data = "DW_AT_subscr_data" + upper_bound = "DW_AT_upper_bound" + user_def_type = "DW_AT_user_def_type" + virtual = "DW_AT_virtual" + + mangled_name = "DW_AT_user_0x200" + global_ref = "DW_AT_user_0x202" + + DW_AT_fund_type = (0x0050 | DW_FORM.DATA2) + DW_AT_mod_fund_type = (0x0060 | DW_FORM.BLOCK2) + DW_AT_user_def_type = (0x0070 | DW_FORM.REF) + DW_AT_mod_u_d_type = (0x0080 | DW_FORM.BLOCK2) + + +class DW_TAG: + array_type = "DW_TAG_array_type" + class_type = "DW_TAG_class_type" + common_block = "DW_TAG_common_block" + common_inclusion = "DW_TAG_common_inclusion" + compile_unit = "DW_TAG_compile_unit" + entry_point = "DW_TAG_entry_point" + enumeration_type = 
"DW_TAG_enumeration_type" + formal_parameter = "DW_TAG_formal_parameter" + global_subroutine = "DW_TAG_global_subroutine" + global_variable = "DW_TAG_global_variable" + inheritance = "DW_TAG_inheritance" + inlined_subroutine = "DW_TAG_inlined_subroutine" + label = "DW_TAG_label" + lexical_block = "DW_TAG_lexical_block" + local_variable = "DW_TAG_local_variable" + member = "DW_TAG_member" + module = "DW_TAG_module" + padding = "DW_TAG_padding" + pointer_type = "DW_TAG_pointer_type" + ptr_to_member_type = "DW_TAG_ptr_to_member_type" + reference_type = "DW_TAG_reference_type" + set_type = "DW_TAG_set_type" + source_file = "DW_TAG_source_file" + string_type = "DW_TAG_string_type" + structure_type = "DW_TAG_structure_type" + subrange_type = "DW_TAG_subrange_type" + subroutine = "DW_TAG_subroutine" + subroutine_type = "DW_TAG_subroutine_type" + typedef = "DW_TAG_typedef" + union_type = "DW_TAG_union_type" + unspecified_parameters = "DW_TAG_unspecified_parameters" + variant = "DW_TAG_variant" + with_stmt = "DW_TAG_with_stmt" + + +class DW_FMT: + ET = 0x8 + + +class DW_OP: + REG = 0x01 + BASEREG = 0x02 + ADDR = 0x03 + CONST = 0x04 + DEREF2 = 0x05 + DEREF = 0x06 + DEREF4 = 0x06 + ADD = 0x07 + lo_user = 0xe0 + hi_user = 0xff + + +class DW_MOD: + pointer_to = 0x01 + reference_to = 0x02 + const = 0x03 + volatile = 0x04 + lo_user = 0x80 + hi_user = 0xff + + +class DwarfAttribute: + def __init__(self): + self.name: int = 0 + self.value: int | List[int] = None + + def parse(self, data: List[int], index: int) -> int: + self.name = int.from_bytes(data[index:index+2], byteorder='big') + index += 2 + at_form = self.name & 0xF + if at_form == DW_FORM.DATA2: + self.value = int.from_bytes(data[index:index+2], byteorder='big') + index += 2 + elif at_form == DW_FORM.REF: + self.value = int.from_bytes(data[index:index+4], byteorder='big') + index += 4 + elif at_form == DW_FORM.BLOCK2: + block_len = int.from_bytes(data[index:index+2], byteorder='big') + index += 2 + self.value = 
class DwarfAttribute:
    """One DWARF v1 attribute: a 16-bit name word followed by its value.

    The low four bits of the name word select the value form (compared
    against ``DW_FORM``); the whole word, form bits included, is kept in
    ``name``.
    """

    def __init__(self):
        self.name: int = 0
        self.value: int | List[int] = None

    def parse(self, data: List[int], index: int) -> int:
        """Decode one attribute from ``data`` starting at ``index``.

        All integers are read big-endian.  Only the DATA2, REF and BLOCK2
        forms are understood; any other form prints a message and ends the
        interpreter through ``exit(0)``.

        Returns:
            The index of the first byte after the attribute.
        """
        self.name = int.from_bytes(data[index:index+2], byteorder='big')
        index += 2
        at_form = self.name & 0xF
        if at_form == DW_FORM.DATA2:
            self.value = int.from_bytes(data[index:index+2], byteorder='big')
            index += 2
        elif at_form == DW_FORM.REF:
            self.value = int.from_bytes(data[index:index+4], byteorder='big')
            index += 4
        elif at_form == DW_FORM.BLOCK2:
            # 2-byte length prefix, then that many raw bytes kept as a slice.
            block_len = int.from_bytes(data[index:index+2], byteorder='big')
            index += 2
            self.value = data[index:index+block_len]
            index += block_len
        else:
            print("Unhandled form:", at_form)
            exit(0)
        return index


class DwarfLocationAtom:
    """A single location operation: an opcode and its operand value."""

    def __init__(self, op, value):
        self.op = op
        self.value = value


class DwarfLocation:
    """A location description made of a sequence of ``DwarfLocationAtom``."""

    def __init__(self):
        self.atoms: List[DwarfLocationAtom] = []

    def parse(self, data: List[int], index: int) -> int:
        """Append atoms decoded from ``data[index:]`` and return the end index.

        Each atom is read as a 1-byte opcode followed by a 4-byte big-endian
        operand, until ``index`` reaches the end of ``data``.

        NOTE(review): an operand is consumed for every opcode, including
        DEREF2/DEREF4/ADD — confirm those really carry one in this format.

        Returns:
            The index just past the last atom consumed.  The original
            annotated ``-> int`` but fell off the end and returned ``None``.
        """
        while index < len(data):
            op = data[index]
            index += 1
            value = int.from_bytes(data[index:index+4], byteorder='big')
            self.atoms.append(DwarfLocationAtom(op, value))
            index += 4
        return index

    def __str__(self):
        parts = []
        for atom in self.atoms:
            if atom.op == DW_OP.REG:
                parts.append(f"REG({atom.value})")
            elif atom.op == DW_OP.BASEREG:
                parts.append(f"BASEREG({atom.value})")
            elif atom.op == DW_OP.ADDR:
                parts.append(f"ADDR({atom.value})")
            elif atom.op == DW_OP.CONST:
                parts.append(f"CONST({atom.value})")
            elif atom.op == DW_OP.DEREF2:
                parts.append("DEREF2")
            elif atom.op == DW_OP.DEREF4:
                parts.append("DEREF4")
            elif atom.op == DW_OP.ADD:
                parts.append("ADD")
            else:
                parts.append(f"UNKNOWN({atom.op})")
        # Join outside the f-string: reusing the enclosing quote character
        # inside a replacement field only parses on Python 3.12+.
        joined = ",".join(parts)
        return f"Loc({joined})"


class DwarfSubscriptDataBound:
    """One array bound: either a constant or a location expression."""

    def __init__(self):
        self.isConstant = False
        self.constant = 0
        self.location: DwarfLocation = None

    def __str__(self):
        if self.isConstant:
            return f"{self.constant}"
        else:
            return f"{self.location}"
class DwarfType:
    """Type reference decoded from one of the four DWARF v1 type attributes.

    A type is either fundamental (``fundType`` holds the code) or
    user-defined (``udOffset`` holds the referenced offset); ``modifiers``
    lists any modifier bytes that preceded it.
    """

    def __init__(self):
        self.isFundamental = False
        self.fundType = 0
        self.udOffset = 0
        self.modifiers = []

    def parse(self, attr: DwarfAttribute):
        """Populate this object from ``attr``.

        ``attr.name`` must be one of the numeric ``DW_AT.DW_AT_*`` type
        attribute codes.  For the two ``mod_*`` forms ``attr.value`` is a
        list of bytes: leading bytes are modifiers and the trailing 2
        (fundamental) or 4 (user-defined) bytes are the big-endian type.
        """
        assert attr.name in [
            DW_AT.DW_AT_fund_type,
            DW_AT.DW_AT_user_def_type,
            DW_AT.DW_AT_mod_fund_type,
            DW_AT.DW_AT_mod_u_d_type
        ], "Unexpected attribute name"

        if attr.name == DW_AT.DW_AT_fund_type:
            self.isFundamental = True
            self.fundType = attr.value

        elif attr.name == DW_AT.DW_AT_user_def_type:
            self.isFundamental = False
            self.udOffset = attr.value

        elif attr.name == DW_AT.DW_AT_mod_fund_type:
            self.isFundamental = True
            assert isinstance(attr.value, list) and len(attr.value) >= 2, "Invalid block length for mod_fund_type"

            data = attr.value
            self.modifiers.extend(data[:-2])
            self.fundType = int.from_bytes(data[-2:], byteorder='big')

        elif attr.name == DW_AT.DW_AT_mod_u_d_type:
            self.isFundamental = False
            assert isinstance(attr.value, list) and len(attr.value) >= 4, "Invalid block length for mod_u_d_type"

            data = attr.value
            self.modifiers.extend(data[:-4])
            self.udOffset = int.from_bytes(data[-4:], byteorder='big')


class DwarfSubscriptDataItem:
    """One entry of an array's subscript data: a type plus low/high bounds."""

    def __init__(self):
        self.resolved_type = None
        self.dwarf_type = DwarfType()
        self.lowBound = DwarfSubscriptDataBound()
        self.highBound = DwarfSubscriptDataBound()

    def __str__(self):
        # `type` is not created by __init__; it only exists when a caller
        # assigned it.  Fall back to `resolved_type` so printing never
        # raises AttributeError.
        shown_type = getattr(self, "type", self.resolved_type)
        return f"SubscriptDataItem({shown_type}[{self.lowBound}-{self.highBound}])"


def DW_FT_to_string(type):
    """Return the codebase's name for a ``DW_FT`` fundamental type code.

    Returns ``None`` for codes that are not listed.
    """
    # Ordered pairs, compared with == in sequence.
    names = (
        (DW_FT.bool, "bool"),
        # char in spec, alias to S8 in our codebase
        (DW_FT.char, "S8"),
        (DW_FT.S8, "S8"),
        (DW_FT.U8, "U8"),
        (DW_FT.S16, "S16"),
        (DW_FT.U16, "U16"),
        (DW_FT.S32, "S32"),
        (DW_FT.U32, "U32"),
        (DW_FT.SLong, "SLong"),
        (DW_FT.ULong, "ULong"),
        (DW_FT.pointer, "void*"),
        (DW_FT.F32, "F32"),
        (DW_FT.F64, "F64"),
        (DW_FT.void, "void"),
        (DW_FT.S64, "S64"),
    )
    for code, label in names:
        if type == code:
            return label

    return None
class DIEV1(object):
    """A DWARF v1 debugging information entry parsed from a stream.

    Construction reads the entry at the stream's current position: a 32-bit
    size, then (for sizes of 8 or more) a 16-bit tag and a run of attributes
    until ``offset + size`` is reached.
    """

    def __init__(self, stm, cu, di):
        """Parse one DIE from ``stm``.

        Args:
            stm: Seekable binary stream positioned at the DIE's first byte.
            cu: Owning compile unit, or ``None``.
            di: The DWARF info object; its ``structs`` supply the parsers.
        """
        self.cu = cu
        self.dwarfinfo = di
        self.stream = stm
        self.offset = stm.tell()
        self.attributes = OrderedDict()
        self.tag = None
        self.has_children = None
        self.abbrev_code = None
        self.size = 0
        # Null DIE terminator. It can be used to obtain offset range occupied
        # by this DIE including its whole subtree.
        self._terminator = None
        self._parent = None

        structs = self.dwarfinfo.structs
        self.size = struct_parse(structs.Dwarf_uint32(''), stm)
        # Size 8+ can be padding if the tag is 0. No attributes in those.
        # DW_TAG_null and DW_TAG_padding are both code zero
        if self.size < 8:
            self.tag = 'DW_TAG_null'  # Null terminates the sibling chain
            self.has_children = False
        else:
            tag_code = struct_parse(structs.Dwarf_uint16(''), stm)
            # Do what pyelftools does, leave tag as int if unknown
            self.tag = TAG_reverse[tag_code] if tag_code in TAG_reverse else tag_code
            if self.tag == 'DW_TAG_null':  # TAG_padding in DWARF1 spec
                # NOTE(review): the next statement is a comparison, not an
                # assignment, so it has no effect and the tag stays
                # 'DW_TAG_null' (is_null() is True for padding).  The trailing
                # comment suggests an assignment was intended — confirm before
                # changing, since the iterators index DW_AT_sibling on every
                # non-null DIE and padding entries carry no attributes.
                self.tag == 'DW_TAG_padding'  # Doesn't count for is_null
                # No attributes, just advance the stream
                # (size minus the 4-byte size field and 2-byte tag already read)
                stm.seek(self.size-6, 1)
                self.has_children = False
            else:
                while stm.tell() < self.offset + self.size:
                    attr_offset = self.stream.tell()
                    attr = struct_parse(structs.Dwarf_uint16(''), stm)
                    # Low 4 bits: form; remaining bits: attribute code.
                    form = FORM_reverse[attr & 0xf]
                    attr >>= 4
                    if attr in ATTR_reverse:
                        name = ATTR_reverse[attr]
                    elif 0x200 <= attr <= 0x3ff:  # DW_AT_MIPS represented as 0x204???
                        name = 'DW_AT_user_0x%x' % attr
                    else:  # Do what pyelftools does, leave tag as int if unknown
                        name = attr

                    raw_value = struct_parse(structs.Dwarf_dw_form[form], stm)
                    value = raw_value

                    self.attributes[name] = AttributeValue(
                        name=name,
                        form=form,
                        value=value,
                        raw_value=raw_value,
                        offset=attr_offset,
                        indirection_length=0)
                # Children are assumed when the sibling lies at least 8 bytes
                # past the end of this DIE.  Raises KeyError if the DIE has
                # no DW_AT_sibling attribute.
                self.has_children = self.attributes['DW_AT_sibling'].value >= self.offset + self.size + 8

    def get_parent(self):
        """Return the parent DIE recorded by an iterator, or ``None``."""
        return self._parent

    def is_null(self):
        """Return True when this entry's tag is ``'DW_TAG_null'``."""
        return self.tag == 'DW_TAG_null'

    def iter_children(self) -> Iterator['DIEV1']:
        """Iterate children by following sibling offsets (delegates to the CU)."""
        return self.cu.iter_children(self)

    def iter_broken_children(self) -> Iterator['DIEV1']:
        """Iterate children without relying on sibling offsets (delegates to the CU)."""
        return self.cu.iter_broken_DIE_children(self)

    def sibling(self):
        """Return the value of this DIE's ``DW_AT_sibling`` attribute."""
        return self.attributes['DW_AT_sibling'].value
class CompileUnitV1(object):
    """A DWARF v1 compile unit, identified by its top-level DIE.

    DWARF v1 has no CU header, so a ``CUv1Header`` is synthesised from the
    top DIE's offset and its ``DW_AT_sibling`` value.  DIEs parsed through
    this object are cached in two parallel lists kept sorted by offset.
    """

    def __init__(self, di, top_die):
        self.dwarfinfo = di
        self.structs = di.structs
        end_offset = top_die.attributes['DW_AT_sibling'].value
        self.header = CUv1Header(version=1, unit_length=end_offset - top_die.offset, debug_abbrev_offset=None, address_size=4)
        self._dielist = [top_die]
        self._diemap = [top_die.offset]
        # For compatibility with v2+ CU
        self.cu_offset = top_die.offset
        self.cu_die_offset = top_die.offset

    def get_top_DIE(self):
        """Return the first cached DIE (the top DIE passed to the constructor)."""
        return self._dielist[0]

    def __getitem__(self, name):
        """Look up a synthesised header field by name."""
        return self.header._asdict()[name]

    @property
    def size(self):
        return self.header.unit_length  # No CU header here

    # Caches
    def DIE_at_offset(self, offset) -> DIEV1:
        """Return the DIE at ``offset``, parsing and caching it on first use."""
        i = bisect_left(self._diemap, offset)
        if i < len(self._diemap) and offset == self._diemap[i]:
            die = self._dielist[i]
        else:
            die = self.dwarfinfo.DIE_at_offset(offset, self)
            self._dielist.insert(i, die)
            self._diemap.insert(i, offset)
        return die

    # pyelftools' iter_DIEs sets parent on discovered DIEs, we should too
    def iter_DIEs(self):
        """Yield every non-null DIE in the unit in offset order.

        Each DIE's ``_parent`` is filled in (if unset) from a stack that is
        pushed when a DIE's end offset differs from its sibling offset and
        popped at each null entry.
        """
        offset = self.cu_offset
        parent = None
        parent_stack = list()
        end_offset = self.get_top_DIE().attributes['DW_AT_sibling'].value
        # The original dumped the unit's bytes into a throwaway local here
        # and read die.size/die.tag into `_` below; neither result was ever
        # used, so that instrumentation has been dropped.
        while offset < end_offset:
            die = self.DIE_at_offset(offset)

            if die._parent is None:
                die._parent = parent

            if not die.is_null():
                yield die
                offset += die.size
                if offset != die.sibling():  # Start of a subtree
                    parent_stack.append(parent)
                    parent = die
            else:  # null - end of a sibling chain
                if parent_stack:  # Only pop if there are items in the stack
                    parent = parent_stack.pop()
                else:
                    # We've reached the end of the DIE tree
                    parent = None
                offset += die.size

    def iter_children(self, parent_die) -> Iterator[DIEV1]:
        """Yield ``parent_die``'s direct children by hopping sibling offsets.

        Stops at the first null entry or at the parent's own sibling offset
        (the end of the section when the parent has none).
        """
        offset = parent_die.offset + parent_die.size
        after = parent_die.attributes['DW_AT_sibling'].value if 'DW_AT_sibling' in parent_die.attributes else self.dwarfinfo.section_size
        while offset < after:
            die = self.DIE_at_offset(offset)
            if die._parent is None:
                die._parent = parent_die
            if die.is_null():
                break
            else:
                yield die
                offset = die.attributes['DW_AT_sibling'].value

    # Whether a tag is treated as nested under the preceding entry when
    # sibling offsets cannot be trusted.  Tags missing from this table raise
    # KeyError in iter_broken_DIE_children.
    tag_is_child = {
        "DW_TAG_global_subroutine": False,
        "DW_TAG_structure_type": False,
        "DW_TAG_subroutine": False,
        "DW_TAG_subroutine_type": False,
        "DW_TAG_union_type": False,
        "DW_TAG_global_variable": False,
        "DW_TAG_array_type": False,
        "DW_TAG_enumeration_type": False,
        "DW_TAG_typedef": False,

        "DW_TAG_local_variable": True,
        "DW_TAG_formal_parameter": True,
        "DW_TAG_inheritance": True,
        "DW_TAG_member": True,
    }

    # Iterate a broken file where all of the sibling attributes
    # are zero so all we have to go on is the null children.
    def iter_broken_DIE_children(self, die: DIEV1):
        """Yield ``die``'s children using only sizes, nulls and ``tag_is_child``."""
        section_end = self.dwarfinfo.section_size
        # Initial offset
        offset = die.offset + die.size

        while offset < section_end:
            # Yield the child
            child = self.DIE_at_offset(offset)
            offset += child.size

            # End of child list
            if child.is_null():
                return

            # Inferred end of child list because its missing
            if die.tag != "DW_TAG_compile_unit":
                if not self.tag_is_child[child.tag]:
                    return

            yield child

            # Skip children if present.  Bounded by the section size so a
            # child run that reaches the end of the section does not try to
            # parse a DIE past it.
            while offset < section_end:
                skip = self.DIE_at_offset(offset)
                if skip.is_null():
                    offset += skip.size
                    break
                elif not self.tag_is_child[skip.tag]:
                    break
                else:
                    offset += skip.size

    def iter_DIE_children(self, die):
        """Yield ``die``'s children, recording its null terminator when found."""
        if not die.has_children:
            return

        # `cur_offset` tracks the stream offset of the next DIE to yield
        # as we iterate over our children,
        cur_offset = die.offset + die.size

        while True:
            child = self.DIE_at_offset(cur_offset)

            if child._parent is None:
                child._parent = die

            if child.is_null():
                die._terminator = child
                return

            yield child

            if not child.has_children:
                cur_offset += child.size
            elif "DW_AT_sibling" in child.attributes:
                sibling = child.attributes["DW_AT_sibling"]
                if sibling.form == 'DW_FORM_ref':
                    cur_offset = sibling.value
                else:
                    raise NotImplementedError('sibling in form %s' % sibling.form)
            else:
                # If no DW_AT_sibling attribute is provided by the producer
                # then the whole child subtree must be parsed to find its next
                # sibling. There is one zero byte representing null DIE
                # terminating children list. It is used to locate child subtree
                # bounds.

                # If children are not parsed yet, this instruction will manage
                # to recursive call of this function which will result in
                # setting of `_terminator` attribute of the `child`.
                if child._terminator is None:
                    for _ in self.iter_DIE_children(child):
                        pass

                cur_offset = child._terminator.offset + child._terminator.size

    def get_DIE_from_refaddr(self, refaddr):
        """Return the DIE at ``refaddr`` (same as ``DIE_at_offset``)."""
        return self.DIE_at_offset(refaddr)
class LineTableV1(object):
    """Line-number table for one compile unit, decoded lazily.

    ``stm`` must be positioned at the first entry when ``get_entries`` is
    first called; ``len`` and ``pc`` are the length and base address
    supplied by the caller.
    """

    def __init__(self, stm, structs, len, pc):
        self.stm = stm
        self.structs = structs
        self.len = len
        self.pc = pc
        self._decoded_entries = None
        self.header = LineTableHeader(1, (None))

    def get_entries(self):
        """Return the decoded entries, parsing them on the first call.

        Each entry is read as a 32-bit line, 16-bit column and 32-bit
        address delta.  Decoding stops at an entry whose line is 0 or once
        ``len`` bytes have been consumed.  A column of 0xffff is reported
        as ``None``.  Addresses are the running sum of the deltas starting
        from ``pc``.
        """
        if self._decoded_entries is None:
            stm = self.stm
            offset = stm.tell()
            end_offset = offset + self.len
            structs = self.structs
            entries = []
            pc = self.pc
            while offset < end_offset:
                line = struct_parse(structs.Dwarf_uint32(''), stm)
                col = struct_parse(structs.Dwarf_uint16(''), stm)
                pc_delta = struct_parse(structs.Dwarf_uint32(''), stm)
                if line == 0:
                    break
                state = LineState(True)
                state.file = 0
                state.line = line
                state.column = col if col != 0xffff else None
                state.address = pc
                entries.append(LineProgramEntry(0, False, [], state))
                pc += pc_delta
                # Track the stream position so the loop bound above is live;
                # the original never updated `offset` and could only stop on
                # a zero line or a read failure.
                offset = stm.tell()
            self._decoded_entries = entries
        return self._decoded_entries

    def __getitem__(self, name):
        """Return a header field by name (or by position for an int index)."""
        # The header is a namedtuple, which cannot be indexed by a string.
        if isinstance(name, str):
            return self.header._asdict()[name]
        return self.header[name]


class DWARFExprParserV1(object):
    """Parser for DWARF v1 location expressions."""

    def __init__(self, structs):
        self.structs = structs

    def parse_expr(self, expr):
        """Decode ``expr`` (a list of byte values) into ``DWARFExprOp`` items.

        Opcodes 0-4 and 0x80 are followed by one target-address-sized
        argument; all other opcodes take none.  Each op's ``offset`` is the
        stream position after the op and its argument have been read.
        """
        stm = BytesIO(bytelist2string(expr))
        parsed = []

        while True:
            # Get the next opcode from the stream. If nothing is left in the
            # stream, we're done.
            byte = stm.read(1)
            if len(byte) == 0:
                break

            # Decode the opcode and its name.
            op = ord(byte)
            op_name = DW_OP_opcode2name.get(op, 'OP:0x%x' % op)

            if op <= 4 or op == 0x80:
                args = [struct_parse(self.structs.Dwarf_target_addr(''), stm),]
            else:
                args = []

            parsed.append(DWARFExprOp(op=op, op_name=op_name, args=args, offset=stm.tell()))

        return parsed
class DWARFInfoV1(object):
    """DWARF v1 debug info read from an ELF file's ``.debug`` section.

    Exposes a subset of the pyelftools ``DWARFInfo`` interface; the
    features not implemented here return ``None`` or ``False``.
    """

    def __init__(self, elffile):
        section = elffile.get_section_by_name(".debug")
        section_data = section.data()
        # TODO: relocation? Compression?
        self.section_size = len(section_data)
        self.stm = BytesIO(section_data)

        # Always define `linestream`, so a file without a ".line" section
        # yields "no line program" rather than an AttributeError later.
        lsection = elffile.get_section_by_name(".line")
        self.linestream = BytesIO(lsection.data()) if lsection else None
        # Sections .debug_pubnames, .debug_aranges also in the spec -
        # those are indices into info, we ignore them

        self.config = DwarfConfig(
            little_endian=elffile.little_endian,
            default_address_size=elffile.elfclass // 8,
            machine_arch=elffile.get_machine_arch()
        )

        self.structs = DWARFStructs(
            little_endian=self.config.little_endian,
            dwarf_format=32,
            address_size=self.config.default_address_size)

    def iter_CUs(self):
        """Yield a ``CompileUnitV1`` for each top-level DIE.

        Follows each top DIE's ``DW_AT_sibling`` offset to the next unit and
        stops at a null DIE, a sibling offset of 0, or the end of the
        section.
        """
        offset = 0
        while offset < self.section_size:
            die = self.DIE_at_offset(offset, None)
            if not die.is_null():
                if die.cu is None:
                    die.cu = cu = CompileUnitV1(self, die)
                    cu.cu_offset = offset
                yield die.cu
                offset = die.attributes['DW_AT_sibling'].value
                if offset == 0:
                    break
            else:
                break

    def iter_all_dies(self):
        """Yield every entry in the section back to back, nulls included."""
        offset = 0
        while offset < self.section_size:
            die = self.DIE_at_offset(offset, None)
            yield die
            offset += die.size

    # Does not cache
    def DIE_at_offset(self, offset, cu):
        """Parse and return a fresh ``DIEV1`` at ``offset``."""
        self.stm.seek(offset, 0)
        return DIEV1(self.stm, cu, self)

    def location_lists(self):
        return None

    def line_program_for_CU(self, cu):
        """Return the ``LineTableV1`` for ``cu``, or ``None`` if unavailable.

        ``None`` is returned when the top DIE has no ``DW_AT_stmt_list`` or
        the file has no ``.line`` section.
        """
        top_DIE = cu.get_top_DIE()
        if 'DW_AT_stmt_list' not in top_DIE.attributes or self.linestream is None:
            return None

        stm = self.linestream
        stm.seek(top_DIE.attributes['DW_AT_stmt_list'].value, 0)
        structs = self.structs
        # Table header: 32-bit length, then the base address.
        table_len = struct_parse(structs.Dwarf_uint32(''), stm)
        pc = struct_parse(structs.Dwarf_target_addr(''), stm)
        return LineTableV1(stm, structs, table_len, pc)

    def range_lists(self):
        return None

    def get_aranges(self):
        return None

    def has_CFI(self):
        return False

    def has_EH_CFI(self):
        return False
+ stm.seek(top_DIE.attributes['DW_AT_stmt_list'].value, 0) + structs = self.structs + len = struct_parse(structs.Dwarf_uint32(''), stm) + pc = struct_parse(structs.Dwarf_target_addr(''), stm) + return LineTableV1(stm, structs, len, pc) + else: + return None + + def range_lists(self): + return None + + def get_aranges(self): + return None + + def has_CFI(self): + return False + + def has_EH_CFI(self): + return False + + +def parse_dwarf1(elffile): + return DWARFInfoV1(elffile) + + +def read_dwarf_info(file) -> DWARFInfoV1: + file.seek(0) + elffile = ELFFile(file, lambda s: None) + + # Retrieve the preferred loading address + load_segment = next((seg for seg in elffile.iter_segments() if seg.header.p_type == 'PT_LOAD'), None) + start_address = load_segment.header.p_vaddr if load_segment else 0 + di = None + if elffile.has_dwarf_info(): + di = elffile.get_dwarf_info() + elif section := elffile.get_section_by_name(".debug"): + if section.data_size == 0: + return None + di = parse_dwarf1(elffile) + else: + return None + + di._format = 0 + di._start_address = start_address + di._arch_code = elffile.header.e_machine + di._frames = None + return di diff --git a/ghidra_scripts/gimport/extract_info.py b/ghidra_scripts/gimport/extract_info.py new file mode 100644 index 00000000..139e5b67 --- /dev/null +++ b/ghidra_scripts/gimport/extract_info.py @@ -0,0 +1,439 @@ + +from .dwarfone import DIEV1, DWARFInfoV1, read_dwarf_info +from typing import Dict, List, assert_never +import re +from .demangle import demangle +from .dwarf import DW_AT, DW_OP, DW_TAG, DW_FMT, DW_MOD, \ + DwarfLocation, DwarfAttribute, DwarfSubscriptDataItem +import pathlib +from .gtypes import GType, GFundType, GArrayType, GPointerType, GStructType, \ + GGlobal, GGlobalSubroutine, GGlobalSubroutineParameter, GGlobalVariable, \ + GMember, GSubroutineType +import os + +GET_ORIGINAL_ADDRESS = True + + +REPO_ROOT = pathlib.Path(__file__).parent.parent.parent.resolve() + + +def get_die_name(die: DIEV1) -> str: + if 
def get_type_class(die: DIEV1):
    """Return the type-class string for a struct-like type DIE.

    Structure and class tags both map to ``"struct"`` (they are deliberately
    not differentiated), unions map to ``"union"`` and enumerations to
    ``"enum"``.

    Raises:
        AssertionError: if the DIE's tag is none of the tags above.
    """
    tag = die.tag
    if tag == DW_TAG.structure_type or tag == DW_TAG.class_type:
        # Don't differentiate
        return "struct"
    if tag == DW_TAG.union_type:
        return "union"
    if tag == DW_TAG.enumeration_type:
        return "enum"
    # Raised explicitly instead of `assert False` so the check is not
    # stripped (and None silently returned) when Python runs with -O.
    raise AssertionError(f"Unknown type class {die.tag}")
def read_symbol_addresses(path=None) -> Dict[str, int]:
    """Read ``name = section:0xADDR;`` entries from a symbols file.

    Args:
        path: File to read.  Defaults to ``config/GQPE78/symbols.txt`` under
            the repository root.

    Returns:
        Mapping of symbol name to address.  A trailing ``$nnn`` suffix (used
        for duplicate local symbols of the same name) is stripped first, and
        any name that then occurs more than once is left out entirely because
        there is no way to disambiguate it.
    """
    if path is None:
        path = REPO_ROOT / "config/GQPE78/symbols.txt"

    line_pattern = re.compile(r"([^=]+) = ([^:]+):0x([0-9a-fA-F]+);")
    suffix_pattern = re.compile(r"\$[0-9]+$")

    symbols = {}
    had_multiple = set()
    with open(path) as f:
        for line in f:
            # Parse parts out of the line; skip anything that is not a
            # symbol entry (e.g. blank lines) instead of crashing.
            parts = line_pattern.match(line)
            if parts is None:
                continue
            sym_name = parts.group(1)
            sym_address = int(parts.group(3), base=16)

            # strip $nnn ending used for duplicate local symbols of the same name
            suffix = suffix_pattern.search(sym_name)
            if suffix:
                sym_name = sym_name[:suffix.start()]

            # Ignore addresses for duplicated symbols that
            # we don't have any way to disambiguate.
            if sym_name in had_multiple:
                # Third and later occurrences must stay dropped too.
                continue
            if sym_name in symbols:
                del symbols[sym_name]
                had_multiple.add(sym_name)
            else:
                symbols[sym_name] = sym_address
    return symbols
def parse_member(self, die: DIEV1) -> GMember:
    """Build a GMember from a member DIE.

    The member gets the DIE's name and resolved type.  Its location, bit
    offset and bit size are only filled in when the DIE carries the
    corresponding attribute; otherwise they are left as GMember created them.
    """
    member = GMember(get_die_name(die))
    member.type = self.parse_type_from_die(die)

    attrs = die.attributes
    if DW_AT.location in attrs:
        member.location.parse(attrs[DW_AT.location].value, 0)

    # Optional bit-field attributes are copied straight onto the member.
    optional_fields = (
        (DW_AT.bit_offset, "bit_offset"),
        (DW_AT.bit_size, "bit_size"),
    )
    for attr_id, field_name in optional_fields:
        if attr_id in attrs:
            setattr(member, field_name, attrs[attr_id].value)

    return member
def create_global_functions(self, die: DIEV1):
    """Record the subroutine described by *die* in ``self.globals``.

    The function's address comes from the symbols table when
    ``GET_ORIGINAL_ADDRESS`` is set (functions without an entry there are
    skipped), otherwise from the DIE's ``low_pc``.  The return type and
    formal parameters are read from the DIE; when the mangled name can be
    demangled, the demangled name and parameter types take precedence.

    Raises:
        AssertionError: if the DIE has a child that is neither a formal
            parameter nor a local variable.
    """
    global_subroutine = GGlobalSubroutine()
    global_subroutine.name = get_die_name(die)
    # Not every subroutine DIE has a mangled name; fall back to the plain
    # name the same way create_global_variables does instead of raising
    # KeyError.
    if DW_AT.mangled_name in die.attributes:
        global_subroutine.mangled_name = die.attributes[DW_AT.mangled_name].value.decode("UTF-8")
    else:
        global_subroutine.mangled_name = global_subroutine.name
    self.used_mangled_names.add(global_subroutine.mangled_name)

    if GET_ORIGINAL_ADDRESS:
        if global_subroutine.mangled_name in self.symbols:
            global_subroutine.address = self.symbols[global_subroutine.mangled_name]
        else:
            # No original address. New functions we added in our implementation.
            return
    else:
        global_subroutine.address = die.attributes[DW_AT.low_pc].value

    # DWARF handles void return type as not present but we want it
    # to be present as void type for Ghidra.
    global_subroutine.type = self.parse_type_from_die(die)
    if global_subroutine.type is None:
        global_subroutine.type = GFundType(20)

    # Parse parameters
    for child in die.iter_children():
        if child.tag == DW_TAG.formal_parameter:
            param_name = get_die_name(child)
            param_type = self.parse_type_from_die(child)
            loc = DwarfLocation()
            # A location is optional; keep an empty DwarfLocation when absent.
            if DW_AT.location in child.attributes:
                loc.parse(child.attributes[DW_AT.location].value, 0)
            global_subroutine.params.append(GGlobalSubroutineParameter(param_name, param_type, loc))
        elif child.tag == DW_TAG.local_variable:
            pass
        else:
            # Previously print + exit(0), which terminated the whole host
            # interpreter while reporting success.  Fail loudly instead.
            raise AssertionError(f"Unknown subroutine child tag: {child.offset} {child.tag}")

    if result := demangle(global_subroutine.mangled_name, lambda ident: self.type_by_name.get(ident)):
        (name, param_types) = result
        global_subroutine.name = name
        same_count = len(param_types) == len(global_subroutine.params)
        for i, param_type in enumerate(param_types):
            if i < len(global_subroutine.params):
                global_subroutine.params[i].type = param_type
                # If we had a different count the names are useless
                if not same_count:
                    global_subroutine.params[i].name = f"param{i+1}"
            else:
                param = GGlobalSubroutineParameter(f"param{i+1}", param_type, DwarfLocation())
                global_subroutine.params.append(param)

    self.globals.append(global_subroutine)
def add_enum_members(self, die: DIEV1):
    """Fill in the byte size and enumerators of an already-created enum type.

    The DIE's element list is a packed sequence of entries, each a 4-byte
    big-endian value followed by a NUL-terminated UTF-8 name.  Every entry is
    appended to the type's ``enum_values`` as ``(name, value)``.
    """
    target: GStructType = self.offset_to_type[die.offset]
    target.byte_size = die.attributes[DW_AT.byte_size].value
    raw: List[int] = die.attributes[DW_AT.element_list].value

    cursor = 0
    total = len(raw)
    while cursor < total:
        value = int.from_bytes(raw[cursor:cursor + 4], byteorder='big')

        # The name runs from just after the value up to the next zero byte.
        name_start = cursor + 4
        name_end = name_start
        while raw[name_end] != 0:
            name_end += 1
        label = bytes(raw[name_start:name_end]).decode("UTF-8")

        target.enum_values.append((label, value))
        # Step over the terminating zero to the next entry.
        cursor = name_end + 1
def do_debug_build():
    """Run a one-off ``--debug`` configure and ninja build in the repo root.

    Any existing ``build.ninja`` is moved aside first and put back afterwards
    (even if configuring or building fails), so nobody is left stuck with a
    debug build configuration that doesn't match when that's not something
    they're used to encountering.

    Raises:
        subprocess.CalledProcessError: if ``configure.py --debug`` fails.
    """
    import subprocess

    # Save the old ninja file to restore after making the debug build.
    ninja_file = REPO_ROOT / "build.ninja"
    ninja_save = ninja_file.with_suffix(".bak")
    had_ninja_file = ninja_file.exists()
    if had_ninja_file:
        if ninja_save.exists():
            os.remove(ninja_save)
        os.rename(ninja_file, ninja_save)

    print("Creating a debug build to extract info from. This will take a moment.")
    try:
        subprocess.run(["python", "configure.py", "--debug"], cwd=REPO_ROOT, check=True, capture_output=True)
        # The ninja result is deliberately not checked (check=False).
        subprocess.run(["ninja"], cwd=REPO_ROOT, check=False, capture_output=True)
    finally:
        # Drop the debug ninja file (it may not exist if configure failed)
        # and restore the old one if they had one.
        if ninja_file.exists():
            os.remove(ninja_file)
        if had_ninja_file:
            os.rename(ninja_save, ninja_file)
class GSubroutineType(GType):
    """Type of a subroutine: a ``type`` plus an ordered list of parameter types.

    NOTE(review): ``type`` is presumably the return type - confirm against
    the code that fills it in.
    """

    def __init__(self):
        self.type: GType = None
        self.parameters: List[GType] = []

    def __str__(self):
        # Build the parameter list first: reusing double quotes inside a
        # double-quoted f-string is a SyntaxError before Python 3.12.
        params_str = ",".join(str(param) for param in self.parameters)
        return f"SubroutineType({self.type}({params_str}))"
def generate_datatype_for_gtype(program: ProgramDB, gtype: GType) -> DataType:
    """Create (or look up) the Ghidra data type corresponding to *gtype*.

    Dispatches on the concrete GType subclass:

    * GFundType   -> a typedef named after the fundamental type (``void*``
                     is special-cased as a pointer to void)
    * GPointerType -> pointer to the converted pointee type
    * GArrayType  -> array of the converted element type
    * GSubroutineType -> not implemented yet, returns None
    * GStructType -> enum / union / struct looked up by ``/<name>`` in the
                     program's data type manager, created if missing

    Returns None for subroutine types and for anything that matches none of
    the branches (including an unrecognised ``type_class``).
    """
    if isinstance(gtype, GFundType):
        if gtype.id == 13:
            # Special case for void* because DWARF considers it a fundamental type
            # but Ghidra considers it a pointer to a fundamental type.
            return program.getDataTypeManager().addDataType(
                PointerDataType(VoidDataType()), DataTypeConflictHandler.KEEP_HANDLER
            )
        else:
            # fund_type_info is a (typedef name, Ghidra base type) pair.
            # NOTE(review): .get() yields None for ids missing from
            # fund_types_by_id, which would fail on the subscript below.
            fund_type_info = fund_types_by_id.get(gtype.id)
            base_type = program.getDataTypeManager().addDataType(fund_type_info[1], DataTypeConflictHandler.KEEP_HANDLER)
            return program.getDataTypeManager().addDataType(
                TypedefDataType(fund_type_info[0], base_type),
                DataTypeConflictHandler.KEEP_HANDLER
            )
    elif isinstance(gtype, GPointerType):
        # Goes through the caching wrapper so the pointee is converted once.
        base_type = get_datatype_for_gtype(program, gtype.type)
        return program.getDataTypeManager().addDataType(
            PointerDataType(base_type), DataTypeConflictHandler.KEEP_HANDLER
        )
    elif isinstance(gtype, GArrayType):
        # Assumes exactly two subscript items: the first carries the index
        # bounds, the second the element type - TODO confirm for
        # multi-dimensional arrays.
        index_part = gtype.subscripts[0]
        type_part = gtype.subscripts[1]
        assert type_part.highBound.location is None, "Type part should not have bounds"
        assert type_part.lowBound.location is None, "Type part should not have bounds"
        if type_part.resolved_type is None:
            # Element type unknown: fall back to a plain pointer.
            return PointerDataType()
        else:
            return program.getDataTypeManager().addDataType(
                ArrayDataType(
                    get_datatype_for_gtype(program, type_part.resolved_type),
                    # highBound is used as the last valid index, hence +1.
                    index_part.highBound.constant + 1
                ),
                DataTypeConflictHandler.KEEP_HANDLER
            )
    elif isinstance(gtype, GSubroutineType):
        # TODO:
        return None
    elif isinstance(gtype, GStructType):
        # All three struct-like classes are looked up by name at the root
        # category and only created when no such type exists yet.
        if gtype.type_class == "enum":
            path = f"/{gtype.name}"
            existing_type: EnumDataType = program.getDataTypeManager().getDataType(path)
            if existing_type is None:
                existing_type = program.getDataTypeManager().addDataType(
                    EnumDataType(gtype.name, gtype.byte_size), DataTypeConflictHandler.KEEP_HANDLER
                )
            return existing_type
        elif gtype.type_class == "union":
            path = f"/{gtype.name}"
            existing_type: UnionDataType = program.getDataTypeManager().getDataType(path)
            if existing_type is None:
                existing_type = program.getDataTypeManager().addDataType(
                    UnionDataType(gtype.name), DataTypeConflictHandler.KEEP_HANDLER
                )
            return existing_type
        elif gtype.type_class == "struct":
            path = f"/{gtype.name}"
            existing_type: StructureDataType = program.getDataTypeManager().getDataType(path)
            if existing_type is None:
                existing_type = program.getDataTypeManager().addDataType(
                    StructureDataType(gtype.name, gtype.byte_size), DataTypeConflictHandler.KEEP_HANDLER
                )
            # Also resizes a struct that already existed in the program.
            existing_type.setLength(gtype.byte_size)
            return existing_type
def import_info(program: ProgramDB, parsed_info: ExtractedInfo):
    """Apply everything in *parsed_info* to the Ghidra *program*.

    Steps, in order: disambiguate same-named structs of different sizes,
    copy size and fields from complete definitions onto forward
    declarations, create all fundamental and struct data types, fill the
    struct contents, then import globals and leftover labels.
    """
    structs = parsed_info.structs

    # Figure out what structs need disambiguation because there are two of
    # them with the same name but different sizes.  The first nonzero size
    # seen for a name is the one later definitions are compared against.
    first_size_by_name = {}
    ambiguous_names = set()
    for gtype in structs:
        size = gtype.byte_size
        if size <= 0:
            continue
        if first_size_by_name.setdefault(gtype.name, size) != size:
            ambiguous_names.add(gtype.name)

    # Rename types needing disambiguation by appending their hex size
    for gtype in structs:
        if gtype.name in ambiguous_names:
            gtype.name = f"{gtype.name}{hex(gtype.byte_size)}"

    # Record which types are nonzero size so we can eliminate forward
    # declares (the last definition with a given name wins).
    complete_by_name = {gtype.name: gtype for gtype in structs if gtype.byte_size != 0}

    # Update forward declares to have the same contents
    for gtype in structs:
        complete = complete_by_name.get(gtype.name)
        if complete is None:
            # Some library types are only forward declared
            continue
        gtype.byte_size = complete.byte_size
        gtype.fields = complete.fields

    for fund_id in fund_types_by_id:
        get_datatype_for_gtype(program, GFundType(fund_id))

    # Make the structs exist to avoid problems with circular references
    for struct in structs:
        get_datatype_for_gtype(program, struct)

    # Now that they all exist, fill the structs
    fill_struct_types(program, parsed_info)

    # Finally, handle the globals
    for glob in parsed_info.globals:
        add_global(program, glob)

    # Add any leftover labels
    add_labels(program, parsed_info)