
# This file is part of Mach-O Loader for CLE.
# Contributed December 2016 by Fraunhofer SIT (https://www.sit.fraunhofer.de/en/).
import ctypes
import logging
import struct
import sys
import typing
from collections import defaultdict
from io import BufferedReader, BytesIO
from os import SEEK_CUR, SEEK_SET
from typing import DefaultDict, Dict, List, Optional, Tuple, Union

import archinfo
from sortedcontainers import SortedKeyList

from cle.backends.backend import AT, Backend, register_backend
from cle.backends.macho.binding import BindingHelper, MachOPointerRelocation, MachOSymbolRelocation, read_uleb
from cle.backends.regions import Regions
from cle.errors import CLECompatibilityError, CLEInvalidBinaryError, CLEOperationError

from .macho_enums import LoadCommands as LC
from .macho_enums import MachoFiletype, MH_flags
from .section import MachOSection
from .segment import MachOSegment
from .structs import (
    DYLD_CHAINED_PTR_START_NONE,
    ChainedFixupPointerOnDisk,
    DyldChainedPtrFormats,
    DyldImportStruct,
    FileOffset,
    FilePointer,
    MemoryPointer,
    dyld_chained_fixups_header,
    dyld_chained_starts_in_segment,
)
from .symbol import AbstractMachOSymbol, DyldBoundSymbol, SymbolTableSymbol

log = logging.getLogger(name=__name__)

__all__ = ("MachO", "MachOSection", "MachOSegment", "SymbolList")


# pylint: disable=abstract-method
class SymbolList(SortedKeyList):
    """
    Special data structure that extends SortedKeyList to allow looking up a Mach-O symbol
    by name and library ordinal quickly, without having to iterate over the whole list
    """

    _symbol_cache: DefaultDict[Tuple[str, int], List[AbstractMachOSymbol]]

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self._symbol_cache = defaultdict(list)

    def add(self, value: AbstractMachOSymbol):
        super().add(value)
        self._symbol_cache[(value.name, value.library_ordinal)].append(value)

    def get_by_name_and_ordinal(self, name: str, ordinal: int, include_stab=False) -> List[AbstractMachOSymbol]:
        if include_stab:
            return self._symbol_cache[(name, ordinal)]
        return [symbol for symbol in self._symbol_cache[(name, ordinal)] if not symbol.is_stab]


# pylint: enable=abstract-method
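
# The cache above trades memory for O(1) lookups keyed on (name, library_ordinal).
# A minimal usage sketch (illustrative only; it is not called anywhere in this
# backend, and the entries come from MachO._parse_symbols / _parse_dyld_imports):
def _example_symbol_list_lookup(symbols: "SymbolList"):
    # all non-stab symbols named "_malloc" bound to the library with ordinal 1
    matches = symbols.get_by_name_and_ordinal("_malloc", 1)
    # the same lookup, but also including debugging (stab) entries
    with_stabs = symbols.get_by_name_and_ordinal("_malloc", 1, include_stab=True)
    return matches, with_stabs
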

class MachO(Backend):
    """
    Mach-O binaries for CLE
    -----------------------

    The Mach-O format is notably different from other formats. Specifically:

    - Sections are always part of a segment, so `self.sections` will be empty.
    - Symbols cannot be categorized like in ELF.
    - Symbol resolution must be handled by the binary.
    - Rebasing in dyld is implemented by adding a small slide to addresses inside the binary,
      instead of changing the base address of the binary. Consequently, the addresses are
      absolute rather than relative. CLE requires relative addresses, leading to numerous
      `AT.from_lva().to_rva()` calls in this backend.
    """

    is_default = True  # Tell CLE to automatically consider using the MachO backend

    MH_MAGIC_64 = 0xFEEDFACF
    MH_CIGAM_64 = 0xCFFAEDFE
    MH_MAGIC = 0xFEEDFACE
    MH_CIGAM = 0xCEFAEDFE

    ncmds: int
    sizeofcmds: int

    def __init__(self, *args, **kwargs):
        log.warning("The Mach-O backend is not well-supported. Good luck!")
        super().__init__(*args, **kwargs)

        self.symbols = SymbolList(key=self._get_symbol_relative_addr)

        self.struct_byteorder = None  # holds byteorder for struct.unpack(...)
        self._mapped_base = None  # temporary holder for the mapped base derived during loading
        self.cputype = None
        self.cpusubtype = None
        self.filetype: int = None
        self.flags = None  # binary flags
        self.imported_libraries = ["Self"]  # ordinal 0 = SELF_LIBRARY_ORDINAL
        self.sections_by_ordinal = [None]  # ordinal 0 = None == Self
        # note: exports is currently a raw and unprocessed datastructure.
        # If we intend to use it, we must first upgrade it to a class or somesuch
        self.exports_by_name = {}
        self.entryoff = None
        self.unixthread_pc = None
        self.os = "macos"
        self.lc_data_in_code = []  # data from LC_DATA_IN_CODE (if encountered). Format: (offset, length, kind)
        self.lc_function_starts: Optional[List[int]] = None  # data from LC_FUNCTION_STARTS (if encountered)
        self.mod_init_func_pointers = []  # may be THUMB interworking
        self.mod_term_func_pointers = []  # may be THUMB interworking
        self.export_blob: Optional[bytes] = None  # exports trie
        self.binding_blob: Optional[bytes] = None  # binding information
        self.lazy_binding_blob: Optional[bytes] = None  # lazy binding information
        self.weak_binding_blob: Optional[bytes] = None  # weak binding information
        self.rebase_blob: Optional[bytes] = None  # rebasing information
        self.symtab_offset = None  # offset to the symtab
        self.symtab_nsyms = None  # number of symbols in the symtab
        self.binding_done = False  # if True, binding was already done and do_bind will be a no-op
        self.strtab: Optional[bytes] = None
        self._indexed_strtab: Optional[Dict[int, bytes]] = None
        self._dyld_chained_fixups_offset: Optional[int] = None
        self._dyld_imports: List[AbstractMachOSymbol] = []

        # For some analyses the insertion order of the symbols is relevant and needs to be kept.
        # This has to be separate from self.symbols because the latter is sorted by address
        self._ordered_symbols: List[AbstractMachOSymbol] = []

        # The minimum version encoded by the LC_BUILD_VERSION command
        self._minimum_version: Optional[Tuple[int, int, int]] = None

        # Begin parsing the file
        try:
            binary_file = self._binary_stream
            # get the magic value and determine endianness
            self.struct_byteorder = self._detect_byteorder(struct.unpack("=I", binary_file.read(4))[0])

            # parse the mach header (ignoring all irrelevant fields)
            (_, self.cputype, self.cpusubtype, self.filetype, self.ncmds, self.sizeofcmds, self.flags) = self._unpack(
                "7I", binary_file, 0, 28
            )

            # Libraries are always implicitly PIC
            self.pic = bool(self.flags & MH_flags.MH_PIE) or bool(self.filetype & MachoFiletype.MH_DYLIB)

            if not bool(self.flags & MH_flags.MH_TWOLEVEL):  # ensure MH_TWOLEVEL
                log.error(
                    "Binary is not using MH_TWOLEVEL namespacing. "
                    "This isn't properly implemented yet and will degrade results in unpredictable ways. "
                    "Please open an issue if you encounter this with a binary you can share"
                )

            # determine the architecture
            arch_ident = self._detect_arch_ident()
            if not arch_ident:
                raise CLECompatibilityError(f"Unsupported architecture: 0x{self.cputype:X}:0x{self.cpusubtype:X}")

            # Create archinfo
            # Note that this should be customized for Apple ABI (TODO)
            self.set_arch(archinfo.arch_from_id(arch_ident, endness="lsb" if self.struct_byteorder == "<" else "msb"))

            # Determine the base address the binary was linked against
            # and set the values for the Backend and Loader accordingly
            if self.pic and self.filetype == MachoFiletype.MH_EXECUTE:
                assert self.is_main_bin, "A file of type MH_EXECUTE should be the main binary; this should not happen"
                # A position-independent main binary would later be loaded at 0x400000, which isn't legal for Mach-O.
                # Also, its segment vaddrs are relative to 0x100000000, so we set this as the linked base,
                # and the MachO backend code uses the AddressTranslator to translate linked addresses to relative
                # ones. In theory this is the place where the slide for rebasing should be added, but this isn't
                # supported yet
                if self.arch.bits == 64:
                    self.linked_base = self.mapped_base = 2**32
                elif self.arch.bits == 32:
                    self.linked_base = self.mapped_base = 0x4000
            elif self.filetype == MachoFiletype.MH_DYLIB and self.is_main_bin:
                # the segments of dylibs are just relative to the load address, i.e. the lowest segment addr is 0.
                # We need to set the load address to something, because otherwise the loader will try to map the
                # file to 0x400000, which is technically illegal for Mach-O because of PAGEZERO
                #
                # The problem is that libraries also tend to have relative pointers (e.g. inside ObjC Metadata),
                # which are rebased by parsing the rebase_blob, which isn't supported yet (but coming soon),
                # so we set the base addr to 0 to make them work out without having to deal with this.
                # IDA and Ghidra both seem to handle it this way too.
                # AFAIU this isn't a problem anymore with iOS 15+ binaries that use the new binding fixups,
                # but for now we just load all libraries that are loaded as the main object at address 0.
                #
                # We can't set the linked base to request this, because the MachO backend implementation
                # uses this to recalculate the addresses
                self._custom_base_addr = 0
            elif self.filetype == MachoFiletype.MH_DYLIB and not self.is_main_bin:
                # A library is loaded as a dependency; this is fine, the loader will map it to somewhere above
                # the main binary, so we don't need to do anything
                pass
            else:
                # This case is not explicitly supported yet.
                # There are various other MachoFiletypes, which might have different quirks in their loading
                raise CLECompatibilityError(
                    f"Unsupported Mach-O file type: {MachoFiletype(self.filetype)}. "
                    "Please open an issue if you need support for this"
                )

            # Start reading load commands
            lc_offset = (7 if self.arch.bits == 32 else 8) * 4
            self._parse_load_commands(lc_offset)
        except OSError as e:
            log.exception(e)
            raise CLEOperationError(e) from e

        # File is read, begin populating internal fields
        log.info("Parsing exports")
        self._parse_exports()

        if "__mh_execute_header" in self.exports_by_name:
            assert self.exports_by_name["__mh_execute_header"][1] == self.linked_base, (
                "This binary doesn't have a proper __mh_execute_header export, "
                "this breaks assumptions, please report this"
            )

        self._resolve_entry()

        log.info("Parsing %s symbols", self.symtab_nsyms)
        self._parse_symbols(binary_file)

        log.info("Parsing module init/term function pointers")
        self._parse_mod_funcs()

        if self._dyld_chained_fixups_offset:
            log.info("Parsing dyld bound symbols and fixup chains (ios15 and above)")
            self._parse_dyld_chained_fixups()
        else:
            log.info("Parsing binding bytecode stream")
            self.do_binding()
    @property
    def min_addr(self):
        return self.mapped_base

    @classmethod
    def check_compatibility(cls, spec, obj):
        # TODO: Check properly, but for now libs are just used via force load libs anyway
        return True

    def _parse_load_commands(self, lc_offset):
        # Possible optimization: Remove all unnecessary calls to seek()
        # Load commands have a common structure: the first 4 bytes identify the command by a magic number,
        # the second 4 bytes determine the command's size. Everything after this generic "header" is
        # command-specific, which makes parsing the commands easy.
        # The documentation for Mach-O is at
        # http://opensource.apple.com//source/xnu/xnu-1228.9.59/EXTERNAL_HEADERS/mach-o/loader.h
        binary_file = self._binary_stream
        count = 0
        offset = lc_offset
        while count < self.ncmds and (offset - lc_offset) < self.sizeofcmds:
            count += 1
            (cmd, size) = self._unpack("II", binary_file, offset, 8)

            # check for segments that interest us
            if cmd in [LC.LC_SEGMENT, LC.LC_SEGMENT_64]:  # LC_SEGMENT, LC_SEGMENT_64
                log.debug("Found LC_SEGMENT(_64) @ %#x", offset)
                self._load_segment(binary_file, offset)
            elif cmd == LC.LC_SYMTAB:  # LC_SYMTAB
                log.debug("Found LC_SYMTAB @ %#x", offset)
                self._load_symtab(binary_file, offset)
            elif cmd in [LC.LC_DYLD_INFO, LC.LC_DYLD_INFO_ONLY]:  # LC_DYLD_INFO(_ONLY)
                log.debug("Found LC_DYLD_INFO(_ONLY) @ %#x", offset)
                self._load_dyld_info(binary_file, offset)
            elif cmd in [LC.LC_LOAD_DYLIB, LC.LC_LOAD_WEAK_DYLIB, LC.LC_REEXPORT_DYLIB]:
                log.debug("Found LC_*_DYLIB @ %#x", offset)
                self._load_dylib_info(binary_file, offset)
            elif cmd == LC.LC_RPATH:  # LC_RPATH
                log.debug("Found LC_RPATH @ %#x", offset)
            elif cmd == LC.LC_MAIN:  # LC_MAIN
                log.debug("Found LC_MAIN @ %#x", offset)
                self._load_lc_main(binary_file, offset)
            elif cmd == LC.LC_UNIXTHREAD:  # LC_UNIXTHREAD
                log.debug("Found LC_UNIXTHREAD @ %#x", offset)
                self._load_lc_unixthread(binary_file, offset)
            elif cmd == LC.LC_FUNCTION_STARTS:  # LC_FUNCTION_STARTS
                log.debug("Found LC_FUNCTION_STARTS @ %#x", offset)
                self._load_lc_function_starts(binary_file, offset)
            elif cmd == LC.LC_DATA_IN_CODE:  # LC_DATA_IN_CODE
                log.debug("Found LC_DATA_IN_CODE @ %#x", offset)
                self._load_lc_data_in_code(binary_file, offset)
            elif cmd in [LC.LC_ENCRYPTION_INFO, LC.LC_ENCRYPTION_INFO_64]:  # LC_ENCRYPTION_INFO(_64)
                log.debug("Found LC_ENCRYPTION_INFO @ %#x", offset)
                # self._assert_unencrypted(binary_file, offset)
            elif cmd in [LC.LC_DYLD_CHAINED_FIXUPS]:
                log.info("Found LC_DYLD_CHAINED_FIXUPS @ %#x", offset)
                (_, _, dataoff, datasize) = self._unpack("4I", binary_file, offset, 16)
                self._dyld_chained_fixups_offset: int = dataoff
            elif cmd in [LC.LC_BUILD_VERSION]:
                log.info("Found LC_BUILD_VERSION @ %#x", offset)
                (_, _, _platform, minos, _sdk, _ntools) = self._unpack("6I", binary_file, offset, 6 * 4)
                patch = (minos >> (8 * 0)) & 0xFF
                minor = (minos >> (8 * 1)) & 0xFF
                major = (minos >> (8 * 2)) & 0xFFFF
                self._minimum_version = (major, minor, patch)
                log.info("Found minimum version %s", ".".join([str(i) for i in self._minimum_version]))
            elif cmd in [LC.LC_DYLD_EXPORTS_TRIE]:
                log.info("Found LC_DYLD_EXPORTS_TRIE @ %#x", offset)
                (_, _, dataoff, datasize) = self._unpack("4I", binary_file, offset, 16)
                self.export_blob = self._read(binary_file, dataoff, datasize)
            elif cmd in [LC.LC_DYSYMTAB]:
                # TODO: This is probably relevant for library loading and symbols, but it isn't clear how yet
                pass
            else:
                try:
                    command_name = LC(cmd)
                    log.warning("%s is not handled yet", str(command_name))
                except ValueError:
                    log.error("Command %s is not recognized!", hex(cmd))

            # update bookkeeping
            offset += size

        # Assertion to catch malformed binaries - YES this is needed!
        if count < self.ncmds or (offset - lc_offset) < self.sizeofcmds:
            raise CLEInvalidBinaryError(
                "Assertion triggered: {} < {} or {} < {}".format(
                    count, self.ncmds, (offset - lc_offset), self.sizeofcmds
                )
            )
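
    # Illustrative sketch of the iteration pattern _parse_load_commands implements,
    # written as a standalone helper (not called by the backend): every load command
    # begins with a (cmd, cmdsize) pair of uint32s, and cmdsize includes this header,
    # so adding it to the offset advances to the next command.
    @staticmethod
    def _example_load_command_walk(data: bytes, lc_offset: int, ncmds: int):
        offset = lc_offset
        commands = []
        for _ in range(ncmds):
            cmd, size = struct.unpack_from("<II", data, offset)  # assumes little-endian
            commands.append((offset, cmd, size))
            offset += size  # cmdsize covers the whole command, header included
        return commands
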

    @classmethod
    def is_compatible(cls, stream):
        stream.seek(0)
        identstring = stream.read(0x5)
        stream.seek(0)
        if (
            identstring.startswith(struct.pack("I", MachO.MH_MAGIC_64))
            or identstring.startswith(struct.pack("I", MachO.MH_CIGAM_64))
            or identstring.startswith(struct.pack("I", MachO.MH_MAGIC))
            or identstring.startswith(struct.pack("I", MachO.MH_CIGAM))
        ):
            return True
        return False

    def is_thumb_interworking(self, address):
        """Returns True if the given address is a THUMB interworking address"""
        # Note: Untested
        return self.arch.bits != 64 and address & 1

    def decode_thumb_interworking(self, address):
        """Decodes a THUMB interworking address"""
        # Note: Untested
        return address & ~1 if self.is_thumb_interworking(address) else address
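
    # On 32-bit ARM, bit 0 of a code address selects the instruction set: an odd
    # address such as 0x1001 means "execute at 0x1000 in THUMB mode". Hypothetical
    # doctest-style illustration, assuming `binary` is a loaded 32-bit MachO object:
    #
    #     >>> bool(binary.is_thumb_interworking(0x1001))
    #     True
    #     >>> hex(binary.decode_thumb_interworking(0x1001))
    #     '0x1000'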

    def _parse_mod_funcs(self):
        log.debug("Parsing module init/term function pointers")

        fmt = "Q" if self.arch.bits == 64 else "I"
        size = 8 if self.arch.bits == 64 else 4

        # factoring out common code
        def parse_mod_funcs_internal(s, target):
            for i in range(s.vaddr, s.vaddr + s.memsize, size):
                rel_address = AT.from_lva(i, self).to_rva()
                addr = self._unpack_with_byteorder(fmt, self.memory.load(rel_address, size))[0]
                log.debug("Addr: %#x", addr)
                target.append(addr)

        for seg in self.segments:
            seg: Union[MachOSection, MachOSegment]
            for sec in seg.sections:
                if sec.type == 0x9:  # S_MOD_INIT_FUNC_POINTERS
                    log.debug("Section %s contains init pointers", sec.sectname)
                    parse_mod_funcs_internal(sec, self.mod_init_func_pointers)
                elif sec.type == 0xA:  # S_MOD_TERM_FUNC_POINTERS
                    log.debug("Section %s contains term pointers", sec.sectname)
                    parse_mod_funcs_internal(sec, self.mod_term_func_pointers)

        log.debug("Done parsing module init/term function pointers")

    def find_segment_by_name(self, name):
        for s in self.segments:
            s: Union[MachOSection, MachOSegment]
            if s.segname == name:
                return s
        return None

    def _resolve_entry(self):
        if self.entryoff:
            self._entry = self.linked_base + self.entryoff
        elif self.unixthread_pc:
            self._entry = self.unixthread_pc
        else:
            log.warning("No entry point found")
            self._entry = 0

    @staticmethod
    def _read(fp: BufferedReader, offset: int, size: int) -> bytes:
        """
        Simple read abstraction, reads size bytes from offset in file

        :param offset: Offset to seek() to
        :param size: number of bytes to be read
        :return: string of bytes or "" for EOF
        """
        fp.seek(offset)
        return fp.read(size)

    def _unpack_with_byteorder(self, fmt, data) -> Tuple[typing.Any, ...]:
        """
        Prepends self.struct_byteorder to fmt to ensure usage of the correct byteorder

        :return: struct.unpack(self.struct_byteorder + fmt, data)
        """
        return struct.unpack(self.struct_byteorder + fmt, data)

    def _unpack(self, fmt: str, fp: BufferedReader, offset: FilePointer, size: int) -> Tuple[typing.Any, ...]:
        """Convenience wrapper around _read and _unpack_with_byteorder"""
        return self._unpack_with_byteorder(fmt, self._read(fp, offset, size))

    @staticmethod
    def _detect_byteorder(magic):
        """Determines the binary's byteorder"""
        log.debug("Magic is %#x", magic)

        host_is_little = sys.byteorder == "little"

        if host_is_little:
            if magic in [MachO.MH_MAGIC_64, MachO.MH_MAGIC]:
                log.debug("Detected little-endian")
                return "<"
            elif magic in [MachO.MH_CIGAM, MachO.MH_CIGAM_64]:
                log.debug("Detected big-endian")
                return ">"
            else:
                log.debug("Not a Mach-O file")
                raise CLECompatibilityError()
        else:
            if magic in [MachO.MH_MAGIC_64, MachO.MH_MAGIC]:
                log.debug("Detected big-endian")
                return ">"
            elif magic in [MachO.MH_CIGAM_64, MachO.MH_CIGAM]:
                log.debug("Detected little-endian")
                return "<"
            else:
                log.debug("Not a Mach-O file")
                raise CLECompatibilityError()

    def do_binding(self):
        # Perform binding
        if self.binding_done:
            log.warning("Binding already done, reset self.binding_done to override if you know what you are doing")
            return

        bh = BindingHelper(self)  # TODO: Make this configurable
        bh.do_normal_bind(self.binding_blob)
        bh.do_lazy_bind(self.lazy_binding_blob)
        bh.do_rebases(self.rebase_blob)
        if self.weak_binding_blob is not None and len(self.weak_binding_blob) > 0:
            log.info(
                "Found weak binding blob. According to current state of knowledge, weak binding "
                "is only sensible if multiple binaries are involved and is thus skipped."
            )

        self.binding_done = True

    def _parse_exports(self):
        """
        Parses the exports trie
        """
        log.debug("Parsing exports")
        blob = self.export_blob
        if blob is None:
            log.debug("Parsing exports done: No exports found")
            return

        # Note: some of these fields are currently not used; keep them in to make used variables explicit
        index = 0
        sym_str = b""
        # (index, str)
        nodes_to_do = [(0, b"")]
        blob_f = BytesIO(blob)  # easier to handle seeking here

        # constants
        # FLAGS_KIND_MASK = 0x03
        # FLAGS_KIND_REGULAR = 0x00
        # FLAGS_KIND_THREAD_LOCAL = 0x01
        # FLAGS_WEAK_DEFINITION = 0x04
        FLAGS_REEXPORT = 0x08
        FLAGS_STUB_AND_RESOLVER = 0x10

        try:
            while True:
                index, sym_str = nodes_to_do.pop()
                log.debug("Processing node %#x %r", index, sym_str)
                blob_f.seek(index, SEEK_SET)
                info_len = struct.unpack("B", blob_f.read(1))[0]
                if info_len > 127:
                    # special case
                    blob_f.seek(-1, SEEK_CUR)
                    tmp = read_uleb(blob, blob_f.tell())  # a bit kludgy
                    info_len = tmp[0]
                    blob_f.seek(tmp[1], SEEK_CUR)

                if info_len > 0:
                    # a symbol is complete
                    tmp = read_uleb(blob, blob_f.tell())
                    blob_f.seek(tmp[1], SEEK_CUR)
                    flags = tmp[0]
                    if flags & FLAGS_REEXPORT:
                        # REEXPORT: uleb: lib ordinal, zero-terminated str
                        tmp = read_uleb(blob, blob_f.tell())
                        blob_f.seek(tmp[1], SEEK_CUR)
                        lib_ordinal = tmp[0]
                        lib_sym_name = b""
                        char = blob_f.read(1)
                        while char != b"\0":
                            lib_sym_name += char
                            char = blob_f.read(1)
                        log.info("Found REEXPORT export %r: %d,%r", sym_str, lib_ordinal, lib_sym_name)
                        self.exports_by_name[sym_str.decode()] = (flags, lib_ordinal, lib_sym_name.decode())
                    elif flags & FLAGS_STUB_AND_RESOLVER:
                        # STUB_AND_RESOLVER: uleb: stub offset, uleb: resolver offset
                        log.warning("EXPORT: STUB_AND_RESOLVER found")
                        tmp = read_uleb(blob, blob_f.tell())
                        blob_f.seek(tmp[1], SEEK_CUR)
                        stub_offset = tmp[0]
                        tmp = read_uleb(blob, blob_f.tell())
                        blob_f.seek(tmp[1], SEEK_CUR)
                        resolver_offset = tmp[0]
                        log.info("Found STUB_AND_RESOLVER export %r: %#x,%#x", sym_str, stub_offset, resolver_offset)
                        self.exports_by_name[sym_str.decode()] = (flags, stub_offset, resolver_offset)
                    else:
                        # normal: offset from mach header
                        tmp = read_uleb(blob, blob_f.tell())
                        blob_f.seek(tmp[1], SEEK_CUR)
                        symbol_offset = tmp[0] + self.linked_base
                        log.debug("Found normal export %r: %#x", sym_str, symbol_offset)
                        self.exports_by_name[sym_str.decode()] = (flags, symbol_offset)

                child_count = struct.unpack("B", blob_f.read(1))[0]
                for i in range(0, child_count):
                    child_str = sym_str
                    char = blob_f.read(1)
                    while char != b"\0":
                        child_str += char
                        char = blob_f.read(1)
                    tmp = read_uleb(blob, blob_f.tell())
                    blob_f.seek(tmp[1], SEEK_CUR)
                    next_node = tmp[0]
                    log.debug("%d. child: (%#x, %r)", i, next_node, child_str)
                    nodes_to_do.append((next_node, child_str))
        except IndexError:
            # List is empty, we are done!
            log.debug("Done parsing exports")

    def _detect_arch_ident(self):
        """
        Determines the binary's architecture by inspecting cputype and cpusubtype.

        :return: archinfo.arch_from_id-compatible ident string
        """
        # determine architecture by major CPU type
        try:
            arch_lookup = {
                # contains all supported architectures. Note that Apple deviates from
                # the standard ABI, see Apple docs
                0x100000C: "aarch64",
                0xC: "arm",
                0x7: "x86",
                0x1000007: "x64",
            }
            return arch_lookup[self.cputype]  # subtype currently not needed
        except KeyError:
            return None

    def _load_lc_data_in_code(self, f, off):
        log.debug("Parsing data in code")

        (_, _, dataoff, datasize) = self._unpack("4I", f, off, 16)
        for i in range(dataoff, dataoff + datasize, 8):
            blob = self._unpack("IHH", f, i, 8)
            self.lc_data_in_code.append(blob)

        log.debug("Done parsing data in code")

    def _assert_unencrypted(self, f, off):
        log.debug("Asserting unencrypted file")
        (_, _, _, _, cryptid) = self._unpack("5I", f, off, 20)
        if cryptid > 0:
            log.error("Cannot load encrypted files")
            raise CLEInvalidBinaryError()

    def _load_lc_function_starts(self, f, off):
        # note that the logic below is based on Apple's dyldinfo.cpp; no official docs seem to exist
        log.debug("Parsing function starts")
        (_, _, dataoff, datasize) = self._unpack("4I", f, off, 16)

        i = 0
        end = datasize
        blob = self._read(f, dataoff, datasize)
        self.lc_function_starts = []

        address = None
        for seg in self.segments:
            if seg.offset == 0 and seg.filesize != 0:
                address = seg.vaddr
                break

        if address is None:
            log.error("Could not determine base-address for function starts")
            raise CLEInvalidBinaryError()
        log.debug("Located base-address: %#x", address)

        while i < end:
            uleb = read_uleb(blob, i)
            if blob[i] == 0:
                break  # list is 0 terminated
            address += uleb[0]
            self.lc_function_starts.append(address)
            log.debug("Function start @ %#x (%#x)", uleb[0], address)
            i += uleb[1]
        log.debug("Done parsing function starts")

    def _load_lc_main(self, f, offset):
        if self.entryoff is not None or self.unixthread_pc is not None:
            log.error("More than one entry point for main detected, abort.")
            raise CLEInvalidBinaryError()

        (_, _, self.entryoff, _) = self._unpack("2I2Q", f, offset, 24)
        log.debug("LC_MAIN: entryoff=%#x", self.entryoff)

    def _load_lc_unixthread(self, f, offset):
        if self.entryoff is not None or self.unixthread_pc is not None:
            log.error("More than one entry point for main detected, abort.")
            raise CLEInvalidBinaryError()

        # parse basic structure
        # _, cmdsize, flavor, long_count
        _, _, flavor, _ = self._unpack("4I", f, offset, 16)

        # we only support 4 different types of thread state atm
        # TODO: This is the place to add x86 and x86_64 thread states
        if flavor == 1 and self.arch.bits != 64:
            # ARM_THREAD_STATE or ARM_UNIFIED_THREAD_STATE or ARM_THREAD_STATE32
            blob = self._unpack("16I", f, offset + 16, 64)  # parses only until __pc
        elif flavor == 1 and self.arch.bits == 64 or flavor == 6:
            # ARM_THREAD_STATE or ARM_UNIFIED_THREAD_STATE or ARM_THREAD_STATE64
            blob = self._unpack("33Q", f, offset + 16, 264)  # parses only until __pc
        else:
            log.error("Unknown thread flavor: %d", flavor)
            raise CLECompatibilityError()

        self.unixthread_pc = blob[-1]
        log.debug("LC_UNIXTHREAD: __pc=%#x", self.unixthread_pc)

    def _load_dylib_info(self, f, offset):
        (_, _, name_offset, _, _, _) = self._unpack("6I", f, offset, 24)
        lib_path = self.parse_lc_str(f, offset + name_offset)
        log.debug("Adding library %r", lib_path)
        lib_base_name = lib_path.decode("utf-8").rsplit("/", 1)[-1]
        self.deps.append(lib_base_name)
        self.imported_libraries.append(lib_path)

    def _load_dyld_info(self, f: BufferedReader, offset):
        """
        Extracts information blobs for rebasing, binding and export
        """
        (_, _, roff, rsize, boff, bsize, wboff, wbsize, lboff, lbsize, eoff, esize) = self._unpack(
            "12I", f, offset, 48
        )

        def blob_or_None(f: BufferedReader, off: int, size: int) -> Optional[bytes]:  # helper
            return self._read(f, off, size) if off != 0 and size != 0 else None

        # Extract data blobs
        self.rebase_blob = blob_or_None(f, roff, rsize)
        self.binding_blob = blob_or_None(f, boff, bsize)
        self.weak_binding_blob = blob_or_None(f, wboff, wbsize)
        self.lazy_binding_blob = blob_or_None(f, lboff, lbsize)
        self.export_blob = blob_or_None(f, eoff, esize)

    def _load_symtab(self, f, offset):
        """
        Handles loading of the symbol table

        :param f: input file
        :param offset: offset to the LC_SYMTAB structure
        :return:
        """
        (_, _, symoff, nsyms, stroff, strsize) = self._unpack("6I", f, offset, 24)

        # load the string table
        self.strtab = self._read(f, stroff, strsize)

        # Create a dictionary of offsets to strings for quick lookups, e.g. during later symbol creation
        _indexed_strtab: Dict[int, bytes] = {}
        idx = 0
        for s in self.strtab.split(b"\x00"):
            _indexed_strtab[idx] = s
            idx += len(s) + 1
        self._indexed_strtab = _indexed_strtab

        # store symtab info
        self.symtab_nsyms = nsyms
        self.symtab_offset = symoff

    def _parse_symbols(self, f):
        # parse the symbol entries and create (unresolved) MachOSymbols.
        if self.arch.bits == 64:
            packstr = "I2BHQ"
            structsize = 16
        else:
            packstr = "I2BhI"
            structsize = 12

        for i in range(0, self.symtab_nsyms):
            # The relevant struct is nlist_64, which is defined and documented in mach-o/nlist.h
            offset_in_symtab = i * structsize
            offset = offset_in_symtab + self.symtab_offset
            (n_strx, n_type, n_sect, n_desc, n_value) = self._unpack(packstr, f, offset, structsize)
            log.debug("Adding symbol # %d @ %#x: %s,%s,%s,%s,%s", i, offset, n_strx, n_type, n_sect, n_desc, n_value)
            sym = SymbolTableSymbol(self, offset_in_symtab, n_strx, n_type, n_sect, n_desc, n_value)
            self.symbols.add(sym)
            self._ordered_symbols.append(sym)

            log.debug("Symbol # %d @ %#x is '%s'", i, offset, sym.name)
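
    # The export trie and LC_FUNCTION_STARTS parsers above lean on read_uleb
    # (imported from .binding). A minimal sketch of the ULEB128 decoding it performs,
    # returning (value, bytes_consumed) the way the callers use it; this helper is
    # illustrative only and is not called by the backend:
    @staticmethod
    def _example_read_uleb(blob: bytes, offset: int):
        value = 0
        shift = 0
        cursor = offset
        while True:
            byte = blob[cursor]
            value |= (byte & 0x7F) << shift  # 7 payload bits per byte, least significant first
            cursor += 1
            shift += 7
            if byte & 0x80 == 0:  # a clear high bit marks the final byte
                break
        return value, cursor - offset
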

    def get_string(self, start):
        """Loads a string from the string table"""
        if start in self._indexed_strtab:
            return self._indexed_strtab[start]
        end = start
        if end > len(self.strtab):
            raise ValueError()
        while end < len(self.strtab):
            if self.strtab[end] == 0:
                return self.strtab[start:end]
            end += 1
        return self.strtab[start:]

    def parse_lc_str(self, f, start, limit: Optional[int] = None):
        """Parses an lc_str data structure"""
        tmp = self._unpack("c", f, start, 1)[0]
        s = b""
        ctr = 0
        while tmp != b"\0" and (limit is None or ctr < limit):
            s += tmp
            ctr += 1
            tmp = self._unpack("c", f, start + ctr, 1)[0]
        return s

    def _load_segment(self, f, offset):
        """
        Handles LC_SEGMENT(_64) commands

        :param f: input file
        :param offset: starting offset of the LC_SEGMENT command
        :return:
        """
        # determine whether this is a 64 or 32 bit segment
        is64 = self.arch.bits == 64
        if not is64:
            segment_s_size = 56
            (_, _, segname, vmaddr, vmsize, fileoff, filesize, maxprot, initprot, nsects, flags) = self._unpack(
                "2I16s8I", f, offset, segment_s_size
            )
        else:
            segment_s_size = 72
            (_, _, segname, vmaddr, vmsize, fileoff, filesize, maxprot, initprot, nsects, flags) = self._unpack(
                "2I16s4Q4I", f, offset, segment_s_size
            )

        # Cleanup segname
        segname = segname.replace(b"\0", b"")
        log.debug("Processing segment %r", segname)

        # create the segment
        seg = MachOSegment(fileoff, vmaddr, filesize, vmsize, segname, nsects, [], flags, initprot, maxprot)

        # Parse section datastructures
        if not is64:
            # 32 bit
            section_s_size = 68
            section_s_packstr = "16s16s9I"
        else:
            # 64 bit
            section_s_size = 80
            # The correct packstring is "16s16s2Q8I", however we use a different one that merges the last two
            # reserved fields (reserved2, reserved3) because it makes the parsing logic below easier
            section_s_packstr = "16s16s2Q6IQ"

        section_start = offset + segment_s_size
        for i in range(0, nsects):
            # Read the section
            log.debug("Processing section # %d in %r", i + 1, segname)
            (
                section_sectname,
                section_segname,
                section_vaddr,
                section_vsize,
                section_foff,
                section_align,
                section_reloff,
                section_nreloc,
                section_flags,
                r1,
                r2,
            ) = self._unpack(section_s_packstr, f, (i * section_s_size) + section_start, section_s_size)

            # Clean segname and sectname
            section_sectname = section_sectname.replace(b"\0", b"")
            section_segname = section_segname.replace(b"\0", b"")

            # Create the section
            sec = MachOSection(
                section_foff,
                section_vaddr,
                section_vsize,
                section_vsize,
                section_segname,
                section_sectname,
                section_align,
                section_reloff,
                section_nreloc,
                section_flags,
                r1,
                r2,
                parent_segment=seg,
            )

            # Store the section
            seg.sections.append(sec)
            self.sections.append(sec)

        # add to sections_by_ordinal
        self.sections_by_ordinal.extend(seg.sections)

        if segname == b"__PAGEZERO":
            # PAGEZERO is a complicated mess, so we ignore it entirely.
            # It would allocate 4GB of unneeded memory and also break rebasing,
            # because there would be a segment that must be at address 0, while the other segments should be slid
            log.info("Found PAGEZERO, skipping backer for memory conservation")
        elif seg.filesize > 0:
            # Append segment data to memory
            blob = self._read(f, seg.offset, seg.filesize)
            if seg.filesize < seg.memsize:
                blob += b"\0" * (seg.memsize - seg.filesize)  # padding

            # The memory of the Backend itself should start at 0, where 0 is the lowest meaningful address,
            # which in our case is the Mach header magic.
            # Later this will be loaded at an address like 0x1000000, but that's the job of the loader
            vaddr_offset = AT.from_lva(seg.vaddr, self).to_rva()
            self.memory.add_backer(vaddr_offset, blob)

        # Store the segment
        self.segments.append(seg)

    S = typing.TypeVar("S", bound=Union[ctypes.Structure, ctypes.Union])

    def _get_struct(self, struct_type: typing.Type[S], offset: int) -> S:
        data = self._read(self._binary_stream, offset, ctypes.sizeof(struct_type))
        return struct_type.from_buffer_copy(data)

    def _read_cstring_from_file(self, start: FilePointer, max_length=None):
        """
        This technically has unnecessary quadratic runtime behavior in `buffer.find` and `buffer += ...`,
        but this shouldn't be noticeable in practice.

        :param start:
        :param max_length:
        :return:
        """
        end = -1
        buffer = b""
        while end == -1:
            buffer += self._read(self._binary_stream, start + len(buffer), 1024)
            end = buffer.find(b"\x00")
            if max_length is not None and len(buffer) > max_length:
                raise ValueError(f"Symbol name exceeds {max_length} bytes, giving up")
        return buffer[:end]

    def _parse_dyld_imports(self, header):
        # Address of the array of dyld_chained_import* structs
        imports_start_addr: FilePointer = self._dyld_chained_fixups_offset + header.imports_offset
        symbols_start_addr: FilePointer = self._dyld_chained_fixups_offset + header.symbols_offset

        import_struct = DyldImportStruct.get_struct(header.imports_format)

        # Parse the imports
        for i in range(header.imports_count):
            import_addr = imports_start_addr + i * ctypes.sizeof(import_struct)
            imp = self._get_struct(import_struct, import_addr)
            sym_name_addr = symbols_start_addr + imp.name_offset
            try:
                sym_name_bytes = self._read_cstring_from_file(sym_name_addr, max_length=2**21)
                sym_name = sym_name_bytes.decode("utf-8")
            except ValueError as e:
                # This symbol string is probably not null-terminated, so we can't read it
                log.error("Failed to read symbol name at %x: %s", sym_name_addr, e)
                sym_name = f"<Excessively long symbol name at fileoffset 0x{sym_name_addr:x}>"

            symbols = self.symbols.get_by_name_and_ordinal(sym_name, imp.lib_ordinal)
            if len(symbols) == 1:
                self._dyld_imports.append(symbols[0])
            elif len(symbols) == 0:
                try:
                    log.debug(
                        "Creating DyldBoundSymbol with name %s for library %s",
                        sym_name,
                        self.imported_libraries[imp.lib_ordinal],
                    )
                except IndexError:
                    log.debug(
                        "Creating DyldBoundSymbol with name %s and library ordinal %s (unknown library)",
                        sym_name,
                        imp.lib_ordinal,
                    )
                sym = DyldBoundSymbol(self, sym_name, imp.lib_ordinal)
                self.symbols.add(sym)
                self._dyld_imports.append(sym)
            else:
                raise NotImplementedError(
                    f"Multiple symbols with name {sym_name} "
                    f"for library {self.imported_libraries[imp.lib_ordinal]}."
                )

    def _parse_dyld_chained_fixups(self):
        header: dyld_chained_fixups_header = self._get_struct(
            dyld_chained_fixups_header, self._dyld_chained_fixups_offset
        )

        if header.symbols_format != 0:
            raise NotImplementedError("Dyld fixup symbols are compressed, this isn't supported yet")

        self._parse_dyld_imports(header)

        # Address of the dyld_chained_starts_in_image struct
        segs_addr: FilePointer = self._dyld_chained_fixups_offset + header.starts_offset

        # The struct isn't straightforward to parse with ctypes, so we do it manually
        seg_count = self._unpack("I", self._binary_stream, segs_addr, 4)[0]
        segs: List[FileOffset] = []
        for i in range(seg_count):
            s = self._unpack("I", self._binary_stream, (i * 4) + segs_addr + 4, 4)[0]
            segs.append(s)

            if segs[i] == 0:
                continue

            starts_addr: FilePointer = segs_addr + segs[i]
            starts = self._get_struct(dyld_chained_starts_in_segment, starts_addr)
            seg = self.find_segment_containing(starts.segment_offset)
            # There are weird binaries where the offsets inside the file
            # and inside the virtual addr space don't match anymore.
            # This isn't properly supported yet, and the only known case is the __PII section inside the
            # __ETC segment of rare binaries, which isn't that important for most purposes
            shift = seg.vaddr - seg.offset
            if shift != 0:
                assert isinstance(seg, MachOSegment)
                assert seg.segname == "__ETC", (
                    "Only __ETC segments are known to have this shift, please open an"
                    " issue for this binary so it can be investigated"
                )
                log.error("Segment shift detected, not handling fixups here for now")
                continue
            page_starts_data = self._read(self._binary_stream, starts_addr + 22, starts.page_count * 2)
            page_starts = struct.unpack("<" + ("H" * starts.page_count), page_starts_data)
            pointer_format: DyldChainedPtrFormats = starts.pointer_format
            log.info("Page has pointer_format: %s", pointer_format)

            for j, start in enumerate(page_starts):
                if start == DYLD_CHAINED_PTR_START_NONE:
                    continue
                chain_entry_addr = starts.segment_offset + (j * starts.page_size) + start
                current_chain_addr = chain_entry_addr
                log.info("Reading chain at %x", current_chain_addr)
                while True:
                    chained_rebase_ptr: ChainedFixupPointerOnDisk = self._get_struct(
                        ChainedFixupPointerOnDisk, current_chain_addr
                    )
                    bind = chained_rebase_ptr.isBind(pointer_format)
                    rebase = chained_rebase_ptr.isRebase(pointer_format, self.mapped_base)
                    if bind is not None:
                        libOrdinal, _addend = bind
                        import_symbol = self._dyld_imports[libOrdinal]
                        reloc = MachOSymbolRelocation(self, import_symbol, current_chain_addr, None)
                        self.relocs.append(reloc)
                        # Legacy code uses bind_xrefs; explicitly add this to keep it compatible for now
                        import_symbol.bind_xrefs.append(reloc.dest_addr + self.linked_base)
                        log.debug("Binding for %s found at %x", import_symbol, current_chain_addr)
                    elif rebase is not None:
                        target = self.linked_base + rebase
                        location: MemoryPointer = self.linked_base + current_chain_addr
                        anon_reloc = MachOPointerRelocation(owner=self, relative_addr=current_chain_addr, data=rebase)
                        self.relocs.append(anon_reloc)
                        log.debug("Rebase to %x found at %x", target, location)
                    else:
                        raise CLEInvalidBinaryError("FixupPointer was neither bind nor rebase, that shouldn't happen")
                    skip = chained_rebase_ptr.generic64.rebase.next * 4
                    current_chain_addr += skip
                    if skip == 0:
                        break
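
    # A sketch of the bitfield layout behind the generic 64-bit rebase pointers
    # walked above (dyld_chained_ptr_64_rebase in Apple's mach-o/fixup-chains.h).
    # ChainedFixupPointerOnDisk decodes this via ctypes; here the fields are pulled
    # out by hand for illustration only. `next` counts 4-byte strides, which is why
    # the chain loop above advances by `next * 4`.
    @staticmethod
    def _example_decode_generic64_rebase(raw: int):
        target = raw & ((1 << 36) - 1)  # bits 0..35: rebase target
        high8 = (raw >> 36) & 0xFF  # bits 36..43: top byte of the pointer
        # bits 44..50 are reserved
        next_stride = (raw >> 51) & 0xFFF  # bits 51..62: stride to the next fixup
        is_bind = (raw >> 63) & 0x1  # bit 63: 1 = bind, 0 = rebase
        return {"target": target, "high8": high8, "next": next_stride, "bind": is_bind}
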

    def get_symbol_by_address_fuzzy(self, address):
        """
        Locates a symbol by checking the given address against sym.addr, sym.bind_xrefs, and sym.symbol_stubs
        """
        for sym in self.symbols:
            if address == sym.relative_addr or address in sym.bind_xrefs or address in sym.symbol_stubs:
                return sym
        return None

    def get_symbol(self, name, include_stab=False, fuzzy=False):  # pylint: disable=arguments-differ
        """
        Returns all symbols matching name.

        Note that especially when include_stab=True there may be multiple symbols with the
        same name; therefore this method always returns an array.

        :param name: the name of the symbol
        :param include_stab: Include debugging symbols NOT RECOMMENDED
        :param fuzzy: Replace exact match with "contains"-style match
        """
        result = []
        for sym in self.symbols:
            if sym.is_stab and not include_stab:
                continue
            if fuzzy:
                if name in sym.name:
                    result.append(sym)
            else:
                if name == sym.name:
                    result.append(sym)
        return result

    def get_symbol_by_insertion_order(self, idx: int) -> AbstractMachOSymbol:
        """
        :param idx: idx when this symbol was inserted
        :return:
        """
        return self._ordered_symbols[idx]

    def get_segment_by_name(self, name):
        """
        Searches for a MachOSegment with the given name and returns it

        :param name: Name of the sought segment
        :return: MachOSegment or None
        """
        for seg in self.segments:
            seg: Union[MachOSection, MachOSegment]
            if seg.segname == name:
                return seg
        return None

    def __getitem__(self, item):
        """
        Syntactic sugar for get_segment_by_name
        """
        return self.get_segment_by_name(item)

    segments: Regions[MachOSegment]
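
# Illustrative sketch (not part of the backend): the accessors above make segment
# lookups read naturally. `macho` stands for any loaded MachO instance:
def _example_segment_lookup(macho: "MachO"):
    text_seg = macho["__TEXT"]  # __getitem__ sugar
    same_seg = macho.get_segment_by_name("__TEXT")  # explicit equivalent
    return text_seg is same_seg
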
register_backend("mach-o", MachO)