Source code for angr.procedures.stubs.format_parser
from typing import List, Dict, TYPE_CHECKING
from string import digits as ascii_digits
import logging
import math
import claripy
from ... import sim_type
from ...sim_procedure import SimProcedure
from ...storage.file import SimPackets
if TYPE_CHECKING:
from angr.sim_type import SimType
# Module-level logger, named after this module.
l = logging.getLogger(name=__name__)
# Rebind the imported digit characters as bytes: iterating over a bytes format specifier yields integers, and
# membership of those integers is tested against this value.
ascii_digits = ascii_digits.encode()
class FormatString:
    """
    Describes a format string.

    A format string is held as a list of components. Each component is either a literal (concrete ``bytes`` or a
    claripy AST) or a format specifier object exposing ``spec_type``, ``length_spec``, ``pad_chr``, ``size``,
    ``signed`` and ``string``.
    """

    # Bytes that are constrained not to appear inside a string read for a scanf ``%s`` conversion.
    SCANF_DELIMITERS = [b"\x09", b"\x0a", b"\x0b", b"\x0d", b"\x20"]

    def __init__(self, parser, components):
        """
        Takes a list of components which are either just strings or a FormatSpecifier.

        :param parser:      The parser object that owns this format string; its ``state`` is used for all solver
                            and memory operations.
        :param components:  The list of literal / specifier components.
        """
        self.components = components
        self.parser = parser
        self.string = None

    @property
    def state(self):
        """
        The state of the owning parser.
        """
        return self.parser.state

    @staticmethod
    def _add_to_string(string, c):
        """
        Append `c` to `string`, treating None on either side as "nothing".

        :param string:  The value accumulated so far, or None.
        :param c:       The value to append, or None.
        :return:        The other operand if one of them is None, otherwise ``string.concat(c)``.
        """
        if c is None:
            return string
        if string is None:
            return c
        return string.concat(c)

    def _get_str_at(self, str_addr, max_length=None):
        """
        Load the string stored at `str_addr` from the state's memory.

        :param str_addr:    The address of the string.
        :param max_length:  The number of bytes to load. If None, the maximum value the solver allows for the
                            result of the parser's strlen helper is used.
        :return:            The loaded data, or an empty bitvector if the length is zero.
        """
        if max_length is None:
            strlen = self.parser._sim_strlen(str_addr)

            # TODO: we probably could do something more fine-grained here.

            # throw away strings which are just the NULL terminator
            max_length = self.parser.state.solver.max_int(strlen)
        if max_length == 0:
            return claripy.BVV(b"")
        return self.parser.state.memory.load(str_addr, max_length)

    def replace(self, va_arg):
        """
        Implement printf - based on the stored format specifier information, format the values obtained from the
        argument getter `va_arg` into a string.

        :param va_arg:  A function which takes a type and returns the next argument of that type
        :return:        The result formatted string, or None if there was nothing to format
        :raises SimProcedureError: If a specifier of an unimplemented type is encountered.
        """

        string = None

        for component in self.components:
            # if this is just concrete data
            if isinstance(component, bytes):
                string = self._add_to_string(string, self.parser.state.solver.BVV(component))
            elif isinstance(component, str):
                raise Exception("this branch should be impossible?")
            elif isinstance(component, claripy.ast.BV):  # pylint:disable=isinstance-second-argument-not-valid-type
                string = self._add_to_string(string, component)
            else:
                # okay now for the interesting stuff
                # what type of format specifier is it?
                fmt_spec = component
                if fmt_spec.spec_type == b"s":
                    # with ".*" the length is passed as an argument preceding the string pointer
                    if fmt_spec.length_spec == b".*":
                        str_length = va_arg("size_t")
                    else:
                        str_length = None
                    str_ptr = va_arg("char*")
                    string = self._add_to_string(string, self._get_str_at(str_ptr, max_length=str_length))
                # integers, for most of these we'll end up concretizing values..
                else:
                    # ummmmmmm this is a cheap translation but I think it should work
                    i_val = va_arg("void*")
                    c_val = int(self.parser.state.solver.eval(i_val))
                    # truncate to the width of the conversion, then reinterpret as signed if needed
                    c_val &= (1 << (fmt_spec.size * 8)) - 1
                    if fmt_spec.signed and (c_val & (1 << ((fmt_spec.size * 8) - 1))):
                        c_val -= 1 << fmt_spec.size * 8

                    if fmt_spec.spec_type in (b"d", b"i"):
                        s_val = str(c_val)
                    elif fmt_spec.spec_type == b"u":
                        s_val = str(c_val)
                    elif fmt_spec.spec_type == b"c":
                        s_val = chr(c_val & 0xFF)
                    elif fmt_spec.spec_type == b"x":
                        s_val = hex(c_val)[2:]
                    elif fmt_spec.spec_type == b"o":
                        s_val = oct(c_val)[2:]
                    elif fmt_spec.spec_type == b"p":
                        s_val = hex(c_val)
                    else:
                        raise SimProcedureError("Unimplemented format specifier '%s'" % fmt_spec.spec_type)

                    if isinstance(fmt_spec.length_spec, int):
                        s_val = s_val.rjust(fmt_spec.length_spec, fmt_spec.pad_chr)

                    # latin-1 maps code points 0-255 to exactly one byte each, so a "%c" value >= 0x80 stays a
                    # single byte (the default UTF-8 encoding would emit two). Every other conversion above only
                    # produces ASCII, for which both encodings agree.
                    string = self._add_to_string(string, self.parser.state.solver.BVV(s_val.encode("latin-1")))

        return string

    def interpret(self, va_arg, addr=None, simfd=None):
        """
        implement scanf - extract formatted data from memory or a file according to the stored format
        specifiers and store them into the pointers obtained from `va_arg`.

        :param va_arg:      A function which, given a type, returns the next argument of that type
        :param addr:        The address in the memory to extract data from, or...
        :param simfd:       A file descriptor to use for reading data from
        :return:            The number of arguments parsed
        :raises SimProcedureError: If an unsupported specifier is met while reading from memory or a
                                   non-packet file.
        """
        num_args = 0

        # Packet-based storage: constrain the data that is read instead of parsing existing data.
        if simfd is not None and isinstance(simfd.read_storage, SimPackets):
            for component in self.components:
                if type(component) is bytes:
                    sdata, _ = simfd.read_data(len(component), short_reads=False)
                    self.state.add_constraints(sdata == component)
                elif isinstance(component, claripy.Bits):
                    sdata, _ = simfd.read_data(len(component) // 8, short_reads=False)
                    self.state.add_constraints(sdata == component)
                elif component.spec_type == b"s":
                    if component.length_spec is None:
                        sdata, slen = simfd.read_data(self.state.libc.buf_symbolic_bytes)
                    else:
                        sdata, slen = simfd.read_data(component.length_spec)
                    for byte in sdata.chop(8):
                        self.state.add_constraints(claripy.And(*[byte != char for char in self.SCANF_DELIMITERS]))
                    ptr = va_arg("char*")
                    self.state.memory.store(ptr, sdata, size=slen)
                    self.state.memory.store(ptr + slen, claripy.BVV(0, 8))
                    num_args += 1
                elif component.spec_type == b"c":
                    sdata, _ = simfd.read_data(1, short_reads=False)
                    self.state.memory.store(va_arg("char*"), sdata)
                    num_args += 1
                else:
                    bits = component.size * 8
                    if component.spec_type == b"x":
                        base = 16
                    elif component.spec_type == b"o":
                        base = 8
                    else:
                        base = 10

                    # here's the variable representing the result of the parsing
                    target_variable = self.state.solver.BVS(
                        "scanf_" + component.string.decode(), bits, key=("api", "scanf", num_args, component.string)
                    )
                    negative = claripy.SLT(target_variable, 0)

                    # how many digits does it take to represent this variable fully?
                    max_digits = int(math.ceil(math.log(2**bits, base)))

                    # how many digits does the format specify?
                    spec_digits = component.length_spec

                    # how many bits can we specify as input?
                    available_bits = float("inf") if spec_digits is None else spec_digits * math.log(base, 2)
                    not_enough_bits = available_bits < bits

                    # how many digits will we model this input as?
                    digits = max_digits if spec_digits is None else spec_digits

                    # constrain target variable range explicitly if it can't take on all possible values
                    if not_enough_bits:
                        self.state.add_constraints(
                            self.state.solver.And(
                                self.state.solver.SLE(target_variable, (base**digits) - 1),
                                self.state.solver.SGE(target_variable, -(base ** (digits - 1) - 1)),
                            )
                        )

                    # perform the parsing in reverse - constrain the input digits to be the string version of the input
                    # this only works because we're reading from a packet stream and therefore nobody has the ability
                    # to add other constraints to this data!
                    # this makes z3's job EXTREMELY easy
                    sdata, _ = simfd.read_data(digits, short_reads=False)
                    for i, digit in enumerate(reversed(sdata.chop(8))):
                        digit_value = (target_variable // (base**i)) % base
                        digit_ascii = digit_value + ord("0")
                        if base > 10:
                            digit_ascii = claripy.If(digit_value >= 10, digit_value + (-10 + ord("a")), digit_ascii)

                        # if there aren't enough bits, we can increase the range by accounting for the possibility that
                        # the first digit is a minus sign
                        if not_enough_bits:
                            if i == digits - 1:
                                neg_digit_ascii = ord("-")
                            else:
                                neg_digit_value = (-target_variable // (base**i)) % base
                                neg_digit_ascii = neg_digit_value + ord("0")
                                if base > 10:
                                    neg_digit_ascii = claripy.If(
                                        neg_digit_value >= 10, neg_digit_value + (-10 + ord("a")), neg_digit_ascii
                                    )

                            digit_ascii = claripy.If(negative, neg_digit_ascii, digit_ascii)

                        self.state.add_constraints(digit == digit_ascii[7:0])

                    # again, a cheap hack
                    self.state.memory.store(va_arg("void*"), target_variable, endness=self.state.arch.memory_endness)
                    num_args += 1

            return num_args

        if simfd is not None:
            region = simfd.read_storage
            addr = simfd._pos if hasattr(simfd, "_pos") else simfd._read_pos  # XXX THIS IS BAD
        else:
            region = self.parser.state.memory

        bits = self.parser.state.arch.bits
        # symbolic count of conversions that could not be parsed
        failed = self.parser.state.solver.BVV(0, 32)

        position = addr
        for component in self.components:
            if isinstance(component, (bytes, claripy.Bits)):
                # TODO we skip non-format-specifiers in format string interpretation for now
                # if the region doesn't match the concrete component, we need to return immediately
                # Symbolic literal characters are skipped as well: they are not specifiers, so they must neither
                # consume an argument nor be asked for a spec_type.
                continue

            fmt_spec = component
            try:
                dest = va_arg("void*")
            except SimProcedureArgumentError:
                dest = None
            if fmt_spec.spec_type == b"s":
                # set some limits for the find
                max_str_len = self.parser.state.libc.max_str_len
                max_sym_bytes = self.parser.state.libc.buf_symbolic_bytes

                # has the length of the format been limited by the string itself?
                if fmt_spec.length_spec is not None:
                    max_str_len = fmt_spec.length_spec
                    max_sym_bytes = fmt_spec.length_spec

                # TODO: look for limits on other characters which scanf is sensitive to, '\x00', '\x20'
                _, _, match_indices = region.find(
                    position, self.parser.state.solver.BVV(b"\n"), max_str_len, max_symbolic_bytes=max_sym_bytes
                )
                if not match_indices:
                    # if no newline is found, mm is position + max_strlen
                    mm = position + max_str_len
                    # we're just going to concretize the length, load will do this anyways
                    length = self.parser.state.solver.max_int(mm - position)
                else:
                    # a newline is found, or a max length is specified with the specifier
                    length = max(match_indices)
                src_str = region.load(position, length)

                # TODO all of these should be delimiters we search for above
                # add that the contents of the string cannot be any scanf %s string delimiters
                for delimiter in set(FormatString.SCANF_DELIMITERS):
                    delim_bvv = self.parser.state.solver.BVV(delimiter)
                    for i in range(length):
                        self.parser.state.add_constraints(region.load(position + i, 1) != delim_bvv)

                # write it out to the pointer
                self.parser.state.memory.store(dest, src_str)
                # store the terminating null byte
                self.parser.state.memory.store(dest + length, self.parser.state.solver.BVV(0, 8))

                position += length

            else:
                # XXX: atoi only supports strings of one byte
                if fmt_spec.spec_type in [b"d", b"i", b"u", b"x"]:
                    base = 16 if fmt_spec.spec_type == b"x" else 10
                    status, i, num_bytes = self.parser._sim_atoi_inner(
                        position, region, base=base, read_length=fmt_spec.length_spec
                    )
                    # increase failed count if we were unable to parse it
                    failed = self.parser.state.solver.If(status, failed, failed + 1)
                    position += num_bytes
                elif fmt_spec.spec_type == b"c":
                    i = region.load(position, 1)
                    i = i.zero_extend(bits - 8)
                    position += 1
                else:
                    raise SimProcedureError("unsupported format spec '%s' in interpret" % fmt_spec.spec_type)

                # keep only as many low bits as the conversion's destination type holds
                i = self.parser.state.solver.Extract(fmt_spec.size * 8 - 1, 0, i)
                self.parser.state.memory.store(
                    dest, i, size=fmt_spec.size, endness=self.parser.state.arch.memory_endness
                )

            num_args += 1

        if simfd is not None:
            # actually consume the bytes that were parsed, and require the read to be complete
            _, realsize = simfd.read_data(position - addr)
            self.state.add_constraints(realsize == position - addr)

        return num_args - failed

    def __repr__(self):
        # Literal bytes outside ASCII are shown as backslash escapes instead of raising UnicodeDecodeError, so
        # that logging a format string can never fail.
        return "".join(
            comp.decode("ascii", errors="backslashreplace") if isinstance(comp, bytes) else str(comp)
            for comp in self.components
        )
class FormatSpecifier:
    """
    Describes a format specifier within a format string.

    The attributes are stored exactly as passed to the constructor:

    - ``string``: the specifier text as bytes, without the leading ``%``
    - ``length_spec``: the length/precision information attached to the specifier
    - ``pad_chr``: the character used for padding
    - ``size``: the size of the converted value
    - ``signed``: whether the converted value is signed
    """

    __slots__ = ("string", "size", "signed", "length_spec", "pad_chr")

    def __init__(self, string, length_spec, pad_chr, size, signed):
        self.string, self.length_spec, self.pad_chr = string, length_spec, pad_chr
        self.size, self.signed = size, signed

    @property
    def spec_type(self):
        """
        The conversion character: the last byte of the specifier text, lower-cased.
        """
        final_byte = self.string[-1:]
        return final_byte.lower()

    def __str__(self):
        return "%" + self.string.decode()

    def __len__(self):
        return len(self.string)
class FormatParser(SimProcedure):
    """
    For SimProcedures relying on printf-style format strings.
    """

    ARGS_MISMATCH = True

    # Basic conversion specifiers for format strings, mapped to sim_types
    # TODO: support for C and S that are deprecated.
    # TODO: We only consider POSIX locales here.
    basic_spec = {
        b"d": sim_type.SimTypeInt(),  # 'int',
        b"i": sim_type.SimTypeInt(),  # 'int',
        b"o": sim_type.SimTypeInt(signed=False),  # 'unsigned int',
        b"u": sim_type.SimTypeInt(signed=False),  # 'unsigned int',
        b"x": sim_type.SimTypeInt(signed=False),  # 'unsigned int',
        b"X": sim_type.SimTypeInt(signed=False),  # 'unsigned int',
        b"e": sim_type.SimTypeDouble(),  # 'double',
        b"E": sim_type.SimTypeDouble(),  # 'double',
        b"f": sim_type.SimTypeDouble(),  # 'double',
        b"F": sim_type.SimTypeDouble(),  # 'double',
        b"g": sim_type.SimTypeDouble(),  # 'double',
        b"G": sim_type.SimTypeDouble(),  # 'double',
        b"a": sim_type.SimTypeDouble(),  # 'double',
        b"A": sim_type.SimTypeDouble(),  # 'double',
        b"c": sim_type.SimTypeChar(),  # 'char',
        b"s": sim_type.SimTypePointer(sim_type.SimTypeChar()),  # 'char*',
        b"p": sim_type.SimTypePointer(sim_type.SimTypeInt(signed=False)),  # 'uintptr_t',
        b"n": sim_type.SimTypePointer(
            sim_type.SimTypeInt(signed=False)
        ),  # 'uintptr_t', # pointer to num bytes written so far
        # b'm': None, # Those don't expect any argument
        # b'%': None, # Those don't expect any argument
    }

    # Signedness of integers
    int_sign = {"signed": [b"d", b"i"], "unsigned": [b"o", b"u", b"x", b"X"]}

    # Length modifiers and how they apply to integer conversion (signed / unsigned).
    int_len_mod = {
        b"hh": (sim_type.SimTypeChar(), sim_type.SimTypeChar(signed=False)),  # ('char', 'uint8_t'),
        b"h": (sim_type.SimTypeShort(), sim_type.SimTypeShort(signed=False)),  # ('int16_t', 'uint16_t'),
        b"l": (sim_type.SimTypeLong(), sim_type.SimTypeLong(signed=False)),  # ('long', 'unsigned long'),
        # FIXME: long long is 64bit according to stdint.h on Linux,  but that might not always be the case
        b"ll": (sim_type.SimTypeLongLong(), sim_type.SimTypeLongLong(signed=False)),  # ('int64_t', 'uint64_t'),
        # FIXME: intmax_t seems to be always 64 bit, but not too sure
        b"j": (sim_type.SimTypeLongLong(), sim_type.SimTypeLongLong(signed=False)),  # ('int64_t', 'uint64_t'),
        b"z": (sim_type.SimTypeLength(signed=True), sim_type.SimTypeLength(signed=False)),  # ('ssize', 'size_t'),
        b"t": (sim_type.SimTypeLong(), sim_type.SimTypeLong()),  # ('ptrdiff_t', 'ptrdiff_t'),
    }

    # Types that are not known by sim_types
    # Maps to (size, signedness)
    other_types = {("string",): lambda _: (0, True)}  # special value for strings, we need to count

    # Those flags affect the formatting the output string
    flags = ["#", "0", r"\-", r" ", r"\+", r"\'", "I"]

    # Lazily-built lookup tables. Note that the properties below store them on FormatParser itself.
    _MOD_SPEC = None
    _ALL_SPEC = None

    @property
    def _mod_spec(self):
        """
        Modified length specifiers: mapping between length modifiers and conversion specifiers. This generates all the
        possibilities, i.e. hhd, etc.
        """

        if FormatParser._MOD_SPEC is None:
            mod_spec = {}

            for mod, sizes in self.int_len_mod.items():
                # sizes is a (signed type, unsigned type) pair
                for conv in self.int_sign["signed"]:
                    mod_spec[mod + conv] = sizes[0]

                for conv in self.int_sign["unsigned"]:
                    mod_spec[mod + conv] = sizes[1]

            FormatParser._MOD_SPEC = mod_spec

        return FormatParser._MOD_SPEC

    @property
    def _all_spec(self) -> Dict[bytes, "SimType"]:
        """
        All specifiers and their lengths.
        """

        if FormatParser._ALL_SPEC is None:
            base = dict(self._mod_spec)
            base.update(self.basic_spec)
            FormatParser._ALL_SPEC = base

        return FormatParser._ALL_SPEC

    # Tricky stuff
    # Note that $ is not C99 compliant (but posix specific).
    def _match_spec(self, nugget):
        """
        match the string `nugget` to a format specifier.

        :param bytes nugget:    The bytes following a ``%`` in the format string.
        :return:                A FormatSpecifier describing the matched conversion, or None if `nugget` does not
                                start with a known specifier.
        :raises SimProcedureError: If the type of the matched specifier cannot be resolved for the architecture.
        """
        # TODO: handle positional modifiers and other similar format string tricks.
        all_spec = self._all_spec

        # iterate through nugget throwing away anything which is an int
        # TODO store this in a size variable

        original_nugget = nugget
        length_str = []
        length_spec = None
        length_spec_str_len = 0
        # number of bytes in front of the digits which belong to the specifier but are stripped before scanning
        prefix_len = 0
        pad_chr = " "

        if nugget.startswith(b".*"):
            # ".*": precision is specified as an argument
            nugget = nugget[2:]
            length_spec = b".*"
            length_spec_str_len = 2
        elif nugget.startswith(b"0"):
            # the leading zero is not stripped: it is consumed as a digit by the loop below
            pad_chr = "0"
        elif nugget.startswith(b"."):
            pad_chr = "0"
            nugget = nugget[1:]
            prefix_len = 1

        for j, c in enumerate(nugget):
            if c in ascii_digits:
                length_str.append(c)
            else:
                nugget = nugget[j:]
                if length_spec is None:
                    length_spec = None if len(length_str) == 0 else int(bytes(length_str))
                break

        # we need the length of the format's length specifier to extract the format and nothing else
        # The stripped "." has to be counted as well; otherwise the specifier text of e.g. ".5d" would be cut to
        # ".5", losing the conversion character.
        if length_spec_str_len == 0:
            length_spec_str_len = prefix_len + len(length_str)

        # is it an actual format?
        for spec in all_spec:
            if nugget.startswith(spec):
                # this is gross coz sim_type is gross..
                nugget = nugget[: len(spec)]
                original_nugget = original_nugget[: (length_spec_str_len + len(spec))]
                nugtype: "SimType" = all_spec[nugget]
                try:
                    typeobj = nugtype.with_arch(self.state.arch if self.state is not None else self.project.arch)
                except Exception as ex:
                    raise SimProcedureError("format specifier uses unknown type '%s'" % repr(nugtype)) from ex
                return FormatSpecifier(original_nugget, length_spec, pad_chr, typeobj.size // 8, typeobj.signed)

        return None

    def extract_components(self, fmt: List) -> List:
        """
        Extract the actual formats from the format string `fmt`.

        :param fmt: A list of format chars. Each element is either a single concrete byte (``bytes``) or a
                    claripy AST for a symbolic char.
        :returns:   A list of components: literal chars and FormatSpecifier objects
        """

        # iterate over the format string looking for format specifiers
        components = []
        i = 0
        while i < len(fmt):
            if isinstance(fmt[i], bytes) and fmt[i] == b"%":
                # Note that we only support concrete format specifiers
                # grab the specifier
                # go to the space
                specifier = b""
                for c in fmt[i + 1 :]:
                    if isinstance(c, bytes):
                        specifier += c
                    else:
                        break

                specifier = self._match_spec(specifier)
                if specifier is not None:
                    # skip the specifier text; the "%" itself is skipped by the increment at the end of the loop
                    i += len(specifier)
                    components.append(specifier)
                else:
                    # if we get here we didn't match any specs, the first char will be thrown away
                    # and we'll add the percent
                    i += 1
                    components.append(b"%")
            else:
                # claripy ASTs, which are usually symbolic variables
                # They will be kept as they are - even if those chars can be evaluated to "%"
                components.append(fmt[i])
            i += 1

        return components

    def _get_fmt(self, fmt):
        """
        Extract the actual formats from the format string `fmt`.

        :param list fmt: A list of format chars.
        :returns: a FormatString object
        """

        components = self.extract_components(fmt)
        return FormatString(self, components)

    def _sim_atoi_inner(self, str_addr, region, base=10, read_length=None):
        """
        Return the result of invoking the atoi simprocedure on `str_addr`.
        """

        from .. import SIM_PROCEDURES  # pylint:disable=import-outside-toplevel

        strtol = SIM_PROCEDURES["libc"]["strtol"]

        return strtol.strtol_inner(str_addr, self.state, region, base, True, read_length=read_length)

    def _sim_strlen(self, str_addr):
        """
        Return the result of invoking the strlen simprocedure on `str_addr`.
        """

        from .. import SIM_PROCEDURES  # pylint:disable=import-outside-toplevel

        strlen = SIM_PROCEDURES["libc"]["strlen"]

        return self.inline_call(strlen, str_addr).ret_expr

    def _parse(self, fmtstr_ptr):
        """
        Parse format strings.

        :param fmtstr_ptr:  The pointer to the format string.
        :returns:           A FormatString object which can be used for replacing the format specifiers with arguments or
                            for scanning into arguments.
        :raises SimProcedureError: If the pointer is symbolic, or if the length of the string has more than one
                                   solution.
        """

        if self.state.solver.symbolic(fmtstr_ptr):
            raise SimProcedureError("Symbolic pointer to (format) string :(")

        length = self._sim_strlen(fmtstr_ptr)
        if self.state.solver.symbolic(length):
            all_lengths = self.state.solver.eval_upto(length, 2)
            if len(all_lengths) != 1:
                raise SimProcedureError("Symbolic (format) string, game over :(")
            length = all_lengths[0]

        if self.state.solver.is_true(length == 0):
            return FormatString(self, [b""])

        fmt_xpr = self.state.memory.load(fmtstr_ptr, length)

        fmt = []
        # walk the loaded expression from the most significant byte down, 8 bits at a time
        for i in range(fmt_xpr.size(), 0, -8):
            char = fmt_xpr[i - 1 : i - 8]
            try:
                conc_char = self.state.solver.eval_one(char)
            except SimSolverError:
                # For symbolic chars, just keep them symbolic
                fmt.append(char)
            else:
                # Concrete chars are directly appended to the list
                fmt.append(bytes([conc_char]))

        # make a FormatString object
        fmt_str = self._get_fmt(fmt)

        l.debug("Fmt: %r", fmt_str)

        return fmt_str
class ScanfFormatParser(FormatParser):
    """
    For SimProcedures relying on scanf-style format strings.
    """

    basic_spec = {
        b"d": sim_type.SimTypeInt(),  # 'int',
        b"i": sim_type.SimTypeInt(),  # 'int',
        b"o": sim_type.SimTypeInt(signed=False),  # 'unsigned int',
        b"u": sim_type.SimTypeInt(signed=False),  # 'unsigned int',
        b"x": sim_type.SimTypeInt(signed=False),  # 'unsigned int',
        b"X": sim_type.SimTypeInt(signed=False),  # 'unsigned int',
        b"e": sim_type.SimTypeFloat(),  # 'float',
        b"E": sim_type.SimTypeFloat(),  # 'float',
        b"f": sim_type.SimTypeFloat(),  # 'float',
        b"F": sim_type.SimTypeFloat(),  # 'float',
        b"g": sim_type.SimTypeFloat(),  # 'float',
        b"G": sim_type.SimTypeFloat(),  # 'float',
        b"a": sim_type.SimTypeFloat(),  # 'float',
        b"A": sim_type.SimTypeFloat(),  # 'float',
        b"c": sim_type.SimTypeChar(),  # 'char',
        b"s": sim_type.SimTypePointer(sim_type.SimTypeChar()),  # 'char*',
        b"p": sim_type.SimTypePointer(sim_type.SimTypeInt(signed=False)),  # 'uintptr_t',
        b"n": sim_type.SimTypePointer(sim_type.SimTypeInt(signed=False)),
    }

    # All float conversion specifiers
    float_spec = [b"e", b"E", b"f", b"F", b"g", b"G", b"a", b"A"]

    # Length modifiers and how they apply to float conversion.
    # These must be type instances, like every other entry of the specifier tables: the matcher calls
    # `with_arch` on the value it looks up.
    float_len_mod = {
        b"l": sim_type.SimTypeDouble(),  # 'double',
        b"ll": sim_type.SimTypeDouble(),  # 'long double',
    }

    # scanf keeps its own lookup tables. Its conversions differ from the printf ones (e.g. "f" is a float here),
    # so the tables cached on FormatParser must neither be reused nor overwritten.
    _MOD_SPEC = None
    _ALL_SPEC = None

    @property
    def _mod_spec(self):
        """
        Modified length specifiers: mapping between length modifiers and conversion specifiers. This generates all the
        possibilities, i.e. lf, etc.
        """

        if ScanfFormatParser._MOD_SPEC is None:
            # start from a copy of the integer table of the base class, then add the float variants
            mod_spec = dict(super()._mod_spec)

            for mod, float_type in self.float_len_mod.items():
                for conv in self.float_spec:
                    mod_spec[mod + conv] = float_type

            ScanfFormatParser._MOD_SPEC = mod_spec

        return ScanfFormatParser._MOD_SPEC

    @property
    def _all_spec(self) -> Dict[bytes, "SimType"]:
        """
        All scanf specifiers (with and without length modifiers) and their types.
        """

        if ScanfFormatParser._ALL_SPEC is None:
            all_spec = dict(self._mod_spec)
            all_spec.update(self.basic_spec)
            ScanfFormatParser._ALL_SPEC = all_spec

        return ScanfFormatParser._ALL_SPEC
from angr.errors import SimProcedureArgumentError, SimProcedureError, SimSolverError