-
Notifications
You must be signed in to change notification settings - Fork 273
/
Copy pathls_parse.py
executable file
·593 lines (515 loc) · 21.2 KB
/
ls_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
#!/usr/bin/env python3
#
# Dump info about linker script symbols that pertain to addresses and sizes.
#
# Author: Kareem Khazem <[email protected]>
# Copyright Amazon, Inc. 2017
import ast
import argparse
import json
import logging
from logging import error, warning, info, debug
import operator
import os
import re
import subprocess
import sys
import textwrap
import traceback
# ast.Num was deprecated in python 3.8
if sys.version_info.major > 3 or \
(sys.version_info.major == 3 and sys.version_info.minor > 7):
ast_num = ast.Constant
else:
ast_num = ast.Num
def epilog():
return textwrap.dedent("""
This script generates a C file containing two kinds of information:
- The values of symbols that are defined in a linker script; these
are printed as C definitions, like
char *bss_start = (char *)4070047185u;
- The extent of ELF sections that are defined in a linker script;
these are printed as CPROVER annotations, like
__CPROVER_allocated_memory(0xe9fda44b, 4096);
A goto-binary of this C file can be linked into the rest of the
codebase that you wish to analyse. This provides CPROVER with
definitions that it otherwise would not have access to, since they
are defined in a linker script rather than C code. This information
can also be printed in JSON rather than as a C file, so that CPROVER
can invoke and use this script without user intervention.
This script needs a list of symbols declared but never defined in C
code. The hacky way of supplying this list is by passing the path to
the codebase with the --dir flag; this script will scan the codebase
for extern-declared variables. A better way is to generate that list
with CPROVER, and pass that list in using --sym-file. The argument
to --sym-file can be a filename, or '-' for stdin.
""")
"""`Running-regex' linker script parser. We don't currently use a full
grammar, as we only need a fraction of the information contained in
linker scripts to give to CBMC. If in the future we need a more
sophisticated parser, we should use an actual grammar from a real
parser. GNU LD uses a YACC/Flex setup and has a very complete grammar,
but we cannot use it (GPL 3). LLD (the LLVM project's linker script
parser) is hand-written (so no explicit grammar), but they do not aim to
support all of GNU LD's syntax, so LLD doesn't work on some real linker
scripts. So in summary: use this regex parser while it's practical;
switch to LLD when needed, and possibly contribute to LLD development to
support parsing your use case."""
def get_linker_script_data(script):
try:
with open(script) as f:
lines = f.read().splitlines()
except IOError:
error("Linker script '%s' not found", script)
exit(1)
text = " ".join(lines)
text = re.sub(r"\s+", " ", text)
# In these regexes, we always start by matching a whitespace. This
# is so that we don't match every substring of an identifier. i.e.
# if we have a section definition .text : { ..., then we only want
# to recognise a section called ".text", and not also "text", "ext",
# "xt", and "t".
#
# Just to be safe, ensure that the first character of the linker
# script is a whitespace.
text = " %s" % text
# Lex out comments
text = re.sub(r"/\*.*?\*/", " ", text)
close_brace = re.compile(r"\s}(\s*>\s*\w+)?")
uwnknown_cmd = re.compile(r"\sPHDRS\s*{") # only this pattern for now, more might follow!
memory_cmd = re.compile(r"\sMEMORY\s*{")
sections_cmd = re.compile(r"\sSECTIONS\s*{")
assign_current = re.compile(r"\s(?P<sym>\w+)\s*=\s*\.\s*;")
sec_def = re.compile(r"\s(?P<sec>([-\.\w]+)|(/DISCARD/))"
r"\s+([^:{};]*?):([^:{};])*?{")
assign_size = re.compile(r"\s(?P<sym>\w+)\s*=\s*SIZEOF\("
r"(?P<sec>\.\w+)\)\s*;")
memory_block = re.compile(r"\s(?P<name>\w+)\s*:\s*ORIGIN\s*=\s*"
r"(?P<orig>0x[a-fA-F0-9]+)\s*,\s*"
r"LENGTH\s*=\s*(?P<len>\d+)\s*"
r"(?P<unit>[KMG])")
exp = r"(ORIGIN\(\w+\)|LENGTH\(\w+\))"
op = r"(\+|\-)"
assign_expr = re.compile(r"\s(?P<sym>\w+)\s*=\s*"
r"(?P<expr>{exp}(\s*{op}\s*{exp})*)"
r"\s*;".format(exp=exp, op=op))
# If we match a regex, call the right function to update the state
# with the info gleaned from the matched string.
jump_table = {
close_brace : close_brace_fun,
uwnknown_cmd : unknown_cmd_fun,
memory_cmd : memory_cmd_fun,
sections_cmd : sections_cmd_fun,
assign_current : assign_current_fun,
memory_block : memory_block_fun,
sec_def : sec_def_fun,
assign_size : assign_size_fun,
assign_expr : assign_expr_fun,
}
# Whenever we match an interesting regex, we'll update the state
# with whatever information we want to rip from that bit of text.
state = {}
# The section definition that we were last in.
state["cur-sec"] = None
# If we know what section *start* the current address (.) points to,
# then this will not be None. It's used to match an assignment to
# the start of a section.
state["start-valid"] = None
# If we have just seen an assignment, then this will not be None.
# It's used to match up an assignment with the end of a section.
state["end-valid"] = None
# Each entry here maps a section name to a map. That map maps "size"
# to the symbol pointing to the size of the section, and "start"
# to the symbol pointing to the start address of the section. One of
# "start" or "size" may be absent, if we couldn't get that bit of
# information from the linker script.
state["sections"] = {}
# We can use the list of valid addresses to sanity-check that the
# start addresses of sections are genuinely addresses.
state["valid-addresses"] = []
# Symbols that get some expression assigned to them, either inside
# or outside a section definition. We'll match them up later.
state["expr-assigns"] = []
# These are to sanity-check the parsing.
state["MEM"] = False
state["SEC"] = False
state["DEF"] = False
state["UNKNOWN"] = False
i = 0
while i < len(text):
buf = text[i:]
i += 1
asrt(not (state["MEM"] and state["SEC"]),
"memory & sections", buf)
asrt(not state["DEF"] or state["SEC"],
"def outside SECTION", buf)
jump_fun = None
matched_str = None
matched_re = None
match = None
for regex, fun in jump_table.items():
m = regex.match(buf)
if m:
if jump_fun is not None:
error("matched multiple regexes\n%s", buf)
exit(1)
jump_fun = fun
match = m
matched_str = buf[m.span()[0]:m.span()[1]]
for s, v in locals().items():
if v is regex and s != "regex":
matched_re = s
if jump_fun is not None:
info("regex '%s' matched '%s'", matched_re, matched_str)
jump_fun(state, match, buf)
i = i + match.span()[1] - 1
else:
debug("Clobbering due to '%s'...", buf[:20])
# There may have been some intermediate command between the
# start of a section definition and where we are. So we have
# no idea what address the current address pointer refers to
state["start-valid"] = None
# There may have been an intermediate command between the
# last assignment and the end of the section.
state["end-valid"] = None
match_up_expr_assigns(state)
return state
def assign_expr_fun(state, match, _):
# Do NOT invalidate 'start-valid' here. Assignments from expressions
# do not actually advance the current address pointer.
sym, expr = match.group("sym"), match.group("expr")
origin_pat = r"ORIGIN\((?P<block>\w+?)\)"
origins = re.findall(origin_pat, expr)
if len(origins) != 1:
info("assign with %d origins, skipping: %s", len(origins),
match.string())
return
ret = {"origin": origins[0], "sym": sym}
for block_name, data in state["blocks"].items():
for op in ["ORIGIN", "LENGTH"]:
old_expr = str(expr)
expr = re.sub(r"%s\(%s\)" % (op, block_name), str(data[op]),
expr)
if expr != old_expr:
info("Subbed %s(%s) with %d", op, block_name, data[op])
try:
ret["addr"] = safe_eval(expr)
except RuntimeError:
error("Unable to evaluate '%s'", expr)
sys.exit(1)
info("Evaluated '%s' to %d", expr, ret["addr"])
state["expr-assigns"].append(ret)
def sec_def_fun(state, match, buf):
asrt(not state["DEF"], "nested sec def", buf)
state["DEF"] = True
sec = match.group("sec")
info("Current section is now '%s'", sec)
state["cur-sec"] = sec
state["start-valid"] = True
def assign_size_fun(state, match, buf):
asrt(state["SEC"], "assignment outside SECTIONS", buf)
sec = match.group("sec")
if sec not in state["sections"]:
state["sections"][sec] = {}
sym = match.group("sym")
info("'%s' marks the size of section '%s'", sym, sec)
state["sections"][sec]["size"] = sym
def assign_current_fun(state, match, buf):
asrt(state["SEC"], "assignment outside SECTIONS", buf)
sec = state["cur-sec"]
state["end-valid"] = match
if state["start-valid"]:
if sec not in state["sections"]:
state["sections"][sec] = {}
sym = match.group("sym")
info("'%s' marks the start of section '%s'", sym, sec)
state["sections"][sec]["start"] = sym
else:
info("Don't know where we are.")
def close_brace_fun(state, _, buf):
# We might have seen an assignment immediately before this.
if state["end-valid"]:
asrt(state["DEF"], "end-valid outside sec-def", buf)
sec = state["cur-sec"]
if sec in state["sections"]:
sym = state["end-valid"].group("sym")
info("'%s' marks the end of section '%s'", sym, sec)
state["sections"][sec]["end"] = sym
state["end-valid"] = None
else:
# Linker script assigned end-of-section to a symbol, but not
# the start. this is useless to us.
pass
if state["DEF"]:
info("Closing sec-def")
state["DEF"] = False
elif state["SEC"]:
info("Closing sections")
state["SEC"] = False
elif state["MEM"]:
info("Closing memory command")
state["MEM"] = False
elif state["UNKNOWN"]:
info("Closing unknown command")
state["UNKNOWN"] = False
else:
error("Not in block\n%s", buf)
traceback.print_stack()
exit(1)
def memory_block_fun(state, m, buf):
asrt(state["MEM"], "memory block outside MEMORY", buf)
start, length, unit = m.group("orig"), m.group("len"), m.group("unit")
length = int(length)
dec_start = int(start, 16)
mul = {"K": 1000, "M": 1000000, "G": 1000000000}
length = length * mul[unit]
end = dec_start + length
info("mem block %s from %d to %d (%d%s long)", start, dec_start, end,
length, unit)
state["valid-addresses"].append({"from": dec_start, "to": end})
name = m.group("name")
if "blocks" not in state:
state["blocks"] = {}
state["blocks"][name] = {"ORIGIN": int(start, 16), "LENGTH": length}
def sections_cmd_fun(state, _, buf):
asrt(not state["SEC"], "encountered SECTIONS twice", buf)
state["SEC"] = True
def memory_cmd_fun(state, _, buf):
asrt(not state["MEM"], "encountered MEMORY twice", buf)
state["MEM"] = True
def unknown_cmd_fun(state, _, buf):
asrt(not state["MEM"], "encountered UNKNOWN twice", buf)
state["UNKNOWN"] = True
def match_up_expr_assigns(state):
blocks = set([data["origin"] for data in state["expr-assigns"]])
for block in blocks:
assigns = [a for a in state["expr-assigns"]
if a["origin"] == block]
assigns = sorted(assigns, key=(lambda a: a["addr"]))
if len(assigns) < 2:
warning("Only 1 assignment to expr involving %s", block)
continue
start_addr, end_addr = assigns[0]["addr"], assigns[-1]["addr"]
start_sym, end_sym = assigns[0]["sym"], assigns[-1]["sym"]
info("Valid memory from %s (%d) to %s (%s) [%s block]",
start_sym, start_addr, end_sym, end_addr, block)
tmp = {"start": start_sym, "end": end_sym}
sec_name = "BLOCK_%s_%s-%s" % (block, start_sym, end_sym)
state["sections"][sec_name] = tmp
def asrt(cond, msg, buf):
if not cond:
error("%s\n%s", msg, buf)
exit(1)
def needed_definitions(all_symbols, root_dir):
ret = []
pat = re.compile(r"extern\s+char\s+(?P<var>\w+)\[\];")
allowed = [".c", ".cpp", ".h"]
for root, _, files in os.walk(root_dir):
for file in files:
file = os.path.join(root, file)
_, ext = os.path.splitext(file)
if ext not in allowed:
continue
with open(file) as f:
for line in f:
m = pat.match(line)
if m:
ret.append(m.group("var"))
bad = [v for v in ret if v not in all_symbols]
if bad:
logging.error("These symbols need definitions but are not "
"in the object file: %s", ", ".join(bad))
exit(1)
logging.info("need symbols:\n%s", "\n".join(ret))
return ret
def symbols_from(object_file):
cmd = ["objdump", "--syms", object_file]
proc = subprocess.Popen(cmd, universal_newlines=True,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
proc_output, _ = proc.communicate()
if proc.returncode:
logging.error("`%s` failed. Output:\n%s", " ".join(cmd), proc_output)
exit(1)
# example: "[ 6](sec 1)(fl 0x00)(ty 0)(scl 2) (nx 0) 0x00000000 fred"
pat = re.compile(r"\[\s*\d+\]" # number of the entry
r"\(sec\s+-?\d+\)" # section number
r"\(fl\s+0x[\da-f]+\)" # flag bits
r"\(ty\s+\d+\)" # type
r"\(scl\s+\d+\)\s+" # storage class
r"\(nx\s+\d+\)\s+" # number of auxiliary entries
r"0x(?P<addr>[\da-f]+)\s+"
r"(?P<name>[^\s]*)" # Can be empty!
)
# example: "00000000 g .text 00000000 fred"
pat_elf = re.compile(r"(?P<addr>[\da-f]+)\s+"
r"[lgu! ][w ][C ][W ][Ii ][Dd ][FfO ]\s+" # flags
r"[^\s]+\s+" # section
r"[\da-f]+\s+" # size
r"(?:\.hidden\s+)?"
r"(?P<name>[^\s]*)" # Can be empty!
)
elf_format = False
matching = False
ret = {}
for line in proc_output.splitlines():
if not line:
continue
if not matching:
if re.match(r"^.+:\s+file format .*elf.*$", line):
elf_format = True
elif re.match(r"^SYMBOL TABLE:$", line):
matching = True
continue
if elf_format:
m = pat_elf.match(line)
elif re.match(r"^File\s*$", line):
continue
else:
m = pat.match(line)
if not m:
logging.error("Unexpected line from `%s`:\n%s",
" ".join(cmd), line)
exit(1)
ret[m.group("name")] = m.group("addr")
logging.info("found symbols:\n%s", "\n".join(
["0x%-16s %s" % (v, k) for k, v in ret.items()]))
return ret
def match_up_addresses(script_data, symbol_table):
ret = []
for name, data in script_data["sections"].items():
ok = False
if "size" in data and "start" in data:
ok = True
if "end" in data and "start" in data:
ok = True
if not ok:
continue
region = {}
for sym, value in symbol_table.items():
if "size" in data and sym == data["size"]:
region["size"] = {"sym": sym, "val": value}
if "start" in data and sym == data["start"]:
region["start"] = {"sym": sym, "val": value}
if "end" in data and sym == data["end"]:
region["end"] = {"sym": sym, "val": value}
region["section"] = name
append = False
if "size" in region and "start" in region:
append = True
if "end" in region and "start" in region:
append = True
if append:
ret.append(region)
return ret
def get_region_range(region):
ret = {}
if "end" in region:
start = int(region["start"]["val"], 16)
end = int(region["end"]["val"], 16)
size = end - start
s_var = region["start"]["sym"]
e_var = region["end"]["sym"]
ret["start"] = start
ret["size"] = size
ret["start-symbol"] = s_var
ret["end-symbol"] = e_var
ret["has-end-symbol"] = True
ret["annot"] = "__CPROVER_allocated_memory(%s, %d);" % (hex(start), size)
ret["commt"] = "from %s to %s" % (s_var, e_var)
elif "size" in region:
start = int(region["start"]["val"], 16)
size = int(region["size"]["val"], 16)
s_var = region["start"]["sym"]
z_var = region["size"]["sym"]
ret["start"] = start
ret["size"] = size
ret["start-symbol"] = s_var
ret["size-symbol"] = z_var
ret["has-end-symbol"] = False
ret["annot"] = "__CPROVER_allocated_memory(%s, %d);" % (hex(start), size)
ret["commt"] = "from %s for %s bytes" % (s_var, z_var)
else:
raise "Malformatted region\n%s" % str(region)
ret["section"] = region["section"]
return ret
def final_json_output(regions, symbol_table):
ret = {"regions": [], "addresses": []}
for s, v in symbol_table.items():
ret["addresses"].append({"sym": s, "val": int(v, 16)})
for region in regions:
ret["regions"].append(get_region_range(region))
return ret
def symbols_from_file(sym_file):
if sym_file == "-":
return [s.strip() for s in sys.stdin.readlines()]
with open(sym_file) as f:
return [s.strip() for s in f.readlines()]
def safe_eval(expr):
tree = ast.parse(expr, mode="eval").body
def eval_single_node(node):
logging.debug(node)
if isinstance(node, ast_num):
return node.n
if isinstance(node, ast.BinOp):
left = eval_single_node(node.left)
right = eval_single_node(node.right)
op = eval_single_node(node.op)
return op(left, right)
return {
ast.Add: operator.add,
ast.Sub: operator.sub,
ast.Mult: operator.mul,
# Use floordiv (i.e. //) so that we never need to
# cast to an int
ast.Div: operator.floordiv,
}[type(node)]
return eval_single_node(tree)
def main():
pars = argparse.ArgumentParser(
description="Generate info about linker-defined symbols and regions.",
epilog=epilog(),
formatter_class=argparse.RawDescriptionHelpFormatter)
pars.add_argument("-s", "--script", metavar="S", required=True,
help="path to linker script")
pars.add_argument("-o", "--object", metavar="O", required=True,
help="path to fully-linked binary")
sym_source = pars.add_mutually_exclusive_group(required=True)
sym_source.add_argument("-d", "--dir", metavar="D",
help="path to top-level of codebase")
sym_source.add_argument("-i", "--sym-file",
metavar="F", help="file of names of linker symbols")
pars.add_argument("-t", "--out-file", metavar="F",
help="default: stdout", default=None)
verbs = pars.add_mutually_exclusive_group()
verbs.add_argument("-v", "--verbose", action="store_true")
verbs.add_argument("-w", "--very-verbose", action="store_true")
args = pars.parse_args()
if args.verbose:
lvl = logging.INFO
elif args.very_verbose:
lvl = logging.DEBUG
else:
lvl = logging.WARNING
form = "linkerscript parse %(levelname)s: %(message)s"
logging.basicConfig(format=form, level=lvl)
script_data = get_linker_script_data(args.script)
symbol_table = symbols_from(args.object)
if args.dir:
needed = needed_definitions(symbol_table.keys(), args.dir)
else:
needed = symbols_from_file(args.sym_file)
symbol_table = {k:v for k, v in symbol_table.items() if k in needed}
regions = match_up_addresses(script_data, symbol_table)
info("symbol table %s" % json.dumps(symbol_table, indent=2))
info("script data %s" % json.dumps(script_data, indent=2))
info("regions %s" % json.dumps(regions, indent=2))
final = json.dumps(final_json_output(regions, symbol_table),
indent=2)
if args.out_file:
with open(args.out_file, "w") as f:
f.write(final)
info(final)
else:
print(final)
if __name__ == "__main__":
main()