#!/usr/bin/python2
#
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Generate a dictionary for a libFuzzer- or AFL-based fuzzer.

Invoked manually with a fuzzer binary and a specification of the target format
or protocol. Works best for textual formats and protocols; for binary ones it
may be of little use. An example invocation is sketched below.
"""

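# Example invocation (a sketch only; the script name, build directory and file
# names below are hypothetical):
#
#   python2 generate_fuzzer_dictionary.py \
#       --fuzzer=out/Release/my_format_fuzzer \
#       --spec=my_format_rfc.txt \
#       --out=my_format_fuzzer.dict
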
import argparse
import HTMLParser
import io
import logging
import os
import re
import shutil
import string
import subprocess
import sys
import tempfile


ENCODING_TYPES = ['ascii', 'utf_16_be', 'utf_16_le', 'utf_32_be', 'utf_32_le']
MIN_STRING_LENGTH = 4


def DecodeHTML(html_data):
  """HTML-decoding of the data."""
  html_parser = HTMLParser.HTMLParser()
  data = html_parser.unescape(html_data.decode('ascii', 'ignore'))
  return data.encode('ascii', 'ignore')


def EscapeDictionaryElement(element):
  """Escape all unprintable and control characters in an element."""
  element_escaped = element.encode('string_escape')
  # Remove escaping for single quote because it breaks libFuzzer.
  element_escaped = element_escaped.replace('\\\'', '\'')
  # Add escaping for double quote.
  element_escaped = element_escaped.replace('"', '\\"')
  return element_escaped
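# For illustration, a hypothetical input/output pair for the function above
# (not taken from the original code):
#   EscapeDictionaryElement('tab\t"quoted"') returns 'tab\\t\\"quoted\\"',
#   i.e. the tab character is escaped and the double quotes are escaped, so
#   the resulting dictionary line stays parseable by libFuzzer.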


def ExtractWordsFromBinary(filepath, min_length=MIN_STRING_LENGTH):
  """Extract words (split strings) from a binary executable file."""
  rodata = PreprocessAndReadRodata(filepath)
  words = []

  strings_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)
  # Use different encodings for strings extraction.
  for encoding in ENCODING_TYPES:
    data = rodata.decode(encoding, 'ignore').encode('ascii', 'ignore')
    raw_strings = strings_re.findall(data)
    for raw_string in raw_strings:
      words += raw_string.split()

  return set(words)


def ExtractWordsFromLines(lines):
  """Extract all words from a list of strings."""
  words = set()
  for line in lines:
    for word in line.split():
      words.add(word)

  return words


def ExtractWordsFromSpec(filepath, is_html):
  """Extract words from a specification."""
  data = ReadSpecification(filepath, is_html)
  words = data.split()
  return set(words)


def FindIndentedText(text):
  """Find space-indented text blocks, e.g. code or data samples in RFCs."""
  lines = text.split('\n')
  indented_blocks = []
  current_block = ''
  previous_number_of_spaces = 0

  # Go through every line and concatenate space-indented blocks into lines.
  for line in lines:
    if not line:
      # Ignore empty lines.
      continue

    # Space-indented text blocks have more leading spaces than regular text.
    n = FindNumberOfLeadingSpaces(line)

    if n > previous_number_of_spaces:
      # Beginning of a space-indented text block, start concatenation.
      current_block = line[n:]
    elif n == previous_number_of_spaces and current_block:
      # Continuation of a space-indented text block, keep concatenating lines.
      current_block += '\n' + line[n:]

    if n < previous_number_of_spaces and current_block:
      # Current line is not indented, save previously concatenated lines.
      indented_blocks.append(current_block)
      current_block = ''

    previous_number_of_spaces = n

  return indented_blocks


def FindNumberOfLeadingSpaces(line):
  """Calculate the number of leading whitespace characters in the string."""
  n = 0
  while n < len(line) and line[n].isspace():
    n += 1

  return n


def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):
  """Generate a dictionary for a given fuzzer binary and specification."""
  for filepath in [path_to_binary, path_to_spec]:
    if not os.path.exists(filepath):
      logging.error('%s doesn\'t exist. Exiting.', filepath)
      sys.exit(1)

  words_from_binary = ExtractWordsFromBinary(path_to_binary)
  words_from_spec = ExtractWordsFromSpec(path_to_spec, is_html)

  dictionary_words = set()

  if 'i' in strategy:
    # Strategy i: use only words that occur both in the binary and in the
    # specification.
    dictionary_words = words_from_binary.intersection(words_from_spec)

  if 'q' in strategy:
    # Strategy q: add words from all quoted strings in the specification.
    # TODO(mmoroz): experimental and very noisy. Not recommended to use.
    spec_data = ReadSpecification(path_to_spec, is_html)
    quoted_strings = FindIndentedText(spec_data)
    quoted_words = ExtractWordsFromLines(quoted_strings)
    dictionary_words = dictionary_words.union(quoted_words)

  if 'u' in strategy:
    # Strategy u: add all uppercase words from the specification.
    uppercase_words = set(w for w in words_from_spec if w.isupper())
    dictionary_words = dictionary_words.union(uppercase_words)

  return dictionary_words


def PreprocessAndReadRodata(filepath):
  """Create a stripped copy of the binary and extract its .rodata section."""
  stripped_file = tempfile.NamedTemporaryFile(prefix='.stripped_')
  stripped_filepath = stripped_file.name
  shutil.copyfile(filepath, stripped_filepath)

  # Strip all symbols to reduce the number of redundant strings.
  strip_cmd = ['strip', '--strip-all', stripped_filepath]
  result = subprocess.call(strip_cmd)
  if result:
    logging.warning('Failed to strip the binary. Using the original version.')
    stripped_filepath = filepath

  # Extract the .rodata section to reduce the number of redundant strings.
  rodata_file = tempfile.NamedTemporaryFile(prefix='.rodata_')
  rodata_filepath = rodata_file.name
  objcopy_cmd = ['objcopy', '-j', '.rodata', stripped_filepath, rodata_filepath]

  # Hide output from stderr since objcopy prints a warning.
  with open(os.devnull, 'w') as devnull:
    result = subprocess.call(objcopy_cmd, stderr=devnull)

  if result:
    logging.warning('Failed to extract .rodata section. Using the whole file.')
    rodata_filepath = stripped_filepath

  with open(rodata_filepath) as file_handle:
    data = file_handle.read()

  stripped_file.close()
  rodata_file.close()

  return data


def ReadSpecification(filepath, is_html):
  """Read a specification file and return its contents."""
  with open(filepath, 'r') as file_handle:
    data = file_handle.read()

  if is_html:
    data = DecodeHTML(data)

  return data


def WriteDictionary(dictionary_path, dictionary):
  """Write the given dictionary to a file."""
  with open(dictionary_path, 'wb') as file_handle:
    file_handle.write('# This is an automatically generated dictionary.\n')
    for word in dictionary:
      if not word:
        continue
      line = '"%s"\n' % EscapeDictionaryElement(word)
      file_handle.write(line)


def main():
  parser = argparse.ArgumentParser(description='Generate fuzzer dictionary.')
  parser.add_argument('--fuzzer', required=True,
                      help='Path to a fuzzer binary executable. It is '
                           'recommended to use a binary built with '
                           '"use_libfuzzer=false is_asan=false" to get a '
                           'better dictionary with fewer redundant elements.')
  parser.add_argument('--spec', required=True,
                      help='Path to a target specification (in textual form).')
  parser.add_argument('--html', type=int, default=0,
                      help='Decode HTML [0 or 1] (default: 0): use 1 if the '
                           'specification contains HTML entities to decode.')
  parser.add_argument('--out', required=True,
                      help='Path to a file to write the dictionary into.')
  parser.add_argument('--strategy', default='iu',
                      help='Generation strategy [iqu] ("iu" is the default): '
                           'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args()

  dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy,
                                  is_html=bool(args.html))
  WriteDictionary(args.out, dictionary)


if __name__ == '__main__':
  main()