blob: b720e8e755be6442e737663a4ff4f1eb16dd22d9 [file] [log] [blame]
#!/usr/bin/python2
#
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
6
7"""Generate a dictionary for libFuzzer or AFL-based fuzzer.
8
Invoked manually with a fuzzer binary and a specification of the target format
or protocol. Works best for text formats and protocols; for binary ones the
generated dictionary may be useless.
11"""
12
13import argparse
14import HTMLParser
15import io
16import logging
17import os
18import re
19import shutil
20import string
21import subprocess
22import sys
23import tempfile
24
25
# Encodings tried, one after another, when decoding the binary's data while
# looking for strings.
ENCODING_TYPES = [
    'ascii',
    'utf_16_be',
    'utf_16_le',
    'utf_32_be',
    'utf_32_le',
]

# Default minimum length of a run of printable characters for it to be treated
# as a string when scanning the binary.
MIN_STRING_LENGTH = 4
28
29
def DecodeHTML(html_data):
  """Decode HTML character references and drop all non-ASCII characters.

  Args:
    html_data: Contents of an HTML document, as a byte string or as text.

  Returns:
    A native string in which character references (e.g. "&amp;") are replaced
    by the characters they denote and every non-ASCII character is removed.
  """
  try:
    # Available since Python 3.4. HTMLParser.unescape() was removed in
    # Python 3.9, so prefer the documented function whenever it exists.
    from html import unescape
  except ImportError:
    # Python 2: use the parser method this script has always relied on.
    from HTMLParser import HTMLParser
    unescape = HTMLParser().unescape

  if isinstance(html_data, bytes):
    # Bytes that are not valid ASCII are dropped rather than reported.
    html_data = html_data.decode('ascii', 'ignore')

  ascii_data = unescape(html_data).encode('ascii', 'ignore')
  if isinstance(ascii_data, str):
    # Python 2: a byte string already is the native string type.
    return ascii_data
  return ascii_data.decode('ascii')
35
36
def EscapeDictionaryElement(element):
  """Escape an element so it can be placed inside a quoted dictionary entry.

  Printable ASCII characters are kept as they are, except that a backslash and
  a double quote are each prefixed with a backslash. Every other byte is
  written as a two-digit lowercase hexadecimal escape.

  Args:
    element: The dictionary word, as a byte string or as text. Text is encoded
        as UTF-8 before escaping.

  Returns:
    The escaped element as a native string, without the surrounding quotes.
  """
  if not isinstance(element, bytes):
    element = element.encode('utf-8')

  escaped = []
  # bytearray yields integers on both Python 2 and Python 3.
  for byte in bytearray(element):
    if byte == 0x5C:
      escaped.append('\\\\')
    elif byte == 0x22:
      escaped.append('\\"')
    elif 0x20 <= byte < 0x7F:
      # Includes the single quote, which must stay unescaped because an
      # escaped one breaks libFuzzer.
      escaped.append(chr(byte))
    else:
      # libFuzzer's dictionary syntax documents only the backslash, double
      # quote and hex escapes, so control characters such as newline are
      # written in hex form instead of the C-style short form.
      escaped.append('\\x%02x' % byte)

  return ''.join(escaped)
mmoroz1a6bef12016-07-07 12:07:4445
46
def ExtractWordsFromBinary(filepath, min_length=MIN_STRING_LENGTH):
  """Collect whitespace-separated words found in a binary executable.

  The data returned by PreprocessAndReadRodata() is decoded with each encoding
  in ENCODING_TYPES. Runs of at least min_length printable characters are
  located in the result and split on whitespace.

  Args:
    filepath: Path to the binary executable.
    min_length: Minimum length of a printable run to be considered.

  Returns:
    A set of the extracted words.
  """
  rodata = PreprocessAndReadRodata(filepath)
  printable_run_re = re.compile(r'[^\x00-\x1F\x7F-\xFF]{%d,}' % min_length)

  unique_words = set()
  # The same data is scanned once per encoding.
  for encoding in ENCODING_TYPES:
    ascii_view = rodata.decode(encoding, 'ignore').encode('ascii', 'ignore')
    for printable_run in printable_run_re.findall(ascii_view):
      unique_words.update(printable_run.split())

  return unique_words
61
62
def ExtractWordsFromLines(lines):
  """Return the set of whitespace-separated words found in the given strings.

  Args:
    lines: An iterable of strings.

  Returns:
    A set containing every word of every string.
  """
  return {word for line in lines for word in line.split()}
71
72
def ExtractWordsFromSpec(filepath, is_html):
  """Return the set of whitespace-separated words of a specification file.

  Args:
    filepath: Path to the specification.
    is_html: Passed through to ReadSpecification().

  Returns:
    A set of the words in the specification.
  """
  return set(ReadSpecification(filepath, is_html).split())
78
79
def FindIndentedText(text):
  """Find space-indented text blocks, e.g. code or data samples in RFCs.

  A block starts on a line that has more leading whitespace than the previous
  non-empty line, continues over following lines with the same amount of
  leading whitespace, and ends at the first less-indented line or at the end
  of the text. Empty lines are ignored.

  Args:
    text: The text to scan.

  Returns:
    A list of blocks. Each block is its lines, with the leading whitespace
    removed, joined by newline characters.
  """
  indented_blocks = []
  current_block = ''
  previous_indent = 0

  for line in text.split('\n'):
    if not line:
      # Empty lines neither start nor end a block.
      continue

    # Number of leading whitespace characters on this line.
    indent = len(line) - len(line.lstrip())

    if indent > previous_indent:
      # A deeper line starts a new block, replacing any block in progress.
      current_block = line[indent:]
    elif indent == previous_indent and current_block:
      # Same indentation: the block in progress continues.
      current_block += '\n' + line[indent:]

    if indent < previous_indent and current_block:
      # A less-indented line ends the block in progress.
      indented_blocks.append(current_block)
      current_block = ''

    previous_indent = indent

  if current_block:
    # The text ended inside a block; without this the last block is lost.
    indented_blocks.append(current_block)

  return indented_blocks
111
112
def FindNumberOfLeadingSpaces(line):
  """Count the whitespace characters at the beginning of a string.

  Args:
    line: The string to inspect.

  Returns:
    The index of the first non-whitespace character, or the length of the
    string if it contains only whitespace.
  """
  for index, character in enumerate(line):
    if not character.isspace():
      return index

  return len(line)
120
121
def GenerateDictionary(path_to_binary, path_to_spec, strategy, is_html=False):
  """Build the set of dictionary words for a fuzzer binary and a specification.

  Args:
    path_to_binary: Path to the fuzzer binary.
    path_to_spec: Path to the specification file.
    strategy: String of strategy letters; any combination of 'i', 'q' and 'u'.
    is_html: Passed to the specification readers.

  Returns:
    A set of words selected by the requested strategies.

  Exits the process with status 1 if either path does not exist.
  """
  for required_path in (path_to_binary, path_to_spec):
    if os.path.exists(required_path):
      continue
    logging.error("%s doesn't exist. Exit.", required_path)
    sys.exit(1)

  binary_words = ExtractWordsFromBinary(path_to_binary)
  spec_words = ExtractWordsFromSpec(path_to_spec, is_html)

  selected_words = set()

  # Strategy i: only words which are common for binary and for specification.
  if 'i' in strategy:
    selected_words = binary_words.intersection(spec_words)

  # Strategy q: add words from all quoted strings from specification.
  # TODO(mmoroz): experimental and very noisy. Not recommended to use.
  if 'q' in strategy:
    indented_blocks = FindIndentedText(ReadSpecification(path_to_spec, is_html))
    selected_words.update(ExtractWordsFromLines(indented_blocks))

  # Strategy u: add all uppercase words from specification.
  if 'u' in strategy:
    selected_words.update(word for word in spec_words if word.isupper())

  return selected_words
152
153
154def PreprocessAndReadRodata(filepath):
155 """Create a stripped copy of the binary and extract .rodata section."""
156 stripped_file = tempfile.NamedTemporaryFile(prefix='.stripped_')
157 stripped_filepath = stripped_file.name
158 shutil.copyfile(filepath, stripped_filepath)
159
160 # Strip all symbols to reduce amount of redundant strings.
161 strip_cmd = ['strip', '--strip-all', stripped_filepath]
162 result = subprocess.call(strip_cmd)
163 if result:
164 logging.warning('Failed to strip the binary. Using the original version.')
165 stripped_filepath = filepath
166
167 # Extract .rodata section to reduce amount of redundant strings.
168 rodata_file = tempfile.NamedTemporaryFile(prefix='.rodata_')
169 rodata_filepath = rodata_file.name
170 objcopy_cmd = ['objcopy', '-j', '.rodata', stripped_filepath, rodata_filepath]
171
172 # Hide output from stderr since objcopy prints a warning.
173 with open(os.devnull, 'w') as devnull:
174 result = subprocess.call(objcopy_cmd, stderr=devnull)
175
176 if result:
177 logging.warning('Failed to extract .rodata section. Using the whole file.')
178 rodata_filepath = stripped_filepath
179
180 with open(rodata_filepath) as file_handle:
181 data = file_handle.read()
182
183 stripped_file.close()
184 rodata_file.close()
185
186 return data
187
188
def ReadSpecification(filepath, is_html):
  """Return the contents of a specification file.

  Args:
    filepath: Path to the specification.
    is_html: If true, the contents are passed through DecodeHTML() first.

  Returns:
    The file contents, HTML-decoded when is_html is true.
  """
  with open(filepath, 'r') as spec_file:
    contents = spec_file.read()

  return DecodeHTML(contents) if is_html else contents
198
199
def WriteDictionary(dictionary_path, dictionary):
  """Write the dictionary words to a file, one quoted entry per line.

  The file starts with a comment line. Each non-empty word is escaped with
  EscapeDictionaryElement() and written inside double quotes. Empty words are
  skipped.

  Args:
    dictionary_path: Path of the file to write; an existing file is replaced.
    dictionary: An iterable of words.
  """
  with open(dictionary_path, 'wb') as output_file:
    output_file.write('# This is an automatically generated dictionary.\n')
    # The generator is consumed lazily, so entries are escaped as they are
    # written.
    output_file.writelines(
        '"%s"\n' % EscapeDictionaryElement(entry)
        for entry in dictionary if entry)
209
210
def main():
  """Parse the command line, generate the dictionary and write it to --out."""
  parser = argparse.ArgumentParser(description="Generate fuzzer dictionary.")
  parser.add_argument('--fuzzer', required=True,
                      help='Path to a fuzzer binary executable. It is '
                      'recommended to use a binary built with '
                      '"use_libfuzzer=false is_asan=false" to get a better '
                      'dictionary with fewer number of redundant elements.')
  parser.add_argument('--spec', required=True,
                      help='Path to a target specification (in textual form).')
  # type=int is required here: without it "--html 0" arrives as the non-empty
  # string '0', which bool() below would turn into True.
  parser.add_argument('--html', default=0, type=int, choices=(0, 1),
                      help='Decode HTML [01] (0 is default value): '
                      '1 - if specification has HTML entities to be decoded.')
  parser.add_argument('--out', required=True,
                      help='Path to a file to write a dictionary into.')
  parser.add_argument('--strategy', default='iu',
                      help='Generation strategy [iqu] ("iu" is default value): '
                      'i - intersection, q - quoted, u - uppercase.')
  args = parser.parse_args()

  dictionary = GenerateDictionary(args.fuzzer, args.spec, args.strategy,
                                  is_html=bool(args.html))
  WriteDictionary(args.out, dictionary)


if __name__ == '__main__':
  main()