#!/usr/bin/env python
# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import argparse
import cgi
import colorsys
import difflib
import os
import random
import re
import subprocess
import sys
import tempfile
import textwrap
import webbrowser


class TokenContext(object):
  """Metadata about a token.

  Attributes:
    row: Row index of the token in the data file.
    column: Column index of the token in the data file.
    token: The token string.
    commit: Hash of the git commit that added this token.
  """
  def __init__(self, row, column, token, commit=None):
    self.row = row
    self.column = column
    self.token = token
    self.commit = commit


class Commit(object):
  """Commit data.

  Attributes:
    hash: The commit hash.
    diff: The commit diff.
  """
  def __init__(self, hash, diff):
    self.hash = hash
    self.diff = diff


def tokenize_data(data):
  """Tokenizes |data|.

  Args:
    data: String to tokenize.

  Returns:
    A list of lists of TokenContexts.  Each inner list contains the tokens
    of one line in |data|.
  """
  contexts = []
  in_identifier = False
  identifier_start = 0
  identifier = ''
  row = 0
  column = 0
  line_contexts = []

  for c in data + '\n':
    if c.isalnum() or c == '_':
      if in_identifier:
        identifier += c
      else:
        in_identifier = True
        identifier_start = column
        identifier = c
    else:
      if in_identifier:
        line_contexts.append(
            TokenContext(row, identifier_start, identifier))
      in_identifier = False
      if not c.isspace():
        line_contexts.append(TokenContext(row, column, c))

    if c == '\n':
      row += 1
      column = 0
      contexts.append(line_contexts)
      line_contexts = []
    else:
      column += 1
  return contexts
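
# Illustrative example (not part of the original script): identifiers
# ([A-Za-z0-9_]+) are kept whole, punctuation becomes single-character tokens,
# and whitespace is dropped.  For instance, tokenize_data('ab + c1') produces
# one line of three TokenContexts whose (token, row, column) values are
# ('ab', 0, 0), ('+', 0, 3) and ('c1', 0, 5).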


def compute_unified_diff(old_tokens, new_tokens):
  """Computes the diff between |old_tokens| and |new_tokens|.

  Args:
    old_tokens: Token strings corresponding to the old data.
    new_tokens: Token strings corresponding to the new data.

  Returns:
    The diff, in unified diff format.
  """
  return difflib.unified_diff(old_tokens, new_tokens, n=0, lineterm='')


def parse_chunk_header_file_range(file_range):
  """Parses a chunk header file range.

  Diff chunk headers have the form:
    @@ -<file-range> +<file-range> @@
  File ranges have the form:
    <start line number>,<number of lines changed>

  Args:
    file_range: A chunk header file range.

  Returns:
    A tuple (range_start, range_end).  The endpoints are adjusted such that
    iterating over [range_start, range_end) will give the changed indices.
  """
  if ',' in file_range:
    file_range_parts = file_range.split(',')
    start = int(file_range_parts[0])
    amount = int(file_range_parts[1])
    if amount == 0:
      return (start, start)
    return (start - 1, start + amount - 1)
  else:
    return (int(file_range) - 1, int(file_range))
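
# Illustrative examples (not part of the original script):
#   parse_chunk_header_file_range('12,3') == (11, 14)  # lines 12-14, 0-based
#   parse_chunk_header_file_range('7') == (6, 7)       # a single changed line
#   parse_chunk_header_file_range('5,0') == (5, 5)     # an empty range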


def compute_changed_token_indices(previous_tokens, current_tokens):
  """Computes changed and added tokens.

  Args:
    previous_tokens: Tokens corresponding to the old file.
    current_tokens: Tokens corresponding to the new file.

  Returns:
    A tuple (added_tokens, changed_tokens).
      added_tokens: A list of indices into |current_tokens|.
      changed_tokens: A map from indices into |current_tokens| to the
        corresponding indices into |previous_tokens|, for tokens that are
        present in both versions.
  """
  prev_file_chunk_end = 0
  prev_patched_chunk_end = 0
  added_tokens = []
  changed_tokens = {}
  for line in compute_unified_diff(previous_tokens, current_tokens):
    if line.startswith("@@"):
      parts = line.split(' ')
      removed = parts[1].lstrip('-')
      removed_start, removed_end = parse_chunk_header_file_range(removed)
      added = parts[2].lstrip('+')
      added_start, added_end = parse_chunk_header_file_range(added)
      for i in range(added_start, added_end):
        added_tokens.append(i)
      for i in range(0, removed_start - prev_patched_chunk_end):
        changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i
      prev_patched_chunk_end = removed_end
      prev_file_chunk_end = added_end
  for i in range(0, len(previous_tokens) - prev_patched_chunk_end):
    changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i
  return added_tokens, changed_tokens
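
# Illustrative example (not part of the original script):
#   compute_changed_token_indices(['a', 'b', 'c'], ['a', 'x', 'c'])
# returns ([1], {0: 0, 2: 2}): the new token 'x' at index 1 was added, while
# the surviving tokens 'a' and 'c' map back to their old indices.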


def flatten_nested_list(l):
  """Flattens a list and provides a mapping from elements in the list back
  into the nested list.

  Args:
    l: A list of lists.

  Returns:
    A tuple (flattened, index_to_position):
      flattened: The flattened list.
      index_to_position: A map from index i in |flattened| to a pair (r, c)
        such that flattened[i] == l[r][c].
  """
  flattened = []
  index_to_position = {}
  r = 0
  c = 0
  for nested_list in l:
    for element in nested_list:
      index_to_position[len(flattened)] = (r, c)
      flattened.append(element)
      c += 1
    r += 1
    c = 0
  return (flattened, index_to_position)
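
# Illustrative example (not part of the original script):
#   flatten_nested_list([['a', 'b'], [], ['c']])
# returns (['a', 'b', 'c'], {0: (0, 0), 1: (0, 1), 2: (2, 0)}).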


def compute_changed_token_positions(previous_tokens, current_tokens):
  """Computes changed and added token positions.

  Args:
    previous_tokens: A list of lists of token strings.  Lines in the file
      correspond to the nested lists.
    current_tokens: A list of lists of token strings.  Lines in the file
      correspond to the nested lists.

  Returns:
    A tuple (added_token_positions, changed_token_positions):
      added_token_positions: A list of pairs that index into |current_tokens|.
      changed_token_positions: A map from pairs that index into
        |current_tokens| to pairs that index into |previous_tokens|.
  """
  flat_previous_tokens, previous_index_to_position = flatten_nested_list(
      previous_tokens)
  flat_current_tokens, current_index_to_position = flatten_nested_list(
      current_tokens)
  added_indices, changed_indices = compute_changed_token_indices(
      flat_previous_tokens, flat_current_tokens)
  added_token_positions = [current_index_to_position[i] for i in added_indices]
  changed_token_positions = {
      current_index_to_position[current_i]:
          previous_index_to_position[changed_indices[current_i]]
      for current_i in changed_indices
  }
  return (added_token_positions, changed_token_positions)
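
# Illustrative example (not part of the original script): with
#   previous_tokens = [['a', 'b'], ['c']]
#   current_tokens = [['a', 'x'], ['c']]
# the result is ([(0, 1)], {(0, 0): (0, 0), (1, 0): (1, 0)}): the new token
# 'x' on line 0 was added, and the surviving tokens map back to their old
# (line, token-index) positions.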


def parse_chunks_from_diff(diff):
  """Returns a generator of chunk data from a diff.

  Args:
    diff: A list of strings, with each string being a line from a diff
      in unified diff format.

  Returns:
    A generator of tuples (added_lines_start, added_lines_end,
    removed_lines, removed_lines_start)
  """
  in_chunk = False
  chunk_previous = []
  previous_start = None
  current_start = None
  current_end = None
  for line in diff:
    if line.startswith('@@'):
      if in_chunk:
        yield (current_start, current_end,
               chunk_previous, previous_start)
      parts = line.split(' ')
      previous = parts[1].lstrip('-')
      previous_start, _ = parse_chunk_header_file_range(previous)
      current = parts[2].lstrip('+')
      current_start, current_end = parse_chunk_header_file_range(current)
      in_chunk = True
      chunk_previous = []
    elif in_chunk and line.startswith('-'):
      chunk_previous.append(line[1:])
  if current_start is not None:
    yield (current_start, current_end,
           chunk_previous, previous_start)
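
# Illustrative example (not part of the original script): for the diff lines
#   ['@@ -3,2 +3 @@', '-old line one', '-old line two', '+new line']
# the generator yields a single chunk:
#   (2, 3, ['old line one', 'old line two'], 2)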


def should_skip_commit(commit):
  """Decides if |commit| should be skipped when computing the blame.

  Commit 5d4451e deleted all files in the repo except for DEPS.  The
  next commit, 1e7896, brought them back.  This is a hack to skip
  those commits (except for the files they modified).  If we did not
  do this, changes would be incorrectly attributed to 1e7896.

  Args:
    commit: A Commit object.

  Returns:
    A boolean indicating if this commit should be skipped.
  """
  banned_commits = [
      '1e78967ed2f1937b3809c19d91e7dd62d756d307',
      '5d4451ebf298d9d71f716cc0135f465cec41fcd0',
  ]
  if commit.hash not in banned_commits:
    return False
  banned_commits_file_exceptions = [
      'DEPS',
      'chrome/browser/ui/views/file_manager_dialog_browsertest.cc',
  ]
  for line in commit.diff:
    if line.startswith('---') or line.startswith('+++'):
      if line.split(' ')[1] in banned_commits_file_exceptions:
        return False
    elif line.startswith('@@'):
      return True
  assert False


def generate_substrings(file):
  """Generates substrings from a file stream, where substrings are
  separated by '\0'.

  For example, the input:
    'a\0bc\0\0\0d\0'
  would produce the output:
    ['a', 'bc', 'd']

  Args:
    file: A readable file.
  """
  data = ''
  while True:
    ch = file.read(1)
    if ch == '':
      break
    if ch == '\0':
      if data != '':
        yield data
      data = ''
    else:
      data += ch
  if data != '':
    yield data


def generate_commits(git_log_stdout):
  """Parses git log output into a stream of Commit objects.
  """
  substring_generator = generate_substrings(git_log_stdout)
  while True:
    hash = substring_generator.next().strip('\n')
    diff = substring_generator.next().strip('\n').split('\n')
    yield Commit(hash, diff)
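
# Note added for illustration (not part of the original script):
# generate_commits() expects the stream to contain NUL-separated records that
# alternate between a commit hash and that commit's diff.  For example, bytes
# like
#   '\0c0ffee1\0diff --git ...\n@@ -1 +1 @@\n-old\n+new\0deadbe2\0...'
# would yield Commit('c0ffee1', [...]) followed by Commit('deadbe2', [...]).
# This is the record layout the `git log -z --format=%x00%h -p` invocation in
# uberblame() below is intended to produce; the hashes here are made up.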


def uberblame_aux(file_name, git_log_stdout, data):
  """Computes the uberblame of file |file_name|.

  Args:
    file_name: File to uberblame.
    git_log_stdout: A file object that represents the git log output.
    data: A string containing the data of file |file_name|.

  Returns:
    A tuple (data, blame).
      data: File contents.
      blame: A list of lists of TokenContexts, one list per line of the file.
  """
  blame = tokenize_data(data)

  blamed_tokens = 0
  total_tokens = len(blame)
  uber_blame = (data, blame[:])

  for commit in generate_commits(git_log_stdout):
    if should_skip_commit(commit):
      continue

    offset = 0
    for (added_lines_start, added_lines_end, removed_lines,
         removed_lines_start) in parse_chunks_from_diff(commit.diff):
      added_lines_start += offset
      added_lines_end += offset
      previous_contexts = [token_lines
                           for line_previous in removed_lines
                           for token_lines in tokenize_data(line_previous)]
      previous_tokens = [
          [context.token for context in contexts]
          for contexts in previous_contexts
      ]
      current_contexts = blame[added_lines_start:added_lines_end]
      current_tokens = [
          [context.token for context in contexts]
          for contexts in current_contexts
      ]
      added_token_positions, changed_token_positions = (
          compute_changed_token_positions(previous_tokens, current_tokens))
      for r, c in added_token_positions:
        current_contexts[r][c].commit = commit.hash
        blamed_tokens += 1
      for r, c in changed_token_positions:
        pr, pc = changed_token_positions[(r, c)]
        previous_contexts[pr][pc] = current_contexts[r][c]

      assert added_lines_start <= added_lines_end <= len(blame)
      current_blame_size = len(blame)
      blame[added_lines_start:added_lines_end] = previous_contexts
      offset += len(blame) - current_blame_size

  assert blame == [] or blame == [[]]
  return uber_blame


def uberblame(file_name, revision):
  """Computes the uberblame of file |file_name|.

  Args:
    file_name: File to uberblame.
    revision: The revision to start the uberblame at.

  Returns:
    A tuple (data, blame).
      data: File contents.
      blame: A list of lists of TokenContexts, one list per line of the file.
  """
  # -U0 asks for zero context lines; --format=%x00%h prints a NUL byte
  # followed by the abbreviated commit hash, and -z NUL-terminates each log
  # entry, which is what generate_substrings() splits on.
  cmd_git_log = ['git', 'log', '--minimal', '--no-prefix', '--follow', '-m',
                 '--first-parent', '-p', '-U0', '-z', '--format=%x00%h',
                 revision, '--', file_name]
  git_log = subprocess.Popen(cmd_git_log,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
  data = subprocess.check_output(
      ['git', 'show', '%s:%s' % (revision, file_name)])
  data, blame = uberblame_aux(file_name, git_log.stdout, data)

  _, stderr = git_log.communicate()
  if git_log.returncode != 0:
    raise subprocess.CalledProcessError(git_log.returncode, cmd_git_log, stderr)
  return data, blame


def generate_pastel_color():
  (h, l, s) = (random.uniform(0, 1),
               random.uniform(0.8, 0.9),
               random.uniform(0.5, 1))
  (r, g, b) = colorsys.hls_to_rgb(h, l, s)
  return "#%0.2X%0.2X%0.2X" % (int(r * 255), int(g * 255), int(b * 255))


def visualize_uberblame(data, blame):
  """Creates a web page to visualize |blame|.

  Args:
    data: The data file as returned by uberblame().
    blame: A list of lists of TokenContexts as returned by uberblame().

  Returns:
    The HTML for the generated page, as a string.
  """
  # Use the same seed for the color generator on each run so that
  # loading the same blame of the same file twice will result in the
  # same generated HTML page.
  random.seed(0x52937865ec62d1ea)
  html = """\
  <html>
    <head>
      <style>
        body {
          font-family: "Courier New";
        }
        pre {
          display: inline;
        }
        a {
          color: #000000;
          text-decoration: none;
        }
        span {
          outline: 1pt solid #00000030;
          outline-offset: -1pt;
        }
        #linenums {
          text-align: right;
        }
      </style>
    </head>
    <body>
      <table>
        <tbody>
          <tr>
            <td valign="top" id="linenums">
              <pre>%s</pre>
            </td>
            <td valign="top">
              <pre>%s</pre>
            </td>
          </tr>
        </tbody>
      </table>
    </body>
  </html>
  """
  html = textwrap.dedent(html)
  lines = []
  commit_colors = {}
  blame_index = 0
  blame = [context for contexts in blame for context in contexts]
  row = 0
  lastline = ''
  for line in data.split('\n'):
    lastline = line
    column = 0
    for c in line + '\n':
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if (row == token_context.row and
            column == token_context.column + len(token_context.token)):
          if (blame_index + 1 == len(blame) or
              blame[blame_index].commit != blame[blame_index + 1].commit):
            lines.append('</a></span>')
          blame_index += 1
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if row == token_context.row and column == token_context.column:
          if (blame_index == 0 or
              blame[blame_index - 1].commit != blame[blame_index].commit):
            commit = token_context.commit
            assert commit is not None
            lines.append(('<a href="https://chromium.googlesource.com/' +
                          'chromium/src/+/%s">') % commit)
            if commit not in commit_colors:
              commit_colors[commit] = generate_pastel_color()
            color = commit_colors[commit]
            lines.append('<span style="background-color: %s">' % color)
      lines.append(cgi.escape(c))
      column += 1
    row += 1
  line_nums = range(1, row if lastline.strip() == '' else row + 1)
  line_nums = '\n'.join([str(num) for num in line_nums])
  lines = ''.join(lines)
  return html % (line_nums, lines)


def show_visualization(html):
  """Display |html| in a web browser.

  Args:
    html: The contents of the file to display, as a string.
  """
  # Keep the temporary file around so the browser has time to open it.
  # TODO(thomasanderson): spin up a temporary web server to serve this
  # file so we don't have to leak it.
  html_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
  html_file.write(html)
  html_file.flush()
  if sys.platform.startswith('linux'):
    # Don't show any messages when starting the browser.
    saved_stdout = os.dup(1)
    saved_stderr = os.dup(2)
    os.close(1)
    os.close(2)
    os.open(os.devnull, os.O_RDWR)
    os.open(os.devnull, os.O_RDWR)
  webbrowser.open('file://' + html_file.name)
  if sys.platform.startswith('linux'):
    os.dup2(saved_stdout, 1)
    os.dup2(saved_stderr, 2)
    os.close(saved_stdout)
    os.close(saved_stderr)


def main():
  parser = argparse.ArgumentParser(
      description='Show what revision last modified each token of a file')
  parser.add_argument('revision', default='HEAD', nargs='?',
                      help='Show only commits starting from a revision.')
  parser.add_argument('file', help='The file to uberblame.')
  args = parser.parse_args()

  data, blame = uberblame(args.file, args.revision)
  html = visualize_uberblame(data, blame)
  show_visualization(html)
  return 0
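
# Example invocations (illustrative, not part of the original script; the
# path to this script depends on where it is checked out):
#   ./uberblame.py path/to/file.cc             # blame starting from HEAD
#   ./uberblame.py <revision> path/to/file.cc  # blame starting from <revision>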


if __name__ == '__main__':
  sys.exit(main())