#!/usr/bin/env python
# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

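"""Shows what revision last modified each token of a file.

The result is rendered as an HTML page and opened in a web browser.

Example invocation (script name and file path are illustrative):
  ./uberblame.py HEAD~10 base/logging.cc
"""
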
import argparse
import cgi
import colorsys
import difflib
import random
import os
import re
import subprocess
import sys
import tempfile
import textwrap
import webbrowser


class TokenContext(object):
  """Metadata about a token.

  Attributes:
    row: Row index of the token in the data file.
    column: Column index of the token in the data file.
    token: The token string.
    commit: A Commit object that corresponds to the commit that added
      this token.
  """
  def __init__(self, row, column, token, commit=None):
    self.row = row
    self.column = column
    self.token = token
    self.commit = commit


class Commit(object):
  """Commit data.

  Attributes:
    hash: The commit hash.
    author_name: The author's name.
    author_email: The author's email.
    author_date: The date and time the author created this commit.
    message: The commit message.
    diff: The commit diff.
  """
  def __init__(self, hash, author_name, author_email, author_date, message,
               diff):
    self.hash = hash
    self.author_name = author_name
    self.author_email = author_email
    self.author_date = author_date
    self.message = message
    self.diff = diff


def tokenize_data(data):
  """Tokenizes |data|.

  Args:
    data: String to tokenize.

  Returns:
    A list of lists of TokenContexts; each inner list holds the tokens of
    one line.
  """
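  # Example (illustrative): tokenize_data('int foo = 5;') returns one line of
  # TokenContexts whose (row, column, token) values are
  #   (0, 0, 'int'), (0, 4, 'foo'), (0, 8, '='), (0, 10, '5'), (0, 11, ';')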
  contexts = []
  in_identifier = False
  identifier_start = 0
  identifier = ''
  row = 0
  column = 0
  line_contexts = []

  for c in data + '\n':
    if c.isalnum() or c == '_':
      if in_identifier:
        identifier += c
      else:
        in_identifier = True
        identifier_start = column
        identifier = c
    else:
      if in_identifier:
        line_contexts.append(
            TokenContext(row, identifier_start, identifier))
      in_identifier = False
      if not c.isspace():
        line_contexts.append(TokenContext(row, column, c))

    if c == '\n':
      row += 1
      column = 0
      contexts.append(line_contexts)
      line_contexts = []
    else:
      column += 1
  return contexts


def compute_unified_diff(old_tokens, new_tokens):
  """Computes the diff between |old_tokens| and |new_tokens|.

  Args:
    old_tokens: Token strings corresponding to the old data.
    new_tokens: Token strings corresponding to the new data.

  Returns:
    The diff, in unified diff format.
  """
  return difflib.unified_diff(old_tokens, new_tokens, n=0, lineterm='')


def parse_chunk_header_file_range(file_range):
  """Parses a chunk header file range.

  Diff chunk headers have the form:
    @@ -<file-range> +<file-range> @@
  File ranges have the form:
    <start line number>,<number of lines changed>

  Args:
    file_range: A chunk header file range.

  Returns:
    A tuple (range_start, range_end). The endpoints are adjusted such that
    iterating over [range_start, range_end) will give the changed indices.
  """
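  # Examples (illustrative):
  #   parse_chunk_header_file_range('7,3') == (6, 9)    # lines 7-9
  #   parse_chunk_header_file_range('12')  == (11, 12)  # single line 12
  #   parse_chunk_header_file_range('5,0') == (5, 5)    # empty range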
  if ',' in file_range:
    file_range_parts = file_range.split(',')
    start = int(file_range_parts[0])
    amount = int(file_range_parts[1])
    if amount == 0:
      return (start, start)
    return (start - 1, start + amount - 1)
  else:
    return (int(file_range) - 1, int(file_range))


def compute_changed_token_indices(previous_tokens, current_tokens):
  """Computes changed and added tokens.

  Args:
    previous_tokens: Tokens corresponding to the old file.
    current_tokens: Tokens corresponding to the new file.

  Returns:
    A tuple (added_tokens, changed_tokens).
      added_tokens: A list of indices into |current_tokens|.
      changed_tokens: A map of indices into |current_tokens| to
        indices into |previous_tokens|.
  """
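  # Example (illustrative): for previous_tokens = ['a', 'b', 'c'] and
  # current_tokens = ['a', 'x', 'c'], the unified diff has one chunk,
  # "@@ -2 +2 @@", so this returns
  #   added_tokens == [1]             # 'x' is new
  #   changed_tokens == {0: 0, 2: 2}  # 'a' and 'c' carried over unchanged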
  prev_file_chunk_end = 0
  prev_patched_chunk_end = 0
  added_tokens = []
  changed_tokens = {}
  for line in compute_unified_diff(previous_tokens, current_tokens):
    if line.startswith("@@"):
      parts = line.split(' ')
      removed = parts[1].lstrip('-')
      removed_start, removed_end = parse_chunk_header_file_range(removed)
      added = parts[2].lstrip('+')
      added_start, added_end = parse_chunk_header_file_range(added)
      for i in range(added_start, added_end):
        added_tokens.append(i)
      for i in range(0, removed_start - prev_patched_chunk_end):
        changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i
      prev_patched_chunk_end = removed_end
      prev_file_chunk_end = added_end
  for i in range(0, len(previous_tokens) - prev_patched_chunk_end):
    changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i
  return added_tokens, changed_tokens


def flatten_nested_list(l):
  """Flattens a list and provides a mapping from elements in the list back
  into the nested list.

  Args:
    l: A list of lists.

  Returns:
    A tuple (flattened, index_to_position):
      flattened: The flattened list.
      index_to_position: A map from index i in |flattened| to a pair (r, c)
        such that flattened[i] == l[r][c].
  """
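  # Example (illustrative):
  #   flatten_nested_list([['a', 'b'], [], ['c']]) ==
  #       (['a', 'b', 'c'], {0: (0, 0), 1: (0, 1), 2: (2, 0)})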
  flattened = []
  index_to_position = {}
  r = 0
  c = 0
  for nested_list in l:
    for element in nested_list:
      index_to_position[len(flattened)] = (r, c)
      flattened.append(element)
      c += 1
    r += 1
    c = 0
  return (flattened, index_to_position)


def compute_changed_token_positions(previous_tokens, current_tokens):
  """Computes changed and added token positions.

  Args:
    previous_tokens: A list of lists of token strings. Lines in the file
      correspond to the nested lists.
    current_tokens: A list of lists of token strings. Lines in the file
      correspond to the nested lists.

  Returns:
    A tuple (added_token_positions, changed_token_positions):
      added_token_positions: A list of pairs that index into |current_tokens|.
      changed_token_positions: A map from pairs that index into
        |current_tokens| to pairs that index into |previous_tokens|.
  """
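  # Example (illustrative): with previous_tokens = [['a', 'b'], ['c']] and
  # current_tokens = [['a'], ['x', 'c']], the flattened diff adds 'x' and
  # keeps 'a' and 'c', so this returns
  #   added_token_positions == [(1, 0)]
  #   changed_token_positions == {(0, 0): (0, 0), (1, 1): (1, 0)}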
  flat_previous_tokens, previous_index_to_position = flatten_nested_list(
      previous_tokens)
  flat_current_tokens, current_index_to_position = flatten_nested_list(
      current_tokens)
  added_indices, changed_indices = compute_changed_token_indices(
      flat_previous_tokens, flat_current_tokens)
  added_token_positions = [current_index_to_position[i] for i in added_indices]
  changed_token_positions = {
      current_index_to_position[current_i]:
          previous_index_to_position[changed_indices[current_i]]
      for current_i in changed_indices
  }
  return (added_token_positions, changed_token_positions)


def parse_chunks_from_diff(diff):
  """Returns a generator of chunk data from a diff.

  Args:
    diff: A list of strings, with each string being a line from a diff
      in unified diff format.

  Returns:
    A generator of tuples (added_lines_start, added_lines_end,
    removed_lines, removed_lines_start)
  """
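  # Example (illustrative): for the diff lines
  #   ['@@ -3,2 +4 @@', '-old1', '-old2', '+new']
  # this yields a single chunk (3, 4, ['old1', 'old2'], 2): line 4 of the new
  # file (0-indexed 3) replaced the two removed lines that started at
  # 0-indexed line 2 of the old file.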
  in_chunk = False
  chunk_previous = []
  previous_start = None
  current_start = None
  current_end = None
  for line in diff:
    if line.startswith('@@'):
      if in_chunk:
        yield (current_start, current_end,
               chunk_previous, previous_start)
      parts = line.split(' ')
      previous = parts[1].lstrip('-')
      previous_start, _ = parse_chunk_header_file_range(previous)
      current = parts[2].lstrip('+')
      current_start, current_end = parse_chunk_header_file_range(current)
      in_chunk = True
      chunk_previous = []
    elif in_chunk and line.startswith('-'):
      chunk_previous.append(line[1:])
  if current_start != None:
    yield (current_start, current_end,
           chunk_previous, previous_start)


def should_skip_commit(commit):
  """Decides if |commit| should be skipped when computing the blame.

  Commit 5d4451e deleted all files in the repo except for DEPS. The
  next commit, 1e7896, brought them back. This is a hack to skip
  those commits (except for the files they modified). If we did not
  do this, changes would be incorrectly attributed to 1e7896.

  Args:
    commit: A Commit object.

  Returns:
    A boolean indicating if this commit should be skipped.
  """
  banned_commits = [
      '1e78967ed2f1937b3809c19d91e7dd62d756d307',
      '5d4451ebf298d9d71f716cc0135f465cec41fcd0',
  ]
  if commit.hash not in banned_commits:
    return False
  banned_commits_file_exceptions = [
      'DEPS',
      'chrome/browser/ui/views/file_manager_dialog_browsertest.cc',
  ]
  for line in commit.diff:
    if line.startswith('---') or line.startswith('+++'):
      if line.split(' ')[1] in banned_commits_file_exceptions:
        return False
    elif line.startswith('@@'):
      return True
  assert False


def generate_substrings(file):
  """Generates substrings from a file stream, where substrings are
  separated by '\0'.

  For example, the input:
    'a\0bc\0\0\0d\0'
  would produce the output:
    ['a', 'bc', 'd']

  Args:
    file: A readable file.
  """
  BUF_SIZE = 448  # Experimentally found to be pretty fast.
  data = []
  while True:
    buf = file.read(BUF_SIZE)
    parts = buf.split('\0')
    data.append(parts[0])
    if len(parts) > 1:
      joined = ''.join(data)
      if joined != '':
        yield joined
      for i in range(1, len(parts) - 1):
        if parts[i] != '':
          yield parts[i]
      data = [parts[-1]]
    if len(buf) < BUF_SIZE:
      joined = ''.join(data)
      if joined != '':
        yield joined
      return


def generate_commits(git_log_stdout):
  """Parses git log output into a stream of Commit objects.
  """
  substring_generator = generate_substrings(git_log_stdout)
  while True:
    hash = substring_generator.next()
    author_name = substring_generator.next()
    author_email = substring_generator.next()
    author_date = substring_generator.next()
    message = substring_generator.next()
    diff = substring_generator.next().split('\n')
    yield Commit(hash, author_name, author_email, author_date, message, diff)


def uberblame_aux(file_name, git_log_stdout, data):
  """Computes the uberblame of file |file_name|.

  Args:
    file_name: File to uberblame.
    git_log_stdout: A file object that represents the git log output.
    data: A string containing the data of file |file_name|.

  Returns:
    A tuple (data, blame).
      data: File contents.
      blame: A list of lists of TokenContexts, one list per line.
  """
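  # High-level summary of the loop below: commits are visited newest-first.
  # For each diff chunk, tokens that exist only in the newer revision are
  # attributed to that commit; tokens that also exist in the older revision
  # are mapped back to their positions in the older revision so that an
  # earlier commit can claim them instead.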
  blame = tokenize_data(data)

  blamed_tokens = 0
  total_tokens = len(blame)
  uber_blame = (data, blame[:])

  for commit in generate_commits(git_log_stdout):
    if should_skip_commit(commit):
      continue

    offset = 0
    for (added_lines_start, added_lines_end, removed_lines,
         removed_lines_start) in parse_chunks_from_diff(commit.diff):
      added_lines_start += offset
      added_lines_end += offset
      previous_contexts = [token_lines
                           for line_previous in removed_lines
                           for token_lines in tokenize_data(line_previous)]
      previous_tokens = [
          [context.token for context in contexts]
          for contexts in previous_contexts
      ]
      current_contexts = blame[added_lines_start:added_lines_end]
      current_tokens = [
          [context.token for context in contexts]
          for contexts in current_contexts
      ]
      added_token_positions, changed_token_positions = (
          compute_changed_token_positions(previous_tokens, current_tokens))
      for r, c in added_token_positions:
        current_contexts[r][c].commit = commit
        blamed_tokens += 1
      for r, c in changed_token_positions:
        pr, pc = changed_token_positions[(r, c)]
        previous_contexts[pr][pc] = current_contexts[r][c]

      assert added_lines_start <= added_lines_end <= len(blame)
      current_blame_size = len(blame)
      blame[added_lines_start:added_lines_end] = previous_contexts
      offset += len(blame) - current_blame_size

  assert blame == [] or blame == [[]]
  return uber_blame


def uberblame(file_name, revision):
  """Computes the uberblame of file |file_name|.

  Args:
    file_name: File to uberblame.
    revision: The revision to start the uberblame at.

  Returns:
    A tuple (data, blame).
      data: File contents.
      blame: A list of lists of TokenContexts, one list per line.
  """
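  # The %x00 escapes in --format emit NUL bytes around each commit's fields
  # (hash, author name, author email, author date, message body). Together
  # with -z this produces a NUL-separated stream, which generate_substrings()
  # splits into the values consumed by generate_commits().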
  cmd_git_log = [
      'git',
      'log',
      '--minimal',
      '--no-prefix',
      '--follow',
      '-m',
      '--first-parent',
      '-p',
      '-U0',
      '-z',
      '--format=%x00%H%x00%an%x00%ae%x00%ad%x00%B',
      revision,
      '--',
      file_name
  ]
  git_log = subprocess.Popen(cmd_git_log,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
  data = subprocess.check_output(
      ['git', 'show', '%s:%s' % (revision, file_name)])
  data, blame = uberblame_aux(file_name, git_log.stdout, data)

  _, stderr = git_log.communicate()
  if git_log.returncode != 0:
    raise subprocess.CalledProcessError(git_log.returncode, cmd_git_log, stderr)
  return data, blame


def generate_pastel_color():
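  """Generates a random light (pastel) color as an HTML '#RRGGBB' string."""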
  (h, l, s) = (random.uniform(0, 1),
               random.uniform(0.8, 0.9),
               random.uniform(0.5, 1))
  (r, g, b) = colorsys.hls_to_rgb(h, l, s)
  return "#%0.2X%0.2X%0.2X" % (int(r*255), int(g*255), int(b*255))


def visualize_uberblame(data, blame):
  """Creates a web page to visualize |blame|.

  Args:
    data: The data file as returned by uberblame().
    blame: A list of TokenContexts as returned by uberblame().

  Returns:
    The HTML for the generated page, as a string.
  """
  # Use the same seed for the color generator on each run so that
  # loading the same blame of the same file twice will result in the
  # same generated HTML page.
  random.seed(0x52937865ec62d1ea)
  html = """\
  <html>
    <head>
      <style>
      body {
        font-family: "Courier New";
      }
      pre {
        display: inline;
      }
      span {
        outline: 1pt solid #00000030;
        outline-offset: -1pt;
        cursor: pointer;
      }
      #linenums {
        text-align: right;
      }
      #file_display {
        position: absolute;
        left: 0;
        top: 0;
        width: 50%%;
        height: 100%%;
        overflow: scroll;
      }
      #commit_display_container {
        position: absolute;
        left: 50%%;
        top: 0;
        width: 50%%;
        height: 100%%;
        overflow: scroll;
      }
      </style>
      <script>
      commit_data = %s;
      function display_commit(hash) {
        var e = document.getElementById("commit_display");
        e.innerHTML = commit_data[hash]
      }
      </script>
    </head>
    <body>
      <div id="file_display">
        <table>
          <tbody>
            <tr>
              <td valign="top" id="linenums">
                <pre>%s</pre>
              </td>
              <td valign="top">
                <pre>%s</pre>
              </td>
            </tr>
          </tbody>
        </table>
      </div>
      <div id="commit_display_container" valign="top">
        <pre id="commit_display" />
      </div>
    </body>
  </html>
  """
  html = textwrap.dedent(html)
  commits = {}
  lines = []
  commit_colors = {}
  blame_index = 0
  blame = [context for contexts in blame for context in contexts]
  row = 0
  lastline = ''
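  # Walk the file one character at a time, opening a <span> at the first
  # character of each run of tokens attributed to the same commit and closing
  # it after the last one; each span is colored by commit and made clickable
  # via display_commit().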
  for line in data.split('\n'):
    lastline = line
    column = 0
    for c in line + '\n':
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if (row == token_context.row and
            column == token_context.column + len(token_context.token)):
          if (blame_index + 1 == len(blame) or
              blame[blame_index].commit.hash !=
              blame[blame_index + 1].commit.hash):
            lines.append('</span>')
          blame_index += 1
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if row == token_context.row and column == token_context.column:
          if (blame_index == 0 or
              blame[blame_index - 1].commit.hash !=
              blame[blame_index].commit.hash):
            hash = token_context.commit.hash
            commits[hash] = token_context.commit
            if hash not in commit_colors:
              commit_colors[hash] = generate_pastel_color()
            color = commit_colors[hash]
            lines.append(
                ('<span style="background-color: %s" ' +
                 'onclick="display_commit(&quot;%s&quot;)">') % (color, hash))
      lines.append(cgi.escape(c))
      column += 1
    row += 1
  commit_data = ['{']
  commit_display_format = """\
  commit: {hash}
  Author: {author_name} <{author_email}>
  Date: {author_date}

  {message}
  """
  commit_display_format = textwrap.dedent(commit_display_format)
  links = re.compile(r'(https?:\/\/\S+)')
  for hash in commits:
    commit = commits[hash]
    commit_display = commit_display_format.format(
        hash=hash,
        author_name=commit.author_name,
        author_email=commit.author_email,
        author_date=commit.author_date,
        message=commit.message,
    )
    commit_display = cgi.escape(commit_display, quote=True)
    commit_display = re.sub(
        links, '<a href=\\"\\1\\">\\1</a>', commit_display)
    commit_display = commit_display.replace('\n', '\\n')
    commit_data.append('"%s": "%s",' % (hash, commit_display))
  commit_data.append('}')
  commit_data = ''.join(commit_data)
  line_nums = range(1, row if lastline.strip() == '' else row + 1)
  line_nums = '\n'.join([str(num) for num in line_nums])
  lines = ''.join(lines)
  return html % (commit_data, line_nums, lines)


def show_visualization(html):
  """Display |html| in a web browser.

  Args:
    html: The contents of the file to display, as a string.
  """
  # Keep the temporary file around so the browser has time to open it.
  # TODO(thomasanderson): spin up a temporary web server to serve this
  # file so we don't have to leak it.
  html_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
  html_file.write(html)
  html_file.flush()
  if sys.platform.startswith('linux'):
    # Don't show any messages when starting the browser.
    saved_stdout = os.dup(1)
    saved_stderr = os.dup(2)
    os.close(1)
    os.close(2)
    os.open(os.devnull, os.O_RDWR)
    os.open(os.devnull, os.O_RDWR)
  webbrowser.open('file://' + html_file.name)
  if sys.platform.startswith('linux'):
    os.dup2(saved_stdout, 1)
    os.dup2(saved_stderr, 2)
    os.close(saved_stdout)
    os.close(saved_stderr)


def main():
  parser = argparse.ArgumentParser(
      description='Show what revision last modified each token of a file')
  parser.add_argument('revision', default='HEAD', nargs='?',
                      help='Show only commits starting from a revision.')
  parser.add_argument('file', help='The file to uberblame.')
  args = parser.parse_args()

  data, blame = uberblame(args.file, args.revision)
  html = visualize_uberblame(data, blame)
  show_visualization(html)
  return 0


if __name__ == '__main__':
  sys.exit(main())