Tom Anderson | 773d809 | 2022-03-23 20:47:29 | [diff] [blame] | 1 | #!/usr/bin/env python3 |
Avi Drissman | dfd88085 | 2022-09-15 20:11:09 | [diff] [blame] | 2 | # Copyright 2017 The Chromium Authors |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
| 6 | import argparse |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 7 | import colorsys |
| 8 | import difflib |
Tom Anderson | 773d809 | 2022-03-23 20:47:29 | [diff] [blame] | 9 | import html |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 10 | import random |
| 11 | import os |
| 12 | import re |
| 13 | import subprocess |
| 14 | import sys |
| 15 | import tempfile |
| 16 | import textwrap |
| 17 | import webbrowser |
| 18 | |
| 19 | |
class TokenContext(object):
  """Location and provenance of a single token in the blamed file.

  Attributes:
    row: Row index of the token in the data file.
    column: Column index of the token in the data file.
    token: The token string.
    commit: A Commit object for the commit that added this token, or
      None while the owning commit is still unknown.
  """

  def __init__(self, row, column, token, commit=None):
    self.row = row
    self.column = column
    self.token = token
    self.commit = commit
| 36 | |
| 37 | |
class Commit(object):
  """A single parsed entry from the git log.

  Attributes:
    hash: The commit hash.
    author_name: The author's name.
    author_email: The author's email.
    author_date: The date and time the author created this commit.
    message: The commit message.
    diff: The commit diff, as a list of lines.
  """

  def __init__(self, hash, author_name, author_email, author_date, message,
               diff):
    self.hash = hash
    self.author_name = author_name
    self.author_email = author_email
    self.author_date = author_date
    self.message = message
    self.diff = diff
| 58 | |
| 59 | |
def tokenize_data(data, tokenize_by_char, tokenize_whitespace):
  """Tokenizes |data|.

  Args:
    data: String to tokenize.
    tokenize_by_char: If true, individual characters are treated as tokens.
      Otherwise, tokens are either symbols or strings of both alphanumeric
      characters and underscores.
    tokenize_whitespace: Treat non-newline whitespace characters as tokens.

  Returns:
    A list of lists of TokenContexts.  Each inner list represents one line.
  """
  contexts = []
  in_identifier = False
  identifier_start = 0
  identifier = ''
  row = 0
  column = 0
  line_contexts = []

  for c in data:
    if not tokenize_by_char and (c.isalnum() or c == '_'):
      # Extend (or start) the current multi-character identifier token.
      if in_identifier:
        identifier += c
      else:
        in_identifier = True
        identifier_start = column
        identifier = c
    else:
      # The current character terminates any in-progress identifier.
      if in_identifier:
        line_contexts.append(TokenContext(row, identifier_start, identifier))
      in_identifier = False
      # Newlines are never tokens; other whitespace only when requested.
      if not c.isspace() or (tokenize_whitespace and c != '\n'):
        line_contexts.append(TokenContext(row, column, c))

    if c == '\n':
      row += 1
      column = 0
      contexts.append(line_contexts)
      line_contexts = []
    else:
      column += 1
  # Bug fix: flush a trailing identifier when |data| does not end with a
  # delimiter (diff lines carry no trailing newline); previously the final
  # token of such input was silently dropped.
  if in_identifier:
    line_contexts.append(TokenContext(row, identifier_start, identifier))
  contexts.append(line_contexts)
  return contexts
| 106 | |
| 107 | |
def compute_unified_diff(old_tokens, new_tokens):
  """Diffs |old_tokens| against |new_tokens|.

  Args:
    old_tokens: Token strings corresponding to the old data.
    new_tokens: Token strings corresponding to the new data.

  Returns:
    The diff, in unified diff format, as a generator of lines.
  """
  # Zero lines of context (n=0) keeps chunk headers tight around the
  # actual changes, which the chunk-range arithmetic downstream relies on.
  diff_lines = difflib.unified_diff(old_tokens, new_tokens, n=0, lineterm='')
  return diff_lines
| 119 | |
| 120 | |
def parse_chunk_header_file_range(file_range):
  """Parses one file range out of a unified diff chunk header.

  Chunk headers look like:
    @@ -<file-range> +<file-range> @@
  where a file range is either "<start>" or "<start>,<count>".

  Args:
    file_range: A chunk header file range string.

  Returns:
    A tuple (range_start, range_end), adjusted so that iterating over
    [range_start, range_end) visits exactly the changed 0-based indices.
  """
  if ',' not in file_range:
    # Single-line form: exactly one changed line at |file_range|.
    start_line = int(file_range)
    return (start_line - 1, start_line)
  start_text, count_text = file_range.split(',')
  start_line = int(start_text)
  line_count = int(count_text)
  if line_count == 0:
    # An empty range (pure insertion point) stays empty.
    return (start_line, start_line)
  return (start_line - 1, start_line + line_count - 1)
| 145 | |
| 146 | |
def compute_changed_token_indices(previous_tokens, current_tokens):
  """Computes changed and added tokens.

  Args:
    previous_tokens: Tokens corresponding to the old file.
    current_tokens: Tokens corresponding to the new file.

  Returns:
    A tuple (added_tokens, changed_tokens).
    added_tokens: A list of indices into |current_tokens|.
    changed_tokens: A map of indices into |current_tokens| to
      indices into |previous_tokens|.
  """
  # Track where the previous chunk ended in each file so the unchanged
  # stretch between chunks can be mapped old-index -> new-index.
  old_chunk_end = 0
  new_chunk_end = 0
  added_tokens = []
  changed_tokens = {}
  for diff_line in compute_unified_diff(previous_tokens, current_tokens):
    if not diff_line.startswith("@@"):
      continue
    header_parts = diff_line.split(' ')
    removed_start, removed_end = parse_chunk_header_file_range(
        header_parts[1].lstrip('-'))
    added_start, added_end = parse_chunk_header_file_range(
        header_parts[2].lstrip('+'))
    added_tokens.extend(range(added_start, added_end))
    # Everything between the last chunk and this one is unchanged.
    for delta in range(removed_start - old_chunk_end):
      changed_tokens[new_chunk_end + delta] = old_chunk_end + delta
    old_chunk_end = removed_end
    new_chunk_end = added_end
  # The tail after the final chunk is unchanged as well.
  for delta in range(len(previous_tokens) - old_chunk_end):
    changed_tokens[new_chunk_end + delta] = old_chunk_end + delta
  return added_tokens, changed_tokens
| 180 | |
| 181 | |
def flatten_nested_list(l):
  """Flattens a nested list while remembering each element's origin.

  Args:
    l: A list of lists.

  Returns:
    A tuple (flattened, index_to_position):
    flattened: The flattened list.
    index_to_position: A map from flat index i to a pair (r, c) such
      that flattened[i] == l[r][c].
  """
  flattened = []
  index_to_position = {}
  for r, inner in enumerate(l):
    for c, element in enumerate(inner):
      index_to_position[len(flattened)] = (r, c)
      flattened.append(element)
  return (flattened, index_to_position)
| 207 | |
| 208 | |
def compute_changed_token_positions(previous_tokens, current_tokens):
  """Computes changed and added token positions.

  Args:
    previous_tokens: A list of lists of token strings.  Lines in the file
      correspond to the nested lists.
    current_tokens: A list of lists of token strings.  Lines in the file
      correspond to the nested lists.

  Returns:
    A tuple (added_token_positions, changed_token_positions):
    added_token_positions: A list of (row, column) pairs that index into
      |current_tokens|.
    changed_token_positions: A map from (row, column) pairs that index
      into |current_tokens| to pairs that index into |previous_tokens|.
  """
  # Flatten both token grids, diff the flat sequences, then translate the
  # flat indices back into (row, column) positions.
  flat_previous, previous_positions = flatten_nested_list(previous_tokens)
  flat_current, current_positions = flatten_nested_list(current_tokens)
  added_indices, changed_indices = compute_changed_token_indices(
      flat_previous, flat_current)
  added_token_positions = [current_positions[i] for i in added_indices]
  changed_token_positions = {
      current_positions[cur_i]: previous_positions[prev_i]
      for cur_i, prev_i in changed_indices.items()
  }
  return (added_token_positions, changed_token_positions)
| 237 | |
| 238 | |
def parse_chunks_from_diff(diff):
  """Returns a generator of chunk data from a diff.

  Args:
    diff: A list of strings, with each string being a line from a diff
      in unified diff format.

  Returns:
    A generator of tuples (added_lines_start, added_lines_end,
    removed_lines), one per contiguous run of '-'/'+' lines.
    [added_lines_start, added_lines_end) index lines in the new file;
    removed_lines holds the deleted lines' contents (prefix stripped).
  """
  it = iter(diff)
  for line in it:
    # Skip ahead to the next chunk header ('@@ -<range> +<range> @@').
    while not line.startswith('@@'):
      line = next(it)
    parts = line.split(' ')
    previous_start, previous_end = parse_chunk_header_file_range(
        parts[1].lstrip('-'))
    current_start, current_end = parse_chunk_header_file_range(
        parts[2].lstrip('+'))

    # Walk the chunk body, grouping consecutive '-'/'+' lines into deltas.
    in_delta = False
    added_lines_start = None
    added_lines_end = None
    removed_lines = []
    while previous_start < previous_end or current_start < current_end:
      line = next(it)
      firstchar = line[0]
      line = line[1:]
      if not in_delta and (firstchar == '-' or firstchar == '+'):
        # Start of a new delta; reset its accumulators.
        in_delta = True
        added_lines_start = current_start
        added_lines_end = current_start
        removed_lines = []

      if firstchar == '-':
        removed_lines.append(line)
        previous_start += 1
      elif firstchar == '+':
        current_start += 1
        added_lines_end = current_start
      elif firstchar == ' ':
        # A context line terminates the current delta, if any.
        if in_delta:
          in_delta = False
          yield (added_lines_start, added_lines_end, removed_lines)
        previous_start += 1
        current_start += 1
    # A delta that runs to the end of the chunk has no trailing context
    # line to flush it, so flush it here.
    if in_delta:
      yield (added_lines_start, added_lines_end, removed_lines)
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 287 | |
| 288 | |
def should_skip_commit(commit):
  """Decides if |commit| should be skipped when computing the blame.

  Commit 5d4451e deleted all files in the repo except for DEPS.  The
  next commit, 1e7896, brought them back.  This is a hack to skip
  those commits (except for the files they modified).  If we did not
  do this, changes would be incorrectly attributed to 1e7896.

  Args:
    commit: A Commit object.

  Returns:
    A boolean indicating if this commit should be skipped.
  """
  BANNED_COMMITS = (
      '1e78967ed2f1937b3809c19d91e7dd62d756d307',
      '5d4451ebf298d9d71f716cc0135f465cec41fcd0',
  )
  if commit.hash not in BANNED_COMMITS:
    return False
  EXEMPT_FILES = (
      'DEPS',
      'chrome/browser/ui/views/file_manager_dialog_browsertest.cc',
  )
  # Inspect the diff's file headers: a banned commit is kept only when it
  # touches one of the exempt files.  Hitting a chunk header first means
  # no exempt file header was seen, so the commit is skipped.
  for diff_line in commit.diff:
    if diff_line.startswith('---') or diff_line.startswith('+++'):
      if diff_line.split(' ')[1] in EXEMPT_FILES:
        return False
    elif diff_line.startswith('@@'):
      return True
  assert False
| 320 | |
| 321 | |
def generate_substrings(file):
  """Generates substrings from a binary file stream, where substrings are
  separated by '\0'.

  For example, the input:
    'a\0bc\0\0\0d\0'
  would produce the output:
    ['a', 'bc', 'd']

  Empty substrings are never yielded.

  Args:
    file: A readable binary file.
  """
  BUF_SIZE = 448  # Experimentally found to be pretty fast.
  pending = []  # Accumulates the pieces of a substring spanning reads.
  while True:
    buf = file.read(BUF_SIZE)
    pieces = buf.split(b'\0')
    # The first piece continues whatever was left from the last read.
    pending.append(pieces[0])
    if len(pieces) > 1:
      head = b''.join(pending)
      if head:
        yield head.decode()
      # Pieces strictly between the first and last are complete substrings.
      for piece in pieces[1:-1]:
        if piece:
          yield piece.decode()
      pending = [pieces[-1]]
    # A short read means end of stream: flush whatever is pending.
    if len(buf) < BUF_SIZE:
      tail = b''.join(pending)
      if tail:
        yield tail.decode()
      return
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 353 | |
| 354 | |
def generate_commits(git_log_stdout):
  """Parses NUL-delimited git log output into a stream of Commit objects.

  Args:
    git_log_stdout: A binary stream of 'git log -z --format=...' output,
      where each commit contributes six NUL-separated fields.
  """
  fields = generate_substrings(git_log_stdout)
  while True:
    try:
      commit_hash = next(fields)
      author_name = next(fields)
      author_email = next(fields)
      author_date = next(fields)
      message = next(fields).rstrip('\n')
      # Drop the leading blank line and the trailing empty entry so |diff|
      # holds only actual diff lines.
      diff = next(fields).split('\n')[1:-1]
    except StopIteration:
      # The stream is exhausted (possibly mid-record); stop cleanly.
      return
    yield Commit(commit_hash, author_name, author_email, author_date, message,
                 diff)
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 370 | |
| 371 | |
def uberblame_aux(file_name, git_log_stdout, data, tokenization_method):
  """Computes the uberblame of file |file_name|.

  Walks the git log from newest to oldest commit.  The working |blame|
  starts as the tokenization of |data|; each commit's diff is replayed in
  reverse so tokens first added by that commit get attributed to it, while
  surviving tokens are re-mapped into the previous version of the file for
  older commits to claim.

  Args:
    file_name: File to uberblame.  (Unused here; kept to mirror the
      uberblame() interface.)
    git_log_stdout: A file object that represents the git log output.
    data: A string containing the data of file |file_name|.
    tokenization_method: A function that takes a string and returns a list of
      TokenContexts.

  Returns:
    A tuple (data, blame).
    data: File contents.
    blame: A list of TokenContexts.
  """
  blame = tokenization_method(data)

  # |uber_blame| aliases the TokenContexts of the newest file version; the
  # loop below annotates those context objects with commits in place.
  uber_blame = (data, blame[:])

  for commit in generate_commits(git_log_stdout):
    if should_skip_commit(commit):
      continue

    # |offset| compensates for line insertions/removals made by earlier
    # chunks of this same commit so later chunk line numbers stay valid.
    offset = 0
    for (added_lines_start, added_lines_end,
         removed_lines) in parse_chunks_from_diff(commit.diff):
      added_lines_start += offset
      added_lines_end += offset
      previous_contexts = [
          token_lines
          for line_previous in removed_lines
          for token_lines in tokenization_method(line_previous)
      ]
      previous_tokens = [[context.token for context in contexts]
                         for contexts in previous_contexts]
      current_contexts = blame[added_lines_start:added_lines_end]
      current_tokens = [[context.token for context in contexts]
                        for contexts in current_contexts]
      added_token_positions, changed_token_positions = (
          compute_changed_token_positions(previous_tokens, current_tokens))
      # Tokens introduced by this commit: attribute them to it.
      for r, c in added_token_positions:
        current_contexts[r][c].commit = commit
      # Tokens that survived this commit: carry their (shared) contexts
      # back into the previous file version so older commits can claim
      # them on a later iteration.
      for r, c in changed_token_positions:
        pr, pc = changed_token_positions[(r, c)]
        previous_contexts[pr][pc] = current_contexts[r][c]

      assert added_lines_start <= added_lines_end <= len(blame)
      current_blame_size = len(blame)
      blame[added_lines_start:added_lines_end] = previous_contexts
      offset += len(blame) - current_blame_size

  # After replaying all history the working blame must be empty: the file
  # did not exist before its first commit.
  assert blame == [] or blame == [[]]
  return uber_blame
| 427 | |
| 428 | |
def uberblame(file_name, revision, tokenization_method):
  """Computes the uberblame of file |file_name|.

  Args:
    file_name: File to uberblame.
    revision: The revision to start the uberblame at.
    tokenization_method: A function that takes a string and returns a list of
      TokenContexts.

  Returns:
    A tuple (data, blame).
    data: File contents.
    blame: A list of TokenContexts.

  Raises:
    subprocess.CalledProcessError: If the git log command fails.
  """
  DIFF_CONTEXT = 3
  log_command = [
      'git', 'log', '--minimal', '--no-prefix', '--follow', '-m',
      '--first-parent', '-p',
      '-U%d' % DIFF_CONTEXT, '-z', '--format=%x00%H%x00%an%x00%ae%x00%ad%x00%B',
      revision, '--', file_name
  ]
  # Stream the log so huge histories are not buffered in memory.
  log_process = subprocess.Popen(
      log_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  file_data = subprocess.check_output(
      ['git', 'show', '%s:%s' % (revision, file_name)]).decode()
  file_data, blame = uberblame_aux(file_name, log_process.stdout, file_data,
                                   tokenization_method)

  # Only check the exit status after the stream has been fully consumed.
  stderr = log_process.communicate()[1].decode()
  if log_process.returncode != 0:
    raise subprocess.CalledProcessError(log_process.returncode, log_command,
                                        stderr)
  return file_data, blame
| 461 | |
| 462 | |
def generate_pastel_color():
  """Generates a random color from a nice looking pastel palette.

  Returns:
    The color, formatted as hex string.  For example, white is "#FFFFFF".
  """
  # Draw hue, lightness, saturation in this exact order so a fixed random
  # seed keeps producing the same palette across runs.
  hue = random.uniform(0, 1)
  lightness = random.uniform(0.8, 0.9)
  saturation = random.uniform(0.5, 1)
  red, green, blue = colorsys.hls_to_rgb(hue, lightness, saturation)
  return "#%0.2X%0.2X%0.2X" % (int(red * 255), int(green * 255),
                               int(blue * 255))
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 473 | |
| 474 | |
Tom Anderson | ac918bb6 | 2018-04-17 00:37:34 | [diff] [blame] | 475 | def colorize_diff(diff): |
| 476 | """Colorizes a diff for use in an HTML page. |
| 477 | |
| 478 | Args: |
| 479 | diff: The diff, in unified diff format, as a list of line strings. |
| 480 | |
| 481 | Returns: |
| 482 | The HTML-formatted diff, as a string. The diff will already be escaped. |
| 483 | """ |
| 484 | |
| 485 | colorized = [] |
| 486 | for line in diff: |
Tom Anderson | 773d809 | 2022-03-23 20:47:29 | [diff] [blame] | 487 | escaped = html.escape(line.replace('\r', ''), quote=True) |
Tom Anderson | ac918bb6 | 2018-04-17 00:37:34 | [diff] [blame] | 488 | if line.startswith('+'): |
| 489 | colorized.append('<span class=\\"addition\\">%s</span>' % escaped) |
| 490 | elif line.startswith('-'): |
| 491 | colorized.append('<span class=\\"deletion\\">%s</span>' % escaped) |
| 492 | elif line.startswith('@@'): |
| 493 | context_begin = escaped.find('@@', 2) |
| 494 | assert context_begin != -1 |
| 495 | colorized.append( |
| 496 | '<span class=\\"chunk_meta\\">%s</span>' |
| 497 | '<span class=\\"chunk_context\\">%s</span' |
| 498 | % (escaped[0:context_begin + 2], escaped[context_begin + 2:])) |
| 499 | elif line.startswith('diff') or line.startswith('index'): |
| 500 | colorized.append('<span class=\\"file_header\\">%s</span>' % escaped) |
| 501 | else: |
| 502 | colorized.append('<span class=\\"context_line\\">%s</span>' % escaped) |
| 503 | return '\n'.join(colorized) |
| 504 | |
| 505 | |
def create_visualization(data, blame):
  """Creates a web page to visualize |blame|.

  The page has two columns: the blamed file on the left, with each run of
  same-commit tokens wrapped in a colored clickable <span>, and a commit
  viewer on the right, filled in by the inline JavaScript when a token is
  clicked.

  Args:
    data: The data file as returned by uberblame().
    blame: A list of TokenContexts as returned by uberblame().

  Returns:
    The HTML for the generated page, as a string.
  """
  # Use the same seed for the color generator on each run so that
  # loading the same blame of the same file twice will result in the
  # same generated HTML page.
  random.seed(0x52937865ec62d1ea)
  # The template is filled with (commit_data, line_nums, lines) at the end
  # of this function; literal '%' characters are escaped as '%%'.
  page = """\
  <html>
  <head>
  <style>
  body {
    font-family: monospace;
  }
  pre {
    display: inline;
  }
  .token {
    outline: 1pt solid #00000030;
    outline-offset: -1pt;
    cursor: pointer;
  }
  .addition {
    color: #080;
  }
  .deletion {
    color: #c00;
  }
  .chunk_meta {
    color: #099;
  }
  .context_line .chunk_context {
    // Just normal text.
  }
  .file_header {
    font-weight: bold;
  }
  #linenums {
    text-align: right;
  }
  #file_display {
    position: absolute;
    left: 0;
    top: 0;
    width: 50%%;
    height: 100%%;
    overflow: scroll;
  }
  #commit_display_container {
    position: absolute;
    left: 50%%;
    top: 0;
    width: 50%%;
    height: 100%%;
    overflow: scroll;
  }
  </style>
  <script>
  commit_data = %s;
  function display_commit(hash) {
    var e = document.getElementById("commit_display");
    e.innerHTML = commit_data[hash]
  }
  </script>
  </head>
  <body>
  <div id="file_display">
    <table>
      <tbody>
        <tr>
          <td valign="top" id="linenums">
            <pre>%s</pre>
          </td>
          <td valign="top">
            <pre>%s</pre>
          </td>
        </tr>
      </tbody>
    </table>
  </div>
  <div id="commit_display_container" valign="top">
    <pre id="commit_display" />
  </div>
  </body>
  </html>
  """
  page = textwrap.dedent(page)
  commits = {}
  lines = []
  commit_colors = {}
  blame_index = 0
  # Flatten the per-line context lists; tokens are then visited in file
  # order while walking the characters of |data| below.
  blame = [context for contexts in blame for context in contexts]
  row = 0
  lastline = ''
  for line in data.split('\n'):
    lastline = line
    column = 0
    for c in line + '\n':
      if blame_index < len(blame):
        token_context = blame[blame_index]
        # We just stepped past the end of the current token; close the
        # open <span> only if the next token belongs to a different
        # commit (same-commit runs share one span).
        if (row == token_context.row and
            column == token_context.column + len(token_context.token)):
          if (blame_index + 1 == len(blame) or blame[blame_index].commit.hash !=
              blame[blame_index + 1].commit.hash):
            lines.append('</span>')
          blame_index += 1
      if blame_index < len(blame):
        token_context = blame[blame_index]
        # At the start of a token: open a new colored <span> when this
        # token begins a new run of commits.
        if row == token_context.row and column == token_context.column:
          if (blame_index == 0 or blame[blame_index - 1].commit.hash !=
              blame[blame_index].commit.hash):
            hash = token_context.commit.hash
            commits[hash] = token_context.commit
            if hash not in commit_colors:
              commit_colors[hash] = generate_pastel_color()
            color = commit_colors[hash]
            lines.append(('<span class="token" style="background-color: %s" ' +
                          'onclick="display_commit("%s")">') % (color,
                                                                hash))
      lines.append(html.escape(c))
      column += 1
    row += 1
  # Build the JavaScript commit_data object literal: hash -> HTML string.
  commit_data = ['{\n']
  commit_display_format = """\
  commit: {hash}
  Author: {author_name} <{author_email}>
  Date: {author_date}

  {message}

  """
  commit_display_format = textwrap.dedent(commit_display_format)
  links = re.compile(r'(https?:\/\/\S+)')
  for hash in commits:
    commit = commits[hash]
    commit_display = commit_display_format.format(
        hash=hash,
        author_name=commit.author_name,
        author_email=commit.author_email,
        author_date=commit.author_date,
        message=commit.message)
    commit_display = html.escape(commit_display, quote=True)
    commit_display += colorize_diff(commit.diff)
    # Linkify URLs appearing in the commit message or diff.
    commit_display = re.sub(links, '<a href=\\"\\1\\">\\1</a>', commit_display)
    # Escape newlines so the value is a valid single-line JS string.
    commit_display = commit_display.replace('\n', '\\n')
    commit_data.append('"%s": "%s",\n' % (hash, commit_display))
  commit_data.append('}')
  commit_data = ''.join(commit_data)
  # Omit the line number for a trailing empty line.
  line_nums = range(1, row if lastline.strip() == '' else row + 1)
  line_nums = '\n'.join([str(num) for num in line_nums])
  lines = ''.join(lines)
  return page % (commit_data, line_nums, lines)
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 665 | |
| 666 | |
Tom Anderson | 773d809 | 2022-03-23 20:47:29 | [diff] [blame] | 667 | def show_visualization(page): |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 668 | """Display |html| in a web browser. |
| 669 | |
| 670 | Args: |
| 671 | html: The contents of the file to display, as a string. |
| 672 | """ |
| 673 | # Keep the temporary file around so the browser has time to open it. |
| 674 | # TODO(thomasanderson): spin up a temporary web server to serve this |
| 675 | # file so we don't have to leak it. |
| 676 | html_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html') |
Tom Anderson | 773d809 | 2022-03-23 20:47:29 | [diff] [blame] | 677 | html_file.write(page.encode()) |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 678 | html_file.flush() |
| 679 | if sys.platform.startswith('linux'): |
| 680 | # Don't show any messages when starting the browser. |
| 681 | saved_stdout = os.dup(1) |
| 682 | saved_stderr = os.dup(2) |
| 683 | os.close(1) |
| 684 | os.close(2) |
| 685 | os.open(os.devnull, os.O_RDWR) |
| 686 | os.open(os.devnull, os.O_RDWR) |
| 687 | webbrowser.open('file://' + html_file.name) |
| 688 | if sys.platform.startswith('linux'): |
| 689 | os.dup2(saved_stdout, 1) |
| 690 | os.dup2(saved_stderr, 2) |
| 691 | os.close(saved_stdout) |
| 692 | os.close(saved_stderr) |
| 693 | |
| 694 | |
Tom Anderson | b3d7e64 | 2018-04-13 16:23:42 | [diff] [blame] | 695 | def main(argv): |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 696 | parser = argparse.ArgumentParser( |
Tom Anderson | b3d7e64 | 2018-04-13 16:23:42 | [diff] [blame] | 697 | description='Show what revision last modified each token of a file.') |
| 698 | parser.add_argument( |
| 699 | 'revision', |
| 700 | default='HEAD', |
| 701 | nargs='?', |
| 702 | help='show only commits starting from a revision') |
| 703 | parser.add_argument('file', help='the file to uberblame') |
| 704 | parser.add_argument( |
| 705 | '--skip-visualization', |
| 706 | action='store_true', |
| 707 | help='do not display the blame visualization in a web browser') |
| 708 | parser.add_argument( |
| 709 | '--tokenize-by-char', |
| 710 | action='store_true', |
| 711 | help='treat individual characters as tokens') |
| 712 | parser.add_argument( |
| 713 | '--tokenize-whitespace', |
| 714 | action='store_true', |
| 715 | help='also blame non-newline whitespace characters') |
| 716 | args = parser.parse_args(argv) |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 717 | |
Tom Anderson | b3d7e64 | 2018-04-13 16:23:42 | [diff] [blame] | 718 | def tokenization_method(data): |
| 719 | return tokenize_data(data, args.tokenize_by_char, args.tokenize_whitespace) |
| 720 | |
| 721 | data, blame = uberblame(args.file, args.revision, tokenization_method) |
| 722 | html = create_visualization(data, blame) |
| 723 | if not args.skip_visualization: |
| 724 | show_visualization(html) |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 725 | return 0 |
| 726 | |
| 727 | |
# Propagate main()'s return value as the process exit status.
if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))