#!/usr/bin/env python
# Copyright 2017 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import argparse
import cgi
import colorsys
import difflib
import os
import random
import re
import subprocess
import sys
import tempfile
import textwrap
import webbrowser


class TokenContext(object):
  """Metadata about a token.

  Attributes:
    row: Row index of the token in the data file.
    column: Column index of the token in the data file.
    token: The token string.
    commit: Hash of the git commit that added this token.
  """
  def __init__(self, row, column, token, commit=None):
    self.row = row
    self.column = column
    self.token = token
    self.commit = commit


class Commit(object):
  """Commit data.

  Attributes:
    hash: The commit hash.
    diff: The commit diff.
  """
  def __init__(self, hash, diff):
    self.hash = hash
    self.diff = diff


def tokenize_data(data):
  """Tokenizes |data|.

  Args:
    data: String to tokenize.

  Returns:
    A list of lists of TokenContexts.  Each inner list contains the tokens
    of one line in |data|.
  """
  contexts = []
  in_identifier = False
  identifier_start = 0
  identifier = ''
  row = 0
  column = 0
  line_contexts = []

  for c in data + '\n':
    if c.isalnum() or c == '_':
      if in_identifier:
        identifier += c
      else:
        in_identifier = True
        identifier_start = column
        identifier = c
    else:
      if in_identifier:
        line_contexts.append(
            TokenContext(row, identifier_start, identifier))
      in_identifier = False
      if not c.isspace():
        line_contexts.append(TokenContext(row, column, c))

    if c == '\n':
      row += 1
      column = 0
      contexts.append(line_contexts)
      line_contexts = []
    else:
      column += 1
  return contexts
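
# Illustrative example (not part of the original script): identifiers
# ([A-Za-z0-9_]+) are kept whole, punctuation becomes single-character tokens,
# and whitespace is dropped.  For instance, tokenize_data('ab + c1') produces
# one line of three TokenContexts whose (token, row, column) values are
# ('ab', 0, 0), ('+', 0, 3) and ('c1', 0, 5).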


def compute_unified_diff(old_tokens, new_tokens):
  """Computes the diff between |old_tokens| and |new_tokens|.

  Args:
    old_tokens: Token strings corresponding to the old data.
    new_tokens: Token strings corresponding to the new data.

  Returns:
    The diff, in unified diff format.
  """
  return difflib.unified_diff(old_tokens, new_tokens, n=0, lineterm='')


def parse_chunk_header_file_range(file_range):
  """Parses a chunk header file range.

  Diff chunk headers have the form:
    @@ -<file-range> +<file-range> @@
  File ranges have the form:
    <start line number>,<number of lines changed>

  Args:
    file_range: A chunk header file range.

  Returns:
    A tuple (range_start, range_end).  The endpoints are adjusted such that
    iterating over [range_start, range_end) will give the changed indices.
  """
  if ',' in file_range:
    file_range_parts = file_range.split(',')
    start = int(file_range_parts[0])
    amount = int(file_range_parts[1])
    if amount == 0:
      return (start, start)
    return (start - 1, start + amount - 1)
  else:
    return (int(file_range) - 1, int(file_range))
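
# Illustrative examples (not part of the original script):
#   parse_chunk_header_file_range('12,3') == (11, 14)  # lines 12-14, 0-based
#   parse_chunk_header_file_range('7') == (6, 7)       # a single changed line
#   parse_chunk_header_file_range('5,0') == (5, 5)     # an empty range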


def compute_changed_token_indices(previous_tokens, current_tokens):
  """Computes changed and added tokens.

  Args:
    previous_tokens: Tokens corresponding to the old file.
    current_tokens: Tokens corresponding to the new file.

  Returns:
    A tuple (added_tokens, changed_tokens).
      added_tokens: A list of indices into |current_tokens|.
      changed_tokens: A map from indices into |current_tokens| to the
        corresponding indices into |previous_tokens|, for tokens that are
        present in both versions.
  """
  prev_file_chunk_end = 0
  prev_patched_chunk_end = 0
  added_tokens = []
  changed_tokens = {}
  for line in compute_unified_diff(previous_tokens, current_tokens):
    if line.startswith("@@"):
      parts = line.split(' ')
      removed = parts[1].lstrip('-')
      removed_start, removed_end = parse_chunk_header_file_range(removed)
      added = parts[2].lstrip('+')
      added_start, added_end = parse_chunk_header_file_range(added)
      for i in range(added_start, added_end):
        added_tokens.append(i)
      for i in range(0, removed_start - prev_patched_chunk_end):
        changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i
      prev_patched_chunk_end = removed_end
      prev_file_chunk_end = added_end
  for i in range(0, len(previous_tokens) - prev_patched_chunk_end):
    changed_tokens[prev_file_chunk_end + i] = prev_patched_chunk_end + i
  return added_tokens, changed_tokens
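
# Illustrative example (not part of the original script):
#   compute_changed_token_indices(['a', 'b', 'c'], ['a', 'x', 'c'])
# returns ([1], {0: 0, 2: 2}): the new token 'x' at index 1 was added, while
# the surviving tokens 'a' and 'c' map back to their old indices.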


def flatten_nested_list(l):
  """Flattens a list and provides a mapping from elements in the list back
  into the nested list.

  Args:
    l: A list of lists.

  Returns:
    A tuple (flattened, index_to_position):
      flattened: The flattened list.
      index_to_position: A map from index i in |flattened| to a pair (r, c)
        such that flattened[i] == l[r][c].
  """
  flattened = []
  index_to_position = {}
  r = 0
  c = 0
  for nested_list in l:
    for element in nested_list:
      index_to_position[len(flattened)] = (r, c)
      flattened.append(element)
      c += 1
    r += 1
    c = 0
  return (flattened, index_to_position)
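
# Illustrative example (not part of the original script):
#   flatten_nested_list([['a', 'b'], [], ['c']])
# returns (['a', 'b', 'c'], {0: (0, 0), 1: (0, 1), 2: (2, 0)}).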


def compute_changed_token_positions(previous_tokens, current_tokens):
  """Computes changed and added token positions.

  Args:
    previous_tokens: A list of lists of token strings.  Lines in the file
      correspond to the nested lists.
    current_tokens: A list of lists of token strings.  Lines in the file
      correspond to the nested lists.

  Returns:
    A tuple (added_token_positions, changed_token_positions):
      added_token_positions: A list of pairs that index into |current_tokens|.
      changed_token_positions: A map from pairs that index into
        |current_tokens| to pairs that index into |previous_tokens|.
  """
  flat_previous_tokens, previous_index_to_position = flatten_nested_list(
      previous_tokens)
  flat_current_tokens, current_index_to_position = flatten_nested_list(
      current_tokens)
  added_indices, changed_indices = compute_changed_token_indices(
      flat_previous_tokens, flat_current_tokens)
  added_token_positions = [current_index_to_position[i] for i in added_indices]
  changed_token_positions = {
      current_index_to_position[current_i]:
          previous_index_to_position[changed_indices[current_i]]
      for current_i in changed_indices
  }
  return (added_token_positions, changed_token_positions)
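
# Illustrative example (not part of the original script): with
#   previous_tokens = [['a', 'b'], ['c']]
#   current_tokens = [['a', 'x'], ['c']]
# the result is ([(0, 1)], {(0, 0): (0, 0), (1, 0): (1, 0)}): the new token
# 'x' on line 0 was added, and the surviving tokens map back to their old
# (line, token-index) positions.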


def parse_chunks_from_diff(diff):
  """Returns a generator of chunk data from a diff.

  Args:
    diff: A list of strings, with each string being a line from a diff
      in unified diff format.

  Returns:
    A generator of tuples (added_lines_start, added_lines_end,
    removed_lines, removed_lines_start)
  """
  in_chunk = False
  chunk_previous = []
  previous_start = None
  current_start = None
  current_end = None
  for line in diff:
    if line.startswith('@@'):
      if in_chunk:
        yield (current_start, current_end,
               chunk_previous, previous_start)
      parts = line.split(' ')
      previous = parts[1].lstrip('-')
      previous_start, _ = parse_chunk_header_file_range(previous)
      current = parts[2].lstrip('+')
      current_start, current_end = parse_chunk_header_file_range(current)
      in_chunk = True
      chunk_previous = []
    elif in_chunk and line.startswith('-'):
      chunk_previous.append(line[1:])
  if current_start is not None:
    yield (current_start, current_end,
           chunk_previous, previous_start)
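
# Illustrative example (not part of the original script): for the diff lines
#   ['@@ -3,2 +3 @@', '-old line one', '-old line two', '+new line']
# the generator yields a single chunk:
#   (2, 3, ['old line one', 'old line two'], 2)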


def should_skip_commit(commit):
  """Decides if |commit| should be skipped when computing the blame.

  Commit 5d4451e deleted all files in the repo except for DEPS.  The
  next commit, 1e7896, brought them back.  This is a hack to skip
  those commits (except for the files they modified).  If we did not
  do this, changes would be incorrectly attributed to 1e7896.

  Args:
    commit: A Commit object.

  Returns:
    A boolean indicating if this commit should be skipped.
  """
  banned_commits = [
      '1e78967ed2f1937b3809c19d91e7dd62d756d307',
      '5d4451ebf298d9d71f716cc0135f465cec41fcd0',
  ]
  if commit.hash not in banned_commits:
    return False
  banned_commits_file_exceptions = [
      'DEPS',
      'chrome/browser/ui/views/file_manager_dialog_browsertest.cc',
  ]
  for line in commit.diff:
    if line.startswith('---') or line.startswith('+++'):
      if line.split(' ')[1] in banned_commits_file_exceptions:
        return False
    elif line.startswith('@@'):
      return True
  assert False


def generate_substrings(file):
  """Generates substrings from a file stream, where substrings are
  separated by '\0'.

  For example, the input:
    'a\0bc\0\0\0d\0'
  would produce the output:
    ['a', 'bc', 'd']

  Args:
    file: A readable file.
  """
  data = ''
  while True:
    ch = file.read(1)
    if ch == '':
      break
    if ch == '\0':
      if data != '':
        yield data
      data = ''
    else:
      data += ch
  if data != '':
    yield data


def generate_commits(git_log_stdout):
  """Parses git log output into a stream of Commit objects.
  """
  substring_generator = generate_substrings(git_log_stdout)
  while True:
    hash = substring_generator.next().strip('\n')
    diff = substring_generator.next().strip('\n').split('\n')
    yield Commit(hash, diff)
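
# Note added for illustration (not part of the original script):
# generate_commits() expects the stream to contain NUL-separated records that
# alternate between a commit hash and that commit's diff.  For example, bytes
# like
#   '\0c0ffee1\0diff --git ...\n@@ -1 +1 @@\n-old\n+new\0deadbe2\0...'
# would yield Commit('c0ffee1', [...]) followed by Commit('deadbe2', [...]).
# This is the record layout the `git log -z --format=%x00%h -p` invocation in
# uberblame() below is intended to produce; the hashes here are made up.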


def uberblame_aux(file_name, git_log_stdout, data):
  """Computes the uberblame of file |file_name|.

  Args:
    file_name: File to uberblame.
    git_log_stdout: A file object that represents the git log output.
    data: A string containing the data of file |file_name|.

  Returns:
    A tuple (data, blame).
      data: File contents.
      blame: A list of lists of TokenContexts, one list per line of the file.
  """
  blame = tokenize_data(data)

  blamed_tokens = 0
  total_tokens = len(blame)
  uber_blame = (data, blame[:])

  for commit in generate_commits(git_log_stdout):
    if should_skip_commit(commit):
      continue

    offset = 0
    for (added_lines_start, added_lines_end, removed_lines,
         removed_lines_start) in parse_chunks_from_diff(commit.diff):
      added_lines_start += offset
      added_lines_end += offset
      previous_contexts = [token_lines
                           for line_previous in removed_lines
                           for token_lines in tokenize_data(line_previous)]
      previous_tokens = [
          [context.token for context in contexts]
          for contexts in previous_contexts
      ]
      current_contexts = blame[added_lines_start:added_lines_end]
      current_tokens = [
          [context.token for context in contexts]
          for contexts in current_contexts
      ]
      added_token_positions, changed_token_positions = (
          compute_changed_token_positions(previous_tokens, current_tokens))
      for r, c in added_token_positions:
        current_contexts[r][c].commit = commit.hash
        blamed_tokens += 1
      for r, c in changed_token_positions:
        pr, pc = changed_token_positions[(r, c)]
        previous_contexts[pr][pc] = current_contexts[r][c]

      assert added_lines_start <= added_lines_end <= len(blame)
      current_blame_size = len(blame)
      blame[added_lines_start:added_lines_end] = previous_contexts
      offset += len(blame) - current_blame_size

  assert blame == [] or blame == [[]]
  return uber_blame


def uberblame(file_name, revision):
  """Computes the uberblame of file |file_name|.

  Args:
    file_name: File to uberblame.
    revision: The revision to start the uberblame at.

  Returns:
    A tuple (data, blame).
      data: File contents.
      blame: A list of lists of TokenContexts, one list per line of the file.
  """
  # -U0 asks for zero context lines; --format=%x00%h prints a NUL byte
  # followed by the abbreviated commit hash, and -z NUL-terminates each log
  # entry, which is what generate_substrings() splits on.
  cmd_git_log = ['git', 'log', '--minimal', '--no-prefix', '--follow', '-m',
                 '--first-parent', '-p', '-U0', '-z', '--format=%x00%h',
                 revision, '--', file_name]
  git_log = subprocess.Popen(cmd_git_log,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
  data = subprocess.check_output(
      ['git', 'show', '%s:%s' % (revision, file_name)])
  data, blame = uberblame_aux(file_name, git_log.stdout, data)

  _, stderr = git_log.communicate()
  if git_log.returncode != 0:
    raise subprocess.CalledProcessError(git_log.returncode, cmd_git_log, stderr)
  return data, blame


def generate_pastel_color():
  (h, l, s) = (random.uniform(0, 1),
               random.uniform(0.8, 0.9),
               random.uniform(0.5, 1))
  (r, g, b) = colorsys.hls_to_rgb(h, l, s)
  return "#%0.2X%0.2X%0.2X" % (int(r * 255), int(g * 255), int(b * 255))


def visualize_uberblame(data, blame):
  """Creates a web page to visualize |blame|.

  Args:
    data: The data file as returned by uberblame().
    blame: A list of lists of TokenContexts as returned by uberblame().

  Returns:
    The HTML for the generated page, as a string.
  """
  # Use the same seed for the color generator on each run so that
  # loading the same blame of the same file twice will result in the
  # same generated HTML page.
  random.seed(0x52937865ec62d1ea)
  html = """\
  <html>
    <head>
      <style>
        body {
          font-family: "Courier New";
        }
        pre {
          display: inline;
        }
        a {
          color: #000000;
          text-decoration: none;
        }
        span {
          outline: 1pt solid #00000030;
          outline-offset: -1pt;
        }
        #linenums {
          text-align: right;
        }
      </style>
    </head>
    <body>
      <table>
        <tbody>
          <tr>
            <td valign="top" id="linenums">
              <pre>%s</pre>
            </td>
            <td valign="top">
              <pre>%s</pre>
            </td>
          </tr>
        </tbody>
      </table>
    </body>
  </html>
  """
  html = textwrap.dedent(html)
  lines = []
  commit_colors = {}
  blame_index = 0
  blame = [context for contexts in blame for context in contexts]
  row = 0
  lastline = ''
  for line in data.split('\n'):
    lastline = line
    column = 0
    for c in line + '\n':
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if (row == token_context.row and
            column == token_context.column + len(token_context.token)):
          if (blame_index + 1 == len(blame) or
              blame[blame_index].commit != blame[blame_index + 1].commit):
            lines.append('</a></span>')
          blame_index += 1
      if blame_index < len(blame):
        token_context = blame[blame_index]
        if row == token_context.row and column == token_context.column:
          if (blame_index == 0 or
              blame[blame_index - 1].commit != blame[blame_index].commit):
            commit = token_context.commit
            assert commit is not None
            lines.append(('<a href="https://chromium.googlesource.com/' +
                          'chromium/src/+/%s">') % commit)
            if commit not in commit_colors:
              commit_colors[commit] = generate_pastel_color()
            color = commit_colors[commit]
            lines.append('<span style="background-color: %s">' % color)
      lines.append(cgi.escape(c))
      column += 1
    row += 1
  line_nums = range(1, row if lastline.strip() == '' else row + 1)
  line_nums = '\n'.join([str(num) for num in line_nums])
  lines = ''.join(lines)
  return html % (line_nums, lines)


def show_visualization(html):
  """Display |html| in a web browser.

  Args:
    html: The contents of the file to display, as a string.
  """
  # Keep the temporary file around so the browser has time to open it.
  # TODO(thomasanderson): spin up a temporary web server to serve this
  # file so we don't have to leak it.
  html_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html')
  html_file.write(html)
  html_file.flush()
  if sys.platform.startswith('linux'):
    # Don't show any messages when starting the browser.
    saved_stdout = os.dup(1)
    saved_stderr = os.dup(2)
    os.close(1)
    os.close(2)
    os.open(os.devnull, os.O_RDWR)
    os.open(os.devnull, os.O_RDWR)
  webbrowser.open('file://' + html_file.name)
  if sys.platform.startswith('linux'):
    os.dup2(saved_stdout, 1)
    os.dup2(saved_stderr, 2)
    os.close(saved_stdout)
    os.close(saved_stderr)


def main():
  parser = argparse.ArgumentParser(
      description='Show what revision last modified each token of a file')
  parser.add_argument('revision', default='HEAD', nargs='?',
                      help='Show only commits starting from a revision.')
  parser.add_argument('file', help='The file to uberblame.')
  args = parser.parse_args()

  data, blame = uberblame(args.file, args.revision)
  html = visualize_uberblame(data, blame)
  show_visualization(html)
  return 0
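
# Example invocations (illustrative, not part of the original script; the
# path to this script depends on where it is checked out):
#   ./uberblame.py path/to/file.cc             # blame starting from HEAD
#   ./uberblame.py <revision> path/to/file.cc  # blame starting from <revision>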


if __name__ == '__main__':
  sys.exit(main())