Tom Anderson | 773d809 | 2022-03-23 20:47:29 | [diff] [blame] | 1 | #!/usr/bin/env python3 |
Avi Drissman | dfd88085 | 2022-09-15 20:11:09 | [diff] [blame] | 2 | # Copyright 2017 The Chromium Authors |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 3 | # Use of this source code is governed by a BSD-style license that can be |
| 4 | # found in the LICENSE file. |
| 5 | |
| 6 | import argparse |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 7 | import colorsys |
| 8 | import difflib |
Tom Anderson | 773d809 | 2022-03-23 20:47:29 | [diff] [blame] | 9 | import html |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 10 | import random |
| 11 | import os |
| 12 | import re |
| 13 | import subprocess |
| 14 | import sys |
| 15 | import tempfile |
| 16 | import textwrap |
| 17 | import webbrowser |
| 18 | |
| 19 | |
class TokenContext(object):
  """Location and provenance of a single token in the blamed file.

  Attributes:
    row: Row index of the token in the data file.
    column: Column index of the token in the data file.
    token: The token string.
    commit: A Commit object for the commit that added this token, or
      None while the owning commit is still unknown.
  """

  def __init__(self, row, column, token, commit=None):
    self.row = row
    self.column = column
    self.token = token
    self.commit = commit
| 36 | |
| 37 | |
class Commit(object):
  """A single parsed entry from the git log.

  Attributes:
    hash: The commit hash.
    author_name: The author's name.
    author_email: The author's email.
    author_date: The date and time the author created this commit.
    message: The commit message.
    diff: The commit diff, as a list of lines.
  """

  def __init__(self, hash, author_name, author_email, author_date, message,
               diff):
    self.hash = hash
    self.author_name = author_name
    self.author_email = author_email
    self.author_date = author_date
    self.message = message
    self.diff = diff
| 58 | |
| 59 | |
def tokenize_data(data, tokenize_by_char, tokenize_whitespace):
  """Tokenizes |data|.

  Args:
    data: String to tokenize.
    tokenize_by_char: If true, individual characters are treated as tokens.
      Otherwise, tokens are either symbols or strings of both alphanumeric
      characters and underscores.
    tokenize_whitespace: Treat non-newline whitespace characters as tokens.

  Returns:
    A list of lists of TokenContexts.  Each inner list represents one line.
  """
  contexts = []
  in_identifier = False
  identifier_start = 0
  identifier = ''
  row = 0
  column = 0
  line_contexts = []

  for c in data:
    if not tokenize_by_char and (c.isalnum() or c == '_'):
      # Extend (or start) the current multi-character identifier token.
      if in_identifier:
        identifier += c
      else:
        in_identifier = True
        identifier_start = column
        identifier = c
    else:
      # The current character terminates any in-progress identifier.
      if in_identifier:
        line_contexts.append(TokenContext(row, identifier_start, identifier))
      in_identifier = False
      # Newlines are never tokens; other whitespace only when requested.
      if not c.isspace() or (tokenize_whitespace and c != '\n'):
        line_contexts.append(TokenContext(row, column, c))

    if c == '\n':
      row += 1
      column = 0
      contexts.append(line_contexts)
      line_contexts = []
    else:
      column += 1
  # Bug fix: flush a trailing identifier when |data| does not end with a
  # delimiter (diff lines carry no trailing newline); previously the final
  # token of such input was silently dropped.
  if in_identifier:
    line_contexts.append(TokenContext(row, identifier_start, identifier))
  contexts.append(line_contexts)
  return contexts
| 106 | |
| 107 | |
def compute_unified_diff(old_tokens, new_tokens):
  """Diffs |old_tokens| against |new_tokens|.

  Args:
    old_tokens: Token strings corresponding to the old data.
    new_tokens: Token strings corresponding to the new data.

  Returns:
    The diff, in unified diff format, as a generator of lines.
  """
  # Zero lines of context (n=0) keeps chunk headers tight around the
  # actual changes, which the chunk-range arithmetic downstream relies on.
  diff_lines = difflib.unified_diff(old_tokens, new_tokens, n=0, lineterm='')
  return diff_lines
| 119 | |
| 120 | |
def parse_chunk_header_file_range(file_range):
  """Parses one file range out of a unified diff chunk header.

  Chunk headers look like:
    @@ -<file-range> +<file-range> @@
  where a file range is either "<start>" or "<start>,<count>".

  Args:
    file_range: A chunk header file range string.

  Returns:
    A tuple (range_start, range_end), adjusted so that iterating over
    [range_start, range_end) visits exactly the changed 0-based indices.
  """
  if ',' not in file_range:
    # Single-line form: exactly one changed line at |file_range|.
    start_line = int(file_range)
    return (start_line - 1, start_line)
  start_text, count_text = file_range.split(',')
  start_line = int(start_text)
  line_count = int(count_text)
  if line_count == 0:
    # An empty range (pure insertion point) stays empty.
    return (start_line, start_line)
  return (start_line - 1, start_line + line_count - 1)
| 145 | |
| 146 | |
def compute_changed_token_indices(previous_tokens, current_tokens):
  """Computes changed and added tokens.

  Args:
    previous_tokens: Tokens corresponding to the old file.
    current_tokens: Tokens corresponding to the new file.

  Returns:
    A tuple (added_tokens, changed_tokens).
    added_tokens: A list of indices into |current_tokens|.
    changed_tokens: A map of indices into |current_tokens| to
      indices into |previous_tokens|.
  """
  # Track where the previous chunk ended in each file so the unchanged
  # stretch between chunks can be mapped old-index -> new-index.
  old_chunk_end = 0
  new_chunk_end = 0
  added_tokens = []
  changed_tokens = {}
  for diff_line in compute_unified_diff(previous_tokens, current_tokens):
    if not diff_line.startswith("@@"):
      continue
    header_parts = diff_line.split(' ')
    removed_start, removed_end = parse_chunk_header_file_range(
        header_parts[1].lstrip('-'))
    added_start, added_end = parse_chunk_header_file_range(
        header_parts[2].lstrip('+'))
    added_tokens.extend(range(added_start, added_end))
    # Everything between the last chunk and this one is unchanged.
    for delta in range(removed_start - old_chunk_end):
      changed_tokens[new_chunk_end + delta] = old_chunk_end + delta
    old_chunk_end = removed_end
    new_chunk_end = added_end
  # The tail after the final chunk is unchanged as well.
  for delta in range(len(previous_tokens) - old_chunk_end):
    changed_tokens[new_chunk_end + delta] = old_chunk_end + delta
  return added_tokens, changed_tokens
| 180 | |
| 181 | |
def flatten_nested_list(l):
  """Flattens a nested list while remembering each element's origin.

  Args:
    l: A list of lists.

  Returns:
    A tuple (flattened, index_to_position):
    flattened: The flattened list.
    index_to_position: A map from flat index i to a pair (r, c) such
      that flattened[i] == l[r][c].
  """
  flattened = []
  index_to_position = {}
  for r, inner in enumerate(l):
    for c, element in enumerate(inner):
      index_to_position[len(flattened)] = (r, c)
      flattened.append(element)
  return (flattened, index_to_position)
| 207 | |
| 208 | |
def compute_changed_token_positions(previous_tokens, current_tokens):
  """Computes changed and added token positions.

  Args:
    previous_tokens: A list of lists of token strings.  Lines in the file
      correspond to the nested lists.
    current_tokens: A list of lists of token strings.  Lines in the file
      correspond to the nested lists.

  Returns:
    A tuple (added_token_positions, changed_token_positions):
    added_token_positions: A list of (row, column) pairs that index into
      |current_tokens|.
    changed_token_positions: A map from (row, column) pairs that index
      into |current_tokens| to pairs that index into |previous_tokens|.
  """
  # Flatten both token grids, diff the flat sequences, then translate the
  # flat indices back into (row, column) positions.
  flat_previous, previous_positions = flatten_nested_list(previous_tokens)
  flat_current, current_positions = flatten_nested_list(current_tokens)
  added_indices, changed_indices = compute_changed_token_indices(
      flat_previous, flat_current)
  added_token_positions = [current_positions[i] for i in added_indices]
  changed_token_positions = {
      current_positions[cur_i]: previous_positions[prev_i]
      for cur_i, prev_i in changed_indices.items()
  }
  return (added_token_positions, changed_token_positions)
| 237 | |
| 238 | |
def parse_chunks_from_diff(diff):
  """Returns a generator of chunk data from a diff.

  Args:
    diff: A list of strings, with each string being a line from a diff
      in unified diff format.

  Returns:
    A generator of tuples (added_lines_start, added_lines_end,
    removed_lines), one per contiguous run of '-'/'+' lines.
    [added_lines_start, added_lines_end) index lines in the new file;
    removed_lines holds the deleted lines' contents (prefix stripped).
  """
  it = iter(diff)
  for line in it:
    # Skip ahead to the next chunk header ('@@ -<range> +<range> @@').
    while not line.startswith('@@'):
      line = next(it)
    parts = line.split(' ')
    previous_start, previous_end = parse_chunk_header_file_range(
        parts[1].lstrip('-'))
    current_start, current_end = parse_chunk_header_file_range(
        parts[2].lstrip('+'))

    # Walk the chunk body, grouping consecutive '-'/'+' lines into deltas.
    in_delta = False
    added_lines_start = None
    added_lines_end = None
    removed_lines = []
    while previous_start < previous_end or current_start < current_end:
      line = next(it)
      firstchar = line[0]
      line = line[1:]
      if not in_delta and (firstchar == '-' or firstchar == '+'):
        # Start of a new delta; reset its accumulators.
        in_delta = True
        added_lines_start = current_start
        added_lines_end = current_start
        removed_lines = []

      if firstchar == '-':
        removed_lines.append(line)
        previous_start += 1
      elif firstchar == '+':
        current_start += 1
        added_lines_end = current_start
      elif firstchar == ' ':
        # A context line terminates the current delta, if any.
        if in_delta:
          in_delta = False
          yield (added_lines_start, added_lines_end, removed_lines)
        previous_start += 1
        current_start += 1
    # A delta that runs to the end of the chunk has no trailing context
    # line to flush it, so flush it here.
    if in_delta:
      yield (added_lines_start, added_lines_end, removed_lines)
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 287 | |
| 288 | |
def should_skip_commit(commit):
  """Decides if |commit| should be skipped when computing the blame.

  Commit 5d4451e deleted all files in the repo except for DEPS.  The
  next commit, 1e7896, brought them back.  This is a hack to skip
  those commits (except for the files they modified).  If we did not
  do this, changes would be incorrectly attributed to 1e7896.

  Args:
    commit: A Commit object.

  Returns:
    A boolean indicating if this commit should be skipped.
  """
  BANNED_COMMITS = (
      '1e78967ed2f1937b3809c19d91e7dd62d756d307',
      '5d4451ebf298d9d71f716cc0135f465cec41fcd0',
  )
  if commit.hash not in BANNED_COMMITS:
    return False
  EXEMPT_FILES = (
      'DEPS',
      'chrome/browser/ui/views/file_manager_dialog_browsertest.cc',
  )
  # Inspect the diff's file headers: a banned commit is kept only when it
  # touches one of the exempt files.  Hitting a chunk header first means
  # no exempt file header was seen, so the commit is skipped.
  for diff_line in commit.diff:
    if diff_line.startswith('---') or diff_line.startswith('+++'):
      if diff_line.split(' ')[1] in EXEMPT_FILES:
        return False
    elif diff_line.startswith('@@'):
      return True
  assert False
| 320 | |
| 321 | |
def generate_substrings(file):
  """Generates substrings from a binary file stream, where substrings are
  separated by '\0'.

  For example, the input:
    'a\0bc\0\0\0d\0'
  would produce the output:
    ['a', 'bc', 'd']

  Empty substrings are never yielded.

  Args:
    file: A readable binary file.
  """
  BUF_SIZE = 448  # Experimentally found to be pretty fast.
  pending = []  # Accumulates the pieces of a substring spanning reads.
  while True:
    buf = file.read(BUF_SIZE)
    pieces = buf.split(b'\0')
    # The first piece continues whatever was left from the last read.
    pending.append(pieces[0])
    if len(pieces) > 1:
      head = b''.join(pending)
      if head:
        yield head.decode()
      # Pieces strictly between the first and last are complete substrings.
      for piece in pieces[1:-1]:
        if piece:
          yield piece.decode()
      pending = [pieces[-1]]
    # A short read means end of stream: flush whatever is pending.
    if len(buf) < BUF_SIZE:
      tail = b''.join(pending)
      if tail:
        yield tail.decode()
      return
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 353 | |
| 354 | |
def generate_commits(git_log_stdout):
  """Parses NUL-delimited git log output into a stream of Commit objects.

  Args:
    git_log_stdout: A binary stream of 'git log -z --format=...' output,
      where each commit contributes six NUL-separated fields.
  """
  fields = generate_substrings(git_log_stdout)
  while True:
    try:
      commit_hash = next(fields)
      author_name = next(fields)
      author_email = next(fields)
      author_date = next(fields)
      message = next(fields).rstrip('\n')
      # Drop the leading blank line and the trailing empty entry so |diff|
      # holds only actual diff lines.
      diff = next(fields).split('\n')[1:-1]
    except StopIteration:
      # The stream is exhausted (possibly mid-record); stop cleanly.
      return
    yield Commit(commit_hash, author_name, author_email, author_date, message,
                 diff)
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 370 | |
| 371 | |
def uberblame_aux(file_name, git_log_stdout, data, tokenization_method):
  """Computes the uberblame of file |file_name|.

  Walks the git log from newest to oldest commit.  The working |blame|
  starts as the tokenization of |data|; each commit's diff is replayed in
  reverse so tokens first added by that commit get attributed to it, while
  surviving tokens are re-mapped into the previous version of the file for
  older commits to claim.

  Args:
    file_name: File to uberblame.  (Unused here; kept to mirror the
      uberblame() interface.)
    git_log_stdout: A file object that represents the git log output.
    data: A string containing the data of file |file_name|.
    tokenization_method: A function that takes a string and returns a list of
      TokenContexts.

  Returns:
    A tuple (data, blame).
    data: File contents.
    blame: A list of TokenContexts.
  """
  blame = tokenization_method(data)

  # |uber_blame| aliases the TokenContexts of the newest file version; the
  # loop below annotates those context objects with commits in place.
  uber_blame = (data, blame[:])

  for commit in generate_commits(git_log_stdout):
    if should_skip_commit(commit):
      continue

    # |offset| compensates for line insertions/removals made by earlier
    # chunks of this same commit so later chunk line numbers stay valid.
    offset = 0
    for (added_lines_start, added_lines_end,
         removed_lines) in parse_chunks_from_diff(commit.diff):
      added_lines_start += offset
      added_lines_end += offset
      previous_contexts = [
          token_lines
          for line_previous in removed_lines
          for token_lines in tokenization_method(line_previous)
      ]
      previous_tokens = [[context.token for context in contexts]
                         for contexts in previous_contexts]
      current_contexts = blame[added_lines_start:added_lines_end]
      current_tokens = [[context.token for context in contexts]
                        for contexts in current_contexts]
      added_token_positions, changed_token_positions = (
          compute_changed_token_positions(previous_tokens, current_tokens))
      # Tokens introduced by this commit: attribute them to it.
      for r, c in added_token_positions:
        current_contexts[r][c].commit = commit
      # Tokens that survived this commit: carry their (shared) contexts
      # back into the previous file version so older commits can claim
      # them on a later iteration.
      for r, c in changed_token_positions:
        pr, pc = changed_token_positions[(r, c)]
        previous_contexts[pr][pc] = current_contexts[r][c]

      assert added_lines_start <= added_lines_end <= len(blame)
      current_blame_size = len(blame)
      blame[added_lines_start:added_lines_end] = previous_contexts
      offset += len(blame) - current_blame_size

  # After replaying all history the working blame must be empty: the file
  # did not exist before its first commit.
  assert blame == [] or blame == [[]]
  return uber_blame
| 427 | |
| 428 | |
def uberblame(file_name, revision, tokenization_method):
  """Computes the uberblame of file |file_name|.

  Args:
    file_name: File to uberblame.
    revision: The revision to start the uberblame at.
    tokenization_method: A function that takes a string and returns a list of
      TokenContexts.

  Returns:
    A tuple (data, blame).
    data: File contents.
    blame: A list of TokenContexts.

  Raises:
    subprocess.CalledProcessError: If the git log command fails.
  """
  DIFF_CONTEXT = 3
  log_command = [
      'git', 'log', '--minimal', '--no-prefix', '--follow', '-m',
      '--first-parent', '-p',
      '-U%d' % DIFF_CONTEXT, '-z', '--format=%x00%H%x00%an%x00%ae%x00%ad%x00%B',
      revision, '--', file_name
  ]
  # Stream the log so huge histories are not buffered in memory.
  log_process = subprocess.Popen(
      log_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
  file_data = subprocess.check_output(
      ['git', 'show', '%s:%s' % (revision, file_name)]).decode()
  file_data, blame = uberblame_aux(file_name, log_process.stdout, file_data,
                                   tokenization_method)

  # Only check the exit status after the stream has been fully consumed.
  stderr = log_process.communicate()[1].decode()
  if log_process.returncode != 0:
    raise subprocess.CalledProcessError(log_process.returncode, log_command,
                                        stderr)
  return file_data, blame
| 461 | |
| 462 | |
def generate_pastel_color():
  """Generates a random color from a nice looking pastel palette.

  Returns:
    The color, formatted as hex string.  For example, white is "#FFFFFF".
  """
  # Draw hue, lightness, saturation in this exact order so a fixed random
  # seed keeps producing the same palette across runs.
  hue = random.uniform(0, 1)
  lightness = random.uniform(0.8, 0.9)
  saturation = random.uniform(0.5, 1)
  red, green, blue = colorsys.hls_to_rgb(hue, lightness, saturation)
  return "#%0.2X%0.2X%0.2X" % (int(red * 255), int(green * 255),
                               int(blue * 255))
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 473 | |
| 474 | |
Tom Anderson | ac918bb6 | 2018-04-17 00:37:34 | [diff] [blame] | 475 | def colorize_diff(diff): |
| 476 | """Colorizes a diff for use in an HTML page. |
| 477 | |
| 478 | Args: |
| 479 | diff: The diff, in unified diff format, as a list of line strings. |
| 480 | |
| 481 | Returns: |
| 482 | The HTML-formatted diff, as a string. The diff will already be escaped. |
| 483 | """ |
| 484 | |
| 485 | colorized = [] |
| 486 | for line in diff: |
Tom Anderson | 773d809 | 2022-03-23 20:47:29 | [diff] [blame] | 487 | escaped = html.escape(line.replace('\r', ''), quote=True) |
Tom Anderson | ac918bb6 | 2018-04-17 00:37:34 | [diff] [blame] | 488 | if line.startswith('+'): |
| 489 | colorized.append('<span class=\\"addition\\">%s</span>' % escaped) |
| 490 | elif line.startswith('-'): |
| 491 | colorized.append('<span class=\\"deletion\\">%s</span>' % escaped) |
| 492 | elif line.startswith('@@'): |
| 493 | context_begin = escaped.find('@@', 2) |
| 494 | assert context_begin != -1 |
| 495 | colorized.append( |
| 496 | '<span class=\\"chunk_meta\\">%s</span>' |
| 497 | '<span class=\\"chunk_context\\">%s</span' |
| 498 | % (escaped[0:context_begin + 2], escaped[context_begin + 2:])) |
| 499 | elif line.startswith('diff') or line.startswith('index'): |
| 500 | colorized.append('<span class=\\"file_header\\">%s</span>' % escaped) |
| 501 | else: |
| 502 | colorized.append('<span class=\\"context_line\\">%s</span>' % escaped) |
| 503 | return '\n'.join(colorized) |
| 504 | |
| 505 | |
def create_visualization(data, blame):
  """Creates a web page to visualize |blame|.

  The page has two columns: the blamed file on the left, with each run of
  same-commit tokens wrapped in a colored clickable <span>, and a commit
  viewer on the right, filled in by the inline JavaScript when a token is
  clicked.

  Args:
    data: The data file as returned by uberblame().
    blame: A list of TokenContexts as returned by uberblame().

  Returns:
    The HTML for the generated page, as a string.
  """
  # Use the same seed for the color generator on each run so that
  # loading the same blame of the same file twice will result in the
  # same generated HTML page.
  random.seed(0x52937865ec62d1ea)
  # The template is filled with (commit_data, line_nums, lines) at the end
  # of this function; literal '%' characters are escaped as '%%'.
  page = """\
  <html>
  <head>
  <style>
  body {
    font-family: monospace;
  }
  pre {
    display: inline;
  }
  .token {
    outline: 1pt solid #00000030;
    outline-offset: -1pt;
    cursor: pointer;
  }
  .addition {
    color: #080;
  }
  .deletion {
    color: #c00;
  }
  .chunk_meta {
    color: #099;
  }
  .context_line .chunk_context {
    // Just normal text.
  }
  .file_header {
    font-weight: bold;
  }
  #linenums {
    text-align: right;
  }
  #file_display {
    position: absolute;
    left: 0;
    top: 0;
    width: 50%%;
    height: 100%%;
    overflow: scroll;
  }
  #commit_display_container {
    position: absolute;
    left: 50%%;
    top: 0;
    width: 50%%;
    height: 100%%;
    overflow: scroll;
  }
  </style>
  <script>
  commit_data = %s;
  function display_commit(hash) {
    var e = document.getElementById("commit_display");
    e.innerHTML = commit_data[hash]
  }
  </script>
  </head>
  <body>
  <div id="file_display">
    <table>
      <tbody>
        <tr>
          <td valign="top" id="linenums">
            <pre>%s</pre>
          </td>
          <td valign="top">
            <pre>%s</pre>
          </td>
        </tr>
      </tbody>
    </table>
  </div>
  <div id="commit_display_container" valign="top">
    <pre id="commit_display" />
  </div>
  </body>
  </html>
  """
  page = textwrap.dedent(page)
  commits = {}
  lines = []
  commit_colors = {}
  blame_index = 0
  # Flatten the per-line context lists; tokens are then visited in file
  # order while walking the characters of |data| below.
  blame = [context for contexts in blame for context in contexts]
  row = 0
  lastline = ''
  for line in data.split('\n'):
    lastline = line
    column = 0
    for c in line + '\n':
      if blame_index < len(blame):
        token_context = blame[blame_index]
        # We just stepped past the end of the current token; close the
        # open <span> only if the next token belongs to a different
        # commit (same-commit runs share one span).
        if (row == token_context.row and
            column == token_context.column + len(token_context.token)):
          if (blame_index + 1 == len(blame) or blame[blame_index].commit.hash !=
              blame[blame_index + 1].commit.hash):
            lines.append('</span>')
          blame_index += 1
      if blame_index < len(blame):
        token_context = blame[blame_index]
        # At the start of a token: open a new colored <span> when this
        # token begins a new run of commits.
        if row == token_context.row and column == token_context.column:
          if (blame_index == 0 or blame[blame_index - 1].commit.hash !=
              blame[blame_index].commit.hash):
            hash = token_context.commit.hash
            commits[hash] = token_context.commit
            if hash not in commit_colors:
              commit_colors[hash] = generate_pastel_color()
            color = commit_colors[hash]
            lines.append(('<span class="token" style="background-color: %s" ' +
                          'onclick="display_commit("%s")">') % (color,
                                                                hash))
      lines.append(html.escape(c))
      column += 1
    row += 1
  # Build the JavaScript commit_data object literal: hash -> HTML string.
  commit_data = ['{\n']
  commit_display_format = """\
  commit: {hash}
  Author: {author_name} <{author_email}>
  Date: {author_date}

  {message}

  """
  commit_display_format = textwrap.dedent(commit_display_format)
  links = re.compile(r'(https?:\/\/\S+)')
  for hash in commits:
    commit = commits[hash]
    commit_display = commit_display_format.format(
        hash=hash,
        author_name=commit.author_name,
        author_email=commit.author_email,
        author_date=commit.author_date,
        message=commit.message)
    commit_display = html.escape(commit_display, quote=True)
    commit_display += colorize_diff(commit.diff)
    # Linkify URLs appearing in the commit message or diff.
    commit_display = re.sub(links, '<a href=\\"\\1\\">\\1</a>', commit_display)
    # Escape newlines so the value is a valid single-line JS string.
    commit_display = commit_display.replace('\n', '\\n')
    commit_data.append('"%s": "%s",\n' % (hash, commit_display))
  commit_data.append('}')
  commit_data = ''.join(commit_data)
  # Omit the line number for a trailing empty line.
  line_nums = range(1, row if lastline.strip() == '' else row + 1)
  line_nums = '\n'.join([str(num) for num in line_nums])
  lines = ''.join(lines)
  return page % (commit_data, line_nums, lines)
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 665 | |
| 666 | |
Tom Anderson | 773d809 | 2022-03-23 20:47:29 | [diff] [blame] | 667 | def show_visualization(page): |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 668 | """Display |html| in a web browser. |
| 669 | |
| 670 | Args: |
| 671 | html: The contents of the file to display, as a string. |
| 672 | """ |
| 673 | # Keep the temporary file around so the browser has time to open it. |
| 674 | # TODO(thomasanderson): spin up a temporary web server to serve this |
| 675 | # file so we don't have to leak it. |
| 676 | html_file = tempfile.NamedTemporaryFile(delete=False, suffix='.html') |
Tom Anderson | 773d809 | 2022-03-23 20:47:29 | [diff] [blame] | 677 | html_file.write(page.encode()) |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 678 | html_file.flush() |
| 679 | if sys.platform.startswith('linux'): |
| 680 | # Don't show any messages when starting the browser. |
| 681 | saved_stdout = os.dup(1) |
| 682 | saved_stderr = os.dup(2) |
| 683 | os.close(1) |
| 684 | os.close(2) |
| 685 | os.open(os.devnull, os.O_RDWR) |
| 686 | os.open(os.devnull, os.O_RDWR) |
| 687 | webbrowser.open('file://' + html_file.name) |
| 688 | if sys.platform.startswith('linux'): |
| 689 | os.dup2(saved_stdout, 1) |
| 690 | os.dup2(saved_stderr, 2) |
| 691 | os.close(saved_stdout) |
| 692 | os.close(saved_stderr) |
| 693 | |
| 694 | |
Tom Anderson | b3d7e64 | 2018-04-13 16:23:42 | [diff] [blame] | 695 | def main(argv): |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 696 | parser = argparse.ArgumentParser( |
Tom Anderson | b3d7e64 | 2018-04-13 16:23:42 | [diff] [blame] | 697 | description='Show what revision last modified each token of a file.') |
| 698 | parser.add_argument( |
| 699 | 'revision', |
| 700 | default='HEAD', |
| 701 | nargs='?', |
| 702 | help='show only commits starting from a revision') |
| 703 | parser.add_argument('file', help='the file to uberblame') |
| 704 | parser.add_argument( |
| 705 | '--skip-visualization', |
| 706 | action='store_true', |
| 707 | help='do not display the blame visualization in a web browser') |
| 708 | parser.add_argument( |
| 709 | '--tokenize-by-char', |
| 710 | action='store_true', |
| 711 | help='treat individual characters as tokens') |
| 712 | parser.add_argument( |
| 713 | '--tokenize-whitespace', |
| 714 | action='store_true', |
| 715 | help='also blame non-newline whitespace characters') |
| 716 | args = parser.parse_args(argv) |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 717 | |
Tom Anderson | b3d7e64 | 2018-04-13 16:23:42 | [diff] [blame] | 718 | def tokenization_method(data): |
| 719 | return tokenize_data(data, args.tokenize_by_char, args.tokenize_whitespace) |
| 720 | |
| 721 | data, blame = uberblame(args.file, args.revision, tokenization_method) |
| 722 | html = create_visualization(data, blame) |
| 723 | if not args.skip_visualization: |
| 724 | show_visualization(html) |
Tom Anderson | c3ed896 | 2017-10-09 19:01:46 | [diff] [blame] | 725 | return 0 |
| 726 | |
| 727 | |
# Propagate main()'s return value as the process exit status.
if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))