# Copyright 2016 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Uses SWIG output to convert C++ code snippets to C# code snippets.

Basic Usage:
    swig_debug_parser.py -d <swig_debug_output> -s <csharp_source_files>

When running SWIG, use `-debug-top 4` and pipe the output to a file to collect
the debug information about the SWIG parse state. This information can be used
by this script to generate a mapping from C++ identifiers to C# identifiers.
Then, the script looks for code snippets in the comments of the C# source
(denoted by areas in comments wrapped in backticks) and replaces all C++
identifiers with the appropriate C# identifiers.
"""

import re

from absl import app
from absl import flags
from absl import logging

FLAGS = flags.FLAGS

flags.DEFINE_spaceseplist('srcs', None,
                          'The C# source files to process in-place.',
                          short_name='s')
flags.DEFINE_string('debug_top', None,
                    'The SWIG output file generated by running SWIG with the '
                    'argument `debug_top 4`.',
                    short_name='d')
flags.DEFINE_string('namespace', None,
                    'Place all C# identifiers in the given namespace.',
                    short_name='n')

# Matches strings of the form '+++ SomeToken --------------------------'
NODE_HEADER_REGEX = re.compile(r'\+\+\+ ([a-zA-Z0-9_:]+) -+')

# Matches strings of the form ' | Key - "StringValue"'
NODE_FIELD_REGEX = re.compile(r' *\| ([a-zA-Z0-9_:]+) *- "(.+)"')

# Matches strings of the form ' | Key - 0xdeadbeef'
NODE_HEX_FIELD_REGEX = re.compile(r' *\| ([a-zA-Z0-9_:]+) *- (0x[0-9a-f]+)')

# Matches strings of the form '... `some::function::name(int, std::string)` ...'
# NOTE(review): the captured identifier class contains no digits, so a name
# such as `Vector3` is only captured up to the first digit -- confirm whether
# that is intended.
FUNCTION_NAME_REGEX = re.compile(r'`([a-zA-Z_:]+)[^`]*`')

# When parsing nodes it is useful to know what kind of node is being parsed, so
# a special key is added to each node so that it is aware of what list it came
# from.
NODE_TYPE_KEY = '__type__'

# We have to store children in a special sub-dict because some of the child
# node identifiers can clash with field keys.
NODE_CHILDREN_KEY = '__children__'


class PeekableIter(object):
    """A simple iterator wrapper that supplies a peek function.

    This is useful when doing things like parsing text, where you often want
    to see the next token before consuming it.
    """

    def __init__(self, iterable):
        """Wraps the given iterable.

        Args:
          iterable: Any iterable; it is consumed lazily, one item at a time.
        """
        self.iterator = iter(iterable)
        # Holds the item fetched by peek() until next() consumes it.
        self.next_value = None
        # Tracked separately from next_value so that a buffered item that
        # happens to be None is not mistaken for "nothing buffered".
        self._has_peeked = False

    def __iter__(self):
        return self

    def next(self):
        """Returns the next value being iterated over.

        Raises:
          StopIteration: The underlying iterator is exhausted.
        """
        if self._has_peeked:
            value = self.next_value
            self.next_value = None
            self._has_peeked = False
            return value
        return next(self.iterator)

    # Python 3 spelling of the iterator protocol, so that `for` loops and the
    # builtin next() work on this object as __iter__ promises.
    __next__ = next

    def peek(self):
        """Returns the next value being iterated over without consuming it.

        Raises:
          StopIteration: The underlying iterator is exhausted.
        """
        if not self._has_peeked:
            self.next_value = next(self.iterator)
            self._has_peeked = True
        return self.next_value


def indentation(line):
    """Returns the number of leading whitespace characters."""
    return len(line) - len(line.lstrip())


def parse_children(node, it, indent):
    """Parses a child node on the given node.

    Child nodes start with three '+', followed by the node name, followed by a
    series of -'s. SWIG Debug output looks something like the following:

      +++ somenode ----------------------------------------
      | field - "Value"
      | another_field - "Some other value"

          +++ childnode ----------------------------------------
          | child_field - "blah blah blah"

    Each node has a header which declares what kind of node it is, followed by
    zero or more key value pairs, followed by zero or more child nodes which
    follow the same pattern. Sometimes there is a blank line, or a line
    consisting of only a vertical pipe character between the final field and
    the child nodes.

    The parsed child is appended to the list stored under
    node[NODE_CHILDREN_KEY][<node name>], and is tagged with its node name
    under NODE_TYPE_KEY.

    Args:
      node: The node to parse for children.
      it: A PeekableIter to iterate over the lines of the swig debug output.
      indent: The current indentation level. Used to determine when to recurse
        into children or return.

    Raises:
      ValueError: The next line is not a well-formed node header.
    """
    header = it.next()
    header_match = NODE_HEADER_REGEX.search(header)
    if header_match is None:
        raise ValueError('Malformed SWIG node header: %r' % header)
    field = header_match.group(1)

    children = node.setdefault(NODE_CHILDREN_KEY, {})
    field_list = children.setdefault(field, [])
    child_node = parse_node(it, indent)
    child_node[NODE_TYPE_KEY] = field
    field_list.append(child_node)


def consume_empty_lines(it):
    """Skip over the trailing lines at the end of a node.

    Some nodes have a trailing newline or pipe character for no real reason,
    so we skip them.

    Args:
      it: A PeekableIter to iterate over the lines of the swig debug output.

    Raises:
      StopIteration: The iterator reached the end.
    """
    while True:
        line = it.peek()
        # Only truly empty lines and lines holding a lone pipe are skipped.
        if line and line.strip() != '|':
            return
        it.next()


def parse_node(it, indent):
    """Parses the fields of the node and returns a dict with those fields.

    Field lines at exactly `indent` are read first; string fields are stored
    as-is and hexadecimal fields are stored as ints. Any following lines that
    are indented deeper than `indent` are then parsed as child nodes. Child
    nodes start with three '+', followed by the node name, followed by a
    series of -'s.

    Args:
      it: A PeekableIter to iterate over the lines of the swig debug output.
      indent: The current indentation level. Used to determine when to recurse
        into children or return.

    Returns:
      A dict containing the parsed fields.
    """
    node = {}
    try:
        # Phase 1: the node's own key/value fields.
        while indentation(it.peek()) == indent:
            line = it.peek()
            # A header at our own indentation is the start of a sibling node.
            # Only the start of the line is checked so that a field whose
            # value merely contains '+++' is still read as a field.
            if line.lstrip().startswith('+++'):
                break
            # Check if this is a field we can read; unrecognised lines are
            # skipped.
            matches = NODE_FIELD_REGEX.search(line)
            if matches:
                key, value = matches.groups()
                node[key] = value
            else:
                matches = NODE_HEX_FIELD_REGEX.search(line)
                if matches:
                    key, value = matches.groups()
                    node[key] = int(value, 16)
            it.next()
            consume_empty_lines(it)

        # Phase 2: child nodes, which are indented deeper than this node.
        while indentation(it.peek()) > indent:
            parse_children(node, it, indentation(it.peek()))
            consume_empty_lines(it)
    except StopIteration:
        # Running out of input simply ends the node.
        pass
    return node


def parse_swig_debug_top(it):
    """Parses the output of `swig --debug-top 4`.

    Parses the debug output into a series of nested dictionaries, which we can
    use to generate a map of C++ classes and enums to C# classes and enums.

    Args:
      it: A PeekableIter to iterate over the lines of the swig debug output.

    Returns:
      A dictionary representing the root node of the parse tree. The first
      node found in the output is stored as a child of this dictionary, under
      NODE_CHILDREN_KEY.

    Raises:
      ValueError: The input contains no line starting with '+++', or the
        first such line is not a well-formed node header.
    """
    # The debug output begins with a bunch of stuff we don't care about. Parse
    # forward until we find a line starting with +++, which indicates the root
    # node of the tree.
    try:
        while not it.peek().startswith('+++'):
            it.next()
    except StopIteration:
        raise ValueError('No SWIG node header ("+++ ...") found in input.')
    root_node = {}
    parse_children(root_node, it, 0)
    return root_node


def gather_subtitution_data(node, identifier_metadata_map, file_module_map,
                            current_class=None, includes=None):
    """Builds metadata needed to perform identifier substitution on C# sources.

    This is basically doing the first phase of two-phase parsing. We need to
    build a mapping between C++ identifiers to C# identifiers. However, the C#
    class names are not necessarily known until after the nodes are parsed. To
    solve that, a second mapping between filenames and the modules they
    represent is used. The identifier map stores some metadata, including what
    file it was declared in. Those together can later be used to evaluate what
    class a given identifier should belong to.

    Both maps are filled in place; nothing is returned.

    Args:
      node: The current node being scanned for metadata.
      identifier_metadata_map: A map between C++ identifiers and the metadata
        needed to determine their C# identifier.
      file_module_map: The map between files and which module they represent.
      current_class: The class that the data in the current node refers to.
      includes: The list of includes that we had to parse to get to this
        object.
    """
    local_includes = includes or []
    node_type = node.get(NODE_TYPE_KEY)

    # Check if we've recursed into an included file.
    if node_type in ('include', 'import'):
        name = node.get('name')
        if name:
            # Build a new list rather than appending in place, so that sibling
            # nodes sharing the parent's list are unaffected. Copying
            # local_includes (not `includes`) keeps this safe when the caller
            # relied on the includes=None default.
            local_includes = local_includes + [name]

        # If this include file has declared that its contents is part of a
        # module, record the mapping between the file name and the module
        # name.
        module = node.get('module')
        if module and isinstance(name, str):
            file_module_map[name] = module

    # Add classes and structs to the identifier map.
    elif node_type in ('class', 'struct'):
        current_class = node.get('name')
        metadata = {'includes': local_includes}
        if FLAGS.namespace:
            metadata['namespace'] = FLAGS.namespace
        identifier_metadata_map[current_class] = metadata

    # Add C++ function declarations to the identifier map.
    elif node_type == 'cdecl':
        symname = node.get('sym:name')
        name = node.get('name')
        # Only qualify a real name; otherwise a nameless declaration inside a
        # class would be recorded under the bogus key 'Class::None'.
        if name and current_class:
            name = '%s::%s' % (current_class, name)
        if name and symname:
            metadata = {
                'includes': local_includes,
                'symname': symname,
            }
            if FLAGS.namespace:
                metadata['namespace'] = FLAGS.namespace
            identifier_metadata_map[name] = metadata

    # Recurse into all children nodes and repeat.
    child_nodes = node.get(NODE_CHILDREN_KEY, {})
    for children in child_nodes.values():
        for child_node in children:
            gather_subtitution_data(child_node, identifier_metadata_map,
                                    file_module_map, current_class,
                                    local_includes)


def resolve_module(includes, file_module_map):
    """Scan backwards through the list of includes to find the module.

    The files that SWIG operates on can recursively include other files. We
    are only interested in the most recent module declaration, so we scan the
    list backwards until we find a node that has declared a module.

    Args:
      includes: The list of includes that we had to parse to get to this
        object.
      file_module_map: The map between files and which module they represent.

    Returns:
      The current module, if any, for the given list of includes.
    """
    for filename in reversed(includes):
        module = file_module_map.get(filename)
        if module:
            return module
    return None


def generated_substitution_map(identifier_metadata_map, file_module_map):
    """Uses the collected metadata to create a dict of C++ to C# identifiers.

    Identifiers whose metadata has no includes are left out of the result.
    For the rest, the C# name is the namespace, module and symname (whichever
    of them are present) joined with '.'.

    Args:
      identifier_metadata_map: A map between C++ identifiers and the metadata
        needed to determine their C# identifier.
      file_module_map: The map between files and which module they represent.

    Returns:
      A dict of fully qualified C++ identifiers and the C# identifiers they
      map to.
    """
    substitution_map = {}
    for identifier, metadata in identifier_metadata_map.items():
        includes = metadata.get('includes')
        if not includes:
            continue
        candidates = (
            metadata.get('namespace'),
            resolve_module(includes, file_module_map),
            metadata.get('symname'),
        )
        cs_name = [part for part in candidates if part]
        substitution_map[identifier] = '.'.join(cs_name)
    return substitution_map


def perform_substitution(match, substitution_map):
    """Substitutes C++ identifiers with C# identifiers.

    We want to perform substitutions on function names, but not accidentally
    hit anything else in the string. For example, if the line looks like this:

      /// Returns true if `firebase::crash::Initialize()` has been called.

    Then we want the final string to be:

      /// Returns true if `Firebase.Crash.Initialize()` has been called.

    The regex looks for identifiers enclosed within backticks ignoring things
    like parentheses. If we did the substitution directly, the backticks and
    parentheses would be lost. Instead, what we do is find out what the
    captured match was (in this case, 'firebase::crash::Initialize') then take
    the whole match ('`firebase::crash::Initialize()`'), and substitute just
    the portion we care about so that the surrounding characters can be
    preserved.

    Args:
      match: The re.Match object representing the match.
      substitution_map: The dict of potential substitutions.

    Returns:
      The new C# code resulting from performing the substitutions. The matched
      text is returned unchanged when the identifier has no (non-empty) entry
      in the substitution map.
    """
    full_match = match.group(0)
    cpp = match.group(1)
    cs = substitution_map.get(cpp)
    if cs:
        return full_match.replace(cpp, cs)
    return full_match


def apply_substitution(file_content, substitution_map):
    """Apply the substitution map to an entire file.

    Args:
      file_content: The text on which to perform the substitutions.
      substitution_map: The dict of potential substitutions.

    Returns:
      A new string with all substitutions performed.
    """
    return FUNCTION_NAME_REGEX.sub(
        lambda match: perform_substitution(match, substitution_map),
        file_content)


def main(unused_argv):
    """Converts references to C++ identifiers into C# identifiers.

    Given the output of running SWIG with the argument `-debug-top 4`, convert
    all references to C++ identifiers into C# identifiers in the given files,
    optionally prepending all C# identifiers with the namespace given by -n
    """
    with open(FLAGS.debug_top, 'r') as debug_file:
        debug_file_content = debug_file.read()

    # Parse the debug output into a format we can work with.
    debug_line_iter = PeekableIter(debug_file_content.splitlines())
    debug_data = parse_swig_debug_top(debug_line_iter)

    # Gather the data we need to perform the substitutions.
    identifier_metadata_map = {}
    file_module_map = {}
    gather_subtitution_data(debug_data, identifier_metadata_map,
                            file_module_map)
    substitution_map = generated_substitution_map(identifier_metadata_map,
                                                  file_module_map)

    for src in FLAGS.srcs:
        with open(src, 'r') as cs_file:
            file_content = cs_file.read()

        # Apply the maps to the C# code.
        file_content = apply_substitution(file_content, substitution_map)

        # Writing back is best-effort: a file that cannot be written is
        # reported and skipped.
        try:
            with open(src, 'w') as cs_file:
                cs_file.write(file_content)
        except IOError as e:
            # Log the path; `cs_file` would be a stale, closed file object.
            logging.warning('Unable to patch file %s (%s)', src, str(e))


if __name__ == '__main__':
    flags.mark_flag_as_required('debug_top')
    flags.mark_flag_as_required('srcs')
    app.run(main)