# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import difflib
import getopt
import json
import os
import pathlib
import sys

from publicsuffixlist import PublicSuffixList

from RwsCheck import RwsCheck

# Path of the canonical list. It is both the default input file and the
# baseline that `--with_diff` compares the input file against.
_CANONICAL_LIST_PATH = "related_website_sets.JSON"


def find_format_diff(rws_json_string, rws_sites):
    """Returns a formatting error message if rws_json_string is not canonical.

    The canonical form is `rws_sites` dumped with a 2-space indent, without
    ASCII escaping, and followed by a single trailing newline.

    Args:
        rws_json_string: string, the raw text of the file under review
        rws_sites: JSON Object, the parsed content of that same file
    Returns:
        String: "" when the text is already canonical, otherwise a message
        embedding a unified diff from the given text to the expected text.
    """
    # Add final newline by convention
    formatted_file = json.dumps(rws_sites, indent=2, ensure_ascii=False) + "\n"
    if rws_json_string == formatted_file:
        return ""
    diff = difflib.unified_diff(
        rws_json_string.splitlines(keepends=True),
        formatted_file.splitlines(keepends=True),
        fromfile="PR file",
        tofile="expected",
    )
    joined_diff = "".join(diff)
    return f"Formatting for JSON is incorrect;\nerror was:\n```diff\n{joined_diff}\n```"


def find_diff_sets(old_sets, new_sets):
    """Finds changes made between two dictionaries of Related Website Sets

    Finds Related Website Sets that have been added or modified in old_sets
    to create new_sets and returns them as the dictionary diff_sets.
    Additionally, finds Related Website Sets that have been removed from
    old_sets to create new_sets and returns them as subtracted_sets. A set
    whose primary disappeared is not reported as removed when some set in
    new_sets still includes that primary (per `RwsSet.includes`).

    Args:
        old_sets: Dict[string, RwsSet]
        new_sets: Dict[string, RwsSet]
    Returns:
        Tuple[Dict[string, RwsSet], Dict[string, RwsSet]]:
        (diff_sets, subtracted_sets), both keyed by primary site.
    """
    # A primary that is new, or whose set differs from the old one, is a diff.
    diff_sets = {
        primary: rws
        for primary, rws in new_sets.items()
        if rws != old_sets.get(primary)
    }
    # Only primaries that are gone from the keys AND are not included by any
    # remaining set count as removed.
    subtracted_sets = {
        primary: old_sets[primary]
        for primary in set(old_sets) - set(new_sets)
        if not any(rws.includes(primary) for rws in new_sets.values())
    }
    return diff_sets, subtracted_sets


def run_nonbreaking_checks(rws_checker, rws_json_string, strict_formatting, check_sets):
    """Runs every check whose failure should not stop the program immediately.

    Collects a list of `error_texts` from `find_format_diff` (only when
    `strict_formatting` is set), from the exclusivity check over all loaded
    sets, and from a series of RwsCheck functions run on `check_sets`. Those
    RwsCheck functions may also append to `rws_checker.error_list`.

    Args:
        rws_checker: RwsCheck object
        rws_json_string: string
        strict_formatting: boolean
        check_sets: Dict[string, RwsSet]
    Returns:
        List of error entries. The formatting entry is a string; entries
        produced by a failing check are the caught exception objects
        themselves, which callers print directly.
    """
    error_texts = []
    if strict_formatting and (
        format_diff := find_format_diff(rws_json_string, rws_checker.rws_sites)
    ):
        error_texts.append(format_diff)
    # Exclusivity is evaluated over every set in the file, not only the
    # subset selected in check_sets.
    try:
        rws_checker.check_exclusivity(rws_checker.load_sets())
    except Exception as inst:
        error_texts.append(inst)
    # These are RwsCheck functions that may append to the
    # rws_checker's error_list.
    check_list = [
        rws_checker.has_all_rationales,
        rws_checker.find_non_https_urls,
        rws_checker.find_invalid_eTLD_Plus1,
        rws_checker.find_invalid_well_known,
        rws_checker.find_invalid_alias_eSLDs,
        rws_checker.find_robots_tag,
        rws_checker.find_ads_txt,
        rws_checker.check_for_service_redirect,
    ]
    for check in check_list:
        # One failing check must not prevent the remaining checks from running.
        try:
            check(check_sets)
        except Exception as inst:
            error_texts.append(inst)
    return error_texts


def main():
    """Command-line entry point: validates a Related Website Sets file.

    Options (parsed from sys.argv with getopt):
        -i PATH              file to check (defaults to the canonical list)
        --with_diff          only check sets that differ from the canonical
                             list, and validate removed sets
        --strict_formatting  also require canonical JSON formatting
        -p, --primaries A,B  restrict checks to sets with these primaries

    Prints every collected error, or "success" (without a trailing newline)
    when no error was found. Returns early, after printing the reason, when
    the input JSON cannot be parsed, when schema validation fails, or when
    the canonical list cannot be loaded for `--with_diff`.
    """
    args = sys.argv[1:]
    input_filepath = _CANONICAL_LIST_PATH
    cli_primaries = []
    with_diff = False
    strict_formatting = False
    opts, _ = getopt.getopt(
        args, "i:p:", ["with_diff", "strict_formatting", "primaries="]
    )
    for opt, arg in opts:
        if opt == "-i":
            input_filepath = arg
        elif opt == "--with_diff":
            with_diff = True
        elif opt == "--strict_formatting":
            strict_formatting = True
        elif opt in ("-p", "--primaries"):
            cli_primaries.extend(arg.split(","))

    rws_json_string = pathlib.Path(input_filepath).read_text()
    try:
        rws_sites = json.loads(rws_json_string)
    except Exception as inst:
        # If the file cannot be loaded, we will not run any other checks
        print(f"There was an error when parsing the JSON;\nerror was: {inst}")
        return

    # Load the etlds from the public suffix list
    with open("effective_tld_names.dat", "rb") as f:
        etlds = PublicSuffixList(f)
    # Get all the ICANN domains, one per line
    with open("ICANN_domains") as f:
        icanns = {line.strip() for line in f}

    rws_checker = RwsCheck(rws_sites, etlds, icanns)
    try:
        rws_checker.validate_schema("SCHEMA.json")
    except Exception as inst:
        # If the schema is invalid, we will not run any other checks
        print(inst)
        return

    error_texts = []
    check_sets = {}
    subtracted_sets = {}
    # If called with with_diff, we must determine the sets that are different
    # to properly construct our check_sets
    if with_diff:
        with open(_CANONICAL_LIST_PATH) as f:
            try:
                old_sites = json.load(f)
            except Exception as inst:
                # If the file cannot be loaded, we will not run any other
                # checks. The exception is interpolated rather than
                # concatenated: `str + Exception` raises TypeError and would
                # hide the real error.
                print(
                    f"There was an error when loading {_CANONICAL_LIST_PATH}"
                    f"\nerror was: {inst}"
                )
                return
        old_checker = RwsCheck(old_sites, etlds, icanns)
        check_sets, subtracted_sets = find_diff_sets(
            old_checker.load_sets(), rws_checker.load_sets()
        )
    else:
        check_sets = rws_checker.load_sets()

    if cli_primaries:
        # Report requested primaries that are not among the sets to check,
        # then narrow the checks to the ones that are.
        absent_primaries = [p for p in cli_primaries if p not in check_sets]
        for p in absent_primaries:
            error_texts.append(
                "There was an error loading the set:\n"
                + f'could not find set with primary site "{p}"'
            )
        check_sets = {p: check_sets[p] for p in cli_primaries if p in check_sets}

    # Run check on subtracted sets
    rws_checker.find_invalid_removal(subtracted_sets)
    # Run remaining technical checks
    error_texts += run_nonbreaking_checks(
        rws_checker, rws_json_string, strict_formatting, check_sets
    )

    # This message allows us to check the success of our action
    if rws_checker.error_list or error_texts:
        for checker_error in rws_checker.error_list:
            print(checker_error)
        for error_text in error_texts:
            print(error_text)
    else:
        print("success", end="")


if __name__ == "__main__":
    main()