#!/bin/env python3
# Generates a report on the amount of code sharing in this repo
#
# The purpose of this is
# a) To be able to understand the structure and dependencies
# b) To provide a metric that measures the amount of shared vs non-shared code

import datetime
from pathlib import Path
import json
import yaml

# Names of the top-level repository folders that are reported as languages.
# To add more languages, add them to this list:
languages = ['cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ql', 'ruby', 'swift']

# Root of the repository: the third ancestor directory of this script file.
# The path is resolved first because `__file__` may be relative (for example
# when the script is started by a relative path on Python older than 3.9);
# taking `.parent` of a short relative path collapses to '.' instead of
# climbing to the repository root.
repo_location = Path(__file__).resolve().parent.parent.parent

# Gets the total number of lines in a file
#
# `file` is anything accepted by `open()`. The file is decoded as UTF-8 with
# undecodable bytes replaced, so that neither a stray byte nor a non-UTF-8
# default locale can abort the whole report. The lines are counted while
# streaming, so the file content is never held in memory as a whole.
def linecount(file):
    with open(file, 'r', encoding='utf-8', errors='replace') as fp:
        return sum(1 for _ in fp)

# Gets the language name from the path: the path component that comes
# directly after the components of `repo_location`.
# Assumes `path` lies below `repo_location`; this is not checked here.
def get_language(path):
    repo_depth = len(repo_location.parts)
    return path.parts[repo_depth]

# Is this path a CodeQL query file, i.e. does its name end in `.ql`
def is_query(path):
    extension = path.suffix
    return extension == '.ql'

# Is this path a CodeQL library file, i.e. does its name end in `.qll`
def is_library(path):
    extension = path.suffix
    return extension == '.qll'

# Is this path a relevant CodeQL file: either a query or a library
def is_ql(path):
    return any(matches(path) for matches in (is_query, is_library))

# Is this file a CodeQL package file, i.e. is it named exactly `qlpack.yml`
def is_package(path):
    filename = path.name
    return filename == 'qlpack.yml'

# A CodeQL source file
class QlFile:
    # Whether the file is part of an `identical-files.json` group.
    # Stays False until one of the identical-files classes sets it.
    shared = False

    # The asset (package or identical-file set) that has claimed this file.
    # Defaults to None so that files outside every pack can still be
    # inspected by isOnlyInLanguage() instead of raising AttributeError.
    package = None

    # Records the path and counts the lines of the file right away.
    def __init__(self, path):
        self.path = path
        self.lines = linecount(path)

    # Gets the language folder that contains this file
    def language(self):
        return get_language(self.path)

    # Is this file a query (`.ql`)
    def query(self):
        return is_query(self.path)

    # Is this file a library (`.qll`)
    def library(self):
        return is_library(self.path)

    # Returns if this qlfile is not shared, and is in a pack that is only in one language
    # (a file that is in no pack at all also counts), and lives in `language`.
    def isOnlyInLanguage(self, language):
        if self.shared:
            return False
        if self.package is not None and self.package.languages != {language}:
            return False
        return self.language() == language

# Represents a language folder and accumulates its code-sharing counters
class Language:
    def __init__(self, name):
        self.name = name
        self.packs = []
        # Files/lines that were added through addQlFile()
        self.nonshared_files = self.nonshared_lines = 0
        # Files/lines that were added through addSharedAsset()
        self.imported_files = self.imported_lines = 0

    # Counts `qlfile` as non-shared code; files flagged as shared are ignored.
    def addQlFile(self, qlfile):
        if qlfile.shared:
            return
        self.nonshared_lines += qlfile.lines
        self.nonshared_files += 1

    # Adds the file and line totals of `package` to the imported counters.
    def addSharedAsset(self, package):
        self.imported_lines += package.lines
        self.imported_files += package.files

# A shared package or file.
# Common base class of the asset kinds in this script; it only stores the name.
class SharedAsset:
    def __init__(self, name):
        # Name under which the asset is listed
        self.name = name

# A file shared using identical-files.json
#
# `ql_files` are the QlFile copies that make up one group. Each copy is
# flagged as shared and pointed at this asset. The whole group counts as a
# single file: `files` becomes 1 and `lines` is taken from the last copy
# rather than summed (presumably because the copies are identical).
# An empty group keeps both counters at 0.
class IdenticalFileSet(SharedAsset):
    def __init__(self, name, ql_files):
        self.name = name
        self.languages = set()
        self.files, self.lines = 0, 0
        for copy in ql_files:
            copy.package = self
            copy.shared = True
            self.files, self.lines = 1, copy.lines
            # Every language folder holding a copy uses this asset
            self.languages.add(copy.language())

    # Gets a pretty-printed markdown link
    # (there is no URL for a file group, so this is just the group name)
    def link(self):
        return self.name

# Represents all files shared in `identical-files.json`
# Reads `config/identical-files.json` below `repo_location` and builds one
# IdenticalFileSet per group in `self.assets`.
#
# Only the `.ql`/`.qll` members of a group are considered; each of them must
# be present in `ql_file_index` (a KeyError is raised otherwise). A group
# without any such member still produces an (empty) asset.
class IdenticalFiles:
    def __init__(self, repo_location, ql_file_index):
        config_file = repo_location/'config'/'identical-files.json'
        with open(config_file, "r") as fp:
            groups = json.load(fp)
        # Create a list of assets
        self.assets = []
        for group_name in groups:
            members = []
            for relative_path in groups[group_name]:
                candidate = repo_location / relative_path
                if not is_ql(candidate):
                    continue
                ql_file = ql_file_index[candidate]
                ql_file.shared = True
                members.append(ql_file)
            self.assets.append(IdenticalFileSet(group_name, members))

# A package created from a `qlpack.yml` file
class Package(SharedAsset):
    # Reads the manifest at `path` and claims the files of `ql_file_index`
    # (a map from path to QlFile) that are located in the manifest's folder
    # or below it.
    #
    # Non-shared files are tagged with this package and added to `files` and
    # `lines`. Shared files are left untouched; the asset they already belong
    # to is recorded in `identical_files_dependencies` instead.
    def __init__(self, path, ql_file_index):
        self.path = path
        self.language = get_language(path)
        self.lines = 0
        self.files = 0
        # Languages that use this package. Starts with the package's own
        # language; calculateDependencies() of dependent packages adds more.
        self.languages = {self.language}
        self.identical_files_dependencies = set()

        with open(path, 'r', encoding='utf-8') as fp:
            # An empty manifest is parsed as None; treat it as an empty mapping.
            manifest = yaml.safe_load(fp) or {}
        # Fall back to the folder name when the manifest has no `name`.
        self.name = manifest.get('name', path.parent.name)
        # A `dependencies:` key without entries is parsed as None.
        declared = manifest.get('dependencies')
        self.deps = {} if declared is None else declared

        # Mark all relevant files with their package
        for file_path, ql_file in ql_file_index.items():
            if not self.containsDirectory(file_path):
                continue
            if ql_file.shared:
                self.identical_files_dependencies.add(ql_file.package)
            else:
                ql_file.package = self
                self.lines += ql_file.lines
                self.files += 1

        # Link to the manifest on GitHub. as_posix() keeps forward slashes in
        # the URL on every platform.
        self.url = "https://github.com/github/codeql/blob/main/" + path.relative_to(repo_location).as_posix()

    # Gets a pretty-printed markdown link
    def link(self):
        return '[' + self.name + '](' + self.url + ')'

    # Holds if `dir` is the folder of this package's `qlpack.yml` or a path below it.
    def containsDirectory(self, dir):
        package_root = self.path.parent.parts
        return package_root == dir.parts[:len(package_root)]

    # Constructs a list of transitive dependencies of this package.
    #
    # `packageNameMap` maps package names to Package objects; a KeyError is
    # raised when a dependency name is missing from it.
    # Sets `transitive_dependencies` (names), `all_dependencies` (the Package
    # objects for those names plus the identical-file assets used by this
    # package), `total_imported_files` and `total_imported_lines`, and
    # registers this package's language on every dependency.
    def calculateDependencies(self, packageNameMap):
        self.transitive_dependencies = set(self.deps)
        queue = list(self.deps)
        while queue:
            item = queue.pop()
            for dep2 in packageNameMap[item].deps:
                # The membership test also stops cyclic dependencies from looping forever
                if dep2 not in self.transitive_dependencies:
                    self.transitive_dependencies.add(dep2)
                    queue.append(dep2)
        # Calculate the amount of imported code
        self.total_imported_files = 0
        self.total_imported_lines = 0
        self.all_dependencies = set(self.identical_files_dependencies)
        for dep in self.transitive_dependencies:
            self.all_dependencies.add(packageNameMap[dep])
        for dep in self.all_dependencies:
            self.total_imported_files += dep.files
            self.total_imported_lines += dep.lines
            dep.languages.add(self.language)

# Create a big index of all files and their line counts.

# Map from the path of every CodeQL file to its QlFile (which holds the line count)
ql_file_index = {}
# Paths of all `qlpack.yml` files that were found
package_files = []

# Stack of directories that still have to be read, starting at the repository root
directories_to_scan = [repo_location]

while directories_to_scan:
    current_dir = directories_to_scan.pop()
    for entry in current_dir.iterdir():
        if entry.is_dir():
            directories_to_scan.append(entry)
            continue
        if is_ql(entry):
            ql_file_index[entry] = QlFile(entry)
            continue
        if is_package(entry):
            package_files.append(entry)

# Read identical-files.json; this flags the affected files as shared
identical_files = IdenticalFiles(repo_location, ql_file_index)

# Create packages
# Do this after identical_files so that we can figure out the package sizes
# (a package does not count files that are already flagged as shared)
# Do this after getting the ql_file_index fully built
packages = [Package(manifest, ql_file_index) for manifest in package_files]

# List all shared assets: first the packages, then the identical-file groups
shared_assets = [*packages, *identical_files.assets]

# Construct statistics for each language
language_info = {name: Language(name) for name in languages}

# Count the files that belong to exactly one of the reported languages.
# NOTE(review): this runs before calculateDependencies() is called, so each
# package's `languages` set still holds only the package's own language at
# this point - confirm that this ordering is intended.
for qlfile in ql_file_index.values():
    lang = qlfile.language()
    info = language_info.get(lang)
    if info is not None and qlfile.isOnlyInLanguage(lang):
        info.addQlFile(qlfile)

# Determine all package dependencies

# Map from package name to package; for duplicate names the last package wins
packageNameMap = {package.name: package for package in packages}

for package in packages:
    package.calculateDependencies(packageNameMap)

# Credit every asset that is used by more than one language to each of the
# reported languages that use it
for asset in shared_assets:
    if len(asset.languages) <= 1:
        continue
    for lang in asset.languages:
        if lang in language_info:
            language_info[lang].addSharedAsset(asset)


# Functions to output the results

# Prints a markdown table with one row per asset (link, files, lines) and one
# column per key of `language_info` that says `yes` when the asset is used by
# that language. The table is followed by an empty line.
def list_assets(shared_assets, language_info):
    heading = '| Asset | Files | Lines |'
    for lang in language_info:
        heading += ' ' + str(lang) + ' |'
    print(heading)

    divider = '| ----- | ----- | ----- |'
    for lang in language_info:
        divider += ' ---- |'
    print(divider)

    for asset in shared_assets:
        cells = ('|', asset.link(), '|', asset.files, '|', asset.lines, '|')
        row = ' '.join(str(cell) for cell in cells) + ' '
        for lang in language_info:
            row += 'yes | ' if lang in asset.languages else '    | '
        print(row)
    print()

# Prints a plain-text summary line for `package`, followed by one indented
# line (name, files, lines) for each entry of its `all_dependencies`.
def list_package_dependencies(package):
    summary = (
        package.path,
        package.name,
        package.files,
        package.lines,
        package.total_imported_files,
        package.total_imported_lines,
    )
    print("Package", *summary)
    for dependency in package.all_dependencies:
        print("  ", dependency.name, dependency.files, dependency.lines)

# Prints a markdown table with one row per package: its own files and lines,
# the files and lines it imports, and the imported share of all its lines as
# a percentage (0 when the package has no lines at all). The table is
# followed by an empty line.
def print_package_dependencies(packages):
    print('| Package name | Non-shared files | Non-shared lines of code | Imported files | Imported lines of code | Shared code % |')
    print('| ------------ | ---------------- | ------------------------ | -------------- | ---------------------- | ------------- |')
    for package in packages:
        nlines = package.lines + package.total_imported_lines
        if nlines > 0:
            shared_percentage = 100 * package.total_imported_lines / nlines
        else:
            shared_percentage = 0
        row = (
            package.link(),
            package.files,
            package.lines,
            package.total_imported_files,
            package.total_imported_lines,
            "%.2f" % shared_percentage,
        )
        print('|', ' | '.join(str(cell) for cell in row), '|')
    print()

# Prints the package table for the packages whose name ends in `-all` and
# contains exactly one hyphen (presumably the main library pack of each
# language - verify against the pack naming scheme).
def print_language_dependencies(packages):
    language_packs = []
    for pack in packages:
        pack_name = pack.name
        if pack_name.endswith('-all') and pack_name.count('-') == 1:
            language_packs.append(pack)
    print_package_dependencies(language_packs)

# For each language directory, list the files that are (1) inside the directory and not shared,
# (2) packages from outside the directory, plus identical files.
# The last column is the imported share of all lines as a percentage
# (0 when the language has no lines at all). The table is followed by an empty line.
def list_shared_code_by_language(language_info):
    print('| Language | Non-shared files | Non-shared lines of code | Imported files | Imported lines of code | Shared code % |')
    print('| -------- | ---------------- | ------------------------ | -------------- | ---------------------- | ------------- |')
    for lang, info in language_info.items():
        total = info.imported_lines + info.nonshared_lines
        if total > 0:
            shared_percentage = 100 * info.imported_lines / total
        else:
            shared_percentage = 0
        row = (
            lang,
            info.nonshared_files,
            info.nonshared_lines,
            info.imported_files,
            info.imported_lines,
            "%.2f" % shared_percentage,
        )
        print('|', ' | '.join(str(cell) for cell in row), '|')
    print()


# Output reports
# The report is markdown written to standard output: a title with the
# generation time, followed by four tables.

print('# Report on CodeQL code sharing', end='\n\n')
print('Generated on', datetime.datetime.now())
print()

# Table 1: one row per language
print('## Shared code by language', end='\n\n')
list_shared_code_by_language(language_info)

# Explanation of the columns of table 1, and the heading of table 2
column_legend = '''
* *Non-shared files*: The number of CodeQL files  (`.ql`/`.qll`) that are only used within this language folder. Excludes `identical-files.json` that are shared between multiple languages.
* *Non-shared lines of code*: The number of lines of code in the non-shared files.
* *Imported files*: All CodeQL files (`.ql`/`.qll`) files that are transitively used in this language folder, either via packages or `identical-files.json`
* *Imported lines of code*: The number of lines of code in the imported files
* *Shared code %*: The proportion of imported lines / total lines (nonshared + imported).

## Shared packages use by language

A package is *used* if it is a direct or indirect dependency, or a file shared via `identical-files.json`.

'''
print(column_legend)

# Table 2: one row per shared asset, one column per language
list_assets(shared_assets, language_info)

# Table 3: only the packages selected by print_language_dependencies()
print('## Shared code by language pack', end='\n\n')
print_language_dependencies(packages)

# Table 4: every package
print('## Shared code by package', end='\n\n')
print_package_dependencies(packages)