docs: add "Supported pandas APIs" reference to the documentation (#542)

tswast · web-flow · commit 74c391586280 · 2024-03-29T23:28:16.000Z
Thank you for opening a Pull Request! Before submitting your PR, there are a few things you can do to make sure it goes smoothly: - [ ] Make sure to open an issue as a [bug/issue](https://togithub.com/googleapis/python-bigquery-dataframes/issues/new/choose) before writing your code! That way we can discuss the change, evaluate designs, and agree on the general idea - [ ] Ensure the tests and linter pass - [ ] Code coverage does not decrease (if any source code was changed) - [ ] Appropriate docs were updated (if necessary) 🦕
diff --git a/.kokoro/release-nightly.sh b/.kokoro/release-nightly.sh
@@ -106,6 +106,7 @@ for gcs_path in gs://vertex_sdk_private_releases/bigframe/ \
     # write access to
     COVERAGE_TABLE=bigframes-metrics.coverage_report.bigframes_coverage_nightly
     python3.10 scripts/publish_api_coverage.py \
+      bigquery \
       --bigframes_version=$BIGFRAMES_VERSION \
       --release_version=$RELEASE_VERSION \
       --bigquery_table=$COVERAGE_TABLE
diff --git a/docs/index.rst b/docs/index.rst
@@ -7,6 +7,7 @@ API reference
     :maxdepth: 3
 
     reference/index
+    supported_pandas_apis
 
 Changelog
 ---------
diff --git a/docs/supported_pandas_apis.rst b/docs/supported_pandas_apis.rst
@@ -0,0 +1,62 @@
+Supported pandas APIs
+=====================
+
+The following tables show the pandas APIs that have been implemented in
+BigQuery DataFrames. APIs that are not implemented yet are omitted.
+
+* 'Y' means it implements all parameters.
+* 'P' means it implements only some parameters.
+
+DataFrame
+---------
+
+.. raw:: html
+    :file: supported_pandas_apis/bf_dataframe.html
+
+DataFrameGroupBy
+----------------
+
+.. raw:: html
+    :file: supported_pandas_apis/bf_dataframegroupby.html
+
+Index
+-----
+
+.. raw:: html
+    :file: supported_pandas_apis/bf_index.html
+
+pandas module
+-------------
+
+.. raw:: html
+    :file: supported_pandas_apis/bf_pandas.html
+
+Series
+------
+
+.. raw:: html
+    :file: supported_pandas_apis/bf_series.html
+
+Series.dt methods
+-----------------
+
+.. raw:: html
+    :file: supported_pandas_apis/bf_datetimemethods.html
+
+Series.str methods
+------------------
+
+.. raw:: html
+    :file: supported_pandas_apis/bf_stringmethods.html
+
+SeriesGroupBy
+-------------
+
+.. raw:: html
+    :file: supported_pandas_apis/bf_seriesgroupby.html
+
+Window
+------
+
+.. raw:: html
+    :file: supported_pandas_apis/bf_window.html
diff --git a/docs/supported_pandas_apis/.gitignore b/docs/supported_pandas_apis/.gitignore
@@ -0,0 +1 @@
+*.html
diff --git a/docs/templates/toc.yml b/docs/templates/toc.yml
@@ -72,6 +72,8 @@
       name: Series
     - name: Window
       uid: bigframes.core.window.Window
+    - href: supported_pandas_apis.html
+      name: Supported pandas APIs
     name: bigframes.pandas
   - items:
     - items:
diff --git a/noxfile.py b/noxfile.py
@@ -467,6 +467,12 @@ def docs(session):
     )
 
     shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True)
+
+    session.run(
+        "python",
+        "scripts/publish_api_coverage.py",
+        "docs",
+    )
     session.run(
         "sphinx-build",
         "-W",  # warnings as errors
@@ -503,6 +509,12 @@ def docfx(session):
     )
 
     shutil.rmtree(os.path.join("docs", "_build"), ignore_errors=True)
+
+    session.run(
+        "python",
+        "scripts/publish_api_coverage.py",
+        "docs",
+    )
     session.run(
         "sphinx-build",
         "-T",  # show full traceback on exception
diff --git a/scripts/publish_api_coverage.py b/scripts/publish_api_coverage.py
@@ -17,27 +17,110 @@
 
 import argparse
 import inspect
+import pathlib
+import sys
 
 import pandas as pd
+import pandas.core.groupby
+import pandas.core.indexes.accessors
+import pandas.core.strings.accessor
+import pandas.core.window.rolling
 
+import bigframes
+import bigframes.core.groupby
+import bigframes.core.window
+import bigframes.operations.datetimes
 import bigframes.pandas as bpd
 
+REPO_ROOT = pathlib.Path(__file__).parent.parent
+
+URL_PREFIX = {  # api_prefix -> URL prefix prepended to an API name to build its link
+    "pandas": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.pandas#bigframes_pandas_"
+    ),
+    "dataframe": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.dataframe.DataFrame#bigframes_dataframe_DataFrame_"
+    ),
+    "dataframegroupby": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.groupby.DataFrameGroupBy#bigframes_core_groupby_DataFrameGroupBy_"
+    ),
+    "series": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.series.Series#bigframes_series_Series_"
+    ),
+    "seriesgroupby": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.groupby.SeriesGroupBy#bigframes_core_groupby_SeriesGroupBy_"
+    ),
+    "datetimemethods": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.datetimes.DatetimeMethods#bigframes_operations_datetimes_DatetimeMethods_"
+    ),
+    "stringmethods": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.operations.strings.StringMethods#bigframes_operations_strings_StringMethods_"
+    ),
+    "window": (
+        "https://cloud.google.com/python/docs/reference/bigframes/latest/bigframes.core.window.Window#bigframes_core_window_Window_"
+    ),
+    # TODO: Index not documented.
+}
+
+
+PANDAS_TARGETS = [  # (api_prefix, pandas object, bigframes object) triples
+    ("pandas", pd, bpd),
+    ("dataframe", pd.DataFrame, bpd.DataFrame),
+    (
+        "dataframegroupby",
+        pandas.core.groupby.DataFrameGroupBy,
+        bigframes.core.groupby.DataFrameGroupBy,
+    ),
+    ("series", pd.Series, bpd.Series),
+    (
+        "seriesgroupby",
+        pandas.core.groupby.SeriesGroupBy,
+        bigframes.core.groupby.SeriesGroupBy,
+    ),
+    (
+        "datetimemethods",
+        pandas.core.indexes.accessors.CombinedDatetimelikeProperties,
+        bigframes.operations.datetimes.DatetimeMethods,
+    ),
+    (
+        "stringmethods",
+        pandas.core.strings.accessor.StringMethods,
+        bigframes.operations.strings.StringMethods,
+    ),
+    (
+        "window",
+        pandas.core.window.rolling.Rolling,
+        bigframes.core.window.Window,
+    ),
+    ("index", pd.Index, bpd.Index),
+]
+
+
+def names_from_signature(signature):
+    """Return the parameter names of ``signature`` as a frozenset.
+
+    See: https://docs.python.org/3/library/inspect.html#inspect.signature
+    """
+    return frozenset(signature.parameters)
+
+
+def calculate_missing_parameters(bigframes_function, target_function):
+    implemented = names_from_signature(inspect.signature(bigframes_function))
+    expected = names_from_signature(inspect.signature(target_function))
+    return expected.difference(implemented)  # names only the target accepts
+
 
 def generate_pandas_api_coverage():
     """Inspect all our pandas objects, and compare with the real pandas objects, to see
     which methods we implement. For each, generate a regex that can be used to check if
     its present in a notebook"""
-    header = ["api", "pattern", "kind", "is_in_bigframes"]
+    header = ["api", "pattern", "kind", "is_in_bigframes", "missing_parameters"]
     api_patterns = []
-    targets = [
-        ("pandas", pd, bpd),
-        ("dataframe", pd.DataFrame, bpd.DataFrame),
-        ("series", pd.Series, bpd.Series),
-        ("index", pd.Index, bpd.Index),
-    ]
     indexers = ["loc", "iloc", "iat", "ix", "at"]
-    for name, pandas_obj, bigframes_obj in targets:
+    for name, pandas_obj, bigframes_obj in PANDAS_TARGETS:
         for member in dir(pandas_obj):
+            missing_parameters = ""
+
             # skip private functions and properties
             if member[0] == "_" and member[1] != "_":
                 continue
@@ -50,6 +133,17 @@ def generate_pandas_api_coverage():
                 # Function, match .member(
                 token = f"\\.{member}\\("
                 token_type = "function"
+
+                if hasattr(bigframes_obj, member):
+                    bigframes_function = getattr(bigframes_obj, member)
+                    pandas_function = getattr(pandas_obj, member)
+                    missing_parameters = ", ".join(
+                        sorted(
+                            calculate_missing_parameters(
+                                bigframes_function, pandas_function
+                            )
+                        )
+                    )
             elif member in indexers:
                 # Indexer, match .indexer[
                 token = f"\\.{member}\\["
@@ -62,7 +156,13 @@ def generate_pandas_api_coverage():
             is_in_bigframes = hasattr(bigframes_obj, member)
 
             api_patterns.append(
-                [f"{name}.{member}", token, token_type, is_in_bigframes]
+                [
+                    f"{name}.{member}",
+                    token,
+                    token_type,
+                    is_in_bigframes,
+                    missing_parameters,
+                ]
             )
 
     return pd.DataFrame(api_patterns, columns=header)
@@ -165,14 +265,112 @@ def build_api_coverage_table(bigframes_version: str, release_version: str):
     return combined_df.infer_objects().convert_dtypes()
 
 
+def format_api(api_names, is_in_bigframes, api_prefix):
+    """Render API names as ``<code>`` HTML, linking the implemented ones."""
+    short_names = api_names.str.slice(start=len(f"{api_prefix}."))
+    code_html = "<code>" + short_names + "</code>"
+    base_url = URL_PREFIX.get(api_prefix)
+    if base_url is None:
+        return code_html  # no URL prefix registered for this api_prefix
+    link_html = '<a href="' + base_url + short_names + '">' + code_html + "</a>"
+    return code_html.mask(is_in_bigframes, link_html)
+
+
+def generate_api_coverage(df, api_prefix):
+    """Summarize pandas API coverage for the APIs under ``api_prefix``.
+
+    Returns a DataFrame with "API", "Implemented" (Y, P or N) and
+    "Missing parameters" columns.
+    """
+    apis = df.loc[df["api"].str.startswith(f"{api_prefix}.")]
+    implemented = apis["is_in_bigframes"]
+    missing = apis["missing_parameters"]
+    has_all_parameters = missing.str.len() == 0
+    lacks_parameters = missing.str.len() != 0
+
+    coverage = pd.DataFrame(
+        {
+            "API": format_api(apis["api"], implemented, api_prefix),
+            "Implemented": "",
+            "Missing parameters": missing,
+        }
+    )
+    # Y: in bigframes, nothing missing; P: in bigframes, some missing; N: absent.
+    coverage.loc[has_all_parameters & implemented, "Implemented"] = "Y"
+    coverage.loc[lacks_parameters & implemented, "Implemented"] = "P"
+    coverage.loc[~implemented, "Implemented"] = "N"
+    return coverage
+
+
+def generate_api_coverage_doc(df, api_prefix):
+    """Write the HTML table of implemented ``api_prefix`` APIs for the docs."""
+    coverage = generate_api_coverage(df, api_prefix)
+    # Rows marked "N" (not in bigframes) are left out of the published table.
+    # Copy the filtered rows so the column assignment below does not write to
+    # a slice of ``coverage`` (avoids pandas' SettingWithCopyWarning).
+    implemented = coverage.loc[coverage["Implemented"] != "N"].copy()
+    implemented["Implemented"] = implemented["Implemented"].map(
+        {"Y": "<b>Y</b>", "P": "<i>P</i>"}
+    )
+
+    html_path = REPO_ROOT / "docs" / "supported_pandas_apis" / f"bf_{api_prefix}.html"
+    # Explicit encoding keeps the generated file independent of the locale.
+    with open(html_path, "w", encoding="utf-8") as html_file:
+        implemented.to_html(
+            html_file, index=False, header=True, escape=False, border=0, col_space="8em"
+        )
+
+
+def generate_api_coverage_docs(df):
+    """Write one HTML coverage table per entry in PANDAS_TARGETS."""
+    for api_prefix, *_ in PANDAS_TARGETS:
+        generate_api_coverage_doc(df, api_prefix)
+
+
+def print_api_coverage_summary(df, api_prefix):
+    """Print per-status API counts and the total for ``api_prefix``."""
+    coverage = generate_api_coverage(df, api_prefix)
+    print(api_prefix)
+    print(coverage[["Implemented", "API"]].groupby(["Implemented"]).count())
+    print(f"{api_prefix} APIs: {coverage.shape[0]}\n")
+
+
+def print_api_coverage_summaries(df):
+    """Print a summary per PANDAS_TARGETS entry, then overall Y/P/N totals."""
+    for api_prefix, *_ in PANDAS_TARGETS:
+        print_api_coverage_summary(df, api_prefix)
+
+    print(f"\nAll APIs: {len(df.index)}")
+    implemented = df["is_in_bigframes"]
+    missing_count = df["missing_parameters"].str.len()
+
+    fully_implemented = (missing_count == 0) & implemented
+    print(f"Y: {fully_implemented.sum()}")
+
+    partial_implemented = (missing_count != 0) & implemented
+    print(f"P: {partial_implemented.sum()}")
+
+    print(f"N: {(~implemented).sum()}")
+
+
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--bigframes_version")
-    parser.add_argument("--release_version")
+    parser.add_argument("output_type")
+    parser.add_argument("--bigframes_version", default=bigframes.__version__)
+    parser.add_argument("--release_version", default="")
     parser.add_argument("--bigquery_table_name")
     args = parser.parse_args()
     df = build_api_coverage_table(args.bigframes_version, args.release_version)
-    df.to_gbq(args.bigquery_table_name, if_exists="append")
+
+    if args.output_type == "bigquery":
+        df.to_gbq(args.bigquery_table_name, if_exists="append")
+    elif args.output_type == "docs":
+        generate_api_coverage_docs(df)
+    elif args.output_type == "summary":
+        print_api_coverage_summaries(df)
+    else:
+        print(f"Unexpected output_type {repr(args.output_type)}")
+        sys.exit(1)
 
 
 if __name__ == "__main__":
diff --git a/scripts/test_publish_api_coverage.py b/scripts/test_publish_api_coverage.py
@@ -27,6 +27,7 @@ def test_api_coverage_produces_expected_schema():
                 "string",
                 "boolean",
                 "string",
+                "string",
                 "datetime64[ns]",
                 "string",
                 "string",
@@ -36,6 +37,7 @@ def test_api_coverage_produces_expected_schema():
                 "pattern",
                 "kind",
                 "is_in_bigframes",
+                "missing_parameters",
                 "module",
                 "timestamp",
                 "bigframes_version",