From 14ac0873a292e4952d842139fc4a630397ff0efd Mon Sep 17 00:00:00 2001
From: Iurii Iurchenko <stilferx@Mac.lan>
Date: Wed, 1 Apr 2026 20:11:51 -0600
Subject: [PATCH 1/2] Add FabricNotebookAPITools to samples folder

---
 .../notebook_fabric_api_tools/GetAllTables.py | 163 ++++++++++++++++++
 samples/notebook_fabric_api_tools/README.md   |  47 +++++
 2 files changed, 210 insertions(+)
 create mode 100644 samples/notebook_fabric_api_tools/GetAllTables.py
 create mode 100644 samples/notebook_fabric_api_tools/README.md

diff --git a/samples/notebook_fabric_api_tools/GetAllTables.py b/samples/notebook_fabric_api_tools/GetAllTables.py
new file mode 100644
index 00000000..59757f11
--- /dev/null
+++ b/samples/notebook_fabric_api_tools/GetAllTables.py
@@ -0,0 +1,163 @@
+# Get All Tables Across All Workspaces in All Lakehouses
+# (with pagination on all 3 API calls)
+
+# This snippet creates a Spark temp view listing all discovered tables.
+# It helps answer questions such as:
+# - Which tables have XYZ in the name (e.g. payments, sales, etc.)
+# - How many tables we have
+# - How many duplicated tables we have
+
+import requests
+import pandas as pd
+from notebookutils import mssparkutils
+from pyspark.sql import SparkSession
+
+access_token = mssparkutils.credentials.getToken(
+    "https://api.fabric.microsoft.com"
+)
+
+api_headers = {
+    "Authorization": f"Bearer {access_token}",
+    "Content-Type": "application/json"
+}
+
+API_ROOT = "https://api.fabric.microsoft.com/v1"
+
+# -----------------------------------------------------------
+# Generic paginated GET — works for all Fabric list endpoints
+# -----------------------------------------------------------
+def get_all_pages(url, result_key="value"):
+    """
+    Fetch all pages from a Fabric REST API list endpoint.
+    Uses continuationUri/continuationToken for pagination.
+
+    Args:
+        url:        The initial API URL
+        result_key: "value" for workspaces/lakehouses, "data" for tables
+    Returns:
+        List of all items across all pages
+    """
+    all_items = []
+
+    while url:
+        resp = requests.get(url, headers=api_headers)
+        resp.raise_for_status()
+        body = resp.json()
+
+        items = body.get(result_key, [])
+        all_items.extend(items)
+
+        # Prefer continuationUri (full URL), fall back to token
+        next_url = body.get("continuationUri")
+        if next_url:
+            url = next_url
+        elif body.get("continuationToken"):
+            separator = "&" if "?" in url else "?"
+            base_url = url.split("?")[0] if "?" in url else url
+            url = f"{base_url}{separator}continuationToken={body['continuationToken']}"
+        else:
+            url = None
+
+    return all_items
+
+
+# -----------------------------------------------------------
+# 1. Get ALL workspaces (paginated)
+# -----------------------------------------------------------
+workspace_list = get_all_pages(f"{API_ROOT}/workspaces", result_key="value")
+
+print(f"Discovered {len(workspace_list)} workspaces\n")
+
+inventory_records = []
+schema_enabled_lakehouses = []
+
+# -----------------------------------------------------------
+# 2. Loop through workspaces → lakehouses → tables
+# -----------------------------------------------------------
+for workspace in workspace_list:
+    workspace_id = workspace["id"]
+    workspace_name = workspace["displayName"]
+
+    print(f"Processing workspace: {workspace_name}")
+
+    workspace_failed = False
+
+    try:
+        # Get ALL lakehouses in this workspace (paginated)
+        lakehouse_list = get_all_pages(
+            f"{API_ROOT}/workspaces/{workspace_id}/lakehouses",
+            result_key="value"
+        )
+
+        for lakehouse in lakehouse_list:
+            lakehouse_id = lakehouse["id"]
+            lakehouse_name = lakehouse["displayName"]
+
+            # Check if lakehouse is schema-enabled
+            properties = lakehouse.get("properties", {})
+            if (
+                properties.get("defaultSchema") is not None
+                or properties.get("enableSchemas", False)
+            ):
+                schema_enabled_lakehouses.append({
+                    "workspace_name": workspace_name,
+                    "lakehouse_name": lakehouse_name,
+                })
+                print(f"  ⚠ Skipped '{lakehouse_name}' — schema-enabled lakehouse (REST API not supported)")
+                continue
+
+            # Get ALL tables in this lakehouse (paginated)
+            table_list = get_all_pages(
+                f"{API_ROOT}/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/tables",
+                result_key="data"
+            )
+
+            for table in table_list:
+                inventory_records.append({
+                    "workspace_name": workspace_name,
+                    "lakehouse_name": lakehouse_name,
+                    "table_name": table.get("name"),
+                    "table_type": table.get("type"),
+                    "location": table.get("location"),
+                    "format": table.get("format")
+                })
+
+        print(f"  ✓ Workspace processed successfully\n")
+
+    except Exception as ex:
+        workspace_failed = True
+        print(f"  ✗ Workspace failed: {workspace_name}")
+        print(f"    Error: {str(ex)}\n")
+
+if inventory_records:
+    pandas_df = pd.DataFrame(inventory_records)
+    spark_df = spark.createDataFrame(pandas_df)
+    spark_df.createOrReplaceTempView("fabric_lakehouse_inventory")
+
+    print("Temp view created: fabric_lakehouse_inventory")
+    print(f"Total tables indexed: {len(inventory_records)}")
+else:
+    print("No lakehouses or tables found.")
+
+if schema_enabled_lakehouses:
+    print(f"\n⚠ Skipped {len(schema_enabled_lakehouses)} schema-enabled lakehouse(s):")
+    for lh in schema_enabled_lakehouses:
+        print(f"  - {lh['workspace_name']} → {lh['lakehouse_name']}")
+    print("  (The REST API /tables endpoint does not support schema-enabled lakehouses yet)")
+
+query_text = """SELECT
+    workspace_name,
+    lakehouse_name,
+    table_name,
+    table_type,
+    location,
+    format
+FROM fabric_lakehouse_inventory
+"""
+
+display(spark.sql(query_text))
+
+print("Use that query to get data:")
+print("****************")
+print("%%sql")
+print(query_text)
\ No newline at end of file
diff --git a/samples/notebook_fabric_api_tools/README.md b/samples/notebook_fabric_api_tools/README.md
new file mode 100644
index 00000000..3cc4d1b3
--- /dev/null
+++ b/samples/notebook_fabric_api_tools/README.md
@@ -0,0 +1,47 @@
+# FabricNotebookAPITools
+
+A collection of PySpark scripts designed to run inside **Microsoft Fabric Notebooks**. These tools use the Fabric and Power BI REST APIs to inventory, search, and audit resources across all workspaces in your Fabric tenant.
+
+All tools authenticate via `mssparkutils` (available natively in Fabric notebooks) and produce **Spark temporary views** that can be queried with SQL immediately after running.
+
+---
+
+## Tools
+
+### [GetAllTables.py](GetAllTables.py)
+
+**Purpose:** Builds a complete inventory of all tables across every workspace and lakehouse in the tenant.
+
+**What it does:**
+- Iterates over all workspaces, then all lakehouses within each workspace
+- Collects table metadata: name, type, storage location, and format
+- Registers the result as a Spark temporary view for SQL querying
+
+**Output view:** `fabric_lakehouse_inventory`
+
+| Column | Description |
+|---|---|
+| `workspace_name` | Name of the workspace |
+| `lakehouse_name` | Name of the lakehouse |
+| `table_name` | Name of the table |
+| `table_type` | Table type (e.g., Managed, External) |
+| `location` | Storage path of the table |
+| `format` | File format (e.g., delta, parquet) |
+
+**Limitations:**
+- **Schema-enabled lakehouses are skipped.** The Fabric REST API `/tables` endpoint does not support lakehouses with schemas enabled (`defaultSchema` or `enableSchemas`). These lakehouses are detected automatically and excluded from the inventory. The script prints a summary of skipped lakehouses at the end of the run, so you can identify gaps in coverage.
+
+**Use cases:**
+- Find all tables matching a name pattern across the entire tenant
+- Count total tables per lakehouse or workspace
+- Identify duplicate or redundant tables
+- Generate a full table inventory report for governance
+
+**Example query:**
+```sql
+SELECT * FROM fabric_lakehouse_inventory
+WHERE table_name LIKE '%payments%'
+ORDER BY workspace_name, lakehouse_name
+```
+
+---

From 1942215ef35656062ae4a859454cff286e7d1f14 Mon Sep 17 00:00:00 2001
From: Iurii Iurchenko <4iurchenko@gmail.com>
Date: Sun, 19 Apr 2026 11:37:29 -0600
Subject: [PATCH 2/2] Address Copilot review feedback: timeouts, pagination,
 cleanups

---
 .../notebook_fabric_api_tools/GetAllTables.py | 89 +++++++++++--------
 1 file changed, 54 insertions(+), 35 deletions(-)

diff --git a/samples/notebook_fabric_api_tools/GetAllTables.py b/samples/notebook_fabric_api_tools/GetAllTables.py
index 59757f11..6eef6e22 100644
--- a/samples/notebook_fabric_api_tools/GetAllTables.py
+++ b/samples/notebook_fabric_api_tools/GetAllTables.py
@@ -8,9 +8,10 @@
 # - How many duplicated tables we have
 
 import requests
-import pandas as pd
 from notebookutils import mssparkutils
-from pyspark.sql import SparkSession
+from urllib.parse import urlparse, urlencode, parse_qsl, urlunparse
+from pyspark.sql.types import StructType, StructField, StringType
+
 
 access_token = mssparkutils.credentials.getToken(
     "https://api.fabric.microsoft.com"
@@ -26,38 +27,41 @@
 # -----------------------------------------------------------
 # Generic paginated GET — works for all Fabric list endpoints
 # -----------------------------------------------------------
-def get_all_pages(url, result_key="value"):
+def _get_with_retry(url, headers, timeout=30, retries=3, backoff=1.0):
+    """GET url, retrying 429/5xx up to `retries` times with backoff; raises HTTPError on failure."""
+    import time  # local import: the module has no top-level `import time`
+    for attempt in range(retries + 1):
+        resp = requests.get(url, headers=headers, timeout=timeout)
+        if attempt == retries or (resp.status_code < 500 and resp.status_code != 429):
+            resp.raise_for_status()  # non-retryable 4xx, or retries exhausted
+            return resp
+        hint = resp.headers.get("Retry-After", "")  # may be seconds or an HTTP-date
+        time.sleep(float(hint) if hint.replace(".", "", 1).isdigit() else backoff * 2 ** attempt)
+
+def get_all_pages(url, result_key=("value", "data")):
     """
-    Fetch all pages from a Fabric REST API list endpoint.
-    Uses continuationUri/continuationToken for pagination.
-
-    Args:
-        url:        The initial API URL
-        result_key: "value" for workspaces/lakehouses, "data" for tables
-    Returns:
-        List of all items across all pages
+    Fetch every page of a list endpoint, following continuationUri/continuationToken.
+    result_key: a string or tuple of keys tried in order; the first key present in the body is used.
     """
-    all_items = []
+    if isinstance(result_key, str):  # normalise a single key to a 1-tuple
+        result_key = (result_key,)
 
+    all_items = []
     while url:
-        resp = requests.get(url, headers=api_headers)
-        resp.raise_for_status()
-        body = resp.json()
+        body = _get_with_retry(url, api_headers).json()
 
-        items = body.get(result_key, [])
+        items = next((body[k] for k in result_key if k in body), [])  # no matching key -> no items
         all_items.extend(items)
 
-        # Prefer continuationUri (full URL), fall back to token
-        next_url = body.get("continuationUri")
-        if next_url:
-            url = next_url
+        if body.get("continuationUri"):  # checked before the token branch
+            url = body["continuationUri"]
         elif body.get("continuationToken"):
-            separator = "&" if "?" in url else "?"
-            base_url = url.split("?")[0] if "?" in url else url
-            url = f"{base_url}{separator}continuationToken={body['continuationToken']}"
+            parsed = urlparse(url)
+            params = dict(parse_qsl(parsed.query))
+            params["continuationToken"] = body["continuationToken"]  # overwrites any previous token
+            url = urlunparse(parsed._replace(query=urlencode(params)))
         else:
             url = None
-
     return all_items
 
 
@@ -70,6 +74,7 @@ def get_all_pages(url, result_key="value"):
 
 inventory_records = []
 schema_enabled_lakehouses = []
+failed_workspaces = []
 
 # -----------------------------------------------------------
 # 2. Loop through workspaces → lakehouses → tables
@@ -80,8 +85,6 @@ def get_all_pages(url, result_key="value"):
 
     print(f"Processing workspace: {workspace_name}")
 
-    workspace_failed = False
-
     try:
         # Get ALL lakehouses in this workspace (paginated)
         lakehouse_list = get_all_pages(
@@ -125,17 +128,27 @@ def get_all_pages(url, result_key="value"):
         print(f"  ✓ Workspace processed successfully\n")
 
     except Exception as ex:
-        workspace_failed = True
+        failed_workspaces.append({
+            "workspace_name": workspace_name,
+            "error": str(ex),
+        })
         print(f"  ✗ Workspace failed: {workspace_name}")
-        print(f"    Error: {str(ex)}\n")
+        print(f"    Error: {ex}\n")
+
+inventory_schema = StructType([
+    StructField("workspace_name", StringType(), True),
+    StructField("lakehouse_name", StringType(), True),
+    StructField("table_name",     StringType(), True),
+    StructField("table_type",     StringType(), True),
+    StructField("location",       StringType(), True),
+    StructField("format",         StringType(), True),
+])
 
 if inventory_records:
-    pandas_df = pd.DataFrame(inventory_records)
-    spark_df = spark.createDataFrame(pandas_df)
+    spark_df = spark.createDataFrame(inventory_records, schema=inventory_schema)
     spark_df.createOrReplaceTempView("fabric_lakehouse_inventory")
-
-    print("Temp view created: fabric_lakehouse_inventory")
-    print(f"Total tables indexed: {len(inventory_records)}")
+    print("A view fabric_lakehouse_inventory created")
+    print(f"Total tables indexed: {spark_df.count()}")
 else:
     print("No lakehouses or tables found.")
 
@@ -145,6 +158,11 @@ def get_all_pages(url, result_key="value"):
         print(f"  - {lh['workspace_name']} → {lh['lakehouse_name']}")
     print("  (The REST API /tables endpoint does not support schema-enabled lakehouses yet)")
 
+if failed_workspaces:
+    print(f"\n✗ Failed to process {len(failed_workspaces)} workspace(s):")
+    for fw in failed_workspaces:
+        print(f"  - {fw['workspace_name']}: {fw['error']}")
+
 query_text = """SELECT
     workspace_name,
     lakehouse_name,
@@ -155,9 +173,10 @@ def get_all_pages(url, result_key="value"):
 FROM fabric_lakehouse_inventory
 """
 
-display(spark.sql(query_text))
+if inventory_records:  # the temp view is only created when tables were indexed
+    display(spark.sql(query_text))
 
 print("Use that query to get data:")
 print("****************")
 print("%%sql")
-print(query_text)
\ No newline at end of file
+print(query_text)