From 14ac0873a292e4952d842139fc4a630397ff0efd Mon Sep 17 00:00:00 2001 From: Iurii Iurchenko Date: Wed, 1 Apr 2026 20:11:51 -0600 Subject: [PATCH 1/2] Add FabricNotebookAPITools to samples folder --- .../notebook_fabric_api_tools/GetAllTables.py | 163 ++++++++++++++++++ samples/notebook_fabric_api_tools/README.md | 47 +++++ 2 files changed, 210 insertions(+) create mode 100644 samples/notebook_fabric_api_tools/GetAllTables.py create mode 100644 samples/notebook_fabric_api_tools/README.md diff --git a/samples/notebook_fabric_api_tools/GetAllTables.py b/samples/notebook_fabric_api_tools/GetAllTables.py new file mode 100644 index 00000000..59757f11 --- /dev/null +++ b/samples/notebook_fabric_api_tools/GetAllTables.py @@ -0,0 +1,163 @@ +# Get All Tables Across All Workspaces in All Lakehouses +# (with pagination on all 3 API calls) + +# This snippet returns a Spark view with all tables +# It helps solve challenges: +# - Which table has XYZ in the name (e.g. payments, sales, etc) +# - How many tables we have +# - How many duplicated tables we have + +import requests +import pandas as pd +from notebookutils import mssparkutils +from pyspark.sql import SparkSession + +access_token = mssparkutils.credentials.getToken( + "https://api.fabric.microsoft.com" +) + +api_headers = { + "Authorization": f"Bearer {access_token}", + "Content-Type": "application/json" +} + +API_ROOT = "https://api.fabric.microsoft.com/v1" + +# ----------------------------------------------------------- +# Generic paginated GET — works for all Fabric list endpoints +# ----------------------------------------------------------- +def get_all_pages(url, result_key="value"): + """ + Fetch all pages from a Fabric REST API list endpoint. + Uses continuationUri/continuationToken for pagination. 
+ + Args: + url: The initial API URL + result_key: "value" for workspaces/lakehouses, "data" for tables + Returns: + List of all items across all pages + """ + all_items = [] + + while url: + resp = requests.get(url, headers=api_headers) + resp.raise_for_status() + body = resp.json() + + items = body.get(result_key, []) + all_items.extend(items) + + # Prefer continuationUri (full URL), fall back to token + next_url = body.get("continuationUri") + if next_url: + url = next_url + elif body.get("continuationToken"): + separator = "&" if "?" in url else "?" + base_url = url.split("?")[0] if "?" in url else url + url = f"{base_url}{separator}continuationToken={body['continuationToken']}" + else: + url = None + + return all_items + + +# ----------------------------------------------------------- +# 1. Get ALL workspaces (paginated) +# ----------------------------------------------------------- +workspace_list = get_all_pages(f"{API_ROOT}/workspaces", result_key="value") + +print(f"Discovered {len(workspace_list)} workspaces\n") + +inventory_records = [] +schema_enabled_lakehouses = [] + +# ----------------------------------------------------------- +# 2. 
Loop through workspaces → lakehouses → tables +# ----------------------------------------------------------- +for workspace in workspace_list: + workspace_id = workspace["id"] + workspace_name = workspace["displayName"] + + print(f"Processing workspace: {workspace_name}") + + workspace_failed = False + + try: + # Get ALL lakehouses in this workspace (paginated) + lakehouse_list = get_all_pages( + f"{API_ROOT}/workspaces/{workspace_id}/lakehouses", + result_key="value" + ) + + for lakehouse in lakehouse_list: + lakehouse_id = lakehouse["id"] + lakehouse_name = lakehouse["displayName"] + + # Check if lakehouse is schema-enabled + properties = lakehouse.get("properties", {}) + if ( + properties.get("defaultSchema") is not None + or properties.get("enableSchemas", False) + ): + schema_enabled_lakehouses.append({ + "workspace_name": workspace_name, + "lakehouse_name": lakehouse_name, + }) + print(f" ⚠ Skipped '{lakehouse_name}' — schema-enabled lakehouse (REST API not supported)") + continue + + # Get ALL tables in this lakehouse (paginated) + table_list = get_all_pages( + f"{API_ROOT}/workspaces/{workspace_id}/lakehouses/{lakehouse_id}/tables", + result_key="data" + ) + + for table in table_list: + inventory_records.append({ + "workspace_name": workspace_name, + "lakehouse_name": lakehouse_name, + "table_name": table.get("name"), + "table_type": table.get("type"), + "location": table.get("location"), + "format": table.get("format") + }) + + print(f" ✓ Workspace processed successfully\n") + + except Exception as ex: + workspace_failed = True + print(f" ✗ Workspace failed: {workspace_name}") + print(f" Error: {str(ex)}\n") + +if inventory_records: + pandas_df = pd.DataFrame(inventory_records) + spark_df = spark.createDataFrame(pandas_df) + spark_df.createOrReplaceTempView("fabric_lakehouse_inventory") + + print("Temp view created: fabric_lakehouse_inventory") + print(f"Total tables indexed: {len(inventory_records)}") +else: + print("No lakehouses or tables found.") + +if 
schema_enabled_lakehouses: + print(f"\n⚠ Skipped {len(schema_enabled_lakehouses)} schema-enabled lakehouse(s):") + for lh in schema_enabled_lakehouses: + print(f" - {lh['workspace_name']} → {lh['lakehouse_name']}") + print(" (The REST API /tables endpoint does not support schema-enabled lakehouses yet)") + +query_text = """SELECT + workspace_name, + lakehouse_name, + table_name, + table_type, + location, + format +FROM fabric_lakehouse_inventory +""" + +display(spark.sql(query_text)) + +print("Use that query to get data:") +print("****************") +print("%%sql") +print(query_text) \ No newline at end of file diff --git a/samples/notebook_fabric_api_tools/README.md b/samples/notebook_fabric_api_tools/README.md new file mode 100644 index 00000000..3cc4d1b3 --- /dev/null +++ b/samples/notebook_fabric_api_tools/README.md @@ -0,0 +1,47 @@ +# FabricNotebookAPITools + +A collection of PySpark scripts designed to run inside **Microsoft Fabric Notebooks**. These tools use the Fabric and Power BI REST APIs to inventory, search, and audit resources across all workspaces in your Fabric tenant. + +All tools authenticate via `mssparkutils` (available natively in Fabric notebooks) and produce **Spark temporary views** that can be queried with SQL immediately after running. + +--- + +## Tools + +### [GetAllTables.py](GetAllTables.py) + +**Purpose:** Builds a complete inventory of all tables across every workspace and lakehouse in the tenant. 
+ +**What it does:** +- Iterates over all workspaces, then all lakehouses within each workspace +- Collects table metadata: name, type, storage location, and format +- Registers the result as a Spark temporary view for SQL querying + +**Output view:** `fabric_lakehouse_inventory` + +| Column | Description | +|---|---| +| `workspace_name` | Name of the workspace | +| `lakehouse_name` | Name of the lakehouse | +| `table_name` | Name of the table | +| `table_type` | Table type (e.g., Managed, External) | +| `location` | Storage path of the table | +| `format` | File format (e.g., delta, parquet) | + +**Limitations:** +- **Schema-enabled lakehouses are skipped.** The Fabric REST API `/tables` endpoint does not support lakehouses with schemas enabled (`defaultSchema` or `enableSchemas`). These lakehouses are detected automatically and excluded from the inventory. The script prints a summary of skipped lakehouses at the end of the run, so you can identify gaps in coverage. + +**Use cases:** +- Find all tables matching a name pattern across the entire tenant +- Count total tables per lakehouse or workspace +- Identify duplicate or redundant tables +- Generate a full table inventory report for governance + +**Example query:** +```sql +SELECT * FROM fabric_lakehouse_inventory +WHERE table_name LIKE '%payments%' +ORDER BY workspace_name, lakehouse_name +``` + +--- From 1942215ef35656062ae4a859454cff286e7d1f14 Mon Sep 17 00:00:00 2001 From: Iurii Iurchenko <4iurchenko@gmail.com> Date: Sun, 19 Apr 2026 11:37:29 -0600 Subject: [PATCH 2/2] Address Copilot review feedback: timeouts, pagination, cleanups --- .../notebook_fabric_api_tools/GetAllTables.py | 89 +++++++++++-------- 1 file changed, 54 insertions(+), 35 deletions(-) diff --git a/samples/notebook_fabric_api_tools/GetAllTables.py b/samples/notebook_fabric_api_tools/GetAllTables.py index 59757f11..6eef6e22 100644 --- a/samples/notebook_fabric_api_tools/GetAllTables.py +++ b/samples/notebook_fabric_api_tools/GetAllTables.py 
@@ -8,9 +8,10 @@
 # - How many duplicated tables we have
 
 import requests
-import pandas as pd
 from notebookutils import mssparkutils
-from pyspark.sql import SparkSession
+import time  # required by _get_with_retry (time.sleep between attempts)
+from urllib.parse import urlparse, urlencode, parse_qsl, urlunparse
+from pyspark.sql.types import StructType, StructField, StringType
 
 access_token = mssparkutils.credentials.getToken(
     "https://api.fabric.microsoft.com"
@@ -26,38 +27,41 @@
 # -----------------------------------------------------------
 # Generic paginated GET — works for all Fabric list endpoints
 # -----------------------------------------------------------
-def get_all_pages(url, result_key="value"):
+def _get_with_retry(url, headers, timeout=30, retries=3, backoff=1.0):
+    for attempt in range(retries + 1):
+        resp = requests.get(url, headers=headers, timeout=timeout)
+        if resp.status_code < 500 and resp.status_code != 429:
+            resp.raise_for_status()
+            return resp
+        if attempt == retries:  # out of attempts: raise the last 5xx/429
+            resp.raise_for_status()
+        wait = float(resp.headers.get("Retry-After", backoff * (2 ** attempt)))
+        time.sleep(wait)  # Retry-After if sent, else exponential back-off
+
+def get_all_pages(url, result_key=("value", "data")):
     """
-    Fetch all pages from a Fabric REST API list endpoint.
-    Uses continuationUri/continuationToken for pagination.
-
-    Args:
-        url: The initial API URL
-        result_key: "value" for workspaces/lakehouses, "data" for tables
-    Returns:
-        List of all items across all pages
+    result_key: a string, or a tuple of keys tried in order.
+        The first key present in the response body is used.
     """
-    all_items = []
+    if isinstance(result_key, str):
+        result_key = (result_key,)
 
+    all_items = []
     while url:
-        resp = requests.get(url, headers=api_headers)
-        resp.raise_for_status()
-        body = resp.json()
+        body = _get_with_retry(url, api_headers).json()
 
-        items = body.get(result_key, [])
+        items = next((body[k] for k in result_key if k in body), [])
         all_items.extend(items)
 
-        # Prefer continuationUri (full URL), fall back to token
-        next_url = body.get("continuationUri")
-        if next_url:
-            url = next_url
+        if body.get("continuationUri"):
+            url = body["continuationUri"]
         elif body.get("continuationToken"):
-            separator = "&" if "?" in url else "?"
-            base_url = url.split("?")[0] if "?" in url else url
-            url = f"{base_url}{separator}continuationToken={body['continuationToken']}"
+            parsed = urlparse(url)
+            params = dict(parse_qsl(parsed.query))
+            params["continuationToken"] = body["continuationToken"]
+            url = urlunparse(parsed._replace(query=urlencode(params)))
         else:
             url = None
-
     return all_items
 
 
@@ -70,6 +74,7 @@ def get_all_pages(url, result_key="value"):
 
 inventory_records = []
 schema_enabled_lakehouses = []
+failed_workspaces = []
 
 # -----------------------------------------------------------
 # 2. 
Loop through workspaces → lakehouses → tables
@@ -80,8 +85,6 @@ def get_all_pages(url, result_key="value"):
 
     print(f"Processing workspace: {workspace_name}")
 
-    workspace_failed = False
-
     try:
         # Get ALL lakehouses in this workspace (paginated)
         lakehouse_list = get_all_pages(
@@ -125,17 +128,27 @@ def get_all_pages(url, result_key="value"):
         print(f" ✓ Workspace processed successfully\n")
 
     except Exception as ex:
-        workspace_failed = True
+        failed_workspaces.append({
+            "workspace_name": workspace_name,
+            "error": str(ex),
+        })
         print(f" ✗ Workspace failed: {workspace_name}")
-        print(f" Error: {str(ex)}\n")
+        print(f" Error: {ex}\n")
+
+inventory_schema = StructType([
+    StructField("workspace_name", StringType(), True),
+    StructField("lakehouse_name", StringType(), True),
+    StructField("table_name", StringType(), True),
+    StructField("table_type", StringType(), True),
+    StructField("location", StringType(), True),
+    StructField("format", StringType(), True),
+])
 
 if inventory_records:
-    pandas_df = pd.DataFrame(inventory_records)
-    spark_df = spark.createDataFrame(pandas_df)
+    spark_df = spark.createDataFrame(inventory_records, schema=inventory_schema)
     spark_df.createOrReplaceTempView("fabric_lakehouse_inventory")
-
-    print("Temp view created: fabric_lakehouse_inventory")
+    print("A view fabric_lakehouse_inventory created")
     print(f"Total tables indexed: {len(inventory_records)}")
 else:
     print("No lakehouses or tables found.")
 
@@ -145,6 +158,11 @@ def get_all_pages(url, result_key="value"):
         print(f" - {lh['workspace_name']} → {lh['lakehouse_name']}")
     print(" (The REST API /tables endpoint does not support schema-enabled lakehouses yet)")
 
+if failed_workspaces:
+    print(f"\n✗ Failed to process {len(failed_workspaces)} workspace(s):")
+    for fw in failed_workspaces:
+        print(f" - {fw['workspace_name']}: {fw['error']}")
+
 query_text = """SELECT
     workspace_name,
     lakehouse_name,
@@ -155,9 +173,10 @@ def get_all_pages(url, 
result_key="value"):
 FROM fabric_lakehouse_inventory
 """
 
-display(spark.sql(query_text))
+if inventory_records:  # the temp view exists only when tables were indexed
+    display(spark.sql(query_text))
 
 print("Use that query to get data:")
 print("****************")
 print("%%sql")
-print(query_text)
\ No newline at end of file
+print(query_text)