diff --git a/.claude/skills/dataverse-sdk-use/SKILL.md b/.claude/skills/dataverse-sdk-use/SKILL.md index f351c176..9edb733f 100644 --- a/.claude/skills/dataverse-sdk-use/SKILL.md +++ b/.claude/skills/dataverse-sdk-use/SKILL.md @@ -30,6 +30,9 @@ The SDK supports Dataverse's native bulk operations: Pass lists to `create()`, ` - Control page size with `page_size` parameter - Use `top` parameter to limit total records returned +### DataFrame Support +- DataFrame operations are accessed via the `client.dataframe` namespace: `client.dataframe.get()`, `client.dataframe.create()`, `client.dataframe.update()`, `client.dataframe.delete()` + ## Common Operations ### Import @@ -129,7 +132,7 @@ client.records.update("account", [id1, id2, id3], {"industry": "Technology"}) ``` #### Upsert Records -Creates or updates records identified by alternate keys. Single item → PATCH; multiple items → `UpsertMultiple` bulk action. +Creates or updates records identified by alternate keys. Single item -> PATCH; multiple items -> `UpsertMultiple` bulk action. > **Prerequisite**: The table must have an alternate key configured in Dataverse for the columns used in `alternate_key`. Without it, Dataverse will reject the request with a 400 error. ```python from PowerPlatform.Dataverse.models.upsert import UpsertItem @@ -171,6 +174,42 @@ client.records.delete("account", account_id) client.records.delete("account", [id1, id2, id3], use_bulk_delete=True) ``` +### DataFrame Operations + +The SDK provides DataFrame wrappers for all CRUD operations via the `client.dataframe` namespace, using pandas DataFrames and Series as input/output. 
+ +```python +import pandas as pd + +# Query records -- returns a single DataFrame +df = client.dataframe.get("account", filter="statecode eq 0", select=["name"]) +print(f"Got {len(df)} rows") + +# Limit results with top for large tables +df = client.dataframe.get("account", select=["name"], top=100) + +# Fetch single record as one-row DataFrame +df = client.dataframe.get("account", record_id=account_id, select=["name"]) + +# Create records from a DataFrame (returns a Series of GUIDs) +new_accounts = pd.DataFrame([ + {"name": "Contoso", "telephone1": "555-0100"}, + {"name": "Fabrikam", "telephone1": "555-0200"}, +]) +new_accounts["accountid"] = client.dataframe.create("account", new_accounts) + +# Update records from a DataFrame (id_column identifies the GUID column) +new_accounts["telephone1"] = ["555-0199", "555-0299"] +client.dataframe.update("account", new_accounts, id_column="accountid") + +# Clear a field by setting clear_nulls=True (by default, NaN/None fields are skipped) +df = pd.DataFrame([{"accountid": "guid-1", "websiteurl": None}]) +client.dataframe.update("account", df, id_column="accountid", clear_nulls=True) + +# Delete records by passing a Series of GUIDs +client.dataframe.delete("account", new_accounts["accountid"]) +``` + ### SQL Queries SQL queries are **read-only** and support limited SQL syntax. A single SELECT statement with optional WHERE, TOP (integer literal), ORDER BY (column names only), and a simple table alias after FROM is supported. But JOIN and subqueries may not be. Refer to the Dataverse documentation for the current feature set. 
diff --git a/.gitignore b/.gitignore index 2c664173..aa762db2 100644 --- a/.gitignore +++ b/.gitignore @@ -25,3 +25,4 @@ Thumbs.db # Claude local settings .claude/*.local.json +.claude/*.local.md diff --git a/README.md b/README.md index aaa8984a..1426f96a 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ A Python client library for Microsoft Dataverse that provides a unified interfac - [Basic CRUD operations](#basic-crud-operations) - [Bulk operations](#bulk-operations) - [Upsert operations](#upsert-operations) + - [DataFrame operations](#dataframe-operations) - [Query data](#query-data) - [Table management](#table-management) - [Relationship management](#relationship-management) @@ -39,6 +40,7 @@ A Python client library for Microsoft Dataverse that provides a unified interfac - **📊 SQL Queries**: Execute read-only SQL queries via the Dataverse Web API `?sql=` parameter - **🏗️ Table Management**: Create, inspect, and delete custom tables and columns programmatically - **🔗 Relationship Management**: Create one-to-many and many-to-many relationships between tables with full metadata control +- **🐼 DataFrame Support**: Pandas wrappers for all CRUD operations, returning DataFrames and Series - **📎 File Operations**: Upload files to Dataverse file columns with automatic chunking for large files - **🔐 Azure Identity**: Built-in authentication using Azure Identity credential providers with comprehensive support - **🛡️ Error Handling**: Structured exception hierarchy with detailed error context and retry guidance @@ -232,6 +234,42 @@ client.records.upsert("account", [ ]) ``` +### DataFrame operations + +The SDK provides pandas wrappers for all CRUD operations via the `client.dataframe` namespace, using DataFrames and Series for input and output. 
+ +```python +import pandas as pd + +# Query records as a single DataFrame +df = client.dataframe.get("account", filter="statecode eq 0", select=["name", "telephone1"]) +print(f"Found {len(df)} accounts") + +# Limit results with top for large tables +df = client.dataframe.get("account", select=["name"], top=100) + +# Fetch a single record as a one-row DataFrame +df = client.dataframe.get("account", record_id=account_id, select=["name"]) + +# Create records from a DataFrame (returns a Series of GUIDs) +new_accounts = pd.DataFrame([ + {"name": "Contoso", "telephone1": "555-0100"}, + {"name": "Fabrikam", "telephone1": "555-0200"}, +]) +new_accounts["accountid"] = client.dataframe.create("account", new_accounts) + +# Update records from a DataFrame (id_column identifies the GUID column) +new_accounts["telephone1"] = ["555-0199", "555-0299"] +client.dataframe.update("account", new_accounts, id_column="accountid") + +# Clear a field by setting clear_nulls=True (by default, NaN/None fields are skipped) +df = pd.DataFrame([{"accountid": new_accounts["accountid"].iloc[0], "websiteurl": None}]) +client.dataframe.update("account", df, id_column="accountid", clear_nulls=True) + +# Delete records by passing a Series of GUIDs +client.dataframe.delete("account", new_accounts["accountid"]) +``` + ### Query data ```python diff --git a/examples/advanced/dataframe_operations.py b/examples/advanced/dataframe_operations.py new file mode 100644 index 00000000..7c0b6010 --- /dev/null +++ b/examples/advanced/dataframe_operations.py @@ -0,0 +1,174 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +""" +PowerPlatform Dataverse Client - DataFrame Operations Walkthrough + +This example demonstrates how to use the pandas DataFrame extension methods +for CRUD operations with Microsoft Dataverse. 
+ +Prerequisites: + pip install PowerPlatform-Dataverse-Client + pip install azure-identity +""" + +import sys +import uuid + +import pandas as pd +from azure.identity import InteractiveBrowserCredential + +from PowerPlatform.Dataverse.client import DataverseClient + + +def main(): + # -- Setup & Authentication ------------------------------------ + base_url = input("Enter Dataverse org URL (e.g. https://yourorg.crm.dynamics.com): ").strip() + if not base_url: + print("[ERR] No URL entered; exiting.") + sys.exit(1) + base_url = base_url.rstrip("/") + + print("[INFO] Authenticating via browser...") + credential = InteractiveBrowserCredential() + + with DataverseClient(base_url, credential) as client: + _run_walkthrough(client) + + +def _run_walkthrough(client): + table = input("Enter table schema name to use [default: account]: ").strip() or "account" + print(f"[INFO] Using table: {table}") + + # Unique tag to isolate test records from existing data + tag = uuid.uuid4().hex[:8] + test_filter = f"contains(name,'{tag}')" + print(f"[INFO] Using tag '{tag}' to identify test records") + + select_cols = ["name", "telephone1", "websiteurl", "lastonholdtime"] + + # -- 1. Create records from a DataFrame ------------------------ + print("\n" + "-" * 60) + print("1. 
Create records from a DataFrame") + print("-" * 60) + + new_accounts = pd.DataFrame( + [ + { + "name": f"Contoso_{tag}", + "telephone1": "555-0100", + "websiteurl": "https://contoso.com", + "lastonholdtime": pd.Timestamp("2024-06-15 10:30:00"), + }, + {"name": f"Fabrikam_{tag}", "telephone1": "555-0200", "websiteurl": None, "lastonholdtime": None}, + { + "name": f"Northwind_{tag}", + "telephone1": None, + "websiteurl": "https://northwind.com", + "lastonholdtime": pd.Timestamp("2024-12-01 08:00:00"), + }, + ] + ) + print(f" Input DataFrame:\n{new_accounts.to_string(index=False)}\n") + + # create_dataframe returns a Series of GUIDs aligned with the input rows + new_accounts["accountid"] = client.dataframe.create(table, new_accounts) + print(f"[OK] Created {len(new_accounts)} records") + print(f" IDs: {new_accounts['accountid'].tolist()}") + + # -- 2. Query records as a DataFrame ------------------------- + print("\n" + "-" * 60) + print("2. Query records as a DataFrame") + print("-" * 60) + + df_all = client.dataframe.get(table, select=select_cols, filter=test_filter) + print(f"[OK] Got {len(df_all)} records in one DataFrame") + print(f" Columns: {list(df_all.columns)}") + print(f"{df_all.to_string(index=False)}") + + # -- 3. Limit results with top ------------------------------ + print("\n" + "-" * 60) + print("3. Limit results with top") + print("-" * 60) + + df_top2 = client.dataframe.get(table, select=select_cols, filter=test_filter, top=2) + print(f"[OK] Got {len(df_top2)} records with top=2") + print(f"{df_top2.to_string(index=False)}") + + # -- 4. Fetch a single record by ID ---------------------------- + print("\n" + "-" * 60) + print("4. Fetch a single record by ID") + print("-" * 60) + + first_id = new_accounts["accountid"].iloc[0] + print(f" Fetching record {first_id}...") + single = client.dataframe.get(table, record_id=first_id, select=select_cols) + print(f"[OK] Single record DataFrame:\n{single.to_string(index=False)}") + + # -- 5. 
Update records from a DataFrame ------------------------ + print("\n" + "-" * 60) + print("5. Update records with different values per row") + print("-" * 60) + + new_accounts["telephone1"] = ["555-1100", "555-1200", "555-1300"] + print(f" New telephone numbers: {new_accounts['telephone1'].tolist()}") + client.dataframe.update(table, new_accounts[["accountid", "telephone1"]], id_column="accountid") + print("[OK] Updated 3 records") + + # Verify the updates + verified = client.dataframe.get(table, select=select_cols, filter=test_filter) + print(f" Verified:\n{verified.to_string(index=False)}") + + # -- 6. Broadcast update (same value to all records) ----------- + print("\n" + "-" * 60) + print("6. Broadcast update (same value to all records)") + print("-" * 60) + + broadcast_df = new_accounts[["accountid"]].copy() + broadcast_df["websiteurl"] = "https://updated.example.com" + print(f" Setting websiteurl to 'https://updated.example.com' for all {len(broadcast_df)} records") + client.dataframe.update(table, broadcast_df, id_column="accountid") + print("[OK] Broadcast update complete") + + # Verify all records have the same websiteurl + verified = client.dataframe.get(table, select=select_cols, filter=test_filter) + print(f" Verified:\n{verified.to_string(index=False)}") + + # Default: NaN/None fields are skipped (not overridden on server) + print("\n Updating with NaN values (default: clear_nulls=False, fields should stay unchanged)...") + sparse_df = pd.DataFrame( + [ + {"accountid": new_accounts["accountid"].iloc[0], "telephone1": "555-9999", "websiteurl": None}, + ] + ) + client.dataframe.update(table, sparse_df, id_column="accountid") + verified = client.dataframe.get(table, select=select_cols, filter=test_filter) + print(f" Verified (Contoso telephone1 updated, websiteurl unchanged):\n{verified.to_string(index=False)}") + + # Opt-in: clear_nulls=True sends None as null to clear the field + print("\n Clearing websiteurl for Contoso with clear_nulls=True...") + 
clear_df = pd.DataFrame([{"accountid": new_accounts["accountid"].iloc[0], "websiteurl": None}]) + client.dataframe.update(table, clear_df, id_column="accountid", clear_nulls=True) + verified = client.dataframe.get(table, select=select_cols, filter=test_filter) + print(f" Verified (Contoso websiteurl should be empty):\n{verified.to_string(index=False)}") + + # -- 7. Delete records by passing a Series of GUIDs ------------ + print("\n" + "-" * 60) + print("7. Delete records by passing a Series of GUIDs") + print("-" * 60) + + print(f" Deleting {len(new_accounts)} records...") + client.dataframe.delete(table, new_accounts["accountid"], use_bulk_delete=False) + print(f"[OK] Deleted {len(new_accounts)} records") + + # Verify deletions - filter for our tagged records should return 0 + remaining = client.dataframe.get(table, select=select_cols, filter=test_filter) + print(f" Verified: {len(remaining)} test records remaining (expected 0)") + + print("\n" + "=" * 60) + print("[OK] DataFrame operations walkthrough complete!") + print("=" * 60) + + +if __name__ == "__main__": + main() diff --git a/examples/advanced/datascience_risk_assessment.py b/examples/advanced/datascience_risk_assessment.py new file mode 100644 index 00000000..338713e4 --- /dev/null +++ b/examples/advanced/datascience_risk_assessment.py @@ -0,0 +1,764 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +""" +PowerPlatform Dataverse Client - Data Science Risk Assessment Pipeline + +End-to-end example: Extract Dataverse data into DataFrames, run statistical +analysis, generate LLM-powered risk summaries, and write results back to +Dataverse -- a realistic data analyst / data scientist workflow. + +Pipeline flow (matches the SDK architecture): + Dataverse SDK --> Pandas DataFrame --> Analysis + LLM --> Write-back & Reports + +Scenario: + A financial services company tracks customer accounts, service cases, and + revenue opportunities in Dataverse. 
The risk team needs to: + 1) Pull data from multiple tables into DataFrames + 2) Compute risk scores using statistical analysis (pandas/numpy) + 3) Classify and summarize risk using an LLM + 4) Write risk assessments back to Dataverse + 5) Produce a summary report + + Note: This example reads from existing Dataverse tables (account, + incident, opportunity) and does not create or delete any tables. + Step 4 (write-back) is disabled by default -- uncomment it in + run_risk_pipeline() to write risk scores back to account records. + +Prerequisites (required -- included in SDK dependencies): + pip install PowerPlatform-Dataverse-Client + pip install azure-identity + +Additional libraries (optional -- used for visualization and LLM; not part +of the SDK and must be installed separately. Pick ONE LLM provider): + pip install matplotlib # for charts / visualization + pip install azure-ai-inference # Option A: Azure AI Foundry / Azure OpenAI + pip install openai # Option B: OpenAI / Azure OpenAI + pip install github-copilot-sdk # Option C: GitHub Copilot SDK (requires Copilot CLI) +""" + +import sys +import warnings +from pathlib import Path +from textwrap import dedent + +# Suppress MSAL advisory about response_mode (third-party library, not actionable here) +warnings.filterwarnings("ignore", message="response_mode=.*form_post", category=UserWarning) + +import numpy as np +import pandas as pd +from azure.identity import InteractiveBrowserCredential + +from PowerPlatform.Dataverse.client import DataverseClient + +# -- Optional imports (graceful degradation if not installed) ------ + +try: + import matplotlib + + matplotlib.use("Agg") # non-interactive backend (no GUI required) + import matplotlib.pyplot as plt + + HAS_MATPLOTLIB = True +except ImportError: + HAS_MATPLOTLIB = False + + +# ================================================================ +# LLM Provider Configuration +# ================================================================ +# Choose one of three 
LLM providers. The first available one is used. +# Each provider has its own init function below. +# +# Option A: Azure AI Inference (azure-ai-inference) +# - Works with Azure AI Foundry, Azure OpenAI, and GitHub Models +# - Uses azure.core.credentials (same auth pattern as Dataverse SDK) +# - pip install azure-ai-inference +# +# Option B: OpenAI (openai) +# - Works with OpenAI API and Azure OpenAI +# - pip install openai +# +# Option C: GitHub Copilot SDK (github-copilot-sdk) +# - Uses your existing GitHub Copilot subscription (no separate API key) +# - Requires the Copilot CLI binary and async execution +# - pip install github-copilot-sdk +# ================================================================ + + +def get_llm_client(provider=None, endpoint=None, api_key=None, model="gpt-4o"): + """Create an LLM client using the specified (or first available) provider. + + Returns a callable: llm_complete(system_prompt, user_prompt) -> str + Returns None if no provider is available. + + The returned callable also has a `.log` attribute (list) that records + each call's prompt, response, timing, and provider metadata. 
+ """ + providers = [provider] if provider else ["azure-ai-inference", "openai", "copilot-sdk"] + + for p in providers: + client = _try_init_provider(p, endpoint, api_key, model) + if client is not None: + return client + + return None + + +def _wrap_with_logging(raw_complete, provider_name, model_name): + """Wrap a raw complete function with timing and logging.""" + import time + + log = [] + + def complete(system_prompt, user_prompt): + start = time.time() + response = raw_complete(system_prompt, user_prompt) + elapsed = time.time() - start + log.append( + { + "provider": provider_name, + "model": model_name, + "system_prompt": system_prompt, + "user_prompt": user_prompt, + "response": response, + "elapsed_seconds": round(elapsed, 2), + } + ) + return response + + complete.log = log + complete.provider_name = provider_name + complete.model_name = model_name + return complete + + +def _try_init_provider(name, endpoint, api_key, model): + """Try to initialize a specific LLM provider. Returns callable or None.""" + if name == "azure-ai-inference": + return _init_azure_ai(endpoint, api_key, model) + elif name == "openai": + return _init_openai(endpoint, api_key, model) + elif name == "copilot-sdk": + return _init_copilot_sdk() + return None + + +def _init_azure_ai(endpoint, api_key, model): + """Initialize Azure AI Inference client (Azure AI Foundry / Azure OpenAI).""" + try: + from azure.ai.inference import ChatCompletionsClient + from azure.ai.inference.models import SystemMessage, UserMessage + from azure.core.credentials import AzureKeyCredential + except ImportError: + return None + + if not endpoint or not api_key: + return None + + client = ChatCompletionsClient( + endpoint=endpoint, + credential=AzureKeyCredential(api_key), + ) + + def complete(system_prompt, user_prompt): + response = client.complete( + messages=[ + SystemMessage(content=system_prompt), + UserMessage(content=user_prompt), + ], + max_tokens=150, + temperature=0.3, + ) + return 
response.choices[0].message.content.strip() + + print("[INFO] LLM provider: Azure AI Inference") + return _wrap_with_logging(complete, "Azure AI Inference", model) + + +def _init_openai(endpoint, api_key, model): + """Initialize OpenAI client (OpenAI API or Azure OpenAI).""" + try: + import openai + except ImportError: + return None + + if not api_key: + return None + + if endpoint: + # Azure OpenAI + client = openai.AzureOpenAI( + azure_endpoint=endpoint, + api_key=api_key, + api_version="2024-02-01", + ) + else: + # OpenAI API + client = openai.OpenAI(api_key=api_key) + + def complete(system_prompt, user_prompt): + response = client.chat.completions.create( + model=model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ], + max_tokens=150, + temperature=0.3, + ) + return response.choices[0].message.content.strip() + + provider_name = "Azure OpenAI" if endpoint else "OpenAI" + print(f"[INFO] LLM provider: {provider_name}") + return _wrap_with_logging(complete, provider_name, model) + + +def _init_copilot_sdk(): + """Initialize GitHub Copilot SDK. + + # Uncomment and configure to use your Copilot subscription as the LLM provider. + # Requires: pip install github-copilot-sdk + # Copilot CLI must be installed. See: https://github.com/github/copilot-sdk + """ + # To enable, install the SDK and uncomment the implementation below. + # from copilot import CopilotClient + # ... 
(see Copilot SDK docs for session/send_and_wait usage) + return None + + +# ================================================================ +# Configuration +# ================================================================ + +# Tables used in this demo (schema names) +TABLE_ACCOUNTS = "account" +TABLE_CASES = "incident" +TABLE_OPPORTUNITIES = "opportunity" + +# Risk score thresholds +RISK_HIGH = 75 +RISK_MEDIUM = 40 + +# -- Output folder for exports and reports (relative to this script) -- +_SCRIPT_DIR = Path(__file__).resolve().parent +OUTPUT_DIR = _SCRIPT_DIR / "risk_assessment_output" + + +def main(): + """Entry point -- authenticate and run the pipeline.""" + base_url = input("Enter Dataverse org URL (e.g. https://yourorg.crm.dynamics.com): ").strip() + if not base_url: + print("[ERR] No URL entered; exiting.") + sys.exit(1) + base_url = base_url.rstrip("/") + + print("[INFO] Authenticating via browser...") + credential = InteractiveBrowserCredential() + + with DataverseClient(base_url, credential) as client: + run_risk_pipeline(client) + + +# ================================================================ +# Step 1: Extract -- Pull data from Dataverse into DataFrames +# ================================================================ + + +def step1_extract(client): + """Extract accounts, cases, and opportunities from Dataverse.""" + print("\n" + "=" * 60) + print("STEP 1: Extract data from Dataverse") + print("=" * 60) + + # Pull accounts + accounts = client.dataframe.get( + TABLE_ACCOUNTS, + select=["accountid", "name", "revenue", "numberofemployees", "industrycode"], + filter="statecode eq 0", + top=200, + ) + print(f"[OK] Extracted {len(accounts)} active accounts") + + # Pull open cases (service incidents) + cases = client.dataframe.get( + TABLE_CASES, + select=[ + "incidentid", + "_customerid_value", + "title", + "severitycode", + "prioritycode", + "createdon", + ], + filter="statecode eq 0", + top=1000, + ) + print(f"[OK] Extracted {len(cases)} open 
cases") + + # Pull active opportunities + opportunities = client.dataframe.get( + TABLE_OPPORTUNITIES, + select=[ + "opportunityid", + "_parentaccountid_value", + "name", + "estimatedvalue", + "closeprobability", + "estimatedclosedate", + ], + filter="statecode eq 0", + top=1000, + ) + print(f"[OK] Extracted {len(opportunities)} active opportunities") + + return accounts, cases, opportunities + + +# ================================================================ +# Step 2: Transform & Analyze -- Statistical risk scoring +# ================================================================ + + +def step2_analyze(accounts, cases, opportunities): + """Compute risk scores using pandas statistical operations.""" + print("\n" + "=" * 60) + print("STEP 2: Statistical analysis -- compute risk scores") + print("=" * 60) + + # -- Case severity analysis per account -- + if not cases.empty and "_customerid_value" in cases.columns: + case_stats = ( + cases.groupby("_customerid_value") + .agg( + total_cases=("incidentid", "count"), + high_severity_cases=("severitycode", lambda x: (x == 1).sum()), + avg_priority=("prioritycode", "mean"), + ) + .reset_index() + .rename(columns={"_customerid_value": "accountid"}) + ) + else: + case_stats = pd.DataFrame(columns=["accountid", "total_cases", "high_severity_cases", "avg_priority"]) + + # -- Opportunity pipeline analysis per account -- + if not opportunities.empty and "_parentaccountid_value" in opportunities.columns: + # Precompute weighted value to avoid closure-based aggregation + opportunities = opportunities.copy() + opportunities["_weighted_value"] = ( + pd.to_numeric(opportunities["estimatedvalue"], errors="coerce").fillna(0) + * pd.to_numeric(opportunities["closeprobability"], errors="coerce").fillna(0) + / 100 + ) + opp_stats = ( + opportunities.groupby("_parentaccountid_value") + .agg( + total_opportunities=("opportunityid", "count"), + pipeline_value=("estimatedvalue", "sum"), + avg_close_probability=("closeprobability", 
"mean"), + weighted_pipeline=("_weighted_value", "sum"), + ) + .reset_index() + .rename(columns={"_parentaccountid_value": "accountid"}) + ) + else: + opp_stats = pd.DataFrame( + columns=[ + "accountid", + "total_opportunities", + "pipeline_value", + "avg_close_probability", + "weighted_pipeline", + ] + ) + + # -- Join everything into a single risk DataFrame -- + risk_df = accounts.merge(case_stats, on="accountid", how="left") + risk_df = risk_df.merge(opp_stats, on="accountid", how="left") + + # Ensure source numeric columns from Dataverse are proper numeric dtypes + for col in ["revenue", "numberofemployees"]: + if col in risk_df.columns: + risk_df[col] = pd.to_numeric(risk_df[col], errors="coerce").fillna(0) + + # Fill NaN for accounts with no cases/opportunities and ensure numeric dtypes + # (Dataverse may return values as object dtype) + for col in ["total_cases", "high_severity_cases"]: + risk_df[col] = pd.to_numeric(risk_df[col], errors="coerce").fillna(0).astype(int) + for col in ["avg_priority", "pipeline_value", "avg_close_probability", "weighted_pipeline"]: + risk_df[col] = pd.to_numeric(risk_df[col], errors="coerce").fillna(0).astype(float) + risk_df["total_opportunities"] = ( + pd.to_numeric(risk_df["total_opportunities"], errors="coerce").fillna(0).astype(int) + ) + + # -- Compute composite risk score (0-100) -- + # Factors: case severity, case volume, low pipeline, low close probability + risk_df["risk_score"] = compute_risk_score(risk_df) + + # -- Classify risk tier -- + risk_df["risk_tier"] = risk_df["risk_score"].apply(classify_risk) + + print(f"[OK] Computed risk scores for {len(risk_df)} accounts") + print(f" High risk: {(risk_df['risk_tier'] == 'High').sum()}") + print(f" Medium risk: {(risk_df['risk_tier'] == 'Medium').sum()}") + print(f" Low risk: {(risk_df['risk_tier'] == 'Low').sum()}") + + # -- Summary statistics -- + print("\n Risk score distribution:") + print(f" Mean: {risk_df['risk_score'].mean():.1f}") + print(f" Median: 
{risk_df['risk_score'].median():.1f}") + print(f" Std: {risk_df['risk_score'].std():.1f}") + print(f" Min: {risk_df['risk_score'].min():.1f}") + print(f" Max: {risk_df['risk_score'].max():.1f}") + + return risk_df + + +def compute_risk_score(df): + """Compute a 0-100 risk score from multiple factors. + + Higher score = higher risk. Weighted formula: + - 35%: case severity (high-severity cases relative to total) + - 25%: case volume (normalized by percentile) + - 20%: pipeline weakness (inverse of weighted pipeline value) + - 20%: close probability risk (inverse of avg close probability) + """ + scores = pd.Series(0.0, index=df.index) + + # Factor 1: Case severity ratio (35%) + case_total = df["total_cases"].clip(lower=1) + severity_ratio = df["high_severity_cases"] / case_total + scores += severity_ratio * 35 + + # Factor 2: Case volume (25%) -- percentile rank + if df["total_cases"].max() > 0: + case_pctile = df["total_cases"].rank(pct=True) + scores += case_pctile * 25 + else: + scores += 12.5 # neutral if no cases exist + + # Factor 3: Pipeline weakness (20%) -- low pipeline = high risk + max_pipeline = df["weighted_pipeline"].max() + if max_pipeline > 0: + pipeline_strength = df["weighted_pipeline"] / max_pipeline + scores += (1 - pipeline_strength) * 20 + else: + scores += 10 # neutral + + # Factor 4: Close probability risk (20%) + close_risk = (100 - df["avg_close_probability"]) / 100 + scores += close_risk * 20 + + return scores.clip(0, 100).round(1) + + +def classify_risk(score): + """Classify a risk score into High / Medium / Low.""" + if score >= RISK_HIGH: + return "High" + elif score >= RISK_MEDIUM: + return "Medium" + return "Low" + + +# ================================================================ +# Step 3: LLM Summarization -- Generate risk narratives +# ================================================================ + + +def step3_summarize(risk_df, llm_complete=None): + """Generate per-account risk summaries using LLM or template fallback.""" 
+ print("\n" + "=" * 60) + print("STEP 3: Generate risk summaries") + print("=" * 60) + + # Focus on high and medium risk accounts + flagged = risk_df[risk_df["risk_tier"].isin(["High", "Medium"])].copy() + print(f"[INFO] Generating summaries for {len(flagged)} flagged accounts") + + if llm_complete is not None: + summaries = _summarize_with_llm(flagged, llm_complete) + # Export LLM interaction log + if hasattr(llm_complete, "log") and llm_complete.log: + _export_llm_log(llm_complete) + else: + print("[INFO] No LLM provider configured -- using template-based summarization") + summaries = _summarize_with_template(flagged) + + flagged["risk_summary"] = summaries + summary_map = dict(zip(flagged["accountid"], flagged["risk_summary"])) + risk_df["risk_summary"] = risk_df["accountid"].map(summary_map).fillna("Low risk -- no action needed.") + + print(f"[OK] Generated {len(summaries)} risk summaries") + + # Show top 3 highest risk accounts + top_risk = risk_df.nlargest(3, "risk_score") + for _, row in top_risk.iterrows(): + print(f"\n Account: {row.get('name', 'Unknown')}") + print(f" Risk Score: {row['risk_score']} ({row['risk_tier']})") + print(f" Summary: {row['risk_summary'][:120]}...") + + return risk_df + + +def _summarize_with_llm(flagged_df, llm_complete): + """Generate risk narratives using the configured LLM provider.""" + system_prompt = ( + "You are a customer risk analyst at a financial services company. " + "Write exactly 2-3 sentences per account. " + "Sentence 1: State the risk level and primary driver. " + "Sentence 2: Quantify the key metric(s) behind the risk. " + "Sentence 3 (if needed): Recommend one specific action. " + "Use plain business language. Do not use bullet points or markdown." 
+ ) + + summaries = [] + for _, row in flagged_df.iterrows(): + user_prompt = dedent(f"""\ + Summarize the risk for this account: + + Account Name: {row.get("name", "Unknown")} + Risk Score: {row["risk_score"]:.0f}/100 ({row["risk_tier"]} risk) + Open Support Cases: {row["total_cases"]} total, {row["high_severity_cases"]} high-severity + Revenue Pipeline: ${row["pipeline_value"]:,.0f} total, ${row["weighted_pipeline"]:,.0f} probability-weighted + Average Deal Close Probability: {row["avg_close_probability"]:.0f}% + """) + + summary = llm_complete(system_prompt, user_prompt) + summaries.append(summary) + + return summaries + + +def _summarize_with_template(flagged_df): + """Template-based fallback when LLM is not available.""" + summaries = [] + for _, row in flagged_df.iterrows(): + name = row.get("name", "Unknown") + parts = [] + + if row["high_severity_cases"] > 0: + parts.append(f"{row['high_severity_cases']} high-severity cases require immediate attention") + if row["total_cases"] > 5: + parts.append(f"elevated case volume ({row['total_cases']} open)") + + if row["weighted_pipeline"] < 10000: + parts.append("weak revenue pipeline") + if row["avg_close_probability"] < 30: + parts.append(f"low close probability ({row['avg_close_probability']:.0f}%)") + + if not parts: + parts.append("multiple moderate risk factors detected") + + risk_factors = "; ".join(parts) + summary = ( + f"{name} has a {row['risk_tier'].lower()} risk score of " + f"{row['risk_score']:.0f}/100. Key factors: {risk_factors}. " + f"Recommend proactive outreach and account review." + ) + summaries.append(summary) + + return summaries + + +def _export_llm_log(llm_complete, include_prompts=False): + """Export LLM interaction log (timing, provider metadata) to a text file. + + By default, prompt and response content is not included to avoid logging + sensitive data (PII, customer data). Set include_prompts=True to include + full content for debugging. 
+ """ + log_path = OUTPUT_DIR / "llm_interactions.txt" + with open(log_path, "w", encoding="utf-8") as f: + f.write("LLM Interaction Log\n") + f.write("=" * 70 + "\n") + f.write(f"Provider: {llm_complete.provider_name}\n") + f.write(f"Model: {llm_complete.model_name}\n") + f.write(f"Total calls: {len(llm_complete.log)}\n") + total_time = sum(entry["elapsed_seconds"] for entry in llm_complete.log) + f.write(f"Total time: {total_time:.1f}s\n") + f.write("=" * 70 + "\n\n") + + for i, entry in enumerate(llm_complete.log, 1): + f.write(f"--- Call {i} ({entry['elapsed_seconds']:.2f}s) ---\n\n") + if include_prompts: + f.write(f"[System Prompt]\n{entry['system_prompt']}\n\n") + f.write(f"[User Prompt]\n{entry['user_prompt']}\n\n") + f.write(f"[Response]\n{entry['response']}\n\n") + else: + f.write(f"[Response length: {len(entry['response'])} chars]\n\n") + + print(f"[OK] LLM interaction log saved to {log_path}") + + +# ================================================================ +# Step 4: Write-back -- Store results in Dataverse +# ================================================================ + + +def step4_writeback(client, risk_df): + """Write risk scores and summaries back to Dataverse accounts.""" + print("\n" + "=" * 60) + print("STEP 4: Write risk assessments back to Dataverse") + print("=" * 60) + + # Update accounts with risk data + # Note: These columns must exist on the account table in your environment. 
+ # In a real deployment, you would create custom columns like: + # new_riskscore (Whole Number), new_risktier (Text), new_risksummary (Multi-line Text) + update_df = risk_df[["accountid", "description"]].copy() + update_df["description"] = risk_df.apply( + lambda r: f"[Risk: {r['risk_tier']} ({r['risk_score']:.0f}/100)] {r['risk_summary']}", + axis=1, + ) + + client.dataframe.update(TABLE_ACCOUNTS, update_df, id_column="accountid") + print(f"[OK] Updated {len(update_df)} account records with risk assessments") + + +# ================================================================ +# Step 5: Report -- Produce summary output +# ================================================================ + + +def step5_report(risk_df): + """Generate a summary report with optional visualization.""" + print("\n" + "=" * 60) + print("STEP 5: Risk assessment report") + print("=" * 60) + + # -- Tabular summary -- + tier_summary = ( + risk_df.groupby("risk_tier") + .agg( + count=("accountid", "count"), + avg_score=("risk_score", "mean"), + total_cases=("total_cases", "sum"), + total_pipeline=("pipeline_value", "sum"), + ) + .round(1) + ) + print("\nRisk Tier Summary:") + print(tier_summary.to_string()) + + # -- Top 10 highest risk accounts -- + top10 = risk_df.nlargest(10, "risk_score")[ + ["name", "risk_score", "risk_tier", "total_cases", "high_severity_cases", "pipeline_value"] + ] + print("\nTop 10 Highest Risk Accounts:") + print(top10.to_string(index=False)) + + # -- Visualization (optional) -- + if HAS_MATPLOTLIB: + _generate_charts(risk_df) + else: + print("\n[INFO] Install matplotlib for risk visualization charts") + + # -- Export data to CSV -- + risk_df.to_csv(OUTPUT_DIR / "risk_scores.csv", index=False) + top10.to_csv(OUTPUT_DIR / "top10_risk.csv", index=False) + tier_summary.to_csv(OUTPUT_DIR / "tier_summary.csv") + print(f"\n[OK] Exported CSV reports to {OUTPUT_DIR}/") + + print("\n[OK] Risk assessment pipeline complete!") + + +def _generate_charts(risk_df): + 
"""Generate risk distribution charts.""" + fig, axes = plt.subplots(1, 3, figsize=(16, 5)) + fig.suptitle("Customer Account Risk Assessment", fontsize=14, fontweight="bold") + + # Chart 1: Risk score distribution + axes[0].hist(risk_df["risk_score"], bins=20, color="#4472C4", edgecolor="white") + axes[0].axvline(RISK_HIGH, color="red", linestyle="--", label=f"High ({RISK_HIGH})") + axes[0].axvline(RISK_MEDIUM, color="orange", linestyle="--", label=f"Medium ({RISK_MEDIUM})") + axes[0].set_title("Risk Score Distribution") + axes[0].set_xlabel("Risk Score") + axes[0].set_ylabel("Number of Accounts") + axes[0].legend() + + # Chart 2: Risk tier breakdown (pie) + tier_counts = risk_df["risk_tier"].value_counts() + colors = {"High": "#FF4444", "Medium": "#FFA500", "Low": "#44BB44"} + axes[1].pie( + tier_counts.values, + labels=tier_counts.index, + colors=[colors.get(t, "#888") for t in tier_counts.index], + autopct="%1.0f%%", + startangle=90, + ) + axes[1].set_title("Risk Tier Breakdown") + + # Chart 3: Cases vs Pipeline scatter + axes[2].scatter( + risk_df["total_cases"], + risk_df["pipeline_value"], + c=risk_df["risk_score"], + cmap="RdYlGn_r", + alpha=0.7, + edgecolors="gray", + s=60, + ) + axes[2].set_title("Cases vs Pipeline (color = risk)") + axes[2].set_xlabel("Open Cases") + axes[2].set_ylabel("Pipeline Value ($)") + + plt.tight_layout() + chart_path = OUTPUT_DIR / "risk_assessment_report.png" + plt.savefig(chart_path, dpi=150, bbox_inches="tight") + print(f"[OK] Saved {chart_path}") + # plt.show() # Uncomment for interactive display + + +# ================================================================ +# Pipeline Orchestrator +# ================================================================ + + +def run_risk_pipeline(client): + """Run the full risk assessment pipeline.""" + OUTPUT_DIR.mkdir(exist_ok=True) + print(f"[INFO] Output folder: {OUTPUT_DIR.resolve()}") + + print("\n" + "#" * 60) + print(" CUSTOMER RISK ASSESSMENT PIPELINE") + print(" Dataverse SDK -> 
Pandas -> Analysis -> LLM -> Write-back") + print("#" * 60) + + # Step 1: Extract data from Dataverse into DataFrames + accounts, cases, opportunities = step1_extract(client) + + if accounts.empty: + print("[WARN] No accounts found -- nothing to analyze.") + return + + # Step 2: Statistical analysis and risk scoring + risk_df = step2_analyze(accounts, cases, opportunities) + + # Step 3: LLM-powered risk summarization + # Configure your LLM provider (uncomment one): + # Option A: Azure AI Inference + # llm = get_llm_client("azure-ai-inference", endpoint="https://...", api_key="...") + # Option B: OpenAI + # llm = get_llm_client("openai", api_key="sk-...") + # Option C: Azure OpenAI (via openai package) + # llm = get_llm_client("openai", endpoint="https://...", api_key="...") + # Option D: GitHub Copilot SDK (no API key -- uses your Copilot subscription) + # llm = get_llm_client("copilot-sdk") + # Auto-detect (tries all providers in order): + # llm = get_llm_client(endpoint="https://...", api_key="...") + llm = None # Set to get_llm_client(...) to enable LLM summarization + risk_df = step3_summarize(risk_df, llm_complete=llm) + + # Step 4: Write results back to Dataverse + # Uncomment the next line to write back (requires custom columns on account table) + # step4_writeback(client, risk_df) + print("\n[INFO] Step 4 (write-back) is commented out by default.") + print(" Uncomment step4_writeback() after adding custom columns to account table.") + + # Step 5: Generate summary report + charts + step5_report(risk_df) + + +if __name__ == "__main__": + main() diff --git a/examples/advanced/prodev_quick_start.py b/examples/advanced/prodev_quick_start.py new file mode 100644 index 00000000..e28d1575 --- /dev/null +++ b/examples/advanced/prodev_quick_start.py @@ -0,0 +1,514 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+
+"""
+PowerPlatform Dataverse Client - Pro-Dev Quick Start
+
+A developer-focused example that demonstrates the full SDK lifecycle:
+install, authenticate, create a system with 4 related tables, populate
+data, query it, and clean up -- all in a single script.
+
+What this example covers:
+  1) SDK installation and authentication
+  2) Create 4 custom tables (Customer, Project, Task, TimeEntry)
+  3) Create columns and relationships between tables
+  4) Populate with sample data using DataFrame CRUD
+  5) Query and analyze data; update and delete records
+  6) Clean up (delete tables)
+
+  Note: The last step (cleanup) automatically deletes all demo tables.
+  Comment out the cleanup() call in run_demo() if you want to keep the
+  tables in your environment for inspection.
+
+Why pandas DataFrames?
+  This example uses client.dataframe (pandas) instead of raw dict/list CRUD
+  because DataFrames provide significant advantages for multi-record operations:
+
+  - Batch operations are natural: create 100 records from a DataFrame in one
+    call vs. looping over 100 dicts
+  - Column operations (broadcast a value, compute derived fields) are one-liners
+    instead of for-loops
+  - Joins and aggregations across tables use pandas merge/groupby -- far more
+    readable than manual dict matching
+  - NaN/None handling is built in (clear_nulls flag controls whether missing
+    values clear server fields or are skipped)
+  - NumPy type normalization is automatic (int64, float64, Timestamps all
+    serialize to JSON correctly without manual conversion)
+
+  The SDK also supports plain dict/list CRUD via client.records for single-record
+  operations or when pandas is not needed. Both approaches use the same underlying
+  Dataverse Web API calls. 
+ +Prerequisites: + pip install PowerPlatform-Dataverse-Client + pip install azure-identity +""" + +import sys +import uuid +import warnings +from pathlib import Path + +# Suppress MSAL advisory about response_mode (third-party library, not actionable here) +warnings.filterwarnings("ignore", message="response_mode=.*form_post", category=UserWarning) + +import pandas as pd +from azure.identity import InteractiveBrowserCredential + +from PowerPlatform.Dataverse.client import DataverseClient + +# -- Table schema names -- +# Uses the standard 'new_' publisher prefix (default Dataverse publisher). +# A unique suffix avoids collisions with existing tables. +SUFFIX = uuid.uuid4().hex[:6] +TABLE_CUSTOMER = f"new_DemoCustomer{SUFFIX}" +TABLE_PROJECT = f"new_DemoProject{SUFFIX}" +TABLE_TASK = f"new_DemoTask{SUFFIX}" +TABLE_TIMEENTRY = f"new_DemoTimeEntry{SUFFIX}" + +# -- Output folder for exported data (relative to this script) -- +_SCRIPT_DIR = Path(__file__).resolve().parent +OUTPUT_DIR = _SCRIPT_DIR / "prodev_output" + + +def main(): + """Entry point.""" + print("=" * 60) + print(" DATAVERSE PYTHON SDK -- PRO-DEV QUICK START") + print("=" * 60) + print() + print(" Step 0: pip install PowerPlatform-Dataverse-Client") + print(" (already done if you're running this script)") + print() + + base_url = input("Enter Dataverse org URL (e.g. 
https://yourorg.crm.dynamics.com): ").strip() + if not base_url: + print("[ERR] No URL entered; exiting.") + sys.exit(1) + base_url = base_url.rstrip("/") + + print("[INFO] Authenticating via browser (Azure Identity)...") + credential = InteractiveBrowserCredential() + + with DataverseClient(base_url, credential) as client: + try: + run_demo(client) + except Exception as e: + print(f"\n[ERR] {e}") + print("[INFO] Attempting cleanup...") + cleanup(client) + raise + + +def run_demo(client): + """Run the full pro-dev demo pipeline.""" + OUTPUT_DIR.mkdir(exist_ok=True) + print(f"[INFO] Output folder: {OUTPUT_DIR.resolve()}") + + # -- Step 1: Create 4 tables -- + primary_name_col, primary_id_col = step1_create_tables(client) + + # -- Step 2: Create relationships -- + step2_create_relationships(client) + + # -- Step 3: Populate with sample data -- + customer_ids, project_ids, task_ids = step3_populate_data(client, primary_name_col) + + # -- Step 4: Query and analyze -- + step4_query_and_analyze(client, customer_ids, primary_name_col) + + # -- Step 5: Update and delete -- + step5_update_and_delete(client, task_ids, primary_name_col, primary_id_col) + + # -- Step 6: Cleanup (optional -- prompts before deleting) -- + do_cleanup = input("\n6. Delete demo tables and cleanup? 
(Y/n): ").strip() or "y" + if do_cleanup.lower() in ("y", "yes"): + cleanup(client) + else: + print("[INFO] Tables kept for inspection.") + + print("\n" + "=" * 60) + print("[OK] Pro-dev quick start demo complete!") + print("=" * 60) + + +# ================================================================ +# Step 1: Create tables +# ================================================================ + + +def step1_create_tables(client): + """Create 4 custom tables.""" + print("\n" + "-" * 60) + print("STEP 1: Create 4 custom tables") + print("-" * 60) + + # Customer table + result = client.tables.create( + TABLE_CUSTOMER, + { + f"{TABLE_CUSTOMER}_Email": "string", + f"{TABLE_CUSTOMER}_Industry": "string", + f"{TABLE_CUSTOMER}_Revenue": "money", + }, + ) + # The primary column logical names are returned by tables.create() + # so we know exactly what keys to use in payloads and queries. + primary_name_col = result.primary_name_attribute + primary_id_col = result.primary_id_attribute + print(f"[OK] Created table: {TABLE_CUSTOMER} (name: {primary_name_col}, id: {primary_id_col})") + + # Project table + client.tables.create( + TABLE_PROJECT, + { + f"{TABLE_PROJECT}_Budget": "money", + f"{TABLE_PROJECT}_Status": "string", + f"{TABLE_PROJECT}_StartDate": "datetime", + }, + ) + print(f"[OK] Created table: {TABLE_PROJECT}") + + # Task table + client.tables.create( + TABLE_TASK, + { + f"{TABLE_TASK}_Priority": "integer", + f"{TABLE_TASK}_Status": "string", + f"{TABLE_TASK}_EstimatedHours": "decimal", + }, + ) + print(f"[OK] Created table: {TABLE_TASK}") + + # TimeEntry table + client.tables.create( + TABLE_TIMEENTRY, + { + f"{TABLE_TIMEENTRY}_Hours": "decimal", + f"{TABLE_TIMEENTRY}_Date": "datetime", + f"{TABLE_TIMEENTRY}_Description": "string", + }, + ) + print(f"[OK] Created table: {TABLE_TIMEENTRY}") + print(f"[OK] All 4 tables created (suffix: {SUFFIX})") + print(f"[INFO] Primary name column: '{primary_name_col}', ID column: '{primary_id_col}'") + + return primary_name_col, 
primary_id_col + + +# ================================================================ +# Step 2: Create relationships +# ================================================================ + + +def step2_create_relationships(client): + """Create relationships between the 4 tables using lookup fields.""" + print("\n" + "-" * 60) + print("STEP 2: Create relationships (lookup fields)") + print("-" * 60) + + # Customer 1:N Project (lookup on Project pointing to Customer) + client.tables.create_lookup_field( + referencing_table=TABLE_PROJECT.lower(), + lookup_field_name=f"{TABLE_PROJECT}_CustomerId", + referenced_table=TABLE_CUSTOMER.lower(), + display_name="Customer", + ) + print(f"[OK] {TABLE_CUSTOMER} 1:N {TABLE_PROJECT}") + + # Project 1:N Task (lookup on Task pointing to Project) + client.tables.create_lookup_field( + referencing_table=TABLE_TASK.lower(), + lookup_field_name=f"{TABLE_TASK}_ProjectId", + referenced_table=TABLE_PROJECT.lower(), + display_name="Project", + ) + print(f"[OK] {TABLE_PROJECT} 1:N {TABLE_TASK}") + + # Task 1:N TimeEntry (lookup on TimeEntry pointing to Task) + client.tables.create_lookup_field( + referencing_table=TABLE_TIMEENTRY.lower(), + lookup_field_name=f"{TABLE_TIMEENTRY}_TaskId", + referenced_table=TABLE_TASK.lower(), + display_name="Task", + ) + print(f"[OK] {TABLE_TASK} 1:N {TABLE_TIMEENTRY}") + + print("[OK] 3 lookup relationships created (Customer -> Project -> Task -> TimeEntry)") + + +# ================================================================ +# Step 3: Populate with sample data using DataFrame CRUD +# ================================================================ + + +def step3_populate_data(client, primary_name_col): + """Create sample records using client.dataframe.create(). + + Why DataFrames here instead of client.records.create()? 
+ + With client.records (dict/list): + ids = client.records.create("Customer", [ + {"name": "Contoso", "Email": "info@contoso.com", ...}, + {"name": "Fabrikam", "Email": "contact@fabrikam.com", ...}, + ]) + # ids is a plain list -- manual index tracking needed + + With client.dataframe (pandas): + df = pd.DataFrame([{"name": "Contoso", ...}, {"name": "Fabrikam", ...}]) + df["id"] = client.dataframe.create("Customer", df) + # IDs auto-aligned to rows -- use df["id"].iloc[0] to reference later + + The DataFrame approach is more natural when you need to: + - Reference created IDs for relationship binding (as we do here) + - Compute derived columns before writing + - Join/merge data across multiple tables for analysis + """ + print("\n" + "-" * 60) + print("STEP 3: Populate with sample data (DataFrame CRUD)") + print("-" * 60) + + # -- Customers -- + # Use the primary name column returned by tables.create() + name_col = primary_name_col + customers_df = pd.DataFrame( + [ + { + name_col: "Contoso Ltd", + f"{TABLE_CUSTOMER}_Email": "info@contoso.com", + f"{TABLE_CUSTOMER}_Industry": "Technology", + f"{TABLE_CUSTOMER}_Revenue": 5000000, + }, + { + name_col: "Fabrikam Inc", + f"{TABLE_CUSTOMER}_Email": "contact@fabrikam.com", + f"{TABLE_CUSTOMER}_Industry": "Manufacturing", + f"{TABLE_CUSTOMER}_Revenue": 12000000, + }, + { + name_col: "Northwind Traders", + f"{TABLE_CUSTOMER}_Email": "sales@northwind.com", + f"{TABLE_CUSTOMER}_Industry": "Retail", + f"{TABLE_CUSTOMER}_Revenue": 3000000, + }, + ] + ) + customer_ids = client.dataframe.create(TABLE_CUSTOMER, customers_df) + customers_df["id"] = customer_ids + print(f"[OK] Created {len(customers_df)} customers") + + # -- Projects (linked to customers via lookup) -- + # @odata.bind keys use the navigation property logical name (lowercase) + # and the entity set name (also lowercase) in the value. 
+ customer_lookup = f"{TABLE_PROJECT}_CustomerId".lower() + "@odata.bind" + customer_set = TABLE_CUSTOMER.lower() + "s" + projects_df = pd.DataFrame( + [ + { + name_col: "Cloud Migration", + f"{TABLE_PROJECT}_Budget": 250000, + f"{TABLE_PROJECT}_Status": "Active", + f"{TABLE_PROJECT}_StartDate": pd.Timestamp("2026-01-15"), + customer_lookup: f"/{customer_set}({customer_ids.iloc[0]})", + }, + { + name_col: "ERP Upgrade", + f"{TABLE_PROJECT}_Budget": 500000, + f"{TABLE_PROJECT}_Status": "Active", + f"{TABLE_PROJECT}_StartDate": pd.Timestamp("2026-02-01"), + customer_lookup: f"/{customer_set}({customer_ids.iloc[1]})", + }, + { + name_col: "POS Modernization", + f"{TABLE_PROJECT}_Budget": 150000, + f"{TABLE_PROJECT}_Status": "Planning", + f"{TABLE_PROJECT}_StartDate": pd.Timestamp("2026-03-01"), + customer_lookup: f"/{customer_set}({customer_ids.iloc[2]})", + }, + { + name_col: "Data Analytics Platform", + f"{TABLE_PROJECT}_Budget": 180000, + f"{TABLE_PROJECT}_Status": "Active", + f"{TABLE_PROJECT}_StartDate": pd.Timestamp("2026-01-20"), + customer_lookup: f"/{customer_set}({customer_ids.iloc[0]})", + }, + ] + ) + project_ids = client.dataframe.create(TABLE_PROJECT, projects_df) + projects_df["id"] = project_ids + print(f"[OK] Created {len(projects_df)} projects across 3 customers") + + # -- Tasks (linked to projects) -- + tasks_data = [] + task_names = [ + ("Infrastructure Setup", 1, "In Progress", 40), + ("Data Assessment", 2, "Not Started", 20), + ("Testing & QA", 1, "Not Started", 60), + ("Requirements Gathering", 1, "Complete", 30), + ("Development Sprint 1", 1, "In Progress", 80), + ("User Training", 3, "Not Started", 16), + ] + project_assignment = [0, 0, 0, 1, 1, 2] # which project each task belongs to + + for i, (task_name, priority, status, hours) in enumerate(task_names): + proj_idx = project_assignment[i] + project_lookup = f"{TABLE_TASK}_ProjectId".lower() + "@odata.bind" + project_set = TABLE_PROJECT.lower() + "s" + tasks_data.append( + { + name_col: 
task_name,
+                f"{TABLE_TASK}_Priority": priority,
+                f"{TABLE_TASK}_Status": status,
+                f"{TABLE_TASK}_EstimatedHours": hours,
+                project_lookup: f"/{project_set}({project_ids.iloc[proj_idx]})",
+            }
+        )
+
+    tasks_df = pd.DataFrame(tasks_data)
+    task_ids = client.dataframe.create(TABLE_TASK, tasks_df)
+    tasks_df["id"] = task_ids
+    print(f"[OK] Created {len(tasks_df)} tasks across 3 projects")
+
+    print(
+        f"\n    Total records: {len(customers_df) + len(projects_df) + len(tasks_df)} "
+        f"({len(customers_df)} customers, {len(projects_df)} projects, {len(tasks_df)} tasks)"
+    )
+
+    return customer_ids, project_ids, task_ids
+
+
+# ================================================================
+# Step 4: Query and analyze data
+# ================================================================
+
+
+def step4_query_and_analyze(client, customer_ids, primary_name_col):
+    """Query data and demonstrate DataFrame analysis."""
+    print("\n" + "-" * 60)
+    print("STEP 4: Query and analyze data")
+    print("-" * 60)
+
+    # Query all projects as a DataFrame
+    # Note: The SDK lowercases $select values automatically, so schema-name
+    # casing (e.g., new_DemoProject_Budget) works -- it becomes the logical name. 
+ name_attr = primary_name_col + projects = client.dataframe.get( + TABLE_PROJECT, + select=[ + name_attr, + f"{TABLE_PROJECT}_Budget", + f"{TABLE_PROJECT}_Status", + ], + ) + print(f"\n All projects ({len(projects)} rows):") + print(f"{projects.to_string(index=False)}") + + # Query tasks and analyze + tasks = client.dataframe.get( + TABLE_TASK, + select=[ + name_attr, + f"{TABLE_TASK}_Priority", + f"{TABLE_TASK}_Status", + f"{TABLE_TASK}_EstimatedHours", + ], + ) + print(f"\n All tasks ({len(tasks)} rows):") + print(f"{tasks.to_string(index=False)}") + + # -- DataFrame analysis -- + hours_col = f"{TABLE_TASK}_EstimatedHours" + status_col = f"{TABLE_TASK}_Status" + budget_col = f"{TABLE_PROJECT}_Budget" + + if hours_col in tasks.columns: + print(f"\n Task hours summary:") + print(f" Total estimated hours: {tasks[hours_col].sum():.0f}") + print(f" Average per task: {tasks[hours_col].mean():.1f}") + print(f" Max single task: {tasks[hours_col].max():.0f}") + + if status_col in tasks.columns: + print(f"\n Tasks by status:") + status_counts = tasks[status_col].value_counts() + for status, count in status_counts.items(): + print(f" {status}: {count}") + + if budget_col in projects.columns: + print(f"\n Project budget summary:") + print(f" Total budget: ${projects[budget_col].sum():,.0f}") + print(f" Average budget: ${projects[budget_col].mean():,.0f}") + + # Fetch single record by ID + first_id = customer_ids.iloc[0] + single = client.dataframe.get(TABLE_CUSTOMER, record_id=first_id) + print(f"\n Single customer record (by ID):") + print(f"{single.to_string(index=False)}") + + # -- Export query results to CSV -- + projects.to_csv(OUTPUT_DIR / "projects.csv", index=False) + tasks.to_csv(OUTPUT_DIR / "tasks.csv", index=False) + single.to_csv(OUTPUT_DIR / "single_customer.csv", index=False) + print(f"\n[OK] Exported query results to {OUTPUT_DIR}/") + + +# ================================================================ +# Step 5: Update and delete records +# 
================================================================ + + +def step5_update_and_delete(client, task_ids, primary_name_col, primary_id_col): + """Demonstrate update and delete with DataFrames.""" + print("\n" + "-" * 60) + print("STEP 5: Update and delete records") + print("-" * 60) + + status_col = f"{TABLE_TASK}_Status" + + # Update: mark first two tasks as "Complete" + # Use primary_id_col (from tables.create metadata) as the ID column name + update_df = pd.DataFrame( + { + primary_id_col: [task_ids.iloc[0], task_ids.iloc[1]], + status_col: ["Complete", "Complete"], + } + ) + client.dataframe.update(TABLE_TASK, update_df, id_column=primary_id_col) + print(f"[OK] Updated 2 tasks to 'Complete'") + + # Delete: remove the last task + delete_ids = pd.Series([task_ids.iloc[-1]]) + client.dataframe.delete(TABLE_TASK, delete_ids) + print(f"[OK] Deleted 1 task") + + # Verify + remaining = client.dataframe.get( + TABLE_TASK, + select=[primary_name_col, status_col], + ) + print(f"\n Remaining tasks ({len(remaining)}):") + print(f"{remaining.to_string(index=False)}") + + +# ================================================================ +# Cleanup +# ================================================================ + + +def cleanup(client): + """Delete all demo tables.""" + print("\n" + "-" * 60) + print("CLEANUP: Removing demo tables") + print("-" * 60) + + for table in [TABLE_TIMEENTRY, TABLE_TASK, TABLE_PROJECT, TABLE_CUSTOMER]: + try: + client.tables.delete(table) + print(f"[OK] Deleted table: {table}") + except Exception as e: + print(f"[WARN] Could not delete {table}: {e}") + + print("[OK] Cleanup complete") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 3efdf8f3..5c06f5ad 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "azure-identity>=1.17.0", "azure-core>=1.30.2", "requests>=2.32.0", + "pandas>=2.0.0", ] [project.urls] diff --git 
a/src/PowerPlatform/Dataverse/claude_skill/dataverse-sdk-use/SKILL.md b/src/PowerPlatform/Dataverse/claude_skill/dataverse-sdk-use/SKILL.md index f351c176..9edb733f 100644 --- a/src/PowerPlatform/Dataverse/claude_skill/dataverse-sdk-use/SKILL.md +++ b/src/PowerPlatform/Dataverse/claude_skill/dataverse-sdk-use/SKILL.md @@ -30,6 +30,9 @@ The SDK supports Dataverse's native bulk operations: Pass lists to `create()`, ` - Control page size with `page_size` parameter - Use `top` parameter to limit total records returned +### DataFrame Support +- DataFrame operations are accessed via the `client.dataframe` namespace: `client.dataframe.get()`, `client.dataframe.create()`, `client.dataframe.update()`, `client.dataframe.delete()` + ## Common Operations ### Import @@ -129,7 +132,7 @@ client.records.update("account", [id1, id2, id3], {"industry": "Technology"}) ``` #### Upsert Records -Creates or updates records identified by alternate keys. Single item → PATCH; multiple items → `UpsertMultiple` bulk action. +Creates or updates records identified by alternate keys. Single item -> PATCH; multiple items -> `UpsertMultiple` bulk action. > **Prerequisite**: The table must have an alternate key configured in Dataverse for the columns used in `alternate_key`. Without it, Dataverse will reject the request with a 400 error. ```python from PowerPlatform.Dataverse.models.upsert import UpsertItem @@ -171,6 +174,42 @@ client.records.delete("account", account_id) client.records.delete("account", [id1, id2, id3], use_bulk_delete=True) ``` +### DataFrame Operations + +The SDK provides DataFrame wrappers for all CRUD operations via the `client.dataframe` namespace, using pandas DataFrames and Series as input/output. 
+ +```python +import pandas as pd + +# Query records -- returns a single DataFrame +df = client.dataframe.get("account", filter="statecode eq 0", select=["name"]) +print(f"Got {len(df)} rows") + +# Limit results with top for large tables +df = client.dataframe.get("account", select=["name"], top=100) + +# Fetch single record as one-row DataFrame +df = client.dataframe.get("account", record_id=account_id, select=["name"]) + +# Create records from a DataFrame (returns a Series of GUIDs) +new_accounts = pd.DataFrame([ + {"name": "Contoso", "telephone1": "555-0100"}, + {"name": "Fabrikam", "telephone1": "555-0200"}, +]) +new_accounts["accountid"] = client.dataframe.create("account", new_accounts) + +# Update records from a DataFrame (id_column identifies the GUID column) +new_accounts["telephone1"] = ["555-0199", "555-0299"] +client.dataframe.update("account", new_accounts, id_column="accountid") + +# Clear a field by setting clear_nulls=True (by default, NaN/None fields are skipped) +df = pd.DataFrame([{"accountid": "guid-1", "websiteurl": None}]) +client.dataframe.update("account", df, id_column="accountid", clear_nulls=True) + +# Delete records by passing a Series of GUIDs +client.dataframe.delete("account", new_accounts["accountid"]) +``` + ### SQL Queries SQL queries are **read-only** and support limited SQL syntax. A single SELECT statement with optional WHERE, TOP (integer literal), ORDER BY (column names only), and a simple table alias after FROM is supported. But JOIN and subqueries may not be. Refer to the Dataverse documentation for the current feature set. 
diff --git a/src/PowerPlatform/Dataverse/client.py b/src/PowerPlatform/Dataverse/client.py index 38fefd39..402be881 100644 --- a/src/PowerPlatform/Dataverse/client.py +++ b/src/PowerPlatform/Dataverse/client.py @@ -14,6 +14,7 @@ from .core._auth import _AuthManager from .core.config import DataverseConfig from .data._odata import _ODataClient +from .operations.dataframe import DataFrameOperations from .operations.records import RecordOperations from .operations.query import QueryOperations from .operations.files import FileOperations @@ -60,6 +61,7 @@ class DataverseClient: - ``client.query`` -- query and search operations - ``client.tables`` -- table and column metadata management - ``client.files`` -- file upload operations + - ``client.dataframe`` -- pandas DataFrame wrappers for record CRUD The client supports Python's context manager protocol for automatic resource cleanup and HTTP connection pooling: @@ -106,6 +108,7 @@ def __init__( self.query = QueryOperations(self) self.tables = TableOperations(self) self.files = FileOperations(self) + self.dataframe = DataFrameOperations(self) def _get_odata(self) -> _ODataClient: """ diff --git a/src/PowerPlatform/Dataverse/data/_odata.py b/src/PowerPlatform/Dataverse/data/_odata.py index d05cd424..633c736f 100644 --- a/src/PowerPlatform/Dataverse/data/_odata.py +++ b/src/PowerPlatform/Dataverse/data/_odata.py @@ -960,7 +960,7 @@ def _get_entity_by_table_schema_name( logical_lower = table_schema_name.lower() logical_escaped = self._escape_odata_quotes(logical_lower) params = { - "$select": "MetadataId,LogicalName,SchemaName,EntitySetName", + "$select": "MetadataId,LogicalName,SchemaName,EntitySetName,PrimaryNameAttribute,PrimaryIdAttribute", "$filter": f"LogicalName eq '{logical_escaped}'", } r = self._request("get", url, params=params, headers=headers) @@ -1445,6 +1445,8 @@ def _get_table_info(self, table_schema_name: str) -> Optional[Dict[str, Any]]: "table_logical_name": ent.get("LogicalName"), "entity_set_name": 
ent.get("EntitySetName"), "metadata_id": ent.get("MetadataId"), + "primary_name_attribute": ent.get("PrimaryNameAttribute"), + "primary_id_attribute": ent.get("PrimaryIdAttribute"), "columns_created": [], } @@ -1689,6 +1691,8 @@ def _create_table( "table_logical_name": metadata.get("LogicalName"), "entity_set_name": metadata.get("EntitySetName"), "metadata_id": metadata.get("MetadataId"), + "primary_name_attribute": metadata.get("PrimaryNameAttribute"), + "primary_id_attribute": metadata.get("PrimaryIdAttribute"), "columns_created": created_cols, } diff --git a/src/PowerPlatform/Dataverse/models/table_info.py b/src/PowerPlatform/Dataverse/models/table_info.py index 34b1d383..842fe2b6 100644 --- a/src/PowerPlatform/Dataverse/models/table_info.py +++ b/src/PowerPlatform/Dataverse/models/table_info.py @@ -112,6 +112,8 @@ class TableInfo: logical_name: str = "" entity_set_name: str = "" metadata_id: str = "" + primary_name_attribute: Optional[str] = None + primary_id_attribute: Optional[str] = None display_name: Optional[str] = None description: Optional[str] = None columns: Optional[List[ColumnInfo]] = field(default=None, repr=False) @@ -123,6 +125,8 @@ class TableInfo: "table_logical_name": "logical_name", "entity_set_name": "entity_set_name", "metadata_id": "metadata_id", + "primary_name_attribute": "primary_name_attribute", + "primary_id_attribute": "primary_id_attribute", "columns_created": "columns_created", } @@ -187,6 +191,8 @@ def from_dict(cls, data: Dict[str, Any]) -> TableInfo: logical_name=data.get("table_logical_name", ""), entity_set_name=data.get("entity_set_name", ""), metadata_id=data.get("metadata_id", ""), + primary_name_attribute=data.get("primary_name_attribute"), + primary_id_attribute=data.get("primary_id_attribute"), columns_created=data.get("columns_created"), ) @@ -213,6 +219,8 @@ def from_api_response(cls, response_data: Dict[str, Any]) -> TableInfo: logical_name=response_data.get("LogicalName", ""), 
entity_set_name=response_data.get("EntitySetName", ""), metadata_id=response_data.get("MetadataId", ""), + primary_name_attribute=response_data.get("PrimaryNameAttribute"), + primary_id_attribute=response_data.get("PrimaryIdAttribute"), display_name=display_name, description=description, ) diff --git a/src/PowerPlatform/Dataverse/operations/__init__.py b/src/PowerPlatform/Dataverse/operations/__init__.py index bfe7386f..19c8a9e5 100644 --- a/src/PowerPlatform/Dataverse/operations/__init__.py +++ b/src/PowerPlatform/Dataverse/operations/__init__.py @@ -8,4 +8,6 @@ SDK operations into logical groups: records, query, and tables. """ -__all__ = [] +from typing import List + +__all__: List[str] = [] diff --git a/src/PowerPlatform/Dataverse/operations/dataframe.py b/src/PowerPlatform/Dataverse/operations/dataframe.py new file mode 100644 index 00000000..3aec0cc7 --- /dev/null +++ b/src/PowerPlatform/Dataverse/operations/dataframe.py @@ -0,0 +1,358 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""DataFrame CRUD operations namespace for the Dataverse SDK.""" + +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Dict, List, Optional + +import pandas as pd + +from ..utils._pandas import dataframe_to_records + +if TYPE_CHECKING: + from ..client import DataverseClient + + +__all__ = ["DataFrameOperations"] + + +class DataFrameOperations: + """Namespace for pandas DataFrame CRUD operations. + + Accessed via ``client.dataframe``. Provides DataFrame-oriented wrappers + around the record-level CRUD operations. + + :param client: The parent :class:`~PowerPlatform.Dataverse.client.DataverseClient` instance. 
+ :type client: ~PowerPlatform.Dataverse.client.DataverseClient + + Example:: + + import pandas as pd + + client = DataverseClient(base_url, credential) + + # Query records as a DataFrame + df = client.dataframe.get("account", select=["name"], top=100) + + # Create records from a DataFrame + new_df = pd.DataFrame([{"name": "Contoso"}, {"name": "Fabrikam"}]) + new_df["accountid"] = client.dataframe.create("account", new_df) + + # Update records + new_df["telephone1"] = ["555-0100", "555-0200"] + client.dataframe.update("account", new_df, id_column="accountid") + + # Delete records + client.dataframe.delete("account", new_df["accountid"]) + """ + + def __init__(self, client: DataverseClient) -> None: + self._client = client + + # -------------------------------------------------------------------- get + + def get( + self, + table: str, + record_id: Optional[str] = None, + select: Optional[List[str]] = None, + filter: Optional[str] = None, + orderby: Optional[List[str]] = None, + top: Optional[int] = None, + expand: Optional[List[str]] = None, + page_size: Optional[int] = None, + ) -> pd.DataFrame: + """Fetch records and return as a single pandas DataFrame. + + When ``record_id`` is provided, returns a single-row DataFrame. + When ``record_id`` is None, internally iterates all pages and returns one + consolidated DataFrame. + + :param table: Schema name of the table (e.g. ``"account"`` or ``"new_MyTestTable"``). + :type table: :class:`str` + :param record_id: Optional GUID to fetch a specific record. If None, queries multiple records. + :type record_id: :class:`str` or None + :param select: Optional list of attribute logical names to retrieve. + :type select: :class:`list` of :class:`str` or None + :param filter: Optional OData filter string. Column names must use exact lowercase logical names. + :type filter: :class:`str` or None + :param orderby: Optional list of attributes to sort by. 
+ :type orderby: :class:`list` of :class:`str` or None + :param top: Optional maximum number of records to return. + :type top: :class:`int` or None + :param expand: Optional list of navigation properties to expand (case-sensitive). + :type expand: :class:`list` of :class:`str` or None + :param page_size: Optional number of records per page for pagination. + :type page_size: :class:`int` or None + + :return: DataFrame containing all matching records. Returns an empty DataFrame + when no records match. + :rtype: ~pandas.DataFrame + + :raises ValueError: If ``record_id`` is not a non-empty string, or if + query parameters (``filter``, ``orderby``, ``top``, ``expand``, + ``page_size``) are provided alongside ``record_id``. + + .. tip:: + For large tables, use ``top`` or ``filter`` to limit the result set. + + Example: + Fetch a single record as a DataFrame:: + + df = client.dataframe.get("account", record_id=account_id, select=["name", "telephone1"]) + print(df) + + Query with filtering:: + + df = client.dataframe.get("account", filter="statecode eq 0", select=["name"]) + print(f"Got {len(df)} active accounts") + + Limit result size:: + + df = client.dataframe.get("account", select=["name"], top=100) + """ + if record_id is not None: + if not isinstance(record_id, str) or not record_id.strip(): + raise ValueError("record_id must be a non-empty string") + record_id = record_id.strip() + if any(p is not None for p in (filter, orderby, top, expand, page_size)): + raise ValueError( + "Cannot specify query parameters (filter, orderby, top, " + "expand, page_size) when fetching a single record by ID" + ) + result = self._client.records.get( + table, + record_id, + select=select, + ) + return pd.DataFrame([result.data]) + + rows: List[dict] = [] + for batch in self._client.records.get( + table, + select=select, + filter=filter, + orderby=orderby, + top=top, + expand=expand, + page_size=page_size, + ): + rows.extend(row.data for row in batch) + + if not rows: + return 
pd.DataFrame(columns=select) if select else pd.DataFrame() + return pd.DataFrame.from_records(rows) + + # ----------------------------------------------------------------- create + + def create( + self, + table: str, + records: pd.DataFrame, + ) -> pd.Series: + """Create records from a pandas DataFrame. + + :param table: Schema name of the table (e.g. ``"account"`` or ``"new_MyTestTable"``). + :type table: :class:`str` + :param records: DataFrame where each row is a record to create. + :type records: ~pandas.DataFrame + + :return: Series of created record GUIDs, aligned with the input DataFrame index. + :rtype: ~pandas.Series + + :raises TypeError: If ``records`` is not a pandas DataFrame. + :raises ValueError: If ``records`` is empty or the number of returned + IDs does not match the number of input rows. + + .. tip:: + All rows are sent in a single ``CreateMultiple`` request. For very + large DataFrames, consider splitting into smaller batches to avoid + request timeouts. + + Example: + Create records from a DataFrame:: + + import pandas as pd + + df = pd.DataFrame([ + {"name": "Contoso", "telephone1": "555-0100"}, + {"name": "Fabrikam", "telephone1": "555-0200"}, + ]) + df["accountid"] = client.dataframe.create("account", df) + """ + if not isinstance(records, pd.DataFrame): + raise TypeError("records must be a pandas DataFrame") + + if records.empty: + raise ValueError("records must be a non-empty DataFrame") + + record_list = dataframe_to_records(records) + + # Detect rows where all values were NaN/None (empty dicts after normalization) + empty_rows = [records.index[i] for i, r in enumerate(record_list) if not r] + if empty_rows: + raise ValueError( + f"Records at index(es) {empty_rows} have no non-null values. " + "All rows must contain at least one field to create." 
+ ) + + ids = self._client.records.create(table, record_list) + + if len(ids) != len(records): + raise ValueError(f"Server returned {len(ids)} IDs for {len(records)} input rows") + + return pd.Series(ids, index=records.index) + + # ----------------------------------------------------------------- update + + def update( + self, + table: str, + changes: pd.DataFrame, + id_column: str, + clear_nulls: bool = False, + ) -> None: + """Update records from a pandas DataFrame. + + Each row in the DataFrame represents an update. The ``id_column`` specifies which + column contains the record GUIDs. + + :param table: Schema name of the table (e.g. ``"account"`` or ``"new_MyTestTable"``). + :type table: :class:`str` + :param changes: DataFrame where each row contains a record GUID and the fields to update. + :type changes: ~pandas.DataFrame + :param id_column: Name of the DataFrame column containing record GUIDs. + :type id_column: :class:`str` + :param clear_nulls: When ``False`` (default), missing values (NaN/None) are skipped + (the field is left unchanged on the server). When ``True``, missing values are sent + as ``null`` to Dataverse, clearing the field. Use ``True`` only when you intentionally + want NaN/None values to clear fields. + :type clear_nulls: :class:`bool` + + :raises TypeError: If ``changes`` is not a pandas DataFrame. + :raises ValueError: If ``changes`` is empty, ``id_column`` is not found in the + DataFrame, ``id_column`` contains invalid (non-string, empty, or whitespace-only) + values, or no updatable columns exist besides ``id_column``. + When ``clear_nulls`` is ``False`` (default), rows where all change values + are NaN/None produce empty patches and are silently skipped. If all rows + are skipped, the method returns without making an API call. When + ``clear_nulls`` is ``True``, NaN/None values become explicit nulls, so + rows are never skipped. + + .. tip:: + All rows are sent in a single ``UpdateMultiple`` request (or a + single PATCH for one row). 
For very large DataFrames, consider + splitting into smaller batches to avoid request timeouts. + + Example: + Update records with different values per row:: + + import pandas as pd + + df = pd.DataFrame([ + {"accountid": "guid-1", "telephone1": "555-0100"}, + {"accountid": "guid-2", "telephone1": "555-0200"}, + ]) + client.dataframe.update("account", df, id_column="accountid") + + Broadcast the same change to all records:: + + df = pd.DataFrame({"accountid": ["guid-1", "guid-2", "guid-3"]}) + df["websiteurl"] = "https://example.com" + client.dataframe.update("account", df, id_column="accountid") + + Clear a field by setting clear_nulls=True:: + + df = pd.DataFrame([{"accountid": "guid-1", "websiteurl": None}]) + client.dataframe.update("account", df, id_column="accountid", clear_nulls=True) + """ + if not isinstance(changes, pd.DataFrame): + raise TypeError("changes must be a pandas DataFrame") + if changes.empty: + raise ValueError("changes must be a non-empty DataFrame") + if id_column not in changes.columns: + raise ValueError(f"id_column '{id_column}' not found in DataFrame columns") + + raw_ids = changes[id_column].tolist() + invalid = [changes.index[i] for i, v in enumerate(raw_ids) if not isinstance(v, str) or not v.strip()] + if invalid: + raise ValueError( + f"id_column '{id_column}' contains invalid values at row index(es) {invalid}. " + "All IDs must be non-empty strings." + ) + ids = [v.strip() for v in raw_ids] + + change_columns = [column for column in changes.columns if column != id_column] + if not change_columns: + raise ValueError( + "No columns to update. The DataFrame must contain at least one column besides the id_column." 
+ ) + change_list = dataframe_to_records(changes[change_columns], na_as_null=clear_nulls) + + # Filter out rows where all change values were NaN/None (empty dicts) + paired = [(rid, patch) for rid, patch in zip(ids, change_list) if patch] + if not paired: + return + ids_filtered: List[str] = [p[0] for p in paired] + change_filtered: List[Dict[str, Any]] = [p[1] for p in paired] + + if len(ids_filtered) == 1: + self._client.records.update(table, ids_filtered[0], change_filtered[0]) + else: + self._client.records.update(table, ids_filtered, change_filtered) + + # ----------------------------------------------------------------- delete + + def delete( + self, + table: str, + ids: pd.Series, + use_bulk_delete: bool = True, + ) -> Optional[str]: + """Delete records by passing a pandas Series of GUIDs. + + :param table: Schema name of the table (e.g. ``"account"`` or ``"new_MyTestTable"``). + :type table: :class:`str` + :param ids: Series of record GUIDs to delete. + :type ids: ~pandas.Series + :param use_bulk_delete: When ``True`` (default) and ``ids`` contains multiple values, execute the BulkDelete + action and return its async job identifier. When ``False`` each record is deleted sequentially. + :type use_bulk_delete: :class:`bool` + + :raises TypeError: If ``ids`` is not a pandas Series. + :raises ValueError: If ``ids`` contains invalid (non-string, empty, or + whitespace-only) values. + + :return: BulkDelete job ID when deleting multiple records via BulkDelete; + ``None`` when deleting a single record, using sequential deletion, or + when ``ids`` is empty. 
+ :rtype: :class:`str` or None + + Example: + Delete records using a Series:: + + import pandas as pd + + ids = pd.Series(["guid-1", "guid-2", "guid-3"]) + client.dataframe.delete("account", ids) + """ + if not isinstance(ids, pd.Series): + raise TypeError("ids must be a pandas Series") + + raw_list = ids.tolist() + if not raw_list: + return None + + invalid = [ids.index[i] for i, v in enumerate(raw_list) if not isinstance(v, str) or not v.strip()] + if invalid: + raise ValueError( + f"ids Series contains invalid values at index(es) {invalid}. " f"All IDs must be non-empty strings." + ) + id_list = [v.strip() for v in raw_list] + + if len(id_list) == 1: + self._client.records.delete(table, id_list[0]) + return None + return self._client.records.delete(table, id_list, use_bulk_delete=use_bulk_delete) diff --git a/src/PowerPlatform/Dataverse/utils/_pandas.py b/src/PowerPlatform/Dataverse/utils/_pandas.py new file mode 100644 index 00000000..4ebd01a7 --- /dev/null +++ b/src/PowerPlatform/Dataverse/utils/_pandas.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Internal pandas helpers""" + +from __future__ import annotations + +import datetime +from typing import Any, Dict, List + +import numpy as np +import pandas as pd + + +def _normalize_scalar(v: Any) -> Any: + """Convert numpy scalar types to their Python native equivalents. + + :param v: A scalar value to normalize. + :return: The value converted to a JSON-serializable Python type. 
+ """ + if isinstance(v, pd.Timestamp): + return v.isoformat() + if isinstance(v, (datetime.datetime, datetime.date)): + return v.isoformat() + if isinstance(v, np.datetime64): + return pd.Timestamp(v).isoformat() + if isinstance(v, np.integer): + return int(v) + if isinstance(v, np.floating): + return float(v) + if isinstance(v, np.bool_): + return bool(v) + return v + + +def dataframe_to_records(df: pd.DataFrame, na_as_null: bool = False) -> List[Dict[str, Any]]: + """Convert a DataFrame to a list of dicts, normalizing values for JSON serialization. + + :param df: Input DataFrame. + :param na_as_null: When False (default), missing values are omitted from each dict. + When True, missing values are included as None (sends null to Dataverse, clearing the field). + """ + records = [] + for row in df.to_dict(orient="records"): + clean = {} + for k, v in row.items(): + if pd.api.types.is_scalar(v): + if pd.notna(v): + clean[k] = _normalize_scalar(v) + elif na_as_null: + clean[k] = None + else: + # Convert np.ndarray to list for JSON serialization; + # pass through lists, dicts, etc. as-is. 
+ if isinstance(v, np.ndarray): + clean[k] = v.tolist() + else: + clean[k] = v + records.append(clean) + return records diff --git a/tests/unit/models/test_table_info.py b/tests/unit/models/test_table_info.py index 0e0754e7..b3da07fa 100644 --- a/tests/unit/models/test_table_info.py +++ b/tests/unit/models/test_table_info.py @@ -49,11 +49,19 @@ def test_legacy_key_iteration(self): keys = list(self.info) self.assertEqual( keys, - ["table_schema_name", "table_logical_name", "entity_set_name", "metadata_id", "columns_created"], + [ + "table_schema_name", + "table_logical_name", + "entity_set_name", + "metadata_id", + "primary_name_attribute", + "primary_id_attribute", + "columns_created", + ], ) def test_len(self): - self.assertEqual(len(self.info), 5) + self.assertEqual(len(self.info), 7) def test_keys_values_items(self): self.assertEqual(list(self.info.keys()), list(self.info._LEGACY_KEY_MAP.keys())) @@ -76,6 +84,8 @@ def test_from_dict(self): "table_logical_name": "new_product", "entity_set_name": "new_products", "metadata_id": "meta-guid-1", + "primary_name_attribute": "new_name", + "primary_id_attribute": "new_productid", "columns_created": ["new_Price"], } info = TableInfo.from_dict(data) @@ -83,13 +93,32 @@ def test_from_dict(self): self.assertEqual(info.logical_name, "new_product") self.assertEqual(info.entity_set_name, "new_products") self.assertEqual(info.metadata_id, "meta-guid-1") + self.assertEqual(info.primary_name_attribute, "new_name") + self.assertEqual(info.primary_id_attribute, "new_productid") self.assertEqual(info.columns_created, ["new_Price"]) def test_from_dict_missing_keys(self): info = TableInfo.from_dict({}) self.assertEqual(info.schema_name, "") + self.assertIsNone(info.primary_name_attribute) + self.assertIsNone(info.primary_id_attribute) self.assertIsNone(info.columns_created) + def test_from_dict_legacy_access_primary_fields(self): + """Primary fields are accessible via legacy dict-key access.""" + data = { + "table_schema_name": 
"new_Product", + "table_logical_name": "new_product", + "entity_set_name": "new_products", + "metadata_id": "meta-guid-1", + "primary_name_attribute": "new_name", + "primary_id_attribute": "new_productid", + "columns_created": [], + } + info = TableInfo.from_dict(data) + self.assertEqual(info["primary_name_attribute"], "new_name") + self.assertEqual(info["primary_id_attribute"], "new_productid") + class TestTableInfoFromApiResponse(unittest.TestCase): """Tests for TableInfo.from_api_response factory (PascalCase keys).""" @@ -100,6 +129,8 @@ def test_from_api_response(self): "LogicalName": "account", "EntitySetName": "accounts", "MetadataId": "meta-guid-2", + "PrimaryNameAttribute": "name", + "PrimaryIdAttribute": "accountid", "DisplayName": {"UserLocalizedLabel": {"Label": "Account", "LanguageCode": 1033}}, "Description": {"UserLocalizedLabel": {"Label": "Business account", "LanguageCode": 1033}}, } @@ -108,6 +139,8 @@ def test_from_api_response(self): self.assertEqual(info.logical_name, "account") self.assertEqual(info.entity_set_name, "accounts") self.assertEqual(info.metadata_id, "meta-guid-2") + self.assertEqual(info.primary_name_attribute, "name") + self.assertEqual(info.primary_id_attribute, "accountid") self.assertEqual(info.display_name, "Account") self.assertEqual(info.description, "Business account") diff --git a/tests/unit/test_client_dataframe.py b/tests/unit/test_client_dataframe.py new file mode 100644 index 00000000..d19419ac --- /dev/null +++ b/tests/unit/test_client_dataframe.py @@ -0,0 +1,365 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import unittest +from unittest.mock import MagicMock + +import pandas as pd +from azure.core.credentials import TokenCredential + +from PowerPlatform.Dataverse.client import DataverseClient + + +class TestDataFrameGet(unittest.TestCase): + """Tests for client.dataframe.get().""" + + def setUp(self): + self.mock_credential = MagicMock(spec=TokenCredential) + self.base_url = "https://example.crm.dynamics.com" + self.client = DataverseClient(self.base_url, self.mock_credential) + self.client._odata = MagicMock() + + def test_get_single_record(self): + """Single record_id returns a one-row DataFrame.""" + expected = {"accountid": "guid-1", "name": "Contoso"} + self.client._odata._get.return_value = expected + + df = self.client.dataframe.get("account", record_id="guid-1") + + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(len(df), 1) + self.assertEqual(df.iloc[0]["name"], "Contoso") + self.client._odata._get.assert_called_once_with("account", "guid-1", select=None) + + def test_get_single_record_with_select(self): + """Single record with select columns.""" + expected = {"accountid": "guid-1", "name": "Contoso"} + self.client._odata._get.return_value = expected + + df = self.client.dataframe.get("account", record_id="guid-1", select=["name"]) + + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(len(df), 1) + self.client._odata._get.assert_called_once_with("account", "guid-1", select=["name"]) + + def test_get_multiple_records_single_page(self): + """Single page returns a DataFrame with all rows.""" + batch = [ + {"accountid": "guid-1", "name": "A"}, + {"accountid": "guid-2", "name": "B"}, + ] + self.client._odata._get_multiple.return_value = iter([batch]) + + df = self.client.dataframe.get("account", filter="statecode eq 0") + + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(len(df), 2) + self.assertListEqual(df["name"].tolist(), ["A", "B"]) + + def test_get_multiple_records_multi_page(self): + """Multiple pages are concatenated into 
a single DataFrame.""" + page1 = [{"accountid": "guid-1", "name": "A"}] + page2 = [{"accountid": "guid-2", "name": "B"}] + self.client._odata._get_multiple.return_value = iter([page1, page2]) + + df = self.client.dataframe.get("account", top=100) + + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(len(df), 2) + self.assertEqual(df.iloc[0]["name"], "A") + self.assertEqual(df.iloc[1]["name"], "B") + + def test_get_index_is_reset(self): + """Returned DataFrame has a clean 0-based integer index.""" + page1 = [{"accountid": "guid-1", "name": "A"}] + page2 = [{"accountid": "guid-2", "name": "B"}] + self.client._odata._get_multiple.return_value = iter([page1, page2]) + + df = self.client.dataframe.get("account", top=100) + + self.assertListEqual(list(df.index), [0, 1]) + + def test_get_empty_result(self): + """Empty result set returns an empty DataFrame.""" + self.client._odata._get_multiple.return_value = iter([]) + + df = self.client.dataframe.get("account") + + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(len(df), 0) + + def test_get_passes_all_parameters(self): + """All query parameters are forwarded to the underlying get method.""" + self.client._odata._get_multiple.return_value = iter([]) + + self.client.dataframe.get( + "account", + select=["name"], + filter="statecode eq 0", + orderby=["name asc"], + top=50, + expand=["primarycontactid"], + page_size=25, + ) + + self.client._odata._get_multiple.assert_called_once_with( + "account", + select=["name"], + filter="statecode eq 0", + orderby=["name asc"], + top=50, + expand=["primarycontactid"], + page_size=25, + ) + + +class TestDataFrameCreate(unittest.TestCase): + """Tests for client.dataframe.create().""" + + def setUp(self): + self.mock_credential = MagicMock(spec=TokenCredential) + self.base_url = "https://example.crm.dynamics.com" + self.client = DataverseClient(self.base_url, self.mock_credential) + self.client._odata = MagicMock() + + def test_create_dataframe(self): + """DataFrame 
rows are converted to dicts and returned IDs are a Series.""" + df = pd.DataFrame( + [ + {"name": "Contoso", "telephone1": "555-0100"}, + {"name": "Fabrikam", "telephone1": "555-0200"}, + ] + ) + self.client._odata._create_multiple.return_value = ["guid-1", "guid-2"] + self.client._odata._entity_set_from_schema_name.return_value = "accounts" + + ids = self.client.dataframe.create("account", df) + + self.assertIsInstance(ids, pd.Series) + self.assertListEqual(ids.tolist(), ["guid-1", "guid-2"]) + call_args = self.client._odata._create_multiple.call_args + records_arg = call_args[0][2] + self.assertEqual(len(records_arg), 2) + self.assertEqual(records_arg[0]["name"], "Contoso") + self.assertEqual(records_arg[1]["name"], "Fabrikam") + + def test_create_assigns_to_column(self): + """Returned Series can be assigned directly as a DataFrame column.""" + df = pd.DataFrame([{"name": "Contoso"}, {"name": "Fabrikam"}]) + self.client._odata._create_multiple.return_value = ["guid-1", "guid-2"] + self.client._odata._entity_set_from_schema_name.return_value = "accounts" + + df["accountid"] = self.client.dataframe.create("account", df) + + self.assertListEqual(df["accountid"].tolist(), ["guid-1", "guid-2"]) + + def test_create_single_row_dataframe(self): + """Single-row DataFrame returns a single-element Series.""" + df = pd.DataFrame([{"name": "Contoso"}]) + self.client._odata._create_multiple.return_value = ["guid-1"] + self.client._odata._entity_set_from_schema_name.return_value = "accounts" + + ids = self.client.dataframe.create("account", df) + + self.assertIsInstance(ids, pd.Series) + self.assertEqual(ids.iloc[0], "guid-1") + + def test_create_rejects_non_dataframe(self): + """Non-DataFrame input raises TypeError.""" + with self.assertRaises(TypeError) as ctx: + self.client.dataframe.create("account", [{"name": "Contoso"}]) + self.assertIn("pandas DataFrame", str(ctx.exception)) + + def test_create_empty_dataframe_raises(self): + """Empty DataFrame raises ValueError.""" + df 
= pd.DataFrame(columns=["name", "telephone1"]) + + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.create("account", df) + self.assertIn("non-empty DataFrame", str(ctx.exception)) + self.client._odata._create_multiple.assert_not_called() + + def test_create_length_mismatch_raises(self): + """ValueError raised when returned IDs don't match input row count.""" + df = pd.DataFrame([{"name": "Contoso"}, {"name": "Fabrikam"}]) + self.client._odata._create_multiple.return_value = ["guid-1"] + self.client._odata._entity_set_from_schema_name.return_value = "accounts" + + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.create("account", df) + self.assertIn("1 IDs for 2 input rows", str(ctx.exception)) + + def test_create_drops_nan_values(self): + """NaN/None values are omitted from the create payload.""" + df = pd.DataFrame( + [ + {"name": "Contoso", "telephone1": "555-0100"}, + {"name": "Fabrikam", "telephone1": None}, + ] + ) + self.client._odata._create_multiple.return_value = ["guid-1", "guid-2"] + self.client._odata._entity_set_from_schema_name.return_value = "accounts" + + self.client.dataframe.create("account", df) + + call_args = self.client._odata._create_multiple.call_args + records_arg = call_args[0][2] + self.assertEqual(records_arg[0], {"name": "Contoso", "telephone1": "555-0100"}) + self.assertEqual(records_arg[1], {"name": "Fabrikam"}) + self.assertNotIn("telephone1", records_arg[1]) + + def test_create_converts_timestamps_to_iso(self): + """Timestamp values are converted to ISO 8601 strings.""" + ts = pd.Timestamp("2024-01-15 10:30:00") + df = pd.DataFrame([{"name": "Contoso", "createdon": ts}]) + self.client._odata._create_multiple.return_value = ["guid-1"] + self.client._odata._entity_set_from_schema_name.return_value = "accounts" + + self.client.dataframe.create("account", df) + + call_args = self.client._odata._create_multiple.call_args + records_arg = call_args[0][2] + self.assertEqual(records_arg[0]["createdon"], 
"2024-01-15T10:30:00") + + +class TestDataFrameUpdate(unittest.TestCase): + """Tests for client.dataframe.update().""" + + def setUp(self): + self.mock_credential = MagicMock(spec=TokenCredential) + self.base_url = "https://example.crm.dynamics.com" + self.client = DataverseClient(self.base_url, self.mock_credential) + self.client._odata = MagicMock() + + def test_update_dataframe(self): + """DataFrame rows are split into IDs and changes, then passed to update.""" + df = pd.DataFrame( + [ + {"accountid": "guid-1", "telephone1": "555-0100"}, + {"accountid": "guid-2", "telephone1": "555-0200"}, + ] + ) + + self.client.dataframe.update("account", df, id_column="accountid") + + self.client._odata._update_by_ids.assert_called_once() + call_args = self.client._odata._update_by_ids.call_args[0] + self.assertEqual(call_args[0], "account") + self.assertEqual(call_args[1], ["guid-1", "guid-2"]) + self.assertEqual(call_args[2], [{"telephone1": "555-0100"}, {"telephone1": "555-0200"}]) + + def test_update_rejects_non_dataframe(self): + """Non-DataFrame input raises TypeError.""" + with self.assertRaises(TypeError) as ctx: + self.client.dataframe.update("account", {"id": "guid-1"}, id_column="id") + self.assertIn("pandas DataFrame", str(ctx.exception)) + + def test_update_rejects_missing_id_column(self): + """Missing id_column raises ValueError.""" + df = pd.DataFrame([{"name": "Contoso"}]) + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.update("account", df, id_column="accountid") + self.assertIn("accountid", str(ctx.exception)) + + def test_update_multiple_change_columns(self): + """Multiple change columns are all included in the update payload (single row uses _update).""" + df = pd.DataFrame( + [ + {"accountid": "guid-1", "name": "New Name", "telephone1": "555-0100"}, + ] + ) + + self.client.dataframe.update("account", df, id_column="accountid") + + self.client._odata._update.assert_called_once() + call_args = self.client._odata._update.call_args[0] + 
self.assertEqual(call_args[0], "account") + self.assertEqual(call_args[1], "guid-1") + changes = call_args[2] + self.assertIn("name", changes) + self.assertIn("telephone1", changes) + self.assertNotIn("accountid", changes) + + def test_update_skips_nan_by_default(self): + """NaN/None values are skipped by default (field left unchanged on server).""" + df = pd.DataFrame( + [ + {"accountid": "guid-1", "name": "New Name", "telephone1": None}, + {"accountid": "guid-2", "name": None, "telephone1": "555-0200"}, + ] + ) + + self.client.dataframe.update("account", df, id_column="accountid") + + call_args = self.client._odata._update_by_ids.call_args[0] + changes = call_args[2] + self.assertEqual(changes[0], {"name": "New Name"}) + self.assertEqual(changes[1], {"telephone1": "555-0200"}) + + def test_update_clear_nulls_sends_none(self): + """With clear_nulls=True, NaN/None values are sent as None to clear fields.""" + df = pd.DataFrame( + [ + {"accountid": "guid-1", "name": "New Name", "telephone1": None}, + {"accountid": "guid-2", "name": None, "telephone1": "555-0200"}, + ] + ) + + self.client.dataframe.update("account", df, id_column="accountid", clear_nulls=True) + + call_args = self.client._odata._update_by_ids.call_args[0] + changes = call_args[2] + self.assertEqual(changes[0], {"name": "New Name", "telephone1": None}) + self.assertEqual(changes[1], {"name": None, "telephone1": "555-0200"}) + + +class TestDataFrameDelete(unittest.TestCase): + """Tests for client.dataframe.delete().""" + + def setUp(self): + self.mock_credential = MagicMock(spec=TokenCredential) + self.base_url = "https://example.crm.dynamics.com" + self.client = DataverseClient(self.base_url, self.mock_credential) + self.client._odata = MagicMock() + + def test_delete_dataframe_bulk(self): + """Series of GUIDs passed to bulk delete.""" + ids = pd.Series(["guid-1", "guid-2", "guid-3"]) + self.client._odata._delete_multiple.return_value = "job-123" + + job_id = self.client.dataframe.delete("account", 
ids) + + self.assertEqual(job_id, "job-123") + self.client._odata._delete_multiple.assert_called_once_with("account", ["guid-1", "guid-2", "guid-3"]) + + def test_delete_from_dataframe_column(self): + """Series extracted from a DataFrame column works directly.""" + df = pd.DataFrame({"accountid": ["guid-1", "guid-2"], "name": ["A", "B"]}) + self.client._odata._delete_multiple.return_value = "job-123" + + self.client.dataframe.delete("account", df["accountid"]) + + self.client._odata._delete_multiple.assert_called_once_with("account", ["guid-1", "guid-2"]) + + def test_delete_dataframe_sequential(self): + """use_bulk_delete=False deletes records sequentially.""" + ids = pd.Series(["guid-1", "guid-2"]) + + result = self.client.dataframe.delete("account", ids, use_bulk_delete=False) + + self.assertIsNone(result) + self.assertEqual(self.client._odata._delete.call_count, 2) + + def test_delete_rejects_non_series(self): + """Non-Series input raises TypeError.""" + with self.assertRaises(TypeError) as ctx: + self.client.dataframe.delete("account", ["guid-1"]) + self.assertIn("pandas Series", str(ctx.exception)) + + def test_delete_empty_series(self): + """Empty Series returns None without calling delete.""" + ids = pd.Series([], dtype="str") + + result = self.client.dataframe.delete("account", ids) + + self.assertIsNone(result) diff --git a/tests/unit/test_dataframe_operations.py b/tests/unit/test_dataframe_operations.py new file mode 100644 index 00000000..7931c3f5 --- /dev/null +++ b/tests/unit/test_dataframe_operations.py @@ -0,0 +1,495 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +"""Comprehensive unit tests for the DataFrameOperations namespace (client.dataframe).""" + +import unittest +from unittest.mock import MagicMock + +import numpy as np +import pandas as pd +from azure.core.credentials import TokenCredential + +from PowerPlatform.Dataverse.client import DataverseClient +from PowerPlatform.Dataverse.operations.dataframe import DataFrameOperations + + +class TestDataFrameOperationsNamespace(unittest.TestCase): + """Tests for the DataFrameOperations namespace itself.""" + + def setUp(self): + self.mock_credential = MagicMock(spec=TokenCredential) + self.client = DataverseClient("https://example.crm.dynamics.com", self.mock_credential) + self.client._odata = MagicMock() + + def test_namespace_exists(self): + """client.dataframe is a DataFrameOperations instance.""" + self.assertIsInstance(self.client.dataframe, DataFrameOperations) + + +class TestDataFrameGet(unittest.TestCase): + """Tests for client.dataframe.get().""" + + def setUp(self): + self.mock_credential = MagicMock(spec=TokenCredential) + self.client = DataverseClient("https://example.crm.dynamics.com", self.mock_credential) + self.client._odata = MagicMock() + + def test_get_single_record(self): + """record_id returns a one-row DataFrame using result.data.""" + self.client._odata._get.return_value = {"accountid": "guid-1", "name": "Contoso"} + df = self.client.dataframe.get("account", record_id="guid-1") + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(len(df), 1) + self.assertEqual(df.iloc[0]["name"], "Contoso") + + def test_get_multiple_records(self): + """Without record_id, pages are iterated and consolidated into one DataFrame.""" + page1 = [{"accountid": "guid-1", "name": "A"}] + page2 = [{"accountid": "guid-2", "name": "B"}] + self.client._odata._get_multiple.return_value = iter([page1, page2]) + df = self.client.dataframe.get("account") + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(len(df), 2) + + def test_get_no_results(self): + """Empty 
result set returns an empty DataFrame.""" + self.client._odata._get_multiple.return_value = iter([]) + df = self.client.dataframe.get("account") + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(len(df), 0) + + def test_get_no_results_with_select_preserves_columns(self): + """Empty result with select returns DataFrame with expected columns.""" + self.client._odata._get_multiple.return_value = iter([]) + df = self.client.dataframe.get("account", select=["name", "telephone1"]) + self.assertIsInstance(df, pd.DataFrame) + self.assertEqual(len(df), 0) + self.assertListEqual(list(df.columns), ["name", "telephone1"]) + + # NOTE(review): this test exercises create(), not get(); consider moving it to TestDataFrameCreate. + def test_create_all_nan_rows_raises(self): + """DataFrame where all values are NaN raises ValueError.""" + df = pd.DataFrame([{"name": None, "phone": None}]) + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.create("account", df) + self.assertIn("no non-null values", str(ctx.exception)) + + def test_get_passes_all_params(self): + """All OData parameters are forwarded to the underlying API call.""" + self.client._odata._get_multiple.return_value = iter([]) + self.client.dataframe.get( + "account", + select=["name"], + filter="statecode eq 0", + orderby=["name asc"], + top=50, + expand=["primarycontactid"], + page_size=25, + ) + self.client._odata._get_multiple.assert_called_once_with( + "account", + select=["name"], + filter="statecode eq 0", + orderby=["name asc"], + top=50, + expand=["primarycontactid"], + page_size=25, + ) + + def test_get_record_id_with_query_params_raises(self): + """ValueError raised when record_id is provided with query params.""" + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.get("account", record_id="guid-1", filter="name eq 'X'") + self.assertIn("Cannot specify query parameters", str(ctx.exception)) + + def test_get_record_id_with_top_raises(self): + """ValueError raised when record_id is provided with top.""" + with self.assertRaises(ValueError) as ctx: + 
self.client.dataframe.get("account", record_id="guid-1", top=10) + self.assertIn("Cannot specify query parameters", str(ctx.exception)) + + def test_get_empty_record_id_raises(self): + """ValueError raised when record_id is empty or whitespace.""" + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.get("account", record_id=" ") + self.assertIn("non-empty string", str(ctx.exception)) + + def test_get_record_id_stripped(self): + """Leading/trailing whitespace in record_id is stripped.""" + self.client._odata._get.return_value = {"accountid": "guid-1", "name": "Contoso"} + self.client.dataframe.get("account", record_id=" guid-1 ") + self.client._odata._get.assert_called_once_with("account", "guid-1", select=None) + + +class TestDataFrameCreate(unittest.TestCase): + """Tests for client.dataframe.create().""" + + def setUp(self): + self.mock_credential = MagicMock(spec=TokenCredential) + self.client = DataverseClient("https://example.crm.dynamics.com", self.mock_credential) + self.client._odata = MagicMock() + self.client._odata._entity_set_from_schema_name.return_value = "accounts" + + def test_create_returns_series(self): + """Returns a Series of GUIDs aligned with the input DataFrame index.""" + df = pd.DataFrame([{"name": "Contoso"}, {"name": "Fabrikam"}]) + self.client._odata._create_multiple.return_value = ["guid-1", "guid-2"] + ids = self.client.dataframe.create("account", df) + self.assertIsInstance(ids, pd.Series) + self.assertListEqual(ids.tolist(), ["guid-1", "guid-2"]) + + def test_create_type_error(self): + """Non-DataFrame input raises TypeError.""" + with self.assertRaises(TypeError) as ctx: + self.client.dataframe.create("account", [{"name": "Contoso"}]) + self.assertIn("pandas DataFrame", str(ctx.exception)) + + def test_create_empty_dataframe_raises(self): + """Empty DataFrame raises ValueError without calling the API.""" + df = pd.DataFrame(columns=["name"]) + with self.assertRaises(ValueError) as ctx: + 
self.client.dataframe.create("account", df) + self.assertIn("non-empty", str(ctx.exception)) + self.client._odata._create_multiple.assert_not_called() + + def test_create_id_count_mismatch_raises(self): + """ValueError raised when returned IDs count doesn't match input row count.""" + df = pd.DataFrame([{"name": "Contoso"}, {"name": "Fabrikam"}]) + self.client._odata._create_multiple.return_value = ["guid-1"] + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.create("account", df) + self.assertIn("1 IDs for 2 input rows", str(ctx.exception)) + + def test_create_normalizes_values(self): + """NumPy types and Timestamps are normalized before sending to the API.""" + ts = pd.Timestamp("2024-01-15 10:30:00") + df = pd.DataFrame([{"count": np.int64(5), "score": np.float64(9.8), "createdon": ts}]) + self.client._odata._create_multiple.return_value = ["guid-1"] + self.client.dataframe.create("account", df) + records_arg = self.client._odata._create_multiple.call_args[0][2] + rec = records_arg[0] + self.assertIsInstance(rec["count"], int) + self.assertIsInstance(rec["score"], float) + self.assertEqual(rec["createdon"], "2024-01-15T10:30:00") + + +class TestDataFrameUpdate(unittest.TestCase): + """Tests for client.dataframe.update().""" + + def setUp(self): + self.mock_credential = MagicMock(spec=TokenCredential) + self.client = DataverseClient("https://example.crm.dynamics.com", self.mock_credential) + self.client._odata = MagicMock() + + def test_update_single_record(self): + """Single-row DataFrame calls single-record update path.""" + df = pd.DataFrame([{"accountid": "guid-1", "name": "New Name"}]) + self.client.dataframe.update("account", df, id_column="accountid") + self.client._odata._update.assert_called_once_with("account", "guid-1", {"name": "New Name"}) + + def test_update_multiple_records(self): + """Multi-row DataFrame calls batch update path.""" + df = pd.DataFrame( + [ + {"accountid": "guid-1", "telephone1": "555-0100"}, + {"accountid": 
"guid-2", "telephone1": "555-0200"}, + ] + ) + self.client.dataframe.update("account", df, id_column="accountid") + self.client._odata._update_by_ids.assert_called_once_with( + "account", + ["guid-1", "guid-2"], + [{"telephone1": "555-0100"}, {"telephone1": "555-0200"}], + ) + + def test_update_type_error(self): + """Non-DataFrame input raises TypeError.""" + with self.assertRaises(TypeError) as ctx: + self.client.dataframe.update("account", {"id": "guid-1"}, id_column="id") + self.assertIn("pandas DataFrame", str(ctx.exception)) + + def test_update_missing_id_column(self): + """ValueError raised when id_column is not in DataFrame columns.""" + df = pd.DataFrame([{"name": "Contoso"}]) + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.update("account", df, id_column="accountid") + self.assertIn("accountid", str(ctx.exception)) + + def test_update_invalid_id_values(self): + """ValueError raised when id_column contains NaN or non-string values.""" + df = pd.DataFrame( + [ + {"accountid": "guid-1", "name": "A"}, + {"accountid": None, "name": "B"}, + ] + ) + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.update("account", df, id_column="accountid") + self.assertIn("invalid values", str(ctx.exception)) + self.assertIn("[1]", str(ctx.exception)) + + def test_update_empty_change_columns(self): + """ValueError raised when DataFrame contains only the id_column.""" + df = pd.DataFrame([{"accountid": "guid-1"}]) + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.update("account", df, id_column="accountid") + self.assertIn("No columns to update", str(ctx.exception)) + + def test_update_empty_dataframe_raises(self): + """Empty DataFrame raises ValueError without calling the API.""" + df = pd.DataFrame(columns=["accountid", "name"]) + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.update("account", df, id_column="accountid") + self.assertIn("non-empty", str(ctx.exception)) + 
self.client._odata._update.assert_not_called() + + def test_update_clear_nulls_false(self): + """NaN values are omitted from the update payload when clear_nulls=False.""" + df = pd.DataFrame([{"accountid": "guid-1", "name": "New Name", "telephone1": None}]) + self.client.dataframe.update("account", df, id_column="accountid") + call_args = self.client._odata._update.call_args[0] + changes = call_args[2] + self.assertIn("name", changes) + self.assertNotIn("telephone1", changes) + + def test_update_all_nan_rows_skipped(self): + """When all change values are NaN for every row, no API call is made.""" + df = pd.DataFrame( + [ + {"accountid": "guid-1", "telephone1": None, "websiteurl": None}, + {"accountid": "guid-2", "telephone1": None, "websiteurl": None}, + ] + ) + self.client.dataframe.update("account", df, id_column="accountid") + self.client._odata._update.assert_not_called() + self.client._odata._update_by_ids.assert_not_called() + + def test_update_partial_nan_rows_filtered(self): + """Rows where all changes are NaN are filtered; remaining rows proceed.""" + df = pd.DataFrame( + [ + {"accountid": "guid-1", "name": "Updated", "telephone1": None}, + {"accountid": "guid-2", "name": None, "telephone1": None}, + ] + ) + self.client.dataframe.update("account", df, id_column="accountid") + self.client._odata._update.assert_called_once_with("account", "guid-1", {"name": "Updated"}) + + def test_update_invalid_ids_reports_index_labels(self): + """Error message reports DataFrame index labels, not positional indices.""" + df = pd.DataFrame( + [ + {"accountid": "guid-1", "name": "A"}, + {"accountid": None, "name": "B"}, + ], + index=["row_a", "row_b"], + ) + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.update("account", df, id_column="accountid") + self.assertIn("row_b", str(ctx.exception)) + + def test_update_strips_whitespace_from_ids(self): + """Leading/trailing whitespace in IDs is stripped before API call.""" + df = pd.DataFrame([{"accountid": " 
guid-1 ", "name": "Contoso"}]) + self.client.dataframe.update("account", df, id_column="accountid") + call_args = self.client._odata._update.call_args[0] + self.assertEqual(call_args[1], "guid-1") + + def test_update_clear_nulls_true(self): + """NaN values are sent as None in the update payload when clear_nulls=True.""" + df = pd.DataFrame([{"accountid": "guid-1", "name": "New Name", "telephone1": None}]) + self.client.dataframe.update("account", df, id_column="accountid", clear_nulls=True) + call_args = self.client._odata._update.call_args[0] + changes = call_args[2] + self.assertIn("name", changes) + self.assertIn("telephone1", changes) + self.assertIsNone(changes["telephone1"]) + + +class TestDataFrameDelete(unittest.TestCase): + """Tests for client.dataframe.delete().""" + + def setUp(self): + self.mock_credential = MagicMock(spec=TokenCredential) + self.client = DataverseClient("https://example.crm.dynamics.com", self.mock_credential) + self.client._odata = MagicMock() + + def test_delete_single_record(self): + """Single-element Series calls single-record delete.""" + ids = pd.Series(["guid-1"]) + self.client.dataframe.delete("account", ids) + self.client._odata._delete.assert_called_once_with("account", "guid-1") + + def test_delete_multiple_records(self): + """Multi-element Series calls bulk delete.""" + ids = pd.Series(["guid-1", "guid-2", "guid-3"]) + self.client._odata._delete_multiple.return_value = "job-123" + job_id = self.client.dataframe.delete("account", ids) + self.assertEqual(job_id, "job-123") + self.client._odata._delete_multiple.assert_called_once_with("account", ["guid-1", "guid-2", "guid-3"]) + + def test_delete_type_error(self): + """Non-Series input raises TypeError.""" + with self.assertRaises(TypeError) as ctx: + self.client.dataframe.delete("account", ["guid-1"]) + self.assertIn("pandas Series", str(ctx.exception)) + + def test_delete_empty_series(self): + """Empty Series returns None without calling delete.""" + ids = pd.Series([], 
dtype="str") + result = self.client.dataframe.delete("account", ids) + self.assertIsNone(result) + self.client._odata._delete.assert_not_called() + self.client._odata._delete_multiple.assert_not_called() + + def test_delete_invalid_ids(self): + """ValueError raised when Series contains NaN, whitespace-only, or non-string values.""" + ids = pd.Series(["guid-1", None, " "]) + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.delete("account", ids) + self.assertIn("invalid values", str(ctx.exception)) + + def test_delete_with_bulk_delete_false(self): + """use_bulk_delete=False passes through to the underlying delete call.""" + ids = pd.Series(["guid-1", "guid-2"]) + result = self.client.dataframe.delete("account", ids, use_bulk_delete=False) + self.assertIsNone(result) + self.assertEqual(self.client._odata._delete.call_count, 2) + + def test_delete_invalid_ids_reports_index_labels(self): + """Error message reports Series index labels, not positional indices.""" + ids = pd.Series(["guid-1", None], index=["row_x", "row_y"]) + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.delete("account", ids) + self.assertIn("row_y", str(ctx.exception)) + + def test_delete_strips_whitespace_from_ids(self): + """Leading/trailing whitespace in IDs is stripped before API call.""" + ids = pd.Series([" guid-1 "]) + self.client.dataframe.delete("account", ids) + self.client._odata._delete.assert_called_once_with("account", "guid-1") + + +class TestDataFrameEndToEnd(unittest.TestCase): + """End-to-end mocked flow: create -> get -> update -> delete.""" + + def setUp(self): + self.mock_credential = MagicMock(spec=TokenCredential) + self.client = DataverseClient("https://example.crm.dynamics.com", self.mock_credential) + self.client._odata = MagicMock() + self.client._odata._entity_set_from_schema_name.return_value = "accounts" + + def test_create_get_update_delete_flow(self): + """Full CRUD cycle works end-to-end through the dataframe namespace.""" + # Step 1: create + df = 
pd.DataFrame( + [{"name": "Contoso", "telephone1": "555-0100"}, {"name": "Fabrikam", "telephone1": "555-0200"}] + ) + self.client._odata._create_multiple.return_value = ["guid-1", "guid-2"] + + ids = self.client.dataframe.create("account", df) + + self.assertIsInstance(ids, pd.Series) + self.assertListEqual(ids.tolist(), ["guid-1", "guid-2"]) + + # Step 2: get + df["accountid"] = ids + self.client._odata._get_multiple.return_value = iter( + [[{"accountid": "guid-1", "name": "Contoso"}, {"accountid": "guid-2", "name": "Fabrikam"}]] + ) + + result_df = self.client.dataframe.get("account", select=["accountid", "name"]) + + self.assertIsInstance(result_df, pd.DataFrame) + self.assertEqual(len(result_df), 2) + + # Step 3: update + df["telephone1"] = ["555-9999", "555-8888"] + + self.client.dataframe.update("account", df, id_column="accountid") + + self.client._odata._update_by_ids.assert_called_once() + + # Step 4: delete + self.client._odata._delete_multiple.return_value = "job-abc" + + job_id = self.client.dataframe.delete("account", df["accountid"]) + + self.assertEqual(job_id, "job-abc") + self.client._odata._delete_multiple.assert_called_once_with("account", ["guid-1", "guid-2"]) + + def test_create_normalizes_numpy_types_before_api(self): + """NumPy types in DataFrame cells are normalized to Python types before the API call.""" + df = pd.DataFrame( + [ + { + "count": np.int64(10), + "score": np.float64(9.5), + "active": np.bool_(True), + "createdon": pd.Timestamp("2024-06-01"), + } + ] + ) + self.client._odata._create_multiple.return_value = ["guid-1"] + + self.client.dataframe.create("account", df) + + records_arg = self.client._odata._create_multiple.call_args[0][2] + rec = records_arg[0] + self.assertIsInstance(rec["count"], int) + self.assertIsInstance(rec["score"], float) + self.assertIsInstance(rec["active"], bool) + self.assertIsInstance(rec["createdon"], str) + self.assertEqual(rec["createdon"], "2024-06-01T00:00:00") + + def 
test_get_with_expand_includes_nested_data(self): + """get() with expand returns DataFrame including expanded navigation property data.""" + page = [ + { + "accountid": "guid-1", + "name": "Contoso", + "primarycontactid": {"contactid": "c-1", "fullname": "John"}, + } + ] + self.client._odata._get_multiple.return_value = iter([page]) + df = self.client.dataframe.get("account", expand=["primarycontactid"]) + self.assertEqual(len(df), 1) + self.assertEqual(df.iloc[0]["name"], "Contoso") + self.assertIsInstance(df.iloc[0]["primarycontactid"], dict) + self.assertEqual(df.iloc[0]["primarycontactid"]["fullname"], "John") + + def test_get_single_record_no_odata_keys(self): + """Single-record get strips @odata.* keys from the returned DataFrame.""" + self.client._odata._get.return_value = { + "@odata.context": "https://example.crm.dynamics.com/$metadata#accounts/$entity", + "@odata.etag": 'W/"123"', + "accountid": "guid-1", + "name": "Contoso", + } + df = self.client.dataframe.get("account", record_id="guid-1") + self.assertNotIn("@odata.context", df.columns) + self.assertNotIn("@odata.etag", df.columns) + self.assertIn("name", df.columns) + self.assertEqual(df.iloc[0]["name"], "Contoso") + + def test_delete_whitespace_only_ids_rejected(self): + """Series containing whitespace-only strings raises ValueError.""" + ids = pd.Series(["guid-1", " ", "guid-3"]) + with self.assertRaises(ValueError) as ctx: + self.client.dataframe.delete("account", ids) + self.assertIn("invalid values", str(ctx.exception)) + self.assertIn("[1]", str(ctx.exception)) + + def test_update_with_timezone_aware_timestamps(self): + """Update correctly normalizes timezone-aware Timestamps.""" + ts = pd.Timestamp("2024-06-15 10:30:00", tz="UTC") + df = pd.DataFrame([{"accountid": "guid-1", "lastonholdtime": ts}]) + self.client.dataframe.update("account", df, id_column="accountid") + call_args = self.client._odata._update.call_args[0] + changes = call_args[2] + self.assertIsInstance(changes["lastonholdtime"], 
str) + self.assertIn("2024-06-15T10:30:00", changes["lastonholdtime"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/unit/test_pandas_helpers.py b/tests/unit/test_pandas_helpers.py new file mode 100644 index 00000000..f6c0c2fc --- /dev/null +++ b/tests/unit/test_pandas_helpers.py @@ -0,0 +1,301 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +"""Unit tests for the dataframe_to_records() helper in utils/_pandas.py.""" + +import unittest + +import numpy as np +import pandas as pd + +from PowerPlatform.Dataverse.utils._pandas import _normalize_scalar, dataframe_to_records + + +class TestNormalizeScalar(unittest.TestCase): + """Unit tests for _normalize_scalar().""" + + def test_timestamp(self): + """pd.Timestamp is converted to an ISO 8601 string.""" + ts = pd.Timestamp("2024-01-15 10:30:00") + result = _normalize_scalar(ts) + self.assertEqual(result, "2024-01-15T10:30:00") + + def test_numpy_integer(self): + """np.int64 is converted to Python int.""" + result = _normalize_scalar(np.int64(42)) + self.assertIsInstance(result, int) + self.assertEqual(result, 42) + + def test_numpy_floating(self): + """np.float64 is converted to Python float.""" + result = _normalize_scalar(np.float64(3.14)) + self.assertIsInstance(result, float) + self.assertAlmostEqual(result, 3.14) + + def test_numpy_bool(self): + """np.bool_ is converted to Python bool.""" + result = _normalize_scalar(np.bool_(True)) + self.assertIsInstance(result, bool) + self.assertTrue(result) + + def test_python_str_passthrough(self): + """Python str values pass through unchanged.""" + result = _normalize_scalar("hello") + self.assertEqual(result, "hello") + + def test_python_int_passthrough(self): + """Native Python int values pass through unchanged.""" + result = _normalize_scalar(42) + self.assertIsInstance(result, int) + self.assertEqual(result, 42) + + def test_python_float_passthrough(self): + """Native Python float values pass through unchanged.""" + 
result = _normalize_scalar(3.14) + self.assertIsInstance(result, float) + self.assertAlmostEqual(result, 3.14) + + def test_python_bool_passthrough(self): + """Native Python bool values pass through unchanged.""" + result = _normalize_scalar(True) + self.assertIsInstance(result, bool) + self.assertTrue(result) + + def test_none_passthrough(self): + """None passes through unchanged (caller is responsible for NA handling).""" + result = _normalize_scalar(None) + self.assertIsNone(result) + + def test_timestamp_with_timezone(self): + """Timezone-aware pd.Timestamp is converted to ISO 8601 with tz offset.""" + ts = pd.Timestamp("2024-06-15 10:30:00", tz="UTC") + result = _normalize_scalar(ts) + self.assertIn("2024-06-15T10:30:00", result) + self.assertIsInstance(result, str) + + def test_numpy_int32(self): + """np.int32 is also converted to Python int.""" + result = _normalize_scalar(np.int32(7)) + self.assertIsInstance(result, int) + self.assertEqual(result, 7) + + def test_numpy_float32(self): + """np.float32 is also converted to Python float.""" + result = _normalize_scalar(np.float32(2.5)) + self.assertIsInstance(result, float) + self.assertAlmostEqual(result, 2.5, places=5) + + def test_python_datetime(self): + """datetime.datetime is converted to ISO 8601 string.""" + import datetime + + dt = datetime.datetime(2024, 6, 15, 10, 30, 0) + result = _normalize_scalar(dt) + self.assertIsInstance(result, str) + self.assertEqual(result, "2024-06-15T10:30:00") + + def test_python_date(self): + """datetime.date is converted to ISO 8601 string.""" + import datetime + + d = datetime.date(2024, 6, 15) + result = _normalize_scalar(d) + self.assertIsInstance(result, str) + self.assertEqual(result, "2024-06-15") + + def test_numpy_datetime64(self): + """np.datetime64 is converted to ISO 8601 string.""" + dt = np.datetime64("2024-06-15T10:30:00") + result = _normalize_scalar(dt) + self.assertIsInstance(result, str) + self.assertIn("2024-06-15T10:30:00", result) + + +class 
TestDataframeToRecords(unittest.TestCase): + """Unit tests for dataframe_to_records().""" + + def test_basic(self): + """Basic DataFrame with string values is converted correctly.""" + df = pd.DataFrame([{"name": "Contoso", "city": "Seattle"}]) + result = dataframe_to_records(df) + self.assertEqual(result, [{"name": "Contoso", "city": "Seattle"}]) + + def test_nan_dropped(self): + """NaN values are omitted from records when na_as_null=False (default).""" + df = pd.DataFrame([{"name": "Contoso", "telephone1": None}]) + result = dataframe_to_records(df) + self.assertEqual(result, [{"name": "Contoso"}]) + self.assertNotIn("telephone1", result[0]) + + def test_nan_as_null(self): + """NaN values become None when na_as_null=True.""" + df = pd.DataFrame([{"name": "Contoso", "telephone1": None}]) + result = dataframe_to_records(df, na_as_null=True) + self.assertEqual(result, [{"name": "Contoso", "telephone1": None}]) + self.assertIn("telephone1", result[0]) + self.assertIsNone(result[0]["telephone1"]) + + def test_timestamp_conversion(self): + """pd.Timestamp values are converted to ISO 8601 strings.""" + ts = pd.Timestamp("2024-01-15 10:30:00") + df = pd.DataFrame([{"name": "Contoso", "createdon": ts}]) + result = dataframe_to_records(df) + self.assertEqual(result[0]["createdon"], "2024-01-15T10:30:00") + + def test_numpy_int(self): + """np.int64 values are converted to Python int.""" + df = pd.DataFrame([{"priority": np.int64(42)}]) + result = dataframe_to_records(df) + self.assertIsInstance(result[0]["priority"], int) + self.assertEqual(result[0]["priority"], 42) + + def test_numpy_float(self): + """np.float64 values are converted to Python float.""" + df = pd.DataFrame([{"score": np.float64(3.14)}]) + result = dataframe_to_records(df) + self.assertIsInstance(result[0]["score"], float) + self.assertAlmostEqual(result[0]["score"], 3.14) + + def test_numpy_bool(self): + """np.bool_ values are converted to Python bool.""" + df = pd.DataFrame([{"active": np.bool_(True)}]) + 
result = dataframe_to_records(df) + self.assertIsInstance(result[0]["active"], bool) + self.assertTrue(result[0]["active"]) + + def test_list_value(self): + """Cells containing lists pass through without raising ValueError.""" + df = pd.DataFrame([{"tags": ["a", "b", "c"]}]) + result = dataframe_to_records(df) + self.assertEqual(result[0]["tags"], ["a", "b", "c"]) + + def test_dict_value(self): + """Cells containing dicts pass through without raising ValueError.""" + df = pd.DataFrame([{"metadata": {"key": "value"}}]) + result = dataframe_to_records(df) + self.assertEqual(result[0]["metadata"], {"key": "value"}) + + def test_ndarray_converted_to_list(self): + """np.ndarray values are converted to Python lists for JSON serialization.""" + arr = np.array([1, 2, 3]) + df = pd.DataFrame([{"values": arr}]) + result = dataframe_to_records(df) + self.assertIsInstance(result[0]["values"], list) + self.assertEqual(result[0]["values"], [1, 2, 3]) + + def test_empty_dataframe(self): + """Empty DataFrame returns an empty list.""" + df = pd.DataFrame(columns=["name", "telephone1"]) + result = dataframe_to_records(df) + self.assertEqual(result, []) + + def test_mixed_types(self): + """DataFrame with mixed types (str, int, float, None, Timestamp) converts correctly.""" + ts = pd.Timestamp("2024-06-01") + df = pd.DataFrame( + [ + { + "name": "Contoso", + "count": np.int64(5), + "score": np.float64(9.8), + "active": np.bool_(True), + "createdon": ts, + "notes": None, + } + ] + ) + result = dataframe_to_records(df) + self.assertEqual(len(result), 1) + rec = result[0] + self.assertEqual(rec["name"], "Contoso") + self.assertIsInstance(rec["count"], int) + self.assertEqual(rec["count"], 5) + self.assertIsInstance(rec["score"], float) + self.assertAlmostEqual(rec["score"], 9.8) + self.assertIsInstance(rec["active"], bool) + self.assertTrue(rec["active"]) + self.assertEqual(rec["createdon"], "2024-06-01T00:00:00") + self.assertNotIn("notes", rec) + + def 
test_timezone_aware_timestamp(self): + """Timezone-aware Timestamp in DataFrame is converted to ISO string with tz.""" + ts = pd.Timestamp("2024-06-15 10:30:00", tz="US/Eastern") + df = pd.DataFrame([{"createdon": ts}]) + result = dataframe_to_records(df) + self.assertIn("2024-06-15T10:30:00", result[0]["createdon"]) + self.assertIsInstance(result[0]["createdon"], str) + + def test_multiple_rows_some_with_nan(self): + """Multi-row DataFrame with mixed NaN positions drops correct keys per row.""" + df = pd.DataFrame( + [ + {"name": "A", "phone": "555-0100", "city": None}, + {"name": "B", "phone": None, "city": "Seattle"}, + {"name": None, "phone": "555-0300", "city": "Portland"}, + ] + ) + result = dataframe_to_records(df) + self.assertEqual(result[0], {"name": "A", "phone": "555-0100"}) + self.assertEqual(result[1], {"name": "B", "city": "Seattle"}) + self.assertEqual(result[2], {"phone": "555-0300", "city": "Portland"}) + + def test_multiple_rows_na_as_null(self): + """Multi-row DataFrame with na_as_null=True includes None for all missing values.""" + df = pd.DataFrame( + [ + {"name": "A", "phone": None}, + {"name": None, "phone": "555-0200"}, + ] + ) + result = dataframe_to_records(df, na_as_null=True) + self.assertEqual(result[0], {"name": "A", "phone": None}) + self.assertEqual(result[1], {"name": None, "phone": "555-0200"}) + + def test_empty_string_preserved(self): + """Empty string is kept in output, not treated as missing.""" + df = pd.DataFrame([{"name": ""}]) + result = dataframe_to_records(df) + self.assertIn("name", result[0]) + self.assertEqual(result[0]["name"], "") + + def test_zero_and_false_preserved(self): + """Zero and False are kept in output, not treated as missing.""" + df = pd.DataFrame([{"count": 0, "score": 0.0, "active": False}]) + result = dataframe_to_records(df) + self.assertEqual(result[0]["count"], 0) + self.assertEqual(result[0]["score"], 0.0) + self.assertIs(result[0]["active"], False) + + def test_pd_na_nullable_int(self): + 
"""pd.NA in nullable Int64 column is dropped by default.""" + df = pd.DataFrame({"val": pd.array([1, pd.NA], dtype="Int64")}) + result = dataframe_to_records(df) + self.assertEqual(result[0]["val"], 1) + self.assertNotIn("val", result[1]) + + def test_pd_na_nullable_int_as_null(self): + """pd.NA in nullable Int64 column becomes None with na_as_null=True.""" + df = pd.DataFrame({"val": pd.array([1, pd.NA], dtype="Int64")}) + result = dataframe_to_records(df, na_as_null=True) + self.assertEqual(result[0]["val"], 1) + self.assertIsNone(result[1]["val"]) + + def test_datetime_in_dataframe(self): + """datetime.datetime values in a DataFrame are converted to ISO strings.""" + import datetime + + dt = datetime.datetime(2024, 6, 15, 10, 30) + df = pd.DataFrame([{"createdon": dt}]) + result = dataframe_to_records(df) + self.assertIsInstance(result[0]["createdon"], str) + self.assertIn("2024-06-15", result[0]["createdon"]) + + def test_literal_nan_string(self): + """Literal string 'NaN' is preserved, not treated as missing.""" + df = pd.DataFrame([{"name": "NaN"}]) + result = dataframe_to_records(df) + self.assertEqual(result[0]["name"], "NaN") + + +if __name__ == "__main__": + unittest.main()