This guide provides a comprehensive solution for automatically generating documentation for your PowerBI Semantic Model using Semantic Link, AI enhancement, and multiple output formats.
This documentation generator automatically:
PowerBI Semantic Model → Semantic Link API → Metadata Extraction
→ AI Enhancement → Documentation Generation → Multiple Outputs

pip install semantic-link-core
pip install semantic-link
pip install pandas
pip install openai
pip install azure-identity
pip install msal
pip install markdown2
pip install graphviz
pip install jinja2

# config.py
import os
from azure.identity import DefaultAzureCredential
# Fabric/PowerBI Configuration
FABRIC_CONFIG = {
'workspace_name': os.getenv('FABRIC_WORKSPACE_NAME'),
'semantic_model_name': os.getenv('SEMANTIC_MODEL_NAME'),
'tenant_id': os.getenv('AZURE_TENANT_ID'),
'client_id': os.getenv('AZURE_CLIENT_ID'),
'client_secret': os.getenv('AZURE_CLIENT_SECRET')
}
# AI Configuration (Azure OpenAI or OpenAI)
AI_CONFIG = {
'provider': 'azure', # or 'openai'
'api_key': os.getenv('OPENAI_API_KEY'),
'endpoint': os.getenv('AZURE_OPENAI_ENDPOINT'),
'deployment_name': os.getenv('AZURE_OPENAI_DEPLOYMENT'),
'api_version': '2024-02-15-preview'
}# metadata_extractor.py
import sempy.fabric as fabric
import pandas as pd
from typing import Dict, List, Any
import json
from datetime import datetime
class SemanticModelMetadataExtractor:
    """Extract structural metadata from a PowerBI semantic model.

    Uses the Semantic Link (``sempy.fabric``) API to pull tables, columns,
    measures (with their DAX), relationships, hierarchies, security roles and
    calculation groups, then annotates measures with a heuristic complexity
    rating and their measure-to-measure dependencies.
    """

    # DAX functions whose presence suggests a non-trivial expression; the
    # complexity rating simply counts occurrences of these names.
    _COMPLEX_DAX_FUNCTIONS = ('CALCULATE', 'CALCULATETABLE', 'FILTER', 'ALL',
                              'ALLEXCEPT', 'EARLIER', 'RANKX', 'TOPN',
                              'GENERATE', 'SUMMARIZE')

    def __init__(self, workspace_name: str, model_name: str):
        self.workspace_name = workspace_name  # Fabric workspace display name
        self.model_name = model_name          # semantic model (dataset) name
        self.metadata: Dict[str, Any] = {}    # filled by extract_all_metadata()

    @staticmethod
    def _as_records(result) -> List[Dict]:
        """Normalize a sempy ``list_*`` result to a list of row dicts.

        sempy's ``fabric.list_*`` helpers return pandas DataFrames; iterating
        a DataFrame directly yields column *labels*, not rows, so the original
        ``for row in df:`` loops never saw the actual row data. Accept either
        a DataFrame or an already-materialized iterable of dicts.
        """
        if isinstance(result, pd.DataFrame):
            return result.to_dict('records')
        return list(result)

    def extract_all_metadata(self) -> Dict[str, Any]:
        """Extract complete metadata from the semantic model.

        Returns the populated ``self.metadata`` dict; re-raises any
        extraction failure after printing it.
        """
        print(f"Extracting metadata from {self.model_name}...")
        try:
            self.metadata['tables'] = self._extract_tables()
            self.metadata['columns'] = self._extract_columns()
            self.metadata['measures'] = self._extract_measures()
            self.metadata['relationships'] = self._extract_relationships()
            self.metadata['hierarchies'] = self._extract_hierarchies()
            self.metadata['roles'] = self._extract_roles()
            self.metadata['calculation_groups'] = self._extract_calculation_groups()
            # Summary must come last: it counts what was collected above.
            self.metadata['summary'] = self._generate_summary()
            return self.metadata
        except Exception as e:
            print(f"Error extracting metadata: {str(e)}")
            raise

    def _extract_tables(self) -> List[Dict]:
        """Extract table information, including a best-effort row count."""
        tables = fabric.list_tables(
            dataset=self.model_name,
            workspace=self.workspace_name
        )
        enhanced_tables = []
        for table in self._as_records(tables):
            enhanced_tables.append({
                'name': table['Name'],
                'type': table.get('Type', 'Unknown'),
                'is_hidden': table.get('IsHidden', False),
                'description': table.get('Description', ''),
                'source_type': self._determine_source_type(table),
                'row_count': self._get_row_count(table['Name'])
            })
        return enhanced_tables

    def _extract_columns(self) -> List[Dict]:
        """Extract column information with formatting/sorting metadata."""
        columns = fabric.list_columns(
            dataset=self.model_name,
            workspace=self.workspace_name
        )
        enhanced_columns = []
        for col in self._as_records(columns):
            enhanced_columns.append({
                'table_name': col['TableName'],
                'column_name': col['Name'],
                'data_type': col['DataType'],
                'is_hidden': col.get('IsHidden', False),
                'is_key': col.get('IsKey', False),
                'is_nullable': col.get('IsNullable', True),
                'format_string': col.get('FormatString', ''),
                'description': col.get('Description', ''),
                'sort_by_column': col.get('SortByColumn', ''),
                'data_category': col.get('DataCategory', ''),
                'summarize_by': col.get('SummarizeBy', 'Default')
            })
        return enhanced_columns

    def _extract_measures(self) -> List[Dict]:
        """Extract measures with DAX plus complexity/dependency annotations."""
        measures = fabric.list_measures(
            dataset=self.model_name,
            workspace=self.workspace_name
        )
        enhanced_measures = []
        for measure in self._as_records(measures):
            try:
                dax_expression = fabric.get_measure(
                    dataset=self.model_name,
                    measure=measure['Name'],
                    workspace=self.workspace_name
                )['Expression']
            except Exception:
                # Keep going with a sentinel so one bad measure doesn't abort
                # the whole extraction.
                dax_expression = "Unable to retrieve DAX expression"
            enhanced_measure = {
                'name': measure['Name'],
                'table_name': measure.get('TableName', ''),
                'description': measure.get('Description', ''),
                'format_string': measure.get('FormatString', ''),
                'is_hidden': measure.get('IsHidden', False),
                'dax_expression': dax_expression,
                'display_folder': measure.get('DisplayFolder', ''),
                'kpi': measure.get('KPI', None)
            }
            enhanced_measure['complexity'] = self._analyze_dax_complexity(dax_expression)
            enhanced_measure['dependencies'] = self._extract_measure_dependencies(dax_expression)
            enhanced_measures.append(enhanced_measure)
        return enhanced_measures

    def _extract_relationships(self) -> List[Dict]:
        """Extract relationships (cardinality, filter direction, activity)."""
        relationships = fabric.list_relationships(
            dataset=self.model_name,
            workspace=self.workspace_name
        )
        enhanced_relationships = []
        for rel in self._as_records(relationships):
            enhanced_relationships.append({
                'name': rel.get('Name', f"{rel['FromTable']}_to_{rel['ToTable']}"),
                'from_table': rel['FromTable'],
                'from_column': rel['FromColumn'],
                'to_table': rel['ToTable'],
                'to_column': rel['ToColumn'],
                'cardinality': rel.get('Cardinality', 'ManyToOne'),
                'cross_filter_direction': rel.get('CrossFilteringBehavior', 'SingleDirection'),
                'is_active': rel.get('IsActive', True),
                'security_filtering_behavior': rel.get('SecurityFilteringBehavior', 'None')
            })
        return enhanced_relationships

    def _extract_hierarchies(self) -> List[Dict]:
        """Extract hierarchy information as plain row dicts."""
        hierarchies = fabric.list_hierarchies(
            dataset=self.model_name,
            workspace=self.workspace_name
        )
        return self._as_records(hierarchies)

    def _extract_roles(self) -> List[Dict]:
        """Extract security roles / RLS definitions; [] if unavailable."""
        try:
            roles = fabric.list_roles(
                dataset=self.model_name,
                workspace=self.workspace_name
            )
            return self._as_records(roles)
        except Exception:
            # Roles may be inaccessible without admin rights; treat as none.
            return []

    def _extract_calculation_groups(self) -> List[Dict]:
        """Extract calculation groups if present; [] if unavailable."""
        try:
            calc_groups = fabric.list_calculation_groups(
                dataset=self.model_name,
                workspace=self.workspace_name
            )
            return self._as_records(calc_groups)
        except Exception:
            return []

    def _determine_source_type(self, table: Dict) -> str:
        """Return the table's storage mode ('Import' when not reported)."""
        return table.get('Mode', 'Import')

    def _get_row_count(self, table_name: str) -> int:
        """Best-effort row count via a DAX COUNTROWS query (0 on any failure)."""
        try:
            # Single-quote the table reference so names containing spaces or
            # special characters remain valid DAX.
            query = f"EVALUATE ROW(\"Count\", COUNTROWS('{table_name}'))"
            result = fabric.evaluate_dax(
                dataset=self.model_name,
                workspace=self.workspace_name,
                dax_string=query
            )
            return int(result.iloc[0, 0]) if not result.empty else 0
        except Exception:
            return 0

    def _analyze_dax_complexity(self, dax_expression: str) -> str:
        """Rate a DAX expression Simple / Moderate / Complex (or Unknown).

        Heuristic: count occurrences of context-manipulating / iterator
        function names; 0 -> Simple, 1-3 -> Moderate, >3 -> Complex.
        """
        if not dax_expression or dax_expression == "Unable to retrieve DAX expression":
            return "Unknown"
        upper_dax = dax_expression.upper()
        complexity_score = sum(upper_dax.count(func) for func in self._COMPLEX_DAX_FUNCTIONS)
        if complexity_score == 0:
            return "Simple"
        elif complexity_score <= 3:
            return "Moderate"
        else:
            return "Complex"

    def _extract_measure_dependencies(self, dax_expression: str) -> List[str]:
        """Extract referenced measure names from a DAX expression.

        A measure reference is a bare ``[Name]``, while a column reference is
        always preceded by a table identifier (``Table[Col]`` or
        ``'My Table'[Col]``); the lookbehind therefore rejects brackets that
        directly follow a word character, a quote, or a closing bracket.
        """
        import re
        if not dax_expression or dax_expression == "Unable to retrieve DAX expression":
            return []
        pattern = r"(?<![\w'\]])\[([^\]]+)\]"
        matches = re.findall(pattern, dax_expression)
        # De-duplicate while keeping first-seen order so output is deterministic.
        return list(dict.fromkeys(matches))

    def _generate_summary(self) -> Dict[str, Any]:
        """Generate summary statistics over the metadata collected so far."""
        return {
            'extraction_timestamp': datetime.now().isoformat(),
            'workspace_name': self.workspace_name,
            'model_name': self.model_name,
            'statistics': {
                'table_count': len(self.metadata.get('tables', [])),
                'column_count': len(self.metadata.get('columns', [])),
                'measure_count': len(self.metadata.get('measures', [])),
                'relationship_count': len(self.metadata.get('relationships', [])),
                'hierarchy_count': len(self.metadata.get('hierarchies', [])),
                'role_count': len(self.metadata.get('roles', [])),
                'calculation_group_count': len(self.metadata.get('calculation_groups', []))
            },
            'complexity_analysis': {
                'simple_measures': len([m for m in self.metadata.get('measures', [])
                                        if m.get('complexity') == 'Simple']),
                'moderate_measures': len([m for m in self.metadata.get('measures', [])
                                          if m.get('complexity') == 'Moderate']),
                'complex_measures': len([m for m in self.metadata.get('measures', [])
                                         if m.get('complexity') == 'Complex'])
            }
        }

    def save_metadata(self, output_path: str = 'semantic_model_metadata.json'):
        """Save extracted metadata to a UTF-8 JSON file."""
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(self.metadata, f, indent=2, ensure_ascii=False)
        print(f"Metadata saved to {output_path}")
# visualizer.py
import graphviz
from typing import List, Dict
class ModelVisualizer:
    """Render Graphviz diagrams of table relationships and measure deps.

    Expects the metadata dict produced by SemanticModelMetadataExtractor
    (reads the 'tables', 'relationships' and 'measures' keys).
    """

    def __init__(self, metadata: Dict):
        self.metadata = metadata

    def generate_relationship_diagram(self, output_format: str = 'png'):
        """Generate a visual diagram of table relationships.

        Hidden tables are omitted and inactive relationships are skipped.
        Bidirectional cross-filtering draws a solid edge, otherwise dashed.
        Returns the Digraph object after rendering it to disk.
        """
        dot = graphviz.Digraph(comment='Semantic Model Relationships')
        dot.attr(rankdir='LR', size='12,8')
        dot.attr('node', shape='box', style='rounded,filled', fillcolor='lightblue')
        # Add visible tables as nodes
        for table in self.metadata['tables']:
            if not table.get('is_hidden', False):
                label = f"{table['name']}\n({table['type']})"
                dot.node(table['name'], label)
        # Add active relationships as edges
        for rel in self.metadata['relationships']:
            if rel['is_active']:
                label = f"{rel['from_column']} → {rel['to_column']}\n{rel['cardinality']}"
                style = 'solid' if rel['cross_filter_direction'] == 'Both' else 'dashed'
                dot.edge(rel['from_table'], rel['to_table'], label=label, style=style)
        # graphviz's render() appends ".<format>" to the filename it is given;
        # the original passed "name.png" and therefore wrote "name.png.png".
        # Pass the stem only and report the actual output path.
        output_stem = 'semantic_model_relationships'
        dot.render(output_stem, format=output_format, cleanup=True)
        print(f"Relationship diagram saved to {output_stem}.{output_format}")
        return dot

    def generate_measure_dependency_graph(self, output_format: str = 'png'):
        """Generate a dependency graph for measures, colored by complexity.

        Node fill encodes the extractor's complexity rating (green/yellow/
        orange); edges point from a dependency to the measure that uses it.
        Returns the Digraph object after rendering it to disk.
        """
        dot = graphviz.Digraph(comment='Measure Dependencies')
        dot.attr(rankdir='TB', size='12,8')
        dot.attr('node', shape='ellipse', style='filled', fillcolor='lightgreen')
        # Map of all measures (including hidden) used to validate edge sources.
        measure_map = {m['name']: m for m in self.metadata['measures']}
        # Add visible measures as nodes
        for measure in self.metadata['measures']:
            if not measure.get('is_hidden', False):
                complexity_color = {
                    'Simple': 'lightgreen',
                    'Moderate': 'yellow',
                    'Complex': 'orange'
                }.get(measure.get('complexity', 'Simple'), 'lightgreen')
                dot.node(measure['name'],
                         label=measure['name'],
                         fillcolor=complexity_color)
        # Add edges only for dependencies that are known measures in this model
        # (filters out column references that slipped through extraction).
        for measure in self.metadata['measures']:
            for dep in measure.get('dependencies', []):
                if dep in measure_map:
                    dot.edge(dep, measure['name'])
        # Same render() filename fix as above: pass the stem, not "name.png".
        output_stem = 'measure_dependencies'
        dot.render(output_stem, format=output_format, cleanup=True)
        print(f"Measure dependency graph saved to {output_stem}.{output_format}")
        return dot
# ai_enhancer.py
import openai
from typing import Dict, List, Any
import json
import time
from tenacity import retry, stop_after_attempt, wait_exponential
class AIDocumentationEnhancer:
    """Enrich extracted semantic-model metadata with AI-generated docs.

    Talks to Azure OpenAI or OpenAI (selected by ``ai_config['provider']``)
    through the legacy openai<1.0 module-level API, adding business
    descriptions for tables, columns and measures, plus a business glossary
    and practical usage examples.
    """

    def __init__(self, ai_config: Dict):
        self.ai_config = ai_config
        self._setup_ai_client()

    def _setup_ai_client(self):
        """Configure the module-level openai client for the chosen provider.

        NOTE(review): this is the openai<1.0 configuration style; openai>=1.0
        replaced it with OpenAI/AzureOpenAI client objects — confirm the
        pinned openai version before deploying.
        """
        if self.ai_config['provider'] == 'azure':
            openai.api_type = "azure"
            openai.api_base = self.ai_config['endpoint']
            openai.api_version = self.ai_config['api_version']
            openai.api_key = self.ai_config['api_key']
            self.deployment_name = self.ai_config['deployment_name']
        else:
            openai.api_key = self.ai_config['api_key']
            self.deployment_name = "gpt-4"

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
    def _call_ai(self, prompt: str, system_prompt: str = None) -> str:
        """Call the chat-completion API, retrying up to 3 times with backoff."""
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        messages.append({"role": "user", "content": prompt})
        if self.ai_config['provider'] == 'azure':
            # Azure deployments are addressed via `engine`, not `model`.
            response = openai.ChatCompletion.create(
                engine=self.deployment_name,
                messages=messages,
                temperature=0.3,
                max_tokens=1000
            )
        else:
            response = openai.ChatCompletion.create(
                model=self.deployment_name,
                messages=messages,
                temperature=0.3,
                max_tokens=1000
            )
        return response.choices[0].message.content

    @staticmethod
    def _parse_json_response(response: str):
        """Parse JSON from an AI reply, tolerating markdown code fences.

        Models frequently wrap JSON in ```json ... ``` fences even when asked
        for plain JSON, which makes a bare json.loads on the raw reply fail.
        Strip a leading fence line and a trailing ``` before parsing.
        """
        text = response.strip()
        if text.startswith("```"):
            newline = text.find("\n")
            text = text[newline + 1:] if newline != -1 else ""
            text = text.rstrip()
            if text.endswith("```"):
                text = text[:-3]
        return json.loads(text)

    def enhance_metadata(self, metadata: Dict) -> Dict:
        """Enhance all metadata with AI-generated descriptions.

        Returns a copy of *metadata* with enriched tables/measures and two
        new keys: 'business_glossary' and 'usage_examples'.
        """
        enhanced_metadata = metadata.copy()
        # Shared business context injected into every AI call.
        system_prompt = """You are a data documentation expert specializing in hotel analytics
and business intelligence. You're documenting a PowerBI semantic model for the ARIEL
Analytics platform. Provide clear, business-focused descriptions that help users
understand the purpose and usage of each component."""
        print("Enhancing table descriptions...")
        enhanced_metadata['tables'] = self._enhance_tables(
            metadata['tables'],
            metadata['columns'],
            system_prompt
        )
        print("Enhancing measure descriptions...")
        enhanced_metadata['measures'] = self._enhance_measures(
            metadata['measures'],
            system_prompt
        )
        print("Generating business glossary...")
        enhanced_metadata['business_glossary'] = self._generate_business_glossary(
            metadata,
            system_prompt
        )
        print("Generating usage examples...")
        enhanced_metadata['usage_examples'] = self._generate_usage_examples(
            metadata,
            system_prompt
        )
        return enhanced_metadata

    def _enhance_tables(self, tables: List[Dict], columns: List[Dict],
                        system_prompt: str) -> List[Dict]:
        """Enhance table descriptions with AI; on failure keep the original.

        A failed table is passed through unchanged with an
        'ai_enhancement_error' key recording the exception text.
        """
        enhanced_tables = []
        for table in tables:
            table_columns = [col for col in columns if col['table_name'] == table['name']]
            column_names = [col['column_name'] for col in table_columns]
            # Only the first 20 column names go into the prompt to bound its
            # size (the original embedded this note *inside* the prompt text,
            # so it was being sent to the model).
            prompt = f"""
Analyze this PowerBI table and provide comprehensive documentation:
Table Name: {table['name']}
Table Type: {table.get('type', 'Unknown')}
Columns: {', '.join(column_names[:20])}
Row Count: {table.get('row_count', 'Unknown')}
Provide a JSON response with:
1. business_description: Clear explanation of what this table represents in the hotel context
2. purpose: The business purpose and primary use cases
3. key_metrics: Important metrics or KPIs derived from this table
4. data_quality_notes: Any important considerations for data quality
5. common_filters: Typical filters users might apply
6. related_tables: Tables commonly joined with this one
Format as valid JSON.
"""
            try:
                response = self._call_ai(prompt, system_prompt)
                ai_enhancements = self._parse_json_response(response)
                # AI keys win over extractor keys of the same name.
                enhanced_table = {**table, **ai_enhancements}
                enhanced_table['columns'] = self._enhance_columns(
                    table_columns,
                    table['name'],
                    system_prompt
                )
                enhanced_tables.append(enhanced_table)
                # Crude rate limiting between tables.
                time.sleep(0.5)
            except Exception as e:
                print(f"Error enhancing table {table['name']}: {str(e)}")
                enhanced_table = table.copy()
                enhanced_table['ai_enhancement_error'] = str(e)
                enhanced_tables.append(enhanced_table)
        return enhanced_tables

    def _enhance_columns(self, columns: List[Dict], table_name: str,
                         system_prompt: str) -> List[Dict]:
        """Enhance column descriptions in batches of 10.

        A failed batch is appended unmodified so no columns are lost.
        """
        enhanced_columns = []
        batch_size = 10
        for i in range(0, len(columns), batch_size):
            batch = columns[i:i + batch_size]
            column_info = []
            for col in batch:
                column_info.append(f"{col['column_name']} ({col['data_type']})")
            prompt = f"""
For the table "{table_name}" in a hotel analytics system, provide business descriptions
for these columns:
{json.dumps(column_info, indent=2)}
Return a JSON object where each key is the column name and the value is an object with:
- business_meaning: What this column represents in business terms
- typical_values: Example values or ranges
- usage_notes: When and how to use this column
Keep descriptions concise but informative.
"""
            try:
                response = self._call_ai(prompt, system_prompt)
                column_descriptions = self._parse_json_response(response)
                # Merge AI descriptions; columns the model skipped pass through.
                for col in batch:
                    col_name = col['column_name']
                    if col_name in column_descriptions:
                        enhanced_col = {**col, **column_descriptions[col_name]}
                    else:
                        enhanced_col = col.copy()
                    enhanced_columns.append(enhanced_col)
            except Exception as e:
                print(f"Error enhancing columns batch: {str(e)}")
                enhanced_columns.extend(batch)
        return enhanced_columns

    def _enhance_measures(self, measures: List[Dict], system_prompt: str) -> List[Dict]:
        """Enhance measure descriptions with AI analysis of the DAX.

        A failed measure is passed through with an 'ai_enhancement_error' key.
        """
        enhanced_measures = []
        for measure in measures:
            # DAX is truncated to 500 chars and only 5 dependencies listed to
            # bound prompt size (the truncation note was previously embedded
            # in the prompt text itself).
            prompt = f"""
Analyze this PowerBI measure for a hotel analytics system:
Measure Name: {measure['name']}
Table: {measure.get('table_name', 'Model')}
DAX Expression: {measure.get('dax_expression', 'Not available')[:500]}
Complexity: {measure.get('complexity', 'Unknown')}
Dependencies: {', '.join(measure.get('dependencies', [])[:5])}
Provide a JSON response with:
1. business_description: Plain English explanation of what this calculates
2. use_cases: Specific business scenarios where this measure is valuable
3. interpretation_guide: How to interpret the results
4. common_mistakes: Common misinterpretations or usage errors
5. performance_notes: Any performance considerations
6. related_measures: Other measures often used together
Format as valid JSON.
"""
            try:
                response = self._call_ai(prompt, system_prompt)
                ai_enhancements = self._parse_json_response(response)
                enhanced_measure = {**measure, **ai_enhancements}
                enhanced_measures.append(enhanced_measure)
                time.sleep(0.5)
            except Exception as e:
                print(f"Error enhancing measure {measure['name']}: {str(e)}")
                enhanced_measure = measure.copy()
                enhanced_measure['ai_enhancement_error'] = str(e)
                enhanced_measures.append(enhanced_measure)
        return enhanced_measures

    def _generate_business_glossary(self, metadata: Dict, system_prompt: str) -> Dict:
        """Generate a business glossary dict from the model; {} on failure."""
        prompt = f"""
Based on this PowerBI semantic model for hotel analytics with:
- {len(metadata['tables'])} tables
- {len(metadata['measures'])} measures
- Table names: {', '.join([t['name'] for t in metadata['tables'][:10]])}
- Key measures: {', '.join([m['name'] for m in metadata['measures'][:10]])}
Generate a business glossary with 15-20 key terms that users need to understand.
Include both general hotel industry terms and specific terms from this model.
Format as JSON with structure:
{{
"term": {{
"definition": "Clear business definition",
"context": "How it's used in this model",
"related_objects": ["list of related tables/measures"]
}}
}}
"""
        try:
            response = self._call_ai(prompt, system_prompt)
            return self._parse_json_response(response)
        except Exception as e:
            print(f"Error generating business glossary: {str(e)}")
            return {}

    def _generate_usage_examples(self, metadata: Dict, system_prompt: str) -> List[Dict]:
        """Generate practical usage examples; [] on failure."""
        prompt = f"""
Create 5 practical usage examples for this hotel analytics semantic model.
Available key measures: {', '.join([m['name'] for m in metadata['measures'][:15]])}
Available tables: {', '.join([t['name'] for t in metadata['tables'][:10]])}
For each example, provide:
1. scenario: Business question being answered
2. measures_used: List of measures involved
3. filters_applied: Common filters for this scenario
4. expected_insights: What users should look for
5. visualization_recommendation: Best chart type for this analysis
Format as JSON array.
"""
        try:
            response = self._call_ai(prompt, system_prompt)
            return self._parse_json_response(response)
        except Exception as e:
            print(f"Error generating usage examples: {str(e)}")
            return []
# doc_generator.py
from jinja2 import Template
import markdown2
from datetime import datetime
from typing import Dict, List, Any
import json
class DocumentationGenerator:
def __init__(self, enhanced_metadata: Dict):
self.metadata = enhanced_metadata
self.timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
def generate_all_outputs(self):
"""Generate all documentation outputs."""
# Generate Markdown documentation
md_content = self.generate_markdown()
with open('semantic_model_documentation.md', 'w', encoding='utf-8') as f:
f.write(md_content)
print("Generated: semantic_model_documentation.md")
# Generate HTML documentation
html_content = self.generate_html()
with open('semantic_model_documentation.html', 'w', encoding='utf-8') as f:
f.write(html_content)
print("Generated: semantic_model_documentation.html")
# Generate JSON catalog
with open('semantic_model_catalog.json', 'w', encoding='utf-8') as f:
json.dump(self.metadata, f, indent=2, ensure_ascii=False)
print("Generated: semantic_model_catalog.json")
# Generate quick reference guide
ref_content = self.generate_quick_reference()
with open('quick_reference.md', 'w', encoding='utf-8') as f:
f.write(ref_content)
print("Generated: quick_reference.md")
def generate_markdown(self) -> str:
"""Generate comprehensive Markdown documentation."""
template = Template("""# {{ model_name }} Semantic Model Documentation
**Generated:** {{ timestamp }}
**Workspace:** {{ workspace_name }}
**Last Updated:** {{ last_updated }}
## Executive Summary
{{ summary_stats }}
## Table of Contents
1. [Overview](#overview)
2. [Tables](#tables)
3. [Measures](#measures)
4. [Relationships](#relationships)
5. [Business Glossary](#business-glossary)
6. [Usage Examples](#usage-examples)
7. [Best Practices](#best-practices)
---
## Overview
This semantic model supports the ARIEL Analytics platform for hotel operations optimization. It provides comprehensive insights into:
- **Operational Efficiency**: Automation rates, processing volumes, and time savings
- **Financial Performance**: Revenue optimization, cost reduction, and ROI metrics
- **Guest Experience**: Service quality metrics and satisfaction indicators
- **Resource Utilization**: Staff productivity and system performance
### Model Statistics
| Metric | Count |
|--------|-------|
| Tables | {{ table_count }} |
| Columns | {{ column_count }} |
| Measures | {{ measure_count }} |
| Relationships | {{ relationship_count }} |
| Hierarchies | {{ hierarchy_count }} |
| Security Roles | {{ role_count }} |
### Measure Complexity Distribution
- **Simple Measures**: {{ simple_count }} ({{ simple_pct }}%)
- **Moderate Measures**: {{ moderate_count }} ({{ moderate_pct }}%)
- **Complex Measures**: {{ complex_count }} ({{ complex_pct }}%)
---
## Tables
{% for table in tables %}
### {{ table.name }}
**Type:** {{ table.type }}
**Row Count:** {{ "{:,}".format(table.row_count) if table.row_count else "N/A" }}
**Hidden:** {{ "Yes" if table.is_hidden else "No" }}
{{ table.business_description }}
**Purpose:** {{ table.purpose }}
**Key Metrics:** {{ table.key_metrics }}
{% if table.data_quality_notes %}
**Data Quality Notes:** {{ table.data_quality_notes }}
{% endif %}
#### Columns
| Column | Type | Description | Business Meaning |
|--------|------|-------------|------------------|
{% for col in table.columns %}
| {{ col.column_name }} | {{ col.data_type }} | {{ col.description }} | {{ col.business_meaning }} |
{% endfor %}
{% endfor %}
---
## Measures
{% for measure in measures %}
### {{ measure.name }}
**Table:** {{ measure.table_name }}
**Complexity:** {{ measure.complexity }}
**Format:** {{ measure