#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
TTL Documentation Generator - v52 with External Predicates from DB Table
================================================================================
Generates professional MS Word documentation for Turtle RDF triples metadata
from Oracle 23ai database tables using R2RML mappings.

CHANGES FROM v51:
- **External Predicates from DB** - Queries UTIL_GRF_R2RML_COLUMN_EXTERNAL table
  for predicates sourced outside R2RML database metadata (e.g., file uploads)
  * Same schema as UTIL_GRF_R2RML_COLUMN except COLUMN_NAME -> SOURCE_FILE
  * R2RML_ROLE controls the Object Type column in the document
  * COLUMN_ORDER integrates naturally with main table ordering
  * External predicates merge inline with DB predicates per class

CHANGES FROM v50:
- **Fixed: Predicate Ordering** - Predicates now appear in COLUMN_ORDER sequence
  from metadata table, not arbitrary insertion order

ENHANCED FEATURES v49+:
- **Automatic Graph Name Extraction** - Reads graph name from TTL @prefix mt: declaration
  * IRIs now correctly include the graph name (e.g., multum-pharma)
  * Instance IRI prefixes include graph name (e.g., /multum-pharma/AgeStratification/)
  * Output filename based on input TTL filename
- **Fixed: Duplicate predicates** - Deduplicates predicates by name (DISTINCT)
- **Fixed: Example Values** - Properly resolves sample data for cross-table predicates
- **Fixed: Required/Optional status** - Checks nullability against correct source table
- **Fixed: Object Property formatting** - All IRI predicates show full format with -> Class
- **Auto-versioned output** - If output file is locked/open, saves to _v1, _v2, etc.
- Fast Database-Driven Predicate Discovery (from v48)
- OWL Consistency Validation
- Complete OWL ontology documentation
- Domain and Range specifications for all properties
- Inverse property relationships

PERFORMANCE:
- v46/v47: Parses entire TTL file = 5-30 minutes for 5GB files
- v48+: Queries database only = 10-60 seconds (NO TTL parsing)

Author: Enhanced Documentation Script
Date: 2025
"""

import os
import re
import time
from pathlib import Path
from collections import defaultdict, OrderedDict
from datetime import datetime
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.oxml.shared import OxmlElement as OxmlElement2
import oracledb
import getpass
from dotenv import load_dotenv

# Oracle Instant Client configuration
os.add_dll_directory(r"C:\Oracle\instant_client\instantclient_23_8")

# Load environment variables
load_dotenv()
wallet_path = Path(os.getenv("ORACLE_WALLET_PATH"))
dsn = os.getenv("ORACLE_DSN")
oracle_user = os.getenv("ORACLE_USER")
oracle_password = os.getenv("ORACLE_PASSWORD") or getpass.getpass("Enter Oracle password: ")

# Initialize Oracle client if needed
if oracledb.is_thin_mode():
    oracledb.init_oracle_client()

# Global tracking dictionaries
ALL_AVAILABLE_CLASSES = {}
CLASSES_IN_DOC_CONFIG = set()
CLASSES_DOCUMENTED = set()
CLASSES_SKIPPED = {}
TTL_CLASSES = set()

# OWL ontology structures
OWL_CLASSES = set()
OWL_OBJECT_PROPERTIES = {}
OWL_DATATYPE_PROPERTIES = {}
INVERSE_PROPERTIES = {}

# Graph name extracted from TTL file (e.g., "multum-pharma")
GRAPH_NAME = None
TTL_INPUT_FILENAME = None

# External predicate metadata keyed by "[EXTERNAL] predicate_name"
# Stores R2RML_ROLE and SOURCE_FILE for rendering
EXTERNAL_PREDICATE_ROLES = {}


def clean_column_name(column_name):
    """Strip source-marker prefixes from column names.
    
    Handles [DB-VERIFIED] and [EXTERNAL] prefixes used to track
    where supplemental predicates originated.
    """
    result = column_name
    result = result.replace('[DB-VERIFIED] ', '')
    result = result.replace('[EXTERNAL] ', '')
    return result


def extract_graph_name_from_ttl(ttl_path):
    """Extract the graph name from TTL file's @prefix mt: declaration.
    
    Parses the TTL file to find: @prefix mt: <http://multum.health.oraclecloud.com/{graph_name}/ns#>
    Returns the graph_name portion (e.g., "multum-pharma").
    """
    global GRAPH_NAME
    
    if not os.path.exists(ttl_path):
        print(f"[WARNING] TTL file not found at: {ttl_path}")
        return None
    
    print("\n" + "="*80)
    print("EXTRACTING GRAPH NAME FROM TTL FILE")
    print("="*80)
    
    try:
        with open(ttl_path, 'r', encoding='utf-8') as f:
            # Read first 100 lines to find prefix declarations
            for i, line in enumerate(f):
                if i > 100:
                    break
                # Look for: @prefix mt: <http://multum.health.oraclecloud.com/{graph_name}/ns#>
                match = re.search(r'@prefix\s+mt:\s*<http://multum\.health\.oraclecloud\.com/([^/]+)/ns#>\s*\.', line)
                if match:
                    GRAPH_NAME = match.group(1)
                    print(f"[SUCCESS] Found graph name: {GRAPH_NAME}")
                    print(f"[INFO] Full namespace: http://multum.health.oraclecloud.com/{GRAPH_NAME}/ns#")
                    print("="*80 + "\n")
                    return GRAPH_NAME
    except UnicodeDecodeError:
        try:
            with open(ttl_path, 'r', encoding='utf-8', errors='replace') as f:
                for i, line in enumerate(f):
                    if i > 100:
                        break
                    match = re.search(r'@prefix\s+mt:\s*<http://multum\.health\.oraclecloud\.com/([^/]+)/ns#>\s*\.', line)
                    if match:
                        GRAPH_NAME = match.group(1)
                        print(f"[SUCCESS] Found graph name: {GRAPH_NAME}")
                        print("="*80 + "\n")
                        return GRAPH_NAME
        except Exception as e:
            print(f"[ERROR] Could not read TTL file: {e}")
    
    # Fallback: try to extract from filename (e.g., "multum-pharma_v251203.1.64.ttl")
    filename = os.path.basename(ttl_path)
    filename_match = re.match(r'^([a-zA-Z\-]+)_v', filename)
    if filename_match:
        GRAPH_NAME = filename_match.group(1)
        print(f"[FALLBACK] Extracted graph name from filename: {GRAPH_NAME}")
        print("="*80 + "\n")
        return GRAPH_NAME
    
    print("[WARNING] Could not extract graph name from TTL file or filename")
    print("[WARNING] Using default: 'multum-pharma'")
    GRAPH_NAME = "multum-pharma"
    print("="*80 + "\n")
    return GRAPH_NAME


def get_base_namespace():
    """Get the base namespace IRI including the graph name."""
    if GRAPH_NAME:
        return f"http://multum.health.oraclecloud.com/{GRAPH_NAME}/ns#"
    return "http://multum.health.oraclecloud.com/ns#"


def get_instance_base_url():
    """Get the base URL for instance IRIs including the graph name."""
    if GRAPH_NAME:
        return f"http://multum.health.oraclecloud.com/{GRAPH_NAME}/"
    return "http://multum.health.oraclecloud.com/"


def transform_iri_template(iri_template):
    """Transform an IRI template from database to include the graph name.
    
    Converts:
      http://multum.health.oraclecloud.com/ClassName/{col}
    To:
      http://multum.health.oraclecloud.com/{graph_name}/ClassName/{col}
    
    If the template already contains the graph name, it's returned unchanged.
    """
    if not iri_template or not GRAPH_NAME:
        return iri_template
    
    base_url = "http://multum.health.oraclecloud.com/"
    
    # Check if template already has the graph name
    if f"/{GRAPH_NAME}/" in iri_template:
        return iri_template
    
    # Check if this is a Multum IRI that needs transformation
    if iri_template.startswith(base_url):
        # Extract the part after the base URL (e.g., "AgeStratification/{AGE_STRATIFICATION_ID}")
        path_part = iri_template[len(base_url):]
        # Insert the graph name
        return f"{base_url}{GRAPH_NAME}/{path_part}"
    
    return iri_template


def parse_ttl_ontology(ttl_path):
    """Parse TTL file to extract OWL ontology information."""
    global OWL_CLASSES, OWL_OBJECT_PROPERTIES, OWL_DATATYPE_PROPERTIES, INVERSE_PROPERTIES
    
    if not os.path.exists(ttl_path):
        print(f"[WARNING] TTL file not found at: {ttl_path}")
        return
    
    print("\n" + "="*80)
    print("PARSING OWL ONTOLOGY FROM TTL FILE")
    print("="*80)
    
    try:
        # Try UTF-8 first
        with open(ttl_path, 'r', encoding='utf-8') as f:
            content = f.read()
    except UnicodeDecodeError:
        print("[WARNING] UTF-8 decoding failed, trying with error handling...")
        try:
            # Fallback: UTF-8 with error replacement
            with open(ttl_path, 'r', encoding='utf-8', errors='replace') as f:
                content = f.read()
        except Exception as e:
            print(f"[ERROR] Could not read TTL file: {e}")
            try:
                # Last resort: latin-1 (accepts all byte values)
                with open(ttl_path, 'r', encoding='latin-1') as f:
                    content = f.read()
                print("[INFO] Successfully read file using latin-1 encoding")
            except Exception as e2:
                print(f"[CRITICAL] Cannot read TTL file: {e2}")
                return
    
    # Parse OWL Classes
    class_pattern = r'mt:(\w+)\s+a\s+owl:Class'
    for match in re.finditer(class_pattern, content):
        class_name = match.group(1)
        OWL_CLASSES.add(class_name)
    
    # Parse Object Properties with their metadata
    # Pattern matches property definition spanning multiple lines
    prop_sections = re.split(r'\n(?=mt:\w+\s+a\s+owl:(?:Object|Datatype)Property)', content)
    
    for section in prop_sections:
        # Object Property
        obj_prop_match = re.match(r'mt:(\w+)\s+a\s+owl:ObjectProperty\s*;', section)
        if obj_prop_match:
            prop_name = obj_prop_match.group(1)
            
            # Extract domain - handle blank nodes with brackets
            # First, try to match a blank node structure [ ... ]
            blank_node_match = re.search(r'rdfs:domain\s+\[\s*(.+?)\s*\]\s*[;.]', section, re.DOTALL)
            if blank_node_match:
                domain_text = blank_node_match.group(1).strip()
            else:
                # If not a blank node, match simple domain up to semicolon/period
                domain_match = re.search(r'rdfs:domain\s+(mt:\w+)\s*[;.]', section)
                domain_text = domain_match.group(1).strip() if domain_match else None
            
            domain = None
            if domain_text:
                # Handle union types
                if 'owl:unionOf' in domain_text:
                    union_match = re.search(r'owl:unionOf\s*\(\s*(.+?)\s*\)', domain_text, re.DOTALL)
                    if union_match:
                        classes = union_match.group(1).strip().split()
                        domain = [c.replace('mt:', '').replace('[', '').replace(']', '') for c in classes if c.startswith('mt:')]
                elif domain_text.startswith('mt:'):
                    domain = domain_text.replace('mt:', '')
            
            # Extract range
            range_match = re.search(r'rdfs:range\s+mt:(\w+)', section)
            range_val = range_match.group(1) if range_match else None
            
            OWL_OBJECT_PROPERTIES[prop_name] = {
                'domain': domain,
                'range': range_val,
                'inverse': None
            }
        
        # Datatype Property
        data_prop_match = re.match(r'mt:(\w+)\s+a\s+owl:DatatypeProperty\s*;', section)
        if data_prop_match:
            prop_name = data_prop_match.group(1)
            
            # Extract domain
            domain_match = re.search(r'rdfs:domain\s+mt:(\w+)', section)
            domain = domain_match.group(1) if domain_match else None
            
            # Extract range
            range_match = re.search(r'rdfs:range\s+xsd:(\w+)', section)
            range_val = range_match.group(1) if range_match else None
            
            OWL_DATATYPE_PROPERTIES[prop_name] = {
                'domain': domain,
                'range': range_val
            }
    
    # Parse inverse properties
    inverse_pattern = r'mt:(\w+)\s+owl:inverseOf\s+mt:(\w+)'
    for match in re.finditer(inverse_pattern, content):
        inverse_prop = match.group(1)
        original_prop = match.group(2)
        INVERSE_PROPERTIES[inverse_prop] = original_prop
        
        # Store inverse in the object property
        if original_prop in OWL_OBJECT_PROPERTIES:
            OWL_OBJECT_PROPERTIES[original_prop]['inverse'] = inverse_prop
    
    print(f"\n[OWL PARSE] Found {len(OWL_CLASSES)} OWL Classes")
    print(f"[OWL PARSE] Found {len(OWL_OBJECT_PROPERTIES)} Object Properties")
    if OWL_OBJECT_PROPERTIES:
        print(f"[OWL PARSE] Object Properties list:")
        for prop_name, prop_info in sorted(OWL_OBJECT_PROPERTIES.items()):
            print(f"  - {prop_name}: domain={prop_info['domain']}, range={prop_info['range']}, inverse={prop_info['inverse']}")
    print(f"[OWL PARSE] Found {len(OWL_DATATYPE_PROPERTIES)} Datatype Properties")
    print(f"[OWL PARSE] Found {len(INVERSE_PROPERTIES)} Inverse Property Declarations")
    print("="*80 + "\n")


def scan_ttl_for_classes(ttl_path):
    """Scan the TTL file to find all classes that were actually generated."""
    ttl_classes = set()
    
    if not os.path.exists(ttl_path):
        print(f"[WARNING] TTL file not found at: {ttl_path}")
        return ttl_classes
    
    print("\n" + "="*80)
    print("STEP 0: SCANNING TTL FILE FOR ACTUAL CLASSES")
    print("="*80)
    
    base_ns = get_base_namespace()
    
    try:
        # Try UTF-8 first
        with open(ttl_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Match pattern: <IRI> a mt:ClassName, skos:Concept ;
                match = re.search(r'a mt:([A-Za-z]+),\s*skos:Concept', line)
                if match:
                    ttl_classes.add(f"{base_ns}{match.group(1)}")
    except UnicodeDecodeError:
        print("[WARNING] UTF-8 decoding failed, trying with error handling...")
        try:
            # Fallback: UTF-8 with error replacement
            with open(ttl_path, 'r', encoding='utf-8', errors='replace') as f:
                for line in f:
                    match = re.search(r'a mt:([A-Za-z]+),\s*skos:Concept', line)
                    if match:
                        ttl_classes.add(f"{base_ns}{match.group(1)}")
        except Exception as e:
            print(f"[ERROR] Could not read TTL file: {e}")
            try:
                # Last resort: latin-1
                with open(ttl_path, 'r', encoding='latin-1') as f:
                    for line in f:
                        match = re.search(r'a mt:([A-Za-z]+),\s*skos:Concept', line)
                        if match:
                            ttl_classes.add(f"{base_ns}{match.group(1)}")
                print("[INFO] Successfully read file using latin-1 encoding")
            except Exception as e2:
                print(f"[CRITICAL] Cannot read TTL file: {e2}")
                return ttl_classes
    
    print(f"\n[TTL SCAN] Found {len(ttl_classes)} classes in TTL file:")
    for i, cls in enumerate(sorted(ttl_classes), 1):
        short_name = cls.split('#')[-1]
        print(f"  {i:3d}. {short_name}")
    
    print("="*80 + "\n")
    return ttl_classes


def get_example_from_ttl(ttl_path, predicate_name):
    """Scan TTL file for a single example value of a specific predicate.
    
    Efficiently reads line-by-line and stops at first match.
    Returns the raw TTL value string, or None if not found.
    
    Args:
        ttl_path: Path to TTL file
        predicate_name: Predicate to find (e.g., "mt:LeafletHtmlFileId")
    """
    if not ttl_path or not os.path.exists(ttl_path):
        return None
    
    # Extract the short name for matching (e.g., "LeafletHtmlFileId")
    if ':' in predicate_name:
        short_pred = predicate_name.split(':')[-1]
    else:
        short_pred = predicate_name
    
    # Match: mt:PredicateName followed by the value up to ; or .
    # Handles quoted strings, IRIs in <>, and bare values
    pattern = re.compile(r'mt:' + re.escape(short_pred) + r'\s+(.+)')
    
    try:
        with open(ttl_path, 'r', encoding='utf-8', errors='replace') as f:
            for line in f:
                match = pattern.search(line)
                if match:
                    value = match.group(1).strip()
                    # Remove trailing ; or . (TTL statement terminators)
                    value = value.rstrip(';').rstrip('.').rstrip(',').strip()
                    if value:
                        print(f"[TTL SCAN] Found example for {predicate_name}: {value[:80]}{'...' if len(value) > 80 else ''}")
                        return value
    except Exception as e:
        print(f"[TTL SCAN] Error scanning for {predicate_name}: {e}")
    
    return None


def create_professional_document():
    """Create a new Word document with professional styling."""
    doc = Document()
    
    # Set document margins
    sections = doc.sections
    for section in sections:
        section.top_margin = Inches(1)
        section.bottom_margin = Inches(1)
        section.left_margin = Inches(1)
        section.right_margin = Inches(1)
    
    return doc

def add_styled_heading(doc, text, level=1):
    """Add a styled heading to the document."""
    heading = doc.add_heading(text, level)
    
    # Style the heading based on level
    for run in heading.runs:
        if level == 1:
            run.font.size = Pt(14)
            run.font.color.rgb = RGBColor(0x2E, 0x4C, 0x6E)
        elif level == 2:
            run.font.size = Pt(11)
            run.font.color.rgb = RGBColor(0, 0, 0)
        elif level == 3:
            run.font.size = Pt(10)
            run.font.color.rgb = RGBColor(0x4A, 0x6F, 0xA5)
    
    return heading

def add_metadata_section(doc):
    """Add document metadata section."""
    para = doc.add_paragraph()
    run = para.add_run(f"Last updated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    run.italic = True
    run.font.size = Pt(7)
    para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    
    doc.add_paragraph()

def add_namespace_table(doc):
    """Add namespace prefixes table."""
    heading = add_styled_heading(doc, "Namespace Prefixes", 2)
    
    para = doc.add_paragraph("The following namespace prefixes are used throughout this ontology documentation:")
    for run in para.runs:
        run.font.size = Pt(7)
    
    table = doc.add_table(rows=1, cols=2)
    table.style = 'Light Grid'
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = 'Prefix'
    hdr_cells[1].text = 'Namespace'
    
    for cell in hdr_cells:
        cell.paragraphs[0].runs[0].font.bold = True
        cell.paragraphs[0].runs[0].font.size = Pt(7)
        cell.paragraphs[0].runs[0].font.color.rgb = RGBColor(255, 255, 255)
        shading_elm = OxmlElement('w:shd')
        shading_elm.set(qn('w:fill'), '3A5F8B')
        cell._element.get_or_add_tcPr().append(shading_elm)
    
    namespaces = [
        ('mt', get_base_namespace()),
        ('skos', 'http://www.w3.org/2004/02/skos/core#'),
        ('owl', 'http://www.w3.org/2002/07/owl#'),
        ('rdfs', 'http://www.w3.org/2000/01/rdf-schema#'),
        ('xsd', 'http://www.w3.org/2001/XMLSchema#'),
        ('rdf', 'http://www.w3.org/1999/02/22-rdf-syntax-ns#')
    ]
    
    for prefix, namespace in namespaces:
        row_cells = table.add_row().cells
        row_cells[0].text = prefix
        row_cells[1].text = namespace
        row_cells[0].paragraphs[0].runs[0].font.bold = True
        row_cells[0].paragraphs[0].runs[0].font.size = Pt(7)
        row_cells[1].paragraphs[0].runs[0].font.size = Pt(7)
    
    doc.add_paragraph()
    doc.add_paragraph()


def add_owl_ontology_documentation(doc):
    """Add comprehensive OWL ontology documentation section."""
    print("\n[DOC] Adding OWL Ontology Documentation section...")
    
    # Main section heading
    add_styled_heading(doc, "OWL Ontology Documentation", 2)
    
    intro = doc.add_paragraph(
        "This ontology is formally defined using the Web Ontology Language (OWL). "
        "The following sections describe the classes, object properties (relationships between entities), "
        "and datatype properties (literal attributes) that comprise this ontology."
    )
    for run in intro.runs:
        run.font.size = Pt(8)
    
    doc.add_paragraph()
    
    # ===== OWL CLASSES SECTION =====
    add_styled_heading(doc, "OWL Classes", 3)
    
    para = doc.add_paragraph(f"The ontology defines {len(OWL_CLASSES)} classes:")
    for run in para.runs:
        run.font.size = Pt(8)
    
    # Create a table for OWL classes
    class_table = doc.add_table(rows=1, cols=2)
    class_table.style = 'Light Grid'
    
    hdr_cells = class_table.rows[0].cells
    hdr_cells[0].text = 'Class Name'
    hdr_cells[1].text = 'Full IRI'
    
    for cell in hdr_cells:
        cell.paragraphs[0].runs[0].font.bold = True
        cell.paragraphs[0].runs[0].font.size = Pt(7)
        cell.paragraphs[0].runs[0].font.color.rgb = RGBColor(255, 255, 255)
        shading_elm = OxmlElement('w:shd')
        shading_elm.set(qn('w:fill'), '4A6FA5')
        cell._element.get_or_add_tcPr().append(shading_elm)
    
    for cls in sorted(OWL_CLASSES):
        row_cells = class_table.add_row().cells
        row_cells[0].text = f'mt:{cls}'
        row_cells[0].paragraphs[0].runs[0].font.size = Pt(7)
        row_cells[0].paragraphs[0].runs[0].font.bold = True
        
        row_cells[1].text = f'{get_base_namespace()}{cls}'
        row_cells[1].paragraphs[0].runs[0].font.size = Pt(7)
        row_cells[1].paragraphs[0].runs[0].font.color.rgb = RGBColor(100, 100, 100)
    
    doc.add_paragraph()
    doc.add_paragraph()
    
    # ===== OBJECT PROPERTIES SECTION =====
    add_styled_heading(doc, "Object Properties", 3)
    
    para = doc.add_paragraph(
        f"Object properties define relationships between entities. "
        f"The ontology defines {len(OWL_OBJECT_PROPERTIES)} object properties:"
    )
    for run in para.runs:
        run.font.size = Pt(8)
    
    # Create table for object properties
    obj_prop_table = doc.add_table(rows=1, cols=4)
    obj_prop_table.style = 'Light Grid'
    
    hdr_cells = obj_prop_table.rows[0].cells
    headers = ['Property', 'Domain (applies to)', 'Range (points to)', 'Inverse Property']
    for i, header in enumerate(headers):
        hdr_cells[i].text = header
        hdr_cells[i].paragraphs[0].runs[0].font.bold = True
        hdr_cells[i].paragraphs[0].runs[0].font.size = Pt(7)
        hdr_cells[i].paragraphs[0].runs[0].font.color.rgb = RGBColor(255, 255, 255)
        shading_elm = OxmlElement('w:shd')
        shading_elm.set(qn('w:fill'), '4A6FA5')
        hdr_cells[i]._element.get_or_add_tcPr().append(shading_elm)
    
    for prop_name in sorted(OWL_OBJECT_PROPERTIES.keys()):
        prop_info = OWL_OBJECT_PROPERTIES[prop_name]
        row_cells = obj_prop_table.add_row().cells
        
        # Property name
        row_cells[0].text = f'mt:{prop_name}'
        row_cells[0].paragraphs[0].runs[0].font.size = Pt(7)
        row_cells[0].paragraphs[0].runs[0].font.bold = True
        
        # Domain
        domain = prop_info['domain']
        if domain:
            if isinstance(domain, list):
                domain_text = ' OR\n'.join([f'mt:{d}' for d in domain])
            else:
                domain_text = f'mt:{domain}'
        else:
            domain_text = 'Any'
        row_cells[1].text = domain_text
        row_cells[1].paragraphs[0].runs[0].font.size = Pt(7)
        
        # Range
        range_val = prop_info['range']
        row_cells[2].text = f"mt:{range_val}" if range_val else 'Any'
        row_cells[2].paragraphs[0].runs[0].font.size = Pt(7)
        
        # Inverse
        inverse = prop_info['inverse']
        row_cells[3].text = f"mt:{inverse}" if inverse else '-'
        row_cells[3].paragraphs[0].runs[0].font.size = Pt(7)
        if inverse:
            row_cells[3].paragraphs[0].runs[0].font.color.rgb = RGBColor(0, 100, 0)
    
    doc.add_paragraph()
    doc.add_paragraph()
    
    # ===== DATATYPE PROPERTIES SECTION =====
    add_styled_heading(doc, "Datatype Properties", 3)
    
    para = doc.add_paragraph(
        f"Datatype properties define literal attributes of entities. "
        f"The ontology defines {len(OWL_DATATYPE_PROPERTIES)} datatype properties:"
    )
    for run in para.runs:
        run.font.size = Pt(8)
    
    # Create table for datatype properties
    data_prop_table = doc.add_table(rows=1, cols=3)
    data_prop_table.style = 'Light Grid'
    
    hdr_cells = data_prop_table.rows[0].cells
    headers = ['Property', 'Domain (applies to)', 'Range (datatype)']
    for i, header in enumerate(headers):
        hdr_cells[i].text = header
        hdr_cells[i].paragraphs[0].runs[0].font.bold = True
        hdr_cells[i].paragraphs[0].runs[0].font.size = Pt(7)
        hdr_cells[i].paragraphs[0].runs[0].font.color.rgb = RGBColor(255, 255, 255)
        shading_elm = OxmlElement('w:shd')
        shading_elm.set(qn('w:fill'), '4A6FA5')
        hdr_cells[i]._element.get_or_add_tcPr().append(shading_elm)
    
    for prop_name in sorted(OWL_DATATYPE_PROPERTIES.keys()):
        prop_info = OWL_DATATYPE_PROPERTIES[prop_name]
        row_cells = data_prop_table.add_row().cells
        
        # Property name
        row_cells[0].text = f'mt:{prop_name}'
        row_cells[0].paragraphs[0].runs[0].font.size = Pt(7)
        row_cells[0].paragraphs[0].runs[0].font.bold = True
        
        # Domain
        domain = prop_info['domain']
        row_cells[1].text = f"mt:{domain}" if domain else 'Any'
        row_cells[1].paragraphs[0].runs[0].font.size = Pt(7)
        
        # Range
        range_val = prop_info['range']
        row_cells[2].text = f"xsd:{range_val}" if range_val else 'xsd:string'
        row_cells[2].paragraphs[0].runs[0].font.size = Pt(7)
        row_cells[2].paragraphs[0].runs[0].font.color.rgb = RGBColor(0, 0, 139)
    
    doc.add_paragraph()
    doc.add_page_break()
    
    print("[DOC] OWL Ontology Documentation section complete!")


def get_all_available_classes(cursor):
    """Get ALL classes available in R2RML metadata - comprehensive inventory."""
    global ALL_AVAILABLE_CLASSES
    
    print("\n" + "="*80)
    print("STEP 1: INVENTORYING ALL AVAILABLE CLASSES IN R2RML METADATA")
    print("="*80)
    
    cursor.execute("""
        SELECT DISTINCT R2RML_CLASS, TABLE_NAME
        FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN
        WHERE R2RML_CLASS IS NOT NULL
        ORDER BY R2RML_CLASS
    """)
    
    all_classes = {}
    for row in cursor.fetchall():
        class_name, table_name = row
        if class_name not in all_classes:
            all_classes[class_name] = []
        all_classes[class_name].append(table_name)
    
    ALL_AVAILABLE_CLASSES = all_classes
    
    print(f"\n[INVENTORY] TOTAL UNIQUE CLASSES FOUND: {len(all_classes)}")
    print(f"\n[INVENTORY] Complete Class List:\n")
    
    for i, (class_name, tables) in enumerate(sorted(all_classes.items()), 1):
        # Extract short class name (after #)
        short_name = class_name.split('#')[-1] if '#' in class_name else class_name
        print(f"  {i:3d}. {short_name:<40s} | Full: {class_name}")
        for table in tables:
            print(f"       +-> Table: {table}")
    
    print("\n" + "="*80 + "\n")
    return all_classes

def inventory_external_predicates(cursor):
    """Inventory everything in UTIL_GRF_R2RML_COLUMN_EXTERNAL for diagnostic visibility."""
    print("\n" + "="*80)
    print("STEP 1b: INVENTORYING EXTERNAL PREDICATES TABLE")
    print("="*80)
    
    try:
        cursor.execute("""
            SELECT COUNT(*) FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN_EXTERNAL
        """)
        total = cursor.fetchone()[0]
        print(f"\n[EXTERNAL INVENTORY] Total rows: {total}")
        
        if total == 0:
            print("[EXTERNAL INVENTORY] Table is empty - no external predicates to add")
            print("="*80 + "\n")
            return
        
        # Show all distinct R2RML_CLASS values with exact repr for debugging
        cursor.execute("""
            SELECT DISTINCT R2RML_CLASS, 
                   LENGTH(R2RML_CLASS) AS LEN,
                   DUMP(R2RML_CLASS, 16) AS HEX_DUMP
            FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN_EXTERNAL
            ORDER BY R2RML_CLASS
        """)
        classes = cursor.fetchall()
        print(f"[EXTERNAL INVENTORY] Distinct R2RML_CLASS values ({len(classes)}):")
        for cls_val, cls_len, cls_hex in classes:
            print(f"  - '{cls_val}' (length={cls_len})")
        
        # Show all rows
        cursor.execute("""
            SELECT  TABLE_NAME, R2RML_CLASS, SOURCE_FILE, COLUMN_ORDER,
                    IS_KEY, XSD_DATATYPE, R2RML_ROLE, IRI_TEMPLATE, 
                    PREDICATE_DESCRIPTION
            FROM    GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN_EXTERNAL
            ORDER BY R2RML_CLASS, COLUMN_ORDER
        """)
        rows = cursor.fetchall()
        print(f"\n[EXTERNAL INVENTORY] All external predicate rows:")
        for row in rows:
            tbl, cls, src, order, key, xsd, role, iri, pred = row
            print(f"  [{cls}] {pred} (table={tbl}, order={order}, role={role}, source={src})")
        
    except oracledb.DatabaseError as e:
        error_obj, = e.args
        if error_obj.code == 942:
            print("[EXTERNAL INVENTORY] Table UTIL_GRF_R2RML_COLUMN_EXTERNAL does not exist")
            print("[EXTERNAL INVENTORY] External predicates feature not active")
        else:
            print(f"[EXTERNAL INVENTORY] Database error: {e}")
    except Exception as e:
        print(f"[EXTERNAL INVENTORY] Error: {type(e).__name__}: {e}")
    
    print("="*80 + "\n")

def get_table_list(cursor):
    """Get list of tables with R2RML metadata."""
    print("\n[DEBUG] Querying available R2RML tables...")
    cursor.execute("""
        SELECT DISTINCT table_name
        FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN
        WHERE table_name LIKE 'GRF%'
        ORDER BY table_name
    """)
    tables = [row[0] for row in cursor.fetchall()]
    print(f"[DEBUG] Found {len(tables)} tables with R2RML metadata")
    if tables:
        print(f"[DEBUG] First 5 tables: {', '.join(tables[:5])}")
    return tables

def get_content_areas_and_classes(cursor):
    """Get content areas and their associated classes."""
    global CLASSES_IN_DOC_CONFIG
    
    print("\n" + "="*80)
    print("STEP 2: QUERYING DOC CONFIGURATION TABLES")
    print("="*80)
    
    # First, check if tables exist
    try:
        cursor.execute("""
            SELECT COUNT(*) FROM GLOBAL_DISTRIBUTE.UTIL_GRF_DOC_CONTENT_AREA
        """)
        area_count = cursor.fetchone()[0]
        print(f"[CONFIG] UTIL_GRF_DOC_CONTENT_AREA has {area_count} rows")
        
        cursor.execute("""
            SELECT COUNT(*) FROM GLOBAL_DISTRIBUTE.UTIL_GRF_DOC_CLASS_CONTENT_AREA
        """)
        class_count = cursor.fetchone()[0]
        print(f"[CONFIG] UTIL_GRF_DOC_CLASS_CONTENT_AREA has {class_count} rows")
    except Exception as e:
        print(f"[ERROR] Could not count rows in doc tables: {e}")
    
    cursor.execute("""
        SELECT      DCA.DISPLAY_ORDER            AS CONTENT_AREA_DISPLAY_ORDER,
                    DCA.CONTENT_AREA_NAME,
                    DCA.CONTENT_AREA_DESCRIPTION,
                    CCA.R2RML_CLASS             AS CLASS_NAME,
                    CCA.DISPLAY_ORDER           AS CLASS_DISPLAY_ORDER
        FROM        GLOBAL_DISTRIBUTE.UTIL_GRF_DOC_CONTENT_AREA DCA
        INNER JOIN  GLOBAL_DISTRIBUTE.UTIL_GRF_DOC_CLASS_CONTENT_AREA CCA
                ON  CCA.CONTENT_AREA_ID     = DCA.CONTENT_AREA_ID
        ORDER BY    DCA.DISPLAY_ORDER, CCA.DISPLAY_ORDER
    """)
    results = cursor.fetchall()
    
    # Track classes in doc config
    for row in results:
        class_name = row[3]
        CLASSES_IN_DOC_CONFIG.add(class_name)
    
    print(f"\n[CONFIG] Query returned {len(results)} content area/class mappings")
    print(f"[CONFIG] Unique classes in doc config: {len(CLASSES_IN_DOC_CONFIG)}")
    
    if len(results) > 0:
        print(f"\n[CONFIG] Classes configured for documentation:")
        for class_name in sorted(CLASSES_IN_DOC_CONFIG):
            short_name = class_name.split('#')[-1] if '#' in class_name else class_name
            print(f"  - {short_name:<40s} | {class_name}")
    else:
        print("[WARNING] NO RESULTS RETURNED! The document will be empty.")
    
    print("="*80 + "\n")
    
    # Compare with available classes
    print("\n" + "="*80)
    print("COMPARISON: DOC CONFIG vs AVAILABLE CLASSES")
    print("="*80)
    
    classes_available_but_not_configured = set(ALL_AVAILABLE_CLASSES.keys()) - CLASSES_IN_DOC_CONFIG
    classes_configured_but_not_available = CLASSES_IN_DOC_CONFIG - set(ALL_AVAILABLE_CLASSES.keys())
    
    if classes_available_but_not_configured:
        print(f"\n[ALERT] {len(classes_available_but_not_configured)} CLASSES EXIST IN R2RML BUT NOT IN DOC CONFIG:")
        for class_name in sorted(classes_available_but_not_configured):
            short_name = class_name.split('#')[-1] if '#' in class_name else class_name
            tables = ALL_AVAILABLE_CLASSES.get(class_name, [])
            print(f"  ? {short_name:<40s} | Tables: {', '.join(tables)}")
    else:
        print("\n[OK] All available classes are configured for documentation")
    
    if classes_configured_but_not_available:
        print(f"\n[WARNING] {len(classes_configured_but_not_available)} CLASSES IN DOC CONFIG BUT NOT IN R2RML:")
        for class_name in sorted(classes_configured_but_not_available):
            short_name = class_name.split('#')[-1] if '#' in class_name else class_name
            print(f"  ? {short_name}")
    
    print("="*80 + "\n")
    
    print("\n[DEBUG] Returning content area data for documentation pipeline...")
    print(f"[DEBUG] Total class-area mappings to process: {len(results)}")
    
    return results

def get_table_name_for_class(cursor, class_name):
    """Get the table name associated with a class."""
    print(f"[DEBUG]     Looking up table for class: {class_name}")
    cursor.execute("""
        SELECT DISTINCT TABLE_NAME
        FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN
        WHERE R2RML_CLASS = :1
        AND ROWNUM = 1
    """, [class_name])
    result = cursor.fetchone()
    if result:
        print(f"[DEBUG]     Found table: {result[0]}")
    else:
        print(f"[DEBUG]     NO TABLE FOUND for class: {class_name}")
        # Try to find similar class names
        cursor.execute("""
            SELECT DISTINCT R2RML_CLASS 
            FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN 
            WHERE R2RML_CLASS LIKE :1
            AND ROWNUM <= 5
        """, [f'%{class_name.split("#")[-1][:10]}%'])
        similar = cursor.fetchall()
        if similar:
            print(f"[DEBUG]     Similar classes in R2RML_COLUMN table:")
            for s in similar:
                print(f"[DEBUG]       - {s[0]}")
    return result[0] if result else None

def build_iri_to_class_mapping(cursor):
    """Build a mapping of IRI patterns to class names."""
    print("\n[DEBUG] Building IRI to class mapping...")
    cursor.execute("""
        SELECT DISTINCT 
            R2RML_CLASS,
            TABLE_NAME,
            IRI_TEMPLATE
        FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN
        WHERE IS_KEY = 'Y' AND IRI_TEMPLATE IS NOT NULL
    """)
    
    iri_mapping = {}
    rows = cursor.fetchall()
    print(f"[DEBUG] Found {len(rows)} IRI templates")
    for row in rows:
        class_name, table_name, iri_template = row
        if iri_template:
            base_pattern = iri_template.split('{')[0] if '{' in iri_template else iri_template
            iri_mapping[base_pattern] = class_name
    
    return iri_mapping

def get_class_metadata(cursor, table_name, class_name=None):
    """
    Get class and predicate metadata for a class including predicate groups.
    NOW WITH FAST DATABASE-DRIVEN PREDICATE DISCOVERY (v48).
    
    FIXED v51: Query by R2RML_CLASS instead of TABLE_NAME to get ALL columns
    for the class across all tables, properly ordered by COLUMN_ORDER.
    """
    print(f"[DEBUG]     Fetching metadata for table: {table_name}, class: {class_name}")
    
    # If we have a class_name, query by CLASS to get ALL columns across all tables
    # This ensures proper ordering when a class spans multiple tables
    if class_name:
        cursor.execute("""
            SELECT  C.TABLE_NAME,
                    C.COLUMN_NAME,
                    C.R2RML_CLASS AS CLASS_NAME,
                    C.PREDICATE_DESCRIPTION,
                    C.XSD_DATATYPE,
                    C.IRI_TEMPLATE,
                    C.IS_KEY,
                    C.COLUMN_ORDER,
                    G.ORDER_SEQUENCE AS GROUP_COLUMN_NAME_SEQUENCE,
                    G.GROUP_PREDICATE_NAME
            FROM    GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN C
            LEFT JOIN   GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN_GROUP G
                    ON  G.TABLE_NAME = C.TABLE_NAME
                        AND G.COLUMN_NAME = C.COLUMN_NAME
            WHERE   C.R2RML_CLASS = :1
            ORDER BY C.TABLE_NAME, C.COLUMN_ORDER, G.GROUP_PREDICATE_NAME, G.ORDER_SEQUENCE
        """, [class_name])
    else:
        # Fallback to table-based query if no class specified
        cursor.execute("""
            SELECT  C.TABLE_NAME,
                    C.COLUMN_NAME,
                    C.R2RML_CLASS AS CLASS_NAME,
                    C.PREDICATE_DESCRIPTION,
                    C.XSD_DATATYPE,
                    C.IRI_TEMPLATE,
                    C.IS_KEY,
                    C.COLUMN_ORDER,
                    G.ORDER_SEQUENCE AS GROUP_COLUMN_NAME_SEQUENCE,
                    G.GROUP_PREDICATE_NAME
            FROM    GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN C
            LEFT JOIN   GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN_GROUP G
                    ON  G.TABLE_NAME = C.TABLE_NAME
                        AND G.COLUMN_NAME = C.COLUMN_NAME
            WHERE   C.TABLE_NAME = :1
            ORDER BY C.R2RML_CLASS, C.COLUMN_ORDER, G.GROUP_PREDICATE_NAME, G.ORDER_SEQUENCE
        """, [table_name])
    
    results = cursor.fetchall()
    print(f"[DEBUG]     Found {len(results)} base metadata rows")
    
    # Get class name if not provided
    if not class_name and results:
        class_name = results[0][2]
    
    # v52: Query external predicates table for this class
    # External table uses short class name (e.g., "Leaflet") or full IRI
    if class_name:
        short_class = class_name.split('#')[-1] if '#' in class_name else class_name
        print(f"[EXTERNAL]  Querying UTIL_GRF_R2RML_COLUMN_EXTERNAL for class='{class_name}' / short='{short_class}'")
        try:
            # Use TRIM to handle trailing whitespace in R2RML_CLASS values
            cursor.execute("""
                SELECT  E.TABLE_NAME,
                        E.SOURCE_FILE,
                        E.R2RML_CLASS,
                        E.PREDICATE_DESCRIPTION,
                        E.XSD_DATATYPE,
                        E.IRI_TEMPLATE,
                        E.IS_KEY,
                        E.COLUMN_ORDER,
                        E.R2RML_ROLE
                FROM    GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN_EXTERNAL E
                WHERE   TRIM(E.R2RML_CLASS) = TRIM(:1)
                   OR   TRIM(E.R2RML_CLASS) = TRIM(:2)
                ORDER BY E.COLUMN_ORDER
            """, [class_name, short_class])
            ext_rows = cursor.fetchall()
            
            if ext_rows:
                print(f"[EXTERNAL]  *** FOUND {len(ext_rows)} external predicate(s) ***")
                results = list(results)
                for ext_row in ext_rows:
                    ext_table, source_file, ext_class, ext_pred, ext_xsd, ext_iri, ext_key, ext_order, ext_role = ext_row
                    
                    # Key on predicate name (not source_file) to avoid collision
                    # when multiple external predicates share the same source file
                    col_key = f"[EXTERNAL] {ext_pred}"
                    
                    # Store R2RML_ROLE and SOURCE_FILE for rendering
                    EXTERNAL_PREDICATE_ROLES[col_key] = {
                        'r2rml_role': ext_role,
                        'source_file': source_file,
                        'predicate': ext_pred
                    }
                    
                    # Append as standard 10-element tuple (group fields = None)
                    results.append((
                        ext_table,          # TABLE_NAME
                        col_key,            # COLUMN_NAME slot (with [EXTERNAL] marker)
                        class_name,         # CLASS_NAME (use full IRI for consistency)
                        ext_pred,           # PREDICATE_DESCRIPTION
                        ext_xsd,            # XSD_DATATYPE
                        ext_iri,            # IRI_TEMPLATE
                        ext_key or 'N',     # IS_KEY
                        ext_order,          # COLUMN_ORDER
                        None,               # GROUP_COLUMN_NAME_SEQUENCE
                        None                # GROUP_PREDICATE_NAME
                    ))
                    
                    role_display = ext_role or 'unspecified'
                    print(f"[EXTERNAL]    + {ext_pred} (order={ext_order}, role={role_display}, source={source_file})")
            else:
                print(f"[EXTERNAL]  No external predicates found for class '{short_class}'")
                # Diagnostic: show what IS in the table for troubleshooting
                cursor.execute("""
                    SELECT DISTINCT TRIM(R2RML_CLASS), LENGTH(R2RML_CLASS)
                    FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN_EXTERNAL
                """)
                available = cursor.fetchall()
                if available:
                    print(f"[EXTERNAL]  Available R2RML_CLASS values in external table: {[f'{r[0]}(len={r[1]})' for r in available]}")
                    print(f"[EXTERNAL]  Looking for: '{class_name}'(len={len(class_name)}) or '{short_class}'(len={len(short_class)})")
                else:
                    print(f"[EXTERNAL]  External table is empty")
        except oracledb.DatabaseError as e:
            error_obj, = e.args
            if error_obj.code == 942:
                # ORA-00942: table or view does not exist - expected if not yet created
                print(f"[EXTERNAL]  Table UTIL_GRF_R2RML_COLUMN_EXTERNAL does not exist (ORA-00942) - skipping")
            else:
                # Real database error - print prominently
                print(f"[EXTERNAL]  *** DATABASE ERROR querying external predicates: {e} ***")
                print(f"[EXTERNAL]  *** External predicates for '{short_class}' will NOT appear in document ***")
        except Exception as e:
            print(f"[EXTERNAL]  *** UNEXPECTED ERROR querying external predicates: {type(e).__name__}: {e} ***")
            print(f"[EXTERNAL]  *** External predicates for '{short_class}' will NOT appear in document ***")
    
    # Fast database-driven supplement - still useful for cross-class references
    if class_name:
        results = supplement_metadata_with_database_check(results, cursor, class_name, table_name)
    
    return results

def organize_metadata_with_groups(metadata):
    """Organize metadata to handle predicate groups and deduplicate predicates.
    
    Sorts by TABLE_NAME then COLUMN_ORDER to match expected ordering.
    """
    groups = defaultdict(list)
    regular_predicates = []
    seen_predicates = set()  # Track predicates we've already added (normalized)
    
    def normalize_predicate(pred):
        """Normalize predicate name for deduplication."""
        if not pred:
            return None
        # Remove common prefixes
        p = pred.strip()
        if p.startswith('mt:'):
            p = p[3:]
        if p.startswith('skos:'):
            p = p[5:]
        return p.lower()
    
    # DEBUG: Print incoming metadata
    print(f"\n[DEBUG ORDERING] ========== INCOMING METADATA ({len(metadata)} rows) ==========")
    for i, row in enumerate(metadata):
        table_name, column_name, class_name, predicate, xsd_type, iri_template, is_key, order, group_seq, group_name = row
        print(f"[DEBUG ORDERING]   {i}: TABLE={table_name}, COL={column_name}, PRED={predicate}, ORDER={order}, IS_KEY={is_key}, GROUP={group_name}")
    
    for row in metadata:
        table_name, column_name, class_name, predicate, xsd_type, iri_template, is_key, order, group_seq, group_name = row
        
        if group_name:
            groups[group_name].append({
                'sequence': group_seq,
                'column_name': column_name,
                'predicate': predicate,
                'xsd_type': xsd_type,
                'iri_template': iri_template,
                'is_key': is_key,
                'order': order,
                'full_row': row
            })
        else:
            # Deduplicate: only add if we haven't seen this predicate before
            pred_key = normalize_predicate(predicate) or column_name.lower()
            if pred_key not in seen_predicates:
                seen_predicates.add(pred_key)
                regular_predicates.append(row)
                print(f"[DEBUG ORDERING]   ADDED: {predicate} (order={order}, table={table_name})")
            else:
                print(f"[DEBUG ORDERING]   SKIPPED (dup): {predicate} (order={order}, table={table_name})")
    
    for group_name in groups:
        groups[group_name].sort(key=lambda x: x['sequence'] if x['sequence'] else 0)
    
    # DEBUG: Print before sort
    print(f"\n[DEBUG ORDERING] ========== BEFORE SORT ({len(regular_predicates)} predicates) ==========")
    for i, row in enumerate(regular_predicates):
        print(f"[DEBUG ORDERING]   {i}: TABLE={row[0]}, PRED={row[3]}, ORDER={row[7]}")
    
    # Sort by TABLE_NAME (index 0) then COLUMN_ORDER (index 7)
    # This handles supplemented rows that were appended out of order
    regular_predicates.sort(key=lambda row: (row[0], row[7] if row[7] is not None else 9999))
    
    # DEBUG: Print after sort
    print(f"\n[DEBUG ORDERING] ========== AFTER SORT ==========")
    for i, row in enumerate(regular_predicates):
        print(f"[DEBUG ORDERING]   {i}: TABLE={row[0]}, PRED={row[3]}, ORDER={row[7]}")
    print(f"[DEBUG ORDERING] ==========================================\n")
    
    return regular_predicates, dict(groups)

def check_column_has_data(cursor, table_name, column_name):
    """Fast check: Does this column have ANY non-null data?"""
    try:
        safe_column = f'"{column_name}"'
        query = f"""
            SELECT 1 
            FROM GLOBAL_DISTRIBUTE.{table_name} 
            WHERE {safe_column} IS NOT NULL 
            AND ROWNUM = 1
        """
        cursor.execute(query)
        return cursor.fetchone() is not None
    except:
        return False



def get_sample_from_table(cursor, table_name, column_name):
    """
    Get a single sample value from specified table and column.
    Tries multiple strategies to find a good, meaningful example.
    
    Args:
        cursor: Database cursor
        table_name: Table to query
        column_name: Column to get sample from
        
    Returns:
        Sample value or placeholder if none found
    """
    safe_col = f'"{column_name}"'
    
    # Try progressively more lenient strategies to find a good sample
    strategies = [
        # Strategy 1: Best quality - non-null, not empty, not placeholder
        f"""
            SELECT {safe_col}
            FROM GLOBAL_DISTRIBUTE.{table_name}
            WHERE {safe_col} IS NOT NULL
            AND {safe_col} NOT IN ('-', '--', ' ', '', '0')
            AND ROWNUM = 1
        """,
        # Strategy 2: Good quality - non-null, not obvious placeholders
        f"""
            SELECT {safe_col}
            FROM GLOBAL_DISTRIBUTE.{table_name}
            WHERE {safe_col} IS NOT NULL
            AND {safe_col} NOT IN ('-', '--')
            AND ROWNUM = 1
        """,
        # Strategy 3: Acceptable - any non-null
        f"""
            SELECT {safe_col}
            FROM GLOBAL_DISTRIBUTE.{table_name}
            WHERE {safe_col} IS NOT NULL
            AND ROWNUM = 1
        """,
        # Strategy 4: Last resort - any value
        f"""
            SELECT {safe_col}
            FROM GLOBAL_DISTRIBUTE.{table_name}
            WHERE ROWNUM = 1
        """
    ]
    
    for strategy_num, query in enumerate(strategies, 1):
        try:
            cursor.execute(query.strip())
            result = cursor.fetchone()
            if result and result[0] is not None:
                val = result[0]
                str_val = str(val).strip()
                # Accept any non-empty, non-placeholder value
                if str_val and str_val not in ['-', '--', '']:
                    return val
        except Exception as e:
            # Column might not exist or query error
            if strategy_num == len(strategies):
                # All strategies failed - return placeholder
                return f"<{column_name.lower()}_value>"
            continue
    
    # If all strategies failed, return placeholder
    return f"<{column_name.lower()}_value>"


def get_all_class_predicates(cursor, class_name):
    """Get ALL predicates for a class from R2RML (including from other tables).
    
    Includes predicates FROM this class and predicates TO this class from other classes,
    but excludes self-referencing predicates (where target class = documented class).
    """
    short_class = class_name.split('#')[-1] if '#' in class_name else class_name
    
    # Predicates FROM this class
    cursor.execute("""
        SELECT DISTINCT TABLE_NAME, COLUMN_NAME, PREDICATE_DESCRIPTION, 
               XSD_DATATYPE, IRI_TEMPLATE, IS_KEY, COLUMN_ORDER, R2RML_CLASS
        FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN
        WHERE R2RML_CLASS = :1 AND PREDICATE_DESCRIPTION IS NOT NULL
    """, [class_name])
    predicates_from = cursor.fetchall()
    
    # Predicates TO this class (other tables pointing here)
    cursor.execute("""
        SELECT DISTINCT TABLE_NAME, COLUMN_NAME, PREDICATE_DESCRIPTION,
               XSD_DATATYPE, IRI_TEMPLATE, IS_KEY, COLUMN_ORDER, R2RML_CLASS
        FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN
        WHERE IRI_TEMPLATE IS NOT NULL 
        AND (IRI_TEMPLATE LIKE :1 OR IRI_TEMPLATE LIKE :2)
        AND PREDICATE_DESCRIPTION IS NOT NULL
        AND R2RML_CLASS != :3
    """, [f'%/{short_class}/%', f'%{short_class}/%', class_name])
    predicates_to = cursor.fetchall()
    
    all_preds = {}
    for table, col, pred, xsd, iri, key, order, r2rml_class in predicates_from:
        all_preds[(table, col, pred)] = {
            'table_name': table, 'column_name': col, 'predicate': pred,
            'xsd_type': xsd, 'iri_template': iri, 'is_key': key, 'order': order, 'direction': 'FROM'
        }
    
    for table, col, pred, xsd, iri, key, order, r2rml_class in predicates_to:
        # Skip self-referencing predicates (IRI points to the same class we're documenting)
        # Check if the IRI template contains the class name we're documenting
        if iri and f'/{short_class}/' in iri:
            # This predicate points TO the class we're documenting - skip it
            print(f"      [SKIP SELF-REF] {pred} from {table} points to {short_class} (self-reference)")
            continue
        
        key_tuple = (table, col, pred)
        if key_tuple not in all_preds:
            all_preds[key_tuple] = {
                'table_name': table, 'column_name': col, 'predicate': pred,
                'xsd_type': xsd, 'iri_template': iri, 'is_key': key, 'order': order, 'direction': 'TO'
            }
    
    return list(all_preds.values())

def supplement_metadata_with_database_check(metadata, cursor, class_name, table_name):
    """
    Fast database-driven predicate discovery with sample collection.
    
    Returns:
        tuple: (enhanced_metadata, supplemental_samples_dict)
    """
    all_predicates = get_all_class_predicates(cursor, class_name)
    if not all_predicates:
        return metadata, {}
    
    # Normalize predicate names for comparison
    def normalize_pred(pred):
        if not pred:
            return None
        p = pred.strip()
        if p.startswith('mt:'):
            p = p[3:]
        return p.lower()
    
    existing_preds = {normalize_pred(row[3]) for row in metadata if row[3]}
    missing_with_data = []
    
    for pred_info in all_predicates:
        norm_pred = normalize_pred(pred_info['predicate'])
        if norm_pred in existing_preds:
            continue
        if check_column_has_data(cursor, pred_info['table_name'], pred_info['column_name']):
            print(f"      ✓ FOUND: {pred_info['predicate']} has data in {pred_info['table_name']}.{pred_info['column_name']}")
            missing_with_data.append(pred_info)
            existing_preds.add(norm_pred)  # Prevent adding same predicate from multiple tables
    
    if not missing_with_data:
        return metadata, {}
    
    print(f"    [DISCOVERY] Adding {len(missing_with_data)} predicates WITH samples from source tables")
    supplemented = list(metadata)
    max_order = max([row[7] for row in metadata if row[7] is not None], default=1000)
    
    # NEW: Collect samples from source tables immediately
    supplemental_samples = {}
    
    for idx, pred_info in enumerate(missing_with_data, 1):
        source_table = pred_info['table_name']
        source_column = pred_info['column_name']
        predicate = pred_info['predicate']
        
        # Get sample from the actual source table
        sample_value = get_sample_from_table(cursor, source_table, source_column)
        
        # Store with key matching the column name we'll use in metadata
        sample_key = f"[DB-VERIFIED] {source_column}"
        supplemental_samples[sample_key] = sample_value
        
        print(f"      • {predicate}: sample='{sample_value}' from {source_table}.{source_column}")
        
        supplemented.append((
            source_table,                      # Keep source table for reference
            sample_key,                        # Column name with marker
            class_name,
            predicate,
            pred_info['xsd_type'],
            pred_info['iri_template'],
            pred_info['is_key'] or 'N',
            pred_info['order'],                # Use ACTUAL column order from source, not artificial max+idx
            None,
            None
        ))
    
    print(f"    [SUCCESS] Enhanced: {len(metadata)} → {len(supplemented)} predicates with {len(supplemental_samples)} samples")
    
    return supplemented, supplemental_samples

def get_sample_data(cursor, table_name):
    """Get sample data for EVERY column in the table - bulletproof version."""
    samples = {}
    
    try:
        # First, get ALL columns from the R2RML metadata
        cursor.execute("""
            SELECT DISTINCT COLUMN_NAME 
            FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN 
            WHERE TABLE_NAME = :1
        """, [table_name])
        r2rml_columns = [row[0] for row in cursor.fetchall()]
        
        if not r2rml_columns:
            return {}
        
        # Get data types for each column to handle queries appropriately
        column_types = {}
        for col in r2rml_columns:
            try:
                query = """
                    SELECT XSD_DATATYPE
                    FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN
                    WHERE TABLE_NAME = :1 AND COLUMN_NAME = :2
                    AND ROWNUM = 1
                """
                cursor.execute(query, [table_name, col])
                result = cursor.fetchone()
                column_types[col] = result[0] if result and result[0] else 'xsd:string'
            except Exception as e:
                column_types[col] = 'xsd:string'
        
        # Process all columns individually with type-aware queries
        for col in r2rml_columns:
            safe_col = f'"{col}"'
            xsd_type = column_types.get(col, 'xsd:string')
            is_string_type = 'string' in xsd_type.lower() or not xsd_type
            
            not_dash_condition = "AND {} NOT IN ('-', '--')".format(safe_col) if is_string_type else ""
            not_empty_condition = "AND {} NOT IN (' ', '')".format(safe_col) if is_string_type else ""
            
            strategies = [
                """
                    SELECT {}
                    FROM GLOBAL_DISTRIBUTE.{}
                    WHERE {} IS NOT NULL
                    {}
                    {}
                    AND ROWNUM <= 10
                """.format(safe_col, table_name, safe_col, not_dash_condition, not_empty_condition).strip(),
                
                """
                    SELECT {}
                    FROM GLOBAL_DISTRIBUTE.{}
                    WHERE {} IS NOT NULL
                    {}
                    AND ROWNUM <= 5
                """.format(safe_col, table_name, safe_col, not_dash_condition).strip(),
                
                """
                    SELECT {}
                    FROM GLOBAL_DISTRIBUTE.{}
                    WHERE {} IS NOT NULL
                    AND ROWNUM <= 5
                """.format(safe_col, table_name, safe_col),
                
                """
                    SELECT {}
                    FROM GLOBAL_DISTRIBUTE.{}
                    WHERE ROWNUM = 1
                """.format(safe_col, table_name)
            ]
            
            for i, query in enumerate(strategies):
                if col in samples:
                    break
                    
                try:
                    cursor.execute(query)
                    results = cursor.fetchmany(10)
                    
                    best_value = None
                    for row in results:
                        if row and row[0] is not None:
                            val = row[0]
                            str_val = str(val).strip()
                            
                            if str_val and str_val not in ['-', '--', '0', '']:
                                samples[col] = val
                                break
                            elif str_val and best_value is None:
                                best_value = val
                    
                    if col not in samples and best_value is not None:
                        samples[col] = best_value
                            
                except Exception as e:
                    continue
        
        # Strategy 3: For still-missing columns, try bulk fetch
        still_missing = [col for col in r2rml_columns if col not in samples]
        
        if still_missing:
            try:
                bulk_query = f"""
                    SELECT * FROM GLOBAL_DISTRIBUTE.{table_name}
                    WHERE ROWNUM <= 1000
                """
                cursor.execute(bulk_query)
                
                col_names = [desc[0] for desc in cursor.description]
                rows = cursor.fetchall()
                
                for col in still_missing:
                    if col in col_names:
                        col_idx = col_names.index(col)
                        
                        best_value = None
                        for row in rows:
                            val = row[col_idx]
                            if val is not None:
                                str_val = str(val).strip()
                                
                                if str_val and str_val not in ['-', '--', '0', '']:
                                    samples[col] = val
                                    break
                                elif str_val and best_value is None:
                                    best_value = val
                        
                        if col not in samples and best_value is not None:
                            samples[col] = best_value
                            
            except Exception as e:
                pass
        
        # Strategy 4: Generate synthetic examples for still-missing columns
        final_missing = [col for col in r2rml_columns if col not in samples]
        
        if final_missing:
            for col in final_missing:
                xsd_type = column_types.get(col, 'xsd:string')
                
                if 'integer' in xsd_type.lower():
                    samples[col] = 12345
                elif 'decimal' in xsd_type.lower() or 'float' in xsd_type.lower():
                    samples[col] = 123.45
                elif 'boolean' in xsd_type.lower():
                    samples[col] = 'Y'
                elif 'date' in xsd_type.lower():
                    samples[col] = '2024-01-15'
                else:
                    samples[col] = f'Example {col.replace("_", " ").title()}'
        
        return samples
        
    except Exception as e:
        try:
            cursor.execute("""
                SELECT COLUMN_NAME
                FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN
                WHERE TABLE_NAME = :1
            """, [table_name])
            
            for row in cursor.fetchall():
                col = row[0]
                if col not in samples:
                    samples[col] = f'[Example {col}]'
                    
        except:
            pass
            
        return samples

def check_column_nullability(cursor, table_name, columns, column_tables=None):
    """Check if columns are optional or required.
    
    Args:
        cursor: Database cursor
        table_name: Default table name for columns without specific mapping
        columns: List of column names to check
        column_tables: Optional dict mapping column_name -> source_table_name
    """
    nullability = {}
    column_tables = column_tables or {}
    
    for col in columns:
        try:
            # Use specific source table if provided, otherwise use default
            source_table = column_tables.get(col, table_name)
            
            # Skip EXTERNAL columns - no DB data to check nullability against
            if col.startswith('[EXTERNAL]'):
                nullability[col] = 'Optional'
                continue
            
            # Strip [DB-VERIFIED] prefix if present
            clean_col = clean_column_name(col)
            safe_col = f'"{clean_col}"'
            
            query = f"""
                SELECT 
                    CASE WHEN COUNT(*) - COUNT({safe_col}) > 0 
                         THEN 'Optional' 
                         ELSE 'Required' 
                    END AS nullability
                FROM GLOBAL_DISTRIBUTE.{source_table}
                WHERE ROWNUM <= 1000
            """
            cursor.execute(query)
            result = cursor.fetchone()
            nullability[col] = result[0] if result else 'Optional'
        except Exception as e:
            # Default to Optional if we can't determine
            nullability[col] = 'Optional'
    
    return nullability

def format_sample_value(value, xsd_type):
    """Format sample value for display."""
    if value is None:
        return "[No sample available]"
    
    if isinstance(value, str) and value.startswith('[') and value.endswith(']'):
        return value
    
    str_val = str(value).strip()
    
    if str_val in ['', '-', '--', 'NULL', '[NULL]', 'None']:
        return "[Empty/Null value]"
    
    if isinstance(value, (bytes, bytearray)):
        try:
            decoded = value.decode('utf-8')
            if decoded.strip() in ['', '-', '--']:
                return "[Binary data]"
            return decoded[:100]
        except:
            hex_val = value.hex().upper()[:100]
            if hex_val:
                return hex_val
            return "[Binary data]"
    
    if hasattr(value, 'read'):
        try:
            content = value.read()
            if isinstance(content, bytes):
                decoded = content.decode('utf-8')
            else:
                decoded = str(content)
            
            if decoded.strip() in ['', '-', '--']:
                return "[LOB data]"
            return decoded[:100]
        except:
            return "[LOB data]"
    
    if len(str_val) > 100:
        return str_val[:100] + "..."
    
    return str_val

def set_cell_no_wrap(cell):
    """Set a table cell to not wrap text."""
    tcPr = cell._element.get_or_add_tcPr()
    tcW = OxmlElement('w:tcW')
    tcW.set(qn('w:type'), 'auto')
    tcPr.append(tcW)
    
    for paragraph in cell.paragraphs:
        pPr = paragraph._element.get_or_add_pPr()
        noWrap = OxmlElement('w:noWrap')
        pPr.append(noWrap)

def set_column_widths(table):
    """Set specific column widths for the predicate table."""
    tbl = table._tbl
    tblGrid = tbl.tblGrid
    
    col_widths = [
        Inches(2),
        Inches(1.5),
        Inches(2.5),
        Inches(0.5)
    ]
    
    for i, width in enumerate(col_widths):
        if i < len(tblGrid.gridCol_lst):
            tblGrid.gridCol_lst[i].w = width


def get_class_properties_from_owl(class_name):
    """Get all object and datatype properties that apply to a class."""
    object_props = []
    datatype_props = []
    
    for prop_name, prop_info in OWL_OBJECT_PROPERTIES.items():
        domain = prop_info['domain']
        if domain:
            if isinstance(domain, list):
                if class_name in domain:
                    object_props.append((prop_name, prop_info))
            elif domain == class_name:
                object_props.append((prop_name, prop_info))
    
    for prop_name, prop_info in OWL_DATATYPE_PROPERTIES.items():
        domain = prop_info['domain']
        if domain == class_name:
            datatype_props.append((prop_name, prop_info))
    
    return object_props, datatype_props


def add_class_documentation(doc, cursor, table_name, metadata, sample_data, is_last_class_in_area=False, is_last_area=False):
    """Add documentation for a single class with OWL property information."""
    global CLASSES_DOCUMENTED
    
    if not metadata:
        print(f"[DEBUG]     No metadata to document!")
        return
    
    class_name = metadata[0][2]
    CLASSES_DOCUMENTED.add(class_name)
    print(f"[DEBUG]     Documenting class: {class_name}")
    print(f"[DEBUG]     OWL Object Properties available: {len(OWL_OBJECT_PROPERTIES)}")
    print(f"[DEBUG]     OWL Datatype Properties available: {len(OWL_DATATYPE_PROPERTIES)}")
    
    # Extract short class name
    short_class_name = class_name.split('#')[-1] if '#' in class_name else class_name
    
    heading = add_styled_heading(doc, f"RDF Class: {class_name}", 3)
    
    para = doc.add_paragraph()
    run = para.add_run("Subject pattern: ")
    run.italic = True
    run.font.size = Pt(8)
    
    key_col = None
    key_template = None
    key_xsd_type = None
    for row in metadata:
        if row[6] == 'Y':
            key_col = row[1]
            key_xsd_type = row[4]  # Capture XSD datatype from metadata
            raw_template = row[5] or f"{get_instance_base_url()}{key_col.lower()}/{{value}}"
            key_template = transform_iri_template(raw_template)
            run = para.add_run(key_template.replace('{' + key_col + '}', '{value}'))
            run.font.size = Pt(8)
            break
    
    # Add OWL property information
    obj_props, data_props = get_class_properties_from_owl(short_class_name)
    
    # Create a lookup for OWL properties
    owl_obj_props_dict = {prop_name: prop_info for prop_name, prop_info in obj_props}
    owl_data_props_dict = {prop_name: prop_info for prop_name, prop_info in data_props}
    
    regular_predicates, predicate_groups = organize_metadata_with_groups(metadata)
    
    # Build column list and source table mapping for nullability check
    all_columns = set()
    column_tables = {}  # Map column_name -> source_table_name
    for row in metadata:
        col_name = row[1]
        source_table = row[0]  # TABLE_NAME is first column
        all_columns.add(col_name)
        column_tables[col_name] = source_table
    
    nullability = check_column_nullability(cursor, table_name, list(all_columns), column_tables)
    
    # v52: Override nullability for external predicates (no DB data to check)
    for col_name in all_columns:
        if col_name in EXTERNAL_PREDICATE_ROLES:
            # External predicates default to Optional unless IS_KEY = Y
            nullability[col_name] = 'Optional'
    
    table = doc.add_table(rows=1, cols=4)
    table.style = 'Light Grid'
    
    set_column_widths(table)
    
    headers = ['Predicate [-> Class (inverse)]', 'Object Type', 'Example Value', 'Required/Optional']
    hdr_cells = table.rows[0].cells
    for i, header in enumerate(headers):
        hdr_cells[i].text = header
        hdr_cells[i].paragraphs[0].runs[0].font.bold = True
        hdr_cells[i].paragraphs[0].runs[0].font.size = Pt(7)
        hdr_cells[i].paragraphs[0].runs[0].font.color.rgb = RGBColor(255, 255, 255)
        shading_elm = OxmlElement('w:shd')
        shading_elm.set(qn('w:fill'), '4A6FA5')
        hdr_cells[i]._element.get_or_add_tcPr().append(shading_elm)
        set_cell_no_wrap(hdr_cells[i])
    
    processed_columns = set()
    
    for group_name, group_members in predicate_groups.items():
        for member in group_members:
            processed_columns.add(member['column_name'])
    
    
    # Add skos:notation row (automatically added by TTL generation script)
    if key_col and key_xsd_type:
        row_cells = table.add_row().cells
        
        row_cells[0].text = 'skos:notation'
        row_cells[0].paragraphs[0].runs[0].font.size = Pt(7)
        set_cell_no_wrap(row_cells[0])
        
        row_cells[1].text = key_xsd_type if key_xsd_type.startswith('xsd:') else f"xsd:{key_xsd_type.replace('xsd:', '')}"
        row_cells[1].paragraphs[0].runs[0].font.size = Pt(7)
        set_cell_no_wrap(row_cells[1])
        
        # Use the key column value as the example
        if key_col in sample_data:
            example_val = format_sample_value(sample_data[key_col], key_xsd_type)
            row_cells[2].text = example_val
        else:
            row_cells[2].text = "[Key value]"
        
        if row_cells[2].paragraphs[0].runs:
            row_cells[2].paragraphs[0].runs[0].font.size = Pt(7)
            row_cells[2].paragraphs[0].runs[0].font.color.rgb = RGBColor(100, 100, 100)
        
        row_cells[3].text = 'Required'
        row_cells[3].paragraphs[0].runs[0].font.size = Pt(7)
        set_cell_no_wrap(row_cells[3])
    
    print(f"\n[DEBUG RENDER] ========== RENDERING PREDICATES ==========")
    render_index = 0
    for row in regular_predicates:
        table_name, column_name, class_name, predicate, xsd_type, iri_template, is_key, order = row[:8]
        
        if column_name in processed_columns or is_key == 'Y':
            print(f"[DEBUG RENDER]   SKIP: {predicate} (is_key={is_key}, in_processed={column_name in processed_columns})")
            continue
        
        render_index += 1
        is_ext = column_name.startswith('[EXTERNAL]')
        ext_tag = " [EXTERNAL]" if is_ext else ""
        print(f"[DEBUG RENDER]   {render_index}: RENDERING {predicate} from {table_name} (order={order}){ext_tag}")
        
        row_cells = table.add_row().cells
        
        # Extract predicate name - handle full URIs, prefixed names, and local names
        pred_value = predicate or column_name
        if '#' in pred_value:
            # Full URI: extract local name after #
            pred_name = pred_value.split('#')[-1]
        elif ':' in pred_value:
            # Prefixed name: extract local name after :
            pred_name = pred_value.split(':')[-1]
        else:
            # Already just the local name
            pred_name = pred_value
        
        # Check if this is an OWL property
        is_owl_obj_prop = pred_name in owl_obj_props_dict
        is_owl_data_prop = pred_name in owl_data_props_dict
        
        # Debug logging
        print(f"[DEBUG]       Processing predicate: '{predicate}' -> extracted name: '{pred_name}'")
        print(f"[DEBUG]         Is OWL Object Property: {is_owl_obj_prop}")
        print(f"[DEBUG]         Is OWL Datatype Property: {is_owl_data_prop}")
        if is_owl_obj_prop:
            print(f"[DEBUG]         OWL Object Property Info: {owl_obj_props_dict[pred_name]}")
        
        # Format the predicate column - ALWAYS use full format for IRI predicates
        if is_owl_obj_prop:
            # Object property with OWL definition - show full definition with range and inverse
            prop_info = owl_obj_props_dict[pred_name]
            range_val = prop_info['range']
            inverse = prop_info['inverse']
            pred_text = f"mt:{pred_name} -> mt:{range_val}"
            if inverse:
                pred_text += f"\n(inverse: mt:{inverse})"
            row_cells[0].text = pred_text
        elif iri_template:
            # IRI predicate without OWL definition - extract target class from template
            target_class = None
            
            # Debug: show what we're trying to match
            print(f"[DEBUG]         IRI template: '{iri_template}'")
            
            # Pattern 1: Template with {COLUMN} placeholder - /ClassName/{
            match = re.search(r'/([A-Z][a-zA-Z0-9_]+)/\{', iri_template)
            if match:
                target_class = match.group(1)
                print(f"[DEBUG]         Pattern 1 matched: {target_class}")
            
            # Pattern 2: Look for /ClassName/ pattern anywhere in the URL
            # This catches cases like http://host/graph/ClassName/{col} or http://host/ClassName/{col}
            if not target_class:
                # Find all uppercase-starting path segments
                matches = re.findall(r'/([A-Z][a-zA-Z0-9_]+)/', iri_template)
                if matches:
                    # Use the last one that looks like a class name (not a graph name)
                    for m in reversed(matches):
                        # Skip known non-class segments
                        if m not in ('GLOBAL_DISTRIBUTE', 'XMLSchema'):
                            target_class = m
                            print(f"[DEBUG]         Pattern 2 matched: {target_class}")
                            break
            
            # Pattern 3: URL ending with /ClassName (no trailing slash or value)
            if not target_class:
                match = re.search(r'/([A-Z][a-zA-Z0-9_]+)$', iri_template)
                if match:
                    target_class = match.group(1)
                    print(f"[DEBUG]         Pattern 3 matched: {target_class}")
            
            if not target_class:
                print(f"[DEBUG]         NO PATTERN MATCHED for template: '{iri_template}'")
            
            if target_class:
                pred_text = f"mt:{pred_name} -> mt:{target_class}"
                row_cells[0].text = pred_text
            else:
                row_cells[0].text = f"mt:{pred_name}" if not pred_name.startswith('mt:') else pred_name
        else:
            # Datatype property
            row_cells[0].text = f"mt:{pred_name}" if pred_name and not pred_name.startswith('mt:') and not pred_name.startswith('skos:') else (predicate or column_name)
        row_cells[0].paragraphs[0].runs[0].font.size = Pt(7)
        set_cell_no_wrap(row_cells[0])
        
        # Format the Object Type column
        # External and DB predicates both use XSD_DATATYPE / OWL / IRI logic
        if iri_template:
            row_cells[1].text = "IRI"
        elif is_owl_data_prop:
            # Use OWL datatype property range
            prop_info = owl_data_props_dict[pred_name]
            range_val = prop_info['range']
            row_cells[1].text = f"xsd:{range_val}" if range_val else "xsd:string"
        elif xsd_type:
            row_cells[1].text = xsd_type if xsd_type.startswith('xsd:') else f"xsd:{xsd_type.replace('xsd:', '')}"
        else:
            row_cells[1].text = "xsd:string"
        row_cells[1].paragraphs[0].runs[0].font.size = Pt(7)
        set_cell_no_wrap(row_cells[1])
        
        example_text = ""
        # Extract clean column name (without [DB-VERIFIED] or [EXTERNAL] prefix)
        cleaned_col = clean_column_name(column_name)
        
        # Try to find sample data - check both prefixed and clean column names
        sample_value = None
        if column_name in sample_data:
            sample_value = sample_data[column_name]
        elif cleaned_col in sample_data:
            sample_value = sample_data[cleaned_col]
        
        if sample_value is not None:
            example = format_sample_value(sample_value, xsd_type)
            if iri_template and example and not example.startswith('['):
                # Transform the IRI template to include graph name
                transformed_template = transform_iri_template(iri_template)
                # Try replacement with clean column name (what's in the template)
                example_text = transformed_template.replace('{' + cleaned_col + '}', str(example))
                # If replacement didn't work, the template still contains {}, show just the example
                if '{' in example_text:
                    example_text = example
            else:
                example_text = example
        else:
            example_text = f"[No sample found for {cleaned_col}]"
        
        if not example_text:
            example_text = "[No example available]"
        
        row_cells[2].text = example_text
        if row_cells[2].paragraphs[0].runs:
            row_cells[2].paragraphs[0].runs[0].font.size = Pt(7)
            row_cells[2].paragraphs[0].runs[0].font.color.rgb = RGBColor(100, 100, 100)
        
        # Get nullability - try both prefixed and clean column names
        null_val = nullability.get(column_name)
        if null_val is None:
            null_val = nullability.get(cleaned_col, 'Optional')
        row_cells[3].text = null_val
        row_cells[3].paragraphs[0].runs[0].font.size = Pt(7)
        set_cell_no_wrap(row_cells[3])
    
    for group_name, group_members in predicate_groups.items():
        row_cells = table.add_row().cells
        
        predicates_list = [member['predicate'] or member['column_name'] for member in group_members]
        row_cells[0].text = f"{group_name} [\n  " + ",\n  ".join(predicates_list) + "\n]"
        row_cells[0].paragraphs[0].runs[0].font.size = Pt(7)
        row_cells[0].paragraphs[0].runs[0].font.bold = True
        
        row_cells[1].text = "Blank Node Group"
        row_cells[1].paragraphs[0].runs[0].font.size = Pt(7)
        row_cells[1].paragraphs[0].runs[0].font.italic = True
        set_cell_no_wrap(row_cells[1])
        
        example_parts = []
        for member in group_members:
            col_name = member['column_name']
            clean_col_name = clean_column_name(col_name)
            
            # Try to find sample data - check both prefixed and clean column names
            sample_value = None
            if col_name in sample_data:
                sample_value = sample_data[col_name]
            elif clean_col_name in sample_data:
                sample_value = sample_data[clean_col_name]
            
            if sample_value is not None:
                example_val = format_sample_value(sample_value, member['xsd_type'])
                if member['iri_template'] and example_val and not example_val.startswith('['):
                    # Transform the IRI template to include graph name
                    transformed_template = transform_iri_template(member['iri_template'])
                    example_val = transformed_template.replace('{' + clean_col_name + '}', str(example_val))
                    # If replacement didn't work, show just the example
                    if '{' in example_val:
                        example_val = str(sample_value)
            else:
                example_val = f"[No sample for {clean_col_name}]"
            
            example_parts.append(f"  {member['predicate']}: {example_val}")
        
        if example_parts:
            row_cells[2].text = "[\n" + "\n".join(example_parts) + "\n]"
        else:
            row_cells[2].text = "[No examples available for group]"
            
        if row_cells[2].text and row_cells[2].paragraphs[0].runs:
            row_cells[2].paragraphs[0].runs[0].font.size = Pt(7)
            row_cells[2].paragraphs[0].runs[0].font.color.rgb = RGBColor(100, 100, 100)
        
        group_nullabilities = []
        for member in group_members:
            col_name = member['column_name']
            # Try both prefixed and clean column names
            null_val = nullability.get(col_name)
            if null_val is None:
                clean_col = clean_column_name(col_name)
                null_val = nullability.get(clean_col, 'Optional')
            group_nullabilities.append(null_val)
        
        if len(set(group_nullabilities)) == 1:
            row_cells[3].text = group_nullabilities[0]
        else:
            row_cells[3].text = "Mixed (varies by member)"
        row_cells[3].paragraphs[0].runs[0].font.size = Pt(7)
        set_cell_no_wrap(row_cells[3])
    
    if not (is_last_area and is_last_class_in_area):
        if is_last_class_in_area:
            doc.add_paragraph()
        else:
            doc.add_paragraph()

def print_owl_consistency_validation():
    """
    Validate consistency between OWL definitions, DB metadata, and documented classes.
    This ensures:
    1. All classes in DB metadata are also defined in OWL
    2. All OWL classes have a section in the document (if configured)
    """
    print("\n" + "="*80)
    print("OWL CONSISTENCY VALIDATION")
    print("="*80)
    
    # Extract short names for easier comparison
    db_classes_short = {cls.split('#')[-1] if '#' in cls else cls 
                        for cls in ALL_AVAILABLE_CLASSES.keys()}
    owl_classes_short = set(OWL_CLASSES)
    doc_classes_short = {cls.split('#')[-1] if '#' in cls else cls 
                        for cls in CLASSES_DOCUMENTED}
    config_classes_short = {cls.split('#')[-1] if '#' in cls else cls 
                           for cls in CLASSES_IN_DOC_CONFIG}
    
    # Check 1: Classes in DB metadata but NOT in OWL definitions
    db_not_in_owl = db_classes_short - owl_classes_short
    if db_not_in_owl:
        print(f"\n[CRITICAL] CHECK 1: {len(db_not_in_owl)} CLASSES IN DB METADATA BUT NOT IN OWL DEFINITIONS:")
        print("   These classes have R2RML mappings but no OWL class definition in the TTL file.")
        print("   Action: Add owl:Class definitions for these classes in your TTL ontology.\n")
        for cls_name in sorted(db_not_in_owl):
            full_name = f"{get_base_namespace()}{cls_name}"
            tables = ALL_AVAILABLE_CLASSES.get(full_name, [])
            print(f"   ✗ {cls_name:<40s} | Tables: {', '.join(tables)}")
    else:
        print("\n[✓] CHECK 1: All DB metadata classes have OWL definitions")
    
    # Check 2: Classes in OWL but NOT in DB metadata
    owl_not_in_db = owl_classes_short - db_classes_short
    if owl_not_in_db:
        print(f"\n[WARNING] CHECK 2: {len(owl_not_in_db)} OWL CLASSES WITHOUT DB METADATA:")
        print("   These classes are defined in OWL but have no R2RML mapping.")
        print("   This may be intentional if they're abstract classes or future classes.\n")
        for cls_name in sorted(owl_not_in_db):
            print(f"   ? {cls_name}")
    else:
        print("\n[✓] CHECK 2: All OWL classes have DB metadata")
    
    # Check 3: OWL classes that are configured but NOT documented
    owl_configured_not_documented = (owl_classes_short & config_classes_short) - doc_classes_short
    if owl_configured_not_documented:
        print(f"\n[CRITICAL] CHECK 3: {len(owl_configured_not_documented)} OWL CLASSES CONFIGURED BUT NOT DOCUMENTED:")
        print("   These classes are in OWL, configured for documentation, but missing from the document.")
        print("   Action: Investigate why these classes were skipped during documentation.\n")
        for cls_name in sorted(owl_configured_not_documented):
            full_name = f"{get_base_namespace()}{cls_name}"
            reason = CLASSES_SKIPPED.get(full_name, "Unknown reason")
            print(f"   ✗ {cls_name:<40s} | Reason: {reason}")
    else:
        print("\n[✓] CHECK 3: All configured OWL classes are documented")
    
    # Check 4: OWL classes that should have documentation sections
    owl_in_db = owl_classes_short & db_classes_short
    owl_in_db_not_documented = owl_in_db - doc_classes_short
    if owl_in_db_not_documented:
        print(f"\n[INFO] CHECK 4: {len(owl_in_db_not_documented)} OWL CLASSES WITH METADATA BUT NOT DOCUMENTED:")
        print("   These classes are in both OWL and DB metadata but not documented.")
        print("   Reason: They are not in the UTIL_GRF_DOC_CLASS_CONTENT_AREA configuration.\n")
        for cls_name in sorted(owl_in_db_not_documented):
            full_name = f"{get_base_namespace()}{cls_name}"
            tables = ALL_AVAILABLE_CLASSES.get(full_name, [])
            in_config = cls_name in config_classes_short
            status = "IN CONFIG" if in_config else "NOT IN CONFIG"
            print(f"   → {cls_name:<40s} | Tables: {', '.join(tables):<30s} | {status}")
    else:
        print("\n[✓] CHECK 4: All OWL classes with metadata are documented (or intentionally excluded)")
    
    # Summary statistics
    print(f"\n" + "-"*80)
    print("SUMMARY STATISTICS:")
    print(f"   OWL Classes defined:              {len(owl_classes_short)}")
    print(f"   DB Classes with R2RML metadata:   {len(db_classes_short)}")
    print(f"   Classes in both OWL and DB:       {len(owl_in_db)}")
    print(f"   Classes configured for docs:      {len(config_classes_short)}")
    print(f"   Classes documented:               {len(doc_classes_short)}")
    
    # Calculate consistency scores
    if db_classes_short:
        db_owl_consistency = (len(db_classes_short - db_not_in_owl) / len(db_classes_short)) * 100
        print(f"\n   DB→OWL Consistency:    {db_owl_consistency:.1f}% ({len(db_classes_short - db_not_in_owl)}/{len(db_classes_short)})")
    
    if owl_in_db:
        owl_doc_coverage = (len(owl_in_db & doc_classes_short) / len(owl_in_db)) * 100
        print(f"   OWL→Doc Coverage:      {owl_doc_coverage:.1f}% ({len(owl_in_db & doc_classes_short)}/{len(owl_in_db)})")
    
    print("="*80)

def print_final_validation_report():
    """Print comprehensive validation report at the end."""
    print("\n" + "="*80)
    print("FINAL VALIDATION REPORT")
    print("="*80)
    
    print(f"\n1. INVENTORY:")
    print(f"   - Total classes in R2RML metadata: {len(ALL_AVAILABLE_CLASSES)}")
    print(f"   - Classes in doc configuration:     {len(CLASSES_IN_DOC_CONFIG)}")
    print(f"   - Classes documented:               {len(CLASSES_DOCUMENTED)}")
    print(f"   - Classes skipped:                  {len(CLASSES_SKIPPED)}")
    
    # Classes documented successfully
    print(f"\n2. DOCUMENTED CLASSES ({len(CLASSES_DOCUMENTED)}):")
    for class_name in sorted(CLASSES_DOCUMENTED):
        short_name = class_name.split('#')[-1] if '#' in class_name else class_name
        print(f"   [OK] {short_name}")
    
    # Classes skipped
    if CLASSES_SKIPPED:
        print(f"\n3. SKIPPED CLASSES ({len(CLASSES_SKIPPED)}):")
        for class_name, reason in sorted(CLASSES_SKIPPED.items()):
            short_name = class_name.split('#')[-1] if '#' in class_name else class_name
            print(f"   [SKIP] {short_name:<40s} | Reason: {reason}")
    
    # Classes missing from documentation
    classes_in_config_not_documented = CLASSES_IN_DOC_CONFIG - CLASSES_DOCUMENTED
    if classes_in_config_not_documented:
        print(f"\n4. CONFIGURED BUT NOT DOCUMENTED ({len(classes_in_config_not_documented)}):")
        for class_name in sorted(classes_in_config_not_documented):
            short_name = class_name.split('#')[-1] if '#' in class_name else class_name
            reason = CLASSES_SKIPPED.get(class_name, "Unknown reason")
            print(f"   ? {short_name:<40s} | {reason}")
    
    # Classes available but not configured
    classes_available_not_configured = set(ALL_AVAILABLE_CLASSES.keys()) - CLASSES_IN_DOC_CONFIG
    if classes_available_not_configured:
        print(f"\n5. AVAILABLE IN R2RML BUT NOT CONFIGURED ({len(classes_available_not_configured)}):")
        print("   These classes exist in the database but are not in the doc configuration tables.")
        print("   To include them, add entries to UTIL_GRF_DOC_CLASS_CONTENT_AREA.\n")
        for class_name in sorted(classes_available_not_configured):
            short_name = class_name.split('#')[-1] if '#' in class_name else class_name
            tables = ALL_AVAILABLE_CLASSES.get(class_name, [])
            print(f"   ? {short_name:<40s} | Tables: {', '.join(tables)}")
    
    # Success rate
    if CLASSES_IN_DOC_CONFIG:
        success_rate = (len(CLASSES_DOCUMENTED) / len(CLASSES_IN_DOC_CONFIG)) * 100
        print(f"\n6. SUCCESS RATE:")
        print(f"   {success_rate:.1f}% of configured classes were documented")
        print(f"   ({len(CLASSES_DOCUMENTED)}/{len(CLASSES_IN_DOC_CONFIG)})")
    
    # External predicates summary
    if EXTERNAL_PREDICATE_ROLES:
        print(f"\n7. EXTERNAL PREDICATES FROM UTIL_GRF_R2RML_COLUMN_EXTERNAL ({len(EXTERNAL_PREDICATE_ROLES)}):")
        for col_key, ext_info in sorted(EXTERNAL_PREDICATE_ROLES.items()):
            pred = ext_info.get('predicate', col_key)
            role = ext_info.get('r2rml_role', 'unspecified')
            src = ext_info.get('source_file', 'unknown')
            print(f"   [EXT] {pred:<40s} | role={role}, source={src}")
    
    print("\n" + "="*80)

def main():
    """Main execution function."""
    global CLASSES_SKIPPED, TTL_INPUT_FILENAME, EXTERNAL_PREDICATE_ROLES
    
    print("="*80)
    print("TTL Documentation Generator - v52 with External Predicates")
    print("="*80)
    
    # STEP 0: Get TTL file path from user input
    ttl_filename = input("\nEnter the TTL filename (e.g., multum-pharma_v251203.1.64.ttl): ").strip()
    if not ttl_filename:
        print("[ERROR] No filename provided. Exiting.")
        return
    
    ttl_output_path = os.path.join(os.path.dirname(__file__), ttl_filename)
    
    if not os.path.exists(ttl_output_path):
        print(f"[ERROR] File not found: {ttl_output_path}")
        return
    
    # Store the input filename for output naming
    TTL_INPUT_FILENAME = ttl_filename
    
    print(f"[INFO] Using TTL file: {ttl_output_path}")
    
    # CRITICAL: Extract graph name from TTL file BEFORE any other processing
    extract_graph_name_from_ttl(ttl_output_path)
    
    # Parse OWL ontology definitions
    parse_ttl_ontology(ttl_output_path)
    
    # Scan for class instances
    TTL_CLASSES = scan_ttl_for_classes(ttl_output_path)
    
    print("\n[DEBUG] Connecting to Oracle database...")
    conn = oracledb.connect(user=oracle_user, password=oracle_password, dsn=dsn)
    cursor = conn.cursor()
    print("[DEBUG] Connected successfully!")
    
    # STEP 1: Inventory all available classes
    all_available_classes = get_all_available_classes(cursor)
    
    # STEP 1b: Inventory external predicates table
    inventory_external_predicates(cursor)
    
    print("\n[DEBUG] Creating Word document...")
    doc = create_professional_document()
    
    title = add_styled_heading(doc, "Multum RDF Ontology Data in SKG", 1)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER
    
    add_metadata_section(doc)
    
    print("\n[DEBUG] Building IRI to class mapping...")
    iri_mapping = build_iri_to_class_mapping(cursor)
    
    # Add namespace table
    add_namespace_table(doc)
    
    # Add OWL ontology documentation section
    add_owl_ontology_documentation(doc)
    
    # STEP 2: Get doc configuration and compare with available classes
    content_area_data = get_content_areas_and_classes(cursor)
    
    if not content_area_data:
        print("\n" + "!"*80)
        print("!!! CRITICAL: No content area data found!")
        print("!!! The database tables are empty or not accessible.")
        print("!"*80)
    
    content_areas = OrderedDict()
    for row in content_area_data:
        area_order, area_name, area_desc, class_name, class_order = row
        
        if area_name not in content_areas:
            content_areas[area_name] = {
                'order': area_order,
                'description': area_desc,
                'classes': []
            }
        
        content_areas[area_name]['classes'].append({
            'name': class_name,
            'order': class_order
        })
    
    print(f"\n[DEBUG] Organized into {len(content_areas)} content areas")
    for area_name, area_info in content_areas.items():
        print(f"[DEBUG]   - {area_name}: {len(area_info['classes'])} classes")
    
    # STEP 3: Process each content area
    print("\n" + "="*80)
    print("STEP 3: DOCUMENTING CLASSES")
    print("="*80)
    
    area_count = 0
    total_areas = len(content_areas)
    
    for area_name, area_info in content_areas.items():
        area_count += 1
        is_last_area = (area_count == total_areas)
        print(f"\n{'='*80}")
        print(f"[{area_count}/{total_areas}] Processing content area: {area_name}")
        print(f"{'='*80}")
        print(f"[DEBUG] Classes configured for this area: {len(area_info['classes'])}")
        for cls_info in area_info['classes']:
            cls_short = cls_info['name'].split('#')[-1] if '#' in cls_info['name'] else cls_info['name']
            print(f"  - {cls_short}")
        
        # FIRST: Check if this content area has any documentable classes
        documentable_classes = []
        for class_info in area_info['classes']:
            class_name = class_info['name']
            table_name = get_table_name_for_class(cursor, class_name)
            if table_name:
                metadata, supplemental_samples = get_class_metadata(cursor, table_name, class_name)
                if metadata:
                    documentable_classes.append({
                        'info': class_info,
                        'table_name': table_name,
                        'metadata': metadata,
                        'class_name': class_name,
                        'supplemental_samples': supplemental_samples
                    })
                else:
                    print(f"  [WARNING] No metadata found for {table_name} (class: {class_name}) - SKIPPING")
                    CLASSES_SKIPPED[class_name] = f"No metadata for table {table_name}"
            else:
                print(f"  [WARNING] No table found for class {class_name} - SKIPPING")
                CLASSES_SKIPPED[class_name] = "No table found"
        
        # Only add content area heading if there are classes to document
        if not documentable_classes:
            print(f"  [WARNING] No documentable classes in this content area - SKIPPING ENTIRE AREA")
            print(f"  [DEBUG] All {len(area_info['classes'])} classes in this area were filtered out")
            continue
        
        print(f"  [INFO] Found {len(documentable_classes)} documentable classes in this area")
        
        # Add content area heading (only if we have classes to document)
        heading = add_styled_heading(doc, area_name, 2)
        
        if area_info['description']:
            desc_para = doc.add_paragraph(area_info['description'])
            for run in desc_para.runs:
                run.font.size = Pt(8)
        
        doc.add_paragraph()
        
        # Now document all the classes we found
        class_count = 0
        total_classes = len(documentable_classes)
        for idx, doc_class in enumerate(documentable_classes):
            class_count += 1
            is_last_class_in_area = (class_count == total_classes)
            class_name = doc_class['class_name']
            table_name = doc_class['table_name']
            metadata = doc_class['metadata']
            
            print(f"\n  [{class_count}/{total_classes}] Documenting class: {class_name}")
            
            # Collect sample data from ALL tables referenced in metadata
            # (metadata now spans multiple tables for the same class)
            sample_data = {}
            tables_in_metadata = set(row[0] for row in metadata)  # TABLE_NAME is index 0
            for tbl in tables_in_metadata:
                tbl_samples = get_sample_data(cursor, tbl)
                sample_data.update(tbl_samples)
            
            # Merge supplemental samples from cross-table predicates
            supplemental_samples = doc_class.get('supplemental_samples', {})
            sample_data.update(supplemental_samples)
            
            # v52: Scan TTL for example values of external predicates
            for row in metadata:
                col_key = row[1]  # COLUMN_NAME slot
                if col_key in EXTERNAL_PREDICATE_ROLES:
                    ext_info = EXTERNAL_PREDICATE_ROLES[col_key]
                    predicate = ext_info.get('predicate', col_key)
                    
                    # Scan the TTL file for a real example value
                    ttl_example = get_example_from_ttl(ttl_output_path, predicate)
                    if ttl_example:
                        sample_data[col_key] = ttl_example
                        print(f"    [EXTERNAL] TTL example for {predicate}: {ttl_example[:60]}")
                    else:
                        source_file = ext_info.get('source_file', 'external source')
                        sample_data[col_key] = f"[No example found in TTL - source: {source_file}]"
                        print(f"    [EXTERNAL] No TTL example found for {predicate}")
            
            add_class_documentation(doc, cursor, table_name, metadata, sample_data,
                                  is_last_class_in_area, is_last_area)
            print(f"  [SUCCESS] Class documented successfully!")
    
    # Generate output filename based on input TTL filename
    # e.g., "multum-pharma_v251203.1.64.ttl" -> "multum-pharma_v251203.1.64_RDF_ontology.docx"
    if TTL_INPUT_FILENAME:
        base_name = TTL_INPUT_FILENAME.rsplit('.', 1)[0]  # Remove .ttl extension
        output_base = f"{base_name}_RDF_ontology"
    else:
        output_base = "RDF_Ontology_Documentation"
    
    # Try to save, with automatic versioning if file is locked or exists
    output_file = f"{output_base}.docx"
    version = 0
    max_versions = 100  # Safety limit
    saved = False
    
    while version < max_versions and not saved:
        try:
            doc.save(output_file)
            saved = True
        except PermissionError:
            # File is open/locked - try next version
            version += 1
            output_file = f"{output_base}_v{version}.docx"
            print(f"  [INFO] File locked, trying: {output_file}")
        except Exception as e:
            print(f"  [ERROR] Failed to save: {e}")
            raise
    
    if not saved:
        raise RuntimeError(f"Could not save file after {max_versions} version attempts")
    
    output_full_path = os.path.abspath(output_file)
    print(f"\n{'='*80}")
    print(f"DOCUMENTATION GENERATED SUCCESSFULLY")
    print(f"{'='*80}")
    print(f"Filename: {output_file}")
    print(f"Location: {output_full_path}")
    print(f"{'='*80}\n")
    
    # STEP 3.5: Compare TTL classes with what was documented
    print("\n" + "="*80)
    print("TTL vs DOCUMENTATION COMPARISON")
    print("="*80)
    
    if TTL_CLASSES:
        ttl_short_names = {cls.split('#')[-1] for cls in TTL_CLASSES}
        doc_short_names = {cls.split('#')[-1] if '#' in cls else cls for cls in CLASSES_DOCUMENTED}
        
        missing_from_doc = ttl_short_names - doc_short_names
        if missing_from_doc:
            print(f"\n[CRITICAL] {len(missing_from_doc)} CLASSES IN TTL BUT NOT DOCUMENTED:")
            print("\nDIAGNOSTICS FOR EACH MISSING CLASS:")
            print("-" * 80)
            for cls_name in sorted(missing_from_doc):
                # Check if it's in available classes
                full_name = f"{get_base_namespace()}{cls_name}"
                print(f"\n{cls_name}:")
                print(f"  Full URI: {full_name}")
                
                if full_name in ALL_AVAILABLE_CLASSES:
                    tables = ALL_AVAILABLE_CLASSES[full_name]
                    print(f"  ✓ Found in R2RML metadata")
                    print(f"  ✓ Associated tables: {', '.join(tables)}")
                    
                    in_config = full_name in CLASSES_IN_DOC_CONFIG
                    if in_config:
                        print(f"  ✓ Found in UTIL_GRF_DOC_CLASS_CONTENT_AREA configuration table")
                    else:
                        print(f"  ✗ NOT found in UTIL_GRF_DOC_CLASS_CONTENT_AREA configuration table")
                        print(f"  → ACTION REQUIRED: Add this class to UTIL_GRF_DOC_CLASS_CONTENT_AREA")
                        print(f"     to include it in the documentation")
                    
                    # If in config but not documented, explain why
                    if in_config and full_name in CLASSES_SKIPPED:
                        print(f"  ✗ Skipped during documentation: {CLASSES_SKIPPED[full_name]}")
                else:
                    print(f"  ✗ NOT found in R2RML metadata (UTIL_GRF_R2RML_COLUMN table)")
                    print(f"  → This class has instances in the TTL file but no R2RML mapping")
            print("-" * 80)
        else:
            print("\n[SUCCESS] All TTL classes are documented!")
        
        extra_in_doc = doc_short_names - ttl_short_names
        if extra_in_doc:
            print(f"\n[INFO] {len(extra_in_doc)} classes documented but not in TTL sample:")
            for cls_name in sorted(extra_in_doc):
                print(f"  ? {cls_name}")
    else:
        print("\n[WARNING] Could not scan TTL file for comparison")
    
    print("="*80)
    
    # STEP 4: OWL Consistency Validation
    print_owl_consistency_validation()
    
    # STEP 5: Final validation report
    print_final_validation_report()
    
    cursor.close()
    conn.close()
    print("\n[DEBUG] Done!")

if __name__ == "__main__":
    main()