# -*- coding: utf-8 -*-
"""
Oracle to Turtle RDF Converter with OWL Ontology Generation
===========================================================
Converts Oracle database tables to Turtle RDF format using R2RML metadata.
Automatically generates OWL ontology (classes, properties, inverses) from metadata.
Adds SKOS Concept typing and notation to all class instances.

Author: M Dombaugh
Last Modified: 2025-02-24
Version: 35.0 - Output TTL and intermediate CSV files written to output/ folder
                relative to project root (skg/output/). Output folder is created
                automatically if it does not exist.
         34.0 - Fixed language tag detection to apply to ALL literal predicates,
                not just TextBlock predicates. Values like "filename @en" now correctly
                produce "filename"@en instead of "filename @en" (plain string).
                Also strips trailing whitespace between value and language tag.
                Uses regular quotes for single-line values, triple quotes only for multi-line.
         33.0 - Changed OWL Ontology declaration from N-Triples to Turtle shorthand
                to ensure compatibility with downstream sort/consolidate scripts.
                Publisher now output as string instead of mailto: URI.
                Added mt:allowAugmentation "false" to ontology block.
                Changed dcterms:source to string per internal standard.
                Graph name now selected from menu (mulcor-enus, mulivc-enus, mullpd-enus, mul3di-enus).
         32.0 - Added OWL Ontology declaration block with metadata
                (versionInfo, label, comment, imports SKOS, dcterms created/publisher/source)
                Graph name variable drives ontology IRI and rdfs:label.
"""

# Standard library imports for core functionality
import csv                    # CSV file reading/writing operations
from pathlib import Path      # Modern path handling (Python 3.4+)
import os                     # Operating system interface functions
import time                   # Time-related functions for performance tracking
from collections import defaultdict  # Dictionary that returns default values for missing keys
from dotenv import load_dotenv       # Load environment variables from .env file
import oracledb              # Oracle Database driver (modern replacement for cx_Oracle)
import getpass               # Secure password input without displaying characters
import re                    # Regular expressions for property name parsing

# External tool requirements and change history
# turtle validator: npm install -g turtle-validator OR npm install -g npm@11.4.2
# M Dombaugh	2025-07-26	Changed import process to handle CLOB data types.
# M Dombaugh	2025-07-28	Added triple quotes to clinical text with line feeds & escaped in-string backslashes (\ -> \\)
# M Dombaugh	2025-10-07	Added automatic OWL ontology generation with inverse properties
# M Dombaugh	2025-10-07	Fixed object property ranges to point to classes instead of XSD types
#                           Fixed inverse property naming to use proper namespace prefix
# M Dombaugh	2025-10-07	Removed hard-coded unit pattern mappings - now fully data-driven
# M Dombaugh	2025-10-08	Added skos:Concept typing and skos:notation to all class instances
# M Dombaugh	2025-10-08	Fully data-driven raw value handling with deduplication
# M Dombaugh	2025-10-13	Use XSD_DATATYPE from metadata for skos:notation instead of hardcoded xsd:integer
# M Dombaugh	2025-10-15	Added dynamic IRI selection for Condition class based on ICD code type (ICD-10 vs ICD-9)
#                           Fixed handling of "CUSTOM" placeholder in IRI_TEMPLATE - now properly replaces with ICD namespace
# M Dombaugh	2025-10-30	CRITICAL FIX: Dual-mode export strategy to handle CLOB tables without slowing down regular tables
#                           Automatically detects CLOB/BLOB columns via Oracle data dictionary
#                           CLOB tables: arraysize=500, chunked reading (4KB chunks), periodic flush
#                           Regular tables: arraysize=5000 (original fast performance)
#                           All other code remains unchanged from v13 (TTL formatting, ontology generation, etc.)
# M Dombaugh	2025-11-04	PERFORMANCE: Optimized CLOB-safe export for 10x speed improvement
#                           Increased arraysize and prefetchrows from 500 to 5000
#                           Added batch writing (1000 rows/batch) instead of row-by-row
#                           Maintains CLOB safety with chunked reading while dramatically improving throughput
# M Dombaugh	2025-11-05	CRITICAL FIX: 100% DATA DRIVEN - Python NEVER constructs composite keys
#                           Script now ONLY reads actual values from CSV columns
#                           Composite keys must be pre-formed in the database (e.g., "846/239/20080")
#                           Removed all composite key construction logic - purely data-driven
#                           This fixes invalid IRI errors caused by constructed keys with pipe characters (|)
# M Dombaugh	2025-11-10	ENHANCEMENT: Skip entities with no predicates beyond type and notation
#                           Entities with only primary key (all other columns NULL) are now skipped
#                           Reduces output clutter and file size by excluding stub entities
#                           build_ttl_block() returns None for empty entities instead of generating minimal TTL
# M Dombaugh	2025-11-10	CRITICAL FIX: CSV quoting for multi-line CLOB data (v18)
#                           Added csv.QUOTE_ALL to both export functions to prevent newlines in CLOBs 
#                           from being interpreted as row breaks during CSV parsing
#                           Fixed CLOB chunking with ceiling division + safety margin to ensure complete data retrieval
#                           This fixes ClinicalText being cut off mid-sentence due to improper CSV handling
# M Dombaugh	2025-11-10	CRITICAL FIX: Non-grouped predicate deduplication (v19)
#                           Added added_predicates set to track which predicates have been output
#                           Prevents same predicate from appearing multiple times in output
#                           Fixes issue where multiple columns mapping to same predicate caused duplicates
#                           Example: RELATED_DRUG_NAME_ID and ANOTHER_DRUG_NAME_ID both map to mt:hasDrugName
# M Dombaugh	2025-11-19	ENHANCEMENT: Automatic lowercase prefix generation (v21)
#                           No longer requires UTIL_GRF_TABLE_IRI_ABBREVIATION table
#                           Automatically generates prefixes by lowercasing class names
#                           DrugProduct -> drugproduct:, AgeStratification -> agestratification:
#                           Simplifies maintenance - one less table to manage
# M Dombaugh	2025-11-20	CRITICAL FIX: Prefixed names no longer wrapped in angle brackets (v22)
#                           Fixed format_multiline() to recognize already-abbreviated IRIs
#                           Prefixed names like "drug:d00001" now output without <> 
#                           Full URIs like "http://skg.health.oraclecloud.com/icd9cm/591" still get <>
#                           Logic differentiates between prefixed names and full URIs correctly
#                           This fixes SPARQL join failures where abbreviated IRIs in angle brackets
#                           were treated as literal URI strings instead of proper URI references
# M Dombaugh	2025-11-25	ENHANCEMENT: Language-tagged text block support (v24)
#                           Added format_language_tagged_text() function for TextBlock predicates
#                           Detects language tags (@en, @es, @en-US, etc.) at end of values
#                           Formats as: (triple-quote) text content (triple-quote) @en
#                           Only applies to predicates ending in 'TextBlock'
#                           Supports BCP 47 language tags (ISO 639-1 codes with optional regions)
#                           CRITICAL FIX: Added NLS_LANG=AL32UTF8 and output type handler for
#                           proper UTF-8 encoding of Spanish/international characters from Oracle
# M Dombaugh	2025-12-03  Amended to accept graph name as input and added graph name to IRI components
# M Dombaugh	2025-12-03  CRITICAL FIX: Turtle-reserved characters in IRI local names (v30)
#                           Characters like () are reserved Turtle syntax (collections) and break parsing
#                           when used in prefixed names like ivdrugcompat:NAME_(MMPR)
#                           Solution: abbreviate_iri() now detects reserved chars in local name
#                           and returns full IRI with angle brackets instead of abbreviating.
#                           Reserved chars checked: ( ) ~ ! $ & ' * + , ; = ? # @ % and space
#                           PKs remain clean/unencoded - only output format changes.
# M Dombaugh	2025-12-03  CRITICAL FIX: Improved percent-encoding detection (v31)
#                           Source data may contain pre-encoded values like %28, %29, etc.
#                           Added explicit regex check for %XX hex sequences in IRI local names.
#                           All IRIs with percent-encoding now use full <IRI> syntax.
# M Dombaugh	2025-12-05  ENHANCEMENT: Added OWL Ontology declaration block (v32)
#                           Outputs owl:Ontology type, rdfs:label, rdfs:comment, owl:versionInfo,
#                           owl:imports (SKOS), dcterms:created, dcterms:publisher, dcterms:source.
#                           Ontology IRI and label driven by GRAPH_NAME input variable.

# Oracle Instant Client DLL path configuration for Windows.
# Required so the oracledb driver can locate the Oracle client libraries
# when running in thick mode (see init_oracle_client below).
os.add_dll_directory(r"C:\Oracle\instant_client\instantclient_23_8")

# Environment variable loading and database connection setup.
# The .env file lives one directory above this script (skg/.env).
load_dotenv(Path(__file__).resolve().parent.parent / ".env")  # skg/.env
wallet_path = Path(os.getenv("ORACLE_WALLET_PATH"))    # Oracle Wallet location; Path(None) raises TypeError if the env var is unset
dsn = os.getenv("ORACLE_DSN")                          # Data Source Name (connection string)
oracle_user = os.getenv("ORACLE_USER")                 # Database username
# Password from env var, or prompt securely (no echo) if not set
oracle_password = os.getenv("ORACLE_PASSWORD") or getpass.getpass("Enter Oracle password: ")
# NOTE(review): wallet_path is not referenced again in this section — confirm it is consumed later in the file.

# User configuration options for processing behavior.
# Allow users to skip the CSV download phase and work with existing files.
SKIP_DOWNLOAD = input("Skip table download and use existing CSVs? (y/n): ").strip().lower() == 'y'

# Limit processing to a specific number of subjects per table for testing/development.
try:
    MAX_SUBJECTS = int(input("Max subjects to process per table (0 = all): ").strip())
except ValueError:
    MAX_SUBJECTS = 0  # Default to processing all subjects if input is not an integer

# Valid graph names; the selection drives the ontology IRI, rdfs:label,
# and (per the header notes) the namespace URI used in output.
VALID_GRAPH_NAMES = [
    "mulcor-enus",
    "mulivc-enus",
    "mullpd-enus",
    "mul3di-enus",
]

# Interactive menu: print the options, then loop until a valid choice is made.
print("\nSelect graph name:")
for i, name in enumerate(VALID_GRAPH_NAMES, 1):
    print(f"  {i}. {name}")

# NOTE(review): the prompt/error text hardcodes "1-4"; update it if VALID_GRAPH_NAMES grows.
while True:
    try:
        selection = int(input("Enter selection (1-4): ").strip())
        if 1 <= selection <= len(VALID_GRAPH_NAMES):
            GRAPH_NAME = VALID_GRAPH_NAMES[selection - 1]
            print(f"Selected: {GRAPH_NAME}")
            break
        else:
            print("Invalid selection. Enter 1-4.")
    except ValueError:
        print("Invalid input. Enter a number 1-4.")

# Oracle client initialization - switch from thin (pure Python) to thick mode,
# which requires the Instant Client DLLs configured above.
if oracledb.is_thin_mode():
    oracledb.init_oracle_client()

# Set NLS_LANG so client-side conversions use UTF-8
# (proper handling of Spanish/international characters from Oracle).
os.environ["NLS_LANG"] = "AMERICAN_AMERICA.AL32UTF8"

# Database connection establishment with performance optimizations.
conn = oracledb.connect(user=oracle_user, password=oracle_password, dsn=dsn)
conn.autocommit = True  # Added for performance - auto-commit each statement
cursor = conn.cursor()

# Explicitly set output type handler for proper encoding of CLOB/string data
def output_type_handler(cursor, name, default_type, size, precision, scale):
    """Fetch CLOB/BLOB columns as plain long string/raw values.

    Overrides the default fetch type so LOB columns come back as ordinary
    str/bytes values rather than LOB locators. Any other column type falls
    through (implicit None return), keeping the driver's default handling.
    """
    overrides = {
        oracledb.DB_TYPE_CLOB: oracledb.DB_TYPE_LONG,
        oracledb.DB_TYPE_BLOB: oracledb.DB_TYPE_LONG_RAW,
    }
    fetch_type = overrides.get(default_type)
    if fetch_type is not None:
        return cursor.var(fetch_type, arraysize=cursor.arraysize)

# Install the LOB-to-string handler on the connection so every cursor benefits.
conn.outputtypehandler = output_type_handler

# Performance tuning configuration for handling large datasets.
cursor.arraysize = 5000      # Number of rows fetched per network round trip
cursor.prefetchrows = 5000   # Number of rows to prefetch into client memory

# CLOB (Character Large Object) handling configuration.
# NOTE(review): LOB_LIMIT is not used in this section — presumably a read cap
# applied by export code later in the file; confirm before changing.
LOB_LIMIT = 100000          # Maximum characters/bytes to read from CLOB/BLOB fields

# Table discovery - either from environment variable or dynamic query.
table_env = os.getenv("TTL_TABLE_LIST", "").strip()
if table_env:
    # Use the explicitly defined table list from the environment variable:
    # newline-separated table names, blank lines filtered out.
    table_list = [line.strip() for line in table_env.splitlines() if line.strip()]
else:
    # Auto-discover: every distinct table name present in the R2RML metadata
    # repository (note: no name-prefix filtering is applied by this query).
    cursor.execute("""
        SELECT DISTINCT table_name
        FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN
    """)
    table_list = [row[0] for row in cursor.fetchall()]

# ============================================================================
# IRI Abbreviation Support
# ============================================================================

def load_iri_abbreviations(cursor):
    """Build class-name -> prefix abbreviations by lowercasing class names.

    Queries the R2RML metadata for every distinct R2RML_CLASS and derives
    the Turtle prefix for each class as its lowercased, whitespace-stripped
    name. No lookup table is required.

    Args:
        cursor: Open database cursor used to query the metadata table.

    Returns:
        dict mapping class name -> lowercase abbreviation, e.g.
        {'Drug': 'drug', 'DrugProduct': 'drugproduct'}.
    """
    print("[INFO] Generating automatic IRI abbreviations from class names...")
    cursor.execute("""
        SELECT DISTINCT R2RML_CLASS
        FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN
        WHERE R2RML_CLASS IS NOT NULL
        ORDER BY R2RML_CLASS
    """)

    # Strip once, lowercase once: the stripped name is both key source and
    # abbreviation source, matching the original row-by-row behavior.
    abbreviations = {
        raw.strip(): raw.strip().lower()
        for (raw,) in cursor.fetchall()
    }

    print(f"[INFO] Generated {len(abbreviations)} automatic IRI abbreviations")
    return abbreviations

def abbreviate_iri(iri, abbreviations, base_url):
    """Abbreviate a full IRI to prefix:localname form when it is safe to do so.

    An IRI is only abbreviated when its local name is free of characters
    that are illegal in a Turtle prefixed-name local part (PN_LOCAL):
    reserved punctuation, percent-encoded %XX sequences, and non-ASCII
    characters. Anything unsafe is emitted as a full <IRI> instead, so the
    output always parses.

    Args:
        iri: Full IRI string, with or without surrounding angle brackets.
        abbreviations: dict of class name -> prefix (e.g. {'Drug': 'drug'}).
        base_url: Base URL our own IRIs start with
            (e.g. 'http://multum.health.oraclecloud.com/').

    Returns:
        'prefix:localname' when safe, otherwise '<full-iri>'; external IRIs
        without problematic characters are returned unchanged.
    """
    # Characters reserved/problematic in Turtle prefixed-name local parts.
    # Reference: https://www.w3.org/TR/turtle/#grammar-production-PN_LOCAL
    unsafe_chars = set('()~!$&\'*+,;=?#@% "\\')

    # Percent-encoded sequences (%XX) from source data are also invalid
    # inside prefixed names.
    pct_encoded = re.compile(r'%[0-9A-Fa-f]{2}')

    def is_unsafe(text):
        """True when `text` cannot appear in a prefixed-name local part."""
        if any(ch in unsafe_chars for ch in text):
            return True
        if pct_encoded.search(text):
            return True
        # Non-ASCII (Unicode issues, mojibake, etc.) also blocks abbreviation.
        try:
            text.encode('ascii')
        except UnicodeEncodeError:
            return True
        return False

    # Normalize: accept both <IRI> and bare IRI input forms.
    if iri.startswith('<') and iri.endswith('>'):
        full_iri = iri[1:-1]
    else:
        full_iri = iri

    # External IRIs: bracket them only if they contain problematic chars.
    if not full_iri.startswith(base_url):
        return f"<{full_iri}>" if is_unsafe(full_iri) else iri

    # Our IRIs follow base_url + graph/ClassName/InstanceID (instance IDs may
    # themselves contain '/', e.g. "d00001/1").
    segments = full_iri[len(base_url):].split('/')
    if len(segments) >= 3:
        class_name = segments[1]
        local_name = '/'.join(segments[2:])
    elif len(segments) == 2:
        # Simpler ClassName/InstanceID structure without a graph segment.
        class_name = segments[0]
        local_name = segments[1]
    else:
        return f"<{full_iri}>"

    if is_unsafe(local_name):
        # Cannot safely abbreviate - keep full IRI with angle brackets.
        return f"<{full_iri}>"
    if class_name in abbreviations:
        return f"{abbreviations[class_name]}:{local_name}"

    # No prefix registered for this class: fall back to the bracketed form.
    return f"<{full_iri}>"


# Utility functions for data processing and RDF generation

def get_metadata(cursor, table_name):
    """Fetch the R2RML column mappings for one table, ordered by COLUMN_ORDER.

    Each row describes how one table column maps to RDF: the predicate to
    emit, an optional IRI template for object values, the XSD datatype for
    literal formatting, whether the column is part of the primary key
    (subject generation), and the R2RML class/role used by filtering logic.

    Args:
        cursor: Open database cursor.
        table_name: Table whose metadata rows are requested (bind variable).

    Returns:
        list of tuples: (COLUMN_NAME, PREDICATE_DESCRIPTION, IRI_TEMPLATE,
        IS_KEY, XSD_DATATYPE, COLUMN_ORDER, R2RML_CLASS, R2RML_ROLE)
    """
    query = """
        SELECT 
            COLUMN_NAME, 
            PREDICATE_DESCRIPTION, 
            IRI_TEMPLATE, 
            IS_KEY, 
            XSD_DATATYPE, 
            COLUMN_ORDER, 
            R2RML_CLASS, 
            R2RML_ROLE
        FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN
        WHERE TABLE_NAME = :1
        ORDER BY COLUMN_ORDER
    """
    cursor.execute(query, [table_name])
    return cursor.fetchall()

def get_group_metadata(cursor, table_name):
    """Fetch column-grouping metadata used to build blank-node structures.

    Args:
        cursor: Open database cursor.
        table_name: Table whose grouping rows are requested (bind variable).

    Returns:
        list of (COLUMN_NAME, GROUP_PREDICATE_NAME) tuples, ordered by
        group predicate then column, for columns that belong together
        inside one blank node.
    """
    query = """
        SELECT 
            COLUMN_NAME, 
            GROUP_PREDICATE_NAME
        FROM GLOBAL_DISTRIBUTE.UTIL_GRF_R2RML_COLUMN_GROUP
        WHERE TABLE_NAME = :1
        ORDER BY GROUP_PREDICATE_NAME, COLUMN_NAME
    """
    cursor.execute(query, [table_name])
    return cursor.fetchall()

def build_iri_to_class_mapping(cursor, table_list):
    """Map IRI path segments to their actual classes, driven purely by data.

    Scans the primary-key IRI templates of every table and records which
    class each IRI path segment really denotes. This lets templates whose
    path differs from the class name (e.g. 'MinAgeUnit') resolve to the
    correct class later.

    Args:
        cursor: Open database cursor.
        table_list: Iterable of table names whose metadata is scanned.

    Returns:
        dict of IRI path segment -> class name, e.g.
        {'Unit': 'UnitOfMeasure', 'MinAgeUnit': 'UnitOfMeasure'}.
    """
    # Path segment right before the {COLUMN} placeholder:
    # http://domain/ClassName/{COLUMN_NAME}
    pk_path_pattern = re.compile(r'/([A-Z][a-zA-Z0-9]+)/\{')

    mapping = {}
    for table_name in table_list:
        for col, pred, iri_tmpl, is_key, xsd, _, r2rml_class, r2rml_role in get_metadata(cursor, table_name):
            # Only primary-key templates with a known class contribute.
            if is_key != "Y" or not iri_tmpl or not r2rml_class:
                continue
            found = pk_path_pattern.search(iri_tmpl)
            if found:
                mapping[found.group(1)] = r2rml_class

    return mapping

def extract_class_from_iri_template(iri_template, iri_to_class_map):
    """Resolve the target class an IRI template points at.

    The path segment before the {COLUMN} placeholder is looked up in the
    data-driven mapping; if absent, the segment itself is taken as the
    class name (the common case where path and class coincide).

    Args:
        iri_template: Template string such as
            'http://multum.../MinAgeUnit/{MIN_AGE_UNIT_ID}' (may be None/empty).
        iri_to_class_map: dict of IRI path segment -> actual class name.

    Returns:
        The resolved class name, or None when the template is empty or has
        no recognizable '/ClassName/{' segment.

    Examples:
        http://multum.../Condition/{CONDITION_ID} -> Condition
        http://multum.../MinAgeUnit/{MIN_AGE_UNIT_ID} -> UnitOfMeasure (from data)
    """
    if not iri_template:
        return None

    # Pattern: http://domain/ClassName/{COLUMN_NAME}
    found = re.search(r'/([A-Z][a-zA-Z0-9]+)/\{', iri_template)
    if not found:
        return None

    path_segment = found.group(1)
    # Data-driven lookup first; fall back to the segment itself.
    return iri_to_class_map.get(path_segment, path_segment)

def get_all_ontology_metadata(cursor, table_list):
    """Collect the class and property inventory needed to emit the OWL section.

    Walks the R2RML metadata of every table — fully data-driven, no
    hard-coded mappings — accumulating subject classes and per-predicate
    domain/range/datatype information.

    Args:
        cursor: Open database cursor.
        table_list: Iterable of table names to scan.

    Returns:
        (classes, properties): classes is a set of r2rml_class names;
        properties maps predicate name -> dict with keys 'domains' (set of
        subject classes), 'ranges' (set of target classes, object properties
        only), 'datatype' (XSD type from the predicate's first occurrence),
        'is_object_property' (True when the first occurrence had an IRI
        template).
    """
    # Data-driven mapping from IRI path segments to actual target classes.
    iri_to_class_map = build_iri_to_class_mapping(cursor, table_list)

    classes = set()
    properties = {}  # predicate -> {'domains', 'ranges', 'datatype', 'is_object_property'}

    for table_name in table_list:
        for col, pred, iri_tmpl, is_key, xsd, _, r2rml_class, r2rml_role in get_metadata(cursor, table_name):
            # Every subject class participates in the ontology.
            if r2rml_class:
                classes.add(r2rml_class)

            # Primary-key columns define the subject, not a predicate.
            if is_key == "Y":
                continue

            # Raw-value columns (templated subject, no template, no predicate)
            # are emitted as raw RDF statements and define no property.
            raw_value_column = (
                r2rml_role == 'subject (templated)'
                and (iri_tmpl is None or iri_tmpl == '')
                and (pred is None or pred == '')
            )
            if raw_value_column or not pred:
                continue

            # First occurrence of a predicate fixes its datatype and kind.
            entry = properties.setdefault(pred, {
                'domains': set(),
                'ranges': set(),
                'datatype': xsd,
                'is_object_property': bool(iri_tmpl),
            })

            # Domain is the class of the subject (the table's r2rml_class).
            if r2rml_class:
                entry['domains'].add(r2rml_class)

            # Object properties: range is the target class, resolved via the
            # data-driven IRI mapping; ensure that class is declared as well.
            if iri_tmpl:
                target_class = extract_class_from_iri_template(iri_tmpl, iri_to_class_map)
                if target_class:
                    entry['ranges'].add(target_class)
                    classes.add(target_class)
            # Datatype properties: XSD type already captured in 'datatype'.

    return classes, properties

def generate_inverse_property_name(property_name):
    """Derive the conventional owl:inverseOf name for an object property.

    Strips the namespace prefix and a leading "has", then wraps the
    remainder as is<Name>Of, re-attaching the original prefix (or 'mt'
    when the input carried none).

    Examples:
        mt:hasCondition -> mt:isConditionOf
        mt:hasAgeStratification -> mt:isAgeStratificationOf
        mt:condition -> mt:isConditionOf (no "has" to remove)
        condition -> mt:isConditionOf (default prefix applied)
    """
    pieces = property_name.split(':')
    local = pieces[-1]                          # name without any prefix
    prefix = pieces[0] if len(pieces) > 1 else 'mt'  # keep original prefix, default mt

    # Drop a leading "has" only when something meaningful remains.
    if local.startswith('has') and len(local) > 3:
        local = local[3:]

    # Inverse pattern: is + CapitalizedName + Of
    return f"{prefix}:is{local[0].upper()}{local[1:]}Of"
    
    

def write_ontology_section(f, classes, properties):
    """
    Write the OWL ontology section with class and property definitions.

    Reads the module-level GRAPH_NAME (set by the startup menu) for the
    ontology IRI and rdfs:label.

    Args:
        f: Open text file handle the Turtle output is written to.
        classes: Iterable of class names (each emitted as mt:<name>).
        properties: dict of property name -> info dict with keys
            'domains' (set), 'ranges' (set), 'datatype' (str or None),
            'is_object_property' (bool).

    Generates:
    - owl:Ontology declaration block with metadata
    - owl:Class declarations for all discovered classes
    - owl:ObjectProperty or owl:DatatypeProperty declarations
    - rdfs:domain and rdfs:range assertions (owl:unionOf when multiple)
    - owl:inverseOf declarations for object properties

    CRITICAL: Object properties point to classes (not XSD types)
              Datatype properties point to XSD types
    """
    f.write("# ========================================\n")
    f.write("# OWL ONTOLOGY DEFINITIONS\n")
    f.write("# ========================================\n\n")
    
    # Ontology declaration with metadata (Turtle shorthand)
    # NOTE(review): rdfs:comment, owl:versionInfo and the dcterms values below
    # are fixed literals regardless of GRAPH_NAME — confirm they are intended
    # for all four selectable graphs.
    f.write("# Ontology Declaration\n")
    ontology_iri = f"<http://multum.health.oraclecloud.com/{GRAPH_NAME}>"
    f.write(f'{ontology_iri} a owl:Ontology ;\n')
    f.write(f'    <http://skg.health.oraclecloud.com/ns#allowAugmentation> "false"^^xsd:boolean ;\n')
    f.write(f'    rdfs:label "{GRAPH_NAME}" ;\n')
    f.write(f'    rdfs:comment "This graph models core Multum pharmacy data to support medication ordering, clinical decision support, and pharmacy workflows. The content encompasses drug codes, nomenclatures, compatibility attributes, and mappings frequently used in healthcare applications." ;\n')
    f.write(f'    owl:versionInfo "2.8" ;\n')
    f.write(f'    owl:imports <http://www.w3.org/2004/02/skos/core> ;\n')
    f.write(f'    dcterms:created "2025-12-03"^^xsd:date ;\n')
    f.write(f'    dcterms:publisher "multumsupport_ww@oracle.com" ;\n')
    f.write(f'    dcterms:source "http://www.multum.com" .\n\n')
    
    # Write class definitions
    f.write("# Classes\n")
    for cls in sorted(classes):
        f.write(f"mt:{cls} a owl:Class .\n")
    f.write("\n")
    
    # Separate properties into object and datatype for clearer organization
    object_properties = {k: v for k, v in properties.items() if v['is_object_property']}
    datatype_properties = {k: v for k, v in properties.items() if not v['is_object_property']}
    
    # Write object properties
    f.write("# Object Properties (relationships to other entities)\n")
    for prop_name in sorted(object_properties.keys()):
        prop_info = object_properties[prop_name]
        
        # Start property definition
        f.write(f"{prop_name} a owl:ObjectProperty")
        
        # Add domain(s); multiple domains become an owl:unionOf class expression
        if prop_info['domains']:
            domains = sorted(prop_info['domains'])
            if len(domains) == 1:
                f.write(f" ;\n    rdfs:domain mt:{domains[0]}")
            else:
                f.write(f" ;\n    rdfs:domain [ a owl:Class ; owl:unionOf ( {' '.join('mt:' + d for d in domains)} ) ]")
        
        # Add range(s) - MUST be classes for object properties
        if prop_info['ranges']:
            ranges = sorted(prop_info['ranges'])
            if len(ranges) == 1:
                f.write(f" ;\n    rdfs:range mt:{ranges[0]}")
            else:
                f.write(f" ;\n    rdfs:range [ a owl:Class ; owl:unionOf ( {' '.join('mt:' + r for r in ranges)} ) ]")
        
        f.write(" .\n")
        
        # Generate inverse property - properly namespaced
        inverse_name = generate_inverse_property_name(prop_name)
        f.write(f"{inverse_name} owl:inverseOf {prop_name} .\n")
        f.write("\n")
    
    # Write datatype properties (no inverses - only object properties get them)
    f.write("# Datatype Properties (literal values)\n")
    for prop_name in sorted(datatype_properties.keys()):
        prop_info = datatype_properties[prop_name]
        
        # Start property definition
        f.write(f"{prop_name} a owl:DatatypeProperty")
        
        # Add domain(s)
        if prop_info['domains']:
            domains = sorted(prop_info['domains'])
            if len(domains) == 1:
                f.write(f" ;\n    rdfs:domain mt:{domains[0]}")
            else:
                f.write(f" ;\n    rdfs:domain [ a owl:Class ; owl:unionOf ( {' '.join('mt:' + d for d in domains)} ) ]")
        
        # Add range - MUST be XSD datatype for datatype properties
        if prop_info['datatype']:
            f.write(f" ;\n    rdfs:range {prop_info['datatype']}")
        
        f.write(" .\n\n")
    
    # Add skos:prefLabel and skos:notation as datatype properties used across multiple classes
    f.write("# SKOS vocabulary properties\n")
    f.write("skos:prefLabel a owl:DatatypeProperty .\n")
    f.write("skos:notation a owl:DatatypeProperty ;\n")
    f.write("    rdfs:range rdfs:Literal .\n\n")  # Range varies by usage - can be xsd:integer, xsd:string, etc.
    
    f.write("# ========================================\n")
    f.write("# INSTANCE DATA\n")
    f.write("# ========================================\n\n")

def has_clob_columns(cursor, table_name):
    """
    Determine whether a table includes any large-object columns.

    Queries all_tab_columns for CLOB, BLOB, NCLOB, LONG, LONG RAW, or
    BFILE columns in the GLOBAL_DISTRIBUTE schema.

    Args:
        cursor: Open Oracle cursor used to run the dictionary query.
        table_name: Name of the table to inspect (bound, not interpolated).

    Returns:
        True when at least one LOB-type column exists; False otherwise,
        including when the dictionary query itself fails (so the caller
        defaults to the fast export path).
    """
    try:
        cursor.execute("""
            SELECT COUNT(*)
            FROM all_tab_columns
            WHERE table_name = :1
            AND owner = 'GLOBAL_DISTRIBUTE'
            AND data_type IN ('CLOB', 'BLOB', 'NCLOB', 'LONG', 'LONG RAW', 'BFILE')
        """, [table_name])
        lob_count = cursor.fetchone()[0]
    except Exception as e:
        print(f"[WARNING] Could not check for CLOB columns in {table_name}: {e}")
        return False  # Default to fast export if we can't determine
    return lob_count > 0

def export_table_to_csv_fast(cursor, table_name, csv_path):
    """
    Export Oracle table to CSV - FAST VERSION for non-CLOB tables.

    Uses high arraysize/prefetchrows settings for maximum throughput on
    tables without CLOB/BLOB columns. Any LOB-like value that still appears
    (objects exposing .read()) is read client-side and truncated to
    LOB_LIMIT characters/bytes with a visible [TRUNCATED] marker.

    Args:
        cursor: Open Oracle cursor; arraysize/prefetchrows are modified.
        table_name: Table in the GLOBAL_DISTRIBUTE schema. NOTE: interpolated
            directly into SQL, so it must come from trusted metadata only.
        csv_path: Destination CSV path; export is skipped if the file exists.
    """
    # Skip export if CSV file already exists (makes reruns idempotent)
    if os.path.exists(csv_path):
        print(f"[INFO] Skipping {table_name}, CSV already exists at {csv_path}")
        return

    print(f"[INFO] Exporting full table {table_name} to CSV (FAST MODE - No CLOBs)...")

    # Pre-count total rows for progress percentage calculation
    print("[INFO] Counting total rows for progress display...")
    cursor.execute(f"SELECT COUNT(*) FROM GLOBAL_DISTRIBUTE.{table_name}")
    total_rows = cursor.fetchone()[0]
    print(f"[INFO] Total rows to export: {total_rows:,}")

    start = time.time()

    # HIGH PERFORMANCE settings for non-CLOB tables
    cursor.arraysize = 5000
    cursor.prefetchrows = 5000

    cursor.execute(f"SELECT * FROM GLOBAL_DISTRIBUTE.{table_name}")
    columns = [desc[0] for desc in cursor.description]

    row_count = 0

    with open(csv_path, "w", newline='', encoding='utf-8') as f:
        # CRITICAL: Use QUOTE_ALL to ensure multi-line text data is properly quoted
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerow(columns)

        try:
            for row in cursor:
                try:
                    formatted = []

                    for val in row:
                        if isinstance(val, (bytes, bytearray)):
                            # Raw binary: decode as UTF-8, fall back to hex
                            try:
                                val = val.decode('utf-8')
                            except UnicodeDecodeError:
                                val = val.hex().upper()
                        elif hasattr(val, 'read'):
                            # Unexpected LOB locator on a "no-CLOB" table:
                            # read it client-side and truncate if oversized
                            try:
                                val = val.read()
                                if val and len(val) > LOB_LIMIT:
                                    if isinstance(val, bytes):
                                        # BUGFIX: b"..." is already bytes - the previous
                                        # b"\n[TRUNCATED]".encode() raised AttributeError
                                        # (bytes has no .encode), which silently turned
                                        # the value into "[LOB READ ERROR: ...]"
                                        val = val[:LOB_LIMIT] + b"\n[TRUNCATED]"
                                    elif isinstance(val, str):
                                        val = val[:LOB_LIMIT] + f"\n[TRUNCATED {len(val) - LOB_LIMIT} chars]"
                            except Exception as e:
                                val = f"[LOB READ ERROR: {e}]"
                        formatted.append(val)
                    writer.writerow(formatted)
                except Exception as e:
                    # Keep the export alive: emit a placeholder row and continue
                    print(f"[ROW ERROR @ {row_count + 1}] {e}")
                    writer.writerow([f"[ROW ERROR: {e}]"] * len(columns))

                row_count += 1
                if row_count % 10000 == 0 or row_count == total_rows:
                    pct = (row_count / total_rows) * 100
                    print(f"[INFO] {row_count:,} rows exported ({pct:.2f}%)")

        except Exception as e:
            print(f"[FATAL ERROR] {e}")

    elapsed = round(time.time() - start, 2)
    print(f"[INFO] Export complete: {row_count:,} rows in {elapsed} sec")
    print(f"[INFO] Table export completed in {format_time(time.time() - start)}")

def export_table_to_csv_clob(cursor, table_name, csv_path):
    """
    Export Oracle table to CSV - CLOB-SAFE VERSION for tables with CLOB/BLOB columns.

    Uses server-side chunking: splits CLOBs into 32K VARCHAR2 chunks via
    DBMS_LOB.SUBSTR, then reassembles them in Python. This is FAST (no client
    CLOB round-trips) and gets ALL content (no truncation).

    Args:
        cursor: Open Oracle cursor; arraysize/prefetchrows are modified.
        table_name: Table in the GLOBAL_DISTRIBUTE schema. NOTE: interpolated
            directly into SQL, so it must come from trusted metadata only.
        csv_path: Destination CSV path; export is skipped if the file exists.
    """
    # Skip export if CSV file already exists (makes reruns idempotent)
    if os.path.exists(csv_path):
        print(f"[INFO] Skipping {table_name}, CSV already exists at {csv_path}")
        return

    print(f"[INFO] Exporting full table {table_name} to CSV (CLOB-SAFE MODE - Server-side chunking)...")

    start = time.time()

    # Step 1: Get max CLOB sizes to determine how many chunks we need
    print(f"[INFO] Analyzing CLOB column sizes...")
    cursor.execute(f"""
        SELECT column_name, data_type
        FROM all_tab_columns
        WHERE table_name = :1
        AND owner = 'GLOBAL_DISTRIBUTE'
        ORDER BY column_id
    """, [table_name])

    col_info = cursor.fetchall()
    clob_columns = [col for col, dtype in col_info if dtype in ('CLOB', 'NCLOB')]

    # Get max length for each CLOB column
    clob_max_lengths = {}
    if clob_columns:
        for clob_col in clob_columns:
            try:
                cursor.execute(f"""
                    SELECT MAX(DBMS_LOB.GETLENGTH({clob_col}))
                    FROM GLOBAL_DISTRIBUTE.{table_name}
                    WHERE {clob_col} IS NOT NULL
                """)
                max_len = cursor.fetchone()[0] or 0
                clob_max_lengths[clob_col] = max_len
                print(f"[INFO] {clob_col}: max length = {max_len:,} chars")
            except Exception as e:
                print(f"[WARNING] Could not get max length for {clob_col}: {e}")
                clob_max_lengths[clob_col] = 100000  # Default assumption

    # Step 2: Build query with chunked CLOB reads
    chunk_size = 32767  # Max VARCHAR2 size
    select_parts = []
    clob_chunk_map = {}  # Maps original column name to list of chunk column aliases

    for col_name, data_type in col_info:
        if data_type in ('CLOB', 'NCLOB'):
            max_len = clob_max_lengths.get(col_name, 100000)
            # Ceiling division to cover ALL the data, plus one extra
            # chunk as safety margin for edge cases
            num_chunks = ((max_len + chunk_size - 1) // chunk_size) + 1

            # Create multiple SUBSTR calls for this CLOB
            chunk_aliases = []
            for i in range(num_chunks):
                start_pos = (i * chunk_size) + 1
                alias = f"{col_name}_CHUNK{i}"
                select_parts.append(f"DBMS_LOB.SUBSTR({col_name}, {chunk_size}, {start_pos}) AS {alias}")
                chunk_aliases.append(alias)

            clob_chunk_map[col_name] = chunk_aliases
            print(f"[INFO] {col_name}: splitting into {num_chunks} chunks")
        else:
            select_parts.append(col_name)

    select_clause = ", ".join(select_parts)
    query = f"SELECT {select_clause} FROM GLOBAL_DISTRIBUTE.{table_name}"

    print(f"[INFO] Executing chunked query...")

    # Ultra-aggressive fetch settings (chunks are plain VARCHAR2, safe to batch)
    cursor.arraysize = 10000
    cursor.prefetchrows = 10000

    cursor.execute(query)

    # Step 3: Process rows and reassemble CLOBs
    row_count = 0
    batch_size = 5000
    rows_buffer = []

    # Header uses the ORIGINAL column names (not the chunk aliases)
    original_columns = [col for col, dtype in col_info]

    with open(csv_path, "w", newline='', encoding='utf-8') as f:
        # CRITICAL: Use QUOTE_ALL to ensure multi-line CLOB text is properly quoted
        writer = csv.writer(f, quoting=csv.QUOTE_ALL)
        writer.writerow(original_columns)

        try:
            for row in cursor:
                formatted = []
                fetched_idx = 0  # walks the fetched (chunk-expanded) columns

                for col_name, data_type in col_info:
                    if col_name in clob_chunk_map:
                        # Reassemble CLOB from its consecutive chunk columns
                        chunks = []
                        for chunk_alias in clob_chunk_map[col_name]:
                            chunk_val = row[fetched_idx]
                            if chunk_val:
                                chunks.append(chunk_val)
                            fetched_idx += 1

                        # Concatenate all chunks
                        val = ''.join(chunks) if chunks else ''
                    else:
                        # Regular column
                        val = row[fetched_idx]
                        fetched_idx += 1

                        # Handle bytes: decode as UTF-8, fall back to hex
                        if isinstance(val, (bytes, bytearray)):
                            try:
                                val = val.decode('utf-8')
                            except UnicodeDecodeError:
                                val = val.hex().upper()

                    formatted.append(val)

                rows_buffer.append(formatted)

                # Flush in batches to amortize writer overhead
                if len(rows_buffer) >= batch_size:
                    writer.writerows(rows_buffer)
                    row_count += len(rows_buffer)
                    rows_buffer = []

                    if row_count % 10000 == 0:
                        elapsed = time.time() - start
                        rate = row_count / elapsed if elapsed > 0 else 0
                        print(f"[INFO] {row_count:,} rows - {rate:.0f} rows/sec")

            # Write remaining rows
            if rows_buffer:
                writer.writerows(rows_buffer)
                row_count += len(rows_buffer)

        except Exception as e:
            print(f"[FATAL ERROR] {e}")
            import traceback
            traceback.print_exc()

    elapsed = round(time.time() - start, 2)
    # BUGFIX: guard against ZeroDivisionError when a tiny table finishes in
    # under 5ms and elapsed rounds to 0.0
    rate = row_count / elapsed if elapsed > 0 else 0
    print(f"[INFO] Export complete: {row_count:,} rows in {elapsed} sec ({rate:.0f} rows/sec)")
    print(f"[INFO] Table export completed in {format_time(elapsed)}")


def export_table_to_csv(cursor, table_name, csv_path):
    """
    Smart table export dispatcher - automatically selects optimal export method.

    Inspects the table for CLOB/BLOB columns and routes accordingly:
    - Tables WITH CLOBs    -> export_table_to_csv_clob() (slow but safe)
    - Tables WITHOUT CLOBs -> export_table_to_csv_fast() (fast, high throughput)
    """
    # Guard clause: CLOB-bearing tables take the chunked, LOB-safe path
    if has_clob_columns(cursor, table_name):
        print(f"[INFO] Table {table_name} contains CLOB/BLOB columns - using CLOB-safe export")
        export_table_to_csv_clob(cursor, table_name, csv_path)
        return

    print(f"[INFO] Table {table_name} has no CLOB columns - using fast export")
    export_table_to_csv_fast(cursor, table_name, csv_path)

def format_literal(val, xsd_type):
    """
    Format values as proper Turtle RDF literals with correct escaping.

    Handles:
    - Binary data conversion to strings (UTF-8, falling back to uppercase hex)
    - Backslash escaping for Turtle syntax validity
    - Multi-line (triple-quoted) vs single-line literal formatting
    - XSD datatype annotation per R2RML mapping (xsd:string stays bare)

    Args:
        val: Raw cell value (str, bytes/bytearray, or any stringifiable object).
        xsd_type: XSD datatype string from metadata (e.g. "xsd:integer"),
            or a falsy value for a plain literal.

    Returns:
        A Turtle literal string, optionally suffixed with ^^datatype.
    """
    # Convert binary data to string representation
    if isinstance(val, (bytes, bytearray)):
        try:
            val_str = val.decode('utf-8')
        except UnicodeDecodeError:
            val_str = val.hex().upper()  # Use hex for non-UTF8 binary data
    else:
        val_str = str(val).strip()

    # Escape backslashes first - critical for Turtle syntax validity
    val_str = val_str.replace('\\', '\\\\')

    # Determine literal format based on presence of line breaks
    if "\n" in val_str or "\r" in val_str:
        # Multi-line literals use triple quotes (common for clinical text).
        # Escaping every quote also guarantees no raw \"\"\" run can remain,
        # so no separate triple-quote handling is needed (old dead code removed).
        val_str = val_str.replace('"', '\\"')
        literal = f'"""{val_str}"""'
    else:
        # Single-line literals use regular quotes
        val_str_escaped = val_str.replace('"', '\\"')
        literal = f'"{val_str_escaped}"'

    # Apply XSD datatype according to R2RML mapping specifications
    if xsd_type == "xsd:string":
        return literal                       # Plain strings carry no annotation
    elif xsd_type:
        return f'{literal}^^{xsd_type}'      # Add datatype annotation
    else:
        return literal

def format_language_tagged_text(val):
    """
    Format text carrying a trailing language tag as a Turtle literal.

    Values ending in @xx (or @xx-YY regional variants, e.g. @en, @es-MX)
    are split into content + tag, with the tag emitted outside the quotes.
    Multi-line content uses triple quotes; single-line uses plain quotes.
    Values without a recognizable tag fall back to plain literal formatting.

    Args:
        val: The text value, potentially ending with an @xx language tag.

    Returns:
        A formatted TTL literal, e.g. "some text"@en.
    """
    text = str(val).strip()

    # Trailing tag: 2-3 letter language code, optional 2-4 letter region
    tag_pattern = re.compile(r'^(.+)(@[a-zA-Z]{2,3}(?:-[a-zA-Z]{2,4})?)$', re.DOTALL)
    found = tag_pattern.match(text)

    if found is None:
        # No language tag: plain literal formatting
        body = text.replace('\\', '\\\\')
        if "\n" not in body and "\r" not in body:
            return '"' + body.replace('"', '\\"') + '"'
        body = body.replace('"', '\\"')
        body = body.replace('"""', "'\"\"")
        return f'"""{body}"""'

    content = found.group(1).rstrip()  # Drop spacing between text and tag
    lang_tag = found.group(2)
    content = content.replace('\\', '\\\\')  # Backslashes first

    if "\n" in content or "\r" in content:
        # Long literal: neutralize embedded triple quotes, tag goes outside
        content = content.replace('"""', "'\"\"")
        return f'"""{content}"""{lang_tag}'

    # Short literal: escape quotes, tag goes outside
    content = content.replace('"', '\\"')
    return f'"{content}"{lang_tag}'


def is_icd10_code(icd_code):
    """
    Classify an ICD code as ICD-10 or ICD-9 by alphabetic content.

    ICD-10 codes contain letters (e.g. 'A01.0', 'E11.9'); ICD-9 codes
    are purely numeric with optional periods (e.g. '250.00', '401.9').

    Args:
        icd_code: The ICD code (string or any stringifiable value).

    Returns:
        True for ICD-10 (has at least one letter), False for ICD-9.
    """
    for ch in str(icd_code):
        if ch.isalpha():
            return True
    return False

def select_condition_iri_template(icd_code, default_template, col_name=''):
    """
    Pick the IRI template for Condition instances based on ICD code type.

    Placeholder templates ("CUSTOM", "CUSTOM:...", view names containing
    V_RDF_TTL, or ICD columns lacking a {placeholder}) are replaced with the
    proper ICD-9-CM / ICD-10-CM namespace IRI. Non-ICD templates pass
    through unchanged.

    Args:
        icd_code: The ICD code value.
        default_template: The default IRI template from metadata (may be "CUSTOM").
        col_name: The column name (used to detect ICD code columns).

    Returns:
        An IRI template with a {VALUE} placeholder for the ICD code, or the
        metadata template when no replacement is warranted.
    """
    template_upper = str(default_template).upper().strip()
    col_upper = str(col_name).upper().strip()

    # ICD column if either the column name or the template mentions ICD
    is_icd_column = 'ICD' in col_upper or 'ICD' in template_upper

    # Template is a placeholder needing replacement when it is literally
    # CUSTOM, a CUSTOM: prefix, a view name, or an ICD template with no slot
    is_placeholder = (
        template_upper == 'CUSTOM'
        or template_upper.startswith('CUSTOM:')
        or 'V_RDF_TTL' in template_upper
        or (is_icd_column and '{' not in default_template)
    )

    if is_icd_column and (is_placeholder or 'CONDITION' in template_upper):
        # Letters => ICD-10; purely numeric => ICD-9 (is_icd10_code inlined)
        if any(ch.isalpha() for ch in str(icd_code)):
            return 'http://skg.health.oraclecloud.com/icd10cm/{VALUE}'
        return 'http://skg.health.oraclecloud.com/icd9cm/{VALUE}'

    return default_template


def build_subject_iri(template, col_name, value):
    """
    Build an RDF subject IRI by substituting the primary-key value into
    the IRI template.

    100% DATA DRIVEN: only the actual key value from the CSV column is
    substituted - composite keys (e.g. "846/239/20080") arrive pre-formed
    and are used AS-IS, never constructed here. Condition-class ICD columns
    are first rerouted to the ICD-9-CM or ICD-10-CM namespace.

    Args:
        template: IRI template containing a {COLUMN_NAME} or {VALUE} placeholder.
        col_name: Primary key column name.
        value: The ACTUAL primary-key value from the data.

    Returns:
        The finished IRI wrapped in angle brackets, e.g. "<http://.../123>".
    """
    # Special handling for ICD codes in Condition class
    if 'ICD' in col_name.upper() or 'ICD' in template.upper():
        template = select_condition_iri_template(value, template, col_name)

    # Substitute the key value AS-IS for every recognized placeholder form
    key = str(value).strip()
    iri = template
    for placeholder in ('{VALUE}',
                        '{' + col_name + '}',
                        '{' + col_name.upper() + '}',
                        '{' + col_name.lower() + '}'):
        iri = iri.replace(placeholder, key)

    return f"<{iri}>"

def format_multiline(values, is_iri):
    """
    Join multiple predicate values into a comma-separated, indented block.

    IRI objects are normalized as follows:
      - "<...>"          -> kept as-is (already bracketed)
      - "http(s)://..."  -> wrapped in angle brackets (full URI)
      - "prefix:local"   -> left bare (prefixed name)
      - anything else    -> wrapped in angle brackets as a fallback
    Literal objects are emitted verbatim (already formatted by the caller).

    Args:
        values: Iterable of object strings for a single predicate.
        is_iri: True when the objects are IRIs rather than literals.

    Returns:
        The values joined with ",\\n        " for multi-line Turtle output.
    """
    def render(raw):
        item = raw.strip()
        if not is_iri:
            return item                      # Literal value, pass through
        if item.startswith('<') and item.endswith('>'):
            return item                      # Already bracketed, keep as-is
        if item.startswith('http://') or item.startswith('https://'):
            return f"<{item}>"               # Full URI needs brackets
        if ':' in item and not item.startswith('http'):
            return item                      # Prefixed name, no brackets
        return f"<{item}>"                   # Fallback: treat as full IRI

    return ",\n        ".join(render(v) for v in values)

def group_rows_by_key(csv_path, key_col_index):
    """
    Group CSV rows by the value in the primary-key column.

    Essential for clinical data where one subject (e.g. a patient) may span
    multiple rows (observations, measurements, etc.).

    Args:
        csv_path: Path to a UTF-8 CSV file whose first row is the header.
        key_col_index: Zero-based index of the primary-key column.

    Returns:
        Mapping of stripped key value -> list of row dicts
        ({header: stripped cell value}) sharing that key.
    """
    grouped = defaultdict(list)
    with open(csv_path, newline='', encoding='utf-8') as handle:
        rows = csv.reader(handle)
        headers = next(rows)  # First row supplies the dict keys
        for record in rows:
            row_dict = {name: cell.strip() for name, cell in zip(headers, record)}
            grouped[record[key_col_index].strip()].append(row_dict)
    return grouped

def build_ttl_block(metadata, grouped_rows, group_metadata, abbreviations, base_url):
    """
    Generate a complete Turtle RDF block for one subject.
    
    Fully data-driven - all behavior determined by R2RML metadata tables.
    No hard-coded column or table logic.
    
    Core RDF conversion logic that:
    1. Identifies the RDF subject from primary key
    2. Identifies raw value columns from metadata (R2RML_ROLE='subject (templated)' with NULL predicate/IRI)
    3. Collects all predicate-object pairs from grouped rows
    4. Handles both IRI objects and literal objects
    5. Generates properly formatted Turtle syntax
    6. Maintains predicate ordering from metadata
    7. Groups related predicates into blank nodes when specified
    8. Adds skos:Concept type and skos:notation for all subjects
    9. Outputs raw values (no predicate) in groups or non-grouped based on metadata
    10. Deduplicates raw values automatically

    Args:
        metadata: R2RML metadata rows, each unpacked as
            (col, pred, iri_tmpl, is_key, xsd, _, r2rml_class, r2rml_role).
        grouped_rows: List of row dicts (column name -> value) that share
            one primary-key value.
        group_metadata: (column, group_predicate) pairs defining blank-node groups.
        abbreviations: Mapping used by abbreviate_iri to shorten IRIs.
        base_url: Base URL used by abbreviate_iri.

    Returns:
        The Turtle text for this subject, or None when no subject IRI could
        be built or the subject carries no predicates beyond its type/notation.
    """
    row = grouped_rows[0]                    # Use first row for subject generation
    predicates_ordered = []                  # Maintain metadata-defined order
    subject = None                           # Will hold the RDF subject IRI
    class_name = None                        # Will hold the RDF class
    notation_value = None                    # Will hold the ID for skos:notation
    notation_datatype = None                 # Will hold the XSD datatype for skos:notation
    
    # Build group mapping from group metadata - DATA DRIVEN
    # Normalize column names to uppercase for consistent matching
    column_to_group = {}
    if group_metadata:
        for col, group_pred in group_metadata:
            column_to_group[col.upper().strip()] = group_pred

    # Build set of raw value columns from metadata - DATA DRIVEN
    # These columns output their value directly without a predicate wrapper
    # Make checks robust against whitespace and None values
    raw_value_columns = set()
    for m in metadata:
        col, pred, iri_tmpl, is_key, xsd, _, _, r2rml_role = m
        
        # Normalize and check each field more robustly
        role_check = r2rml_role and str(r2rml_role).strip().lower() == 'subject (templated)'
        iri_check = not iri_tmpl or str(iri_tmpl).strip() == ''
        pred_check = not pred or str(pred).strip() == ''
        
        if role_check and iri_check and pred_check:
            raw_value_columns.add(col.upper().strip())  # Normalize to uppercase

    # First pass: identify the RDF subject from primary key column
    for m in metadata:
        col, pred, iri_tmpl, is_key, xsd, _, r2rml_class, r2rml_role = m
        if is_key == "Y" and iri_tmpl:  # This is the primary key column
            key_val = row[col]
            val_str = str(key_val).strip()
            # Special handling for GUID columns stored as binary string representations
            if col.upper().endswith("GUID") and val_str.startswith(("b'", 'b"')):
                try:
                    # SECURITY NOTE: eval() on CSV-sourced text. Tolerable only
                    # because the CSVs are produced by this same pipeline from a
                    # trusted database; prefer ast.literal_eval if inputs ever
                    # come from outside.
                    decoded = eval(val_str)      # Convert string representation to actual bytes
                    val_str = decoded.hex().upper()  # Convert bytes to hex string
                except Exception:
                    pass  # Keep original value if conversion fails
            subject = build_subject_iri(iri_tmpl, col, val_str)  # Use actual data value AS-IS
            # Apply IRI abbreviation to subject
            subject = abbreviate_iri(subject, abbreviations, base_url)
            class_name = r2rml_class
            notation_value = val_str  # Store the ID value for skos:notation
            notation_datatype = xsd  # Store the XSD datatype for skos:notation

    # Return None if no valid subject could be generated
    if not subject:
        return None

    # Second pass: collect all predicate-object pairs from all grouped rows
    # Separate handling for regular predicates vs raw values - DATA DRIVEN
    grouped_predicates_by_row = []  # List of dictionaries: [{group_name: {pred: [values]}}]
    non_grouped_predicates = defaultdict(set)
    grouped_raw_values_by_row = []  # List of dictionaries: [{group_name: [raw_values]}]
    non_grouped_raw_values = set()     # SET for automatic deduplication of raw values not in any group
    
    # Process each row separately to maintain proper grouping
    for row_idx, r in enumerate(grouped_rows):
        row_groups = defaultdict(lambda: defaultdict(list))  # group_name -> predicate -> values for this row
        row_raw_values = defaultdict(list)  # group_name -> [raw_values] for this row
        
        for m in metadata:
            col, pred, iri_tmpl, is_key, xsd, _, _, r2rml_role = m
            if is_key == "Y":
                continue  # Skip primary key columns (already used for subject)
            
            # DATA DRIVEN: Check if this column is marked as a raw value column
            # Use uppercase for comparison to handle name mismatches
            if col.upper().strip() in raw_value_columns:
                # This column outputs its value directly without predicate
                val = r[col].strip()
                if val in (None, "", " "):
                    continue
                
                # DATA DRIVEN: Determine where this raw value belongs from group table
                # Use uppercase for lookup
                group_name = column_to_group.get(col.upper().strip())
                if group_name:
                    # It's in a group - add to group's raw values
                    row_raw_values[group_name].append(val)
                else:
                    # It's not in a group - add to non-grouped raw values SET
                    non_grouped_raw_values.add(val)  # Using .add() for set deduplication
                continue  # Skip normal predicate processing for raw values
            
            # Normal processing for non-raw-value columns
            val = r[col].strip()
            if val in (None, "", " "):       # Skip empty/null values
                continue
            
            # Check if this column is part of a group - DATA DRIVEN
            # Use uppercase for lookup to handle name mismatches
            group_name = column_to_group.get(col.upper().strip())
            
            if iri_tmpl:
                # Object is an IRI (reference to another resource)
                # Special handling for ICD codes in Condition class
                actual_iri_tmpl = iri_tmpl
                if 'ICD' in col.upper() or 'ICD' in iri_tmpl.upper():
                    actual_iri_tmpl = select_condition_iri_template(val, iri_tmpl, col)
                
                # CRITICAL: Only replace the current column's placeholder with its actual data value
                # We do NOT construct composite keys - the value from the column IS the key
                obj_val = actual_iri_tmpl
                obj_val = obj_val.replace('{VALUE}', str(val.strip()))
                obj_val = obj_val.replace("{" + col + "}", str(val.strip()))
                obj_val = obj_val.replace("{" + col.upper() + "}", str(val.strip()))
                obj_val = obj_val.replace("{" + col.lower() + "}", str(val.strip()))
                
                # Apply IRI abbreviation
                # BUGFIX: this call was accidentally duplicated (copy-paste);
                # one application is sufficient
                obj_val = abbreviate_iri(obj_val, abbreviations, base_url)
                
                if group_name:
                    row_groups[group_name][pred].append(obj_val)
                else:
                    non_grouped_predicates[pred].add(obj_val)
            else:
                # Object is a literal value
                val_str = str(val).strip()
                # Special GUID handling for literal values too
                if col.upper().endswith("GUID") and val_str.startswith(("b'", 'b"')):
                    try:
                        # SECURITY NOTE: eval() on CSV-sourced text (see note in
                        # first pass above); consider ast.literal_eval
                        decoded = eval(val_str)
                        val_str = decoded.hex().upper()
                    except Exception:
                        pass
                    formatted_val = f'"{val_str}"'
                else:
                    # Use proper literal formatting with XSD types
                    # Detect language tags on any literal value, not just TextBlock
                    val_check = str(val).strip()
                    if re.search(r' @[a-zA-Z]{2,3}(?:-[a-zA-Z]{2,4})?$', val_check):
                        formatted_val = format_language_tagged_text(val)
                    else:
                        formatted_val = format_literal(val, xsd)
                
                if group_name:
                    row_groups[group_name][pred].append(formatted_val)
                else:
                    non_grouped_predicates[pred].add(formatted_val)
        
        # Only add row groups that have content
        if row_groups:
            grouped_predicates_by_row.append(dict(row_groups))
        
        # Track raw values separately - DATA DRIVEN
        if row_raw_values:
            grouped_raw_values_by_row.append(dict(row_raw_values))
    
    # Collect ordered predicates for non-grouped items
    for m in metadata:
        col, pred, iri_tmpl, is_key, xsd, _, _, r2rml_role = m
        
        # Skip raw value columns - DATA DRIVEN, use uppercase comparison
        if col.upper().strip() in raw_value_columns:
            continue
        
        group_name = column_to_group.get(col.upper().strip())
        if is_key != "Y" and not group_name:  # Non-key, non-grouped columns
            predicates_ordered.append((pred, iri_tmpl, xsd, col))

    # Third pass: generate formatted predicate lines
    predicate_lines = []
    
    # Add skos:notation as the FIRST predicate (immediately after type declaration)
    if notation_value:
        if notation_datatype:
            predicate_lines.append(("skos:notation", f'"{notation_value}"^^{notation_datatype}'))
        else:
            predicate_lines.append(("skos:notation", f'"{notation_value}"'))
    
    # Add non-grouped predicates (deduplicate to avoid outputting same predicate multiple times)
    added_predicates = set()  # Track which predicates we've already added
    for pred, iri_tmpl, xsd, col in predicates_ordered:
        # Skip if we've already added this predicate
        if pred in added_predicates:
            continue
            
        vals = non_grouped_predicates[pred]
        if vals:  # Only include predicates that have values
            # Format values with proper indentation, sorted for consistency
            predicate_lines.append((pred, format_multiline(sorted(vals), bool(iri_tmpl))))
            added_predicates.add(pred)  # Mark this predicate as added
    
    # Add non-grouped raw values (if any) - DATA DRIVEN with deduplication
    for raw_val in sorted(non_grouped_raw_values):  # Sort for consistent output
        # Format as string literal with proper escaping
        escaped_val = raw_val.replace('\\', '\\\\').replace('"', '\\"')
        formatted_raw_val = f'{escaped_val}'
        predicate_lines.append((None, formatted_raw_val))  # None indicates no predicate
    
    # Add grouped predicates as separate blank nodes for each row
    for idx, row_groups in enumerate(grouped_predicates_by_row):
        for group_name in sorted(row_groups.keys()):
            group_preds = row_groups[group_name]
            if group_preds:
                # Create blank node structure for this specific row's group
                blank_node_lines = []
                group_pred_items = list(group_preds.items())
                
                for i, (pred, vals) in enumerate(group_pred_items):
                    if vals:
                        # Check metadata to determine if this specific predicate uses IRI template
                        pred_uses_iri = False
                        for m in metadata:
                            col, pred_name, iri_tmpl, is_key, xsd, _, _, r2rml_role = m
                            if pred_name == pred and iri_tmpl:
                                pred_uses_iri = True
                                break
                        
                        formatted_vals = format_multiline(sorted(set(vals)), pred_uses_iri)
                        
                        # Check if there are raw values coming after this - DATA DRIVEN
                        has_raw_values_after = (idx < len(grouped_raw_values_by_row) and 
                                               group_name in grouped_raw_values_by_row[idx])
                        is_last = (i == len(group_pred_items) - 1 and not has_raw_values_after)
                        
                        term = "" if is_last else " ;"
                        blank_node_lines.append(f"        {pred} {formatted_vals}{term}")
                
                # Add raw values for this group (if any) - DATA DRIVEN from group table
                if idx < len(grouped_raw_values_by_row) and group_name in grouped_raw_values_by_row[idx]:
                    raw_values = grouped_raw_values_by_row[idx][group_name]
                    for j, raw_val in enumerate(raw_values):
                        # Format as string literal with proper escaping
                        escaped_val = raw_val.replace('\\', '\\\\').replace('"', '\\"')
                        formatted_raw_val = f'{escaped_val}'
                        
                        # Last raw value has no semicolon
                        term = "" if j == len(raw_values) - 1 else " ;"
                        blank_node_lines.append(f"        {formatted_raw_val}{term}")
                
                if blank_node_lines:
                    blank_node_content = "\n".join(blank_node_lines)
                    predicate_lines.append((group_name, f"[\n{blank_node_content}\n    ]"))

    # Handle edge case: subject with no predicates (only rdf:type and skos:notation)
    # SKIP these entities - they have no useful information beyond their ID
    if not predicate_lines:
        return None

    # Generate final Turtle syntax with skos:Concept type
    lines = [f"{subject} a mt:{class_name}, skos:Concept ;"]      # Start with rdf:type declaration including skos:Concept
    for i, (pred, multiline) in enumerate(predicate_lines):
        # Last predicate ends with period, others with semicolon
        term = "." if i == len(predicate_lines) - 1 else ";"
        
        # Handle raw values (pred is None)
        if pred is None:
            lines.append(f"    {multiline} {term}")  # No predicate for raw values
        else:
            lines.append(f"    {pred} {multiline} {term}")  # Normal predicate output
    
    return "\n".join(lines)

def format_time(seconds):
    """
    Render a duration given in seconds as "<minutes>m (<hours>hr)".

    Minutes are rounded to the nearest whole number (Python's banker's
    rounding) and hours to one decimal place, so a single string conveys
    the elapsed time at two useful granularities.
    """
    total_minutes = seconds / 60
    total_hours = total_minutes / 60
    return "{}m ({}hr)".format(int(round(total_minutes)), round(total_hours, 1))

# Main execution block
# Orchestrates the full pipeline: resolve output paths, collect ontology metadata,
# write prefixes + ontology header, then stream per-table TTL blocks to one file.
# NOTE(review): relies on names defined earlier in this file (cursor, table_list,
# GRAPH_NAME, SKIP_DOWNLOAD, MAX_SUBJECTS, time, and the helper functions) —
# confirm they are initialized before this point.
if __name__ == "__main__":
    total_start = time.time()  # Start total processing timer
    
    # Load IRI abbreviations from database
    # (class name -> short prefix used for instance IRIs)
    base_url = "http://multum.health.oraclecloud.com/"
    abbreviations = load_iri_abbreviations(cursor)
    
    # Resolve output/ folder relative to project root (skg/output/)
    # Assumes this script lives one directory below the project root.
    project_root = Path(__file__).resolve().parent.parent
    output_dir   = project_root / "output"
    output_dir.mkdir(exist_ok=True)
    output_path  = str(output_dir / "output.ttl")
    
    # First, collect all ontology metadata across all tables
    # so the OWL class/property declarations can be written before any instances.
    print("[INFO] Collecting ontology metadata from all tables...")
    classes, properties = get_all_ontology_metadata(cursor, table_list)
    print(f"[INFO] Found {len(classes)} classes and {len(properties)} properties")
    
    # Open output file and write everything
    with open(output_path, "w", encoding="utf-8") as f:
        # Standard Turtle namespace declarations
        f.write(f"@prefix mt: <http://multum.health.oraclecloud.com/{GRAPH_NAME}/ns#> .\n")
        f.write("@prefix owl: <http://www.w3.org/2002/07/owl#> .\n")
        f.write("@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .\n")
        f.write("@prefix skos: <http://www.w3.org/2004/02/skos/core#> .\n")
        f.write("@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .\n")
        f.write("@prefix dcterms: <http://purl.org/dc/terms/> .\n")
        f.write("\n")
        
        # Instance prefixes (sorted for consistency)
        # One @prefix line per class, mapping its abbreviation to its instance IRI base.
        f.write("# Instance IRI Prefixes\n")
        for class_name in sorted(abbreviations.keys()):
            abbrev = abbreviations[class_name]
            class_url = f"{base_url}{GRAPH_NAME}/{class_name}/"
            f.write(f"@prefix {abbrev}: <{class_url}> .\n")
        f.write("\n")
        
        # Write the ontology section (OWL class/property declarations + metadata block)
        write_ontology_section(f, classes, properties)
        
        # Process each table in the discovered/configured table list
        for table in table_list:
            table = table.strip()
            if not table:  # Skip empty table names
                continue

            print(f"[INFO] Loading metadata for {table}")
            # Retrieve R2RML mapping metadata for this table
            metadata = get_metadata(cursor, table)
            
            # Retrieve group metadata for this table (drives blank-node grouping)
            group_metadata = get_group_metadata(cursor, table)
            
            # Validate that table has a primary key defined
            # metadata rows are tuples: col[0] = column name, col[3] = IS_KEY flag
            key_columns = [col[0] for col in metadata if col[3] == "Y"]  # IS_KEY = 'Y'
            if not key_columns:
                print(f"[WARNING] Skipping {table}: No key column defined (IS_KEY = 'Y').")
                continue
            key_column = key_columns[0]  # Use first key column if multiple exist

            # Set up CSV file path in output/ folder
            csv_path = str(output_dir / f"{table}.csv")
            
            # Conditionally export table to CSV (respects SKIP_DOWNLOAD flag)
            # When SKIP_DOWNLOAD is set, a CSV from a prior run must already exist.
            if not SKIP_DOWNLOAD:
                export_table_to_csv(cursor, table, csv_path)

            # Read CSV headers to find primary key column index
            # NOTE(review): an empty CSV raises StopIteration here, and a missing
            # key column raises ValueError from .index() — both abort the run.
            with open(csv_path, newline='', encoding='utf-8') as cf:
                headers = next(csv.reader(cf))
                key_col_index = headers.index(key_column)

            # Group CSV rows by primary key for multi-row subject handling
            grouped = group_rows_by_key(csv_path, key_col_index)
            print(f"[INFO] Generating TTL for {len(grouped)} subjects")
            start_time = time.time()  # Start table processing timer

            # Process each subject (group of rows with same primary key)
            for i, (key, rows) in enumerate(grouped.items(), 1):
                # Respect user-defined subject limit for testing
                if MAX_SUBJECTS and i > MAX_SUBJECTS:
                    break
                
                # Generate Turtle RDF block for this subject
                # (returns None for subjects with no predicates, which are skipped)
                ttl = build_ttl_block(metadata, rows, group_metadata, abbreviations, base_url)
                if ttl:  # Only write if valid TTL was generated
                    f.write(ttl + "\n\n")   # Write with blank line separator
                
                # Progress reporting every 50 subjects
                if i % 50 == 0:
                    print(f"[INFO] Processed {i}/{len(grouped)}")

            # Report table processing completion time
            print(f"[INFO] Total processing time for {table}: {format_time(time.time() - start_time)}")

    # Final completion reporting
    print(f"[INFO] All TTL output written to {output_path}")
    print(f"[INFO] Total end-to-end processing time: {format_time(time.time() - total_start)}")