#!/usr/bin/env python3
"""
Simple TTL Consolidator for Pre-Sorted Files
=============================================
Merges consecutive blocks with the same subject IRI AND primary class by removing duplicate headers.
Supports both full IRI notation (<http://...>) and prefix abbreviations (prefix:localname).

IMPORTANT: Blocks are consolidated only if they are CONSECUTIVE and have the SAME subject AND the SAME primary class.
This ensures that rxbuilder:d00001/1 as mt:RxBuilder stays separate from 
rxbuilder:d00001/1 as mt:RxBuilderDoseDispenseRule.

REQUIREMENT: Input file MUST be sorted by subject IRI AND class together!
For best results, sort by: (1) subject IRI, (2) primary class name
Example sort order:
  rxbuilder:d00001/1 a mt:RxBuilder (block 1)
  rxbuilder:d00001/1 a mt:RxBuilder (block 2) <- will consolidate with block 1
  rxbuilder:d00001/1 a mt:RxBuilderDoseDispenseRule (block 3) <- separate
  rxbuilder:d00001/2 a mt:RxBuilder (block 4) <- separate

This approach is:
- Fast: Processes 1GB file in 1-2 minutes
- Simple: Text-based, no RDF parsing
- Memory efficient: Streams through file
- Safe: Preserves all content including multi-line strings
- Class-aware: Keeps different classes separate even with same subject IRI

Author: M Dombaugh
Version: 11.0 - Output written to output/ folder relative to project root.
                Input read from output/ folder (produced by rdf_from_metadata.py).
         10.0 - Removed hardcoded mt:allowAugmentation (now in metadata script)
                Fixed OWL detection to match full IRI subjects (<http://...> a owl:Ontology)
                Graph IRI now extracted dynamically from ontology declaration
         8.0 - GRAPH wrapper now placed immediately after ALL @prefix declarations
               OWL ontology definitions (classes, properties) now INSIDE graph wrapper
               Per SKG team: everything after prefixes must be inside GRAPH clause
         7.0 - Added named graph wrapper for TriG output format
               Output file extension changed from .ttl to .trig
         6.0 - CRITICAL FIX: Subject detection regex now handles Unicode/special characters
               Old pattern [\\w/.-]+ failed on mojibake like "ELLIOTTâ€™S" causing blocks
               to be incorrectly concatenated. New pattern uses \\S+ to match any
               non-whitespace characters in local names.
"""

import sys
from pathlib import Path
import re


def consolidate_sorted_ttl(input_file, output_file):
    """
    Consolidate a pre-sorted TTL file by merging consecutive blocks with same subject.

    Consecutive blocks are merged only when BOTH the subject IRI and the
    primary (first non-skos:Concept) class match.  Everything after the
    leading @prefix declarations is wrapped in a named GRAPH clause (TriG
    output); the graph IRI is taken from the file's owl:Ontology declaration.

    Args:
        input_file: Path to sorted input TTL file (pathlib.Path; .stat() is used)
        output_file: Path to output consolidated file
    """
    print(f"Reading sorted TTL file: {input_file}")
    
    file_size_mb = input_file.stat().st_size / (1024 * 1024)
    print(f"File size: {file_size_mb:.1f} MB")
    
    # Pre-scan to extract graph IRI from ontology declaration
    graph_iri = None
    with open(input_file, 'r', encoding='utf-8') as prescan:
        for line in prescan:
            # Look for ontology declaration: <http://...> a owl:Ontology
            match = re.match(r'^(<[^>]+>)\s+a\s+owl:Ontology', line.strip())
            if match:
                graph_iri = match.group(1)
                print(f"Detected graph IRI: {graph_iri}")
                break
    
    if not graph_iri:
        print("[WARNING] No ontology declaration found, using default graph IRI")
        graph_iri = "<http://multum.health.oraclecloud.com/default>"
    
    with open(input_file, 'r', encoding='utf-8') as infile, \
         open(output_file, 'w', encoding='utf-8') as outfile:
        
        # Track state
        current_subject = None
        current_primary_class = None  # Primary (first non-skos:Concept) class of current block
        current_types = set()         # All rdf:type values collected for current subject/class
        current_notation = None       # First skos:notation seen (deduplicated across merged blocks)
        current_predicates = []       # List of ('normal'|'preserve', line) tuples
        in_triple_quote = False       # Inside a multi-line """...""" literal
        in_blank_node = False         # Inside a blank node [ ... ]
        line_count = 0
        blocks_processed = 0
        blocks_consolidated = 0
        in_prefix_section = True      # Still in the leading @prefix declarations
        graph_opened = False          # GRAPH clause has been written to output
        in_owl_definition = False     # Inside a multi-line OWL definition
        
        for line in infile:
            line_count += 1
            
            if line_count % 100000 == 0:
                print(f"  Processed {line_count:,} lines, {blocks_processed:,} blocks, {blocks_consolidated:,} consolidated", flush=True)
            
            # Track triple-quote state.  Escaped quotes are stripped first so
            # \" sequences inside literals don't distort the delimiter count.
            line_without_escaped = line.replace(r'\"', '')
            triple_quote_count = line_without_escaped.count('"""')
            if triple_quote_count % 2 == 1:
                in_triple_quote = not in_triple_quote
            
            # Track blank node state - look for [ and ] but not inside triple quotes
            if not in_triple_quote:
                stripped = line.strip()
                # Count opening and closing brackets (ignore those in strings)
                open_bracket_count = stripped.count('[')
                close_bracket_count = stripped.count(']')
                
                if open_bracket_count > close_bracket_count:
                    in_blank_node = True
                elif close_bracket_count > open_bracket_count:
                    in_blank_node = False
            
            # Handle prefix section - pass through all @prefix lines
            if in_prefix_section:
                stripped = line.strip()
                
                # Pass through @prefix lines
                if stripped.startswith('@prefix'):
                    outfile.write(line)
                    continue
                
                # Pass through blank lines and comments while still in prefix section
                if not stripped or stripped.startswith('#'):
                    outfile.write(line)
                    continue
                
                # First non-prefix, non-blank, non-comment line means prefixes are done
                # Open the GRAPH clause now
                in_prefix_section = False
                graph_opened = True
                outfile.write(f'\n{graph_iri} {{\n\n')
                # Fall through to process this line as content inside the graph
            
            # Skip INSTANCE DATA comment block - we'll add our own when we hit instance data
            stripped = line.strip()
            if stripped == '# ========================================' or \
               stripped == '# INSTANCE DATA':
                continue
            
            # Handle multi-line OWL definitions (pass through until we hit ending period)
            if in_owl_definition:
                outfile.write(line)
                if stripped.endswith(' .') or stripped == '.':
                    in_owl_definition = False
                continue
            
            # Pass through comments and blank lines (inside graph)
            if not stripped or stripped.startswith('#'):
                outfile.write(line)
                continue
            
            # Check if this line starts a new subject block.
            # Instance data has patterns like: rxbuilder:d00001/1 or <http://...>
            if not in_triple_quote and not in_blank_node:
                # \S+ in the local-name part handles Unicode chars, mojibake,
                # and special characters (see version 6.0 note in module docstring)
                match = re.match(r'^(<[^>]+>|[\w-]+:\S+)\s+a\s+', line)
                
                if match:
                    subject = match.group(1)
                    
                    # Distinguish OWL ontology metadata from instance data:
                    # OWL metadata: <http://...> a owl:Ontology OR mt:ClassName a owl:Class
                    # Instance data: drug:d00001 a mt:Drug
                    is_owl_def_line = re.match(r'^(<[^>]+>|[\w-]+:[\w-]+)\s+a\s+owl:(Ontology|Class|ObjectProperty|DatatypeProperty|AnnotationProperty|FunctionalProperty|InverseFunctionalProperty|TransitiveProperty|SymmetricProperty)', stripped)
                    
                    if is_owl_def_line:
                        # OWL ontology definition - write it directly (inside graph)
                        outfile.write(line)
                        # Multi-line definition if it doesn't end with a period
                        if not stripped.endswith(' .'):
                            in_owl_definition = True
                        continue
                    
                    # This is instance data - start consolidation processing.
                    # Write the instance data header before the first block only.
                    if blocks_processed == 0:
                        outfile.write('\n# ========================================\n')
                        outfile.write('# INSTANCE DATA\n')
                        outfile.write('# ========================================\n\n')
                    
                    # Extract types from first line
                    types_match = re.search(r'\s+a\s+([^;]+)', line)
                    if types_match:
                        types_str = types_match.group(1).strip()
                        new_types = set(t.strip() for t in types_str.split(','))
                    else:
                        new_types = set()
                    
                    # Determine primary class (first non-skos:Concept type)
                    primary_class = None
                    for t in new_types:
                        if t != 'skos:Concept':
                            primary_class = t
                            break
                    
                    # Same subject AND same primary class as the previous block?
                    if subject == current_subject and primary_class == current_primary_class:
                        # Same subject and class - consolidate
                        blocks_consolidated += 1
                        # Merge types
                        current_types.update(new_types)
                        # Don't write the subject line; predicates join the current block
                        continue
                    else:
                        # Different subject or different class - flush previous block if any
                        if current_subject:
                            write_consolidated_block(outfile, current_subject, current_types, 
                                                   current_notation, current_predicates)
                        
                        # Start new block
                        current_subject = subject
                        current_primary_class = primary_class
                        current_types = new_types
                        current_notation = None
                        current_predicates = []
                        blocks_processed += 1
                        continue
            
            # Process predicate lines within a block
            if current_subject:
                stripped = line.strip()
                
                # Inside a triple quote or blank node: collect the line verbatim
                if in_triple_quote or in_blank_node:
                    if stripped.endswith(' .') and not in_blank_node:
                        # End of block inside triple quote - drop the period;
                        # the terminator is re-added when the block is flushed
                        line_without_period = line.rstrip()
                        if line_without_period.endswith('.'):
                            line_without_period = line_without_period[:-1].rstrip()
                        current_predicates.append(('preserve', line_without_period))
                    else:
                        current_predicates.append(('preserve', line.rstrip()))
                    continue
                
                # skos:notation is tracked separately to avoid duplicates
                if stripped.startswith('skos:notation'):
                    # Match everything after skos:notation up to semicolon or period at end
                    # Handle quoted strings that may contain periods
                    notation_match = re.search(r'skos:notation\s+(.+?)\s*[;.]?\s*$', line)
                    if notation_match and not current_notation:
                        notation_value = notation_match.group(1).strip()
                        # Remove trailing semicolon or period if captured
                        if notation_value.endswith(';') or notation_value.endswith('.'):
                            notation_value = notation_value[:-1].strip()
                        current_notation = notation_value
                    continue  # Skip this line, it is re-emitted on output
                
                # Skip the type declaration line (types already captured)
                if stripped.startswith('a '):
                    continue
                
                # Check if block ends
                if stripped.endswith(' .'):
                    # Add this final predicate (if not empty)
                    if stripped and stripped != '.':
                        # Remove trailing period but preserve original line structure
                        line_without_period = line.rstrip()
                        if line_without_period.endswith('.'):
                            line_without_period = line_without_period[:-1].rstrip()
                        current_predicates.append(('normal', line_without_period))
                    continue  # The period is written when the block is flushed
                
                # Regular predicate line - keep exactly as-is
                if stripped:
                    current_predicates.append(('normal', line.rstrip()))
        
        # Write final block
        if current_subject:
            write_consolidated_block(outfile, current_subject, current_types, 
                                   current_notation, current_predicates)
        
        # Close the named graph - only if one was actually opened.  An input
        # containing nothing but @prefix lines (or an empty file) never opens
        # the GRAPH clause, and an unconditional '}' would corrupt the output.
        if graph_opened:
            outfile.write('}\n')
    
    print(f"\nConsolidation complete!")
    print(f"Statistics:")
    print(f"  Lines processed: {line_count:,}")
    print(f"  Blocks processed: {blocks_processed:,}")
    print(f"  Blocks consolidated: {blocks_consolidated:,}")
    print(f"  Final unique subjects: {blocks_processed:,}")


def write_consolidated_block(outfile, subject, types, notation, predicates):
    """
    Write one consolidated subject block to the output file.

    Args:
        outfile: writable text stream (anything with a .write() method)
        subject: subject term, either '<IRI>' or 'prefix:localname'
        types: set of rdf:type terms; emitted sorted on the header line
        notation: optional skos:notation literal, written first when present
        predicates: list of ('normal'|'preserve', line) tuples (plain strings
            are treated as 'normal'); 'preserve' lines are emitted verbatim

    Terminators (' ;' vs ' .') are decided up front rather than by seeking
    back over already-written output: offset arithmetic on text-mode file
    positions is not portable, and the old seek-based fixup left a stray
    double space before the period.
    """
    # Normalize predicate entries and drop empties first, so that the "last
    # predicate" (the one that must end with ' .') is the last NON-EMPTY one.
    items = []
    for pred_item in predicates:
        # Handle both tuple format (type, content) and plain string format
        if isinstance(pred_item, tuple):
            pred_type, pred = pred_item
        else:
            pred_type, pred = 'normal', pred_item
        pred = pred.rstrip()
        if pred:
            items.append((pred_type, pred))
    
    # Write subject and types on a single line; ' ;' only if more lines follow
    sorted_types = sorted(types)
    header_end = ' ;' if (notation or items) else ' .'
    outfile.write(f'{subject} a {", ".join(sorted_types)}{header_end}\n')
    
    # Write notation if present; it terminates the block when nothing follows
    if notation:
        outfile.write(f'    skos:notation {notation}{" ;" if items else " ."}\n')
    
    # Write predicates - preserve their original structure
    for i, (pred_type, pred) in enumerate(items):
        is_last = (i == len(items) - 1)
        
        # 'preserve' lines (blank node or triple quote content) go out as-is.
        # NOTE(review): a block whose FINAL line was captured as 'preserve'
        # keeps its original form and may lack the closing period - this
        # matches prior behavior; confirm against real triple-quoted data.
        if pred_type == 'preserve':
            outfile.write(f'{pred}\n')
            continue
        
        # For 'normal' predicates, apply formatting rules
        if is_last:
            # Last predicate: ensure it ends with period
            if pred.endswith(';') or pred.endswith(','):
                pred = pred[:-1].rstrip()
            if not pred.endswith('.'):
                pred = pred + ' .'
        else:
            # Not last: ensure it has a terminator
            if not (pred.endswith(';') or pred.endswith(',') or pred.endswith('.')):
                pred = pred + ' ;'
        outfile.write(f'{pred}\n')
    
    # Blank line between blocks
    outfile.write('\n')


def main():
    """Main entry point: parse CLI args, consolidate, and report the output path.

    Usage: rdf_consolidate.py <sorted_input.ttl> [output.trig]
    Exits with status 1 on bad usage, missing input, or any processing error.
    """
    if len(sys.argv) < 2:
        print("Usage: python rdf_consolidate.py <sorted_input.ttl> [output.trig]")
        print("\nWARNING: Input file MUST be sorted by subject IRI first!")
        print("Use: python rdf_sort_by_class.py input.ttl output/output_sorted.ttl")
        print("Then: python rdf_consolidate.py output/output_sorted.ttl output/output_consolidated.trig")
        sys.exit(1)
    
    input_file = Path(sys.argv[1])
    
    if not input_file.exists():
        print(f"Error: Input file '{input_file}' not found.")
        sys.exit(1)
    
    # Default output to output/ folder alongside the input file
    if len(sys.argv) >= 3:
        output_file = Path(sys.argv[2])
    else:
        output_dir = Path(__file__).resolve().parent.parent / "output"
        output_file = output_dir / f"{input_file.stem}_consolidated.trig"
    
    # Ensure the destination directory exists; on a fresh checkout the
    # default output/ folder may not have been created yet, and open()
    # in consolidate_sorted_ttl would fail with FileNotFoundError.
    output_file.parent.mkdir(parents=True, exist_ok=True)
    
    try:
        consolidate_sorted_ttl(input_file, output_file)
        
        # Rename output to .trig if it has a different extension
        if output_file.suffix.lower() != '.trig':
            final_output = output_file.with_suffix('.trig')
            if final_output.exists():
                final_output.unlink()
            output_file.rename(final_output)
            print(f"\nOutput written to: {final_output}")
        else:
            print(f"\nOutput written to: {output_file}")
        
    except KeyboardInterrupt:
        print("\n\nInterrupted by user.")
        sys.exit(1)
    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == '__main__':
    main()
