# -*- coding: utf-8 -*-
"""
Turtle RDF File Sorter with Multi-line Triple-Quote and Prefix Support
========================================================================
Sorts Turtle RDF entities while preserving multi-line string literals.
Handles both full IRI notation and abbreviated prefix notation.

Author: M Dombaugh
Created: 2025-11-10
Version: 6.0 - Output written to output/ folder relative to project root.
               Input read from output/ folder (produced by rdf_from_metadata.py).
         5.0 - Fixed to handle Turtle shorthand ontology declarations (a owl:Ontology).
               Now correctly identifies owl:* types as ontology metadata, not instance data.
               Supports multi-line ontology statements (semicolon-continued).
         4.0 - Fixed to preserve N-Triples style ontology declarations in header.
               Distinguishes between N-Triples metadata (full predicate IRIs) and
               Turtle instance data (uses 'a' shorthand for rdf:type).
"""

import re
from pathlib import Path

def extract_entities_from_ttl(ttl_content):
    """
    Extract RDF instance entities from Turtle content.

    Understands the Turtle features this project emits:
    - Multi-line string literals delimited by triple quotes
    - Escaped quotes inside strings (not miscounted as delimiters)
    - Entity boundaries (lines ending with ' .' outside a literal)
    - Abbreviated prefix notation (e.g., drug:d00001)
    - Full IRI notation (e.g., <http://...>)

    Skips ontology metadata (subjects typed as owl:Ontology, owl:Class,
    owl:ObjectProperty, etc.). Only captures instance data, i.e. subjects
    using the 'a' shorthand with a non-owl type.

    Args:
        ttl_content: Full text of a Turtle file.

    Returns:
        List of (subject_iri, entity_text) tuples in file order; entities
        whose first line matches neither subject pattern are dropped.
    """
    # Subject patterns for the first line of an entity, compiled once.
    # Full IRI form:      <http://...> a Type
    # Prefixed form:      prefix:id a Type
    full_iri_re = re.compile(r'<([^>]+)>\s+a\s+')
    prefixed_re = re.compile(r'([a-zA-Z_][a-zA-Z0-9_]*:[^\s]+)\s+a\s+')

    entities = []
    current_entity = []
    in_triple_quote = False
    in_instance_data = False

    for line in ttl_content.split('\n'):
        # Skip header material (prefixes + ontology definitions) until the
        # first instance-data line: `subject a Type` with a non-owl type.
        if not in_instance_data:
            stripped = line.strip()
            if stripped.startswith('<http'):
                parts = stripped.split()
                if len(parts) >= 3 and parts[1] == 'a':
                    if parts[2].startswith('owl:'):
                        continue  # ontology metadata belongs to the header
                    in_instance_data = True
            elif stripped and not stripped.startswith(('@', '#')):
                parts = stripped.split()
                if len(parts) >= 3 and ':' in parts[0] and parts[1] == 'a':
                    if parts[2].startswith('owl:'):
                        continue  # owl:Class / owl:*Property definitions
                    in_instance_data = True

            if not in_instance_data:
                continue

        # Skip blank lines between entities.
        if not current_entity and not line.strip():
            continue

        # Track triple-quote state. Remove escaped quotes first so they
        # cannot be miscounted; an odd number of '"""' toggles the state.
        if line.replace(r'\"', '').count('"""') % 2 == 1:
            in_triple_quote = not in_triple_quote

        current_entity.append(line)

        # Entity is complete when a line ends with ' .' outside a literal.
        if not in_triple_quote and line.rstrip().endswith(' .'):
            # FIX: re.match is anchored at position 0, so an indented
            # subject line was silently dropped. Strip leading whitespace
            # before matching, consistent with the detection above.
            first_line = current_entity[0].lstrip()
            match = full_iri_re.match(first_line) or prefixed_re.match(first_line)
            if match:
                entities.append((match.group(1), '\n'.join(current_entity)))

            # Reset for next entity
            current_entity = []
            in_triple_quote = False

    return entities

def extract_ontology_header(ttl_content):
    """
    Collect every line preceding the instance data: prefix declarations
    plus ontology definitions. Scanning stops at the first line that
    begins instance data.

    Ontology metadata (kept in the header):
    - <http://...> a owl:Ontology
    - mt:ClassName a owl:Class
    - mt:propName a owl:ObjectProperty / owl:DatatypeProperty

    Instance data (stops the scan):
    - <http://...> a mt:SomeClass (any non-owl type)
    - drug:d00001 a mt:Drug

    Returns:
        List of header lines, in original order.
    """
    kept = []
    pending = False  # inside a semicolon-continued ontology statement

    for raw_line in ttl_content.split('\n'):
        text = raw_line.strip()

        # Continuation lines of a multi-line ontology statement are kept
        # verbatim until the terminating '.' is seen.
        if pending:
            kept.append(raw_line)
            if text.endswith('.'):
                pending = False
            continue

        tokens = text.split()
        has_type = len(tokens) >= 3 and tokens[1] == 'a'

        # Full-IRI subject with an rdf:type ('a') shorthand.
        if text.startswith('<http') and has_type:
            if not tokens[2].startswith('owl:'):
                break  # first instance entity — header ends here
            kept.append(raw_line)
            pending = not text.endswith('.')
            continue

        # Prefixed subject (excluding @prefix, comments, and IRI lines).
        if (has_type and text
                and not text.startswith(('@', '#', '<'))
                and ':' in tokens[0]):
            if not tokens[2].startswith('owl:'):
                break  # first instance entity — header ends here
            kept.append(raw_line)
            pending = not text.endswith('.')
            continue

        # Everything else (prefixes, comments, blanks, N-Triples metadata)
        # stays in the header.
        kept.append(raw_line)

    return kept

def sort_ttl_file(input_path, output_path):
    """
    Sort a Turtle RDF file by subject IRI while preserving all content.

    The ontology header (prefix declarations plus owl:* class/property
    definitions) stays at the top; instance entities are re-ordered
    alphabetically by subject IRI, separated by blank lines. Multi-line
    triple-quoted literals survive intact.

    Args:
        input_path: Path of the Turtle file to read.
        output_path: Path the sorted Turtle file is written to.
    """
    print(f"[INFO] Reading {input_path}")

    with open(input_path, 'r', encoding='utf-8') as src:
        content = src.read()

    # Header: prefixes + class/property definitions, kept verbatim.
    header_lines = extract_ontology_header(content)
    print(f"[INFO] Found {len(header_lines)} header lines (prefixes + ontology)")

    # Instance entities as (subject_iri, text) pairs.
    entities = extract_entities_from_ttl(content)
    print(f"[INFO] Found {len(entities)} entities")

    entities.sort(key=lambda pair: pair[0])
    print(f"[INFO] Sorted entities by IRI")

    entity_blocks = [text for _, text in entities]
    with open(output_path, 'w', encoding='utf-8') as dst:
        dst.writelines(h + '\n' for h in header_lines)
        # One blank line between entities, single newline after the last.
        if entity_blocks:
            dst.write('\n\n'.join(entity_blocks) + '\n')

    print(f"[INFO] Wrote sorted output to {output_path}")
    print(f"[INFO] Complete!")

if __name__ == "__main__":
    import sys
    
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
        output_file = sys.argv[2] if len(sys.argv) > 2 else str(
            Path(__file__).resolve().parent.parent / "output" / "output_sorted.ttl"
        )
    else:
        output_dir = Path(__file__).resolve().parent.parent / "output"
        input_file  = input("Enter input TTL file path (or press Enter for 'output/output.ttl'): ").strip() \
                      or str(output_dir / "output.ttl")
        output_file = input("Enter output file path (or press Enter for 'output/output_sorted.ttl'): ").strip() \
                      or str(output_dir / "output_sorted.ttl")
    
    sort_ttl_file(input_file, output_file)
