#!/usr/bin/env python3
"""
Find underscores in IRIs within TTL files.
Handles both prefixed (prefix:local) and full (<http://...>) IRI formats.
"""

import re
import sys
from pathlib import Path

# Patterns
# Escapes allowed in the local part of a prefixed name: %XX and
# backslash-escaped punctuation (e.g. "\_" or "\#").
_PLX = r"(?:%[0-9A-Fa-f]{2}|\\[_~.\-!$&'()*+,;=/?#@%])"

# Prefixed name with a non-empty local part.  Group 1 is the prefix (empty for
# the default prefix, as in ":thing"), group 2 is the local part.  The
# lookbehind stops a match from starting in the middle of a word, which also
# keeps blank node labels such as "_:b1" from being read as ":b1".  The local
# part may contain dots but may not end with one, so a statement-terminating
# "." is left out of the match.
PREFIXED_IRI = re.compile(
    r'(?<!\w)'
    r'((?:[^\W\d_](?:[\w.-]*[\w-])?)?)'
    r':'
    r'((?:[\w:]|' + _PLX + r')'
    r'(?:(?:[\w.:-]|' + _PLX + r')*(?:[\w:-]|' + _PLX + r'))?)'
)

# Full IRI in angle brackets; group 1 is the IRI text.  Whitespace and nested
# angle brackets are excluded so "<< ... >>" is never taken for an IRI.
FULL_IRI = re.compile(r'<([^<>\s]*)>')

# Everything after the opening delimiter of a long (triple-quoted) string,
# up to and including the closing delimiter.
_DQ_LONG_TAIL = r'(?:[^"\\]|\\.|"(?!""))*"""'
_SQ_LONG_TAIL = r"(?:[^'\\]|\\.|'(?!''))*'''"
_LONG_STRING = '"""' + _DQ_LONG_TAIL + "|'''" + _SQ_LONG_TAIL
_SHORT_STRING = r'"(?:[^"\\\n]|\\.)*"' + r"|'(?:[^'\\\n]|\\.)*'"

# Any complete string literal: long or short, double- or single-quoted.
STRING_LITERAL = re.compile(f'{_LONG_STRING}|{_SHORT_STRING}', re.DOTALL)

# Used to find the end of a long string that was opened on an earlier line.
_LONG_STRING_END = {
    '"""': re.compile(_DQ_LONG_TAIL, re.DOTALL),
    "'''": re.compile(_SQ_LONG_TAIL, re.DOTALL),
}

# One-pass tokenizer for a single line.  Alternatives are tried in order, so
# strings and comments are consumed before anything inside them can be taken
# for an IRI, and a full IRI is consumed before its content can be re-read as
# a prefixed name.  "open" (a long-string delimiter with no closing delimiter
# on the same line) must come before "short", otherwise the first two quotes
# of '"""' would be read as an empty short string.
_TOKEN = re.compile(
    '|'.join((
        f'(?P<long>{_LONG_STRING})',
        '(?P<open>"""|\'\'\')',
        f'(?P<short>{_SHORT_STRING})',
        r'(?P<comment>#[^\n]*)',
        f'(?P<full>{FULL_IRI.pattern})',
        f'(?P<prefixed>{PREFIXED_IRI.pattern})',
    )),
    re.DOTALL,
)


def _iri_tokens(line: str, open_delim: str) -> tuple[list[str], str]:
    """
    Returns (iris, open_delim) for one line: the IRI tokens found outside
    strings and comments, in left-to-right order, and the delimiter of the
    long string still open at the end of the line ('' if none).
    """
    start = 0
    if open_delim:
        closing = _LONG_STRING_END[open_delim].match(line)
        if closing is None:
            # The whole line is string content.
            return [], open_delim
        start = closing.end()

    iris = []
    for match in _TOKEN.finditer(line, start):
        kind = match.lastgroup
        if kind == 'open':
            # A long string starts here and continues on the next line.
            return iris, match.group()
        if kind in ('full', 'prefixed'):
            iris.append(match.group())
    return iris, ''


def find_underscore_iris(filepath: str) -> list[tuple[int, str, str]]:
    """
    Returns list of (line_number, iri, context) for IRIs containing underscores.

    filepath is read as UTF-8, one line at a time.  line_number is 1-based,
    iri is the token as written ("prefix:local", ":local" or "<...>") and
    context is the stripped source line.  Hits are listed in document order.

    String literals (single- or double-quoted, including multi-line
    triple-quoted ones), comments and blank node labels are ignored.  IRIs
    that share a line with the start or end of a multi-line string are still
    reported.  An underscore counts whether it is in the prefix or in the
    local part.
    """
    results = []
    open_delim = ''  # delimiter of the long string we are inside, if any

    with open(filepath, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            iris, open_delim = _iri_tokens(line, open_delim)
            flagged = [iri for iri in iris if '_' in iri]
            if flagged:
                context = line.strip()
                results.extend((line_num, iri, context) for iri in flagged)

    return results

def main():
    """
    Command-line entry point: report IRIs with underscores in one TTL file.

    Usage: python find_underscore_iris.py <ttl_file> [--summary] [-v]

    --summary prints the occurrence count and the sorted unique IRIs instead
    of one line per hit.  -v adds the source line (cut to 100 characters) under
    each hit.  Flags may be given before or after the file path.

    Usage and error messages go to stderr and the process exits with status 1.
    """
    args = sys.argv[1:]
    known_flags = ('--summary', '-v')
    # The path is the first argument that is not a recognised flag, so
    # "--summary file.ttl" works as well as "file.ttl --summary".
    paths = [arg for arg in args if arg not in known_flags]
    if not paths:
        print(
            "Usage: python find_underscore_iris.py <ttl_file> [--summary] [-v]",
            file=sys.stderr,
        )
        sys.exit(1)

    filepath = paths[0]
    summary_only = '--summary' in args
    verbose = '-v' in args

    target = Path(filepath)
    if not target.exists():
        print(f"Error: {filepath} not found", file=sys.stderr)
        sys.exit(1)
    if target.is_dir():
        # Report this cleanly instead of letting open() raise a traceback.
        print(f"Error: {filepath} is a directory", file=sys.stderr)
        sys.exit(1)

    results = find_underscore_iris(filepath)

    if summary_only:
        unique_iris = {iri for _, iri, _ in results}
        print(f"Found {len(results)} occurrences of {len(unique_iris)} unique IRIs with underscores")
        print("\nUnique IRIs:")
        for iri in sorted(unique_iris):
            print(f"  {iri}")
    else:
        for line_num, iri, context in results:
            print(f"Line {line_num}: {iri}")
            if verbose:
                # Only mark the context as cut when it actually was.
                ellipsis = "..." if len(context) > 100 else ""
                print(f"  Context: {context[:100]}{ellipsis}")

        print(f"\nTotal: {len(results)} IRIs with underscores")

# Run the command-line interface only when this file is executed as a script,
# so importing the module has no side effects.
if __name__ == '__main__':
    main()