# -*- coding: utf-8 -*-
"""
TTL vs TTL Leaflet ID Validator
=================================
Compares leaflet IDs between two TTL files and reports what is in A but not B,
and what is in B but not A.

Matches the pattern:   leaflet:<ID>
e.g.  leaflet:9 mt:LeafletHtmlFileId "fc22c1b0-..." @es.

Usage:
    python validate_ttl_vs_ttl.py <file_a.ttl> <file_b.ttl>

Log file is written next to file_a, named from the stems of both files:
    <file_a_stem>_vs_<file_b_stem>_validation_log.txt

Author: M Dombaugh
"""

import sys
import re
from pathlib import Path
from datetime import datetime


# Turtle prefix label (without the trailing colon) whose local names are compared.
TTL_PREFIX = "leaflet"


def extract_ids(ttl_path: Path) -> set[str]:
    """
    Return the set of unique local IDs found with the leaflet: prefix in a TTL file.

    The file is read line by line as UTF-8. Every occurrence of
    ``<TTL_PREFIX>:<local>`` is collected, where ``<local>`` runs up to the
    next whitespace or one of ``, ; > ]``. Trailing dots (the Turtle statement
    terminator) are stripped, but dots *inside* a local name are preserved, so
    ``leaflet:1.1`` and ``leaflet:1.2`` stay distinct instead of both
    collapsing to ``1``.

    Args:
        ttl_path: Path of the Turtle file to scan.

    Returns:
        The set of local-name strings (without the prefix).

    Side effects:
        Prints an ``[INFO]`` line with the number of unique IDs found.
    """
    # Greedy capture up to a hard delimiter. "." is deliberately NOT a
    # delimiter here because Turtle local names may contain interior dots;
    # a trailing statement-terminating dot is removed with rstrip below.
    pattern = re.compile(rf'\b{re.escape(TTL_PREFIX)}:([^\s,;>\]]+)')
    ids = set()
    with open(ttl_path, "r", encoding="utf-8") as f:
        for line in f:
            for m in pattern.finditer(line):
                local = m.group(1).rstrip(".")
                # A token made only of dots leaves nothing after stripping.
                if local:
                    ids.add(local)
    print(f"[INFO] {ttl_path.name}: {len(ids):,} unique leaflet IDs found")
    return ids


def build_report(
    ids_a: set,
    ids_b: set,
    path_a: Path,
    path_b: Path,
) -> list[str]:
    """
    Build the text report comparing the ID sets of two TTL files.

    The report consists of a header (timestamp, resolved paths, prefix), a
    summary of counts, one section listing IDs present only in A, one section
    listing IDs present only in B, and a final PASS / DIFFERENCES verdict.
    IDs inside each section are ordered by length first and then by text.

    Args:
        ids_a: IDs extracted from file A.
        ids_b: IDs extracted from file B.
        path_a: Path of file A (used for display only).
        path_b: Path of file B (used for display only).

    Returns:
        The report as a list of lines without trailing newlines.
    """

    def length_then_text(value):
        # Shorter IDs first; ties are broken by plain string comparison.
        return (len(value), value)

    only_in_a = sorted(ids_a - ids_b, key=length_then_text)
    only_in_b = sorted(ids_b - ids_a, key=length_then_text)
    shared = ids_a & ids_b

    rule = "=" * 70

    def section(title, missing, none_message):
        # One "IN X BUT NOT IN Y" section: ruled title, entries, blank line.
        if missing:
            entries = [f"  {TTL_PREFIX}:{item}" for item in missing]
        else:
            entries = [none_message]
        return [rule, title, rule, *entries, ""]

    report = [
        rule,
        "TTL vs TTL LEAFLET ID VALIDATION REPORT",
        rule,
        f"Timestamp : {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
        f"File A    : {path_a.resolve()}",
        f"File B    : {path_b.resolve()}",
        f"Prefix    : {TTL_PREFIX}:",
        "",
        "SUMMARY",
        "-" * 40,
        f"  IDs in A                    : {len(ids_a):>8,}",
        f"  IDs in B                    : {len(ids_b):>8,}",
        f"  IDs in both (matched)       : {len(shared):>8,}",
        f"  In A but MISSING from B     : {len(only_in_a):>8,}",
        f"  In B but MISSING from A     : {len(only_in_b):>8,}",
        "",
    ]

    report.extend(
        section(
            f"IN A ({path_a.name}) BUT NOT IN B ({path_b.name})  [{len(only_in_a):,} items]",
            only_in_a,
            "  (none - every ID in A is also in B)",
        )
    )
    report.extend(
        section(
            f"IN B ({path_b.name}) BUT NOT IN A ({path_a.name})  [{len(only_in_b):,} items]",
            only_in_b,
            "  (none - every ID in B is also in A)",
        )
    )

    if only_in_a or only_in_b:
        verdict = "RESULT: DIFFERENCES FOUND - see sections above."
    else:
        verdict = "RESULT: PASS - both TTL files contain identical leaflet ID sets."
    report.extend([rule, verdict, rule])

    return report


def main():
    """
    Command-line entry point: compare leaflet IDs of two TTL files.

    Expects exactly two arguments (file A and file B). Extracts the IDs from
    both files, prints the comparison report to stdout, and writes the same
    report to ``<file_a_stem>_vs_<file_b_stem>_validation_log.txt`` in the
    directory of file A.

    Exits with status 1 when the argument count is wrong, when either input
    is missing or is not a regular file, or when the log cannot be written.
    """
    if len(sys.argv) != 3:
        print("Usage: python validate_ttl_vs_ttl.py <file_a.ttl> <file_b.ttl>")
        sys.exit(1)

    path_a = Path(sys.argv[1])
    path_b = Path(sys.argv[2])

    for p in (path_a, path_b):
        if not p.exists():
            print(f"[ERROR] File not found: {p}")
            sys.exit(1)
        # exists() is also true for directories, which would otherwise crash
        # with a traceback when opened for reading.
        if not p.is_file():
            print(f"[ERROR] Not a regular file: {p}")
            sys.exit(1)

    ids_a = extract_ids(path_a)
    ids_b = extract_ids(path_b)

    report_lines = build_report(ids_a, ids_b, path_a, path_b)

    print()
    for line in report_lines:
        print(line)

    log_name = f"{path_a.stem}_vs_{path_b.stem}_validation_log.txt"
    log_path = path_a.parent / log_name

    # The report has already been printed; a log-write failure (e.g. a
    # read-only directory) is reported cleanly instead of as a traceback.
    try:
        with open(log_path, "w", encoding="utf-8") as lf:
            lf.write("\n".join(report_lines) + "\n")
    except OSError as exc:
        print(f"\n[ERROR] Could not write log file {log_path}: {exc}")
        sys.exit(1)

    print(f"\n[INFO] Log written to: {log_path}")


# Run the comparison only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()