"""
engine/slide_selector.py
------------------------
Picks the best-fitting bank infographic slide for a given content payload.

How it works:
  1. Reads the enriched bank catalogue (bank_catalogue_slim.json + dimension data).
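  2. Filters slides that can hold the content without text overflow:
       - item_count  <= slide's step_count
       - max label char count fits in the slide's label boxes
         (up to 3x the single-line capacity, since labels wrap)
       - max description char count fits in the slide's description boxes
         (up to 6x the single-line capacity, since descriptions wrap)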
  3. Ranks remaining candidates by:
       - Preferred visual type match (+15 exact family, +7 supported type)
       - Closest step count to content  (penalise empty slots)
       - Wider boxes are better (more breathing room)
  4. Returns the top candidate entry dict, or None if nothing fits.

Usage:
    from engine.slide_selector import select_slide
    entry = select_slide(catalogue, {
        "item_count":    5,
        "max_label_len": 10,   # longest label in chars
        "max_desc_len":  55,   # longest description in chars (0 = no descriptions)
        "visual_type":   "process",
    })
    if entry:
        slide = cloner.clone_slide(str(BANK / entry["file"]), entry["slide_index"])
"""

from __future__ import annotations

# ── Character capacity constants ─────────────────────────────────────────────
# Heuristic for estimating how many characters fit on one line of a text box.
# Average character width for proportional fonts (Calibri/Arial) is roughly
# half the point size, divided by 72 to get inches.
# max_chars_on_one_line ≈ box_width_in / (font_pt / 144)
#                       = box_width_in * 144 / font_pt
# Example: a 2-inch-wide box at 14 pt holds about 2 * 144 / 14 ≈ 20 characters.
#
# NOTE(review): none of the functions visible in this file reference these
# constants; select_slide() reads precomputed "max_label_chars" /
# "max_desc_chars" from the catalogue instead. Presumably they are used by
# the code that builds those catalogue fields — confirm before removing.
#
# Default font sizes used when the slide doesn't carry an explicit size:
_DEFAULT_LABEL_PT  = 14.0   # conservative default for label-sized text
_DEFAULT_DESC_PT   = 11.0   # conservative default for description-sized text
_CHAR_WIDTH_FACTOR = 144    # = 2 × 72; gives chars per inch at 1 pt


# Keywords in catalogue entries that mark a slide as technology-specific.
# These entries are excluded when the course is not a tech/IT course:
# select_slide() lower-cases an entry's "keywords" and skips the entry when
# any of them is in this set and is_tech_course is False. Because the entry
# side is lower-cased before the comparison, members here must be lower-case.
_TECH_CATALOGUE_KEYWORDS = {"technology", "cybersecurity", "devops", "programming", "coding"}

def _recency_penalty(
    history: list[str],
    value: str,
    base: float,
    step: float,
    floor: float,
) -> float:
    """
    Return the score penalty for reusing *value*, given usage *history*.

    *history* is ordered oldest first. Only the most recent occurrence of
    *value* counts: it costs ``base`` when it is the latest entry, ``step``
    less for every entry that was used after it, and never less than
    ``floor``. Returns 0.0 when *value* does not occur in *history*.
    """
    for age, past in enumerate(reversed(history)):
        if past == value:
            return max(base - age * step, floor)
    return 0.0


def select_slide(
    catalogue: list[dict],
    content: dict,
    used_counts: dict | None = None,
    max_uses: int = 2,
    topic_keywords: list[str] | None = None,
    is_tech_course: bool = True,
    used_families: list[str] | None = None,
    used_files: list[str] | None = None,
    excluded_keys: set | None = None,
) -> dict | None:
    """
    Return the best catalogue entry for *content*, or None.

    Each entry is first run through hard filters (usage limit, explicit
    exclusion, tech-only slides, slot count, reliability, comparison arity,
    family repetition, label/description capacity) and the survivors are
    scored. If any survivor has exactly ``item_count`` slots, only those
    exact-fit entries compete; otherwise all survivors do. The highest score
    wins. A small random tiebreaker (0 to 0.3) is added to every score, so the
    choice between near-equal candidates is intentionally not deterministic.

    content keys:
        item_count    int   — number of steps / items needed
        max_label_len int   — length of the longest label string (chars)
        max_desc_len  int   — length of the longest description string (0 = no descs needed)
        visual_type   str   — preferred type: "process", "timeline", "hub", etc.
                              (optional; missing or None means no preference)

    used_counts:  dict mapping (file, slide_index) -> use count. Slides at or
                  above max_uses are excluded. Pass the same dict across calls
                  so it accumulates across the whole deck. This function only
                  reads it; the caller is responsible for updating it.
    max_uses:     maximum times a single slide may be used in one course (default 2).
    topic_keywords:  optional list of keyword strings for topic-relevance boosting
                     (+3 per keyword shared with the entry, compared
                     case-insensitively).
    is_tech_course:  if False, slides tagged with tech keywords are excluded.
    used_families:   recent visual families already used (oldest first); used to
                     penalise repeating the same family, and to reject a family
                     outright when it fills both of the last two positions.
    used_files:      list of PPTX filenames already used (oldest first); used to
                     penalise drawing from the same source file repeatedly.
    excluded_keys:   optional set of (file, slide_index) keys that must never be
                     returned (e.g. capacity-checked and rejected by the caller).
    """
    # Function-scope import, executed once per call rather than once per entry.
    import random

    n_items = content.get("item_count", 0)
    max_lbl = content.get("max_label_len", 0)
    max_dsc = content.get("max_desc_len", 0)
    # `or ""` tolerates an explicit None as well as a missing key.
    want_type = (content.get("visual_type") or "").lower()
    need_descs = max_dsc > 0

    # Loop invariants, computed once.
    topic_kws = {k.lower() for k in topic_keywords} if topic_keywords else set()
    recent_families = used_families[-2:] if used_families else []

    # (score, step_count, entry) for every entry that passes the hard filters.
    scored: list[tuple[float, int, dict]] = []

    for entry in catalogue:
        # ── Hard filters ──────────────────────────────────────────────────────

        # Enforce max-uses-per-course limit
        key = (entry.get("file"), entry.get("slide_index"))
        if used_counts is not None and used_counts.get(key, 0) >= max_uses:
            continue

        # Skip explicitly excluded entries (e.g. capacity-checked and rejected)
        if excluded_keys and key in excluded_keys:
            continue

        # Lower-cased once and used for both the tech filter and the topic
        # overlap, so the two comparisons are consistently case-insensitive.
        entry_kws = {k.lower() for k in entry.get("keywords") or []}

        # Exclude tech-themed slides for non-tech courses
        if not is_tech_course and entry_kws & _TECH_CATALOGUE_KEYWORDS:
            continue

        # Must have enough item slots. text_slot_count is the fallback when
        # step_count is missing or zero.
        step_count = entry.get("step_count") or entry.get("text_slot_count") or 0
        if step_count < n_items:
            continue

        # Skip slides with non-integer stride — text lands in wrong boxes
        if not entry.get("txbox_reliable", True):
            continue

        # Comparison-family bank slides are designed for binary comparisons
        # (2 items side by side). Reject them when the content has any other
        # number of items — otherwise items 3+ go nowhere and the slide
        # renders with empty cells.
        family = entry.get("visual_family", "")
        if family == "comparison" and n_items != 2:
            continue

        # Hard block: never use the same family three times in a row.
        if family and recent_families.count(family) >= 2:
            continue

        # Label capacity check — allow up to 3× single-line capacity since
        # labels wrap across 2-3 lines. normAutofit further reduces font if needed.
        lbl_cap = entry.get("max_label_chars", 999)
        if max_lbl > 0 and lbl_cap * 3 < max_lbl:
            continue

        # Description capacity check — allow up to 6× single-line capacity:
        # text wraps across 4-6 lines and normAutofit shrinks the font to fit.
        # max_desc_chars measures box width at default font size (chars per line).
        has_descs = entry.get("has_descriptions", False)
        dsc_cap = entry.get("max_desc_chars", 0)
        if need_descs and (not has_descs or dsc_cap * 6 < max_dsc):
            continue

        # ── Scoring ───────────────────────────────────────────────────────────
        score = 0.0

        # Penalise banks that have description slots when content needs none.
        # Empty description placeholder shapes remain visible in the output,
        # creating blank bordered boxes that look broken.
        if not need_descs and has_descs:
            score -= 8.0

        # Visual type / family match. An exact family match scores highest —
        # the shape's visual grammar then represents the content's logical
        # structure; a merely supported type is acceptable but weaker.
        if want_type:
            if want_type == family:
                score += 15.0
            elif want_type in (entry.get("supported_visual_types") or []):
                score += 7.0

        # Prefer closest item count — strongly penalise every spare slot so a
        # 5-slot slide is not picked for 4-step content. The slot-count filter
        # above guarantees this difference is never negative.
        score -= (step_count - n_items) * 3.0

        # Prefer slides with more breathing room in label boxes
        if max_lbl > 0 and lbl_cap < 999:
            score += (lbl_cap - max_lbl) / max(lbl_cap, 1) * 2.0

        # Prefer slides with more breathing room in description boxes
        if need_descs and dsc_cap > 0:
            score += (dsc_cap - max_dsc) / max(dsc_cap, 1) * 2.0

        # Keyword overlap with slide topic (case-insensitive on both sides)
        score += len(topic_kws & entry_kws) * 3.0

        # Penalise visual family repetition by recency of the last use:
        # -8 if it was the latest family, then -6, -4, -2, and -1 beyond that.
        if used_families:
            score -= _recency_penalty(used_families, family, 8.0, 2.0, 1.0)

        # Penalise reuse of the same source PPTX file so slides are drawn from
        # many different files across the deck:
        # -6 if it was the latest file, then -4.5, -3, -1.5, and -0.5 beyond that.
        if used_files:
            score -= _recency_penalty(used_files, entry.get("file", ""), 6.0, 1.5, 0.5)

        # Small random tiebreaker so equal-scoring slides don't always resolve
        # to the same catalogue order position.
        score += random.uniform(0, 0.3)

        scored.append((score, step_count, entry))

    if not scored:
        return None

    # Prefer exact step count match — a shape designed for exactly N items
    # looks complete; a shape with extra empty slots looks broken.
    exact = [candidate for candidate in scored if candidate[1] == n_items]
    pool = exact or scored
    # max() returns the first of equally scored candidates, matching a stable
    # descending sort, without reordering the list.
    return max(pool, key=lambda candidate: candidate[0])[2]


def content_requirements(content: dict) -> dict:
    """
    Derive selector requirements from a content payload (title + steps).

    Input (same shape as replace_slide_text):
        { "title": "...", "steps": [{"label": "...", "description": "..."}, ...] }

    A missing or None "steps" list is treated as empty, and a missing or None
    "label" / "description" counts as an empty string, so payloads carrying
    JSON nulls do not raise.

    Returns:
        { "item_count": int, "max_label_len": int, "max_desc_len": int }
        The two lengths are 0 when there are no steps.
    """
    steps = content.get("steps") or []
    label_lengths = (len(step.get("label") or "") for step in steps)
    desc_lengths = (len(step.get("description") or "") for step in steps)
    return {
        "item_count": len(steps),
        "max_label_len": max(label_lengths, default=0),
        "max_desc_len": max(desc_lengths, default=0),
    }