"""
engine/text_replacer.py
-----------------------
Replaces placeholder text in a cloned slide with real course content.

Design principles:
  - Sorts ALL text shapes by absolute visual position (top→bottom, left→right)
    so reading order is correct regardless of XML document order.
  - Handles shapes nested inside groups by applying group coordinate transforms.
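  - Preserves run-level formatting (font, color, bold, italic) by reusing the
    first run's properties (or the paragraph's endParaRPr).  Extra runs and
    paragraphs are dropped; outside txbox_only mode the font size is re-fitted
    to the box and autofit is locked.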
  - Skips step-number shapes (e.g. "01", "02") — they are decorative.

Content payload (what the AI produces, what this module consumes):
    {
        "title": "Slide heading",
        "steps": [                          # optional
            {"label": "Label", "description": "Description text"},
            ...
        ],
        "body": "Plain paragraph text"      # optional, for non-step slides
    }
"""
from __future__ import annotations


import re
from pptx.oxml.ns import qn

# ── Regex: a shape whose full text is a step-number/year decoration ───────────
# Matches any text made of one to four digits and nothing else.  _collect_slots
# drops every shape whose stripped text matches, so a box holding only a number
# (including a bare year) is never offered as a fill target.
_STEP_NUMBER_RE = re.compile(r"^\d{1,4}$")   # e.g. "1", "07", "99", "2015"


# ═══════════════════════════════════════════════════════════════════════════════
# Public API
# ═══════════════════════════════════════════════════════════════════════════════

def replace_slide_text(slide, content: dict, txbox_only: bool = False, stride: int = 1):
    """
    Replace placeholder text in *slide* with *content*.

    content keys (all optional except 'title'):
        title   str          – slide heading
        steps   list[dict]   – each dict has 'label' and/or 'description'
        body    str          – single body paragraph (for plain-text slides)

    txbox_only: if True, only fill shapes with cNvSpPr txBox="1" (use for the
                new Infographic_Bank_Final_Clean slides).
    stride:     number of txBox slots per content step (from catalogue
                txbox_stride). When stride=1 the replacer fills sequentially.
                When stride>1 (and at least one step has a description) it
                fills slot[i*stride] with label[i] and slot[i*stride+1] with
                description[i], then clears all other slots so no residual
                template text leaks through.

    A step whose label or description is missing or empty has its matching
    slot cleared rather than left alone: the fill helpers are no-ops for empty
    text, so skipping the slot would leave the template's placeholder wording
    visible on the finished slide.

    Returns None; the slide's XML is modified in place.
    """
    slots = _collect_slots(slide, txbox_only=txbox_only)

    # For bank slides (txbox_only=True) swap text only — never touch bodyPr,
    # autofit, or font size so the designed layout is preserved exactly.
    _fill = _swap_text if txbox_only else _set_text

    def _put(slot: dict, text) -> None:
        """Write *text* into *slot*, or blank the slot when there is no text."""
        if text:
            _fill(slot["element"], text,
                  cx_in=slot.get("cx_in", 0), cy_in=slot.get("cy_in", 0))
        else:
            _clear_slot(slot["element"])

    # ── 1. Title (first/topmost text shape) ──────────────────────────────────
    # For bank slides (txbox_only=True) the title is handled externally by
    # _fill_placeholder_title (title ph) or an injected text box in the builder.
    # Never consume a txBox slot for the title — every txBox row is a content row.
    title = content.get("title", "")
    if title and slots and not txbox_only:
        _put(slots[0], title)
        slots = slots[1:]

    # ── 1b. Clear header-wide structural text boxes (>8" wide) that are NOT
    #        step labels — bank slides often have a subtitle/caption row just
    #        below the title.  ONLY clear them when non-wide label slots exist
    #        as alternatives; if ALL remaining slots are wide (full-width row
    #        design) they ARE the label rows and must NOT be cleared.
    if txbox_only and content.get("steps"):
        non_wide = [s for s in slots if s.get("cx_in", 0) <= 8.0]
        if non_wide:
            for s in slots:
                if s.get("cx_in", 0) > 8.0:
                    _clear_slot(s["element"])
            slots = non_wide
        # else: all remaining slots are wide label rows — keep them all

    # ── 2. Body (plain-text slide, no steps) ─────────────────────────────────
    body = content.get("body")
    if body and not content.get("steps"):
        if slots:
            _put(slots[0], body)
        return

    # ── 3. Steps: labels then descriptions ───────────────────────────────────
    steps = content.get("steps", [])
    if not steps:
        return

    need_descs = any(step.get("description") for step in steps)

    if stride > 1 and need_descs:
        # Stride-based filling: each content step occupies `stride` consecutive
        # txBox slots (e.g. label, description, icon-caption, ...).
        # Slot[i*stride] receives label[i] and slot[i*stride+1] receives
        # desc[i]; _put blanks the slot when that piece of text is absent.
        handled: set[int] = set()
        for i, step in enumerate(steps):
            base = i * stride
            for idx, key in ((base, "label"), (base + 1, "description")):
                if idx < len(slots):
                    _put(slots[idx], step.get(key, ""))
                    handled.add(idx)
        # Clear every slot no step addressed so no template text shows through
        for j, s in enumerate(slots):
            if j not in handled:
                _clear_slot(s["element"])
        return

    # ── Sequential filling (stride=1, or label-only content) ─────────────────
    # Classify into label slots vs description slots, then fill 1:1.
    label_slots, desc_slots = _classify_slots(slots, need_descs=need_descs)

    for i, step in enumerate(steps):
        if i < len(label_slots):
            _put(label_slots[i], step.get("label"))
        if i < len(desc_slots):
            _put(desc_slots[i], step.get("description"))

    # Clear any slots beyond the last step so no rogue template text floats
    # on screen
    n = len(steps)
    for s in label_slots[n:]:
        _clear_slot(s["element"])
    for s in desc_slots[n:]:
        _clear_slot(s["element"])


# ═══════════════════════════════════════════════════════════════════════════════
# Slot collection — visual position order, group-transform aware
# ═══════════════════════════════════════════════════════════════════════════════

def _classify_slots(slots: list[dict], need_descs: bool = True) -> tuple[list[dict], list[dict]]:
    """
    Given slots sorted by (abs_y, abs_x), split into (label_slots, desc_slots).

    If need_descs is False (overview / label-only content), every slot is treated
    as a label slot — this correctly handles zigzag / alternating layouts where
    left and right boxes are both labels, not label+description pairs.

    Strategy when need_descs=True:
      A) Each row has ≥2 slots → 2-column layout: left col = labels, right = descs.
      B) Single-slot rows (zigzag / hub) → classify by average existing text length:
         rows with avg < 25 chars = label rows; longer rows = description rows.
    """
    if not slots:
        return [], []

    # When content has no descriptions, every box is a label slot (avoids
    # misclassifying zigzag layouts as label+description columns).
    if not need_descs:
        return slots, []

    # ── Group into rows (y within 0.15 in) ───────────────────────────────────
    rows: list[list[dict]] = []
    for s in slots:
        placed = False
        for row in rows:
            if abs(s["abs_y"] - row[0]["abs_y"]) <= 0.15:
                row.append(s)
                placed = True
                break
        if not placed:
            rows.append([s])
    for row in rows:
        row.sort(key=lambda s: s["abs_x"])

    # ── Case A: at least one row has 2+ slots → paired label/desc layout ─────
    if any(len(row) >= 2 for row in rows):
        label_slots: list[dict] = []
        desc_slots:  list[dict] = []
        for row in rows:
            label_slots.append(row[0])
            desc_slots.extend(row[1:])
        return label_slots, desc_slots

    # ── Case B: all rows single-slot — classify by average existing text len ──
    label_slots = []
    desc_slots  = []
    for row in rows:
        avg_len = sum(len(s["text"]) for s in row) / len(row)
        if avg_len < 25:
            label_slots.extend(row)
        else:
            desc_slots.extend(row)

    return label_slots, desc_slots


def _collect_slots(slide, txbox_only: bool = False) -> list[dict]:
    """
    Gather the slide's editable text shapes in reading order.

    The shape tree under ``slide._element`` is walked with ``_walk``; shapes
    whose stripped text matches ``_STEP_NUMBER_RE`` (step-number decorations)
    are discarded, and the remainder is ordered top→bottom then left→right.

    txbox_only=True  — ``_walk`` keeps only shapes with cNvSpPr txBox="1"
                        (explicit text boxes).  Intended for the
                        Infographic_Bank_Final_Clean bank, where structural
                        background rectangles also carry a txBody but must not
                        receive content.
    txbox_only=False — legacy behaviour: every shape that has a txBody counts.

    Each returned entry is a dict:
        {"element": txBody_element, "text": str, "abs_x": float,
         "abs_y": float, "cx_in": float, "cy_in": float}
    """
    found: list[dict] = []
    _walk(slide._element, found, offset_x=0, offset_y=0, scale_x=1.0, scale_y=1.0,
          txbox_only=txbox_only)

    # Drop decorations such as "01" / "02" — they are not content targets.
    editable = [
        slot for slot in found
        if _STEP_NUMBER_RE.match(slot["text"].strip()) is None
    ]

    def _reading_order(slot: dict):
        # y is rounded to 1 dp so slots on the same visual row sort together
        return round(slot["abs_y"], 1), round(slot["abs_x"], 2)

    return sorted(editable, key=_reading_order)


# Divisor that converts the EMU coordinates and extents read from the slide
# XML into the inch values stored in each slot dict.
EMU = 914400.0   # EMUs per inch

def _walk(el, results, offset_x, offset_y, scale_x, scale_y, txbox_only: bool = False):
    """
    Recursively walk the shape tree, resolving group transforms.

    el:                 XML element to inspect (slide root, spTree, grpSp, sp, …).
    results:            list mutated in place; one slot dict is appended for
                        every ``sp``/``pic`` element that owns a txBody.
    offset_x, offset_y: EMU offset of the current coordinate space.
    scale_x, scale_y:   accumulated scale of the current coordinate space.
    txbox_only:         when True, skip shapes whose cNvSpPr lacks txBox="1".

    Slot dicts carry the txBody element, its stripped text, the absolute
    position (``abs_x``/``abs_y``) and the absolute size (``cx_in``/``cy_in``),
    all in inches.  Both position and size have the enclosing groups' scale
    applied, so a box inside a scaled group reports the size it is drawn at.
    """
    # lxml hands back comments / processing instructions as children whose
    # ``tag`` is not a string; they contain no shapes, so skip them.
    if not isinstance(el.tag, str):
        return

    tag = el.tag.rpartition("}")[2]   # local name without the {namespace}

    if tag == "grpSp":
        # Resolve this group's coordinate transform
        nx, ny, nsx, nsy = _group_transform(el, offset_x, offset_y, scale_x, scale_y)
        for child in el:
            _walk(child, results, nx, ny, nsx, nsy, txbox_only)

    elif tag in ("sp", "pic"):
        # txbox_only filter: skip shapes that are not explicit text boxes
        if txbox_only:
            cNvSpPr = el.find(qn("p:nvSpPr") + "/" + qn("p:cNvSpPr"))
            if cNvSpPr is None or cNvSpPr.get("txBox") != "1":
                return

        # Get shape position in current coordinate space
        x, y = _shape_pos(el)
        abs_x = offset_x + x * scale_x
        abs_y = offset_y + y * scale_y

        # Collect ALL text boxes — including empty ones (e.g. blank label boxes,
        # title placeholders with no placeholder text).  Empty boxes are valid
        # target slots; skipping them leaves infographic boxes unfilled.
        txBody = el.find(".//" + qn("p:txBody"))
        if txBody is None:
            txBody = el.find(".//" + qn("a:txBody"))
        if txBody is not None:
            text = "".join(
                t.text or "" for t in txBody.findall(".//" + qn("a:t"))
            ).strip()
            # Read the shape extent (cx, cy) so _set_text can size fonts correctly
            spPr = el.find(qn("p:spPr"))
            if spPr is None:
                spPr = el.find(".//" + qn("p:spPr"))
            cx_emu = cy_emu = 0
            if spPr is not None:
                xfrm = spPr.find(qn("a:xfrm"))
                if xfrm is not None:
                    ext = xfrm.find(qn("a:ext"))
                    if ext is not None:
                        cx_emu = int(ext.get("cx", 0))
                        cy_emu = int(ext.get("cy", 0))
            results.append({
                "element": txBody,
                "text":    text,
                "abs_x":   abs_x / EMU,
                "abs_y":   abs_y / EMU,
                # Extents live in the group's child space just like offsets,
                # so they need the same accumulated scale to become absolute.
                "cx_in":   cx_emu * scale_x / EMU,
                "cy_in":   cy_emu * scale_y / EMU,
            })

    else:
        # spTree, cSld, etc. — just recurse
        for child in el:
            _walk(child, results, offset_x, offset_y, scale_x, scale_y, txbox_only)


def _group_transform(grpSp_el, parent_x, parent_y, parent_sx, parent_sy):
    """
    Derive the coordinate space seen by the children of a grpSp element.

    Reads ``a:off``, ``a:ext``, ``a:chOff`` and ``a:chExt`` from the group's
    ``p:grpSpPr/a:xfrm`` and composes them with the parent's offset and scale.
    If the properties element, the xfrm, or any of the four children is
    missing, the parent's values are handed back unchanged.

    Returns (new_offset_x, new_offset_y, new_scale_x, new_scale_y).
    """
    inherited = (parent_x, parent_y, parent_sx, parent_sy)

    grp_props = grpSp_el.find(qn("p:grpSpPr"))
    xfrm = grp_props.find(qn("a:xfrm")) if grp_props is not None else None
    if xfrm is None:
        return inherited

    pieces = [xfrm.find(qn(name)) for name in ("a:off", "a:ext", "a:chOff", "a:chExt")]
    if any(piece is None for piece in pieces):
        return inherited
    off, ext, ch_off, ch_ext = pieces

    # Group frame in the parent's space
    grp_x, grp_y = int(off.get("x", 0)), int(off.get("y", 0))
    grp_w, grp_h = int(ext.get("cx", 1)), int(ext.get("cy", 1))
    # Child coordinate frame that gets mapped onto the group frame
    child_x, child_y = int(ch_off.get("x", 0)), int(ch_off.get("y", 0))
    child_w, child_h = int(ch_ext.get("cx", 1)), int(ch_ext.get("cy", 1))

    # A zero child extent cannot be divided by; fall back to no scaling.
    ratio_x = grp_w / child_w if child_w else 1.0
    ratio_y = grp_h / child_h if child_h else 1.0

    return (
        parent_x + grp_x * parent_sx - child_x * ratio_x * parent_sx,
        parent_y + grp_y * parent_sy - child_y * ratio_y * parent_sy,
        parent_sx * ratio_x,
        parent_sy * ratio_y,
    )


def _shape_pos(sp_el):
    """
    Return the (x, y) offset of a sp or pic element as integer EMUs.

    Looks for ``p:spPr`` as a direct child first, then anywhere below the
    element, and reads ``a:xfrm/a:off`` from it.  Yields ``(0, 0)`` when any
    link of that chain is absent.
    """
    props = sp_el.find(qn("p:spPr"))
    if props is None:
        props = sp_el.find(".//" + qn("p:spPr"))

    xfrm = props.find(qn("a:xfrm")) if props is not None else None
    offset = xfrm.find(qn("a:off")) if xfrm is not None else None
    if offset is None:
        return 0, 0

    return int(offset.get("x", 0)), int(offset.get("y", 0))


# ═══════════════════════════════════════════════════════════════════════════════
# Text replacement — preserves ALL formatting
# ═══════════════════════════════════════════════════════════════════════════════

def _fit_font_pt(
    cx_in: float,
    cy_in: float,
    text: str,
    orig_pt: float,
    default_pt: float = 18.0,
    line_spacing: float = 1.2,
    n_items: int = 1,
    spc_aft_pt: float = 0.0,
    min_pt: int = 10,
) -> float:
    """
    Return the largest font size (pts) at which *text* fits inside a box of
    cx_in x cy_in inches without overflowing, down to min_pt (default 10 pt).

    Uses a proportional font model:
      char_width  ~  font_pt / 144   inches  (avg for Calibri/Arial)
      line_height ~  font_pt / 72 * line_spacing inches

    n_items:    number of separate paragraphs/bullets — each occupies at least
                one line; minimum height = n_items × line_height.
    spc_aft_pt: extra vertical spacing added after each item paragraph (pts).
                Pass the paragraph spcAft value so the formula stays accurate.
    min_pt:     hard floor — never return below this value.

    Tries the original font first; reduces by 1 pt until fit or min_pt floor.
    """
    import math
    floor = max(int(min_pt), 1)
    start_pt = orig_pt if orig_pt >= floor else default_pt
    for pt in range(int(start_pt), floor - 1, -1):
        chars_per_line = cx_in * 144 / pt
        lines_needed   = max(
            math.ceil(len(text) / max(chars_per_line, 1)),
            n_items,   # each item occupies at least one line
        )
        text_height    = lines_needed * (pt / 72) * (line_spacing + 0.1)  # +0.1 safety
        spacing_height = n_items * (spc_aft_pt / 72)
        height_needed  = text_height + spacing_height
        if height_needed <= cy_in:
            return float(pt)
    return float(floor)


def _swap_text(txBody_el, new_text: str, cx_in: float = 0, cy_in: float = 0):
    """
    Replace ONLY the text content — does NOT touch bodyPr, autofit, or font size.
    Use this for bank infographic slides where the original visual design must be
    preserved exactly (position, size, colour, font are all left untouched).
    cx_in / cy_in are accepted but ignored (keeps the same call signature as _set_text).

    The result is a single paragraph holding a single run: extra runs, extra
    paragraphs, and any line breaks (<a:br>) or fields (<a:fld>) left over from
    the template paragraph are removed so no template text or blank line
    trails the new content.

    Does nothing when *new_text* is empty; returns None.
    """
    if not new_text:
        return
    import copy as _copy
    from lxml import etree
    _NS_A = "http://schemas.openxmlformats.org/drawingml/2006/main"

    paragraphs = txBody_el.findall(qn("a:p"))
    if not paragraphs:
        p_el = etree.Element(f"{{{_NS_A}}}p")
        txBody_el.append(p_el)
        paragraphs = [p_el]

    first_para = paragraphs[0]

    # Line breaks and fields are not <a:r> runs, so the run clean-up below
    # would leave them behind as an empty line or stray field text.
    for tag in ("a:br", "a:fld"):
        for stale in first_para.findall(qn(tag)):
            first_para.remove(stale)

    runs = first_para.findall(qn("a:r"))
    if runs:
        # Existing run — just swap the text string, touch nothing else
        t_el = runs[0].find(qn("a:t"))
        if t_el is not None:
            t_el.text = new_text
        for extra_run in runs[1:]:
            first_para.remove(extra_run)
    else:
        # No runs — create one, copying endParaRPr for font/colour/size so the
        # bank slide's designed styling (e.g. white text on coloured box) is kept.
        r_el = etree.Element(f"{{{_NS_A}}}r")
        t_el = etree.SubElement(r_el, f"{{{_NS_A}}}t")
        t_el.text = new_text
        endParaRPr = first_para.find(qn("a:endParaRPr"))
        if endParaRPr is not None:
            rPr = _copy.deepcopy(endParaRPr)
            rPr.tag = f"{{{_NS_A}}}rPr"
            r_el.insert(0, rPr)
            # endParaRPr must stay the last child of <a:p>
            endParaRPr.addprevious(r_el)
        else:
            first_para.append(r_el)

    for extra_para in paragraphs[1:]:
        txBody_el.remove(extra_para)


def _set_text(txBody_el, new_text: str, cx_in: float = 0, cy_in: float = 0,
              font_pt: float | None = None,
              min_font_pt: int = 10,
              font_family: str | None = None):
    """
    Replace the text content of a txBody element with new_text.
    Keeps the first run's rPr (font, color, bold, etc.); when the paragraph has
    no run, a new one is created with a copy of the paragraph's endParaRPr.

    The result is a single paragraph holding a single run: extra runs, extra
    paragraphs, and leftover line breaks (<a:br>) or fields (<a:fld>) from the
    template paragraph are removed.

    Font fitting:
      - Reads the original size from the rPr ``sz`` attribute; when absent,
        _fit_font_pt starts from its 18 pt default.
      - Computes the largest size at which the text fits the cx_in × cy_in box
        and writes it to ``sz``.  Skipped when cx_in or cy_in is not positive.
      - Replaces any existing autofit flag on bodyPr with <a:noAutofit> so the
        box stays at its designed size.

    If `font_pt` is given, auto-fit is bypassed and that exact size is applied.
    `min_font_pt` enforces a floor on the auto-fitter (raise to e.g. 32 for
    titles so they don't shrink below a readable size).
    `font_family` if given, stamps an explicit <a:latin typeface=...> child onto
    the rPr so the run renders in the brand font instead of inheriting from
    the layout/master (useful when cloned slides come from mixed sources).

    Size and typeface are only applied when an rPr exists (an existing run's
    rPr, or the copied endParaRPr).  Does nothing when *new_text* is empty;
    returns None.
    """
    if not new_text:
        return

    import copy as _copy
    from lxml import etree
    _NS_A = "http://schemas.openxmlformats.org/drawingml/2006/main"

    def _insert_in_order(parent, child, successor_tags):
        """Insert *child* before the first sibling that must follow it."""
        later = {qn(name) for name in successor_tags}
        for sibling in parent:
            if sibling.tag in later:
                sibling.addprevious(child)
                return
        parent.append(child)

    def _apply_run_style(rPr):
        """Write the requested/fitted size and optional typeface onto *rPr*."""
        if font_pt is not None:
            # round() guards against float error such as 8.7 * 100 == 869.99…
            rPr.set("sz", str(int(round(font_pt * 100))))
        elif cx_in > 0 and cy_in > 0:
            orig_sz = rPr.get("sz")
            orig_pt = int(orig_sz) / 100.0 if orig_sz else 0.0
            fit_pt = _fit_font_pt(cx_in, cy_in, new_text, orig_pt,
                                  default_pt=18.0,
                                  min_pt=min_font_pt)
            rPr.set("sz", str(int(round(fit_pt * 100))))
        if font_family:
            # Replace any existing <a:latin> child rather than stack new ones.
            for old in rPr.findall(qn("a:latin")):
                rPr.remove(old)
            latin = etree.Element(f"{{{_NS_A}}}latin")
            latin.set("typeface", font_family)
            # The DrawingML schema puts <a:latin> ahead of these siblings;
            # appending it after them yields out-of-order XML.
            _insert_in_order(rPr, latin,
                             ("a:ea", "a:cs", "a:sym", "a:hlinkClick",
                              "a:hlinkMouseOver", "a:rtl", "a:extLst"))

    # ── Lock the box: remove all autofit flags, add noAutofit ────────────────
    # We are setting the font size ourselves, so let the box stay fixed.
    bodyPr = txBody_el.find(qn("a:bodyPr"))
    if bodyPr is not None:
        for tag in ("a:noAutofit", "a:spAutoFit", "a:normAutofit"):
            for stale in bodyPr.findall(qn(tag)):
                bodyPr.remove(stale)
        # The autofit choice must precede the 3-D / extension children.
        _insert_in_order(bodyPr, etree.Element(f"{{{_NS_A}}}noAutofit"),
                         ("a:scene3d", "a:sp3d", "a:flatTx", "a:extLst"))

    paragraphs = txBody_el.findall(qn("a:p"))
    if not paragraphs:
        # txBody has no paragraph at all — create one so we have somewhere to write
        p_el = etree.Element(f"{{{_NS_A}}}p")
        txBody_el.append(p_el)
        paragraphs = [p_el]

    first_para = paragraphs[0]

    # Line breaks and fields are not <a:r> runs, so the run clean-up below
    # would leave them behind as an empty line or stray field text.
    for tag in ("a:br", "a:fld"):
        for stale in first_para.findall(qn(tag)):
            first_para.remove(stale)

    runs = first_para.findall(qn("a:r"))

    if runs:
        first_run = runs[0]
        t_el = first_run.find(qn("a:t"))
        if t_el is not None:
            t_el.text = new_text
        for extra_run in runs[1:]:
            first_para.remove(extra_run)

        rPr = first_run.find(qn("a:rPr"))
        if rPr is not None:
            _apply_run_style(rPr)
    else:
        r_el = _make_run(new_text)
        endParaRPr = first_para.find(qn("a:endParaRPr"))
        if endParaRPr is not None:
            # Copy formatting from endParaRPr (the designer's intended font/size/color)
            # and attach it to the new run as <a:rPr>
            rPr = _copy.deepcopy(endParaRPr)
            rPr.tag = f"{{{_NS_A}}}rPr"
            _apply_run_style(rPr)
            r_el.insert(0, rPr)
            # endParaRPr must always be the last element in <a:p>
            endParaRPr.addprevious(r_el)
        else:
            first_para.append(r_el)

    # Remove extra paragraphs
    for extra_para in paragraphs[1:]:
        txBody_el.remove(extra_para)


def _clear_slot(txBody_el):
    """
    Blank out all text in a txBody while keeping the first run's formatting.

    The first paragraph keeps its first run with an empty text string; extra
    runs, extra paragraphs, and any line breaks (<a:br>) or fields (<a:fld>)
    in the first paragraph are removed so nothing from the template remains
    visible.  A txBody without paragraphs is left untouched.  Returns None.
    """
    paragraphs = txBody_el.findall(qn("a:p"))
    if not paragraphs:
        return
    first_para = paragraphs[0]

    # Breaks and fields are not <a:r> runs; without this they would survive
    # the run clean-up below as a blank line or stray field text.
    for tag in ("a:br", "a:fld"):
        for stale in first_para.findall(qn(tag)):
            first_para.remove(stale)

    runs = first_para.findall(qn("a:r"))
    if runs:
        t_el = runs[0].find(qn("a:t"))
        if t_el is not None:
            t_el.text = ""
        for extra_run in runs[1:]:
            first_para.remove(extra_run)
    for extra_para in paragraphs[1:]:
        txBody_el.remove(extra_para)


def _make_run(text: str):
    """
    Create a bare <a:r><a:t>text</a:t></a:r> element.

    The run is returned detached (no parent) so the caller can insert it
    wherever it belongs; it carries no <a:rPr>.
    """
    from lxml import etree
    NS = "http://schemas.openxmlformats.org/drawingml/2006/main"
    run = etree.Element(f"{{{NS}}}r")
    etree.SubElement(run, f"{{{NS}}}t").text = text
    return run