""" engine/text_replacer.py ----------------------- Replaces placeholder text in a cloned slide with real course content. Design principles: - Sorts ALL text shapes by absolute visual position (top→bottom, left→right) so reading order is correct regardless of XML document order. - Handles shapes nested inside groups by applying group coordinate transforms. - Preserves 100% of text formatting (font, size, color, bold, italic). Only the text *content* changes — no XML structure is altered. - Skips step-number shapes (e.g. "01", "02") — they are decorative. Content payload (what the AI produces, what this module consumes): { "title": "Slide heading", "steps": [ # optional {"label": "Label", "description": "Description text"}, ... ], "body": "Plain paragraph text" # optional, for non-step slides } """ from __future__ import annotations import re from pptx.oxml.ns import qn # ── Regex: a shape whose full text is a step-number/year decoration ─────────── _STEP_NUMBER_RE = re.compile(r"^\d{1,4}$") # "1".."99", "01".."09", years 2015… # ═══════════════════════════════════════════════════════════════════════════════ # Public API # ═══════════════════════════════════════════════════════════════════════════════ def replace_slide_text(slide, content: dict, txbox_only: bool = False, stride: int = 1): """ Replace placeholder text in *slide* with *content*. content keys (all optional except 'title'): title str – slide heading steps list[dict] – each dict has 'label' and/or 'description' body str – single body paragraph (for plain-text slides) txbox_only: if True, only fill shapes with cNvSpPr txBox="1" (use for the new Infographic_Bank_Final_Clean slides). stride: number of txBox slots per content step (from catalogue txbox_stride). When stride=1 the replacer fills sequentially. When stride>1 it fills slot[i*stride] with label[i] and slot[i*stride+1] with description[i], then clears all other slots so no residual template text leaks through. """ slots = _collect_slots(slide, txbox_only=txbox_only) # For bank slides (txbox_only=True) swap text only — never touch bodyPr, # autofit, or font size so the designed layout is preserved exactly. _fill = _swap_text if txbox_only else _set_text # ── 1. Title (first/topmost text shape) ────────────────────────────────── # For bank slides (txbox_only=True) the title is handled externally by # _fill_placeholder_title (title ph) or an injected text box in the builder. # Never consume a txBox slot for the title — every txBox row is a content row. title = content.get("title", "") if title and slots and not txbox_only: _fill(slots[0]["element"], title, cx_in=slots[0].get("cx_in", 0), cy_in=slots[0].get("cy_in", 0)) slots = slots[1:] # ── 1b. Clear header-wide structural text boxes (>8" wide) that are NOT # step labels — bank slides often have a subtitle/caption row just # below the title. ONLY clear them when non-wide label slots exist # as alternatives; if ALL remaining slots are wide (full-width row # design) they ARE the label rows and must NOT be cleared. if txbox_only and content.get("steps"): non_wide = [s for s in slots if s.get("cx_in", 0) <= 8.0] if non_wide: for s in slots: if s.get("cx_in", 0) > 8.0: _clear_slot(s["element"]) slots = non_wide # else: all remaining slots are wide label rows — keep them all # ── 2. Body (plain-text slide, no steps) ───────────────────────────────── body = content.get("body") if body and not content.get("steps"): if slots: _fill(slots[0]["element"], body, cx_in=slots[0].get("cx_in", 0), cy_in=slots[0].get("cy_in", 0)) return # ── 3. Steps: labels then descriptions ─────────────────────────────────── steps = content.get("steps", []) if not steps: return need_descs = any("description" in step and step["description"] for step in steps) if stride > 1 and need_descs: # Stride-based filling: each content step occupies `stride` consecutive # txBox slots (e.g. label, description, icon-caption, ...). # We fill slot[i*stride] with label[i], slot[i*stride+1] with desc[i], # and clear everything else to remove residual template text. filled: set[int] = set() for i, step in enumerate(steps): lbl_idx = i * stride if lbl_idx < len(slots): s = slots[lbl_idx] _fill(s["element"], step.get("label", ""), cx_in=s.get("cx_in", 0), cy_in=s.get("cy_in", 0)) filled.add(lbl_idx) if need_descs: dsc_idx = i * stride + 1 if dsc_idx < len(slots): s = slots[dsc_idx] _fill(s["element"], step.get("description", ""), cx_in=s.get("cx_in", 0), cy_in=s.get("cy_in", 0)) filled.add(dsc_idx) # Clear all unfilled slots so no residual template text shows through for j, s in enumerate(slots): if j not in filled: _clear_slot(s["element"]) return # ── Sequential filling (stride=1) ──────────────────────────────────────── # Classify into label slots vs description slots, then fill 1:1. label_slots, desc_slots = _classify_slots(slots, need_descs=need_descs) for i, step in enumerate(steps): if "label" in step and i < len(label_slots): s = label_slots[i] _fill(s["element"], step["label"], cx_in=s.get("cx_in", 0), cy_in=s.get("cy_in", 0)) if "description" in step and i < len(desc_slots): s = desc_slots[i] _fill(s["element"], step["description"], cx_in=s.get("cx_in", 0), cy_in=s.get("cy_in", 0)) # Clear any unused label slots so no rogue template text floats on screen n = len(steps) for s in label_slots[n:]: _clear_slot(s["element"]) for s in desc_slots[n:]: _clear_slot(s["element"]) # ═══════════════════════════════════════════════════════════════════════════════ # Slot collection — visual position order, group-transform aware # ═══════════════════════════════════════════════════════════════════════════════ def _classify_slots(slots: list[dict], need_descs: bool = True) -> tuple[list[dict], list[dict]]: """ Given slots sorted by (abs_y, abs_x), split into (label_slots, desc_slots). If need_descs is False (overview / label-only content), every slot is treated as a label slot — this correctly handles zigzag / alternating layouts where left and right boxes are both labels, not label+description pairs. Strategy when need_descs=True: A) Each row has ≥2 slots → 2-column layout: left col = labels, right = descs. B) Single-slot rows (zigzag / hub) → classify by average existing text length: rows with avg < 25 chars = label rows; longer rows = description rows. """ if not slots: return [], [] # When content has no descriptions, every box is a label slot (avoids # misclassifying zigzag layouts as label+description columns). if not need_descs: return slots, [] # ── Group into rows (y within 0.15 in) ─────────────────────────────────── rows: list[list[dict]] = [] for s in slots: placed = False for row in rows: if abs(s["abs_y"] - row[0]["abs_y"]) <= 0.15: row.append(s) placed = True break if not placed: rows.append([s]) for row in rows: row.sort(key=lambda s: s["abs_x"]) # ── Case A: at least one row has 2+ slots → paired label/desc layout ───── if any(len(row) >= 2 for row in rows): label_slots: list[dict] = [] desc_slots: list[dict] = [] for row in rows: label_slots.append(row[0]) desc_slots.extend(row[1:]) return label_slots, desc_slots # ── Case B: all rows single-slot — classify by average existing text len ── label_slots = [] desc_slots = [] for row in rows: avg_len = sum(len(s["text"]) for s in row) / len(row) if avg_len < 25: label_slots.extend(row) else: desc_slots.extend(row) return label_slots, desc_slots def _collect_slots(slide, txbox_only: bool = False) -> list[dict]: """ Return all editable text shapes on the slide, sorted by absolute visual position (top→bottom, left→right), with step-number decorations removed. txbox_only=True — only collect shapes with cNvSpPr txBox="1" (explicit text boxes). Use this for the Infographic_Bank_Final_Clean bank where structural background rectangles also carry txBody but must not be filled with content. txbox_only=False — legacy behaviour: collect every shape that has a txBody (works for the old bank which had placeholder text). Each entry: {"element": txBody_element, "text": str, "abs_x": float, "abs_y": float, "cx_in": float, "cy_in": float} """ raw = [] _walk(slide._element, raw, offset_x=0, offset_y=0, scale_x=1.0, scale_y=1.0, txbox_only=txbox_only) # Filter out step-number decorations raw = [r for r in raw if not _STEP_NUMBER_RE.match(r["text"].strip())] # Sort by visual position (round y to 1 dp so same-row slots stay together) raw.sort(key=lambda r: (round(r["abs_y"], 1), round(r["abs_x"], 2))) return raw EMU = 914400.0 # EMUs per inch def _walk(el, results, offset_x, offset_y, scale_x, scale_y, txbox_only: bool = False): """Recursively walk the shape tree, resolving group transforms.""" tag = el.tag.split("}")[-1] if "}" in el.tag else el.tag if tag == "grpSp": # Resolve this group's coordinate transform nx, ny, nsx, nsy = _group_transform(el, offset_x, offset_y, scale_x, scale_y) for child in el: _walk(child, results, nx, ny, nsx, nsy, txbox_only) elif tag in ("sp", "pic"): # txbox_only filter: skip shapes that are not explicit text boxes if txbox_only: cNvSpPr = el.find(qn("p:nvSpPr") + "/" + qn("p:cNvSpPr")) if cNvSpPr is None or cNvSpPr.get("txBox") != "1": return # Get shape position in current coordinate space x, y = _shape_pos(el) abs_x = offset_x + x * scale_x abs_y = offset_y + y * scale_y # Collect ALL text boxes — including empty ones (e.g. blank label boxes, # title placeholders with no placeholder text). Empty boxes are valid # target slots; skipping them leaves infographic boxes unfilled. txBody = el.find(".//" + qn("p:txBody")) if txBody is None: txBody = el.find(".//" + qn("a:txBody")) if txBody is not None: text = "".join( t.text or "" for t in txBody.findall(".//" + qn("a:t")) ).strip() # Store the shape width (cx) so _set_text can size fonts correctly spPr = el.find(qn("p:spPr")) if spPr is None: spPr = el.find(".//" + qn("p:spPr")) cx_emu = cy_emu = 0 if spPr is not None: xfrm = spPr.find(qn("a:xfrm")) if xfrm is not None: ext = xfrm.find(qn("a:ext")) if ext is not None: cx_emu = int(ext.get("cx", 0)) cy_emu = int(ext.get("cy", 0)) results.append({ "element": txBody, "text": text, "abs_x": abs_x / EMU, "abs_y": abs_y / EMU, "cx_in": cx_emu / EMU, "cy_in": cy_emu / EMU, }) else: # spTree, cSld, etc. — just recurse for child in el: _walk(child, results, offset_x, offset_y, scale_x, scale_y, txbox_only) def _group_transform(grpSp_el, parent_x, parent_y, parent_sx, parent_sy): """ Compute absolute offset + scale for children of a grpSp element. Returns (new_offset_x, new_offset_y, new_scale_x, new_scale_y). """ grpSpPr = grpSp_el.find(qn("p:grpSpPr")) if grpSpPr is None: return parent_x, parent_y, parent_sx, parent_sy xfrm = grpSpPr.find(qn("a:xfrm")) if xfrm is None: return parent_x, parent_y, parent_sx, parent_sy off = xfrm.find(qn("a:off")) ext = xfrm.find(qn("a:ext")) chOff = xfrm.find(qn("a:chOff")) chExt = xfrm.find(qn("a:chExt")) if None in (off, ext, chOff, chExt): return parent_x, parent_y, parent_sx, parent_sy gx = int(off.get("x", 0)) gy = int(off.get("y", 0)) gcx = int(ext.get("cx", 1)) gcy = int(ext.get("cy", 1)) cox = int(chOff.get("x", 0)) coy = int(chOff.get("y", 0)) ccx = int(chExt.get("cx", 1)) ccy = int(chExt.get("cy", 1)) sx = gcx / ccx if ccx else 1.0 sy = gcy / ccy if ccy else 1.0 new_offset_x = parent_x + (gx - 0) * parent_sx - cox * sx * parent_sx new_offset_y = parent_y + (gy - 0) * parent_sy - coy * sy * parent_sy new_scale_x = parent_sx * sx new_scale_y = parent_sy * sy return new_offset_x, new_offset_y, new_scale_x, new_scale_y def _shape_pos(sp_el): """Return (x, y) EMU position of a sp or pic element.""" spPr = sp_el.find(qn("p:spPr")) if spPr is None: spPr = sp_el.find(".//" + qn("p:spPr")) if spPr is None: return 0, 0 xfrm = spPr.find(qn("a:xfrm")) if xfrm is None: return 0, 0 off = xfrm.find(qn("a:off")) if off is None: return 0, 0 return int(off.get("x", 0)), int(off.get("y", 0)) # ═══════════════════════════════════════════════════════════════════════════════ # Text replacement — preserves ALL formatting # ═══════════════════════════════════════════════════════════════════════════════ def _fit_font_pt( cx_in: float, cy_in: float, text: str, orig_pt: float, default_pt: float = 18.0, line_spacing: float = 1.2, n_items: int = 1, spc_aft_pt: float = 0.0, min_pt: int = 10, ) -> float: """ Return the largest font size (pts) at which *text* fits inside a box of cx_in x cy_in inches without overflowing, down to min_pt (default 10 pt). Uses a proportional font model: char_width ~ font_pt / 144 inches (avg for Calibri/Arial) line_height ~ font_pt / 72 * line_spacing inches n_items: number of separate paragraphs/bullets — each occupies at least one line; minimum height = n_items × line_height. spc_aft_pt: extra vertical spacing added after each item paragraph (pts). Pass the paragraph spcAft value so the formula stays accurate. min_pt: hard floor — never return below this value. Tries the original font first; reduces by 1 pt until fit or min_pt floor. """ import math floor = max(int(min_pt), 1) start_pt = orig_pt if orig_pt >= floor else default_pt for pt in range(int(start_pt), floor - 1, -1): chars_per_line = cx_in * 144 / pt lines_needed = max( math.ceil(len(text) / max(chars_per_line, 1)), n_items, # each item occupies at least one line ) text_height = lines_needed * (pt / 72) * (line_spacing + 0.1) # +0.1 safety spacing_height = n_items * (spc_aft_pt / 72) height_needed = text_height + spacing_height if height_needed <= cy_in: return float(pt) return float(floor) def _swap_text(txBody_el, new_text: str, cx_in: float = 0, cy_in: float = 0): """ Replace ONLY the text content — does NOT touch bodyPr, autofit, or font size. Use this for bank infographic slides where the original visual design must be preserved exactly (position, size, colour, font are all left untouched). cx_in / cy_in are accepted but ignored (keeps the same call signature as _set_text). """ if not new_text: return import copy as _copy from lxml import etree _NS_A = "http://schemas.openxmlformats.org/drawingml/2006/main" paragraphs = txBody_el.findall(qn("a:p")) if not paragraphs: p_el = etree.Element(f"{{{_NS_A}}}p") txBody_el.append(p_el) paragraphs = [p_el] first_para = paragraphs[0] runs = first_para.findall(qn("a:r")) if runs: # Existing run — just swap the text string, touch nothing else t_el = runs[0].find(qn("a:t")) if t_el is not None: t_el.text = new_text for extra_run in runs[1:]: first_para.remove(extra_run) else: # No runs — create one, copying endParaRPr for font/colour/size so the # bank slide's designed styling (e.g. white text on coloured box) is kept. r_el = etree.Element(f"{{{_NS_A}}}r") t_el = etree.SubElement(r_el, f"{{{_NS_A}}}t") t_el.text = new_text endParaRPr = first_para.find(qn("a:endParaRPr")) if endParaRPr is not None: rPr = _copy.deepcopy(endParaRPr) rPr.tag = f"{{{_NS_A}}}rPr" r_el.insert(0, rPr) endParaRPr.addprevious(r_el) else: first_para.append(r_el) for extra_para in paragraphs[1:]: txBody_el.remove(extra_para) def _set_text(txBody_el, new_text: str, cx_in: float = 0, cy_in: float = 0, font_pt: float | None = None, min_font_pt: int = 10, font_family: str | None = None): """ Replace the text content of a txBody element with new_text. Preserves all formatting (rPr attributes: font, size, color, bold, etc.). Real-time font fitting (like a human designer): - Reads the original font size from the run's rPr (or estimates 12 pt). - Computes the largest font where the text fits cleanly inside the box. - Sets that size explicitly so PowerPoint never needs to overflow or wrap beyond the box boundaries. - Removes noAutofit / spAutoFit so the box stays at its designed size. If `font_pt` is given, auto-fit is bypassed and that exact size is applied. `min_font_pt` enforces a floor on the auto-fitter (raise to e.g. 32 for titles so they don't shrink below a readable size). `font_family` if given, stamps an explicit child onto the rPr so the run renders in the brand font instead of inheriting from the layout/master (useful when cloned slides come from mixed sources). """ if not new_text: return from lxml import etree _NS_A = "http://schemas.openxmlformats.org/drawingml/2006/main" # ── Lock the box: remove all autofit flags, add noAutofit ──────────────── # We are setting the font size ourselves, so let the box stay fixed. bodyPr = txBody_el.find(qn("a:bodyPr")) if bodyPr is not None: for tag in ("a:noAutofit", "a:spAutoFit", "a:normAutofit"): el = bodyPr.find(qn(tag)) if el is not None: bodyPr.remove(el) bodyPr.append(etree.Element(f"{{{_NS_A}}}noAutofit")) paragraphs = txBody_el.findall(qn("a:p")) if not paragraphs: # txBody has no paragraph at all — create one so we have somewhere to write p_el = etree.Element(f"{{{_NS_A}}}p") txBody_el.append(p_el) paragraphs = [p_el] first_para = paragraphs[0] runs = first_para.findall(qn("a:r")) if runs: first_run = runs[0] t_el = first_run.find(qn("a:t")) if t_el is not None: t_el.text = new_text for extra_run in runs[1:]: first_para.remove(extra_run) # ── Compute and set the fitting font size ───────────────────────────── if font_pt is not None: rPr = first_run.find(qn("a:rPr")) if rPr is not None: rPr.set("sz", str(int(font_pt * 100))) elif cx_in > 0 and cy_in > 0: rPr = first_run.find(qn("a:rPr")) if rPr is not None: orig_sz = rPr.get("sz") orig_pt = int(orig_sz) / 100.0 if orig_sz else 0.0 fit_pt = _fit_font_pt(cx_in, cy_in, new_text, orig_pt, default_pt=18.0, min_pt=min_font_pt) rPr.set("sz", str(int(fit_pt * 100))) # ── Optional explicit typeface stamp ────────────────────────────────── if font_family: rPr = first_run.find(qn("a:rPr")) if rPr is not None: # Replace any existing child rather than stack new ones. for old in rPr.findall(qn("a:latin")): rPr.remove(old) latin = etree.SubElement(rPr, f"{{{_NS_A}}}latin") latin.set("typeface", font_family) else: import copy as _copy r_el = _make_run(new_text) endParaRPr = first_para.find(qn("a:endParaRPr")) if endParaRPr is not None: # Copy formatting from endParaRPr (the designer's intended font/size/color) # and attach it to the new run as rPr = _copy.deepcopy(endParaRPr) rPr.tag = f"{{{_NS_A}}}rPr" if font_pt is not None: rPr.set("sz", str(int(font_pt * 100))) elif cx_in > 0 and cy_in > 0: orig_sz = rPr.get("sz") orig_pt = int(orig_sz) / 100.0 if orig_sz else 0.0 fit_pt = _fit_font_pt(cx_in, cy_in, new_text, orig_pt, default_pt=18.0, min_pt=min_font_pt) rPr.set("sz", str(int(fit_pt * 100))) if font_family: for old in rPr.findall(qn("a:latin")): rPr.remove(old) latin = etree.SubElement(rPr, f"{{{_NS_A}}}latin") latin.set("typeface", font_family) r_el.insert(0, rPr) # endParaRPr must always be the last element in endParaRPr.addprevious(r_el) else: first_para.append(r_el) # Remove extra paragraphs for extra_para in paragraphs[1:]: txBody_el.remove(extra_para) def _clear_slot(txBody_el): """Set all text in a txBody to empty string, preserving formatting.""" paragraphs = txBody_el.findall(qn("a:p")) if not paragraphs: return first_para = paragraphs[0] runs = first_para.findall(qn("a:r")) if runs: t_el = runs[0].find(qn("a:t")) if t_el is not None: t_el.text = "" for extra_run in runs[1:]: first_para.remove(extra_run) for extra_para in paragraphs[1:]: txBody_el.remove(extra_para) def _make_run(text: str): """Create a bare text element.""" from lxml import etree NS = "http://schemas.openxmlformats.org/drawingml/2006/main" r = etree.SubElement(etree.Element("dummy"), f"{{{NS}}}r") t = etree.SubElement(r, f"{{{NS}}}t") t.text = text return r