""" engine/slide_builder.py ------------------------ Assembles a full PPTX from a typed slide plan (output of content_planner.py). ARCHITECTURE (based on gold standard human course analysis): ───────────────────────────────────────────────────────────────────────────── The template content slide is ALWAYS the base for every content slide. Bank infographic slides are NOT cloned wholesale — their GROUP SHAPES are extracted and placed in the lower zone of the template content slide. This ensures the template's chrome (watermarks, corner accents, accent bands) is present on every slide, making the deck look cohesive. Per-type build strategy: cover, about_us, syllabus, module_intro, ending → clone from template, replace text content text_only (paragraphs) → template content slide + full-height prose body text_only (points) → template content slide + inline bold:regular paragraphs text_only (bullets) → template content slide + checkmark bullet body groups → template content slide + short intro body + bank GROUP SHAPES extracted into lower zone + color-harmonised to template brand color overview_groups → same as groups (labels only, no descriptions) drill_down → template content slide + navy step bar + bullet body Template slide index mapping (0-based): 0 = Cover 1 = About Us 2 = Syllabus 3 = Module Intro 4 = Content (the base for all content slides) 5 = Ending """ from __future__ import annotations import copy import json import math import re from pathlib import Path from pptx.oxml.ns import qn from engine.slide_cloner import SlideCloner from engine.text_replacer import _collect_slots, _set_text from engine.slide_selector import select_slide from engine.template_analyzer import ( analyze_template, apply_chrome, body_cx_for_text_only, title_cx_for_layout, ) _EMU = 914_400 _NS_P = "http://schemas.openxmlformats.org/presentationml/2006/main" _NS_A = "http://schemas.openxmlformats.org/drawingml/2006/main" _LOGO_TOKENS = {"theknowledgeacademy", "the knowledge academy"} _IMG_EXTS = {".gif", ".png", ".jpg", ".jpeg"} # Default template slide indices (overridden per-build by dna["slide_indices"]) _TMPL_COVER = 0 _TMPL_ABOUT_US = 1 _TMPL_SYLLABUS = 2 _TMPL_MODULE_INTRO = 3 _TMPL_CONTENT = 4 _TMPL_ENDING = 5 # Image noise words stripped when extracting keywords from image filenames _IMG_NOISE = { "pana", "bro", "amico", "cuate", "rafiki", "removebg", "preview", "dazzle", "dizzy", "juicy", "techny", "sammy", "flame", "beam", "pixeltrue", "with", "from", "your", "that", "this", "have", "hand", "woman", "girl", "person", "people", "man", "guy", } _TECH_IMAGE_SIGNALS = { "java", "javascript", "python", "css", "html", "code", "coding", "programming", "backend", "frontend", "vscode", "github", "gitlab", "docker", "kubernetes", "devops", "software", "developer", "development", "api", "modeling", "database", "cloud", "debugging", "icons8", "techny", "control", "panel", "operating", "system", "prototyping", "scrum", "segmentation", "interaction", "testing", } _SOCIAL_MARKETING_IMAGE_SIGNALS = { "social", "marketing", "ecommerce", "seo", "influencer", "targeting", "advertising", "online ads", "online shop", "online store", "qr code", "qr", "megaphone", "dashboard", "site stats", "website", "web designer", "landing page", "mobile marketing", "content marketing", "digital marketing", "clothing store", "blogging", "subscriber", "online clothing", } _TECH_COURSE_SIGNALS = { "software", "programming", "coding", "development", "web", "python", "java", "javascript", "html", "css", "data", "cloud", "devops", "technology", "it ", " it ", "cyber", "database", "network", "machine learning", "artificial intelligence", "ai ", " ai", } _MARKETING_COURSE_SIGNALS = { "marketing", "advertising", "digital marketing", "social media", "seo", "content marketing", "ecommerce", "e-commerce", "brand", "campaign", "promotion", "sales funnel", "copywriting", } # Soft-skill / communication / leadership courses don't share vocabulary with # our image-bank filenames (which lean toward "presentation", "lesson", # "meeting", "business"). Without this, the picker has no good fallback and # falls through to alphabetical-unused — which puts sci-fi/robot images on a # public-speaking course. When the course title hits one of these signals, # we INJECT the matching image keywords into course_words so the existing # fb_score logic surfaces sensible visuals. _SOFT_SKILL_COURSE_SIGNALS = { "public speaking", "presentation", "communication", "communicating", "leadership", "negotiation", "negotiating", "conflict", "feedback", "coaching", "mentoring", "facilitation", "facilitating", "interview", "interviewing", "delegation", "delegating", "stakeholder", "influence", "influencing", "persuasion", "team", "teamwork", "collaboration", "collaborating", "emotional intelligence", "assertiveness", "active listening", "storytelling", "speaking", "speech", "presenting", "trainer", "training", "workshop", "facilitator", } # Image keywords to prefer when the course is detected as soft-skill. # These match real filenames in images/: Presentation*, Lesson*, Business*, # Documents*, Animation*, Business support*, Business mission*, Business Plan* _SOFT_SKILL_PREFERRED_IMG_KEYWORDS = { "presentation", "lesson", "business", "support", "mission", "plan", "animation", "documents", "instant", "information", "blogging", } # ═══════════════════════════════════════════════════════════════════════════════ # Public entry point # ═══════════════════════════════════════════════════════════════════════════════ def build_presentation( slide_plan: list[dict], output_path: str, template_path: str, bank_dir: str, catalogue_path: str, gold_standard_path: str = "", images_dir: str = "", api_key: str = "", ) -> str: """ Build a PPTX from *slide_plan* and save to *output_path*. api_key is passed to analyze_template for one-shot LLM zone validation. """ bank_dir = Path(bank_dir) tmpl_path = str(template_path) if not images_dir: images_dir = str(Path(__file__).parent.parent.parent / "images") with open(catalogue_path, encoding="utf-8") as f: catalogue = json.load(f) # Drill-down card catalog (gold-standard card shapes). Optional — falls back # to the procedural icon-banner builder when missing. drill_dir = Path(__file__).parent.parent.parent / "drill_down_bank" drill_cat_path = drill_dir / "drill_down_catalogue.json" drill_catalogue: list[dict] = [] if drill_cat_path.exists(): try: with open(drill_cat_path, encoding="utf-8") as f: drill_catalogue = json.load(f) except Exception as exc: print(f" [warn] drill catalog load failed: {exc!r}") # Template DNA — parsed once, used everywhere. # api_key enables one-shot LLM validation (reads/writes manifest file). dna = analyze_template(tmpl_path, api_key=api_key) cloner = SlideCloner(str(output_path), template_path=tmpl_path) used_slides: dict[tuple, int] = {} used_families: list[str] = [] used_files: list[str] = [] used_card_families: list[str] = [] # drill_down card family rotation # Detect course type from cover title course_title = "" for s in slide_plan: if s.get("type") == "cover": course_title = s.get("title", "") break is_tech = _is_tech_course(course_title) is_marketing = _is_marketing_course(course_title) is_soft_skill = _is_soft_skill_course(course_title) course_words: set[str] = { w.lower() for w in re.sub(r"[^a-zA-Z ]", " ", course_title).split() if len(w) > 3 and w.lower() not in _IMG_NOISE } # Soft-skill courses share no vocabulary with image-bank filenames — inject # preferred image keywords so the picker has sensible fallback options # instead of grabbing the next alphabetical-unused (robot/sci-fi) image. if is_soft_skill: course_words = course_words | _SOFT_SKILL_PREFERRED_IMG_KEYWORDS print(f" Course: {course_title!r} tech={is_tech} marketing={is_marketing} soft_skill={is_soft_skill}") # Illustration pool illus_images = _load_illustration_images( images_dir, tech_course=is_tech, marketing_course=is_marketing, course_title=course_title, ) illus_used: set[str] = set() print(f" Illustration images: {len(illus_images)}") # Topic-level family lock for standalone drill_down runs. # When consecutive drill_downs share the same parent topic, they all use # ONE card style. The planner sometimes appends ": " to the # title, so we normalise the key by stripping that suffix before compare. _drill_lock_title = None _drill_lock_entry = None def _topic_key(d: dict) -> str: title = (d.get("title") or "").strip() label = (d.get("step_label") or "").strip() if label and title.lower().endswith(f": {label}".lower()): title = title[: -(len(label) + 2)].rstrip() return title # Per-topic uniform font for drill_down card labels — so every card in # one parent topic renders at the same size, instead of "Informative" at # 12pt next to "Demonstrative" at 10pt (visually jarring within one set). from collections import defaultdict as _defaultdict _drill_groups: dict[str, list[str]] = _defaultdict(list) for _d in slide_plan: if _d.get("type") == "drill_down": _drill_groups[_topic_key(_d)].append( _d.get("step_label") or _d.get("title") or "" ) _drill_label_fonts: dict[str, float] = { k: _drill_label_pt_for_topic(labels) for k, labels in _drill_groups.items() } for slide_dict in slide_plan: stype = slide_dict.get("type", "") # Reset the topic lock whenever the run of drill_downs is broken # (groups slides have their own internal lock so they're exempt). if stype != "drill_down": _drill_lock_title = None _drill_lock_entry = None # Record slide count BEFORE dispatch so we can identify the new slide(s) # this iteration produces — needed to stamp trainer notes onto the # primary (first new) slide regardless of which builder ran. _slides_before = cloner.slide_count() try: if stype == "cover": _build_cover(cloner, tmpl_path, slide_dict, dna) elif stype == "about_us": cloner.clone_slide(tmpl_path, dna["slide_indices"]["about_us"]) elif stype == "syllabus": _build_syllabus(cloner, tmpl_path, slide_dict, dna) elif stype == "module_intro": _build_module_intro(cloner, tmpl_path, slide_dict, dna, illus_images, illus_used, course_words) elif stype == "text_only": _build_text_only(cloner, tmpl_path, slide_dict, dna, illus_images, illus_used, course_words) elif stype in ("groups", "infographic", "text_infographic"): _build_groups_slide(cloner, tmpl_path, bank_dir, catalogue, slide_dict, used_slides, used_families, is_tech, used_files, dna, drill_dir, drill_catalogue, used_card_families, illus_images=illus_images, illus_used=illus_used, course_words=course_words) elif stype in ("overview_groups", "infographic_overview"): _build_overview_groups(cloner, tmpl_path, bank_dir, catalogue, slide_dict, used_slides, used_families, is_tech, used_files, dna) elif stype == "drill_down": # Lock the card family for consecutive drill_downs that share # the same parent topic. Picks a fresh family when the topic # changes so each topic has its own visual treatment. _this_key = _topic_key(slide_dict) if _this_key != _drill_lock_title: _drill_lock_title = _this_key _drill_lock_entry = None if drill_catalogue: _drill_lock_entry = _pick_card_family(drill_catalogue, used_card_families) if _drill_lock_entry is not None: used_card_families.append(_drill_lock_entry.get("name", "")) _build_drill_down_card( cloner, tmpl_path, drill_dir, drill_catalogue, slide_dict, dna, used_card_families, card_entry=_drill_lock_entry, illus_images=illus_images, illus_used=illus_used, course_words=course_words, label_font_pt=_drill_label_fonts.get(_topic_key(slide_dict)), ) elif stype == "quiz": # 7-slide contract: real MCQ slide. Falls back to the # generic marker if the template has no dedicated quiz slot. if "quiz" in dna.get("slide_indices", {}): _build_quiz(cloner, tmpl_path, slide_dict, dna) else: _build_assessment_marker(cloner, tmpl_path, slide_dict, dna) elif stype == "qa": _build_assessment_marker(cloner, tmpl_path, slide_dict, dna) elif stype == "scenario_qa": _build_scenario_qa(cloner, tmpl_path, slide_dict, dna) elif stype == "scenario": _build_scenario(cloner, tmpl_path, slide_dict, dna) elif stype in ("case_study", "activity"): _build_prose_slide(cloner, tmpl_path, slide_dict, dna) elif stype == "ending": _build_ending(cloner, tmpl_path, slide_dict, dna) else: print(f" [skip] unknown slide type: {stype!r}") except Exception as exc: title = slide_dict.get("title") or slide_dict.get("module_title", stype) print(f" [warn] {stype} failed ({exc!r}), falling back: {title!r}") try: fallback = _make_text_fallback(slide_dict) _build_text_only(cloner, tmpl_path, fallback, dna, illus_images, illus_used, course_words) except Exception as exc2: print(f" [error] fallback also failed ({exc2!r}), skipping") # Stamp trainer notes onto every slide produced this iteration. # # Single-slide builders (text_only, drill_down, quiz, etc): one note # on one slide — the slide_dict's `notes` field. # # Multi-slide builders (overview_groups expands into 1 overview + # N child cards): each child preferentially uses its OWN item-level # `notes` field (LLM-written, course-specific) — falling back to the # deterministic synthesiser only if the LLM omitted it. Parent gets # the top-level slide_dict["notes"]. new_count = cloner.slide_count() - _slides_before if new_count > 0: parent_notes = (slide_dict.get("notes") or "").strip() # Defense in depth: if the plan reached the builder without notes # (e.g. the pad call's safety net didn't run, or a future code path # bypasses it), synthesise notes from the slide content so the # trainer never opens the deck to find blank speaker notes. if not parent_notes: parent_notes = _synthesise_slide_notes(slide_dict) try: # Slide 0 of the new range: parent / primary slide if parent_notes: _set_notes(cloner._prs.slides[_slides_before], parent_notes) # Slides 1..N-1: child cards from items[]. Prefer LLM-written # item.notes; fall back to synthesised template if missing. items = slide_dict.get("items") or [] for offset in range(1, new_count): item_idx = offset - 1 child_notes = "" if item_idx < len(items): item = items[item_idx] item_notes = (item.get("notes") or "").strip() if item_notes: child_notes = item_notes else: child_notes = _synthesise_item_notes(slide_dict, item) else: # No item to anchor against — replicate parent notes # rather than leave the slide blank. child_notes = parent_notes if child_notes: _set_notes(cloner._prs.slides[_slides_before + offset], child_notes) except Exception as exc: print(f" [warn] could not set notes on {stype}: {exc!r}") _sanitize_slides(cloner) cloner.save() return str(output_path) def _sanitize_slides(cloner) -> None: """ Final pass to remove malformed XML that triggers PowerPoint's repair prompt. Fixes: - — empty run with no rPr (PowerPoint flags as invalid) - — self-closing empty t with no rPr """ sp_tag = f"{{{_NS_P}}}sp" r_tag = f"{{{_NS_A}}}r" rPr_tag = f"{{{_NS_A}}}rPr" t_tag = f"{{{_NS_A}}}t" for slide in cloner._prs.slides: for r in list(slide.shapes._spTree.iter(r_tag)): has_rPr = r.find(rPr_tag) is not None t = r.find(t_tag) text = (t.text or "") if t is not None else "" if not has_rPr and not text.strip(): # Remove the empty malformed run entirely parent = r.getparent() if parent is not None: parent.remove(r) # ═══════════════════════════════════════════════════════════════════════════════ # Structural slide builders # ═══════════════════════════════════════════════════════════════════════════════ def _build_cover(cloner: SlideCloner, tmpl_path: str, d: dict, dna: dict): slide = cloner.clone_slide(tmpl_path, dna["slide_indices"]["cover"]) slots = _content_slots(slide) title_slot, _, _ = _find_title_body_slots(slots) if title_slot is None and slots: title_slot = slots[0] if title_slot: cy_eff = min(title_slot.get("cy_in", 1.5), 1.5) _set_text(title_slot["element"], d.get("title", ""), cx_in=title_slot.get("cx_in", 0), cy_in=cy_eff) def _build_syllabus(cloner: SlideCloner, tmpl_path: str, d: dict, dna: dict): slide = cloner.clone_slide(tmpl_path, dna["slide_indices"]["syllabus"]) slots = _content_slots(slide) modules = d.get("modules", []) non_empty = [s for s in slots if s.get("text", "").strip()] module_slots = non_empty[1:] if len(non_empty) > 1 else non_empty if module_slots: tight_cx = min(s.get("cx_in", 10) for s in module_slots) tight_cy = min(s.get("cy_in", 10) for s in module_slots) else: tight_cx = tight_cy = 10 for i, s in enumerate(module_slots): if i < len(modules): _set_text(s["element"], modules[i], cx_in=tight_cx, cy_in=tight_cy) else: _clear_txbody(s["element"]) # Normalise font size across all module slots min_sz = None for i, s in enumerate(module_slots[:len(modules)]): para = s["element"].find(qn("a:p")) if para is not None: run = para.find(qn("a:r")) if run is not None: rPr = run.find(qn("a:rPr")) if rPr is not None and rPr.get("sz"): v = int(rPr.get("sz")) min_sz = v if min_sz is None else min(min_sz, v) if min_sz: for i, s in enumerate(module_slots[:len(modules)]): para = s["element"].find(qn("a:p")) if para is not None: run = para.find(qn("a:r")) if run is not None: rPr = run.find(qn("a:rPr")) if rPr is not None: rPr.set("sz", str(min_sz)) def _fill_combined_module_box(txBody_el, module_num, module_title: str, topics: list) -> None: """ Fill a single combined text box that holds both the module title and topics. Template structure (2023 template): para[0] — bold large font → "Module N: Title" para[1] — blank spacer → preserved as-is para[2+] — bullet items → one topic per paragraph (pPr carries Wingdings bullet) Preserves both pPr (bullet/indent/spacing) and rPr (font size, color, bold). """ import copy from lxml import etree _NS_A = "http://schemas.openxmlformats.org/drawingml/2006/main" paras = txBody_el.findall(f"{{{_NS_A}}}p") if not paras: return def _get_formatting(para): pPr = para.find(f"{{{_NS_A}}}pPr") runs = para.findall(f"{{{_NS_A}}}r") rPr = runs[0].find(f"{{{_NS_A}}}rPr") if runs else None return copy.deepcopy(pPr), copy.deepcopy(rPr) def _set_para_text(para, text, pPr_template, rPr_template): for child in list(para): para.remove(child) if pPr_template is not None: para.append(copy.deepcopy(pPr_template)) if not text: return r = etree.SubElement(para, f"{{{_NS_A}}}r") if rPr_template is not None: r.append(copy.deepcopy(rPr_template)) t = etree.SubElement(r, f"{{{_NS_A}}}t") t.text = text # Para[0]: title — preserve pPr + rPr (bold + large size, no bullet) title_pPr, title_rPr = _get_formatting(paras[0]) _set_para_text(paras[0], f"Module {module_num}: {module_title}", title_pPr, title_rPr) # Para[1]: blank spacer — leave completely untouched # Para[2+]: bullet topics — use pPr+rPr from para[2] as template (carries Wingdings bullet) topic_pPr, topic_rPr = _get_formatting(paras[2]) if len(paras) > 2 else (None, None) # Force topic font to 16pt — overrides whatever the template had. if topic_rPr is not None: topic_rPr.set("sz", "1600") # Remove existing topic paragraphs (para[2] onwards) for p in paras[2:]: txBody_el.remove(p) # Add one paragraph per topic, each carrying the bullet pPr for topic in topics: new_p = etree.SubElement(txBody_el, f"{{{_NS_A}}}p") _set_para_text(new_p, topic, topic_pPr, topic_rPr) def _build_module_intro( cloner: SlideCloner, tmpl_path: str, d: dict, dna: dict, illus_images: list, illus_used: set, course_words: set, ): """ Module intro: clone template slide 3 and replace placeholder text in-place. Template slide 3 (identical to the gold standard) has: • Picture 1 — full-bleed dark background photo (DO NOT REMOVE) • Group 17 — contains TextBox 19 ("Module N:") + TextBox 18 (title) white Poppins SemiBold text on the LEFT half (x < 6in) • TextBox 22/23/24 — topic bullet points on the RIGHT half (x ≥ 7in), white Poppins Light with Wingdings checkmark bullets • TextBox 27 — "theknowledgeacademy" wordmark (DO NOT TOUCH) We do NOT call _remove_decorative_groups (would delete Group 17 = module title) and do NOT remove pictures (Picture 1 is the background). We simply replace the text content in each slot in-place, preserving all white-text formatting. """ from engine.text_replacer import _collect_slots as _raw_slots slide = cloner.clone_slide(tmpl_path, dna["slide_indices"]["module_intro"]) # Collect ALL slots including those inside Group 17 — do NOT filter by logo # here because we need positions first; filter logo after. all_slots = _raw_slots(slide) slots = [s for s in all_slots if s.get("text", "").strip() # only slots that had template text and not any(tok in s["text"].lower() for tok in _LOGO_TOKENS)] if not slots: return # Split by x position: left = module number + title; right = topic bullets. # Threshold is relative to slide width so it works for templates where the # bullet column starts at x ~ 6 in (Template new) or x ~ 7 in (2023 template). _split_x = dna["slide_w_in"] * 0.42 left = sorted([s for s in slots if s["abs_x"] < _split_x], key=lambda s: s["abs_y"]) right = sorted([s for s in slots if s["abs_x"] >= _split_x], key=lambda s: s["abs_y"]) module_num = d.get("module_num", "") module_title = d.get("module_title", "") topics = d.get("topics", []) # ── Combined-box template (e.g. 2023 template) ──────────────────────────── # Detected by: the primary left slot is tall (cy_in > 2.5in), meaning it # holds ALL content — title on para[0] (large bold) + topic list on para[2+]. # The right half contains illustration shapes, not text slots. if left and left[0].get("cy_in", 0) > 2.5: _fill_combined_module_box( left[0]["element"], module_num, module_title, topics, ) return # ── Separate-box template (original): left[0]=label, left[1]=title ──────── # Two left slots mean: topmost = "Module N:" header, second = module title. # BUT if the second left slot is far below the top one (>0.5in gap), it is # actually a separate body topic slot, not a "title continuation". In that # case combine module_num + title into the top slot and treat the second # as a topic slot for the topics column. if len(left) >= 2: gap = left[1].get("abs_y", 0) - (left[0].get("abs_y", 0) + left[0].get("cy_in", 0)) if gap < 0.5: # Vertically adjacent — split header/title across the two slots _set_text(left[0]["element"], f"Module {module_num}:", cx_in=left[0].get("cx_in", 0), cy_in=left[0].get("cy_in", 0)) _set_text(left[1]["element"], module_title, cx_in=left[1].get("cx_in", 0), cy_in=left[1].get("cy_in", 0)) else: # Far apart — second left slot is a topic. Combine title and use # the remaining left slots as additional topic slots. _set_text(left[0]["element"], f"Module {module_num}: {module_title}", cx_in=left[0].get("cx_in", 0), cy_in=left[0].get("cy_in", 0)) # Feed leftover left slots into the right-column topic pool right = list(left[1:]) + list(right) elif left: _set_text(left[0]["element"], f"Module {module_num}: {module_title}", cx_in=left[0].get("cx_in", 0), cy_in=left[0].get("cy_in", 0)) # Topic placement: # • If we have AT LEAST as many right-column slots as topics → one per slot # (this is the original/2023-template module-intro design with 3-4 slots). # • If we have FEWER slots than topics (e.g. Template 2 has only one # topic slot in its module-intro design) → cram ALL topics into the # first slot as a bulleted list so none are lost. if right and len(right) < len(topics): # Render all topics as bullets inside the single available slot. # Lock the font at 16pt — auto-fit was shrinking topic text too small. first_slot = right[0] _set_paragraphs( first_slot["element"], list(topics), cx_in=first_slot.get("cx_in", 6.0), cy_in=first_slot.get("cy_in", 4.0), bullet=True, font_pt=16.0, ) # Clear any extra slots (defensive — should only ever be 0 here) for s in right[1:]: _clear_txbody(s["element"]) else: # One topic per slot, clear extras. Lock all topics at 16pt so bullets # render at the same size regardless of how long each topic string is. for i, s in enumerate(right): if i < len(topics): _set_text(s["element"], topics[i], cx_in=s.get("cx_in", 0), cy_in=s.get("cy_in", 0), font_pt=16.0) else: _clear_txbody(s["element"]) # ═══════════════════════════════════════════════════════════════════════════════ # Content slide builders # ═══════════════════════════════════════════════════════════════════════════════ def _build_text_only( cloner: SlideCloner, tmpl_path: str, d: dict, dna: dict, illus_images: list = None, illus_used: set = None, course_words: set = None, ): """ Build a text_only slide. Supports three body formats: paragraphs — flowing prose points — inline bold label + regular explanation paragraph pairs bullets — checkmark full-sentence bullets (DEFAULT) """ from pptx.util import Inches slide = cloner.clone_slide(tmpl_path, dna["slide_indices"]["content"]) _remove_decorative_groups(slide) slots = _content_slots(slide) title = d.get("title", "") paras = d.get("paragraphs", []) points = d.get("points", []) bullets = d.get("bullets", []) lead_in = d.get("lead_in", "") section_heading = d.get("section_heading", "") title_slot, body_slot, other_slots = _find_title_body_slots(slots) if title_slot is None and slots: title_slot = slots[0] if body_slot is None and len(slots) > 1: body_slot = slots[1] for s in other_slots: _clear_txbody(s["element"]) # Decide whether to show illustration (affects title AND body width) has_image = bool(illus_images) and not points # points use full width body_cx = body_cx_for_text_only(dna, has_image=has_image) # Title — widen the slot when there's no right-side image so the title can # render at 32-36pt single-line instead of shrinking into the narrow default. title_cx_target = title_cx_for_layout(dna, has_right_content=has_image) if title_slot: title_cy_in = title_slot.get("cy_in", dna.get("title", {}).get("cy", 0.71)) if title_cx_target > title_slot.get("cx_in", 0) + 0.05: _resize_shape( title_slot["element"], title_cx_target, title_cy_in, new_x_in=title_slot.get("abs_x", dna.get("title", {}).get("x", 0.66)), new_y_in=title_slot.get("abs_y", dna.get("title", {}).get("y", 0.58)), ) _set_text(title_slot["element"], title, cx_in=title_cx_target, cy_in=title_cy_in, min_font_pt=32, font_family=dna.get("brand_font") or None) body_x = dna["body"]["x"] body_y = dna["body"]["y"] # Use the full content_zone height so body has room to render at a # comfortable font size. Some templates have a tiny body placeholder cy # (e.g. Template 2's body is only 0.4in tall) which would force normAutofit # to shrink text to unreadable sizes. content_zone.cy spans from below # the title down to the chrome floor — plenty of room. cz = dna.get("content_zone") if cz and cz.get("cy", 0) > 0: body_cy = float(cz["cy"]) - max(0.0, body_y - float(cz.get("y", body_y))) body_cy = max(body_cy, 3.0) # never less than 3in else: body_cy = max(dna["body"]["cy"], 3.0) # Push body down to clear the title, accounting for text wrapping. # cy_in is the XML box height, NOT the rendered height when text wraps — # so we estimate rendered height from title length and box width instead. if title_slot: CHAR_WIDTH_IN = 0.23 # avg char width at 36pt bold (calibrated) LINE_HEIGHT_IN = (36.0 / 72.0) * 1.4 # ~0.7in per line incl. spacing title_cx_in = title_cx_target # use the widened width if applied chars_per_line = max(15, int(title_cx_in / CHAR_WIDTH_IN)) n_lines = max(1, math.ceil(len(title) / chars_per_line)) effective_h = n_lines * LINE_HEIGHT_IN title_y = title_slot.get("abs_y", 0.58) min_body_y = title_y + effective_h + 0.15 if body_y < min_body_y: adjustment = min_body_y - body_y body_y = min_body_y body_cy = max(body_cy - adjustment, 2.5) if body_slot: _resize_shape(body_slot["element"], body_cx, body_cy, new_x_in=body_x, new_y_in=body_y) if points: # POINTS format: inline bold label + regular explanation _write_points_body(body_slot["element"], points, cx_in=body_cx, cy_in=body_cy, heading=section_heading, lead_in=lead_in) elif paras: _set_paragraphs(body_slot["element"], paras, cx_in=body_cx, cy_in=body_cy, bullet=False, heading=section_heading, lead_in=lead_in) elif bullets: _set_paragraphs(body_slot["element"], bullets, cx_in=body_cx, cy_in=body_cy, bullet=True, heading=section_heading, lead_in=lead_in) else: # Nothing to write — clear any template placeholder text so the box # doesn't appear as a large empty styled rectangle on the slide. _clear_txbody(body_slot["element"]) # Right-side illustration (not for points slides — they need full width) if illus_images and not points: illus = dna["illus"] # Pass slide body content as additional context for keyword matching — # so the picker can match against the actual concepts on the slide, # not just the short title. body_context = " ".join( (paras or []) + (bullets or []) + [lead_in or ""] ) img_path = _pick_illustration_image( illus_images, title, illus_used or set(), course_words=course_words, context_text=body_context, ) if img_path: if illus_used is not None: illus_used.add(img_path) try: from pptx.util import Inches as _In max_h = _In(illus["cy"]) pic = slide.shapes.add_picture( img_path, _In(illus["x"]), _In(illus["y"]), width=_In(illus["cx"]) ) if pic.height > max_h: pic._element.getparent().remove(pic._element) slide.shapes.add_picture( img_path, _In(illus["x"]), _In(illus["y"]), height=max_h ) except Exception as exc: print(f" [warn] illustration failed: {exc!r}") def _build_groups_slide( cloner: SlideCloner, tmpl_path: str, bank_dir: Path, catalogue: list, d: dict, used_slides: dict, used_families: list, is_tech: bool, used_files: list, dna: dict, drill_dir: Path = None, drill_catalogue: list = None, used_card_families: list = None, illus_images: list = None, illus_used: set = None, course_words: set = None, ): """ V3: Clone bank slide as full base, overlay template chrome, fill text in-place. This avoids the fragility of extracting/scaling group shapes into a lower zone. The bank slide keeps its designed layout; we only replace its title zone and add back the template chrome (watermark, corner accent, left accent strip). """ items = d.get("items") or d.get("steps") or [] title_lower = d.get("title", "").strip().lower() items = [it for it in items if it.get("label", "").strip().lower() != title_lower] if not items: fallback = _make_text_fallback(d) _build_text_only(cloner, tmpl_path, fallback, dna) return # Select bank slide with spatial layout check only # (Capacity check removed — it was too strict and rejected valid slides) from engine.slide_layout_checker import check_layout reqs = { "item_count": len(items), "max_label_len": max((len(it.get("label", "")) for it in items), default=0), "max_desc_len": max((len(it.get("description", "")) for it in items), default=0), "visual_type": d.get("visual_type", ""), } title_words = [w.lower() for w in d.get("title", "").split() if len(w) > 3] excluded_keys: set = set() entry = None for _ in range(15): tmp = select_slide(catalogue, reqs, used_counts=used_slides, topic_keywords=title_words, is_tech_course=is_tech, used_families=used_families, used_files=used_files, excluded_keys=excluded_keys) if tmp is None: break key = (tmp.get("file"), tmp.get("slide_index")) bank_path_tmp = str(bank_dir / tmp["file"]) # Spatial layout check — reject slides where text boxes are # disconnected from their visual elements usable, reason = check_layout(bank_path_tmp, tmp["slide_index"], len(items)) if not usable: print(f" [layout] rejected {tmp['file']}:{tmp['slide_index']} — {reason}") excluded_keys.add(key) continue entry = tmp break if entry is None: print(f" [warn] No bank slide for groups: {d.get('title')!r} — falling back") fallback = _make_text_fallback(d) _build_text_only(cloner, tmpl_path, fallback, dna) return bank_path = str(bank_dir / entry["file"]) # V3 core: clone bank slide as the full base slide slide = cloner.clone_slide(bank_path, entry["slide_index"]) # Robustness check: if the clone produced an empty slide (e.g. the source # bank slide was SmartArt-based and every shape got skipped due to # unresolved diagram rIds), abandon the clone and fall back to text_only. if not _has_visible_body_content(slide): print(f" [warn] bank slide {entry['file']}:{entry['slide_index']} cloned empty (SmartArt?) — falling back") # Remove the empty cloned slide so the fallback doesn't add a second _remove_last_slide(cloner) fallback = _make_text_fallback(d) _build_text_only(cloner, tmpl_path, fallback, dna) return # Strip bank's own title zone and watermark — we replace with template chrome _strip_bank_header(slide, dna) # Fill text BEFORE overlaying chrome so template text boxes aren't mistaken # for content slots by _collect_bank_text_boxes (Pattern B detection). # Always labels-only — bank boxes are not sized for description text. # Writing descriptions into them causes normAutofit to shrink to unreadable # sizes or the text overlaps icons (bank boxes are designed for 3-8 word labels). # Try tag-based fill first (new tagged-infographics bank with "Item N" tags). # Fall back to spatial fill for old bank slides without tags. if not _fill_bank_slide_by_tag(slide, items): _fill_bank_slide_inplace(slide, items, False) # Pre-compute intro bullets — the planner emits intro_points (list of 2 # short bullets). Legacy plans with a string `intro` are normalised in # content_planner._migrate_slide so this read always sees a list. _intro_sentences: list[str] = [ p.strip() for p in (d.get("intro_points") or []) if p and p.strip() ] _intro_box_cy = 0.0 if _intro_sentences: # A long single bullet still needs 2-line height for wrap. n_lines = len(_intro_sentences) if n_lines == 1 and len(_intro_sentences[0]) > 100: n_lines = 2 _intro_box_cy = max(0.55 * n_lines, 0.6) # Overlay template chrome (watermark, corner accents) and title. # Pass intro_height so the repositioning clears the intro text area. _apply_template_chrome(cloner, slide, tmpl_path, d.get("title", ""), dna, intro_height_in=_intro_box_cy) # ── Intro text above the infographic ───────────────────────────────────── if _intro_sentences: from pptx.util import Inches as _In _cz = dna.get("content_zone", dna["body"]) bx = _cz["x"] by = _cz["y"] bcx = body_cx_for_text_only(dna, has_image=False) txbox = slide.shapes.add_textbox(_In(bx), _In(by), _In(bcx), _In(_intro_box_cy)) _set_paragraphs(txbox.text_frame._txBody, _intro_sentences, cx_in=bcx, cy_in=_intro_box_cy, bullet=True) # Track usage key = (entry["file"], entry["slide_index"]) used_slides[key] = used_slides.get(key, 0) + 1 used_families.append(entry.get("visual_family", "")) used_files.append(entry.get("file", "")) # ── Per-item drill_down slides (gold standard pattern) ──────────────────── # For groups slides that carry item descriptions, emit one drill_down slide # per item immediately after the infographic overview. # (overview_groups passes items without descriptions so this block is skipped.) # # IMPORTANT: pick ONE card family for the whole topic so every drill_down # in this group uses the SAME shape style. The next groups topic gets a # different family. This mirrors the gold-standard convention where related # points share one visual treatment. title = d.get("title", "") _card_entry = None if drill_catalogue and drill_dir is not None: _ucf = used_card_families if used_card_families is not None else [] _card_entry = _pick_card_family(drill_catalogue, _ucf) if _card_entry is not None and used_card_families is not None: used_card_families.append(_card_entry.get("name", "")) # Uniform label font across this set of cards so they look consistent. _set_label_pt = _drill_label_pt_for_topic( [item.get("label", "") for item in items] ) for i, item in enumerate(items): # New schema: each item has explicit `bullets` (3 short sentences). # Backwards-compat: if `bullets` missing, derive from `description`. bullets = item.get("bullets") or [] if not bullets: desc = (item.get("description") or "").strip() if not desc: continue raw = desc.rstrip(".") bullets = [s.strip() for s in raw.replace("; ", ".|").replace(". ", ".|").split("|") if s.strip()] if not bullets: bullets = [desc] bullets = bullets[:3] item_d = { "title": title, "step_num": i + 1, "step_label": item.get("label", ""), "bullets": bullets, } _build_drill_down_card( cloner, tmpl_path, drill_dir, drill_catalogue or [], item_d, dna, used_card_families if used_card_families is not None else [], card_entry=_card_entry, illus_images=illus_images, illus_used=illus_used, course_words=course_words, label_font_pt=_set_label_pt, ) def _build_overview_groups( cloner: SlideCloner, tmpl_path: str, bank_dir: Path, catalogue: list, d: dict, used_slides: dict, used_families: list, is_tech: bool, used_files: list, dna: dict, ): """ Overview groups slide: labels only (no descriptions). Same architecture as _build_groups_slide but items have no descriptions. """ items = d.get("items") or d.get("steps") or [] title_lower = d.get("title", "").strip().lower() items = [it for it in items if it.get("label", "").strip().lower() != title_lower] # Strip descriptions for overview — labels only. KEEP intro_points so the # overview slide gets its 2-bullet introduction above the infographic # (matches the gold standard convention). labels_only = [{"label": it.get("label", "")} for it in items] overview_d = dict(d, items=labels_only) _build_groups_slide(cloner, tmpl_path, bank_dir, catalogue, overview_d, used_slides, used_families, is_tech, used_files, dna) def _pick_card_family(drill_catalogue: list, used_card_families: list) -> dict | None: """ Choose one card entry from the catalog, penalising recently-used families. Caller is responsible for appending the chosen name to used_card_families so the rotation tracks topic-level usage (not per-drill_down usage). """ if not drill_catalogue: return None import random as _random def _score(entry): name = entry.get("name", "") s = 0.0 recent = used_card_families[-3:] for i, used in enumerate(reversed(recent)): if used == name: s -= (3.0 - i) return s + _random.uniform(0, 0.3) return max(drill_catalogue, key=_score) def _build_drill_down_card( cloner: SlideCloner, tmpl_path: str, drill_dir: Path, drill_catalogue: list, d: dict, dna: dict, used_card_families: list, card_entry: dict = None, illus_images: list = None, illus_used: set = None, course_words: set = None, label_font_pt: float | None = None, ): """ Drill-down rendered as a gold-standard card cloned from the drill_down_bank. When card_entry is provided, that family is used (lets a groups slide lock all its drill_downs to the same card style). When card_entry is None, a fresh family is picked and used_card_families is updated. For card families with has_image_zone=True, the source slide's baked-in illustration is swapped for one picked from illus_images so each drill_down gets a distinct image instead of repeating the gold standard's example. Falls back to the procedural _build_drill_down when no catalog is available. """ if not drill_catalogue or drill_dir is None: return _build_drill_down(cloner, tmpl_path, d, dna) if card_entry is None: card_entry = _pick_card_family(drill_catalogue, used_card_families) if card_entry is not None: used_card_families.append(card_entry.get("name", "")) entry = card_entry if entry is None: return _build_drill_down(cloner, tmpl_path, d, dna) src_path = str(drill_dir / entry["file"]) try: slide = cloner.clone_slide(src_path, entry["slide_index"]) except Exception as exc: print(f" [warn] drill card clone failed ({exc!r}) — using procedural fallback") return _build_drill_down(cloner, tmpl_path, d, dna) _strip_gold_card_chrome(slide) step_num = str(d.get("step_num", 1)) step_label = d.get("step_label", "") or d.get("title", "") bullets = d.get("bullets", []) or [] body_text = "\n".join(bullets) if bullets else d.get("body", "") _fill_card_slots(slide, step_num, step_label, body_text, bullets_supported=entry.get("bullets_supported", True), label_font_pt=label_font_pt) # If this card family has an illustration zone, swap the gold-standard's # baked-in picture for a fresh one from our illustration pool — so each # drill_down gets its own image instead of repeating the same one. if entry.get("has_image_zone") and illus_images: # Use the item label as the keyword hint so the illustration matches # the drill_down's specific topic (e.g. "Recruitment" → a hiring image). keyword = step_label or d.get("title", "") img_path = _pick_illustration_image( illus_images, keyword, illus_used or set(), course_words=course_words ) if img_path: if illus_used is not None: illus_used.add(img_path) _swap_card_illustration(slide, img_path) # Apply template chrome on top — keep the gold-standard card at its # original position so the card design renders exactly as authored. _apply_template_chrome(cloner, slide, tmpl_path, d.get("title", ""), dna, preserve_group_position=True) def _swap_card_illustration(slide, new_img_path: str) -> None: """ Find the body-area illustration picture on a cloned card slide and replace it with new_img_path, preserving the original position and size so the new image lands exactly where the gold-standard illustration was. Skips small corner-chrome pictures (already removed by _strip_gold_card_chrome). """ pic_tag = f"{{{_NS_P}}}pic" spTree = slide.shapes._spTree target = None target_geom = None target_area = 0 for el in spTree: if el.tag != pic_tag: continue x, y, cx, cy = _get_grp_xfrm(el) # Only consider substantial body-area pictures (>= 2.5×2.5 in) if cx < int(2.5 * _EMU) or cy < int(2.5 * _EMU): continue if y < int(1.0 * _EMU): continue area = cx * cy if area > target_area: target = el target_geom = (x, y, cx, cy) target_area = area if target is None or target_geom is None: return x, y, cx, cy = target_geom # Remove the old picture parent = target.getparent() if parent is not None: parent.remove(target) # Add the new picture preserving its natural aspect ratio. # First insert at width=cx so pptx computes height from aspect; if that # height exceeds cy, re-insert with height=cy and computed width. # Then centre within the target box so the image sits cleanly inside it. try: from pptx.util import Emu pic = slide.shapes.add_picture(new_img_path, Emu(x), Emu(y), width=Emu(cx)) if pic.height > cy: pic._element.getparent().remove(pic._element) pic = slide.shapes.add_picture(new_img_path, Emu(x), Emu(y), height=Emu(cy)) # Centre within the target box pic.left = Emu(x + (cx - pic.width) // 2) pic.top = Emu(y + (cy - pic.height) // 2) except Exception as exc: print(f" [warn] illustration swap failed ({exc!r})") def _strip_gold_card_chrome(slide) -> None: """ Remove the gold-standard's own title, watermark, and corner-decoration pictures from a cloned card slide. Leaves the card group + any body-area illustration picture intact. """ sp_tag = f"{{{_NS_P}}}sp" pic_tag = f"{{{_NS_P}}}pic" t_tag = f"{{{_NS_A}}}t" spTree = slide.shapes._spTree to_remove = [] for el in list(spTree): x, y, cx, cy = _get_grp_xfrm(el) if el.tag == sp_tag: # Strip placeholders if el.find(f".//{{{_NS_P}}}ph") is not None: to_remove.append(el); continue txb = el.find(f"{{{_NS_P}}}txBody") if txb is None: continue text = "".join(t.text or "" for t in txb.iter(t_tag)).strip().lower() # Strip wide title-zone text if y < int(1.4 * _EMU) and cx > int(4 * _EMU) and text: to_remove.append(el); continue # Strip watermark if "knowledgeacademy" in text or "the knowledge academy" in text: to_remove.append(el); continue elif el.tag == pic_tag: # Strip small corner-chrome pictures only — keep body-area illustrations. small = cx < int(2.2 * _EMU) and cy < int(2.2 * _EMU) in_corner = ( (y < int(1.5 * _EMU) and (x < int(2.5 * _EMU) or x + cx > int(11 * _EMU))) or (y + cy > int(5.5 * _EMU) and (x < int(2.5 * _EMU) or x + cx > int(8 * _EMU))) ) if small and in_corner: to_remove.append(el) for el in to_remove: parent = el.getparent() if parent is not None: parent.remove(el) def _fill_card_slots( slide, step_num: str, step_label: str, body_text: str, bullets_supported: bool = True, label_font_pt: float | None = None, ) -> None: """ Detect role slots on a cloned card slide by text-pattern and replace each: • shape with a single digit text → number slot (filled with step_num) • shortest non-numeric, non-empty → label slot (filled with step_label) • longest text shape → body slot (filled with body_text) Searches both top-level sp elements and sps nested inside grpSp containers so it works for cards whose number lives inside a decorative group. """ from lxml import etree sp_tag = f"{{{_NS_P}}}sp" t_tag = f"{{{_NS_A}}}t" txBody_tag = f"{{{_NS_P}}}txBody" candidates: list[tuple] = [] # (sp_el, text, len) for el in slide.shapes._spTree.iter(sp_tag): if el.find(f".//{{{_NS_P}}}ph") is not None: continue txb = el.find(txBody_tag) if txb is None: continue text = "".join(t.text or "" for t in txb.iter(t_tag)).strip() if not text: continue # Skip remaining chrome if "knowledgeacademy" in text.lower(): continue candidates.append((el, text, len(text))) if not candidates: return # Identify number slot — text is short digit(s) "1"-"99" number_el = None for el, text, _ in candidates: if text.isdigit() and len(text) <= 2: number_el = el break # Remaining (non-number) text shapes others = [(el, text, n) for (el, text, n) in candidates if el is not number_el] if not others: return others_sorted = sorted(others, key=lambda r: r[2]) label_el = others_sorted[0][0] body_el = others_sorted[-1][0] if len(others_sorted) > 1 else None if number_el is not None: _replace_text_keeping_run_style(number_el, step_num) if label_el is not None: # Pre-shrink the label's font (and vertical-centre it). If the caller # passed a uniform size (so all cards in a set match), use it directly; # otherwise auto-fit per label. _shrink_label_font_to_fit(label_el, step_label, forced_pt=label_font_pt) _replace_text_keeping_run_style(label_el, step_label) if body_el is not None: # Constrain body box so it doesn't extend into the template's corner- # chrome zone (bottom-left decoration image typically starts at y≈5.58). # The drill_down_bank cards were authored with bodies sized for a # different template; without this, long bullet text overflows the box # and visually overlaps the corner picture. _constrain_card_body_box(body_el, chrome_top_y_in=5.4) if bullets_supported and "\n" in body_text: _replace_text_with_paragraphs(body_el, body_text.split("\n"), bullet=True) else: _replace_text_keeping_run_style(body_el, body_text.replace("\n", " ")) # Layer normAutofit on top so PowerPoint shrinks the font further at # runtime if the LLM produced extra-dense bullets that still overflow. txb = body_el.find(f"{{{_NS_P}}}txBody") if txb is not None: _enable_normAutofit(txb) def _constrain_card_body_box(sp_el, chrome_top_y_in: float = 5.4) -> None: """ Resize a drill_down card's body box so its bottom edge stays above the template's bottom-row chrome. Without this, the bank card's authored body height (typically 5.2-5.5in bottom) sits right where the chrome corner picture lives — long bullets render through it. """ spPr = sp_el.find(f"{{{_NS_P}}}spPr") if spPr is None: return xf = spPr.find(f"{{{_NS_A}}}xfrm") if xf is None: return off = xf.find(f"{{{_NS_A}}}off") ext = xf.find(f"{{{_NS_A}}}ext") if off is None or ext is None: return try: y_in = int(off.get("y", 0)) / _EMU cy_in = int(ext.get("cy", 0)) / _EMU except Exception: return if y_in <= 0 or cy_in <= 0: return max_bottom = chrome_top_y_in new_cy = max(min(cy_in, max_bottom - y_in), 1.5) # never shorter than 1.5in if new_cy < cy_in - 0.05: ext.set("cy", str(int(new_cy * _EMU))) def _shrink_label_font_to_fit(sp_el, new_text: str, forced_pt: float | None = None) -> None: """ Set the pill label font size so the longest single word never wraps, and vertically centre the text inside its box (capsule labels look top- anchored otherwise). If `forced_pt` is given, that exact size is used (caller computed a uniform font for all cards in a set so they look consistent). Otherwise the largest font that fits the longest single word is picked — target 14pt, floor 10pt. """ if not new_text: return spPr = sp_el.find(f"{{{_NS_P}}}spPr") xf = spPr.find(f"{{{_NS_A}}}xfrm") if spPr is not None else None ext = xf.find(f"{{{_NS_A}}}ext") if xf is not None else None if ext is None: return try: cx_in = int(ext.get("cx", 0)) / _EMU except Exception: return if cx_in <= 0: return txb = sp_el.find(f"{{{_NS_P}}}txBody") if txb is None: return rPr = txb.find(f".//{{{_NS_A}}}rPr") if rPr is None: return # Vertically centre the text inside the capsule. Without anchor="ctr" the # text sits at the top of the box, which reads as visually mis-aligned # when the box is taller than one line of text. bodyPr = txb.find(f"{{{_NS_A}}}bodyPr") if bodyPr is not None: bodyPr.set("anchor", "ctr") if forced_pt is not None and forced_pt > 0: rPr.set("sz", str(int(forced_pt * 100))) return # Subtract default text-frame internal margins (lIns/rIns ~ 0.1in each) # so the capacity estimate reflects the usable inside width. effective_cx = max(cx_in - 0.2, 0.3) words = new_text.split() longest_word_len = max((len(w) for w in words), default=len(new_text)) # Char width factor for bold Calibri-ish faces: ~0.0095in per pt at 1.0pt. # i.e. a 14pt bold char averages ~0.133in wide. def fits(pt: float) -> bool: char_w = pt * 0.0095 if char_w <= 0: return False per_line = effective_cx / char_w if longest_word_len > per_line: return False return len(new_text) <= per_line * 2 target_pt = 10.0 for pt_try in (14.0, 13.0, 12.0, 11.0, 10.0): if fits(pt_try): target_pt = pt_try break rPr.set("sz", str(int(target_pt * 100))) def _drill_label_pt_for_topic(step_labels: list[str], box_cx_in: float = 1.47) -> float: """ Pick a single uniform font size for a set of drill_down cards sharing one parent topic, so all cards in the set render at the same size. Picks the largest pt (capped at 14, floor 10) where the longest single word across `step_labels` fits one line in a box of `box_cx_in` inches. """ if not step_labels: return 14.0 effective_cx = max(box_cx_in - 0.2, 0.3) longest_word_len = max( (len(w) for label in step_labels for w in label.split()), default=0, ) if longest_word_len == 0: return 14.0 for pt_try in (14.0, 13.0, 12.0, 11.0, 10.0): if longest_word_len * pt_try * 0.0095 <= effective_cx: return pt_try return 10.0 def _replace_text_keeping_run_style(sp_el, new_text: str) -> None: """ Replace all text inside the first of an sp's txBody, preserving the first 's (font, size, color). Removes any extra paragraphs. IMPORTANT: per the OOXML spec, elements must appear BEFORE any trailing . When PowerPoint sees a run after endParaRPr it silently fails to render the run (white-text-on-white symptom). We build the new run, then re-insert at the correct position relative to endParaRPr. """ from lxml import etree txb = sp_el.find(f"{{{_NS_P}}}txBody") if txb is None: return paras = txb.findall(f"{{{_NS_A}}}p") if not paras: return first_p = paras[0] # Capture first run's rPr to reuse first_r = first_p.find(f"{{{_NS_A}}}r") rPr_src = first_r.find(f"{{{_NS_A}}}rPr") if first_r is not None else None rPr_copy = etree.fromstring(etree.tostring(rPr_src)) if rPr_src is not None else None # Strip all runs from first paragraph (keep endParaRPr intact) for r in first_p.findall(f"{{{_NS_A}}}r"): first_p.remove(r) # Strip extra paragraphs for extra_p in paras[1:]: txb.remove(extra_p) # Force center alignment on the paragraph (user preference for card text). pPr = first_p.find(f"{{{_NS_A}}}pPr") if pPr is None: pPr = etree.Element(f"{{{_NS_A}}}pPr") first_p.insert(0, pPr) pPr.set("algn", "ctr") # Build new run new_r = etree.Element(f"{{{_NS_A}}}r") if rPr_copy is not None: new_r.append(rPr_copy) else: rPr_new = etree.SubElement(new_r, f"{{{_NS_A}}}rPr") rPr_new.set("lang", "en-US") t_el = etree.SubElement(new_r, f"{{{_NS_A}}}t") t_el.text = new_text # Insert BEFORE endParaRPr if present, else append. endParaRPr = first_p.find(f"{{{_NS_A}}}endParaRPr") if endParaRPr is not None: endParaRPr.addprevious(new_r) else: first_p.append(new_r) def _replace_text_with_paragraphs(sp_el, lines: list[str], bullet: bool = False) -> None: """ Replace sp_el's txBody contents with one paragraph per line, preserving the first run's for font styling. When bullet=True, paragraphs get a Wingdings checkmark bullet. Auto-shrinks the body font (down to 10pt minimum) so 3 bullets always fit in the card's body box height — prevents the vertical-line/card families where authored cy is small from overflowing the chrome at the bottom. """ from lxml import etree txb = sp_el.find(f"{{{_NS_P}}}txBody") if txb is None: return paras = txb.findall(f"{{{_NS_A}}}p") rPr_copy = None if paras: first_r = paras[0].find(f"{{{_NS_A}}}r") rPr_src = first_r.find(f"{{{_NS_A}}}rPr") if first_r is not None else None if rPr_src is not None: rPr_copy = etree.fromstring(etree.tostring(rPr_src)) # Remove all existing paragraphs for p in paras: txb.remove(p) # Centralised body layout — top-anchored, top/side padding, noAutofit. # Card-specific: extra top-pad (0.3in) clears the banner/header that sits # just above the body in many card layouts. _apply_body_layout(txb, top_pad_in=0.3, side_pad_in=0.25, bottom_pad_in=0.1) # Compute the font size that lets all paragraphs fit in the box height. # Reads the sp's spPr/xfrm cx/cy, accounts for the insets we just applied. fit_pt = _fit_card_body_pt(sp_el, lines, top_pad_in=0.3, side_pad_in=0.25, bottom_pad_in=0.1, gap_pt=9.0, max_pt=16.0, min_pt=10.0) if rPr_copy is not None and fit_pt is not None: rPr_copy.set("sz", str(int(fit_pt * 100))) for idx, line in enumerate(lines): line = line.strip() if not line: continue p_el = etree.SubElement(txb, f"{{{_NS_A}}}p") pPr = etree.SubElement(p_el, f"{{{_NS_A}}}pPr") # Bullet body stays LEFT-aligned (the standard hanging-bullet pattern). # Card labels/numbers are centered separately in _replace_text_keeping_run_style. pPr.set("algn", "l") # Line spacing 115% within a bullet; space-after between bullets for # the gold-standard breathing room. Skip space-after on the last line. lnSpc = etree.SubElement(pPr, f"{{{_NS_A}}}lnSpc") spcPct_ln = etree.SubElement(lnSpc, f"{{{_NS_A}}}spcPct") spcPct_ln.set("val", "115000") is_last = idx >= len([l for l in lines if l.strip()]) - 1 if not is_last: spcAft = etree.SubElement(pPr, f"{{{_NS_A}}}spcAft") spcPts = etree.SubElement(spcAft, f"{{{_NS_A}}}spcPts") spcPts.set("val", "900") # 9pt space after each bullet if bullet: pPr.set("marL", "285750"); pPr.set("indent", "-285750") buFont = etree.SubElement(pPr, f"{{{_NS_A}}}buFont") buFont.set("typeface", "Wingdings") buFont.set("panose", "05000000000000000000") buFont.set("pitchFamily", "2"); buFont.set("charset", "2") buChar = etree.SubElement(pPr, f"{{{_NS_A}}}buChar") buChar.set("char", "\xfc") r_el = etree.SubElement(p_el, f"{{{_NS_A}}}r") if rPr_copy is not None: rPr_new = etree.fromstring(etree.tostring(rPr_copy)) # Enforce minimum readable font size (14pt) on bullet text try: sz = int(rPr_new.get("sz", "1600")) if sz < 1400: rPr_new.set("sz", "1400") except Exception: pass r_el.append(rPr_new) else: rPr_new = etree.SubElement(r_el, f"{{{_NS_A}}}rPr") rPr_new.set("lang", "en-US") rPr_new.set("sz", "1400") t_el = etree.SubElement(r_el, f"{{{_NS_A}}}t") t_el.text = line def _build_drill_down(cloner: SlideCloner, tmpl_path: str, d: dict, dna: dict): """ Drill-down content slide: large icon circle (left) + brand banner + bullet body (right). Layout mirrors the gold standard: circle with step number on left, banner header + checkmark bullets on the right two-thirds. """ from pptx.util import Inches, Pt from pptx.dml.color import RGBColor from pptx.enum.text import PP_ALIGN from pptx.oxml.ns import qn as _qn slide = cloner.clone_slide(tmpl_path, dna["slide_indices"]["content"]) _remove_decorative_groups(slide) slots = _content_slots(slide) title = d.get("title", "") step_label = d.get("step_label", "") step_num = d.get("step_num", 1) bullets = d.get("bullets", []) title_slot, body_slot, other_slots = _find_title_body_slots(slots) if title_slot is None and slots: title_slot = slots[0] if body_slot is None and len(slots) > 1: body_slot = slots[1] for s in other_slots: _clear_txbody(s["element"]) if title_slot: _set_text(title_slot["element"], title, cx_in=title_slot.get("cx_in", 0), cy_in=title_slot.get("cy_in", 0), min_font_pt=24, font_family=dna.get("brand_font") or None) # Brand colour from DNA (navy fallback) brand = dna.get("brand_color", "1F3564") try: BRAND = RGBColor(int(brand[0:2],16), int(brand[2:4],16), int(brand[4:6],16)) except Exception: BRAND = RGBColor(0x1F, 0x35, 0x64) WHITE = RGBColor(0xFF, 0xFF, 0xFF) body_x = dna["body"]["x"] body_y = dna["body"]["y"] body_cx = body_cx_for_text_only(dna, has_image=False) # ── Icon circle (left) ──────────────────────────────────────────────────── ICON_D = 1.25 # diameter in inches — compact, matches gold standard ICON_GAP = 0.25 # gap between circle right edge and content left content_x = body_x + ICON_D + ICON_GAP content_cx = body_cx - ICON_D - ICON_GAP icon = slide.shapes.add_shape( 9, # oval/circle Inches(body_x), Inches(body_y), Inches(ICON_D), Inches(ICON_D), ) icon.fill.solid() icon.fill.fore_color.rgb = BRAND icon.line.fill.background() tf_i = icon.text_frame tf_i.word_wrap = False tf_i.margin_top = Inches(ICON_D / 2 - 0.22) p_i = tf_i.paragraphs[0] p_i.text = str(step_num) p_i.font.bold = True p_i.font.size = Pt(36) p_i.font.color.rgb = WHITE p_i.alignment = PP_ALIGN.CENTER # ── Brand banner (right of icon) ────────────────────────────────────────── bar = slide.shapes.add_shape( 1, Inches(content_x), Inches(body_y), Inches(content_cx), Inches(0.45) ) bar.fill.solid() bar.fill.fore_color.rgb = BRAND bar.line.fill.background() tf = bar.text_frame tf.margin_left = Inches(0.12) tf.margin_top = Inches(0.04) p = tf.paragraphs[0] p.text = step_label p.font.bold = True p.font.size = Pt(16) p.font.color.rgb = WHITE # ── Bullet body below the banner ────────────────────────────────────────── if body_slot: bullet_y = body_y + 0.55 _resize_shape(body_slot["element"], content_cx, 3.2, new_x_in=content_x) sp_el = body_slot["element"].getparent() spPr = sp_el.find(_qn("p:spPr")) if spPr is not None: xfrm = spPr.find(_qn("a:xfrm")) if xfrm is not None: off = xfrm.find(_qn("a:off")) if off is not None: off.set("y", str(int(bullet_y * _EMU))) _set_paragraphs(body_slot["element"], bullets, cx_in=content_cx, cy_in=3.2, bullet=True) # ═══════════════════════════════════════════════════════════════════════════════ # Assessment and practical slide builders # ═══════════════════════════════════════════════════════════════════════════════ def _build_quiz(cloner: SlideCloner, tmpl_path: str, d: dict, dna: dict): """ Clone the template's quiz slide (slide 6 in the 7-slide contract) and fill it with one MCQ — question + 4 options labelled A/B/C/D. The quiz layout across all current templates has the same shape pattern: • A wide title textbox carrying the literal word "Quiz" • A prominent question container (Rectangle: Diagonal Corners Rounded) nested inside a group, holding the question text + answer placeholders • Two small decorative 3-oval groups in opposing corners We replace: • title → "Quiz — Question N" • question container body → question text + 4 newline-separated options The answer is *not* marked visually — trainers reveal the answer verbally. """ slide = cloner.clone_slide(tmpl_path, dna["slide_indices"]["quiz"]) question_num = int(d.get("question_num", 1)) question_text = (d.get("question") or "").strip() options = list(d.get("options") or []) # Pad / truncate to exactly 4 options so the visual is always consistent options = (options + ["", "", "", ""])[:4] labels = ["A", "B", "C", "D"] body_text = question_text + "\n\n" + "\n".join( f"{labels[i]}. {opt}" for i, opt in enumerate(options) if opt ) title_text = f"Quiz — Question {question_num}" # Find slots — walk into groups so we catch the question rectangle # which is nested inside the decorative quiz group. slots = _collect_slots(slide) # Title slot: topmost wide text-bearing shape with "Quiz" text, OR the # topmost shape whose original text matches "Quiz". title_slot = None question_slot = None for s in slots: original = (s.get("text", "") or "").strip().lower() cx_in = s.get("cx_in", 0) cy_in = s.get("cy_in", 0) y_in = s.get("abs_y", 0) # Title: wide shape near top with "quiz" in original text if title_slot is None and "quiz" in original and y_in < 2.0 and cx_in > 6.0: title_slot = s continue # Question container: large rectangle below the title if cy_in > 2.0 and cx_in > 6.0: if question_slot is None or s.get("cy_in", 0) > question_slot.get("cy_in", 0): question_slot = s if title_slot is not None: _set_text(title_slot["element"], title_text, cx_in=title_slot.get("cx_in", 0), cy_in=title_slot.get("cy_in", 0)) if question_slot is not None: _set_text(question_slot["element"], body_text, cx_in=question_slot.get("cx_in", 0), cy_in=question_slot.get("cy_in", 0)) def _build_assessment_marker(cloner: SlideCloner, tmpl_path: str, d: dict, dna: dict): """ quiz / qa — a trainer-delivery marker slide. No learner-visible content beyond a title and a one-line instruction. The trainer delivers the questions verbally or facilitates discussion. """ stype = d.get("type", "quiz") if stype == "qa": title = d.get("title", "Questions and Answers") body_text = "Open Q&A session — invite learners to ask questions and discuss the topics covered so far." else: title = d.get("title", "Knowledge Check") body_text = "Trainer-led quiz — the trainer will pose questions verbally for the group to discuss and answer." slide = cloner.clone_slide(tmpl_path, dna["slide_indices"]["content"]) _remove_decorative_groups(slide) slots = _content_slots(slide) title_slot, body_slot, other_slots = _find_title_body_slots(slots) if title_slot is None and slots: title_slot = slots[0] if body_slot is None and len(slots) > 1: body_slot = slots[1] for s in other_slots: _clear_txbody(s["element"]) if title_slot: _set_text(title_slot["element"], title, cx_in=title_slot.get("cx_in", 0), cy_in=title_slot.get("cy_in", 0)) body_cx = body_cx_for_text_only(dna, has_image=False) if body_slot: _resize_shape(body_slot["element"], body_cx, dna["body"]["cy"], new_x_in=dna["body"]["x"], new_y_in=dna["body"]["y"]) _set_paragraphs(body_slot["element"], [body_text], cx_in=body_cx, cy_in=dna["body"]["cy"], bullet=False) def _build_scenario_qa(cloner: SlideCloner, tmpl_path: str, d: dict, dna: dict): """ scenario_qa — workplace scenario for trainer-led discussion. Preferred format (STRUCTURED, 4 sections): d carries d["background"], d["task"], d["challenge"], d["objective"] Renders as four labelled point blocks via _write_points_body. Legacy format: d["scenarios"] = list of {"num", "text"} dicts. Renders as plain paragraphs. """ title = d.get("title", "Scenario Discussion") slide = cloner.clone_slide(tmpl_path, dna["slide_indices"]["content"]) _remove_decorative_groups(slide) slots = _content_slots(slide) title_slot, body_slot, other_slots = _find_title_body_slots(slots) if title_slot is None and slots: title_slot = slots[0] if body_slot is None and len(slots) > 1: body_slot = slots[1] for s in other_slots: _clear_txbody(s["element"]) if title_slot: _set_text(title_slot["element"], title, cx_in=title_slot.get("cx_in", 0), cy_in=title_slot.get("cy_in", 0)) body_cx = body_cx_for_text_only(dna, has_image=False) # Use full content_zone height for breathing room cz = dna.get("content_zone") body_y = dna["body"]["y"] if cz and cz.get("cy", 0) > 0: body_cy = float(cz["cy"]) - max(0.0, body_y - float(cz.get("y", body_y))) body_cy = max(body_cy, 3.0) else: body_cy = max(dna["body"]["cy"], 3.0) # Detect structured 4-section format section_keys = ("background", "task", "challenge", "objective") if any(d.get(k) for k in section_keys): points = [] for key in section_keys: text = (d.get(key) or "").strip() if text: points.append({"label": key.title(), "text": text}) if body_slot and points: _resize_shape(body_slot["element"], body_cx, body_cy, new_x_in=dna["body"]["x"], new_y_in=body_y) _write_points_body(body_slot["element"], points, cx_in=body_cx, cy_in=body_cy) return elif body_slot: _clear_txbody(body_slot["element"]) return # Legacy format: paragraph list scenarios = d.get("scenarios", []) paras = [s.get("text", "").strip() for s in scenarios if s.get("text", "").strip()] if body_slot and paras: _resize_shape(body_slot["element"], body_cx, body_cy, new_x_in=dna["body"]["x"], new_y_in=body_y) _set_paragraphs(body_slot["element"], paras, cx_in=body_cx, cy_in=body_cy, bullet=False) elif body_slot: _clear_txbody(body_slot["element"]) def _build_scenario(cloner: SlideCloner, tmpl_path: str, d: dict, dna: dict): """ scenario — practice scenario slide (skills courses). Renders the background text as flowing prose paragraphs. LAYOUT DECISIONS (deliberate): • Title slot is RESIZED to 1.3in tall so a 2-line scenario title ("Scenario N: Long Descriptive Title") renders at a readable size (~24-28pt) instead of being auto-shrunk into 14pt-on-one-line. • Body slot is EXTENDED to fill the remaining vertical space — the old layout left ~40% of the slide blank below the body. • Body uses normAutofit so even the expanded background paragraphs (4-6 sentences with names + stakes + constraints) fit cleanly. """ slide = cloner.clone_slide(tmpl_path, dna["slide_indices"]["content"]) _remove_decorative_groups(slide) slots = _content_slots(slide) title_slot, body_slot, other_slots = _find_title_body_slots(slots) if title_slot is None and slots: title_slot = slots[0] if body_slot is None and len(slots) > 1: body_slot = slots[1] for s in other_slots: _clear_txbody(s["element"]) # Layout dimensions targeting the full content area below the chrome. slide_h = dna.get("slide_h_in", 7.5) title_x = dna.get("title", {}).get("x", 0.66) title_y = dna.get("title", {}).get("y", 0.58) title_cx = dna.get("title", {}).get("cx", 11.4) title_cy_target = 1.3 # taller than default 0.71 to host 2-line titles at ~26pt body_cx_target = body_cx_for_text_only(dna, has_image=False) body_x = dna.get("body", {}).get("x", 0.66) body_y_target = title_y + title_cy_target + 0.2 # 0.2in gap after title body_cy_target = max(slide_h - body_y_target - 0.8, 3.0) # leave 0.8in bottom margin if title_slot: _resize_shape(title_slot["element"], title_cx, title_cy_target, new_x_in=title_x, new_y_in=title_y) # Cap title font at 28pt by pinning explicitly — prevents the placeholder # from rendering huge if the original template font was 36-44pt. _set_text(title_slot["element"], d.get("title", ""), cx_in=title_cx, cy_in=title_cy_target, font_pt=min(28.0, _calc_title_pt(d.get("title", ""), title_cx, title_cy_target))) background = d.get("background", "") paras = [p.strip() for p in background.split("\n\n") if p.strip()] if background else [] if body_slot: _resize_shape(body_slot["element"], body_cx_target, body_cy_target, new_x_in=body_x, new_y_in=body_y_target) _set_paragraphs(body_slot["element"], paras or [""], cx_in=body_cx_target, cy_in=body_cy_target, bullet=False) # Layer normAutofit on top of the fitted font so PowerPoint can shrink # further at runtime if needed (defence in depth against dense prose). _enable_normAutofit(body_slot["element"]) def _build_prose_slide(cloner: SlideCloner, tmpl_path: str, d: dict, dna: dict): """ case_study / activity — flowing prose paragraphs with a section heading embedded in the first paragraph (e.g., "Background:\\n\\n..."). LAYOUT DECISIONS (deliberate): • Title font is CAPPED at 24pt — case-study titles like "Case Study: NexTech Solutions" are short, so the title placeholder's native 36-44pt rendering used to produce a comically oversized heading. • Body slot is EXTENDED to fill the available vertical area below the title, so 2-3 dense paragraphs of case content have room. • Body uses normAutofit + min 9pt floor so prose-heavy slides (the elaborated case_study spec is 150-250 words per slide) actually fit without overflowing the slide chrome. """ slide = cloner.clone_slide(tmpl_path, dna["slide_indices"]["content"]) _remove_decorative_groups(slide) slots = _content_slots(slide) title_slot, body_slot, other_slots = _find_title_body_slots(slots) if title_slot is None and slots: title_slot = slots[0] if body_slot is None and len(slots) > 1: body_slot = slots[1] for s in other_slots: _clear_txbody(s["element"]) slide_h = dna.get("slide_h_in", 7.5) title_x = dna.get("title", {}).get("x", 0.66) title_y = dna.get("title", {}).get("y", 0.58) title_cx = dna.get("title", {}).get("cx", 11.4) title_cy_target = 1.0 # comfortable for a short single-line case_study title body_cx_target = body_cx_for_text_only(dna, has_image=False) body_x = dna.get("body", {}).get("x", 0.66) body_y_target = title_y + title_cy_target + 0.2 body_cy_target = max(slide_h - body_y_target - 0.8, 3.5) if title_slot: _resize_shape(title_slot["element"], title_cx, title_cy_target, new_x_in=title_x, new_y_in=title_y) # Cap title at 24pt — case_study titles are short and look huge otherwise. _set_text(title_slot["element"], d.get("title", ""), cx_in=title_cx, cy_in=title_cy_target, font_pt=24.0) paras = d.get("paragraphs", []) if body_slot and paras: _resize_shape(body_slot["element"], body_cx_target, body_cy_target, new_x_in=body_x, new_y_in=body_y_target) _set_paragraphs(body_slot["element"], paras, cx_in=body_cx_target, cy_in=body_cy_target, bullet=False) # normAutofit so PowerPoint shrinks further if the LLM produced # extra-dense content (the elaborated case_study spec can run long). _enable_normAutofit(body_slot["element"]) elif body_slot: _clear_txbody(body_slot["element"]) def _build_ending(cloner: SlideCloner, tmpl_path: str, d: dict, dna: dict) -> None: """ Clone the template's ending slide and fix the two long-text overflows the template ships with: the 48pt "Congratulations" heading box is too narrow for the word at that size, and the social-handle boxes (e.g. "/The.Knowledge.Academy.Ltd") are sized for shorter handles than the template now carries. """ slide = cloner.clone_slide(tmpl_path, dna["slide_indices"]["ending"]) sp_tag = f"{{{_NS_P}}}sp" t_tag = f"{{{_NS_A}}}t" for sp in slide.shapes._spTree.iter(sp_tag): text = "".join(t.text or "" for t in sp.iter(t_tag)).strip() if not text: continue low = text.lower() if low == "congratulations": # Width needed at 48pt bold: ~15ch × 48 × 0.0095 ≈ 6.84in. # Widen to 7.0in (well within 13.33in slide); keep original y/x. txBody = sp.find(f"{{{_NS_P}}}txBody") if txBody is not None: spPr = sp.find(f"{{{_NS_P}}}spPr") xfrm = spPr.find(f"{{{_NS_A}}}xfrm") if spPr is not None else None if xfrm is not None: ext = xfrm.find(f"{{{_NS_A}}}ext") if ext is not None: ext.set("cx", str(int(7.0 * _EMU))) elif text.startswith("/") and len(text) > 14: # Social-handle text "/The.Knowledge.Academy.Ltd", # "/the-knowledge-academy". The boxes are 3.5in at 16pt; at 26ch # this needs ~4.0in. Shrinking to 13pt fits comfortably (~3.2in) # without touching the icon+text group layout. for rPr in sp.iter(f"{{{_NS_A}}}rPr"): rPr.set("sz", "1300") # ═══════════════════════════════════════════════════════════════════════════════ # V3 helpers: bank-as-base slide building # ═══════════════════════════════════════════════════════════════════════════════ def _strip_bank_header(slide, dna: dict) -> None: """ Remove the bank slide's title zone text shapes, placeholder shapes, and any existing watermark so we can replace them with template chrome. Decorative visual groups in the title zone are left untouched. """ sp_tag = f"{{{_NS_P}}}sp" t_tag = f"{{{_NS_A}}}t" title_zone_y = int(dna.get("content_zone", dna["body"])["y"] * _EMU) spTree = slide.shapes._spTree to_remove = [] for el in list(spTree): if el.tag != sp_tag: continue # Always strip placeholder shapes if el.find(f".//{{{_NS_P}}}ph") is not None: to_remove.append(el) continue txb = el.find(f"{{{_NS_P}}}txBody") if txb is None: continue text = "".join(t.text or "" for t in txb.iter(t_tag)).lower().strip() x, y, cx, cy = _get_grp_xfrm(el) # Remove text shapes in the title zone if y < title_zone_y and text: to_remove.append(el) continue # Remove bank watermark / knowledge academy branding if y > int(6.0 * _EMU) and ("knowledge" in text or len(text) < 5): to_remove.append(el) for el in to_remove: parent = el.getparent() if parent is not None: parent.remove(el) def _grp_actual_content_top(grp_el) -> int: """ Return the y of the topmost CHILD shape in slide-space EMU. Unlike _min_child_y_in_parent_space this does NOT fall back to the group bounding-box y — it skips zero-height stubs and returns the actual first visible child. Falls back to group bounding-box y if no children found. """ spPr = grp_el.find(f"{{{_NS_P}}}grpSpPr") if spPr is None: return _get_grp_xfrm(grp_el)[1] xfrm = spPr.find(f"{{{_NS_A}}}xfrm") if xfrm is None: return _get_grp_xfrm(grp_el)[1] off_el = xfrm.find(f"{{{_NS_A}}}off") ext_el = xfrm.find(f"{{{_NS_A}}}ext") chOff_el = xfrm.find(f"{{{_NS_A}}}chOff") chExt_el = xfrm.find(f"{{{_NS_A}}}chExt") grp_y = int(off_el.get("y", 0)) if off_el is not None else 0 grp_cy = int(ext_el.get("cy", 0)) if ext_el is not None else 0 chOff_y = int(chOff_el.get("y", 0)) if chOff_el is not None else 0 chExt_cy = int(chExt_el.get("cy", 0)) if chExt_el is not None else grp_cy scale = (grp_cy / chExt_cy) if chExt_cy else 1.0 min_y = float("inf") for child in grp_el: ctag = child.tag cPr_tag = (f"{{{_NS_P}}}grpSpPr" if ctag == f"{{{_NS_P}}}grpSp" else f"{{{_NS_P}}}spPr") cPr = child.find(cPr_tag) if cPr is None: continue cxfrm = cPr.find(f"{{{_NS_A}}}xfrm") if cxfrm is None: continue coff = cxfrm.find(f"{{{_NS_A}}}off") cext = cxfrm.find(f"{{{_NS_A}}}ext") if coff is None: continue c_cy = int(cext.get("cy", 0)) if cext is not None else 0 if c_cy == 0: continue # zero-height stub — skip c_y = int(coff.get("y", 0)) slide_y = grp_y + (c_y - chOff_y) * scale if slide_y < min_y: min_y = slide_y return int(min_y) if min_y < float("inf") else grp_y def _bottom_align_bank_shapes(slide, body_y_in: float, intro_height_in: float = 0.0) -> None: """ Shift all non-placeholder top-level bank shapes downward so the highest visible content clears the intro text area. Groups are inspected via their child coordinate system so the computed shift is exact rather than based on the outer bounding box (which may include the stripped bank-title zone). Skips shapes with cy >= 6.5 in (full-slide-height background containers). Call BEFORE _apply_template_chrome so template chrome shapes are not shifted. """ _FULL_SLIDE_CY = int(6.5 * _EMU) _MIN_CY = int(0.3 * _EMU) _SLIDE_MAX_Y = int(7.0 * _EMU) # don't push content below here gap_in = 0.15 if intro_height_in > 0 else 0.05 floor_emu = int((body_y_in + intro_height_in + gap_in) * _EMU) spTree = slide.shapes._spTree grp_tag = f"{{{_NS_P}}}grpSp" sp_tag = f"{{{_NS_P}}}sp" pic_tag = f"{{{_NS_P}}}pic" elems: list = [] max_shift = 0 for el in spTree: if el.tag not in (grp_tag, sp_tag, pic_tag): continue if el.find(f".//{{{_NS_P}}}ph") is not None: continue _, y_emu, _, cy_emu = _get_grp_xfrm(el) if cy_emu >= _FULL_SLIDE_CY or cy_emu < _MIN_CY: continue if el.tag == grp_tag: content_top = _grp_actual_content_top(el) else: content_top = y_emu needed = max(0, floor_emu - content_top) # Cap so shapes don't go below _SLIDE_MAX_Y cap = max(0, _SLIDE_MAX_Y - (y_emu + cy_emu)) needed = min(needed, cap) max_shift = max(max_shift, needed) elems.append(el) if not elems or max_shift == 0: return for el in elems: _is_grp = el.tag == f"{{{_NS_P}}}grpSp" _spPr_tag = f"{{{_NS_P}}}grpSpPr" if _is_grp else f"{{{_NS_P}}}spPr" _spPr = el.find(_spPr_tag) if _spPr is None: continue _xfrm = _spPr.find(f"{{{_NS_A}}}xfrm") if _xfrm is None: continue _off = _xfrm.find(f"{{{_NS_A}}}off") if _off is None: continue _off.set("y", str(int(_off.get("y", 0)) + max_shift)) def _apply_template_chrome( cloner: SlideCloner, slide, tmpl_path: str, title: str, dna: dict, intro_height_in: float = 0.0, preserve_group_position: bool = False, ) -> None: """ Copy chrome shapes (watermark, corner accents, left accent) from the template content slide into a bank-based slide, then add the template's title text box filled with *title*. """ from lxml import etree sp_tag = f"{{{_NS_P}}}sp" grp_tag = f"{{{_NS_P}}}grpSp" pic_tag = f"{{{_NS_P}}}pic" # Chrome zones in the template content slide: # • Bottom watermark area (y > 6.3 in) # • Top-right corner accent (x > 8.0 in, y < 2.0 in) # • Bottom-right corner accent (x > 8.0 in, bottom edge > 5.5 in) # • Left accent strip / corner image (x < 0.6 in) _CHROME_Y_BOTTOM = int(6.3 * _EMU) _CHROME_X_RIGHT = int(8.0 * _EMU) _CHROME_Y_TOP_MAX = int(2.0 * _EMU) _CHROME_Y_BOT_MIN = int(5.5 * _EMU) # bottom-right accent: shape bottom > 5.5 in _CHROME_X_LEFT = int(0.6 * _EMU) def _is_chrome(el): # Include pic elements (corner/border images) as well as sp/grpSp if el.tag not in (sp_tag, grp_tag, pic_tag): return False if el.find(f".//{{{_NS_P}}}ph") is not None: return False # Any non-logo text means this is a content slot (body placeholder, # marker word like "Text", body freeform, etc.) — NOT chrome. # Genuine chrome (borders, watermarks, corner accents) is either # empty or contains the brand logo text only. _t_tag = f"{{{_NS_A}}}t" text = "".join(t.text or "" for t in el.iter(_t_tag)).strip() if text and not any(tok in text.lower() for tok in _LOGO_TOKENS): return False x, y, cx, cy = _get_grp_xfrm(el) return ( y > _CHROME_Y_BOTTOM or (x > _CHROME_X_RIGHT and y < _CHROME_Y_TOP_MAX) or (x > _CHROME_X_RIGHT and (y + cy) > _CHROME_Y_BOT_MIN) or x < _CHROME_X_LEFT ) # Title zone = the actual title placeholder position from the template DNA, # with a small tolerance. Using a hardcoded zone fails for templates whose # body placeholder sits right at the lower edge (e.g. body at y=1.397 in). _t_y = int(dna["title"]["y"] * _EMU) _t_cy = int(dna["title"]["cy"] * _EMU) _TOL = int(0.05 * _EMU) _TITLE_Y1 = max(0, _t_y - _TOL) _TITLE_Y2 = _t_y + _t_cy + _TOL def _is_chrome_or_title(el): if _is_chrome(el): return True if el.tag != sp_tag: return False if el.find(f".//{{{_NS_P}}}ph") is not None: return False x, y, cx, cy = _get_grp_xfrm(el) txb = el.find(f"{{{_NS_P}}}txBody") return txb is not None and _TITLE_Y1 < y < _TITLE_Y2 spTree = slide.shapes._spTree count_before = len(list(spTree)) cloner.copy_elements_from( tmpl_path, dna["slide_indices"]["content"], slide, filter_fn=_is_chrome_or_title ) # Find the newly added title text box (largest cx in title zone) title_el = None best_cx = 0 for el in list(spTree)[count_before:]: if el.tag != sp_tag: continue txb = el.find(f"{{{_NS_P}}}txBody") if txb is None: continue x, y, cx, cy = _get_grp_xfrm(el) if _TITLE_Y1 < y < _TITLE_Y2 and cx > best_cx: best_cx = cx title_el = el if title_el is not None: txb = title_el.find(f"{{{_NS_P}}}txBody") if txb is not None: # Bank slides (overview_groups, groups) put their visual content in # the body area (y > 1.5); the title row is empty so the title can # use the wide layout. Card slides pass preserve_group_position=True # because their cards may extend into the title row. wide_title = not preserve_group_position title_cx_target = title_cx_for_layout(dna, has_right_content=not wide_title) title_cy_in = (_TITLE_Y2 - _TITLE_Y1) / _EMU if wide_title and title_cx_target > best_cx / _EMU + 0.05: _resize_shape( txb, title_cx_target, title_cy_in, new_x_in=dna["title"]["x"], new_y_in=dna["title"]["y"], ) _set_text(txb, title, cx_in=title_cx_target, cy_in=title_cy_in, min_font_pt=32, font_family=dna.get("brand_font") or None) chrome_shapes = list(spTree)[count_before:] _SLIDE_WIDTH = int(13.33 * _EMU) _GAP_BELOW_TTL = int(0.50 * _EMU) # desired gap between title bottom and first visual content # Find the main infographic group (largest non-chrome grpSp) main_group = None max_area = 0 for el in spTree: if el in chrome_shapes: continue if el.tag != grp_tag: continue x, y, cx, cy = _get_grp_xfrm(el) if cx > int(0.5 * _EMU) and cy > int(0.5 * _EMU): if cx * cy > max_area: max_area = cx * cy main_group = el if main_group is not None and not preserve_group_position: grp_x, grp_y, grp_cx, grp_cy = _get_grp_xfrm(main_group) grpSpPr = main_group.find(f"{{{_NS_P}}}grpSpPr") # Read child coordinate system to compute group→slide mapping chOff_y = 0 chExt_cy = grp_cy if grpSpPr is not None: xf = grpSpPr.find(f"{{{_NS_A}}}xfrm") if xf is not None: chO = xf.find(f"{{{_NS_A}}}chOff") chE = xf.find(f"{{{_NS_A}}}chExt") if chO is not None: chOff_y = int(chO.get("y", 0)) if chE is not None: chExt_cy = int(chE.get("cy", 0)) or grp_cy # Find topmost VISIBLE child (local/child coords). # Rules: # - Skip full-width header txBoxes (blanked by _strip_bank_header). # - Skip blank txBoxes (background/placeholder shapes with no text) — # these would drag the infographic down, hiding the real visual content. # - grpSp visual shapes and filled label txBoxes are always included. _HEADER_CX = int(0.70 * _SLIDE_WIDTH) _HEADER_Y = int(1.00 * _EMU) _t_tag = f"{{{_NS_A}}}t" top_content_y = float("inf") for child in main_group: if child.tag not in (grp_tag, sp_tag): continue cr = _get_grp_xfrm(child) if cr is None or cr[3] <= 0: continue cNv = child.find(f"{{{_NS_P}}}nvSpPr/{{{_NS_P}}}cNvSpPr") is_tx = cNv is not None and cNv.get("txBox") == "1" # Skip full-width header if is_tx and cr[2] >= _HEADER_CX and cr[1] < _HEADER_Y: continue # Skip blank txBoxes (no visible text content) if is_tx: txb = child.find(f"{{{_NS_P}}}txBody") text = "".join(t.text or "" for t in txb.iter(_t_tag)).strip() if txb is not None else "" if not text: continue top_content_y = min(top_content_y, cr[1]) if top_content_y == float("inf"): top_content_y = chOff_y # Slide-space y we want the first content to appear at. # Intro text is placed at dna["body"]["y"] (just below the title placeholder), # so when intro exists the infographic must clear body_y + intro_height. title_r = _get_grp_xfrm(title_el) title_bottom = title_r[1] + title_r[3] if intro_height_in > 0: _cz_y = dna.get("content_zone", dna["body"])["y"] target_slide_y = int(_cz_y * _EMU) + int(intro_height_in * _EMU) + int(0.15 * _EMU) else: target_slide_y = title_bottom + _GAP_BELOW_TTL # Inverse of: slide_y = off_y + (child_y - chOff_y) * scale_y scale_y = (grp_cy / chExt_cy) if chExt_cy > 0 else 1.0 new_off_y = int(target_slide_y - (top_content_y - chOff_y) * scale_y) new_off_y = max(0, new_off_y) # never above slide top # Scale group down if it would overflow the slide bottom. # Change ext.cy/cx (keeps chOff/chExt fixed so PPTX scales all children). _SLIDE_HEIGHT = int(7.50 * _EMU) available_h = _SLIDE_HEIGHT - new_off_y new_cx, new_cy = grp_cx, grp_cy if grp_cy > available_h > 0: new_cy = available_h new_cx = int(grp_cx * new_cy / grp_cy) if grp_cy > 0 else grp_cx # Center horizontally with (possibly scaled) width center_x = max(0, (_SLIDE_WIDTH - new_cx) // 2) # Apply position and (if needed) scaled size if grpSpPr is not None: xf = grpSpPr.find(f"{{{_NS_A}}}xfrm") if xf is not None: off_el = xf.find(f"{{{_NS_A}}}off") ext_el = xf.find(f"{{{_NS_A}}}ext") if off_el is not None: if abs(center_x - grp_x) > int(0.05 * _EMU): off_el.set("x", str(center_x)) off_el.set("y", str(new_off_y)) if ext_el is not None and (new_cx != grp_cx or new_cy != grp_cy): ext_el.set("cx", str(new_cx)) ext_el.set("cy", str(new_cy)) # ── Z-order: push chrome decoration to the BACK ────────────────────────── # copy_elements_from() appends to spTree, which puts chrome on TOP of the # bank's body content. That means the corner-accent pictures render over # any overflowing bullet text. Move pure-decoration chrome (pictures and # logo watermarks) to the front of spTree so they sit behind the body in # z-order. The title text element is left at its original (top) position # so it stays visible above the body. _logo_tags = _LOGO_TOKENS def _is_pure_decoration(el): if el is title_el: return False if el.tag == pic_tag: return True if el.tag == sp_tag: # Logo watermark text is also pure decoration _t_tag = f"{{{_NS_A}}}t" text = "".join(t.text or "" for t in el.iter(_t_tag)).strip().lower() if text and any(tok in text for tok in _logo_tags): return True # Empty auto-shape used as a border accent if not text: return True return False # spTree starts with mandatory + headers; insert # after those (typically index 2). Locate the first non-header position. header_offset = 0 for child in spTree: local = etree.QName(child).localname if local in ("nvGrpSpPr", "grpSpPr"): header_offset += 1 else: break insert_at = header_offset for el in list(spTree)[count_before:]: if _is_pure_decoration(el): spTree.remove(el) spTree.insert(insert_at, el) insert_at += 1 def _harmonise_slide_colors(slide, brand_color: str) -> None: """ Selective brand color harmonization for a bank-as-base slide. Replaces only the DOMINANT ACCENT color — the most common srgbClr that is saturated (not gray/cream/white) and mid-brightness (not background or text). Leaves neutral fills (white, cream, light panels) untouched. Also converts accent1-6 schemeClr references to the brand color. """ from collections import Counter from lxml import etree target = brand_color.upper() spTree = slide.shapes._spTree def _is_accent(hex6: str) -> bool: try: r, g, b = int(hex6[0:2],16), int(hex6[2:4],16), int(hex6[4:6],16) brightness = (r + g + b) / 3 saturation = max(r, g, b) - min(r, g, b) return 20 < brightness < 215 and saturation > 45 except Exception: return False # Collect all srgbClr fills that qualify as accent accent_vals: list[str] = [] for srgb in spTree.iter(f"{{{_NS_A}}}srgbClr"): val = srgb.get("val", "").upper() if len(val) == 6 and _is_accent(val): accent_vals.append(val) if accent_vals: dominant = Counter(accent_vals).most_common(1)[0][0] if dominant != target: for srgb in spTree.iter(f"{{{_NS_A}}}srgbClr"): if srgb.get("val", "").upper() == dominant: srgb.set("val", target) # Replace schemeClr accent1-6 with explicit brand color _ACCENT_SCHEMES = {"accent1","accent2","accent3","accent4","accent5","accent6"} for schemeClr in list(spTree.iter(f"{{{_NS_A}}}schemeClr")): if schemeClr.get("val", "") not in _ACCENT_SCHEMES: continue parent = schemeClr.getparent() if parent is None: continue idx = list(parent).index(schemeClr) parent.remove(schemeClr) new_srgb = etree.Element(f"{{{_NS_A}}}srgbClr") new_srgb.set("val", target) parent.insert(idx, new_srgb) def _cluster_txboxes(text_els: list) -> list[list]: """ Group text boxes by 2D visual position. Two consecutive boxes (sorted by x, then y) belong to the same cluster when: dx < 0.5 in AND dy < 1.0 in (They're near the same visual element — label + desc slots for one item.) Returns list of clusters; each cluster is a list of (y, x, cy, el) tuples. The first element in each cluster is the primary label slot. """ _CLUSTER_DX = int(0.50 * _EMU) _CLUSTER_DY = int(1.00 * _EMU) if not text_els: return [] sorted_t = sorted(text_els, key=lambda t: (t[1], t[0])) # by (x, y) clusters: list[list] = [[sorted_t[0]]] for tup in sorted_t[1:]: prev = clusters[-1][-1] dx = abs(tup[1] - prev[1]) dy = abs(tup[0] - prev[0]) if dx < _CLUSTER_DX and dy < _CLUSTER_DY: clusters[-1].append(tup) else: clusters.append([tup]) return clusters def _collect_visual_shapes(container) -> list: """ Collect visual (non-text) shapes from a slide container element (spTree or grpSp). Returns list of (x, y, cx, cy, center_x, center_y) tuples. Excludes text boxes, placeholders, zero-size shapes, and full-slide backgrounds. """ result = [] sp_tag = f"{{{_NS_P}}}sp" grp_tag = f"{{{_NS_P}}}grpSp" _MIN_DIM = int(0.3 * _EMU) # must be at least 0.3in to be a content shape _MAX_CX = int(8.5 * _EMU) # exclude full-width background banners _MAX_CY = int(5.0 * _EMU) # exclude full-height background shapes for el in container: if el.tag == grp_tag: x, y, cx, cy = _get_grp_xfrm(el) if _MIN_DIM <= cx <= _MAX_CX and _MIN_DIM <= cy <= _MAX_CY: result.append((x, y, cx, cy, x + cx // 2, y + cy // 2)) elif el.tag == sp_tag: if el.find(f".//{{{_NS_P}}}ph") is not None: continue cNv = el.find(f"{{{_NS_P}}}nvSpPr/{{{_NS_P}}}cNvSpPr") if cNv is not None and cNv.get("txBox") == "1": continue spPr = el.find(f"{{{_NS_P}}}spPr") if spPr is None: continue if spPr.find(f"{{{_NS_A}}}noFill") is not None: continue has_fill = ( spPr.find(f".//{{{_NS_A}}}solidFill") is not None or spPr.find(f".//{{{_NS_A}}}gradFill") is not None or spPr.find(f".//{{{_NS_A}}}pattFill") is not None ) if not has_fill: continue x, y, cx, cy = _get_grp_xfrm(el) if _MIN_DIM <= cx <= _MAX_CX and _MIN_DIM <= cy <= _MAX_CY: result.append((x, y, cx, cy, x + cx // 2, y + cy // 2)) return result def _label_font_pt(cx_emu: int, cy_emu: int, label: str) -> float: """ Return the largest font size (pt) where *label* fits in a box of cx × cy EMU **without breaking a single word across lines**. Picks the largest pt such that the longest single word fits on one line AND the whole label fits within 2 visible lines vertically — at any size down to 7pt. The longest-word rule is what stops "Demonstrative" wrapping as "Demonstrat\\nive" inside narrow card-label boxes. """ if cx_emu <= 0 or cy_emu <= 0 or not label: return 11.0 cx_in = cx_emu / _EMU cy_in = cy_emu / _EMU effective_cx = max(cx_in - 0.2, 0.3) # account for default lIns/rIns words = label.split() longest_word_len = max((len(w) for w in words), default=len(label)) for pt in (14, 13, 12, 11, 10, 9, 8, 7): char_w_in = pt * 0.0095 # bold avg char width per pt per_line = effective_cx / char_w_in if per_line <= 0: continue if longest_word_len > per_line: continue # would split this word line_h_in = pt * 1.35 / 72.0 n_lines = math.ceil(len(label) / per_line) if n_lines * line_h_in <= cy_in * 0.92 and n_lines <= 2: return float(pt) return 7.0 def _visual_center_fill(txt_els: list, vis_els: list, items: list[dict]) -> None: """ Assign item labels to text boxes via visual-element nearest-neighbour matching. For each text box, find the nearest visual shape (circle, bar, card, etc.) and group the text boxes by that shape. Then sort shapes by reading order (top-left → bottom-right) and assign one item label per shape group. Within each group the largest text box (by area) is the primary label slot; all others are blanked. Font size is calculated to fit the label within the primary box. txt_els : (y, x, cx, cy, el) tuples from _collect_bank_text_boxes or direct scan vis_els : (x, y, cx, cy, center_x, center_y) from _collect_visual_shapes items : [{"label": str}, ...] ordered list of content items """ groups: dict[int, list] = {i: [] for i in range(len(vis_els))} for tup in txt_els: ty, tx, tcx, tcy, el = tup tc_x = tx + tcx // 2 tc_y = ty + tcy // 2 nearest = min( range(len(vis_els)), key=lambda i: (tc_x - vis_els[i][4]) ** 2 + (tc_y - vis_els[i][5]) ** 2, ) groups[nearest].append(tup) sorted_vis = sorted(range(len(vis_els)), key=lambda i: (vis_els[i][1], vis_els[i][0])) # (y, x) for slot_i, vi in enumerate(sorted_vis): cluster = groups[vi] if slot_i < len(items) and cluster: cluster.sort(key=lambda t: t[2] * t[3], reverse=True) # largest box first y, x, cx, cy, el = cluster[0] label = items[slot_i].get("label", "") pt = _label_font_pt(cx, cy, label) _replace_sp_text(el, label, font_pt=pt, bold=True, center_align=True) for _, _, _, _, sp_el in cluster[1:]: _replace_sp_text(sp_el, "") else: for _, _, _, _, sp_el in cluster: _replace_sp_text(sp_el, "") def _fill_bank_slide_by_tag(slide, items: list[dict]) -> bool: """ Tag-based fill for the new tagged-infographics bank. Each shape that needs content has a placeholder text tag identifying its role and position: "Item 1", "Item 2", ... "Item N" → label slots, ordered by tag number "1.", "2.", ... OR "1", "2", ... → number badge slots "Text here" → generic placeholder (cleared if unused) For each item in *items*, find the shape with text exactly matching "Item {i+1}" (case-insensitive, whitespace-normalised) and replace its text with the item's label. Number badge tags are left as-is when their index is within range, otherwise their text is cleared. Unused "Item N" slots beyond len(items) have their text cleared. Honours the placeholder's existing so font sizing is governed by the source designer, not by our rendering layer. Returns True if at least one item was placed; False if no Item tags were found (caller can fall back to the spatial filler). """ import re as _re sp_tag = f"{{{_NS_P}}}sp" t_tag = f"{{{_NS_A}}}t" item_re = _re.compile(r"^\s*Item\s+(\d+)\s*$", _re.IGNORECASE) num_re = _re.compile(r"^\s*(\d+)\.?\s*$") # Collect every tagged sp on the slide. item_slots: dict[int, object] = {} # idx (1-based) → sp element number_slots: dict[int, object] = {} # idx (1-based) → sp element text_here_slots: list[object] = [] for sp in slide.shapes._spTree.iter(sp_tag): text = "".join(t.text or "" for t in sp.iter(t_tag)).strip() if not text: continue m = item_re.match(text) if m: item_slots[int(m.group(1))] = sp continue m = num_re.match(text) if m and len(text) <= 4: number_slots[int(m.group(1))] = sp continue if text.lower() == "text here": text_here_slots.append(sp) if not item_slots: return False n_items = len(items) # Compute a single font size that makes EVERY label on this slide fit # cleanly (no mid-word breaks) given the narrowest item box on the slide. # Applies uniformly so all labels render at the same point size — matching # the source designer's intent for slide-wide consistency. uniform_pt = _compute_uniform_label_pt(item_slots, items) # Fill / clear "Item N" slots for idx, sp in item_slots.items(): if idx <= n_items: label = items[idx - 1].get("label", "") if uniform_pt is not None: _force_first_run_font_size(sp, uniform_pt) _replace_text_keeping_run_style(sp, label) else: _clear_sp_text(sp) # Number badges: keep tags 1..n_items, clear the rest for idx, sp in number_slots.items(): if idx > n_items: _clear_sp_text(sp) # else: leave the badge text as the source authored it (e.g. "1.") # Clear "Text here" generic placeholders so they don't render for sp in text_here_slots: _clear_sp_text(sp) return True def _compute_uniform_label_pt(item_slots: dict, items: list[dict]) -> float | None: """ Pick a font size that makes every item label fit in its slot box WIDTH AND HEIGHT — applied uniformly so all labels render at the same size. Strategy for each (slot, label) pair: • box_cx × 144 / longest_word_chars → max pt that prevents mid-word breaks (longest single word fits one line at this pt) • box_cx × 144 / total_chars → pt at which the whole label fits one line (best — no wrap) • If full-label-on-one-line pt ≥ 10pt, use it. Otherwise, fall back to the longest-word constraint (allows wrap at word boundaries). • Also bound by box HEIGHT: label must fit in at most 2 wrapped lines within the box cy. If 2 lines × line_height > box_cy, shrink further. Take the minimum across all pairs; clamp into [10pt, source_default_pt]. """ if not item_slots or not items: return None # Source default pt — read from any slot's first run rPr source_pt = 16.0 for sp in item_slots.values(): rPr = sp.find(f".//{{{_NS_A}}}rPr") if rPr is not None and rPr.get("sz"): try: source_pt = int(rPr.get("sz")) / 100.0 break except Exception: pass SAFETY = 0.90 LINE_SPACING = 1.15 min_pt = source_pt for idx, sp in item_slots.items(): if idx > len(items): continue label = (items[idx - 1].get("label") or "").strip() if not label: continue # Box geometry spPr = sp.find(f"{{{_NS_P}}}spPr") xf = spPr.find(f"{{{_NS_A}}}xfrm") if spPr is not None else None ext = xf.find(f"{{{_NS_A}}}ext") if xf is not None else None if ext is None: continue try: cx_in = int(ext.get("cx", 0)) / _EMU cy_in = int(ext.get("cy", 0)) / _EMU except Exception: continue if cx_in <= 0 or cy_in <= 0: continue n_chars = len(label) longest_word = max(label.split(), key=len, default=label) n_longword = max(len(longest_word), 1) # Width constraint: try whole-label-one-line first pt_full = (cx_in * 144 * SAFETY) / n_chars # whole label on 1 line pt_word = (cx_in * 144 * SAFETY) / n_longword # longest word on 1 line width_pt = pt_full if pt_full >= 10.0 else pt_word # Height constraint: label may wrap. Cap so wrapped lines fit box cy. # n_lines_at(pt) ≈ ceil(n_chars × pt / 144 / box_cx) # height_used = n_lines × pt × LINE_SPACING / 72 # Solve so height_used <= cy_in × 0.95 (5% safety margin): max_lines = max(int(cy_in * 0.95 * 72 / (width_pt * LINE_SPACING)), 1) # If label needs more lines than fit, shrink width_pt until it does if max_lines < 2: # Box too short for 2 lines at width_pt — force 1-line fit width_pt = min(width_pt, pt_full) # Then iteratively reduce until 1 line fits while width_pt > 10.0: chars_per_line = cx_in * 144 * SAFETY / width_pt if n_chars <= chars_per_line: break width_pt -= 0.5 else: # 2 lines OK — width_pt already prevents mid-word breaks pass if width_pt < min_pt: min_pt = width_pt return max(10.0, min(min_pt, source_pt)) def _force_first_run_font_size(sp_el, pt: float) -> None: """Set sz on the first inside an sp's txBody to *pt* (in points).""" txb = sp_el.find(f"{{{_NS_P}}}txBody") if txb is None: return rPr = txb.find(f".//{{{_NS_A}}}rPr") if rPr is None: return rPr.set("sz", str(int(pt * 100))) def _clear_sp_text(sp_el) -> None: """Empty all text inside an sp's txBody while preserving the txBody itself.""" from lxml import etree txb = sp_el.find(f"{{{_NS_P}}}txBody") if txb is None: return for p in list(txb.findall(f"{{{_NS_A}}}p")): txb.remove(p) # Add one empty so the box still exists but is blank etree.SubElement(txb, f"{{{_NS_A}}}p") def _fill_bank_slide_inplace( slide, items: list[dict], has_descriptions: bool ) -> None: """ Fill content items into the bank slide's existing text slots in-place. Tries Pattern B (direct txBox sp) first, then Pattern A (group shapes). Pattern B clustering: text boxes that are visually co-located (dx < 0.5 in AND dy < 1.0 in from their neighbour) are treated as a label+desc pair for ONE item. Only the first box in each cluster receives the label; the rest are removed (we never write descriptions into bank shapes). """ spTree = slide.shapes._spTree grp_tag = f"{{{_NS_P}}}grpSp" text_els = _collect_bank_text_boxes(spTree) # (y, x, cx, cy, el) direct txBox if text_els: vis_els = _collect_visual_shapes(spTree) if vis_els and len(vis_els) >= len(items): _visual_center_fill(text_els, vis_els, items) else: clusters = _cluster_txboxes(text_els) for i, cluster in enumerate(clusters): if i < len(items): y, x, cx, cy, el = cluster[0] pt = _label_font_pt(cx, cy, items[i].get("label", "")) _replace_sp_text(el, items[i].get("label", ""), font_pt=pt, bold=True, center_align=True) for tup in cluster[1:]: p = tup[4].getparent() if p is not None: p.remove(tup[4]) else: for tup in cluster: p = tup[4].getparent() if p is not None: p.remove(tup[4]) else: group_els = [el for el in spTree if el.tag == grp_tag] labels_only = [{"label": it.get("label", "")} for it in items] _fill_group_text(group_els, labels_only) # ═══════════════════════════════════════════════════════════════════════════════ # Bank group extraction and embedding # ═══════════════════════════════════════════════════════════════════════════════ def _count_usable_group_slots(bank_path: str, bank_slide_idx: int) -> int: """ Count how many top-level group slots a bank slide actually provides for Pattern A (text-in-groups) filling. Applies the same Stage-1 and Stage-2 filters as _fill_group_text so the selector can verify a bank slide before committing to it. Returns 0 if the file can't be opened or has no groups. Also returns the count of direct txBox sp elements for Pattern B slides. """ from pptx import Presentation _MIN_GRP_CX = int(0.7 * _EMU) _MIN_GRP_CY = int(0.5 * _EMU) _LARGE_VIS = int(2.0 * _EMU) def _is_purely_visual_local(grp_el) -> bool: _sp_tag = f"{{{_NS_P}}}sp" _grp_tag = f"{{{_NS_P}}}grpSp" for child in grp_el: if child.tag == _grp_tag: return False if child.tag == _sp_tag: cNvSpPr = child.find(f"{{{_NS_P}}}nvSpPr/{{{_NS_P}}}cNvSpPr") if cNvSpPr is not None and cNvSpPr.get("txBox") == "1": return False return True try: prs = Presentation(bank_path) spTree = prs.slides[bank_slide_idx].shapes._spTree except Exception: return 0 grp_tag = f"{{{_NS_P}}}grpSp" sp_tag = f"{{{_NS_P}}}sp" # Check Pattern B first: direct txBox sp elements in spTree _TITLE_Y = int(0.7 * _EMU) _MAX_LBL_W = int(3.5 * _EMU) # watermarks/title bars are wider than this direct_txbox = [ el for el in spTree if el.tag == sp_tag and el.find(f".//{{{_NS_P}}}ph") is None and (lambda c: c is not None and c.get("txBox") == "1")( el.find(f"{{{_NS_P}}}nvSpPr/{{{_NS_P}}}cNvSpPr") ) and 45_720 <= _get_grp_xfrm(el)[2] <= _MAX_LBL_W and _get_grp_xfrm(el)[1] >= _TITLE_Y ] if direct_txbox: # Cluster by 2D proximity — two boxes are co-located (label+desc pair for # ONE item) when consecutive sorted boxes have dx < 0.5in AND dy < 1.0in. _CLUSTER_DX = int(0.50 * _EMU) _CLUSTER_DY = int(1.00 * _EMU) positions = sorted((_get_grp_xfrm(el)[0], _get_grp_xfrm(el)[1]) for el in direct_txbox) n_clusters = 1 for k in range(1, len(positions)): dx = abs(positions[k][0] - positions[k-1][0]) dy = abs(positions[k][1] - positions[k-1][1]) if dx >= _CLUSTER_DX or dy >= _CLUSTER_DY: n_clusters += 1 return n_clusters # Pattern A: count usable top-level group slots group_els = [el for el in spTree if el.tag == grp_tag] sized = [ g for g in group_els if _get_grp_xfrm(g)[2] >= _MIN_GRP_CX and _get_grp_xfrm(g)[3] >= _MIN_GRP_CY ] if not sized: sized = group_els non_visual = [ g for g in sized if not ( _get_grp_xfrm(g)[2] >= _LARGE_VIS and _get_grp_xfrm(g)[3] >= _LARGE_VIS and _is_purely_visual_local(g) ) ] if not non_visual: non_visual = [ g for g in group_els if not ( _get_grp_xfrm(g)[2] >= _LARGE_VIS and _get_grp_xfrm(g)[3] >= _LARGE_VIS and _is_purely_visual_local(g) ) ] if not non_visual: non_visual = group_els # If exactly one wrapper group remains, drill down to count the actual # per-item card sub-grpSp. Without this, bank slides that wrap N cards # inside one or two outer grpSp layers report 1 usable slot instead of N. if len(non_visual) == 1: inner = [c for c in non_visual[0] if c.tag == grp_tag] if len(inner) == 1: # Double-wrapped (outer → 1 inner → N cards): go one more level. inner2 = [c for c in inner[0] if c.tag == grp_tag] if inner2: return len(inner2) if inner: return len(inner) return len(non_visual) def _embed_bank_groups( dest_slide, bank_path: str, bank_slide_idx: int, items: list[dict], lower_zone: dict, brand_color: str, has_descriptions: bool = False, ) -> bool: """ Extract infographic shapes from a bank slide and place them in the lower zone of *dest_slide*, colour-harmonised to *brand_color*. Bank slides use two patterns: A) Text-in-groups: the grpSp cards contain txBody sp children — legacy pattern. B) Separate text boxes: grpSp = visual decoration (icons); text lives in direct txBox sp children of the spTree at specific x/y positions. This function handles both by copying BOTH grpSp and the direct txBox sp elements, then filling text into whichever set has the content slots. """ from pptx import Presentation try: prs = Presentation(bank_path) bank_slide = prs.slides[bank_slide_idx] except Exception as exc: print(f" [warn] Cannot open bank slide {bank_path}:{bank_slide_idx}: {exc!r}") return False dest_spTree = dest_slide.shapes._spTree bank_spTree = bank_slide.shapes._spTree grp_tag = f"{{{_NS_P}}}grpSp" sp_tag = f"{{{_NS_P}}}sp" # ── Collect shapes from bank slide ──────────────────────────────────────── visual_els = [el for el in bank_spTree if el.tag == grp_tag] text_els = _collect_bank_text_boxes(bank_spTree) # direct txBox sp shapes all_source_els = visual_els + [el for _, _, _, el in text_els] if not all_source_els: print(f" [warn] No infographic shapes in {bank_path} slide {bank_slide_idx}") return False # ── Bounding box of ALL shapes (visual + text) ──────────────────────────── orig_bounds = _shapes_bounding_box(all_source_els) if orig_bounds is None: return False orig_x, orig_y, orig_cx, orig_cy = orig_bounds # ── Scale to fit lower_zone ─────────────────────────────────────────────── lz_x = lower_zone["x"] * _EMU lz_y = lower_zone["y"] * _EMU lz_cx = lower_zone["cx"] * _EMU lz_cy = lower_zone["cy"] * _EMU scale_x = lz_cx / orig_cx if orig_cx > 0 else 1.0 scale_y = lz_cy / orig_cy if orig_cy > 0 else 1.0 scale = min(scale_x, scale_y, 1.2) off_x = int(lz_x - orig_x * scale) off_y = int(lz_y - orig_y * scale) # ── Deep-copy, reposition, colour-harmonise ─────────────────────────────── new_visual_els = [] for el in visual_els: new_el = copy.deepcopy(el) _rescale_and_reposition_group(new_el, scale, off_x, off_y) _harmonise_group_colors(new_el, brand_color) new_visual_els.append(new_el) # text_els tuples: (orig_y, orig_x, orig_cy, el) new_text_tuples = [] for (oy, ox, ocy, el) in text_els: new_el = copy.deepcopy(el) _rescale_and_reposition_sp(new_el, scale, off_x, off_y) _harmonise_group_colors(new_el, brand_color) new_text_tuples.append((oy, ox, ocy, new_el)) # Sort text boxes by (x, y, cy) — column-first ordering so each column's # label and description stay together as a pair. # (y, x) would cluster all labels across columns before all descriptions, # scrambling the label+desc pairing for stride=2 assignment.) new_text_tuples.sort(key=lambda t: (t[1], t[0], t[2])) new_text_els = [t[3] for t in new_text_tuples] # ── Fix off-screen: shift everything down if any shape bleeds above lz_y ─ all_new_els = new_visual_els + new_text_els top_ys = [_get_shape_top_y(el) for el in all_new_els] min_top = min(top_ys) if top_ys else lz_y if min_top < lz_y: extra = int(lz_y - min_top) for el in all_new_els: _shift_shape_y(el, extra) # ── Append to dest spTree ───────────────────────────────────────────────── for el in new_visual_els: dest_spTree.append(el) for el in new_text_els: dest_spTree.append(el) # ── Fill text ───────────────────────────────────────────────────────────── if new_text_els: # Pattern B: fill direct text boxes (sorted by position) _fill_direct_text_boxes(new_text_els, items, has_descriptions) else: # Pattern A: fill text inside group shapes (legacy bank slides) appended_grps = [el for el in dest_spTree if el.tag == grp_tag] new_groups = appended_grps[-len(visual_els):] _fill_group_text(new_groups, items) return True def _shapes_bounding_box(els: list) -> tuple | None: """Return (min_x, min_y, total_cx, total_cy) in EMU for a mixed list of grpSp/sp.""" min_x = min_y = float("inf") max_x = max_y = float("-inf") for el in els: x, y, cx, cy = _get_grp_xfrm(el) if cx == 0 and cy == 0: continue min_x = min(min_x, x) min_y = min(min_y, y) max_x = max(max_x, x + cx) max_y = max(max_y, y + cy) if min_x == float("inf"): return None return min_x, min_y, max_x - min_x, max_y - min_y def _get_grp_xfrm(el) -> tuple[int, int, int, int]: """Return (x, y, cx, cy) in EMU for a grpSp or sp element.""" # grpSp uses grpSpPr/xfrm; sp uses spPr/xfrm spPr_tags = [ f"{{{_NS_P}}}grpSpPr", f"{{{_NS_P}}}spPr", ] for tag in spPr_tags: spPr = el.find(tag) if spPr is not None: xfrm = spPr.find(f"{{{_NS_A}}}xfrm") if xfrm is not None: off = xfrm.find(f"{{{_NS_A}}}off") ext = xfrm.find(f"{{{_NS_A}}}ext") if off is not None and ext is not None: x = int(off.get("x", 0)) y = int(off.get("y", 0)) cx = int(ext.get("cx", 0)) cy = int(ext.get("cy", 0)) return x, y, cx, cy return 0, 0, 0, 0 def _rescale_and_reposition_group(el, scale: float, off_x: int, off_y: int) -> None: """ Reposition and scale a group shape by updating only its top-level off/ext. PPTX group coordinate model: grpSpPr/xfrm/off — group position in parent (slide) space grpSpPr/xfrm/ext — group size in parent space grpSpPr/xfrm/chOff — origin of child coordinate system (unchanged) grpSpPr/xfrm/chExt — size of child coordinate system (unchanged) Children use the child coordinate system. The renderer maps them into parent space via: parent_x = off.x + (child_x - chOff.x) * (ext.cx / chExt.cx) So updating only off/ext (keeping chOff/chExt fixed) correctly scales and repositions all children without touching their own coordinates. """ spPr_tag = f"{{{_NS_P}}}grpSpPr" spPr = el.find(spPr_tag) if spPr is None: return xfrm = spPr.find(f"{{{_NS_A}}}xfrm") if xfrm is None: return off = xfrm.find(f"{{{_NS_A}}}off") ext = xfrm.find(f"{{{_NS_A}}}ext") if off is not None: x = int(off.get("x", 0)) y = int(off.get("y", 0)) off.set("x", str(int(x * scale + off_x))) off.set("y", str(int(y * scale + off_y))) if ext is not None: cx = int(ext.get("cx", 0)) cy = int(ext.get("cy", 0)) ext.set("cx", str(int(cx * scale))) ext.set("cy", str(int(cy * scale))) def _min_child_y_in_parent_space(grp_el) -> float: """ Return the minimum y in SLIDE (parent) space occupied by any direct child of *grp_el* after it has been repositioned. Children whose child-space y < chOff.y map to ABOVE the group's off.y in slide space — causing them to bleed into the intro-text zone. This function finds the worst-case upward extension so callers can shift groups down. """ spPr = grp_el.find(f"{{{_NS_P}}}grpSpPr") if spPr is None: return float("inf") xfrm = spPr.find(f"{{{_NS_A}}}xfrm") if xfrm is None: return float("inf") off_el = xfrm.find(f"{{{_NS_A}}}off") ext_el = xfrm.find(f"{{{_NS_A}}}ext") chOff_el = xfrm.find(f"{{{_NS_A}}}chOff") chExt_el = xfrm.find(f"{{{_NS_A}}}chExt") if off_el is None: return float("inf") grp_y = int(off_el.get("y", 0)) grp_cy = int(ext_el.get("cy", 0)) if ext_el is not None else 0 chOff_y = int(chOff_el.get("y", 0)) if chOff_el is not None else 0 chExt_cy = int(chExt_el.get("cy", 0)) if chExt_el is not None else grp_cy if chExt_cy == 0: return float(grp_y) scale_y = grp_cy / chExt_cy min_y = float(grp_y) for child in grp_el: child_spPr_tag = (f"{{{_NS_P}}}grpSpPr" if child.tag == f"{{{_NS_P}}}grpSp" else f"{{{_NS_P}}}spPr") child_spPr = child.find(child_spPr_tag) if child_spPr is None: continue child_xfrm = child_spPr.find(f"{{{_NS_A}}}xfrm") if child_xfrm is None: continue child_off = child_xfrm.find(f"{{{_NS_A}}}off") if child_off is None: continue c_y = int(child_off.get("y", 0)) parent_y = grp_y + (c_y - chOff_y) * scale_y min_y = min(min_y, parent_y) return min_y def _shift_group_y(grp_el, delta_y: int) -> None: """Shift a repositioned grpSp downward by *delta_y* EMU (updates off.y only).""" spPr = grp_el.find(f"{{{_NS_P}}}grpSpPr") if spPr is None: return xfrm = spPr.find(f"{{{_NS_A}}}xfrm") if xfrm is None: return off = xfrm.find(f"{{{_NS_A}}}off") if off is not None: off.set("y", str(int(off.get("y", 0)) + delta_y)) def _get_shape_top_y(el) -> float: """Return the top y coordinate (EMU) of a grpSp or sp element in slide space.""" x, y, cx, cy = _get_grp_xfrm(el) return float(y) def _shift_shape_y(el, delta_y: int) -> None: """Shift any shape element (grpSp or sp) downward by *delta_y* EMU.""" grp_tag = f"{{{_NS_P}}}grpSp" if el.tag == grp_tag: _shift_group_y(el, delta_y) return spPr = el.find(f"{{{_NS_P}}}spPr") if spPr is None: return xfrm = spPr.find(f"{{{_NS_A}}}xfrm") if xfrm is None: return off = xfrm.find(f"{{{_NS_A}}}off") if off is not None: off.set("y", str(int(off.get("y", 0)) + delta_y)) def _rescale_and_reposition_sp(el, scale: float, off_x: int, off_y: int) -> None: """Rescale and reposition a direct sp element (not a group).""" spPr = el.find(f"{{{_NS_P}}}spPr") if spPr is None: return xfrm = spPr.find(f"{{{_NS_A}}}xfrm") if xfrm is None: return off = xfrm.find(f"{{{_NS_A}}}off") ext = xfrm.find(f"{{{_NS_A}}}ext") if off is not None: x = int(off.get("x", 0)) y = int(off.get("y", 0)) off.set("x", str(int(x * scale + off_x))) off.set("y", str(int(y * scale + off_y))) if ext is not None: cx = int(ext.get("cx", 0)) cy = int(ext.get("cy", 0)) ext.set("cx", str(int(cx * scale))) if cy > 0: ext.set("cy", str(int(cy * scale))) # cy==0 means auto-height (spAutoFit) — leave as-is def _collect_bank_text_boxes(bank_spTree) -> list: """ Collect direct txBox sp elements from a bank slide's spTree that are: - txBox=True (explicit text boxes, not auto-shapes) - Not in title zone (y >= 0.7in = 640,080 EMU) - Not zero-width (cx > 0) Returns list of (orig_y, orig_x, orig_cy, el) tuples for position-sorted assignment. """ _TITLE_ZONE_EMU = int(0.7 * _EMU) sp_tag = f"{{{_NS_P}}}sp" result = [] for el in bank_spTree: if el.tag != sp_tag: continue # Skip placeholders (title/body placeholders from the bank template) if el.find(f".//{{{_NS_P}}}ph") is not None: continue # Must be a txBox nvSpPr = el.find(f"{{{_NS_P}}}nvSpPr") if nvSpPr is None: continue cNvSpPr = nvSpPr.find(f"{{{_NS_P}}}cNvSpPr") if cNvSpPr is None or cNvSpPr.get("txBox") != "1": continue # Must have txBody if el.find(f"{{{_NS_P}}}txBody") is None: continue # Get position x, y, cx, cy = _get_grp_xfrm(el) # Skip title-zone shapes if y < _TITLE_ZONE_EMU: continue # Skip degenerate shapes (cx < 0.05in = 45,720 EMU). Some bank # slides store near-zero cx values (1 EMU) for connector shapes. if cx < 45_720: continue result.append((y, x, cx, cy, el)) return result def _fill_direct_text_boxes( sorted_text_els: list, items: list[dict], has_descriptions: bool ) -> None: """ Fill content items into direct text box sp elements sorted by (y, x, cy). With has_descriptions=True: pairs → (label, description) per item With has_descriptions=False: singles → label per item Unused text boxes beyond n_items are removed from the slide. """ if not sorted_text_els or not items: return stride = 2 if has_descriptions else 1 n_items = len(items) for i, item in enumerate(items): base = i * stride if base >= len(sorted_text_els): break label = item.get("label", "") desc = item.get("description", "") _replace_sp_text(sorted_text_els[base], label, bold=True, center_align=True) if has_descriptions and base + 1 < len(sorted_text_els): _replace_sp_text(sorted_text_els[base + 1], desc) # Remove unused text boxes used_count = min(n_items * stride, len(sorted_text_els)) for el in sorted_text_els[used_count:]: parent = el.getparent() if parent is not None: parent.remove(el) def _harmonise_group_colors(el, brand_color: str) -> None: """ Replace the dominant accent colour in bank group shapes with the template brand colour. Two paths: A) Bank uses srgbClr (explicit hex) — find the most common non-neutral fill and swap it for brand_color. B) Bank uses schemeClr (theme-relative) — shapes become invisible when the XML is pasted into a different theme. Replace accent1-6 scheme colours with brand_color as a flat srgbClr so the shapes always render visibly. """ from lxml import etree from collections import Counter target = brand_color.upper() # ── Path A: srgbClr fills ──────────────────────────────────────────────── all_fills: list[str] = [] for srgb in el.iter(f"{{{_NS_A}}}srgbClr"): val = srgb.get("val", "").upper() if len(val) == 6: r, g, b = int(val[0:2],16), int(val[2:4],16), int(val[4:6],16) brightness = (r + g + b) / 3 saturation = max(r, g, b) - min(r, g, b) if brightness > 230 or brightness < 25 or saturation < 30: continue all_fills.append(val) if all_fills: bank_accent = Counter(all_fills).most_common(1)[0][0] if bank_accent != target: for srgb in el.iter(f"{{{_NS_A}}}srgbClr"): if srgb.get("val", "").upper() == bank_accent: srgb.set("val", target) # Don't return — schemeClr fills may coexist with srgbClr fills # ── Path B: schemeClr fills — replace accent1-6 with brand_color ──────── # Shapes that use theme-relative colours render as the template theme colour, # which is often white on a white background → invisible. Replacing accent # scheme slots with a flat srgbClr restores visibility. _ACCENT_SCHEMES = {"accent1", "accent2", "accent3", "accent4", "accent5", "accent6"} for schemeClr in list(el.iter(f"{{{_NS_A}}}schemeClr")): if schemeClr.get("val", "") not in _ACCENT_SCHEMES: continue parent = schemeClr.getparent() if parent is None: continue idx = list(parent).index(schemeClr) parent.remove(schemeClr) new_srgb = etree.Element(f"{{{_NS_A}}}srgbClr") new_srgb.set("val", target) parent.insert(idx, new_srgb) def _fill_group_text(group_els: list, items: list[dict]) -> None: """ Write item labels and descriptions into text boxes in bank group shapes. Handles three bank layouts: A) 1 top-level group with N direct txBox sp as labels (pointer/multi type) → assign items[i].label directly to sp[i] sorted by (y, x) B) 1 top-level group with N sub-grpSp as cards → descend into sub-groups, one per item C) N top-level groups as cards (flat card layout) → assign items[i] to group[i] Unused group slots are blanked so bank placeholder text does not bleed through. Portrait-oriented text boxes (cx < cy) get truncated text to prevent vertical character-by-character wrapping. """ if not group_els or not items: return grp_tag = f"{{{_NS_P}}}grpSp" sp_tag = f"{{{_NS_P}}}sp" def _portrait_safe_label(sp_el, label: str) -> str: """Truncate label if the text box is portrait-oriented (narrow column).""" _, _, cx, cy = _get_grp_xfrm(sp_el) if cx > 0 and cy > 0 and cx < cy * 0.75: words = label.split() return " ".join(words[:3]) return label def _find_icon_shape(text_sps: list): """ Return the best candidate auto-shape to display a label inside a circle. Looks for non-txBox shapes among text_sps that: - Have a solid colour fill (coloured icon circle, not transparent bg) - Are roughly square (0.5 ≤ cx/cy ≤ 2.0) in child-space - Larger than tiny (both dims > _MIN_TEXT_DIM) Returns (shape_el, area) for the largest passing candidate, or None. """ _A_SOLID = f"{{{_NS_A}}}solidFill" candidates = [] for sp in text_sps: cNvSpPr = sp.find(f"{{{_NS_P}}}nvSpPr/{{{_NS_P}}}cNvSpPr") if cNvSpPr is not None and cNvSpPr.get("txBox") == "1": continue # skip txBox, only want auto-shapes spPr = sp.find(f"{{{_NS_P}}}spPr") if spPr is None: continue # Must have a solid colour fill so text is visible against background if spPr.find(f".//{_A_SOLID}") is None: continue _, _, cx, cy = _get_grp_xfrm(sp) if cx < 100_000 or cy < 100_000: continue # tiny icon glyph, skip aspect = cx / cy if cy > 0 else 0 if not (0.4 <= aspect <= 2.5): continue # wide background rectangles are excluded candidates.append((cx * cy, len(candidates), sp)) if not candidates: return None candidates.sort(key=lambda t: t[0], reverse=True) return candidates[0][2] def _write_label_into_icon(sp_el, label: str) -> None: """Write label text centred in white inside an icon auto-shape.""" from lxml import etree txb = sp_el.find(f"{{{_NS_P}}}txBody") if txb is None: return bodyPr = txb.find(f"{{{_NS_A}}}bodyPr") if bodyPr is not None: bodyPr.attrib.pop("vert", None) bodyPr.set("anchor", "ctr") bodyPr.set("anchorCtr", "0") bodyPr.set("wrap", "square") for tag in ("noAutofit", "spAutoFit", "normAutofit"): el = bodyPr.find(f"{{{_NS_A}}}{tag}") if el is not None: bodyPr.remove(el) bodyPr.append(etree.Element(f"{{{_NS_A}}}normAutofit")) for p_el in list(txb.findall(f"{{{_NS_A}}}p")): txb.remove(p_el) new_p = etree.SubElement(txb, f"{{{_NS_A}}}p") pPr = etree.SubElement(new_p, f"{{{_NS_A}}}pPr") pPr.set("algn", "ctr") new_r = etree.SubElement(new_p, f"{{{_NS_A}}}r") rPr = etree.SubElement(new_r, f"{{{_NS_A}}}rPr") rPr.set("lang", "en-US") rPr.set("sz", "1200") # 12 pt start; normAutofit shrinks if needed rPr.set("b", "1") rPr.set("dirty", "0") sf = etree.SubElement(rPr, f"{{{_NS_A}}}solidFill") clr = etree.SubElement(sf, f"{{{_NS_A}}}srgbClr") clr.set("val", "FFFFFF") new_t = etree.SubElement(new_r, f"{{{_NS_A}}}t") new_t.text = label if len(group_els) == 1: top = group_els[0] def _is_txbox_child(child): nvSpPr = child.find(f"{{{_NS_P}}}nvSpPr") if nvSpPr is None: return False cNv = nvSpPr.find(f"{{{_NS_P}}}cNvSpPr") return cNv is not None and cNv.get("txBox") == "1" # Collect direct txBox sp children as (y, x, cx, cy, child). direct_txbox = [] for child in top: if child.tag != sp_tag: continue if child.find(f"{{{_NS_P}}}txBody") is None: continue if _is_txbox_child(child): x, y, cx, cy = _get_grp_xfrm(child) direct_txbox.append((y, x, cx, cy, child)) # Strip infographic header: the topmost direct txBox that is ALONE at its # y-level (no other txBox within 0.3in) AND separated from the next batch # of txBoxes by ≥ 0.5in. Such a box is a visual title/header of the # infographic design, not a content slot. if direct_txbox and len(direct_txbox) > len(items): min_y = min(t[0] for t in direct_txbox) _Y_TOL = int(0.3 * _EMU) at_min = [t for t in direct_txbox if abs(t[0] - min_y) < _Y_TOL] if len(at_min) == 1: rest_ys = [t[0] for t in direct_txbox if abs(t[0] - min_y) >= _Y_TOL] if rest_ys and (min(rest_ys) - min_y) >= int(0.5 * _EMU): _replace_sp_text(at_min[0][4], "") hdr_id = id(at_min[0][4]) direct_txbox = [t for t in direct_txbox if id(t[4]) != hdr_id] # Sub-grpSp children of the outer group. sub_groups = [child for child in top if child.tag == grp_tag] # AI-bank pattern: one visual sub-grpSp whose direct children include the # label txBoxes, while the outer grp's direct txBoxes are description slots. # Detect this when exactly 1 sub-grpSp contains ≥ n_items txBox children. if len(sub_groups) == 1: inner_labels = [] for child in sub_groups[0]: if child.tag == sp_tag and _is_txbox_child(child): x, y, cx, cy = _get_grp_xfrm(child) inner_labels.append((y, x, cx, cy, child)) if len(inner_labels) >= len(items): # Labels live inside the sub-grpSp; direct txboxes are desc slots. inner_sorted = sorted(inner_labels, key=lambda t: t[1]) # sort by local x direct_sorted = sorted(direct_txbox, key=lambda t: t[1]) # Uniform font size across all inner label slots _inner_u_pt: float | None = None for _i2, (_y2, _x2, _cx2, _cy2, _el2) in enumerate(inner_sorted[:len(items)]): _pt2 = _label_font_pt(_cx2, _cy2, items[_i2].get("label", "")) _inner_u_pt = _pt2 if _inner_u_pt is None else min(_inner_u_pt, _pt2) for i, item in enumerate(items): label = _portrait_safe_label( inner_sorted[i][4] if i < len(inner_sorted) else inner_sorted[0][4], item.get("label", ""), ) desc = item.get("description", "") if i < len(inner_sorted): _, _, cx, cy, el = inner_sorted[i] _replace_sp_text(el, label, font_pt=_inner_u_pt or _label_font_pt(cx, cy, label), bold=True, center_align=True) if i < len(direct_sorted): _replace_sp_text(direct_sorted[i][4], desc) for j in range(len(items), len(inner_sorted)): _replace_sp_text(inner_sorted[j][4], "") for j in range(len(items), len(direct_sorted)): _replace_sp_text(direct_sorted[j][4], "") return # Double-wrap pattern: outer → 1 wrapper → N card sub-grpSp. # The single inner group has no txBox children but contains the actual # per-item card containers as its own sub-grpSp. Unwrap one level so # Pattern C can handle those N cards. deep_groups = [c for c in sub_groups[0] if c.tag == grp_tag] if len(deep_groups) >= 2: group_els = deep_groups if len(direct_txbox) >= len(items): # Pre-filter 1: drop very thin connector-label bars when taller shapes # already supply enough slots. Thin bars (cy < 0.45 in) sit between # card rows in 2-row layouts and are not content slots. _THIN_CY = int(0.45 * _EMU) _thin = [t for t in direct_txbox if t[3] < _THIN_CY] _large = [t for t in direct_txbox if t[3] >= _THIN_CY] if _thin and len(_large) >= len(items): for t in _thin: _replace_sp_text(t[4], "") direct_txbox = _large # Pre-filter 2: drop a right-side outlier shape whose x-gap from its # nearest left neighbour exceeds 1.5 in, when removing it still leaves # enough shapes. This removes sidebar/description panels that have been # mixed into the label pool. _X_GAP_OUTLIER = int(1.5 * _EMU) _srt_x = sorted(direct_txbox, key=lambda t: t[1]) if len(_srt_x) >= 2: _rx_gap = _srt_x[-1][1] - _srt_x[-2][1] if _rx_gap > _X_GAP_OUTLIER and len(_srt_x) - 1 >= len(items): _replace_sp_text(_srt_x[-1][4], "") direct_txbox = _srt_x[:-1] # Cluster text boxes by 2D proximity: boxes within _CL_DX/DY of each # other represent the same visual slot (e.g. label + secondary line for # one circle/arrow). Sort clusters by (avg_y, avg_x) reading order and # assign one item per cluster. _CL_DX = int(0.6 * _EMU) _CL_DY = int(0.8 * _EMU) sorted_t = sorted(direct_txbox, key=lambda t: (t[1], t[0])) clusters: list[list] = [[sorted_t[0]]] for tup in sorted_t[1:]: prev = clusters[-1][-1] if abs(tup[1] - prev[1]) < _CL_DX and abs(tup[0] - prev[0]) < _CL_DY: clusters[-1].append(tup) else: clusters.append([tup]) clusters.sort(key=lambda cl: ( sum(t[0] for t in cl) / len(cl), sum(t[1] for t in cl) / len(cl), )) for i, cluster in enumerate(clusters): if i < len(items): y, x, cx, cy, el = cluster[0] label = _portrait_safe_label(el, items[i].get("label", "")) pt = _label_font_pt(cx, cy, label) _replace_sp_text(el, label, font_pt=pt, bold=True, center_align=True) for _, _, _, _, sp_el in cluster[1:]: _replace_sp_text(sp_el, "") else: for _, _, _, _, sp_el in cluster: _replace_sp_text(sp_el, "") return # Pattern B: multiple sub-grpSp children are the card containers if len(sub_groups) >= 2: group_els = sub_groups # Pattern C (and B after reassignment): each group = one item # Three-stage filtering: # Stage 1 (size): dot/connector groups smaller than 0.7 × 0.5 in are never # content slots — filter them out unconditionally. # Stage 2 (visual-only): large groups (> 2 × 2 in) that contain no txBox # children and no sub-grpSp are purely visual icon circles — skip them. # Stage 3 (content): among remaining groups, prefer ones where at least # one text box contains > 2 chars (real content, not icon glyphs). # If all size-passing groups still appear empty (bank template has blank # placeholder slots), use all size-passing groups as-is. _MIN_GRP_CX = int(0.7 * _EMU) # 0.7 in _MIN_GRP_CY = int(0.5 * _EMU) # 0.5 in _LARGE_VISUAL_THRESHOLD = int(2.0 * _EMU) # 2.0 in def _has_real_text(grp_el) -> bool: sps = _collect_text_sps_in_group(grp_el) for sp in sps: txb = sp.find(f"{{{_NS_P}}}txBody") if txb is None: continue t = "".join(el.text or "" for el in txb.iter(f"{{{_NS_A}}}t")).strip() if len(t) > 2: return True return False def _is_purely_visual(grp_el) -> bool: """True if group has no txBox children and no sub-grpSp — purely decorative.""" _sp_tag = f"{{{_NS_P}}}sp" _grp_tag = f"{{{_NS_P}}}grpSp" for child in grp_el: if child.tag == _grp_tag: return False # has sub-groups → likely content layout if child.tag == _sp_tag: cNvSpPr = child.find(f"{{{_NS_P}}}nvSpPr/{{{_NS_P}}}cNvSpPr") if cNvSpPr is not None and cNvSpPr.get("txBox") == "1": return False # has txBox child → not purely visual return True def _grp_pos(el): x, y, _, _ = _get_grp_xfrm(el) return (y, x) # Stage 1: discard shapes too small to hold content sized_groups = [ g for g in group_els if _get_grp_xfrm(g)[2] >= _MIN_GRP_CX and _get_grp_xfrm(g)[3] >= _MIN_GRP_CY ] if not sized_groups: sized_groups = group_els # safety fallback (shouldn't happen) # Stage 2: remove large purely-visual icon groups (no txBox, no sub-grpSp) non_visual_groups = [ g for g in sized_groups if not ( _get_grp_xfrm(g)[2] >= _LARGE_VISUAL_THRESHOLD and _get_grp_xfrm(g)[3] >= _LARGE_VISUAL_THRESHOLD and _is_purely_visual(g) ) ] if not non_visual_groups: # All Stage-1-passing groups were purely visual decorations. # The actual content slots are thin rows that failed Stage 1 — # fall back to ALL groups minus large purely-visual ones. non_visual_groups = [ g for g in group_els if not ( _get_grp_xfrm(g)[2] >= _LARGE_VISUAL_THRESHOLD and _get_grp_xfrm(g)[3] >= _LARGE_VISUAL_THRESHOLD and _is_purely_visual(g) ) ] if not non_visual_groups: non_visual_groups = group_els # absolute last resort # Stage 3: prefer groups with real text content; fall back to all non-visual groups # FIX: Don't filter out empty groups - they need to be filled with content! # The old logic skipped empty groups, leaving them blank in the output. content_groups = [g for g in non_visual_groups if _has_real_text(g)] if not content_groups: # FIX: Use ALL non-visual groups, not just ones with existing text # Empty groups are VALID content slots that need to be filled content_groups = non_visual_groups sorted_groups = sorted(content_groups, key=_grp_pos) # Pre-compute uniform font size: scan every writable slot, take the minimum # so all labels on the same infographic render at the same point size. def _is_txbox_sp(sp_el): c = sp_el.find(f"{{{_NS_P}}}nvSpPr/{{{_NS_P}}}cNvSpPr") return c is not None and c.get("txBox") == "1" uniform_pt: float | None = None _tmp_idx = 0 for _g in sorted_groups: _sps = _collect_text_sps_in_group(_g) if not _sps: continue if _tmp_idx >= len(items): break _lbl = items[_tmp_idx].get("label", "") _txb = [s for s in _sps if _is_txbox_sp(s)] _ref = _txb[0] if _txb else _sps[0] _, _, _cx, _cy = _get_grp_xfrm(_ref) _pt = _label_font_pt(_cx, _cy, _lbl) uniform_pt = _pt if uniform_pt is None else min(uniform_pt, _pt) _tmp_idx += 1 # Use a separate item counter so visual-only groups (no text slots) are # skipped without consuming an item index. Previously, using enumerate() # meant that if N visual groups appear before the label groups in sort order, # those label groups got indices >= N and their text boxes were removed as # "unused" even when N == len(items). item_idx = 0 for grp_el in sorted_groups: text_sps = _collect_text_sps_in_group(grp_el) if not text_sps: continue # visual-only group — skip without consuming an item slot if item_idx >= len(items): # Remove unused text slots — empty boxes with fills render as lines. for sp in text_sps: parent = sp.getparent() if parent is not None: parent.remove(sp) continue item = items[item_idx] item_idx += 1 desc = item.get("description", "") # Separate txBox slots (label / desc) from auto-shape icon candidates txbox_sps = [ sp for sp in text_sps if (lambda c: c is not None and c.get("txBox") == "1")( sp.find(f"{{{_NS_P}}}nvSpPr/{{{_NS_P}}}cNvSpPr") ) ] if txbox_sps: # Always write the label into the txBox (the arrow/label slot). # Avoid writing into icon auto-shapes — they have images on top that # would cover the text (e.g. hexagon+arrow layouts). _, _, cx, cy = _get_grp_xfrm(txbox_sps[0]) label = _portrait_safe_label(txbox_sps[0], item.get("label", "")) pt = uniform_pt if uniform_pt else _label_font_pt(cx, cy, label) _replace_sp_text(txbox_sps[0], label, font_pt=pt, bold=True, center_align=True) if len(txbox_sps) >= 2: if desc: _replace_sp_text(txbox_sps[1], desc) else: p = txbox_sps[1].getparent() if p is not None: p.remove(txbox_sps[1]) else: # No txBox found — try writing the label inside a coloured icon/circle # auto-shape (e.g. rounded badge, circle). Only correct when no txBox # exists because auto-shapes may carry icon images on top. icon_sp = _find_icon_shape(text_sps) if icon_sp is not None: _write_label_into_icon(icon_sp, item.get("label", "")) else: label = _portrait_safe_label(text_sps[0], item.get("label", "")) _, _, cx, cy = _get_grp_xfrm(text_sps[0]) pt = uniform_pt if uniform_pt else _label_font_pt(cx, cy, label) _replace_sp_text(text_sps[0], label, font_pt=pt, bold=True, center_align=True) if desc and len(text_sps) >= 2: _replace_sp_text(text_sps[1], desc) elif len(text_sps) >= 2: p = text_sps[1].getparent() if p is not None: p.remove(text_sps[1]) def _collect_text_sps_in_group(grp_el) -> list: """Recursively collect sp elements with txBody, sorted by y in child space. Filtering rules: - Skip shapes whose BOTH dimensions are < 100 000 EMU (~0.11 in). These are decorative icon/bullet shapes that happen to carry an empty txBody; writing content into them causes catastrophic vertical text overflow. - txBox=1 shapes sort before auto-shapes so label slots come before desc slots. FIX: Reduced minimum dimension from 100k to 50k EMU to catch more text boxes. Many bank slides have smaller text boxes that were being skipped. """ _MIN_TEXT_DIM = 50_000 # EMU; reduced from 100k to catch smaller text boxes results = [] sp_tag = f"{{{_NS_P}}}sp" grp_tag = f"{{{_NS_P}}}grpSp" def _recurse(el): for child in el: if child.tag == sp_tag: txb = child.find(f"{{{_NS_P}}}txBody") if txb is None: continue _, y, cx, cy = _get_grp_xfrm(child) # Skip icon-sized shapes — writing text into them causes overflow # FIX: Changed from AND to OR - skip only if BOTH dimensions are tiny if cx > 0 and cy > 0 and cx < _MIN_TEXT_DIM and cy < _MIN_TEXT_DIM: continue nvSpPr = child.find(f"{{{_NS_P}}}nvSpPr") is_txbox = False if nvSpPr is not None: cNvSpPr = nvSpPr.find(f"{{{_NS_P}}}cNvSpPr") is_txbox = cNvSpPr is not None and cNvSpPr.get("txBox") == "1" results.append((not is_txbox, y, child)) elif child.tag == grp_tag: _recurse(child) _recurse(grp_el) results.sort(key=lambda t: (t[0], t[1])) return [sp for _, _, sp in results] def _replace_sp_text( sp_el, new_text: str, autofit: bool = True, font_pt: float = None, bold: bool | None = None, center_align: bool = False, ) -> None: """ Replace ALL text in a bank shape's txBody with *new_text*. font_pt: if given, overrides the run's sz with this point size. bold: if True/False, forces bold on/off; None preserves original. center_align: if True, sets algn="ctr" on the paragraph. """ from lxml import etree txb = sp_el.find(f"{{{_NS_P}}}txBody") if txb is None: return # Strip vertical-text orientation — some bank shapes carry vert="vert" or # vert="vert270" on bodyPr which makes every character render top-to-bottom. bodyPr = txb.find(f"{{{_NS_A}}}bodyPr") if bodyPr is not None: bodyPr.attrib.pop("vert", None) if autofit: # Remove any existing autofit child, then add normAutofit so # PowerPoint shrinks the font to fit the fixed-size text box. for tag in ("noAutofit", "spAutoFit", "normAutofit"): el = bodyPr.find(f"{{{_NS_A}}}{tag}") if el is not None: bodyPr.remove(el) bodyPr.append(etree.Element(f"{{{_NS_A}}}normAutofit")) paras = txb.findall(f"{{{_NS_A}}}p") # Extract pPr from the first paragraph (alignment, spacing, indent, etc.) first_pPr = paras[0].find(f"{{{_NS_A}}}pPr") if paras else None # Extract rPr from the very first run anywhere in the txBody for formatting. first_rPr = None for p_el in paras: for r_el in p_el.findall(f"{{{_NS_A}}}r"): first_rPr = r_el.find(f"{{{_NS_A}}}rPr") break if first_rPr is not None: break # Remove ALL paragraphs — we rebuild from scratch. for p_el in list(paras): txb.remove(p_el) # Re-create one clean paragraph with new text. new_p = etree.SubElement(txb, f"{{{_NS_A}}}p") # Restore paragraph properties (alignment, indent, spacing). pPr_copy = copy.deepcopy(first_pPr) if first_pPr is not None else None if center_align: if pPr_copy is None: pPr_copy = etree.Element(f"{{{_NS_A}}}pPr") pPr_copy.set("algn", "ctr") if pPr_copy is not None: new_p.append(pPr_copy) new_r = etree.SubElement(new_p, f"{{{_NS_A}}}r") if first_rPr is not None: rPr_copy = copy.deepcopy(first_rPr) if font_pt is not None: rPr_copy.set("sz", str(int(font_pt * 100))) rPr_copy.attrib.pop("dirty", None) if bold is not None: rPr_copy.set("b", "1" if bold else "0") new_r.append(rPr_copy) elif font_pt is not None or bold is not None: rPr = etree.SubElement(new_r, f"{{{_NS_A}}}rPr") rPr.set("lang", "en-GB") if font_pt is not None: rPr.set("sz", str(int(font_pt * 100))) if bold is not None: rPr.set("b", "1" if bold else "0") rPr.set("dirty", "0") new_t = etree.SubElement(new_r, f"{{{_NS_A}}}t") new_t.text = new_text # ═══════════════════════════════════════════════════════════════════════════════ # Body text writers # ═══════════════════════════════════════════════════════════════════════════════ # # CENTRALISED BODY-TEXT LAYOUT # ---------------------------- # Every body-text renderer (text_only paragraphs, points, bullets; # drill-down card bodies; scenario_qa structured; case_study; scenario) # routes its bodyPr + paragraph spacing through these helpers so the # visual result is consistent and predictable. # # Guarantees enforced by `_apply_body_layout`: # • anchor="t" — content starts at the top of the box, not centred # (prevents the "first bullet merges into header" # symptom we kept patching) # • tIns=top_pad_in — clearance from anything sitting just above the # body box (banner headers, pill labels, etc.) # • lIns/rIns=side_pad_in — text doesn't hug card borders # • bIns=bottom_pad_in — gap before chrome/watermark # • noAutofit — PowerPoint cannot collapse spcAft or shrink font # behind our backs. Content must fit at the # chosen font; if it doesn't, the planner needs # to split the slide. # # Guarantees enforced by `_stamp_para_spacing`: # • 115% line spacing within a paragraph # • Configurable paragraph gap (default 24pt — > line height at 18pt # so the gap is visibly larger than a line, reads as a true break) # • No spcAft on the last paragraph (no trailing white space) def _apply_body_layout(txBody_el, *, top_pad_in: float = 0.2, side_pad_in: float = 0.2, bottom_pad_in: float = 0.1) -> None: """Apply the consistent bodyPr settings to *txBody_el*.""" from lxml import etree bodyPr = txBody_el.find(f"{{{_NS_A}}}bodyPr") if bodyPr is None: bodyPr = etree.Element(f"{{{_NS_A}}}bodyPr") txBody_el.insert(0, bodyPr) # Remove any existing autoFit children — we always use noAutofit for tag in ("noAutofit", "spAutoFit", "normAutofit"): existing = bodyPr.find(f"{{{_NS_A}}}{tag}") if existing is not None: bodyPr.remove(existing) etree.SubElement(bodyPr, f"{{{_NS_A}}}noAutofit") bodyPr.set("anchor", "t") bodyPr.set("anchorCtr", "0") bodyPr.set("wrap", "square") bodyPr.set("lIns", str(int(side_pad_in * _EMU))) bodyPr.set("rIns", str(int(side_pad_in * _EMU))) bodyPr.set("tIns", str(int(top_pad_in * _EMU))) bodyPr.set("bIns", str(int(bottom_pad_in * _EMU))) def _stamp_para_spacing(pPr_el, *, is_last: bool, gap_pt: float = 36.0, line_spacing_pct: int = 115) -> None: """Add lnSpc + spcAft to a pPr element. is_last suppresses spcAft.""" from lxml import etree lnSpc = etree.SubElement(pPr_el, f"{{{_NS_A}}}lnSpc") spcPct = etree.SubElement(lnSpc, f"{{{_NS_A}}}spcPct") spcPct.set("val", str(int(line_spacing_pct * 1000))) if not is_last: spcAft = etree.SubElement(pPr_el, f"{{{_NS_A}}}spcAft") spcPts = etree.SubElement(spcAft, f"{{{_NS_A}}}spcPts") spcPts.set("val", str(int(gap_pt * 100))) def _fit_card_body_pt(sp_el, lines: list[str], *, top_pad_in: float = 0.3, side_pad_in: float = 0.25, bottom_pad_in: float = 0.1, gap_pt: float = 9.0, line_spacing: float = 1.15, max_pt: float = 16.0, min_pt: float = 10.0) -> float: """ Largest font point size at which all paragraphs in *lines* fit within sp_el's box height (after accounting for insets and inter-paragraph gap). Iteratively reduces from max_pt to min_pt by 0.5pt steps. """ spPr = sp_el.find(f"{{{_NS_P}}}spPr") xf = spPr.find(f"{{{_NS_A}}}xfrm") if spPr is not None else None ext = xf.find(f"{{{_NS_A}}}ext") if xf is not None else None if ext is None: return max_pt try: box_cx = int(ext.get("cx", 0)) / _EMU box_cy = int(ext.get("cy", 0)) / _EMU except Exception: return max_pt if box_cx <= 0 or box_cy <= 0: return max_pt avail_cx = max(box_cx - 2 * side_pad_in, 1.0) avail_cy = max(box_cy - top_pad_in - bottom_pad_in, 0.5) real_lines = [l.strip() for l in lines if l.strip()] if not real_lines: return max_pt SAFETY = 0.92 pt = max_pt while pt >= min_pt: line_h = pt * line_spacing / 72.0 # inches per rendered line chars_per_line = max(1, int(avail_cx * 144 / pt * SAFETY)) total_h = 0.0 for ln in real_lines: n_lines = max(1, -(-len(ln) // chars_per_line)) # ceil divison total_h += n_lines * line_h # Inter-paragraph gaps (n-1 of them) total_h += max(0, len(real_lines) - 1) * (gap_pt / 72.0) if total_h <= avail_cy: return pt pt -= 0.5 return min_pt def _write_points_body(txBody_el, points: list[dict], cx_in: float, cy_in: float, heading: str = "", lead_in: str = "") -> None: """ Write inline bold label + regular explanation paragraphs. Each point renders as: [Bold "Label: "][Regular "explanation text..."] in a single paragraph, matching the gold standard pattern. """ from lxml import etree from engine.text_replacer import _fit_font_pt if not points: return # Centralised body layout — top anchor, clear top-pad, noAutofit, insets. _apply_body_layout(txBody_el) # Lock body font at 16pt for all points content. fit_pt = 16.0 # Remove existing paragraphs for p in list(txBody_el.findall(qn("a:p"))): txBody_el.remove(p) # Optional section heading if heading: h_el = etree.SubElement(txBody_el, f"{{{_NS_A}}}p") hPr = etree.SubElement(h_el, f"{{{_NS_A}}}pPr") etree.SubElement(hPr, f"{{{_NS_A}}}buNone") hr_el = etree.SubElement(h_el, f"{{{_NS_A}}}r") hrPr = etree.SubElement(hr_el, f"{{{_NS_A}}}rPr") hrPr.set("lang", "en-US"); hrPr.set("b", "1") hrPr.set("sz", "2200") ht_el = etree.SubElement(hr_el, f"{{{_NS_A}}}t") ht_el.text = heading # Helper: stamp 115% line spacing + ~18pt space-after on every pPr so # paragraphs render with a clear full-line gap between them (matches the # gold standard's blank-line separation pattern). Skip space-after on # the final paragraph so there's no trailing white space. def _stamp_spacing(pPr_el, is_last: bool): lnSpc = etree.SubElement(pPr_el, f"{{{_NS_A}}}lnSpc") spcPct = etree.SubElement(lnSpc, f"{{{_NS_A}}}spcPct") spcPct.set("val", "115000") if not is_last: spcAft = etree.SubElement(pPr_el, f"{{{_NS_A}}}spcAft") spcPts = etree.SubElement(spcAft, f"{{{_NS_A}}}spcPts") spcPts.set("val", "3600") # 24pt — bigger than line-height so visible # Wingdings checkmark bullet for the labelled points BULLET_CHAR_PT = "\xfc" _total = (1 if lead_in else 0) + len(points) # Optional lead-in plain prose sentence before the points. # Lead-in is NOT bulleted — it's a prose intro. Gap below it is the # paragraph-break gap (18pt) so the points start cleanly below. if lead_in: li_el = etree.SubElement(txBody_el, f"{{{_NS_A}}}p") liPr = etree.SubElement(li_el, f"{{{_NS_A}}}pPr") etree.SubElement(liPr, f"{{{_NS_A}}}buNone") _stamp_spacing(liPr, is_last=(_total == 1)) r_li = etree.SubElement(li_el, f"{{{_NS_A}}}r") rPr_li = etree.Element(f"{{{_NS_A}}}rPr") rPr_li.set("lang", "en-US"); rPr_li.set("b", "0") rPr_li.set("sz", str(int(fit_pt * 100))) r_li.insert(0, rPr_li) t_li = etree.SubElement(r_li, f"{{{_NS_A}}}t") t_li.text = lead_in for idx, point in enumerate(points): label = point.get("label", "").strip() text = point.get("text", "").strip() p_el = etree.SubElement(txBody_el, f"{{{_NS_A}}}p") pPr = etree.SubElement(p_el, f"{{{_NS_A}}}pPr") # Add a Wingdings checkmark bullet to identify each point at a glance. pPr.set("marL", "285750"); pPr.set("indent", "-285750") buFont = etree.SubElement(pPr, f"{{{_NS_A}}}buFont") buFont.set("typeface", "Wingdings") buFont.set("panose", "05000000000000000000") buFont.set("pitchFamily", "2"); buFont.set("charset", "2") buChar = etree.SubElement(pPr, f"{{{_NS_A}}}buChar") buChar.set("char", BULLET_CHAR_PT) # Apply spacing — 115% line + 18pt space-after between points is_last = idx == len(points) - 1 _stamp_spacing(pPr, is_last=is_last) # Bold label run: "Label: " if label: r_bold = etree.SubElement(p_el, f"{{{_NS_A}}}r") rPr_b = etree.SubElement(r_bold, f"{{{_NS_A}}}rPr") rPr_b.set("lang", "en-US"); rPr_b.set("b", "1") rPr_b.set("sz", str(int(fit_pt * 100))) t_b = etree.SubElement(r_bold, f"{{{_NS_A}}}t") t_b.text = f"{label}: " # Regular text run if text: r_reg = etree.SubElement(p_el, f"{{{_NS_A}}}r") rPr_r = etree.SubElement(r_reg, f"{{{_NS_A}}}rPr") rPr_r.set("lang", "en-US"); rPr_r.set("b", "0") rPr_r.set("sz", str(int(fit_pt * 100))) t_r = etree.SubElement(r_reg, f"{{{_NS_A}}}t") t_r.text = text # ═══════════════════════════════════════════════════════════════════════════════ # Multi-paragraph body builder (prose / bullets) # ═══════════════════════════════════════════════════════════════════════════════ def _set_paragraphs( txBody_el, paragraphs: list[str], cx_in: float = 0, cy_in: float = 0, bullet: bool = False, heading: str = "", lead_in: str = "", font_pt: float | None = None, ) -> None: """ Replace text box body with multiple paragraphs. bullet=True → Wingdings checkmark (✓) bullets bullet=False → plain prose paragraphs Line spacing and space-after are left unset so PowerPoint inherits its defaults (100% / 0pt), matching the gold standard files. normAutofit is used so PowerPoint shrinks the font automatically if the content is too long — we supply an 18pt starting size. """ from lxml import etree from engine.text_replacer import _fit_font_pt BULLET_CHAR = "\xfc" BULLET_FONT = "Wingdings" BULLET_PANOSE = "05000000000000000000" BULLET_MARL = 285750 BULLET_INDENT = -285750 if not paragraphs: return # Centralised body layout — top anchor, top-pad clearance, insets, noAutofit. _apply_body_layout(txBody_el) # Read original run properties to preserve template font colour/face. existing_paras = txBody_el.findall(qn("a:p")) rPr_template = None orig_pt = 0.0 if existing_paras: first_run = existing_paras[0].find(qn("a:r")) if first_run is not None: rPr_template = first_run.find(qn("a:rPr")) if rPr_template is not None: sz = rPr_template.get("sz") if sz: orig_pt = int(sz) / 100.0 # Starting font size: 18pt (gold standard default for body text). # normAutofit reduces this proportionally if content overflows. eff_cx = cx_in - (BULLET_MARL / _EMU) if bullet else cx_in heading_reserved = (22 / 72 * 1.15) if heading else 0.0 lead_in_reserved = (18 / 72 * 1.15) if lead_in else 0.0 eff_cy = max(cy_in - heading_reserved - lead_in_reserved, cy_in * 0.7) if cy_in > 0 else 0 all_text = (lead_in + " " if lead_in else "") + " ".join(paragraphs) if font_pt is not None: # Caller pinned an exact size — skip auto-fit entirely. fit_pt = float(font_pt) elif eff_cx > 0 and eff_cy > 0: fit_pt = _fit_font_pt( eff_cx, eff_cy, all_text, orig_pt, default_pt=18.0, line_spacing=1.15, n_items=len(paragraphs) + (1 if lead_in else 0), spc_aft_pt=0, min_pt=11, ) else: fit_pt = max(orig_pt if orig_pt >= 11 else 18.0, 11.0) for p in list(txBody_el.findall(qn("a:p"))): txBody_el.remove(p) if heading: h_el = etree.SubElement(txBody_el, f"{{{_NS_A}}}p") hPr = etree.SubElement(h_el, f"{{{_NS_A}}}pPr") etree.SubElement(hPr, f"{{{_NS_A}}}buNone") hr_el = etree.SubElement(h_el, f"{{{_NS_A}}}r") if rPr_template is not None: hrPr = etree.fromstring(etree.tostring(rPr_template)) else: hrPr = etree.Element(f"{{{_NS_A}}}rPr") hrPr.set("lang", "en-US"); hrPr.set("dirty", "0") hrPr.set("sz", "2200"); hrPr.set("b", "1") hr_el.insert(0, hrPr) ht_el = etree.SubElement(hr_el, f"{{{_NS_A}}}t") ht_el.text = heading # Helper: stamp standard paragraph spacing (115% line + 18pt space-after # — a full blank-line gap between paragraphs, matching the gold standard). # Applied to every paragraph generated below so text never visually sticks. def _stamp_spacing(pPr_el, is_last: bool): lnSpc = etree.SubElement(pPr_el, f"{{{_NS_A}}}lnSpc") spcPct = etree.SubElement(lnSpc, f"{{{_NS_A}}}spcPct") spcPct.set("val", "115000") if not is_last: spcAft = etree.SubElement(pPr_el, f"{{{_NS_A}}}spcAft") spcPts = etree.SubElement(spcAft, f"{{{_NS_A}}}spcPts") spcPts.set("val", "3600") # 24pt — bigger than line-height so visible # Total paragraphs that will be rendered (for "is_last" detection) _total = (1 if lead_in else 0) + len(paragraphs) # Optional plain prose sentence before bullets/points if lead_in: li_el = etree.SubElement(txBody_el, f"{{{_NS_A}}}p") liPr = etree.SubElement(li_el, f"{{{_NS_A}}}pPr") etree.SubElement(liPr, f"{{{_NS_A}}}buNone") _stamp_spacing(liPr, is_last=(_total == 1)) r_li = etree.SubElement(li_el, f"{{{_NS_A}}}r") if rPr_template is not None: rPr_li = etree.fromstring(etree.tostring(rPr_template)) else: rPr_li = etree.Element(f"{{{_NS_A}}}rPr") rPr_li.set("lang", "en-US"); rPr_li.set("dirty", "0") rPr_li.set("sz", str(int(fit_pt * 100))); rPr_li.set("b", "0") r_li.insert(0, rPr_li) t_li = etree.SubElement(r_li, f"{{{_NS_A}}}t") t_li.text = lead_in for idx, para_text in enumerate(paragraphs): p_el = etree.SubElement(txBody_el, f"{{{_NS_A}}}p") pPr = etree.SubElement(p_el, f"{{{_NS_A}}}pPr") if bullet: pPr.set("marL", str(BULLET_MARL)) pPr.set("indent", str(BULLET_INDENT)) buFont = etree.SubElement(pPr, f"{{{_NS_A}}}buFont") buFont.set("typeface", BULLET_FONT) buFont.set("panose", BULLET_PANOSE) buFont.set("pitchFamily", "2") buFont.set("charset", "2") buChar = etree.SubElement(pPr, f"{{{_NS_A}}}buChar") buChar.set("char", BULLET_CHAR) else: etree.SubElement(pPr, f"{{{_NS_A}}}buNone") # Apply 115% line spacing + 9pt space-after (skip on last paragraph) is_last = idx == len(paragraphs) - 1 _stamp_spacing(pPr, is_last=is_last) r_el = etree.SubElement(p_el, f"{{{_NS_A}}}r") if rPr_template is not None: new_rPr = etree.fromstring(etree.tostring(rPr_template)) else: new_rPr = etree.Element(f"{{{_NS_A}}}rPr") new_rPr.set("lang", "en-US"); new_rPr.set("dirty", "0") new_rPr.set("sz", str(int(fit_pt * 100))) r_el.insert(0, new_rPr) t_el = etree.SubElement(r_el, f"{{{_NS_A}}}t") t_el.text = para_text # ═══════════════════════════════════════════════════════════════════════════════ # Shared helpers # ═══════════════════════════════════════════════════════════════════════════════ def _calc_title_pt(text: str, cx_in: float, cy_in: float, max_pt: float = 28.0) -> float: """ Return a sensible title font size: the largest pt at which `text` fits inside (cx_in x cy_in), capped at max_pt. Mirrors _fit_font_pt's math but works without reading orig_pt from XML — useful when we want to override what the template's placeholder would naturally render. """ import math text = (text or "").strip() if not text or cx_in <= 0 or cy_in <= 0: return max_pt for pt in range(int(max_pt), 11, -1): chars_per_line = cx_in * 144 / pt lines_needed = max(math.ceil(len(text) / max(chars_per_line, 1)), 1) text_height = lines_needed * (pt / 72) * 1.3 if text_height <= cy_in: return float(pt) return 12.0 def _enable_normAutofit(txBody_el) -> None: """ Switch the body's autofit mode to normAutofit so PowerPoint shrinks the font further at runtime if the content overflows the box. Used on prose- heavy slide types (case_study, activity, scenario) where the fitted font from _fit_font_pt may still be too large at its 10-11pt floor. """ from lxml import etree bodyPr = txBody_el.find(f"{{{_NS_A}}}bodyPr") if bodyPr is None: bodyPr = etree.Element(f"{{{_NS_A}}}bodyPr") txBody_el.insert(0, bodyPr) # Strip any existing autofit children for tag in ("noAutofit", "spAutoFit", "normAutofit"): existing = bodyPr.find(f"{{{_NS_A}}}{tag}") if existing is not None: bodyPr.remove(existing) # normAutofit with no fontScale/lnSpcReduction lets PPT pick the shrink amount norm = etree.SubElement(bodyPr, f"{{{_NS_A}}}normAutofit") # Leave attributes off — PowerPoint will set fontScale automatically def _synthesise_slide_notes(slide_dict: dict) -> str: """ Deterministic fallback when a plan entry arrives without `notes`. Builds a 80-120 word trainer brief from the slide's title + first 3 bullets so no slide ever ships with a blank speaker-notes pane. Mirrors the post-pad synthesiser in pipeline.py so the builder is self-sufficient — it can be invoked directly (e.g. from run_full_course with a hand-edited plan) and still produce non-blank notes. """ title = (slide_dict.get("title") or slide_dict.get("module_title") or "").strip() bullets = slide_dict.get("bullets") or [] points = slide_dict.get("points") or [] paras = slide_dict.get("paragraphs") or [] talking_points: list[str] = [] if bullets: for b in bullets[:3]: text = b if isinstance(b, str) else (b.get("text") or "") if text: talking_points.append(text.strip()) elif points: for p in points[:3]: label = (p.get("label") or "").strip() text = (p.get("text") or "").strip() if label and text: talking_points.append(f"{label}: {text}") elif text: talking_points.append(text) elif paras: for para in paras[:2]: if isinstance(para, str) and para.strip(): talking_points.append(para.strip()) lines: list[str] = [] if title: lines.append( f"This slide covers '{title}'. Open with the headline idea in your " f"own words, then take delegates through each point one at a time." ) else: lines.append( "Open with the headline idea in your own words, then take delegates " "through each point one at a time." ) if talking_points: lines.append("Talking points to cover:") for tp in talking_points: lines.append(f" - {tp}") lines.append( "Pause after the points and ask the room how this connects to their own " "work — one volunteer example is enough to anchor the learning before " "moving on. Spend about 2-3 minutes on this slide." ) return "\n".join(lines) def _synthesise_item_notes(parent: dict, item: dict) -> str: """ Build trainer notes for an expanded child card (overview_groups item) deterministically from the item's own content. Used when a single overview_groups plan entry expands into 1 parent + N child slides — we want each child to have a usable, item-specific note rather than an empty notes pane or a duplicated parent note. Output is ~80-120 words. Trainer can always rewrite, but never sees blank. """ label = (item.get("label") or "").strip() bullets = item.get("bullets") or [] desc = (item.get("description") or item.get("text") or "").strip() parent_title = (parent.get("title") or "").strip() if not label and not bullets and not desc: return "" lines: list[str] = [] # Opening framing: tie back to the parent topic + name the item if label and parent_title: lines.append( f"This card covers '{label}' as part of {parent_title}. " f"Walk delegates through the key points one at a time." ) elif label: lines.append( f"This card covers '{label}'. " f"Walk delegates through the key points one at a time." ) elif desc: lines.append(desc) else: lines.append("Walk delegates through this card one bullet at a time.") # Mid-section: expand on each bullet if bullets: lines.append("Talking points:") for b in bullets: b_text = (b if isinstance(b, str) else (b.get("text") or "")).strip() if b_text: lines.append(f" - {b_text}") elif desc: lines.append(desc) # Closing prompt: engagement cue if label: lines.append( f"Pause after the points and ask the room: 'When have you seen " f"{label.lower()} done well — or done badly? What stood out?' " f"Use the answers to anchor the next slide." ) else: lines.append( "Pause for one question from the room before moving on, " "to check understanding." ) return "\n".join(lines) def _set_notes(slide, notes_text: str) -> None: """ Write trainer notes into the slide's speaker-notes pane (visible to the presenter, hidden from the audience). Idempotent — overwrites any existing notes content. Splits on newlines so multi-paragraph notes render as separate paragraphs in the PowerPoint notes pane. """ if not notes_text or not notes_text.strip(): return # Accessing notes_slide auto-creates the notes XML part if missing tf = slide.notes_slide.notes_text_frame paragraphs = [p.strip() for p in notes_text.split("\n") if p.strip()] if not paragraphs: return tf.text = paragraphs[0] for extra in paragraphs[1:]: p = tf.add_paragraph() p.text = extra def _content_slots(slide) -> list[dict]: slots = _collect_slots(slide) return [s for s in slots if not any(tok in s["text"].lower() for tok in _LOGO_TOKENS)] def _find_title_body_slots(slots: list[dict]) -> tuple: non_empty = [s for s in slots if s.get("text", "").strip()] if not non_empty: return None, None, list(slots) title_slot = min(non_empty, key=lambda s: s["abs_y"]) below = [s for s in slots if s["abs_y"] > title_slot["abs_y"] + 0.1] body_slot = max(below, key=lambda s: s.get("cx_in", 0) * s.get("cy_in", 0)) if below else None other = [s for s in slots if s is not title_slot and s is not body_slot] return title_slot, body_slot, other def _clear_txbody(txBody_el) -> None: from lxml import etree for p in list(txBody_el.findall(qn("a:p"))): txBody_el.remove(p) etree.SubElement(txBody_el, f"{{{_NS_A}}}p") def _remove_decorative_groups(slide) -> None: """Remove decorative group shapes AND non-corner pictures from a cloned slide. Group shapes from the template (decorative infographic containers) are fully removed. Pictures are removed unless they are small corner-chrome accents (width < 2.5in AND positioned in a corner of the slide) — those serve as brand decoration and are intentional. """ spTree = slide.shapes._spTree grp_tag = f"{{{_NS_P}}}grpSp" for el in list(spTree): if el.tag == grp_tag: spTree.remove(el) SLIDE_W = 13.33 SLIDE_H = 7.5 to_remove = [] for shape in slide.shapes: if shape.shape_type != 13: # 13 = MSO_SHAPE_TYPE.PICTURE continue try: cx = shape.width / _EMU cy = shape.height / _EMU x = shape.left / _EMU y = shape.top / _EMU except Exception: to_remove.append(shape._element) continue # Keep small pictures that are in a slide corner (chrome accent) if cx < 2.5: in_top = y < 1.5 in_bottom = (y + cy) > SLIDE_H - 2.0 in_left = x < 1.5 in_right = (x + cx) > SLIDE_W - 2.0 if (in_top or in_bottom) and (in_left or in_right): continue # keep this corner-chrome picture to_remove.append(shape._element) for el in to_remove: parent = el.getparent() if parent is not None: parent.remove(el) def _remove_all_pictures(slide) -> None: """Remove ALL pictures from a cloned slide (used for module intro which has full-slide background images that would otherwise cover content text boxes).""" to_remove = [s._element for s in slide.shapes if s.shape_type == 13] for el in to_remove: parent = el.getparent() if parent is not None: parent.remove(el) def _resize_shape(txBody_el, new_cx_in: float, new_cy_in: float, new_x_in: float = None, new_y_in: float = None): from lxml import etree sp_el = txBody_el.getparent() spPr = sp_el.find(qn("p:spPr")) if spPr is None: return xfrm = spPr.find(qn("a:xfrm")) if xfrm is None: xfrm = etree.SubElement(spPr, f"{{{_NS_A}}}xfrm") off = xfrm.find(qn("a:off")) ext = xfrm.find(qn("a:ext")) if off is None: off = etree.SubElement(xfrm, f"{{{_NS_A}}}off") off.set("x", "0"); off.set("y", "0") if ext is None: ext = etree.SubElement(xfrm, f"{{{_NS_A}}}ext") ext.set("cx", "0"); ext.set("cy", "0") ext.set("cx", str(int(new_cx_in * _EMU))) ext.set("cy", str(int(new_cy_in * _EMU))) if new_x_in is not None: off.set("x", str(int(new_x_in * _EMU))) if new_y_in is not None: off.set("y", str(int(new_y_in * _EMU))) def _has_visible_body_content(slide) -> bool: """ Return True if the slide has at least one body-area shape that would render visibly (a grpSp, a sp with text, or a pic). Used to detect when a bank clone produced an effectively empty slide because all of the source shapes were skipped (e.g. SmartArt with unresolved rIds). """ spTree = slide.shapes._spTree grp_tag = f"{{{_NS_P}}}grpSp" sp_tag = f"{{{_NS_P}}}sp" pic_tag = f"{{{_NS_P}}}pic" t_tag = f"{{{_NS_A}}}t" body_y_min = int(1.0 * _EMU) for el in spTree: if el.tag == grp_tag: _, y, _, cy = _get_grp_xfrm(el) if y + cy > body_y_min: return True elif el.tag == sp_tag: _, y, _, cy = _get_grp_xfrm(el) if y + cy <= body_y_min: continue txb = el.find(f"{{{_NS_P}}}txBody") if txb is not None: text = "".join(t.text or "" for t in txb.iter(t_tag)).strip() if text: return True elif el.tag == pic_tag: _, y, _, cy = _get_grp_xfrm(el) if y + cy > body_y_min: return True return False def _remove_last_slide(cloner) -> None: """Remove the most-recently-added slide from the output presentation.""" try: prs = cloner._prs sldIdLst = prs.slides._sldIdLst last_sldId = list(sldIdLst)[-1] rId = last_sldId.get("{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id") sldIdLst.remove(last_sldId) if rId: prs.part.drop_rel(rId) except Exception as exc: print(f" [warn] could not remove last slide: {exc!r}") def _make_text_fallback(d: dict) -> dict: """Convert any slide dict into a text_only fallback.""" fb = dict(d) fb["type"] = "text_only" if not fb.get("bullets") and not fb.get("paragraphs") and not fb.get("points"): items = fb.get("items") or fb.get("steps") or [] if items: fb["bullets"] = [ f"{it['label']}: {it['description']}" if it.get("description") else it.get("label", "") for it in items if it.get("label") ] if not fb.get("bullets"): fb["bullets"] = [f"Content: {fb.get('title', '')}"] return fb # ── Course type detection ───────────────────────────────────────────────────── def _is_tech_course(t: str) -> bool: t = t.lower() return any(sig in t for sig in _TECH_COURSE_SIGNALS) def _is_marketing_course(t: str) -> bool: t = t.lower() return any(sig in t for sig in _MARKETING_COURSE_SIGNALS) def _is_soft_skill_course(t: str) -> bool: """True if the course title hits a soft-skill / communication signal.""" t = t.lower() return any(sig in t for sig in _SOFT_SKILL_COURSE_SIGNALS) # ── Illustration pool ───────────────────────────────────────────────────────── # Stopwords ignored when matching a course title against topic folder names. _TOPIC_STOPWORDS = { "and", "of", "the", "for", "in", "on", "at", "to", "with", "course", "skills", "training", "workshop", "fundamentals", } # Explicit aliases per topic folder. Word-boundary matched against the # lowercased course title. Covers common team shorthand (HR, AI, L&D, PM, # H&S, D&I, …) that the slug/token matcher can't see (too short or no # vocabulary overlap with the folder name). _TOPIC_ALIASES: dict[str, list[str]] = { "human_resources": ["hr", "human resources", "people management"], "ai_in_business": ["ai in business", "ai for business", "ai for managers"], "data_analytics_and_ai": ["data analytics", "data science", "analytics"], "learning_and_development": ["l&d", "l and d", "ld", "learning development", "learning and development"], "project_management_fundamentals": ["project management", "pm", "pmp", "prince2"], "health_and_safety": ["health safety", "h&s", "ehs", "iosh"], "workplace_diversity_and_inclusion": ["diversity", "d&i", "dei", "inclusion", "diversity and inclusion"], "customer_service_excellence": ["customer service", "csat", "cx", "customer experience"], "public_speaking_and_presentation_skills": ["public speaking", "presentation skills", "presenting"], "sales_and_persuasion_techniques": ["sales", "selling", "persuasion"], "stress_management_and_workplace_wellbeing": ["stress management", "wellbeing", "wellness", "mental health"], "time_management_and_productivity": ["time management", "productivity"], "leadership_and_management_skills": ["leadership", "management skills", "people leadership"], "workplace_communication_skills": ["workplace communication", "business communication", "communication skills"], "teamwork_and_collaboration": ["teamwork", "collaboration", "team building"], "emotional_intelligence_in_the_workplace": ["emotional intelligence", "eq", "self awareness"], "conflict_resolution_and_negotiation": ["conflict resolution", "negotiation", "conflict management"], "change_management": ["change management", "managing change"], "business_analysis": ["business analysis", "ba ", "requirements analysis"], "accounting_and_finance": ["accounting", "finance for", "financial", "bookkeeping"], "digital_marketing": ["digital marketing", "online marketing", "social media marketing"], } def _course_topic_slug(course_title: str, images_root: Path) -> str | None: """Resolve a course title to an existing topic subfolder under *images_root*. Returns the folder name (slug) or None. Match strategy (first hit wins): 1. Exact slug match (course title slugified = folder name). 2. Alias match — explicit team shorthand (HR, AI, L&D, PM, …). Aliases ranked by length descending so 'digital marketing' beats 'marketing'. 3. Token-overlap match: pick folder with the most overlapping meaningful tokens (stopwords removed, len > 2). """ if not course_title or not images_root.exists(): return None def _slugify(s: str) -> str: s = s.lower() s = re.sub(r"&", "and", s) s = re.sub(r"[^a-z0-9]+", "_", s).strip("_") return s folders = [p.name for p in images_root.iterdir() if p.is_dir()] if not folders: return None folder_set = set(folders) title_lower = course_title.lower() title_slug = _slugify(course_title) # 1) Exact slug match if title_slug in folder_set: return title_slug # 2) Alias match — longest alias first alias_pairs: list[tuple[str, str]] = [ (a, folder) for folder, aliases in _TOPIC_ALIASES.items() for a in aliases if folder in folder_set ] alias_pairs.sort(key=lambda pair: -len(pair[0])) for alias, folder in alias_pairs: if re.search(rf"\b{re.escape(alias)}\b", title_lower): return folder # 3) Token-overlap fallback def _tokens(slug: str) -> set[str]: return {t for t in slug.split("_") if t and t not in _TOPIC_STOPWORDS and len(t) > 2} title_tokens = _tokens(title_slug) if not title_tokens: return None best_folder, best_score = None, 0 for f in folders: overlap = len(title_tokens & _tokens(f)) if overlap > best_score: best_score, best_folder = overlap, f return best_folder if best_score >= 1 else None def _load_illustration_images(images_dir: str, tech_course=False, marketing_course=False, course_title: str = ""): """Load illustrations as a list of (path, keywords, is_topic) tuples. Topic images come from images// when the course title resolves to a known topic folder. They have empty keywords and ``is_topic=True`` — the picker prioritises them. Root images (existing flat pool) come after, with extracted keywords and ``is_topic=False``. """ result: list[tuple[str, list[str], bool]] = [] p = Path(images_dir) if not p.exists(): return result # Topic pool — shuffled so picks aren't always 01, 02, 03... topic_slug = _course_topic_slug(course_title, p) if topic_slug: topic_dir = p / topic_slug topic_files = [ f for f in topic_dir.iterdir() if f.is_file() and f.suffix.lower() in _IMG_EXTS ] import random as _random _random.shuffle(topic_files) for fpath in topic_files: result.append((str(fpath), [], True)) print(f" Topic pool: {topic_slug!r} ({len(topic_files)} images)") else: print(f" Topic pool: none (no folder match for course title)") # Root pool — existing behaviour. Skip subdirectories (topic folders). for fpath in sorted(p.iterdir()): if not fpath.is_file(): continue if fpath.suffix.lower() not in _IMG_EXTS: continue stem = fpath.stem.lower() stem_n = re.sub(r"[^a-z ]", " ", stem) stem_w = stem_n.split() if not tech_course and any(s in stem_w or s in stem_n for s in _TECH_IMAGE_SIGNALS): continue if not marketing_course and any(s in stem_n for s in _SOCIAL_MARKETING_IMAGE_SIGNALS): continue words = [w for w in stem_w if len(w) > 3 and w not in _IMG_NOISE] result.append((str(fpath), words, False)) return result def _pick_illustration_image(images, title: str, used: set, course_words: set = None, context_text: str = "") -> str | None: """ Pick an illustration whose keywords best match the slide title (primary) and the slide's broader content (secondary), with the course theme as a safety-net. *context_text* lets the caller feed in the slide's body content (lead_in, point texts, bullets) so the match isn't limited to the short title. For "Building a Proactive HR Mindset", body words like "manager", "consistency", "policy" will steer matching toward an HR-themed image instead of falling back to random unused (which gave us "ONLINE TEST" on what should have been a manager-coaching slide). """ if not images: return None # Topic pool priority — if the course resolved to a topic folder, those # entries (is_topic=True) are always preferred over the generic root pool. # Within the topic pool any unused entry is equally valid (the team # curated each PPT as a topic set; no keyword scoring needed). for path, kws, is_topic in images: if is_topic and path not in used: return path def _words(text): return {w.lower() for w in re.sub(r"[^a-zA-Z ]", " ", text).split() if len(w) > 3 and w.lower() not in _IMG_NOISE} title_words = _words(title) context_words = _words(context_text) if context_text else set() course_words = course_words or set() best_score, best_idx = -1, -1 fb_score, fb_idx = -1, -1 ctx_score, ctx_idx = -1, -1 first_unused = -1 for i, (path, kws, is_topic) in enumerate(images): if is_topic: continue # topic pool already exhausted above if path in used: continue if first_unused == -1: first_unused = i kw_set = set(kws) # Title match — strongest signal score = len(title_words & kw_set) if score > best_score: best_score, best_idx = score, i # Body-content match — secondary signal (weight 2× per match # but only counts when title has no match) ctx = len(context_words & kw_set) if ctx > ctx_score: ctx_score, ctx_idx = ctx, i # Course theme — safety net fb = len(course_words & kw_set) if fb > fb_score: fb_score, fb_idx = fb, i if best_idx == -1: # Every non-topic image is already used. Instead of resetting the # used set (which makes the first image get picked again and again # — the "stuck on 6 different slides" symptom we fixed earlier), # spread the repeats EVENLY across the pool by picking randomly. import random as _random candidates = [i for i, (_, kws, is_t) in enumerate(images) if kws and not is_t] best_idx = _random.choice(candidates) if candidates else 0 elif best_score == 0: # No title match — try body-content match first, then course theme. if ctx_score > 0 and ctx_idx != -1: best_idx = ctx_idx elif fb_score > 0 and fb_idx != -1: best_idx = fb_idx else: best_idx = next( (i for i, (p, kws, is_t) in enumerate(images) if p not in used and kws and not is_t), first_unused if first_unused != -1 else 0, ) return images[best_idx][0]