""" engine/slide_selector.py ------------------------ Picks the best-fitting bank infographic slide for a given content payload. How it works: 1. Reads the enriched bank catalogue (bank_catalogue_slim.json + dimension data). 2. Filters slides that can hold the content without text overflow: - item_count <= slide's step_count - max label char count fits in the slide's label boxes - max description char count fits in the slide's description boxes 3. Ranks remaining candidates by: - Preferred visual type match (+10) - Closest step count to content (penalise empty slots) - Wider boxes are better (more breathing room) 4. Returns the top candidate entry dict, or None if nothing fits. Usage: from engine.slide_selector import select_slide entry = select_slide(catalogue, { "item_count": 5, "max_label_len": 10, # longest label in chars "max_desc_len": 55, # longest description in chars (0 = no descriptions) "visual_type": "process", }) if entry: slide = cloner.clone_slide(str(BANK / entry["file"]), entry["slide_index"]) """ from __future__ import annotations # ── Character capacity constants ───────────────────────────────────────────── # Average character width for proportional fonts (Calibri/Arial) is roughly # half the point size, divided by 72 to get inches. # max_chars_on_one_line ≈ box_width_in / (font_pt / 144) # = box_width_in * 144 / font_pt # # Default font sizes used when the slide doesn't carry an explicit size: _DEFAULT_LABEL_PT = 14.0 # conservative default for label-sized text _DEFAULT_DESC_PT = 11.0 # conservative default for description-sized text _CHAR_WIDTH_FACTOR = 144 # = 2 × 72; gives chars per inch at 1 pt # Keywords in catalogue entries that mark a slide as technology-specific. # These entries are excluded when the course is not a tech/IT course. 
_TECH_CATALOGUE_KEYWORDS = {"technology", "cybersecurity", "devops", "programming", "coding"}


def _slot_count(entry: dict) -> int:
    """Return how many item slots *entry* offers.

    Uses ``step_count``; falls back to ``text_slot_count`` when ``step_count``
    is missing or zero.
    """
    return entry.get("step_count", 0) or entry.get("text_slot_count", 0)


def _recency_penalty(
    history: list[str],
    value: str,
    newest: float,
    step: float,
    floor: float,
) -> float:
    """Return a penalty that decays with how long ago *value* was last used.

    *history* is ordered oldest first. The most recent occurrence of *value*
    costs *newest*; each position further back reduces the penalty by *step*,
    never dropping below *floor*. Returns 0.0 when *value* is not in *history*.
    """
    for age, past in enumerate(reversed(history)):
        if past == value:
            return max(newest - age * step, floor)
    return 0.0


def select_slide(
    catalogue: list[dict],
    content: dict,
    used_counts: dict | None = None,
    max_uses: int = 2,
    topic_keywords: list[str] | None = None,
    is_tech_course: bool = True,
    used_families: list[str] | None = None,
    used_files: list[str] | None = None,
    excluded_keys: set | None = None,
) -> dict | None:
    """
    Return the best catalogue entry for *content*, or None.

    content keys:
        item_count      int  — number of steps / items needed
        max_label_len   int  — length of the longest label string (chars)
        max_desc_len    int  — length of the longest description string (0 = no descs needed)
        visual_type     str  — preferred type: "process", "timeline", "hub", etc. (optional)

    used_counts: dict mapping (file, slide_index) -> use count. Slides at or above
                 max_uses are excluded. This function only reads the dict; the
                 caller is responsible for updating it between calls.
    max_uses:    maximum times a single slide may be used in one course (default 2).
    topic_keywords: optional list of keyword strings for topic-relevance boosting
                    (compared case-insensitively against the entry's keywords).
    is_tech_course: if False, slides tagged with tech keywords are excluded.
    used_families: recent visual families already used (oldest first); used to
                   penalise repeating the same family. A family that fills both
                   of the last two positions is excluded outright.
    used_files:    list of PPTX filenames already used (oldest first); used to
                   penalise drawing from the same source file repeatedly.
    excluded_keys: optional set of (file, slide_index) keys to skip entirely.

    Among the surviving candidates, entries whose slot count equals item_count
    are preferred over all others regardless of score; within the chosen pool
    the highest score wins. Scores include a small random tiebreaker, so
    near-equal candidates may resolve differently between calls.
    """
    # Function-scope import, hoisted out of the per-entry loop.
    import random

    n_items = content.get("item_count", 0) or 0
    max_lbl = content.get("max_label_len", 0) or 0
    max_dsc = content.get("max_desc_len", 0) or 0
    # `or ""` tolerates an explicit None for the optional visual_type key.
    want_type = (content.get("visual_type") or "").lower()
    need_descs = max_dsc > 0

    # Loop invariants.
    topic_kws = {k.lower() for k in topic_keywords} if topic_keywords else set()
    recent_families = used_families[-2:] if used_families else []

    scored: list[tuple[float, dict]] = []

    for entry in catalogue:
        # ── Hard filters ──────────────────────────────────────────────────────
        # Enforce max-uses-per-course limit
        key = (entry.get("file"), entry.get("slide_index"))
        if used_counts is not None and used_counts.get(key, 0) >= max_uses:
            continue

        # Skip explicitly excluded entries (e.g. capacity-checked and rejected)
        if excluded_keys and key in excluded_keys:
            continue

        # Lowercased once; shared by the tech filter and the topic-overlap score
        # so both comparisons are case-insensitive.
        entry_kws = {k.lower() for k in entry.get("keywords") or []}

        # Exclude tech-themed slides for non-tech courses
        if not is_tech_course and entry_kws & _TECH_CATALOGUE_KEYWORDS:
            continue

        # Must have enough item slots
        step_count = _slot_count(entry)
        if step_count < n_items:
            continue

        # Skip slides with non-integer stride — text lands in wrong boxes
        if not entry.get("txbox_reliable", True):
            continue

        # Comparison-family bank slides are designed for binary comparisons
        # (2 items side by side). Reject them for any other item count —
        # otherwise items 3+ go nowhere and the slide renders with empty cells.
        family = entry.get("visual_family", "")
        if family == "comparison" and n_items != 2:
            continue

        # Hard block: the same family already filled the last 2 infographic slots.
        if family and recent_families.count(family) >= 2:
            continue

        # Label capacity check — allow up to 3× single-line capacity since labels
        # wrap across 2-3 lines. normAutofit further reduces font if needed.
        lbl_cap = entry.get("max_label_chars", 999)
        if max_lbl > 0 and lbl_cap * 3 < max_lbl:
            continue

        # Description capacity check — allow up to 6× single-line capacity:
        # text wraps across 4-6 lines and normAutofit shrinks the font to fit.
        # max_desc_chars measures box width at default font size (chars per line).
        has_descs = entry.get("has_descriptions", False)
        dsc_cap = entry.get("max_desc_chars", 0)
        if need_descs and (not has_descs or dsc_cap * 6 < max_dsc):
            continue

        # ── Scoring ───────────────────────────────────────────────────────────
        score = 0.0

        # Penalise banks that have description slots when content needs none.
        # Empty description placeholder shapes remain visible in the output,
        # creating blank bordered boxes that look broken.
        if not need_descs and has_descs:
            score -= 8.0

        # Visual type / family match. An exact family match scores highest —
        # the shape's visual grammar then matches the content's logical
        # structure; a merely supported type is acceptable but weaker.
        if want_type:
            if want_type == family:
                score += 15.0
            elif want_type in entry.get("supported_visual_types", []):
                score += 7.0

        # Prefer closest item count — strongly penalise extra slots so the
        # selector doesn't pick a 5-slot slide for 4-step content. The slot
        # filter above guarantees step_count >= n_items, so this is never
        # negative.
        score -= (step_count - n_items) * 3.0

        # Prefer slides with more breathing room in label boxes
        if max_lbl > 0 and lbl_cap < 999:
            score += (lbl_cap - max_lbl) / max(lbl_cap, 1) * 2.0

        # Prefer slides with more breathing room in description boxes
        if need_descs and dsc_cap > 0:
            score += (dsc_cap - max_dsc) / max(dsc_cap, 1) * 2.0

        # Keyword overlap with slide topic (case-insensitive on both sides)
        if topic_kws and entry_kws:
            score += len(topic_kws & entry_kws) * 3.0

        # Penalise visual family repetition by recency of last use:
        # 8, 6, 4, 2 for the four most recent positions, then a floor of 1.
        if used_families:
            score -= _recency_penalty(used_families, family, 8.0, 2.0, 1.0)

        # Penalise reuse of the same source PPTX file so slides are drawn from
        # many different files across the deck: 6, 4.5, 3, 1.5 for the four
        # most recent positions, then a floor of 0.5.
        if used_files:
            score -= _recency_penalty(used_files, entry.get("file", ""), 6.0, 1.5, 0.5)

        # Small random tiebreaker so equal-scoring slides don't always resolve
        # to the same catalogue order position.
        score += random.uniform(0, 0.3)

        scored.append((score, entry))

    if not scored:
        return None

    # Prefer exact step count match — a shape designed for exactly N items
    # looks complete; a shape with extra empty slots looks broken.
    exact = [pair for pair in scored if _slot_count(pair[1]) == n_items]
    pool = exact or scored
    # max() returns the first of several equal maxima, i.e. catalogue order.
    return max(pool, key=lambda pair: pair[0])[1]


def content_requirements(content: dict) -> dict:
    """
    Derive selector requirements from a content payload (title + steps).

    Input (same shape as replace_slide_text):
        { "title": "...", "steps": [{"label": "...", "description": "..."}, ...] }

    Returns:
        { "item_count": int, "max_label_len": int, "max_desc_len": int }

    A missing or None "steps" list counts as no steps, and a missing or None
    label / description counts as an empty string.
    """
    steps = content.get("steps") or []
    return {
        "item_count": len(steps),
        "max_label_len": max((len(s.get("label") or "") for s in steps), default=0),
        "max_desc_len": max((len(s.get("description") or "") for s in steps), default=0),
    }