""" engine/template_inspector.py ---------------------------- Validates an uploaded PPTX against the 7-slide template contract before generation runs. The contract is role-based, NOT layout-uniform — each template keeps its individual identity. We check that each of the 7 expected slide ROLES is present and structured enough for the pipeline to populate. Expected slide order (positional): 1 Cover - course intro page (wide title) 2 About Us - paragraph copied as-is 3 Course Syllabus - title + N module slots 4 Module Intro - module title + N topic slots 5 Content Layout - wide title + wide body + chrome (gets replicated) 6 Quiz - question block (1 MCQ per slide, cloned 1-2 per module) 7 Ending - any closing copy (Congratulations / Contact Us / etc.) Result envelope: { "status": "pass" | "warn" | "fail", "summary": "...", "meta": { slide_count, aspect_ratio, brand_color, masters }, "slides": [ { num, role, status, reasons:[...], detected:{...} }, ... ], } """ from __future__ import annotations from pathlib import Path from typing import Any _EMU = 914_400 _LOGO_TOKENS = {"theknowledgeacademy", "the knowledge academy", "theknowledge"} # Role spec — positional index → expected role + title keywords _ROLE_SPEC = [ {"role": "cover", "title_kw": []}, {"role": "about_us", "title_kw": ["about us", "about"]}, {"role": "syllabus", "title_kw": ["syllabus", "course outline", "course contents", "contents"]}, {"role": "module_intro", "title_kw": ["module", "chapter", "section"]}, {"role": "content", "title_kw": []}, {"role": "quiz", "title_kw": ["quiz", "question", "assessment", "knowledge check"]}, {"role": "ending", "title_kw": ["congratulations", "thank you", "contact us", "end", "summary"]}, ] # ── Public entry point ───────────────────────────────────────────────────────── def inspect_template(tmpl_path: str | Path) -> dict[str, Any]: """ Inspect a PPTX template and return a structured report. Never raises — wraps all errors and returns a fail envelope. """ path = Path(tmpl_path) if not path.exists(): return _fail_envelope(f"Template file not found: {path}") try: from pptx import Presentation prs = Presentation(str(path)) except Exception as exc: return _fail_envelope(f"Could not open PPTX: {exc!r}") sw = prs.slide_width / _EMU sh = prs.slide_height / _EMU aspect = f"{sw:.2f}x{sh:.2f}" is_169 = abs((sw / sh) - (16 / 9)) < 0.05 meta = { "slide_count": len(prs.slides), "aspect_ratio": aspect, "is_16_9": is_169, "masters": len(prs.slide_masters), "brand_color": _detect_brand_color(prs), "file_size_kb": round(path.stat().st_size / 1024, 1), } slide_reports: list[dict] = [] # Meta-level hard fails if meta["slide_count"] != 7: slide_reports.append({ "num": 0, "role": "meta", "status": "fail", "reasons": [f"Template must have exactly 7 slides (found {meta['slide_count']})"], }) if not is_169: slide_reports.append({ "num": 0, "role": "meta", "status": "fail", "reasons": [f"Template must be 16:9 (found {aspect})"], }) # Per-slide role validation for idx, spec in enumerate(_ROLE_SPEC): if idx >= len(prs.slides): slide_reports.append({ "num": idx + 1, "role": spec["role"], "status": "fail", "reasons": [f"Slide {idx + 1} ({spec['role']}) is missing"], "detected": {}, }) continue slide = prs.slides[idx] report = _inspect_slide(slide, idx, spec, sw) slide_reports.append(report) overall = _aggregate_status(slide_reports) return { "status": overall, "summary": _summarize(overall, slide_reports), "meta": meta, "slides": slide_reports, } # ── Per-slide inspection ────────────────────────────────────────────────────── def _inspect_slide(slide, idx: int, spec: dict, slide_w: float) -> dict: role = spec["role"] title_kw = spec["title_kw"] text_boxes = _collect_text_boxes(slide) auto_shapes = _collect_auto_shapes(slide) groups = _collect_groups(slide) title_text = _first_title_text(text_boxes) has_kw = any(kw in title_text.lower() for kw in title_kw) if title_kw else True reasons: list[str] = [] detected = { "title_text": title_text[:80], "text_box_count": len(text_boxes), "auto_shape_count": len(auto_shapes), "group_count": len(groups), } if role == "cover": wide_titles = [tb for tb in text_boxes if tb["cx"] > slide_w * 0.30] detected["wide_title_count"] = len(wide_titles) if not wide_titles: reasons.append("No wide title textbox found (cover needs a course title)") status = "fail" if not wide_titles else "pass" elif role == "about_us": body_blocks = [tb for tb in text_boxes if tb["cy"] > 0.5 and tb["cx"] > slide_w * 0.30] long_paragraphs = [tb for tb in text_boxes if len(tb["txt"]) > 40] any_text = any(tb["txt"] for tb in text_boxes if not _is_logo_text(tb["txt"])) detected["body_block_count"] = len(body_blocks) detected["paragraph_count"] = len(long_paragraphs) if not any_text: reasons.append("About-us slide appears empty — needs at least some copy") status = "fail" elif not has_kw and not long_paragraphs: reasons.append("No 'About Us' title and no long paragraph — verify this is the about-us page") status = "warn" elif not has_kw: reasons.append("Title doesn't say 'About Us' but content is present — accepting on body") status = "warn" else: status = "pass" elif role == "syllabus": if not has_kw: reasons.append("Title doesn't contain 'syllabus' / 'outline' / 'contents'") slots = _count_stacked_slots(text_boxes + groups, title_y_max=2.0) detected["module_slots"] = slots if slots < 3: reasons.append(f"Only {slots} module slots found (need at least 3 — recommend 4)") if not has_kw and slots < 3: status = "fail" elif slots < 3 or not has_kw: status = "warn" else: status = "pass" elif role == "module_intro": topic_slots = _count_stacked_slots(text_boxes, title_y_max=1.5) any_text = any(tb["txt"] for tb in text_boxes if not _is_logo_text(tb["txt"])) detected["topic_slots"] = topic_slots if not any_text: reasons.append("Module-intro slide appears empty — needs module title placeholder") status = "fail" elif topic_slots < 1: reasons.append("No discrete topic slots — single text block will receive all topics packed together") status = "warn" elif topic_slots < 3: reasons.append(f"Only {topic_slots} topic slot(s) — pipeline will fill what's available") status = "warn" else: status = "pass" elif role == "content": wide_title = next((tb for tb in text_boxes if tb["cx"] > slide_w * 0.45 and tb["y"] < 1.5), None) wide_body = next((tb for tb in text_boxes if tb["cx"] > slide_w * 0.45 and tb["y"] > 1.2 and tb["cy"] > 1.0), None) # Walk slide + layout + master for chrome — bands often live on the master chrome = _collect_chrome_shapes(slide) chrome_autos = chrome["autos"] chrome_wordmarks = chrome["wordmarks"] has_top_banner = any(s for s in chrome_autos if s["cx"] > slide_w * 0.85 and s["y"] < 1.5 and s["cy"] > 0.2) has_bottom_banner = any(s for s in chrome_autos if s["cx"] > slide_w * 0.85 and s["y"] > 5.5 and s["cy"] > 0.2) has_corner_accent = any( s for s in chrome_autos if (s["cx"] > slide_w * 0.25 and s["cy"] > 0.8) and (s["x"] < 0.3 or (s["x"] + s["cx"]) > slide_w - 0.3) ) has_side_stripe = any(s for s in chrome_autos if s["cy"] > 5.0 and s["cx"] < slide_w * 0.25) has_wordmark = bool(chrome_wordmarks) has_any_chrome = has_top_banner or has_bottom_banner or has_corner_accent or has_side_stripe or has_wordmark detected["has_wide_title"] = bool(wide_title) detected["has_wide_body"] = bool(wide_body) detected["has_top_banner"] = has_top_banner detected["has_bottom_banner"] = has_bottom_banner detected["has_corner_accent"] = has_corner_accent detected["has_side_stripe"] = has_side_stripe detected["has_wordmark"] = has_wordmark if not wide_title: reasons.append("No wide title textbox (content slide needs a title placeholder)") if not wide_body: reasons.append("No wide body textbox (content slide needs a body placeholder)") if not has_any_chrome: reasons.append("No chrome (banner, corner accent, side stripe, or wordmark) — content slides will have no template framing") if not wide_title or not wide_body: status = "fail" elif not has_any_chrome: status = "warn" else: status = "pass" elif role == "quiz": if not has_kw: reasons.append("Title doesn't contain 'quiz' / 'question' / 'assessment'") prominent_block = next( (s for s in auto_shapes if s["cx"] > slide_w * 0.4 and s["cy"] > 2.0), None, ) detected["has_question_block"] = bool(prominent_block) if not prominent_block: reasons.append("No prominent question container shape — quiz needs a block to hold the MCQ text") if not has_kw and not prominent_block: status = "fail" elif not has_kw or not prominent_block: status = "warn" else: status = "pass" elif role == "ending": if not title_text: reasons.append("No title text on the ending slide") status = "fail" elif not has_kw: reasons.append(f"Ending title is '{title_text[:40]}' — not a recognised ending phrase (will still work)") status = "warn" else: status = "pass" else: status = "warn" reasons.append(f"Unknown role: {role}") return { "num": idx + 1, "role": role, "status": status, "reasons": reasons, "detected": detected, } # ── Shape collectors ────────────────────────────────────────────────────────── def _collect_text_boxes(slide) -> list[dict]: """ Collect every shape carrying text — textboxes, auto-shapes with text, and shapes nested in groups. Design-heavy templates often put titles and body copy inside auto-shapes (rectangles) instead of plain textboxes. """ out: list[dict] = [] _walk_text_shapes(slide.shapes, out) return out def _walk_text_shapes(shape_iter, out: list[dict]) -> None: for shape in shape_iter: st = _shape_type_str(shape) if st == "group": try: _walk_text_shapes(shape.shapes, out) except Exception: pass continue try: has_tf = shape.has_text_frame except Exception: has_tf = False if not has_tf: continue rect = _shape_rect(shape) if rect is None: continue try: txt = shape.text_frame.text.strip() except Exception: txt = "" out.append({**rect, "txt": txt}) def _collect_auto_shapes(slide) -> list[dict]: """Auto shapes + freeforms, walking into groups recursively.""" out: list[dict] = [] _walk_shapes(slide.shapes, out, want={"auto_shape", "freeform"}) return out def _collect_chrome_shapes(slide) -> dict[str, list[dict]]: """ Collect chrome candidates from slide + its layout + its master. Many templates put their brand bands/stripes/wordmarks on the master so they appear on every slide. Without walking master shapes the chrome check produces false negatives. """ autos: list[dict] = [] wordmarks: list[dict] = [] sources = [slide.shapes] try: sources.append(slide.slide_layout.shapes) sources.append(slide.slide_layout.slide_master.shapes) except Exception: pass for src in sources: _walk_shapes(src, autos, want={"auto_shape", "freeform"}) _walk_wordmarks(src, wordmarks) return {"autos": autos, "wordmarks": wordmarks} def _walk_wordmarks(shape_iter, out: list[dict]) -> None: for shape in shape_iter: st = _shape_type_str(shape) if st == "group": try: _walk_wordmarks(shape.shapes, out) except Exception: pass continue try: has_tf = shape.has_text_frame except Exception: has_tf = False if not has_tf: continue try: txt = shape.text_frame.text.strip() except Exception: continue if not _is_logo_text(txt): continue rect = _shape_rect(shape) if rect is None: continue out.append({**rect, "txt": txt}) def _collect_groups(slide) -> list[dict]: out = [] for shape in slide.shapes: if _shape_type_str(shape) != "group": continue rect = _shape_rect(shape) if rect is None: continue out.append({**rect, "txt": ""}) return out def _walk_shapes(shape_iter, out: list[dict], want: set[str]) -> None: for shape in shape_iter: st = _shape_type_str(shape) if st == "group": try: _walk_shapes(shape.shapes, out, want) except Exception: pass continue if st not in want: continue rect = _shape_rect(shape) if rect is None: continue out.append(rect) def _shape_rect(shape) -> dict | None: try: return { "x": shape.left / _EMU, "y": shape.top / _EMU, "cx": shape.width / _EMU, "cy": shape.height / _EMU, } except Exception: return None def _shape_type_str(shape) -> str: try: st = int(shape.shape_type) except Exception: return "unknown" return {1: "auto_shape", 5: "freeform", 6: "group", 13: "picture", 17: "text_box"}.get(st, "unknown") # ── Heuristics ──────────────────────────────────────────────────────────────── def _first_title_text(text_boxes: list[dict]) -> str: """Return text of the topmost non-logo textbox (likely the slide title).""" candidates = [tb for tb in text_boxes if tb["txt"] and not _is_logo_text(tb["txt"])] if not candidates: return "" candidates.sort(key=lambda tb: tb["y"]) return candidates[0]["txt"] def _is_logo_text(txt: str) -> bool: low = txt.lower().replace(" ", "") return any(tok.replace(" ", "") in low for tok in _LOGO_TOKENS) def _count_stacked_slots(shapes: list[dict], title_y_max: float, min_y: float | None = None) -> int: """ Count repeating slot rows stacked vertically below a title. Looks for shapes whose Y differs by > 0.4in to dedupe overlapping decorations. """ candidates = [s for s in shapes if s["y"] > title_y_max] if min_y is not None: candidates = [s for s in candidates if s["y"] >= min_y or s["y"] > title_y_max] candidates.sort(key=lambda s: s["y"]) rows: list[float] = [] for s in candidates: if not rows or abs(s["y"] - rows[-1]) > 0.4: rows.append(s["y"]) return len(rows) def _detect_brand_color(prs) -> str: """Best-effort: pull the first non-grey solid fill we encounter.""" for slide in prs.slides: for shape in slide.shapes: try: fill = shape.fill if fill.type is None: continue rgb = fill.fore_color.rgb if rgb is None: continue hexcol = str(rgb).upper() if hexcol in {"FFFFFF", "000000"} or hexcol.startswith(("EE", "F0", "F5")): continue return hexcol except Exception: continue return "44318D" # ── Aggregation / envelopes ─────────────────────────────────────────────────── def _aggregate_status(reports: list[dict]) -> str: if any(r["status"] == "fail" for r in reports): return "fail" if any(r["status"] == "warn" for r in reports): return "warn" return "pass" def _summarize(overall: str, reports: list[dict]) -> str: fails = [r for r in reports if r["status"] == "fail"] warns = [r for r in reports if r["status"] == "warn"] if overall == "pass": return "Template passes all 7-slide structure checks." if overall == "warn": roles = ", ".join(r["role"] for r in warns) return f"Template usable with caveats on: {roles}" roles = ", ".join(r["role"] for r in fails) return f"Template needs fixes on: {roles}" def _fail_envelope(msg: str) -> dict: return { "status": "fail", "summary": msg, "meta": {}, "slides": [], } # ── CLI entry ───────────────────────────────────────────────────────────────── if __name__ == "__main__": import json import sys if len(sys.argv) < 2: print("Usage: python -m ppt_generator.engine.template_inspector ") sys.exit(1) report = inspect_template(sys.argv[1]) print(json.dumps(report, indent=2))