"""scripts/sync_banks_from_s3.py — boot-time bank sync (production server). Mirror of scripts/upload_banks_to_s3.py, but in reverse. Downloads every object under ``banks/`` in S3 into a local cache directory. The server runs this once on startup (or whenever banks are updated) so the application reads from local disk — fast, no per-request S3 calls. Behaviour: idempotent. Files that already exist locally with the same size are skipped. Pass ``--force`` to redownload everything. Config (.env or env vars): S3_BUCKET, AWS_REGION, AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY S3_BANKS_PREFIX default 'banks/' BANKS_ROOT default / (used by the local dev workflow; production sets this to e.g. /var/lib/courseware/banks) Run on the server: python scripts/sync_banks_from_s3.py """ from __future__ import annotations import argparse import os import sys from pathlib import Path ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT)) try: from dotenv import load_dotenv load_dotenv(ROOT / ".env") except ImportError: pass import boto3 from botocore.exceptions import ClientError BUCKET = os.environ.get("S3_BUCKET") REGION = os.environ.get("AWS_REGION") PREFIX = os.environ.get("S3_BANKS_PREFIX", "banks/") # Where to land downloaded objects. Dev = project root (re-creates the # 'tagged infographics/', 'images/', etc. layout). Prod overrides with # BANKS_ROOT. 
BANKS_ROOT = Path(os.environ.get("BANKS_ROOT", str(ROOT)))

# Fail fast when the bucket or region is missing. This runs at module level
# (not inside main), so it also triggers when the module is imported.
if not BUCKET or not REGION:
    print("ERROR: S3_BUCKET and AWS_REGION must be set (.env or environment).",
          file=sys.stderr)
    sys.exit(1)


def _local_target(key: str) -> Path | None:
    """Map an S3 object key to its destination path under BANKS_ROOT.

    Args:
        key: Full S3 object key, as returned by a listing made with
            ``Prefix=PREFIX``.

    Returns:
        The local path for the object, or ``None`` when the key should be
        ignored: the prefix itself, or a "directory marker" object whose key
        ends in ``/`` (writing it as a file would block creation of the
        directory its children need).

    Raises:
        ValueError: If the key contains a ``..`` path segment, which would
            place the file outside BANKS_ROOT.
    """
    # Strip leading slashes: if PREFIX lacks its trailing '/', the remainder
    # starts with '/', and ``BANKS_ROOT / "/x"`` would discard BANKS_ROOT and
    # yield the absolute path '/x'.
    rel = key[len(PREFIX):].lstrip("/")
    if not rel or rel.endswith("/"):
        return None
    if ".." in rel.split("/"):
        raise ValueError(f"key escapes BANKS_ROOT, ignoring: {key!r}")
    return BANKS_ROOT / rel


def main() -> int:
    """Download every object under PREFIX in BUCKET into BANKS_ROOT.

    Command-line flags:
        --force    Re-download files even when a local file of the same size
                   already exists.
        --dry-run  Only print what would be skipped/downloaded; nothing is
                   written to disk.

    Objects are compared by size only, so a changed object with an identical
    size is treated as up to date unless ``--force`` is given.

    Returns:
        0 on success, 1 if listing the bucket or downloading/writing any
        object fails (the sync stops at the first failure).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--force", action="store_true",
                        help="Re-download even files that already exist with same size.")
    parser.add_argument("--dry-run", action="store_true",
                        help="List what would be downloaded; do not download.")
    args = parser.parse_args()

    # Explicit keys are optional: passing None lets boto3 fall back to its
    # default credential chain (shared config, instance/task role, ...)
    # instead of crashing with a KeyError when the variables are unset.
    s3 = boto3.client(
        "s3",
        region_name=REGION,
        aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID") or None,
        aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY") or None,
    )

    print(f"Source: s3://{BUCKET}/{PREFIX}")
    print(f"Target: {BANKS_ROOT}")
    print()

    # A dry run must not touch the filesystem.
    if not args.dry_run:
        BANKS_ROOT.mkdir(parents=True, exist_ok=True)

    downloaded = 0
    skipped = 0
    bytes_in = 0
    token = None
    while True:
        # list_objects_v2 returns one page at a time; follow the
        # continuation token until the listing is no longer truncated.
        kw = {"Bucket": BUCKET, "Prefix": PREFIX}
        if token:
            kw["ContinuationToken"] = token
        try:
            resp = s3.list_objects_v2(**kw)
        except ClientError as e:
            print(f"ERROR listing bucket: {e}", file=sys.stderr)
            return 1

        for obj in resp.get("Contents", []):
            key = obj["Key"]
            size = obj["Size"]

            try:
                target = _local_target(key)
            except ValueError as e:
                print(f" WARN {e}", file=sys.stderr)
                continue
            if target is None:
                continue  # the prefix itself or a directory marker

            if not args.force and target.is_file() and target.stat().st_size == size:
                if args.dry_run:
                    print(f" SKIP {key} ({size/1024:.1f} KB — same size on disk)")
                skipped += 1
                continue

            if args.dry_run:
                print(f" WOULD DOWNLOAD {key} -> {target.relative_to(BANKS_ROOT)} ({size/1024:.1f} KB)")
                continue

            try:
                target.parent.mkdir(parents=True, exist_ok=True)
                s3.download_file(BUCKET, key, str(target))
            except (ClientError, OSError) as e:
                # OSError covers local failures (permissions, disk full, a
                # file sitting where a directory is needed).
                print(f" ERROR {key} :: {e}", file=sys.stderr)
                return 1
            print(f" GET {key} ({size/1024:.1f} KB)")
            downloaded += 1
            bytes_in += size

        if resp.get("IsTruncated"):
            token = resp["NextContinuationToken"]
        else:
            break

    print()
    if args.dry_run:
        print("Dry-run only. Run without --dry-run to download.")
    else:
        print(f"Done. downloaded={downloaded} skipped={skipped} "
              f"bytes_received={bytes_in/1024/1024:.1f} MB")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())