#!/usr/bin/env python3
"""
skool-lesson-scrape
Downloads lessons from a Skool community classroom to a local folder as Markdown files.
Skips lessons already saved — safe to re-run when new content is added.

Usage:
    python scrape.py              # full scrape
    python scrape.py --discover   # inspect page structure without saving (debug)

Setup: see README.md
"""

import argparse
import asyncio
import json
import re
import tempfile
from pathlib import Path

import html2text
from playwright.async_api import async_playwright

# ── CONFIG — edit these two lines ────────────────────────────────────────────
#
# COMMUNITY: the slug from your Skool community URL
#   e.g. https://www.skool.com/navaigate → "navaigate"
COMMUNITY = "navaigate"
#
# OUTPUT_DIR: folder where .md files are saved (created if it doesn't exist)
#   Mac/Linux: Path.home() / "skool-lessons"
#   Windows:   Path(r"C:\Users\YourName\Documents\skool-lessons")
#   Obsidian:  Path.home() / "Documents" / "ObsidianVault" / "Lessons"
OUTPUT_DIR = Path.home() / "skool-lessons"
#
# ─────────────────────────────────────────────────────────────────────────────

BASE = "https://www.skool.com"
CLASSROOM = f"{BASE}/{COMMUNITY}/classroom"
DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"

# Tried in order; the first element whose inner HTML is long enough wins.
CONTENT_SELECTORS = [
    ".ProseMirror",  # Skool's TipTap editor — primary target
    "[class*='lesson-content']",
    "[class*='lessonContent']",
    "[class*='module-content']",
    "[class*='content-body']",
    "article",
    "main",
]

# A selector match must contain MORE than this many characters of inner HTML,
# otherwise it is treated as an empty shell and the next selector is tried.
MIN_CONTENT_CHARS = 200

# Next.js serialises the page state as JSON inside <script id="__NEXT_DATA__">.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]*\bid="__NEXT_DATA__"[^>]*>(.*?)</script>',
    re.DOTALL,
)


def sanitize(name: str) -> str:
    """Return *name* made safe for use inside a file name.

    Removes characters that are illegal in file names on common platforms
    (``< > : " / \\ | ? *``) plus newlines and tabs, trims surrounding
    whitespace and dots, and caps the result at 120 characters.
    """
    name = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".")
    return name[:120]


def existing_stems() -> set:
    """Return the stems of all ``.md`` files already in OUTPUT_DIR.

    Creates OUTPUT_DIR (and parents) if it does not exist yet.
    """
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    return {f.stem for f in OUTPUT_DIR.glob("*.md")}


def next_data(html: str) -> dict:
    """Extract the ``__NEXT_DATA__`` JSON payload from a page's HTML.

    Returns an empty dict when the script tag is absent, its content is not
    valid JSON, or the decoded value is not a JSON object, so callers can
    chain ``.get(...)`` without further checks.
    """
    m = _NEXT_DATA_RE.search(html)
    if not m:
        return {}
    try:
        data = json.loads(m.group(1))
    except json.JSONDecodeError:
        return {}
    return data if isinstance(data, dict) else {}


def html_to_md(raw: str) -> str:
    """Convert an HTML fragment to Markdown (links kept, images dropped)."""
    h = html2text.HTML2Text()
    h.body_width = 0  # 0 disables hard line wrapping
    h.ignore_links = False
    h.ignore_images = True
    return h.handle(raw)


def write_lesson(course_title: str, lesson_title: str, body: str) -> str:
    """Write one lesson to OUTPUT_DIR and return the file stem used.

    The file is named ``"<course> -- <lesson>.md"`` (both parts sanitized)
    and starts with the lesson title as a level-1 heading.
    """
    stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
    out = OUTPUT_DIR / f"{stem}.md"
    out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
    return stem


async def lesson_body(page) -> str:
    """Content is rendered client-side into .ProseMirror (Skool's TipTap editor).

    Returns the Markdown of the first CONTENT_SELECTORS match that holds more
    than MIN_CONTENT_CHARS of inner HTML; falls back to the whole ``<body>``.
    """
    for sel in CONTENT_SELECTORS:
        el = await page.query_selector(sel)
        if el:
            inner = await el.inner_html()
            if len(inner) > MIN_CONTENT_CHARS:
                return html_to_md(inner)
    return html_to_md(await page.evaluate("() => document.body.innerHTML"))


async def _open(page, url: str, settle: float) -> None:
    """Navigate to *url*, wait for the load event, then pause *settle* seconds."""
    await page.goto(url)
    await page.wait_for_load_state("load")
    # The load event fires before client-side rendering finishes.
    await asyncio.sleep(settle)


async def _page_props(page) -> dict:
    """Return ``props.pageProps`` from the current page's __NEXT_DATA__."""
    return next_data(await page.content()).get("props", {}).get("pageProps", {})


async def _save_html(page, filename: str) -> None:
    """Dump the current page's HTML into DIAG_DIR as UTF-8."""
    DIAG_DIR.mkdir(parents=True, exist_ok=True)
    # Explicit encoding: the platform default can fail on non-ASCII pages.
    (DIAG_DIR / filename).write_text(await page.content(), encoding="utf-8")


async def _discover(page, course: dict) -> None:
    """Open the first lesson of *course* and save diagnostics instead of lessons."""
    course_url = f"{CLASSROOM}/{course['name']}"
    await _open(page, course_url, 3)
    children = (await _page_props(page)).get("course", {}).get("children", [])
    first = children[0].get("course") if children else None
    if first:
        await _open(page, f"{course_url}?md={first['id']}", 3)
        lpp = await _page_props(page)
        print("Lesson pageProps keys:", list(lpp.keys()))
    await _save_html(page, "lesson.html")
    await page.screenshot(path=str(DIAG_DIR / "lesson.png"), full_page=True)
    print(f"Diagnostic files saved to {DIAG_DIR}")


async def _scrape_courses(page, courses: list, existing: set) -> tuple:
    """Save every not-yet-saved lesson; return ``(saved, skipped, errors)``.

    *existing* is updated in place with the stem of each lesson written.
    """
    saved = skipped = errors = 0
    for course in courses:
        course_title = (
            course.get("metadata", {}).get("title")
            or course.get("name")
            or "Untitled"
        )
        course_url = f"{CLASSROOM}/{course['name']}"
        print(f"Course: {course_title}")
        await _open(page, course_url, 2.5)
        children = (await _page_props(page)).get("course", {}).get("children", [])
        if not children:
            print(" No lessons found — skipping\n")
            continue
        print(f" {len(children)} lessons")
        for child in children:
            lesson = child.get("course", {})
            lesson_title = (
                lesson.get("metadata", {}).get("title")
                or lesson.get("name")
                or "Untitled"
            )
            lesson_id = lesson.get("id", "")
            stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
            if stem in existing:
                skipped += 1
                continue
            if not lesson_id:
                # "?md=" with no id would load the course page, and its
                # content would be saved under this lesson's title.
                errors += 1
                print(f" [error] {lesson_title[:65]} — missing lesson id")
                continue
            try:
                await _open(page, f"{course_url}?md={lesson_id}", 2)
                stem = write_lesson(course_title, lesson_title, await lesson_body(page))
                existing.add(stem)
                saved += 1
                print(f" [saved] {lesson_title[:65]}")
            except Exception as e:
                # Best-effort boundary: one failing lesson must not stop the run.
                errors += 1
                print(f" [error] {lesson_title[:65]} — {e}")
        print()
    return saved, skipped, errors


async def run(discover: bool = False):
    """Log in interactively, then scrape (or, with *discover*, just inspect).

    Opens a visible Chromium window, waits up to five minutes for the user to
    log in and reach the community, lists the courses the account can access,
    and writes each unsaved lesson to OUTPUT_DIR. With ``discover=True`` it
    only dumps diagnostics for the first lesson of the first course.
    """
    existing = existing_stems()
    print(f"Output folder: {OUTPUT_DIR}")
    print(f"Lessons already saved: {len(existing)}\n")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False, slow_mo=25)
        try:
            ctx = await browser.new_context(viewport={"width": 1440, "height": 900})
            page = await ctx.new_page()

            print("Opening Skool — please log in when the browser window appears.")
            print("The script will continue automatically once you land on the community.\n")
            await page.goto(f"{BASE}/login")
            await page.wait_for_url(f"**/{COMMUNITY}/**", timeout=300_000)
            print("Logged in.\n")

            await _open(page, CLASSROOM, 3)
            all_crses = (await _page_props(page)).get("allCourses", [])
            courses = [
                c for c in all_crses
                if c.get("metadata", {}).get("hasAccess", 0)
            ]
            print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")

            if not courses:
                await _save_html(page, "classroom.html")
                print(f"No courses found. Diagnostic HTML saved to {DIAG_DIR}")
                return

            if discover:
                await _discover(page, courses[0])
                return

            saved, skipped, errors = await _scrape_courses(page, courses, existing)
            print("─" * 52)
            print(f"Done. Saved: {saved} Skipped: {skipped} Errors: {errors}")
            print(f"Output: {OUTPUT_DIR}")
        finally:
            # Single exit point so the browser is closed on errors too.
            await browser.close()


def main() -> None:
    """Parse command-line flags and run the scraper."""
    ap = argparse.ArgumentParser(description="Download Skool community lessons to Markdown")
    ap.add_argument("--discover", action="store_true", help="Debug page structure without saving")
    asyncio.run(run(discover=ap.parse_args().discover))


if __name__ == "__main__":
    main()