# skool-lesson-scrape/scrape.py

#!/usr/bin/env python3
"""
skool-lesson-scrape
Downloads lessons from a Skool community classroom to local Markdown files.
Skips lessons already saved — safe to re-run when new content is added.

Usage:
    python scrape.py <community>                        # full scrape
    python scrape.py <community> --output ~/my-lessons  # custom output folder
    python scrape.py <community> --discover             # debug without saving

Setup: see README.md
"""

import asyncio
import argparse
import re
import json
import tempfile
import html2text
from pathlib import Path
from playwright.async_api import async_playwright

# Root URL of the Skool site; the classroom URL is built from it.
BASE     = "https://www.skool.com"
# Folder under the system temp directory that receives diagnostic HTML dumps
# and screenshots when no courses are found or --discover is used.
DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"

# CSS selectors tried in this order by lesson_body(). The first one whose
# first matching element has more than 200 characters of inner HTML supplies
# the lesson content; if none qualifies, the whole <body> is used instead.
CONTENT_SELECTORS = [
    ".ProseMirror",
    "[class*='lesson-content']",
    "[class*='lessonContent']",
    "[class*='module-content']",
    "[class*='content-body']",
    "article",
    "main",
]


def sanitize(name: str) -> str:
    """Return *name* reduced to a string usable as a filename component.

    Removes the characters ``< > : " / | ? *``, the backslash, and newline,
    carriage return and tab characters. The result is then stripped of
    surrounding whitespace, stripped of leading/trailing dots, and truncated
    to 120 characters.

    Args:
        name: Title to clean. It is coerced with ``str()``, so non-string
            values are accepted as well.

    Returns:
        The cleaned name, at most 120 characters long (possibly empty).
    """
    # Control characters must be written as escapes: a literal line break
    # inside a single-quoted pattern is a syntax error. The backslash is
    # removed too because it is a path separator on Windows.
    cleaned = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name))
    return cleaned.strip().strip(".")[:120]


def existing_stems(output_dir: Path) -> set:
    """Collect the stems of the ``*.md`` files directly inside *output_dir*.

    The directory, including any missing parents, is created before it is
    scanned. Only the top level is searched; other file types are ignored.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stems = set()
    for md_file in output_dir.glob("*.md"):
        stems.add(md_file.stem)
    return stems


def next_data(html: str) -> dict:
    """Decode the JSON held in the page's ``<script id="__NEXT_DATA__">`` tag.

    Returns the parsed payload, or an empty dict when *html* contains no
    such script tag.
    """
    found = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
    if not found:
        return {}
    payload = found.group(1)
    return json.loads(payload)


def html_to_md(raw: str) -> str:
    """Convert the HTML fragment *raw* to Markdown using html2text.

    The converter is configured with no line wrapping (``body_width = 0``),
    links kept and images ignored.
    """
    converter = html2text.HTML2Text()
    converter.ignore_images = True
    converter.ignore_links = False
    converter.body_width = 0
    markdown = converter.handle(raw)
    return markdown


def write_lesson(output_dir: Path, course_title: str, lesson_title: str, body: str) -> str:
    """Write one lesson as a UTF-8 Markdown file inside *output_dir*.

    The file is named ``<course> -- <lesson>.md`` using the sanitized titles.
    Its content is a level-1 heading with the unsanitized *lesson_title*,
    a blank line, then *body*. An existing file of the same name is
    overwritten.

    Args:
        output_dir: Folder that receives the file.
        course_title: Title of the course the lesson belongs to.
        lesson_title: Title of the lesson; also used as the heading.
        body: Markdown text of the lesson.

    Returns:
        The file stem (the filename without ``.md``).
    """
    stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
    out = output_dir / f"{stem}.md"
    # The separator is written with escapes; literal line breaks inside a
    # single-line f-string are a syntax error.
    out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
    return stem


async def lesson_body(page) -> str:
    """Return the current page's lesson content as Markdown.

    Content is rendered client-side into .ProseMirror (Skool's TipTap editor),
    so that selector is tried first, followed by the rest of
    CONTENT_SELECTORS. A candidate is accepted only when its inner HTML is
    longer than 200 characters; otherwise the whole document body is
    converted.
    """
    for selector in CONTENT_SELECTORS:
        node = await page.query_selector(selector)
        if not node:
            continue
        markup = await node.inner_html()
        if len(markup) <= 200:
            # Too short to be accepted as lesson content; try the next selector.
            continue
        return html_to_md(markup)

    whole_page = await page.evaluate("() => document.body.innerHTML")
    return html_to_md(whole_page)


async def _load_page_props(page, url: str, settle: float) -> dict:
    """Open *url* and return the ``pageProps`` dict from its ``__NEXT_DATA__`` payload.

    Waits for the ``load`` event and then a further *settle* seconds before
    reading the page (presumably to let client-side rendering finish).
    Returns an empty dict when the payload or either key is missing.
    """
    await page.goto(url)
    await page.wait_for_load_state("load")
    await asyncio.sleep(settle)
    return next_data(await page.content()).get("props", {}).get("pageProps", {})


def _save_diagnostic(filename: str, html: str) -> None:
    """Write *html* to ``DIAG_DIR / filename`` as UTF-8, creating the folder if needed."""
    DIAG_DIR.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the locale default encoding can fail on page content.
    (DIAG_DIR / filename).write_text(html, encoding="utf-8")


async def _discover(page, classroom: str, course: dict) -> None:
    """Dump diagnostics (HTML, screenshot, pageProps keys) for the first lesson of *course*."""
    course_url = f"{classroom}/{course['name']}"
    props = await _load_page_props(page, course_url, 3)
    children = props.get("course", {}).get("children", [])
    first = children[0].get("course") if children else None
    if not first:
        print("No lessons found in the first course — nothing to inspect.")
        return

    lesson_props = await _load_page_props(page, f"{course_url}?md={first['id']}", 3)
    print("Lesson pageProps keys:", list(lesson_props.keys()))
    _save_diagnostic("lesson.html", await page.content())
    await page.screenshot(path=str(DIAG_DIR / "lesson.png"), full_page=True)
    print(f"Diagnostic files saved to {DIAG_DIR}")


async def run(community: str, output_dir: Path, discover: bool = False) -> None:
    """Log in to Skool interactively and save every accessible lesson as Markdown.

    A visible Chromium window is opened on the login page; the function waits
    (timeout ``300_000``) until the browser reaches a URL matching
    ``**/<community>/**``. It then reads the classroom's course list, keeps
    the courses whose metadata has a truthy ``hasAccess``, and visits each
    lesson that is not yet present in *output_dir*.

    Args:
        community: Community slug as it appears in the Skool URL.
        output_dir: Folder for the Markdown files; created if missing.
        discover: When true, only inspect the first lesson of the first
            accessible course, write diagnostic files to DIAG_DIR and return
            without saving any lesson.
    """
    classroom = f"{BASE}/{community}/classroom"
    existing = existing_stems(output_dir)
    print(f"Community:  {community}")
    print(f"Output:     {output_dir}")
    print(f"Lessons already saved: {len(existing)}\n")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False, slow_mo=25)
        # try/finally closes the browser on every exit path, including errors.
        try:
            ctx = await browser.new_context(viewport={"width": 1440, "height": 900})
            page = await ctx.new_page()

            print("Opening Skool — please log in when the browser window appears.")
            print("The script will continue automatically once you land on the community.\n")
            await page.goto(f"{BASE}/login")
            await page.wait_for_url(f"**/{community}/**", timeout=300_000)
            print("Logged in.\n")

            classroom_props = await _load_page_props(page, classroom, 3)
            all_crses = classroom_props.get("allCourses", [])
            courses = [c for c in all_crses if c.get("metadata", {}).get("hasAccess", 0)]
            print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")

            if not courses:
                _save_diagnostic("classroom.html", await page.content())
                print(f"No courses found. Diagnostic HTML saved to {DIAG_DIR}")
                return

            if discover:
                await _discover(page, classroom, courses[0])
                return

            saved = skipped = errors = 0

            for course in courses:
                # Same fallback chain as lesson titles, so a course without a
                # title does not abort the whole run.
                course_title = (
                    course.get("metadata", {}).get("title")
                    or course.get("name")
                    or "Untitled"
                )
                course_url = f"{classroom}/{course['name']}"
                print(f"Course: {course_title}")

                course_props = await _load_page_props(page, course_url, 2.5)
                children = course_props.get("course", {}).get("children", [])

                if not children:
                    print("  No lessons found — skipping\n")
                    continue

                print(f"  {len(children)} lessons")

                for child in children:
                    lesson = child.get("course", {})
                    lesson_title = (
                        lesson.get("metadata", {}).get("title")
                        or lesson.get("name")
                        or "Untitled"
                    )
                    lesson_id = lesson.get("id", "")
                    stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"

                    # Lessons are keyed by their sanitized titles, so a lesson
                    # whose stem matches one already saved (in an earlier run
                    # or earlier in this one) is skipped.
                    if stem in existing:
                        skipped += 1
                        continue

                    # Best effort per lesson: report the failure and carry on.
                    try:
                        await page.goto(f"{course_url}?md={lesson_id}")
                        await page.wait_for_load_state("load")
                        await asyncio.sleep(2)
                        stem = write_lesson(output_dir, course_title, lesson_title, await lesson_body(page))
                        existing.add(stem)
                        saved += 1
                        print(f"  [saved]  {lesson_title[:65]}")
                    except Exception as e:
                        errors += 1
                        print(f"  [error]  {lesson_title[:65]} — {e}")

                print()

            print("─" * 52)
            print(f"Done.   Saved: {saved}   Skipped: {skipped}   Errors: {errors}")
            print(f"Output: {output_dir}")
        finally:
            await browser.close()


if __name__ == "__main__":
    # Command-line entry point: parse the arguments, then run the scraper.
    parser = argparse.ArgumentParser(description="Download Skool community lessons to Markdown")
    parser.add_argument(
        "community",
        help="Community slug from your Skool URL (e.g. skool.com/my-community → my-community)",
    )
    default_output = str(Path.home() / "skool-lessons")
    parser.add_argument(
        "--output", "-o",
        default=default_output,
        help="Folder to save Markdown files (default: ~/skool-lessons)",
    )
    parser.add_argument("--discover", action="store_true", help="Debug page structure without saving")
    options = parser.parse_args()

    # Expand "~" and make the path absolute before handing it to run().
    target_dir = Path(options.output).expanduser().resolve()
    asyncio.run(run(community=options.community, output_dir=target_dir, discover=options.discover))