From 56c7b53254f747cbabc6672a28afc7ce3369120f Mon Sep 17 00:00:00 2001
From: Kisa <kisasttil@gmail.com>
Date: Mon, 4 May 2026 20:28:20 -0400
Subject: [PATCH] =?UTF-8?q?Initial=20release=20=E2=80=94=20Skool=20communi?=
 =?UTF-8?q?ty=20lesson=20scraper?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Downloads lessons from any Skool classroom to local Markdown files.
Cross-platform (Mac/Windows/Linux), membership-gated, safe to re-run.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 README.md        |  95 ++++++++++++++++++++++
 requirements.txt |   2 +
 scrape.py        | 204 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 301 insertions(+)
 create mode 100644 README.md
 create mode 100644 requirements.txt
 create mode 100644 scrape.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..426e689
--- /dev/null
+++ b/README.md
@@ -0,0 +1,95 @@
+# skool-lesson-scrape
+
+Download lessons from any Skool community classroom to local Markdown files.
+
+- Works on Mac, Windows, and Linux
+- Skips lessons already saved — safe to re-run when new content is added
+- Saves one `.md` file per lesson: `Course Name -- Lesson Title.md`
+- Respects your membership tier — only downloads content your account can access
+- Works great with Obsidian, Notion, or any Markdown-based knowledge system
+
+---
+
+## Requirements
+
+- Python 3.8 or later
+- A paid Skool account with access to the community you want to scrape
+
+---
+
+## Setup
+
+**1. Install dependencies**
+
+```bash
+pip install -r requirements.txt
+playwright install chromium
+```
+
+**2. Configure the script**
+
+Open `scrape.py` and edit the two lines at the top of the CONFIG section:
+
+```python
+COMMUNITY  = "navaigate"          # slug from your Skool community URL
+OUTPUT_DIR = Path.home() / "skool-lessons"   # where .md files are saved
+```
+
+- `COMMUNITY`: find it in your Skool URL — `skool.com/your-community-slug`
+- `OUTPUT_DIR`: any folder on your machine; created automatically if it doesn't exist
+
+**Obsidian users** — point `OUTPUT_DIR` at a folder inside your vault:
+```python
+OUTPUT_DIR = Path.home() / "Documents" / "MyVault" / "Lessons"
+```
+
+**Windows users** — use a raw string for backslash paths:
+```python
+OUTPUT_DIR = Path(r"C:\Users\YourName\Documents\skool-lessons")
+```
+
+---
+
+## Usage
+
+**Full scrape** — downloads all lessons you have access to:
+```bash
+python scrape.py
+```
+
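+A browser window will open. Log in to Skool normally (email/password or Google). The script takes over automatically once the browser is on a page inside your configured community (`skool.com/<COMMUNITY>/...`); if Skool opens a different page after login, navigate to the community yourself. The script waits up to 5 minutes for this.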
+
+**Re-run anytime** — already-saved lessons are skipped automatically.
+
+**Debug mode** — inspect page structure without saving any lessons (diagnostic HTML and a screenshot are written to your system temp folder):
+```bash
+python scrape.py --discover
+```
+
+---
+
+## How it works
+
+Skool embeds course and lesson structure as JSON in the page source (`__NEXT_DATA__`). The script reads that directly to get course and lesson IDs, then navigates to each lesson and extracts the body text from Skool's TipTap editor (`.ProseMirror` selector, with a few generic fallback selectors). Course and lesson discovery needs no fragile DOM scraping — the JSON structure is stable.
+
+---
+
+## Notes
+
+- Content is gated by your own Skool membership — you can only download lessons your account has access to
+- This tool is for personal offline backup, not redistribution of community content
+- Re-running after new lessons are posted will only download what's new
+
+---
+
+## Troubleshooting
+
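+**"No courses found"** — the classroom page listed no courses your account can access. A diagnostic HTML file is saved to `skool_scrape_diag/classroom.html` in your system temp folder. The page structure may have changed; open an issue with the HTML attached.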
+
+**Browser closes immediately** — make sure you completed the Playwright browser install: `playwright install chromium`
+
+**Lessons saving as navigation boilerplate** — run `python scrape.py --discover` and open an issue with the output.
+
+---
+
+Built by [Kisa Fenn](https://github.com/kisasttil-gif) — STTIL Solutions
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..d5a2b07
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+playwright>=1.40.0
+html2text>=2024.2.26
diff --git a/scrape.py b/scrape.py
new file mode 100644
index 0000000..4e83a38
--- /dev/null
+++ b/scrape.py
@@ -0,0 +1,204 @@
+#!/usr/bin/env python3
+"""
+skool-lesson-scrape
+Downloads lessons from a Skool community classroom to a local folder as Markdown files.
+Skips lessons already saved — safe to re-run when new content is added.
+
+Usage:
+    python scrape.py               # full scrape
+    python scrape.py --discover    # inspect page structure without saving (debug)
+
+Setup: see README.md
+"""
+
+import asyncio
+import argparse
+import re
+import json
+import tempfile
+import html2text
+from pathlib import Path
+from playwright.async_api import async_playwright
+
+# ── CONFIG — edit these two lines ────────────────────────────────────────────
+#
+# COMMUNITY: the slug from your Skool community URL
+#   e.g. https://www.skool.com/navaigate  →  "navaigate"
+COMMUNITY = "navaigate"
+#
+# OUTPUT_DIR: folder where .md files are saved (created if it doesn't exist)
+#   Mac/Linux: Path.home() / "skool-lessons"
+#   Windows:   Path(r"C:\Users\YourName\Documents\skool-lessons")
+#   Obsidian:  Path.home() / "Documents" / "ObsidianVault" / "Lessons"
+OUTPUT_DIR = Path.home() / "skool-lessons"
+#
+# ─────────────────────────────────────────────────────────────────────────────
+
+BASE      = "https://www.skool.com"  # Skool site root
+CLASSROOM = f"{BASE}/{COMMUNITY}/classroom"  # classroom index page of the configured community
+DIAG_DIR  = Path(tempfile.gettempdir()) / "skool_scrape_diag"  # diagnostic HTML/screenshots go under the system temp folder
+
+CONTENT_SELECTORS = [  # tried in this order by lesson_body(); a match only counts if its HTML is over 200 chars
+    ".ProseMirror",           # Skool's TipTap editor — primary target
+    "[class*='lesson-content']",
+    "[class*='lessonContent']",
+    "[class*='module-content']",
+    "[class*='content-body']",
+    "article",                # generic fallbacks when no lesson-specific class matches
+    "main",
+]
+
+
+def sanitize(name: str) -> str:
+    """Drop characters unsafe in file names, trim whitespace then dots, and cap at 120 chars."""
+    return re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".")[:120]
+
+
+def existing_stems() -> set:  # stems (file names without ".md") of the lessons already in OUTPUT_DIR
+    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # create the output folder (and parents) if missing
+    return {f.stem for f in OUTPUT_DIR.glob("*.md")}  # "*.md" pattern: direct children of OUTPUT_DIR only
+
+
+def next_data(html: str) -> dict:  # parsed __NEXT_DATA__ JSON payload, or {} when the script tag is absent
+    hit = re.search(r'<script id="__NEXT_DATA__"[^>]*>(.*?)</script>', html, re.DOTALL)
+    return {} if hit is None else json.loads(hit.group(1))
+
+
+def html_to_md(raw: str) -> str:
+    """Convert an HTML string to Markdown via html2text (body_width=0, links kept, images ignored)."""
+    converter = html2text.HTML2Text()
+    for option, value in (("body_width", 0), ("ignore_links", False), ("ignore_images", True)):
+        setattr(converter, option, value)
+    return converter.handle(raw)
+
+
+def write_lesson(course_title: str, lesson_title: str, body: str) -> str:
+    """Write the lesson to OUTPUT_DIR as '<course> -- <lesson>.md' (UTF-8) and return the file stem."""
+    stem = " -- ".join(sanitize(part) for part in (course_title, lesson_title))
+    (OUTPUT_DIR / f"{stem}.md").write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
+    return stem
+
+
+async def lesson_body(page) -> str:
+    """Return the lesson as Markdown from the first CONTENT_SELECTORS match with over 200 chars of HTML."""
+    for selector in CONTENT_SELECTORS:
+        handle = await page.query_selector(selector)
+        markup = await handle.inner_html() if handle else ""
+        if len(markup) > 200:
+            return html_to_md(markup)
+    # No selector produced enough content: fall back to converting the whole page body.
+    return html_to_md(await page.evaluate("() => document.body.innerHTML"))
+
+
+async def run(discover: bool = False):
+    """Log in interactively, then save every accessible lesson that is not already on disk.
+
+    With discover=True no lesson files are written: the first lesson of the first
+    accessible course is dumped (HTML + screenshot) to DIAG_DIR for debugging instead.
+    """
+    existing = existing_stems()
+    print(f"Output folder: {OUTPUT_DIR}")
+    print(f"Lessons already saved: {len(existing)}\n")
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(headless=False, slow_mo=25)
+        ctx     = await browser.new_context(viewport={"width": 1440, "height": 900})
+        page    = await ctx.new_page()
+
+        print("Opening Skool — please log in when the browser window appears.")
+        print("The script will continue automatically once you land on the community.\n")
+        await page.goto("https://www.skool.com/login")
+        await page.wait_for_url(f"**/{COMMUNITY}/**", timeout=300_000)  # allow up to 5 minutes for the manual login
+        print("Logged in.\n")
+
+        await page.goto(CLASSROOM)
+        await page.wait_for_load_state("load")
+        await asyncio.sleep(3)
+
+        nd        = next_data(await page.content())
+        all_crses = nd.get("props", {}).get("pageProps", {}).get("allCourses", [])
+        courses   = [c for c in all_crses if c.get("metadata", {}).get("hasAccess", 0)]
+        print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")
+
+        if not courses:
+            DIAG_DIR.mkdir(parents=True, exist_ok=True)
+            # Explicit UTF-8: the platform default codec (e.g. cp1252 on Windows) can fail on page text.
+            (DIAG_DIR / "classroom.html").write_text(await page.content(), encoding="utf-8")
+            print(f"No courses found. Diagnostic HTML saved to {DIAG_DIR}")
+            await browser.close()
+            return
+
+        if discover:
+            course_url = f"{CLASSROOM}/{courses[0]['name']}"
+            await page.goto(course_url)
+            await page.wait_for_load_state("load")
+            await asyncio.sleep(3)
+            cnd      = next_data(await page.content())
+            children = cnd.get("props", {}).get("pageProps", {}).get("course", {}).get("children", [])
+            first    = children[0]["course"] if children else None
+            if first:
+                await page.goto(f"{course_url}?md={first['id']}")
+                await page.wait_for_load_state("load")
+                await asyncio.sleep(3)
+                lpp = next_data(await page.content()).get("props", {}).get("pageProps", {})
+                print("Lesson pageProps keys:", list(lpp.keys()))
+                DIAG_DIR.mkdir(parents=True, exist_ok=True)
+                (DIAG_DIR / "lesson.html").write_text(await page.content(), encoding="utf-8")
+                await page.screenshot(path=str(DIAG_DIR / "lesson.png"), full_page=True)
+                print(f"Diagnostic files saved to {DIAG_DIR}")
+            await browser.close()
+            return
+
+        saved = skipped = errors = 0
+
+        for course in courses:
+            course_title = course["metadata"]["title"]
+            course_url   = f"{CLASSROOM}/{course['name']}"
+            print(f"Course: {course_title}")
+
+            await page.goto(course_url)
+            await page.wait_for_load_state("load")
+            await asyncio.sleep(2.5)
+
+            children = next_data(await page.content()).get("props", {}).get("pageProps", {}).get("course", {}).get("children", [])
+
+            if not children:
+                print("  No lessons found — skipping\n")
+                continue
+
+            print(f"  {len(children)} lessons")
+
+            for child in children:
+                lesson       = child.get("course", {})
+                lesson_title = lesson.get("metadata", {}).get("title") or lesson.get("name") or "Untitled"
+                lesson_id    = lesson.get("id", "")
+                stem         = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
+
+                if stem in existing:  # already on disk from this or an earlier run
+                    skipped += 1
+                    continue
+
+                try:
+                    await page.goto(f"{course_url}?md={lesson_id}")
+                    await page.wait_for_load_state("load")
+                    await asyncio.sleep(2)
+                    stem = write_lesson(course_title, lesson_title, await lesson_body(page))
+                    existing.add(stem)
+                    saved += 1
+                    print(f"  [saved]  {lesson_title[:65]}")
+                except Exception as e:  # best effort: report the failure and move on to the next lesson
+                    errors += 1
+                    print(f"  [error]  {lesson_title[:65]} — {e}")
+
+            print()
+
+        print("─" * 52)
+        print(f"Done.   Saved: {saved}   Skipped: {skipped}   Errors: {errors}")
+        print(f"Output: {OUTPUT_DIR}")
+        await browser.close()
+
+
+if __name__ == "__main__":  # script entry point: parse the CLI flag and start the async scraper
+    parser = argparse.ArgumentParser(description="Download Skool community lessons to Markdown")
+    parser.add_argument("--discover", action="store_true", help="Debug page structure without saving")
+    asyncio.run(run(discover=parser.parse_args().discover))