From 56c7b53254f747cbabc6672a28afc7ce3369120f Mon Sep 17 00:00:00 2001 From: Kisa Date: Mon, 4 May 2026 20:28:20 -0400 Subject: [PATCH] =?UTF-8?q?Initial=20release=20=E2=80=94=20Skool=20communi?= =?UTF-8?q?ty=20lesson=20scraper?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Downloads lessons from any Skool classroom to local Markdown files. Cross-platform (Mac/Windows/Linux), membership-gated, safe to re-run. Co-Authored-By: Claude Sonnet 4.6 --- README.md | 95 ++++++++++++++++++++++ requirements.txt | 2 + scrape.py | 204 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 301 insertions(+) create mode 100644 README.md create mode 100644 requirements.txt create mode 100644 scrape.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..426e689 --- /dev/null +++ b/README.md @@ -0,0 +1,95 @@ +# skool-lesson-scrape + +Download lessons from any Skool community classroom to local Markdown files. + +- Works on Mac, Windows, and Linux +- Skips lessons already saved — safe to re-run when new content is added +- Saves one `.md` file per lesson: `Course Name -- Lesson Title.md` +- Respects your membership tier — only downloads content your account can access +- Works great with Obsidian, Notion, or any Markdown-based knowledge system + +--- + +## Requirements + +- Python 3.8 or later +- A paid Skool account with access to the community you want to scrape + +--- + +## Setup + +**1. Install dependencies** + +```bash +pip install -r requirements.txt +playwright install chromium +``` + +**2. 
Configure the script** + +Open `scrape.py` and edit the two lines at the top of the CONFIG section: + +```python +COMMUNITY = "navaigate" # slug from your Skool community URL +OUTPUT_DIR = Path.home() / "skool-lessons" # where .md files are saved +``` + +- `COMMUNITY`: find it in your Skool URL — `skool.com/your-community-slug` +- `OUTPUT_DIR`: any folder on your machine; created automatically if it doesn't exist + +**Obsidian users** — point `OUTPUT_DIR` at a folder inside your vault: +```python +OUTPUT_DIR = Path.home() / "Documents" / "MyVault" / "Lessons" +``` + +**Windows users** — use a raw string for backslash paths: +```python +OUTPUT_DIR = Path(r"C:\Users\YourName\Documents\skool-lessons") +``` + +--- + +## Usage + +**Full scrape** — downloads all lessons you have access to: +```bash +python scrape.py +``` + +A browser window will open. Log in to Skool normally (email/password or Google). The script takes over automatically once you land on the community. + +**Re-run anytime** — already-saved lessons are skipped automatically. + +**Debug mode** — inspect page structure without saving anything: +```bash +python scrape.py --discover +``` + +--- + +## How it works + +Skool embeds course and lesson structure as JSON in the page source (`__NEXT_DATA__`). The script reads that directly to get course and lesson IDs, then navigates to each lesson and extracts the body text from Skool's TipTap editor (`.ProseMirror` selector). No fragile DOM scraping — the JSON structure is stable. + +--- + +## Notes + +- Content is gated by your own Skool membership — you can only download lessons your account has access to +- This tool is for personal offline backup, not redistribution of community content +- Re-running after new lessons are posted will only download what's new + +--- + +## Troubleshooting + +**"No courses found"** — a diagnostic HTML file is saved to your system temp folder. The page structure may have changed; open an issue with the HTML attached. 
+ +**Browser closes immediately** — make sure you completed the Playwright browser install: `playwright install chromium` + +**Lessons saving as navigation boilerplate** — run `--discover` and open an issue with the output. + +--- + +Built by [Kisa Fenn](https://github.com/kisasttil-gif) — STTIL Solutions diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..d5a2b07 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +playwright>=1.40.0 +html2text>=2024.2.26 diff --git a/scrape.py b/scrape.py new file mode 100644 index 0000000..4e83a38 --- /dev/null +++ b/scrape.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 +""" +skool-lesson-scrape +Downloads lessons from a Skool community classroom to a local folder as Markdown files. +Skips lessons already saved — safe to re-run when new content is added. + +Usage: + python scrape.py # full scrape + python scrape.py --discover # inspect page structure without saving (debug) + +Setup: see README.md +""" + +import asyncio +import argparse +import re +import json +import tempfile +import html2text +from pathlib import Path +from playwright.async_api import async_playwright + +# ── CONFIG — edit these two lines ──────────────────────────────────────────── +# +# COMMUNITY: the slug from your Skool community URL +# e.g. 
# COMMUNITY example: https://www.skool.com/navaigate → "navaigate"
COMMUNITY = "navaigate"
#
# OUTPUT_DIR: folder where .md files are saved (created if it doesn't exist)
#   Mac/Linux:  Path.home() / "skool-lessons"
#   Windows:    Path(r"C:\Users\YourName\Documents\skool-lessons")
#   Obsidian:   Path.home() / "Documents" / "ObsidianVault" / "Lessons"
OUTPUT_DIR = Path.home() / "skool-lessons"
#
# ─────────────────────────────────────────────────────────────────────────────

BASE = "https://www.skool.com"
CLASSROOM = f"{BASE}/{COMMUNITY}/classroom"
DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"

# Tried in order by lesson_body(); the first selector whose first match holds
# more than 200 characters of inner HTML wins.
CONTENT_SELECTORS = [
    ".ProseMirror",              # Skool's TipTap editor — primary target
    "[class*='lesson-content']",
    "[class*='lessonContent']",
    "[class*='module-content']",
    "[class*='content-body']",
    "article",
    "main",
]

# Characters removed from titles before they are used in a file name.
_UNSAFE_CHARS_RE = re.compile(r'[<>:"/\\|?*\n\r\t]')

# The JSON payload lives in <script id="__NEXT_DATA__" ...>…</script>.
_NEXT_DATA_RE = re.compile(
    r'<script[^>]*\bid=["\']__NEXT_DATA__["\'][^>]*>(.*?)</script>',
    re.DOTALL,
)


def sanitize(name: str) -> str:
    """Return *name* stripped of characters that are unsafe in file names.

    Removes ``< > : " / \\ | ? *`` plus newlines and tabs, trims surrounding
    whitespace and then surrounding dots, and caps the result at 120 characters.
    """
    cleaned = _UNSAFE_CHARS_RE.sub("", str(name)).strip().strip(".")
    return cleaned[:120]


def existing_stems() -> set:
    """Return the stems of all ``*.md`` files in OUTPUT_DIR, creating it if needed."""
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    return {f.stem for f in OUTPUT_DIR.glob("*.md")}


def next_data(html: str) -> dict:
    """Extract and parse the ``__NEXT_DATA__`` JSON embedded in *html*.

    Returns an empty dict when the script tag is absent, when its contents are
    not valid JSON, or when the JSON root is not an object, so callers can
    treat every failure as "nothing found".
    """
    m = _NEXT_DATA_RE.search(html)
    if not m:
        return {}
    try:
        data = json.loads(m.group(1))
    except json.JSONDecodeError:
        return {}
    return data if isinstance(data, dict) else {}


def html_to_md(raw: str) -> str:
    """Convert an HTML fragment to Markdown, keeping links and dropping images."""
    h = html2text.HTML2Text()
    h.body_width = 0        # html2text: 0 disables hard line wrapping
    h.ignore_links = False
    h.ignore_images = True
    return h.handle(raw)


def write_lesson(course_title: str, lesson_title: str, body: str) -> str:
    """Write one lesson to ``OUTPUT_DIR/<course> -- <lesson>.md`` and return the stem.

    The file starts with a ``# <lesson_title>`` heading followed by *body*.
    An existing file with the same stem is overwritten.
    """
    stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
    # Create the folder here too so the function works when called on its own.
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    out = OUTPUT_DIR / f"{stem}.md"
    out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
    return stem


async def lesson_body(page) -> str:
    """Return the current lesson's content as Markdown.

    Content is rendered client-side into .ProseMirror (Skool's TipTap editor).
    Each selector in CONTENT_SELECTORS is tried in order; the first one whose
    first matching element has more than 200 characters of inner HTML is used.
    If none qualifies, the whole ``document.body`` is converted instead.
    """
    for sel in CONTENT_SELECTORS:
        el = await page.query_selector(sel)
        if el:
            inner = await el.inner_html()
            if len(inner) > 200:
                return html_to_md(inner)
    return html_to_md(await page.evaluate("() => document.body.innerHTML"))


def _dig(data, *keys):
    """Walk nested dicts by *keys*; return None if any level is missing or not a dict."""
    for key in keys:
        if not isinstance(data, dict):
            return None
        data = data.get(key)
    return data


async def _open(page, url: str, settle: float) -> None:
    """Navigate to *url*, wait for the load event, then pause *settle* seconds."""
    await page.goto(url)
    await page.wait_for_load_state("load")
    # Fixed pause gives client-side rendering time to finish after "load".
    await asyncio.sleep(settle)


async def _discover(page, courses: list) -> None:
    """Debug mode: open the first lesson of the first course and dump diagnostics."""
    course_url = f"{CLASSROOM}/{courses[0]['name']}"
    await _open(page, course_url, 3)
    children = _dig(next_data(await page.content()), "props", "pageProps", "course", "children") or []
    first = (children[0].get("course") if children else None) or None
    if first:
        await _open(page, f"{course_url}?md={first.get('id', '')}", 3)
    lpp = _dig(next_data(await page.content()), "props", "pageProps") or {}
    print("Lesson pageProps keys:", list(lpp.keys()))
    DIAG_DIR.mkdir(parents=True, exist_ok=True)
    (DIAG_DIR / "lesson.html").write_text(await page.content(), encoding="utf-8")
    await page.screenshot(path=str(DIAG_DIR / "lesson.png"), full_page=True)
    print(f"Diagnostic files saved to {DIAG_DIR}")


async def run(discover: bool = False):
    """Log in interactively, then download every accessible lesson to OUTPUT_DIR.

    Args:
        discover: when True, save diagnostic files for the first lesson of the
            first accessible course instead of scraping.

    Lessons whose ``<course> -- <lesson>`` stem already exists in OUTPUT_DIR
    are skipped. Per-lesson failures are counted and reported, not raised.
    """
    existing = existing_stems()
    print(f"Output folder:         {OUTPUT_DIR}")
    print(f"Lessons already saved: {len(existing)}\n")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False, slow_mo=25)
        try:
            ctx = await browser.new_context(viewport={"width": 1440, "height": 900})
            page = await ctx.new_page()

            print("Opening Skool — please log in when the browser window appears.")
            print("The script will continue automatically once you land on the community.\n")
            await page.goto("https://www.skool.com/login")
            # NOTE(review): this glob needs "/<COMMUNITY>/" in the URL; confirm it
            # also fires when the landing URL ends at the community slug.
            await page.wait_for_url(f"**/{COMMUNITY}/**", timeout=300_000)
            print("Logged in.\n")

            await _open(page, CLASSROOM, 3)

            nd = next_data(await page.content())
            all_crses = _dig(nd, "props", "pageProps", "allCourses") or []
            courses = [c for c in all_crses if _dig(c, "metadata", "hasAccess")]
            print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")

            if not courses:
                DIAG_DIR.mkdir(parents=True, exist_ok=True)
                (DIAG_DIR / "classroom.html").write_text(await page.content(), encoding="utf-8")
                print(f"No courses found. Diagnostic HTML saved to {DIAG_DIR}")
                return

            if discover:
                await _discover(page, courses)
                return

            saved = skipped = errors = 0

            for course in courses:
                course_title = _dig(course, "metadata", "title") or course.get("name") or "Untitled"
                course_url = f"{CLASSROOM}/{course['name']}"
                print(f"Course: {course_title}")

                await _open(page, course_url, 2.5)
                children = _dig(next_data(await page.content()), "props", "pageProps", "course", "children")

                if not children:
                    print("  No lessons found — skipping\n")
                    continue

                print(f"  {len(children)} lessons")

                for child in children:
                    lesson = child.get("course") or {}
                    lesson_title = _dig(lesson, "metadata", "title") or lesson.get("name") or "Untitled"
                    lesson_id = lesson.get("id", "")
                    stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"

                    if stem in existing:
                        skipped += 1
                        continue

                    if not lesson_id:
                        # Without an id the URL would be "?md=", which cannot
                        # address this lesson; report it rather than save
                        # whatever page loads under this title.
                        errors += 1
                        print(f"  [error]  {lesson_title[:65]} — missing lesson id")
                        continue

                    try:
                        await _open(page, f"{course_url}?md={lesson_id}", 2)
                        stem = write_lesson(course_title, lesson_title, await lesson_body(page))
                        existing.add(stem)
                        saved += 1
                        print(f"  [saved]  {lesson_title[:65]}")
                    except Exception as e:
                        # Best-effort: one failed lesson must not abort the run.
                        errors += 1
                        print(f"  [error]  {lesson_title[:65]} — {e}")

                print()

            print("─" * 52)
            print(f"Done.  Saved: {saved}   Skipped: {skipped}   Errors: {errors}")
            print(f"Output: {OUTPUT_DIR}")
        finally:
            # Single exit point: runs on normal return, early return and errors.
            await browser.close()


if __name__ == "__main__":
    ap = argparse.ArgumentParser(description="Download Skool community lessons to Markdown")
    ap.add_argument("--discover", action="store_true", help="Debug page structure without saving")
    asyncio.run(run(discover=ap.parse_args().discover))