#!/usr/bin/env python3
"""
skool-lesson-scrape

Downloads lessons from a Skool community classroom to local Markdown files.
Skips lessons already saved — safe to re-run when new content is added.

Usage:
    python scrape.py <community>                        # full scrape
    python scrape.py <community> --output ~/my-lessons  # custom output folder
    python scrape.py <community> --discover             # debug without saving

Setup: see README.md
"""
|
|
|
|
import asyncio
|
|
import argparse
|
|
import re
|
|
import json
|
|
import tempfile
|
|
import html2text
|
|
from pathlib import Path
|
|
from playwright.async_api import async_playwright
|
|
|
|
# Root of every Skool URL this script builds.
BASE = "https://www.skool.com"

# Diagnostic dumps (page HTML, screenshots) go to a folder under the system
# temp directory so they never end up next to the saved lessons.
DIAG_DIR = Path(tempfile.gettempdir()).joinpath("skool_scrape_diag")

# CSS selectors tried in order when looking for the lesson content container:
# the editor node first, then class-name patterns, then generic landmarks.
CONTENT_SELECTORS = [
    ".ProseMirror",
    "[class*='lesson-content']",
    "[class*='lessonContent']",
    "[class*='module-content']",
    "[class*='content-body']",
    "article",
    "main",
]
|
|
|
|
|
|
def sanitize(name: str) -> str:
    """Turn an arbitrary title into a string that is safe to use in a file name.

    Removes the characters that are illegal in file names on common platforms
    (``< > : " / \\ | ? *``) together with carriage returns and line feeds,
    trims surrounding whitespace and dots, and caps the result at 120
    characters.

    Args:
        name: The title to clean. Non-string values are converted with ``str``.

    Returns:
        The cleaned title; may be empty if nothing usable is left.
    """
    # The escapes must stay escaped inside the literal: a physical line break
    # in a single-quoted string is a syntax error.
    cleaned = re.sub(r'[<>:"/\\|?*\r\n]', '', str(name)).strip().strip(".")
    # Truncation can expose a trailing space or dot again (both are invalid at
    # the end of a Windows file name), so trim once more after the cut.
    return cleaned[:120].rstrip(" .")
|
|
|
|
|
|
def existing_stems(output_dir: Path) -> set:
    """Collect the file stems of the Markdown files already in *output_dir*.

    The directory (including missing parents) is created first, so a fresh
    output location yields an empty set instead of an error.

    Args:
        output_dir: Folder that holds the saved ``.md`` lessons.

    Returns:
        A set with the stem (file name without ``.md``) of each Markdown file
        directly inside *output_dir*.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    stems = set()
    for md_file in output_dir.glob("*.md"):
        stems.add(md_file.stem)
    return stems
|
|
|
|
|
|
def next_data(html: str) -> dict:
    """Extract the JSON payload of the ``__NEXT_DATA__`` script tag from *html*.

    Args:
        html: Full HTML source of a page.

    Returns:
        The decoded JSON object, or an empty dict when the tag is missing,
        its content is not valid JSON, or the JSON is not an object. Callers
        chain ``.get(...)`` on the result, so a dict is always returned.
    """
    # ``id`` is not required to be the first attribute of the script tag.
    match = re.search(
        r'<script\b[^>]*\sid="__NEXT_DATA__"[^>]*>(.*?)</script>',
        html,
        re.DOTALL,
    )
    if match is None:
        return {}
    try:
        payload = json.loads(match.group(1))
    except json.JSONDecodeError:
        # Treat an unparsable payload like a missing one.
        return {}
    return payload if isinstance(payload, dict) else {}
|
|
|
|
|
|
def html_to_md(raw: str) -> str:
    """Convert an HTML fragment to Markdown with html2text.

    Args:
        raw: HTML markup to convert.

    Returns:
        The Markdown text produced by the converter.
    """
    converter = html2text.HTML2Text()
    converter.ignore_images = True   # drop images from the output
    converter.ignore_links = False   # keep hyperlinks
    converter.body_width = 0         # 0 = do not hard-wrap long lines
    markdown = converter.handle(raw)
    return markdown
|
|
|
|
|
|
def write_lesson(output_dir: Path, course_title: str, lesson_title: str, body: str) -> str:
    """Write one lesson to ``<course> -- <lesson>.md`` inside *output_dir*.

    The file starts with the lesson title as a level-1 heading, followed by a
    blank line and the lesson body. An existing file with the same name is
    overwritten.

    Args:
        output_dir: Destination folder; created if it does not exist.
        course_title: Title of the course the lesson belongs to.
        lesson_title: Title of the lesson (also used as the heading).
        body: Lesson content, already converted to Markdown.

    Returns:
        The stem of the written file (its name without ``.md``).
    """
    stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
    # Do not rely on the caller having created the folder beforehand.
    output_dir.mkdir(parents=True, exist_ok=True)
    target = output_dir / f"{stem}.md"
    # The newlines must be written as escapes: a physical line break inside a
    # single-line f-string is a syntax error.
    target.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
    return stem
|
|
|
|
|
|
async def lesson_body(page) -> str:
    """Return the content of the currently open lesson page as Markdown.

    Content is rendered client-side into .ProseMirror (Skool's TipTap editor),
    so that selector is tried first; the remaining CONTENT_SELECTORS act as
    fallbacks. The first matching element whose inner HTML is longer than 200
    characters wins. If no selector yields such an element, the whole
    document body is converted instead.

    Args:
        page: The Playwright page showing the lesson.

    Returns:
        The lesson content converted to Markdown.
    """
    for selector in CONTENT_SELECTORS:
        node = await page.query_selector(selector)
        if not node:
            continue
        markup = await node.inner_html()
        # Very short matches are skipped in favour of the next selector.
        if len(markup) > 200:
            return html_to_md(markup)

    whole_body = await page.evaluate("() => document.body.innerHTML")
    return html_to_md(whole_body)
|
|
|
|
|
|
async def _open_and_settle(page, url: str, settle_seconds: float) -> None:
    """Navigate to *url*, wait for the load event, then pause for client-side rendering."""
    await page.goto(url)
    await page.wait_for_load_state("load")
    await asyncio.sleep(settle_seconds)


async def _page_props(page) -> dict:
    """Return ``props.pageProps`` from the current page's __NEXT_DATA__ payload ({} if absent)."""
    return next_data(await page.content()).get("props", {}).get("pageProps", {})


async def _dump_html(page, filename: str) -> None:
    """Save the current page's HTML as *filename* inside DIAG_DIR."""
    DIAG_DIR.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # cannot represent every character a page may contain.
    (DIAG_DIR / filename).write_text(await page.content(), encoding="utf-8")


async def _discover(page, classroom: str, course: dict) -> None:
    """Open the first lesson of *course* (if any) and dump diagnostics without saving lessons."""
    course_url = f"{classroom}/{course['name']}"
    await _open_and_settle(page, course_url, 3)

    children = (await _page_props(page)).get("course", {}).get("children", [])
    first = children[0].get("course") if children else None
    if first:
        await _open_and_settle(page, f"{course_url}?md={first.get('id', '')}", 3)

    # Diagnostics are captured for whatever page is open at this point, so a
    # course without lessons still produces a dump of the course page.
    lpp = await _page_props(page)
    print("Lesson pageProps keys:", list(lpp.keys()))
    await _dump_html(page, "lesson.html")
    await page.screenshot(path=str(DIAG_DIR / "lesson.png"), full_page=True)
    print(f"Diagnostic files saved to {DIAG_DIR}")


async def _scrape_course(page, classroom: str, course: dict, output_dir: Path, existing: set) -> tuple:
    """Save every not-yet-saved lesson of *course*; return ``(saved, skipped, errors)``."""
    course_title = course.get("metadata", {}).get("title") or course.get("name") or "Untitled"
    course_url = f"{classroom}/{course['name']}"
    print(f"Course: {course_title}")

    await _open_and_settle(page, course_url, 2.5)
    children = (await _page_props(page)).get("course", {}).get("children", [])
    if not children:
        print(" No lessons found — skipping\n")
        return 0, 0, 0

    print(f" {len(children)} lessons")
    saved = skipped = errors = 0

    for child in children:
        lesson = child.get("course", {})
        lesson_title = lesson.get("metadata", {}).get("title") or lesson.get("name") or "Untitled"
        lesson_id = lesson.get("id", "")
        stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"

        if stem in existing:
            skipped += 1
            continue

        # Deliberately broad: one failing lesson must not abort the whole run.
        # The failure is counted and reported instead.
        try:
            await _open_and_settle(page, f"{course_url}?md={lesson_id}", 2)
            stem = write_lesson(output_dir, course_title, lesson_title, await lesson_body(page))
            existing.add(stem)
            saved += 1
            print(f" [saved] {lesson_title[:65]}")
        except Exception as e:
            errors += 1
            print(f" [error] {lesson_title[:65]} — {e}")

    print()
    return saved, skipped, errors


async def run(community: str, output_dir: Path, discover: bool = False):
    """Log in to Skool interactively and save the community's lessons as Markdown.

    A visible Chromium window is opened on the login page; the function waits
    up to five minutes for the browser to reach a URL under the community
    before it continues. It then reads the classroom's course list from the
    page's __NEXT_DATA__ payload and processes every course marked as
    accessible.

    Args:
        community: Community slug as it appears in the Skool URL.
        output_dir: Folder for the Markdown files; lessons whose file already
            exists there are skipped.
        discover: When true, no lessons are saved. Instead the first
            accessible course (and its first lesson, if any) is opened and
            the page HTML plus a screenshot are written to DIAG_DIR.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    classroom = f"{BASE}/{community}/classroom"
    existing = existing_stems(output_dir)
    print(f"Community: {community}")
    print(f"Output: {output_dir}")
    print(f"Lessons already saved: {len(existing)}\n")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False, slow_mo=25)
        # try/finally closes the browser on every exit path, including errors.
        try:
            ctx = await browser.new_context(viewport={"width": 1440, "height": 900})
            page = await ctx.new_page()

            print("Opening Skool — please log in when the browser window appears.")
            print("The script will continue automatically once you land on the community.\n")
            await page.goto(f"{BASE}/login")
            await page.wait_for_url(f"**/{community}/**", timeout=300_000)
            print("Logged in.\n")

            await _open_and_settle(page, classroom, 3)
            all_courses = (await _page_props(page)).get("allCourses") or []
            courses = [c for c in all_courses if c.get("metadata", {}).get("hasAccess", 0)]
            print(f"Accessible courses: {len(courses)} of {len(all_courses)} total\n")

            if not courses:
                await _dump_html(page, "classroom.html")
                print(f"No courses found. Diagnostic HTML saved to {DIAG_DIR}")
                return

            if discover:
                await _discover(page, classroom, courses[0])
                return

            saved = skipped = errors = 0
            for course in courses:
                course_saved, course_skipped, course_errors = await _scrape_course(
                    page, classroom, course, output_dir, existing
                )
                saved += course_saved
                skipped += course_skipped
                errors += course_errors

            print("─" * 52)
            print(f"Done. Saved: {saved} Skipped: {skipped} Errors: {errors}")
            print(f"Output: {output_dir}")
        finally:
            await browser.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options, then hand over to run().
    parser = argparse.ArgumentParser(description="Download Skool community lessons to Markdown")
    parser.add_argument(
        "community",
        help="Community slug from your Skool URL (e.g. skool.com/my-community → my-community)",
    )
    default_output = str(Path.home() / "skool-lessons")
    parser.add_argument(
        "--output", "-o",
        default=default_output,
        help="Folder to save Markdown files (default: ~/skool-lessons)",
    )
    parser.add_argument("--discover", action="store_true", help="Debug page structure without saving")
    cli = parser.parse_args()

    # Expand "~" and make the path absolute before it is used anywhere.
    target_dir = Path(cli.output).expanduser().resolve()
    asyncio.run(run(community=cli.community, output_dir=target_dir, discover=cli.discover))
|