Make community and output dir configurable via CLI args

This commit is contained in:
sttil 2026-05-06 20:07:53 +00:00
parent 56c7b53254
commit 546e7a0b87

View file

@ -1,12 +1,13 @@
#!/usr/bin/env python3
"""
skool-lesson-scrape
Downloads lessons from a Skool community classroom to a local folder as Markdown files.
Downloads lessons from a Skool community classroom to local Markdown files.
Skips lessons that are already saved, so it is safe to re-run when new content is added.
Usage:
python scrape.py # full scrape
python scrape.py --discover # inspect page structure without saving (debug)
python scrape.py <community> # full scrape
python scrape.py <community> --output ~/my-lessons # custom output folder
python scrape.py <community> --discover # debug without saving
Setup: see README.md
"""
@ -20,26 +21,11 @@ import html2text
from pathlib import Path
from playwright.async_api import async_playwright
# ── CONFIG — edit these two lines ────────────────────────────────────────────
#
# COMMUNITY: the slug from your Skool community URL
# e.g. https://www.skool.com/navaigate → "navaigate"
COMMUNITY = "navaigate"
#
# OUTPUT_DIR: folder where .md files are saved (created if it doesn't exist)
# Mac/Linux: Path.home() / "skool-lessons"
# Windows: Path(r"C:\Users\YourName\Documents\skool-lessons")
# Obsidian: Path.home() / "Documents" / "ObsidianVault" / "Lessons"
OUTPUT_DIR = Path.home() / "skool-lessons"
#
# ─────────────────────────────────────────────────────────────────────────────
BASE = "https://www.skool.com"
CLASSROOM = f"{BASE}/{COMMUNITY}/classroom"
DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"
BASE = "https://www.skool.com"
DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"
CONTENT_SELECTORS = [
".ProseMirror", # Skool's TipTap editor — primary target
".ProseMirror",
"[class*='lesson-content']",
"[class*='lessonContent']",
"[class*='module-content']",
@ -50,13 +36,14 @@ CONTENT_SELECTORS = [
def sanitize(name: str) -> str:
    """Return *name* reduced to a string usable as a file-name component.

    Characters that are not allowed in file names on common platforms
    (angle brackets, colon, double quote, slash, backslash, pipe, question
    mark, asterisk) are removed, as are newlines, carriage returns and tabs.
    Surrounding whitespace and leading/trailing dots are trimmed, and the
    result is capped at 120 characters.

    Non-string input is converted with ``str()`` first.
    """
    # The control characters must be written as escapes inside the raw
    # pattern: a real line break inside a single-quoted literal is a syntax
    # error, and the backslash must be doubled to stay in the character class.
    name = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".")
    return name[:120]
def existing_stems(output_dir: Path) -> set:
    """Return the stems of all ``.md`` files directly inside *output_dir*.

    The directory (including any missing parents) is created first, so a
    fresh output folder yields an empty set instead of raising.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    return {f.stem for f in output_dir.glob("*.md")}
def next_data(html: str) -> dict:
@ -72,10 +59,12 @@ def html_to_md(raw: str) -> str:
return h.handle(raw)
def write_lesson(output_dir: Path, course_title: str, lesson_title: str, body: str) -> str:
    """Write one lesson as a Markdown file inside *output_dir*.

    The file is named ``<course> -- <lesson>.md``, with both titles passed
    through ``sanitize``. Its content is the unmodified lesson title as a
    level-1 heading, a blank line, then *body*, encoded as UTF-8.

    Returns the file stem (the name without ``.md``).
    """
    stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
    out = output_dir / f"{stem}.md"
    # Line breaks are written as \n escapes; a raw line break inside a
    # single-line f-string is a syntax error.
    out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
    return stem
@ -90,10 +79,13 @@ async def lesson_body(page) -> str:
return html_to_md(await page.evaluate("() => document.body.innerHTML"))
async def run(discover: bool = False):
existing = existing_stems()
print(f"Output folder: {OUTPUT_DIR}")
print(f"Lessons already saved: {len(existing)}\n")
async def run(community: str, output_dir: Path, discover: bool = False):
classroom = f"{BASE}/{community}/classroom"
existing = existing_stems(output_dir)
print(f"Community: {community}")
print(f"Output: {output_dir}")
print(f"Lessons already saved: {len(existing)}
")
async with async_playwright() as p:
browser = await p.chromium.launch(headless=False, slow_mo=25)
@ -101,19 +93,22 @@ async def run(discover: bool = False):
page = await ctx.new_page()
print("Opening Skool — please log in when the browser window appears.")
print("The script will continue automatically once you land on the community.\n")
print("The script will continue automatically once you land on the community.
")
await page.goto("https://www.skool.com/login")
await page.wait_for_url(f"**/{COMMUNITY}/**", timeout=300_000)
print("Logged in.\n")
await page.wait_for_url(f"**/{community}/**", timeout=300_000)
print("Logged in.
")
await page.goto(CLASSROOM)
await page.goto(classroom)
await page.wait_for_load_state("load")
await asyncio.sleep(3)
nd = next_data(await page.content())
all_crses = nd.get("props", {}).get("pageProps", {}).get("allCourses", [])
courses = [c for c in all_crses if c.get("metadata", {}).get("hasAccess", 0)]
print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")
print(f"Accessible courses: {len(courses)} of {len(all_crses)} total
")
if not courses:
DIAG_DIR.mkdir(parents=True, exist_ok=True)
@ -123,7 +118,7 @@ async def run(discover: bool = False):
return
if discover:
course_url = f"{CLASSROOM}/{courses[0]['name']}"
course_url = f"{classroom}/{courses[0]['name']}"
await page.goto(course_url)
await page.wait_for_load_state("load")
await asyncio.sleep(3)
@ -147,7 +142,7 @@ async def run(discover: bool = False):
for course in courses:
course_title = course["metadata"]["title"]
course_url = f"{CLASSROOM}/{course['name']}"
course_url = f"{classroom}/{course['name']}"
print(f"Course: {course_title}")
await page.goto(course_url)
@ -163,7 +158,8 @@ async def run(discover: bool = False):
)
if not children:
print(" No lessons found — skipping\n")
print(" No lessons found — skipping
")
continue
print(f" {len(children)} lessons")
@ -182,7 +178,7 @@ async def run(discover: bool = False):
await page.goto(f"{course_url}?md={lesson_id}")
await page.wait_for_load_state("load")
await asyncio.sleep(2)
stem = write_lesson(course_title, lesson_title, await lesson_body(page))
stem = write_lesson(output_dir, course_title, lesson_title, await lesson_body(page))
existing.add(stem)
saved += 1
print(f" [saved] {lesson_title[:65]}")
@ -194,11 +190,25 @@ async def run(discover: bool = False):
print("" * 52)
print(f"Done. Saved: {saved} Skipped: {skipped} Errors: {errors}")
print(f"Output: {OUTPUT_DIR}")
print(f"Output: {output_dir}")
await browser.close()
if __name__ == "__main__":
    # Command-line entry point: the community slug is a required positional
    # argument; the output folder and discover mode are optional.
    ap = argparse.ArgumentParser(description="Download Skool community lessons to Markdown")
    ap.add_argument(
        "community",
        help="Community slug from your Skool URL (e.g. skool.com/my-community → my-community)"
    )
    ap.add_argument(
        "--output", "-o",
        default=str(Path.home() / "skool-lessons"),
        help="Folder to save Markdown files (default: ~/skool-lessons)"
    )
    ap.add_argument("--discover", action="store_true", help="Debug page structure without saving")

    # Parse once and call run() once, passing every argument it requires.
    args = ap.parse_args()
    asyncio.run(run(
        community=args.community,
        # Expand "~" and make the path absolute before handing it to run().
        output_dir=Path(args.output).expanduser().resolve(),
        discover=args.discover,
    ))