Make community and output dir configurable via CLI args

This commit is contained in:
sttil 2026-05-06 20:07:53 +00:00
parent 56c7b53254
commit 546e7a0b87

View file

@ -1,12 +1,13 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
skool-lesson-scrape skool-lesson-scrape
Downloads lessons from a Skool community classroom to a local folder as Markdown files. Downloads lessons from a Skool community classroom to local Markdown files.
Skips lessons already saved — safe to re-run when new content is added. Skips lessons already saved — safe to re-run when new content is added.
Usage: Usage:
python scrape.py # full scrape python scrape.py <community> # full scrape
python scrape.py --discover # inspect page structure without saving (debug) python scrape.py <community> --output ~/my-lessons # custom output folder
python scrape.py <community> --discover # debug without saving
Setup: see README.md Setup: see README.md
""" """
@ -20,26 +21,11 @@ import html2text
from pathlib import Path from pathlib import Path
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
# ── CONFIG — edit these two lines ──────────────────────────────────────────── BASE = "https://www.skool.com"
# DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"
# COMMUNITY: the slug from your Skool community URL
# e.g. https://www.skool.com/navaigate → "navaigate"
COMMUNITY = "navaigate"
#
# OUTPUT_DIR: folder where .md files are saved (created if it doesn't exist)
# Mac/Linux: Path.home() / "skool-lessons"
# Windows: Path(r"C:\Users\YourName\Documents\skool-lessons")
# Obsidian: Path.home() / "Documents" / "ObsidianVault" / "Lessons"
OUTPUT_DIR = Path.home() / "skool-lessons"
#
# ─────────────────────────────────────────────────────────────────────────────
BASE = "https://www.skool.com"
CLASSROOM = f"{BASE}/{COMMUNITY}/classroom"
DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"
CONTENT_SELECTORS = [ CONTENT_SELECTORS = [
".ProseMirror", # Skool's TipTap editor — primary target ".ProseMirror",
"[class*='lesson-content']", "[class*='lesson-content']",
"[class*='lessonContent']", "[class*='lessonContent']",
"[class*='module-content']", "[class*='module-content']",
@ -50,13 +36,14 @@ CONTENT_SELECTORS = [
def sanitize(name: str) -> str: def sanitize(name: str) -> str:
name = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".") name = re.sub(r'[<>:"/\|?*
]', '', str(name)).strip().strip(".")
return name[:120] return name[:120]
def existing_stems() -> set: def existing_stems(output_dir: Path) -> set:
OUTPUT_DIR.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
return {f.stem for f in OUTPUT_DIR.glob("*.md")} return {f.stem for f in output_dir.glob("*.md")}
def next_data(html: str) -> dict: def next_data(html: str) -> dict:
@ -72,10 +59,12 @@ def html_to_md(raw: str) -> str:
return h.handle(raw) return h.handle(raw)
def write_lesson(course_title: str, lesson_title: str, body: str) -> str: def write_lesson(output_dir: Path, course_title: str, lesson_title: str, body: str) -> str:
stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}" stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
out = OUTPUT_DIR / f"{stem}.md" out = output_dir / f"{stem}.md"
out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8") out.write_text(f"# {lesson_title}
{body}", encoding="utf-8")
return stem return stem
@ -90,10 +79,13 @@ async def lesson_body(page) -> str:
return html_to_md(await page.evaluate("() => document.body.innerHTML")) return html_to_md(await page.evaluate("() => document.body.innerHTML"))
async def run(discover: bool = False): async def run(community: str, output_dir: Path, discover: bool = False):
existing = existing_stems() classroom = f"{BASE}/{community}/classroom"
print(f"Output folder: {OUTPUT_DIR}") existing = existing_stems(output_dir)
print(f"Lessons already saved: {len(existing)}\n") print(f"Community: {community}")
print(f"Output: {output_dir}")
print(f"Lessons already saved: {len(existing)}
")
async with async_playwright() as p: async with async_playwright() as p:
browser = await p.chromium.launch(headless=False, slow_mo=25) browser = await p.chromium.launch(headless=False, slow_mo=25)
@ -101,19 +93,22 @@ async def run(discover: bool = False):
page = await ctx.new_page() page = await ctx.new_page()
print("Opening Skool — please log in when the browser window appears.") print("Opening Skool — please log in when the browser window appears.")
print("The script will continue automatically once you land on the community.\n") print("The script will continue automatically once you land on the community.
")
await page.goto("https://www.skool.com/login") await page.goto("https://www.skool.com/login")
await page.wait_for_url(f"**/{COMMUNITY}/**", timeout=300_000) await page.wait_for_url(f"**/{community}/**", timeout=300_000)
print("Logged in.\n") print("Logged in.
")
await page.goto(CLASSROOM) await page.goto(classroom)
await page.wait_for_load_state("load") await page.wait_for_load_state("load")
await asyncio.sleep(3) await asyncio.sleep(3)
nd = next_data(await page.content()) nd = next_data(await page.content())
all_crses = nd.get("props", {}).get("pageProps", {}).get("allCourses", []) all_crses = nd.get("props", {}).get("pageProps", {}).get("allCourses", [])
courses = [c for c in all_crses if c.get("metadata", {}).get("hasAccess", 0)] courses = [c for c in all_crses if c.get("metadata", {}).get("hasAccess", 0)]
print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n") print(f"Accessible courses: {len(courses)} of {len(all_crses)} total
")
if not courses: if not courses:
DIAG_DIR.mkdir(parents=True, exist_ok=True) DIAG_DIR.mkdir(parents=True, exist_ok=True)
@ -123,7 +118,7 @@ async def run(discover: bool = False):
return return
if discover: if discover:
course_url = f"{CLASSROOM}/{courses[0]['name']}" course_url = f"{classroom}/{courses[0]['name']}"
await page.goto(course_url) await page.goto(course_url)
await page.wait_for_load_state("load") await page.wait_for_load_state("load")
await asyncio.sleep(3) await asyncio.sleep(3)
@ -147,7 +142,7 @@ async def run(discover: bool = False):
for course in courses: for course in courses:
course_title = course["metadata"]["title"] course_title = course["metadata"]["title"]
course_url = f"{CLASSROOM}/{course['name']}" course_url = f"{classroom}/{course['name']}"
print(f"Course: {course_title}") print(f"Course: {course_title}")
await page.goto(course_url) await page.goto(course_url)
@ -163,7 +158,8 @@ async def run(discover: bool = False):
) )
if not children: if not children:
print(" No lessons found — skipping\n") print(" No lessons found — skipping
")
continue continue
print(f" {len(children)} lessons") print(f" {len(children)} lessons")
@ -182,7 +178,7 @@ async def run(discover: bool = False):
await page.goto(f"{course_url}?md={lesson_id}") await page.goto(f"{course_url}?md={lesson_id}")
await page.wait_for_load_state("load") await page.wait_for_load_state("load")
await asyncio.sleep(2) await asyncio.sleep(2)
stem = write_lesson(course_title, lesson_title, await lesson_body(page)) stem = write_lesson(output_dir, course_title, lesson_title, await lesson_body(page))
existing.add(stem) existing.add(stem)
saved += 1 saved += 1
print(f" [saved] {lesson_title[:65]}") print(f" [saved] {lesson_title[:65]}")
@ -194,11 +190,25 @@ async def run(discover: bool = False):
print("" * 52) print("" * 52)
print(f"Done. Saved: {saved} Skipped: {skipped} Errors: {errors}") print(f"Done. Saved: {saved} Skipped: {skipped} Errors: {errors}")
print(f"Output: {OUTPUT_DIR}") print(f"Output: {output_dir}")
await browser.close() await browser.close()
if __name__ == "__main__": if __name__ == "__main__":
ap = argparse.ArgumentParser(description="Download Skool community lessons to Markdown") ap = argparse.ArgumentParser(description="Download Skool community lessons to Markdown")
ap.add_argument(
"community",
help="Community slug from your Skool URL (e.g. skool.com/my-community → my-community)"
)
ap.add_argument(
"--output", "-o",
default=str(Path.home() / "skool-lessons"),
help="Folder to save Markdown files (default: ~/skool-lessons)"
)
ap.add_argument("--discover", action="store_true", help="Debug page structure without saving") ap.add_argument("--discover", action="store_true", help="Debug page structure without saving")
asyncio.run(run(discover=ap.parse_args().discover)) args = ap.parse_args()
asyncio.run(run(
community=args.community,
output_dir=Path(args.output).expanduser().resolve(),
discover=args.discover,
))