diff --git a/scrape.py b/scrape.py index 4e83a38..695cbdd 100644 --- a/scrape.py +++ b/scrape.py @@ -1,12 +1,13 @@ #!/usr/bin/env python3 """ skool-lesson-scrape -Downloads lessons from a Skool community classroom to a local folder as Markdown files. +Downloads lessons from a Skool community classroom to local Markdown files. Skips lessons already saved — safe to re-run when new content is added. Usage: - python scrape.py # full scrape - python scrape.py --discover # inspect page structure without saving (debug) + python scrape.py # full scrape + python scrape.py --output ~/my-lessons # custom output folder + python scrape.py --discover # debug without saving Setup: see README.md """ @@ -20,26 +21,11 @@ import html2text from pathlib import Path from playwright.async_api import async_playwright -# ── CONFIG — edit these two lines ──────────────────────────────────────────── -# -# COMMUNITY: the slug from your Skool community URL -# e.g. https://www.skool.com/navaigate → "navaigate" -COMMUNITY = "navaigate" -# -# OUTPUT_DIR: folder where .md files are saved (created if it doesn't exist) -# Mac/Linux: Path.home() / "skool-lessons" -# Windows: Path(r"C:\Users\YourName\Documents\skool-lessons") -# Obsidian: Path.home() / "Documents" / "ObsidianVault" / "Lessons" -OUTPUT_DIR = Path.home() / "skool-lessons" -# -# ───────────────────────────────────────────────────────────────────────────── - -BASE = "https://www.skool.com" -CLASSROOM = f"{BASE}/{COMMUNITY}/classroom" -DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag" +BASE = "https://www.skool.com" +DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag" CONTENT_SELECTORS = [ - ".ProseMirror", # Skool's TipTap editor — primary target + ".ProseMirror", "[class*='lesson-content']", "[class*='lessonContent']", "[class*='module-content']", @@ -50,13 +36,14 @@ CONTENT_SELECTORS = [ def sanitize(name: str) -> str: - name = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".") + name = re.sub(r'[<>:"/\|?* + ]', '', str(name)).strip().strip(".") return name[:120] -def existing_stems() -> set: - OUTPUT_DIR.mkdir(parents=True, exist_ok=True) - return {f.stem for f in OUTPUT_DIR.glob("*.md")} +def existing_stems(output_dir: Path) -> set: + output_dir.mkdir(parents=True, exist_ok=True) + return {f.stem for f in output_dir.glob("*.md")} def next_data(html: str) -> dict: @@ -72,10 +59,12 @@ def html_to_md(raw: str) -> str: return h.handle(raw) -def write_lesson(course_title: str, lesson_title: str, body: str) -> str: +def write_lesson(output_dir: Path, course_title: str, lesson_title: str, body: str) -> str: stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}" - out = OUTPUT_DIR / f"{stem}.md" - out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8") + out = output_dir / f"{stem}.md" + out.write_text(f"# {lesson_title} + +{body}", encoding="utf-8") return stem @@ -90,10 +79,13 @@ async def lesson_body(page) -> str: return html_to_md(await page.evaluate("() => document.body.innerHTML")) -async def run(discover: bool = False): - existing = existing_stems() - print(f"Output folder: {OUTPUT_DIR}") - print(f"Lessons already saved: {len(existing)}\n") +async def run(community: str, output_dir: Path, discover: bool = False): + classroom = f"{BASE}/{community}/classroom" + existing = existing_stems(output_dir) + print(f"Community: {community}") + print(f"Output: {output_dir}") + print(f"Lessons already saved: {len(existing)} +") async with async_playwright() as p: browser = await p.chromium.launch(headless=False, slow_mo=25) @@ -101,19 +93,22 @@ async def run(discover: bool = False): page = await ctx.new_page() print("Opening Skool — please log in when the browser window appears.") - print("The script will continue automatically once you land on the community.\n") + print("The script will continue automatically once you land on the community. +") await page.goto("https://www.skool.com/login") - await page.wait_for_url(f"**/{COMMUNITY}/**", timeout=300_000) - print("Logged in.\n") + await page.wait_for_url(f"**/{community}/**", timeout=300_000) + print("Logged in. +") - await page.goto(CLASSROOM) + await page.goto(classroom) await page.wait_for_load_state("load") await asyncio.sleep(3) nd = next_data(await page.content()) all_crses = nd.get("props", {}).get("pageProps", {}).get("allCourses", []) courses = [c for c in all_crses if c.get("metadata", {}).get("hasAccess", 0)] - print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n") + print(f"Accessible courses: {len(courses)} of {len(all_crses)} total +") if not courses: DIAG_DIR.mkdir(parents=True, exist_ok=True) @@ -123,7 +118,7 @@ async def run(discover: bool = False): return if discover: - course_url = f"{CLASSROOM}/{courses[0]['name']}" + course_url = f"{classroom}/{courses[0]['name']}" await page.goto(course_url) await page.wait_for_load_state("load") await asyncio.sleep(3) @@ -147,7 +142,7 @@ async def run(discover: bool = False): for course in courses: course_title = course["metadata"]["title"] - course_url = f"{CLASSROOM}/{course['name']}" + course_url = f"{classroom}/{course['name']}" print(f"Course: {course_title}") await page.goto(course_url) @@ -163,7 +158,8 @@ async def run(discover: bool = False): ) if not children: - print(" No lessons found — skipping\n") + print(" No lessons found — skipping +") continue print(f" {len(children)} lessons") @@ -182,7 +178,7 @@ async def run(discover: bool = False): await page.goto(f"{course_url}?md={lesson_id}") await page.wait_for_load_state("load") await asyncio.sleep(2) - stem = write_lesson(course_title, lesson_title, await lesson_body(page)) + stem = write_lesson(output_dir, course_title, lesson_title, await lesson_body(page)) existing.add(stem) saved += 1 print(f" [saved] {lesson_title[:65]}") @@ -194,11 +190,25 @@ async def run(discover: bool = False): print("─" * 52) print(f"Done. Saved: {saved} Skipped: {skipped} Errors: {errors}") - print(f"Output: {OUTPUT_DIR}") + print(f"Output: {output_dir}") await browser.close() if __name__ == "__main__": ap = argparse.ArgumentParser(description="Download Skool community lessons to Markdown") + ap.add_argument( + "community", + help="Community slug from your Skool URL (e.g. skool.com/my-community → my-community)" + ) + ap.add_argument( + "--output", "-o", + default=str(Path.home() / "skool-lessons"), + help="Folder to save Markdown files (default: ~/skool-lessons)" + ) ap.add_argument("--discover", action="store_true", help="Debug page structure without saving") - asyncio.run(run(discover=ap.parse_args().discover)) + args = ap.parse_args() + asyncio.run(run( + community=args.community, + output_dir=Path(args.output).expanduser().resolve(), + discover=args.discover, + ))