Make community and output dir configurable via CLI args

2026-05-06 20:07:53 +00:00 · 2026-05-06 20:07:53 +00:00 · 546e7a0b87
commit 546e7a0b87
parent 56c7b53254
1 changed files with 53 additions and 43 deletions
--- a/scrape.py
+++ b/scrape.py
@ -1,12 +1,13 @@
 #!/usr/bin/env python3
 """
 skool-lesson-scrape
-Downloads lessons from a Skool community classroom to a local folder as Markdown files.
+Downloads lessons from a Skool community classroom to local Markdown files.
 Skips lessons already saved — safe to re-run when new content is added.

 Usage:
-    python scrape.py               # full scrape
-    python scrape.py --discover    # inspect page structure without saving (debug)
+    python scrape.py <community>                        # full scrape
+    python scrape.py <community> --output ~/my-lessons  # custom output folder
+    python scrape.py <community> --discover             # debug without saving

 Setup: see README.md
 """
@ -20,26 +21,11 @@ import html2text
 from pathlib import Path
 from playwright.async_api import async_playwright

-# ── CONFIG — edit these two lines ────────────────────────────────────────────
-#
-# COMMUNITY: the slug from your Skool community URL
-#   e.g. https://www.skool.com/navaigate  →  "navaigate"
-COMMUNITY = "navaigate"
-#
-# OUTPUT_DIR: folder where .md files are saved (created if it doesn't exist)
-#   Mac/Linux: Path.home() / "skool-lessons"
-#   Windows:   Path(r"C:\Users\YourName\Documents\skool-lessons")
-#   Obsidian:  Path.home() / "Documents" / "ObsidianVault" / "Lessons"
-OUTPUT_DIR = Path.home() / "skool-lessons"
-#
-# ─────────────────────────────────────────────────────────────────────────────
-
-BASE      = "https://www.skool.com"
-CLASSROOM = f"{BASE}/{COMMUNITY}/classroom"
-DIAG_DIR  = Path(tempfile.gettempdir()) / "skool_scrape_diag"
+BASE     = "https://www.skool.com"
+DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"

 CONTENT_SELECTORS = [
-    ".ProseMirror",           # Skool's TipTap editor — primary target
+    ".ProseMirror",
    "[class*='lesson-content']",
    "[class*='lessonContent']",
    "[class*='module-content']",
@ -50,13 +36,14 @@ CONTENT_SELECTORS = [


 def sanitize(name: str) -> str:
-    name = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".")
+    name = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".")
    return name[:120]


-def existing_stems() -> set:
-    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
-    return {f.stem for f in OUTPUT_DIR.glob("*.md")}
+def existing_stems(output_dir: Path) -> set:
+    output_dir.mkdir(parents=True, exist_ok=True)
+    return {f.stem for f in output_dir.glob("*.md")}


 def next_data(html: str) -> dict:
@ -72,10 +59,12 @@ def html_to_md(raw: str) -> str:
    return h.handle(raw)


-def write_lesson(course_title: str, lesson_title: str, body: str) -> str:
+def write_lesson(output_dir: Path, course_title: str, lesson_title: str, body: str) -> str:
    stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
-    out  = OUTPUT_DIR / f"{stem}.md"
-    out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
+    out  = output_dir / f"{stem}.md"
+    out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
    return stem


@ -90,10 +79,13 @@ async def lesson_body(page) -> str:
    return html_to_md(await page.evaluate("() => document.body.innerHTML"))


-async def run(discover: bool = False):
-    existing = existing_stems()
-    print(f"Output folder: {OUTPUT_DIR}")
-    print(f"Lessons already saved: {len(existing)}\n")
+async def run(community: str, output_dir: Path, discover: bool = False):
+    classroom = f"{BASE}/{community}/classroom"
+    existing  = existing_stems(output_dir)
+    print(f"Community:  {community}")
+    print(f"Output:     {output_dir}")
+    print(f"Lessons already saved: {len(existing)}\n")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False, slow_mo=25)
@ -101,19 +93,22 @@ async def run(discover: bool = False):
        page    = await ctx.new_page()

        print("Opening Skool — please log in when the browser window appears.")
-        print("The script will continue automatically once you land on the community.\n")
+        print("The script will continue automatically once you land on the community.\n")
        await page.goto("https://www.skool.com/login")
-        await page.wait_for_url(f"**/{COMMUNITY}/**", timeout=300_000)
-        print("Logged in.\n")
+        await page.wait_for_url(f"**/{community}/**", timeout=300_000)
+        print("Logged in.\n")

-        await page.goto(CLASSROOM)
+        await page.goto(classroom)
        await page.wait_for_load_state("load")
        await asyncio.sleep(3)

        nd        = next_data(await page.content())
        all_crses = nd.get("props", {}).get("pageProps", {}).get("allCourses", [])
        courses   = [c for c in all_crses if c.get("metadata", {}).get("hasAccess", 0)]
-        print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")
+        print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")

        if not courses:
            DIAG_DIR.mkdir(parents=True, exist_ok=True)
@ -123,7 +118,7 @@ async def run(discover: bool = False):
            return

        if discover:
-            course_url = f"{CLASSROOM}/{courses[0]['name']}"
+            course_url = f"{classroom}/{courses[0]['name']}"
            await page.goto(course_url)
            await page.wait_for_load_state("load")
            await asyncio.sleep(3)
@ -147,7 +142,7 @@ async def run(discover: bool = False):

        for course in courses:
            course_title = course["metadata"]["title"]
-            course_url   = f"{CLASSROOM}/{course['name']}"
+            course_url   = f"{classroom}/{course['name']}"
            print(f"Course: {course_title}")

            await page.goto(course_url)
@ -163,7 +158,8 @@ async def run(discover: bool = False):
            )

            if not children:
-                print("  No lessons found — skipping\n")
+                print("  No lessons found — skipping\n")
                continue

            print(f"  {len(children)} lessons")
@ -182,7 +178,7 @@ async def run(discover: bool = False):
                    await page.goto(f"{course_url}?md={lesson_id}")
                    await page.wait_for_load_state("load")
                    await asyncio.sleep(2)
-                    stem = write_lesson(course_title, lesson_title, await lesson_body(page))
+                    stem = write_lesson(output_dir, course_title, lesson_title, await lesson_body(page))
                    existing.add(stem)
                    saved += 1
                    print(f"  [saved]  {lesson_title[:65]}")
@ -194,11 +190,25 @@ async def run(discover: bool = False):

        print("─" * 52)
        print(f"Done.   Saved: {saved}   Skipped: {skipped}   Errors: {errors}")
-        print(f"Output: {OUTPUT_DIR}")
+        print(f"Output: {output_dir}")
        await browser.close()


 if __name__ == "__main__":
    ap = argparse.ArgumentParser(description="Download Skool community lessons to Markdown")
+    ap.add_argument(
+        "community",
+        help="Community slug from your Skool URL (e.g. skool.com/my-community → my-community)"
+    )
+    ap.add_argument(
+        "--output", "-o",
+        default=str(Path.home() / "skool-lessons"),
+        help="Folder to save Markdown files (default: ~/skool-lessons)"
+    )
    ap.add_argument("--discover", action="store_true", help="Debug page structure without saving")
-    asyncio.run(run(discover=ap.parse_args().discover))
+    args = ap.parse_args()
+    asyncio.run(run(
+        community=args.community,
+        output_dir=Path(args.output).expanduser().resolve(),
+        discover=args.discover,
+    ))