Make community and output dir configurable via CLI args

2026-05-06 20:07:53 +00:00 · 2026-05-06 20:07:53 +00:00 · 546e7a0b87
commit 546e7a0b87
parent 56c7b53254
1 changed files with 53 additions and 43 deletions
--- a/scrape.py
+++ b/scrape.py
@ -1,12 +1,13 @@
 #!/usr/bin/env python3
 """
 skool-lesson-scrape
-Downloads lessons from a Skool community classroom to a local folder as Markdown files.
+Downloads lessons from a Skool community classroom to local Markdown files.
 Skips lessons already saved — safe to re-run when new content is added.
 Usage:
-    python scrape.py               # full scrape
+    python scrape.py <community>                        # full scrape
-    python scrape.py --discover    # inspect page structure without saving (debug)
+    python scrape.py <community> --output ~/my-lessons  # custom output folder
    python scrape.py <community> --discover             # debug without saving
 Setup: see README.md
 """
@ -20,26 +21,11 @@ import html2text
 from pathlib import Path
 from playwright.async_api import async_playwright
-# ── CONFIG — edit these two lines ────────────────────────────────────────────
+BASE     = "https://www.skool.com"
-#
+DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"
 # COMMUNITY: the slug from your Skool community URL
 #   e.g. https://www.skool.com/navaigate  →  "navaigate"
 COMMUNITY = "navaigate"
 #
 # OUTPUT_DIR: folder where .md files are saved (created if it doesn't exist)
 #   Mac/Linux: Path.home() / "skool-lessons"
 #   Windows:   Path(r"C:\Users\YourName\Documents\skool-lessons")
 #   Obsidian:  Path.home() / "Documents" / "ObsidianVault" / "Lessons"
 OUTPUT_DIR = Path.home() / "skool-lessons"
 #
 # ─────────────────────────────────────────────────────────────────────────────
 BASE      = "https://www.skool.com"
 CLASSROOM = f"{BASE}/{COMMUNITY}/classroom"
 DIAG_DIR  = Path(tempfile.gettempdir()) / "skool_scrape_diag"
 CONTENT_SELECTORS = [
-    ".ProseMirror",           # Skool's TipTap editor — primary target
+    ".ProseMirror",
    "[class*='lesson-content']",
    "[class*='lessonContent']",
    "[class*='module-content']",
@ -50,13 +36,14 @@ CONTENT_SELECTORS = [
 def sanitize(name: str) -> str:
-    name = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".")
+    name = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".")
    return name[:120]
-def existing_stems() -> set:
+def existing_stems(output_dir: Path) -> set:
-    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+    output_dir.mkdir(parents=True, exist_ok=True)
-    return {f.stem for f in OUTPUT_DIR.glob("*.md")}
+    return {f.stem for f in output_dir.glob("*.md")}
 def next_data(html: str) -> dict:
@ -72,10 +59,12 @@ def html_to_md(raw: str) -> str:
    return h.handle(raw)
-def write_lesson(course_title: str, lesson_title: str, body: str) -> str:
+def write_lesson(output_dir: Path, course_title: str, lesson_title: str, body: str) -> str:
    stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
-    out  = OUTPUT_DIR / f"{stem}.md"
+    out  = output_dir / f"{stem}.md"
-    out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
+    out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
    return stem
@ -90,10 +79,13 @@ async def lesson_body(page) -> str:
    return html_to_md(await page.evaluate("() => document.body.innerHTML"))
-async def run(discover: bool = False):
+async def run(community: str, output_dir: Path, discover: bool = False):
-    existing = existing_stems()
+    classroom = f"{BASE}/{community}/classroom"
-    print(f"Output folder: {OUTPUT_DIR}")
+    existing  = existing_stems(output_dir)
-    print(f"Lessons already saved: {len(existing)}\n")
+    print(f"Community:  {community}")
    print(f"Output:     {output_dir}")
    print(f"Lessons already saved: {len(existing)}\n")
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=False, slow_mo=25)
@ -101,19 +93,22 @@ async def run(discover: bool = False):
        page    = await ctx.new_page()
        print("Opening Skool — please log in when the browser window appears.")
-        print("The script will continue automatically once you land on the community.\n")
+        print("The script will continue automatically once you land on the community.\n")
        await page.goto("https://www.skool.com/login")
-        await page.wait_for_url(f"**/{COMMUNITY}/**", timeout=300_000)
+        await page.wait_for_url(f"**/{community}/**", timeout=300_000)
-        print("Logged in.\n")
+        print("Logged in.\n")
-        await page.goto(CLASSROOM)
+        await page.goto(classroom)
        await page.wait_for_load_state("load")
        await asyncio.sleep(3)
        nd        = next_data(await page.content())
        all_crses = nd.get("props", {}).get("pageProps", {}).get("allCourses", [])
        courses   = [c for c in all_crses if c.get("metadata", {}).get("hasAccess", 0)]
-        print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")
+        print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")
        if not courses:
            DIAG_DIR.mkdir(parents=True, exist_ok=True)
@ -123,7 +118,7 @@ async def run(discover: bool = False):
            return
        if discover:
-            course_url = f"{CLASSROOM}/{courses[0]['name']}"
+            course_url = f"{classroom}/{courses[0]['name']}"
            await page.goto(course_url)
            await page.wait_for_load_state("load")
            await asyncio.sleep(3)
@ -147,7 +142,7 @@ async def run(discover: bool = False):
        for course in courses:
            course_title = course["metadata"]["title"]
-            course_url   = f"{CLASSROOM}/{course['name']}"
+            course_url   = f"{classroom}/{course['name']}"
            print(f"Course: {course_title}")
            await page.goto(course_url)
@ -163,7 +158,8 @@ async def run(discover: bool = False):
            )
            if not children:
-                print("  No lessons found — skipping\n")
+                print("  No lessons found — skipping\n")
                continue
            print(f"  {len(children)} lessons")
@ -182,7 +178,7 @@ async def run(discover: bool = False):
                    await page.goto(f"{course_url}?md={lesson_id}")
                    await page.wait_for_load_state("load")
                    await asyncio.sleep(2)
-                    stem = write_lesson(course_title, lesson_title, await lesson_body(page))
+                    stem = write_lesson(output_dir, course_title, lesson_title, await lesson_body(page))
                    existing.add(stem)
                    saved += 1
                    print(f"  [saved]  {lesson_title[:65]}")
@ -194,11 +190,25 @@ async def run(discover: bool = False):
        print("─" * 52)
        print(f"Done.   Saved: {saved}   Skipped: {skipped}   Errors: {errors}")
-        print(f"Output: {OUTPUT_DIR}")
+        print(f"Output: {output_dir}")
        await browser.close()
 if __name__ == "__main__":
    ap = argparse.ArgumentParser(description="Download Skool community lessons to Markdown")
    ap.add_argument(
        "community",
        help="Community slug from your Skool URL (e.g. skool.com/my-community → my-community)"
    )
    ap.add_argument(
        "--output", "-o",
        default=str(Path.home() / "skool-lessons"),
        help="Folder to save Markdown files (default: ~/skool-lessons)"
    )
    ap.add_argument("--discover", action="store_true", help="Debug page structure without saving")
-    asyncio.run(run(discover=ap.parse_args().discover))
+    args = ap.parse_args()
    asyncio.run(run(
        community=args.community,
        output_dir=Path(args.output).expanduser().resolve(),
        discover=args.discover,
    ))