Make community and output dir configurable via CLI args
This commit is contained in:
parent
56c7b53254
commit
546e7a0b87
1 changed file with 53 additions and 43 deletions
96
scrape.py
96
scrape.py
|
|
@ -1,12 +1,13 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
skool-lesson-scrape
|
||||
Downloads lessons from a Skool community classroom to a local folder as Markdown files.
|
||||
Downloads lessons from a Skool community classroom to local Markdown files.
|
||||
Skips lessons already saved — safe to re-run when new content is added.
|
||||
|
||||
Usage:
|
||||
python scrape.py # full scrape
|
||||
python scrape.py --discover # inspect page structure without saving (debug)
|
||||
python scrape.py <community> # full scrape
|
||||
python scrape.py <community> --output ~/my-lessons # custom output folder
|
||||
python scrape.py <community> --discover # debug without saving
|
||||
|
||||
Setup: see README.md
|
||||
"""
|
||||
|
|
@ -20,26 +21,11 @@ import html2text
|
|||
from pathlib import Path
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
# ── CONFIG — edit these two lines ────────────────────────────────────────────
|
||||
#
|
||||
# COMMUNITY: the slug from your Skool community URL
|
||||
# e.g. https://www.skool.com/navaigate → "navaigate"
|
||||
COMMUNITY = "navaigate"
|
||||
#
|
||||
# OUTPUT_DIR: folder where .md files are saved (created if it doesn't exist)
|
||||
# Mac/Linux: Path.home() / "skool-lessons"
|
||||
# Windows: Path(r"C:\Users\YourName\Documents\skool-lessons")
|
||||
# Obsidian: Path.home() / "Documents" / "ObsidianVault" / "Lessons"
|
||||
OUTPUT_DIR = Path.home() / "skool-lessons"
|
||||
#
|
||||
# ─────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
BASE = "https://www.skool.com"
|
||||
CLASSROOM = f"{BASE}/{COMMUNITY}/classroom"
|
||||
DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"
|
||||
BASE = "https://www.skool.com"
|
||||
DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"
|
||||
|
||||
CONTENT_SELECTORS = [
|
||||
".ProseMirror", # Skool's TipTap editor — primary target
|
||||
".ProseMirror",
|
||||
"[class*='lesson-content']",
|
||||
"[class*='lessonContent']",
|
||||
"[class*='module-content']",
|
||||
|
|
@ -50,13 +36,14 @@ CONTENT_SELECTORS = [
|
|||
|
||||
|
||||
def sanitize(name: str) -> str:
|
||||
name = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".")
|
||||
name = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".")
|
||||
return name[:120]
|
||||
|
||||
|
||||
def existing_stems() -> set:
|
||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
return {f.stem for f in OUTPUT_DIR.glob("*.md")}
|
||||
def existing_stems(output_dir: Path) -> set:
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
return {f.stem for f in output_dir.glob("*.md")}
|
||||
|
||||
|
||||
def next_data(html: str) -> dict:
|
||||
|
|
@ -72,10 +59,12 @@ def html_to_md(raw: str) -> str:
|
|||
return h.handle(raw)
|
||||
|
||||
|
||||
def write_lesson(course_title: str, lesson_title: str, body: str) -> str:
|
||||
def write_lesson(output_dir: Path, course_title: str, lesson_title: str, body: str) -> str:
|
||||
stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
|
||||
out = OUTPUT_DIR / f"{stem}.md"
|
||||
out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
|
||||
out = output_dir / f"{stem}.md"
|
||||
out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
|
||||
return stem
|
||||
|
||||
|
||||
|
|
@ -90,10 +79,13 @@ async def lesson_body(page) -> str:
|
|||
return html_to_md(await page.evaluate("() => document.body.innerHTML"))
|
||||
|
||||
|
||||
async def run(discover: bool = False):
|
||||
existing = existing_stems()
|
||||
print(f"Output folder: {OUTPUT_DIR}")
|
||||
print(f"Lessons already saved: {len(existing)}\n")
|
||||
async def run(community: str, output_dir: Path, discover: bool = False):
|
||||
classroom = f"{BASE}/{community}/classroom"
|
||||
existing = existing_stems(output_dir)
|
||||
print(f"Community: {community}")
|
||||
print(f"Output: {output_dir}")
|
||||
print(f"Lessons already saved: {len(existing)}\n")
|
||||
|
||||
async with async_playwright() as p:
|
||||
browser = await p.chromium.launch(headless=False, slow_mo=25)
|
||||
|
|
@ -101,19 +93,22 @@ async def run(discover: bool = False):
|
|||
page = await ctx.new_page()
|
||||
|
||||
print("Opening Skool — please log in when the browser window appears.")
|
||||
print("The script will continue automatically once you land on the community.\n")
|
||||
print("The script will continue automatically once you land on the community.\n")
|
||||
await page.goto("https://www.skool.com/login")
|
||||
await page.wait_for_url(f"**/{COMMUNITY}/**", timeout=300_000)
|
||||
print("Logged in.\n")
|
||||
await page.wait_for_url(f"**/{community}/**", timeout=300_000)
|
||||
print("Logged in.\n")
|
||||
|
||||
await page.goto(CLASSROOM)
|
||||
await page.goto(classroom)
|
||||
await page.wait_for_load_state("load")
|
||||
await asyncio.sleep(3)
|
||||
|
||||
nd = next_data(await page.content())
|
||||
all_crses = nd.get("props", {}).get("pageProps", {}).get("allCourses", [])
|
||||
courses = [c for c in all_crses if c.get("metadata", {}).get("hasAccess", 0)]
|
||||
print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")
|
||||
print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")
|
||||
|
||||
if not courses:
|
||||
DIAG_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
|
@ -123,7 +118,7 @@ async def run(discover: bool = False):
|
|||
return
|
||||
|
||||
if discover:
|
||||
course_url = f"{CLASSROOM}/{courses[0]['name']}"
|
||||
course_url = f"{classroom}/{courses[0]['name']}"
|
||||
await page.goto(course_url)
|
||||
await page.wait_for_load_state("load")
|
||||
await asyncio.sleep(3)
|
||||
|
|
@ -147,7 +142,7 @@ async def run(discover: bool = False):
|
|||
|
||||
for course in courses:
|
||||
course_title = course["metadata"]["title"]
|
||||
course_url = f"{CLASSROOM}/{course['name']}"
|
||||
course_url = f"{classroom}/{course['name']}"
|
||||
print(f"Course: {course_title}")
|
||||
|
||||
await page.goto(course_url)
|
||||
|
|
@ -163,7 +158,8 @@ async def run(discover: bool = False):
|
|||
)
|
||||
|
||||
if not children:
|
||||
print(" No lessons found — skipping\n")
|
||||
print(" No lessons found — skipping\n")
|
||||
continue
|
||||
|
||||
print(f" {len(children)} lessons")
|
||||
|
|
@ -182,7 +178,7 @@ async def run(discover: bool = False):
|
|||
await page.goto(f"{course_url}?md={lesson_id}")
|
||||
await page.wait_for_load_state("load")
|
||||
await asyncio.sleep(2)
|
||||
stem = write_lesson(course_title, lesson_title, await lesson_body(page))
|
||||
stem = write_lesson(output_dir, course_title, lesson_title, await lesson_body(page))
|
||||
existing.add(stem)
|
||||
saved += 1
|
||||
print(f" [saved] {lesson_title[:65]}")
|
||||
|
|
@ -194,11 +190,25 @@ async def run(discover: bool = False):
|
|||
|
||||
print("─" * 52)
|
||||
print(f"Done. Saved: {saved} Skipped: {skipped} Errors: {errors}")
|
||||
print(f"Output: {OUTPUT_DIR}")
|
||||
print(f"Output: {output_dir}")
|
||||
await browser.close()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ap = argparse.ArgumentParser(description="Download Skool community lessons to Markdown")
|
||||
ap.add_argument(
|
||||
"community",
|
||||
help="Community slug from your Skool URL (e.g. skool.com/my-community → my-community)"
|
||||
)
|
||||
ap.add_argument(
|
||||
"--output", "-o",
|
||||
default=str(Path.home() / "skool-lessons"),
|
||||
help="Folder to save Markdown files (default: ~/skool-lessons)"
|
||||
)
|
||||
ap.add_argument("--discover", action="store_true", help="Debug page structure without saving")
|
||||
asyncio.run(run(discover=ap.parse_args().discover))
|
||||
args = ap.parse_args()
|
||||
asyncio.run(run(
|
||||
community=args.community,
|
||||
output_dir=Path(args.output).expanduser().resolve(),
|
||||
discover=args.discover,
|
||||
))
|
||||
|
|
|
|||
Loading…
Reference in a new issue