Make community and output dir configurable via CLI args
This commit is contained in:
parent
56c7b53254
commit
546e7a0b87
1 changed files with 53 additions and 43 deletions
96
scrape.py
96
scrape.py
|
|
@ -1,12 +1,13 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
skool-lesson-scrape
|
skool-lesson-scrape
|
||||||
Downloads lessons from a Skool community classroom to a local folder as Markdown files.
|
Downloads lessons from a Skool community classroom to local Markdown files.
|
||||||
Skips lessons already saved — safe to re-run when new content is added.
|
Skips lessons already saved — safe to re-run when new content is added.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python scrape.py # full scrape
|
python scrape.py <community> # full scrape
|
||||||
python scrape.py --discover # inspect page structure without saving (debug)
|
python scrape.py <community> --output ~/my-lessons # custom output folder
|
||||||
|
python scrape.py <community> --discover # debug without saving
|
||||||
|
|
||||||
Setup: see README.md
|
Setup: see README.md
|
||||||
"""
|
"""
|
||||||
|
|
@ -20,26 +21,11 @@ import html2text
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
|
|
||||||
# ── CONFIG — edit these two lines ────────────────────────────────────────────
|
BASE = "https://www.skool.com"
|
||||||
#
|
DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"
|
||||||
# COMMUNITY: the slug from your Skool community URL
|
|
||||||
# e.g. https://www.skool.com/navaigate → "navaigate"
|
|
||||||
COMMUNITY = "navaigate"
|
|
||||||
#
|
|
||||||
# OUTPUT_DIR: folder where .md files are saved (created if it doesn't exist)
|
|
||||||
# Mac/Linux: Path.home() / "skool-lessons"
|
|
||||||
# Windows: Path(r"C:\Users\YourName\Documents\skool-lessons")
|
|
||||||
# Obsidian: Path.home() / "Documents" / "ObsidianVault" / "Lessons"
|
|
||||||
OUTPUT_DIR = Path.home() / "skool-lessons"
|
|
||||||
#
|
|
||||||
# ─────────────────────────────────────────────────────────────────────────────
|
|
||||||
|
|
||||||
BASE = "https://www.skool.com"
|
|
||||||
CLASSROOM = f"{BASE}/{COMMUNITY}/classroom"
|
|
||||||
DIAG_DIR = Path(tempfile.gettempdir()) / "skool_scrape_diag"
|
|
||||||
|
|
||||||
CONTENT_SELECTORS = [
|
CONTENT_SELECTORS = [
|
||||||
".ProseMirror", # Skool's TipTap editor — primary target
|
".ProseMirror",
|
||||||
"[class*='lesson-content']",
|
"[class*='lesson-content']",
|
||||||
"[class*='lessonContent']",
|
"[class*='lessonContent']",
|
||||||
"[class*='module-content']",
|
"[class*='module-content']",
|
||||||
|
|
@ -50,13 +36,14 @@ CONTENT_SELECTORS = [
|
||||||
|
|
||||||
|
|
||||||
def sanitize(name: str) -> str:
|
def sanitize(name: str) -> str:
|
||||||
name = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".")
|
name = re.sub(r'[<>:"/\\|?*\n\r\t]', '', str(name)).strip().strip(".")
|
||||||
return name[:120]
|
return name[:120]
|
||||||
|
|
||||||
|
|
||||||
def existing_stems() -> set:
|
def existing_stems(output_dir: Path) -> set:
|
||||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
return {f.stem for f in OUTPUT_DIR.glob("*.md")}
|
return {f.stem for f in output_dir.glob("*.md")}
|
||||||
|
|
||||||
|
|
||||||
def next_data(html: str) -> dict:
|
def next_data(html: str) -> dict:
|
||||||
|
|
@ -72,10 +59,12 @@ def html_to_md(raw: str) -> str:
|
||||||
return h.handle(raw)
|
return h.handle(raw)
|
||||||
|
|
||||||
|
|
||||||
def write_lesson(course_title: str, lesson_title: str, body: str) -> str:
|
def write_lesson(output_dir: Path, course_title: str, lesson_title: str, body: str) -> str:
|
||||||
stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
|
stem = f"{sanitize(course_title)} -- {sanitize(lesson_title)}"
|
||||||
out = OUTPUT_DIR / f"{stem}.md"
|
out = output_dir / f"{stem}.md"
|
||||||
out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
|
out.write_text(f"# {lesson_title}\n\n{body}", encoding="utf-8")
|
||||||
return stem
|
return stem
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -90,10 +79,13 @@ async def lesson_body(page) -> str:
|
||||||
return html_to_md(await page.evaluate("() => document.body.innerHTML"))
|
return html_to_md(await page.evaluate("() => document.body.innerHTML"))
|
||||||
|
|
||||||
|
|
||||||
async def run(discover: bool = False):
|
async def run(community: str, output_dir: Path, discover: bool = False):
|
||||||
existing = existing_stems()
|
classroom = f"{BASE}/{community}/classroom"
|
||||||
print(f"Output folder: {OUTPUT_DIR}")
|
existing = existing_stems(output_dir)
|
||||||
print(f"Lessons already saved: {len(existing)}\n")
|
print(f"Community: {community}")
|
||||||
|
print(f"Output: {output_dir}")
|
||||||
|
print(f"Lessons already saved: {len(existing)}\n")
|
||||||
|
|
||||||
async with async_playwright() as p:
|
async with async_playwright() as p:
|
||||||
browser = await p.chromium.launch(headless=False, slow_mo=25)
|
browser = await p.chromium.launch(headless=False, slow_mo=25)
|
||||||
|
|
@ -101,19 +93,22 @@ async def run(discover: bool = False):
|
||||||
page = await ctx.new_page()
|
page = await ctx.new_page()
|
||||||
|
|
||||||
print("Opening Skool — please log in when the browser window appears.")
|
print("Opening Skool — please log in when the browser window appears.")
|
||||||
print("The script will continue automatically once you land on the community.\n")
|
print("The script will continue automatically once you land on the community.\n")
|
||||||
await page.goto("https://www.skool.com/login")
|
await page.goto("https://www.skool.com/login")
|
||||||
await page.wait_for_url(f"**/{COMMUNITY}/**", timeout=300_000)
|
await page.wait_for_url(f"**/{community}/**", timeout=300_000)
|
||||||
print("Logged in.\n")
|
print("Logged in.\n")
|
||||||
|
|
||||||
await page.goto(CLASSROOM)
|
await page.goto(classroom)
|
||||||
await page.wait_for_load_state("load")
|
await page.wait_for_load_state("load")
|
||||||
await asyncio.sleep(3)
|
await asyncio.sleep(3)
|
||||||
|
|
||||||
nd = next_data(await page.content())
|
nd = next_data(await page.content())
|
||||||
all_crses = nd.get("props", {}).get("pageProps", {}).get("allCourses", [])
|
all_crses = nd.get("props", {}).get("pageProps", {}).get("allCourses", [])
|
||||||
courses = [c for c in all_crses if c.get("metadata", {}).get("hasAccess", 0)]
|
courses = [c for c in all_crses if c.get("metadata", {}).get("hasAccess", 0)]
|
||||||
print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")
|
print(f"Accessible courses: {len(courses)} of {len(all_crses)} total\n")
|
||||||
|
|
||||||
if not courses:
|
if not courses:
|
||||||
DIAG_DIR.mkdir(parents=True, exist_ok=True)
|
DIAG_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
@ -123,7 +118,7 @@ async def run(discover: bool = False):
|
||||||
return
|
return
|
||||||
|
|
||||||
if discover:
|
if discover:
|
||||||
course_url = f"{CLASSROOM}/{courses[0]['name']}"
|
course_url = f"{classroom}/{courses[0]['name']}"
|
||||||
await page.goto(course_url)
|
await page.goto(course_url)
|
||||||
await page.wait_for_load_state("load")
|
await page.wait_for_load_state("load")
|
||||||
await asyncio.sleep(3)
|
await asyncio.sleep(3)
|
||||||
|
|
@ -147,7 +142,7 @@ async def run(discover: bool = False):
|
||||||
|
|
||||||
for course in courses:
|
for course in courses:
|
||||||
course_title = course["metadata"]["title"]
|
course_title = course["metadata"]["title"]
|
||||||
course_url = f"{CLASSROOM}/{course['name']}"
|
course_url = f"{classroom}/{course['name']}"
|
||||||
print(f"Course: {course_title}")
|
print(f"Course: {course_title}")
|
||||||
|
|
||||||
await page.goto(course_url)
|
await page.goto(course_url)
|
||||||
|
|
@ -163,7 +158,8 @@ async def run(discover: bool = False):
|
||||||
)
|
)
|
||||||
|
|
||||||
if not children:
|
if not children:
|
||||||
print(" No lessons found — skipping\n")
|
print(" No lessons found — skipping\n")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
print(f" {len(children)} lessons")
|
print(f" {len(children)} lessons")
|
||||||
|
|
@ -182,7 +178,7 @@ async def run(discover: bool = False):
|
||||||
await page.goto(f"{course_url}?md={lesson_id}")
|
await page.goto(f"{course_url}?md={lesson_id}")
|
||||||
await page.wait_for_load_state("load")
|
await page.wait_for_load_state("load")
|
||||||
await asyncio.sleep(2)
|
await asyncio.sleep(2)
|
||||||
stem = write_lesson(course_title, lesson_title, await lesson_body(page))
|
stem = write_lesson(output_dir, course_title, lesson_title, await lesson_body(page))
|
||||||
existing.add(stem)
|
existing.add(stem)
|
||||||
saved += 1
|
saved += 1
|
||||||
print(f" [saved] {lesson_title[:65]}")
|
print(f" [saved] {lesson_title[:65]}")
|
||||||
|
|
@ -194,11 +190,25 @@ async def run(discover: bool = False):
|
||||||
|
|
||||||
print("─" * 52)
|
print("─" * 52)
|
||||||
print(f"Done. Saved: {saved} Skipped: {skipped} Errors: {errors}")
|
print(f"Done. Saved: {saved} Skipped: {skipped} Errors: {errors}")
|
||||||
print(f"Output: {OUTPUT_DIR}")
|
print(f"Output: {output_dir}")
|
||||||
await browser.close()
|
await browser.close()
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
ap = argparse.ArgumentParser(description="Download Skool community lessons to Markdown")
|
ap = argparse.ArgumentParser(description="Download Skool community lessons to Markdown")
|
||||||
|
ap.add_argument(
|
||||||
|
"community",
|
||||||
|
help="Community slug from your Skool URL (e.g. skool.com/my-community → my-community)"
|
||||||
|
)
|
||||||
|
ap.add_argument(
|
||||||
|
"--output", "-o",
|
||||||
|
default=str(Path.home() / "skool-lessons"),
|
||||||
|
help="Folder to save Markdown files (default: ~/skool-lessons)"
|
||||||
|
)
|
||||||
ap.add_argument("--discover", action="store_true", help="Debug page structure without saving")
|
ap.add_argument("--discover", action="store_true", help="Debug page structure without saving")
|
||||||
asyncio.run(run(discover=ap.parse_args().discover))
|
args = ap.parse_args()
|
||||||
|
asyncio.run(run(
|
||||||
|
community=args.community,
|
||||||
|
output_dir=Path(args.output).expanduser().resolve(),
|
||||||
|
discover=args.discover,
|
||||||
|
))
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue