make gogetaroomie.com work
parent bb3f873450
commit 95b07cd802
.gitignore (vendored): 1 changed line
@@ -5,6 +5,7 @@ build/
 dist/
 wheels/
 *.egg-info
+result/
 
 # Virtual environments
 .venv
main.py: 99 changed lines
@@ -1,7 +1,87 @@
 import argparse
+import os
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+import requests
 
 
-def main():
+def fetch_page(
+    domain: str,
+    i: int,
+    url: str,
+    download: bool,
+    img_tag: str,
+    out_dir: Path,
+    next_tag: str,
+) -> str:
+    """Download a comic from a URL and return the URL of the next page
+
+    :param domain: the comic's domain
+    :param i: which page of the comic this is
+    :param url: the URL of the page
+    :param download: whether to download the comic
+    :param img_tag: the id of the <img> tag where the comic is located
+    :param out_dir: the directory the comic is saved to
+    :param next_tag: the tag of the "next" button
+    :return: the URL of the next page, None if it doesn't exist
+    """
+    if not url.startswith("https://"):
+        url = domain + url
+    page = requests.get(url)
+    soup = BeautifulSoup(page.text, "html.parser")
+
+    if download:
+        img_url = soup.find("img", id=img_tag).get("src")
+        if not img_url.startswith("https://"):
+            img_url = domain + img_url
+        comic = requests.get(img_url)
+        filename = out_dir.joinpath(f"{i}.{img_url.split('.')[-1]}")
+        print(filename)
+
+        try:
+            output_file = open(filename, "wb")
+        except FileNotFoundError:
+            os.mkdir(out_dir)
+            output_file = open(filename, "wb")
+        output_file.write(comic.content)
+        output_file.close()
+
+    return soup.find("a", rel=next_tag).get("href")
+
+
+def fetch_comic(args):
+    """Download the requested comic pages to a local directory."""
+    home_page = requests.get(args.domain)
+    home_s = BeautifulSoup(home_page.text, "html.parser")
+
+    out_dir = Path(os.getcwd()).joinpath(args.output)
+
+    first_url = home_s.find("a", rel=args.first).get("href")
+    if not first_url.startswith("https://"):
+        first_url = args.domain + first_url
+    next_page = requests.get(first_url)
+    next_soup = BeautifulSoup(next_page.text, "html.parser")
+    try:
+        next_url = next_soup.find("a", rel=args.next).get("href")
+    except AttributeError:
+        print(next_soup.find("a"))
+    i = 1
+    while i < args.end:
+        begin = args.begin if args.begin else 1
+        download = False if i < begin else True
+        print(f"Fetching comic from {next_url}...")
+        next_url = fetch_page(
+            args.domain, i, next_url, download, args.img, out_dir, args.next
+        )
+        if not next_url:
+            break  # end reached
+        if i == args.limit:
+            break  # limit reached
+        i += 1
+
+
+def cli():
     """Read the arguments from the command line and initialize commands."""
     parser = argparse.ArgumentParser()
 
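Everything in fetch_page and fetch_comic hinges on BeautifulSoup matching anchors by their rel attribute: bs4 treats rel as a multi-valued attribute, so find("a", rel=value) matches any <a> whose rel list contains that value. A minimal sketch of the lookup, with illustrative markup not taken from any real site:

from bs4 import BeautifulSoup

# Made-up markup standing in for a comic page.
html = '<a rel="prev" href="/comic/1">Prev</a><a rel="next" href="/comic/3">Next</a>'
soup = BeautifulSoup(html, "html.parser")

# Matches the anchor whose rel list contains "next"; find() returns None
# when nothing matches, so .get("href") then raises AttributeError -- which
# is exactly what fetch_comic's try/except around the first lookup catches.
link = soup.find("a", rel="next")
print(link.get("href"))  # -> /comic/3

The second hunk reworks the CLI flags that feed this loop: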
@@ -20,17 +100,26 @@ def main():
         "--next", default=None, help="html 'id' tag of the 'next comic' button"
     )
 
-    parser.add_argument("--limit", default=None, help="Maximum of images to download")
     parser.add_argument(
-        "--start", default=None, help="At which page to start with downloading"
+        "--limit", default=None, help="Maximum of images to download", type=int
     )
     parser.add_argument(
-        "--end", default=None, help="At which page to stop with downloading"
+        "--begin",
+        default=None,
+        help="At which page to start with downloading",
+        type=int,
+    )
+    parser.add_argument(
+        "--end",
+        default=9999999999999,
+        help="At which page to stop with downloading",
+        type=int,
+    )
 
     args = parser.parse_args()
     print(f"Fetching {args.domain}...")
+    fetch_comic(args)
 
 
 if __name__ == "__main__":
-    main()
+    cli()
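Two things change in the flag handling: --start is renamed to --begin (matching the args.begin attribute fetch_comic reads), and --limit, --begin, and --end now carry type=int. Without type=int, argparse passes the values through as strings, so the comparisons in fetch_comic would misbehave: i < args.end raises TypeError in Python 3, and i == args.limit silently never matches. The oversized --end default effectively means "no upper bound", leaving --limit and a missing "next" link as the usual stop conditions. Assuming the unchanged portion of cli() also defines --domain, --first, --img, and --output flags (fetch_comic reads those attribute names), a run against the site in the commit title would presumably look like:

python main.py --domain https://gogetaroomie.com --output comics --limit 10

though the spellings of the flags outside this diff are a guess.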
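One quirk in fetch_page worth flagging: the output directory is created lazily, by catching the FileNotFoundError that open() raises on the first write. A more conventional sketch of the same save step, with a hypothetical URL and directory name:

from pathlib import Path

import requests

out_dir = Path.cwd() / "comics"
out_dir.mkdir(parents=True, exist_ok=True)  # no-op if the directory exists

comic = requests.get("https://example.com/comics/3.png")  # placeholder URL
with open(out_dir / "3.png", "wb") as comic_file:  # with-block also closes the file
    comic_file.write(comic.content)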