make gogetaroomie.com work

2025-11-24 19:50:24 +01:00 · 2025-11-24 19:50:24 +01:00 · 95b07cd802
parent bb3f873450
commit 95b07cd802
2 changed files with 95 additions and 5 deletions
--- a/.gitignore
+++ b/.gitignore
@ -5,6 +5,7 @@ build/
 dist/
 wheels/
 *.egg-info
+result/

 # Virtual environments
 .venv
--- a/main.py
+++ b/main.py
@ -1,7 +1,87 @@
 import argparse
+import os
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+import requests


-def main():
+def fetch_page(
+    domain: str,
+    i: int,
+    url: str,
+    download: bool,
+    img_tag: str,
+    out_dir: Path,
+    next_tag: str,
+) -> "str | None":
+    """Fetch one comic page, optionally save its image, and return the next URL.
+
+    :param domain: the comic's domain, used as the base for relative links
+    :param i: which page of the comic this is; used as the output file name
+    :param url: the URL of the page (absolute, or relative to ``domain``)
+    :param download: whether to download the comic image found on this page
+    :param img_tag: the id of the <img> tag where the comic is located
+    :param out_dir: the directory the comic is saved to (created if missing)
+    :param next_tag: the ``rel`` value of the "next" link
+    :return: the ``href`` of the next page, None if there is no next link
+    :raises ValueError: if ``download`` is set and no matching <img> exists
+    """
+    # Imported locally so the function is self-contained with respect to the
+    # module's import block.
+    from urllib.parse import urljoin
+
+    # urljoin keeps absolute URLs (http or https) untouched and resolves
+    # relative ones against the domain, with or without a leading slash.
+    url = urljoin(domain, url)
+    page = requests.get(url)
+    soup = BeautifulSoup(page.text, "html.parser")
+
+    if download:
+        img = soup.find("img", id=img_tag)
+        img_src = img.get("src") if img is not None else None
+        if not img_src:
+            # Fail with a message that names the page and the id instead of
+            # an AttributeError on None.
+            raise ValueError(f"no <img id={img_tag!r}> with a src found on {url}")
+        img_url = urljoin(domain, img_src)
+        comic = requests.get(img_url)
+        # The file is named after the page number and keeps the extension
+        # of the image URL.
+        filename = out_dir.joinpath(f"{i}.{img_url.split('.')[-1]}")
+        print(filename)
+
+        # Create the output directory (and any parents) up front; this is a
+        # no-op when it already exists.
+        out_dir.mkdir(parents=True, exist_ok=True)
+        # The context manager closes the file even if the write fails.
+        with open(filename, "wb") as output_file:
+            output_file.write(comic.content)
+
+    # The last page has no "next" link: report that as None, as documented,
+    # rather than crashing.
+    next_link = soup.find("a", rel=next_tag)
+    if next_link is None:
+        return None
+    return next_link.get("href")
+
+
+def fetch_comic(args):
+    """Download the requested comic pages to a local directory.
+
+    Looks up the "first comic" link on the home page, then follows the
+    "next" links page by page via ``fetch_page``, starting with page 1.
+
+    :param args: parsed command line arguments; the attributes read here are
+        ``domain``, ``output``, ``first``, ``next``, ``img``, ``begin``,
+        ``end`` and ``limit``
+    :raises ValueError: if the home page has no link with ``rel=args.first``
+    """
+    home_page = requests.get(args.domain)
+    home_s = BeautifulSoup(home_page.text, "html.parser")
+
+    out_dir = Path(os.getcwd()).joinpath(args.output)
+
+    first_link = home_s.find("a", rel=args.first)
+    if first_link is None or not first_link.get("href"):
+        # Without a starting point there is nothing to walk; say so clearly
+        # instead of failing later on an unbound or None URL.
+        raise ValueError(
+            f"no <a rel={args.first!r}> link found on {args.domain}"
+        )
+    first_url = first_link.get("href")
+    if not first_url.startswith("https://"):
+        first_url = args.domain + first_url
+
+    # Pages before ``begin`` are still fetched (to follow their "next"
+    # link) but not downloaded.
+    begin = args.begin if args.begin else 1
+    downloaded = 0
+
+    # Start the walk at the first page itself so that page 1 is saved as
+    # file 1, rather than skipping it and starting at its successor.
+    next_url = first_url
+    # ``end`` is exclusive: the walk stops before page number ``end``.
+    for i in range(1, args.end):
+        download = i >= begin
+        print(f"Fetching comic from {next_url}...")
+        next_url = fetch_page(
+            args.domain, i, next_url, download, args.img, out_dir, args.next
+        )
+        if download:
+            downloaded += 1
+            # ``limit`` caps the number of images downloaded, so skipped
+            # pages before ``begin`` do not count against it.
+            if downloaded == args.limit:
+                break  # limit reached
+        if not next_url:
+            break  # end reached
+
+
+def cli():
    """Read the arguments from the command line and initialize commands."""
    parser = argparse.ArgumentParser()

@ -20,17 +100,26 @@ def main():
        "--next", default=None, help="html 'id' tag of the 'next comic' button"
    )

-    parser.add_argument("--limit", default=None, help="Maximum of images to download")
    parser.add_argument(
-        "--start", default=None, help="At which page to start with downloading"
+        "--limit", default=None, help="Maximum of images to download", type=int
    )
    parser.add_argument(
-        "--end", default=None, help="At which page to stop with downloading"
+        "--begin",
+        default=None,
+        help="At which page to start with downloading",
+        type=int,
+    )
+    parser.add_argument(
+        "--end",
+        default=9999999999999,
+        help="At which page to stop with downloading",
+        type=int,
    )

    args = parser.parse_args()
    print(f"Fetching {args.domain}...")
+    fetch_comic(args)


 if __name__ == "__main__":
-    main()
+    cli()