feat: support ComicControl CMS by default without --next and --img arguments
parent 79a10f9c41
commit 44bb5f2b15

main.py (29 changed lines)
--- a/main.py
+++ b/main.py
@@ -11,9 +11,9 @@ def fetch_page(
     i: int,
     url: str,
     download: bool,
-    img_tag: str,
+    img_id: str,
     out_dir: Path,
-    next_tag: str,
+    next_rel: str,
 ) -> str:
     """Download a comic from a URL and return the URL of the next page
 
@@ -21,9 +21,9 @@ def fetch_page(
     :param i: which page of the comic this is
     :param url: the URL of the page
     :param download: whether to download the comic
-    :param img_tag: the id of the <img> tag where the comic is located
+    :param img_id: the id of the <img> tag where the comic is located
     :param out_dir: the directory the comic is saved to
-    :param next_tag: the tag of the "next" button
+    :param next_rel: the tag of the "next" button
     :return: the URL of the next page, None if it doesn't exist
     """
     if not url.startswith("https://"):
@@ -32,12 +32,11 @@ def fetch_page(
     soup = BeautifulSoup(page.text, "html.parser")
 
     if download:
-        img_url = soup.find("img", id=img_tag).get("src")
+        img_url = soup.find("img", id=img_id).get("src")
         if not img_url.startswith("https://"):
             img_url = domain + img_url
         comic = requests.get(img_url)
         filename = out_dir / f"{i:05d}-{url.split('/')[-1]}.{img_url.split('.')[-1]}"
-        print(filename)
 
         try:
             output_file = open(filename, "wb")
@@ -47,7 +46,7 @@ def fetch_page(
         output_file.write(comic.content)
         output_file.close()
 
-    return soup.find("a", rel=next_tag).get("href")
+    return soup.find("a", rel=next_rel).get("href")
 
 
 def fetch_comic(args):
@@ -57,23 +56,33 @@ def fetch_comic(args):
 
     out_dir = Path(os.getcwd()).joinpath(args.output)
+
+    next_rel = args.next
+    img_id = args.img
+    if not next_rel or not img_id:
+        for style in soup.find_all("link", rel="stylesheet"):
+            if style.get("href").endswith("/comiccontrol/defaultstyles.css"):
+                img_id = "cc-comic" if not args.img else args.img
+                next_rel = "next" if not args.next else args.next
+                break
+
     if args.first:
         first_url = soup.find("a", rel=args.first).get("href")
         if not first_url.startswith("https://"):
             first_url = args.domain + first_url
         page = requests.get(first_url)
         soup = BeautifulSoup(page.text, "html.parser")
 
     try:
-        next_url = soup.find("a", rel=args.next).get("href")
+        next_url = soup.find("a", rel=next_rel).get("href")
     except AttributeError:
         print(soup.find("a"))
     i = 1
     while i < args.end:
         begin = args.begin if args.begin else 1
         download = False if i < begin else True
-        print(f"Fetching comic from {next_url}...")
+        print(f"Fetching: {next_url}")
         next_url = fetch_page(
-            args.domain, i, next_url, download, args.img, out_dir, args.next
+            args.domain, i, next_url, download, img_id, out_dir, next_rel
         )
         if not next_url:
             break  # end reached
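For reference, the behavior this commit adds to fetch_comic can be read in isolation: if the page links ComicControl's default stylesheet (a URL ending in /comiccontrol/defaultstyles.css), the scraper falls back to the CMS defaults, the cc-comic image id and the next rel attribute, unless --img or --next were supplied. A minimal standalone sketch of that detection, where the helper name detect_comiccontrol_defaults is hypothetical and not part of main.py:

from typing import Optional, Tuple

from bs4 import BeautifulSoup


def detect_comiccontrol_defaults(
    soup: BeautifulSoup,
    img_id: Optional[str] = None,
    next_rel: Optional[str] = None,
) -> Tuple[Optional[str], Optional[str]]:
    # Hypothetical standalone version of the logic inlined in fetch_comic:
    # fill in ComicControl's defaults only when the caller did not override
    # them and the page links the CMS's default stylesheet.
    if img_id and next_rel:
        return img_id, next_rel
    for style in soup.find_all("link", rel="stylesheet"):
        href = style.get("href") or ""
        if href.endswith("/comiccontrol/defaultstyles.css"):
            img_id = img_id or "cc-comic"    # ComicControl's comic <img> id
            next_rel = next_rel or "next"    # rel attribute of the "next" link
            break
    return img_id, next_rel


if __name__ == "__main__":
    html = '<link rel="stylesheet" href="/comiccontrol/defaultstyles.css">'
    soup = BeautifulSoup(html, "html.parser")
    print(detect_comiccontrol_defaults(soup))  # ('cc-comic', 'next')

As in the committed code, explicit --img and --next values still win: when both are given, the stylesheet scan is skipped entirely.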