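"""Scrape a webcomic by following its "next" links and saving each page's image.

A minimal usage sketch (the domain and script name are placeholders, not a
real site):

    python scraper.py https://www.example-webcomic.com --limit 25

Sites built with ComicControl are recognized automatically; for other sites,
pass the --img and --next selectors explicitly.
"""
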
import argparse
from pathlib import Path

from bs4 import BeautifulSoup
import requests


def fetch_page(
    domain: str,
    i: int,
    url: str,
    download: bool,
    img_id: str,
    out_dir: Path,
    next_rel: str,
) -> str | None:
    """Download a comic page from a URL and return the URL of the next page.

    :param domain: the comic's domain
    :param i: which page of the comic this is
    :param url: the URL of the page
    :param download: whether to download the comic image
    :param img_id: the id of the <img> tag where the comic is located
    :param out_dir: the directory the comic is saved to
    :param next_rel: the 'rel' attribute of the "next" button
    :return: the URL of the next page, or None if it doesn't exist
    """
    # Some sites link relatively; prefix those URLs with the domain.
    if not url.startswith(("http://", "https://")):
        url = domain + url
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")

    if download:
        img_url = soup.find("img", id=img_id).get("src")
        if not img_url.startswith(("http://", "https://")):
            img_url = domain + img_url
        comic = requests.get(img_url)
        # Zero-pad the page counter so the files sort in reading order.
        filename = out_dir / f"{i:05d}-{url.split('/')[-1]}.{img_url.split('.')[-1]}"

        # Create the output directory (including parents) on first use.
        out_dir.mkdir(parents=True, exist_ok=True)
        with open(filename, "wb") as output_file:
            output_file.write(comic.content)

    # The last page has no "next" link; report that with None instead of
    # crashing on an AttributeError.
    next_link = soup.find("a", rel=next_rel)
    return next_link.get("href") if next_link else None


def fetch_comic(args):
    """Download the requested comic pages to a local directory."""
    page = requests.get(args.domain)
    soup = BeautifulSoup(page.text, "html.parser")

    out_dir = Path.cwd() / args.output

    next_rel = args.next
    img_id = args.img
    if not next_rel or not img_id:
        # ComicControl sites share well-known defaults, so fill in whichever
        # selectors the user didn't pass explicitly.
        for style in soup.find_all("link", rel="stylesheet"):
            if style.get("href").endswith("/comiccontrol/defaultstyles.css"):
                img_id = "cc-comic" if not args.img else args.img
                next_rel = "next" if not args.next else args.next
                break

    if args.first:
        # Jump to the comic's first page before crawling forward.
        first_url = soup.find("a", rel=args.first).get("href")
        if not first_url.startswith(("http://", "https://")):
            first_url = args.domain + first_url
        page = requests.get(first_url)
        soup = BeautifulSoup(page.text, "html.parser")

    # Note: crawling starts from this page's "next" link, so the page we are
    # currently on is itself never passed to fetch_page.
    next_link = soup.find("a", rel=next_rel)
    if next_link is None:
        print(f"No link with rel='{next_rel}' found; first link on the page was:")
        print(soup.find("a"))
        return
    next_url = next_link.get("href")

    begin = args.begin if args.begin else 1
    i = 1
    while i < args.end:
        download = i >= begin
        print(f"Fetching: {next_url}")
        next_url = fetch_page(
            args.domain, i, next_url, download, img_id, out_dir, next_rel
        )
        if not next_url:
            break  # end reached
        if i == args.limit:
            break  # limit reached
        i += 1


def cli():
    """Read the arguments from the command line and run the scraper."""
    parser = argparse.ArgumentParser()

    parser.add_argument("domain", help="The domain of the webcomic you want to fetch")
    parser.add_argument(
        "--output", default="result", help="The directory where the comic is stored"
    )

    parser.add_argument(
        "--img",
        default=None,
        help="HTML 'id' attribute of the comic's <img> element(s)",
    )
    parser.add_argument(
        "--first", default=None, help="HTML 'rel' attribute of the 'first comic' button"
    )
    parser.add_argument(
        "--next", default=None, help="HTML 'rel' attribute of the 'next comic' button"
    )

    parser.add_argument(
        "--limit", default=None, help="Maximum number of pages to fetch", type=int
    )
    parser.add_argument(
        "--begin",
        default=None,
        help="Page number to start downloading at",
        type=int,
    )
    parser.add_argument(
        "--end",
        default=99999,
        help="Page number to stop downloading at (exclusive)",
        type=int,
    )

    args = parser.parse_args()
    print(f"Fetching {args.domain}...")
    fetch_comic(args)


if __name__ == "__main__":
    cli()