"""Fetch the pages of a webcomic by following its "next" links."""

import argparse
from pathlib import Path
from typing import Optional

import requests
from bs4 import BeautifulSoup


def fetch_page(
    domain: str,
    i: int,
    url: str,
    download: bool,
    img_tag: str,
    out_dir: Path,
    next_tag: str,
) -> Optional[str]:
    """Download a comic from a URL and return the URL of the next page.

    :param domain: the comic's domain
    :param i: which page of the comic this is
    :param url: the URL of the page
    :param download: whether to download the comic image on this page
    :param img_tag: the HTML 'id' attribute of the comic's img element
    :param out_dir: the directory the comic is saved to
    :param next_tag: the HTML 'rel' attribute of the "next" link
    :return: the URL of the next page, or None if it doesn't exist
    """
    # Relative links lack a scheme; prefix them with the comic's domain.
    if not url.startswith(("http://", "https://")):
        url = domain + url
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    if download:
        img_url = soup.find("img", id=img_tag).get("src")
        if not img_url.startswith(("http://", "https://")):
            img_url = domain + img_url
        comic = requests.get(img_url)
        # Zero-pad the page number so downloaded files sort in reading order.
        filename = out_dir / f"{i:05d}-{url.split('/')[-1]}.{img_url.split('.')[-1]}"
        print(filename)
        out_dir.mkdir(parents=True, exist_ok=True)
        with open(filename, "wb") as output_file:
            output_file.write(comic.content)
    # On the last page there is no "next" link; signal that to the caller.
    next_link = soup.find("a", rel=next_tag)
    return next_link.get("href") if next_link else None


def fetch_comic(args):
    """Download the requested comic pages to a local directory."""
    page = requests.get(args.domain)
    soup = BeautifulSoup(page.text, "html.parser")
    out_dir = Path.cwd() / args.output
    if args.first:
        # Jump to the comic's first page before starting to crawl.
        first_url = soup.find("a", rel=args.first).get("href")
        if not first_url.startswith(("http://", "https://")):
            first_url = args.domain + first_url
        page = requests.get(first_url)
        soup = BeautifulSoup(page.text, "html.parser")
    try:
        next_url = soup.find("a", rel=args.next).get("href")
    except AttributeError:
        # No link carries the given 'rel' value; print the first anchor on
        # the page as a hint for picking the right --next value, then stop.
        print(soup.find("a"))
        return
    begin = args.begin if args.begin else 1
    i = 1
    while i < args.end:
        # Pages before --begin are crawled for their "next" link but not saved.
        download = i >= begin
        print(f"Fetching comic from {next_url}...")
        next_url = fetch_page(
            args.domain, i, next_url, download, args.img, out_dir, args.next
        )
        if not next_url:
            break  # end reached
        if i == args.limit:
            break  # limit reached
        i += 1


def cli():
    """Read the arguments from the command line and initialize commands."""
    parser = argparse.ArgumentParser()
    parser.add_argument("domain", help="The domain of the webcomic you want to fetch")
    parser.add_argument(
        "--output", default="result", help="The directory where the comic is stored"
    )
    parser.add_argument(
        "--img", default=None, help="HTML 'id' attribute of the comic's img element(s)"
    )
    parser.add_argument(
        "--first", default=None, help="HTML 'rel' attribute of the 'first comic' link"
    )
    parser.add_argument(
        "--next", default=None, help="HTML 'rel' attribute of the 'next comic' link"
    )
    parser.add_argument(
        "--limit", default=None, help="Maximum number of pages to fetch", type=int
    )
    parser.add_argument(
        "--begin",
        default=None,
        help="Page at which to start downloading",
        type=int,
    )
    parser.add_argument(
        "--end",
        default=99999,
        help="Page at which to stop downloading (exclusive)",
        type=int,
    )
    args = parser.parse_args()
    print(f"Fetching {args.domain}...")
    fetch_comic(args)


if __name__ == "__main__":
    cli()
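
# A hypothetical invocation, for orientation (sketch only: the script name,
# domain, img id, and rel values below are made-up examples; the right values
# depend on the target site's markup and are found by inspecting the comic's
# image element and its "first"/"next" navigation links):
#
#   python fetch_comic.py https://comic.example.com \
#       --img comic-image --first start --next next --begin 5 --limit 100
#
# This would crawl from the first page, skip saving pages 1-4, and download
# images from page 5 onward until page 100 or the last page is reached.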