diff --git a/.gitignore b/.gitignore
index 505a3b1..13fb5d6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ build/
 dist/
 wheels/
 *.egg-info
+result/
 
 # Virtual environments
 .venv
diff --git a/main.py b/main.py
index 0817724..7d9b65f 100644
--- a/main.py
+++ b/main.py
@@ -1,7 +1,79 @@
 import argparse
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+import requests
 
 
-def main():
+def fetch_page(
+    domain: str,
+    i: int,
+    url: str,
+    download: bool,
+    img_tag: str,
+    out_dir: Path,
+    next_tag: str,
+) -> str | None:
+    """Download a comic from a URL and return the URL of the next page.
+
+    :param domain: the comic's domain
+    :param i: which page of the comic this is
+    :param url: the URL of the page
+    :param download: whether to download the comic
+    :param img_tag: the 'id' attribute of the comic's <img> tag
+    :param out_dir: the directory the comic is saved to
+    :param next_tag: the 'rel' attribute of the "next" link
+    :return: the URL of the next page, or None if it doesn't exist
+    """
+    if not url.startswith("https://"):
+        url = domain + url
+    page = requests.get(url)
+    soup = BeautifulSoup(page.text, "html.parser")
+
+    if download:
+        img_url = soup.find("img", id=img_tag).get("src")
+        if not img_url.startswith("https://"):
+            img_url = domain + img_url
+        comic = requests.get(img_url)
+        filename = out_dir.joinpath(f"{i}.{img_url.split('.')[-1]}")
+        print(filename)
+
+        out_dir.mkdir(parents=True, exist_ok=True)
+        with open(filename, "wb") as output_file:
+            output_file.write(comic.content)
+
+    # The last page has no "next" link; return None so the caller can stop.
+    next_link = soup.find("a", rel=next_tag)
+    return next_link.get("href") if next_link else None
+
+
+def fetch_comic(args):
+    """Download the requested comic pages to a local directory."""
+    home_page = requests.get(args.domain)
+    home_soup = BeautifulSoup(home_page.text, "html.parser")
+
+    out_dir = Path.cwd().joinpath(args.output)
+
+    # The home page's "first" link points at page 1; fetch_page follows
+    # the "next" links from there and makes relative URLs absolute.
+    next_url = home_soup.find("a", rel=args.first).get("href")
+
+    begin = args.begin if args.begin else 1
+    i = 1
+    while i < args.end:
+        download = i >= begin
+        print(f"Fetching comic from {next_url}...")
+        next_url = fetch_page(
+            args.domain, i, next_url, download, args.img, out_dir, args.next
+        )
+        if not next_url:
+            break  # end reached
+        if i == args.limit:
+            break  # limit reached
+        i += 1
+
+
+def cli():
     """Read the arguments from the command line and initialize commands."""
 
     parser = argparse.ArgumentParser()
@@ -20,17 +92,26 @@ def main():
         "--next", default=None, help="html 'id' tag of the 'next comic' button"
     )
 
-    parser.add_argument("--limit", default=None, help="Maximum of images to download")
     parser.add_argument(
-        "--start", default=None, help="At which page to start with downloading"
+        "--limit", default=None, help="Maximum number of images to download", type=int
    )
     parser.add_argument(
-        "--end", default=None, help="At which page to stop with downloading"
+        "--begin",
+        default=None,
+        help="Page at which to start downloading",
+        type=int,
+    )
+    parser.add_argument(
+        "--end",
+        default=9999999999999,
+        help="Page at which to stop downloading (exclusive)",
+        type=int,
+    )
 
     args = parser.parse_args()
     print(f"Fetching {args.domain}...")
+    fetch_comic(args)
 
 
 if __name__ == "__main__":
-    main()
+    cli()
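
For reference, a minimal sketch of driving the new fetch_comic() entry point
directly, bypassing cli(). The Namespace attributes mirror the argparse options
that fetch_comic() reads; the concrete domain and tag values below are
illustrative assumptions, not taken from a real site.

    # Hypothetical direct invocation of fetch_comic(), e.g. from a test script.
    from argparse import Namespace

    from main import fetch_comic  # safe: main.py guards cli() behind __main__

    args = Namespace(
        domain="https://comic.example.com",  # assumed comic site
        output="result",   # pages land in ./result/, matching the .gitignore entry
        first="start",     # assumed rel attribute of the "first page" link
        next="next",       # assumed rel attribute of the "next page" link
        img="comic",       # assumed id attribute of the comic <img> tag
        limit=None,        # no cap on the number of images
        begin=1,           # start downloading at page 1
        end=10,            # stop before page 10
    )
    fetch_comic(args)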