# fetch-comic/main.py  (127 lines, 3.7 KiB, Python)
import argparse
import os
from pathlib import Path
from bs4 import BeautifulSoup
import requests
def fetch_page(
    domain: str,
    i: int,
    url: str,
    download: bool,
    img_tag: str,
    out_dir: Path,
    next_tag: str,
) -> str:
    """Download a comic image from a URL and return the URL of the next page.

    :param domain: the comic's domain, prepended to relative links
    :param i: which page of the comic this is (used as the output filename)
    :param url: the URL of the page
    :param download: whether to save the comic image found on this page
    :param img_tag: the html 'id' of the <img> tag where the comic is located
    :param out_dir: the directory the comic is saved to (created if missing)
    :param next_tag: the html 'rel' value of the "next" button
    :return: the URL of the next page, None if it doesn't exist
    """
    # Relative links (e.g. "/page/2") need the domain prepended.
    # NOTE(review): only "https://" is recognized as absolute; an "http://"
    # link would be mangled — confirm target sites never serve http links.
    if not url.startswith("https://"):
        url = domain + url
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")
    if download:
        img_url = soup.find("img", id=img_tag).get("src")
        if not img_url.startswith("https://"):
            img_url = domain + img_url
        comic = requests.get(img_url)
        # Name the file after the page number, keeping the image's extension.
        filename = out_dir.joinpath(f"{i}.{img_url.split('.')[-1]}")
        print(filename)
        # Create the output directory up front (including parents) instead of
        # catching FileNotFoundError on open; a context manager guarantees the
        # handle is closed even if the write fails.
        out_dir.mkdir(parents=True, exist_ok=True)
        with open(filename, "wb") as output_file:
            output_file.write(comic.content)
    # BUG FIX: the original dereferenced the find() result unconditionally and
    # raised AttributeError on the last page; the docstring promises None.
    next_link = soup.find("a", rel=next_tag)
    return next_link.get("href") if next_link is not None else None
def fetch_comic(args):
    """Download the requested comic pages to a local directory.

    Starts at ``args.domain`` (or at the page behind the 'first' button when
    ``--first`` is given), then follows 'next' links page by page until the
    end/limit is reached or no next link exists.

    :param args: parsed argparse namespace (domain, output, img, first,
        next, limit, begin, end)
    """
    page = requests.get(args.domain)
    soup = BeautifulSoup(page.text, "html.parser")
    out_dir = Path(os.getcwd()).joinpath(args.output)
    if args.first:
        # Jump to the comic's very first page before walking forward.
        first_url = soup.find("a", rel=args.first).get("href")
        if not first_url.startswith("https://"):
            first_url = args.domain + first_url
        page = requests.get(first_url)
        soup = BeautifulSoup(page.text, "html.parser")
    # BUG FIX: if the 'next' link lookup failed, the original only printed a
    # hint and then crashed with NameError on the unbound next_url below.
    next_url = None
    try:
        next_url = soup.find("a", rel=args.next).get("href")
    except AttributeError:
        # No anchor matched --next; show the first anchor to help debugging.
        print(soup.find("a"))
    if next_url is None:
        return
    # Hoisted out of the loop: the starting page never changes mid-run.
    begin = args.begin if args.begin else 1
    i = 1
    while i < args.end:
        # Pages before --begin are walked (to follow the link chain) but
        # not downloaded.
        download = i >= begin
        print(f"Fetching comic from {next_url}...")
        next_url = fetch_page(
            args.domain, i, next_url, download, args.img, out_dir, args.next
        )
        if not next_url:
            break  # end reached
        if i == args.limit:
            break  # limit reached
        i += 1
def cli():
    """Parse the command-line arguments and start fetching the comic."""
    parser = argparse.ArgumentParser()
    add = parser.add_argument  # shorthand for the declarations below
    add("domain", help="The domain of the webcomic you want to fetch")
    add("--output", default="result", help="The directory where the comic is stored")
    add("--img", default=None, help="html 'id' tag of the comic's img element(s)")
    add("--first", default=None, help="html 'rel' tag of the 'first comic' button")
    add("--next", default=None, help="html 'rel' tag of the 'next comic' button")
    add("--limit", default=None, type=int, help="Maximum of images to download")
    add("--begin", default=None, type=int, help="At which page to start with downloading")
    add("--end", default=9999999999999, type=int, help="At which page to stop with downloading")
    args = parser.parse_args()
    print(f"Fetching {args.domain}...")
    fetch_comic(args)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    cli()