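"""Scrape a webcomic by following its "next" links and saving each page's image.

A minimal usage sketch (the domain and script name are placeholders, not a
real site):

    python scraper.py https://www.example-webcomic.com --limit 25

Sites built with ComicControl are recognized automatically; for other sites,
pass the --img and --next selectors explicitly.
"""
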
import argparse
from pathlib import Path

from bs4 import BeautifulSoup
import requests


def fetch_page(
    domain: str,
    i: int,
    url: str,
    download: bool,
    img_id: str,
    out_dir: Path,
    next_rel: str,
) -> str | None:
    """Download a comic page from a URL and return the URL of the next page.

    :param domain: the comic's domain
    :param i: which page of the comic this is
    :param url: the URL of the page
    :param download: whether to download the comic image
    :param img_id: the id of the <img> tag where the comic is located
    :param out_dir: the directory the comic is saved to
    :param next_rel: the 'rel' attribute of the "next" button
    :return: the URL of the next page, or None if it doesn't exist
    """
    # Some sites link relatively; prefix those URLs with the domain.
    if not url.startswith(("http://", "https://")):
        url = domain + url
    page = requests.get(url)
    soup = BeautifulSoup(page.text, "html.parser")

    if download:
        img_url = soup.find("img", id=img_id).get("src")
        if not img_url.startswith(("http://", "https://")):
            img_url = domain + img_url
        comic = requests.get(img_url)
        # Zero-pad the page counter so the files sort in reading order.
        filename = out_dir / f"{i:05d}-{url.split('/')[-1]}.{img_url.split('.')[-1]}"

        # Create the output directory (including parents) on first use.
        out_dir.mkdir(parents=True, exist_ok=True)
        with open(filename, "wb") as output_file:
            output_file.write(comic.content)

    # The last page has no "next" link; report that with None instead of
    # crashing on an AttributeError.
    next_link = soup.find("a", rel=next_rel)
    return next_link.get("href") if next_link else None


def fetch_comic(args):
    """Download the requested comic pages to a local directory."""
    page = requests.get(args.domain)
    soup = BeautifulSoup(page.text, "html.parser")

    out_dir = Path.cwd() / args.output

    next_rel = args.next
    img_id = args.img
    if not next_rel or not img_id:
        # ComicControl sites share well-known defaults, so fill in whichever
        # selectors the user didn't pass explicitly.
        for style in soup.find_all("link", rel="stylesheet"):
            if style.get("href").endswith("/comiccontrol/defaultstyles.css"):
                img_id = "cc-comic" if not args.img else args.img
                next_rel = "next" if not args.next else args.next
                break

    if args.first:
        # Jump to the comic's first page before crawling forward.
        first_url = soup.find("a", rel=args.first).get("href")
        if not first_url.startswith(("http://", "https://")):
            first_url = args.domain + first_url
        page = requests.get(first_url)
        soup = BeautifulSoup(page.text, "html.parser")

    # Note: crawling starts from this page's "next" link, so the page we are
    # currently on is itself never passed to fetch_page.
    next_link = soup.find("a", rel=next_rel)
    if next_link is None:
        print(f"No link with rel='{next_rel}' found; first link on the page was:")
        print(soup.find("a"))
        return
    next_url = next_link.get("href")

    begin = args.begin if args.begin else 1
    i = 1
    while i < args.end:
        download = i >= begin
        print(f"Fetching: {next_url}")
        next_url = fetch_page(
            args.domain, i, next_url, download, img_id, out_dir, next_rel
        )
        if not next_url:
            break  # end reached
        if i == args.limit:
            break  # limit reached
        i += 1


def cli():
    """Read the arguments from the command line and run the scraper."""
    parser = argparse.ArgumentParser()

    parser.add_argument("domain", help="The domain of the webcomic you want to fetch")
    parser.add_argument(
        "--output", default="result", help="The directory where the comic is stored"
    )

    parser.add_argument(
        "--img",
        default=None,
        help="HTML 'id' attribute of the comic's <img> element(s)",
    )
    parser.add_argument(
        "--first", default=None, help="HTML 'rel' attribute of the 'first comic' button"
    )
    parser.add_argument(
        "--next", default=None, help="HTML 'rel' attribute of the 'next comic' button"
    )

    parser.add_argument(
        "--limit", default=None, help="Maximum number of pages to fetch", type=int
    )
    parser.add_argument(
        "--begin",
        default=None,
        help="Page number to start downloading at",
        type=int,
    )
    parser.add_argument(
        "--end",
        default=99999,
        help="Page number to stop downloading at (exclusive)",
        type=int,
    )

    args = parser.parse_args()
    print(f"Fetching {args.domain}...")
    fetch_comic(args)


if __name__ == "__main__":
    cli()