feat: support comiccontrol CMS by default without --next and --img arguments

missytake 2025-11-24 22:14:18 +01:00
parent 79a10f9c41
commit 44bb5f2b15
Signed by: missytake
GPG key ID: 04CC6658320518DF

--- a/main.py
+++ b/main.py

@@ -11,9 +11,9 @@ def fetch_page(
     i: int,
     url: str,
     download: bool,
-    img_tag: str,
+    img_id: str,
     out_dir: Path,
-    next_tag: str,
+    next_rel: str,
 ) -> str:
     """Download a comic from a URL and return the URL of the next page
@@ -21,9 +21,9 @@ def fetch_page(
     :param i: which page of the comic this is
     :param url: the URL of the page
     :param download: whether to download the comic
-    :param img_tag: the id of the <img> tag where the comic is located
+    :param img_id: the id of the <img> tag where the comic is located
     :param out_dir: the directory the comic is saved to
-    :param next_tag: the tag of the "next" button
+    :param next_rel: the tag of the "next" button
     :return: the URL of the next page, None if it doesn't exist
     """
     if not url.startswith("https://"):
@@ -32,12 +32,11 @@ def fetch_page(
     soup = BeautifulSoup(page.text, "html.parser")
     if download:
-        img_url = soup.find("img", id=img_tag).get("src")
+        img_url = soup.find("img", id=img_id).get("src")
         if not img_url.startswith("https://"):
             img_url = domain + img_url
         comic = requests.get(img_url)
         filename = out_dir / f"{i:05d}-{url.split('/')[-1]}.{img_url.split('.')[-1]}"
-        print(filename)
         try:
             output_file = open(filename, "wb")
@@ -47,7 +46,7 @@ def fetch_page(
         output_file.write(comic.content)
         output_file.close()
-    return soup.find("a", rel=next_tag).get("href")
+    return soup.find("a", rel=next_rel).get("href")
 
 
 def fetch_comic(args):
@@ -57,23 +56,33 @@ def fetch_comic(args):
     out_dir = Path(os.getcwd()).joinpath(args.output)
+    next_rel = args.next
+    img_id = args.img
+    if not next_rel or not img_id:
+        for style in soup.find_all("link", rel="stylesheet"):
+            if style.get("href").endswith("/comiccontrol/defaultstyles.css"):
+                img_id = "cc-comic" if not args.img else args.img
+                next_rel = "next" if not args.next else args.next
+                break
     if args.first:
         first_url = soup.find("a", rel=args.first).get("href")
         if not first_url.startswith("https://"):
             first_url = args.domain + first_url
         page = requests.get(first_url)
         soup = BeautifulSoup(page.text, "html.parser")
     try:
-        next_url = soup.find("a", rel=args.next).get("href")
+        next_url = soup.find("a", rel=next_rel).get("href")
     except AttributeError:
         print(soup.find("a"))
     i = 1
     while i < args.end:
         begin = args.begin if args.begin else 1
         download = False if i < begin else True
-        print(f"Fetching comic from {next_url}...")
+        print(f"Fetching: {next_url}")
         next_url = fetch_page(
-            args.domain, i, next_url, download, args.img, out_dir, args.next
+            args.domain, i, next_url, download, img_id, out_dir, next_rel
         )
         if not next_url:
             break  # end reached
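For context, the detection heuristic this commit adds can be exercised in isolation. The sketch below runs the same loop against hypothetical ComicControl markup; the HTML, the local img_id/next_rel stand-ins, and the final print are illustrative and not part of main.py:

from bs4 import BeautifulSoup

# Hypothetical stand-in for a ComicControl page that links the stock
# stylesheet; real sites embed this <link> in their <head>.
html = """
<html><head>
  <link rel="stylesheet" href="/comiccontrol/defaultstyles.css">
</head><body>
  <img id="cc-comic" src="/comics/00001.png">
  <a rel="next" href="/comic/2">Next</a>
</body></html>
"""

soup = BeautifulSoup(html, "html.parser")

img_id = None    # stands in for args.img when --img is not given
next_rel = None  # stands in for args.next when --next is not given
if not next_rel or not img_id:
    for style in soup.find_all("link", rel="stylesheet"):
        # A href ending in /comiccontrol/defaultstyles.css marks the CMS;
        # fall back to the ComicControl defaults for the selectors.
        if style.get("href").endswith("/comiccontrol/defaultstyles.css"):
            img_id = "cc-comic" if not img_id else img_id
            next_rel = "next" if not next_rel else next_rel
            break

print(img_id, next_rel)  # -> cc-comic next

Explicit --img/--next values still take precedence: the conditional expressions only substitute the ComicControl defaults when the corresponding argument is unset.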