From 44bb5f2b15f36e4d596bd254fbb17230e05ae00c Mon Sep 17 00:00:00 2001
From: missytake
Date: Mon, 24 Nov 2025 22:14:18 +0100
Subject: [PATCH] feat: support comiccontrol CMS by default without --next and
 --img arguments

---
 main.py | 29 +++++++++++++++++++----------
 1 file changed, 19 insertions(+), 10 deletions(-)

diff --git a/main.py b/main.py
index 00208af..9b71b0a 100644
--- a/main.py
+++ b/main.py
@@ -11,9 +11,9 @@ def fetch_page(
     i: int,
     url: str,
     download: bool,
-    img_tag: str,
+    img_id: str,
     out_dir: Path,
-    next_tag: str,
+    next_rel: str,
 ) -> str:
     """Download a comic from a URL and return the URL of the next page
 
@@ -21,9 +21,9 @@
     :param i: which page of the comic this is
     :param url: the URL of the page
     :param download: whether to download the comic
-    :param img_tag: the id of the tag where the comic is located
+    :param img_id: the id of the tag where the comic is located
     :param out_dir: the directory the comic is saved to
-    :param next_tag: the tag of the "next" button
+    :param next_rel: the tag of the "next" button
     :return: the URL of the next page, None if it doesn't exist
     """
     if not url.startswith("https://"):
@@ -32,12 +32,11 @@
     soup = BeautifulSoup(page.text, "html.parser")
 
     if download:
-        img_url = soup.find("img", id=img_tag).get("src")
+        img_url = soup.find("img", id=img_id).get("src")
         if not img_url.startswith("https://"):
             img_url = domain + img_url
         comic = requests.get(img_url)
         filename = out_dir / f"{i:05d}-{url.split('/')[-1]}.{img_url.split('.')[-1]}"
-        print(filename)
 
         try:
             output_file = open(filename, "wb")
@@ -47,7 +46,7 @@
         output_file.write(comic.content)
         output_file.close()
 
-    return soup.find("a", rel=next_tag).get("href")
+    return soup.find("a", rel=next_rel).get("href")
 
 
 def fetch_comic(args):
@@ -57,23 +56,33 @@
 
     out_dir = Path(os.getcwd()).joinpath(args.output)
 
+    next_rel = args.next
+    img_id = args.img
+    if not next_rel or not img_id:
+        for style in soup.find_all("link", rel="stylesheet"):
+            if style.get("href").endswith("/comiccontrol/defaultstyles.css"):
+                img_id = "cc-comic" if not args.img else args.img
+                next_rel = "next" if not args.next else args.next
+                break
+
     if args.first:
         first_url = soup.find("a", rel=args.first).get("href")
         if not first_url.startswith("https://"):
             first_url = args.domain + first_url
         page = requests.get(first_url)
         soup = BeautifulSoup(page.text, "html.parser")
+
     try:
-        next_url = soup.find("a", rel=args.next).get("href")
+        next_url = soup.find("a", rel=next_rel).get("href")
     except AttributeError:
         print(soup.find("a"))
     i = 1
     while i < args.end:
         begin = args.begin if args.begin else 1
         download = False if i < begin else True
-        print(f"Fetching comic from {next_url}...")
+        print(f"Fetching: {next_url}")
         next_url = fetch_page(
-            args.domain, i, next_url, download, args.img, out_dir, args.next
+            args.domain, i, next_url, download, img_id, out_dir, next_rel
         )
         if not next_url:
             break  # end reached
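
Note on the detection logic added above, with a standalone sketch: the patch recognizes a ComicControl site by looking for a stylesheet link whose href ends in /comiccontrol/defaultstyles.css, and only when --img or --next was not given does it fall back to the CMS defaults (img id "cc-comic", rel="next" on the navigation link). The snippet below reproduces that check outside of main.py; the URL and the comiccontrol_defaults helper name are illustrative and not part of the patch.

# Standalone sketch of the ComicControl auto-detection introduced by this
# patch. Only the stylesheet suffix, the "cc-comic" img id and rel="next"
# come from the patch; the URL and helper name below are hypothetical.
import requests
from bs4 import BeautifulSoup


def comiccontrol_defaults(soup: BeautifulSoup) -> tuple[str, str] | None:
    """Return ("cc-comic", "next") if the page links ComicControl's default stylesheet."""
    for style in soup.find_all("link", rel="stylesheet"):
        href = style.get("href") or ""
        if href.endswith("/comiccontrol/defaultstyles.css"):
            return "cc-comic", "next"
    return None


if __name__ == "__main__":
    page = requests.get("https://example-webcomic.com")  # hypothetical URL
    soup = BeautifulSoup(page.text, "html.parser")
    defaults = comiccontrol_defaults(soup)
    if defaults:
        img_id, next_rel = defaults
        print(f"ComicControl detected: img id={img_id!r}, next rel={next_rel!r}")
    else:
        print("Not a ComicControl site; pass --img and --next explicitly.")

Because the defaults are applied only when the corresponding flag is missing, explicit --img and --next arguments still take precedence, which keeps the change backwards compatible.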