Remove --first and --begin; just giving the URL where you want to start is way easier

This commit is contained in:
missytake 2025-12-23 08:01:18 +01:00
parent 3ea6392689
commit 001c361290
Signed by: missytake
GPG key ID: 04CC6658320518DF

54
main.py
View file

@ -10,7 +10,6 @@ def fetch_page(
domain: str,
i: int,
url: str,
download: bool,
img_tag: {str,str},
out_dir: Path,
next_tag: {str,str},
@ -20,7 +19,6 @@ def fetch_page(
:param domain: the comic's domain
:param i: which page of the comic this is
:param url: the URL of the page
:param download: whether to download the comic
:param img_tag: the html attribute and its content of the <img> tag where the comic is located
:param out_dir: the directory the comic is saved to
:param next_tag: the html attribute and its content of the "next" button
@ -31,20 +29,19 @@ def fetch_page(
page = requests.get(url)
soup = BeautifulSoup(page.text, "html.parser")
if download:
img_url = soup.find("img", **img_tag).get("src")
if not img_url.startswith("https://"):
img_url = domain + img_url
comic = requests.get(img_url)
filename = out_dir / f"{i:05d}-{url.split('/')[-1]}.{img_url.split('.')[-1]}"
img_url = soup.find("img", **img_tag).get("src")
if not img_url.startswith("https://"):
img_url = domain + img_url
comic = requests.get(img_url)
filename = out_dir / f"{i:05d}-{url.split('/')[-1]}.{img_url.split('.')[-1]}"
try:
output_file = open(filename, "wb")
except FileNotFoundError:
os.mkdir(out_dir)
output_file = open(filename, "wb")
output_file.write(comic.content)
output_file.close()
try:
output_file = open(filename, "wb")
except FileNotFoundError:
os.mkdir(out_dir)
output_file = open(filename, "wb")
output_file.write(comic.content)
output_file.close()
return soup.find("a", **next_tag).get("href")
@ -69,24 +66,12 @@ def fetch_comic(args):
img_split = args.img.split("::")
img_tag = {img_split[0]: img_split[1]}
if args.first:
first_url = soup.find("a", rel=args.first).get("href")
if not first_url.startswith("https://"):
first_url = args.domain + first_url
page = requests.get(first_url)
soup = BeautifulSoup(page.text, "html.parser")
try:
next_url = soup.find("a", **next_tag).get("href")
except AttributeError:
print(soup.find("a"))
i = 1
next_url = args.domain
i = 0
while i < args.end:
begin = args.begin if args.begin else 1
download = False if i < begin else True
print(f"Fetching: {next_url}")
next_url = fetch_page(
args.domain, i, next_url, download, img_tag, out_dir, next_tag
args.domain, i, next_url, img_tag, out_dir, next_tag
)
if not next_url:
break # end reached
@ -107,9 +92,6 @@ def cli():
parser.add_argument(
"--img", default=None, help="a unique html_attribute::content of the comic's img element"
)
parser.add_argument(
"--first", default=None, help="html 'rel' tag of the 'first comic' button"
)
parser.add_argument(
"--next", default=None, help="html attribute and content of the 'next comic' button"
)
@ -117,12 +99,6 @@ def cli():
parser.add_argument(
"--limit", default=None, help="Maximum of images to download", type=int
)
parser.add_argument(
"--begin",
default=None,
help="At which page to start with downloading",
type=int,
)
parser.add_argument(
"--end",
default=99999,