make gogetaroomie.com work
parent bb3f873450
commit 95b07cd802
.gitignore (vendored): 1 changed line
@@ -5,6 +5,7 @@ build/
 dist/
 wheels/
 *.egg-info
+result/
 
 # Virtual environments
 .venv
main.py: 99 changed lines
@@ -1,7 +1,87 @@
 import argparse
+import os
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+import requests
 
 
-def main():
+def fetch_page(
+    domain: str,
+    i: int,
+    url: str,
+    download: bool,
+    img_tag: str,
+    out_dir: Path,
+    next_tag: str,
+) -> str:
+    """Download a comic from a URL and return the URL of the next page
+
+    :param domain: the comic's domain
+    :param i: which page of the comic this is
+    :param url: the URL of the page
+    :param download: whether to download the comic
+    :param img_tag: the id of the <img> tag where the comic is located
+    :param out_dir: the directory the comic is saved to
+    :param next_tag: the tag of the "next" button
+    :return: the URL of the next page, None if it doesn't exist
+    """
+    if not url.startswith("https://"):
+        url = domain + url
+    page = requests.get(url)
+    soup = BeautifulSoup(page.text, "html.parser")
+
+    if download:
+        img_url = soup.find("img", id=img_tag).get("src")
+        if not img_url.startswith("https://"):
+            img_url = domain + img_url
+        comic = requests.get(img_url)
+        filename = out_dir.joinpath(f"{i}.{img_url.split('.')[-1]}")
+        print(filename)
+
+        try:
+            output_file = open(filename, "wb")
+        except FileNotFoundError:
+            os.mkdir(out_dir)
+            output_file = open(filename, "wb")
+        output_file.write(comic.content)
+        output_file.close()
+
+    return soup.find("a", rel=next_tag).get("href")
+
+
+def fetch_comic(args):
+    """Download the requested comic pages to a local directory."""
+    home_page = requests.get(args.domain)
+    home_s = BeautifulSoup(home_page.text, "html.parser")
+
+    out_dir = Path(os.getcwd()).joinpath(args.output)
+
+    first_url = home_s.find("a", rel=args.first).get("href")
+    if not first_url.startswith("https://"):
+        first_url = args.domain + first_url
+    next_page = requests.get(first_url)
+    next_soup = BeautifulSoup(next_page.text, "html.parser")
+    try:
+        next_url = next_soup.find("a", rel=args.next).get("href")
+    except AttributeError:
+        print(next_soup.find("a"))
+    i = 1
+    while i < args.end:
+        begin = args.begin if args.begin else 1
+        download = False if i < begin else True
+        print(f"Fetching comic from {next_url}...")
+        next_url = fetch_page(
+            args.domain, i, next_url, download, args.img, out_dir, args.next
+        )
+        if not next_url:
+            break  # end reached
+        if i == args.limit:
+            break  # limit reached
+        i += 1
+
+
+def cli():
     """Read the arguments from the command line and initialize commands."""
     parser = argparse.ArgumentParser()
 
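Everything in fetch_page and fetch_comic hinges on BeautifulSoup matching anchors by their rel attribute: bs4 treats rel as a multi-valued attribute, so find("a", rel=value) matches any <a> whose rel list contains that value. A minimal sketch of the lookup, with illustrative markup not taken from any real site:

from bs4 import BeautifulSoup

# Made-up markup standing in for a comic page.
html = '<a rel="prev" href="/comic/1">Prev</a><a rel="next" href="/comic/3">Next</a>'
soup = BeautifulSoup(html, "html.parser")

# Matches the anchor whose rel list contains "next"; find() returns None
# when nothing matches, so .get("href") then raises AttributeError -- which
# is exactly what fetch_comic's try/except around the first lookup catches.
link = soup.find("a", rel="next")
print(link.get("href"))  # -> /comic/3

The second hunk reworks the CLI flags that feed this loop: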
@@ -20,17 +100,26 @@ def main():
         "--next", default=None, help="html 'id' tag of the 'next comic' button"
     )
 
-    parser.add_argument("--limit", default=None, help="Maximum of images to download")
     parser.add_argument(
-        "--start", default=None, help="At which page to start with downloading"
+        "--limit", default=None, help="Maximum of images to download", type=int
     )
     parser.add_argument(
-        "--end", default=None, help="At which page to stop with downloading"
+        "--begin",
+        default=None,
+        help="At which page to start with downloading",
+        type=int,
+    )
+    parser.add_argument(
+        "--end",
+        default=9999999999999,
+        help="At which page to stop with downloading",
+        type=int,
+    )
 
     args = parser.parse_args()
     print(f"Fetching {args.domain}...")
+    fetch_comic(args)
 
 
 if __name__ == "__main__":
-    main()
+    cli()
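Two things change in the flag handling: --start is renamed to --begin (matching the args.begin attribute fetch_comic reads), and --limit, --begin, and --end now carry type=int. Without type=int, argparse passes the values through as strings, so the comparisons in fetch_comic would misbehave: i < args.end raises TypeError in Python 3, and i == args.limit silently never matches. The oversized --end default effectively means "no upper bound", leaving --limit and a missing "next" link as the usual stop conditions. Assuming the unchanged portion of cli() also defines --domain, --first, --img, and --output flags (fetch_comic reads those attribute names), a run against the site in the commit title would presumably look like:

python main.py --domain https://gogetaroomie.com --output comics --limit 10

though the spellings of the flags outside this diff are a guess.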
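One quirk in fetch_page worth flagging: the output directory is created lazily, by catching the FileNotFoundError that open() raises on the first write. A more conventional sketch of the same save step, with a hypothetical URL and directory name:

from pathlib import Path

import requests

out_dir = Path.cwd() / "comics"
out_dir.mkdir(parents=True, exist_ok=True)  # no-op if the directory exists

comic = requests.get("https://example.com/comics/3.png")  # placeholder URL
with open(out_dir / "3.png", "wb") as comic_file:  # with-block also closes the file
    comic_file.write(comic.content)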