Remove --first and --begin; just giving the URL where you want to start is way easier
This commit is contained in:
parent
3ea6392689
commit
001c361290
54
main.py
54
main.py
|
|
@ -10,7 +10,6 @@ def fetch_page(
|
|||
domain: str,
|
||||
i: int,
|
||||
url: str,
|
||||
download: bool,
|
||||
img_tag: {str,str},
|
||||
out_dir: Path,
|
||||
next_tag: {str,str},
|
||||
|
|
@ -20,7 +19,6 @@ def fetch_page(
|
|||
:param domain: the comic's domain
|
||||
:param i: which page of the comic this is
|
||||
:param url: the URL of the page
|
||||
:param download: whether to download the comic
|
||||
:param img_tag: the html attribute and its content of the <img> tag where the comic is located
|
||||
:param out_dir: the directory the comic is saved to
|
||||
:param next_tag: the html attribute and its content of the "next" button
|
||||
|
|
@ -31,20 +29,19 @@ def fetch_page(
|
|||
page = requests.get(url)
|
||||
soup = BeautifulSoup(page.text, "html.parser")
|
||||
|
||||
if download:
|
||||
img_url = soup.find("img", **img_tag).get("src")
|
||||
if not img_url.startswith("https://"):
|
||||
img_url = domain + img_url
|
||||
comic = requests.get(img_url)
|
||||
filename = out_dir / f"{i:05d}-{url.split('/')[-1]}.{img_url.split('.')[-1]}"
|
||||
img_url = soup.find("img", **img_tag).get("src")
|
||||
if not img_url.startswith("https://"):
|
||||
img_url = domain + img_url
|
||||
comic = requests.get(img_url)
|
||||
filename = out_dir / f"{i:05d}-{url.split('/')[-1]}.{img_url.split('.')[-1]}"
|
||||
|
||||
try:
|
||||
output_file = open(filename, "wb")
|
||||
except FileNotFoundError:
|
||||
os.mkdir(out_dir)
|
||||
output_file = open(filename, "wb")
|
||||
output_file.write(comic.content)
|
||||
output_file.close()
|
||||
try:
|
||||
output_file = open(filename, "wb")
|
||||
except FileNotFoundError:
|
||||
os.mkdir(out_dir)
|
||||
output_file = open(filename, "wb")
|
||||
output_file.write(comic.content)
|
||||
output_file.close()
|
||||
|
||||
return soup.find("a", **next_tag).get("href")
|
||||
|
||||
|
|
@ -69,24 +66,12 @@ def fetch_comic(args):
|
|||
img_split = args.img.split("::")
|
||||
img_tag = {img_split[0]: img_split[1]}
|
||||
|
||||
if args.first:
|
||||
first_url = soup.find("a", rel=args.first).get("href")
|
||||
if not first_url.startswith("https://"):
|
||||
first_url = args.domain + first_url
|
||||
page = requests.get(first_url)
|
||||
soup = BeautifulSoup(page.text, "html.parser")
|
||||
|
||||
try:
|
||||
next_url = soup.find("a", **next_tag).get("href")
|
||||
except AttributeError:
|
||||
print(soup.find("a"))
|
||||
i = 1
|
||||
next_url = args.domain
|
||||
i = 0
|
||||
while i < args.end:
|
||||
begin = args.begin if args.begin else 1
|
||||
download = False if i < begin else True
|
||||
print(f"Fetching: {next_url}")
|
||||
next_url = fetch_page(
|
||||
args.domain, i, next_url, download, img_tag, out_dir, next_tag
|
||||
args.domain, i, next_url, img_tag, out_dir, next_tag
|
||||
)
|
||||
if not next_url:
|
||||
break # end reached
|
||||
|
|
@ -107,9 +92,6 @@ def cli():
|
|||
parser.add_argument(
|
||||
"--img", default=None, help="a unique html_attribute::content of the comic's img element"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--first", default=None, help="html 'rel' tag of the 'first comic' button"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--next", default=None, help="html attribute and content of the 'next comic' button"
|
||||
)
|
||||
|
|
@ -117,12 +99,6 @@ def cli():
|
|||
parser.add_argument(
|
||||
"--limit", default=None, help="Maximum of images to download", type=int
|
||||
)
|
||||
parser.add_argument(
|
||||
"--begin",
|
||||
default=None,
|
||||
help="At which page to start with downloading",
|
||||
type=int,
|
||||
)
|
||||
parser.add_argument(
|
||||
"--end",
|
||||
default=99999,
|
||||
|
|
|
|||
Loading…
Reference in a new issue