#!/usr/bin/env python3
"""
|
|
Download all the pdfs linked on a given webpage
|
|
|
|
Usage -
|
|
|
|
python grab_pdfs.py url <path/to/directory>
|
|
url is required
|
|
path is optional. Path needs to be absolute
|
|
will save in the current directory if no path is given
|
|
will save in the current directory if given path does not exist
|
|
|
|
Requires - requests >= 1.0.4
|
|
beautifulsoup >= 4.0.0
|
|
|
|
Download and install using
|
|
|
|
pip install requests
|
|
pip install beautifulsoup4
|
|
"""

__author__ = 'elssar <elssar@altrawcode.com>'
__license__ = 'MIT'
__version__ = '1.0.0'

from os import path, getcwd
from sys import argv, exit
from urllib.parse import urljoin

from bs4 import BeautifulSoup as soup
from requests import get

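
# Fetch a page and return its HTML text; any non-200 response raises.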
def get_page(base_url):
    req = get(base_url)
    if req.status_code == 200:
        return req.text
    raise Exception('Error {0}'.format(req.status_code))

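
# Parse the HTML and collect every anchor tag on the page.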
def get_all_links(html):
    bs = soup(html, 'html.parser')
    return bs.find_all('a')

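
# Walk the links, download those ending in .pdf, and save them in base_dir.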
def get_pdf(base_url, base_dir):
    html = get_page(base_url)
    links = get_all_links(html)
    if not links:
        raise Exception('No links found on the webpage')
    n_pdfs = 0
    for link in links:
        # some anchors have no href attribute, so use get() with a default
        href = link.get('href', '')
        if href.endswith('.pdf'):
            content = get(urljoin(base_url, href))
            if content.status_code == 200 and content.headers.get('content-type', '') == 'application/pdf':
                # the file is named after the link's text
                with open(path.join(base_dir, link.text + '.pdf'), 'wb') as pdf:
                    pdf.write(content.content)
                n_pdfs += 1
    if n_pdfs == 0:
        raise Exception('No pdfs found on the page')
    print(f'{n_pdfs} pdfs downloaded and saved in {base_dir}')

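
# CLI entry point: the url is required, the save directory is optional.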
if __name__ == '__main__':
    if len(argv) not in (2, 3):
        print('Error! Invalid arguments')
        print(__doc__)
        exit(1)
    url = argv[1]
    arg = argv[2] if len(argv) == 3 else ''
    # fall back to the current directory when no valid path is given
    base_dir = arg if path.isdir(arg) else getcwd()
    try:
        get_pdf(url, base_dir)
    except Exception as e:
        print(e)
        exit(1)
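
# Example run (hypothetical url and directory):
#   python grab_pdfs.py https://example.com/papers /tmp/pdfs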