2025-01-04 20:37:35 +00:00
|
|
|
import base64
import sys
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
def absolute_url(url, base_domain):
    '''
    Makes any URL into an absolute URL (i.e. not a relative link)

    url: the link as written in the page. It may already be absolute,
        protocol-relative ('//host/path'), root-relative ('/path') or a
        bare path ('path').
    base_domain: scheme and host of the page, e.g. 'https://example.com'.

    Returns the URL unchanged when it already carries a scheme (http:,
    https:, data:, mailto:, ...). Otherwise the URL is attached to
    base_domain; bare paths are treated as root-relative, not as relative
    to the page's directory.
    '''
    url = url.strip()

    # Protocol-relative links inherit the scheme of the page itself.
    if url.startswith('//'):
        scheme = urlsplit(base_domain).scheme or 'https'
        return f'{scheme}:{url}'

    # Anything that names its own scheme is already absolute.
    if urlsplit(url).scheme:
        return url

    if not url.startswith('/'):
        url = '/' + url
    # Drop a trailing slash on the base so the join never yields '//'.
    return base_domain.rstrip('/') + url
|
|
|
|
|
|
|
|
|
|
|
|
# the link without the path and stuff
# e.g. https://google.com/aaaaaa -> https://google.com
# urlsplit is used instead of slicing at a fixed offset, so a query string
# or a missing scheme cannot leak into (or truncate) the result.
_page_parts = urlsplit(sys.argv[1])
if _page_parts.scheme and _page_parts.netloc:
    domain_thing = f'{_page_parts.scheme}://{_page_parts.netloc}'
else:
    # No recognisable scheme/host: fall back to the argument as given.
    domain_thing = sys.argv[1]
|
|
|
|
|
|
|
|
# Download the page named by the first command-line argument.
page_response = requests.get(sys.argv[1], timeout=30)
# Stop here on 4xx/5xx instead of inlining an error page.
page_response.raise_for_status()

# The raw bytes are handed to BeautifulSoup so the parser can work out the
# document's encoding itself rather than assuming UTF-8.
html = page_response.content
soup = BeautifulSoup(html, 'html5lib')
|
|
|
|
|
|
|
|
# hardcode favicon
# Every <link rel="icon"> gets its href replaced by a data: URI holding the
# icon's bytes, so the saved page does not need to fetch it.
favicons = soup.find_all('link', rel='icon')
for favicon in favicons:
    href = favicon.attrs.get('href')
    # Nothing to download, or the icon is already embedded.
    if not href or href.startswith('data:'):
        continue

    url = absolute_url(href, domain_thing)

    # A single GET supplies both the bytes and the Content-Type header, so
    # no separate HEAD request is made.
    icon_response = requests.get(url, timeout=30)
    if not icon_response.ok:
        # Keep the original link rather than embedding an error body.
        continue

    # Generic fallback for servers that send no Content-Type.
    mime_type = icon_response.headers.get(
        'Content-Type', 'application/octet-stream')
    as_base64 = base64.b64encode(icon_response.content).decode()
    new_url = f'data:{mime_type};base64,{as_base64}'

    favicon.attrs['href'] = new_url
|
|
|
|
|
|
|
|
# hardcode images
# Every <img> gets its src replaced by a data: URI holding the image bytes.
imgs = soup.find_all('img')
for item in imgs:
    src = item.attrs.get('src')
    # Skip tags without a src and images that are already embedded.
    if not src or src.startswith('data:'):
        continue

    url = absolute_url(src, domain_thing)

    # A single GET supplies both the bytes and the Content-Type header, so
    # no separate HEAD request is made.
    img_response = requests.get(url, timeout=30)
    if not img_response.ok:
        # Keep the original src rather than embedding an error body.
        continue

    # Generic fallback for servers that send no Content-Type.
    mime_type = img_response.headers.get(
        'Content-Type', 'application/octet-stream')
    as_base64 = base64.b64encode(img_response.content).decode()
    new_url = f'data:{mime_type};base64,{as_base64}'
    item.attrs['src'] = new_url
|
|
|
|
|
2025-01-04 23:19:32 +00:00
|
|
|
# hardcode css
# Every <link rel="stylesheet"> is swapped, in place, for a <style> tag
# holding the downloaded CSS. Replacing in place keeps the stylesheets in
# their original document order, which the cascade depends on.
# NOTE: url(...) references inside the downloaded CSS are inserted as-is;
# they are not rewritten or embedded.
stylesheets = soup.find_all('link', rel='stylesheet')
for item in stylesheets:
    href = item.attrs.get('href')
    # Nothing to download, or not a fetchable location.
    if not href or href.startswith('data:'):
        continue

    url = absolute_url(href, domain_thing)
    css_response = requests.get(url, timeout=30)
    if not css_response.ok:
        # Keep the <link> rather than inlining an error body as CSS.
        continue

    # errors='replace' so one stray non-UTF-8 byte cannot abort the run.
    style_data = css_response.content.decode('utf-8', errors='replace')

    new_tag = soup.new_tag('style')
    new_tag.string = style_data
    item.replace_with(new_tag)
|
|
|
|
|
|
|
|
# change relative links to absolute
# Applies to every remaining <link> except icons, whose href is handled by
# the favicon step of this script.
links = soup.find_all('link')
for item in links:
    href = item.attrs.get('href')
    # rel is looked up with a default so a <link> without rel is still
    # processed; a <link> without href has nothing to rewrite.
    if not href or 'icon' in item.attrs.get('rel', []):
        continue
    item.attrs['href'] = absolute_url(href, domain_thing)
|
|
|
|
|
2025-01-04 20:37:35 +00:00
|
|
|
# Save the rewritten page to the path given as the second argument.
# UTF-8 is requested explicitly so the result does not depend on the
# platform's default text encoding.
with open(sys.argv[2], 'wt', encoding='utf-8') as f:
    f.write(str(soup))
|