one-webpage-under-god/owug.py

87 lines
2.6 KiB
Python
Raw Normal View History

2025-01-04 20:37:35 +00:00
import re
import sys
from urllib.parse import urljoin, urlsplit

from bs4 import BeautifulSoup
import requests
from pybase64 import b64encode_as_string
2025-01-04 20:37:35 +00:00
def absolute_url(url, base_domain):
    '''
    Makes any URL into an absolute URL (i.e. not a relative link)

    url: the link as written in the page. May already be absolute, may be
        protocol-relative (//host/path), root-relative (/path) or a bare
        relative path.
    base_domain: scheme and host to resolve against, without a path
        (e.g. https://google.com). A trailing slash is tolerated.

    Returns the absolute URL as a string. URLs that already carry a scheme
    (http:, https:, data:, mailto:, ...) are returned unchanged apart from
    having surrounding whitespace stripped.
    '''
    url = url.strip()

    # Anything that names its own scheme is already absolute; prefixing the
    # base domain to e.g. a data: URI would corrupt it.
    if urlsplit(url).scheme:
        return url

    # Protocol-relative link: it names its own host and only borrows the
    # scheme from the page it appears on.
    if url.startswith('//'):
        scheme = urlsplit(base_domain).scheme or 'https'
        return f'{scheme}:{url}'

    # Join with exactly one slash regardless of how either side was written.
    return base_domain.rstrip('/') + '/' + url.lstrip('/')
# the link without the path and stuff
# e.g. https://google.com/aaaaaa -> https://google.com
# Parsed with urlsplit rather than by counting slashes, so a '/' that only
# appears in the query string or fragment is not mistaken for the start of
# the path.
page_url_parts = urlsplit(sys.argv[1])
domain_thing = f'{page_url_parts.scheme}://{page_url_parts.netloc}'
# Download the page named by the first command-line argument and parse it.
# The raw bytes are handed to BeautifulSoup so it can work out the page's
# character encoding itself; decoding here would assume UTF-8 and raise
# UnicodeDecodeError on pages served in any other encoding.
html = requests.get(sys.argv[1]).content
soup = BeautifulSoup(html, 'html5lib')
# hardcode favicon
# Every <link rel="icon"> has its href replaced by a base64 data: URI that
# carries the icon's bytes.
favicons = soup.find_all('link', rel='icon')
for favicon in favicons:
    href = favicon.attrs.get('href')
    # Nothing to embed when the tag has no href or is already inlined.
    if not href or href.startswith('data:'):
        continue

    url = absolute_url(href, domain_thing)

    # A single GET supplies both the body and its Content-Type, so no
    # separate HEAD request is needed. Fall back to a generic type when the
    # server omits the header instead of failing with KeyError.
    response = requests.get(url)
    mime_type = response.headers.get('Content-Type', 'application/octet-stream')
    as_base64 = b64encode_as_string(response.content)
    new_url = f'data:{mime_type};base64,{as_base64}'

    favicon.attrs['href'] = new_url
2025-01-04 20:37:35 +00:00
# hardcode images
# Every <img> has its src replaced by a base64 data: URI that carries the
# image's bytes.
imgs = soup.find_all('img')
for item in imgs:
    src = item.attrs.get('src')
    # Skip images with no src attribute and ones that are already inlined.
    if not src or src.startswith('data:'):
        continue

    url = absolute_url(src, domain_thing)

    # A single GET supplies both the body and its Content-Type, so no
    # separate HEAD request is needed. Fall back to a generic type when the
    # server omits the header instead of failing with KeyError.
    response = requests.get(url)
    mime_type = response.headers.get('Content-Type', 'application/octet-stream')
    as_base64 = b64encode_as_string(response.content)
    new_url = f'data:{mime_type};base64,{as_base64}'

    item.attrs['src'] = new_url
# hardcode css
# note: not sure it matters, but this puts the CSS in <head>

# Matches url(...) with an optional pair of matching quotes around the target.
_CSS_URL_PATTERN = re.compile(
    r'''url\(\s*(?P<quote>['"]?)(?P<target>.*?)(?P=quote)\s*\)''',
    re.IGNORECASE,
)


def _embed_css_urls(css_text, css_url):
    '''
    Returns css_text with the resources named in url(...) embedded as data URIs

    css_text: the stylesheet source.
    css_url: absolute URL the stylesheet was downloaded from; relative
        targets are resolved against it.

    Targets that are empty, already data: URIs, or fragment-only references
    (starting with '#') are left exactly as written.
    '''
    def _replace(match):
        target = match.group('target').strip()
        if not target or target.startswith(('data:', '#')):
            return match.group(0)

        # Relative URLs inside a stylesheet are relative to the stylesheet
        # itself, not to the page or the site root.
        response = requests.get(urljoin(css_url, target))
        mime_type = response.headers.get('Content-Type', 'application/octet-stream')
        as_base64 = b64encode_as_string(response.content)
        return f'url("data:{mime_type};base64,{as_base64}")'

    # Substituting match by match rewrites each occurrence in place, so one
    # URL that happens to be a substring of another is never touched twice.
    return _CSS_URL_PATTERN.sub(_replace, css_text)


# Each linked stylesheet is downloaded, has its url(...) resources (fonts,
# background images, ...) inlined, and is then swapped for a <style> tag.
for item in soup.find_all('link', rel='stylesheet'):
    href = item.attrs.get('href')
    if not href:
        continue

    url = absolute_url(href, domain_thing)
    style_data = requests.get(url).content.decode()

    # hardcode fonts
    style_data = _embed_css_urls(style_data, url)

    new_tag = soup.new_tag('style')
    new_tag.string = style_data
    soup.head.append(new_tag)
    item.decompose()
# change relative links to absolute
# Applies to every remaining <link> whose rel does not contain 'icon'.
links = soup.find_all('link')
for item in links:
    href = item.attrs.get('href')
    # A <link> may lack rel or href entirely; .get() avoids a KeyError there.
    if href and 'icon' not in item.attrs.get('rel', []):
        item.attrs['href'] = absolute_url(href, domain_thing)
2025-01-04 20:37:35 +00:00
# Write the rewritten document to the path given as the second command-line
# argument. The encoding is stated explicitly so the result does not depend
# on the platform's locale, whose default encoding may be unable to
# represent every character in the page.
with open(sys.argv[2], 'wt', encoding='utf-8') as f:
    f.write(str(soup))