import re
import sys
from urllib.parse import urljoin

from bs4 import BeautifulSoup
import requests
from pybase64 import b64encode_as_string

# Matches a CSS ``url(...)`` token. Group 1 is the raw argument, which may be
# wrapped in single or double quotes and padded with whitespace.
_CSS_URL_RE = re.compile(r'url\(([^)]*)\)')


def absolute_url(url, base_domain):
    '''
    Makes any URL into an absolute URL (i.e. not a relative link)

    url: the reference to resolve. It may already be absolute, be
        protocol-relative (``//host/path``), root-relative (``/path``) or
        document-relative (``path``, ``../path``).
    base_domain: the URL to resolve against. A bare origin such as
        ``https://example.com`` works, and so does the full URL of the
        document that contains the reference.

    Returns the resolved URL as a string. References that carry their own
    non-hierarchical scheme (``data:``, ``mailto:``, ...) come back unchanged
    apart from surrounding whitespace being stripped.
    '''
    # urljoin implements the standard reference-resolution rules, so the
    # special cases (scheme-relative, "..", existing scheme) need no manual
    # string checks here.
    return urljoin(base_domain, url.strip())


def _is_embeddable(url):
    '''Returns True when url is an http(s) URL, i.e. something to download.'''
    return url.lower().startswith(('http://', 'https://'))


def _fetch(url, default_mime='application/octet-stream'):
    '''
    Downloads url with a single GET request.

    Returns a ``(body_bytes, mime_type)`` tuple. mime_type is taken from the
    response's Content-Type header, or default_mime when the header is
    missing or empty.
    '''
    response = requests.get(url)
    content_type = response.headers.get('Content-Type') or default_mime
    # Drop whitespace and double quotes so the value can sit inside an HTML
    # attribute or a double-quoted CSS url() without breaking it.
    mime_type = ''.join(content_type.split()).replace('"', '')
    return response.content, mime_type


def _data_uri(mime_type, payload):
    '''Builds a base64 ``data:`` URI for the bytes in payload.'''
    return f'data:{mime_type};base64,{b64encode_as_string(payload)}'


def _embed_attribute(tag, attribute, base_url):
    '''
    Replaces the URL stored in ``tag.attrs[attribute]`` with a data URI.

    Tags that lack the attribute, have it empty, or point at something that
    is not an http(s) resource (for example an existing ``data:`` URI) are
    left untouched.
    '''
    reference = tag.attrs.get(attribute)
    if not reference:
        return
    target = absolute_url(reference, base_url)
    if not _is_embeddable(target):
        return
    payload, mime_type = _fetch(target)
    tag.attrs[attribute] = _data_uri(mime_type, payload)


def _inline_css_urls(css_text, css_url):
    '''
    Replaces each downloadable ``url(...)`` in css_text with a data URI.

    Relative references are resolved against css_url, the location of the
    stylesheet itself. Empty arguments, ``#fragment`` references and
    non-http(s) targets are kept exactly as they were written.
    '''
    def embed(match):
        reference = match.group(1).strip().strip('\'"').strip()
        if not reference or reference.startswith('#'):
            return match.group(0)
        target = absolute_url(reference, css_url)
        if not _is_embeddable(target):
            return match.group(0)
        payload, mime_type = _fetch(target)
        return f'url("{_data_uri(mime_type, payload)}")'

    # Substituting match by match rewrites only the token that was parsed,
    # so two URLs that share a substring cannot corrupt each other.
    return _CSS_URL_RE.sub(embed, css_text)


# Fail before any download happens if the output path is missing.
if len(sys.argv) < 3:
    sys.exit(f'usage: {sys.argv[0]} URL OUTPUT_FILE')

page_response = requests.get(sys.argv[1])
# Resolve relative references against the URL the page was actually served
# from (after redirects), not just the site root.
# NOTE(review): a <base href> element in the page is not taken into account.
page_url = page_response.url

# Pass the raw bytes so the parser can work out the page's encoding instead
# of assuming UTF-8.
soup = BeautifulSoup(page_response.content, 'html5lib')

# hardcode favicon
for favicon in soup.find_all('link', rel='icon'):
    _embed_attribute(favicon, 'href', page_url)

# hardcode images
for image in soup.find_all('img'):
    _embed_attribute(image, 'src', page_url)

# hardcode css
# note: the stylesheet stays a <link>; only its href becomes a data URI
for sheet in soup.find_all('link', rel='stylesheet'):
    sheet_href = sheet.attrs.get('href')
    if not sheet_href:
        continue
    css_url = absolute_url(sheet_href, page_url)
    if not _is_embeddable(css_url):
        continue
    css_bytes, css_mime_type = _fetch(css_url, default_mime='text/css')
    style_data = css_bytes.decode('utf-8', errors='replace')

    # hardcode fonts (and anything else referenced through url(...))
    # each asset ends up base64-encoded inside CSS that is itself
    # base64-encoded inside the HTML
    style_data = _inline_css_urls(style_data, css_url)

    sheet.attrs['href'] = _data_uri(css_mime_type, style_data.encode('utf-8'))

# change relative links to absolute
for link in soup.find_all('link'):
    rel_values = link.attrs.get('rel', [])
    link_href = link.attrs.get('href')
    if link_href and 'stylesheet' not in rel_values and 'icon' not in rel_values:
        link.attrs['href'] = absolute_url(link_href, page_url)

# Explicit UTF-8 so the output does not depend on the platform's default
# text encoding.
with open(sys.argv[2], 'wt', encoding='utf-8') as f:
    f.write(str(soup))