one-webpage-under-god/owug.py

import base64
import sys
from urllib.parse import urlsplit

import requests
from bs4 import BeautifulSoup


def absolute_url(url, base_domain):
    '''
    Makes any URL into an absolute URL (i.e. not a relative link)

    url: the link as written in the page. It may already be absolute
        (http:// or https://), protocol-relative (//host/path),
        root-relative (/path) or a bare relative path (path).
    base_domain: scheme and host to prepend, without a trailing slash,
        e.g. https://google.com

    Returns the absolute URL as a string. Root-relative and bare relative
    paths are both attached directly to base_domain.
    '''
    if url.startswith(('http://', 'https://')):
        return url

    if url.startswith('//'):
        # Protocol-relative link: it already names its own host, so only
        # the scheme is borrowed from base_domain. Prepending the whole
        # base would produce e.g. https://a.com//cdn.b.com/x.png.
        scheme, separator, _ = base_domain.partition('://')
        if not separator:
            # base_domain carries no scheme to borrow; fall back to https.
            scheme = 'https'
        return f'{scheme}:{url}'

    if not url.startswith('/'):
        url = '/' + url
    return base_domain + url


# Seconds to wait on any single HTTP request before giving up, so one
# unresponsive server cannot hang the whole run.
_REQUEST_TIMEOUT = 30


def _fetch_as_data_uri(url):
    '''
    Downloads `url` and returns its content as a base64 `data:` URI.

    Returns None (after printing a warning to stderr) when the request
    fails or the server answers with an error status, so the caller can
    leave the original link in place instead of embedding an error page.
    '''
    try:
        response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print(f'warning: could not fetch {url}: {err}', file=sys.stderr)
        return None
    if not response.ok:
        print(f'warning: could not fetch {url}: HTTP {response.status_code}',
              file=sys.stderr)
        return None

    # The GET response already carries the Content-Type, so no separate
    # HEAD request is needed. Fall back to a generic binary type when the
    # server does not send one.
    mime_type = response.headers.get('Content-Type', 'application/octet-stream')
    as_base64 = base64.b64encode(response.content).decode()
    return f'data:{mime_type};base64,{as_base64}'


def _inline_attr(tag, attr, base_domain):
    '''
    Replaces the URL stored in `tag[attr]` with a `data:` URI holding the
    resource it points to.

    Tags without the attribute, with an empty value, or whose value is
    already a `data:` URI are left untouched, as are tags whose resource
    could not be downloaded.
    '''
    link = (tag.attrs.get(attr) or '').strip()
    if not link or link.lower().startswith('data:'):
        return
    data_uri = _fetch_as_data_uri(absolute_url(link, base_domain))
    if data_uri is not None:
        tag.attrs[attr] = data_uri


if len(sys.argv) < 3:
    sys.exit(f'usage: {sys.argv[0]} <url> <output-file>')

page_url = sys.argv[1]
output_path = sys.argv[2]

# the link without the path and stuff
# e.g. https://google.com/aaaaaa -> https://google.com
page_parts = urlsplit(page_url)
if page_parts.scheme and page_parts.netloc:
    domain_thing = f'{page_parts.scheme}://{page_parts.netloc}'
else:
    domain_thing = page_url

# NOTE: relative links are resolved against the site root (domain_thing),
# not against the directory of the page itself.

page_bytes = requests.get(page_url, timeout=_REQUEST_TIMEOUT).content
try:
    html = page_bytes.decode()
except UnicodeDecodeError:
    # Not UTF-8: hand the raw bytes to the parser and let it work out the
    # encoding instead of crashing here.
    html = page_bytes
soup = BeautifulSoup(html, 'html5lib')

# hardcode favicon
for favicon in soup.find_all('link', rel='icon'):
    if 'icon' in favicon.attrs['rel']:
        _inline_attr(favicon, 'href', domain_thing)

# hardcode images
for item in soup.find_all('img'):
    _inline_attr(item, 'src', domain_thing)

# Write as UTF-8 explicitly; the platform default encoding may not be able
# to represent every character in the page.
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(str(soup))
initial commit 2025-01-04 20:37:35 +00:00			`import sys`
			`from bs4 import BeautifulSoup`
			`import requests`
			`import base64`


			`def absolute_url(url, base_domain):`
			`'''`
			`Makes any URL into an absolute URL (i.e. not a relative link)`
			`'''`
			`if not (url.startswith('http://') or url.startswith('https://')):`
			`if not url.startswith('/'):`
			`url = '/' + url`
			`url = base_domain + url`
			`return url`


			`# the link without the path and stuff`
			`# e.g. https://google.com/aaaaaa -> https://google.com`
			`if sys.argv[1].count('/') > 2:`
			`index = sys.argv[1][8:].find('/') + 8`
			`domain_thing = sys.argv[1][:index]`
			`else:`
			`domain_thing = sys.argv[1]`

			`html = requests.get(sys.argv[1]).content.decode()`
			`soup = BeautifulSoup(html, 'html5lib')`

			`# hardcode favicon`
			`favicons = soup.find_all('link', rel='icon')`
			`for favicon in favicons:`
			`if favicon.attrs['rel'].count('icon') > 0:`
			`url = absolute_url(favicon.attrs['href'], domain_thing)`

			`mime_type = requests.head(url).headers['Content-Type']`
			`as_base64 = base64.b64encode(requests.get(url).content).decode()`
			`new_url = f'data:{mime_type};base64,{as_base64}'`

			`favicon.attrs['href'] = new_url`

			`# hardcode images`
			`imgs = soup.find_all('img')`
			`for item in imgs:`
			`url = absolute_url(item.attrs['src'], domain_thing)`
			`mime_type = requests.head(url).headers['Content-Type']`
			`as_base64 = base64.b64encode(requests.get(url).content).decode()`
			`new_url = f'data:{mime_type};base64,{as_base64}'`
			`item.attrs['src'] = new_url`

			`with open(sys.argv[2], 'wt') as f:`
			`f.write(str(soup))`