# microblog.pub/utils/opengraph.py

import logging
import mimetypes
from typing import Any
from typing import Dict
from typing import Set
from urllib.parse import urlparse

import opengraph
import requests
from bs4 import BeautifulSoup
from little_boxes import activitypub as ap
from little_boxes.errors import NotAnActivityError
from little_boxes.urlutils import check_url
from little_boxes.urlutils import is_url_valid

from .lookup import lookup

logger = logging.getLogger(__name__)


def links_from_note(note: Dict[str, Any]) -> Set[str]:
    """Collect the external HTTP(S) links found in a note's HTML content.

    An anchor's ``href`` is kept only when all of the following hold:

    * its scheme is ``http`` or ``https``;
    * its host differs from the host of the note's own ``id``;
    * ``is_url_valid`` accepts it.

    Args:
        note: the note as a dict; ``note["id"]`` is always read, and
            ``note["content"]`` (HTML) is parsed when present.

    Returns:
        The set of matching ``href`` values (empty when the note has no
        ``content`` key).
    """
    own_host = urlparse(ap._get_id(note["id"]) or "").netloc

    found = set()
    if "content" not in note:
        return found

    document = BeautifulSoup(note["content"], "html5lib")
    for anchor in document.find_all("a"):
        href = anchor.get("href")
        parsed = urlparse(href)
        if parsed.scheme not in {"http", "https"}:
            continue
        # Links pointing back at the note's own host are not "external"
        if parsed.netloc == own_host:
            continue
        if not is_url_valid(href):
            continue
        found.add(href)

    # FIXME(tsileo): support summary and name fields

    return found


def fetch_og_metadata(user_agent, links):
    """Fetch Open Graph metadata for each of the given links.

    Each link is processed independently and silently skipped (with a log
    line) as soon as one of these checks fails:

    1. its extension maps to an ``image``/``video``/``audio`` mimetype;
    2. ``lookup`` succeeds on it (it is an ActivityPub object), or fails with
       anything other than ``NotAnActivityError``;
    3. the HEAD request fails, or the response is not ``text/html``;
    4. the GET request fails, or ``opengraph.OpenGraph`` cannot parse the body;
    5. the parsed metadata has no (non-empty) ``url`` entry.

    Args:
        user_agent: value sent as the ``User-Agent`` header of every request.
        links: iterable of URL strings to inspect.

    Returns:
        A list of dicts, one per link that passed every check. Each dict holds
        the parsed Open Graph properties plus an ``_input_url`` key containing
        the link that was fetched. ``image`` and ``url`` values that were
        host-relative or protocol-relative are rewritten as absolute URLs.

    Raises:
        Any exception raised by ``check_url``; that call is deliberately left
        outside the ``try`` blocks, so it aborts the whole batch.
    """
    res = []
    for link in links:
        # Try to skip media early
        mimetype, _ = mimetypes.guess_type(link)
        if mimetype and mimetype.split("/")[0] in ["image", "video", "audio"]:
            logger.info(f"skipping media link {link}")
            continue

        check_url(link)

        # Remove any AP objects
        try:
            lookup(link)
            continue
        except NotAnActivityError:
            # Not an ActivityPub object: a candidate for OG metadata
            pass
        except Exception:
            logger.exception(f"skipping {link} because of issues during AP lookup")
            continue

        # Cheap HEAD first, so the body is only downloaded for HTML pages
        try:
            h = requests.head(
                link,
                headers={"User-Agent": user_agent},
                timeout=3,
                allow_redirects=True,
            )
            h.raise_for_status()
        except requests.HTTPError as http_err:
            logger.debug(
                f"failed to HEAD {link}, got a {http_err.response.status_code}: {http_err.response.text}"
            )
            continue
        except requests.RequestException as err:
            logger.debug(f"failed to HEAD {link}: {err!r}")
            continue

        # The header may be absent (``get`` returns None); treat that as
        # "not HTML" rather than crashing. Media types are case-insensitive.
        content_type = (h.headers.get("content-type") or "").lower()
        if not content_type.startswith("text/html"):
            logger.debug(f"skipping {link} for bad content type")
            continue

        try:
            r = requests.get(
                link,
                headers={"User-Agent": user_agent},
                timeout=5,
                allow_redirects=True,
            )
            r.raise_for_status()
        except requests.HTTPError as http_err:
            logger.debug(
                f"failed to GET {link}, got a {http_err.response.status_code}: {http_err.response.text}"
            )
            continue
        except requests.RequestException as err:
            logger.debug(f"failed to GET {link}: {err!r}")
            continue

        r.encoding = "UTF-8"
        html = r.text
        try:
            data = dict(opengraph.OpenGraph(html=html))
        except Exception:
            logger.exception(f"failed to parse {link}")
            continue

        # Keep track of the fetched URL as some crappy websites use relative URLs everywhere
        data["_input_url"] = link
        base = urlparse(link)

        # If it's a relative URL, build the absolute version
        for key in ("image", "url"):
            if key not in data:
                continue
            value = data[key]
            if value.startswith("//"):
                # Protocol-relative: the host is already there, only the
                # scheme is missing (must be tested before the "/" case).
                data[key] = f"{base.scheme}:{value}"
            elif value.startswith("/"):
                data[key] = base._replace(
                    path=value, params="", query="", fragment=""
                ).geturl()

        if data.get("url"):
            res.append(data)

    return res
More opengraph tweaks 2018-08-05 07:24:52 -05:00			`import logging`
More OG meta tweaks 2019-08-08 16:48:30 -05:00			`import mimetypes`
Cleanup and add a unique request ID 2019-08-05 15:40:24 -05:00			`from typing import Any`
			`from typing import Dict`
			`from typing import Set`
Tweak OG metadata fetching 2019-08-04 16:36:38 -05:00			`from urllib.parse import urlparse`
Run isort 2019-04-22 02:58:11 -05:00
Initial import 2018-05-18 13:41:41 -05:00			`import opengraph`
			`import requests`
			`from bs4 import BeautifulSoup`
Tweak the lookup and the OG metedata tasks Now we don't fetch OG metadata for AP profile 2018-07-22 04:44:42 -05:00			`from little_boxes import activitypub as ap`
			`from little_boxes.errors import NotAnActivityError`
More cleanup 2018-06-17 12:21:59 -05:00			`from little_boxes.urlutils import check_url`
			`from little_boxes.urlutils import is_url_valid`
Drop more OStatus stuff 2018-07-22 05:04:18 -05:00
Tweak the lookup and the OG metedata tasks Now we don't fetch OG metadata for AP profile 2018-07-22 04:44:42 -05:00			`from .lookup import lookup`
Initial import 2018-05-18 13:41:41 -05:00
More opengraph tweaks 2018-08-05 07:24:52 -05:00			`logger = logging.getLogger(__name__)`

Initial import 2018-05-18 13:41:41 -05:00
Cleanup and add a unique request ID 2019-08-05 15:40:24 -05:00			`def links_from_note(note: Dict[str, Any]) -> Set[str]:`
Tweak OG metadata fetching 2019-08-04 16:36:38 -05:00			`note_host = urlparse(ap._get_id(note["id"]) or "").netloc`
Initial import 2018-05-18 13:41:41 -05:00
			`links = set()`
Question/poll support 2019-04-14 12:17:54 -05:00			`if "content" in note:`
Big cleanup part 2 (#58) * Cleanup little-boxes stuff * Force html5lib for parsing OG data * Bugfixes 2019-08-04 09:34:30 -05:00			`soup = BeautifulSoup(note["content"], "html5lib")`
Question/poll support 2019-04-14 12:17:54 -05:00			`for link in soup.find_all("a"):`
			`h = link.get("href")`
Tweak OG metadata fetching 2019-08-04 16:36:38 -05:00			`ph = urlparse(h)`
			`if (`
			`ph.scheme in {"http", "https"}`
			`and ph.netloc != note_host`
			`and is_url_valid(h)`
			`):`
Question/poll support 2019-04-14 12:17:54 -05:00			`links.add(h)`

			`# FIXME(tsileo): support summary and name fields`
Initial import 2018-05-18 13:41:41 -05:00
			`return links`


Add Open Graph metadata support 2018-07-21 16:16:40 -05:00			`def fetch_og_metadata(user_agent, links):`
More opengraph tweaks 2018-08-05 07:24:52 -05:00			`res = []`
Initial import 2018-05-18 13:41:41 -05:00			`for l in links:`
More OG meta tweaks 2019-08-08 16:48:30 -05:00			`# Try to skip media early`
			`mimetype, _ = mimetypes.guess_type(l)`
			`if mimetype and mimetype.split("/")[0] in ["image", "video", "audio"]:`
			`logger.info(f"skipping media link {l}")`
			`continue`

More url checking 2018-05-25 17:03:30 -05:00			`check_url(l)`
Tweak the lookup and the OG metedata tasks Now we don't fetch OG metadata for AP profile 2018-07-22 04:44:42 -05:00
Improve Open Graph metadata parsing 2019-08-06 13:16:06 -05:00			`# Remove any AP objects`
Tweak the lookup and the OG metedata tasks Now we don't fetch OG metadata for AP profile 2018-07-22 04:44:42 -05:00			`try:`
Improve Open Graph metadata parsing 2019-08-06 13:16:06 -05:00			`lookup(l)`
			`continue`
Tweak the lookup and the OG metedata tasks Now we don't fetch OG metadata for AP profile 2018-07-22 04:44:42 -05:00			`except NotAnActivityError:`
			`pass`
Improve Open Graph metadata parsing 2019-08-06 13:16:06 -05:00			`except Exception:`
			`logger.exception(f"skipping {l} because of issues during AP lookup")`
			`continue`
Tweak the lookup and the OG metedata tasks Now we don't fetch OG metadata for AP profile 2018-07-22 04:44:42 -05:00
Improve Open Graph metadata parsing 2019-08-06 13:16:06 -05:00			`try:`
OG metadata tweaks 2019-08-08 16:19:04 -05:00			`h = requests.head(`
			`l, headers={"User-Agent": user_agent}, timeout=3, allow_redirects=True`
			`)`
Improve Open Graph metadata parsing 2019-08-06 13:16:06 -05:00			`h.raise_for_status()`
			`except requests.HTTPError as http_err:`
Tweak OG error handling 2019-08-06 14:51:58 -05:00			`logger.debug(`
			`f"failed to HEAD {l}, got a {http_err.response.status_code}: {http_err.response.text}"`
			`)`
Improve Open Graph metadata parsing 2019-08-06 13:16:06 -05:00			`continue`
Tweak OG error handling 2019-08-06 14:51:58 -05:00			`except requests.RequestException as err:`
			`logger.debug(f"failed to HEAD {l}: {err!r}")`
More OG tweaks 2019-08-06 14:47:02 -05:00			`continue`
Improve Open Graph metadata parsing 2019-08-06 13:16:06 -05:00
			`if not h.headers.get("content-type").startswith("text/html"):`
			`logger.debug(f"skipping {l} for bad content type")`
			`continue`

			`try:`
OG metadata tweaks 2019-08-08 16:19:04 -05:00			`r = requests.get(`
			`l, headers={"User-Agent": user_agent}, timeout=5, allow_redirects=True`
			`)`
Improve Open Graph metadata parsing 2019-08-06 13:16:06 -05:00			`r.raise_for_status()`
			`except requests.HTTPError as http_err:`
Tweak OG error handling 2019-08-06 14:51:58 -05:00			`logger.debug(`
			`f"failed to GET {l}, got a {http_err.response.status_code}: {http_err.response.text}"`
			`)`
More Open Graph tweaks 2018-08-05 07:45:44 -05:00			`continue`
Tweak OG error handling 2019-08-06 14:51:58 -05:00			`except requests.RequestException as err:`
			`logger.debug(f"failed to GET {l}: {err!r}")`
More OG tweaks 2019-08-06 14:47:02 -05:00			`continue`
Tweak OpenGraph 2018-08-05 06:55:48 -05:00
Reformat the files with black 2019-04-13 03:00:56 -05:00			`r.encoding = "UTF-8"`
More opengraph tweaks 2018-08-05 07:24:52 -05:00			`html = r.text`
			`try:`
			`data = dict(opengraph.OpenGraph(html=html))`
			`except Exception:`
More Open Graph tweaks 2018-08-05 07:45:44 -05:00			`logger.exception(f"failed to parse {l}")`
More opengraph tweaks 2018-08-05 07:24:52 -05:00			`continue`
Improve OpenGrah support 2019-07-05 03:42:04 -05:00
			`# Keep track of the fetched URL as some crappy websites use relative URLs everywhere`
			`data["_input_url"] = l`
Tweak OG metadata fetching 2019-08-04 16:36:38 -05:00			`u = urlparse(l)`
Improve OpenGrah support 2019-07-05 03:42:04 -05:00
			`# If it's a relative URL, build the absolute version`
			`if "image" in data and data["image"].startswith("/"):`
			`data["image"] = u._replace(`
			`path=data["image"], params="", query="", fragment=""`
			`).geturl()`

			`if "url" in data and data["url"].startswith("/"):`
			`data["url"] = u._replace(`
			`path=data["url"], params="", query="", fragment=""`
			`).geturl()`

Tweak OpenGraph 2018-08-05 06:55:48 -05:00			`if data.get("url"):`
			`res.append(data)`

			`return res`