microblog.pub/utils/opengraph.py

61 lines
1.5 KiB
Python
Raw Normal View History

2018-08-05 07:24:52 -05:00
import logging
2018-05-18 13:41:41 -05:00
import opengraph
import requests
from bs4 import BeautifulSoup
from little_boxes import activitypub as ap
from little_boxes.errors import NotAnActivityError
2018-06-17 12:21:59 -05:00
from little_boxes.urlutils import check_url
from little_boxes.urlutils import is_url_valid
2018-07-22 05:04:18 -05:00
from .lookup import lookup
2018-05-18 13:41:41 -05:00
2018-08-05 07:24:52 -05:00
logger = logging.getLogger(__name__)
2018-05-18 13:41:41 -05:00
def links_from_note(note):
    """Extract the set of external HTTP(S) links from a note's HTML content.

    URLs that already appear as tag hrefs (mentions/hashtags) are excluded,
    as are invalid URLs (per ``is_url_valid``).

    Args:
        note: an ActivityPub note object (dict) with an HTML "content"
            field and an optional "tag" list.

    Returns:
        A set of link URLs (str).
    """
    # Collect hrefs already represented as tags (mentions, hashtags) so the
    # same URLs are not reported twice.
    tags_href = set()
    for t in note.get("tag", []):
        h = t.get("href")
        if h:
            tags_href.add(h)

    links = set()
    # FIX: name the parser explicitly — omitting it triggers bs4's
    # GuessedAtParserWarning and makes the parse depend on what happens to
    # be installed; "html.parser" is always available (stdlib-backed).
    soup = BeautifulSoup(note["content"], "html.parser")
    for link in soup.find_all("a"):
        h = link.get("href")
        # FIX: guard against an <a> with no href attribute — link.get()
        # returns None and None.startswith(...) would raise AttributeError.
        if h and h.startswith("http") and h not in tags_href and is_url_valid(h):
            links.add(h)

    return links
2018-07-21 16:16:40 -05:00
def fetch_og_metadata(user_agent, links):
    """Fetch OpenGraph metadata for each of the given links.

    Links that resolve to ActivityPub actors are skipped (their profile
    pages are not useful as link previews), as are non-HTML responses and
    pages whose OpenGraph data cannot be parsed.

    Args:
        user_agent: value for the User-Agent header on outgoing requests.
        links: iterable of URLs to inspect.

    Returns:
        A list of OpenGraph data dicts; only entries that include a "url"
        key are kept.

    Raises:
        requests.HTTPError: if fetching a link returns an error status.
        little_boxes.urlutils.InvalidURLError: if a link fails check_url
            (presumably blocks private/local addresses — confirm upstream).
    """
    res = []
    for url in links:
        # SSRF guard: validate the URL before making any request.
        check_url(url)

        # Remove any AP actor from the list
        try:
            p = lookup(url)
            if p.has_type(ap.ACTOR_TYPES):
                continue
        except NotAnActivityError:
            pass

        r = requests.get(url, headers={"User-Agent": user_agent}, timeout=15)
        r.raise_for_status()

        # FIX: default to "" — a response may lack a Content-Type header,
        # in which case .get() returns None and .startswith() would raise
        # AttributeError.
        if not r.headers.get("content-type", "").startswith("text/html"):
            logger.debug(f"skipping {url}")
            continue

        # Best-effort parse: a malformed page should not abort the batch.
        try:
            data = dict(opengraph.OpenGraph(html=r.text))
        except Exception:
            logger.exception(f"failed to parse {url}")
            continue

        if data.get("url"):
            res.append(data)

    return res