microblog.pub/utils/opengraph.py

61 lines
1.5 KiB
Python
Raw Normal View History

2018-08-05 07:24:52 -05:00
import logging
2018-05-18 13:41:41 -05:00
import opengraph
import requests
from bs4 import BeautifulSoup
from little_boxes import activitypub as ap
from little_boxes.errors import NotAnActivityError
2018-06-17 12:21:59 -05:00
from little_boxes.urlutils import check_url
from little_boxes.urlutils import is_url_valid
2018-07-22 05:04:18 -05:00
from .lookup import lookup
2018-05-18 13:41:41 -05:00
2018-08-05 07:24:52 -05:00
logger = logging.getLogger(__name__)
2018-05-18 13:41:41 -05:00
def links_from_note(note):
    """Extract the set of external HTTP(S) links from a note's HTML content.

    URLs that already appear as tag hrefs (mentions/hashtags) are excluded,
    as are invalid URLs (per ``is_url_valid``).

    Args:
        note: an ActivityPub note object (dict) with an HTML "content"
            field and an optional "tag" list.

    Returns:
        A set of link URLs (str).
    """
    # Collect hrefs already represented as tags (mentions, hashtags) so the
    # same URLs are not reported twice.
    tags_href = set()
    for t in note.get("tag", []):
        h = t.get("href")
        if h:
            tags_href.add(h)

    links = set()
    # FIX: name the parser explicitly — omitting it triggers bs4's
    # GuessedAtParserWarning and makes the parse depend on what happens to
    # be installed; "html.parser" is always available (stdlib-backed).
    soup = BeautifulSoup(note["content"], "html.parser")
    for link in soup.find_all("a"):
        h = link.get("href")
        # FIX: guard against an <a> with no href attribute — link.get()
        # returns None and None.startswith(...) would raise AttributeError.
        if h and h.startswith("http") and h not in tags_href and is_url_valid(h):
            links.add(h)

    return links
2018-07-21 16:16:40 -05:00
def fetch_og_metadata(user_agent, links):
    """Fetch OpenGraph metadata for each of the given links.

    Links that resolve to ActivityPub actors are skipped (their profile
    pages are not useful as link previews), as are non-HTML responses and
    pages whose OpenGraph data cannot be parsed.

    Args:
        user_agent: value for the User-Agent header on outgoing requests.
        links: iterable of URLs to inspect.

    Returns:
        A list of OpenGraph data dicts; only entries that include a "url"
        key are kept.

    Raises:
        requests.HTTPError: if fetching a link returns an error status.
        little_boxes.urlutils.InvalidURLError: if a link fails check_url
            (presumably blocks private/local addresses — confirm upstream).
    """
    res = []
    for url in links:
        # SSRF guard: validate the URL before making any request.
        check_url(url)

        # Remove any AP actor from the list
        try:
            p = lookup(url)
            if p.has_type(ap.ACTOR_TYPES):
                continue
        except NotAnActivityError:
            pass

        r = requests.get(url, headers={"User-Agent": user_agent}, timeout=15)
        r.raise_for_status()

        # FIX: default to "" — a response may lack a Content-Type header,
        # in which case .get() returns None and .startswith() would raise
        # AttributeError.
        if not r.headers.get("content-type", "").startswith("text/html"):
            logger.debug(f"skipping {url}")
            continue

        # Best-effort parse: a malformed page should not abort the batch.
        try:
            data = dict(opengraph.OpenGraph(html=r.text))
        except Exception:
            logger.exception(f"failed to parse {url}")
            continue

        if data.get("url"):
            res.append(data)

    return res