import logging
import mimetypes
from typing import Any
from typing import Dict
from typing import Set
from urllib.parse import urlparse

import opengraph
import requests
from bs4 import BeautifulSoup
from little_boxes import activitypub as ap
from little_boxes.errors import NotAnActivityError
from little_boxes.urlutils import check_url
from little_boxes.urlutils import is_url_valid

from .lookup import lookup

logger = logging.getLogger(__name__)


def links_from_note(note: Dict[str, Any]) -> Set[str]:
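    """Return the set of external HTTP(S) links found in the note's HTML content."""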
    note_host = urlparse(ap._get_id(note["id"]) or "").netloc

    links = set()
    if "content" in note:
        soup = BeautifulSoup(note["content"], "html5lib")
        for link in soup.find_all("a"):
            h = link.get("href")
            ph = urlparse(h)
            if (
                ph.scheme in {"http", "https"}
                and ph.netloc != note_host
                and is_url_valid(h)
            ):
                links.add(h)

    # FIXME(tsileo): support summary and name fields

    return links


def fetch_og_metadata(user_agent, links):
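    """Fetch the OpenGraph metadata of the given links and return it as a list of dicts."""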
    res = []
    for l in links:
        # Try to skip media early
        mimetype, _ = mimetypes.guess_type(l)
        if mimetype and mimetype.split("/")[0] in ["image", "video", "audio"]:
            logger.info(f"skipping media link {l}")
            continue

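        # Validate the URL before making any request to it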
        check_url(l)

        # Remove any AP objects
        try:
            lookup(l)
            continue
        except NotAnActivityError:
            pass
        except Exception:
            logger.exception(f"skipping {l} because of issues during AP lookup")
            continue

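        # Start with a cheap HEAD request so non-HTML responses can be skipped
        # without downloading the whole page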
        try:
            h = requests.head(
                l, headers={"User-Agent": user_agent}, timeout=3, allow_redirects=True
            )
            h.raise_for_status()
        except requests.HTTPError as http_err:
            logger.debug(
                f"failed to HEAD {l}, got a {http_err.response.status_code}: {http_err.response.text}"
            )
            continue
        except requests.RequestException as err:
            logger.debug(f"failed to HEAD {l}: {err!r}")
            continue

        # Default to "" in case the server did not send a Content-Type header
        if not h.headers.get("content-type", "").startswith("text/html"):
            logger.debug(f"skipping {l} for bad content type")
            continue

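        # Fetch the full page to extract its OpenGraph metadata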
        try:
            r = requests.get(
                l, headers={"User-Agent": user_agent}, timeout=5, allow_redirects=True
            )
            r.raise_for_status()
        except requests.HTTPError as http_err:
            logger.debug(
                f"failed to GET {l}, got a {http_err.response.status_code}: {http_err.response.text}"
            )
            continue
        except requests.RequestException as err:
            logger.debug(f"failed to GET {l}: {err!r}")
            continue

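        # Force UTF-8 decoding and parse the OpenGraph tags out of the HTML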
r.encoding = "UTF-8"
|
2018-08-05 07:24:52 -05:00
|
|
|
html = r.text
|
|
|
|
try:
|
|
|
|
data = dict(opengraph.OpenGraph(html=html))
|
|
|
|
except Exception:
|
2018-08-05 07:45:44 -05:00
|
|
|
logger.exception(f"failed to parse {l}")
|
2018-08-05 07:24:52 -05:00
|
|
|
continue
|
2019-07-05 03:42:04 -05:00
|
|
|
|
|
|
|
        # Keep track of the fetched URL as some crappy websites use relative URLs everywhere
        data["_input_url"] = l
        u = urlparse(l)

        # If it's a relative URL, build the absolute version
        if "image" in data and data["image"].startswith("/"):
            data["image"] = u._replace(
                path=data["image"], params="", query="", fragment=""
            ).geturl()

        if "url" in data and data["url"].startswith("/"):
            data["url"] = u._replace(
                path=data["url"], params="", query="", fragment=""
            ).geturl()

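        # Only keep the metadata if the page exposed an og:url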
        if data.get("url"):
            res.append(data)

    return res
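
# Usage sketch (hypothetical caller; the note payload and user agent below are
# illustrative only and not part of this module):
#
#     links = links_from_note(note)  # `note` is an ActivityPub Note as a dict
#     metadata = fetch_og_metadata("my-instance/1.0", links)
#     # -> a list of OpenGraph dicts, each with an extra "_input_url" key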