Improve Open Graph metadata parsing

2019-08-06 20:16:06 +02:00 · 2019-08-06 20:16:06 +02:00 · b0cb248a23
commit b0cb248a23
parent 5ea22edcb8
1 changed file with 22 additions and 8 deletions
--- a/utils/opengraph.py
+++ b/utils/opengraph.py
@@ -43,18 +43,32 @@ def fetch_og_metadata(user_agent, links):
    for l in links:
        check_url(l)
-        # Remove any AP actor from the list
+        # Remove any AP objects
        try:
-            p = lookup(l)
+            lookup(l)
-            if p.has_type(ap.ACTOR_TYPES):
+            continue
        except NotAnActivityError:
            pass
        except Exception:
            logger.exception(f"skipping {l} because of issues during AP lookup")
            continue
-        r = requests.get(l, headers={"User-Agent": user_agent}, timeout=15)
+        try:
+            h = requests.head(l, headers={"User-Agent": user_agent}, timeout=3)
+            h.raise_for_status()
+        except requests.HTTPError as http_err:
+            logger.debug(f"failed to HEAD {l}, got a {http_err.response.status_code}")
+            continue
+        if not h.headers.get("content-type").startswith("text/html"):
+            logger.debug(f"skipping {l} for bad content type")
+            continue
+        try:
+            r = requests.get(l, headers={"User-Agent": user_agent}, timeout=5)
+            r.raise_for_status()
-        if not r.headers.get("content-type").startswith("text/html"):
+        except requests.HTTPError as http_err:
-            logger.debug(f"skipping {l}")
+            logger.debug(f"failed to GET {l}, got a {http_err.response.status_code}")
            continue
        r.encoding = "UTF-8"