From 369c38005488f0de81044c1e20270d3b9d1e7764 Mon Sep 17 00:00:00 2001
From: Thomas Sileo
Date: Sat, 17 Aug 2019 21:32:52 +0200
Subject: [PATCH] Fix OG parsing

---
 utils/opengraph.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/utils/opengraph.py b/utils/opengraph.py
index e869594..8d487b3 100644
--- a/utils/opengraph.py
+++ b/utils/opengraph.py
@@ -74,7 +74,9 @@ def fetch_og_metadata(user_agent, links):
             logger.debug(f"failed to HEAD {l}: {err!r}")
             continue
 
-        if not h.headers.get("content-type").startswith("text/html"):
+        if h.headers.get("content-type") and not h.headers.get(
+            "content-type"
+        ).startswith("text/html"):
             logger.debug(f"skipping {l} for bad content type")
             continue
 
@@ -92,6 +94,12 @@ def fetch_og_metadata(user_agent, links):
             logger.debug(f"failed to GET {l}: {err!r}")
             continue
 
+        # FIXME(tsileo): check mimetype via the URL too (like we do for images)
+        if not r.headers.get("content-type") or not r.headers.get(
+            "content-type"
+        ).startswith("text/html"):
+            continue
+
         r.encoding = "UTF-8"
         html = r.text
         try: