Fix OG parsing
This commit is contained in:
parent
6d1b342af1
commit
369c380054
1 changed file with 9 additions and 1 deletion
|
@@ -74,7 +74,9 @@ def fetch_og_metadata(user_agent, links):
|
||||||
logger.debug(f"failed to HEAD {l}: {err!r}")
|
logger.debug(f"failed to HEAD {l}: {err!r}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not h.headers.get("content-type").startswith("text/html"):
|
if h.headers.get("content-type") and not h.headers.get(
|
||||||
|
"content-type"
|
||||||
|
).startswith("text/html"):
|
||||||
logger.debug(f"skipping {l} for bad content type")
|
logger.debug(f"skipping {l} for bad content type")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
@@ -92,6 +94,12 @@ def fetch_og_metadata(user_agent, links):
|
||||||
logger.debug(f"failed to GET {l}: {err!r}")
|
logger.debug(f"failed to GET {l}: {err!r}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# FIXME(tsileo): check mimetype via the URL too (like we do for images)
|
||||||
|
if not r.headers.get("content-type") or not r.headers.get(
|
||||||
|
"content-type"
|
||||||
|
).startswith("text/html"):
|
||||||
|
continue
|
||||||
|
|
||||||
r.encoding = "UTF-8"
|
r.encoding = "UTF-8"
|
||||||
html = r.text
|
html = r.text
|
||||||
try:
|
try:
|
||||||
|
|
Loading…
Reference in a new issue