2018-05-18 13:41:41 -05:00
|
|
|
import opengraph
|
|
|
|
import requests
|
|
|
|
from bs4 import BeautifulSoup
|
2018-06-17 12:21:59 -05:00
|
|
|
from little_boxes.urlutils import check_url
|
|
|
|
from little_boxes.urlutils import is_url_valid
|
2018-05-18 13:41:41 -05:00
|
|
|
|
|
|
|
|
|
|
|
def links_from_note(note):
    """Return the set of external links found in a note's HTML content.

    Every ``<a>`` element of ``note["content"]`` is inspected. An href is
    kept when it starts with ``http``, is not already listed as the ``href``
    of one of the note's ``tag`` entries, and is accepted by
    ``is_url_valid``.

    Args:
        note: Mapping describing the note. ``content`` (an HTML string) and
            ``tag`` (a list of mappings, each with an optional ``href``)
            are both optional.

    Returns:
        A set of href strings; empty when the note has no content or no
        anchor qualifies.
    """
    content = note.get("content")
    if not content:
        # No HTML to parse, so there cannot be any link.
        return set()

    tags_href = set()
    for t in note.get("tag", []):
        h = t.get("href")
        if h:
            # TODO(tsileo): fetch the URL for Actor profile, type=mention
            tags_href.add(h)

    links = set()
    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed (and warns), so results could differ between
    # environments.
    soup = BeautifulSoup(content, "html.parser")
    for link in soup.find_all("a"):
        h = link.get("href")
        if not h:
            # Anchors without an href (e.g. <a name="...">) yield None.
            continue
        if h.startswith("http") and h not in tags_href and is_url_valid(h):
            links.add(h)

    return links
|
|
|
|
|
|
|
|
|
|
|
|
def fetch_og_metadata(user_agent, col, remote_id, timeout=15):
    """Fetch OpenGraph metadata for the links of a stored note and save it.

    The document matching ``remote_id`` is looked up in ``col``, the links
    of its ``activity.object`` note are extracted with ``links_from_note``,
    each link is downloaded, and the parsed OpenGraph data is stored on the
    document under ``meta.og_metadata``.

    Args:
        user_agent: Value sent as the ``User-Agent`` header of each request.
        col: Collection-like object providing ``find_one`` and
            ``update_one`` (presumably a MongoDB collection -- confirm
            against the caller).
        remote_id: Identifier used to look up and update the document.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        The number of links found in the note (0 when there are none, in
        which case the document is left untouched).

    Raises:
        ValueError: If no document matches ``remote_id``.
        requests.RequestException: If a download fails, times out, or
            returns an error status.
    """
    doc = col.find_one({"remote_id": remote_id})
    if not doc:
        raise ValueError(f"no document found for remote_id {remote_id!r}")

    note = doc["activity"]["object"]
    links = links_from_note(note)
    if not links:
        return 0

    # The pages are downloaded here, rather than by OpenGraph itself, so
    # that the custom User-Agent is sent; OpenGraph only parses the HTML.
    htmls = []
    for url in links:
        # NOTE(review): check_url presumably raises for URLs that must not
        # be fetched -- confirm in little_boxes.urlutils.
        check_url(url)
        r = requests.get(
            url, headers={"User-Agent": user_agent}, timeout=timeout
        )
        r.raise_for_status()
        htmls.append(r.text)

    links_og_metadata = [dict(opengraph.OpenGraph(html=html)) for html in htmls]
    col.update_one(
        {"remote_id": remote_id}, {"$set": {"meta.og_metadata": links_og_metadata}}
    )
    return len(links)
|