microblog.pub/gc.py

106 lines
3.3 KiB
Python
Raw Normal View History

import logging
from datetime import datetime
from datetime import timedelta
from typing import List
from little_boxes import activitypub as ap
import activitypub
from activitypub import Box
from config import ID
from config import ME
from config import MEDIA_CACHE
2019-07-21 14:47:52 -05:00
from config import DAYS_TO_KEEP
from utils.migrations import DB
back = activitypub.MicroblogPubBackend()
ap.use_backend(back)
MY_PERSON = ap.Person(**ME)
logger = logging.getLogger(__name__)
def threads_of_interest() -> List[str]:
out = set()
# Fetch all the threads we've participed in
for data in DB.activities.find(
{
"meta.thread_root_parent": {"$exists": True},
"box": Box.OUTBOX.value,
"type": ap.ActivityType.CREATE.value,
}
):
out.add(data["meta"]["thread_root_parent"])
# Fetch all threads related to bookmarked activities
for data in DB.activities.find({"meta.bookmarked": True}):
# Keep the replies
out.add(data["meta"]["object_id"])
# And the whole thread if any
if "thread_root_parent" in data["meta"]:
out.add(data["meta"]["thread_root_parent"])
return list(out)
def perform() -> None:
2019-07-21 14:47:52 -05:00
d = (datetime.utcnow() - timedelta(days=DAYS_TO_KEEP)).strftime("%Y-%m-%d")
toi = threads_of_interest()
2019-07-21 14:47:52 -05:00
logger.info(f"thread_of_interest={toi!r}")
# Go over the old Create activities
for data in DB.activities.find(
{
"box": Box.INBOX.value,
"type": ap.ActivityType.CREATE.value,
"activity.published": {"$lt": d},
}
2019-07-21 14:47:52 -05:00
):
try:
remote_id = data["remote_id"]
meta = data["meta"]
activity = ap.parse_activity(data["activity"])
logger.info(f"activity={activity!r}")
# This activity has been bookmarked, keep it
if meta.get("bookmarked"):
continue
# Inspect the object
obj = activity.get_object()
# This activity mentions the server actor, keep it
if obj.has_mention(ID):
continue
# This activity is a direct reply of one the server actor activity, keep it
in_reply_to = obj.get_in_reply_to()
if in_reply_to and in_reply_to.startswith(ID):
continue
2019-07-21 14:47:52 -05:00
# This activity is part of a thread we want to keep, keep it
if in_reply_to and meta.get("thread_root_parent"):
thread_root_parent = meta["thread_root_parent"]
if thread_root_parent.startswith(ID) or thread_root_parent in toi:
continue
# This activity was boosted or liked, keep it
if meta.get("boosted") or meta.get("liked"):
continue
# TODO(tsileo): remove after tests
if meta.get("keep"):
logger.warning(f"{activity!r} would not have been deleted, skipping for now")
continue
2019-07-21 14:47:52 -05:00
# Delete the cached attachment
for grid_item in MEDIA_CACHE.fs.find({"remote_id": remote_id}):
MEDIA_CACHE.fs.delete(grid_item._id)
2019-07-21 14:47:52 -05:00
# Delete the activity
DB.activities.delete_one({"_id": data["_id"]})
except Exception:
logger.exception(f"failed to process {data!r}")