microblog.pub/core/gc.py


import logging
from datetime import datetime
from datetime import timedelta
from time import perf_counter
from typing import Any
from typing import Dict
from typing import List

from little_boxes import activitypub as ap
from little_boxes.errors import ActivityGoneError
from little_boxes.errors import RemoteServerUnavailableError

from config import DAYS_TO_KEEP
from config import ID
from config import ME
from config import MEDIA_CACHE
from core import activitypub
from core.meta import Box
from core.meta import MetaKey
from core.meta import _meta
from core.meta import by_type
from core.meta import in_inbox
from utils.migrations import DB
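
# The GC needs a little_boxes backend: resolving an activity's object via
# Activity.get_object() below goes through it to fetch remote payloads.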
back = activitypub.MicroblogPubBackend()
ap.use_backend(back)

MY_PERSON = ap.Person(**ME)

logger = logging.getLogger(__name__)


def threads_of_interest() -> List[str]:
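    """Return the IDs of threads that the GC should spare.

    This covers every thread the local actor participated in, plus the
    objects and threads tied to bookmarked activities.
    """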
out = set()
    # Fetch all the threads we've participated in
for data in DB.activities.find(
{
"meta.thread_root_parent": {"$exists": True},
"box": Box.OUTBOX.value,
"type": ap.ActivityType.CREATE.value,
}
):
out.add(data["meta"]["thread_root_parent"])
# Fetch all threads related to bookmarked activities
for data in DB.activities.find({"meta.bookmarked": True}):
        # Keep replies to the bookmarked object
out.add(data["meta"]["object_id"])
# And the whole thread if any
if "thread_root_parent" in data["meta"]:
out.add(data["meta"]["thread_root_parent"])
return list(out)
def _keep(data: Dict[str, Any]) -> None:
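    """Flag an activity with `meta.gc_keep` so future GC runs skip it."""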
DB.activities.update_one({"_id": data["_id"]}, {"$set": {"meta.gc_keep": True}})
def perform() -> None: # noqa: C901
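    """Collect old inbox activities that are no longer of interest.

    Activities older than DAYS_TO_KEEP are deleted (along with their cached
    media) unless they are bookmarked, liked, boosted, mention the local
    actor, or belong to a thread of interest.
    """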
start = perf_counter()
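    # Cutoff date rendered as a "YYYY-MM-DD" string; meta.published holds an
    # ISO 8601 string, so the lexicographic $lt comparisons below act as
    # date comparisons.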
d = (datetime.utcnow() - timedelta(days=DAYS_TO_KEEP)).strftime("%Y-%m-%d")
toi = threads_of_interest()
    logger.info(f"threads_of_interest={toi!r}")
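    # First pass: old inbox Delete activities can be dropped outright.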
delete_deleted = DB.activities.delete_many(
{
**in_inbox(),
**by_type(ap.ActivityType.DELETE),
_meta(MetaKey.PUBLISHED): {"$lt": d},
}
).deleted_count
    logger.info(f"{delete_deleted} old Delete activities deleted")
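    # Second pass: look at old inbox Create activities, in batches of 500,
    # and keep only those still of interest.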
create_deleted = 0
create_count = 0
# Go over the old Create activities
for data in DB.activities.find(
{
"box": Box.INBOX.value,
"type": ap.ActivityType.CREATE.value,
_meta(MetaKey.PUBLISHED): {"$lt": d},
"meta.gc_keep": {"$exists": False},
}
).limit(500):
try:
logger.info(f"data={data!r}")
create_count += 1
remote_id = data["remote_id"]
meta = data["meta"]
# This activity has been bookmarked, keep it
if meta.get("bookmarked"):
_keep(data)
continue
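            # Resolve the object to check for mentions and replies; if the
            # remote end is gone or unreachable, obj stays None and the
            # checks that need it are skipped.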
            obj = None
            in_reply_to = None
if not meta.get("deleted"):
try:
activity = ap.parse_activity(data["activity"])
logger.info(f"activity={activity!r}")
obj = activity.get_object()
except (RemoteServerUnavailableError, ActivityGoneError):
logger.exception(
f"failed to load {remote_id}, this activity will be deleted"
)
# This activity mentions the server actor, keep it
if obj and obj.has_mention(ID):
_keep(data)
continue
            # This activity is a direct reply to one of the server actor's activities, keep it
if obj:
in_reply_to = obj.get_in_reply_to()
if in_reply_to and in_reply_to.startswith(ID):
_keep(data)
continue
# This activity is part of a thread we want to keep, keep it
if obj and in_reply_to and meta.get("thread_root_parent"):
thread_root_parent = meta["thread_root_parent"]
if thread_root_parent.startswith(ID) or thread_root_parent in toi:
_keep(data)
continue
# This activity was boosted or liked, keep it
if meta.get("boosted") or meta.get("liked"):
_keep(data)
continue
            # TODO(tsileo): remove after tests
            if meta.get("keep"):
                logger.warning(
                    f"{remote_id} would not have been deleted, skipping for now"
                )
_keep(data)
continue
# Delete the cached attachment
for grid_item in MEDIA_CACHE.fs.find({"remote_id": remote_id}):
MEDIA_CACHE.fs.delete(grid_item._id)
# Delete the activity
DB.activities.delete_one({"_id": data["_id"]})
create_deleted += 1
except Exception:
logger.exception(f"failed to process {data!r}")
after_gc_create = perf_counter()
time_to_gc_create = after_gc_create - start
logger.info(
f"{time_to_gc_create:.2f} seconds to analyze {create_count} Create, {create_deleted} deleted"
)
announce_count = 0
announce_deleted = 0
    # Go over the old Announce activities
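    # An old Announce survives only if it was bookmarked or if it announces
    # one of our own (outbox) activities.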
for data in DB.activities.find(
{
"box": Box.INBOX.value,
"type": ap.ActivityType.ANNOUNCE.value,
_meta(MetaKey.PUBLISHED): {"$lt": d},
"meta.gc_keep": {"$exists": False},
}
).limit(500):
try:
announce_count += 1
remote_id = data["remote_id"]
meta = data["meta"]
activity = ap.parse_activity(data["activity"])
logger.info(f"activity={activity!r}")
# This activity has been bookmarked, keep it
if meta.get("bookmarked"):
_keep(data)
continue
object_id = activity.get_object_id()
# This announce is for a local activity (i.e. from the outbox), keep it
if object_id.startswith(ID):
_keep(data)
continue
for grid_item in MEDIA_CACHE.fs.find({"remote_id": remote_id}):
MEDIA_CACHE.fs.delete(grid_item._id)
# TODO(tsileo): here for legacy reason, this needs to be removed at some point
for grid_item in MEDIA_CACHE.fs.find({"remote_id": object_id}):
MEDIA_CACHE.fs.delete(grid_item._id)
# Delete the activity
DB.activities.delete_one({"_id": data["_id"]})
announce_deleted += 1
except Exception:
logger.exception(f"failed to process {data!r}")
after_gc_announce = perf_counter()
time_to_gc_announce = after_gc_announce - after_gc_create
logger.info(
f"{time_to_gc_announce:.2f} seconds to analyze {announce_count} Announce, {announce_deleted} deleted"
)
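
# Note: perform() is meant to be invoked periodically (e.g. from a scheduled
# garbage-collection task); it is safe to re-run, since kept activities are
# flagged via meta.gc_keep and skipped on subsequent passes.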