Draft

Changes from all commits (23 commits)
f2c3ba1
operator crds:
ikreymer Sep 23, 2025
4bd5525
work
ikreymer Sep 23, 2025
183c47b
add crud for CollIndex object in collections
ikreymer Sep 24, 2025
85c8691
add import job, minimally working
ikreymer Sep 25, 2025
933020d
add btrix-crds 0.2.0
ikreymer Sep 25, 2025
f5e609a
add dedupCollId to crawler, support running crawler with dedup!
ikreymer Sep 25, 2025
84c3b53
ensure collindex deleted on collection delete
ikreymer Sep 26, 2025
5b68f54
add 'waiting_for_dedup_index' state to indicate crawl is awaiting ded…
ikreymer Sep 28, 2025
3de00d4
make storage and memory configureable: lower settings for tests
ikreymer Sep 28, 2025
96fa722
configmap: add missing settings
ikreymer Sep 28, 2025
ec7dfc8
make dedupCollId independent, but require dedup coll to also be in au…
ikreymer Sep 30, 2025
61f5d2f
fix typo/formatting
ikreymer Sep 30, 2025
1118d06
index import channel: support setting custom crawler channel to use f…
ikreymer Sep 30, 2025
651878d
configmap: fix quotes
ikreymer Sep 30, 2025
e609e79
fix autoadd uploads to collections
ikreymer Sep 30, 2025
1105ee3
Update backend/btrixcloud/crawlmanager.py
ikreymer Oct 1, 2025
25f01cb
Apply suggestion from @tw4l
ikreymer Oct 1, 2025
016944e
Apply suggestion from @tw4l
ikreymer Oct 1, 2025
25f09a6
refactor toggle_dedup_index():
ikreymer Oct 1, 2025
5507a25
chart: change index importer nodeAffinity to preferred not required
ikreymer Oct 1, 2025
548193a
dedup updates:
ikreymer Oct 13, 2025
b9c4251
lint fix move
ikreymer Oct 13, 2025
c993b0d
more lint fixes
ikreymer Oct 13, 2025
110 changes: 98 additions & 12 deletions backend/btrixcloud/colls.py
@@ -49,6 +49,7 @@
UserFilePreparer,
MIN_UPLOAD_PART_SIZE,
PublicCollOut,
ResourcesOnly,
)
from .utils import (
dt_now,
@@ -57,6 +58,8 @@
get_origin,
)

from .crawlmanager import CrawlManager

if TYPE_CHECKING:
from .orgs import OrgOps
from .storages import StorageOps
@@ -81,8 +84,16 @@ class CollectionOps:
event_webhook_ops: EventWebhookOps
crawl_ops: CrawlOps
page_ops: PageOps
crawl_manager: CrawlManager

def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
def __init__(
self,
mdb,
orgs: OrgOps,
storage_ops: StorageOps,
crawl_manager: CrawlManager,
event_webhook_ops: EventWebhookOps,
):
self.collections = mdb["collections"]
self.crawls = mdb["crawls"]
self.crawl_configs = mdb["crawl_configs"]
@@ -91,6 +102,7 @@ def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):

self.orgs = orgs
self.storage_ops = storage_ops
self.crawl_manager = crawl_manager
self.event_webhook_ops = event_webhook_ops

def set_crawl_ops(self, ops):
@@ -141,11 +153,15 @@ async def add_collection(self, oid: UUID, coll_in: CollIn):
access=coll_in.access,
defaultThumbnailName=coll_in.defaultThumbnailName,
allowPublicDownload=coll_in.allowPublicDownload,
hasDedupIndex=coll_in.hasDedupIndex,
)
try:
await self.collections.insert_one(coll.to_dict())
org = await self.orgs.get_org_by_id(oid)
await self.clear_org_previous_slugs_matching_slug(slug, org)
# create collection index
if coll.hasDedupIndex:
await self.crawl_manager.create_coll_index(coll)

if crawl_ids:
await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)
@@ -194,22 +210,33 @@ async def update_collection(
db_update["$push"] = {"previousSlugs": previous_slug}

try:
result = await self.collections.find_one_and_update(
prev_result = await self.collections.find_one_and_update(
{"_id": coll_id, "oid": org.id},
db_update,
return_document=pymongo.ReturnDocument.AFTER,
return_document=pymongo.ReturnDocument.BEFORE,
)
except pymongo.errors.DuplicateKeyError as err:
# pylint: disable=raise-missing-from
field = get_duplicate_key_error_field(err)
raise HTTPException(status_code=400, detail=f"collection_{field}_taken")

if not result:
if not prev_result:
raise HTTPException(status_code=404, detail="collection_not_found")

if slug_update:
await self.clear_org_previous_slugs_matching_slug(slug_update, org)

# if dedup index is true, but was false
if update.hasDedupIndex and not prev_result.get("hasDedupIndex"):
# get latest coll, create index
coll = await self.get_collection(coll_id, org.id)
await self.crawl_manager.create_coll_index(coll)

# if dedup is false, but was true
if update.hasDedupIndex is False and prev_result.get("hasDedupIndex"):
# delete index -- may need extra restrictions
await self.crawl_manager.delete_coll_index(coll_id)

return {"updated": True}

async def clear_org_previous_slugs_matching_slug(
@@ -221,6 +248,16 @@ async def clear_org_previous_slugs_matching_slug(
{"$pull": {"previousSlugs": slug}},
)

async def get_coll_dedup_index(self, coll_id: UUID) -> bool:
"""return true/false if collection has dedup index, or raise"""
result = await self.collections.find_one(
{"_id": coll_id}, projection=["hasDedupIndex"]
)
if not result:
raise HTTPException(status_code=404, detail="collection_not_found")

return result["hasDedupIndex"] is True

async def add_crawls_to_collection(
self,
coll_id: UUID,
@@ -229,8 +266,6 @@ async def add_crawls_to_collection(
headers: Optional[dict] = None,
) -> CollOut:
"""Add crawls to collection"""
await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)

modified = dt_now()
result = await self.collections.find_one_and_update(
{"_id": coll_id},
@@ -240,8 +275,11 @@ async def add_crawls_to_collection(
if not result:
raise HTTPException(status_code=404, detail="collection_not_found")

# do this after checking if collection exists
await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)

await self.update_collection_counts_and_tags(coll_id)
await self.update_collection_dates(coll_id, org.id)
await self.update_collection_dates(coll_id, org.id, update_index=True)

asyncio.create_task(
self.event_webhook_ops.create_added_to_collection_notification(
@@ -294,6 +332,24 @@ async def get_collection_raw(

return result

async def enable_dedup_index(self, coll_id: UUID):
"""enable dedup index if it doesn't exist yet"""
result = await self.collections.find_one_and_update(
{"_id": coll_id, "hasDedupIndex": {"$ne": True}},
{"$set": {"hasDedupIndex": True}},
return_document=pymongo.ReturnDocument.AFTER,
)

# not changed, nothing to do
if not result:
return False

coll = Collection.from_dict(result)

await self.crawl_manager.create_coll_index(coll)

return True

async def get_collection_raw_by_slug(
self,
coll_slug: str,
@@ -396,6 +452,16 @@ async def get_collection_out(

return CollOut.from_dict(result)

async def get_internal_replay_list(self, coll_id: UUID, oid: UUID) -> ResourcesOnly:
"""get list of internally resolved signed WACZ files"""
org = await self.orgs.get_org_by_id(oid)
resources, _, _ = await self.get_collection_crawl_resources(coll_id, org)

for file_ in resources:
file_.path = self.storage_ops.resolve_internal_access_path(file_.path)

return ResourcesOnly(resources=resources)

async def get_public_collection_out(
self,
coll_id: UUID,
@@ -639,6 +705,9 @@ async def delete_collection(self, coll_id: UUID, org: Organization):
if coll.thumbnail:
await self.delete_thumbnail(coll_id, org)

if coll.hasDedupIndex:
await self.crawl_manager.delete_coll_index(coll.id)

result = await self.collections.delete_one({"_id": coll_id, "oid": org.id})
if result.deleted_count < 1:
raise HTTPException(status_code=404, detail="collection_not_found")
@@ -740,7 +809,9 @@ async def update_collection_counts_and_tags(self, collection_id: UUID):
},
)

async def update_collection_dates(self, coll_id: UUID, oid: UUID):
async def update_collection_dates(
self, coll_id: UUID, oid: UUID, update_index=False
):
"""Update collection earliest and latest dates from page timestamps"""
# pylint: disable=too-many-locals
coll = await self.get_collection(coll_id, oid)
@@ -749,6 +820,10 @@ async def update_collection_dates(self, coll_id: UUID, oid: UUID):
earliest_ts = None
latest_ts = None

# update_index is set, update dedup index if it exists
if update_index and coll.hasDedupIndex:
await self.crawl_manager.update_coll_index(coll_id)

match_query = {
"oid": coll.oid,
"crawl_id": {"$in": crawl_ids},
@@ -783,13 +858,16 @@ async def update_collection_dates(self, coll_id: UUID, oid: UUID):

async def update_crawl_collections(self, crawl_id: str, oid: UUID):
"""Update counts, dates, and modified for all collections in crawl"""
# accessing directly to handle both crawls and uploads
crawl = await self.crawls.find_one({"_id": crawl_id})
crawl_coll_ids = crawl.get("collectionIds")
crawl_coll_ids = crawl.get("collectionIds") or []
Reviewer comment (Member):

Suggested change:
-        crawl_coll_ids = crawl.get("collectionIds") or []
+        crawl_coll_ids = crawl.get("collectionIds", [])

Just a bit more idiomatic

modified = dt_now()

for coll_id in crawl_coll_ids:
await self.update_collection_counts_and_tags(coll_id)
await self.update_collection_dates(coll_id, oid)
await self.update_collection_dates(
coll_id, oid, crawl.get("dedupCollId") != coll_id
Reviewer comment (Member):

Can we standardize dedup to dedupe? I believe this is the more common shorthand (#2893 (comment))

)
await self.collections.find_one_and_update(
{"_id": coll_id},
{"$set": {"modified": modified}},
@@ -1000,12 +1078,20 @@ async def calculate_thumbnail_storage(self, oid: UUID) -> int:
# ============================================================================
# pylint: disable=too-many-locals
def init_collections_api(
app, mdb, orgs, storage_ops, event_webhook_ops, user_dep
app,
mdb,
orgs: OrgOps,
storage_ops: StorageOps,
crawl_manager: CrawlManager,
event_webhook_ops: EventWebhookOps,
user_dep,
) -> CollectionOps:
"""init collections api"""
# pylint: disable=invalid-name, unused-argument, too-many-arguments

colls: CollectionOps = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops)
colls: CollectionOps = CollectionOps(
mdb, orgs, storage_ops, crawl_manager, event_webhook_ops
)

org_crawl_dep = orgs.org_crawl_dep
org_viewer_dep = orgs.org_viewer_dep
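To make the hasDedupIndex handling in update_collection() above easier to follow, here is a minimal standalone sketch of the transition-detection pattern it relies on: the pre-update document (ReturnDocument.BEFORE) is compared against the requested value, so the CollIndex object is only created or deleted when the flag actually flips. Field and collection names match the diff; the function name and simplified arguments are illustrative only, not code from the PR.

import pymongo

async def apply_dedup_toggle(collections, crawl_manager, coll_ops, coll_id, org, new_value):
    # read the document as it was *before* the update, so a real transition
    # can be detected (mirrors update_collection() above)
    prev = await collections.find_one_and_update(
        {"_id": coll_id, "oid": org.id},
        {"$set": {"hasDedupIndex": new_value}},
        return_document=pymongo.ReturnDocument.BEFORE,
    )
    if not prev:
        raise ValueError("collection_not_found")

    was_enabled = prev.get("hasDedupIndex") is True
    if new_value and not was_enabled:
        # turned on: fetch the fresh collection and create its CollIndex
        coll = await coll_ops.get_collection(coll_id, org.id)
        await crawl_manager.create_coll_index(coll)
    elif new_value is False and was_enabled:
        # turned off: remove the CollIndex object
        await crawl_manager.delete_coll_index(coll_id)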
39 changes: 38 additions & 1 deletion backend/btrixcloud/crawlconfigs.py
@@ -316,6 +316,14 @@ async def add_crawl_config(

first_seed = seeds[0].url

# the dedup collection id must also be in auto add collections
if config_in.dedupCollId:
if config_in.autoAddCollections is None:
config_in.autoAddCollections = []

if config_in.dedupCollId not in config_in.autoAddCollections:
config_in.autoAddCollections.append(config_in.dedupCollId)

now = dt_now()
crawlconfig = CrawlConfig(
id=uuid4(),
@@ -343,6 +351,7 @@
firstSeed=first_seed,
seedCount=seed_count,
shareable=config_in.shareable,
dedupCollId=config_in.dedupCollId,
)

if config_in.runNow:
@@ -359,6 +368,9 @@
storage_quota_reached = False
exec_mins_quota_reached = False

if config_in.dedupCollId:
await self.coll_ops.enable_dedup_index(config_in.dedupCollId)

if config_in.runNow:
try:
crawl_id = await self.run_now_internal(crawlconfig, org, user)
@@ -602,6 +614,19 @@ async def update_crawl_config(
update.tags is not None
and ",".join(orig_crawl_config.tags) != ",".join(update.tags)
)

if isinstance(update.dedupCollId, UUID):
if update.autoAddCollections is None:
update.autoAddCollections = []

if update.dedupCollId not in update.autoAddCollections:
update.autoAddCollections.append(update.dedupCollId)

metadata_changed = metadata_changed or (
update.dedupCollId is not None
and update.dedupCollId != orig_crawl_config.dedupCollId
)

metadata_changed = metadata_changed or (
update.autoAddCollections is not None
and sorted(orig_crawl_config.autoAddCollections)
@@ -629,14 +654,22 @@
query["modifiedByName"] = user.name
query["modified"] = dt_now()

# if empty str, just clear the profile
# profile - if empty str, just clear the profile
if update.profileid == "":
query["profileid"] = None
# else, ensure its a valid profile
elif update.profileid:
await self.profiles.get_profile(cast(UUID, update.profileid), org)
query["profileid"] = update.profileid

# dedup - if empty dedupCollId, clear the coll id
if update.dedupCollId == "":
query["dedupCollId"] = None
# else, enable dedup on collection
if isinstance(update.dedupCollId, UUID):
query["dedupCollId"] = update.dedupCollId
await self.coll_ops.enable_dedup_index(update.dedupCollId)

if update.config is not None:
query["config"] = update.config.dict()

@@ -1116,6 +1149,10 @@ async def remove_collection_from_all_configs(
{"$pull": {"autoAddCollections": coll_id}},
)

await self.crawl_configs.update_many(
{"oid": org.id, "dedupCollId": coll_id}, {"$set": {"dedupCollId": None}}
)

async def get_crawl_config_tags(self, org):
"""get distinct tags from all crawl configs for this org"""
return await self.crawl_configs.distinct("tags", {"oid": org.id})
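The add_crawl_config() and update_crawl_config() changes above both enforce the same rule: a workflow's dedupCollId must also appear in its autoAddCollections, so crawls that use the dedup index are automatically added to that collection. A small restatement of that normalization as a sketch (names simplified, not code from the PR):

from typing import List, Optional
from uuid import UUID

def ensure_dedup_coll_auto_added(
    dedup_coll_id: Optional[UUID],
    auto_add_collections: Optional[List[UUID]],
) -> Optional[List[UUID]]:
    # no dedup collection set: leave autoAddCollections untouched
    if not dedup_coll_id:
        return auto_add_collections
    # dedup collection set: make sure it is also auto-added
    auto_add_collections = auto_add_collections or []
    if dedup_coll_id not in auto_add_collections:
        auto_add_collections.append(dedup_coll_id)
    return auto_add_collections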
31 changes: 30 additions & 1 deletion backend/btrixcloud/crawlmanager.py
@@ -5,13 +5,14 @@

from typing import Optional, Dict, Tuple
from datetime import datetime, timedelta
from uuid import UUID

from fastapi import HTTPException

from .utils import dt_now, date_to_str, scale_from_browser_windows
from .k8sapi import K8sAPI

from .models import StorageRef, CrawlConfig, BgJobType
from .models import StorageRef, CrawlConfig, BgJobType, Collection


# ============================================================================
@@ -293,6 +294,9 @@ async def create_crawl_job(
storage_filename=storage_filename,
profile_filename=profile_filename,
proxy_id=crawlconfig.proxyId or DEFAULT_PROXY_ID,
dedup_coll_id=(
str(crawlconfig.dedupCollId) if crawlconfig.dedupCollId else ""
),
is_single_page=is_single_page,
seed_file_url=seed_file_url,
)
@@ -468,6 +472,31 @@ async def delete_crawl_config_by_id(self, cid: str) -> None:
"""Delete all crawl configs by id"""
await self._delete_crawl_configs(f"btrix.crawlconfig={cid}")

async def create_coll_index(self, collection: Collection):
"""create collection index"""
params = {
"id": str(collection.id),
"oid": str(collection.oid),
"collItemsUpdatedAt": date_to_str(collection.modified or dt_now()),
}
data = self.templates.env.get_template("coll_index.yaml").render(params)

await self.create_from_yaml(data)

return str(collection.id)

async def update_coll_index(self, coll_id: UUID):
"""force collection index to update"""
return await self.patch_custom_object(
f"collindex-{coll_id}",
{"collItemsUpdatedAt": date_to_str(dt_now())},
"collindexes",
)

async def delete_coll_index(self, coll_id: UUID):
"""delete collection index"""
return await self.delete_custom_object(f"collindex-{coll_id}", "collindexes")

# ========================================================================
# Internal Methods
async def _delete_crawl_configs(self, label) -> None:
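For orientation, a sketch (not part of the PR) of how the three CollIndex helpers added above fit into a collection's lifecycle; the calling flow is simplified, and coll is assumed to be a Collection model with hasDedupIndex enabled.

async def coll_index_lifecycle(crawl_manager, coll, coll_id):
    # created when hasDedupIndex is first enabled: renders coll_index.yaml and
    # creates a custom object named "collindex-<coll_id>"
    await crawl_manager.create_coll_index(coll)

    # bumped when collection contents change (see update_collection_dates()
    # with update_index=True), so the operator re-syncs the index
    await crawl_manager.update_coll_index(coll_id)

    # removed when dedup is disabled or the collection is deleted
    await crawl_manager.delete_coll_index(coll_id)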