Skip to content

Commit ff5bfcc

Browse files
committed
refactor fmd change check, use in permissions as well
1 parent 804f086 commit ff5bfcc

3 files changed

Lines changed: 99 additions & 99 deletions

File tree

src/main/java/edu/harvard/iq/dataverse/FileMetadata.java

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,15 +67,15 @@
6767
*/
6868
@Table(indexes = {@Index(columnList="datafile_id"), @Index(columnList="datasetversion_id")} )
6969
@NamedNativeQuery(
70-
name = "FileMetadata.compareFileMetadata",
70+
name = "FileMetadata.getDatafilesWithChangedMetadata",
7171
query = "WITH fm_categories AS (" +
7272
" SELECT fmd.filemetadatas_id, " +
7373
" STRING_AGG(dfc.name, ',' ORDER BY dfc.name) AS categories " +
7474
" FROM FileMetadata_DataFileCategory fmd " +
7575
" JOIN DataFileCategory dfc ON fmd.filecategories_id = dfc.id " +
7676
" GROUP BY fmd.filemetadatas_id " +
7777
") " +
78-
"SELECT fm1.id " +
78+
"SELECT fm1.datafile_id " +
7979
"FROM FileMetadata fm1 " +
8080
"LEFT JOIN FileMetadata fm2 ON fm1.datafile_id = fm2.datafile_id " +
8181
" AND fm2.datasetversion_id = ?1 " +
@@ -93,11 +93,11 @@
9393
" ) " +
9494
" ) " +
9595
" )",
96-
resultSetMapping = "IdToLongMapping"
96+
resultSetMapping = "IdToIntegerMapping"
9797
)
9898
/* When this mapping was to Long.class, Postgres was still returning an Integer, causing indexing failures - see #11776 */
9999
@SqlResultSetMapping(
100-
name = "IdToLongMapping",
100+
name = "IdToIntegerMapping",
101101
columns = @ColumnResult(name = "id", type = Integer.class)
102102
)
103103
@Entity

src/main/java/edu/harvard/iq/dataverse/search/IndexServiceBean.java

Lines changed: 24 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -603,10 +603,23 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr
603603
writeDebugInfo(debug, dataset);
604604
}
605605
if (doNormalSolrDocCleanUp) {
606+
List<String> solrIdsOfPermissionDocsToDelete = new ArrayList<>();
606607
try {
608+
607609
solrIdsOfDocsToDelete = findFilesOfParentDataset(dataset.getId());
608610
logger.fine("Existing file docs: " + String.join(", ", solrIdsOfDocsToDelete));
609611
if (!solrIdsOfDocsToDelete.isEmpty()) {
612+
if(!latestVersion.isDraft()) {
613+
//For draft datasets after a published version, we're not reindexing the files unless their metadata changes
614+
//Therefore, to make sure their
615+
for(String fileDocId : solrIdsOfDocsToDelete) {
616+
if(!fileDocId.endsWith(draftSuffix)) {
617+
solrIdsOfPermissionDocsToDelete.add(fileDocId + draftSuffix + discoverabilityPermissionSuffix);
618+
}
619+
}
620+
621+
logger.fine("Existing permission docs: " + String.join(", ", solrIdsOfPermissionDocsToDelete));
622+
}
610623
// We keep the latest version's docs unless it is deaccessioned and there is no
611624
// published/released version
612625
// So skip the loop removing those docs from the delete list except in that case
@@ -650,7 +663,7 @@ private void doIndexDataset(Dataset dataset, boolean doNormalSolrDocCleanUp) thr
650663
logger.fine("Solr docs to delete: " + String.join(", ", solrIdsOfDocsToDelete));
651664

652665
if (!solrIdsOfDocsToDelete.isEmpty()) {
653-
List<String> solrIdsOfPermissionDocsToDelete = new ArrayList<>();
666+
654667
for (String file : solrIdsOfDocsToDelete) {
655668
// Also remove associated permission docs
656669
solrIdsOfPermissionDocsToDelete.add(file + discoverabilityPermissionSuffix);
@@ -1421,7 +1434,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set<Long
14211434
long maxSize = maxFTIndexingSize != null ? maxFTIndexingSize.longValue() : Long.MAX_VALUE;
14221435

14231436
List<String> filesIndexed = new ArrayList<>();
1424-
final List<Long> changedFileMetadataIds = new ArrayList<>();
1437+
final List<Long> changedFileIds = new ArrayList<>();
14251438
if (datasetVersion != null) {
14261439
List<FileMetadata> fileMetadatas = datasetVersion.getFileMetadatas();
14271440
List<FileMetadata> rfm = new ArrayList<>();
@@ -1432,42 +1445,17 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set<Long
14321445
fileMap.put(released.getDataFile().getId(), released);
14331446
}
14341447

1435-
Query query = em.createNamedQuery("FileMetadata.compareFileMetadata", Long.class);
1436-
query.setParameter(1, dataset.getReleasedVersion().getId());
1437-
query.setParameter(2, datasetVersion.getId());
1438-
1439-
/*
1440-
* When the query was configured to return Long, it was returning Integer. The query has been changed to return Integer now. The code here is robust if that changes in the future.
1441-
*/
1442-
List<Object> queryResults = query.getResultList();
1443-
for (Object result : queryResults) {
1444-
if (result != null) {
1445-
// Ensure we're adding Long objects to the list
1446-
if (result instanceof Integer intResult) {
1447-
logger.finest("Converted Integer result to Long: " + result);
1448-
changedFileMetadataIds.add(Long.valueOf(intResult));
1449-
} else if (result instanceof Long longResult) {
1450-
// Already a Long, add directly
1451-
logger.finest("Added existing Long to list: " + result);
1452-
changedFileMetadataIds.add(longResult);
1453-
} else {
1454-
// If it's not a Long, convert it to one via String
1455-
try {
1456-
changedFileMetadataIds.add(Long.valueOf(result.toString()));
1457-
logger.finest("Converted non-Long result to Long: " + result + " of type " + result.getClass().getName());
1458-
} catch (NumberFormatException e) {
1459-
logger.warning("Could not convert query result to Long: " + result);
1460-
}
1461-
}
1462-
}
1463-
}
1448+
solrIndexService.populateChangedFileIds(
1449+
dataset.getReleasedVersion().getId(),
1450+
datasetVersion.getId(),
1451+
changedFileIds);
14641452
logger.fine(
14651453
"We are indexing a draft version of a dataset that has a released version. We'll be checking file metadatas if they are exact clones of the released versions.");
14661454
} else if (datasetVersion.isDraft()) {
14671455
// Add all file metadata ids to changedFileMetadataIds
1468-
changedFileMetadataIds.addAll(
1456+
changedFileIds.addAll(
14691457
fileMetadatas.stream()
1470-
.map(FileMetadata::getId)
1458+
.map(fm -> fm.getDataFile().getId())
14711459
.collect(Collectors.toList())
14721460
);
14731461
}
@@ -1531,7 +1519,7 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set<Long
15311519
start = startDate;
15321520
}
15331521
boolean indexThisFile = false;
1534-
if (indexThisMetadata && (isReleasedVersion || changedFileMetadataIds.contains(fileMetadata.getId()))) {
1522+
if (indexThisMetadata && (isReleasedVersion || changedFileIds.contains(datafile.getId()))) {
15351523
indexThisFile = true;
15361524
} else if (indexThisMetadata) {
15371525
// Draft version, file is not new or all file metadata matches the released version
@@ -1868,6 +1856,8 @@ public SolrInputDocuments toSolrDocs(IndexableDataset indexableDataset, Set<Long
18681856
return new SolrInputDocuments(docs, msg, datasetId);
18691857
}
18701858

1859+
1860+
18711861
private String addOrUpdateDataset(IndexableDataset indexableDataset, Set<Long> datafilesInDraftVersion) throws SolrServerException, IOException {
18721862
final SolrInputDocuments docs = toSolrDocs(indexableDataset, datafilesInDraftVersion);
18731863

src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java

Lines changed: 71 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import jakarta.json.JsonObjectBuilder;
3939
import jakarta.persistence.EntityManager;
4040
import jakarta.persistence.PersistenceContext;
41+
import jakarta.persistence.Query;
4142

4243
import org.apache.solr.client.solrj.SolrQuery;
4344
import org.apache.solr.client.solrj.SolrServerException;
@@ -471,42 +472,100 @@ public void indexDatasetBatchInNewTransaction(List<Long> datasetIds, final int[]
471472
// Process files for this dataset
472473
Map<Long, List<String>> fileDownloadersMap = roleAssigneeSvc.findAssigneesWithDownloadPermissionOnDatasetFiles(dataset.getId());
473474
List<DatasetVersion> versions = versionsToReIndexPermissionsFor(dataset);
475+
final List<Long> changedFileIds = new ArrayList<>();
476+
if(versions.size()>1) {
477+
Long releasedVersionId = versions.get(versions.get(0).isReleased() ? 0 : 1).getId();
478+
Long draftVersionId = versions.get(versions.get(0).isReleased() ? 1 : 0).getId();
479+
480+
populateChangedFileIds(
481+
releasedVersionId,
482+
draftVersionId,
483+
changedFileIds
484+
);
485+
}
474486
for (DatasetVersion version : versions) {
475-
processDatasetVersionFiles(version, fileDownloadersMap, fileCounter, fileQueryMin, versions.size()>1);
487+
processDatasetVersionFiles(version, fileDownloadersMap, fileCounter, fileQueryMin, (versions.size()>1 && version.isDraft()) ? changedFileIds : null);
476488
}
477489
}
478490
}
479491
}
480492

481493
@TransactionAttribute(TransactionAttributeType.REQUIRES_NEW)
482494
public void indexDatasetFilesInNewTransaction(List<DatasetVersion> versions, Map<Long, List<String>> fileDownloadersMap, final int[] fileCounter, int fileQueryMin) {
495+
final List<Long> changedFileIds = new ArrayList<>();
496+
if(versions.size()>1) {
497+
Long releasedVersionId = versions.get(versions.get(0).isReleased() ? 0 : 1).getId();
498+
Long draftVersionId = versions.get(versions.get(0).isReleased() ? 1 : 0).getId();
499+
500+
populateChangedFileIds(
501+
releasedVersionId,
502+
draftVersionId,
503+
changedFileIds
504+
);
505+
}
483506
for (DatasetVersion version : versions) {
484507
// The version object is detached, but its fileMetadatas collection is already loaded.
485508
// We only need its ID and state, which are available.
486-
processDatasetVersionFiles(version, fileDownloadersMap, fileCounter, fileQueryMin, versions.size()>1);
509+
processDatasetVersionFiles(version, fileDownloadersMap, fileCounter, fileQueryMin, (versions.size()>1 && version.isDraft()) ? changedFileIds : null);
510+
}
511+
}
512+
513+
/**
514+
* Retrieves the IDs of file metadatas that have changed between the released version
515+
* and the draft version of a dataset.
516+
*
517+
* @param releasedVersionId the ID of the released dataset version
518+
* @param draftVersionId the ID of the draft dataset version
519+
* @param changedFileMetadataIds the list to populate with changed file metadata IDs
520+
*/
521+
protected void populateChangedFileIds(Long releasedVersionId, Long draftVersionId, List<Long> changedFileIds) {
522+
Query query = em.createNamedQuery("FileMetadata.getDatafilesWithChangedMetadata", Long.class);
523+
query.setParameter(1, releasedVersionId);
524+
query.setParameter(2, draftVersionId);
525+
526+
/*
527+
* When the query was configured to return Long, it was returning Integer.
528+
* The query has been changed to return Integer now. The code here is robust
529+
* if that changes in the future.
530+
*/
531+
List<Object> queryResults = query.getResultList();
532+
for (Object result : queryResults) {
533+
if (result != null) {
534+
// Ensure we're adding Long objects to the list
535+
if (result instanceof Integer intResult) {
536+
logger.finest("Converted Integer result to Long: " + result);
537+
changedFileIds.add(Long.valueOf(intResult));
538+
} else if (result instanceof Long longResult) {
539+
// Already a Long, add directly
540+
logger.finest("Added existing Long to list: " + result);
541+
changedFileIds.add(longResult);
542+
} else {
543+
// If it's not a Long, convert it to one via String
544+
try {
545+
changedFileIds.add(Long.valueOf(result.toString()));
546+
logger.finest("Converted non-Long result to Long: " + result + " of type " + result.getClass().getName());
547+
} catch (NumberFormatException e) {
548+
logger.warning("Could not convert query result to Long: " + result);
549+
}
550+
}
551+
}
487552
}
488553
}
489554

490555
private void processDatasetVersionFiles(DatasetVersion version, Map<Long, List<String>> fileDownloadersMap,
491-
final int[] fileCounter, int fileQueryMin, boolean isReleased) {
556+
final int[] fileCounter, int fileQueryMin, List<Long> changedFileIds) {
492557
List<String> cachedPerms = searchPermissionsService.findDatasetVersionPerms(version);
493558

494559
String solrIdEnd = getDatasetOrDataFileSolrEnding(version.getVersionState());
495560
Long versionId = version.getId();
496561
List<DataFileProxy> filesToReindexAsBatch = new ArrayList<>();
497562

498563
// If the version is draft and there is a released version,
499-
// we only need perm docs for the files with filemetadata changes == those with _draft solr docs already
500-
Set<Long> fileIdsToReindex = null;
501-
if (version.getVersionState().equals(DatasetVersion.VersionState.DRAFT) && isReleased) {
502-
fileIdsToReindex = getFileIdsWithSolrDocs(versionId);
503-
logger.fine("Found " + fileIdsToReindex.size() + " files with draft Solr docs for version " + versionId);
504-
}
564+
// we only need perm docs for the files with filemetadata changes == those in changedFileMetadataIds
505565

506566
// Process files in batches of 100
507567
int batchSize = 100;
508568

509-
final Set<Long> finalFileIdsToReindex = fileIdsToReindex;
510569
if (dataFileService.findCountByDatasetVersionId(version.getId()).intValue() > fileQueryMin) {
511570
// For large datasets, use a more efficient SQL query
512571
// ToDo - only get the ones in finalFileIdsToReindex
@@ -515,7 +574,7 @@ private void processDatasetVersionFiles(DatasetVersion version, Map<Long, List<S
515574
// Process files in batches to avoid memory issues
516575
fileStream.forEach(fileInfo -> {
517576
// Only add files that need reindexing
518-
if (finalFileIdsToReindex == null || finalFileIdsToReindex.contains(fileInfo.getFileId())) {
577+
if (changedFileIds == null || changedFileIds.contains(fileInfo.getFileId())) {
519578
filesToReindexAsBatch.add(fileInfo);
520579
fileCounter[0]++;
521580
if (filesToReindexAsBatch.size() >= batchSize) {
@@ -530,7 +589,7 @@ private void processDatasetVersionFiles(DatasetVersion version, Map<Long, List<S
530589
for (FileMetadata fmd : version.getFileMetadatas()) {
531590
// Only add files that need reindexing
532591
DataFileProxy fileProxy = new DataFileProxy(fmd);
533-
if (finalFileIdsToReindex == null || finalFileIdsToReindex.contains(fileProxy.getFileId())) {
592+
if (changedFileIds == null || changedFileIds.contains(fileProxy.getFileId())) {
534593
filesToReindexAsBatch.add(fileProxy);
535594
fileCounter[0]++;
536595
if (filesToReindexAsBatch.size() >= batchSize) {
@@ -584,55 +643,6 @@ private List<DatasetVersion> versionsToReIndexPermissionsFor(Dataset dataset) {
584643
return versionsToReindexPermissionsFor;
585644
}
586645

587-
/**
588-
* Queries Solr to find file IDs that have draft documents for the given dataset version.
589-
* This is used to optimize permission reindexing by only processing files that have
590-
* metadata changes in the draft version.
591-
*
592-
* @param datasetVersionId The ID of the dataset version
593-
* @return A set of file IDs that have Solr documents associated with this version
594-
*/
595-
private Set<Long> getFileIdsWithSolrDocs(Long datasetVersionId) {
596-
Set<Long> fileIds = new HashSet<>();
597-
598-
try {
599-
SolrQuery solrQuery = new SolrQuery();
600-
601-
// Query for files in this specific version with draft suffix
602-
solrQuery.setQuery("*:*");
603-
solrQuery.addFilterQuery(SearchFields.TYPE + ":" + SearchConstants.FILES);
604-
solrQuery.addFilterQuery(SearchFields.DATASET_VERSION_ID + ":" + datasetVersionId);
605-
606-
// Only return the entity ID field
607-
solrQuery.setFields(SearchFields.ENTITY_ID);
608-
609-
// We want all matching documents
610-
solrQuery.setRows(Integer.MAX_VALUE);
611-
612-
logger.fine("Solr query to find draft files: " + solrQuery);
613-
614-
QueryResponse queryResponse = solrClientService.getSolrClient().query(solrQuery);
615-
SolrDocumentList docs = queryResponse.getResults();
616-
617-
for (SolrDocument doc : docs) {
618-
Long entityId = (Long) doc.getFieldValue(SearchFields.ENTITY_ID);
619-
if (entityId != null) {
620-
fileIds.add(entityId);
621-
}
622-
}
623-
624-
logger.fine("Found " + fileIds.size() + " files with draft Solr docs for version " + datasetVersionId);
625-
626-
} catch (SolrServerException | IOException ex) {
627-
logger.log(Level.WARNING, "Error querying Solr for draft file IDs for version " + datasetVersionId +
628-
". Will reindex all files as fallback.", ex);
629-
// Return null to indicate we should process all files
630-
return null;
631-
}
632-
633-
return fileIds;
634-
}
635-
636646
public IndexResponse deleteMultipleSolrIds(List<String> solrIdsToDelete) {
637647
if (solrIdsToDelete.isEmpty()) {
638648
return new IndexResponse("nothing to delete");

0 commit comments

Comments
 (0)