From 9ff8e4fbac882f73b0f3642ffafc1f9282a4d405 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 11 Aug 2020 09:28:21 +0000 Subject: [PATCH 01/73] Bump luceneVersion from 5.3.0 to 8.6.0 Bumps `luceneVersion` from 5.3.0 to 8.6.0. Updates `lucene-core` from 5.3.0 to 8.6.0 Updates `lucene-queryparser` from 5.3.0 to 8.6.0 Updates `lucene-analyzers-common` from 5.3.0 to 8.6.0 Updates `lucene-join` from 5.3.0 to 8.6.0 Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100755 => 100644 pom.xml diff --git a/pom.xml b/pom.xml old mode 100755 new mode 100644 index 61a4eed..693a538 --- a/pom.xml +++ b/pom.xml @@ -14,7 +14,7 @@ https://repo.icatproject.org/repo github https://github.com/icatproject/icat.lucene - 5.3.0 + 8.6.0 From 48433e59bc16de0ae75630552f7075b2884d68ee Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Tue, 11 Jan 2022 04:53:04 +0000 Subject: [PATCH 02/73] Update imports and replace deprecated functionality --- .../org/icatproject/lucene/IcatAnalyzer.java | 21 ++++++++++---- .../java/org/icatproject/lucene/Lucene.java | 29 ++++++++++--------- 2 files changed, 32 insertions(+), 18 deletions(-) mode change 100644 => 100755 src/main/java/org/icatproject/lucene/IcatAnalyzer.java mode change 100644 => 100755 src/main/java/org/icatproject/lucene/Lucene.java diff --git a/src/main/java/org/icatproject/lucene/IcatAnalyzer.java b/src/main/java/org/icatproject/lucene/IcatAnalyzer.java old mode 100644 new mode 100755 index cb6767e..fcae1c9 --- a/src/main/java/org/icatproject/lucene/IcatAnalyzer.java +++ b/src/main/java/org/icatproject/lucene/IcatAnalyzer.java @@ -4,22 +4,33 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; -import org.apache.lucene.analysis.core.StopAnalyzer; import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.PorterStemFilter; -import org.apache.lucene.analysis.standard.StandardFilter; +// import org.apache.lucene.analysis.standard.StandardAnalyzer ; import org.apache.lucene.analysis.standard.StandardTokenizer; +// public class IcatAnalyzer extends Analyzer { + +// @Override +// protected TokenStreamComponents createComponents(String fieldName) { +// StandardAnalyzer analyzer = new StandardAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); +// Analyzer.TokenStreamComponents stream = analyzer.createComponents(fieldName); +// sink = new EnglishPossessiveFilter(stream.getTokenStream()); +// sink = new PorterStemFilter(sink); +// return new TokenStreamComponents(source, sink); +// } +// } + public class IcatAnalyzer extends Analyzer { @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer source = new StandardTokenizer(); - TokenStream sink = new StandardFilter(source); - sink = new EnglishPossessiveFilter(sink); + TokenStream sink = new EnglishPossessiveFilter(source); sink = new LowerCaseFilter(sink); - sink = new StopFilter(sink, StopAnalyzer.ENGLISH_STOP_WORDS_SET); + sink = new StopFilter(sink, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); sink = new PorterStemFilter(sink); return new TokenStreamComponents(source, sink); } diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java old mode 100644 new mode 100755 index 
2015323..9c55eca --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -41,9 +41,10 @@ import javax.ws.rs.core.MediaType; import org.apache.lucene.document.Document; -import org.apache.lucene.document.DoubleField; +import org.apache.lucene.document.DoublePoint; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.IndexWriter; @@ -57,7 +58,6 @@ import org.apache.lucene.search.BooleanQuery.Builder; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.NumericRangeQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.SearcherManager; @@ -85,7 +85,7 @@ enum AttributeName { } enum FieldType { - TextField, StringField, SortedDocValuesField, DoubleField + TextField, StringField, SortedDocValuesField, DoublePoint } private class IndexBucket { @@ -238,7 +238,7 @@ private void add(HttpServletRequest request, String entityName, When when, JsonP long num = parser.getLong(); if (fType == FieldType.SortedDocValuesField) { value = Long.toString(num); - } else if (fType == FieldType.DoubleField) { + } else if (fType == FieldType.DoublePoint) { dvalue = parser.getBigDecimal().doubleValue(); } else { throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, @@ -262,8 +262,11 @@ private void add(HttpServletRequest request, String entityName, When when, JsonP doc.add(new StringField(name, value, store)); } else if (fType == FieldType.SortedDocValuesField) { doc.add(new SortedDocValuesField(name, new BytesRef(value))); - } else if (fType == FieldType.DoubleField) { - doc.add(new DoubleField(name, dvalue, store)); + } else if (fType == FieldType.DoublePoint) { + doc.add(new DoublePoint(name, dvalue)); + if (store == Store.YES) { + doc.add(new StoredField(name, dvalue)); + } } } else if (ev == Event.END_ARRAY) { if (id == null) { @@ -353,7 +356,7 @@ public void commit() throws LuceneException { bucket.indexWriter.commit(); if (cached != 0) { logger.debug("Synch has committed {} {} changes to Lucene - now have {} documents indexed", - cached, entry.getKey(), bucket.indexWriter.numDocs()); + cached, entry.getKey(), bucket.indexWriter.getDocStats().numDocs); } bucket.searcherManager.maybeRefreshBlocking(); } @@ -379,10 +382,10 @@ private IndexBucket createBucket(String name) { iwriter.commit(); iwriter.deleteDocuments(new Term("dummy", "dummy")); iwriter.commit(); - logger.debug("Now have " + iwriter.numDocs() + " documents indexed"); + logger.debug("Now have " + iwriter.getDocStats().numDocs + " documents indexed"); } bucket.indexWriter = iwriter; - bucket.searcherManager = new SearcherManager(iwriter, false, null); + bucket.searcherManager = new SearcherManager(iwriter, false, false, null); logger.debug("Bucket for {} is now ready", name); return bucket; } catch (Throwable e) { @@ -791,7 +794,7 @@ private String luceneSearchResult(String name, Search search, int maxResults, Lo TopDocs topDocs = search.lastDoc == null ? 
isearcher.search(search.query, maxResults) : isearcher.searchAfter(search.lastDoc, search.query, maxResults); ScoreDoc[] hits = topDocs.scoreDocs; - logger.debug("Hits " + topDocs.totalHits + " maxscore " + topDocs.getMaxScore()); + logger.debug("Hits " + topDocs.totalHits + " maxscore " + topDocs.scoreDocs[0].score); ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (JsonGenerator gen = Json.createGenerator(baos)) { gen.writeStartObject(); @@ -850,8 +853,8 @@ private Builder parseParameter(JsonValue p) { new BytesRef(pUpperDateValue), true, true), Occur.MUST); } else if (pLowerNumericValue != null && pUpperNumericValue != null) { - paramQuery.add(NumericRangeQuery.newDoubleRange("numericValue", pLowerNumericValue, pUpperNumericValue, - true, true), Occur.MUST); + paramQuery.add(DoublePoint.newRangeQuery("numericValue", pLowerNumericValue, pUpperNumericValue), + Occur.MUST); } return paramQuery; } @@ -870,7 +873,7 @@ public void unlock(@PathParam("entityName") String entityName) throws LuceneExce bucket.indexWriter.commit(); if (cached != 0) { logger.debug("Unlock has committed {} {} changes to Lucene - now have {} documents indexed", cached, - entityName, bucket.indexWriter.numDocs()); + entityName, bucket.indexWriter.getDocStats().numDocs); } bucket.searcherManager.maybeRefreshBlocking(); } catch (IOException e) { From 60b659b7874b444e40c748bf5d5c844ceb4e5045 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 12 Jan 2022 17:14:30 +0000 Subject: [PATCH 03/73] Enable basic sorted set facets #19 --- .../java/org/icatproject/lucene/Lucene.java | 491 ++++++++++++------ src/test/java/icat/lucene/TestLucene.java | 123 +++++ 2 files changed, 445 insertions(+), 169 deletions(-) mode change 100644 => 100755 src/test/java/icat/lucene/TestLucene.java diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 11b1d5b..17f48f4 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -8,6 +8,7 @@ import java.nio.file.Files; import java.util.Comparator; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Timer; @@ -47,9 +48,20 @@ import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; +import org.apache.lucene.facet.FacetResult; +import org.apache.lucene.facet.Facets; +import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField; +import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.ReaderManager; import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.flexible.core.QueryNodeException; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler; import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys; @@ -85,18 +97,20 @@ enum AttributeName { } enum FieldType { - TextField, StringField, SortedDocValuesField, DoublePoint + TextField, 
StringField, SortedDocValuesField, DoublePoint, SortedSetDocValuesFacetField, } private class IndexBucket { private FSDirectory directory; private IndexWriter indexWriter; + private ReaderManager readerManager; private SearcherManager searcherManager; private AtomicBoolean locked = new AtomicBoolean(); } public class Search { - public Map map; + public Map readerMap; + public Map searcherMap; public Query query; public ScoreDoc lastDoc; } @@ -109,6 +123,8 @@ enum When { private static final Marker fatal = MarkerFactory.getMarker("FATAL"); + private final FacetsConfig facetsConfig = new FacetsConfig(); + private java.nio.file.Path luceneDirectory; private int luceneCommitMillis; @@ -262,6 +278,8 @@ private void add(HttpServletRequest request, String entityName, When when, JsonP doc.add(new StringField(name, value, store)); } else if (fType == FieldType.SortedDocValuesField) { doc.add(new SortedDocValuesField(name, new BytesRef(value))); + } else if (fType == FieldType.SortedSetDocValuesFacetField) { + doc.add(new SortedSetDocValuesFacetField(name, value)); } else if (fType == FieldType.DoublePoint) { doc.add(new DoublePoint(name, dvalue)); if (store == Store.YES) { @@ -274,13 +292,13 @@ private void add(HttpServletRequest request, String entityName, When when, JsonP throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + entityName); } - bucket.indexWriter.addDocument(doc); + bucket.indexWriter.addDocument(facetsConfig.build(doc)); } else { if (bucket.locked.get()) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + entityName); } - bucket.indexWriter.updateDocument(new Term("id", id.toString()), doc); + bucket.indexWriter.updateDocument(new Term("id", id.toString()), facetsConfig.build(doc)); } return; } else { @@ -358,6 +376,7 @@ public void commit() throws LuceneException { logger.debug("Synch has committed {} {} changes to Lucene - now have {} documents indexed", cached, entry.getKey(), bucket.indexWriter.getDocStats().numDocs); } + bucket.readerManager.maybeRefreshBlocking(); bucket.searcherManager.maybeRefreshBlocking(); } } @@ -385,6 +404,7 @@ private IndexBucket createBucket(String name) { logger.debug("Now have " + iwriter.getDocStats().numDocs + " documents indexed"); } bucket.indexWriter = iwriter; + bucket.readerManager = new ReaderManager(iwriter, false, false); bucket.searcherManager = new SearcherManager(iwriter, false, false, null); logger.debug("Bucket for {} is now ready", name); return bucket; @@ -404,56 +424,7 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("maxRes Long uid = null; try { uid = bucketNum.getAndIncrement(); - Search search = new Search(); - searches.put(uid, search); - Map map = new HashMap<>(); - search.map = map; - - try (JsonReader r = Json.createReader(request.getInputStream())) { - JsonObject o = r.readObject(); - String userName = o.getString("user", null); - - BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - - if (userName != null) { - Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", - new TermQuery(new Term("name", userName)), getSearcher(map, "InvestigationUser"), - ScoreMode.None); - - Query invQuery = JoinUtil.createJoinQuery("id", false, "investigation", iuQuery, - getSearcher(map, "Investigation"), ScoreMode.None); - - Query dsQuery = JoinUtil.createJoinQuery("id", false, "dataset", invQuery, - getSearcher(map, "Dataset"), ScoreMode.None); - - theQuery.add(dsQuery, Occur.MUST); - } - - String text = 
o.getString("text", null); - if (text != null) { - theQuery.add(parser.parse(text, "text"), Occur.MUST); - } - - String lower = o.getString("lower", null); - String upper = o.getString("upper", null); - if (lower != null && upper != null) { - theQuery.add(new TermRangeQuery("date", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - } - - if (o.containsKey("params")) { - JsonArray params = o.getJsonArray("params"); - IndexSearcher datafileParameterSearcher = getSearcher(map, "DatafileParameter"); - for (JsonValue p : params) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("datafile", false, "id", paramQuery.build(), - datafileParameterSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } - } - search.query = maybeEmptyQuery(theQuery); - } - + Search search = datafilesQuery(request, uid); return luceneSearchResult("Datafile", search, maxResults, uid); } catch (Exception e) { logger.error("Error", e); @@ -483,61 +454,87 @@ public String datafilesAfter(@PathParam("uid") long uid, @QueryParam("maxResults @POST @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) - @Path("datasets") - public String datasets(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults) - throws LuceneException { - + @Path("datafiles/facet") + public String datafilesFacet(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults, + @QueryParam("maxLabels") int maxLabels) throws LuceneException { Long uid = null; try { uid = bucketNum.getAndIncrement(); - Search search = new Search(); - searches.put(uid, search); - Map map = new HashMap<>(); - search.map = map; - try (JsonReader r = Json.createReader(request.getInputStream())) { - JsonObject o = r.readObject(); - String userName = o.getString("user", null); + Search search = datafilesQuery(request, uid); + return luceneFacetResult("Datafile", search, maxResults, maxLabels, uid); + } catch (Exception e) { + logger.error("Error", e); + freeSearcher(uid); + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } - BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); + } - if (userName != null) { + private Search datafilesQuery(HttpServletRequest request, Long uid) throws IOException, QueryNodeException { + Search search = new Search(); + searches.put(uid, search); + Map searcherMap = new HashMap<>(); + Map readerMap = new HashMap<>(); + search.searcherMap = searcherMap; + search.readerMap = readerMap; + try (JsonReader r = Json.createReader(request.getInputStream())) { + JsonObject o = r.readObject(); + String userName = o.getString("user", null); - Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", - new TermQuery(new Term("name", userName)), getSearcher(map, "InvestigationUser"), - ScoreMode.None); + BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - Query invQuery = JoinUtil.createJoinQuery("id", false, "investigation", iuQuery, - getSearcher(map, "Investigation"), ScoreMode.None); + if (userName != null) { + Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", + new TermQuery(new Term("name", userName)), getSearcher(searcherMap, "InvestigationUser"), + ScoreMode.None); - theQuery.add(invQuery, Occur.MUST); - } + Query invQuery = JoinUtil.createJoinQuery("id", false, "investigation", iuQuery, + getSearcher(searcherMap, "Investigation"), ScoreMode.None); - String text = o.getString("text", null); - if (text != null) { - 
theQuery.add(parser.parse(text, "text"), Occur.MUST); - } + Query dsQuery = JoinUtil.createJoinQuery("id", false, "dataset", invQuery, + getSearcher(searcherMap, "Dataset"), ScoreMode.None); - String lower = o.getString("lower", null); - String upper = o.getString("upper", null); - if (lower != null && upper != null) { - theQuery.add(new TermRangeQuery("startDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - theQuery.add(new TermRangeQuery("endDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - } + theQuery.add(dsQuery, Occur.MUST); + } - if (o.containsKey("params")) { - JsonArray params = o.getJsonArray("params"); - IndexSearcher datasetParameterSearcher = getSearcher(map, "DatasetParameter"); - for (JsonValue p : params) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("dataset", false, "id", paramQuery.build(), - datasetParameterSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } + String text = o.getString("text", null); + if (text != null) { + theQuery.add(parser.parse(text, "text"), Occur.MUST); + } + + String lower = o.getString("lower", null); + String upper = o.getString("upper", null); + if (lower != null && upper != null) { + theQuery.add(new TermRangeQuery("date", new BytesRef(lower), new BytesRef(upper), true, true), + Occur.MUST); + } + + if (o.containsKey("params")) { + JsonArray params = o.getJsonArray("params"); + IndexSearcher datafileParameterSearcher = getSearcher(searcherMap, "DatafileParameter"); + for (JsonValue p : params) { + BooleanQuery.Builder paramQuery = parseParameter(p); + Query toQuery = JoinUtil.createJoinQuery("datafile", false, "id", paramQuery.build(), + datafileParameterSearcher, ScoreMode.None); + theQuery.add(toQuery, Occur.MUST); } - search.query = maybeEmptyQuery(theQuery); } + search.query = maybeEmptyQuery(theQuery); + } + return search; + } + + @POST + @Consumes(MediaType.APPLICATION_JSON) + @Produces(MediaType.APPLICATION_JSON) + @Path("datasets") + public String datasets(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults) + throws LuceneException { + + Long uid = null; + try { + uid = bucketNum.getAndIncrement(); + Search search = datasetsQuery(request, uid); return luceneSearchResult("Dataset", search, maxResults, uid); } catch (Exception e) { logger.error("Error", e); @@ -565,6 +562,79 @@ public String datasetsAfter(@PathParam("uid") long uid, @QueryParam("maxResults" } } + @POST + @Consumes(MediaType.APPLICATION_JSON) + @Produces(MediaType.APPLICATION_JSON) + @Path("datasets/facet") + public String datasetsFacet(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults, + @QueryParam("maxLabels") int maxLabels) throws LuceneException { + Long uid = null; + try { + uid = bucketNum.getAndIncrement(); + Search search = datasetsQuery(request, uid); + return luceneFacetResult("Dataset", search, maxResults, maxLabels, uid); + } catch (Exception e) { + logger.error("Error", e); + freeSearcher(uid); + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } + + } + + private Search datasetsQuery(HttpServletRequest request, Long uid) throws IOException, QueryNodeException { + Search search = new Search(); + searches.put(uid, search); + Map searcherMap = new HashMap<>(); + Map readerMap = new HashMap<>(); + search.searcherMap = searcherMap; + search.readerMap = readerMap; + try (JsonReader r = Json.createReader(request.getInputStream())) { + JsonObject o = 
r.readObject(); + String userName = o.getString("user", null); + + BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); + + if (userName != null) { + + Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", + new TermQuery(new Term("name", userName)), getSearcher(searcherMap, "InvestigationUser"), + ScoreMode.None); + + Query invQuery = JoinUtil.createJoinQuery("id", false, "investigation", iuQuery, + getSearcher(searcherMap, "Investigation"), ScoreMode.None); + + theQuery.add(invQuery, Occur.MUST); + } + + String text = o.getString("text", null); + if (text != null) { + theQuery.add(parser.parse(text, "text"), Occur.MUST); + } + + String lower = o.getString("lower", null); + String upper = o.getString("upper", null); + if (lower != null && upper != null) { + theQuery.add(new TermRangeQuery("startDate", new BytesRef(lower), new BytesRef(upper), true, true), + Occur.MUST); + theQuery.add(new TermRangeQuery("endDate", new BytesRef(lower), new BytesRef(upper), true, true), + Occur.MUST); + } + + if (o.containsKey("params")) { + JsonArray params = o.getJsonArray("params"); + IndexSearcher datasetParameterSearcher = getSearcher(searcherMap, "DatasetParameter"); + for (JsonValue p : params) { + BooleanQuery.Builder paramQuery = parseParameter(p); + Query toQuery = JoinUtil.createJoinQuery("dataset", false, "id", paramQuery.build(), + datasetParameterSearcher, ScoreMode.None); + theQuery.add(toQuery, Occur.MUST); + } + } + search.query = maybeEmptyQuery(theQuery); + } + return search; + } + @PreDestroy private void exit() { logger.info("Closing down icat.lucene"); @@ -576,6 +646,7 @@ private void exit() { try { for (Entry entry : indexBuckets.entrySet()) { IndexBucket bucket = entry.getValue(); + bucket.readerManager.close(); bucket.searcherManager.close(); bucket.indexWriter.commit(); bucket.indexWriter.close(); @@ -592,7 +663,18 @@ private void exit() { public void freeSearcher(@PathParam("uid") Long uid) throws LuceneException { if (uid != null) { // May not be set for internal calls logger.debug("Requesting freeSearcher {}", uid); - Map search = searches.get(uid).map; + Map search = searches.get(uid).searcherMap; + Map read = searches.get(uid).readerMap; + for (Entry entry : read.entrySet()) { + String name = entry.getKey(); + DirectoryReader directoryReader = entry.getValue(); + ReaderManager manager = indexBuckets.computeIfAbsent(name, k -> createBucket(k)).readerManager; + try { + manager.release(directoryReader); + } catch (IOException e) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } + } for (Entry entry : search.entrySet()) { String name = entry.getKey(); IndexSearcher isearcher = entry.getValue(); @@ -607,6 +689,16 @@ public void freeSearcher(@PathParam("uid") Long uid) throws LuceneException { } } + private DirectoryReader getReader(Map bucket, String name) throws IOException { + DirectoryReader directoryReader = bucket.get(name); + if (directoryReader == null) { + directoryReader = indexBuckets.computeIfAbsent(name, k -> createBucket(k)).readerManager.acquire(); + bucket.put(name, directoryReader); + logger.debug("Remember searcher for {}", name); + } + return directoryReader; + } + /* * Need a new set of IndexSearchers for each search as identified by a uid */ @@ -672,83 +764,13 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("m Long uid = null; try { uid = bucketNum.getAndIncrement(); - Search search = new Search(); - searches.put(uid, search); - Map map = new HashMap<>(); - 
search.map = map; - try (JsonReader r = Json.createReader(request.getInputStream())) { - JsonObject o = r.readObject(); - String userName = o.getString("user", null); - - BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - - if (userName != null) { - Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", - new TermQuery(new Term("name", userName)), getSearcher(map, "InvestigationUser"), - ScoreMode.None); - theQuery.add(iuQuery, Occur.MUST); - } - - String text = o.getString("text", null); - if (text != null) { - theQuery.add(parser.parse(text, "text"), Occur.MUST); - } - - String lower = o.getString("lower", null); - String upper = o.getString("upper", null); - if (lower != null && upper != null) { - theQuery.add(new TermRangeQuery("startDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - theQuery.add(new TermRangeQuery("endDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - } - - if (o.containsKey("params")) { - JsonArray params = o.getJsonArray("params"); - IndexSearcher investigationParameterSearcher = getSearcher(map, "InvestigationParameter"); - - for (JsonValue p : params) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", paramQuery.build(), - investigationParameterSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } - } - - if (o.containsKey("samples")) { - JsonArray samples = o.getJsonArray("samples"); - IndexSearcher sampleSearcher = getSearcher(map, "Sample"); - - for (JsonValue s : samples) { - JsonString sample = (JsonString) s; - BooleanQuery.Builder sampleQuery = new BooleanQuery.Builder(); - sampleQuery.add(parser.parse(sample.getString(), "text"), Occur.MUST); - Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", sampleQuery.build(), - sampleSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } - } - - String userFullName = o.getString("userFullName", null); - if (userFullName != null) { - BooleanQuery.Builder userFullNameQuery = new BooleanQuery.Builder(); - userFullNameQuery.add(parser.parse(userFullName, "text"), Occur.MUST); - IndexSearcher investigationUserSearcher = getSearcher(map, "InvestigationUser"); - Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", userFullNameQuery.build(), - investigationUserSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } - - search.query = maybeEmptyQuery(theQuery); - } - logger.info("Query: {}", search.query); + Search search = investigationsQuery(request, uid); return luceneSearchResult("Investigation", search, maxResults, uid); } catch (Exception e) { logger.error("Error", e); freeSearcher(uid); throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } - } @GET @@ -769,6 +791,100 @@ public String investigationsAfter(@PathParam("uid") long uid, @QueryParam("maxRe } } + @POST + @Consumes(MediaType.APPLICATION_JSON) + @Produces(MediaType.APPLICATION_JSON) + @Path("investigations/facet") + public String investigationsFacet(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults, + @QueryParam("maxLabels") int maxLabels) throws LuceneException { + Long uid = null; + try { + uid = bucketNum.getAndIncrement(); + Search search = investigationsQuery(request, uid); + return luceneFacetResult("Investigation", search, maxResults, maxLabels, uid); + } catch (Exception e) { + logger.error("Error", e); + freeSearcher(uid); + throw new 
LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } + } + + private Search investigationsQuery(HttpServletRequest request, Long uid) throws IOException, QueryNodeException { + Search search = new Search(); + searches.put(uid, search); + Map searcherMap = new HashMap<>(); + Map readerMap = new HashMap<>(); + search.searcherMap = searcherMap; + search.readerMap = readerMap; + try (JsonReader r = Json.createReader(request.getInputStream())) { + JsonObject o = r.readObject(); + String userName = o.getString("user", null); + + BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); + + if (userName != null) { + Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", + new TermQuery(new Term("name", userName)), getSearcher(searcherMap, "InvestigationUser"), + ScoreMode.None); + theQuery.add(iuQuery, Occur.MUST); + } + + String text = o.getString("text", null); + if (text != null) { + theQuery.add(parser.parse(text, "text"), Occur.MUST); + } + + String lower = o.getString("lower", null); + String upper = o.getString("upper", null); + if (lower != null && upper != null) { + theQuery.add(new TermRangeQuery("startDate", new BytesRef(lower), new BytesRef(upper), true, true), + Occur.MUST); + theQuery.add(new TermRangeQuery("endDate", new BytesRef(lower), new BytesRef(upper), true, true), + Occur.MUST); + } + + if (o.containsKey("params")) { + JsonArray params = o.getJsonArray("params"); + IndexSearcher investigationParameterSearcher = getSearcher(searcherMap, "InvestigationParameter"); + + for (JsonValue p : params) { + BooleanQuery.Builder paramQuery = parseParameter(p); + Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", paramQuery.build(), + investigationParameterSearcher, ScoreMode.None); + theQuery.add(toQuery, Occur.MUST); + } + } + + if (o.containsKey("samples")) { + JsonArray samples = o.getJsonArray("samples"); + IndexSearcher sampleSearcher = getSearcher(searcherMap, "Sample"); + + for (JsonValue s : samples) { + JsonString sample = (JsonString) s; + BooleanQuery.Builder sampleQuery = new BooleanQuery.Builder(); + sampleQuery.add(parser.parse(sample.getString(), "text"), Occur.MUST); + Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", sampleQuery.build(), + sampleSearcher, ScoreMode.None); + theQuery.add(toQuery, Occur.MUST); + } + } + + String userFullName = o.getString("userFullName", null); + if (userFullName != null) { + BooleanQuery.Builder userFullNameQuery = new BooleanQuery.Builder(); + userFullNameQuery.add(parser.parse(userFullName, "text"), Occur.MUST); + IndexSearcher investigationUserSearcher = getSearcher(searcherMap, "InvestigationUser"); + Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", userFullNameQuery.build(), + investigationUserSearcher, ScoreMode.None); + theQuery.add(toQuery, Occur.MUST); + } + + search.query = maybeEmptyQuery(theQuery); + } + logger.info("Query: {}", search.query); + return search; + } + @POST @Path("lock/{entityName}") public void lock(@PathParam("entityName") String entityName) throws LuceneException { @@ -785,8 +901,44 @@ public void lock(@PathParam("entityName") String entityName) throws LuceneExcept } } + private String luceneFacetResult(String name, Search search, int maxResults, int maxLabels, Long uid) + throws IOException { + IndexSearcher isearcher = getSearcher(search.searcherMap, name); + DirectoryReader directoryReader = getReader(search.readerMap, name); + logger.debug("To facet in {} for {} {} with {} from {} ", name, 
search.query, maxResults, isearcher, + search.lastDoc); + DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(directoryReader); + FacetsCollector facetsCollector = new FacetsCollector(); + FacetsCollector.search(isearcher, search.query, maxResults, facetsCollector); + Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); + List results = facets.getAllDims(maxLabels); + logger.debug("Facets found for " + results.size() + " dimensions"); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (JsonGenerator gen = Json.createGenerator(baos)) { + gen.writeStartObject(); + if (uid != null) { + gen.write("uid", uid); + } + gen.writeStartArray("facets"); // array of all facet dimensions + for (FacetResult result : results) { + gen.writeStartArray(result.dim); // array of labelValues for a given dimension + for (LabelAndValue labelValue : result.labelValues) { + gen.writeStartArray("labelValue"); // 2 element array of label, value + gen.write(labelValue.label); + gen.write(labelValue.value.longValue()); + gen.writeEnd(); // array of label, value + } + gen.writeEnd(); // array of labelValues for a given dimension + } + gen.writeEnd(); // array of facet dimensions + gen.writeEnd(); // object + } + logger.debug("Json returned {}", baos.toString()); + return baos.toString(); + } + private String luceneSearchResult(String name, Search search, int maxResults, Long uid) throws IOException { - IndexSearcher isearcher = getSearcher(search.map, name); + IndexSearcher isearcher = getSearcher(search.searcherMap, name); logger.debug("To search in {} for {} {} with {} from {} ", name, search.query, maxResults, isearcher, search.lastDoc); TopDocs topDocs = search.lastDoc == null ? isearcher.search(search.query, maxResults) @@ -874,6 +1026,7 @@ public void unlock(@PathParam("entityName") String entityName) throws LuceneExce entityName, bucket.indexWriter.getDocStats().numDocs); } bucket.searcherManager.maybeRefreshBlocking(); + bucket.readerManager.maybeRefreshBlocking(); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } diff --git a/src/test/java/icat/lucene/TestLucene.java b/src/test/java/icat/lucene/TestLucene.java old mode 100644 new mode 100755 index f5cd493..9624f52 --- a/src/test/java/icat/lucene/TestLucene.java +++ b/src/test/java/icat/lucene/TestLucene.java @@ -9,13 +9,23 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.Arrays; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; +import org.apache.lucene.facet.FacetResult; +import org.apache.lucene.facet.Facets; +import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsConfig; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.StringField; import org.apache.lucene.index.DirectoryReader; @@ -45,6 +55,8 @@ public class TestLucene { static final int scale = (int) 1.0e5; + private final FacetsConfig 
facetsConfig = new FacetsConfig(); + @Test public void testIcatAnalyzer() throws Exception { final String text = "This is a demo of the 1st (or is it number 2) all singing and dancing TokenStream's API with added aardvarks"; @@ -171,6 +183,57 @@ public void testJoins() throws Exception { System.out.println("Join tests took " + (System.currentTimeMillis() - start) + "ms"); } + @Test + public void testFacets() throws Exception { + Analyzer analyzer = new IcatAnalyzer(); + IndexWriterConfig config; + + Path tmpLuceneDir = Files.createTempDirectory("lucene"); + FSDirectory investigationDirectory = FSDirectory.open(tmpLuceneDir.resolve("Investigation")); + config = new IndexWriterConfig(analyzer); + config.setOpenMode(OpenMode.CREATE); + IndexWriter investigationWriter = new IndexWriter(investigationDirectory, config); + + // Add investigations with parameter and sample Facets + addFacetedInvestigation(investigationWriter, "inv1", 101, "parameter1", "sample1"); + addFacetedInvestigation(investigationWriter, "inv2", 102, "parameter2", "sample2"); + + // Add investigations with only the parameter Facet + for (int i = 0; i < scale; i++) { + addFacetedInvestigation(investigationWriter, "extra" + i, 500 + i, "parameter0"); + } + + investigationWriter.close(); + + DirectoryReader directoryReader = DirectoryReader.open(investigationDirectory); + IndexSearcher investigationSearcher = new IndexSearcher(directoryReader); + StandardQueryParser parser = new StandardQueryParser(); + StandardQueryConfigHandler qpConf = (StandardQueryConfigHandler) parser.getQueryConfigHandler(); + qpConf.set(ConfigurationKeys.ANALYZER, analyzer); + qpConf.set(ConfigurationKeys.ALLOW_LEADING_WILDCARD, true); + Map labelValuesParameter = new HashMap<>(); + Map labelValuesSample = new HashMap<>(); + + long start = System.currentTimeMillis(); + + // Get Facets that are relevant for "inv1" + labelValuesParameter.put("parameter1", 1); + labelValuesSample.put("sample1", 1); + checkFacets(labelValuesParameter, labelValuesSample, "inv1", investigationSearcher, directoryReader, parser); + + // Get Facets that are relevant for "inv*" + labelValuesParameter.put("parameter2", 1); + labelValuesSample.put("sample2", 1); + checkFacets(labelValuesParameter, labelValuesSample, "inv*", investigationSearcher, directoryReader, parser); + + // Get all Facets for "*" + labelValuesParameter.put("parameter0", scale); + checkFacets(labelValuesParameter, labelValuesSample, "*", investigationSearcher, directoryReader, parser); + + System.out.println("Facet tests took " + (System.currentTimeMillis() - start) + "ms"); + } + + private void checkDatafiles(List dnums, String fname, String uname, IndexSearcher investigationSearcher, IndexSearcher investigationUserSearcher, IndexSearcher datasetSearcher, IndexSearcher datafileSearcher, StandardQueryParser parser) throws IOException, QueryNodeException { @@ -253,6 +316,20 @@ private ScoreDoc[] get(String iname, String uname, IndexSearcher investigationSe } + /* Facets */ + private Facets get(String iname, IndexSearcher investigationSearcher, DirectoryReader directoryReader, + StandardQueryParser parser) throws QueryNodeException, IOException { + BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); + if (iname != null) { + theQuery.add(parser.parse(iname, "name"), Occur.MUST); + } + DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(directoryReader); + FacetsCollector facetsCollector = new FacetsCollector(); + FacetsCollector.search(investigationSearcher, 
theQuery.build(), 50, facetsCollector); + Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); + return facets; + } + private void checkDatasets(List dnums, String sname, String uname, IndexSearcher investigationSearcher, IndexSearcher investigationUserSearcher, IndexSearcher datasetSearcher, StandardQueryParser parser) throws IOException, QueryNodeException { @@ -265,6 +342,31 @@ private void checkDatasets(List dnums, String sname, String uname, Inde } + private void checkFacets(Map labelValuesParameter, Map labelValuesSample, + String iname, IndexSearcher investigationSearcher, DirectoryReader directoryReader, + StandardQueryParser parser) throws QueryNodeException, IOException { + Facets facets = get(iname, investigationSearcher, directoryReader, parser); + List results = facets.getAllDims(50); + if (labelValuesParameter.size() > 0) { + FacetResult parameterResult = results.remove(0); + assertEquals("Dimension", "parameter", parameterResult.dim); + assertEquals("Length", labelValuesParameter.size(), parameterResult.labelValues.length); + for (LabelAndValue labelValue : parameterResult.labelValues) { + assertTrue("Label", labelValuesParameter.containsKey(labelValue.label)); + assertEquals("Value", labelValuesParameter.get(labelValue.label), labelValue.value); + } + } + if (labelValuesSample.size() > 0) { + FacetResult sampleResult = results.remove(0); + assertEquals("Dimension", "sample", sampleResult.dim); + assertEquals("Length", labelValuesSample.size(), sampleResult.labelValues.length); + for (LabelAndValue labelValue : sampleResult.labelValues) { + assertTrue("Label", labelValuesSample.containsKey(labelValue.label)); + assertEquals("Value", labelValuesSample.get(labelValue.label), labelValue.value); + } + } + } + private void checkInvestigations(List dnums, String iname, String uname, IndexSearcher investigationSearcher, IndexSearcher investigationUserSearcher, StandardQueryParser parser) throws QueryNodeException, IOException { @@ -285,6 +387,27 @@ private void addInvestigation(IndexWriter iwriter, String name, long iNum) throw iwriter.addDocument(doc); } + private void addFacetedInvestigation(IndexWriter iwriter, String name, long iNum, String parameterValue, + String sampleValue) throws IOException { + Document doc = new Document(); + doc.add(new StringField("name", name, Store.NO)); + doc.add(new SortedDocValuesField("id", new BytesRef(Long.toString(iNum)))); + doc.add(new StringField("id", Long.toString(iNum), Store.YES)); + doc.add(new SortedSetDocValuesFacetField("parameter", parameterValue)); + doc.add(new SortedSetDocValuesFacetField("sample", sampleValue)); + iwriter.addDocument(facetsConfig.build(doc)); + } + + private void addFacetedInvestigation(IndexWriter iwriter, String name, long iNum, String parameterValue) + throws IOException { + Document doc = new Document(); + doc.add(new StringField("name", name, Store.NO)); + doc.add(new SortedDocValuesField("id", new BytesRef(Long.toString(iNum)))); + doc.add(new StringField("id", Long.toString(iNum), Store.YES)); + doc.add(new SortedSetDocValuesFacetField("parameter", parameterValue)); + iwriter.addDocument(facetsConfig.build(doc)); + } + private void addInvestigationUser(IndexWriter iwriter, String name, long iNum) throws IOException { Document doc = new Document(); doc.add(new StringField("name", name, Store.NO)); From f3b1dff919ec668bc6ac18801be4f731a0d2ce65 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 12 Jan 2022 17:28:54 +0000 Subject: [PATCH 04/73] Update pom.xml with Facets #19 --- 
pom.xml | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) mode change 100644 => 100755 pom.xml diff --git a/pom.xml b/pom.xml old mode 100644 new mode 100755 index a0b5bad..d6ec96c --- a/pom.xml +++ b/pom.xml @@ -86,6 +86,12 @@ ${luceneVersion} + + org.apache.lucene + lucene-facet + ${luceneVersion} + + javax javaee-api @@ -327,6 +333,3 @@ Exposes lucene calls to an icat server - - - From 1229a9a214fc5824dec995f96c0611b5bb4aaf46 Mon Sep 17 00:00:00 2001 From: Stuart Pullinger Date: Fri, 5 Jun 2020 11:15:57 +0000 Subject: [PATCH 05/73] Query on datafile date property. Fixes #8 --- src/main/java/org/icatproject/lucene/Lucene.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 9c55eca..11b1d5b 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -437,9 +437,7 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("maxRes String lower = o.getString("lower", null); String upper = o.getString("upper", null); if (lower != null && upper != null) { - theQuery.add(new TermRangeQuery("startDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - theQuery.add(new TermRangeQuery("endDate", new BytesRef(lower), new BytesRef(upper), true, true), + theQuery.add(new TermRangeQuery("date", new BytesRef(lower), new BytesRef(upper), true, true), Occur.MUST); } From 290ad81bae3bd9b991791090cad5e13d98c46dfe Mon Sep 17 00:00:00 2001 From: Matthew Richards Date: Tue, 24 Aug 2021 09:13:13 +0000 Subject: [PATCH 06/73] Update release notes for 1.1.1 release --- src/site/xhtml/release-notes.xhtml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/site/xhtml/release-notes.xhtml b/src/site/xhtml/release-notes.xhtml index c9246d4..5ffafa2 100644 --- a/src/site/xhtml/release-notes.xhtml +++ b/src/site/xhtml/release-notes.xhtml @@ -6,6 +6,9 @@

 	<h1>ICAT Lucene Server Release Notes</h1>
 
+	<h2>1.1.1</h2>
+	<p>Fixes date queries on datafiles</p>
+
 	<h2>1.1.0</h2>
 	<p>Make it work with icat.server 4.9.1 and bug fixes</p>

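The sorted-set faceting introduced in [PATCH 03/73] above, together with the lucene-facet dependency added in [PATCH 04/73], follows the standard Lucene facet round trip: facet fields are passed through FacetsConfig.build() at index time, then counted per dimension at search time with a FacetsCollector and SortedSetDocValuesFacetCounts. A minimal self-contained sketch of that pattern against the Lucene 8 API (the "parameter" dimension and its label are illustrative placeholders, not the ICAT schema):

import java.nio.file.Files;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.FSDirectory;

public class FacetSketch {
    public static void main(String[] args) throws Exception {
        FacetsConfig facetsConfig = new FacetsConfig();
        FSDirectory directory = FSDirectory.open(Files.createTempDirectory("facets"));

        // Index time: FacetsConfig.build translates the facet field into doc values
        try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new SortedSetDocValuesFacetField("parameter", "parameter1")); // hypothetical dimension/label
            writer.addDocument(facetsConfig.build(doc));
        }

        // Search time: collect matching docs, then count labels per dimension
        try (DirectoryReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(reader);
            FacetsCollector collector = new FacetsCollector();
            FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, collector);
            Facets facets = new SortedSetDocValuesFacetCounts(state, collector);
            for (FacetResult result : facets.getAllDims(10)) {
                System.out.println(result); // e.g. dim=parameter ... parameter1 (1)
            }
        }
    }
}

The build() call is the step that turns SortedSetDocValuesFacetField entries into the doc values that DefaultSortedSetDocValuesReaderState reads, which is why the patch wraps every addDocument and updateDocument in facetsConfig.build(doc).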
    From d80515e5bfaa022d429ac87ce2d293453a60245d Mon Sep 17 00:00:00 2001 From: Matthew Richards Date: Tue, 24 Aug 2021 09:36:55 +0000 Subject: [PATCH 07/73] [maven-release-plugin] prepare release v1.1.1 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 693a538..56c5141 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ org.icatproject icat.lucene - 1.1.1-SNAPSHOT + 1.1.1 war ICAT Lucene @@ -28,7 +28,7 @@ scm:git:${gitUrl}.git scm:git:${gitUrl}.git ${gitUrl} - HEAD + v1.1.1 From 7bf345959e83b9154d5479caedbb08445be21a26 Mon Sep 17 00:00:00 2001 From: Matthew Richards Date: Tue, 24 Aug 2021 09:43:12 +0000 Subject: [PATCH 08/73] [maven-release-plugin] prepare for next development iteration --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 56c5141..1eab5ad 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ org.icatproject icat.lucene - 1.1.1 + 1.1.2-SNAPSHOT war ICAT Lucene @@ -28,7 +28,7 @@ scm:git:${gitUrl}.git scm:git:${gitUrl}.git ${gitUrl} - v1.1.1 + HEAD From a44db6587943c697148ca0fb785ff63638524c42 Mon Sep 17 00:00:00 2001 From: Stuart Pullinger Date: Fri, 27 Aug 2021 11:29:52 +0000 Subject: [PATCH 09/73] [maven-release-plugin] prepare release v1.1.1 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 1eab5ad..56c5141 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ org.icatproject icat.lucene - 1.1.2-SNAPSHOT + 1.1.1 war ICAT Lucene @@ -28,7 +28,7 @@ scm:git:${gitUrl}.git scm:git:${gitUrl}.git ${gitUrl} - HEAD + v1.1.1 From 32f9fbeee24278a6300e18a2656c49f2dadb11c2 Mon Sep 17 00:00:00 2001 From: Stuart Pullinger Date: Tue, 14 Jan 2020 12:00:13 +0000 Subject: [PATCH 10/73] Converted setup to python 3 --- src/main/scripts/setup | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/scripts/setup b/src/main/scripts/setup index 6371b9f..b0f7f0c 100755 --- a/src/main/scripts/setup +++ b/src/main/scripts/setup @@ -27,14 +27,14 @@ if arg == "INSTALL": ovfiles = [[prop_name, "WEB-INF/classes"]] if os.path.exists("logback.xml"): ovfiles.append(["logback.xml", "WEB-INF/classes"]) actions.deploy(deploymentorder=80, files=ovfiles) - except Exception, e: + except Exception as e: abort(str(e)) if arg == "UNINSTALL": try: uninstall() - except Exception, e: + except Exception as e: abort(str(e)) From b902385ceba087bb9108b17ad00b627c3a319b85 Mon Sep 17 00:00:00 2001 From: Matthew Richards Date: Thu, 16 Sep 2021 09:17:00 +0000 Subject: [PATCH 11/73] Update icat.utils version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 56c5141..abfc732 100644 --- a/pom.xml +++ b/pom.xml @@ -95,7 +95,7 @@ org.icatproject icat.utils - 4.16.0 + 4.16.1 From 4416be4dfc6752ea877fdd1549631409348e37a3 Mon Sep 17 00:00:00 2001 From: Matthew Richards Date: Thu, 16 Sep 2021 09:25:18 +0000 Subject: [PATCH 12/73] Update version and release notes --- pom.xml | 2 +- src/site/xhtml/release-notes.xhtml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index abfc732..4d0f4cd 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ org.icatproject icat.lucene - 1.1.1 + 1.1.2 war ICAT Lucene diff --git a/src/site/xhtml/release-notes.xhtml b/src/site/xhtml/release-notes.xhtml index 5ffafa2..a8e1b76 100644 --- a/src/site/xhtml/release-notes.xhtml +++ b/src/site/xhtml/release-notes.xhtml @@ -5,6 +5,8 @@

 	<h1>ICAT Lucene Server Release Notes</h1>
 
+	<h2>1.1.2</h2>
+	<p>Changes to support Python 3. Now works on Python 2.7 and Python 3. Note: support for Python 2.6 is now dropped.</p>
 	<h2>1.1.1</h2>
 	<p>Fixes date queries on datafiles</p>

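The 1.1.1 fix noted above ([PATCH 05/73]) collapses the copied startDate/endDate clauses into a single inclusive TermRangeQuery on the Datafile "date" field; since the dates are indexed as sortable strings, the range comparison is lexicographic. A minimal sketch of the resulting query construction (the bound values are hypothetical and assume a yyyyMMddHHmm-style encoding):

import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.util.BytesRef;

public class DateRangeSketch {
    public static void main(String[] args) {
        String lower = "201401010000"; // hypothetical lower bound
        String upper = "201412312359"; // hypothetical upper bound
        BooleanQuery.Builder theQuery = new BooleanQuery.Builder();
        // Single inclusive range on "date", as in PATCH 05
        theQuery.add(new TermRangeQuery("date", new BytesRef(lower), new BytesRef(upper), true, true),
                Occur.MUST);
        System.out.println(theQuery.build()); // +date:[201401010000 TO 201412312359]
    }
}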
    From 8a36fb0a4d88b6b8efeea6b392b53d6244b6d5ea Mon Sep 17 00:00:00 2001 From: Matthew Richards Date: Thu, 16 Sep 2021 09:32:31 +0000 Subject: [PATCH 13/73] Add snapshot to version --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 4d0f4cd..f907525 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ org.icatproject icat.lucene - 1.1.2 + 1.1.2-SNAPSHOT war ICAT Lucene From f0be663477de33791daa3238593aa583c3acb7de Mon Sep 17 00:00:00 2001 From: Matthew Richards Date: Thu, 16 Sep 2021 09:49:18 +0000 Subject: [PATCH 14/73] [maven-release-plugin] prepare release v1.1.2 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index f907525..a0b5bad 100644 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ org.icatproject icat.lucene - 1.1.2-SNAPSHOT + 1.1.2 war ICAT Lucene @@ -28,7 +28,7 @@ scm:git:${gitUrl}.git scm:git:${gitUrl}.git ${gitUrl} - v1.1.1 + v1.1.2 From 0ea77096c011c5147018028a0235ab788bf97987 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Tue, 11 Jan 2022 07:11:22 +0000 Subject: [PATCH 15/73] Replace travis.yml with ci-build.yml #13 --- .github/workflows/ci-build.yml | 29 +++++++++++++++++++++++++++++ .travis.yml | 16 ---------------- 2 files changed, 29 insertions(+), 16 deletions(-) create mode 100755 .github/workflows/ci-build.yml delete mode 100644 .travis.yml diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml new file mode 100755 index 0000000..c5121a5 --- /dev/null +++ b/.github/workflows/ci-build.yml @@ -0,0 +1,29 @@ +name: CI Build +on: + workflow_dispatch: + pull_request: + push: + branches: + - master + +strategy: + fail-fast: false + matrix: + version: [8] + experimental: [false] + include: + - version: 11 + experimental: true + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up JDK + uses: actions/setup-java@v2 + with: + java-version: ${{ matrix.version }} + distribution: 'open' + - name: Build with Maven + run: mvn install diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 3ba1242..0000000 --- a/.travis.yml +++ /dev/null @@ -1,16 +0,0 @@ -language: java -jdk: - - openjdk8 - - openjdk11 - -jobs: - allow_failures: - - jdk: openjdk11 - -dist: xenial - -cache: - directories: - - $HOME/.m2 - -install: true From df3f18a9cbbb69394b238152c8c00f8c9b6fdc39 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Tue, 11 Jan 2022 07:16:43 +0000 Subject: [PATCH 16/73] Update CI status badge for GHA #13 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) mode change 100644 => 100755 README.md diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 38eba26..b3357b2 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # icat.lucene -[![Build Status](https://travis-ci.org/icatproject/icat.lucene.svg?branch=master)](https://travis-ci.org/icatproject/icat.lucene) +[![Build Status](https://github.com/icatproject/icat.lucene/workflows/CI%20Build/badge.svg?branch=master)](https://github.com/icatproject/icat.lucene/actions?query=workflow%3A%22CI+Build%22) General installation instructions are at http://www.icatproject.org/installation/component From 093a0ff69151d1c39aa2cd0fed12b8b4c4a98372 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Tue, 11 Jan 2022 07:33:26 +0000 Subject: [PATCH 17/73] Move strategy matrix inside build #13 --- .github/workflows/ci-build.yml | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci-build.yml 
b/.github/workflows/ci-build.yml index c5121a5..52dca67 100755 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -6,18 +6,20 @@ on: branches: - master -strategy: - fail-fast: false - matrix: - version: [8] - experimental: [false] - include: - - version: 11 - experimental: true - jobs: build: runs-on: ubuntu-latest + continue-on-error: ${{ matrix.experimental }} + strategy: + fail-fast: false + matrix: + include: + - version: 8 + experimental: false + include: + - version: 11 + experimental: true + steps: - uses: actions/checkout@v2 - name: Set up JDK From e81defda208e0be1e731bb5accdd50b048b053fb Mon Sep 17 00:00:00 2001 From: patrick-austin <61705287+patrick-austin@users.noreply.github.com> Date: Fri, 21 Jan 2022 17:21:18 +0000 Subject: [PATCH 18/73] Remove redundant inclue #13 --- .github/workflows/ci-build.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index 52dca67..f7023e1 100755 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -16,7 +16,6 @@ jobs: include: - version: 8 experimental: false - include: - version: 11 experimental: true From aec760f84cfb427e39f2cf08d9b43c7d77f86398 Mon Sep 17 00:00:00 2001 From: patrick-austin <61705287+patrick-austin@users.noreply.github.com> Date: Fri, 21 Jan 2022 17:36:36 +0000 Subject: [PATCH 19/73] Change OpenJDK distribution #13 --- .github/workflows/ci-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index f7023e1..a30a9da 100755 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -25,6 +25,6 @@ jobs: uses: actions/setup-java@v2 with: java-version: ${{ matrix.version }} - distribution: 'open' + distribution: 'temurin' - name: Build with Maven run: mvn install From 3a4c301dbdf15350a93c00c4143b6dc4cfd58b02 Mon Sep 17 00:00:00 2001 From: patrick-austin <61705287+patrick-austin@users.noreply.github.com> Date: Mon, 24 Jan 2022 10:22:35 +0000 Subject: [PATCH 20/73] Change Maven command to "mvn test -B" #13 --- .github/workflows/ci-build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-build.yml b/.github/workflows/ci-build.yml index a30a9da..0f93ef8 100755 --- a/.github/workflows/ci-build.yml +++ b/.github/workflows/ci-build.yml @@ -26,5 +26,5 @@ jobs: with: java-version: ${{ matrix.version }} distribution: 'temurin' - - name: Build with Maven - run: mvn install + - name: Test with Maven + run: mvn test -B From b5b5d2d5ea29744441a15b3e05360bec4d8cac88 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 2 Feb 2022 15:30:22 +0000 Subject: [PATCH 21/73] Avoid index error for maxScore --- src/main/java/org/icatproject/lucene/Lucene.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 11b1d5b..737542b 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -792,7 +792,13 @@ private String luceneSearchResult(String name, Search search, int maxResults, Lo TopDocs topDocs = search.lastDoc == null ? 
isearcher.search(search.query, maxResults) : isearcher.searchAfter(search.lastDoc, search.query, maxResults); ScoreDoc[] hits = topDocs.scoreDocs; - logger.debug("Hits " + topDocs.totalHits + " maxscore " + topDocs.scoreDocs[0].score); + Float maxScore; + if (hits.length == 0) { + maxScore = Float.NaN; + } else { + maxScore = hits[0].score; + } + logger.debug("Hits " + topDocs.totalHits + " maxscore " + maxScore); ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (JsonGenerator gen = Json.createGenerator(baos)) { gen.writeStartObject(); From 3ecdaace1cfee990af3eed9fd6a8d762f2d49ff4 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Tue, 11 Jan 2022 16:15:37 +0000 Subject: [PATCH 22/73] Add synonym injection on search #16 --- .../org/icatproject/lucene/IcatAnalyzer.java | 34 +++-- .../lucene/IcatSynonymAnalyzer.java | 50 +++++++ .../java/org/icatproject/lucene/Lucene.java | 10 +- src/main/resources/synonym.txt | 124 ++++++++++++++++++ src/test/java/icat/lucene/TestLucene.java | 59 +++++++++ 5 files changed, 262 insertions(+), 15 deletions(-) create mode 100755 src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java create mode 100755 src/main/resources/synonym.txt mode change 100644 => 100755 src/test/java/icat/lucene/TestLucene.java diff --git a/src/main/java/org/icatproject/lucene/IcatAnalyzer.java b/src/main/java/org/icatproject/lucene/IcatAnalyzer.java index fcae1c9..d02a542 100755 --- a/src/main/java/org/icatproject/lucene/IcatAnalyzer.java +++ b/src/main/java/org/icatproject/lucene/IcatAnalyzer.java @@ -1,36 +1,42 @@ package org.icatproject.lucene; +import java.util.Arrays; +import java.util.List; + import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.CharArraySet; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.Tokenizer; import org.apache.lucene.analysis.core.LowerCaseFilter; import org.apache.lucene.analysis.core.StopFilter; -import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.PorterStemFilter; -// import org.apache.lucene.analysis.standard.StandardAnalyzer ; import org.apache.lucene.analysis.standard.StandardTokenizer; -// public class IcatAnalyzer extends Analyzer { - -// @Override -// protected TokenStreamComponents createComponents(String fieldName) { -// StandardAnalyzer analyzer = new StandardAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); -// Analyzer.TokenStreamComponents stream = analyzer.createComponents(fieldName); -// sink = new EnglishPossessiveFilter(stream.getTokenStream()); -// sink = new PorterStemFilter(sink); -// return new TokenStreamComponents(source, sink); -// } -// } public class IcatAnalyzer extends Analyzer { + public static final CharArraySet SCIENTIFIC_STOP_WORDS_SET; + + /** + * Do not include (As At Be In No) in the stop words as these are chemical + * symbols. 
Otherwise, the set should match Lucene's ENGLISH_STOP_WORDS_SET + */ + static { + final List stopWords = + Arrays.asList("a", "an", "and", "are", "but", "by", "for", "if", "into", "is", + "it", "not", "of", "on", "or", "such", "that", "the", "their", "then", + "there", "these", "they", "this", "to", "was", "will", "with"); + final CharArraySet stopSet = new CharArraySet(stopWords, false); + SCIENTIFIC_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); + } + @Override protected TokenStreamComponents createComponents(String fieldName) { Tokenizer source = new StandardTokenizer(); TokenStream sink = new EnglishPossessiveFilter(source); sink = new LowerCaseFilter(sink); - sink = new StopFilter(sink, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); + sink = new StopFilter(sink, SCIENTIFIC_STOP_WORDS_SET); sink = new PorterStemFilter(sink); return new TokenStreamComponents(source, sink); } diff --git a/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java b/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java new file mode 100755 index 0000000..82703be --- /dev/null +++ b/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java @@ -0,0 +1,50 @@ +package org.icatproject.lucene; + +import java.io.FileNotFoundException; +import java.io.BufferedReader; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.IOException; +import java.text.ParseException; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.Tokenizer; +import org.apache.lucene.analysis.core.LowerCaseFilter; +import org.apache.lucene.analysis.core.StopFilter; +import org.apache.lucene.analysis.en.EnglishPossessiveFilter; +import org.apache.lucene.analysis.en.PorterStemFilter; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.analysis.standard.StandardTokenizer; +import org.apache.lucene.analysis.synonym.SolrSynonymParser; +import org.apache.lucene.analysis.synonym.SynonymGraphFilter; +import org.apache.lucene.analysis.synonym.SynonymMap; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class IcatSynonymAnalyzer extends Analyzer { + + private SynonymMap synonyms; + + public IcatSynonymAnalyzer() + throws IOException, ParseException { + super(); + // Load synonyms from resource file + InputStream in = IcatSynonymAnalyzer.class.getClassLoader().getResourceAsStream("synonym.txt"); + BufferedReader reader = new BufferedReader(new InputStreamReader(in)); + SolrSynonymParser parser = new SolrSynonymParser(true, true, new StandardAnalyzer()); + parser.parse(reader); + synonyms = parser.build(); + } + + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer source = new StandardTokenizer(); + TokenStream sink = new EnglishPossessiveFilter(source); + sink = new LowerCaseFilter(sink); + sink = new StopFilter(sink, IcatAnalyzer.SCIENTIFIC_STOP_WORDS_SET); + sink = new PorterStemFilter(sink); + sink = new SynonymGraphFilter(sink, synonyms, false); + return new TokenStreamComponents(source, sink); + } +} diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 737542b..32f2d39 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -6,6 +6,7 @@ import java.net.HttpURLConnection; import java.nio.file.FileVisitOption; import java.nio.file.Files; +import java.text.ParseException; import java.util.Comparator; import 
java.util.HashMap; import java.util.Map; @@ -638,7 +639,14 @@ private void init() { parser = new StandardQueryParser(); StandardQueryConfigHandler qpConf = (StandardQueryConfigHandler) parser.getQueryConfigHandler(); - qpConf.set(ConfigurationKeys.ANALYZER, analyzer); + try { + // Attempt init an Analyzer which injects synonyms for searching + qpConf.set(ConfigurationKeys.ANALYZER, new IcatSynonymAnalyzer()); + } catch (IOException | ParseException e) { + // If synonym files cannot be parsed, default to using the same analyzer as for writing + logger.info("Synonym files not found, synonyms will not be injected"); + qpConf.set(ConfigurationKeys.ANALYZER, analyzer); + } qpConf.set(ConfigurationKeys.ALLOW_LEADING_WILDCARD, true); timer = new Timer("LuceneCommitTimer"); diff --git a/src/main/resources/synonym.txt b/src/main/resources/synonym.txt new file mode 100755 index 0000000..92cc5e9 --- /dev/null +++ b/src/main/resources/synonym.txt @@ -0,0 +1,124 @@ +# Synonyms to be applied after stemming according to the Porter algorithm + +# Technical terms +ionis, ioniz + +# Elements +Hydrogen, H +Helium, He +Lithium, Li +Beryllium, Be +Boron, B +Carbon, C +Nitrogen, N +Oxygen, O +Fluorine, F +Neon, Ne +Sodium,Na +Magnesium,Mg +Aluminum,Al +Silicon,Si +Phosphorus,P +Sulfur, Sulphur,S +Chlorine,Cl +Argon,Ar +Potassium,K +Calcium,Ca +Scandium,Sc +Titanium,Ti +Vanadium,V +Chromium,Cr +Manganese,Mn +Iron,Fe +Cobalt,Co +Nickel,Ni +Copper,Cu +Zinc,Zn +Gallium,Ga +Germanium,Ge +Arsenic,As +Selenium,Se +Bromine,Br +Krypton,Kr +Rubidium,Rb +Strontium,Sr +Yttrium,Y +Zirconium,Zr +Niobium,Nb +Molybdenum,Mo +Technetium,Tc +Ruthenium,Ru +Rhodium,Rh +Palladium,Pd +Silver,Ag +Cadmium,Cd +Indium,In +Tin,Sn +Antimony,Sb +Tellurium,Te +Iodine,I +Xenon,Xe +Caesium, Cesium, Cs +Barium,Ba +Lanthanum,La +Cerium,Ce +Praseodymium,Pr +Neodymium,Nd +Promethium,Pm +Samarium,Sm +Europium,Eu +Gadolinium,Gd +Terbium,Tb +Dysprosium,Dy +Holmium,Ho +Erbium,Er +Thulium,Tm +Ytterbium,Yb +Lutetium,Lu +Hafnium,Hf +Tantalum,Ta +Tungsten, Wolfram,W +Rhenium,Re +Osmium,Os +Iridium,Ir +Platinum,Pt +Gold,Au +Mercury,Hg +Thallium,Tl +Lead,Pb +Bismuth,Bi +Polonium,Po +Astatine,At +Radon,Rn +Francium,Fr +Radium,Ra +Actinium,Ac +Thorium,Th +Protactinium,Pa +Uranium,U +Neptunium,Np +Plutonium,Pu +Americium,Am +Curium,Cm +Berkelium,Bk +Californium,Cf +Einsteinium,Es +Fermium,Fm +Mendelevium,Md +Nobelium,No +Lawrencium,Lr +Rutherfordium,Rf +Dubnium,Db +Seaborgium,Sg +Bohrium,Bh +Hassium,Hs +Meitnerium, Mt +Darmstadtium ,Ds +Roentgenium ,Rg +Copernicium ,Cn +Nihonium,Nh +Flerovium,Fl +Moscovium,Mc +Livermorium,Lv +Tennessine,Ts +Oganesson,Og \ No newline at end of file diff --git a/src/test/java/icat/lucene/TestLucene.java b/src/test/java/icat/lucene/TestLucene.java old mode 100644 new mode 100755 index f5cd493..a03caa4 --- a/src/test/java/icat/lucene/TestLucene.java +++ b/src/test/java/icat/lucene/TestLucene.java @@ -39,6 +39,7 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.icatproject.lucene.IcatAnalyzer; +import org.icatproject.lucene.IcatSynonymAnalyzer; import org.junit.Test; public class TestLucene { @@ -70,6 +71,64 @@ public void testIcatAnalyzer() throws Exception { assertEquals(" demo 1st number 2 all sing danc tokenstream api ad aardvark", newString); } + /** + * Test that IcatSynonymAnalyzer injects stems for alternate spellings and + * chemical symbols for the elements + */ + @Test + public void testIcatSynonymAnalyzer() throws Exception { + final String text = "hydrogen Helium LITHIUM 
be B NE ionisation"; + int n = 0; + String newString = ""; + + try (Analyzer analyzer = new IcatSynonymAnalyzer()) { + TokenStream stream = analyzer.tokenStream("field", new StringReader(text)); + CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); + try { + stream.reset(); // Curiously this is required + while (stream.incrementToken()) { + n++; + newString = newString + " " + termAtt; + } + stream.end(); + } finally { + stream.close(); + } + } + + assertEquals(14, n); + assertEquals(" h hydrogen he helium li lithium beryllium be boron b neon ne ioniz ionis", newString); + } + + /** + * Test that we do not stop words that are chemical symbols (As At Be In No) + * but otherwise filter out stop words + */ + @Test + public void testIcatAnalyzerStopWords() throws Exception { + final String text = "as at be in no that the their then there"; + int n = 0; + String newString = ""; + + try (Analyzer analyzer = new IcatAnalyzer()) { + TokenStream stream = analyzer.tokenStream("field", new StringReader(text)); + CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class); + try { + stream.reset(); // Curiously this is required + while (stream.incrementToken()) { + n++; + newString = newString + " " + termAtt; + } + stream.end(); + } finally { + stream.close(); + } + } + + assertEquals(5, n); + assertEquals(" as at be in no", newString); + } + @Test public void testJoins() throws Exception { Analyzer analyzer = new IcatAnalyzer(); From bcf46af9630f9fb126d584ac69936fb194dfbb4c Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 2 Feb 2022 15:30:22 +0000 Subject: [PATCH 23/73] Avoid index error for maxScore --- src/main/java/org/icatproject/lucene/Lucene.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 17f48f4..faba565 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -944,7 +944,13 @@ private String luceneSearchResult(String name, Search search, int maxResults, Lo TopDocs topDocs = search.lastDoc == null ? 
isearcher.search(search.query, maxResults) : isearcher.searchAfter(search.lastDoc, search.query, maxResults); ScoreDoc[] hits = topDocs.scoreDocs; - logger.debug("Hits " + topDocs.totalHits + " maxscore " + topDocs.scoreDocs[0].score); + Float maxScore; + if (hits.length == 0) { + maxScore = Float.NaN; + } else { + maxScore = hits[0].score; + } + logger.debug("Hits " + topDocs.totalHits + " maxscore " + maxScore); ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (JsonGenerator gen = Json.createGenerator(baos)) { gen.writeStartObject();
From 2046da5398d7d102495550480594746058fee277 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Thu, 10 Feb 2022 17:48:13 +0000 Subject: [PATCH 24/73] Handle facet exceptions from server tests #19 --- .../java/org/icatproject/lucene/Lucene.java | 62 +++++++++++++------ 1 file changed, 42 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index faba565..a02078e 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -6,6 +6,7 @@ import java.net.HttpURLConnection; import java.nio.file.FileVisitOption; import java.nio.file.Files; +import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; import java.util.List; @@ -733,6 +734,8 @@ private void init() { qpConf.set(ConfigurationKeys.ANALYZER, analyzer); qpConf.set(ConfigurationKeys.ALLOW_LEADING_WILDCARD, true); + facetsConfig.setMultiValued("sample", true); + timer = new Timer("LuceneCommitTimer"); timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); @@ -902,36 +905,55 @@ public void lock(@PathParam("entityName") String entityName) throws LuceneExcept } private String luceneFacetResult(String name, Search search, int maxResults, int maxLabels, Long uid) - throws IOException { - IndexSearcher isearcher = getSearcher(search.searcherMap, name); - DirectoryReader directoryReader = getReader(search.readerMap, name); - logger.debug("To facet in {} for {} {} with {} from {} ", name, search.query, maxResults, isearcher, - search.lastDoc); - DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(directoryReader); - FacetsCollector facetsCollector = new FacetsCollector(); - FacetsCollector.search(isearcher, search.query, maxResults, facetsCollector); - Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); - List<FacetResult> results = facets.getAllDims(maxLabels); - logger.debug("Facets found for " + results.size() + " dimensions"); + throws IOException, IllegalStateException { + List<FacetResult> results; + if (maxResults <= 0 || maxLabels <= 0) { + // This will result in no Facets and a null pointer, so return early + logger.warn("No facets possible for maxResults={}, maxLabels={}, returning empty list", maxResults, maxLabels); + results = new ArrayList<>(); + } else { + // TODO Consider either making this approach uniform, or only doing it for entities where we facet + DirectoryReader directoryReader = getReader(search.readerMap, name); + IndexSearcher isearcher = new IndexSearcher(directoryReader); + logger.debug("To facet in {} for {} {} with {} from {} ", name, search.query, maxResults, isearcher, + search.lastDoc); + try { + DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(directoryReader); + FacetsCollector facetsCollector = new FacetsCollector(); + logger.debug("DR: {}, IS: {}", directoryReader,
isearcher.getTopReaderContext()); + FacetsCollector.search(isearcher, search.query, maxResults, facetsCollector); + Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); + logger.debug("facets: {}, maxLabels: {}, maxResults: {}", facets, maxLabels, maxResults); + results = facets.getAllDims(maxLabels); + } catch (IllegalArgumentException e) { + // This can occur if no fields in the index have been faceted + logger.error("No facets found in index, resulting in error: " + e.getClass() + " " + e.getMessage()); + results = new ArrayList<>(); + } catch (IllegalStateException e) { + // This can occur if we do not create the IndexSearcher from the same DirectoryReader as we used to + // create the state + logger.error("IndexSearcher used is not based on the DirectoryReader used for facet counting: " + + e.getClass() + " " + e.getMessage()); + throw e; + } + logger.debug("Facets found for " + results.size() + " dimensions"); + } ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (JsonGenerator gen = Json.createGenerator(baos)) { gen.writeStartObject(); if (uid != null) { gen.write("uid", uid); } - gen.writeStartArray("facets"); // array of all facet dimensions + gen.writeStartObject("dimensions"); // object containing all facet dimensions for (FacetResult result : results) { - gen.writeStartArray(result.dim); // array of labelValues for a given dimension + gen.writeStartObject(result.dim); // object containing labelValues for a given dimension for (LabelAndValue labelValue : result.labelValues) { - gen.writeStartArray("labelValue"); // 2 element array of label, value - gen.write(labelValue.label); - gen.write(labelValue.value.longValue()); - gen.writeEnd(); // array of label, value + gen.write(labelValue.label, labelValue.value.longValue()); } - gen.writeEnd(); // array of labelValues for a given dimension + gen.writeEnd(); // object containing labelValues } - gen.writeEnd(); // array of facet dimensions - gen.writeEnd(); // object + gen.writeEnd(); // object containing dimensions + gen.writeEnd(); } logger.debug("Json returned {}", baos.toString()); return baos.toString(); From 7c127688a92399022f423d620f0596f3dcf99aec Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Fri, 11 Feb 2022 05:36:20 +0000 Subject: [PATCH 25/73] Add script to generate synonyms from csv #16 --- .../org/icatproject/lucene/IcatAnalyzer.java | 2 +- .../lucene/IcatSynonymAnalyzer.java | 6 +- src/main/resources/synonym.txt | 127 ++++++++++++- src/main/scripts/parse_synonyms.py | 176 ++++++++++++++++++ src/test/java/icat/lucene/TestLucene.java | 10 +- 5 files changed, 307 insertions(+), 14 deletions(-) create mode 100644 src/main/scripts/parse_synonyms.py diff --git a/src/main/java/org/icatproject/lucene/IcatAnalyzer.java b/src/main/java/org/icatproject/lucene/IcatAnalyzer.java index d02a542..a70cbd2 100755 --- a/src/main/java/org/icatproject/lucene/IcatAnalyzer.java +++ b/src/main/java/org/icatproject/lucene/IcatAnalyzer.java @@ -25,7 +25,7 @@ public class IcatAnalyzer extends Analyzer { static { final List stopWords = Arrays.asList("a", "an", "and", "are", "but", "by", "for", "if", "into", "is", - "it", "not", "of", "on", "or", "such", "that", "the", "their", "then", + "it", "not", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"); final CharArraySet stopSet = new CharArraySet(stopWords, false); SCIENTIFIC_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); diff --git 
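(For illustration, the facet response assembled by luceneFacetResult above now takes a shape along the lines of {"dimensions": {"instrument": {"LARMOR": 10, "WISH": 3}}} — the dimension name and labels here are invented — with one inner object per faceted dimension mapping each label to its count, plus the optional top-level "uid".)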
a/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java b/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java index 82703be..26841f1 100755 --- a/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java +++ b/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java @@ -1,6 +1,5 @@ package org.icatproject.lucene; -import java.io.FileNotFoundException; import java.io.BufferedReader; import java.io.InputStream; import java.io.InputStreamReader; @@ -14,13 +13,10 @@ import org.apache.lucene.analysis.core.StopFilter; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.PorterStemFilter; -import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.standard.StandardTokenizer; import org.apache.lucene.analysis.synonym.SolrSynonymParser; import org.apache.lucene.analysis.synonym.SynonymGraphFilter; import org.apache.lucene.analysis.synonym.SynonymMap; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; public class IcatSynonymAnalyzer extends Analyzer { @@ -32,7 +28,7 @@ public IcatSynonymAnalyzer() // Load synonyms from resource file InputStream in = IcatSynonymAnalyzer.class.getClassLoader().getResourceAsStream("synonym.txt"); BufferedReader reader = new BufferedReader(new InputStreamReader(in)); - SolrSynonymParser parser = new SolrSynonymParser(true, true, new StandardAnalyzer()); + SolrSynonymParser parser = new SolrSynonymParser(true, true, new IcatAnalyzer()); parser.parse(reader); synonyms = parser.build(); } diff --git a/src/main/resources/synonym.txt b/src/main/resources/synonym.txt index 92cc5e9..1d3bc5d 100755 --- a/src/main/resources/synonym.txt +++ b/src/main/resources/synonym.txt @@ -1,7 +1,7 @@ # Synonyms to be applied after stemming according to the Porter algorithm -# Technical terms -ionis, ioniz +# Alternate spellings +ionise, ionize # Elements Hydrogen, H @@ -121,4 +121,125 @@ Flerovium,Fl Moscovium,Mc Livermorium,Lv Tennessine,Ts -Oganesson,Og \ No newline at end of file +Oganesson,Og + +# Techniques +propagation technique, forward scattering technique => propagation technique, forward scattering technique +time of flight technique, TOF => time of flight technique, TOF +ultrafast probe, femtosecond probe => ultrafast probe, femtosecond probe +MuSR, muon spin resonance => MuSR, muon spin resonance +obtain crystal structure, crystallography => obtain crystal structure, crystallography +time dependent study, time resolved study => time dependent study, time resolved study +ARPES, angle resolved photoemission spectroscopy => ARPES, angle resolved photoemission spectroscopy +grazing incidence SAS, grazing incidence small angle scattering, GISAS => grazing incidence SAS, grazing incidence small angle scattering, GISAS +NPD, neutron powder diffraction => NPD, neutron powder diffraction +XPD, x-ray powder diffraction => XPD, x-ray powder diffraction +single crystal x-ray diffraction, x-ray single crystal diffraction, SXRD => single crystal x-ray diffraction, x-ray single crystal diffraction, SXRD +hard photoelectron spectroscopy, hard x-ray photoelectron spectroscopy, HAXPES => hard photoelectron spectroscopy, hard x-ray photoelectron spectroscopy, HAXPES +inelastic small angle scatteringng, inelastic SAS => inelastic small angle scatteringng, inelastic SAS +infrared spectroscopy, IR spectroscopy => infrared spectroscopy, IR spectroscopy +fluorescence microscopy, microfluorescence, Micro XRF => fluorescence microscopy, microfluorescence, Micro XRF +PCS, photon correlation 
spectroscopy => PCS, photon correlation spectroscopy +quasielastic spin echo, quasi elastic spin echo, quasielastic neutron spin echo scattering => quasielastic spin echo, quasi elastic spin echo, quasielastic neutron spin echo scattering +reflectivity, reflectometry => reflectivity, reflectometry +anomalous scattering, anomalous diffraction, resonant diffraction => anomalous scattering, anomalous diffraction, resonant diffraction +STM, scanning transmission microscopy => STM, scanning transmission microscopy +small angle diffraction, small angle scattering, SAS => small angle diffraction, small angle scattering, SAS +spin echo small angle scattering, spin echo SANS => spin echo small angle scattering, spin echo SANS +UV circular dichroism, UVCD => UV circular dichroism, UVCD +ultra small angle scattering, USAS => ultra small angle scattering, USAS +topography, diffraction imaging => topography, diffraction imaging +XMCD, x-ray magnetic circular dichroism => XMCD, x-ray magnetic circular dichroism +linear dichroism, LD => linear dichroism, LD +x-ray excited optical luminescence, XEOL => x-ray excited optical luminescence, XEOL +magnetic circular dichroism, MCD => magnetic circular dichroism, MCD +magnetochiral dichroism, MChD => magnetochiral dichroism, MChD +natural circular dichroism, NCD => natural circular dichroism, NCD +EM, electron microscopy => EM, electron microscopy +photoemission microscopy, PEEM, photoemission electron microscopy => photoemission microscopy, PEEM, photoemission electron microscopy +scanning microscopy, scanning probe microscopy => scanning microscopy, scanning probe microscopy +XRR, x-ray reflectometry, x-ray reflectivity => XRR, x-ray reflectometry, x-ray reflectivity +energy dispersive diffraction, EDD => energy dispersive diffraction, EDD +energy dispersive x-ray diffraction, EDXRD => energy dispersive x-ray diffraction, EDXRD +grazing incidence x-ray diffraction, GIXD => grazing incidence x-ray diffraction, GIXD +grazing incidence small angle x-ray scattering, GISAXS => grazing incidence small angle x-ray scattering, GISAXS +high pressure single crystal diffraction, Diffraction => high pressure single crystal diffraction, Diffraction +protein crystallography, macromolecular crystallography, MX => protein crystallography, macromolecular crystallography, MX +multi wavelength anomalous dispersion, multi wavelength anomalous diffraction, MAD => multi wavelength anomalous dispersion, multi wavelength anomalous diffraction, MAD +PhD, photoelectron diffraction => PhD, photoelectron diffraction +SFX, serial femtosecond crystallography => SFX, serial femtosecond crystallography +serial synchrotron crystallography, SSX => serial synchrotron crystallography, SSX +single wavelength anomalous diffraction, SAD, single wavelength anomalous dispersion => single wavelength anomalous diffraction, SAD, single wavelength anomalous dispersion +chemical crystallography, small molecule diffraction, small molecule crystallography => chemical crystallography, small molecule diffraction, small molecule crystallography +x-ray standing wave, XSW => x-ray standing wave, XSW +coherent diffraction imaging, coherent diffractive imaging, CDI => coherent diffraction imaging, coherent diffractive imaging, CDI +nano infrared spectroscopy, infrared nanospectroscopy imaging => nano infrared spectroscopy, infrared nanospectroscopy imaging +XRF, x-ray fluorescence => XRF, x-ray fluorescence +infrared microscopy, IR microscopy => infrared microscopy, IR microscopy +pair distribution function, PDF 
=> pair distribution function, PDF +inelastic x-ray scattering, IXS => inelastic x-ray scattering, IXS +resonant inelastic x-ray scattering, RIXS => resonant inelastic x-ray scattering, RIXS +resonant x-ray scattering, RXS => resonant x-ray scattering, RXS +resonant soft x-ray scattering, RSXS => resonant soft x-ray scattering, RSXS +small angle x-ray scattering, SAXS => small angle x-ray scattering, SAXS +SANS, small angle neutron scattering => SANS, small angle neutron scattering +wide angle x-ray scattering, WAXS => wide angle x-ray scattering, WAXS +circular dichroism, CD => circular dichroism, CD +EDX, energy dispersive x-ray spectroscopy => EDX, energy dispersive x-ray spectroscopy +XAS, x-ray absorption spectroscopy => XAS, x-ray absorption spectroscopy +XAFS, x-ray absorption fine structure => XAFS, x-ray absorption fine structure +extended x-ray absorption fine structure, EXAFS => extended x-ray absorption fine structure, EXAFS +XANES, x-ray absorption near edge structure, NEXAFS => XANES, x-ray absorption near edge structure, NEXAFS +x-ray emission spectroscopy, XES => x-ray emission spectroscopy, XES +PES, photoelectron spectroscopy => PES, photoelectron spectroscopy +x-ray photoelectron spectroscopy, XPS => x-ray photoelectron spectroscopy, XPS +x-ray photon correlation spectroscopy, XPCS => x-ray photon correlation spectroscopy, XPCS +x-ray tomography, CT scan => x-ray tomography, CT scan +Absorption-based tomographic microscopy, absorption microtomography => Absorption-based tomographic microscopy, absorption microtomography +Ultra-fast tomographic microscopy, ultrafast microtomography => Ultra-fast tomographic microscopy, ultrafast microtomography +XRD, x-ray diffraction => XRD, x-ray diffraction +STXM, scanning transmission x-ray microscopy => STXM, scanning transmission x-ray microscopy +TEY, total electron yield => TEY, total electron yield +XMCD total electron yield, XMCD TEY => XMCD total electron yield, XMCD TEY +neutron reflectivity, neutron reflectometry => neutron reflectivity, neutron reflectometry +ultra small angle x-ray scattering, USAXS => ultra small angle x-ray scattering, USAXS +polarized neutron reflectometry, polarized neutron reflectivity => polarized neutron reflectometry, polarized neutron reflectivity +TOF spectrometry, time-of-flight spectrometry, TOF spectroscopy => TOF spectrometry, time-of-flight spectrometry, TOF spectroscopy +inelastic neutron scattering spectroscopy, inelastic neutron spectroscopy, inelastic neutron scattering => inelastic neutron scattering spectroscopy, inelastic neutron spectroscopy, inelastic neutron scattering +x-ray magnetic linear dichroism, XMLD => x-ray magnetic linear dichroism, XMLD +resonant elastic x-ray scattering, REXS => resonant elastic x-ray scattering, REXS +x-ray refraction radiography, x-ray refraction imaging => x-ray refraction radiography, x-ray refraction imaging +time dependent scattering, time resolved scattering => time dependent scattering, time resolved scattering +time resolved diffraction, time dependent diffraction => time resolved diffraction, time dependent diffraction +time dependent absorption, time resolved absorption => time dependent absorption, time resolved absorption +anomalous small angle x-ray scattering, ASAXS => anomalous small angle x-ray scattering, ASAXS +ASAX, anomalous solution x-ray scattering => ASAX, anomalous solution x-ray scattering +grazing incidence small angle neutron scattering, GISANS => grazing incidence small angle neutron scattering, GISANS +VSANS, very small 
angle neutron scattering => VSANS, very small angle neutron scattering +micro SAXS tomography, micro small angle x-ray scattering tomography => micro SAXS tomography, micro small angle x-ray scattering tomography +micro grazing incidence small angle x-ray scattering tomography, micro GISAXS tomography => micro grazing incidence small angle x-ray scattering tomography, micro GISAXS tomography +nano angle resolved photoemission spectroscopy, nano ARPES => nano angle resolved photoemission spectroscopy, nano ARPES +scanning x-ray microscopy, x-ray scanning microscopy => scanning x-ray microscopy, x-ray scanning microscopy +high resolution x-ray photoelectron spectroscopy, HR-XPS => high resolution x-ray photoelectron spectroscopy, HR-XPS +resolution elastic neutron scattering, elastic neutron scattering spectroscopy, RENS => resolution elastic neutron scattering, elastic neutron scattering spectroscopy, RENS +x-ray magnetochiral dichroism, XMChiD => x-ray magnetochiral dichroism, XMChiD +x-ray natural circular dichroism, XNCD => x-ray natural circular dichroism, XNCD +XNLD, x-ray natural linear dichroism => XNLD, x-ray natural linear dichroism +fragment screening, crystallographic fragment screening => fragment screening, crystallographic fragment screening +microfocus macromolecular crystallography, microfocus MX => microfocus macromolecular crystallography, microfocus MX +nanofocus MX, nanofocus macromolecular crystallography => nanofocus MX, nanofocus macromolecular crystallography +MR, molecular replacement => MR, molecular replacement +time resolved serial femtosecond crystallography, TR-SFX => time resolved serial femtosecond crystallography, TR-SFX +fixed target serial synchrotron crystallography, FT-SSX => fixed target serial synchrotron crystallography, FT-SSX +LCP-SSX, lipidic cubic phase serial synchrotron crystallography => LCP-SSX, lipidic cubic phase serial synchrotron crystallography +TR-SSX, time resolved serial synchrotron crystallography => TR-SSX, time resolved serial synchrotron crystallography +CLXM, correlative light x-ray microscopy => CLXM, correlative light x-ray microscopy +grazing incidence wide angle scattering, GIWAXS => grazing incidence wide angle scattering, GIWAXS +high resolution angle resolved photoemission spectroscopy, HR-ARPES => high resolution angle resolved photoemission spectroscopy, HR-ARPES +atomic force microscopy, AFM => atomic force microscopy, AFM +AFM-IR, atomic force microscope infrared spectroscopy => AFM-IR, atomic force microscope infrared spectroscopy +fourier transform infrared spectroscopy, FTIR => fourier transform infrared spectroscopy, FTIR +EDE, energy dispersive extended x-ray absorption fine structure, ED-EXAFS => EDE, energy dispersive extended x-ray absorption fine structure, ED-EXAFS +radiation therapy, radiotherapy => radiation therapy, radiotherapy +surface crystallography, obtain surface atomic structure => surface crystallography, obtain surface atomic structure +x-ray birefringence imaging , XBI => x-ray birefringence imaging , XBI diff --git a/src/main/scripts/parse_synonyms.py b/src/main/scripts/parse_synonyms.py new file mode 100644 index 0000000..11e7621 --- /dev/null +++ b/src/main/scripts/parse_synonyms.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python + +import csv +import sys +from typing import Dict, List + + +def addToParents( + relationships: Dict[str, Dict[str, List[str]]], + label: str, + parents: List[str], + childDepth: int +): + """ + Adds the `label` to all the entries in `relationships` that have a key in + 
`parents`, then recursively calls itself to add `label` to any + grandparents. `childDepth` is decreased by 1 for each generation to prevent + exponentially large injections. + + Parameters + ---------- + relationships: Dict[str, Dict[str, List[str]]] + Maps terms to an inner dictionary containing arrays for "alternatives", + "parents", and "children". + label: str + The term to be added to its `parents`. + parents: List[str] + The direct parents of the current `label` + childDepth: int + The number of generations of children to inject for each term. + For example, a value of 2 would inject children and their children. + 0 will only add alternative terms. Negative integers will add all + children, grandchildren, etc. Note that this may result in an + exponentially large number of terms + """ + if childDepth != 0: + for parent in parents: + try: + relationships[parent]["children"].append(label) + addToParents( + relationships, + label, + relationships[parent]["parents"], + childDepth - 1, + ) + except KeyError: + pass + + +def main(inputFile: str, outputFile: str, mode: str, maxChildDepth: int): + """ + Reads a CSV file of terminology and writes it into Solr synonym format + for use in synonym injection. Alternative terms are always written, and the + number of child terms is configurable by `maxChildDepth`. + + Parameters + ---------- + inputFile: str + CSV file to read ontology from. + outputFile: str + Solr synonym output file. + mode: str + Python file mode (w, a, ...) to use when writing the output file. + maxChildDepth: int + The maximum number of generations of children to inject for each term. + For example, a value of 2 would inject children and their children. + 0 will only add alternative terms. Negative integers will add all + children, grandchildren, etc. Note that this may result in an
Note that this may result in an + exponentially large number of terms + """ + altIndices = [] + parentIndices = [] + # equivalentIndices = [] + relationships = {} + with open(inputFile) as f: + reader = csv.reader(f) + + # Dynamically determine header positions + headers = next(reader) + for i, header in enumerate(headers): + if "Label" == header.strip(): + labelIndex = i + # elif "Class Type" == header: + # classIndex = i + elif "Alt Label" in header.strip(): + altIndices.append(i) + elif "Parent IRI" == header.strip(): + parentIndices.append(i) + # elif "Equivalent" == header.strip(): + # equivalentIndices.append(i) + + for entries in reader: + try: + int(entries[0]) + except (ValueError, IndexError): + # If we do not have an ID, continue to the next line + continue + + label = entries[labelIndex] + if label in relationships.keys(): + raise ValueError(f"Duplicate entry for label {label}") + + # relationships[label] = { + # "alternatives": [], + # "parents": [], + # "equivalent": [], + # "children": [], + # } + relationships[label] = { + "alternatives": [], "parents": [], "children": [] + } + # classType = entries[classIndex] + for altIndex in altIndices: + alternativeLabel = entries[altIndex] + if alternativeLabel != "": + relationships[label]["alternatives"].append( + alternativeLabel + ) + for parentIndex in parentIndices: + parent = entries[parentIndex] + if parent != "": + relationships[label]["parents"].append(parent) + # for equivalentIndex in equivalentIndices: + # equivalentLabel = entries[equivalentIndex] + # if equivalentLabel != "": + # relationships[label]["equivalent"].append(equivalentLabel) + + print(f"{len(relationships)} relationships found") + for label, relationship in relationships.items(): + addToParents( + relationships, label, relationship["parents"], maxChildDepth + ) + + output = "" + for label, relationship in relationships.items(): + # Only write to file if we have alternative or child terms + if (len(relationship["alternatives"]) > 0 + or len(relationship["children"]) > 0): + leftHandSide = ", ".join( + set([label] + relationship["alternatives"]) + ) + rightHandSide = ", ".join( + set( + [label] + + relationship["alternatives"] + + relationship["children"] + ) + ) + output += leftHandSide + " => " + rightHandSide + "\n" + + with open(outputFile, mode) as f: + f.write(output) + + +if __name__ == "__main__": + args = sys.argv + try: + inputFile = args[1] + except IndexError as e: + raise IndexError("inputFile to parse not provided") from e + try: + outputFile = args[2] + except IndexError as e: + raise IndexError("outputFile to write to not provided") from e + try: + mode = args[3] + except IndexError: + # Default to appending to the outputFile (no overwrite) + mode = "a" + try: + maxChildDepth = int(args[4]) + except (IndexError, ValueError): + # Default to 0 depth (only alternative terms) + maxChildDepth = 0 + + main(inputFile, outputFile, mode, maxChildDepth) diff --git a/src/test/java/icat/lucene/TestLucene.java b/src/test/java/icat/lucene/TestLucene.java index a03caa4..82b8cb3 100755 --- a/src/test/java/icat/lucene/TestLucene.java +++ b/src/test/java/icat/lucene/TestLucene.java @@ -67,8 +67,8 @@ public void testIcatAnalyzer() throws Exception { } } - assertEquals(11, n); - assertEquals(" demo 1st number 2 all sing danc tokenstream api ad aardvark", newString); + assertEquals(12, n); + assertEquals(" demo of 1st number 2 all sing danc tokenstream api ad aardvark", newString); } /** @@ -77,7 +77,7 @@ public void testIcatAnalyzer() throws Exception { */ @Test 
public void testIcatSynonymAnalyzer() throws Exception { - final String text = "hydrogen Helium LITHIUM be B NE ionisation"; + final String text = "hydrogen Helium LITHIUM be B NE ionisation TIME of FLIGHT technique ArPeS"; int n = 0; String newString = ""; @@ -96,8 +96,8 @@ public void testIcatSynonymAnalyzer() throws Exception { } } - assertEquals(14, n); - assertEquals(" h hydrogen he helium li lithium beryllium be boron b neon ne ioniz ionis", newString); + assertEquals(24, n); + assertEquals(" h hydrogen he helium li lithium beryllium be boron b neon ne ioniz ionis time tof of flight techniqu arp angl resolv photoemiss spectroscopi", newString); } /** From b32f3aa0460bbe544c49e1972d1a1d72c0dd59f7 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Sat, 12 Feb 2022 12:48:28 +0000 Subject: [PATCH 26/73] Take equivalent labels into account #16 --- src/main/resources/synonym.txt | 159 +++++++++++++++-------------- src/main/scripts/parse_synonyms.py | 45 +++++--- 2 files changed, 108 insertions(+), 96 deletions(-) diff --git a/src/main/resources/synonym.txt b/src/main/resources/synonym.txt index 1d3bc5d..5e633da 100755 --- a/src/main/resources/synonym.txt +++ b/src/main/resources/synonym.txt @@ -124,122 +124,123 @@ Tennessine,Ts Oganesson,Og # Techniques -propagation technique, forward scattering technique => propagation technique, forward scattering technique -time of flight technique, TOF => time of flight technique, TOF -ultrafast probe, femtosecond probe => ultrafast probe, femtosecond probe +forward scattering technique, propagation technique => forward scattering technique, propagation technique +TOF, time of flight technique => TOF, time of flight technique +femtosecond probe, ultrafast probe => femtosecond probe, ultrafast probe MuSR, muon spin resonance => MuSR, muon spin resonance -obtain crystal structure, crystallography => obtain crystal structure, crystallography +crystallography, obtain crystal structure => crystallography, obtain crystal structure time dependent study, time resolved study => time dependent study, time resolved study ARPES, angle resolved photoemission spectroscopy => ARPES, angle resolved photoemission spectroscopy -grazing incidence SAS, grazing incidence small angle scattering, GISAS => grazing incidence SAS, grazing incidence small angle scattering, GISAS +GISAS, grazing incidence SAS, grazing incidence small angle scattering => GISAS, grazing incidence SAS, grazing incidence small angle scattering NPD, neutron powder diffraction => NPD, neutron powder diffraction XPD, x-ray powder diffraction => XPD, x-ray powder diffraction -single crystal x-ray diffraction, x-ray single crystal diffraction, SXRD => single crystal x-ray diffraction, x-ray single crystal diffraction, SXRD -hard photoelectron spectroscopy, hard x-ray photoelectron spectroscopy, HAXPES => hard photoelectron spectroscopy, hard x-ray photoelectron spectroscopy, HAXPES -inelastic small angle scatteringng, inelastic SAS => inelastic small angle scatteringng, inelastic SAS -infrared spectroscopy, IR spectroscopy => infrared spectroscopy, IR spectroscopy -fluorescence microscopy, microfluorescence, Micro XRF => fluorescence microscopy, microfluorescence, Micro XRF +SXRD, single crystal x-ray diffraction, x-ray single crystal diffraction => SXRD, single crystal x-ray diffraction, x-ray single crystal diffraction +HAXPES, hard photoelectron spectroscopy, hard x-ray photoelectron spectroscopy => HAXPES, hard photoelectron spectroscopy, hard x-ray photoelectron spectroscopy +inelastic SAS, 
inelastic small angle scatteringng => inelastic SAS, inelastic small angle scatteringng +IR spectroscopy, infrared spectroscopy => IR spectroscopy, infrared spectroscopy +Micro XRF, fluorescence microscopy, microfluorescence => Micro XRF, fluorescence microscopy, microfluorescence PCS, photon correlation spectroscopy => PCS, photon correlation spectroscopy -quasielastic spin echo, quasi elastic spin echo, quasielastic neutron spin echo scattering => quasielastic spin echo, quasi elastic spin echo, quasielastic neutron spin echo scattering +quasi elastic spin echo, quasielastic neutron spin echo scattering, quasielastic spin echo => quasi elastic spin echo, quasielastic neutron spin echo scattering, quasielastic spin echo reflectivity, reflectometry => reflectivity, reflectometry -anomalous scattering, anomalous diffraction, resonant diffraction => anomalous scattering, anomalous diffraction, resonant diffraction +anomalous diffraction, anomalous scattering, resonant diffraction => anomalous diffraction, anomalous scattering, resonant diffraction STM, scanning transmission microscopy => STM, scanning transmission microscopy -small angle diffraction, small angle scattering, SAS => small angle diffraction, small angle scattering, SAS -spin echo small angle scattering, spin echo SANS => spin echo small angle scattering, spin echo SANS +SAS, small angle diffraction, small angle scattering => SAS, small angle diffraction, small angle scattering +spin echo SANS, spin echo small angle scattering => spin echo SANS, spin echo small angle scattering UV circular dichroism, UVCD => UV circular dichroism, UVCD -ultra small angle scattering, USAS => ultra small angle scattering, USAS -topography, diffraction imaging => topography, diffraction imaging +USAS, ultra small angle scattering => USAS, ultra small angle scattering +diffraction imaging, topography => diffraction imaging, topography XMCD, x-ray magnetic circular dichroism => XMCD, x-ray magnetic circular dichroism -linear dichroism, LD => linear dichroism, LD -x-ray excited optical luminescence, XEOL => x-ray excited optical luminescence, XEOL -magnetic circular dichroism, MCD => magnetic circular dichroism, MCD -magnetochiral dichroism, MChD => magnetochiral dichroism, MChD -natural circular dichroism, NCD => natural circular dichroism, NCD +LD, linear dichroism => LD, linear dichroism +XEOL, x-ray excited optical luminescence => XEOL, x-ray excited optical luminescence +MCD, magnetic circular dichroism => MCD, magnetic circular dichroism +MChD, magnetochiral dichroism => MChD, magnetochiral dichroism +NCD, natural circular dichroism => NCD, natural circular dichroism EM, electron microscopy => EM, electron microscopy -photoemission microscopy, PEEM, photoemission electron microscopy => photoemission microscopy, PEEM, photoemission electron microscopy +PEEM, photoemission electron microscopy, photoemission microscopy => PEEM, photoemission electron microscopy, photoemission microscopy scanning microscopy, scanning probe microscopy => scanning microscopy, scanning probe microscopy -XRR, x-ray reflectometry, x-ray reflectivity => XRR, x-ray reflectometry, x-ray reflectivity -energy dispersive diffraction, EDD => energy dispersive diffraction, EDD -energy dispersive x-ray diffraction, EDXRD => energy dispersive x-ray diffraction, EDXRD -grazing incidence x-ray diffraction, GIXD => grazing incidence x-ray diffraction, GIXD -grazing incidence small angle x-ray scattering, GISAXS => grazing incidence small angle x-ray scattering, GISAXS -high pressure 
single crystal diffraction, Diffraction => high pressure single crystal diffraction, Diffraction -protein crystallography, macromolecular crystallography, MX => protein crystallography, macromolecular crystallography, MX -multi wavelength anomalous dispersion, multi wavelength anomalous diffraction, MAD => multi wavelength anomalous dispersion, multi wavelength anomalous diffraction, MAD +XRR, x-ray reflectivity, x-ray reflectometry => XRR, x-ray reflectivity, x-ray reflectometry +EDD, energy dispersive diffraction => EDD, energy dispersive diffraction +EDXRD, energy dispersive x-ray diffraction => EDXRD, energy dispersive x-ray diffraction +GIXD, grazing incidence x-ray diffraction => GIXD, grazing incidence x-ray diffraction +GISAXS, grazing incidence small angle x-ray scattering => GISAXS, grazing incidence small angle x-ray scattering +Diffraction, high pressure single crystal diffraction => Diffraction, high pressure single crystal diffraction +MX, macromolecular crystallography, protein crystallography => MX, macromolecular crystallography, protein crystallography +MAD, multi wavelength anomalous diffraction, multi wavelength anomalous dispersion => MAD, multi wavelength anomalous diffraction, multi wavelength anomalous dispersion PhD, photoelectron diffraction => PhD, photoelectron diffraction SFX, serial femtosecond crystallography => SFX, serial femtosecond crystallography -serial synchrotron crystallography, SSX => serial synchrotron crystallography, SSX -single wavelength anomalous diffraction, SAD, single wavelength anomalous dispersion => single wavelength anomalous diffraction, SAD, single wavelength anomalous dispersion -chemical crystallography, small molecule diffraction, small molecule crystallography => chemical crystallography, small molecule diffraction, small molecule crystallography -x-ray standing wave, XSW => x-ray standing wave, XSW -coherent diffraction imaging, coherent diffractive imaging, CDI => coherent diffraction imaging, coherent diffractive imaging, CDI -nano infrared spectroscopy, infrared nanospectroscopy imaging => nano infrared spectroscopy, infrared nanospectroscopy imaging +SSX, serial synchrotron crystallography => SSX, serial synchrotron crystallography +SAD, single wavelength anomalous diffraction, single wavelength anomalous dispersion => SAD, single wavelength anomalous diffraction, single wavelength anomalous dispersion +chemical crystallography, small molecule crystallography, small molecule diffraction => chemical crystallography, small molecule crystallography, small molecule diffraction +XSW, x-ray standing wave => XSW, x-ray standing wave +CDI, coherent diffraction imaging, coherent diffractive imaging => CDI, coherent diffraction imaging, coherent diffractive imaging +infrared nanospectroscopy imaging, nano infrared spectroscopy => infrared nanospectroscopy imaging, nano infrared spectroscopy XRF, x-ray fluorescence => XRF, x-ray fluorescence -infrared microscopy, IR microscopy => infrared microscopy, IR microscopy -pair distribution function, PDF => pair distribution function, PDF -inelastic x-ray scattering, IXS => inelastic x-ray scattering, IXS -resonant inelastic x-ray scattering, RIXS => resonant inelastic x-ray scattering, RIXS -resonant x-ray scattering, RXS => resonant x-ray scattering, RXS -resonant soft x-ray scattering, RSXS => resonant soft x-ray scattering, RSXS -small angle x-ray scattering, SAXS => small angle x-ray scattering, SAXS +IR microscopy, infrared microscopy => IR microscopy, infrared microscopy +PDF, pair 
distribution function => PDF, pair distribution function +IXS, inelastic x-ray scattering => IXS, inelastic x-ray scattering +RIXS, resonant inelastic x-ray scattering => RIXS, resonant inelastic x-ray scattering +RXS, resonant x-ray scattering => RXS, resonant x-ray scattering +RSXS, resonant soft x-ray scattering => RSXS, resonant soft x-ray scattering +SAXS, small angle x-ray scattering => SAXS, small angle x-ray scattering SANS, small angle neutron scattering => SANS, small angle neutron scattering -wide angle x-ray scattering, WAXS => wide angle x-ray scattering, WAXS -circular dichroism, CD => circular dichroism, CD +WAXS, wide angle x-ray scattering => WAXS, wide angle x-ray scattering +CD, circular dichroism => CD, circular dichroism EDX, energy dispersive x-ray spectroscopy => EDX, energy dispersive x-ray spectroscopy XAS, x-ray absorption spectroscopy => XAS, x-ray absorption spectroscopy XAFS, x-ray absorption fine structure => XAFS, x-ray absorption fine structure -extended x-ray absorption fine structure, EXAFS => extended x-ray absorption fine structure, EXAFS -XANES, x-ray absorption near edge structure, NEXAFS => XANES, x-ray absorption near edge structure, NEXAFS -x-ray emission spectroscopy, XES => x-ray emission spectroscopy, XES +EXAFS, extended x-ray absorption fine structure => EXAFS, extended x-ray absorption fine structure +NEXAFS, XANES, x-ray absorption near edge structure => NEXAFS, XANES, x-ray absorption near edge structure +XES, x-ray emission spectroscopy => XES, x-ray emission spectroscopy PES, photoelectron spectroscopy => PES, photoelectron spectroscopy -x-ray photoelectron spectroscopy, XPS => x-ray photoelectron spectroscopy, XPS -x-ray photon correlation spectroscopy, XPCS => x-ray photon correlation spectroscopy, XPCS -x-ray tomography, CT scan => x-ray tomography, CT scan +XPS, x-ray photoelectron spectroscopy => XPS, x-ray photoelectron spectroscopy +XPCS, x-ray photon correlation spectroscopy => XPCS, x-ray photon correlation spectroscopy +CT scan, x-ray tomography => CT scan, x-ray tomography Absorption-based tomographic microscopy, absorption microtomography => Absorption-based tomographic microscopy, absorption microtomography Ultra-fast tomographic microscopy, ultrafast microtomography => Ultra-fast tomographic microscopy, ultrafast microtomography XRD, x-ray diffraction => XRD, x-ray diffraction STXM, scanning transmission x-ray microscopy => STXM, scanning transmission x-ray microscopy TEY, total electron yield => TEY, total electron yield -XMCD total electron yield, XMCD TEY => XMCD total electron yield, XMCD TEY +XMCD TEY, XMCD total electron yield => XMCD TEY, XMCD total electron yield neutron reflectivity, neutron reflectometry => neutron reflectivity, neutron reflectometry -ultra small angle x-ray scattering, USAXS => ultra small angle x-ray scattering, USAXS -polarized neutron reflectometry, polarized neutron reflectivity => polarized neutron reflectometry, polarized neutron reflectivity -TOF spectrometry, time-of-flight spectrometry, TOF spectroscopy => TOF spectrometry, time-of-flight spectrometry, TOF spectroscopy -inelastic neutron scattering spectroscopy, inelastic neutron spectroscopy, inelastic neutron scattering => inelastic neutron scattering spectroscopy, inelastic neutron spectroscopy, inelastic neutron scattering -x-ray magnetic linear dichroism, XMLD => x-ray magnetic linear dichroism, XMLD -resonant elastic x-ray scattering, REXS => resonant elastic x-ray scattering, REXS -x-ray refraction radiography, x-ray refraction 
imaging => x-ray refraction radiography, x-ray refraction imaging +USAXS, ultra small angle x-ray scattering => USAXS, ultra small angle x-ray scattering +polarized neutron reflectivity, polarized neutron reflectometry => polarized neutron reflectivity, polarized neutron reflectometry +TOF spectrometry, TOF spectroscopy, time-of-flight spectrometry => TOF spectrometry, TOF spectroscopy, time-of-flight spectrometry +inelastic neutron scattering, inelastic neutron scattering spectroscopy, inelastic neutron spectroscopy => inelastic neutron scattering, inelastic neutron scattering spectroscopy, inelastic neutron spectroscopy +XMLD, x-ray magnetic linear dichroism => XMLD, x-ray magnetic linear dichroism +REXS, resonant elastic x-ray scattering => REXS, resonant elastic x-ray scattering +x-ray refraction imaging, x-ray refraction radiography => x-ray refraction imaging, x-ray refraction radiography time dependent scattering, time resolved scattering => time dependent scattering, time resolved scattering -time resolved diffraction, time dependent diffraction => time resolved diffraction, time dependent diffraction +time dependent diffraction, time resolved diffraction => time dependent diffraction, time resolved diffraction time dependent absorption, time resolved absorption => time dependent absorption, time resolved absorption -anomalous small angle x-ray scattering, ASAXS => anomalous small angle x-ray scattering, ASAXS +ASAXS, anomalous small angle x-ray scattering => ASAXS, anomalous small angle x-ray scattering ASAX, anomalous solution x-ray scattering => ASAX, anomalous solution x-ray scattering -grazing incidence small angle neutron scattering, GISANS => grazing incidence small angle neutron scattering, GISANS +GISANS, grazing incidence small angle neutron scattering => GISANS, grazing incidence small angle neutron scattering VSANS, very small angle neutron scattering => VSANS, very small angle neutron scattering micro SAXS tomography, micro small angle x-ray scattering tomography => micro SAXS tomography, micro small angle x-ray scattering tomography -micro grazing incidence small angle x-ray scattering tomography, micro GISAXS tomography => micro grazing incidence small angle x-ray scattering tomography, micro GISAXS tomography -nano angle resolved photoemission spectroscopy, nano ARPES => nano angle resolved photoemission spectroscopy, nano ARPES +micro GISAXS tomography, micro grazing incidence small angle x-ray scattering tomography => micro GISAXS tomography, micro grazing incidence small angle x-ray scattering tomography +nano ARPES, nano angle resolved photoemission spectroscopy => nano ARPES, nano angle resolved photoemission spectroscopy scanning x-ray microscopy, x-ray scanning microscopy => scanning x-ray microscopy, x-ray scanning microscopy -high resolution x-ray photoelectron spectroscopy, HR-XPS => high resolution x-ray photoelectron spectroscopy, HR-XPS -resolution elastic neutron scattering, elastic neutron scattering spectroscopy, RENS => resolution elastic neutron scattering, elastic neutron scattering spectroscopy, RENS -x-ray magnetochiral dichroism, XMChiD => x-ray magnetochiral dichroism, XMChiD -x-ray natural circular dichroism, XNCD => x-ray natural circular dichroism, XNCD +HR-XPS, high resolution x-ray photoelectron spectroscopy => HR-XPS, high resolution x-ray photoelectron spectroscopy +RENS, elastic neutron scattering spectroscopy, resolution elastic neutron scattering => RENS, elastic neutron scattering spectroscopy, resolution elastic neutron scattering 
+XMChiD, x-ray magnetochiral dichroism => XMChiD, x-ray magnetochiral dichroism +XNCD, x-ray natural circular dichroism => XNCD, x-ray natural circular dichroism XNLD, x-ray natural linear dichroism => XNLD, x-ray natural linear dichroism -fragment screening, crystallographic fragment screening => fragment screening, crystallographic fragment screening -microfocus macromolecular crystallography, microfocus MX => microfocus macromolecular crystallography, microfocus MX +crystallographic fragment screening, fragment screening => crystallographic fragment screening, fragment screening +microfocus MX, microfocus macromolecular crystallography => microfocus MX, microfocus macromolecular crystallography nanofocus MX, nanofocus macromolecular crystallography => nanofocus MX, nanofocus macromolecular crystallography MR, molecular replacement => MR, molecular replacement -time resolved serial femtosecond crystallography, TR-SFX => time resolved serial femtosecond crystallography, TR-SFX -fixed target serial synchrotron crystallography, FT-SSX => fixed target serial synchrotron crystallography, FT-SSX +TR-SFX, time resolved serial femtosecond crystallography => TR-SFX, time resolved serial femtosecond crystallography +FT-SSX, fixed target serial synchrotron crystallography => FT-SSX, fixed target serial synchrotron crystallography LCP-SSX, lipidic cubic phase serial synchrotron crystallography => LCP-SSX, lipidic cubic phase serial synchrotron crystallography TR-SSX, time resolved serial synchrotron crystallography => TR-SSX, time resolved serial synchrotron crystallography CLXM, correlative light x-ray microscopy => CLXM, correlative light x-ray microscopy -grazing incidence wide angle scattering, GIWAXS => grazing incidence wide angle scattering, GIWAXS -high resolution angle resolved photoemission spectroscopy, HR-ARPES => high resolution angle resolved photoemission spectroscopy, HR-ARPES -atomic force microscopy, AFM => atomic force microscopy, AFM +GIWAXS, grazing incidence wide angle scattering => GIWAXS, grazing incidence wide angle scattering +HR-ARPES, high resolution angle resolved photoemission spectroscopy => HR-ARPES, high resolution angle resolved photoemission spectroscopy +AFM, atomic force microscopy => AFM, atomic force microscopy AFM-IR, atomic force microscope infrared spectroscopy => AFM-IR, atomic force microscope infrared spectroscopy -fourier transform infrared spectroscopy, FTIR => fourier transform infrared spectroscopy, FTIR -EDE, energy dispersive extended x-ray absorption fine structure, ED-EXAFS => EDE, energy dispersive extended x-ray absorption fine structure, ED-EXAFS +FTIR, fourier transform infrared spectroscopy => FTIR, fourier transform infrared spectroscopy +ED-EXAFS, EDE, energy dispersive extended x-ray absorption fine structure => ED-EXAFS, EDE, energy dispersive extended x-ray absorption fine structure radiation therapy, radiotherapy => radiation therapy, radiotherapy -surface crystallography, obtain surface atomic structure => surface crystallography, obtain surface atomic structure -x-ray birefringence imaging , XBI => x-ray birefringence imaging , XBI +obtain surface atomic structure, surface crystallography => obtain surface atomic structure, surface crystallography +XBI, x-ray birefringence imaging => XBI, x-ray birefringence imaging + diff --git a/src/main/scripts/parse_synonyms.py b/src/main/scripts/parse_synonyms.py index 11e7621..3ae3d55 100644 --- a/src/main/scripts/parse_synonyms.py +++ b/src/main/scripts/parse_synonyms.py @@ -37,6 +37,10 @@ def 
addToParents( for parent in parents: try: relationships[parent]["children"].append(label) + # If the parent is equivalent to anything, also add label as a + # child of the equivalentParent + for equivalentParent in relationships[parent]["equivalent"]: + relationships[equivalentParent]["children"].append(label) addToParents( relationships, label, @@ -70,7 +74,8 @@ def main(inputFile: str, outputFile: str, mode: str, maxChildDepth: int): """ altIndices = [] parentIndices = [] - # equivalentIndices = [] + equivalentIndices = [] + equivalentPairs = {} relationships = {} with open(inputFile) as f: reader = csv.reader(f) @@ -86,8 +91,8 @@ def main(inputFile: str, outputFile: str, mode: str, maxChildDepth: int): altIndices.append(i) elif "Parent IRI" == header.strip(): parentIndices.append(i) - # elif "Equivalent" == header.strip(): - # equivalentIndices.append(i) + elif "Equivalent" == header.strip(): + equivalentIndices.append(i) for entries in reader: try: @@ -100,14 +105,11 @@ def main(inputFile: str, outputFile: str, mode: str, maxChildDepth: int): if label in relationships.keys(): raise ValueError(f"Duplicate entry for label {label}") - # relationships[label] = { - # "alternatives": [], - # "parents": [], - # "equivalent": [], - # "children": [], - # } relationships[label] = { - "alternatives": [], "parents": [], "children": [] + "alternatives": [], + "parents": [], + "equivalent": [], + "children": [], } # classType = entries[classIndex] for altIndex in altIndices: @@ -120,10 +122,19 @@ def main(inputFile: str, outputFile: str, mode: str, maxChildDepth: int): parent = entries[parentIndex] if parent != "": relationships[label]["parents"].append(parent) - # for equivalentIndex in equivalentIndices: - # equivalentLabel = entries[equivalentIndex] - # if equivalentLabel != "": - # relationships[label]["equivalent"].append(equivalentLabel) + for equivalentIndex in equivalentIndices: + equivalentLabel = entries[equivalentIndex] + if equivalentLabel != "": + relationships[label]["equivalent"].append(equivalentLabel) + equivalentPairs[equivalentLabel] = label + + # If A is equivalent to B, then also set B equivalent to A + # This ensures they share all children + for key, value in equivalentPairs.items(): + try: + relationships[key]["equivalent"].append(value) + except KeyError: + pass print(f"{len(relationships)} relationships found") for label, relationship in relationships.items(): @@ -137,14 +148,14 @@ def main(inputFile: str, outputFile: str, mode: str, maxChildDepth: int): if (len(relationship["alternatives"]) > 0 or len(relationship["children"]) > 0): leftHandSide = ", ".join( - set([label] + relationship["alternatives"]) + sorted(set([label] + relationship["alternatives"])) ) rightHandSide = ", ".join( - set( + sorted(set( [label] + relationship["alternatives"] + relationship["children"] - ) + )) ) output += leftHandSide + " => " + rightHandSide + "\n" From 3b5fd8cabf88bd6b6a2a238b9b5ff8dfe288daa1 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Sat, 12 Feb 2022 12:57:05 +0000 Subject: [PATCH 27/73] Change order of terms in tests #16 --- src/test/java/icat/lucene/TestLucene.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/icat/lucene/TestLucene.java b/src/test/java/icat/lucene/TestLucene.java index 82b8cb3..babebb5 100755 --- a/src/test/java/icat/lucene/TestLucene.java +++ b/src/test/java/icat/lucene/TestLucene.java @@ -97,7 +97,7 @@ public void testIcatSynonymAnalyzer() throws Exception { } assertEquals(24, n); - assertEquals(" h hydrogen he helium li 
lithium beryllium be boron b neon ne ioniz ionis time tof of flight techniqu arp angl resolv photoemiss spectroscopi", newString); + assertEquals(" h hydrogen he helium li lithium beryllium be boron b neon ne ioniz ionis tof time of flight techniqu arp angl resolv photoemiss spectroscopi", newString); } /** From fea2d47ab78fa96c3a8ab2a32b0b84e2c45be38e Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 9 Mar 2022 20:55:38 +0000 Subject: [PATCH 28/73] Replace searcherManager with readerManager #19 --- .../java/org/icatproject/lucene/Lucene.java | 86 +++++++------------ 1 file changed, 29 insertions(+), 57 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index a02078e..6734362 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -73,7 +73,6 @@ import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.SearcherManager; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TopDocs; @@ -105,13 +104,11 @@ private class IndexBucket { private FSDirectory directory; private IndexWriter indexWriter; private ReaderManager readerManager; - private SearcherManager searcherManager; private AtomicBoolean locked = new AtomicBoolean(); } public class Search { public Map readerMap; - public Map searcherMap; public Query query; public ScoreDoc lastDoc; } @@ -378,7 +375,6 @@ public void commit() throws LuceneException { cached, entry.getKey(), bucket.indexWriter.getDocStats().numDocs); } bucket.readerManager.maybeRefreshBlocking(); - bucket.searcherManager.maybeRefreshBlocking(); } } } catch (IOException e) { @@ -406,7 +402,6 @@ private IndexBucket createBucket(String name) { } bucket.indexWriter = iwriter; bucket.readerManager = new ReaderManager(iwriter, false, false); - bucket.searcherManager = new SearcherManager(iwriter, false, false, null); logger.debug("Bucket for {} is now ready", name); return bucket; } catch (Throwable e) { @@ -474,9 +469,9 @@ public String datafilesFacet(@Context HttpServletRequest request, @QueryParam("m private Search datafilesQuery(HttpServletRequest request, Long uid) throws IOException, QueryNodeException { Search search = new Search(); searches.put(uid, search); - Map searcherMap = new HashMap<>(); + // Map searcherMap = new HashMap<>(); Map readerMap = new HashMap<>(); - search.searcherMap = searcherMap; + // search.searcherMap = searcherMap; search.readerMap = readerMap; try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); @@ -486,14 +481,14 @@ private Search datafilesQuery(HttpServletRequest request, Long uid) throws IOExc if (userName != null) { Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", - new TermQuery(new Term("name", userName)), getSearcher(searcherMap, "InvestigationUser"), + new TermQuery(new Term("name", userName)), getSearcher(readerMap, "InvestigationUser"), ScoreMode.None); Query invQuery = JoinUtil.createJoinQuery("id", false, "investigation", iuQuery, - getSearcher(searcherMap, "Investigation"), ScoreMode.None); + getSearcher(readerMap, "Investigation"), ScoreMode.None); Query dsQuery = JoinUtil.createJoinQuery("id", false, "dataset", invQuery, - getSearcher(searcherMap, "Dataset"), ScoreMode.None); + getSearcher(readerMap, "Dataset"), ScoreMode.None); 
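// The chain above is Lucene's index-time join pattern: each call to
// JoinUtil.createJoinQuery(fromField, multipleValuesPerDocument, toField, fromQuery,
// fromSearcher, scoreMode) collects the fromField values of the documents matching
// fromQuery and rewrites them into a query on toField against the next index. A minimal
// sketch of a single hop, reusing the names already in this file:
//
//   Query users = new TermQuery(new Term("name", userName));
//   Query invQuery = JoinUtil.createJoinQuery("investigation", false, "id", users,
//           getSearcher(readerMap, "InvestigationUser"), ScoreMode.None);
//
// Chaining three such hops walks InvestigationUser -> Investigation -> Dataset, so that
// dsQuery finally restricts Datafile documents through their "dataset" field.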
theQuery.add(dsQuery, Occur.MUST); } @@ -510,10 +505,10 @@ private Search datafilesQuery(HttpServletRequest request, Long uid) throws IOExc Occur.MUST); } - if (o.containsKey("params")) { - JsonArray params = o.getJsonArray("params"); - IndexSearcher datafileParameterSearcher = getSearcher(searcherMap, "DatafileParameter"); - for (JsonValue p : params) { + if (o.containsKey("parameters")) { + JsonArray parameters = o.getJsonArray("parameters"); + IndexSearcher datafileParameterSearcher = getSearcher(readerMap, "DatafileParameter"); + for (JsonValue p : parameters) { BooleanQuery.Builder paramQuery = parseParameter(p); Query toQuery = JoinUtil.createJoinQuery("datafile", false, "id", paramQuery.build(), datafileParameterSearcher, ScoreMode.None); @@ -585,9 +580,7 @@ public String datasetsFacet(@Context HttpServletRequest request, @QueryParam("ma private Search datasetsQuery(HttpServletRequest request, Long uid) throws IOException, QueryNodeException { Search search = new Search(); searches.put(uid, search); - Map searcherMap = new HashMap<>(); Map readerMap = new HashMap<>(); - search.searcherMap = searcherMap; search.readerMap = readerMap; try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); @@ -598,11 +591,11 @@ private Search datasetsQuery(HttpServletRequest request, Long uid) throws IOExce if (userName != null) { Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", - new TermQuery(new Term("name", userName)), getSearcher(searcherMap, "InvestigationUser"), + new TermQuery(new Term("name", userName)), getSearcher(readerMap, "InvestigationUser"), ScoreMode.None); Query invQuery = JoinUtil.createJoinQuery("id", false, "investigation", iuQuery, - getSearcher(searcherMap, "Investigation"), ScoreMode.None); + getSearcher(readerMap, "Investigation"), ScoreMode.None); theQuery.add(invQuery, Occur.MUST); } @@ -621,10 +614,10 @@ private Search datasetsQuery(HttpServletRequest request, Long uid) throws IOExce Occur.MUST); } - if (o.containsKey("params")) { - JsonArray params = o.getJsonArray("params"); - IndexSearcher datasetParameterSearcher = getSearcher(searcherMap, "DatasetParameter"); - for (JsonValue p : params) { + if (o.containsKey("parameters")) { + JsonArray parameters = o.getJsonArray("parameters"); + IndexSearcher datasetParameterSearcher = getSearcher(readerMap, "DatasetParameter"); + for (JsonValue p : parameters) { BooleanQuery.Builder paramQuery = parseParameter(p); Query toQuery = JoinUtil.createJoinQuery("dataset", false, "id", paramQuery.build(), datasetParameterSearcher, ScoreMode.None); @@ -648,7 +641,6 @@ private void exit() { for (Entry entry : indexBuckets.entrySet()) { IndexBucket bucket = entry.getValue(); bucket.readerManager.close(); - bucket.searcherManager.close(); bucket.indexWriter.commit(); bucket.indexWriter.close(); bucket.directory.close(); @@ -664,9 +656,8 @@ private void exit() { public void freeSearcher(@PathParam("uid") Long uid) throws LuceneException { if (uid != null) { // May not be set for internal calls logger.debug("Requesting freeSearcher {}", uid); - Map search = searches.get(uid).searcherMap; - Map read = searches.get(uid).readerMap; - for (Entry entry : read.entrySet()) { + Map readerMap = searches.get(uid).readerMap; + for (Entry entry : readerMap.entrySet()) { String name = entry.getKey(); DirectoryReader directoryReader = entry.getValue(); ReaderManager manager = indexBuckets.computeIfAbsent(name, k -> createBucket(k)).readerManager; @@ -676,16 +667,6 @@ public void 
freeSearcher(@PathParam("uid") Long uid) throws LuceneException { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } } - for (Entry entry : search.entrySet()) { - String name = entry.getKey(); - IndexSearcher isearcher = entry.getValue(); - SearcherManager manager = indexBuckets.computeIfAbsent(name, k -> createBucket(k)).searcherManager; - try { - manager.release(isearcher); - } catch (IOException e) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - } searches.remove(uid); } } @@ -703,14 +684,8 @@ private DirectoryReader getReader(Map bucket, String na /* * Need a new set of IndexSearchers for each search as identified by a uid */ - private IndexSearcher getSearcher(Map bucket, String name) throws IOException { - IndexSearcher isearcher = bucket.get(name); - if (isearcher == null) { - isearcher = indexBuckets.computeIfAbsent(name, k -> createBucket(k)).searcherManager.acquire(); - bucket.put(name, isearcher); - logger.debug("Remember searcher for {}", name); - } - return isearcher; + private IndexSearcher getSearcher(Map bucket, String name) throws IOException { + return new IndexSearcher(getReader(bucket, name)); } @PostConstruct @@ -734,7 +709,8 @@ private void init() { qpConf.set(ConfigurationKeys.ANALYZER, analyzer); qpConf.set(ConfigurationKeys.ALLOW_LEADING_WILDCARD, true); - facetsConfig.setMultiValued("sample", true); + facetsConfig.setMultiValued("sampleName", true); + facetsConfig.setMultiValued("parameterName", true); timer = new Timer("LuceneCommitTimer"); timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); @@ -815,9 +791,7 @@ public String investigationsFacet(@Context HttpServletRequest request, @QueryPar private Search investigationsQuery(HttpServletRequest request, Long uid) throws IOException, QueryNodeException { Search search = new Search(); searches.put(uid, search); - Map searcherMap = new HashMap<>(); Map readerMap = new HashMap<>(); - search.searcherMap = searcherMap; search.readerMap = readerMap; try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); @@ -827,7 +801,7 @@ private Search investigationsQuery(HttpServletRequest request, Long uid) throws if (userName != null) { Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", - new TermQuery(new Term("name", userName)), getSearcher(searcherMap, "InvestigationUser"), + new TermQuery(new Term("name", userName)), getSearcher(readerMap, "InvestigationUser"), ScoreMode.None); theQuery.add(iuQuery, Occur.MUST); } @@ -846,11 +820,11 @@ private Search investigationsQuery(HttpServletRequest request, Long uid) throws Occur.MUST); } - if (o.containsKey("params")) { - JsonArray params = o.getJsonArray("params"); - IndexSearcher investigationParameterSearcher = getSearcher(searcherMap, "InvestigationParameter"); + if (o.containsKey("parameters")) { + JsonArray parameters = o.getJsonArray("parameters"); + IndexSearcher investigationParameterSearcher = getSearcher(readerMap, "InvestigationParameter"); - for (JsonValue p : params) { + for (JsonValue p : parameters) { BooleanQuery.Builder paramQuery = parseParameter(p); Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", paramQuery.build(), investigationParameterSearcher, ScoreMode.None); @@ -860,7 +834,7 @@ private Search investigationsQuery(HttpServletRequest request, Long uid) throws if (o.containsKey("samples")) { JsonArray samples = o.getJsonArray("samples"); - IndexSearcher sampleSearcher = 
getSearcher(searcherMap, "Sample"); + IndexSearcher sampleSearcher = getSearcher(readerMap, "Sample"); for (JsonValue s : samples) { JsonString sample = (JsonString) s; @@ -876,7 +850,7 @@ private Search investigationsQuery(HttpServletRequest request, Long uid) throws if (userFullName != null) { BooleanQuery.Builder userFullNameQuery = new BooleanQuery.Builder(); userFullNameQuery.add(parser.parse(userFullName, "text"), Occur.MUST); - IndexSearcher investigationUserSearcher = getSearcher(searcherMap, "InvestigationUser"); + IndexSearcher investigationUserSearcher = getSearcher(readerMap, "InvestigationUser"); Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", userFullNameQuery.build(), investigationUserSearcher, ScoreMode.None); theQuery.add(toQuery, Occur.MUST); @@ -912,7 +886,6 @@ private String luceneFacetResult(String name, Search search, int maxResults, int logger.warn("No facets possible for maxResults={}, maxLabels={}, returning empty list", maxResults, maxLabels); results = new ArrayList<>(); } else { - // TODO Consider either making this approach uniform, or whether to only do it for entities where we facet DirectoryReader directoryReader = getReader(search.readerMap, name); IndexSearcher isearcher = new IndexSearcher(directoryReader); logger.debug("To facet in {} for {} {} with {} from {} ", name, search.query, maxResults, isearcher, @@ -960,7 +933,7 @@ private String luceneFacetResult(String name, Search search, int maxResults, int } private String luceneSearchResult(String name, Search search, int maxResults, Long uid) throws IOException { - IndexSearcher isearcher = getSearcher(search.searcherMap, name); + IndexSearcher isearcher = getSearcher(search.readerMap, name); logger.debug("To search in {} for {} {} with {} from {} ", name, search.query, maxResults, isearcher, search.lastDoc); TopDocs topDocs = search.lastDoc == null ? 
isearcher.search(search.query, maxResults) @@ -1053,7 +1026,6 @@ public void unlock(@PathParam("entityName") String entityName) throws LuceneExce logger.debug("Unlock has committed {} {} changes to Lucene - now have {} documents indexed", cached, entityName, bucket.indexWriter.getDocStats().numDocs); } - bucket.searcherManager.maybeRefreshBlocking(); bucket.readerManager.maybeRefreshBlocking(); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); From a4a822b92407df277ff568d204797a9060fb4d75 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Thu, 24 Mar 2022 00:15:59 +0000 Subject: [PATCH 29/73] Enable sorting of string fields #25 --- .../java/org/icatproject/lucene/Lucene.java | 96 +++++++++++++++---- 1 file changed, 79 insertions(+), 17 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 737542b..bf92cde 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -1,13 +1,16 @@ package org.icatproject.lucene; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; import java.net.HttpURLConnection; import java.nio.file.FileVisitOption; import java.nio.file.Files; +import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Timer; @@ -56,11 +59,14 @@ import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.BooleanQuery.Builder; +import org.apache.lucene.search.SortField.Type; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.SearcherManager; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TopDocs; @@ -99,6 +105,7 @@ public class Search { public Map map; public Query query; public ScoreDoc lastDoc; + public Sort sort; } enum When { @@ -398,7 +405,8 @@ private IndexBucket createBucket(String name) { @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) @Path("datafiles") - public String datafiles(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults) + public String datafiles(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults, + @QueryParam("sort") String sort) throws LuceneException { Long uid = null; @@ -408,6 +416,7 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("maxRes searches.put(uid, search); Map map = new HashMap<>(); search.map = map; + search.sort = parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); @@ -441,10 +450,10 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("maxRes Occur.MUST); } - if (o.containsKey("params")) { - JsonArray params = o.getJsonArray("params"); + if (o.containsKey("parameters")) { + JsonArray parameters = o.getJsonArray("parameters"); IndexSearcher datafileParameterSearcher = getSearcher(map, "DatafileParameter"); - for (JsonValue p : params) { + for (JsonValue p : parameters) { BooleanQuery.Builder paramQuery = 
parseParameter(p); Query toQuery = JoinUtil.createJoinQuery("datafile", false, "id", paramQuery.build(), datafileParameterSearcher, ScoreMode.None); @@ -484,7 +493,8 @@ public String datafilesAfter(@PathParam("uid") long uid, @QueryParam("maxResults @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) @Path("datasets") - public String datasets(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults) + public String datasets(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults, + @QueryParam("sort") String sort) throws LuceneException { Long uid = null; @@ -494,6 +504,7 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("maxResu searches.put(uid, search); Map map = new HashMap<>(); search.map = map; + search.sort = parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); String userName = o.getString("user", null); @@ -526,10 +537,10 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("maxResu Occur.MUST); } - if (o.containsKey("params")) { - JsonArray params = o.getJsonArray("params"); + if (o.containsKey("parameters")) { + JsonArray parameters = o.getJsonArray("parameters"); IndexSearcher datasetParameterSearcher = getSearcher(map, "DatasetParameter"); - for (JsonValue p : params) { + for (JsonValue p : parameters) { BooleanQuery.Builder paramQuery = parseParameter(p); Query toQuery = JoinUtil.createJoinQuery("dataset", false, "id", paramQuery.build(), datasetParameterSearcher, ScoreMode.None); @@ -667,7 +678,8 @@ public void run() { @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) @Path("investigations") - public String investigations(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults) + public String investigations(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults, + @QueryParam("sort") String sort) throws LuceneException { Long uid = null; try { @@ -676,6 +688,7 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("m searches.put(uid, search); Map map = new HashMap<>(); search.map = map; + search.sort = parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); String userName = o.getString("user", null); @@ -703,11 +716,11 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("m Occur.MUST); } - if (o.containsKey("params")) { - JsonArray params = o.getJsonArray("params"); + if (o.containsKey("parameters")) { + JsonArray parameters = o.getJsonArray("parameters"); IndexSearcher investigationParameterSearcher = getSearcher(map, "InvestigationParameter"); - for (JsonValue p : params) { + for (JsonValue p : parameters) { BooleanQuery.Builder paramQuery = parseParameter(p); Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", paramQuery.build(), investigationParameterSearcher, ScoreMode.None); @@ -789,8 +802,15 @@ private String luceneSearchResult(String name, Search search, int maxResults, Lo IndexSearcher isearcher = getSearcher(search.map, name); logger.debug("To search in {} for {} {} with {} from {} ", name, search.query, maxResults, isearcher, search.lastDoc); - TopDocs topDocs = search.lastDoc == null ? 
isearcher.search(search.query, maxResults) - : isearcher.searchAfter(search.lastDoc, search.query, maxResults); + TopDocs topDocs; + if (search.sort == null) { + // Use default score sorting + topDocs = search.lastDoc == null ? isearcher.search(search.query, maxResults) + : isearcher.searchAfter(search.lastDoc, search.query, maxResults); + } else { + topDocs = search.lastDoc == null ? isearcher.search(search.query, maxResults, search.sort) + : isearcher.searchAfter(search.lastDoc, search.query, maxResults, search.sort); + } ScoreDoc[] hits = topDocs.scoreDocs; Float maxScore; if (hits.length == 0) { @@ -810,7 +830,13 @@ private String luceneSearchResult(String name, Search search, int maxResults, Lo Document doc = isearcher.doc(hit.doc); gen.writeStartArray(); gen.write(Long.parseLong(doc.get("id"))); - gen.write(hit.score); + Float score = hit.score; + if (score.equals(Float.NaN)) { + // If we didn't sort by score, then this will be NaN + gen.write(-1.); + } else { + gen.write(hit.score); + } gen.writeEnd(); // array } gen.writeEnd(); // array results @@ -847,9 +873,11 @@ private Builder parseParameter(JsonValue p) { String pLowerDateValue = parameter.getString("lowerDateValue", null); String pUpperDateValue = parameter.getString("upperDateValue", null); Double pLowerNumericValue = parameter.containsKey("lowerNumericValue") - ? parameter.getJsonNumber("lowerNumericValue").doubleValue() : null; + ? parameter.getJsonNumber("lowerNumericValue").doubleValue() + : null; Double pUpperNumericValue = parameter.containsKey("upperNumericValue") - ? parameter.getJsonNumber("upperNumericValue").doubleValue() : null; + ? parameter.getJsonNumber("upperNumericValue").doubleValue() + : null; if (pStringValue != null) { paramQuery.add(new WildcardQuery(new Term("stringValue", pStringValue)), Occur.MUST); } else if (pLowerDateValue != null && pUpperDateValue != null) { @@ -863,6 +891,40 @@ private Builder parseParameter(JsonValue p) { return paramQuery; } + /** + * Parses the String from the request into a Lucene Sort object. Multiple sort + * criteria are supported, and will be applied in order. + * + * @param sort String representation of a JSON object with the field(s) to sort + * as keys, and the direction ("asc" or "desc") as value(s). 
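+ * A minimal illustration, assuming "date" and "name" are indexed as
+ * sortable string fields: sort={"date":"desc","name":"asc"} orders
+ * results by date, newest first, breaking ties by ascending name.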
+ * @return Lucene Sort object + * @throws LuceneException If the value for any key isn't "asc" or "desc" + */ + private Sort parseSort(String sort) throws LuceneException { + if (sort == null || sort.equals("")) { + return null; + } + try (JsonReader reader = Json.createReader(new ByteArrayInputStream(sort.getBytes()))) { + JsonObject object = reader.readObject(); + List fields = new ArrayList<>(); + for (String key : object.keySet()) { + String order = object.getString(key); + Boolean reverse; + if (order.equals("asc")) { + reverse = false; + } else if (order.equals("desc")) { + reverse = true; + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Sort order must be 'asc' or 'desc' but it was '" + order + "'"); + } + + fields.add(new SortField(key, Type.STRING, reverse)); + } + return new Sort(fields.toArray(new SortField[0])); + } + } + @POST @Path("unlock/{entityName}") public void unlock(@PathParam("entityName") String entityName) throws LuceneException { From 8eda4caf949ae3a4cd68727192deb861e36bf266 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Sat, 26 Mar 2022 00:13:27 +0000 Subject: [PATCH 30/73] Add support for fields and searchAfter #25 --- .../java/org/icatproject/lucene/Lucene.java | 268 ++++++++++-------- 1 file changed, 156 insertions(+), 112 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index bf92cde..5193897 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -4,14 +4,17 @@ import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; +import java.io.StringReader; import java.net.HttpURLConnection; import java.nio.file.FileVisitOption; import java.nio.file.Files; import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.Map.Entry; import java.util.Timer; import java.util.TimerTask; @@ -58,6 +61,7 @@ import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.BooleanQuery.Builder; import org.apache.lucene.search.SortField.Type; import org.apache.lucene.search.IndexSearcher; @@ -70,6 +74,8 @@ import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopFieldDocs; +import org.apache.lucene.search.TotalHits; import org.apache.lucene.search.WildcardQuery; import org.apache.lucene.search.join.JoinUtil; import org.apache.lucene.search.join.ScoreMode; @@ -104,8 +110,8 @@ private class IndexBucket { public class Search { public Map map; public Query query; - public ScoreDoc lastDoc; public Sort sort; + public Set fields = new HashSet(); } enum When { @@ -213,7 +219,7 @@ private void add(HttpServletRequest request, String entityName, When when, JsonP String name = null; String value = null; Double dvalue = null; - Store store = Store.NO; + Store store = Store.YES; Document doc = new Document(); parser.next(); // Skip the [ @@ -257,18 +263,26 @@ private void add(HttpServletRequest request, String entityName, When when, JsonP } else { throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, "Bad VALUE_TRUE " + attName); } + 
} else if (ev == Event.VALUE_FALSE) { + if (attName == AttributeName.store) { + store = Store.NO; + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, "Bad VALUE_FALSE " + attName); + } } else if (ev == Event.START_OBJECT) { fType = null; name = null; value = null; - store = Store.NO; + store = Store.YES; } else if (ev == Event.END_OBJECT) { if (fType == FieldType.TextField) { doc.add(new TextField(name, value, store)); } else if (fType == FieldType.StringField) { doc.add(new StringField(name, value, store)); } else if (fType == FieldType.SortedDocValuesField) { + // Any field we sort on must be stored to enable searching after doc.add(new SortedDocValuesField(name, new BytesRef(value))); + doc.add(new StoredField(name, value)); } else if (fType == FieldType.DoublePoint) { doc.add(new DoublePoint(name, dvalue)); if (store == Store.YES) { @@ -405,9 +419,8 @@ private IndexBucket createBucket(String name) { @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) @Path("datafiles") - public String datafiles(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults, - @QueryParam("sort") String sort) - throws LuceneException { + public String datafiles(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, + @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { Long uid = null; try { @@ -420,7 +433,8 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("maxRes try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); - String userName = o.getString("user", null); + JsonObject query = o.getJsonObject("query"); + String userName = query.getString("user", null); BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); @@ -438,20 +452,20 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("maxRes theQuery.add(dsQuery, Occur.MUST); } - String text = o.getString("text", null); + String text = query.getString("text", null); if (text != null) { theQuery.add(parser.parse(text, "text"), Occur.MUST); } - String lower = o.getString("lower", null); - String upper = o.getString("upper", null); + String lower = query.getString("lower", null); + String upper = query.getString("upper", null); if (lower != null && upper != null) { theQuery.add(new TermRangeQuery("date", new BytesRef(lower), new BytesRef(upper), true, true), Occur.MUST); } - if (o.containsKey("parameters")) { - JsonArray parameters = o.getJsonArray("parameters"); + if (query.containsKey("parameters")) { + JsonArray parameters = query.getJsonArray("parameters"); IndexSearcher datafileParameterSearcher = getSearcher(map, "DatafileParameter"); for (JsonValue p : parameters) { BooleanQuery.Builder paramQuery = parseParameter(p); @@ -461,9 +475,13 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("maxRes } } search.query = maybeEmptyQuery(theQuery); + if (o.containsKey("fields")) { + List jsonStrings = o.getJsonArray("fields").getValuesAs(JsonString.class); + jsonStrings.forEach((jsonString) -> search.fields.add(jsonString.getString())); + } } - return luceneSearchResult("Datafile", search, maxResults, uid); + return luceneSearchResult("Datafile", search, searchAfter, maxResults, uid); } catch (Exception e) { logger.error("Error", e); freeSearcher(uid); @@ -471,31 +489,12 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("maxRes } } - @GET - 
@Produces(MediaType.APPLICATION_JSON) - @Path("datafiles/{uid}") - public String datafilesAfter(@PathParam("uid") long uid, @QueryParam("maxResults") int maxResults) - throws LuceneException { - try { - Search search = searches.get(uid); - try { - return luceneSearchResult("Datafile", search, maxResults, null); - } catch (Exception e) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - } catch (Exception e) { - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - } - @POST @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) @Path("datasets") - public String datasets(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults, - @QueryParam("sort") String sort) - throws LuceneException { + public String datasets(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, + @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { Long uid = null; try { @@ -507,7 +506,8 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("maxResu search.sort = parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); - String userName = o.getString("user", null); + JsonObject query = o.getJsonObject("query"); + String userName = query.getString("user", null); BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); @@ -523,13 +523,13 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("maxResu theQuery.add(invQuery, Occur.MUST); } - String text = o.getString("text", null); + String text = query.getString("text", null); if (text != null) { theQuery.add(parser.parse(text, "text"), Occur.MUST); } - String lower = o.getString("lower", null); - String upper = o.getString("upper", null); + String lower = query.getString("lower", null); + String upper = query.getString("upper", null); if (lower != null && upper != null) { theQuery.add(new TermRangeQuery("startDate", new BytesRef(lower), new BytesRef(upper), true, true), Occur.MUST); @@ -537,8 +537,8 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("maxResu Occur.MUST); } - if (o.containsKey("parameters")) { - JsonArray parameters = o.getJsonArray("parameters"); + if (query.containsKey("parameters")) { + JsonArray parameters = query.getJsonArray("parameters"); IndexSearcher datasetParameterSearcher = getSearcher(map, "DatasetParameter"); for (JsonValue p : parameters) { BooleanQuery.Builder paramQuery = parseParameter(p); @@ -548,8 +548,12 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("maxResu } } search.query = maybeEmptyQuery(theQuery); + if (o.containsKey("fields")) { + List jsonStrings = o.getJsonArray("fields").getValuesAs(JsonString.class); + jsonStrings.forEach((jsonString) -> search.fields.add(jsonString.getString())); + } } - return luceneSearchResult("Dataset", search, maxResults, uid); + return luceneSearchResult("Dataset", search, searchAfter, maxResults, uid); } catch (Exception e) { logger.error("Error", e); freeSearcher(uid); @@ -558,24 +562,6 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("maxResu } - @GET - @Produces(MediaType.APPLICATION_JSON) - @Path("datasets/{uid}") - public String datasetsAfter(@PathParam("uid") long uid, @QueryParam("maxResults") int maxResults) - throws LuceneException { - try { - Search search = searches.get(uid); - try { - return 
luceneSearchResult("Dataset", search, maxResults, null); - } catch (Exception e) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - } catch (Exception e) { - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - } - @PreDestroy private void exit() { logger.info("Closing down icat.lucene"); @@ -678,9 +664,8 @@ public void run() { @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) @Path("investigations") - public String investigations(@Context HttpServletRequest request, @QueryParam("maxResults") int maxResults, - @QueryParam("sort") String sort) - throws LuceneException { + public String investigations(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, + @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { Long uid = null; try { uid = bucketNum.getAndIncrement(); @@ -691,7 +676,8 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("m search.sort = parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); - String userName = o.getString("user", null); + JsonObject query = o.getJsonObject("query"); + String userName = query.getString("user", null); BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); @@ -702,13 +688,13 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("m theQuery.add(iuQuery, Occur.MUST); } - String text = o.getString("text", null); + String text = query.getString("text", null); if (text != null) { theQuery.add(parser.parse(text, "text"), Occur.MUST); } - String lower = o.getString("lower", null); - String upper = o.getString("upper", null); + String lower = query.getString("lower", null); + String upper = query.getString("upper", null); if (lower != null && upper != null) { theQuery.add(new TermRangeQuery("startDate", new BytesRef(lower), new BytesRef(upper), true, true), Occur.MUST); @@ -716,8 +702,8 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("m Occur.MUST); } - if (o.containsKey("parameters")) { - JsonArray parameters = o.getJsonArray("parameters"); + if (query.containsKey("parameters")) { + JsonArray parameters = query.getJsonArray("parameters"); IndexSearcher investigationParameterSearcher = getSearcher(map, "InvestigationParameter"); for (JsonValue p : parameters) { @@ -728,8 +714,8 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("m } } - if (o.containsKey("samples")) { - JsonArray samples = o.getJsonArray("samples"); + if (query.containsKey("samples")) { + JsonArray samples = query.getJsonArray("samples"); IndexSearcher sampleSearcher = getSearcher(map, "Sample"); for (JsonValue s : samples) { @@ -742,7 +728,7 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("m } } - String userFullName = o.getString("userFullName", null); + String userFullName = query.getString("userFullName", null); if (userFullName != null) { BooleanQuery.Builder userFullNameQuery = new BooleanQuery.Builder(); userFullNameQuery.add(parser.parse(userFullName, "text"), Occur.MUST); @@ -753,9 +739,13 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("m } search.query = maybeEmptyQuery(theQuery); + if (o.containsKey("fields")) { + List jsonStrings = o.getJsonArray("fields").getValuesAs(JsonString.class); + jsonStrings.forEach((jsonString) -> 
search.fields.add(jsonString.getString())); + } } logger.info("Query: {}", search.query); - return luceneSearchResult("Investigation", search, maxResults, uid); + return luceneSearchResult("Investigation", search, searchAfter, maxResults, uid); } catch (Exception e) { logger.error("Error", e); freeSearcher(uid); @@ -764,24 +754,6 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("m } - @GET - @Produces(MediaType.APPLICATION_JSON) - @Path("investigations/{uid}") - public String investigationsAfter(@PathParam("uid") long uid, @QueryParam("maxResults") int maxResults) - throws LuceneException { - try { - Search search = searches.get(uid); - try { - return luceneSearchResult("Investigation", search, maxResults, null); - } catch (Exception e) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - } catch (Exception e) { - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - } - @POST @Path("lock/{entityName}") public void lock(@PathParam("entityName") String entityName) throws LuceneException { @@ -798,52 +770,91 @@ public void lock(@PathParam("entityName") String entityName) throws LuceneExcept } } - private String luceneSearchResult(String name, Search search, int maxResults, Long uid) throws IOException { + private String luceneSearchResult(String name, Search search, String searchAfter, int maxResults, Long uid) + throws IOException, LuceneException { IndexSearcher isearcher = getSearcher(search.map, name); logger.debug("To search in {} for {} {} with {} from {} ", name, search.query, maxResults, isearcher, - search.lastDoc); - TopDocs topDocs; + searchAfter); + FieldDoc searchAfterDoc = parseSearchAfter(searchAfter); + ScoreDoc[] hits; + TotalHits totalHits; + SortField[] fields = null; if (search.sort == null) { // Use default score sorting - topDocs = search.lastDoc == null ? isearcher.search(search.query, maxResults) - : isearcher.searchAfter(search.lastDoc, search.query, maxResults); + TopDocs topDocs; + topDocs = searchAfterDoc == null ? isearcher.search(search.query, maxResults) + : isearcher.searchAfter(searchAfterDoc, search.query, maxResults); + hits = topDocs.scoreDocs; + totalHits = topDocs.totalHits; } else { - topDocs = search.lastDoc == null ? isearcher.search(search.query, maxResults, search.sort) - : isearcher.searchAfter(search.lastDoc, search.query, maxResults, search.sort); + // Use specified sorting + TopFieldDocs topFieldDocs; + topFieldDocs = searchAfterDoc == null ? 
isearcher.search(search.query, maxResults, search.sort) + : isearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, false); + hits = topFieldDocs.scoreDocs; + totalHits = topFieldDocs.totalHits; + fields = topFieldDocs.fields; } - ScoreDoc[] hits = topDocs.scoreDocs; Float maxScore; if (hits.length == 0) { maxScore = Float.NaN; } else { maxScore = hits[0].score; } - logger.debug("Hits " + topDocs.totalHits + " maxscore " + maxScore); + logger.debug("Hits " + totalHits + " maxscore " + maxScore); ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (JsonGenerator gen = Json.createGenerator(baos)) { gen.writeStartObject(); - if (uid != null) { - gen.write("uid", uid); - } gen.writeStartArray("results"); for (ScoreDoc hit : hits) { Document doc = isearcher.doc(hit.doc); - gen.writeStartArray(); - gen.write(Long.parseLong(doc.get("id"))); + gen.writeStartObject().write("id", Long.parseLong(doc.get("id"))); Float score = hit.score; - if (score.equals(Float.NaN)) { - // If we didn't sort by score, then this will be NaN - gen.write(-1.); - } else { - gen.write(hit.score); + if (!score.equals(Float.NaN)) { + gen.write("score", hit.score); } - gen.writeEnd(); // array + gen.writeStartObject("source"); + doc.forEach((field) -> { + if (search.fields.contains(field.name())) { + if (field.stringValue() != null) { + gen.write(field.name(), field.stringValue()); + } else if (field.numericValue() != null) { + gen.write(field.name(), field.numericValue().doubleValue()); + } + } + }); + gen.writeEnd(); + gen.writeEnd(); // result object } gen.writeEnd(); // array results + if (hits.length == maxResults) { + ScoreDoc lastDoc = hits[hits.length - 1]; + gen.writeStartObject("search_after").write("doc", lastDoc.doc).write("shardIndex", lastDoc.shardIndex); + float lastScore = lastDoc.score; + if (!Float.isNaN(lastScore)) { + gen.write("score", lastScore); + } + if (fields != null) { + Document lastDocument = isearcher.doc(lastDoc.doc); + gen.writeStartArray("fields"); + for (SortField sortField : fields) { + Type type = sortField.getType(); + if (type.equals(Type.STRING)) { + String lastValue = lastDocument.get(sortField.getField()); + if (lastValue == null) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Field " + + sortField.getField() + + " used for sorting was not present on the Lucene Document; all sortable fields must also be stored."); + } + gen.write(lastValue); + } + } + gen.writeEnd(); + } + gen.writeEnd(); + } gen.writeEnd(); // object } - - search.lastDoc = hits.length == 0 ? null : hits[hits.length - 1]; logger.debug("Json returned {}", baos.toString()); return baos.toString(); } @@ -925,6 +936,39 @@ private Sort parseSort(String sort) throws LuceneException { } } + /** + * Parses a Lucene ScoreDoc to be "searched after" from a String representation + * of a JSON object. + * + * @param searchAfter String representation of a JSON object containing the + * document number ("doc", int), the "shardIndex" (int) and, + * optionally, the "score" (float) and the string values of + * any sort "fields" (array). + * @return FieldDoc object built from the provided String, or null if + * searchAfter was itself null or an empty String. 
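+ * A purely illustrative example of the expected shape:
+ * {"doc":5,"shardIndex":0,"score":1.23,"fields":["someName"]}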
+ */ + private FieldDoc parseSearchAfter(String searchAfter) { + if (searchAfter != null && !searchAfter.equals("")) { + logger.debug("Attempting to parseSearchAfter from {}", searchAfter); + JsonReader reader = Json.createReader(new StringReader(searchAfter)); + JsonObject object = reader.readObject(); + int doc = object.getInt("doc"); + int shardIndex = object.getInt("shardIndex"); + float score = Float.NaN; + List fields = new ArrayList<>(); + if (object.containsKey("score")) { + score = object.getJsonNumber("score").bigDecimalValue().floatValue(); + } + if (object.containsKey("fields")) { + List jsonStrings = object.getJsonArray("fields").getValuesAs(JsonString.class); + for (JsonString jsonString : jsonStrings) { + fields.add(new BytesRef(jsonString.getString())); + } + } + return new FieldDoc(doc, score, fields.toArray(), shardIndex); + } + return null; + } + @POST @Path("unlock/{entityName}") public void unlock(@PathParam("entityName") String entityName) throws LuceneException { From 851cedb7b8dd6e1915bbe74cab67752d72e046c6 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Mon, 4 Apr 2022 11:17:46 +0000 Subject: [PATCH 31/73] Implement incremental sharding #26 --- src/main/config/run.properties.example | 1 + .../java/org/icatproject/lucene/Lucene.java | 321 +++++++++++++----- src/main/resources/run.properties | 1 + 3 files changed, 237 insertions(+), 86 deletions(-) diff --git a/src/main/config/run.properties.example b/src/main/config/run.properties.example index b010790..4aeab39 100644 --- a/src/main/config/run.properties.example +++ b/src/main/config/run.properties.example @@ -3,4 +3,5 @@ directory = ${HOME}/data/lucene commitSeconds = 5 +maxShardSize = 2147483648 ip = 127.0.0.1/32 diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 11b1d5b..8b6b0b4 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -6,8 +6,10 @@ import java.net.HttpURLConnection; import java.nio.file.FileVisitOption; import java.nio.file.Files; +import java.util.ArrayList; import java.util.Comparator; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Timer; @@ -47,8 +49,11 @@ import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; +import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.MultiReader; +import org.apache.lucene.index.ReaderManager; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler; @@ -60,7 +65,6 @@ import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.SearcherManager; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TopDocs; @@ -88,15 +92,185 @@ enum FieldType { TextField, StringField, SortedDocValuesField, DoublePoint } - private class IndexBucket { + private class ShardBucket { private FSDirectory directory; private IndexWriter indexWriter; - private SearcherManager searcherManager; + private ReaderManager readerManager; + + /** + * Creates a bucket for accessing the 
read and write functionality for a single + * "shard" Lucene index, which can then be grouped with others to represent a single document + * type. + * + * @param shardPath Path to the directory used as storage for this shard. + * @throws IOException + */ + public ShardBucket(java.nio.file.Path shardPath) throws IOException { + directory = FSDirectory.open(shardPath); + IndexWriterConfig config = new IndexWriterConfig(analyzer); + indexWriter = new IndexWriter(directory, config); + String[] files = directory.listAll(); + if (files.length == 1 && files[0].equals("write.lock")) { + logger.debug("Directory only has the write.lock file so store and delete a dummy document"); + Document doc = new Document(); + doc.add(new StringField("dummy", "dummy", Store.NO)); + indexWriter.addDocument(doc); + indexWriter.commit(); + indexWriter.deleteDocuments(new Term("dummy", "dummy")); + indexWriter.commit(); + logger.debug("Now have " + indexWriter.getDocStats().numDocs + " documents indexed"); + } + readerManager = new ReaderManager(indexWriter); + } + } + + private class IndexBucket { + private String entityName; + private Map<Long, ShardBucket> shardMap = new HashMap<>(); + private AtomicBoolean locked = new AtomicBoolean(); + + /** + * Creates a bucket for accessing the high level functionality, such as + * searching, for a single document type. Incoming documents will be routed to + * one of the individual "shard" indices that are grouped by this Object. + * + * @param entityName The name of the entity that this index contains documents + * for. + */ + public IndexBucket(String entityName) { + try { + this.entityName = entityName; + Long shardIndex = 0L; + java.nio.file.Path shardPath = luceneDirectory.resolve(entityName); + do { + ShardBucket shardBucket = new ShardBucket(shardPath); + shardMap.put(shardIndex, shardBucket); + shardIndex++; + shardPath = luceneDirectory.resolve(entityName + "_" + shardIndex); + } while (Files.isDirectory(shardPath)); + logger.debug("Bucket for {} is now ready with {} shards", entityName, shardIndex); + } catch (Throwable e) { + logger.error("Can't continue " + e.getClass() + " " + e.getMessage()); + } + } + + /** + * Acquires DirectoryReaders from the ReaderManagers of the individual shards in + * this bucket. + * + * @return Array of DirectoryReaders for all shards in this bucket. + * @throws IOException + */ + public DirectoryReader[] acquireReaders() throws IOException { + List<DirectoryReader> subReaders = new ArrayList<>(); + for (ShardBucket shardBucket : shardMap.values()) { + subReaders.add(shardBucket.readerManager.acquire()); + } + return subReaders.toArray(new DirectoryReader[0]); + } + + /** + * Creates a new ShardBucket and stores it in the shardMap. + * + * @param shardKey The identifier for the new shard to be created. For + * simplicity, this should be a Long starting at 0 and incrementing + * by 1 for each new shard. + * @return A new ShardBucket with the provided shardKey. + * @throws IOException + */ + public ShardBucket buildShardBucket(Long shardKey) throws IOException { + ShardBucket shardBucket = new ShardBucket(luceneDirectory.resolve(entityName + "_" + shardKey)); + shardMap.put(shardKey, shardBucket); + return shardBucket; + } + + /** + * Commits Documents for writing on all "shard" indices for this bucket. + * + * @param command The high level command which called this function. Only + * used for debug logging. + * @param entityName The name of the entities being committed. Only used for + * debug logging. 
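+ * (For example, command is "Synch" when this is triggered by the
+ * periodic LuceneCommitTimer task, as in commit() below.)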
+ * @throws IOException + */ + public void commit(String command, String entityName) throws IOException { + for (Entry entry : shardMap.entrySet()) { + ShardBucket shardBucket = entry.getValue(); + int cached = shardBucket.indexWriter.numRamDocs(); + shardBucket.indexWriter.commit(); + if (cached != 0) { + logger.debug("{} has committed {} {} changes to Lucene - now have {} documents indexed in shard {}", + command, cached, entityName, shardBucket.indexWriter.getDocStats().numDocs, entry.getKey()); + } + shardBucket.readerManager.maybeRefreshBlocking(); + } + } + + /** + * Commits and closes all "shard" indices for this bucket. + * + * @throws IOException + */ + public void close() throws IOException { + for (ShardBucket shardBucket : shardMap.values()) { + shardBucket.readerManager.close(); + shardBucket.indexWriter.commit(); + shardBucket.indexWriter.close(); + shardBucket.directory.close(); + } + } + + /** + * Provides the ShardBucket that should be used for reading/writing the Document + * with the provided id. All ids up to luceneMaxShardSize are indexed in the + * first shard, after that a new shard is created for the next + * luceneMaxShardSize Documents and so on. + * + * @param id The id of a Document to be routed. + * @return The ShardBucket that the relevant Document is/should be indexed in. + * @throws IOException + */ + public ShardBucket routeShard(Long id) throws IOException { + if (id == null) { + // If we don't have id, provide the first bucket + return shardMap.get(0L); + } + Long shard = id / luceneMaxShardSize; + ShardBucket shardBucket = shardMap.get(shard); + if (shardBucket == null) { + shardBucket = buildShardBucket(shard); + } + return shardBucket; + } + + /** + * Provides the IndexWriter that should be used for writing the Document with + * the provided id. + * + * @param id The id of a Document to be routed. + * @return The relevant IndexWriter. 
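+ * As a worked example using the example configuration's value of
+ * maxShardSize = 2147483648: a Datafile with id 5000000000 routes to
+ * shard 5000000000 / 2147483648 = 2, i.e. the directory "Datafile_2".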
+ * @throws IOException + */ + public IndexWriter getWriter(Long id) throws IOException { + return routeShard(id).indexWriter; + } + + public void releaseReaders(DirectoryReader[] subReaders) throws IOException, LuceneException { + if (subReaders.length != shardMap.size()) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, + "Was expecting the same number of DirectoryReaders as ShardBuckets, but had " + + subReaders.length + ", " + shardMap.size() + " respectively."); + } + int i = 0; + for (ShardBucket shardBucket : shardMap.values()) { + shardBucket.readerManager.release(subReaders[i]); + i++; + } + } } public class Search { - public Map map; + public Map map; public Query query; public ScoreDoc lastDoc; } @@ -112,6 +286,7 @@ enum When { private java.nio.file.Path luceneDirectory; private int luceneCommitMillis; + private Long luceneMaxShardSize; private AtomicLong bucketNum = new AtomicLong(); private Map indexBuckets = new ConcurrentHashMap<>(); @@ -170,12 +345,13 @@ public void modify(@Context HttpServletRequest request) throws LuceneException { ev = parser.next(); if (ev == Event.VALUE_NULL) { try { - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> createBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); if (bucket.locked.get()) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + entityName); } - bucket.indexWriter.deleteDocuments(new Term("id", Long.toString(id))); + ShardBucket shardBucket = bucket.routeShard(id); + shardBucket.indexWriter.deleteDocuments(new Term("id", Long.toString(id))); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -199,7 +375,7 @@ public void modify(@Context HttpServletRequest request) throws LuceneException { private void add(HttpServletRequest request, String entityName, When when, JsonParser parser, Long id) throws LuceneException, IOException { - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> createBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); AttributeName attName = null; FieldType fType = null; @@ -274,13 +450,20 @@ private void add(HttpServletRequest request, String entityName, When when, JsonP throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + entityName); } - bucket.indexWriter.addDocument(doc); + String documentId = doc.get("id"); + if (documentId == null) { + logger.warn( + "Adding Document without an id field is not recommended, routing, updates and deletions will not be available for this Document."); + bucket.getWriter(null).addDocument(doc); + } else { + bucket.getWriter(Long.valueOf(documentId)).addDocument(doc); + } } else { if (bucket.locked.get()) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + entityName); } - bucket.indexWriter.updateDocument(new Term("id", id.toString()), doc); + bucket.getWriter(id).updateDocument(new Term("id", id.toString()), doc); } return; } else { @@ -352,13 +535,7 @@ public void commit() throws LuceneException { for (Entry entry : indexBuckets.entrySet()) { IndexBucket bucket = entry.getValue(); if (!bucket.locked.get()) { - int cached = bucket.indexWriter.numRamDocs(); - bucket.indexWriter.commit(); - if (cached != 0) { - logger.debug("Synch has committed {} {} changes to Lucene - now have {} documents indexed", - cached, entry.getKey(), 
@@ -366,34 +543,6 @@
         }
     }

-    private IndexBucket createBucket(String name) {
-        try {
-            IndexBucket bucket = new IndexBucket();
-            FSDirectory directory = FSDirectory.open(luceneDirectory.resolve(name));
-            bucket.directory = directory;
-            IndexWriterConfig config = new IndexWriterConfig(analyzer);
-            IndexWriter iwriter = new IndexWriter(directory, config);
-            String[] files = directory.listAll();
-            if (files.length == 1 && files[0].equals("write.lock")) {
-                logger.debug("Directory only has the write.lock file so store and delete a dummy document");
-                Document doc = new Document();
-                doc.add(new StringField("dummy", "dummy", Store.NO));
-                iwriter.addDocument(doc);
-                iwriter.commit();
-                iwriter.deleteDocuments(new Term("dummy", "dummy"));
-                iwriter.commit();
-                logger.debug("Now have " + iwriter.getDocStats().numDocs + " documents indexed");
-            }
-            bucket.indexWriter = iwriter;
-            bucket.searcherManager = new SearcherManager(iwriter, false, false, null);
-            logger.debug("Bucket for {} is now ready", name);
-            return bucket;
-        } catch (Throwable e) {
-            logger.error("Can't continue " + e.getClass() + " " + e.getMessage());
-            return null;
-        }
-    }
-
     @POST
     @Consumes(MediaType.APPLICATION_JSON)
     @Produces(MediaType.APPLICATION_JSON)
@@ -406,7 +555,7 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("maxRes
             uid = bucketNum.getAndIncrement();
             Search search = new Search();
             searches.put(uid, search);
-            Map<String, IndexSearcher> map = new HashMap<>();
+            Map<String, DirectoryReader[]> map = new HashMap<>();
             search.map = map;
             try (JsonReader r = Json.createReader(request.getInputStream())) {
@@ -441,10 +590,10 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("maxRes
                             Occur.MUST);
                 }

-                if (o.containsKey("params")) {
-                    JsonArray params = o.getJsonArray("params");
+                if (o.containsKey("parameters")) {
+                    JsonArray parameters = o.getJsonArray("parameters");
                     IndexSearcher datafileParameterSearcher = getSearcher(map, "DatafileParameter");
-                    for (JsonValue p : params) {
+                    for (JsonValue p : parameters) {
                         BooleanQuery.Builder paramQuery = parseParameter(p);
                         Query toQuery = JoinUtil.createJoinQuery("datafile", false, "id", paramQuery.build(),
                                 datafileParameterSearcher, ScoreMode.None);
                         theQuery.add(toQuery, Occur.MUST);
                     }
@@ -492,7 +641,7 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("maxResu
             uid = bucketNum.getAndIncrement();
             Search search = new Search();
             searches.put(uid, search);
-            Map<String, IndexSearcher> map = new HashMap<>();
+            Map<String, DirectoryReader[]> map = new HashMap<>();
             search.map = map;
             try (JsonReader r = Json.createReader(request.getInputStream())) {
                 JsonObject o = r.readObject();
@@ -526,10 +675,10 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("maxResu
                             Occur.MUST);
                 }

-                if (o.containsKey("params")) {
-                    JsonArray params = o.getJsonArray("params");
+                if (o.containsKey("parameters")) {
+                    JsonArray parameters = o.getJsonArray("parameters");
                     IndexSearcher datasetParameterSearcher = getSearcher(map, "DatasetParameter");
-                    for (JsonValue p : params) {
+                    for (JsonValue p : parameters) {
                         BooleanQuery.Builder paramQuery = parseParameter(p);
                         Query toQuery = JoinUtil.createJoinQuery("dataset", false, "id", paramQuery.build(),
                                 datasetParameterSearcher, ScoreMode.None);
                         theQuery.add(toQuery, Occur.MUST);
                     }
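The datafiles and datasets endpoints share the same two-step join: match parameter Documents first, then map the ids of their owning entities onto the main index. The datafile case in isolation (the parameter name is invented; the searcher is the one obtained through getSearcher above):

    // Join DatafileParameter matches onto Datafile documents (Lucene join module)
    BooleanQuery.Builder paramQuery = new BooleanQuery.Builder();
    paramQuery.add(new WildcardQuery(new Term("name", "temperature*")), Occur.MUST);
    Query joined = JoinUtil.createJoinQuery(
            "datafile",                // fromField: id of the owning Datafile
            false,                     // a parameter holds a single datafile id
            "id",                      // toField on the Datafile index
            paramQuery.build(),
            datafileParameterSearcher, // searcher over the DatafileParameter index
            ScoreMode.None);           // parameter hits do not affect ranking
    theQuery.add(joined, Occur.MUST);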
@@ -574,12 +723,8 @@ private void exit() {
             timer = null; // This seems to be necessary to make it really stop
         }
         try {
-            for (Entry<String, IndexBucket> entry : indexBuckets.entrySet()) {
-                IndexBucket bucket = entry.getValue();
-                bucket.searcherManager.close();
-                bucket.indexWriter.commit();
-                bucket.indexWriter.close();
-                bucket.directory.close();
+            for (IndexBucket bucket : indexBuckets.values()) {
+                bucket.close();
             }
             logger.info("Closed down icat.lucene");
         } catch (Exception e) {
@@ -592,13 +737,12 @@ public void freeSearcher(@PathParam("uid") Long uid) throws LuceneException {
         if (uid != null) { // May not be set for internal calls
             logger.debug("Requesting freeSearcher {}", uid);
-            Map<String, IndexSearcher> search = searches.get(uid).map;
-            for (Entry<String, IndexSearcher> entry : search.entrySet()) {
+            Map<String, DirectoryReader[]> search = searches.get(uid).map;
+            for (Entry<String, DirectoryReader[]> entry : search.entrySet()) {
                 String name = entry.getKey();
-                IndexSearcher isearcher = entry.getValue();
-                SearcherManager manager = indexBuckets.computeIfAbsent(name, k -> createBucket(k)).searcherManager;
+                DirectoryReader[] subReaders = entry.getValue();
                 try {
-                    manager.release(isearcher);
+                    indexBuckets.computeIfAbsent(name, k -> new IndexBucket(k)).releaseReaders(subReaders);
                 } catch (IOException e) {
                     throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage());
                 }
@@ -610,14 +754,14 @@ public void freeSearcher(@PathParam("uid") Long uid) throws LuceneException {
     /*
     * Need a new set of IndexSearchers for each search as identified by a uid
     */
-    private IndexSearcher getSearcher(Map<String, IndexSearcher> bucket, String name) throws IOException {
-        IndexSearcher isearcher = bucket.get(name);
-        if (isearcher == null) {
-            isearcher = indexBuckets.computeIfAbsent(name, k -> createBucket(k)).searcherManager.acquire();
-            bucket.put(name, isearcher);
+    private IndexSearcher getSearcher(Map<String, DirectoryReader[]> bucket, String name) throws IOException {
+        DirectoryReader[] subReaders = bucket.get(name);
+        if (subReaders == null) {
+            subReaders = indexBuckets.computeIfAbsent(name, k -> new IndexBucket(k)).acquireReaders();
+            bucket.put(name, subReaders);
             logger.debug("Remember searcher for {}", name);
         }
-        return isearcher;
+        return new IndexSearcher(new MultiReader(subReaders, false));
     }

     @PostConstruct
@@ -633,6 +777,7 @@ private void init() {
         }

         luceneCommitMillis = props.getPositiveInt("commitSeconds") * 1000;
+        luceneMaxShardSize = Math.max(props.getPositiveLong("maxShardSize"), Integer.MAX_VALUE + 1L);

         analyzer = new IcatAnalyzer();
@@ -674,7 +819,7 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("m
             uid = bucketNum.getAndIncrement();
             Search search = new Search();
             searches.put(uid, search);
-            Map<String, IndexSearcher> map = new HashMap<>();
+            Map<String, DirectoryReader[]> map = new HashMap<>();
             search.map = map;
             try (JsonReader r = Json.createReader(request.getInputStream())) {
                 JsonObject o = r.readObject();
@@ -703,11 +848,11 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("m
                             Occur.MUST);
                 }

-                if (o.containsKey("params")) {
-                    JsonArray params = o.getJsonArray("params");
+                if (o.containsKey("parameters")) {
+                    JsonArray parameters = o.getJsonArray("parameters");
                     IndexSearcher investigationParameterSearcher = getSearcher(map, "InvestigationParameter");

-                    for (JsonValue p : params) {
+                    for (JsonValue p : parameters) {
                         BooleanQuery.Builder paramQuery = parseParameter(p);
                         Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", paramQuery.build(),
                                 investigationParameterSearcher, ScoreMode.None);
                         theQuery.add(toQuery, Occur.MUST);
                     }
@@ -773,13 +918,15 @@ public String investigationsAfter(@PathParam("uid") long uid, @QueryParam("maxRe
     @Path("lock/{entityName}")
     public void lock(@PathParam("entityName") String entityName) throws LuceneException {
         logger.info("Requesting lock of
{} index", entityName); - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> createBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); if (!bucket.locked.compareAndSet(false, true)) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene already locked for " + entityName); } try { - bucket.indexWriter.deleteAll(); + for (ShardBucket shardBucket : bucket.shardMap.values()) { + shardBucket.indexWriter.deleteAll(); + } } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -792,7 +939,13 @@ private String luceneSearchResult(String name, Search search, int maxResults, Lo TopDocs topDocs = search.lastDoc == null ? isearcher.search(search.query, maxResults) : isearcher.searchAfter(search.lastDoc, search.query, maxResults); ScoreDoc[] hits = topDocs.scoreDocs; - logger.debug("Hits " + topDocs.totalHits + " maxscore " + topDocs.scoreDocs[0].score); + Float maxScore; + if (hits.length == 0) { + maxScore = Float.NaN; + } else { + maxScore = hits[0].score; + } + logger.debug("Hits " + topDocs.totalHits + " maxscore " + maxScore); ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (JsonGenerator gen = Json.createGenerator(baos)) { gen.writeStartObject(); @@ -841,9 +994,11 @@ private Builder parseParameter(JsonValue p) { String pLowerDateValue = parameter.getString("lowerDateValue", null); String pUpperDateValue = parameter.getString("upperDateValue", null); Double pLowerNumericValue = parameter.containsKey("lowerNumericValue") - ? parameter.getJsonNumber("lowerNumericValue").doubleValue() : null; + ? parameter.getJsonNumber("lowerNumericValue").doubleValue() + : null; Double pUpperNumericValue = parameter.containsKey("upperNumericValue") - ? parameter.getJsonNumber("upperNumericValue").doubleValue() : null; + ? 
parameter.getJsonNumber("upperNumericValue").doubleValue() + : null; if (pStringValue != null) { paramQuery.add(new WildcardQuery(new Term("stringValue", pStringValue)), Occur.MUST); } else if (pLowerDateValue != null && pUpperDateValue != null) { @@ -861,19 +1016,13 @@ private Builder parseParameter(JsonValue p) { @Path("unlock/{entityName}") public void unlock(@PathParam("entityName") String entityName) throws LuceneException { logger.debug("Requesting unlock of {} index", entityName); - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> createBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); if (!bucket.locked.compareAndSet(true, false)) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene is not currently locked for " + entityName); } try { - int cached = bucket.indexWriter.numRamDocs(); - bucket.indexWriter.commit(); - if (cached != 0) { - logger.debug("Unlock has committed {} {} changes to Lucene - now have {} documents indexed", cached, - entityName, bucket.indexWriter.getDocStats().numDocs); - } - bucket.searcherManager.maybeRefreshBlocking(); + bucket.commit("Unlock", entityName); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } diff --git a/src/main/resources/run.properties b/src/main/resources/run.properties index b010790..4aeab39 100644 --- a/src/main/resources/run.properties +++ b/src/main/resources/run.properties @@ -3,4 +3,5 @@ directory = ${HOME}/data/lucene commitSeconds = 5 +maxShardSize = 2147483648 ip = 127.0.0.1/32 From 9477ea84b4ef5e89cbc044ec33627b11de3fb757 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 6 Apr 2022 03:30:42 +0000 Subject: [PATCH 32/73] Rename JSON keys for clarity over id #18 --- .../java/org/icatproject/lucene/Lucene.java | 60 ++++++++++++------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index ba9c859..400caf1 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -71,7 +71,6 @@ import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.SearcherManager; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TermQuery; @@ -457,7 +456,7 @@ private void add(HttpServletRequest request, String entityName, When when, JsonP } else if (fType == FieldType.SortedDocValuesField) { // Any field we sort on must be stored to enable searching after doc.add(new SortedDocValuesField(name, new BytesRef(value))); - doc.add(new StoredField(name, value)); + doc.add(new StoredField(name, value)); // TODO potentially remove this, or the version in LuceneApi } else if (fType == FieldType.DoublePoint) { doc.add(new DoublePoint(name, dvalue)); if (store == Store.YES) { @@ -710,6 +709,41 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("search_ } + /** + * Encodes core Lucene information (keys preceded by underscores) and a + * selection of the Document's source fields to JSON to be returned to + * icat.server. Note that "_id" is the Lucene Document id, and should not be + * confused with the ICAT entity id, which should be denoted by the key "id" + * within the "_source" object. + * + * @param gen JsonGenerator to encode the information to. 
+ * @param hit ScoreDoc representing a single search result. + * @param searcher IndexSearcher used to get the Document for the hit. + * @param search Search object containing the fields to return. + * @throws IOException + */ + private void encodeResult(JsonGenerator gen, ScoreDoc hit, IndexSearcher searcher, Search search) + throws IOException { + int luceneDocId = hit.doc; + Document document = searcher.doc(luceneDocId); + gen.writeStartObject().write("_id", luceneDocId); + Float score = hit.score; + if (!score.equals(Float.NaN)) { + gen.write("_score", hit.score); + } + gen.writeStartObject("_source"); + document.forEach((field) -> { + if (search.fields.contains(field.name())) { + if (field.stringValue() != null) { + gen.write(field.name(), field.stringValue()); + } else if (field.numericValue() != null) { + gen.write(field.name(), field.numericValue().doubleValue()); + } + } + }); + gen.writeEnd().writeEnd(); // source object, result object + } + @PreDestroy private void exit() { logger.info("Closing down icat.lucene"); @@ -950,27 +984,9 @@ private String luceneSearchResult(String name, Search search, String searchAfter logger.debug("Hits " + totalHits + " maxscore " + maxScore); ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (JsonGenerator gen = Json.createGenerator(baos)) { - gen.writeStartObject(); - gen.writeStartArray("results"); + gen.writeStartObject().writeStartArray("results"); for (ScoreDoc hit : hits) { - Document doc = isearcher.doc(hit.doc); - gen.writeStartObject().write("id", Long.parseLong(doc.get("id"))); - Float score = hit.score; - if (!score.equals(Float.NaN)) { - gen.write("score", hit.score); - } - gen.writeStartObject("source"); - doc.forEach((field) -> { - if (search.fields.contains(field.name())) { - if (field.stringValue() != null) { - gen.write(field.name(), field.stringValue()); - } else if (field.numericValue() != null) { - gen.write(field.name(), field.numericValue().doubleValue()); - } - } - }); - gen.writeEnd(); - gen.writeEnd(); // result object + encodeResult(gen, hit, isearcher, search); } gen.writeEnd(); // array results if (hits.length == maxResults) { From 434b66b8e5d33d93ee508da680abb106bfd1235f Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Fri, 8 Apr 2022 09:28:53 +0100 Subject: [PATCH 33/73] Text fields and related entities #30 --- .../java/org/icatproject/lucene/Lucene.java | 777 +++++++++++------- 1 file changed, 498 insertions(+), 279 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 400caf1..4a69314 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -8,13 +8,16 @@ import java.net.HttpURLConnection; import java.nio.file.FileVisitOption; import java.nio.file.Files; +import java.text.SimpleDateFormat; import java.util.ArrayList; +import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TimeZone; import java.util.Map.Entry; import java.util.Timer; import java.util.TimerTask; @@ -27,13 +30,13 @@ import javax.ejb.Singleton; import javax.json.Json; import javax.json.JsonArray; +import javax.json.JsonNumber; import javax.json.JsonObject; import javax.json.JsonReader; import javax.json.JsonString; import javax.json.JsonValue; +import javax.json.JsonValue.ValueType; import javax.json.stream.JsonGenerator; -import javax.json.stream.JsonParser; -import 
javax.json.stream.JsonParser.Event; import javax.servlet.http.HttpServletRequest; import javax.ws.rs.Consumes; import javax.ws.rs.DELETE; @@ -48,20 +51,21 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.DoublePoint; +import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.SortedNumericDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.MultiReader; import org.apache.lucene.index.ReaderManager; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; -import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler; -import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FieldDoc; @@ -73,8 +77,8 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedNumericSortField; import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TermRangeQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopFieldDocs; import org.apache.lucene.search.TotalHits; @@ -83,6 +87,7 @@ import org.apache.lucene.search.join.ScoreMode; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.NumericUtils; import org.icatproject.lucene.exceptions.LuceneException; import org.icatproject.utils.CheckedProperties; import org.slf4j.Logger; @@ -94,14 +99,6 @@ @Singleton public class Lucene { - enum AttributeName { - type, name, value, date, store - } - - enum FieldType { - TextField, StringField, SortedDocValuesField, DoublePoint - } - private class ShardBucket { private FSDirectory directory; private IndexWriter indexWriter; @@ -286,13 +283,88 @@ public class Search { public Set fields = new HashSet(); } - enum When { - Now, Sometime + private static class ParentRelationship { + public String parentName; + public String fieldPrefix; + + public ParentRelationship(String parentName, String fieldPrefix) { + this.parentName = parentName; + this.fieldPrefix = fieldPrefix; + } + } private static final Logger logger = LoggerFactory.getLogger(Lucene.class); - private static final Marker fatal = MarkerFactory.getMarker("FATAL"); + private static final SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmm"); + + private static final Set doubleFields = new HashSet<>(); + private static final Set longFields = new HashSet<>(); + private static final Set sortFields = new HashSet<>(); + private static final Set textFields = new HashSet<>(); + private static final Set indexedEntities = new HashSet<>(); + private static final Map relationships = new HashMap<>(); + + private static final IcatAnalyzer analyzer = new IcatAnalyzer(); + private static final StandardQueryParser genericParser = new StandardQueryParser(); + private static final StandardQueryParser datafileParser = new 
StandardQueryParser(); + private static final StandardQueryParser datasetParser = new StandardQueryParser(); + private static final StandardQueryParser investigationParser = new StandardQueryParser(); + private static final StandardQueryParser sampleParser = new StandardQueryParser(); + + static { + TimeZone tz = TimeZone.getTimeZone("GMT"); + df.setTimeZone(tz); + + doubleFields.add("numericValue"); + longFields.addAll(Arrays.asList("date", "startDate", "endDate", "dateTimeValue")); + sortFields.addAll(Arrays.asList("datafile.id", "dataset.id", "investigation.id", "id", "date", "startDate", + "endDate", "name")); + textFields.addAll(Arrays.asList("name", "visitId", "description", "datafileFormat.name", "sample.name", + "sample.type.name", "title", "summary", "facility.name", "user.fullName")); + + indexedEntities.addAll(Arrays.asList("Datafile", "Dataset", "Investigation", "DatafileParameter", + "DatasetParameter", "InvestigationParameter", "InvestigationUser", "Sample")); + + relationships.put("User", new ParentRelationship[] { new ParentRelationship("InvestigationUser", "user") }); + relationships.put("Sample", new ParentRelationship[] { new ParentRelationship("Dataset", "sample") }); + relationships.put("SampleType", new ParentRelationship[] { new ParentRelationship("Sample", "type"), + new ParentRelationship("Dataset", "sample.type") }); + relationships.put("InvestigationType", + new ParentRelationship[] { new ParentRelationship("Investigation", "type") }); + relationships.put("DatasetType", new ParentRelationship[] { new ParentRelationship("Dataset", "type") }); + relationships.put("DatafileFormat", + new ParentRelationship[] { new ParentRelationship("Datafile", "datafileFormat") }); + relationships.put("Facility", new ParentRelationship[] { new ParentRelationship("Investigation", "facility") }); + relationships.put("ParameterType", + new ParentRelationship[] { new ParentRelationship("DatafileParameter", "type"), + new ParentRelationship("DatasetParameter", "type"), + new ParentRelationship("InvestigationParameter", "type") }); + + genericParser.setAllowLeadingWildcard(true); + genericParser.setAnalyzer(analyzer); + + CharSequence[] datafileFields = { "name", "description", "doi", "datafileFormat.name" }; + datafileParser.setAllowLeadingWildcard(true); + datafileParser.setAnalyzer(analyzer); + datafileParser.setMultiFields(datafileFields); + + CharSequence[] datasetFields = { "name", "description", "doi", "sample.name", "sample.type.name", "type.name" }; + datasetParser.setAllowLeadingWildcard(true); + datasetParser.setAnalyzer(analyzer); + datasetParser.setMultiFields(datasetFields); + + CharSequence[] investigationFields = { "name", "visitId", "title", "summary", "doi", "facility.name", + "type.name" }; + investigationParser.setAllowLeadingWildcard(true); + investigationParser.setAnalyzer(analyzer); + investigationParser.setMultiFields(investigationFields); + + CharSequence[] sampleFields = { "sample.name", "sample.type.name" }; + sampleParser.setAllowLeadingWildcard(true); + sampleParser.setAnalyzer(analyzer); + sampleParser.setMultiFields(sampleFields); + } private java.nio.file.Path luceneDirectory; @@ -301,12 +373,9 @@ enum When { private AtomicLong bucketNum = new AtomicLong(); private Map indexBuckets = new ConcurrentHashMap<>(); - private StandardQueryParser parser; private Timer timer; - private IcatAnalyzer analyzer; - private Map searches = new ConcurrentHashMap<>(); /** @@ -333,48 +402,26 @@ public void modify(@Context HttpServletRequest request) throws LuceneException { 
logger.debug("Requesting modify"); int count = 0; - - try (JsonParser parser = Json.createParser(request.getInputStream())) { - - Event ev = parser.next(); - if (ev != Event.START_ARRAY) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Unexpected " + ev.name()); - } - ev = parser.next(); - - while (true) { - if (ev == Event.END_ARRAY) { - break; - } - if (ev != Event.START_ARRAY) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Unexpected " + ev.name()); - } - ev = parser.next(); - String entityName = parser.getString(); - ev = parser.next(); - Long id = (ev == Event.VALUE_NULL) ? null : parser.getLong(); - ev = parser.next(); - if (ev == Event.VALUE_NULL) { - try { - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); - if (bucket.locked.get()) { - throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, - "Lucene locked for " + entityName); - } - ShardBucket shardBucket = bucket.routeShard(id); - shardBucket.indexWriter.deleteDocuments(new Term("id", Long.toString(id))); - } catch (IOException e) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } + try (JsonReader reader = Json.createReader(request.getInputStream())) { + List operations = reader.readArray().getValuesAs(JsonObject.class); + for (JsonObject operation : operations) { + if (operation.size() != 1) { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Operation object should only have one key/value pair, but request had " + + operation.size()); + } else if (operation.containsKey("create")) { + create(operation.getJsonObject("create")); + } else if (operation.containsKey("update")) { + update(operation.getJsonObject("update")); + } else if (operation.containsKey("delete")) { + delete(operation.getJsonObject("delete")); } else { - add(request, entityName, When.Sometime, parser, id); + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Operation key should be one of 'create', 'update', 'delete', but it was " + + operation.keySet()); } - ev = parser.next(); // end of triple - count++; - ev = parser.next(); // either end of input or start of new - // triple } - + count = operations.size(); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -382,115 +429,6 @@ public void modify(@Context HttpServletRequest request) throws LuceneException { } - /* if id is not null this is actually an update */ - private void add(HttpServletRequest request, String entityName, When when, JsonParser parser, Long id) - throws LuceneException, IOException { - - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); - - AttributeName attName = null; - FieldType fType = null; - String name = null; - String value = null; - Double dvalue = null; - Store store = Store.YES; - Document doc = new Document(); - - parser.next(); // Skip the [ - while (parser.hasNext()) { - Event ev = parser.next(); - if (ev == Event.KEY_NAME) { - try { - attName = AttributeName.valueOf(parser.getString()); - } catch (Exception e) { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "Found unknown field type " + e.getMessage()); - } - } else if (ev == Event.VALUE_STRING) { - if (attName == AttributeName.type) { - try { - fType = FieldType.valueOf(parser.getString()); - } catch (Exception e) { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "Found unknown field type " + e.getMessage()); - } - } else if 
@@ -382,115 +429,6 @@
     }

-    /* if id is not null this is actually an update */
-    private void add(HttpServletRequest request, String entityName, When when, JsonParser parser, Long id)
-            throws LuceneException, IOException {
-
-        IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k));
-
-        AttributeName attName = null;
-        FieldType fType = null;
-        String name = null;
-        String value = null;
-        Double dvalue = null;
-        Store store = Store.YES;
-        Document doc = new Document();
-
-        parser.next(); // Skip the [
-        while (parser.hasNext()) {
-            Event ev = parser.next();
-            if (ev == Event.KEY_NAME) {
-                try {
-                    attName = AttributeName.valueOf(parser.getString());
-                } catch (Exception e) {
-                    throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST,
-                            "Found unknown field type " + e.getMessage());
-                }
-            } else if (ev == Event.VALUE_STRING) {
-                if (attName == AttributeName.type) {
-                    try {
-                        fType = FieldType.valueOf(parser.getString());
-                    } catch (Exception e) {
-                        throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST,
-                                "Found unknown field type " + e.getMessage());
-                    }
-                } else if (attName == AttributeName.name) {
-                    name = parser.getString();
-                } else if (attName == AttributeName.value) {
-                    value = parser.getString();
-                } else {
-                    throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, "Bad VALUE_STRING " + attName);
-                }
-            } else if (ev == Event.VALUE_NUMBER) {
-                long num = parser.getLong();
-                if (fType == FieldType.SortedDocValuesField) {
-                    value = Long.toString(num);
-                } else if (fType == FieldType.DoublePoint) {
-                    dvalue = parser.getBigDecimal().doubleValue();
-                } else {
-                    throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST,
-                            "Bad VALUE_NUMBER " + attName + " " + fType);
-                }
-            } else if (ev == Event.VALUE_TRUE) {
-                if (attName == AttributeName.store) {
-                    store = Store.YES;
-                } else {
-                    throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, "Bad VALUE_TRUE " + attName);
-                }
-            } else if (ev == Event.VALUE_FALSE) {
-                if (attName == AttributeName.store) {
-                    store = Store.NO;
-                } else {
-                    throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, "Bad VALUE_FALSE " + attName);
-                }
-            } else if (ev == Event.START_OBJECT) {
-                fType = null;
-                name = null;
-                value = null;
-                store = Store.YES;
-            } else if (ev == Event.END_OBJECT) {
-                if (fType == FieldType.TextField) {
-                    doc.add(new TextField(name, value, store));
-                } else if (fType == FieldType.StringField) {
-                    doc.add(new StringField(name, value, store));
-                } else if (fType == FieldType.SortedDocValuesField) {
-                    // Any field we sort on must be stored to enable searching after
-                    doc.add(new SortedDocValuesField(name, new BytesRef(value)));
-                    doc.add(new StoredField(name, value)); // TODO potentially remove this, or the version in LuceneApi
-                } else if (fType == FieldType.DoublePoint) {
-                    doc.add(new DoublePoint(name, dvalue));
-                    if (store == Store.YES) {
-                        doc.add(new StoredField(name, dvalue));
-                    }
-                }
-            } else if (ev == Event.END_ARRAY) {
-                if (id == null) {
-                    if (bucket.locked.get() && when == When.Sometime) {
-                        throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE,
-                                "Lucene locked for " + entityName);
-                    }
-                    String documentId = doc.get("id");
-                    if (documentId == null) {
-                        logger.warn(
-                                "Adding Document without an id field is not recommended; routing, updates and deletions will not be available for this Document.");
-                        bucket.getWriter(null).addDocument(doc);
-                    } else {
-                        bucket.getWriter(Long.valueOf(documentId)).addDocument(doc);
-                    }
-                } else {
-                    if (bucket.locked.get()) {
-                        throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE,
-                                "Lucene locked for " + entityName);
-                    }
-                    bucket.getWriter(id).updateDocument(new Term("id", id.toString()), doc);
-                }
-                return;
-            } else {
-                throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, "Unexpected token in Json: " + ev);
-            }
-        }
-    }
-
     /**
      * Expect an array of documents each encoded as an array of things to add to
      * the document
      */
     @POST
     @Path("addNow/{entityName}")
     public void addNow(@Context HttpServletRequest request, @PathParam("entityName") String entityName)
             throws LuceneException {
+        List<JsonObject> documents;
         logger.debug("Requesting addNow of {}", entityName);
-        int count = 0;
-        try (JsonParser parser = Json.createParser(request.getInputStream())) {
-            Event ev = parser.next(); // Opening [
-            while (true) {
-                ev = parser.next(); // Final ] or another document
-                if (ev == Event.END_ARRAY) {
-                    break;
-                }
-                add(request, entityName, When.Now, parser, null);
-                count++;
+        try (JsonReader reader = Json.createReader(request.getInputStream())) {
documents = reader.readArray().getValuesAs(JsonObject.class); + for (JsonObject document : documents) { + createNow(entityName, document); } } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } - logger.debug("Added {} {} documents", count, entityName); + logger.debug("Added {} {} documents", documents.size(), entityName); + } + + /** + * Extracts values from queryJson in order to add one or more range query terms + * using queryBuilder. + * + * Note that values in queryJson are expected to be precise only to the minute, + * and so to ensure that our range is inclusive, we add 59.999 seconds onto the + * upper value only. + * + * If either upper or lower keys do not yield values then a half open range is + * created. If both are absent, then nothing is added to the query. + * + * @param queryBuilder Builder for the Lucene query. + * @param queryJson JsonObject representing the query parameters. + * @param lowerKey Key in queryJson of the lower date value + * @param upperKey Key in queryJson of the upper date value + * @param fields Name of one or more fields to apply the range query to. + * @throws LuceneException + */ + private static void buildDateRanges(Builder queryBuilder, JsonObject queryJson, String lowerKey, String upperKey, + String... fields) throws LuceneException { + Long lower = parseDate(queryJson, lowerKey, 0); + Long upper = parseDate(queryJson, upperKey, 59999); + if (lower != null || upper != null) { + lower = (lower == null) ? Long.MIN_VALUE : lower; + upper = (upper == null) ? Long.MAX_VALUE : upper; + for (String field : fields) { + queryBuilder.add(LongPoint.newRangeQuery(field, lower, upper), Occur.MUST); + } + } } /* @@ -562,6 +526,37 @@ public void commit() throws LuceneException { } } + private void create(JsonObject operationBody) throws NumberFormatException, IOException, LuceneException { + String entityName = operationBody.getString("_index"); + if (relationships.containsKey(entityName)) { + updateByRelation(operationBody, false); + } + if (indexedEntities.contains(entityName)) { + String icatId = operationBody.getString("_id"); + Document document = parseDocument(operationBody.getJsonObject("doc")); + logger.trace("create {} {}", entityName, document.toString()); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); + if (bucket.locked.get()) { + throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, + "Lucene locked for " + entityName); + } + bucket.getWriter(new Long(icatId)).addDocument(document); + } + } + + private void createNow(String entityName, JsonObject documentJson) + throws NumberFormatException, IOException, LuceneException { + if (!documentJson.containsKey("id")) { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "id was not in the document keys " + documentJson.keySet()); + } + String icatId = documentJson.getString("id"); + Document document = parseDocument(documentJson); + logger.trace("create {} {}", entityName, document.toString()); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); + bucket.getWriter(new Long(icatId)).addDocument(document); + } + @POST @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) @@ -586,37 +581,25 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("search BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); if (userName != null) { - Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", 
- new TermQuery(new Term("name", userName)), getSearcher(map, "InvestigationUser"), + Query iuQuery = JoinUtil.createJoinQuery("investigation.id", false, "investigation.id", + new TermQuery(new Term("user.name", userName)), getSearcher(map, "InvestigationUser"), ScoreMode.None); - - Query invQuery = JoinUtil.createJoinQuery("id", false, "investigation", iuQuery, - getSearcher(map, "Investigation"), ScoreMode.None); - - Query dsQuery = JoinUtil.createJoinQuery("id", false, "dataset", invQuery, - getSearcher(map, "Dataset"), ScoreMode.None); - - theQuery.add(dsQuery, Occur.MUST); + theQuery.add(iuQuery, Occur.MUST); } String text = query.getString("text", null); if (text != null) { - theQuery.add(parser.parse(text, "text"), Occur.MUST); + theQuery.add(datafileParser.parse(text, null), Occur.MUST); } - String lower = query.getString("lower", null); - String upper = query.getString("upper", null); - if (lower != null && upper != null) { - theQuery.add(new TermRangeQuery("date", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - } + buildDateRanges(theQuery, query, "lower", "upper", "date"); if (query.containsKey("parameters")) { JsonArray parameters = query.getJsonArray("parameters"); IndexSearcher datafileParameterSearcher = getSearcher(map, "DatafileParameter"); for (JsonValue p : parameters) { BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("datafile", false, "id", paramQuery.build(), + Query toQuery = JoinUtil.createJoinQuery("datafile.id", false, "id", paramQuery.build(), datafileParameterSearcher, ScoreMode.None); theQuery.add(toQuery, Occur.MUST); } @@ -660,36 +643,26 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("search_ if (userName != null) { - Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", - new TermQuery(new Term("name", userName)), getSearcher(map, "InvestigationUser"), + Query iuQuery = JoinUtil.createJoinQuery("investigation.id", false, "investigation.id", + new TermQuery(new Term("user.name", userName)), getSearcher(map, "InvestigationUser"), ScoreMode.None); - Query invQuery = JoinUtil.createJoinQuery("id", false, "investigation", iuQuery, - getSearcher(map, "Investigation"), ScoreMode.None); - - theQuery.add(invQuery, Occur.MUST); + theQuery.add(iuQuery, Occur.MUST); } String text = query.getString("text", null); if (text != null) { - theQuery.add(parser.parse(text, "text"), Occur.MUST); + theQuery.add(datasetParser.parse(text, null), Occur.MUST); } - String lower = query.getString("lower", null); - String upper = query.getString("upper", null); - if (lower != null && upper != null) { - theQuery.add(new TermRangeQuery("startDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - theQuery.add(new TermRangeQuery("endDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - } + buildDateRanges(theQuery, query, "lower", "upper", "startDate", "endDate"); if (query.containsKey("parameters")) { JsonArray parameters = query.getJsonArray("parameters"); IndexSearcher datasetParameterSearcher = getSearcher(map, "DatasetParameter"); for (JsonValue p : parameters) { BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("dataset", false, "id", paramQuery.build(), + Query toQuery = JoinUtil.createJoinQuery("dataset.id", false, "id", paramQuery.build(), datasetParameterSearcher, ScoreMode.None); theQuery.add(toQuery, Occur.MUST); } @@ -709,6 +682,45 @@ public String datasets(@Context 
HttpServletRequest request, @QueryParam("search_ } + private void delete(JsonObject operationBody) throws LuceneException, IOException { + String entityName = operationBody.getString("_index"); + if (relationships.containsKey(entityName)) { + updateByRelation(operationBody, true); + } + if (indexedEntities.contains(entityName)) { + String icatId = operationBody.getString("_id"); + try { + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); + if (bucket.locked.get()) { + throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, + "Lucene locked for " + entityName); + } + logger.trace("delete {} {}", entityName, icatId); + ShardBucket shardBucket = bucket.routeShard(new Long(icatId)); + shardBucket.indexWriter.deleteDocuments(new Term("id", icatId)); + } catch (IOException e) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } + } + } + + /** + * Converts String into number of ms since epoch. + * + * @param value String representing a Date in the format "yyyyMMddHHmm". + * @return Number of ms since epoch, or null if value was null + * @throws java.text.ParseException + */ + protected static Long decodeTime(String value) throws java.text.ParseException { + if (value == null) { + return null; + } else { + synchronized (df) { + return df.parse(value).getTime(); + } + } + } + /** * Encodes core Lucene information (keys preceded by underscores) and a * selection of the Document's source fields to JSON to be returned to @@ -733,11 +745,14 @@ private void encodeResult(JsonGenerator gen, ScoreDoc hit, IndexSearcher searche } gen.writeStartObject("_source"); document.forEach((field) -> { - if (search.fields.contains(field.name())) { - if (field.stringValue() != null) { - gen.write(field.name(), field.stringValue()); - } else if (field.numericValue() != null) { - gen.write(field.name(), field.numericValue().doubleValue()); + String fieldName = field.name(); + if (search.fields.contains(fieldName)) { + if (longFields.contains(fieldName)) { + gen.write(fieldName, field.numericValue().longValue()); + } else if (doubleFields.contains(fieldName)) { + gen.write(fieldName, field.numericValue().doubleValue()); + } else { + gen.write(fieldName, field.stringValue()); } } }); @@ -784,11 +799,11 @@ public void freeSearcher(@PathParam("uid") Long uid) throws LuceneException { /* * Need a new set of IndexSearchers for each search as identified by a uid */ - private IndexSearcher getSearcher(Map bucket, String name) throws IOException { - DirectoryReader[] subReaders = bucket.get(name); + private IndexSearcher getSearcher(Map map, String name) throws IOException { + DirectoryReader[] subReaders = map.get(name); if (subReaders == null) { subReaders = indexBuckets.computeIfAbsent(name, k -> new IndexBucket(k)).acquireReaders(); - bucket.put(name, subReaders); + map.put(name, subReaders); logger.debug("Remember searcher for {}", name); } return new IndexSearcher(new MultiReader(subReaders, false)); @@ -809,13 +824,6 @@ private void init() { luceneCommitMillis = props.getPositiveInt("commitSeconds") * 1000; luceneMaxShardSize = Math.max(props.getPositiveLong("maxShardSize"), new Long(Integer.MAX_VALUE + 1)); - analyzer = new IcatAnalyzer(); - - parser = new StandardQueryParser(); - StandardQueryConfigHandler qpConf = (StandardQueryConfigHandler) parser.getQueryConfigHandler(); - qpConf.set(ConfigurationKeys.ANALYZER, analyzer); - qpConf.set(ConfigurationKeys.ALLOW_LEADING_WILDCARD, true); - timer = new Timer("LuceneCommitTimer"); 
timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); @@ -860,25 +868,18 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); if (userName != null) { - Query iuQuery = JoinUtil.createJoinQuery("investigation", false, "id", - new TermQuery(new Term("name", userName)), getSearcher(map, "InvestigationUser"), + Query iuQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", + new TermQuery(new Term("user.name", userName)), getSearcher(map, "InvestigationUser"), ScoreMode.None); theQuery.add(iuQuery, Occur.MUST); } String text = query.getString("text", null); if (text != null) { - theQuery.add(parser.parse(text, "text"), Occur.MUST); + theQuery.add(investigationParser.parse(text, null), Occur.MUST); } - String lower = query.getString("lower", null); - String upper = query.getString("upper", null); - if (lower != null && upper != null) { - theQuery.add(new TermRangeQuery("startDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - theQuery.add(new TermRangeQuery("endDate", new BytesRef(lower), new BytesRef(upper), true, true), - Occur.MUST); - } + buildDateRanges(theQuery, query, "lower", "upper", "startDate", "endDate"); if (query.containsKey("parameters")) { JsonArray parameters = query.getJsonArray("parameters"); @@ -886,7 +887,7 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s for (JsonValue p : parameters) { BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", paramQuery.build(), + Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", paramQuery.build(), investigationParameterSearcher, ScoreMode.None); theQuery.add(toQuery, Occur.MUST); } @@ -899,8 +900,8 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s for (JsonValue s : samples) { JsonString sample = (JsonString) s; BooleanQuery.Builder sampleQuery = new BooleanQuery.Builder(); - sampleQuery.add(parser.parse(sample.getString(), "text"), Occur.MUST); - Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", sampleQuery.build(), + sampleQuery.add(sampleParser.parse(sample.getString(), null), Occur.MUST); + Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", sampleQuery.build(), sampleSearcher, ScoreMode.None); theQuery.add(toQuery, Occur.MUST); } @@ -909,9 +910,9 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s String userFullName = query.getString("userFullName", null); if (userFullName != null) { BooleanQuery.Builder userFullNameQuery = new BooleanQuery.Builder(); - userFullNameQuery.add(parser.parse(userFullName, "text"), Occur.MUST); + userFullNameQuery.add(genericParser.parse(userFullName, "user.fullName"), Occur.MUST); IndexSearcher investigationUserSearcher = getSearcher(map, "InvestigationUser"); - Query toQuery = JoinUtil.createJoinQuery("investigation", false, "id", userFullNameQuery.build(), + Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", userFullNameQuery.build(), investigationUserSearcher, ScoreMode.None); theQuery.add(toQuery, Occur.MUST); } @@ -1000,22 +1001,35 @@ private String luceneSearchResult(String name, Search search, String searchAfter Document lastDocument = isearcher.doc(lastDoc.doc); gen.writeStartArray("fields"); for (SortField sortField : fields) { - Type type = sortField.getType(); - if 
(type.equals(Type.STRING)) { - String lastValue = lastDocument.get(sortField.getField()); - if (lastValue == null) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Field " - + sortField.getField() - + " used for sorting was not present on the Lucene Document; all sortable fields must also be stored."); - } - gen.write(lastValue); + IndexableField indexableField = lastDocument.getField(sortField.getField()); + if (indexableField == null) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Field " + + sortField.getField() + + " used for sorting was not present on the Lucene Document; all sortable fields must also be stored."); + } + Type type = (sortField instanceof SortedNumericSortField) + ? ((SortedNumericSortField) sortField).getNumericType() + : sortField.getType(); + switch (type) { + case LONG: + gen.write(indexableField.numericValue().longValue()); + break; + case DOUBLE: + gen.write(indexableField.numericValue().doubleValue()); + break; + case STRING: + gen.write(indexableField.stringValue()); + break; + default: + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, + "SortField.Type must be one of LONG, DOUBLE, STRING, but it was " + type); } } - gen.writeEnd(); + gen.writeEnd(); // end "fields" array } - gen.writeEnd(); + gen.writeEnd(); // end "search_after" object } - gen.writeEnd(); // object + gen.writeEnd(); // end enclosing object } logger.debug("Json returned {}", baos.toString()); return baos.toString(); @@ -1030,34 +1044,167 @@ private Query maybeEmptyQuery(Builder theQuery) { return query; } - private Builder parseParameter(JsonValue p) { + /** + * Parses a date/time value from jsonObject. Can account for either a Long + * value, or a String value encoded in the format yyyyMMddHHmm. + * + * @param jsonObject JsonObject containing the date to be parsed. + * @param key Key of the date/time value in jsonObject. + * @param offset In the case of STRING ValueType, add offset ms before + * returning. This accounts for the fact the String format + * used is only precise to minutes and not seconds. + * @return null if jsonObject does not contain the key, number of ms since epoch + * otherwise. + * @throws LuceneException If the ValueType is not NUMBER or STRING, or if a + * STRING value cannot be parsed. + */ + private static Long parseDate(JsonObject jsonObject, String key, int offset) throws LuceneException { + if (jsonObject.containsKey(key)) { + ValueType valueType = jsonObject.get(key).getValueType(); + switch (valueType) { + case STRING: + String dateString = jsonObject.getString(key); + try { + return decodeTime(dateString) + offset; + } catch (Exception e) { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Could not parse date " + dateString + " using expected format yyyyMMddHHmm"); + } + case NUMBER: + return jsonObject.getJsonNumber(key).longValueExact(); + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Dates should be represented by a NUMBER or STRING JsonValue, but got " + valueType); + } + } + return null; + } + + /** + * Builds a Lucene Document from the parsed json. + * + * @param json Key value pairs of fields. + * @return Lucene Document. 
+ */ + private Document parseDocument(JsonObject json) { + Document document = new Document(); + for (String key : json.keySet()) { + addField(json, document, key); + } + return document; + } + + private void addField(JsonObject json, Document document, String key) { + // SortedDocValuesField need to be indexed in addition to indexing a Field for + // searching/storing, so deal with that first + addSortField(json, document, key); + + if (doubleFields.contains(key)) { + Double value = json.getJsonNumber(key).doubleValue(); + document.add(new DoublePoint(key, value)); + document.add(new StoredField(key, value)); + } else if (longFields.contains(key)) { + Long value = json.getJsonNumber(key).longValueExact(); + document.add(new LongPoint(key, value)); + document.add(new StoredField(key, value)); + } else if (textFields.contains(key)) { + document.add(new TextField(key, json.getString(key), Store.YES)); + } else { + document.add(new StringField(key, json.getString(key), Store.YES)); + } + } + + private void addSortField(JsonObject json, Document document, String key) { + if (sortFields.contains(key)) { + if (longFields.contains(key)) { + document.add(new SortedNumericDocValuesField(key, json.getJsonNumber(key).longValueExact())); + } else if (doubleFields.contains(key)) { + long sortableLong = NumericUtils.doubleToSortableLong(json.getJsonNumber(key).doubleValue()); + document.add(new SortedNumericDocValuesField(key, sortableLong)); + } else { + document.add(new SortedDocValuesField(key, new BytesRef(json.getString(key)))); + } + } + } + + private void addSortField(IndexableField field, Document document) { + String key = field.name(); + if (sortFields.contains(key)) { + if (longFields.contains(key)) { + document.add(new SortedNumericDocValuesField(key, field.numericValue().longValue())); + } else if (doubleFields.contains(key)) { + long sortableLong = NumericUtils.doubleToSortableLong(field.numericValue().doubleValue()); + document.add(new SortedNumericDocValuesField(key, sortableLong)); + } else { + document.add(new SortedDocValuesField(key, new BytesRef(field.stringValue()))); + } + } + } + + /** + * Returns a new Lucene Document that has the same fields as were present in + * oldDocument, except in cases where json has an entry for that field. In this + * case, the json value is used instead. + * + * @param json Key value pairs of fields to overwrite fields already + * present in oldDocument. + * @param oldDocument Lucene Document to be updated. + * @return Lucene Document with updated fields. + */ + private Document updateDocument(JsonObject json, Document oldDocument) { + Document newDocument = new Document(); + for (IndexableField field : oldDocument.getFields()) { + String fieldName = field.name(); + if (json.keySet().contains(fieldName)) { + addField(json, newDocument, fieldName); + } else { + addSortField(field, newDocument); + newDocument.add(field); + } + } + return newDocument; + } + + /** + * Returns a new Lucene Document that has the same fields as were present in + * oldDocument, except in cases where the field name starts with fieldPrefix. + * + * @param fieldPrefix Any fields with a name starting with this String will not + * be present in the returned Document. + * @param oldDocument Lucene Document to be pruned. + * @return Lucene Document with pruned fields. 
+     */
+    private Document pruneDocument(String fieldPrefix, Document oldDocument) {
+        Document newDocument = new Document();
+        for (IndexableField field : oldDocument.getFields()) {
+            if (!field.name().startsWith(fieldPrefix)) {
+                addSortField(field, newDocument);
+                newDocument.add(field);
+            }
+        }
+        return newDocument;
+    }
+
+    private Builder parseParameter(JsonValue p) throws LuceneException {
         JsonObject parameter = (JsonObject) p;
         BooleanQuery.Builder paramQuery = new BooleanQuery.Builder();
         String pName = parameter.getString("name", null);
         if (pName != null) {
-            paramQuery.add(new WildcardQuery(new Term("name", pName)), Occur.MUST);
+            paramQuery.add(new WildcardQuery(new Term("type.name", pName)), Occur.MUST);
         }

         String pUnits = parameter.getString("units", null);
         if (pUnits != null) {
-            paramQuery.add(new WildcardQuery(new Term("units", pUnits)), Occur.MUST);
+            paramQuery.add(new WildcardQuery(new Term("type.units", pUnits)), Occur.MUST);
         }
-        String pStringValue = parameter.getString("stringValue", null);
-        String pLowerDateValue = parameter.getString("lowerDateValue", null);
-        String pUpperDateValue = parameter.getString("upperDateValue", null);
-        Double pLowerNumericValue = parameter.containsKey("lowerNumericValue")
-                ? parameter.getJsonNumber("lowerNumericValue").doubleValue()
-                : null;
-        Double pUpperNumericValue = parameter.containsKey("upperNumericValue")
-                ? parameter.getJsonNumber("upperNumericValue").doubleValue()
-                : null;
-        if (pStringValue != null) {
+        if (parameter.containsKey("stringValue")) {
+            String pStringValue = parameter.getString("stringValue", null);
             paramQuery.add(new WildcardQuery(new Term("stringValue", pStringValue)), Occur.MUST);
-        } else if (pLowerDateValue != null && pUpperDateValue != null) {
-            paramQuery.add(new TermRangeQuery("dateTimeValue", new BytesRef(pLowerDateValue),
-                    new BytesRef(pUpperDateValue), true, true), Occur.MUST);
-
-        } else if (pLowerNumericValue != null && pUpperNumericValue != null) {
+        } else if (parameter.containsKey("lowerDateValue") && parameter.containsKey("upperDateValue")) {
+            buildDateRanges(paramQuery, parameter, "lowerDateValue", "upperDateValue", "dateTimeValue");
+        } else if (parameter.containsKey("lowerNumericValue") && parameter.containsKey("upperNumericValue")) {
+            Double pLowerNumericValue = parameter.getJsonNumber("lowerNumericValue").doubleValue();
+            Double pUpperNumericValue = parameter.getJsonNumber("upperNumericValue").doubleValue();
             paramQuery.add(DoublePoint.newRangeQuery("numericValue", pLowerNumericValue, pUpperNumericValue),
                     Occur.MUST);
         }
@@ -1092,7 +1239,13 @@ private Sort parseSort(String sort) throws LuceneException {
                         "Sort order must be 'asc' or 'desc' but it was '" + order + "'");
             }

-            fields.add(new SortField(key, Type.STRING, reverse));
+            if (longFields.contains(key)) {
+                fields.add(new SortedNumericSortField(key, Type.LONG, reverse));
+            } else if (doubleFields.contains(key)) {
+                fields.add(new SortedNumericSortField(key, Type.DOUBLE, reverse));
+            } else {
+                fields.add(new SortField(key, Type.STRING, reverse));
+            }
         }
         return new Sort(fields.toArray(new SortField[0]));
     }
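parseSort above and parseSearchAfter in the next hunk have to agree: the entries of the search_after "fields" array line up one-to-one with the sort fields, and may now be longs or doubles as well as strings. A hypothetical search_after value for a sort on a long date field followed by a string name field, built with javax.json (the values are invented; a "score" entry may be added when sorting by relevance):

    // What parseSearchAfter expects to decode
    JsonObject searchAfter = Json.createObjectBuilder()
            .add("doc", 73)       // Lucene doc id of the last hit
            .add("shardIndex", 0) // shard that produced it
            .add("fields", Json.createArrayBuilder()
                    .add(1649375940000L) // last date sort key, ms since epoch
                    .add("df_000073"))   // last name sort key
            .build();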
@@ -1107,8 +1260,10 @@ private Sort parseSort(String sort) throws LuceneException {
      *                    order.
      * @return FieldDoc object built from the provided String, or null if
      *         searchAfter was itself null or an empty String.
+     * @throws LuceneException If an entry in the fields array is not a STRING or
+     *                         NUMBER
      */
-    private FieldDoc parseSearchAfter(String searchAfter) {
+    private FieldDoc parseSearchAfter(String searchAfter) throws LuceneException {
         if (searchAfter != null && !searchAfter.equals("")) {
             logger.debug("Attempting to parseSearchAfter from {}", searchAfter);
             JsonReader reader = Json.createReader(new StringReader(searchAfter));
@@ -1116,14 +1271,30 @@ private FieldDoc parseSearchAfter(String searchAfter) {
             int doc = object.getInt("doc");
             int shardIndex = object.getInt("shardIndex");
             float score = Float.NaN;
-            List<BytesRef> fields = new ArrayList<>();
+            List<Object> fields = new ArrayList<>();
             if (object.containsKey("score")) {
                 score = object.getJsonNumber("score").bigDecimalValue().floatValue();
             }
             if (object.containsKey("fields")) {
-                List<JsonString> jsonStrings = object.getJsonArray("fields").getValuesAs(JsonString.class);
-                for (JsonString jsonString : jsonStrings) {
-                    fields.add(new BytesRef(jsonString.getString()));
+                JsonArray jsonArray = object.getJsonArray("fields");
+                for (JsonValue value : jsonArray) {
+                    switch (value.getValueType()) {
+                        case NUMBER:
+                            JsonNumber number = ((JsonNumber) value);
+                            if (number.toString().contains(".")) {
+                                fields.add(number.doubleValue());
+                            } else {
+                                fields.add(number.longValueExact());
+                            }
+                            break;
+                        case STRING:
+                            fields.add(new BytesRef(((JsonString) value).getString()));
+                            break;
+                        default:
+                            throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST,
+                                    "fields should be an array of STRING and NUMBER, but had entry of type "
+                                            + value.getValueType());
+                    }
                 }
             }
             return new FieldDoc(doc, score, fields.toArray(), shardIndex);
@@ -1147,4 +1318,52 @@ public void unlock(@PathParam("entityName") String entityName) throws LuceneExce
         }
     }

+    private void update(JsonObject operationBody) throws LuceneException, NumberFormatException, IOException {
+        String entityName = operationBody.getString("_index");
+        if (relationships.containsKey(entityName)) {
+            updateByRelation(operationBody, false);
+        }
+        if (indexedEntities.contains(entityName)) {
+            String icatId = operationBody.getString("_id");
+            Document document = parseDocument(operationBody.getJsonObject("doc"));
+            IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k));
+            if (bucket.locked.get()) {
+                throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE,
+                        "Lucene locked for " + entityName);
+            }
+            logger.trace("update: {}", document);
+            bucket.getWriter(new Long(icatId)).updateDocument(new Term("id", icatId), document);
+        }
+    }
+
+    private void updateByRelation(JsonObject operationBody, Boolean delete)
+            throws LuceneException, NumberFormatException, IOException {
+        for (ParentRelationship parentRelationship : relationships.get(operationBody.getString("_index"))) {
+            String childId = operationBody.getString("_id");
+            IndexBucket bucket = indexBuckets.computeIfAbsent(parentRelationship.parentName, k -> new IndexBucket(k));
+            if (bucket.locked.get()) {
+                throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE,
+                        "Lucene locked for " + parentRelationship.parentName);
+            }
+            IndexSearcher searcher = getSearcher(new HashMap<>(), parentRelationship.parentName);
+
+            int blockSize = 10000;
+            TermQuery query = new TermQuery(new Term(parentRelationship.fieldPrefix + ".id", childId));
+            Sort sort = new Sort(new SortField("id", Type.STRING));
+            ScoreDoc[] scoreDocs = searcher.search(query, blockSize, sort).scoreDocs;
+            while (scoreDocs.length != 0) {
+                for (ScoreDoc scoreDoc : scoreDocs) {
+                    Document oldDocument = searcher.doc(scoreDoc.doc);
+                    String parentId = oldDocument.get("id");
+                    Document newDocument = delete ? pruneDocument(parentRelationship.fieldPrefix, oldDocument)
+                            : updateDocument(operationBody.getJsonObject("doc"), oldDocument);
+                    logger.trace("updateByRelation: {}", newDocument);
+                    bucket.getWriter(new Long(parentId)).updateDocument(new Term("id", parentId), newDocument);
+                }
+                scoreDocs = searcher.searchAfter(scoreDocs[scoreDocs.length - 1], query, blockSize, sort).scoreDocs;
+            }
+        }
+    }
+
 }

From fbc99e667474342b1ba482a233c522ada98685c3 Mon Sep 17 00:00:00 2001
From: Patrick Austin
Date: Thu, 14 Apr 2022 00:21:02 +0100
Subject: [PATCH 34/73] Enable generic String and range facets #19

---
 .../lucene/FacetDimensionRequest.java         |  26 ++
 .../java/org/icatproject/lucene/Lucene.java   | 344 +++++++++++-------
 2 files changed, 239 insertions(+), 131 deletions(-)
 create mode 100644 src/main/java/org/icatproject/lucene/FacetDimensionRequest.java

diff --git a/src/main/java/org/icatproject/lucene/FacetDimensionRequest.java b/src/main/java/org/icatproject/lucene/FacetDimensionRequest.java
new file mode 100644
index 0000000..736f2d3
--- /dev/null
+++ b/src/main/java/org/icatproject/lucene/FacetDimensionRequest.java
@@ -0,0 +1,26 @@
+package org.icatproject.lucene;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.facet.range.Range;
+
+public class FacetDimensionRequest {
+
+    private String dimension;
+    private List<Range> ranges;
+
+    public FacetDimensionRequest(String dimension) {
+        this.dimension = dimension;
+        this.ranges = new ArrayList<>();
+    }
+
+    public List<Range> getRanges() {
+        return ranges;
+    }
+
+    public String getDimension() {
+        return dimension;
+    }
+
+}
diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java
index c73d56e..6e118f6 100755
--- a/src/main/java/org/icatproject/lucene/Lucene.java
+++ b/src/main/java/org/icatproject/lucene/Lucene.java
@@ -52,9 +52,9 @@
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.DoublePoint;
 import org.apache.lucene.document.LongPoint;
+import org.apache.lucene.document.NumericDocValuesField;
 import org.apache.lucene.document.Field.Store;
 import org.apache.lucene.document.SortedDocValuesField;
-import org.apache.lucene.document.SortedNumericDocValuesField;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.document.StringField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.facet.FacetsCollector;
 import org.apache.lucene.facet.FacetsConfig;
 import org.apache.lucene.facet.LabelAndValue;
+import org.apache.lucene.facet.range.DoubleRange;
+import org.apache.lucene.facet.range.DoubleRangeFacetCounts;
+import org.apache.lucene.facet.range.LongRange;
+import org.apache.lucene.facet.range.LongRangeFacetCounts;
+import org.apache.lucene.facet.range.Range;
 import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
 import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts;
+import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
@@ -86,6 +92,7 @@
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.SortedNumericSortField;
+import org.apache.lucene.search.TermInSetQuery;
 import
org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopFieldDocs; @@ -129,7 +136,7 @@ public ShardBucket(java.nio.file.Path shardPath) throws IOException { logger.debug("Directory only has the write.lock file so store and delete a dummy document"); Document doc = new Document(); doc.add(new StringField("dummy", "dummy", Store.NO)); - indexWriter.addDocument(doc); + indexWriter.addDocument(facetsConfig.build(doc)); indexWriter.commit(); indexWriter.deleteDocuments(new Term("dummy", "dummy")); indexWriter.commit(); @@ -289,6 +296,7 @@ public class Search { public Query query; public Sort sort; public Set fields = new HashSet(); + public Set dimensions = new HashSet(); } private static class ParentRelationship { @@ -307,6 +315,7 @@ public ParentRelationship(String parentName, String fieldPrefix) { private static final SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmm"); private static final Set doubleFields = new HashSet<>(); + private static final Set facetFields = new HashSet<>(); private static final Set longFields = new HashSet<>(); private static final Set sortFields = new HashSet<>(); private static final Set textFields = new HashSet<>(); @@ -325,6 +334,7 @@ public ParentRelationship(String parentName, String fieldPrefix) { df.setTimeZone(tz); doubleFields.add("numericValue"); + facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name")); longFields.addAll(Arrays.asList("date", "startDate", "endDate", "dateTimeValue")); sortFields.addAll(Arrays.asList("datafile.id", "dataset.id", "investigation.id", "id", "date", "startDate", "endDate", "name")); @@ -550,7 +560,7 @@ private void create(JsonObject operationBody) throws NumberFormatException, IOEx throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + entityName); } - bucket.getWriter(new Long(icatId)).addDocument(document); + bucket.getWriter(new Long(icatId)).addDocument(facetsConfig.build(document)); } } @@ -564,7 +574,7 @@ private void createNow(String entityName, JsonObject documentJson) Document document = parseDocument(documentJson); logger.trace("create {} {}", entityName, document.toString()); IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); - bucket.getWriter(new Long(icatId)).addDocument(document); + bucket.getWriter(new Long(icatId)).addDocument(facetsConfig.build(document)); } @POST @@ -585,26 +595,8 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("search } } - @POST - @Consumes(MediaType.APPLICATION_JSON) - @Produces(MediaType.APPLICATION_JSON) - @Path("datafiles/facet") - public String datafilesFacet(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, @QueryParam("maxResults") int maxResults, - @QueryParam("maxLabels") int maxLabels, @QueryParam("sort") String sort) throws LuceneException { - Long uid = null; - try { - uid = bucketNum.getAndIncrement(); - Search search = datafilesQuery(request, sort, uid); - return luceneFacetResult("Datafile", search, searchAfter, maxResults, maxLabels, uid); - } catch (Exception e) { - logger.error("Error", e); - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - - } - - private Search datafilesQuery(HttpServletRequest request, String sort, Long uid) throws IOException, QueryNodeException, LuceneException { + private Search datafilesQuery(HttpServletRequest request, String sort, Long uid) + throws IOException, 
QueryNodeException, LuceneException { Search search = new Search(); searches.put(uid, search); Map readerMap = new HashMap<>(); @@ -659,72 +651,10 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("search_ @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { Long uid = null; - try { - uid = bucketNum.getAndIncrement(); - Search search = new Search(); - searches.put(uid, search); - Map readerMap = new HashMap<>(); - search.readerMap = readerMap; - search.sort = parseSort(sort); - try (JsonReader r = Json.createReader(request.getInputStream())) { - JsonObject o = r.readObject(); - JsonObject query = o.getJsonObject("query"); - String userName = query.getString("user", null); - - BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - - if (userName != null) { - - Query iuQuery = JoinUtil.createJoinQuery("investigation.id", false, "investigation.id", - new TermQuery(new Term("user.name", userName)), getSearcher(readerMap, "InvestigationUser"), - ScoreMode.None); - - theQuery.add(iuQuery, Occur.MUST); - } - - String text = query.getString("text", null); - if (text != null) { - theQuery.add(datasetParser.parse(text, null), Occur.MUST); - } - - buildDateRanges(theQuery, query, "lower", "upper", "startDate", "endDate"); - - if (query.containsKey("parameters")) { - JsonArray parameters = query.getJsonArray("parameters"); - IndexSearcher datasetParameterSearcher = getSearcher(readerMap, "DatasetParameter"); - for (JsonValue p : parameters) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("dataset.id", false, "id", paramQuery.build(), - datasetParameterSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } - } - search.query = maybeEmptyQuery(theQuery); - if (o.containsKey("fields")) { - List jsonStrings = o.getJsonArray("fields").getValuesAs(JsonString.class); - jsonStrings.forEach((jsonString) -> search.fields.add(jsonString.getString())); - } - } - return luceneSearchResult("Dataset", search, searchAfter, maxResults, uid); - } catch (Exception e) { - logger.error("Error", e); - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - - } - - @POST - @Consumes(MediaType.APPLICATION_JSON) - @Produces(MediaType.APPLICATION_JSON) - @Path("datasets/facet") - public String datasetsFacet(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, @QueryParam("maxResults") int maxResults, - @QueryParam("maxLabels") int maxLabels, @QueryParam("sort") String sort) throws LuceneException { - Long uid = null; try { uid = bucketNum.getAndIncrement(); Search search = datasetsQuery(request, sort, uid); - return luceneFacetResult("Dataset", search, searchAfter, maxResults, maxLabels, uid); + return luceneSearchResult("Dataset", search, searchAfter, maxResults, uid); } catch (Exception e) { logger.error("Error", e); freeSearcher(uid); @@ -733,7 +663,8 @@ public String datasetsFacet(@Context HttpServletRequest request, @QueryParam("se } - private Search datasetsQuery(HttpServletRequest request, String sort, Long uid) throws IOException, QueryNodeException, LuceneException { + private Search datasetsQuery(HttpServletRequest request, String sort, Long uid) + throws IOException, QueryNodeException, LuceneException { Search search = new Search(); searches.put(uid, search); Map readerMap = new HashMap<>(); @@ -876,6 +807,25 @@ private void exit() { } } + @POST + @Consumes(MediaType.APPLICATION_JSON) + 
@Produces(MediaType.APPLICATION_JSON) + @Path("{entityName}/facet") + public String facet(@PathParam("entityName") String entityName, @Context HttpServletRequest request, + @QueryParam("search_after") String searchAfter, @QueryParam("maxResults") int maxResults, + @QueryParam("maxLabels") int maxLabels, @QueryParam("sort") String sort) throws LuceneException { + Long uid = null; + try { + uid = bucketNum.getAndIncrement(); + Search search = genericQuery(request, sort, uid); + return luceneFacetResult(entityName, search, searchAfter, maxResults, maxLabels, uid); + } catch (Exception e) { + logger.error("Error", e); + freeSearcher(uid); + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } + } + @DELETE @Path("freeSearcher/{uid}") public void freeSearcher(@PathParam("uid") Long uid) throws LuceneException { @@ -895,6 +845,125 @@ public void freeSearcher(@PathParam("uid") Long uid) throws LuceneException { } } + /** + * Parses a query and associated information from an incoming request without + * any logic specific to a single index or entity. As such it may not be as + * powerful, but it is sufficient for simple queries (like those for faceting). + * + * @param request Request containing the query and other Json encoded + * information such as fields and dimensions. + * @param sort String representing the sorting criteria for the search. + * @param uid Identifier for the search. + * @return Search object with the query, sort, and optionally the fields and + * dimensions to search set. + * @throws IOException If Json cannot be parsed from the request + * @throws LuceneException If the types of the JsonValues in the query do not + * match those supported by icat.lucene + */ + private Search genericQuery(HttpServletRequest request, String sort, Long uid) throws IOException, LuceneException { + Search search = new Search(); + searches.put(uid, search); + Map readerMap = new HashMap<>(); + search.readerMap = readerMap; + search.sort = parseSort(sort); + try (JsonReader r = Json.createReader(request.getInputStream())) { + JsonObject o = r.readObject(); + JsonObject jsonQuery = o.getJsonObject("query"); + BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); + for (Entry entry : jsonQuery.entrySet()) { + String field = entry.getKey(); + ValueType valueType = entry.getValue().getValueType(); + switch (valueType) { + case STRING: + JsonString stringValue = (JsonString) entry.getValue(); + luceneQuery.add(new TermQuery(new Term(field, stringValue.getString())), Occur.MUST); + break; + case NUMBER: + JsonNumber numberValue = (JsonNumber) entry.getValue(); + if (longFields.contains(field)) { + luceneQuery.add(LongPoint.newExactQuery(field, numberValue.longValueExact()), Occur.FILTER); + } else if (doubleFields.contains(field)) { + luceneQuery.add(DoublePoint.newExactQuery(field, numberValue.doubleValue()), Occur.FILTER); + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Value had type NUMBER, but field " + field + + " is not a known longField or doubleField"); + } + break; + case ARRAY: + // Only support an array of String values, as a list of ICAT ids is currently the only use case + JsonArray arrayValue = (JsonArray) entry.getValue(); + ArrayList bytesArray = new ArrayList<>(); + for (JsonString value : arrayValue.getValuesAs(JsonString.class)) { + bytesArray.add(new BytesRef(value.getChars())); + } + luceneQuery.add(new TermInSetQuery(field, bytesArray), Occur.MUST); + break; + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, 
+ "Query values should be ARRAY, STRING or NUMBER, but had value of type " + valueType); + } + } + search.query = maybeEmptyQuery(luceneQuery); + logger.info("Query: {}", search.query); + if (o.containsKey("fields")) { + List jsonStrings = o.getJsonArray("fields").getValuesAs(JsonString.class); + jsonStrings.forEach((jsonString) -> search.fields.add(jsonString.getString())); + logger.info("Fields: {}", search.fields); + } + if (o.containsKey("dimensions")) { + List dimensionObjects = o.getJsonArray("dimensions").getValuesAs(JsonObject.class); + for (JsonObject dimensionObject : dimensionObjects) { + if (!dimensionObject.containsKey("dimension")) { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "'dimension' not specified for facet request " + dimensionObject.toString()); + } + String dimension = dimensionObject.getString("dimension"); + FacetDimensionRequest facetDimensionRequest = new FacetDimensionRequest(dimension); + if (dimensionObject.containsKey("ranges")) { + List ranges = facetDimensionRequest.getRanges(); + if (longFields.contains(dimension)) { + for (JsonObject range : dimensionObject.getJsonArray("ranges") + .getValuesAs(JsonObject.class)) { + Long lower = Long.MIN_VALUE; + Long upper = Long.MAX_VALUE; + if (range.containsKey("lower")) { + lower = range.getJsonNumber("lower").longValueExact(); + } + if (range.containsKey("upper")) { + upper = range.getJsonNumber("upper").longValueExact(); + } + String label = lower.toString() + "_" + upper.toString(); + ranges.add(new LongRange(label, lower, true, upper, true)); + } + } else if (doubleFields.contains(dimension)) { + for (JsonObject range : dimensionObject.getJsonArray("ranges") + .getValuesAs(JsonObject.class)) { + Double lower = Double.MIN_VALUE; + Double upper = Double.MAX_VALUE; + if (range.containsKey("lower")) { + lower = range.getJsonNumber("lower").doubleValue(); + } + if (range.containsKey("upper")) { + upper = range.getJsonNumber("upper").doubleValue(); + } + String label = lower.toString() + "_" + upper.toString(); + ranges.add(new DoubleRange(label, lower, true, upper, true)); + } + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "'ranges' specified for dimension " + dimension + + " but this is not a supported numeric field"); + } + } + search.dimensions.add(facetDimensionRequest); + } + logger.info("Dimensions: {}", search.dimensions.size()); + } + } + return search; + } + private MultiReader getMultiReader(Map readerMap, String name) throws IOException { DirectoryReader[] subReaders = readerMap.get(name); if (subReaders == null) { @@ -964,25 +1033,8 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s } } - @POST - @Consumes(MediaType.APPLICATION_JSON) - @Produces(MediaType.APPLICATION_JSON) - @Path("investigations/facet") - public String investigationsFacet(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, @QueryParam("maxResults") int maxResults, - @QueryParam("maxLabels") int maxLabels, @QueryParam("sort") String sort) throws LuceneException { - Long uid = null; - try { - uid = bucketNum.getAndIncrement(); - Search search = investigationsQuery(request, sort, uid); - return luceneFacetResult("Investigation", search, searchAfter, maxResults, maxLabels, uid); - } catch (Exception e) { - logger.error("Error", e); - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - } - - private Search investigationsQuery(HttpServletRequest request, String sort, Long 
uid) throws IOException, QueryNodeException, LuceneException { + private Search investigationsQuery(HttpServletRequest request, String sort, Long uid) + throws IOException, QueryNodeException, LuceneException { Search search = new Search(); searches.put(uid, search); Map readerMap = new HashMap<>(); @@ -1073,54 +1125,78 @@ public void lock(@PathParam("entityName") String entityName) throws LuceneExcept } } - private String luceneFacetResult(String name, Search search, String searchAfter, int maxResults, int maxLabels, Long uid) - throws IOException, IllegalStateException { - List results; + private String luceneFacetResult(String name, Search search, String searchAfter, int maxResults, int maxLabels, + Long uid) throws IOException, IllegalStateException, LuceneException { + List results = new ArrayList<>(); + List rangeResults = new ArrayList<>(); if (maxResults <= 0 || maxLabels <= 0) { // This will result in no Facets and a null pointer, so return early - logger.warn("No facets possible for maxResults={}, maxLabels={}, returning empty list", maxResults, maxLabels); - results = new ArrayList<>(); + logger.warn("Cannot facet when maxResults={}, maxLabels={}, returning empty list", maxResults, maxLabels); } else { MultiReader directoryReader = getMultiReader(search.readerMap, name); IndexSearcher indexSearcher = new IndexSearcher(directoryReader); + FacetsCollector facetsCollector = new FacetsCollector(); + FacetsCollector.search(indexSearcher, search.query, maxResults, facetsCollector); logger.debug("To facet in {} for {} {} with {} from {} ", name, search.query, maxResults, indexSearcher, searchAfter); + for (FacetDimensionRequest facetDimensionRequest : search.dimensions) { + if (facetDimensionRequest.getRanges().size() > 0) { + String dimension = facetDimensionRequest.getDimension(); + if (longFields.contains(dimension)) { + LongRange[] ranges = facetDimensionRequest.getRanges().toArray(new LongRange[0]); + Facets facets = new LongRangeFacetCounts(dimension, facetsCollector, ranges); + rangeResults.addAll(facets.getAllDims(maxLabels)); + } else if (doubleFields.contains(dimension)) { + DoubleRange[] ranges = facetDimensionRequest.getRanges().toArray(new DoubleRange[0]); + Facets facets = new DoubleRangeFacetCounts(dimension, facetsCollector, ranges); + rangeResults.addAll(facets.getAllDims(maxLabels)); + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "'ranges' specified for dimension " + dimension + + " but this is not a supported numeric field"); + } + } + } try { DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(directoryReader); - FacetsCollector facetsCollector = new FacetsCollector(); - FacetsCollector.search(indexSearcher, search.query, maxResults, facetsCollector); Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); logger.debug("facets: {}, maxLabels: {}, maxResults: {}", facets, maxLabels, maxResults); results = facets.getAllDims(maxLabels); } catch (IllegalArgumentException e) { // This can occur if no fields in the index have been faceted logger.error("No facets found in index, resulting in error: " + e.getClass() + " " + e.getMessage()); - results = new ArrayList<>(); } catch (IllegalStateException e) { - // This can occur if we do not create the IndexSearcher from the same DirectoryReader as we used to - // create the state + // This can occur if we do not create the IndexSearcher from the same + // DirectoryReader as we used to create the state logger.error("IndexSearcher used is not 
based on the DirectoryReader used for facet counting: " - + e.getClass() + " " + e.getMessage()); + + e.getClass() + " " + e.getMessage()); throw e; } logger.debug("Facets found for " + results.size() + " dimensions"); } ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (JsonGenerator gen = Json.createGenerator(baos)) { - gen.writeStartObject(); - gen.writeStartObject("dimensions"); // object containing all facet dimensions - for (FacetResult result : results) { + gen.writeStartObject().writeStartObject("dimensions"); // object containing all facet dimensions + Set dimensionSet = new HashSet<>(); + search.dimensions.forEach(d -> dimensionSet.add(d.getDimension())); + writeFacetResults(dimensionSet, results, gen); + writeFacetResults(new HashSet<>(), rangeResults, gen); + gen.writeEnd().writeEnd(); // object containing dimensions + } + logger.debug("Json returned {}", baos.toString()); + return baos.toString(); + } + + private void writeFacetResults(Set dimensionSet, List results, JsonGenerator gen) { + for (FacetResult result : results) { + if (dimensionSet.size() == 0 || dimensionSet.contains(result.dim)) { gen.writeStartObject(result.dim); // object containing labelValues for a given dimension for (LabelAndValue labelValue : result.labelValues) { gen.write(labelValue.label, labelValue.value.longValue()); } gen.writeEnd(); // object containing labelValues } - gen.writeEnd(); // object containing dimensions - gen.writeEnd(); } - logger.debug("Json returned {}", baos.toString()); - return baos.toString(); } private String luceneSearchResult(String name, Search search, String searchAfter, int maxResults, Long uid) @@ -1271,6 +1347,11 @@ private void addField(JsonObject json, Document document, String key) { // searching/storing, so deal with that first addSortField(json, document, key); + // Likewise, faceted fields should be considered separately + if (facetFields.contains(key)) { + document.add(new SortedSetDocValuesFacetField(key, json.getString(key))); + } + if (doubleFields.contains(key)) { Double value = json.getJsonNumber(key).doubleValue(); document.add(new DoublePoint(key, value)); @@ -1289,10 +1370,10 @@ private void addField(JsonObject json, Document document, String key) { private void addSortField(JsonObject json, Document document, String key) { if (sortFields.contains(key)) { if (longFields.contains(key)) { - document.add(new SortedNumericDocValuesField(key, json.getJsonNumber(key).longValueExact())); + document.add(new NumericDocValuesField(key, json.getJsonNumber(key).longValueExact())); } else if (doubleFields.contains(key)) { long sortableLong = NumericUtils.doubleToSortableLong(json.getJsonNumber(key).doubleValue()); - document.add(new SortedNumericDocValuesField(key, sortableLong)); + document.add(new NumericDocValuesField(key, sortableLong)); } else { document.add(new SortedDocValuesField(key, new BytesRef(json.getString(key)))); } @@ -1303,10 +1384,10 @@ private void addSortField(IndexableField field, Document document) { String key = field.name(); if (sortFields.contains(key)) { if (longFields.contains(key)) { - document.add(new SortedNumericDocValuesField(key, field.numericValue().longValue())); + document.add(new NumericDocValuesField(key, field.numericValue().longValue())); } else if (doubleFields.contains(key)) { long sortableLong = NumericUtils.doubleToSortableLong(field.numericValue().doubleValue()); - document.add(new SortedNumericDocValuesField(key, sortableLong)); + document.add(new NumericDocValuesField(key, sortableLong)); } else { 
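// Non-numeric sort fields fall back to a string-based SortedDocValuesField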
document.add(new SortedDocValuesField(key, new BytesRef(field.stringValue()))); } @@ -1504,7 +1585,7 @@ private void update(JsonObject operationBody) throws LuceneException, NumberForm "Lucene locked for " + entityName); } logger.trace("update: {}", document); - bucket.getWriter(new Long(icatId)).updateDocument(new Term("id", icatId), document); + bucket.getWriter(new Long(icatId)).updateDocument(new Term("id", icatId), facetsConfig.build(document)); } } @@ -1531,7 +1612,8 @@ private void updateByRelation(JsonObject operationBody, Boolean delete) Document newDocument = delete ? pruneDocument(parentRelationship.fieldPrefix, oldDocument) : updateDocument(operationBody.getJsonObject("doc"), oldDocument); logger.trace("updateByRelation: {}", newDocument); - bucket.getWriter(new Long(parentId)).updateDocument(new Term("id", parentId), newDocument); + bucket.getWriter(new Long(parentId)).updateDocument(new Term("id", parentId), + facetsConfig.build(newDocument)); } scoreDocs = searcher.searchAfter(scoreDocs[scoreDocs.length - 1], query, blockSize, sort).scoreDocs; } From 8907a7c351d3825a90d40ac20ed555ab98cdfaa0 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Thu, 14 Apr 2022 05:02:32 +0100 Subject: [PATCH 35/73] Basic unit conversion #19 --- .../java/org/icatproject/lucene/Lucene.java | 44 ++++++++++++++++++- 1 file changed, 42 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 6e118f6..ebf9dba 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -37,6 +37,10 @@ import javax.json.JsonValue; import javax.json.JsonValue.ValueType; import javax.json.stream.JsonGenerator; +import javax.measure.IncommensurableException; +import javax.measure.Unit; +import javax.measure.UnitConverter; +import javax.measure.format.MeasurementParseException; import javax.servlet.http.HttpServletRequest; import javax.ws.rs.Consumes; import javax.ws.rs.DELETE; @@ -110,6 +114,9 @@ import org.slf4j.Marker; import org.slf4j.MarkerFactory; +import tech.units.indriya.format.SimpleUnitFormat; +import tech.units.indriya.unit.Units; + @Path("/") @Singleton public class Lucene { @@ -313,6 +320,7 @@ public ParentRelationship(String parentName, String fieldPrefix) { private static final Logger logger = LoggerFactory.getLogger(Lucene.class); private static final Marker fatal = MarkerFactory.getMarker("FATAL"); private static final SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmm"); + private static final SimpleUnitFormat unitFormat = SimpleUnitFormat.getInstance(); private static final Set doubleFields = new HashSet<>(); private static final Set facetFields = new HashSet<>(); @@ -333,11 +341,13 @@ public ParentRelationship(String parentName, String fieldPrefix) { TimeZone tz = TimeZone.getTimeZone("GMT"); df.setTimeZone(tz); - doubleFields.add("numericValue"); + unitFormat.alias(Units.CELSIUS, "celsius"); // TODO this should be generalised with the units we need + + doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI")); facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name")); longFields.addAll(Arrays.asList("date", "startDate", "endDate", "dateTimeValue")); sortFields.addAll(Arrays.asList("datafile.id", "dataset.id", "investigation.id", "id", "date", "startDate", - "endDate", "name")); + "endDate", "name", "stringValue", "dateTimeValue", "numericValue", "numericValueSI")); textFields.addAll(Arrays.asList("name", "visitId", 
"description", "datafileFormat.name", "sample.name", "sample.type.name", "title", "summary", "facility.name", "user.fullName")); @@ -1365,6 +1375,36 @@ private void addField(JsonObject json, Document document, String key) { } else { document.add(new StringField(key, json.getString(key), Store.YES)); } + + // Whenever the units are set or changed, convert to SI + if (key.equals("type.units")) { + String unitString = json.getString("type.units"); + IndexableField field = document.getField("numericValue"); + double value; + if (field != null) { + value = NumericUtils.sortableLongToDouble(field.numericValue().longValue()); + } else if (json.containsKey("numericValue")) { + value = json.getJsonNumber(key).doubleValue(); + } else { + // Strings and date/time values also have units, so if we aren't dealing with a + // number don't convert + return; + } + try { + logger.trace("Attempting to convert {} {}", value, unitString); + Unit unit = unitFormat.parse(unitString); + Unit systemUnit = unit.getSystemUnit(); + UnitConverter converter = unit.getConverterToAny(systemUnit); + Double systemValue = converter.convert(value); + document.add(new DoublePoint("numericValueSI", systemValue)); + document.add(new StoredField("numericValueSI", systemValue)); + long sortableLong = NumericUtils.doubleToSortableLong(systemValue); + document.add(new NumericDocValuesField("numericValueSI", sortableLong)); + document.add(new StringField("type.unitsSI", systemUnit.getName(), Store.YES)); + } catch (IncommensurableException | MeasurementParseException e) { + logger.error("Unable to convert 'type.units' of {} due to {}", unitString, e.getMessage()); + } + } } private void addSortField(JsonObject json, Document document, String key) { From 8438e1fbc72f21d7a8048ccd4554b04e58826df1 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Thu, 14 Apr 2022 05:19:10 +0100 Subject: [PATCH 36/73] Add unit conversion dependencies #19 --- pom.xml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pom.xml b/pom.xml index d6ec96c..ab0d094 100755 --- a/pom.xml +++ b/pom.xml @@ -98,6 +98,18 @@ 7.0 + + javax.measure + unit-api + 2.1.3 + + + + tech.units + indriya + 2.1.3 + + org.icatproject icat.utils From 45a39485abf063bed2d423f67db3d11f0ca630d6 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Sat, 30 Apr 2022 02:03:33 +0100 Subject: [PATCH 37/73] Refactor unit conversion to utils #19 --- pom.xml | 14 +- .../java/org/icatproject/lucene/Lucene.java | 251 +++++++++--------- src/main/resources/run.properties | 1 + 3 files changed, 129 insertions(+), 137 deletions(-) diff --git a/pom.xml b/pom.xml index ab0d094..ae6d0c3 100755 --- a/pom.xml +++ b/pom.xml @@ -98,22 +98,10 @@ 7.0 - - javax.measure - unit-api - 2.1.3 - - - - tech.units - indriya - 2.1.3 - - org.icatproject icat.utils - 4.16.1 + 4.16.2-SNAPSHOT diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index ebf9dba..1edc024 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -32,18 +32,14 @@ import javax.json.JsonArray; import javax.json.JsonNumber; import javax.json.JsonObject; +import javax.json.JsonObjectBuilder; import javax.json.JsonReader; import javax.json.JsonString; import javax.json.JsonValue; import javax.json.JsonValue.ValueType; import javax.json.stream.JsonGenerator; -import javax.measure.IncommensurableException; -import javax.measure.Unit; -import javax.measure.UnitConverter; -import 
javax.measure.format.MeasurementParseException; import javax.servlet.http.HttpServletRequest; import javax.ws.rs.Consumes; -import javax.ws.rs.DELETE; import javax.ws.rs.GET; import javax.ws.rs.POST; import javax.ws.rs.Path; @@ -109,14 +105,13 @@ import org.apache.lucene.util.NumericUtils; import org.icatproject.lucene.exceptions.LuceneException; import org.icatproject.utils.CheckedProperties; +import org.icatproject.utils.IcatUnits; +import org.icatproject.utils.IcatUnits.SystemValue; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Marker; import org.slf4j.MarkerFactory; -import tech.units.indriya.format.SimpleUnitFormat; -import tech.units.indriya.unit.Units; - @Path("/") @Singleton public class Lucene { @@ -302,6 +297,7 @@ public class Search { public Map readerMap; public Query query; public Sort sort; + public boolean scored; public Set fields = new HashSet(); public Set dimensions = new HashSet(); } @@ -320,7 +316,6 @@ public ParentRelationship(String parentName, String fieldPrefix) { private static final Logger logger = LoggerFactory.getLogger(Lucene.class); private static final Marker fatal = MarkerFactory.getMarker("FATAL"); private static final SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmm"); - private static final SimpleUnitFormat unitFormat = SimpleUnitFormat.getInstance(); private static final Set doubleFields = new HashSet<>(); private static final Set facetFields = new HashSet<>(); @@ -341,8 +336,6 @@ public ParentRelationship(String parentName, String fieldPrefix) { TimeZone tz = TimeZone.getTimeZone("GMT"); df.setTimeZone(tz); - unitFormat.alias(Units.CELSIUS, "celsius"); // TODO this should be generalised with the units we need - doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI")); facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name")); longFields.addAll(Arrays.asList("date", "startDate", "endDate", "dateTimeValue")); @@ -388,7 +381,7 @@ public ParentRelationship(String parentName, String fieldPrefix) { investigationParser.setAnalyzer(analyzer); investigationParser.setMultiFields(investigationFields); - CharSequence[] sampleFields = { "sample.name", "sample.type.name" }; + CharSequence[] sampleFields = { "name", "type.name" }; sampleParser.setAllowLeadingWildcard(true); sampleParser.setAnalyzer(analyzer); sampleParser.setMultiFields(sampleFields); @@ -407,6 +400,7 @@ public ParentRelationship(String parentName, String fieldPrefix) { private Timer timer; private Map searches = new ConcurrentHashMap<>(); + private IcatUnits icatUnits; /** * return the version of the lucene server @@ -590,7 +584,7 @@ private void createNow(String entityName, JsonObject documentJson) @POST @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) - @Path("datafiles") + @Path("datafile") public String datafiles(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { Long uid = null; @@ -611,6 +605,7 @@ private Search datafilesQuery(HttpServletRequest request, String sort, Long uid) searches.put(uid, search); Map readerMap = new HashMap<>(); search.readerMap = readerMap; + search.scored = (sort == null || sort.equals("")); search.sort = parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { @@ -656,7 +651,7 @@ private Search datafilesQuery(HttpServletRequest request, String sort, Long uid) @POST @Consumes(MediaType.APPLICATION_JSON) 
@Produces(MediaType.APPLICATION_JSON) - @Path("datasets") + @Path("dataset") public String datasets(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { @@ -679,6 +674,7 @@ private Search datasetsQuery(HttpServletRequest request, String sort, Long uid) searches.put(uid, search); Map readerMap = new HashMap<>(); search.readerMap = readerMap; + search.scored = (sort == null || sort.equals("")); search.sort = parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); @@ -836,11 +832,8 @@ public String facet(@PathParam("entityName") String entityName, @Context HttpSer } } - @DELETE - @Path("freeSearcher/{uid}") - public void freeSearcher(@PathParam("uid") Long uid) throws LuceneException { + public void freeSearcher(Long uid) throws LuceneException { if (uid != null) { // May not be set for internal calls - logger.debug("Requesting freeSearcher {}", uid); Map search = searches.get(uid).readerMap; for (Entry entry : search.entrySet()) { String name = entry.getKey(); @@ -875,6 +868,7 @@ private Search genericQuery(HttpServletRequest request, String sort, Long uid) t searches.put(uid, search); Map readerMap = new HashMap<>(); search.readerMap = readerMap; + search.scored = (sort == null || sort.equals("")); search.sort = parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); @@ -937,28 +931,34 @@ private Search genericQuery(HttpServletRequest request, String sort, Long uid) t .getValuesAs(JsonObject.class)) { Long lower = Long.MIN_VALUE; Long upper = Long.MAX_VALUE; - if (range.containsKey("lower")) { - lower = range.getJsonNumber("lower").longValueExact(); + if (range.containsKey("from")) { + lower = range.getJsonNumber("from").longValueExact(); + } + if (range.containsKey("to")) { + upper = range.getJsonNumber("to").longValueExact(); } - if (range.containsKey("upper")) { - upper = range.getJsonNumber("upper").longValueExact(); + String label = lower.toString() + "-" + upper.toString(); + if (range.containsKey("key")) { + label = range.getString("key"); } - String label = lower.toString() + "_" + upper.toString(); - ranges.add(new LongRange(label, lower, true, upper, true)); + ranges.add(new LongRange(label, lower, true, upper, false)); } } else if (doubleFields.contains(dimension)) { for (JsonObject range : dimensionObject.getJsonArray("ranges") .getValuesAs(JsonObject.class)) { Double lower = Double.MIN_VALUE; Double upper = Double.MAX_VALUE; - if (range.containsKey("lower")) { - lower = range.getJsonNumber("lower").doubleValue(); + if (range.containsKey("from")) { + lower = range.getJsonNumber("from").doubleValue(); } - if (range.containsKey("upper")) { - upper = range.getJsonNumber("upper").doubleValue(); + if (range.containsKey("to")) { + upper = range.getJsonNumber("to").doubleValue(); } - String label = lower.toString() + "_" + upper.toString(); - ranges.add(new DoubleRange(label, lower, true, upper, true)); + String label = lower.toString() + "-" + upper.toString(); + if (range.containsKey("key")) { + label = range.getString("key"); + } + ranges.add(new DoubleRange(label, lower, true, upper, false)); } } else { throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, @@ -1006,6 +1006,8 @@ private void init() { timer = new Timer("LuceneCommitTimer"); timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); + icatUnits 
= new IcatUnits(props.getString("units", "")); + } catch (Exception e) { logger.error(fatal, e.getMessage()); throw new IllegalStateException(e.getMessage()); @@ -1028,7 +1030,7 @@ public void run() { @POST @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) - @Path("investigations") + @Path("investigation") public String investigations(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { Long uid = null; @@ -1049,6 +1051,7 @@ private Search investigationsQuery(HttpServletRequest request, String sort, Long searches.put(uid, search); Map readerMap = new HashMap<>(); search.readerMap = readerMap; + search.scored = (sort == null || sort.equals("")); search.sort = parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); @@ -1184,60 +1187,43 @@ private String luceneFacetResult(String name, Search search, String searchAfter, } logger.debug("Facets found for " + results.size() + " dimensions"); } - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - try (JsonGenerator gen = Json.createGenerator(baos)) { - gen.writeStartObject().writeStartObject("dimensions"); // object containing all facet dimensions - Set dimensionSet = new HashSet<>(); - search.dimensions.forEach(d -> dimensionSet.add(d.getDimension())); - writeFacetResults(dimensionSet, results, gen); - writeFacetResults(new HashSet<>(), rangeResults, gen); - gen.writeEnd().writeEnd(); // object containing dimensions - } - logger.debug("Json returned {}", baos.toString()); - return baos.toString(); - } - - private void writeFacetResults(Set dimensionSet, List results, JsonGenerator gen) { + Set dimensionSet = new HashSet<>(); + search.dimensions.forEach(d -> dimensionSet.add(d.getDimension())); + JsonObjectBuilder aggregationsBuilder = Json.createObjectBuilder(); for (FacetResult result : results) { if (dimensionSet.size() == 0 || dimensionSet.contains(result.dim)) { - gen.writeStartObject(result.dim); // object containing labelValues for a given dimension - for (LabelAndValue labelValue : result.labelValues) { - gen.write(labelValue.label, labelValue.value.longValue()); - } - gen.writeEnd(); // object containing labelValues + buildBuckets(aggregationsBuilder, result); } } + for (FacetResult result : rangeResults) { + buildBuckets(aggregationsBuilder, result); + } + return Json.createObjectBuilder().add("aggregations", aggregationsBuilder).build().toString(); + } + + private void buildBuckets(JsonObjectBuilder aggregationsBuilder, FacetResult result) { + JsonObjectBuilder bucketsBuilder = Json.createObjectBuilder(); + for (LabelAndValue labelValue : result.labelValues) { + JsonObjectBuilder bucketBuilder = Json.createObjectBuilder(); + bucketsBuilder.add(labelValue.label, bucketBuilder.add("doc_count", labelValue.value.longValue())); + } + aggregationsBuilder.add(result.dim, Json.createObjectBuilder().add("buckets", bucketsBuilder)); } private String luceneSearchResult(String name, Search search, String searchAfter, int maxResults, Long uid) throws IOException, LuceneException { IndexSearcher isearcher = getSearcher(search.readerMap, name); - logger.debug("To search in {} for {} {} with {} from {} ", name, search.query, maxResults, isearcher, - searchAfter); + String format = "Search {} with: query {}, maxResults, searchAfter {}, scored {} "; + logger.debug(format, name, search.query, maxResults, searchAfter, search.scored); 
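+		// When searchAfter is provided it marks the last hit of the previous page, letting Lucene resume the search rather than re-collecting earlier results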
FieldDoc searchAfterDoc = parseSearchAfter(searchAfter); - ScoreDoc[] hits; - TotalHits totalHits; - SortField[] fields = null; - if (search.sort == null) { - // Use default score sorting - TopDocs topDocs; - topDocs = searchAfterDoc == null ? isearcher.search(search.query, maxResults) - : isearcher.searchAfter(searchAfterDoc, search.query, maxResults); - hits = topDocs.scoreDocs; - totalHits = topDocs.totalHits; - } else { - // Use specified sorting - TopFieldDocs topFieldDocs; - topFieldDocs = searchAfterDoc == null ? isearcher.search(search.query, maxResults, search.sort) - : isearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, false); - hits = topFieldDocs.scoreDocs; - totalHits = topFieldDocs.totalHits; - fields = topFieldDocs.fields; - } - Float maxScore; - if (hits.length == 0) { - maxScore = Float.NaN; - } else { + TopFieldDocs topFieldDocs = searchAfterDoc == null + ? isearcher.search(search.query, maxResults, search.sort, search.scored) + : isearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, search.scored); + ScoreDoc[] hits = topFieldDocs.scoreDocs; + TotalHits totalHits = topFieldDocs.totalHits; + SortField[] fields = topFieldDocs.fields; + Float maxScore = Float.NaN; + if (hits.length > 0) { maxScore = hits[0].score; } logger.debug("Hits " + totalHits + " maxscore " + maxScore); @@ -1259,10 +1245,15 @@ private String luceneSearchResult(String name, Search search, String searchAfter Document lastDocument = isearcher.doc(lastDoc.doc); gen.writeStartArray("fields"); for (SortField sortField : fields) { - IndexableField indexableField = lastDocument.getField(sortField.getField()); + String fieldName = sortField.getField(); + if (fieldName == null) { + // SCORE sorting will have a null fieldName + gen.write(lastDoc.score); + continue; + } + IndexableField indexableField = lastDocument.getField(fieldName); if (indexableField == null) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Field " - + sortField.getField() + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Field " + fieldName + " used for sorting was not present on the Lucene Document; all sortable fields must also be stored."); } Type type = (sortField instanceof SortedNumericSortField) @@ -1390,25 +1381,29 @@ private void addField(JsonObject json, Document document, String key) { // number don't convert return; } - try { - logger.trace("Attempting to convert {} {}", value, unitString); - Unit unit = unitFormat.parse(unitString); - Unit systemUnit = unit.getSystemUnit(); - UnitConverter converter = unit.getConverterToAny(systemUnit); - Double systemValue = converter.convert(value); - document.add(new DoublePoint("numericValueSI", systemValue)); - document.add(new StoredField("numericValueSI", systemValue)); - long sortableLong = NumericUtils.doubleToSortableLong(systemValue); + logger.trace("Attempting to convert {} {}", value, unitString); + SystemValue systemValue = icatUnits.new SystemValue(value, unitString); + if (systemValue.units != null) { + document.add(new StringField("type.unitsSI", systemValue.units, Store.YES)); + } + if (systemValue.value != null) { + document.add(new DoublePoint("numericValueSI", systemValue.value)); + document.add(new StoredField("numericValueSI", systemValue.value)); + long sortableLong = NumericUtils.doubleToSortableLong(systemValue.value); document.add(new NumericDocValuesField("numericValueSI", sortableLong)); - document.add(new StringField("type.unitsSI", systemUnit.getName(), Store.YES)); - } catch 
(IncommensurableException | MeasurementParseException e) { - logger.error("Unable to convert 'type.units' of {} due to {}", unitString, e.getMessage()); - } } } private void addSortField(JsonObject json, Document document, String key) { if (sortFields.contains(key)) { + if (key.equals("id")) { + // Id is a special case, as we need it to be SORTED as a byte ref to allow joins + // but also SORTED_NUMERIC to ensure a deterministic order to results + Long value = new Long(json.getString(key)); + document.add(new NumericDocValuesField("id.long", value)); + document.add(new StoredField("id.long", value)); + } if (longFields.contains(key)) { document.add(new NumericDocValuesField(key, json.getJsonNumber(key).longValueExact())); } else if (doubleFields.contains(key)) { @@ -1423,6 +1418,13 @@ private void addSortField(JsonObject json, Document document, String key) { private void addSortField(IndexableField field, Document document) { String key = field.name(); if (sortFields.contains(key)) { + if (key.equals("id")) { + // Id is a special case, as we need it to be SORTED as a byte ref to allow joins + // but also SORTED_NUMERIC to ensure a deterministic order to results + Long value = new Long(field.stringValue()); + document.add(new NumericDocValuesField("id.long", value)); + document.add(new StoredField("id.long", value)); + } if (longFields.contains(key)) { document.add(new NumericDocValuesField(key, field.numericValue().longValue())); } else if (doubleFields.contains(key)) { @@ -1515,7 +1517,7 @@ private Builder parseParameter(JsonValue p) throws LuceneException { */ private Sort parseSort(String sort) throws LuceneException { if (sort == null || sort.equals("")) { - return null; + return new Sort(SortField.FIELD_SCORE, new SortedNumericSortField("id.long", Type.LONG)); } try (JsonReader reader = Json.createReader(new ByteArrayInputStream(sort.getBytes()))) { JsonObject object = reader.readObject(); @@ -1540,6 +1542,7 @@ private Sort parseSort(String sort) throws LuceneException { fields.add(new SortField(key, Type.STRING, reverse)); } } + fields.add(new SortedNumericSortField("id.long", Type.LONG)); return new Sort(fields.toArray(new SortField[0])); } } @@ -1557,42 +1560,42 @@ private Sort parseSort(String sort) throws LuceneException { * NUMBER */ private FieldDoc parseSearchAfter(String searchAfter) throws LuceneException { - if (searchAfter != null && !searchAfter.equals("")) { - logger.debug("Attempting to parseSearchAfter from {}", searchAfter); - JsonReader reader = Json.createReader(new StringReader(searchAfter)); - JsonObject object = reader.readObject(); - int doc = object.getInt("doc"); - int shardIndex = object.getInt("shardIndex"); - float score = Float.NaN; - List fields = new ArrayList<>(); - if (object.containsKey("score")) { - score = object.getJsonNumber("score").bigDecimalValue().floatValue(); - } - if (object.containsKey("fields")) { - JsonArray jsonArray = object.getJsonArray("fields"); - for (JsonValue value : jsonArray) { - switch (value.getValueType()) { - case NUMBER: - JsonNumber number = ((JsonNumber) value); - if (number.toString().contains(".")) { - fields.add(number.doubleValue()); - } else { - fields.add(number.longValueExact()); - } - break; - case STRING: - fields.add(new BytesRef(((JsonString) value).getString())); - break; - default: - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "fields should be an array of STRING and NUMBER, but had entry of type " - + value.getValueType()); - } + if (searchAfter == null || searchAfter.equals("")) { + 
return null; + } + logger.debug("Attempting to parseSearchAfter from {}", searchAfter); + JsonReader reader = Json.createReader(new StringReader(searchAfter)); + JsonObject object = reader.readObject(); + int doc = object.getInt("doc"); + int shardIndex = object.getInt("shardIndex"); + float score = Float.NaN; + List fields = new ArrayList<>(); + if (object.containsKey("score")) { + score = object.getJsonNumber("score").bigDecimalValue().floatValue(); + } + if (object.containsKey("fields")) { + JsonArray jsonArray = object.getJsonArray("fields"); + for (JsonValue value : jsonArray) { + switch (value.getValueType()) { + case NUMBER: + JsonNumber number = ((JsonNumber) value); + if (number.toString().contains(".")) { + fields.add(number.bigDecimalValue().floatValue()); + } else { + fields.add(number.longValueExact()); + } + break; + case STRING: + fields.add(new BytesRef(((JsonString) value).getString())); + break; + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "fields should be an array of STRING and NUMBER, but had entry of type " + + value.getValueType()); } } - return new FieldDoc(doc, score, fields.toArray(), shardIndex); } - return null; + return new FieldDoc(doc, score, fields.toArray(), shardIndex); } @POST diff --git a/src/main/resources/run.properties b/src/main/resources/run.properties index 4aeab39..25babbd 100644 --- a/src/main/resources/run.properties +++ b/src/main/resources/run.properties @@ -5,3 +5,4 @@ directory = ${HOME}/data/lucene commitSeconds = 5 maxShardSize = 2147483648 ip = 127.0.0.1/32 +units = \u2103: celsius degC, K: kelvin From 885673898951cfdb1d054829f6a498ec18b830f6 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 25 May 2022 17:51:30 +0100 Subject: [PATCH 38/73] Use mapping for parseSearchAfter types #19 --- .../java/org/icatproject/lucene/Lucene.java | 38 +++++++++++++------ 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 1edc024..a28b3ea 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -1213,12 +1213,10 @@ private void buildBuckets(JsonObjectBuilder aggregationsBuilder, FacetResult res private String luceneSearchResult(String name, Search search, String searchAfter, int maxResults, Long uid) throws IOException, LuceneException { IndexSearcher isearcher = getSearcher(search.readerMap, name); - String format = "Search {} with: query {}, maxResults, searchAfter {}, scored {} "; + String format = "Search {} with: query {}, maxResults {}, searchAfter {}, scored {}"; logger.debug(format, name, search.query, maxResults, searchAfter, search.scored); - FieldDoc searchAfterDoc = parseSearchAfter(searchAfter); - TopFieldDocs topFieldDocs = searchAfterDoc == null - ? 
isearcher.search(search.query, maxResults, search.sort, search.scored) - : isearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, search.scored); + FieldDoc searchAfterDoc = parseSearchAfter(searchAfter, search.sort.getSort()); + TopFieldDocs topFieldDocs = isearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, search.scored); ScoreDoc[] hits = topFieldDocs.scoreDocs; TotalHits totalHits = topFieldDocs.totalHits; SortField[] fields = topFieldDocs.fields; @@ -1559,7 +1557,7 @@ private Sort parseSort(String sort) throws LuceneException { * @throws LuceneException If an entry in the fields array is not a STRING or * NUMBER */ - private FieldDoc parseSearchAfter(String searchAfter) throws LuceneException { + private FieldDoc parseSearchAfter(String searchAfter, SortField[] sortFields) throws LuceneException { if (searchAfter == null || searchAfter.equals("")) { return null; } @@ -1575,14 +1573,32 @@ private FieldDoc parseSearchAfter(String searchAfter) throws LuceneException { } if (object.containsKey("fields")) { JsonArray jsonArray = object.getJsonArray("fields"); - for (JsonValue value : jsonArray) { + if (jsonArray.size() != sortFields.length) { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "fields should have the same length as sort, but they were " + + jsonArray.size() + " and " + sortFields.length); + } + for (int i = 0; i < sortFields.length; i++) { + JsonValue value = jsonArray.get(i); switch (value.getValueType()) { case NUMBER: JsonNumber number = ((JsonNumber) value); - if (number.toString().contains(".")) { - fields.add(number.bigDecimalValue().floatValue()); - } else { - fields.add(number.longValueExact()); + switch (sortFields[i].getType()) { + case FLOAT: + case DOUBLE: + case SCORE: + fields.add(number.bigDecimalValue().floatValue()); + break; + case INT: + case LONG: + case DOC: + case CUSTOM: + fields.add(number.longValueExact()); + break; + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "fields contained a NUMBER but the corresponding field was " + + sortFields[i]); } break; case STRING: From 008c68a568d6077810ebbad07125e2923495a5ac Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 1 Jun 2022 11:17:55 +0000 Subject: [PATCH 39/73] WIP sharding changes from stash #19 --- .../java/org/icatproject/lucene/Lucene.java | 471 +++++++++++------- 1 file changed, 294 insertions(+), 177 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index a28b3ea..eead564 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -30,11 +30,13 @@ import javax.ejb.Singleton; import javax.json.Json; import javax.json.JsonArray; +import javax.json.JsonException; import javax.json.JsonNumber; import javax.json.JsonObject; import javax.json.JsonObjectBuilder; import javax.json.JsonReader; import javax.json.JsonString; +import javax.json.JsonStructure; import javax.json.JsonValue; import javax.json.JsonValue.ValueType; import javax.json.stream.JsonGenerator; @@ -71,12 +73,9 @@ import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState; import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts; import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField; -import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import 
org.apache.lucene.index.IndexableField; -import org.apache.lucene.index.MultiReader; -import org.apache.lucene.index.ReaderManager; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.flexible.core.QueryNodeException; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; @@ -89,6 +88,7 @@ import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.SearcherManager; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortedNumericSortField; @@ -119,7 +119,8 @@ public class Lucene { private class ShardBucket { private FSDirectory directory; private IndexWriter indexWriter; - private ReaderManager readerManager; + private SearcherManager searcherManager; + private AtomicLong documentCount; /** * Creates a bucket for accessing the read and write functionality for a single @@ -144,13 +145,29 @@ public ShardBucket(java.nio.file.Path shardPath) throws IOException { indexWriter.commit(); logger.debug("Now have " + indexWriter.getDocStats().numDocs + " documents indexed"); } - readerManager = new ReaderManager(indexWriter); + searcherManager = new SearcherManager(indexWriter, null); + IndexSearcher indexSearcher = null; + try { + indexSearcher = searcherManager.acquire(); + int numDocs = indexSearcher.getIndexReader().numDocs(); + documentCount = new AtomicLong(numDocs); + } finally { + searcherManager.release(indexSearcher); + } + } + + public int commit() throws IOException { + int cached = indexWriter.numRamDocs(); + indexWriter.commit(); + searcherManager.maybeRefreshBlocking(); + return cached; } } private class IndexBucket { private String entityName; - private Map shardMap = new HashMap<>(); + // private Map shardMap = new HashMap<>(); + private List shardList = new ArrayList<>(); private AtomicBoolean locked = new AtomicBoolean(); /** @@ -168,7 +185,8 @@ public IndexBucket(String entityName) { java.nio.file.Path shardPath = luceneDirectory.resolve(entityName); do { ShardBucket shardBucket = new ShardBucket(shardPath); - shardMap.put(shardIndex, shardBucket); + // shardMap.put(shardIndex, shardBucket); + shardList.add(shardBucket); shardIndex++; shardPath = luceneDirectory.resolve(entityName + "_" + shardIndex); } while (Files.isDirectory(shardPath)); @@ -185,12 +203,32 @@ public IndexBucket(String entityName) { * @return Array of DirectoryReaders for all shards in this bucket. 
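	 * Note: in this WIP revision the shards are acquired as IndexSearchers (see acquireSearchers below) rather than as raw DirectoryReaders.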
* @throws IOException */ - public DirectoryReader[] acquireReaders() throws IOException { - List subReaders = new ArrayList<>(); - for (ShardBucket shardBucket : shardMap.values()) { - subReaders.add(shardBucket.readerManager.acquire()); + // public DirectoryReader[] acquireReaders() throws IOException { + // List subReaders = new ArrayList<>(); + // for (ShardBucket shardBucket : shardMap.values()) { + // subReaders.add(shardBucket.searcherManager.acquire()); + // } + // return subReaders.toArray(new DirectoryReader[0]); + // } + + public List acquireSearchers() throws IOException { + List subSearchers = new ArrayList<>(); + // for (ShardBucket shardBucket : shardMap.values()) { + for (ShardBucket shardBucket : shardList) { + subSearchers.add(shardBucket.searcherManager.acquire()); } - return subReaders.toArray(new DirectoryReader[0]); + return subSearchers; + } + + public void addDocument(Document document) throws IOException { + ShardBucket shardBucket = routeShard(); + shardBucket.indexWriter.addDocument(document); + shardBucket.documentCount.incrementAndGet(); + } + + public void updateDocument(Term term, Document document) throws IOException { + ShardBucket shardBucket = routeShard(); + shardBucket.indexWriter.updateDocument(term, document); } /** @@ -202,9 +240,9 @@ public DirectoryReader[] acquireReaders() throws IOException { * @return A new ShardBucket with the provided shardKey. * @throws IOException */ - public ShardBucket buildShardBucket(Long shardKey) throws IOException { + public ShardBucket buildShardBucket(int shardKey) throws IOException { ShardBucket shardBucket = new ShardBucket(luceneDirectory.resolve(entityName + "_" + shardKey)); - shardMap.put(shardKey, shardBucket); + shardList.add(shardBucket); return shardBucket; } @@ -218,15 +256,13 @@ public ShardBucket buildShardBucket(Long shardKey) throws IOException { * @throws IOException */ public void commit(String command, String entityName) throws IOException { - for (Entry entry : shardMap.entrySet()) { - ShardBucket shardBucket = entry.getValue(); - int cached = shardBucket.indexWriter.numRamDocs(); - shardBucket.indexWriter.commit(); + // for (Entry entry : shardMap.entrySet()) { + for (ShardBucket shardBucket : shardList) { + int cached = shardBucket.commit(); if (cached != 0) { - logger.debug("{} has committed {} {} changes to Lucene - now have {} documents indexed in shard {}", - command, cached, entityName, shardBucket.indexWriter.getDocStats().numDocs, entry.getKey()); + logger.debug("{} has committed {} {} changes to Lucene - now have {} documents indexed in {}", + command, cached, entityName, shardBucket.indexWriter.getDocStats().numDocs, shardBucket.directory.getDirectory().toString()); } - shardBucket.readerManager.maybeRefreshBlocking(); } } @@ -236,8 +272,9 @@ public void commit(String command, String entityName) throws IOException { * @throws IOException */ public void close() throws IOException { - for (ShardBucket shardBucket : shardMap.values()) { - shardBucket.readerManager.close(); + // for (ShardBucket shardBucket : shardMap.values()) { + for (ShardBucket shardBucket : shardList) { + shardBucket.searcherManager.close(); shardBucket.indexWriter.commit(); shardBucket.indexWriter.close(); shardBucket.directory.close(); @@ -254,15 +291,22 @@ public void close() throws IOException { * @return The ShardBucket that the relevant Document is/should be indexed in. 
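The shard routing documented here becomes append-only in this patch: writes go to the newest shard until it holds luceneMaxShardSize documents, then the next shard is opened. A self-contained sketch of just that rule (CountedShard is a hypothetical stand-in for ShardBucket; the real code also commits the full shard and opens a directory named entityName_<index>):

    import java.util.ArrayList;
    import java.util.List;

    public class AppendOnlyRouterSketch {
        // Hypothetical stand-in for ShardBucket: only the document count matters here.
        static class CountedShard {
            long documentCount;
        }

        private final long maxShardSize;
        private final List<CountedShard> shards = new ArrayList<>();

        AppendOnlyRouterSketch(long maxShardSize) {
            this.maxShardSize = maxShardSize;
            shards.add(new CountedShard()); // shard 0 always exists
        }

        // Mirrors the routing rule: the last shard receives writes until full,
        // at which point a new shard is appended.
        CountedShard route() {
            CountedShard last = shards.get(shards.size() - 1);
            if (last.documentCount >= maxShardSize) {
                last = new CountedShard();
                shards.add(last);
            }
            last.documentCount++;
            return last;
        }

        public static void main(String[] args) {
            AppendOnlyRouterSketch router = new AppendOnlyRouterSketch(2);
            for (int i = 0; i < 5; i++) {
                router.route();
            }
            System.out.println(router.shards.size() + " shards"); // prints "3 shards"
        }
    }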
* @throws IOException */ - public ShardBucket routeShard(Long id) throws IOException { - if (id == null) { - // If we don't have id, provide the first bucket - return shardMap.get(0L); - } - Long shard = id / luceneMaxShardSize; - ShardBucket shardBucket = shardMap.get(shard); - if (shardBucket == null) { - shardBucket = buildShardBucket(shard); + public ShardBucket routeShard() throws IOException { + // if (id == null || !shardedIndices.contains(entityName.toLowerCase())) { + // // If we don't have id, provide the first bucket + // return shardMap.get(0L); + // } + // Long shard = id / luceneMaxShardSize; + // ShardBucket shardBucket = shardMap.get(shard); + // if (shardBucket == null) { + // shardBucket = buildShardBucket(shard); + // } + // return shardBucket; + int size = shardList.size(); + ShardBucket shardBucket = shardList.get(size - 1); + if (shardBucket.documentCount.get() >= luceneMaxShardSize) { + shardBucket.indexWriter.commit(); + shardBucket = buildShardBucket(size); } return shardBucket; } @@ -275,31 +319,75 @@ public ShardBucket routeShard(Long id) throws IOException { * @return The relevant IndexWriter. * @throws IOException */ - public IndexWriter getWriter(Long id) throws IOException { - return routeShard(id).indexWriter; - } + // public IndexWriter getWriter(String entityName, Long id) throws IOException { + // return routeShard(entityName, id).indexWriter; + // } - public void releaseReaders(DirectoryReader[] subReaders) throws IOException, LuceneException { - if (subReaders.length != shardMap.size()) { + public void releaseReaders(List subSearchers) throws IOException, LuceneException { + if (subSearchers.size() != shardList.size()) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Was expecting the same number of DirectoryReaders as ShardBuckets, but had " - + subReaders.length + ", " + shardMap.size() + " respectively."); + + subSearchers.size() + ", " + shardList.size() + " respectively."); } int i = 0; - for (ShardBucket shardBucket : shardMap.values()) { - shardBucket.readerManager.release(subReaders[i]); + for (ShardBucket shardBucket : shardList) { + shardBucket.searcherManager.release(subSearchers.get(i)); i++; } } } public class Search { - public Map readerMap; + public Map> searcherMap; public Query query; public Sort sort; public boolean scored; public Set fields = new HashSet(); public Set dimensions = new HashSet(); + + /** + * Parses the String from the request into a Lucene Sort object. Multiple sort + * criteria are supported, and will be applied in order. + * + * @param sortString String representation of a JSON object with the field(s) to sort + * as keys, and the direction ("asc" or "desc") as value(s). 
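As a worked example of the sort format described above (values are my illustration, not from the patch): a sort parameter of {"date": "desc", "name": "asc"}, where "date" is in longFields and "name" is not numeric, builds the equivalent of:

    import org.apache.lucene.search.Sort;
    import org.apache.lucene.search.SortField;
    import org.apache.lucene.search.SortField.Type;
    import org.apache.lucene.search.SortedNumericSortField;

    public class SortExample {
        public static void main(String[] args) {
            Sort sort = new Sort(
                    new SortedNumericSortField("date", Type.LONG, true), // "desc" -> reverse = true
                    new SortField("name", Type.STRING, false),           // "asc"  -> reverse = false
                    new SortedNumericSortField("id.long", Type.LONG));   // tie-breaker always appended
            System.out.println(sort);
        }
    }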
+ * @throws LuceneException If the value for any key isn't "asc" or "desc" + */ + public void parseSort(String sortString) throws LuceneException { + if (sortString == null || sortString.equals("")|| sortString.equals("{}")) { + scored = true; + sort = new Sort(SortField.FIELD_SCORE, new SortedNumericSortField("id.long", Type.LONG)); + return; + } + try (JsonReader reader = Json.createReader(new ByteArrayInputStream(sortString.getBytes()))) { + JsonObject object = reader.readObject(); + List fields = new ArrayList<>(); + for (String key : object.keySet()) { + String order = object.getString(key); + Boolean reverse; + if (order.equals("asc")) { + reverse = false; + } else if (order.equals("desc")) { + reverse = true; + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Sort order must be 'asc' or 'desc' but it was '" + order + "'"); + } + + if (longFields.contains(key)) { + fields.add(new SortedNumericSortField(key, Type.LONG, reverse)); + } else if (doubleFields.contains(key)) { + fields.add(new SortedNumericSortField(key, Type.DOUBLE, reverse)); + } else { + fields.add(new SortField(key, Type.STRING, reverse)); + } + } + fields.add(new SortedNumericSortField("id.long", Type.LONG)); + scored = false; + sort = new Sort(fields.toArray(new SortField[0])); + } + } } private static class ParentRelationship { @@ -390,9 +478,10 @@ public ParentRelationship(String parentName, String fieldPrefix) { private final FacetsConfig facetsConfig = new FacetsConfig(); private java.nio.file.Path luceneDirectory; - + private Set shardedIndices; private int luceneCommitMillis; private Long luceneMaxShardSize; + private long maxSearchTimeSeconds; private AtomicLong bucketNum = new AtomicLong(); private Map indexBuckets = new ConcurrentHashMap<>(); @@ -463,13 +552,17 @@ public void modify(@Context HttpServletRequest request) throws LuceneException { public void addNow(@Context HttpServletRequest request, @PathParam("entityName") String entityName) throws LuceneException { List documents; + JsonStructure value = null; logger.debug("Requesting addNow of {}", entityName); try (JsonReader reader = Json.createReader(request.getInputStream())) { - documents = reader.readArray().getValuesAs(JsonObject.class); + value = reader.read(); + documents = ((JsonArray) value).getValuesAs(JsonObject.class); for (JsonObject document : documents) { createNow(entityName, document); } - } catch (IOException e) { + } catch (IOException | JsonException e) { + + logger.error("Could not parse JSON from {}", value.toString()); throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } logger.debug("Added {} {} documents", documents.size(), entityName); @@ -564,7 +657,8 @@ private void create(JsonObject operationBody) throws NumberFormatException, IOEx throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + entityName); } - bucket.getWriter(new Long(icatId)).addDocument(facetsConfig.build(document)); + // bucket.getWriter(entityName, new Long(icatId)).addDocument(facetsConfig.build(document)); + bucket.addDocument(facetsConfig.build(document)); } } @@ -574,11 +668,12 @@ private void createNow(String entityName, JsonObject documentJson) throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, "id was not in the document keys " + documentJson.keySet()); } - String icatId = documentJson.getString("id"); + // String icatId = documentJson.getString("id"); Document document = parseDocument(documentJson); logger.trace("create {} 
{}", entityName, document.toString()); IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); - bucket.getWriter(new Long(icatId)).addDocument(facetsConfig.build(document)); + // bucket.getWriter(entityName, new Long(icatId)).addDocument(facetsConfig.build(document)); + bucket.addDocument(facetsConfig.build(document)); } @POST @@ -603,10 +698,9 @@ private Search datafilesQuery(HttpServletRequest request, String sort, Long uid) throws IOException, QueryNodeException, LuceneException { Search search = new Search(); searches.put(uid, search); - Map readerMap = new HashMap<>(); - search.readerMap = readerMap; - search.scored = (sort == null || sort.equals("")); - search.sort = parseSort(sort); + Map> readerMap = new HashMap<>(); + search.searcherMap = readerMap; + search.parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); @@ -672,10 +766,9 @@ private Search datasetsQuery(HttpServletRequest request, String sort, Long uid) throws IOException, QueryNodeException, LuceneException { Search search = new Search(); searches.put(uid, search); - Map readerMap = new HashMap<>(); - search.readerMap = readerMap; - search.scored = (sort == null || sort.equals("")); - search.sort = parseSort(sort); + Map> readerMap = new HashMap<>(); + search.searcherMap = readerMap; + search.parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); JsonObject query = o.getJsonObject("query"); @@ -749,8 +842,10 @@ private void delete(JsonObject operationBody) throws LuceneException, IOExceptio "Lucene locked for " + entityName); } logger.trace("delete {} {}", entityName, icatId); - ShardBucket shardBucket = bucket.routeShard(new Long(icatId)); - shardBucket.indexWriter.deleteDocuments(new Term("id", icatId)); + for (ShardBucket shardBucket: bucket.shardList) { + shardBucket.indexWriter.deleteDocuments(new Term("id", icatId)); + } + // ShardBucket shardBucket = bucket.routeShard(entityName, new Long(icatId)); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -826,7 +921,6 @@ public String facet(@PathParam("entityName") String entityName, @Context HttpSer Search search = genericQuery(request, sort, uid); return luceneFacetResult(entityName, search, searchAfter, maxResults, maxLabels, uid); } catch (Exception e) { - logger.error("Error", e); freeSearcher(uid); throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -834,10 +928,10 @@ public String facet(@PathParam("entityName") String entityName, @Context HttpSer public void freeSearcher(Long uid) throws LuceneException { if (uid != null) { // May not be set for internal calls - Map search = searches.get(uid).readerMap; - for (Entry entry : search.entrySet()) { + Map> search = searches.get(uid).searcherMap; + for (Entry> entry : search.entrySet()) { String name = entry.getKey(); - DirectoryReader[] subReaders = entry.getValue(); + List subReaders = entry.getValue(); try { indexBuckets.computeIfAbsent(name, k -> new IndexBucket(k)).releaseReaders(subReaders); } catch (IOException e) { @@ -866,10 +960,9 @@ public void freeSearcher(Long uid) throws LuceneException { private Search genericQuery(HttpServletRequest request, String sort, Long uid) throws IOException, LuceneException { Search search = new Search(); searches.put(uid, search); - Map readerMap = new HashMap<>(); - search.readerMap = readerMap; - search.scored = (sort == null || 
sort.equals("")); - search.sort = parseSort(sort); + Map> readerMap = new HashMap<>(); + search.searcherMap = readerMap; + search.parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); JsonObject jsonQuery = o.getJsonObject("query"); @@ -974,18 +1067,23 @@ private Search genericQuery(HttpServletRequest request, String sort, Long uid) t return search; } - private MultiReader getMultiReader(Map readerMap, String name) throws IOException { - DirectoryReader[] subReaders = readerMap.get(name); - if (subReaders == null) { - subReaders = indexBuckets.computeIfAbsent(name, k -> new IndexBucket(k)).acquireReaders(); - readerMap.put(name, subReaders); + private List getSearchers(Map> readerMap, String name) throws IOException { + List subSearchers = readerMap.get(name); + if (subSearchers == null) { + subSearchers = indexBuckets.computeIfAbsent(name, k -> new IndexBucket(k)).acquireSearchers(); + readerMap.put(name, subSearchers); logger.debug("Remember searcher for {}", name); } - return new MultiReader(subReaders, false); + return subSearchers; } - private IndexSearcher getSearcher(Map readerMap, String name) throws IOException { - return new IndexSearcher(getMultiReader(readerMap, name)); + private IndexSearcher getSearcher(Map> readerMap, String name) throws IOException, LuceneException { + List subSearchers = readerMap.get(name); + subSearchers = getSearchers(readerMap, name); + if (subSearchers.size() > 1) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Cannot get single IndexSearcher for " + name + " as it has " + subSearchers.size() + " shards"); + } + return subSearchers.get(0); } @PostConstruct @@ -1002,18 +1100,23 @@ private void init() { luceneCommitMillis = props.getPositiveInt("commitSeconds") * 1000; luceneMaxShardSize = Math.max(props.getPositiveLong("maxShardSize"), new Long(Integer.MAX_VALUE + 1)); + maxSearchTimeSeconds = props.has("maxSearchTimeSeconds") ? 
props.getPositiveLong("maxSearchTimeSeconds") : 5; timer = new Timer("LuceneCommitTimer"); timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); icatUnits = new IcatUnits(props.getString("units", "")); + String shardedIndicesString = props.getString("shardedIndices", "").toLowerCase(); + shardedIndices = new HashSet<>(Arrays.asList(shardedIndicesString.split("\\s+"))); + } catch (Exception e) { logger.error(fatal, e.getMessage()); throw new IllegalStateException(e.getMessage()); } - logger.info("Initialised icat.lucene"); + logger.info("Initialised icat.lucene with directory {}, commitSeconds {}, maxShardSize {}, shardedIndices {}, maxSearchTimeSeconds {}", + luceneDirectory, luceneCommitMillis, luceneMaxShardSize, shardedIndices, maxSearchTimeSeconds); } class CommitTimerTask extends TimerTask { @@ -1049,10 +1152,9 @@ private Search investigationsQuery(HttpServletRequest request, String sort, Long throws IOException, QueryNodeException, LuceneException { Search search = new Search(); searches.put(uid, search); - Map readerMap = new HashMap<>(); - search.readerMap = readerMap; - search.scored = (sort == null || sort.equals("")); - search.sort = parseSort(sort); + Map> readerMap = new HashMap<>(); + search.searcherMap = readerMap; + search.parseSort(sort); try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); JsonObject query = o.getJsonObject("query"); @@ -1130,7 +1232,7 @@ public void lock(@PathParam("entityName") String entityName) throws LuceneExcept throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene already locked for " + entityName); } try { - for (ShardBucket shardBucket : bucket.shardMap.values()) { + for (ShardBucket shardBucket : bucket.shardList) { shardBucket.indexWriter.deleteAll(); } } catch (IOException e) { @@ -1140,83 +1242,130 @@ public void lock(@PathParam("entityName") String entityName) throws LuceneExcept private String luceneFacetResult(String name, Search search, String searchAfter, int maxResults, int maxLabels, Long uid) throws IOException, IllegalStateException, LuceneException { - List results = new ArrayList<>(); - List rangeResults = new ArrayList<>(); + Map> results = new HashMap<>(); + Map> rangeResults = new HashMap<>(); if (maxResults <= 0 || maxLabels <= 0) { // This will result in no Facets and a null pointer, so return early logger.warn("Cannot facet when maxResults={}, maxLabels={}, returning empty list", maxResults, maxLabels); } else { - MultiReader directoryReader = getMultiReader(search.readerMap, name); - IndexSearcher indexSearcher = new IndexSearcher(directoryReader); - FacetsCollector facetsCollector = new FacetsCollector(); - FacetsCollector.search(indexSearcher, search.query, maxResults, facetsCollector); - logger.debug("To facet in {} for {} {} with {} from {} ", name, search.query, maxResults, indexSearcher, - searchAfter); - for (FacetDimensionRequest facetDimensionRequest : search.dimensions) { - if (facetDimensionRequest.getRanges().size() > 0) { - String dimension = facetDimensionRequest.getDimension(); - if (longFields.contains(dimension)) { - LongRange[] ranges = facetDimensionRequest.getRanges().toArray(new LongRange[0]); - Facets facets = new LongRangeFacetCounts(dimension, facetsCollector, ranges); - rangeResults.addAll(facets.getAllDims(maxLabels)); - } else if (doubleFields.contains(dimension)) { - DoubleRange[] ranges = facetDimensionRequest.getRanges().toArray(new DoubleRange[0]); - Facets facets = new DoubleRangeFacetCounts(dimension, 
facetsCollector, ranges); - rangeResults.addAll(facets.getAllDims(maxLabels)); - } else { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "'ranges' specified for dimension " + dimension - + " but this is not a supported numeric field"); + List searchers = getSearchers(search.searcherMap, name); + for (IndexSearcher indexSearcher : searchers) { + FacetsCollector facetsCollector = new FacetsCollector(); + FacetsCollector.search(indexSearcher, search.query, maxResults, facetsCollector); + logger.debug("To facet in {} for {} {} with {} from {} ", name, search.query, maxResults, indexSearcher, + searchAfter); + for (FacetDimensionRequest facetDimensionRequest : search.dimensions) { + if (facetDimensionRequest.getRanges().size() > 0) { + String dimension = facetDimensionRequest.getDimension(); + if (longFields.contains(dimension)) { + LongRange[] ranges = facetDimensionRequest.getRanges().toArray(new LongRange[0]); + Facets facets = new LongRangeFacetCounts(dimension, facetsCollector, ranges); + putFacets(maxLabels, rangeResults, facets); + } else if (doubleFields.contains(dimension)) { + DoubleRange[] ranges = facetDimensionRequest.getRanges().toArray(new DoubleRange[0]); + Facets facets = new DoubleRangeFacetCounts(dimension, facetsCollector, ranges); + putFacets(maxLabels, rangeResults, facets); + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "'ranges' specified for dimension " + dimension + + " but this is not a supported numeric field"); + } } } - } - try { - DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(directoryReader); - Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); - logger.debug("facets: {}, maxLabels: {}, maxResults: {}", facets, maxLabels, maxResults); - results = facets.getAllDims(maxLabels); - } catch (IllegalArgumentException e) { - // This can occur if no fields in the index have been faceted - logger.error("No facets found in index, resulting in error: " + e.getClass() + " " + e.getMessage()); - } catch (IllegalStateException e) { - // This can occur if we do not create the IndexSearcher from the same - // DirectoryReader as we used to create the state - logger.error("IndexSearcher used is not based on the DirectoryReader used for facet counting: " - + e.getClass() + " " + e.getMessage()); - throw e; + try { + DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(indexSearcher.getIndexReader()); + Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); + logger.debug("facets: {}, maxLabels: {}, maxResults: {}", facets, maxLabels, maxResults); + putFacets(maxLabels, results, facets); + } catch (IllegalArgumentException e) { + // This can occur if no fields in the index have been faceted + logger.error("No facets found in index, resulting in error: " + e.getClass() + " " + e.getMessage()); + } catch (IllegalStateException e) { + // This can occur if we do not create the IndexSearcher from the same + // DirectoryReader as we used to create the state + logger.error("IndexSearcher used is not based on the DirectoryReader used for facet counting: " + + e.getClass() + " " + e.getMessage()); + throw e; + } } logger.debug("Facets found for " + results.size() + " dimensions"); } Set dimensionSet = new HashSet<>(); search.dimensions.forEach(d -> dimensionSet.add(d.getDimension())); JsonObjectBuilder aggregationsBuilder = Json.createObjectBuilder(); - for (FacetResult result : results) { - if (dimensionSet.size() == 0 || 
dimensionSet.contains(result.dim)) { - buildBuckets(aggregationsBuilder, result); + for (Entry> dimensionEntry : results.entrySet()) { + if (dimensionSet.size() == 0 || dimensionSet.contains(dimensionEntry.getKey())) { + buildBuckets(aggregationsBuilder, dimensionEntry); } } - for (FacetResult result : rangeResults) { - buildBuckets(aggregationsBuilder, result); + for (Entry> dimensionEntry : rangeResults.entrySet()) { + buildBuckets(aggregationsBuilder, dimensionEntry); } return Json.createObjectBuilder().add("aggregations", aggregationsBuilder).build().toString(); } - private void buildBuckets(JsonObjectBuilder aggregationsBuilder, FacetResult result) { + private void putFacets(int maxLabels, Map> rangeResults, Facets facets) + throws IOException { + for (FacetResult result : facets.getAllDims(maxLabels)) { + String dim = result.dim; + if (rangeResults.containsKey(dim)) { + Map labelMap = rangeResults.get(dim); + for (LabelAndValue labelAndValue : result.labelValues) { + String label = labelAndValue.label; + if (labelMap.containsKey(label)) { + labelMap.put(label, labelMap.get(label) + labelAndValue.value.longValue()); + } else { + labelMap.put(label, labelAndValue.value.longValue()); + } + } + } else { + Map labelMap = new HashMap<>(); + for (LabelAndValue labelAndValue : result.labelValues) { + labelMap.put(labelAndValue.label, labelAndValue.value.longValue()); + } + rangeResults.put(dim, labelMap); + } + } + } + + private void buildBuckets(JsonObjectBuilder aggregationsBuilder, Entry> result) { JsonObjectBuilder bucketsBuilder = Json.createObjectBuilder(); - for (LabelAndValue labelValue : result.labelValues) { + for (Entry labelValue : result.getValue().entrySet()) { JsonObjectBuilder bucketBuilder = Json.createObjectBuilder(); - bucketsBuilder.add(labelValue.label, bucketBuilder.add("doc_count", labelValue.value.longValue())); + bucketsBuilder.add(labelValue.getKey(), bucketBuilder.add("doc_count", labelValue.getValue())); } - aggregationsBuilder.add(result.dim, Json.createObjectBuilder().add("buckets", bucketsBuilder)); + aggregationsBuilder.add(result.getKey(), Json.createObjectBuilder().add("buckets", bucketsBuilder)); } private String luceneSearchResult(String name, Search search, String searchAfter, int maxResults, Long uid) throws IOException, LuceneException { - IndexSearcher isearcher = getSearcher(search.readerMap, name); + List searchers = getSearchers(search.searcherMap, name); String format = "Search {} with: query {}, maxResults {}, searchAfter {}, scored {}"; logger.debug(format, name, search.query, maxResults, searchAfter, search.scored); FieldDoc searchAfterDoc = parseSearchAfter(searchAfter, search.sort.getSort()); - TopFieldDocs topFieldDocs = isearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, search.scored); + TopFieldDocs topFieldDocs; + if (searchers.size() > 1) { + List shardHits = new ArrayList<>(); + int i = 0; + long startTime = System.currentTimeMillis(); + for (IndexSearcher indexSearcher : searchers) { + // checkMaxMatches(name, search, indexSearcher); + TopFieldDocs shardDocs = indexSearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, search.scored); + shardHits.add(shardDocs); + logger.debug("{} hits on shard {} out of {} total docs", shardDocs.totalHits, i, indexSearcher.getIndexReader().numDocs()); + i++; + long duration = (System.currentTimeMillis() - startTime); + if (duration > maxSearchTimeSeconds * 1000) { + logger.info("Stopping search after {} shards due to {} ms having elapsed", i, duration); 
break; + } + } + topFieldDocs = TopFieldDocs.merge(search.sort, 0, maxResults, shardHits.toArray(new TopFieldDocs[i]), true); + } else { + IndexSearcher indexSearcher = searchers.get(0); + // checkMaxMatches(name, search, indexSearcher); + topFieldDocs = indexSearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, search.scored); + } ScoreDoc[] hits = topFieldDocs.scoreDocs; TotalHits totalHits = topFieldDocs.totalHits; SortField[] fields = topFieldDocs.fields; @@ -1229,7 +1378,7 @@ private String luceneSearchResult(String name, Search search, String searchAfter try (JsonGenerator gen = Json.createGenerator(baos)) { gen.writeStartObject().writeStartArray("results"); for (ScoreDoc hit : hits) { - encodeResult(gen, hit, isearcher, search); + encodeResult(gen, hit, searchers.get(hit.shardIndex), search); } gen.writeEnd(); // array results if (hits.length == maxResults) { @@ -1240,7 +1389,7 @@ private String luceneSearchResult(String name, Search search, String searchAfter gen.write("score", lastScore); } if (fields != null) { - Document lastDocument = isearcher.doc(lastDoc.doc); + Document lastDocument = searchers.get(lastDoc.shardIndex).doc(lastDoc.doc); gen.writeStartArray("fields"); for (SortField sortField : fields) { String fieldName = sortField.getField(); @@ -1278,10 +1427,17 @@ private String luceneSearchResult(String name, Search search, String searchAfter } gen.writeEnd(); // end enclosing object } - logger.debug("Json returned {}", baos.toString()); + logger.trace("Json returned {}", baos.toString()); return baos.toString(); } + // private void checkMaxMatches(String name, Search search, IndexSearcher indexSearcher) + // throws IOException, LuceneException { + // if (shardedIndices.contains(name.toLowerCase()) && indexSearcher.count(search.query) > luceneMaxMatchingDocuments) { + // throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Query exceeded the maximum number of matching documents " + luceneMaxMatchingDocuments); + // } + // } + private Query maybeEmptyQuery(Builder theQuery) { Query query = theQuery.build(); if (query.toString().isEmpty()) { @@ -1402,6 +1558,7 @@ private void addSortField(JsonObject json, Document document, String key) { document.add(new NumericDocValuesField("id.long", value)); document.add(new StoredField("id.long", value)); } + // TODO add special case for startDate -> date to make sorting easier if (longFields.contains(key)) { document.add(new NumericDocValuesField(key, json.getJsonNumber(key).longValueExact())); } else if (doubleFields.contains(key)) { @@ -1504,47 +1661,6 @@ private Builder parseParameter(JsonValue p) throws LuceneException { return paramQuery; } - /** - * Parses the String from the request into a Lucene Sort object. Multiple sort - * criteria are supported, and will be applied in order. - * - * @param sort String representation of a JSON object with the field(s) to sort - * as keys, and the direction ("asc" or "desc") as value(s). 
- * @return Lucene Sort object - * @throws LuceneException If the value for any key isn't "asc" or "desc" - */ - private Sort parseSort(String sort) throws LuceneException { - if (sort == null || sort.equals("")) { - return new Sort(SortField.FIELD_SCORE, new SortedNumericSortField("id.long", Type.LONG)); - } - try (JsonReader reader = Json.createReader(new ByteArrayInputStream(sort.getBytes()))) { - JsonObject object = reader.readObject(); - List fields = new ArrayList<>(); - for (String key : object.keySet()) { - String order = object.getString(key); - Boolean reverse; - if (order.equals("asc")) { - reverse = false; - } else if (order.equals("desc")) { - reverse = true; - } else { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "Sort order must be 'asc' or 'desc' but it was '" + order + "'"); - } - - if (longFields.contains(key)) { - fields.add(new SortedNumericSortField(key, Type.LONG, reverse)); - } else if (doubleFields.contains(key)) { - fields.add(new SortedNumericSortField(key, Type.DOUBLE, reverse)); - } else { - fields.add(new SortField(key, Type.STRING, reverse)); - } - } - fields.add(new SortedNumericSortField("id.long", Type.LONG)); - return new Sort(fields.toArray(new SortField[0])); - } - } - /** * Parses a Lucene ScoreDoc to be "searched after" from a String representation * of a JSON array. @@ -1564,7 +1680,6 @@ private FieldDoc parseSearchAfter(String searchAfter, SortField[] sortFields) th logger.debug("Attempting to parseSearchAfter from {}", searchAfter); JsonReader reader = Json.createReader(new StringReader(searchAfter)); JsonObject object = reader.readObject(); - int doc = object.getInt("doc"); int shardIndex = object.getInt("shardIndex"); float score = Float.NaN; List fields = new ArrayList<>(); @@ -1611,7 +1726,7 @@ private FieldDoc parseSearchAfter(String searchAfter, SortField[] sortFields) th } } } - return new FieldDoc(doc, score, fields.toArray(), shardIndex); + return new FieldDoc(0, score, fields.toArray(), shardIndex); // TODO } @POST @@ -1644,7 +1759,8 @@ private void update(JsonObject operationBody) throws LuceneException, NumberForm "Lucene locked for " + entityName); } logger.trace("update: {}", document); - bucket.getWriter(new Long(icatId)).updateDocument(new Term("id", icatId), facetsConfig.build(document)); + // bucket.getWriter(entityName, new Long(icatId)).updateDocument(new Term("id", icatId), facetsConfig.build(document)); + bucket.updateDocument(new Term("id", icatId), facetsConfig.build(document)); } } @@ -1671,8 +1787,9 @@ private void updateByRelation(JsonObject operationBody, Boolean delete) Document newDocument = delete ? 
pruneDocument(parentRelationship.fieldPrefix, oldDocument) : updateDocument(operationBody.getJsonObject("doc"), oldDocument); logger.trace("updateByRelation: {}", newDocument); - bucket.getWriter(new Long(parentId)).updateDocument(new Term("id", parentId), - facetsConfig.build(newDocument)); + // bucket.getWriter(parentRelationship.parentName, new Long(parentId)).updateDocument(new Term("id", parentId), + // facetsConfig.build(newDocument)); + bucket.updateDocument(new Term("id", parentId), facetsConfig.build(newDocument)); } scoreDocs = searcher.searchAfter(scoreDocs[scoreDocs.length - 1], query, blockSize, sort).scoreDocs; } From 49373f507ae139b330cc361956836409dd2dfd6e Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 8 Jun 2022 16:00:33 +0000 Subject: [PATCH 40/73] Add fields needed for DGS component #19 --- .../java/org/icatproject/lucene/Lucene.java | 301 ++++++++++-------- 1 file changed, 164 insertions(+), 137 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index eead564..88629b6 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -24,6 +24,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; +import java.util.function.Consumer; import javax.annotation.PostConstruct; import javax.annotation.PreDestroy; @@ -197,23 +198,14 @@ public IndexBucket(String entityName) { } /** - * Acquires DirectoryReaders from the ReaderManagers of the individual shards in + * Acquires IndexSearchers from the SearcherManagers of the individual shards in * this bucket. * * @return Array of DirectoryReaders for all shards in this bucket. * @throws IOException */ - // public DirectoryReader[] acquireReaders() throws IOException { - // List subReaders = new ArrayList<>(); - // for (ShardBucket shardBucket : shardMap.values()) { - // subReaders.add(shardBucket.searcherManager.acquire()); - // } - // return subReaders.toArray(new DirectoryReader[0]); - // } - public List acquireSearchers() throws IOException { List subSearchers = new ArrayList<>(); - // for (ShardBucket shardBucket : shardMap.values()) { for (ShardBucket shardBucket : shardList) { subSearchers.add(shardBucket.searcherManager.acquire()); } @@ -256,12 +248,13 @@ public ShardBucket buildShardBucket(int shardKey) throws IOException { * @throws IOException */ public void commit(String command, String entityName) throws IOException { - // for (Entry entry : shardMap.entrySet()) { for (ShardBucket shardBucket : shardList) { int cached = shardBucket.commit(); if (cached != 0) { + int numDocs = shardBucket.indexWriter.getDocStats().numDocs; + String directoryName = shardBucket.directory.getDirectory().toString(); logger.debug("{} has committed {} {} changes to Lucene - now have {} documents indexed in {}", - command, cached, entityName, shardBucket.indexWriter.getDocStats().numDocs, shardBucket.directory.getDirectory().toString()); + command, cached, entityName, numDocs, directoryName); } } } @@ -282,26 +275,15 @@ public void close() throws IOException { } /** - * Provides the ShardBucket that should be used for reading/writing the Document - * with the provided id. All ids up to luceneMaxShardSize are indexed in the - * first shard, after that a new shard is created for the next - * luceneMaxShardSize Documents and so on. + * Provides the ShardBucket that should be used for writing the next Document. 
+ * All Documents up to luceneMaxShardSize are indexed in the first shard, after + * that a new shard is created for the next luceneMaxShardSize Documents and so + * on. * - * @param id The id of a Document to be routed. * @return The ShardBucket that the relevant Document is/should be indexed in. * @throws IOException */ public ShardBucket routeShard() throws IOException { - // if (id == null || !shardedIndices.contains(entityName.toLowerCase())) { - // // If we don't have id, provide the first bucket - // return shardMap.get(0L); - // } - // Long shard = id / luceneMaxShardSize; - // ShardBucket shardBucket = shardMap.get(shard); - // if (shardBucket == null) { - // shardBucket = buildShardBucket(shard); - // } - // return shardBucket; int size = shardList.size(); ShardBucket shardBucket = shardList.get(size - 1); if (shardBucket.documentCount.get() >= luceneMaxShardSize) { @@ -311,18 +293,6 @@ public ShardBucket routeShard() throws IOException { return shardBucket; } - /** - * Provides the IndexWriter that should be used for writing the Document with - * the provided id. - * - * @param id The id of a Document to be routed. - * @return The relevant IndexWriter. - * @throws IOException - */ - // public IndexWriter getWriter(String entityName, Long id) throws IOException { - // return routeShard(entityName, id).indexWriter; - // } - public void releaseReaders(List subSearchers) throws IOException, LuceneException { if (subSearchers.size() != shardList.size()) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, @@ -343,19 +313,45 @@ public class Search { public Sort sort; public boolean scored; public Set fields = new HashSet(); + public Map> joinedFields = new HashMap<>(); public Set dimensions = new HashSet(); - + + public void parseFields(JsonObject jsonObject) throws LuceneException { + if (jsonObject.containsKey("fields")) { + List fieldStrings = jsonObject.getJsonArray("fields").getValuesAs(JsonString.class); + logger.trace("Parsing fields from {}", fieldStrings); + for (JsonString jsonString : fieldStrings) { + String[] splitString = jsonString.getString().split(" "); + if (splitString.length == 1) { + fields.add(splitString[0]); + } else if (splitString.length == 2) { + if (joinedFields.containsKey(splitString[0])) { + joinedFields.get(splitString[0]).add(splitString[1]); + } else { + joinedFields.putIfAbsent(splitString[0], + new HashSet(Arrays.asList(splitString[1]))); + } + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Could not parse field: " + jsonString.getString()); + } + } + } + + } + /** * Parses the String from the request into a Lucene Sort object. Multiple sort * criteria are supported, and will be applied in order. * - * @param sortString String representation of a JSON object with the field(s) to sort - * as keys, and the direction ("asc" or "desc") as value(s). + * @param sortString String representation of a JSON object with the field(s) to + * sort + * as keys, and the direction ("asc" or "desc") as value(s). 
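For illustration of the parseFields() behaviour introduced above (the entity and field names here are examples, not taken from the patch), a "fields" array entry is either a bare field name or an entity-qualified pair separated by a space:

    public class FieldsRequestSketch {
        public static void main(String[] args) {
            // "name" and "date" are returned from the matched document itself;
            // "investigationuser user.fullName" asks for user.fullName from the
            // joined InvestigationUser documents, as tracked by joinedFields.
            String[] requested = { "name", "date", "investigationuser user.fullName" };
            for (String entry : requested) {
                String[] split = entry.split(" ");
                if (split.length == 1) {
                    System.out.println("own field: " + split[0]);
                } else {
                    System.out.println("joined entity " + split[0] + ", field " + split[1]);
                }
            }
        }
    }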
* @throws LuceneException If the value for any key isn't "asc" or "desc" */ public void parseSort(String sortString) throws LuceneException { - if (sortString == null || sortString.equals("")|| sortString.equals("{}")) { + if (sortString == null || sortString.equals("") || sortString.equals("{}")) { scored = true; sort = new Sort(SortField.FIELD_SCORE, new SortedNumericSortField("id.long", Type.LONG)); return; @@ -426,16 +422,21 @@ public ParentRelationship(String parentName, String fieldPrefix) { doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI")); facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name")); - longFields.addAll(Arrays.asList("date", "startDate", "endDate", "dateTimeValue")); - sortFields.addAll(Arrays.asList("datafile.id", "dataset.id", "investigation.id", "id", "date", "startDate", - "endDate", "name", "stringValue", "dateTimeValue", "numericValue", "numericValueSI")); - textFields.addAll(Arrays.asList("name", "visitId", "description", "datafileFormat.name", "sample.name", - "sample.type.name", "title", "summary", "facility.name", "user.fullName")); + longFields.addAll(Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate")); + sortFields.addAll(Arrays.asList("datafile.id", "dataset.id", "investigation.id", "instrument.id", "id", "date", + "startDate", "endDate", "name", "stringValue", "dateTimeValue", "numericValue", "numericValueSI")); + textFields.addAll(Arrays.asList("name", "visitId", "description", "location", "dataset.name", + "investigation.name", "instrument.name", "instrument.fullName", "datafileFormat.name", "sample.name", + "sample.type.name", "title", "summary", "facility.name", "user.fullName", "type.name")); indexedEntities.addAll(Arrays.asList("Datafile", "Dataset", "Investigation", "DatafileParameter", - "DatasetParameter", "InvestigationParameter", "InvestigationUser", "Sample")); + "DatasetParameter", "InstrumentScientist", "InvestigationInstrument", "InvestigationParameter", + "InvestigationUser", "Sample")); - relationships.put("User", new ParentRelationship[] { new ParentRelationship("InvestigationUser", "user") }); + relationships.put("Instrument", + new ParentRelationship[] { new ParentRelationship("InvestigationInstrument", "instrument") }); + relationships.put("User", new ParentRelationship[] { new ParentRelationship("InvestigationUser", "user"), + new ParentRelationship("InstrumentScientist", "user") }); relationships.put("Sample", new ParentRelationship[] { new ParentRelationship("Dataset", "sample") }); relationships.put("SampleType", new ParentRelationship[] { new ParentRelationship("Sample", "type"), new ParentRelationship("Dataset", "sample.type") }); @@ -449,11 +450,14 @@ public ParentRelationship(String parentName, String fieldPrefix) { new ParentRelationship[] { new ParentRelationship("DatafileParameter", "type"), new ParentRelationship("DatasetParameter", "type"), new ParentRelationship("InvestigationParameter", "type") }); + relationships.put("Investigation", + new ParentRelationship[] { new ParentRelationship("Dataset", "investigation"), + new ParentRelationship("Datafile", "investigation") }); genericParser.setAllowLeadingWildcard(true); genericParser.setAnalyzer(analyzer); - CharSequence[] datafileFields = { "name", "description", "doi", "datafileFormat.name" }; + CharSequence[] datafileFields = { "name", "description", "doi", "location", "datafileFormat.name" }; datafileParser.setAllowLeadingWildcard(true); 
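The parser wiring here (completed just below with setMultiFields) means a bare search term is expanded across all of the listed fields. A minimal sketch of that behaviour in isolation (query string and printout are illustrative):

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
    import org.apache.lucene.search.Query;

    public class MultiFieldParserSketch {
        public static void main(String[] args) throws Exception {
            StandardQueryParser parser = new StandardQueryParser();
            parser.setAllowLeadingWildcard(true);
            parser.setAnalyzer(new StandardAnalyzer());
            CharSequence[] fields = { "name", "description", "doi", "location", "datafileFormat.name" };
            parser.setMultiFields(fields);
            // With multi-fields set, a null default field makes a bare term expand
            // to a disjunction over every field in the array.
            Query query = parser.parse("neutron", null);
            System.out.println(query);
        }
    }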
datafileParser.setAnalyzer(analyzer); datafileParser.setMultiFields(datafileFields); @@ -561,7 +565,7 @@ public void addNow(@Context HttpServletRequest request, @PathParam("entityName") createNow(entityName, document); } } catch (IOException | JsonException e) { - + logger.error("Could not parse JSON from {}", value.toString()); throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -599,6 +603,21 @@ private static void buildDateRanges(Builder queryBuilder, JsonObject queryJson, } } + private void buildUserNameQuery(Map> readerMap, String userName, + BooleanQuery.Builder theQuery, String toField) + throws IOException, LuceneException { + TermQuery fromQuery = new TermQuery(new Term("user.name", userName)); + Query investigationUserQuery = JoinUtil.createJoinQuery("investigation.id", false, toField, fromQuery, + getSearcher(readerMap, "InvestigationUser"), ScoreMode.None); + Query instrumentScientistQuery = JoinUtil.createJoinQuery("instrument.id", false, "instrument.id", fromQuery, + getSearcher(readerMap, "InstrumentScientist"), ScoreMode.None); + Query investigationInstrumentQuery = JoinUtil.createJoinQuery("investigation.id", false, toField, + instrumentScientistQuery, getSearcher(readerMap, "InvestigationInstrument"), ScoreMode.None); + Builder userNameQueryBuilder = new BooleanQuery.Builder(); + userNameQueryBuilder.add(investigationUserQuery, Occur.SHOULD).add(investigationInstrumentQuery, Occur.SHOULD); + theQuery.add(userNameQueryBuilder.build(), Occur.MUST); + } + /* * This is only for testing purposes. Other calls to the service will not * work properly while this operation is in progress. @@ -649,7 +668,6 @@ private void create(JsonObject operationBody) throws NumberFormatException, IOEx updateByRelation(operationBody, false); } if (indexedEntities.contains(entityName)) { - String icatId = operationBody.getString("_id"); Document document = parseDocument(operationBody.getJsonObject("doc")); logger.trace("create {} {}", entityName, document.toString()); IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); @@ -657,7 +675,6 @@ private void create(JsonObject operationBody) throws NumberFormatException, IOEx throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + entityName); } - // bucket.getWriter(entityName, new Long(icatId)).addDocument(facetsConfig.build(document)); bucket.addDocument(facetsConfig.build(document)); } } @@ -668,11 +685,9 @@ private void createNow(String entityName, JsonObject documentJson) throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, "id was not in the document keys " + documentJson.keySet()); } - // String icatId = documentJson.getString("id"); Document document = parseDocument(documentJson); logger.trace("create {} {}", entityName, document.toString()); IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); - // bucket.getWriter(entityName, new Long(icatId)).addDocument(facetsConfig.build(document)); bucket.addDocument(facetsConfig.build(document)); } @@ -710,10 +725,7 @@ private Search datafilesQuery(HttpServletRequest request, String sort, Long uid) BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); if (userName != null) { - Query iuQuery = JoinUtil.createJoinQuery("investigation.id", false, "investigation.id", - new TermQuery(new Term("user.name", userName)), getSearcher(readerMap, "InvestigationUser"), - ScoreMode.None); - theQuery.add(iuQuery, Occur.MUST); + buildUserNameQuery(readerMap, userName, 
theQuery, "investigation.id"); } String text = query.getString("text", null); @@ -734,10 +746,7 @@ private Search datafilesQuery(HttpServletRequest request, String sort, Long uid) } } search.query = maybeEmptyQuery(theQuery); - if (o.containsKey("fields")) { - List jsonStrings = o.getJsonArray("fields").getValuesAs(JsonString.class); - jsonStrings.forEach((jsonString) -> search.fields.add(jsonString.getString())); - } + search.parseFields(o); } return search; } @@ -777,12 +786,7 @@ private Search datasetsQuery(HttpServletRequest request, String sort, Long uid) BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); if (userName != null) { - - Query iuQuery = JoinUtil.createJoinQuery("investigation.id", false, "investigation.id", - new TermQuery(new Term("user.name", userName)), getSearcher(readerMap, "InvestigationUser"), - ScoreMode.None); - - theQuery.add(iuQuery, Occur.MUST); + buildUserNameQuery(readerMap, userName, theQuery, "investigation.id"); } String text = query.getString("text", null); @@ -803,10 +807,7 @@ private Search datasetsQuery(HttpServletRequest request, String sort, Long uid) } } search.query = maybeEmptyQuery(theQuery); - if (o.containsKey("fields")) { - List jsonStrings = o.getJsonArray("fields").getValuesAs(JsonString.class); - jsonStrings.forEach((jsonString) -> search.fields.add(jsonString.getString())); - } + search.parseFields(o); } return search; } @@ -842,10 +843,9 @@ private void delete(JsonObject operationBody) throws LuceneException, IOExceptio "Lucene locked for " + entityName); } logger.trace("delete {} {}", entityName, icatId); - for (ShardBucket shardBucket: bucket.shardList) { + for (ShardBucket shardBucket : bucket.shardList) { shardBucket.indexWriter.deleteDocuments(new Term("id", icatId)); } - // ShardBucket shardBucket = bucket.routeShard(entityName, new Long(icatId)); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -864,9 +864,10 @@ private void delete(JsonObject operationBody) throws LuceneException, IOExceptio * @param searcher IndexSearcher used to get the Document for the hit. * @param search Search object containing the fields to return. 
* @throws IOException + * @throws LuceneException */ - private void encodeResult(JsonGenerator gen, ScoreDoc hit, IndexSearcher searcher, Search search) - throws IOException { + private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, IndexSearcher searcher, Search search) + throws IOException, LuceneException { int luceneDocId = hit.doc; Document document = searcher.doc(luceneDocId); gen.writeStartObject().write("_id", luceneDocId); @@ -875,9 +876,42 @@ private void encodeResult(JsonGenerator gen, ScoreDoc hit, IndexSearcher searche gen.write("_score", hit.score); } gen.writeStartObject("_source"); - document.forEach((field) -> { + document.forEach(encodeField(gen, search.fields)); + for (String joinedEntityName : search.joinedFields.keySet()) { + List searchers = getSearchers(search.searcherMap, joinedEntityName); + Search joinedSearch = new Search(); + String fld; + String parentId; + if (joinedEntityName.toLowerCase().contains("investigation")) { + fld = "investigation.id"; + if (entityName.toLowerCase().equals("investigation")) { + parentId = document.get("id"); + } else { + parentId = document.get("investigation.id"); + } + } else { + fld = entityName.toLowerCase() + ".id"; + parentId = document.get("id"); + } + joinedSearch.query = new TermQuery(new Term(fld, parentId)); + joinedSearch.sort = new Sort(new SortedNumericSortField("id.long", Type.LONG)); + TopFieldDocs topFieldDocs = searchShards(joinedSearch, 100, searchers, null); + gen.writeStartArray(joinedEntityName.toLowerCase()); + for (ScoreDoc joinedHit : topFieldDocs.scoreDocs) { + gen.writeStartObject(); + Document joinedDocument = searchers.get(joinedHit.shardIndex).doc(joinedHit.doc); + joinedDocument.forEach(encodeField(gen, search.joinedFields.get(joinedEntityName))); + gen.writeEnd(); + } + gen.writeEnd(); + } + gen.writeEnd().writeEnd(); // source object, result object + } + + private Consumer encodeField(JsonGenerator gen, Set fields) { + return (field) -> { String fieldName = field.name(); - if (search.fields.contains(fieldName)) { + if (fields.contains(fieldName)) { if (longFields.contains(fieldName)) { gen.write(fieldName, field.numericValue().longValue()); } else if (doubleFields.contains(fieldName)) { @@ -886,8 +920,7 @@ private void encodeResult(JsonGenerator gen, ScoreDoc hit, IndexSearcher searche gen.write(fieldName, field.stringValue()); } } - }); - gen.writeEnd().writeEnd(); // source object, result object + }; } @PreDestroy @@ -1003,11 +1036,7 @@ private Search genericQuery(HttpServletRequest request, String sort, Long uid) t } search.query = maybeEmptyQuery(luceneQuery); logger.info("Query: {}", search.query); - if (o.containsKey("fields")) { - List jsonStrings = o.getJsonArray("fields").getValuesAs(JsonString.class); - jsonStrings.forEach((jsonString) -> search.fields.add(jsonString.getString())); - logger.info("Fields: {}", search.fields); - } + search.parseFields(o); if (o.containsKey("dimensions")) { List dimensionObjects = o.getJsonArray("dimensions").getValuesAs(JsonObject.class); for (JsonObject dimensionObject : dimensionObjects) { @@ -1067,7 +1096,8 @@ private Search genericQuery(HttpServletRequest request, String sort, Long uid) t return search; } - private List getSearchers(Map> readerMap, String name) throws IOException { + private List getSearchers(Map> readerMap, String name) + throws IOException { List subSearchers = readerMap.get(name); if (subSearchers == null) { subSearchers = indexBuckets.computeIfAbsent(name, k -> new IndexBucket(k)).acquireSearchers(); @@ -1077,11 
+1107,13 @@ private List getSearchers(Map> reader return subSearchers; } - private IndexSearcher getSearcher(Map> readerMap, String name) throws IOException, LuceneException { + private IndexSearcher getSearcher(Map> readerMap, String name) + throws IOException, LuceneException { List subSearchers = readerMap.get(name); subSearchers = getSearchers(readerMap, name); if (subSearchers.size() > 1) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Cannot get single IndexSearcher for " + name + " as it has " + subSearchers.size() + " shards"); + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, + "Cannot get single IndexSearcher for " + name + " as it has " + subSearchers.size() + " shards"); } return subSearchers.get(0); } @@ -1100,7 +1132,8 @@ private void init() { luceneCommitMillis = props.getPositiveInt("commitSeconds") * 1000; luceneMaxShardSize = Math.max(props.getPositiveLong("maxShardSize"), new Long(Integer.MAX_VALUE + 1)); - maxSearchTimeSeconds = props.has("maxSearchTimeSeconds") ? props.getPositiveLong("maxSearchTimeSeconds") : 5; + maxSearchTimeSeconds = props.has("maxSearchTimeSeconds") ? props.getPositiveLong("maxSearchTimeSeconds") + : 5; timer = new Timer("LuceneCommitTimer"); timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); @@ -1115,7 +1148,8 @@ private void init() { throw new IllegalStateException(e.getMessage()); } - logger.info("Initialised icat.lucene with directory {}, commitSeconds {}, maxShardSize {}, shardedIndices {}, maxSearchTimeSeconds {}", + logger.info( + "Initialised icat.lucene with directory {}, commitSeconds {}, maxShardSize {}, shardedIndices {}, maxSearchTimeSeconds {}", luceneDirectory, luceneCommitMillis, luceneMaxShardSize, shardedIndices, maxSearchTimeSeconds); } @@ -1163,10 +1197,7 @@ private Search investigationsQuery(HttpServletRequest request, String sort, Long BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); if (userName != null) { - Query iuQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", - new TermQuery(new Term("user.name", userName)), getSearcher(readerMap, "InvestigationUser"), - ScoreMode.None); - theQuery.add(iuQuery, Occur.MUST); + buildUserNameQuery(readerMap, userName, theQuery, "id"); } String text = query.getString("text", null); @@ -1213,10 +1244,7 @@ private Search investigationsQuery(HttpServletRequest request, String sort, Long } search.query = maybeEmptyQuery(theQuery); - if (o.containsKey("fields")) { - List jsonStrings = o.getJsonArray("fields").getValuesAs(JsonString.class); - jsonStrings.forEach((jsonString) -> search.fields.add(jsonString.getString())); - } + search.parseFields(o); } logger.info("Query: {}", search.query); return search; @@ -1273,13 +1301,15 @@ private String luceneFacetResult(String name, Search search, String searchAfter, } } try { - DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(indexSearcher.getIndexReader()); + DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState( + indexSearcher.getIndexReader()); Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); logger.debug("facets: {}, maxLabels: {}, maxResults: {}", facets, maxLabels, maxResults); putFacets(maxLabels, results, facets); } catch (IllegalArgumentException e) { // This can occur if no fields in the index have been faceted - logger.error("No facets found in index, resulting in error: " + e.getClass() + " " + e.getMessage()); + logger.error( + "No facets found in index, 
resulting in error: " + e.getClass() + " " + e.getMessage()); } catch (IllegalStateException e) { // This can occur if we do not create the IndexSearcher from the same // DirectoryReader as we used to create the state @@ -1343,29 +1373,7 @@ private String luceneSearchResult(String name, Search search, String searchAfter String format = "Search {} with: query {}, maxResults {}, searchAfter {}, scored {}"; logger.debug(format, name, search.query, maxResults, searchAfter, search.scored); FieldDoc searchAfterDoc = parseSearchAfter(searchAfter, search.sort.getSort()); - TopFieldDocs topFieldDocs; - if (searchers.size() > 0) { - List shardHits = new ArrayList<>(); - int i = 0; - long startTime = System.currentTimeMillis(); - for (IndexSearcher indexSearcher : searchers) { - // checkMaxMatches(name, search, indexSearcher); - TopFieldDocs shardDocs = indexSearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, search.scored); - shardHits.add(shardDocs); - logger.debug("{} hits on shard {} out of {} total docs", shardDocs.totalHits, i, indexSearcher.getIndexReader().numDocs()); - i++; - long duration = (System.currentTimeMillis() - startTime); - if (duration > maxSearchTimeSeconds * 1000) { - logger.info("Stopping search after {} shards due to {} ms having elapsed", i, duration); - break; - } - } - topFieldDocs = TopFieldDocs.merge(search.sort, 0, maxResults, shardHits.toArray(new TopFieldDocs[i]), true); - } else { - IndexSearcher indexSearcher = searchers.get(0); - // checkMaxMatches(name, search, indexSearcher); - topFieldDocs = indexSearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, search.scored); - } + TopFieldDocs topFieldDocs = searchShards(search, maxResults, searchers, searchAfterDoc); ScoreDoc[] hits = topFieldDocs.scoreDocs; TotalHits totalHits = topFieldDocs.totalHits; SortField[] fields = topFieldDocs.fields; @@ -1378,7 +1386,7 @@ private String luceneSearchResult(String name, Search search, String searchAfter try (JsonGenerator gen = Json.createGenerator(baos)) { gen.writeStartObject().writeStartArray("results"); for (ScoreDoc hit : hits) { - encodeResult(gen, hit, searchers.get(hit.shardIndex), search); + encodeResult(name, gen, hit, searchers.get(hit.shardIndex), search); } gen.writeEnd(); // array results if (hits.length == maxResults) { @@ -1431,12 +1439,34 @@ private String luceneSearchResult(String name, Search search, String searchAfter return baos.toString(); } - // private void checkMaxMatches(String name, Search search, IndexSearcher indexSearcher) - // throws IOException, LuceneException { - // if (shardedIndices.contains(name.toLowerCase()) && indexSearcher.count(search.query) > luceneMaxMatchingDocuments) { - // throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Query exceeded the maximum number of matching documents " + luceneMaxMatchingDocuments); - // } - // } + private TopFieldDocs searchShards(Search search, int maxResults, List searchers, + FieldDoc searchAfterDoc) throws IOException { + TopFieldDocs topFieldDocs; + if (searchers.size() > 0) { + List shardHits = new ArrayList<>(); + int i = 0; + long startTime = System.currentTimeMillis(); + for (IndexSearcher indexSearcher : searchers) { + TopFieldDocs shardDocs = indexSearcher.searchAfter(searchAfterDoc, search.query, maxResults, + search.sort, search.scored); + shardHits.add(shardDocs); + logger.debug("{} hits on shard {} out of {} total docs", shardDocs.totalHits, i, + indexSearcher.getIndexReader().numDocs()); + i++; + long duration = 
(System.currentTimeMillis() - startTime); + if (duration > maxSearchTimeSeconds * 1000) { + logger.info("Stopping search after {} shards due to {} ms having elapsed", i, duration); + break; + } + } + topFieldDocs = TopFieldDocs.merge(search.sort, 0, maxResults, shardHits.toArray(new TopFieldDocs[i]), true); + } else { + IndexSearcher indexSearcher = searchers.get(0); + topFieldDocs = indexSearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, + search.scored); + } + return topFieldDocs; + } private Query maybeEmptyQuery(Builder theQuery) { Query query = theQuery.build(); @@ -1504,7 +1534,7 @@ private void addField(JsonObject json, Document document, String key) { // Likewise, faceted fields should be considered separately if (facetFields.contains(key)) { - document.add(new SortedSetDocValuesFacetField(key, json.getString(key))); + document.add(new SortedSetDocValuesFacetField(key + ".keyword", json.getString(key))); } if (doubleFields.contains(key)) { @@ -1558,7 +1588,7 @@ private void addSortField(JsonObject json, Document document, String key) { document.add(new NumericDocValuesField("id.long", value)); document.add(new StoredField("id.long", value)); } - // TODO add special case for startDate -> date to make sorting easier + // TODO add special case for startDate -> date to make sorting easier? if (longFields.contains(key)) { document.add(new NumericDocValuesField(key, json.getJsonNumber(key).longValueExact())); } else if (doubleFields.contains(key)) { @@ -1759,7 +1789,6 @@ private void update(JsonObject operationBody) throws LuceneException, NumberForm "Lucene locked for " + entityName); } logger.trace("update: {}", document); - // bucket.getWriter(entityName, new Long(icatId)).updateDocument(new Term("id", icatId), facetsConfig.build(document)); bucket.updateDocument(new Term("id", icatId), facetsConfig.build(document)); } } @@ -1787,8 +1816,6 @@ private void updateByRelation(JsonObject operationBody, Boolean delete) Document newDocument = delete ? 
pruneDocument(parentRelationship.fieldPrefix, oldDocument) : updateDocument(operationBody.getJsonObject("doc"), oldDocument); logger.trace("updateByRelation: {}", newDocument); - // bucket.getWriter(parentRelationship.parentName, new Long(parentId)).updateDocument(new Term("id", parentId), - // facetsConfig.build(newDocument)); bucket.updateDocument(new Term("id", parentId), facetsConfig.build(newDocument)); } scoreDocs = searcher.searchAfter(scoreDocs[scoreDocs.length - 1], query, blockSize, sort).scoreDocs; From 2fc0f8ef6a06e95d4ef0681dada6dd7f0881b150 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Fri, 10 Jun 2022 20:18:58 +0100 Subject: [PATCH 41/73] Use .keyword for string facets #19 --- .../lucene/FacetDimensionRequest.java | 26 ---- .../icatproject/lucene/FacetedDimension.java | 88 +++++++++++ .../java/org/icatproject/lucene/Lucene.java | 142 ++++++++++-------- 3 files changed, 165 insertions(+), 91 deletions(-) delete mode 100644 src/main/java/org/icatproject/lucene/FacetDimensionRequest.java create mode 100644 src/main/java/org/icatproject/lucene/FacetedDimension.java diff --git a/src/main/java/org/icatproject/lucene/FacetDimensionRequest.java b/src/main/java/org/icatproject/lucene/FacetDimensionRequest.java deleted file mode 100644 index 736f2d3..0000000 --- a/src/main/java/org/icatproject/lucene/FacetDimensionRequest.java +++ /dev/null @@ -1,26 +0,0 @@ -package org.icatproject.lucene; - -import java.util.ArrayList; -import java.util.List; - -import org.apache.lucene.facet.range.Range; - -public class FacetDimensionRequest { - - private String dimension; - private List ranges; - - public FacetDimensionRequest(String dimension) { - this.dimension = dimension; - this.ranges = new ArrayList<>(); - } - - public List getRanges() { - return ranges; - } - - public String getDimension() { - return dimension; - } - -} diff --git a/src/main/java/org/icatproject/lucene/FacetedDimension.java b/src/main/java/org/icatproject/lucene/FacetedDimension.java new file mode 100644 index 0000000..3173d27 --- /dev/null +++ b/src/main/java/org/icatproject/lucene/FacetedDimension.java @@ -0,0 +1,88 @@ +package org.icatproject.lucene; + +import java.util.ArrayList; +import java.util.List; + +import javax.json.Json; +import javax.json.JsonObjectBuilder; + +import org.apache.lucene.facet.FacetResult; +import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.range.Range; + +public class FacetedDimension { + + private String dimension; + private List ranges; + private List labels; + private List counts; + + /** + * For a single dimension (field), stores labels (the unique values or ranges of + * values for that field in the index) and their respective counts (the number + * of times that label appears in different documents). + * + * For example, a dimension might be "colour", the label "red", and the count 5. + * + * @param dimension The dimension, or field, to be faceted + */ + public FacetedDimension(String dimension) { + this.dimension = dimension; + this.ranges = new ArrayList<>(); + this.labels = new ArrayList<>(); + this.counts = new ArrayList<>(); + } + + /** + * Extracts the count for each label in the FacetResult. If the label has + * already been encountered, the count is incremented rather than being + * overridden. Essentially, this allows faceting to be performed across multiple + * shards. 
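+ * <p>
+ * A minimal sketch of the intended aggregation (the dimension and values are
+ * illustrative, not taken from a real index):
+ * <pre>{@code
+ * FacetedDimension colour = new FacetedDimension("colour");
+ * colour.addResult(shardZeroResult); // suppose "red" was counted 5 times
+ * colour.addResult(shardOneResult);  // suppose "red" 3 times, "blue" 2 times
+ * // the labels and counts now hold "red" -> 8 and "blue" -> 2
+ * }</pre>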
+ *
+ * @param facetResult A Lucene FacetResult object corresponding to the relevant
+ * dimension
+ */
+ public void addResult(FacetResult facetResult) {
+ for (LabelAndValue labelAndValue : facetResult.labelValues) {
+ String label = labelAndValue.label;
+ int labelIndex = labels.indexOf(label);
+ if (labelIndex == -1) {
+ labels.add(label);
+ counts.add(labelAndValue.value.longValue());
+ } else {
+ counts.set(labelIndex, counts.get(labelIndex) + labelAndValue.value.longValue());
+ }
+ }
+ }
+
+ /**
+ * Formats the labels and counts into Json.
+ *
+ * @param aggregationsBuilder The JsonObjectBuilder to add the facets for this
+ * dimension to.
+ */
+ public void buildResponse(JsonObjectBuilder aggregationsBuilder) {
+ JsonObjectBuilder bucketsBuilder = Json.createObjectBuilder();
+ for (int i = 0; i < labels.size(); i++) {
+ JsonObjectBuilder bucketBuilder = Json.createObjectBuilder();
+ bucketsBuilder.add(labels.get(i), bucketBuilder.add("doc_count", counts.get(i)));
+ }
+ aggregationsBuilder.add(dimension, Json.createObjectBuilder().add("buckets", bucketsBuilder));
+ }
+
+ /**
+ * @return The list of Lucene Range Objects for use with numerical facets.
+ * For String faceting, this will be empty.
+ */
+ public List getRanges() {
+ return ranges;
+ }
+
+ /**
+ * @return The dimension that these labels and counts correspond to.
+ */
+ public String getDimension() {
+ return dimension;
+ }
+
+}
diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java
index 88629b6..b83452b 100755
--- a/src/main/java/org/icatproject/lucene/Lucene.java
+++ b/src/main/java/org/icatproject/lucene/Lucene.java
@@ -65,7 +65,6 @@ import org.apache.lucene.facet.Facets;
 import org.apache.lucene.facet.FacetsCollector;
 import org.apache.lucene.facet.FacetsConfig;
-import org.apache.lucene.facet.LabelAndValue;
 import org.apache.lucene.facet.range.DoubleRange;
 import org.apache.lucene.facet.range.DoubleRangeFacetCounts;
 import org.apache.lucene.facet.range.LongRange;
 import org.apache.lucene.facet.range.LongRangeFacetCounts;
@@ -314,7 +313,7 @@ public class Search {
 public boolean scored;
 public Set fields = new HashSet();
 public Map> joinedFields = new HashMap<>();
- public Set dimensions = new HashSet();
+ public Map dimensions = new HashMap();
 public void parseFields(JsonObject jsonObject) throws LuceneException {
 if (jsonObject.containsKey("fields")) {
@@ -426,7 +425,7 @@ public ParentRelationship(String parentName, String fieldPrefix) {
 sortFields.addAll(Arrays.asList("datafile.id", "dataset.id", "investigation.id", "instrument.id", "id", "date",
 "startDate", "endDate", "name", "stringValue", "dateTimeValue", "numericValue", "numericValueSI"));
 textFields.addAll(Arrays.asList("name", "visitId", "description", "location", "dataset.name",
- "investigation.name", "instrument.name", "isntrument.fullName", "datafileFormat.name", "sample.name",
+ "investigation.name", "instrument.name", "instrument.fullName", "datafileFormat.name", "sample.name",
 "sample.type.name", "title", "summary", "facility.name", "user.fullName", "type.name"));
 indexedEntities.addAll(Arrays.asList("Datafile", "Dataset", "Investigation", "DatafileParameter",
 "DatasetParameter", "InstrumentScientist", "InvestigationInstrument", "InvestigationParameter",
 "InvestigationUser", "Sample"));
@@ -869,8 +868,9 @@ private void delete(JsonObject operationBody) throws LuceneException, IOExceptio
 private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, IndexSearcher searcher, Search search)
 throws IOException, LuceneException {
 int luceneDocId = hit.doc;
+ int shardIndex = hit.shardIndex;
 Document document = searcher.doc(luceneDocId);
- gen.writeStartObject().write("_id",
luceneDocId); + gen.writeStartObject().write("_id", luceneDocId).write("_shardIndex", shardIndex); Float score = hit.score; if (!score.equals(Float.NaN)) { gen.write("_score", hit.score); @@ -1045,7 +1045,7 @@ private Search genericQuery(HttpServletRequest request, String sort, Long uid) t "'dimension' not specified for facet request " + dimensionObject.toString()); } String dimension = dimensionObject.getString("dimension"); - FacetDimensionRequest facetDimensionRequest = new FacetDimensionRequest(dimension); + FacetedDimension facetDimensionRequest = new FacetedDimension(dimension); if (dimensionObject.containsKey("ranges")) { List ranges = facetDimensionRequest.getRanges(); if (longFields.contains(dimension)) { @@ -1088,7 +1088,7 @@ private Search genericQuery(HttpServletRequest request, String sort, Long uid) t + " but this is not a supported numeric field"); } } - search.dimensions.add(facetDimensionRequest); + search.dimensions.put(dimension, facetDimensionRequest); } logger.info("Dimensions: {}", search.dimensions.size()); } @@ -1270,42 +1270,67 @@ public void lock(@PathParam("entityName") String entityName) throws LuceneExcept private String luceneFacetResult(String name, Search search, String searchAfter, int maxResults, int maxLabels, Long uid) throws IOException, IllegalStateException, LuceneException { - Map> results = new HashMap<>(); - Map> rangeResults = new HashMap<>(); + // If no dimensions were specified, perform "sparse" faceting on all applicable + // string values + boolean sparse = search.dimensions.size() == 0; + // By default, assume we do not need to perform string based faceting for + // specific dimensions + boolean facetStrings = false; if (maxResults <= 0 || maxLabels <= 0) { // This will result in no Facets and a null pointer, so return early logger.warn("Cannot facet when maxResults={}, maxLabels={}, returning empty list", maxResults, maxLabels); } else { + // Iterate over shards and aggregate the facets from each List searchers = getSearchers(search.searcherMap, name); + logger.debug("Faceting {} with {} after {} ", name, search.query, searchAfter); for (IndexSearcher indexSearcher : searchers) { FacetsCollector facetsCollector = new FacetsCollector(); FacetsCollector.search(indexSearcher, search.query, maxResults, facetsCollector); - logger.debug("To facet in {} for {} {} with {} from {} ", name, search.query, maxResults, indexSearcher, - searchAfter); - for (FacetDimensionRequest facetDimensionRequest : search.dimensions) { - if (facetDimensionRequest.getRanges().size() > 0) { - String dimension = facetDimensionRequest.getDimension(); + for (FacetedDimension facetedDimension : search.dimensions.values()) { + if (facetedDimension.getRanges().size() > 0) { + // Perform range based facets for a numeric field + String dimension = facetedDimension.getDimension(); + Facets facets; if (longFields.contains(dimension)) { - LongRange[] ranges = facetDimensionRequest.getRanges().toArray(new LongRange[0]); - Facets facets = new LongRangeFacetCounts(dimension, facetsCollector, ranges); - putFacets(maxLabels, rangeResults, facets); + LongRange[] ranges = facetedDimension.getRanges().toArray(new LongRange[0]); + facets = new LongRangeFacetCounts(dimension, facetsCollector, ranges); } else if (doubleFields.contains(dimension)) { - DoubleRange[] ranges = facetDimensionRequest.getRanges().toArray(new DoubleRange[0]); - Facets facets = new DoubleRangeFacetCounts(dimension, facetsCollector, ranges); - putFacets(maxLabels, rangeResults, facets); + DoubleRange[] ranges = 
facetedDimension.getRanges().toArray(new DoubleRange[0]);
+ facets = new DoubleRangeFacetCounts(dimension, facetsCollector, ranges);
 } else {
 throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST,
 "'ranges' specified for dimension " + dimension
 + " but this is not a supported numeric field");
 }
+ FacetResult facetResult = facets.getTopChildren(maxLabels, dimension);
+ facetedDimension.addResult(facetResult);
+ } else {
+ // Have a specific string dimension to facet, but these should all be done at
+ // once for efficiency
+ facetStrings = true;
 }
 }
 try {
- DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(
- indexSearcher.getIndexReader());
- Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector);
- logger.debug("facets: {}, maxLabels: {}, maxResults: {}", facets, maxLabels, maxResults);
- putFacets(maxLabels, results, facets);
+ if (sparse) {
+ // Facet all applicable string fields
+ DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(
+ indexSearcher.getIndexReader());
+ Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector);
+ addFacetResults(maxLabels, search.dimensions, facets);
+ } else if (facetStrings) {
+ // Only add facets to the results if they match one of the requested dimensions
+ DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(
+ indexSearcher.getIndexReader());
+ Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector);
+ List facetResults = facets.getAllDims(maxLabels);
+ for (FacetResult facetResult : facetResults) {
+ String dimension = facetResult.dim;
+ FacetedDimension facetedDimension = search.dimensions.get(dimension);
+ if (facetedDimension != null) {
+ facetedDimension.addResult(facetResult);
+ }
+ }
+ }
 } catch (IllegalArgumentException e) {
 // This can occur if no fields in the index have been faceted
 logger.error(
 "No facets found in index, resulting in error: " + e.getClass() + " " + e.getMessage());
 } catch (IllegalStateException e) {
 // This can occur if we do not create the IndexSearcher from the same
 // DirectoryReader as we used to create the state
 throw e;
 }
 }
- logger.debug("Facets found for " + results.size() + " dimensions");
 }
- Set dimensionSet = new HashSet<>();
- search.dimensions.forEach(d -> dimensionSet.add(d.getDimension()));
+ // Build results
 JsonObjectBuilder aggregationsBuilder = Json.createObjectBuilder();
- for (Entry> dimensionEntry : results.entrySet()) {
- if (dimensionSet.size() == 0 || dimensionSet.contains(dimensionEntry.getKey())) {
- buildBuckets(aggregationsBuilder, dimensionEntry);
- }
- }
- for (Entry> dimensionEntry : results.entrySet()) {
- buildBuckets(aggregationsBuilder, dimensionEntry);
- }
+ search.dimensions.values().forEach(facetedDimension -> facetedDimension.buildResponse(aggregationsBuilder));
 return Json.createObjectBuilder().add("aggregations", aggregationsBuilder).build().toString();
 }
- private void putFacets(int maxLabels, Map> rangeResults, Facets facets)
+ /**
+ * Add Facets for all dimensions. This will create FacetedDimension objects if
+ * they do not already exist in the facetedDimensionMap, otherwise the counts
+ * for each label will be aggregated.
+ *
+ * @param maxLabels The maximum number of labels for a given
+ * dimension. The labels with the highest counts are
+ * returned first.
+ * @param facetedDimensionMap Map containing the dimensions that have been or
+ * should be faceted.
+ * @param facets Lucene facets object containing all dimensions.
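+ *                            For example (an illustrative case), a single
+ *                            Facets object from one shard may hold counts for
+ *                            both "type.name" and "datafileFormat.name"; each
+ *                            FacetResult is routed to, or creates, the
+ *                            FacetedDimension of the same name.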
+ * @throws IOException + */ + private void addFacetResults(int maxLabels, Map facetedDimensionMap, Facets facets) throws IOException { - for (FacetResult result : facets.getAllDims(maxLabels)) { - String dim = result.dim; - if (rangeResults.containsKey(dim)) { - Map labelMap = rangeResults.get(dim); - for (LabelAndValue labelAndValue : result.labelValues) { - String label = labelAndValue.label; - if (labelMap.containsKey(label)) { - labelMap.put(label, labelMap.get(label) + labelAndValue.value.longValue()); - } else { - labelMap.put(label, labelAndValue.value.longValue()); - } - } - } else { - Map labelMap = new HashMap<>(); - for (LabelAndValue labelAndValue : result.labelValues) { - labelMap.put(labelAndValue.label, labelAndValue.value.longValue()); - } - rangeResults.put(dim, labelMap); + for (FacetResult facetResult : facets.getAllDims(maxLabels)) { + String dim = facetResult.dim; + FacetedDimension facetedDimension = facetedDimensionMap.get(dim); + if (facetedDimension == null) { + facetedDimension = new FacetedDimension(facetResult.dim); } + facetedDimension.addResult(facetResult); } } - private void buildBuckets(JsonObjectBuilder aggregationsBuilder, Entry> result) { - JsonObjectBuilder bucketsBuilder = Json.createObjectBuilder(); - for (Entry labelValue : result.getValue().entrySet()) { - JsonObjectBuilder bucketBuilder = Json.createObjectBuilder(); - bucketsBuilder.add(labelValue.getKey(), bucketBuilder.add("doc_count", labelValue.getValue())); - } - aggregationsBuilder.add(result.getKey(), Json.createObjectBuilder().add("buckets", bucketsBuilder)); - } - private String luceneSearchResult(String name, Search search, String searchAfter, int maxResults, Long uid) throws IOException, LuceneException { List searchers = getSearchers(search.searcherMap, name); @@ -1535,6 +1543,7 @@ private void addField(JsonObject json, Document document, String key) { // Likewise, faceted fields should be considered separately if (facetFields.contains(key)) { document.add(new SortedSetDocValuesFacetField(key + ".keyword", json.getString(key))); + document.add(new StringField(key + ".keyword", json.getString(key), Store.NO)); } if (doubleFields.contains(key)) { @@ -1670,7 +1679,7 @@ private Builder parseParameter(JsonValue p) throws LuceneException { BooleanQuery.Builder paramQuery = new BooleanQuery.Builder(); String pName = parameter.getString("name", null); if (pName != null) { - paramQuery.add(new WildcardQuery(new Term("type.name", pName)), Occur.MUST); + paramQuery.add(new WildcardQuery(new Term("type.name.keyword", pName)), Occur.MUST); } String pUnits = parameter.getString("units", null); @@ -1710,7 +1719,10 @@ private FieldDoc parseSearchAfter(String searchAfter, SortField[] sortFields) th logger.debug("Attempting to parseSearchAfter from {}", searchAfter); JsonReader reader = Json.createReader(new StringReader(searchAfter)); JsonObject object = reader.readObject(); + // shardIndex and Lucene doc Id are always needed to determine tie breaks, even + // if the field sort resulted in no ties in the first place int shardIndex = object.getInt("shardIndex"); + int doc = object.getInt("doc"); float score = Float.NaN; List fields = new ArrayList<>(); if (object.containsKey("score")) { @@ -1756,7 +1768,7 @@ private FieldDoc parseSearchAfter(String searchAfter, SortField[] sortFields) th } } } - return new FieldDoc(0, score, fields.toArray(), shardIndex); // TODO + return new FieldDoc(doc, score, fields.toArray(), shardIndex); } @POST From 973d31cf6b267a26ccf1d7fd900e29546846b20b Mon Sep 17 00:00:00 
2001 From: Patrick Austin Date: Thu, 16 Jun 2022 14:08:12 +0000 Subject: [PATCH 42/73] Allow searchAfter for uneven shards #19 --- .../java/org/icatproject/lucene/Lucene.java | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index b83452b..237c581 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -879,6 +879,7 @@ private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, In document.forEach(encodeField(gen, search.fields)); for (String joinedEntityName : search.joinedFields.keySet()) { List searchers = getSearchers(search.searcherMap, joinedEntityName); + List shards = getShards(search.searcherMap, joinedEntityName); Search joinedSearch = new Search(); String fld; String parentId; @@ -895,7 +896,7 @@ private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, In } joinedSearch.query = new TermQuery(new Term(fld, parentId)); joinedSearch.sort = new Sort(new SortedNumericSortField("id.long", Type.LONG)); - TopFieldDocs topFieldDocs = searchShards(joinedSearch, 100, searchers, null); + TopFieldDocs topFieldDocs = searchShards(joinedSearch, 100, shards, null); gen.writeStartArray(joinedEntityName.toLowerCase()); for (ScoreDoc joinedHit : topFieldDocs.scoreDocs) { gen.writeStartObject(); @@ -1118,6 +1119,10 @@ private IndexSearcher getSearcher(Map> readerMap, St return subSearchers.get(0); } + private List getShards(Map> readerMap, String name) { + return indexBuckets.computeIfAbsent(name, k -> new IndexBucket(k)).shardList; + } + @PostConstruct private void init() { logger.info("Initialising icat.lucene"); @@ -1317,6 +1322,7 @@ private String luceneFacetResult(String name, Search search, String searchAfter, indexSearcher.getIndexReader()); Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); addFacetResults(maxLabels, search.dimensions, facets); + logger.trace("Sparse faceting found results for {} dimensions", search.dimensions.size()); } else if (facetStrings) { // Only add facets to the results if they match one of the requested dimensions DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState( @@ -1367,9 +1373,11 @@ private void addFacetResults(int maxLabels, Map facete throws IOException { for (FacetResult facetResult : facets.getAllDims(maxLabels)) { String dim = facetResult.dim; + logger.trace("Sparse faceting: FacetResult for {}", dim); FacetedDimension facetedDimension = facetedDimensionMap.get(dim); if (facetedDimension == null) { facetedDimension = new FacetedDimension(facetResult.dim); + facetedDimensionMap.put(dim, facetedDimension); } facetedDimension.addResult(facetResult); } @@ -1378,10 +1386,11 @@ private void addFacetResults(int maxLabels, Map facete private String luceneSearchResult(String name, Search search, String searchAfter, int maxResults, Long uid) throws IOException, LuceneException { List searchers = getSearchers(search.searcherMap, name); + List shards = getShards(search.searcherMap, name); String format = "Search {} with: query {}, maxResults {}, searchAfter {}, scored {}"; logger.debug(format, name, search.query, maxResults, searchAfter, search.scored); FieldDoc searchAfterDoc = parseSearchAfter(searchAfter, search.sort.getSort()); - TopFieldDocs topFieldDocs = searchShards(search, maxResults, searchers, searchAfterDoc); + TopFieldDocs topFieldDocs = searchShards(search, 
maxResults, shards, searchAfterDoc); ScoreDoc[] hits = topFieldDocs.scoreDocs; TotalHits totalHits = topFieldDocs.totalHits; SortField[] fields = topFieldDocs.fields; @@ -1447,19 +1456,28 @@ private String luceneSearchResult(String name, Search search, String searchAfter return baos.toString(); } - private TopFieldDocs searchShards(Search search, int maxResults, List searchers, + private TopFieldDocs searchShards(Search search, int maxResults, List shards, FieldDoc searchAfterDoc) throws IOException { TopFieldDocs topFieldDocs; - if (searchers.size() > 0) { + if (shards.size() > 0) { List shardHits = new ArrayList<>(); int i = 0; + int doc = searchAfterDoc != null ? searchAfterDoc.doc : -1; long startTime = System.currentTimeMillis(); - for (IndexSearcher indexSearcher : searchers) { + for (ShardBucket shard : shards) { + int docCount = shard.documentCount.intValue(); + if (searchAfterDoc != null) { + if (doc > docCount) { + searchAfterDoc.doc = docCount - 1; + } else { + searchAfterDoc.doc = doc; + } + } + IndexSearcher indexSearcher = shard.searcherManager.acquire(); TopFieldDocs shardDocs = indexSearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, search.scored); shardHits.add(shardDocs); - logger.debug("{} hits on shard {} out of {} total docs", shardDocs.totalHits, i, - indexSearcher.getIndexReader().numDocs()); + logger.debug("{} hits on shard {} out of {} total docs", shardDocs.totalHits, i, docCount); i++; long duration = (System.currentTimeMillis() - startTime); if (duration > maxSearchTimeSeconds * 1000) { @@ -1469,7 +1487,7 @@ private TopFieldDocs searchShards(Search search, int maxResults, List Date: Wed, 15 Jun 2022 23:49:57 +0100 Subject: [PATCH 43/73] Sparse string faceting fix #19 --- src/main/java/org/icatproject/lucene/Lucene.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 237c581..224d77c 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -1398,7 +1398,7 @@ private String luceneSearchResult(String name, Search search, String searchAfter if (hits.length > 0) { maxScore = hits[0].score; } - logger.debug("Hits " + totalHits + " maxscore " + maxScore); + logger.debug("{} maxscore {}", totalHits, maxScore); ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (JsonGenerator gen = Json.createGenerator(baos)) { gen.writeStartObject().writeStartArray("results"); @@ -1477,7 +1477,7 @@ private TopFieldDocs searchShards(Search search, int maxResults, List maxSearchTimeSeconds * 1000) { From 757da57f16464d98b8ac4412a79d745207a5ef66 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Thu, 16 Jun 2022 02:27:00 +0100 Subject: [PATCH 44/73] Filters and aborted search support #19 --- .../java/org/icatproject/lucene/Lucene.java | 115 +++++++++++------- 1 file changed, 71 insertions(+), 44 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 224d77c..8047289 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -314,6 +314,7 @@ public class Search { public Set fields = new HashSet(); public Map> joinedFields = new HashMap<>(); public Map dimensions = new HashMap(); + public boolean aborted = false; public void parseFields(JsonObject jsonObject) throws LuceneException { if (jsonObject.containsKey("fields")) { @@ -723,6 
+724,13 @@ private Search datafilesQuery(HttpServletRequest request, String sort, Long uid) BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); + if (query.containsKey("filter")) { + JsonObject filterObject = query.getJsonObject("filter"); + for (String fld : filterObject.keySet()) { + theQuery.add(new TermQuery(new Term(fld, filterObject.getString(fld))), Occur.FILTER); + } + } + if (userName != null) { buildUserNameQuery(readerMap, userName, theQuery, "investigation.id"); } @@ -784,6 +792,13 @@ private Search datasetsQuery(HttpServletRequest request, String sort, Long uid) BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); + if (query.containsKey("filter")) { + JsonObject filterObject = query.getJsonObject("filter"); + for (String fld : filterObject.keySet()) { + theQuery.add(new TermQuery(new Term(fld, filterObject.getString(fld))), Occur.FILTER); + } + } + if (userName != null) { buildUserNameQuery(readerMap, userName, theQuery, "investigation.id"); } @@ -1201,6 +1216,13 @@ private Search investigationsQuery(HttpServletRequest request, String sort, Long BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); + if (query.containsKey("filter")) { + JsonObject filterObject = query.getJsonObject("filter"); + for (String fld : filterObject.keySet()) { + theQuery.add(new TermQuery(new Term(fld, filterObject.getString(fld))), Occur.FILTER); + } + } + if (userName != null) { buildUserNameQuery(readerMap, userName, theQuery, "id"); } @@ -1401,54 +1423,58 @@ private String luceneSearchResult(String name, Search search, String searchAfter logger.debug("{} maxscore {}", totalHits, maxScore); ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (JsonGenerator gen = Json.createGenerator(baos)) { - gen.writeStartObject().writeStartArray("results"); - for (ScoreDoc hit : hits) { - encodeResult(name, gen, hit, searchers.get(hit.shardIndex), search); - } - gen.writeEnd(); // array results - if (hits.length == maxResults) { - ScoreDoc lastDoc = hits[hits.length - 1]; - gen.writeStartObject("search_after").write("doc", lastDoc.doc).write("shardIndex", lastDoc.shardIndex); - float lastScore = lastDoc.score; - if (!Float.isNaN(lastScore)) { - gen.write("score", lastScore); + gen.writeStartObject(); + gen.write("aborted", search.aborted); + if (!search.aborted) { + gen.writeStartArray("results"); + for (ScoreDoc hit : hits) { + encodeResult(name, gen, hit, searchers.get(hit.shardIndex), search); } - if (fields != null) { - Document lastDocument = searchers.get(lastDoc.shardIndex).doc(lastDoc.doc); - gen.writeStartArray("fields"); - for (SortField sortField : fields) { - String fieldName = sortField.getField(); - if (fieldName == null) { - // SCORE sorting will have a null fieldName - gen.write(lastDoc.score); - continue; - } - IndexableField indexableField = lastDocument.getField(fieldName); - if (indexableField == null) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Field " + fieldName - + " used for sorting was not present on the Lucene Document; all sortable fields must also be stored."); - } - Type type = (sortField instanceof SortedNumericSortField) - ? 
((SortedNumericSortField) sortField).getNumericType() - : sortField.getType(); - switch (type) { - case LONG: - gen.write(indexableField.numericValue().longValue()); - break; - case DOUBLE: - gen.write(indexableField.numericValue().doubleValue()); - break; - case STRING: - gen.write(indexableField.stringValue()); - break; - default: - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, - "SortField.Type must be one of LONG, DOUBLE, STRING, but it was " + type); + gen.writeEnd(); // array results + if (hits.length == maxResults) { + ScoreDoc lastDoc = hits[hits.length - 1]; + gen.writeStartObject("search_after").write("doc", lastDoc.doc).write("shardIndex", lastDoc.shardIndex); + float lastScore = lastDoc.score; + if (!Float.isNaN(lastScore)) { + gen.write("score", lastScore); + } + if (fields != null) { + Document lastDocument = searchers.get(lastDoc.shardIndex).doc(lastDoc.doc); + gen.writeStartArray("fields"); + for (SortField sortField : fields) { + String fieldName = sortField.getField(); + if (fieldName == null) { + // SCORE sorting will have a null fieldName + gen.write(lastDoc.score); + continue; + } + IndexableField indexableField = lastDocument.getField(fieldName); + if (indexableField == null) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Field " + fieldName + + " used for sorting was not present on the Lucene Document; all sortable fields must also be stored."); + } + Type type = (sortField instanceof SortedNumericSortField) + ? ((SortedNumericSortField) sortField).getNumericType() + : sortField.getType(); + switch (type) { + case LONG: + gen.write(indexableField.numericValue().longValue()); + break; + case DOUBLE: + gen.write(indexableField.numericValue().doubleValue()); + break; + case STRING: + gen.write(indexableField.stringValue()); + break; + default: + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, + "SortField.Type must be one of LONG, DOUBLE, STRING, but it was " + type); + } } + gen.writeEnd(); // end "fields" array } - gen.writeEnd(); // end "fields" array + gen.writeEnd(); // end "search_after" object } - gen.writeEnd(); // end "search_after" object } gen.writeEnd(); // end enclosing object } @@ -1482,6 +1508,7 @@ private TopFieldDocs searchShards(Search search, int maxResults, List maxSearchTimeSeconds * 1000) { logger.info("Stopping search after {} shards due to {} ms having elapsed", i, duration); + search.aborted = true; break; } } From 663ea420e617308a2ef7eb074f0d41d3b368fa41 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Fri, 17 Jun 2022 09:30:32 +0000 Subject: [PATCH 45/73] Enable parsing of multivalued filters #19 --- .../java/org/icatproject/lucene/Lucene.java | 67 +++++++++++++------ 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 8047289..c4cee0c 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -603,6 +603,43 @@ private static void buildDateRanges(Builder queryBuilder, JsonObject queryJson, } } + /** + * Builds Term queries (exact string matches without tokenizing) from the filter + * object in the query request. This is intended to be used with the faceting, + * with the fields having the ".keyword" suffix. + * + * @param requestedQuery Json object containing details of the query. + * @param queryBuilder Builder for the overall boolean query to be build. 
+ * @throws LuceneException If the values in the filter object are neither STRING + * nor ARRAY of STRING. + */ + private void buildFilterQueries(JsonObject requestedQuery, BooleanQuery.Builder queryBuilder) + throws LuceneException { + if (requestedQuery.containsKey("filter")) { + JsonObject filterObject = requestedQuery.getJsonObject("filter"); + for (String fld : filterObject.keySet()) { + ValueType valueType = filterObject.get(fld).getValueType(); + switch (valueType) { + case ARRAY: + BooleanQuery.Builder dimensionQuery = new BooleanQuery.Builder(); + for (JsonString value : filterObject.getJsonArray(fld).getValuesAs(JsonString.class)) { + dimensionQuery.add(new TermQuery(new Term(fld, value.getString())), Occur.SHOULD); + } + queryBuilder.add(dimensionQuery.build(), Occur.FILTER); + break; + + case STRING: + queryBuilder.add(new TermQuery(new Term(fld, filterObject.getString(fld))), Occur.FILTER); + break; + + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "filter object values should be STRING or ARRAY, but were " + valueType); + } + } + } + } + private void buildUserNameQuery(Map> readerMap, String userName, BooleanQuery.Builder theQuery, String toField) throws IOException, LuceneException { @@ -724,12 +761,7 @@ private Search datafilesQuery(HttpServletRequest request, String sort, Long uid) BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - if (query.containsKey("filter")) { - JsonObject filterObject = query.getJsonObject("filter"); - for (String fld : filterObject.keySet()) { - theQuery.add(new TermQuery(new Term(fld, filterObject.getString(fld))), Occur.FILTER); - } - } + buildFilterQueries(query, theQuery); if (userName != null) { buildUserNameQuery(readerMap, userName, theQuery, "investigation.id"); @@ -792,12 +824,7 @@ private Search datasetsQuery(HttpServletRequest request, String sort, Long uid) BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - if (query.containsKey("filter")) { - JsonObject filterObject = query.getJsonObject("filter"); - for (String fld : filterObject.keySet()) { - theQuery.add(new TermQuery(new Term(fld, filterObject.getString(fld))), Occur.FILTER); - } - } + buildFilterQueries(query, theQuery); if (userName != null) { buildUserNameQuery(readerMap, userName, theQuery, "investigation.id"); @@ -1216,12 +1243,7 @@ private Search investigationsQuery(HttpServletRequest request, String sort, Long BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - if (query.containsKey("filter")) { - JsonObject filterObject = query.getJsonObject("filter"); - for (String fld : filterObject.keySet()) { - theQuery.add(new TermQuery(new Term(fld, filterObject.getString(fld))), Occur.FILTER); - } - } + buildFilterQueries(query, theQuery); if (userName != null) { buildUserNameQuery(readerMap, userName, theQuery, "id"); @@ -1409,8 +1431,8 @@ private String luceneSearchResult(String name, Search search, String searchAfter throws IOException, LuceneException { List searchers = getSearchers(search.searcherMap, name); List shards = getShards(search.searcherMap, name); - String format = "Search {} with: query {}, maxResults {}, searchAfter {}, scored {}"; - logger.debug(format, name, search.query, maxResults, searchAfter, search.scored); + String format = "Search {} with: query {}, maxResults {}, searchAfter {}, scored {}, fields {}"; + logger.debug(format, name, search.query, maxResults, searchAfter, search.scored, search.fields); FieldDoc searchAfterDoc = parseSearchAfter(searchAfter, search.sort.getSort()); TopFieldDocs 
topFieldDocs = searchShards(search, maxResults, shards, searchAfterDoc); ScoreDoc[] hits = topFieldDocs.scoreDocs; @@ -1433,7 +1455,8 @@ private String luceneSearchResult(String name, Search search, String searchAfter gen.writeEnd(); // array results if (hits.length == maxResults) { ScoreDoc lastDoc = hits[hits.length - 1]; - gen.writeStartObject("search_after").write("doc", lastDoc.doc).write("shardIndex", lastDoc.shardIndex); + gen.writeStartObject("search_after").write("doc", lastDoc.doc).write("shardIndex", + lastDoc.shardIndex); float lastScore = lastDoc.score; if (!Float.isNaN(lastScore)) { gen.write("score", lastScore); @@ -1478,7 +1501,7 @@ private String luceneSearchResult(String name, Search search, String searchAfter } gen.writeEnd(); // end enclosing object } - logger.trace("Json returned {}", baos.toString()); + logger.debug("Json returned {}", baos.toString()); return baos.toString(); } From eaafc89fda2a25b206963bca9f496d1262301179 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Mon, 20 Jun 2022 17:18:31 +0100 Subject: [PATCH 46/73] Refactors and Javadoc comments #19 --- .../icatproject/lucene/DocumentMapping.java | 105 ++ .../icatproject/lucene/FacetedDimension.java | 29 +- .../java/org/icatproject/lucene/Lucene.java | 1119 +++++------------ .../org/icatproject/lucene/SearchBucket.java | 813 ++++++++++++ src/main/resources/run.properties | 2 +- 5 files changed, 1282 insertions(+), 786 deletions(-) create mode 100644 src/main/java/org/icatproject/lucene/DocumentMapping.java create mode 100644 src/main/java/org/icatproject/lucene/SearchBucket.java diff --git a/src/main/java/org/icatproject/lucene/DocumentMapping.java b/src/main/java/org/icatproject/lucene/DocumentMapping.java new file mode 100644 index 0000000..27aa532 --- /dev/null +++ b/src/main/java/org/icatproject/lucene/DocumentMapping.java @@ -0,0 +1,105 @@ +package org.icatproject.lucene; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; + +public class DocumentMapping { + + /** + * Represents the parent child relationship between two ICAT entities. + */ + public static class ParentRelationship { + public String parentName; + public String fieldPrefix; + + /** + * @param parentName Name of the parent entity. + * @param fieldPrefix How nested fields should be prefixed. 
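+ *                    For example, new ParentRelationship("InvestigationUser", "user")
+ *                    (as registered below) means that User fields are nested
+ *                    on InvestigationUser documents under names such as
+ *                    "user.fullName".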
+ */ + public ParentRelationship(String parentName, String fieldPrefix) { + this.parentName = parentName; + this.fieldPrefix = fieldPrefix; + } + + } + + public static final Set doubleFields = new HashSet<>(); + public static final Set facetFields = new HashSet<>(); + public static final Set longFields = new HashSet<>(); + public static final Set sortFields = new HashSet<>(); + public static final Set textFields = new HashSet<>(); + public static final Set indexedEntities = new HashSet<>(); + public static final Map relationships = new HashMap<>(); + + public static final IcatAnalyzer analyzer = new IcatAnalyzer(); + public static final StandardQueryParser genericParser = new StandardQueryParser(); + public static final StandardQueryParser datafileParser = new StandardQueryParser(); + public static final StandardQueryParser datasetParser = new StandardQueryParser(); + public static final StandardQueryParser investigationParser = new StandardQueryParser(); + public static final StandardQueryParser sampleParser = new StandardQueryParser(); + + static { + doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI")); + facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name", "stringValue")); + longFields.addAll(Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate")); + sortFields.addAll(Arrays.asList("datafile.id", "dataset.id", "investigation.id", "instrument.id", "id", "date", + "startDate", "endDate", "name", "stringValue", "dateTimeValue", "numericValue", "numericValueSI")); + textFields.addAll(Arrays.asList("name", "visitId", "description", "location", "dataset.name", + "investigation.name", "instrument.name", "instrument.fullName", "datafileFormat.name", "sample.name", + "sample.type.name", "title", "summary", "facility.name", "user.fullName", "type.name")); + + indexedEntities.addAll(Arrays.asList("Datafile", "Dataset", "Investigation", "DatafileParameter", + "DatasetParameter", "InstrumentScientist", "InvestigationInstrument", "InvestigationParameter", + "InvestigationUser", "Sample")); + + relationships.put("Instrument", + new ParentRelationship[] { new ParentRelationship("InvestigationInstrument", "instrument") }); + relationships.put("User", new ParentRelationship[] { new ParentRelationship("InvestigationUser", "user"), + new ParentRelationship("InstrumentScientist", "user") }); + relationships.put("Sample", new ParentRelationship[] { new ParentRelationship("Dataset", "sample") }); + relationships.put("SampleType", new ParentRelationship[] { new ParentRelationship("Sample", "type"), + new ParentRelationship("Dataset", "sample.type") }); + relationships.put("InvestigationType", + new ParentRelationship[] { new ParentRelationship("Investigation", "type") }); + relationships.put("DatasetType", new ParentRelationship[] { new ParentRelationship("Dataset", "type") }); + relationships.put("DatafileFormat", + new ParentRelationship[] { new ParentRelationship("Datafile", "datafileFormat") }); + relationships.put("Facility", new ParentRelationship[] { new ParentRelationship("Investigation", "facility") }); + relationships.put("ParameterType", + new ParentRelationship[] { new ParentRelationship("DatafileParameter", "type"), + new ParentRelationship("DatasetParameter", "type"), + new ParentRelationship("InvestigationParameter", "type") }); + relationships.put("Investigation", + new ParentRelationship[] { new ParentRelationship("Dataset", "investigation"), + new ParentRelationship("datafile", "investigation") }); + + 
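// A sketch of the effect of the parser configuration below (query strings
+ // are illustrative): with these multi-fields, a free-text Datafile query
+ // such as "neutron*" is expanded across name, description, location and
+ // datafileFormat.name, and leading wildcards like "*scan" are also accepted.
+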
genericParser.setAllowLeadingWildcard(true); + genericParser.setAnalyzer(analyzer); + + CharSequence[] datafileFields = { "name", "description", "location", "datafileFormat.name" }; + datafileParser.setAllowLeadingWildcard(true); + datafileParser.setAnalyzer(analyzer); + datafileParser.setMultiFields(datafileFields); + + CharSequence[] datasetFields = { "name", "description", "sample.name", "sample.type.name", "type.name" }; + datasetParser.setAllowLeadingWildcard(true); + datasetParser.setAnalyzer(analyzer); + datasetParser.setMultiFields(datasetFields); + + CharSequence[] investigationFields = { "name", "visitId", "title", "summary", "facility.name", + "type.name" }; + investigationParser.setAllowLeadingWildcard(true); + investigationParser.setAnalyzer(analyzer); + investigationParser.setMultiFields(investigationFields); + + CharSequence[] sampleFields = { "name", "type.name" }; + sampleParser.setAllowLeadingWildcard(true); + sampleParser.setAnalyzer(analyzer); + sampleParser.setMultiFields(sampleFields); + } +} diff --git a/src/main/java/org/icatproject/lucene/FacetedDimension.java b/src/main/java/org/icatproject/lucene/FacetedDimension.java index 3173d27..98c51c5 100644 --- a/src/main/java/org/icatproject/lucene/FacetedDimension.java +++ b/src/main/java/org/icatproject/lucene/FacetedDimension.java @@ -8,8 +8,17 @@ import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.LabelAndValue; +import org.apache.lucene.facet.range.DoubleRange; +import org.apache.lucene.facet.range.LongRange; import org.apache.lucene.facet.range.Range; +/** + * For a single dimension (field), stores labels (the unique values or ranges of + * values for that field in the index) and their respective counts (the number + * of times that label appears in different documents). + * + * For example, a dimension might be "colour", the label "red", and the count 5. + */ public class FacetedDimension { private String dimension; @@ -18,11 +27,8 @@ public class FacetedDimension { private List counts; /** - * For a single dimension (field), stores labels (the unique values or ranges of - * values for that field in the index) and their respective counts (the number - * of times that label appears in different documents). - * - * For example, a dimension might be "colour", the label "red", and the count 5. + * Creates an "empty" FacetedDimension. The dimension (field) is set but ranges, + * labels and counts are not. 
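+ * <p>
+ * A typical lifecycle, as a sketch: construct with the field name, feed in
+ * one FacetResult per shard, then serialise the totals:
+ * <pre>{@code
+ * FacetedDimension dim = new FacetedDimension("type.name");
+ * dim.addResult(facetResultFromShard); // repeated for each shard searched
+ * dim.buildResponse(aggregationsBuilder);
+ * }</pre>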
* * @param dimension The dimension, or field, to be faceted */ @@ -65,7 +71,18 @@ public void buildResponse(JsonObjectBuilder aggregationsBuilder) { JsonObjectBuilder bucketsBuilder = Json.createObjectBuilder(); for (int i = 0; i < labels.size(); i++) { JsonObjectBuilder bucketBuilder = Json.createObjectBuilder(); - bucketsBuilder.add(labels.get(i), bucketBuilder.add("doc_count", counts.get(i))); + bucketBuilder.add("doc_count", counts.get(i)); + if (ranges.size() > i) { + Range range = ranges.get(i); + if (range.getClass().getSimpleName().equals("LongRange")) { + bucketBuilder.add("from", ((LongRange) range).min); + bucketBuilder.add("to", ((LongRange) range).max); + } else if (range.getClass().getSimpleName().equals("DoubleRange")) { + bucketBuilder.add("from", ((DoubleRange) range).min); + bucketBuilder.add("to", ((DoubleRange) range).max); + } + } + bucketsBuilder.add(labels.get(i), bucketBuilder); } aggregationsBuilder.add(dimension, Json.createObjectBuilder().add("buckets", bucketsBuilder)); } diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index c4cee0c..282d413 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -1,14 +1,11 @@ package org.icatproject.lucene; -import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; -import java.io.StringReader; import java.net.HttpURLConnection; import java.nio.file.FileVisitOption; import java.nio.file.Files; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; @@ -16,9 +13,8 @@ import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; -import java.util.TimeZone; import java.util.Map.Entry; +import java.util.Set; import java.util.Timer; import java.util.TimerTask; import java.util.concurrent.ConcurrentHashMap; @@ -32,14 +28,10 @@ import javax.json.Json; import javax.json.JsonArray; import javax.json.JsonException; -import javax.json.JsonNumber; import javax.json.JsonObject; import javax.json.JsonObjectBuilder; import javax.json.JsonReader; -import javax.json.JsonString; import javax.json.JsonStructure; -import javax.json.JsonValue; -import javax.json.JsonValue.ValueType; import javax.json.stream.JsonGenerator; import javax.servlet.http.HttpServletRequest; import javax.ws.rs.Consumes; @@ -54,9 +46,9 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.DoublePoint; +import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.NumericDocValuesField; -import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; @@ -69,7 +61,6 @@ import org.apache.lucene.facet.range.DoubleRangeFacetCounts; import org.apache.lucene.facet.range.LongRange; import org.apache.lucene.facet.range.LongRangeFacetCounts; -import org.apache.lucene.facet.range.Range; import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState; import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts; import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField; @@ -77,32 +68,22 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; -import 
org.apache.lucene.queryparser.flexible.core.QueryNodeException; -import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; -import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FieldDoc; -import org.apache.lucene.search.BooleanQuery.Builder; -import org.apache.lucene.search.SortField.Type; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.MatchAllDocsQuery; -import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.SearcherManager; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortField.Type; import org.apache.lucene.search.SortedNumericSortField; -import org.apache.lucene.search.TermInSetQuery; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopFieldDocs; import org.apache.lucene.search.TotalHits; -import org.apache.lucene.search.WildcardQuery; -import org.apache.lucene.search.join.JoinUtil; -import org.apache.lucene.search.join.ScoreMode; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.NumericUtils; +import org.icatproject.lucene.SearchBucket.SearchType; import org.icatproject.lucene.exceptions.LuceneException; import org.icatproject.utils.CheckedProperties; import org.icatproject.utils.IcatUnits; @@ -116,6 +97,10 @@ @Singleton public class Lucene { + /** + * A bucket for accessing the read and write functionality for a single "shard" + * Lucene index which can then be grouped to represent a single document type. + */ private class ShardBucket { private FSDirectory directory; private IndexWriter indexWriter; @@ -132,7 +117,7 @@ private class ShardBucket { */ public ShardBucket(java.nio.file.Path shardPath) throws IOException { directory = FSDirectory.open(shardPath); - IndexWriterConfig config = new IndexWriterConfig(analyzer); + IndexWriterConfig config = new IndexWriterConfig(DocumentMapping.analyzer); indexWriter = new IndexWriter(directory, config); String[] files = directory.listAll(); if (files.length == 1 && files[0].equals("write.lock")) { @@ -156,6 +141,12 @@ public ShardBucket(java.nio.file.Path shardPath) throws IOException { } } + /** + * Commits all pending cached documents to this shard. + * + * @return The number of documents committed to this shard. + * @throws IOException + */ public int commit() throws IOException { int cached = indexWriter.numRamDocs(); indexWriter.commit(); @@ -164,9 +155,13 @@ public int commit() throws IOException { } } + /** + * A bucket for accessing the high level functionality, such as + * searching, for a single document type. Incoming documents will be routed to + * one of the individual "shard" indices that are grouped by this Object. 
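+ * <p>
+ * For example (numbers illustrative), with a maxShardSize of 2^31 the first
+ * 2^31 documents for an entity are written to its base index, and later
+ * documents are routed to sibling indices suffixed "_1", "_2" and so on.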
+ */ private class IndexBucket { private String entityName; - // private Map shardMap = new HashMap<>(); private List shardList = new ArrayList<>(); private AtomicBoolean locked = new AtomicBoolean(); @@ -180,12 +175,11 @@ private class IndexBucket { */ public IndexBucket(String entityName) { try { - this.entityName = entityName; + this.entityName = entityName.toLowerCase(); Long shardIndex = 0L; java.nio.file.Path shardPath = luceneDirectory.resolve(entityName); do { ShardBucket shardBucket = new ShardBucket(shardPath); - // shardMap.put(shardIndex, shardBucket); shardList.add(shardBucket); shardIndex++; shardPath = luceneDirectory.resolve(entityName + "_" + shardIndex); @@ -200,7 +194,7 @@ public IndexBucket(String entityName) { * Acquires IndexSearchers from the SearcherManagers of the individual shards in * this bucket. * - * @return Array of DirectoryReaders for all shards in this bucket. + * @return List of IndexSearchers for all shards in this bucket. * @throws IOException */ public List acquireSearchers() throws IOException { @@ -211,15 +205,29 @@ public List acquireSearchers() throws IOException { return subSearchers; } + /** + * Adds a document to the appropriate shard for this index. + * + * @param document The document to be added. + * @throws IOException + */ public void addDocument(Document document) throws IOException { ShardBucket shardBucket = routeShard(); shardBucket.indexWriter.addDocument(document); shardBucket.documentCount.incrementAndGet(); } + /** + * Updates documents matching the term with the provided document. + * + * @param term Term identifying the old document(s) to be updated. + * @param document The document that will replace the old document(s). + * @throws IOException + */ public void updateDocument(Term term, Document document) throws IOException { - ShardBucket shardBucket = routeShard(); - shardBucket.indexWriter.updateDocument(term, document); + for (ShardBucket shardBucket : shardList) { + shardBucket.indexWriter.updateDocument(term, document); + } } /** @@ -264,7 +272,6 @@ public void commit(String command, String entityName) throws IOException { * @throws IOException */ public void close() throws IOException { - // for (ShardBucket shardBucket : shardMap.values()) { for (ShardBucket shardBucket : shardList) { shardBucket.searcherManager.close(); shardBucket.indexWriter.commit(); @@ -292,7 +299,14 @@ public ShardBucket routeShard() throws IOException { return shardBucket; } - public void releaseReaders(List subSearchers) throws IOException, LuceneException { + /** + * Releases all provided searchers for the shards in this bucket. + * + * @param subSearchers List of IndexSearcher, in shard order. + * @throws IOException + * @throws LuceneException If the number of searchers and shards isn't the same. 
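+ * <p>
+ * Intended pairing, as a sketch: searchers obtained from acquireSearchers()
+ * are handed back here once the search completes, e.g.
+ * <pre>{@code
+ * List<IndexSearcher> subSearchers = bucket.acquireSearchers();
+ * try {
+ *     // perform the search against each shard
+ * } finally {
+ *     bucket.releaseSearchers(subSearchers);
+ * }
+ * }</pre>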
+ */ + public void releaseSearchers(List subSearchers) throws IOException, LuceneException { if (subSearchers.size() != shardList.size()) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Was expecting the same number of DirectoryReaders as ShardBuckets, but had " @@ -306,178 +320,8 @@ public void releaseReaders(List subSearchers) throws IOException, } } - public class Search { - public Map> searcherMap; - public Query query; - public Sort sort; - public boolean scored; - public Set fields = new HashSet(); - public Map> joinedFields = new HashMap<>(); - public Map dimensions = new HashMap(); - public boolean aborted = false; - - public void parseFields(JsonObject jsonObject) throws LuceneException { - if (jsonObject.containsKey("fields")) { - List fieldStrings = jsonObject.getJsonArray("fields").getValuesAs(JsonString.class); - logger.trace("Parsing fields from {}", fieldStrings); - for (JsonString jsonString : fieldStrings) { - String[] splitString = jsonString.getString().split(" "); - if (splitString.length == 1) { - fields.add(splitString[0]); - } else if (splitString.length == 2) { - if (joinedFields.containsKey(splitString[0])) { - joinedFields.get(splitString[0]).add(splitString[1]); - } else { - joinedFields.putIfAbsent(splitString[0], - new HashSet(Arrays.asList(splitString[1]))); - } - } else { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "Could not parse field: " + jsonString.getString()); - } - } - } - - } - - /** - * Parses the String from the request into a Lucene Sort object. Multiple sort - * criteria are supported, and will be applied in order. - * - * @param sortString String representation of a JSON object with the field(s) to - * sort - * as keys, and the direction ("asc" or "desc") as value(s). 
- * @return Lucene Sort object - * @throws LuceneException If the value for any key isn't "asc" or "desc" - */ - public void parseSort(String sortString) throws LuceneException { - if (sortString == null || sortString.equals("") || sortString.equals("{}")) { - scored = true; - sort = new Sort(SortField.FIELD_SCORE, new SortedNumericSortField("id.long", Type.LONG)); - return; - } - try (JsonReader reader = Json.createReader(new ByteArrayInputStream(sortString.getBytes()))) { - JsonObject object = reader.readObject(); - List fields = new ArrayList<>(); - for (String key : object.keySet()) { - String order = object.getString(key); - Boolean reverse; - if (order.equals("asc")) { - reverse = false; - } else if (order.equals("desc")) { - reverse = true; - } else { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "Sort order must be 'asc' or 'desc' but it was '" + order + "'"); - } - - if (longFields.contains(key)) { - fields.add(new SortedNumericSortField(key, Type.LONG, reverse)); - } else if (doubleFields.contains(key)) { - fields.add(new SortedNumericSortField(key, Type.DOUBLE, reverse)); - } else { - fields.add(new SortField(key, Type.STRING, reverse)); - } - } - fields.add(new SortedNumericSortField("id.long", Type.LONG)); - scored = false; - sort = new Sort(fields.toArray(new SortField[0])); - } - } - } - - private static class ParentRelationship { - public String parentName; - public String fieldPrefix; - - public ParentRelationship(String parentName, String fieldPrefix) { - this.parentName = parentName; - this.fieldPrefix = fieldPrefix; - } - - } - - private static final Logger logger = LoggerFactory.getLogger(Lucene.class); + static final Logger logger = LoggerFactory.getLogger(Lucene.class); private static final Marker fatal = MarkerFactory.getMarker("FATAL"); - private static final SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmm"); - - private static final Set doubleFields = new HashSet<>(); - private static final Set facetFields = new HashSet<>(); - private static final Set longFields = new HashSet<>(); - private static final Set sortFields = new HashSet<>(); - private static final Set textFields = new HashSet<>(); - private static final Set indexedEntities = new HashSet<>(); - private static final Map relationships = new HashMap<>(); - - private static final IcatAnalyzer analyzer = new IcatAnalyzer(); - private static final StandardQueryParser genericParser = new StandardQueryParser(); - private static final StandardQueryParser datafileParser = new StandardQueryParser(); - private static final StandardQueryParser datasetParser = new StandardQueryParser(); - private static final StandardQueryParser investigationParser = new StandardQueryParser(); - private static final StandardQueryParser sampleParser = new StandardQueryParser(); - - static { - TimeZone tz = TimeZone.getTimeZone("GMT"); - df.setTimeZone(tz); - - doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI")); - facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name")); - longFields.addAll(Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate")); - sortFields.addAll(Arrays.asList("datafile.id", "dataset.id", "investigation.id", "instrument.id", "id", "date", - "startDate", "endDate", "name", "stringValue", "dateTimeValue", "numericValue", "numericValueSI")); - textFields.addAll(Arrays.asList("name", "visitId", "description", "location", "dataset.name", - "investigation.name", "instrument.name", "instrument.fullName", "datafileFormat.name", 
"sample.name", - "sample.type.name", "title", "summary", "facility.name", "user.fullName", "type.name")); - - indexedEntities.addAll(Arrays.asList("Datafile", "Dataset", "Investigation", "DatafileParameter", - "DatasetParameter", "InstrumentScientist", "InvestigationInstrument", "InvestigationParameter", - "InvestigationUser", "Sample")); - - relationships.put("Instrument", - new ParentRelationship[] { new ParentRelationship("InvestigationInstrument", "instrument") }); - relationships.put("User", new ParentRelationship[] { new ParentRelationship("InvestigationUser", "user"), - new ParentRelationship("InstrumentScientist", "user") }); - relationships.put("Sample", new ParentRelationship[] { new ParentRelationship("Dataset", "sample") }); - relationships.put("SampleType", new ParentRelationship[] { new ParentRelationship("Sample", "type"), - new ParentRelationship("Dataset", "sample.type") }); - relationships.put("InvestigationType", - new ParentRelationship[] { new ParentRelationship("Investigation", "type") }); - relationships.put("DatasetType", new ParentRelationship[] { new ParentRelationship("Dataset", "type") }); - relationships.put("DatafileFormat", - new ParentRelationship[] { new ParentRelationship("Datafile", "datafileFormat") }); - relationships.put("Facility", new ParentRelationship[] { new ParentRelationship("Investigation", "facility") }); - relationships.put("ParameterType", - new ParentRelationship[] { new ParentRelationship("DatafileParameter", "type"), - new ParentRelationship("DatasetParameter", "type"), - new ParentRelationship("InvestigationParameter", "type") }); - relationships.put("Investigation", - new ParentRelationship[] { new ParentRelationship("Dataset", "investigation"), - new ParentRelationship("datafile", "investigation") }); - - genericParser.setAllowLeadingWildcard(true); - genericParser.setAnalyzer(analyzer); - - CharSequence[] datafileFields = { "name", "description", "doi", "location", "datafileFormat.name" }; - datafileParser.setAllowLeadingWildcard(true); - datafileParser.setAnalyzer(analyzer); - datafileParser.setMultiFields(datafileFields); - - CharSequence[] datasetFields = { "name", "description", "doi", "sample.name", "sample.type.name", "type.name" }; - datasetParser.setAllowLeadingWildcard(true); - datasetParser.setAnalyzer(analyzer); - datasetParser.setMultiFields(datasetFields); - - CharSequence[] investigationFields = { "name", "visitId", "title", "summary", "doi", "facility.name", - "type.name" }; - investigationParser.setAllowLeadingWildcard(true); - investigationParser.setAnalyzer(analyzer); - investigationParser.setMultiFields(investigationFields); - - CharSequence[] sampleFields = { "name", "type.name" }; - sampleParser.setAllowLeadingWildcard(true); - sampleParser.setAnalyzer(analyzer); - sampleParser.setMultiFields(sampleFields); - } private final FacetsConfig facetsConfig = new FacetsConfig(); @@ -492,8 +336,8 @@ public ParentRelationship(String parentName, String fieldPrefix) { private Timer timer; - private Map searches = new ConcurrentHashMap<>(); - private IcatUnits icatUnits; + private Map searches = new ConcurrentHashMap<>(); + public IcatUnits icatUnits; /** * return the version of the lucene server @@ -516,7 +360,6 @@ public String getVersion() { @Consumes(MediaType.APPLICATION_JSON) @Path("modify") public void modify(@Context HttpServletRequest request) throws LuceneException { - logger.debug("Requesting modify"); int count = 0; try (JsonReader reader = Json.createReader(request.getInputStream())) { @@ -572,89 +415,6 @@ public 
void addNow(@Context HttpServletRequest request, @PathParam("entityName") logger.debug("Added {} {} documents", documents.size(), entityName); } - /** - * Extracts values from queryJson in order to add one or more range query terms - * using queryBuilder. - * - * Note that values in queryJson are expected to be precise only to the minute, - * and so to ensure that our range is inclusive, we add 59.999 seconds onto the - * upper value only. - * - * If either upper or lower keys do not yield values then a half open range is - * created. If both are absent, then nothing is added to the query. - * - * @param queryBuilder Builder for the Lucene query. - * @param queryJson JsonObject representing the query parameters. - * @param lowerKey Key in queryJson of the lower date value - * @param upperKey Key in queryJson of the upper date value - * @param fields Name of one or more fields to apply the range query to. - * @throws LuceneException - */ - private static void buildDateRanges(Builder queryBuilder, JsonObject queryJson, String lowerKey, String upperKey, - String... fields) throws LuceneException { - Long lower = parseDate(queryJson, lowerKey, 0); - Long upper = parseDate(queryJson, upperKey, 59999); - if (lower != null || upper != null) { - lower = (lower == null) ? Long.MIN_VALUE : lower; - upper = (upper == null) ? Long.MAX_VALUE : upper; - for (String field : fields) { - queryBuilder.add(LongPoint.newRangeQuery(field, lower, upper), Occur.MUST); - } - } - } - - /** - * Builds Term queries (exact string matches without tokenizing) from the filter - * object in the query request. This is intended to be used with the faceting, - * with the fields having the ".keyword" suffix. - * - * @param requestedQuery Json object containing details of the query. - * @param queryBuilder Builder for the overall boolean query to be build. - * @throws LuceneException If the values in the filter object are neither STRING - * nor ARRAY of STRING. 
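For illustration, the half-open behaviour described above expressed in Lucene terms, assuming only the lower key was supplied (2022-01-01T00:00Z in ms since epoch):

Builder queryBuilder = new BooleanQuery.Builder();
// a missing upper value falls back to Long.MAX_VALUE, giving a half-open range
queryBuilder.add(LongPoint.newRangeQuery("date", 1640995200000L, Long.MAX_VALUE), Occur.MUST);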
- */ - private void buildFilterQueries(JsonObject requestedQuery, BooleanQuery.Builder queryBuilder) - throws LuceneException { - if (requestedQuery.containsKey("filter")) { - JsonObject filterObject = requestedQuery.getJsonObject("filter"); - for (String fld : filterObject.keySet()) { - ValueType valueType = filterObject.get(fld).getValueType(); - switch (valueType) { - case ARRAY: - BooleanQuery.Builder dimensionQuery = new BooleanQuery.Builder(); - for (JsonString value : filterObject.getJsonArray(fld).getValuesAs(JsonString.class)) { - dimensionQuery.add(new TermQuery(new Term(fld, value.getString())), Occur.SHOULD); - } - queryBuilder.add(dimensionQuery.build(), Occur.FILTER); - break; - - case STRING: - queryBuilder.add(new TermQuery(new Term(fld, filterObject.getString(fld))), Occur.FILTER); - break; - - default: - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "filter object values should be STRING or ARRAY, but were " + valueType); - } - } - } - } - - private void buildUserNameQuery(Map> readerMap, String userName, - BooleanQuery.Builder theQuery, String toField) - throws IOException, LuceneException { - TermQuery fromQuery = new TermQuery(new Term("user.name", userName)); - Query investigationUserQuery = JoinUtil.createJoinQuery("investigation.id", false, toField, fromQuery, - getSearcher(readerMap, "InvestigationUser"), ScoreMode.None); - Query instrumentScientistQuery = JoinUtil.createJoinQuery("instrument.id", false, "instrument.id", fromQuery, - getSearcher(readerMap, "InstrumentScientist"), ScoreMode.None); - Query investigationInstrumentQuery = JoinUtil.createJoinQuery("investigation.id", false, toField, - instrumentScientistQuery, getSearcher(readerMap, "InvestigationInstrument"), ScoreMode.None); - Builder userNameQueryBuilder = new BooleanQuery.Builder(); - userNameQueryBuilder.add(investigationUserQuery, Occur.SHOULD).add(investigationInstrumentQuery, Occur.SHOULD); - theQuery.add(userNameQueryBuilder.build(), Occur.MUST); - } - /* * This is only for testing purposes. Other calls to the service will not * work properly while this operation is in progress. @@ -683,6 +443,9 @@ public void clear() throws LuceneException { } + /** + * Commits any pending documents to their respective index. + */ @POST @Path("commit") public void commit() throws LuceneException { @@ -699,15 +462,25 @@ public void commit() throws LuceneException { } } + /** + * Creates a new Lucene document, provided that the target index is not locked + * for another operation. + * + * @param operationBody JsonObject containing the "_index" that the new "doc" + * should be created in. 
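As a sketch, the operation body read by create (and by the update/delete operations later in this patch) has the following shape; the values here are invented for illustration:

String json = "{\"_index\": \"Datafile\", \"_id\": \"42\", \"doc\": {\"id\": \"42\", \"name\": \"df.nxs\"}}";
try (JsonReader reader = Json.createReader(new StringReader(json))) {
    JsonObject operationBody = reader.readObject();
    String entityName = operationBody.getString("_index"); // "Datafile"
    JsonObject doc = operationBody.getJsonObject("doc"); // handed on to parseDocument
}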
+ * @throws NumberFormatException + * @throws IOException + * @throws LuceneException + */ private void create(JsonObject operationBody) throws NumberFormatException, IOException, LuceneException { String entityName = operationBody.getString("_index"); - if (relationships.containsKey(entityName)) { + if (DocumentMapping.relationships.containsKey(entityName)) { updateByRelation(operationBody, false); } - if (indexedEntities.contains(entityName)) { + if (DocumentMapping.indexedEntities.contains(entityName)) { Document document = parseDocument(operationBody.getJsonObject("doc")); logger.trace("create {} {}", entityName, document.toString()); - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); if (bucket.locked.get()) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + entityName); @@ -716,6 +489,15 @@ private void create(JsonObject operationBody) throws NumberFormatException, IOEx } } + /** + * Creates a new Lucene document. + * + * @param entityName Name of the entity/index to create the document in. + * @param documentJson JsonObject representation of the document to be created. + * @throws NumberFormatException + * @throws IOException + * @throws LuceneException + */ private void createNow(String entityName, JsonObject documentJson) throws NumberFormatException, IOException, LuceneException { if (!documentJson.containsKey("id")) { @@ -724,10 +506,22 @@ private void createNow(String entityName, JsonObject documentJson) } Document document = parseDocument(documentJson); logger.trace("create {} {}", entityName, document.toString()); - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); bucket.addDocument(facetsConfig.build(document)); } + /** + * Perform search on the Datafile entity/index. + * + * @param request Incoming Http request containing the query as Json. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results to include in the returned + * Json. + * @param sort String of Json representing the sort criteria. + * @return String of Json representing the results of the search. 
+ * @throws LuceneException + */ @POST @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) @@ -737,8 +531,9 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("search Long uid = null; try { uid = bucketNum.getAndIncrement(); - Search search = datafilesQuery(request, sort, uid); - return luceneSearchResult("Datafile", search, searchAfter, maxResults, uid); + SearchBucket search = new SearchBucket(this, SearchType.DATAFILE, request, sort, searchAfter); + searches.put(uid, search); + return luceneSearchResult("Datafile", search, searchAfter, maxResults); } catch (Exception e) { logger.error("Error", e); freeSearcher(uid); @@ -746,50 +541,18 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("search } } - private Search datafilesQuery(HttpServletRequest request, String sort, Long uid) - throws IOException, QueryNodeException, LuceneException { - Search search = new Search(); - searches.put(uid, search); - Map> readerMap = new HashMap<>(); - search.searcherMap = readerMap; - search.parseSort(sort); - - try (JsonReader r = Json.createReader(request.getInputStream())) { - JsonObject o = r.readObject(); - JsonObject query = o.getJsonObject("query"); - String userName = query.getString("user", null); - - BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - - buildFilterQueries(query, theQuery); - - if (userName != null) { - buildUserNameQuery(readerMap, userName, theQuery, "investigation.id"); - } - - String text = query.getString("text", null); - if (text != null) { - theQuery.add(datafileParser.parse(text, null), Occur.MUST); - } - - buildDateRanges(theQuery, query, "lower", "upper", "date"); - - if (query.containsKey("parameters")) { - JsonArray parameters = query.getJsonArray("parameters"); - IndexSearcher datafileParameterSearcher = getSearcher(readerMap, "DatafileParameter"); - for (JsonValue p : parameters) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("datafile.id", false, "id", paramQuery.build(), - datafileParameterSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } - } - search.query = maybeEmptyQuery(theQuery); - search.parseFields(o); - } - return search; - } - + /** + * Perform search on the Dataset entity/index. + * + * @param request Incoming Http request containing the query as Json. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results to include in the returned + * Json. + * @param sort String of Json representing the sort criteria. + * @return String of Json representing the results of the search. 
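For reference, a sketch of the query body these search endpoints read; the keys ("user", "text", "lower", "upper", "parameters") are those parsed above, and the values are illustrative:

String body = "{ \"query\": { \"user\": \"fred\", \"text\": \"neutron scan\","
        + " \"lower\": \"202201010000\", \"upper\": \"202212312359\","
        + " \"parameters\": [ { \"name\": \"temperature\", \"units\": \"K\","
        + " \"lowerNumericValue\": 4, \"upperNumericValue\": 300 } ] } }";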
+ * @throws LuceneException + */ @POST @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) @@ -800,8 +563,9 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("search_ Long uid = null; try { uid = bucketNum.getAndIncrement(); - Search search = datasetsQuery(request, sort, uid); - return luceneSearchResult("Dataset", search, searchAfter, maxResults, uid); + SearchBucket search = new SearchBucket(this, SearchType.DATASET, request, sort, searchAfter); + searches.put(uid, search); + return luceneSearchResult("Dataset", search, searchAfter, maxResults); } catch (Exception e) { logger.error("Error", e); freeSearcher(uid); @@ -810,75 +574,24 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("search_ } - private Search datasetsQuery(HttpServletRequest request, String sort, Long uid) - throws IOException, QueryNodeException, LuceneException { - Search search = new Search(); - searches.put(uid, search); - Map> readerMap = new HashMap<>(); - search.searcherMap = readerMap; - search.parseSort(sort); - try (JsonReader r = Json.createReader(request.getInputStream())) { - JsonObject o = r.readObject(); - JsonObject query = o.getJsonObject("query"); - String userName = query.getString("user", null); - - BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - - buildFilterQueries(query, theQuery); - - if (userName != null) { - buildUserNameQuery(readerMap, userName, theQuery, "investigation.id"); - } - - String text = query.getString("text", null); - if (text != null) { - theQuery.add(datasetParser.parse(text, null), Occur.MUST); - } - - buildDateRanges(theQuery, query, "lower", "upper", "startDate", "endDate"); - - if (query.containsKey("parameters")) { - JsonArray parameters = query.getJsonArray("parameters"); - IndexSearcher datasetParameterSearcher = getSearcher(readerMap, "DatasetParameter"); - for (JsonValue p : parameters) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("dataset.id", false, "id", paramQuery.build(), - datasetParameterSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } - } - search.query = maybeEmptyQuery(theQuery); - search.parseFields(o); - } - return search; - } - /** - * Converts String into number of ms since epoch. + * Deletes a Lucene document, provided that the target index is not locked for + * another operation. * - * @param value String representing a Date in the format "yyyyMMddHHmm". - * @return Number of ms since epoch, or null if value was null - * @throws java.text.ParseException + * @param operationBody JsonObject containing the "_index" and the "_id" of the + * Document to be deleted. 
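The date strings above are precise to the minute in the form yyyyMMddHHmm; a minimal sketch of the conversion to ms since epoch, using GMT as configured in the static initialiser:

SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmm");
df.setTimeZone(TimeZone.getTimeZone("GMT"));
// throws java.text.ParseException for malformed input
long ms = df.parse("202201110453").getTime(); // 1641876780000L, i.e. 2022-01-11T04:53Z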
+ * @throws LuceneException + * @throws IOException */ - protected static Long decodeTime(String value) throws java.text.ParseException { - if (value == null) { - return null; - } else { - synchronized (df) { - return df.parse(value).getTime(); - } - } - } - private void delete(JsonObject operationBody) throws LuceneException, IOException { String entityName = operationBody.getString("_index"); - if (relationships.containsKey(entityName)) { + if (DocumentMapping.relationships.containsKey(entityName)) { updateByRelation(operationBody, true); } - if (indexedEntities.contains(entityName)) { + if (DocumentMapping.indexedEntities.contains(entityName)) { String icatId = operationBody.getString("_id"); try { - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); if (bucket.locked.get()) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + entityName); @@ -907,7 +620,8 @@ private void delete(JsonObject operationBody) throws LuceneException, IOExceptio * @throws IOException * @throws LuceneException */ - private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, IndexSearcher searcher, Search search) + private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, IndexSearcher searcher, + SearchBucket search) throws IOException, LuceneException { int luceneDocId = hit.doc; int shardIndex = hit.shardIndex; @@ -921,8 +635,8 @@ private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, In document.forEach(encodeField(gen, search.fields)); for (String joinedEntityName : search.joinedFields.keySet()) { List searchers = getSearchers(search.searcherMap, joinedEntityName); - List shards = getShards(search.searcherMap, joinedEntityName); - Search joinedSearch = new Search(); + List shards = getShards(joinedEntityName); + SearchBucket joinedSearch = new SearchBucket(this); String fld; String parentId; if (joinedEntityName.toLowerCase().contains("investigation")) { @@ -931,11 +645,13 @@ private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, In parentId = document.get("id"); } else { parentId = document.get("investigation.id"); + logger.debug("investigation.id {}", parentId); } } else { fld = entityName.toLowerCase() + ".id"; parentId = document.get("id"); } + logger.debug("fld {}, parentId {}", fld, parentId); joinedSearch.query = new TermQuery(new Term(fld, parentId)); joinedSearch.sort = new Sort(new SortedNumericSortField("id.long", Type.LONG)); TopFieldDocs topFieldDocs = searchShards(joinedSearch, 100, shards, null); @@ -955,9 +671,9 @@ private Consumer encodeField(JsonGenerator gen, Set { String fieldName = field.name(); if (fields.contains(fieldName)) { - if (longFields.contains(fieldName)) { + if (DocumentMapping.longFields.contains(fieldName)) { gen.write(fieldName, field.numericValue().longValue()); - } else if (doubleFields.contains(fieldName)) { + } else if (DocumentMapping.doubleFields.contains(fieldName)) { gen.write(fieldName, field.numericValue().doubleValue()); } else { gen.write(fieldName, field.stringValue()); @@ -984,6 +700,24 @@ private void exit() { } } + /** + * Perform faceting on an entity/index. The query associated with the request + * should determine which Documents to consider, and optionally the dimensions + * to facet. 
If no dimensions are provided, "sparse" faceting is performed + * across relevant string fields (but no Range faceting occurs). + * + * @param entityName Name of the entity/index to facet on. + * @param request Incoming Http request containing the query as Json. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results to include in the returned + * Json. + * @param maxLabels The maximum number of labels to return for each dimension + * of the facets. + * @param sort String of Json representing the sort criteria. + * @return String of Json representing the results of the faceting. + * @throws LuceneException + */ @POST @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) @@ -994,14 +728,21 @@ public String facet(@PathParam("entityName") String entityName, @Context HttpSer Long uid = null; try { uid = bucketNum.getAndIncrement(); - Search search = genericQuery(request, sort, uid); - return luceneFacetResult(entityName, search, searchAfter, maxResults, maxLabels, uid); + SearchBucket search = new SearchBucket(this, SearchType.GENERIC, request, sort, null); + searches.put(uid, search); + return luceneFacetResult(entityName, search, searchAfter, maxResults, maxLabels); } catch (Exception e) { freeSearcher(uid); throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } } + /** + * Releases all IndexSearchers associated with uid. + * + * @param uid Unique Identifier for a set of IndexSearcher to be released. + * @throws LuceneException + */ public void freeSearcher(Long uid) throws LuceneException { if (uid != null) { // May not be set for internal calls Map> search = searches.get(uid).searcherMap; @@ -1009,7 +750,8 @@ public void freeSearcher(Long uid) throws LuceneException { String name = entry.getKey(); List subReaders = entry.getValue(); try { - indexBuckets.computeIfAbsent(name, k -> new IndexBucket(k)).releaseReaders(subReaders); + indexBuckets.computeIfAbsent(name.toLowerCase(), k -> new IndexBucket(k)) + .releaseSearchers(subReaders); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -1019,141 +761,39 @@ public void freeSearcher(Long uid) throws LuceneException { } /** - * Parses a query and associated information from an incoming request without - * any logic specific to a single index or entity. As such it may not be as - * powerful, but is sufficient for simple queries (like those for faceting). + * Gets all IndexSearchers needed for the shards of a given entity/index. * - * @param request Request containing the query and other Json encoded - * information such as fields and dimensions. - * @param sort String representing the sorting criteria for the search. - * @param uid Identifier for the search. - * @return Search object with the query, sort, and optionally the fields and - * dimensions to search set. - * @throws IOException If Json cannot be parsed from the request - * @throws LuceneException If the types of the JsonValues in the query do not - * match those supported by icat.lucene + * @param searcherMap Map of entity names to their IndexSearchers. + * @param name Name of the entity to get the IndexSearchers for. + * @return List of IndexSearchers for name. 
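A sketch of a facet request body using the keys the query and dimension parsing below reads ("dimension", "ranges", "from", "to", "key"); field names and epoch-ms bounds are illustrative. The STRING value in the query object becomes an exact TermQuery:

String facetRequest = "{ \"query\": { \"type.name\": \"calibration\" },"
        + " \"dimensions\": [ { \"dimension\": \"date\","
        + " \"ranges\": [ { \"from\": 1609459200000, \"to\": 1640995200000, \"key\": \"2021\" } ] } ] }";
// the query object above is equivalent to:
Query typeQuery = new TermQuery(new Term("type.name", "calibration"));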
+ * @throws IOException */ - private Search genericQuery(HttpServletRequest request, String sort, Long uid) throws IOException, LuceneException { - Search search = new Search(); - searches.put(uid, search); - Map> readerMap = new HashMap<>(); - search.searcherMap = readerMap; - search.parseSort(sort); - try (JsonReader r = Json.createReader(request.getInputStream())) { - JsonObject o = r.readObject(); - JsonObject jsonQuery = o.getJsonObject("query"); - BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); - for (Entry entry : jsonQuery.entrySet()) { - String field = entry.getKey(); - ValueType valueType = entry.getValue().getValueType(); - switch (valueType) { - case STRING: - JsonString stringValue = (JsonString) entry.getValue(); - luceneQuery.add(new TermQuery(new Term(field, stringValue.getString())), Occur.MUST); - break; - case NUMBER: - JsonNumber numberValue = (JsonNumber) entry.getValue(); - if (longFields.contains(field)) { - luceneQuery.add(LongPoint.newExactQuery(field, numberValue.longValueExact()), Occur.FILTER); - } else if (doubleFields.contains(field)) { - luceneQuery.add(DoublePoint.newExactQuery(field, numberValue.doubleValue()), Occur.FILTER); - } else { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "Value had type NUMBER, but field " + field - + " is not a known longField or doubleField"); - } - break; - case ARRAY: - // Only support array of String as list of ICAT ids is currently only use case - JsonArray arrayValue = (JsonArray) entry.getValue(); - ArrayList bytesArray = new ArrayList<>(); - for (JsonString value : arrayValue.getValuesAs(JsonString.class)) { - bytesArray.add(new BytesRef(value.getChars())); - } - luceneQuery.add(new TermInSetQuery(field, bytesArray), Occur.MUST); - break; - default: - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "Query values should be ARRAY, STRING or NUMBER, but had value of type " + valueType); - } - } - search.query = maybeEmptyQuery(luceneQuery); - logger.info("Query: {}", search.query); - search.parseFields(o); - if (o.containsKey("dimensions")) { - List dimensionObjects = o.getJsonArray("dimensions").getValuesAs(JsonObject.class); - for (JsonObject dimensionObject : dimensionObjects) { - if (!dimensionObject.containsKey("dimension")) { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "'dimension' not specified for facet request " + dimensionObject.toString()); - } - String dimension = dimensionObject.getString("dimension"); - FacetedDimension facetDimensionRequest = new FacetedDimension(dimension); - if (dimensionObject.containsKey("ranges")) { - List ranges = facetDimensionRequest.getRanges(); - if (longFields.contains(dimension)) { - for (JsonObject range : dimensionObject.getJsonArray("ranges") - .getValuesAs(JsonObject.class)) { - Long lower = Long.MIN_VALUE; - Long upper = Long.MAX_VALUE; - if (range.containsKey("from")) { - lower = range.getJsonNumber("from").longValueExact(); - } - if (range.containsKey("to")) { - upper = range.getJsonNumber("to").longValueExact(); - } - String label = lower.toString() + "-" + upper.toString(); - if (range.containsKey("key")) { - label = range.getString("key"); - } - ranges.add(new LongRange(label, lower, true, upper, false)); - } - } else if (doubleFields.contains(dimension)) { - for (JsonObject range : dimensionObject.getJsonArray("ranges") - .getValuesAs(JsonObject.class)) { - Double lower = Double.MIN_VALUE; - Double upper = Double.MAX_VALUE; - String label = lower.toString() + "-" + upper.toString(); - if 
(range.containsKey("from")) { - lower = range.getJsonNumber("from").doubleValue(); - } - if (range.containsKey("to")) { - upper = range.getJsonNumber("to").doubleValue(); - } - if (range.containsKey("key")) { - label = range.getString("key"); - } - ranges.add(new DoubleRange(label, lower, true, upper, false)); - } - } else { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "'ranges' specified for dimension " + dimension - + " but this is not a supported numeric field"); - } - } - search.dimensions.put(dimension, facetDimensionRequest); - } - logger.info("Dimensions: {}", search.dimensions.size()); - } - } - return search; - } - - private List getSearchers(Map> readerMap, String name) + private List getSearchers(Map> searcherMap, String name) throws IOException { - List subSearchers = readerMap.get(name); + String nameLowercase = name.toLowerCase(); + List subSearchers = searcherMap.get(nameLowercase); if (subSearchers == null) { - subSearchers = indexBuckets.computeIfAbsent(name, k -> new IndexBucket(k)).acquireSearchers(); - readerMap.put(name, subSearchers); - logger.debug("Remember searcher for {}", name); + subSearchers = indexBuckets.computeIfAbsent(nameLowercase, k -> new IndexBucket(k)).acquireSearchers(); + searcherMap.put(nameLowercase, subSearchers); + logger.debug("Remember searcher for {}", nameLowercase); } return subSearchers; } - private IndexSearcher getSearcher(Map> readerMap, String name) + /** + * Gets a single IndexSearcher for name. When multiple shards are possible, + * getSearchers should be used instead. + * + * @param searcherMap Map of entity names to their IndexSearchers. + * @param name Name of the entity to get the IndexSearcher for. + * @return The IndexSearcher for name. + * @throws IOException + * @throws LuceneException If there are more than one shard for name. + */ + public IndexSearcher getSearcher(Map> searcherMap, String name) throws IOException, LuceneException { - List subSearchers = readerMap.get(name); - subSearchers = getSearchers(readerMap, name); + List subSearchers = searcherMap.get(name); + subSearchers = getSearchers(searcherMap, name); if (subSearchers.size() > 1) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Cannot get single IndexSearcher for " + name + " as it has " + subSearchers.size() + " shards"); @@ -1161,8 +801,14 @@ private IndexSearcher getSearcher(Map> readerMap, St return subSearchers.get(0); } - private List getShards(Map> readerMap, String name) { - return indexBuckets.computeIfAbsent(name, k -> new IndexBucket(k)).shardList; + /** + * Gets all ShardBuckets of a given entity/index. + * + * @param name Name of the entity to get the ShardBuckets for. + * @return List of ShardBuckets for name. + */ + private List getShards(String name) { + return indexBuckets.computeIfAbsent(name.toLowerCase(), k -> new IndexBucket(k)).shardList; } @PostConstruct @@ -1211,6 +857,18 @@ public void run() { } } + /** + * Perform search on the Investigation entity/index. + * + * @param request Incoming Http request containing the query as Json. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results to include in the returned + * Json. + * @param sort String of Json representing the sort criteria. + * @return String of Json representing the results of the search. 
+ * @throws LuceneException + */ @POST @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) @@ -1220,8 +878,9 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s Long uid = null; try { uid = bucketNum.getAndIncrement(); - Search search = investigationsQuery(request, sort, uid); - return luceneSearchResult("Investigation", search, searchAfter, maxResults, uid); + SearchBucket search = new SearchBucket(this, SearchType.INVESTIGATION, request, sort, searchAfter); + searches.put(uid, search); + return luceneSearchResult("Investigation", search, searchAfter, maxResults); } catch (Exception e) { logger.error("Error", e); freeSearcher(uid); @@ -1229,81 +888,19 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s } } - private Search investigationsQuery(HttpServletRequest request, String sort, Long uid) - throws IOException, QueryNodeException, LuceneException { - Search search = new Search(); - searches.put(uid, search); - Map> readerMap = new HashMap<>(); - search.searcherMap = readerMap; - search.parseSort(sort); - try (JsonReader r = Json.createReader(request.getInputStream())) { - JsonObject o = r.readObject(); - JsonObject query = o.getJsonObject("query"); - String userName = query.getString("user", null); - - BooleanQuery.Builder theQuery = new BooleanQuery.Builder(); - - buildFilterQueries(query, theQuery); - - if (userName != null) { - buildUserNameQuery(readerMap, userName, theQuery, "id"); - } - - String text = query.getString("text", null); - if (text != null) { - theQuery.add(investigationParser.parse(text, null), Occur.MUST); - } - - buildDateRanges(theQuery, query, "lower", "upper", "startDate", "endDate"); - - if (query.containsKey("parameters")) { - JsonArray parameters = query.getJsonArray("parameters"); - IndexSearcher investigationParameterSearcher = getSearcher(readerMap, "InvestigationParameter"); - - for (JsonValue p : parameters) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", paramQuery.build(), - investigationParameterSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } - } - - if (query.containsKey("samples")) { - JsonArray samples = query.getJsonArray("samples"); - IndexSearcher sampleSearcher = getSearcher(readerMap, "Sample"); - - for (JsonValue s : samples) { - JsonString sample = (JsonString) s; - BooleanQuery.Builder sampleQuery = new BooleanQuery.Builder(); - sampleQuery.add(sampleParser.parse(sample.getString(), null), Occur.MUST); - Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", sampleQuery.build(), - sampleSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } - } - - String userFullName = query.getString("userFullName", null); - if (userFullName != null) { - BooleanQuery.Builder userFullNameQuery = new BooleanQuery.Builder(); - userFullNameQuery.add(genericParser.parse(userFullName, "user.fullName"), Occur.MUST); - IndexSearcher investigationUserSearcher = getSearcher(readerMap, "InvestigationUser"); - Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", userFullNameQuery.build(), - investigationUserSearcher, ScoreMode.None); - theQuery.add(toQuery, Occur.MUST); - } - - search.query = maybeEmptyQuery(theQuery); - search.parseFields(o); - } - logger.info("Query: {}", search.query); - return search; - } - + /** + * Locks the specified index for population, removing all existing documents and + * preventing normal modify 
operations until the index is unlocked. + * + * @param entityName Name of the entity/index to lock. + * @throws LuceneException If already locked, or if there's an IOException when + * deleting documents. + */ @POST @Path("lock/{entityName}") public void lock(@PathParam("entityName") String entityName) throws LuceneException { logger.info("Requesting lock of {} index", entityName); - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); if (!bucket.locked.compareAndSet(false, true)) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene already locked for " + entityName); @@ -1317,8 +914,26 @@ public void lock(@PathParam("entityName") String entityName) throws LuceneExcept } } - private String luceneFacetResult(String name, Search search, String searchAfter, int maxResults, int maxLabels, - Long uid) throws IOException, IllegalStateException, LuceneException { + /** + * Perform faceting on an entity/index. + * + * @param name Entity/index to facet. + * @param search SearchBucket containing the search query, dimensions to + * facet etc. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results from the search. + * @param maxLabels The maximum number of labels to return for each dimension + * of the facets. + * @return String of Json representing the facets of the search results. + * @throws IOException + * @throws IllegalStateException If the IndexSearcher and its DirectoryReader + * are not in sync. + * @throws LuceneException If ranges are provided for a non-numeric field, + * or something else goes wrong. 
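A minimal sketch of the numeric branch of luceneFacetResult described above: matches are collected once, then counted per range for a long field (the searcher, query and limits are assumed given):

FacetsCollector facetsCollector = new FacetsCollector();
FacetsCollector.search(indexSearcher, query, maxResults, facetsCollector);
LongRange[] ranges = {
        new LongRange("2021", 1609459200000L, true, 1640995200000L, false),
        new LongRange("2022", 1640995200000L, true, 1672531200000L, false) };
Facets facets = new LongRangeFacetCounts("date", facetsCollector, ranges);
FacetResult result = facets.getTopChildren(maxLabels, "date");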
+ */ + private String luceneFacetResult(String name, SearchBucket search, String searchAfter, int maxResults, + int maxLabels) throws IOException, IllegalStateException, LuceneException { // If no dimensions were specified, perform "sparse" faceting on all applicable // string values boolean sparse = search.dimensions.size() == 0; @@ -1334,16 +949,18 @@ private String luceneFacetResult(String name, Search search, String searchAfter, logger.debug("Faceting {} with {} after {} ", name, search.query, searchAfter); for (IndexSearcher indexSearcher : searchers) { FacetsCollector facetsCollector = new FacetsCollector(); - FacetsCollector.search(indexSearcher, search.query, maxResults, facetsCollector); + TopDocs results = FacetsCollector.search(indexSearcher, search.query, maxResults, facetsCollector); + logger.debug("{}", results.totalHits); for (FacetedDimension facetedDimension : search.dimensions.values()) { if (facetedDimension.getRanges().size() > 0) { + logger.debug("Ranges: {}", facetedDimension.getRanges().get(0).getClass().getSimpleName()); // Perform range based facets for a numeric field String dimension = facetedDimension.getDimension(); Facets facets; - if (longFields.contains(dimension)) { + if (DocumentMapping.longFields.contains(dimension)) { LongRange[] ranges = facetedDimension.getRanges().toArray(new LongRange[0]); facets = new LongRangeFacetCounts(dimension, facetsCollector, ranges); - } else if (doubleFields.contains(dimension)) { + } else if (DocumentMapping.doubleFields.contains(dimension)) { DoubleRange[] ranges = facetedDimension.getRanges().toArray(new DoubleRange[0]); facets = new DoubleRangeFacetCounts(dimension, facetsCollector, ranges); } else { @@ -1374,8 +991,10 @@ private String luceneFacetResult(String name, Search search, String searchAfter, Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); List facetResults = facets.getAllDims(maxLabels); for (FacetResult facetResult : facetResults) { - String dimension = facetResult.dim; + String dimension = facetResult.dim.replace(".keyword", ""); FacetedDimension facetedDimension = search.dimensions.get(dimension); + logger.debug("String facets found for {}, requested dimensions were {}", dimension, + search.dimensions.keySet()); if (facetedDimension != null) { facetedDimension.addResult(facetResult); } @@ -1397,7 +1016,9 @@ private String luceneFacetResult(String name, Search search, String searchAfter, // Build results JsonObjectBuilder aggregationsBuilder = Json.createObjectBuilder(); search.dimensions.values().forEach(facetedDimension -> facetedDimension.buildResponse(aggregationsBuilder)); - return Json.createObjectBuilder().add("aggregations", aggregationsBuilder).build().toString(); + String aggregations = Json.createObjectBuilder().add("aggregations", aggregationsBuilder).build().toString(); + logger.debug("aggregations: {}", aggregations); + return aggregations; } /** @@ -1416,25 +1037,37 @@ private String luceneFacetResult(String name, Search search, String searchAfter, private void addFacetResults(int maxLabels, Map facetedDimensionMap, Facets facets) throws IOException { for (FacetResult facetResult : facets.getAllDims(maxLabels)) { - String dim = facetResult.dim; + String dim = facetResult.dim.replace(".keyword", ""); logger.trace("Sparse faceting: FacetResult for {}", dim); FacetedDimension facetedDimension = facetedDimensionMap.get(dim); if (facetedDimension == null) { - facetedDimension = new FacetedDimension(facetResult.dim); + facetedDimension = new FacetedDimension(dim); 
facetedDimensionMap.put(dim, facetedDimension); } facetedDimension.addResult(facetResult); } } - private String luceneSearchResult(String name, Search search, String searchAfter, int maxResults, Long uid) + /** + * Perform search on name. + * + * @param name Entity/index to search. + * @param search SearchBucket containing the search query, dimensions to + * facet etc. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results from the search. + * @return String of Json representing the results of the search. + * @throws IOException + * @throws LuceneException + */ + private String luceneSearchResult(String name, SearchBucket search, String searchAfter, int maxResults) throws IOException, LuceneException { List searchers = getSearchers(search.searcherMap, name); - List shards = getShards(search.searcherMap, name); + List shards = getShards(name); String format = "Search {} with: query {}, maxResults {}, searchAfter {}, scored {}, fields {}"; logger.debug(format, name, search.query, maxResults, searchAfter, search.scored, search.fields); - FieldDoc searchAfterDoc = parseSearchAfter(searchAfter, search.sort.getSort()); - TopFieldDocs topFieldDocs = searchShards(search, maxResults, shards, searchAfterDoc); + TopFieldDocs topFieldDocs = searchShards(search, maxResults, shards, search.searchAfter); ScoreDoc[] hits = topFieldDocs.scoreDocs; TotalHits totalHits = topFieldDocs.totalHits; SortField[] fields = topFieldDocs.fields; @@ -1505,7 +1138,18 @@ private String luceneSearchResult(String name, Search search, String searchAfter return baos.toString(); } - private TopFieldDocs searchShards(Search search, int maxResults, List shards, + /** + * Performs a search by iterating over all relevant shards. + * + * @param search SearchBucket containing the search query, dimensions to + * facet etc. + * @param maxResults The maximum number of results from the search. + * @param shards List of all ShardBuckets for the entity to be searched. + * @param searchAfterDoc The last Lucene FieldDoc from a previous search. + * @return Lucene TopFieldDocs resulting from the search. + * @throws IOException + */ + private TopFieldDocs searchShards(SearchBucket search, int maxResults, List shards, FieldDoc searchAfterDoc) throws IOException { TopFieldDocs topFieldDocs; if (shards.size() > 0) { @@ -1544,51 +1188,6 @@ private TopFieldDocs searchShards(Search search, int maxResults, List date to make sorting easier? - if (longFields.contains(key)) { + if (DocumentMapping.longFields.contains(key)) { document.add(new NumericDocValuesField(key, json.getJsonNumber(key).longValueExact())); - } else if (doubleFields.contains(key)) { + } else if (DocumentMapping.doubleFields.contains(key)) { long sortableLong = NumericUtils.doubleToSortableLong(json.getJsonNumber(key).doubleValue()); document.add(new NumericDocValuesField(key, sortableLong)); } else { @@ -1677,9 +1291,16 @@ private void addSortField(JsonObject json, Document document, String key) { } } + /** + * Re-adds the content of a Lucene IndexableField to a Lucene Document. This is + * needed when updating Documents to ensure sorting is not lost. + * + * @param field Lucene IndexableField to be re-added to the document. + * @param document Lucene Document being built. 
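The searchShards body is elided by the hunk above; a minimal sketch of the pattern it implies, assuming one IndexSearcher per shard and Lucene's standard merge of per-shard results:

TopFieldDocs[] shardHits = new TopFieldDocs[searchers.size()];
for (int i = 0; i < searchers.size(); i++) {
    // searchAfterDoc may be null on the first page
    shardHits[i] = searchers.get(i).searchAfter(searchAfterDoc, search.query, maxResults, search.sort);
}
// combine the per-shard hits into a single result set ordered by search.sort
TopFieldDocs merged = TopDocs.merge(search.sort, maxResults, shardHits);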
+ */ private void addSortField(IndexableField field, Document document) { String key = field.name(); - if (sortFields.contains(key)) { + if (DocumentMapping.sortFields.contains(key)) { if (key.equals("id")) { // Id is a special case, as we need it to be SORTED as a byte ref to allow joins // but also SORTED_NUMERIC to ensure a deterministic order to results @@ -1687,9 +1308,9 @@ private void addSortField(IndexableField field, Document document) { document.add(new NumericDocValuesField("id.long", value)); document.add(new StoredField("id.long", value)); } - if (longFields.contains(key)) { + if (DocumentMapping.longFields.contains(key)) { document.add(new NumericDocValuesField(key, field.numericValue().longValue())); - } else if (doubleFields.contains(key)) { + } else if (DocumentMapping.doubleFields.contains(key)) { long sortableLong = NumericUtils.doubleToSortableLong(field.numericValue().doubleValue()); document.add(new NumericDocValuesField(key, sortableLong)); } else { @@ -1742,108 +1363,20 @@ private Document pruneDocument(String fieldPrefix, Document oldDocument) { return newDocument; } - private Builder parseParameter(JsonValue p) throws LuceneException { - JsonObject parameter = (JsonObject) p; - BooleanQuery.Builder paramQuery = new BooleanQuery.Builder(); - String pName = parameter.getString("name", null); - if (pName != null) { - paramQuery.add(new WildcardQuery(new Term("type.name.keyword", pName)), Occur.MUST); - } - - String pUnits = parameter.getString("units", null); - if (pUnits != null) { - paramQuery.add(new WildcardQuery(new Term("type.units", pUnits)), Occur.MUST); - } - if (parameter.containsKey("stringValue")) { - String pStringValue = parameter.getString("stringValue", null); - paramQuery.add(new WildcardQuery(new Term("stringValue", pStringValue)), Occur.MUST); - } else if (parameter.containsKey("lowerDateValue") && parameter.containsKey("upperDateValue")) { - buildDateRanges(paramQuery, parameter, "lowerDateValue", "upperDateValue", "dateTimeValue"); - } else if (parameter.containsKey("lowerNumericValue") && parameter.containsKey("upperNumericValue")) { - Double pLowerNumericValue = parameter.getJsonNumber("lowerNumericValue").doubleValue(); - Double pUpperNumericValue = parameter.getJsonNumber("upperNumericValue").doubleValue(); - paramQuery.add(DoublePoint.newRangeQuery("numericValue", pLowerNumericValue, pUpperNumericValue), - Occur.MUST); - } - return paramQuery; - } - /** - * Parses a Lucene ScoreDoc to be "searched after" from a String representation - * of a JSON array. + * Unlocks the specified index after population, committing all pending documents + * and + * allowing normal modify operations again. * - * @param searchAfter String representation of a JSON object containing the - * document id or "doc" (String), score ("float") in that - * order. - * @return FieldDoc object built from the provided String, or null if - * searchAfter was itself null or an empty String. - * @throws LuceneException If an entry in the fields array is not a STRING or - * NUMBER + * @param entityName Name of the entity/index to unlock. + * @throws LuceneException If not locked, or if there's an IOException when + * committing documents.
*/ - private FieldDoc parseSearchAfter(String searchAfter, SortField[] sortFields) throws LuceneException { - if (searchAfter == null || searchAfter.equals("")) { - return null; - } - logger.debug("Attempting to parseSearchAfter from {}", searchAfter); - JsonReader reader = Json.createReader(new StringReader(searchAfter)); - JsonObject object = reader.readObject(); - // shardIndex and Lucene doc Id are always needed to determine tie breaks, even - // if the field sort resulted in no ties in the first place - int shardIndex = object.getInt("shardIndex"); - int doc = object.getInt("doc"); - float score = Float.NaN; - List fields = new ArrayList<>(); - if (object.containsKey("score")) { - score = object.getJsonNumber("score").bigDecimalValue().floatValue(); - } - if (object.containsKey("fields")) { - JsonArray jsonArray = object.getJsonArray("fields"); - if (jsonArray.size() != sortFields.length) { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "fields should have the same length as sort, but they were " - + jsonArray.size() + " and " + sortFields.length); - } - for (int i = 0; i < sortFields.length; i++) { - JsonValue value = jsonArray.get(i); - switch (value.getValueType()) { - case NUMBER: - JsonNumber number = ((JsonNumber) value); - switch (sortFields[i].getType()) { - case FLOAT: - case DOUBLE: - case SCORE: - fields.add(number.bigDecimalValue().floatValue()); - break; - case INT: - case LONG: - case DOC: - case CUSTOM: - fields.add(number.longValueExact()); - break; - default: - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "fields contained a NUMBER but the corresponding field was " - + sortFields[i]); - } - break; - case STRING: - fields.add(new BytesRef(((JsonString) value).getString())); - break; - default: - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "fields should be an array of STRING and NUMBER, but had entry of type " - + value.getValueType()); - } - } - } - return new FieldDoc(doc, score, fields.toArray(), shardIndex); - } - @POST @Path("unlock/{entityName}") public void unlock(@PathParam("entityName") String entityName) throws LuceneException { logger.debug("Requesting unlock of {} index", entityName); - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); if (!bucket.locked.compareAndSet(true, false)) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene is not currently locked for " + entityName); @@ -1855,15 +1388,26 @@ public void unlock(@PathParam("entityName") String entityName) throws LuceneExce } } + /** + * Updates an existing Lucene document, provided that the target index is not + * locked + * for another operation. + * + * @param operationBody JsonObject containing the "_index" that the new "doc" + * should be created in. 
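For reference, a search_after token matching the keys parseSearchAfter reads: "shardIndex" and "doc" are always required for tie-breaking, "score" is optional, and "fields" must have one entry per sort field (here a LONG field plus the id.long tie-breaker):

String searchAfter = "{ \"doc\": 17, \"shardIndex\": 0, \"score\": 1.0,"
        + " \"fields\": [1641876780000, 42] }";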
+ * @throws LuceneException + * @throws NumberFormatException + * @throws IOException + */ private void update(JsonObject operationBody) throws LuceneException, NumberFormatException, IOException { String entityName = operationBody.getString("_index"); - if (relationships.containsKey(entityName)) { + if (DocumentMapping.relationships.containsKey(entityName)) { updateByRelation(operationBody, false); } - if (indexedEntities.contains(entityName)) { + if (DocumentMapping.indexedEntities.contains(entityName)) { String icatId = operationBody.getString("_id"); Document document = parseDocument(operationBody.getJsonObject("doc")); - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); if (bucket.locked.get()) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + entityName); @@ -1873,11 +1417,28 @@ private void update(JsonObject operationBody) throws LuceneException, NumberForm } } + /** + * Updates an existing Lucene document, provided that the target index is not + * locked + * for another operation. In this case, the entity being updated does not have + * its own index, but exists as fields on a parent. For example, + * InvestigationType on an Investigation. + * + * @param operationBody JsonObject containing the "_index" that the new "doc" + * should be created in. + * @param delete Whether to delete the related entity (or just update its + * values). + * @throws LuceneException + * @throws NumberFormatException + * @throws IOException + */ private void updateByRelation(JsonObject operationBody, Boolean delete) throws LuceneException, NumberFormatException, IOException { - for (ParentRelationship parentRelationship : relationships.get(operationBody.getString("_index"))) { + for (DocumentMapping.ParentRelationship parentRelationship : DocumentMapping.relationships + .get(operationBody.getString("_index"))) { String childId = operationBody.getString("_id"); - IndexBucket bucket = indexBuckets.computeIfAbsent(parentRelationship.parentName, k -> new IndexBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(parentRelationship.parentName.toLowerCase(), + k -> new IndexBucket(k)); if (bucket.locked.get()) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + parentRelationship.parentName); diff --git a/src/main/java/org/icatproject/lucene/SearchBucket.java b/src/main/java/org/icatproject/lucene/SearchBucket.java new file mode 100644 index 0000000..21dd667 --- /dev/null +++ b/src/main/java/org/icatproject/lucene/SearchBucket.java @@ -0,0 +1,813 @@ +package org.icatproject.lucene; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.StringReader; +import java.net.HttpURLConnection; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TimeZone; +import java.util.Map.Entry; + +import javax.json.Json; +import javax.json.JsonArray; +import javax.json.JsonNumber; +import javax.json.JsonObject; +import javax.json.JsonReader; +import javax.json.JsonString; +import javax.json.JsonValue; +import javax.json.JsonValue.ValueType; +import javax.servlet.http.HttpServletRequest; + +import org.apache.lucene.document.DoublePoint; +import org.apache.lucene.document.LongPoint; +import 
org.apache.lucene.facet.range.DoubleRange; +import org.apache.lucene.facet.range.LongRange; +import org.apache.lucene.facet.range.Range; +import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.flexible.core.QueryNodeException; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FieldDoc; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortField; +import org.apache.lucene.search.SortedNumericSortField; +import org.apache.lucene.search.TermInSetQuery; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.WildcardQuery; +import org.apache.lucene.search.BooleanClause.Occur; +import org.apache.lucene.search.BooleanQuery.Builder; +import org.apache.lucene.search.SortField.Type; +import org.apache.lucene.search.join.JoinUtil; +import org.apache.lucene.search.join.ScoreMode; +import org.apache.lucene.util.BytesRef; +import org.icatproject.lucene.exceptions.LuceneException; +import org.icatproject.utils.IcatUnits.SystemValue; + +/** + * Bucket for information relating to a single search. + */ +public class SearchBucket { + + public enum SearchType { + DATAFILE, DATASET, INVESTIGATION, GENERIC + } + + private Lucene lucene; + public Map<String, List<IndexSearcher>> searcherMap; + public Query query; + public Sort sort; + public FieldDoc searchAfter; + public boolean scored; + public Set<String> fields = new HashSet<>(); + public Map<String, Set<String>> joinedFields = new HashMap<>(); + public Map<String, FacetedDimension> dimensions = new HashMap<>(); + public boolean aborted = false; + private static final SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmm"); + + static { + TimeZone tz = TimeZone.getTimeZone("GMT"); + df.setTimeZone(tz); + } + + /** + * Creates an empty search bucket. + * + * @param lucene Lucene instance. + */ + public SearchBucket(Lucene lucene) { + this.lucene = lucene; + } + + /** + * Creates a new search from the provided request and URL parameters. + * + * @param lucene Lucene instance. + * @param searchType The SearchType determines how the query is built for + * specific entities. + * @param request Incoming Http request containing the query as Json. + * @param sort Sort criteria as a Json encoded string. + * @param searchAfter The last FieldDoc of a previous search, encoded as Json.
+ * @throws LuceneException + * @throws IOException + * @throws QueryNodeException + */ + public SearchBucket(Lucene lucene, SearchType searchType, HttpServletRequest request, String sort, String searchAfter) + throws LuceneException, IOException, QueryNodeException { + this.lucene = lucene; + searcherMap = new HashMap<>(); + parseSort(sort); + try (JsonReader r = Json.createReader(request.getInputStream())) { + JsonObject o = r.readObject(); + parseFields(o); + parseDimensions(o); // Don't need for DF + JsonObject jsonQuery = o.getJsonObject("query"); + BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); + String userName; + String text; + switch (searchType) { + case GENERIC: + parseGenericQuery(jsonQuery, luceneQuery); + return; + case DATAFILE: + parseSearchAfter(searchAfter); + buildFilterQueries("datafile", jsonQuery, luceneQuery); + + userName = jsonQuery.getString("user", null); + if (userName != null) { + buildUserNameQuery(userName, luceneQuery, "investigation.id"); + } + + text = jsonQuery.getString("text", null); + if (text != null) { + luceneQuery.add(DocumentMapping.datafileParser.parse(text, null), Occur.MUST); + } + + buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "date"); + + if (jsonQuery.containsKey("parameters")) { + JsonArray parameters = jsonQuery.getJsonArray("parameters"); + IndexSearcher datafileParameterSearcher = lucene.getSearcher(searcherMap, "DatafileParameter"); + for (JsonValue p : parameters) { + BooleanQuery.Builder paramQuery = parseParameter(p); + Query toQuery = JoinUtil.createJoinQuery("datafile.id", false, "id", paramQuery.build(), + datafileParameterSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); + } + } + query = maybeEmptyQuery(luceneQuery); + return; + case DATASET: + parseSearchAfter(searchAfter); + buildFilterQueries("dataset", jsonQuery, luceneQuery); + + userName = jsonQuery.getString("user", null); + if (userName != null) { + buildUserNameQuery(userName, luceneQuery, "investigation.id"); + } + + text = jsonQuery.getString("text", null); + if (text != null) { + luceneQuery.add(DocumentMapping.datasetParser.parse(text, null), Occur.MUST); + } + + buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "startDate", "endDate"); + + if (jsonQuery.containsKey("parameters")) { + JsonArray parameters = jsonQuery.getJsonArray("parameters"); + IndexSearcher parameterSearcher = lucene.getSearcher(searcherMap, "DatasetParameter"); + for (JsonValue p : parameters) { + BooleanQuery.Builder paramQuery = parseParameter(p); + Query toQuery = JoinUtil.createJoinQuery("dataset.id", false, "id", paramQuery.build(), + parameterSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); + } + } + query = maybeEmptyQuery(luceneQuery); + return; + case INVESTIGATION: + parseSearchAfter(searchAfter); + buildFilterQueries("investigation", jsonQuery, luceneQuery); + + userName = jsonQuery.getString("user", null); + if (userName != null) { + buildUserNameQuery(userName, luceneQuery, "id"); + } + + text = jsonQuery.getString("text", null); + if (text != null) { + luceneQuery.add(DocumentMapping.investigationParser.parse(text, null), Occur.MUST); + } + + buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "startDate", "endDate"); + + if (jsonQuery.containsKey("parameters")) { + JsonArray parameters = jsonQuery.getJsonArray("parameters"); + IndexSearcher parameterSearcher = lucene.getSearcher(searcherMap, "InvestigationParameter"); + for (JsonValue p : parameters) { + BooleanQuery.Builder paramQuery = 
parseParameter(p); + Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", + paramQuery.build(), + parameterSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); + } + } + + if (jsonQuery.containsKey("samples")) { + JsonArray samples = jsonQuery.getJsonArray("samples"); + IndexSearcher sampleSearcher = lucene.getSearcher(searcherMap, "Sample"); + + for (JsonValue s : samples) { + JsonString sample = (JsonString) s; + BooleanQuery.Builder sampleQuery = new BooleanQuery.Builder(); + sampleQuery.add(DocumentMapping.sampleParser.parse(sample.getString(), null), Occur.MUST); + Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", + sampleQuery.build(), + sampleSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); + } + } + + String userFullName = jsonQuery.getString("userFullName", null); + if (userFullName != null) { + BooleanQuery.Builder userFullNameQuery = new BooleanQuery.Builder(); + userFullNameQuery.add(DocumentMapping.genericParser.parse(userFullName, "user.fullName"), + Occur.MUST); + IndexSearcher investigationUserSearcher = lucene.getSearcher(searcherMap, "InvestigationUser"); + Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", + userFullNameQuery.build(), + investigationUserSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); + } + query = maybeEmptyQuery(luceneQuery); + return; + } + } + } + + /** + * Extracts values from queryJson in order to add one or more range query terms + * using queryBuilder. + * + * Note that values in queryJson are expected to be precise only to the minute, + * and so to ensure that our range is inclusive, we add 59.999 seconds onto the + * upper value only. + * + * If either the upper or lower key does not yield a value then a half open + * range is created. If both are absent, then nothing is added to the query. + * + * @param queryBuilder Builder for the Lucene query. + * @param queryJson JsonObject representing the query parameters. + * @param lowerKey Key in queryJson of the lower date value. + * @param upperKey Key in queryJson of the upper date value. + * @param fields Name of one or more fields to apply the range query to. + * @throws LuceneException + */ + private void buildDateRanges(Builder queryBuilder, JsonObject queryJson, String lowerKey, String upperKey, + String... fields) throws LuceneException { + Long lower = parseDate(queryJson, lowerKey, 0); + Long upper = parseDate(queryJson, upperKey, 59999); + if (lower != null || upper != null) { + lower = (lower == null) ? Long.MIN_VALUE : lower; + upper = (upper == null) ? Long.MAX_VALUE : upper; + for (String field : fields) { + queryBuilder.add(LongPoint.newRangeQuery(field, lower, upper), Occur.MUST); + } + } + } + + /** + * Builds Term queries (exact string matches without tokenizing), Range queries, + * or Nested/Joined queries from the filter + * object in the query request. + + * @param target The entity this search targets, e.g. "datafile". + * @param requestedQuery Json object containing details of the query. + * @param queryBuilder Builder for the overall boolean query to be built. + * @throws LuceneException If a value in the filter object is not a STRING, + * OBJECT, or an ARRAY of these.
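To make the inclusivity rule in buildDateRanges concrete, a minimal sketch follows; it assumes the same package as SearchBucket (so the protected decodeTime, defined further down, is visible) and uses the "date" field from the datafile mapping:

    import java.text.ParseException;
    import org.apache.lucene.document.LongPoint;
    import org.apache.lucene.search.Query;

    class DateRangeSketch {
        // Upper values are only precise to the minute, so 59999 ms are added
        // (as parseDate does) to cover the whole of the final minute.
        static Query dateRange() throws ParseException {
            long lower = SearchBucket.decodeTime("202001010000"); // 2020-01-01T00:00 GMT
            long upper = SearchBucket.decodeTime("202001312359") + 59999;
            return LongPoint.newRangeQuery("date", lower, upper);
        }
    }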
+ * @throws IOException + */ + private void buildFilterQueries(String target, JsonObject requestedQuery, Builder queryBuilder) + throws LuceneException, IOException { + if (requestedQuery.containsKey("filter")) { + JsonObject filterObject = requestedQuery.getJsonObject("filter"); + for (String key : filterObject.keySet()) { + JsonValue value = filterObject.get(key); + ValueType valueType = value.getValueType(); + int i = key.indexOf("."); + String filterTarget = i == -1 ? key : key.substring(0, i); + String fld = key.substring(i + 1); + Query dimensionQuery; + switch (valueType) { + case ARRAY: + Builder builder = new BooleanQuery.Builder(); + // If the key was just a nested entity (no ".") then we should FILTER all of our + // queries on that entity. + Occur occur = i == -1 ? Occur.FILTER : Occur.SHOULD; + for (JsonValue arrayValue : filterObject.getJsonArray(key)) { + Query arrayQuery = parseFilter(target, fld, arrayValue); + builder.add(arrayQuery, occur); + } + dimensionQuery = builder.build(); + break; + + default: + dimensionQuery = parseFilter(target, fld, value); + } + // Nest the dimension query if needed + if (i != -1 && !target.equals(filterTarget)) { + // If we are targeting a different entity, nest the entire array (whose + // elements were combined as SHOULD) inside a join query + // BUT only if we haven't already nested the queries (as we do when the key was + // just a nested entity) + IndexSearcher nestedSearcher = lucene.getSearcher(searcherMap, filterTarget); + Query nestedQuery = JoinUtil.createJoinQuery(target + ".id", false, "id", dimensionQuery, + nestedSearcher, ScoreMode.None); + queryBuilder.add(nestedQuery, Occur.FILTER); + } else { + // Otherwise, just add as FILTER to the main query directly + queryBuilder.add(dimensionQuery, Occur.FILTER); + } + } + } + } + + /** + * Parses a single filter field value pair into Lucene objects. Can handle + * simple strings, range objects or nested filters.
+ * + * @param target The target entity of the search, which is not necessarily + * the entity this filter applies to + * @param fld The field to apply the query to + * @param value JsonValue (JsonString or JsonObject) to parse a Lucene Query + * from + * @return A Lucene Query object parsed from the provided value + * @throws IOException + * @throws LuceneException + */ + private Query parseFilter(String target, String fld, JsonValue value) throws IOException, LuceneException { + ValueType valueType = value.getValueType(); + switch (valueType) { + case STRING: + // Simplest case involving a single field/value pair + return new TermQuery(new Term(fld + ".keyword", ((JsonString) value).getString())); + + case OBJECT: + JsonObject valueObject = (JsonObject) value; + if (valueObject.containsKey("filter")) { + // Parse a nested query + IndexSearcher nestedSearcher = lucene.getSearcher(searcherMap, fld); + List<JsonObject> nestedFilters = valueObject.getJsonArray("filter").getValuesAs(JsonObject.class); + Builder nestedBoolBuilder = new BooleanQuery.Builder(); + nestedFilters.forEach(nestedFilter -> { + String nestedField = nestedFilter.getString("field"); + if (nestedFilter.containsKey("value")) { + TermQuery query = new TermQuery(new Term(nestedField + ".keyword", nestedFilter.getString("value"))); + nestedBoolBuilder.add(query, Occur.FILTER); + } else { + buildNestedRangeQuery(nestedField, nestedFilter, nestedBoolBuilder); + } + }); + return JoinUtil.createJoinQuery(target + ".id", false, "id", nestedBoolBuilder.build(), + nestedSearcher, ScoreMode.None); + } else { + // Single range of values for a field + JsonNumber from = valueObject.getJsonNumber("from"); + JsonNumber to = valueObject.getJsonNumber("to"); + if (DocumentMapping.longFields.contains(fld)) { + return LongPoint.newRangeQuery(fld, from.longValueExact(), to.longValueExact()); + } else { + return DoublePoint.newRangeQuery(fld, from.doubleValue(), to.doubleValue()); + } + } + + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "filter object values should be STRING or OBJECT, but were " + valueType); + } + } + + /** + * Builds a range query, intended for use with numeric or date/time parameters. + * + * @param fld Name of the field to apply the range to. + * @param valueObject JsonObject containing "from", "to" and optionally "units" + * as keys for a range of values.
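For the single-range branch of parseFilter above, a hedged sketch of the JSON-to-query translation ("numericValue" is one of the configured doubleFields; the bounds are invented):

    import javax.json.Json;
    import javax.json.JsonObject;
    import org.apache.lucene.document.DoublePoint;
    import org.apache.lucene.search.Query;

    class FilterRangeSketch {
        // A filter value of {"from": 1.0, "to": 5.0} on a double field becomes
        // a DoublePoint range query, exactly as in the branch above.
        static Query numericValueFilter() {
            JsonObject valueObject = Json.createObjectBuilder()
                    .add("from", 1.0).add("to", 5.0).build();
            return DoublePoint.newRangeQuery("numericValue",
                    valueObject.getJsonNumber("from").doubleValue(),
                    valueObject.getJsonNumber("to").doubleValue());
        }
    }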
+ * @param builder BooleanQuery.Builder for the nested query + */ + private void buildNestedRangeQuery(String fld, JsonObject valueObject, BooleanQuery.Builder builder) { + if (DocumentMapping.longFields.contains(fld)) { + long from = Long.MIN_VALUE; + long to = Long.MAX_VALUE; + try { + from = valueObject.getJsonNumber("from").longValueExact(); + } catch (ArithmeticException e) { + // pass + } + try { + to = valueObject.getJsonNumber("to").longValueExact(); + } catch (ArithmeticException e) { + // pass + } + builder.add(LongPoint.newRangeQuery(fld, from, to), Occur.FILTER); + } else { + double from = valueObject.getJsonNumber("from").doubleValue(); + double to = valueObject.getJsonNumber("to").doubleValue(); + String units = valueObject.getString("units", null); + if (units != null) { + SystemValue fromValue = lucene.icatUnits.new SystemValue(from, units); + SystemValue toValue = lucene.icatUnits.new SystemValue(to, units); + if (fromValue.value != null && toValue.value != null) { + // If we were able to parse the units, apply query to the SI value + builder.add(DoublePoint.newRangeQuery(fld + "SI", fromValue.value, toValue.value), Occur.FILTER); + } else { + // If units could not be parsed, make them part of the query on the raw data + builder.add(DoublePoint.newRangeQuery(fld, from, to), Occur.FILTER); + builder.add(new TermQuery(new Term("type.units.keyword", units)), Occur.FILTER); + } + } else { + // If units were not provided, just apply to the raw data + builder.add(DoublePoint.newRangeQuery(fld, from, to), Occur.FILTER); + } + } + } + + /** + * Builds a query against InvestigationUser and InstrumentScientist entities + * using the provided userName. + * + * @param userName The value of the user.name field to query for. + * @param luceneQuery BooleanQuery.Builder in use for main entity query. + * @param toField The field on the main entity to join to, practically + * either "id" or "investigation.id". + * @throws IOException + * @throws LuceneException + */ + private void buildUserNameQuery(String userName, BooleanQuery.Builder luceneQuery, String toField) + throws IOException, LuceneException { + TermQuery fromQuery = new TermQuery(new Term("user.name", userName)); + Query investigationUserQuery = JoinUtil.createJoinQuery("investigation.id", false, toField, fromQuery, + lucene.getSearcher(searcherMap, "InvestigationUser"), ScoreMode.None); + Query instrumentScientistQuery = JoinUtil.createJoinQuery("instrument.id", false, "instrument.id", fromQuery, + lucene.getSearcher(searcherMap, "InstrumentScientist"), ScoreMode.None); + Query investigationInstrumentQuery = JoinUtil.createJoinQuery("investigation.id", false, toField, + instrumentScientistQuery, lucene.getSearcher(searcherMap, "InvestigationInstrument"), ScoreMode.None); + Builder userNameQueryBuilder = new BooleanQuery.Builder(); + userNameQueryBuilder.add(investigationUserQuery, Occur.SHOULD).add(investigationInstrumentQuery, Occur.SHOULD); + luceneQuery.add(userNameQueryBuilder.build(), Occur.MUST); + } + + /** + * Converts String into number of ms since epoch. + * + * @param value String representing a Date in the format "yyyyMMddHHmm". 
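The conversion decodeTime performs can be checked in isolation with plain JDK classes (the value is arbitrary):

    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.TimeZone;

    class DecodeTimeSketch {
        public static void main(String[] args) throws ParseException {
            SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmm");
            df.setTimeZone(TimeZone.getTimeZone("GMT"));
            // 2020-01-01T00:00 GMT is 1577836800 seconds after the epoch
            System.out.println(df.parse("202001010000").getTime()); // 1577836800000
        }
    }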
+ * @return Number of ms since epoch, or null if value was null + * @throws java.text.ParseException + */ + protected static Long decodeTime(String value) throws java.text.ParseException { + if (value == null) { + return null; + } else { + synchronized (df) { + return df.parse(value).getTime(); + } + } + } + + /** + * Either builds the query from the provided builder, or creates a + * MatchAllDocsQuery to use if the Builder was empty. + * + * @param luceneQuery BooleanQuery.Builder + * @return Lucene Query + */ + private Query maybeEmptyQuery(Builder luceneQuery) { + Query query = luceneQuery.build(); + if (query.toString().isEmpty()) { + query = new MatchAllDocsQuery(); + } + return query; + } + + /** + * Parses a date/time value from jsonObject. Can account for either a Long + * value, or a String value encoded in the format yyyyMMddHHmm. + * + * @param jsonObject JsonObject containing the date to be parsed. + * @param key Key of the date/time value in jsonObject. + * @param offset In the case of STRING ValueType, add offset ms before + * returning. This accounts for the fact the String format + * used is only precise to minutes and not seconds. + * @return null if jsonObject does not contain the key, number of ms since epoch + * otherwise. + * @throws LuceneException If the ValueType is not NUMBER or STRING, or if a + * STRING value cannot be parsed. + */ + private Long parseDate(JsonObject jsonObject, String key, int offset) throws LuceneException { + if (jsonObject.containsKey(key)) { + ValueType valueType = jsonObject.get(key).getValueType(); + switch (valueType) { + case STRING: + String dateString = jsonObject.getString(key); + try { + return decodeTime(dateString) + offset; + } catch (Exception e) { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Could not parse date " + dateString + " using expected format yyyyMMddHHmm"); + } + case NUMBER: + return jsonObject.getJsonNumber(key).longValueExact(); + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Dates should be represented by a NUMBER or STRING JsonValue, but got " + valueType); + } + } + return null; + } + + /** + * Parses dimensions to apply faceting to from the incoming Json. If ranges are + * specified, these are also parsed. + * + * @param jsonObject Json from incoming search request. 
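A hypothetical "dimensions" entry of the shape parseDimensions expects, with one labelled range on the long field "date" (the epoch values cover the year 2020):

    import javax.json.Json;
    import javax.json.JsonObject;

    class DimensionsSketch {
        static JsonObject dimensions() {
            return Json.createObjectBuilder()
                    .add("dimensions", Json.createArrayBuilder()
                            .add(Json.createObjectBuilder()
                                    .add("dimension", "date")
                                    .add("ranges", Json.createArrayBuilder()
                                            .add(Json.createObjectBuilder()
                                                    .add("from", 1577836800000L)
                                                    .add("to", 1609459200000L)
                                                    .add("key", "2020")))))
                    .build();
        }
    }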
+ * @throws LuceneException + */ + private void parseDimensions(JsonObject jsonObject) throws LuceneException { + if (jsonObject.containsKey("dimensions")) { + List<JsonObject> dimensionObjects = jsonObject.getJsonArray("dimensions").getValuesAs(JsonObject.class); + for (JsonObject dimensionObject : dimensionObjects) { + if (!dimensionObject.containsKey("dimension")) { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "'dimension' not specified for facet request " + dimensionObject.toString()); + } + String dimension = dimensionObject.getString("dimension"); + FacetedDimension facetDimensionRequest = new FacetedDimension(dimension); + if (dimensionObject.containsKey("ranges")) { + List<Range> ranges = facetDimensionRequest.getRanges(); + List<JsonObject> jsonRanges = dimensionObject.getJsonArray("ranges").getValuesAs(JsonObject.class); + if (DocumentMapping.longFields.contains(dimension)) { + for (JsonObject range : jsonRanges) { + Long lower = Long.MIN_VALUE; + Long upper = Long.MAX_VALUE; + if (range.containsKey("from")) { + lower = range.getJsonNumber("from").longValueExact(); + } + if (range.containsKey("to")) { + upper = range.getJsonNumber("to").longValueExact(); + } + String label = lower.toString() + "-" + upper.toString(); + if (range.containsKey("key")) { + label = range.getString("key"); + } + ranges.add(new LongRange(label, lower, true, upper, false)); + } + } else if (DocumentMapping.doubleFields.contains(dimension)) { + for (JsonObject range : jsonRanges) { + Double lower = Double.NEGATIVE_INFINITY; + Double upper = Double.POSITIVE_INFINITY; + if (range.containsKey("from")) { + lower = range.getJsonNumber("from").doubleValue(); + } + if (range.containsKey("to")) { + upper = range.getJsonNumber("to").doubleValue(); + } + String label = lower.toString() + "-" + upper.toString(); + if (range.containsKey("key")) { + label = range.getString("key"); + } + ranges.add(new DoubleRange(label, lower, true, upper, false)); + } + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "'ranges' specified for dimension " + dimension + + " but this is not a supported numeric field"); + } + } + dimensions.put(dimension, facetDimensionRequest); + } + } + } + + /** + * Parses the fields to return with the search results from Json. + * + * @param jsonObject The Json from the search request. + * @throws LuceneException If the parsing fails. + */ + public void parseFields(JsonObject jsonObject) throws LuceneException { + if (jsonObject.containsKey("fields")) { + List<JsonString> fieldStrings = jsonObject.getJsonArray("fields").getValuesAs(JsonString.class); + // logger.trace("Parsing fields from {}", fieldStrings); + for (JsonString jsonString : fieldStrings) { + String[] splitString = jsonString.getString().split(" "); + if (splitString.length == 1) { + // Fields without a space apply directly to the target entity + fields.add(splitString[0]); + } else if (splitString.length == 2) { + // Otherwise, the first element is the target of a join, with the second being a + // field on that joined entity. + if (joinedFields.containsKey(splitString[0])) { + joinedFields.get(splitString[0]).add(splitString[1]); + } else { + joinedFields.putIfAbsent(splitString[0], + new HashSet<>(Arrays.asList(splitString[1]))); + } + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Could not parse field: " + jsonString.getString()); + } + } + } + } + + /** + * Parses a query and associated information from an incoming request without + * any logic specific to a single index or entity.
As such it may not be as + powerful, but is sufficient for simple queries (like those for faceting). + * + * @param jsonQuery Incoming query request encoded as Json. + * @param luceneQuery Lucene BooleanQuery.Builder + * @throws LuceneException If the types of the JsonValues in the query do not + * match those supported by icat.lucene + */ + private void parseGenericQuery(JsonObject jsonQuery, BooleanQuery.Builder luceneQuery) throws LuceneException { + for (Entry<String, JsonValue> entry : jsonQuery.entrySet()) { + String field = entry.getKey(); + ValueType valueType = entry.getValue().getValueType(); + switch (valueType) { + case STRING: + JsonString stringValue = (JsonString) entry.getValue(); + String fld = DocumentMapping.facetFields.contains(field) ? field + ".keyword" : field; + luceneQuery.add(new TermQuery(new Term(fld, stringValue.getString())), Occur.MUST); + break; + case NUMBER: + JsonNumber numberValue = (JsonNumber) entry.getValue(); + if (DocumentMapping.longFields.contains(field)) { + luceneQuery.add(LongPoint.newExactQuery(field, numberValue.longValueExact()), Occur.FILTER); + } else if (DocumentMapping.doubleFields.contains(field)) { + luceneQuery.add(DoublePoint.newExactQuery(field, numberValue.doubleValue()), Occur.FILTER); + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Value had type NUMBER, but field " + field + + " is not a known longField or doubleField"); + } + break; + case ARRAY: + // Only support an ARRAY of STRING, as a list of ICAT ids is currently the only + // use case + JsonArray arrayValue = (JsonArray) entry.getValue(); + ArrayList<BytesRef> bytesArray = new ArrayList<>(); + String valueAsString; + for (JsonValue value : arrayValue) { + if (value.getValueType().equals(ValueType.STRING)) { + valueAsString = ((JsonString) value).getString(); + } else { + valueAsString = value.toString(); + } + bytesArray.add(new BytesRef(valueAsString)); + } + luceneQuery.add(new TermInSetQuery(field, bytesArray), Occur.MUST); + break; + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Query values should be ARRAY, STRING or NUMBER, but had value of type " + valueType); + } + } + query = maybeEmptyQuery(luceneQuery); + } + + /** + * Parses a query applying to a single parameter from incoming Json. + * + * @param p JsonValue (JsonObject) representing a query against a single + * parameter. + * @return BooleanQuery.Builder for a single parameter.
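The parameter clauses handled by parseParameter have this shape; a hypothetical numeric example (the name, units and bounds are invented):

    import javax.json.Json;
    import javax.json.JsonObject;

    class ParameterSketch {
        // "name" and "units" become wildcard terms, and exactly one value form
        // is expected: stringValue, lower/upperDateValue or lower/upperNumericValue.
        static JsonObject numericParameter() {
            return Json.createObjectBuilder()
                    .add("name", "temperature")
                    .add("units", "kelvin")
                    .add("lowerNumericValue", 4.2)
                    .add("upperNumericValue", 300.0)
                    .build();
        }
    }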
+ * @throws LuceneException + */ + private Builder parseParameter(JsonValue p) throws LuceneException { + JsonObject parameter = (JsonObject) p; + BooleanQuery.Builder paramQuery = new BooleanQuery.Builder(); + String pName = parameter.getString("name", null); + if (pName != null) { + paramQuery.add(new WildcardQuery(new Term("type.name.keyword", pName)), Occur.MUST); + } + + String pUnits = parameter.getString("units", null); + if (pUnits != null) { + paramQuery.add(new WildcardQuery(new Term("type.units", pUnits)), Occur.MUST); + } + if (parameter.containsKey("stringValue")) { + String pStringValue = parameter.getString("stringValue", null); + paramQuery.add(new WildcardQuery(new Term("stringValue", pStringValue)), Occur.MUST); + } else if (parameter.containsKey("lowerDateValue") && parameter.containsKey("upperDateValue")) { + buildDateRanges(paramQuery, parameter, "lowerDateValue", "upperDateValue", "dateTimeValue"); + } else if (parameter.containsKey("lowerNumericValue") && parameter.containsKey("upperNumericValue")) { + Double pLowerNumericValue = parameter.getJsonNumber("lowerNumericValue").doubleValue(); + Double pUpperNumericValue = parameter.getJsonNumber("upperNumericValue").doubleValue(); + paramQuery.add(DoublePoint.newRangeQuery("numericValue", pLowerNumericValue, pUpperNumericValue), + Occur.MUST); + } + return paramQuery; + } + + /** + * Parses a Lucene FieldDoc to be "searched after" from a String representation + * of a JSON object, and stores it on this SearchBucket. Does nothing if + * searchAfter is null or an empty String. + * + * @param searchAfter String representation of a JSON object containing the + * "shardIndex" and Lucene "doc" id, and optionally the + * "score" and sort "fields" of the last document from a + * previous search. + * @throws LuceneException If an entry in the fields array is not a STRING or + * NUMBER + */ + private void parseSearchAfter(String searchAfter) throws LuceneException { + if (searchAfter == null || searchAfter.equals("")) { + return; + } + SortField[] sortFields = sort.getSort(); + JsonReader reader = Json.createReader(new StringReader(searchAfter)); + JsonObject object = reader.readObject(); + // shardIndex and Lucene doc Id are always needed to determine tie breaks, even + // if the field sort resulted in no ties in the first place + int shardIndex = object.getInt("shardIndex"); + int doc = object.getInt("doc"); + float score = Float.NaN; + List<Object> fields = new ArrayList<>(); + if (object.containsKey("score")) { + score = object.getJsonNumber("score").bigDecimalValue().floatValue(); + } + if (object.containsKey("fields")) { + JsonArray jsonArray = object.getJsonArray("fields"); + if (jsonArray.size() != sortFields.length) { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "fields should have the same length as sort, but they were " + + jsonArray.size() + " and " + sortFields.length); + } + for (int i = 0; i < sortFields.length; i++) { + JsonValue value = jsonArray.get(i); + switch (value.getValueType()) { + case NUMBER: + JsonNumber number = ((JsonNumber) value); + switch (sortFields[i].getType()) { + case FLOAT: + case DOUBLE: + case SCORE: + fields.add(number.bigDecimalValue().floatValue()); + break; + case INT: + case LONG: + case DOC: + case CUSTOM: + fields.add(number.longValueExact()); + break; + default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "fields contained a NUMBER but the corresponding field was " + + sortFields[i]); + } + break; + case STRING: + fields.add(new BytesRef(((JsonString) value).getString())); + break; +
default: + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "fields should be an array of STRING and NUMBER, but had entry of type " + + value.getValueType()); + } + } + } + this.searchAfter = new FieldDoc(doc, score, fields.toArray(), shardIndex); + } + + /** + * Parses the String from the request into a Lucene Sort object, which is + * stored on this SearchBucket. Multiple sort criteria are supported, and will + * be applied in order. + * + * @param sortString String representation of a JSON object with the field(s) + * to sort as keys, and the direction ("asc" or "desc") as + * value(s). + * @throws LuceneException If the value for any key isn't "asc" or "desc" + */ + public void parseSort(String sortString) throws LuceneException { + if (sortString == null || sortString.equals("") || sortString.equals("{}")) { + scored = true; + sort = new Sort(SortField.FIELD_SCORE, new SortedNumericSortField("id.long", Type.LONG)); + return; + } + try (JsonReader reader = Json.createReader(new ByteArrayInputStream(sortString.getBytes()))) { + JsonObject object = reader.readObject(); + List<SortField> fields = new ArrayList<>(); + for (String key : object.keySet()) { + String order = object.getString(key); + Boolean reverse; + if (order.equals("asc")) { + reverse = false; + } else if (order.equals("desc")) { + reverse = true; + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "Sort order must be 'asc' or 'desc' but it was '" + order + "'"); + } + + if (DocumentMapping.longFields.contains(key)) { + fields.add(new SortedNumericSortField(key, Type.LONG, reverse)); + } else if (DocumentMapping.doubleFields.contains(key)) { + fields.add(new SortedNumericSortField(key, Type.DOUBLE, reverse)); + } else { + fields.add(new SortField(key, Type.STRING, reverse)); + } + } + fields.add(new SortedNumericSortField("id.long", Type.LONG)); + scored = false; + sort = new Sort(fields.toArray(new SortField[0])); + } + } +} diff --git a/src/main/resources/run.properties b/src/main/resources/run.properties index 25babbd..99fcae0 100644 --- a/src/main/resources/run.properties +++ b/src/main/resources/run.properties @@ -5,4 +5,4 @@ directory = ${HOME}/data/lucene commitSeconds = 5 maxShardSize = 2147483648 ip = 127.0.0.1/32 -units = \u2103: celsius degC, K: kelvin +units = J: eV 1.602176634e-19; \u2103: celsius, degC; K: kelvin From 49132308f685ed8880fe896c55ffbc8fc0387a0b Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 22 Jun 2022 02:45:00 +0100 Subject: [PATCH 47/73] Support for searching on sample name #19 --- .../icatproject/lucene/DocumentMapping.java | 77 ++++++++++++------- .../java/org/icatproject/lucene/Lucene.java | 13 ++-- .../org/icatproject/lucene/SearchBucket.java | 24 ++---- 3 files changed, 63 insertions(+), 51 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/DocumentMapping.java b/src/main/java/org/icatproject/lucene/DocumentMapping.java index 27aa532..469a7f8 100644 --- a/src/main/java/org/icatproject/lucene/DocumentMapping.java +++ b/src/main/java/org/icatproject/lucene/DocumentMapping.java @@ -15,15 +15,18 @@ public class DocumentMapping { */ public static class ParentRelationship { public String parentName; - public String fieldPrefix; + public String joiningField; + public Set<String> fields; + /** - * @param parentName Name of the parent entity. - * @param fieldPrefix How nested fields should be prefixed. + * @param parentName Name of the parent entity. + * @param joiningField Field that joins the child to its parent.
+ * @param fields Fields that should be updated by this relationship. */ - public ParentRelationship(String parentName, String fieldPrefix) { + public ParentRelationship(String parentName, String joiningField, String... fields) { this.parentName = parentName; - this.fieldPrefix = fieldPrefix; + this.joiningField = joiningField; + this.fields = new HashSet<>(Arrays.asList(fields)); } } @@ -43,12 +46,14 @@ public ParentRelationship(String parentName, String fieldPrefix) { public static final StandardQueryParser investigationParser = new StandardQueryParser(); public static final StandardQueryParser sampleParser = new StandardQueryParser(); - static { + static { doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI")); facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name", "stringValue")); - longFields.addAll(Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate")); - sortFields.addAll(Arrays.asList("datafile.id", "dataset.id", "investigation.id", "instrument.id", "id", "date", - "startDate", "endDate", "name", "stringValue", "dateTimeValue", "numericValue", "numericValueSI")); + longFields.addAll( + Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate", "fileSize")); + sortFields.addAll(Arrays.asList("datafile.id", "dataset.id", "investigation.id", "instrument.id", "id", + "sample.investigation.id", "date", "name", "stringValue", "dateTimeValue", "numericValue", + "numericValueSI", "fileSize")); textFields.addAll(Arrays.asList("name", "visitId", "description", "location", "dataset.name", "investigation.name", "instrument.name", "instrument.fullName", "datafileFormat.name", "sample.name", "sample.type.name", "title", "summary", "facility.name", "user.fullName", "type.name")); @@ -58,35 +63,51 @@ public ParentRelationship(String parentName, String fieldPrefix) { "InvestigationUser", "Sample")); relationships.put("Instrument", - new ParentRelationship[] { new ParentRelationship("InvestigationInstrument", "instrument") }); - relationships.put("User", new ParentRelationship[] { new ParentRelationship("InvestigationUser", "user"), - new ParentRelationship("InstrumentScientist", "user") }); - relationships.put("Sample", new ParentRelationship[] { new ParentRelationship("Dataset", "sample") }); - relationships.put("SampleType", new ParentRelationship[] { new ParentRelationship("Sample", "type"), - new ParentRelationship("Dataset", "sample.type") }); + new ParentRelationship[] { new ParentRelationship("InvestigationInstrument", "instrument.id", + "instrument.name", "instrument.fullName") }); + relationships.put("User", + new ParentRelationship[] { + new ParentRelationship("InvestigationUser", "user.id", "user.name", "user.fullName"), + new ParentRelationship("InstrumentScientist", "user.id", "user.name", "user.fullName") }); + relationships.put("Sample", new ParentRelationship[] { + new ParentRelationship("Dataset", "sample.id", "sample.name", "sample.investigation.id"), + new ParentRelationship("Datafile", "sample.id", "sample.name", "sample.investigation.id") }); + relationships.put("SampleType", + new ParentRelationship[] { new ParentRelationship("Sample", "type.id", "type.name"), + new ParentRelationship("Dataset", "sample.type.id", "sample.type.name"), + new ParentRelationship("Datafile", "sample.type.id", "sample.type.name") }); relationships.put("InvestigationType", - new ParentRelationship[] { new ParentRelationship("Investigation", "type") }); - relationships.put("DatasetType", new 
ParentRelationship[] { new ParentRelationship("Dataset", "type") }); + new ParentRelationship[] { new ParentRelationship("Investigation", "type.id", "type.name") }); + relationships.put("DatasetType", + new ParentRelationship[] { new ParentRelationship("Dataset", "type.id", "type.name") }); relationships.put("DatafileFormat", - new ParentRelationship[] { new ParentRelationship("Datafile", "datafileFormat") }); - relationships.put("Facility", new ParentRelationship[] { new ParentRelationship("Investigation", "facility") }); + new ParentRelationship[] { + new ParentRelationship("Datafile", "datafileFormat.id", "datafileFormat.name") }); + relationships.put("Facility", + new ParentRelationship[] { new ParentRelationship("Investigation", "facility.id", "facility.name") }); relationships.put("ParameterType", - new ParentRelationship[] { new ParentRelationship("DatafileParameter", "type"), - new ParentRelationship("DatasetParameter", "type"), - new ParentRelationship("InvestigationParameter", "type") }); + new ParentRelationship[] { new ParentRelationship("DatafileParameter", "type.id", "type.name"), + new ParentRelationship("DatasetParameter", "type.id", "type.name"), + new ParentRelationship("InvestigationParameter", "type.id", "type.name") }); relationships.put("Investigation", - new ParentRelationship[] { new ParentRelationship("Dataset", "investigation"), - new ParentRelationship("datafile", "investigation") }); + new ParentRelationship[] { + new ParentRelationship("Dataset", "investigation.id", "investigation.name", + "investigation.title", "investigation.startDate", "visitId"), + new ParentRelationship("datafile", "investigation.id", "investigation.name", "visitId") }); + relationships.put("Dataset", + new ParentRelationship[] { new ParentRelationship("Datafile", "dataset.id", "dataset.name") }); genericParser.setAllowLeadingWildcard(true); genericParser.setAnalyzer(analyzer); - CharSequence[] datafileFields = { "name", "description", "location", "datafileFormat.name" }; + CharSequence[] datafileFields = { "name", "description", "location", "datafileFormat.name", "visitId", + "sample.name", "sample.type.name" }; datafileParser.setAllowLeadingWildcard(true); datafileParser.setAnalyzer(analyzer); datafileParser.setMultiFields(datafileFields); - CharSequence[] datasetFields = { "name", "description", "sample.name", "sample.type.name", "type.name" }; + CharSequence[] datasetFields = { "name", "description", "sample.name", "sample.type.name", "type.name", + "visitId" }; datasetParser.setAllowLeadingWildcard(true); datasetParser.setAnalyzer(analyzer); datasetParser.setMultiFields(datasetFields); @@ -97,9 +118,9 @@ public ParentRelationship(String parentName, String fieldPrefix) { investigationParser.setAnalyzer(analyzer); investigationParser.setMultiFields(investigationFields); - CharSequence[] sampleFields = { "name", "type.name" }; + CharSequence[] sampleFields = { "sample.name", "sample.type.name" }; sampleParser.setAllowLeadingWildcard(true); sampleParser.setAnalyzer(analyzer); sampleParser.setMultiFields(sampleFields); - } + } } diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 282d413..bab4747 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -1279,7 +1279,6 @@ private void addSortField(JsonObject json, Document document, String key) { document.add(new NumericDocValuesField("id.long", value)); document.add(new StoredField("id.long", value)); } - // 
TODO add special case for startDate -> date to make sorting easier? if (DocumentMapping.longFields.contains(key)) { document.add(new NumericDocValuesField(key, json.getJsonNumber(key).longValueExact())); } else if (DocumentMapping.doubleFields.contains(key)) { @@ -1345,17 +1344,17 @@ private Document updateDocument(JsonObject json, Document oldDocument) { /** * Returns a new Lucene Document that has the same fields as were present in - * oldDocument, except in cases where the field name starts with fieldPrefix. + * oldDocument, except those provided as an argument to prune. * - * @param fieldPrefix Any fields with a name starting with this String will not + * @param fields These fields will not * be present in the returned Document. * @param oldDocument Lucene Document to be pruned. * @return Lucene Document with pruned fields. */ - private Document pruneDocument(String fieldPrefix, Document oldDocument) { + private Document pruneDocument(Set<String> fields, Document oldDocument) { Document newDocument = new Document(); for (IndexableField field : oldDocument.getFields()) { - if (!field.name().startsWith(fieldPrefix)) { + if (!fields.contains(field.name())) { addSortField(field, newDocument); newDocument.add(field); } @@ -1446,7 +1445,7 @@ private void updateByRelation(JsonObject operationBody, Boolean delete) IndexSearcher searcher = getSearcher(new HashMap<>(), parentRelationship.parentName); int blockSize = 10000; - TermQuery query = new TermQuery(new Term(parentRelationship.fieldPrefix + ".id", childId)); + TermQuery query = new TermQuery(new Term(parentRelationship.joiningField, childId)); Sort sort = new Sort(new SortField("id", Type.STRING)); ScoreDoc[] scoreDocs = searcher.search(query, blockSize, sort).scoreDocs; while (scoreDocs.length != 0) { @@ -1454,7 +1453,7 @@ for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document oldDocument = searcher.doc(scoreDoc.doc); String parentId = oldDocument.get("id"); - Document newDocument = delete ?
pruneDocument(parentRelationship.fields, oldDocument) : updateDocument(operationBody.getJsonObject("doc"), oldDocument); logger.trace("updateByRelation: {}", newDocument); bucket.updateDocument(new Term("id", parentId), facetsConfig.build(newDocument)); diff --git a/src/main/java/org/icatproject/lucene/SearchBucket.java b/src/main/java/org/icatproject/lucene/SearchBucket.java index 21dd667..35afa91 100644 --- a/src/main/java/org/icatproject/lucene/SearchBucket.java +++ b/src/main/java/org/icatproject/lucene/SearchBucket.java @@ -184,7 +184,14 @@ public SearchBucket(Lucene lucene, SearchType searchType, HttpServletRequest req text = jsonQuery.getString("text", null); if (text != null) { - luceneQuery.add(DocumentMapping.investigationParser.parse(text, null), Occur.MUST); + Builder textBuilder = new BooleanQuery.Builder(); + textBuilder.add(DocumentMapping.investigationParser.parse(text, null), Occur.SHOULD); + + IndexSearcher sampleSearcher = lucene.getSearcher(searcherMap, "Sample"); + Query joinedSampleQuery = JoinUtil.createJoinQuery("sample.investigation.id", false, "id", + DocumentMapping.sampleParser.parse(text, null), sampleSearcher, ScoreMode.Avg); + textBuilder.add(joinedSampleQuery, Occur.SHOULD); + luceneQuery.add(textBuilder.build(), Occur.MUST); } buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "startDate", "endDate"); @@ -201,21 +208,6 @@ public SearchBucket(Lucene lucene, SearchType searchType, HttpServletRequest req } } - if (jsonQuery.containsKey("samples")) { - JsonArray samples = jsonQuery.getJsonArray("samples"); - IndexSearcher sampleSearcher = lucene.getSearcher(searcherMap, "Sample"); - - for (JsonValue s : samples) { - JsonString sample = (JsonString) s; - BooleanQuery.Builder sampleQuery = new BooleanQuery.Builder(); - sampleQuery.add(DocumentMapping.sampleParser.parse(sample.getString(), null), Occur.MUST); - Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", - sampleQuery.build(), - sampleSearcher, ScoreMode.None); - luceneQuery.add(toQuery, Occur.MUST); - } - } - String userFullName = jsonQuery.getString("userFullName", null); if (userFullName != null) { BooleanQuery.Builder userFullNameQuery = new BooleanQuery.Builder(); From 338dda3a41f8630fdb7804646f4a709de84c55a6 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Fri, 22 Jul 2022 13:17:57 +0100 Subject: [PATCH 48/73] SampleParameter, fileCount, value in range #19 --- .../icatproject/lucene/DocumentMapping.java | 31 +-- .../java/org/icatproject/lucene/Lucene.java | 234 +++++++++++++++--- .../org/icatproject/lucene/SearchBucket.java | 78 +++++- 3 files changed, 293 insertions(+), 50 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/DocumentMapping.java b/src/main/java/org/icatproject/lucene/DocumentMapping.java index 469a7f8..75500d4 100644 --- a/src/main/java/org/icatproject/lucene/DocumentMapping.java +++ b/src/main/java/org/icatproject/lucene/DocumentMapping.java @@ -19,16 +19,15 @@ public static class ParentRelationship { public Set fields; /** - * @param parentName Name of the parent entity. + * @param parentName Name of the parent entity. * @param joiningField Field that joins the child to its parent. - * @param fields Fields that should be updated by this relationship. + * @param fields Fields that should be updated by this relationship. */ public ParentRelationship(String parentName, String joiningField, String... 
fields) { this.parentName = parentName; this.joiningField = joiningField; this.fields = new HashSet<>(Arrays.asList(fields)); } - } public static final Set doubleFields = new HashSet<>(); @@ -47,20 +46,23 @@ public ParentRelationship(String parentName, String joiningField, String... fiel public static final StandardQueryParser sampleParser = new StandardQueryParser(); static { - doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI")); + doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI", "rangeTop", "rangeTopSI", "rangeBottom", + "rangeBottomSI")); facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name", "stringValue")); longFields.addAll( - Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate", "fileSize")); - sortFields.addAll(Arrays.asList("datafile.id", "dataset.id", "investigation.id", "instrument.id", "id", - "sample.investigation.id", "date", "name", "stringValue", "dateTimeValue", "numericValue", - "numericValueSI", "fileSize")); + Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate", "fileSize", + "fileCount")); + sortFields.addAll( + Arrays.asList("datafile.id", "dataset.id", "investigation.id", "instrument.id", "id", "sample.id", + "sample.investigation.id", "date", "name", "stringValue", "dateTimeValue", "numericValue", + "numericValueSI", "fileSize", "fileCount")); textFields.addAll(Arrays.asList("name", "visitId", "description", "location", "dataset.name", "investigation.name", "instrument.name", "instrument.fullName", "datafileFormat.name", "sample.name", - "sample.type.name", "title", "summary", "facility.name", "user.fullName", "type.name")); + "sample.type.name", "title", "summary", "facility.name", "user.fullName", "type.name", "doi")); indexedEntities.addAll(Arrays.asList("Datafile", "Dataset", "Investigation", "DatafileParameter", "DatasetParameter", "InstrumentScientist", "InvestigationInstrument", "InvestigationParameter", - "InvestigationUser", "Sample")); + "InvestigationUser", "Sample", "SampleParameter")); relationships.put("Instrument", new ParentRelationship[] { new ParentRelationship("InvestigationInstrument", "instrument.id", @@ -88,7 +90,8 @@ public ParentRelationship(String parentName, String joiningField, String... fiel relationships.put("ParameterType", new ParentRelationship[] { new ParentRelationship("DatafileParameter", "type.id", "type.name"), new ParentRelationship("DatasetParameter", "type.id", "type.name"), - new ParentRelationship("InvestigationParameter", "type.id", "type.name") }); + new ParentRelationship("InvestigationParameter", "type.id", "type.name"), + new ParentRelationship("SampleParameter", "type.id", "type.name") }); relationships.put("Investigation", new ParentRelationship[] { new ParentRelationship("Dataset", "investigation.id", "investigation.name", @@ -101,19 +104,19 @@ public ParentRelationship(String parentName, String joiningField, String... 
fiel genericParser.setAnalyzer(analyzer); CharSequence[] datafileFields = { "name", "description", "location", "datafileFormat.name", "visitId", - "sample.name", "sample.type.name" }; + "sample.name", "sample.type.name", "doi" }; datafileParser.setAllowLeadingWildcard(true); datafileParser.setAnalyzer(analyzer); datafileParser.setMultiFields(datafileFields); CharSequence[] datasetFields = { "name", "description", "sample.name", "sample.type.name", "type.name", - "visitId" }; + "visitId", "doi" }; datasetParser.setAllowLeadingWildcard(true); datasetParser.setAnalyzer(analyzer); datasetParser.setMultiFields(datasetFields); CharSequence[] investigationFields = { "name", "visitId", "title", "summary", "facility.name", - "type.name" }; + "type.name", "doi" }; investigationParser.setAllowLeadingWildcard(true); investigationParser.setAnalyzer(analyzer); investigationParser.setMultiFields(investigationFields); diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index bab4747..b88d775 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -28,6 +28,7 @@ import javax.json.Json; import javax.json.JsonArray; import javax.json.JsonException; +import javax.json.JsonNumber; import javax.json.JsonObject; import javax.json.JsonObjectBuilder; import javax.json.JsonReader; @@ -478,7 +479,8 @@ private void create(JsonObject operationBody) throws NumberFormatException, IOEx updateByRelation(operationBody, false); } if (DocumentMapping.indexedEntities.contains(entityName)) { - Document document = parseDocument(operationBody.getJsonObject("doc")); + JsonObject documentObject = operationBody.getJsonObject("doc"); + Document document = parseDocument(documentObject); logger.trace("create {} {}", entityName, document.toString()); IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); if (bucket.locked.get()) { @@ -486,6 +488,77 @@ private void create(JsonObject operationBody) throws NumberFormatException, IOEx "Lucene locked for " + entityName); } bucket.addDocument(facetsConfig.build(document)); + // Special case for filesizes + if (entityName.equals("Datafile")) { + JsonNumber jsonFileSize = documentObject.getJsonNumber("fileSize"); + if (jsonFileSize != null) { + String datasetId = documentObject.getString("dataset.id", null); + String investigationId = documentObject.getString("investigation.id", null); + logger.trace("Aggregating {} to {}, {}", jsonFileSize.longValue(), datasetId, investigationId); + aggregateFileSize(jsonFileSize.longValueExact(), 0, 1, datasetId, "dataset"); + aggregateFileSize(jsonFileSize.longValueExact(), 0, 1, investigationId, "investigation"); + } + } + } + } + + /** + * Changes the fileSize on an entity by the specified amount. This is used to + * aggregate the individual fileSize of Datafiles up to Dataset and + * Investigation sizes. + * + * @param sizeToAdd Increases the fileSize of the entity by this much. + * Should be 0 for deletes. + * @param sizeToSubtract Decreases the fileSize of the entity by this much. + * Should be 0 for creates. + * @param deltaFileCount Changes the file count by this much. + * @param entityId Icat id of entity to update. + * @param index Index (entity) to update. 
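A hedged illustration of the delta convention described above (the entity id "123" is invented); these mirror the calls made from create, delete and update elsewhere in this patch:

    // create: a new 100 byte Datafile adds its size, and one to the count
    aggregateFileSize(100, 0, 1, "123", "dataset");
    // delete: removing the same Datafile reverses both
    aggregateFileSize(0, 100, -1, "123", "dataset");
    // update: resizing it from 100 to 150 bytes changes the size only
    aggregateFileSize(150, 100, 0, "123", "dataset");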
+ * @throws IOException + */ + private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFileCount, String entityId, String index) + throws IOException { + long deltaFileSize = sizeToAdd - sizeToSubtract; + if (entityId != null && (deltaFileSize != 0 || deltaFileCount != 0)) { + IndexBucket indexBucket = indexBuckets.computeIfAbsent(index, k -> new IndexBucket(k)); + for (ShardBucket shardBucket : indexBucket.shardList) { + shardBucket.commit(); + IndexSearcher searcher = shardBucket.searcherManager.acquire(); + Term idTerm = new Term("id", entityId); + TopDocs topDocs = searcher.search(new TermQuery(idTerm), 1); + if (topDocs.totalHits.value == 1) { + int docId = topDocs.scoreDocs[0].doc; + Document document = searcher.doc(docId); + shardBucket.searcherManager.release(searcher); + Set<String> prunedFields = new HashSet<>(); + List<IndexableField> fieldsToAdd = new ArrayList<>(); + + if (deltaFileSize != 0) { + prunedFields.add("fileSize"); + long oldSize = document.getField("fileSize").numericValue().longValue(); + long newSize = oldSize == -1 ? deltaFileSize : oldSize + deltaFileSize; + fieldsToAdd.add(new LongPoint("fileSize", newSize)); + fieldsToAdd.add(new StoredField("fileSize", newSize)); + fieldsToAdd.add(new NumericDocValuesField("fileSize", newSize)); + } + + if (deltaFileCount != 0) { + prunedFields.add("fileCount"); + long oldCount = document.getField("fileCount").numericValue().longValue(); + long newCount = oldCount + deltaFileCount; + fieldsToAdd.add(new LongPoint("fileCount", newCount)); + fieldsToAdd.add(new StoredField("fileCount", newCount)); + fieldsToAdd.add(new NumericDocValuesField("fileCount", newCount)); + } + + Document newDocument = pruneDocument(prunedFields, document); + fieldsToAdd.forEach(field -> newDocument.add(field)); + shardBucket.indexWriter.updateDocument(idTerm, facetsConfig.build(newDocument)); + shardBucket.commit(); + break; + } + shardBucket.searcherManager.release(searcher); + } } } @@ -597,8 +670,31 @@ private void delete(JsonObject operationBody) throws LuceneException, IOExceptio "Lucene locked for " + entityName); } logger.trace("delete {} {}", entityName, icatId); + // Special case for filesizes + Term term = new Term("id", icatId); + if (entityName.equals("Datafile")) { + long sizeToSubtract = 0; + for (ShardBucket shardBucket : bucket.shardList) { + IndexSearcher datafileSearcher = shardBucket.searcherManager.acquire(); + TopDocs topDocs = datafileSearcher.search(new TermQuery(term), 1); + if (topDocs.totalHits.value == 1) { + int docId = topDocs.scoreDocs[0].doc; + Document datasetDocument = datafileSearcher.doc(docId); + sizeToSubtract = datasetDocument.getField("fileSize").numericValue().longValue(); + if (sizeToSubtract > 0) { + String datasetId = datasetDocument.getField("dataset.id").stringValue(); + String investigationId = datasetDocument.getField("investigation.id").stringValue(); + aggregateFileSize(0, sizeToSubtract, -1, datasetId, "dataset"); + aggregateFileSize(0, sizeToSubtract, -1, investigationId, "investigation"); + } + shardBucket.searcherManager.release(datafileSearcher); + break; + } + shardBucket.searcherManager.release(datafileSearcher); + } + } for (ShardBucket shardBucket : bucket.shardList) { - shardBucket.indexWriter.deleteDocuments(new Term("id", icatId)); + shardBucket.indexWriter.deleteDocuments(term); } } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -1114,10 +1210,18 @@ private String luceneSearchResult(String name, SearchBucket search, String searc :
sortField.getType(); switch (type) { case LONG: - gen.write(indexableField.numericValue().longValue()); + if (indexableField.numericValue() != null) { + gen.write(indexableField.numericValue().longValue()); + } else if (indexableField.stringValue() != null) { + gen.write(Long.parseLong(indexableField.stringValue())); + } break; case DOUBLE: - gen.write(indexableField.numericValue().doubleValue()); + if (indexableField.numericValue() != null) { + gen.write(indexableField.numericValue().doubleValue()); + } else if (indexableField.stringValue() != null) { + gen.write(Double.parseDouble(indexableField.stringValue())); + } break; case STRING: gen.write(indexableField.stringValue()); @@ -1237,28 +1341,75 @@ private void addField(JsonObject json, Document document, String key) { // Whenever the units are set or changed, convert to SI if (key.equals("type.units")) { String unitString = json.getString("type.units"); - IndexableField field = document.getField("numericValue"); - double value; - if (field != null) { - value = NumericUtils.sortableLongToDouble(field.numericValue().longValue()); - } else if (json.containsKey("numericValue")) { - value = json.getJsonNumber(key).doubleValue(); - } else { - // Strings and date/time values also have units, so if we aren't dealing with a - // number don't convert - return; - } - logger.trace("Attempting to convert {} {}", value, unitString); - SystemValue systemValue = icatUnits.new SystemValue(value, unitString); - if (systemValue.units != null) { - document.add(new StringField("type.unitsSI", systemValue.units, Store.YES)); - } - if (systemValue.value != null) { - document.add(new DoublePoint("numericValueSI", systemValue.value)); - document.add(new StoredField("numericValueSI", systemValue.value)); - long sortableLong = NumericUtils.doubleToSortableLong(systemValue.value); - document.add(new NumericDocValuesField("numericValueSI", sortableLong)); - } + convertValue(document, json, unitString, "numericValue"); + convertValue(document, json, unitString, "rangeTop"); + convertValue(document, json, unitString, "rangeBottom"); + } + } + + /** + * Attempts to convert numericFieldName from json into SI units using its + * recorded unitString, and then adds the converted value to the Lucene + * document. + * + * @param document Lucene Document to add the field to. + * @param json JsonObject containing the field/value pairs to be added. + * @param unitString Units of the value to be converted. + * @param numericFieldName Name (key) of the field to convert and add.
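As a worked example of the conversion, using the eV definition added to run.properties earlier in this series ("J: eV 1.602176634e-19") and assuming IcatUnits resolves it as configured: a numericValue of 1000 with units "eV" gains numericValueSI = 1000 * 1.602176634e-19 = 1.602176634e-16 and type.unitsSI = "J". Sketched with the SystemValue API used below:

    // Inner-class instantiation as used elsewhere in this file; value and units
    // are null if the unit string could not be parsed.
    SystemValue si = icatUnits.new SystemValue(1000.0, "eV");
    // si.units == "J", si.value == 1.602176634e-16 (if "eV" was recognised)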
+ */ + private void convertValue(Document document, JsonObject json, String unitString, String numericFieldName) { + IndexableField field = document.getField(numericFieldName); + double value; + if (field != null) { + value = NumericUtils.sortableLongToDouble(field.numericValue().longValue()); + } else if (json.containsKey(numericFieldName)) { + value = json.getJsonNumber(numericFieldName).doubleValue(); + } else { + // If we aren't dealing with the desired numeric field don't convert + return; + } + logger.trace("Attempting to convert {} {}", value, unitString); + SystemValue systemValue = icatUnits.new SystemValue(value, unitString); + if (systemValue.units != null) { + document.add(new StringField("type.unitsSI", systemValue.units, Store.YES)); + } + if (systemValue.value != null) { + document.add(new DoublePoint(numericFieldName + "SI", systemValue.value)); + document.add(new StoredField(numericFieldName + "SI", systemValue.value)); + long sortableLong = NumericUtils.doubleToSortableLong(systemValue.value); + document.add(new NumericDocValuesField(numericFieldName + "SI", sortableLong)); + } + } + + /** + * Adds field to document taking its typing, sorting and faceting into account. + * + * @param field Lucene IndexableField to add to the document. + * @param document Lucene Document to add the field to. + */ + private void addField(IndexableField field, Document document) { + // SortedDocValuesField need to be indexed in addition to indexing a Field for + // searching/storing, so deal with that first + addSortField(field, document); + String key = field.name(); + + // Likewise, faceted fields should be considered separately + if (DocumentMapping.facetFields.contains(key)) { + String value = field.stringValue(); + document.add(new SortedSetDocValuesFacetField(key + ".keyword", value)); + document.add(new StringField(key + ".keyword", value, Store.NO)); + } + + if (DocumentMapping.doubleFields.contains(key)) { + Double value = field.numericValue().doubleValue(); + document.add(new DoublePoint(key, value)); + document.add(new StoredField(key, value)); + } else if (DocumentMapping.longFields.contains(key)) { + Long value = field.numericValue().longValue(); + document.add(new LongPoint(key, value)); + document.add(new StoredField(key, value)); + } else if (DocumentMapping.textFields.contains(key)) { + document.add(new TextField(key, field.stringValue(), Store.YES)); + } else { + document.add(new StringField(key, field.stringValue(), Store.YES)); } } @@ -1346,7 +1497,7 @@ private Document updateDocument(JsonObject json, Document oldDocument) { * Returns a new Lucene Document that has the same fields as were present in * oldDocument, except those provided as an argument to prune. * - * @param fields These fields will not + * @param fields These fields will not * be present in the returned Document. * @param oldDocument Lucene Document to be pruned. * @return Lucene Document with pruned fields. 
@@ -1355,8 +1506,7 @@ private Document pruneDocument(Set fields, Document oldDocument) { Document newDocument = new Document(); for (IndexableField field : oldDocument.getFields()) { if (!fields.contains(field.name())) { - addSortField(field, newDocument); - newDocument.add(field); + addField(field, newDocument); } } return newDocument; @@ -1405,12 +1555,36 @@ private void update(JsonObject operationBody) throws LuceneException, NumberForm } if (DocumentMapping.indexedEntities.contains(entityName)) { String icatId = operationBody.getString("_id"); - Document document = parseDocument(operationBody.getJsonObject("doc")); + JsonObject documentObject = operationBody.getJsonObject("doc"); + Document document = parseDocument(documentObject); IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); if (bucket.locked.get()) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene locked for " + entityName); } + // Special case for filesizes + if (entityName.equals("Datafile")) { + JsonNumber jsonFileSize = documentObject.getJsonNumber("fileSize"); + if (jsonFileSize != null) { + long sizeToSubtract = 0; + List datafileSearchers = bucket.acquireSearchers(); + for (IndexSearcher datafileSearcher : datafileSearchers) { + TopDocs topDocs = datafileSearcher.search(new TermQuery(new Term("id", icatId)), 1); + if (topDocs.totalHits.value == 1) { + int docId = topDocs.scoreDocs[0].doc; + Document datasetDocument = datafileSearcher.doc(docId); + sizeToSubtract = datasetDocument.getField("fileSize").numericValue().longValue(); + if (jsonFileSize.longValueExact() != sizeToSubtract) { + String datasetId = documentObject.getString("dataset.id", null); + String investigationId = documentObject.getString("investigation.id", null); + aggregateFileSize(jsonFileSize.longValueExact(), sizeToSubtract, 0, datasetId, "dataset"); + aggregateFileSize(jsonFileSize.longValueExact(), sizeToSubtract, 0, investigationId, "investigation"); + } + break; + } + } + } + } logger.trace("update: {}", document); bucket.updateDocument(new Term("id", icatId), facetsConfig.build(document)); } diff --git a/src/main/java/org/icatproject/lucene/SearchBucket.java b/src/main/java/org/icatproject/lucene/SearchBucket.java index 35afa91..57c6fa4 100644 --- a/src/main/java/org/icatproject/lucene/SearchBucket.java +++ b/src/main/java/org/icatproject/lucene/SearchBucket.java @@ -108,7 +108,7 @@ public SearchBucket(Lucene lucene, SearchType searchType, HttpServletRequest req try (JsonReader r = Json.createReader(request.getInputStream())) { JsonObject o = r.readObject(); parseFields(o); - parseDimensions(o); // Don't need for DF + parseDimensions(o); JsonObject jsonQuery = o.getJsonObject("query"); BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); String userName; @@ -300,8 +300,14 @@ private void buildFilterQueries(String target, JsonObject requestedQuery, Builde // BUT only if we haven't already nested the queries (as we do when the key was // just a nested entity) IndexSearcher nestedSearcher = lucene.getSearcher(searcherMap, filterTarget); - Query nestedQuery = JoinUtil.createJoinQuery(target + ".id", false, "id", dimensionQuery, - nestedSearcher, ScoreMode.None); + Query nestedQuery; + if (filterTarget.equals("sample") && !target.equals("investigation")) { + nestedQuery = JoinUtil.createJoinQuery("sample.id", false, "sample.id", dimensionQuery, + nestedSearcher, ScoreMode.None); + } else { + nestedQuery = JoinUtil.createJoinQuery(target + ".id", false, "id", 
dimensionQuery, + nestedSearcher, ScoreMode.None); + } queryBuilder.add(nestedQuery, Occur.FILTER); } else { // Otherwise, just add as SHOULD to the main query directly @@ -343,12 +349,26 @@ private Query parseFilter(String target, String fld, JsonValue value) throws IOE if (nestedFilter.containsKey("value")) { TermQuery query = new TermQuery(new Term(nestedField + ".keyword", nestedFilter.getString("value"))); nestedBoolBuilder.add(query, Occur.FILTER); + } else if (nestedFilter.containsKey("exact")) { + buildNestedExactQuery(nestedField, nestedFilter, nestedBoolBuilder); } else { buildNestedRangeQuery(nestedField, nestedFilter, nestedBoolBuilder); } }); - return JoinUtil.createJoinQuery(target + ".id", false, "id", nestedBoolBuilder.build(), - nestedSearcher, ScoreMode.None); + if (fld.contains("sample") && !target.equals("investigation")) { + // Datasets and Datafiles join by sample.id on both fields + return JoinUtil.createJoinQuery("sample.id", false, "sample.id", nestedBoolBuilder.build(), + nestedSearcher, ScoreMode.None); + } else if (fld.equals("sampleparameter") && target.equals("investigation")) { + Query sampleQuery = JoinUtil.createJoinQuery("sample.id", false, "sample.id", nestedBoolBuilder.build(), + nestedSearcher, ScoreMode.None); + Query investigationQuery = JoinUtil.createJoinQuery("sample.investigation.id", false, "id", sampleQuery, + lucene.getSearcher(searcherMap, "sample"), ScoreMode.None); + return investigationQuery; + } else { + return JoinUtil.createJoinQuery(target + ".id", false, "id", nestedBoolBuilder.build(), + nestedSearcher, ScoreMode.None); + } } else { // Single range of values for a field JsonNumber from = valueObject.getJsonNumber("from"); @@ -366,6 +386,52 @@ private Query parseFilter(String target, String fld, JsonValue value) throws IOE } } + /** + * Builds an exact numeric query, intended for use with numeric or date/time parameters. + * + * @param fld Name of the field to apply the range to. + * @param valueObject JsonObject containing "exact", and optionally "units" + * as keys for an exact value. 
+ * @param builder BooleanQuery.Builder for the nested query + */ + private void buildNestedExactQuery(String fld, JsonObject valueObject, BooleanQuery.Builder builder) { + if (DocumentMapping.longFields.contains(fld)) { + long exact = valueObject.getJsonNumber("exact").longValueExact(); + builder.add(LongPoint.newExactQuery(fld, exact), Occur.FILTER); + } else { + Builder rangeBuilder = new BooleanQuery.Builder(); + Builder exactOrRangeBuilder = new BooleanQuery.Builder(); + double exact = valueObject.getJsonNumber("exact").doubleValue(); + String units = valueObject.getString("units", null); + if (units != null) { + SystemValue exactValue = lucene.icatUnits.new SystemValue(exact, units); + if (exactValue.value != null ) { + // If we were able to parse the units, apply query to the SI value + rangeBuilder.add(DoublePoint.newRangeQuery("rangeTopSI", exactValue.value, Double.POSITIVE_INFINITY), Occur.FILTER); + rangeBuilder.add(DoublePoint.newRangeQuery("rangeBottomSI", Double.NEGATIVE_INFINITY, exactValue.value), Occur.FILTER); + exactOrRangeBuilder.add(rangeBuilder.build(), Occur.SHOULD); + exactOrRangeBuilder.add(DoublePoint.newExactQuery(fld + "SI", exactValue.value), Occur.SHOULD); + builder.add(exactOrRangeBuilder.build(), Occur.FILTER); + } else { + // If units could not be parsed, make them part of the query on the raw data + rangeBuilder.add(DoublePoint.newRangeQuery("rangeTop", exact, Double.POSITIVE_INFINITY), Occur.FILTER); + rangeBuilder.add(DoublePoint.newRangeQuery("rangeBottom", Double.NEGATIVE_INFINITY, exact), Occur.FILTER); + exactOrRangeBuilder.add(rangeBuilder.build(), Occur.SHOULD); + exactOrRangeBuilder.add(DoublePoint.newExactQuery(fld, exact), Occur.SHOULD); + builder.add(exactOrRangeBuilder.build(), Occur.FILTER); + builder.add(new TermQuery(new Term("type.units", units)), Occur.FILTER); + } + } else { + // If units were not provided, just apply to the raw data + rangeBuilder.add(DoublePoint.newRangeQuery("rangeTop", exact, Double.POSITIVE_INFINITY), Occur.FILTER); + rangeBuilder.add(DoublePoint.newRangeQuery("rangeBottom", Double.NEGATIVE_INFINITY, exact), Occur.FILTER); + exactOrRangeBuilder.add(rangeBuilder.build(), Occur.SHOULD); + exactOrRangeBuilder.add(DoublePoint.newExactQuery(fld, exact), Occur.SHOULD); + builder.add(exactOrRangeBuilder.build(), Occur.FILTER); + } + } + } + /** * Builds a range query, intended for use with numeric or date/time parameters. 
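The shape of the query built by buildNestedExactQuery is worth spelling out: a parameter matches if its [rangeBottom, rangeTop] interval contains the requested value, or if its single stored value equals it exactly. A rough standalone sketch of that boolean structure, using the range field names from the mapping above:

    import org.apache.lucene.document.DoublePoint;
    import org.apache.lucene.search.BooleanClause.Occur;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.Query;

    public class ExactOrRangeSketch {
        static Query exactOrRange(String fld, double exact) {
            // rangeTop >= exact AND rangeBottom <= exact, i.e. the stored
            // interval contains the requested value...
            BooleanQuery.Builder range = new BooleanQuery.Builder();
            range.add(DoublePoint.newRangeQuery("rangeTop", exact, Double.POSITIVE_INFINITY), Occur.FILTER);
            range.add(DoublePoint.newRangeQuery("rangeBottom", Double.NEGATIVE_INFINITY, exact), Occur.FILTER);
            // ...OR the single stored value equals the requested value
            BooleanQuery.Builder either = new BooleanQuery.Builder();
            either.add(range.build(), Occur.SHOULD);
            either.add(DoublePoint.newExactQuery(fld, exact), Occur.SHOULD);
            return either.build();
        }
    }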
* @@ -402,7 +468,7 @@ private void buildNestedRangeQuery(String fld, JsonObject valueObject, BooleanQu } else { // If units could not be parsed, make them part of the query on the raw data builder.add(DoublePoint.newRangeQuery(fld, from, to), Occur.FILTER); - builder.add(new TermQuery(new Term("type.units.keyword", units)), Occur.FILTER); + builder.add(new TermQuery(new Term("type.units", units)), Occur.FILTER); } } else { // If units were not provided, just apply to the raw data From ce51e33f0763fda21a830efb093e7bb04899740b Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Tue, 2 Aug 2022 14:44:42 +0000 Subject: [PATCH 49/73] Add utility to lock #19 --- .../java/org/icatproject/lucene/Lucene.java | 67 +++++++++++++------ src/main/resources/run.properties | 11 +-- 2 files changed, 51 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index b88d775..6d7f902 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -71,6 +71,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.SearcherManager; import org.apache.lucene.search.Sort; @@ -331,6 +332,7 @@ public void releaseSearchers(List subSearchers) throws IOExceptio private int luceneCommitMillis; private Long luceneMaxShardSize; private long maxSearchTimeSeconds; + private boolean aggregateFiles; private AtomicLong bucketNum = new AtomicLong(); private Map indexBuckets = new ConcurrentHashMap<>(); @@ -408,11 +410,12 @@ public void addNow(@Context HttpServletRequest request, @PathParam("entityName") for (JsonObject document : documents) { createNow(entityName, document); } - } catch (IOException | JsonException e) { - + } catch (JsonException e) { logger.error("Could not parse JSON from {}", value.toString()); throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } + } catch (IOException e) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } logger.debug("Added {} {} documents", documents.size(), entityName); } @@ -489,7 +492,7 @@ private void create(JsonObject operationBody) throws NumberFormatException, IOEx } bucket.addDocument(facetsConfig.build(document)); // Special case for filesizes - if (entityName.equals("Datafile")) { + if (aggregateFiles && entityName.equals("Datafile")) { JsonNumber jsonFileSize = documentObject.getJsonNumber("fileSize"); if (jsonFileSize != null) { String datasetId = documentObject.getString("dataset.id", null); @@ -573,10 +576,6 @@ private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFi */ private void createNow(String entityName, JsonObject documentJson) throws NumberFormatException, IOException, LuceneException { - if (!documentJson.containsKey("id")) { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "id was not in the document keys " + documentJson.keySet()); - } Document document = parseDocument(documentJson); logger.trace("create {} {}", entityName, document.toString()); IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); @@ -672,7 +671,7 @@ private void delete(JsonObject operationBody) throws LuceneException, IOExceptio logger.trace("delete {} {}", entityName, icatId); // Special case for 
filesizes Term term = new Term("id", icatId); - if (entityName.equals("Datafile")) { + if (aggregateFiles && entityName.equals("Datafile")) { long sizeToSubtract = 0; for (ShardBucket shardBucket : bucket.shardList) { IndexSearcher datafileSearcher = shardBucket.searcherManager.acquire(); @@ -741,13 +740,11 @@ private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, In parentId = document.get("id"); } else { parentId = document.get("investigation.id"); - logger.debug("investigation.id {}", parentId); } } else { fld = entityName.toLowerCase() + ".id"; parentId = document.get("id"); } - logger.debug("fld {}, parentId {}", fld, parentId); joinedSearch.query = new TermQuery(new Term(fld, parentId)); joinedSearch.sort = new Sort(new SortedNumericSortField("id.long", Type.LONG)); TopFieldDocs topFieldDocs = searchShards(joinedSearch, 100, shards, null); @@ -923,6 +920,7 @@ private void init() { luceneMaxShardSize = Math.max(props.getPositiveLong("maxShardSize"), new Long(Integer.MAX_VALUE + 1)); maxSearchTimeSeconds = props.has("maxSearchTimeSeconds") ? props.getPositiveLong("maxSearchTimeSeconds") : 5; + aggregateFiles = props.getBoolean("aggregateFiles", false); timer = new Timer("LuceneCommitTimer"); timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); @@ -985,26 +983,51 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s } /** - * Locks the specified index for population, removing all existing documents and + * Locks the specified index for population, optionally removing all existing documents and * preventing normal modify operations until the index is unlocked. * * @param entityName Name of the entity/index to lock. + * @param request Incoming request. In order to delete all existing documents, the accompanying Json should specify {"delete": true}. * @throws LuceneException If already locked, or if there's an IOException when * deleting documents. 
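The fileSize handling in the delete path above depends on reading back whatever value is currently indexed before the document disappears. In isolation that lookup is just a term query on the ICAT id; a sketch, assuming a searcher already acquired from the relevant shard:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.Term;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.TermQuery;
    import org.apache.lucene.search.TopDocs;

    public class FileSizeLookupSketch {
        // Returns the fileSize currently indexed for an ICAT id, so it can be
        // subtracted from the dataset/investigation aggregates; 0 if not indexed.
        static long indexedFileSize(IndexSearcher searcher, String icatId) throws java.io.IOException {
            TopDocs topDocs = searcher.search(new TermQuery(new Term("id", icatId)), 1);
            if (topDocs.totalHits.value == 0) {
                return 0;
            }
            Document document = searcher.doc(topDocs.scoreDocs[0].doc);
            return document.getField("fileSize").numericValue().longValue();
        }
    }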
*/ @POST @Path("lock/{entityName}") - public void lock(@PathParam("entityName") String entityName) throws LuceneException { - logger.info("Requesting lock of {} index", entityName); - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); + @Consumes(MediaType.APPLICATION_JSON) + @Produces(MediaType.APPLICATION_JSON) + public String lock(@PathParam("entityName") String entityName, @Context HttpServletRequest request) throws LuceneException { + try (JsonReader reader = Json.createReader(request.getInputStream())) { + boolean delete = reader.readObject().getBoolean("delete", false); + logger.info("Requesting lock of {} index, delete={}", entityName, delete); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); - if (!bucket.locked.compareAndSet(false, true)) { - throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene already locked for " + entityName); - } - try { - for (ShardBucket shardBucket : bucket.shardList) { - shardBucket.indexWriter.deleteAll(); + if (!bucket.locked.compareAndSet(false, true)) { + throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene already locked for " + entityName); + } + JsonObjectBuilder builder = Json.createObjectBuilder(); + if (delete) { + for (ShardBucket shardBucket : bucket.shardList) { + shardBucket.indexWriter.deleteAll(); + } + // Reset the shardList so we reset the routing + bucket.shardList = Arrays.asList(bucket.shardList.get(0)); + return builder.add("currentId", 0).build().toString(); + } + SearchBucket searchBucket = new SearchBucket(this); + searchBucket.query = new MatchAllDocsQuery(); + searchBucket.fields.add("id"); + searchBucket.scored = false; + searchBucket.sort = new Sort(new SortedNumericSortField("id.long", Type.LONG, true)); + TopFieldDocs topFieldDocs = searchShards(searchBucket, 1, bucket.shardList, null); + if (topFieldDocs.totalHits.value == 0) { + return builder.add("currentId", 0).build().toString(); } + int shardIndex = topFieldDocs.scoreDocs[0].shardIndex; + int doc = topFieldDocs.scoreDocs[0].doc; + IndexSearcher searcher = bucket.shardList.get(shardIndex).searcherManager.acquire(); + String id = searcher.doc(doc).get("id"); + bucket.shardList.get(shardIndex).searcherManager.release(searcher); + return builder.add("currentId", new Long(id)).build().toString(); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -1563,7 +1586,7 @@ private void update(JsonObject operationBody) throws LuceneException, NumberForm "Lucene locked for " + entityName); } // Special case for filesizes - if (entityName.equals("Datafile")) { + if (aggregateFiles && entityName.equals("Datafile")) { JsonNumber jsonFileSize = documentObject.getJsonNumber("fileSize"); if (jsonFileSize != null) { long sizeToSubtract = 0; diff --git a/src/main/resources/run.properties b/src/main/resources/run.properties index 99fcae0..7189854 100644 --- a/src/main/resources/run.properties +++ b/src/main/resources/run.properties @@ -1,8 +1,9 @@ # Real comments in this file are marked with '#' whereas commented out lines # are marked with '!' 
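The units property below is the conversion table behind the IcatUnits helper used by convertValue and buildNestedExactQuery. Based only on the call sites visible in these patches, consumption looks roughly like this; the constructor argument mirrors the property value and the printed figure is simple arithmetic on the eV factor it defines:

    import org.icatproject.lucene.IcatUnits;

    public class UnitsSketch {
        public static void main(String[] args) {
            IcatUnits icatUnits = new IcatUnits("J: eV 1.602176634e-19; K: kelvin");
            // SystemValue is the inner class used in convertValue; its units and
            // value fields come back null when the unit string cannot be parsed
            IcatUnits.SystemValue converted = icatUnits.new SystemValue(5.0, "eV");
            if (converted.value != null) {
                System.out.println(converted.value + " " + converted.units); // 8.01...e-19 J
            }
        }
    }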
-directory = ${HOME}/data/lucene -commitSeconds = 5 -maxShardSize = 2147483648 -ip = 127.0.0.1/32 -units = J: eV 1.602176634e-19; \u2103: celsius, degC; K: kelvin +directory = ${HOME}/data/lucene +commitSeconds = 5 +maxShardSize = 2147483648 +ip = 127.0.0.1/32 +units = J: eV 1.602176634e-19; \u2103: celsius, degC; K: kelvin +aggregateFiles = false From 5f59e1dfc936da1aaefdcab4ea6d0d51958da723 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Sun, 24 Jul 2022 08:03:56 +0100 Subject: [PATCH 50/73] Formatting changes #19 --- .../java/org/icatproject/lucene/Lucene.java | 48 +++++++++++-------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 6d7f902..d0dc91e 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -415,7 +415,7 @@ public void addNow(@Context HttpServletRequest request, @PathParam("entityName") throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } + } logger.debug("Added {} {} documents", documents.size(), entityName); } @@ -519,7 +519,8 @@ private void create(JsonObject operationBody) throws NumberFormatException, IOEx * @param index Index (entity) to update. * @throws IOException */ - private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFileCount, String entityId, String index) + private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFileCount, String entityId, + String index) throws IOException { long deltaFileSize = sizeToAdd - sizeToSubtract; if (entityId != null && (deltaFileSize != 0 || deltaFileCount != 0)) { @@ -539,7 +540,7 @@ private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFi if (deltaFileSize != 0) { prunedFields.add("fileSize"); long oldSize = document.getField("fileSize").numericValue().longValue(); - long newSize = oldSize == -1 ? deltaFileSize: oldSize + deltaFileSize; + long newSize = oldSize + deltaFileSize; fieldsToAdd.add(new LongPoint("fileSize", newSize)); fieldsToAdd.add(new StoredField("fileSize", newSize)); fieldsToAdd.add(new NumericDocValuesField("fileSize", newSize)); @@ -935,9 +936,10 @@ private void init() { throw new IllegalStateException(e.getMessage()); } - logger.info( - "Initialised icat.lucene with directory {}, commitSeconds {}, maxShardSize {}, shardedIndices {}, maxSearchTimeSeconds {}", - luceneDirectory, luceneCommitMillis, luceneMaxShardSize, shardedIndices, maxSearchTimeSeconds); + String format = "Initialised icat.lucene with directory {}, commitSeconds {}, maxShardSize {}, " + + "shardedIndices {}, maxSearchTimeSeconds {}, aggregateFiles {}"; + logger.info(format, luceneDirectory, luceneCommitMillis, luceneMaxShardSize, shardedIndices, + maxSearchTimeSeconds, aggregateFiles); } class CommitTimerTask extends TimerTask { @@ -983,11 +985,14 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s } /** - * Locks the specified index for population, optionally removing all existing documents and - * preventing normal modify operations until the index is unlocked. + * Locks the specified index for population, optionally removing all existing + * documents and preventing normal modify operations until the index is + * unlocked. * * @param entityName Name of the entity/index to lock. - * @param request Incoming request. 
In order to delete all existing documents, the accompanying Json should specify {"delete": true}. + * @param request Incoming request. In order to delete all existing + * documents, the accompanying Json should specify + * {"delete": true}. * @throws LuceneException If already locked, or if there's an IOException when * deleting documents. */ @@ -995,14 +1000,16 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s @Path("lock/{entityName}") @Consumes(MediaType.APPLICATION_JSON) @Produces(MediaType.APPLICATION_JSON) - public String lock(@PathParam("entityName") String entityName, @Context HttpServletRequest request) throws LuceneException { + public String lock(@PathParam("entityName") String entityName, @Context HttpServletRequest request) + throws LuceneException { try (JsonReader reader = Json.createReader(request.getInputStream())) { boolean delete = reader.readObject().getBoolean("delete", false); logger.info("Requesting lock of {} index, delete={}", entityName, delete); IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); if (!bucket.locked.compareAndSet(false, true)) { - throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, "Lucene already locked for " + entityName); + String message = "Lucene already locked for " + entityName; + throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, message); } JsonObjectBuilder builder = Json.createObjectBuilder(); if (delete) { @@ -1371,11 +1378,13 @@ private void addField(JsonObject json, Document document, String key) { } /** - * Attempts to convert numericFieldName from json into SI units from its recorded unitString, and then add it to the Lucene document. + * Attempts to convert numericFieldName from json into SI units from its + * recorded unitString, and then add it to the Lucene document. * - * @param document Lucene Document to add the field to. - * @param json JsonObject containing the field/value pairs to be added. - * @param unitString Units of the value to be converted. + * @param document Lucene Document to add the field to. + * @param json JsonObject containing the field/value pairs to be + * added. + * @param unitString Units of the value to be converted. * @param numericFieldName Name (key) of the field to convert and add. */ private void convertValue(Document document, JsonObject json, String unitString, String numericFieldName) { @@ -1405,7 +1414,7 @@ private void convertValue(Document document, JsonObject json, String unitString, /** * Adds field to document taking its typing, sorting and faceting into account. * - * @param field Lucene IndexableField to add to the document. + * @param field Lucene IndexableField to add to the document. * @param document Lucene Document to add the field to. 
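As the javadoc above notes, addField handles faceting as well as typing and sorting: every facetable value is indexed once as a SortedSetDocValuesFacetField and once as an unstored keyword for exact filtering, and the document is passed through FacetsConfig.build before it reaches the writer. A minimal sketch of that pairing:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.facet.FacetsConfig;
    import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;

    public class FacetFieldSketch {
        static Document facetable(FacetsConfig facetsConfig, String key, String value)
                throws java.io.IOException {
            Document document = new Document();
            // one copy for facet counting, one unstored keyword for filtering
            document.add(new SortedSetDocValuesFacetField(key + ".keyword", value));
            document.add(new StringField(key + ".keyword", value, Store.NO));
            // build() rewrites the facet field into its indexable doc-values form
            return facetsConfig.build(document);
        }
    }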
*/ private void addField(IndexableField field, Document document) { @@ -1597,11 +1606,12 @@ private void update(JsonObject operationBody) throws LuceneException, NumberForm int docId = topDocs.scoreDocs[0].doc; Document datasetDocument = datafileSearcher.doc(docId); sizeToSubtract = datasetDocument.getField("fileSize").numericValue().longValue(); - if (jsonFileSize.longValueExact() != sizeToSubtract) { + long sizeToAdd = jsonFileSize.longValueExact(); + if (sizeToAdd != sizeToSubtract) { String datasetId = documentObject.getString("dataset.id", null); String investigationId = documentObject.getString("investigation.id", null); - aggregateFileSize(jsonFileSize.longValueExact(), sizeToSubtract, 0, datasetId, "dataset"); - aggregateFileSize(jsonFileSize.longValueExact(), sizeToSubtract, 0, investigationId, "investigation"); + aggregateFileSize(sizeToAdd, sizeToSubtract, 0, datasetId, "dataset"); + aggregateFileSize(sizeToAdd, sizeToSubtract, 0, investigationId, "investigation"); } break; } From 902654bce4bf493998db0112d8c490b9616546e9 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Fri, 5 Aug 2022 08:23:33 +0000 Subject: [PATCH 51/73] Improved timeout and search syntax errors #19 --- .../java/org/icatproject/lucene/Lucene.java | 248 ++++++++++-------- .../org/icatproject/lucene/SearchBucket.java | 5 +- 2 files changed, 141 insertions(+), 112 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index d0dc91e..0fbaa5f 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -69,21 +69,23 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; -import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.SearcherManager; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.SortField.Type; +import org.apache.lucene.search.TimeLimitingCollector.TimeExceededException; import org.apache.lucene.search.SortedNumericSortField; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TimeLimitingCollector; import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.TopFieldCollector; import org.apache.lucene.search.TopFieldDocs; import org.apache.lucene.search.TotalHits; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.Counter; import org.apache.lucene.util.NumericUtils; import org.icatproject.lucene.SearchBucket.SearchType; import org.icatproject.lucene.exceptions.LuceneException; @@ -610,6 +612,9 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("search } catch (Exception e) { logger.error("Error", e); freeSearcher(uid); + if (e instanceof LuceneException) { + throw (LuceneException) e; + } throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } } @@ -642,6 +647,9 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("search_ } catch (Exception e) { logger.error("Error", e); freeSearcher(uid); + if (e instanceof LuceneException) { + throw (LuceneException) e; + } throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -748,11 +756,12 @@ private void encodeResult(String 
entityName, JsonGenerator gen, ScoreDoc hit, In } joinedSearch.query = new TermQuery(new Term(fld, parentId)); joinedSearch.sort = new Sort(new SortedNumericSortField("id.long", Type.LONG)); - TopFieldDocs topFieldDocs = searchShards(joinedSearch, 100, shards, null); + TopFieldDocs topFieldDocs = searchShards(joinedSearch, 100, shards); gen.writeStartArray(joinedEntityName.toLowerCase()); for (ScoreDoc joinedHit : topFieldDocs.scoreDocs) { gen.writeStartObject(); - Document joinedDocument = searchers.get(joinedHit.shardIndex).doc(joinedHit.doc); + int joinedShardIndex = joinedHit.shardIndex > 0 ? joinedHit.shardIndex : 0; + Document joinedDocument = searchers.get(joinedShardIndex).doc(joinedHit.doc); joinedDocument.forEach(encodeField(gen, search.joinedFields.get(joinedEntityName))); gen.writeEnd(); } @@ -827,6 +836,9 @@ public String facet(@PathParam("entityName") String entityName, @Context HttpSer return luceneFacetResult(entityName, search, searchAfter, maxResults, maxLabels); } catch (Exception e) { freeSearcher(uid); + if (e instanceof LuceneException) { + throw (LuceneException) e; + } throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } } @@ -838,7 +850,7 @@ public String facet(@PathParam("entityName") String entityName, @Context HttpSer * @throws LuceneException */ public void freeSearcher(Long uid) throws LuceneException { - if (uid != null) { // May not be set for internal calls + if (uid != null && searches.containsKey(uid)) { // May not be set for internal calls Map> search = searches.get(uid).searcherMap; for (Entry> entry : search.entrySet()) { String name = entry.getKey(); @@ -980,6 +992,9 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s } catch (Exception e) { logger.error("Error", e); freeSearcher(uid); + if (e instanceof LuceneException) { + throw (LuceneException) e; + } throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } } @@ -1020,20 +1035,11 @@ public String lock(@PathParam("entityName") String entityName, @Context HttpServ bucket.shardList = Arrays.asList(bucket.shardList.get(0)); return builder.add("currentId", 0).build().toString(); } - SearchBucket searchBucket = new SearchBucket(this); - searchBucket.query = new MatchAllDocsQuery(); - searchBucket.fields.add("id"); - searchBucket.scored = false; - searchBucket.sort = new Sort(new SortedNumericSortField("id.long", Type.LONG, true)); - TopFieldDocs topFieldDocs = searchShards(searchBucket, 1, bucket.shardList, null); - if (topFieldDocs.totalHits.value == 0) { - return builder.add("currentId", 0).build().toString(); - } - int shardIndex = topFieldDocs.scoreDocs[0].shardIndex; - int doc = topFieldDocs.scoreDocs[0].doc; - IndexSearcher searcher = bucket.shardList.get(shardIndex).searcherManager.acquire(); - String id = searcher.doc(doc).get("id"); - bucket.shardList.get(shardIndex).searcherManager.release(searcher); + ShardBucket shardBucket = bucket.routeShard(); + int docCount = shardBucket.documentCount.intValue(); + IndexSearcher searcher = shardBucket.searcherManager.acquire(); + String id = searcher.doc(docCount - 1).get("id"); + shardBucket.searcherManager.release(searcher); return builder.add("currentId", new Long(id)).build().toString(); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); @@ -1193,7 +1199,7 @@ private String luceneSearchResult(String name, SearchBucket search, String searc List shards = getShards(name); String format = "Search {} with: 
query {}, maxResults {}, searchAfter {}, scored {}, fields {}"; logger.debug(format, name, search.query, maxResults, searchAfter, search.scored, search.fields); - TopFieldDocs topFieldDocs = searchShards(search, maxResults, shards, search.searchAfter); + TopFieldDocs topFieldDocs = searchShards(search, maxResults, shards); ScoreDoc[] hits = topFieldDocs.scoreDocs; TotalHits totalHits = topFieldDocs.totalHits; SortField[] fields = topFieldDocs.fields; @@ -1205,121 +1211,141 @@ private String luceneSearchResult(String name, SearchBucket search, String searc ByteArrayOutputStream baos = new ByteArrayOutputStream(); try (JsonGenerator gen = Json.createGenerator(baos)) { gen.writeStartObject(); - gen.write("aborted", search.aborted); - if (!search.aborted) { - gen.writeStartArray("results"); - for (ScoreDoc hit : hits) { - encodeResult(name, gen, hit, searchers.get(hit.shardIndex), search); + gen.writeStartArray("results"); + for (ScoreDoc hit : hits) { + encodeResult(name, gen, hit, searchers.get(hit.shardIndex), search); + } + gen.writeEnd(); // array results + if (hits.length == maxResults) { + ScoreDoc lastDoc = hits[hits.length - 1]; + gen.writeStartObject("search_after").write("doc", lastDoc.doc).write("shardIndex", + lastDoc.shardIndex); + float lastScore = lastDoc.score; + if (!Float.isNaN(lastScore)) { + gen.write("score", lastScore); } - gen.writeEnd(); // array results - if (hits.length == maxResults) { - ScoreDoc lastDoc = hits[hits.length - 1]; - gen.writeStartObject("search_after").write("doc", lastDoc.doc).write("shardIndex", - lastDoc.shardIndex); - float lastScore = lastDoc.score; - if (!Float.isNaN(lastScore)) { - gen.write("score", lastScore); - } - if (fields != null) { - Document lastDocument = searchers.get(lastDoc.shardIndex).doc(lastDoc.doc); - gen.writeStartArray("fields"); - for (SortField sortField : fields) { - String fieldName = sortField.getField(); - if (fieldName == null) { - // SCORE sorting will have a null fieldName + if (fields != null) { + Document lastDocument = searchers.get(lastDoc.shardIndex).doc(lastDoc.doc); + gen.writeStartArray("fields"); + for (SortField sortField : fields) { + String fieldName = sortField.getField(); + if (fieldName == null) { + // SCORE sorting will have a null fieldName + if (Float.isFinite(lastDoc.score)) { gen.write(lastDoc.score); - continue; - } - IndexableField indexableField = lastDocument.getField(fieldName); - if (indexableField == null) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Field " + fieldName - + " used for sorting was not present on the Lucene Document; all sortable fields must also be stored."); - } - Type type = (sortField instanceof SortedNumericSortField) - ? 
((SortedNumericSortField) sortField).getNumericType() - : sortField.getType(); - switch (type) { - case LONG: - if (indexableField.numericValue() != null) { - gen.write(indexableField.numericValue().longValue()); - } else if (indexableField.stringValue() != null) { - gen.write(new Long(indexableField.stringValue())); - } - break; - case DOUBLE: - if (indexableField.numericValue() != null) { - gen.write(indexableField.numericValue().doubleValue()); - } else if (indexableField.stringValue() != null) { - gen.write(new Double(indexableField.stringValue())); - } - break; - case STRING: - gen.write(indexableField.stringValue()); - break; - default: - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, - "SortField.Type must be one of LONG, DOUBLE, STRING, but it was " + type); } + continue; + } + IndexableField indexableField = lastDocument.getField(fieldName); + if (indexableField == null) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Field " + fieldName + + " used for sorting was not present on the Lucene Document; all sortable fields must also be stored."); + } + Type type = (sortField instanceof SortedNumericSortField) + ? ((SortedNumericSortField) sortField).getNumericType() + : sortField.getType(); + switch (type) { + case LONG: + if (indexableField.numericValue() != null) { + gen.write(indexableField.numericValue().longValue()); + } else if (indexableField.stringValue() != null) { + gen.write(new Long(indexableField.stringValue())); + } + break; + case DOUBLE: + if (indexableField.numericValue() != null) { + gen.write(indexableField.numericValue().doubleValue()); + } else if (indexableField.stringValue() != null) { + gen.write(new Double(indexableField.stringValue())); + } + break; + case STRING: + gen.write(indexableField.stringValue()); + break; + default: + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, + "SortField.Type must be one of LONG, DOUBLE, STRING, but it was " + type); } - gen.writeEnd(); // end "fields" array } - gen.writeEnd(); // end "search_after" object + gen.writeEnd(); // end "fields" array } + gen.writeEnd(); // end "search_after" object } gen.writeEnd(); // end enclosing object } - logger.debug("Json returned {}", baos.toString()); + logger.trace("Json returned {}", baos.toString()); return baos.toString(); } /** * Performs a search by iterating over all relevant shards. * - * @param search SearchBucket containing the search query, dimensions to - * facet etc. - * @param maxResults The maximum number of results from the search. - * @param shards List of all ShardBuckets for the entity to be searched. - * @param searchAfterDoc The last Lucene FieldDoc from a previous search. + * @param search SearchBucket containing the search query, dimensions to + * facet etc. + * @param maxResults The maximum number of results from the search. + * @param shards List of all ShardBuckets for the entity to be searched. * @return Lucene TopFieldDocs resulting from the search. * @throws IOException + * @throws LuceneException If the search runs for longer than the allowed time */ - private TopFieldDocs searchShards(SearchBucket search, int maxResults, List shards, - FieldDoc searchAfterDoc) throws IOException { + private TopFieldDocs searchShards(SearchBucket search, int maxResults, List shards) + throws IOException, LuceneException { + TopFieldDocs topFieldDocs; - if (shards.size() > 0) { - List shardHits = new ArrayList<>(); - int i = 0; - int doc = searchAfterDoc != null ? 
searchAfterDoc.doc : -1; - long startTime = System.currentTimeMillis(); - for (ShardBucket shard : shards) { - int docCount = shard.documentCount.intValue(); - if (searchAfterDoc != null) { - if (doc > docCount) { - searchAfterDoc.doc = docCount - 1; - } else { - searchAfterDoc.doc = doc; + Counter clock = TimeLimitingCollector.getGlobalCounter(); + TimeLimitingCollector collector = new TimeLimitingCollector(null, clock, maxSearchTimeSeconds * 1000); + int shardsSize = shards.size(); + + try { + if (shardsSize > 1) { + List shardHits = new ArrayList<>(); + int doc = search.searchAfter != null ? search.searchAfter.doc : -1; + for (ShardBucket shard : shards) { + // Handle the possibility of some shards having a higher docCount than the doc + // id on searchAfter + int docCount = shard.documentCount.intValue(); + if (search.searchAfter != null) { + if (doc > docCount) { + search.searchAfter.doc = docCount - 1; + } else { + search.searchAfter.doc = doc; + } + } + + // Wrap Collector with TimeLimitingCollector + TopFieldCollector topFieldCollector = TopFieldCollector.create(search.sort, maxResults, + search.searchAfter, maxResults); + collector.setCollector(topFieldCollector); + + IndexSearcher indexSearcher = shard.searcherManager.acquire(); + indexSearcher.search(search.query, collector); + TopFieldDocs topDocs = topFieldCollector.topDocs(); + if (search.scored) { + TopFieldCollector.populateScores(topDocs.scoreDocs, indexSearcher, search.query); } + shardHits.add(topDocs); } - IndexSearcher indexSearcher = shard.searcherManager.acquire(); - TopFieldDocs shardDocs = indexSearcher.searchAfter(searchAfterDoc, search.query, maxResults, - search.sort, search.scored); - shardHits.add(shardDocs); - logger.debug("{} on shard {} out of {} total docs", shardDocs.totalHits, i, docCount); - i++; - long duration = (System.currentTimeMillis() - startTime); - if (duration > maxSearchTimeSeconds * 1000) { - logger.info("Stopping search after {} shards due to {} ms having elapsed", i, duration); - search.aborted = true; - break; + topFieldDocs = TopFieldDocs.merge(search.sort, 0, maxResults, shardHits.toArray(new TopFieldDocs[0]), + true); + } else { + // Don't need to merge results across shards + TopFieldCollector topFieldCollector = TopFieldCollector.create(search.sort, maxResults, + search.searchAfter, maxResults); + collector.setCollector(topFieldCollector); + IndexSearcher indexSearcher = shards.get(0).searcherManager.acquire(); + indexSearcher.search(search.query, collector); + topFieldDocs = topFieldCollector.topDocs(); + if (search.scored) { + TopFieldCollector.populateScores(topFieldDocs.scoreDocs, indexSearcher, search.query); } } - topFieldDocs = TopFieldDocs.merge(search.sort, 0, maxResults, shardHits.toArray(new TopFieldDocs[i]), true); - } else { - IndexSearcher indexSearcher = shards.get(0).searcherManager.acquire(); - topFieldDocs = indexSearcher.searchAfter(searchAfterDoc, search.query, maxResults, search.sort, - search.scored); + + return topFieldDocs; + + } catch (TimeExceededException e) { + String message = "Search cancelled for exceeding " + maxSearchTimeSeconds + " seconds"; + throw new LuceneException(HttpURLConnection.HTTP_GATEWAY_TIMEOUT, message); } - return topFieldDocs; } /** diff --git a/src/main/java/org/icatproject/lucene/SearchBucket.java b/src/main/java/org/icatproject/lucene/SearchBucket.java index 57c6fa4..91858a5 100644 --- a/src/main/java/org/icatproject/lucene/SearchBucket.java +++ b/src/main/java/org/icatproject/lucene/SearchBucket.java @@ -32,6 +32,7 @@ import 
org.apache.lucene.facet.range.Range; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.flexible.core.QueryNodeException; +import org.apache.lucene.queryparser.flexible.core.QueryNodeParseException; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.IndexSearcher; @@ -70,7 +71,6 @@ public enum SearchType { public Set fields = new HashSet(); public Map> joinedFields = new HashMap<>(); public Map dimensions = new HashMap(); - public boolean aborted = false; private static final SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmm"); static { @@ -222,6 +222,9 @@ public SearchBucket(Lucene lucene, SearchType searchType, HttpServletRequest req query = maybeEmptyQuery(luceneQuery); return; } + } catch (QueryNodeParseException e) { + String message = "Search term could not be parsed due to syntax errors"; + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, message); } } From 1eac7e06073553ca965f064ba820caa0ebe9a16e Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Tue, 9 Aug 2022 14:43:30 +0000 Subject: [PATCH 52/73] Error handling fix and range check for lock #19 --- .../java/org/icatproject/lucene/Lucene.java | 200 ++++++++++-------- 1 file changed, 110 insertions(+), 90 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 0fbaa5f..d28720a 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -69,7 +69,10 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; +import org.apache.lucene.queryparser.flexible.core.QueryNodeException; import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.SearcherManager; import org.apache.lucene.search.Sort; @@ -136,13 +139,15 @@ public ShardBucket(java.nio.file.Path shardPath) throws IOException { } searcherManager = new SearcherManager(indexWriter, null); IndexSearcher indexSearcher = null; + int numDocs; try { indexSearcher = searcherManager.acquire(); - int numDocs = indexSearcher.getIndexReader().numDocs(); + numDocs = indexSearcher.getIndexReader().numDocs(); documentCount = new AtomicLong(numDocs); } finally { searcherManager.release(indexSearcher); } + logger.info("Created ShardBucket for {} with {} Documents", directory, numDocs); } /** @@ -182,12 +187,15 @@ public IndexBucket(String entityName) { this.entityName = entityName.toLowerCase(); Long shardIndex = 0L; java.nio.file.Path shardPath = luceneDirectory.resolve(entityName); + ShardBucket shardBucket; + // Create at least one shard, then keep creating them so long as directories + // exist and already contain Documents do { - ShardBucket shardBucket = new ShardBucket(shardPath); + shardBucket = new ShardBucket(shardPath); shardList.add(shardBucket); shardIndex++; shardPath = luceneDirectory.resolve(entityName + "_" + shardIndex); - } while (Files.isDirectory(shardPath)); + } while (shardBucket.documentCount.get() > 0 && Files.isDirectory(shardPath)); logger.debug("Bucket for {} is now ready with {} shards", entityName, shardIndex); } catch (Throwable e) { logger.error("Can't continue " + e.getClass() + " " + e.getMessage()); @@ -284,6 +292,14 @@ public void close() throws IOException { } } + /** + * @return The 
ShardBucket currently in use for indexing new Documents. + */ + public ShardBucket getCurrentShardBucket() { + int size = shardList.size(); + return shardList.get(size - 1); + } + /** * Provides the ShardBucket that should be used for writing the next Document. * All Documents up to luceneMaxShardSize are indexed in the first shard, after @@ -294,11 +310,10 @@ public void close() throws IOException { * @throws IOException */ public ShardBucket routeShard() throws IOException { - int size = shardList.size(); - ShardBucket shardBucket = shardList.get(size - 1); + ShardBucket shardBucket = getCurrentShardBucket(); if (shardBucket.documentCount.get() >= luceneMaxShardSize) { shardBucket.indexWriter.commit(); - shardBucket = buildShardBucket(size); + shardBucket = buildShardBucket(shardList.size()); } return shardBucket; } @@ -330,7 +345,6 @@ public void releaseSearchers(List subSearchers) throws IOExceptio private final FacetsConfig facetsConfig = new FacetsConfig(); private java.nio.file.Path luceneDirectory; - private Set shardedIndices; private int luceneCommitMillis; private Long luceneMaxShardSize; private long maxSearchTimeSeconds; @@ -609,12 +623,9 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("search SearchBucket search = new SearchBucket(this, SearchType.DATAFILE, request, sort, searchAfter); searches.put(uid, search); return luceneSearchResult("Datafile", search, searchAfter, maxResults); - } catch (Exception e) { + } catch (IOException | QueryNodeException e) { logger.error("Error", e); freeSearcher(uid); - if (e instanceof LuceneException) { - throw (LuceneException) e; - } throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } } @@ -644,12 +655,9 @@ public String datasets(@Context HttpServletRequest request, @QueryParam("search_ SearchBucket search = new SearchBucket(this, SearchType.DATASET, request, sort, searchAfter); searches.put(uid, search); return luceneSearchResult("Dataset", search, searchAfter, maxResults); - } catch (Exception e) { + } catch (IOException | QueryNodeException e) { logger.error("Error", e); freeSearcher(uid); - if (e instanceof LuceneException) { - throw (LuceneException) e; - } throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -760,8 +768,7 @@ private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, In gen.writeStartArray(joinedEntityName.toLowerCase()); for (ScoreDoc joinedHit : topFieldDocs.scoreDocs) { gen.writeStartObject(); - int joinedShardIndex = joinedHit.shardIndex > 0 ? 
joinedHit.shardIndex : 0; - Document joinedDocument = searchers.get(joinedShardIndex).doc(joinedHit.doc); + Document joinedDocument = searchers.get(joinedHit.shardIndex).doc(joinedHit.doc); joinedDocument.forEach(encodeField(gen, search.joinedFields.get(joinedEntityName))); gen.writeEnd(); } @@ -834,11 +841,9 @@ public String facet(@PathParam("entityName") String entityName, @Context HttpSer SearchBucket search = new SearchBucket(this, SearchType.GENERIC, request, sort, null); searches.put(uid, search); return luceneFacetResult(entityName, search, searchAfter, maxResults, maxLabels); - } catch (Exception e) { + } catch (IOException | QueryNodeException e) { + logger.error("Error", e); freeSearcher(uid); - if (e instanceof LuceneException) { - throw (LuceneException) e; - } throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } } @@ -940,18 +945,15 @@ private void init() { icatUnits = new IcatUnits(props.getString("units", "")); - String shardedIndicesString = props.getString("shardedIndices", "").toLowerCase(); - shardedIndices = new HashSet<>(Arrays.asList(shardedIndicesString.split("\\s+"))); - } catch (Exception e) { logger.error(fatal, e.getMessage()); throw new IllegalStateException(e.getMessage()); } String format = "Initialised icat.lucene with directory {}, commitSeconds {}, maxShardSize {}, " - + "shardedIndices {}, maxSearchTimeSeconds {}, aggregateFiles {}"; - logger.info(format, luceneDirectory, luceneCommitMillis, luceneMaxShardSize, shardedIndices, - maxSearchTimeSeconds, aggregateFiles); + + "maxSearchTimeSeconds {}, aggregateFiles {}"; + logger.info(format, luceneDirectory, luceneCommitMillis, luceneMaxShardSize, maxSearchTimeSeconds, + aggregateFiles); } class CommitTimerTask extends TimerTask { @@ -989,12 +991,9 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s SearchBucket search = new SearchBucket(this, SearchType.INVESTIGATION, request, sort, searchAfter); searches.put(uid, search); return luceneSearchResult("Investigation", search, searchAfter, maxResults); - } catch (Exception e) { + } catch (IOException | QueryNodeException e) { logger.error("Error", e); freeSearcher(uid); - if (e instanceof LuceneException) { - throw (LuceneException) e; - } throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } } @@ -1004,43 +1003,70 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s * documents and preventing normal modify operations until the index is * unlocked. * + * A check is also performed against the minId and maxId used for population. + * This ensures that no data is duplicated in the index. + * * @param entityName Name of the entity/index to lock. - * @param request Incoming request. In order to delete all existing - * documents, the accompanying Json should specify - * {"delete": true}. - * @throws LuceneException If already locked, or if there's an IOException when - * deleting documents. + * @param minId The exclusive minimum ICAT id being populated for. If + * Documents already exist with an id greater than this, the + * lock will fail. If null, treated as if it were + * Long.MIN_VALUE + * @param maxId The inclusive maximum ICAT id being populated for. If + * Documents already exist with an id less than or equal to + * this, the lock will fail. If null, treated as if it were + * Long.MAX_VALUE + * @param delete Whether to delete all existing Documents on the index. 
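The minId/maxId guard documented here reduces to a single point-range query over the id.long field, exclusive at the bottom and inclusive at the top. A sketch of the check in isolation:

    import org.apache.lucene.document.LongPoint;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.Query;
    import org.apache.lucene.search.TopDocs;

    public class RangeCheckSketch {
        // True if any indexed document has an id in (minId, maxId]; locking for
        // population must fail in that case to avoid duplicating documents.
        static boolean anyIdInRange(IndexSearcher searcher, long minId, long maxId)
                throws java.io.IOException {
            Query query = LongPoint.newRangeQuery("id.long", minId + 1, maxId);
            TopDocs topDocs = searcher.search(query, 1);
            return topDocs.scoreDocs.length != 0;
        }
    }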
+ * @throws LuceneException If already locked, if there's an IOException when + * deleting documents, or if the min/max id values are + * provided and Documents already exist in that range. */ @POST @Path("lock/{entityName}") - @Consumes(MediaType.APPLICATION_JSON) - @Produces(MediaType.APPLICATION_JSON) - public String lock(@PathParam("entityName") String entityName, @Context HttpServletRequest request) - throws LuceneException { - try (JsonReader reader = Json.createReader(request.getInputStream())) { - boolean delete = reader.readObject().getBoolean("delete", false); - logger.info("Requesting lock of {} index, delete={}", entityName, delete); - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); + public void lock(@PathParam("entityName") String entityName, @QueryParam("minId") Long minId, + @QueryParam("maxId") Long maxId, @QueryParam("delete") Boolean delete) throws LuceneException { + try { + entityName = entityName.toLowerCase(); + logger.info("Requesting lock of {} index, minId={}, maxId={}, delete={}", entityName, minId, maxId, delete); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); if (!bucket.locked.compareAndSet(false, true)) { String message = "Lucene already locked for " + entityName; throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, message); } - JsonObjectBuilder builder = Json.createObjectBuilder(); if (delete) { for (ShardBucket shardBucket : bucket.shardList) { shardBucket.indexWriter.deleteAll(); } // Reset the shardList so we reset the routing bucket.shardList = Arrays.asList(bucket.shardList.get(0)); - return builder.add("currentId", 0).build().toString(); + return; + } + + for (ShardBucket shardBucket : bucket.shardList) { + IndexSearcher searcher = shardBucket.searcherManager.acquire(); + Query query; + if (minId == null && maxId == null) { + query = new MatchAllDocsQuery(); + } else { + if (minId == null) { + minId = Long.MIN_VALUE; + } + if (maxId == null) { + maxId = Long.MAX_VALUE; + } + query = LongPoint.newRangeQuery("id.long", minId + 1, maxId); + } + TopDocs topDoc = searcher.search(query, 1); + if (topDoc.scoreDocs.length != 0) { + // If we have any results in the populating range, unlock and throw + bucket.locked.compareAndSet(true, false); + Document doc = searcher.doc(topDoc.scoreDocs[0].doc); + String id = doc.get("id"); + String message = "While locking index, found id " + id + " in specified range"; + logger.error(message); + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, message); + } } - ShardBucket shardBucket = bucket.routeShard(); - int docCount = shardBucket.documentCount.intValue(); - IndexSearcher searcher = shardBucket.searcherManager.acquire(); - String id = searcher.doc(docCount - 1).get("id"); - shardBucket.searcherManager.release(searcher); - return builder.add("currentId", new Long(id)).build().toString(); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } @@ -1209,23 +1235,25 @@ private String luceneSearchResult(String name, SearchBucket search, String searc } logger.debug("{} maxscore {}", totalHits, maxScore); ByteArrayOutputStream baos = new ByteArrayOutputStream(); + int shardIndex = -1; try (JsonGenerator gen = Json.createGenerator(baos)) { gen.writeStartObject(); gen.writeStartArray("results"); for (ScoreDoc hit : hits) { - encodeResult(name, gen, hit, searchers.get(hit.shardIndex), search); + shardIndex = hit.shardIndex; + encodeResult(name, gen, hit, 
searchers.get(shardIndex), search); } gen.writeEnd(); // array results if (hits.length == maxResults) { ScoreDoc lastDoc = hits[hits.length - 1]; - gen.writeStartObject("search_after").write("doc", lastDoc.doc).write("shardIndex", - lastDoc.shardIndex); + shardIndex = lastDoc.shardIndex; + gen.writeStartObject("search_after").write("doc", lastDoc.doc).write("shardIndex", shardIndex); float lastScore = lastDoc.score; if (!Float.isNaN(lastScore)) { gen.write("score", lastScore); } if (fields != null) { - Document lastDocument = searchers.get(lastDoc.shardIndex).doc(lastDoc.doc); + Document lastDocument = searchers.get(shardIndex).doc(lastDoc.doc); gen.writeStartArray("fields"); for (SortField sortField : fields) { String fieldName = sortField.getField(); @@ -1272,6 +1300,10 @@ private String luceneSearchResult(String name, SearchBucket search, String searc gen.writeEnd(); // end "search_after" object } gen.writeEnd(); // end enclosing object + } catch (ArrayIndexOutOfBoundsException e) { + String message = "Attempting to access searcher with shardIndex " + shardIndex + ", but only have " + + searchers.size() + " searchers in total"; + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, message); } logger.trace("Json returned {}", baos.toString()); return baos.toString(); @@ -1294,51 +1326,37 @@ private TopFieldDocs searchShards(SearchBucket search, int maxResults, List 1) { - List shardHits = new ArrayList<>(); - int doc = search.searchAfter != null ? search.searchAfter.doc : -1; - for (ShardBucket shard : shards) { - // Handle the possibility of some shards having a higher docCount than the doc - // id on searchAfter - int docCount = shard.documentCount.intValue(); - if (search.searchAfter != null) { - if (doc > docCount) { - search.searchAfter.doc = docCount - 1; - } else { - search.searchAfter.doc = doc; - } - } - - // Wrap Collector with TimeLimitingCollector - TopFieldCollector topFieldCollector = TopFieldCollector.create(search.sort, maxResults, - search.searchAfter, maxResults); - collector.setCollector(topFieldCollector); - - IndexSearcher indexSearcher = shard.searcherManager.acquire(); - indexSearcher.search(search.query, collector); - TopFieldDocs topDocs = topFieldCollector.topDocs(); - if (search.scored) { - TopFieldCollector.populateScores(topDocs.scoreDocs, indexSearcher, search.query); + List shardHits = new ArrayList<>(); + int doc = search.searchAfter != null ? 
search.searchAfter.doc : -1; + for (ShardBucket shard : shards) { + // Handle the possibility of some shards having a higher docCount than the doc + // id on searchAfter + int docCount = shard.documentCount.intValue(); + if (search.searchAfter != null) { + if (doc > docCount) { + search.searchAfter.doc = docCount - 1; + } else { + search.searchAfter.doc = doc; } } - + + // Wrap Collector with TimeLimitingCollector TopFieldCollector topFieldCollector = TopFieldCollector.create(search.sort, maxResults, search.searchAfter, maxResults); collector.setCollector(topFieldCollector); + + IndexSearcher indexSearcher = shard.searcherManager.acquire(); indexSearcher.search(search.query, collector); TopFieldDocs topDocs = topFieldCollector.topDocs(); if (search.scored) { TopFieldCollector.populateScores(topDocs.scoreDocs, indexSearcher, search.query); } shardHits.add(topDocs); } topFieldDocs = TopFieldDocs.merge(search.sort, 0, maxResults, shardHits.toArray(new TopFieldDocs[0]), true); return topFieldDocs; @@ -1487,6 +1505,7 @@ private void addSortField(JsonObject json, Document document, String key) { Long value = new Long(json.getString(key)); document.add(new NumericDocValuesField("id.long", value)); document.add(new StoredField("id.long", value)); + document.add(new LongPoint("id.long", value)); } if (DocumentMapping.longFields.contains(key)) { document.add(new NumericDocValuesField(key, json.getJsonNumber(key).longValueExact())); @@ -1515,6 +1534,7 @@ private void addSortField(IndexableField field, Document document) { Long value = new Long(field.stringValue()); document.add(new NumericDocValuesField("id.long", value)); document.add(new StoredField("id.long", value)); + document.add(new LongPoint("id.long", value)); } if (DocumentMapping.longFields.contains(key)) { document.add(new NumericDocValuesField(key, field.numericValue().longValue())); From 182b5e5caeb5b49fa1f094924d0b9eee3caa0fde Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 17 Aug 2022 13:51:37 +0000 Subject: [PATCH 53/73] Fix shardList not accepting new shards #19 --- src/main/java/org/icatproject/lucene/Lucene.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index d28720a..fd4f84e 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -1038,7 +1038,9 @@ public void lock(@PathParam("entityName") String entityName, @QueryParam("minId" shardBucket.indexWriter.deleteAll(); } // Reset the shardList so we reset the routing - bucket.shardList = Arrays.asList(bucket.shardList.get(0)); + ShardBucket shardBucket = bucket.shardList.get(0); + bucket.shardList = new ArrayList<>(); + bucket.shardList.add(shardBucket); return; } From d8d1e762251936bcf05d897a65e299e2d86967a6 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 17 Aug 2022 15:41:32 +0100 Subject: [PATCH 54/73] Move synonym analyzer to DocumentMapping #16 --- .../icatproject/lucene/DocumentMapping.java | 63 ++++++++++--------- .../java/org/icatproject/lucene/Lucene.java | 4 +- 2
files changed, 35 insertions(+), 32 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/DocumentMapping.java b/src/main/java/org/icatproject/lucene/DocumentMapping.java index 75500d4..2cf4848 100644 --- a/src/main/java/org/icatproject/lucene/DocumentMapping.java +++ b/src/main/java/org/icatproject/lucene/DocumentMapping.java @@ -1,12 +1,17 @@ package org.icatproject.lucene; +import java.io.IOException; +import java.text.ParseException; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; import java.util.Map; import java.util.Set; +import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; +import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler; +import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler.ConfigurationKeys; public class DocumentMapping { @@ -30,6 +35,8 @@ public ParentRelationship(String parentName, String joiningField, String... fiel } } + private static Analyzer analyzer; + public static final Set doubleFields = new HashSet<>(); public static final Set facetFields = new HashSet<>(); public static final Set longFields = new HashSet<>(); @@ -38,14 +45,24 @@ public ParentRelationship(String parentName, String joiningField, String... fiel public static final Set indexedEntities = new HashSet<>(); public static final Map relationships = new HashMap<>(); - public static final IcatAnalyzer analyzer = new IcatAnalyzer(); - public static final StandardQueryParser genericParser = new StandardQueryParser(); - public static final StandardQueryParser datafileParser = new StandardQueryParser(); - public static final StandardQueryParser datasetParser = new StandardQueryParser(); - public static final StandardQueryParser investigationParser = new StandardQueryParser(); - public static final StandardQueryParser sampleParser = new StandardQueryParser(); + public static final StandardQueryParser genericParser = buildParser(); + public static final StandardQueryParser datafileParser = buildParser("name", "description", "location", "datafileFormat.name", "visitId", + "sample.name", "sample.type.name", "doi"); + public static final StandardQueryParser datasetParser = buildParser("name", "description", "sample.name", "sample.type.name", "type.name", + "visitId", "doi"); + public static final StandardQueryParser investigationParser = buildParser("name", "visitId", "title", "summary", "facility.name", + "type.name", "doi"); + public static final StandardQueryParser sampleParser = buildParser("sample.name", "sample.type.name"); static { + try { + // Attempt init an Analyzer which injects synonyms for searching + analyzer = new IcatSynonymAnalyzer(); + } catch (IOException | ParseException e) { + // If synonym files cannot be parsed, default to using the same analyzer as for writing + analyzer = new IcatAnalyzer(); + } + doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI", "rangeTop", "rangeTopSI", "rangeBottom", "rangeBottomSI")); facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name", "stringValue")); @@ -99,31 +116,17 @@ public ParentRelationship(String parentName, String joiningField, String... 
fiel new ParentRelationship("datafile", "investigation.id", "investigation.name", "visitId") }); relationships.put("Dataset", new ParentRelationship[] { new ParentRelationship("Datafile", "dataset.id", "dataset.name") }); + } - genericParser.setAllowLeadingWildcard(true); - genericParser.setAnalyzer(analyzer); - - CharSequence[] datafileFields = { "name", "description", "location", "datafileFormat.name", "visitId", - "sample.name", "sample.type.name", "doi" }; - datafileParser.setAllowLeadingWildcard(true); - datafileParser.setAnalyzer(analyzer); - datafileParser.setMultiFields(datafileFields); - - CharSequence[] datasetFields = { "name", "description", "sample.name", "sample.type.name", "type.name", - "visitId", "doi" }; - datasetParser.setAllowLeadingWildcard(true); - datasetParser.setAnalyzer(analyzer); - datasetParser.setMultiFields(datasetFields); - - CharSequence[] investigationFields = { "name", "visitId", "title", "summary", "facility.name", - "type.name", "doi" }; - investigationParser.setAllowLeadingWildcard(true); - investigationParser.setAnalyzer(analyzer); - investigationParser.setMultiFields(investigationFields); + private static StandardQueryParser buildParser(String... defaultFields) { + StandardQueryParser parser = new StandardQueryParser(); + StandardQueryConfigHandler qpConf = (StandardQueryConfigHandler) parser.getQueryConfigHandler(); + qpConf.set(ConfigurationKeys.ANALYZER, analyzer); + qpConf.set(ConfigurationKeys.ALLOW_LEADING_WILDCARD, true); + if (defaultFields.length > 0) { + qpConf.set(ConfigurationKeys.MULTI_FIELDS, defaultFields); + } - CharSequence[] sampleFields = { "sample.name", "sample.type.name" }; - sampleParser.setAllowLeadingWildcard(true); - sampleParser.setAnalyzer(analyzer); - sampleParser.setMultiFields(sampleFields); + return parser; } } diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index fd4f84e..442b866 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -7,7 +7,6 @@ import java.nio.file.FileVisitOption; import java.nio.file.Files; import java.util.ArrayList; -import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; @@ -124,7 +123,7 @@ private class ShardBucket { */ public ShardBucket(java.nio.file.Path shardPath) throws IOException { directory = FSDirectory.open(shardPath); - IndexWriterConfig config = new IndexWriterConfig(DocumentMapping.analyzer); + IndexWriterConfig config = new IndexWriterConfig(analyzer); indexWriter = new IndexWriter(directory, config); String[] files = directory.listAll(); if (files.length == 1 && files[0].equals("write.lock")) { @@ -341,6 +340,7 @@ public void releaseSearchers(List subSearchers) throws IOExceptio static final Logger logger = LoggerFactory.getLogger(Lucene.class); private static final Marker fatal = MarkerFactory.getMarker("FATAL"); + private static final IcatAnalyzer analyzer = new IcatAnalyzer(); private final FacetsConfig facetsConfig = new FacetsConfig(); From 32c2f335fe3c0cc150f6fd8e16039b3733794f35 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 7 Sep 2022 22:50:50 +0100 Subject: [PATCH 55/73] Add support for faceting DatasetTechnique #18 --- .../java/org/icatproject/lucene/DocumentMapping.java | 12 ++++++++---- src/main/java/org/icatproject/lucene/Lucene.java | 9 ++++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/DocumentMapping.java 
b/src/main/java/org/icatproject/lucene/DocumentMapping.java index 75500d4..95c2692 100644 --- a/src/main/java/org/icatproject/lucene/DocumentMapping.java +++ b/src/main/java/org/icatproject/lucene/DocumentMapping.java @@ -48,7 +48,7 @@ public ParentRelationship(String parentName, String joiningField, String... fiel static { doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI", "rangeTop", "rangeTopSI", "rangeBottom", "rangeBottomSI")); - facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name", "stringValue")); + facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name", "stringValue", "technique.name")); longFields.addAll( Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate", "fileSize", "fileCount")); @@ -58,11 +58,12 @@ public ParentRelationship(String parentName, String joiningField, String... fiel "numericValueSI", "fileSize", "fileCount")); textFields.addAll(Arrays.asList("name", "visitId", "description", "location", "dataset.name", "investigation.name", "instrument.name", "instrument.fullName", "datafileFormat.name", "sample.name", - "sample.type.name", "title", "summary", "facility.name", "user.fullName", "type.name", "doi")); + "sample.type.name", "technique.name", "technique.description", "technique.pid", "title", "summary", + "facility.name", "user.fullName", "type.name", "doi")); indexedEntities.addAll(Arrays.asList("Datafile", "Dataset", "Investigation", "DatafileParameter", - "DatasetParameter", "InstrumentScientist", "InvestigationInstrument", "InvestigationParameter", - "InvestigationUser", "Sample", "SampleParameter")); + "DatasetParameter", "DatasetTechnique", "InstrumentScientist", "InvestigationInstrument", + "InvestigationParameter", "InvestigationUser", "Sample", "SampleParameter")); relationships.put("Instrument", new ParentRelationship[] { new ParentRelationship("InvestigationInstrument", "instrument.id", @@ -92,6 +93,9 @@ public ParentRelationship(String parentName, String joiningField, String... 
fiel new ParentRelationship("DatasetParameter", "type.id", "type.name"), new ParentRelationship("InvestigationParameter", "type.id", "type.name"), new ParentRelationship("SampleParameter", "type.id", "type.name") }); + relationships.put("Technique", + new ParentRelationship[] { new ParentRelationship("DatasetTechnique", "technique.id", "technique.name", + "technique.description", "technique.pid") }); relationships.put("Investigation", new ParentRelationship[] { new ParentRelationship("Dataset", "investigation.id", "investigation.name", diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index fd4f84e..172fc0a 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -147,7 +147,7 @@ public ShardBucket(java.nio.file.Path shardPath) throws IOException { } finally { searcherManager.release(indexSearcher); } - logger.info("Created ShardBucket for {} with {} Documents", directory, numDocs); + logger.info("Created ShardBucket for directory {} with {} Documents", directory.getDirectory(), numDocs); } /** @@ -905,7 +905,7 @@ public IndexSearcher getSearcher(Map> searcherMap, S throws IOException, LuceneException { List subSearchers = searcherMap.get(name); subSearchers = getSearchers(searcherMap, name); - if (subSearchers.size() > 1) { + if (subSearchers.size() != 1) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Cannot get single IndexSearcher for " + name + " as it has " + subSearchers.size() + " shards"); } @@ -1025,9 +1025,8 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s public void lock(@PathParam("entityName") String entityName, @QueryParam("minId") Long minId, @QueryParam("maxId") Long maxId, @QueryParam("delete") Boolean delete) throws LuceneException { try { - entityName = entityName.toLowerCase(); logger.info("Requesting lock of {} index, minId={}, maxId={}, delete={}", entityName, minId, maxId, delete); - IndexBucket bucket = indexBuckets.computeIfAbsent(entityName, k -> new IndexBucket(k)); + IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); if (!bucket.locked.compareAndSet(false, true)) { String message = "Lucene already locked for " + entityName; @@ -1038,8 +1037,8 @@ public void lock(@PathParam("entityName") String entityName, @QueryParam("minId" shardBucket.indexWriter.deleteAll(); } // Reset the shardList so we reset the routing - bucket.shardList = new ArrayList<>(); ShardBucket shardBucket = bucket.shardList.get(0); + bucket.shardList = new ArrayList<>(); bucket.shardList.add(shardBucket); return; } From d051925cda5a4d6897fd25376c259b3d2ab6d554 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Fri, 9 Sep 2022 05:01:10 +0100 Subject: [PATCH 56/73] Update version #18 --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index ae6d0c3..26467e5 100755 --- a/pom.xml +++ b/pom.xml @@ -3,7 +3,7 @@ org.icatproject icat.lucene - 1.1.2 + 2.0.0-SNAPSHOT war ICAT Lucene @@ -101,7 +101,7 @@ org.icatproject icat.utils - 4.16.2-SNAPSHOT + 4.17.0-SNAPSHOT From 2e359eee51a11c92070367a472f8c3fc8e4d6f1c Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Thu, 29 Sep 2022 20:42:35 +0100 Subject: [PATCH 57/73] Refactor Field and large Lucene functions #18 --- .../java/org/icatproject/lucene/Field.java | 193 ++++++ .../org/icatproject/lucene/IcatAnalyzer.java | 13 - .../java/org/icatproject/lucene/Lucene.java | 626 
+++++++++--------- .../org/icatproject/lucene/SearchBucket.java | 118 ++-- 4 files changed, 552 insertions(+), 398 deletions(-) create mode 100644 src/main/java/org/icatproject/lucene/Field.java diff --git a/src/main/java/org/icatproject/lucene/Field.java b/src/main/java/org/icatproject/lucene/Field.java new file mode 100644 index 0000000..966332e --- /dev/null +++ b/src/main/java/org/icatproject/lucene/Field.java @@ -0,0 +1,193 @@ +package org.icatproject.lucene; + +import javax.json.JsonObject; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.DoublePoint; +import org.apache.lucene.document.LongPoint; +import org.apache.lucene.document.NumericDocValuesField; +import org.apache.lucene.document.SortedDocValuesField; +import org.apache.lucene.document.StoredField; +import org.apache.lucene.document.StringField; +import org.apache.lucene.document.TextField; +import org.apache.lucene.document.Field.Store; +import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField; +import org.apache.lucene.index.IndexableField; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.NumericUtils; + +/** + * Wrapper for the name, value and type (String/Text, long, double) of a field + * to be added to a Lucene Document. + */ +class Field { + + private abstract class InnerField { + + public abstract Document addSortable(Document document) throws NumberFormatException; + + public abstract Document addToDocument(Document document) throws NumberFormatException; + + } + + private class InnerStringField extends InnerField { + + private String value; + + public InnerStringField(String value) { + this.value = value; + } + + @Override + public Document addSortable(Document document) throws NumberFormatException { + if (DocumentMapping.sortFields.contains(name)) { + if (name.equals("id")) { + // Id is a special case, as we need it to be SORTED as a byte ref to allow joins + // but also SORTED_NUMERIC to ensure a deterministic order to results + Long longValue = new Long(value); + document.add(new NumericDocValuesField("id.long", longValue)); + document.add(new StoredField("id.long", longValue)); + document.add(new LongPoint("id.long", longValue)); + } + document.add(new SortedDocValuesField(name, new BytesRef(value))); + } + return document; + } + + @Override + public Document addToDocument(Document document) throws NumberFormatException { + addSortable(document); + + if (DocumentMapping.facetFields.contains(name)) { + document.add(new SortedSetDocValuesFacetField(name + ".keyword", value)); + document.add(new StringField(name + ".keyword", value, Store.NO)); + } + + if (DocumentMapping.textFields.contains(name)) { + document.add(new TextField(name, value, Store.YES)); + } else { + document.add(new StringField(name, value, Store.YES)); + } + + return document; + } + + } + + private class InnerLongField extends InnerField { + + private long value; + + public InnerLongField(long value) { + this.value = value; + } + + @Override + public Document addSortable(Document document) throws NumberFormatException { + if (DocumentMapping.sortFields.contains(name)) { + document.add(new NumericDocValuesField(name, value)); + } + return document; + } + + @Override + public Document addToDocument(Document document) throws NumberFormatException { + addSortable(document); + document.add(new LongPoint(name, value)); + document.add(new StoredField(name, value)); + return document; + } + + } + + private class InnerDoubleField extends InnerField { + + private double value; + + public 
InnerDoubleField(double value) { + this.value = value; + } + + @Override + public Document addSortable(Document document) throws NumberFormatException { + if (DocumentMapping.sortFields.contains(name)) { + long sortableLong = NumericUtils.doubleToSortableLong(value); + document.add(new NumericDocValuesField(name, sortableLong)); + } + return document; + } + + @Override + public Document addToDocument(Document document) throws NumberFormatException { + addSortable(document); + document.add(new DoublePoint(name, value)); + document.add(new StoredField(name, value)); + return document; + } + + } + + private String name; + private InnerField innerField; + + /** + * Creates a wrapper for a Field. + * + * @param object JsonObject containing representations of multiple fields + * @param key Key of a specific field in object + */ + public Field(JsonObject object, String key) { + name = key; + if (DocumentMapping.doubleFields.contains(name)) { + innerField = new InnerDoubleField(object.getJsonNumber(name).doubleValue()); + } else if (DocumentMapping.longFields.contains(name)) { + innerField = new InnerLongField(object.getJsonNumber(name).longValueExact()); + } else { + innerField = new InnerStringField(object.getString(name)); + } + } + + /** + * Creates a wrapper for a Field. + * + * @param indexableField A Lucene IndexableField + */ + public Field(IndexableField indexableField) { + name = indexableField.name(); + if (DocumentMapping.doubleFields.contains(name)) { + innerField = new InnerDoubleField(indexableField.numericValue().doubleValue()); + } else if (DocumentMapping.longFields.contains(name)) { + innerField = new InnerLongField(indexableField.numericValue().longValue()); + } else { + innerField = new InnerStringField(indexableField.stringValue()); + } + } + + /** + * Adds a sortable field to the passed document. This only accounts for sorting, + * if storage and searchability are also needed, see {@link #addToDocument}. The + * exact implementation depends on whether this is a String, long or double + * field. + * + * @param document The document to add to + * @return The original document with this field added to it + * @throws NumberFormatException + */ + public Document addSortable(Document document) throws NumberFormatException { + return innerField.addSortable(document); + } + + /** + * Adds this field to the passed document. This accounts for sortable and + * facetable fields. The exact implementation depends on whether this is a + * String, long or double field. 
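For reference, a minimal standalone sketch of the indexing pattern that InnerDoubleField encapsulates (the class and method names below are illustrative, not part of this patch): a double value needs three complementary Lucene fields to be range-searchable, retrievable and sortable.

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.DoublePoint;
    import org.apache.lucene.document.NumericDocValuesField;
    import org.apache.lucene.document.StoredField;
    import org.apache.lucene.util.NumericUtils;

    class DoubleFieldSketch {
        static Document withDouble(String name, double value) {
            Document document = new Document();
            document.add(new DoublePoint(name, value)); // indexed for exact/range queries
            document.add(new StoredField(name, value)); // stored for retrieval with results
            // doubles cannot back a numeric sort directly; encode as a sortable long
            document.add(new NumericDocValuesField(name, NumericUtils.doubleToSortableLong(value)));
            return document;
        }
    }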
+ * + * @param document The document to add to + * @return The original document with this field added to it + * @throws NumberFormatException + */ + public Document addToDocument(Document document) throws NumberFormatException { + return innerField.addToDocument(document); + } + +} diff --git a/src/main/java/org/icatproject/lucene/IcatAnalyzer.java b/src/main/java/org/icatproject/lucene/IcatAnalyzer.java index fcae1c9..5a7da51 100755 --- a/src/main/java/org/icatproject/lucene/IcatAnalyzer.java +++ b/src/main/java/org/icatproject/lucene/IcatAnalyzer.java @@ -8,21 +8,8 @@ import org.apache.lucene.analysis.en.EnglishAnalyzer; import org.apache.lucene.analysis.en.EnglishPossessiveFilter; import org.apache.lucene.analysis.en.PorterStemFilter; -// import org.apache.lucene.analysis.standard.StandardAnalyzer ; import org.apache.lucene.analysis.standard.StandardTokenizer; -// public class IcatAnalyzer extends Analyzer { - -// @Override -// protected TokenStreamComponents createComponents(String fieldName) { -// StandardAnalyzer analyzer = new StandardAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET); -// Analyzer.TokenStreamComponents stream = analyzer.createComponents(fieldName); -// sink = new EnglishPossessiveFilter(stream.getTokenStream()); -// sink = new PorterStemFilter(sink); -// return new TokenStreamComponents(source, sink); -// } -// } - public class IcatAnalyzer extends Analyzer { @Override diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 172fc0a..41e3a9d 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -7,7 +7,6 @@ import java.nio.file.FileVisitOption; import java.nio.file.Files; import java.util.ArrayList; -import java.util.Arrays; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; @@ -50,10 +49,8 @@ import org.apache.lucene.document.Field.Store; import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.NumericDocValuesField; -import org.apache.lucene.document.SortedDocValuesField; import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; -import org.apache.lucene.document.TextField; import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; @@ -64,7 +61,6 @@ import org.apache.lucene.facet.range.LongRangeFacetCounts; import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState; import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts; -import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; @@ -87,7 +83,6 @@ import org.apache.lucene.search.TopFieldDocs; import org.apache.lucene.search.TotalHits; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.Counter; import org.apache.lucene.util.NumericUtils; import org.icatproject.lucene.SearchBucket.SearchType; @@ -246,7 +241,7 @@ public void updateDocument(Term term, Document document) throws IOException { * Creates a new ShardBucket and stores it in the shardMap. * * @param shardKey The identifier for the new shard to be created. For - * simplicity, should a Long starting at 0 and incrementing by 1 + * simplicity, should an int starting at 0 and incrementing by 1 * for each new shard. 
* @return A new ShardBucket with the provided shardKey. * @throws IOException @@ -346,7 +341,7 @@ public void releaseSearchers(List subSearchers) throws IOExceptio private java.nio.file.Path luceneDirectory; private int luceneCommitMillis; - private Long luceneMaxShardSize; + private long luceneMaxShardSize; private long maxSearchTimeSeconds; private boolean aggregateFiles; @@ -427,7 +422,7 @@ public void addNow(@Context HttpServletRequest request, @PathParam("entityName") createNow(entityName, document); } } catch (JsonException e) { - logger.error("Could not parse JSON from {}", value.toString()); + logger.error("Could not parse JSON from {}", value); throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); @@ -500,7 +495,7 @@ private void create(JsonObject operationBody) throws NumberFormatException, IOEx if (DocumentMapping.indexedEntities.contains(entityName)) { JsonObject documentObject = operationBody.getJsonObject("doc"); Document document = parseDocument(documentObject); - logger.trace("create {} {}", entityName, document.toString()); + logger.trace("create {} {}", entityName, document); IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); if (bucket.locked.get()) { throw new LuceneException(HttpURLConnection.HTTP_NOT_ACCEPTABLE, @@ -553,23 +548,8 @@ private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFi Set prunedFields = new HashSet<>(); List fieldsToAdd = new ArrayList<>(); - if (deltaFileSize != 0) { - prunedFields.add("fileSize"); - long oldSize = document.getField("fileSize").numericValue().longValue(); - long newSize = oldSize + deltaFileSize; - fieldsToAdd.add(new LongPoint("fileSize", newSize)); - fieldsToAdd.add(new StoredField("fileSize", newSize)); - fieldsToAdd.add(new NumericDocValuesField("fileSize", newSize)); - } - - if (deltaFileCount != 0) { - prunedFields.add("fileCount"); - long oldCount = document.getField("fileCount").numericValue().longValue(); - long newCount = oldCount + deltaFileCount; - fieldsToAdd.add(new LongPoint("fileCount", newCount)); - fieldsToAdd.add(new StoredField("fileCount", newCount)); - fieldsToAdd.add(new NumericDocValuesField("fileCount", newCount)); - } + incrementFileStatistic("fileSize", deltaFileSize, document, prunedFields, fieldsToAdd); + incrementFileStatistic("fileCount", deltaFileCount, document, prunedFields, fieldsToAdd); Document newDocument = pruneDocument(prunedFields, document); fieldsToAdd.forEach(field -> newDocument.add(field)); @@ -582,6 +562,33 @@ private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFi } } + /** + * Increments a field relating to file statistics (count, size) as part of the + * update on a Document. + * + * @param statisticName Name of the field to increment, i.e. fileCount or + * fileSize. + * @param statisticDelta Change in the value of the named statistic. + * @param document Lucene Document containing the old statistic value to + * be incremented. + * @param prunedFields Set of fields which need to be removed from the old + * Document. If the statistic is incremented, this will + * have statisticName added to it. + * @param fieldsToAdd List of Lucene IndexableFields to add to the new + * Document. 
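Since Lucene documents are immutable, incrementFileStatistic has to rebuild the statistic rather than mutate it in place. A rough sketch of the remove-then-re-add pattern, assuming a stored long field named "fileSize" (copying the remaining fields over verbatim is schematic here; the production code prunes and re-adds them explicitly):

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.LongPoint;
    import org.apache.lucene.document.NumericDocValuesField;
    import org.apache.lucene.document.StoredField;
    import org.apache.lucene.index.IndexableField;

    class FileStatisticSketch {
        static Document incremented(Document oldDocument, long delta) {
            long newValue = oldDocument.getField("fileSize").numericValue().longValue() + delta;
            Document newDocument = new Document();
            for (IndexableField field : oldDocument.getFields()) {
                if (!field.name().equals("fileSize")) {
                    newDocument.add(field); // carry over the fields that are not changing
                }
            }
            newDocument.add(new LongPoint("fileSize", newValue)); // searchable by range
            newDocument.add(new StoredField("fileSize", newValue)); // retrievable
            newDocument.add(new NumericDocValuesField("fileSize", newValue)); // sortable
            return newDocument;
        }
    }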
+ */ + private void incrementFileStatistic(String statisticName, long statisticDelta, Document document, + Set prunedFields, List fieldsToAdd) { + if (statisticDelta != 0) { + prunedFields.add(statisticName); + long oldValue = document.getField(statisticName).numericValue().longValue(); + long newValue = oldValue + statisticDelta; + fieldsToAdd.add(new LongPoint(statisticName, newValue)); + fieldsToAdd.add(new StoredField(statisticName, newValue)); + fieldsToAdd.add(new NumericDocValuesField(statisticName, newValue)); + } + } + /** * Creates a new Lucene document. * @@ -594,7 +601,7 @@ private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFi private void createNow(String entityName, JsonObject documentJson) throws NumberFormatException, IOException, LuceneException { Document document = parseDocument(documentJson); - logger.trace("create {} {}", entityName, document.toString()); + logger.trace("create {} {}", entityName, document); IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); bucket.addDocument(facetsConfig.build(document)); } @@ -617,17 +624,7 @@ private void createNow(String entityName, JsonObject documentJson) @Path("datafile") public String datafiles(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { - Long uid = null; - try { - uid = bucketNum.getAndIncrement(); - SearchBucket search = new SearchBucket(this, SearchType.DATAFILE, request, sort, searchAfter); - searches.put(uid, search); - return luceneSearchResult("Datafile", search, searchAfter, maxResults); - } catch (IOException | QueryNodeException e) { - logger.error("Error", e); - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } + return searchEntity(request, searchAfter, maxResults, sort, SearchType.DATAFILE); } /** @@ -648,19 +645,7 @@ public String datafiles(@Context HttpServletRequest request, @QueryParam("search @Path("dataset") public String datasets(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { - - Long uid = null; - try { - uid = bucketNum.getAndIncrement(); - SearchBucket search = new SearchBucket(this, SearchType.DATASET, request, sort, searchAfter); - searches.put(uid, search); - return luceneSearchResult("Dataset", search, searchAfter, maxResults); - } catch (IOException | QueryNodeException e) { - logger.error("Error", e); - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } - + return searchEntity(request, searchAfter, maxResults, sort, SearchType.DATASET); } /** @@ -689,14 +674,13 @@ private void delete(JsonObject operationBody) throws LuceneException, IOExceptio // Special case for filesizes Term term = new Term("id", icatId); if (aggregateFiles && entityName.equals("Datafile")) { - long sizeToSubtract = 0; for (ShardBucket shardBucket : bucket.shardList) { IndexSearcher datafileSearcher = shardBucket.searcherManager.acquire(); TopDocs topDocs = datafileSearcher.search(new TermQuery(term), 1); if (topDocs.totalHits.value == 1) { int docId = topDocs.scoreDocs[0].doc; Document datasetDocument = datafileSearcher.doc(docId); - sizeToSubtract = datasetDocument.getField("fileSize").numericValue().longValue(); + long sizeToSubtract = 
datasetDocument.getField("fileSize").numericValue().longValue(); if (sizeToSubtract > 0) { String datasetId = datasetDocument.getField("dataset.id").stringValue(); String investigationId = datasetDocument.getField("investigation.id").stringValue(); @@ -753,7 +737,7 @@ private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, In String parentId; if (joinedEntityName.toLowerCase().contains("investigation")) { fld = "investigation.id"; - if (entityName.toLowerCase().equals("investigation")) { + if (entityName.equalsIgnoreCase("investigation")) { parentId = document.get("id"); } else { parentId = document.get("investigation.id"); @@ -985,17 +969,7 @@ public void run() { @Path("investigation") public String investigations(@Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, @QueryParam("maxResults") int maxResults, @QueryParam("sort") String sort) throws LuceneException { - Long uid = null; - try { - uid = bucketNum.getAndIncrement(); - SearchBucket search = new SearchBucket(this, SearchType.INVESTIGATION, request, sort, searchAfter); - searches.put(uid, search); - return luceneSearchResult("Investigation", search, searchAfter, maxResults); - } catch (IOException | QueryNodeException e) { - logger.error("Error", e); - freeSearcher(uid); - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); - } + return searchEntity(request, searchAfter, maxResults, sort, SearchType.INVESTIGATION); } /** @@ -1023,7 +997,7 @@ public String investigations(@Context HttpServletRequest request, @QueryParam("s @POST @Path("lock/{entityName}") public void lock(@PathParam("entityName") String entityName, @QueryParam("minId") Long minId, - @QueryParam("maxId") Long maxId, @QueryParam("delete") Boolean delete) throws LuceneException { + @QueryParam("maxId") Long maxId, @QueryParam("delete") boolean delete) throws LuceneException { try { logger.info("Requesting lock of {} index, minId={}, maxId={}, delete={}", entityName, minId, maxId, delete); IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); @@ -1111,65 +1085,9 @@ private String luceneFacetResult(String name, SearchBucket search, String search TopDocs results = FacetsCollector.search(indexSearcher, search.query, maxResults, facetsCollector); logger.debug("{}", results.totalHits); for (FacetedDimension facetedDimension : search.dimensions.values()) { - if (facetedDimension.getRanges().size() > 0) { - logger.debug("Ranges: {}", facetedDimension.getRanges().get(0).getClass().getSimpleName()); - // Perform range based facets for a numeric field - String dimension = facetedDimension.getDimension(); - Facets facets; - if (DocumentMapping.longFields.contains(dimension)) { - LongRange[] ranges = facetedDimension.getRanges().toArray(new LongRange[0]); - facets = new LongRangeFacetCounts(dimension, facetsCollector, ranges); - } else if (DocumentMapping.doubleFields.contains(dimension)) { - DoubleRange[] ranges = facetedDimension.getRanges().toArray(new DoubleRange[0]); - facets = new DoubleRangeFacetCounts(dimension, facetsCollector, ranges); - } else { - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "'ranges' specified for dimension " + dimension - + " but this is not a supported numeric field"); - } - FacetResult facetResult = facets.getTopChildren(maxLabels, dimension); - facetedDimension.addResult(facetResult); - } else { - // Have a specific string dimension to facet, but these should all be done at - // once for efficiency - 
facetStrings = true; - } - } - try { - if (sparse) { - // Facet all applicable string fields - DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState( - indexSearcher.getIndexReader()); - Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); - addFacetResults(maxLabels, search.dimensions, facets); - logger.trace("Sparse faceting found results for {} dimensions", search.dimensions.size()); - } else if (facetStrings) { - // Only add facets to the results if they match one of the requested dimensions - DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState( - indexSearcher.getIndexReader()); - Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); - List facetResults = facets.getAllDims(maxLabels); - for (FacetResult facetResult : facetResults) { - String dimension = facetResult.dim.replace(".keyword", ""); - FacetedDimension facetedDimension = search.dimensions.get(dimension); - logger.debug("String facets found for {}, requested dimensions were {}", dimension, - search.dimensions.keySet()); - if (facetedDimension != null) { - facetedDimension.addResult(facetResult); - } - } - } - } catch (IllegalArgumentException e) { - // This can occur if no fields in the index have been faceted - logger.error( - "No facets found in index, resulting in error: " + e.getClass() + " " + e.getMessage()); - } catch (IllegalStateException e) { - // This can occur if we do not create the IndexSearcher from the same - // DirectoryReader as we used to create the state - logger.error("IndexSearcher used is not based on the DirectoryReader used for facet counting: " - + e.getClass() + " " + e.getMessage()); - throw e; + facetStrings = facetRanges(maxLabels, facetStrings, facetsCollector, facetedDimension); } + facetStrings(search, maxLabels, sparse, facetStrings, indexSearcher, facetsCollector); } } // Build results @@ -1180,6 +1098,101 @@ private String luceneFacetResult(String name, SearchBucket search, String search return aggregations; } + /** + * Performs range based faceting on the provided facetedDimension, if possible. + * + * @param maxLabels The maximum number of labels to collect for each + * facet + * @param facetStrings Whether there are String dimensions that will need + * faceting later + * @param facetsCollector Lucene FacetsCollector used to count results + * @param facetedDimension Representation of the dimension to facet, and used to + * store the results of the faceting + * @return If a string dimension was encountered, returns true. Otherwise, + * returns the value of facetStrings originally passed. 
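For context, a condensed sketch of the Lucene range-faceting API that facetRanges delegates to, using hypothetical labels and bounds on a long field such as fileSize:

    import org.apache.lucene.facet.FacetResult;
    import org.apache.lucene.facet.Facets;
    import org.apache.lucene.facet.FacetsCollector;
    import org.apache.lucene.facet.range.LongRange;
    import org.apache.lucene.facet.range.LongRangeFacetCounts;
    import org.apache.lucene.search.IndexSearcher;
    import org.apache.lucene.search.MatchAllDocsQuery;

    class RangeFacetSketch {
        static FacetResult fileSizeFacets(IndexSearcher searcher, int maxLabels) throws Exception {
            FacetsCollector facetsCollector = new FacetsCollector();
            FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, facetsCollector);
            LongRange[] ranges = {
                    new LongRange("under 1 MB", 0, true, 1_000_000, false),
                    new LongRange("1 MB and over", 1_000_000, true, Long.MAX_VALUE, true) };
            Facets facets = new LongRangeFacetCounts("fileSize", facetsCollector, ranges);
            return facets.getTopChildren(maxLabels, "fileSize");
        }
    }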
+ * @throws IOException + * @throws LuceneException + */ + private boolean facetRanges(int maxLabels, boolean facetStrings, FacetsCollector facetsCollector, + FacetedDimension facetedDimension) throws IOException, LuceneException { + if (facetedDimension.getRanges().size() > 0) { + logger.debug("Ranges: {}", facetedDimension.getRanges().get(0).getClass().getSimpleName()); + // Perform range based facets for a numeric field + String dimension = facetedDimension.getDimension(); + Facets facets; + if (DocumentMapping.longFields.contains(dimension)) { + LongRange[] ranges = facetedDimension.getRanges().toArray(new LongRange[0]); + facets = new LongRangeFacetCounts(dimension, facetsCollector, ranges); + } else if (DocumentMapping.doubleFields.contains(dimension)) { + DoubleRange[] ranges = facetedDimension.getRanges().toArray(new DoubleRange[0]); + facets = new DoubleRangeFacetCounts(dimension, facetsCollector, ranges); + } else { + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, + "'ranges' specified for dimension " + dimension + + " but this is not a supported numeric field"); + } + FacetResult facetResult = facets.getTopChildren(maxLabels, dimension); + facetedDimension.addResult(facetResult); + } else { + // Have a specific string dimension to facet, but these should all be done at + // once for efficiency + facetStrings = true; + } + return facetStrings; + } + + /** + * Performs String based faceting. Either this will be sparse (all fields + * targeted) or it will occur for specific fields only. + * + * @param search Bucket being used for this search + * @param maxLabels The maximum number of labels to collect for each facet + * @param sparse Whether to perform sparse faceting (faceting across + * all String fields) + * @param facetStrings Whether specific String dimensions should be faceted + * @param indexSearcher Lucene IndexSearcher used to generate the ReaderState + * @param facetsCollector Lucene FacetsCollector used to count results + * @throws IOException + */ + private void facetStrings(SearchBucket search, int maxLabels, boolean sparse, boolean facetStrings, + IndexSearcher indexSearcher, FacetsCollector facetsCollector) throws IOException { + try { + if (sparse) { + // Facet all applicable string fields + DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState( + indexSearcher.getIndexReader()); + Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); + addFacetResults(maxLabels, search.dimensions, facets); + logger.trace("Sparse faceting found results for {} dimensions", search.dimensions.size()); + } else if (facetStrings) { + // Only add facets to the results if they match one of the requested dimensions + DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState( + indexSearcher.getIndexReader()); + Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); + List facetResults = facets.getAllDims(maxLabels); + for (FacetResult facetResult : facetResults) { + String dimension = facetResult.dim.replace(".keyword", ""); + FacetedDimension facetedDimension = search.dimensions.get(dimension); + logger.debug("String facets found for {}, requested dimensions were {}", dimension, + search.dimensions.keySet()); + if (facetedDimension != null) { + facetedDimension.addResult(facetResult); + } + } + } + } catch (IllegalArgumentException e) { + // This can occur if no fields in the index have been faceted + logger.error( + "No facets found in index, resulting in error: " + 
e.getClass() + " " + e.getMessage()); + } catch (IllegalStateException e) { + // This can occur if we do not create the IndexSearcher from the same + // DirectoryReader as we used to create the state + logger.error("IndexSearcher used is not based on the DirectoryReader used for facet counting: " + + e.getClass() + " " + e.getMessage()); + throw e; + } + } + /** * Add Facets for all dimensions. This will create FacetDimension Objects if they * do not already exist in the facetedDimensionMap, otherwise the counts for @@ -1207,6 +1220,36 @@ private void addFacetResults(int maxLabels, Map facete } } + /** + * Perform search on the specified entity/index. + * + * @param request Incoming Http request containing the query as Json. + * @param searchAfter String of Json representing the last Lucene Document from + * a previous search. + * @param maxResults The maximum number of results to include in the returned + * Json. + * @param sort String of Json representing the sort criteria. + * @param searchType The type of search query to build, corresponding to one of + * the main entities. + * @return String of Json representing the results of the search. + * @throws LuceneException + */ + private String searchEntity(HttpServletRequest request, String searchAfter, int maxResults, String sort, + SearchType searchType) + throws LuceneException { + Long uid = null; + try { + uid = bucketNum.getAndIncrement(); + SearchBucket search = new SearchBucket(this, searchType, request, sort, searchAfter); + searches.put(uid, search); + return luceneSearchResult(searchType.toString(), search, searchAfter, maxResults); + } catch (IOException | QueryNodeException e) { + logger.error("Error", e); + freeSearcher(uid); + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } + } + /** * Perform search on name. * @@ -1235,79 +1278,7 @@ private String luceneSearchResult(String name, SearchBucket search, String searc maxScore = hits[0].score; } logger.debug("{} maxscore {}", totalHits, maxScore); - ByteArrayOutputStream baos = new ByteArrayOutputStream(); - int shardIndex = -1; - try (JsonGenerator gen = Json.createGenerator(baos)) { - gen.writeStartObject(); - gen.writeStartArray("results"); - for (ScoreDoc hit : hits) { - shardIndex = hit.shardIndex; - encodeResult(name, gen, hit, searchers.get(shardIndex), search); - } - gen.writeEnd(); // array results - if (hits.length == maxResults) { - ScoreDoc lastDoc = hits[hits.length - 1]; - shardIndex = lastDoc.shardIndex; - gen.writeStartObject("search_after").write("doc", lastDoc.doc).write("shardIndex", shardIndex); - float lastScore = lastDoc.score; - if (!Float.isNaN(lastScore)) { - gen.write("score", lastScore); - } - if (fields != null) { - Document lastDocument = searchers.get(shardIndex).doc(lastDoc.doc); - gen.writeStartArray("fields"); - for (SortField sortField : fields) { - String fieldName = sortField.getField(); - if (fieldName == null) { - // SCORE sorting will have a null fieldName - if (Float.isFinite(lastDoc.score)) { - gen.write(lastDoc.score); - } - continue; - } - IndexableField indexableField = lastDocument.getField(fieldName); - if (indexableField == null) { - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Field " + fieldName - + " used for sorting was not present on the Lucene Document; all sortable fields must also be stored."); - } - Type type = (sortField instanceof SortedNumericSortField) - ? 
((SortedNumericSortField) sortField).getNumericType() - : sortField.getType(); - switch (type) { - case LONG: - if (indexableField.numericValue() != null) { - gen.write(indexableField.numericValue().longValue()); - } else if (indexableField.stringValue() != null) { - gen.write(new Long(indexableField.stringValue())); - } - break; - case DOUBLE: - if (indexableField.numericValue() != null) { - gen.write(indexableField.numericValue().doubleValue()); - } else if (indexableField.stringValue() != null) { - gen.write(new Double(indexableField.stringValue())); - } - break; - case STRING: - gen.write(indexableField.stringValue()); - break; - default: - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, - "SortField.Type must be one of LONG, DOUBLE, STRING, but it was " + type); - } - } - gen.writeEnd(); // end "fields" array - } - gen.writeEnd(); // end "search_after" object - } - gen.writeEnd(); // end enclosing object - } catch (ArrayIndexOutOfBoundsException e) { - String message = "Attempting to access searcher with shardIndex " + shardIndex + ", but only have " - + searchers.size() + " searchers in total"; - throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, message); - } - logger.trace("Json returned {}", baos.toString()); - return baos.toString(); + return encodeResults(name, search, maxResults, searchers, hits, fields); } /** @@ -1367,6 +1338,113 @@ private TopFieldDocs searchShards(SearchBucket search, int maxResults, List searchers, + ScoreDoc[] hits, SortField[] fields) throws IOException, LuceneException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + int shardIndex = -1; + try (JsonGenerator gen = Json.createGenerator(baos)) { + gen.writeStartObject(); + gen.writeStartArray("results"); + for (ScoreDoc hit : hits) { + shardIndex = hit.shardIndex; + encodeResult(name, gen, hit, searchers.get(shardIndex), search); + } + gen.writeEnd(); // array results + if (hits.length == maxResults) { + ScoreDoc lastDoc = hits[hits.length - 1]; + shardIndex = lastDoc.shardIndex; + gen.writeStartObject("search_after").write("doc", lastDoc.doc).write("shardIndex", shardIndex); + float lastScore = lastDoc.score; + if (!Float.isNaN(lastScore)) { + gen.write("score", lastScore); + } + if (fields != null) { + Document lastDocument = searchers.get(shardIndex).doc(lastDoc.doc); + gen.writeStartArray("fields"); + for (SortField sortField : fields) { + encodeSearchAfterField(gen, sortField, lastDoc, lastDocument); + } + gen.writeEnd(); // end "fields" array + } + gen.writeEnd(); // end "search_after" object + } + gen.writeEnd(); // end enclosing object + } catch (ArrayIndexOutOfBoundsException e) { + String message = "Attempting to access searcher with shardIndex " + shardIndex + ", but only have " + + searchers.size() + " searchers in total"; + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, message); + } + logger.trace("Json returned {}", baos); + return baos.toString(); + } + + /** + * Encodes a single SortField used in the search into the Json as to enable the + * ability to "search after" the last result of a previous search. 
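The search_after object encoded here is opaque to clients: they echo it back on the next request, where it is decoded into a Lucene FieldDoc. A rough sketch of that decoding, assuming a scored search with a single LONG sort key (the payload shown in the comment is illustrative):

    import java.io.StringReader;
    import javax.json.Json;
    import javax.json.JsonObject;
    import org.apache.lucene.search.FieldDoc;

    class SearchAfterSketch {
        // e.g. {"doc": 42, "shardIndex": 0, "score": 1.5, "fields": [1655000000000]}
        static FieldDoc decode(String searchAfterJson) {
            JsonObject object = Json.createReader(new StringReader(searchAfterJson)).readObject();
            Object[] sortValues = { object.getJsonArray("fields").getJsonNumber(0).longValue() };
            float score = (float) object.getJsonNumber("score").doubleValue();
            return new FieldDoc(object.getInt("doc"), score, sortValues, object.getInt("shardIndex"));
        }
    }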
+ * + * @param gen JsonGenerator used to encode the results + * @param sortField SortField used to sort the hits + * @param lastDoc The final scored hit of the search + * @param lastDocument The full Document corresponding to the last hit of the + * search + * @throws LuceneException + */ + private void encodeSearchAfterField(JsonGenerator gen, SortField sortField, ScoreDoc lastDoc, Document lastDocument) + throws LuceneException { + String fieldName = sortField.getField(); + if (fieldName == null) { + // SCORE sorting will have a null fieldName + if (Float.isFinite(lastDoc.score)) { + gen.write(lastDoc.score); + } + return; + } + IndexableField indexableField = lastDocument.getField(fieldName); + if (indexableField == null) { + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, "Field " + fieldName + + " used for sorting was not present on the Lucene Document; all sortable fields must also be stored."); + } + Type type = (sortField instanceof SortedNumericSortField) + ? ((SortedNumericSortField) sortField).getNumericType() + : sortField.getType(); + switch (type) { + case LONG: + if (indexableField.numericValue() != null) { + gen.write(indexableField.numericValue().longValue()); + } else if (indexableField.stringValue() != null) { + gen.write(new Long(indexableField.stringValue())); + } + break; + case DOUBLE: + if (indexableField.numericValue() != null) { + gen.write(indexableField.numericValue().doubleValue()); + } else if (indexableField.stringValue() != null) { + gen.write(new Double(indexableField.stringValue())); + } + break; + case STRING: + gen.write(indexableField.stringValue()); + break; + default: + throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, + "SortField.Type must be one of LONG, DOUBLE, STRING, but it was " + type); + } + } + /** * Builds a Lucene Document from the parsed json. * @@ -1376,43 +1454,22 @@ private TopFieldDocs searchShards(SearchBucket search, int maxResults, List fields, Document oldDocument) { Document newDocument = new Document(); for (IndexableField field : oldDocument.getFields()) { if (!fields.contains(field.name())) { - addField(field, newDocument); + Field fieldToAdd = new Field(field); + fieldToAdd.addToDocument(newDocument); } } return newDocument; } /** - * Unlocks the specified index after population, commiting all pending documents - * and - * allowing normal modify operations again. + * Unlocks the specified index after population, committing all pending + * documents + * and allowing normal modify operations again. * * @param entityName Name of the entity/index to unlock. * @throws LuceneException If not locked, or if there's an IOException when @@ -1618,8 +1587,7 @@ public void unlock(@PathParam("entityName") String entityName) throws LuceneExce /** * Updates an existing Lucene document, provided that the target index is not - * locked - * for another operation. + * locked for another operation. * * @param operationBody JsonObject containing the "_index" that the new "doc" * should be created in. 
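The update path above ultimately rests on Lucene's atomic update-by-term primitive; a minimal sketch, assuming each document carries a unique "id" StringField as in this codebase:

    import java.io.IOException;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.index.Term;

    class UpdateSketch {
        static void upsert(IndexWriter indexWriter, String icatId, Document document) throws IOException {
            document.add(new StringField("id", icatId, Store.YES));
            // deletes any document matching the term and adds the new one atomically
            indexWriter.updateDocument(new Term("id", icatId), document);
        }
    }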
@@ -1685,7 +1653,7 @@ private void update(JsonObject operationBody) throws LuceneException, NumberForm * @throws NumberFormatException * @throws IOException */ - private void updateByRelation(JsonObject operationBody, Boolean delete) + private void updateByRelation(JsonObject operationBody, boolean delete) throws LuceneException, NumberFormatException, IOException { for (DocumentMapping.ParentRelationship parentRelationship : DocumentMapping.relationships .get(operationBody.getString("_index"))) { diff --git a/src/main/java/org/icatproject/lucene/SearchBucket.java b/src/main/java/org/icatproject/lucene/SearchBucket.java index 91858a5..3a28375 100644 --- a/src/main/java/org/icatproject/lucene/SearchBucket.java +++ b/src/main/java/org/icatproject/lucene/SearchBucket.java @@ -68,9 +68,9 @@ public enum SearchType { public Sort sort; public FieldDoc searchAfter; public boolean scored; - public Set fields = new HashSet(); + public Set fields = new HashSet<>(); public Map> joinedFields = new HashMap<>(); - public Map dimensions = new HashMap(); + public Map dimensions = new HashMap<>(); private static final SimpleDateFormat df = new SimpleDateFormat("yyyyMMddHHmm"); static { @@ -100,8 +100,8 @@ public SearchBucket(Lucene lucene) { * @throws IOException * @throws QueryNodeException */ - public SearchBucket(Lucene lucene, SearchType searchType, HttpServletRequest request, String sort, String searchAfter) - throws LuceneException, IOException, QueryNodeException { + public SearchBucket(Lucene lucene, SearchType searchType, HttpServletRequest request, String sort, + String searchAfter) throws LuceneException, IOException, QueryNodeException { this.lucene = lucene; searcherMap = new HashMap<>(); parseSort(sort); @@ -248,11 +248,10 @@ public SearchBucket(Lucene lucene, SearchType searchType, HttpServletRequest req */ private void buildDateRanges(Builder queryBuilder, JsonObject queryJson, String lowerKey, String upperKey, String... fields) throws LuceneException { - Long lower = parseDate(queryJson, lowerKey, 0); - Long upper = parseDate(queryJson, upperKey, 59999); - if (lower != null || upper != null) { - lower = (lower == null) ? Long.MIN_VALUE : lower; - upper = (upper == null) ? Long.MAX_VALUE : upper; + long lower = parseDate(queryJson, lowerKey, 0); + long upper = parseDate(queryJson, upperKey, 59999); + // Only build the query if at least one of the dates is defined + if (lower != Long.MIN_VALUE || upper != Long.MAX_VALUE) { for (String field : fields) { queryBuilder.add(LongPoint.newRangeQuery(field, lower, upper), Occur.MUST); } @@ -281,21 +280,18 @@ private void buildFilterQueries(String target, JsonObject requestedQuery, Builde String filterTarget = i == -1 ? key : key.substring(0, i); String fld = key.substring(i + 1); Query dimensionQuery; - switch (valueType) { - case ARRAY: - Builder builder = new BooleanQuery.Builder(); - // If the key was just a nested entity (no ".") then we should FILTER all of our - // queries on that entity. - Occur occur = i == -1 ? Occur.FILTER : Occur.SHOULD; - for (JsonValue arrayValue : filterObject.getJsonArray(key)) { - Query arrayQuery = parseFilter(target, fld, arrayValue); - builder.add(arrayQuery, occur); - } - dimensionQuery = builder.build(); - break; - - default: - dimensionQuery = parseFilter(target, fld, value); + if (valueType.equals(ValueType.ARRAY)) { + Builder builder = new BooleanQuery.Builder(); + // If the key was just a nested entity (no ".") then we should FILTER all of our + // queries on that entity. + Occur occur = i == -1 ? 
Occur.FILTER : Occur.SHOULD; + for (JsonValue arrayValue : filterObject.getJsonArray(key)) { + Query arrayQuery = parseFilter(target, fld, arrayValue); + builder.add(arrayQuery, occur); + } + dimensionQuery = builder.build(); + } else { + dimensionQuery = parseFilter(target, fld, value); } // Nest the dimension query if needed if (i != -1 && !target.equals(filterTarget)) { @@ -350,7 +346,8 @@ private Query parseFilter(String target, String fld, JsonValue value) throws IOE nestedFilters.forEach(nestedFilter -> { String nestedField = nestedFilter.getString("field"); if (nestedFilter.containsKey("value")) { - TermQuery query = new TermQuery(new Term(nestedField + ".keyword", nestedFilter.getString("value"))); + Term term = new Term(nestedField + ".keyword", nestedFilter.getString("value")); + TermQuery query = new TermQuery(term); nestedBoolBuilder.add(query, Occur.FILTER); } else if (nestedFilter.containsKey("exact")) { buildNestedExactQuery(nestedField, nestedFilter, nestedBoolBuilder); @@ -363,11 +360,10 @@ private Query parseFilter(String target, String fld, JsonValue value) throws IOE return JoinUtil.createJoinQuery("sample.id", false, "sample.id", nestedBoolBuilder.build(), nestedSearcher, ScoreMode.None); } else if (fld.equals("sampleparameter") && target.equals("investigation")) { - Query sampleQuery = JoinUtil.createJoinQuery("sample.id", false, "sample.id", nestedBoolBuilder.build(), - nestedSearcher, ScoreMode.None); - Query investigationQuery = JoinUtil.createJoinQuery("sample.investigation.id", false, "id", sampleQuery, + Query sampleQuery = JoinUtil.createJoinQuery("sample.id", false, "sample.id", + nestedBoolBuilder.build(), nestedSearcher, ScoreMode.None); + return JoinUtil.createJoinQuery("sample.investigation.id", false, "id", sampleQuery, lucene.getSearcher(searcherMap, "sample"), ScoreMode.None); - return investigationQuery; } else { return JoinUtil.createJoinQuery(target + ".id", false, "id", nestedBoolBuilder.build(), nestedSearcher, ScoreMode.None); @@ -390,7 +386,8 @@ private Query parseFilter(String target, String fld, JsonValue value) throws IOE } /** - * Builds an exact numeric query, intended for use with numeric or date/time parameters. + * Builds an exact numeric query, intended for use with numeric or date/time + * parameters. * * @param fld Name of the field to apply the range to. 
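A distilled sketch of the exact-or-range shape this method builds for a double-valued parameter: the filter matches when the requested value either falls between the stored rangeBottom/rangeTop bounds or equals the field directly.

    import org.apache.lucene.document.DoublePoint;
    import org.apache.lucene.search.BooleanClause.Occur;
    import org.apache.lucene.search.BooleanQuery;
    import org.apache.lucene.search.Query;

    class ExactOrRangeSketch {
        static Query exactOrRange(String fld, double exact) {
            BooleanQuery.Builder rangeBuilder = new BooleanQuery.Builder();
            // matches when rangeBottom <= exact <= rangeTop
            rangeBuilder.add(DoublePoint.newRangeQuery("rangeTop", exact, Double.POSITIVE_INFINITY), Occur.FILTER);
            rangeBuilder.add(DoublePoint.newRangeQuery("rangeBottom", Double.NEGATIVE_INFINITY, exact), Occur.FILTER);
            BooleanQuery.Builder exactOrRangeBuilder = new BooleanQuery.Builder();
            exactOrRangeBuilder.add(rangeBuilder.build(), Occur.SHOULD);
            exactOrRangeBuilder.add(DoublePoint.newExactQuery(fld, exact), Occur.SHOULD);
            return exactOrRangeBuilder.build();
        }
    }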
* @param valueObject JsonObject containing "exact", and optionally "units" @@ -408,17 +405,23 @@ private void buildNestedExactQuery(String fld, JsonObject valueObject, BooleanQu String units = valueObject.getString("units", null); if (units != null) { SystemValue exactValue = lucene.icatUnits.new SystemValue(exact, units); - if (exactValue.value != null ) { + if (exactValue.value != null) { // If we were able to parse the units, apply query to the SI value - rangeBuilder.add(DoublePoint.newRangeQuery("rangeTopSI", exactValue.value, Double.POSITIVE_INFINITY), Occur.FILTER); - rangeBuilder.add(DoublePoint.newRangeQuery("rangeBottomSI", Double.NEGATIVE_INFINITY, exactValue.value), Occur.FILTER); + rangeBuilder.add( + DoublePoint.newRangeQuery("rangeTopSI", exactValue.value, Double.POSITIVE_INFINITY), + Occur.FILTER); + rangeBuilder.add( + DoublePoint.newRangeQuery("rangeBottomSI", Double.NEGATIVE_INFINITY, exactValue.value), + Occur.FILTER); exactOrRangeBuilder.add(rangeBuilder.build(), Occur.SHOULD); exactOrRangeBuilder.add(DoublePoint.newExactQuery(fld + "SI", exactValue.value), Occur.SHOULD); builder.add(exactOrRangeBuilder.build(), Occur.FILTER); } else { // If units could not be parsed, make them part of the query on the raw data - rangeBuilder.add(DoublePoint.newRangeQuery("rangeTop", exact, Double.POSITIVE_INFINITY), Occur.FILTER); - rangeBuilder.add(DoublePoint.newRangeQuery("rangeBottom", Double.NEGATIVE_INFINITY, exact), Occur.FILTER); + rangeBuilder.add(DoublePoint.newRangeQuery("rangeTop", exact, Double.POSITIVE_INFINITY), + Occur.FILTER); + rangeBuilder.add(DoublePoint.newRangeQuery("rangeBottom", Double.NEGATIVE_INFINITY, exact), + Occur.FILTER); exactOrRangeBuilder.add(rangeBuilder.build(), Occur.SHOULD); exactOrRangeBuilder.add(DoublePoint.newExactQuery(fld, exact), Occur.SHOULD); builder.add(exactOrRangeBuilder.build(), Occur.FILTER); @@ -427,7 +430,8 @@ private void buildNestedExactQuery(String fld, JsonObject valueObject, BooleanQu } else { // If units were not provided, just apply to the raw data rangeBuilder.add(DoublePoint.newRangeQuery("rangeTop", exact, Double.POSITIVE_INFINITY), Occur.FILTER); - rangeBuilder.add(DoublePoint.newRangeQuery("rangeBottom", Double.NEGATIVE_INFINITY, exact), Occur.FILTER); + rangeBuilder.add(DoublePoint.newRangeQuery("rangeBottom", Double.NEGATIVE_INFINITY, exact), + Occur.FILTER); exactOrRangeBuilder.add(rangeBuilder.build(), Occur.SHOULD); exactOrRangeBuilder.add(DoublePoint.newExactQuery(fld, exact), Occur.SHOULD); builder.add(exactOrRangeBuilder.build(), Occur.FILTER); @@ -509,16 +513,12 @@ private void buildUserNameQuery(String userName, BooleanQuery.Builder luceneQuer * Converts String into number of ms since epoch. * * @param value String representing a Date in the format "yyyyMMddHHmm". - * @return Number of ms since epoch, or null if value was null + * @return Number of ms since epoch. * @throws java.text.ParseException */ - protected static Long decodeTime(String value) throws java.text.ParseException { - if (value == null) { - return null; - } else { - synchronized (df) { - return df.parse(value).getTime(); - } + protected static long decodeTime(String value) throws java.text.ParseException { + synchronized (df) { + return df.parse(value).getTime(); } } @@ -551,7 +551,7 @@ private Query maybeEmptyQuery(Builder luceneQuery) { * @throws LuceneException If the ValueType is not NUMBER or STRING, or if a * STRING value cannot be parsed. 
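A minimal sketch of the half-open date semantics parseDate enables, assuming the "yyyyMMddHHmm" format used above: when one bound is absent, the corresponding Long sentinel simply leaves that end of the range open. (SimpleDateFormat is not thread-safe; the real code synchronises on the shared instance.)

    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import org.apache.lucene.document.LongPoint;
    import org.apache.lucene.search.Query;

    class DateRangeSketch {
        private static final SimpleDateFormat DF = new SimpleDateFormat("yyyyMMddHHmm");

        // lower bound given, upper bound left open via the Long.MAX_VALUE sentinel
        static Query onOrAfter(String field, String lower) throws ParseException {
            return LongPoint.newRangeQuery(field, DF.parse(lower).getTime(), Long.MAX_VALUE);
        }
    }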
*/ - private Long parseDate(JsonObject jsonObject, String key, int offset) throws LuceneException { + private long parseDate(JsonObject jsonObject, String key, int offset) throws LuceneException { if (jsonObject.containsKey(key)) { ValueType valueType = jsonObject.get(key).getValueType(); switch (valueType) { @@ -570,7 +570,13 @@ private Long parseDate(JsonObject jsonObject, String key, int offset) throws Luc "Dates should be represented by a NUMBER or STRING JsonValue, but got " + valueType); } } - return null; + // If the key wasn't present, use either MIN_VALUE or MAX_VALUE based on whether + // we need to offset the date. This is useful for half open ranges. + if (offset == 0) { + return Long.MIN_VALUE; + } else { + return Long.MAX_VALUE; + } } /** @@ -586,7 +592,7 @@ private void parseDimensions(JsonObject jsonObject) throws LuceneException { for (JsonObject dimensionObject : dimensionObjects) { if (!dimensionObject.containsKey("dimension")) { throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, - "'dimension' not specified for facet request " + dimensionObject.toString()); + "'dimension' not specified for facet request " + dimensionObject); } String dimension = dimensionObject.getString("dimension"); FacetedDimension facetDimensionRequest = new FacetedDimension(dimension); @@ -595,15 +601,15 @@ private void parseDimensions(JsonObject jsonObject) throws LuceneException { List jsonRanges = dimensionObject.getJsonArray("ranges").getValuesAs(JsonObject.class); if (DocumentMapping.longFields.contains(dimension)) { for (JsonObject range : jsonRanges) { - Long lower = Long.MIN_VALUE; - Long upper = Long.MAX_VALUE; + long lower = Long.MIN_VALUE; + long upper = Long.MAX_VALUE; if (range.containsKey("from")) { lower = range.getJsonNumber("from").longValueExact(); } if (range.containsKey("to")) { upper = range.getJsonNumber("to").longValueExact(); } - String label = lower.toString() + "-" + upper.toString(); + String label = lower + "-" + upper; if (range.containsKey("key")) { label = range.getString("key"); } @@ -611,15 +617,15 @@ private void parseDimensions(JsonObject jsonObject) throws LuceneException { } } else if (DocumentMapping.doubleFields.contains(dimension)) { for (JsonObject range : jsonRanges) { - Double lower = Double.MIN_VALUE; - Double upper = Double.MAX_VALUE; + double lower = Double.MIN_VALUE; + double upper = Double.MAX_VALUE; if (range.containsKey("from")) { lower = range.getJsonNumber("from").doubleValue(); } if (range.containsKey("to")) { upper = range.getJsonNumber("to").doubleValue(); } - String label = lower.toString() + "-" + upper.toString(); + String label = lower + "-" + upper; if (range.containsKey("key")) { label = range.getString("key"); } @@ -749,8 +755,8 @@ private Builder parseParameter(JsonValue p) throws LuceneException { } else if (parameter.containsKey("lowerDateValue") && parameter.containsKey("upperDateValue")) { buildDateRanges(paramQuery, parameter, "lowerDateValue", "upperDateValue", "dateTimeValue"); } else if (parameter.containsKey("lowerNumericValue") && parameter.containsKey("upperNumericValue")) { - Double pLowerNumericValue = parameter.getJsonNumber("lowerNumericValue").doubleValue(); - Double pUpperNumericValue = parameter.getJsonNumber("upperNumericValue").doubleValue(); + double pLowerNumericValue = parameter.getJsonNumber("lowerNumericValue").doubleValue(); + double pUpperNumericValue = parameter.getJsonNumber("upperNumericValue").doubleValue(); paramQuery.add(DoublePoint.newRangeQuery("numericValue", pLowerNumericValue, 
pUpperNumericValue), Occur.MUST); } @@ -848,7 +854,7 @@ public void parseSort(String sortString) throws LuceneException { List fields = new ArrayList<>(); for (String key : object.keySet()) { String order = object.getString(key); - Boolean reverse; + boolean reverse; if (order.equals("asc")) { reverse = false; } else if (order.equals("desc")) { From 4a7e9db7755edd7a5e8ad1a1e547a807c46457fd Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Wed, 12 Oct 2022 19:16:37 +0100 Subject: [PATCH 58/73] run.properties settings updates #18 --- src/main/config/run.properties.example | 9 +++++---- src/main/resources/run.properties | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/main/config/run.properties.example b/src/main/config/run.properties.example index 4aeab39..dbe555b 100644 --- a/src/main/config/run.properties.example +++ b/src/main/config/run.properties.example @@ -1,7 +1,8 @@ # Real comments in this file are marked with '#' whereas commented out lines # are marked with '!' -directory = ${HOME}/data/lucene -commitSeconds = 5 -maxShardSize = 2147483648 -ip = 127.0.0.1/32 +directory = ${HOME}/data/search +commitSeconds = 5 +maxShardSize = 2147483648 +ip = 127.0.0.1/32 +aggregateFiles = false diff --git a/src/main/resources/run.properties b/src/main/resources/run.properties index 7189854..c86b66d 100644 --- a/src/main/resources/run.properties +++ b/src/main/resources/run.properties @@ -1,7 +1,7 @@ # Real comments in this file are marked with '#' whereas commented out lines # are marked with '!' -directory = ${HOME}/data/lucene +directory = ${HOME}/data/search commitSeconds = 5 maxShardSize = 2147483648 ip = 127.0.0.1/32 From 7e53648b90b385177a4f91c7c6cafc5c9c1122ae Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Mon, 17 Oct 2022 13:35:26 +0100 Subject: [PATCH 59/73] parse_synonyms clean up and check for null synonyms #16 --- .../icatproject/lucene/DocumentMapping.java | 12 +- .../lucene/IcatSynonymAnalyzer.java | 22 ++-- src/main/scripts/parse_synonyms.py | 111 +++++++++--------- 3 files changed, 70 insertions(+), 75 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/DocumentMapping.java b/src/main/java/org/icatproject/lucene/DocumentMapping.java index 95c023b..4eaf7f0 100644 --- a/src/main/java/org/icatproject/lucene/DocumentMapping.java +++ b/src/main/java/org/icatproject/lucene/DocumentMapping.java @@ -1,7 +1,5 @@ package org.icatproject.lucene; -import java.io.IOException; -import java.text.ParseException; import java.util.Arrays; import java.util.HashMap; import java.util.HashSet; @@ -35,7 +33,7 @@ public ParentRelationship(String parentName, String joiningField, String... fiel } } - private static Analyzer analyzer; + private static Analyzer analyzer = new IcatSynonymAnalyzer();; public static final Set doubleFields = new HashSet<>(); public static final Set facetFields = new HashSet<>(); @@ -55,14 +53,6 @@ public ParentRelationship(String parentName, String joiningField, String... 
fiel public static final StandardQueryParser sampleParser = buildParser("sample.name", "sample.type.name"); static { - try { - // Attempt init an Analyzer which injects synonyms for searching - analyzer = new IcatSynonymAnalyzer(); - } catch (IOException | ParseException e) { - // If synonym files cannot be parsed, default to using the same analyzer as for writing - analyzer = new IcatAnalyzer(); - } - doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI", "rangeTop", "rangeTopSI", "rangeBottom", "rangeBottomSI")); facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name", "stringValue", "technique.name")); diff --git a/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java b/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java index 26841f1..029f8fc 100755 --- a/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java +++ b/src/main/java/org/icatproject/lucene/IcatSynonymAnalyzer.java @@ -22,15 +22,21 @@ public class IcatSynonymAnalyzer extends Analyzer { private SynonymMap synonyms; - public IcatSynonymAnalyzer() - throws IOException, ParseException { + public IcatSynonymAnalyzer() { super(); // Load synonyms from resource file InputStream in = IcatSynonymAnalyzer.class.getClassLoader().getResourceAsStream("synonym.txt"); - BufferedReader reader = new BufferedReader(new InputStreamReader(in)); - SolrSynonymParser parser = new SolrSynonymParser(true, true, new IcatAnalyzer()); - parser.parse(reader); - synonyms = parser.build(); + if (in != null) { + BufferedReader reader = new BufferedReader(new InputStreamReader(in)); + SolrSynonymParser parser = new SolrSynonymParser(true, true, new IcatAnalyzer()); + try { + parser.parse(reader); + synonyms = parser.build(); + } catch (IOException | ParseException e) { + // If we cannot parse the synonyms, do nothing + // For all intents and purposes this will now act as a plain IcatAnalyzer + } + } } @Override @@ -40,7 +46,9 @@ protected TokenStreamComponents createComponents(String fieldName) { sink = new LowerCaseFilter(sink); sink = new StopFilter(sink, IcatAnalyzer.SCIENTIFIC_STOP_WORDS_SET); sink = new PorterStemFilter(sink); - sink = new SynonymGraphFilter(sink, synonyms, false); + if (synonyms != null) { + sink = new SynonymGraphFilter(sink, synonyms, false); + } return new TokenStreamComponents(source, sink); } } diff --git a/src/main/scripts/parse_synonyms.py b/src/main/scripts/parse_synonyms.py index 3ae3d55..d23d2a4 100644 --- a/src/main/scripts/parse_synonyms.py +++ b/src/main/scripts/parse_synonyms.py @@ -1,21 +1,21 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 import csv import sys from typing import Dict, List -def addToParents( +def add_to_parents( relationships: Dict[str, Dict[str, List[str]]], label: str, parents: List[str], - childDepth: int + child_depth: int ): """ Adds the `label` to all the entries in `relationships` that have a key in `parents`, then recursively calls itself to add `label` to any - grandparents. `childDepth` is decreased by 1 for each generation to prevent - exponentially large injections. + grandparents. `child_depth` is decreased by 1 for each generation to + prevent exponentially large injections. Parameters ---------- @@ -26,73 +26,71 @@ def addToParents( The term to be added to its `parents`. parents: List[str] The direct parents of the current `label`. - childDepth: int + child_depth: int The number of generations of children to inject for each term. For example, a value of 2 would inject children and their children. 0 will only add alternative terms. 
Negative integers will add all children, grandchildren, etc. Note that this may result in an exponentially large number of terms """ - if childDepth != 0: + if child_depth != 0: for parent in parents: try: relationships[parent]["children"].append(label) # If the parent is equivalent to anything, also add label as a - # child of the equivalentParent - for equivalentParent in relationships[parent]["equivalent"]: - relationships[equivalentParent]["children"].append(label) - addToParents( + # child of the equivalent_parent + for equivalent_parent in relationships[parent]["equivalent"]: + relationships[equivalent_parent]["children"].append(label) + add_to_parents( relationships, label, relationships[parent]["parents"], - childDepth - 1, + child_depth - 1, ) except KeyError: pass -def main(inputFile: str, outputFile: str, mode: str, maxChildDepth: int): +def main(input_file: str, output_file: str, mode: str, max_child_depth: int): """ Reads a CSV file of terminology and writes it into Solr synonym format for use in synonym injection. Alternative terms are always written, and the - number of child terms is configurable by `maxChildDepth`. + number of child terms is configurable by `max_child_depth`. Parameters ---------- - inputFile: str + input_file: str CSV file to read ontology from. - outputFile: str + output_file: str Solr synonym output file. mode: str Python file mode (w, a, ...) to use when writing the output file. - maxChildDepth: int + max_child_depth: int The maximum number of generations of children to inject for each term. For example, a value of 2 would inject children and their children. 0 will only add alternative terms. Negative integers will add all children, grandchildren, etc. Note that this may result in an exponentially large number of terms """ - altIndices = [] - parentIndices = [] - equivalentIndices = [] - equivalentPairs = {} + alt_indices = [] + parent_indices = [] + equivalent_indices = [] + equivalent_pairs = {} relationships = {} - with open(inputFile) as f: + with open(input_file) as f: reader = csv.reader(f) # Dynamically determine header positions headers = next(reader) for i, header in enumerate(headers): if "Label" == header.strip(): - labelIndex = i - # elif "Class Type" == header: - # classIndex = i + label_index = i elif "Alt Label" in header.strip(): - altIndices.append(i) + alt_indices.append(i) elif "Parent IRI" == header.strip(): - parentIndices.append(i) + parent_indices.append(i) elif "Equivalent" == header.strip(): - equivalentIndices.append(i) + equivalent_indices.append(i) for entries in reader: try: @@ -101,7 +99,7 @@ def main(inputFile: str, outputFile: str, mode: str, maxChildDepth: int): # If we do not have an ID, continue to the next line continue - label = entries[labelIndex] + label = entries[label_index] if label in relationships.keys(): raise ValueError(f"Duplicate entry for label {label}") @@ -111,26 +109,25 @@ def main(inputFile: str, outputFile: str, mode: str, maxChildDepth: int): "equivalent": [], "children": [], } - # classType = entries[classIndex] - for altIndex in altIndices: - alternativeLabel = entries[altIndex] - if alternativeLabel != "": + for alt_index in alt_indices: + alternative_label = entries[alt_index] + if alternative_label: relationships[label]["alternatives"].append( - alternativeLabel + alternative_label ) - for parentIndex in parentIndices: - parent = entries[parentIndex] - if parent != "": + for parent_index in parent_indices: + parent = entries[parent_index] + if parent: 
relationships[label]["parents"].append(parent) - for equivalentIndex in equivalentIndices: - equivalentLabel = entries[equivalentIndex] - if equivalentLabel != "": - relationships[label]["equivalent"].append(equivalentLabel) - equivalentPairs[equivalentLabel] = label + for equivalent_index in equivalent_indices: + equivalent_label = entries[equivalent_index] + if equivalent_label: + relationships[label]["equivalent"].append(equivalent_label) + equivalent_pairs[equivalent_label] = label # If A is equivalent to B, then also set B equivalent to A # This ensures they share all children - for key, value in equivalentPairs.items(): + for key, value in equivalent_pairs.items(): try: relationships[key]["equivalent"].append(value) except KeyError: @@ -138,8 +135,8 @@ def main(inputFile: str, outputFile: str, mode: str, maxChildDepth: int): print(f"{len(relationships)} relationships found") for label, relationship in relationships.items(): - addToParents( - relationships, label, relationship["parents"], maxChildDepth + add_to_parents( + relationships, label, relationship["parents"], max_child_depth ) output = "" @@ -147,41 +144,41 @@ def main(inputFile: str, outputFile: str, mode: str, maxChildDepth: int): # Only write to file if we have alternative or child terms if (len(relationship["alternatives"]) > 0 or len(relationship["children"]) > 0): - leftHandSide = ", ".join( + left_hand_side = ", ".join( sorted(set([label] + relationship["alternatives"])) ) - rightHandSide = ", ".join( + right_hand_side = ", ".join( sorted(set( [label] + relationship["alternatives"] + relationship["children"] )) ) - output += leftHandSide + " => " + rightHandSide + "\n" + output += left_hand_side + " => " + right_hand_side + "\n" - with open(outputFile, mode) as f: + with open(output_file, mode) as f: f.write(output) if __name__ == "__main__": args = sys.argv try: - inputFile = args[1] + input_file = args[1] except IndexError as e: - raise IndexError("inputFile to parse not provided") from e + raise IndexError("input_file to parse not provided") from e try: - outputFile = args[2] + output_file = args[2] except IndexError as e: - raise IndexError("outputFile to write to not provided") from e + raise IndexError("output_file to write to not provided") from e try: mode = args[3] except IndexError: - # Default to appending to the outputFile (no overwrite) + # Default to appending to the output_file (no overwrite) mode = "a" try: - maxChildDepth = int(args[4]) + max_child_depth = int(args[4]) except (IndexError, ValueError): # Default to 0 depth (only alternative terms) - maxChildDepth = 0 + max_child_depth = 0 - main(inputFile, outputFile, mode, maxChildDepth) + main(input_file, output_file, mode, max_child_depth) From c790b5d712e25b31a838130a94e8969eff92f18d Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Fri, 21 Oct 2022 15:50:51 +0100 Subject: [PATCH 60/73] Remove returns from Field.java #18 --- .../java/org/icatproject/lucene/Field.java | 32 +++++++------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Field.java b/src/main/java/org/icatproject/lucene/Field.java index 966332e..c043a8b 100644 --- a/src/main/java/org/icatproject/lucene/Field.java +++ b/src/main/java/org/icatproject/lucene/Field.java @@ -24,9 +24,9 @@ class Field { private abstract class InnerField { - public abstract Document addSortable(Document document) throws NumberFormatException; + public abstract void addSortable(Document document) throws NumberFormatException; - public abstract Document 
addToDocument(Document document) throws NumberFormatException; } @@ -39,7 +39,7 @@ public InnerStringField(String value) { } @Override - public Document addSortable(Document document) throws NumberFormatException { + public void addSortable(Document document) throws NumberFormatException { if (DocumentMapping.sortFields.contains(name)) { if (name.equals("id")) { // Id is a special case, as we need it to be SORTED as a byte ref to allow joins @@ -51,11 +51,10 @@ public Document addSortable(Document document) throws NumberFormatException { } document.add(new SortedDocValuesField(name, new BytesRef(value))); } - return document; } @Override - public Document addToDocument(Document document) throws NumberFormatException { + public void addToDocument(Document document) throws NumberFormatException { addSortable(document); if (DocumentMapping.facetFields.contains(name)) { @@ -69,7 +68,6 @@ public Document addToDocument(Document document) throws NumberFormatException { document.add(new StringField(name, value, Store.YES)); } - return document; } } @@ -83,19 +81,17 @@ public InnerLongField(long value) { } @Override - public Document addSortable(Document document) throws NumberFormatException { + public void addSortable(Document document) throws NumberFormatException { if (DocumentMapping.sortFields.contains(name)) { document.add(new NumericDocValuesField(name, value)); } - return document; } @Override - public Document addToDocument(Document document) throws NumberFormatException { + public void addToDocument(Document document) throws NumberFormatException { addSortable(document); document.add(new LongPoint(name, value)); document.add(new StoredField(name, value)); - return document; } } @@ -109,20 +105,18 @@ public InnerDoubleField(double value) { } @Override - public Document addSortable(Document document) throws NumberFormatException { + public void addSortable(Document document) throws NumberFormatException { if (DocumentMapping.sortFields.contains(name)) { long sortableLong = NumericUtils.doubleToSortableLong(value); document.add(new NumericDocValuesField(name, sortableLong)); } - return document; } @Override - public Document addToDocument(Document document) throws NumberFormatException { + public void addToDocument(Document document) throws NumberFormatException { addSortable(document); document.add(new DoublePoint(name, value)); document.add(new StoredField(name, value)); - return document; } } @@ -170,11 +164,10 @@ public Field(IndexableField indexableField) { * field. * * @param document The document to add to - * @return The original document with this field added to it * @throws NumberFormatException */ - public Document addSortable(Document document) throws NumberFormatException { - return innerField.addSortable(document); + public void addSortable(Document document) throws NumberFormatException { + innerField.addSortable(document); } /** @@ -183,11 +176,10 @@ public Document addSortable(Document document) throws NumberFormatException { * String, long or double field. 
* * @param document The document to add to - * @return The original document with this field added to it * @throws NumberFormatException */ - public Document addToDocument(Document document) throws NumberFormatException { - return innerField.addToDocument(document); + public void addToDocument(Document document) throws NumberFormatException { + innerField.addToDocument(document); } } From 8662e05166c531782a92363e4a146e0003814c4a Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Thu, 24 Nov 2022 14:07:01 +0000 Subject: [PATCH 61/73] Update Lucene to 8.11.2 and remove search caching #18 --- pom.xml | 8 ++- .../java/org/icatproject/lucene/Lucene.java | 61 +++++++++---------- 2 files changed, 36 insertions(+), 33 deletions(-) diff --git a/pom.xml b/pom.xml index 26467e5..44b96bb 100755 --- a/pom.xml +++ b/pom.xml @@ -14,7 +14,7 @@ https://repo.icatproject.org/repo github https://github.com/icatproject/icat.lucene - 8.6.0 + 8.11.2 @@ -92,6 +92,12 @@ ${luceneVersion} + + org.apache.lucene + lucene-backward-codecs + ${luceneVersion} + + javax javaee-api diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 41e3a9d..72e6cec 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -153,8 +153,10 @@ public ShardBucket(java.nio.file.Path shardPath) throws IOException { */ public int commit() throws IOException { int cached = indexWriter.numRamDocs(); - indexWriter.commit(); - searcherManager.maybeRefreshBlocking(); + if (cached > 0) { + indexWriter.commit(); + searcherManager.maybeRefreshBlocking(); + } return cached; } } @@ -344,13 +346,9 @@ public void releaseSearchers(List subSearchers) throws IOExceptio private long luceneMaxShardSize; private long maxSearchTimeSeconds; private boolean aggregateFiles; - - private AtomicLong bucketNum = new AtomicLong(); private Map indexBuckets = new ConcurrentHashMap<>(); - private Timer timer; - private Map searches = new ConcurrentHashMap<>(); public IcatUnits icatUnits; /** @@ -440,11 +438,7 @@ public void clear() throws LuceneException { logger.info("Requesting clear"); exit(); - timer = new Timer("LuceneCommitTimer"); - - bucketNum.set(0); indexBuckets.clear(); - searches.clear(); try { Files.walk(luceneDirectory, FileVisitOption.FOLLOW_LINKS).sorted(Comparator.reverseOrder()) @@ -453,7 +447,7 @@ public void clear() throws LuceneException { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } - timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); + initTimer(); logger.info("clear complete - ready to go again"); } @@ -464,11 +458,12 @@ public void clear() throws LuceneException { @POST @Path("commit") public void commit() throws LuceneException { - logger.debug("Requesting commit"); + logger.debug("Requesting commit for {} IndexBuckets", indexBuckets.size()); try { for (Entry entry : indexBuckets.entrySet()) { IndexBucket bucket = entry.getValue(); if (!bucket.locked.get()) { + logger.info("{} is unlocked", entry.getKey()); bucket.commit("Synch", entry.getKey()); } } @@ -819,29 +814,27 @@ private void exit() { public String facet(@PathParam("entityName") String entityName, @Context HttpServletRequest request, @QueryParam("search_after") String searchAfter, @QueryParam("maxResults") int maxResults, @QueryParam("maxLabels") int maxLabels, @QueryParam("sort") String sort) throws LuceneException { - Long uid = null; + SearchBucket search = null; try { - uid = 
bucketNum.getAndIncrement(); - SearchBucket search = new SearchBucket(this, SearchType.GENERIC, request, sort, null); - searches.put(uid, search); + search = new SearchBucket(this, SearchType.GENERIC, request, sort, null); return luceneFacetResult(entityName, search, searchAfter, maxResults, maxLabels); } catch (IOException | QueryNodeException e) { logger.error("Error", e); - freeSearcher(uid); throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } finally { + freeSearcher(search); } } /** * Releases all IndexSearchers associated with uid. * - * @param uid Unique Identifier for a set of IndexSearcher to be released. + * @param search SearchBucket to be freed. * @throws LuceneException */ - public void freeSearcher(Long uid) throws LuceneException { - if (uid != null && searches.containsKey(uid)) { // May not be set for internal calls - Map> search = searches.get(uid).searcherMap; - for (Entry> entry : search.entrySet()) { + public void freeSearcher(SearchBucket search) throws LuceneException { + if (search != null) { + for (Entry> entry : search.searcherMap.entrySet()) { String name = entry.getKey(); List subReaders = entry.getValue(); try { @@ -851,7 +844,6 @@ public void freeSearcher(Long uid) throws LuceneException { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); } } - searches.remove(uid); } } @@ -924,8 +916,7 @@ private void init() { : 5; aggregateFiles = props.getBoolean("aggregateFiles", false); - timer = new Timer("LuceneCommitTimer"); - timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); + initTimer(); icatUnits = new IcatUnits(props.getString("units", "")); @@ -940,6 +931,14 @@ private void init() { aggregateFiles); } + /** + * Starts a timer and schedules regular commits of the IndexWriter. 
+ */ + private void initTimer() { + timer = new Timer("LuceneCommitTimer"); + timer.schedule(new CommitTimerTask(), luceneCommitMillis, luceneCommitMillis); + } + class CommitTimerTask extends TimerTask { @Override public void run() { @@ -1235,18 +1234,16 @@ private void addFacetResults(int maxLabels, Map facete * @throws LuceneException */ private String searchEntity(HttpServletRequest request, String searchAfter, int maxResults, String sort, - SearchType searchType) - throws LuceneException { - Long uid = null; + SearchType searchType) throws LuceneException { + SearchBucket search = null; try { - uid = bucketNum.getAndIncrement(); - SearchBucket search = new SearchBucket(this, searchType, request, sort, searchAfter); - searches.put(uid, search); + search = new SearchBucket(this, searchType, request, sort, searchAfter); return luceneSearchResult(searchType.toString(), search, searchAfter, maxResults); } catch (IOException | QueryNodeException e) { logger.error("Error", e); - freeSearcher(uid); throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); + } finally { + freeSearcher(search); } } From 885b876907c6222e20154301587ad6343b084757 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Thu, 24 Nov 2022 14:14:09 +0000 Subject: [PATCH 62/73] Replace numRamDocs with hasUncommittedChanges #18 --- src/main/java/org/icatproject/lucene/Lucene.java | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 72e6cec..641cc8e 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -152,12 +152,11 @@ public ShardBucket(java.nio.file.Path shardPath) throws IOException { * @throws IOException */ public int commit() throws IOException { - int cached = indexWriter.numRamDocs(); - if (cached > 0) { + if (indexWriter.hasUncommittedChanges()) { indexWriter.commit(); searcherManager.maybeRefreshBlocking(); } - return cached; + return indexWriter.numRamDocs(); } } From ee9da0295f748f8883db3ed871142b52ab06b579 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Fri, 20 Jan 2023 17:04:23 +0000 Subject: [PATCH 63/73] Cache state for facets #18 --- .../icatproject/lucene/FacetedDimension.java | 4 + .../java/org/icatproject/lucene/Lucene.java | 78 ++++++++++++------- 2 files changed, 56 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/FacetedDimension.java b/src/main/java/org/icatproject/lucene/FacetedDimension.java index 98c51c5..6f9cd3e 100644 --- a/src/main/java/org/icatproject/lucene/FacetedDimension.java +++ b/src/main/java/org/icatproject/lucene/FacetedDimension.java @@ -102,4 +102,8 @@ public String getDimension() { return dimension; } + public String toString() { + return dimension + ": " + labels + ", " + counts; + } + } diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 641cc8e..8f0f639 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -107,6 +107,7 @@ private class ShardBucket { private FSDirectory directory; private IndexWriter indexWriter; private SearcherManager searcherManager; + private DefaultSortedSetDocValuesReaderState state; private AtomicLong documentCount; /** @@ -133,15 +134,10 @@ public ShardBucket(java.nio.file.Path shardPath) throws IOException { logger.debug("Now have " + indexWriter.getDocStats().numDocs + " 
documents indexed"); } searcherManager = new SearcherManager(indexWriter, null); - IndexSearcher indexSearcher = null; - int numDocs; - try { - indexSearcher = searcherManager.acquire(); - numDocs = indexSearcher.getIndexReader().numDocs(); - documentCount = new AtomicLong(numDocs); - } finally { - searcherManager.release(indexSearcher); - } + IndexSearcher indexSearcher = searcherManager.acquire(); + int numDocs = indexSearcher.getIndexReader().numDocs(); + documentCount = new AtomicLong(numDocs); + initState(indexSearcher); logger.info("Created ShardBucket for directory {} with {} Documents", directory.getDirectory(), numDocs); } @@ -155,9 +151,28 @@ public int commit() throws IOException { if (indexWriter.hasUncommittedChanges()) { indexWriter.commit(); searcherManager.maybeRefreshBlocking(); + initState(searcherManager.acquire()); } return indexWriter.numRamDocs(); } + + /** + * Creates a new DefaultSortedSetDocValuesReaderState object for this shard. This can be expensive for indices with a large number of faceted dimensions and labels, so should only be done when needed. + * + * @param indexSearcher The underlying reader of this searcher is used to build the state + * @throws IOException + */ + private void initState(IndexSearcher indexSearcher) throws IOException { + try { + state = new DefaultSortedSetDocValuesReaderState(indexSearcher.getIndexReader()); + } catch (IllegalArgumentException e) { + // This can occur if no fields in the index have been faceted + logger.error( + "No facets found in index, resulting in error: " + e.getClass() + " " + e.getMessage()); + } finally { + searcherManager.release(indexSearcher); + } + } } /** @@ -180,6 +195,7 @@ private class IndexBucket { */ public IndexBucket(String entityName) { try { + logger.trace("Initialising bucket for {}", entityName); this.entityName = entityName.toLowerCase(); Long shardIndex = 0L; java.nio.file.Path shardPath = luceneDirectory.resolve(entityName); @@ -208,6 +224,7 @@ public IndexBucket(String entityName) { public List acquireSearchers() throws IOException { List subSearchers = new ArrayList<>(); for (ShardBucket shardBucket : shardList) { + logger.trace("Acquiring searcher for shard"); subSearchers.add(shardBucket.searcherManager.acquire()); } return subSearchers; @@ -826,7 +843,7 @@ public String facet(@PathParam("entityName") String entityName, @Context HttpSer } /** - * Releases all IndexSearchers associated with uid. + * Releases all IndexSearchers associated with a SearchBucket. * * @param search SearchBucket to be freed. 
* @throws LuceneException @@ -857,8 +874,10 @@ public void freeSearcher(SearchBucket search) throws LuceneException { private List getSearchers(Map> searcherMap, String name) throws IOException { String nameLowercase = name.toLowerCase(); + logger.trace("Get searchers for {}", nameLowercase); List subSearchers = searcherMap.get(nameLowercase); if (subSearchers == null) { + logger.trace("No searchers found for {}", nameLowercase); subSearchers = indexBuckets.computeIfAbsent(nameLowercase, k -> new IndexBucket(k)).acquireSearchers(); searcherMap.put(nameLowercase, subSearchers); logger.debug("Remember searcher for {}", nameLowercase); @@ -1076,16 +1095,21 @@ private String luceneFacetResult(String name, SearchBucket search, String search logger.warn("Cannot facet when maxResults={}, maxLabels={}, returning empty list", maxResults, maxLabels); } else { // Iterate over shards and aggregate the facets from each - List searchers = getSearchers(search.searcherMap, name); logger.debug("Faceting {} with {} after {} ", name, search.query, searchAfter); - for (IndexSearcher indexSearcher : searchers) { + List shards = getShards(name); + for (ShardBucket shard : shards) { FacetsCollector facetsCollector = new FacetsCollector(); - TopDocs results = FacetsCollector.search(indexSearcher, search.query, maxResults, facetsCollector); - logger.debug("{}", results.totalHits); - for (FacetedDimension facetedDimension : search.dimensions.values()) { - facetStrings = facetRanges(maxLabels, facetStrings, facetsCollector, facetedDimension); + IndexSearcher indexSearcher = shard.searcherManager.acquire(); + try { + TopDocs results = FacetsCollector.search(indexSearcher, search.query, maxResults, facetsCollector); + logger.debug("{}", results.totalHits); + for (FacetedDimension facetedDimension : search.dimensions.values()) { + facetStrings = facetRanges(maxLabels, facetStrings, facetsCollector, facetedDimension); + } + facetStrings(search, maxLabels, sparse, facetStrings, indexSearcher, facetsCollector, shard.state); + } finally { + shard.searcherManager.release(indexSearcher); } - facetStrings(search, maxLabels, sparse, facetStrings, indexSearcher, facetsCollector); } } // Build results @@ -1150,28 +1174,30 @@ private boolean facetRanges(int maxLabels, boolean facetStrings, FacetsCollector * @param facetStrings Whether specific String dimensions should be faceted * @param indexSearcher Lucene IndexSearcher used to generate the ReaderState * @param facetsCollector Lucene FacetsCollector used to count results + * @param state Lucene State used to count results * @throws IOException */ private void facetStrings(SearchBucket search, int maxLabels, boolean sparse, boolean facetStrings, - IndexSearcher indexSearcher, FacetsCollector facetsCollector) throws IOException { + IndexSearcher indexSearcher, FacetsCollector facetsCollector, DefaultSortedSetDocValuesReaderState state) + throws IOException { try { + if (state == null) { + logger.debug("State not set, this is most likely due to not having any facetable fields"); + return; + } + logger.trace("String faceting"); + Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); if (sparse) { // Facet all applicable string fields - DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState( - indexSearcher.getIndexReader()); - Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); addFacetResults(maxLabels, search.dimensions, facets); - logger.trace("Sparse faceting found results for {} dimensions", 
search.dimensions.size()); + logger.trace("Sparse string faceting found results for {} dimensions", search.dimensions.size()); } else if (facetStrings) { // Only add facets to the results if they match one of the requested dimensions - DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState( - indexSearcher.getIndexReader()); - Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); List facetResults = facets.getAllDims(maxLabels); for (FacetResult facetResult : facetResults) { String dimension = facetResult.dim.replace(".keyword", ""); FacetedDimension facetedDimension = search.dimensions.get(dimension); - logger.debug("String facets found for {}, requested dimensions were {}", dimension, + logger.trace("String facets found for {}, requested dimensions were {}", dimension, search.dimensions.keySet()); if (facetedDimension != null) { facetedDimension.addResult(facetResult); From 421020b5e920d81e341dbb56ee1208c57b908f0d Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Mon, 23 Jan 2023 16:21:45 +0000 Subject: [PATCH 64/73] InvestigationFacilityCycle support --- .../icatproject/lucene/DocumentMapping.java | 10 +- .../java/org/icatproject/lucene/Lucene.java | 140 ++++++++++-------- 2 files changed, 85 insertions(+), 65 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/DocumentMapping.java b/src/main/java/org/icatproject/lucene/DocumentMapping.java index 95c2692..2d63fa7 100644 --- a/src/main/java/org/icatproject/lucene/DocumentMapping.java +++ b/src/main/java/org/icatproject/lucene/DocumentMapping.java @@ -53,17 +53,17 @@ public ParentRelationship(String parentName, String joiningField, String... fiel Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate", "fileSize", "fileCount")); sortFields.addAll( - Arrays.asList("datafile.id", "dataset.id", "investigation.id", "instrument.id", "id", "sample.id", - "sample.investigation.id", "date", "name", "stringValue", "dateTimeValue", "numericValue", - "numericValueSI", "fileSize", "fileCount")); + Arrays.asList("datafile.id", "dataset.id", "facilitycycle.id", "investigation.id", "instrument.id", + "id", "sample.id", "sample.investigation.id", "date", "name", "stringValue", "dateTimeValue", + "numericValue", "numericValueSI", "fileSize", "fileCount")); textFields.addAll(Arrays.asList("name", "visitId", "description", "location", "dataset.name", "investigation.name", "instrument.name", "instrument.fullName", "datafileFormat.name", "sample.name", "sample.type.name", "technique.name", "technique.description", "technique.pid", "title", "summary", "facility.name", "user.fullName", "type.name", "doi")); indexedEntities.addAll(Arrays.asList("Datafile", "Dataset", "Investigation", "DatafileParameter", - "DatasetParameter", "DatasetTechnique", "InstrumentScientist", "InvestigationInstrument", - "InvestigationParameter", "InvestigationUser", "Sample", "SampleParameter")); + "DatasetParameter", "DatasetTechnique", "InstrumentScientist", "InvestigationFacilityCycle", + "InvestigationInstrument", "InvestigationParameter", "InvestigationUser", "Sample", "SampleParameter")); relationships.put("Instrument", new ParentRelationship[] { new ParentRelationship("InvestigationInstrument", "instrument.id", diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 8f0f639..25e46a3 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -157,18 +157,23 @@ 
public int commit() throws IOException { } /** - * Creates a new DefaultSortedSetDocValuesReaderState object for this shard. This can be expensive for indices with a large number of faceted dimensions and labels, so should only be done when needed. + * Creates a new DefaultSortedSetDocValuesReaderState object for this shard. + * This can be expensive for indices with a large number of faceted dimensions + * and labels, so should only be done when needed. * - * @param indexSearcher The underlying reader of this searcher is used to build the state + * @param indexSearcher The underlying reader of this searcher is used to build + * the state * @throws IOException */ private void initState(IndexSearcher indexSearcher) throws IOException { try { state = new DefaultSortedSetDocValuesReaderState(indexSearcher.getIndexReader()); } catch (IllegalArgumentException e) { - // This can occur if no fields in the index have been faceted + // This can occur if no fields in the index have been faceted, in which case set + // state to null to ensure we don't (erroneously) use the old state logger.error( "No facets found in index, resulting in error: " + e.getClass() + " " + e.getMessage()); + state = null; } finally { searcherManager.release(indexSearcher); } @@ -550,25 +555,27 @@ private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFi for (ShardBucket shardBucket : indexBucket.shardList) { shardBucket.commit(); IndexSearcher searcher = shardBucket.searcherManager.acquire(); - Term idTerm = new Term("id", entityId); - TopDocs topDocs = searcher.search(new TermQuery(idTerm), 1); - if (topDocs.totalHits.value == 1) { - int docId = topDocs.scoreDocs[0].doc; - Document document = searcher.doc(docId); + try { + Term idTerm = new Term("id", entityId); + TopDocs topDocs = searcher.search(new TermQuery(idTerm), 1); + if (topDocs.totalHits.value == 1) { + int docId = topDocs.scoreDocs[0].doc; + Document document = searcher.doc(docId); + Set prunedFields = new HashSet<>(); + List fieldsToAdd = new ArrayList<>(); + + incrementFileStatistic("fileSize", deltaFileSize, document, prunedFields, fieldsToAdd); + incrementFileStatistic("fileCount", deltaFileCount, document, prunedFields, fieldsToAdd); + + Document newDocument = pruneDocument(prunedFields, document); + fieldsToAdd.forEach(field -> newDocument.add(field)); + shardBucket.indexWriter.updateDocument(idTerm, facetsConfig.build(newDocument)); + shardBucket.commit(); + break; + } + } finally { shardBucket.searcherManager.release(searcher); - Set prunedFields = new HashSet<>(); - List fieldsToAdd = new ArrayList<>(); - - incrementFileStatistic("fileSize", deltaFileSize, document, prunedFields, fieldsToAdd); - incrementFileStatistic("fileCount", deltaFileCount, document, prunedFields, fieldsToAdd); - - Document newDocument = pruneDocument(prunedFields, document); - fieldsToAdd.forEach(field -> newDocument.add(field)); - shardBucket.indexWriter.updateDocument(idTerm, facetsConfig.build(newDocument)); - shardBucket.commit(); - break; } - shardBucket.searcherManager.release(searcher); } } } @@ -687,21 +694,23 @@ private void delete(JsonObject operationBody) throws LuceneException, IOExceptio if (aggregateFiles && entityName.equals("Datafile")) { for (ShardBucket shardBucket : bucket.shardList) { IndexSearcher datafileSearcher = shardBucket.searcherManager.acquire(); - TopDocs topDocs = datafileSearcher.search(new TermQuery(term), 1); - if (topDocs.totalHits.value == 1) { - int docId = topDocs.scoreDocs[0].doc; - Document datasetDocument = 
datafileSearcher.doc(docId); - long sizeToSubtract = datasetDocument.getField("fileSize").numericValue().longValue(); - if (sizeToSubtract > 0) { - String datasetId = datasetDocument.getField("dataset.id").stringValue(); - String investigationId = datasetDocument.getField("investigation.id").stringValue(); - aggregateFileSize(0, sizeToSubtract, -1, datasetId, "dataset"); - aggregateFileSize(0, sizeToSubtract, -1, investigationId, "investigation"); + try { + TopDocs topDocs = datafileSearcher.search(new TermQuery(term), 1); + if (topDocs.totalHits.value == 1) { + int docId = topDocs.scoreDocs[0].doc; + Document datasetDocument = datafileSearcher.doc(docId); + long sizeToSubtract = datasetDocument.getField("fileSize").numericValue().longValue(); + if (sizeToSubtract > 0) { + String datasetId = datasetDocument.getField("dataset.id").stringValue(); + String investigationId = datasetDocument.getField("investigation.id").stringValue(); + aggregateFileSize(0, sizeToSubtract, -1, datasetId, "dataset"); + aggregateFileSize(0, sizeToSubtract, -1, investigationId, "investigation"); + } + break; } + } finally { shardBucket.searcherManager.release(datafileSearcher); - break; } - shardBucket.searcherManager.release(datafileSearcher); } } for (ShardBucket shardBucket : bucket.shardList) { @@ -1036,27 +1045,31 @@ public void lock(@PathParam("entityName") String entityName, @QueryParam("minId" for (ShardBucket shardBucket : bucket.shardList) { IndexSearcher searcher = shardBucket.searcherManager.acquire(); - Query query; - if (minId == null && maxId == null) { - query = new MatchAllDocsQuery(); - } else { - if (minId == null) { - minId = Long.MIN_VALUE; + try { + Query query; + if (minId == null && maxId == null) { + query = new MatchAllDocsQuery(); + } else { + if (minId == null) { + minId = Long.MIN_VALUE; + } + if (maxId == null) { + maxId = Long.MAX_VALUE; + } + query = LongPoint.newRangeQuery("id.long", minId + 1, maxId); } - if (maxId == null) { - maxId = Long.MAX_VALUE; + TopDocs topDoc = searcher.search(query, 1); + if (topDoc.scoreDocs.length != 0) { + // If we have any results in the populating range, unlock and throw + bucket.locked.compareAndSet(true, false); + Document doc = searcher.doc(topDoc.scoreDocs[0].doc); + String id = doc.get("id"); + String message = "While locking index, found id " + id + " in specified range"; + logger.error(message); + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, message); } - query = LongPoint.newRangeQuery("id.long", minId + 1, maxId); - } - TopDocs topDoc = searcher.search(query, 1); - if (topDoc.scoreDocs.length != 0) { - // If we have any results in the populating range, unlock and throw - bucket.locked.compareAndSet(true, false); - Document doc = searcher.doc(topDoc.scoreDocs[0].doc); - String id = doc.get("id"); - String message = "While locking index, found id " + id + " in specified range"; - logger.error(message); - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, message); + } finally { + shardBucket.searcherManager.release(searcher); } } } catch (IOException e) { @@ -1106,6 +1119,13 @@ private String luceneFacetResult(String name, SearchBucket search, String search for (FacetedDimension facetedDimension : search.dimensions.values()) { facetStrings = facetRanges(maxLabels, facetStrings, facetsCollector, facetedDimension); } + if (shard.state == null) { + logger.debug("State not set, this is most likely due to not having any facetable fields"); + continue; + } else if (shard.state.reader != indexSearcher.getIndexReader()) { 
+ logger.warn("Attempted search with outdated state, create new one from current IndexReader"); + shard.state = new DefaultSortedSetDocValuesReaderState(indexSearcher.getIndexReader()); + } facetStrings(search, maxLabels, sparse, facetStrings, indexSearcher, facetsCollector, shard.state); } finally { shard.searcherManager.release(indexSearcher); @@ -1181,10 +1201,6 @@ private void facetStrings(SearchBucket search, int maxLabels, boolean sparse, bo IndexSearcher indexSearcher, FacetsCollector facetsCollector, DefaultSortedSetDocValuesReaderState state) throws IOException { try { - if (state == null) { - logger.debug("State not set, this is most likely due to not having any facetable fields"); - return; - } logger.trace("String faceting"); Facets facets = new SortedSetDocValuesFacetCounts(state, facetsCollector); if (sparse) { @@ -1342,12 +1358,16 @@ private TopFieldDocs searchShards(SearchBucket search, int maxResults, List Date: Wed, 6 Sep 2023 15:50:29 +0000 Subject: [PATCH 65/73] Replace javax with jakarta in new files --- .../icatproject/lucene/FacetedDimension.java | 4 ++-- .../java/org/icatproject/lucene/Field.java | 2 +- .../org/icatproject/lucene/SearchBucket.java | 18 +++++++++--------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/FacetedDimension.java b/src/main/java/org/icatproject/lucene/FacetedDimension.java index 6f9cd3e..bfd1e7f 100644 --- a/src/main/java/org/icatproject/lucene/FacetedDimension.java +++ b/src/main/java/org/icatproject/lucene/FacetedDimension.java @@ -3,8 +3,8 @@ import java.util.ArrayList; import java.util.List; -import javax.json.Json; -import javax.json.JsonObjectBuilder; +import jakarta.json.Json; +import jakarta.json.JsonObjectBuilder; import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.LabelAndValue; diff --git a/src/main/java/org/icatproject/lucene/Field.java b/src/main/java/org/icatproject/lucene/Field.java index c043a8b..0231546 100644 --- a/src/main/java/org/icatproject/lucene/Field.java +++ b/src/main/java/org/icatproject/lucene/Field.java @@ -1,6 +1,6 @@ package org.icatproject.lucene; -import javax.json.JsonObject; +import jakarta.json.JsonObject; import org.apache.lucene.document.Document; import org.apache.lucene.document.DoublePoint; diff --git a/src/main/java/org/icatproject/lucene/SearchBucket.java b/src/main/java/org/icatproject/lucene/SearchBucket.java index 3a28375..e843e97 100644 --- a/src/main/java/org/icatproject/lucene/SearchBucket.java +++ b/src/main/java/org/icatproject/lucene/SearchBucket.java @@ -15,15 +15,15 @@ import java.util.TimeZone; import java.util.Map.Entry; -import javax.json.Json; -import javax.json.JsonArray; -import javax.json.JsonNumber; -import javax.json.JsonObject; -import javax.json.JsonReader; -import javax.json.JsonString; -import javax.json.JsonValue; -import javax.json.JsonValue.ValueType; -import javax.servlet.http.HttpServletRequest; +import jakarta.json.Json; +import jakarta.json.JsonArray; +import jakarta.json.JsonNumber; +import jakarta.json.JsonObject; +import jakarta.json.JsonReader; +import jakarta.json.JsonString; +import jakarta.json.JsonValue; +import jakarta.json.JsonValue.ValueType; +import jakarta.servlet.http.HttpServletRequest; import org.apache.lucene.document.DoublePoint; import org.apache.lucene.document.LongPoint; From 453a7252fb8a7a8d5d3dd5f4b02ae3ff05c86028 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Fri, 8 Sep 2023 16:09:46 +0000 Subject: [PATCH 66/73] 3.0.0 release notes --- 
src/site/xhtml/installation.xhtml.vm | 19 +++++++++++++++++++ src/site/xhtml/release-notes.xhtml | 12 ++++++++++++ 2 files changed, 31 insertions(+) diff --git a/src/site/xhtml/installation.xhtml.vm b/src/site/xhtml/installation.xhtml.vm index 37ec5ef..7158410 100644 --- a/src/site/xhtml/installation.xhtml.vm +++ b/src/site/xhtml/installation.xhtml.vm @@ -56,6 +56,11 @@
    the interval in seconds between committing lucene changes to disk and updating the index.
    +
    maxShardSize
    +
    The maximum number of documents to store in a single index before "sharding" + into an additional index. All sharded indices are searched at once when + performing a search. Has a maximum value of 2147483648 (max int + 1).
    +
    ip
    Ranges of ip addresses to accept requests from. This should be as restrictive as possible - just list the icats you need to @@ -63,6 +68,20 @@ take the form of an IPV4 or IPV6 address followed by the number of bits (starting from the most significant) to consider. For example 127.0.0.1/32 is the IPV4 value for localhost.
    + +
    units
    +
Recognised unit names/symbols. Each symbol recognised by indriya's + SimpleUnitFormat should be followed by a colon, and then a comma-separated + list of units measuring the same property. If the unit is simply an alias + (e.g. "K: kelvin") this is sufficient. If a conversion is required, it + should be followed by this factor (e.g. "J: eV 1.602176634e-19"). Different + units can be separated by a semi-colon. A sample run.properties follows this list.
    + +
    aggregateFiles
    +
Aggregate file sizes/counts for Datasets and Investigations as Datafiles are + added or modified (i.e. in real time). This can have a significant + performance impact when writing to the index. If "false", icat.server can + instead be configured to update sizes at regular intervals.
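Putting these options together, a complete run.properties might look like the following sketch; the values are illustrative only, and the units entry simply reuses the alias and conversion-factor examples quoted above:

# Illustrative run.properties sketch - adjust paths and values for your deployment
directory = ${HOME}/data/search
commitSeconds = 5
maxShardSize = 2147483648
ip = 127.0.0.1/32
# "K: kelvin" is a plain alias; "J: eV 1.602176634e-19" applies a conversion factor
units = K: kelvin; J: eV 1.602176634e-19
aggregateFiles = false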
    diff --git a/src/site/xhtml/release-notes.xhtml b/src/site/xhtml/release-notes.xhtml index 1b43c6b..145a2f9 100644 --- a/src/site/xhtml/release-notes.xhtml +++ b/src/site/xhtml/release-notes.xhtml @@ -6,6 +6,18 @@

    ICAT Lucene Server Release Notes

    +

    3.0.0

    +

    Significant changes to the functionality and performance of searches:

    +
      +
    • Ability to search on over 2 billion documents
    • +
    • Enable sorting on specific entity fields
    • +
    • "Infinitely" search the data by using the searchAfter parameter
    • +
    • Faceted searches
    • +
    • Replace single "text" field with specific fields that reflect the ICAT schema to allow field targeting
    • +
    • Support for unit conversion on numeric Parameters
    • +
    • Support for synonym injection
    • +
    +
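The searchAfter behaviour listed above maps onto Lucene's own paging API rather than anything specific to this service; the sketch below shows that underlying mechanism against a throwaway in-memory index, where the field name and document contents are illustrative assumptions:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.ByteBuffersDirectory;

public class SearchAfterSketch {
    public static void main(String[] args) throws Exception {
        ByteBuffersDirectory directory = new ByteBuffersDirectory();
        try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) {
            for (int i = 0; i < 25; i++) {
                Document doc = new Document();
                doc.add(new StringField("name", "doc" + i, Store.YES));
                writer.addDocument(doc);
            }
        }
        try (DirectoryReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            ScoreDoc last = null; // acts as the search_after cursor
            while (true) {
                // Each call resumes from the last hit of the previous page
                TopDocs page = searcher.searchAfter(last, new MatchAllDocsQuery(), 10);
                if (page.scoreDocs.length == 0) {
                    break;
                }
                for (ScoreDoc hit : page.scoreDocs) {
                    System.out.println(searcher.doc(hit.doc).get("name"));
                }
                last = page.scoreDocs[page.scoreDocs.length - 1];
            }
        }
    }
}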

    2.0.2

    Fix compatibility with indexes built by icat.lucene 1.x
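Likewise, the faceted searches noted for 3.0.0, and the per-shard DefaultSortedSetDocValuesReaderState cached in the patches above, rest on Lucene's sorted-set doc values facets. A minimal sketch of that pattern follows; the "type.name" dimension matches one of this project's faceted fields, but the documents and values are invented:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.sortedset.DefaultSortedSetDocValuesReaderState;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.store.ByteBuffersDirectory;

public class FacetSketch {
    public static void main(String[] args) throws Exception {
        ByteBuffersDirectory directory = new ByteBuffersDirectory();
        FacetsConfig config = new FacetsConfig();
        try (IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(new StandardAnalyzer()))) {
            for (String type : new String[] { "raw", "raw", "processed" }) {
                Document doc = new Document();
                doc.add(new SortedSetDocValuesFacetField("type.name", type));
                // config.build adds the doc values that the facet counts need
                writer.addDocument(config.build(doc));
            }
        }
        try (DirectoryReader reader = DirectoryReader.open(directory)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // Building this state walks the facet ordinals and throws
            // IllegalArgumentException if nothing was faceted, which is why the
            // service above builds it once per reader and caches it
            DefaultSortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(reader);
            FacetsCollector collector = new FacetsCollector();
            FacetsCollector.search(searcher, new MatchAllDocsQuery(), 10, collector);
            Facets facets = new SortedSetDocValuesFacetCounts(state, collector);
            for (FacetResult result : facets.getAllDims(10)) {
                System.out.println(result); // e.g. dim=type.name ... raw (2) processed (1)
            }
        }
    }
}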

    From d31e5b7c93130fee8759e4f53c07065eb0b0ec42 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Tue, 26 Sep 2023 10:07:01 +0000 Subject: [PATCH 67/73] Index id as long instead of String #18 --- .../icatproject/lucene/DocumentMapping.java | 23 +- .../java/org/icatproject/lucene/Field.java | 8 - .../java/org/icatproject/lucene/Lucene.java | 149 ++++++---- .../org/icatproject/lucene/SearchBucket.java | 274 ++++++++++-------- 4 files changed, 257 insertions(+), 197 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/DocumentMapping.java b/src/main/java/org/icatproject/lucene/DocumentMapping.java index b1d88f6..4b7998a 100644 --- a/src/main/java/org/icatproject/lucene/DocumentMapping.java +++ b/src/main/java/org/icatproject/lucene/DocumentMapping.java @@ -33,7 +33,7 @@ public ParentRelationship(String parentName, String joiningField, String... fiel } } - private static Analyzer analyzer = new IcatSynonymAnalyzer();; + private static Analyzer analyzer = new IcatSynonymAnalyzer();; public static final Set doubleFields = new HashSet<>(); public static final Set facetFields = new HashSet<>(); @@ -44,12 +44,12 @@ public ParentRelationship(String parentName, String joiningField, String... fiel public static final Map relationships = new HashMap<>(); public static final StandardQueryParser genericParser = buildParser(); - public static final StandardQueryParser datafileParser = buildParser("name", "description", "location", "datafileFormat.name", "visitId", - "sample.name", "sample.type.name", "doi"); - public static final StandardQueryParser datasetParser = buildParser("name", "description", "sample.name", "sample.type.name", "type.name", - "visitId", "doi"); - public static final StandardQueryParser investigationParser = buildParser("name", "visitId", "title", "summary", "facility.name", - "type.name", "doi"); + public static final StandardQueryParser datafileParser = buildParser("name", "description", "location", + "datafileFormat.name", "visitId", "sample.name", "sample.type.name", "doi"); + public static final StandardQueryParser datasetParser = buildParser("name", "description", "sample.name", + "sample.type.name", "type.name", "visitId", "doi"); + public static final StandardQueryParser investigationParser = buildParser("name", "visitId", "title", "summary", + "facility.name", "type.name", "doi"); public static final StandardQueryParser sampleParser = buildParser("sample.name", "sample.type.name"); static { @@ -58,10 +58,13 @@ public ParentRelationship(String parentName, String joiningField, String... 
fiel facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name", "stringValue", "technique.name")); longFields.addAll( Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate", "fileSize", - "fileCount")); + "fileCount", "datafile.id", "datafileFormat.id", "dataset.id", "facility.id", + "facilityCycle.id", "investigation.id", "instrument.id", "id", "sample.id", + "sample.investigation.id", "sample.type.id", "technique.id", "type.id", "user.id")); sortFields.addAll( - Arrays.asList("datafile.id", "dataset.id", "facilitycycle.id", "investigation.id", "instrument.id", - "id", "sample.id", "sample.investigation.id", "date", "name", "stringValue", "dateTimeValue", + Arrays.asList("datafile.id", "datafileFormat.id", "dataset.id", "facility.id", "facilityCycle.id", + "investigation.id", "instrument.id", "id", "sample.id", "sample.investigation.id", + "technique.id", "type.id", "user.id", "date", "name", "stringValue", "dateTimeValue", "numericValue", "numericValueSI", "fileSize", "fileCount")); textFields.addAll(Arrays.asList("name", "visitId", "description", "location", "dataset.name", "investigation.name", "instrument.name", "instrument.fullName", "datafileFormat.name", "sample.name", diff --git a/src/main/java/org/icatproject/lucene/Field.java b/src/main/java/org/icatproject/lucene/Field.java index 0231546..ea33aa3 100644 --- a/src/main/java/org/icatproject/lucene/Field.java +++ b/src/main/java/org/icatproject/lucene/Field.java @@ -41,14 +41,6 @@ public InnerStringField(String value) { @Override public void addSortable(Document document) throws NumberFormatException { if (DocumentMapping.sortFields.contains(name)) { - if (name.equals("id")) { - // Id is a special case, as we need to to be SORTED as a byte ref to allow joins - // but also SORTED_NUMERIC to ensure a deterministic order to results - Long longValue = new Long(value); - document.add(new NumericDocValuesField("id.long", longValue)); - document.add(new StoredField("id.long", longValue)); - document.add(new LongPoint("id.long", longValue)); - } document.add(new SortedDocValuesField(name, new BytesRef(value))); } } diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index 3e84a52..a173aa4 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -76,7 +76,6 @@ import org.apache.lucene.search.SortField.Type; import org.apache.lucene.search.TimeLimitingCollector.TimeExceededException; import org.apache.lucene.search.SortedNumericSortField; -import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TimeLimitingCollector; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopFieldCollector; @@ -248,18 +247,29 @@ public void addDocument(Document document) throws IOException { } /** - * Updates documents matching the term with the provided document. + * Deletes a document from the appropriate shard for this index. * - * @param term Term identifying the old document(s) to be updated. - * @param document The document that will replace the old document(s). + * @param icatId The ICAT id of the document to be deleted. 
* @throws IOException */ - public void updateDocument(Term term, Document document) throws IOException { + public void deleteDocument(long icatId) throws IOException { for (ShardBucket shardBucket : shardList) { - shardBucket.indexWriter.updateDocument(term, document); + shardBucket.indexWriter.deleteDocuments(LongPoint.newExactQuery("id", icatId)); } } + /** + * Updates the document with the provided ICAT id. + * + * @param icatId The ICAT id of the document to be updated. + * @param document The document that will replace the old document. + * @throws IOException + */ + public void updateDocument(long icatId, Document document) throws IOException { + deleteDocument(icatId); + addDocument(document); + } + /** * Creates a new ShardBucket and stores it in the shardMap. * @@ -485,7 +495,7 @@ public void commit() throws LuceneException { for (Entry entry : indexBuckets.entrySet()) { IndexBucket bucket = entry.getValue(); if (!bucket.locked.get()) { - logger.info("{} is unlocked", entry.getKey()); + logger.trace("{} is unlocked", entry.getKey()); bucket.commit("Synch", entry.getKey()); } } @@ -523,9 +533,8 @@ private void create(JsonObject operationBody) throws NumberFormatException, IOEx if (aggregateFiles && entityName.equals("Datafile")) { JsonNumber jsonFileSize = documentObject.getJsonNumber("fileSize"); if (jsonFileSize != null) { - String datasetId = documentObject.getString("dataset.id", null); - String investigationId = documentObject.getString("investigation.id", null); - logger.trace("Aggregating {} to {}, {}", jsonFileSize.longValue(), datasetId, investigationId); + JsonNumber datasetId = documentObject.getJsonNumber("dataset.id"); + JsonNumber investigationId = documentObject.getJsonNumber("investigation.id"); aggregateFileSize(jsonFileSize.longValueExact(), 0, 1, datasetId, "dataset"); aggregateFileSize(jsonFileSize.longValueExact(), 0, 1, investigationId, "investigation"); } @@ -543,22 +552,42 @@ private void create(JsonObject operationBody) throws NumberFormatException, IOEx * @param sizeToSubtract Decreases the fileSize of the entity by this much. * Should be 0 for creates. * @param deltaFileCount Changes the file count by this much. - * @param entityId Icat id of entity to update. + * @param entityId Icat id of entity to update as a JsonNumber. * @param index Index (entity) to update. * @throws IOException */ - private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFileCount, String entityId, - String index) - throws IOException { + private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFileCount, JsonNumber entityId, + String index) throws IOException { + if (entityId != null) { + aggregateFileSize(sizeToAdd, sizeToSubtract, deltaFileCount, entityId.longValueExact(), index); + } + } + + /** + * Changes the fileSize on an entity by the specified amount. This is used to + * aggregate the individual fileSize of Datafiles up to Dataset and + * Investigation sizes. + * + * @param sizeToAdd Increases the fileSize of the entity by this much. + * Should be 0 for deletes. + * @param sizeToSubtract Decreases the fileSize of the entity by this much. + * Should be 0 for creates. + * @param deltaFileCount Changes the file count by this much. + * @param entityId Icat id of entity to update as a long. + * @param index Index (entity) to update. 
+ * @throws IOException + */ + private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFileCount, long entityId, + String index) throws IOException { long deltaFileSize = sizeToAdd - sizeToSubtract; - if (entityId != null && (deltaFileSize != 0 || deltaFileCount != 0)) { + if (deltaFileSize != 0 || deltaFileCount != 0) { IndexBucket indexBucket = indexBuckets.computeIfAbsent(index, k -> new IndexBucket(k)); for (ShardBucket shardBucket : indexBucket.shardList) { shardBucket.commit(); IndexSearcher searcher = shardBucket.searcherManager.acquire(); try { - Term idTerm = new Term("id", entityId); - TopDocs topDocs = searcher.search(new TermQuery(idTerm), 1); + Query idQuery = LongPoint.newExactQuery("id", entityId); + TopDocs topDocs = searcher.search(idQuery, 1); if (topDocs.totalHits.value == 1) { int docId = topDocs.scoreDocs[0].doc; Document document = searcher.doc(docId); @@ -570,7 +599,8 @@ private void aggregateFileSize(long sizeToAdd, long sizeToSubtract, long deltaFi Document newDocument = pruneDocument(prunedFields, document); fieldsToAdd.forEach(field -> newDocument.add(field)); - shardBucket.indexWriter.updateDocument(idTerm, facetsConfig.build(newDocument)); + shardBucket.indexWriter.deleteDocuments(idQuery); + shardBucket.indexWriter.addDocument(facetsConfig.build(newDocument)); shardBucket.commit(); break; } @@ -682,7 +712,7 @@ private void delete(JsonObject operationBody) throws LuceneException, IOExceptio updateByRelation(operationBody, true); } if (DocumentMapping.indexedEntities.contains(entityName)) { - String icatId = operationBody.getString("_id"); + long icatId = operationBody.getJsonNumber("_id").longValueExact(); try { IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); if (bucket.locked.get()) { @@ -690,20 +720,21 @@ private void delete(JsonObject operationBody) throws LuceneException, IOExceptio "Lucene locked for " + entityName); } logger.trace("delete {} {}", entityName, icatId); + Query idQuery = LongPoint.newExactQuery("id", icatId); // Special case for filesizes - Term term = new Term("id", icatId); if (aggregateFiles && entityName.equals("Datafile")) { for (ShardBucket shardBucket : bucket.shardList) { IndexSearcher datafileSearcher = shardBucket.searcherManager.acquire(); try { - TopDocs topDocs = datafileSearcher.search(new TermQuery(term), 1); + TopDocs topDocs = datafileSearcher.search(idQuery, 1); if (topDocs.totalHits.value == 1) { int docId = topDocs.scoreDocs[0].doc; Document datasetDocument = datafileSearcher.doc(docId); long sizeToSubtract = datasetDocument.getField("fileSize").numericValue().longValue(); if (sizeToSubtract > 0) { - String datasetId = datasetDocument.getField("dataset.id").stringValue(); - String investigationId = datasetDocument.getField("investigation.id").stringValue(); + long datasetId = datasetDocument.getField("dataset.id").numericValue().longValue(); + long investigationId = datasetDocument.getField("investigation.id").numericValue() + .longValue(); aggregateFileSize(0, sizeToSubtract, -1, datasetId, "dataset"); aggregateFileSize(0, sizeToSubtract, -1, investigationId, "investigation"); } @@ -715,7 +746,7 @@ private void delete(JsonObject operationBody) throws LuceneException, IOExceptio } } for (ShardBucket shardBucket : bucket.shardList) { - shardBucket.indexWriter.deleteDocuments(term); + shardBucket.indexWriter.deleteDocuments(idQuery); } } catch (IOException e) { throw new LuceneException(HttpURLConnection.HTTP_INTERNAL_ERROR, e.getMessage()); @@ 
-755,20 +786,20 @@ private void encodeResult(String entityName, JsonGenerator gen, ScoreDoc hit, In List shards = getShards(joinedEntityName); SearchBucket joinedSearch = new SearchBucket(this); String fld; - String parentId; + long parentId; if (joinedEntityName.toLowerCase().contains("investigation")) { fld = "investigation.id"; if (entityName.equalsIgnoreCase("investigation")) { - parentId = document.get("id"); + parentId = document.getField("id").numericValue().longValue(); } else { - parentId = document.get("investigation.id"); + parentId = document.getField("investigation.id").numericValue().longValue(); } } else { fld = entityName.toLowerCase() + ".id"; - parentId = document.get("id"); + parentId = document.getField("id").numericValue().longValue(); } - joinedSearch.query = new TermQuery(new Term(fld, parentId)); - joinedSearch.sort = new Sort(new SortedNumericSortField("id.long", Type.LONG)); + joinedSearch.query = LongPoint.newExactQuery(fld, parentId); + joinedSearch.sort = new Sort(new SortedNumericSortField("id", Type.LONG)); TopFieldDocs topFieldDocs = searchShards(joinedSearch, 100, shards); gen.writeStartArray(joinedEntityName.toLowerCase()); for (ScoreDoc joinedHit : topFieldDocs.scoreDocs) { @@ -939,7 +970,7 @@ private void init() { } luceneCommitMillis = props.getPositiveInt("commitSeconds") * 1000; - luceneMaxShardSize = Math.max(props.getPositiveLong("maxShardSize"), new Long(Integer.MAX_VALUE + 1)); + luceneMaxShardSize = Math.max(props.getPositiveLong("maxShardSize"), Long.valueOf(Integer.MAX_VALUE + 1)); maxSearchTimeSeconds = props.has("maxSearchTimeSeconds") ? props.getPositiveLong("maxSearchTimeSeconds") : 5; aggregateFiles = props.getBoolean("aggregateFiles", false); @@ -1057,14 +1088,14 @@ public void lock(@PathParam("entityName") String entityName, @QueryParam("minId" if (maxId == null) { maxId = Long.MAX_VALUE; } - query = LongPoint.newRangeQuery("id.long", minId + 1, maxId); + query = LongPoint.newRangeQuery("id", minId + 1, maxId); } TopDocs topDoc = searcher.search(query, 1); if (topDoc.scoreDocs.length != 0) { // If we have any results in the populating range, unlock and throw bucket.locked.compareAndSet(true, false); Document doc = searcher.doc(topDoc.scoreDocs[0].doc); - String id = doc.get("id"); + long id = doc.getField("id").numericValue().longValue(); String message = "While locking index, found id " + id + " in specified range"; logger.error(message); throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, message); @@ -1469,14 +1500,14 @@ private void encodeSearchAfterField(JsonGenerator gen, SortField sortField, Scor if (indexableField.numericValue() != null) { gen.write(indexableField.numericValue().longValue()); } else if (indexableField.stringValue() != null) { - gen.write(new Long(indexableField.stringValue())); + gen.write(Long.valueOf(indexableField.stringValue())); } break; case DOUBLE: if (indexableField.numericValue() != null) { gen.write(indexableField.numericValue().doubleValue()); } else if (indexableField.stringValue() != null) { - gen.write(new Double(indexableField.stringValue())); + gen.write(Double.valueOf(indexableField.stringValue())); } break; case STRING: @@ -1511,15 +1542,18 @@ private Document parseDocument(JsonObject json) { * @param json A JsonObject representing the Document to be built * @param document The new Document being built * @param key A key present in json + * @retrun Whether a conversion has been performed or not */ - private void convertUnits(JsonObject json, Document document, String key) { + private 
boolean convertUnits(JsonObject json, Document document, String key) { // Whenever the units are set or changed, convert to SI if (key.equals("type.units")) { String unitString = json.getString("type.units"); convertValue(document, json, unitString, "numericValue"); convertValue(document, json, unitString, "rangeTop"); convertValue(document, json, unitString, "rangeBottom"); + return true; } + return false; } /** @@ -1566,20 +1600,28 @@ private void convertValue(Document document, JsonObject json, String unitString, * @param oldDocument Lucene Document to be updated. * @return Lucene Document with updated fields. */ - private Document updateDocument(JsonObject json, Document oldDocument) { + private Document updateDocumentFields(JsonObject json, Document oldDocument) { Document newDocument = new Document(); + List fieldsSI = new ArrayList<>(); + boolean hasNewUnits = false; for (IndexableField field : oldDocument.getFields()) { String fieldName = field.name(); if (json.containsKey(fieldName)) { Field jsonField = new Field(json, fieldName); jsonField.addToDocument(newDocument); - convertUnits(json, newDocument, fieldName); + hasNewUnits = hasNewUnits || convertUnits(json, newDocument, fieldName); + } else if (fieldName.endsWith("SI")) { + fieldsSI.add(new Field(field)); } else { - Field sortField = new Field(field); - sortField.addSortable(newDocument); - newDocument.add(field); + Field oldField = new Field(field); + oldField.addToDocument(newDocument); } } + if (!hasNewUnits) { + fieldsSI.forEach((field) -> { + field.addToDocument(newDocument); + }); + } return newDocument; } @@ -1644,7 +1686,7 @@ private void update(JsonObject operationBody) throws LuceneException, NumberForm updateByRelation(operationBody, false); } if (DocumentMapping.indexedEntities.contains(entityName)) { - String icatId = operationBody.getString("_id"); + long icatId = operationBody.getJsonNumber("_id").longValueExact(); JsonObject documentObject = operationBody.getJsonObject("doc"); Document document = parseDocument(documentObject); IndexBucket bucket = indexBuckets.computeIfAbsent(entityName.toLowerCase(), k -> new IndexBucket(k)); @@ -1659,15 +1701,15 @@ private void update(JsonObject operationBody) throws LuceneException, NumberForm long sizeToSubtract = 0; List datafileSearchers = bucket.acquireSearchers(); for (IndexSearcher datafileSearcher : datafileSearchers) { - TopDocs topDocs = datafileSearcher.search(new TermQuery(new Term("id", icatId)), 1); + TopDocs topDocs = datafileSearcher.search(LongPoint.newExactQuery("id", icatId), 1); if (topDocs.totalHits.value == 1) { int docId = topDocs.scoreDocs[0].doc; Document datasetDocument = datafileSearcher.doc(docId); sizeToSubtract = datasetDocument.getField("fileSize").numericValue().longValue(); long sizeToAdd = jsonFileSize.longValueExact(); if (sizeToAdd != sizeToSubtract) { - String datasetId = documentObject.getString("dataset.id", null); - String investigationId = documentObject.getString("investigation.id", null); + JsonNumber datasetId = documentObject.getJsonNumber("dataset.id"); + JsonNumber investigationId = documentObject.getJsonNumber("investigation.id"); aggregateFileSize(sizeToAdd, sizeToSubtract, 0, datasetId, "dataset"); aggregateFileSize(sizeToAdd, sizeToSubtract, 0, investigationId, "investigation"); } @@ -1677,7 +1719,7 @@ private void update(JsonObject operationBody) throws LuceneException, NumberForm } } logger.trace("update: {}", document); - bucket.updateDocument(new Term("id", icatId), facetsConfig.build(document)); + 
bucket.updateDocument(icatId, facetsConfig.build(document)); } } @@ -1700,7 +1742,7 @@ private void updateByRelation(JsonObject operationBody, boolean delete) throws LuceneException, NumberFormatException, IOException { for (DocumentMapping.ParentRelationship parentRelationship : DocumentMapping.relationships .get(operationBody.getString("_index"))) { - String childId = operationBody.getString("_id"); + long childId = operationBody.getJsonNumber("_id").longValueExact(); IndexBucket bucket = indexBuckets.computeIfAbsent(parentRelationship.parentName.toLowerCase(), k -> new IndexBucket(k)); if (bucket.locked.get()) { @@ -1710,18 +1752,17 @@ private void updateByRelation(JsonObject operationBody, boolean delete) IndexSearcher searcher = getSearcher(new HashMap<>(), parentRelationship.parentName); int blockSize = 10000; - TermQuery query = new TermQuery(new Term(parentRelationship.joiningField, childId)); - Sort sort = new Sort(new SortField("id", Type.STRING)); + Query query = LongPoint.newExactQuery(parentRelationship.joiningField, childId); + Sort sort = new Sort(new SortField("id", Type.LONG)); ScoreDoc[] scoreDocs = searcher.search(query, blockSize, sort).scoreDocs; while (scoreDocs.length != 0) { - TopDocs topDocs = searcher.search(query, blockSize); - for (ScoreDoc scoreDoc : topDocs.scoreDocs) { + for (ScoreDoc scoreDoc : scoreDocs) { Document oldDocument = searcher.doc(scoreDoc.doc); - String parentId = oldDocument.get("id"); + long parentId = oldDocument.getField("id").numericValue().longValue(); Document newDocument = delete ? pruneDocument(parentRelationship.fields, oldDocument) - : updateDocument(operationBody.getJsonObject("doc"), oldDocument); + : updateDocumentFields(operationBody.getJsonObject("doc"), oldDocument); logger.trace("updateByRelation: {}", newDocument); - bucket.updateDocument(new Term("id", parentId), facetsConfig.build(newDocument)); + bucket.updateDocument(parentId, facetsConfig.build(newDocument)); } scoreDocs = searcher.searchAfter(scoreDocs[scoreDocs.length - 1], query, blockSize, sort).scoreDocs; } diff --git a/src/main/java/org/icatproject/lucene/SearchBucket.java b/src/main/java/org/icatproject/lucene/SearchBucket.java index e843e97..18ad0c1 100644 --- a/src/main/java/org/icatproject/lucene/SearchBucket.java +++ b/src/main/java/org/icatproject/lucene/SearchBucket.java @@ -85,6 +85,7 @@ public enum SearchType { */ public SearchBucket(Lucene lucene) { this.lucene = lucene; + searcherMap = new HashMap<>(); } /** @@ -110,122 +111,137 @@ public SearchBucket(Lucene lucene, SearchType searchType, HttpServletRequest req parseFields(o); parseDimensions(o); JsonObject jsonQuery = o.getJsonObject("query"); - BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); - String userName; - String text; switch (searchType) { case GENERIC: - parseGenericQuery(jsonQuery, luceneQuery); + parseGenericQuery(jsonQuery); return; case DATAFILE: - parseSearchAfter(searchAfter); - buildFilterQueries("datafile", jsonQuery, luceneQuery); + parseDatafileQuery(searchAfter, jsonQuery); + return; + case DATASET: + parseDatasetQuery(searchAfter, jsonQuery); + return; + case INVESTIGATION: + parseInvestigationQuery(searchAfter, jsonQuery); + return; + } + } catch (QueryNodeParseException e) { + String message = "Search term could not be parsed due to syntax errors"; + throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, message); + } + } - userName = jsonQuery.getString("user", null); - if (userName != null) { - buildUserNameQuery(userName, luceneQuery, "investigation.id"); - 
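The Long.class argument threaded through every JoinUtil.createJoinQuery call in this patch selects Lucene's numeric join overload, which reads doc values on the "from" field and turns the collected values into a point query on the "to" field. That only works if each id is indexed in several forms at once. A rough helper showing the combination these queries rely on; the helper is invented for illustration, and exactly which doc-values flavour each API consumes should be checked against the Lucene version in use:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.LongPoint;
    import org.apache.lucene.document.SortedNumericDocValuesField;
    import org.apache.lucene.document.StoredField;

    final class LongFields {
        private LongFields() {
        }

        // Index one logical long value as the cooperating fields Lucene needs.
        static void addLong(Document doc, String name, long value) {
            doc.add(new LongPoint(name, value));                   // exact/range/set queries; "to" side of joins
            doc.add(new SortedNumericDocValuesField(name, value)); // SortedNumericSortField sorting; "from" side of joins
            doc.add(new StoredField(name, value));                 // lets getField(name).numericValue() work on hits
        }
    }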
} + private void parseDatafileQuery(String searchAfter, JsonObject jsonQuery) + throws LuceneException, IOException, QueryNodeException { + BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); + parseSearchAfter(searchAfter); + buildFilterQueries("datafile", jsonQuery, luceneQuery); - text = jsonQuery.getString("text", null); - if (text != null) { - luceneQuery.add(DocumentMapping.datafileParser.parse(text, null), Occur.MUST); - } + String userName = jsonQuery.getString("user", null); + if (userName != null) { + buildUserNameQuery(userName, luceneQuery, "investigation.id"); + } - buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "date"); + String text = jsonQuery.getString("text", null); + if (text != null) { + luceneQuery.add(DocumentMapping.datafileParser.parse(text, null), Occur.MUST); + } - if (jsonQuery.containsKey("parameters")) { - JsonArray parameters = jsonQuery.getJsonArray("parameters"); - IndexSearcher datafileParameterSearcher = lucene.getSearcher(searcherMap, "DatafileParameter"); - for (JsonValue p : parameters) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("datafile.id", false, "id", paramQuery.build(), - datafileParameterSearcher, ScoreMode.None); - luceneQuery.add(toQuery, Occur.MUST); - } - } - query = maybeEmptyQuery(luceneQuery); - return; - case DATASET: - parseSearchAfter(searchAfter); - buildFilterQueries("dataset", jsonQuery, luceneQuery); + buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "date"); - userName = jsonQuery.getString("user", null); - if (userName != null) { - buildUserNameQuery(userName, luceneQuery, "investigation.id"); - } + if (jsonQuery.containsKey("parameters")) { + JsonArray parameters = jsonQuery.getJsonArray("parameters"); + IndexSearcher datafileParameterSearcher = lucene.getSearcher(searcherMap, "DatafileParameter"); + for (JsonValue p : parameters) { + BooleanQuery.Builder paramQuery = parseParameter(p); + Query toQuery = JoinUtil.createJoinQuery("datafile.id", false, "id", Long.class, paramQuery.build(), + datafileParameterSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); + } + } + query = maybeEmptyQuery(luceneQuery); + } - text = jsonQuery.getString("text", null); - if (text != null) { - luceneQuery.add(DocumentMapping.datasetParser.parse(text, null), Occur.MUST); - } + private void parseDatasetQuery(String searchAfter, JsonObject jsonQuery) + throws LuceneException, IOException, QueryNodeException { + BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); + parseSearchAfter(searchAfter); + buildFilterQueries("dataset", jsonQuery, luceneQuery); - buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "startDate", "endDate"); + String userName = jsonQuery.getString("user", null); + if (userName != null) { + buildUserNameQuery(userName, luceneQuery, "investigation.id"); + } - if (jsonQuery.containsKey("parameters")) { - JsonArray parameters = jsonQuery.getJsonArray("parameters"); - IndexSearcher parameterSearcher = lucene.getSearcher(searcherMap, "DatasetParameter"); - for (JsonValue p : parameters) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("dataset.id", false, "id", paramQuery.build(), - parameterSearcher, ScoreMode.None); - luceneQuery.add(toQuery, Occur.MUST); - } - } - query = maybeEmptyQuery(luceneQuery); - return; - case INVESTIGATION: - parseSearchAfter(searchAfter); - buildFilterQueries("investigation", jsonQuery, luceneQuery); + String text = 
jsonQuery.getString("text", null); + if (text != null) { + luceneQuery.add(DocumentMapping.datasetParser.parse(text, null), Occur.MUST); + } - userName = jsonQuery.getString("user", null); - if (userName != null) { - buildUserNameQuery(userName, luceneQuery, "id"); - } + buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "startDate", "endDate"); - text = jsonQuery.getString("text", null); - if (text != null) { - Builder textBuilder = new BooleanQuery.Builder(); - textBuilder.add(DocumentMapping.investigationParser.parse(text, null), Occur.SHOULD); + if (jsonQuery.containsKey("parameters")) { + JsonArray parameters = jsonQuery.getJsonArray("parameters"); + IndexSearcher parameterSearcher = lucene.getSearcher(searcherMap, "DatasetParameter"); + for (JsonValue p : parameters) { + BooleanQuery.Builder paramQuery = parseParameter(p); + Query toQuery = JoinUtil.createJoinQuery("dataset.id", false, "id", Long.class, paramQuery.build(), + parameterSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); + } + } + query = maybeEmptyQuery(luceneQuery); + } - IndexSearcher sampleSearcher = lucene.getSearcher(searcherMap, "Sample"); - Query joinedSampleQuery = JoinUtil.createJoinQuery("sample.investigation.id", false, "id", - DocumentMapping.sampleParser.parse(text, null), sampleSearcher, ScoreMode.Avg); - textBuilder.add(joinedSampleQuery, Occur.SHOULD); - luceneQuery.add(textBuilder.build(), Occur.MUST); - } + private void parseInvestigationQuery(String searchAfter, JsonObject jsonQuery) + throws LuceneException, IOException, QueryNodeException { + BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); + parseSearchAfter(searchAfter); + buildFilterQueries("investigation", jsonQuery, luceneQuery); - buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "startDate", "endDate"); - - if (jsonQuery.containsKey("parameters")) { - JsonArray parameters = jsonQuery.getJsonArray("parameters"); - IndexSearcher parameterSearcher = lucene.getSearcher(searcherMap, "InvestigationParameter"); - for (JsonValue p : parameters) { - BooleanQuery.Builder paramQuery = parseParameter(p); - Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", - paramQuery.build(), - parameterSearcher, ScoreMode.None); - luceneQuery.add(toQuery, Occur.MUST); - } - } + String userName = jsonQuery.getString("user", null); + if (userName != null) { + buildUserNameQuery(userName, luceneQuery, "id"); + } - String userFullName = jsonQuery.getString("userFullName", null); - if (userFullName != null) { - BooleanQuery.Builder userFullNameQuery = new BooleanQuery.Builder(); - userFullNameQuery.add(DocumentMapping.genericParser.parse(userFullName, "user.fullName"), - Occur.MUST); - IndexSearcher investigationUserSearcher = lucene.getSearcher(searcherMap, "InvestigationUser"); - Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", - userFullNameQuery.build(), - investigationUserSearcher, ScoreMode.None); - luceneQuery.add(toQuery, Occur.MUST); - } - query = maybeEmptyQuery(luceneQuery); - return; + String text = jsonQuery.getString("text", null); + if (text != null) { + Builder textBuilder = new BooleanQuery.Builder(); + textBuilder.add(DocumentMapping.investigationParser.parse(text, null), Occur.SHOULD); + + IndexSearcher sampleSearcher = lucene.getSearcher(searcherMap, "Sample"); + Query joinedSampleQuery = JoinUtil.createJoinQuery("sample.investigation.id", false, "id", Long.class, + DocumentMapping.sampleParser.parse(text, null), sampleSearcher, ScoreMode.Avg); + 
textBuilder.add(joinedSampleQuery, Occur.SHOULD); + luceneQuery.add(textBuilder.build(), Occur.MUST); + } + + buildDateRanges(luceneQuery, jsonQuery, "lower", "upper", "startDate", "endDate"); + + if (jsonQuery.containsKey("parameters")) { + JsonArray parameters = jsonQuery.getJsonArray("parameters"); + IndexSearcher parameterSearcher = lucene.getSearcher(searcherMap, "InvestigationParameter"); + for (JsonValue p : parameters) { + BooleanQuery.Builder paramQuery = parseParameter(p); + Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", Long.class, + paramQuery.build(), + parameterSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); } - } catch (QueryNodeParseException e) { - String message = "Search term could not be parsed due to syntax errors"; - throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, message); } + + String userFullName = jsonQuery.getString("userFullName", null); + if (userFullName != null) { + BooleanQuery.Builder userFullNameQuery = new BooleanQuery.Builder(); + userFullNameQuery.add(DocumentMapping.genericParser.parse(userFullName, "user.fullName"), + Occur.MUST); + IndexSearcher investigationUserSearcher = lucene.getSearcher(searcherMap, "InvestigationUser"); + Query toQuery = JoinUtil.createJoinQuery("investigation.id", false, "id", Long.class, + userFullNameQuery.build(), + investigationUserSearcher, ScoreMode.None); + luceneQuery.add(toQuery, Occur.MUST); + } + query = maybeEmptyQuery(luceneQuery); } /** @@ -301,10 +317,10 @@ private void buildFilterQueries(String target, JsonObject requestedQuery, Builde IndexSearcher nestedSearcher = lucene.getSearcher(searcherMap, filterTarget); Query nestedQuery; if (filterTarget.equals("sample") && !target.equals("investigation")) { - nestedQuery = JoinUtil.createJoinQuery("sample.id", false, "sample.id", dimensionQuery, - nestedSearcher, ScoreMode.None); + nestedQuery = JoinUtil.createJoinQuery("sample.id", false, "sample.id", Long.class, + dimensionQuery, nestedSearcher, ScoreMode.None); } else { - nestedQuery = JoinUtil.createJoinQuery(target + ".id", false, "id", dimensionQuery, + nestedQuery = JoinUtil.createJoinQuery(target + ".id", false, "id", Long.class, dimensionQuery, nestedSearcher, ScoreMode.None); } queryBuilder.add(nestedQuery, Occur.FILTER); @@ -357,16 +373,16 @@ private Query parseFilter(String target, String fld, JsonValue value) throws IOE }); if (fld.contains("sample") && !target.equals("investigation")) { // Datasets and Datafiles join by sample.id on both fields - return JoinUtil.createJoinQuery("sample.id", false, "sample.id", nestedBoolBuilder.build(), - nestedSearcher, ScoreMode.None); + return JoinUtil.createJoinQuery("sample.id", false, "sample.id", Long.class, + nestedBoolBuilder.build(), nestedSearcher, ScoreMode.None); } else if (fld.equals("sampleparameter") && target.equals("investigation")) { - Query sampleQuery = JoinUtil.createJoinQuery("sample.id", false, "sample.id", + Query sampleQuery = JoinUtil.createJoinQuery("sample.id", false, "sample.id", Long.class, nestedBoolBuilder.build(), nestedSearcher, ScoreMode.None); - return JoinUtil.createJoinQuery("sample.investigation.id", false, "id", sampleQuery, + return JoinUtil.createJoinQuery("sample.investigation.id", false, "id", Long.class, sampleQuery, lucene.getSearcher(searcherMap, "sample"), ScoreMode.None); } else { - return JoinUtil.createJoinQuery(target + ".id", false, "id", nestedBoolBuilder.build(), - nestedSearcher, ScoreMode.None); + return JoinUtil.createJoinQuery(target + ".id", false, 
"id", Long.class, + nestedBoolBuilder.build(), nestedSearcher, ScoreMode.None); } } else { // Single range of values for a field @@ -498,11 +514,11 @@ private void buildNestedRangeQuery(String fld, JsonObject valueObject, BooleanQu private void buildUserNameQuery(String userName, BooleanQuery.Builder luceneQuery, String toField) throws IOException, LuceneException { TermQuery fromQuery = new TermQuery(new Term("user.name", userName)); - Query investigationUserQuery = JoinUtil.createJoinQuery("investigation.id", false, toField, fromQuery, - lucene.getSearcher(searcherMap, "InvestigationUser"), ScoreMode.None); - Query instrumentScientistQuery = JoinUtil.createJoinQuery("instrument.id", false, "instrument.id", fromQuery, - lucene.getSearcher(searcherMap, "InstrumentScientist"), ScoreMode.None); - Query investigationInstrumentQuery = JoinUtil.createJoinQuery("investigation.id", false, toField, + Query investigationUserQuery = JoinUtil.createJoinQuery("investigation.id", false, toField, Long.class, + fromQuery, lucene.getSearcher(searcherMap, "InvestigationUser"), ScoreMode.None); + Query instrumentScientistQuery = JoinUtil.createJoinQuery("instrument.id", false, "instrument.id", Long.class, + fromQuery, lucene.getSearcher(searcherMap, "InstrumentScientist"), ScoreMode.None); + Query investigationInstrumentQuery = JoinUtil.createJoinQuery("investigation.id", false, toField, Long.class, instrumentScientistQuery, lucene.getSearcher(searcherMap, "InvestigationInstrument"), ScoreMode.None); Builder userNameQueryBuilder = new BooleanQuery.Builder(); userNameQueryBuilder.add(investigationUserQuery, Occur.SHOULD).add(investigationInstrumentQuery, Occur.SHOULD); @@ -684,7 +700,8 @@ public void parseFields(JsonObject jsonObject) throws LuceneException { * @throws LuceneException If the types of the JsonValues in the query do not * match those supported by icat.lucene */ - private void parseGenericQuery(JsonObject jsonQuery, BooleanQuery.Builder luceneQuery) throws LuceneException { + private void parseGenericQuery(JsonObject jsonQuery) throws LuceneException { + BooleanQuery.Builder luceneQuery = new BooleanQuery.Builder(); for (Entry entry : jsonQuery.entrySet()) { String field = entry.getKey(); ValueType valueType = entry.getValue().getValueType(); @@ -707,19 +724,26 @@ private void parseGenericQuery(JsonObject jsonQuery, BooleanQuery.Builder lucene } break; case ARRAY: - // Only support array of String as list of ICAT ids is currently only use case + Query arrayQuery; JsonArray arrayValue = (JsonArray) entry.getValue(); - ArrayList bytesArray = new ArrayList<>(); - String valueAsString; - for (JsonValue value : arrayValue) { - if (value.getValueType().equals(ValueType.STRING)) { - valueAsString = ((JsonString) value).getString(); - } else { - valueAsString = value.toString(); - } - bytesArray.add(new BytesRef(valueAsString)); + ValueType arrayValueType = arrayValue.get(0).getValueType(); + switch (arrayValueType) { + case NUMBER: + ArrayList longList = new ArrayList<>(); + for (JsonValue value : arrayValue) { + longList.add(((JsonNumber) value).longValueExact()); + } + arrayQuery = LongPoint.newSetQuery(field, longList); + break; + default: + ArrayList bytesRefList = new ArrayList<>(); + for (JsonValue value : arrayValue) { + bytesRefList.add(new BytesRef(((JsonString) value).getString())); + } + arrayQuery = new TermInSetQuery(field, bytesRefList); + break; } - luceneQuery.add(new TermInSetQuery(field, bytesArray), Occur.MUST); + luceneQuery.add(arrayQuery, Occur.MUST); break; default: throw new 
LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, @@ -846,7 +870,7 @@ private void parseSearchAfter(String searchAfter) throws LuceneException { public void parseSort(String sortString) throws LuceneException { if (sortString == null || sortString.equals("") || sortString.equals("{}")) { scored = true; - sort = new Sort(SortField.FIELD_SCORE, new SortedNumericSortField("id.long", Type.LONG)); + sort = new Sort(SortField.FIELD_SCORE, new SortedNumericSortField("id", Type.LONG)); return; } try (JsonReader reader = Json.createReader(new ByteArrayInputStream(sortString.getBytes()))) { @@ -872,7 +896,7 @@ public void parseSort(String sortString) throws LuceneException { fields.add(new SortField(key, Type.STRING, reverse)); } } - fields.add(new SortedNumericSortField("id.long", Type.LONG)); + fields.add(new SortedNumericSortField("id", Type.LONG)); scored = false; sort = new Sort(fields.toArray(new SortField[0])); } From 3dc957a970765b97e6d167b1694c186d6089aad4 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Thu, 28 Sep 2023 10:00:48 +0000 Subject: [PATCH 68/73] Refactor facetable fields into run.properties #18 --- .../icatproject/lucene/DocumentMapping.java | 2 - .../java/org/icatproject/lucene/Field.java | 13 ++++-- .../java/org/icatproject/lucene/Lucene.java | 15 ++++--- .../org/icatproject/lucene/SearchBucket.java | 40 +++++++++++-------- src/main/resources/run.properties | 1 + src/site/xhtml/installation.xhtml.vm | 9 +++++ 6 files changed, 53 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/DocumentMapping.java b/src/main/java/org/icatproject/lucene/DocumentMapping.java index 4b7998a..42f0e87 100644 --- a/src/main/java/org/icatproject/lucene/DocumentMapping.java +++ b/src/main/java/org/icatproject/lucene/DocumentMapping.java @@ -36,7 +36,6 @@ public ParentRelationship(String parentName, String joiningField, String... fiel private static Analyzer analyzer = new IcatSynonymAnalyzer();; public static final Set doubleFields = new HashSet<>(); - public static final Set facetFields = new HashSet<>(); public static final Set longFields = new HashSet<>(); public static final Set sortFields = new HashSet<>(); public static final Set textFields = new HashSet<>(); @@ -55,7 +54,6 @@ public ParentRelationship(String parentName, String joiningField, String... 
fiel static { doubleFields.addAll(Arrays.asList("numericValue", "numericValueSI", "rangeTop", "rangeTopSI", "rangeBottom", "rangeBottomSI")); - facetFields.addAll(Arrays.asList("type.name", "datafileFormat.name", "stringValue", "technique.name")); longFields.addAll( Arrays.asList("date", "startDate", "endDate", "dateTimeValue", "investigation.startDate", "fileSize", "fileCount", "datafile.id", "datafileFormat.id", "dataset.id", "facility.id", diff --git a/src/main/java/org/icatproject/lucene/Field.java b/src/main/java/org/icatproject/lucene/Field.java index ea33aa3..ad24647 100644 --- a/src/main/java/org/icatproject/lucene/Field.java +++ b/src/main/java/org/icatproject/lucene/Field.java @@ -2,6 +2,8 @@ import jakarta.json.JsonObject; +import java.util.List; + import org.apache.lucene.document.Document; import org.apache.lucene.document.DoublePoint; import org.apache.lucene.document.LongPoint; @@ -49,7 +51,7 @@ public void addSortable(Document document) throws NumberFormatException { public void addToDocument(Document document) throws NumberFormatException { addSortable(document); - if (DocumentMapping.facetFields.contains(name)) { + if (facetable) { document.add(new SortedSetDocValuesFacetField(name + ".keyword", value)); document.add(new StringField(name + ".keyword", value, Store.NO)); } @@ -115,15 +117,18 @@ public void addToDocument(Document document) throws NumberFormatException { private String name; private InnerField innerField; + private boolean facetable; /** * Creates a wrapper for a Field. * * @param object JsonObject containing representations of multiple fields * @param key Key of a specific field in object + * @param facetFields List of String field names which should be stored as a facetable keyword */ - public Field(JsonObject object, String key) { + public Field(JsonObject object, String key, List facetFields) { name = key; + facetable = facetFields.contains(name); if (DocumentMapping.doubleFields.contains(name)) { innerField = new InnerDoubleField(object.getJsonNumber(name).doubleValue()); } else if (DocumentMapping.longFields.contains(name)) { @@ -137,9 +142,11 @@ public Field(JsonObject object, String key) { * Creates a wrapper for a Field. 
* * @param indexableField A Lucene IndexableField + * @param facetFields List of String fields which should be stored as a facetable keyword */ - public Field(IndexableField indexableField) { + public Field(IndexableField indexableField, List facetFields) { name = indexableField.name(); + facetable = facetFields.contains(name); if (DocumentMapping.doubleFields.contains(name)) { innerField = new InnerDoubleField(indexableField.numericValue().doubleValue()); } else if (DocumentMapping.longFields.contains(name)) { diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index a173aa4..d0c97ca 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -381,6 +381,7 @@ public void releaseSearchers(List subSearchers) throws IOExceptio private Map indexBuckets = new ConcurrentHashMap<>(); private Timer timer; + public List facetFields = new ArrayList<>(); public IcatUnits icatUnits; /** @@ -979,6 +980,10 @@ private void init() { icatUnits = new IcatUnits(props.getString("units", "")); + String facetFieldsString = props.getString("facetFields", ""); + for (String facetField : facetFieldsString.split("\\s+")) { + facetFields.add(facetField); + } } catch (Exception e) { logger.error(fatal, e.getMessage()); throw new IllegalStateException(e.getMessage()); @@ -1528,7 +1533,7 @@ private void encodeSearchAfterField(JsonGenerator gen, SortField sortField, Scor private Document parseDocument(JsonObject json) { Document document = new Document(); for (String key : json.keySet()) { - Field field = new Field(json, key); + Field field = new Field(json, key, facetFields); field.addToDocument(document); convertUnits(json, document, key); } @@ -1607,13 +1612,13 @@ private Document updateDocumentFields(JsonObject json, Document oldDocument) { for (IndexableField field : oldDocument.getFields()) { String fieldName = field.name(); if (json.containsKey(fieldName)) { - Field jsonField = new Field(json, fieldName); + Field jsonField = new Field(json, fieldName, facetFields); jsonField.addToDocument(newDocument); hasNewUnits = hasNewUnits || convertUnits(json, newDocument, fieldName); } else if (fieldName.endsWith("SI")) { - fieldsSI.add(new Field(field)); + fieldsSI.add(new Field(field, facetFields)); } else { - Field oldField = new Field(field); + Field oldField = new Field(field, facetFields); oldField.addToDocument(newDocument); } } @@ -1638,7 +1643,7 @@ private Document pruneDocument(Set fields, Document oldDocument) { Document newDocument = new Document(); for (IndexableField field : oldDocument.getFields()) { if (!fields.contains(field.name())) { - Field fieldToAdd = new Field(field); + Field fieldToAdd = new Field(field, facetFields); fieldToAdd.addToDocument(newDocument); } } diff --git a/src/main/java/org/icatproject/lucene/SearchBucket.java b/src/main/java/org/icatproject/lucene/SearchBucket.java index 18ad0c1..aca88fa 100644 --- a/src/main/java/org/icatproject/lucene/SearchBucket.java +++ b/src/main/java/org/icatproject/lucene/SearchBucket.java @@ -37,6 +37,7 @@ import org.apache.lucene.search.FieldDoc; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; +import org.apache.lucene.search.MatchNoDocsQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; @@ -708,7 +709,7 @@ private void parseGenericQuery(JsonObject jsonQuery) throws LuceneException { switch (valueType) { case 
STRING: JsonString stringValue = (JsonString) entry.getValue(); - String fld = DocumentMapping.facetFields.contains(field) ? field + ".keyword" : field; + String fld = lucene.facetFields.contains(field) ? field + ".keyword" : field; luceneQuery.add(new TermQuery(new Term(fld, stringValue.getString())), Occur.MUST); break; case NUMBER: @@ -724,26 +725,31 @@ } break; case ARRAY: - Query arrayQuery; + ArrayList<Long> longList = new ArrayList<>(); + ArrayList<BytesRef> bytesRefList = new ArrayList<>(); JsonArray arrayValue = (JsonArray) entry.getValue(); - ValueType arrayValueType = arrayValue.get(0).getValueType(); - switch (arrayValueType) { - case NUMBER: - ArrayList<Long> longList = new ArrayList<>(); - for (JsonValue value : arrayValue) { + for (JsonValue value : arrayValue) { + ValueType arrayValueType = value.getValueType(); + switch (arrayValueType) { + case NUMBER: longList.add(((JsonNumber) value).longValueExact()); - } - arrayQuery = LongPoint.newSetQuery(field, longList); - break; - default: - ArrayList<BytesRef> bytesRefList = new ArrayList<>(); - for (JsonValue value : arrayValue) { + break; + default: bytesRefList.add(new BytesRef(((JsonString) value).getString())); - } - arrayQuery = new TermInSetQuery(field, bytesRefList); - break; + break; + } + } + + if (longList.size() == 0 && bytesRefList.size() == 0) { + query = new MatchNoDocsQuery("Tried filtering " + field + " with an empty array"); + return; + } + if (longList.size() != 0) { + luceneQuery.add(LongPoint.newSetQuery(field, longList), Occur.MUST); + } + if (bytesRefList.size() != 0) { + luceneQuery.add(new TermInSetQuery(field, bytesRefList), Occur.MUST); } - luceneQuery.add(arrayQuery, Occur.MUST); break; default: throw new LuceneException(HttpURLConnection.HTTP_BAD_REQUEST, diff --git a/src/main/resources/run.properties b/src/main/resources/run.properties index c86b66d..5031cfb 100644 --- a/src/main/resources/run.properties +++ b/src/main/resources/run.properties @@ -6,4 +6,5 @@ commitSeconds = 5 maxShardSize = 2147483648 ip = 127.0.0.1/32 units = J: eV 1.602176634e-19; \u2103: celsius, degC; K: kelvin +facetFields = datafileFormat.name instrument.name sample.type.name stringValue technique.name type.name aggregateFiles = false diff --git a/src/site/xhtml/installation.xhtml.vm b/src/site/xhtml/installation.xhtml.vm index 7158410..8d4f801 100644 --- a/src/site/xhtml/installation.xhtml.vm +++ b/src/site/xhtml/installation.xhtml.vm @@ -77,6 +77,15 @@ should be followed by this factor (e.g. "J: eV 1.602176634e-19"). Different units can be separated by a semi-colon. +
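To make the units syntax concrete: each alternative unit name is followed by the factor that converts it into the SI unit named before the colon, so "J: eV 1.602176634e-19" declares that one electronvolt is 1.602176634e-19 joules. A parameter recorded as 5 eV would therefore be indexed with a numericValueSI of roughly 8.011e-19 and a type.unitsSI of "J". A toy illustration of the multiplication, not the project's actual parser:

    public class UnitConversionExample {
        // Factor taken from the run.properties entry "J: eV 1.602176634e-19"
        private static final double EV_TO_JOULE = 1.602176634e-19;

        public static void main(String[] args) {
            double electronVolts = 5.0;
            double joules = electronVolts * EV_TO_JOULE; // 8.01088317e-19
            System.out.printf("%.1f eV = %.3e J%n", electronVolts, joules);
        }
    }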
    facetFields
    +
    The names of fields which should be stored as facetable. The names should + correspond to how the field appears in the Lucene index, which may be + different to how it is represented in the ICAT database due to the flattening of + one-to-one relationships between entities. Accurate field names can be taken + from the `getDoc` function(s) in icat.server. Note that in order to be available + at search time, the field must have been specified when indexing the + documents.
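As the Field.java hunk earlier in this patch shows, a field named in facetFields is written twice under a ".keyword" suffix: once as a SortedSetDocValuesFacetField that drives facet counting, and once as an un-stored StringField that supports exact filtering on the facet label. A condensed sketch of that pattern (the helper and values are invented):

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field.Store;
    import org.apache.lucene.document.StringField;
    import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField;

    final class FacetableFields {
        private FacetableFields() {
        }

        // Store one facetable string under the ".keyword" convention used here.
        static void addFacetable(Document doc, String name, String value) {
            doc.add(new SortedSetDocValuesFacetField(name + ".keyword", value)); // facet counting
            doc.add(new StringField(name + ".keyword", value, Store.NO));        // exact term filtering
        }
    }

Documents holding a SortedSetDocValuesFacetField must be passed through FacetsConfig.build before being handed to the writer, which is why every add and update in Lucene.java goes via facetsConfig.build(...).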
    +
    aggregateFiles
    Aggregate file sizes/counts for Datasets and Investigations as Datafiles are added or modified (i.e. in real time). This can have a significant From c9f21549a39b46bf76c01b86aa364425b2520fc1 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Thu, 5 Oct 2023 14:42:53 +0000 Subject: [PATCH 69/73] Add short explanations of new properties #18 --- src/main/config/run.properties.example | 6 ++++++ src/main/resources/run.properties | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/src/main/config/run.properties.example b/src/main/config/run.properties.example index dbe555b..76b1475 100644 --- a/src/main/config/run.properties.example +++ b/src/main/config/run.properties.example @@ -5,4 +5,10 @@ directory = ${HOME}/data/search commitSeconds = 5 maxShardSize = 2147483648 ip = 127.0.0.1/32 +# List of units to enable conversion to SI units when querying on numerical parameters +!units = J: eV 1.602176634e-19; \u2103: celsius, degC; K: kelvin +# List of fields that should be stored for facet filtering when searching +# In order to be available, these fields must be set when indexing the data +facetFields = datafileFormat.name instrument.name sample.type.name stringValue technique.name type.name +# Aggregate file sizes and counts in real time (this will have a performance impact on write operations) aggregateFiles = false diff --git a/src/main/resources/run.properties b/src/main/resources/run.properties index 5031cfb..085272d 100644 --- a/src/main/resources/run.properties +++ b/src/main/resources/run.properties @@ -5,6 +5,10 @@ directory = ${HOME}/data/search commitSeconds = 5 maxShardSize = 2147483648 ip = 127.0.0.1/32 +# List of units to enable conversion to SI units when querying on numerical parameters units = J: eV 1.602176634e-19; \u2103: celsius, degC; K: kelvin +# List of fields that should be stored for facet filtering when searching +# In order to be available, these fields must be set when indexing the data facetFields = datafileFormat.name instrument.name sample.type.name stringValue technique.name type.name +# Aggregate file sizes and counts in real time (this will have a performance impact on write operations) aggregateFiles = false From b6d3e602163e0b28fe66d2807b187b157f2d0e06 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Fri, 6 Oct 2023 14:27:30 +0000 Subject: [PATCH 70/73] Add special handling for InvestigationInstrument filters #18 --- src/main/java/org/icatproject/lucene/SearchBucket.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/java/org/icatproject/lucene/SearchBucket.java b/src/main/java/org/icatproject/lucene/SearchBucket.java index aca88fa..31d12a3 100644 --- a/src/main/java/org/icatproject/lucene/SearchBucket.java +++ b/src/main/java/org/icatproject/lucene/SearchBucket.java @@ -320,6 +320,9 @@ private void buildFilterQueries(String target, JsonObject requestedQuery, Builde if (filterTarget.equals("sample") && !target.equals("investigation")) { nestedQuery = JoinUtil.createJoinQuery("sample.id", false, "sample.id", Long.class, dimensionQuery, nestedSearcher, ScoreMode.None); + } else if (filterTarget.toLowerCase().equals("investigationinstrument") && !target.equals("investigation")) { + nestedQuery = JoinUtil.createJoinQuery("investigation.id", false, "investigation.id", Long.class, dimensionQuery, + nestedSearcher, ScoreMode.None); } else { nestedQuery = JoinUtil.createJoinQuery(target + ".id", false, "id", Long.class, dimensionQuery, nestedSearcher, ScoreMode.None); From 61301a251324f3925bec37954624932712ba87da Mon Sep 17 00:00:00 2001 From: 
Patrick Austin Date: Tue, 10 Oct 2023 09:43:17 +0000 Subject: [PATCH 71/73] Fix for Investigation Sample filtering #18 --- src/main/java/org/icatproject/lucene/SearchBucket.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/SearchBucket.java b/src/main/java/org/icatproject/lucene/SearchBucket.java index 31d12a3..f9b5d24 100644 --- a/src/main/java/org/icatproject/lucene/SearchBucket.java +++ b/src/main/java/org/icatproject/lucene/SearchBucket.java @@ -317,8 +317,8 @@ private void buildFilterQueries(String target, JsonObject requestedQuery, Builde // just a nested entity) IndexSearcher nestedSearcher = lucene.getSearcher(searcherMap, filterTarget); Query nestedQuery; - if (filterTarget.equals("sample") && !target.equals("investigation")) { - nestedQuery = JoinUtil.createJoinQuery("sample.id", false, "sample.id", Long.class, + if (filterTarget.equals("sample") && target.equals("investigation")) { + nestedQuery = JoinUtil.createJoinQuery("sample.investigation.id", false, "id", Long.class, dimensionQuery, nestedSearcher, ScoreMode.None); } else if (filterTarget.toLowerCase().equals("investigationinstrument") && !target.equals("investigation")) { nestedQuery = JoinUtil.createJoinQuery("investigation.id", false, "investigation.id", Long.class, dimensionQuery, From e3f393e0d39e523fa1f3d30d655b3ad095468038 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Fri, 22 Mar 2024 11:48:04 +0000 Subject: [PATCH 72/73] Account for IcatUnits refactors --- .../java/org/icatproject/lucene/Lucene.java | 24 +++++++------- .../org/icatproject/lucene/SearchBucket.java | 31 ++++++++++--------- 2 files changed, 28 insertions(+), 27 deletions(-) diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index d0c97ca..dbd3247 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -88,7 +88,7 @@ import org.icatproject.lucene.exceptions.LuceneException; import org.icatproject.utils.CheckedProperties; import org.icatproject.utils.IcatUnits; -import org.icatproject.utils.IcatUnits.SystemValue; +import org.icatproject.utils.IcatUnits.Value; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.slf4j.Marker; @@ -1573,24 +1573,22 @@ private boolean convertUnits(JsonObject json, Document document, String key) { */ private void convertValue(Document document, JsonObject json, String unitString, String numericFieldName) { IndexableField field = document.getField(numericFieldName); - double value; + double numericalValue; if (field != null) { - value = NumericUtils.sortableLongToDouble(field.numericValue().longValue()); + numericalValue = NumericUtils.sortableLongToDouble(field.numericValue().longValue()); } else if (json.containsKey(numericFieldName)) { - value = json.getJsonNumber(numericFieldName).doubleValue(); + numericalValue = json.getJsonNumber(numericFieldName).doubleValue(); } else { // If we aren't dealing with the desired numeric field don't convert return; } - logger.trace("Attempting to convert {} {}", value, unitString); - SystemValue systemValue = icatUnits.new SystemValue(value, unitString); - if (systemValue.units != null) { - document.add(new StringField("type.unitsSI", systemValue.units, Store.YES)); - } - if (systemValue.value != null) { - document.add(new DoublePoint(numericFieldName + "SI", systemValue.value)); - document.add(new StoredField(numericFieldName + "SI", systemValue.value)); - long sortableLong = 
NumericUtils.doubleToSortableLong(systemValue.value); + logger.trace("Attempting to convert {} {}", numericalValue, unitString); + Value value = icatUnits.convertValueToSiUnits(numericalValue, unitString); + if (value != null) { + document.add(new StringField("type.unitsSI", value.units, Store.YES)); + document.add(new DoublePoint(numericFieldName + "SI", value.numericalValue)); + document.add(new StoredField(numericFieldName + "SI", value.numericalValue)); + long sortableLong = NumericUtils.doubleToSortableLong(value.numericalValue); document.add(new NumericDocValuesField(numericFieldName + "SI", sortableLong)); } } diff --git a/src/main/java/org/icatproject/lucene/SearchBucket.java b/src/main/java/org/icatproject/lucene/SearchBucket.java index f9b5d24..2c51f76 100644 --- a/src/main/java/org/icatproject/lucene/SearchBucket.java +++ b/src/main/java/org/icatproject/lucene/SearchBucket.java @@ -52,7 +52,7 @@ import org.apache.lucene.search.join.ScoreMode; import org.apache.lucene.util.BytesRef; import org.icatproject.lucene.exceptions.LuceneException; -import org.icatproject.utils.IcatUnits.SystemValue; +import org.icatproject.utils.IcatUnits.Value; /** * Bucket for information relating to a single search. @@ -424,17 +424,18 @@ private void buildNestedExactQuery(String fld, JsonObject valueObject, BooleanQu double exact = valueObject.getJsonNumber("exact").doubleValue(); String units = valueObject.getString("units", null); if (units != null) { - SystemValue exactValue = lucene.icatUnits.new SystemValue(exact, units); - if (exactValue.value != null) { + Value exactValue = lucene.icatUnits.convertValueToSiUnits(exact, units); + if (exactValue != null) { // If we were able to parse the units, apply query to the SI value - rangeBuilder.add( - DoublePoint.newRangeQuery("rangeTopSI", exactValue.value, Double.POSITIVE_INFINITY), - Occur.FILTER); - rangeBuilder.add( - DoublePoint.newRangeQuery("rangeBottomSI", Double.NEGATIVE_INFINITY, exactValue.value), - Occur.FILTER); + Query topQuery = DoublePoint.newRangeQuery("rangeTopSI", exactValue.numericalValue, + Double.POSITIVE_INFINITY); + Query bottomQuery = DoublePoint.newRangeQuery("rangeBottomSI", Double.NEGATIVE_INFINITY, + exactValue.numericalValue); + Query exactQuery = DoublePoint.newExactQuery(fld + "SI", exactValue.numericalValue); + rangeBuilder.add(topQuery, Occur.FILTER); + rangeBuilder.add(bottomQuery, Occur.FILTER); exactOrRangeBuilder.add(rangeBuilder.build(), Occur.SHOULD); - exactOrRangeBuilder.add(DoublePoint.newExactQuery(fld + "SI", exactValue.value), Occur.SHOULD); + exactOrRangeBuilder.add(exactQuery, Occur.SHOULD); builder.add(exactOrRangeBuilder.build(), Occur.FILTER); } else { // If units could not be parsed, make them part of the query on the raw data @@ -487,11 +488,13 @@ private void buildNestedRangeQuery(String fld, JsonObject valueObject, BooleanQu double to = valueObject.getJsonNumber("to").doubleValue(); String units = valueObject.getString("units", null); if (units != null) { - SystemValue fromValue = lucene.icatUnits.new SystemValue(from, units); - SystemValue toValue = lucene.icatUnits.new SystemValue(to, units); - if (fromValue.value != null && toValue.value != null) { + Value fromValue = lucene.icatUnits.convertValueToSiUnits(from, units); + Value toValue = lucene.icatUnits.convertValueToSiUnits(to, units); + if (fromValue != null && toValue != null) { // If we were able to parse the units, apply query to the SI value - builder.add(DoublePoint.newRangeQuery(fld + "SI", fromValue.value, toValue.value), Occur.FILTER); 
+ Query rangeQuery = DoublePoint.newRangeQuery(fld + "SI", fromValue.numericalValue, + toValue.numericalValue); + builder.add(rangeQuery, Occur.FILTER); } else { // If units could not be parsed, make them part of the query on the raw data builder.add(DoublePoint.newRangeQuery(fld, from, to), Occur.FILTER); From bcbe497813cd0eee886fe4964df18ccbe2c7afb8 Mon Sep 17 00:00:00 2001 From: Patrick Austin Date: Mon, 8 Apr 2024 11:20:36 +0000 Subject: [PATCH 73/73] Add new properties to init logging --- src/main/config/run.properties.example | 2 ++ src/main/java/org/icatproject/lucene/Lucene.java | 14 +++++++++----- src/main/resources/run.properties | 2 ++ 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/main/config/run.properties.example b/src/main/config/run.properties.example index 76b1475..7702881 100644 --- a/src/main/config/run.properties.example +++ b/src/main/config/run.properties.example @@ -5,6 +5,8 @@ directory = ${HOME}/data/search commitSeconds = 5 maxShardSize = 2147483648 ip = 127.0.0.1/32 +# A search taking longer than this will be cancelled to avoid blocking other users' searches +maxSearchTimeSeconds = 5 # List of units to enable conversion to SI units when querying on numerical parameters !units = J: eV 1.602176634e-19; \u2103: celsius, degC; K: kelvin # List of fields that should be stored for facet filtering when searching diff --git a/src/main/java/org/icatproject/lucene/Lucene.java b/src/main/java/org/icatproject/lucene/Lucene.java index dbd3247..31efaea 100755 --- a/src/main/java/org/icatproject/lucene/Lucene.java +++ b/src/main/java/org/icatproject/lucene/Lucene.java @@ -962,6 +962,8 @@ private List getShards(String name) { private void init() { logger.info("Initialising icat.lucene"); CheckedProperties props = new CheckedProperties(); + String unitsString; + int commitSeconds; try { props.loadFromResource("run.properties"); @@ -970,7 +972,8 @@ private void init() { throw new Exception(luceneDirectory + " is not a directory"); } - luceneCommitMillis = props.getPositiveInt("commitSeconds") * 1000; + commitSeconds = props.getPositiveInt("commitSeconds"); + luceneCommitMillis = commitSeconds * 1000; luceneMaxShardSize = Math.max(props.getPositiveLong("maxShardSize"), Long.valueOf(Integer.MAX_VALUE + 1)); maxSearchTimeSeconds = props.has("maxSearchTimeSeconds") ? 
props.getPositiveLong("maxSearchTimeSeconds") : 5; @@ -978,7 +981,8 @@ private void init() { initTimer(); - icatUnits = new IcatUnits(props.getString("units", "")); + unitsString = props.getString("units", ""); + icatUnits = new IcatUnits(unitsString); String facetFieldsString = props.getString("facetFields", ""); for (String facetField : facetFieldsString.split("\\s+")) { @@ -990,9 +994,9 @@ private void init() { } String format = "Initialised icat.lucene with directory {}, commitSeconds {}, maxShardSize {}, " - + "maxSearchTimeSeconds {}, aggregateFiles {}"; - logger.info(format, luceneDirectory, luceneCommitMillis, luceneMaxShardSize, maxSearchTimeSeconds, - aggregateFiles); + + "maxSearchTimeSeconds {}, aggregateFiles {}, units {}, facetFields {}"; + logger.info(format, luceneDirectory, commitSeconds, luceneMaxShardSize, maxSearchTimeSeconds, + aggregateFiles, unitsString, facetFields); } /** diff --git a/src/main/resources/run.properties b/src/main/resources/run.properties index 085272d..0e3c2ed 100644 --- a/src/main/resources/run.properties +++ b/src/main/resources/run.properties @@ -5,6 +5,8 @@ directory = ${HOME}/data/search commitSeconds = 5 maxShardSize = 2147483648 ip = 127.0.0.1/32 +# A search taking longer than this will be cancelled to avoid blocking other users' searches +maxSearchTimeSeconds = 5 # List of units to enable conversion to SI units when querying on numerical parameters units = J: eV 1.602176634e-19; \u2103: celsius, degC; K: kelvin # List of fields that should be stored for facet filtering when searching