6.1.0 release notes and icatadmin update

icatproject · Sep 8, 2023 · 2589b52 · 2589b52
1 parent 4fb3ac0
commit 2589b52
Show file tree

Hide file tree

Showing 5 changed files with 145 additions and 67 deletions.
diff --git a/src/main/config/run.properties.example b/src/main/config/run.properties.example
@@ -47,16 +47,16 @@ log.list = SESSION WRITE READ INFO
 search.engine = LUCENE
 search.urls = https://localhost:8181
 search.populateBlockSize = 10000
-# Recommend setting lucene.searchBlockSize equal to maxIdsInQuery, so that all Lucene results can be authorised at once
-# If lucene.searchBlockSize > maxIdsInQuery, then multiple auth checks may be needed for a single search to Lucene
+# Recommend setting search.searchBlockSize equal to maxIdsInQuery, so that all results can be authorised at once
+# If search.searchBlockSize > maxIdsInQuery, then multiple auth checks may be needed for a single search
 # The optimal value depends on how likely a user's auth request fails: larger values are more efficient when rejection is more likely
 search.searchBlockSize = 1000
 search.directory = ${HOME}/data/icat/search
 search.backlogHandlerIntervalSeconds = 60
 search.enqueuedRequestIntervalSeconds = 5
 search.aggregateFilesIntervalSeconds = 3600
 search.maxSearchTimeSeconds = 5
-# The entities to index with the search engine. For example, remove 'Datafile' and 'DatafileParameter' if the number of datafiles exceeds lucene's limit of 2^32 entries in an index
+# The entities to index with the search engine.
 !search.entitiesToIndex = Datafile Dataset Investigation InvestigationUser DatafileParameter DatasetParameter InvestigationParameter Sample
 
 # List members of cluster

diff --git a/src/main/resources/run.properties b/src/main/resources/run.properties
@@ -20,8 +20,8 @@ log.list = SESSION WRITE READ INFO
 search.engine = lucene
 search.urls = https://localhost.localdomain:8181
 search.populateBlockSize = 10000
-# Recommend setting lucene.searchBlockSize equal to maxIdsInQuery, so that all Lucene results can be authorised at once
-# If lucene.searchBlockSize > maxIdsInQuery, then multiple auth checks may be needed for a single search to Lucene
+# Recommend setting search.searchBlockSize equal to maxIdsInQuery, so that all results can be authorised at once
+# If search.searchBlockSize > maxIdsInQuery, then multiple auth checks may be needed for a single search
 # The optimal value depends on how likely a user's auth request fails: larger values are more efficient when rejection is more likely
 search.searchBlockSize = 1000
 search.directory = ${HOME}/data/search

diff --git a/src/main/scripts/icatadmin b/src/main/scripts/icatadmin
@@ -159,37 +159,68 @@ def getPopulating(args):
 def populate(args):
     parser.set_usage(usagebase + "populate [<name>]")
     parser.set_description("Populate lucene (for that entry name)")
+    parser.add_option(
+        "-e",
+        "--entity-name",
+        action="append",
+        dest="entityName",
+        help="Name of entity to populate.",
+    )
+    parser.add_option(
+        "--min-id",
+        dest="minId",
+        help="Minimum (exclusive) ICAT entity id to populate",
+        type="int",
+    )
+    parser.add_option(
+        "--max-id",
+        dest="maxId",
+        help="Maximum (inclusive) ICAT entity id to populate",
+        type="int",
+    )
+    parser.add_option(
+        "-d",
+        "--delete",
+        dest="delete",
+        action="store_true",
+        help="Whether to delete all existing documents for this index",
+    )
     options, args = parser.parse_args(args)
-
-    if len(args) == 0: 
-        try:
-            sessionId = getService() 
-            parameters = {"sessionId": sessionId}
-            for entity in "Datafile", "DatafileParameter", "Dataset", "DatasetParameter", "Investigation", "InvestigationParameter", "InvestigationUser", "Sample":
-                print(entity)
-                _process("lucene/db/" + entity + "/-1", parameters, "POST")
-        except Exception as e:
-            fatal(e)
-        return    
-
-    if len(args) == 1:
-        try:
-            sessionId = getService()
-            parameters = {"sessionId": sessionId}
-            entity = args[0]
-            _process("lucene/db/" + entity + "/-1", parameters, "POST")
-        except Exception as e:
-            fatal(e)
-        return
-
-    if len(args) > 2:
-        fatal("Must have zero arguments after the operation 'populate' or one - the name of the entity or two with the name of the entity and minid")
-
+    entities = options.entityName or []
+    entities += args
+    if not entities:
+        # This does not need to include "nested" entities such as ParameterType, as this
+        # will be included in the READ operation on the DB implicitly
+        entities = [
+            "Datafile",
+            "Dataset",
+            "Investigation",
+            "DatafileParameter",
+			"DatasetParameter",
+            "DatasetTechnique",
+            "InstrumentScientist",
+            "InvestigationFacilityCycle",
+			"InvestigationInstrument",
+            "InvestigationParameter",
+            "InvestigationUser",
+            "Sample",
+            "SampleParameter",
+        ]
+
     try:
         sessionId = getService()
         parameters = {"sessionId": sessionId}
-        entity = args[0]
-        _process("lucene/db/" + entity + "/" + args[1], parameters, "POST")
+        if options.minId:
+            parameters["minId"] = options.minId
+        if options.maxId:
+            parameters["maxId"] = options.maxId
+        if options.delete:
+            parameters["delete"] = True
+        else:
+            parameters["delete"] = False
+
+        for entity in entities:
+            _process("lucene/db/" + entity, parameters, "POST")
     except Exception as e:
         fatal(e)
 

diff --git a/src/site/xhtml/installation.xhtml.vm b/src/site/xhtml/installation.xhtml.vm
@@ -27,7 +27,7 @@
 				installation instructions</a> installed on the server
 		</li>
 		<li>Deployed ICAT authenticators.</li>
-		<li>A deployed icat.lucene server it you plan to use free-text search.</li>
+		<li>A deployed icat.lucene server of at least version 3.0.0, or an Open/Elasticsearch cluster, if you plan to use free-text search.</li>
 		<li>Python 3.6+ and the suds-community package installed on the server.</li>
 	</ul>
 
@@ -86,11 +86,12 @@
 	</p>
 
 	<h2>Schema upgrade</h2>
-	<h3>Lucene database</h3>
+	<h3>Lucene indices</h3>
 	<p>
-		Any existing lucene database should be removed. The location of
+		Any existing lucene indices should be removed. The location of
 		this would have been specified in the previous icat.properties file.
-		Ensure that the directory specified there is empty.
+		Ensure that the directory specified there is empty. Indices generated by
+		icat.lucene versions before 3 are no longer compatible.
 	</p>
 	<h3>Database schema</h3>
 	<p>
@@ -262,29 +263,61 @@
 			log via JMS calls. The types are specified by a space separated list
 			of values taken from READ, WRITE, SESSION, INFO.</dd>
 
-		<dt>lucene.url</dt>
-		<dd>This is optional. It is the machine url of the icat.lucene
-			server if needed. It is needed for TopCAT to work.</dd>
-
-		<dt>lucene.populateBlockSize</dt>
-		<dd>This is ignored if lucene.url is not set. The number of
-			entries to batch off to the lucene server when using lucenePopulate.</dd>
-
-		<dt>lucene.directory</dt>
-		<dd>This is ignored if lucene.url is not set. Path of a directory
-			holding files for requests that are queued to go the icat.lucene
-			server.</dd>
-
-		<dt>lucene.backlogHandlerIntervalSeconds</dt>
-		<dd>This is ignored if lucene.url is not set. How often to check
-			the backlog file.</dd>
-
-		<dt>lucene.enqueuedRequestIntervalSecond</dt>
-		<dd>This is ignored if lucene.url is not set. How often to
+		<dt>search.engine</dt>
+		<dd>This is optional. Specifies the engine used for free-text searches.
+			Value should be one of LUCENE, OPENSEARCH and ELASTICSEARCH.</dd>
+
+		<dt>search.urls</dt>
+		<dd>This is optional. It is the machine url of the search engine
+			server if needed.</dd>
+
+		<dt>search.populateBlockSize</dt>
+		<dd>This is ignored if search.engine and search.urls are not set. The number of
+			entries to batch off to the search engine server when populating the index.</dd>
+
+		<dt>search.searchBlockSize</dt>
+		<dd>This is ignored if search.engine and search.urls are not set. Recommend
+			setting search.searchBlockSize equal to maxIdsInQuery, so that all results
+			can be authorised at once. If search.searchBlockSize > maxIdsInQuery, then
+			multiple auth checks may be needed for a single search. The optimal value
+			depends on how likely a user's auth request fails: larger values are more
+			efficient when rejection is more likely.</dd>
+
+		<dt>search.directory</dt>
+		<dd>This is ignored if search.engine and search.urls are not set. Path of a
+			directory holding files for requests that are queued to go to the search engine.
+			</dd>
+
+		<dt>search.backlogHandlerIntervalSeconds</dt>
+		<dd>This is ignored if search.engine and search.urls are not set. How often to
+			check the backlog file.</dd>
+
+		<dt>search.enqueuedRequestIntervalSeconds</dt>
+		<dd>This is ignored if search.engine and search.urls are not set. How often to
 			transmit lucene requests to the icat.lucene server.</dd>
 
-		<dt>lucene.entitiesToIndex = Datafile Dataset Investigation InvestigationUser DatafileParameter DatasetParameter InvestigationParameter Sample</dt>
-		<dd>The entities to index with Lucene. For example, remove 'Datafile' and 'DatafileParameter' if the number of datafiles exceeds lucene's limit of 2^32 entries in an index</dd>
+		<dt>search.aggregateFilesIntervalSeconds</dt>
+		<dd>This is ignored if search.engine and search.urls are not set. How often to
+			update file size and counts for Datasets and Investigations containing
+			recently modified Datafiles. If 0, then rather than being performed on a
+			timer, the parent documents will be updated in real time. Note that this can
+			have a significant performance impact.</dd>
+
+		<dt>search.maxSearchTimeSeconds</dt>
+		<dd>This is ignored if search.engine and search.urls are not set. How long to
+			wait before cancelling a long-running search. This can prevent badly formed
+			queries from blocking other searches from completing.</dd>
+
+		<dt>search.entitiesToIndex = Datafile Dataset Investigation InvestigationUser DatafileParameter DatasetParameter InvestigationParameter Sample</dt>
+		<dd>The entities to index with the search engine.</dd>
+
+		<dt>search.units</dt>
+		<dd>This is optional. Recognised unit names/symbols. Each symbol recognised by
+			indriya's SimpleUnitFormat should be followed by a colon, and then a comma
+			separated list of units measuring the same property. If the unit is simply
+			an alias (e.g. "K: kelvin") this is sufficient. If a conversion is required,
+			it should be followed by this factor (e.g. "J: eV 1.602176634e-19").
+			Different units can be separated by a semi-colon.</dd>
 
 		<dt>jms.topicConnectionFactory</dt>
 		<dd>This is optional and may be used to override the default
@@ -400,18 +433,20 @@
 
 	<dl>
 
-		<dt>populate [&lt;entity name&gt;]</dt>
+		<dt>populate [--min-id 0 --max-id 1 --delete] [&lt;entity names&gt;...]</dt>
 		<dd>re-populates lucene for the specified entity name. This is
 			useful if the database has been modified directly rather than by
-			using the ICAT API. This call is asynchronous and simply places the
-			request in a set of entity types to be populated. When the request is
-			processed all lucene entries of the specified entity type are first
-			cleared then the corresponding icat entries are scanned to
-			re-populate lucene. To find what it is doing please use the
-			"populating" operation described below. It may also be run without an
-			entity name in which case it will process all entities. The new
-			lucene index will not be seen until it is completely rebuilt. While
-			the index is being rebuilt ICAT can be used as normal as any lucene
+			using the ICAT API, or to backpopulate from the database after a breaking
+			change to the search engine. This call is asynchronous and simply places the
+			request in a set of entity types to be populated. By default runs over all
+			relevant entities, or names can be provided as arguments. Also has the
+			options "min-id" to specify a non-inclusive lower limit, and "max-id" for an
+			inclusive upper limit on the operation. If documents are found in this
+			range, then the operation will not proceed, unless "delete" is also
+			specified - in which case all existing documents are cleared first.
+			To find what it is doing please use the "populating" operation described
+			below. The new lucene index will not be seen until it is completely rebuilt.
+			While the index is being rebuilt ICAT can be used as normal as any lucene
 			updates are stored to be applied later.</dd>
 
 		<dt>populating</dt>

diff --git a/src/site/xhtml/release-notes.xhtml b/src/site/xhtml/release-notes.xhtml
@@ -6,6 +6,18 @@
 
 	<h1>ICAT Server Release Notes</h1>
 
+	<h2>6.1.0</h2>
+	<p>Adds support for Open/Elasticsearch engine backends for free-text searches. Extends the REST endpoints for free-text searches, and deprecates old functionality. Significant changes to the functionality and performance of searches:</p>
+	<ul>
+		<li>Ability to search on over 2 billion documents</li>
+		<li>Enable sorting on specific entity fields</li>
+		<li>"Infinitely" search the data by using the searchAfter parameter</li>
+		<li>Faceted searches</li>
+		<li>Replace single "text" field with specific fields that reflect the ICAT schema to allow field targeting</li>
+		<li>Support for unit conversion on numeric Parameters</li>
+		<li>Support for synonym injection</li>
+	</ul>
+
 	<h2>6.0.0</h2>
 	<p>Upgrade from JavaEE to JakartaEE 10. Requires Java 11+ and an application server that supports JakartaEE 10 such as Payara 6.</p>