Skip to content

Commit 514fab9

Browse files
authored
Merge pull request #14 from fathomnet/fix/taxonomy-lookup-bugs
Fix taxonomy lookup bugs
2 parents 833bd24 + 515fef2 commit 514fab9

7 files changed

Lines changed: 110 additions & 54 deletions

File tree

build.sbt

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import Dependencies._
33
Global / onChangedBuildSource := ReloadOnSourceChanges
44
Laika / sourceDirectories := Seq(baseDirectory.value / "docs")
55

6-
ThisBuild / scalaVersion := "3.8.3"
6+
ThisBuild / scalaVersion := "3.8.4"
77
ThisBuild / organization := "org.fathomnet"
88
ThisBuild / organizationName := "MBARI"
99
ThisBuild / startYear := Some(2021)
@@ -68,6 +68,8 @@ lazy val root = project
6868
),
6969
// Test suites share State.data (global mutable state); parallel execution causes races
7070
Test / parallelExecution := false,
71+
// Fork so the Vert.x event-loop threads keep the JVM alive when running via `sbt run`
72+
run / fork := true,
7173
scalacOptions ++= Seq(
7274
"-deprecation", // Emit warning and location for usages of deprecated APIs.
7375
"-encoding",

project/Dependencies.scala

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@ object Dependencies {
99

1010
lazy val jansi = "org.fusesource.jansi" % "jansi" % "2.4.3"
1111

12-
lazy val logback = "ch.qos.logback" % "logback-classic" % "1.5.32"
12+
lazy val logback = "ch.qos.logback" % "logback-classic" % "1.5.34"
1313
lazy val methanol = "com.github.mizosoft.methanol" % "methanol" % "1.9.0"
14-
lazy val munit = "org.scalameta" %% "munit" % "1.3.0"
14+
lazy val munit = "org.scalameta" %% "munit" % "1.3.3"
1515
lazy val picocli = "info.picocli" % "picocli" % "4.7.7"
1616

1717
lazy val slf4jJdk = "org.slf4j" % "slf4j-jdk-platform-logging" % "2.0.18"
@@ -24,7 +24,7 @@ object Dependencies {
2424
lazy val tapirNetty = "com.softwaremill.sttp.tapir" %% "tapir-netty-server" % tapirVersion
2525
lazy val tapirVertx = "com.softwaremill.sttp.tapir" %% "tapir-vertx-server" % tapirVersion
2626

27-
lazy val typesafeConfig = "com.typesafe" % "config" % "1.4.8"
27+
lazy val typesafeConfig = "com.typesafe" % "config" % "1.4.9"
2828
lazy val zio = "dev.zio" %% "zio" % "2.1.26"
2929

3030
}

project/build.properties

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
sbt.version=1.11.7
1+
sbt.version=1.12.11

src/main/scala/org/fathomnet/worms/Data.scala

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,22 @@ final case class Data(rootNode: WormsNode, wormsConcepts: Seq[WormsConcept]):
2424
private val log = System.getLogger(getClass.getName)
2525

2626
/**
27-
* Map of [nodeName, Node] for both the name and alternate name of the node.
27+
* Multimap of [name, nodes] for both the primary name and alternate names of each node.
28+
* A single name can map to multiple nodes when common/vernacular names are shared across
29+
* different taxa (e.g. "limpets" appears as a vernacular name for both Patellidae and
30+
* Acmaeidae). Storing all matching nodes prevents the last-writer-wins silent data loss
31+
* that occurred with the previous SortedMap[String, WormsNode].
2832
*/
29-
lazy val namesMap: SortedMap[String, WormsNode] =
30-
val map = SortedMap.newBuilder[String, WormsNode]
33+
lazy val namesMap: SortedMap[String, Seq[WormsNode]] =
34+
val map = scala.collection.mutable.Map.empty[String, Vector[WormsNode]]
35+
def insert(name: String, node: WormsNode): Unit =
36+
map.updateWith(name)(existing => Some(existing.getOrElse(Vector.empty) :+ node))
3137
def add(node: WormsNode): Unit =
32-
map += node.name -> node
33-
node.alternateNames.foreach(n => map += n -> node)
38+
insert(node.name, node)
39+
node.alternateNames.foreach(n => insert(n, node))
3440
node.children.foreach(add)
3541
add(rootNode)
36-
map.result()
42+
SortedMap.from(map)
3743

3844
/**
3945
* All names used in WoRMS.
@@ -89,7 +95,23 @@ final case class Data(rootNode: WormsNode, wormsConcepts: Seq[WormsConcept]):
8995
add(rootNode)
9096
map.result()
9197

92-
def findNodeByName(name: String): Option[WormsNode] = namesMap.get(name)
98+
/**
99+
* Return all nodes associated with a name (primary or alternate).
100+
* Returns an empty Seq when the name is not present in the tree.
101+
*/
102+
def findNodesByName(name: String): Seq[WormsNode] =
103+
namesMap.getOrElse(name, Vector.empty)
104+
105+
/**
106+
* Return the single best node for a name. When multiple nodes share the same name,
107+
* preference order is: (1) node whose primary name matches exactly, (2) any accepted
108+
* node, (3) first in insertion order.
109+
*/
110+
def findNodeByName(name: String): Option[WormsNode] =
111+
namesMap.get(name).flatMap: nodes =>
112+
nodes.find(_.name == name)
113+
.orElse(nodes.find(_.isAccepted))
114+
.orElse(nodes.headOption)
93115

94116
def findNodeByChildName(name: String): Option[WormsNode] = parents.get(name)
95117

src/main/scala/org/fathomnet/worms/StateController.scala

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -94,13 +94,18 @@ object StateController:
9494

9595
def descendantNames(name: String, acceptedOnly: Boolean = false): Either[ErrorMsg, List[String]] =
9696
def search(data: Data): List[String] =
97-
data.findNodeByName(name) match
98-
case None => Nil
99-
case Some(node) =>
100-
if (acceptedOnly && node.isAccepted)
101-
node.descendants.filter(_.isAccepted).map(_.name).sorted.toList
102-
else
103-
node.descendantNames.sorted.toList
97+
val nodes = data.findNodesByName(name)
98+
if nodes.isEmpty then Nil
99+
else
100+
nodes
101+
.flatMap: node =>
102+
if acceptedOnly then
103+
node.descendants.filter(_.isAccepted).map(_.name)
104+
else
105+
node.descendantNames
106+
.distinct
107+
.sorted
108+
.toList
104109
runSearch(search).fold(
105110
e => Left(e),
106111
v =>
@@ -190,7 +195,9 @@ object StateController:
190195
.rangeFrom(lower)
191196
.takeWhile(_._1.startsWith(lower))
192197
.values
193-
.flatMap(names => names.flatMap(data.findNodeByName))
198+
.flatMap(names => names.flatMap(data.findNodesByName))
199+
.toSeq
200+
.distinctBy(_.aphiaId)
194201
.map(_.simple)
195202
.toList
196203
case Some(parent) =>
@@ -230,7 +237,7 @@ object StateController:
230237
)
231238
names <- synonyms(name)
232239
acceptedName <- names.headOption.toRight(NotFound(s"Unable to find `$name`"))
233-
wormsConcept <- data.namesMap.get(acceptedName)
240+
wormsConcept <- data.findNodeByName(acceptedName)
234241
.flatMap(node => data.wormsConceptsMap.get(node.aphiaId))
235242
.toRight(NotFound(s"Unable to find `$name` accepted name of `$acceptedName`"))
236243
yield WormsDetails

src/main/scala/org/fathomnet/worms/io/model.scala

Lines changed: 16 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,35 +20,34 @@ import scala.util.{Success, Try, Using}
2020
def taxonIDToKey(taxonID: String): Long =
2121
taxonID.split(":").last.toLong
2222

23-
def readFile[A](file: String, rowMapper: String => Option[A]): List[A] = {
23+
def readFile[A](file: String, rowMapper: String => Option[A]): List[A] =
2424
Using(Source.fromFile(file)) { source =>
2525
source.getLines().flatMap(rowMapper).toList
2626
} match
2727
case Success(list) => list
2828
case _ => List.empty[A]
2929

30-
}
31-
3230
final case class Taxon(
3331
taxonID: String,
3432
parentNameUsageID: Option[String],
3533
scientificName: String,
3634
rank: String,
3735
acceptedNameUsageID: Option[String]
3836
):
39-
val id: Long = taxonIDToKey(taxonID)
40-
val parentId: Option[Long] = parentNameUsageID.map(taxonIDToKey)
37+
val id: Long = taxonIDToKey(taxonID)
38+
val parentId: Option[Long] = parentNameUsageID.map(taxonIDToKey)
4139
val acceptedId: Option[Long] = acceptedNameUsageID.map(taxonIDToKey)
4240

4341
object Taxon:
4442
def from(row: String): Option[Taxon] =
4543
Try {
46-
val cols = row.split("\t")
47-
val taxonID = cols(0)
48-
val acceptedNameUsageID = if cols(2).isBlank then None else Some(cols(2))
49-
val parentNameUsageID = if cols(3).isBlank then None else Some(cols(3))
50-
val scientificName = cols(5)
51-
val rank = cols(19)
44+
val cols = row.split("\t", -1).toList
45+
def get(i: Int): String = cols.applyOrElse(i, (_: Int) => "")
46+
val taxonID = get(0)
47+
val acceptedNameUsageID = if get(2).isBlank then None else Some(get(2))
48+
val parentNameUsageID = if get(3).isBlank then None else Some(get(3))
49+
val scientificName = get(5)
50+
val rank = get(19)
5251
Taxon(taxonID, parentNameUsageID, scientificName, rank, acceptedNameUsageID)
5352
}.toOption
5453

@@ -60,8 +59,9 @@ final case class VernacularName(taxonID: String, vernacularName: String):
6059
object VernacularName:
6160
def from(row: String): Option[VernacularName] =
6261
Try {
63-
val cols = row.split("\t")
64-
VernacularName(cols(0), cols(1))
62+
val cols = row.split("\t", -1).toList
63+
def get(i: Int): String = cols.applyOrElse(i, (_: Int) => "")
64+
VernacularName(get(0), get(1))
6565
}.toOption
6666

6767
def read(file: String): List[VernacularName] = readFile(file, VernacularName.from)
@@ -84,8 +84,9 @@ object SpeciesProfile:
8484

8585
def from(row: String): Option[SpeciesProfile] =
8686
Try {
87-
val cols = row.split("\t")
88-
SpeciesProfile(cols(0), toBool(cols(1)), toBool(cols(2)), toBool(cols(3)), toBool(cols(4)), toBool(cols(5)))
87+
val cols = row.split("\t", -1).toList
88+
def get(i: Int): String = cols.applyOrElse(i, (_: Int) => "")
89+
SpeciesProfile(get(0), toBool(get(1)), toBool(get(2)), toBool(get(3)), toBool(get(4)), toBool(get(5)))
8990
}.toOption
9091

9192
def read(file: String): List[SpeciesProfile] = readFile(file, SpeciesProfile.from)

src/test/scala/org/fathomnet/worms/StateControllerSuite.scala

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -13,28 +13,31 @@ class StateControllerSuite extends munit.FunSuite:
1313
// ---- test tree --------------------------------------------------------
1414
// Animalia (2, alternateNames=["animals"])
1515
// Mollusca (51)
16-
// Polyplacophora (55, alternateNames=["chitons"])
16+
// Polyplacophora (55, alternateNames=["chitons", "shells"]) ← "shells" shared
1717
// Arthropoda (100)
18-
// Decapoda (120, alternateNames=["decapods"])
18+
// Decapoda (120, alternateNames=["decapods", "shells"]) ← "shells" shared
1919
// OldArthropoda (200, acceptedAphiaId=100) ← synonym for Arthropoda
2020
// -----------------------------------------------------------------------
21+
// "shells" is intentionally shared between Polyplacophora and Decapoda to verify
22+
// that the namesMap multimap correctly preserves both nodes and that descendant
23+
// lookups by a shared alternate name return the union of their descendants.
2124

22-
private val polyplacophora = WormsNode("Polyplacophora", "Class", 55, 55, Seq("chitons"), Nil)
23-
private val mollusca = WormsNode("Mollusca", "Phylum", 51, 51, Nil, Seq(polyplacophora))
24-
private val decapoda = WormsNode("Decapoda", "Order", 120, 120, Seq("decapods"), Nil)
25-
private val arthropoda = WormsNode("Arthropoda", "Phylum", 100, 100, Nil, Seq(decapoda))
26-
private val oldArthropoda = WormsNode("OldArthropoda", "Phylum", 200, 100, Nil, Nil)
27-
private val root = WormsNode("Animalia", "Kingdom", 2, 2, Seq("animals"), Seq(mollusca, arthropoda, oldArthropoda))
25+
private val polyplacophora = WormsNode("Polyplacophora", "Class", 55, 55, Seq("chitons", "shells"), Nil)
26+
private val mollusca = WormsNode("Mollusca", "Phylum", 51, 51, Nil, Seq(polyplacophora))
27+
private val decapoda = WormsNode("Decapoda", "Order", 120, 120, Seq("decapods", "shells"), Nil)
28+
private val arthropoda = WormsNode("Arthropoda", "Phylum", 100, 100, Nil, Seq(decapoda))
29+
private val oldArthropoda = WormsNode("OldArthropoda", "Phylum", 200, 100, Nil, Nil)
30+
private val root = WormsNode("Animalia", "Kingdom", 2, 2, Seq("animals"), Seq(mollusca, arthropoda, oldArthropoda))
2831

2932
private def cn(name: String, primary: Boolean = true) = WormsConceptName(name, primary)
3033

3134
private val wormsConcepts = Seq(
32-
WormsConcept(2, 2, None, Seq(cn("Animalia"), cn("animals", false)), "Kingdom", isMarine = Some(true)),
33-
WormsConcept(51, 51, Some(2), Seq(cn("Mollusca")), "Phylum", isMarine = Some(true)),
34-
WormsConcept(55, 55, Some(51), Seq(cn("Polyplacophora"), cn("chitons", false)), "Class", isMarine = Some(true)),
35-
WormsConcept(100, 100, Some(2), Seq(cn("Arthropoda")), "Phylum"),
36-
WormsConcept(120, 120, Some(100), Seq(cn("Decapoda"), cn("decapods", false)), "Order"),
37-
WormsConcept(200, 100, Some(2), Seq(cn("OldArthropoda")), "Phylum"),
35+
WormsConcept(2, 2, None, Seq(cn("Animalia"), cn("animals", false)), "Kingdom", isMarine = Some(true)),
36+
WormsConcept(51, 51, Some(2), Seq(cn("Mollusca")), "Phylum", isMarine = Some(true)),
37+
WormsConcept(55, 55, Some(51), Seq(cn("Polyplacophora"), cn("chitons", false), cn("shells", false)), "Class", isMarine = Some(true)),
38+
WormsConcept(100, 100, Some(2), Seq(cn("Arthropoda")), "Phylum"),
39+
WormsConcept(120, 120, Some(100), Seq(cn("Decapoda"), cn("decapods", false), cn("shells", false)), "Order"),
40+
WormsConcept(200, 100, Some(2), Seq(cn("OldArthropoda")), "Phylum"),
3841
)
3942

4043
private val testData = Data(root, wormsConcepts)
@@ -55,12 +58,13 @@ class StateControllerSuite extends munit.FunSuite:
5558
// ---- findAllNames / countAllNames -------------------------------------
5659

5760
test("findAllNames returns paginated results with correct total"):
58-
// The tree has 9 distinct names: Animalia, animals, Arthropoda, chitons, Decapoda,
59-
// decapods, Mollusca, OldArthropoda, Polyplacophora
61+
// 10 distinct names: Animalia, animals, Arthropoda, chitons, Decapoda,
62+
// decapods, Mollusca, OldArthropoda, Polyplacophora, shells
63+
// ("shells" is shared between Polyplacophora and Decapoda but counts once)
6064
val page = StateController.findAllNames(3, 0).getOrElse(fail("expected Right"))
6165
assertEquals(page.limit, 3)
6266
assertEquals(page.offset, 0)
63-
assertEquals(page.total, 9)
67+
assertEquals(page.total, 10)
6468
assertEquals(page.items.size, 3)
6569

6670
test("findAllNames honours offset"):
@@ -69,7 +73,7 @@ class StateControllerSuite extends munit.FunSuite:
6973
assertEquals(page.items.size, 3)
7074

7175
test("countAllNames returns total number of distinct names"):
72-
assertEquals(StateController.countAllNames().getOrElse(fail("expected Right")), 9)
76+
assertEquals(StateController.countAllNames().getOrElse(fail("expected Right")), 10)
7377

7478
// ---- queryNames -------------------------------------------------------
7579

@@ -114,6 +118,26 @@ class StateControllerSuite extends munit.FunSuite:
114118
test("descendantNames returns NotFound for unknown name"):
115119
assert(StateController.descendantNames("Unknown").isLeft)
116120

121+
test("descendantNames via a shared alternate name returns union of descendants from all matching nodes"):
122+
// "shells" is an alternate name for both Polyplacophora (55) and Decapoda (120).
123+
// Before the multimap fix, whichever node was inserted last would silently overwrite
124+
// the other; the "lost" node's descendants would be missing from the result.
125+
val names = StateController.descendantNames("shells").getOrElse(fail("expected Right"))
126+
assert(names.contains("Polyplacophora"), s"expected Polyplacophora in $names")
127+
assert(names.contains("Decapoda"), s"expected Decapoda in $names")
128+
129+
test("findNodesByName returns all nodes sharing a name"):
130+
val nodes = State.data.get.findNodesByName("shells")
131+
assertEquals(nodes.size, 2)
132+
assert(nodes.map(_.name).toSet == Set("Polyplacophora", "Decapoda"))
133+
134+
test("findNodeByName on a shared alternate name returns an accepted node"):
135+
val nodeOpt = State.data.get.findNodeByName("shells")
136+
assert(nodeOpt.isDefined)
137+
val node = nodeOpt.get
138+
assert(node.name == "Polyplacophora" || node.name == "Decapoda")
139+
assert(node.isAccepted)
140+
117141
// ---- ancestorNames ----------------------------------------------------
118142

119143
test("ancestorNames returns full path from root to node"):

0 commit comments

Comments
 (0)