From 03faedc0d7487349c606ebe9009e4ce1ce4184c2 Mon Sep 17 00:00:00 2001 From: abl Date: Sat, 4 Nov 2023 23:38:59 +0100 Subject: [PATCH] musicbrainz enricher --- SEMANTIC_CONVENTIONS.md | 11 +- boudicca.base/enricher-utils/.gitignore | 1 + .../boudicca/enricher_utils/EnricherTester.kt | 2 +- .../enricher_utils/MusicBrainzImporter.kt | 98 +++++++++ .../enricher/service/CategoryEnricher.kt | 2 + .../service/MusicBrainzArtistEnricher.kt | 13 -- .../enricher/service/musicbrainz/Artist.kt | 7 + .../service/musicbrainz/ArtistMatcher.kt | 74 +++++++ .../musicbrainz/MusicBrainzArtistEnricher.kt | 96 +++++++++ .../src/main/resources/application.properties | 10 + .../service/musicbrainz/ArtistMatcherTest.kt | 189 ++++++++++++++++++ 11 files changed, 484 insertions(+), 19 deletions(-) create mode 100644 boudicca.base/enricher-utils/.gitignore create mode 100644 boudicca.base/enricher-utils/src/main/kotlin/base/boudicca/enricher_utils/MusicBrainzImporter.kt delete mode 100644 boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/MusicBrainzArtistEnricher.kt create mode 100644 boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/Artist.kt create mode 100644 boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/ArtistMatcher.kt create mode 100644 boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/MusicBrainzArtistEnricher.kt create mode 100644 boudicca.base/enricher/src/test/kotlin/base/boudicca/enricher/service/musicbrainz/ArtistMatcherTest.kt diff --git a/SEMANTIC_CONVENTIONS.md b/SEMANTIC_CONVENTIONS.md index afff0cc7..e8606169 100644 --- a/SEMANTIC_CONVENTIONS.md +++ b/SEMANTIC_CONVENTIONS.md @@ -19,7 +19,8 @@ We use certain data types for the properties we expect. 
* `date`: [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) Timestamp as text, for example: `2009-06-30T18:30:00+02:00` * `url`: A URL as text * `coordinates`: longitude + latitude in Decimal degrees (DD) in the format `, ` -* `list`: A list of elements, the `?` describes the type of the elements in the list +* `list`: A list of elements, the `?` describes the type of the elements in the list. Currently elements in a list + are separated by a newline, but this will probably change sometime * `enum`: Has to be one of the specified distinct values * `boolean`: The text "true" or "false" @@ -75,10 +76,10 @@ We use certain data types for the properties we expect. ### Concert(/Music?) Properties -| Key | Meaning | Format | -|------------------|------------------------------------------------|------------| -| concert.genre | Genre of the concert | text | -| concert.bandlist | List of all bands playing (main act + support) | list | +| Key | Meaning | Format | +|------------------|---------------------------|------------| +| concert.genre | Genre of the concert | text | +| concert.bandlist | List of all bands playing | list | ### Internal Properties diff --git a/boudicca.base/enricher-utils/.gitignore b/boudicca.base/enricher-utils/.gitignore new file mode 100644 index 00000000..07f43b87 --- /dev/null +++ b/boudicca.base/enricher-utils/.gitignore @@ -0,0 +1 @@ +data/* \ No newline at end of file diff --git a/boudicca.base/enricher-utils/src/main/kotlin/base/boudicca/enricher_utils/EnricherTester.kt b/boudicca.base/enricher-utils/src/main/kotlin/base/boudicca/enricher_utils/EnricherTester.kt index b69b50e1..8d8d6d02 100644 --- a/boudicca.base/enricher-utils/src/main/kotlin/base/boudicca/enricher_utils/EnricherTester.kt +++ b/boudicca.base/enricher-utils/src/main/kotlin/base/boudicca/enricher_utils/EnricherTester.kt @@ -14,7 +14,7 @@ fun main() { val events = getEvents() println("fetch all events took ${System.currentTimeMillis() - startTime}ms") - val filteredEvents = 
events.filter { it.data[SemanticKeys.COLLECTORNAME] == "linz termine" } + val filteredEvents = events//.filter { it.data[SemanticKeys.COLLECTORNAME] == "posthof" } startTime = System.currentTimeMillis() val enrichedEvents = enrich(filteredEvents) diff --git a/boudicca.base/enricher-utils/src/main/kotlin/base/boudicca/enricher_utils/MusicBrainzImporter.kt b/boudicca.base/enricher-utils/src/main/kotlin/base/boudicca/enricher_utils/MusicBrainzImporter.kt new file mode 100644 index 00000000..cd1566d7 --- /dev/null +++ b/boudicca.base/enricher-utils/src/main/kotlin/base/boudicca/enricher_utils/MusicBrainzImporter.kt @@ -0,0 +1,98 @@ +package base.boudicca.enricher_utils + +import org.json.JSONArray +import org.json.JSONObject +import java.io.* +import java.util.zip.GZIPOutputStream +import java.util.zip.ZipOutputStream +import kotlin.streams.asSequence + +fun main() { + + val reader = BufferedReader(FileReader(File("./boudicca.base/enricher-utils/data/artist"))) + val dataOut = OutputStreamWriter(GZIPOutputStream(BufferedOutputStream(FileOutputStream("./boudicca.base/enricher-utils/data/artist_parsed.json.gz", false)))) + val indexOut = GZIPOutputStream(BufferedOutputStream(FileOutputStream("./boudicca.base/enricher-utils/data/artist.index.gz", false))) + + val artists = reader.lines().asSequence() +// .take(100) + .map { mapArtist(it) } + .toList() + + val filteredArtists = getFilteredArtists(artists) + + serialize(filteredArtists).write(dataOut) + writeIndex(indexOut, filteredArtists) + indexOut.close() + dataOut.close() + reader.close() +} + +fun writeIndex(indexOut: OutputStream, filteredArtists: List) { + val allNames = filteredArtists.mapIndexed { i, artist -> Pair(i, artist.name.lowercase()) } + val sortedList = allNames.sortedBy { it.second } + + for (pair in sortedList) { + val i = pair.first + indexOut.write(i.shr(24)) + indexOut.write(i.shr(16)) + indexOut.write(i.shr(8)) + indexOut.write(i) + } +} + +private fun getFilteredArtists(artists: List): List { + 
var filtered = artists.filter { it.name.length >= 3 /*&& it.aliases.all { it.length >= 3 }*/ } + + filtered = filtered.filter { !it.ended } + +// filtered = filtered.filter { it.genre != null } + + val names = filtered.map { it.name.lowercase() }.groupBy { it }.mapValues { it.value.size } + filtered = filtered.filter { names[it.name.lowercase()]!! == 1 } + + return filtered +} + +fun serialize(artists: List): JSONArray { + val array = JSONArray() + for (artist in artists) { + val artistObject = JSONObject() + artistObject.put("name", artist.name) + artistObject.put("genre", artist.genre) + val aliasesArray = JSONArray() + for (alias in artist.aliases) { + aliasesArray.put(alias) + } +// artistObject.put("aliases", aliasesArray) + array.put(artistObject) + } + return array +} + +fun mapArtist(line: String): Artist { + val jsonObject = JSONObject(line) + val name = jsonObject.getString("name") + val genre = mapGenre(jsonObject.getJSONArray("genres")) + val aliases = jsonObject.getJSONArray("aliases").map { (it as JSONObject).getString("name") } + val ended = jsonObject.has("ended") && jsonObject.getBoolean("ended") + return Artist(name, genre, aliases, ended) +} + +fun mapGenre(jsonArray: JSONArray): String? 
{ + val list = mutableListOf>() + for (entry in jsonArray) { + val obj = entry as JSONObject + list.add(Pair(obj.getString("name"), obj.getInt("count"))) + } + return list.sortedWith( + Comparator.comparing?, Int?> { it.second }.reversed() + .then(Comparator.comparing { it.first }) + ).firstOrNull()?.first +} + +data class Artist( + val name: String, + val genre: String?, + val aliases: List, + val ended: Boolean, +) \ No newline at end of file diff --git a/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/CategoryEnricher.kt b/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/CategoryEnricher.kt index 6d7ec373..dd22bc52 100644 --- a/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/CategoryEnricher.kt +++ b/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/CategoryEnricher.kt @@ -3,9 +3,11 @@ package base.boudicca.enricher.service import base.boudicca.model.Event import base.boudicca.model.EventCategory import base.boudicca.SemanticKeys +import org.springframework.core.annotation.Order import org.springframework.stereotype.Service @Service +@Order(0) class CategoryEnricher : Enricher { override fun enrich(e: Event): Event { diff --git a/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/MusicBrainzArtistEnricher.kt b/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/MusicBrainzArtistEnricher.kt deleted file mode 100644 index c5e2a9e6..00000000 --- a/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/MusicBrainzArtistEnricher.kt +++ /dev/null @@ -1,13 +0,0 @@ -package base.boudicca.enricher.service - -import base.boudicca.model.Event -import org.springframework.stereotype.Service - -@Service -class MusicBrainzArtistEnricher : Enricher { - - override fun enrich(e: Event): Event { - return e //TODO - } - -} \ No newline at end of file diff --git 
a/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/Artist.kt b/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/Artist.kt new file mode 100644 index 00000000..de786a06 --- /dev/null +++ b/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/Artist.kt @@ -0,0 +1,7 @@ +package base.boudicca.enricher.service.musicbrainz + +data class Artist( + val name: String, + val genre: String?, + val lowercaseName: String = name.lowercase(), +) \ No newline at end of file diff --git a/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/ArtistMatcher.kt b/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/ArtistMatcher.kt new file mode 100644 index 00000000..01638c51 --- /dev/null +++ b/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/ArtistMatcher.kt @@ -0,0 +1,74 @@ +package base.boudicca.enricher.service.musicbrainz + +import java.nio.ByteBuffer + +class ArtistMatcher(private val artists: List, private val index: ByteBuffer) { + fun findArtists(string: String): List { + if (string.isEmpty()) { + return emptyList() + } + + val lowerString = string.lowercase() + + //TODO duplicated artists found? + val foundArtists = mutableListOf() + foundArtists.addAll(matchArtistsFrom(lowerString, 0)) + for (i in lowerString.indices) { + if (!lowerString[i].isLetterOrDigit()) { + if (i + 1 < lowerString.length) { + foundArtists.addAll(matchArtistsFrom(lowerString, i + 1)) + } + } + } + + return foundArtists + } + + private fun matchArtistsFrom(string: String, stringIndex: Int): List { + val matchedArtists = mutableListOf() + + var min = 0 + var max = artists.size - 1 + while (min <= max) { + val next = (min + max) / 2 + val compare = compare(string, stringIndex, next) + if (compare == 0) { + //TODO what about multiple matches? 
+ matchedArtists.add(artists[index.getInt(next * 4)]) + break + } else if (compare < 0) { + max = next - 1 + } else { + min = next + 1 + } + } + + return matchedArtists + } + + private fun compare(string: String, stringIndex: Int, indexIndex: Int): Int { + val artistName = artists[index.getInt(indexIndex * 4)].lowercaseName + + for (i in artistName.indices) { + val currentStringIndex = stringIndex + i + if (currentStringIndex >= string.length) { + return -1 + } + if (string[currentStringIndex] < artistName[i]) { + return -1 + } + if (string[currentStringIndex] > artistName[i]) { + return 1 + } + } + + //substrings matches! now look if we are at a word boundary + if (stringIndex + artistName.length == string.length) { + return 0 + } + if (string[stringIndex + artistName.length].isLetterOrDigit()) { + return 1 + } + return 0 + } +} \ No newline at end of file diff --git a/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/MusicBrainzArtistEnricher.kt b/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/MusicBrainzArtistEnricher.kt new file mode 100644 index 00000000..a7173e92 --- /dev/null +++ b/boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/MusicBrainzArtistEnricher.kt @@ -0,0 +1,96 @@ +package base.boudicca.enricher.service.musicbrainz + +import base.boudicca.SemanticKeys +import base.boudicca.enricher.service.Enricher +import base.boudicca.model.Event +import base.boudicca.model.EventCategory +import com.fasterxml.jackson.core.type.TypeReference +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.module.kotlin.KotlinModule +import org.slf4j.LoggerFactory +import org.springframework.beans.factory.annotation.Autowired +import org.springframework.beans.factory.annotation.Value +import org.springframework.stereotype.Service +import java.io.BufferedInputStream +import java.io.File +import java.io.FileInputStream +import java.nio.ByteBuffer 
+import java.util.zip.GZIPInputStream + +@Service +class MusicBrainzArtistEnricher @Autowired constructor( + @Value("\${boudicca.enricher.musicbrainz.data.path:}") musicBrainzDataPath: String?, + @Value("\${boudicca.enricher.musicbrainz.index.path:}") musicBrainzIndexPath: String?, +) : Enricher { + + private val LOG = LoggerFactory.getLogger(this.javaClass) + + private val objectMapper = ObjectMapper().registerModule(KotlinModule.Builder().build()) + private val artistMatcher = createArtistMatcher(musicBrainzDataPath, musicBrainzIndexPath) + + override fun enrich(e: Event): Event { + if (artistMatcher == null) { + return e + } + return doEnrich(e, artistMatcher) + } + + private fun doEnrich(e: Event, artistMatcher: ArtistMatcher): Event { + if (e.data[SemanticKeys.CATEGORY] != EventCategory.MUSIC.name) { + return e + } + val foundArtists = artistMatcher.findArtists(e.name) + if (foundArtists.isNotEmpty()) { + val nonSubstringArtists = foundArtists.filter { artist -> + foundArtists.none { it.name.length != artist.name.length && it.name.contains(artist.name, true) } + } + return insertArtistData(e, nonSubstringArtists) + } + return e + } + + private fun insertArtistData(e: Event, artists: List): Event { + val enrichedData = e.data.toMutableMap() + enrichedData[SemanticKeys.CONCERT_BANDLIST] = artists.joinToString("\n") { it.name } + val genre = artists.firstNotNullOfOrNull { it.genre } + if (genre != null && !enrichedData.containsKey(SemanticKeys.CONCERT_GENRE)) { + enrichedData[SemanticKeys.CONCERT_GENRE] = genre + } + return Event(e.name, e.startDate, enrichedData) + } + + private fun createArtistMatcher(musicBrainzDataPath: String?, musicBrainzIndexPath: String?): ArtistMatcher? { + val data = loadData(musicBrainzDataPath) + val index = loadIndex(musicBrainzIndexPath) + if (data != null && index != null) { + return ArtistMatcher(data, index) + } + return null + } + + private fun loadData(musicBrainzDataPath: String?): List? 
{ + if (musicBrainzDataPath.isNullOrBlank()) { + LOG.debug("no musicBrainzDataPath given, disabling enricher") + return null + } + val file = File(musicBrainzDataPath) + if (!file.exists() || !file.isFile || !file.canRead()) { + throw IllegalArgumentException("musicbrainz data path $musicBrainzDataPath is not a readable file!") + } + return objectMapper + .readValue(BufferedInputStream(GZIPInputStream(FileInputStream(file))), object : TypeReference?>() {}) + } + + private fun loadIndex(musicBrainzIndexPath: String?): ByteBuffer? { + if (musicBrainzIndexPath.isNullOrBlank()) { + LOG.debug("no musicBrainzIndexPath given, disabling enricher") + return null + } + val file = File(musicBrainzIndexPath) + if (!file.exists() || !file.isFile || !file.canRead()) { + throw IllegalArgumentException("musicbrainz index path $musicBrainzIndexPath is not a readable file!") + } + return ByteBuffer.wrap(BufferedInputStream(GZIPInputStream(FileInputStream(file))).readBytes()) + } + +} diff --git a/boudicca.base/enricher/src/main/resources/application.properties b/boudicca.base/enricher/src/main/resources/application.properties index cd2d02be..a4b962f1 100644 --- a/boudicca.base/enricher/src/main/resources/application.properties +++ b/boudicca.base/enricher/src/main/resources/application.properties @@ -1 +1,11 @@ server.port=8085 + +#for local testing, enable if needed + +#musicbrainz enricher +#boudicca.enricher.musicbrainz.data.path=./boudicca.base/enricher-utils/data/artist_parsed.json.gz +#boudicca.enricher.musicbrainz.index.path=./boudicca.base/enricher-utils/data/artist.index.gz + +#location enricher +#boudicca.enricher.location.googleCredentialsPath=./boudicca.base/enricher-utils/data/googleCredentials.json +#boudicca.enricher.location.spreadsheetId=1yYOE5gRR6gjNBim7hwEe3__fXoRAMtREkYbs-lsn7uM diff --git a/boudicca.base/enricher/src/test/kotlin/base/boudicca/enricher/service/musicbrainz/ArtistMatcherTest.kt 
b/boudicca.base/enricher/src/test/kotlin/base/boudicca/enricher/service/musicbrainz/ArtistMatcherTest.kt new file mode 100644 index 00000000..4223b088 --- /dev/null +++ b/boudicca.base/enricher/src/test/kotlin/base/boudicca/enricher/service/musicbrainz/ArtistMatcherTest.kt @@ -0,0 +1,189 @@ +package base.boudicca.enricher.service.musicbrainz + +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Assertions.assertTrue +import org.junit.jupiter.api.Test +import java.nio.ByteBuffer + +class ArtistMatcherTest { + + @Test + fun emptyInput() { + val artistMatcher = testMatcher(emptyList(), ByteArray(0)) + + val result = artistMatcher.findArtists("test") + + assertEquals(0, result.size) + } + + @Test + fun emptyString() { + val artistMatcher = testMatcher(emptyList(), ByteArray(0)) + + val result = artistMatcher.findArtists("") + + assertEquals(0, result.size) + } + + @Test + fun testSingleArtistMatch() { + val artistMatcher = testMatcher( + listOf( + Artist("artist", null) + ), + byteArrayOf( + 0x00, 0x00, 0x00, 0x00, + ) + ) + + val result = artistMatcher.findArtists("artist") + + assertEquals(1, result.size) + assertEquals("artist", result[0].name) + } + + @Test + fun testThreeArtists() { + val artistMatcher = testMatcher( + listOf( + Artist("b", null), + Artist("a", null), + Artist("c", null), + ), + byteArrayOf( + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x02, + ) + ) + + var result = artistMatcher.findArtists("a") + assertEquals(1, result.size) + assertEquals("a", result[0].name) + + result = artistMatcher.findArtists("b") + assertEquals(1, result.size) + assertEquals("b", result[0].name) + + result = artistMatcher.findArtists("c") + assertEquals(1, result.size) + assertEquals("c", result[0].name) + } + + @Test + fun testThreeArtistsMatch() { + val artistMatcher = testMatcher( + listOf( + Artist("a", null), + Artist("b", null), + Artist("c", null), + ), + byteArrayOf( + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 
0x00, 0x01, + 0x00, 0x00, 0x00, 0x02, + ) + ) + + val result = artistMatcher.findArtists("a b c") + assertEquals(3, result.size) + assertTrue(result.any { it.name == "a" }) + assertTrue(result.any { it.name == "b" }) + assertTrue(result.any { it.name == "c" }) + } + + @Test + fun testLongerArtistMatch() { + val artistMatcher = testMatcher( + listOf( + Artist("artist fancy name", null), + Artist("b", null), + Artist("c", null), + ), + byteArrayOf( + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x02, + ) + ) + + val result = artistMatcher.findArtists("this is artist fancy name show") + assertEquals(1, result.size) + assertEquals("artist fancy name", result[0].name) + } + + @Test + fun testCaseInsensitive() { + val artistMatcher = testMatcher( + listOf( + Artist("a", null), + Artist("b", null), + Artist("c", null), + ), + byteArrayOf( + 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x02, + ) + ) + + val result = artistMatcher.findArtists("A") + assertEquals(1, result.size) + assertEquals("a", result[0].name) + } + + @Test + fun testNoSubmatch() { + val artistMatcher = testMatcher( + listOf( + Artist("a", null), + ), + byteArrayOf( + 0x00, 0x00, 0x00, 0x00, + ) + ) + + val result = artistMatcher.findArtists("ab") + assertEquals(0, result.size) + } + + @Test + fun testSubmatchWithSpecialChar() { + val artistMatcher = testMatcher( + listOf( + Artist("a", null), + ), + byteArrayOf( + 0x00, 0x00, 0x00, 0x00, + ) + ) + + val result = artistMatcher.findArtists("a&b") + assertEquals(1, result.size) + } + + @Test + fun testArtistWithSpecialChar() { + val artistMatcher = testMatcher( + listOf( + Artist("a&b", null), + Artist("a - b", null), + ), + byteArrayOf( + 0x00, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x00, + ) + ) + + var result = artistMatcher.findArtists("show of a&b wow") + assertEquals(1, result.size) + assertEquals("a&b", result[0].name) + result = artistMatcher.findArtists("show of a - b wow") + assertEquals(1, 
result.size) + assertEquals("a - b", result[0].name) + } + + private fun testMatcher(artists: List, bytes: ByteArray): ArtistMatcher { + return ArtistMatcher(artists, ByteBuffer.wrap(bytes)) + } +} \ No newline at end of file