Skip to content

Commit

Permalink
Merge pull request #185 from boudicca-events/abl/musicbrainz
Browse files Browse the repository at this point in the history
musicbrainz enricher
  • Loading branch information
kadhonn authored Nov 4, 2023
2 parents 2ed82ff + 03faedc commit 6e7a3ff
Show file tree
Hide file tree
Showing 11 changed files with 484 additions and 19 deletions.
11 changes: 6 additions & 5 deletions SEMANTIC_CONVENTIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@ We use certain data types for the properties we expect.
* `date`: [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601) Timestamp as text, for example: `2009-06-30T18:30:00+02:00`
* `url`: A URL as text
* `coordinates`: longitude + latitude in Decimal degrees (DD) in the format `<longitude>, <latitude>`
* `list<?>`: A list of elements, the `?` describes the type of the elements in the list
* `list<?>`: A list of elements, the `?` describes the type of the elements in the list. Currently elements in a list
are separated by a newline, but this will probably change sometime
* `enum<?>`: Has to be one of the specified distinct values
* `boolean`: The text "true" or "false"

Expand Down Expand Up @@ -75,10 +76,10 @@ We use certain data types for the properties we expect.

### Concert(/Music?) Properties

| Key | Meaning | Format |
|------------------|------------------------------------------------|------------|
| concert.genre | Genre of the concert | text |
| concert.bandlist | List of all bands playing (main act + support) | list<text> |
| Key | Meaning | Format |
|------------------|---------------------------|------------|
| concert.genre | Genre of the concert | text |
| concert.bandlist | List of all bands playing | list<text> |

### Internal Properties

Expand Down
1 change: 1 addition & 0 deletions boudicca.base/enricher-utils/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
data/*
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ fun main() {
val events = getEvents()
println("fetch all events took ${System.currentTimeMillis() - startTime}ms")

val filteredEvents = events.filter { it.data[SemanticKeys.COLLECTORNAME] == "linz termine" }
val filteredEvents = events//.filter { it.data[SemanticKeys.COLLECTORNAME] == "posthof" }

startTime = System.currentTimeMillis()
val enrichedEvents = enrich(filteredEvents)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package base.boudicca.enricher_utils

import org.json.JSONArray
import org.json.JSONObject
import java.io.*
import java.util.zip.GZIPOutputStream
import java.util.zip.ZipOutputStream
import kotlin.streams.asSequence

/**
 * Converts the raw artist dump into the two files produced by this tool.
 *
 * Every line of `data/artist` is parsed with [mapArtist], the result is reduced with
 * [getFilteredArtists], and the remaining artists are written as a gzipped JSON array
 * (`artist_parsed.json.gz`) plus a gzipped name-sorted index (`artist.index.gz`, see [writeIndex]).
 *
 * All streams are closed via `use`, also when parsing or writing fails. The output files are only
 * opened (and thereby truncated) after the input has been read and filtered successfully.
 */
fun main() {
    val dataDir = "./boudicca.base/enricher-utils/data"

    // explicit UTF-8 so the result does not depend on the platform default charset
    val artists = File("$dataDir/artist").useLines(Charsets.UTF_8) { lines ->
        lines
            // .take(100)
            .map { mapArtist(it) }
            .toList()
    }

    val filteredArtists = getFilteredArtists(artists)

    OutputStreamWriter(
        GZIPOutputStream(BufferedOutputStream(FileOutputStream("$dataDir/artist_parsed.json.gz", false))),
        Charsets.UTF_8,
    ).use { dataOut ->
        serialize(filteredArtists).write(dataOut)
    }

    GZIPOutputStream(BufferedOutputStream(FileOutputStream("$dataDir/artist.index.gz", false))).use { indexOut ->
        writeIndex(indexOut, filteredArtists)
    }
}

/**
 * Writes the positions of [filteredArtists], ordered by lowercased artist name, to [indexOut].
 *
 * Each position is emitted as four bytes, most significant byte first. The stream is neither
 * flushed nor closed here.
 */
fun writeIndex(indexOut: OutputStream, filteredArtists: List<Artist>) {
    val lowercaseNames = filteredArtists.map { it.name.lowercase() }
    // sortedBy is stable, so artists with equal names keep their list order
    val positionsByName = filteredArtists.indices.sortedBy { lowercaseNames[it] }

    positionsByName.forEach { position ->
        // OutputStream.write(Int) only emits the low 8 bits of its argument
        for (shift in intArrayOf(24, 16, 8, 0)) {
            indexOut.write(position shr shift)
        }
    }
}

/**
 * Keeps only the artists that are usable for name matching:
 * the name has at least three characters, the artist has not ended, and no other remaining
 * artist shares the same name when compared case-insensitively (via `lowercase()`).
 *
 * No filtering on alias length or genre presence is applied.
 */
private fun getFilteredArtists(artists: List<Artist>): List<Artist> {
    val candidates = artists.filter { it.name.length >= 3 && !it.ended }

    // how often each lowercased name occurs among the candidates
    val occurrences = candidates.groupingBy { it.name.lowercase() }.eachCount()

    return candidates.filter { occurrences[it.name.lowercase()] == 1 }
}

/**
 * Builds the JSON array of all [artists], one object per artist with the keys "name" and "genre".
 *
 * Aliases are not written to the output, so no alias array is built here.
 *
 * @return a [JSONArray] containing the artist objects in the order of [artists]
 */
fun serialize(artists: List<Artist>): JSONArray {
    val array = JSONArray()
    for (artist in artists) {
        val artistObject = JSONObject()
        artistObject.put("name", artist.name)
        // NOTE(review): org.json documents that putting a null value removes the key,
        // so artists without a genre presumably end up with only "name" - confirm if relied upon
        artistObject.put("genre", artist.genre)
        array.put(artistObject)
    }
    return array
}

/**
 * Parses one line of the artist dump (a single JSON object) into an [Artist].
 *
 * Reads the "name" string, the "genres" array (reduced to one genre by [mapGenre]), the "name" of
 * every object in the "aliases" array, and the optional boolean "ended" (false when absent).
 */
fun mapArtist(line: String): Artist {
    val json = JSONObject(line)
    return Artist(
        name = json.getString("name"),
        genre = mapGenre(json.getJSONArray("genres")),
        aliases = json.getJSONArray("aliases").map { alias -> (alias as JSONObject).getString("name") },
        ended = if (json.has("ended")) json.getBoolean("ended") else false,
    )
}

/**
 * Picks a single genre name from an array of objects carrying a "name" and a "count".
 *
 * The entry with the highest count wins; ties are broken by the alphabetically smallest name.
 *
 * @return the chosen genre name, or null when [jsonArray] is empty
 */
fun mapGenre(jsonArray: JSONArray): String? {
    val byCountDescThenName = compareByDescending<Pair<String, Int>> { it.second }.thenBy { it.first }

    return jsonArray
        .map { entry ->
            val genre = entry as JSONObject
            genre.getString("name") to genre.getInt("count")
        }
        .sortedWith(byCountDescThenName)
        .firstOrNull()
        ?.first
}

/**
 * One artist as parsed from a line of the artist dump by [mapArtist].
 *
 * @property name value of the "name" field
 * @property genre genre name selected by [mapGenre], or null when the "genres" array is empty
 * @property aliases the "name" of every entry in the "aliases" array
 * @property ended value of the "ended" field; false when the field is absent
 */
data class Artist(
val name: String,
val genre: String?,
val aliases: List<String>,
val ended: Boolean,
)
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@ package base.boudicca.enricher.service
import base.boudicca.model.Event
import base.boudicca.model.EventCategory
import base.boudicca.SemanticKeys
import org.springframework.core.annotation.Order
import org.springframework.stereotype.Service

@Service
@Order(0)
class CategoryEnricher : Enricher {

override fun enrich(e: Event): Event {
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package base.boudicca.enricher.service.musicbrainz

/**
 * An artist used by the musicbrainz enricher for name matching.
 *
 * @property name the artist name; this is the value written into an event's band list
 * @property genre genre of the artist, or null when none is available
 * @property lowercaseName the name used for comparisons; defaults to `name.lowercase()`
 */
data class Artist(
val name: String,
val genre: String?,
val lowercaseName: String = name.lowercase(),
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package base.boudicca.enricher.service.musicbrainz

import java.nio.ByteBuffer

/**
 * Finds known artists inside free text by binary-searching a name-sorted index.
 *
 * @param artists all known artists
 * @param index sequence of 4-byte integers read with [ByteBuffer.getInt]; entry `i` lives at byte
 *   offset `i * 4` and holds a position in [artists]. The binary search assumes the entries are
 *   ordered by [Artist.lowercaseName] and that there is one entry per artist - TODO confirm
 *   against the tool that writes the index.
 */
class ArtistMatcher(private val artists: List<Artist>, private val index: ByteBuffer) {

    /**
     * Returns the artists whose lowercase name occurs in [string] as whole words.
     *
     * Matching is attempted at the start of the text and directly after every character that is
     * not a letter or digit. An artist mentioned several times is returned only once, in order of
     * first occurrence.
     */
    fun findArtists(string: String): List<Artist> {
        if (string.isEmpty()) {
            return emptyList()
        }

        val lowerString = string.lowercase()

        val foundArtists = mutableListOf<Artist>()
        foundArtists.addAll(matchArtistsFrom(lowerString, 0))
        for (i in lowerString.indices) {
            // a new word can only start right after a non-alphanumeric character
            if (!lowerString[i].isLetterOrDigit() && i + 1 < lowerString.length) {
                foundArtists.addAll(matchArtistsFrom(lowerString, i + 1))
            }
        }

        // the same artist may be hit at several start positions; keep the first hit only
        return foundArtists.distinct()
    }

    /**
     * Binary-searches the index for an artist whose name starts at [stringIndex] in [string]
     * and ends at a word boundary. Returns at most one artist.
     */
    private fun matchArtistsFrom(string: String, stringIndex: Int): List<Artist> {
        val matchedArtists = mutableListOf<Artist>()

        var min = 0
        var max = artists.size - 1
        while (min <= max) {
            // overflow-safe midpoint
            val next = min + (max - min) / 2
            val compare = compare(string, stringIndex, next)
            if (compare == 0) {
                //TODO what about multiple matches? when several artist names are word-boundary
                // prefixes of the text at this position, only the one the search hits first is returned
                matchedArtists.add(artists[index.getInt(next * 4)])
                break
            } else if (compare < 0) {
                max = next - 1
            } else {
                min = next + 1
            }
        }

        return matchedArtists
    }

    /**
     * Compares the text starting at [stringIndex] with the lowercase name of the artist
     * referenced by index entry [indexIndex].
     *
     * @return -1 if the text sorts before the name (including when the text ends before the name
     *   does), 1 if it sorts after the name or the name is followed by a letter or digit, and 0 if
     *   the name matches and is followed by the end of the text or a non-alphanumeric character
     */
    private fun compare(string: String, stringIndex: Int, indexIndex: Int): Int {
        val artistName = artists[index.getInt(indexIndex * 4)].lowercaseName

        for (i in artistName.indices) {
            val currentStringIndex = stringIndex + i
            if (currentStringIndex >= string.length) {
                return -1
            }
            if (string[currentStringIndex] < artistName[i]) {
                return -1
            }
            if (string[currentStringIndex] > artistName[i]) {
                return 1
            }
        }

        //substrings matches! now look if we are at a word boundary
        if (stringIndex + artistName.length == string.length) {
            return 0
        }
        if (string[stringIndex + artistName.length].isLetterOrDigit()) {
            // the word continues, so the text sorts after this (shorter) name
            return 1
        }
        return 0
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
package base.boudicca.enricher.service.musicbrainz

import base.boudicca.SemanticKeys
import base.boudicca.enricher.service.Enricher
import base.boudicca.model.Event
import base.boudicca.model.EventCategory
import com.fasterxml.jackson.core.type.TypeReference
import com.fasterxml.jackson.databind.ObjectMapper
import com.fasterxml.jackson.module.kotlin.KotlinModule
import org.slf4j.LoggerFactory
import org.springframework.beans.factory.annotation.Autowired
import org.springframework.beans.factory.annotation.Value
import org.springframework.stereotype.Service
import java.io.BufferedInputStream
import java.io.File
import java.io.FileInputStream
import java.nio.ByteBuffer
import java.util.zip.GZIPInputStream

/**
 * Enricher that looks for known artist names in the name of music events and adds the band list
 * and, if not already present, a genre.
 *
 * The artist data and its index are loaded once at construction time from the gzipped files
 * configured via `boudicca.enricher.musicbrainz.data.path` and
 * `boudicca.enricher.musicbrainz.index.path`. If either path is missing or blank the enricher is
 * disabled and returns events unchanged.
 *
 * @throws IllegalArgumentException at construction when a configured path is not a readable file
 */
@Service
class MusicBrainzArtistEnricher @Autowired constructor(
    @Value("\${boudicca.enricher.musicbrainz.data.path:}") musicBrainzDataPath: String?,
    @Value("\${boudicca.enricher.musicbrainz.index.path:}") musicBrainzIndexPath: String?,
) : Enricher {

    private val LOG = LoggerFactory.getLogger(this.javaClass)

    // must be declared before artistMatcher: property initializers run in declaration order
    private val objectMapper = ObjectMapper().registerModule(KotlinModule.Builder().build())
    private val artistMatcher = createArtistMatcher(musicBrainzDataPath, musicBrainzIndexPath)

    /** Returns [e] enriched with artist data, or [e] itself when the enricher is disabled. */
    override fun enrich(e: Event): Event {
        if (artistMatcher == null) {
            return e
        }
        return doEnrich(e, artistMatcher)
    }

    /**
     * Only events whose category is MUSIC are processed. Found artists whose name is contained
     * (ignoring case) in the longer or shorter name of another found artist are dropped.
     */
    private fun doEnrich(e: Event, artistMatcher: ArtistMatcher): Event {
        if (e.data[SemanticKeys.CATEGORY] != EventCategory.MUSIC.name) {
            return e
        }
        val foundArtists = artistMatcher.findArtists(e.name)
        if (foundArtists.isNotEmpty()) {
            val nonSubstringArtists = foundArtists.filter { artist ->
                foundArtists.none { it.name.length != artist.name.length && it.name.contains(artist.name, true) }
            }
            return insertArtistData(e, nonSubstringArtists)
        }
        return e
    }

    /**
     * Returns a copy of [e] whose band list is set to the newline-separated artist names
     * (replacing any existing value). The genre of the first artist that has one is added only
     * when the event does not already carry a genre.
     */
    private fun insertArtistData(e: Event, artists: List<Artist>): Event {
        val enrichedData = e.data.toMutableMap()
        enrichedData[SemanticKeys.CONCERT_BANDLIST] = artists.joinToString("\n") { it.name }
        val genre = artists.firstNotNullOfOrNull { it.genre }
        if (genre != null && !enrichedData.containsKey(SemanticKeys.CONCERT_GENRE)) {
            enrichedData[SemanticKeys.CONCERT_GENRE] = genre
        }
        return Event(e.name, e.startDate, enrichedData)
    }

    /** Builds the matcher, or returns null when the data or the index is not configured. */
    private fun createArtistMatcher(musicBrainzDataPath: String?, musicBrainzIndexPath: String?): ArtistMatcher? {
        val data = loadData(musicBrainzDataPath)
        val index = loadIndex(musicBrainzIndexPath)
        if (data != null && index != null) {
            return ArtistMatcher(data, index)
        }
        return null
    }

    /**
     * Reads the gzipped JSON artist list from [musicBrainzDataPath].
     *
     * @return the parsed artists, or null when no path is configured
     * @throws IllegalArgumentException when the path is not a readable file
     */
    private fun loadData(musicBrainzDataPath: String?): List<Artist>? {
        if (musicBrainzDataPath.isNullOrBlank()) {
            LOG.debug("no musicBrainzDataPath given, disabling enricher")
            return null
        }
        val file = File(musicBrainzDataPath)
        if (!file.exists() || !file.isFile || !file.canRead()) {
            throw IllegalArgumentException("musicbrainz data path $musicBrainzDataPath is not a readable file!")
        }
        // the outer use closes the file even if the gzip header is invalid,
        // the inner one releases the gzip stream once parsing is done
        return FileInputStream(file).use { fileIn ->
            BufferedInputStream(GZIPInputStream(fileIn)).use { dataIn ->
                objectMapper.readValue(dataIn, object : TypeReference<List<Artist>?>() {})
            }
        }
    }

    /**
     * Reads the whole gzipped index file at [musicBrainzIndexPath] into memory.
     *
     * @return a buffer wrapping the uncompressed bytes, or null when no path is configured
     * @throws IllegalArgumentException when the path is not a readable file
     */
    private fun loadIndex(musicBrainzIndexPath: String?): ByteBuffer? {
        if (musicBrainzIndexPath.isNullOrBlank()) {
            LOG.debug("no musicBrainzIndexPath given, disabling enricher")
            return null
        }
        val file = File(musicBrainzIndexPath)
        if (!file.exists() || !file.isFile || !file.canRead()) {
            throw IllegalArgumentException("musicbrainz index path $musicBrainzIndexPath is not a readable file!")
        }
        // readBytes() does not close the stream, so close it explicitly
        val bytes = FileInputStream(file).use { fileIn ->
            BufferedInputStream(GZIPInputStream(fileIn)).use { it.readBytes() }
        }
        return ByteBuffer.wrap(bytes)
    }

}
10 changes: 10 additions & 0 deletions boudicca.base/enricher/src/main/resources/application.properties
Original file line number Diff line number Diff line change
@@ -1 +1,11 @@
server.port=8085

#for local testing, enable if needed

#musicbrainz enricher
#boudicca.enricher.musicbrainz.data.path=./boudicca.base/enricher-utils/data/artist_parsed.json.gz
#boudicca.enricher.musicbrainz.index.path=./boudicca.base/enricher-utils/data/artist.index.gz

#location enricher
#boudicca.enricher.location.googleCredentialsPath=./boudicca.base/enricher-utils/data/googleCredentials.json
#boudicca.enricher.location.spreadsheetId=1yYOE5gRR6gjNBim7hwEe3__fXoRAMtREkYbs-lsn7uM
Loading

0 comments on commit 6e7a3ff

Please sign in to comment.