-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #185 from boudicca-events/abl/musicbrainz
musicbrainz enricher
- Loading branch information
Showing
11 changed files
with
484 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
data/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
98 changes: 98 additions & 0 deletions
98
...a.base/enricher-utils/src/main/kotlin/base/boudicca/enricher_utils/MusicBrainzImporter.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
package base.boudicca.enricher_utils | ||
|
||
import org.json.JSONArray | ||
import org.json.JSONObject | ||
import java.io.* | ||
import java.util.zip.GZIPOutputStream | ||
import java.util.zip.ZipOutputStream | ||
import kotlin.streams.asSequence | ||
|
||
/**
 * Converts the raw MusicBrainz artist dump into the two files read by the enricher:
 *
 * - `artist_parsed.json.gz`: a gzipped JSON array of the artists that survive [getFilteredArtists]
 * - `artist.index.gz`: a gzipped binary index written by [writeIndex]
 *
 * The input file is expected to contain one JSON object per line. It is read completely
 * before any output file is opened, so a missing or broken input no longer leaves
 * truncated output files behind. All streams are closed via `use`, also on failure.
 */
fun main() {
    // Kotlin's useLines defaults to UTF-8; it is spelled out here because JSON is UTF-8 by convention.
    val artists = File("./boudicca.base/enricher-utils/data/artist").useLines(Charsets.UTF_8) { lines ->
        lines
            // .take(100)
            .map { mapArtist(it) }
            .toList()
    }

    val filteredArtists = getFilteredArtists(artists)

    // Explicit UTF-8 so the output does not depend on the platform default charset.
    OutputStreamWriter(
        GZIPOutputStream(BufferedOutputStream(FileOutputStream("./boudicca.base/enricher-utils/data/artist_parsed.json.gz", false))),
        Charsets.UTF_8,
    ).use { dataOut ->
        serialize(filteredArtists).write(dataOut)
    }

    GZIPOutputStream(BufferedOutputStream(FileOutputStream("./boudicca.base/enricher-utils/data/artist.index.gz", false))).use { indexOut ->
        writeIndex(indexOut, filteredArtists)
    }
}
|
||
/**
 * Writes the lookup index for [filteredArtists] to [indexOut].
 *
 * The index is the list of positions into [filteredArtists], ordered by the
 * lowercased artist name. Every position is emitted as four bytes, most
 * significant byte first. The stream is neither flushed nor closed here.
 */
fun writeIndex(indexOut: OutputStream, filteredArtists: List<Artist>) {
    // Lowercase once per artist instead of once per comparison.
    val lowercaseNames = filteredArtists.map { it.name.lowercase() }
    val positionsByName = filteredArtists.indices.sortedBy { lowercaseNames[it] }

    positionsByName.forEach { position ->
        // OutputStream.write(Int) only emits the lowest 8 bits of its argument.
        for (shift in intArrayOf(24, 16, 8, 0)) {
            indexOut.write(position shr shift)
        }
    }
}
|
||
/**
 * Reduces [artists] to the entries that are kept for matching.
 *
 * An artist is kept when its name has at least three characters, it is not
 * flagged as ended, and no other remaining artist shares the same name
 * (compared case-insensitively via `lowercase()`).
 */
private fun getFilteredArtists(artists: List<Artist>): List<Artist> {
    // Currently disabled extra filters: all aliases at least three characters long; genre must be set.
    val candidates = artists.filter { artist -> artist.name.length >= 3 && !artist.ended }

    // Name collisions are counted among the remaining candidates only.
    val occurrences = candidates.groupingBy { it.name.lowercase() }.eachCount()

    return candidates.filter { artist -> occurrences[artist.name.lowercase()] == 1 }
}
|
||
/**
 * Converts [artists] into a JSON array with one object per artist, in list order.
 *
 * Each object carries `name` and, if known, `genre`. Aliases are intentionally
 * not written (that output was already disabled); the alias array that used to
 * be built and then thrown away for every artist is no longer created.
 */
fun serialize(artists: List<Artist>): JSONArray {
    val array = JSONArray()
    for (artist in artists) {
        val artistObject = JSONObject()
        artistObject.put("name", artist.name)
        // JSONObject.put with a null value leaves the key out, so artists without a genre get no "genre" field.
        artistObject.put("genre", artist.genre)
        array.put(artistObject)
    }
    return array
}
|
||
/**
 * Parses one line of the artist dump, which must be a single JSON object, into an [Artist].
 *
 * Reads `name`, `genres` (reduced to one genre by [mapGenre]) and the `name` of
 * every entry in `aliases`. A missing `ended` key is treated as `false`.
 */
fun mapArtist(line: String): Artist {
    val json = JSONObject(line)

    val artistName = json.getString("name")
    val topGenre = mapGenre(json.getJSONArray("genres"))

    val aliasNames = mutableListOf<String>()
    for (alias in json.getJSONArray("aliases")) {
        aliasNames.add((alias as JSONObject).getString("name"))
    }

    val hasEnded = if (json.has("ended")) json.getBoolean("ended") else false

    return Artist(name = artistName, genre = topGenre, aliases = aliasNames, ended = hasEnded)
}
|
||
/**
 * Picks a single genre name from [jsonArray], whose entries are JSON objects
 * with a `name` string and a `count` integer.
 *
 * The genre with the highest count wins; ties are broken by the alphabetically
 * smallest name. Returns `null` when the array is empty.
 */
fun mapGenre(jsonArray: JSONArray): String? {
    val genreCounts = jsonArray.map { entry ->
        val genre = entry as JSONObject
        genre.getString("name") to genre.getInt("count")
    }

    // Highest count first, then name ascending; the minimum under this order is the winner.
    val byCountThenName = compareByDescending<Pair<String, Int>> { it.second }.thenBy { it.first }

    return genreCounts.minWithOrNull(byCountThenName)?.first
}
|
||
/**
 * An artist as read from the raw dump by the importer.
 *
 * @property name the artist's name as given in the dump
 * @property genre the single genre chosen for the artist, or `null` if the dump lists none
 * @property aliases the names of all aliases listed for the artist
 * @property ended `true` if the dump flags the artist as ended
 */
data class Artist(val name: String, val genre: String?, val aliases: List<String>, val ended: Boolean)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
13 changes: 0 additions & 13 deletions
13
...base/enricher/src/main/kotlin/base/boudicca/enricher/service/MusicBrainzArtistEnricher.kt
This file was deleted.
Oops, something went wrong.
7 changes: 7 additions & 0 deletions
7
boudicca.base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/Artist.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
package base.boudicca.enricher.service.musicbrainz | ||
|
||
/**
 * An artist known to the enricher.
 *
 * @property name the artist's name
 * @property genre the artist's genre, or `null` if unknown
 * @property lowercaseName the lowercased form of [name]; defaults to `name.lowercase()`
 *   so it is computed once per artist rather than on every comparison
 */
data class Artist(val name: String, val genre: String?, val lowercaseName: String = name.lowercase())
74 changes: 74 additions & 0 deletions
74
...base/enricher/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/ArtistMatcher.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
package base.boudicca.enricher.service.musicbrainz | ||
|
||
import java.nio.ByteBuffer | ||
|
||
/**
 * Finds known artists inside free text.
 *
 * @param artists all known artists
 * @param index for every rank in lowercase-name order, the position of that artist in
 *   [artists], stored as consecutive 4-byte integers and read with [ByteBuffer.getInt]
 *
 * A name counts as found when it appears in the text starting at the beginning of the
 * text or right after a character that is not a letter or digit, and ending at the end
 * of the text or right before a character that is not a letter or digit.
 *
 * The previous implementation ran one binary search per start position with a
 * "name is a prefix of the text" comparator. That comparator does not agree with the sort
 * order of the index, so the search could step past an existing match (for example over
 * "the who" when "the who a" sorts right after it) and it returned at most one artist per
 * start position. This implementation searches each word-boundary-delimited substring for
 * an exact match instead, so every contained artist is found.
 */
class ArtistMatcher(private val artists: List<Artist>, private val index: ByteBuffer) {

    /**
     * Returns every artist whose lowercase name occurs in [string] at word boundaries.
     *
     * Matching is case-insensitive. Each artist is returned at most once, in order of
     * first occurrence. Returns an empty list for an empty [string].
     */
    fun findArtists(string: String): List<Artist> {
        if (string.isEmpty()) {
            return emptyList()
        }

        val lowerString = string.lowercase()

        val foundArtists = mutableListOf<Artist>()
        for (start in lowerString.indices) {
            // A name may start at the very beginning or right after a non-alphanumeric character.
            if (start == 0 || !lowerString[start - 1].isLetterOrDigit()) {
                foundArtists.addAll(matchArtistsFrom(lowerString, start))
            }
        }

        // The same name can occur several times in the text; report each artist once.
        return foundArtists.distinct()
    }

    /**
     * Returns all artists whose name equals a substring of [string] that starts at
     * [stringIndex] and ends at a word boundary, shortest substring first.
     */
    private fun matchArtistsFrom(string: String, stringIndex: Int): List<Artist> {
        val matchedArtists = mutableListOf<Artist>()

        for (end in stringIndex + 1..string.length) {
            // A name may end at the end of the text or right before a non-alphanumeric character.
            if (end == string.length || !string[end].isLetterOrDigit()) {
                matchedArtists.addAll(findExact(string, stringIndex, end))
            }
        }

        return matchedArtists
    }

    /** Binary-searches the index for artists whose lowercase name equals `string[start, end)`. */
    private fun findExact(string: String, start: Int, end: Int): List<Artist> {
        var min = 0
        var max = artists.size - 1
        while (min <= max) {
            val next = (min + max) / 2
            val comparison = compare(string, start, end, next)
            when {
                comparison < 0 -> max = next - 1
                comparison > 0 -> min = next + 1
                else -> return equalRun(string, start, end, next)
            }
        }
        return emptyList()
    }

    /**
     * Collects the artist at rank [hit] together with any directly neighbouring ranks that
     * carry the same lowercase name, so duplicate names in the data are all reported.
     */
    private fun equalRun(string: String, start: Int, end: Int, hit: Int): List<Artist> {
        var first = hit
        while (first > 0 && compare(string, start, end, first - 1) == 0) {
            first--
        }
        var last = hit
        while (last < artists.size - 1 && compare(string, start, end, last + 1) == 0) {
            last++
        }
        return (first..last).map { artistAt(it) }
    }

    /**
     * Compares `string[start, end)` with the lowercase name of the artist at rank
     * [indexIndex], character by character and then by length. This is the same ordering
     * as [String.compareTo], so it is consistent with an index sorted by lowercase name.
     *
     * @return a negative value, zero or a positive value if the substring sorts before,
     *   equals or sorts after the artist name
     */
    private fun compare(string: String, start: Int, end: Int, indexIndex: Int): Int {
        val artistName = artistAt(indexIndex).lowercaseName
        val candidateLength = end - start

        for (i in 0 until minOf(candidateLength, artistName.length)) {
            val difference = string[start + i].compareTo(artistName[i])
            if (difference != 0) {
                return difference
            }
        }

        return candidateLength.compareTo(artistName.length)
    }

    /** Resolves rank [indexIndex] of the sorted index to the artist it points to. */
    private fun artistAt(indexIndex: Int): Artist = artists[index.getInt(indexIndex * 4)]
}
96 changes: 96 additions & 0 deletions
96
...r/src/main/kotlin/base/boudicca/enricher/service/musicbrainz/MusicBrainzArtistEnricher.kt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
package base.boudicca.enricher.service.musicbrainz | ||
|
||
import base.boudicca.SemanticKeys | ||
import base.boudicca.enricher.service.Enricher | ||
import base.boudicca.model.Event | ||
import base.boudicca.model.EventCategory | ||
import com.fasterxml.jackson.core.type.TypeReference | ||
import com.fasterxml.jackson.databind.ObjectMapper | ||
import com.fasterxml.jackson.module.kotlin.KotlinModule | ||
import org.slf4j.LoggerFactory | ||
import org.springframework.beans.factory.annotation.Autowired | ||
import org.springframework.beans.factory.annotation.Value | ||
import org.springframework.stereotype.Service | ||
import java.io.BufferedInputStream | ||
import java.io.File | ||
import java.io.FileInputStream | ||
import java.nio.ByteBuffer | ||
import java.util.zip.GZIPInputStream | ||
|
||
/**
 * Enricher that looks for known artist names in the names of music events and adds the
 * found artists (and, if not already present, a genre) to the event data.
 *
 * The enricher is only active when both a data path and an index path are configured;
 * otherwise [enrich] returns events unchanged. Both files are loaded once, while the
 * bean is constructed.
 *
 * @param musicBrainzDataPath path to the gzipped JSON artist list, blank to disable
 * @param musicBrainzIndexPath path to the gzipped binary name index, blank to disable
 * @throws IllegalArgumentException if a non-blank path does not point to a readable file
 */
@Service
class MusicBrainzArtistEnricher @Autowired constructor(
    @Value("\${boudicca.enricher.musicbrainz.data.path:}") musicBrainzDataPath: String?,
    @Value("\${boudicca.enricher.musicbrainz.index.path:}") musicBrainzIndexPath: String?,
) : Enricher {

    // Declaration order matters: createArtistMatcher uses LOG and objectMapper during initialisation.
    private val LOG = LoggerFactory.getLogger(this.javaClass)

    private val objectMapper = ObjectMapper().registerModule(KotlinModule.Builder().build())
    private val artistMatcher = createArtistMatcher(musicBrainzDataPath, musicBrainzIndexPath)

    /** Returns [e] enriched with artist data, or [e] itself if the enricher is disabled or nothing matched. */
    override fun enrich(e: Event): Event {
        if (artistMatcher == null) {
            return e
        }
        return doEnrich(e, artistMatcher)
    }

    /**
     * Enriches [e] if its category is MUSIC and at least one artist is found in its name.
     * Artists whose name is contained (ignoring case) in the longer name of another found
     * artist are dropped, so only the most specific matches are kept.
     */
    private fun doEnrich(e: Event, artistMatcher: ArtistMatcher): Event {
        if (e.data[SemanticKeys.CATEGORY] != EventCategory.MUSIC.name) {
            return e
        }
        val foundArtists = artistMatcher.findArtists(e.name)
        if (foundArtists.isNotEmpty()) {
            val nonSubstringArtists = foundArtists.filter { artist ->
                foundArtists.none { it.name.length != artist.name.length && it.name.contains(artist.name, true) }
            }
            return insertArtistData(e, nonSubstringArtists)
        }
        return e
    }

    /**
     * Returns a copy of [e] whose band list is set to the names of [artists], one per line.
     * The genre of the first artist that has one is added only if the event has no genre yet.
     */
    private fun insertArtistData(e: Event, artists: List<Artist>): Event {
        val enrichedData = e.data.toMutableMap()
        enrichedData[SemanticKeys.CONCERT_BANDLIST] = artists.joinToString("\n") { it.name }
        val genre = artists.firstNotNullOfOrNull { it.genre }
        if (genre != null && !enrichedData.containsKey(SemanticKeys.CONCERT_GENRE)) {
            enrichedData[SemanticKeys.CONCERT_GENRE] = genre
        }
        return Event(e.name, e.startDate, enrichedData)
    }

    /** Builds the matcher from both files, or returns `null` if either path is not configured. */
    private fun createArtistMatcher(musicBrainzDataPath: String?, musicBrainzIndexPath: String?): ArtistMatcher? {
        val data = loadData(musicBrainzDataPath)
        val index = loadIndex(musicBrainzIndexPath)
        if (data != null && index != null) {
            return ArtistMatcher(data, index)
        }
        return null
    }

    /**
     * Reads the gzipped JSON artist list.
     *
     * @return the artists, or `null` if [musicBrainzDataPath] is null or blank
     * @throws IllegalArgumentException if the path does not point to a readable file
     */
    private fun loadData(musicBrainzDataPath: String?): List<Artist>? {
        if (musicBrainzDataPath.isNullOrBlank()) {
            LOG.debug("no musicBrainzDataPath given, disabling enricher")
            return null
        }
        val file = File(musicBrainzDataPath)
        if (!file.exists() || !file.isFile || !file.canRead()) {
            throw IllegalArgumentException("musicbrainz data path $musicBrainzDataPath is not a readable file!")
        }
        return readGzipped(file) { input ->
            objectMapper.readValue(input, object : TypeReference<List<Artist>?>() {})
        }
    }

    /**
     * Reads the gzipped binary index completely into memory.
     *
     * @return the uncompressed index, or `null` if [musicBrainzIndexPath] is null or blank
     * @throws IllegalArgumentException if the path does not point to a readable file
     */
    private fun loadIndex(musicBrainzIndexPath: String?): ByteBuffer? {
        if (musicBrainzIndexPath.isNullOrBlank()) {
            LOG.debug("no musicBrainzIndexPath given, disabling enricher")
            return null
        }
        val file = File(musicBrainzIndexPath)
        if (!file.exists() || !file.isFile || !file.canRead()) {
            throw IllegalArgumentException("musicbrainz index path $musicBrainzIndexPath is not a readable file!")
        }
        return readGzipped(file) { input -> ByteBuffer.wrap(input.readBytes()) }
    }

    /**
     * Opens [file] as a gzip stream, passes it to [block] and closes everything afterwards.
     * The file stream is closed even if the gzip header cannot be read or [block] throws.
     * Buffering sits below the gzip layer so the decompressor reads from disk in large chunks.
     */
    private fun <T> readGzipped(file: File, block: (GZIPInputStream) -> T): T =
        FileInputStream(file).use { raw ->
            GZIPInputStream(BufferedInputStream(raw)).use(block)
        }

}
10 changes: 10 additions & 0 deletions
10
boudicca.base/enricher/src/main/resources/application.properties
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,11 @@ | ||
server.port=8085 | ||
|
||
#for local testing, enable if needed | ||
|
||
#musicbrainz enricher | ||
#boudicca.enricher.musicbrainz.data.path=./boudicca.base/enricher-utils/data/artist_parsed.json.gz | ||
#boudicca.enricher.musicbrainz.index.path=./boudicca.base/enricher-utils/data/artist.index.gz | ||
|
||
#location enricher | ||
#boudicca.enricher.location.googleCredentialsPath=./boudicca.base/enricher-utils/data/googleCredentials.json | ||
#boudicca.enricher.location.spreadsheetId=1yYOE5gRR6gjNBim7hwEe3__fXoRAMtREkYbs-lsn7uM |
Oops, something went wrong.