From d1a2f71ac5615de99d07a1734acb9e86d0d23c4a Mon Sep 17 00:00:00 2001 From: Adam Shannon Date: Wed, 2 Jul 2025 10:28:05 -0500 Subject: [PATCH] feat: add sortable keys for record linkage --- go.mod | 8 + go.sum | 15 ++ internal/linksim/helpers_test.go | 64 +++++++ internal/linksim/linksim.go | 305 +++++++++++++++++++++++++++++++ internal/linksim/linksim_test.go | 32 ++++ internal/linksim/sort.go | 223 ++++++++++++++++++++++ internal/linksim/sort_test.go | 35 ++++ 7 files changed, 682 insertions(+) create mode 100644 internal/linksim/helpers_test.go create mode 100644 internal/linksim/linksim.go create mode 100644 internal/linksim/linksim_test.go create mode 100644 internal/linksim/sort.go create mode 100644 internal/linksim/sort_test.go diff --git a/go.mod b/go.mod index 35e3a0f31..c33336339 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/adamdecaf/merge v0.1.1 github.com/antchfx/htmlquery v1.3.4 github.com/bbalet/stopwords v1.0.0 + github.com/dgryski/go-minhash v0.0.0-20190315135803-ad340ca03076 github.com/dongri/phonenumber v0.1.12 github.com/gorilla/mux v1.8.1 github.com/hashicorp/go-retryablehttp v0.7.8 @@ -19,6 +20,7 @@ require ( github.com/moov-io/iso3166 v0.3.0 github.com/openvenues/gopostal v0.0.0-20240426055609-4fe3a773f519 github.com/pariz/gountries v0.1.6 + github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72 github.com/stretchr/testify v1.11.1 github.com/urfave/cli/v3 v3.5.0 github.com/vmihailenco/msgpack/v5 v5.4.1 @@ -28,6 +30,7 @@ require ( go.uber.org/automaxprocs v1.6.0 golang.org/x/sync v0.17.0 golang.org/x/text v0.30.0 + gopkg.in/go-dedup/simhash.v1 v1.0.0-20170701025421-ab6ea107ab65 ) require ( @@ -55,6 +58,8 @@ require ( github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/cncf/xds/go v0.0.0-20250501225837-2ac532fd4443 // indirect github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect + github.com/dgryski/go-metro v0.0.0-20250106013310-edb8663e5e33 // indirect + github.com/dgryski/go-spooky v0.0.0-20170606183049-ed3d087f40e2 // indirect github.com/envoyproxy/go-control-plane/envoy v1.32.4 // indirect github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect @@ -64,6 +69,9 @@ require ( github.com/fyne-io/glfw-js v0.3.0 // indirect github.com/fyne-io/image v0.1.1 // indirect github.com/fyne-io/oksvg v0.2.0 // indirect + github.com/go-dedup/megophone v0.0.0-20170830025436-f01be21026f5 // indirect + github.com/go-dedup/simhash v0.0.0-20170904020510-9ecaca7b509c // indirect + github.com/go-dedup/text v0.0.0-20170907015346-8bb1b95e3cb7 // indirect github.com/go-gl/gl v0.0.0-20231021071112-07e5d0ea2e71 // indirect github.com/go-gl/glfw/v3.3/glfw v0.0.0-20240506104042-037f3cc74f2a // indirect github.com/go-jose/go-jose/v4 v4.1.2 // indirect diff --git a/go.sum b/go.sum index 7c5dd1c74..cada6b77d 100644 --- a/go.sum +++ b/go.sum @@ -710,6 +710,12 @@ github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSs github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/go-metro v0.0.0-20250106013310-edb8663e5e33 h1:ucRHb6/lvW/+mTEIGbvhcYU3S8+uSNkuMjx/qZFfhtM= +github.com/dgryski/go-metro v0.0.0-20250106013310-edb8663e5e33/go.mod h1:c9O8+fpSOX1DM8cPNSkX/qsBWdkD4yd2dpciOWQjpBw= +github.com/dgryski/go-minhash v0.0.0-20190315135803-ad340ca03076 h1:EB7M2v8Svo3kvIDy+P1YDE22XskDQP+TEYGzeDwPAN4= +github.com/dgryski/go-minhash v0.0.0-20190315135803-ad340ca03076/go.mod h1:VBi0XHpFy0xiMySf6YpVbRqrupW4RprJ5QTyN+XvGSM= +github.com/dgryski/go-spooky v0.0.0-20170606183049-ed3d087f40e2 h1:lx1ZQgST/imDhmLpYDma1O3Cx9L+4Ie4E8S2RjFPQ30= +github.com/dgryski/go-spooky v0.0.0-20170606183049-ed3d087f40e2/go.mod h1:hgHYKsoIw7S/hlWtP7wD1wZ7SX1jPTtKko5X9jrOgPQ= github.com/dhui/dktest v0.4.6 h1:+DPKyScKSEp3VLtbMDHcUq6V5Lm5zfZZVb0Sk7Ahom4= github.com/dhui/dktest v0.4.6/go.mod h1:JHTSYDtKkvFNFHJKqCzVzqXecyv+tKt8EzceOmQOgbU= github.com/distribution/reference v0.6.0 h1:0IXCQ5g4/QMHHkarYzh5l+u8T3t73zM5QvfrDyIgxBk= @@ -770,6 +776,12 @@ github.com/fyne-io/image v0.1.1/go.mod h1:xrfYBh6yspc+KjkgdZU/ifUC9sPA5Iv7WYUBzQ github.com/fyne-io/oksvg v0.2.0 h1:mxcGU2dx6nwjJsSA9PCYZDuoAcsZ/OuJlvg/Q9Njfo8= github.com/fyne-io/oksvg v0.2.0/go.mod h1:dJ9oEkPiWhnTFNCmRgEze+YNprJF7YRbpjgpWS4kzoI= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= +github.com/go-dedup/megophone v0.0.0-20170830025436-f01be21026f5 h1:4U+x+EB1P66zwYgTjxWXSOT8vF+651Ksr1lojiCZnT8= +github.com/go-dedup/megophone v0.0.0-20170830025436-f01be21026f5/go.mod h1:poR/Cp00iqtqu9ltFwl6C00sKC0HY13u/Gh05ZBmP54= +github.com/go-dedup/simhash v0.0.0-20170904020510-9ecaca7b509c h1:mucYYQn+sMGNSxidhleonzAdwL203RxhjJGnxQU4NWU= +github.com/go-dedup/simhash v0.0.0-20170904020510-9ecaca7b509c/go.mod h1:gO3u2bjRAgUaLdQd2XK+3oooxrheOAx1BzS7WmPzw1s= +github.com/go-dedup/text v0.0.0-20170907015346-8bb1b95e3cb7 h1:11wFcswN+37U+ByjxdKzsRY5KzNqqq5Uk5ztxnLOc7w= +github.com/go-dedup/text v0.0.0-20170907015346-8bb1b95e3cb7/go.mod h1:wSsK4VOECOSfSYTzkBFw+iGY7wj59e7X96ABtNj9aCQ= github.com/go-fonts/dejavu v0.1.0/go.mod h1:4Wt4I4OU2Nq9asgDCteaAaWZOV24E+0/Pwo0gppep4g= github.com/go-fonts/latin-modern v0.2.0/go.mod h1:rQVLdDMK+mK1xscDwsqM5J8U2jrRa3T0ecnM9pNujks= github.com/go-fonts/liberation v0.1.1/go.mod h1:K6qoJYypsmfVjWg8KOVDQhLc8UDgIK2HYqyqAO9z7GY= @@ -1114,6 +1126,7 @@ github.com/sagikazarmark/locafero v0.11.0 h1:1iurJgmM9G3PA/I+wWYIOw/5SyBtxapeHDc github.com/sagikazarmark/locafero v0.11.0/go.mod h1:nVIGvgyzw595SUSUE6tvCp3YYTeHs15MvlmU87WwIik= github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8 h1:+jumHNA0Wrelhe64i8F6HNlS8pkoyMv5sreGx2Ry5Rw= github.com/sourcegraph/conc v0.3.1-0.20240121214520-5f936abd7ae8/go.mod h1:3n1Cwaq1E1/1lhQhtRK2ts/ZwZEhjcQeJQ1RuC6Q/8U= +github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72 h1:qLC7fQah7D6K1B0ujays3HV9gkFtllcxhzImRR7ArPQ= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/afero v1.3.3/go.mod h1:5KUK8ByomD5Ti5Artl0RtHeI5pTF7MIDuXL3yY520V4= github.com/spf13/afero v1.6.0/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= @@ -1893,6 +1906,8 @@ gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= +gopkg.in/go-dedup/simhash.v1 v1.0.0-20170701025421-ab6ea107ab65 h1:TJ8gu/i0KT5Sc0rsphBYjf3yU5BKZcG5DPcP2syQqQ8= +gopkg.in/go-dedup/simhash.v1 v1.0.0-20170701025421-ab6ea107ab65/go.mod h1:BHrmqRyqVLqZ7iHTOShMqvNcbExK9Cfyyc+tVN547+4= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.7/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/internal/linksim/helpers_test.go b/internal/linksim/helpers_test.go new file mode 100644 index 000000000..5e8e1dd85 --- /dev/null +++ b/internal/linksim/helpers_test.go @@ -0,0 +1,64 @@ +package linksim + +import ( + "github.com/moov-io/watchman/pkg/search" +) + +var ( + john = (search.Entity[search.Value]{ + Name: "John Smith", + Type: search.EntityPerson, + Source: search.SourceUSOFAC, + Person: &search.Person{ + Name: "John Smith", + GovernmentIDs: []search.GovernmentID{ + { + Type: search.GovernmentIDPassport, + Country: "US", + Identifier: "1234567890", + }, + }, + }, + Contact: search.ContactInfo{ + EmailAddresses: []string{"john.smith123@example.com"}, + }, + Addresses: []search.Address{ + { + Line1: "541 First St", + City: "Anytown", + State: "CA", + PostalCode: "90210", + Country: "US", + }, + }, + }).Normalize() + + johnathon = (search.Entity[search.Value]{ + Name: "Johnathon Smith", + Type: search.EntityPerson, + Source: search.SourceUSOFAC, + Person: &search.Person{ + Name: "Johnathon Smith", + GovernmentIDs: []search.GovernmentID{ + { + Type: search.GovernmentIDPassport, + Country: "US", + Identifier: "1234567890", + }, + }, + }, + Contact: search.ContactInfo{ + EmailAddresses: []string{"johnathon.smith123@example.com"}, + }, + Addresses: []search.Address{ + { + Line1: "541 First St", + Line2: "Apt 301", + City: "Anytown", + State: "CA", + PostalCode: "90210", + Country: "US", + }, + }, + }).Normalize() +) diff --git a/internal/linksim/linksim.go b/internal/linksim/linksim.go new file mode 100644 index 000000000..820cba9eb --- /dev/null +++ b/internal/linksim/linksim.go @@ -0,0 +1,305 @@ +package linksim + +import ( + "crypto/sha256" + "encoding/binary" + "fmt" + "strconv" + "time" + + "github.com/moov-io/watchman/internal/prepare" + "github.com/moov-io/watchman/pkg/search" + + "github.com/dgryski/go-minhash" + "github.com/spaolacci/murmur3" + "gopkg.in/go-dedup/simhash.v1" +) + +// Hashes represents the output for database storage, mapping field paths to their hash values. +type Hashes map[string]interface{} + +// GenerateHashes computes hashes for each field in the Entity, suitable for database storage and sorting. +func GenerateHashes(e search.Entity[search.Value]) Hashes { + hashes := make(Hashes) + + // Use normalized fields from PreparedFields where available + hashes["Name"] = calculateSimhash(e.PreparedFields.Name) + if len(e.PreparedFields.AltNames) > 0 { + hashes["AltNames"] = calculateMinhash(e.PreparedFields.AltNames, 128) // 128 hash functions for minhash + } + hashes["EntityType"] = hashString(string(e.Type)) + hashes["Source"] = hashString(string(e.Source)) + hashes["SourceID"] = calculateSimhash(e.SourceID) + + // Person + if e.Person != nil { + hashes["Person.Name"] = calculateSimhash(e.Person.Name) + if len(e.Person.AltNames) > 0 { + hashes["Person.AltNames"] = calculateMinhash(normalizeNames(e.Person.AltNames), 128) + } + hashes["Person.Gender"] = hashString(string(e.Person.Gender)) + if e.Person.BirthDate != nil { + hashes["Person.BirthDate"] = hashDate(e.Person.BirthDate) + } + hashes["Person.PlaceOfBirth"] = calculateSimhash(e.Person.PlaceOfBirth) + if e.Person.DeathDate != nil { + hashes["Person.DeathDate"] = hashDate(e.Person.DeathDate) + } + if len(e.Person.Titles) > 0 { + hashes["Person.Titles"] = calculateMinhash(normalizeNames(e.Person.Titles), 128) + } + if len(e.Person.GovernmentIDs) > 0 { + for i, id := range e.Person.GovernmentIDs { + prefix := fmt.Sprintf("Person.GovernmentIDs[%d]", i) + hashes[prefix+".Type"] = hashString(string(id.Type)) + hashes[prefix+".Country"] = calculateSimhash(id.Country) + hashes[prefix+".Identifier"] = calculateSimhash(id.Identifier) + } + } + } + + // Business + if e.Business != nil { + hashes["Business.Name"] = calculateSimhash(e.Business.Name) + if len(e.Business.AltNames) > 0 { + hashes["Business.AltNames"] = calculateMinhash(normalizeNames(e.Business.AltNames), 128) + } + if e.Business.Created != nil { + hashes["Business.Created"] = hashDate(e.Business.Created) + } + if e.Business.Dissolved != nil { + hashes["Business.Dissolved"] = hashDate(e.Business.Dissolved) + } + if len(e.Business.GovernmentIDs) > 0 { + for i, id := range e.Business.GovernmentIDs { + prefix := fmt.Sprintf("Business.GovernmentIDs[%d]", i) + hashes[prefix+".Type"] = hashString(string(id.Type)) + hashes[prefix+".Country"] = calculateSimhash(id.Country) + hashes[prefix+".Identifier"] = calculateSimhash(id.Identifier) + } + } + } + + // Organization (similar to Business) + if e.Organization != nil { + hashes["Organization.Name"] = calculateSimhash(e.Organization.Name) + if len(e.Organization.AltNames) > 0 { + hashes["Organization.AltNames"] = calculateMinhash(normalizeNames(e.Organization.AltNames), 128) + } + if e.Organization.Created != nil { + hashes["Organization.Created"] = hashDate(e.Organization.Created) + } + if e.Organization.Dissolved != nil { + hashes["Organization.Dissolved"] = hashDate(e.Organization.Dissolved) + } + if len(e.Organization.GovernmentIDs) > 0 { + for i, id := range e.Organization.GovernmentIDs { + prefix := fmt.Sprintf("Organization.GovernmentIDs[%d]", i) + hashes[prefix+".Type"] = hashString(string(id.Type)) + hashes[prefix+".Country"] = calculateSimhash(id.Country) + hashes[prefix+".Identifier"] = calculateSimhash(id.Identifier) + } + } + } + + // Aircraft + if e.Aircraft != nil { + hashes["Aircraft.Name"] = calculateSimhash(e.Aircraft.Name) + if len(e.Aircraft.AltNames) > 0 { + hashes["Aircraft.AltNames"] = calculateMinhash(normalizeNames(e.Aircraft.AltNames), 128) + } + hashes["Aircraft.Type"] = hashString(string(e.Aircraft.Type)) + hashes["Aircraft.Flag"] = calculateSimhash(e.Aircraft.Flag) + if e.Aircraft.Built != nil { + hashes["Aircraft.Built"] = hashDate(e.Aircraft.Built) + } + hashes["Aircraft.ICAOCode"] = calculateSimhash(e.Aircraft.ICAOCode) + hashes["Aircraft.Model"] = calculateSimhash(e.Aircraft.Model) + hashes["Aircraft.SerialNumber"] = calculateSimhash(e.Aircraft.SerialNumber) + } + + // Vessel + if e.Vessel != nil { + hashes["Vessel.Name"] = calculateSimhash(e.Vessel.Name) + if len(e.Vessel.AltNames) > 0 { + hashes["Vessel.AltNames"] = calculateMinhash(normalizeNames(e.Vessel.AltNames), 128) + } + hashes["Vessel.IMONumber"] = calculateSimhash(e.Vessel.IMONumber) + hashes["Vessel.Type"] = hashString(string(e.Vessel.Type)) + hashes["Vessel.Flag"] = calculateSimhash(e.Vessel.Flag) + if e.Vessel.Built != nil { + hashes["Vessel.Built"] = hashDate(e.Vessel.Built) + } + hashes["Vessel.Model"] = calculateSimhash(e.Vessel.Model) + hashes["Vessel.MMSI"] = calculateSimhash(e.Vessel.MMSI) + hashes["Vessel.CallSign"] = calculateSimhash(e.Vessel.CallSign) + hashes["Vessel.Owner"] = calculateSimhash(e.Vessel.Owner) + // Numerical fields: simple hash of string representation + hashes["Vessel.Tonnage"] = hashString(fmt.Sprintf("%d", e.Vessel.Tonnage)) + hashes["Vessel.GrossRegisteredTonnage"] = hashString(fmt.Sprintf("%d", e.Vessel.GrossRegisteredTonnage)) + } + + // ContactInfo + if len(e.PreparedFields.Contact.EmailAddresses) > 0 { + hashes["Contact.EmailAddresses"] = calculateMinhash(normalizeNames(e.Contact.EmailAddresses), 128) + } + if len(e.PreparedFields.Contact.PhoneNumbers) > 0 { + hashes["Contact.PhoneNumbers"] = calculateMinhash(e.PreparedFields.Contact.PhoneNumbers, 128) + } + if len(e.PreparedFields.Contact.FaxNumbers) > 0 { + hashes["Contact.FaxNumbers"] = calculateMinhash(e.PreparedFields.Contact.FaxNumbers, 128) + } + if len(e.Contact.Websites) > 0 { + hashes["Contact.Websites"] = calculateMinhash(normalizeNames(e.Contact.Websites), 128) + } + + // Addresses + if len(e.PreparedFields.Addresses) > 0 { + for i, addr := range e.PreparedFields.Addresses { + prefix := fmt.Sprintf("Addresses[%d]", i) + hashes[prefix+".Line1"] = calculateSimhash(addr.Line1) + if len(addr.Line1Fields) > 0 { + hashes[prefix+".Line1Fields"] = calculateMinhash(addr.Line1Fields, 128) + } + hashes[prefix+".Line2"] = calculateSimhash(addr.Line2) + if len(addr.Line2Fields) > 0 { + hashes[prefix+".Line2Fields"] = calculateMinhash(addr.Line2Fields, 128) + } + hashes[prefix+".City"] = calculateSimhash(addr.City) + if len(addr.CityFields) > 0 { + hashes[prefix+".CityFields"] = calculateMinhash(addr.CityFields, 128) + } + hashes[prefix+".PostalCode"] = calculateSimhash(addr.PostalCode) + hashes["Addresses["+strconv.Itoa(i)+"].State"] = calculateSimhash(addr.State) + hashes[prefix+".Country"] = calculateSimhash(addr.Country) + } + } + + // CryptoAddresses + if len(e.CryptoAddresses) > 0 { + for i, crypto := range e.CryptoAddresses { + prefix := fmt.Sprintf("CryptoAddresses[%d]", i) + hashes[prefix+".Currency"] = hashString(crypto.Currency) + hashes[prefix+".Address"] = calculateSimhash(crypto.Address) + } + } + + // Affiliations + if len(e.Affiliations) > 0 { + for i, aff := range e.Affiliations { + prefix := fmt.Sprintf("Affiliations[%d]", i) + hashes[prefix+".EntityName"] = calculateSimhash(aff.EntityName) + hashes[prefix+".Type"] = hashString(aff.Type) + hashes[prefix+".Details"] = calculateSimhash(aff.Details) + } + } + + // SanctionsInfo + if e.SanctionsInfo != nil { + if len(e.SanctionsInfo.Programs) > 0 { + hashes["SanctionsInfo.Programs"] = calculateMinhash(normalizeNames(e.SanctionsInfo.Programs), 128) + } + hashes["SanctionsInfo.Secondary"] = hashString(fmt.Sprintf("%t", e.SanctionsInfo.Secondary)) + hashes["SanctionsInfo.Description"] = calculateSimhash(e.SanctionsInfo.Description) + } + + // HistoricalInfo + if len(e.HistoricalInfo) > 0 { + for i, hist := range e.HistoricalInfo { + prefix := fmt.Sprintf("HistoricalInfo[%d]", i) + hashes[prefix+".Type"] = hashString(hist.Type) + hashes[prefix+".Value"] = calculateSimhash(hist.Value) + hashes[prefix+".Date"] = hashDate(&hist.Date) + } + } + + return hashes +} + +// calculateSimhash computes a simhash for a string using the go-dedup/simhash library. +func calculateSimhash(s string) uint64 { + if s == "" { + return 0 + } + fs := simhash.NewWordFeatureSet([]byte(prepare.LowerAndRemovePunctuation(s))) + return simhash.Simhash(fs) +} + +// calculateMinhash computes a minhash signature for a list of strings. +func calculateMinhash(list []string, k int) []uint64 { + if len(list) == 0 { + return nil + } + // Create a 64-bit hash function using Murmur3 + hashFunc := func(data []byte) uint64 { + h := murmur3.New64() + h.Write(data) + return h.Sum64() + } + // Initialize BottomK with the hash function and k + mh := minhash.NewBottomK(hashFunc, k) + for _, item := range list { + if item != "" { + // Normalize the string before hashing + normalized := prepare.LowerAndRemovePunctuation(item) + mh.Push([]byte(normalized)) + } + } + return mh.Signature() +} + +// hashString computes a simple hash for categorical fields or simple values. +func hashString(s string) uint64 { + if s == "" { + return 0 + } + h := sha256.Sum256([]byte(s)) + return binary.BigEndian.Uint64(h[:8]) +} + +// hashDate computes a hash for a date by discretizing to year. +func hashDate(t *time.Time) uint64 { + if t == nil { + return 0 + } + return uint64(t.Year()) +} + +const ( + DefaultBuckets = 256 +) + +// GenerateBuckets maps hashes to buckets for blocking, using a simple modulo operation. +func (h Hashes) GenerateBuckets(numBuckets int) map[string][]int { + buckets := make(map[string][]int) + for field, hash := range h { + switch v := hash.(type) { + case uint64: + if v != 0 { + bucket := int(v % uint64(numBuckets)) + buckets[field] = []int{bucket} + } + case []uint64: + buckets[field] = make([]int, len(v)) + for i, val := range v { + buckets[field][i] = int(val % uint64(numBuckets)) + } + default: + fmt.Printf("unexpected %v - %v (%T)\n", field, hash, hash) + } + } + return buckets +} + +// normalizeNames normalizes a slice of strings by lowercasing and removing punctuation. +func normalizeNames(altNames []string) []string { + if len(altNames) == 0 { + return nil + } + + out := make([]string, len(altNames)) + for idx := range altNames { + out[idx] = prepare.LowerAndRemovePunctuation(altNames[idx]) + } + return out +} diff --git a/internal/linksim/linksim_test.go b/internal/linksim/linksim_test.go new file mode 100644 index 000000000..c8204c7fa --- /dev/null +++ b/internal/linksim/linksim_test.go @@ -0,0 +1,32 @@ +package linksim + +import ( + "fmt" + "testing" +) + +func TestGenerateHashes(t *testing.T) { + t.Run("John, Johnny, Johnathon", func(t *testing.T) { + johnHashes := GenerateHashes(john) + fmt.Println("john") + for key, value := range johnHashes { + fmt.Printf(" %v - %v\n", key, value) + } + fmt.Printf("buckets(8): %#v\n", johnHashes.GenerateBuckets(32)) + fmt.Printf("buckets(16): %#v\n", johnHashes.GenerateBuckets(32)) + fmt.Printf("buckets(32): %#v\n", johnHashes.GenerateBuckets(32)) + fmt.Printf("buckets(64): %#v\n", johnHashes.GenerateBuckets(32)) + + fmt.Printf("\n\n") + + johnathonHashes := GenerateHashes(johnathon) + fmt.Println("johnathon") + for key, value := range johnathonHashes { + fmt.Printf(" %v - %v\n", key, value) + } + fmt.Printf("buckets(8): %#v\n", johnathonHashes.GenerateBuckets(32)) + fmt.Printf("buckets(16): %#v\n", johnathonHashes.GenerateBuckets(32)) + fmt.Printf("buckets(32): %#v\n", johnathonHashes.GenerateBuckets(32)) + fmt.Printf("buckets(64): %#v\n", johnathonHashes.GenerateBuckets(32)) + }) +} diff --git a/internal/linksim/sort.go b/internal/linksim/sort.go new file mode 100644 index 000000000..6dd2f6cf6 --- /dev/null +++ b/internal/linksim/sort.go @@ -0,0 +1,223 @@ +package linksim + +import ( + "bytes" + "fmt" + "slices" + "strings" +) + +// SortKeys returns a slice of sortable keys from the given buckets. +// +// Sortable keys can be put into a database where the closest neighbors would be records most similar to the target. +func SortKeys(buckets map[string][]int) []string { + var out []string + + // Type key + out = makeKey(buckets, "EntityType", typeKey, out) + + // Source key + source, foundSource := buckets["Source"] + sourceID, foundSourceID := buckets["SourceID"] + if foundSource && foundSourceID { + out = append(out, makeSourceKey(source, sourceID)) + } + + // Name keys + out = makeKey(buckets, "Name", nameKey, out) + out = makeKey(buckets, "AltNames", nameKey, out) + + // GovernmentID keys + out = makeGovernmentIDKeys(buckets, out) + + // Address keys + out = makeAddressKeys(buckets, out) + + // Contact keys // TODO(adam): + // "Contact.EmailAddresses" + // "Contact.PhoneNumbers" + + return out +} + +func makeKey(buckets map[string][]int, key string, f func([]int) string, out []string) []string { + if v, found := buckets[key]; found { + return append(out, f(v)) + } + return out +} + +func typeKey(vs []int) string { + slices.Sort(vs) + + var buf bytes.Buffer + buf.WriteString("TYPE:") + for idx := range vs { + if idx > 1 { + buf.WriteString("|") + } + buf.WriteString(fmt.Sprintf("%4.4d", vs[idx])) + } + return buf.String() +} + +func makeSourceKey(source []int, sourceID []int) string { + slices.Sort(source) + slices.Sort(sourceID) + + var buf bytes.Buffer + buf.WriteString("SOURCE:") + for idx := range source { + if idx > 1 { + buf.WriteString("|") + } + buf.WriteString(fmt.Sprintf("L:%4.4d", source[idx])) + } + + for idx := range sourceID { + if idx > 1 { + buf.WriteString("|") + } + buf.WriteString(fmt.Sprintf("X%4.4d", sourceID[idx])) + } + + return buf.String() +} + +func nameKey(vs []int) string { + slices.Sort(vs) + + var buf bytes.Buffer + buf.WriteString("NAME:") + for idx := range vs { + if idx > 1 { + buf.WriteString("|") + } + buf.WriteString(fmt.Sprintf("%4.4d", vs[idx])) + } + return buf.String() +} + +func makeGovernmentIDKeys(buckets map[string][]int, out []string) []string { + // Look for Person and Business GovernmentIDs + out = readGovernmentIDs(buckets, "Person.GovernmentIDs[%d].%s", out) + out = readGovernmentIDs(buckets, "Business.GovernmentIDs[%d].%s", out) + + return out +} + +func readGovernmentIDs(buckets map[string][]int, pattern string, out []string) []string { + idx := 0 + for { + countryKey := fmt.Sprintf(pattern, idx, "Country") + if country, found := buckets[countryKey]; found { + // Look for Identifier and Type now + identifier, found := buckets[fmt.Sprintf(pattern, idx, "Identifier")] + if !found { + continue + } + tpe, found := buckets[fmt.Sprintf(pattern, idx, "Type")] + if !found { + continue + } + + out = append(out, makeGovernmentIDKey(country, tpe, identifier)) + + idx++ + } else { + break + } + } + return out +} + +func makeGovernmentIDKey(country, tpe, identifier []int) string { + slices.Sort(country) + slices.Sort(tpe) + slices.Sort(identifier) + + var buf bytes.Buffer + buf.WriteString("GOVID:") + + if len(country) == 0 || len(tpe) == 0 || len(identifier) == 0 { + return fmt.Sprintf("INVALID: %v / %v / %v", country, tpe, identifier) + } + + // only take the first + buf.WriteString(fmt.Sprintf("C%4.4d", country[0])) + buf.WriteString("|") + buf.WriteString(fmt.Sprintf("T%4.4d", tpe[0])) + buf.WriteString("|") + buf.WriteString(fmt.Sprintf("X%4.4d", identifier[0])) + + return buf.String() +} + +func makeAddressKeys(buckets map[string][]int, out []string) []string { + for i := 0; i < 20; i++ { + var buf bytes.Buffer + buf.WriteString("ADDR:") + + var fieldsCollected int + pattern := fmt.Sprintf("Addresses[%d].%%s", i) + + // Look for the fields from broadest to most precise (then extra) + // starting with Country. + if v, found := buckets[fmt.Sprintf(pattern, "Country")]; found && len(v) > 0 { + fieldsCollected++ + buf.WriteString(fmt.Sprintf("C%4.4d|", v[0])) + } + if v, found := buckets[fmt.Sprintf(pattern, "State")]; found && len(v) > 0 { + fieldsCollected++ + buf.WriteString(fmt.Sprintf("S%4.4d|", v[0])) + } + if v, found := buckets[fmt.Sprintf(pattern, "PostalCode")]; found && len(v) > 0 { + fieldsCollected++ + buf.WriteString(fmt.Sprintf("P%4.4d|", v[0])) + } + if v, found := buckets[fmt.Sprintf(pattern, "CityFields")]; found && len(v) > 0 { + fieldsCollected++ + buf.WriteString("Y") + + for idx := range v { + if idx > 0 { + buf.WriteString(",") + } + buf.WriteString(fmt.Sprintf("%4.4d", v[idx])) + } + } + buf.WriteString("|") + if v, found := buckets[fmt.Sprintf(pattern, "Line1Fields")]; found && len(v) > 0 { + fieldsCollected++ + buf.WriteString("L") + + for idx := range v { + if idx > 0 { + buf.WriteString(",") + } + buf.WriteString(fmt.Sprintf("%4.4d", v[idx])) + } + } + buf.WriteString("|") + if v, found := buckets[fmt.Sprintf(pattern, "Line2Fields")]; found && len(v) > 0 { + fieldsCollected++ + buf.WriteString("E") + + for idx := range v { + if idx > 0 { + buf.WriteString(",") + } + buf.WriteString(fmt.Sprintf("%4.4d", v[idx])) + } + } + + if fieldsCollected > 0 { + addr := strings.TrimSuffix(buf.String(), "|") + out = append(out, addr) + } else { + break // nothing found for this index, so no further indexes will have anything + } + } + + return out +} diff --git a/internal/linksim/sort_test.go b/internal/linksim/sort_test.go new file mode 100644 index 000000000..42bdfe326 --- /dev/null +++ b/internal/linksim/sort_test.go @@ -0,0 +1,35 @@ +package linksim + +import ( + "testing" + + "github.com/moov-io/watchman/pkg/search" + + "github.com/stretchr/testify/require" +) + +func TestSortKeys(t *testing.T) { + cases := []struct { + entity search.Entity[search.Value] + expected []string + }{ + { + entity: john, + expected: []string{ + "TYPE:0230", + "NAME:0190", + "GOVID:C0173|T0190|X0146", + "ADDR:C0143|S0021|P0007|Y0023|L0201,0028,0173", + }, + }, + } + for _, tc := range cases { + t.Run(tc.entity.Name, func(t *testing.T) { + hashes := GenerateHashes(tc.entity) + buckets := hashes.GenerateBuckets(DefaultBuckets) + keys := SortKeys(buckets) + + require.ElementsMatch(t, tc.expected, keys) + }) + } +}