bakwc · bakwc · Apr 13, 2018 · Apr 13, 2018 · Apr 13, 2018
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,4 @@
 *.tar.gz
 *.spell
 *.cache
+.DS_Store
diff --git a/jamspell/lang_model.cpp b/jamspell/lang_model.cpp
@@ -260,6 +260,7 @@ bool TLangModel::Load(const std::string& modelFileName) {
     for (auto&& it: WordToId) {
         IdToWord[it.second] = &it.first;
     }
+    BaseModelLastWordID = LastWordID;
     return true;
 }
 
@@ -320,14 +321,52 @@ TWord TLangModel::GetWordById(TWordId wid) const {
     return TWord(*IdToWord[wid]);
 }
 
+// Returns a reference to the wide string stored for word id `wid`.
+// `wid` must be a valid index into IdToWord; this is only verified by assert.
+const std::wstring& TLangModel::GetWstrById(TWordId wid) const {
+    assert(wid < IdToWord.size());
+    const std::wstring& stored = *IdToWord[wid];
+    return stored;
+}
+
 // Returns the count for the single word id `wid`, as reported by
 // GetGram1HashCount.
 TCount TLangModel::GetWordCount(TWordId wid) const {
     const TCount unigramCount = GetGram1HashCount(wid);
     return unigramCount;
 }
 
+// Exposes the current value of the LastWordID member.
+TWordId TLangModel::GetLastWordID() const {
+    return this->LastWordID;
+}
+
 uint64_t TLangModel::GetCheckSum() const {
     return CheckSum;
 }
 
+// Adds `value` to the runtime counter selected by the 32-bit CityHash of the
+// serialized form of `key` (as produced by DumpKey). The counter map is keyed
+// by the hash itself, so keys with equal hashes share one counter.
+template <typename T>
+inline void IncCount(TRuntimeModelCounts& counts, const T& key, TCount value) {
+    const std::string serialized = DumpKey(key);
+    const uint32_t slot = CityHash32(&serialized[0], serialized.size());
+    counts[slot] += value;
+}
+
+// Feeds a piece of text into the runtime n-gram counters.
+//
+// The text is copied, lower-cased, split into sentences by the tokenizer and
+// converted to word ids. For every sentence, each unigram, bigram and trigram
+// has its RuntimeModelCounts entry increased by `count`.
+//
+// text  - fragment to learn from; the caller's string is not modified.
+// count - weight added to every n-gram occurrence found in the fragment.
+void TLangModel::AddTextFragment(const std::wstring& text, uint32_t count) {
+    std::wstring trainText = text;
+    ToLower(trainText);
+    TSentences sentences = Tokenizer.Process(trainText);
+    TIdSentences sentenceIds = ConvertToIds(sentences);
+
+    for (const TWordIds& words: sentenceIds) {
+        const size_t wordCount = words.size();
+
+        for (auto w: words) {
+            IncCount(RuntimeModelCounts, w, count);
+        }
+        // The bounds are written as `j + k < wordCount` so that sentences
+        // shorter than the n-gram order are skipped without relying on a
+        // signed size type (ssize_t is POSIX-only, not standard C++).
+        for (size_t j = 0; j + 1 < wordCount; ++j) {
+            TGram2Key key(words[j], words[j+1]);
+            IncCount(RuntimeModelCounts, key, count);
+        }
+        for (size_t j = 0; j + 2 < wordCount; ++j) {
+            TGram3Key key(words[j], words[j+1], words[j+2]);
+            IncCount(RuntimeModelCounts, key, count);
+        }
+    }
+}
+
 TWord TLangModel::GetWord(const std::wstring& word) const {
     auto it = WordToId.find(word);
     if (it != WordToId.end()) {
@@ -372,10 +411,18 @@ double TLangModel::GetGram3Prob(TWordId word1, TWordId word2, TWordId word3) con
     return countsGram3 / countsGram2;
 }
 
+// Selects which count sources GetGramHashCount consults for a key.
+// Enumerators keep their implicit values 0, 1, 2 in declaration order.
+enum ECheckPolicy {
+    CP_Both,     // base-model bucket and runtime counts, summed
+    CP_Base,     // base-model bucket only
+    CP_Runtime,  // runtime counts only
+};
+
 template<typename T>
 TCount GetGramHashCount(T key,
                         const TPerfectHash& ph,
-                        const std::vector<std::pair<uint16_t, uint16_t>>& buckets)
+                        const TRuntimeModelCounts& runtimeModelCounts,
+                        const std::vector<std::pair<uint16_t, uint16_t>>& buckets,
+                        ECheckPolicy checkPolicy)
 {
     constexpr int TMP_BUF_SIZE = 128;
     static char tmpBuff[TMP_BUF_SIZE];
@@ -392,8 +439,20 @@ TCount GetGramHashCount(T key,
     const std::pair<uint16_t, uint16_t>& data = buckets[bucket];
 
     TCount res = TCount();
-    if (data.first == CityHash16(tmpBuff, tmpBuffStream.Size())) {
-        res = UnpackInt32(data.second);
+    uint32_t cityHash32 = CityHash32(tmpBuff, tmpBuffStream.Size());
+    uint16_t cityHash16 = cityHash32 % std::numeric_limits<uint16_t>::max();
+    if (checkPolicy == CP_Base || checkPolicy == CP_Both) {
+        if (data.first == cityHash16) {
+            res += UnpackInt32(data.second);
+        }
+    }
+
+    if (checkPolicy == CP_Runtime || checkPolicy == CP_Both) {
+        auto it = runtimeModelCounts.find(cityHash32);
+        assert(checkPolicy != CP_Runtime || it != runtimeModelCounts.end());
+        if (it != runtimeModelCounts.end()) {
+            res += it->second;
+        }
     }
     return res;
 }
@@ -403,23 +462,24 @@ TCount TLangModel::GetGram1HashCount(TWordId word) const {
         return TCount();
     }
     TGram1Key key = word;
-    return GetGramHashCount(key, PerfectHash, Buckets);
+    ECheckPolicy policy = key >= BaseModelLastWordID ? CP_Runtime : CP_Base;
+    return GetGramHashCount(key, PerfectHash, RuntimeModelCounts, Buckets, policy);
 }
 
 // Returns the count for the bigram (word1, word2).
 // Yields a default-constructed TCount when either id is UnknownWordId.
 // Otherwise the pair is looked up with CP_Both, i.e. both the base-model
 // bucket and the runtime counts are consulted.
 TCount TLangModel::GetGram2HashCount(TWordId word1, TWordId word2) const {
     if (word1 == UnknownWordId || word2 == UnknownWordId) {
         return TCount();
     }
     TGram2Key key({word1, word2});
-    return GetGramHashCount(key, PerfectHash, Buckets);
+    return GetGramHashCount(key, PerfectHash, RuntimeModelCounts, Buckets, CP_Both);
 }
 
 // Returns the count for the trigram (word1, word2, word3).
 // Yields a default-constructed TCount when any id is UnknownWordId.
 // Otherwise the triple is looked up with CP_Both, i.e. both the base-model
 // bucket and the runtime counts are consulted.
 TCount TLangModel::GetGram3HashCount(TWordId word1, TWordId word2, TWordId word3) const {
     if (word1 == UnknownWordId || word2 == UnknownWordId || word3 == UnknownWordId) {
         return TCount();
     }
     TGram3Key key(word1, word2, word3);
-    return GetGramHashCount(key, PerfectHash, Buckets);
+    return GetGramHashCount(key, PerfectHash, RuntimeModelCounts, Buckets, CP_Both);
 }
 
 } // NJamSpell
diff --git a/jamspell/lang_model.hpp b/jamspell/lang_model.hpp
@@ -55,6 +55,8 @@ class TRobinHash: public tsl::robin_map<std::wstring, TWordId> {
     }
 };
 
+using TRuntimeModelCounts = tsl::robin_map<uint32_t, TCount>;
+
 class TLangModel {
 public:
     bool Train(const std::string& fileName, const std::string& alphabetFile);
@@ -73,10 +75,15 @@ class TLangModel {
     TWordId GetWordId(const TWord& word);
     TWordId GetWordIdNoCreate(const TWord& word) const;
     TWord GetWordById(TWordId wid) const;
+    const std::wstring& GetWstrById(TWordId wid) const;
     TCount GetWordCount(TWordId wid) const;
 
+    TWordId GetLastWordID() const;
+
     uint64_t GetCheckSum() const;
 
+    void AddTextFragment(const std::wstring& text, uint32_t count = 1);
+
     HANDYPACK(WordToId, LastWordID, TotalWords, VocabSize,
               PerfectHash, Buckets, Tokenizer, CheckSum)
 private:
@@ -102,6 +109,9 @@ class TLangModel {
     std::vector<std::pair<uint16_t, uint16_t>> Buckets;
     TPerfectHash PerfectHash;
     uint64_t CheckSum;
+
+    TWordId BaseModelLastWordID = 0;
+    TRuntimeModelCounts RuntimeModelCounts;
 };
 
 

diff --git a/jamspell/spell_corrector.cpp b/jamspell/spell_corrector.cpp
@@ -254,6 +254,22 @@ void TSpellCorrector::SetMaxCandiatesToCheck(size_t maxCandidatesToCheck) {
     MaxCandiatesToCheck = maxCandidatesToCheck;
 }
 
+// Teaches the corrector new text at runtime.
+//
+// First updates the language model with `text` (each n-gram weighted by
+// `count`). Then, for every word id in the range between the model's
+// GetLastWordID() value before and after that update, inserts the word's
+// deletion variants (from GetDeletes2) into Deletes1 and Deletes2.
+//
+// text  - fragment to learn from.
+// count - weight forwarded to the language model for each n-gram.
+void TSpellCorrector::AddTextFragment(const std::wstring& text, uint32_t count) {
+    const TWordId firstNewId = LangModel.GetLastWordID();
+    LangModel.AddTextFragment(text, count);
+    const TWordId endNewId = LangModel.GetLastWordID();
+
+    for (TWordId wid = firstNewId; wid < endNewId; ++wid) {
+        const std::wstring& word = LangModel.GetWstrById(wid);
+        const auto deleteGroups = GetDeletes2(word);
+        for (const auto& group: deleteGroups) {
+            // An empty group has no back() element and would make
+            // `size() - 1` wrap around, so skip it explicitly.
+            if (group.empty()) {
+                continue;
+            }
+            // The last entry of a group goes to Deletes1; every entry
+            // before it goes to Deletes2.
+            Deletes1->Insert(WideToUTF8(group.back()));
+            const size_t lastIdx = group.size() - 1;
+            for (size_t i = 0; i < lastIdx; ++i) {
+                Deletes2->Insert(WideToUTF8(group[i]));
+            }
+        }
+    }
+}
+
 template<typename T>
 inline void AddVec(T& target, const T& source) {
     target.insert(target.end(), source.begin(), source.end());

diff --git a/jamspell/spell_corrector.hpp b/jamspell/spell_corrector.hpp
@@ -18,6 +18,7 @@ class TSpellCorrector {
     std::wstring FixFragmentNormalized(const std::wstring& text) const;
     void SetPenalty(double knownWordsPenaly, double unknownWordsPenalty);
     void SetMaxCandiatesToCheck(size_t maxCandidatesToCheck);
+    void AddTextFragment(const std::wstring& text, uint32_t count = 1);
 private:
     void FilterCandidatesByFrequency(std::unordered_set<NJamSpell::TWord, NJamSpell::TWordHashPtr>& uniqueCandidates, NJamSpell::TWord origWord) const;
     NJamSpell::TWords Edits(const NJamSpell::TWord& word) const;

diff --git a/jamspell/utils.cpp b/jamspell/utils.cpp
@@ -152,13 +152,17 @@ wchar_t MakeUpperIfRequired(wchar_t orig, wchar_t sample) {
 }
 
 // Reduces the 32-bit CityHash of `str` to 16 bits.
 // The reduction is a modulo by numeric_limits<uint16_t>::max() (65535), so
 // the result lies in [0, 65534]. The leading `::` selects the
 // global-namespace CityHash32.
 uint16_t CityHash16(const std::string& str) {
-    uint32_t hash = CityHash32(&str[0], str.size());
+    uint32_t hash = ::CityHash32(&str[0], str.size());
     return hash % std::numeric_limits<uint16_t>::max();
 }
 
 // Reduces the 32-bit CityHash of the `size` bytes at `str` to 16 bits.
 // The reduction is a modulo by numeric_limits<uint16_t>::max() (65535), so
 // the result lies in [0, 65534]. The leading `::` selects the
 // global-namespace CityHash32.
 uint16_t CityHash16(const char* str, size_t size) {
-    uint32_t hash = CityHash32(str, size);
+    uint32_t hash = ::CityHash32(str, size);
     return hash % std::numeric_limits<uint16_t>::max();
 }
 
+// Forwards to the global-namespace CityHash32 for the `size` bytes at `str`.
+// The explicit `::` keeps the call from resolving to this function itself.
+uint32_t CityHash32(const char *str, size_t size) {
+    const uint32_t digest = ::CityHash32(str, size);
+    return digest;
+}
+
 } // NJamSpell
diff --git a/jamspell/utils.hpp b/jamspell/utils.hpp
@@ -62,5 +62,6 @@ void ToLower(std::wstring& text);
 wchar_t MakeUpperIfRequired(wchar_t orig, wchar_t sample);
 uint16_t CityHash16(const std::string& str);
 uint16_t CityHash16(const char* str, size_t size);
+uint32_t CityHash32(const char* str, size_t size);
 
 } // NJamSpell
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,3 +10,4 @@ @@
     *.tar.gz
     *.spell
     *.cache
+    .DS_Store