rwth-i6 · larissakl · Mar 5, 2025 · Apr 4, 2025 · Apr 4, 2025 · Apr 8, 2025
diff --git a/src/Bliss/Lexicon.cc b/src/Bliss/Lexicon.cc
@@ -138,26 +138,28 @@ Lexicon::~Lexicon() {
 
+/**
+ * Load the lexicon from @c filename.
+ * The MD5 sum of the file is stored as the dependency value (a warning is
+ * issued if it cannot be derived). The file is then read through the format
+ * set returned by formats(); a failed read is reported as an error.
+ */
 void Lexicon::load(const std::string& filename) {
     Core::MD5 md5;
-    if (md5.updateFromFile(filename))
+    if (md5.updateFromFile(filename)) {
         dependency_.setValue(md5);
-    else
+    }
+    else {
         warning("could not derive md5 sum from file '%s'", filename.c_str());
-    LexiconParser parser(config, this);
+    }
+
     log("reading lexicon from file") << " \"" << filename << "\" ...";
-    if (parser.parseFile(filename.c_str()) != 0)
+    // the format set selects the reader (XML or vocabulary text) for this file
+    if (!formats().read(filename, *this)) {
         error("Error while reading lexicon file.");
+    }
     log("dependency value: ") << dependency_.value();
 }
 
+/**
+ * Create a lexicon from the file named by paramFilename(c).
+ * Returns an empty LexiconRef if the lexicon has fatal errors after loading;
+ * otherwise the statistics are logged and the loaded lexicon is returned.
+ */
 LexiconRef Lexicon::create(const Configuration& c) {
-    Lexicon* result = new Lexicon(c);
+    // wrapped via Core::ref before load() is called, so the error path below
+    // no longer needs an explicit delete
+    auto result = Core::ref(new Lexicon(c));
     result->load(paramFilename(c));
     if (result->hasFatalErrors()) {
-        delete result;
         return LexiconRef();
     }
     result->logStatistics();
-    return LexiconRef(result);
+    return result;
 }
 
 Lemma* Lexicon::newLemma() {
@@ -847,3 +849,35 @@ Core::Ref<LemmaToEvaluationTokenTransducer> Lexicon::createLemmaToEvaluationToke
+// Convenience wrapper: forwards to createLemmaToEvaluationTokenTransducer(false).
 Core::Ref<LemmaToEvaluationTokenTransducer> Lexicon::createLemmaToPreferredEvaluationTokenSequenceTransducer() const {
     return createLemmaToEvaluationTokenTransducer(false);
 }
+
+/** Name helper specialization that reports the type name "Lexicon". */
+template<>
+class Core::NameHelper<Lexicon> {
+public:
+    // type name as C string
+    const char* c_str() const {
+        return "Lexicon";
+    }
+    // type name as std::string, built from the same literal
+    operator std::string() const {
+        return std::string(c_str());
+    }
+};
+
+/** Name helper specialization that reports the type name "Lexicon*". */
+template<>
+class Core::NameHelper<Lexicon*> {
+public:
+    // type name as C string
+    const char* c_str() const {
+        return "Lexicon*";
+    }
+    // type name as std::string, built from the same literal
+    operator std::string() const {
+        return std::string(c_str());
+    }
+};
+
+/**
+ * Returns the set of supported lexicon file formats, creating it on first use.
+ * Registered names: "xml" (XmlLexiconFormat) and "vocab-text" / "vocab-txt"
+ * (each with its own VocabTextLexiconFormat instance).
+ */
+Core::FormatSet& Lexicon::formats() {
+    if (formats_) {
+        return *formats_;
+    }
+
+    Core::Configuration formatConfig(Core::Application::us()->getConfiguration(), "lexicon-file-format-set");
+    formats_ = std::make_unique<Core::FormatSet>(formatConfig);
+    // NOTE(review): the trailing `true` presumably marks "xml" as the default format -- confirm against Core::FormatSet
+    formats_->registerFormat("xml", new XmlLexiconFormat(), true);
+    formats_->registerFormat("vocab-text", new VocabTextLexiconFormat());
+    formats_->registerFormat("vocab-txt", new VocabTextLexiconFormat());
+    return *formats_;
+}
diff --git a/src/Bliss/Lexicon.hh b/src/Bliss/Lexicon.hh
@@ -25,10 +25,12 @@
 #include <Core/Component.hh>
 #include <Core/Dependency.hh>
 #include <Core/Extensions.hh>
+#include <Core/FormatSet.hh>
 #include <Core/Obstack.hh>
 #include <Core/Parameter.hh>
 #include <Core/ReferenceCounting.hh>
 #include <Core/StringUtilities.hh>
+#include <Core/Types.hh>
 #include "Phoneme.hh"
 #include "Symbol.hh"
 
@@ -477,7 +479,7 @@ class LemmaToEvaluationTokenTransducer;
  *
  * A lemma may be assigned a symbolic name, which the system can
  * use to identify lemmas which have a special meaning to it.
- * E.g. the silence word is is identified by the symbolic name
+ * E.g. the silence word is identified by the symbolic name
-  * "silence".  Such lemmas a called "special lemmas".
+  * "silence".  Such lemmas are called "special lemmas".
  */
 
@@ -607,7 +609,7 @@ public:
     void defineSpecialLemma(const std::string& name, Lemma* lemma);
 
     /**
-     * Load lexicon from XML file.
+     * Load lexicon from XML or txt file.
      */
     void load(const std::string& filename);
 
@@ -883,6 +885,11 @@ public:
      * evaluation token sequences, the first is used.
      */
     Core::Ref<LemmaToEvaluationTokenTransducer> createLemmaToPreferredEvaluationTokenSequenceTransducer() const;
+
+private:
+    std::unique_ptr<Core::FormatSet> formats_;
+
+    Core::FormatSet& formats();
 };
 
 }  // namespace Bliss

diff --git a/src/Bliss/LexiconParser.cc b/src/Bliss/LexiconParser.cc
@@ -257,7 +257,7 @@ void LexiconElement::addPhon(const WeightedPhonemeString& phon) {
         return;
     if (!product_->phonemeInventory()) {
         parser()->warning(
-                "No phoneme inventory defined. Ingnoring pronunciation");
+                "No phoneme inventory defined. Ignoring pronunciation");
         return;
     }
 
@@ -358,7 +358,7 @@ const Core::ParameterString paramEncoding(
         "utf-8");
 }  // namespace
 
-void LexiconParser::loadWhitelist(const Core::Configuration& config, Core::StringHashSet& whitelist) {
+void XmlLexiconParser::loadWhitelist(const Core::Configuration& config, Core::StringHashSet& whitelist) {
     std::string filename = paramFile(config);
     if (!filename.empty()) {
         Core::CompressedInputStream* cis = new Core::CompressedInputStream(filename.c_str());
@@ -379,12 +379,88 @@ void LexiconParser::loadWhitelist(const Core::Configuration& config, Core::Strin
     }
 }
 
-LexiconParser::LexiconParser(const Core::Configuration& c, Lexicon* _lexicon)
-        : Precursor(c) {
+// Sets up an XML parser that works on `_lexicon`: the schema root is a
+// LexiconElement whose whitelist is loaded from the "vocab" sub-configuration.
+XmlLexiconParser::XmlLexiconParser(const Core::Configuration& c, Lexicon* _lexicon)
+        : LexiconParser(),
+          XmlSchemaParser(c) {
     lexicon_ = _lexicon;
 
     // build schema
     LexiconElement* lexElement = new LexiconElement(this, LexiconElement::creationHandler(&Self::pseudoCreateLexicon), c);
     loadWhitelist(select("vocab"), lexElement->whitelist_);
     setRoot(collect(lexElement));
 }
+
+// Delegates to the Core::XmlSchemaParser implementation of parseFile and
+// reports success when that call yields 0.
+bool XmlLexiconParser::parseFile(const std::string& filename) {
+    const auto status = parser()->Core::XmlSchemaParser::parseFile(filename.c_str());
+    return status == 0;
+}
+
+// Binds the parser to the lexicon it fills and starts with a freshly
+// constructed phoneme inventory.
+VocabTextLexiconParser::VocabTextLexiconParser(Lexicon* _lexicon)
+        : LexiconParser(),
+          lexicon_(_lexicon),
+          phonemeInventory_(Core::Ref(new PhonemeInventory())) {}
+
+// parse txt file line by line to a Bliss::Lexicon
+// in the first step, the phonemes are created and the phoneme inventory is set
+// and afterwards the lemmata can be created from these phonemes
+// returns false if the file cannot be opened or if reading it fails part-way
+bool VocabTextLexiconParser::parseFile(const std::string& filename) {
+    std::ifstream file(filename);
+    if (!file.is_open()) {
+        return false;
+    }
+
+    // collect all labels from the file and add them as phonemes to the phoneme inventory
+    std::string line;
+    while (std::getline(file, line)) {
+        if (line.empty()) {
+            continue;
+        }
+        createPhoneme(line);
+    }
+    // getline() also stops on an irrecoverable I/O error (badbit); do not
+    // build a lexicon from a silently truncated vocabulary in that case
+    if (file.bad()) {
+        return false;
+    }
+
+    // set the phoneme inventory
+    lexicon_->setPhonemeInventory(phonemeInventory_);
+    // iterate over the phonemes in the inventory to create the lemmata in the lexicon
+    createLemmata();
+    return true;
+}
+
+// helper function to handle one label and create a corresponding phoneme
+// a line that is blank after stripping is ignored; a label that is already in
+// the inventory is reported as an error and is not added a second time
+void VocabTextLexiconParser::createPhoneme(const std::string& line) {
+    std::string symbol(line);
+    stripWhitespace(symbol);  // in case there are any unintentional whitespaces
+    suppressTrailingBlank(symbol);
+
+    // a whitespace-only line carries no label (the caller only filters truly empty lines)
+    if (symbol.empty()) {
+        return;
+    }
+
+    // check if phoneme was already added (if one label appears more than once)
+    if (phonemeInventory_->phoneme(symbol)) {
+        Core::Application::us()->error("Phoneme \"%s\" was already added to the inventory. It may be duplicated in the lexicon.", symbol.c_str());
+        // do not assign the same symbol to a second phoneme
+        return;
+    }
+
+    // create a new phoneme
+    Phoneme* newPhoneme = phonemeInventory_->newPhoneme();
+    // set symbol
+    phonemeInventory_->assignSymbol(newPhoneme, symbol);
+    // set variation to none
+    newPhoneme->setContextDependent(false);
+}
+
+// Creates one lemma per phoneme of the inventory: the phoneme symbol is used
+// both as the orthographic form and to obtain the pronunciation of the lemma.
+void VocabTextLexiconParser::createLemmata() {
+    auto range = phonemeInventory_->phonemes();
+    for (auto cur = range.first; cur != range.second; ++cur) {
+        const Phoneme* phoneme = *cur;
+        std::string    label   = phoneme->symbol();
+
+        // a lemma for this label must not exist yet
+        verify(!lexicon_->lemma(label));
+
+        Lemma* lemma = lexicon_->newLemma();
+        lexicon_->setOrthographicForms(lemma, {label});
+        lexicon_->addPronunciation(lemma, lexicon_->getPronunciation(label));
+        lexicon_->setDefaultLemmaName(lemma);
+    }
+}
diff --git a/src/Bliss/LexiconParser.hh b/src/Bliss/LexiconParser.hh
@@ -51,12 +51,15 @@ struct WeightedPhonemeString;
 class PronunciationElement;
 class LexiconElement;
 class LexiconParser;
+// concrete parsers declared further down in this header
+class VocabTextLexiconParser;
+class XmlLexiconParser;
 
 class LexiconElement : public Core::XmlBuilderElement<
                                Lexicon,
                                Core::XmlRegularElement,
                                Core::CreateByContext> {
     friend class LexiconParser;
+    friend class XmlLexiconParser;
     typedef Core::XmlBuilderElement<
             Lexicon,
             Core::XmlRegularElement,
@@ -96,17 +99,25 @@ public:
     virtual void characters(const char*, int) {};
 };
 
+/**
+ * Abstract interface shared by the lexicon parsers.
+ * A parser reads one file and exposes the lexicon it works on.
+ */
+class LexiconParser {
+public:
+    virtual ~LexiconParser() = default;
+
+    /** Parse @c filename; implementations return true on success. */
+    virtual bool parseFile(const std::string& filename) = 0;
+
+    /** The lexicon this parser works on. */
+    virtual Lexicon* lexicon() const = 0;
+};
+
 /**
  * Parser for Bliss lexicon files.
  * This class implements parsing of the lexicon XML format
  * described in <a href="../../doc/Lexicon.pdf">Lexicon File
  * Format Reference</a>.  It is normally not used directly but
  * through Lexicon.
  */
-
-class LexiconParser : public Core::XmlSchemaParser {
-    typedef Core::XmlSchemaParser Precursor;
-    typedef LexiconParser         Self;
+class XmlLexiconParser : public virtual LexiconParser, public Core::XmlSchemaParser {
+    typedef XmlLexiconParser Self;
 
 private:
     Lexicon* lexicon_;
@@ -116,12 +127,55 @@ private:
     void loadWhitelist(const Core::Configuration&, Core::StringHashSet&);
 
 public:
-    LexiconParser(const Core::Configuration& c, Lexicon*);
-    Lexicon* lexicon() const {
+    XmlLexiconParser(const Core::Configuration& c, Lexicon*);
+    bool     parseFile(const std::string& filename) override;
+    Lexicon* lexicon() const override {
         return lexicon_;
     }
 };
 
+/**
+ * File format handler for XML lexica.
+ * Reading is delegated to XmlLexiconParser; writing is not supported.
+ */
+struct XmlLexiconFormat : public Core::FormatSet::Format<Lexicon> {
+    /** Parse @c filename into @c lexicon; returns true on success. */
+    bool read(const std::string& filename, Lexicon& lexicon) const override {
+        // Use the lexicon's own configuration, as Lexicon::load() did when it
+        // constructed the parser itself. Parameters such as the "vocab"
+        // whitelist are then resolved relative to the lexicon component
+        // instead of the application root.
+        XmlLexiconParser parser(lexicon.getConfiguration(), &lexicon);
+        return parser.parseFile(filename);
+    }
+
+    /** Writing XML lexica is not implemented; always returns false. */
+    bool write(const std::string& filename, Lexicon const& lexicon) const override {
+        return false;
+    }
+};
+
+/**
+ * Parser for text lexicon files containing the vocab, so only the labels
+ * This is meant for "lexicon-free" search
+ * The .txt-file should contain one label per line
+ *
+ * The parser does not own the lexicon it fills; the caller has to keep the
+ * lexicon alive for as long as the parser is used.
+ */
+class VocabTextLexiconParser : public LexiconParser {
+private:
+    // Non-owning, like XmlLexiconParser::lexicon_. Wrapping the caller's raw
+    // pointer in a Core::Ref would make the parser a co-owner and could
+    // destroy a lexicon that is not otherwise reference-held as soon as the
+    // parser goes out of scope.
+    Lexicon*                    lexicon_;
+    Core::Ref<PhonemeInventory> phonemeInventory_;
+    void                        createPhoneme(const std::string& line);
+    void                        createLemmata();
+
+public:
+    VocabTextLexiconParser(Lexicon*);
+    bool     parseFile(const std::string& filename) override;
+    Lexicon* lexicon() const override {
+        return lexicon_;
+    }
+};
+
+/**
+ * File format handler for vocabulary text lexica.
+ * Reading is delegated to VocabTextLexiconParser; writing always returns false.
+ */
+struct VocabTextLexiconFormat : public Core::FormatSet::Format<Lexicon> {
+    bool read(const std::string& filename, Lexicon& lexicon) const override {
+        return VocabTextLexiconParser(&lexicon).parseFile(filename);
+    }
+
+    bool write(const std::string&, Lexicon const&) const override {
+        return false;
+    }
+};
+
 }  // namespace Bliss
 
 #endif  // _BLISS_LEXICONPARSER_HH