Skip to content

Add VocabTextLexiconParser for simple text-based lexica #105

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 41 additions & 7 deletions src/Bliss/Lexicon.cc
Original file line number Diff line number Diff line change
Expand Up @@ -138,26 +138,28 @@ Lexicon::~Lexicon() {

void Lexicon::load(const std::string& filename) {
Core::MD5 md5;
if (md5.updateFromFile(filename))
if (md5.updateFromFile(filename)) {
dependency_.setValue(md5);
else
}
else {
warning("could not derive md5 sum from file '%s'", filename.c_str());
LexiconParser parser(config, this);
}

log("reading lexicon from file") << " \"" << filename << "\" ...";
if (parser.parseFile(filename.c_str()) != 0)
if (!formats().read(filename, *this)) {
error("Error while reading lexicon file.");
}
log("dependency value: ") << dependency_.value();
}

LexiconRef Lexicon::create(const Configuration& c) {
Lexicon* result = new Lexicon(c);
auto result = Core::ref(new Lexicon(c));
result->load(paramFilename(c));
if (result->hasFatalErrors()) {
delete result;
return LexiconRef();
}
result->logStatistics();
return LexiconRef(result);
return result;
}

Lemma* Lexicon::newLemma() {
Expand Down Expand Up @@ -847,3 +849,35 @@ Core::Ref<LemmaToEvaluationTokenTransducer> Lexicon::createLemmaToEvaluationToke
// Convenience wrapper: forwards to createLemmaToEvaluationTokenTransducer()
// with its boolean argument fixed to false and returns that result unchanged.
// NOTE(review): the meaning of the flag is defined by the callee, which is not
// visible here - confirm against createLemmaToEvaluationTokenTransducer().
Core::Ref<LemmaToEvaluationTokenTransducer> Lexicon::createLemmaToPreferredEvaluationTokenSequenceTransducer() const {
return createLemmaToEvaluationTokenTransducer(false);
}

// Name helper specialization for Lexicon: both accessors yield the
// literal type name "Lexicon".
template<>
class Core::NameHelper<Lexicon> {
    // Single source for the name returned by both accessors.
    static constexpr const char* typeName_ = "Lexicon";

public:
    // Type name as an owning std::string.
    operator std::string() const {
        return std::string(typeName_);
    }

    // Type name as a NUL-terminated C string.
    const char* c_str() const {
        return typeName_;
    }
};

// Name helper specialization for pointers to Lexicon: both accessors yield
// the literal type name "Lexicon*".
template<>
class Core::NameHelper<Lexicon*> {
    // Single source for the name returned by both accessors.
    static constexpr const char* typeName_ = "Lexicon*";

public:
    // Type name as an owning std::string.
    operator std::string() const {
        return std::string(typeName_);
    }

    // Type name as a NUL-terminated C string.
    const char* c_str() const {
        return typeName_;
    }
};

// Returns the set of lexicon file formats, building it on first use.
// The set is configured under "lexicon-file-format-set" of the application
// configuration and knows three format names: "xml", plus "vocab-text" and
// "vocab-txt", which both map to the vocab text format.
Core::FormatSet& Lexicon::formats() {
    if (formats_) {
        return *formats_;
    }

    formats_ = std::make_unique<Core::FormatSet>(
            Core::Configuration(Core::Application::us()->getConfiguration(), "lexicon-file-format-set"));

    // NOTE(review): the extra `true` presumably marks "xml" as the default format - confirm in Core::FormatSet.
    formats_->registerFormat("xml", new XmlLexiconFormat(), true);
    formats_->registerFormat("vocab-text", new VocabTextLexiconFormat());
    formats_->registerFormat("vocab-txt", new VocabTextLexiconFormat());

    return *formats_;
}
11 changes: 9 additions & 2 deletions src/Bliss/Lexicon.hh
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,12 @@
#include <Core/Component.hh>
#include <Core/Dependency.hh>
#include <Core/Extensions.hh>
#include <Core/FormatSet.hh>
#include <Core/Obstack.hh>
#include <Core/Parameter.hh>
#include <Core/ReferenceCounting.hh>
#include <Core/StringUtilities.hh>
#include <Core/Types.hh>
#include "Phoneme.hh"
#include "Symbol.hh"

Expand Down Expand Up @@ -477,7 +479,7 @@ class LemmaToEvaluationTokenTransducer;
*
* A lemma may be assigned a symbolic name, which the system can
* use to identify lemmas which have a special meaning to it.
* E.g. the silence word is is identified by the symbolic name
* E.g. the silence word is identified by the symbolic name
* "silence". Such lemmas are called "special lemmas".
*/

Expand Down Expand Up @@ -607,7 +609,7 @@ public:
void defineSpecialLemma(const std::string& name, Lemma* lemma);

/**
* Load lexicon from XML file.
* Load lexicon from XML or txt file.
*/
void load(const std::string& filename);

Expand Down Expand Up @@ -883,6 +885,11 @@ public:
* evaluation token sequences, the first is used.
*/
Core::Ref<LemmaToEvaluationTokenTransducer> createLemmaToPreferredEvaluationTokenSequenceTransducer() const;

private:
std::unique_ptr<Core::FormatSet> formats_;

Core::FormatSet& formats();
};

} // namespace Bliss
Expand Down
84 changes: 80 additions & 4 deletions src/Bliss/LexiconParser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ void LexiconElement::addPhon(const WeightedPhonemeString& phon) {
return;
if (!product_->phonemeInventory()) {
parser()->warning(
"No phoneme inventory defined. Ingnoring pronunciation");
"No phoneme inventory defined. Ignoring pronunciation");
return;
}

Expand Down Expand Up @@ -358,7 +358,7 @@ const Core::ParameterString paramEncoding(
"utf-8");
} // namespace

void LexiconParser::loadWhitelist(const Core::Configuration& config, Core::StringHashSet& whitelist) {
void XmlLexiconParser::loadWhitelist(const Core::Configuration& config, Core::StringHashSet& whitelist) {
std::string filename = paramFile(config);
if (!filename.empty()) {
Core::CompressedInputStream* cis = new Core::CompressedInputStream(filename.c_str());
Expand All @@ -379,12 +379,88 @@ void LexiconParser::loadWhitelist(const Core::Configuration& config, Core::Strin
}
}

LexiconParser::LexiconParser(const Core::Configuration& c, Lexicon* _lexicon)
: Precursor(c) {
XmlLexiconParser::XmlLexiconParser(const Core::Configuration& c, Lexicon* _lexicon)
: LexiconParser(),
XmlSchemaParser(c) {
lexicon_ = _lexicon;

// build schema
LexiconElement* lexElement = new LexiconElement(this, LexiconElement::creationHandler(&Self::pseudoCreateLexicon), c);
loadWhitelist(select("vocab"), lexElement->whitelist_);
setRoot(collect(lexElement));
}

// Runs the XML schema parser of the base class on the given file.
// The base class reports its status as a number; zero is mapped to true,
// everything else to false.
bool XmlLexiconParser::parseFile(const std::string& filename) {
    const auto status = parser()->Core::XmlSchemaParser::parseFile(filename.c_str());
    return status == 0;
}

// Binds the parser to the lexicon it fills and starts with a fresh, empty
// phoneme inventory that parseFile() populates.
VocabTextLexiconParser::VocabTextLexiconParser(Lexicon* _lexicon)
        : LexiconParser(),
          lexicon_(_lexicon),
          phonemeInventory_(new PhonemeInventory()) {}

// Parses a plain-text vocabulary file (one label per line) into the lexicon.
//
// Step 1: every non-empty line is turned into a phoneme of the parser's
//         phoneme inventory (see createPhoneme()).
// Step 2: the inventory is installed in the lexicon and one lemma per phoneme
//         is created (see createLemmata()).
//
// Returns false if the file cannot be opened or a read error occurs while
// scanning it; in that case the lexicon is left untouched. Returns true
// otherwise.
bool VocabTextLexiconParser::parseFile(const std::string& filename) {
    std::ifstream file(filename);
    if (!file.is_open()) {
        return false;
    }

    // collect all labels from the file and add them as phonemes to the phoneme inventory
    std::string line;
    while (std::getline(file, line)) {
        if (line.empty())
            continue;
        createPhoneme(line);
    }

    // getline() stops at end-of-file as well as on an I/O failure; only the
    // former means the whole vocabulary was read. Do not build a lexicon from
    // a partially read file.
    if (file.bad()) {
        return false;
    }

    // set the phoneme inventory
    lexicon_->setPhonemeInventory(phonemeInventory_);
    // iterate over the phonemes in the inventory to create the lemmata in the lexicon
    createLemmata();
    return true;
}

// helper function to handle one label and create a corresponding phoneme
void VocabTextLexiconParser::createPhoneme(const std::string& line) {
std::string symbol(line);
stripWhitespace(symbol); // in case there are any unintentional whitespaces
suppressTrailingBlank(symbol);

// check if phoneme was already added (if one label appears more than once)
if (phonemeInventory_->phoneme(symbol)) {
Core::Application::us()->error("Phoneme \"%s\" was already added to the inventory. It may be duplicated in the lexicon.", symbol.c_str());
}

// create a new phoneme
Phoneme* newPhoneme_ = phonemeInventory_->newPhoneme();
// set symbol
phonemeInventory_->assignSymbol(newPhoneme_, symbol);
// set variation to none
newPhoneme_->setContextDependent(false);
}

// helper function to create the lemmata
void VocabTextLexiconParser::createLemmata() {
// iterate over the phonemes which were assigned to the inventory previously
auto phonemes = phonemeInventory_->phonemes();
for (auto it = phonemes.first; it != phonemes.second; ++it) {
const Phoneme* phoneme = *it;
std::string symbol = phoneme->symbol();

// make sure that lemma has not been added yet
verify(!lexicon_->lemma(symbol));

// create a new lemma
Lemma* newLemma_ = lexicon_->newLemma();
// set orth
lexicon_->setOrthographicForms(newLemma_, {symbol});
// set phon
Pronunciation* pron = lexicon_->getPronunciation(symbol);
lexicon_->addPronunciation(newLemma_, pron);
lexicon_->setDefaultLemmaName(newLemma_);
}
}
66 changes: 60 additions & 6 deletions src/Bliss/LexiconParser.hh
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,15 @@ struct WeightedPhonemeString;
class PronunciationElement;
class LexiconElement;
class LexiconParser;
class TextLexiconParser;
class XmlLexiconParser;

class LexiconElement : public Core::XmlBuilderElement<
Lexicon,
Core::XmlRegularElement,
Core::CreateByContext> {
friend class LexiconParser;
friend class XmlLexiconParser;
typedef Core::XmlBuilderElement<
Lexicon,
Core::XmlRegularElement,
Expand Down Expand Up @@ -96,17 +99,25 @@ public:
virtual void characters(const char*, int) {};
};

/**
 * Abstract interface of all lexicon parsers.
 * A concrete parser reads one lexicon file format and exposes the lexicon
 * it operates on.
 */
class LexiconParser {
public:
    virtual ~LexiconParser() = default;

    /** Parse the given file; the boolean result reports the outcome to the caller. */
    virtual bool parseFile(const std::string& filename) = 0;

    /** The lexicon this parser works on. */
    virtual Lexicon* lexicon() const = 0;
};

/**
* Parser for Bliss lexicon files.
* This class implements parsing of the lexicon XML format
* described in <a href="../../doc/Lexicon.pdf">Lexicon File
* Format Reference</a>. It is normally not used directly but
* through Lexicon.
*/

class LexiconParser : public Core::XmlSchemaParser {
typedef Core::XmlSchemaParser Precursor;
typedef LexiconParser Self;
class XmlLexiconParser : public virtual LexiconParser, public Core::XmlSchemaParser {
typedef XmlLexiconParser Self;

private:
Lexicon* lexicon_;
Expand All @@ -116,12 +127,55 @@ private:
void loadWhitelist(const Core::Configuration&, Core::StringHashSet&);

public:
LexiconParser(const Core::Configuration& c, Lexicon*);
Lexicon* lexicon() const {
XmlLexiconParser(const Core::Configuration& c, Lexicon*);
bool parseFile(const std::string& filename) override;
Lexicon* lexicon() const override {
return lexicon_;
}
};

/**
 * Format adapter that reads a lexicon from the XML lexicon format.
 * Writing is not supported.
 */
struct XmlLexiconFormat : public Core::FormatSet::Format<Lexicon> {
    /**
     * Parse @p filename into @p lexicon with an XmlLexiconParser.
     * @return the result of XmlLexiconParser::parseFile()
     */
    bool read(const std::string& filename, Lexicon& lexicon) const override {
        // Configure the parser from the lexicon's own configuration rather than
        // the application root, so that parser options selected relative to it
        // (e.g. the "vocab" whitelist) resolve in the lexicon's scope.
        // NOTE(review): relies on Lexicon offering getConfiguration() like the
        // application component does - confirm.
        XmlLexiconParser parser(lexicon.getConfiguration(), &lexicon);
        return parser.parseFile(filename);
    }

    /** Not implemented: always returns false. */
    bool write(const std::string& filename, Lexicon const& lexicon) const override {
        return false;
    }
};

/**
 * Parser for text lexicon files that contain only the vocabulary, i.e. the
 * labels. This is meant for "lexicon-free" search.
 * The .txt file should contain one label per line.
 */
class VocabTextLexiconParser : public LexiconParser {
private:
    // Non-owning: the lexicon is handed in by the caller and outlives the
    // parser. Holding it in a Core::Ref would make the parser a co-owner and
    // could release a lexicon that is not managed by a Ref elsewhere.
    Lexicon*                    lexicon_;
    // Inventory filled by createPhoneme() and installed in the lexicon by parseFile().
    Core::Ref<PhonemeInventory> phonemeInventory_;

    // Turn one line (label) of the file into a phoneme of the inventory.
    void createPhoneme(const std::string& line);
    // Create the lemmata of the lexicon from the phonemes of the inventory.
    void createLemmata();

public:
    VocabTextLexiconParser(Lexicon*);
    bool     parseFile(const std::string& filename) override;
    Lexicon* lexicon() const override {
        return lexicon_;
    }
};

/**
 * Format adapter that reads a lexicon from a vocab text file via
 * VocabTextLexiconParser. Writing is not supported.
 */
struct VocabTextLexiconFormat : public Core::FormatSet::Format<Lexicon> {
    /** Parse @p filename into @p lexicon; forwards the parser's result. */
    bool read(const std::string& filename, Lexicon& lexicon) const override {
        VocabTextLexiconParser vocabParser(&lexicon);
        const bool             parsed = vocabParser.parseFile(filename);
        return parsed;
    }

    /** Not implemented: always returns false. */
    bool write(const std::string& filename, Lexicon const& lexicon) const override {
        return false;
    }
};

} // namespace Bliss

#endif // _BLISS_LEXICONPARSER_HH