Add general 'phonetisaurus_train' and 'phonetisaurus_apply' wrapper scripts

AdolfVonKleist · AdolfVonKleist · commit 5028ba6149ab · 2017-07-09T16:19:06.000-04:00
for training and evaluation.  Add repo-level LICENSE file since apparently
it was missing.
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2017, Josef Novak
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+    and/or other materials provided with the distribution.
+
+* Neither the name of the copyright holder nor the names of its
+  contributors may be used to endorse or promote products derived from
+    this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/bin/phonetisaurus-align.cc b/src/bin/phonetisaurus-align.cc
@@ -63,8 +63,7 @@ int load_input_file (M2MFstAligner* aligner, string input_file,
       lines++;
     }
     infile.close ();
-  }
-  else {
+  } else {
     cerr << "Failed to open input file: " << input_file << endl;
     return -1;
   }
@@ -213,7 +212,7 @@ void compileNBestFarArchive (M2MFstAligner* aligner,
       set_syms = true;
     }
 
-    sprintf (keybuf, "%0*d", generate_keys, i+1);
+    snsprintf (keybuf, "%0*d", generate_keys, i+1);
     key = keybuf;
 
     //Write the final result to the FARchive
diff --git a/src/bin/phonetisaurus_apply b/src/bin/phonetisaurus_apply
@@ -0,0 +1,308 @@
+#!/usr/bin/env python
+# -*- mode: python; coding: utf-8 -*-
+from __future__ import print_function
+import os, logging, subprocess, time, re
+from datetime import datetime
+from collections import defaultdict
+import tempfile
+
+class G2PModelTester () :
+    """G2P model application wrapper class.
+
+    Phonetisaurus G2P model application wrapper class.
+    This wraps the 'phonetisaurus-g2pfst' command, and optionally
+    combines its output with entries from a reference lexicon.
+    """
+
+    def __init__ (self, model, **kwargs) :
+        """Store the G2P settings and configure the logger.
+
+        Args:
+            model (str): The G2P fst model passed to the G2P command.
+            **kwargs: Optional settings; recognized keys are 'lexicon',
+                'nbest', 'thresh', 'beam', 'greedy', and 'verbose'.
+        """
+        self.model = model
+        self.lexicon_file = kwargs.get ("lexicon", None)
+        self.nbest = kwargs.get ("nbest", 1)
+        self.thresh = kwargs.get ("thresh", 99)
+        self.beam = kwargs.get ("beam", 10000)
+        self.greedy = kwargs.get ("greedy", False)
+        self.verbose = kwargs.get ("verbose", False)
+        self.logger = self.setupLogger ()
+
+    def setupLogger (self) :
+        """Setup the logger and logging level.
+
+        Setup the logger and logging level.  We only support
+        verbose and non-verbose mode, selected via self.verbose.
+
+        Returns:
+            Logger: A configured logger instance.
+        """
+
+        level = logging.DEBUG if self.verbose else logging.INFO
+        logging.basicConfig (
+            level=level,
+            format="\033[94m%(levelname)s:%(name)s:"\
+            "%(asctime)s\033[0m:  %(message)s",
+            datefmt="%Y-%m-%d %H:%M:%S"
+        )
+
+        return logging.getLogger ("phonetisaurus-apply")
+
+    def _loadLexicon (self) :
+        """Load the lexicon from a file.
+
+        Load the reference lexicon from a file, and store it
+        in a defaultdict (list).  Each entry is expected to be
+        one tab-separated 'word<TAB>pronunciation' pair per line.
+        Blank lines are ignored, and lines that do not have exactly
+        two fields are skipped with a warning.
+
+        Returns:
+            defaultdict: Maps each word to its list of pronunciations.
+        """
+
+        _lexicon = defaultdict (list)
+        if not self.lexicon_file :
+            return _lexicon
+
+        self.logger.debug ("Loading lexicon from file...")
+        #Binary mode yields bytes on both Python 2 and Python 3, so
+        # the explicit utf8 decode below behaves the same on either.
+        with open (self.lexicon_file, "rb") as ifp :
+            for line in ifp :
+                line = line.decode ("utf8").strip ()
+                if not line :
+                    continue
+
+                parts = line.split (u"\t")
+                if not len (parts) == 2 :
+                    self.logger.warning (
+                        u"Skipping malformed lexicon entry: '{0}'".format (
+                            line
+                        )
+                    )
+                    continue
+
+                word, pron = parts
+                _lexicon [word].append (pron)
+
+        return _lexicon
+
+    def checkPhonetisaurusConfig (self) :
+        """Run some basic checks before applying the model.
+
+        Run some basic checks regarding the $PATH, environment,
+        and provided data, then load the reference lexicon into
+        self.lexicon.  Exits with status 1 if a lexicon file was
+        provided but does not exist.
+
+        Raises:
+            EnvironmentError: raised if binaries are not found.
+        """
+        #The script only imports 'sys' under its __main__ guard, so
+        # import it here to keep the class usable as a module.
+        import sys
+
+        self.logger.debug ("Checking command configuration...")
+        for program in ["phonetisaurus-g2pfst"] :
+            if not self.which (program) :
+                raise EnvironmentError (
+                    "Phonetisaurus command, '{0}', "
+                    "not found in path.".format (program)
+                )
+
+        if self.lexicon_file and not os.path.exists (self.lexicon_file) :
+            self.logger.error ("Could not find provided lexicon file.")
+            sys.exit (1)
+
+        for key,val in sorted (vars (self).items ()) :
+            self.logger.debug (u"{0}:  {1}".format (key, val))
+
+        self.lexicon = self._loadLexicon ()
+
+        return
+
+    def which (self, program) :
+        """Basic 'which' implementation for python.
+
+        Basic 'which' implementation for python from stackoverflow:
+          * https://stackoverflow.com/a/377028/6739158
+
+        Args:
+            program (str): The program name to search the $PATH for.
+
+        Returns:
+            path/None: The path to the executable, or None.
+        """
+
+        def is_exe (fpath) :
+            return os.path.isfile (fpath) and os.access (fpath, os.X_OK)
+
+        fpath, fname = os.path.split (program)
+        if fpath:
+            #An explicit path was given: check it directly.
+            if is_exe (program):
+                return program
+        else:
+            for path in os.environ["PATH"].split (os.pathsep) :
+                path = path.strip ('"')
+                exe_file = os.path.join (path, program)
+                if is_exe (exe_file):
+                    return exe_file
+
+        return None
+
+    def makeG2PCommand (self, word_list) :
+        """Build the G2P command.
+
+        Build the G2P command from the provided arguments.
+
+        Args:
+            word_list (str): The word list file handed to the G2P.
+
+        Returns:
+            list: The command in subprocess list format.
+        """
+
+        command = [
+            u"phonetisaurus-g2pfst",
+            u"--model={0}".format (self.model),
+            u"--nbest={0}".format (self.nbest),
+            u"--beam={0}".format (self.beam),
+            u"--thresh={0}".format (self.thresh),
+            u"--wordlist={0}".format (word_list)
+        ]
+
+        self.logger.debug (u" ".join (command))
+
+        return command
+
+    def runG2PCommand (self, word_list_file) :
+        """Generate and run the actual G2P command.
+
+        Generate and run the actual G2P command.  Each synthesized
+        entry will be yielded back on-the-fly via the subprocess
+        stdout readline method.  Output lines that do not split into
+        exactly three tab-separated fields are reported with a
+        warning and skipped.
+
+        Args:
+            word_list_file (str): The input word list.
+
+        Yields:
+            list: The three tab-separated fields of one output line;
+                callers unpack them as (word, score, pron).
+        """
+        g2p_command = self.makeG2PCommand (word_list_file)
+
+        self.logger.debug ("Applying G2P model...")
+
+        with open (os.devnull, "w") as devnull :
+            proc = subprocess.Popen (
+                g2p_command,
+                stdout=subprocess.PIPE,
+                stderr=devnull if not self.verbose else None
+            )
+
+            try :
+                #The pipe yields bytes, so the end-of-stream sentinel
+                # must be b"" (identical to "" on Python 2).
+                for line in iter (proc.stdout.readline, b"") :
+                    parts = line.decode ("utf8").strip ().split (u"\t")
+                    if not len (parts) == 3 :
+                        self.logger.warning (
+                            u"No pronunciation for word: '{0}'".format (
+                                parts [0]
+                            )
+                        )
+                        continue
+
+                    yield parts
+            finally :
+                #Always release the pipe and reap the child, even if
+                # the consumer stops iterating early or an error occurs.
+                proc.stdout.close ()
+                returncode = proc.wait ()
+                if returncode :
+                    self.logger.warning (
+                        u"G2P command exited with status {0}.".format (
+                            returncode
+                        )
+                    )
+
+        return
+
+    def _formatEntry (self, word, score, pron) :
+        """Format one output entry; the score is shown in verbose mode."""
+        if self.verbose :
+            return u"{0}\t{1:.2f}\t{2}".format (word, float (score), pron)
+
+        return u"{0}\t{1}".format (word, pron)
+
+    @staticmethod
+    def _printLine (line) :
+        """Print one text line to stdout as utf8."""
+        #On Python 2 'str' is 'bytes' and printing unicode to a pipe can
+        # fail, so encode explicitly there; Python 3 prints text as-is.
+        if str is bytes :
+            line = line.encode ("utf8")
+        print (line)
+
+    def applyG2POnly (self, word_list_file) :
+        """Apply the G2P model to a word list.
+
+        Apply the G2P model to a word list.  No filtering or application
+        of a reference lexicon is used here.
+
+        Args:
+            word_list_file (str): The input word list.
+        """
+        for word, score, pron in self.runG2PCommand (word_list_file) :
+            self._printLine (self._formatEntry (word, score, pron))
+
+        return
+
+    def applyG2PWithLexicon (self, word_list_file) :
+        """Apply the G2P model to a word list, combined with lexicon.
+
+        Apply the G2P model to a word list, but combine this with
+        a reference lexicon.  Words for which a reference entry exists
+        will not be sent to the G2P, unless the additional '--greedy'
+        flag is set to True.
+
+        Args:
+            word_list_file (str): The input word list.
+        """
+        target_lexicon = defaultdict (list)
+        tmpwordlist = tempfile.NamedTemporaryFile (delete=False)
+
+        try :
+            #First, find any words in the target list for which we already
+            # have a canonical pronunciation in the reference lexicon.
+            with open (word_list_file, "rb") as ifp :
+                for word in ifp :
+                    word = word.decode ("utf8").strip ()
+                    in_lexicon = word in self.lexicon
+                    if in_lexicon :
+                        target_lexicon [word] = [
+                            (0.0, pron) for pron in self.lexicon [word]
+                        ]
+                    #In greedy mode we still send words to the G2P, even
+                    # if we have canonical entries in the reference lexicon.
+                    if self.greedy or not in_lexicon :
+                        tmpwordlist.write (word.encode ("utf8") + b"\n")
+            tmpwordlist.close ()
+
+            #Second, iterate through the G2P output, and filter against
+            # any possible duplicates previously found in the
+            # reference lexicon.
+            for word, score, pron in self.runG2PCommand (tmpwordlist.name) :
+                prons = set ([p for s,p in target_lexicon [word]])
+                if pron in prons :
+                    continue
+                target_lexicon [word].append ((score, pron))
+        finally :
+            #The temp file was created with delete=False, so remove it
+            # ourselves, including when one of the steps above failed.
+            tmpwordlist.close ()
+            os.unlink (tmpwordlist.name)
+
+        #Finally, sort the words and print every entry that is left.
+        for word in sorted (target_lexicon.keys ()) :
+            for score, pron in target_lexicon [word] :
+                self._printLine (self._formatEntry (word, score, pron))
+
+        return
+
+    def ApplyG2PModel (self, word_list_file) :
+        """Apply the G2P model to a word list.
+
+        Apply the G2P model to a word list.  The reference lexicon
+        is used whenever it contains at least one entry.
+
+        Args:
+            word_list_file (str): The input word list.
+
+        Raises:
+            IOError: raised if the word list is not an existing file.
+        """
+        self.checkPhonetisaurusConfig ()
+
+        #isfile() is already False for paths that do not exist.
+        if not os.path.isfile (word_list_file) :
+            raise IOError ("Word list file not found.")
+
+        if self.lexicon :
+            self.applyG2PWithLexicon (word_list_file)
+        else :
+            self.applyG2POnly (word_list_file)
+
+        return
+    
+if __name__ == "__main__" :
+    #Command-line entry point: parse the options, build the tester,
+    # and apply the G2P model to the provided word list.
+    import sys, argparse
+
+    example = "{0} --model train/model.fst --word test".format (sys.argv [0])
+
+    parser  = argparse.ArgumentParser (description=example)
+    parser.add_argument ("--model", "-m", help="Phonetisaurus G2P fst model.",
+                         required=True)
+    parser.add_argument ("--lexicon", "-l", help="Optional reference lexicon.",
+                         required=False)
+    parser.add_argument ("--nbest", "-n", help="Nbest highest order.",
+                         default=1, type=int)
+    parser.add_argument ("--beam", "-b", help="Search 'beam'.",
+                         default=10000, type=int)
+    parser.add_argument ("--thresh", "-t", help="Pruning threshold for n-best.",
+                         default=99.0, type=float)
+    parser.add_argument ("--greedy", "-g", help="Use the G2P even if a "
+                         "reference lexicon has been provided.", default=False,
+                         action="store_true")
+    #The word list is mandatory: without it there is nothing to apply
+    # the model to, and a missing value would otherwise reach the file
+    # checks as None and fail with an unhelpful error.
+    parser.add_argument ("--word_list", "-wl", help="Input word or word list to apply "
+                        "G2P model to.", type=str, required=True)
+
+    parser.add_argument ("--verbose", "-v", help="Verbose mode.",
+                         default=False, action="store_true")
+    args = parser.parse_args ()
+
+    #Everything except the model and the word list is forwarded to the
+    # tester as keyword settings.
+    tester = G2PModelTester (
+        args.model,
+        **{key:val for key,val in vars (args).items ()
+           if not key in ["model","word_list"]}
+    )
+
+    tester.ApplyG2PModel (args.word_list)
diff --git a/src/bin/phonetisaurus_train b/src/bin/phonetisaurus_train

Original file line number	Diff line number	Diff line change
`@@ -63,8 +63,7 @@ int load_input_file (M2MFstAligner* aligner, string input_file,`
`63`	`63`	`lines++;`
`64`	`64`	`}`
`65`	`65`	`infile.close ();`
`66`		`- }`
`67`		`- else {`
	`66`	`+ } else {`
`68`	`67`	`cerr << "Failed to open input file: " << input_file << endl;`
`69`	`68`	`return -1;`
`70`	`69`	`}`
`@@ -213,7 +212,7 @@ void compileNBestFarArchive (M2MFstAligner* aligner,`
`213`	`212`	`set_syms = true;`
`214`	`213`	`}`
`215`	`214`
`216`		`- sprintf (keybuf, "%0*d", generate_keys, i+1);`
	`215`	`+ snsprintf (keybuf, "%0*d", generate_keys, i+1);`
`217`	`216`	`key = keybuf;`
`218`	`217`
`219`	`218`	`//Write the final result to the FARchive`