diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 0000000..398ff08 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,2 @@ +[run] +branch = True diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..76a3f15 --- /dev/null +++ b/.gitignore @@ -0,0 +1,14 @@ +*.pyc +*.joblib +*egg-info +.tox +build +dist +.idea +.ipynb_checkpoints +htmlcov +.coverage +.joblib +.cache +__pycache__ +docs/_build diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..65b6e1a --- /dev/null +++ b/.travis.yml @@ -0,0 +1,33 @@ +language: python +python: 3.5 +sudo: false +branches: + only: + - master + - /^\d\.\d+$/ + +env: + - TOXENV=py27 + - TOXENV=py33 + - TOXENV=py35 + +addons: + apt: + packages: + - python-numpy + - python-scipy + - libatlas-base-dev + - liblapack-dev + - gfortran + +install: + - pip install -U pip tox codecov + +script: travis_wait tox + +after_success: + - codecov + +cache: + directories: + - $HOME/.cache/pip diff --git a/CHANGES.rst b/CHANGES.rst new file mode 100644 index 0000000..964cca8 --- /dev/null +++ b/CHANGES.rst @@ -0,0 +1,7 @@ +Changes +======= + +0.1 (2015-11-27) +---------------- + +Initial release. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..ae1ebe3 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,8 @@ +include README.rst +include CHANGES.rst +include docs/Makefile +include docs/make.bat +include docs/conf.py + +recursive-include tests *.py +recursive-include docs *.rst diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..f3652d8 --- /dev/null +++ b/README.rst @@ -0,0 +1,32 @@ +================ +sklearn-crfsuite +================ + +.. image:: https://img.shields.io/pypi/v/sklearn-crfsuite.svg + :target: https://pypi.python.org/pypi/sklearn-crfsuite + :alt: PyPI Version + +.. image:: https://img.shields.io/travis/TeamHG-Memex/sklearn-crfsuite/master.svg + :target: http://travis-ci.org/TeamHG-Memex/sklearn-crfsuite + :alt: Build Status + +.. 
image:: http://codecov.io/github/TeamHG-Memex/sklearn-crfsuite/coverage.svg?branch=master + :target: http://codecov.io/github/TeamHG-Memex/sklearn-crfsuite?branch=master + :alt: Code Coverage + +.. image:: https://readthedocs.org/projects/sklearn-crfsuite/badge/?version=latest + :target: http://sklearn-crfsuite.readthedocs.org/en/latest/?badge=latest + :alt: Documentation + +sklearn-crfsuite is a thin CRFsuite_ (python-crfsuite_) wrapper which provides an +interface similar to scikit-learn_. ``sklearn_crfsuite.CRF`` is a scikit-learn +compatible estimator: you can use e.g. scikit-learn model +selection utilities (cross-validation, hyperparameter optimization) with it. + +.. _CRFsuite: http://www.chokkan.org/software/crfsuite/ +.. _python-crfsuite: https://github.com/tpeng/python-crfsuite +.. _scikit-learn: http://scikit-learn.org/ + +License is MIT. + +Documentation can be found `here <http://sklearn-crfsuite.readthedocs.org/en/latest/>`_. diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d013a9d --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,192 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 

# Every target in this Makefile is a command name, not a file that gets
# created, so all of them must be phony; otherwise a stray file or directory
# called e.g. "info" or "xml" would make the target look up to date.
.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp \
        applehelp devhelp epub latex latexpdf latexpdfja text man texinfo \
        info gettext changes linkcheck doctest coverage xml pseudoxml

# Print the list of supported builder targets.
help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo " html to make standalone HTML files"
	@echo " dirhtml to make HTML files named index.html in directories"
	@echo " singlehtml to make a single large HTML file"
	@echo " pickle to make pickle files"
	@echo " json to make JSON files"
	@echo " htmlhelp to make HTML files and a HTML help project"
	@echo " qthelp to make HTML files and a qthelp project"
	@echo " applehelp to make an Apple Help Book"
	@echo " devhelp to make HTML files and a Devhelp project"
	@echo " epub to make an epub"
	@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo " latexpdf to make LaTeX files and run them through pdflatex"
	@echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo " text to make text files"
	@echo " man to make manual pages"
	@echo " texinfo to make Texinfo files"
	@echo " info to make Texinfo files and run them through makeinfo"
	@echo " gettext to make PO message catalogs"
	@echo " changes to make an overview of all changed/added/deprecated items"
	@echo " xml to make Docutils-native XML files"
	@echo " pseudoxml to make pseudoxml-XML files for display purposes"
	@echo " linkcheck to check all external links for integrity"
	@echo " doctest to run all doctests embedded in the documentation (if enabled)"
	@echo " coverage to run coverage check of the documentation (if enabled)"

# Remove everything under the build directory.
# BUILDDIR can be overridden from the command line; if it expands to nothing
# the rm below would become `rm -rf /*`, so abort before running it.
clean:
	$(if $(strip $(BUILDDIR)),,$(error BUILDDIR is empty; refusing to run rm -rf on the filesystem root))
	rm -rf $(BUILDDIR)/*

# Standalone HTML pages, one .html file per source document.
html:
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

# HTML pages laid out as <document>/index.html directories.
dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

# Builder targets: each one runs sphinx-build with the matching -b builder
# and writes its output into a builder-specific subdirectory of $(BUILDDIR).

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/sklearn-crfsuite.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/sklearn-crfsuite.qhc"

applehelp:
	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
	@echo
	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
	@echo "N.B. You won't be able to view it unless you put it in" \
	      "~/Library/Documentation/Help or install it in your application" \
	      "bundle."

devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/sklearn-crfsuite"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/sklearn-crfsuite"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

# Build the LaTeX sources, then run the generated Makefile's all-pdf target.
latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

# Same as latexpdf, but through the generated Makefile's all-pdf-ja target.
latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

# Build the Texinfo sources, then run the generated Makefile's info target.
# The sub-make is invoked as $(MAKE), like latexpdf above, so that -j, -n and
# command-line variables are passed down instead of being dropped.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

# gettext uses I18NSPHINXOPTS (no shared doctrees directory).
gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."

changes:
	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
	@echo
	@echo "The overview file is in $(BUILDDIR)/changes."

linkcheck:
	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
	@echo
	@echo "Link check complete; look for any errors in the above output " \
	      "or in $(BUILDDIR)/linkcheck/output.txt."
+ +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/docs/api.rst b/docs/api.rst new file mode 100644 index 0000000..1f4649d --- /dev/null +++ b/docs/api.rst @@ -0,0 +1,14 @@ +.. _api: + +API Reference +============= + +CRF +--- + +.. automodule:: sklearn_crfsuite + :members: + +.. autoclass:: CRF + :members: + diff --git a/docs/changes.rst b/docs/changes.rst new file mode 100644 index 0000000..d9e113e --- /dev/null +++ b/docs/changes.rst @@ -0,0 +1 @@ +.. include:: ../CHANGES.rst diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..2195abb --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# sklearn-crfsuite documentation build configuration file, created by +# sphinx-quickstart on Fri Nov 27 03:50:38 2015. +# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +import sys +import os +import shlex + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. 
If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +sys.path.insert(0, os.path.abspath('..')) + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +#needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', + 'alabaster', +] +numpydoc_show_class_members = False + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'sklearn-crfsuite' +copyright = '2015, Mikhail Korobov' +author = 'Mikhail Korobov' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '0.1' +# The full version, including alpha/beta/rc tags. +release = '0.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. 
+#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. + +import alabaster + +html_theme_path = [alabaster.get_path()] +html_theme = 'alabaster' +html_sidebars = { + '**': [ + 'about.html', + 'navigation.html', + # 'relations.html', + 'searchbox.html', + # 'donate.html', + ] +} + +html_theme_options = { + 'description': 'CRFsuite (python-crfsuite) wrapper which provides interface simlar to scikit-learn.', + 'github_user': 'TeamHG-Memex', + 'github_repo': 'sklearn-crfsuite', + 'github_banner': True, + 'github_button': False, +} + + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". 
+#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. +#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 
+#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'h', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'r', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'sklearn-crfsuitedoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'sklearn-crfsuite.tex', 'sklearn-crfsuite Documentation', + 'Mikhail Korobov', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. 
+#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'sklearn-crfsuite', 'sklearn-crfsuite Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'sklearn-crfsuite', 'sklearn-crfsuite Documentation', + author, 'sklearn-crfsuite', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False diff --git a/docs/contributing.rst b/docs/contributing.rst new file mode 100644 index 0000000..3841781 --- /dev/null +++ b/docs/contributing.rst @@ -0,0 +1,33 @@ +Contributing +============ + +* Source code: https://github.com/TeamHG-Memex/sklearn-crfsuite +* Issue tracker: https://github.com/TeamHG-Memex/sklearn-crfsuite/issues + +Feel free to submit ideas, bugs reports and pull requests. + +In order to run tests install tox_, then type + +:: + + tox + +from the source checkout. + +.. 
_tox: http://tox.testrun.org + +Authors +------- + +* Mikhail Korobov + +The code was initially extracted from +`webstruct <https://github.com/scrapinghub/webstruct>`_ and +`morphine <https://github.com/kmike/morphine>`_ projects and then +cleaned up and improved. + + +License +------- + +License is MIT. diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..7168fd6 --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,42 @@ +================ +sklearn-crfsuite +================ + +.. image:: https://img.shields.io/pypi/v/sklearn-crfsuite.svg + :target: https://pypi.python.org/pypi/sklearn-crfsuite + :alt: PyPI Version + +.. image:: https://img.shields.io/travis/TeamHG-Memex/sklearn-crfsuite/master.svg + :target: http://travis-ci.org/TeamHG-Memex/sklearn-crfsuite + :alt: Build Status + +.. image:: http://codecov.io/github/TeamHG-Memex/sklearn-crfsuite/coverage.svg?branch=master + :target: http://codecov.io/github/TeamHG-Memex/sklearn-crfsuite?branch=master + :alt: Code Coverage + +.. image:: https://readthedocs.org/projects/sklearn-crfsuite/badge/?version=latest + :target: http://sklearn-crfsuite.readthedocs.org/en/latest/?badge=latest + :alt: Documentation + +sklearn-crfsuite is a thin CRFsuite_ (python-crfsuite_) wrapper which provides a +scikit-learn_-compatible :class:`sklearn_crfsuite.CRF` estimator: +you can use e.g. scikit-learn model selection utilities +(cross-validation, hyperparameter optimization) with it. + +.. _CRFsuite: http://www.chokkan.org/software/crfsuite/ +.. _python-crfsuite: https://github.com/tpeng/python-crfsuite +.. _scikit-learn: http://scikit-learn.org/ + +License is MIT. + +Contents +======== + +.. 
toctree:: + :maxdepth: 1 + + install + tutorial + api + contributing + changes diff --git a/docs/install.rst b/docs/install.rst new file mode 100644 index 0000000..043c8b7 --- /dev/null +++ b/docs/install.rst @@ -0,0 +1,13 @@ +Install Instructions +==================== + +Make sure scikit-learn_ is installed, then run + +:: + + pip install sklearn-crfsuite + +sklearn-crfsuite requires Python 2.7+ or 3.3+. + + +.. _scikit-learn: http://scikit-learn.org/ diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..28eb9f3 --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,263 @@ +@ECHO OFF + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set BUILDDIR=_build +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . +set I18NSPHINXOPTS=%SPHINXOPTS% . +if NOT "%PAPER%" == "" ( + set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% + set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% +) + +if "%1" == "" goto help + +if "%1" == "help" ( + :help + echo.Please use `make ^` where ^ is one of + echo. html to make standalone HTML files + echo. dirhtml to make HTML files named index.html in directories + echo. singlehtml to make a single large HTML file + echo. pickle to make pickle files + echo. json to make JSON files + echo. htmlhelp to make HTML files and a HTML help project + echo. qthelp to make HTML files and a qthelp project + echo. devhelp to make HTML files and a Devhelp project + echo. epub to make an epub + echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter + echo. text to make text files + echo. man to make manual pages + echo. texinfo to make Texinfo files + echo. gettext to make PO message catalogs + echo. changes to make an overview over all changed/added/deprecated items + echo. xml to make Docutils-native XML files + echo. pseudoxml to make pseudoxml-XML files for display purposes + echo. linkcheck to check all external links for integrity + echo. 
doctest to run all doctests embedded in the documentation if enabled + echo. coverage to run coverage check of the documentation if enabled + goto end +) + +if "%1" == "clean" ( + for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i + del /q /s %BUILDDIR%\* + goto end +) + + +REM Check if sphinx-build is available and fallback to Python version if any +%SPHINXBUILD% 2> nul +if errorlevel 9009 goto sphinx_python +goto sphinx_ok + +:sphinx_python + +set SPHINXBUILD=python -m sphinx.__init__ +%SPHINXBUILD% 2> nul +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +:sphinx_ok + + +if "%1" == "html" ( + %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/html. + goto end +) + +if "%1" == "dirhtml" ( + %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. + goto end +) + +if "%1" == "singlehtml" ( + %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. + goto end +) + +if "%1" == "pickle" ( + %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the pickle files. + goto end +) + +if "%1" == "json" ( + %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can process the JSON files. 
+ goto end +) + +if "%1" == "htmlhelp" ( + %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run HTML Help Workshop with the ^ +.hhp project file in %BUILDDIR%/htmlhelp. + goto end +) + +if "%1" == "qthelp" ( + %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; now you can run "qcollectiongenerator" with the ^ +.qhcp project file in %BUILDDIR%/qthelp, like this: + echo.^> qcollectiongenerator %BUILDDIR%\qthelp\sklearn-crfsuite.qhcp + echo.To view the help file: + echo.^> assistant -collectionFile %BUILDDIR%\qthelp\sklearn-crfsuite.ghc + goto end +) + +if "%1" == "devhelp" ( + %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. + goto end +) + +if "%1" == "epub" ( + %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The epub file is in %BUILDDIR%/epub. + goto end +) + +if "%1" == "latex" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + if errorlevel 1 exit /b 1 + echo. + echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdf" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "latexpdfja" ( + %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex + cd %BUILDDIR%/latex + make all-pdf-ja + cd %~dp0 + echo. + echo.Build finished; the PDF files are in %BUILDDIR%/latex. + goto end +) + +if "%1" == "text" ( + %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The text files are in %BUILDDIR%/text. + goto end +) + +if "%1" == "man" ( + %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man + if errorlevel 1 exit /b 1 + echo. 
+ echo.Build finished. The manual pages are in %BUILDDIR%/man. + goto end +) + +if "%1" == "texinfo" ( + %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. + goto end +) + +if "%1" == "gettext" ( + %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The message catalogs are in %BUILDDIR%/locale. + goto end +) + +if "%1" == "changes" ( + %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes + if errorlevel 1 exit /b 1 + echo. + echo.The overview file is in %BUILDDIR%/changes. + goto end +) + +if "%1" == "linkcheck" ( + %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck + if errorlevel 1 exit /b 1 + echo. + echo.Link check complete; look for any errors in the above output ^ +or in %BUILDDIR%/linkcheck/output.txt. + goto end +) + +if "%1" == "doctest" ( + %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest + if errorlevel 1 exit /b 1 + echo. + echo.Testing of doctests in the sources finished, look at the ^ +results in %BUILDDIR%/doctest/output.txt. + goto end +) + +if "%1" == "coverage" ( + %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage + if errorlevel 1 exit /b 1 + echo. + echo.Testing of coverage in the sources finished, look at the ^ +results in %BUILDDIR%/coverage/python.txt. + goto end +) + +if "%1" == "xml" ( + %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The XML files are in %BUILDDIR%/xml. + goto end +) + +if "%1" == "pseudoxml" ( + %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml + if errorlevel 1 exit /b 1 + echo. + echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
+ goto end +) + +:end diff --git a/docs/tutorial.rst b/docs/tutorial.rst new file mode 100644 index 0000000..2a01406 --- /dev/null +++ b/docs/tutorial.rst @@ -0,0 +1,4 @@ +Tutorial +======== + +TODO diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e4bb154 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +tqdm >= 2.0 +tabulate >= 0.7.5 +scikit-learn >= 0.15 +python-crfsuite >= 0.8.3 diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..3c6e79c --- /dev/null +++ b/setup.cfg @@ -0,0 +1,2 @@ +[bdist_wheel] +universal=1 diff --git a/setup.py b/setup.py new file mode 100755 index 0000000..e844460 --- /dev/null +++ b/setup.py @@ -0,0 +1,34 @@ +#!/usr/bin/env python +from setuptools import setup + +setup( + name='sklearn-crfsuite', + version='0.1', + author='Mikhail Korobov', + author_email='kmike84@gmail.com', + license='MIT license', + long_description=open('README.rst').read() + "\n\n" + open('CHANGES.rst').read(), + description="CRFsuite (python-crfsuite) wrapper which provides interface simlar to scikit-learn", + url='https://github.com/TeamHG-Memex/sklearn-crfsuite', + zip_safe=False, + packages=['sklearn_crfsuite'], + install_requires=[ + "tqdm >= 2.0", + "six", + "tabulate", + "python-crfsuite >= 0.8.3" + ], + classifiers=[ + 'Development Status :: 3 - Alpha', + 'License :: OSI Approved :: MIT License', + 'Intended Audience :: Developers', + 'Operating System :: OS Independent', + 'Programming Language :: Python', + 'Programming Language :: Python :: 2', + 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3.4', + 'Programming Language :: Python :: 3.5', + ], +) diff --git a/sklearn_crfsuite/__init__.py b/sklearn_crfsuite/__init__.py new file mode 100644 index 0000000..fe61bef --- /dev/null +++ b/sklearn_crfsuite/__init__.py @@ -0,0 +1,2 @@ +# -*- coding: utf-8 -*- +from .estimator import CRF diff 
# -*- coding: utf-8 -*-
import os
import tempfile


class FileResource(object):
    """
    Object that "owns" a file on a filesystem. If the ``filename`` is None,
    it maintains a temporary file whose name is accessible via the ``name``
    attribute; when pickling, the contents of this file are pickled;
    when unpickling, a new temp file is created; temp files are auto-deleted
    unless ``keep_tempfiles`` is True.

    Parameters
    ----------
    filename : str, optional (default=None)
        Path of the file to own. When None, a temporary file is created
        lazily by :meth:`ensure_name`.
    keep_tempfiles : bool, optional (default=False)
        When True, :meth:`cleanup` never removes the file.
    suffix : str, optional (default='')
        Suffix passed to ``tempfile.mkstemp`` for auto-created files.
    prefix : str, optional (default='')
        Prefix passed to ``tempfile.mkstemp`` for auto-created files.
    """
    def __init__(self, filename=None, keep_tempfiles=False, suffix='', prefix=''):
        self.name = filename
        # ``auto`` marks resources whose file is managed (created/removed) here.
        self.auto = filename is None
        self.keep_tempfiles = keep_tempfiles
        self.suffix = suffix
        self.prefix = prefix

    def ensure_name(self):
        """
        Ensure that a filename is available.

        Creates a temporary file when the resource is auto-managed and has
        no name yet. Raises ValueError if there is no name and the resource
        is not auto-managed.
        """
        if self.name is not None:
            return
        if self.auto:
            fd, self.name = tempfile.mkstemp(self.suffix, self.prefix)
            # mkstemp returns an already-open OS-level descriptor. Only the
            # path is used by this class, so close the descriptor right away;
            # otherwise one handle leaks per created temp file.
            os.close(fd)
        else:
            raise ValueError("File name is not provided")

    def cleanup(self):
        """
        Remove the temporary file if this object manages one.

        Does nothing when ``keep_tempfiles`` is set or when the file name was
        provided explicitly. A failure to delete the file is ignored
        (best-effort cleanup); ``name`` is reset to None afterwards.
        """
        if self.keep_tempfiles or not self.auto:
            return

        if self.name is not None:
            try:
                os.unlink(self.name)
            except OSError:
                # best effort: the file may already be gone
                pass
            self.name = None

    def refresh(self):
        """ Drop the current temporary file (if any) and get a fresh name. """
        self.cleanup()
        self.ensure_name()

    def __del__(self):
        self.cleanup()

    def __getstate__(self):
        """
        Return picklable state. For auto-managed resources the file contents
        are embedded under ``'__FILE_RESOURCE_DATA__'`` (when readable) and
        the file name is dropped, since the temp path is not portable.
        """
        dct = self.__dict__.copy()

        if self.auto:
            filename = dct['name']
            if filename is not None:
                try:
                    with open(filename, 'rb') as f:
                        dct['__FILE_RESOURCE_DATA__'] = f.read()
                except IOError:
                    # unreadable/missing file: pickle the state without data
                    pass
            dct['name'] = None

        return dct

    def __setstate__(self, state):
        """
        Restore state; if file contents were pickled, write them to a newly
        created temporary file.
        """
        data = state.pop('__FILE_RESOURCE_DATA__', None)
        self.__dict__.update(state)

        if data is not None:
            assert self.name is None
            self.ensure_name()
            with open(self.name, 'wb') as f:
                f.write(data)
mode 100644 index 0000000..71ef5a6 --- /dev/null +++ b/sklearn_crfsuite/estimator.py @@ -0,0 +1,469 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import + +from six.moves import zip +from tqdm import tqdm +import pycrfsuite +from sklearn.metrics import accuracy_score +from sklearn.base import BaseEstimator + +from sklearn_crfsuite._fileresource import FileResource +from sklearn_crfsuite.utils import flatten +from sklearn_crfsuite.trainer import LinePerIterationTrainer + + +class CRF(BaseEstimator): + """ + python-crfsuite wrapper with interface siimlar to scikit-learn. + It allows to use a familiar fit/predict interface and scikit-learn + model selection utilities (cross-validation, hyperparameter optimization). + + Unlike pycrfsuite.Trainer / pycrfsuite.Tagger this object is picklable; + on-disk files are managed automatically. + + Parameters + ---------- + algorithm : str, optional (default='lbfgs') + Training algorithm. Allowed values: + + * ``'lbfgs'`` - Gradient descent using the L-BFGS method + * ``'l2sgd'`` - Stochastic Gradient Descent with L2 regularization term + * ``'ap'`` - Averaged Perceptron + * ``'pa'`` - Passive Aggressive (PA) + * ``'arow'`` - Adaptive Regularization Of Weight Vector (AROW) + + min_freq : float, optional (default=0) + Cut-off threshold for occurrence + frequency of a feature. CRFsuite will ignore features whose + frequencies of occurrences in the training data are no greater + than `min_freq`. The default is no cut-off. + + all_possible_states : bool, optional (default=False) + Specify whether CRFsuite generates state features that do not even + occur in the training data (i.e., negative state features). + When True, CRFsuite generates state features that associate all of + possible combinations between attributes and labels. + + Suppose that the numbers of attributes and labels are A and L + respectively, this function will generate (A * L) features. 
+ Enabling this function may improve the labeling accuracy because + the CRF model can learn the condition where an item is not predicted + to its reference label. However, this function may also increase + the number of features and slow down the training process + drastically. This function is disabled by default. + + all_possible_transitions : bool, optional (default=False) + Specify whether CRFsuite generates transition features that + do not even occur in the training data (i.e., negative transition + features). When True, CRFsuite generates transition features that + associate all of possible label pairs. Suppose that the number + of labels in the training data is L, this function will + generate (L * L) transition features. + This function is disabled by default. + + c1 : float, optional (default=0) + The coefficient for L1 regularization. + If a non-zero value is specified, CRFsuite switches to the + Orthant-Wise Limited-memory Quasi-Newton (OWL-QN) method. + The default value is zero (no L1 regularization). + + Supported training algorithms: lbfgs + + c2 : float, optional (default=1.0) + The coefficient for L2 regularization. + + Supported training algorithms: l2sgd, lbfgs + + max_iterations : int, optional (default=None) + The maximum number of iterations for optimization algorithms. + Default value depends on training algorithm: + + * lbfgs - unlimited; + * l2sgd - 1000; + * ap - 100; + * pa - 100; + * arow - 100. + + num_memories : int, optional (default=6) + The number of limited memories for approximating the inverse hessian + matrix. + + Supported training algorithms: lbfgs + + epsilon : float, optional (default=1e-5) + The epsilon parameter that determines the condition of convergence. + + Supported training algorithms: ap, arow, lbfgs, pa + + period : int, optional (default=10) + The duration of iterations to test the stopping criterion. 
+ + Supported training algorithms: l2sgd, lbfgs + + delta : float, optional (default=1e-5) + The threshold for the stopping criterion; an iteration stops + when the improvement of the log likelihood over the last + `period` iterations is no greater than this threshold. + + Supported training algorithms: l2sgd, lbfgs + + linesearch : str, optional (default='MoreThuente') + The line search algorithm used in L-BFGS updates. Allowed values: + + * ``'MoreThuente'`` - More and Thuente's method; + * ``'Backtracking'`` - backtracking method with regular Wolfe condition; + * ``'StrongBacktracking'`` - backtracking method with strong Wolfe + condition. + + Supported training algorithms: lbfgs + + max_linesearch : int, optional (default=20) + The maximum number of trials for the line search algorithm. + + Supported training algorithms: lbfgs + + calibration_eta : float, optional (default=0.1) + The initial value of learning rate (eta) used for calibration. + + Supported training algorithms: l2sgd + + calibration_rate : float, optional (default=2.0) + The rate of increase/decrease of learning rate for calibration. + + Supported training algorithms: l2sgd + + calibration_samples : int, optional (default=1000) + The number of instances used for calibration. + The calibration routine randomly chooses instances no larger + than `calibration_samples`. + + Supported training algorithms: l2sgd + + calibration_candidates : int, optional (default=10) + The number of candidates of learning rate. + The calibration routine terminates after finding + `calibration_candidates` candidates of learning rates + that can increase log-likelihood. + + Supported training algorithms: l2sgd + + calibration_max_trials : int, optional (default=20) + The maximum number of trials of learning rates for calibration. + The calibration routine terminates after trying + `calibration_max_trials` candidate values of learning rates. 
+ + Supported training algorithms: l2sgd + + pa_type : int, optional (default=1) + The strategy for updating feature weights. Allowed values: + + * 0 - PA without slack variables; + * 1 - PA type I; + * 2 - PA type II. + + Supported training algorithms: pa + + c : float, optional (default=1) + Aggressiveness parameter (used only for PA-I and PA-II). + This parameter controls the influence of the slack term on the + objective function. + + Supported training algorithms: pa + + error_sensitive : bool, optional (default=True) + If this parameter is True, the optimization routine includes + into the objective function the square root of the number of + incorrect labels predicted by the model. + + Supported training algorithms: pa + + averaging : bool, optional (default=True) + If this parameter is True, the optimization routine computes + the average of feature weights at all updates in the training + process (similarly to Averaged Perceptron). + + Supported training algorithms: pa + + variance : float, optional (default=1) + The initial variance of every feature weight. + The algorithm initializes a vector of feature weights as + a multivariate Gaussian distribution with mean 0 + and variance `variance`. + + Supported training algorithms: arow + + gamma : float, optional (default=1) + The tradeoff between loss function and changes of feature weights. + + Supported training algorithms: arow + + verbose : bool, optional (default=False) + Enable trainer verbose mode. + + model_filename : str, optional (default=None) + A path to an existing CRFSuite model. + This parameter allows loading and using existing crfsuite models. + + By default, model files are created automatically and saved + in temporary locations; the preferred way to save/load CRF models + is to use pickle (or its alternatives like joblib). + + Attributes + ---------- + tagger : pycrfsuite.Tagger + python-crfsuite Tagger instance. 
+ + """ + def __init__(self, + algorithm=None, + + min_freq=None, + all_possible_states=None, + all_possible_transitions=None, + c1=None, + c2=None, + max_iterations=None, + num_memories=None, + epsilon=None, + period=None, + delta=None, + linesearch=None, + max_linesearch=None, + calibration_eta=None, + calibration_rate=None, + calibration_samples=None, + calibration_candidates=None, + calibration_max_trials=None, + pa_type=None, + c=None, + error_sensitive=None, + averaging=None, + variance=None, + gamma=None, + + verbose=False, + model_filename=None, + keep_tempfiles=False, + trainer_cls=None): + + self.algorithm = algorithm + self.min_freq = min_freq + self.all_possible_states = all_possible_states + self.all_possible_transitions = all_possible_transitions + self.c1 = c1 + self.c2 = c2 + self.max_iterations = max_iterations + self.num_memories = num_memories + self.epsilon = epsilon + self.period = period + self.delta = delta + self.linesearch = linesearch + self.max_linesearch = max_linesearch + self.calibration_eta = calibration_eta + self.calibration_rate = calibration_rate + self.calibration_samples = calibration_samples + self.calibration_candidates = calibration_candidates + self.calibration_max_trials = calibration_max_trials + self.pa_type = pa_type + self.c = c + self.error_sensitive = error_sensitive + self.averaging = averaging + self.variance = variance + self.gamma = gamma + + self.modelfile = FileResource( + filename=model_filename, + keep_tempfiles=keep_tempfiles, + suffix=".crfsuite", + prefix="model" + ) + self.verbose = verbose + self._tagger = None + self.trainer_cls = trainer_cls + + self.training_log_ = None + + def fit(self, X, y, X_dev=None, y_dev=None): + """ + Train a model. + + Parameters + ---------- + X : list of lists of dicts + Feature dicts for several documents (in a python-crfsuite format). + + y : list of lists of strings + Labels for several documents. 
+ + X_dev : (optional) list of lists of dicts + Feature dicts used for testing. + + y_dev : (optional) list of lists of strings + Labels corresponding to X_dev. + """ + if (X_dev is None and y_dev is not None) or (X_dev is not None and y_dev is None): + raise ValueError("Pass both X_dev and y_dev to use the holdout data") + + if self._tagger is not None: + self._tagger.close() + self._tagger = None + self.modelfile.refresh() + + trainer = self._get_trainer() + train_data = zip(X, y) + + if self.verbose: + train_data = tqdm(train_data, "loading training data to CRFsuite", len(X), leave=True) + + for xseq, yseq in train_data: + trainer.append(xseq, yseq) + + if self.verbose: + print("") + + if X_dev is not None: + test_data = zip(X_dev, y_dev) + + if self.verbose: + test_data = tqdm(test_data, "loading dev data to CRFsuite", len(X_dev), leave=True) + + for xseq, yseq in test_data: + trainer.append(xseq, yseq, 1) + + if self.verbose: + print("") + + trainer.train(self.modelfile.name, holdout=-1 if X_dev is None else 1) + self.training_log_ = trainer.logparser + return self + + def predict(self, X): + """ + Make a prediction. + + Parameters + ---------- + X : list of lists of dicts + feature dicts in python-crfsuite format + + Returns + ------- + y : list of lists of strings + predicted labels + + """ + return list(map(self.predict_single, X)) + + def predict_single(self, xseq): + """ + Make a prediction. + + Parameters + ---------- + xseq : list of dicts + feature dicts in python-crfsuite format + + Returns + ------- + y : list of strings + predicted labels + + """ + return self.tagger.tag(xseq) + + def predict_marginals(self, X): + """ + Make a prediction. 
+ + Parameters + ---------- + X : list of lists of dicts + feature dicts in python-crfsuite format + + Returns + ------- + y : list of lists of dicts + predicted probabilities for each label at each position + + """ + return list(map(self.predict_marginals_single, X)) + + def predict_marginals_single(self, xseq): + """ + Make a prediction. + + Parameters + ---------- + xseq : list of dicts + feature dicts in python-crfsuite format + + Returns + ------- + y : list of dicts + predicted probabilities for each label at each position + + """ + labels = self.tagger.labels() + self.tagger.set(xseq) + return [ + {label: self.tagger.marginal(label, i) for label in labels} + for i in range(len(xseq)) + ] + + def score(self, X, y): + """ + Return per-field accuracy score. + """ + y_pred_flat = flatten(self.predict(X)) + y_true_flat = flatten(y) + return accuracy_score(y_true_flat, y_pred_flat) + + @property + def tagger(self): + if self._tagger is None: + if self.modelfile.name is None: + raise Exception("Can't load model. 
Is the model trained?") + + tagger = pycrfsuite.Tagger() + tagger.open(self.modelfile.name) + self._tagger = tagger + return self._tagger + + def _get_trainer(self): + trainer_cls = self.trainer_cls or LinePerIterationTrainer + params = { + 'feature.minfreq': self.min_freq, + 'feature.possible_states': self.all_possible_states, + 'feature.possible_transitions': self.all_possible_transitions, + 'c1': self.c1, + 'c2': self.c2, + 'max_iterations': self.max_iterations, + 'num_memories': self.num_memories, + 'epsilon': self.epsilon, + 'period': self.period, + 'delta': self.delta, + 'linesearch': self.linesearch, + 'max_linesearch': self.max_linesearch, + 'calibration.eta': self.calibration_eta, + 'calibration.rate': self.calibration_rate, + 'calibration.samples': self.calibration_samples, + 'calibration.candidates': self.calibration_candidates, + 'calibration.max_trials': self.calibration_max_trials, + 'type': self.pa_type, + 'c': self.c, + 'error_sensitive': self.error_sensitive, + 'averaging': self.averaging, + 'variance': self.variance, + 'gamma': self.gamma, + } + params = {k: v for k, v in params.items() if v is not None} + return trainer_cls( + algorithm=self.algorithm, + params=params, + verbose=self.verbose, + ) + + def __getstate__(self): + dct = self.__dict__.copy() + dct['_tagger'] = None + return dct diff --git a/sklearn_crfsuite/trainer.py b/sklearn_crfsuite/trainer.py new file mode 100644 index 0000000..00d47bd --- /dev/null +++ b/sklearn_crfsuite/trainer.py @@ -0,0 +1,52 @@ +# -*- coding: utf-8 -*- +import pycrfsuite +from tabulate import tabulate + + +class LinePerIterationTrainer(pycrfsuite.Trainer): + """ + This pycrfsuite.Trainer prints information about each iteration + on a single line. 
+ """ + def on_iteration(self, log, info): + parts = [ + "Iter {num:<3} ", + "time={time:<5.2f} ", + "loss={loss:<8.2f} ", + ] + + if 'active_features' in info: + parts += ["active={active_features:<5} "] + + if 'avg_precision' in info: + parts += [ + "precision={avg_precision:0.3f} ", + "recall={avg_recall:0.3f} ", + "F1={avg_f1:0.3f} ", + "Acc(item/seq)={item_accuracy_float:0.3f} {instance_accuracy_float:0.3f}" + ] + + if 'feature_norm' in info: + parts += ["feature_norm={feature_norm:<8.2f}"] + + line = "".join(parts) + print(line.format(**info).strip()) + + def on_optimization_end(self, log): + last_iter = self.logparser.last_iteration + if last_iter.get('scores', None): + data = [ + [entity, score.precision, score.recall, score.f1 or 0, score.ref] + for entity, score in sorted(last_iter['scores'].items()) + ] + table = tabulate(data, + headers=["Label", "Precision", "Recall", "F1", "Support"], + # floatfmt="0.4f", + ) + size = len(table.splitlines()[0]) + print("="*size) + print(table) + print("-"*size) + super(LinePerIterationTrainer, self).on_optimization_end(log) + + diff --git a/sklearn_crfsuite/utils.py b/sklearn_crfsuite/utils.py new file mode 100644 index 0000000..7999ba8 --- /dev/null +++ b/sklearn_crfsuite/utils.py @@ -0,0 +1,12 @@ +# -*- coding: utf-8 -*- +from itertools import chain + + +def flatten(y): + """ + Flatten a list of lists. 
+ + >>> flatten([[1,2], [3,4]]) + [1, 2, 3, 4] + """ + return list(chain.from_iterable(y)) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..93f710a --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import +import pytest + + +@pytest.fixture() +def xseq(): + return [ + {'walk': 1, 'shop': 0.5}, + {'walk': 1}, + {'walk': 1, 'clean': 0.5}, + {u'shop': 0.5, u'clean': 0.5}, + {'walk': 0.5, 'clean': 1}, + {'clean': 1, u'shop': 0.1}, + {'walk': 1, 'shop': 0.5}, + {}, + {'clean': 1}, + {u'солнце': u'не светит'.encode('utf8'), 'clean': 1}, + ] + +@pytest.fixture +def yseq(): + return ['sunny', 'sunny', u'sunny', 'rainy', 'rainy', 'rainy', + 'sunny', 'sunny', 'rainy', 'rainy'] diff --git a/tests/test_crf.py b/tests/test_crf.py new file mode 100644 index 0000000..803e689 --- /dev/null +++ b/tests/test_crf.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +import os +import pickle + +import pytest +from sklearn.cross_validation import cross_val_score + +from sklearn_crfsuite import CRF + + +ALGORITHMS = ["lbfgs", "l2sgd", "pa", "ap", "arow"] + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_crf(xseq, yseq, algorithm): + crf = CRF(algorithm) + crf.fit([xseq], [yseq]) + + y_pred = crf.predict([xseq]) + if algorithm != 'ap': # Averaged Perceptron is regularized too much + assert y_pred == [yseq] + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +@pytest.mark.parametrize("use_dev", [True, False]) +def test_crf_verbose(xseq, yseq, algorithm, use_dev): + crf = CRF(algorithm, verbose=True) + + if use_dev: + X_dev, y_dev = [xseq], [yseq] + else: + X_dev, y_dev = None, None + + crf.fit( + X=[xseq, xseq], + y=[yseq, yseq], + X_dev=X_dev, + y_dev=y_dev + ) + y_pred = crf.predict([xseq]) + if algorithm != 'ap': # Averaged Perceptron is regularized too much + assert y_pred == [yseq] + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_crf_marginals(xseq, yseq, 
algorithm): + crf = CRF(algorithm) + crf.fit([xseq], [yseq]) + + y_pred_marginals = crf.predict_marginals([xseq]) + assert len(y_pred_marginals) == 1 + marginals = y_pred_marginals[0] + assert len(marginals) == len(yseq) + + labels = crf.tagger.labels() + for m in marginals: + assert isinstance(m, dict) + assert set(m.keys()) == set(labels) + assert abs(sum(m.values()) - 1.0) < 1e-6 + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_predict_without_fit(xseq, algorithm): + crf = CRF(algorithm) + with pytest.raises(Exception): + crf.predict([xseq]) + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_crf_score(xseq, yseq, algorithm): + crf = CRF(algorithm) + crf.fit([xseq], [yseq]) + + score = crf.score([xseq], [yseq]) + if algorithm != 'ap': + assert score == 1.0 + else: # Averaged Perceptron is regularized too much + assert score > 0.8 + + +@pytest.mark.parametrize("algorithm", ALGORITHMS) +def test_crf_pickling(xseq, yseq, algorithm): + crf = CRF(algorithm=algorithm) + crf.fit([xseq], [yseq]) + data = pickle.dumps(crf, protocol=pickle.HIGHEST_PROTOCOL) + + crf2 = pickle.loads(data) + score = crf2.score([xseq], [yseq]) + if algorithm != 'ap': + assert score == 1.0 + else: # Averaged Perceptron is regularized too much + assert score > 0.8 + assert crf2.algorithm == algorithm + + +def test_crf_model_filename(xseq, yseq, tmpdir): + path = os.path.join(str(tmpdir), "foo.crfsuite") + assert not os.path.exists(path) + + # model file is created at a specified location + crf = CRF(model_filename=path) + crf.fit([xseq], [yseq]) + assert os.path.exists(path) + + # it is possible to load the model just by passing a file name + crf2 = CRF(model_filename=path) + assert crf2.score([xseq], [yseq]) == 1.0 + + # crf is picklable + data = pickle.dumps(crf, protocol=pickle.HIGHEST_PROTOCOL) + crf3 = pickle.loads(data) + assert crf3.score([xseq], [yseq]) == 1.0 + + +def test_cross_validation(xseq, yseq): + crf = CRF() + X = [xseq] * 20 + y = [yseq] * 20 + 
scores = cross_val_score(crf, X, y, n_jobs=5, cv=5) + assert scores.mean() == 1.0 diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..d7fc4c5 --- /dev/null +++ b/tox.ini @@ -0,0 +1,15 @@ +[tox] +envlist = py27,py33,py34,py35 + +[testenv] +deps= + pytest + pytest-cov + numpy + +commands= + pip install -U wheel pip + pip install scipy + pip install scikit-learn + pip install -e . + py.test --doctest-modules --cov=sklearn_crfsuite --cov-report= {posargs: sklearn_crfsuite tests}