diff --git a/.gitignore b/.gitignore index 6aff18a6..bb6f8320 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ __pycache__/ *.py[cod] *~ ipython/.ipynb_checkpoints +.pytest_cache # C extensions *.so diff --git a/doc/revscoring.datasources.session_oriented.rst b/doc/revscoring.datasources.session_oriented.rst new file mode 100644 index 00000000..48b87d99 --- /dev/null +++ b/doc/revscoring.datasources.session_oriented.rst @@ -0,0 +1,4 @@ +revscoring.datasources.session_oriented +======================================= + +.. automodule:: revscoring.datasources.session_oriented diff --git a/docs/api_reference.rst b/docs/api_reference.rst index c914f864..4c79c899 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -15,6 +15,7 @@ Subpackages revscoring.datasources revscoring.datasources.meta revscoring.datasources.revision_oriented + revscoring.datasources.session_oriented revscoring.extractors revscoring.features revscoring.features.meta diff --git a/docs/conf.py b/docs/conf.py index ca27a6f5..d532856e 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -20,12 +20,12 @@ import sys import alabaster -import revscoring dir_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) print(dir_path) sys.path.insert(0, dir_path) +import revscoring # -- General configuration ------------------------------------------------ diff --git a/docs/revscoring.datasources.session_oriented.rst b/docs/revscoring.datasources.session_oriented.rst new file mode 100644 index 00000000..48b87d99 --- /dev/null +++ b/docs/revscoring.datasources.session_oriented.rst @@ -0,0 +1,4 @@ +revscoring.datasources.session_oriented +======================================= + +.. automodule:: revscoring.datasources.session_oriented diff --git a/revscoring/datasources/__init__.py b/revscoring/datasources/__init__.py index ab76a830..ff7e4bfe 100644 --- a/revscoring/datasources/__init__.py +++ b/revscoring/datasources/__init__.py @@ -8,7 +8,8 @@ :class:`~revscoring.Datasource` processors are tended to be :func:`~revscoring.dependencies.solve`'d as dependencies. The provided datasources are split conceptually into a set of modules. Currently, -there is one module: :mod:`~revscoring.datasources.revision_oriented`. +there are two modules: :mod:`~revscoring.datasources.revision_oriented` and +:mod:`~revscoring.datasources.session_oriented`. Meta-datasources ++++++++++++++++ @@ -22,9 +23,6 @@ ++++++++++++ .. 
automodule:: revscoring.datasources.datasource - - - """ from .datasource import Datasource diff --git a/revscoring/datasources/meta/expanders.py b/revscoring/datasources/meta/expanders.py new file mode 100644 index 00000000..ab1e5331 --- /dev/null +++ b/revscoring/datasources/meta/expanders.py @@ -0,0 +1,13 @@ +from ..datasource import Datasource + + +class list_of(Datasource): + + def __init__(self, dependent, depends_on=None, name=None): + name = self._format_name(name, [dependent]) + super().__init__( + name, self.process, depends_on=depends_on) + self.dependency = dependent + + def process(self, *lists_of_values): + return [self.dependency(*values) for values in zip(*lists_of_values)] diff --git a/revscoring/datasources/meta/selectors.py b/revscoring/datasources/meta/selectors.py index 39a5acd7..481e6123 100644 --- a/revscoring/datasources/meta/selectors.py +++ b/revscoring/datasources/meta/selectors.py @@ -116,8 +116,7 @@ class filter_keys(Datasource): :Parameters: table_datasource : :class:`revscoring.Datasource` - A datasource that generates a table including only the specified - keys + A datasource that generates a table with keys and values keys : `iterable` ( `hashable` ) The keys to select from the table name : `str` @@ -138,3 +137,43 @@ def process(self, table): new_table[key] = table[key] return new_table + + +class first(Datasource): + """ + Returns the first item in an indexable collection (e.g., a list) + + :Parameters: + items_datasource : :class:`revscoring.Datasource` + A datasource that generates an indexable sequence + name : `str` + A name for the datasource + """ + + def __init__(self, items_datasource, name=None): + name = self._format_name( + name, [items_datasource]) + super().__init__(name, self.process, depends_on=[items_datasource]) + + def process(self, items): + return items[0] + + +class last(Datasource): + """ + Returns the last item in an indexable collection (e.g., a list) + + :Parameters: + items_datasource : :class:`revscoring.Datasource` + A datasource that generates an indexable sequence + name : `str` + A name for the datasource + """ + + def __init__(self, items_datasource, name=None): + name = self._format_name( + name, [items_datasource]) + super().__init__(name, self.process, depends_on=[items_datasource]) + + def process(self, items): + return items[-1] diff --git a/revscoring/datasources/session_oriented.py b/revscoring/datasources/session_oriented.py new file mode 100644 index 00000000..42b039d5 --- /dev/null +++ b/revscoring/datasources/session_oriented.py @@ -0,0 +1,181 @@ +""" +Implements a set of datasources oriented off of a single session -- an +ordered sequence of revisions saved by a user. This is +useful for extracting features of edit and article quality. + +.. autodata:: revscoring.datasources.session_oriented.session + +Supporting classes +++++++++++++++++++ + +.. autoclass:: revscoring.datasources.session_oriented.Session + :members: + :member-order: bysource + +Supporting functions +++++++++++++++++++++ + +.. autofunction:: revscoring.datasources.session_oriented.list_of_tree + +.. 
autofunction:: revscoring.datasources.session_oriented.list_of_ify """ +import logging +import re +from functools import wraps +from inspect import getmembers, ismethod + +from revscoring import Feature, FeatureVector +from revscoring.features.meta import expanders as feature_expanders + +from ..dependencies import DependentSet +from .datasource import Datasource +from .meta import expanders as datasource_expanders +from .revision_oriented import Revision, User + +logger = logging.getLogger(__name__) + + +def list_of_tree(dependent_set, rewrite_name=None, cache=None): + """ + Converts a :class:`~revscoring.DependentSet` and all of the + :class:`~revscoring.Dependent` processors it names into a new + :class:`~revscoring.DependentSet` with + :func:`~revscoring.datasources.session_oriented.list_of_ify` applied. + + :Parameters: + dependent_set : :class:`~revscoring.DependentSet` + A dependent set to convert + rewrite_name : function + A function to apply to the dependent's name when re-creating it. + cache : dict(:class:`~revscoring.Feature` | :class:`~revscoring.FeatureVector` | :class:`~revscoring.Datasource`) + A map of dependents that have already been converted. + """ + logger.debug("Applying list_of_tree to {0}".format(dependent_set.name)) + cache = cache if cache is not None else {} + rewrite_name = rewrite_name if rewrite_name is not None else \ lambda name: name + + # Rewrites all dependents. + for attr, dependent in dependent_set.dependents.items(): + new_dependent = list_of_ify(dependent, rewrite_name, cache) + setattr(dependent_set, attr, new_dependent) + + # Iterate into all sub-DependentSets + for attr, sub_dependent_set in dependent_set.dependent_sets.items(): + if attr.startswith("_"): + pass + else: + logger.debug("Running list_of_tree on {0}".format(attr)) + new_dependent_set = list_of_tree( + sub_dependent_set, rewrite_name, cache) + setattr(dependent_set, attr, new_dependent_set) + + # Iterate into all meta-dependents (methods that return a new dependent) + for attr, method in getmembers(dependent_set, ismethod): + if not hasattr(method, "meta_dependent"): + pass + else: + list_of_meta_method = meta_list_of_ify( + method, rewrite_name, cache) + setattr(dependent_set, attr, list_of_meta_method) + + return dependent_set + + +def list_of_ify(dependent, rewrite_name, cache): + """ + Converts any :class:`~revscoring.Feature`, + :class:`~revscoring.FeatureVector`, or :class:`~revscoring.Datasource` into + an equivalent "list of" the same dependent. Dependencies are converted + recursively and a cache is maintained for memoization. + + :Parameters: + dependent : (:class:`~revscoring.Feature` | :class:`~revscoring.FeatureVector` | :class:`~revscoring.Datasource`) + A dependent to convert + rewrite_name : function + A function to apply to the dependent's name when re-creating it. + cache : dict(:class:`~revscoring.Feature` | :class:`~revscoring.FeatureVector` | :class:`~revscoring.Datasource`) + A map of dependents that have already been converted. 
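
    As an illustrative sketch (not part of the patch; the datasource names
    are hypothetical and revscoring's `Datasource` and `solve` are assumed
    importable), `list_of_ify` turns a per-revision datasource into one
    that maps over a session's revisions:

    from revscoring.datasources import Datasource
    from revscoring.datasources.session_oriented import list_of_ify
    from revscoring.dependencies import solve

    text = Datasource("revision.text")
    text_length = Datasource("revision.text_length", len, depends_on=[text])

    def rewrite(name):
        # Mirrors rewrite_name() below, but for a bare "revision." prefix
        return name.replace("revision.", "session.revisions.", 1)

    session_lengths = list_of_ify(text_length, rewrite, cache={})
    print(session_lengths.name)  # "session.revisions.text_length"

    # The converted dependency tree consumes lists -- one value per revision:
    texts_list = session_lengths.dependencies[0]
    print(solve(session_lengths, cache={texts_list: ["foo", "quux"]}))  # [3, 4]
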
+ """ + + new_name = rewrite_name(dependent.name) + if new_name in cache: + logger.debug("list_of_ify {0} in the cache".format(dependent.name)) + return cache[new_name] + else: + logger.debug("list_of_ify is modifying {0} into a list_of".format(dependent.name)) + new_dependencies = [list_of_ify(dependency, rewrite_name, cache) + for dependency in dependent.dependencies] + + if isinstance(dependent, Datasource): + new_dependent = datasource_expanders.list_of( + dependent, depends_on=new_dependencies, name=new_name) + elif isinstance(dependent, FeatureVector): + new_dependent = datasource_expanders.list_of( + dependent, depends_on=new_dependencies, name=new_name) + elif isinstance(dependent, Feature): + new_dependent = feature_expanders.list_of( + dependent, depends_on=new_dependencies, name=new_name) + else: + raise TypeError("Cannot convert type {0} into a list_of" + .format(type(dependent))) + + cache[new_name] = new_dependent + return cache[new_name] + + +def meta_list_of_ify(method, rewrite_name, cache): + @wraps(method) + def wrapper(*args, **kwargs): + dependent = method(*args, **kwargs) + return list_of_ify(dependent, rewrite_name, cache) + + return wrapper + + +def rewrite_name(name): + return re.sub(r"(^|\.)revision\.", r"\1session.revisions.", name) + + +class Session(DependentSet): + """ + Represents a session -- an ordered list of revisions + """ + def __init__(self, name): + super().__init__(name) + self.revisions = list_of_tree(Revision( + "session.revisions", + include_page_creation=True, + include_content=True, + include_user=False, + include_page_suggested=True), + rewrite_name=rewrite_name) + """ + :class:`revscoring.datasources.revision_oriented.Revision`: modified by + :func:`~revscoring.datasources.session_oriented.list_of_tree()` + """ + + self.user = User( + name + ".user", + include_info=True, + include_last_revision=True + ) + """ + :class:`revscoring.datasources.revision_oriented.User` + """ + +session = Session("session") +""" +Represents the session of interest. Implements this structure: + +* session: :class:`~revscoring.datasources.session_oriented.Session` + * revisions: :class:`~revscoring.datasources.revision_oriented.Revision` + * diff: :class:`~revscoring.datasources.revision_oriented.Diff` + * page: :class:`~revscoring.datasources.revision_oriented.Page` + * namespace: :class:`~revscoring.datasources.revision_oriented.Namespace` + * creation: :class:`~revscoring.datasources.revision_oriented.Revision` + * parent: :class:`~revscoring.datasources.revision_oriented.Revision` + * user: :class:`~revscoring.datasources.revision_oriented.User` + * user: :class:`~revscoring.datasources.revision_oriented.User` + * info: :class:`~revscoring.datasources.revision_oriented.UserInfo` + * last_revision: :class:`~revscoring.datasources.revision_oriented.Revision` +""" # noqa diff --git a/revscoring/dependencies/dependent.py b/revscoring/dependencies/dependent.py index e73942bf..ccef2a87 100644 --- a/revscoring/dependencies/dependent.py +++ b/revscoring/dependencies/dependent.py @@ -7,7 +7,6 @@ :members: """ import logging -import pickle logger = logging.getLogger(__name__) @@ -68,26 +67,6 @@ def __str__(self): def __repr__(self): return "<" + self.__str__() + ">" - @classmethod - def load(cls, f): - """ - Reads serialized model information from a file. - """ - if hasattr(f, 'buffer'): - return pickle.load(f.buffer) - else: - return pickle.load(f) - - def dump(self, f): - """ - Writes serialized model information to a file. 
- """ - - if hasattr(f, 'buffer'): - return pickle.dump(self, f.buffer) - else: - return pickle.dump(self, f) - class DependentSet: """ @@ -99,33 +78,45 @@ class DependentSet: A base name for the items in the set """ - def __init__(self, name, _dependents=None, _dependent_sets=None): - self._dependents = _dependents or set() - self._dependent_sets = _dependent_sets or set() - self._name = name + def __init__(self, name, dependents=None, dependent_sets=None, + meta_dependents=None): + self.dependents = dependents or {} + self.dependent_sets = dependent_sets or {} + self.meta_dependents = meta_dependents or {} + self.name = name def __setattr__(self, attr, value): super().__setattr__(attr, value) if isinstance(value, Dependent): logger.log(logging.NOTSET, - "Registering {0} to {1}".format(value, self._name)) - if value in self._dependents: + "Registering {0} to {1}".format(value, self.name)) + if value in self.dependents: logger.warn("{0} has already been added to {1}. Could be " .format(value, self) + "overwritten?") - self._dependents.add(value) + self.dependents[attr] = value elif isinstance(value, DependentSet): - self._dependent_sets.add(value) + self.dependent_sets[attr] = value + else: + pass # Just set it like a regular attribute + + @classmethod + def meta_dependent(cls, method): + """ + A decorator for applying to methods that return a dependent value. + """ + method.meta_dependent = True + return method # String methods def __str__(self): return self.__repr__() def __repr__(self): - return "{" + self._name + "}" + return "{" + self.name + "}" def __hash__(self): - return hash('dependent_set.' + self._name) + return hash('dependent_set.' + self.name) def __eq__(self, other): return hash(self) == hash(other) @@ -133,21 +124,24 @@ def __eq__(self, other): def __ne__(self, other): return not self == other + def all_dependencies(self): + return set(self.dependents.values()).union(*self.dependent_sets.values()) + # Set methods def __len__(self): - return len(self._dependents.union(*self._dependent_sets)) + return len(self.all_dependencies()) def __contains__(self, item): - return item in self._dependents.union(*self._dependent_sets) + return item in self.all_dependencies() def __iter__(self): - return iter(self._dependents.union(*self._dependent_sets)) + return iter(self.all_dependencies()) def __sub__(self, other): - return self._dependents.union(*self._dependent_sets) - other + return self.all_dependencies() - other def __and__(self, other): - return self._dependents.union(*self._dependent_sets) & other + return self.all_dependencies() & other def __or__(self, other): - return self._dependents.union(*self._dependent_sets) | other + return self.all_dependencies() | other diff --git a/revscoring/extractors/api/datasources.py b/revscoring/extractors/api/datasources.py index 9485287d..64df2425 100644 --- a/revscoring/extractors/api/datasources.py +++ b/revscoring/extractors/api/datasources.py @@ -9,7 +9,7 @@ class RevDocById(Datasource): def __init__(self, revision, extractor): self.revision = revision self.extractor = extractor - super().__init__(revision._name + ".doc", self.process, + super().__init__(revision.name + ".doc", self.process, depends_on=[revision.id, extractor.dependents]) def process(self, rev_id, dependents): @@ -34,7 +34,7 @@ class PageCreationRevDoc(Datasource): def __init__(self, page, extractor): self.page = page self.extractor = extractor - super().__init__(page.creation._name + ".doc", self.process, + super().__init__(page.creation.name + ".doc", self.process, 
depends_on=[page.id, extractor.dependents]) def process(self, page_id, dependents): @@ -77,7 +77,7 @@ class UserInfoDoc(Datasource): def __init__(self, user, extractor): self.user = user self.extractor = extractor - super().__init__(user.info._name + ".doc", self.process, + super().__init__(user.info.name + ".doc", self.process, depends_on=[user.id, user.text]) def process(self, user_id, user_text): @@ -99,7 +99,7 @@ def __init__(self, revision, extractor): self.revision = revision self.extractor = extractor super().__init__( - revision.user.last_revision._name + ".doc", self.process, + revision.user.last_revision.name + ".doc", self.process, depends_on=[revision.user.text, revision.timestamp, extractor.dependents] ) diff --git a/revscoring/extractors/api/extractor.py b/revscoring/extractors/api/extractor.py index 5066e86b..758673aa 100644 --- a/revscoring/extractors/api/extractor.py +++ b/revscoring/extractors/api/extractor.py @@ -3,11 +3,11 @@ import mwapi +from . import datasources +from .. import Extractor as BaseExtractor from ...datasources import Datasource, revision_oriented from ...dependencies import expand from ...errors import QueryNotSupported, RevisionNotFound, UserNotFound -from .. import Extractor as BaseExtractor -from . import datasources from .revision_oriented import Revision from .util import REV_PROPS, USER_PROPS @@ -45,7 +45,7 @@ def get_property_suggestion_search_doc(self, page): return datasources.PropertySuggestionDoc(page, self) def extract(self, rev_ids, dependents, context=None, caches=None, - cache=None, profile=None): + cache=None, profile=None, orientation="revision"): """ Extracts a values for a set of :class:`~revscoring.dependents.dependent.Dependent` (e.g. diff --git a/revscoring/extractors/api/revision_oriented.py b/revscoring/extractors/api/revision_oriented.py index 62dfa0dd..a9ddd862 100644 --- a/revscoring/extractors/api/revision_oriented.py +++ b/revscoring/extractors/api/revision_oriented.py @@ -10,7 +10,7 @@ class Revision(DependentSet): def __init__(self, revision, extractor, rev_doc, id_datasource=None): - super().__init__(revision._name) + super().__init__(revision.name) self.doc = rev_doc @@ -47,9 +47,9 @@ def __init__(self, revision, extractor, rev_doc, id_datasource=None): class RevisionPage(DependentSet): def __init__(self, page, extractor, rev_doc): - super().__init__(page._name) + super().__init__(page.name) namespace_title = Datasource( - page._name + ".namespace_title", normalize_title, + page.name + ".namespace_title", normalize_title, depends_on=[rev_doc] ) self.id = key(['page', 'pageid'], rev_doc, name=page.id.name) @@ -70,7 +70,7 @@ def __init__(self, page, extractor, rev_doc): class PageSuggested(DependentSet): def __init__(self, page, extractor): - super().__init__(page.suggested._name) + super().__init__(page.suggested.name) if hasattr(page.suggested, "properties"): property_suggestion_doc = \ @@ -82,7 +82,7 @@ def __init__(self, page, extractor): class Namespace(DependentSet): def __init__(self, namespace, extractor, rev_doc, namespace_title): - super().__init__(namespace._name) + super().__init__(namespace.name) self.id = key(['page', 'ns'], rev_doc, name=namespace.id.name) self.name = Datasource(namespace.name.name, first, depends_on=[namespace_title]) @@ -91,7 +91,7 @@ def __init__(self, namespace, extractor, rev_doc, namespace_title): class RevisionUser(DependentSet): def __init__(self, revision, extractor, rev_doc): - super().__init__(revision.user._name) + super().__init__(revision.user.name) self.id = key('userid', 
rev_doc, name=revision.user.id.name, if_missing=(UserDeleted, revision.user)) self.text = key('user', rev_doc, name=revision.user.text.name, @@ -109,7 +109,7 @@ def __init__(self, revision, extractor, rev_doc): class RevisionUserInfo(DependentSet): def __init__(self, user, extractor): - super().__init__(user.info._name) + super().__init__(user.info.name) self.doc = extractor.get_user_info_doc(user) self.editcount = key('editcount', self.doc, diff --git a/revscoring/features/__init__.py b/revscoring/features/__init__.py index 02e767de..f0ea9a66 100644 --- a/revscoring/features/__init__.py +++ b/revscoring/features/__init__.py @@ -49,12 +49,10 @@ ++++++++++++ .. automodule:: revscoring.features.feature - -.. automodule:: revscoring.features.feature_vector """ -from .feature import Constant, Feature, Modifier -from .feature_vector import FeatureVector +from .feature import Constant, ConstantVector, Feature, FeatureVector, Modifier from .functions import trim, vectorize_values -__all__ = [Feature, Modifier, Constant, FeatureVector, trim, vectorize_values] +__all__ = [Feature, Modifier, Constant, ConstantVector, FeatureVector, trim, + vectorize_values] diff --git a/revscoring/features/bytes/__init__.py b/revscoring/features/bytes/__init__.py index 33095684..1c55ac9a 100644 --- a/revscoring/features/bytes/__init__.py +++ b/revscoring/features/bytes/__init__.py @@ -4,13 +4,19 @@ .. autodata:: revscoring.features.bytes.revision +.. autodata:: revscoring.features.bytes.session + Supporting classes ++++++++++++++++++ .. autoclass:: revscoring.features.bytes.Revision :members: +.. autoclass:: revscoring.features.bytes.Session + :members: + """ from .revision_oriented import Revision, revision +from .session_oriented import Session, session -__all__ = [revision, Revision] +__all__ = [revision, Revision, session, Session] diff --git a/revscoring/features/bytes/revision_oriented.py b/revscoring/features/bytes/revision_oriented.py index 4f426400..6811061f 100644 --- a/revscoring/features/bytes/revision_oriented.py +++ b/revscoring/features/bytes/revision_oriented.py @@ -1,8 +1,8 @@ from revscoring.datasources import revision_oriented from revscoring.dependencies import DependentSet -from ..meta import aggregators from . import datasources +from ..meta import aggregators name = "bytes.revision" diff --git a/revscoring/features/bytes/session_oriented.py b/revscoring/features/bytes/session_oriented.py new file mode 100644 index 00000000..3a27aec9 --- /dev/null +++ b/revscoring/features/bytes/session_oriented.py @@ -0,0 +1,26 @@ +from revscoring.datasources import session_oriented +from revscoring.dependencies import DependentSet + +from . import datasources +from .revision_oriented import Revision + +name = "bytes.session" + + +class Session(DependentSet): + """ + Represents an editor's activity session + """ + def __init__(self, name, revisions_datasources): + super().__init__(name) + revision = Revision( + name, datasources.Revision(name, revisions_datasources)) + self.revisions = session_oriented.list_of_tree( + revision, rewrite_name=session_oriented.rewrite_name, + cache={d.name: d for d in revisions_datasources}) + """ + :class:`~revscoring.datasources.meta.expanders.list_of`(:class:`~revscoring.features.bytes.Revision`) : + The revisions saved by the user within the session. 
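
        As a hedged usage sketch (assuming, as in the revision-oriented
        module, that `length` is one of the byte features), each attribute
        under `session.revisions` becomes a feature vector with one value
        per revision in the session:

        from revscoring.features import bytes as bytes_features

        # One byte-length value per revision saved in the session; solving
        # this for a three-revision session might yield, e.g., [1024, 987, 1103].
        length_per_revision = bytes_features.session.revisions.length
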
+ """ + +session = Session(name, session_oriented.session.revisions) diff --git a/revscoring/features/feature.py b/revscoring/features/feature.py index 65836d48..8e99f2a2 100644 --- a/revscoring/features/feature.py +++ b/revscoring/features/feature.py @@ -2,20 +2,24 @@ .. autoclass:: revscoring.Feature :members: +.. autoclass:: revscoring.FeatureVector + :members: + .. autoclass:: revscoring.features.Modifier :members: .. autoclass:: revscoring.features.Constant :members: + +.. autoclass:: revscoring.features.ConstantVector + :members: + """ -from math import log as math_log +import operator +from itertools import repeat from revscoring.dependencies import Dependent -# Sets up refences to overloaded function names -math_max = max -math_min = min - class Feature(Dependent): """ @@ -45,6 +49,23 @@ def __call__(self, *args, **kwargs): else: return value + def _format_name(self, name, args, func_name=None): + arg_names = [] + for arg in args: + if isinstance(arg, Constant) or isinstance(arg, ConstantVector): + arg_names.append(repr(arg.value)) + elif isinstance(arg, Feature): + arg_names.append(arg.name) + else: + arg_names.append(repr(arg)) + + if name is None: + name = "{0}({1})" \ + .format(func_name or self.__class__.__name__, + ", ".join(arg_names)) + + return name + def __hash__(self): return hash('feature.' + self.name) @@ -104,10 +125,30 @@ def validate(self, value): def or_constant(self, val): if isinstance(val, Feature): return val + elif isinstance(val, list): + return ConstantVector(val) else: return Constant(val) +class FeatureVector(Feature): + + def validate(self, vector): + for i, value in enumerate(vector): + if not isinstance(value, self.returns): + raise ValueError( + "Expected {0}, but got {1} instead at position {2}." + .format(self.returns, type(value), i)) + + return vector + + def __hash__(self): + return hash('feature_vector.' + self.name) + + def __str__(self): + return "feature_vector." + self.name + + class Constant(Feature): """ A special sub-type of `revscoring.Feature` that returns a constant value. @@ -130,271 +171,255 @@ def _process(self): return self.value -class Modifier(Feature): +class ConstantVector(FeatureVector): """ - Represents a modification of one or more predictive feature. + A special sub-type of `revscoring.Feature` that returns a constant value. :Parameters: - name : str - The name of the feature - process : `func` - A function that will generate a feature value - return_type : `type` - A type to compare the return of this function to. - dependencies : `list`(`hashable`) - An ordered list of dependencies that correspond - to the `*args` of `process` + value : `mixed` + Any type of potential feature value + name : `str` + A name to give the feature """ - pass - - -class BinaryOperator(Modifier): - - CHAR = "?" - - def __init__(self, left, right, returns=None, name=None): - left = Feature.or_constant(left) - right = Feature.or_constant(right) + def __init__(self, values, name=None): + self.value = values if name is None: - name = "({0} {1} {2})".format(left.name, self.CHAR, right.name) - - if returns is None: - returns = type(self.operate(left.returns(), right.returns())) - - super().__init__(name, self.operate, returns=returns, - depends_on=[left, right]) - - def operate(self, left, right): - raise NotImplementedError() - - -class add(BinaryOperator): - """ - Generates a feature that represents the addition of - two :class:`revscoring.Feature` or constant values. 
- """ - - CHAR = "+" - - def operate(self, left, right): - return left + right - - -class sub(BinaryOperator): - """ - Generates a feature that represents the subtraction of - two :class:`revscoring.Feature` or constant values. - """ - - CHAR = "-" + name = str(values) + super().__init__(name, self._process, + returns=type(values[0]), depends_on=[]) - def operate(self, left, right): - return left - right + def _process(self): + return self.value -class mul(BinaryOperator): +class Modifier: """ - Generates a feature that represents the multiplacation of - two :class:`revscoring.Feature` or constant values. + A special type of feature that modifies or re-scales another feature. + Modifiers are intended to highlight the signal from features but do not + lend to human interpretability. """ - CHAR = "*" - - def operate(self, left, right): - return left * right - -class div(BinaryOperator): +class FunctionApplier(Modifier): """ - Generates a feature that represents the division of - two :class:`revscoring.Feature` or constant values. + An abstract base class for defining a feature that applies a function onto + one or more other features. """ - - CHAR = "/" - - def __init__(self, left, right, name=None): - # Explicitly setting return type to float. - super().__init__(left, right, returns=float, name=name) - - def operate(self, left, right): - return left / right - - -class Comparison(BinaryOperator): - - def __init__(self, left, right, name=None): - # Explicitly setting return type to boolean. - super().__init__(left, right, returns=bool, name=name) + def __init__(self, *arguments, func, name=None, returns=float): + if name is None: + name = self._format_name( + name, list(arguments), func_name=func.__name__) + super().__init__(name, self.process, depends_on=arguments, + returns=returns) + self.func = func -class gt(Comparison): +class SingletonFunctionApplier(FunctionApplier, Feature): """ - Generates a feature that represents the greater-than relationship of - two :class:`revscoring.Feature` or constant values. + A special type of Feature that applies a function to one or more other + features. """ + def process(self, *arg_vals): + return self.returns(self.func(*arg_vals)) - CHAR = ">" - - def operate(self, left, right): - return left > right - -class lt(Comparison): +class VectorFunctionApplier(FunctionApplier, FeatureVector): """ - Generates a feature that represents the less-than relationship of - two :class:`revscoring.Feature` or constant values. + A special type of FeatureVector that maps a function over one or more + feature vector. """ + def process(self, *arg_vectors): + arg_vectors = self.normalize_vectors(arg_vectors) + return [self.returns(self.func(*arg_vals)) + for arg_vals in zip(*arg_vectors)] - CHAR = "<" - - def operate(self, left, right): - return left < right + def normalize_vectors(self, arg_vectors): + """ + Checks whether all vectors are the same length and repeats singleton + values so that they can be repeatedly applied against vectors. 
+ """ + vector_length = max(len(av) for av in arg_vectors + if isinstance(av, list)) + normalized_vectors = [] + for dependency, arg_vector in zip(self.dependencies, arg_vectors): + if isinstance(dependency, FeatureVector): + if vector_length != len(arg_vector): + raise ValueError( + ("Length of value for {0} ({1}) does not " + + "match the length of other vectors ({2})") + .format(dependency, len(arg_vector), vector_length)) + else: + normalized_vectors.append(arg_vector) + else: + normalized_vectors.append(repeat(arg_vector, vector_length)) -class ge(Comparison): - """ - Generates a feature that represents the greater-than-or-equal relationship - of two :class:`revscoring.Feature` or constant values. - """ + return normalized_vectors - CHAR = ">=" - def operate(self, left, right): - return left >= right - - -class le(Comparison): +def function_applier(func): """ - Generates a feature that represents the less-than-or-equal relationship of - two :class:`revscoring.Feature` or constant values. - """ - - CHAR = "<=" - - def operate(self, left, right): - return left <= right - - -class eq(Comparison): + A decorator for building a FunctionApplier. The decorated function should + take a list of of `arguments` (Feature or FeatureVector), a name for the + new feature, and a `returns` type and return a function to call, a name, + and `returns` type. These values will be used to construct a + SingletonFunctionApplier or VectorFunctionApplier as appropriate. """ - Generates a feature that represents the equality of two - :class:`revscoring.Feature` or constant values. - """ - - CHAR = "==" - - def operate(self, left, right): - return left == right + def wrapper(*arguments, name=None, returns=None): + arguments = [Feature.or_constant(a) for a in arguments] + func_tocall, name, returns = func(*arguments, name, returns) + if returns is None: + returns = type(func_tocall(*(a.returns() for a in arguments))) + if any(isinstance(a, FeatureVector) for a in arguments): + return VectorFunctionApplier( + *arguments, func=func_tocall, name=name, returns=returns) + else: + return SingletonFunctionApplier( + *arguments, func=func_tocall, name=name, returns=returns) + return wrapper -class ne(Comparison): +def binary_operator(func): """ - Generates a feature that represents the inequality of two - :class:`revscoring.Feature` or constant values. + A decorator for building a FunctionApplier. The decorated function should + take a list of two arguments (left and right) (Feature or FeatureVector) + and a `returns` type and return a function to call, an `operator` string + (e.g. "add" has "+" as an operator string), and `returns` type. These + values will be used to construct a SingletonFunctionApplier or + VectorFunctionApplier as appropriate. 
""" - - CHAR = "!=" - - def operate(self, left, right): - return left != right + def wrapper(left, right, name=None, returns=None): + left = Feature.or_constant(left) + right = Feature.or_constant(right) + func_tocall, operator, returns = func(left, right, returns) + if returns is None: + returns = type(func_tocall(left.returns(), right.returns())) + if name is None: + name = "({0} {1} {2})".format(left.name, operator, right.name) + if isinstance(left, FeatureVector) or isinstance(right, FeatureVector): + return VectorFunctionApplier( + left, right, func=func_tocall, name=name, returns=returns) + else: + return SingletonFunctionApplier( + left, right, func=func_tocall, name=name, returns=returns) + return wrapper -class and_(Comparison): - """ - Generates a feature that represents the conjunction of two - :class:`revscoring.Feature` or constant values. - """ +@binary_operator +def add(left, right, returns): + return operator.add, "+", returns +add.__doc__ = """ +Generates a feature that represents the addition of +two :class:`revscoring.Feature` or constant values. +""" - CHAR = "and" - def operate(self, left, right): - return left and right +@binary_operator +def sub(left, right, returns): + return operator.sub, "-", returns +sub.__doc__ = """ +Generates a feature that represents the subtraction of +two :class:`revscoring.Feature` or constant values. +""" -class or_(Comparison): - """ - Generates a feature that represents the disjunction of two - :class:`revscoring.Feature` or constant values. - """ +@binary_operator +def mul(left, right, returns): + return operator.mul, "*", returns +mul.__doc__ = """ +Generates a feature that represents the multiplacation of +two :class:`revscoring.Feature` or constant values. +""" - CHAR = "or" - def operate(self, left, right): - return left or right +@binary_operator +def div(left, right, returns): + return operator.truediv, "/", returns if returns is not None else float +div.__doc__ = """ +Generates a feature that represents the division of +two :class:`revscoring.Feature` or constant values. +""" -class max(Modifier): - """ - Generates a feature that represents the maximum of a set of - :class:`revscoring.Feature` or constant values. - """ +@binary_operator +def lt(left, right, returns): + return operator.lt, "<", bool +lt.__doc__ = """ +Generates a feature that represents the less-than relationship of +two :class:`revscoring.Feature` or constant values. +""" - def __init__(self, *args, name=None): - dependencies = [Feature.or_constant(arg) for arg in args] - returns = float - # Hardcoded even though max can return strings, it - # shouldn't ever do that - if name is None: - name = "max({0})".format(", ".join(f.name for f in dependencies)) - super().__init__(name, self._process, returns=returns, - depends_on=dependencies) +@binary_operator +def le(left, right, returns): + return operator.le, "<=", bool +le.__doc__ = """ +Generates a feature that represents the less-than-or-equal relationship of +two :class:`revscoring.Feature` or constant values. +""" - def _process(self, *feature_values): - return float(math_max(*feature_values)) +@binary_operator +def gt(left, right, returns): + return operator.gt, ">", bool +gt.__doc__ = """ +Generates a feature that represents the greater-than relationship of +two :class:`revscoring.Feature` or constant values. +""" -class min(Modifier): - """ - Generates a feature that represents the minimum of a set of - :class:`revscoring.Feature` or constant values. 
- """ - def __init__(self, *args, name=None): - dependencies = [Feature.or_constant(arg) for arg in args] - returns = float - # Hardcoded even though max can return strings, it - # shouldn't ever do that +@binary_operator +def ge(left, right, returns): + return operator.ge, ">=", bool +ge.__doc__ = """ +Generates a feature that represents the greater-than-or-equal relationship +of two :class:`revscoring.Feature` or constant values. +""" - if name is None: - name = "min({0})".format(", ".join(f.name for f in dependencies)) - super().__init__(name, self._process, returns=returns, - depends_on=dependencies) - def _process(self, *feature_values): - return float(math_min(*feature_values)) +@binary_operator +def eq(left, right, returns): + return operator.eq, "==", bool +eq.__doc__ = """ +Generates a feature that represents the equality of two +:class:`revscoring.Feature` or constant values. +""" -class log(Modifier): - """ - Generates a feature that represents the log of a - :class:`revscoring.Feature`'s value. - """ +@binary_operator +def ne(left, right, returns): + return operator.ne, "!=", bool +ne.__doc__ = """ +Generates a feature that represents the inequality of two +:class:`revscoring.Feature` or constant values. +""" - def __init__(self, feature, name=None): - feature = Feature.or_constant(feature) - if name is None: - name = "log({0})".format(feature.name) - super().__init__(name, self._process, - returns=float, depends_on=[feature]) - def _process(self, feature_value): - return math_log(feature_value) +@binary_operator +def or_(left, right, returns): + return operator.or_, "or", bool +or_.__doc__ = """ +Generates a feature that represents the disjunction of two +:class:`revscoring.Feature` or constant values. +""" -class not_(Modifier): - """ - Generates a feature that represents the negation of a - :class:`revscoring.Feature`'s value. - """ +@binary_operator +def and_(left, right, returns): + return operator.and_, "and", bool +and_.__doc__ = """ +Generates a feature that represents the conjunction of two +:class:`revscoring.Feature` or constant values. +""" - def __init__(self, feature, name=None): - feature = Feature.or_constant(feature) - if name is None: - name = "not {0}".format(feature.name) - super().__init__(name, self._process, - returns=bool, depends_on=[feature]) - def _process(self, feature_value): - return not feature_value +@function_applier +def not_(dependency, name, returns): + if name is None: + name = "(not {0})".format(dependency.name) + return operator.not_, name, bool +not_.__doc__ = """ +Generates a feature that represents the negation of one +:class:`revscoring.Feature` or constant value. +""" diff --git a/revscoring/features/feature_vector.py b/revscoring/features/feature_vector.py deleted file mode 100644 index 87299df7..00000000 --- a/revscoring/features/feature_vector.py +++ /dev/null @@ -1,37 +0,0 @@ -""" -.. autoclass:: revscoring.FeatureVector - :members: -""" -from revscoring.features import Feature - - -class FeatureVector(Feature): - """ - Represents a vector of predictive features. - - :Parameters: - name : str - The name of the feature - process : `func` - A function that will generate a feature value - returns : `type` - A type to compare the return vector of this function to. 
- dependencies : `list`(`hashable`) - An ordered list of dependencies that correspond - to the `*args` of `process` - """ - - def validate(self, vector): - for i, value in enumerate(vector): - if not isinstance(value, self.returns): - raise ValueError( - "Expected {0}, but got {1} instead at position {2}." - .format(self.returns, type(value), i)) - - return vector - - def __hash__(self): - return hash('feature_vector.' + self.name) - - def __str__(self): - return "feature_vector." + self.name diff --git a/revscoring/features/meta/aggregators.py b/revscoring/features/meta/aggregators.py index e239275e..98b03717 100644 --- a/revscoring/features/meta/aggregators.py +++ b/revscoring/features/meta/aggregators.py @@ -2,6 +2,10 @@ These Meta-Features apply an aggregate function to :class:`~revscoring.Datasource` that return lists of values. +.. autoclass revscoring.features.meta.aggregators.any + +.. autoclass revscoring.features.meta.aggregators.all + .. autoclass revscoring.features.meta.aggregators.sum .. autoclass revscoring.features.meta.aggregators.len @@ -11,111 +15,189 @@ .. autoclass revscoring.features.meta.aggregators.min .. autoclass revscoring.features.meta.aggregators.mean + +.. autoclass revscoring.features.meta.aggregators.first + +.. autoclass revscoring.features.meta.aggregators.last """ import statistics -import numpy as np - -from ..feature import Feature -from ..feature_vector import FeatureVector +from ..feature import Feature, FeatureVector +any_builtin = any +all_builtin = all len_builtin = len sum_builtin = sum max_builtin = max min_builtin = min -mean_builtin = statistics.mean -class AggregatorsScalar(Feature): - def __init__(self, items_datasource, func, name=None, returns=float): +def _first(items): + return items[0] + + +def _last(items): + return items[-1] + + +class Aggregator: + + def __init__(self, items_datasource, func, name=None, returns=None, empty_default=None): name = self._format_name( name, [items_datasource], func_name=func.__name__) super().__init__(name, self.process, depends_on=[items_datasource], returns=returns) self.func = func + self.empty_default = empty_default + + +class SingletonAggregator(Aggregator, Feature): def process(self, items): if items is None or len_builtin(items) == 0: - return self.returns() + if self.empty_default is None: + raise ValueError( + "Cannot generate {0} of {1} -- length of zero" + .format(self.func.__name__, self.dependencies[0])) + else: + return self.returns(self.empty_default) else: return self.returns(self.func(items)) -class AggregatorsVector(FeatureVector): - def __init__(self, items_datasource, func, name=None, returns=float): - name = self._format_name( - name, [items_datasource], func_name=func.__name__) - super().__init__(name, self.process, depends_on=[items_datasource], - returns=returns) - self.func = func +class VectorAggregator(Aggregator, FeatureVector): - def process(self, items): - if len_builtin(items) == 0 or items[0] is None or \ - len_builtin(items[0]) == 0: - return [self.returns()] + def process(self, vectors): + if vectors is None or len_builtin(vectors) == 0 or \ + vectors[0] is None or len_builtin(vectors[0]) == 0: + if self.empty_default is None: + raise ValueError( + "Cannot generate {0} of {1} -- length of zero" + .format(self.func.__name__, self.dependencies[0])) + else: + return [self.returns(self.empty_default)] else: - return_func = np.vectorize(self.returns) - # apply the function over each row - return return_func(np.apply_along_axis( - self.func, 0, np.array(items, 
dtype=self.returns))).tolist() + return [self.returns(self.func(vals)) for vals in zip(*vectors)] -def aggregators_factory(func): - def wrapper(items_datasource, name=None, returns=float, vector=False): - func_tocall = func(items_datasource, name, returns) +def aggregator(func): + def wrapper(items_datasource, name=None, returns=None, empty_default=None, vector=False): + func_tocall, name, returns, empty_default = \ + func(items_datasource, name, returns, empty_default) if vector: - return AggregatorsVector( - items_datasource, func_tocall, name, returns) + return VectorAggregator( + items_datasource, func=func_tocall, empty_default=empty_default, + name=name, returns=returns) else: - return AggregatorsScalar( - items_datasource, func_tocall, name, returns) + return SingletonAggregator( + items_datasource, func=func_tocall, empty_default=empty_default, + name=name, returns=returns) return wrapper -@aggregators_factory -def sum(items_datasource, name=None, returns=float, vector=False): - return sum_builtin +@aggregator +def all(items_datasource, name, returns, empty_default, vector=False): + return all_builtin, name, returns or bool, empty_default or False +all.__doc__ = """ +Constructs a :class:`revscoring.Feature` that returns True when all items +are True. + +:Parameters: + items_datasource : :class:`revscoring.Datasource` + A datasource that returns a collection of items + name : `str` + A name for the feature + returns : `type` + A type to compare the return of this function to. + vector : `bool` + If True, assume that `items_datasource` returns a vector of values. +""" + + +@aggregator +def any(items_datasource, name, returns, empty_default, vector=False): + return any_builtin, name, returns or bool, empty_default or False +any.__doc__ = """ +Constructs a :class:`revscoring.Feature` that returns True when any item +is True. + +:Parameters: + items_datasource : :class:`revscoring.Datasource` + A datasource that returns a collection of items + name : `str` + A name for the feature + returns : `type` + A type to compare the return of this function to. + vector : `bool` + If True, assume that `items_datasource` returns a vector of values. +""" + + +@aggregator +def sum(items_datasource, name, returns, empty_default, vector=False): + returns = returns or float + return sum_builtin, name, returns, empty_default or 0.0 sum.__doc__ = """ - Constructs a :class:`revscoring.Feature` that contains returns the - sum of a collection of items. - - :Parameters: - items_datasource : :class:`revscoring.Datasource` - A datasource that returns a collection of items - name : `str` - A name for the feature - returns : `type` - A type to compare the return of this function to. - vector : `bool` - If True, assume that `items_datasource` returns a vector of values. - """ - - -@aggregators_factory -def len(items_datasource, name=None, returns=int, vector=False): - return len_builtin +sum.__doc__ = """ +Constructs a :class:`revscoring.Feature` that returns the +sum of a collection of items. + +:Parameters: + items_datasource : :class:`revscoring.Datasource` + A datasource that returns a collection of items + name : `str` + A name for the feature + returns : `type` + A type to compare the return of this function to. + vector : `bool` + If True, assume that `items_datasource` returns a vector of values. 
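
A minimal usage sketch (the datasource name is hypothetical):

    from revscoring.datasources import Datasource
    from revscoring.dependencies import solve
    from revscoring.features.meta import aggregators

    byte_diffs = Datasource("byte_diffs")
    total_bytes = aggregators.sum(byte_diffs, name="total_bytes", returns=int)

    print(solve(total_bytes, cache={byte_diffs: [5, 10, 15]}))  # 30
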
+""" + + +@aggregator +def len(items_datasource, name, returns, empty_default, vector=False): + return len_builtin, name, int, empty_default or 0 len.__doc__ = """ - Constructs a :class:`revscoring.Feature` that contains returns the - len of a collection of items. - - :Parameters: - items_datasource : :class:`revscoring.Datasource` - A datasource that returns a collection of items - name : `str` - A name for the feature - returns : `type` - A type to compare the return of this function to. - vector : `bool` - If True, assume that `items_datasource` returns a vector of values. - """ - - -@aggregators_factory -def max(items_datasource, name=None, returns=float, vector=False): - return max_builtin +Constructs a :class:`revscoring.Feature` that returns the length of a +collection of items. + +:Parameters: + items_datasource : :class:`revscoring.Datasource` + A datasource that returns a collection of items + name : `str` + A name for the feature + returns : `type` + A type to compare the return of this function to. + vector : `bool` + If True, assume that `items_datasource` returns a vector of values. +""" + + +@aggregator +def mean(items_datasource, name, returns, empty_default, vector=False): + returns = returns or float + return statistics.mean, name, returns, empty_default or 0.0 +mean.__doc__ = """ +Constructs a :class:`revscoring.Feature` that returns the mean of a +collection of items. + +:Parameters: + items_datasource : :class:`revscoring.Datasource` + A datasource that returns a collection of items + name : `str` + A name for the feature + returns : `type` + A type to compare the return of this function to. + vector : `bool` + If True, assume that `items_datasource` returns a vector of values. +""" + + +@aggregator +def max(items_datasources, name, returns, empty_default, vector=False): + return max_builtin, name, returns or float, empty_default or 0 max.__doc__ = """ -Constructs a :class:`revscoring.Feature` that contains returns the -max of a collection of items. +Constructs a :class:`revscoring.Feature` that returns the maximum of a +collection of items. :Parameters: items_datasource : :class:`revscoring.Datasource` @@ -129,12 +211,12 @@ def max(items_datasource, name=None, returns=float, vector=False): """ -@aggregators_factory -def min(items_datasource, name=None, returns=float, vector=False): - return min_builtin +@aggregator +def min(items_datasources, name, returns, empty_default, vector=False): + return min_builtin, name, returns or float, empty_default or 0 min.__doc__ = """ -Constructs a :class:`revscoring.Feature` that contains returns the -min of a collection of items. +Constructs a :class:`revscoring.Feature` that returns the minimum of a +collection of items. :Parameters: items_datasource : :class:`revscoring.Datasource` @@ -148,12 +230,31 @@ def min(items_datasource, name=None, returns=float, vector=False): """ -@aggregators_factory -def mean(items_datasource, name=None, returns=np.float64, vector=False): - return mean_builtin -mean.__doc__ = """ -Constructs a :class:`revscoring.Feature` that contains returns the -mean of a collection of items. +@aggregator +def first(items_datasource, name, returns, empty_default, vector=False): + return _first, name, returns or float, empty_default or None +first.__doc__ = """ +Constructs a :class:`revscoring.Feature` that returns the first of a +collection of items. 
+ +:Parameters: + items_datasource : :class:`revscoring.Datasource` + A datasource that returns a collection of items + name : `str` + A name for the feature + returns : `type` + A type to compare the return of this function to. + vector : `bool` + If True, assume that `items_datasource` returns a vector of values. +""" + + +@aggregator +def last(items_datasource, name, returns, empty_default, vector=False): + return _last, name, returns or float, empty_default or None +last.__doc__ = """ +Constructs a :class:`revscoring.Feature` that returns the last of a +collection of items. :Parameters: items_datasource : :class:`revscoring.Datasource` A datasource that returns a collection of items diff --git a/revscoring/features/meta/expanders.py b/revscoring/features/meta/expanders.py new file mode 100644 index 00000000..f0e61489 --- /dev/null +++ b/revscoring/features/meta/expanders.py @@ -0,0 +1,19 @@ +""" +These Meta-Features expand a single feature into multiple features. + +.. autoclass revscoring.features.meta.expanders.list_of +""" +from ..feature import FeatureVector + + +class list_of(FeatureVector): + + def __init__(self, feature, depends_on=None, name=None): + name = self._format_name(name, [feature]) + super().__init__( + name, self.process, depends_on=depends_on, + returns=feature.returns) + self.feature = feature + + def process(self, *lists_of_values): + return [self.feature(*values) for values in zip(*lists_of_values)] diff --git a/revscoring/features/meta/operators.py b/revscoring/features/meta/operators.py new file mode 100644 index 00000000..e6dc53f3 --- /dev/null +++ b/revscoring/features/meta/operators.py @@ -0,0 +1,4 @@ +from ..feature import (add, and_, div, eq, ge, gt, le, lt, mul, ne, not_, or_, + sub) + +__all__ = [add, and_, div, eq, ge, gt, le, lt, mul, ne, not_, or_, sub] diff --git a/revscoring/features/meta/rescalers.py b/revscoring/features/meta/rescalers.py new file mode 100644 index 00000000..a5f8f461 --- /dev/null +++ b/revscoring/features/meta/rescalers.py @@ -0,0 +1,20 @@ +import math + +from ..feature import function_applier + +abs_builtin = abs + + +@function_applier +def log(arg, name, returns): + return math.log, name, returns or float + + +@function_applier +def exp(arg, name, returns): + return math.exp, name, returns or float + + +@function_applier +def abs(arg, name, returns): + return abs_builtin, name, returns or float diff --git a/revscoring/features/meta/selectors.py b/revscoring/features/meta/selectors.py new file mode 100644 index 00000000..38f405d9 --- /dev/null +++ b/revscoring/features/meta/selectors.py @@ -0,0 +1,14 @@ +from ..feature import function_applier + +math_min = min +math_max = max + + +@function_applier +def min(first_arg, second_arg, *other_args, name=None, returns=float): + return math_min, name, returns + + +@function_applier +def max(first_arg, second_arg, *other_args, name=None, returns=float): + return math_max, name, returns diff --git a/revscoring/features/meta/vectorizers.py b/revscoring/features/meta/vectorizers.py index f1c1ec0f..8fa39955 100644 --- a/revscoring/features/meta/vectorizers.py +++ b/revscoring/features/meta/vectorizers.py @@ -4,7 +4,7 @@ .. autoclass revscoring.features.meta.vectorizers.vectorize """ -from ..feature_vector import FeatureVector +from ..feature import FeatureVector class vectorize(FeatureVector): diff --git a/revscoring/features/modifiers.py b/revscoring/features/modifiers.py index 64859eb7..ae65928c 100644 --- a/revscoring/features/modifiers.py +++ b/revscoring/features/modifiers.py @@ -26,9 +26,15 @@ .. 
autofunction:: revscoring.features.modifiers.ge .. autofunction:: revscoring.features.modifiers.le +---- + +.. autofunction:: revscoring.features.modifiers.function_applier +.. autofunction:: revscoring.features.modifiers.binary_operator """ -from .feature import (add, and_, div, eq, ge, gt, le, log, lt, max, min, mul, - ne, not_, or_, sub) +from .feature import (add, and_, binary_operator, div, eq, function_applier, + ge, gt, le, lt, mul, ne, not_, or_, sub) +from .meta.rescalers import log +from .meta.selectors import max, min -__all__ = [add, div, eq, ge, gt, le, log, lt, max, min, mul, ne, sub, and_, - or_, not_] +__all__ = [add, div, eq, ge, gt, le, log, lt, mul, ne, sub, and_, + or_, not_, max, min, function_applier, binary_operator] diff --git a/revscoring/features/revision_oriented.py b/revscoring/features/revision_oriented.py index 01ed14db..ea2f0d64 100644 --- a/revscoring/features/revision_oriented.py +++ b/revscoring/features/revision_oriented.py @@ -77,7 +77,7 @@ def comment_matches(self, regex, name=None): regex = re.compile(regex, re.I) if name is None: - name = "{0}({1})".format(self._name + ".comment_matches", + name = "{0}({1})".format(self.name + ".comment_matches", repr(regex.pattern)) return bools.regex_match(regex, self.datasources.comment, @@ -109,7 +109,7 @@ def id_in_set(self, ids, name=None): A name for the new feature. """ if name is None: - name = "{0}({1})".format(self._name + ".id_in_set", repr(ids)) + name = "{0}({1})".format(self.name + ".id_in_set", repr(ids)) return bools.set_contains_item(ids, self.datasources.id, name=name) @@ -128,7 +128,7 @@ def title_matches(self, regex, name=None): regex = re.compile(regex, re.I) if name is None: - name = "{0}({1})".format(self._name + ".title_matches", + name = "{0}({1})".format(self.name + ".title_matches", repr(regex.pattern)) return bools.regex_match(regex, self.datasources.title, name=name) @@ -151,7 +151,7 @@ def id_in_set(self, ids, name=None): A name for the new feature. """ if name is None: - name = "{0}({1})".format(self._name + ".id_in_set", repr(ids)) + name = "{0}({1})".format(self.name + ".id_in_set", repr(ids)) return bools.set_contains_item(ids, self.datasources.id, name=name) @@ -170,7 +170,7 @@ def name_matches(self, regex, name=None): regex = re.compile(regex, re.I) if name is None: - name = "{0}({1})".format(self._name + ".name_matches", + name = "{0}({1})".format(self.name + ".name_matches", repr(regex.pattern)) return bools.regex_match(regex, self.datasources.name, name=name) @@ -182,7 +182,7 @@ def __init__(self, name, user_datasources): super().__init__(name) self.datasources = user_datasources - self.is_anon = Feature(self._name + ".is_anon", _process_is_anon, + self.is_anon = Feature(self.name + ".is_anon", _process_is_anon, returns=bool, depends_on=[self.datasources.id]) def id_in_set(self, ids, name=None): @@ -197,7 +197,7 @@ def id_in_set(self, ids, name=None): A name for the new feature. 
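
        As a hedged sketch (the IDs are hypothetical), these helpers build
        boolean features directly from the revision-oriented tree:

        from revscoring.features import revision_oriented

        is_trusted = revision_oriented.revision.user.id_in_set(
            {12345, 67890}, name="revision.user.is_trusted")
        # Solving is_trusted yields True when the saving user's ID is in
        # the set.
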
""" if name is None: - name = "{0}({1})".format(self._name + ".id_in_set", repr(ids)) + name = "{0}({1})".format(self.name + ".id_in_set", repr(ids)) return bools.set_contains_item(ids, self.datasources.id, name=name) @@ -217,7 +217,7 @@ def text_matches(self, regex, name=None): regex = re.compile(regex, re.I) if name is None: - name = "{0}({1})".format(self._name + ".text_matches", + name = "{0}({1})".format(self.name + ".text_matches", repr(regex.pattern)) return bools.regex_match(regex, self.datasources.text, name=name) @@ -234,7 +234,7 @@ def in_group(self, groups, name=None): A name for the new feature. """ if name is None: - name = "{0}({1})".format(self._name + ".in_group", + name = "{0}({1})".format(self.name + ".in_group", repr(groups)) return bools.sets_intersect(groups, self.datasources.info.groups, diff --git a/revscoring/features/temporal/__init__.py b/revscoring/features/temporal/__init__.py index 81fb14d7..d44cac70 100644 --- a/revscoring/features/temporal/__init__.py +++ b/revscoring/features/temporal/__init__.py @@ -34,6 +34,7 @@ """ from .revision_oriented import (LastUserRevision, Page, PageCreation, ParentRevision, Revision, User, revision) +from .session_oriented import (session, Session, SessionUser, LastSessionUserRevision) __all__ = [revision, Revision, ParentRevision, LastUserRevision, PageCreation, - Page, User] + Page, User, session, Session, SessionUser, LastSessionUserRevision] diff --git a/revscoring/features/temporal/revision_oriented.py b/revscoring/features/temporal/revision_oriented.py index 25b9d494..db940bdc 100644 --- a/revscoring/features/temporal/revision_oriented.py +++ b/revscoring/features/temporal/revision_oriented.py @@ -3,7 +3,6 @@ import mwtypes from pytz import utc - from revscoring.datasources import revision_oriented from revscoring.dependencies import DependentSet @@ -17,7 +16,8 @@ class Revision(DependentSet): "Represents a revision" - def __init__(self, name, revision_datasources): + def __init__(self, name, revision_datasources, + revision_user_datasources=None): super().__init__(name) self.datasources = revision_datasources @@ -86,17 +86,17 @@ def __init__(self, name, revision_datasources): class User(DependentSet): "Represents a revision user" - def __init__(self, name, revision_datasources): + def __init__(self, name, revision_datasources, user_datasources=None): super().__init__(name) - self.datasources = revision_datasources.user + self.datasources = user_datasources or revision_datasources.user if hasattr(self.datasources, 'info'): self.seconds_since_registration = Feature( name + ".seconds_since_registration", _process_seconds_since_registration, returns=int, - depends_on=[revision_datasources.user.id, - revision_datasources.user.info.registration, + depends_on=[self.datasources.id, + self.datasources.info.registration, revision_datasources.timestamp]) """ `int` : The number of seconds since the user registered their @@ -120,14 +120,15 @@ def __init__(self, name, revision_datasources): class LastUserRevision(Revision): "Represents a revision user's last revision" - def __init__(self, name, revision_datasources): - super().__init__(name, revision_datasources.user.last_revision) + def __init__(self, name, revision_datasources, user_datasources=None): + user_datasources = user_datasources or revision_datasources.user + super().__init__(name, user_datasources.last_revision) self.seconds_since = Feature( name + ".seconds_since", _process_seconds_since, returns=int, - depends_on=[revision_datasources.user.last_revision.timestamp, + 
depends_on=[user_datasources.last_revision.timestamp, revision_datasources.timestamp]) "`int`: The number of seconds since the user last saved an edit" diff --git a/revscoring/features/temporal/session_oriented.py b/revscoring/features/temporal/session_oriented.py new file mode 100644 index 00000000..f934f672 --- /dev/null +++ b/revscoring/features/temporal/session_oriented.py @@ -0,0 +1,88 @@ +from revscoring import Feature +from revscoring.datasources import session_oriented +from revscoring.datasources.meta.selectors import first +from revscoring.dependencies import DependentSet + +from .revision_oriented import (Revision, _process_seconds_since, + _process_seconds_since_registration) + +name = "temporal.session" + + +class Session(DependentSet): + """ + Represents an editor's activity session + """ + def __init__(self, name, revisions_datasources): + super().__init__(name) + session_revision = Revision( + name + ".revisions", revisions_datasources) + self.revisions = session_oriented.list_of_tree( + session_revision, rewrite_name=session_oriented.rewrite_name, + cache={d.name: d for d in revisions_datasources}) + """ + :class:`~revscoring.datasources.meta.expanders.list_of`(:class:`~revscoring.features.temporal.Revision`) : + The revisions saved by the user within the session. + """ + + self.user = SessionUser( + name + ".user", session_oriented.session.user, + revisions_datasources) + """ + :class:`~revscoring.features.temporal.session_oriented.SessionUser` : + The session user. + """ + + +class SessionUser(DependentSet): + "Represents a session user" + + def __init__(self, name, user_datasources, revisions_datasources): + super().__init__(name) + self.datasources = user_datasources + + if hasattr(self.datasources, 'info'): + self.seconds_since_registration = Feature( + name + ".seconds_since_registration", + _process_seconds_since_registration, + returns=int, + depends_on=[self.datasources.id, + self.datasources.info.registration, + first(revisions_datasources.timestamp)]) + """ + `int` : The number of seconds since the user registered their + account -- or zero in the case of anons -- before the start of the + current session. If the user has a registration date that is + *after* the revision timestamp (should be impossible, but happens + sometimes), the user is assumed to be 1 year old. + """ + + if hasattr(self.datasources, 'last_revision'): + self.last_revision = LastSessionUserRevision( + name + ".last_revision", user_datasources, + revisions_datasources) + """ + :class:`~revscoring.features.temporal.session_oriented.LastSessionUserRevision` : + The last revision saved by the user before the start of the session. + """ + + +class LastSessionUserRevision(Revision): + "Represents a revision user's last revision before the start of the session" + + def __init__(self, name, user_datasources, revisions_datasources): + super().__init__(name, user_datasources.last_revision) + + self.seconds_since = Feature( + name + ".seconds_since", + _process_seconds_since, + returns=int, + depends_on=[user_datasources.last_revision.timestamp, + first(revisions_datasources.timestamp)]) + """ + `int`: The number of seconds since the user last saved an edit before + the start of the current session.
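# A usage sketch for these session-oriented temporal features. The real
# datasource tree is wired up by `session_oriented`; here, hypothetical
# stand-in datasources are injected through the solver cache (the names
# and the UNIX-timestamp values below are illustrative only):
from revscoring import Feature
from revscoring.datasources import Datasource
from revscoring.dependencies import solve

registration = Datasource("user.info.registration")      # stand-in
first_timestamp = Datasource("session.first_timestamp")  # stand-in

seconds_since_registration = Feature(
    "seconds_since_registration",
    lambda reg, ts: max(int(ts - reg), 0),  # clamped, as the docstring implies
    returns=int,
    depends_on=[registration, first_timestamp])

print(solve(seconds_since_registration,
            cache={registration: 1500000000, first_timestamp: 1500003600}))
# 3600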
+ """ + + +session = Session(name, session_oriented.session.revisions) diff --git a/revscoring/features/wikibase/__init__.py b/revscoring/features/wikibase/__init__.py index 03badaa7..4596d9c1 100644 --- a/revscoring/features/wikibase/__init__.py +++ b/revscoring/features/wikibase/__init__.py @@ -18,6 +18,8 @@ """ from .features import Diff, Revision from .revision_oriented import revision +from .session_oriented import session, Session from .util import DictDiff, diff_dicts -__all__ = [diff_dicts, DictDiff, revision, Revision, Diff] +__all__ = [diff_dicts, DictDiff, revision, Revision, Diff, + session, Session] diff --git a/revscoring/features/wikibase/features/diff.py b/revscoring/features/wikibase/features/diff.py index e505e933..40f9548b 100644 --- a/revscoring/features/wikibase/features/diff.py +++ b/revscoring/features/wikibase/features/diff.py @@ -1,7 +1,6 @@ import re import mwbase - from revscoring.dependencies import DependentSet from ...feature import Feature @@ -141,6 +140,7 @@ def __init__(self, name, datasources): ) "`int` : The number of identifiers that were changed" + @DependentSet.meta_dependent def property_changed(self, property, name=None): """ Returns a :class:`revscoring.Feature` that represents whether a @@ -154,7 +154,7 @@ def property_changed(self, property, name=None): feature's name will be 'property_changed(<property>)' """ if name is None: - name = self._name + ".property_changed({0})" \ + name = self.name + ".property_changed({0})" \ .format(repr(property)) return bools.item_in_set(property, self.datasources.properties_changed, name=name) diff --git a/revscoring/features/wikibase/features/revision_oriented.py b/revscoring/features/wikibase/features/revision_oriented.py index 918e5be2..f5e74cf6 100644 --- a/revscoring/features/wikibase/features/revision_oriented.py +++ b/revscoring/features/wikibase/features/revision_oriented.py @@ -48,6 +48,7 @@ def __init__(self, name, revision_datasources): difference between this revision and the parent revision. """ + @DependentSet.meta_dependent def has_property(self, property, name=None): """ Returns True if the specified property exists @@ -60,11 +61,12 @@ def has_property(self, property, name=None): feature's name will be 'has_property(<property>)' """ if name is None: - name = self._name + ".has_property({0})".format(repr(property)) + name = self.name + ".has_property({0})".format(repr(property)) return bools.item_in_set(property, self.datasources.properties, name=name) + @DependentSet.meta_dependent def has_property_value(self, property, value, name=None): """ Returns True if the specified property matches the provided value. @@ -80,7 +82,7 @@ def has_property_value(self, property, value, name=None): 'has_property_value(<property>, <value>)' """ if name is None: - name = self._name + ".has_property_value({0}, {1})" \ + name = self.name + ".has_property_value({0}, {1})" \ .format(repr(property), repr(value)) return HasPropertyValue(name, property, value, self.datasources.entity) diff --git a/revscoring/features/wikibase/session_oriented.py b/revscoring/features/wikibase/session_oriented.py new file mode 100644 index 00000000..33a6fce3 --- /dev/null +++ b/revscoring/features/wikibase/session_oriented.py @@ -0,0 +1,33 @@ +from revscoring.datasources import session_oriented +from revscoring.dependencies import DependentSet + +from . 
import datasources, features + +name = "wikibase.session" + + +class Session(DependentSet): + """ + Represents an editor's activity session + """ + def __init__(self, name, revisions_datasources): + super().__init__(name) + session_revision = features.Revision( + name + ".revisions", + datasources.Revision(name, revisions_datasources)) + self.revisions = session_oriented.list_of_tree( + session_revision, rewrite_name=session_oriented.rewrite_name, + cache={d.name: d for d in revisions_datasources}) + """ + :class:`~revscoring.datasources.meta.expanders.list_of`(:class:`~revscoring.features.wikibase.Revision`) : + The revisions saved by the user within the session. + """ + +session = Session(name, session_oriented.session.revisions) +""" +Represents an editor's activity session. Implements this basic structure: +* session: :class:`~revscoring.features.wikibase.Session` + * revisions: a :class:`~revscoring.datasources.meta.expanders.list_of`(:class:`~revscoring.features.wikibase.Revision`) + * parent: a :class:`~revscoring.datasources.meta.expanders.list_of`(:class:`~revscoring.features.wikibase.Revision`) + * diff: a :class:`~revscoring.datasources.meta.expanders.list_of`(:class:`~revscoring.features.wikibase.Diff`) +""" diff --git a/revscoring/features/wikitext/__init__.py b/revscoring/features/wikitext/__init__.py index 46a28d23..ba84c613 100644 --- a/revscoring/features/wikitext/__init__.py +++ b/revscoring/features/wikitext/__init__.py @@ -367,5 +367,6 @@ """ # noqa from .features import Diff, Revision from .revision_oriented import revision +from .session_oriented import Session, session -__all__ = [revision, Revision, Diff] +__all__ = [revision, Revision, Diff, Session, session] diff --git a/revscoring/features/wikitext/datasources/edit.py b/revscoring/features/wikitext/datasources/edit.py index cceaf9e1..dbd4c013 100644 --- a/revscoring/features/wikitext/datasources/edit.py +++ b/revscoring/features/wikitext/datasources/edit.py @@ -3,9 +3,9 @@ import time from deltas import segment_matcher - from revscoring.datasources import Datasource from revscoring.datasources.meta import filters +from revscoring.dependencies import DependentSet from .tokenized import TokenIsInTypes, is_uppercase_word @@ -18,12 +18,12 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.operations = Datasource( - self._name + ".operations", _process_operations, + self.name + ".operations", _process_operations, depends_on=[ - self.revision.parent.paragraphs_sentences_and_whitespace, - self.revision.paragraphs_sentences_and_whitespace, - self.revision.parent.tokens, - self.revision.tokens] + self._revision.parent.paragraphs_sentences_and_whitespace, + self._revision.paragraphs_sentences_and_whitespace, + self._revision.parent.tokens, + self._revision.tokens] ) """ Returns a tuple that describes the difference between the parent @@ -37,7 +37,7 @@ def __init__(self, *args, **kwargs): """ self.segments_added = Datasource( - self._name + ".segments_added", _process_segments_added, + self.name + ".segments_added", _process_segments_added, depends_on=[self.operations] ) """ @@ -46,7 +46,7 @@ def __init__(self, *args, **kwargs): """ self.segments_removed = Datasource( - self._name + ".segments_removed", _process_segments_removed, + self.name + ".segments_removed", _process_segments_removed, depends_on=[self.operations] ) """ @@ -55,7 +55,7 @@ def __init__(self, *args, **kwargs): """ self.tokens_added = Datasource( - self._name + ".tokens_added", _process_tokens_added, + self.name + 
".tokens_added", _process_tokens_added, depends_on=[self.operations] ) """ @@ -64,7 +64,7 @@ def __init__(self, *args, **kwargs): """ self.tokens_removed = Datasource( - self._name + ".tokens_removed", _process_tokens_removed, + self.name + ".tokens_removed", _process_tokens_removed, depends_on=[self.operations] ) """ @@ -73,28 +73,28 @@ def __init__(self, *args, **kwargs): """ self.numbers_added = self.tokens_added_in_types( - {'number'}, name=self._name + ".numbers_added" + {'number'}, name=self.name + ".numbers_added" ) """ A list of numeric tokens added in the edit """ self.numbers_removed = self.tokens_removed_in_types( - {'number'}, name=self._name + ".numbers_removed" + {'number'}, name=self.name + ".numbers_removed" ) """ A list of numeric tokens removed in the edit """ self.whitespaces_added = self.tokens_added_in_types( - {'whitespace'}, name=self._name + ".whitespaces_added" + {'whitespace'}, name=self.name + ".whitespaces_added" ) """ A list of whitespace tokens added in the edit """ self.whitespaces_removed = self.tokens_removed_in_types( - {'whitespace'}, name=self._name + ".whitespaces_removed" + {'whitespace'}, name=self.name + ".whitespaces_removed" ) """ A list of whitespace tokens removed in the edit @@ -104,7 +104,7 @@ def __init__(self, *args, **kwargs): {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close', 'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close', 'curly_open', 'curly_close', 'bold', 'italics', 'equals'}, - name=self._name + ".markups_added" + name=self.name + ".markups_added" ) """ A list of markup tokens added in the edit @@ -114,63 +114,63 @@ def __init__(self, *args, **kwargs): {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close', 'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close', 'curly_open', 'curly_close', 'bold', 'italics', 'equals'}, - name=self._name + ".markups_removed" + name=self.name + ".markups_removed" ) """ A list of markup tokens removed in the edit """ self.cjks_added = self.tokens_added_in_types( - {'cjk'}, name=self._name + ".cjks_added" + {'cjk'}, name=self.name + ".cjks_added" ) """ A list of Chinese/Japanese/Korean tokens added in the edit """ self.cjks_removed = self.tokens_removed_in_types( - {'cjk'}, name=self._name + ".cjks_removed" + {'cjk'}, name=self.name + ".cjks_removed" ) """ A list of Chinese/Japanese/Korean tokens removed in the edit """ self.entities_added = self.tokens_added_in_types( - {'entity'}, name=self._name + ".entities_added" + {'entity'}, name=self.name + ".entities_added" ) """ A list of HTML entity tokens added in the edit """ self.entities_removed = self.tokens_removed_in_types( - {'entity'}, name=self._name + ".entities_removed" + {'entity'}, name=self.name + ".entities_removed" ) """ A list of HTML entity tokens removed in the edit """ self.urls_added = self.tokens_added_in_types( - {'url'}, name=self._name + ".urls_added" + {'url'}, name=self.name + ".urls_added" ) """ A list of URL tokens added in the edit """ self.urls_removed = self.tokens_removed_in_types( - {'url'}, name=self._name + ".urls_removed" + {'url'}, name=self.name + ".urls_removed" ) """ A list of URL tokens removed in the edit """ self.words_added = self.tokens_added_in_types( - {'word'}, name=self._name + ".words_added" + {'word'}, name=self.name + ".words_added" ) """ A list of word tokens added in the edit """ self.words_removed = self.tokens_removed_in_types( - {'word'}, name=self._name + ".words_removed" + {'word'}, name=self.name + ".words_removed" ) """ A list of word tokens removed in the edit @@ -178,7
+178,7 @@ def __init__(self, *args, **kwargs): self.uppercase_words_added = filters.filter( is_uppercase_word, self.words_added, - name=self._name + ".uppercase_words_added" + name=self.name + ".uppercase_words_added" ) """ A list of fully UPPERCASE word tokens added in the edit @@ -186,7 +186,7 @@ def __init__(self, *args, **kwargs): self.uppercase_words_removed = filters.filter( is_uppercase_word, self.words_removed, - name=self._name + ".uppercase_words_removed" + name=self.name + ".uppercase_words_removed" ) """ A list of fully UPPERCASE word tokens removed in the edit @@ -195,7 +195,7 @@ def __init__(self, *args, **kwargs): self.punctuations_added = self.tokens_added_in_types( {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon', 'japan_punct'}, - name=self._name + ".punctuations_added" + name=self.name + ".punctuations_added" ) """ A list of punctuation tokens added in the edit @@ -204,7 +204,7 @@ def __init__(self, *args, **kwargs): self.punctuations_removed = self.tokens_removed_in_types( {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon', 'japan_punct'}, - name=self._name + ".punctuations_removed" + name=self.name + ".punctuations_removed" ) """ A list of punctuation tokens removed in the edit @@ -212,7 +212,7 @@ def __init__(self, *args, **kwargs): self.breaks_added = self.tokens_added_in_types( {'break'}, - name=self._name + ".breaks_added" + name=self.name + ".breaks_added" ) """ A list of break tokens added in the edit @@ -220,12 +220,13 @@ def __init__(self, *args, **kwargs): self.breaks_removed = self.tokens_removed_in_types( {'break'}, - name=self._name + ".breaks_removed" + name=self.name + ".breaks_removed" ) """ A list of break tokens removed in the edit """ + @DependentSet.meta_dependent def tokens_added_matching(self, regex, name=None, regex_flags=re.I): """ Constructs a :class:`revscoring.Datasource` that represents tokens @@ -234,10 +235,11 @@ def tokens_added_matching(self, regex, name=None, regex_flags=re.I): if not hasattr(regex, "pattern"): regex = re.compile(regex, regex_flags) if name is None: - name = "{0}({1})".format(self._name + ".tokens_added_matching", + name = "{0}({1})".format(self.name + ".tokens_added_matching", regex.pattern) return filters.regex_matching(regex, self.tokens_added, name=name) + @DependentSet.meta_dependent def tokens_removed_matching(self, regex, name=None, regex_flags=re.I): """ Constructs a :class:`revscoring.Datasource` that represents tokens @@ -247,11 +249,12 @@ def tokens_removed_matching(self, regex, name=None, regex_flags=re.I): regex = re.compile(regex, regex_flags) if name is None: name = "{0}({1})" \ - .format(self._name + ".tokens_removed_matching", + .format(self.name + ".tokens_removed_matching", regex.pattern) return filters.regex_matching(regex, self.tokens_removed, name=name) + @DependentSet.meta_dependent def tokens_added_in_types(self, types, name=None): """ Constructs a :class:`revscoring.Datasource` that represents tokens @@ -259,11 +262,12 @@ def tokens_added_in_types(self, types, name=None): """ types = set(types) if name is None: - name = "{0}({1})".format(self._name + ".tokens_added_in_types", + name = "{0}({1})".format(self.name + ".tokens_added_in_types", types) return filters.filter(TokenIsInTypes(types).filter, self.tokens_added, name=name) + @DependentSet.meta_dependent def tokens_removed_in_types(self, types, name=None): """ Constructs a :class:`revscoring.Datasource` that represents tokens @@ -271,7 +275,7 @@ def tokens_removed_in_types(self, types, name=None): """ types = set(types) if name 
is None: - name = "{0}({1})".format(self._name + ".tokens_removed_in_types", + name = "{0}({1})".format(self.name + ".tokens_removed_in_types", types) return filters.filter(TokenIsInTypes(types).filter, self.tokens_removed, name=name) diff --git a/revscoring/features/wikitext/datasources/parsed.py b/revscoring/features/wikitext/datasources/parsed.py index 13f4a9f9..c00f29b7 100644 --- a/revscoring/features/wikitext/datasources/parsed.py +++ b/revscoring/features/wikitext/datasources/parsed.py @@ -1,9 +1,9 @@ import re import mwparserfromhell - from revscoring.datasources import Datasource from revscoring.datasources.meta import filters, mappers +from revscoring.dependencies import DependentSet class Revision: @@ -12,7 +12,7 @@ def __init__(self, name, revision_datasources): super().__init__(name, revision_datasources) self.wikicode = Datasource( - self._name + ".wikicode", + self.name + ".wikicode", _process_wikicode, depends_on=[revision_datasources.text] ) """ @@ -21,7 +21,7 @@ def __init__(self, name, revision_datasources): """ self.node_class_map = Datasource( - self._name + ".node_class_map", + self.name + ".node_class_map", _process_node_class_map, depends_on=[self.wikicode] ) """ @@ -31,7 +31,7 @@ def __init__(self, name, revision_datasources): self.content = execute_method( "strip_code", self.wikicode, - name=self._name + ".content" + name=self.name + ".content" ) """ The viewable content (no markup or templates) of the revision. @@ -40,7 +40,7 @@ def __init__(self, name, revision_datasources): self.headings = get_key( mwparserfromhell.nodes.Heading, self.node_class_map, default=[], - name=self._name + ".headings" + name=self.name + ".headings" ) """ A list of :class:`mwparserfromhell.nodes.heading.Heading`'s @@ -48,7 +48,7 @@ def __init__(self, name, revision_datasources): self.heading_titles = mappers.map( _extract_heading_title, self.headings, - name=self._name + ".heading_titles" + name=self.name + ".heading_titles" ) """ A list of heading titles @@ -57,7 +57,7 @@ def __init__(self, name, revision_datasources): self.external_links = get_key( mwparserfromhell.nodes.ExternalLink, self.node_class_map, default=[], - name=self._name + ".external_links" + name=self.name + ".external_links" ) """ A list of :class:`mwparserfromhell.nodes.heading.ExternalLink`'s @@ -65,7 +65,7 @@ def __init__(self, name, revision_datasources): self.external_link_urls = mappers.map( _extract_external_link_url, self.external_links, - name=self._name + ".external_link_url" + name=self.name + ".external_link_url" ) """ A list of external link urls @@ -74,7 +74,7 @@ def __init__(self, name, revision_datasources): self.wikilinks = get_key( mwparserfromhell.nodes.Wikilink, self.node_class_map, default=[], - name=self._name + ".wikilinks" + name=self.name + ".wikilinks" ) """ A list of :class:`mwparserfromhell.nodes.heading.Wikilink`'s @@ -82,7 +82,7 @@ def __init__(self, name, revision_datasources): self.wikilink_titles = mappers.map( _extract_wikilink_title, self.wikilinks, - name=self._name + ".wikilink_titles" + name=self.name + ".wikilink_titles" ) """ Returns a list of string titles of internal links (aka "targets") @@ -91,7 +91,7 @@ def __init__(self, name, revision_datasources): self.tags = get_key( mwparserfromhell.nodes.Tag, self.node_class_map, default=[], - name=self._name + ".tags" + name=self.name + ".tags" ) """ A list of :class:`mwparserfromhell.nodes.heading.Tag`'s @@ -99,7 +99,7 @@ def __init__(self, name, revision_datasources): self.tag_names = mappers.map( _extract_tag_name, self.tags, - 
name=self._name + ".tag_names" + name=self.name + ".tag_names" ) """ Returns a list of html tag names present in the content of the revision @@ -108,7 +108,7 @@ def __init__(self, name, revision_datasources): self.templates = get_key( mwparserfromhell.nodes.Template, self.node_class_map, default=[], - name=self._name + ".templates" + name=self.name + ".templates" ) """ A list of :class:`mwparserfromhell.nodes.heading.Templates`'s @@ -116,12 +116,13 @@ def __init__(self, name, revision_datasources): self.template_names = mappers.map( _extract_template_name, self.templates, - name=self._name + ".template_names" + name=self.name + ".template_names" ) """ Returns a list of template names present in the content of the revision """ + @DependentSet.meta_dependent def heading_titles_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Datasource` that generates a `list` of @@ -130,21 +131,23 @@ def heading_titles_matching(self, regex, name=None): if not hasattr(regex, "pattern"): regex = re.compile(regex, re.I) if name is None: - name = "{0}({1})".format(self._name + ".heading_titles_matching", + name = "{0}({1})".format(self.name + ".heading_titles_matching", regex.pattern) return filters.regex_matching(regex, self.heading_titles, name=name) + @DependentSet.meta_dependent def headings_by_level(self, level, name=None): """ Constructs a :class:`revscoring.Datasource` that generates a `list` of all headers of a level. """ if name is None: - name = "{0}({1})".format(self._name + ".headings_by_level", + name = "{0}({1})".format(self.name + ".headings_by_level", level) return filters.filter(HeadingOfLevel(level).filter, self.headings, name=name) + @DependentSet.meta_dependent def external_link_urls_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Datasource` that generates a `list` of @@ -155,12 +158,13 @@ def external_link_urls_matching(self, regex, name=None): if name is None: name = "{0}({1})" \ - .format(self._name + ".external_link_urls_matching", + .format(self.name + ".external_link_urls_matching", regex.pattern) return filters.regex_matching(regex, self.external_link_urls, name=name) + @DependentSet.meta_dependent def wikilink_titles_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Datasource` that generates a `list` @@ -171,11 +175,12 @@ def wikilink_titles_matching(self, regex, name=None): if name is None: name = "{0}({1})" \ - .format(self._name + ".wikilink_titles_matching", + .format(self.name + ".wikilink_titles_matching", regex.pattern) return filters.regex_matching(regex, self.wikilink_titles, name=name) + @DependentSet.meta_dependent def tag_names_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Datasource` that returns all tag names @@ -186,10 +191,11 @@ def tag_names_matching(self, regex, name=None): if name is None: name = "{0}({1})" \ - .format(self._name + ".tag_names_matching", regex.pattern) + .format(self.name + ".tag_names_matching", regex.pattern) return filters.regex_matching(regex, self.tag_names, name=name) + @DependentSet.meta_dependent def template_names_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Datasource` that returns all template @@ -200,7 +206,7 @@ def template_names_matching(self, regex, name=None): if name is None: name = "{0}({1})" \ - .format(self._name + ".template_names_matching", + .format(self.name + ".template_names_matching", regex.pattern) return filters.regex_matching(regex, self.template_names, name=name) diff --git 
a/revscoring/features/wikitext/datasources/revision_oriented.py b/revscoring/features/wikitext/datasources/revision_oriented.py index ff0dc4a8..53988590 100644 --- a/revscoring/features/wikitext/datasources/revision_oriented.py +++ b/revscoring/features/wikitext/datasources/revision_oriented.py @@ -32,7 +32,7 @@ class BaseDiff(DependentSet): def __init__(self, name, revision): super().__init__(name) - self.revision = revision + self._revision = revision class Diff(edit.Diff, sentences.Diff, tokenized.Diff, BaseDiff): diff --git a/revscoring/features/wikitext/datasources/sentences.py b/revscoring/features/wikitext/datasources/sentences.py index 95be2573..99c3251d 100644 --- a/revscoring/features/wikitext/datasources/sentences.py +++ b/revscoring/features/wikitext/datasources/sentences.py @@ -1,5 +1,4 @@ from deltas.segmenters import MatchableSegment - from revscoring.datasources import Datasource from revscoring.datasources.meta import indexable @@ -10,7 +9,7 @@ def __init__(self, name, revision_datasources): super().__init__(name, revision_datasources) self.sentences = Datasource( - self._name + ".sentences", psw2sentences, + self.name + ".sentences", psw2sentences, depends_on=[self.paragraphs_sentences_and_whitespace] ) """ @@ -24,14 +23,14 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.sentences_added_removed = Datasource( - self._name + ".sentences_added_removed", set_diff, - depends_on=[self.revision.sentences, - self.revision.parent.sentences] + self.name + ".sentences_added_removed", set_diff, + depends_on=[self._revision.sentences, + self._revision.parent.sentences] ) self.sentences_added = indexable.index( 0, self.sentences_added_removed, - name=self._name + ".sentences_added" + name=self.name + ".sentences_added" ) """ A set of sentences that were added in this edit @@ -39,7 +38,7 @@ def __init__(self, *args, **kwargs): self.sentences_removed = indexable.index( 1, self.sentences_added_removed, - name=self._name + ".sentences_removed" + name=self.name + ".sentences_removed" ) """ A set of sentences that were removed in this edit diff --git a/revscoring/features/wikitext/datasources/tokenized.py b/revscoring/features/wikitext/datasources/tokenized.py index 16aa2817..86d75148 100644 --- a/revscoring/features/wikitext/datasources/tokenized.py +++ b/revscoring/features/wikitext/datasources/tokenized.py @@ -2,9 +2,9 @@ from deltas import wikitext_split from deltas.segmenters import ParagraphsSentencesAndWhitespace - from revscoring.datasources import Datasource from revscoring.datasources.meta import filters, frequencies, mappers +from revscoring.dependencies import DependentSet class Revision: @@ -18,7 +18,7 @@ def __init__(self, name, revision_datasources): """ self.paragraphs_sentences_and_whitespace = Datasource( - self._name + ".paragraphs_sentences_and_whitespace", + self.name + ".paragraphs_sentences_and_whitespace", paragraphs_sentences_and_whitespace.segment, depends_on=[self.tokens] ) @@ -30,35 +30,35 @@ def __init__(self, name, revision_datasources): self.token_frequency = frequencies.table( self.tokens, - name=self._name + ".token_frequency" + name=self.name + ".token_frequency" ) """ A frequency table of all tokens. 
""" self.numbers = self.tokens_in_types( - {'number'}, name=self._name + ".numbers" + {'number'}, name=self.name + ".numbers" ) """ A list of numeric tokens """ self.number_frequency = frequencies.table( - self.numbers, name=self._name + ".number_frequency" + self.numbers, name=self.name + ".number_frequency" ) """ A frequency table of number tokens. """ self.whitespaces = self.tokens_in_types( - {'whitespace'}, name=self._name + ".whitespaces" + {'whitespace'}, name=self.name + ".whitespaces" ) """ A list of whitespace tokens """ self.whitespace_frequency = frequencies.table( - self.whitespaces, name=self._name + ".whitespace_frequency" + self.whitespaces, name=self.name + ".whitespace_frequency" ) """ A frequency table of whichspace tokens. @@ -68,63 +68,63 @@ def __init__(self, name, revision_datasources): {'dbrack_open', 'dbrack_close', 'brack_open', 'brack_close', 'tab_open', 'tab_close', 'dcurly_open', 'dcurly_close', 'curly_open', 'curly_close', 'bold', 'italics', 'equals'}, - name=self._name + ".markups" + name=self.name + ".markups" ) """ A list of markup tokens """ self.markup_frequency = frequencies.table( - self.markups, name=self._name + ".markup_frequency" + self.markups, name=self.name + ".markup_frequency" ) """ A frequency table of markup tokens. """ self.cjks = self.tokens_in_types( - {'cjk'}, name=self._name + ".cjks" + {'cjk'}, name=self.name + ".cjks" ) """ A list of Chinese/Japanese/Korean tokens """ self.cjk_frequency = frequencies.table( - self.cjks, name=self._name + ".cjk_frequency" + self.cjks, name=self.name + ".cjk_frequency" ) """ A frequency table of cjk tokens. """ self.entities = self.tokens_in_types( - {'entity'}, name=self._name + ".entities" + {'entity'}, name=self.name + ".entities" ) """ A list of HTML entity tokens """ self.entity_frequency = frequencies.table( - self.entities, name=self._name + ".entity_frequency" + self.entities, name=self.name + ".entity_frequency" ) """ A frequency table of entity tokens. """ self.urls = self.tokens_in_types( - {'url'}, name=self._name + ".urls" + {'url'}, name=self.name + ".urls" ) """ A list of URL tokens """ self.url_frequency = frequencies.table( - self.urls, name=self._name + ".url_frequency" + self.urls, name=self.name + ".url_frequency" ) """ A frequency table of url tokens. """ self.words = self.tokens_in_types( - {'word'}, name=self._name + ".words" + {'word'}, name=self.name + ".words" ) """ A list of word tokens @@ -132,7 +132,7 @@ def __init__(self, name, revision_datasources): self.word_frequency = frequencies.table( mappers.lower_case(self.words), - name=self._name + ".word_frequency" + name=self.name + ".word_frequency" ) """ A frequency table of lower-cased word tokens. 
@@ -140,7 +140,7 @@ def __init__(self, name, revision_datasources): self.uppercase_words = filters.filter( is_uppercase_word, self.words, - name=self._name + ".uppercase_words" + name=self.name + ".uppercase_words" ) """ A list of uppercase word tokens that are at least two @@ -149,7 +149,7 @@ def __init__(self, name, revision_datasources): self.uppercase_word_frequency = frequencies.table( self.uppercase_words, - name=self._name + ".uppercase_word_frequency" + name=self.name + ".uppercase_word_frequency" ) """ A frequency table of uppercase word tokens that are at least two @@ -159,33 +159,34 @@ def __init__(self, name, revision_datasources): self.punctuations = self.tokens_in_types( {'period', 'qmark', 'epoint', 'comma', 'colon', 'scolon', 'japan_punct'}, - name=self._name + ".punctuations" + name=self.name + ".punctuations" ) """ A list of punctuation tokens """ self.punctuation_frequency = frequencies.table( - self.punctuations, name=self._name + ".punctuation_frequency" + self.punctuations, name=self.name + ".punctuation_frequency" ) """ A frequency table of punctuation tokens. """ self.breaks = self.tokens_in_types( - {'break'}, name=self._name + ".breaks" + {'break'}, name=self.name + ".breaks" ) """ A list of break tokens """ self.break_frequency = frequencies.table( - self.breaks, name=self._name + ".break_frequency" + self.breaks, name=self.name + ".break_frequency" ) """ A frequency table of break tokens. """ + @DependentSet.meta_dependent def tokens_in_types(self, types, name=None): """ Constructs a :class:`revscoring.Datasource` that returns all content @@ -195,11 +196,12 @@ def tokens_in_types(self, types, name=None): if name is None: name = "{0}({1})" \ - .format(self._name + ".tokens_in_types", types) + .format(self.name + ".tokens_in_types", types) return filters.filter(token_is_in_types.filter, self.tokens, name=name) + @DependentSet.meta_dependent def tokens_matching(self, regex, name=None, regex_flags=re.I): """ Constructs a :class:`revscoring.Datasource` that returns all content @@ -210,7 +212,7 @@ def tokens_matching(self, regex, name=None, regex_flags=re.I): if name is None: name = "{0}({1})" \ - .format(self._name + ".tokens_matching", regex.pattern) + .format(self.name + ".tokens_matching", regex.pattern) return filters.regex_matching(regex, self.tokens, name=name) @@ -222,198 +224,198 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.token_delta = frequencies.delta( - self.revision.parent.token_frequency, - self.revision.token_frequency, - name=self._name + ".token_delta" + self._revision.parent.token_frequency, + self._revision.token_frequency, + name=self.name + ".token_delta" ) """ A token frequency delta table """ self.token_prop_delta = frequencies.prop_delta( - self.revision.parent.token_frequency, + self._revision.parent.token_frequency, self.token_delta, - name=self._name + ".token_prop_delta" + name=self.name + ".token_prop_delta" ) """ A token proportional frequency delta table """ self.number_delta = frequencies.delta( - self.revision.parent.number_frequency, - self.revision.number_frequency, - name=self._name + ".number_delta" + self._revision.parent.number_frequency, + self._revision.number_frequency, + name=self.name + ".number_delta" ) """ A number frequency delta table """ self.number_prop_delta = frequencies.prop_delta( - self.revision.parent.number_frequency, + self._revision.parent.number_frequency, self.number_delta, - name=self._name + ".number_prop_delta" + name=self.name + ".number_prop_delta" ) """ A number 
proportional frequency delta table """ self.whitespace_delta = frequencies.delta( - self.revision.parent.whitespace_frequency, - self.revision.whitespace_frequency, - name=self._name + ".whitespace_delta" + self._revision.parent.whitespace_frequency, + self._revision.whitespace_frequency, + name=self.name + ".whitespace_delta" ) """ A whitespace frequency delta table """ self.whitespace_prop_delta = frequencies.prop_delta( - self.revision.parent.whitespace_frequency, + self._revision.parent.whitespace_frequency, self.whitespace_delta, - name=self._name + ".whitespace_prop_delta" + name=self.name + ".whitespace_prop_delta" ) """ A whitespace proportional frequency delta table """ self.markup_delta = frequencies.delta( - self.revision.parent.markup_frequency, - self.revision.markup_frequency, - name=self._name + ".markup_delta" + self._revision.parent.markup_frequency, + self._revision.markup_frequency, + name=self.name + ".markup_delta" ) """ A markup frequency delta table """ self.markup_prop_delta = frequencies.prop_delta( - self.revision.parent.markup_frequency, + self._revision.parent.markup_frequency, self.markup_delta, - name=self._name + ".markup_prop_delta" + name=self.name + ".markup_prop_delta" ) """ A markup proportional frequency delta table """ self.cjk_delta = frequencies.delta( - self.revision.parent.cjk_frequency, - self.revision.cjk_frequency, - name=self._name + ".cjk_delta" + self._revision.parent.cjk_frequency, + self._revision.cjk_frequency, + name=self.name + ".cjk_delta" ) """ A cjk frequency delta table """ self.cjk_prop_delta = frequencies.prop_delta( - self.revision.parent.cjk_frequency, + self._revision.parent.cjk_frequency, self.cjk_delta, - name=self._name + ".cjk_prop_delta" + name=self.name + ".cjk_prop_delta" ) """ A cjk proportional frequency delta table """ self.entity_delta = frequencies.delta( - self.revision.parent.entity_frequency, - self.revision.entity_frequency, - name=self._name + ".entity_delta" + self._revision.parent.entity_frequency, + self._revision.entity_frequency, + name=self.name + ".entity_delta" ) """ An entity frequency delta table """ self.entity_prop_delta = frequencies.prop_delta( - self.revision.parent.entity_frequency, + self._revision.parent.entity_frequency, self.entity_delta, - name=self._name + ".entity_prop_delta" + name=self.name + ".entity_prop_delta" ) """ An entity proportional frequency delta table """ self.url_delta = frequencies.delta( - self.revision.parent.url_frequency, - self.revision.url_frequency, - name=self._name + ".url_delta" + self._revision.parent.url_frequency, + self._revision.url_frequency, + name=self.name + ".url_delta" ) """ A url frequency delta table """ self.url_prop_delta = frequencies.prop_delta( - self.revision.parent.url_frequency, + self._revision.parent.url_frequency, self.url_delta, - name=self._name + ".url_prop_delta" + name=self.name + ".url_prop_delta" ) """ A url proportional frequency delta table """ self.word_delta = frequencies.delta( - self.revision.parent.word_frequency, - self.revision.word_frequency, - name=self._name + ".word_delta" + self._revision.parent.word_frequency, + self._revision.word_frequency, + name=self.name + ".word_delta" ) """ A lower-cased word frequency delta table """ self.word_prop_delta = frequencies.prop_delta( - self.revision.parent.word_frequency, + self._revision.parent.word_frequency, self.word_delta, - name=self._name + ".word_prop_delta" + name=self.name + ".word_prop_delta" ) """ A lower-cased word proportional frequency delta table """ 
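# A plain-Python sketch of the delta semantics above. The subtraction
# mirrors the "delta" docstrings; the +1 smoothing in prop_delta is an
# assumption about frequencies.prop_delta, not a quote of it:
def delta(parent_freq, current_freq):
    # per-token frequency change between the parent and this revision
    return {t: current_freq.get(t, 0) - parent_freq.get(t, 0)
            for t in parent_freq.keys() | current_freq.keys()}

def prop_delta(parent_freq, deltas):
    # change relative to how common the token already was in the parent
    return {t: d / (parent_freq.get(t, 0) + 1) for t, d in deltas.items()}

old, new = {"cat": 2}, {"cat": 3, "dog": 1}
print(delta(old, new))                   # {'cat': 1, 'dog': 1}
print(prop_delta(old, delta(old, new)))  # {'cat': 0.333..., 'dog': 1.0}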
self.uppercase_word_delta = frequencies.delta( - self.revision.parent.uppercase_word_frequency, - self.revision.uppercase_word_frequency, - name=self._name + ".uppercase_word_delta" + self._revision.parent.uppercase_word_frequency, + self._revision.uppercase_word_frequency, + name=self.name + ".uppercase_word_delta" ) """ An uppercase word frequency delta table """ self.uppercase_word_prop_delta = frequencies.prop_delta( - self.revision.parent.uppercase_word_frequency, + self._revision.parent.uppercase_word_frequency, self.uppercase_word_delta, - name=self._name + ".uppercase_word_prop_delta" + name=self.name + ".uppercase_word_prop_delta" ) """ An uppercase word proportional frequency delta table """ self.punctuation_delta = frequencies.delta( - self.revision.parent.punctuation_frequency, - self.revision.punctuation_frequency, - name=self._name + ".punctuation_delta" + self._revision.parent.punctuation_frequency, + self._revision.punctuation_frequency, + name=self.name + ".punctuation_delta" ) """ A punctuation frequency delta table """ self.punctuation_prop_delta = frequencies.prop_delta( - self.revision.parent.punctuation_frequency, + self._revision.parent.punctuation_frequency, self.punctuation_delta, - name=self._name + ".punctuation_prop_delta" + name=self.name + ".punctuation_prop_delta" ) """ A punctuation proportional frequency delta table """ self.break_delta = frequencies.delta( - self.revision.parent.break_frequency, - self.revision.break_frequency, - name=self._name + ".break_delta" + self._revision.parent.break_frequency, + self._revision.break_frequency, + name=self.name + ".break_delta" ) """ A break frequency delta table """ self.break_prop_delta = frequencies.prop_delta( - self.revision.parent.break_frequency, + self._revision.parent.break_frequency, self.break_delta, - name=self._name + ".break_prop_delta" + name=self.name + ".break_prop_delta" ) """ A break proportional frequency delta table diff --git a/revscoring/features/wikitext/features/chars.py b/revscoring/features/wikitext/features/chars.py index 6985e0b4..990385de 100644 --- a/revscoring/features/wikitext/features/chars.py +++ b/revscoring/features/wikitext/features/chars.py @@ -1,9 +1,8 @@ from itertools import groupby +from revscoring import Feature from revscoring.datasources.meta import mappers - -from ...feature import Feature -from ...meta import aggregators +from revscoring.features.meta import aggregators class Revision: @@ -13,62 +12,62 @@ def __init__(self, *args, **kwargs): self.chars = aggregators.len( self.datasources.text, - name=self._name + ".chars" + name=self.name + ".chars" ) "`int` : The number of characters in the text" self.numeric_chars = aggregators.sum( mappers.map(len, self.datasources.numbers), - name=self._name + ".numeric_chars", returns=int + name=self.name + ".numeric_chars", returns=int ) "`int` : The number of numeric characters in the text" self.whitespace_chars = aggregators.sum( mappers.map(len, self.datasources.whitespaces), - name=self._name + ".whitespace_chars", returns=int + name=self.name + ".whitespace_chars", returns=int ) "`int` : The number of whitespace characters in the text" self.markup_chars = aggregators.sum( mappers.map(len, self.datasources.markups), - name=self._name + ".markup_chars", returns=int + name=self.name + ".markup_chars", returns=int ) "`int` : The number of wikitext markup characters in the text" self.cjk_chars = aggregators.sum( mappers.map(len, self.datasources.cjks), - name=self._name + ".cjk_chars", returns=int + name=self.name + ".cjk_chars", 
returns=int ) "`int` : The number of Chinese/Japanese/Korean characters in the text" self.entity_chars = aggregators.sum( mappers.map(len, self.datasources.entities), - name=self._name + ".entity_chars", returns=int + name=self.name + ".entity_chars", returns=int ) "`int` : The number of HTML entity characters in the text" self.url_chars = aggregators.sum( mappers.map(len, self.datasources.urls), - name=self._name + ".url_chars", returns=int + name=self.name + ".url_chars", returns=int ) "`int` : The number of URL characters in the text" self.word_chars = aggregators.sum( mappers.map(len, self.datasources.words), - name=self._name + ".word_chars", returns=int + name=self.name + ".word_chars", returns=int ) "`int` : The number of word characters in the text" self.uppercase_word_chars = aggregators.sum( mappers.map(len, self.datasources.uppercase_words), - name=self._name + ".uppercase_word_chars", returns=int + name=self.name + ".uppercase_word_chars", returns=int ) "`int` : The number of UPPERCASE WORD characters in the text" self.punctuation_chars = aggregators.sum( mappers.map(len, self.datasources.punctuations), - name=self._name + ".punctuation_chars", returns=int + name=self.name + ".punctuation_chars", returns=int ) "`int` : The number of punctuation characters in the text" self.break_chars = aggregators.sum( mappers.map(len, self.datasources.breaks), - name=self._name + ".break_chars", returns=int + name=self.name + ".break_chars", returns=int ) "`int` : The number of break characters in the text" self.longest_repeated_char = \ - Feature(self._name + ".longest_repeated_char", + Feature(self.name + ".longest_repeated_char", _process_longest_repeated_char, returns=int, depends_on=[self.datasources.text]) "`int` : The most repeated character" @@ -81,138 +80,138 @@ def __init__(self, *args, **kwargs): self.chars_added = aggregators.sum( mappers.map(len, self.datasources.segments_added), - name=self._name + ".chars_added", returns=int + name=self.name + ".chars_added", returns=int ) "`int` : The number of characters added" self.chars_removed = aggregators.sum( mappers.map(len, self.datasources.segments_removed), - name=self._name + ".chars_removed", returns=int + name=self.name + ".chars_removed", returns=int ) "`int` : The number of characters removed" self.numeric_chars_added = aggregators.sum( mappers.map(len, self.datasources.numbers_added), - name=self._name + ".numeric_chars_added", returns=int + name=self.name + ".numeric_chars_added", returns=int ) "`int` : The number of numeric characters added" self.numeric_chars_removed = aggregators.sum( mappers.map(len, self.datasources.numbers_removed), - name=self._name + ".numeric_chars_removed", returns=int + name=self.name + ".numeric_chars_removed", returns=int ) "`int` : The number of numeric characters removed" self.whitespace_chars_added = aggregators.sum( mappers.map(len, self.datasources.whitespaces_added), - name=self._name + ".whitespace_chars_added", returns=int + name=self.name + ".whitespace_chars_added", returns=int ) "`int` : The number of whitespace characters added" self.whitespace_chars_removed = aggregators.sum( mappers.map(len, self.datasources.whitespaces_removed), - name=self._name + ".whitespace_chars_removed", returns=int + name=self.name + ".whitespace_chars_removed", returns=int ) "`int` : The number of whitespace characters removed" self.markup_chars_added = aggregators.sum( mappers.map(len, self.datasources.markups_added), - name=self._name + ".markup_chars_added", returns=int + name=self.name + 
".markup_chars_added", returns=int ) "`int` : The number of markup characters added" self.markup_chars_removed = aggregators.sum( mappers.map(len, self.datasources.markups_removed), - name=self._name + ".markup_chars_removed", returns=int + name=self.name + ".markup_chars_removed", returns=int ) "`int` : The number of markup characters removed" self.cjk_chars_added = aggregators.sum( mappers.map(len, self.datasources.cjks_added), - name=self._name + ".cjk_chars_added", returns=int + name=self.name + ".cjk_chars_added", returns=int ) "`int` : The number of cjk characters added" self.cjk_chars_removed = aggregators.sum( mappers.map(len, self.datasources.cjks_removed), - name=self._name + ".cjk_chars_removed", returns=int + name=self.name + ".cjk_chars_removed", returns=int ) "`int` : The number of cjk characters removed" self.entity_chars_added = aggregators.sum( mappers.map(len, self.datasources.entities_added), - name=self._name + ".entity_chars_added", returns=int + name=self.name + ".entity_chars_added", returns=int ) "`int` : The number of entity characters added" self.entity_chars_removed = aggregators.sum( mappers.map(len, self.datasources.entities_removed), - name=self._name + ".entity_chars_removed", returns=int + name=self.name + ".entity_chars_removed", returns=int ) "`int` : The number of entity characters removed" self.url_chars_added = aggregators.sum( mappers.map(len, self.datasources.urls_added), - name=self._name + ".url_chars_added", returns=int + name=self.name + ".url_chars_added", returns=int ) "`int` : The number of url characters added" self.url_chars_removed = aggregators.sum( mappers.map(len, self.datasources.urls_removed), - name=self._name + ".url_chars_removed", returns=int + name=self.name + ".url_chars_removed", returns=int ) "`int` : The number of url characters removed" self.word_chars_added = aggregators.sum( mappers.map(len, self.datasources.words_added), - name=self._name + ".word_chars_added", returns=int + name=self.name + ".word_chars_added", returns=int ) "`int` : The number of word characters added" self.word_chars_removed = aggregators.sum( mappers.map(len, self.datasources.words_removed), - name=self._name + ".word_chars_removed", returns=int + name=self.name + ".word_chars_removed", returns=int ) "`int` : The number of word characters removed" self.uppercase_word_chars_added = aggregators.sum( mappers.map(len, self.datasources.uppercase_words_added), - name=self._name + ".uppercase_word_chars_added", returns=int + name=self.name + ".uppercase_word_chars_added", returns=int ) "`int` : The number of UPPERCASE word characters added" self.uppercase_word_chars_removed = aggregators.sum( mappers.map(len, self.datasources.uppercase_words_removed), - name=self._name + ".uppercase_word_chars_removed", returns=int + name=self.name + ".uppercase_word_chars_removed", returns=int ) "`int` : The number of UPPERCASE word characters removed" self.punctuation_chars_added = aggregators.sum( mappers.map(len, self.datasources.punctuations_added), - name=self._name + ".punctuation_chars_added", returns=int + name=self.name + ".punctuation_chars_added", returns=int ) "`int` : The number of punctuation characters added" self.punctuation_chars_removed = aggregators.sum( mappers.map(len, self.datasources.punctuations_removed), - name=self._name + ".punctuation_chars_removed", returns=int + name=self.name + ".punctuation_chars_removed", returns=int ) "`int` : The number of punctuation characters removed" self.break_chars_added = aggregators.sum( mappers.map(len, 
self.datasources.breaks_added), - name=self._name + ".break_chars_added", returns=int + name=self.name + ".break_chars_added", returns=int ) "`int` : The number of break characters added" self.break_chars_removed = aggregators.sum( mappers.map(len, self.datasources.breaks_removed), - name=self._name + ".break_chars_removed", returns=int + name=self.name + ".break_chars_removed", returns=int ) "`int` : The number of break characters removed" self.longest_repeated_char_added = \ - Feature(self._name + ".longest_repeated_char_added", + Feature(self.name + ".longest_repeated_char_added", _process_longest_repeated_char_added, returns=int, depends_on=[self.datasources.segments_added]) "`int` : The most repeated character added" diff --git a/revscoring/features/wikitext/features/edit_tokens.py b/revscoring/features/wikitext/features/edit_tokens.py index 667d6566..1b3c3132 100644 --- a/revscoring/features/wikitext/features/edit_tokens.py +++ b/revscoring/features/wikitext/features/edit_tokens.py @@ -1,6 +1,5 @@ from revscoring.datasources.meta import mappers - -from ...meta import aggregators +from revscoring.features.meta import aggregators class Diff: @@ -10,151 +9,151 @@ def __init__(self, *args, **kwargs): self.segments_added = aggregators.len( self.datasources.segments_added, - name=self._name + ".segments_added" + name=self.name + ".segments_added" ) "`int` : The number of segments added " self.segments_removed = aggregators.len( self.datasources.segments_removed, - name=self._name + ".segments_removed" + name=self.name + ".segments_removed" ) "`int` : The number of segments removed " self.tokens_added = aggregators.len( self.datasources.tokens_added, - name=self._name + ".tokens_added" + name=self.name + ".tokens_added" ) "`int` : The number of tokens added " self.tokens_removed = aggregators.len( self.datasources.tokens_removed, - name=self._name + ".tokens_removed" + name=self.name + ".tokens_removed" ) "`int` : The number of tokens removed " self.numbers_added = aggregators.len( self.datasources.numbers_added, - name=self._name + ".numbers_added" + name=self.name + ".numbers_added" ) "`int` : The number of number tokens added " self.numbers_removed = aggregators.len( self.datasources.numbers_removed, - name=self._name + ".numbers_removed" + name=self.name + ".numbers_removed" ) "`int` : The number of number tokens removed " self.markups_added = aggregators.len( self.datasources.markups_added, - name=self._name + ".markups_added" + name=self.name + ".markups_added" ) "`int` : The number of markup tokens added " self.markups_removed = aggregators.len( self.datasources.markups_removed, - name=self._name + ".markups_removed" + name=self.name + ".markups_removed" ) "`int` : The number of markup tokens removed " self.whitespaces_added = aggregators.len( self.datasources.whitespaces_added, - name=self._name + ".whitespaces_added" + name=self.name + ".whitespaces_added" ) "`int` : The number of whitespace tokens added " self.whitespaces_removed = aggregators.len( self.datasources.whitespaces_removed, - name=self._name + ".whitespaces_removed" + name=self.name + ".whitespaces_removed" ) "`int` : The number of whitespace tokens removed " self.cjks_added = aggregators.len( self.datasources.cjks_added, - name=self._name + ".cjks_added" + name=self.name + ".cjks_added" ) "`int` : The number of cjk tokens added " self.cjks_removed = aggregators.len( self.datasources.cjks_removed, - name=self._name + ".cjks_removed" + name=self.name + ".cjks_removed" ) "`int` : The number of cjk tokens removed " 
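# Every count above follows one pattern: wrap a list datasource in
# aggregators.len. A minimal sketch with a hypothetical stand-in
# datasource (an extractor would normally populate the cache):
from revscoring.datasources import Datasource
from revscoring.dependencies import solve
from revscoring.features.meta import aggregators

tokens_added = Datasource("diff.tokens_added")  # stand-in
tokens_added_count = aggregators.len(tokens_added,
                                     name="diff.tokens_added_count")

print(solve(tokens_added_count, cache={tokens_added: ["foo", "bar"]}))  # 2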
self.entities_added = aggregators.len( self.datasources.entities_added, - name=self._name + ".entities_added" + name=self.name + ".entities_added" ) "`int` : The number of entity tokens added " self.entities_removed = aggregators.len( self.datasources.entities_removed, - name=self._name + ".entities_removed" + name=self.name + ".entities_removed" ) "`int` : The number of entity tokens removed " self.urls_added = aggregators.len( self.datasources.urls_added, - name=self._name + ".urls_added" + name=self.name + ".urls_added" ) "`int` : The number of url tokens added " self.urls_removed = aggregators.len( self.datasources.urls_removed, - name=self._name + ".urls_removed" + name=self.name + ".urls_removed" ) "`int` : The number of url tokens removed " self.words_added = aggregators.len( self.datasources.words_added, - name=self._name + ".words_added" + name=self.name + ".words_added" ) "`int` : The number of word tokens added " self.words_removed = aggregators.len( self.datasources.words_removed, - name=self._name + ".words_removed" + name=self.name + ".words_removed" ) "`int` : The number of word tokens removed " self.uppercase_words_added = aggregators.len( self.datasources.uppercase_words_added, - name=self._name + ".uppercase_words_added" + name=self.name + ".uppercase_words_added" ) "`int` : The number of word tokens added " self.uppercase_words_removed = aggregators.len( self.datasources.uppercase_words_removed, - name=self._name + ".uppercase_words_removed" + name=self.name + ".uppercase_words_removed" ) "`int` : The number of word tokens removed " self.punctuations_added = aggregators.len( self.datasources.punctuations_added, - name=self._name + ".punctuations_added" + name=self.name + ".punctuations_added" ) "`int` : The number of punctuation tokens added " self.punctuations_removed = aggregators.len( self.datasources.punctuations_removed, - name=self._name + ".punctuations_removed" + name=self.name + ".punctuations_removed" ) "`int` : The number of punctuation tokens removed " self.breaks_added = aggregators.len( self.datasources.breaks_added, - name=self._name + ".breaks_added" + name=self.name + ".breaks_added" ) "`int` : The number of break tokens added " self.breaks_removed = aggregators.len( self.datasources.breaks_removed, - name=self._name + ".breaks_removed" + name=self.name + ".breaks_removed" ) "`int` : The number of break tokens removed" self.longest_token_added = aggregators.max( mappers.map(len, self.datasources.tokens_added), - name=self._name + '.longest_token_added' + name=self.name + '.longest_token_added' ) "`int` : The length of the longest token added" diff --git a/revscoring/features/wikitext/features/parsed.py b/revscoring/features/wikitext/features/parsed.py index a4545c83..7b89a3d9 100644 --- a/revscoring/features/wikitext/features/parsed.py +++ b/revscoring/features/wikitext/features/parsed.py @@ -1,10 +1,10 @@ import re +from revscoring import Feature +from revscoring.dependencies import DependentSet +from revscoring.features.meta import aggregators from textstat.textstat import textstat -from ...feature import Feature -from ...meta import aggregators - class Revision: @@ -13,7 +13,7 @@ def __init__(self, *args, **kwargs): self.content_chars = aggregators.len( self.datasources.content, - name=self._name + ".content_chars" + name=self.name + ".content_chars" ) """ `int` : The number of characters of viewable content (no markup or @@ -21,7 +21,7 @@ def __init__(self, *args, **kwargs): """ self.flesh_kincaid = Feature( - self._name + ".flesh_kincaid", + 
self.name + ".flesh_kincaid", textstat.flesch_reading_ease, depends_on=[self.datasources.content], returns=float @@ -33,40 +33,41 @@ def __init__(self, *args, **kwargs): self.headings = aggregators.len( self.datasources.headings, - name=self._name + ".headings" + name=self.name + ".headings" ) "`int` : The number of headings" self.external_links = aggregators.len( self.datasources.external_links, - name=self._name + ".external_links" + name=self.name + ".external_links" ) "`int` : The number of external links" self.wikilinks = aggregators.len( self.datasources.wikilinks, - name=self._name + ".wikilinks" + name=self.name + ".wikilinks" ) "`int` : The number of wikilinks (internal to other pages in the wiki)" self.tags = aggregators.len( self.datasources.tags, - name=self._name + ".tags" + name=self.name + ".tags" ) "`int` : The number of HTML tags" self.ref_tags = aggregators.len( self.datasources.tag_names_matching(r"ref"), - name=self._name + ".ref_tags" + name=self.name + ".ref_tags" ) "`int` : The number of tags" self.templates = aggregators.len( self.datasources.templates, - name=self._name + ".templates" + name=self.name + ".templates" ) "`int` : The number of templates" + @DependentSet.meta_dependent def heading_titles_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Feature` that that generates a count of @@ -75,7 +76,7 @@ def heading_titles_matching(self, regex, name=None): if not hasattr(regex, "pattern"): regex = re.compile(regex, re.I) if name is None: - name = "{0}({1})".format(self._name + ".heading_titles_matching", + name = "{0}({1})".format(self.name + ".heading_titles_matching", regex.pattern) return aggregators.len( @@ -83,19 +84,21 @@ def heading_titles_matching(self, regex, name=None): name=name ) + @DependentSet.meta_dependent def headings_by_level(self, level, name=None): """ Constructs a :class:`revscoring.Datasource` that generates a count of all headers of a level. 
""" if name is None: - name = "{0}({1})".format(self._name + ".headings_by_level", + name = "{0}({1})".format(self.name + ".headings_by_level", level) return aggregators.len( self.datasources.headings_by_level(level), name=name ) + @DependentSet.meta_dependent def external_link_urls_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Datasource` that generates a count of @@ -106,7 +109,7 @@ def external_link_urls_matching(self, regex, name=None): if name is None: name = "{0}({1})" \ - .format(self._name + ".external_link_urls_matching", + .format(self.name + ".external_link_urls_matching", regex.pattern) return aggregators.len( @@ -114,6 +117,7 @@ def external_link_urls_matching(self, regex, name=None): name=name ) + @DependentSet.meta_dependent def wikilink_titles_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Datasource` that that generates a count @@ -124,7 +128,7 @@ def wikilink_titles_matching(self, regex, name=None): if name is None: name = "{0}({1})" \ - .format(self._name + ".wikilink_titles_matching", + .format(self.name + ".wikilink_titles_matching", regex.pattern) return aggregators.len( @@ -132,6 +136,7 @@ def wikilink_titles_matching(self, regex, name=None): name=name ) + @DependentSet.meta_dependent def tag_names_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Datasource` that generates a count of @@ -142,13 +147,14 @@ def tag_names_matching(self, regex, name=None): if name is None: name = "{0}({1})" \ - .format(self._name + ".tag_names_matching", regex.pattern) + .format(self.name + ".tag_names_matching", regex.pattern) return aggregators.len( self.datasources.tag_names_matching(regex), name=name ) + @DependentSet.meta_dependent def template_names_matching(self, regex, name=None): """ Constructs a :class:`revscoring.Feature` that generates a count of @@ -159,7 +165,7 @@ def template_names_matching(self, regex, name=None): if name is None: name = "{0}({1})" \ - .format(self._name + ".template_names_matching", + .format(self.name + ".template_names_matching", regex.pattern) return aggregators.len( diff --git a/revscoring/features/wikitext/features/tokenized.py b/revscoring/features/wikitext/features/tokenized.py index e9333c27..b93c4fc9 100644 --- a/revscoring/features/wikitext/features/tokenized.py +++ b/revscoring/features/wikitext/features/tokenized.py @@ -1,6 +1,5 @@ from revscoring.datasources.meta import dicts, filters, mappers - -from ...meta import aggregators +from revscoring.features.meta import aggregators class Revision: @@ -46,25 +45,25 @@ def __init__(self, *args, **kwargs): self.token_delta_sum = aggregators.sum( dicts.values(self.datasources.token_delta), - name=self._name + ".token_delta_sum" + name=self.name + ".token_delta_sum" ) "`int` : The sum of delta changes in the token frequency table" self.token_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.token_delta)), - name=self._name + ".token_delta_increase" + name=self.name + ".token_delta_increase" ) "`int` : The sum of delta increases in the token frequency table" self.token_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.token_delta)), - name=self._name + ".token_delta_decrease" + name=self.name + ".token_delta_decrease" ) "`int` : The sum of delta decreases in the token frequency table" self.token_prop_delta_sum = aggregators.sum( dicts.values(self.datasources.token_prop_delta), - name=self._name + ".token_prop_delta_sum" + name=self.name + ".token_prop_delta_sum" ) """ `int` 
: The sum of proportional delta changes in the token @@ -73,7 +72,7 @@ def __init__(self, *args, **kwargs): self.token_prop_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.token_prop_delta)), - name=self._name + ".token_prop_delta_increase" + name=self.name + ".token_prop_delta_increase" ) """ `int` : The sum of proportional delta increases in the token @@ -82,7 +81,7 @@ def __init__(self, *args, **kwargs): self.token_prop_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.token_prop_delta)), - name=self._name + ".token_prop_delta_decrease" + name=self.name + ".token_prop_delta_decrease" ) """ `int` : The sum of proportional delta decreases in the token @@ -92,25 +91,25 @@ def __init__(self, *args, **kwargs): # number self.number_delta_sum = aggregators.sum( dicts.values(self.datasources.number_delta), - name=self._name + ".number_delta_sum" + name=self.name + ".number_delta_sum" ) "`int` : The sum of delta changes in the number frequency table" self.number_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.number_delta)), - name=self._name + ".number_delta_increase" + name=self.name + ".number_delta_increase" ) "`int` : The sum of delta increases in the number frequency table" self.number_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.number_delta)), - name=self._name + ".number_delta_decrease" + name=self.name + ".number_delta_decrease" ) "`int` : The sum of delta decreases in the number frequency table" self.number_prop_delta_sum = aggregators.sum( dicts.values(self.datasources.number_prop_delta), - name=self._name + ".number_prop_delta_sum" + name=self.name + ".number_prop_delta_sum" ) """ `int` : The sum of proportional delta changes in the number @@ -119,7 +118,7 @@ def __init__(self, *args, **kwargs): self.number_prop_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.number_prop_delta)), - name=self._name + ".number_prop_delta_increase" + name=self.name + ".number_prop_delta_increase" ) """ `int` : The sum of proportional delta increases in the number @@ -128,7 +127,7 @@ def __init__(self, *args, **kwargs): self.number_prop_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.number_prop_delta)), - name=self._name + ".number_prop_delta_decrease" + name=self.name + ".number_prop_delta_decrease" ) """ `int` : The sum of proportional delta decreases in the number @@ -138,25 +137,25 @@ def __init__(self, *args, **kwargs): # whitespace self.whitespace_delta_sum = aggregators.sum( dicts.values(self.datasources.whitespace_delta), - name=self._name + ".whitespace_delta_sum" + name=self.name + ".whitespace_delta_sum" ) "`int` : The sum of delta changes in the whitespace frequency table" self.whitespace_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.whitespace_delta)), - name=self._name + ".whitespace_delta_increase" + name=self.name + ".whitespace_delta_increase" ) "`int` : The sum of delta increases in the whitespace frequency table" self.whitespace_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.whitespace_delta)), - name=self._name + ".whitespace_delta_decrease" + name=self.name + ".whitespace_delta_decrease" ) "`int` : The sum of delta decreases in the whitespace frequency table" self.whitespace_prop_delta_sum = aggregators.sum( dicts.values(self.datasources.whitespace_prop_delta), - name=self._name + ".whitespace_prop_delta_sum" + 
name=self.name + ".whitespace_prop_delta_sum" ) """ `int` : The sum of proportional delta changes in the whitespace @@ -166,7 +165,7 @@ def __init__(self, *args, **kwargs): self.whitespace_prop_delta_increase = aggregators.sum( filters.positive(dicts.values( self.datasources.whitespace_prop_delta)), - name=self._name + ".whitespace_prop_delta_increase" + name=self.name + ".whitespace_prop_delta_increase" ) """ `int` : The sum of proportional delta increases in the whitespace @@ -176,7 +175,7 @@ def __init__(self, *args, **kwargs): self.whitespace_prop_delta_decrease = aggregators.sum( filters.negative(dicts.values( self.datasources.whitespace_prop_delta)), - name=self._name + ".whitespace_prop_delta_decrease" + name=self.name + ".whitespace_prop_delta_decrease" ) """ `int` : The sum of proportional delta decreases in the whitespace @@ -186,25 +185,25 @@ def __init__(self, *args, **kwargs): # markup self.markup_delta_sum = aggregators.sum( dicts.values(self.datasources.markup_delta), - name=self._name + ".markup_delta_sum" + name=self.name + ".markup_delta_sum" ) "`int` : The sum of delta changes in the markup frequency table" self.markup_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.markup_delta)), - name=self._name + ".markup_delta_increase" + name=self.name + ".markup_delta_increase" ) "`int` : The sum of delta increases in the markup frequency table" self.markup_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.markup_delta)), - name=self._name + ".markup_delta_decrease" + name=self.name + ".markup_delta_decrease" ) "`int` : The sum of delta decreases in the markup frequency table" self.markup_prop_delta_sum = aggregators.sum( dicts.values(self.datasources.markup_prop_delta), - name=self._name + ".markup_prop_delta_sum" + name=self.name + ".markup_prop_delta_sum" ) """ `int` : The sum of proportional delta changes in the markup @@ -213,7 +212,7 @@ def __init__(self, *args, **kwargs): self.markup_prop_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.markup_prop_delta)), - name=self._name + ".markup_prop_delta_increase" + name=self.name + ".markup_prop_delta_increase" ) """ `int` : The sum of proportional delta increases in the markup @@ -222,7 +221,7 @@ def __init__(self, *args, **kwargs): self.markup_prop_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.markup_prop_delta)), - name=self._name + ".markup_prop_delta_decrease" + name=self.name + ".markup_prop_delta_decrease" ) """ `int` : The sum of proportional delta decreases in the markup @@ -232,25 +231,25 @@ def __init__(self, *args, **kwargs): # cjk self.cjk_delta_sum = aggregators.sum( dicts.values(self.datasources.cjk_delta), - name=self._name + ".cjk_delta_sum" + name=self.name + ".cjk_delta_sum" ) "`int` : The sum of delta changes in the cjk frequency table" self.cjk_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.cjk_delta)), - name=self._name + ".cjk_delta_increase" + name=self.name + ".cjk_delta_increase" ) "`int` : The sum of delta increases in the cjk frequency table" self.cjk_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.cjk_delta)), - name=self._name + ".cjk_delta_decrease" + name=self.name + ".cjk_delta_decrease" ) "`int` : The sum of delta decreases in the cjk frequency table" self.cjk_prop_delta_sum = aggregators.sum( dicts.values(self.datasources.cjk_prop_delta), - name=self._name + ".cjk_prop_delta_sum" + name=self.name + 
".cjk_prop_delta_sum" ) """ `int` : The sum of proportional delta changes in the cjk @@ -259,7 +258,7 @@ def __init__(self, *args, **kwargs): self.cjk_prop_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.cjk_prop_delta)), - name=self._name + ".cjk_prop_delta_increase" + name=self.name + ".cjk_prop_delta_increase" ) """ `int` : The sum of proportional delta increases in the cjk @@ -268,7 +267,7 @@ def __init__(self, *args, **kwargs): self.cjk_prop_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.cjk_prop_delta)), - name=self._name + ".cjk_prop_delta_decrease" + name=self.name + ".cjk_prop_delta_decrease" ) """ `int` : The sum of proportional delta decreases in the cjk @@ -278,25 +277,25 @@ def __init__(self, *args, **kwargs): # entity self.entity_delta_sum = aggregators.sum( dicts.values(self.datasources.entity_delta), - name=self._name + ".entity_delta_sum" + name=self.name + ".entity_delta_sum" ) "`int` : The sum of delta changes in the entity frequency table" self.entity_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.entity_delta)), - name=self._name + ".entity_delta_increase" + name=self.name + ".entity_delta_increase" ) "`int` : The sum of delta increases in the entity frequency table" self.entity_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.entity_delta)), - name=self._name + ".entity_delta_decrease" + name=self.name + ".entity_delta_decrease" ) "`int` : The sum of delta decreases in the entity frequency table" self.entity_prop_delta_sum = aggregators.sum( dicts.values(self.datasources.entity_prop_delta), - name=self._name + ".entity_prop_delta_sum" + name=self.name + ".entity_prop_delta_sum" ) """ `int` : The sum of proportional delta changes in the entity @@ -305,7 +304,7 @@ def __init__(self, *args, **kwargs): self.entity_prop_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.entity_prop_delta)), - name=self._name + ".entity_prop_delta_increase" + name=self.name + ".entity_prop_delta_increase" ) """ `int` : The sum of proportional delta increases in the entity @@ -314,7 +313,7 @@ def __init__(self, *args, **kwargs): self.entity_prop_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.entity_prop_delta)), - name=self._name + ".entity_prop_delta_decrease" + name=self.name + ".entity_prop_delta_decrease" ) """ `int` : The sum of proportional delta decreases in the entity @@ -324,25 +323,25 @@ def __init__(self, *args, **kwargs): # url self.url_delta_sum = aggregators.sum( dicts.values(self.datasources.url_delta), - name=self._name + ".url_delta_sum" + name=self.name + ".url_delta_sum" ) "`int` : The sum of delta changes in the url frequency table" self.url_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.url_delta)), - name=self._name + ".url_delta_increase" + name=self.name + ".url_delta_increase" ) "`int` : The sum of delta increases in the url frequency table" self.url_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.url_delta)), - name=self._name + ".url_delta_decrease" + name=self.name + ".url_delta_decrease" ) "`int` : The sum of delta decreases in the url frequency table" self.url_prop_delta_sum = aggregators.sum( dicts.values(self.datasources.url_prop_delta), - name=self._name + ".url_prop_delta_sum" + name=self.name + ".url_prop_delta_sum" ) """ `int` : The sum of proportional delta changes in the url @@ -351,7 +350,7 @@ 
def __init__(self, *args, **kwargs): self.url_prop_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.url_prop_delta)), - name=self._name + ".url_prop_delta_increase" + name=self.name + ".url_prop_delta_increase" ) """ `int` : The sum of proportional delta increases in the url @@ -360,7 +359,7 @@ def __init__(self, *args, **kwargs): self.url_prop_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.url_prop_delta)), - name=self._name + ".url_prop_delta_decrease" + name=self.name + ".url_prop_delta_decrease" ) """ `int` : The sum of proportional delta decreases in the url @@ -370,25 +369,25 @@ def __init__(self, *args, **kwargs): # word self.word_delta_sum = aggregators.sum( dicts.values(self.datasources.word_delta), - name=self._name + ".word_delta_sum" + name=self.name + ".word_delta_sum" ) "`int` : The sum of delta changes in the word frequency table" self.word_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.word_delta)), - name=self._name + ".word_delta_increase" + name=self.name + ".word_delta_increase" ) "`int` : The sum of delta increases in the word frequency table" self.word_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.word_delta)), - name=self._name + ".word_delta_decrease" + name=self.name + ".word_delta_decrease" ) "`int` : The sum of delta decreases in the word frequency table" self.word_prop_delta_sum = aggregators.sum( dicts.values(self.datasources.word_prop_delta), - name=self._name + ".word_prop_delta_sum" + name=self.name + ".word_prop_delta_sum" ) """ `int` : The sum of proportional delta changes in the word @@ -397,7 +396,7 @@ def __init__(self, *args, **kwargs): self.word_prop_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.word_prop_delta)), - name=self._name + ".word_prop_delta_increase" + name=self.name + ".word_prop_delta_increase" ) """ `int` : The sum of proportional delta increases in the word @@ -406,7 +405,7 @@ def __init__(self, *args, **kwargs): self.word_prop_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.word_prop_delta)), - name=self._name + ".word_prop_delta_decrease" + name=self.name + ".word_prop_delta_decrease" ) """ `int` : The sum of proportional delta decreases in the word @@ -418,7 +417,7 @@ def __init__(self, *args, **kwargs): dicts.values(self.datasources.uppercase_word_delta) self.uppercase_word_delta_sum = aggregators.sum( uppercase_word_delta_values, - name=self._name + ".uppercase_word_delta_sum" + name=self.name + ".uppercase_word_delta_sum" ) """ `int` : The sum of delta changes in the UPPERCASE word frequency @@ -427,7 +426,7 @@ def __init__(self, *args, **kwargs): self.uppercase_word_delta_increase = aggregators.sum( filters.positive(uppercase_word_delta_values), - name=self._name + ".uppercase_word_delta_increase" + name=self.name + ".uppercase_word_delta_increase" ) """ `int` : The sum of delta increases in the UPPERCASE word frequency @@ -436,7 +435,7 @@ def __init__(self, *args, **kwargs): self.uppercase_word_delta_decrease = aggregators.sum( filters.negative(uppercase_word_delta_values), - name=self._name + ".uppercase_word_delta_decrease" + name=self.name + ".uppercase_word_delta_decrease" ) """ `int` : The sum of delta decreases in the UPPERCASE word frequency @@ -447,7 +446,7 @@ def __init__(self, *args, **kwargs): dicts.values(self.datasources.uppercase_word_prop_delta) self.uppercase_word_prop_delta_sum = aggregators.sum( 
uppercase_word_prop_delta_values, - name=self._name + ".uppercase_word_prop_delta_sum" + name=self.name + ".uppercase_word_prop_delta_sum" ) """ `float` : The sum of proportional delta changes in the UPPERCASE word @@ -456,7 +455,7 @@ def __init__(self, *args, **kwargs): self.uppercase_word_prop_delta_increase = aggregators.sum( filters.positive(uppercase_word_prop_delta_values), - name=self._name + ".uppercase_word_prop_delta_increase" + name=self.name + ".uppercase_word_prop_delta_increase" ) """ `float` : The sum of proportional delta increases in the UPPERCASE word @@ -465,7 +464,7 @@ def __init__(self, *args, **kwargs): self.uppercase_word_prop_delta_decrease = aggregators.sum( filters.negative(uppercase_word_prop_delta_values), - name=self._name + ".uppercase_word_prop_delta_decrease" + name=self.name + ".uppercase_word_prop_delta_decrease" ) """ `float` : The sum of proportional delta decreases in the UPPERCASE word @@ -475,25 +474,25 @@ def __init__(self, *args, **kwargs): # punctuation self.punctuation_delta_sum = aggregators.sum( dicts.values(self.datasources.punctuation_delta), - name=self._name + ".punctuation_delta_sum" + name=self.name + ".punctuation_delta_sum" ) "`int` : The sum of delta changes in the punctuation frequency table" self.punctuation_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.punctuation_delta)), - name=self._name + ".punctuation_delta_increase" + name=self.name + ".punctuation_delta_increase" ) "`int` : The sum of delta increases in the punctuation frequency table" self.punctuation_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.punctuation_delta)), - name=self._name + ".punctuation_delta_decrease" + name=self.name + ".punctuation_delta_decrease" ) "`int` : The sum of delta decreases in the punctuation frequency table" self.punctuation_prop_delta_sum = aggregators.sum( dicts.values(self.datasources.punctuation_prop_delta), - name=self._name + ".punctuation_prop_delta_sum" + name=self.name + ".punctuation_prop_delta_sum" ) """ `int` : The sum of proportional delta changes in the punctuation @@ -503,7 +502,7 @@ def __init__(self, *args, **kwargs): self.punctuation_prop_delta_increase = aggregators.sum( filters.positive(dicts.values( self.datasources.punctuation_prop_delta)), - name=self._name + ".punctuation_prop_delta_increase" + name=self.name + ".punctuation_prop_delta_increase" ) """ `int` : The sum of proportional delta increases in the punctuation @@ -513,7 +512,7 @@ def __init__(self, *args, **kwargs): self.punctuation_prop_delta_decrease = aggregators.sum( filters.negative(dicts.values( self.datasources.punctuation_prop_delta)), - name=self._name + ".punctuation_prop_delta_decrease" + name=self.name + ".punctuation_prop_delta_decrease" ) """ `int` : The sum of proportional delta decreases in the punctuation @@ -523,25 +522,25 @@ def __init__(self, *args, **kwargs): # break self.break_delta_sum = aggregators.sum( dicts.values(self.datasources.break_delta), - name=self._name + ".break_delta_sum" + name=self.name + ".break_delta_sum" ) "`int` : The sum of delta changes in the break frequency table" self.break_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.break_delta)), - name=self._name + ".break_delta_increase" + name=self.name + ".break_delta_increase" ) "`int` : The sum of delta increases in the break frequency table" self.break_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.break_delta)), - name=self._name + 
".break_delta_decrease" + name=self.name + ".break_delta_decrease" ) "`int` : The sum of delta decreases in the break frequency table" self.break_prop_delta_sum = aggregators.sum( dicts.values(self.datasources.break_prop_delta), - name=self._name + ".break_prop_delta_sum" + name=self.name + ".break_prop_delta_sum" ) """ `int` : The sum of proportional delta changes in the break @@ -550,7 +549,7 @@ def __init__(self, *args, **kwargs): self.break_prop_delta_increase = aggregators.sum( filters.positive(dicts.values(self.datasources.break_prop_delta)), - name=self._name + ".break_prop_delta_increase" + name=self.name + ".break_prop_delta_increase" ) """ `int` : The sum of proportional delta increases in the break @@ -559,7 +558,7 @@ def __init__(self, *args, **kwargs): self.break_prop_delta_decrease = aggregators.sum( filters.negative(dicts.values(self.datasources.break_prop_delta)), - name=self._name + ".break_prop_delta_decrease" + name=self.name + ".break_prop_delta_decrease" ) """ `int` : The sum of proportional delta decreases in the break diff --git a/revscoring/features/wikitext/session_oriented.py b/revscoring/features/wikitext/session_oriented.py new file mode 100644 index 00000000..da5918a6 --- /dev/null +++ b/revscoring/features/wikitext/session_oriented.py @@ -0,0 +1,33 @@ +from revscoring.datasources import session_oriented +from revscoring.dependencies import DependentSet + +from . import datasources, features + +name = "wikitext.session" + + +class Session(DependentSet): + """ + Represents an editor's activity session + """ + def __init__(self, name, revisions_datasources): + super().__init__(name) + session_revision = features.Revision( + name + ".revisions", + datasources.Revision(name, revisions_datasources)) + self.revisions = session_oriented.list_of_tree( + session_revision, rewrite_name=session_oriented.rewrite_name, + cache={d.name: d for d in revisions_datasources}) + """ + :class:`~revscoring.datasources.meta.expanders.list_of`(:class:`~revscoring.features.wikitext.Revision`) : + The revisions saved by the users within the session. + """ + +session = Session(name, session_oriented.session.revisions) +""" +Represents an editor's activity session. 
Implements this basic structure: +* session: :class:`~revscoring.features.wikitext.Session` + * revisions: a :class:`~revscoring.datasources.meta.expanders.list_of`(:class:`~revscoring.features.wikitext.Revision`) + * parent: a :class:`~revscoring.datasources.meta.expanders.list_of`(:class:`~revscoring.features.wikitext.Revision`) + * diff: a :class:`~revscoring.datasources.meta.expanders.list_of`(:class:`~revscoring.features.wikitext.Diff`) +""" diff --git a/revscoring/languages/features/regex_matches/regex_matches.py b/revscoring/languages/features/regex_matches/regex_matches.py index f31db822..d8efdbfa 100644 --- a/revscoring/languages/features/regex_matches/regex_matches.py +++ b/revscoring/languages/features/regex_matches/regex_matches.py @@ -50,7 +50,7 @@ def excluding(self, exclusions, name=None): will be used """ return self.__class__( - name or self._name + ".excluding({0!r})".format(exclusions), + name or self.name + ".excluding({0!r})".format(exclusions), self._regexes, exclusions=(self._exclusions or []) + exclusions, wrapping=self._wrapping) diff --git a/tests/datasources/meta/tests/test_expanders.py b/tests/datasources/meta/tests/test_expanders.py new file mode 100644 index 00000000..cf4a5933 --- /dev/null +++ b/tests/datasources/meta/tests/test_expanders.py @@ -0,0 +1,21 @@ +import pickle + +from revscoring import Datasource, Feature +from revscoring.datasources.meta import expanders +from revscoring.dependencies import solve + + +def process_chars(text): + return len(text) + +text = Datasource("text") +chars = Feature("chars", process_chars, returns=int, depends_on=[text]) +many_texts = expanders.list_of(text) +many_chars = expanders.list_of(chars, depends_on=[many_texts]) + + +def test_list_of(): + assert solve(many_chars, cache={many_texts: ["foo", "barbaz"]}) == \ + [3, 6] + + assert pickle.loads(pickle.dumps(many_chars)) == many_chars diff --git a/tests/datasources/meta/tests/test_selectors.py b/tests/datasources/meta/tests/test_selectors.py index 163b7930..61787045 100644 --- a/tests/datasources/meta/tests/test_selectors.py +++ b/tests/datasources/meta/tests/test_selectors.py @@ -1,4 +1,4 @@ -import io +import pickle from revscoring.datasources.datasource import Datasource from revscoring.datasources.meta import frequencies, selectors @@ -35,10 +35,7 @@ def test_tfidf(): assert tfidf_table['maybe'] > 0 assert my_tfidf_table.document_n == 9 - f = io.BytesIO() - my_tfidf_table.dump(f) - f.seek(0) - loaded_my_tfidf_table = Datasource.load(f) + loaded_my_tfidf_table = pickle.loads(pickle.dumps(my_tfidf_table)) assert (solve(my_tfidf_table, cache=cache) == solve(loaded_my_tfidf_table, cache=cache)) diff --git a/tests/datasources/test_session_oriented.py b/tests/datasources/test_session_oriented.py new file mode 100644 index 00000000..65fd99cc --- /dev/null +++ b/tests/datasources/test_session_oriented.py @@ -0,0 +1,150 @@ +from mwtypes import Timestamp +from revscoring import Datasource +from revscoring.datasources.session_oriented import (list_of_tree, + rewrite_name, session) +from revscoring.dependencies import DependentSet, solve + +from .util import check_datasource + + +def test_session(): + check_datasource(session.revisions.id) + check_datasource(session.revisions.timestamp) + check_datasource(session.revisions.comment) + check_datasource(session.revisions.byte_len) + check_datasource(session.revisions.minor) + check_datasource(session.revisions.content_model) + check_datasource(session.revisions.text) + assert hasattr(session.revisions, "parent") + assert 
hasattr(session.revisions, "page") + assert hasattr(session.revisions, "diff") + + # session.revisions.parent + check_datasource(session.revisions.parent.id) + assert hasattr(session.revisions.parent, "user") + check_datasource(session.revisions.parent.user.id) + check_datasource(session.revisions.parent.user.text) + assert not hasattr(session.revisions.parent.user, "info") + check_datasource(session.revisions.parent.timestamp) + check_datasource(session.revisions.parent.comment) + check_datasource(session.revisions.parent.byte_len) + check_datasource(session.revisions.parent.minor) + check_datasource(session.revisions.parent.content_model) + check_datasource(session.revisions.parent.text) + assert not hasattr(session.revisions.parent, "page") + assert not hasattr(session.revisions.parent, "parent") + assert not hasattr(session.revisions.parent, "diff") + + # session.revisions.page + check_datasource(session.revisions.page.id) + assert hasattr(session.revisions.page, "namespace") + check_datasource(session.revisions.page.namespace.id) + check_datasource(session.revisions.page.namespace.name) + check_datasource(session.revisions.page.title) + assert hasattr(session.revisions.page, "creation") + check_datasource(session.revisions.page.creation.id) + assert hasattr(session.revisions.page.creation, "user") + check_datasource(session.revisions.page.creation.timestamp) + check_datasource(session.revisions.page.creation.comment) + check_datasource(session.revisions.page.creation.byte_len) + check_datasource(session.revisions.page.creation.minor) + check_datasource(session.revisions.page.creation.content_model) + assert not hasattr(session.revisions.page.creation, "page") + assert not hasattr(session.revisions.page.creation, "text") + assert not hasattr(session.revisions.page.creation, "diff") + assert hasattr(session.revisions.page, "suggested") + check_datasource(session.revisions.page.suggested.properties) + + # session.revisions.page.creation.user + check_datasource(session.revisions.page.creation.user.id) + check_datasource(session.revisions.page.creation.user.text) + assert hasattr(session.revisions.page.creation.user, "info") + check_datasource(session.revisions.page.creation.user.info.editcount) + check_datasource(session.revisions.page.creation.user.info.registration) + check_datasource(session.revisions.page.creation.user.info.groups) + check_datasource(session.revisions.page.creation.user.info.emailable) + check_datasource(session.revisions.page.creation.user.info.gender) + + # session.user + check_datasource(session.user.id) + check_datasource(session.user.text) + assert hasattr(session.user, "info") + check_datasource(session.user.info.editcount) + check_datasource(session.user.info.registration) + check_datasource(session.user.info.groups) + check_datasource(session.user.info.emailable) + check_datasource(session.user.info.gender) + + +def test_rewrite_name(): + assert rewrite_name("revision.text") == "session.revisions.text" + assert rewrite_name("bytes.revision.foobar") == \ + "bytes.session.revisions.foobar" + assert rewrite_name("session.revisions.text") == "session.revisions.text" + + +def test_timestamp_str(): + cache = {session.revisions.timestamp_str: ["1970-01-01T00:00:00Z"]} + assert solve(session.revisions.timestamp, cache=cache) == [Timestamp(0)] + + +def test_list_of_meta(): + text = Datasource("text") + + class contains(Datasource): + + def __init__(self, string_datasource, value, name=None): + name = self._format_name(name, [string_datasource, value]) + super().__init__(name, self.process, 
depends_on=[string_datasource]) + self.value = value + + def process(self, string): + return self.value in string + + def text_contains(value): + return contains(text, value) + + +def test_list_of_tree(): + class TestThing(DependentSet): + + def __init__(self, name): + super().__init__(name) + self.text = Datasource(name + ".text") + self.len = Datasource( + name + ".text.len", self._process_len, depends_on=[self.text]) + + @staticmethod + def _process_len(text): + return len(text) + + @DependentSet.meta_dependent + def contains(self, value): + return contains( + self.text, value, + name=self.name + ".text.contains({0!r})".format(value)) + + class contains(Datasource): + + def __init__(self, string_datasource, value, name=None): + name = self._format_name(name, [string_datasource, value]) + super().__init__(name, self.process, depends_on=[string_datasource]) + self.value = value + + def process(self, string): + return self.value in string + + thing = TestThing("thing") + cache = {thing.text: "Hello"} + assert solve(thing.len, cache=cache) == 5 + assert solve(thing.contains("el"), cache=cache) + assert not solve(thing.contains("Foobar"), cache=cache) + + list_of_thing = list_of_tree( + TestThing("thing"), + rewrite_name=lambda n: "list_of_" + n if not n.startswith("list_of_") else n) + + cache = {list_of_thing.text: ["Hello", "Foobar"]} + assert solve(list_of_thing.len, cache=cache) == [5, 6] + assert solve(list_of_thing.contains("el"), cache=cache) == [True, False] + assert solve(list_of_thing.contains("Foobar"), cache=cache) == [False, True] diff --git a/tests/features/bytes/tests/test_session_oriented.py b/tests/features/bytes/tests/test_session_oriented.py new file mode 100644 index 00000000..ca492e53 --- /dev/null +++ b/tests/features/bytes/tests/test_session_oriented.py @@ -0,0 +1,23 @@ +import pickle + +from revscoring.datasources.session_oriented import session +from revscoring.dependencies import solve +from revscoring.features import bytes + +length_change = \ + bytes.session.revisions.length - bytes.session.revisions.parent.length + + +def test_length(): + cache = {session.revisions.parent.text: ["I am ascii", "I am too"], + session.revisions.text: ["地を南北に縦走する", ""]} + + assert solve(bytes.session.revisions.length, cache=cache) == [27, 0] + assert solve(bytes.session.revisions.parent.length, cache=cache) == [10, 8] + assert solve(length_change, cache=cache) == [17, -8] + + assert pickle.loads(pickle.dumps(bytes.session.revisions.length)) == \ + bytes.session.revisions.length + assert (pickle.loads(pickle.dumps(bytes.session.revisions.parent.length)) == + bytes.session.revisions.parent.length) + assert pickle.loads(pickle.dumps(length_change)) == length_change diff --git a/tests/features/meta/tests/test_aggregators.py b/tests/features/meta/tests/test_aggregators.py index a2d9fdd4..40e67852 100644 --- a/tests/features/meta/tests/test_aggregators.py +++ b/tests/features/meta/tests/test_aggregators.py @@ -47,20 +47,6 @@ def test_min(): assert pickle.loads(pickle.dumps(my_min)) == my_min -def test_min_vectors(): - my_list = Datasource("my_list") - my_min = aggregators.min(my_list, vector=True) - cache = {my_list: [[1, 2, 3], [4, 5, 6]]} - assert all(a == b for a, b in - zip(solve(my_min, cache=cache), [1, 2, 3])) - cache = {my_list: [[]]} - assert solve(my_min, cache=cache) == [0] - cache = {my_list: [None]} - assert solve(my_min, cache=cache) == [0] - - assert pickle.loads(pickle.dumps(my_min)) == my_min - - def test_max(): my_list = Datasource("my_list") my_max = 
aggregators.max(my_list) @@ -76,7 +62,7 @@ def test_max(): def test_max_vectors(): my_list = Datasource("my_list") - my_max = aggregators.max(my_list, vector=True) + my_max = aggregators.max(my_list, returns=int, vector=True) cache = {my_list: [[1, 2, 3], [4, 5, 6]]} assert all(a == b for a, b in zip(solve(my_max, cache=cache), [4, 5, 6])) @@ -88,6 +74,20 @@ def test_max_vectors(): assert pickle.loads(pickle.dumps(my_max)) == my_max +def test_min_vectors(): + my_list = Datasource("my_list") + my_min = aggregators.min(my_list, returns=int, vector=True) + cache = {my_list: [[1, 2, 3], [4, 5, 6]]} + assert all(a == b for a, b in + zip(solve(my_min, cache=cache), [1, 2, 3])) + cache = {my_list: [[]]} + assert solve(my_min, cache=cache) == [0] + cache = {my_list: [None]} + assert solve(my_min, cache=cache) == [0] + + assert pickle.loads(pickle.dumps(my_min)) == my_min + + def test_len(): my_list = Datasource("my_list") my_len = aggregators.len(my_list) diff --git a/tests/features/meta/tests/test_operators.py b/tests/features/meta/tests/test_operators.py new file mode 100644 index 00000000..ba2359c4 --- /dev/null +++ b/tests/features/meta/tests/test_operators.py @@ -0,0 +1,54 @@ +from revscoring import Feature, FeatureVector +from revscoring.dependencies import solve +from revscoring.features.meta import operators + +int_1 = Feature("int_1", returns=int) +int_2 = Feature("int_2", returns=int) +bool_1 = Feature("bool_1", returns=bool) +bool_2 = Feature("bool_2", returns=bool) +int_vector_1 = FeatureVector("int_vector_1", returns=int) +int_vector_2 = FeatureVector("int_vector_2", returns=int) +bool_vector_1 = FeatureVector("bool_vector_1", returns=bool) +bool_vector_2 = FeatureVector("bool_vector_2", returns=bool) + + +def test_singleton(): + cache = {int_1: 1, int_2: 2} + assert solve(operators.add(int_1, int_2), cache=cache) == 3 + assert solve(operators.sub(int_1, int_2), cache=cache) == -1 + assert solve(operators.mul(int_1, int_2), cache=cache) == 2 + assert solve(operators.div(int_1, int_2), cache=cache) == 0.5 + + assert solve(operators.eq(int_1, int_2), cache=cache) is False + assert solve(operators.ne(int_1, int_2), cache=cache) is True + assert solve(operators.le(int_1, int_2), cache=cache) is True + assert solve(operators.lt(int_1, int_2), cache=cache) is True + assert solve(operators.ge(int_1, int_2), cache=cache) is False + assert solve(operators.gt(int_1, int_2), cache=cache) is False + + cache = {bool_1: True, bool_2: False} + assert solve(operators.and_(bool_1, bool_2), cache=cache) is False + assert solve(operators.or_(bool_1, bool_2), cache=cache) is True + assert solve(operators.not_(bool_1), cache=cache) is False + assert solve(operators.not_(bool_2), cache=cache) is True + + +def test_vector(): + cache = {int_vector_1: [1, 2, 1, 3], int_vector_2: [2, 1, 1, -1]} + assert solve(operators.add(int_vector_1, int_vector_2), cache=cache) == [3, 3, 2, 2] + assert solve(operators.sub(int_vector_1, int_vector_2), cache=cache) == [-1, 1, 0, 4] + assert solve(operators.mul(int_vector_1, int_vector_2), cache=cache) == [2, 2, 1, -3] + assert solve(operators.div(int_vector_1, int_vector_2), cache=cache) == [0.5, 2, 1, -3] + + assert solve(operators.eq(int_vector_1, int_vector_2), cache=cache) == [False, False, True, False] + assert solve(operators.ne(int_vector_1, int_vector_2), cache=cache) == [True, True, False, True] + assert solve(operators.le(int_vector_1, int_vector_2), cache=cache) == [True, False, True, False] + assert solve(operators.lt(int_vector_1, int_vector_2), cache=cache) == 
[True, False, False, False] + assert solve(operators.ge(int_vector_1, int_vector_2), cache=cache) == [False, True, True, True] + assert solve(operators.gt(int_vector_1, int_vector_2), cache=cache) == [False, True, False, True] + + cache = {bool_vector_1: [True, False, True, False], bool_vector_2: [True, True, False, False]} + assert solve(operators.and_(bool_vector_1, bool_vector_2), cache=cache) == [True, False, False, False] + assert solve(operators.or_(bool_vector_1, bool_vector_2), cache=cache) == [True, True, True, False] + assert solve(operators.not_(bool_vector_1), cache=cache) == [False, True, False, True] + assert solve(operators.not_(bool_vector_2), cache=cache) == [False, False, True, True] diff --git a/tests/features/temporal/tests/test_session_oriented.py b/tests/features/temporal/tests/test_session_oriented.py new file mode 100644 index 00000000..85f3e7f6 --- /dev/null +++ b/tests/features/temporal/tests/test_session_oriented.py @@ -0,0 +1,103 @@ +import pickle + +from mwtypes import Timestamp +from revscoring.datasources import session_oriented +from revscoring.dependencies import solve +from revscoring.features.temporal.revision_oriented import \ + MW_REGISTRATION_EPOCH +from revscoring.features.temporal.session_oriented import session + + +def test_session_revisions(): + cache = {session_oriented.session.revisions.timestamp: [Timestamp(0)]} + assert solve(session.revisions.day_of_week, cache=cache) == [3] # Thursday, Jan 1 1970 + assert solve(session.revisions.hour_of_day, cache=cache) == [0] # Midnight + + assert pickle.loads(pickle.dumps(session.revisions.day_of_week) + ) == session.revisions.day_of_week + assert pickle.loads(pickle.dumps(session.revisions.hour_of_day) + ) == session.revisions.hour_of_day + + +def test_session_revisions_string_timestamp(): + cache = {session_oriented.session.revisions.timestamp_str: ["1970-01-01T00:00:00Z"]} + assert solve(session.revisions.day_of_week, cache=cache) == [3] # Thursday, Jan 1 1970 + assert solve(session.revisions.hour_of_day, cache=cache) == [0] # Midnight + + assert pickle.loads(pickle.dumps(session.revisions.day_of_week)) == \ + session.revisions.day_of_week + assert pickle.loads(pickle.dumps(session.revisions.hour_of_day)) == \ + session.revisions.hour_of_day + + +def test_session_page_creation(): + cache = { + session_oriented.session.revisions.timestamp: [Timestamp(10)], + session_oriented.session.revisions.page.creation.timestamp: [Timestamp(0)] + } + assert solve(session.revisions.page.creation.seconds_since, cache=cache) == [10] + + assert (pickle.loads(pickle.dumps(session.revisions.page.creation.seconds_since)) == + session.revisions.page.creation.seconds_since) + + +def test_session_user_registration(): + cache = { + session_oriented.session.revisions.timestamp: [Timestamp(10)], + session_oriented.session.user.id: 10, + session_oriented.session.user.info.registration: Timestamp(0) + } + assert solve(session.user.seconds_since_registration, cache=cache) == 10 + + # Anon (no registration) + cache = { + session_oriented.session.revisions.timestamp: [Timestamp(10)], + session_oriented.session.user.id: 0, + session_oriented.session.user.info.registration: None + } + assert solve(session.user.seconds_since_registration, cache=cache) == 0 + + # Old user (no registration) + cache = { + session_oriented.session.revisions.timestamp: [MW_REGISTRATION_EPOCH + 10], + session_oriented.session.user.id: 10, + session_oriented.session.user.info.registration: None + } + assert solve(session.user.seconds_since_registration, 
cache=cache) == 10 + + # Old user (broken registration date) + cache = { + session_oriented.session.revisions.timestamp: [Timestamp(0)], + session_oriented.session.user.id: 10, + session_oriented.session.user.info.registration: Timestamp(10) + } + assert (solve(session.user.seconds_since_registration, cache=cache) == + 60 * 60 * 24 * 365) # one year + + assert (pickle.loads(pickle.dumps(session.user.seconds_since_registration)) == + session.user.seconds_since_registration) + + +def test_last_user_revision(): + cache = { + session_oriented.session.revisions.timestamp: [Timestamp(10)], + session_oriented.session.user.last_revision.timestamp: Timestamp(0) + } + assert solve(session.user.last_revision.seconds_since, cache=cache) == 10 + + cache = { + session_oriented.session.revisions.timestamp: [Timestamp(10)], + session_oriented.session.user.last_revision.timestamp: None + } + assert solve(session.user.last_revision.seconds_since, cache=cache) == 0 + + +def test_parent_revision(): + cache = { + session_oriented.session.revisions.timestamp: [Timestamp(10)], + session_oriented.session.revisions.parent.timestamp: [Timestamp(0)] + } + assert solve(session.revisions.parent.seconds_since, cache=cache) == [10] + + assert (pickle.loads(pickle.dumps(session.revisions.parent.seconds_since)) == + session.revisions.parent.seconds_since) diff --git a/tests/features/test_functions.py b/tests/features/test_functions.py index 92a84a45..919d0b8d 100644 --- a/tests/features/test_functions.py +++ b/tests/features/test_functions.py @@ -1,7 +1,6 @@ from revscoring.datasources import Datasource -from revscoring.features.feature import Constant, Feature -from revscoring.features.feature_vector import FeatureVector +from revscoring.features.feature import Constant, Feature, FeatureVector from revscoring.features.functions import trim, vectorize_values from revscoring.features.modifiers import log, max diff --git a/tests/features/test_modifiers.py b/tests/features/test_modifiers.py index b7aa0c7e..8c985754 100644 --- a/tests/features/test_modifiers.py +++ b/tests/features/test_modifiers.py @@ -5,6 +5,46 @@ from revscoring.features import modifiers +def add_three(arg1, arg2, arg3): + return arg1 + arg2 + arg3 + + +def reverse_div_(left, right): + return right / left + + +def test_function_applier(): + @modifiers.function_applier + def three_way_sum(arg1, arg2, arg3, name, returns): + return add_three, name, returns + + four = three_way_sum(0, 1, 3) + assert solve(four) == 4 + assert solve(pickle.loads(pickle.dumps(four))) == 4 + assert repr(four) == "" + + vector_of_four = three_way_sum([1, 2, 3], [3, 2, 1], [0, 0, 0]) + assert solve(vector_of_four) == [4, 4, 4] + assert solve(pickle.loads(pickle.dumps(vector_of_four))) == [4, 4, 4] + assert repr(vector_of_four) == "" + + +def test_binary_operator(): + @modifiers.binary_operator + def reverse_div(left, right, returns): + return reverse_div_, "\\", returns or float + + four = reverse_div(2, 8) + assert solve(four) == 4 + assert solve(pickle.loads(pickle.dumps(four))) == 4 + assert repr(four) == "" + + vector_of_four = reverse_div([1, 2, 3], [4, 8, 12]) + assert solve(vector_of_four) == [4, 4, 4] + assert solve(pickle.loads(pickle.dumps(vector_of_four))) == [4, 4, 4] + assert repr(vector_of_four) == "" + + def test_log(): log_five = modifiers.log(5) diff --git a/tests/features/wikibase/tests/test_session_oriented.py b/tests/features/wikibase/tests/test_session_oriented.py new file mode 100644 index 00000000..9160fc58 --- /dev/null +++ 
b/tests/features/wikibase/tests/test_session_oriented.py @@ -0,0 +1,112 @@ +import json +import os +import pickle + +from revscoring.dependencies import solve +from revscoring.features.meta import aggregators +from revscoring.features.wikibase import session + +pwd = os.path.dirname(os.path.realpath(__file__)) +ALAN_TURING = json.load(open(os.path.join(pwd, "alan_turing.json"))) +ALAN_TURING_OLD = json.load(open(os.path.join(pwd, "alan_turing.old.json"))) + +revision_entity_doc = session.revisions.datasources.entity_doc +parent_entity_doc = session.revisions.parent.datasources.entity_doc +diff = session.revisions.diff + + +def test_session_sitelinks_diff(): + cache = {revision_entity_doc: [ALAN_TURING], + parent_entity_doc: [ALAN_TURING_OLD]} + + sitelinks_diff = solve(diff.datasources.sitelinks_diff, cache=cache) + assert solve(diff.sitelinks_added, cache=cache) == [26] + assert (sitelinks_diff[0].added == + {'alswiki', 'fowiki', 'itwikiquote', 'commonswiki', 'mgwiki', + 'cywikiquote', 'ruwikiquote', 'kkwiki', 'ttwiki', 'cawikiquote', + 'eswikiquote', 'cewiki', 'cowiki', 'pawiki', 'cswikiquote', + 'hewikiquote', 'newwiki', 'uzwiki', 'zhwikiquote', 'bawiki', + 'furwiki', 'scowiki', 'dewikiquote', 'frwikiquote', 'plwikiquote', + 'enwikiquote'}) + assert solve(diff.sitelinks_removed, cache=cache) == [0] + assert sitelinks_diff[0].removed == set() + assert (sitelinks_diff[0].intersection == + {'htwiki', 'mtwiki', 'swwiki', 'mkwiki', 'warwiki', 'anwiki', 'rowiki', + 'bgwiki', 'bnwiki', 'orwiki', 'idwiki', 'arwiki', 'skwiki', 'ruewiki', + 'tawiki', 'nnwiki', 'pnbwiki', 'guwiki', 'dewiki', 'cswiki', + 'ilowiki', 'kawiki', 'lvwiki', 'afwiki', 'jvwiki', 'zh_yuewiki', + 'tgwiki', 'hrwiki', 'brwiki', 'iswiki', 'ruwiki', 'dawiki', 'eswiki', + 'ltwiki', 'fawiki', 'bewiki', 'glwiki', 'iowiki', 'vowiki', 'yiwiki', + 'yowiki', 'plwiki', 'be_x_oldwiki', 'mlwiki', 'mswiki', 'astwiki', + 'hifwiki', 'urwiki', 'hewiki', 'aswiki', 'ocwiki', 'sawiki', 'cawiki', + 'tewiki', 'hiwiki', 'shwiki', 'pmswiki', 'trwiki', 'zh_min_nanwiki', + 'tlwiki', 'knwiki', 'jawiki', 'arzwiki', 'cywiki', 'lijwiki', + 'ptwiki', 'zhwiki', 'viwiki', 'mwlwiki', 'nlwiki', 'kowiki', + 'ganwiki', 'lawiki', 'simplewiki', 'bswiki', 'etwiki', 'slwiki', + 'huwiki', 'hywiki', 'sqwiki', 'srwiki', 'liwiki', 'lbwiki', 'fywiki', + 'mnwiki', 'fiwiki', 'lmowiki', 'jbowiki', 'thwiki', 'sahwiki', + 'euwiki', 'gawiki', 'azwiki', 'elwiki', 'kuwiki', 'ukwiki', + 'bat_smgwiki', 'pamwiki', 'mrwiki', 'enwiki', 'ckbwiki', 'frwiki', + 'eowiki', 'svwiki', 'gdwiki', 'scnwiki', 'itwiki', 'nowiki'}) + assert solve(diff.sitelinks_changed, cache=cache) == [2] + assert sitelinks_diff[0].changed == {'skwiki', 'sahwiki'} + assert (sitelinks_diff[0].unchanged == + {'warwiki', 'aswiki', 'cywiki', 'lvwiki', 'sawiki', 'zh_yuewiki', + 'tewiki', 'pnbwiki', 'idwiki', 'mlwiki', 'anwiki', 'pmswiki', + 'kawiki', 'ptwiki', 'iowiki', 'ltwiki', 'bat_smgwiki', 'cswiki', + 'swwiki', 'rowiki', 'mswiki', 'etwiki', 'jvwiki', 'dawiki', + 'hifwiki', 'euwiki', 'simplewiki', 'htwiki', 'srwiki', 'huwiki', + 'bswiki', 'ilowiki', 'brwiki', 'hrwiki', 'eswiki', 'yiwiki', 'bnwiki', + 'glwiki', 'zhwiki', 'hiwiki', 'tawiki', 'eowiki', 'kowiki', 'yowiki', + 'jawiki', 'scnwiki', 'slwiki', 'astwiki', 'lijwiki', 'nnwiki', + 'svwiki', 'ruwiki', 'tlwiki', 'bgwiki', 'pamwiki', 'sqwiki', 'tgwiki', + 'gdwiki', 'fiwiki', 'mwlwiki', 'mnwiki', 'lmowiki', 'ukwiki', + 'arwiki', 'hewiki', 'enwiki', 'orwiki', 'lbwiki', 'thwiki', 'fywiki', + 'knwiki', 'elwiki', 'frwiki', 'shwiki', 'itwiki', 'azwiki', + 
'zh_min_nanwiki', 'gawiki', 'liwiki', 'iswiki', 'trwiki', 'cawiki', + 'nlwiki', 'be_x_oldwiki', 'kuwiki', 'lawiki', 'bewiki', 'guwiki', + 'urwiki', 'nowiki', 'fawiki', 'jbowiki', 'ruewiki', 'afwiki', + 'arzwiki', 'ganwiki', 'ckbwiki', 'ocwiki', 'plwiki', 'dewiki', + 'viwiki', 'hywiki', 'mkwiki', 'mrwiki', 'mtwiki', 'vowiki'}) + + assert (pickle.loads(pickle.dumps(diff.datasources.sitelinks_diff)) == + diff.datasources.sitelinks_diff) + + +def test_session_property_changed(): + p999_changed = diff.property_changed('P999') + p19_changed = diff.property_changed('P19') + + cache = {revision_entity_doc: [ALAN_TURING, ALAN_TURING_OLD], + parent_entity_doc: [ALAN_TURING_OLD, ALAN_TURING]} + + assert solve(p999_changed, cache=cache) == [False, False] + assert solve(p19_changed, cache=cache) == [True, True] + + assert pickle.loads(pickle.dumps(p999_changed)) == p999_changed + assert pickle.loads(pickle.dumps(p19_changed)) == p19_changed + + +def test_session_entity(): + assert solve(session.revisions.datasources.entity, + cache={revision_entity_doc: [None]})[0].properties == {} + + solve(session.revisions.datasources.entity, + cache={revision_entity_doc: [ALAN_TURING]}) + + assert (pickle.loads(pickle.dumps(session.revisions.datasources.entity)) == + session.revisions.datasources.entity) + + assert solve(session.revisions.properties, + cache={revision_entity_doc: [ALAN_TURING]}) == [57] + assert solve(aggregators.sum(session.revisions.properties), + cache={revision_entity_doc: [ALAN_TURING]}) == 57 + assert (solve(session.revisions.datasources.properties, + cache={revision_entity_doc: [ALAN_TURING]})[0].keys() == + {'P1430', 'P906', 'P1816', 'P570', 'P31', 'P1343', 'P2021', 'P535', + 'P800', 'P569', 'P373', 'P1819', 'P108', 'P227', 'P185', 'P910', + 'P1273', 'P69', 'P244', 'P20', 'P101', 'P106', 'P18', 'P1563', 'P25', + 'P646', 'P1296', 'P214', 'P950', 'P463', 'P1006', 'P268', 'P21', + 'P1417', 'P22', 'P1207', 'P19', 'P91', 'P735', 'P1412', 'P166', + 'P269', 'P1741', 'P1196', 'P27', 'P140', 'P512', 'P1415', 'P691', + 'P345', 'P949', 'P1263', 'P549', 'P184', 'P935', 'P349', 'P213'}) diff --git a/tests/features/wikitext/tests/test_session_oriented.py b/tests/features/wikitext/tests/test_session_oriented.py new file mode 100644 index 00000000..252d661c --- /dev/null +++ b/tests/features/wikitext/tests/test_session_oriented.py @@ -0,0 +1,81 @@ +import pickle + +from revscoring.datasources import session_oriented +from revscoring.dependencies import solve +from revscoring.features.wikitext import session + +r_text = session_oriented.session.revisions.text +p_text = session_oriented.session.revisions.parent.text + + +cite_templates_ds = session.revisions.datasources.template_names_matching(r"^cite") +cite_templates = session.revisions.template_names_matching(r"^cite") + + +def test_session_chars(): + cache = {p_text: ["This is some nice text.", ""], + r_text: ["This is some more text.", "I have a hat."]} + + assert solve(session.revisions.chars, cache=cache) == [23, 13] + assert solve(session.revisions.parent.chars, cache=cache) == [23, 0] + assert solve(session.revisions.diff.chars_added, cache=cache) == [4, 13] + assert solve(session.revisions.diff.chars_removed, cache=cache) == [4, 0] + + assert (pickle.loads(pickle.dumps(session.revisions.chars)) == + session.revisions.chars) + assert (pickle.loads(pickle.dumps(session.revisions.parent.chars)) == + session.revisions.parent.chars) + assert (pickle.loads(pickle.dumps(session.revisions.diff.chars_added)) == + session.revisions.diff.chars_added) + assert 
(pickle.loads(pickle.dumps(session.revisions.diff.chars_removed)) == + session.revisions.diff.chars_removed) + + +def test_session_tokens_matching(): + cache = {p_text: ["This is not 55 a sring.", ""], + r_text: ["This is too 56 a tring.", "Foobar!"]} + assert (solve(session.revisions.diff.datasources.tokens_added_matching("^t"), + cache=cache) == + [['too', 'tring'], []]) + assert (solve(session.revisions.diff.datasources.tokens_removed_matching("^(5|s)"), + cache=cache) == + [['55', 'sring'], []]) + + +def test_templates(): + + cache = {r_text: ["This is [https://wikis.com].\n" + + "== Heading! ==\n" + + "{{Cite thing}} the {{citation needed}}\n" + + "=== Another {{heading|foo}}! ===", ""]} + assert (solve(session.revisions.datasources.template_names, cache=cache) == + [["Cite thing", "citation needed", "heading"], []]) + + assert solve(session.revisions.templates, cache=cache) == [3, 0] + + assert solve(cite_templates, cache=cache) == [1, 0] + + assert (pickle.loads(pickle.dumps(session.revisions.templates)) == + session.revisions.templates) + assert (pickle.loads(pickle.dumps(cite_templates)) == + cite_templates) + + +def test_tokens(): + text = """ +This is an m80. It has 50 grams of TNT. Here's some japanese: +修造のための勧進を担った組織の総称。[//google.com?foo=bar hats] +I can use · and  . But [[can]] I {{foo}} a {{bar}}? + +I guess we'll never know. +""" + assert solve(session.revisions.tokens, cache={r_text: [text, ""]}) == [97, 0] + assert pickle.loads(pickle.dumps(session.revisions.tokens)) == session.revisions.tokens + + my_words = session.revisions.datasources.tokens_in_types({"word"}) + assert (solve(my_words, cache={r_text: [text, ""]}) == + [['This', 'is', 'an', 'm80', 'It', 'has', 'grams', 'of', 'TNT', + "Here's", 'some', 'japanese', 'hats', 'I', 'can', 'use', 'and', + 'But', 'can', 'I', 'foo', 'a', 'bar', 'I', 'guess', "we'll", 'never', + 'know'], []]) + assert pickle.loads(pickle.dumps(my_words)) == my_words diff --git a/tests/languages/features/regex_matches/tests/test_regexes.py b/tests/languages/features/regex_matches/tests/test_regexes.py index a1cf336f..b4115409 100644 --- a/tests/languages/features/regex_matches/tests/test_regexes.py +++ b/tests/languages/features/regex_matches/tests/test_regexes.py @@ -14,7 +14,7 @@ p_text = revision_oriented.revision.parent.text badwords_notbad = badwords.excluding( - [r'notbad'], name=badwords._name + "_notbad") + [r'notbad'], name=badwords.name + "_notbad") def test_regexes():
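
For context, the heart of this patch is the expansion pattern exercised by test_list_of above: expanders.list_of wraps a scalar dependent so that its processing logic maps element-wise over parallel lists. A minimal sketch of that pattern (the comment/is_blank names are invented for illustration; the API calls mirror the tests in this patch):

    from revscoring import Datasource, Feature
    from revscoring.datasources.meta import expanders
    from revscoring.dependencies import solve


    def is_blank_text(text):
        # Scalar processing logic; list_of applies it to each element.
        return len(text) == 0

    comment = Datasource("comment")
    is_blank = Feature("is_blank", is_blank_text, returns=bool,
                       depends_on=[comment])

    # List-valued counterparts: one value per revision in a session.
    comments = expanders.list_of(comment)
    blanks = expanders.list_of(is_blank, depends_on=[comments])

    assert solve(blanks, cache={comments: ["fix typo", ""]}) == [False, True]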
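
The session-oriented feature trees solve to one value per revision in the editor's session rather than a single scalar. A hedged usage sketch, assuming the wikitext.session structure introduced above (the revision texts are made up):

    from revscoring.datasources import session_oriented
    from revscoring.dependencies import solve
    from revscoring.features.wikitext import session

    # One cached text per revision in the session.
    cache = {session_oriented.session.revisions.text:
             ["Some text", "More text here"]}

    # Each per-revision feature yields a parallel list of values.
    assert solve(session.revisions.chars, cache=cache) == [9, 14]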
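
The @DependentSet.meta_dependent decorator added throughout parsed.py marks the parameterized constructors (template_names_matching, tag_names_matching, etc.) so that list_of_tree can rewrite them into list-valued session counterparts as well. A sketch under the same assumptions as above (the fixture text is invented):

    from revscoring.datasources import session_oriented
    from revscoring.dependencies import solve
    from revscoring.features.wikitext import session

    cite_templates = session.revisions.template_names_matching(r"^cite")
    cache = {session_oriented.session.revisions.text:
             ["{{cite web}} and {{infobox}}", "No templates here."]}

    # One count per revision: only the first text has a matching template.
    assert solve(cite_templates, cache=cache) == [1, 0]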
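
Arithmetic between list-valued features is element-wise, which is why length_change in the bytes test above solves to [17, -8]. The same semantics are available through the operators meta-features directly; a small sketch (the values are arbitrary, and the calls mirror test_operators.py):

    from revscoring import FeatureVector
    from revscoring.dependencies import solve
    from revscoring.features.meta import operators

    a = FeatureVector("a", returns=int)
    b = FeatureVector("b", returns=int)

    # Element-wise subtraction: result[i] == a[i] - b[i]
    difference = operators.sub(a, b)
    assert solve(difference, cache={a: [27, 0], b: [10, 8]}) == [17, -8]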