From 5348a6945b873a57226811249d31099c31b0a983 Mon Sep 17 00:00:00 2001 From: Silvia Terragni Date: Wed, 21 Jul 2021 14:43:45 +0200 Subject: [PATCH 01/81] fix #3181 --- gensim/topic_coherence/text_analysis.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 83cbdc6471..bd440c38ad 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -300,18 +300,10 @@ def accumulate(self, texts, window_size): def _iter_texts(self, texts): dtype = np.uint16 if np.iinfo(np.uint16).max >= self._vocab_size else np.uint32 for text in texts: - if self.text_is_relevant(text): - yield np.fromiter(( - self.id2contiguous[self.token2id[w]] if w in self.relevant_words - else self._none_token - for w in text), dtype=dtype, count=len(text)) - - def text_is_relevant(self, text): - """Check if the text has any relevant words.""" - for word in text: - if word in self.relevant_words: - return True - return False + yield np.fromiter(( + self.id2contiguous[self.token2id[w]] if w in self.relevant_words + else self._none_token + for w in text), dtype=dtype, count=len(text)) class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased): From 9c83e5193396610befd10f50baf840849fd7234b Mon Sep 17 00:00:00 2001 From: Silvia Terragni Date: Wed, 21 Jul 2021 14:46:08 +0200 Subject: [PATCH 02/81] added tests --- gensim/test/test_coherencemodel.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 9396fe5ac0..5ac159ec87 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -70,8 +70,13 @@ def check_coherence_measure(self, coherence): cm2 = CoherenceModel(topics=self.topics2, **kwargs) cm3 = CoherenceModel(topics=self.topics3, **kwargs) cm4 = CoherenceModel(topics=self.topicIds1, **kwargs) + + # check if the same topic 
always returns the same coherence value + cm5 = CoherenceModel(topics=[self.topics1[0]], **kwargs) + self.assertRaises(ValueError, lambda: CoherenceModel(topics=self.topics4, **kwargs)) self.assertEqual(cm1.get_coherence(), cm4.get_coherence()) + self.assertEqual(cm1.get_coherence_per_topic()[0], cm5.get_coherence()) self.assertIsInstance(cm3.get_coherence(), np.double) self.assertGreater(cm1.get_coherence(), cm2.get_coherence()) From cec067a147a55a988774214b08a46d46a8389810 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Tue, 14 Sep 2021 22:11:56 +0200 Subject: [PATCH 03/81] fix TFIDF docs --- gensim/models/tfidfmodel.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 4152f3eb3d..7953a03b06 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -6,10 +6,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """This module implements functionality related to the `Term Frequency - Inverse Document Frequency -` vector space bag-of-words models. - -For a more in-depth exposition of TF-IDF and its various SMART variants (normalization, weighting schemes), -see the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/ +`_ class of bag-of-words vector space models. """ @@ -347,11 +344,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/. - See Also - -------- - ~gensim.sklearn_api.tfidf.TfIdfTransformer : Class that also uses the SMART scheme. - resolve_weights : Function that also uses the SMART scheme. - References ---------- .. [1] Singhal, A., Buckley, C., & Mitra, M. (1996). 
`Pivoted Document Length From 24e12c2cd1acc9e1e4f62b793d7dc6c0a0ba2e95 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 19 Sep 2021 20:21:40 +0900 Subject: [PATCH 04/81] Create SECURITY.md --- SECURITY.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 SECURITY.md diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000..42acf71f0f --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,16 @@ +# Security Policy + +## Supported Versions + +Use this section to tell people about which versions of your project are +currently being supported with security updates. + +| Version | Supported | +| ------- | ------------------ | +| 4. x | :white_check_mark: | +| < 4.0 | :x: | + +## Reporting a Vulnerability + +Email me@radimrehurek.com and m@penkov.dev. +We will investigate and get back to you as soon as possible. From d8bd1943ab9a03c5fcdd2f2e38742ae3dc56a531 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 19 Sep 2021 23:57:08 +0900 Subject: [PATCH 05/81] Update SECURITY.md --- SECURITY.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/SECURITY.md b/SECURITY.md index 42acf71f0f..3cbff53d79 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -12,5 +12,6 @@ currently being supported with security updates. ## Reporting a Vulnerability -Email me@radimrehurek.com and m@penkov.dev. -We will investigate and get back to you as soon as possible. +Open a ticket and add the "security" label to it. +Describe the vulnerability in general. +We'll reach out to you for specifics. 
From fe8e2042f0c8c16abc502220f5a4f88c72d2b31d Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Mon, 20 Sep 2021 09:41:35 +0900 Subject: [PATCH 06/81] bump version to 4.1.3.dev0 --- docs/src/conf.py | 2 +- gensim/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/src/conf.py b/docs/src/conf.py index 007c219da7..669a56a20a 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -63,7 +63,7 @@ # The short X.Y version. version = '4.1' # The full version, including alpha/beta/rc tags. -release = '4.1.2' +release = '4.1.3.dev0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/gensim/__init__.py b/gensim/__init__.py index cf85b8bc4e..c97e0f74ae 100644 --- a/gensim/__init__.py +++ b/gensim/__init__.py @@ -4,7 +4,7 @@ """ -__version__ = '4.1.2' +__version__ = '4.1.3.dev0' import logging diff --git a/setup.py b/setup.py index da9c8c23ad..81c9a81ccc 100644 --- a/setup.py +++ b/setup.py @@ -338,7 +338,7 @@ def run(self): setup( name='gensim', - version='4.1.2', + version='4.1.3.dev0', description='Python framework for fast Vector Space Modelling', long_description=LONG_DESCRIPTION, From 5bec27767ad40712e8912d53a896cb2282c33880 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Tue, 28 Sep 2021 16:17:17 +0200 Subject: [PATCH 07/81] fix TFIDF docs (#3235) --- gensim/models/tfidfmodel.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 4152f3eb3d..7953a03b06 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -6,10 +6,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """This module implements functionality related to the `Term Frequency - Inverse Document Frequency -` vector space bag-of-words models. 
- -For a more in-depth exposition of TF-IDF and its various SMART variants (normalization, weighting schemes), -see the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/ +`_ class of bag-of-words vector space models. """ @@ -347,11 +344,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden See also the blog post at https://rare-technologies.com/pivoted-document-length-normalisation/. - See Also - -------- - ~gensim.sklearn_api.tfidf.TfIdfTransformer : Class that also uses the SMART scheme. - resolve_weights : Function that also uses the SMART scheme. - References ---------- .. [1] Singhal, A., Buckley, C., & Mitra, M. (1996). `Pivoted Document Length From f3b19423f1f7797bca5677fede3ed4c406a22a5a Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 24 Oct 2021 12:24:12 +0900 Subject: [PATCH 08/81] Adding lifecycle configuration (#3230) --- .../BucketLifecycleConfiguration.json | 10 ++++++++++ .../BucketLifecycleConfiguration.txt | 15 +++++++++++++++ 2 files changed, 25 insertions(+) create mode 100644 continuous_integration/BucketLifecycleConfiguration.json create mode 100644 continuous_integration/BucketLifecycleConfiguration.txt diff --git a/continuous_integration/BucketLifecycleConfiguration.json b/continuous_integration/BucketLifecycleConfiguration.json new file mode 100644 index 0000000000..1512b59b5c --- /dev/null +++ b/continuous_integration/BucketLifecycleConfiguration.json @@ -0,0 +1,10 @@ +{ + "Rules": [ + { + "Expiration": {"Days": 30}, + "Filter": {"Prefix": ""}, + "ID": "Delete all files older than 30 days to save storage costs", + "Status": "Enabled" + } + ] +} diff --git a/continuous_integration/BucketLifecycleConfiguration.txt b/continuous_integration/BucketLifecycleConfiguration.txt new file mode 100644 index 0000000000..7392c06393 --- /dev/null +++ b/continuous_integration/BucketLifecycleConfiguration.txt @@ -0,0 +1,15 @@ +JSON files can't have comments, so this file is here to 
explain the rules in BucketLifecycleConfiguration.json. + +Our CI puts wheels in a publicly readable, privately writable S3 bucket (s3://gensim-wheels). +These wheels can be for gensim releases, in which case we fetch them and push them to PyPI when making a release. +Once the wheels are on PyPI, we don't need to keep our own copy. + +These wheels can also be development wheels: we currently build wheels on every push to develop. +These can be helpful when tracking down a problem, but they can also build up quickly, consume storage space and contribute to AWS costs. + +So, we delete all files in the gensim-wheels bucket once they are older than 30 days. +We rarely need to access wheels that are several months old, anyway. + +If you modify the JSON configuration, then you can update it using the command: + + aws --profile smart_open s3api put-bucket-lifecycle-configuration --bucket gensim-wheels --lifecycle-configuration file://continuous_integration/BucketLifecycleConfiguration.json From 64fcedf946469f48cec787173222c20d367e815f Mon Sep 17 00:00:00 2001 From: Parashar Date: Sun, 24 Oct 2021 07:21:51 +0200 Subject: [PATCH 09/81] Added random_seed parameter to make LsiModel reproducible (#3194) * added random_seed parameter to stochastic_svd, Projections and LsiModel * fixed code style. Hanging indents in functions and methods along with trailing commas * fixed code style. added hanging indents and trailing commas to more function calls.
* Update CHANGELOG.md Co-authored-by: Michael Penkov --- CHANGELOG.md | 2 ++ gensim/models/lsimodel.py | 76 +++++++++++++++++++++++++++------------ 2 files changed, 55 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 73b2f735a2..e4ea52befc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,8 @@ Changes ## Unreleased +* [#3194](https://github.com/RaRe-Technologies/gensim/pull/3194): Added random_seed parameter to make LsiModel reproducible, by [@parashardhapola](https://github.com/parashardhapola) + ## 4.1.2, 2021-09-17 This is a bugfix release that addresses left over compatibility issues with older versions of numpy and MacOS. diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 06055722e1..023498655c 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -23,7 +23,7 @@ * distributed computing for very large corpora, making use of a cluster of machines -Wall-clock `performance on the English Wikipedia `_ +Wall-clock `performance on the English Wikipedia `_ (2G corpus positions, 3.2M documents, 100K features, 0.5G non-zero entries in the final TF-IDF matrix), requesting the top 400 LSI factors: @@ -162,8 +162,11 @@ class Projection(utils.SaveLoad): via :meth:`~gensim.models.lsimodel.Projection.merge`. This is how incremental training actually happens. """ - def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, - extra_dims=P2_EXTRA_DIMS, dtype=np.float64): + + def __init__( + self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS, + extra_dims=P2_EXTRA_DIMS, dtype=np.float64, random_seed=None, + ): """Construct the (U, S) projection from a corpus. Parameters @@ -183,11 +186,15 @@ def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITER Extra samples to be used besides the rank `k`. Tune to improve accuracy. dtype : numpy.dtype, optional Enforces a type for elements of the decomposed matrix. 
+ random_seed: {None, int}, optional + Random seed used to initialize the pseudo-random number generator, + a local instance of numpy.random.RandomState instance. """ self.m, self.k = m, k self.power_iters = power_iters self.extra_dims = extra_dims + self.random_seed = random_seed if docs is not None: # base case decomposition: given a job `docs`, compute its decomposition, # *in-core*. @@ -195,7 +202,7 @@ def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITER u, s = stochastic_svd( docs, k, chunksize=sys.maxsize, num_terms=m, power_iters=self.power_iters, - extra_dims=self.extra_dims, dtype=dtype) + extra_dims=self.extra_dims, dtype=dtype, random_seed=self.random_seed) else: try: import sparsesvd @@ -223,7 +230,10 @@ def empty_like(self): An empty copy (without corpus) of the current projection. """ - return Projection(self.m, self.k, power_iters=self.power_iters, extra_dims=self.extra_dims) + return Projection( + self.m, self.k, power_iters=self.power_iters, + extra_dims=self.extra_dims, random_seed=self.random_seed, + ) def merge(self, other, decay=1.0): """Merge current :class:`~gensim.models.lsimodel.Projection` instance with another. @@ -354,9 +364,9 @@ class LsiModel(interfaces.TransformationABC, basemodel.BaseTopicModel): def __init__( self, corpus=None, num_topics=200, id2word=None, chunksize=20000, - decay=1.0, distributed=False, onepass=True, - power_iters=P2_EXTRA_ITERS, extra_samples=P2_EXTRA_DIMS, dtype=np.float64 - ): + decay=1.0, distributed=False, onepass=True, power_iters=P2_EXTRA_ITERS, + extra_samples=P2_EXTRA_DIMS, dtype=np.float64, random_seed=None, + ): """Build an LSI model. Parameters @@ -383,6 +393,9 @@ def __init__( Extra samples to be used besides the rank `k`. Can improve accuracy. dtype : type, optional Enforces a type for elements of the decomposed matrix. 
+ random_seed: {None, int}, optional + Random seed used to initialize the pseudo-random number generator, + a local instance of numpy.random.RandomState instance. """ self.id2word = id2word @@ -396,6 +409,7 @@ def __init__( self.onepass = onepass self.extra_samples, self.power_iters = extra_samples, power_iters self.dtype = dtype + self.random_seed = random_seed if corpus is None and self.id2word is None: raise ValueError( @@ -411,7 +425,8 @@ def __init__( self.docs_processed = 0 self.projection = Projection( - self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples, dtype=dtype + self.num_terms, self.num_topics, power_iters=self.power_iters, + extra_dims=self.extra_samples, dtype=dtype, random_seed=self.random_seed ) self.numworkers = 1 @@ -478,11 +493,15 @@ def add_documents(self, corpus, chunksize=None, decay=None): if not scipy.sparse.issparse(corpus): if not self.onepass: # we are allowed multiple passes over the input => use a faster, randomized two-pass algo - update = Projection(self.num_terms, self.num_topics, None, dtype=self.dtype) + update = Projection( + self.num_terms, self.num_topics, None, + dtype=self.dtype, random_seed=self.random_seed, + ) update.u, update.s = stochastic_svd( corpus, self.num_topics, num_terms=self.num_terms, chunksize=chunksize, - extra_dims=self.extra_samples, power_iters=self.power_iters, dtype=self.dtype + extra_dims=self.extra_samples, power_iters=self.power_iters, dtype=self.dtype, + random_seed=self.random_seed, ) self.projection.merge(update, decay=decay) self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0 @@ -499,7 +518,9 @@ def add_documents(self, corpus, chunksize=None, decay=None): # definitely avoid materializing it as a dense matrix! 
logger.debug("converting corpus to csc format") job = matutils.corpus2csc( - chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz, dtype=self.dtype) + chunk, num_docs=len(chunk), num_terms=self.num_terms, + num_nnz=nnz, dtype=self.dtype, + ) del chunk doc_no += job.shape[1] if self.dispatcher: @@ -513,7 +534,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): # serial version, there is only one "worker" (myself) => process the job directly update = Projection( self.num_terms, self.num_topics, job, extra_dims=self.extra_samples, - power_iters=self.power_iters, dtype=self.dtype + power_iters=self.power_iters, dtype=self.dtype, random_seed=self.random_seed, ) del job self.projection.merge(update, decay=decay) @@ -530,7 +551,7 @@ def add_documents(self, corpus, chunksize=None, decay=None): assert not self.dispatcher, "must be in serial mode to receive jobs" update = Projection( self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, - power_iters=self.power_iters, dtype=self.dtype + power_iters=self.power_iters, dtype=self.dtype, ) self.projection.merge(update, decay=decay) logger.info("processed sparse job of %i documents", corpus.shape[1]) @@ -546,7 +567,7 @@ def __str__(self): """ return "LsiModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % ( - self.num_terms, self.num_topics, self.decay, self.chunksize + self.num_terms, self.num_topics, self.decay, self.chunksize, ) def __getitem__(self, bow, scaled=False, chunksize=512): @@ -731,7 +752,7 @@ def print_debug(self, num_topics=5, num_words=10): print_debug( self.id2word, self.projection.u, self.projection.s, range(min(num_topics, len(self.projection.u.T))), - num_words=num_words + num_words=num_words, ) def save(self, fname, *args, **kwargs): @@ -864,8 +885,10 @@ def print_debug(id2token, u, s, topics, num_words=10, num_neg=None): logger.info('topic #%s(%.3f): %s, ..., %s', topic, s[topic], ', '.join(pos), ', '.join(neg)) -def stochastic_svd(corpus, rank, 
num_terms, chunksize=20000, extra_dims=None, - power_iters=0, dtype=np.float64, eps=1e-6): +def stochastic_svd( + corpus, rank, num_terms, chunksize=20000, extra_dims=None, + power_iters=0, dtype=np.float64, eps=1e-6, random_seed=None, +): """Run truncated Singular Value Decomposition (SVD) on a sparse input. Parameters @@ -888,6 +911,10 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, Enforces a type for elements of the decomposed matrix. eps: float, optional Percentage of the spectrum's energy to be discarded. + random_seed: {None, int}, optional + Random seed used to initialize the pseudo-random number generator, + a local instance of numpy.random.RandomState instance. + Notes ----- @@ -924,13 +951,16 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, # and more memory friendly than processing all documents at once) y = np.zeros(dtype=dtype, shape=(num_terms, samples)) logger.info("1st phase: constructing %s action matrix", str(y.shape)) + random_state = np.random.RandomState(random_seed) if scipy.sparse.issparse(corpus): m, n = corpus.shape assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms) - o = np.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix - sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices, - corpus.data, o.ravel(), y.ravel()) # y = corpus * o + o = random_state.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix + sparsetools.csc_matvecs( + m, n, samples, corpus.indptr, corpus.indices, + corpus.data, o.ravel(), y.ravel(), + ) # y = corpus * o del o # unlike np, scipy.sparse `astype()` copies everything, even if there is no change to dtype! 
@@ -960,10 +990,10 @@ def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, assert n <= chunksize # the very last chunk of A is allowed to be smaller in size num_docs += n logger.debug("multiplying chunk * gauss") - o = np.random.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix + o = random_state.normal(0.0, 1.0, (n, samples), ).astype(dtype) # draw a random gaussian matrix sparsetools.csc_matvecs( m, n, samples, chunk.indptr, chunk.indices, # y = y + chunk * o - chunk.data, o.ravel(), y.ravel() + chunk.data, o.ravel(), y.ravel(), ) del chunk, o y = [y] From e5cb53144fafd54fd461ba7dd14c230240097d93 Mon Sep 17 00:00:00 2001 From: HLasse Date: Sun, 24 Oct 2021 07:29:41 +0200 Subject: [PATCH 10/81] Fix FastText doc-comment example for `build_vocab` and `train` to use correct argument names (#3227) * fix docs to match argument names * dummy commit to trigger CI * Update CHANGELOG.md Co-authored-by: Michael Penkov Co-authored-by: Michael Penkov --- CHANGELOG.md | 1 + README.md | 1 - gensim/models/fasttext.py | 8 ++++---- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e4ea52befc..45c32056bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Changes ## Unreleased * [#3194](https://github.com/RaRe-Technologies/gensim/pull/3194): Added random_seed parameter to make LsiModel reproducible, by [@parashardhapola](https://github.com/parashardhapola) +* [#3227](https://github.com/RaRe-Technologies/gensim/pull/3227): Fix FastText doc-comment example for `build_vocab` and `train` to use correct argument names, by [@HLasse](https://github.com/HLasse) ## 4.1.2, 2021-09-17 diff --git a/README.md b/README.md index f61cd390e4..f1cb9f3ddd 100644 --- a/README.md +++ b/README.md @@ -176,4 +176,3 @@ BibTeX entry: [OpenBLAS]: http://xianyi.github.io/OpenBLAS/ [source tar.gz]: http://pypi.python.org/pypi/gensim [documentation]: http://radimrehurek.com/gensim/install.html - diff --git 
a/gensim/models/fasttext.py b/gensim/models/fasttext.py index a94bc17f27..6d992d9b94 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -38,8 +38,8 @@ >>> print(len(common_texts)) 9 >>> model = FastText(vector_size=4, window=3, min_count=1) # instantiate - >>> model.build_vocab(sentences=common_texts) - >>> model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10) # train + >>> model.build_vocab(corpus_iterable=common_texts) + >>> model.train(corpus_iterable=common_texts, total_examples=len(common_texts), epochs=10) # train Once you have a model, you can access its keyed vectors via the `model.wv` attributes. The keyed vectors instance is quite powerful: it can perform a wide range of NLP tasks. @@ -108,9 +108,9 @@ >>> >>> >>> model4 = FastText(vector_size=4, window=3, min_count=1) - >>> model4.build_vocab(sentences=MyIter()) + >>> model4.build_vocab(corpus_iterable=MyIter()) >>> total_examples = model4.corpus_count - >>> model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5) + >>> model4.train(corpus_iterable=MyIter(), total_examples=total_examples, epochs=5) Persist a model to disk with: From 9ca0fe1adfc92d680c79ea0208d359e7703bf227 Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Sun, 24 Oct 2021 09:29:41 +0300 Subject: [PATCH 11/81] Apply new convention of delimiting instance params in str function (#3251) * apply new convention of delimiting instance params in str function * Update CHANGELOG.md * Update gensim/models/lsimodel.py Co-authored-by: Michael Penkov --- CHANGELOG.md | 1 + gensim/corpora/dictionary.py | 4 +++- gensim/examples/dmlcz/dmlcorpus.py | 5 +++-- gensim/models/atmodel.py | 4 ++-- gensim/models/doc2vec.py | 4 ++-- gensim/models/keyedvectors.py | 2 +- gensim/models/ldamodel.py | 4 ++-- gensim/models/logentropy_model.py | 2 +- gensim/models/lsimodel.py | 4 ++-- gensim/models/normmodel.py | 4 +++- gensim/models/rpmodel.py | 2 +- gensim/models/tfidfmodel.py | 2 +- 
gensim/models/word2vec.py | 2 +- gensim/similarities/docsim.py | 6 +++--- gensim/similarities/termsim.py | 2 +- gensim/topic_coherence/text_analysis.py | 2 +- gensim/utils.py | 2 +- 17 files changed, 29 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 45c32056bf..093b39e918 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ Changes ## Unreleased * [#3194](https://github.com/RaRe-Technologies/gensim/pull/3194): Added random_seed parameter to make LsiModel reproducible, by [@parashardhapola](https://github.com/parashardhapola) +* [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by [@menshikh-iv](https://github.com/menshikh-iv) * [#3227](https://github.com/RaRe-Technologies/gensim/pull/3227): Fix FastText doc-comment example for `build_vocab` and `train` to use correct argument names, by [@HLasse](https://github.com/HLasse) ## 4.1.2, 2021-09-17 diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index d954061caf..3bfa65942e 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -143,7 +143,9 @@ def __len__(self): def __str__(self): some_keys = list(itertools.islice(self.token2id.keys(), 5)) - return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '') + return "%s<%i unique tokens: %s%s>" % ( + self.__class__.__name__, len(self), some_keys, '...' 
if len(self) > 5 else '' + ) @staticmethod def from_documents(documents): diff --git a/gensim/examples/dmlcz/dmlcorpus.py b/gensim/examples/dmlcz/dmlcorpus.py index d76c622c95..24aca6cb65 100644 --- a/gensim/examples/dmlcz/dmlcorpus.py +++ b/gensim/examples/dmlcz/dmlcorpus.py @@ -59,8 +59,9 @@ def addSource(self, source): self.sources[sourceId] = source def __str__(self): - return ("DmlConfig(id=%s, sources=[%s], acceptLangs=[%s])" % - (self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs))) + return "%s<id=%s, sources=[%s], acceptLangs=[%s]>" % ( + self.__class__.__name__, self.configId, ', '.join(self.sources.iterkeys()), ', '.join(self.acceptLangs) + ) # endclass DmlConfig diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 838c7634e3..75893c5ac0 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -331,8 +331,8 @@ def __str__(self): String representation of current instance. """ - return "AuthorTopicModel(num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s)" % \ - (self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize) + return "%s<num_terms=%s, num_topics=%s, num_authors=%s, decay=%s, chunksize=%s>" % \ + (self.__class__.__name__, self.num_terms, self.num_topics, self.num_authors, self.decay, self.chunksize) def init_empty_corpus(self): """Initialize an empty corpus. diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c4b28316b7..de6a0f0fac 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -130,7 +130,7 @@ def __str__(self): Human readable representation of the object's state (words and tags).
""" - return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags) + return '%s<%s, %s>' % (self.__class__.__name__, self.words, self.tags) @dataclass @@ -713,7 +713,7 @@ def __str__(self): segments.append('s%g' % self.sample) if self.workers > 1: segments.append('t%d' % self.workers) - return '%s(%s)' % (self.__class__.__name__, ','.join(segments)) + return '%s<%s>' % (self.__class__.__name__, ','.join(segments)) def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): """Store the input-hidden weight matrix in the same format used by the original C word2vec-tool. diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index b5debb21c1..f56adb0b14 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1804,7 +1804,7 @@ def __lt__(self, other): # used for sorting in a priority queue def __str__(self): vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')] - return "%s(%s)" % (self.__class__.__name__, ', '.join(vals)) + return "%s<%s>" % (self.__class__.__name__, ', '.join(vals)) # compatibility alias, allowing older pickle-based `.save()`s to load diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 6691ddcc31..10a0c60134 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -615,8 +615,8 @@ def __str__(self): Human readable representation of the most important model parameters.
""" - return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % ( - self.num_terms, self.num_topics, self.decay, self.chunksize + return "%s<num_terms=%s, num_topics=%s, decay=%s, chunksize=%s>" % ( + self.__class__.__name__, self.num_terms, self.num_topics, self.decay, self.chunksize ) def sync_state(self, current_Elogbeta=None): diff --git a/gensim/models/logentropy_model.py b/gensim/models/logentropy_model.py index a79c685660..16fbace8d2 100644 --- a/gensim/models/logentropy_model.py +++ b/gensim/models/logentropy_model.py @@ -76,7 +76,7 @@ def __init__(self, corpus, normalize=True): self.initialize(corpus) def __str__(self): - return "LogEntropyModel(n_docs=%s, n_words=%s)" % (self.n_docs, self.n_words) + return "%s<n_docs=%s, n_words=%s>" % (self.__class__.__name__, self.n_docs, self.n_words) def initialize(self, corpus): """Calculates the global weighting for all terms in a given corpus and transforms the simple diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 023498655c..6a407e860e 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -566,8 +566,8 @@ def __str__(self): A human readable string of the current objects parameters.
""" - return "LsiModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % ( - self.num_terms, self.num_topics, self.decay, self.chunksize, + return "%s<num_terms=%s, num_topics=%s, decay=%s, chunksize=%s>" % ( + self.__class__.__name__, self.num_terms, self.num_topics, self.decay, self.chunksize ) def __getitem__(self, bow, scaled=False, chunksize=512): diff --git a/gensim/models/normmodel.py b/gensim/models/normmodel.py index 3292f6514e..62cbfc8fef 100644 --- a/gensim/models/normmodel.py +++ b/gensim/models/normmodel.py @@ -41,7 +41,9 @@ def __init__(self, corpus=None, norm='l2'): pass def __str__(self): - return "NormModel(num_docs=%s, num_nnz=%s, norm=%s)" % (self.num_docs, self.num_nnz, self.norm) + return "%s<num_docs=%s, num_nnz=%s, norm=%s>" % ( + self.__class__.__name__, self.num_docs, self.num_nnz, self.norm + ) def calc_norm(self, corpus): """Calculate the norm by calling :func:`~gensim.matutils.unitvec` with the norm parameter. diff --git a/gensim/models/rpmodel.py b/gensim/models/rpmodel.py index 1b2bf9fbb2..cbdaf4cb55 100644 --- a/gensim/models/rpmodel.py +++ b/gensim/models/rpmodel.py @@ -70,7 +70,7 @@ def __init__(self, corpus, id2word=None, num_topics=300): self.add_lifecycle_event("created", msg=f"created {self}") def __str__(self): - return "RpModel(num_terms=%s, num_topics=%s)" % (self.num_terms, self.num_topics) + return "%s<num_terms=%s, num_topics=%s>" % (self.__class__.__name__, self.num_terms, self.num_topics) def initialize(self, corpus): """Initialize the random projection matrix. diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 7953a03b06..cf2c3d3e1a 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -427,7 +427,7 @@ def load(cls, *args, **kwargs): return model def __str__(self): - return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz) + return "%s<num_docs=%s, num_nnz=%s>" % (self.__class__.__name__, self.num_docs, self.num_nnz) def initialize(self, corpus): """Compute inverse document weights, which will be used to modify term frequencies for documents.
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 356f711408..a96799a33a 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1882,7 +1882,7 @@ def __str__(self): and learning rate. """ - return "%s(vocab=%s, vector_size=%s, alpha=%s)" % ( + return "%s<vocab=%s, vector_size=%s, alpha=%s>" % ( self.__class__.__name__, len(self.wv.index_to_key), self.wv.vector_size, self.alpha, ) diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index db66db67e0..ee73328ff1 100644 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -147,7 +147,7 @@ def __getstate__(self): return result def __str__(self): - return "%s Shard(%i documents in %s)" % (self.cls.__name__, len(self), self.fullname()) + return "%s<%i documents in %s>" % (self.cls.__name__, len(self), self.fullname()) def get_index(self): """Load & get index. @@ -359,8 +359,8 @@ def __len__(self): return len(self.fresh_docs) + sum(len(shard) for shard in self.shards) def __str__(self): - return "Similarity index with %i documents in %i shards (stored under %s)" % ( - len(self), len(self.shards), self.output_prefix + return "%s<%i documents in %i shards stored under %s>" % ( + self.__class__.__name__, len(self), len(self.shards), self.output_prefix ) def add_documents(self, corpus): diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py index d2a3f6728f..f97801ca66 100644 --- a/gensim/similarities/termsim.py +++ b/gensim/similarities/termsim.py @@ -61,7 +61,7 @@ def most_similar(self, term, topn=10): def __str__(self): members = ', '.join('%s=%s' % pair for pair in vars(self).items()) - return '%s(%s)' % (self.__class__.__name__, members) + return '%s<%s>' % (self.__class__.__name__, members) class UniformTermSimilarityIndex(TermSimilarityIndex): diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 83cbdc6471..67b0208f5a 100644 --- a/gensim/topic_coherence/text_analysis.py +++
b/gensim/topic_coherence/text_analysis.py @@ -436,7 +436,7 @@ def __init__(self, processes, *args, **kwargs): self.batch_size = kwargs.get('batch_size', 64) def __str__(self): - return "%s(processes=%s, batch_size=%s)" % ( + return "%s<processes=%s, batch_size=%s>" % ( self.__class__.__name__, self.processes, self.batch_size) def accumulate(self, texts, window_size): diff --git a/gensim/utils.py b/gensim/utils.py index 30b6d85f58..d4fc6a71dc 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -827,7 +827,7 @@ def __init__(self, num_terms): self.num_terms = num_terms def __str__(self): - return "FakeDict(num_terms=%s)" % self.num_terms + return "%s<num_terms=%s>" % (self.__class__.__name__, self.num_terms) def __getitem__(self, val): if 0 <= val < self.num_terms: From d2341aa689621e26836942aa90ffa716fa992d85 Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Sun, 24 Oct 2021 14:51:29 +0300 Subject: [PATCH 12/81] Add Codecov to gensim repo (#3252) * add codecov * add pytest-cov into multibuild * upload coverage report from py38 (linux) --- .github/workflows/build-wheels.yml | 2 +- .github/workflows/tests.yml | 11 ++++++++++- setup.py | 1 + tox.ini | 20 +++++++++++++++++++- 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 85e8637b86..ebe2201a6d 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -114,7 +114,7 @@ jobs: PLAT: x86_64 UNICODE_WIDTH: 32 MB_PYTHON_VERSION: ${{ matrix.python-version }} # MB_PYTHON_VERSION is needed by Multibuild - TEST_DEPENDS: Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 pytest mock cython nmslib pyemd testfixtures scikit-learn pyemd + TEST_DEPENDS: Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 pytest pytest-cov mock cython nmslib pyemd testfixtures scikit-learn pyemd DOCKER_TEST_IMAGE: multibuild/xenial_x86_64 TRAVIS_OS_NAME: ${{ matrix.travis-os-name }} SKIP_NETWORK_TESTS: 1 diff --git a/.github/workflows/tests.yml 
b/.github/workflows/tests.yml index 41a608ef90..0117fb2ea1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -53,9 +53,18 @@ jobs: run: ulimit -c unlimited -S # enable core dumps - name: Run tox tests run: tox -e ${{ matrix.tox }} + - name: Upload coverage to Codecov + if: matrix.os == 'ubuntu-20.04' && matrix.python == '3.8' + uses: codecov/codecov-action@v2 + with: + fail_ci_if_error: true + files: ./coverage.xml + verbose: true + + - name: Collect corefile if: ${{ failure() }} run: | pwd COREFILE=$(find . -maxdepth 1 -name "core*" | head -n 1) - if [[ -f "$COREFILE" ]]; then EXECFILE=$(gdb -c "$COREFILE" -batch | grep "Core was generated" | tr -d "\`" | cut -d' ' -f5); file "$COREFILE"; gdb -c "$COREFILE" "$EXECFILE" -x continuous_integration/debug.gdb -batch; fi + if [[ -f "$COREFILE" ]]; then EXECFILE=$(gdb -c "$COREFILE" -batch | grep "Core was generated" | tr -d "\`" | cut -d' ' -f5); file "$COREFILE"; gdb -c "$COREFILE" "$EXECFILE" -x continuous_integration/debug.gdb -batch; fi \ No newline at end of file diff --git a/setup.py b/setup.py index 81c9a81ccc..ccb1142fb6 100644 --- a/setup.py +++ b/setup.py @@ -268,6 +268,7 @@ def run(self): # packages included for build-testing everywhere core_testenv = [ 'pytest', + 'pytest-cov', # 'pytest-rerunfailures', # disabled 2020-08-28 for 'mock', 'cython', diff --git a/tox.ini b/tox.ini index 12811b8ba5..d3feeccca0 100644 --- a/tox.ini +++ b/tox.ini @@ -24,8 +24,26 @@ ignore = E203, # space before : exclude = .venv, .git, .tox, dist, doc, build, gensim/models/deprecated +[coverage:run] +source=gensim + +[coverage:report] +omit = + gensim/test/* + */__init__.py + +exclude_lines = + pragma: no cover + def __repr__ + def __str__ + raise AssertionError + raise NotImplementedError + if __name__ == .__main__.: + +ignore_errors = True + [pytest] -addopts = -rfxEXs --durations=20 --showlocals +addopts = -rfxEXs --durations=20 --showlocals --cov=gensim/ --cov-report=xml [testenv] From 
9b196c3d065fa65c5cd60c3d24040757ded8e048 Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Sun, 24 Oct 2021 16:31:56 +0300 Subject: [PATCH 13/81] Move windows tests from azure to github actions (#3255) * windows tests to github actions * get rid azure-pipelines.yml * try to fix condition * AZURE -> GITHUB_ACTIONS_WINDOWS * PIPELINE_WORKSPACE -> RUNNER_OS in tox.ini --- .github/workflows/tests.yml | 15 ++++++++++----- azure-pipelines.yml | 32 -------------------------------- gensim/test/test_corpora.py | 20 ++++++++++---------- gensim/test/test_ldamodel.py | 4 ++-- tox.ini | 2 +- 5 files changed, 23 insertions(+), 50 deletions(-) delete mode 100644 azure-pipelines.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0117fb2ea1..3bdcda0bd2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,6 +20,9 @@ jobs: - {name: Linux, python: 3.6, os: ubuntu-20.04, tox: 'py36-linux'} - {name: Linux, python: 3.7, os: ubuntu-20.04, tox: 'py37-linux'} - {name: Linux, python: 3.8, os: ubuntu-20.04, tox: 'py38-linux'} + - {name: Windows, python: 3.6, os: windows-2019, tox: 'py36-win'} + - {name: Windows, python: 3.7, os: windows-2019, tox: 'py37-win'} + - {name: Windows, python: 3.8, os: windows-2019, tox: 'py38-win'} env: TOX_PARALLEL_NO_SPINNER: 1 @@ -38,19 +41,21 @@ jobs: # https://www.scala-sbt.org/1.x/docs/Installing-sbt-on-Linux.html # - name: Update sbt + if: matrix.os == 'ubuntu-20.04' run: | echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | sudo tee /etc/apt/sources.list.d/sbt.list echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | sudo tee /etc/apt/sources.list.d/sbt_old.list curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add sudo apt-get update -y sudo apt-get install -y sbt - - name: Install tox, gdb + - name: Install tox + run: pip install tox + - name: Install GDB & enable core dumps + if: matrix.os == 
'ubuntu-20.04' run: | - pip install tox sudo apt-get update -y sudo apt-get install -y gdb - - name: Enable core dumps - run: ulimit -c unlimited -S # enable core dumps + ulimit -c unlimited -S # enable core dumps - name: Run tox tests run: tox -e ${{ matrix.tox }} - name: Upload coverage to Codecov @@ -63,7 +68,7 @@ jobs: - name: Collect corefile - if: ${{ failure() }} + if: ${{ failure() }} && matrix.os == 'ubuntu-20.04' run: | pwd COREFILE=$(find . -maxdepth 1 -name "core*" | head -n 1) diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index 8e8102fa12..0000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,32 +0,0 @@ -pool: - vmImage: 'vs2017-win2016' - -strategy: - matrix: - py36: - python.version: '3.6' - TOXENV: "py36-win" - py37: - python.version: '3.7' - TOXENV: "py37-win" - py38: - python.version: '3.8' - TOXENV: "py38-win" - py39: - python.version: '3.9' - TOXENV: "py39-win" - -steps: -- task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - displayName: 'Use Python $(python.version)' - -- script: | - python -m pip install --upgrade pip - python -m pip install tox - displayName: 'Install tox' - -- script: | - tox - displayName: 'Testing' diff --git a/gensim/test/test_corpora.py b/gensim/test/test_corpora.py index 6c09ea2d1f..431b07c0ce 100644 --- a/gensim/test/test_corpora.py +++ b/gensim/test/test_corpora.py @@ -27,7 +27,7 @@ from gensim.test.utils import datapath, get_tmpfile, common_corpus -AZURE = bool(os.environ.get('PIPELINE_WORKSPACE')) +GITHUB_ACTIONS_WINDOWS = os.environ.get('RUNNER_OS') == 'Windows' class DummyTransformer: @@ -62,7 +62,7 @@ def tearDown(self): except OSError: pass - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_load(self): fname = datapath('testcorpus.' 
+ self.file_extension.lstrip('.')) corpus = self.corpus_class(fname) @@ -71,7 +71,7 @@ def test_load(self): # the deerwester corpus always has nine documents self.assertEqual(len(docs), 9) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_len(self): fname = datapath('testcorpus.' + self.file_extension.lstrip('.')) corpus = self.corpus_class(fname) @@ -87,7 +87,7 @@ def test_len(self): self.assertEqual(len(corpus), 9) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_empty_input(self): tmpf = get_tmpfile('gensim_corpus.tst') with open(tmpf, 'w') as f: @@ -102,7 +102,7 @@ def test_empty_input(self): docs = list(corpus) self.assertEqual(len(docs), 0) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_save(self): corpus = self.TEST_CORPUS tmpf = get_tmpfile('gensim_corpus.tst') @@ -114,7 +114,7 @@ def test_save(self): corpus2 = list(self.corpus_class(tmpf)) self.assertEqual(corpus, corpus2) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_serialize(self): corpus = self.TEST_CORPUS tmpf = get_tmpfile('gensim_corpus.tst') @@ -136,7 +136,7 @@ def test_serialize(self): idx = [1, 3, 5, 7] self.assertEqual(corpus[idx], corpus2[idx]) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_serialize_compressed(self): corpus = self.TEST_CORPUS tmpf = get_tmpfile('gensim_corpus.tst') @@ -154,7 +154,7 @@ def test_serialize_compressed(self): for i in range(len(corpus)): self.assertEqual(corpus[i], corpus2[i]) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_switch_id2word(self): fname = datapath('testcorpus.' 
+ self.file_extension.lstrip('.')) corpus = self.corpus_class(fname) @@ -172,7 +172,7 @@ def test_switch_id2word(self): testdoc2 = set((to_unicode(corpus.id2word[x]), y) for x, y in firstdoc2) self.assertEqual(testdoc2, {('computer', 1), ('human', 1), ('interface', 1)}) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_indexing(self): fname = datapath('testcorpus.' + self.file_extension.lstrip('.')) corpus = self.corpus_class(fname) @@ -245,7 +245,7 @@ def test_closed_file_object(self): self.assertEqual(f, 0) self.assertEqual(s, 0) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_load(self): self.assertEqual(self.corpus.num_docs, 9) self.assertEqual(self.corpus.num_terms, 12) diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index b809b39754..297006b75f 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -24,7 +24,7 @@ from gensim.test import basetmtests from gensim.test.utils import datapath, get_tmpfile, common_texts -AZURE = bool(os.environ.get('PIPELINE_WORKSPACE')) +GITHUB_ACTIONS_WINDOWS = os.environ.get('RUNNER_OS') == 'Windows' dictionary = Dictionary(common_texts) corpus = [dictionary.doc2bow(text) for text in common_texts] @@ -232,7 +232,7 @@ def test_get_topic_terms(self): self.assertTrue(isinstance(k, numbers.Integral)) self.assertTrue(np.issubdtype(v, np.floating)) - @unittest.skipIf(AZURE, 'see ') + @unittest.skipIf(GITHUB_ACTIONS_WINDOWS, 'see ') def test_get_document_topics(self): model = self.class_( diff --git a/tox.ini b/tox.ini index d3feeccca0..058b37d9f5 100644 --- a/tox.ini +++ b/tox.ini @@ -64,7 +64,7 @@ setenv = MALLET_HOME={env:MALLET_HOME:} SKIP_NETWORK_TESTS={env:SKIP_NETWORK_TESTS:} BOTO_CONFIG={env:BOTO_CONFIG:} - PIPELINE_WORKSPACE={env:PIPELINE_WORKSPACE:} + RUNNER_OS={env:RUNNER_OS:} PYTHONHASHSEED=1 TOX_PARALLEL_NO_SPINNER=1 From e51288c8bfc20b9c7b1397ca325c2646d2b0d8d7 Mon Sep 17 00:00:00 2001 
From: Ivan Menshikh Date: Sun, 24 Oct 2021 16:32:25 +0300 Subject: [PATCH 14/81] skip blinking test on OSX + py3.9 (#3254) --- gensim/test/test_translation_matrix.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py index b21fdc6063..bd98ca10d9 100644 --- a/gensim/test/test_translation_matrix.py +++ b/gensim/test/test_translation_matrix.py @@ -1,5 +1,6 @@ #!/usr/bin/env python # encoding: utf-8 +import sys from collections import namedtuple import unittest import logging @@ -60,6 +61,10 @@ def test_translate_nn(self): for idx, item in enumerate(self.test_word_pairs): self.assertTrue(item[1] in translated_words[item[0]]) + @unittest.skipIf( + (sys.version_info.major == 3) and (sys.version_info.minor == 9) and (sys.platform == 'darwin'), + 'blinking test, can be related to ' + ) def test_translate_gc(self): # Test globally corrected neighbour retrieval method model = translation_matrix.TranslationMatrix(self.source_word_vec, self.target_word_vec, self.word_pairs) From 6e362663f23967f3c1931e2cb18d3d25f92aabb5 Mon Sep 17 00:00:00 2001 From: Ivan Menshikh Date: Thu, 28 Oct 2021 06:18:52 +0500 Subject: [PATCH 15/81] Make negative ns_exponent work correctly (#3250) * add tests with negative ns_exponent * fix flake8 * explicitly cast ns_exponent to FLOAT * Apply suggestions from code review * dynamic cast * Update CHANGELOG.md * Update CHANGELOG.md Co-authored-by: Michael Penkov --- CHANGELOG.md | 1 + gensim/models/word2vec.py | 4 ++-- gensim/test/test_doc2vec.py | 9 +++++++++ gensim/test/test_fasttext.py | 9 +++++++++ gensim/test/test_word2vec.py | 9 +++++++++ 5 files changed, 30 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 093b39e918..d30bb82d4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Changes * [#3194](https://github.com/RaRe-Technologies/gensim/pull/3194): Added random_seed parameter to make LsiModel reproducible, by 
[@parashardhapola](https://github.com/parashardhapola) * [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by [@menshikh-iv](https://github.com/menshikh-iv) * [#3227](https://github.com/RaRe-Technologies/gensim/pull/3227): Fix FastText doc-comment example for `build_vocab` and `train` to use correct argument names, by [@HLasse](https://github.com/HLasse) +* [#3250](https://github.com/RaRe-Technologies/gensim/pull/3250): Make negative ns_exponent work correctly, by [@menshikh-iv](https://github.com/menshikh-iv) ## 4.1.2, 2021-09-17 diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index a96799a33a..7a49a8420c 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -833,11 +833,11 @@ def make_cum_table(self, domain=2**31 - 1): train_words_pow = 0.0 for word_index in range(vocab_size): count = self.wv.get_vecattr(word_index, 'count') - train_words_pow += count**self.ns_exponent + train_words_pow += count**float(self.ns_exponent) cumulative = 0.0 for word_index in range(vocab_size): count = self.wv.get_vecattr(word_index, 'count') - cumulative += count**self.ns_exponent + cumulative += count**float(self.ns_exponent) self.cum_table[word_index] = round(cumulative / train_words_pow * domain) if len(self.cum_table) > 0: assert self.cum_table[-1] == domain diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index c8b7516c99..a7e1fa58df 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -720,6 +720,15 @@ def test_train_warning(self, loglines): def test_load_on_class_error(self): """Test if exception is raised when loading doc2vec model on instance""" self.assertRaises(AttributeError, load_on_instance) + + def test_negative_ns_exp(self): + """The model should accept a negative ns_exponent as a valid value.""" + model = doc2vec.Doc2Vec(sentences, ns_exponent=-1, min_count=1, workers=1) + tmpf = 
get_tmpfile('d2v_negative_exp.tst') + model.save(tmpf) + loaded_model = doc2vec.Doc2Vec.load(tmpf) + loaded_model.train(sentences, total_examples=model.corpus_count, epochs=1) + assert loaded_model.ns_exponent == -1, loaded_model.ns_exponent # endclass TestDoc2VecModel diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index efc6a3ca8e..a557368faa 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -762,6 +762,15 @@ def test_vectors_for_all_without_inference(self): predicted = vectors_for_all['responding'] assert np.allclose(expected, predicted) + def test_negative_ns_exp(self): + """The model should accept a negative ns_exponent as a valid value.""" + model = FT_gensim(sentences, ns_exponent=-1, min_count=1, workers=1) + tmpf = get_tmpfile('fasttext_negative_exp.tst') + model.save(tmpf) + loaded_model = FT_gensim.load(tmpf) + loaded_model.train(sentences, total_examples=model.corpus_count, epochs=1) + assert loaded_model.ns_exponent == -1, loaded_model.ns_exponent + @pytest.mark.parametrize('shrink_windows', [True, False]) def test_cbow_hs_training(shrink_windows): diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 43505b0be2..20216eb8a3 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -1054,6 +1054,15 @@ def test_compute_training_loss(self): training_loss_val = model.get_latest_training_loss() self.assertTrue(training_loss_val > 0.0) + def test_negative_ns_exp(self): + """The model should accept a negative ns_exponent as a valid value.""" + model = word2vec.Word2Vec(sentences, ns_exponent=-1, min_count=1, workers=1) + tmpf = get_tmpfile('w2v_negative_exp.tst') + model.save(tmpf) + loaded_model = word2vec.Word2Vec.load(tmpf) + loaded_model.train(sentences, total_examples=model.corpus_count, epochs=1) + assert loaded_model.ns_exponent == -1, loaded_model.ns_exponent + # endclass TestWord2VecModel From a9817aeefcd11862a854be7b81e932ca2848637d Mon Sep 17 
00:00:00 2001 From: Dmitry Chaplinsky Date: Sat, 4 Dec 2021 07:35:56 +0200 Subject: [PATCH 16/81] Adding another check to _check_corpus_sanity for compressed files, adding test (#3258) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Adding another check to _check_corpus_sanity for compressed files, adding test * Update gensim/models/word2vec.py Co-authored-by: Radim Řehůřek * Removing unnecessary warning Co-authored-by: Radim Řehůřek --- gensim/models/word2vec.py | 9 +++++++++ gensim/test/test_word2vec.py | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 7a49a8420c..d7df12e283 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -200,6 +200,7 @@ from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector from gensim import utils, matutils +from smart_open.compression import get_supported_extensions logger = logging.getLogger(__name__) @@ -1502,6 +1503,14 @@ def _check_corpus_sanity(self, corpus_iterable=None, corpus_file=None, passes=1) raise TypeError( f"Using a generator as corpus_iterable can't support {passes} passes. Try a re-iterable sequence.") + if corpus_iterable is None: + _, corpus_ext = os.path.splitext(corpus_file) + if corpus_ext.lower() in get_supported_extensions(): + raise TypeError( + f"Training from compressed files is not supported with the `corpus_path` argument. " + f"Please decompress {corpus_file} or use `corpus_iterable` instead." + ) + def _check_training_sanity(self, epochs=0, total_examples=None, total_words=None, **kwargs): """Checks whether the training parameters make sense. 
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 20216eb8a3..79974f97b7 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -13,6 +13,7 @@ import os import bz2 import sys +import tempfile import numpy as np @@ -1040,6 +1041,13 @@ def test_load_on_class_error(self): """Test if exception is raised when loading word2vec model on instance""" self.assertRaises(AttributeError, load_on_instance) + def test_file_should_not_be_compressed(self): + """ + Is corpus_file a compressed file? + """ + with tempfile.NamedTemporaryFile(suffix=".bz2") as fp: + self.assertRaises(TypeError, word2vec.Word2Vec, (None, fp.name)) + def test_reset_from(self): """Test if reset_from() uses pre-built structures from other model""" model = word2vec.Word2Vec(sentences, min_count=1) From 419d82b8a72fe24bb34b0ac2b150a1c6c236a771 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 4 Dec 2021 14:37:11 +0900 Subject: [PATCH 17/81] updating CHANGELOG.md after merge --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d30bb82d4a..785133eb28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Changes * [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by [@menshikh-iv](https://github.com/menshikh-iv) * [#3227](https://github.com/RaRe-Technologies/gensim/pull/3227): Fix FastText doc-comment example for `build_vocab` and `train` to use correct argument names, by [@HLasse](https://github.com/HLasse) * [#3250](https://github.com/RaRe-Technologies/gensim/pull/3250): Make negative ns_exponent work correctly, by [@menshikh-iv](https://github.com/menshikh-iv) +* [#3258](https://github.com/RaRe-Technologies/gensim/pull/3258): Adding another check to _check_corpus_sanity for compressed files, adding test, by [@dchaplinsky](https://github.com/dchaplinsky) ## 4.1.2, 2021-09-17 From 
fa2d1b174a6d4387f78a2595135416f38545399f Mon Sep 17 00:00:00 2001 From: Blaine Date: Sat, 4 Dec 2021 00:58:53 -0500 Subject: [PATCH 18/81] Fix error message when Doc2Vec does not receive corpus_file or corpus_iterable (#3182) --- gensim/models/doc2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index de6a0f0fac..c1ff25d994 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -494,7 +494,7 @@ def train( """ if corpus_file is None and corpus_iterable is None: - raise TypeError("Either one of corpus_file or documents value must be provided") + raise TypeError("Either one of corpus_file or corpus_iterable value must be provided") if corpus_file is not None and corpus_iterable is not None: raise TypeError("Both corpus_file and corpus_iterable must not be provided at the same time") From 2f182d73a9258cc3df6eec130a149cf0f65f3781 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Primo=C5=BE=20Godec?= Date: Sat, 4 Dec 2021 07:24:54 +0100 Subject: [PATCH 19/81] Sparse2Corpus: update __getitem__ to work on slices, lists and ellipsis (#3247) * Sparse2Corpus: update __getitem__ to work on slices, ellipsis, and iterable * Sparse2Corpus: update __getitem__ to work on slices, ellipsis, and iterable * Update CHANGELOG.md Co-authored-by: Michael Penkov --- CHANGELOG.md | 1 + gensim/matutils.py | 27 +++++++++++++++--------- gensim/test/test_matutils.py | 40 ++++++++++++++++++++++++++++++++++++ 3 files changed, 58 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 785133eb28..c1afd04e54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ Changes * [#3194](https://github.com/RaRe-Technologies/gensim/pull/3194): Added random_seed parameter to make LsiModel reproducible, by [@parashardhapola](https://github.com/parashardhapola) * [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by 
[@menshikh-iv](https://github.com/menshikh-iv) * [#3227](https://github.com/RaRe-Technologies/gensim/pull/3227): Fix FastText doc-comment example for `build_vocab` and `train` to use correct argument names, by [@HLasse](https://github.com/HLasse) +* [#3247](https://github.com/RaRe-Technologies/gensim/pull/3247): Sparse2Corpus: update __getitem__ to work on slices, lists and ellipsis, by [@PrimozGodec](https://github.com/PrimozGodec) * [#3250](https://github.com/RaRe-Technologies/gensim/pull/3250): Make negative ns_exponent work correctly, by [@menshikh-iv](https://github.com/menshikh-iv) * [#3258](https://github.com/RaRe-Technologies/gensim/pull/3258): Adding another check to _check_corpus_sanity for compressed files, adding test, by [@dchaplinsky](https://github.com/dchaplinsky) diff --git a/gensim/matutils.py b/gensim/matutils.py index fb2c54e680..4d4064acc0 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -597,23 +597,30 @@ def __iter__(self): def __len__(self): return self.sparse.shape[1] - def __getitem__(self, document_index): - """Retrieve a document vector from the corpus by its index. + def __getitem__(self, key): + """ + Retrieve a document vector or subset from the corpus by key. Parameters ---------- - document_index : int - Index of document + key: int, ellipsis, slice, iterable object + Index of the document retrieve. + Less commonly, the key can also be a slice, ellipsis, or an iterable + to retrieve multiple documents. Returns ------- - list of (int, number) - Document in BoW format. - + list of (int, number), Sparse2Corpus + Document in BoW format when `key` is an integer. Otherwise :class:`~gensim.matutils.Sparse2Corpus`. 
""" - indprev = self.sparse.indptr[document_index] - indnow = self.sparse.indptr[document_index + 1] - return list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow])) + sparse = self.sparse + if isinstance(key, int): + iprev = self.sparse.indptr[key] + inow = self.sparse.indptr[key + 1] + return list(zip(sparse.indices[iprev:inow], sparse.data[iprev:inow])) + + sparse = self.sparse.__getitem__((slice(None, None, None), key)) + return Sparse2Corpus(sparse) def veclen(vec): diff --git a/gensim/test/test_matutils.py b/gensim/test/test_matutils.py index 97e4189d89..5c5f14398e 100644 --- a/gensim/test/test_matutils.py +++ b/gensim/test/test_matutils.py @@ -7,7 +7,9 @@ import logging import unittest import numpy as np +from numpy.testing import assert_array_equal from scipy import sparse +from scipy.sparse import csc_matrix from scipy.special import psi # gamma function utils import gensim.matutils as matutils @@ -266,6 +268,44 @@ def test_return_norm_zero_vector_gensim_sparse(self): self.assertEqual(norm, 1.0) +class TestSparse2Corpus(unittest.TestCase): + def setUp(self): + self.orig_array = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + self.s2c = matutils.Sparse2Corpus(csc_matrix(self.orig_array)) + + def test_getitem_slice(self): + assert_array_equal(self.s2c[:2].sparse.toarray(), self.orig_array[:, :2]) + assert_array_equal(self.s2c[1:3].sparse.toarray(), self.orig_array[:, 1:3]) + + def test_getitem_index(self): + self.assertListEqual(self.s2c[1], [(0, 2), (1, 5), (2, 8)]) + + def test_getitem_list_of_indices(self): + assert_array_equal( + self.s2c[[1, 2]].sparse.toarray(), self.orig_array[:, [1, 2]] + ) + assert_array_equal(self.s2c[[1]].sparse.toarray(), self.orig_array[:, [1]]) + + def test_getitem_ndarray(self): + assert_array_equal( + self.s2c[np.array([1, 2])].sparse.toarray(), self.orig_array[:, [1, 2]] + ) + assert_array_equal( + self.s2c[np.array([1])].sparse.toarray(), self.orig_array[:, [1]] + ) + + def test_getitem_range(self): + 
assert_array_equal( + self.s2c[range(1, 3)].sparse.toarray(), self.orig_array[:, [1, 2]] + ) + assert_array_equal( + self.s2c[range(1, 2)].sparse.toarray(), self.orig_array[:, [1]] + ) + + def test_getitem_ellipsis(self): + assert_array_equal(self.s2c[...].sparse.toarray(), self.orig_array) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From a9fc7fcde7ea2975d320078d2e01f01e4848beba Mon Sep 17 00:00:00 2001 From: Stefano Zacchiroli Date: Sat, 4 Dec 2021 07:31:21 +0100 Subject: [PATCH 20/81] Dictionary doc: ref FAQ entry about filter_extremes corpus migration (#3257) --- gensim/corpora/dictionary.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 3bfa65942e..42f9746861 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -330,7 +330,9 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N After the pruning, resulting gaps in word ids are shrunk. Due to this gap shrinking, **the same word may have a different word id before and after the call - to this function!** + to this function!** See :class:`gensim.models.VocabTransform` and the + `dedicated FAQ entry `_ on how + to transform a corpus built with a dictionary before pruning. 
Examples -------- From 48d6e55f84c39eccdd88ee55646ce7a77f2c1a70 Mon Sep 17 00:00:00 2001 From: Geo Jacob Date: Sun, 5 Dec 2021 12:43:57 +0530 Subject: [PATCH 21/81] Migrate setup.py from distutils to setuptools (#3274) * Migrate setup.py from distutils to setuptools * flake8 fixes in doc string * Update CHANGELOG.md Co-authored-by: Michael Penkov --- CHANGELOG.md | 1 + gensim/corpora/dictionary.py | 2 +- setup.py | 4 +--- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c1afd04e54..3bea69b900 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Changes * [#3247](https://github.com/RaRe-Technologies/gensim/pull/3247): Sparse2Corpus: update __getitem__ to work on slices, lists and ellipsis, by [@PrimozGodec](https://github.com/PrimozGodec) * [#3250](https://github.com/RaRe-Technologies/gensim/pull/3250): Make negative ns_exponent work correctly, by [@menshikh-iv](https://github.com/menshikh-iv) * [#3258](https://github.com/RaRe-Technologies/gensim/pull/3258): Adding another check to _check_corpus_sanity for compressed files, adding test, by [@dchaplinsky](https://github.com/dchaplinsky) +* [#3274](https://github.com/RaRe-Technologies/gensim/pull/3274): Migrate setup.py from distutils to setuptools, by [@geojacobm6](https://github.com/geojacobm6) ## 4.1.2, 2021-09-17 diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index 42f9746861..51ec35038f 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -331,7 +331,7 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N After the pruning, resulting gaps in word ids are shrunk. Due to this gap shrinking, **the same word may have a different word id before and after the call to this function!** See :class:`gensim.models.VocabTransform` and the - `dedicated FAQ entry `_ on how + `dedicated FAQ entry `_ on how # noqa to transform a corpus built with a dictionary before pruning. 
Examples diff --git a/setup.py b/setup.py index ccb1142fb6..80379c30e3 100644 --- a/setup.py +++ b/setup.py @@ -10,15 +10,13 @@ python ./setup.py install """ -import distutils.cmd -import distutils.log import itertools import os import platform import shutil import sys -from setuptools import Extension, find_packages, setup +from setuptools import Extension, find_packages, setup, distutils from setuptools.command.build_ext import build_ext c_extensions = { From 7d7bb84598e2e02e839b38c6b662d4357cbdce0a Mon Sep 17 00:00:00 2001 From: austereantelope <95935342+austereantelope@users.noreply.github.com> Date: Mon, 13 Dec 2021 08:39:39 -0600 Subject: [PATCH 22/81] tighten test_parallel bound (#3278) --- gensim/test/test_word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 79974f97b7..71996ca981 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -834,7 +834,7 @@ def test_parallel(self): # the exact vectors and therefore similarities may differ, due to different thread collisions/randomization # so let's test only for top10 neighbor_rank = [word for word, sim in sims].index(expected_neighbor) - self.assertLess(neighbor_rank, 20) + self.assertLess(neighbor_rank, 2) def test_r_n_g(self): """Test word2vec results identical with identical RNG seed.""" From 8e12e869c166f873b2a0b26424851b9af0f4019c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Wed, 22 Dec 2021 14:39:57 +0100 Subject: [PATCH 23/81] add the FastSS and Levenshtein modules to docs --- docs/src/apiref.rst | 2 ++ docs/src/auto_examples/index.rst | 34 +++++++++++++-------------- docs/src/similarities/levenshtein.rst | 8 +++++++ gensim/similarities/fastss.pyx | 9 +++++++ gensim/similarities/levenshtein.py | 2 +- 5 files changed, 37 insertions(+), 18 deletions(-) create mode 100644 docs/src/similarities/levenshtein.rst diff --git a/docs/src/apiref.rst 
b/docs/src/apiref.rst index 583e4528a9..39e29b8003 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -60,6 +60,8 @@ Modules: similarities/termsim similarities/annoy similarities/nmslib + similarities/levenshtein + similarities/fastss test/utils topic_coherence/aggregation topic_coherence/direct_confirmation_measure diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index 05643de00c..d3dd2291be 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -71,7 +71,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -92,7 +92,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -169,14 +169,14 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :alt: FastText Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA - :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` .. raw:: html @@ -186,18 +186,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_fasttext + /auto_examples/tutorials/run_ensemblelda .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png - :alt: Ensemble LDA + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png + :alt: FastText Model - :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` .. raw:: html @@ -207,11 +207,11 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_ensemblelda + /auto_examples/tutorials/run_fasttext .. raw:: html -
+
.. only:: html @@ -309,7 +309,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html @@ -330,7 +330,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html @@ -447,13 +447,13 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. container:: sphx-glr-download sphx-glr-download-python - :download:`Download all examples in Python source code: auto_examples_python.zip ` + :download:`Download all examples in Python source code: auto_examples_python.zip ` .. container:: sphx-glr-download sphx-glr-download-jupyter - :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` + :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` .. only:: html diff --git a/docs/src/similarities/levenshtein.rst b/docs/src/similarities/levenshtein.rst new file mode 100644 index 0000000000..b5be710589 --- /dev/null +++ b/docs/src/similarities/levenshtein.rst @@ -0,0 +1,8 @@ +:mod:`similarities.levenshtein` -- Fast soft-cosine semantic similarity search +============================================================================== + +.. automodule:: gensim.similarities.levenshtein + :synopsis: Fast fuzzy search between strings, using the Soft-Cosine Semantic Similarity + :members: + :inherited-members: + diff --git a/gensim/similarities/fastss.pyx b/gensim/similarities/fastss.pyx index a4e8cba54b..e47a5442b2 100644 --- a/gensim/similarities/fastss.pyx +++ b/gensim/similarities/fastss.pyx @@ -137,6 +137,15 @@ def bytes2set(b): class FastSS: + """ + Fast implementation of FastSS (Fast Similarity Search): https://fastss.csg.uzh.ch/ + + FastSS enables fuzzy search of a dynamic query (a word, string) against a static + dictionary (a set of words, strings). The "fuzziness" is configurable by means + of a maximum edit distance (Levenshtein) between the query string and any of the + dictionary words. 
+ + """ def __init__(self, words=None, max_dist=2): """ diff --git a/gensim/similarities/levenshtein.py b/gensim/similarities/levenshtein.py index 51da72c065..768429a62a 100644 --- a/gensim/similarities/levenshtein.py +++ b/gensim/similarities/levenshtein.py @@ -29,7 +29,7 @@ class LevenshteinSimilarityIndex(TermSimilarityIndex): "Levenshtein similarity" is a modification of the Levenshtein (edit) distance, defined in [charletetal17]_. - This implementation uses the FastSS neighbourhood algorithm + This implementation uses the :class:`~gensim.similarities.fastss.FastSS` algorithm for fast kNN nearest-neighbor retrieval. Parameters From 752e477c16b3c16be90607c1e3aab04de9dbba23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Wed, 22 Dec 2021 14:51:35 +0100 Subject: [PATCH 24/81] add doc source for FastSS --- docs/src/similarities/fastss.rst | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 docs/src/similarities/fastss.rst diff --git a/docs/src/similarities/fastss.rst b/docs/src/similarities/fastss.rst new file mode 100644 index 0000000000..66dc0936a1 --- /dev/null +++ b/docs/src/similarities/fastss.rst @@ -0,0 +1,8 @@ +:mod:`similarities.fastss` -- Fast Levenshtein edit distance +================================================================== + +.. 
automodule:: gensim.similarities.fastss + :synopsis: Fast fuzzy search between strings, using the Levenshtein edit distance + :members: + :inherited-members: + From 8b8203d8df354673732dff635283494a33d0d422 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 24 Dec 2021 03:18:28 +0100 Subject: [PATCH 25/81] Add the FastSS and Levenshtein modules to docs (#3279) * fix TFIDF docs * add the FastSS and Levenshtein modules to docs * add doc source for FastSS --- docs/src/apiref.rst | 2 ++ docs/src/auto_examples/index.rst | 34 +++++++++++++-------------- docs/src/similarities/fastss.rst | 8 +++++++ docs/src/similarities/levenshtein.rst | 8 +++++++ gensim/similarities/fastss.pyx | 9 +++++++ gensim/similarities/levenshtein.py | 2 +- 6 files changed, 45 insertions(+), 18 deletions(-) create mode 100644 docs/src/similarities/fastss.rst create mode 100644 docs/src/similarities/levenshtein.rst diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index 583e4528a9..39e29b8003 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -60,6 +60,8 @@ Modules: similarities/termsim similarities/annoy similarities/nmslib + similarities/levenshtein + similarities/fastss test/utils topic_coherence/aggregation topic_coherence/direct_confirmation_measure diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index 05643de00c..d3dd2291be 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -71,7 +71,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -92,7 +92,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -169,14 +169,14 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png - :alt: FastText Model + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png + :alt: Ensemble LDA - :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` + :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` .. raw:: html @@ -186,18 +186,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_fasttext + /auto_examples/tutorials/run_ensemblelda .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_ensemblelda_thumb.png - :alt: Ensemble LDA + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_fasttext_thumb.png + :alt: FastText Model - :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` + :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` .. raw:: html @@ -207,11 +207,11 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_ensemblelda + /auto_examples/tutorials/run_fasttext .. raw:: html -
+
.. only:: html @@ -309,7 +309,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html @@ -330,7 +330,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html @@ -447,13 +447,13 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. container:: sphx-glr-download sphx-glr-download-python - :download:`Download all examples in Python source code: auto_examples_python.zip ` + :download:`Download all examples in Python source code: auto_examples_python.zip ` .. container:: sphx-glr-download sphx-glr-download-jupyter - :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` + :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` .. only:: html diff --git a/docs/src/similarities/fastss.rst b/docs/src/similarities/fastss.rst new file mode 100644 index 0000000000..66dc0936a1 --- /dev/null +++ b/docs/src/similarities/fastss.rst @@ -0,0 +1,8 @@ +:mod:`similarities.fastss` -- Fast Levenshtein edit distance +================================================================== + +.. automodule:: gensim.similarities.fastss + :synopsis: Fast fuzzy search between strings, using the Levenshtein edit distance + :members: + :inherited-members: + diff --git a/docs/src/similarities/levenshtein.rst b/docs/src/similarities/levenshtein.rst new file mode 100644 index 0000000000..b5be710589 --- /dev/null +++ b/docs/src/similarities/levenshtein.rst @@ -0,0 +1,8 @@ +:mod:`similarities.levenshtein` -- Fast soft-cosine semantic similarity search +============================================================================== + +.. 
automodule:: gensim.similarities.levenshtein + :synopsis: Fast fuzzy search between strings, using the Soft-Cosine Semantic Similarity + :members: + :inherited-members: + diff --git a/gensim/similarities/fastss.pyx b/gensim/similarities/fastss.pyx index a4e8cba54b..e47a5442b2 100644 --- a/gensim/similarities/fastss.pyx +++ b/gensim/similarities/fastss.pyx @@ -137,6 +137,15 @@ def bytes2set(b): class FastSS: + """ + Fast implementation of FastSS (Fast Similarity Search): https://fastss.csg.uzh.ch/ + + FastSS enables fuzzy search of a dynamic query (a word, string) against a static + dictionary (a set of words, strings). The "fuzziness" is configurable by means + of a maximum edit distance (Levenshtein) between the query string and any of the + dictionary words. + + """ def __init__(self, words=None, max_dist=2): """ diff --git a/gensim/similarities/levenshtein.py b/gensim/similarities/levenshtein.py index 51da72c065..768429a62a 100644 --- a/gensim/similarities/levenshtein.py +++ b/gensim/similarities/levenshtein.py @@ -29,7 +29,7 @@ class LevenshteinSimilarityIndex(TermSimilarityIndex): "Levenshtein similarity" is a modification of the Levenshtein (edit) distance, defined in [charletetal17]_. - This implementation uses the FastSS neighbourhood algorithm + This implementation uses the :class:`~gensim.similarities.fastss.FastSS` algorithm for fast kNN nearest-neighbor retrieval. Parameters From 0478cec864d2188ada95d15eda63a63c5e9de73f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 8 Jan 2022 14:36:44 +0100 Subject: [PATCH 26/81] english --- gensim/interfaces.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 3358adaab5..c685602e57 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -29,19 +29,19 @@ class CorpusABC(utils.SaveLoad): .. 
sourcecode:: pycon - >>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class + >>> from gensim.corpora import MmCorpus # inherits from the CorpusABC class >>> from gensim.test.utils import datapath >>> >>> corpus = MmCorpus(datapath("testcorpus.mm")) >>> for doc in corpus: ... pass # do something with the doc... - A document represented in bag-of-word (BoW) format, i.e. list of (attr_id, attr_value), + A document represented in the bag-of-word (BoW) format, i.e. list of (attr_id, attr_value), like ``[(1, 0.2), (4, 0.6), ...]``. .. sourcecode:: pycon - >>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class + >>> from gensim.corpora import MmCorpus # inherits from the CorpusABC class >>> from gensim.test.utils import datapath >>> >>> corpus = MmCorpus(datapath("testcorpus.mm")) @@ -49,28 +49,28 @@ class CorpusABC(utils.SaveLoad): >>> print(doc) [(0, 1.0), (1, 1.0), (2, 1.0)] - Remember, that save/load methods save only corpus class (not corpus as data itself), - for save/load functionality, please use this pattern : + Remember that the save/load methods only pickle the corpus object, not + the (streamed) corpus data itself! + To save the corpus data, please use this pattern : .. sourcecode:: pycon - >>> from gensim.corpora import MmCorpus # this is inheritor of CorpusABC class + >>> from gensim.corpora import MmCorpus # MmCorpus inherits from CorpusABC >>> from gensim.test.utils import datapath, get_tmpfile >>> >>> corpus = MmCorpus(datapath("testcorpus.mm")) >>> tmp_path = get_tmpfile("temp_corpus.mm") >>> - >>> MmCorpus.serialize(tmp_path, corpus) # serialize corpus to disk in MmCorpus format - >>> # MmCorpus.save_corpus(tmp_path, corpus) # this variant also possible, but if serialize availbe - call it. 
+ >>> MmCorpus.serialize(tmp_path, corpus) # serialize corpus to disk in the MmCorpus format >>> loaded_corpus = MmCorpus(tmp_path) # load corpus through constructor >>> for (doc_1, doc_2) in zip(corpus, loaded_corpus): - ... assert doc_1 == doc_2 # check that corpuses exactly same + ... assert doc_1 == doc_2 # no change between the original and loaded corpus See Also -------- :mod:`gensim.corpora` - Corpuses in different formats + Corpora in different formats. """ def __iter__(self): @@ -78,14 +78,14 @@ def __iter__(self): raise NotImplementedError('cannot instantiate abstract base class') def save(self, *args, **kwargs): - """Saves corpus in-memory state. + """Saves the in-memory state of the corpus (pickles the object). Warnings -------- - This save only the "state" of a corpus class, not the corpus data! + This saves only the "internal state" of the corpus object, not the corpus data! - For saving data use the `serialize` method of the output format you'd like to use - (e.g. :meth:`gensim.corpora.mmcorpus.MmCorpus.serialize`). + To save the corpus data, use the `serialize` method of your desired output format + instead, e.g. :meth:`gensim.corpora.mmcorpus.MmCorpus.serialize`. """ import warnings From 903ae65ed63d494a381681cc8c80d7618befe9bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Tue, 25 Jan 2022 17:26:56 +0100 Subject: [PATCH 27/81] add github citation file --- CITATION.cff | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 CITATION.cff diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000..ed3be503f0 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,31 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." 
+authors: +- family-names: "Řehůřek" + given-names: "Radim" +title: "Gensim: Topic modelling for humans" +version: 4.1.0 +url: "https://github.com/RaRe-Technologies/gensim" +preferred-citation: + type: conference-paper + authors: + - family-names: "Řehůřek" + given-names: "Radim" + - family-names: "Sojka" + given-names: "Petr" + publisher: + name: "University of Malta" + date-published: "2010-05-22" + year: 2010 + month: 5 + start: 45 # First page number + end: 50 # Last page number + pages: 5 + title: "Software Framework for Topic Modelling with Large Corpora" + languages: ["eng"] + url: "http://is.muni.cz/publication/884893/en" + conference: + name: "Proceedings of LREC 2010 workshop New Challenges for NLP Frameworks" + city: Valletta + country: MT + location: "University of Malta, Valletta, Malta" From 6fc9e3811a1bbfd2d3714ccc6bee06bad3b10fda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gilbert=20Fran=C3=A7ois?= Date: Fri, 25 Feb 2022 08:12:28 +0100 Subject: [PATCH 28/81] Fixes 'not enough arguments for format string' error (#3286) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fixes 'not enough arguments for format string' err * Update gensim/models/doc2vec.py Co-authored-by: Radim Řehůřek * Update gensim/models/doc2vec.py Co-authored-by: Radim Řehůřek * run code coverage on Py3.8 Linux only * messing around with tox.ini * messing around with tox.ini * Update CHANGELOG.md Co-authored-by: Radim Řehůřek Co-authored-by: Michael Penkov --- .github/workflows/tests.yml | 8 +++----- CHANGELOG.md | 1 + gensim/models/doc2vec.py | 4 +++- tox.ini | 11 +++++++---- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 3bdcda0bd2..bd36e19aee 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,11 +16,9 @@ jobs: fail-fast: false matrix: include: - - {name: Linux, python: 3.6, os: ubuntu-20.04, tox: 'flake8,flake8-docs'} - - {name: 
Linux, python: 3.6, os: ubuntu-20.04, tox: 'py36-linux'} + - {name: Linux, python: 3.7, os: ubuntu-20.04, tox: 'flake8,flake8-docs'} - {name: Linux, python: 3.7, os: ubuntu-20.04, tox: 'py37-linux'} - - {name: Linux, python: 3.8, os: ubuntu-20.04, tox: 'py38-linux'} - - {name: Windows, python: 3.6, os: windows-2019, tox: 'py36-win'} + - {name: Linux, python: 3.8, os: ubuntu-20.04, tox: 'py38-linux-cov'} - {name: Windows, python: 3.7, os: windows-2019, tox: 'py37-win'} - {name: Windows, python: 3.8, os: windows-2019, tox: 'py38-win'} env: @@ -72,4 +70,4 @@ jobs: run: | pwd COREFILE=$(find . -maxdepth 1 -name "core*" | head -n 1) - if [[ -f "$COREFILE" ]]; then EXECFILE=$(gdb -c "$COREFILE" -batch | grep "Core was generated" | tr -d "\`" | cut -d' ' -f5); file "$COREFILE"; gdb -c "$COREFILE" "$EXECFILE" -x continuous_integration/debug.gdb -batch; fi \ No newline at end of file + if [[ -f "$COREFILE" ]]; then EXECFILE=$(gdb -c "$COREFILE" -batch | grep "Core was generated" | tr -d "\`" | cut -d' ' -f5); file "$COREFILE"; gdb -c "$COREFILE" "$EXECFILE" -x continuous_integration/debug.gdb -batch; fi diff --git a/CHANGELOG.md b/CHANGELOG.md index 3bea69b900..8a3be62510 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Changes * [#3250](https://github.com/RaRe-Technologies/gensim/pull/3250): Make negative ns_exponent work correctly, by [@menshikh-iv](https://github.com/menshikh-iv) * [#3258](https://github.com/RaRe-Technologies/gensim/pull/3258): Adding another check to _check_corpus_sanity for compressed files, adding test, by [@dchaplinsky](https://github.com/dchaplinsky) * [#3274](https://github.com/RaRe-Technologies/gensim/pull/3274): Migrate setup.py from distutils to setuptools, by [@geojacobm6](https://github.com/geojacobm6) +* [#3286](https://github.com/RaRe-Technologies/gensim/pull/3286): Fixes 'not enough arguments for format string' error, by [@gilbertfrancois](https://github.com/gilbertfrancois) ## 4.1.2, 2021-09-17 diff --git 
a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c1ff25d994..7cc5a30e66 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -993,7 +993,9 @@ def _scan_vocab(self, corpus_iterable, progress_per, trim_rule): logger.warning( "Highest int doctag (%i) larger than count of documents (%i). This means " "at least %i excess, unused slots (%i bytes) will be allocated for vectors.", - max_rawint, corpus_count, ((max_rawint - corpus_count) * self.vector_size * 4)) + max_rawint, corpus_count, max_rawint - corpus_count, + (max_rawint - corpus_count) * self.vector_size * dtype(REAL).itemsize, + ) if max_rawint > -1: # adjust indexes/list to account for range of pure-int keyed doctags for key in doctags_list: diff --git a/tox.ini b/tox.ini index 058b37d9f5..f602550e8a 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ [tox] minversion = 2.0 -envlist = {py36,py37,py38, py39}-{win,linux}, flake8, docs, docs-upload, download-wheels, upload-wheels, test-pypi +envlist = {py36,py37,py38, py39}-{win,linux}, py38-linux-cov, flake8, docs, docs-upload, download-wheels, upload-wheels, test-pypi skipsdist = True platform = linux: linux win: win64 @@ -42,9 +42,11 @@ exclude_lines = ignore_errors = True +# +# Conditional factors https://tox.wiki/en/latest/config.html#factors +# [pytest] -addopts = -rfxEXs --durations=20 --showlocals --cov=gensim/ --cov-report=xml - +addopts = -rfxEXs --durations=20 --showlocals [testenv] recreate = True @@ -72,7 +74,8 @@ commands = python --version pip --version python setup.py build_ext --inplace - pytest {posargs:gensim/test} + cov: pytest {posargs:gensim/test} --cov=gensim/ --cov-report=xml + !cov: pytest {posargs:gensim/test} [testenv:flake8] From d6620df23a75a67a367c624c9078def3f6d3783c Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sat, 26 Feb 2022 14:16:04 +0800 Subject: [PATCH 29/81] Detect when a fasttext executable is available in PATH (#3264) Also check that fasttext exists in FT_HOME and is executable. 
This is useful when using a distro like Debian that has a package of fasttext available to install. Co-authored-by: Michael Penkov --- gensim/test/test_fasttext.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index a557368faa..8922ee0ac9 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -8,6 +8,7 @@ import logging import unittest import os +import shutil import subprocess import struct import sys @@ -44,7 +45,8 @@ BUCKET = 10000 FT_HOME = os.environ.get("FT_HOME") -FT_CMD = os.path.join(FT_HOME, "fasttext") if FT_HOME else None +FT_CMD = shutil.which("fasttext", path=FT_HOME) or \ + shutil.which("fasttext") new_sentences = [ @@ -1661,7 +1663,7 @@ def _save_test_model(out_base_fname, model_params): subprocess.check_call(cmd) -@unittest.skipIf(not FT_HOME, "FT_HOME env variable not set, skipping test") +@unittest.skipIf(not FT_CMD, "fasttext not in FT_HOME or PATH, skipping test") class SaveFacebookByteIdentityTest(unittest.TestCase): """ This class containts tests that check the following scenario: @@ -1708,7 +1710,7 @@ def line_to_array(line): return np.array([line_to_array(line) for line in out.splitlines()], dtype=np.float32) -@unittest.skipIf(not os.environ.get("FT_HOME", None), "FT_HOME env variable not set, skipping test") +@unittest.skipIf(not FT_CMD, "fasttext not in FT_HOME or PATH, skipping test") class SaveFacebookFormatReadingTest(unittest.TestCase): """ This class containts tests that check the following scenario: From 86b1832d75482127a5cb1c11fbcfb36cbe2a58c4 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sat, 26 Feb 2022 15:29:26 +0800 Subject: [PATCH 30/81] Remove commented pytest-rerunfailures test dependency (#3263) It is no longer needed. 
--- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 80379c30e3..d1238e8fa7 100644 --- a/setup.py +++ b/setup.py @@ -267,7 +267,6 @@ def run(self): core_testenv = [ 'pytest', 'pytest-cov', -# 'pytest-rerunfailures', # disabled 2020-08-28 for 'mock', 'cython', 'testfixtures', From 72debfbbf20c1a5425cc98612773303ce8d131b0 Mon Sep 17 00:00:00 2001 From: Silvia Terragni Date: Sat, 26 Feb 2022 12:13:27 +0100 Subject: [PATCH 31/81] improve readability --- gensim/topic_coherence/text_analysis.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 62a4b84df0..2c06185a0b 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -300,10 +300,11 @@ def accumulate(self, texts, window_size): def _iter_texts(self, texts): dtype = np.uint16 if np.iinfo(np.uint16).max >= self._vocab_size else np.uint32 for text in texts: - yield np.fromiter(( - self.id2contiguous[self.token2id[w]] if w in self.relevant_words - else self._none_token - for w in text), dtype=dtype, count=len(text)) + ids = ( + self.id2contiguous[self.token2id[w]] if w in self.relevant_words else self._none_token + for w in text + ) + yield np.fromiter(ids, dtype=dtype, count=len(text)) class InvertedIndexAccumulator(WindowedTextsAnalyzer, InvertedIndexBased): From 298880bbad2f4c582013157cf2354a1e6a2e8436 Mon Sep 17 00:00:00 2001 From: Silvia Terragni Date: Sat, 26 Feb 2022 12:15:21 +0100 Subject: [PATCH 32/81] add test for topics with unseen words --- gensim/test/test_coherencemodel.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/gensim/test/test_coherencemodel.py b/gensim/test/test_coherencemodel.py index 5ac159ec87..2b111f7306 100644 --- a/gensim/test/test_coherencemodel.py +++ b/gensim/test/test_coherencemodel.py @@ -50,6 +50,11 @@ def setUp(self): ['not a token', 'not an id', 'tests using', "this list"], ['should 
raise', 'an error', 'to pass', 'correctly'] ] + # list of topics with unseen words in the dictionary + self.topics5 = [ + ['aaaaa', 'bbbbb', 'ccccc', 'eeeee'], + ['ddddd', 'fffff', 'ggggh', 'hhhhh'] + ] self.topicIds1 = [] for topic in self.topics1: self.topicIds1.append([self.dictionary.token2id[token] for token in topic]) @@ -75,6 +80,7 @@ def check_coherence_measure(self, coherence): cm5 = CoherenceModel(topics=[self.topics1[0]], **kwargs) self.assertRaises(ValueError, lambda: CoherenceModel(topics=self.topics4, **kwargs)) + self.assertRaises(ValueError, lambda: CoherenceModel(topics=self.topics5, **kwargs)) self.assertEqual(cm1.get_coherence(), cm4.get_coherence()) self.assertEqual(cm1.get_coherence_per_topic()[0], cm5.get_coherence()) self.assertIsInstance(cm3.get_coherence(), np.double) From ee89761593555ae828bc50af46098f8909dad708 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Thu, 17 Mar 2022 17:00:23 +0100 Subject: [PATCH 33/81] re #3291: add special methods to docs --- docs/src/models/doc2vec.rst | 1 + docs/src/models/keyedvectors.rst | 1 + docs/src/models/ldamodel.rst | 1 + docs/src/models/lsimodel.rst | 1 + 4 files changed, 4 insertions(+) diff --git a/docs/src/models/doc2vec.rst b/docs/src/models/doc2vec.rst index b5d2e290b5..99d4d27f01 100644 --- a/docs/src/models/doc2vec.rst +++ b/docs/src/models/doc2vec.rst @@ -7,3 +7,4 @@ :inherited-members: :undoc-members: :show-inheritance: + :special-members: __getitem__ diff --git a/docs/src/models/keyedvectors.rst b/docs/src/models/keyedvectors.rst index db07e034e8..f51e03228d 100644 --- a/docs/src/models/keyedvectors.rst +++ b/docs/src/models/keyedvectors.rst @@ -7,3 +7,4 @@ :inherited-members: :undoc-members: :show-inheritance: + :special-members: __getitem__, __setitem__, __contains__ diff --git a/docs/src/models/ldamodel.rst b/docs/src/models/ldamodel.rst index 2dfb736ea6..d1bf9632fc 100644 --- a/docs/src/models/ldamodel.rst +++ b/docs/src/models/ldamodel.rst @@ -4,6 +4,7 
@@ .. automodule:: gensim.models.ldamodel :synopsis: Latent Dirichlet Allocation :members: + :special-members: __getitem__ :inherited-members: :undoc-members: :show-inheritance: diff --git a/docs/src/models/lsimodel.rst b/docs/src/models/lsimodel.rst index fec09efbf4..278d39cf0b 100644 --- a/docs/src/models/lsimodel.rst +++ b/docs/src/models/lsimodel.rst @@ -4,6 +4,7 @@ .. automodule:: gensim.models.lsimodel :synopsis: Latent Semantic Indexing :members: + :special-members: __getitem__ :inherited-members: :undoc-members: :show-inheritance: From 490676cc34d909b8a361fa1ae1e835263a13673b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 18 Mar 2022 08:12:28 +0100 Subject: [PATCH 34/81] re #2809: update the doc2vec notebook --- docs/notebooks/doc2vec-wikipedia.ipynb | 471 +++++++++++++------------ gensim/corpora/wikicorpus.py | 21 +- gensim/models/doc2vec.py | 28 +- gensim/models/lsimodel.py | 4 +- gensim/models/word2vec.py | 8 +- 5 files changed, 281 insertions(+), 251 deletions(-) diff --git a/docs/notebooks/doc2vec-wikipedia.ipynb b/docs/notebooks/doc2vec-wikipedia.ipynb index ff4786d3fd..ed1f37df7f 100644 --- a/docs/notebooks/doc2vec-wikipedia.ipynb +++ b/docs/notebooks/doc2vec-wikipedia.ipynb @@ -4,29 +4,30 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Doc2Vec to wikipedia articles" + "# Training Doc2Vec on Wikipedia articles" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We conduct the replication to **Document Embedding with Paragraph Vectors** (http://arxiv.org/abs/1507.07998).\n", - "In this paper, they showed only DBOW results to Wikipedia data. So we replicate this experiments using not only DBOW but also DM." + "We replicate the **Document Embedding with Paragraph Vectors** paper, http://arxiv.org/abs/1507.07998.\n", + "\n", + "In this paper, the authors showed only DBOW results trained from Wikipedia data. 
So we replicate these experiments using not only DBOW but also the DM mode of the \"paragraph vector\" algorithm aka Doc2Vec." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Basic Setup" + "## Basic setup" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Let's import Doc2Vec module." + "Let's import the necessary modules and set up logging. The code below assumes Python 3.7+ and Gensim 4.0+." ] }, { @@ -35,10 +36,15 @@ "metadata": {}, "outputs": [], "source": [ - "from gensim.corpora.wikicorpus import WikiCorpus\n", - "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n", + "import logging\n", + "import multiprocessing\n", "from pprint import pprint\n", - "import multiprocessing" + "\n", + "import smart_open\n", + "from gensim.corpora.wikicorpus import WikiCorpus, tokenize\n", + "from gensim.models.doc2vec import Doc2Vec, TaggedDocument\n", + "\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" ] }, { @@ -52,197 +58,313 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, download the dump of all Wikipedia articles from [here](http://download.wikimedia.org/enwiki/) (you want the file enwiki-latest-pages-articles.xml.bz2, or enwiki-YYYYMMDD-pages-articles.xml.bz2 for date-specific dumps).\n", + "First, download the dump of all Wikipedia articles from [here](http://download.wikimedia.org/enwiki/). You want the file named `enwiki-latest-pages-articles.xml.bz2`.\n", + "\n", + "Second, convert that Wikipedia article dump from the arcane Wikimedia XML format into a plain text file. This will make the subsequent training faster and also allow easy inspection of the data = \"input eyeballing\".\n", "\n", - "Second, convert the articles to WikiCorpus. 
WikiCorpus construct a corpus from a Wikipedia (or other MediaWiki-based) database dump.\n", + "We'll preprocess each article at the same time, normalizing its text to lowercase, splitting into tokens, etc.\n", "\n", - "For more details on WikiCorpus, you should access [Corpus from a Wikipedia dump](https://radimrehurek.com/gensim/corpora/wikicorpus.html)." + "Below I use a regexp tokenizer that simply looks for alphabetic sequences as tokens. But feel free to adapt the text preprocessing to your own domain. High quality preprocessing is often critical for the final pipeline accuracy – garbage in, garbage out!" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "wiki = WikiCorpus(\"enwiki-latest-pages-articles.xml.bz2\")\n", - "#wiki = WikiCorpus(\"enwiki-YYYYMMDD-pages-articles.xml.bz2\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Define **TaggedWikiDocument** class to convert WikiCorpus into suitable form for Doc2Vec." 
+ "wiki = WikiCorpus(\n", + " \"enwiki-latest-pages-articles.xml.bz2\", # path to the file you downloaded above\n", + " tokenizer_func=tokenize, # simple regexp; plug in your own tokenizer here\n", + " metadata=True, # also return the article titles and ids when parsing\n", + " dictionary={}, # don't start processing the data yet\n", + ")" ] }, { "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "class TaggedWikiDocument(object):\n", - " def __init__(self, wiki):\n", - " self.wiki = wiki\n", - " self.wiki.metadata = True\n", - " def __iter__(self):\n", - " for content, (page_id, title) in self.wiki.get_texts():\n", - " yield TaggedDocument([c.decode(\"utf-8\") for c in content], [title])" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": { - "collapsed": true - }, - "outputs": [], + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Volumes/work/workspace/gensim/trunk/gensim/utils.py:1332: UserWarning: detected OSX with python3.8+; aliasing chunkize to chunkize_serial\n", + " warnings.warn(\"detected %s; aliasing chunkize to chunkize_serial\" % entity)\n", + "2022-03-17 21:15:32,118 : INFO : processing article #0: 'Anarchism' (6538 tokens)\n", + "2022-03-17 21:30:00,138 : INFO : processing article #500000: 'Spiritual Formation Bible' (54 tokens)\n", + "2022-03-17 21:40:22,219 : INFO : processing article #1000000: 'Adolf von Liebenberg' (52 tokens)\n", + "2022-03-17 21:49:43,825 : INFO : processing article #1500000: 'Small nucleolar RNA U6-53/MBII-28' (123 tokens)\n", + "2022-03-17 21:59:23,620 : INFO : processing article #2000000: 'Xie Fei' (50 tokens)\n", + "2022-03-17 22:09:17,460 : INFO : processing article #2500000: 'Rhein, Saskatchewan' (185 tokens)\n", + "2022-03-17 22:19:39,293 : INFO : processing article #3000000: 'Kunyinsky District' (969 tokens)\n", + "2022-03-17 22:30:41,221 : INFO : 
processing article #3500000: 'Lake Saint-Charles' (555 tokens)\n", + "2022-03-17 22:41:17,487 : INFO : processing article #4000000: 'Mahāyānasaṃgraha' (612 tokens)\n", + "2022-03-17 22:52:27,834 : INFO : processing article #4500000: 'Liriomyza trifolii' (1493 tokens)\n", + "2022-03-17 23:04:41,464 : INFO : processing article #5000000: 'Daniel O. Griffin' (594 tokens)\n", + "2022-03-17 23:08:58,451 : INFO : finished iterating over Wikipedia corpus of 5176019 documents with 2996051328 positions (total 21837336 articles, 3072543084 positions before pruning articles shorter than 50 words)\n" + ] + } + ], "source": [ - "documents = TaggedWikiDocument(wiki)" + "with smart_open.open(\"wiki.txt.gz\", \"w\", encoding='utf8') as fout:\n", + " for article_no, (content, (page_id, title)) in enumerate(wiki.get_texts()):\n", + " title = ' '.join(title.split())\n", + " if article_no % 500000 == 0:\n", + " logging.info(\"processing article #%i: %r (%i tokens)\", article_no, title, len(content))\n", + " fout.write(f\"{title}\\t{' '.join(content)}\\n\") # title_of_article [TAB] words of the article" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Preprocessing\n", - "To set the same vocabulary size with original paper. We first calculate the optimal **min_count** parameter." + "The above takes about 2 hours on my 2021 M1 MacbookPro, and creates a new ~5.8 GB file named `wiki.txt.gz`. We're compressing the text into `.gz` (GZIP) right away to save on disk space, using the [smart_open](https://github.com/RaRe-Technologies/smart_open) library.\n", + "\n", + "Next we'll set up a stream to load the preprocessed articles from `wiki.txt.gz` one by one, in the format expected by Doc2Vec, ready for training. We don't want to load everything into RAM at once, because that would blow up the memory. 
And it is not necessary – Gensim can handle streamed training data:" ] }, { "cell_type": "code", - "execution_count": 5, - "metadata": { - "collapsed": true - }, + "execution_count": 4, + "metadata": {}, "outputs": [], "source": [ - "pre = Doc2Vec(min_count=0)\n", - "pre.scan_vocab(documents)" + "class TaggedWikiCorpus:\n", + " def __init__(self, wiki_text_path):\n", + " self.wiki_text_path = wiki_text_path\n", + " \n", + " def __iter__(self):\n", + " for line in smart_open.open(self.wiki_text_path, encoding='utf8'):\n", + " title, words = line.split('\\t')\n", + " yield TaggedDocument(words=words.split(), tags=[title])\n", + "\n", + "documents = TaggedWikiCorpus('wiki.txt.gz') # A streamed iterable; nothing in RAM yet." ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "min_count: 0, size of vocab: 8545782.0\n", - "min_count: 1, size of vocab: 8545782.0\n", - "min_count: 2, size of vocab: 4227783.0\n", - "min_count: 3, size of vocab: 3008772.0\n", - "min_count: 4, size of vocab: 2439367.0\n", - "min_count: 5, size of vocab: 2090709.0\n", - "min_count: 6, size of vocab: 1856609.0\n", - "min_count: 7, size of vocab: 1681670.0\n", - "min_count: 8, size of vocab: 1546914.0\n", - "min_count: 9, size of vocab: 1437367.0\n", - "min_count: 10, size of vocab: 1346177.0\n", - "min_count: 11, size of vocab: 1267916.0\n", - "min_count: 12, size of vocab: 1201186.0\n", - "min_count: 13, size of vocab: 1142377.0\n", - "min_count: 14, size of vocab: 1090673.0\n", - "min_count: 15, size of vocab: 1043973.0\n", - "min_count: 16, size of vocab: 1002395.0\n", - "min_count: 17, size of vocab: 964684.0\n", - "min_count: 18, size of vocab: 930382.0\n", - "min_count: 19, size of vocab: 898725.0\n" + "['Anarchism'] : anarchism is political philosophy and movement that is sceptical of authority and rejects all involuntary coercive forms of hierarchy anarchism calls for the 
abolition of the state which it holds to be unnecessary undesirable and harmful as historically left wing movement placed on the farthest left of the political spectrum ……… criticism of philosophical anarchism defence of philosophical anarchism stating that both kinds of anarchism philosophical and political anarchism are philosophical and political claims anarchistic popular fiction novel an argument for philosophical anarchism external links anarchy archives anarchy archives is an online research center on the history and theory of anarchism\n" ] } ], "source": [ - "for num in range(0, 20):\n", - " print('min_count: {}, size of vocab: '.format(num), pre.scale_vocab(min_count=num, dry_run=True)['memory']['vocab']/700)" + "# Load and print the first preprocessed Wikipedia document, as a sanity check = \"input eyeballing\".\n", + "first_doc = next(iter(documents))\n", + "print(first_doc.tags, ': ', ' '.join(first_doc.words[:50] + ['………'] + first_doc.words[-50:]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The document seems legit so let's move on to finally training some Doc2vec models." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In the original paper, they set the vocabulary size 915,715. It seems similar size of vocabulary if we set min_count = 19. (size of vocab = 898,725)" + "## Training Doc2Vec" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## Training the Doc2Vec Model\n", - "To train Doc2Vec model by several method, DBOW and DM, we define the list of models." + "The original paper had a vocabulary size of 915,715 word types, so we'll try to match it by setting `max_final_vocab=915715` in the Doc2vec constructor." 
] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-03-17 23:12:37,360 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-17T23:12:37.360576', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n", + "2022-03-17 23:12:37,365 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-17T23:12:37.365118', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n" + ] + } + ], "source": [ "cores = multiprocessing.cpu_count()\n", "\n", "models = [\n", " # PV-DBOW \n", - " Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=19, iter=10, workers=cores),\n", - " # PV-DM w/average\n", - " Doc2Vec(dm=1, dm_mean=1, size=200, window=8, min_count=19, iter =10, workers=cores),\n", + " Doc2Vec(dm=0, dbow_words=1, vector_size=200, window=8, epochs=10, workers=cores, max_final_vocab=915715),\n", + " # PV-DM with average\n", + " Doc2Vec(dm=1, dm_mean=1, vector_size=200, window=8, epochs=10, workers=cores, max_final_vocab=915715),\n", "]" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-03-17 23:14:38,521 : INFO : collecting all words and their counts\n", + "2022-03-17 23:14:38,529 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags\n", + "2022-03-17 23:16:33,505 : INFO : PROGRESS: at example #500000, processed 654950164 words (5696698/s), 3222179 word types, 500000 tags\n", + "2022-03-17 23:17:41,900 : INFO : PROGRESS: at example #1000000, processed 1018611068 words 
(5317131/s), 4480366 word types, 1000000 tags\n", + "2022-03-17 23:18:36,271 : INFO : PROGRESS: at example #1500000, processed 1305140647 words (5269927/s), 5420104 word types, 1500000 tags\n", + "2022-03-17 23:19:23,908 : INFO : PROGRESS: at example #2000000, processed 1550245240 words (5145361/s), 6188355 word types, 2000000 tags\n", + "2022-03-17 23:20:10,242 : INFO : PROGRESS: at example #2500000, processed 1790661139 words (5188872/s), 6941128 word types, 2500000 tags\n", + "2022-03-17 23:20:56,600 : INFO : PROGRESS: at example #3000000, processed 2028261627 words (5125392/s), 7664997 word types, 3000000 tags\n", + "2022-03-17 23:21:41,918 : INFO : PROGRESS: at example #3500000, processed 2264063867 words (5203393/s), 8347719 word types, 3500000 tags\n", + "2022-03-17 23:22:25,048 : INFO : PROGRESS: at example #4000000, processed 2488354257 words (5200461/s), 8971529 word types, 4000000 tags\n", + "2022-03-17 23:23:07,487 : INFO : PROGRESS: at example #4500000, processed 2703313059 words (5065249/s), 9605666 word types, 4500000 tags\n", + "2022-03-17 23:23:50,776 : INFO : PROGRESS: at example #5000000, processed 2925111571 words (5123692/s), 10217554 word types, 5000000 tags\n", + "2022-03-17 23:24:19,393 : INFO : collected 10427023 word types and 5176019 unique tags from a corpus of 5176019 examples and 2996051328 words\n", + "2022-03-17 23:24:22,841 : INFO : Doc2Vec lifecycle event {'msg': 'max_final_vocab=915715 and min_count=5 resulted in calc_min_count=27, effective_min_count=27', 'datetime': '2022-03-17T23:24:22.841740', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-17 23:24:22,842 : INFO : Creating a fresh vocabulary\n", + "2022-03-17 23:24:26,131 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=27 retains 894446 unique words (8.578153131531407%% of original 10427023, drops 9532577)', 
'datetime': '2022-03-17T23:24:26.131147', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-17 23:24:26,131 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=27 leaves 2965917340 word corpus (98.99420988824929%% of original 2996051328, drops 30133988)', 'datetime': '2022-03-17T23:24:26.131643', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-17 23:24:28,513 : INFO : deleting the raw counts dictionary of 10427023 items\n", + "2022-03-17 23:24:28,581 : INFO : sample=0.001 downsamples 23 most-common words\n", + "2022-03-17 23:24:28,581 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 2412497836.8123784 word corpus (81.3%% of prior 2965917340)', 'datetime': '2022-03-17T23:24:28.581828', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-17 23:24:32,724 : INFO : estimated required memory for 894446 words and 200 dimensions: 7054355600 bytes\n", + "2022-03-17 23:24:32,725 : INFO : resetting layer weights\n", + "2022-03-17 23:24:37,804 : INFO : resetting layer weights\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec(dbow+w,d200,hs,w8,mc19,t8)\n", - "Doc2Vec(dm/m,d200,hs,w8,mc19,t8)\n" + "Doc2Vec\n", + "Doc2Vec\n" ] } ], "source": [ - "models[0].build_vocab(documents)\n", - "print(str(models[0]))\n", + "models[0].build_vocab(documents, progress_per=500000)\n", + "print(models[0])\n", + "\n", + "# Save some time by copying the vocabulary structures from the first model.\n", + "# Both models are built on top of exactly the same data, so there's no need to repeat the 
vocab-building step.\n", "models[1].reset_from(models[0])\n", - "print(str(models[1]))" + "print(models[1])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now we’re ready to train Doc2Vec of the English Wikipedia. " + "Now we’re ready to train Doc2Vec on the English Wikipedia. " ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "scrolled": true }, "outputs": [ { - "name": "stdout", + "name": "stderr", "output_type": "stream", "text": [ - "CPU times: user 5d 18h 24min 30s, sys: 26min 6s, total: 5d 18h 50min 36s\n", - "Wall time: 1d 2h 58min 58s\n", - "CPU times: user 1d 1h 28min 2s, sys: 33min 15s, total: 1d 2h 1min 18s\n", - "Wall time: 15h 27min 18s\n" + "2022-03-17 23:29:03,317 : WARNING : Effective 'alpha' higher than previous training cycles\n", + "2022-03-17 23:29:03,320 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 10 workers on 894446 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=5 window=8 shrink_windows=True', 'datetime': '2022-03-17T23:29:03.320153', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", + "2022-03-17 23:29:04,361 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 379389 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-17 23:59:04,372 : INFO : EPOCH 1 - PROGRESS: at 17.95% examples, 429937 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 00:29:04,379 : INFO : EPOCH 1 - PROGRESS: at 55.55% examples, 437068 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 00:59:04,423 : INFO : EPOCH 1 - PROGRESS: at 98.13% examples, 439343 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 01:00:11,996 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 01:00:12,013 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 01:00:12,028 : INFO : worker thread finished; 
awaiting finish of 7 more threads\n", + "2022-03-18 01:00:12,045 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 01:00:12,084 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 01:00:12,110 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 01:00:12,124 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 01:00:12,127 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 01:00:12,128 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 01:00:12,149 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 01:00:12,150 : INFO : EPOCH - 1 : training on 2996051328 raw words (2402988821 effective words) took 5468.8s, 439397 effective words/s\n", + "2022-03-18 01:00:13,169 : INFO : EPOCH 2 - PROGRESS: at 0.00% examples, 390039 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 01:30:13,186 : INFO : EPOCH 2 - PROGRESS: at 19.41% examples, 451763 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 02:00:13,162 : INFO : EPOCH 2 - PROGRESS: at 57.23% examples, 446954 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 02:30:05,143 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 02:30:05,151 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 02:30:05,152 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 02:30:05,162 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 02:30:05,206 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 02:30:05,229 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 02:30:05,232 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 02:30:05,244 : INFO : worker thread finished; awaiting 
finish of 2 more threads\n", + "2022-03-18 02:30:05,248 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 02:30:05,255 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 02:30:05,255 : INFO : EPOCH - 2 : training on 2996051328 raw words (2402947663 effective words) took 5393.0s, 445566 effective words/s\n", + "2022-03-18 02:30:06,266 : INFO : EPOCH 3 - PROGRESS: at 0.00% examples, 414962 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 03:00:06,339 : INFO : EPOCH 3 - PROGRESS: at 19.29% examples, 449902 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 03:30:06,314 : INFO : EPOCH 3 - PROGRESS: at 57.27% examples, 447187 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 03:59:56,898 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 03:59:56,905 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 03:59:56,908 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 03:59:56,919 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 03:59:56,982 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 03:59:56,989 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 03:59:57,008 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 03:59:57,020 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 03:59:57,025 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 03:59:57,034 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 03:59:57,034 : INFO : EPOCH - 3 : training on 2996051328 raw words (2402969567 effective words) took 5391.8s, 445672 effective words/s\n", + "2022-03-18 03:59:58,059 : INFO : EPOCH 4 - PROGRESS: at 0.00% examples, 400337 words/s, in_qsize 19, out_qsize 1\n", + 
"2022-03-18 04:29:58,091 : INFO : EPOCH 4 - PROGRESS: at 19.41% examples, 451678 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 04:59:58,167 : INFO : EPOCH 4 - PROGRESS: at 57.02% examples, 445731 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 05:29:58,234 : INFO : EPOCH 4 - PROGRESS: at 99.74% examples, 444166 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 05:30:07,257 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 05:30:07,259 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 05:30:07,262 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 05:30:07,296 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 05:30:07,321 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 05:30:07,327 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 05:30:07,337 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 05:30:07,360 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 05:30:07,363 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 05:30:07,395 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 05:30:07,395 : INFO : EPOCH - 4 : training on 2996051328 raw words (2402983106 effective words) took 5410.2s, 444155 effective words/s\n", + "2022-03-18 05:30:08,414 : INFO : EPOCH 5 - PROGRESS: at 0.00% examples, 407411 words/s, in_qsize 19, out_qsize 1\n", + "2022-03-18 06:00:08,451 : INFO : EPOCH 5 - PROGRESS: at 19.32% examples, 450435 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 06:30:08,480 : INFO : EPOCH 5 - PROGRESS: at 57.18% examples, 446664 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 07:00:08,481 : INFO : EPOCH 5 - PROGRESS: at 99.48% examples, 443405 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 
07:00:27,046 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 07:00:27,057 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 07:00:27,070 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 07:00:27,083 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 07:00:27,103 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 07:00:27,110 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 07:00:27,117 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 07:00:27,123 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 07:00:27,151 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 07:00:27,156 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 07:00:27,156 : INFO : EPOCH - 5 : training on 2996051328 raw words (2402973707 effective words) took 5419.7s, 443375 effective words/s\n", + "2022-03-18 07:00:28,166 : INFO : EPOCH 6 - PROGRESS: at 0.00% examples, 411123 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 07:30:28,233 : INFO : EPOCH 6 - PROGRESS: at 19.44% examples, 452181 words/s, in_qsize 18, out_qsize 1\n", + "2022-03-18 08:00:28,218 : INFO : EPOCH 6 - PROGRESS: at 57.42% examples, 447976 words/s, in_qsize 19, out_qsize 0\n" ] } ], "source": [ "for model in models:\n", - " %%time model.train(documents, total_examples=model.corpus_count, epochs=model.iter)" + " model.train(documents, total_examples=model.corpus_count, epochs=model.epochs, report_delay=30*60)" ] }, { @@ -256,118 +378,42 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "After that, let's test both models! DBOW model show similar results with the original paper. First, calculating cosine similarity of \"Machine learning\" using Paragraph Vector. 
Word Vector and Document Vector are separately stored. We have to add .docvecs after model name to extract Document Vector from Doc2Vec Model." + "After that, let's test both models! The DBOW model shows similar results as the original paper.\n", + "\n", + "First, calculate the most similar Wikipedia articles to the \"Machine learning\" article. The calculated word vectors and document vectors are separately stored, in `model.wv` and `model.dv` respectively:" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Doc2Vec(dbow+w,d200,hs,w8,mc19,t8)\n", - "[('Theoretical computer science', 0.7256590127944946),\n", - " ('Artificial neural network', 0.7162272930145264),\n", - " ('Pattern recognition', 0.6948175430297852),\n", - " ('Data mining', 0.6938608884811401),\n", - " ('Bayesian network', 0.6938260197639465),\n", - " ('Support vector machine', 0.6706081628799438),\n", - " ('Glossary of artificial intelligence', 0.670173704624176),\n", - " ('Computational learning theory', 0.6648679971694946),\n", - " ('Outline of computer science', 0.6638073921203613),\n", - " ('List of important publications in computer science', 0.663051187992096),\n", - " ('Mathematical optimization', 0.655048131942749),\n", - " ('Theory of computation', 0.6508707404136658),\n", - " ('Word-sense disambiguation', 0.6505812406539917),\n", - " ('Reinforcement learning', 0.6480429172515869),\n", - " (\"Solomonoff's theory of inductive inference\", 0.6459559202194214),\n", - " ('Computational intelligence', 0.6458009481430054),\n", - " ('Information visualization', 0.6437181234359741),\n", - " ('Algorithmic composition', 0.643247127532959),\n", - " ('Ray Solomonoff', 0.6425477862358093),\n", - " ('Kriging', 0.6425424814224243)]\n", - "Doc2Vec(dm/m,d200,hs,w8,mc19,t8)\n", - "[('Artificial neural network', 0.640324592590332),\n", - " ('Pattern recognition', 
0.6244156360626221),\n", - " ('Data stream mining', 0.6140210032463074),\n", - " ('Theoretical computer science', 0.5964258909225464),\n", - " ('Outline of computer science', 0.5862746834754944),\n", - " ('Supervised learning', 0.5847170352935791),\n", - " ('Data mining', 0.5817658305168152),\n", - " ('Decision tree learning', 0.5785809755325317),\n", - " ('Bayesian network', 0.5768401622772217),\n", - " ('Computational intelligence', 0.5717238187789917),\n", - " ('Theory of computation', 0.5703311562538147),\n", - " ('Bayesian programming', 0.5693561434745789),\n", - " ('Reinforcement learning', 0.564978837966919),\n", - " ('Helmholtz machine', 0.564972460269928),\n", - " ('Inductive logic programming', 0.5631471276283264),\n", - " ('Algorithmic learning theory', 0.563083291053772),\n", - " ('Semi-supervised learning', 0.5628935694694519),\n", - " ('Early stopping', 0.5597405433654785),\n", - " ('Decision tree', 0.5596889853477478),\n", - " ('Artificial intelligence', 0.5569720268249512)]\n" - ] - } - ], + "outputs": [], "source": [ "for model in models:\n", - " print(str(model))\n", - " pprint(model.docvecs.most_similar(positive=[\"Machine learning\"], topn=20))" + " print(model)\n", + " pprint(model.dv.most_similar(positive=[\"Machine learning\"], topn=20))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "DBOW model interpret the word 'Machine Learning' as a part of Computer Science field, and DM model as Data Science related field.\n", + "The DBOW model interprets the word 'Machine Learning' as a part of the Computer Science field, while the DM model as a Data Science related field.\n", "\n", - "Second, calculating cosine simillarity of \"Lady Gaga\" using Paragraph Vector." 
+ "Second, let's calculate the most similar Wikipedia entries to \"Lady Gaga\" using Paragraph Vector:" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Doc2Vec(dbow+w,d200,hs,w8,mc19,t8)\n", - "[('Katy Perry', 0.7374469637870789),\n", - " ('Adam Lambert', 0.6972734928131104),\n", - " ('Miley Cyrus', 0.6212848424911499),\n", - " ('List of awards and nominations received by Lady Gaga', 0.6138384938240051),\n", - " ('Nicole Scherzinger', 0.6092700958251953),\n", - " ('Christina Aguilera', 0.6062655448913574),\n", - " ('Nicki Minaj', 0.6019431948661804),\n", - " ('Taylor Swift', 0.5973174571990967),\n", - " ('The Pussycat Dolls', 0.5888757705688477),\n", - " ('Beyoncé', 0.5844652652740479)]\n", - "Doc2Vec(dm/m,d200,hs,w8,mc19,t8)\n", - "[('ArtRave: The Artpop Ball', 0.5719832181930542),\n", - " ('Artpop', 0.5651129484176636),\n", - " ('Katy Perry', 0.5571318864822388),\n", - " ('The Fame', 0.5388195514678955),\n", - " ('The Fame Monster', 0.5380634069442749),\n", - " ('G.U.Y.', 0.5365751385688782),\n", - " ('Beautiful, Dirty, Rich', 0.5329179763793945),\n", - " ('Applause (Lady Gaga song)', 0.5328119993209839),\n", - " ('The Monster Ball Tour', 0.5299569368362427),\n", - " ('Lindsey Stirling', 0.5281971096992493)]\n" - ] - } - ], + "outputs": [], "source": [ "for model in models:\n", - " print(str(model))\n", - " pprint(model.docvecs.most_similar(positive=[\"Lady Gaga\"], topn=10))" + " print(model)\n", + " pprint(model.dv.most_similar(positive=[\"Lady Gaga\"], topn=10))" ] }, { @@ -376,67 +422,40 @@ "collapsed": true }, "source": [ - "DBOW model reveal the similar singer in the U.S., and DM model understand that many of Lady Gaga's songs are similar with the word \"Lady Gaga\".\n", + "The DBOW model reveals similar singers in the U.S., while the DM model understands that many of Lady Gaga's songs contain the word \"Lady 
Gaga\".\n", "\n", - "Third, calculating cosine simillarity of \"Lady Gaga\" - \"American\" + \"Japanese\" using Document vector and Word Vectors. \"American\" and \"Japanese\" are Word Vectors, not Paragraph Vectors. Word Vectors are already converted to lowercases by WikiCorpus." + "Finally, let's do some wilder artihmetics that embeddings are famous for. What are the entries most similar to \"Lady Gaga\" - \"American\" + \"Japanese\"? Note that \"American\" and \"Japanese\" are word vectors, but they live in the same space as the document vectors so we can add / subtract them for some interesting results. Note that all word vectors were already lowercased by our tokenizer above, so we look for the lowercased version here:" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "scrolled": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Doc2Vec(dbow+w,d200,hs,w8,mc19,t8)\n", - "[('Game (Perfume album)', 0.5571034550666809),\n", - " ('Katy Perry', 0.5537782311439514),\n", - " ('Taboo (Kumi Koda song)', 0.5304880142211914),\n", - " ('Kylie Minogue', 0.5234110355377197),\n", - " ('Ayumi Hamasaki', 0.5110630989074707),\n", - " (\"Girls' Generation\", 0.4996713399887085),\n", - " ('Britney Spears', 0.49094343185424805),\n", - " ('Koda Kumi', 0.48719698190689087),\n", - " ('Perfume (Japanese band)', 0.48536181449890137),\n", - " ('Kara (South Korean band)', 0.48507893085479736)]\n", - "Doc2Vec(dm/m,d200,hs,w8,mc19,t8)\n", - "[('Artpop', 0.47699037194252014),\n", - " ('Jessie J', 0.4439432621002197),\n", - " ('Haus of Gaga', 0.43463900685310364),\n", - " ('The Fame', 0.4278091788291931),\n", - " ('List of awards and nominations received by Lady Gaga', 0.4268512427806854),\n", - " ('Applause (Lady Gaga song)', 0.41563737392425537),\n", - " ('New Cutie Honey', 0.4152414798736572),\n", - " ('M.I.A. 
(rapper)', 0.4091864228248596),\n", - " ('Mama Do (Uh Oh, Uh Oh)', 0.4044945538043976),\n", - " ('The Fame Monster', 0.40421998500823975)]\n" - ] - } - ], + "outputs": [], "source": [ "for model in models:\n", - " print(str(model))\n", - " vec = [model.docvecs[\"Lady Gaga\"] - model[\"american\"] + model[\"japanese\"]]\n", - " pprint([m for m in model.docvecs.most_similar(vec, topn=11) if m[0] != \"Lady Gaga\"])" + " print(model)\n", + " vec = [model.dv[\"Lady Gaga\"] - model.wv[\"american\"] + model.wv[\"japanese\"]]\n", + " pprint([m for m in model.dv.most_similar(vec, topn=11) if m[0] != \"Lady Gaga\"])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "As a result, DBOW model demonstrate similar artists to Lady Gaga in Japan such as 'Perfume', who is the most famous idol in Japan. On the other hand, DM model results don't include Japanese artists in top 10 similar documents. It's almost the same with no vector calculated results.\n", + "As a result, the DBOW model surfaced artists similar to Lady Gaga in Japan, such as 'Perfume', who is the most famous idol in Japan.\n", + "\n", + "On the other hand, results from the DM model don't include any Japanese artists in its top 10 most similar documents.\n", "\n", - "These results demonstrate that the DBOW employed in the original paper is outstanding for calculating similarity between Document Vector and Word Vector." + "These results demonstrate that the DBOW training mode employed in the original paper is serviceable for calculating similarity between document vectors, word vectors, or a combination of both." 
] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -450,7 +469,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.1" + "version": "3.8.9" } }, "nbformat": 4, diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 5f4c173b8a..ee8c4ef281 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -452,8 +452,10 @@ def extract_pages(f, filter_namespaces=False, filter_articles=None): _extract_pages = extract_pages # for backward compatibility -def process_article(args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, - token_max_len=TOKEN_MAX_LEN, lower=True): +def process_article( + args, tokenizer_func=tokenize, token_min_len=TOKEN_MIN_LEN, + token_max_len=TOKEN_MAX_LEN, lower=True, + ): """Parse a Wikipedia article, extract all tokens. Notes @@ -525,7 +527,7 @@ def _process_article(args): return process_article( args, tokenizer_func=tokenizer_func, token_min_len=token_min_len, - token_max_len=token_max_len, lower=lower + token_max_len=token_max_len, lower=lower, ) @@ -567,9 +569,11 @@ class WikiCorpus(TextCorpus): >>> MmCorpus.serialize(corpus_path, wiki) # another 8h, creates a file in MatrixMarket format and mapping """ - def __init__(self, fname, processes=None, lemmatize=None, dictionary=None, - filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS, - token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None): + def __init__( + self, fname, processes=None, lemmatize=None, dictionary=None, metadata=False, + filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS, + token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None, + ): """Initialize the corpus. 
Unless a dictionary is provided, this scans the corpus once, @@ -602,6 +606,9 @@ def __init__(self, fname, processes=None, lemmatize=None, dictionary=None, If set, each XML article element will be passed to this callable before being processed. Only articles where the callable returns an XML element are processed, returning None allows filtering out some articles based on customised rules. + metadata: bool + Have the `get_texts()` method yield `(content_tokens, (page_id, page_title))` tuples, instead + of just `content_tokens`. Warnings -------- @@ -618,7 +625,7 @@ def __init__(self, fname, processes=None, lemmatize=None, dictionary=None, self.fname = fname self.filter_namespaces = filter_namespaces self.filter_articles = filter_articles - self.metadata = False + self.metadata = metadata if processes is None: processes = max(1, multiprocessing.cpu_count() - 1) self.processes = processes diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index c1ff25d994..bb3a10689a 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -156,9 +156,11 @@ def count(self, new_val): class Doc2Vec(Word2Vec): - def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, - dm_tag_count=1, dv=None, dv_mapfile=None, comment=None, trim_rule=None, callbacks=(), - window=5, epochs=10, shrink_windows=True, **kwargs): + def __init__( + self, documents=None, corpus_file=None, vector_size=100, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, + dm_tag_count=1, dv=None, dv_mapfile=None, comment=None, trim_rule=None, callbacks=(), + window=5, epochs=10, shrink_windows=True, **kwargs, + ): """Class for training, using and evaluating neural networks described in `Distributed Representations of Sentences and Documents `_. 
@@ -655,7 +657,7 @@ def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None): return doctag_vectors[0] def __getitem__(self, tag): - """Get the vector representation of (possible multi-term) tag. + """Get the vector representation of (possibly multi-term) tag. Parameters ---------- @@ -836,8 +838,10 @@ def estimate_memory(self, vocab_size=None, report=None): report['doctag_syn0'] = len(self.dv) * self.vector_size * dtype(REAL).itemsize return super(Doc2Vec, self).estimate_memory(vocab_size, report=report) - def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, - keep_raw_vocab=False, trim_rule=None, **kwargs): + def build_vocab( + self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, + keep_raw_vocab=False, trim_rule=None, **kwargs, + ): """Build vocabulary from a sequence of documents (can be a once-only generator stream). Parameters @@ -877,7 +881,7 @@ def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, prog """ total_words, corpus_count = self.scan_vocab( corpus_iterable=corpus_iterable, corpus_file=corpus_file, - progress_per=progress_per, trim_rule=trim_rule + progress_per=progress_per, trim_rule=trim_rule, ) self.corpus_count = corpus_count self.corpus_total_words = total_words @@ -959,7 +963,7 @@ def _scan_vocab(self, corpus_iterable, progress_per, trim_rule): if document_no % progress_per == 0: interval_rate = (total_words - interval_count) / (default_timer() - interval_start) logger.info( - "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags", + "PROGRESS: at example #%i, processed %i words (%i words/s), %i word types, %i tags", document_no, total_words, interval_rate, len(vocab), len(doctags_list) ) interval_start = default_timer() @@ -1008,8 +1012,8 @@ def _scan_vocab(self, corpus_iterable, progress_per, trim_rule): self.raw_vocab = vocab return total_words, corpus_count - def scan_vocab(self, corpus_iterable=None, 
corpus_file=None, progress_per=10000, trim_rule=None): - """Create the models Vocabulary: A mapping from unique words in the corpus to their frequency count. + def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=100000, trim_rule=None): + """Create the model's vocabulary: a mapping from unique words in the corpus to their frequency count. Parameters ---------- @@ -1038,7 +1042,7 @@ def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, Returns ------- (int, int) - Tuple of (Total words in the corpus, number of documents) + Tuple of `(total words in the corpus, number of documents)`. """ logger.info("collecting all words and their counts") @@ -1049,7 +1053,7 @@ def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, logger.info( "collected %i word types and %i unique tags from a corpus of %i examples and %i words", - len(self.raw_vocab), len(self.dv), corpus_count, total_words + len(self.raw_vocab), len(self.dv), corpus_count, total_words, ) return total_words, corpus_count diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 6a407e860e..de247d6353 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -888,7 +888,7 @@ def print_debug(id2token, u, s, topics, num_words=10, num_neg=None): def stochastic_svd( corpus, rank, num_terms, chunksize=20000, extra_dims=None, power_iters=0, dtype=np.float64, eps=1e-6, random_seed=None, -): + ): """Run truncated Singular Value Decomposition (SVD) on a sparse input. Parameters @@ -955,7 +955,7 @@ def stochastic_svd( if scipy.sparse.issparse(corpus): m, n = corpus.shape - assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms) + assert num_terms == m, f"mismatch in number of features: {m} in sparse matrix vs. 
{num_terms} parameter" o = random_state.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix sparsetools.csc_matvecs( m, n, samples, corpus.indptr, corpus.indices, diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index d7df12e283..b289223032 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -661,7 +661,7 @@ def prepare_vocab( "prepare_vocab", msg=( f"effective_min_count={self.effective_min_count} retains {len(retain_words)} unique " - f"words ({retain_unique_pct}%% of original {original_unique_total}, drops {drop_unique})" + f"words ({retain_unique_pct:.2f}% of original {original_unique_total}, drops {drop_unique})" ), ) @@ -671,7 +671,7 @@ def prepare_vocab( "prepare_vocab", msg=( f"effective_min_count={self.effective_min_count} leaves {retain_total} word corpus " - f"({retain_pct}%% of original {original_total}, drops {drop_total})" + f"({retain_pct:.2f}% of original {original_total}, drops {drop_total})" ), ) else: @@ -706,9 +706,9 @@ def prepare_vocab( self.add_lifecycle_event( "prepare_vocab", msg=( - f"added {len(new_words)} new unique words ({new_unique_pct}%% of original " + f"added {len(new_words)} new unique words ({new_unique_pct:.2f}% of original " f"{original_unique_total}) and increased the count of {len(pre_exist_words)} " - f"pre-existing words ({pre_exist_unique_pct}%% of original {original_unique_total})" + f"pre-existing words ({pre_exist_unique_pct:.2f}% of original {original_unique_total})" ), ) retain_words = new_words + pre_exist_words From acce8a21377d1f599f8b9ec56b10321c91d4109d Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 18 Mar 2022 23:05:04 +0900 Subject: [PATCH 35/81] test and build wheels for Py3.{7,8,9,10} (#3298) * test and build wheels for Py3.{7,8,9,10} * fix yaml * upgrade pip inside the wheel building environment * use xfail with a broader filter instead of skip * debugging pip issues with Py3.10 * upgrade setuptools as well as pip * fix xfail * more 
multibuild wizardry * override install_run * Revert "override install_run" This reverts commit 8b7691a82efbd7393b557bacf64ea3f77641c005. * try a different way to upgrade pip install the docker container * try to upgrade pip inside the docker container * Revert "try to upgrade pip inside the docker container" This reverts commit 6fb2f1e483de5dba6a23c26cc581c99f11b4ce95. * yet another attempt to upgrade pip * replace curl with python call * remove 3.6 from wheel build matrix * adjust config.sh * urlretrieve script * improve urlretrieve.py hack * argh, no f-strings * upgrade setuptools as well * remove .egg files before install_run * try upgrading importlib_metadata * more .egg file removal * disable some builds for faster turn-around while hacking * disable some builds for faster turn-around while hacking * update build-wheels.yml * update numpy for py3.10 * get rid of morfessor in wheel build I don't think we use it anywhere * trim TEST_DEPENDS * clean up, lock scipy version for py310 wheel builds * upgrade manylinux version for Py310 linux build * bump Cython version to 0.29.28 * adjust build_wheels workflow test wheels in a separate step that gets skipped for Py3.10. this is a work-around for a segfault that I cannot reproduce locally. * double quotes bad, single quotes good * re-enable all wheel builds * re-enable all wheel builds * separate test step for wheels * fix yaml * argh tabs dammit * delete old test step * fixup * actually install wheel prior to test * copy pytest command from config.sh * do the tests really fail on Linux Py3.10? 
* fix testing under Windows * mark blinking test as always xfail * remove unused import * make wheel tests simpler * re-enable tests, increase tolerance --- .github/workflows/build-wheels.yml | 85 ++++++++++++++++----- .github/workflows/tests.yml | 4 + config.sh | 14 +++- continuous_integration/upgrade_pip_py310.py | 10 +++ gensim/test/test_translation_matrix.py | 7 +- gensim/test/test_word2vec.py | 2 +- setup.py | 4 +- tox.ini | 2 +- 8 files changed, 98 insertions(+), 30 deletions(-) create mode 100644 continuous_integration/upgrade_pip_py310.py diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index ebe2201a6d..ff304ea1c7 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.6, 3.7, 3.8, 3.9] + python-version: ['3.7', '3.8', '3.9', '3.10'] os: [ubuntu-latest, macos-latest, windows-latest] platform: [x64] include: @@ -43,11 +43,6 @@ jobs: # https://github.com/scipy/oldest-supported-numpy/blob/master/setup.cfg # with the exception that we enforce the minimum version to be 1.17.0. 
# - - os: ubuntu-latest - manylinux-version: 2010 - python-version: 3.6 - build-depends: numpy==1.17.0 - - os: ubuntu-latest manylinux-version: 2010 python-version: 3.7 @@ -63,11 +58,10 @@ jobs: python-version: 3.9 build-depends: numpy==1.19.3 - - os: macos-latest - travis-os-name: osx - manylinux-version: 1 - python-version: 3.6 - build-depends: numpy==1.17.0 + - os: ubuntu-latest + manylinux-version: 2014 + python-version: "3.10" + build-depends: numpy==1.22.2 scipy==1.8.0 - os: macos-latest travis-os-name: osx @@ -87,10 +81,11 @@ jobs: python-version: 3.9 build-depends: numpy==1.19.3 - - os: windows-latest - manylinux-version: 2010 - python-version: 3.6 - build-depends: numpy==1.17.0 + - os: macos-latest + travis-os-name: osx + manylinux-version: 1 + python-version: "3.10" + build-depends: numpy==1.22.2 scipy==1.8.0 - os: windows-latest manylinux-version: 2010 @@ -107,6 +102,11 @@ jobs: python-version: 3.9 build-depends: numpy==1.19.3 + - os: windows-latest + manylinux-version: 2010 + python-version: "3.10" + build-depends: numpy==1.22.2 scipy==1.8.0 + env: PKG_NAME: gensim REPO_DIR: gensim @@ -114,7 +114,7 @@ jobs: PLAT: x86_64 UNICODE_WIDTH: 32 MB_PYTHON_VERSION: ${{ matrix.python-version }} # MB_PYTHON_VERSION is needed by Multibuild - TEST_DEPENDS: Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 pytest pytest-cov mock cython nmslib pyemd testfixtures scikit-learn pyemd + TEST_DEPENDS: pytest mock testfixtures DOCKER_TEST_IMAGE: multibuild/xenial_x86_64 TRAVIS_OS_NAME: ${{ matrix.travis-os-name }} SKIP_NETWORK_TESTS: 1 @@ -144,7 +144,7 @@ jobs: run: | python -m pip install --upgrade pip pip install virtualenv - - name: Build and Install Wheels (Multibuild) + - name: Build Wheel (Multibuild) if: matrix.os != 'windows-latest' run: | echo ::group::Set up Multibuild @@ -156,17 +156,16 @@ jobs: before_install echo ::endgroup:: echo ::group::Build wheel + find . 
-type f -name "*.egg" -exec rm -v {} \; build_wheel $REPO_DIR ${{ matrix.PLAT }} echo ::endgroup:: - echo ::group::Install run - install_run ${{ matrix.PLAT }} - echo ::endgroup:: + # # We can't use multibuild on Windows, so we have to roll our own build script. # Adapted from # https://github.com/RaRe-Technologies/gensim-wheels/commit/084b863390edee05bbe15d4ec05d1ab726e52202 # - - name: Build and Install Wheels (Windows) + - name: Build Wheel (Windows) if: matrix.os == 'windows-latest' run: | echo ::group::Set up dependencies @@ -190,6 +189,50 @@ jobs: # mv dist wheelhouse + - name: Prepare for testing + run: | + # + # FIXME: Why are these eggs here? + # + # These eggs prevent the wheel from building and running on Py3.10 + # + find . -type f -name "*.egg" -exec rm -v {} \; + python -m venv test_environment + + # + # Multibuild has a test step but it essentially just installs the wheel + # and runs the test, and requires a lot of magic to get it working. + # It also does not work under Windows. + # So, we create our own simple test step here. + # + - name: Install and Test Wheel (Linux, MacOS) + if: matrix.os != 'windows-latest' + run: | + . test_environment/bin/activate + pip install pytest testfixtures mock + pip install wheelhouse/*.whl + cd test_environment + python -c 'import gensim;print(gensim.__version__)' + # + # This part relies on the wheel containing tests and required data. + # If we remove that from the wheel, we'll need to rewrite this step. 
+ # + pytest -rfxEXs --durations=20 --disable-warnings --showlocals --pyargs gensim + + # + # We need a separate testing step for windows because the command for + # activating the virtual environment is slightly different + # + - name: Install and Test Wheel (Windows) + if: matrix.os == 'windows-latest' + run: | + test_environment/Scripts/activate.bat + pip install pytest testfixtures mock + pip install wheelhouse/*.whl + cd test_environment + python -c 'import gensim;print(gensim.__version__)' + pytest -rfxEXs --durations=20 --disable-warnings --showlocals --pyargs gensim + - name: Upload wheels to s3://gensim-wheels # # Only do this if the credentials are set. diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bd36e19aee..530aff2683 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -19,8 +19,12 @@ jobs: - {name: Linux, python: 3.7, os: ubuntu-20.04, tox: 'flake8,flake8-docs'} - {name: Linux, python: 3.7, os: ubuntu-20.04, tox: 'py37-linux'} - {name: Linux, python: 3.8, os: ubuntu-20.04, tox: 'py38-linux-cov'} + - {name: Linux, python: 3.9, os: ubuntu-20.04, tox: 'py39-linux'} + - {name: Linux, python: '3.10', os: ubuntu-20.04, tox: 'py310-linux'} - {name: Windows, python: 3.7, os: windows-2019, tox: 'py37-win'} - {name: Windows, python: 3.8, os: windows-2019, tox: 'py38-win'} + - {name: Windows, python: 3.9, os: windows-2019, tox: 'py39-win'} + - {name: Windows, python: '3.10', os: windows-2019, tox: 'py310-win'} env: TOX_PARALLEL_NO_SPINNER: 1 diff --git a/config.sh b/config.sh index ed9bea2b31..30c2e9d8eb 100755 --- a/config.sh +++ b/config.sh @@ -31,8 +31,20 @@ function build_wheel_cmd { function run_tests { # Runs tests on installed distribution from an empty directory set -x - python --version pip freeze pytest -rfxEXs --durations=20 --disable-warnings --showlocals --pyargs gensim set +x } + +# +# We do this here because we want to upgrade pip before the wheel gets installed. 
+# docker_test_wrap.sh sources this file before the wheel install. The sourcing +# happens from multiple places, and some of the Python versions can be really +# ancient (e.g. when working outside a virtual environment, using the default +# Python install). +# +# We don't use pip to do the actual upgrade because something appears broken +# with the default pip on the Python 3.10 multibuild image. This is really +# dodgy, but I couldn't work out a better way to get this done. +# +python continuous_integration/upgrade_pip_py310.py diff --git a/continuous_integration/upgrade_pip_py310.py b/continuous_integration/upgrade_pip_py310.py new file mode 100644 index 0000000000..2a9cb68893 --- /dev/null +++ b/continuous_integration/upgrade_pip_py310.py @@ -0,0 +1,10 @@ +# This script needs to be able run under both Python 2 and 3 without crashing +# It only achieves the desired effect under Py3.10 on Linux and MacOS. +import subprocess +import sys +import tempfile +if sys.platform in ('linux', 'darwin') and sys.version_info[:2] == (3, 10): + import urllib.request + with tempfile.NamedTemporaryFile(suffix='.py') as fout: + urllib.request.urlretrieve("https://bootstrap.pypa.io/get-pip.py", fout.name) + subprocess.call([sys.executable, fout.name]) diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py index bd98ca10d9..c725fc0139 100644 --- a/gensim/test/test_translation_matrix.py +++ b/gensim/test/test_translation_matrix.py @@ -6,6 +6,7 @@ import logging import numpy as np +import pytest from scipy.spatial.distance import cosine from gensim.models.doc2vec import Doc2Vec @@ -61,9 +62,9 @@ def test_translate_nn(self): for idx, item in enumerate(self.test_word_pairs): self.assertTrue(item[1] in translated_words[item[0]]) - @unittest.skipIf( - (sys.version_info.major == 3) and (sys.version_info.minor == 9) and (sys.platform == 'darwin'), - 'blinking test, can be related to ' + @pytest.mark.xfail( + sys.platform == 'darwin', + reason='blinking 
test, can be related to ' ) def test_translate_gc(self): # Test globally corrected neighbour retrieval method diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 71996ca981..56a1ecfae0 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -834,7 +834,7 @@ def test_parallel(self): # the exact vectors and therefore similarities may differ, due to different thread collisions/randomization # so let's test only for top10 neighbor_rank = [word for word, sim in sims].index(expected_neighbor) - self.assertLess(neighbor_rank, 2) + self.assertLess(neighbor_rank, 3) def test_r_n_g(self): """Test word2vec results identical with identical RNG seed.""" diff --git a/setup.py b/setup.py index d1238e8fa7..28104d86d9 100644 --- a/setup.py +++ b/setup.py @@ -270,7 +270,6 @@ def run(self): 'mock', 'cython', 'testfixtures', - 'Morfessor>=2.0.2a4', ] if not (sys.platform.lower().startswith("win") and sys.version_info[:2] >= (3, 9)): @@ -319,13 +318,12 @@ def run(self): # to build with any sane version of Cython, so we should update this pin # periodically. 
# -CYTHON_STR = 'Cython==0.29.23' +CYTHON_STR = 'Cython==0.29.28' install_requires = [ NUMPY_STR, 'scipy >= 0.18.1', 'smart_open >= 1.8.1', - "dataclasses; python_version < '3.7'", # pre-py3.7 needs `dataclasses` backport for use of `dataclass` in doc2vec.py ] setup_requires = [NUMPY_STR] diff --git a/tox.ini b/tox.ini index f602550e8a..566e331997 100644 --- a/tox.ini +++ b/tox.ini @@ -1,6 +1,6 @@ [tox] minversion = 2.0 -envlist = {py36,py37,py38, py39}-{win,linux}, py38-linux-cov, flake8, docs, docs-upload, download-wheels, upload-wheels, test-pypi +envlist = {py37,py38,py39,py310}-{win,linux}, py38-linux-cov, flake8, docs, docs-upload, download-wheels, upload-wheels, test-pypi skipsdist = True platform = linux: linux win: win64 From a9365210f3dfd3fed4c8f91faff7ad119dbb5cf4 Mon Sep 17 00:00:00 2001 From: Andrii Oriekhov Date: Fri, 18 Mar 2022 16:06:49 +0200 Subject: [PATCH 36/81] add GitHub URL for PyPi (#3303) --- setup.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/setup.py b/setup.py index 28104d86d9..1be3057c3e 100644 --- a/setup.py +++ b/setup.py @@ -346,6 +346,9 @@ def run(self): author_email='me@radimrehurek.com', url='http://radimrehurek.com/gensim', + project_urls={ + 'Source': 'https://github.com/RaRe-Technologies/gensim', + }, download_url='http://pypi.python.org/pypi/gensim', license='LGPL-2.1-only', From 766b9e19585968d011649b53057fba3663eff551 Mon Sep 17 00:00:00 2001 From: Gordon Mohr Date: Fri, 18 Mar 2022 07:10:52 -0700 Subject: [PATCH 37/81] Ensure next_index available when loading old stored KeyedVectors models (#3117) * fix #3114: ensure next_index available * rm trailing whitespace --- gensim/models/keyedvectors.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index f56adb0b14..674689afce 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -274,6 +274,9 @@ def _load_specials(self, *args, **kwargs): # fixup rename of vocab into map if 
'key_to_index' not in self.__dict__: self._upconvert_old_vocab() + # ensure older instances have next_index + if not hasattr(self, 'next_index'): + self.next_index = len(self) def _upconvert_old_vocab(self): """Convert a loaded, pre-gensim-4.0.0 version instance that had a 'vocab' dict of data objects.""" From 9e8e90a5b3d75c753dd6d0c492aed9c89610e5f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 18 Mar 2022 19:09:47 +0100 Subject: [PATCH 38/81] finished doc2vec-wikipedia notebook --- docs/notebooks/doc2vec-wikipedia.ipynb | 416 +++++++++++++++++++++++-- docs/src/models/fasttext.rst | 2 +- gensim/models/word2vec.py | 2 +- 3 files changed, 396 insertions(+), 24 deletions(-) diff --git a/docs/notebooks/doc2vec-wikipedia.ipynb b/docs/notebooks/doc2vec-wikipedia.ipynb index ed1f37df7f..66b48b9062 100644 --- a/docs/notebooks/doc2vec-wikipedia.ipynb +++ b/docs/notebooks/doc2vec-wikipedia.ipynb @@ -11,9 +11,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We replicate the **Document Embedding with Paragraph Vectors** paper, http://arxiv.org/abs/1507.07998.\n", + "This notebook replicates the **Document Embedding with Paragraph Vectors** paper, http://arxiv.org/abs/1507.07998.\n", "\n", - "In this paper, the authors showed only DBOW results trained from Wikipedia data. So we replicate this experiments using not only DBOW but also the DM mode of the \"paragraph vector\" algorithm aka Doc2Vec." + "In that paper, the authors only showed results from the DBOW (\"distributed bag of words\") mode, trained on the English Wikipedia. Here we replicate this experiment using not only DBOW, but also the DM mode of the \"paragraph vector\" algorithm aka Doc2Vec." ] }, { @@ -62,9 +62,7 @@ "\n", "Second, convert that Wikipedia article dump from the arcane Wikimedia XML format into a plain text file. 
This will make the subsequent training faster and also allow easy inspection of the data = \"input eyeballing\".\n", "\n", - "We'll preprocess each article at the same time, normalizing its text to lowercase, splitting into tokens, etc.\n", - "\n", - "Below I use a regexp tokenizer that simply looks for alphabetic sequences as tokens. But feel free to adapt the text preprocessing to your own domain. High quality preprocessing is often critical for the final pipeline accuracy – garbage in, garbage out!" + "We'll preprocess each article at the same time, normalizing its text to lowercase, splitting into tokens, etc. Below I use a regexp tokenizer that simply looks for alphabetic sequences as tokens. But feel free to adapt the text preprocessing to your own domain. High quality preprocessing is often critical for the final pipeline accuracy – garbage in, garbage out!" ] }, { @@ -120,7 +118,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The above takes about 2 hours on my 2021 M1 MacbookPro, and creates a new ~5.8 GB file named `wiki.txt.gz`. We're compressing the text into `.gz` (GZIP) right away to save on disk space, using the [smart_open](https://github.com/RaRe-Technologies/smart_open) library.\n", + "The above takes about 2 hours on my 2021 M1 MacbookPro, and creates a new ~5.8 GB file named `wiki.txt.gz`. We're compressing the text into `.gz` (GZIP) right away to save on disk space, using the [smart_open](https://github.com/RaRe-Technologies/smart_open) library.\n", "\n", "Next we'll set up a stream to load the preprocessed articles from `wiki.txt.gz` one by one, in the format expected by Doc2Vec, ready for training. We don't want to load everything into RAM at once, because that would blow up the memory. And it is not necessary – Gensim can handle streamed training data:" ] @@ -267,12 +265,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we’re ready to train Doc2Vec on the English Wikipedia. 
" + "Now we’re ready to train Doc2Vec on the English Wikipedia. **Warning!** Training the DBOW model takes ~16 hours, and DM ~4 hours, on my 2021 laptop." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "scrolled": true }, @@ -358,7 +356,221 @@ "2022-03-18 07:00:27,156 : INFO : EPOCH - 5 : training on 2996051328 raw words (2402973707 effective words) took 5419.7s, 443375 effective words/s\n", "2022-03-18 07:00:28,166 : INFO : EPOCH 6 - PROGRESS: at 0.00% examples, 411123 words/s, in_qsize 19, out_qsize 0\n", "2022-03-18 07:30:28,233 : INFO : EPOCH 6 - PROGRESS: at 19.44% examples, 452181 words/s, in_qsize 18, out_qsize 1\n", - "2022-03-18 08:00:28,218 : INFO : EPOCH 6 - PROGRESS: at 57.42% examples, 447976 words/s, in_qsize 19, out_qsize 0\n" + "2022-03-18 08:00:28,218 : INFO : EPOCH 6 - PROGRESS: at 57.42% examples, 447976 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 08:30:28,213 : INFO : EPOCH 6 - PROGRESS: at 99.29% examples, 442848 words/s, in_qsize 19, out_qsize 0\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-03-18 08:30:54,071 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 08:30:54,085 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 08:30:54,094 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 08:30:54,131 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 08:30:54,132 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 08:30:54,145 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 08:30:54,164 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 08:30:54,171 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 08:30:54,183 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + 
"2022-03-18 08:30:54,189 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 08:30:54,189 : INFO : EPOCH - 6 : training on 2996051328 raw words (2402970085 effective words) took 5427.0s, 442777 effective words/s\n", + "2022-03-18 08:30:55,193 : INFO : EPOCH 7 - PROGRESS: at 0.00% examples, 410013 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 09:00:55,267 : INFO : EPOCH 7 - PROGRESS: at 18.94% examples, 444759 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 09:30:55,268 : INFO : EPOCH 7 - PROGRESS: at 55.80% examples, 438741 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 10:00:55,370 : INFO : EPOCH 7 - PROGRESS: at 96.36% examples, 433564 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 10:03:28,340 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 10:03:28,355 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 10:03:28,356 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 10:03:28,376 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 10:03:28,384 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 10:03:28,418 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 10:03:28,419 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 10:03:28,428 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 10:03:28,454 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 10:03:28,468 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 10:03:28,468 : INFO : EPOCH - 7 : training on 2996051328 raw words (2402959910 effective words) took 5554.2s, 432641 effective words/s\n", + "2022-03-18 10:03:29,479 : INFO : EPOCH 8 - PROGRESS: at 0.00% examples, 369271 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 
10:33:29,480 : INFO : EPOCH 8 - PROGRESS: at 17.46% examples, 422524 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 11:03:29,445 : INFO : EPOCH 8 - PROGRESS: at 50.17% examples, 408225 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 11:33:29,516 : INFO : EPOCH 8 - PROGRESS: at 90.78% examples, 414606 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 11:39:44,147 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 11:39:44,166 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 11:39:44,171 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 11:39:44,186 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 11:39:44,215 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 11:39:44,226 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 11:39:44,231 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 11:39:44,256 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 11:39:44,262 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 11:39:44,268 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 11:39:44,268 : INFO : EPOCH - 8 : training on 2996051328 raw words (2402978662 effective words) took 5775.8s, 416044 effective words/s\n", + "2022-03-18 11:39:45,288 : INFO : EPOCH 9 - PROGRESS: at 0.00% examples, 394893 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 12:09:45,255 : INFO : EPOCH 9 - PROGRESS: at 19.03% examples, 446055 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 12:39:45,321 : INFO : EPOCH 9 - PROGRESS: at 56.60% examples, 443313 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 13:09:45,381 : INFO : EPOCH 9 - PROGRESS: at 97.33% examples, 436797 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 13:11:27,488 : INFO 
: worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 13:11:27,492 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 13:11:27,504 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 13:11:27,535 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 13:11:27,552 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 13:11:27,553 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 13:11:27,564 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 13:11:27,567 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 13:11:27,584 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 13:11:27,592 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 13:11:27,592 : INFO : EPOCH - 9 : training on 2996051328 raw words (2402988333 effective words) took 5503.3s, 436648 effective words/s\n", + "2022-03-18 13:11:28,615 : INFO : EPOCH 10 - PROGRESS: at 0.00% examples, 391894 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 13:41:28,611 : INFO : EPOCH 10 - PROGRESS: at 17.83% examples, 428194 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 14:11:28,622 : INFO : EPOCH 10 - PROGRESS: at 51.72% examples, 416555 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 14:41:28,621 : INFO : EPOCH 10 - PROGRESS: at 91.60% examples, 417311 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-18 14:47:23,420 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 14:47:23,432 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 14:47:23,433 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 14:47:23,437 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 
14:47:23,469 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 14:47:23,484 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 14:47:23,502 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 14:47:23,512 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 14:47:23,516 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 14:47:23,534 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 14:47:23,534 : INFO : EPOCH - 10 : training on 2996051328 raw words (2402969667 effective words) took 5755.9s, 417476 effective words/s\n", + "2022-03-18 14:47:23,536 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (24029729521 effective words) took 55099.9s, 436112 effective words/s', 'datetime': '2022-03-18T14:47:23.536569', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", + "2022-03-18 14:47:23,537 : WARNING : Effective 'alpha' higher than previous training cycles\n", + "2022-03-18 14:47:23,537 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 10 workers on 894446 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=8 shrink_windows=True', 'datetime': '2022-03-18T14:47:23.537351', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", + "2022-03-18 14:47:24,546 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 121520 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-18 15:12:45,307 : INFO : worker thread finished; awaiting finish of 9 more threads\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-03-18 15:12:45,325 : INFO : worker thread 
finished; awaiting finish of 8 more threads\n", + "2022-03-18 15:12:45,326 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 15:12:45,327 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 15:12:45,327 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 15:12:45,332 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 15:12:45,338 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 15:12:45,346 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 15:12:45,348 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 15:12:45,354 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 15:12:45,355 : INFO : EPOCH - 1 : training on 2996051328 raw words (2402951760 effective words) took 1521.7s, 1579074 effective words/s\n", + "2022-03-18 15:12:46,373 : INFO : EPOCH 2 - PROGRESS: at 0.01% examples, 1835607 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-18 15:38:18,272 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 15:38:18,310 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 15:38:18,312 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 15:38:18,313 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 15:38:18,319 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 15:38:18,320 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 15:38:18,325 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 15:38:18,336 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 15:38:18,338 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + 
"2022-03-18 15:38:18,339 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 15:38:18,339 : INFO : EPOCH - 2 : training on 2996051328 raw words (2402972271 effective words) took 1533.0s, 1567541 effective words/s\n", + "2022-03-18 15:38:19,355 : INFO : EPOCH 3 - PROGRESS: at 0.01% examples, 1940890 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-18 16:02:47,736 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 16:02:47,759 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 16:02:47,762 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 16:02:47,764 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 16:02:47,764 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 16:02:47,775 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 16:02:47,781 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 16:02:47,788 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 16:02:47,789 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 16:02:47,791 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 16:02:47,791 : INFO : EPOCH - 3 : training on 2996051328 raw words (2402988495 effective words) took 1469.4s, 1635360 effective words/s\n", + "2022-03-18 16:02:48,814 : INFO : EPOCH 4 - PROGRESS: at 0.01% examples, 2013560 words/s, in_qsize 0, out_qsize 2\n", + "2022-03-18 16:26:11,222 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 16:26:11,234 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 16:26:11,236 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 16:26:11,239 : INFO : worker thread finished; awaiting finish of 6 
more threads\n", + "2022-03-18 16:26:11,242 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 16:26:11,245 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 16:26:11,254 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 16:26:11,258 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 16:26:11,261 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 16:26:11,262 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 16:26:11,262 : INFO : EPOCH - 4 : training on 2996051328 raw words (2402958098 effective words) took 1403.4s, 1712179 effective words/s\n", + "2022-03-18 16:26:12,270 : INFO : EPOCH 5 - PROGRESS: at 0.01% examples, 2003817 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-18 16:50:15,159 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 16:50:15,175 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 16:50:15,176 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 16:50:15,177 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 16:50:15,183 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 16:50:15,186 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 16:50:15,198 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 16:50:15,206 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 16:50:15,206 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 16:50:15,207 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 16:50:15,207 : INFO : EPOCH - 5 : training on 2996051328 raw words (2402956752 effective words) took 1443.9s, 1664163 
effective words/s\n", + "2022-03-18 16:50:16,219 : INFO : EPOCH 6 - PROGRESS: at 0.01% examples, 1987406 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-18 17:13:45,624 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 17:13:45,632 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 17:13:45,635 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 17:13:45,636 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 17:13:45,637 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 17:13:45,640 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 17:13:45,651 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 17:13:45,657 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 17:13:45,663 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 17:13:45,665 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 17:13:45,665 : INFO : EPOCH - 6 : training on 2996051328 raw words (2402960350 effective words) took 1410.5s, 1703664 effective words/s\n", + "2022-03-18 17:13:46,675 : INFO : EPOCH 7 - PROGRESS: at 0.01% examples, 1985995 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-18 17:37:06,489 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 17:37:06,523 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 17:37:06,524 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 17:37:06,524 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 17:37:06,525 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 17:37:06,529 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 
17:37:06,537 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 17:37:06,543 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 17:37:06,545 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 17:37:06,546 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 17:37:06,546 : INFO : EPOCH - 7 : training on 2996051328 raw words (2402972246 effective words) took 1400.9s, 1715338 effective words/s\n", + "2022-03-18 17:37:07,560 : INFO : EPOCH 8 - PROGRESS: at 0.01% examples, 2069561 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-18 18:00:31,024 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 18:00:31,053 : INFO : worker thread finished; awaiting finish of 8 more threads\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-03-18 18:00:31,056 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 18:00:31,057 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 18:00:31,059 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 18:00:31,059 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 18:00:31,068 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 18:00:31,072 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 18:00:31,075 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 18:00:31,076 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 18:00:31,076 : INFO : EPOCH - 8 : training on 2996051328 raw words (2402970402 effective words) took 1404.5s, 1710899 effective words/s\n", + "2022-03-18 18:00:32,091 : INFO : EPOCH 9 - PROGRESS: at 0.01% examples, 2063533 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-18 
18:23:47,471 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 18:23:47,482 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 18:23:47,485 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 18:23:47,489 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 18:23:47,490 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 18:23:47,492 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 18:23:47,497 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 18:23:47,506 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2022-03-18 18:23:47,507 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 18:23:47,509 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 18:23:47,510 : INFO : EPOCH - 9 : training on 2996051328 raw words (2402978209 effective words) took 1396.4s, 1720847 effective words/s\n", + "2022-03-18 18:23:48,527 : INFO : EPOCH 10 - PROGRESS: at 0.01% examples, 1857646 words/s, in_qsize 11, out_qsize 0\n", + "2022-03-18 18:47:53,356 : INFO : worker thread finished; awaiting finish of 9 more threads\n", + "2022-03-18 18:47:53,364 : INFO : worker thread finished; awaiting finish of 8 more threads\n", + "2022-03-18 18:47:53,368 : INFO : worker thread finished; awaiting finish of 7 more threads\n", + "2022-03-18 18:47:53,370 : INFO : worker thread finished; awaiting finish of 6 more threads\n", + "2022-03-18 18:47:53,370 : INFO : worker thread finished; awaiting finish of 5 more threads\n", + "2022-03-18 18:47:53,372 : INFO : worker thread finished; awaiting finish of 4 more threads\n", + "2022-03-18 18:47:53,377 : INFO : worker thread finished; awaiting finish of 3 more threads\n", + "2022-03-18 18:47:53,383 : INFO : worker thread finished; awaiting 
finish of 2 more threads\n", + "2022-03-18 18:47:53,385 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2022-03-18 18:47:53,389 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2022-03-18 18:47:53,390 : INFO : EPOCH - 10 : training on 2996051328 raw words (2402975872 effective words) took 1445.9s, 1661955 effective words/s\n", + "2022-03-18 18:47:53,391 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (24029684455 effective words) took 14429.7s, 1665293 effective words/s', 'datetime': '2022-03-18T18:47:53.391169', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n" ] } ], @@ -380,14 +592,63 @@ "source": [ "After that, let's test both models! The DBOW model shows similar results as the original paper.\n", "\n", - "First, calculate the most similar Wikipedia articles to the \"Machine learning\" article. The calculated word vectors and document vectors are separately stored, in `model.wv` and `model.dv` respectively:" + "First, calculate the most similar Wikipedia articles to the \"Machine learning\" article. 
The calculated word vectors and document vectors are stored separately, in `model.wv` and `model.dv` respectively:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Doc2Vec\n", + "[('Supervised learning', 0.7626678943634033),\n", + " ('Pattern recognition', 0.7443839907646179),\n", + " ('Artificial neural network', 0.7443667650222778),\n", + " ('Boosting (machine learning)', 0.7209591865539551),\n", + " ('Deep learning', 0.7030681371688843),\n", + " ('Linear classifier', 0.6918482184410095),\n", + " ('Feature selection', 0.6885010600090027),\n", + " ('Knowledge retrieval', 0.6797034740447998),\n", + " ('Convolutional neural network', 0.6789148449897766),\n", + " ('Outline of computer science', 0.6732515096664429),\n", + " ('Training, validation, and test sets', 0.6729527711868286),\n", + " ('Support-vector machine', 0.6719434857368469),\n", + " ('Learning classifier system', 0.6716565489768982),\n", + " ('Outline of machine learning', 0.6692107915878296),\n", + " ('Bayesian network', 0.6654112935066223),\n", + " ('Manifold regularization', 0.6635575294494629),\n", + " ('Multi-task learning', 0.6624512672424316),\n", + " ('Fuzzy logic', 0.6605969667434692),\n", + " ('Computer mathematics', 0.6600310206413269),\n", + " ('Recurrent neural network', 0.6571199893951416)]\n", + "Doc2Vec\n", + "[('Pattern recognition', 0.731984555721283),\n", + " ('Supervised learning', 0.7107947468757629),\n", + " ('Multi-task learning', 0.6985798478126526),\n", + " ('Semi-supervised learning', 0.6792073249816895),\n", + " ('Meta learning (computer science)', 0.6784282922744751),\n", + " ('Support-vector machine', 0.6740356683731079),\n", + " ('Feature selection', 0.6702772378921509),\n", + " ('Statistical learning theory', 0.6683863997459412),\n", + " ('Automatic image annotation', 0.661750078201294),\n", + " ('Deep learning', 
0.6617218255996704),\n", + " ('Linear classifier', 0.6573296189308167),\n", + " ('Statistical classification', 0.654957115650177),\n", + " ('Regularization (mathematics)', 0.6517974138259888),\n", + " ('Data analysis techniques for fraud detection', 0.6505621671676636),\n", + " ('Artificial neural network', 0.6478281021118164),\n", + " ('Boosting (machine learning)', 0.6463974714279175),\n", + " ('Naive Bayes classifier', 0.6442222595214844),\n", + " ('Autoencoder', 0.6438822746276855),\n", + " ('Predictive Model Markup Language', 0.6405109763145447),\n", + " ('Perceptron', 0.6379765868186951)]\n" + ] + } + ], "source": [ "for model in models:\n", " print(model)\n", @@ -398,18 +659,47 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The DBOW model interprets the word 'Machine Learning' as a part of the Computer Science field, while the DM model as a Data Science related field.\n", + "Both results seem similar, but note the DM model took 4x less time to train (training 4x faster).\n", "\n", "Second, let's calculate the most similar Wikipedia entries to \"Lady Gaga\" using Paragraph Vector:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Doc2Vec\n", + "[('Ariana Grande', 0.755739688873291),\n", + " ('Katy Perry', 0.7534462213516235),\n", + " ('Miley Cyrus', 0.7091007828712463),\n", + " ('Adele', 0.6958011984825134),\n", + " ('Demi Lovato', 0.6867919564247131),\n", + " ('Nicki Minaj', 0.6783465147018433),\n", + " ('Taylor Swift', 0.6691418886184692),\n", + " ('Adam Lambert', 0.6638894081115723),\n", + " ('Rihanna', 0.6437391638755798),\n", + " ('Kesha', 0.6433634161949158)]\n", + "Doc2Vec\n", + "[('Born This Way (album)', 0.6649508476257324),\n", + " ('Artpop', 0.6616811752319336),\n", + " ('Lady Gaga videography', 0.6363328695297241),\n", + " ('Katy Perry', 0.6322777271270752),\n", + "
('Beautiful, Dirty, Rich', 0.6277879476547241),\n", + " ('Lady Gaga discography', 0.60688316822052),\n", + " ('Applause (Lady Gaga song)', 0.6062529683113098),\n", + " ('List of Lady Gaga live performances', 0.5975069403648376),\n", + " ('Born This Way (song)', 0.5948888659477234),\n", + " ('Madonna', 0.5918263792991638)]\n" + ] + } + ], "source": [ "for model in models:\n", " print(model)\n", @@ -422,18 +712,49 @@ "collapsed": true }, "source": [ - "The DBOW model reveals similar singers in the U.S., while the DM model understands that many of Lady Gaga's songs contain the word \"Lady Gaga\".\n", + "The DBOW model reveals similar singers in the U.S., while the DM model seems to pay more attention to the word \"Gaga\" itself.\n", "\n", - "Finally, let's do some wilder artihmetics that embeddings are famous for. What are the entries most similar to \"Lady Gaga\" - \"American\" + \"Japanese\"? Note that \"American\" and \"Japanese\" are word vectors, but they live in the same space as the document vectors so we can add / subtract them for some interesting results. Note that all word vectors were already lowercased by our tokenizer above, so we look for the lowercased version here:" + "Finally, let's do some wilder arithmetic that vector embeddings are famous for. What are the entries most similar to \"Lady Gaga\" - \"American\" + \"Japanese\"?\n", + "\n", + "Note that \"American\" and \"Japanese\" are word vectors, but they live in the same space as the document vectors so we can add / subtract them at will, for some interesting results.
All word vectors were already lowercased by our tokenizer above, so we look for the lowercased version here:" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": { "scrolled": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Doc2Vec\n", + "[('Ayumi Hamasaki', 0.604461669921875),\n", + " ('2NE1', 0.5942890644073486),\n", + " ('Katy Perry', 0.5932046175003052),\n", + " ('Ariana Grande', 0.5865142941474915),\n", + " (\"Can't Stop the Disco\", 0.5778986215591431),\n", + " (\"Girls' Generation\", 0.5741134285926819),\n", + " ('We Are \"Lonely Girl\"', 0.5682086944580078),\n", + " ('Perfume (Japanese band)', 0.568188488483429),\n", + " ('H (Ayumi Hamasaki EP)', 0.5679325461387634),\n", + " ('Kyary Pamyu Pamyu', 0.5665541887283325)]\n", + "Doc2Vec\n", + "[('Kaela Kimura', 0.5528751015663147),\n", + " ('Chisato Moritaka', 0.551906943321228),\n", + " ('Suzuki Ami Around the World: Live House Tour 2005', 0.5428911447525024),\n", + " ('Pink Lady (duo)', 0.5385505557060242),\n", + " ('Artpop', 0.5361125469207764),\n", + " ('Kaede (dancer)', 0.535369873046875),\n", + " ('Miliyah Kato', 0.5336685180664062),\n", + " ('Liyuu', 0.5325193405151367),\n", + " ('Ai (singer)', 0.5272262692451477),\n", + " ('Momoiro Clover Z', 0.525260329246521)]\n" + ] + } + ], "source": [ "for model in models:\n", " print(model)\n", @@ -445,11 +766,62 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "As a result, the DBOW model surfaced artists similar to Lady Gaga in Japan, such as 'Perfume', who is the most famous idol in Japan.\n", + "As a result, the DBOW model surfaced artists similar to Lady Gaga in Japan, such as **Ayumi Hamasaki** whose Wiki bio says:\n", + "\n", + "> Ayumi Hamasaki is a Japanese singer, songwriter, record producer, actress, model, spokesperson, and entrepreneur.\n", + "\n", + "So that sounds like a success.\n", + "\n", + "Similarly, the DM model thought **Kaela Kimura** 
is the closest hit:\n", + "\n", + "> Kaela Kimura is a Japanese pop rock singer, lyricist, fashion model and television presenter.\n", "\n", - "On the other hand, results from the DM model don't include any Japanese artists in its top 10 most similar documents.\n", + "Also pretty good.\n", "\n", - "These results demonstrate that the DBOW training mode employed in the original paper is serviceable for calculating similarity between document vectors, word vectors, or a combination of both." + "These results demonstrate that both training modes employed in the original paper are outstanding for calculating similarity between document vectors, word vectors, or a combination of both. The DM mode has the added advantage of being 4x faster to train." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you wanted to continue working with these trained models, you could save them to disk, to avoid having to re-train the models from scratch every time:" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-03-18 19:08:34,623 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_dbow.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-03-18T19:08:34.622990', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'saving'}\n", + "2022-03-18 19:08:34,641 : INFO : storing np array 'vectors' to doc2vec_dbow.model.dv.vectors.npy\n", + "2022-03-18 19:08:40,244 : INFO : storing np array 'vectors' to doc2vec_dbow.model.wv.vectors.npy\n", + "2022-03-18 19:08:46,811 : INFO : storing np array 'syn1neg' to doc2vec_dbow.model.syn1neg.npy\n", + "2022-03-18 19:08:48,564 : INFO : not storing attribute cum_table\n", + "2022-03-18 19:08:56,097 : INFO : saved doc2vec_dbow.model\n", + "2022-03-18 
19:08:56,098 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_dm.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-03-18T19:08:56.098765', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'saving'}\n", + "2022-03-18 19:08:56,099 : INFO : storing np array 'vectors' to doc2vec_dm.model.dv.vectors.npy\n", + "2022-03-18 19:09:09,087 : INFO : storing np array 'vectors' to doc2vec_dm.model.wv.vectors.npy\n", + "2022-03-18 19:09:13,804 : INFO : storing np array 'syn1neg' to doc2vec_dm.model.syn1neg.npy\n", + "2022-03-18 19:09:16,101 : INFO : not storing attribute cum_table\n", + "2022-03-18 19:09:20,432 : INFO : saved doc2vec_dm.model\n" + ] + } + ], + "source": [ + "models[0].save('doc2vec_dbow.model')\n", + "models[1].save('doc2vec_dm.model')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To continue your doc2vec explorations, refer to the official API documentation in Gensim: https://radimrehurek.com/gensim/models/doc2vec.html" ] } ], diff --git a/docs/src/models/fasttext.rst b/docs/src/models/fasttext.rst index e65b43fd25..392e68f2fd 100644 --- a/docs/src/models/fasttext.rst +++ b/docs/src/models/fasttext.rst @@ -5,6 +5,6 @@ :synopsis: FastText model :members: :inherited-members: - :special-members: __getitem__ + :special-members: __getitem__, __contains__ :undoc-members: :show-inheritance: diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index b289223032..5f4d06e634 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1286,7 +1286,7 @@ def _log_epoch_progress( report = progress_queue.get() # blocks if workers too slow if report is None: # a thread reporting that it finished unfinished_worker_count -= 1 - logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) + logger.debug("worker 
thread finished; awaiting finish of %i more threads", unfinished_worker_count) continue examples, trained_words, raw_words = report job_tally += 1 From fe79fbfa7a2e98ded3cd9190e96d706218d38aa2 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sun, 20 Mar 2022 10:12:59 +0800 Subject: [PATCH 39/81] Reformat FT_CMD definition (#3300) Requested-in: https://github.com/RaRe-Technologies/gensim/pull/3264#pullrequestreview-894414178 Co-authored-by: Michael Penkov --- gensim/test/test_fasttext.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 8922ee0ac9..2ff7995e0c 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -45,8 +45,7 @@ BUCKET = 10000 FT_HOME = os.environ.get("FT_HOME") -FT_CMD = shutil.which("fasttext", path=FT_HOME) or \ - shutil.which("fasttext") +FT_CMD = shutil.which("fasttext", path=FT_HOME) or shutil.which("fasttext") new_sentences = [ From e844c90f3e55afc0bff62d2c5b611b18498437d4 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 20 Mar 2022 11:20:03 +0900 Subject: [PATCH 40/81] mark test_translate_gc as unconditionally xfail Seems to occasionally fail on both windows and darwin --- gensim/test/test_translation_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py index c725fc0139..0cb4682013 100644 --- a/gensim/test/test_translation_matrix.py +++ b/gensim/test/test_translation_matrix.py @@ -63,7 +63,7 @@ def test_translate_nn(self): self.assertTrue(item[1] in translated_words[item[0]]) @pytest.mark.xfail( - sys.platform == 'darwin', + True, reason='blinking test, can be related to ' ) def test_translate_gc(self): From c59a4129c3b2b55550a4f5aa0e97473e516767fd Mon Sep 17 00:00:00 2001 From: Lasse Hyyrynen Date: Sun, 20 Mar 2022 07:40:45 +0200 Subject: [PATCH 41/81] Respect encoding when reading binary keyed vectors Current implementation 
fails to read keyed vectors that have iso-8859-1 encoding in the words when encoded in binary format. An example of this type of file can be seen in the TurkuNLP Finnish embeddings: http://dl.turkunlp.org/finnish-embeddings/finnish_s24_skgram.bin This file is quite trivial to load by passing the encoding to the vector loading function. It is also logical that when the user calls KeyedVectors.load_word2vec_format(filename, binary=True, encoding='iso-8859-1'), the library would try to load the file assuming that the matrix is in binary format and the words are encoded using iso-8859-1 encoding. --- gensim/models/keyedvectors.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 674689afce..bf1d81ed80 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1836,7 +1836,7 @@ def _add_word_to_kv(kv, counts, word, weights, vocab_size): kv.set_vecattr(word, 'count', word_count) -def _add_bytes_to_kv(kv, counts, chunk, vocab_size, vector_size, datatype, unicode_errors): +def _add_bytes_to_kv(kv, counts, chunk, vocab_size, vector_size, datatype, unicode_errors, encoding): start = 0 processed_words = 0 bytes_per_vector = vector_size * dtype(REAL).itemsize @@ -1849,7 +1849,7 @@ def _add_bytes_to_kv(kv, counts, chunk, vocab_size, vector_size, datatype, unico if i_space == -1 or (len(chunk) - i_vector) < bytes_per_vector: break - word = chunk[start:i_space].decode("utf-8", errors=unicode_errors) + word = chunk[start:i_space].decode(encoding, errors=unicode_errors) # Some binary files are reported to have obsolete new line in the beginning of word, remove it word = word.lstrip('\n') vector = frombuffer(chunk, offset=i_vector, count=vector_size, dtype=REAL).astype(datatype) @@ -1860,7 +1860,7 @@ def _add_bytes_to_kv(kv, counts, chunk, vocab_size, vector_size, datatype, unico return processed_words, chunk[start:] -def _word2vec_read_binary(fin, kv, counts, 
vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size): +def _word2vec_read_binary(fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size, encoding="utf-8"): chunk = b'' tot_processed_words = 0 @@ -1868,7 +1868,7 @@ def _word2vec_read_binary(fin, kv, counts, vocab_size, vector_size, datatype, un new_chunk = fin.read(binary_chunk_size) chunk += new_chunk processed_words, chunk = _add_bytes_to_kv( - kv, counts, chunk, vocab_size, vector_size, datatype, unicode_errors) + kv, counts, chunk, vocab_size, vector_size, datatype, unicode_errors, encoding) tot_processed_words += processed_words if len(new_chunk) < binary_chunk_size: break @@ -1973,7 +1973,7 @@ def _load_word2vec_format( if binary: _word2vec_read_binary( - fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size, + fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size, encoding ) else: _word2vec_read_text(fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, encoding) From 7e866cd85e26505646f9770e9ca6e28cb5203f09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sun, 20 Mar 2022 09:40:49 +0100 Subject: [PATCH 42/81] explore different settings in doc2vec-wikipedia --- docs/notebooks/doc2vec-wikipedia.ipynb | 648 ++++++++----------------- gensim/models/doc2vec.py | 2 +- 2 files changed, 209 insertions(+), 441 deletions(-) diff --git a/docs/notebooks/doc2vec-wikipedia.ipynb b/docs/notebooks/doc2vec-wikipedia.ipynb index 66b48b9062..5e36b2b525 100644 --- a/docs/notebooks/doc2vec-wikipedia.ipynb +++ b/docs/notebooks/doc2vec-wikipedia.ipynb @@ -13,7 +13,7 @@ "source": [ "This notebook replicates the **Document Embedding with Paragraph Vectors** paper, http://arxiv.org/abs/1507.07998.\n", "\n", - "In that paper, the authors only showed results from the DBOW (\"distributed bag of words\") mode, trained on the English Wikipedia. 
Here we replicate this experiment using not only DBOW, but also the DM mode of the \"paragraph vector\" algorithm aka Doc2Vec." + "In that paper, the authors only showed results from the DBOW (\"distributed bag of words\") mode, trained on the English Wikipedia. Here we replicate this experiment using not only DBOW, but also the DM (\"distributed memory\") mode of the Paragraph Vector algorithm aka Doc2Vec." ] }, { @@ -65,20 +65,6 @@ "We'll preprocess each article at the same time, normalizing its text to lowercase, splitting into tokens, etc. Below I use a regexp tokenizer that simply looks for alphabetic sequences as tokens. But feel free to adapt the text preprocessing to your own domain. High quality preprocessing is often critical for the final pipeline accuracy – garbage in, garbage out!" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "wiki = WikiCorpus(\n", - " \"enwiki-latest-pages-articles.xml.bz2\", # path to the file you downloaded above\n", - " tokenizer_func=tokenize, # simple regexp; plug in your own tokenizer here\n", - " metadata=True, # also return the article titles and ids when parsing\n", - " dictionary={}, # don't start processing the data yet\n", - ")" - ] - }, { "cell_type": "code", "execution_count": 2, @@ -106,6 +92,13 @@ } ], "source": [ + "wiki = WikiCorpus(\n", + " \"enwiki-latest-pages-articles.xml.bz2\", # path to the file you downloaded above\n", + " tokenizer_func=tokenize, # simple regexp; plug in your own tokenizer here\n", + " metadata=True, # also return the article titles and ids when parsing\n", + " dictionary={}, # don't start processing the data yet\n", + ")\n", + "\n", "with smart_open.open(\"wiki.txt.gz\", \"w\", encoding='utf8') as fout:\n", " for article_no, (content, (page_id, title)) in enumerate(wiki.get_texts()):\n", " title = ' '.join(title.split())\n", @@ -125,7 +118,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, 
"metadata": {}, "outputs": [], "source": [ @@ -143,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -178,12 +171,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The original paper had a vocabulary size of 915,715 word types, so we'll try to match it by setting `max_final_vocab=915715` in the Doc2vec constructor." + "The original paper had a vocabulary size of 915,715 word types, so we'll try to match it by setting `max_final_vocab` to 1,000,000 in the Doc2vec constructor.\n", + "\n", + "Other critical parameters were left unspecified in the paper, so we'll go with a default window size of five (a prediction window of 5 tokens to either side), and downsampling of frequent words at 1e-5. It looks like the authors tried vector dimensionality of 100, 300, 1,000 & 10,000 in the paper (with 10k dims performing the best), but I'll only train with 300 dimensions here, to keep RAM in check on my laptop.\n", + "\n", + "Feel free to tinker with these values yourself if you like:" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "metadata": { "scrolled": false }, @@ -192,62 +189,68 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-03-17 23:12:37,360 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-17T23:12:37.360576', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n", - "2022-03-17 23:12:37,365 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-17T23:12:37.365118', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n" + "2022-03-19 19:45:13,743 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-19T19:45:13.743356', 
'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n", + "2022-03-19 19:45:13,745 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-19T19:45:13.745470', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n" ] } ], "source": [ - "cores = multiprocessing.cpu_count()\n", + "workers = multiprocessing.cpu_count() # train with 10 threads on my 10-core laptop\n", "\n", "models = [\n", - " # PV-DBOW \n", - " Doc2Vec(dm=0, dbow_words=1, vector_size=200, window=8, epochs=10, workers=cores, max_final_vocab=915715),\n", - " # PV-DM with average\n", - " Doc2Vec(dm=1, dm_mean=1, vector_size=200, window=8, epochs=10, workers=cores, max_final_vocab=915715),\n", + " # PV-DBOW: paragraph vector in distributed bag of words mode\n", + " Doc2Vec(\n", + " dm=0, dbow_words=1, # dbow_words=1 to train word vectors at the same time too, not only DBOW\n", + " vector_size=300, window=5, sample=1e-5, epochs=10, workers=workers, max_final_vocab=1000000,\n", + " ),\n", + " # PV-DM: paragraph vector in distributed memory mode\n", + " Doc2Vec(\n", + " dm=1, dm_concat=0, dm_mean=1, # use average of context word vectors to train DM\n", + " vector_size=300, window=5, sample=1e-5, epochs=10, workers=workers, max_final_vocab=1000000,\n", + " )\n", "]" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2022-03-17 23:14:38,521 : INFO : collecting all words and their counts\n", - "2022-03-17 23:14:38,529 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags\n", - "2022-03-17 23:16:33,505 : INFO : PROGRESS: at example #500000, processed 654950164 words (5696698/s), 3222179 word types, 500000 
tags\n", - "2022-03-17 23:17:41,900 : INFO : PROGRESS: at example #1000000, processed 1018611068 words (5317131/s), 4480366 word types, 1000000 tags\n", - "2022-03-17 23:18:36,271 : INFO : PROGRESS: at example #1500000, processed 1305140647 words (5269927/s), 5420104 word types, 1500000 tags\n", - "2022-03-17 23:19:23,908 : INFO : PROGRESS: at example #2000000, processed 1550245240 words (5145361/s), 6188355 word types, 2000000 tags\n", - "2022-03-17 23:20:10,242 : INFO : PROGRESS: at example #2500000, processed 1790661139 words (5188872/s), 6941128 word types, 2500000 tags\n", - "2022-03-17 23:20:56,600 : INFO : PROGRESS: at example #3000000, processed 2028261627 words (5125392/s), 7664997 word types, 3000000 tags\n", - "2022-03-17 23:21:41,918 : INFO : PROGRESS: at example #3500000, processed 2264063867 words (5203393/s), 8347719 word types, 3500000 tags\n", - "2022-03-17 23:22:25,048 : INFO : PROGRESS: at example #4000000, processed 2488354257 words (5200461/s), 8971529 word types, 4000000 tags\n", - "2022-03-17 23:23:07,487 : INFO : PROGRESS: at example #4500000, processed 2703313059 words (5065249/s), 9605666 word types, 4500000 tags\n", - "2022-03-17 23:23:50,776 : INFO : PROGRESS: at example #5000000, processed 2925111571 words (5123692/s), 10217554 word types, 5000000 tags\n", - "2022-03-17 23:24:19,393 : INFO : collected 10427023 word types and 5176019 unique tags from a corpus of 5176019 examples and 2996051328 words\n", - "2022-03-17 23:24:22,841 : INFO : Doc2Vec lifecycle event {'msg': 'max_final_vocab=915715 and min_count=5 resulted in calc_min_count=27, effective_min_count=27', 'datetime': '2022-03-17T23:24:22.841740', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-17 23:24:22,842 : INFO : Creating a fresh vocabulary\n", - "2022-03-17 23:24:26,131 : INFO : Doc2Vec lifecycle event {'msg': 
'effective_min_count=27 retains 894446 unique words (8.578153131531407%% of original 10427023, drops 9532577)', 'datetime': '2022-03-17T23:24:26.131147', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-17 23:24:26,131 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=27 leaves 2965917340 word corpus (98.99420988824929%% of original 2996051328, drops 30133988)', 'datetime': '2022-03-17T23:24:26.131643', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-17 23:24:28,513 : INFO : deleting the raw counts dictionary of 10427023 items\n", - "2022-03-17 23:24:28,581 : INFO : sample=0.001 downsamples 23 most-common words\n", - "2022-03-17 23:24:28,581 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 2412497836.8123784 word corpus (81.3%% of prior 2965917340)', 'datetime': '2022-03-17T23:24:28.581828', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-17 23:24:32,724 : INFO : estimated required memory for 894446 words and 200 dimensions: 7054355600 bytes\n", - "2022-03-17 23:24:32,725 : INFO : resetting layer weights\n", - "2022-03-17 23:24:37,804 : INFO : resetting layer weights\n" + "2022-03-19 19:45:22,230 : INFO : collecting all words and their counts\n", + "2022-03-19 19:45:22,237 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags\n", + "2022-03-19 19:47:16,714 : INFO : PROGRESS: at example #500000, processed 654950164 words (5721331 words/s), 3222179 word types, 500000 tags\n", + "2022-03-19 19:48:22,015 : INFO : PROGRESS: at example #1000000, processed 
1018611068 words (5569047 words/s), 4480366 word types, 1000000 tags\n", + "2022-03-19 19:49:14,916 : INFO : PROGRESS: at example #1500000, processed 1305140647 words (5416499 words/s), 5420104 word types, 1500000 tags\n", + "2022-03-19 19:50:00,348 : INFO : PROGRESS: at example #2000000, processed 1550245240 words (5395000 words/s), 6188355 word types, 2000000 tags\n", + "2022-03-19 19:50:45,274 : INFO : PROGRESS: at example #2500000, processed 1790661139 words (5351430 words/s), 6941128 word types, 2500000 tags\n", + "2022-03-19 19:51:30,706 : INFO : PROGRESS: at example #3000000, processed 2028261627 words (5229901 words/s), 7664997 word types, 3000000 tags\n", + "2022-03-19 19:52:15,667 : INFO : PROGRESS: at example #3500000, processed 2264063867 words (5244690 words/s), 8347719 word types, 3500000 tags\n", + "2022-03-19 19:52:58,112 : INFO : PROGRESS: at example #4000000, processed 2488354257 words (5284392 words/s), 8971529 word types, 4000000 tags\n", + "2022-03-19 19:53:40,217 : INFO : PROGRESS: at example #4500000, processed 2703313059 words (5105821 words/s), 9605666 word types, 4500000 tags\n", + "2022-03-19 19:54:22,431 : INFO : PROGRESS: at example #5000000, processed 2925111571 words (5254227 words/s), 10217554 word types, 5000000 tags\n", + "2022-03-19 19:54:51,211 : INFO : collected 10427023 word types and 5176019 unique tags from a corpus of 5176019 examples and 2996051328 words\n", + "2022-03-19 19:54:54,641 : INFO : Doc2Vec lifecycle event {'msg': 'max_final_vocab=1000000 and min_count=5 resulted in calc_min_count=23, effective_min_count=23', 'datetime': '2022-03-19T19:54:54.641858', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-19 19:54:54,642 : INFO : Creating a fresh vocabulary\n", + "2022-03-19 19:54:58,144 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 retains 991887 
unique words (9.51% of original 10427023, drops 9435136)', 'datetime': '2022-03-19T19:54:58.144360', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-19 19:54:58,144 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 leaves 2968296495 word corpus (99.07% of original 2996051328, drops 27754833)', 'datetime': '2022-03-19T19:54:58.144810', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-19 19:55:00,904 : INFO : deleting the raw counts dictionary of 10427023 items\n", + "2022-03-19 19:55:00,968 : INFO : sample=1e-05 downsamples 4155 most-common words\n", + "2022-03-19 19:55:00,969 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 1194754612.050565 word corpus (40.3%% of prior 2968296495)', 'datetime': '2022-03-19T19:55:00.969465', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-19 19:55:05,714 : INFO : estimated required memory for 991887 words and 300 dimensions: 10122898900 bytes\n", + "2022-03-19 19:55:05,714 : INFO : resetting layer weights\n", + "2022-03-19 19:55:14,899 : INFO : resetting layer weights\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "Doc2Vec\n" + "Doc2Vec\n", + "Doc2Vec\n" ] } ], @@ -255,7 +258,7 @@ "models[0].build_vocab(documents, progress_per=500000)\n", "print(models[0])\n", "\n", - "# Save some time by copying the vocabulary structures from the first model.\n", + "# Save some time by copying the vocabulary structures from the DBOW model to the DM model.\n", "# Both models are built on top of exactly the same data, so there's no 
need to repeat the vocab-building step.\n", "models[1].reset_from(models[0])\n", "print(models[1])" @@ -265,12 +268,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we’re ready to train Doc2Vec on the English Wikipedia. **Warning!** Training the DBOW model takes ~16 hours, and DM ~4 hours, on my 2021 laptop." + "Now we’re ready to train Doc2Vec on the entirety of the English Wikipedia. **Warning!** Training this DBOW model takes ~16 hours, and DM ~4 hours, on my 2021 laptop." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 7, "metadata": { "scrolled": true }, @@ -279,298 +282,60 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-03-17 23:29:03,317 : WARNING : Effective 'alpha' higher than previous training cycles\n", - "2022-03-17 23:29:03,320 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 10 workers on 894446 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=5 window=8 shrink_windows=True', 'datetime': '2022-03-17T23:29:03.320153', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", - "2022-03-17 23:29:04,361 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 379389 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-17 23:59:04,372 : INFO : EPOCH 1 - PROGRESS: at 17.95% examples, 429937 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 00:29:04,379 : INFO : EPOCH 1 - PROGRESS: at 55.55% examples, 437068 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 00:59:04,423 : INFO : EPOCH 1 - PROGRESS: at 98.13% examples, 439343 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 01:00:11,996 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 01:00:12,013 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 01:00:12,028 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - 
"2022-03-18 01:00:12,045 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 01:00:12,084 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 01:00:12,110 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 01:00:12,124 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 01:00:12,127 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 01:00:12,128 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 01:00:12,149 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 01:00:12,150 : INFO : EPOCH - 1 : training on 2996051328 raw words (2402988821 effective words) took 5468.8s, 439397 effective words/s\n", - "2022-03-18 01:00:13,169 : INFO : EPOCH 2 - PROGRESS: at 0.00% examples, 390039 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 01:30:13,186 : INFO : EPOCH 2 - PROGRESS: at 19.41% examples, 451763 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 02:00:13,162 : INFO : EPOCH 2 - PROGRESS: at 57.23% examples, 446954 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 02:30:05,143 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 02:30:05,151 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 02:30:05,152 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 02:30:05,162 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 02:30:05,206 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 02:30:05,229 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 02:30:05,232 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 02:30:05,244 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 
02:30:05,248 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 02:30:05,255 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 02:30:05,255 : INFO : EPOCH - 2 : training on 2996051328 raw words (2402947663 effective words) took 5393.0s, 445566 effective words/s\n", - "2022-03-18 02:30:06,266 : INFO : EPOCH 3 - PROGRESS: at 0.00% examples, 414962 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 03:00:06,339 : INFO : EPOCH 3 - PROGRESS: at 19.29% examples, 449902 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 03:30:06,314 : INFO : EPOCH 3 - PROGRESS: at 57.27% examples, 447187 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 03:59:56,898 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 03:59:56,905 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 03:59:56,908 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 03:59:56,919 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 03:59:56,982 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 03:59:56,989 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 03:59:57,008 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 03:59:57,020 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 03:59:57,025 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 03:59:57,034 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 03:59:57,034 : INFO : EPOCH - 3 : training on 2996051328 raw words (2402969567 effective words) took 5391.8s, 445672 effective words/s\n", - "2022-03-18 03:59:58,059 : INFO : EPOCH 4 - PROGRESS: at 0.00% examples, 400337 words/s, in_qsize 19, out_qsize 1\n", - "2022-03-18 04:29:58,091 : INFO : EPOCH 4 - 
PROGRESS: at 19.41% examples, 451678 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 04:59:58,167 : INFO : EPOCH 4 - PROGRESS: at 57.02% examples, 445731 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 05:29:58,234 : INFO : EPOCH 4 - PROGRESS: at 99.74% examples, 444166 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 05:30:07,257 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 05:30:07,259 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 05:30:07,262 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 05:30:07,296 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 05:30:07,321 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 05:30:07,327 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 05:30:07,337 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 05:30:07,360 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 05:30:07,363 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 05:30:07,395 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 05:30:07,395 : INFO : EPOCH - 4 : training on 2996051328 raw words (2402983106 effective words) took 5410.2s, 444155 effective words/s\n", - "2022-03-18 05:30:08,414 : INFO : EPOCH 5 - PROGRESS: at 0.00% examples, 407411 words/s, in_qsize 19, out_qsize 1\n", - "2022-03-18 06:00:08,451 : INFO : EPOCH 5 - PROGRESS: at 19.32% examples, 450435 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 06:30:08,480 : INFO : EPOCH 5 - PROGRESS: at 57.18% examples, 446664 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 07:00:08,481 : INFO : EPOCH 5 - PROGRESS: at 99.48% examples, 443405 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 07:00:27,046 : INFO : worker thread finished; 
awaiting finish of 9 more threads\n", - "2022-03-18 07:00:27,057 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 07:00:27,070 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 07:00:27,083 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 07:00:27,103 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 07:00:27,110 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 07:00:27,117 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 07:00:27,123 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 07:00:27,151 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 07:00:27,156 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 07:00:27,156 : INFO : EPOCH - 5 : training on 2996051328 raw words (2402973707 effective words) took 5419.7s, 443375 effective words/s\n", - "2022-03-18 07:00:28,166 : INFO : EPOCH 6 - PROGRESS: at 0.00% examples, 411123 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 07:30:28,233 : INFO : EPOCH 6 - PROGRESS: at 19.44% examples, 452181 words/s, in_qsize 18, out_qsize 1\n", - "2022-03-18 08:00:28,218 : INFO : EPOCH 6 - PROGRESS: at 57.42% examples, 447976 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 08:30:28,213 : INFO : EPOCH 6 - PROGRESS: at 99.29% examples, 442848 words/s, in_qsize 19, out_qsize 0\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-03-18 08:30:54,071 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 08:30:54,085 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 08:30:54,094 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 08:30:54,131 : INFO : worker thread finished; awaiting finish of 
6 more threads\n", - "2022-03-18 08:30:54,132 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 08:30:54,145 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 08:30:54,164 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 08:30:54,171 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 08:30:54,183 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 08:30:54,189 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 08:30:54,189 : INFO : EPOCH - 6 : training on 2996051328 raw words (2402970085 effective words) took 5427.0s, 442777 effective words/s\n", - "2022-03-18 08:30:55,193 : INFO : EPOCH 7 - PROGRESS: at 0.00% examples, 410013 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 09:00:55,267 : INFO : EPOCH 7 - PROGRESS: at 18.94% examples, 444759 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 09:30:55,268 : INFO : EPOCH 7 - PROGRESS: at 55.80% examples, 438741 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 10:00:55,370 : INFO : EPOCH 7 - PROGRESS: at 96.36% examples, 433564 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 10:03:28,340 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 10:03:28,355 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 10:03:28,356 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 10:03:28,376 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 10:03:28,384 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 10:03:28,418 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 10:03:28,419 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 10:03:28,428 : INFO : worker thread finished; awaiting 
finish of 2 more threads\n", - "2022-03-18 10:03:28,454 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 10:03:28,468 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 10:03:28,468 : INFO : EPOCH - 7 : training on 2996051328 raw words (2402959910 effective words) took 5554.2s, 432641 effective words/s\n", - "2022-03-18 10:03:29,479 : INFO : EPOCH 8 - PROGRESS: at 0.00% examples, 369271 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 10:33:29,480 : INFO : EPOCH 8 - PROGRESS: at 17.46% examples, 422524 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 11:03:29,445 : INFO : EPOCH 8 - PROGRESS: at 50.17% examples, 408225 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 11:33:29,516 : INFO : EPOCH 8 - PROGRESS: at 90.78% examples, 414606 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 11:39:44,147 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 11:39:44,166 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 11:39:44,171 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 11:39:44,186 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 11:39:44,215 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 11:39:44,226 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 11:39:44,231 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 11:39:44,256 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 11:39:44,262 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 11:39:44,268 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 11:39:44,268 : INFO : EPOCH - 8 : training on 2996051328 raw words (2402978662 effective words) took 5775.8s, 416044 effective words/s\n", - 
"2022-03-18 11:39:45,288 : INFO : EPOCH 9 - PROGRESS: at 0.00% examples, 394893 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 12:09:45,255 : INFO : EPOCH 9 - PROGRESS: at 19.03% examples, 446055 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 12:39:45,321 : INFO : EPOCH 9 - PROGRESS: at 56.60% examples, 443313 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 13:09:45,381 : INFO : EPOCH 9 - PROGRESS: at 97.33% examples, 436797 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 13:11:27,488 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 13:11:27,492 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 13:11:27,504 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 13:11:27,535 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 13:11:27,552 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 13:11:27,553 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 13:11:27,564 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 13:11:27,567 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 13:11:27,584 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 13:11:27,592 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 13:11:27,592 : INFO : EPOCH - 9 : training on 2996051328 raw words (2402988333 effective words) took 5503.3s, 436648 effective words/s\n", - "2022-03-18 13:11:28,615 : INFO : EPOCH 10 - PROGRESS: at 0.00% examples, 391894 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 13:41:28,611 : INFO : EPOCH 10 - PROGRESS: at 17.83% examples, 428194 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 14:11:28,622 : INFO : EPOCH 10 - PROGRESS: at 51.72% examples, 416555 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 
14:41:28,621 : INFO : EPOCH 10 - PROGRESS: at 91.60% examples, 417311 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-18 14:47:23,420 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 14:47:23,432 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 14:47:23,433 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 14:47:23,437 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 14:47:23,469 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 14:47:23,484 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 14:47:23,502 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 14:47:23,512 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 14:47:23,516 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 14:47:23,534 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 14:47:23,534 : INFO : EPOCH - 10 : training on 2996051328 raw words (2402969667 effective words) took 5755.9s, 417476 effective words/s\n", - "2022-03-18 14:47:23,536 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (24029729521 effective words) took 55099.9s, 436112 effective words/s', 'datetime': '2022-03-18T14:47:23.536569', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", - "2022-03-18 14:47:23,537 : WARNING : Effective 'alpha' higher than previous training cycles\n", - "2022-03-18 14:47:23,537 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 10 workers on 894446 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=8 shrink_windows=True', 'datetime': '2022-03-18T14:47:23.537351', 'gensim': 
'4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", - "2022-03-18 14:47:24,546 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 121520 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-18 15:12:45,307 : INFO : worker thread finished; awaiting finish of 9 more threads\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-03-18 15:12:45,325 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 15:12:45,326 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 15:12:45,327 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 15:12:45,327 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 15:12:45,332 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 15:12:45,338 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 15:12:45,346 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 15:12:45,348 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 15:12:45,354 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 15:12:45,355 : INFO : EPOCH - 1 : training on 2996051328 raw words (2402951760 effective words) took 1521.7s, 1579074 effective words/s\n", - "2022-03-18 15:12:46,373 : INFO : EPOCH 2 - PROGRESS: at 0.01% examples, 1835607 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-18 15:38:18,272 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 15:38:18,310 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 15:38:18,312 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 15:38:18,313 : INFO : worker thread finished; awaiting finish of 6 more 
threads\n", - "2022-03-18 15:38:18,319 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 15:38:18,320 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 15:38:18,325 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 15:38:18,336 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 15:38:18,338 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 15:38:18,339 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 15:38:18,339 : INFO : EPOCH - 2 : training on 2996051328 raw words (2402972271 effective words) took 1533.0s, 1567541 effective words/s\n", - "2022-03-18 15:38:19,355 : INFO : EPOCH 3 - PROGRESS: at 0.01% examples, 1940890 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-18 16:02:47,736 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 16:02:47,759 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 16:02:47,762 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 16:02:47,764 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 16:02:47,764 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 16:02:47,775 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 16:02:47,781 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 16:02:47,788 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 16:02:47,789 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 16:02:47,791 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 16:02:47,791 : INFO : EPOCH - 3 : training on 2996051328 raw words (2402988495 effective words) took 1469.4s, 1635360 effective 
words/s\n", - "2022-03-18 16:02:48,814 : INFO : EPOCH 4 - PROGRESS: at 0.01% examples, 2013560 words/s, in_qsize 0, out_qsize 2\n", - "2022-03-18 16:26:11,222 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 16:26:11,234 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 16:26:11,236 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 16:26:11,239 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 16:26:11,242 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 16:26:11,245 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 16:26:11,254 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 16:26:11,258 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 16:26:11,261 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 16:26:11,262 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 16:26:11,262 : INFO : EPOCH - 4 : training on 2996051328 raw words (2402958098 effective words) took 1403.4s, 1712179 effective words/s\n", - "2022-03-18 16:26:12,270 : INFO : EPOCH 5 - PROGRESS: at 0.01% examples, 2003817 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-18 16:50:15,159 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 16:50:15,175 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 16:50:15,176 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 16:50:15,177 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 16:50:15,183 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 16:50:15,186 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 
16:50:15,198 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 16:50:15,206 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 16:50:15,206 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 16:50:15,207 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 16:50:15,207 : INFO : EPOCH - 5 : training on 2996051328 raw words (2402956752 effective words) took 1443.9s, 1664163 effective words/s\n", - "2022-03-18 16:50:16,219 : INFO : EPOCH 6 - PROGRESS: at 0.01% examples, 1987406 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-18 17:13:45,624 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 17:13:45,632 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 17:13:45,635 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 17:13:45,636 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 17:13:45,637 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 17:13:45,640 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 17:13:45,651 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 17:13:45,657 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 17:13:45,663 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 17:13:45,665 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 17:13:45,665 : INFO : EPOCH - 6 : training on 2996051328 raw words (2402960350 effective words) took 1410.5s, 1703664 effective words/s\n", - "2022-03-18 17:13:46,675 : INFO : EPOCH 7 - PROGRESS: at 0.01% examples, 1985995 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-18 17:37:06,489 : INFO : worker thread finished; awaiting finish of 9 more 
threads\n", - "2022-03-18 17:37:06,523 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 17:37:06,524 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 17:37:06,524 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 17:37:06,525 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 17:37:06,529 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 17:37:06,537 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 17:37:06,543 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 17:37:06,545 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 17:37:06,546 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 17:37:06,546 : INFO : EPOCH - 7 : training on 2996051328 raw words (2402972246 effective words) took 1400.9s, 1715338 effective words/s\n", - "2022-03-18 17:37:07,560 : INFO : EPOCH 8 - PROGRESS: at 0.01% examples, 2069561 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-18 18:00:31,024 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 18:00:31,053 : INFO : worker thread finished; awaiting finish of 8 more threads\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-03-18 18:00:31,056 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 18:00:31,057 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 18:00:31,059 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 18:00:31,059 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 18:00:31,068 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 18:00:31,072 : INFO : worker thread finished; awaiting 
finish of 2 more threads\n", - "2022-03-18 18:00:31,075 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 18:00:31,076 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 18:00:31,076 : INFO : EPOCH - 8 : training on 2996051328 raw words (2402970402 effective words) took 1404.5s, 1710899 effective words/s\n", - "2022-03-18 18:00:32,091 : INFO : EPOCH 9 - PROGRESS: at 0.01% examples, 2063533 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-18 18:23:47,471 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 18:23:47,482 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 18:23:47,485 : INFO : worker thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 18:23:47,489 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 18:23:47,490 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 18:23:47,492 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 18:23:47,497 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 18:23:47,506 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 18:23:47,507 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 18:23:47,509 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 18:23:47,510 : INFO : EPOCH - 9 : training on 2996051328 raw words (2402978209 effective words) took 1396.4s, 1720847 effective words/s\n", - "2022-03-18 18:23:48,527 : INFO : EPOCH 10 - PROGRESS: at 0.01% examples, 1857646 words/s, in_qsize 11, out_qsize 0\n", - "2022-03-18 18:47:53,356 : INFO : worker thread finished; awaiting finish of 9 more threads\n", - "2022-03-18 18:47:53,364 : INFO : worker thread finished; awaiting finish of 8 more threads\n", - "2022-03-18 18:47:53,368 : INFO : worker 
thread finished; awaiting finish of 7 more threads\n", - "2022-03-18 18:47:53,370 : INFO : worker thread finished; awaiting finish of 6 more threads\n", - "2022-03-18 18:47:53,370 : INFO : worker thread finished; awaiting finish of 5 more threads\n", - "2022-03-18 18:47:53,372 : INFO : worker thread finished; awaiting finish of 4 more threads\n", - "2022-03-18 18:47:53,377 : INFO : worker thread finished; awaiting finish of 3 more threads\n", - "2022-03-18 18:47:53,383 : INFO : worker thread finished; awaiting finish of 2 more threads\n", - "2022-03-18 18:47:53,385 : INFO : worker thread finished; awaiting finish of 1 more threads\n", - "2022-03-18 18:47:53,389 : INFO : worker thread finished; awaiting finish of 0 more threads\n", - "2022-03-18 18:47:53,390 : INFO : EPOCH - 10 : training on 2996051328 raw words (2402975872 effective words) took 1445.9s, 1661955 effective words/s\n", - "2022-03-18 18:47:53,391 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (24029684455 effective words) took 14429.7s, 1665293 effective words/s', 'datetime': '2022-03-18T18:47:53.391169', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n" + "2022-03-19 19:55:38,660 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 9 workers on 991887 vocabulary and 300 features, using sg=1 hs=0 sample=1e-05 negative=5 window=5 shrink_windows=True', 'datetime': '2022-03-19T19:55:38.660218', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", + "2022-03-19 19:55:39,851 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 5854 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-19 20:25:39,805 : INFO : EPOCH 1 - PROGRESS: at 48.25% examples, 393044 words/s, in_qsize 18, out_qsize 0\n", + "2022-03-19 20:48:10,761 : INFO : 
EPOCH - 1 : training on 2996051328 raw words (1198675030 effective words) took 3152.1s, 380279 effective words/s\n", + "2022-03-19 20:48:11,793 : INFO : EPOCH 2 - PROGRESS: at 0.00% examples, 367086 words/s, in_qsize 16, out_qsize 0\n", + "2022-03-19 21:18:11,834 : INFO : EPOCH 2 - PROGRESS: at 46.01% examples, 380787 words/s, in_qsize 18, out_qsize 0\n", + "2022-03-19 21:41:41,740 : INFO : EPOCH - 2 : training on 2996051328 raw words (1198715463 effective words) took 3210.9s, 373328 effective words/s\n", + "2022-03-19 21:41:42,801 : INFO : EPOCH 3 - PROGRESS: at 0.00% examples, 371377 words/s, in_qsize 17, out_qsize 0\n", + "2022-03-19 22:11:42,858 : INFO : EPOCH 3 - PROGRESS: at 47.08% examples, 386416 words/s, in_qsize 18, out_qsize 1\n", + "2022-03-19 22:34:07,773 : INFO : EPOCH - 3 : training on 2996051328 raw words (1198709324 effective words) took 3145.9s, 381038 effective words/s\n", + "2022-03-19 22:34:08,813 : INFO : EPOCH 4 - PROGRESS: at 0.00% examples, 391762 words/s, in_qsize 18, out_qsize 0\n", + "2022-03-19 23:04:08,869 : INFO : EPOCH 4 - PROGRESS: at 47.18% examples, 387058 words/s, in_qsize 18, out_qsize 0\n", + "2022-03-19 23:27:17,510 : INFO : EPOCH - 4 : training on 2996051328 raw words (1198691470 effective words) took 3189.6s, 375807 effective words/s\n", + "2022-03-19 23:27:18,566 : INFO : EPOCH 5 - PROGRESS: at 0.00% examples, 341762 words/s, in_qsize 18, out_qsize 0\n", + "2022-03-19 23:57:18,560 : INFO : EPOCH 5 - PROGRESS: at 49.49% examples, 400066 words/s, in_qsize 17, out_qsize 0\n", + "2022-03-20 00:18:39,692 : INFO : EPOCH - 5 : training on 2996051328 raw words (1198651978 effective words) took 3082.1s, 388902 effective words/s\n", + "2022-03-20 00:18:40,744 : INFO : EPOCH 6 - PROGRESS: at 0.00% examples, 389656 words/s, in_qsize 17, out_qsize 1\n", + "2022-03-20 00:48:40,746 : INFO : EPOCH 6 - PROGRESS: at 48.94% examples, 396931 words/s, in_qsize 18, out_qsize 0\n", + "2022-03-20 01:10:12,092 : INFO : EPOCH - 6 : training on 
2996051328 raw words (1198683303 effective words) took 3092.4s, 387624 effective words/s\n", + "2022-03-20 01:10:13,131 : INFO : EPOCH 7 - PROGRESS: at 0.00% examples, 380610 words/s, in_qsize 17, out_qsize 0\n", + "2022-03-20 01:40:13,225 : INFO : EPOCH 7 - PROGRESS: at 48.99% examples, 397202 words/s, in_qsize 18, out_qsize 0\n", + "2022-03-20 02:01:49,460 : INFO : EPOCH - 7 : training on 2996051328 raw words (1198700562 effective words) took 3097.3s, 387016 effective words/s\n", + "2022-03-20 02:01:50,477 : INFO : EPOCH 8 - PROGRESS: at 0.00% examples, 379974 words/s, in_qsize 17, out_qsize 0\n", + "2022-03-20 02:31:50,541 : INFO : EPOCH 8 - PROGRESS: at 49.01% examples, 397386 words/s, in_qsize 17, out_qsize 0\n", + "2022-03-20 02:53:22,480 : INFO : EPOCH - 8 : training on 2996051328 raw words (1198739172 effective words) took 3093.0s, 387567 effective words/s\n", + "2022-03-20 02:53:23,526 : INFO : EPOCH 9 - PROGRESS: at 0.00% examples, 403121 words/s, in_qsize 18, out_qsize 0\n", + "2022-03-20 03:23:23,535 : INFO : EPOCH 9 - PROGRESS: at 49.86% examples, 402052 words/s, in_qsize 17, out_qsize 0\n", + "2022-03-20 03:44:21,076 : INFO : EPOCH - 9 : training on 2996051328 raw words (1198699759 effective words) took 3058.5s, 391921 effective words/s\n", + "2022-03-20 03:44:22,093 : INFO : EPOCH 10 - PROGRESS: at 0.00% examples, 363235 words/s, in_qsize 17, out_qsize 0\n", + "2022-03-20 04:14:22,127 : INFO : EPOCH 10 - PROGRESS: at 47.77% examples, 390356 words/s, in_qsize 18, out_qsize 0\n", + "2022-03-20 04:37:01,731 : INFO : EPOCH - 10 : training on 2996051328 raw words (1198674743 effective words) took 3160.6s, 379258 effective words/s\n", + "2022-03-20 04:37:01,758 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (11986940804 effective words) took 31282.8s, 383179 effective words/s', 'datetime': '2022-03-20T04:37:01.757530', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 
(clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", + "2022-03-20 04:37:01,760 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 9 workers on 991887 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=5 window=5 shrink_windows=True', 'datetime': '2022-03-20T04:37:01.760198', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", + "2022-03-20 04:37:02,806 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 51780 words/s, in_qsize 17, out_qsize 0\n", + "2022-03-20 05:01:24,379 : INFO : EPOCH - 1 : training on 2996051328 raw words (1198699065 effective words) took 1462.6s, 819587 effective words/s\n", + "2022-03-20 05:01:25,419 : INFO : EPOCH 2 - PROGRESS: at 0.01% examples, 1026623 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 05:25:51,408 : INFO : EPOCH - 2 : training on 2996051328 raw words (1198713530 effective words) took 1467.0s, 817102 effective words/s\n", + "2022-03-20 05:25:52,466 : INFO : EPOCH 3 - PROGRESS: at 0.01% examples, 1053617 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 05:50:03,250 : INFO : EPOCH - 3 : training on 2996051328 raw words (1198725674 effective words) took 1451.7s, 825713 effective words/s\n", + "2022-03-20 05:50:04,303 : INFO : EPOCH 4 - PROGRESS: at 0.01% examples, 994295 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 06:14:21,091 : INFO : EPOCH - 4 : training on 2996051328 raw words (1198700285 effective words) took 1457.8s, 822282 effective words/s\n", + "2022-03-20 06:14:22,184 : INFO : EPOCH 5 - PROGRESS: at 0.01% examples, 1051640 words/s, in_qsize 0, out_qsize 1\n", + "2022-03-20 06:38:35,360 : INFO : EPOCH - 5 : training on 2996051328 raw words (1198682356 effective words) took 1454.2s, 824283 effective words/s\n", + "2022-03-20 06:38:36,422 : INFO : EPOCH 6 - PROGRESS: at 0.01% examples, 1055533 words/s, in_qsize 0, out_qsize 
1\n", + "2022-03-20 07:02:50,449 : INFO : EPOCH - 6 : training on 2996051328 raw words (1198672468 effective words) took 1455.0s, 823803 effective words/s\n", + "2022-03-20 07:02:51,496 : INFO : EPOCH 7 - PROGRESS: at 0.01% examples, 1052539 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 07:27:17,109 : INFO : EPOCH - 7 : training on 2996051328 raw words (1198696667 effective words) took 1466.6s, 817342 effective words/s\n", + "2022-03-20 07:27:18,148 : INFO : EPOCH 8 - PROGRESS: at 0.01% examples, 1047255 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 07:51:29,674 : INFO : EPOCH - 8 : training on 2996051328 raw words (1198691941 effective words) took 1452.5s, 825260 effective words/s\n", + "2022-03-20 07:51:30,808 : INFO : EPOCH 9 - PROGRESS: at 0.01% examples, 915921 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 08:15:44,007 : INFO : EPOCH - 9 : training on 2996051328 raw words (1198707402 effective words) took 1454.2s, 824316 effective words/s\n", + "2022-03-20 08:15:45,041 : INFO : EPOCH 10 - PROGRESS: at 0.01% examples, 1038693 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 08:39:48,080 : INFO : EPOCH - 10 : training on 2996051328 raw words (1198683355 effective words) took 1444.0s, 830092 effective words/s\n", + "2022-03-20 08:39:48,155 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (11986972743 effective words) took 14566.3s, 822924 effective words/s', 'datetime': '2022-03-20T08:39:48.155821', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n" ] } ], @@ -597,55 +362,55 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "[('Supervised learning', 0.7626678943634033),\n", - " ('Pattern recognition', 0.7443839907646179),\n", - " ('Artificial neural network', 
0.7443667650222778),\n", - " ('Boosting (machine learning)', 0.7209591865539551),\n", - " ('Deep learning', 0.7030681371688843),\n", - " ('Linear classifier', 0.6918482184410095),\n", - " ('Feature selection', 0.6885010600090027),\n", - " ('Knowledge retrieval', 0.6797034740447998),\n", - " ('Convolutional neural network', 0.6789148449897766),\n", - " ('Outline of computer science', 0.6732515096664429),\n", - " ('Training, validation, and test sets', 0.6729527711868286),\n", - " ('Support-vector machine', 0.6719434857368469),\n", - " ('Learning classifier system', 0.6716565489768982),\n", - " ('Outline of machine learning', 0.6692107915878296),\n", - " ('Bayesian network', 0.6654112935066223),\n", - " ('Manifold regularization', 0.6635575294494629),\n", - " ('Multi-task learning', 0.6624512672424316),\n", - " ('Fuzzy logic', 0.6605969667434692),\n", - " ('Computer mathematics', 0.6600310206413269),\n", - " ('Recurrent neural network', 0.6571199893951416)]\n", - "Doc2Vec\n", - "[('Pattern recognition', 0.731984555721283),\n", - " ('Supervised learning', 0.7107947468757629),\n", - " ('Multi-task learning', 0.6985798478126526),\n", - " ('Semi-supervised learning', 0.6792073249816895),\n", - " ('Meta learning (computer science)', 0.6784282922744751),\n", - " ('Support-vector machine', 0.6740356683731079),\n", - " ('Feature selection', 0.6702772378921509),\n", - " ('Statistical learning theory', 0.6683863997459412),\n", - " ('Automatic image annotation', 0.661750078201294),\n", - " ('Deep learning', 0.6617218255996704),\n", - " ('Linear classifier', 0.6573296189308167),\n", - " ('Statistical classification', 0.654957115650177),\n", - " ('Regularization (mathematics)', 0.6517974138259888),\n", - " ('Data analysis techniques for fraud detection', 0.6505621671676636),\n", - " ('Artificial neural network', 0.6478281021118164),\n", - " ('Boosting (machine learning)', 0.6463974714279175),\n", - " ('Naive Bayes classifier', 0.6442222595214844),\n", - " ('Autoencoder', 
0.6438822746276855),\n", - " ('Predictive Model Markup Language', 0.6405109763145447),\n", - " ('Perceptron', 0.6379765868186951)]\n" + "Doc2Vec\n", + "[('Deep learning', 0.6759864687919617),\n", + " ('Pattern recognition', 0.6742060780525208),\n", + " ('Supervised learning', 0.6707901358604431),\n", + " ('Artificial neural network', 0.6665838956832886),\n", + " ('Semi-supervised learning', 0.654625654220581),\n", + " ('Outline of machine learning', 0.6472466588020325),\n", + " ('Multi-task learning', 0.646246075630188),\n", + " ('Boosting (machine learning)', 0.641443133354187),\n", + " ('Neural network', 0.637937605381012),\n", + " ('Types of artificial neural networks', 0.6377928853034973),\n", + " ('Incremental learning', 0.637475848197937),\n", + " ('Perceptron', 0.6358065605163574),\n", + " ('Rule induction', 0.6326183676719666),\n", + " ('Early stopping', 0.6307427883148193),\n", + " ('Multilayer perceptron', 0.6273738741874695),\n", + " ('Connectionist expert system', 0.6203325390815735),\n", + " ('Neural Designer', 0.6193098425865173),\n", + " ('Meta learning (computer science)', 0.6182209253311157),\n", + " ('Feature (machine learning)', 0.6175855398178101),\n", + " ('Domain adaptation', 0.6154839396476746)]\n", + "Doc2Vec\n", + "[('Pattern recognition', 0.7132866382598877),\n", + " ('Deep learning', 0.66518634557724),\n", + " ('Supervised learning', 0.6553921699523926),\n", + " ('Artificial neural network', 0.6497268080711365),\n", + " ('Semi-supervised learning', 0.6446605324745178),\n", + " ('Statistical learning theory', 0.6335287094116211),\n", + " ('Ensemble learning', 0.6282770037651062),\n", + " ('Prior knowledge for pattern recognition', 0.6238192319869995),\n", + " ('Boosting (machine learning)', 0.6212880611419678),\n", + " ('Early stopping', 0.6177847981452942),\n", + " ('Statistical classification', 0.6156905293464661),\n", + " ('Autoencoder', 0.6140605211257935),\n", + " ('Predictive Model Markup Language', 0.6070867776870728),\n", + " 
('Multi-task learning', 0.606650710105896),\n", + " ('Similarity learning', 0.6065689921379089),\n", + " ('Feature learning', 0.5982990860939026),\n", + " ('Automatic image annotation', 0.5949181914329529),\n", + " ('Linear classifier', 0.5930992960929871),\n", + " ('Support-vector machine', 0.5929517149925232),\n", + " ('Active learning (machine learning)', 0.5914105176925659)]\n" ] } ], @@ -659,14 +424,17 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Both results seem similar, but note the DM model took 4x less time train (training 4x faster).\n", + "Both results seem similar and match the results from the paper's Table 1, although not exactly. This is because we don't know the exact parameters used in the paper, and Wikipedia itself has changed since then. FIXME\n", + "\n", + "\n", + "Also note that the DM model took roughly 2x less time to train than DBOW (i.e. it trained about 2x faster).\n", "\n", "Second, let's calculate the most similar Wikipedia entries to \"Lady Gaga\" using Paragraph Vector:" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 9, "metadata": { "scrolled": false }, @@ -675,28 +443,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "[('Ariana Grande', 0.755739688873291),\n", - " ('Katy Perry', 0.7534462213516235),\n", - " ('Miley Cyrus', 0.7091007828712463),\n", - " ('Adele', 0.6958011984825134),\n", - " ('Demi Lovato', 0.6867919564247131),\n", - " ('Nicki Minaj', 0.6783465147018433),\n", - " ('Taylor Swift', 0.6691418886184692),\n", - " ('Adam Lambert', 0.6638894081115723),\n", - " ('Rihanna', 0.6437391638755798),\n", - " ('Kesha', 0.6433634161949158)]\n", - "Doc2Vec\n", - "[('Born This Way (album)', 0.6649508476257324),\n", - " ('Artpop', 0.6616811752319336),\n", - " ('Lady Gaga videography', 0.6363328695297241),\n", - " ('Katy Perry', 0.6322777271270752),\n", - " ('Beautiful, Dirty, Rich', 0.6277879476547241),\n", - " ('Lady Gaga discography', 0.60688316822052),\n", - " ('Applause (Lady Gaga song)', 0.6062529683113098),\n", - " ('List of Lady Gaga live
performances', 0.5975069403648376),\n", - " ('Born This Way (song)', 0.5948888659477234),\n", - " ('Madonna', 0.5918263792991638)]\n" + "Doc2Vec\n", + "[('Katy Perry', 0.6756664514541626),\n", + " ('Ariana Grande', 0.6528252363204956),\n", + " ('Miley Cyrus', 0.6260033845901489),\n", + " ('Taylor Swift', 0.6144734621047974),\n", + " ('Britney Spears', 0.6105279922485352),\n", + " ('List of awards and nominations received by Lady Gaga', 0.6099144220352173),\n", + " ('Madonna', 0.5910527110099792),\n", + " ('Christina Aguilera', 0.5906776785850525),\n", + " ('Beyoncé', 0.5887077450752258),\n", + " ('Demi Lovato', 0.5824941992759705)]\n", + "Doc2Vec\n", + "[('Taylor Swift', 0.5584290623664856),\n", + " ('Joanne (album)', 0.5531442761421204),\n", + " ('List of Lady Gaga live performances', 0.546563982963562),\n", + " ('Katy Perry', 0.5364790558815002),\n", + " ('Cynthia Germanotta', 0.5363910794258118),\n", + " ('Artpop', 0.5359800457954407),\n", + " ('Natali Germanotta', 0.5320610404014587),\n", + " ('Beautiful, Dirty, Rich', 0.5298762917518616),\n", + " ('Joanne Trattoria Cookbook', 0.5219755172729492),\n", + " ('The Fame', 0.5171768069267273)]\n" ] } ], @@ -712,16 +480,16 @@ "collapsed": true }, "source": [ - "The DBOW model reveals similar singers in the U.S., while the DM model seems to pay more attention to the word \"Gaga\" itself.\n", + "The DBOW results are in line with what the paper shows in Table 2a), revealing similar singers in the U.S.\n", "\n", - "Finally, let's do some wilder artihmetics that vectors embeddings are famous for. What are the entries most similar to \"Lady Gaga\" - \"American\" + \"Japanese\"?\n", + "Finally, let's do some of the wilder arithmetic that vector embeddings are famous for. What are the entries most similar to \"Lady Gaga\" - \"American\" + \"Japanese\"?
Table 2b) in the paper.\n", "\n", "Note that \"American\" and \"Japanese\" are word vectors, but they live in the same space as the document vectors so we can add / subtract them at will, for some interesting results. All word vectors were already lowercased by our tokenizer above, so we look for the lowercased version here:" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 10, "metadata": { "scrolled": false }, @@ -730,28 +498,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "[('Ayumi Hamasaki', 0.604461669921875),\n", - " ('2NE1', 0.5942890644073486),\n", - " ('Katy Perry', 0.5932046175003052),\n", - " ('Ariana Grande', 0.5865142941474915),\n", - " (\"Can't Stop the Disco\", 0.5778986215591431),\n", - " (\"Girls' Generation\", 0.5741134285926819),\n", - " ('We Are \"Lonely Girl\"', 0.5682086944580078),\n", - " ('Perfume (Japanese band)', 0.568188488483429),\n", - " ('H (Ayumi Hamasaki EP)', 0.5679325461387634),\n", - " ('Kyary Pamyu Pamyu', 0.5665541887283325)]\n", - "Doc2Vec\n", - "[('Kaela Kimura', 0.5528751015663147),\n", - " ('Chisato Moritaka', 0.551906943321228),\n", - " ('Suzuki Ami Around the World: Live House Tour 2005', 0.5428911447525024),\n", - " ('Pink Lady (duo)', 0.5385505557060242),\n", - " ('Artpop', 0.5361125469207764),\n", - " ('Kaede (dancer)', 0.535369873046875),\n", - " ('Miliyah Kato', 0.5336685180664062),\n", - " ('Liyuu', 0.5325193405151367),\n", - " ('Ai (singer)', 0.5272262692451477),\n", - " ('Momoiro Clover Z', 0.525260329246521)]\n" + "Doc2Vec\n", + "[('Katy Perry', 0.5954936146736145),\n", + " ('Ariana Grande', 0.5549463033676147),\n", + " ('Thank You, Love (Kana Nishino album)', 0.5462373495101929),\n", + " ('Kōsui (Eito song)', 0.5363353490829468),\n", + " ('Crazy Crazy / Harajuku Iyahoi', 0.5308663249015808),\n", + " ('Big Boys Cry/Beautiful', 0.5303178429603577),\n", + " ('23rd Monster', 0.5298969745635986),\n", + " ('Koi (song)', 0.5275850296020508),\n", + " ('X -Cross-', 
0.5255367755889893),\n", + " ('Suzume (song)', 0.525151252746582)]\n", + "Doc2Vec\n", + "[('Joanne (album)', 0.5087171196937561),\n", + " ('Artpop', 0.49931594729423523),\n", + " ('The Cure (song)', 0.4835745394229889),\n", + " ('Natali Germanotta', 0.47394222021102905),\n", + " ('Chisato Moritaka', 0.47318926453590393),\n", + " ('Ayumi Hamasaki', 0.47156259417533875),\n", + " ('List of Lady Gaga live performances', 0.4629444479942322),\n", + " ('Joanne Trattoria Cookbook', 0.4615497887134552),\n", + " ('Katy Perry', 0.46151337027549744),\n", + " ('Blackpink', 0.45603010058403015)]\n" ] } ], @@ -770,7 +538,7 @@ "\n", "> Ayumi Hamasaki is a Japanese singer, songwriter, record producer, actress, model, spokesperson, and entrepreneur.\n", "\n", - "So that sounds like a success.\n", + "So that sounds like a success. It's also the nr. 1 hit in the paper we're replicating.\n", "\n", "Similarly, the DM model thought **Kaela Kimura** is the closest hit:\n", "\n", diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 40fd6c0053..20a739f64a 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -221,7 +221,7 @@ def __init__( More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that other values may perform better for recommendation applications. dm_mean : {1,0}, optional - If 0 , use the sum of the context word vectors. If 1, use the mean. + If 0, use the sum of the context word vectors. If 1, use the mean. Only applies when `dm` is used in non-concatenative mode. 
dm_concat : {1,0}, optional If 1, use concatenation of context vectors rather than sum/average; From 010a7ac745896e7b4607f1ff2b4507b835c3ddf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sun, 20 Mar 2022 20:11:39 +0100 Subject: [PATCH 43/81] doc2vec-wikipedia: dim=100 window=8 --- docs/notebooks/doc2vec-wikipedia.ipynb | 371 ++++++++++++------------- 1 file changed, 184 insertions(+), 187 deletions(-) diff --git a/docs/notebooks/doc2vec-wikipedia.ipynb b/docs/notebooks/doc2vec-wikipedia.ipynb index 5e36b2b525..9bcf0ad56f 100644 --- a/docs/notebooks/doc2vec-wikipedia.ipynb +++ b/docs/notebooks/doc2vec-wikipedia.ipynb @@ -189,8 +189,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-03-19 19:45:13,743 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-19T19:45:13.743356', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n", - "2022-03-19 19:45:13,745 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-19T19:45:13.745470', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n" + "2022-03-20 09:48:28,259 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-20T09:48:28.259891', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n", + "2022-03-20 09:48:28,262 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-20T09:48:28.262943', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n" ] } ], @@ -201,12 +201,12 @@ " # PV-DBOW: 
paragraph vector in distributed bag of words mode\n", " Doc2Vec(\n", " dm=0, dbow_words=1, # dbow_words=1 to train word vectors at the same time too, not only DBOW\n", - " vector_size=300, window=5, sample=1e-5, epochs=10, workers=workers, max_final_vocab=1000000,\n", + " vector_size=100, window=8, sample=1e-5, epochs=10, workers=workers, max_final_vocab=1000000,\n", " ),\n", " # PV-DM: paragraph vector in distributed memory mode\n", " Doc2Vec(\n", " dm=1, dm_concat=0, dm_mean=1, # use average of context word vectors to train DM\n", - " vector_size=300, window=5, sample=1e-5, epochs=10, workers=workers, max_final_vocab=1000000,\n", + " vector_size=100, window=8, sample=1e-5, epochs=10, workers=workers, max_final_vocab=1000000,\n", " )\n", "]" ] @@ -220,37 +220,37 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-03-19 19:45:22,230 : INFO : collecting all words and their counts\n", - "2022-03-19 19:45:22,237 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags\n", - "2022-03-19 19:47:16,714 : INFO : PROGRESS: at example #500000, processed 654950164 words (5721331 words/s), 3222179 word types, 500000 tags\n", - "2022-03-19 19:48:22,015 : INFO : PROGRESS: at example #1000000, processed 1018611068 words (5569047 words/s), 4480366 word types, 1000000 tags\n", - "2022-03-19 19:49:14,916 : INFO : PROGRESS: at example #1500000, processed 1305140647 words (5416499 words/s), 5420104 word types, 1500000 tags\n", - "2022-03-19 19:50:00,348 : INFO : PROGRESS: at example #2000000, processed 1550245240 words (5395000 words/s), 6188355 word types, 2000000 tags\n", - "2022-03-19 19:50:45,274 : INFO : PROGRESS: at example #2500000, processed 1790661139 words (5351430 words/s), 6941128 word types, 2500000 tags\n", - "2022-03-19 19:51:30,706 : INFO : PROGRESS: at example #3000000, processed 2028261627 words (5229901 words/s), 7664997 word types, 3000000 tags\n", - "2022-03-19 19:52:15,667 : INFO : PROGRESS: at example #3500000, processed 
2264063867 words (5244690 words/s), 8347719 word types, 3500000 tags\n", - "2022-03-19 19:52:58,112 : INFO : PROGRESS: at example #4000000, processed 2488354257 words (5284392 words/s), 8971529 word types, 4000000 tags\n", - "2022-03-19 19:53:40,217 : INFO : PROGRESS: at example #4500000, processed 2703313059 words (5105821 words/s), 9605666 word types, 4500000 tags\n", - "2022-03-19 19:54:22,431 : INFO : PROGRESS: at example #5000000, processed 2925111571 words (5254227 words/s), 10217554 word types, 5000000 tags\n", - "2022-03-19 19:54:51,211 : INFO : collected 10427023 word types and 5176019 unique tags from a corpus of 5176019 examples and 2996051328 words\n", - "2022-03-19 19:54:54,641 : INFO : Doc2Vec lifecycle event {'msg': 'max_final_vocab=1000000 and min_count=5 resulted in calc_min_count=23, effective_min_count=23', 'datetime': '2022-03-19T19:54:54.641858', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-19 19:54:54,642 : INFO : Creating a fresh vocabulary\n", - "2022-03-19 19:54:58,144 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 retains 991887 unique words (9.51% of original 10427023, drops 9435136)', 'datetime': '2022-03-19T19:54:58.144360', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-19 19:54:58,144 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 leaves 2968296495 word corpus (99.07% of original 2996051328, drops 27754833)', 'datetime': '2022-03-19T19:54:58.144810', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-19 19:55:00,904 : INFO : deleting the raw counts 
dictionary of 10427023 items\n", - "2022-03-19 19:55:00,968 : INFO : sample=1e-05 downsamples 4155 most-common words\n", - "2022-03-19 19:55:00,969 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 1194754612.050565 word corpus (40.3%% of prior 2968296495)', 'datetime': '2022-03-19T19:55:00.969465', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-19 19:55:05,714 : INFO : estimated required memory for 991887 words and 300 dimensions: 10122898900 bytes\n", - "2022-03-19 19:55:05,714 : INFO : resetting layer weights\n", - "2022-03-19 19:55:14,899 : INFO : resetting layer weights\n" + "2022-03-20 09:48:31,797 : INFO : collecting all words and their counts\n", + "2022-03-20 09:48:31,803 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags\n", + "2022-03-20 09:50:27,549 : INFO : PROGRESS: at example #500000, processed 654950164 words (5658546 words/s), 3222179 word types, 500000 tags\n", + "2022-03-20 09:51:35,208 : INFO : PROGRESS: at example #1000000, processed 1018611068 words (5374966 words/s), 4480366 word types, 1000000 tags\n", + "2022-03-20 09:52:28,939 : INFO : PROGRESS: at example #1500000, processed 1305140647 words (5332757 words/s), 5420104 word types, 1500000 tags\n", + "2022-03-20 09:53:17,522 : INFO : PROGRESS: at example #2000000, processed 1550245240 words (5045171 words/s), 6188355 word types, 2000000 tags\n", + "2022-03-20 09:54:05,142 : INFO : PROGRESS: at example #2500000, processed 1790661139 words (5048717 words/s), 6941128 word types, 2500000 tags\n", + "2022-03-20 09:54:53,539 : INFO : PROGRESS: at example #3000000, processed 2028261627 words (4909433 words/s), 7664997 word types, 3000000 tags\n", + "2022-03-20 09:55:40,654 : INFO : PROGRESS: at example #3500000, processed 2264063867 words (5004899 words/s), 8347719 word types, 3500000 
tags\n", + "2022-03-20 09:56:25,746 : INFO : PROGRESS: at example #4000000, processed 2488354257 words (4974154 words/s), 8971529 word types, 4000000 tags\n", + "2022-03-20 09:57:09,436 : INFO : PROGRESS: at example #4500000, processed 2703313059 words (4920183 words/s), 9605666 word types, 4500000 tags\n", + "2022-03-20 09:57:55,813 : INFO : PROGRESS: at example #5000000, processed 2925111571 words (4782561 words/s), 10217554 word types, 5000000 tags\n", + "2022-03-20 09:58:26,146 : INFO : collected 10427023 word types and 5176019 unique tags from a corpus of 5176019 examples and 2996051328 words\n", + "2022-03-20 09:58:29,677 : INFO : Doc2Vec lifecycle event {'msg': 'max_final_vocab=1000000 and min_count=5 resulted in calc_min_count=23, effective_min_count=23', 'datetime': '2022-03-20T09:58:29.677488', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-20 09:58:29,677 : INFO : Creating a fresh vocabulary\n", + "2022-03-20 09:58:33,171 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 retains 991887 unique words (9.51% of original 10427023, drops 9435136)', 'datetime': '2022-03-20T09:58:33.171030', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-20 09:58:33,171 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 leaves 2968296495 word corpus (99.07% of original 2996051328, drops 27754833)', 'datetime': '2022-03-20T09:58:33.171494', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-20 09:58:35,956 : INFO : deleting the raw counts dictionary of 10427023 items\n", + "2022-03-20 09:58:36,023 : INFO 
: sample=1e-05 downsamples 4155 most-common words\n", + "2022-03-20 09:58:36,023 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 1194754612.050565 word corpus (40.3%% of prior 2968296495)', 'datetime': '2022-03-20T09:58:36.023801', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-20 09:58:40,804 : INFO : estimated required memory for 991887 words and 100 dimensions: 4395064500 bytes\n", + "2022-03-20 09:58:40,804 : INFO : resetting layer weights\n", + "2022-03-20 09:58:42,744 : INFO : resetting layer weights\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "Doc2Vec\n" + "Doc2Vec\n", + "Doc2Vec\n" ] } ], @@ -282,60 +282,60 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-03-19 19:55:38,660 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 9 workers on 991887 vocabulary and 300 features, using sg=1 hs=0 sample=1e-05 negative=5 window=5 shrink_windows=True', 'datetime': '2022-03-19T19:55:38.660218', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", - "2022-03-19 19:55:39,851 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 5854 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-19 20:25:39,805 : INFO : EPOCH 1 - PROGRESS: at 48.25% examples, 393044 words/s, in_qsize 18, out_qsize 0\n", - "2022-03-19 20:48:10,761 : INFO : EPOCH - 1 : training on 2996051328 raw words (1198675030 effective words) took 3152.1s, 380279 effective words/s\n", - "2022-03-19 20:48:11,793 : INFO : EPOCH 2 - PROGRESS: at 0.00% examples, 367086 words/s, in_qsize 16, out_qsize 0\n", - "2022-03-19 21:18:11,834 : INFO : EPOCH 2 - PROGRESS: at 46.01% examples, 380787 words/s, in_qsize 18, out_qsize 0\n", - "2022-03-19 21:41:41,740 : INFO : EPOCH - 
2 : training on 2996051328 raw words (1198715463 effective words) took 3210.9s, 373328 effective words/s\n", - "2022-03-19 21:41:42,801 : INFO : EPOCH 3 - PROGRESS: at 0.00% examples, 371377 words/s, in_qsize 17, out_qsize 0\n", - "2022-03-19 22:11:42,858 : INFO : EPOCH 3 - PROGRESS: at 47.08% examples, 386416 words/s, in_qsize 18, out_qsize 1\n", - "2022-03-19 22:34:07,773 : INFO : EPOCH - 3 : training on 2996051328 raw words (1198709324 effective words) took 3145.9s, 381038 effective words/s\n", - "2022-03-19 22:34:08,813 : INFO : EPOCH 4 - PROGRESS: at 0.00% examples, 391762 words/s, in_qsize 18, out_qsize 0\n", - "2022-03-19 23:04:08,869 : INFO : EPOCH 4 - PROGRESS: at 47.18% examples, 387058 words/s, in_qsize 18, out_qsize 0\n", - "2022-03-19 23:27:17,510 : INFO : EPOCH - 4 : training on 2996051328 raw words (1198691470 effective words) took 3189.6s, 375807 effective words/s\n", - "2022-03-19 23:27:18,566 : INFO : EPOCH 5 - PROGRESS: at 0.00% examples, 341762 words/s, in_qsize 18, out_qsize 0\n", - "2022-03-19 23:57:18,560 : INFO : EPOCH 5 - PROGRESS: at 49.49% examples, 400066 words/s, in_qsize 17, out_qsize 0\n", - "2022-03-20 00:18:39,692 : INFO : EPOCH - 5 : training on 2996051328 raw words (1198651978 effective words) took 3082.1s, 388902 effective words/s\n", - "2022-03-20 00:18:40,744 : INFO : EPOCH 6 - PROGRESS: at 0.00% examples, 389656 words/s, in_qsize 17, out_qsize 1\n", - "2022-03-20 00:48:40,746 : INFO : EPOCH 6 - PROGRESS: at 48.94% examples, 396931 words/s, in_qsize 18, out_qsize 0\n", - "2022-03-20 01:10:12,092 : INFO : EPOCH - 6 : training on 2996051328 raw words (1198683303 effective words) took 3092.4s, 387624 effective words/s\n", - "2022-03-20 01:10:13,131 : INFO : EPOCH 7 - PROGRESS: at 0.00% examples, 380610 words/s, in_qsize 17, out_qsize 0\n", - "2022-03-20 01:40:13,225 : INFO : EPOCH 7 - PROGRESS: at 48.99% examples, 397202 words/s, in_qsize 18, out_qsize 0\n", - "2022-03-20 02:01:49,460 : INFO : EPOCH - 7 : training on 2996051328 
raw words (1198700562 effective words) took 3097.3s, 387016 effective words/s\n", - "2022-03-20 02:01:50,477 : INFO : EPOCH 8 - PROGRESS: at 0.00% examples, 379974 words/s, in_qsize 17, out_qsize 0\n", - "2022-03-20 02:31:50,541 : INFO : EPOCH 8 - PROGRESS: at 49.01% examples, 397386 words/s, in_qsize 17, out_qsize 0\n", - "2022-03-20 02:53:22,480 : INFO : EPOCH - 8 : training on 2996051328 raw words (1198739172 effective words) took 3093.0s, 387567 effective words/s\n", - "2022-03-20 02:53:23,526 : INFO : EPOCH 9 - PROGRESS: at 0.00% examples, 403121 words/s, in_qsize 18, out_qsize 0\n", - "2022-03-20 03:23:23,535 : INFO : EPOCH 9 - PROGRESS: at 49.86% examples, 402052 words/s, in_qsize 17, out_qsize 0\n", - "2022-03-20 03:44:21,076 : INFO : EPOCH - 9 : training on 2996051328 raw words (1198699759 effective words) took 3058.5s, 391921 effective words/s\n", - "2022-03-20 03:44:22,093 : INFO : EPOCH 10 - PROGRESS: at 0.00% examples, 363235 words/s, in_qsize 17, out_qsize 0\n", - "2022-03-20 04:14:22,127 : INFO : EPOCH 10 - PROGRESS: at 47.77% examples, 390356 words/s, in_qsize 18, out_qsize 0\n", - "2022-03-20 04:37:01,731 : INFO : EPOCH - 10 : training on 2996051328 raw words (1198674743 effective words) took 3160.6s, 379258 effective words/s\n", - "2022-03-20 04:37:01,758 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (11986940804 effective words) took 31282.8s, 383179 effective words/s', 'datetime': '2022-03-20T04:37:01.757530', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", - "2022-03-20 04:37:01,760 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 9 workers on 991887 vocabulary and 300 features, using sg=0 hs=0 sample=1e-05 negative=5 window=5 shrink_windows=True', 'datetime': '2022-03-20T04:37:01.760198', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 
13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", - "2022-03-20 04:37:02,806 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 51780 words/s, in_qsize 17, out_qsize 0\n", - "2022-03-20 05:01:24,379 : INFO : EPOCH - 1 : training on 2996051328 raw words (1198699065 effective words) took 1462.6s, 819587 effective words/s\n", - "2022-03-20 05:01:25,419 : INFO : EPOCH 2 - PROGRESS: at 0.01% examples, 1026623 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 05:25:51,408 : INFO : EPOCH - 2 : training on 2996051328 raw words (1198713530 effective words) took 1467.0s, 817102 effective words/s\n", - "2022-03-20 05:25:52,466 : INFO : EPOCH 3 - PROGRESS: at 0.01% examples, 1053617 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 05:50:03,250 : INFO : EPOCH - 3 : training on 2996051328 raw words (1198725674 effective words) took 1451.7s, 825713 effective words/s\n", - "2022-03-20 05:50:04,303 : INFO : EPOCH 4 - PROGRESS: at 0.01% examples, 994295 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 06:14:21,091 : INFO : EPOCH - 4 : training on 2996051328 raw words (1198700285 effective words) took 1457.8s, 822282 effective words/s\n", - "2022-03-20 06:14:22,184 : INFO : EPOCH 5 - PROGRESS: at 0.01% examples, 1051640 words/s, in_qsize 0, out_qsize 1\n", - "2022-03-20 06:38:35,360 : INFO : EPOCH - 5 : training on 2996051328 raw words (1198682356 effective words) took 1454.2s, 824283 effective words/s\n", - "2022-03-20 06:38:36,422 : INFO : EPOCH 6 - PROGRESS: at 0.01% examples, 1055533 words/s, in_qsize 0, out_qsize 1\n", - "2022-03-20 07:02:50,449 : INFO : EPOCH - 6 : training on 2996051328 raw words (1198672468 effective words) took 1455.0s, 823803 effective words/s\n", - "2022-03-20 07:02:51,496 : INFO : EPOCH 7 - PROGRESS: at 0.01% examples, 1052539 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 07:27:17,109 : INFO : EPOCH - 7 : training on 2996051328 raw words (1198696667 effective words) took 1466.6s, 817342 effective 
words/s\n", - "2022-03-20 07:27:18,148 : INFO : EPOCH 8 - PROGRESS: at 0.01% examples, 1047255 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 07:51:29,674 : INFO : EPOCH - 8 : training on 2996051328 raw words (1198691941 effective words) took 1452.5s, 825260 effective words/s\n", - "2022-03-20 07:51:30,808 : INFO : EPOCH 9 - PROGRESS: at 0.01% examples, 915921 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 08:15:44,007 : INFO : EPOCH - 9 : training on 2996051328 raw words (1198707402 effective words) took 1454.2s, 824316 effective words/s\n", - "2022-03-20 08:15:45,041 : INFO : EPOCH 10 - PROGRESS: at 0.01% examples, 1038693 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 08:39:48,080 : INFO : EPOCH - 10 : training on 2996051328 raw words (1198683355 effective words) took 1444.0s, 830092 effective words/s\n", - "2022-03-20 08:39:48,155 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (11986972743 effective words) took 14566.3s, 822924 effective words/s', 'datetime': '2022-03-20T08:39:48.155821', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n" + "2022-03-20 09:58:45,477 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 10 workers on 991887 vocabulary and 100 features, using sg=1 hs=0 sample=1e-05 negative=5 window=8 shrink_windows=True', 'datetime': '2022-03-20T09:58:45.477703', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", + "2022-03-20 09:58:46,489 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 133130 words/s, in_qsize 2, out_qsize 4\n", + "2022-03-20 10:28:46,500 : INFO : EPOCH 1 - PROGRESS: at 63.89% examples, 479007 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-20 10:40:41,145 : INFO : EPOCH - 1 : training on 2996051328 raw words (1198720461 
effective words) took 2515.7s, 476504 effective words/s\n", + "2022-03-20 10:40:42,152 : INFO : EPOCH 2 - PROGRESS: at 0.01% examples, 508266 words/s, in_qsize 12, out_qsize 0\n", + "2022-03-20 11:10:42,144 : INFO : EPOCH 2 - PROGRESS: at 66.17% examples, 491918 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-20 11:21:35,923 : INFO : EPOCH - 2 : training on 2996051328 raw words (1198662660 effective words) took 2454.8s, 488296 effective words/s\n", + "2022-03-20 11:21:36,946 : INFO : EPOCH 3 - PROGRESS: at 0.01% examples, 480046 words/s, in_qsize 17, out_qsize 2\n", + "2022-03-20 11:51:37,033 : INFO : EPOCH 3 - PROGRESS: at 66.68% examples, 494875 words/s, in_qsize 17, out_qsize 0\n", + "2022-03-20 12:02:19,087 : INFO : EPOCH - 3 : training on 2996051328 raw words (1198716633 effective words) took 2443.1s, 490658 effective words/s\n", + "2022-03-20 12:02:20,107 : INFO : EPOCH 4 - PROGRESS: at 0.01% examples, 496547 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-20 12:32:20,140 : INFO : EPOCH 4 - PROGRESS: at 67.18% examples, 497696 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-20 12:42:46,250 : INFO : EPOCH - 4 : training on 2996051328 raw words (1198700256 effective words) took 2427.1s, 493880 effective words/s\n", + "2022-03-20 12:42:47,258 : INFO : EPOCH 5 - PROGRESS: at 0.01% examples, 477438 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-20 13:12:47,245 : INFO : EPOCH 5 - PROGRESS: at 67.37% examples, 498714 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-20 13:23:09,733 : INFO : EPOCH - 5 : training on 2996051328 raw words (1198706312 effective words) took 2423.5s, 494619 effective words/s\n", + "2022-03-20 13:23:10,742 : INFO : EPOCH 6 - PROGRESS: at 0.01% examples, 508818 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-20 13:53:10,771 : INFO : EPOCH 6 - PROGRESS: at 67.18% examples, 497693 words/s, in_qsize 19, out_qsize 1\n", + "2022-03-20 14:03:38,965 : INFO : EPOCH - 6 : training on 2996051328 raw words (1198686418 effective words) took 
2429.2s, 493449 effective words/s\n", + "2022-03-20 14:03:39,984 : INFO : EPOCH 7 - PROGRESS: at 0.01% examples, 493009 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-20 14:33:40,013 : INFO : EPOCH 7 - PROGRESS: at 65.76% examples, 489578 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-20 14:44:45,759 : INFO : EPOCH - 7 : training on 2996051328 raw words (1198679497 effective words) took 2466.8s, 485932 effective words/s\n", + "2022-03-20 14:44:46,784 : INFO : EPOCH 8 - PROGRESS: at 0.01% examples, 481341 words/s, in_qsize 18, out_qsize 1\n", + "2022-03-20 15:14:46,827 : INFO : EPOCH 8 - PROGRESS: at 67.23% examples, 497983 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-20 15:25:15,178 : INFO : EPOCH - 8 : training on 2996051328 raw words (1198737432 effective words) took 2429.4s, 493435 effective words/s\n", + "2022-03-20 15:25:16,182 : INFO : EPOCH 9 - PROGRESS: at 0.01% examples, 494052 words/s, in_qsize 16, out_qsize 0\n", + "2022-03-20 15:55:16,171 : INFO : EPOCH 9 - PROGRESS: at 64.71% examples, 483746 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-20 16:07:12,899 : INFO : EPOCH - 9 : training on 2996051328 raw words (1198676046 effective words) took 2517.7s, 476094 effective words/s\n", + "2022-03-20 16:07:13,918 : INFO : EPOCH 10 - PROGRESS: at 0.01% examples, 484059 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-20 16:37:13,967 : INFO : EPOCH 10 - PROGRESS: at 65.05% examples, 485618 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-20 16:48:27,957 : INFO : EPOCH - 10 : training on 2996051328 raw words (1198704348 effective words) took 2475.0s, 484327 effective words/s\n", + "2022-03-20 16:48:27,959 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (11986990063 effective words) took 24582.3s, 487627 effective words/s', 'datetime': '2022-03-20T16:48:27.959476', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 
'event': 'train'}\n", + "2022-03-20 16:48:27,960 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 10 workers on 991887 vocabulary and 100 features, using sg=0 hs=0 sample=1e-05 negative=5 window=8 shrink_windows=True', 'datetime': '2022-03-20T16:48:27.960338', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", + "2022-03-20 16:48:28,973 : INFO : EPOCH 1 - PROGRESS: at 0.01% examples, 834487 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 17:08:24,024 : INFO : EPOCH - 1 : training on 2996051328 raw words (1198718673 effective words) took 1196.1s, 1002220 effective words/s\n", + "2022-03-20 17:08:25,030 : INFO : EPOCH 2 - PROGRESS: at 0.02% examples, 1293557 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 17:28:24,924 : INFO : EPOCH - 2 : training on 2996051328 raw words (1198708592 effective words) took 1200.9s, 998186 effective words/s\n", + "2022-03-20 17:28:25,931 : INFO : EPOCH 3 - PROGRESS: at 0.02% examples, 1276259 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 17:48:18,838 : INFO : EPOCH - 3 : training on 2996051328 raw words (1198693921 effective words) took 1193.9s, 1004025 effective words/s\n", + "2022-03-20 17:48:19,842 : INFO : EPOCH 4 - PROGRESS: at 0.02% examples, 1304070 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 18:08:14,585 : INFO : EPOCH - 4 : training on 2996051328 raw words (1198686639 effective words) took 1195.7s, 1002469 effective words/s\n", + "2022-03-20 18:08:15,591 : INFO : EPOCH 5 - PROGRESS: at 0.02% examples, 1256541 words/s, in_qsize 0, out_qsize 1\n", + "2022-03-20 18:28:15,854 : INFO : EPOCH - 5 : training on 2996051328 raw words (1198698541 effective words) took 1201.3s, 997873 effective words/s\n", + "2022-03-20 18:28:16,859 : INFO : EPOCH 6 - PROGRESS: at 0.02% examples, 1243710 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 18:48:21,554 : INFO : EPOCH - 6 : training 
on 2996051328 raw words (1198688111 effective words) took 1205.7s, 994191 effective words/s\n", + "2022-03-20 18:48:22,560 : INFO : EPOCH 7 - PROGRESS: at 0.02% examples, 1245030 words/s, in_qsize 0, out_qsize 1\n", + "2022-03-20 19:08:16,696 : INFO : EPOCH - 7 : training on 2996051328 raw words (1198696981 effective words) took 1195.1s, 1002985 effective words/s\n", + "2022-03-20 19:08:17,713 : INFO : EPOCH 8 - PROGRESS: at 0.02% examples, 1313159 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 19:28:12,212 : INFO : EPOCH - 8 : training on 2996051328 raw words (1198693792 effective words) took 1195.5s, 1002650 effective words/s\n", + "2022-03-20 19:28:13,217 : INFO : EPOCH 9 - PROGRESS: at 0.02% examples, 1258160 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 19:48:09,326 : INFO : EPOCH - 9 : training on 2996051328 raw words (1198724296 effective words) took 1197.1s, 1001366 effective words/s\n", + "2022-03-20 19:48:10,336 : INFO : EPOCH 10 - PROGRESS: at 0.02% examples, 1319522 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-20 20:08:08,642 : INFO : EPOCH - 10 : training on 2996051328 raw words (1198697061 effective words) took 1199.3s, 999484 effective words/s\n", + "2022-03-20 20:08:08,643 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (11987006607 effective words) took 11980.6s, 1000531 effective words/s', 'datetime': '2022-03-20T20:08:08.643623', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n" ] } ], @@ -369,48 +369,48 @@ "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "[('Deep learning', 0.6759864687919617),\n", - " ('Pattern recognition', 0.6742060780525208),\n", - " ('Supervised learning', 0.6707901358604431),\n", - " ('Artificial neural network', 0.6665838956832886),\n", - " ('Semi-supervised learning', 0.654625654220581),\n", - " ('Outline of machine learning', 
0.6472466588020325),\n", - " ('Multi-task learning', 0.646246075630188),\n", - " ('Boosting (machine learning)', 0.641443133354187),\n", - " ('Neural network', 0.637937605381012),\n", - " ('Types of artificial neural networks', 0.6377928853034973),\n", - " ('Incremental learning', 0.637475848197937),\n", - " ('Perceptron', 0.6358065605163574),\n", - " ('Rule induction', 0.6326183676719666),\n", - " ('Early stopping', 0.6307427883148193),\n", - " ('Multilayer perceptron', 0.6273738741874695),\n", - " ('Connectionist expert system', 0.6203325390815735),\n", - " ('Neural Designer', 0.6193098425865173),\n", - " ('Meta learning (computer science)', 0.6182209253311157),\n", - " ('Feature (machine learning)', 0.6175855398178101),\n", - " ('Domain adaptation', 0.6154839396476746)]\n", - "Doc2Vec\n", - "[('Pattern recognition', 0.7132866382598877),\n", - " ('Deep learning', 0.66518634557724),\n", - " ('Supervised learning', 0.6553921699523926),\n", - " ('Artificial neural network', 0.6497268080711365),\n", - " ('Semi-supervised learning', 0.6446605324745178),\n", - " ('Statistical learning theory', 0.6335287094116211),\n", - " ('Ensemble learning', 0.6282770037651062),\n", - " ('Prior knowledge for pattern recognition', 0.6238192319869995),\n", - " ('Boosting (machine learning)', 0.6212880611419678),\n", - " ('Early stopping', 0.6177847981452942),\n", - " ('Statistical classification', 0.6156905293464661),\n", - " ('Autoencoder', 0.6140605211257935),\n", - " ('Predictive Model Markup Language', 0.6070867776870728),\n", - " ('Multi-task learning', 0.606650710105896),\n", - " ('Similarity learning', 0.6065689921379089),\n", - " ('Feature learning', 0.5982990860939026),\n", - " ('Automatic image annotation', 0.5949181914329529),\n", - " ('Linear classifier', 0.5930992960929871),\n", - " ('Support-vector machine', 0.5929517149925232),\n", - " ('Active learning (machine learning)', 0.5914105176925659)]\n" + "Doc2Vec\n", + "[('Pattern recognition', 0.8360552787780762),\n", + " 
('Supervised learning', 0.8315915465354919),\n", + " ('Artificial neural network', 0.8176121115684509),\n", + " ('Deep learning', 0.7854599952697754),\n", + " ('Bayesian optimization', 0.7843242287635803),\n", + " ('Ensemble learning', 0.7767862677574158),\n", + " ('Outline of machine learning', 0.7751106023788452),\n", + " ('Intelligent control', 0.7724385261535645),\n", + " ('Neural Designer', 0.7724118828773499),\n", + " ('Incremental learning', 0.7718138098716736),\n", + " ('Behavior selection algorithm', 0.7676114439964294),\n", + " ('Directed information', 0.7646719217300415),\n", + " ('Boosting (machine learning)', 0.7642883658409119),\n", + " ('Discriminative model', 0.7642679214477539),\n", + " ('Algorithmic technique', 0.7640156745910645),\n", + " ('Outline of computer science', 0.7638604640960693),\n", + " ('Numenta', 0.7621665596961975),\n", + " ('Feature selection', 0.761879026889801),\n", + " ('Multiway data analysis', 0.7613278031349182),\n", + " ('Types of artificial neural networks', 0.7612452507019043)]\n", + "Doc2Vec\n", + "[('Pattern recognition', 0.7676337361335754),\n", + " ('Supervised learning', 0.765723466873169),\n", + " ('Artificial neural network', 0.7397017478942871),\n", + " ('Deep learning', 0.7371508479118347),\n", + " ('Semi-supervised learning', 0.7312546968460083),\n", + " ('Statistical learning theory', 0.72916179895401),\n", + " ('Multi-task learning', 0.7289299368858337),\n", + " ('Data analysis techniques for fraud detection', 0.7225874066352844),\n", + " ('Similarity learning', 0.7212273478507996),\n", + " ('Symbolic artificial intelligence', 0.7145661115646362),\n", + " ('Autoencoder', 0.7123140096664429),\n", + " ('Naive Bayes classifier', 0.7108708024024963),\n", + " ('Cognitive model', 0.7059794068336487),\n", + " ('Predictive Model Markup Language', 0.7042246460914612),\n", + " ('Support-vector machine', 0.7018824815750122),\n", + " ('Regularization (mathematics)', 0.7006190419197083),\n", + " ('Linear classifier', 
0.699646532535553),\n", + " ('John Robert Anderson (psychologist)', 0.696628749370575),\n", + " ('Multiclass classification', 0.6958213448524475),\n", + " ('Image segmentation', 0.6927947402000427)]\n" ] } ], @@ -424,12 +424,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Both results seem similar and match the results from the paper's Table 1, although not exactly. This is because we don't kno wthe exact parameters and also Wikipedia. FIXME\n", + "Both results seem similar and match the results from the paper's Table 1, although not exactly. This is because we don't know the exact parameters of the original implementation (see above). And also because we're training the model 7 years later and the Wikipedia content has changed in the meantime.\n", "\n", - "\n", - ", but note the DM model took 4x less time train (training 4x faster).\n", - "\n", - "Second, let's calculate the most similar Wikipedia entries to \"Lady Gaga\" using Paragraph Vector:" + "Now following the paper's Table 2a), let's calculate the most similar Wikipedia entries to \"Lady Gaga\" using Paragraph Vector:" ] }, { @@ -443,28 +440,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "[('Katy Perry', 0.6756664514541626),\n", - " ('Ariana Grande', 0.6528252363204956),\n", - " ('Miley Cyrus', 0.6260033845901489),\n", - " ('Taylor Swift', 0.6144734621047974),\n", - " ('Britney Spears', 0.6105279922485352),\n", - " ('List of awards and nominations received by Lady Gaga', 0.6099144220352173),\n", - " ('Madonna', 0.5910527110099792),\n", - " ('Christina Aguilera', 0.5906776785850525),\n", - " ('Beyoncé', 0.5887077450752258),\n", - " ('Demi Lovato', 0.5824941992759705)]\n", - "Doc2Vec\n", - "[('Taylor Swift', 0.5584290623664856),\n", - " ('Joanne (album)', 0.5531442761421204),\n", - " ('List of Lady Gaga live performances', 0.546563982963562),\n", - " ('Katy Perry', 0.5364790558815002),\n", - " ('Cynthia Germanotta', 0.5363910794258118),\n", - " ('Artpop', 
0.5359800457954407),\n", - " ('Natali Germanotta', 0.5320610404014587),\n", - " ('Beautiful, Dirty, Rich', 0.5298762917518616),\n", - " ('Joanne Trattoria Cookbook', 0.5219755172729492),\n", - " ('The Fame', 0.5171768069267273)]\n" + "Doc2Vec\n", + "[('Ariana Grande', 0.8456711769104004),\n", + " ('Katy Perry', 0.8200861811637878),\n", + " ('Mariah Carey', 0.8129399418830872),\n", + " ('Beyoncé', 0.8065741658210754),\n", + " ('Cardi B', 0.8001681566238403),\n", + " ('Harry Styles', 0.8001224398612976),\n", + " ('Rihanna', 0.7915332913398743),\n", + " ('Cher', 0.7864869236946106),\n", + " ('Adele', 0.7830130457878113),\n", + " ('Britney Spears', 0.78132164478302)]\n", + "Doc2Vec\n", + "[('Katy Perry', 0.7350542545318604),\n", + " ('Joanne (album)', 0.7215949296951294),\n", + " ('Britney Spears', 0.7209619879722595),\n", + " ('Kesha', 0.71454918384552),\n", + " ('Artpop', 0.7129204869270325),\n", + " ('Taylor Swift', 0.7099292874336243),\n", + " ('Cardi B', 0.7032613158226013),\n", + " ('Beyoncé', 0.69660484790802),\n", + " ('Madonna', 0.6904603838920593),\n", + " ('Beautiful, Dirty, Rich', 0.6878629922866821)]\n" ] } ], @@ -498,28 +495,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "[('Katy Perry', 0.5954936146736145),\n", - " ('Ariana Grande', 0.5549463033676147),\n", - " ('Thank You, Love (Kana Nishino album)', 0.5462373495101929),\n", - " ('Kōsui (Eito song)', 0.5363353490829468),\n", - " ('Crazy Crazy / Harajuku Iyahoi', 0.5308663249015808),\n", - " ('Big Boys Cry/Beautiful', 0.5303178429603577),\n", - " ('23rd Monster', 0.5298969745635986),\n", - " ('Koi (song)', 0.5275850296020508),\n", - " ('X -Cross-', 0.5255367755889893),\n", - " ('Suzume (song)', 0.525151252746582)]\n", - "Doc2Vec\n", - "[('Joanne (album)', 0.5087171196937561),\n", - " ('Artpop', 0.49931594729423523),\n", - " ('The Cure (song)', 0.4835745394229889),\n", - " ('Natali Germanotta', 0.47394222021102905),\n", - " ('Chisato Moritaka', 0.47318926453590393),\n", - " 
('Ayumi Hamasaki', 0.47156259417533875),\n", - " ('List of Lady Gaga live performances', 0.4629444479942322),\n", - " ('Joanne Trattoria Cookbook', 0.4615497887134552),\n", - " ('Katy Perry', 0.46151337027549744),\n", - " ('Blackpink', 0.45603010058403015)]\n" + "Doc2Vec\n", + "[('Last Angel', 0.7589371204376221),\n", + " ('Kyary Pamyu Pamyu', 0.7432413697242737),\n", + " ('Hottaraka Series', 0.7406142354011536),\n", + " ('Blackpink', 0.739910364151001),\n", + " ('2NE1', 0.7388721108436584),\n", + " ('Nanda Collection', 0.7339783310890198),\n", + " ('Ayumi Hamasaki', 0.7316932082176208),\n", + " ('In the Middle (Ai song)', 0.7270604968070984),\n", + " ('Duty (album)', 0.724769115447998),\n", + " ('Change Myself', 0.7235770225524902)]\n", + "Doc2Vec\n", + "[('Ayumi Hamasaki', 0.6632838845252991),\n", + " ('Artpop', 0.657765805721283),\n", + " ('Pink Lady (duo)', 0.6549235582351685),\n", + " ('Free Free', 0.651936948299408),\n", + " ('Taboo (Koda Kumi song)', 0.6419629454612732),\n", + " ('Princess Princess (band)', 0.6365096569061279),\n", + " ('Radwimps', 0.6351915001869202),\n", + " ('Dempagumi.inc', 0.6337336897850037),\n", + " ('Headbanger (Babymetal song)', 0.6331109404563904),\n", + " ('Joanne (album)', 0.6326634287834167)]\n" ] } ], @@ -558,25 +555,25 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2022-03-18 19:08:34,623 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_dbow.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-03-18T19:08:34.622990', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'saving'}\n", - "2022-03-18 19:08:34,641 : INFO : storing np array 'vectors' to doc2vec_dbow.model.dv.vectors.npy\n", - "2022-03-18 19:08:40,244 : INFO : storing np array 
'vectors' to doc2vec_dbow.model.wv.vectors.npy\n", - "2022-03-18 19:08:46,811 : INFO : storing np array 'syn1neg' to doc2vec_dbow.model.syn1neg.npy\n", - "2022-03-18 19:08:48,564 : INFO : not storing attribute cum_table\n", - "2022-03-18 19:08:56,097 : INFO : saved doc2vec_dbow.model\n", - "2022-03-18 19:08:56,098 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_dm.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-03-18T19:08:56.098765', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'saving'}\n", - "2022-03-18 19:08:56,099 : INFO : storing np array 'vectors' to doc2vec_dm.model.dv.vectors.npy\n", - "2022-03-18 19:09:09,087 : INFO : storing np array 'vectors' to doc2vec_dm.model.wv.vectors.npy\n", - "2022-03-18 19:09:13,804 : INFO : storing np array 'syn1neg' to doc2vec_dm.model.syn1neg.npy\n", - "2022-03-18 19:09:16,101 : INFO : not storing attribute cum_table\n", - "2022-03-18 19:09:20,432 : INFO : saved doc2vec_dm.model\n" + "2022-03-20 20:10:55,289 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_dbow.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-03-20T20:10:55.289405', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'saving'}\n", + "2022-03-20 20:10:55,292 : INFO : storing np array 'vectors' to doc2vec_dbow.model.dv.vectors.npy\n", + "2022-03-20 20:10:58,822 : INFO : storing np array 'vectors' to doc2vec_dbow.model.wv.vectors.npy\n", + "2022-03-20 20:10:59,349 : INFO : storing np array 'syn1neg' to doc2vec_dbow.model.syn1neg.npy\n", + "2022-03-20 20:10:59,842 : INFO : not storing attribute cum_table\n", + "2022-03-20 20:11:03,053 : INFO : saved doc2vec_dbow.model\n", + "2022-03-20 20:11:03,054 : INFO : 
Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_dm.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-03-20T20:11:03.054368', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'saving'}\n", + "2022-03-20 20:11:03,054 : INFO : storing np array 'vectors' to doc2vec_dm.model.dv.vectors.npy\n", + "2022-03-20 20:11:05,496 : INFO : storing np array 'vectors' to doc2vec_dm.model.wv.vectors.npy\n", + "2022-03-20 20:11:06,366 : INFO : storing np array 'syn1neg' to doc2vec_dm.model.syn1neg.npy\n", + "2022-03-20 20:11:06,792 : INFO : not storing attribute cum_table\n", + "2022-03-20 20:11:09,296 : INFO : saved doc2vec_dm.model\n" ] } ], From 3ce81a44cc59b3d77edee043ee14050f611008df Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 22 Mar 2022 00:55:35 +0900 Subject: [PATCH 44/81] get rid of tox, build things via github actions directly (#3308) * get rid of tox, build things via github actions directly This gets rid of an unnecessary layer of indirection. All the build steps are now defined in the github workflow file. This makes it easier to debug and understand what's going on. * unpin flake8 and flake8-rst * fixup * move linters to a separate job * why does the documentation fail to lint??? 
* install pytest * add requirements.txt for github workflow * use setup.py to install dependencies instead * simplify coverage step * re-enable previously disabled jobs * fix job name in workflow * update tests.yml * add timeout-minutes to build-wheels.yml * build docs using github actions instead of circleci --- .circleci/config.yml | 48 -------- .github/workflows/build-wheels.yml | 1 + .github/workflows/tests.yml | 135 +++++++++++++++++++--- gensim/test/test_translation_matrix.py | 1 - tox.ini | 154 ------------------------- 5 files changed, 118 insertions(+), 221 deletions(-) delete mode 100644 .circleci/config.yml delete mode 100644 tox.ini diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 1071aa5aeb..0000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,48 +0,0 @@ -version: 2 -jobs: - build: - docker: - - image: cimg/python:3.8.11 - - working_directory: ~/gensim - - steps: - - checkout - - - restore_cache: - key: pip-cache - - - run: - name: Apt install (for latex render) - command: | - sudo apt-get -yq update - sudo apt-get -yq remove texlive-binaries --purge - sudo apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended latexmk - sudo apt-get -yq install build-essential python3.8-dev - - - run: - name: Basic installation (tox) - command: | - python3.8 -m virtualenv venv - source venv/bin/activate - pip install tox --progress-bar off - - - run: - name: Build documentation - environment: - TOX_PARALLEL_NO_SPINNER: 1 - TOX_PIP_OPTS: --progress-bar=off - command: | - source venv/bin/activate - tox -e compile,docs -vv - - - store_artifacts: - path: docs/src/_build - destination: documentation - - - save_cache: - key: pip-cache - paths: - - "~/.cache/pip" - - "~/.ccache" - - "~/.pip-cache" diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 
ff304ea1c7..42f61bb8b2 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -10,6 +10,7 @@ on: jobs: build: + timeout-minutes: 30 runs-on: ${{ matrix.os }} defaults: run: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 530aff2683..f09b21d61d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,8 +6,79 @@ on: branches: [ develop ] jobs: + linters: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Setup up Python ${{ matrix.python }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python }} + + - name: Update pip + run: python -m pip install -U pip + + - name: Install dependencies + run: python -m pip install flake8 flake8-rst + + - name: Run flake8 linter (source) + run: flake8 --ignore E12,W503 --max-line-length 120 --show-source gensim + + # - name: Run flake8 linter (documentation) + # run: flake8 --ignore E202,E402,E302,E305,F821 --max-line-length 120 --filename '*.py,*.rst' docs + + docs: + name: build documentation + timeout-minutes: 10 + runs-on: ubuntu-20.04 + defaults: + run: + shell: bash + + # + # Don't run this job unless the linters have succeeded. + # It's wasteful to test code that failed to lint, because it'll get + # re-tested once the lint errors are fixed. + # + needs: [linters] + + steps: + - uses: actions/checkout@v2 + - name: Setup up Python ${{ matrix.python }} + uses: actions/setup-python@v2 + with: + # + # We use Py3.8 here for historical reasons. 
+ # + python-version: "3.8" + + - name: Update pip + run: python -m pip install -U pip + + - name: Install apt packages for LaTeX rendering + run: | + sudo apt-get -yq update + sudo apt-get -yq remove texlive-binaries --purge + sudo apt-get -yq --no-install-suggests --no-install-recommends --force-yes install dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended texlive-latex-extra texlive-fonts-recommended latexmk + sudo apt-get -yq install build-essential python3.8-dev + + - name: Install gensim and its dependencies + run: pip install -e .[docs] + + - name: Build documentation + run: | + python setup.py build_ext --inplace + make -C docs/src clean html + + # + # FIXME: do we want to store the built documentation somewhere, or is + # knowing that the docs built successfully enough? + # + tests: - name: ${{ matrix.name }} + name: test ${{ matrix.os }} python ${{ matrix.python }} + timeout-minutes: 30 runs-on: ${{ matrix.os }} defaults: run: @@ -16,17 +87,22 @@ jobs: fail-fast: false matrix: include: - - {name: Linux, python: 3.7, os: ubuntu-20.04, tox: 'flake8,flake8-docs'} - - {name: Linux, python: 3.7, os: ubuntu-20.04, tox: 'py37-linux'} - - {name: Linux, python: 3.8, os: ubuntu-20.04, tox: 'py38-linux-cov'} - - {name: Linux, python: 3.9, os: ubuntu-20.04, tox: 'py39-linux'} - - {name: Linux, python: '3.10', os: ubuntu-20.04, tox: 'py310-linux'} - - {name: Windows, python: 3.7, os: windows-2019, tox: 'py37-win'} - - {name: Windows, python: 3.8, os: windows-2019, tox: 'py38-win'} - - {name: Windows, python: 3.9, os: windows-2019, tox: 'py39-win'} - - {name: Windows, python: '3.10', os: windows-2019, tox: 'py310-win'} - env: - TOX_PARALLEL_NO_SPINNER: 1 + - {python: 3.7, os: ubuntu-20.04} + - {python: 3.8, os: ubuntu-20.04} + - {python: 3.9, os: ubuntu-20.04} + - {python: '3.10', os: ubuntu-20.04, coverage: true} + + - {python: 3.7, os: windows-2019} + - {python: 3.8, os: windows-2019} + - {python: 3.9, os: windows-2019} + - {python: '3.10', os: 
windows-2019} + + # + # Don't run this job unless the linters have succeeded. + # It's wasteful to test code that failed to lint, because it'll get + # re-tested once the lint errors are fixed. + # + needs: [linters] steps: - uses: actions/checkout@v2 @@ -50,25 +126,48 @@ jobs: curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add sudo apt-get update -y sudo apt-get install -y sbt - - name: Install tox - run: pip install tox + - name: Install GDB & enable core dumps if: matrix.os == 'ubuntu-20.04' run: | sudo apt-get update -y sudo apt-get install -y gdb ulimit -c unlimited -S # enable core dumps - - name: Run tox tests - run: tox -e ${{ matrix.tox }} + + - name: Install gensim and its dependencies + if: matrix.os != 'windows' + run: pip install -e .[test] + + - name: Install gensim and its dependencies (Windows) + if: matrix.os == 'windows' + run: pip install -e .[test-win] + + - name: Build + run: | + python --version + pip --version + python setup.py build_ext --inplace + + # + # Some of our tests are hanging. + # Limit the use of the coverage plugin for pytest to rule it out as a factor. 
+ # + - name: Run tests (without coverage) + if: matrix.coverage != true + run: pytest -v gensim/test + + - name: Run tests (with coverage) + if: matrix.coverage == true + run: pytest -v gensim/test --cov=gensim/ --cov-report=xml + - name: Upload coverage to Codecov - if: matrix.os == 'ubuntu-20.04' && matrix.python == '3.8' + if: matrix.coverage == true uses: codecov/codecov-action@v2 with: fail_ci_if_error: true files: ./coverage.xml verbose: true - - name: Collect corefile if: ${{ failure() }} && matrix.os == 'ubuntu-20.04' run: | diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py index 0cb4682013..44ed22855e 100644 --- a/gensim/test/test_translation_matrix.py +++ b/gensim/test/test_translation_matrix.py @@ -1,6 +1,5 @@ #!/usr/bin/env python # encoding: utf-8 -import sys from collections import namedtuple import unittest import logging diff --git a/tox.ini b/tox.ini deleted file mode 100644 index 566e331997..0000000000 --- a/tox.ini +++ /dev/null @@ -1,154 +0,0 @@ -[tox] -minversion = 2.0 -envlist = {py37,py38,py39,py310}-{win,linux}, py38-linux-cov, flake8, docs, docs-upload, download-wheels, upload-wheels, test-pypi -skipsdist = True -platform = linux: linux - win: win64 - - -[flake8] -ignore = E12, W503 -max-line-length = 120 -show-source = True - - -[flake8-rst] -filename = *.rst *.py -max-line-length = 120 -ignore = E203, # space before : - E402, # module level import not at top of file - # Classes / functions in a docstring block generate those errors - E302, # expected 2 blank lines, found 0 - E305, # expected 2 blank lines after class or function definition, found 0 - F821, # undefined name; remove once all docstrings are fully executable -exclude = .venv, .git, .tox, dist, doc, build, gensim/models/deprecated - - -[coverage:run] -source=gensim - -[coverage:report] -omit = - gensim/test/* - */__init__.py - -exclude_lines = - pragma: no cover - def __repr__ - def __str__ - raise AssertionError - raise 
NotImplementedError - if __name__ == .__main__.: - -ignore_errors = True - -# -# Conditional factors https://tox.wiki/en/latest/config.html#factors -# -[pytest] -addopts = -rfxEXs --durations=20 --showlocals - -[testenv] -recreate = True - -install_command = python -m pip install --timeout=60 {env:TOX_PIP_OPTS:} {opts} {packages} - -deps = - pip>=19.1.1 - linux: .[test] - win: .[test-win] - -setenv = - FT_HOME={env:FT_HOME:} - WR_HOME={env:WR_HOME:} - VOWPAL_WABBIT_PATH={env:VOWPAL_WABBIT_PATH:} - DTM_PATH={env:DTM_PATH:} - MALLET_HOME={env:MALLET_HOME:} - SKIP_NETWORK_TESTS={env:SKIP_NETWORK_TESTS:} - BOTO_CONFIG={env:BOTO_CONFIG:} - RUNNER_OS={env:RUNNER_OS:} - PYTHONHASHSEED=1 - TOX_PARALLEL_NO_SPINNER=1 - -commands = - python --version - pip --version - python setup.py build_ext --inplace - cov: pytest {posargs:gensim/test} --cov=gensim/ --cov-report=xml - !cov: pytest {posargs:gensim/test} - - -[testenv:flake8] -recreate = True -deps = - # Pinned to 3.7.9 because >3.8.0 triggers "AttributeError: 'Namespace' object has no attribute 'output_file'" - # in flake8-rst. 
Apparently some bug in flake8-rst: - # https://gitlab.com/pycqa/flake8/-/issues/641 - # https://github.com/kataev/flake8-rst/pull/23/files - flake8==3.7.9 - -commands = flake8 gensim/ {posargs} - - -[testenv:flake8-docs] -recreate = True -deps = - flake8-rst==0.7.2 - flake8==3.7.9 - -commands = flake8-rst gensim/ docs/ {posargs} - - -[testenv:compile] -basepython = python3 -recreate = True - -deps = numpy -commands = python setup.py build_ext --inplace - - -[testenv:docs] -basepython = python3 -recreate = True -whitelist_externals = make -deps = .[docs] - -commands = - python setup.py build_ext --inplace - make -C docs/src clean html - - -[testenv:docs-upload] -recreate = True -whitelist_externals = make -deps = .[docs] -changedir = docs/src - -commands = make clean html upload - - -[testenv:download-wheels] -deps = wheelhouse_uploader -whitelist_externals = rm -recreate = True - -commands = - rm -rf dist/ - python setup.py sdist fetch_artifacts - - -[testenv:upload-wheels] -deps = twine - -commands = twine upload dist/* - - -[testenv:test-pypi] -deps = twine -whitelist_externals = rm - -commands = - rm -rf dist/ - python setup.py sdist - twine upload --repository-url https://test.pypi.org/legacy/ dist/* - ; Go to https://testpypi.python.org/pypi?name=gensim&:action=display and check result From ac3bbcdf87b263f79d5e19cce173e6c709a15f9d Mon Sep 17 00:00:00 2001 From: Jakub Janik Date: Tue, 22 Mar 2022 02:05:30 +0100 Subject: [PATCH 45/81] streamlining most_similar_cosmul and evaluate_word_analogies (#2656) * streamlining most_similar_cosmul * Fix PR requested changes and add unit test * fix merge artifacts Co-authored-by: n3hrox Co-authored-by: Michael Penkov --- gensim/models/keyedvectors.py | 28 +++++++++++++++++++++++++--- gensim/test/test_fasttext.py | 3 +++ gensim/test/test_word2vec.py | 6 ++++++ 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 674689afce..a1f1d22df8 100644 --- 
a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -946,7 +946,9 @@ def nbow(document): # Compute WMD. return emd(d1, d2, distance_matrix) - def most_similar_cosmul(self, positive=None, negative=None, topn=10): + def most_similar_cosmul( + self, positive=None, negative=None, topn=10, restrict_vocab=None + ): """Find the top-N most similar words, using the multiplicative combination objective, proposed by `Omer Levy and Yoav Goldberg "Linguistic Regularities in Sparse and Explicit Word Representations" `_. Positive words still contribute positively towards the similarity, @@ -959,6 +961,9 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): With a single positive example, rankings will be the same as in the default :meth:`~gensim.models.keyedvectors.KeyedVectors.most_similar`. + Allows calls like most_similar_cosmul('dog', 'cat'), as a shorthand for + most_similar_cosmul(['dog'], ['cat']) where 'dog' is positive and 'cat' negative + Parameters ---------- positive : list of str, optional @@ -968,6 +973,11 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): topn : int or None, optional Number of top-N similar words to return, when `topn` is int. When `topn` is None, then similarities for all words are returned. + restrict_vocab : int or None, optional + Optional integer which limits the range of vectors which are searched for most-similar values. + For example, restrict_vocab=10000 would only check the first 10000 node vectors in the vocabulary order. + This may be meaningful if vocabulary is sorted by descending frequency. 
+ Returns ------- @@ -985,7 +995,14 @@ def most_similar_cosmul(self, positive=None, negative=None, topn=10): positive = _ensure_list(positive) negative = _ensure_list(negative) - self.fill_norms() + self.init_sims() + + if isinstance(positive, str): + # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog']) + positive = [positive] + + if isinstance(negative, str): + negative = [negative] all_words = { self.get_index(word) for word in positive + negative @@ -1205,7 +1222,9 @@ def _log_evaluate_word_analogies(section): logger.info("%s: %.1f%% (%i/%i)", section['section'], 100.0 * score, correct, correct + incorrect) return score - def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensitive=True, dummy4unknown=False): + def evaluate_word_analogies( + self, analogies, restrict_vocab=300000, case_insensitive=True, + dummy4unknown=False, similarity_function='most_similar'): """Compute performance of the model on an analogy test set. The accuracy is reported (printed to log and returned as a score) for each section separately, @@ -1231,6 +1250,8 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi dummy4unknown : bool, optional If True - produce zero accuracies for 4-tuples with out-of-vocabulary words. Otherwise, these tuples are skipped entirely and not used in the evaluation. + similarity_function : str, optional + Function name used for similarity calculation. 
Returns ------- @@ -1286,6 +1307,7 @@ def evaluate_word_analogies(self, analogies, restrict_vocab=300000, case_insensi predicted = None # find the most likely prediction using 3CosAdd (vector offset) method # TODO: implement 3CosMul and set-based methods for solving analogies + sims = self.most_similar(positive=[b, c], negative=[a], topn=5, restrict_vocab=restrict_vocab) self.key_to_index = original_key_to_index for element in sims: diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 2ff7995e0c..ecc44a30e4 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -373,6 +373,9 @@ def test_most_similar_cosmul(self): self.assertEqual( self.test_model.wv.most_similar_cosmul('nights'), self.test_model.wv.most_similar_cosmul(positive=['nights'])) + self.assertEqual( + self.test_model.wv.most_similar_cosmul('the', 'and'), + self.test_model.wv.most_similar_cosmul(positive=['the'], negative=['and'])) def test_lookup(self): # In vocab, sanity check diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 56a1ecfae0..8edfe3c04c 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -555,6 +555,12 @@ def test_evaluate_word_analogies(self): """Test that evaluating analogies on KeyedVectors give sane results""" model = word2vec.Word2Vec(LeeCorpus()) score, sections = model.wv.evaluate_word_analogies(datapath('questions-words.txt')) + score_cosmul, sections_cosmul = model.wv.evaluate_word_analogies( + datapath('questions-words.txt'), + similarity_function='most_similar_cosmul' + ) + self.assertEqual(score, score_cosmul) + self.assertEqual(sections, sections_cosmul) self.assertGreaterEqual(score, 0.0) self.assertLessEqual(score, 1.0) self.assertGreater(len(sections), 0) From c19f2233474b09b55d3b29ebaead7699393ef815 Mon Sep 17 00:00:00 2001 From: Ayan Saha Date: Tue, 22 Mar 2022 07:46:34 +0530 Subject: [PATCH 46/81] Add get_sentence_vector() to FastText and get_mean_vector() to 
KeyedVectors (#3188) * Implement `get_mean_vector()` for keyedvectors * Add test cases for get_mean_vector() * Implement get_sentence_vector() * Fix __contains__ method to consider no ngrams case * Add support for ndarrray in get_mean_vector * Resolve merge conflicts * Refactor rank_by_centrality & n_similarity * Add post-normalization in get_mean_vector() * Refactor iteration and Improve doc comment Resolve merge conflicts in most_similar * Convert 2D ndarray to list in most_similar Co-authored-by: Michael Penkov --- gensim/models/fasttext.py | 24 ++++++- gensim/models/keyedvectors.py | 119 +++++++++++++++++++++++-------- gensim/test/test_keyedvectors.py | 29 ++++++++ 3 files changed, 141 insertions(+), 31 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 6d992d9b94..5ea5077a0c 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -1045,7 +1045,7 @@ def __contains__(self, word): Note ---- - This method **always** returns True, because of the way FastText works. + This method **always** returns True with char ngrams, because of the way FastText works. If you want to check if a word is an in-vocabulary term, use this instead: @@ -1059,7 +1059,10 @@ def __contains__(self, word): False """ - return True + if self.bucket == 0: # check for the case when char ngrams not used + return word in self.key_to_index + else: + return True def save(self, *args, **kwargs): """Save object. @@ -1131,6 +1134,23 @@ def get_vector(self, word, norm=False): else: return word_vec / len(ngram_hashes) + def get_sentence_vector(self, sentence): + """Get a single 1-D vector representation for a given `sentence`. + This function is workalike of the official fasttext's get_sentence_vector(). + + Parameters + ---------- + sentence : list of (str or int) + list of words specified by string or int ids. + + Returns + ------- + numpy.ndarray + 1-D numpy array representation of the `sentence`. 
+ + """ + return super(FastTextKeyedVectors, self).get_mean_vector(sentence) + def resize_vectors(self, seed=0): """Make underlying vectors match 'index_to_key' size; random-initialize any new rows.""" diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index a1f1d22df8..0dd043c2df 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -174,8 +174,8 @@ from typing import Iterable from numpy import ( - dot, float32 as REAL, double, array, zeros, vstack, - ndarray, sum as np_sum, prod, argmax, dtype, ascontiguousarray, frombuffer, + dot, float32 as REAL, double, zeros, vstack, ndarray, + sum as np_sum, prod, argmax, dtype, ascontiguousarray, frombuffer, ) import numpy as np from scipy import stats @@ -203,6 +203,9 @@ def _ensure_list(value): if isinstance(value, _KEY_TYPES) or (isinstance(value, ndarray) and len(value.shape) == 1): return [value] + if isinstance(value, ndarray) and len(value.shape) == 2: + return list(value) + return value @@ -453,6 +456,71 @@ def word_vec(self, *args, **kwargs): """Compatibility alias for get_vector(); must exist so subclass calls reach subclass get_vector().""" return self.get_vector(*args, **kwargs) + def get_mean_vector(self, keys, weights=None, pre_normalize=True, post_normalize=False, ignore_missing=True): + """Get the mean vector for a given list of keys. + + Parameters + ---------- + + keys : list of (str or int or ndarray) + Keys specified by string or int ids or numpy array. + weights : list of float or numpy.ndarray, optional + 1D array of same size of `keys` specifying the weight for each key. + pre_normalize : bool, optional + Flag indicating whether to normalize each keyvector before taking mean. + If False, individual keyvector will not be normalized. + post_normalize: bool, optional + Flag indicating whether to normalize the final mean vector. + If True, normalized mean vector will be return. 
+ ignore_missing : bool, optional + If False, will raise error if a key doesn't exist in vocabulary. + + Returns + ------- + + numpy.ndarray + Mean vector for the list of keys. + + Raises + ------ + + ValueError + If the size of the list of `keys` and `weights` doesn't match. + KeyError + If any of the key doesn't exist in vocabulary and `ignore_missing` is false. + + """ + if len(keys) == 0: + raise ValueError("cannot compute mean with no input") + if isinstance(weights, list): + weights = np.array(weights) + if weights is None: + weights = np.ones(len(keys)) + if len(keys) != weights.shape[0]: # weights is a 1-D numpy array + raise ValueError( + "keys and weights array must have same number of elements" + ) + + mean = np.zeros(self.vector_size, self.vectors.dtype) + + total_weight = 0 + for idx, key in enumerate(keys): + if isinstance(key, ndarray): + mean += weights[idx] * key + total_weight += abs(weights[idx]) + elif self.__contains__(key): + vec = self.get_vector(key, norm=pre_normalize) + mean += weights[idx] * vec + total_weight += abs(weights[idx]) + elif not ignore_missing: + raise KeyError(f"Key '{key}' not present in vocabulary") + + if(total_weight > 0): + mean = mean / total_weight + if post_normalize: + mean = matutils.unitvec(mean).astype(REAL) + return mean + def add_vector(self, key, vector): """Add one new vector at the given key, into existing slot if available. @@ -717,10 +785,10 @@ def most_similar( Parameters ---------- - positive : list of (str or int or ndarray), optional - List of keys that contribute positively. - negative : list of (str or int or ndarray), optional - List of keys that contribute negatively. + positive : list of (str or int or ndarray) or list of ((str,float) or (int,float) or (ndarray,float)), optional + List of keys that contribute positively. 
If tuple, second element specifies the weight (default `1.0`) + negative : list of (str or int or ndarray) or list of ((str,float) or (int,float) or (ndarray,float)), optional + List of keys that contribute negatively. If tuple, second element specifies the weight (default `-1.0`) topn : int or None, optional Number of top-N similar keys to return, when `topn` is int. When `topn` is None, then similarities for all keys are returned. @@ -758,27 +826,20 @@ def most_similar( clip_end = restrict_vocab # add weights for each key, if not already present; default to 1.0 for positive and -1.0 for negative keys - positive = [ - (item, 1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item - for item in positive - ] - negative = [ - (item, -1.0) if isinstance(item, _EXTENDED_KEY_TYPES) else item - for item in negative - ] + keys = [] + weight = np.concatenate((np.ones(len(positive)), -1.0 * np.ones(len(negative)))) + for idx, item in enumerate(positive + negative): + if isinstance(item, _EXTENDED_KEY_TYPES): + keys.append(item) + else: + keys.append(item[0]) + weight[idx] = item[1] # compute the weighted average of all keys - all_keys, mean = set(), [] - for key, weight in positive + negative: - if isinstance(key, ndarray): - mean.append(weight * key) - else: - mean.append(weight * self.get_vector(key, norm=True)) - if self.has_index_for(key): - all_keys.add(self.get_index(key)) - if not mean: - raise ValueError("cannot compute similarity with no input") - mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) + mean = self.get_mean_vector(keys, weight, pre_normalize=True, post_normalize=True, ignore_missing=False) + all_keys = [ + self.get_index(key) for key in keys if isinstance(key, _KEY_TYPES) and self.has_index_for(key) + ] if indexer is not None and isinstance(topn, int): return indexer.most_similar(mean, topn) @@ -1059,7 +1120,7 @@ def rank_by_centrality(self, words, use_norm=True): if not used_words: raise ValueError("cannot select a word from an empty 
list") vectors = vstack([self.get_vector(word, norm=use_norm) for word in used_words]).astype(REAL) - mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) + mean = self.get_mean_vector(vectors, post_normalize=True) dists = dot(vectors, mean) return sorted(zip(dists, used_words), reverse=True) @@ -1191,9 +1252,9 @@ def n_similarity(self, ws1, ws2): """ if not(len(ws1) and len(ws2)): raise ZeroDivisionError('At least one of the passed list is empty.') - v1 = [self[key] for key in ws1] - v2 = [self[key] for key in ws2] - return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) + mean1 = self.get_mean_vector(ws1, pre_normalize=False) + mean2 = self.get_mean_vector(ws2, pre_normalize=False) + return dot(matutils.unitvec(mean1), matutils.unitvec(mean2)) @staticmethod def _log_evaluate_word_analogies(section): diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index d5eda547ea..cc70577842 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -366,6 +366,35 @@ def test_no_header(self): self.assertEqual(randkv.index_to_key, reloadtxtkv.index_to_key) self.assertTrue((randkv.vectors == reloadtxtkv.vectors).all()) + def test_get_mean_vector(self): + """Test get_mean_vector returns expected results.""" + keys = [ + 'conflict', + 'administration', + 'terrorism', + 'call', + 'an out-of-vocabulary word', + ] + weights = [1, 2, 3, 1, 2] + expected_result_1 = np.array([ + 0.02000151, -0.12685453, 0.09196121, 0.25514853, 0.25740655, + -0.11134843, -0.0502661, -0.19278568, -0.83346179, -0.12068878, + ], dtype=np.float32) + expected_result_2 = np.array([ + -0.0145228, -0.11530358, 0.1169825, 0.22537769, 0.29353586, + -0.10458107, -0.05272481, -0.17547795, -0.84245106, -0.10356515, + ], dtype=np.float32) + expected_result_3 = np.array([ + 0.01343237, -0.47651053, 0.45645328, 0.98304356, 1.1840123, + -0.51647933, -0.25308795, -0.77931081, -3.55954733, -0.55429711, + ], 
dtype=np.float32) + + self.assertTrue(np.allclose(self.vectors.get_mean_vector(keys), expected_result_1)) + self.assertTrue(np.allclose(self.vectors.get_mean_vector(keys, weights), expected_result_2)) + self.assertTrue(np.allclose( + self.vectors.get_mean_vector(keys, pre_normalize=False), expected_result_3) + ) + class Gensim320Test(unittest.TestCase): def test(self): From a4808c171f6adc8127de223987195cf2b4ceb9bd Mon Sep 17 00:00:00 2001 From: mark-todd <60781787+mark-todd@users.noreply.github.com> Date: Tue, 22 Mar 2022 10:20:17 +0000 Subject: [PATCH 47/81] Added new ValueError in place of assertion error for no model data provided in lsi model (#3271) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added new ValueError in place of assertion error for no model data provided in lsi model Added warning to lsi model for initialising a model with no data * Update lsimodel.py * Update gensim/models/lsimodel.py * Added better empty corpus testing * Moved is_empty function to utils * Update gensim/models/lsimodel.py Added space Co-authored-by: Radim Řehůřek * Update gensim/models/lsimodel.py Added import space Co-authored-by: Radim Řehůřek * Update gensim/utils.py Added space after False Co-authored-by: Radim Řehůřek * Moved import * Update utils.py * fix flake8 problem Co-authored-by: Radim Řehůřek Co-authored-by: Michael Penkov --- gensim/models/lsimodel.py | 7 +++++-- gensim/utils.py | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 6a407e860e..8f8c9c511a 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -70,6 +70,7 @@ from gensim import interfaces, matutils, utils from gensim.models import basemodel +from gensim.utils import is_empty logger = logging.getLogger(__name__) @@ -489,7 +490,8 @@ def add_documents(self, corpus, chunksize=None, decay=None): chunksize = self.chunksize if decay is None: decay = 
self.decay - + if is_empty(corpus): + logger.warning('LsiModel.add_documents() called but no documents provided, is this intended?') if not scipy.sparse.issparse(corpus): if not self.onepass: # we are allowed multiple passes over the input => use a faster, randomized two-pass algo @@ -590,7 +592,8 @@ def __getitem__(self, bow, scaled=False, chunksize=512): Latent representation of corpus in BoW format if `bow` is corpus. """ - assert self.projection.u is not None, "decomposition not initialized yet" + if self.projection.u is None: + raise ValueError('No training data provided - LSI model not initialized yet') # if the input vector is in fact a corpus, return a transformed corpus as a result is_corpus, bow = utils.is_corpus(bow) diff --git a/gensim/utils.py b/gensim/utils.py index d4fc6a71dc..78d64b88e6 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -30,6 +30,7 @@ from copy import deepcopy from datetime import datetime import platform +import types import numpy as np import scipy.sparse @@ -2084,3 +2085,19 @@ def effective_n_jobs(n_jobs): elif n_jobs < 0: n_jobs = max(multiprocessing.cpu_count() + 1 + n_jobs, 1) return n_jobs + + +def is_empty(corpus): + """Is the corpus (an iterable or a scipy.sparse array) empty?""" + if scipy.sparse.issparse(corpus): + return corpus.shape[1] == 0 # by convention, scipy.sparse documents are columns + if isinstance(corpus, types.GeneratorType): + return False # don't try to guess emptiness of generators, may lose elements irretrievably + try: + # list, numpy array etc + first_doc = next(iter(corpus)) # noqa: F841 (ignore unused variable) + return False # first document exists => not empty + except StopIteration: + return True + except Exception: + return False From 546de206fb7080614cf339d4da93997b61ff5dd5 Mon Sep 17 00:00:00 2001 From: austereantelope <95935342+austereantelope@users.noreply.github.com> Date: Tue, 22 Mar 2022 06:30:44 -0500 Subject: [PATCH 48/81] tighten test_topic_word (#3280) Co-authored-by: Michael Penkov 
--- gensim/test/test_ldaseqmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py index 0c18b2cf8c..90f6977410 100644 --- a/gensim/test/test_ldaseqmodel.py +++ b/gensim/test/test_ldaseqmodel.py @@ -214,7 +214,7 @@ def test_topic_word(self): topics = self.ldaseq.print_topics(0) expected_topic_word = [('skills', 0.035999999999999997)] self.assertEqual(topics[0][0][0], expected_topic_word[0][0]) - self.assertAlmostEqual(topics[0][0][1], expected_topic_word[0][1], places=2) + self.assertAlmostEqual(topics[0][0][1], expected_topic_word[0][1], delta=0.0012) # testing document-topic proportions def test_doc_topic(self): From acd83080818eb7abc4c39f8b50a0448a645cf413 Mon Sep 17 00:00:00 2001 From: austereantelope <95935342+austereantelope@users.noreply.github.com> Date: Tue, 22 Mar 2022 06:40:43 -0500 Subject: [PATCH 49/81] Update test_word2vec.py (#3281) Co-authored-by: Michael Penkov --- gensim/test/test_word2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 8edfe3c04c..09def2c733 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -840,7 +840,7 @@ def test_parallel(self): # the exact vectors and therefore similarities may differ, due to different thread collisions/randomization # so let's test only for top10 neighbor_rank = [word for word, sim in sims].index(expected_neighbor) - self.assertLess(neighbor_rank, 3) + self.assertLess(neighbor_rank, 5) def test_r_n_g(self): """Test word2vec results identical with identical RNG seed.""" From 4bfb777d5e8232334914a0120821817b4a4ed3c8 Mon Sep 17 00:00:00 2001 From: David Beauchemin Date: Tue, 22 Mar 2022 07:57:50 -0400 Subject: [PATCH 50/81] Typos, text and code fix in LDA tutorial (#3289) * typos, text and code fix * fix then, when training [...] 
missing for * fix cryptic doc text for debug mode Co-authored-by: Michael Penkov --- .../src/auto_examples/tutorials/run_lda.ipynb | 714 ++++++++++++------ docs/src/auto_examples/tutorials/run_lda.py | 34 +- docs/src/auto_examples/tutorials/run_lda.rst | 34 +- 3 files changed, 509 insertions(+), 273 deletions(-) diff --git a/docs/src/auto_examples/tutorials/run_lda.ipynb b/docs/src/auto_examples/tutorials/run_lda.ipynb index 363de86b07..12f3eb1865 100644 --- a/docs/src/auto_examples/tutorials/run_lda.ipynb +++ b/docs/src/auto_examples/tutorials/run_lda.ipynb @@ -1,241 +1,477 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n# LDA Model\n\nIntroduces Gensim's LDA model and demonstrates its use on the NIPS corpus.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import logging\nlogging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The purpose of this tutorial is to demonstrate how to train and tune an LDA model.\n\nIn this tutorial we will:\n\n* Load input data.\n* Pre-process that data.\n* Transform documents into bag-of-words vectors.\n* Train an LDA model.\n\nThis tutorial will **not**:\n\n* Explain how Latent Dirichlet Allocation works\n* Explain how the LDA model performs inference\n* Teach you all the parameters and options for Gensim's LDA implementation\n\nIf you are not familiar with the LDA model or how to use it in Gensim, I (Olavur Mortensen)\nsuggest you read up on that before continuing with this tutorial. Basic\nunderstanding of the LDA model should suffice. 
Examples:\n\n* `Introduction to Latent Dirichlet Allocation `_\n* Gensim tutorial: `sphx_glr_auto_examples_core_run_topics_and_transformations.py`\n* Gensim's LDA model API docs: :py:class:`gensim.models.LdaModel`\n\nI would also encourage you to consider each step when applying the model to\nyour data, instead of just blindly applying my solution. The different steps\nwill depend on your data and possibly your goal with the model.\n\n## Data\n\nI have used a corpus of NIPS papers in this tutorial, but if you're following\nthis tutorial just to learn about LDA I encourage you to consider picking a\ncorpus on a subject that you are familiar with. Qualitatively evaluating the\noutput of an LDA model is challenging and can require you to understand the\nsubject matter of your corpus (depending on your goal with the model).\n\nNIPS (Neural Information Processing Systems) is a machine learning conference\nso the subject matter should be well suited for most of the target audience\nof this tutorial. You can download the original data from Sam Roweis'\n`website `_. The code below will\nalso do that for you.\n\n.. 
Important::\n The corpus contains 1740 documents, and not particularly long ones.\n So keep in mind that this tutorial is not geared towards efficiency, and be\n careful before applying the code to a large dataset.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import io\nimport os.path\nimport re\nimport tarfile\n\nimport smart_open\n\ndef extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):\n with smart_open.open(url, \"rb\") as file:\n with tarfile.open(fileobj=file) as tar:\n for member in tar.getmembers():\n if member.isfile() and re.search(r'nipstxt/nips\\d+/\\d+\\.txt', member.name):\n member_bytes = tar.extractfile(member).read()\n yield member_bytes.decode('utf-8', errors='replace')\n\ndocs = list(extract_documents())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So we have a list of 1740 documents, where each document is a Unicode string.\nIf you're thinking about using your own corpus, then you need to make sure\nthat it's in the same format (list of Unicode strings) before proceeding\nwith the rest of this tutorial.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print(len(docs))\nprint(docs[0][:500])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pre-process and vectorize the documents\n\nAs part of preprocessing, we will:\n\n* Tokenize (split the documents into tokens).\n* Lemmatize the tokens.\n* Compute bigrams.\n* Compute a bag-of-words representation of the data.\n\nFirst we tokenize the text using a regular expression tokenizer from NLTK. We\nremove numeric tokens and tokens that are only a single character, as they\ndon't tend to be useful, and the dataset contains a lot of them.\n\n.. 
Important::\n\n This tutorial uses the nltk library for preprocessing, although you can\n replace it with something else if you want.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Tokenize the documents.\nfrom nltk.tokenize import RegexpTokenizer\n\n# Split the documents into tokens.\ntokenizer = RegexpTokenizer(r'\\w+')\nfor idx in range(len(docs)):\n docs[idx] = docs[idx].lower() # Convert to lowercase.\n docs[idx] = tokenizer.tokenize(docs[idx]) # Split into words.\n\n# Remove numbers, but not words that contain numbers.\ndocs = [[token for token in doc if not token.isnumeric()] for doc in docs]\n\n# Remove words that are only one character.\ndocs = [[token for token in doc if len(token) > 1] for doc in docs]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We use the WordNet lemmatizer from NLTK. A lemmatizer is preferred over a\nstemmer in this case because it produces more readable words. Output that is\neasy to read is very desirable in topic modelling.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Lemmatize the documents.\nfrom nltk.stem.wordnet import WordNetLemmatizer\n\nlemmatizer = WordNetLemmatizer()\ndocs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We find bigrams in the documents. Bigrams are sets of two adjacent words.\nUsing bigrams we can get phrases like \"machine_learning\" in our output\n(spaces are replaced with underscores); without bigrams we would only get\n\"machine\" and \"learning\".\n\nNote that in the code below, we find bigrams and then add them to the\noriginal data, because we would like to keep the words \"machine\" and\n\"learning\" as well as the bigram \"machine_learning\".\n\n.. 
Important::\n Computing n-grams of large dataset can be very computationally\n and memory intensive.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Compute bigrams.\nfrom gensim.models import Phrases\n\n# Add bigrams and trigrams to docs (only ones that appear 20 times or more).\nbigram = Phrases(docs, min_count=20)\nfor idx in range(len(docs)):\n for token in bigram[docs[idx]]:\n if '_' in token:\n # Token is a bigram, add to document.\n docs[idx].append(token)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We remove rare words and common words based on their *document frequency*.\nBelow we remove words that appear in less than 20 documents or in more than\n50% of the documents. Consider trying to remove words only based on their\nfrequency, or maybe combining that with this approach.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Remove rare and common tokens.\nfrom gensim.corpora import Dictionary\n\n# Create a dictionary representation of the documents.\ndictionary = Dictionary(docs)\n\n# Filter out words that occur less than 20 documents, or more than 50% of the documents.\ndictionary.filter_extremes(no_below=20, no_above=0.5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we transform the documents to a vectorized form. 
We simply compute\nthe frequency of each word, including the bigrams.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Bag-of-words representation of the documents.\ncorpus = [dictionary.doc2bow(doc) for doc in docs]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's see how many tokens and documents we have to train on.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print('Number of unique tokens: %d' % len(dictionary))\nprint('Number of documents: %d' % len(corpus))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training\n\nWe are ready to train the LDA model. We will first discuss how to set some of\nthe training parameters.\n\nFirst of all, the elephant in the room: how many topics do I need? There is\nreally no easy answer for this, it will depend on both your data and your\napplication. I have used 10 topics here because I wanted to have a few topics\nthat I could interpret and \"label\", and because that turned out to give me\nreasonably good results. You might not need to interpret all your topics, so\nyou could use a large number of topics, for example 100.\n\n``chunksize`` controls how many documents are processed at a time in the\ntraining algorithm. Increasing chunksize will speed up training, at least as\nlong as the chunk of documents easily fit into memory. I've set ``chunksize =\n2000``, which is more than the amount of documents, so I process all the\ndata in one go. Chunksize can however influence the quality of the model, as\ndiscussed in Hoffman and co-authors [2], but the difference was not\nsubstantial in this case.\n\n``passes`` controls how often we train the model on the entire corpus.\nAnother word for passes might be \"epochs\". 
``iterations`` is somewhat\ntechnical, but essentially it controls how often we repeat a particular loop\nover each document. It is important to set the number of \"passes\" and\n\"iterations\" high enough.\n\nI suggest the following way to choose iterations and passes. First, enable\nlogging (as described in many Gensim tutorials), and set ``eval_every = 1``\nin ``LdaModel``. When training the model look for a line in the log that\nlooks something like this::\n\n 2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations\n\nIf you set ``passes = 20`` you will see this line 20 times. Make sure that by\nthe final passes, most of the documents have converged. So you want to choose\nboth passes and iterations to be high enough for this to happen.\n\nWe set ``alpha = 'auto'`` and ``eta = 'auto'``. Again this is somewhat\ntechnical, but essentially we are automatically learning two parameters in\nthe model that we usually would have to specify explicitly.\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Train LDA model.\nfrom gensim.models import LdaModel\n\n# Set training parameters.\nnum_topics = 10\nchunksize = 2000\npasses = 20\niterations = 400\neval_every = None # Don't evaluate model perplexity, takes too much time.\n\n# Make a index to word dictionary.\ntemp = dictionary[0] # This is only to \"load\" the dictionary.\nid2word = dictionary.id2token\n\nmodel = LdaModel(\n corpus=corpus,\n id2word=id2word,\n chunksize=chunksize,\n alpha='auto',\n eta='auto',\n iterations=iterations,\n num_topics=num_topics,\n passes=passes,\n eval_every=eval_every\n)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can compute the topic coherence of each topic. 
Below we display the\naverage topic coherence and print the topics in order of topic coherence.\n\nNote that we use the \"Umass\" topic coherence measure here (see\n:py:func:`gensim.models.ldamodel.LdaModel.top_topics`), Gensim has recently\nobtained an implementation of the \"AKSW\" topic coherence measure (see\naccompanying blog post, http://rare-technologies.com/what-is-topic-coherence/).\n\nIf you are familiar with the subject of the articles in this dataset, you can\nsee that the topics below make a lot of sense. However, they are not without\nflaws. We can see that there is substantial overlap between some topics,\nothers are hard to interpret, and most of them have at least some terms that\nseem out of place. If you were able to do better, feel free to share your\nmethods on the blog at http://rare-technologies.com/lda-training-tips/ !\n\n\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "top_topics = model.top_topics(corpus) #, num_words=20)\n\n# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.\navg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics\nprint('Average topic coherence: %.4f.' 
% avg_topic_coherence)\n\nfrom pprint import pprint\npprint(top_topics)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Things to experiment with\n\n* ``no_above`` and ``no_below`` parameters in ``filter_extremes`` method.\n* Adding trigrams or even higher order n-grams.\n* Consider whether using a hold-out set or cross-validation is the way to go for you.\n* Try other datasets.\n\n## Where to go from here\n\n* Check out a RaRe blog post on the AKSW topic coherence measure (http://rare-technologies.com/what-is-topic-coherence/).\n* pyLDAvis (https://pyldavis.readthedocs.io/en/latest/index.html).\n* Read some more Gensim tutorials (https://github.com/RaRe-Technologies/gensim/blob/develop/tutorials.md#tutorials).\n* If you haven't already, read [1] and [2] (see references).\n\n## References\n\n1. \"Latent Dirichlet Allocation\", Blei et al. 2003.\n2. \"Online Learning for Latent Dirichlet Allocation\", Hoffman et al. 2010.\n\n\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# LDA Model\n", + "\n", + "Introduces Gensim's LDA model and demonstrates its use on the NIPS corpus.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import logging\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
level=logging.INFO)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The purpose of this tutorial is to demonstrate how to train and tune an LDA model.\n", + "\n", + "In this tutorial we will:\n", + "\n", + "* Load input data.\n", + "* Pre-process that data.\n", + "* Transform documents into bag-of-words vectors.\n", + "* Train an LDA model.\n", + "\n", + "This tutorial will **not**:\n", + "\n", + "* Explain how Latent Dirichlet Allocation works\n", + "* Explain how the LDA model performs inference\n", + "* Teach you all the parameters and options for Gensim's LDA implementation\n", + "\n", + "If you are not familiar with the LDA model or how to use it in Gensim, I (Olavur Mortensen)\n", + "suggest you read up on that before continuing with this tutorial. Basic\n", + "understanding of the LDA model should suffice. Examples:\n", + "\n", + "* `Introduction to Latent Dirichlet Allocation `_\n", + "* Gensim tutorial: `sphx_glr_auto_examples_core_run_topics_and_transformations.py`\n", + "* Gensim's LDA model API docs: :py:class:`gensim.models.LdaModel`\n", + "\n", + "I would also encourage you to consider each step when applying the model to\n", + "your data, instead of just blindly applying my solution. The different steps\n", + "will depend on your data and possibly your goal with the model.\n", + "\n", + "## Data\n", + "\n", + "I have used a corpus of NIPS papers in this tutorial, but if you're following\n", + "this tutorial just to learn about LDA I encourage you to consider picking a\n", + "corpus on a subject that you are familiar with. Qualitatively evaluating the\n", + "output of an LDA model is challenging and can require you to understand the\n", + "subject matter of your corpus (depending on your goal with the model).\n", + "\n", + "NIPS (Neural Information Processing Systems) is a machine learning conference\n", + "so the subject matter should be well suited for most of the target audience\n", + "of this tutorial. 
You can download the original data from Sam Roweis'\n", + "`website `_. The code below will\n", + "also do that for you.\n", + "\n", + ".. Important::\n", + " The corpus contains 1740 documents, and not particularly long ones.\n", + " So keep in mind that this tutorial is not geared towards efficiency, and be\n", + " careful before applying the code to a large dataset.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import re\n", + "import tarfile\n", + "\n", + "import smart_open\n", + "\n", + "def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):\n", + " with smart_open.open(url, \"rb\") as file:\n", + " with tarfile.open(fileobj=file) as tar:\n", + " for member in tar.getmembers():\n", + " if member.isfile() and re.search(r'nipstxt/nips\\d+/\\d+\\.txt', member.name):\n", + " member_bytes = tar.extractfile(member).read()\n", + " yield member_bytes.decode('utf-8', errors='replace')\n", + "\n", + "docs = list(extract_documents())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So we have a list of 1740 documents, where each document is a Unicode string.\n", + "If you're thinking about using your own corpus, then you need to make sure\n", + "that it's in the same format (list of Unicode strings) before proceeding\n", + "with the rest of this tutorial.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print(len(docs))\n", + "print(docs[0][:500])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pre-process and vectorize the documents\n", + "\n", + "As part of preprocessing, we will:\n", + "\n", + "* Tokenize (split the documents into tokens).\n", + "* Lemmatize the tokens.\n", + "* Compute bigrams.\n", + "* Compute a bag-of-words representation of the data.\n", + "\n", 
+ "First we tokenize the text using a regular expression tokenizer from NLTK. We\n", + "remove numeric tokens and tokens that are only a single character, as they\n", + "don't tend to be useful, and the dataset contains a lot of them.\n", + "\n", + ".. Important::\n", + "\n", + " This tutorial uses the nltk library for preprocessing, although you can\n", + " replace it with something else if you want.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Tokenize the documents.\n", + "from nltk.tokenize import RegexpTokenizer\n", + "\n", + "# Split the documents into tokens.\n", + "tokenizer = RegexpTokenizer(r'\\w+')\n", + "for idx in range(len(docs)):\n", + " docs[idx] = docs[idx].lower() # Convert to lowercase.\n", + " docs[idx] = tokenizer.tokenize(docs[idx]) # Split into words.\n", + "\n", + "# Remove numbers, but not words that contain numbers.\n", + "docs = [[token for token in doc if not token.isnumeric()] for doc in docs]\n", + "\n", + "# Remove words that are only one character.\n", + "docs = [[token for token in doc if len(token) > 1] for doc in docs]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use the WordNet lemmatizer from NLTK. A lemmatizer is preferred over a\n", + "stemmer in this case because it produces more readable words. An output that is\n", + "easy to read is very desirable in topic modelling.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Lemmatize the documents.\n", + "from nltk.stem.wordnet import WordNetLemmatizer\n", + "\n", + "lemmatizer = WordNetLemmatizer()\n", + "docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We find bigrams in the documents. 
Bigrams are sets of two adjacent words.\n", + "Using bigrams we can get phrases like \"machine_learning\" in our output\n", + "(spaces are replaced with underscores); without bigrams we would only get\n", + "\"machine\" and \"learning\".\n", + "\n", + "Note that in the code below, we find bigrams and then add them to the\n", + "original data, because we would like to keep the words \"machine\" and\n", + "\"learning\" as well as the bigram \"machine_learning\".\n", + "\n", + ".. Important::\n", + " Computing n-grams of large dataset can be very computationally\n", + " and memory intensive.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Compute bigrams.\n", + "from gensim.models import Phrases\n", + "\n", + "# Add bigrams to docs (only ones that appear 20 times or more).\n", + "bigram = Phrases(docs, min_count=20)\n", + "for idx in range(len(docs)):\n", + " for token in bigram[docs[idx]]:\n", + " if '_' in token:\n", + " # Token is a bigram, add to document.\n", + " docs[idx].append(token)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We remove rare words and common words based on their *document frequency*.\n", + "Below we remove words that appear in less than 20 documents or in more than\n", + "50% of the documents. 
Consider trying to remove words only based on their\n", + "frequency, or maybe combining that with this approach.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Remove rare and common tokens.\n", + "from gensim.corpora import Dictionary\n", + "\n", + "# Create a dictionary representation of the documents.\n", + "dictionary = Dictionary(docs)\n", + "\n", + "# Filter out words that occur less than 20 documents, or more than 50% of the documents.\n", + "dictionary.filter_extremes(no_below=20, no_above=0.5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we transform the documents to a vectorized form. We simply compute\n", + "the frequency of each word, including the bigrams.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Bag-of-words representation of the documents.\n", + "corpus = [dictionary.doc2bow(doc) for doc in docs]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see how many tokens and documents we have to train on.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print('Number of unique tokens: %d' % len(dictionary))\n", + "print('Number of documents: %d' % len(corpus))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training\n", + "\n", + "We are ready to train the LDA model. We will first discuss how to set some of the training parameters.\n", + "\n", + "First of all, the elephant in the room: how many topics do I need?\n", + "There is really no easy answer for this. It will depend on both your\n", + "data and your application. 
I have used 10 topics here because I wanted\n", + "to have a few topics that I could interpret and \"label\", and because that\n", + "turned out to give me reasonably good results. On the other hand, you might\n", + "not need to interpret all your topics, so you could use many topics,\n", + "for example, 100.\n", + "\n", + "``chunksize`` controls how many documents are processed at a time in the\n", + "training algorithm. Increasing chunksize will speed up training, at least as\n", + "long as the chunk of documents easily fit into memory. I've set ``chunksize =\n", + "2000``, which is more than the number of documents, so I process all the\n", + "data in one go. However, chunksize can influence the quality of the model, as\n", + "discussed in Hoffman et al. [2], but the difference was not\n", + "substantial in this case.\n", + "\n", + "``passes`` controls how often we train the model on the entire corpus.\n", + "Another word for passes might be \"epochs\". ``iterations`` is somewhat\n", + "technical, but essentially it controls how often we repeat a particular loop\n", + "over each document. It is important to set the number of \"passes\" and\n", + "\"iterations\" high enough.\n", + "\n", + "I suggest the following way to choose iterations and passes. First, enable\n", + "logging (``logging.basicConfig(level=logging.DEBUG, format='PID:%(process)d:%(threadName)s - %(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s')``), and set ``eval_every = 1``\n", + "in ``LdaModel``. Then, when training the model, look for a line in the log that\n", + "looks something like this::\n", + "\n", + " 2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations\n", + "\n", + "If you set ``passes = 20`` you will see this line 20 times. Make sure that by\n", + "the final passes, most of the documents have converged.
So you want to choose\n", + "both passes and iterations to be high enough for this to happen.\n", + "\n", + "We set ``alpha = 'auto'`` and ``eta = 'auto'``. Again this is somewhat\n", + "technical, but essentially we are automatically learning two parameters in\n", + "the model that we usually would have to specify explicitly.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Train LDA model.\n", + "from gensim.models import LdaModel\n", + "\n", + "# Set training parameters.\n", + "num_topics = 10\n", + "chunksize = 2000\n", + "passes = 20\n", + "iterations = 400\n", + "eval_every = None # Don't evaluate model perplexity, takes too much time.\n", + "\n", + "# Make an index to word dictionary.\n", + "temp = dictionary[0] # This is only to \"load\" the dictionary.\n", + "id2word = dictionary.id2token\n", + "\n", + "model = LdaModel(\n", + " corpus=corpus,\n", + " id2word=id2word,\n", + " chunksize=chunksize,\n", + " alpha='auto',\n", + " eta='auto',\n", + " iterations=iterations,\n", + " num_topics=num_topics,\n", + " passes=passes,\n", + " eval_every=eval_every\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can compute the topic coherence of each topic. Below we display the\n", + "average topic coherence and print the topics in order of topic coherence.\n", + "\n", + "Note that we use the \"Umass\" topic coherence measure here (see\n", + ":py:func:`gensim.models.ldamodel.LdaModel.top_topics`), Gensim has recently\n", + "obtained an implementation of the \"AKSW\" topic coherence measure (see\n", + "accompanying blog post, http://rare-technologies.com/what-is-topic-coherence/).\n", + "\n", + "If you are familiar with the subject of the articles in this dataset, you can\n", + "see that the topics below make a lot of sense. However, they are not without\n", + "flaws. 
We can see that there is substantial overlap between some topics,\n", + "others are hard to interpret, and most of them have at least some terms that\n", + "seem out of place. If you were able to do better, feel free to share your\n", + "methods on the blog at http://rare-technologies.com/lda-training-tips/ !\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "top_topics = model.top_topics(corpus)\n", + "\n", + "# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.\n", + "avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics\n", + "print('Average topic coherence: %.4f.' % avg_topic_coherence)\n", + "\n", + "from pprint import pprint\n", + "pprint(top_topics)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Things to experiment with\n\n* ``no_above`` and ``no_below`` parameters in ``filter_extremes`` method.\n* Adding trigrams or even higher order n-grams.\n* Consider whether using a hold-out set or cross-validation is the way to go for you.\n* Try other datasets.\n\n## Where to go from here\n\n* Check out a RaRe blog post on the AKSW topic coherence measure (http://rare-technologies.com/what-is-topic-coherence/).\n* pyLDAvis (https://pyldavis.readthedocs.io/en/latest/index.html).\n* Read some more Gensim tutorials (https://github.com/RaRe-Technologies/gensim/blob/develop/tutorials.md#tutorials).\n* If you haven't already, read [1] and [2] (see references).\n\n## References\n\n1. \"Latent Dirichlet Allocation\", Blei et al. 2003.\n2. \"Online Learning for Latent Dirichlet Allocation\", Hoffman et al. 
2010.\n\n\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_lda.py b/docs/src/auto_examples/tutorials/run_lda.py index 2ec06a801c..00116db20e 100644 --- a/docs/src/auto_examples/tutorials/run_lda.py +++ b/docs/src/auto_examples/tutorials/run_lda.py @@ -58,8 +58,6 @@ # careful before applying the code to a large dataset. # -import io -import os.path import re import tarfile @@ -122,7 +120,7 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' ############################################################################### # We use the WordNet lemmatizer from NLTK. A lemmatizer is preferred over a -# stemmer in this case because it produces more readable words. Output that is +# stemmer in this case because it produces more readable words. An output that is # easy to read is very desirable in topic modelling. # @@ -151,7 +149,7 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # Compute bigrams. from gensim.models import Phrases -# Add bigrams and trigrams to docs (only ones that appear 20 times or more). +# Add bigrams to docs (only ones that appear 20 times or more). bigram = Phrases(docs, min_count=20) for idx in range(len(docs)): for token in bigram[docs[idx]]: @@ -197,19 +195,20 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # We are ready to train the LDA model. We will first discuss how to set some of # the training parameters. # -# First of all, the elephant in the room: how many topics do I need? 
There is -# really no easy answer for this, it will depend on both your data and your -# application. I have used 10 topics here because I wanted to have a few topics -# that I could interpret and "label", and because that turned out to give me -# reasonably good results. You might not need to interpret all your topics, so -# you could use a large number of topics, for example 100. +# First of all, the elephant in the room: how many topics do I need? +# There is really no easy answer for this. It will depend on both your +# data and your application. I have used 10 topics here because I wanted +# to have a few topics that I could interpret and "label", and because that +# turned out to give me reasonably good results. On the other hand, you might +# not need to interpret all your topics, so you could use many topics, +# for example, 100. # # ``chunksize`` controls how many documents are processed at a time in the # training algorithm. Increasing chunksize will speed up training, at least as # long as the chunk of documents easily fit into memory. I've set ``chunksize = -# 2000``, which is more than the amount of documents, so I process all the -# data in one go. Chunksize can however influence the quality of the model, as -# discussed in Hoffman and co-authors [2], but the difference was not +# 2000``, which is more than the number of documents, so I process all the +# data in one go. However, chunksize can influence the quality of the model, as +# discussed in Hoffman et al. [2], but the difference was not # substantial in this case. # # ``passes`` controls how often we train the model on the entire corpus. @@ -219,8 +218,9 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # "iterations" high enough. # # I suggest the following way to choose iterations and passes. First, enable -# logging (as described in many Gensim tutorials), and set ``eval_every = 1`` -# in ``LdaModel``.
When training the model look for a line in the log that +# logging (``logging.basicConfig(level=logging.DEBUG, format='PID:%(process)d:%(threadName)s - +# %(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s')``), and set ``eval_every = 1`` +# in ``LdaModel``. Then, when training the model, look for a line in the log that # looks something like this:: # # 2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations @@ -245,7 +245,7 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' iterations = 400 eval_every = None # Don't evaluate model perplexity, takes too much time. -# Make a index to word dictionary. +# Make an index to word dictionary. temp = dictionary[0] # This is only to "load" the dictionary. id2word = dictionary.id2token @@ -278,7 +278,7 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # methods on the blog at http://rare-technologies.com/lda-training-tips/ ! # -top_topics = model.top_topics(corpus) #, num_words=20) +top_topics = model.top_topics(corpus) # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics. avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics diff --git a/docs/src/auto_examples/tutorials/run_lda.rst b/docs/src/auto_examples/tutorials/run_lda.rst index 458fbee5c7..80abb74085 100644 --- a/docs/src/auto_examples/tutorials/run_lda.rst +++ b/docs/src/auto_examples/tutorials/run_lda.rst @@ -93,8 +93,6 @@ also do that for you. .. code-block:: default - import io - import os.path import re import tarfile @@ -250,7 +248,7 @@ don't tend to be useful, and the dataset contains a lot of them. .. GENERATED FROM PYTHON SOURCE LINES 124-128 We use the WordNet lemmatizer from NLTK. A lemmatizer is preferred over a -stemmer in this case because it produces more readable words. Output that is +stemmer in this case because it produces more readable words.
An output that is easy to read is very desirable in topic modelling. @@ -297,7 +295,7 @@ original data, because we would like to keep the words "machine" and # Compute bigrams. from gensim.models import Phrases - # Add bigrams and trigrams to docs (only ones that appear 20 times or more). + # Add bigrams to docs (only ones that appear 20 times or more). bigram = Phrases(docs, min_count=20) for idx in range(len(docs)): for token in bigram[docs[idx]]: @@ -426,19 +424,20 @@ Training We are ready to train the LDA model. We will first discuss how to set some of the training parameters. -First of all, the elephant in the room: how many topics do I need? There is -really no easy answer for this, it will depend on both your data and your -application. I have used 10 topics here because I wanted to have a few topics -that I could interpret and "label", and because that turned out to give me -reasonably good results. You might not need to interpret all your topics, so -you could use a large number of topics, for example 100. +First of all, the elephant in the room: how many topics do I need? +There is really no easy answer for this. It will depend on both your +data and your application. I have used 10 topics here because I wanted +to have a few topics that I could interpret and "label", and because that +turned out to give me reasonably good results. On the other hand, you might +not need to interpret all your topics, so you could use many topics, +for example, 100. ``chunksize`` controls how many documents are processed at a time in the training algorithm. Increasing chunksize will speed up training, at least as long as the chunk of documents easily fit into memory. I've set ``chunksize = -2000``, which is more than the amount of documents, so I process all the -data in one go. 
Chunksize can however influence the quality of the model, as -discussed in Hoffman and co-authors [2], but the difference was not +2000``, which is more than the number of documents, so I process all the +data in one go. However, chunksize can influence the quality of the model, as +discussed in Hoffman et al. [2], but the difference was not substantial in this case. ``passes`` controls how often we train the model on the entire corpus. @@ -448,8 +447,9 @@ over each document. It is important to set the number of "passes" and "iterations" high enough. I suggest the following way to choose iterations and passes. First, enable -logging (as described in many Gensim tutorials), and set ``eval_every = 1`` -in ``LdaModel``. When training the model look for a line in the log that +logging (``logging.basicConfig(level=logging.DEBUG, format='PID:%(process)d:%(threadName)s - +%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s')``), and set ``eval_every = 1`` +in ``LdaModel``. Then, when training the model, look for a line in the log that looks something like this:: 2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations @@ -479,7 +479,7 @@ the model that we usually would have to specify explicitly. iterations = 400 eval_every = None # Don't evaluate model perplexity, takes too much time. - # Make a index to word dictionary. + # Make an index to word dictionary. temp = dictionary[0] # This is only to "load" the dictionary. id2word = dictionary.id2token @@ -696,7 +696,7 @@ methods on the blog at http://rare-technologies.com/lda-training-tips/ ! .. code-block:: default - top_topics = model.top_topics(corpus) #, num_words=20) + top_topics = model.top_topics(corpus) # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics From 5746d3686bb60b0dd0def0c9faa35707973f24b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Wed, 23 Mar 2022 23:03:13 +0100 Subject: [PATCH 51/81] DM broken, always stalls --- docs/notebooks/doc2vec-wikipedia.ipynb | 428 ++++++++++++------------- gensim/models/word2vec.py | 14 +- 2 files changed, 219 insertions(+), 223 deletions(-) diff --git a/docs/notebooks/doc2vec-wikipedia.ipynb b/docs/notebooks/doc2vec-wikipedia.ipynb index 9bcf0ad56f..6add0580f9 100644 --- a/docs/notebooks/doc2vec-wikipedia.ipynb +++ b/docs/notebooks/doc2vec-wikipedia.ipynb @@ -173,7 +173,7 @@ "source": [ "The original paper had a vocabulary size of 915,715 word types, so we'll try to match it by setting `max_final_vocab` to 1,000,000 in the Doc2vec constructor.\n", "\n", - "Other critical parameters were left unspecified in the paper, so we'll go with a default window size of five (a prediction window of 5 tokens to either side), and downsampling of frequent words at 1e-5. It looks like the authors tried vector dimensionality of 100, 300, 1,000 & 10,000 in the paper (with 10k dims performing the best), but I'll only train with 300 dimensions here, to keep RAM in check on my laptop.\n", + "Other critical parameters were left unspecified in the paper, so we'll go with a default window size of five (a prediction window of 5 tokens to either side), and downsampling of frequent words at 1e-5. 
It looks like the authors tried vector dimensionality of 100, 300, 1,000 & 10,000 in the paper (with 10k dims performing the best), but I'll only train with 200 dimensions here, to keep RAM in check on my laptop.\n", "\n", "Feel free to tinker with these values yourself if you like:" ] @@ -189,26 +189,25 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-03-20 09:48:28,259 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-20T09:48:28.259891', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n", - "2022-03-20 09:48:28,262 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-20T09:48:28.262943', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n" + "2022-03-23 11:46:26,539 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-23T11:46:26.539501', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n", + "2022-03-23 11:46:26,541 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-23T11:46:26.541772', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n" ] } ], "source": [ "workers = multiprocessing.cpu_count() # train with 10 threads on my 10-core laptop\n", "\n", - "models = [\n", - " # PV-DBOW: paragraph vector in distributed bag of words mode\n", - " Doc2Vec(\n", - " dm=0, dbow_words=1, # dbow_words=1 to train word vectors at the same time too, not only DBOW\n", - " vector_size=100, window=8, sample=1e-5, epochs=10, workers=workers, 
max_final_vocab=1000000,\n", - " ),\n", - " # PV-DM: paragraph vector in distributed memory mode\n", - " Doc2Vec(\n", - " dm=1, dm_concat=0, dm_mean=1, # use average of context word vectors to train DM\n", - " vector_size=100, window=8, sample=1e-5, epochs=10, workers=workers, max_final_vocab=1000000,\n", - " )\n", - "]" + "# PV-DBOW: paragraph vector in distributed bag of words mode\n", + "model_dbow = Doc2Vec(\n", + " dm=0, dbow_words=1, # dbow_words=1 to train word vectors at the same time too, not only DBOW\n", + " vector_size=200, window=8, sample=1e-5, epochs=10, workers=workers, max_final_vocab=1000000,\n", + ")\n", + "\n", + "# PV-DM: paragraph vector in distributed memory mode\n", + "model_dm = Doc2Vec(\n", + " dm=1, dm_concat=0, dm_mean=1, # use average of context word vectors to train DM\n", + " vector_size=200, window=8, sample=1e-5, epochs=10, workers=workers, max_final_vocab=1000000,\n", + ")" ] }, { @@ -220,55 +219,55 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-03-20 09:48:31,797 : INFO : collecting all words and their counts\n", - "2022-03-20 09:48:31,803 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags\n", - "2022-03-20 09:50:27,549 : INFO : PROGRESS: at example #500000, processed 654950164 words (5658546 words/s), 3222179 word types, 500000 tags\n", - "2022-03-20 09:51:35,208 : INFO : PROGRESS: at example #1000000, processed 1018611068 words (5374966 words/s), 4480366 word types, 1000000 tags\n", - "2022-03-20 09:52:28,939 : INFO : PROGRESS: at example #1500000, processed 1305140647 words (5332757 words/s), 5420104 word types, 1500000 tags\n", - "2022-03-20 09:53:17,522 : INFO : PROGRESS: at example #2000000, processed 1550245240 words (5045171 words/s), 6188355 word types, 2000000 tags\n", - "2022-03-20 09:54:05,142 : INFO : PROGRESS: at example #2500000, processed 1790661139 words (5048717 words/s), 6941128 word types, 2500000 tags\n", - "2022-03-20 09:54:53,539 : INFO : PROGRESS: at 
example #3000000, processed 2028261627 words (4909433 words/s), 7664997 word types, 3000000 tags\n", - "2022-03-20 09:55:40,654 : INFO : PROGRESS: at example #3500000, processed 2264063867 words (5004899 words/s), 8347719 word types, 3500000 tags\n", - "2022-03-20 09:56:25,746 : INFO : PROGRESS: at example #4000000, processed 2488354257 words (4974154 words/s), 8971529 word types, 4000000 tags\n", - "2022-03-20 09:57:09,436 : INFO : PROGRESS: at example #4500000, processed 2703313059 words (4920183 words/s), 9605666 word types, 4500000 tags\n", - "2022-03-20 09:57:55,813 : INFO : PROGRESS: at example #5000000, processed 2925111571 words (4782561 words/s), 10217554 word types, 5000000 tags\n", - "2022-03-20 09:58:26,146 : INFO : collected 10427023 word types and 5176019 unique tags from a corpus of 5176019 examples and 2996051328 words\n", - "2022-03-20 09:58:29,677 : INFO : Doc2Vec lifecycle event {'msg': 'max_final_vocab=1000000 and min_count=5 resulted in calc_min_count=23, effective_min_count=23', 'datetime': '2022-03-20T09:58:29.677488', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-20 09:58:29,677 : INFO : Creating a fresh vocabulary\n", - "2022-03-20 09:58:33,171 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 retains 991887 unique words (9.51% of original 10427023, drops 9435136)', 'datetime': '2022-03-20T09:58:33.171030', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-20 09:58:33,171 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 leaves 2968296495 word corpus (99.07% of original 2996051328, drops 27754833)', 'datetime': '2022-03-20T09:58:33.171494', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) 
\\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-20 09:58:35,956 : INFO : deleting the raw counts dictionary of 10427023 items\n", - "2022-03-20 09:58:36,023 : INFO : sample=1e-05 downsamples 4155 most-common words\n", - "2022-03-20 09:58:36,023 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 1194754612.050565 word corpus (40.3%% of prior 2968296495)', 'datetime': '2022-03-20T09:58:36.023801', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-20 09:58:40,804 : INFO : estimated required memory for 991887 words and 100 dimensions: 4395064500 bytes\n", - "2022-03-20 09:58:40,804 : INFO : resetting layer weights\n", - "2022-03-20 09:58:42,744 : INFO : resetting layer weights\n" + "2022-03-23 11:46:26,921 : INFO : collecting all words and their counts\n", + "2022-03-23 11:46:26,926 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags\n", + "2022-03-23 11:48:29,646 : INFO : PROGRESS: at example #500000, processed 654950164 words (5336561 words/s), 3222179 word types, 500000 tags\n", + "2022-03-23 11:49:40,672 : INFO : PROGRESS: at example #1000000, processed 1018611068 words (5119067 words/s), 4480366 word types, 1000000 tags\n", + "2022-03-23 11:50:36,816 : INFO : PROGRESS: at example #1500000, processed 1305140647 words (5103506 words/s), 5420104 word types, 1500000 tags\n", + "2022-03-23 11:51:25,894 : INFO : PROGRESS: at example #2000000, processed 1550245240 words (4994178 words/s), 6188355 word types, 2000000 tags\n", + "2022-03-23 11:52:14,324 : INFO : PROGRESS: at example #2500000, processed 1790661139 words (4964223 words/s), 6941128 word types, 2500000 tags\n", + "2022-03-23 11:53:02,465 : INFO : PROGRESS: at example #3000000, processed 2028261627 words (4935656 words/s), 7664997 
word types, 3000000 tags\n", + "2022-03-23 11:53:52,510 : INFO : PROGRESS: at example #3500000, processed 2264063867 words (4711784 words/s), 8347719 word types, 3500000 tags\n", + "2022-03-23 11:54:39,637 : INFO : PROGRESS: at example #4000000, processed 2488354257 words (4759368 words/s), 8971529 word types, 4000000 tags\n", + "2022-03-23 11:55:25,929 : INFO : PROGRESS: at example #4500000, processed 2703313059 words (4643620 words/s), 9605666 word types, 4500000 tags\n", + "2022-03-23 11:56:13,244 : INFO : PROGRESS: at example #5000000, processed 2925111571 words (4687730 words/s), 10217554 word types, 5000000 tags\n", + "2022-03-23 11:56:43,409 : INFO : collected 10427023 word types and 5176019 unique tags from a corpus of 5176019 examples and 2996051328 words\n", + "2022-03-23 11:56:46,967 : INFO : Doc2Vec lifecycle event {'msg': 'max_final_vocab=1000000 and min_count=5 resulted in calc_min_count=23, effective_min_count=23', 'datetime': '2022-03-23T11:56:46.967882', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-23 11:56:46,968 : INFO : Creating a fresh vocabulary\n", + "2022-03-23 11:56:50,535 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 retains 991887 unique words (9.51% of original 10427023, drops 9435136)', 'datetime': '2022-03-23T11:56:50.535964', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-23 11:56:50,536 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 leaves 2968296495 word corpus (99.07% of original 2996051328, drops 27754833)', 'datetime': '2022-03-23T11:56:50.536397', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 
'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-23 11:56:53,313 : INFO : deleting the raw counts dictionary of 10427023 items\n", + "2022-03-23 11:56:53,376 : INFO : sample=1e-05 downsamples 4155 most-common words\n", + "2022-03-23 11:56:53,376 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 1194754612.050565 word corpus (40.3%% of prior 2968296495)', 'datetime': '2022-03-23T11:56:53.376525', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", + "2022-03-23 11:56:58,203 : INFO : estimated required memory for 991887 words and 200 dimensions: 7258981700 bytes\n", + "2022-03-23 11:56:58,204 : INFO : resetting layer weights\n", + "2022-03-23 11:57:02,030 : INFO : resetting layer weights\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "Doc2Vec\n" + "Doc2Vec\n", + "Doc2Vec\n" ] } ], "source": [ - "models[0].build_vocab(documents, progress_per=500000)\n", - "print(models[0])\n", + "model_dbow.build_vocab(documents, progress_per=500000)\n", + "print(model_dbow)\n", "\n", "# Save some time by copying the vocabulary structures from the DBOW model to the DM model.\n", "# Both models are built on top of exactly the same data, so there's no need to repeat the vocab-building step.\n", - "models[1].reset_from(models[0])\n", - "print(models[1])" + "model_dm.reset_from(model_dbow)\n", + "print(model_dm)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now we’re ready to train Doc2Vec on the entirety of the English Wikipedia. **Warning!** Training this DBOW model takes ~16 hours, and DM ~4 hours, on my 2021 laptop." + "Now we’re ready to train Doc2Vec on the entirety of the English Wikipedia. **Warning!** Training this DBOW model takes ~9 hours, and DM ~4 hours, on my 2021 laptop." 
] }, { @@ -282,66 +281,82 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-03-20 09:58:45,477 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 10 workers on 991887 vocabulary and 100 features, using sg=1 hs=0 sample=1e-05 negative=5 window=8 shrink_windows=True', 'datetime': '2022-03-20T09:58:45.477703', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", - "2022-03-20 09:58:46,489 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 133130 words/s, in_qsize 2, out_qsize 4\n", - "2022-03-20 10:28:46,500 : INFO : EPOCH 1 - PROGRESS: at 63.89% examples, 479007 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-20 10:40:41,145 : INFO : EPOCH - 1 : training on 2996051328 raw words (1198720461 effective words) took 2515.7s, 476504 effective words/s\n", - "2022-03-20 10:40:42,152 : INFO : EPOCH 2 - PROGRESS: at 0.01% examples, 508266 words/s, in_qsize 12, out_qsize 0\n", - "2022-03-20 11:10:42,144 : INFO : EPOCH 2 - PROGRESS: at 66.17% examples, 491918 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-20 11:21:35,923 : INFO : EPOCH - 2 : training on 2996051328 raw words (1198662660 effective words) took 2454.8s, 488296 effective words/s\n", - "2022-03-20 11:21:36,946 : INFO : EPOCH 3 - PROGRESS: at 0.01% examples, 480046 words/s, in_qsize 17, out_qsize 2\n", - "2022-03-20 11:51:37,033 : INFO : EPOCH 3 - PROGRESS: at 66.68% examples, 494875 words/s, in_qsize 17, out_qsize 0\n", - "2022-03-20 12:02:19,087 : INFO : EPOCH - 3 : training on 2996051328 raw words (1198716633 effective words) took 2443.1s, 490658 effective words/s\n", - "2022-03-20 12:02:20,107 : INFO : EPOCH 4 - PROGRESS: at 0.01% examples, 496547 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-20 12:32:20,140 : INFO : EPOCH 4 - PROGRESS: at 67.18% examples, 497696 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-20 12:42:46,250 : INFO : EPOCH - 4 : training on 
2996051328 raw words (1198700256 effective words) took 2427.1s, 493880 effective words/s\n", - "2022-03-20 12:42:47,258 : INFO : EPOCH 5 - PROGRESS: at 0.01% examples, 477438 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-20 13:12:47,245 : INFO : EPOCH 5 - PROGRESS: at 67.37% examples, 498714 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-20 13:23:09,733 : INFO : EPOCH - 5 : training on 2996051328 raw words (1198706312 effective words) took 2423.5s, 494619 effective words/s\n", - "2022-03-20 13:23:10,742 : INFO : EPOCH 6 - PROGRESS: at 0.01% examples, 508818 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-20 13:53:10,771 : INFO : EPOCH 6 - PROGRESS: at 67.18% examples, 497693 words/s, in_qsize 19, out_qsize 1\n", - "2022-03-20 14:03:38,965 : INFO : EPOCH - 6 : training on 2996051328 raw words (1198686418 effective words) took 2429.2s, 493449 effective words/s\n", - "2022-03-20 14:03:39,984 : INFO : EPOCH 7 - PROGRESS: at 0.01% examples, 493009 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-20 14:33:40,013 : INFO : EPOCH 7 - PROGRESS: at 65.76% examples, 489578 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-20 14:44:45,759 : INFO : EPOCH - 7 : training on 2996051328 raw words (1198679497 effective words) took 2466.8s, 485932 effective words/s\n", - "2022-03-20 14:44:46,784 : INFO : EPOCH 8 - PROGRESS: at 0.01% examples, 481341 words/s, in_qsize 18, out_qsize 1\n", - "2022-03-20 15:14:46,827 : INFO : EPOCH 8 - PROGRESS: at 67.23% examples, 497983 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-20 15:25:15,178 : INFO : EPOCH - 8 : training on 2996051328 raw words (1198737432 effective words) took 2429.4s, 493435 effective words/s\n", - "2022-03-20 15:25:16,182 : INFO : EPOCH 9 - PROGRESS: at 0.01% examples, 494052 words/s, in_qsize 16, out_qsize 0\n", - "2022-03-20 15:55:16,171 : INFO : EPOCH 9 - PROGRESS: at 64.71% examples, 483746 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-20 16:07:12,899 : INFO : EPOCH - 9 : training on 2996051328 raw words 
(1198676046 effective words) took 2517.7s, 476094 effective words/s\n", - "2022-03-20 16:07:13,918 : INFO : EPOCH 10 - PROGRESS: at 0.01% examples, 484059 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-20 16:37:13,967 : INFO : EPOCH 10 - PROGRESS: at 65.05% examples, 485618 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-20 16:48:27,957 : INFO : EPOCH - 10 : training on 2996051328 raw words (1198704348 effective words) took 2475.0s, 484327 effective words/s\n", - "2022-03-20 16:48:27,959 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (11986990063 effective words) took 24582.3s, 487627 effective words/s', 'datetime': '2022-03-20T16:48:27.959476', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", - "2022-03-20 16:48:27,960 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 10 workers on 991887 vocabulary and 100 features, using sg=0 hs=0 sample=1e-05 negative=5 window=8 shrink_windows=True', 'datetime': '2022-03-20T16:48:27.960338', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", - "2022-03-20 16:48:28,973 : INFO : EPOCH 1 - PROGRESS: at 0.01% examples, 834487 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 17:08:24,024 : INFO : EPOCH - 1 : training on 2996051328 raw words (1198718673 effective words) took 1196.1s, 1002220 effective words/s\n", - "2022-03-20 17:08:25,030 : INFO : EPOCH 2 - PROGRESS: at 0.02% examples, 1293557 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 17:28:24,924 : INFO : EPOCH - 2 : training on 2996051328 raw words (1198708592 effective words) took 1200.9s, 998186 effective words/s\n", - "2022-03-20 17:28:25,931 : INFO : EPOCH 3 - PROGRESS: at 0.02% examples, 1276259 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 17:48:18,838 : INFO 
: EPOCH - 3 : training on 2996051328 raw words (1198693921 effective words) took 1193.9s, 1004025 effective words/s\n", - "2022-03-20 17:48:19,842 : INFO : EPOCH 4 - PROGRESS: at 0.02% examples, 1304070 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 18:08:14,585 : INFO : EPOCH - 4 : training on 2996051328 raw words (1198686639 effective words) took 1195.7s, 1002469 effective words/s\n", - "2022-03-20 18:08:15,591 : INFO : EPOCH 5 - PROGRESS: at 0.02% examples, 1256541 words/s, in_qsize 0, out_qsize 1\n", - "2022-03-20 18:28:15,854 : INFO : EPOCH - 5 : training on 2996051328 raw words (1198698541 effective words) took 1201.3s, 997873 effective words/s\n", - "2022-03-20 18:28:16,859 : INFO : EPOCH 6 - PROGRESS: at 0.02% examples, 1243710 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 18:48:21,554 : INFO : EPOCH - 6 : training on 2996051328 raw words (1198688111 effective words) took 1205.7s, 994191 effective words/s\n", - "2022-03-20 18:48:22,560 : INFO : EPOCH 7 - PROGRESS: at 0.02% examples, 1245030 words/s, in_qsize 0, out_qsize 1\n", - "2022-03-20 19:08:16,696 : INFO : EPOCH - 7 : training on 2996051328 raw words (1198696981 effective words) took 1195.1s, 1002985 effective words/s\n", - "2022-03-20 19:08:17,713 : INFO : EPOCH 8 - PROGRESS: at 0.02% examples, 1313159 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 19:28:12,212 : INFO : EPOCH - 8 : training on 2996051328 raw words (1198693792 effective words) took 1195.5s, 1002650 effective words/s\n", - "2022-03-20 19:28:13,217 : INFO : EPOCH 9 - PROGRESS: at 0.02% examples, 1258160 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 19:48:09,326 : INFO : EPOCH - 9 : training on 2996051328 raw words (1198724296 effective words) took 1197.1s, 1001366 effective words/s\n", - "2022-03-20 19:48:10,336 : INFO : EPOCH 10 - PROGRESS: at 0.02% examples, 1319522 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-20 20:08:08,642 : INFO : EPOCH - 10 : training on 2996051328 raw words (1198697061 effective words) took 
1199.3s, 999484 effective words/s\n", - "2022-03-20 20:08:08,643 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (11987006607 effective words) took 11980.6s, 1000531 effective words/s', 'datetime': '2022-03-20T20:08:08.643623', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n" + "2022-03-23 11:57:10,178 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 10 workers on 991887 vocabulary and 200 features, using sg=1 hs=0 sample=1e-05 negative=5 window=8 shrink_windows=True', 'datetime': '2022-03-23T11:57:10.178111', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", + "2022-03-23 11:57:11,217 : INFO : EPOCH 0 - PROGRESS: at 0.00% examples, 2586 words/s, in_qsize 8, out_qsize 0\n", + "2022-03-23 12:27:11,295 : INFO : EPOCH 0 - PROGRESS: at 45.70% examples, 379076 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-23 12:48:46,182 : INFO : EPOCH 0: training on 2996051328 raw words (1198675664 effective words) took 3096.0s, 387174 effective words/s\n", + "2022-03-23 12:48:47,191 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 407128 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-23 13:18:47,205 : INFO : EPOCH 1 - PROGRESS: at 48.78% examples, 396088 words/s, in_qsize 18, out_qsize 1\n", + "2022-03-23 13:39:22,059 : INFO : EPOCH 1: training on 2996051328 raw words (1198698563 effective words) took 3035.8s, 394848 effective words/s\n", + "2022-03-23 13:39:23,077 : INFO : EPOCH 2 - PROGRESS: at 0.00% examples, 382976 words/s, in_qsize 19, out_qsize 1\n", + "2022-03-23 14:09:23,122 : INFO : EPOCH 2 - PROGRESS: at 50.09% examples, 403393 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-23 14:29:15,186 : INFO : EPOCH 2: training on 2996051328 raw words (1198677402 effective words) 
took 2993.1s, 400483 effective words/s\n", + "2022-03-23 14:29:16,190 : INFO : EPOCH 3 - PROGRESS: at 0.00% examples, 388313 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-23 14:59:16,204 : INFO : EPOCH 3 - PROGRESS: at 51.43% examples, 410626 words/s, in_qsize 19, out_qsize 1\n", + "2022-03-23 15:18:21,724 : INFO : EPOCH 3: training on 2996051328 raw words (1198678276 effective words) took 2946.5s, 406814 effective words/s\n", + "2022-03-23 15:18:22,733 : INFO : EPOCH 4 - PROGRESS: at 0.00% examples, 401973 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-23 15:48:22,713 : INFO : EPOCH 4 - PROGRESS: at 51.08% examples, 408593 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-23 16:07:40,817 : INFO : EPOCH 4: training on 2996051328 raw words (1198689651 effective words) took 2959.1s, 405082 effective words/s\n", + "2022-03-23 16:07:41,822 : INFO : EPOCH 5 - PROGRESS: at 0.00% examples, 396007 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-23 16:37:41,889 : INFO : EPOCH 5 - PROGRESS: at 50.83% examples, 407325 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-23 16:57:06,451 : INFO : EPOCH 5: training on 2996051328 raw words (1198721998 effective words) took 2965.6s, 404214 effective words/s\n", + "2022-03-23 16:57:07,456 : INFO : EPOCH 6 - PROGRESS: at 0.00% examples, 385000 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-23 17:27:07,478 : INFO : EPOCH 6 - PROGRESS: at 51.20% examples, 409302 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-23 17:47:19,873 : INFO : EPOCH 6: training on 2996051328 raw words (1198686792 effective words) took 3013.4s, 397785 effective words/s\n", + "2022-03-23 17:47:20,887 : INFO : EPOCH 7 - PROGRESS: at 0.00% examples, 360937 words/s, in_qsize 19, out_qsize 0\n", + "2022-03-23 18:17:20,839 : INFO : EPOCH 7 - PROGRESS: at 43.05% examples, 365059 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-23 18:41:23,903 : INFO : EPOCH 7: training on 2996051328 raw words (1198695760 effective words) took 3244.0s, 369508 effective words/s\n", + 
"2022-03-23 18:41:24,920 : INFO : EPOCH 8 - PROGRESS: at 0.00% examples, 378286 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-23 19:11:24,921 : INFO : EPOCH 8 - PROGRESS: at 48.73% examples, 395820 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-23 19:33:01,653 : INFO : EPOCH 8: training on 2996051328 raw words (1198722784 effective words) took 3097.7s, 386971 effective words/s\n", + "2022-03-23 19:33:02,670 : INFO : EPOCH 9 - PROGRESS: at 0.00% examples, 374366 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-23 20:03:02,691 : INFO : EPOCH 9 - PROGRESS: at 47.88% examples, 391016 words/s, in_qsize 20, out_qsize 0\n", + "2022-03-23 20:24:57,297 : INFO : EPOCH 9: training on 2996051328 raw words (1198712861 effective words) took 3115.6s, 384741 effective words/s\n", + "2022-03-23 20:24:57,299 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (11986959751 effective words) took 30466.9s, 393442 effective words/s', 'datetime': '2022-03-23T20:24:57.299122', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n" + ] + } + ], + "source": [ + "# Train DBOW doc2vec incl. 
word vectors\n", + "model_dbow.train(documents, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs, report_delay=30*60)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-03-23 21:06:50,772 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 10 workers on 991887 vocabulary and 200 features, using sg=0 hs=0 sample=1e-05 negative=5 window=8 shrink_windows=True', 'datetime': '2022-03-23T21:06:50.772480', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", + "2022-03-23 21:06:51,779 : INFO : EPOCH 0 - PROGRESS: at 0.01% examples, 774441 words/s, in_qsize 0, out_qsize 0\n", + "2022-03-23 21:28:47,548 : INFO : EPOCH 0: training on 2996051328 raw words (1198677606 effective words) took 1316.7s, 910333 effective words/s\n", + "2022-03-23 21:28:48,576 : INFO : EPOCH 1 - PROGRESS: at 0.02% examples, 1133718 words/s, in_qsize 0, out_qsize 0\n" + ] + }, + { + "ename": "KeyboardInterrupt", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", + "Input \u001b[0;32mIn [8]\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Train DM doc2vec\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mmodel_dm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtotal_examples\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_dm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcorpus_count\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_dm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mepochs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreport_delay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m30\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m60\u001b[39;49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/Volumes/work/workspace/gensim/trunk/gensim/models/doc2vec.py:516\u001b[0m, in \u001b[0;36mDoc2Vec.train\u001b[0;34m(self, corpus_iterable, corpus_file, total_examples, total_words, epochs, start_alpha, end_alpha, word_count, queue_factor, report_delay, callbacks, **kwargs)\u001b[0m\n\u001b[1;32m 513\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124moffsets\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m offsets\n\u001b[1;32m 514\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstart_doctags\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m start_doctags\n\u001b[0;32m--> 516\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mDoc2Vec\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 517\u001b[0m \u001b[43m \u001b[49m\u001b[43mcorpus_iterable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus_iterable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcorpus_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 518\u001b[0m \u001b[43m \u001b[49m\u001b[43mtotal_examples\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal_examples\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtotal_words\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 519\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mepochs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart_alpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstart_alpha\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mend_alpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mend_alpha\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword_count\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mword_count\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 520\u001b[0m \u001b[43m \u001b[49m\u001b[43mqueue_factor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mqueue_factor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreport_delay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreport_delay\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/Volumes/work/workspace/gensim/trunk/gensim/models/word2vec.py:1070\u001b[0m, in \u001b[0;36mWord2Vec.train\u001b[0;34m(self, corpus_iterable, corpus_file, total_examples, total_words, epochs, start_alpha, end_alpha, word_count, queue_factor, report_delay, compute_loss, callbacks, **kwargs)\u001b[0m\n\u001b[1;32m 1067\u001b[0m callback\u001b[38;5;241m.\u001b[39mon_epoch_begin(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 1069\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m corpus_iterable \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1070\u001b[0m trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_train_epoch\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1071\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mcorpus_iterable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcur_epoch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcur_epoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtotal_examples\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal_examples\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1072\u001b[0m \u001b[43m \u001b[49m\u001b[43mtotal_words\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal_words\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mqueue_factor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mqueue_factor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreport_delay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreport_delay\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1073\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1074\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1075\u001b[0m trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_epoch_corpusfile(\n\u001b[1;32m 1076\u001b[0m corpus_file, cur_epoch\u001b[38;5;241m=\u001b[39mcur_epoch, total_examples\u001b[38;5;241m=\u001b[39mtotal_examples, total_words\u001b[38;5;241m=\u001b[39mtotal_words,\n\u001b[1;32m 1077\u001b[0m callbacks\u001b[38;5;241m=\u001b[39mcallbacks, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[0;32m/Volumes/work/workspace/gensim/trunk/gensim/models/word2vec.py:1431\u001b[0m, in \u001b[0;36mWord2Vec._train_epoch\u001b[0;34m(self, data_iterable, cur_epoch, total_examples, total_words, queue_factor, report_delay, callbacks)\u001b[0m\n\u001b[1;32m 1428\u001b[0m 
thread\u001b[38;5;241m.\u001b[39mdaemon \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m \u001b[38;5;66;03m# make interrupting the process with ctrl+c easier\u001b[39;00m\n\u001b[1;32m 1429\u001b[0m thread\u001b[38;5;241m.\u001b[39mstart()\n\u001b[0;32m-> 1431\u001b[0m trained_word_count, raw_word_count, job_tally \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_epoch_progress\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1432\u001b[0m \u001b[43m \u001b[49m\u001b[43mprogress_queue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjob_queue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcur_epoch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcur_epoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtotal_examples\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal_examples\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1433\u001b[0m \u001b[43m \u001b[49m\u001b[43mtotal_words\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal_words\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreport_delay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreport_delay\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_corpus_file_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1434\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1436\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m trained_word_count, raw_word_count, job_tally\n", + "File \u001b[0;32m/Volumes/work/workspace/gensim/trunk/gensim/models/word2vec.py:1286\u001b[0m, in \u001b[0;36mWord2Vec._log_epoch_progress\u001b[0;34m(self, progress_queue, job_queue, cur_epoch, total_examples, total_words, report_delay, is_corpus_file_mode)\u001b[0m\n\u001b[1;32m 1283\u001b[0m unfinished_worker_count \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mworkers\n\u001b[1;32m 1285\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m unfinished_worker_count \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m-> 1286\u001b[0m report \u001b[38;5;241m=\u001b[39m \u001b[43mprogress_queue\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# blocks if workers too slow\u001b[39;00m\n\u001b[1;32m 1287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m report \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m: \u001b[38;5;66;03m# a thread reporting that it finished\u001b[39;00m\n\u001b[1;32m 1288\u001b[0m unfinished_worker_count \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", + "File \u001b[0;32m/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/queue.py:170\u001b[0m, in \u001b[0;36mQueue.get\u001b[0;34m(self, block, timeout)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 169\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_qsize():\n\u001b[0;32m--> 170\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnot_empty\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m timeout \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimeout\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m must be a non-negative 
number\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/threading.py:302\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m 301\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 302\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 303\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "\u001b[0;31mKeyboardInterrupt\u001b[0m: " ] } ], "source": [ - "for model in models:\n", - " model.train(documents, total_examples=model.corpus_count, epochs=model.epochs, report_delay=30*60)" + "# Train DM doc2vec\n", + "model_dm.train(documents, total_examples=model_dm.corpus_count, epochs=model_dm.epochs, report_delay=30*60)" ] }, { @@ -362,60 +377,60 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "[('Pattern recognition', 0.8360552787780762),\n", - " ('Supervised learning', 0.8315915465354919),\n", - " ('Artificial neural network', 0.8176121115684509),\n", - " ('Deep learning', 0.7854599952697754),\n", - " ('Bayesian optimization', 0.7843242287635803),\n", - " ('Ensemble learning', 0.7767862677574158),\n", - " ('Outline of machine learning', 0.7751106023788452),\n", - " ('Intelligent control', 0.7724385261535645),\n", - " ('Neural Designer', 0.7724118828773499),\n", - " ('Incremental learning', 0.7718138098716736),\n", - " ('Behavior selection algorithm', 
0.7676114439964294),\n", - " ('Directed information', 0.7646719217300415),\n", - " ('Boosting (machine learning)', 0.7642883658409119),\n", - " ('Discriminative model', 0.7642679214477539),\n", - " ('Algorithmic technique', 0.7640156745910645),\n", - " ('Outline of computer science', 0.7638604640960693),\n", - " ('Numenta', 0.7621665596961975),\n", - " ('Feature selection', 0.761879026889801),\n", - " ('Multiway data analysis', 0.7613278031349182),\n", - " ('Types of artificial neural networks', 0.7612452507019043)]\n", - "Doc2Vec\n", - "[('Pattern recognition', 0.7676337361335754),\n", - " ('Supervised learning', 0.765723466873169),\n", - " ('Artificial neural network', 0.7397017478942871),\n", - " ('Deep learning', 0.7371508479118347),\n", - " ('Semi-supervised learning', 0.7312546968460083),\n", - " ('Statistical learning theory', 0.72916179895401),\n", - " ('Multi-task learning', 0.7289299368858337),\n", - " ('Data analysis techniques for fraud detection', 0.7225874066352844),\n", - " ('Similarity learning', 0.7212273478507996),\n", - " ('Symbolic artificial intelligence', 0.7145661115646362),\n", - " ('Autoencoder', 0.7123140096664429),\n", - " ('Naive Bayes classifier', 0.7108708024024963),\n", - " ('Cognitive model', 0.7059794068336487),\n", - " ('Predictive Model Markup Language', 0.7042246460914612),\n", - " ('Support-vector machine', 0.7018824815750122),\n", - " ('Regularization (mathematics)', 0.7006190419197083),\n", - " ('Linear classifier', 0.699646532535553),\n", - " ('John Robert Anderson (psychologist)', 0.696628749370575),\n", - " ('Multiclass classification', 0.6958213448524475),\n", - " ('Image segmentation', 0.6927947402000427)]\n" + "Doc2Vec\n", + "[('Pattern recognition', 0.7641374468803406),\n", + " ('Multi-task learning', 0.7290244698524475),\n", + " ('Supervised learning', 0.7212514877319336),\n", + " ('Incremental learning', 0.7164462208747864),\n", + " ('Deep learning', 0.7093881964683533),\n", + " ('Predictive analytics', 
0.7086609601974487),\n", + " ('Semi-supervised learning', 0.7068915367126465),\n", + " ('Outline of machine learning', 0.7035143971443176),\n", + " ('Artificial neural network', 0.6998467445373535),\n", + " ('Ensemble learning', 0.6948938965797424),\n", + " ('Intelligent control', 0.6883038878440857),\n", + " ('Statistical classification', 0.6876234412193298),\n", + " ('Rule induction', 0.6867162585258484),\n", + " ('Boosting (machine learning)', 0.685867190361023),\n", + " ('Feature selection', 0.6836000084877014),\n", + " ('Training, validation, and test sets', 0.6823415160179138),\n", + " ('Support-vector machine', 0.6810059547424316),\n", + " ('Perceptron', 0.6794257760047913),\n", + " ('Multilayer perceptron', 0.6773776412010193),\n", + " ('Neural network', 0.6765708923339844)]\n", + "Doc2Vec\n", + "[('Pattern recognition', 0.7597464323043823),\n", + " ('Support-vector machine', 0.7284112572669983),\n", + " ('Bayesian network', 0.7256077527999878),\n", + " ('Naive Bayes classifier', 0.7218978404998779),\n", + " ('Hidden Markov model', 0.7194668054580688),\n", + " ('Learning classifier system', 0.7183035016059875),\n", + " ('Boosting (machine learning)', 0.7128430604934692),\n", + " ('Conditional random field', 0.7125300168991089),\n", + " ('Semi-supervised learning', 0.7124624252319336),\n", + " ('Multi-task learning', 0.7108726501464844),\n", + " ('GeneMark', 0.708616316318512),\n", + " ('Deep learning', 0.7016053795814514),\n", + " ('Supervised learning', 0.6973129510879517),\n", + " ('Data analysis techniques for fraud detection', 0.6920328140258789),\n", + " ('Artificial neural network', 0.6897733807563782),\n", + " ('Mixture model', 0.688715398311615),\n", + " ('Symbolic artificial intelligence', 0.6857218742370605),\n", + " ('Meta learning (computer science)', 0.6849099397659302),\n", + " ('Grammar induction', 0.6836742758750916),\n", + " ('Intelligent agent', 0.6833598613739014)]\n" ] } ], "source": [ - "for model in models:\n", + "for model in 
[model_dbow, model_dm]:\n", " print(model)\n", " pprint(model.dv.most_similar(positive=[\"Machine learning\"], topn=20))" ] @@ -431,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "metadata": { "scrolled": false }, @@ -440,33 +455,33 @@ "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "[('Ariana Grande', 0.8456711769104004),\n", - " ('Katy Perry', 0.8200861811637878),\n", - " ('Mariah Carey', 0.8129399418830872),\n", - " ('Beyoncé', 0.8065741658210754),\n", - " ('Cardi B', 0.8001681566238403),\n", - " ('Harry Styles', 0.8001224398612976),\n", - " ('Rihanna', 0.7915332913398743),\n", - " ('Cher', 0.7864869236946106),\n", - " ('Adele', 0.7830130457878113),\n", - " ('Britney Spears', 0.78132164478302)]\n", - "Doc2Vec\n", - "[('Katy Perry', 0.7350542545318604),\n", - " ('Joanne (album)', 0.7215949296951294),\n", - " ('Britney Spears', 0.7209619879722595),\n", - " ('Kesha', 0.71454918384552),\n", - " ('Artpop', 0.7129204869270325),\n", - " ('Taylor Swift', 0.7099292874336243),\n", - " ('Cardi B', 0.7032613158226013),\n", - " ('Beyoncé', 0.69660484790802),\n", - " ('Madonna', 0.6904603838920593),\n", - " ('Beautiful, Dirty, Rich', 0.6878629922866821)]\n" + "Doc2Vec\n", + "[('Katy Perry', 0.7140653133392334),\n", + " ('Ariana Grande', 0.6990166306495667),\n", + " ('Demi Lovato', 0.6782864332199097),\n", + " ('Miley Cyrus', 0.6620475053787231),\n", + " ('List of awards and nominations received by Lady Gaga', 0.6562342047691345),\n", + " ('Christina Aguilera', 0.6527020335197449),\n", + " ('Taylor Swift', 0.6430284380912781),\n", + " ('Adele', 0.6412620544433594),\n", + " ('Adam Lambert', 0.6401858329772949),\n", + " ('Halsey (singer)', 0.637832760810852)]\n", + "Doc2Vec\n", + "[('Katy Perry', 0.6719839572906494),\n", + " ('Ariana Grande', 0.6502904295921326),\n", + " ('Taylor Swift', 0.6452381014823914),\n", + " ('Artpop', 0.6417931914329529),\n", + " ('Christina Aguilera', 0.634290337562561),\n", + " ('Nicki 
Minaj', 0.6294941902160645),\n", + " ('Adam Lambert', 0.6128465533256531),\n", + " ('Kesha', 0.6105154156684875),\n", + " ('Born This Way (album)', 0.6087599992752075),\n", + " ('Adele', 0.6087093353271484)]\n" ] } ], "source": [ - "for model in models:\n", + "for model in [model_dbow, model_dm]:\n", " print(model)\n", " pprint(model.dv.most_similar(positive=[\"Lady Gaga\"], topn=10))" ] @@ -486,7 +501,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "metadata": { "scrolled": false }, @@ -495,33 +510,33 @@ "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "[('Last Angel', 0.7589371204376221),\n", - " ('Kyary Pamyu Pamyu', 0.7432413697242737),\n", - " ('Hottaraka Series', 0.7406142354011536),\n", - " ('Blackpink', 0.739910364151001),\n", - " ('2NE1', 0.7388721108436584),\n", - " ('Nanda Collection', 0.7339783310890198),\n", - " ('Ayumi Hamasaki', 0.7316932082176208),\n", - " ('In the Middle (Ai song)', 0.7270604968070984),\n", - " ('Duty (album)', 0.724769115447998),\n", - " ('Change Myself', 0.7235770225524902)]\n", - "Doc2Vec\n", - "[('Ayumi Hamasaki', 0.6632838845252991),\n", - " ('Artpop', 0.657765805721283),\n", - " ('Pink Lady (duo)', 0.6549235582351685),\n", - " ('Free Free', 0.651936948299408),\n", - " ('Taboo (Koda Kumi song)', 0.6419629454612732),\n", - " ('Princess Princess (band)', 0.6365096569061279),\n", - " ('Radwimps', 0.6351915001869202),\n", - " ('Dempagumi.inc', 0.6337336897850037),\n", - " ('Headbanger (Babymetal song)', 0.6331109404563904),\n", - " ('Joanne (album)', 0.6326634287834167)]\n" + "Doc2Vec\n", + "[('Katy Perry', 0.6580742001533508),\n", + " ('Kōsui (Eito song)', 0.6197645664215088),\n", + " ('2NE1', 0.6172165274620056),\n", + " ('Ariana Grande', 0.608268678188324),\n", + " ('Alex York', 0.5975368618965149),\n", + " ('Thank You, Love (Kana Nishino album)', 0.5951482653617859),\n", + " ('X -Cross-', 0.5949676632881165),\n", + " ('Megitsune', 0.5922212600708008),\n", + " ('7 Spirits', 
0.5915307998657227),\n", + " ('Audience (Ayumi Hamasaki song)', 0.5913636088371277)]\n", + "Doc2Vec\n", + "[('Morning Musume', 0.6124014854431152),\n", + " ('Yuko Ando (singer)', 0.6063645482063293),\n", + " ('Yumi Matsutoya', 0.6047919392585754),\n", + " ('J-pop', 0.5908822417259216),\n", + " ('Ayumi Hamasaki', 0.5900821685791016),\n", + " ('E-girls', 0.5884340405464172),\n", + " ('Enka', 0.583469033241272),\n", + " ('Shingo Katori', 0.583054780960083),\n", + " ('Dempagumi.inc', 0.575444221496582),\n", + " ('Shinsei Kamattechan', 0.5742727518081665)]\n" ] } ], "source": [ - "for model in models:\n", + "for model in [model_dbow, model_dm]:\n", " print(model)\n", " vec = [model.dv[\"Lady Gaga\"] - model.wv[\"american\"] + model.wv[\"japanese\"]]\n", " pprint([m for m in model.dv.most_similar(vec, topn=11) if m[0] != \"Lady Gaga\"])" @@ -555,31 +570,12 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-03-20 20:10:55,289 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_dbow.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-03-20T20:10:55.289405', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'saving'}\n", - "2022-03-20 20:10:55,292 : INFO : storing np array 'vectors' to doc2vec_dbow.model.dv.vectors.npy\n", - "2022-03-20 20:10:58,822 : INFO : storing np array 'vectors' to doc2vec_dbow.model.wv.vectors.npy\n", - "2022-03-20 20:10:59,349 : INFO : storing np array 'syn1neg' to doc2vec_dbow.model.syn1neg.npy\n", - "2022-03-20 20:10:59,842 : INFO : not storing attribute cum_table\n", - "2022-03-20 20:11:03,053 : INFO : saved doc2vec_dbow.model\n", - "2022-03-20 20:11:03,054 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_dm.model', 'separately': 'None', 
'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-03-20T20:11:03.054368', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'saving'}\n", - "2022-03-20 20:11:03,054 : INFO : storing np array 'vectors' to doc2vec_dm.model.dv.vectors.npy\n", - "2022-03-20 20:11:05,496 : INFO : storing np array 'vectors' to doc2vec_dm.model.wv.vectors.npy\n", - "2022-03-20 20:11:06,366 : INFO : storing np array 'syn1neg' to doc2vec_dm.model.syn1neg.npy\n", - "2022-03-20 20:11:06,792 : INFO : not storing attribute cum_table\n", - "2022-03-20 20:11:09,296 : INFO : saved doc2vec_dm.model\n" - ] - } - ], + "outputs": [], "source": [ - "models[0].save('doc2vec_dbow.model')\n", - "models[1].save('doc2vec_dm.model')" + "model_dbow.save('doc2vec_dbow.model')\n", + "model_dm.save('doc2vec_dm.model')" ] }, { diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 5f4d06e634..061dcfc817 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -446,7 +446,7 @@ def __init__( def build_vocab( self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, trim_rule=None, **kwargs, - ): + ): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). 
Parameters @@ -1593,14 +1593,14 @@ def _log_progress( # examples-based progress % logger.info( "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", - cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed, + cur_epoch, 100.0 * example_count / total_examples, trained_word_count / elapsed, -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) ) else: # words-based progress % logger.info( "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", - cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed, + cur_epoch, 100.0 * raw_word_count / total_words, trained_word_count / elapsed, -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) ) @@ -1636,8 +1636,8 @@ def _log_epoch_end( """ logger.info( - "EPOCH - %i : training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed, + "EPOCH %i: training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", + cur_epoch, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed, ) # don't warn if training in file-based mode, because it's expected behavior @@ -1647,12 +1647,12 @@ def _log_epoch_end( # check that the input corpus hasn't changed during iteration if total_examples and total_examples != example_count: logger.warning( - "EPOCH - %i : supplied example count (%i) did not equal expected count (%i)", cur_epoch + 1, + "EPOCH %i: supplied example count (%i) did not equal expected count (%i)", cur_epoch, example_count, total_examples ) if total_words and total_words != raw_word_count: logger.warning( - "EPOCH - %i : supplied raw word count (%i) did not equal expected count (%i)", cur_epoch + 1, + "EPOCH %i: supplied raw word count (%i) did not equal expected count (%i)", cur_epoch, raw_word_count, total_words ) From 
44931dedf92e1bab21aab7c97820c1b384d10d80 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sat, 26 Mar 2022 22:08:46 +0900 Subject: [PATCH 52/81] move coverage tests to py3.9 I suspect they are causing Py3.10 tests to hang --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f09b21d61d..26acb07711 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -89,8 +89,8 @@ jobs: include: - {python: 3.7, os: ubuntu-20.04} - {python: 3.8, os: ubuntu-20.04} - - {python: 3.9, os: ubuntu-20.04} - - {python: '3.10', os: ubuntu-20.04, coverage: true} + - {python: 3.9, os: ubuntu-20.04, coverage: true} + - {python: '3.10', os: ubuntu-20.04} - {python: 3.7, os: windows-2019} - {python: 3.8, os: windows-2019} From 0bd1d0651f7a52e1b6bb38529161e395986a803e Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 27 Mar 2022 17:46:41 +0900 Subject: [PATCH 53/81] disable code coverage for now --- .github/workflows/tests.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 26acb07711..0b64f2b1b2 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -89,7 +89,7 @@ jobs: include: - {python: 3.7, os: ubuntu-20.04} - {python: 3.8, os: ubuntu-20.04} - - {python: 3.9, os: ubuntu-20.04, coverage: true} + - {python: 3.9, os: ubuntu-20.04} - {python: '3.10', os: ubuntu-20.04} - {python: 3.7, os: windows-2019} @@ -149,8 +149,7 @@ jobs: python setup.py build_ext --inplace # - # Some of our tests are hanging. - # Limit the use of the coverage plugin for pytest to rule it out as a factor. + # Some of our tests are hanging, and I strongly suspect it's because of the coverage plugin. 
# - name: Run tests (without coverage) if: matrix.coverage != true From f32581e37b389654ece8f8492151265837f44e6f Mon Sep 17 00:00:00 2001 From: DingQK <58072531+DingQK@users.noreply.github.com> Date: Wed, 30 Mar 2022 21:57:15 +0800 Subject: [PATCH 54/81] Fix `str()` method in WmdSimilarity (#3282) * fix wmdsimilarity __str__ * Fix documentation for wmdsimilarity * Add unit test for the str method of wmdsimilarity * Update wmdsimilarity annotation * Update docsim.py Co-authored-by: Michael Penkov --- gensim/similarities/docsim.py | 4 ++-- gensim/test/test_similarities.py | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index ee73328ff1..cdb966547d 100644 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -1015,7 +1015,7 @@ class WmdSimilarity(interfaces.SimilarityABC): >>> >>> model = Word2Vec(common_texts, vector_size=20, min_count=1) # train word-vectors >>> - >>> index = WmdSimilarity(common_texts, model) + >>> index = WmdSimilarity(common_texts, model.wv) >>> # Make query. 
>>> query = ['trees'] >>> sims = index[query] @@ -1096,7 +1096,7 @@ def get_similarities(self, query): return result def __str__(self): - return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.w2v_model.wv.syn0.shape[1]) + return "%s<%i docs, %i features>" % (self.__class__.__name__, len(self), self.wv.vectors.shape[1]) class SparseMatrixSimilarity(interfaces.SimilarityABC): diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py index 35ddd03397..0b917980d2 100644 --- a/gensim/test/test_similarities.py +++ b/gensim/test/test_similarities.py @@ -373,6 +373,11 @@ def test_iter(self): self.assertTrue(numpy.alltrue(sims >= 0.0)) self.assertTrue(numpy.alltrue(sims <= 1.0)) + @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed") + def test_str(self): + index = self.cls(TEXTS, self.w2v_model) + self.assertTrue(str(index)) + class TestSoftCosineSimilarity(_TestSimilarityABC): def setUp(self): From b64fdc0d4477d9965eb1fc517d4425e275ffbc2f Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 31 Mar 2022 12:00:48 -0700 Subject: [PATCH 55/81] update hijack_pr.py --- release/hijack_pr.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) mode change 100644 => 100755 release/hijack_pr.py diff --git a/release/hijack_pr.py b/release/hijack_pr.py old mode 100644 new mode 100755 index f885579985..5c77fca2da --- a/release/hijack_pr.py +++ b/release/hijack_pr.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python """Hijack a PR to add commits as a maintainer. 
This is a two-step process: @@ -14,6 +15,17 @@ import smart_open +def check_output(command): + return subprocess.check_output(command).strip().decode('utf-8') + + +if sys.argv[1] == "push": + command = "git rev-parse --abbrev-ref HEAD@{upstream}".split() + remote, remote_branch = check_output(command).split('/') + current_branch = check_output(['git', 'branch', '--show-current']) + check_output(['git', 'push', remote, f'{current_branch}:{remote_branch}']) + sys.exit(0) + prid = int(sys.argv[1]) url = f"https://api.github.com/repos/RaRe-Technologies/gensim/pulls/{prid}" with smart_open.open(url) as fin: @@ -22,7 +34,7 @@ user = prinfo['head']['user']['login'] ssh_url = prinfo['head']['repo']['ssh_url'] -remotes = subprocess.check_output(['git', 'remote']).strip().decode('utf-8').split('\n') +remotes = check_output(['git', 'remote']).split('\n') if user not in remotes: subprocess.check_call(['git', 'remote', 'add', user, ssh_url]) @@ -30,4 +42,14 @@ ref = prinfo['head']['ref'] subprocess.check_call(['git', 'checkout', f'{user}/{ref}']) -subprocess.check_call(['git', 'switch', '-c', f'{ref}']) + +# +# Prefix the local branch name with the user to avoid naming clashes with +# existing branches, e.g. develop +# +subprocess.check_call(['git', 'switch', '-c', f'{user}_{ref}']) + +# +# Set the upstream so we can push back to it more easily +# +subprocess.check_call(['git', 'branch', '--set-upstream-to', f'{user}/{ref}']) From 4c941b454a86bdcead2cb1a174e6ec7158253e29 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 31 Mar 2022 12:13:21 -0700 Subject: [PATCH 56/81] add example and clean-up code --- release/hijack_pr.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/release/hijack_pr.py b/release/hijack_pr.py index 5c77fca2da..d109836d49 100755 --- a/release/hijack_pr.py +++ b/release/hijack_pr.py @@ -8,6 +8,16 @@ As a maintainer, you can add changes by making new commits and pushing them back to the remote. 
+ +An example session: + + $ release/hijack_pr.py 1234 + $ git merge upstream/develop # or any other changes you want to make + $ release/hijack_pr.py push + +The above commands would check out the code for the PR, make changes to them, and push them back. +Obviously, this requires the PR to be writable, but most gensim PRs are. +If they aren't, then leave it up to the PR author to make the required changes. """ import json import subprocess @@ -24,6 +34,12 @@ def check_output(command): remote, remote_branch = check_output(command).split('/') current_branch = check_output(['git', 'branch', '--show-current']) check_output(['git', 'push', remote, f'{current_branch}:{remote_branch}']) + + # + # Cleanup to prevent remotes and branches from piling up + # + check_output(['git', 'branch', '--delete', current_branch]) + check_output(['git', 'remote', 'remove', remote]) sys.exit(0) prid = int(sys.argv[1]) From 0b1b3aa6e1201db05a2050179757541887128335 Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sun, 27 Feb 2022 07:46:55 +0800 Subject: [PATCH 57/81] Remove unused Jupyter screenshots Fixes: commit 34ee98b3ca29054c7c20139ff20922c2efdcf834 --- jupyter_execute_cell.png | Bin 126947 -> 0 bytes jupyter_home.png | Bin 59843 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 jupyter_execute_cell.png delete mode 100644 jupyter_home.png diff --git a/jupyter_execute_cell.png b/jupyter_execute_cell.png deleted file mode 100644 index 3005d277b3bdb24dc7d6dc9228820b4f78ac26db..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 126947 zcmZ^~bwFG}&o7J>El{9Hp*Y1Iic4{KcPL)m9SRh8cQ5Yl?p|CLcXwTO*-IaLpL^f$ z+rMTrNhZn6nVIBwPMD&+1S&EU>zu>0Da%+H_%p`dsll zHWun51C+mzk%<+ojlKkxx&9h26!b?D802pXj_9Z}bq(zAB%g0zFHJ2GXT--)!WeWQ z^n2pn5if zkIvBEzu)M)MZ|G>rh+Bbnt7kNA*>Ck3Bxd4H` ztpW^97?dWqUtP7kYw5!})^TSf(q4i;t-KD|?irqLJ}6-bK_3)#pHsZJkV~d}XHSab1M=y_8^WTH8iv3i_qBdSNe8;7{`h 
zy6q?KFY*OcI9S0g0KW!Xi1(`H3CScNl-HAc)MmD^B14ZH};1&u-|R`!2|IBsgE_>q(vG&fxCTt^Q{^W3jjL~nQ=R!PUJ`F?~>qC1nx7zGCFA8nC z-*l(m!mu$%YtBLVc{v9T&WH&cKJ;lrS4OEzJdv->Ojx**_$N!4~{|XzJqgapBajA3e=w_`_8M)onL=|0pU}A6>$hR9M8>^neR7)c()|3R9c{` zpgZibbWz-T(VMR}B~#a;e+d?7**V=xoISnH#8t&+C+S0o3le7}(`7>oe?$KJ0x(c1&5Dq@!affa_U&?sD=5)ochflOAOInNaM3)RW)+%mv5L z<8dR<*2mR!UB$>Eu4Gu#Lao)hqEktnjr`F;e@_C1Z(?2pZQFw`8p}NjBbE!#?oY%F zkJE$0W3;e)?Wqspkdt!iFUrSw&$Gao2Kff*E^4H&Hxyipd z@*i>q#Iq#y*j6e7Dj!Ce2L9>WRBiRYH^fb>8OhY^PXMl58Pyql?wrIB=w4}9uZ3VZ~w-B_Nd*Hc0+&tZwT(e($ z@0so_?4WKh!MicrRx*v8QS}=e;=rf2i@ZxIhT7HJMs@LG986YhzLnfRrypu&y|GU(r z!z5}Cmj;@~y!3r3TB*S-@+|c1m)UC5BhyIJ9n%xjuJQ2Ew()~ev=NN`yZz|Vg)yH5 z+?2<}ZVHI9zfiC2_HW^czTo_LGOb!=2X(1$qiQq;mYQiwE6UG_H(1z_;Z0*KN$^S7 zYGMn_3%q3p3l#I9g{tz&GmMSh)?$h>K6d^(J~RFW{%H?+ zuTrn9+w^Po$EmyQ`$Ka2>&~N>$>~4Uf9zH^`ylBMv!5d0)JO#We63*&6&MjL zVMyhDvbQ$~b(d5gc_Lj9SqV89rQDlUq;~eRu)!n0Q7fj?r2vUOwTE_Mcwh*UpOaq_ z74d#{ZlRN=r~UGSM_Ukhf^moONX{Vf98Dw$WE7`6kf#55iUV+rglO*6mU0EQ)b0M- zeI3pjmKn~8Fp#vDEK%5#j+DvdGP2Jn3*!mPWcOA+H!))6S>bHu6=U2N9;;T3nV(`m zZpdu7SGg$WQ5esh;_vhrI+b;6`wlFuU#=H24`F3z9h;iTiV)z8vrN{?8q7#aZ;bU} zezA=9mf!a1d%n8eZ7phT#D!zb?5y#fIovVKcGXuCF!z;yb@Z1If_#~jNea>pvJ5Iu z@nP!j_-sRl}9t1Lec#s+JmwNoh|jJ%9v(7JM?vy);ULwXWM=LUa#lJPZ!1NA`40j zw_gRm?l#Id`nU|aFx>yRe{a7YhV=YiGy+WgodA5hE#H24A~fCwZq7e8~L(Am0-NK#Ud24GKd0c5 zEMN4u&ICU&BXdKNE0iI+lyHusjXlgNd_frYQ0=Z3jXUB=;(_YxK>fT6yOT5KUH1FI zNUVwP^hytR^HTLp&8)pws*0*hSKTR)Yehd^`N9@ z<#QW|fci+R{xz8sn-iCB%vQ>VfAwI=_Exhtu+eyb^l2`8YIs_^orvG=sOn_&9QAze znChY1J&tqhYYFqOPxwTPUZ1X|o#1$9{RdL|GLB~=SsH7@nu~`Qh6Fq5yzTg*6;$O` z<*H;&eU`y_maCqgjt2u3^Jja*bqYdGj!qFd|oUU4pN1T^|}G zwQq!XEF2qsL>!%*01(h0hD2g|l<8bMm#5pCSt4ONQE0wAER^8~k7@@0Y8CORJrNsb6|?^|*7ROwU`WN;hTP}>E_fGhPiPup z6kf<5%-{6!@zvcNbpGpXUaTDnwo485b}(EMN8`ha8mC4(S{; zC&bNJ?RS|pO33Nk`0*e3d-wN-P~ppD;HMK1>)CMw@aJ$=h#t8*3$~=L=VY?XvxD4L z_)?C<-D2+6j^$2oPn$jSICD8IzV2~+er3I07>!F^(zkOKUy(Q*I@!qn?B-2%O$2%E z7dv&OYwucLMfq z+UdCQXmrc^VQd0`?Bt@d`s?TO8PR7(<48Oj=GQ7OYptnupa*>%M9xd7i=~aXkYkV$ 
zk*gPd39m})$|Ox-2_DT@Nf*mfONLN4J1xPafe69e;@Lw1L%Wo%ML1?$_3F29DC9Eg<1y+{ zi)kDJO+Q>k+gV;R&?E?>vQjfQcv8+$#4jL8V082W!g+_m_J_Eeb!#K_%q!mpt z(lN>`nuuH3ft_@SMV-R+Skp@Pz~Rz;9j;b%5i3!O#BzI2^yW6PY+l9kx5`biL$Sov znzRD{X^xS7^y}L0gio52*361F_v&E+(-u?FfgDbaUOaHWmUjY2TDUd=RA4& zeUsb?lW>$b>wy*Z=Zm*YhcTyc5_qp18+ik70SDfK)eJczRg%OSMHk7NlN4A+lglOf zNGwgT@6Quv;sGvEOC{zAEZAms$BU9E&xf5EJkr4Pj6{l$Yy3`HLCJvz2!DyI>9S$iDER)+{D$ezbU#^hnG zOg8a2M0=~suubu2O;~6PwUV>GwIZnZLI$%oo>7^0Hgfisb=tIgyl~`&8jntzmW9TK z)=&MD%DKkr+FaE}Q?p2cj+3aBxR(^}kx`rznq4SoK64SP5&J${0w-rPV%+zOvSau9RM-9)YG`jS`5?M8)z2^s~v&DHq}MH^dPbR6}^(ZzQ4b zKIOvAzm*T(agdlvBC`f}=NT2l2&#}w1uQvc2CY`;G(~n$^ct2U=iN{SiC(&eSMLvHTCsj9F{J}xjgnp*< zSu`3`UT#XN>bX4=1_l@94pCybR+99X3sR$zanE%3-@HsbGuhF1d#4czR+$oCP0^>u(765vC9|1h3H>3&!J*F?To+m2GM3XmCP&I~ zrv(cnc?XksekB)}W6?S6Sbf*X*9em&JVSSsKl}xw@beN;a=RMyI$ou9_kxr8b(ieq zOq)3>dsN%xUh-_y3fg(GTboJSt(U|LE_@fNTz4j3nn(Q=j;BPbVJ2oON9JUj3NRg5 z;LL4U=*8gB>0-RSG}Z)B8o4NjUHb-e_D;PAW#H#$dlYB?m=EtL(9(oVsBwP;3*@IN zwUscd()@rc3XIwkZTYnEtvs(_=Kjbf5{D9-EfRZ>XgGT3VJG~`-J99>r*KgI&!1#Z z81OVMG{lOfiu`2Sn1YJKb5}F$b8UYZXR-=Ya{Z=03`m;L8_$|~_J1Dio1C5W44#bA z%Mgi7h;xdvN^DO){A@>(bIfSzs}5b!6TZcnSKTKl$}vzf_|DZzx8?So4Ot|nQ-&KY%#9PB=ezh>Gbi6{ta?4uOe9hc*hHrUxN_)|4U}~jneCE&b zpa4@!4g#hwsd6YFHzeM?i!*52K`_A2Ay?UVZ$hYSx7M1;?)aWCOlf1?m>{66a^5IB|ztwe{I4*6|yE zWB@^D5>U;hS?nUSJEWUfrBgpcndvU~(a1aD`>nN%Z^odp^Kf8s-t<5_WjoV7&jVZ6 z5wR>Whi@yl<8IX8(FEY?I_mK{;Yt<6{)2Lx95`3o%4BfYEk23s%?suSg5-+pH7D;8!VeW ze8wSp`i^ygB?QqzR46lRr`wPZP%TbS)+W}rRoFJRwjaqyhX|n3VO?JVD_vF=&?A2B zK|F)-yRTs4aX;vtx=H;bSJS#TAOX%&Rnu8hPL{{m&W6#*#O}K(qq~j$8?FEa#pllR z*0eEoHX?Div9@*Mapx!d2M5nv`!AV^jN~6I&Q|x?BFwNVZP@a_h}O zroTN*ER4)d|6gLxmS+DyVt;%7DfSP){y7}qUu8UsmhPt3TB4RVrnXLRrU|gKvhn?6 znE%!DKZgF7q~`xfa9NT#APWNeT(@GAJH6DV&v*Dq!W}t35r94vxF52{&p5* zpNwP}1UK|8;454#2OPa*fSB;&>K84m{@||5@{^M)+s4wRQ<@az6BMSnSVtqBPK*BGC%-t-7i8lr0b*RcQ8=g<^|_P;EOMtajjj8zj{@gF1q zJvtXa9O2*Xf7$;YX86}nho%@LO_9a_kp690rcD9tzuW)t_D$H|Sh$A^Fbr2gYV~i@ zf8sIKH1nVKzvI!vLTZ?KZVa;+`}N 
z=0!tUgZ&9T|8k&jTZhQ=vt*1amd*d|l06*>vk~bUGAg%{jfI-U&U~vs{g-hU+R5qn z6XvY{PvqA!5mUJV#wj`>(lpftovK~ZUS3|Rn_2}e>YH|%(~79|GXL)aYWa%s=4iQ?|rO6U)xUB6XW2{4N z?YZ@A7sAmpq2xqOWGTy}32_ZMIw8j32aYIlw{wp*3ku&A`$^sb5kgrvg&XdE+s)N?9WT9Et<`O&A)II6BfK1YaSqqv zed)DC6Qm9)rHdzzl5emWb{Ol8##Xy4h=+J`uc+nZ)109G_+2pw;(S|rws0_wHrd2S zuNy#uz}BY(LHk>$mGjD8)97@O>!)j~Xlbcr2yi;V=4npwGkRx&^uC~gkyh)H`+i}K z>aa%Ve4Ri8XBQ52+gm*>#Y{0XV-<3CFG%_f`8GE>?0Ho?On({{P2k2q?YRJ1x=ik{ z?b`QrG(Kqy`oOHvH0+%+uM`s?#2NiARwyG{c4|VIEUw2c`+zKAWF>#Rc$zo!*WA0+ zwl=Xr_BDN8@9E?5`8}r(dzhXrQGIvr?!lCCTT&5>vqKBzs``#p)g8wmKAUDG?SkEH zmXG-~SzOsROVW;MR7)xwD+PgI{#DKR4t{y`nf?3V;$N+Q_Y`?N7&~9EVesN67gx-v z!!E`$L?b`N2V-WCIH{xKX+=~pof23{Hp2w@I8&}qyX*JLAUZ7|c2^B?-1T^Hx?&=x z07gTiIK1g*oU%`g^gS46ir1u^pVe&F?)gsDASkZ{GvfxLAKx`C?M6-CTi2JKwviaR zl7FI`ApB7wxTk2{#6ArOI~O=+FZ-49lT|dLAxlAuickkm`4+7WDxO$8`o}J7p_!_< zhpukJI|{x2hB(Se$`0q?)}6ZYB-%-e0}GC% z;>wjK`zspuud)sa!*BaoixMJVXeqoPB-Pm@ziRXU~_`B~vOcZ62i_X>7B)sIw z93F4X6(6M)%g;GxA0{X$o0!c?aZN#%=dPhLc#B;9Xh&c@`67{=1xYChUxXvE39h8R9@I>17XBZ&v8<+?Xq`%ZTv{7yUD9+NH)ovM2a);*Ceq#!C- zi%5k&hu0?w3C8_kYB1v!2beZ0>yPncni(eQKW*htdmLA^#Z}OMf#(8t->}uuD@4`2 z_Gi4D6NsW4;`cTCrtKXnyyo)8G{enE5Ptz`@kxQ3>P2(liMn3dXJWk3XAmGW83}|m z2WfsCe^k+lgDX`n{S1~=_MQU;>EpQET@aLJ?(f znfGW1;wDQ*?vVCsP}wLvyB(1~*b|hCCg=41?QqmJfV+DaPwb=0coDE( zL*(;216}{)kR|A4BxKDA0y8RD)nZB8?F)i1xz4E{Up}MQX*G_>7oMU}cwBEjLeM!O zf_!JQ;DLl>mt$<(blU0ROV2b@u^hTN5DGV*Duy$}n$wlVeaol8$trd`mP(k-ui2&w?A2 z5O8)iy3w!3?!1D5`&r~;V^#RX8(NrUmgU76bsBcaQL*0XQ|{Y%#PcTSIqjx*X^Q%?TA7*3blc6iyqz@;(NB6#$`JzF4{W{=jnD3K>$$}f2X3pH zE|UU$m@EPw{Xkyhz|o3~2MxcA#d{lbQ@&<3blmQRNxuss{=vFsp?yz%=MaMGll|TG zCzeHGp!xXs!Zn`PhVzn}G_deI1 zKfHc2y0T28coKO3=rRu6luCOgWQtn?+R|3UgoiAFYfOC%HBK;iryVWdk^G@rm796fe zi7o%yFrBq(=g6oM;U8H+`<bb8Hf&N;T`8V`@ORUH&$DHr<2;j^V(0+Kthw~Lq|K`z?821K?z^lBXruV zEF0e+;Pp+4V|&mGq&FiY2d5l#)&LB=xVz;Drls%HPBtEn@VE~d-~2MSa;<7~f6qE~ zsrZr;?S`g`G!9H7PBhhbTZvKu-Ao2MKB%5-f}p|?UeAedVe9h~0L70JEI z7sG3iykzHrKI7N6KFNEwVlF$LdHIrIB$Mh55B(%V`IZNgre{qnqY7@=h}uG7_NjX$ 
z?CHQB&kKI#RKT_(dae}-3*84b4WI6%=V4wf$V|ChNl~-Ppxn|(Gss92wk+%YE;@j# z#5R*x4Kz*kmYq}?5^;_Hj^a^wb?V$hU@js4n)Y^cD#dWO&ar@=nn4Hi&B*?#yp~(% zN%3$k>)QRInfvIaxdOt;w|mZEm7}NcKj!ph;W##DYL;+c{D=YJx)SL3gPlHvwHICS zSOuxkH}rdSUkw_pG9?|o+L6$%L_Bp7pgaid*0;?zUNO;tS@@G!S?Ta7P!c)8_)qvU#w8h8tI0v$qo(@JEFgRhvj z3K;B51c^DJJf8>vVZaKnJI%}IiHI$0NG(Be_HV}tDhL&)zTj?g#@O-t3rXBflELc_ zGGE%M96xjOmn{1#?wUDY0M+>ICgdw03j`y-`=>0I$<*%VFRpu#7re%Y4={D`d+|zb z^^*#(IgcLv^*l%&bYq*loF1GrDBWv5j?Kgy^V_A90;U-?ZQ7*$(}y1N!{o{0 zbF0nwx&%b1FiXVH8z&b8Dz=@x8rxew?Os0Q)1c^kh!0BB7>Z$N)+H1E)qo~bQIYr+ zg8Wl?Ko0P|_Yr&px3`Nfkr()|^5LLMz366cFh!3H2*cG6d8c6gyi2p078~vT$JM{n z;^f-g{@RO}5~$Bre&*f{3KG5a>^mj#i$A9qoEdIAr$Z^hBDVLvYVmHHSP=lTtxXt& zQ7yxnzr^?wsOb8J9!1$DT2Avu6`X&$#q#kL1`T7U;drI-0U?6tmanNZ)t8>hI>9+j z=)2ZT-d|?`0^b_kg?Iow#BYgZ_r{U3+w@&2qwU%Dv~4%6slP_`v3Mu7ALa&({Sz4o z$4@Ks0aTpLclyb9d-#qnzxkE2$v5+H2-R!w=D@zWh>msWEw#N9eJjZD=d~hX31eul z1F%#Sg;i8IjbPu8zZGl5e;#gj9rzj9nQ;L;pLQ{;e5GPq1rGap7KPTXh$i1okLF!4 zj^9`Y;h7v)tV#||X?94z@8QoS;HEk6YPD9+#(a>i?W;>QuWe%L7;BpkLGQ8s1;MAQI`Y#+ce>vMCzRcj zrPhI$zRbWJiRzQLZ$;Qu0bOsMCwle+%y)%9IqJVDk6uSBzTTvhxonl2;`em(T|&p743bez7gzwA)BnIJA# z@?SFXUeOpYFVq-k((tXydddQE3@dYo2nyU7r9Co71FsrWGn?^Yd>(08!v0Yj+*1Fx zm1XDD43)rW^!WACn6Ijb_{@EYmm%UYz;{80LFSxoz5JqoKoGskrh01=sIWgLNSl8G zYHpHFlEALE0~harh^w|MFZG`r8Ixbn|EO~Nv~zWGBvn1E95hft*4G1pgVV0fr^d2s zuQ{6YrJ6X!KKJHr$G}Cuw_twq!@b&J*tyHK`-Izo@N9>zm#*8~B}ROiQu(yKd+H}# zOh2vW`KKu)Gw^ZafDhQUwXWL1qM?7fIR5heblE}v^-1oee2wpN`7YVKm9}f+Wod(b z!%pYC7aRx~&nOx@s(T-Pn|z~R>icw`a~P$eA^1Wb-<;AdIoKjGh--&FssnwpZF6dy z@vw1O6W;wiPc3PPJcEF$s%-f-CBt|^z;hSpW--N5}fGsf`19@3(aE1_PvRZ zAmv>;Q!NLFb3;I%(x!-=J;npe5_QDg{;a8Tu3s{rpv?pdo+kh4-ok@;zun41ADh)V z*_hf?X6LhPiYgzV&Gp}E1PtY%h~)d#OXpK_3Gk-i#bUt}2(2+(L!xV2$^E)(mt3%QT9m zm?~xx!-BOT$VCPRnK6RD3oLF>VK>i8dS1-S*}IT-eJ44UtGNz`Ai(f38!dv>uDv!! 
zCp0j*B*=jr8@Mp-@pPbk{h>jkHQzWI9-0{DNQv?vwN`(f574g^_s>i&2ohf0_UEz7g*%Yb>;&zMm8B+@6`BDxZIjSZRQ^oJ`WQ=Zc|nk24 z+GNW3+UNt{QczFhd8{9=it&OqpXU>di#pwVGXfnz$~r`4wgNYD@Pc18w)+{HZ8l#V zu}_IsEMKiN(0DISq@HtuNo+)S4%8`a+-!)>q|Ci6&8zcxb3|`B$`(!Z&#ubN8b$9< zixJ%-8O=V4gAx6Uqt6$;msSShgVCw&lV12aLu7SIO?L!^RRst8Vs7_VniHXFq>sK~ znNAmf1{C|+P1d4<+@?LMu)OH0I+m~bQzc(}DVq=nbZs|CO;W|2-ms~A$cBaRvc2^3-PEYNR^J_(wqhvGh*_%D56-wf&8aJ%ZwLz^*&Xjo5% zigo?iF3WyYJoGt%Y@}fA`(}l@9V)fId@e&%IVuf)gDY}=U~-;+MO-^U1i4CGXxgW` zzTPNcoP%hTs?Q(LL{ylZ*ti#YHV@WFm>J;&t|>At=mqxI^@u`5&LMRNEhnEi318DG zFIS!5e?dN?wIad$V|?f-_>A=rsGx`8=eKGbu(;(aU9E9%OMpYgaM15OCpywADSKXD ztAwNR0#UnH?0kqn)f%S=b*H@JbDP`R`!#$b1w5F8(FY`~P@}&!NG>>-j>^sP?1*YG zVg4CcINq@5XKH+Wg)eY8UH(V7ej2_XM+DXOes5nrNwBORsnlM+aAI^`a4O)t1?OZy zz1=D|;e=V~|H2?XAKVWCSBdY5%Ukq&xfYjDELEl}BcU!E!7vk-e&FhDa<)_ypZI>e zSJtk0pkNr$&{|3^lWMlf2C5wu%$uehL>4^CzvxXI##WHB;G?l9=sIB+d8k%^(F9fR zAHdR8qEOGoitkwHD*HT%Jql@$L9ot~XZ}r2WVcd>Bx`_zOU|C2(9q@&k(qx1MQ=#c zt;tL3&#xwbD;37VTg(7pRG)OGI6D?m;jhvE;6wh|h2&#K=o)Qte=oWjZPOo3ceYl-AQxRO6kPQ$ zTx(4dZh}3YAgrqt@Hg6gFrfR6$Bu}j7&_7Jh;^8a2mk)LSs7qAMQL%G(?4AN3jjri zzO5e#xCxWLn1DPr46#?_(e#G=oy2mx+kQ%@fFqp1--2KBHl5r57`7FPm}*HO0as&i zEnhCws2z0`x1_@n%E6)FisC|ae|wEwC=#W;8$8ygRIXg8Txe$AlH7>@^~Z|Gv$rM- z(YyY*aaV0stA=2!KW|0zyYFX;H^!-EH`Py>Hwdoig8O@yr~L~aflf$#M#jriaA5oo z%;U*1oqg1Gmtw|y?#Wh_mbklJPJFJ8CO!mtq`Q4`WA>7S#3P$g%_6W`?4PXRR#nT@ zOO*ME)`@W^0`Huj1SnrTh*kECpK<0)$A1|M8*Ywa7!6R-wKjRlx~#d^@p9K?i*TQA z4b{&k1bUKM``}>1#Q;5~VMM;1Y0GSvG&Wf^gA64`c$%GzV@pzY^L-ba`xN;;YlIz zvM7f4kj`gRa0wx|jhpdkIHT?7)=lPY`dj^Odm{jC@Ms&I?wQh(MV+8yzYPq%#(~xd zwhw|=_xsXjFc()12Ts9azL_0=+iLaEh-Jv|B-4v0i**pSr!(OphpqEmzQ1eZ-goZL=!iDB3RA{o8S=%f9%8WFB!ql7xf7_xn&a}-AVGu*zm0s z7oqD3eF_mya3+vLmFqHx&3=#)-Dc`NOg50Yd%2_n9uLzzG} z=VSIxVhuXjtKLWXx8heK(Yl$1vaRAh;I>y*u+8D)j0 z;iOS>Dv8Z6T zFW#vx|?N`|DZ zj?-z9BYxUm)0^IYzcQ%>-_AlJE}Y|#Gedv!6yNG zJ~41$iA8afhWDiod!$S(Y5a>LiFjme+IknB$-xeKg)x%$`bLqm^!EvulqMZ8Ug3hZ z^i>})Zlq$_r4b`1_x7q0$@4>msgw$83X%I1psv-J4BoqJ6_rGPsq%1!Lt`jv4Ef8) 
z%h+AuN!_}<#(~X|B)tO>!#W%hi?+U9I8ZaCKAjOo9~!`%SKB4bqGneF$8U!uA)17J zQ)lVKTSw=~o64D7-1qUgkoIY}CspDzk(4aA2kHLiy+Xrl6>R=FS!HlHT253pPAlYj z*e~sSNw#-)0 zPg@!a%BHDU%a2xsw(VQqjr+zE6^B@K!tvSy@clX`nonYUUoQ9M>qRD!zkDBWh&nc< z$*7_(-`I8Hapvg`aOLG-REREZo-6cSZEPr`h9wB>Ch2g|4*;whzqd6%VAjAkpXjAXd^D@)#xUb-UR{DD9uF{+e@!whEk5qYRu9Gm0g zP2#R7jiTR*dFa56ZF;PgEWx3WAGc7LH@5D>l7b)csZC%S=8Q;^i?+Xys8E8HrGY+1 z_0?*;?68$S7+W0Do(U(bemyzmFk!+fN{UvdQT1Tq${gW}ijLr+A zLVG-d(*8#J7F9Z9?pb={uO&SYEu<`254$3z;DPQ3x6Q%v$L8?0U}5b)xqInjC)^hw z^U3)2sC9IyW6JIvP7B$7*eI%`cC3KR0Jxw4xa2O0h(r{bJF zf7FRkri?Ud)%{?a)?^}ef?%M+YHnlgZ{Cnoi{jhcWX|BA55lWLxyhZ#Y8sN=M+4~Xw=HSCbn>(4 z17Y^mG$Y4di){!2fDcRg#18AOg5fx>ikjc6o<67nz`2NNg!AL^I4A8-D0X!lud8;B zRY=uQOC%5qdwX~23+}z7)a1&1Jq<25?BD*yl``QeFh!j9I%D@fR^=@wU;H^!v3QUpB8gSY?g9Eb4o?3hI~TB zSQq*8MF?H_&w05OJ6%GWGj@_oF@J(_$$Qu(0f!YK*Sq{$5yN$l}z;)m??jQc?hP7lk7}~;75m$ z&dP&zZqBG;bFw1iR1-@`sg5MBk83~O9Dii*TK6ZqzPldxk3K3XFRyC9L*8YXv_)=Q z!e`TBMGq;7KR6I!CF2yz_K!L?^9K{|-Z7tq_t}PDi_{DDF@9{Scr^|MRMSi}sn^ft zX3$k@?PuQ85~fzQTUCCO=5jdeBS?p(=&Ypad3@3-Po1< zed;ct^b+WE?jWGHnO`jAJ+J}xW50<+w02iX&Z}k7?3+VU8qnx@F?r#~{xEU}zuR|P z`qekq+WMm2OVq-77G3Z6m1-7ui9Ha;xHn5DaV04_NZt`WI-0qTWEx(L2i_Hy*^L$z z)>W;U*_G|{A(F>(Y>=9{7;r-zZE8C?OGR+`97{fy(2l?%CdZJf6uPNIZf&3I_(@VC zGymOW$vKL-1UDU8Om1;bef<@^_Pl*_MI*&~Xx=2|&`6<c!zmi*WR$G8C{^fSk z-MSuS2%oC5V%vT{3K^l_b#3MPNv^+<7Zx z43RGe_>^13_zhMPS~YhWW5*lWfOcBDZqjW;JgmlPi@;8Mg*KtPd7~Q}*)9vdco>Vr zR7Wex_J+IJwOP;W)%_mg#Ku6em)>xD4u{)5JnxVyXi;7)M&usrYc-L2jC57?hi)l^r_ z>F(;!bzk=}Qg=Jy6yvZh4;5Q`aDkLtCQDe%sdTMKLMvT9(`;wrNu=Qb-mI-&PN<68 z(CR{Jde+2?-_P%eADneev)_Cmkn99M6HxrmzCnV=yD@0@&p|DL<>mDedawy~j~oVD zCm8LWEnnc0)o_^ERD!=wfVGsR3@Y!=5z*<(PZl~i#$nZ zn>M^M8obbBLv^lG`6HA+1*YkfXcEql6!opS!%p+Eq> zPTSvGCj%nC{79pit1{V9^`pou~@@fbW`#5b6&yU`q;cQkLTqcM36~^lJ7V04b^!rk^*0G)@OE6WrEqk9l7B2l*Wbl!((x$%Z|IUPvyd7nRYcD2$O>8<=DYR_J7A5x?Qf8A zMg;qLwEnX2`g5Hcz`AemSfAkA&YdW6)F(X@_yUg-mt@_b1a|#+V1>R1H^y_JWbAY! 
z2Gu5uofvlb%W;LZSL(f;iDqmuD;f_L31CLf=}4R7=VZ^J+wVH+t~kl&c=m8L@ip6wWVtqY_r3v_l;m`o*9+GHzRvWKW7k?WQhGyj^(KF4?Un_1?7&B9%5$XW;(ZuRi z)B7!noC>*PJsQ_;U(ukBN~ij^c3epc8`BLt)dIX&_~JtCmw)HQ`yXQ1&YI=!dOd5j z*%bH1_1658Y_BNl%N6`R;5hJYgdnq=FG|x88}9s7jQ3?rtQS*ooI=TkyO58nrd`Yn zDYZC)N24eb_qdY|cObg=#qwJvBVz#1ZWgrswTRo<(I6$U^Uc1KBZn<%G{)hYIIHc33JgBkyj3Sz8XN0oP@+w=6 z;sYh*CtsMTZ{R982zg(rB)hc)ei+Xp=5{}|PCG47Ipnqgx9MHn|R#^Ry58zXfKTSw}D#%EQ8*gOA1p1ADwQ2PO=s zVMkA4=9M!qhAR;el~T35s#AJ7^|8m-J0x+>y?&pE+S;C~nNKe5)WSNNgPwNHags)cA!7&@e>Ph>!b3(I}uZ6zW=~NX8kg z?#=(0UL_h7omC!>w#@Co^9= zM@=iVd1`z{$3`mH$Gq3-SZS_21g6MWnA-z;(6Y%&KUi}L03;>)Rtr+k6k9jv%bAjp zL7YaV@{HU4m3B1_bq~V(_UGfT`w6<7%Zw*O(#M6NgADEl+19ZdB^>nZyD@Lsf$8pu z5h(=53zk7@Ebdqi{L^zhnZB&uZj5Jiu&St zxvN5mhqHm2+DH!e)1xU7QW*{J1^!=7993Omfh!moE~IbtlVuH54pcdL=`KI5Eb{d_ z=yBbiTHL@H^#N3l6QvsIQ}jx(XB+G4h4R|v&fv9k{TgyYoA!-%>3>Q&RHmbmQ7@ zr7`3jyw27UlV?E>@SDiwWFEjWGYJ)9h z##T3QB706b95jU=Xg2vyg?b@!43!PhX3yzo%2Hcd5Z{qNvq06W{w}*}19Mw&SkqVc z2C1i0sDbZGFlZaDp7QvTIpXpeVNxX(>#fLeCN#GVX-U`^%Rr#CogUYAn_-0&0GOpbT>?Ioiiqj~p zf6bts;92z~7!+3Utu6O`6}S5#=Jdr={xRB04uw6l-xW41 z;7oe>e$|;^sat&pJH$L}IKZ!9R3G0C=e$$haZs3V6twUHj`vESf*-j)_XliTr$#!A z(Ne&auO+{KXBYoB8#jd4Sau;E=9oBa}q^~QfXbOf2{)2|$=Um;^Lo6n(Njh<**egijXy8n$* zIef@2{`geK5Q)wB_y^~q&1xc{1bzm@d-LbyI$a8OmbTODrhF)wqS3>BzB3JxX4She z0?GksniY&gqG?E0DHMA4nw)A55)ey86yP!2lrC0`h%r9sYqs!i9{iE+sVnKDKbu~# z(YRLV9a^fu%@(w?DQX)vd%8WZrF*Z=v7+d#XZ5cn7yTk1m^sPiOEs6nMMi#1t*^^j zeLw<;yJpXv@h>yRh;?04Dl=-WC`=um(4@JOd2{xX0PGDjCY^%A!|4rTLi^h_VZ~D{ zwhWD6CFs+hp|vz5=TbPc&tN5|ugghJ%i$(%q^k+?S^i_VMIPBh`dTx;5CEq0=^L6L z%%CR|=lxElM7>Qo`Q;)C7wNU>y=tT=_lOt_U(vUvh$Ck*a_7DM;%X;YcMJQM9FJsT zIHnc#E*xpZxoQ4t5pX*dx$#_o3{jfW1I=~kF0u(T!XM)H*k476?wf-I(R0GZt#~*T zPhv-uNzx~ACR+X8FjJvrDvN5!Q7alO(Z-knr+RM=52h>4g#9p2wq>D01pcaIR%g`K zVqto%ehE_WG(Uo@VnC+VGRuCN858<}iW0KClHp2bDo#IOp_)*h2D*O+m{obh}Sw53dH1J?rW-QxC#YYX<@rNX@BOJ=4N@BgB+Vdjo{H-`!MGSHtCqF z|LyB=w$qWXSD(AIut1;HVjr##LM6J*2XphwpHv%I*QFdzMG#CQ12pgF8~6X5%0sl7 
zPHEvgyqPyRt+E?q7z4j8zcPvE2>J}c6tOO^9LCILhr*D`dhCUp9^x)j)-#)+z7f0f z`NOkdrbX!IH|n5smZ>ImfTKh8&`65{sG7<2J;z3$oKkgrpD0;iuFrz4^JeF9AOx?C zO;(O&c+WI`VN$Y9$GO0H9~29Q=^bEOcufiex#0ILssZv)A)_ytiS#kDt2DoT*9>D>bZ$0ny(I+w}BDfr13qH|5MK^{-2`0!PbgO96CFJ3-l)QQ_6;-OoJ7ELV0857^$ae=y0gxe6CS+41 zA@b9iYd1?ngx2SBTE*H<(+gVtLmk>3qtD!GBso_qbHV6hxE$Hl*E)L9sJtUaxvn~AlL{0o!EzOb%B*^iR)|ms$@NJXAM2r_n z{@T<(+@KU0lJ>8H#KqiW;M=s$Ej^}pVY$yEyvBB58GDdGKhgr9LnJt7*j@X-c)5{( z*a-s>ehcUJjw6~FND}+K>`WnEs-(m?9s>bn^dM~*9hGEU*3vBQ$3Vf2+HA@lsK~Ta z9sXv%-3}OjQz8{4Gq)y0#~!R%p4cNgd}F{WkS>#0$Jq^a%QxhODzs^U&=tMAp?ghN z>|z+LJq(sdjIW>x`oS&>R}j=r_vwPZ*B=+L6t<|gxiAF_fJF+2l)J;pWH`fmk{(^q z%gEr`@|pGnwA=>U{sBF7jUF}$zIj|VIwcB>1Ey|INZyS12WBB|&)c*(9A!G@)~`5Y zqr|Kgj@wKps7$kWf3^GoJxJj$Ppqx^-rf^P_07CcwgM|?Rk&}x8ydY}g7qj1b=Twi zd5Ri!9N45z(S+F;7MrfPU@*xH(bHaMn{qhv+}l4r+!}|0Q0^+QgZbP z91i!OPowQcS)0+HO3aageEtw=zGj=XbN)6xIzkgS=bIAu#w9T`iBDbz#d2Ev+vE`0 zJo2)QYn7;_S>j~3haer*)+3t?N~$03FM(Ng7kINaMy(d^F7sMl?Y{_^xn0o!&R5(O zv2ua8bC4nW@+K5wm$e&1c7>$?eD{DI%2ZtHXauTH{xPZJL2ZA#wfNVlb|)1!0bF7G zngYTPsn9gbx7_q1C# z@sMwiyvl9v(Y>%HLC;CYDU?FTQ}QHRsQh?@SyT!~ed-s5{x&F`(0}C=iU{$M#Jb)1 z$U6MbL|x8SVOuZwTz#l3X84i|)KaD*Xqfj&K^5-J3!*Ht9kwe4C!gndD~7cNLI4Dn zHh~r~85>*p(2h}MW{PyH=(EFXv4x0`>}lslu?HR47#wylvFJD6iFB{xTz@ z=pA^T%wJ~eyl0wD-ro(q;#u`)Hn{H?I79^uPU+ziOU%YC>pKsplp)K2res=E&=( zjD4iP>x1I?DEh7OE&iAJNB$pc%MRu*2>Abykd5)5--3&1StwDgWB4{b1*n8uP!5Rg^0nx!*r|=?r9Hu(-nBD} z<#Egp1M9#5XLEI&dq$f&JENAqzU|aG!h_E>!V|D90U!qd_t8~@@Iiqij zdw7xfq*(wtFq`k!)Wr1G=c1Qh)@?XadwuLM`q8u8f6(Q_hp+-o75|0LUm-*vaGO8W z(cj|V8vI=?Y}jh zx3B&}U*Z?BnEwY2=XWY3QsdWlWkgOL37}^q?-`kP0_`fN=9u-C zZ5C9IZKdb3O%!cNF(mEkhdDdmG^5u>DR4=-4A~t;4p9}`8l{r%9RyVFnrzGuVyKMj z#L@h(2eZXUN{gVr-rsn=)^1G!NPh*ObM}ZtAZHmv*~@GRN`cyxv4)YD(Yq3@M@Pda|_m_4h8ZfS*L}AAc2u! 
z0Dq1+(P+iPVVlT;RmS`|GTq6)`Af}lvvTC zB78seNU>~Im|o}v<+y~iygV^XHtaQCG`Y-`&)}yF#dWIfFA;*cY;{;{uKMlfA&f)x zCaE;_sN|(I_;0jH*rSwWjB;l_zlt@)G-V6ra@Xk=LTDFVtSi9H?f!><%*_Im6@5Jt zjx+^CYSfcc_s_f|lIh%ukda%?iU#}Y2_xMf$pZMgPdw*r5k6*$#u#LRMoFVp5lcZr zHdd<1G#FZxSrAnYG>s}GZB3QSR_z|E9T(}`f1(IH06EQx*4p6;Wq!P8g7I2Of3pKU zeWmyYDc@wVdv99t6_+d|eMU5`N#gk+Yd)HTB5KE71wx19^xu2V zq+g7SMK`uX=_a=?7UL)JEFWg<<#As|A+im7!!3Q_$XITWGcB} zx0>!|4krfrj?Lq!Fz%+y^&7ktS0*41iJFSdVq$+HxUG~Gh!tR{3I7Q!=P>Ew6(Cpu z(XQsXM;GY}o^rCbZ2hC7#eW2+ym}g}DmB5)V+9y}&)gi5+E$g9my5t41cy4T)Wfgg z^#0#n05QP&xm=9Xz(74H-tkAFP1YI@b=7=tO7U=OSWW-wq3miO>D<`)GKx@LvO2`b zrA+~5jc=>UGtX{OLjk(&Fh?7G(5K`}==24-0-;xy-*yWvn&+?R^Q8-f zPQttfMWt`!b$N|&mrykY^L_7gQC_lgZ(b7vZ94cDZxciwxaav`z=1ZE*0Pt)8xzp= z#d=+4FOfBsn&k?kjR2NLKq{qruY2FU0Mv{2>?8dLh%FahIbIj#qB zgI?eKnBvLGcI_MO>KI4y$kHq|ri4ze9~DLA^6hZm#T+_;`Dwa zJ@#US&)^tnz@vADeN_$fOc4)p7T!e>2y|Mh!--FUB-7cc6PSd-oyoD9_ z1X0sosxH@7AX-_ek}}-IoW#~;CprBBhs?6M@y6`Is*5zKH@e*R+ODtSI_}->tcIvC z7M{752uxHWA;K@JI*dpMG#&}&Pn`ig@#iJdMi+mnPs<-fsD9gTiO#{86dK}@d}GKJ zq#NhnViyff@h~4_HTw=iZglOGNr-T zLt{me^RFZ4{}Ooh=8n#L(L3~H>^ZGE69NX2a_G~v4M%}4%aiqA@h=JW5Md@L_m!rX z@+z;uQ=^N2n_rHcst>I%RQzzp$xz+HOA)GfrRCE?-D~yF?&s6aYs5zUU(j;^^+7h* zANAtXei2k>HJ}_qsKGoP9#oc`zH8p$MMUmXzxCZ#E2Cxs9vs8;P#r&ot|b?35QH;k ztlw!7|CuvZuveVb*z?PDEVO=l{mDsVjdn9h2=A5FOQv2@Dzi9@>(ZazCllP&RY)8W zZbb8sb{j|ueGN3_R8h^+lmRLnH)Iw)g(wT&!9YZmz%_Vw$iSZGw1;%Bopl-7&aWm>ADtfse)c_E{ZxEvR0yM%!|RvGxR+SVP8r71;< zK)MQ>!`c1KTGug)l+KA3KFOh+d{`7K3w9z2^Qp*&Vs5?1N0n=WQ2NA;!AO8@!wbMM z4g-4U`#&30?|BppKmIy+wWEf< zv5oYjl!#7=Cg%4zJG?#_ty;3zO~|mzx;JllXM&D=4#Bhpi+v4hafdYd&JWJ2f3)LY z&J^%3%3Ecq5+`x%Ga!LkwG~Pge$@^OtBBi@JV=1N6^abCSm3fl)NbxS$Ur&HjGpgW z#V5d`NE3T4G|&X#_0d~#EHSOMNI@#ImEPYX0I$AeQ2%SIN;dOPmKuk)*X+6kz09)e zi|Mi*NxQye%LcKoqYc|&nQ^VTo|gy0Y{$R*8ozflT|S?fq26Xc?P+H;ZdO-of}|(n zLbi>2OUG&(gogry(~lg@vUebt4B zb?4ycSt*21OIW~nF@1t*-{y$(uAv3ry%S|reJ}5Di53H8S^p0jODYK*cc&900l}M- z2B&0{e6_PZ7E*q6)*P_v_Lwd1o9M?mlH!#(1j^}Od;W2}ajy(pg0KaAYco{oDr3+D 
z1&!w#w#GzEaHjg#fOslrOIu8yQz;^EEi+Pb^9O7An7ATJx!A~%<__cZx)WguA?DWb z(LRKS5O;n6TR~DVJ@-RC{E3vbHsf?~=pu_9rrl>!67=Q?O+^6C84L zl6EF@xCIxU7_lX(>DPDau4wDyV4mnZno&br`cPEYqf2-@OvrRRx0v&|!gmh98IBVo z@+{j*2>0p^R$BUJmrK%;m7lbHBB`~1D0$tE>E*P^q(M%JsYCPC!QYzB1k*Mm&4SeG`epl$IlFqcwecTm zLaNG$zGn#IVef6suQA(j*49!5gIy^; zaY@1EjCGRonQfxQxqCBvWD`^viA)35kc<5R)9Q=m+@pxypWmcBJh5a|W_OpxUvY79 zPg#trC(TU-;&!U<_2URdNgkp_3NAg*^bTPCvVDsntDnyKcUz^G-0>PWrzT~~XLlaW zEma9##K!oJ&&8^JZ+o4;hGf^wRPH4M*e36r#dxvv2+gZ(m$t0+@~Cmx2TfORH6Co0 zj08m0Xrw$05Uc~)&Yy+s%Frf&&Tuz1pe#Zsr_{bL*&@n)Ow#ekzFFfm2Rt)c`IFe# zARYkHbLQ_&YZiZ=gt=9e069xq@x>lu8>6c6$_G_VTz&2*8?>wrf9>ZLdoy93K zU)HRnsF`VlFBj5bwz+4rQ+>U5MNz^16zV!$wA!^=fA!=70@2|(|Mq}JlS>j=e{g9i zZQs%DNtb4@Ek|~p@AXYC^UtPNn>%IJ-r$%>UbXs{tJONh@ukTP2mDy#ExqCtbAdXc zx;!W6F0ze+t)E`K``AWDIzo;}WcH@H&;#p2uW5%@?0rNuC>O^zyzt0iw=% z?j7h^14Jgc$Zr@Q5Ny_VmPJN-D7hl|x5zs76rlfQI}C#a>A_PS9Td`!av7V8tHyz7 zI(*3I53*q@5@?Eon4UXd4+ELcg15gG+J)19bZNBVQPHSIeE<~s@bw=Nci+-8(~H_J z>%Bh8Rn$ErO$T)kzrxgfGRI%(RsWPUBAN*WDKfE)$Yv-YY&+Zg!u^ePZjXKDY#YV| z2Pg9?w?ijC$+Tqbcpo-&Ec#3K_sKc>_UL_VdgGQ|;xcxvM8y`LJ7At1uo|*7oxd7w z(|TZ>T8K6)wrXoa%}GHHfp~Zxq(ugO6JRgDUfHKg^Ke}t@^shqLZ81p_zCNypV5y3 z10cUa`i7kE8gtLdz{E9x>7yb}7(eNI@}@rC8W+nW1vweB=PnLi>*!_#{dFvX$@+A0 zyls+)6IrR2!JhI{2tB>t7nW-6AL--ay-qJd0=+G_vAL4(PMWiWLu5siF@BSWD?W~r zK_0HQ?4jpl!p!PoLRB9k4tJ3%^&hFUy%Q;^AIJ#|#%Q)R z%5X@6D&aUdGfC7~H|s0Q3#HmVR!wZd_Ewj^%?b;PP!!f#REtvFEORtc{&iJLk2KFw zau`U5jMlTEnN(G-?5OhjT$M%!#}yEN^PMy^a|bIDALj=7ca@~J`n9^AYpa#CGt={% zsu5gm@q~fwW(U6v^)bLtAjQvJIKA8W-VJNsFlI)GYvs_wG6NrPC5z8opO+XnBC{yU z<}Dklk%W^cwVOq~-2}9^mLxKiX)BG=W&NC{ejMUN+#WT&Sm2cJ5tM)xb&6WpaF2fd z3_ep0u3GbhYb)1ZLfEUcQ5$eZo_L)#Ee)b#=@7tR5v#*%<4loRHh^`panixfGQDEQ zZs#v{Qa9*~zNG~Bi}T`8$9s>R+d$@T=|R8wL@6%kx@jL5P6@SLV9eO{4+b98o2oAqWC3PwcA z=5OiD5=9fCITl)|v7D}0wZf%bx_0R|VnaQxM!&D`Z1ulOzc}w#EHe}@SQ;J%CU%3F z1tVvqfV=99QXR{t{Gt#ZWXp!(y_Y4mDeC~=nkTT%dT+!#JV`E_Q@aw{Uf8 zec6~{!<%IRd~ulXsb;vey9jx!WUO_5?2ZNVLDlTTCIBgu0P1kTI*O6cKAhJV4jbAh 
zRyQjZXx#6dmXgyuNew}fHTXR#ctg)01prxFiz^rwALZQ?)5K=^r~^;NegPxXcw);! znfYbIZV6rKJ~Iqbt}AB4yV3PwYtGMo>zJ+pk8 zbI|9;(xc*^tK_G66K_C860J^GJ6r44)}9a>Fyv|IF8FARX|^+IgrL?SuIn~TM2fbm z+8dD?V8qIEpV|~=i5ZhY$p~{gIvSAN{-K)Iw5Yk(VRUuIK^h(Z{QgiO%tt>O9=o?3 z33x?X@6VTOb0&%$Q`YfDE{Gr~=0t5q;yhhL%P+;`LGj^X(NjzeZ+a zy<;7~X8T8rp`YsJxwck&NFQ*n4cK)jL-5;w23y@;0aP`ZyFQTd$0=a#%6XRTFs+!4Y(Y~o(Dr+P1z#@nOFIm z$a}L&p4$ryrxi7I2l3=L#jZh$NOR7*Fp+;ra8L$q#@$)gas_A|;Hq4|ZCDgGH&`hp zL_Rh%>dvz}zQZI1t~1os{m!5Aaq4R-!Ao%Ef1^Ab9$A{Iwzmp*i>~%C(H;d@iaA~U zE5|YRFvjCvUID+t13X_P+12$o1*1t+#3UQ;d(q9Xg40AwFB_^>${Y z_szevYbtL}!-AO2WDZ-3 z_Vz#9%H{Yn%}pC^$KYqz#q5c#E;*(L_OiIJDlkqxBMo1-T>Fd*YH zQTUTQ|D!F|#lGkIOe_JaF7rsN*t(j3ht*cQC9TMXcBp)OyZudU@m;~}p_u|yMjw|& z;h%!x0Bo@`X}b!zn<<{E*sAFHMxm1M#ut}Hv6XNotWuJw%bGMHY*L>unW4F}0GaP^ za08S;rP1uw?q!s9s<nz%PiFQvG-qzQu9NQ-k;w&{Wp?f!n0mpCp zp(gx8R^X82Hp^bgq1~jJ?S1B7*J5C2gJ|9>lhyqCxgl>d$PyyxYI7%GqLvYT+mWpcEobk9X_ zG46DB(9^b1G9q8OnTY7fpL)JEck$TX_BuFpS8|zL7I;@a1K*x!Q*fk)az&b`h4_6A z58gk|rH5wbMRb~1$kjcuG5Svv7Zexg05^mS=e^kaY~s%2`2z`u;!~$5nM9b-iR=@> zO`F2Hx640lkttL977l7H)S3YEjK)4yj@sLoJvf$jks?fHe^nAq2Cm&qq?T=;uPWaX4+Fa z*%u-A9IEnjOY2Jb<~h96Z(bMta^+2#gm99sr#`+veTGu`%&iGjaRbzE=RVrw$2&?{ zRt%0DIK@})OcfSH2BK#h6|YfMo=}F3KPc3v)>4cwvcMV4ZPKp7nW|Y}5r3!rSSaY| zZD!s*uargMYIvW-D9dGS%DJ?sjJOJR5&V$pqdV6nWA2lQn5DkAu zU4ZviU7PF9V+4Y%Lk!n*5OQ#H#0gQG)SCrZ5kI#}fQ0!~T>r~19%S8wQlO6HMkYM0 zvZC9jHh((9McMS72>&E_ti}{FxuQd8Q%5jZoGHrC@#K-s$gGQUn+=oYKDpgW(cN>v{ z3}F;ZuM}|Xkfxk%S@vbyLY7{5EE_P6vu|{cIXU=@Xg*vU|8c6)g6Wj%!#!+x#yZ;Dpnz|3ys}{4gtD~>>tvf?w z9C}3P@1O6gPYHEK5~nAVnNxMV>#Y*QC*~+IK8!QtybYM!xh6(hQ6FR3c~C@j+N=)6 zd#A+~L(ZnS#kbXHME3?0qOn;^6Kqy35ZoV$nF3cE?6*U{=S9k+fi(erqfyV%S{Y;2 z;bTL3uxJ$PQ+uDtkK}xn!#i4#WW{0L5^9sO&Im6rk&Muib=C#MlxS7A_IRuq`16xU z_%f@_Qtqbfdt~*7)f_sn+{XmF4UNLQXfzC?q0A$q|GLVzsq6^AQm@f~E?=r|hw)T4 zeDS_iRS8g+>t|K|taI@dAUU+k&tmK$G0iwqT9Nh?ZSpvQDabFQ;M#zgLGwZeQLEG zNqmvet(%FCu5)r90r5-4&WY6PjsiM^{G!f7{qs@SgHUt^ 
z^_@_5g^)Sh{b}TMI+3D+E~$RaQt~w7T@kJ|-VIsIxA!uFhikH4)Eeu7b9CecY1?@+eOxBG`lnt9D6F6(a~mPf>J^@q(eI+PNzI4Hg%Ez$6- z?CWwm_R^{HB>_9XB#F-Mi$HTsTd!E;N=b?dAWa^qM@K*En#|Fu%unVw@9El1KYD(F zWvr`TThm~GJ*()TQ*oiCH9$d!kZo5`ouoUiq2*;hxA;dAjmoLJnwoj{w$dkLdO#+T z#@eD`bk389X4FY5!YWP?sH^nr+DOAo74>ZEy2WvYc9j+Ru*!9Y7ysYe$%>8);B0MK zvo3k(m#chnGlN%jRtJL>O%_%wdQ5Spwu6j5L~XNv=eoWt3d|_Au0xwl`W`S{(?(-+ zmT%ux|AByyCQ#`jDvhv!k^dAWWzyFufh1zF_l>zzQl)hNgE3Zj5i2N$WA2zgtTz)a zH0&H{-H)J7{R_-(I(RH!gh<`wNUQn#ovW21h{DOl3h6EMh3;em)wxnHqbH02%7iUU zzKJ{M*mI)Yx^zea%T!RmrdSfn$i2S+E3~*jk*jr;Z`PYvjkB& zOs7@hwX$kWW>L5hT+~443X%|dI|?rqrw+V^Gel#9llFPsVMsFeQVt!M$A`-jl4f8d zf*bOgb(NsU^~2JerG!K;OUiU;7#0NO?lpaWDOxQY4C~FGHz@U6gI9hbEoZQhKxhOu zeTJpZ%^y*BZWub~<3H6BYka%n0DNGhg3Q_s!jal7-(JmXlZQLwD)S_Rz?K+!ghP7U z=nV-P!Un`6x3_CH8~&sHz`+Q9NT(7OCsb>@`N>cGKx6xyrP4NawkaVaP)r6s`SY>d zJ4<~I$_3TQR?P$i1I&orJHg=fmD{$=LH<7LoTZfnj?qQ{5~x>4eJc2{Ueuf3K(f_# zdVUecCmeH%gk89<*cnlN_-6ly(=@TY?4st)3^4K!qlu^LMREg9Cxxz+v4Jm`cH4hI z&^@e&GDZj;E-RRr!5j$biE*w@u!cEN#W=RU3^EG9Gr>#NpH^0Iu=Z|727HqCHz8hH zslwgu&28O1s!EapZP(LUp4Tv49LbJr39$@v7%3>D*8Lr}?A9-{lHxoc^>;bhW;^H0 zuARxZM>Szwky_C2j1>BPGpY}uIYxO&M{{AZXNE&0aLwFjE;gx?Uok%pjLOvqY*}@S zq1ZE;CO2pSky=*&1&iNaK0biWrZizT_Ih9^nc-v?!L|aaMLKy%lW!A0edf}8ihWFw z1H)SF(3pDWS3k!%v1;_Ybg`HAo#wJmo#uMJgtFI{$rb!Xhau10Y&9Hl1tQwBs&FsI zbLo5fA<#)Au$B4?A0=8x9?5h1UhTw?4UCdL+kfK<=(w`4Yctbbp0@vImU-dIv^fgS>w1qe1Zbe8F(K5f zyEzc)w$i2#G+y)cPnKCHyi^AzxCPs&TsuJQnud&^jR_Bl3NaDoUDg(E2U?C{W3NMg z{^e)sY8>a|(sO-LKPZa@B{8B7ga4{Ys0g@MD7b=iSL|*%xYs68>cCTUt$$1Gk2y@s zoZX_1z+7*)Kks(8Y7DC@I*c+b{k@!B{D;O1D~*^7U5~g1pDiAkHc6mWa(sL~`Toy7>;2#4XxsfdC{+=yutK33l$X{~ngJvC1~?c3zg0^#r#nqpaVN^w~2%7-+j zqOzQDhUo<7n8~d4?-7(VT(N4Q?!f|V$8mM?&o`54(u<#UiENb+l%UFzWNyDtX@nJX zBjm{)sbkGh9!&P{I4{+{O-h;~z6!GGoiMPko?FVpSrOg=&T8B;b($gHWWx~xr3q3SYm^W} zUxfqI4bMiT{_;J3*X=>K;R3K1G{oP)1{+-3ZMkrVc6$>4Gia|MmuBj(Z^=l&(vqBg zu*skC7b2s04=tIq4cC6VmAj&04KaNl<`#8I6Z70zqu}LmKq#x1yg`t)qaA>me4FJ( z1rxRJbcx3nvsCt!;yL49MxEt>vaHzo|5ERG!^kDC(`X9&IRq2*mOD7J$oq}_FeR$` 
za@LtCLPFnQEtepLDCnDY7F5dP`;(MW%jeSm8bGDHf;?8Nv?dV#!C6XZmi=Ru!g)XW ztK9e9eqt#6?&OSELN8=8L#1{V;gKIJjqzan#IgC%v8#z;HWs6#> zVl;Lo`g6W(0p^d;Z^eM2yrz!wJjQ#VLhWGgA0+(Tqlg-RAAdwI2WSyiulowpnD}jg z5P~eS8lsOZM7EbSbJ>V#QhfO3tC1xz;h;GEwyrvO{~EdDnGYmyLasb!H(QLILX>g) z0hH!|Y{k?eOF=rISzCEe17O`}rNo9iuu+SOw%Z*MD6TOhwv#DI-et*vN8b{dJ#|*jRIR zps~DSvCW;Jzv1%Vd;Q0obKcH#sgU?g_ZZ$S0WSiW6hLM(lLOm#H>;QU1 zog~Sq^|MRx5h5eM*Fl#-IHPVe znjmtcmRDk0#;=w&Bl!)2#ExC|5U1s%TZ{=aPJQR8nK#!r)X$x+smJcC`+HYe?2C2I z&zKLniAOPqSJ!2H7P*JL8k zzVWq8!0^;0I+I5FKrf|yHRK|k2;)6fj=DNqzAMXpIHUJtcpZ01zg&TW8#2Qcig94| zkZ^@{utX1=|`)N&Z8n2;>krVmZ;@BJkx!R zU_1!Q{rl>pU}F0EN~UJvb9Ff)m*fx|_~d-kYQ}rh|vJ^iS}91HQRtw;l6!FquBaMk>+xh5LSF`*D$Xwg!53(nHE)Y5VjHKeYkVaIuA7 zR=KG=QwbZWgQbsA99F}%FLgGsHxEJvo?;zo|LUGkhMrd;6b{D>Xq`X@8lo)XhU%Y& z#bvS@(6OS+;26BISZLk1Z7#h%clbWue2UvL7eCL&@AL(?b^Ytq^Lx8{cy{4(545jC zaZw^|jkVeGM&#nJVxB*~H3LXc2jDi2c&C-wH|i0D=`);AzuB?Fj~7#;p?xpq`IPN> znh}2x%Be&?A^z!Ub);pYBYRsqrDdLJ?Kj1H?`e0*9x@H`7-6gkwMTbb*LHck@FanmcP2l zl3%b973D%GBI4R$W5bOsF))_w1doMA>}`J%ULyY#gLf;MTCM6kKhmPdFnx8!wjiNyItyHzUM{T%!$Ky+SpWc_~y3#FKC}vv|XqaboVUY zLcP)25@WzkefnMWK>HaX6Zo0rCs_CwKg_?*yKcv*{E*0%+KQonS~O7QjGw_oWbh{v zTt0JBXKog|7@U23R^)ijn9~N4_l;@>+)MpwJ-!yUM6bf4&S=jP6CPpFD#3J<>;B}w zj3r|dOfM0?b9t=HH+ShT#$0W6r$Cv0U;iDQbGRmOB#`omJ?ip+ZXPbjMPdDX)6>VW z70qKa2c5I=$JfpMiI}35;VUR;cEvd0@n;fm;l3^lA7>!XL^)PuCg=jL&puJy+>-QX zmjDrIDoMK4$(PkL7Y#e6X;D4JJX|6pqSK&lX12*5TY_v7I=$Zx0LkhXLoUd4biHLk zr!`erjKK{xzjHEKG2^VUAylJb)IUsP!K74>1oH(y+utuM(XdomkaM!m<(51lndMs- zJ`G2bJHoz{nsNHKl@&aHu?X2HAv@5K?QM$WDIq^WEV^yUSXL=EYeEyRjd={~4_5kyc$1R1FiA{Kp9^y@%2;`^WrsNqb;x*)vj6x(vp@ud&P$ z>Xlk4^5fhPfvwvd_Mn?wr5z_*WGW}+6PGfq6m$HLv*KA&{BJ0ji##()Y*gl@zgJ+q z;k0A2zW-1{G*f-1=?fhs?-eT9O^hBGrrDOI>i-ljgvq{yvoD3M;*F7=*11)^QG$Cs z`ead4&^7qFb}fuaBe6uuP^?Wx5`k(e)E^%#C#L7a~IlG9Fd_ffx8!aQ+Na4 zsdV_s_pnmUFrCvY#f!TaQAsm`8LQbme)?E#4yyn9eFuNIo$1gvp9gI+Ztv6`fSto0 zKk$SpZF~e__Ssg)VT!|UZ|+%X`}@N!%6XCepL#1?8j|@KoG4I#Bl*Z5le+~Jr9?+7 z7P0OU@p06E7ArgD(GA|?^ooNftuznfa_J<$sQ}CFabUYPQbLmRK0ELO+njsJ)L85x 
z{4%yIF-sFGlCzmZ%E{M8^Gm%Q*AD5d^T-C-x57ZOEW^*bd0N9XB+;w28E^)UKWC)Y z)Th)$%Zc@mEob?&cP`h1hv#&U=)5hb!`ad?TxV(w^pHahFpK*U5LUbq?yDm(f18SG zaibX2ZDzL}6vH>6}qh;K}=4tlu@3KZTuASOvN(3IqCb&E_HEXM~4y;>!FJkN3m#^hY`oqhq z=Ds^gP~v`q@r>n=_K1={WYoKr{X0AkKYW38D{f3eQt$)PL09a+3lusMbx6}gUW<$iu1Sek&42t5K(eQ^=uq$UCZnu0DAg%wO30lf06C-3XwPJjg>G!ka2&l zpyw~d^g!5o4sgRHL53n|fkl!#enp!+T#J_Rf~fN1+s`rwS{A7;+>^Ad0Im6IH54OX z0u@_m!jBEBn;Z!yIXSFw3A*)XiPCRMVI6WXi_uOs>5yzS7)hec=;P+y| zEiU15>O1}~*4{EKj;3w@OmHW-Lju9wWspFSU_)?sm%*I?2?V#`GPnl!!9BRUyAAG= z!T#LOd+hH0yt^Ow>r{7FcXc1v)K$N_&a*|}*HWh+_-|5_kI7wK?Zc{W>K8n41BewEUz4~RiS?g9_}5+ zxj7s&ifceLSZY606mPs(@t1R6O6_Kfc`zH3PG@@!TFX3RZqiD%rv`iJG_fSeF%Ae7wXk(yX@KE%h_0=|CQ(X92-gd%OUJTJ$6# zFZ!V8>?39VblBjqsl=G*WaQ2JG`X-*a7C0=;;y%_U|-_*4|vfiZ`Ve0L_wY8^5}2~B9o)>d za@*J3>UvYkRgT{xZ2YX;tjf)SO%CfSSB>(!yq|YzbSHX55qyaKY*mC`ph)HsUw~)e z_DzaGLxwRxn`hR7mZcVkO*DP&F_(V-a|N}dJ*y$HDf}w6pD}p%Ypn9zg(wW+(p@E? z9m<0~l{CJm(6%nGe^@r%rh2)D@A?GOVRybaf|j3DdB9g;lA?#$7)Fh4=0eJa$YY=7 z?xqTbdi}1B%$NUYn^58;lv~9Y>`rd>DfqWtOv5jp!K$lS%humxKXMAZi_hH5#|AHZ zm}@1q=;h()M+H(sJ~&`|7w$l}I_*flZxPjGvX==M zb5x2zTJNG#rM1`Ln0Dy5bjcJw#qNzz645`nr4GRk7FDNuZ~X=$URqyzth1IEfi*XG z`^S1s z+}Jb%BNh?{hYKZ6u>1vnaUe6B&Dl|5G#qJ!E6IAk&zy-4YqnPhhhxCO%NTiD#6qf> zeG4_RRi~$2`8t#iW-*yC9``2!Ralnq@iU5wl}BM712&XouON?~3eIAr_3x6MBl)(~ zNE<7Ry_keaPe^L|8%$PM(Qu)6=zH?Q4%qx-pso4m!T+vB>U_WPs`6>)v%$gt|KfUG z{}{D(8Cx@uyms3=}tznULkNeGBs1#Wd;qUwj9let4%vs?v zeDiA2*8Rtf5tppl?U0Y_9|7V@ef0f|KJd`6EVDH}WXf1sZzTq?Phge2UdKo|p?KN8sbsSMQlUAhrbo>$o!wu5F<9|uQ=Un6k9nn;QIY1AM`4CX3L3ex!Bh%)>ID_BM7XDl zd$BsL$`y#HnoABO{dt0=dn9DKH%fJ(^bW5^qs!MhIE+iXs;uk*AqVL2^WbC@B~J;hTrfR!q1x2dj#_fiP8Q6&SfoN&+l2S!vo}Nf9}NATyc%`gsDR(6$?S5&xwtFDIgnDAg?6+np2s~9K58YH3idoZ=b#+Zc1Y%%O> zN)4?@rAFCjS1rt`#!IF0L$QWt!@#oYUmHjFoCD2z&Z0E)^1nT*LzRXRyL<$WGQ@xH z;Ngq5NRQjWVV~3fr6@d``m8phlRF{qt(d;0?C%$P0*`5lx zREZ>v(85*K!u?w-eAQ@}_l;{WLFv0k7_sKudEGR^l9BdogqooOs2R_Sxl7X|HJz@{ z&Pbr{i#bbB=;DdsTMCAoOOh4?+~$q3l$?wlOH@u%#UyRKtGk79C^2A?C`*}ZQxWG8>`O!{A-o7YLBUh}tM6GjY2W1iQsySiH! 
z+ug+?5hUXRAF5ax7JW5la}2TVHC>s?FK#`T|B|XJQAN;R<;K(}w*hTOoIPgl#{>*Yy~A$$(#Vw$Lw*wdpB8V}y2w0LV72A`wj_ z%*ST!TdMO#g+cjz!$1KJ9FVU&&jCj1JugsXO#9waBu15~VDjgf{IRdry+G(lvB4>I zdJ?IC%iMHoPT?nRl^ZY9;)#bHoDkN!IO62XN%b6w=1=237J!{8mtkMZZnUKVg@)S5 z4ejSQ+(~p8xIZ$~(W9(a#g&;Gnr(IoEqZ-9+CJE5XJHRKcrFd?PR+4gIzK0ZKAwRD z?;h>=(jXyI{Ks9 z7ugzFL+hK(mh%}p>yTS_k8=I}j^7J4?sJGIAJB9T^v2}I<+T~xsTu5dkA2Q1-tK6x zPueWi&T*q_;6JWNyk!>6s+se+Ei2cXiRDk!t=E{k+7eBPzXa_D%&q>2L>w%`hqk9# zl#ie}0{hLIewXQ%qD+b~^E2;Ski&Ng94-;=Hl5;ojxU_xf6>R2;-3;1XU`ndKH{@y z*+-JR>wkUuMqU?L!+UnmXE}}aqv;DN&fYwUwHlgJx#bKg@X7k!Yrz9#_0?7G8j>Oo zx%d+sZv>lsw@bI|eLuXP$}5gcgrC#fw}oS>9N2mlx=waeIEb%rVY`m8Aj)26)(~QI z3P(ePuoynM+r;Bgpy$9Jxcy50hLp>A9TN$~PKUI&1h$(U_1Vqne5pi9eKHdceLh*Z z!E^nH+?LXCLuWr+6j4Zl9r_EldtcDnt#<)bnWQN5Bd!} zIq;D-R6Jjtp=&dsWNeC>Eh|zqd+M6x3z`hFU4 zDs1;-ikDJ1*Ikz@Lr<=@c0T{+d;m^2udgs@CA>a&X$`_k2Fcj-Yi?#cGWmN>p!Ul; zNw7iL?U;HwnxM{@iODI}Jl|ldySXoze0i(%%_PJv_vynj9E{dChF^Vk zJsT{g=3>a8qoC{-jB(t zwF~n)t)6NacUc2URgexMFY&B@5jEQD&j0z;D@zWcrE<RKTsa1<&4a&Pi#?v=$7m6!`NT$k zxb_JNb#*2@bSAOtG~DjG#%8%;)T|I{eIGVM7MPi|>TA;+cM?h0k98A-=Qd?X7xZsU zWV7Tk?Hj*G)V$A8tU2<|9l|Q13WfZPcK}Phr*_M5ThyfI#ZaHIb;&<%i6=_8)fFhL zwQGPK%ILYz%h|%1PJi&B3|O-$itn4+UBRYvUbZ)kLU#}2~(Vzc6l8rr8iH&oB6o`1lEnJ4Nq!b1|jaa=~w`?6@IhJo~`uz`g3 zYRi~{0|eA2N5^jlGER$#r@_+-EY?mzjMY+v0+EWAu~Mjj0T|B$V^~{9zr;cpuMj5; z8j<45N7G8Y8BqS_{=%;9swI9CG?IEN;FdrHAM)x1h_kr}EbIXRVzjSpAri$xfTM;u z2?KS%ApY_8YML%r+VPPV5$1uw)^Zn{H>$gU!2!@@^7G=257M`)<`C^!plb>;5O0ir zzgNU`y4d&T@z7khPS%yu(5fQGvH?A$p1G!#o8vRKWGng_!N&XrZBLSVG6fnGI;ead z*p|U+OJN8g+`0_Pbc%1D)}~+1Ij9#bhlnroWpZkqEXwxlK+ohJ=rhdU#!XmnHkOX~ z0OpV^0;h|;7AQUT(lQL(nHC2L&JNul>U@mt$4c*KC@kGs43(}ty=o1q!FIew4G!y| zO@S8f&TuEMT_UOw8P+gE%3hY1(FQi|yASs4tH~DW8AN!?)drL4md}T|>1Ghj;}LGm zOhrRY3Seo6YJ%Hj^(xf(5%D{S=Z^Q{lI=KWwLgvHvK7UP>3p8-6y0t(p*&7%m%F%@ ztHSipw`N1erk|Nq4Un~bj;5~LlWQ6m-Vj7p%c2{=V2 zIPG~^-V{F|GF!rmXzps*I;B;9G%HOdpGwCRKGby)p7 z#q4gK;!hsRp+B{I4UpJc#!%gVYYzV;Q6L2k>9X&kM%QQOTxtw!9uzMUkS;oWMZ{KJ 
zu*?Y|o$8<%sjp6aP?PFQ+0thJxaH=JXI%N?waioDm^2tXAWE2~*~XQpP_FY{Aq8dm zFwUi)k#;_XnMr+^$Qs35y6FwyI-7P2{+K1B(=A*^v-jKhZrz0=-9-=& z?VJmbYxrHneF)BrS^TR%FbCO3WK2Hq`6G71bWC4`_J2_@3qIX4+xofU?}`EIq=_+& zR|^j~8N#i`RRNgXK&u36oH?DHo-E~$$F^b zCFX01R8am{{Y7dx_hi-!wrMe08z#dJ$bAclEZ=$D!b_F^N^SfS6RF%L?Mc#ISjGI4 zy++r}`17vbN6F39s*i}$PaM0NQS=*JO<-c>$Yu0wtJx#F8zQRAY_nfbI~M5|)aZT$ z@~X*V#O7Uz0F#Nl`<2LSeltL1P6{0rpUlN+eU9-s1M2QY@_f&GK5IH$FVa*rRTIt# zc2l39v{=^&4t|JytJlk)hufW&*5-EK@QDe60w5&YCqGAp^X%BUZVUX4e#Q#Mj5>KU z5E1$+4d8fDe7KH#zQE;^*|Vi_Y{acX6*#fpdEm|xf8lU1V5Qf;pe&?6fJ;2G zKafu4C#EXGP5XliGNXE(N)`=Wn!64Vcy}>O->FWLY+lcWV^6A1GU8}r zK^czA&}OFyy!lWp$+53nXKQqUdqG#|WQ0EzxBUidl3c-t>{E<@HasjhU;Z7xAl!us zhUhwk-|InuiqcPQjgnR$!WFuL`36TS!HHX>zSy+aw_nXoRP$y!BM^rX?ae#o2MZ!3 z?@X>X-ZilEVQZ51@C3H2{20~l;<@v0%6BkPNAzjDv}V`ZcfMI+6!IZhKzZX$HtwCx zR=eO^I^DR!O#v`1Z&kQQ6ThdEl^+yDGZsd>`l(ygchmqE|1yBXF?Rdu17&yWB%?Q) zMTp0V<`IV{hr~1osaPV`B#MxrsP6w@v#*6rehS8d?g76P7>}E8kNBxbW4`971afQT z^oFLB$cCnd;H1Hg@(PF+*ih8I^-6mMXJ2N&E4MlOkpdFs)lB_{Xon)yD$0Qoc&Ci- zfH{;jIcCA_iki@mzyt=(3sW9}ADb1bL5e!Xkp?GH*d0vXcXxKTy!2!M zQQkPHx)59Q@gKBwg%Z1l&=*4KKcHi8;qcl4ao|4=mr}Qk6_}CS~_3vMEbjGaw0~ey@Ecw;s zNJ_>PvY6*6W!#(T#A$p3rfh$W;zsg3RU-a>s8I#YMv}B0)yo_MnDvw9H+-#~8T74$ z@GSx%F^aA*1cExHsG*e2y@@54=%I@HZ4tt>Kxv$ezCm{Lg(T7=F1v`79C8aFVAv1p z*SQ1krqD!J`-xfRWSEhH&wdk0A!KHmVVC>CUIY|P-{?Se_*kVxQQCE|ifTmHMD4%u zTgN@al&(LTqT|=xFE-dmi?xer1#wPAj0cQ;hdBt|P&MUd0QGk^;iWg^&<_jt{OnuD z0!Oz6KjM?V(s6NycbMWw;EK4qrxS8wB^ZxO&;!xKM-7#GdvcV+F9h7ulpfz?Mc&PP z^!3I|kiH-dsVR>fOs~y@; z8wL8kmzQO}pI>(%%&Q~ryJ+!RhO~j0%ja^aGv?c0NgC-=A+HWY?%Wn)^(o=uhv%z* zH2gkKF7+Y_V}t`?-)zKJ_94~LM9k9~gex@)G{oat)Pn}P^n4CX&(}w{;n|GyDF>+b z-+V+)iUAzpni*#ygTOs~nY|O+xMeF4pvmXQQ6 zrSoTbpCtTY7y?`g+(T1FHlBA!V(=Z;>4;(X=MC~SC_L^sWtvuUl+Ah^!S?>s#}Ww>QXx?d5}|+3M+JJv2Zd5 zyq~W)QfksR&ww>scf6)wI`+S>r^#NNv@7yFKmw*q!Ui_(L?`@}QH(4dLcH`r(x)Xo zqa)ylqv%e~3Y;NOH_nGn+nOEAl0vj!o$PtA)LBKrKddiXDGE?L&nXAr(qE&Xw-iC*^((3qMTS zy=Q`{|J*=x;&(d*?zdNcsGJ0xXpr{?X>sM-_cG_{QP;V~)Tu~^t&DjW1DcC)=RIJZ|zf-1(-`e2wYt<)~v9qrw^3dU9& 
z9NMzE{PJOJn4hEiITo8RnNUDxAfwXp!TUYJ2+|;Y*PPFXRnDt~!`ZO!+Az`C9ceVS z>G=K4;=v}tSNmUP0lB5@u`ornGzs2xbn1w2Z9-Cla+9A=R zJ?$(*A}|y~2TSoCs}KsXjfd2{*?Chq$+y6o?I$hCu;m4MvR1Pv>(-GKcdWgqySyc7 zy))ra)c&71{&wWqXpvH~Hv5CP=gk*`N^h(`#f2>VJXy%Nr@OiXWsB`y%SlR1UhgyW zmCFNHH(1DSu>w0M5a#yidy$HJesP-rry%VO4F~SnRG&?nbjh&m-nBN9ai0DuDKIY$ z7yT@TINz5%npeEz7x(~bM8ydo+$z2i{Hc?Lrs+dk7l-kGYxAqYN47`8At$&K<*kxV;%9FF5I!*k4gRD->e?cRJd&@d_@m1`gN6#j-nZp(Hk0W% z-6kTTKM=_g_M?Leh0w9YG~e3zrHeXp`S3XN4XQ9pTX}QMf!nFRIMyw{@~r_4Y&BRP zD?(G;2RT9;WFuu0E(2l&y7>;n<9W1ev8a&Wd_v;AwebljR&OrkL#!U+wf#dL-@L(U z9N73 z9}_kEEv1iMY4TvY=(idp`U>F4^Ne>sJOW>ikfOye!Tp)R`i|{ln7QaL;^5nc{Ryp~ ze0&l-bJH;s|Bx)=FWy4{Bk7j7tIN48J^e6#^WRei+?ogehUwBEt~_HD(t zc^x++9#uT2FcXJV^%02%BKr4(Y#U3Oj8#3kbAH@?&5- zFGlHzUNm(yd1{frwqi1y1#vO~HlpG=lHy4RUm`4LyAwu|ixb?Ao*Gnk0kx zKuHmQG3b+uha_A&%Vhq+Sl{F#7M+!T#;YAx5oR0c6%vH2*jY&8JE);0peUh`#2CBN zrK%GfLFjlkO=z9xtx@jvl*@VSa|LO_pRNAgZQ|QK1v6L8OA-F7)Un&b$ z^3K!wqtrf+fafRlyYW~fd1&4s$r#2aA{lgy8w7UMODs*ijAoI*78*0lML0@b_3Mub zvLeYcNz{?`cBHOJ*;HQmR2d$<UOS)1YHt}oUOGJ zG@mMiA#w{hia)dA(MjNoztT`h^JEZGSDbRv;=F7*t*=g?{gU+gw#)Y(;#FXW^ zJPTQQV$azI1K3>FwVA4KtTEf4x{Ii;^9_T{8`^MRLnG=Svt6YF#>pK*Mv)1){qxTP z>?$|f`e=maoVybanAEaKRn_n&@pZ0TP%p=-7YkiG<>~yy{&&FgrYWAW4%gF3uSP5m%Vz2$*XC~RBJirr!qCk!LQ6v8x;%Ey{qO+QQSK)-u{H_} z$N9LAB~a6q(3$b6~E9z1!?7 zHZyS6^#OTJ@0y4b*)?q)q$k$yHivA@@wqf7y6i+3I?o--Ep_kYOLdXwTVXOQvw}QU z(Hz$n5zp&Q3bx}ap=hdWH5RdiBX&)aw-AIAjC+fAEx)k!lKM2=U*8b8=PH5?0jc@{ zOc2`w_>W6*%l5pC4SPc#B za^*fX`p$W{4KcNYdh2g>l}|z}Yu&+>F2oXiWI4Du>DALlCpkMWt5QYJ3zSpP5hD%5R^#JE&ZzD^CcDmsg%kkC^8IKs{Ra zljco^5*>?l5Tty>#Z0HAimE6*w%QsrFwKFAYk?xCHN9lkCM2S;92Dg6-p1TWmw>9+ zRs~0MyJD@Z;){seL;B9=`j)bq4#Unyr*J^UhS{@DSq{h;aA-NQiP94?#@*U-hpEDG|4?3jxh5MG~`c8|Y z-u@}$WdgooZex@1MdeQq)yZTlCuQNy3D0}SPTUNAuglz15ZF;omg1efOTYB#JVYct zD%Y@5r-e9c0cr4J(6C}Ps_s`HX!%g6&&=m|*zQE4D};&0l+?U76)Y&8gXZ*YXU);o48u5Fmaid@74-xwbfj{DAH3S&Fh zW1%UsA04uLSXSjo$vZ>ik9pbPIjd^Z+u%3Y-{ba|;m{q=PQF7zrMXYryvOGGw78Vm 
z4n&xHutGXNk@Bkub?<@{0g`Irv!`Gi{dcXt}ghiq!oqG%YHHEQ=;ZJ*>$eAmu)wHjSRWgU(&IkOc0 zx#ngHc_CH}SlF@FvNe!0vpbKVlGZ+Gc^KqCz+UBnH07vlzTp!C6#Uq7jlS+Tq1drA z%)V(pe-i8HnGjD;In@42zWYKS5SLOPyo`)jH@{i(n)gnhw#B~zclst*fAx(t>^4xk zA~i_bZQWURpULC*{6NU=EZ)*l3(vY$LO9qkvt8JDV#f^dZERNeeKKf3d>Km5D>(Mc z6+8zpch3!>5_VJdN8QX<1Gi;8p={wt+*gS=W%YAI~-KE$&c*VFx=e{{uh z)xgIFP~RPFr={lbxAi|LpyWw= z-T3GsjN}oMD~rrq+`ihr?3jBD!hKW-IMNmB5iQ(R(Ay*nx*5D+t5}07 zv@ud|vD7xneg@lcbvszE`GZDNW#35Og2+~-SW>h{YBZycewIL1C{>{^=pl8%SdTrjM+ zWJpDE3??jTFn{kR_24#h@q{!1|Dc(~Ko}_cxyZB{MZc?-*G|j%%fbp@6~gbPr{HQ~ zxajzSmi6Be>4O6a`f$lPsLcqyEoA)*Jjbrj7)J=#I4L6GR=yLmcsQDx_uS64Ea|j- zyiCsFVoW|UHEyTykn7t5hc=le{~xq)G+36O@tYHJiK@N?w`?d-E3PjJD>)~AN=&N2 zWqsdavd(BBBXo-nHQ4v-4ASLvu_Nawp_cgutn^5ca9PP^!k9$iC-S_-wAx32b?)`5 zZoOK+S4-|1vt*=#t#!+O6p3Y|(paP2hAAGYx~A?KsTNQMqb^HAXKlTf9_h!T+~;mA zJOfOGz|=BS{^S4=X!9K-RnqYA9~Jn8tXuF|l`G=$PzT$3?XStnAF-T~KwOwXA9^y{ z(C=GZ%>k{h+P@#IKf@_X;{N>b7RJEc^Px9|bL$sLcgnB63{eZm+{69MTj)ZJ`RaO9 z2_$L9l908gYH%htk?P^_KjAX|rpX^dphHPaouIDecJXW}mSg-QoJHWy%>=?J1+a3xfA9T%A?U=Eo70 znb~BB1P+PhWE;W-($+XY`!clN$nr%WIU4lq&#%k0h+1M%r!W%5dWzO(a!++i_Nrv)MI&btpUIMUR|7~ZjY|6g) zBR1F+gH6JxPP1^L-4Bp~_Jm`b{j=;q2T@}yt58N;G}gtvvi1*kW|L#&x2Z;}PPg5N zLN2>-#_n0qIdKyPLVCEyiu?DB03TJ=3BVq~OA(i`-4uC2R%1CE8ZZA9#}#e7Wb++1 zIk^Eo#f2z=NA^kmbYPiJXrGDTP_{v^4^{NBB= zl5Y|{uYZxTax)JAW;IO&<$_l*{bQ#C!y2-h6I!eN9x@v0np9CzQb9p~o78MIT}-xD@&ELqerb4B!##L3jdf8*2ek#R5H7kgvdR-7e(W; z2^3MaQyZ=u#K8$ZkZm>^-y5Wly2Q0WiB>H(wmi6~F=R%U0W^S&x7FgP7k&!%sJ@_J z$fAOaFY9s8OmFFQ>#Q7n2)6IiUKShqlIkyOQo4Hkopj{Jl&z3z+l8TyLeDO$h9gNA znn6svoapot4X*0>$w$UkId=0#4Vv%qU2aIcMf#$XRIH*<0HOfb*dg8%V|@s`W`FYy zwGKeH?1<#|_o&a85rGQqP;h&8vM_wgPdE(hm?3AZvlOq64VxJ0I-feFMIl_Z*82nB zmTrdmN4M|%z|yFY$8v_G)PkGZWav}@`t@qiN5?ZU*CXV1{H|>VE>E}Aj?N3_GBigW z`Os}lS@9f|)_GFaZFCio85(t=PhoYL-*}kLSDvOnn}>6uOOs)aKTTp$q|U{H zQ(8w(-#qp%RYU8m*#xT2qm(69yckBgGN0}{F$pfXqMauW zu4z?)i)@{qBBkTg%+$j5CcUE(TG1=L?)@=lBDSfB+EdHciaH)}5nd&L-(W4hKQuBJ zZ|iCsp1z8?S+6(e>3_ib>=)4_uy2M?|1#*q;1ALGJfNgo{m4hCnjony 
zFg3_n9chL+{NvCu43`AO>jtB@Mj-LcC3{f+g-}7Lhx~rzYT!&);Dk+Lfj%hinPIeR zbGem67B!tgwQ?RYdU~G?B9iQb4;v30X3tE3u>;=>P!p zjuBQ|a`DH?DEllA@bNc>4v2Sil5cc^l1!$p;@?Z1`}&x<%CQU%?a4C?H;!-2V&wSd zHQhj>Zs@^43C*5*J3>PlCKFTz%eHxW+l!E~EgtGQC5#yXK~vZkov#NaJ2ynw6k7C1 z?YH|OmmFvVC3X$2Q}FJvTF0&%QB*E4JU*{wb9spHOHiCRU{*|ZB1<8$^@Vx`;E7il z6dsVIv-Lo5K)Mp^$GjsW=6`C^7+VK4e`pfW?~Ev9`thEFynJq zVewFD<`m+U|7@!rJLc=`w3l!jcQ)Q@d~{p1_F>k3n&rAwtvD?8+t^~+;bHrc=cG#1 zJl4-R5)-gHSLCh;RF@z<$#uRs;-@b>z6u&D=|8aZ?W5xZtMtdfe(xQ9uk>znl(5v920in_1 zF|kka1P4C`#%noNZ~IRj*fL>Dxuj5^VhDS;H740D8|Rwjlzn_Wb8Uj&^J=}uF`iB% z+nxw0>-&4Y5o7q-fK~VK@Y%IU98>`ASgP6j>EN(RbBUHQip`tw}QJ-E|Yl0;>Q zPA2kwi&lHdY_22r%cnwC+J1Ej2&c2m@*XgsIUA-!2=dNBq-yCH()L0ScG-;e$FIW7 zpsd!eg$?;%i8{Ufa0HF|RmH%1R0q8w&~H##iv}Cp1Ncjbeg6HL=wz-sVN0~uvF^gx;_HCQAD(?Rqe#q|i6#mcdXFKavA$z#Us2kefrt{B?}0xz ze7xl1#)>FCybe3;q2^zgN1$e)2V)>X*U(>26?4*yJwEkIVs@Nz{0kal#YLP3%3YYcGCXgu%^)P7- zUKp4pKII$=iF(6d%pg)cuB5c;vtuBsV`mXxe-bXH-~})Y1xJf-?EMF82`&vf z{%@-)JZd8gh(otDH_JcOVgF)IE9C!3JMLN0_Z0juc%>Nn54`&S_mm>C=;8hL3trge ze5N2=yWf}fZ`Yi#Jr^-yxTWCyf8W55qlnk;E7{lB!AmwTXWOo)*w2}nWQ~$OYOoQ$ zr#)pN^sg4b(F)v$T9`xe6J@ySF; z$7Alm`45j3c_JDy*@n&21DTg#(Tj=c>+`F|rd@*T9~XFEe2`af105#%iKuA3_m`nG zXa2Px6_lpvU#8cmeX&=Rx_q*7VqV-6GsT?Jp$6U85fbJ`g>A2J)Yg}aV#vmNPk+`&~bvWdOKs9Fe)aVRU+@T=En||7AGom{O!9L z;d-!X31$A$zF*&eu!T%-z1I#fqaO0ZHLnb>smwO?J0sh1xtjm4K^}hdwl>miBIbce-`#9oSfE-#0{dw0eGk@#<6Xy4EI_m{TI2qmkfO)MO-!NJ9Tm#GsNUu~$Oe_7b&lybomb=zhGO7p=(SVv@$Y>Kc?vOM7 z*zd3Sl9%%L3CZ^$FJ>ysUPO_~^XST?@9+BR%Q)21e@{`ei)spb=AUWPBP>EeIK+H7dm%?v$$SeV>9m-C)~ z!Kj3Bk549VXd{PY<&)D5Jb^~FLx-E?{MSKcydtp_RJI4?0`|gRx~>#?^(lk@qKa9)TuCv{vWd{~ ztZXsaArjZ_rHcmx2X>>7Qqiaa?uOMHq~@b0OL2tS?eK)gb`_%w&^Wh?>A!bHZ0t#P z_9M0mB6=4$`6W7MHcdGC~Npw-T=cHgxt=d6FTit)SYdL6nfJdw}>vVz@? 
zLTm>y`EiPlj5QQE-m)T%&8GgWi;*|A(=ZN;V*kzd`6cbl+>j0o`ZR~2X+@r=Oj+;q zjN3iQ#zX(cM}bwpdE4X1(daYqL7;;#KAC94L41sf;6R&$GoSAg;-BUtmO1?nnp^Jc zF5*e#*iNvd|1-N6-X3efeuj_IL}0a3(LnNLD-Y7Jf`OU#t)s!+9W=^e)0`eU&s~yuw$HPki!~ zg4!cHr~X-#eTBV-+RxM?xApYJw@|T(9I+ zwe!7Tx`rCk3F1(>$8VZ$>H@j|B|pZtPoy$Ke$Q!jeuu*4jOu)nb0)m?xkv02Bu}j5 z)5?qW23oXLv?pRv9#0xGl^M~AciRHUD9`kgG_`FmjI0OJDj>yq9Yl3&3#@!+n3brVxix${`pTW{;q;xM z9^67XF`a^ldhZycV|7o_x!J`xphS+BDjn_N%R$X7e*br%TSMakkZjqD>|lNUu0`;C zo=7%a%R6}U8H$u#5dnQo>#IfWe`N&xoqWsJ@2Mn>!44R7JTv*9^HlD8n%0H4ybu7J zOb(KOB1uP@Q56Q=%sEb#x@?4~I}9nExef@NL* zufP9K+Rx?r=dGCdk^h(8$^eNI>FWtUl66tszxT_3+L`|U@BDwl21 zY(P>I$E}~x^W|!e>5BDEa?B+|cC^VnjsX2?VKSDCm*oWfyKlty{TT zmC_BZfv%xZq29EsmA;ZB9QF>x`bpe9-h1|)3JVd7O+bTl&of&Vt7yyMh+q=n?6y)> z;G+ze>35Vhp;=2?Fef{tW^AM{Ii0_@MVI>Dk&%(cPX$vNu&Rl;Hecv-6+`dLI&`j* z%OzOzy@dWI&o|(WQ**jyF!cNSNV|Q1*^2|Pd&JtU_cS<)JINgSZlaBk6tRL#PG8xA$ zpFWy)3*x8hR5nZI{i*ide#{i3fhv7bn)~%zBe0+S&{8TqG5_G|0 zW`F?g1IZ^|X)p0j4WSDA#Jg{Hx8M83Cdj*j2!Y5kj;8~ztM?}l#6Cd!_@V}O1qSGE zK3XKbiHP2)&2Su2v`5kE|0+1OYs{S(2MgV zS9ns)bTv4laSCj@5E07HwX%m~5X6iC8&C&u=$3W(jzpu_py6_OUN!r=r>4iC{ljcP zZtrW1$Lm#l*HF7yl1_{Y{tvw24a?N>OG|be1@{Q407PhY;m>>O_qBSWvJx$6@ z`uTXh6+R{203Bt?+yW*5&xw(Dt$E!p4s-lRql0U5TOppNlYb~u>V1<>=ZI_)3DJ_9MgD>k5l zE8!`Bb(xP+*!tWr4q&VkH2SeFTohl)X_j{IFSl5G^*PpcNEy&3tzJ|kCVBtvHCsAP zlZz@?_qFCg>RRbeO?X&CTd_u zV%;QoC2MaOeSf)B`7=S~64-T`*AcWFp`Xw0?(AtB<-@b4((41M^C zM#1+iXi{Xf0hySe8jW0g$<2tSjqA!@+87j472}K9h0lBcNXjCLmo~gzt<8M&Zqej4 z|B*8FR{V`!@r>_l@|`by-r+Vauqd}Q?)`6;1EZAn7X|{3mW!9k@5h&hjdsu+EY`KU z5L^HNhi&D;W;PJm+FmBwc2UT=x1W2?}yd@*9ei?NBk zY)vz%4t)A32m$hMCfcv>?`ujoYbU2`le()5)zzmyeCeR#G$^DaPBa|8^_)$1T}>AV zO?)tI@qojbr=LyOLZAG!7b-YerB3{dIUX~EvMMLBpAQ@D}Cw6LdD_ViA1aD8latO~ZQ zUk?MK_k$f$8Z&-l8y*4UTVOz)UYUjb1&-1g1;Ot@msSd z`IKvJvW&%mwgLc>1cpQRO-8$_SsTXtE_1m0uJF_-J1MYoNUgQ?hncolEYf5pmix5r zI(i zR_GkX+Of1I_rSA}rx~7qskxMv>@b3psD3***=WbJ`og34wzkrx`~F(?hpSBR8zPUV z-}Uvrg3Lp#t!wCw%lY7BggMXPd>3NdZN=f`chiy>p`ce==~vmdFWjYUBkNHCN4`%MW58a&h6)7+E%mo8?ZqL#iMo%gt)`08 
zBFeS+E#AnRyZEk}it4wCc#VV7ZO_rERNt3RK*d$(cHQ)4TE;0Z%VZ-1?>prWu+ExR z!;5ok?f(!CCR#G|pHM<(_B1hEsg}8nL&eVk+3gXX*XRw2(azo39p+coQ>*uCFX~4F z6sA+MG)I!=QU?I#+lG(DF@ksp(HUPoH~tr*pfi90xiq-hewWKmTjDplUHXbTKpy`^ z2wQ+l(>A%m!!$h$GTR~nOB$*LKU;_3GBT-=3DdEz;?d71F2y5$E!*ahx96C*6G3NA zk4P6P=C1s4!9_M@$bwP~Ps5kO6nqBbO__D;-j%Wj1*}$q$MSwv+{@xf{$kZw|G~iQ zsT|{VmbB9Lee>*_v^E*z(?y?__RFEw0-;81q0_+yF0-HDa%$8x`36&9+rjJYsON&F zo*kj3t#+YZ)|iYn_inInfoWsfqd505_wVP^r5KJBs3_8B*c7Re744bOewl&{k%2d0 zTV&-qT(5&Fgu_L#A7{Pij9Vpw&+or0k52FTS5v+0yPwvqr^_AR`Gq1F@LfL>$#i|T zy+SGD&Cd{5Gf&2u<)Xl3_2rG8n=tQ4y0%BsdHr6KR#cWdUuF>LXdZAeuuk7TKcC%_ zA1VnsQr8us_}bB>$#dv3eXZfPnWFe6;2JQI&Gs1jf#{JW2jv7)04c1kIS zT?_+e45w5!T{w1mSN_2ibY?VOmr7SyxI_>!2fVl@V<>D zXZ~|?+yL?0uUo`-;!b{O|#O}H)(7WrjoG4eTU6&p_dURXheBZ4 zR2L!*h5f_D8690N`Z-?Gvkb`VirsQ#dzMvU_W3V~_H<|kgN+B3BL(A&2XY*m-sg!! z^D=zApVe8s{PHPEzU-ZX@%=yK;IoSE!Kb$!U}#${}88HzKR3)Kt?_ekhpqVr5HM%$tr^cZ76{ zStT$MV|ylRqAM1Mh-!H}(aq7@9PhnvFL35bCk0u!ZHHcN_)qgz)`&LMI{E~iqiG5JrF(hI;`ulg@7jVe^O)5?o? zw{y~wA5pQQ7BO~>b3RD3zkLF)3uv67(%(j(!R_f58nl|TYD+6M3G4CX6Jr%KyNi~I z2^dL{t-s+8n9kB}$fM5RZbm2<7J?sGk-8I0=L&4^=Uk{AZ{nym!27q#JBZ=`baOYy z6(SaYBVk`_$&-F5Kdq8jS9N`dH4t~$D$QzZDBms@(O)P(A-|M}T$@_DbS}+Z;$K(k z1Y^yg?Gi3MbcAhlhj6ChQ@A_@6_egU_i z;}d(a>(8%cdm5|gOdTA>vdRv=Xx4q)Pk4jo^biaeqtvy4E`%{Pors<&q(V}a)WZvP zzoQ6#R)Rd?LP+B;(!enF4QO0y02?lLfhL;v_~8!KLTS{-(;OxT1o7hy&I5@qmjJx4 z^m88Sn>msx{~bxvP1zpr!DR6leZMv>)gJAd3kk63&d4joq{FWCUr=XLMR&&6hY{*b zXXQd}-+OXI-YMPAV-JV;G_Ow{(4lO7*1GMEh{s7vGw@``{l1u&@)QeCNJ>#va{5A} z847-BCwHmkcE)hbXq4wtm2d|xx7xjy=?aZP<$*(=h)JPNw#-Szt2A8!-_+qwds6`C zfY3WpjJzirz-~PTF|+&f!x5eG6ph+-iQ4l!Kx_nQwMAw-C4FLIj1xlqRH(2;a8OsHp>^gg{B{ukbZ?02;zL1yCWuK9)*HUHMGEj);Fey zF2Abmo*7haqp{e6pO;rC=FlqidRNuf!njg6{K(}4?xO|9@9BFEqf&rwzlFkHD*k?MYNFe%9ft>(Mko+G z!1kbRn>=m$6Jba87~*)_L~CfN`Fgx2dl`kHH)h_TZN)UI0I|c9?#cn&B*6B)BiMQ@ za1$7+SJBpa*7AOD2#$(;HAKbC7d)rF?5pm8Gxr~cLps?^o=*_}+vm9toOiSY@G~WD z?@umDVIJ9!@I@Dk_riMXq{Rz{LKll9^~$Qklbpi0(K=e)rqODxwMylbg&(@3izX=x 
zVit8mt2*^kaiTwn$%rxIoyfR8zdl}NI{k4+;0Qza#+R}_Z6q~k8 zykjmWcA#h@y{jCQ(p`W~=53VO12G7_$%?x5O3CS2xT|d^ZGX+3CucA#!>8r%n(F4} zl@~8Cb7ri!Mbm=mZX2&?A58>oy#U`4Z|Hfw^NqlYlTXuM#stbAok*&l{SGIhO)(^} z_Ovj{EMC;w?E_#hLMkzv#exJ1R(3UmO|^DNTa#vp?Khrt(8|9hYg^$}UeOWX^BBDp zk<0P24B^pdCk<58lYU;xxHy!n(6-tOhAPN>X|zDB=A^;8EM%6^@*V>NJbEU zYq91CxrT*oE$b+>-Ll*`%$yU+u!PUCB|MF6u%>QFyCu96rQz=X-2eu}p#T2ES`>fH zI)R=Zslm2w>g~n<{Wik7#s79n{XvZ&=QF6>MtoT}azk>o$?MD$^>OQ9i>5j8;#VI& z5qlZWpD^!Mxi`Selo3c_W85;fEpS)H%In6lgsaLd<>B;iwdLc~;!6ZGO(e7p%yf3< zUvgp2i>FiD?~PXAs(QJXWP5)RtsL`b+1TmP+x#@7N z_}sh27ri8_r6p&G4Z1P3aI|#sSxwrHIcD<^)oK@nix2%F`^_ENl5>}tO`#bA1T{N3 z#}pWo<#@9uUfZ78$T(f|n2E3-DuW^ri$J%KGpWPt6^B=hb93#qiRM3$9vzJVm}9$t z$Itg*@gL7mUl;du$S>-1td&91U6JW>O?QWThr%~^@le;?uUXYfF{G{tAEK=>O?{|QwEe9V}`~! zQ$`opu&T^M;zpag*^AibYZqe%rYu`Vj)fWf=Oq{51<;i~kstM#*{%Dgkubro^5@c) zxEsu_IY9{BQDaww9K`LGB0A=GOPXJrV!>b1(Q`&I39h7pM@>1QQbv*t1Y8~ZUuDlOw1j~5HosO^%l^JYUwxm z=`e*8mMGRD82d1|iXcxtVn}{irR=$@h)kFoD-E=tzo?qMI62J=g}7i>Tzz`glRyBy zsI4{n#ewtk;QhXX!ZOwpJzDVKPLXY9*-UGf%!=?Mqhw^D>d;hL%*OM zd6Oi~7`fDB-c2)avM=~y6|>1rZlEA8_GHh~q52;~csDsV5$v^Z)`MBO`;T{$JJ}EG zD2XiMtQ_^L4o8nFWu;hZCxmQbb9_0Mv0K_fi~Y-^m=e}=#c(mg%W_giTJyJhoObNh znN}W{xF#cerzVk~+us0Ow*qD2uL7^*hso-~ULJ8TDEANw!C$=_?(Wkd8aPVfx);L!a}R61Ep$buhhzv4Z5(BB z6h`?I(WxR6-WR{;g`>|2F=)$9)i9#2Cj7(%Lkl#f>! 
z9&m=hWT_{$>Lmc3GoMS>fv!~T%DOIUNmEtB^O+mAMY6k;Lu>V(l}pl=)|}o?MJ9oc zw`znspJHQuxn5WNcy0lLSt)QbP!y1Ge^EgaBIIT$A#u2!62<6MQkVq zMLy%Wd;5ESIZmwxCTce5$0Ol3{$>8HK>2)4jBk1Q8`!(QpFSx6D^1g#M~9Iu%(|4I zbpe&mCR?|*ws*xIh9B(lyo!(Al!DLRZBzCb67zqJFb-Psp8*G~hWR7K`SRfXtVz~Z za4C;QQ;#X~Cz5H;;P4NLrklFlG|{gOI)Aqs>v+&LFdI|(;l^xnP_ zqkQ-{M?8J_yhafUSk3mztvr3*;CS3z1*%EBh(!obV^keeTwfSIYyy`&i4uBm*Jv_a zo~1Ws4BP&6kC^|D#p0mX#lVBq>o6Psy!Es<=n>mqsh7-Vg=PTqm7u2+!Yq&^M(G+hDgxO8#vqA#hpoZNzFY8Y*k; zHsg%v``U)7tQI$onnsD}X-6KEMzQDVs%~4@eY-AD-lsYr9&EOCevX6bLq16|9l($k}A^Lv;03=+H zPGtL;cx=%D*l2xFZDed0|BpMWBuS(%-Sm1|?A!9P{T>t>K%weAH+cYyIK9GO5; znZ@~bkW@@(*!hz3pbhZoYddW_9G9`~GP|f~+I!ta9m|MK)}BRubZt1?<*PKZV!0 z|Fl({AM{FWgyRc%27wm4g0UXaJ88F&j>&Yv9F=4!sbj>qE1|AKL{WmJkz40GsfeHh>C$$uOi&acpbp431eX zKRTM>hPwjbq2}B1YDc*md z>VJSsAu~KwtgMHcFz8Ff&M1t_tB$JGzU?yRxX}EVB{N5ZdJYTxqM~H^mae$+#l;c- zYgAwPyk>K8O85445v;RzRjz>Ds$zpK!G+T><6Q-|>FG6|C+wHf+FKa#N}8vt*nvC{ z=i=w~tr%POv^`vglNo(9&eauZz-zN&-I=lfL%xW_&A865G*1N1JA8v(_1`vx`MO?& zAw{~*YC39T`f+e?g3-RPYLC6y9X!R*6;J5Zur~C&qo`!M-^lc3{&%ObheBR%k=mHa zXV$DLw&-?ab+fPE&@-8>k5R!^K{|WB{lExq z)E@pkzdgu|bpDs8NrGRJfEsCc9j)DzT(yY?qE%d>Tz5rT7uLLwm6&Nt|4-O|9ZzBi zL9?-(kIWN(KVWl327bdw7W)?DIv~ANhtT!NI_kS@PPPk z=JE3Xj>YaQq3+@uXh5GpkB>x|Kv&pGR@^-QH!}k3V4Xm-2Qwm-H!;%fxm>Pn%U3Jj zjYsMxhd4O0jSy+b#D8V$QD{|D?XI`?r=ePqHC28vG%*IgXim$rw#dz zsx@L0!T*`aKYO8?kf8P1lf4^J*N^b#FkHV?v2QP0B#Iv31oyUq${!ms3B8XOx#WG) z|7-rQhn`PR{Q6L`o<+A7%@Hpf8>_SZ=!UoN4Co-#)}P8&ddb@VGe0++fW?pQ)U9v0 zA25k%<;AE~FQkk$at}(5)+!H$`q3uRi#@JLl3p3}7=6WK;4$J;?ws?0`Rmn^fnYw6LGgT~y=k5r3R!x|7`kU=&IQxIb1pbaM)K+8eMyN>{1|l1QZz{&&vXk4TEL^4zX1 z9g@uTqPdXBalNkaziATwWtM+Vg3l(TM8OyOuJ2>B|BBmxwx4;Te->NF-emTFX8m7# zQkONjuX=A|e6a$y-^v5EjD8>VFM_S8Vd^z|HU_Y*Sh&8}k-7mj1V>WlgamsgE-;D| z7%T@@7c3GDLI=`{vHl7@d_ctOdXF)E#Vg|qyf(qf9opMTZh>QZfs{YyH}w#*MT~}=ANP-ri_)L7|NecZG5%xuHL>Q4O4UuvCudX^JD5!KJSH|(Kmd+xQ`h?eEa zi!V2$GIS+&cW`W>Ql1!kUsiY>6x{fork|IoSZ8yrMjkF_Eks9g3*u8s0+@`!wB$Oo zAY8=%MjLy(J`LvJ5Bh9a=G$$B^IA$7H^VbustUBi|FI-GwyB@`aZ9ezpK>@r>?*;| 
zEqfUo@h6G=QTJ#DG_+qN5yh;xUI1lKamf-7?t#ru!^3bbG0@LiT9hPs>D-AtA{Tc) z;3>Q1SlV!{8plF`k0AK-B^Nwa;2_vcD{T~JjeMAct^bUL?|SkSFI3pqcTx@S#S7nuaD^1dauATKD?tM9OlGh0 zd8_?Jz&jE6`FZ#tND^{I)UbXBhDX9H2-^>9Tn86yhzo4O-P#d8rP#nh632m63UjA= zmy{FM+%fux>4ZpayOa!en2@Xw7yRwaA7Dt%o%@e(aY3&0_sh1g4Wq|2tM zvK#WH1KP=mq3G+0<`3pzo6wE@%RBR_MWK>FLHDLd;ZBAzld>La%B31LXr(VB0_ZTR z{lugg53c|}kYyCD-z3}ks?wrJ%+(!F++V28gzjR{ZiH%Fg77*mZlS{RdAxC%XE z=}iNt5DF{EMH#y*m^mi?tlkYktI&}b7eCmsk3S)AANr(Y^meSoH)ngC4l7_?A6>+1 zbKd&=QA~Mfiu1R=Z@v>KtdSc*)W*p~G4phvX1qlLBgUK6LpyxXJXcMp~i*7i!Au=I7rOdUwDj9U%cN^q3K7Wczy3!&#y`D zc^16AdO2YowcR!7ZM8nYKzBD*3t_XUoJ~|IZGoX$+1#@`)SK57s9bD;yU-6t2*2F+ zm*NX;#u{CU#aM(sjdL*Fp{b!9y=gY7v}dCQl=ix9r{JREP2ll~*CnNRAN?hlpG~)1ZA_=(1*2MC`2?W}n?BqH*$bhK3vn$G-;`QU zaGk9fnrD?+nRR+!7DQ%z!UX8A^h55(b`4V7&|+^jWnD}zr$Qu#9NeF?44bJps}**y z7lx66n!j0*Lj1k4odx^wDv2R-=;!IC5F}2oBE=O9Wa4q!F+3)tk7|t>4geJzp3# zNR^-bqQ=G=r}Y(Q@E{wH2=a7uj|@j6MBdiM?C-$GgelCiE)e$&6~OVdpaptWG)|Up znw!m$mEF!svjWW9O{rTEuQ6eMTx!jmzA>H=F&TLP=Ta~V!ROWRmZG_IpW^Wmg~Gk zt1-F8kgMbPq3r!CHhW!#Qt^m5vf2?)J_+_8(J#_G_wE;b_o>*9+5kdNFE!@6myyA= zh4QhQZ!bCtQ|kFL937L(1mIkhu_TvvUh#xqcZZDMZA@cCarM#dGWO5;n@=S!_w$P4 znY<#IpR4E0s7?Hx~mG!6N(?ycIWCD;$+&THT|OXvtvR-Yd-5l-ku*-_wgtx zSMG^l?bs*oO$*}Q`<=#hc)5$l@XsU;@@k*XD+o*9h!CY1!LLS$n_J$UlA{m4<41;4d{=&HRx65GD|YH6`(r;4nDjK(yE3)AfX8*W z2z+n3#_auZw(ls*>xkOlhW86!Alk_HBjRPgUR$AlI=z{NfKD0tB+M>&lV9xhngsEJ znMm?Ppz66;Qs_ExS@X^y3Vo~n@cQ^P5alZTd7)KEJpx>z=xE9hEFB&@%-gObCj#xV z(@7_`NhqVNf}u9t(0miNhl@Nca&%zON$+PzK3H~-+}qS=q;RL)u%+j=bLX=!m&^A( z#;dVnCRYcXP5qSLTW^7ZL1oR5HcT&1dFCF+PTzf-!GJL<(Ki`Oh7?Kp7A-%LmW;aw z{xrzdp5*A00^2s+dz!(CRXSB2Z7}CiKQ~{w9wFfeQH8kYItdk`{9wJBlsWzKSyo$w z^v{7z(0RFX;e=2hFI?H}osd>fgQO>|A@c{Q#~UBiZyq*K=P}PX$e%h;?YJTQzNkp$ zl2vjy%k_-ktzXyOA4jIiG!t1H3XW5L>k!TvKFULG$ zp*mJf@G3Sg9MYWUnyWsF&mT^Vq^IbZ&4V_Z)jRs(YA^+6C|rMha2}$C^IDhpPZy6r zWi(VAniqn&*DN5*5mWaWqsx~aE%4&))AOXsmuUHsQGb5*1t< zM%UA*iJ3mn|8A`1ZK_y~dAJ!o&~mw!@O!73;i#fl6bE*O1_zG8#R{b1Q)xSgqM$Tu zw^a|Tdp|*bEb`_xfnAlKX=+r^)E77<`w@>s==69Rc*JAjjyghE(&4=ljpP+ByfQ 
zrG?D)_*a^d*%2`-k6G(4158-j} zV27fMYuUO~hutG?2j3T$wEmulsr=T71Y)J=`M_l!{Pp9c@8C?DddtC75)Y)|KJ$o- zePEoIixOSD?m?@M&E5IU#Rv|t!B+E(xC0nU4{x*n2+mF;-|T~bJzr6|V%7arinFD)>ZyDvg_?8gHY+5Vr>U+BG0=hQb!OP6Sx9Ks>)Q(! zgiRk_qturBZMKhjjWYso9iq}mTu@!7w6~*as0uP8boQncRVp6=rAAKCwvFH%v4|37 zhC@=(VITzqU<-G$x0zkjJPF|Y_ZDmxpPz6k1#m6`2?FyP$70EtUoZ^5 z?28KtP0)e3)OT30+ysT=az~PVchi?ZxNWIPe;QrzIHL4bH?WU@*cor1_&yFNx9B~r z+4f|>cqt6T^Fgp_z2Ar|MY|BTanNbQ?seUmNj7 zgGCF9me|Kg5@_Ac5&o&*ZT!QJGJm6I!@S9Xn+Vc{b*8>&a65L>EIA+j&Kg@f7y})t zVMI+2a8ZV)hkw1BNcHBUDRTZgrg#bxF!Z?;Q?8prmVPnX<2bjwT0{#UZp8=|eA z*ls$6(KcK_^M@D~zZ_iO2&tbSBU_PsPd=T^k(Il`wq2pn&gVd?YD^9s&!3T_*a}hcE8ArzhcdlId3p@ zUIJlDY}D+@!j9a%*ETzZtUDcWjYrPisfP+Xi29iW*(qQ*`?*Kc^iNjp9c%}DPnaaOe;X6gW57D#V+bxu@Dt4{DZ%>hXrVM+G|u0zj2^ZuU_ zS7rRHHiW;~jX@pslG0}CH2mtX=>+>@Y~~Z4s7N+LZ{Ph}j@c=o9oCje1e_^agZTZg zj!Ly?;+418*-HFwS#MEy(c}WLm{JaFRj4O3wAN0-mH}9uPy+`3**@>r7XdfJa)yKW zrRnyk?Z}*_v{m%{zZ5d|5$XmCB1ldzNu(uriHmQF&*igQZa`!` z`(d|niV(44s#i%NHH2k3`0DY1+hoLGc$=&y#wPT+?X)`J74ur$;dCZah1o_wvltMsmNd=2_oGkYdU^T=)f~XL@N_$Ug+I;bf zuS*UDTLo#tt4sWZL*lrF*$=iA%Ypel_mju%_1KsXkn&_qM1g;ox`T9?1+SYV4Yw>R_KhF9i>m;wau_q#GtD8eHoa7hI0}A zv%e_ffQT8!4?_7|xHr;O;y=zhn^{4R{S5ZQj5_T2{V%9e=8bo7&dSFP28PW=t>5IT`2Yf7#10T)o&QcE5eBE0;7;&iEF#>`VD9F4|I@!ZQJRLGNB7L`MeC?6BUj6|)LV`NE%Zs$#_kkBv z9~FOmd#iX`8*t2Ih*pL3xU;S`c^DicEWS9zI%DWwSp8_KKQZ_YB_Ni&ErXWL{5T~7 ziw;tn@EQK-6H2^h#sw8g@|$B75QCdW`~D&SKq{0*a})&y=Q|zsw(SW72gg@=K`Hsg z&2k2#lhc6NGDNX!7%jo^76wq>pXx>L^hVl(&xyS+2{)e^L(jor%Df|YbHDcS@e2%P z7boQ*MPC`&{NY!4l2!tuU>WX}6)tReD?i&1+5V=|W@VLmYyLyG>U=nWvEY1QMQ3N# z`3=0Z6OV`|LOTB7FNK%4&g-oHBK0d#zHR=$=C-rerbT5Y;fst8&?$`?tsJ8#T=6|(7X zoWdR4by+K<`}tGg4G=i@tinTz_5Jx2&3Ee_PohHxO$C)McW*2hAl)vf>k(v=X61hq zufe3}K`(r+p#JLM2G#mhV}C>I zU7vJ6dZ*(N_@(o1tpi7^Q&~$Y=<&bYMZ?Lj~svdt$v8AIFa;{kT!=^ycfxLpdC1OPocM}U(zBCTMEE_dQoXOzRPde%E?2o-4J-Y>U1&bT_F<-Xnj_tS|OiVD~?+4vUS>~w@wfO7fC85V~53w zL@RITtBR}H#H0jyH9eUu3;%IV&{NbaBm@#{!!U;1<~I2nk29c2uq4ctHEAG60LVVO z7jH7&s=2ZxHE83~=G9$nV?qd~^~|dJU_a#2(*eH>u&4Ny|3~rw4HVa=92}y!HT#zi 
z4k|Ue3QrHKiXylT+o$>l(BWc?cppYgkVB!;k9xNqMR145QZZwTXC~H~x!%`)CRS8Y z$SupF4-LW}XW`pcG2|{VATVPUUdvf(@7)x2- z>3Ipm<@n~o3MWRGPJ8R6L>r`~cj3dJ0~7h^?rbRjN4TqJR7c+Z`2k7`bmkmmwy%mod>K^*LfF}%WsI$Y$5G~W?Wfslbdj8}r+bG1-J=GC0!Nto8jBA_T zxQrp4I%v-Yd3@yO+q>1OmXmoS86RV~y9HFnafvnXT|e(6?-cfz1Ht;JnvM!ei1f<9*e zf;Y_IvJRQGh6Sf1sn~q(!FvFDXP?<3-hkJCynLJ>t@-)z6F=Z!*Sd!AAIPTP6Km2sC)cUJ%q7yY_S8Tsj zFihoRdR-v@)d(*kphP~CP!&j69Q5jFYQ}Z(Zt&j2L_)KJlJ0z`v3qPmGcyp2${mcK z{R~~=Flt1_8A-X(C7cc#!Y_li)(MHW9GT?F^rJC1nw7^?#*+S0GN*(vl+*J5i!%5Q zS3Fn98%9{pXyUtj=z3glNx*Jlj`({Jg~xkZZOvk~lf{@oxbt~i3r2KlmLWGaG8PSY zVl7XlU}$E5KWo!`y@?B!{nuWyT)#|n+Av~8wEhEciBFZc*2}OSu9TrK81T3)@h@S2 zxKIY2OIbrGZf#2Q@xs?-2q8!8UE>m%t;I&>6KT;NNo-N251MS05z+K!uNCYZ!wxr3 zxxSTMH2o0f(!*u(^>l_XQKMt;xHvShm--^JJ+Goe`)Q%>ag;yaJrN+kjxBLPFs!f? z#H7y_R{~eSEK>|gB|TNMc6B8xpbTqzG^8e=oFC_Xcj=xw_!E+f14aH$4I0PN1Zvli zFOstH3sBeDnKSt7tDdDv;i9nF+@?R7p`|AUs>-mlzx@gT$5Yg&e7pGP>$1;7!j;^YB1r#nY!QNucgOH9RdE4k?KS?yVU)4#FR6SZ1IsBM#i&0mJuXo z(6&K?w~Y1MLm%_bN*bg@RPz-b<{CWFgzcK26nTpr%%X0<_L73b_i~_B_1fMnkitgx zsHX*ZFwSBJh*idhir@E+Z=L|s_f=qhoO}X=Qkj7^{sA1fX3ASk+c-)IS-1w!+k=upoQcqY7_E zgg)g}Bh!FewVs9N>^*!Ka*76YSg@qePv49DnyDAU)==){`mpd%uGCunbdzAan9hf#464B|cc7p^Pt&nS`#i<)bj-qZiN)kqsDV(j%@s zc>@S5&k?c=NgmtKJ@m-vf5gn-=$1_wSP;lrplE9JFCpua_$Y-QZb+c>(u+GNtqkjN zGRZH;v=z-ICiU>;Sxl8RFw}B5|5#JZL*E${9#1u=lV$B2B2F_DeXZpjvME*)49nLX z!|8wD!@jDU?WEX&vYmNOsMG6a%S!gf+iJ<5W4PytYKOQ1xYo&;sSt(tJ*kM(T%2`+ zTu?=upnj4yRM*&qJsMj4KU}?YaHLWH?K`o}iEZ2F1RWa_+qP{?Y-eI86Wg}!bZnly zzkA))y`IuF0q0wYsUws$g2PxbOsGWl7wJm?q=lPr$-H z?+q~Tm+4RKLZ*gr_j`;LV;qldYdl_C0B;2e$e6c36C{ zT;EgJDB6nJw5@*V6=aOW*rUoi39q}O?v;RflRsD;zOv>N^lh`Zp&POyEP?3kTpX+W zfm|SGk;$PluiVgJR|YXSUGJ5-Wyz&DL+D|G)K4xXv(}Z%i%TSxPb|KmAka{>*fiYK zDsh&Oi-nI&2Mf{I#3k|UV=rq;x(h#{S4jgoi8sGZNN2lwEvCh)5W1mcV>ZeeX?jbYhifnqgcxnm}S z83*y6rrX0RELVIF0$+=;nas!4NG~$%1V4+A?fc`uHiJP1hyrmxeA_o2sK1_ZC2TEY zMs!_WNgafS=@<~b%SA=k51ML+PB*v@Ul0$sihp0`6+UD6 zjn@9TjE(omRREW|_qsilw|)M!-yL$*-o&LeNsgs&&l#`2Wzs*lArsYxHd;w+f-k@t 
z+2`;@J!VTz5}s#bH1LMKRYe_%+Fj%^gwAid`p|h*mayEo;+d9CdqkQN^Sn)h9`X!E z=&(=*RP^CLlHtQ;5ij>j`1G%aN)G6sZ=A$*-F>`1WAR0Jti)B9Fw;C@_fC2q#D;fw zqni>Ja9hW@iPEOn1#N2U^Pqc1ej=X`#RM(}SFoDV5|mbzuQc4FPYT&mcNwa(hvgfF zxLo5k5;P#4Mm0Ul2nM??@ab$P1ySe&B|5uM!kq@aj(kQt5JmdI+d$$EKc&noSK8G zC!NnO&7)>ifm0KMCbT@6gzup~J5(geNdH9sO#+P8FrYjxi|O;zhDu0%;%wkVoJ(hM zkuY zHdiR}I;6}GrNk+lvF*zfWt-;|ufqiud02(*6fEgZmUj^HiH>f0X`Mav;) z!Y-jd(G)twH+@P+)eoE-a@1Z`@#T9n0Vlu;?hb)h@J%*e9scUV3a;>e2f?+-q4nYR z1WAs)Ekx0+oCcFs_h`Mqup>m8b>@PLkf2N@x3g>!@1s4R6!d@cJ5w0*4t_bcr;F>n zSh8U`L*%vrXLZg{@BEy!UR_>wwps%nqZ|`<6XuLA$3!J6W&?PQ`XdiS8I{?%4QMxv zXD;jMM^1PYel z0)Do!RP9V0PlFXhouh1ut3mi!_Sn&j!lxmpuNg2J65g^J_Y`R)n(ArmIN%&awd0mWj9X z4m(r9B4vGgLem2Ue>&8mW#?E6hxq;w#@)B;RIW)7P#1amY$df);sF`MJ{I2K0h9N? z5HlgbDD-BlIgGiYe2^vq7us}b;wG75V^%%6#u^p!`Etuf&^g8B9bFE39cF$~H`9F2 zEahOm3=8o=Cbyn2cPxblVAc|^5L5%Qi+-HO%7@ej2+yVkuV)E?ZH|vj!72M{G6EUQqe_3luKiLO7HQ4^kOy0n8!|Lc<@6!ow-w(1jVo(9 z_T5OzZD=4;nH-#r7bz7+a%lm#A@_4kvZ_;NG8vHHc&H)L=qlr{TRt}+o06?iB1<^f_?<_f za!4e&w$N2zr!lz6?bue}ao-XllMR|YL{~hdGn~U$eB_;XeW!8L7H*AWw&K|y0KNXX zF=p;^SFr)_Kis5#o91$LM9=i@0f0XSo%Fv1Q4I!ON_1Z@^6{~lW-F5j6qFCU`}7vf z$E$iskFvSLnm+!^J=(O`7^;N^MCR9cDY>Y!RM)ivAsKoDHZ4Yq6;0bqRNoyJvH!3b z{14q^e?>&}7+6E;3P9xnWk_UrWa$3!xJ4aB`zwk&K|ME2IO1o%VKqI~14cvq!pRv* zdA9fuw=ZN0fKHM5FGd9<-sD_d{Cu2$96fLef{KtGyr5(J1`y~Ubt8-O0cQi7A(UkwCvi`$S?;J<>FpfnZ9P1?fq0tU1F<-7Woxr6L zkBcVzKR+f!@*69lJ{_UmuDJAnaQgqv>i&maSFivts@1hPF&6vZ+}8h~?B8GD^!*MH zkMQL&HQ0Ye&}EElK+MEGcY88i$9dprCSD&E-R)oHo)Dpf!ssA40l3O#V;!%Y3~&l? 
z;ei`EvM9O1l{ca&{g)I$(n7tZ8$~@yd6gkCKB1C;L}inEz%rYj>iwe_}46S+l|0k|v`4@#W2j+|bi13GyaBXz~dgDjj!<8|B{ z63NXqieB7KJvyw5`CsSG4nGw6&|ngi1$P~6b{y$`*D>K(G_j~}WXZTcga&$jW3mfp z&R)dup@XGlZy^alWbtg?9Cai0Q$T~asoFEKhQ2&JsJctvuV_O$w<+Y`SutrUl#RGTGy8H?>Q^4H5ub{NlDNvl0DZy zaPExPlmrRg4FXnN^+6D8=HcOca4gsw~Bja?I&%Yf3^^L_@Dw# z_wd$@SZA${z`_Z*aR=e!Dy`Yl+95Z+UV4%-!U$97`(wb)P^MiV0ix>=XtO0S?!k80 z)p(e1eWrz5<1#lV$bchmTnam|Umj9HU(Ic{e>9WX1-#j9Ej`?$(*ii0hN(TE18WBH z#Kv~b)41Gkq+)yG&OO*z&EPWRu%jONI(q)JcliCBVvGthsLq?!&*ydWNJSAD z^%>7^wO?=fAJuKwT<;)L)iG+%(PO0V7M?9w$7b|uvFc#}B0lek;o$B9^)kVARSqR1 zr3Amk5K(ZY0-i#FW7A*U=J)4^`a@>;DT_qzh34A_!lyS(W}K2Avmc6{?pSwLu@nSk z*m?i$0>Bj%$GT-#pThXMVD5>N#&&S{I+n`pRI|y27Tf;=H|r9H1k?LdS-Cp|;{oSYzB5fa|!IR9G?=>TZrRDzIyx=3Ne8|3NfX(5Cn(h&5Tesi&qx zsRNV)=`^K(p++~3vuDfp(a-I~TKyLT$=gKAc3V!kJ~I^>&9P}MDK^-v8xfhYeVQ$W>j7+?Vs@Uw3^rE<79Bat-FdT5rRa&WtWaMqxXf5unU3LT#RV zB>5)V_;$h1HXvXegX5o;=54wg6bd3s>q=N1HhJ7r_8-4K(BZ}V{rUi^^>2WI`6k$; zX>5?0FIhC60*2-1ScnDNCDq(cJA(AJUIVCJP(WtGG4W)i#CNCwB72~~V%ydFyWZ9a z?rc&g;K&J*rnM;CvGsI{0jJls9m2(oPPjcAcZMu^_)plkd7r{X4?!@RCDhBRD@$zB zGw?2}h?h)`)tVl}!Q_z;$Prrbv(9`jCh!piwrTVXgmh}1a-2IjERji{b;axmUVJn_ z_oAgEcNVt=vxcGsG>QmzFQJv$O8^$td&XP}tC_#~#y816C5U_(c!xTtcOYb2$Md5d zg#+WXMqZrai?P(|&rv~*8sN=&D)lT8$s@{FoWSsMMKoQ}Vxx|7y?clCf-q3hzSiw? 
zaj@s$X(phf>QL6OS9q+7u1PnIsy3?r5P!q~Khdr86^DV^L^lSn(}#zaHsrR%D2?JA z+{diPDwxm~xB?M8CA=@dJ+e=PW??~g{O4*qB1DVRCAg^fOf3~9I<+94DX{f z@v6!#ifeX;Cve};Mep7@_;{d(TnZoD{J(sKSY}DBulLWy6#=^!dC~D~ZHKQpgNv(g zVCI11;I=aZ@@BIEZcv5bxjG9ZOCu^7-;>Z_dWAnX&=utJ%aW7F`+=A*W`Kfs(NOOQ z9)SNn)k;#(whC8Da#XPO+EE67um5t^pHTwvgp zUktLQw4WetqcpBMKbwOS6Ak=3RI*x<4s9-nK=?}Rz7PL?ZOO%Mv+CZ6C5UK65|54v z0swM74grJG5(xv#dy85x=Ze3raf8skd{(&GUoe{xpC~}&pSL&+SMnlWh(EOM@rS<> z!V7h6Pv`!eEPXzzV92e9%Z9HY!}O0;Efm1folsO@sdqy#hf$R&y1CdR z5!hCx)J-<`hyUdM0TD$-c7YQTMhTD{$Qd+$p0&L{saVq99f!Kvynd02+l&6iXvg+n zQo-K0X)Tn_!;4fyrvGV&656yg@YLJuH=m<2bo@--Jhx0gKF>#4l}u(Q(gRo2ZCL>X zyzs`Q=tx`}xc$*!@qXCDSS;Rxej8Y4nFCg}+%O>ij2Gr~yx+lLg3`R-b=zhUCz--d zbif{PufaZMW7C|NHzNz{R8#FVW3x`x)QNZ!x8R?e7Pdy9(=xrfUL_? zLgM%$z?Q|E4R*to1+8CVyw7P&SGN)}x4b2Kl}>!t^L|(t{vV8>HJ&!!_xs@u31wBJ zlHD5h+zMHm&F$;f!D>YVD%<}k^^SO$wb(_?T0?;)ZCd~Jb*yTBcIMNF+6&aYLKl)5 zYzX*>4ywll$cYBA2aG&C9J884}W38Jw7gnei}IL^QwzauNuyyK(WR z{2%B@?ezHtV07{(PUh1>w(pNUjSwW`F7+faT`Ei$<8+9H9~JBoaIk6Sv=!;he^ODi zbL~mAp}Obc9tN6+1D_t8J85`tqLR|yXG!mX05{^kgLh| zefOE#$2c9i_-(D*vk>@~8{IOM^w6JNEPGZ)L8>b47KPNQ#pN%o2$M=wp+3)y`{o8q z68t!?rMTw})$dAiOu^^g@5h(lj+<$ZUwlTbzkKiZVHYfK;!I1NL5)pOz?^x9vsDOq z{-+5Y2lJK4g#bbhHtn4GNR>Xl@x=@G9 zD{hd6?|YsCdR_AcN(>L8NK$WR!qb^~g*VvjMV+ueOT2wdh*VWLp$1OMK8o&4v+?fj zrRRiVD2$kB2Bt<9r`8;LzCTO8JFw=t7k`33C{-=n&Mh z8ZYe~hbSwYXcifbd8Ly*Gk$%g*SMr}W5XV)!1=Od2Q{Mn0ZDV6bge@tUVLyaAH* z2HDIT-3(f!0s_pzf_YmTW30BmG+S8XC7^(yS6mZg>!k4aUp52@S`1YZ$hnq7>`|C6 zl|&vULS{8SEx8*4f8cnv)z){lMW0@zL!vMVU1Xy#XR`|5sEs4yYc9eMAZ0P_49x;&bvaq( z7on_DQ#A6qV^?*^wT{NMFk51k(jAi?9{QSa4D@I|kXA9tR(Hu83!}#9M6s*q?nfz)`b3GBPo2s7XjqJ1Of39O@XLwR_hDFRay66#p;AaK756#9(sI+vEVqKv_OfUdEj16L zsq&;)Thx1_!xH1;xP?>lw)=zYpeqxu_8!t%e|csBTsL4{221IlD=INyf^uOl@4@yP z*IsMY%kOL>gur%NE2T1vU@0t8Qc%y-MTI$AiDKe3T0L5OgR|p7o6L95Us4!Qs&$+b z5QkT7t#8y^1blRUuZ|W`w%6o7KXP6?u~{q(P4m>WXV++oO!q0wMtH>NguvF8qa37{ z9$!Y%J8^!3EhVoa(+UF{b$74+RhIv9?yXRHkNA0oVYV*L2Ef9CLXf$aM4IZjKbnXt 
zkEX;?6cbpL8l8@*XUX#5oXlPMytlTUVS{B8w_F)OLU4Cep5W4Ko2w)nESee{54~Dm zbZ^*ttWRRh-yPaKnb54jWs9Oc#K}!oyi0KXP4~|Ac8UYY%*x_}-re?nYm7CHD-D0D zepP&!9i51rGW?jEE~oNDPWrlsU7M|F<_f`Dz%(So|Lzuu6tZP$d0meOQz$gMemGum zL%y^F2ED%9|L)g%m(o^n+mOrfGw!Ox{66W=u4F3Ly$<)$)So1)_aV*Mvi^yJJf>cA zzT2H|mGDzmgM7j(nWQrjGZ?3NEWw0}gRR<&T2BF^P?D)fIyLS(`R9K`k`kRU?+wNk zykO#$kYL*vQ>l`**#%GO#TTIb#yX~iN~n%fLBqz^CxHWs)+>Cy0={ z#)v3$UGT%AMx|SE`gwlwMZ&?9S9%H17TND$(XySGwOy-J^;_w9Q(o-aWgSTB6EAw; zT+VoWd!mUY2^NtY>KUm;<_Y3&DyItHxy=4m1ZIBQ|Hm6FPKkp%MzlNXxkfHsYTwecH>U)%YGdw-mFkUggt-;?z~XG|%JfB7Ze}Ftn2}%}{K56cg^O=@7nlko6i=Xp zTF)9Rub6-}2SZGN$%gXCF>n74rKVDQ4|RN?j^l*F@11p2UTiD&h3Yk4St~Rb83_VL zjw_J|SFQI_5B~xF9pr|(>bwc(3^ySL5Yb1Yycb(TsEfTeBstqs4N~3 z7>Mx(^)UABTTE}jh?_rp-b+cfH{hNm4~EML&0k@));zH34Y$O`3?DS(7{OV*9&^fz7_;x$KQEa6r8bmKMz2LU zv6j15Pow3GD!8mo0pE*dbs64 zGLh+>{MN6JHfXZ!;!`M;eD7#+S?%RCV1ZW;IW3P@VSoCcR>MWpOlVmI$?Q^Lgmfn! zn{~T}pzWUHXO*nwwZdwI#^R}|q!n?4S?dGf>?>lB`%E&8dw%mBEYK8a5G&B=a9GXn zDOi_O|0fltd?SP+JIZKWCl|*3_paTHUWMB*aDKd`n{)MxRCycw{@^x?rd!lLPMHug_mA_JIH*We1a>tr{D*lCqaa3+);n zy~CHpy<9hNN|7xk*)f8qoYQ)3v&r428#L$h!&1soMK#C*eV9XQ|D(hCOy1dv19`U3 zUxM};OA3Nyp(h{lA8tm_ScwcJBmvEf75;=PoX%a=qm_HvJ~_&Z{uFX)#Jz2oDuoe`amX%XXP81s)$=jXRX zPFrF-o$6+&xIa3>y)G1#E&?SFL3Cd44b$-8Rr^}l^r~U*Df@SrBo~|C65E#i%N&S% z2Y>5D`)MuVO}j_`;D;)hjzBWBBl%Urzs3uC?XI-?=C02&~youP4|W{jVoB z@B8~T*w0wLZv`LvC)TIZiYrLGw+$A^V6~iiI~PB7$@gCyuUaXEO;D8mUiRWdlg*Eo zoaojRlRlrWEkqGq)hc|sA2=4u$s8at4*$uNnl~o&;bEPqPL+E=%dARl42eOfub;7BlhZM^5Kcj0E{Zu5H)$AVXwmUCVdBWNS^WaX8s&FBLbDOJU^mei zHO8CO96h*@pk+TQj2|2M2g&fM!nWobq&k`8q9>%SKkT7W?Blp=Tsmv+%>=8f#Y;J_ z`Lli1p#MvE8st4i_{7`VFra!eYs0 zBZCb^B8_6@*p6LW&59>$Gjl~iP6F__7v$%nzFPRG@$ftvwTV>w7oPFZr$@}t_;{#1 zn+rV7@2;w@_=t`Erwa6h7BgkGa zWAMXF~&bK5K(*UnMgT7+l z+h*In-sjAu-lq_eCOEgVUT>cV&ji!?fko_2C-4=dAAQp2g)Q0a`%GXs@7P!@!l zJ*^t8$nVZ$my6-l>680+E;yz^4s6uY0>w|?WmN>*_cx8kWH9(&XbdXJreb%&D zPY4Saj2z#iXhyIx|Y==~N5;3bhNs^X1H0eRg;L%jf4j)6% zYfQ@L)&;;?$*y_mr)!xhXa{CJ=CH5OSvos`@d|9YaY)M=r6}${YJT8P7RTd;x3fz6 
zE*#@qm6@H-b2atQ;D&%HK-NF>ae!5Bk}27rN;C@zCFvbww*SljdinvnXd*!ZrTrvi zB7;i)|8-IQ$2_}~126hH+No)mX!*Y{dN)MY&>66i%xZ-|Fy;23)t3gpWB^ra@Nv6I zZTtUq6}(4(*Bd-ew>p~4+M-IJA+U9e>O$Yc`ewF*NyXSh&2|E+Y%laG7_IsqLk@=*BVgjXV=F-wZ$*T z!tERUPE=p2#z|aQKnIH-ddL0NorQ5eZe0OzNMmn~&YM4y-Xw}R`_>6cm z+7HiSpEB%^l_X$(zi6}f9bV=w{4>en&v$S1rDqHjh@4yEpRLWLwEY=0;}Lymk1w+( z7{qI6u!k~FzGplN!tmJDq`a|_)+r@-6>ZDc*G8dha4SsE$GS}uC4?^N7s=tc4@E0` zS+%~fndy)eR=~H>RqwU9oqoFo!-LV~NfO>s@k5+1qCQsoT4A;IN)};U=aS;P3Zg6X zWW2))3lu#A_s+ime~dVf)$ajD!Q%#Z28f7a_{RNNBg(zWZw(ztVxq8OpJ<5_rV(fh>g?Sc}qj<~Z4 zOi^8(Hn2FFK0|>$_e$q2m>d;OBND!O(s<4|Nv1m$zWHg@6^`fq{2JFET}Uc_Ep5rX z{R0Ql3`l%qpBk!E<~R~=34Rjhn@YLxi;DQ>*kAOf0`o2ciCNeXcOml)vP`r^JHpE`)>`01=mB zkbN=kZ+r-B*4EwD5qzl+n36R}{C!MoDIbE~k`X1~Cq2&yIBRjoV^3$~M6CGcy>VtT zSi^2@zs6VJ%8XQ;>My!s%lWFN=2EXLu|V;`+64cC(Emf|d+E|M36o8$S9&pk7HijC zU?IWScFhQDR(rA}=2J|So9%cs%Hqj_-G?FgF>8hcswNk#kJg#C`t0ywc@Rvf#2*M% z@WSil0bk1t`VR!D`;MBy%^T*pE?IKut&oWp?7`b7e&rO#gwc9>DQM zPvGTP-LXWLHa|>ygP_Dwe6jgxfaoVo-pv&M$(ADY`M!PA6(3p14Vz%LteLtIjeN`- zy=>^GdA9|F)?u#1!N}qUVClzI$W?xnE8F{GStpmE6*Gh%g+%XmKPRpWH_<7^SxvF+ zz~DzNsm-Gnun;$Vh!(5hJf0{75iJfhmY^m5^#2NgZ-iR82iIldoH?EM$G3I+zufI4 zjPki|ojsq7zoim?dzxpi@FJa+t&uK6{KLxsF{zY{aRZXerpl1e*qsEf&kyYu1@^A{ z>rGl6{9WYL&92!Rb(v6X#5&L2ndO5lWNz~8P`}MY{ocX-rl%Pr)9^&Mio-H zV0uN@OK8KibpD&j_c)u@2-q3@n2m3MD%D%9yjNwG3IUj5i>xQ4_%OY}4}i6&6!1fp zg96Ne^Q!A8eEWthFaAwaui2#8VM{xGv>l`&hq0Y3}yu5O~N@x8vB|M}2i<#=*N(ZiA^wu4Bt zv($od><7-X1Qo~wHZAw#^BJV}%&zC$DIg%9k2G1U=R<#mvc-rqZ`=rdAwBex=Rx6P zZt7H^#qZ?iX6eaX z&Y>kgeCPJ5K81H=ke*PCYu&Sk<m1wO7r2N)>u5T*>^4p47*p$Xmc=?vU z(*hqY+^b&VGnSBMSDx4$B@l!*Y*rXjA{S5-?b&HzZazeAL`6Wnm*|A=e)x{Cl?Kle za=BkC30$U{(IH*?MRPYKeV0JwdpX?H;(fSyijME+XG-wuqu z(WAQ=lDi+~vMg!(C<$tk!IadPV6Y!QAG2B*^4HU&ffn2hpPk zghh%^;d6aKhE~YX$;I|Dk&tI~F(*-MUl5H5v}a!nA*eR%up@BPNTPdKvjaIKCj74? 
zWp>iLky;F$6tcuWDY9sJ78MZva9uBkg8&h#BH<&Y7nG=|7r2?NaFs^anH?sJVEK&-1070nd-zi*s3xe^rIyf9~9igf7yNFdy z_gENVqq%q);`=Z~^v3n+9x!Wdq*}%;SU za)EhGn493yZS3*Teq46wgJ-^kCskbg{+S!_HNBPEBkp2y>IbScb_6N`!Al3Z5N^ok zP((fpQ#H?!KEqW-9Mvq7=)h*$D1N%LV_pH|xLr11mu90dt!t9hqGaG-DKRSO?IjnU zAsQ*5Z><}({Nw+1)m2;;xQ494cJ=A2dezyU<8|qJ*>;@c6w$j+0eX%F=JeS>eA$0X zu@kEc^KQyhYdGEy`T!O-4_s&T5OE00q-U^uU%XQvj#2N-#626Jmf$UgLNu}h*%=Zq zc)}u`QDyom7Y-UtcOxp%Az{Gv;Q(1lbhL~QaBg_8_YieL z4Frm)(G91gkR1t({=h^^m|OWE?G5aViWe7=`jT*69gIHBj=N!}r!4D<6!;w&OiK;7v4R=z=+No z;R>#czR%87Kw(D$EAZU&LhS;D>RSy?%9<3|a4)*hh0QeC7_?zW`_2qKL%|Fv}CRKok zeE|Y-!pygk&>%mraEWqC@gmWpx}Vt{Hcv@0RqeW*Z^&t*;fl=6U<+OMl8!HIb;$Uh zohWB7r%J^-lFZ$XS#IleXORD`T=3eNfl)b9;YR8ToZ*q1S=@0DL^XhkC&!k3NQ4e8;kAcN@`vvZ$ z&|wS){GO5_w7^2i8tHKqWQ;IeS#`yqr`h;})*ei{ z8_%gIjG1F)eN#J;SBrG5iknm4M19)KTM%ILZJ0@44#fVkEm3vw;Scwh2KBkw6Nug* z=hRDG{bCap?z#s)M0ZPz0BjMv47!zgLyYV*jAss1c_tIQO}rzkLhPq13;dy6chD7W zrP~)3a?ZjTSJQVFqG*Ir={hoHbh>Er#B^d;TBBPmTU=e57PIByl3rSE7Z=%w++ z9~A`|=E$n@2S8O+iO(}#ml^bM)Zt6o`rGfm1LayK%AHLyZ_aVJ`YMh={p6DpH9z1k zjM$~%+})WJ-p(@ZnTeaLM0{h2_b5N48Py6iAj2V6TA{mHe=bISj9mzQOThe!Jp#Mn?bGGY8t~iG zecj6)@qK3Dd-ybWW{PF(Bx5-osxuui;X1W6@}6&2{d;Wx@?HlQf%d|_dRN9{xG zzGR6G)b%Ia;-ZfcCCo`=yC)yMjo+z*G^5>_uX~Nu;B+wGmwbLT9d!D{#{>Kw zTwt1!V4h(jzy*7r`~ro#Gu~btO@XmtkJ*vEd=dLx>xWBVU$hKAJbtSPi0LkdzIZ6v zEf!CP@`Fla3dU2oY#4?Fslfa}&Y3aPbd;1W{H^r8k2eZGW9cRCd)Mhro$jc;@V-o^ zIDPV(o!ERKxpRXCZF$4mTJ7z+#y&LdIfwX#twTlFzb%`$c4EV}43OKuJ=#V4luE<4 zTzvwjt}?rS8QAI*2CmoZ_uF_AZG3*Hw)(shWXVF2Oiw2Djx%p)&+kTDY$crhAPPtETbir z!|~*%3hXNm+9^IMDdL&L+bPMCQ{F(^ws|H-mGAE{5{pxSI)7dlJxk!5xXdfxjgD3i zEUGvV-)eeux<@q9vtHjGn-nuZE)~bOVx?jLCm0Hx3oEexO|Qy>N_xuEg9`k#a};d*lcN-fPN{3FtqAyZRGu%7(+CmbYfyA$!(Vos zPv|n(gvRc|7jtcdR-*Is_un9V^F}l+mC0>Em&$9|L@Ag^kInf<4scVlg5O=@GJ9;wG$ro zv*Rsp#4v^HALEt~?97A;`ab;0ewzQRk+EoDx|)y$m~{QGa{8dzY$EFh9Tq6-Y>;c4 z`3=zdL~8qJ<%rSw=_!R8vdb&qtx-&W98gaHpjupu0CO6cpG0pF-@8Z%NTA$@^FjwJU|5;mbcC*hZLT%NZpM%IFr^{8 zKevm}d)znF3*NPHd+60AT}bzvK)joe`{_%3_%0pVu6BQ(Y5AvWcuA)cyyKhRnqR@f 
zqKOU%$c!nCgy^=pp&hC}l@#6(pqQzyX|vo7{^0$&o$FtY4bI;|kKvveutDwr>vR;- ztan#1J1&dVjAQv)L}$r%C^((@T7Jc+2H1-F{ehCwCiRe6oJx58D}jzYu0S! z=N}1N1OAT+ss;P8#H=SKce;`eM`Qm+kJglbxL74LHuU0oT7w}DC;S#<&JgY8yz6YT zQ+?U)I7K5=>c?8KM%KIr_7R8b<%dXkVX+<98V#w>Khyk6U89c=mkf)#S)mRSEv4Gv zk0Zuscx~DKQ7gyLbD7kAS9KNp%|F`fb^XjKzoA;!2CZRy>^Xc=q~ey!DybA+&E(j5 zrvf*NHZsQ^p2E@LJe{8_kJGOMbAf=3b953lB+ThOL05}7cS@Aw`da8|@S(1*QhF^d zi#Km*U4=f$jD>IF52egATcFl4Aau~{f&-~0lnHL%2rDOUkz%Lq@}St9fN~1O9ve{b#@XG`!a?X6b44ATUp#PEAJ$0`aX|I+ zfyaa60Kg~Ydt~qeM-t4qtFN4>s^KC*oliqBXY>d^s(doHTbnE!obB!UcM!$IQPDsn zL*!YTq&Y;ZPaz}^YvUS14vUN0@8L=f!R#Wc$BQ;#kr2!-dY?R_sh+QnCbPPaaV{DL zJcWrsl|b+H*6-%IT{)oUfZofcX=kn_$FZQ!TV)`#FUu{x7+-M_4mu9{(Vd06XFS9Uu_G`{LGIl zR{OK_N_q>4W&X_3alze~HnuVQl9$MH%u+chBA2smZE#`Y^j8dgPOJrq1-MlwR{^Ej zj635TnND~tv>!18>48DWNOZAHK>$`Oyr+8WZAy8ueG(nvb46Ly?^CV6j4dH$2O;7H z*HVq=o%TxI#3_8yoQ|0(B>PDlri@lLYaBb)ux)K|q%Nw;DoSrsYbwfy+As!Qs)-H2 zB2tFSdTFf+KQ%N=Dx#6QoF_D}Dsk&}4G5k;!Oivo9|y;spI8G8_J%pn{a8BF z$@`?eqo4e7&`$|Fs5;p)WL<c<+dKCMfMPEE&zv}IK^QUKd_nJK5p%y*)I~-)&Lame z@HXSN?SAs`_M)#*iD5XwydxO6=Z}l8{8ubkMFu5+7o06+><_{_$^ox6?7A2~ztn+8 z6(fFa;Nkd}FByzY03j!;2-1`yZ#W6V9x+2ja}UgmsTFwrulRp!;iQqSBu!;C&3mU2U);t;{)4my005{d_ z(+6((GqV(<;V}(m?Z}~#5djQ|5VDeWafbVhV`{@r5TuAUy{0++iywL& za@O&l)nB`Q4WAg`ku#4^~I&s6N;|P6E=$sZ8L-+Y? zks>Z(+N>oZ^YlzY4UwvNFQ9#*aT01=mo=%o?qanXMN&;UtvFn$tBp?BYj zDb*M%Bx(EHQu;uC(yuL==cJAa~ zkz@z;s!!k%??+tu9|H>?uO>Unk=l*TUX`Ur+=0I0KK({?v8XtuuW?o%q<=nQSTZ+|?$5Q9|Ga{d#p-Xo6w2z{KgU4r6;1v}IMv zyY~=-A-a2I2#2sJKD4`90O9H^Gehsgc7OU2J=dZ;=(+Us0FyV88MoOdf|p^b-6U#! 
zhULFE5DIwqvfJsD zal(y7`|G_L6MwPt!wc44tu`u5i~isBKKu3BNl@+PwB_ce1=%v84+aZ9C-a84`pExp zYrY-=T(HHJ21Vtu!3?J1s}2DaYPv?mFa2CK09ryiFhZY6DC73>T~qAL$j91MV=Hjo zbx#OPbt`#5P0i;T{h&h~t4-9a0OtAYYFCbJ4}u}H4NiXq#NopO9l+rfvS~`_FY;<| z$Hjc#xisr~eonskOAbVjE<}Rdq7=ivxl2I_#1zjwX zsX;ICs0pqGili?*;tvsdhp1o|_7a0`RLClV%}`q-&Rs3MOHw4b%w)j&s<&v)B0`6hUB=rN%h^TkP5(l;~Lbt5Qkh z)lfA_c?VUgDfM$0-9vVy6n!KTcAAB+lQ-tsAIWwQME01EzAUk%3Zk+aa@uFWH(pIuwH%%og(Bqu0tKStiyij|7c;D@(Of{rxh` zUt{%=!~fQR2*GB~V-#&qrdeq*b&NS33K|^u^!<#iJB^<)`SD%#KD06z3~DU5VZL=2 zSYJPC?Ls%1QTYRbqM(93)Z};Cya&m8SN5+ZLJ^9}B)!5UeJlc{t}o?a*s1;}}i%B+yyx{iQbQS|!mz z%rUa-ybk#1@cm_fL(5w@LqOL08<|-hQg`vDaM8=`1Oh@4IgGWz4v)cz)e_JMtgO$;JOk;WLnaR#{A9 zuzxZ`fZxDb49+@T0B!c{m7m^yBnYv8h?o!S#2R*C9?@1V%^sKkp$%xw20R{sAD!9=r z9wK#~<^>8aXFK9{pK&}RQE#G#(JI>Ha~vW=ma}r@+aIH2OIx+^ChqP;&l`>7ZsT-U zT`xfCOh6%-LfF{Ii(9L)dj?!A6z*!`REZRzGdqw*i&^b&Gg#9;6Qcg-S#gyo#qNLack(x(7DRt zNuqRZ516=G445U0ofg@O=f7by%uHs1F8QM;4iS||T(&T$BClmZ#p_)=OJFe}?XkQQ z#DyKKl0%+VdrZZYBn`;1dwlR-?R<_VE!&N^v-&&{T)ETBC)1>WO^6Y^+M6rd0B8o3 zG$&UTebtFL3q9hiFNr_{%_7Kb7+U`Nz$Y42k?Y8Dttn7^uarW@kg}8kSB)&<J0F+X_25{0wT)+&dqwfJ9m*m(F4b19%9fc*64wg`FDernscxiU?+8(pH-)m@$XN9E4Rzr| zj0$D5XkiCR1a7PrQ+eRwD$D#Kj{}ej>9{Gu=!@23fkeH8TwpkX>%%Yn@ zk19Ldg(O^{#vChvjV%C1R>|=W-J*rCMq?KL9=Tw|T1=UDvQj6kJYWTWCs6P=CZS@g zzn-4^+tuXv#PFXfq%1DnX?!tZ!EgxrN9K10UcJBhC&IM8zddA9lNp(iS|Gw|9+^pQ zTyAiK$I_9}Q_ArTKokhSqFcT@b;!-mu$~>h6;MA&O-*^!)02_Rovt`qytz~!NlZIydu|bf>yWNT5k{s9PU6ih@WxMQ zSLKY-BoeC6Yr+s7ACI)5Esq$d#7Iz5U~R(V-0+B1NZ3!**lGPQQ~;URN;X4?YNUyF z^WUa2+KCBKJH1x5kn{qYVWykJ1N5d`r6*h$nt>;SG2TRS!uXnHQ}ouVP%)u`Fed@~ zNAg`+&v;bAaA%32t2Z+y$XHjGKk#nRf69rkkB^aZ2JDWEdsi@lJSo71GF#|1jqJMb zLv1>LeSu&)TiK#;?Ie5iAq5t5Lubc3M;%6-tz&Uk9}zXFam~>Vc!%J;hggUC8anWY ziacqlNH>T%!G4kGhwJFO6ktyK?Vy~`6yQ9+h|7kSx%n5Dh-kX(2mThq0Ctz#N1XC3 z4yWQY`dJ-8TzQX~sMkMWVfOZ)j`zNmZR3^U#c?>HFizt7c?)VNiLj1&J?g=q&>f6p zuZM<8E#k-jRIz%Vg3uv;(;4Z$3Oc+}!gZ}(!uyVDS^bCs_7G+b5{-Pk{DMUs*AGCf zq&~3#LP@Y|o}6T8n|yp*>MvZbm+J 
zqOjL1Bx(6Uj<;WQstFfdUkR0YbZ5tg>m!ptBG~5VcN!c1;7VqupV@neBvcIeLNI4C zbg`Y-?wsmjd*KSoU`jL$jlk&vMc0s6>M2Qt>lHh(Q1S02T^wlc8#+%^F~oR4c1A^A z*-7yl$HZa8P&uZlert-MavnA}e#W+RrcM#Ao;X^epaintyU~x_PoACl*)Q}usAzXR zs@=y3N9@k{Hk~c7PF{mOmLvt$q&4iGtnCNx1)j&eI^i$yAPDkBdz*L4Np67=aZph~ zR>ePEz+kuB@E~mM>^51loo4e?1BGNCE)I)Hz*_e!X3E}>Wq&XT>+Ko9!vaBtxnn}m z6vCq%zlT!JG{Tzvp=JJKZt&n4j+EDE3+kC4t}Nb5<)g^$NfnZ7w1vXXmIE$EGK85$ z#RaYov8LQ&2_*|rL*MHp6Uc&22vet2=ZLly(qs8-Y#;VxN9Dk5@OqEse5m=HEj!}V z?1;8j>FGcv6A5mBS)@OH*!fa>y!Cei8-}dsAIbNgB&$PLop49^6hjC5P02Psw}(AR z@sL{Gb@14_{hn6tdjinRLrTcJz&Hnnun+th4a}+`w73uQ@+KfIOipV)tg;if^gPC2 zqTvhf@M2b`5d<-)2e=WnmWj?f{47d@?)xJCB(m!rKe2uz)Xb9GBYp>r(L<_&ms4!A znJVMWJA=ANgrj|AK{z{m0D6sN)#lxGcbGW(_1{R^l5Gqr(MVfRQ{97!0FuWT>O;s< z3!X4*qb}z^KC8G=9jO`@$>z=R!}2G+3r7P9e=_vrv7@{&%7S%OwgA`gA|-A>JGShr z4(4DbJGA7#$oG62dC8Q!L6$6;e`Davh%0>7yFWkF6Y&$t=ItUesqM1RoO@hUzI6qb zJ+hSYZ($JPUv7+)w6?MmQlOE>O}a@T&p~FNz=*EBcPQ>q^)~$xf38BEpDB$7Jd7dN zwDU+Q_^A$&rxbpOU|hXr8ps z(vK`=yr55PZ_zg%=P@H(FXCGB`J(4J*zcTVc5F2ahy(~&c1ghZ-Ays(LeEfQ)b$38 z0cK-N`?HyS6A|01JxKIsff47;;=K5Us9h^`dolalEb2srcl5lep}Y!`ZtO}u_k?RI zy$&x>VtiN^+MO2OLC75#S?1l*V%zP;*?OP^AA%PG!KkfBg( z3;@%OUv_?I@Y9CeQe5Bft1FM6{z2_)X=7p9p)*!YHlj8%84xzutECP!T`)^RaYlEG zwge%7UDfWccc}6ZK}(h>ap7w1Z@wA#1OJ%?Fiz&O9^U3~U^x%VQWP?LkIjda(ljnU zGKLnFf9em1i_MjN5a$FzKHRY+Fw?ql;FPtyA@FvEDzEQ7{UR1A+ZBQsoZ0O_3`Ubj&jDHTq%0X`cYf9xm0Dia~0ozY-?O=C(Ufg~l;@Ytv z9Ni;fS+52g?BxLXL>taJFmd3Y7Ht$p_p{unO0NI(sNW?ixMO)oB3{%DPue(9L+$TYDz4~#WI$w2 z8t`Mj5t^v<5D;a7Yp*xwAZ;0%oB!(p6eWPqaNh5qkRF8zvi&AAxm0-~_X!p3=OOjH zKJU@6V~c^nB8+eLSqf~(svw9+cMyQj@4bAZ)@x{W@PM0 zdNuNLr3G?QP`hc(G38Ui<2#VhB?IBP?)P(yl_CMq78E~!*}Lnmd^Q`vX12jYc&WOo z^modbR>R}4vPeZASVBi9EZdy%`%89P54TYQoeWCeeAx@_C&oAg>0H)6@;K0d;?W^8^KPEuE^Bz=dN~) z2;E`AJ)RPXc-gl^rcA}3a5l`O+}VY0$)4m-J4T-eoKV%GQI&EhM8AC(uz;&oVmHO` z8p;dlA3+p}2~=*bbwpvb!l(%X{&rYn&1MdJ{^5Xx2Z=kSjD+`w+wKzPhlu5{U4O3o zt6_A+iMRlwI@kL|dtLaduqT?Wot>a+G4vcPM>KSm->Pbn4PC5DVPL{vR1IC%uQ~EZ zqXmsUsMvH?LOr0T?neS#(>Iu5FZ_J)Vfvm#P%J=mjgsLt>u 
zM-VXCS|ao}G|<8$aZku@O(iX7#Z{mfP|~IFIfj&SuTQr#xgpa&uEj-c9S`{U7Iw^{A}eAE1ZZ5!_wcM z$ZW}Zs-1HoxB1f)sOnTfqI-o3p4>O7fU{?_+GiR>E{PE^^!@4_4=2D<+dMrJkSa=M zhs@CPyUr&Z{9#*-RiGK#oYF}Bp@TF{{E~F$*BRcPdutbwDY}@FO1i)W=H$V4?;l|} zqWr1VbTFGC1uNE)Twfi%Vuw>cT)nRk;OF@-b?Ja0a&jsXvfbEU+jWgjOt!o}iUn^D zE5A-xf2_YSk2(i$uj+{ZufgOo>L8kExIB$&rnv&T|KT zC)Vv+R$C{4`mbbMUmXkeDi6Z*S%YYu49%;L3sV^oAp4t$zZn)nFFosE)bWTxgET-kQ{yWO%%V&ihDwH> zC|Y6`-r&yC-I{Qd7(ihY8xyOn*7ZzZs?^ymY>#HF6%1bh<5s*!VJ#{x{sT*eFd?6@ zxCHueEF^z8urOf^6})LTkv|ikQk1~*(I&1MNcsR=%}5j|3H}->x4JRU+5%I$Lo0%z&~$V)akr#UgInWELPWKuU4ucXUq$W#?yfQNXl z4{gUc7!w0zWNc_X40TQAMAtr%l@^<&rM?E%X$9lcf6`eSp#LYybtoFei

GOKVzsV=FHKHYFF-NnSWMwj!yXyi~+kPy$@Jc?^5Q36*|cy zeRBErIk58a(nhE$GF|nXotR~p&yH289R}ytg_w(r2d(*`#ny05f*@_gi$O zuPUaozGt##oqkvtDomsBKbiac-Indki7ZZh0zV-oEWoK`Xz4ftnAO8M(ymODWH9YS z{O=ai=uKuOB+~*<9SfP4ZVUi+Ok|W z#&)BZ@Wo6Z1aEf}MPvs6XT&DdZ^J~-xVK7|aE$m4i#G7L2FP{h%(V^j*^wM~bfI^^ zq2BFR&5Lw@avF@Aw1vC%p=eBa+l4v6t)R7I^}dAj)^#Nx%*yRrIGk@cv2)dY!oGst z=orPfiSH3fZMIe`$?}0MWR7_+e}Iq$-XJ1S)gCn0!izI#2H$6N7ScyZnsGrnRpK;l z5VS6iUk$}CJ#P8L#0qw-IyzmBCcf1BfNyI0p7<#?$!@G0QvYd8VWSRv)P`cr&@Hq| z0=>|tbe9RPZfB|F^<>O;?TK4uU_FLZylL??`XzvlJl{uPd9o7rGmE+yE;mqW8Qwt$;z0KF$j%W7f} z7g=K+U>!O{3z@0j`W4xrm^E`=$L>W#xm5WSC;(3wK~qAALu}pt=Ye2UC?gYWO=XiSO zBR-OB=Xzq9cbJ+2Sq34UKZGkn(syLbIr-;aF#`~OO<(WZuPF8`@<6q4jcT(rP^~H# z!nF@v5lZ?ff{gIq1jx?v0>aw}7wu-0YCE_LG!J_-&j-zlwYUR=qc4qIdDe`*6M7FN zRGI1#Zv;^1c%+Q`yyG)}{Wj`$$7v_$|NNN^iW_0;dVP@D&j-0r3EW7d3Jst^v%$aY zOHqcKHwzonb4`qdv`OE=`OYAp)GSD~0)B|FqUF)n^URiUYYMi57CxC%E%M3y{4{9a zrh*5<^Z8D&O`S2R9c3_mUAT8#(|!Gcmc%0wW}qUpqngzcka^xOIl>V6D9J+amF(3) z=q*(8=W$kmTQCQ%<{&-D@6T-ti`jH8rTOODDSi86oxJ>8rJGrQBggpiwe47cqufsz z9SwT3?4ThE+uVl+pMboOoUC*%rseZxgUJ!5orkTd-Nv$_=>wBFN<+BlpzO>(g+0?v z^gluxNP6+C1+^616?5Fmo$W{hs)7kNO&Os2iqXyeQTwXHIg9Zdt9#N~GQ_*EKNY$5 zsTbIZM3Ch&E$tKNLo$XsGpgOaAb6g}bDSJoGvHGt=kwB8)2k9s$$jp!dF*DV{ajnM zxhuxp+>(&pHNR?Uzpc(VPaRUs`kxn|yxgM~z)CcDlJJs}`v*z~9Px=-4dW_E;P;Gj z#SyF2?c8oYW-T(3`HW44rO;q9d8ezLG3uQId%Nv^ZF2)mw*CfRwoyM-CDg-X8S?j@ zXvPitv6iqC%4J>`zEr$Vy|~ut4(e*Gi}uQDy~t;*@*8STO4_(67bK6{VhUSOFxhv( z(T)r}P-!4U!vg|GGfnr`m#bVlTst!8T(sTyq62G`OGX z5$qoN=sSC#Z zmV>3~jHPyn8J$9xO{e>ARK^^yB>LH0;eg#|cn}MU8>}0!VtHYUL z1JInNbJL3Tch$y4Yjdp@%$>2EL%Z&0JK_?Y9_yS4{Vx?5Q;nj_Wq;NFnu`msEfgoS zl|hLGZ6e{FJ25pVs(O<^t`>2tj_By%<&e1~pzuIPK}(}e`(l64{X~cF$|ASYA?@q? 
zlrs*)K-6@JPZmEumLh6iJrObc;M3~K1twhm)&*w52A84*zDmCYe>;9VvjIw@cJsp) zKSXMS6j?pP?%{`W8 z>kgN0rkGCxCMCL-144R8c9!_Iejja2G8Fary#@~fb3Hx-S+7AIwU9fm$ED8y#oTcr z$C;1-{P~hwPC6>p zC_%;$>|&dR!JIfR1=X5!RUr-p>z5B)m}7MJ>L-pXypw#DjfF9UXJh2h5ArK@)0e836?Z*4 zv*EEH{K4=2E_x&E&taRHkskjE3e1FIR=m>H8S9+PksU*X zXA8^i5Uj_})!Vq-$`{=>(9C4QZ;K0Mh#F=_r(Z zx>yjG?@Uj2&29DSAk_RndE-r3dp z^b~S_kc73@-^0a6tN(~Fi3S0)AsF+8Ym?Rek2GKpZ!p13m9#il8qk>SUJ&GPeH=Dv z05)BTbJw#Fz&=rh+ndV*#f0ndmrR%f?AL>$y4`+>t(a`n@f0;`v>FzbsJt;Lku_zu#lN{I|>Gj*+7m; zx*vs}%l)oV4KGg$FUXww7D*#Ir>lcmfgGKRqNr}FtU`^ zY)%J{zkYjGReQgkFMjj08A0oqMebv&Z?*_%=6c^1J!02E<|YzFL2YVFp{)xEyha&k zHSG4#)^#@Bdv3Szahk3-IX8tP4Q%e{8Q&4y{6C3gUSsLcpU$2Z~;mugg} zXYSc)9}4A~&)IDojUTk-HPh1~&s;Ef8d)0}bJ}pden%#r7*PMp==Y3tC$e>|)`izS zfT9jfI-R!zlh>9CC!~4I>Rmop`84D5#Mhx!*}hvPB7Y0Dw^~ZVoA^T1QPGDo($GF+ z)86Ocj;i0-L%pvngj6m$fa;OLT?9-!FMMn(L;}lPE1_|>%#nDM9wwn@ut=BPMGUX5 zvU>?UbUQL9 zBKrfTq@gqy=1i~f9rkA=Y+Q_QWmSgocP2vN-)y`^HGgKmT3XfB;A6o?Hoi2oAY}kc zi{9r5Yq=-!Q`xK<+Sn%>jpKT9OYBG{Ey1Kjurxv5^(Nu^uy#itsdC|Z0EHXYBB#)q zxushx@)=5v_pzA^-FBH<2YHJ6irPOjf>wg(K3p$%hc)ZL4ml=au((|sZsDZ5J!Yuo zbpmU3MO|E}J>$97P+sSNP|$|yS`U;;!lt~9Z!lf@Tg}1_39e%vhhYl1G#G0;Dmi82xW_GaEljXKqo1S zmJD0QoW-cqq34Yo9?pY3lJ%gHym_O_&11-0t|GR5`^*S#qYYUS@D<$hL}a8h-%D)# zh)@<4#Z!hkdLCfpZ}cR;yPmV-)G*E+TG;9ew{$Q@^w=nepibpQTY749drN2Is{Z zvN~q#Q1kaXmaesd61 zPwSLzcvPhYt%bqGUJnYY)!8s+lN_3CFi|`#m7jS36b_&9nJ(wdD6!8}@JyBUD(2Svqf6RSt2O%ey1GJTj(r$4>ax=;r zqV3#_n>D-t&SQO0&(w60#eIDE=Xs14u>=lJ?|zsg=|)fBG_LC6n)2ENPR*Sd_R@3b z@Q91zU1?%p;v>T1b$=Wn(*A^n)><(Pb!IBB%qtZ42w zz$($@!_Zf-l<$rJqWb3!`od~-J7Dn`py&#-o*xTb(Uai57>o`;Q~YsBjIRebBx9t` zw0VfwcV2V~kl8b6suET|K&b*FWJjz|E6F|0o1@CxuJj^jGcj%R83okSO>Sqwp={yl zRS$Hl54G5+2G{;r9x9RNPBW!KJ>NygL2=l*&S$c@BuyX*2YR`}d~tBO+k%d3#;KmO zK92+?x!;jf=`Ck%8L>@ty5*&Q-yv&~lZohexDJ=g=IkW2w3lRdDz9B0$=jwKS!VI# zsYkwa%d=Pbr<5fSJDl2LNJy`v*!VrX71O}@@VG*vQ}<88^8NU_Vh2oZ^NM=<2r-5B7` zT$Ij>-mPdH+E<-DK^?_p@lFA3Cb34>cSRfrs8CX=(IZTh02`Ev}XRDOmcg^KfeE`;LBf~+gqkZLA*tTf!e_-;5YfyW@aKFzk@?Cy$# 
z*yPmiXwg%a(JnOeIa2c5&*wX6T)#;$zgqth=x41#s;?0>b+3xLE@#QlLa|G25f%QPkA9F~4klUnC%iy-Z0*;RbtrwhW)^jc37I)VwgOsDAhT zR{*=PTM91Hd`cpJmbiK|dLDyU|BeJiCr7*Ig%anQ8rJRxwmY|B%#PlZQs?y{vAIp4 zS2T8tS>xrA=p?K+E5a%U1ocz{#`Kk=jXALdB_59tb}KrS<{=)DR}nmhAf_VNcn^O} z^l-yR)W;m3y7`yn@x#@z3|uw+M34(}V9}(_)RbrlLQk^{zuY7pc*g+(5Sx0?Xe5Uu z6^N7S-wSp<91$h;5!nSG+qkit$9-3YTwd*(tl{aCY?A}hhXd(Z5e?3?kNLRrNG%8< z#qawcuWx9=#k|IFL*u)=xasDk_t9IWGQ&R)EsWJx`b>8SwNH!v@NZH8M#mbio)Ylx`>ngndIE$wTp$9hJWFz7dbU0v zd-!JqJk9(7j^GJAnw_CKL!ui_lf3jt@%1-JqoS?8kiiest0QvH%C=#jM#4p0l9_z4wWI!-0KbhA ztpaj}7$Oiz{>?S`lD<==&I<60-Qfwj%6j@k-3L+f!a3@Nj47=!qXX+*qvr-z2d-I4 z=t(E$)h_A$vZMTgP=z~dQU|9aq%F_%%hwCGY)?9lMIm zFbWhBL7t^Aa=9ccQO{SLXL+nx~5{R z{RIEtSeueDm3*NRrI8)o)NNi(E!zd+T+0Bz;VU3k=D(mJ zbpUU9zR5hf3`Idz!@4t0BcJ)5hc~*g)cNC2&(CD%K$?J=8EQr<)*JChnY_|O6nJzo;nCSl zE!){hl^3e2tAE0d5h36~jP1A_uCW@fy|NI;a4bJc5u4*(9c!?n=f87Nj0nZ934C&t zL%cD>9LQt&w}Cxn7mo6JDb_Bqdet)%BPA<2#$fvsqWEmjQ1H+)Rrhh&hX$>u)pHx6 zgDNtWBqQ5H-SOH__Oo(0CI4>9ntnoagGz)0-S3O$yqO$nf#{c1#^1zk#wboTMo(1RW&n_R(iVnjpNu4VSuy4FXqtT7M+!mD|$I!+2X>5NQ+SZZ29}pK+l8Z(d zE$OxKf#%>2;bvx|50Tc8MEs;aC!sFg?Zde%IiThM!&8*nadd6Z$Q5T;G~lMdchATo zEOX<&LAO?+(xpxI1H^CcCxUvnIqjQkLcmx*PPvJeTQS?>f2$qF);P7T_6d(T14Z=W z8LF!->rz*uMme(}gl9f5J)wFFs)M7tDRMPrNSbr-)>zZu#2YCw7aXNg+k z<^la|S@{cI!t%zbPd`yKF;*^sr&YS}x@YVA_kbFH%=3*AXCw~c`OiD}lleE(@>)ot zIWB{F$x?`~0}w9x>G8*?G2^l1Uf80dEx*j)zn9yO6kc%anObcg`X*F(64GJWd{Cw& z5G?grw9Y(2{k{&E8MBZxYT@<%jFx1kQbD%2rEk5)^^uKR?g=_ah3-85CzY*!u}|i4 zD4U7>Mz27)4KD7Ikdk!6c1v?e9U4u4oR*W!Sd>B<+O><0&OEtLUkza8l_d?gRsDS) zybXXIf)RsXFK1E(G!PSEA)a+Ny@%{RO?sk)%f9TFnC#fon#@1xN;Jc<&Drs2B(pq5 z?LuO&#j*Hs?a)h_i|}^(80D)eq)7V&V*m$17=I0v*tG~V)?G#FJkGMW5Jno$tuTq{ zgpUmE*U#VOI(=eI5H(9h`tgT^Au){e9M48zZ%HMHY`ZkE%A z`7@EI^cWo<_ms;@<4Y>o^Hy!y1giFHJ0{udmrNQp^8B&NkflB+=?B7|`C_x&VUJG_ zwnrCH>of2sqS6sJ6GD`O0VtE_JVA!`_BIEslksU!(O!p8$tKja`?w9{pD)9qUZS!8e?(^eW4z2vJ>sXJAoJ8 z>)k@EG1?5_XD4{E>fvt~ux&=&N%pIvK*fc5ebVzQn#F)e{Cq0e_xk9q(LZM6Q@e0+ 
z#3%ZhGRDZ8W1znu9`PiR&w*$!vgoJVu(b?8KLC;-f+2`fqzfV`3*IE!#Tf6xcwe>&0sePUYS7K`| zdkuC+^~uF@lzqhw*V5uIathv%q}jWo<}G&8Grc2fsPTa&^8r#)t+ z9Z8z_1YUk7I!tpppQAeOlq7)u+TI_SE7QKh=z4w zm9jaYAHa#S`B6*{m=Q5j>WhFGdV|~Z5&08gL6@^oaV8KZ)LMv+Zw)ShxvP@HHG&?MPuo$ z4SNcE=(qPEoi2*GOEU@}b>TWSTqPvkrFK+Q=tc`)FPY#31|M%a$Mwj(~1L%TTv%?)2<+@b=!JTeJDbgUt-67qrdE38=>_3nqPei zTFcL6S$=q>j-rxt*5MJujI0j~7$+zENF1oc`Sgzd3m805!SMZSt|sP zYfp|88r9})&Ot+sNGP218noXpa)gJ%2x}z?Um88^Nc=HX_0)P=Dg`fwxo>FA3_jT# zy}-Qxbo|U=vug72wJEa8@AdeZx_ddJNx!pIkZ_^|NN;eGq;iVvU*U28f@El8}p=J4W1k2{lzN86|_1(#NV;h zI59m%r4SxM!fZka+ealjg47MNn)twqf`d{REPRa(I>PD z@op-^>M=59frm3OU&8nXKtw-*lhCZ|^y{O3m9kyPUQj2?hsV|yu0+(gXg<9bUa<;d zGO8Gb*ZlmxNx;YEP^*qsti0qF*TQzUBMa@MRXF;IjbEAkcv4r4v0xr2v##`6((ofe*Q^YZ*or^H*o{rA%q8h`-Yh;N&@3iubK7-mfdEMELIO{}c zH65Q_e&%fQ>0#A{D^ZhkPn2D64&3EJPdSp=tA~97ARXh@#3{vL(7pjlrS#R?%4sEE z&Z0f*&+%NamNtZS?l1r8brcX@!E95%?&9L6IoZE&X?4}st z{yGt>j527MsUY2}J=>_ipeFTuLc7+s0ay;G)b5_{IhN?WoE9Kyx9TG8tIuKvJDYtI z@!MCAw=@36*2l^$Pah&}vw8w|B}3}#9;#Iz%{_snqX`k0HJisYM_?0XCIXL^yCqL% zvb}Xx&UtIN>Y7EzJxBM}z<v{|;{V+#{Zyw`dRa7WV0<~v{ z3m{nqQJ;50vkA2%LUdXXzz-=7nTl2IjWU>wnp3r27$(u%YMIsQ9y{H5ah9@kphVQ9 z=PyRWc~!PMtkQ+-UUEwl!x$){Xftxy5kdl;W+kuIinDt8F4&4FUT&?g~w z(YRyfwq?%1jYr&gw`kAJu)nG!wC03?=b?>TjDMcPpdBHeWz~1ARB8R69hqLexClF+ zHid=@vlanVA8tww+h{xJ)9S3o&HWjee)|{NT}d0}{`ellq{75#sUi0h=uY(WJo9N$ zXRY6NTK7Wq8WXD~XSm&8YibZHe2sD)SM49vRxb~~E>I(2$IDEoSG8cKuctzam8*?( ztF?GN?R?q_$v{CilpbQK|KC`ypN04o&8MI4IaFOyXV25!`iblZT_x{7SNW;`S?PR^ zfcp(1imCnE$lnPg0h6?|63Lrl*cFhpt%Prf3=Ha=ZYK0sw=hWM*;+0CF8ZGYHOPPA zPd>}AYc!KnzcLpN)O54movL$jN-b?a+3r_b&f2qKVp1Dv^U=eqxdzW;*@|6QWH zAj&TpWet>z1kCr72afjlUQA8;vq%ElNGG&H39V_js=5ViX(er3QGCp8V**cFw? 
z;;p&D$L}=Gm!~1S#W6GVg_)@nl!7u*5!K5>>dt!C|Nl4og`oTcL>IJ#6!JCRNTtOs z6#BaRLP1WJ-#!>l&NFM^JJ}Ty9HxoDuwv@-1uM-Vg04eOgObcHPh&0^<|Sv@ABpN> z2v<%Gl_SmIv$<{<)x9>D-j2<&4`A=ug)a9_q6kGcAEk>7$4(0#G@9MFS+uh1?FKsb zgUWc6CNxRpv-(KZ`M;l6{rGz>L?qqDB_h|yJ!98-_Aoq=a@MAuoIlTjt0;+UCVYkt zDwu9PKeY)?kUm5f7mmE-G%QCPQF!H_RIx+=Df`__M_e8fPyWLeHW=6314$P#x`7cGE>xP= z7Oy+L(_u(Av!WTBO@TsAOhSQ~QhqZ?Cm#DW+Ci>gcCgyx;d$$-K1!q(8<6|4>g~ke zYdAwVK!$yL=iVt&x55WWqHT-MaU&s`j2E;P)@~ z7l>;cO+4o&M512n^5;Jh)P;GK=vw@l8^L)C!fi{~)1l`cQ3ed3Ay8oC=|MEd?)tXe zDITePm0kW74wyR%Ilt=3V(urOQH9;74m_B7VGT`gh?LqXP}SLCs(6N&m6BRelH^Ld zg$>tuQ7Xl@+uD>6ZQJ_OjEXihH1F$eN|+EOAIXSzVbVXV7)$c9ILB*FIiA`0F>3;n z=O1K>PcIREj5|2Nh5LQ>dNnr>EJ%7(m=i`jYvg%pW+UZqWNb@f`R8Q6L;j=D!>aGN zB}gap-<4`M!R)h1ezQ(r{+h7#o%!%j_SKzAgScjS%)z5kkFTWFr>2%xMTrsi5#G3w z4?$~Cl%zsnh;m@RQlcW`cjmVQK{2L1;M&Kv>-wJVV=L3E6EKPkuS}5B%KCx2o`;@&j(yxe~re zKg|O}Tz==X$E;tOLKZkrIgdpCy}esFlCn!w82oWcpr)T=W4 zT|>Ah?Kn4_X)x(p>G#Gm5^qjOU&KDbAp!I5G4*RPr@Oxbr(7+d^^4Gva^aSpH1xcF zan)Ka&Jx9jaeW~jK>ua(#>Kg8+mwjjX=-hsQFC~2XhDGIppSgc$@Yyw2TuDWvNB2* zg~+zk01>1YQ9m+$eD7GefgqFdY1G`s#Da468922sCI1RQ(w~nS3$2@5^1pH3jotoK z9=j70&=C$$mnHXortX7Q-rtgK|rWGh@b|z>dF&57luwEof(J;9TsF$DNj~+lHnN_hpWo>_322#o(r#Rq^nE zW_0?q_^)e^)0E#vK{4aj6rHT#yQ1Ig_c~ zTcujrJJMC^;~*no z^o*p5QlPj!fPZA6$i-#*f>L@E1()@X91uOLp*1dDDwwZyKOYa}o;`qs@&F&Ga}X7+ z<8;dIM#Yww@lGIRQ%<8PVf52p2hTMksX0 z8@7C>3F8D42cdCRli7`G92rjZ?JJk4ID+YCm^B_70ZZrV7d+=_fS^YoC#ED?s}rc~ zH`+==&bYZ^7xsDbZw~cUMck*f1Qw8pltcCgYYGL_Z5c4%wsFCu<8HoP z`@)N-t`}S<)@KTh4;f+Rb97CFF$NC!4KMbPVYHo3dt)zbcT*+rc zsj)%XAxwF8IB@1pY)Vmr9wBGkSHJSrY|vld8@frpQ%lZJ6c7f~Y&6Rrz<8rOvH0LQ zPke{u^^oj_jE8ZkFwDbeA@rKoH*gtIQN?);rWpZrwDfye;!Qm5NExYvi-Y(`Lo z=0ct3Y5fE2+5AyU;-E-&-X`)Gb9a>8C|#J6b5ZnD)Xn-FI-4A8)I?_l&FSbUV?Nu7 zQgDcyX_Zh4sn6Q`19E7{xs>s{#h(%MG4EO5ZLqjVv`t3V>ra6~PNQ^jA7~@1pV^IP z33P?;?=&n%lH_kREcxm>0|DP~9O3bl_B|lmqe4X}4a9kg!a>_z1d`U^sGLp$VN>wf z1&&Oz!xCs4OG#qV->Hl4_^V+PH&*|XJ&VDwH{qF%A(?xI<%x+7m$tThS3e;#zSX~7 zP3R%fa@?YU_AuU>ak@O^2I8K})#!e)rNnQX7Pz_D;yBXWPn}rM1A#PZ!`EWgi*~NC 
zc;K~xdcof`(WeuBI^|*}mu!i1hh7wa`wx5RSBT__W1>8{`D}Qka#Sa>-agHfh^ub} za{@;f{?1;AqtLmYXceAXyCn(L!C^LMCGSNL`)pB}5pUp@iJ3kf z!cY5XJd3nOL}L6AgetOUHK772{tF3W+rR5td(Uy&7*(apug7KE%hvbYaN>1QRY_gJ zA2}iUjh12K509+ZK9lRSg)5II-jOzLXJ9-UQ9DO+MNGJq5yEgzd>{Z@s8_d)tRDAW zf2P+MXM6|oV(3UgkB7NLF^nr`l4!>l`Fa3Fc0Tm1hbQDZ|TBZIb?YyEM zDzF56t|C%*>YrtvMu)?=iZTr;g9!fiL+CdmPj3>^TWO;yp)+muy&**qHwy)6Dw$GL z0f~N>R|?rUisGXgO)w~qQ<);x|Kw<-e`>2qdHf>HeK!?z;_>1kvkWCQ#n#j`L1$)Y zCZ4WgmVW;Mzw}4{kb2R+yP!Q5hW#5?;qopkHPrgEe?OS4-Q^Nrdw7ArC?sPojOgmL zghveRqMx>+TY6V(bs16XJO%_xoj+F%!Gz?|nEX{8%JbiMr-Mo(W&I9lQyAQW7=0O< ziM*iv_bPQ*1{M_sW~QcRuo2Vk0<)j?)PmrBQE8u2sr2e$yPQMrQIU~Fd+o_wfnJLN=0`1`ZP%OMc?GF z$ECUx1`np^Gq|ncjQ=Xr(XE29_-wEpX=ecAq#QzJRK%dfM~aNRe|Q*H@q84dNDD1; zm;5WIa$lEYCh}=Vz?!tYA12K)JH#J-g0A8G>g$k6F{eQg5!l)sY+BJP)woDCmj7i- zwSH)4Y|VTn-{jbl-k3ei+{QE{ zpq$pM`4P`B#WJ>Bc)5`6!3uNkXMI-JmI6EXZ>ziC_u7_aU zWb@Zw^^KZ_OETmTyA)2ztfOfX_QStyzjoi00G$Ld3M5pjjDCWn(G zB{vRFTDN0GHWqgftH@i*B)D)l&bdF-&sZ@nHf$;+fhe%5^w_dSgjkg;*6=c>7YWkPJlrHAG4dVAVIaFg@_U#P&{FIiZ%=V z;Cd5Cj?*ZUUTVFVMyCryoK>!`E^`g}@L9e+ieHKaF8gPbosHJU%rN`sQ@H!XD8SQk&WXWUOK8uTIQ3;&UU&`opMvF zCjf+Tn#nYYb#!st=cDW4y7K!u=Ydf^2LIG7pntT8z#K2sr?0?jf^9#YIG6-8osFy@F%0KxhC{Eg)xJNJKP0cgU>{1wj^Lzz90)ZnZsfu&Vo z;R3VxeYR8~Pk3MZV_ZWp$X!*j==Zn3tK6ESly-eudi3+ETyB5rjY}^sDfCP_&z%8i z@@v732{Zc?T;OyF(ZS9h2 z@vTkRRvVr+MmcDVgzL0reGab@Ini^VWh`%T^(NforW5skG*S^nz^Y5s9AR+RVfNLAryKv=YkZvV@N-Ic`B$7FRew{v>S~F8t6l-tQyy z2xSTe``M4OIQFGXgIwMG4DD$cU%ace%O z2HTC>MKO0p=4l?gw85xS3VnHx{Bjj#cwBHc%Z9j?yUCrLQS*0w+2gR0lFiKsdB7X} zuZS|}nFQ>;UN0uI4wJ+|7S*{VEPPm7EkE_?AI=Wbf0YO@JadH&-thF~CwfbQ`4T<~ zd5wMel?NLPuVs7ct!l}_uCmkL%reIrHX7k0|vM(Sitpw0k@I6vf)v4|9%rwH!Ba^#aAq|sn zt5u^6tGpsa_~sP+`a2E!E8sLaHq*PtWkF|thWSH_P2e>RE7krQZ-kQYwa$iMrGxDt zu?TdAk2!M@C)@!Ly04KiPr%~&;&~J!&8pCDo1?rMJZBY&?=(3FMRC6GYMb&UzRUMr zgTmEp$aBkARK%gFN8Q79muj0<1o1nyi~71}A?OFxs~|aa&3)&2l|>In(ZLcjO>^}z zXUegR#DJv!V2^)~S+@_@a&t>xLGoh0~ud<-E21`_|?a%Ic61Hn2NSX(3 zmOJyKZ^qM}a8N3F^~i*-e8;2?0vQ^aj9JLW;-kom`8*Xt+D7HaQ)WVv4#PB 
z=PeqyKNYd1BgFhC9(;$&+IpN1DXt}GAL6!XV;4_*2CD5EY8ea#<*t$k3_l8>|qoCK&K& zW1;U`qBZ$m#J9NJa7vfiHiZue1M2t#w`#E0ON-y)kf;#D6Yv`9RjAIZR&p_7@`>tu z?>DpH^IP&I8jSfQ6|u?`PpeNaJfCWB_?p4e(3f~wWk2mLjLv9o^cpbCUt5bEm;r1% z2V-il#&wiP};gez;*SrOiBH_Rik`a9bMR7>AcI?CRf9 zn6ZJ2GLJuGhcePz+qK#Ad(Adnxx$4N3>J5G3xupx0J0O#dWA*gyo9lN%`A_h zkJx$SiLVwqCVrbwa`HMUk%L0|{XP6EMU*P$t4N|GulpP^I4p%SfS%WTQJ*_{IM{6V z%L%*BH?FwkY1^xU6;u8Pk;!RmAP2^HIk5=vkYrv++|A+Y6RxRb+~J!q4V>p6;uTL? z)i$d)6E5tru#T|%v00d-Ig8A8|Zba>sPVO)t)jPIvsKSKF?WH)}Ov&GQw5 zmRH-QgvHJGsOOBLYU_pM?mrTL-WQ|_yXR@7Es^nLEko^VOR(y52vR*jXMtZA6g7CP zlZ%}4X{yvN5or(xmgTD~dK2aq5^ut*_W3P0zo1dOISyPN#UAdiDmeSbnzO@l?Ib?^g-nAaSUO=v!I{o>Mck{u?q1cUu1B&7C zyHy}!6SnB&xu2fU%YAh)n(*%J$JC3buaDc&>m0sPn%#>Mv6zK;!NhvK#10sYm#1xa zOo|sEpWalXh(ECMG)Gh_f3o!ME7=i?382pCU&be#-_ero>NBE)CsJ{h>9A8?qGDJ8 zlBllg951CSZk(EVyVqwE*}vWJX2sX(ReQpfDhQkva@1{5D|#oZp7QaTxUQ2ETgoz= zceza_vQ^wGi&1YF9xR4nfHs?ptY~$+Xwu4%4*Pu-K!Sa(P={2kL=H2jM4)WX!T^;0+mvV*7L zJa1I`+IqUBq60tr&P>(v&2En~0yIqidQ*o5u4C0C)ninSpC#vtBHOS?J9N8r!~e;p z=E4K2d3~=A8#Wt%oQ?ydtWq&`k~*D1GKes!q9$XnH=jt$=?X~{jO=|E&#%cYdo|NP zi5n%cvIuj+Y|`TylRB-CBS*C&h7_&4Ufv0;`KjCCdvmc_v5C)kJ+oPd`HpN)|GeDQ zE9bvg*E++O7H8lw{1U93jZa5@?l{;?TOXZM-MSa>*)$qQiqt*osn=00? 
zs+xUxtfHCwgnOdF-7+y0U0Q|Z9FUcY46ujvLE-gIuxC1>L;VfYhzlOytK#vpuv`_} zm;sj2TND^z)<_)dWkIW3^x}5i_k3x^5$_+P;nsHT8%LNbCdQ?tR=r7cm9ISI|JgKf zmx?O`N%4QzX1X#!tn3tF%p}=O3GK*R~YZr#wsl|Bfg4&+&dS z*b7%L@)CQ7%XcrQf8xZ7i=4-M@{fUy?+Ph|5^Os;rv z*6ls@NF<#Zi4k|a-=6-)5b#uij)SQanEHIkeo6j(BnEd%y~V%(om(nqyspVzcN|WY zrngF7pPJ^UcW{LBy_5Z58(+Yg^T%+cf)KY;!f01n>HSIm!H^)+o_tizeUeu^&BZE+ z7Op_bn30LdIB%At?dFsSbAHT>4=*|>`NtEjA3PkdiIpQ-@GI({U7V36u3vX1q;XBR z(N3g~sL+RgYC)6t-6W029QizHYyQlDr`k#7JrAh$ zct07sTtLRZ&^x71(C zoe=42ZDV?w<8M2%kZM-y!YZShRs&eEFH7$|`b!j_RRW6sb$Qc`JYF&m=Rcz-Z;1bJ zIag*#cM@P)YnuRRMtV2YwXypOH9n(8WE0qFPbyk!4lf;i)j>1|amD2_Z_1Ize+~1) zq=dC`hrB{;xJ1a{DI*An+e?rmPTcUcMMN&jhThDC0^Ye3)RK#uZ6y4*|L_ODZLs|C zq^rd0G-V0_{3m~446Bo@cDN8l1=nL)pE!&uvM0(2N;Z;|vGKzLkjw(Ii>fILfHXUe znCAB*E+znIcLioBznRDy|DIHe59QFyoxsb6KrWT@XtBAI(-WM*ES0L(^@XO`6Y-Wt z*7GPm!&7DINd^>sv^~__Gw0sd(MvpPXIMOt8QS}~ zZ4PX6SE4?h86V$I>6uD*awUGpx1I6Y{^Oa_eliB7e<^ekZuvDg|D~mRdGB!JUBbwb zCjNA8w{8?iFYZo*iIhyDNkYdDM-;X35g!9TH%H`n6g0e5owuhJh4pff z>>RHf@jKQ9Wp+#JkBQ!DGi*y@TshJn`rhk@vq2FcT!medgI}(E|L%Sxd~djFWvl2o z|N27Z*fcamoW<|^IXlps3an}4F%e+sx^W{Fvm35;8k=%xQN+z8_B^XjEEn25D%CaQ zAPfm1zZu}m3y)MYP~wK`2ul#XtF*72*ofBR9*u8AzU%*F8Uk%o9@Rjcwxg(4V;W{i zv-uXiCmZJAYWKH*!b^|uiLi!o%jW4ctlxIP1|g5Yf=y7WKusML9rv>6iQDe>T<AT!`aHS3_$$%h8`6{J4CR zu#(F0@YjsOGFY$MZxxO_ZNw>hv5w|#FVuqtJq5mWmnQmX|KuBa@M`QJ{Td1SKK3Qz z@y{}7rqKLalwB{_&sV$7pO(p;F^E)g9H{^1>u3+RyU}d;ic-L(?V+}2?Zn$VR4-JC zAWD=aV6_BsQHI}g5$SVI_nOMu?9V7`IOh<}#E9$PA(hm&c7eddC~$7szc<^@VT)ze zNIc8z@Kpsw!TwcVU&JqZa_Z zcEITM^GJ#8`9{g;w72Ex;e@nW1iZ?PP(yobIy?riZogVR%Q{mDw^?}7V0 zdWD{Ssn#F**#$d4pr?vgMhXsd{2uXOZ;3wq!) 
zvB~V78BhjCG9^v7RaFfz8vL5uGBtq7x_&hwC=63F-7WW~+y*o+rnF6AMl>&TvU#&L zYxG3l4MX1Tu)zU#@W{}17oJw675j$$bJm}DGP&ge4YPxsdIu2axp>`umr_xElWa-b{1hPC*8 zd7eVC(PVcoDHd+u(|N&~)ex(64P=d4ifg-9=Qiq>fROen3S7JA@T$#$6WCC;*i=lW zbMkoi!phk2nKN`%+Zq$MU&mcM);ZXgzjIx7tVbDMB3oXXPbnY03qCPxCRCw5I3X>Q zj@8a^OLjYsy}FPh^%e|~?7@C^xZ=7nY>lh3gO`Z3qi0Wv#!g>qL$PN+6s!|dH%1Q^Uuwj-9HDae|tVP#D)ghbL*Ios9 zLu@?NN0#q=Al(h-peUy6u-+GbrV^r||3#}TySxdIwz~mO%P;U;UOm-g+_UMaEQiFW#HDxpIjo{7=Gt9%#yhZx9H=3f{JQ4fM9fVRO_O zh-I^|NZky{C@1$#C1=XP-ds=3laFt|DZ;+|j5Gv3`9Mc~dck6Z zK~3q$ogi(!34BC(_KT^UX0?rzjqTPmzS*qUV8ki^9}Qo&RGH<)Zc~1IkLvXCh5}cS zCGghYoW}-vxX@(CFL{uWjE$0F+Z4saf-MNs`VdkL0ZH^FI(c^^eo=4EXpyDSRkhuEe$@2;qaxm_vlS>G-X(;1P3?sEtQ8xac&|-~6Z!zua zOyfspS-+9pyC@T&;2WZ-M3*U&E3_s_Jo=BzHzDHyvWha7uMdH$!q~Jw&)rUmc(NO4 zQj+za>~NPE2!CsuLm`rd89v$v9Hvpdw5|0RG`1Iboe0(doM`2?E zB!L`5y?kA~(9hi`JNP zUnpN_Xx)Sp(c=Lty`k_m(M-Z2s`)yVeI0xv4VdAU!Q%y2>qnOrgz155KBCLFW zjzq(-ze2HqdA(A~@Hi=)oCY1eS~}&@ZBg(?Vq^19FYO@)y)fvk5vH&kBK}lYia9Es zzdGr7#z)?A8;uc^R0fDX9z${LN)#OL<-deWP!t2`(E5L46UWxOAg}|-*zHc-ME9r4 zs(%5l$t{t;vbC_Y)hCrRjpQ^&cTuF>OARKo6Izy%?$9$9LtKr;FCBP?V48JE?2+2!iiZ1UgB2g!6wR)`VfgW2R`J7mu_3kv z0)#HX{RF-}^yw-2e9hrGrFkX2*8giqlF_uOEW9`Wu3`Rs`wv9e|N9?^u((R$GMcv$wuQsSvl{cA8jiyYksnr0 zOpOLmTXyxMROhS>%UoaTan#5bHuw8+ENz`v3*5=KS{g$&JEmd6Dq%AaQzov9gni(T!K2?`fN8_JR{ElxZMjq0#`eg3G87hE_$ zvb;MVmQiNvW|5@2-rjoO%(tutD~Qp zZU8St&XDTR+~@$DODVUwe>BuZmk&QQ%@O8z`hy7+S2(WV3wx~XSa-w3&+JS8vS1Ch zt;}6Un~6IfmDOjX2iz@;7SRnYi7`1!kgX<%uwgaj+RygC*(;zhoHO`Fc ziB}~_lHC}Q8#Uy$v`GgA{;70)tFk67dKTIbNXu0nJ7CafaUY`Y$caSr;LRdt5~ zbrYY{_ey}NB~^{?Hjw4l*(tHM)){IJG?Fr2P+X^H7^Nz|_2o*%n63XqXq0`jaY}E@ zO8dd|>E{R7#HGP#S$z3m({O6rv$L{(f*d{Hw6hPxq)v34d0f@VTZa|mzW4~`CB3!| zAaTDVv5ctQMJhuM>8wad0hTKomez{WT4l|*8%OXi#b0J9*5?N!;pUh^UXhbY+A=M# zoO4DQEVN1CtePETa5qFrB)t>Q8I2A9=%TY}R@L1Qr!+B#$PMl^`CR$_?}XA4iVg^_ z`9aAV%iER&TMQW!2fzoUx_SN16P(U*ncOFMSvhOsBiRyca{YYPaEko!)?txsx~=@d ze_?>8n+(23?ls49Qe4&%FxfAv9d$KQ!CXBCALW(z)1Bo0Qef7}t$;!(*s^*?E&bYH 
zP6X(JrW9tktUSKsEmK}!=)@mqS(Ma0p+@+Z9qCpL;$#<8hk$WtL1H>3N5eY}3qDaY z!NuCMjo=31iLSZ(R<|%JvmG?q#z%8$5CjI$`p?Vctj_+i2*?Vt+67kjtK{y_p)-c# zf}YN#EL2{QAgQ2FrSIW6Pv4=XixPL8BA>lT(PgrI+j4DsWFuj?6YV8*wr6nJn{wsV z!?@N0urQ2{qww+5b?j(Zhc(+g~S2}SdxKq9&h%RdmN>z z%o>0EPFRM<0Z;2-zm)8WJ{Z!g>hxQRPwOa02y~V^>?N&qrxYT}-5)R>*ldLYdT(Cb zs*wFXHJrMRwf`K7U%yvX*YrM%LithR4-wzUEA53ays*$K;y;X-$kb)*+WZXkIr218 zf^vhBDmP&~@zU*kn!mw?dQ1nU zz5bpM{S_cpM@K|P4imejn86Df%&`N4&&zDdLu@s0JM}OnX5(bZR+f82! z&?{L43-~dMAh@uJIi)Kf6nDqugbiXPN|bX@ES@l24`HjZExbs)n}C5XEI2{J-iiM#@} zMOwk7-;x8sHAkW5fmuz_z4tLFap%uKqf;h(-Kd~q^e_anS$i3j$|OhyR2bfSo)`K= zHK*GJh|SF!kjzAqDqYTL74LT?Q#~<_gH)1>1&`e+%X%VZlAE>P@FlL6G zVW{Mi>|=-S{ss|Beoal0-Y!VUL=8V=@0O%`tD=`!1`;f26k)Ph=2|G~*}6E4a8V_n`KoXxo5%?>jToGP2`{iQne z>!7@Qa2u+Ct-6wrJ$M=tvsrqWaZb6xOus>(gr+^@g6#n`X{L*dv?1+F zeI~+y9Q54bQsq9oDTr(3=Gwedi|51=<(bJSTZicaOeNMW4iSg`T{EqISl>vOl9HI& zs!K1^Q??Ocj2qFqvjeX3jMGU-9WmXv<-siO3JvqeY33f(>2$!^myzNfK0CY^k^DF)fT-$MNnXFaa5>75=cuR}K52P19K5e%NT z_8;*+o(jUZsDf{5r;C}WbL{>s*7RCpo-n%(n=bIiuQ1}7!T-JF#ek%@ z!DgnL1^LXH7vs*V8Qs=TnK82!Tf2#KO2+il4qTg+$rp3?n2#}34ll2@1YXLIFn6pe*q>pmSpmk{r)2Fyj!uX;JVM9crT6f zpA||!WbI<0F=exs=+cfinf2E1Ni;*adp-wUqMcOq)?M1MC9`rc|%Rm`#%B@}*XWT0%%06&HT=s|L>o<2jYZGI?>aqAndoa~-|acS1T( z*`R;B37y(;E8NgAE$wDQict1n^J6-p-2?rjrtsUooC7e}q1-7d(Lu)ml0|1WLFr74@L*If8scB_t*Z9$PdF@ zTvj^n|Bk7b3u0wDzLnU@5F%}@>!CG*)&}ZiXL75yHO;1*{zun81ha%~P}MLEir-yW z{PY|&-O6y_t;zMYjYJ0F;aF30bZ`S8g^(vjN=(P-w16$GA#lK~oV41>f7i}OX=|-kX{wNv0 z?axNX)FTh1%_?-&Ig@#bCwVrjg4*xwFleox`puuzp5QGy-r!`LeZ#||Gvuiu^y4x! 
z>`!hMjaIkWx1XNS?wUQd^!}>G&0ortN5_f@o0g_%_9O2!K%TM1HNJ?VV@80U9d;Ee zUiOuNJTE*@b+>9xl%-X%(H@yIQ;OPI@xa+vVBR~Tm(=CqRO9Ph0kK1KEz8M5gaglf zUi9!5m9MCMLm@;>-IhUxJ=$(y6@EN83(OiDoB=5z+w%IiQ-hTbmxDwBxXafE0>K2+ zKhR!okCH^el?PSt+{jrD;h>dpM1JkuaFoEl0y6-+2LmfqT2beoH3J!?n-IKY7Ip%E z6N{u<9tY;hLEe!4K%(`J+Mhz`pO|=W@4GEd9PHY8r2c0Hl%asEBoiD)y3+qBDKd0&)(%CS8YP67tVfkee|e0VznH*aPW1j1cBc6RD({BYb+WX>M*R7o z0pLRZ#jZhiwfVsfaXhtY4xi?xv3BSp|3{uOypXGYD^~og>iNsDtQh_q3OV}lU7Ui6 zB%bp&$2_d7_8oZt3I5sPROv;uP_1sbtk|n-Mk9*SpUmKoVZTAlD1?`^OO4u4cU+>t zj^}q2=Rv0qlu;PdmYxO$-i3I)@%n^>#_uU-cCW!@{nS5W2&Y*ZLiM_5)?Ab6IrN(2 z6?;#gi@3X43_)Fq3=Xa5^t%nWrU-<}f$$rgdm8hop@0yjkn7h87BfdvN!6S;T*^%w zqko^xr=`os4T{{&>gIKS5}aq;ay&`#oduk^FES`1v1$c1w6U&2whaB+< z;z_TSIAZv`tdRY}@GqoHI1$g1zQ>l7Gn`x&2#VYob`c2Z(5+c) z@btLR101+^d3@j&WQ1G6XM%$NIYyd0?Iq;&aOiy;Ue_n5~`?aeCG)5 zqymYuNuYWy^&qdmp$GuyScgH6Rw+8E8e~T3;Ifv2?@$9!vuuuZ!<|BX?82HyC3I;a zPfRS#>#{zGVSX7w)Fq7Y(mFBY{@R;A~?4T<12q^E* zrsNa^OBWBttbpDLpT~ZzNbGn*uJC*FM{VC)yF4h*-=zB4=J8)1=x^^`z6MD4<0#ys zF5Z$K^ca&m`p`Fytw|DhD7pKy-p%11Cx}r+Ha=U5DoJ+yTQ_$!&E!I=JZpUciq(cd zdI+cnzKD6NSeKZTiNu$aRHj9=^6lo{fcvB=h`)o}f2SGL9Pa$ii%MdF*IG7u@~a~Z z%=-fYqTCx_^&I_yF2Y`X@ZCe|nfBI?=o1e(*!h}B+bdy0a!0P}<_REU{uD#t&5jX- z9Z>ea#k#_%@T9?Q+N~7{QBP%>O5p!8nR9h-%M02&ccylRB7Cy4I#E$~U~wtd{MMM{ zEf##ACoF&4LSjt4i+W)&xF(bl!h7|ADA^?c&2ve38r|C7FQd2hBA$F{Djr%ke=tmJ zAlbM7AglB6Z%5=~^E=J%wavKO(A3@a^p(kz-3OS#L6l0H$M+iCe>H?R4E)VVYfOoO z4~7wf-rbVdO{!~xJ0kyu58@Hdd6N3wyRNR~=J*u~e8KCRaB^sut2s3EuQKt^(X&fJ z&`Z&lbU-5ztA_%zL|2VoMhFbQf365GG=oUbwR;iq>sSuO{PVD%8?oe?b+$=Xl{|d# zXG;g_-H^AVKx4|KOC2ne{+%rAI_BxSFDsPqCksE&EG(+Y#;{2hLXN59JtyT~IdPzC zF3u|}omQE?h}t1r@wjbE zE!?xuk6Tt+FEA-tktDUTnt6>ug;DXJqu!mRWx^3fl4-VJ<(VUnh#D3E~l9(>7 zJEqaBS~QVuhy3-+?RJ?2jH*wq`$ME+x|f?am?EfR34W7_VWH!Di^Gh0@T*=?W^GnU zNRtX3E+Kw4FWF8=H04M1Ius;Eg)yX%s0|*Xmp~-~f-AOP_aO9Sk@r`(+kR#pJ*@e& zDoUe&6CGt1$3U_ZldKc8pA*y&?~f=KUp0$d>&U&$;D%39GqRtLl!Y7A$stnTX>KbG zcw%}~Ds%HVq2psq#f1@u*@OmhWN$)l>3C!4&6=w$!ZRpuqm&keH0Qa2DYVjC0;uGc 
zGr|SOV2t)TL8x_X`SdR_6Fp(v|5f4hNm~diHXbvf6ilPQlTMTiox!e>SA`TJ1(^R= zWJU2fzCHUt9Ao1@9787s#}p?>HF%V}GYr4j4a^!bB1&W@K&R9nA`{V2Ri{Kzhk56B zbhqA|_pcQgS{P0I{~eP!C$@u`B|<(ocfbjr&S&Jc3 zXGbtHE`UYHTI8ZN5zZJUs*3kCzUBCN@a&kXQxS-nk$6UOM&L8B0FG$bZ&*EsVt)*T zT~CK3AaT7vh%^y*hxhbjlqnOg7DF9x^*`|vNN1$5TbNH1Sijt?5d374N8(GV9N)RQ zpuK*7WW}FiLNA#~_|^JNg~LaUJ3D>85TMtDUyoF>Kd8wLV?;8i>kj9e)0BU!&2iIb zUFRlegY@?&>Vl8(FT!J5=Vgcj>pjjwBm z+HT>MLDw0*=PH~F07I!7S0~ou7+gzkiwK$HH*Ck-ElZWbUzpR8LD2sSKmRD}F>1P8 zv3Wrd-Pi7FH7}U>zzGk2Y_<0LVy95ErF3DFxdyY2VMX#TK(oV8H826Z1QJF;+oGxo ze&V==|4lpvi+6rbV<=h`s?;4bSAwR6k*{!ZIeX*JR3U`k1;o3FzhdDQQ!362&MAF> zF4gc4i=T1x*m>DQx8XZvpPH#fAR`7kjSDH63$jW(?hm3h;xvMmnaa%~hAO6!Y@rHM zq(_k6I?xt_AS>u*4xGwZ9iG&OCU}R~qGNnkmw)I&55at18j_a>V%(@&EcWTcd2;zO zb2>JvMP57_QH(qpQ4Ha?64cYe(eaU$YPqx7pj1(e*ESAU!cRQe&I&3rweN#5hUs|b z>2bo>mXP17@Mge`PepB)cz(DIsD~|vBE-$LMmb2s7k6<4J?B>()EbN_HFbC~D=f*~ zV*7JY>5{`kyOZ{vn}~$-Kt9+b--+!?$uzB@o)SUTI>OmAY}!2C9sub0t-sX1H(`Of z{|Eb&Faxm^?dw6YnZXlUVEQl4PcH^?S`1roeICz{Eo~ z*MmgIq9e;JkDu!rG9rIB?o*Q}g@8GjffGTNjm(8PtHdl~bFuWzm3yq*Kd2;&syz~? 
z>E_Q^^_T@M4yMM!@Doo-@Zzmv?6_q4(2|r{BJFaV)Rx0o(HhJezPMD%2)HaFGW%_L z!AEPF2&<2`4!a^JoJtvxvW|x{5S^)pM4nG{c+sf;Kv^J~PTz(lwq}Z-@X@`GV(p*Q zQ6X_`mT8mae&or)s2$%7B*^c>?U4F%grl-o&48i;J6^P9FuaOMIB)Ob-UOkrt-;j- zouL;qs`;z)q2A@wtqp<8rE=VgUWFYdy9q_Yp|T3IkdXGLVf5kR?fivuLR0uF)*zfR zO}o5h4@0{N2%|X_2rjmzMZNoU`Rhdm%0#Gf{d7NA@L~ot^<7(L2RpC|F{YXJ%$ZyN zI$VS7{Woo4pEKdo8$yNs;`ZYx!OU?hMdxL+lA?Y+D=rOD*E1qpX27WjhFz_CH&gZH z>u@7cspSaUhWn~p7ed}{KfW1zh(+mF%xgMUSd-W<*%6-0J#UyqR`_*hJAAug%+-aK zNhs2*Lxb-gcX64fs3*>ub=_h(`=u5-@WPFsaG>%j18ZK|>vu)#fD|@hlO0!Ly{a3Q z(B%oy-Gey{d{tVTg2)DN%pj>&_|_CzhvWLCLT-(lsZ|+ zVtBAXps^aIq7ZqIzo>pH+hs`VM8DpI*3!DC@-;KHK(NV8=RBlzo5s2scp(Vaout*F z`?xRa?uS_BAMv5cq8cV%zPu}`e|El=t>i--I6{7)0xW#09PX=9;pMzte4=^UDafUx zg2qQ%DoUr$?9HqE28Df#5|dGxWxr?j7jJ>{8D~9$#|Qs*5t3%bO7kQos~#m43P5N7 zXW$>eN-8!pNu6Jc2{e9AQ^OigF!GL>H=IvTx#)&h!#`ISWj~q(vK@(Nk};V~q3nXX z`cuUl=2q6j{6A<(*@jUK7H?dMJid}v6S*q@%o2pay|8l$A_tv)jp2=b4?`wTZ7GkU zqT=9uCHBR`SE`hC>+{m{?Z9!b;MaW!#@`bqqB?F<84dOonNF8_&4nPx z6tqsLzH)e(G!!m)s{gO9uMUgy`QD~Wx)JH_4v|>8yF^M_VwX-8SW4-ZbfsIGC8Sfj zL|VFALUQ@7@u~0izWd)^GjpCZ=b63c%ze*&Hh8dN6?;(z^m)?O-ZgmxKjRQf({}lp zADX^;lA3mc|1*oqvmxUoO{dMOJPz6m&!$7b=KRa7T#vF4$`2#0e7b3ZXA!tLA4JKX z!x*$L_T zjL^lv050>jK~-udeyL;5#{^~Fn_wmVpM25N4-0z$(_72eTq^FGM3bm+QJ0-77&s)t z4q5r71+p*QH~~2Xh%4XgqU+zD89mV!_GbGo+PJ-Wr0zc~Ym*f9Dq|4|E@3?PzPVEh z4os_=HLS0uIgsPN8O&ez)bk~z6c(8Jo>5a}^&qf<{8P7NbZEsXF$7`GxdV9dG_?2} zn}CBTbF%Gyi0Y#^C4-UK>a)Rumag9qx*XEN8$bHz4dYC%B$b$c z#eLnW8Vtka-0NWkY20F^%-Hd`Am_Oa*=Nb?@hNE6ZHo6zA@Q%KxqFW#KGiz!vyRY> zAFbL`xW*GDAuEk~>@6BY67dN$a~pZJ*<~D3o=g)lkCc_qN8Y6r$_jCr&j@)j!34D$6kK|e$gO{n%4#WM06Uck^j)UvEjgYB=Ws#J-6f zCj~Stv(uc_HO@tteLcr zNan(gw*d1)&^Pfa#-ylB5c=uUip}v~#M?>%eVE}8V|L_Y={k{9^W@Tk3ZK|D=A4`npd9`zIt)>*&iZTa*iFCkfz>bsC>-W~VQE_Gx$hB}*a5>PLTZLbF<4xwZMc zKs>k2a>KzaMj|j|3DmAtWgjQ@hNUQA;VJ!Yb;u5-aONED+Jhq=NkztU#P@s#8+B2D z;w^zBzw|3+su6KARm@cx@vF5*UHv@o2=-@-ov(#GRR^Q&cU@sP5FLSQtEe@04EZ;r z@;m8V5ZNAqD2s^rlS4~T#QM5=>M`GTPJb^Izx*a|V8g4`3CoV>MZF%K54zG%GJ`5l 
z2VJ{zVqO_3%HCEy4dE*o;}=KO4G2%{4w4(U#j(9~f)v$!cFBohqUuF>CEPLJSwZ3G zZgb96{i2K1wWZ8sR+;6-0c>MVqUgnK9zv&QM0jr2e&HK_FzNIqQ6Ft2J&Utr&*^;2 zRTcljzPYDgd~-$}7;}Bo$4cj}=cOGzYa%tlTpDVSA2qI=!&Ul0{3TF2G<&b~f9%fU#XY z_b0c?#_zA^>2W6ECdmC*WaNRwW#)Pd>xMs-H`YVJ>mwJ(jK-0n?%Dc9*oiC(>{5EIjbSrDyivY!b6e;%5FPwK9QV4tkfdQ{V={U)FdFt| z1EjV>l+Br~_BK02x|Q}DOowjNBR7feHf{R-!=wAiU0dZP`-lwKDD}cV5z-Oa+R7D! z{oRRXdw5Gbt#Dl72l?SL!||~m-s~4iC9hr|6THZ(t!BB#|0gINj1DHIm?HaQCHZ>#CMkfYkfG|My z#4AbB?t6vJd=an!n7X4kMc1S=Fx_yM+BE~MqlA>jVyQ$)W7gKk34arqysl^3p6U21bJc6#Pl5(-JgD?1eQLnxR?t-9UM1#UvEs~ zk`spo!JeM;b1(Q7ZKdGYDz8DD-6LH)?;m|j<<6<+o2k0m7v0vKr=KR8Wc+sPT8yla zK}I5V=)xI-c#-K4F*>~0ZY5`^)H2N_B(N4O8=Z3x3{q^hucY;=>ksM|jE8K=8s0X_xFa%no*e0E;Gi+iSb2TLg2GI`r~Q zoek;@eRl*@%p%`8N!04NDxyq@mGHSWTM89;R+h2gC1#C9h!`Fyw+y4xQaR5XM6bw| z_DRc>vAiz|ba|P} z)lv!)>D%>lSs_(i!aIsw>y78)xKQ~hD64_t?M3&J_Ci1T!39dQ)szy-#_*2Q68pMB z%{PZx4Ju>>q4dRdbHr=(QW)Dk&GD_^6zFJ)wUN}qNx32#2E7gWno89VO*jy_r_{f{ zOdvspB>Ajys4SVsN>9C6dV1AQ1`qCEOzThY(2>FDy466u`Pb&9t=fm`6u=cC^6s@q zIM|l~RFx?@x@}xXvsy&<56kko^_z7X!?Q=AmL8AQoAZMIZ3i*DJvS84R}w$dbulqo za^$h=Bw`1-%HK^)n=`?fKhzXSNSMc+;-5bG;>f4c%4KzdqqW5D`L29|``>yj0R1rr z{g`j+0cTgQa&F1nj|1eQPd(MQaI4>uJ4YsJWJVjBJ4@vSxCp?whZ|Jw{%10ZIN*^k zC(3x|%C2>@AuU;XaeM67N06q{%~~LpJV> zwKgsHi){pk5i!^=ITrILT-I?0NDuN@FU`rrgq~mgy$nhb?V|D!zN;mMCVQh>qi>ME zpozNR<<4f8!=Ahr0s;xHvYfPDZEs#8geu=D}dDnTyQx4QN zRNg$%vpsoaNWHu?{NX3f+sDhoV^pQ_al=_8F{PxwgljfN^7KueIGD7zh(|>q1S### zA?@8~?C3YyJf1rhgHT~Z9(<$_Cq+(j41I$6no1*VO*keU>DFgm@J!shNpWnFoN9zY zBR~GP_6=;nFy_lRuPK>Nx*Bi?@s^s815KZ*fbWbPQq6Bmrcj|2C(q1uw-N(@yUeE4 zAQZv7<&8c88RXmL&6UNxGLptO@nT-gprbMA(;+2V8wpT_l8mo`d}8H~f$aNzsm=_+ zrpsCxp%N0g^Y^ugbW1F`0c$ss&*Hz{y`9kVK<(KRUirG}?c%tjVzLz~o&O^^ZX}^F zShzgS$en1*{}#6jPWoME1{dvc)~brZ*C~8`Ax8lERH3F^mp2&;a@#z(V+B`=5kx1W ztDBn3R*?D@mrxg>j95bbPs@A-t1eU?y~}P%^=|T&IzBC=Qkuq?YJPZ{z$ooSJM|u4v-($`l%}eRhNYqePRz4A zmhk*34P_J0DZZzCx7AMnFN^8mx>(KIY$g(CScQHwIU=sn(i9{z72073d#ZPI9kb&5 zg)ob* zcj*e=a`}?n+59I`WF=MEar?ZUDm=8X-#VGfhMZ@=1Ui^oMbw!dUzm;nulaaZeGiba 
zl~WONAL*GGWv11J+t^477L@joi1~7TPe)}{neO7+^$Z!k=-LoTVV`C{MTK*iIx$~) z{^2l_$dpOCl8Iu*t2$KDV_+u8eU{jU-AQ}#cYVhOa~~VhZ;-+$%8R{-%}>at>EYB~ z_2MJ47banf%OHQLkVi&?58mi;f~eL!5zYOh=#V3P56*SI;j1{=a`jEtgcYAZyw@xW zrD@_eiR`|47x$iLnfZik0O-e5P&h@G3<}03UVzq*deq}OyS2nFH=X;RFENX(qLRnn z8_N|i!^qn0D8DR4jv}f0oHz0Ii(AdNL^?R{-6#17g>v2Z<2_7+=|2bz7gt{|t_l z0gTrwTh|5v>h%Y+Ft_waR~;PF2g-7-C0oW{pY~zQp3(_kw|ow8m!jZ%7){d@N}LN@ zKwL0uCiH|ctWfOW++8PMUNv$pE1%iwzC!T~UR!Ci$nFFYReQjtGd`GR;ST6D8Z*N7f z8pKl)_y3j>_0IQ$Ry5+%-zfD^OvVKe+mO9}Tx=8x=>k^+?J~n{v2s<-lodW9wXm(w zoWU2}n0t<1nf_kX#x|cXmPtf-s7|>5HcUmlB6Sq(@5sz9$6hu~eM**H8h~3uHY#@! z`Dn?OCwTfy>&xDNYl`QDXC>~ro`sADO0!&gm23NBvvsbRL_epi$!%<0Z+1xOsEY@^U# z+16jeWq2{wv6CTamgo4h!66KUBpqD%qadxk{6L=Vu$^ah_mkxLKW{rMeRNZN4ah*n&;kBKi3S165_>Ly9+|GR8LUI%dUz4)F(s9` zFi1oP9jz1Yb8@Khj8IzBYBR^1tDBv%vYr-k z5t1+hkPomujDpNN+|Bmi3<+TFk z`RuvTx(8ATRLO=n+FZK30F);7oI&G;#v4I2?EvWg{_Aat47DFn-_M#6(n?mFqeI?l z#b3-7A5#DYtZ<2`QL3kf-aWF*VKk40DYRJf+slN?JN*mge<>I*nb9d+&H5gqoF*of z#+TSauvqe`SCE32spmGD2qiJ2lGWp?onwHEWM02%7&fi_I^Ny3CYa4!7e?w&yp{WK zjgQrnr0FYZ?mKL!@$%$_?S`MC6SSRr{Samz5_`qR@5zI|y(}VRuq8CfX4NzU=j1DD zsqBasUgN5}9(sD*&{GgtCvXft?q`*XZl>U1YY9L^gm3<@z@e>bLwb6LpP0=H)pDFt zRKq9MotJNa%(NP;s(e+_tN0u;AEwVNafK#9;i)F~nL=N)*`Naaq^2bI%i{aOFmm}zC*Uy1>mZk=NJuWy;|Rugd+1$ zMXMM{Laob6BFl4sq1VL85p(peXu_V}{bW-*AB=vY*Gkd5=F(?g7uNx1d%nA*Y@Kak3?7hXLNmj;FHu8zi#I!H&k`az! 
z(dqEgin%;}b?&oK@4`!y-U?%E#<_aREO4emt!7y*nnD}%Bvd0yInj%*wSD-JrzZrm z1lHmBb$Ws!f9lYuh|w^9$0j68^TIt`>(&a*mF%B`MZaL|4r4O~V zl!oES36d-n%w5s1|513f>mJQa(T025E|+ZZMW-Y{`5#MptXQ)>cOgeY8>41r<}!A@ z%KD1suG<>Td0qCNHRBVpukVKV?s&#+(;ND!dVFh+s5<{egY|^|uhh3#)AhRPpfVz1 zjB(P4ni<}=dqQBKA2b^_l-B6R;PAt>{ea?&`4&ocfJbHjBpgz{U>GZ(*A#cuHp+v3 zNM!iXQygr)kJBsU5;UPfa_3y~P`MaNKyj7y>+J9Y5Bv_p?2smz-!2@$0-*Icmn@-R zTHU;^k8Am+rj=r2F|flZqQjc3Mm)qe9D%xQK_!^op5~0t84{aLduGOuhN^rN2PXC> z*zUyvPbt4F&WVdIx05VXsBjRhPzP*G)PKS8TG1WLV0nj~r4a^FOz5m!z4TkBTI;8w zlgiqUd+UxgJ)Lja-izsgG<<7#0#8dha9LZgj$-7cAaY2L+Uwe05bVp|oXD9BKj?c; z!9zLoOei~Isik0SwR+zXWeB&kPKX`ByJ$6@?A-8@N?_9Su?=yr8U$R)BO0BmDuzeV z5dj!=7J05j+c9Rx;5;{o)y3_`nwmYa{u>*`^6t=^)}4E4U++WIOZLs&31AK`MAl8p ze5+E<`1boo-<#|ou_rF0bDDKYL@GnO`5dftB4484*P4*@yE22?$RB_<42Bz+$VDH| z9OrZpm8Q_0_MA6R*Q9_5To|Ox&DCD-1{;ovum>l&FrbAublNgTk$KkB)abB{(^{_( z=Fw;d4JJ%~j4?iY#7h#~I<`hA618dcJlp635$$^trRo}uPz2$3hV2$Dl{zlhc&}gE zl1m%DOQWiL8-GCDslEra>qhFcd|YikAsn7~GsqU`nfqcg_fxM*`;J-yy?^pIWJuKf zQm3^l?GDWi=E-0fv)`M(M~VP5?y>{R8pn;D+~+;U0CW<+8} zG4U-4Jz>O3w0giv-&(KU@5IM%z{t*KKkWpL={q_%742eMTqm_&>W~%^2X@(7yw8#4 zo$K^jAY(QjHr}=Cc1s0TU0(Uy` ze7Pc07Xcffe<-~&iNh2;zuTwN>Wj|sG@34cLX9#!#QV)uE5<>%(4g);q~&Cxo9uYO z*(_^wQj~29ir##d`0ID4K}(wkWC0{d!<7X=f$$ozM->NgW!B7iWJuJjnFpHl1;Umx zd3!{um&RQuh0KcZ!j!-9r?2#5Pi<=Jc0T+3IraRj0uF!ye%F2x1OMu05ug9u#DBr_ zv&Bqlz~k5H``@5Z^v_zR41o30{{ldPa#<}uTVB$!P5ycIPbBX**d!UPbHa=?_gB-B9!M^NT7{{1y%niq&NPkspB3 zF%U&WM9#Lyk)@@jx995ikmasgSoGl*yBnIC9;T(G6_u7cTB-0i{W~aNbRtALr)JMR z(t&{id;$V6Xt7Bm6}1svZ!q_M4r6z!)Q~B6Q32b=4jU6dLq#A^oF#Goj#AFB$Ch@6}V0~{Iqt)>&Kz# zhEGPe>>KtB#P#t2t}rcyf32NmIzYS6*%Fbac=oYEkYa*@Jp9EUj1hL92(PTH9KJ59!3w$jdtOrIlFl*^ku#mEy{06o^PD82Ge+z;*~K^qN_7hFg5YKK&iX z+~Ol6jG`!n-_4b}U@oaMV(0@rU-GaGv58JMo8P10?Kl`dcAvnqQ29TG5Wr$conBPl zrG#*+zs3vF(mIK_e1|c}m)ei*|V_v#) z?r(SS@u9;RIsh;ujQsc9QG!rH z4rreOF@Hwzmraa$+rPU80^$NIsZdI}{_)LX9{caE_!xxv(BS8FOn-*r(;lPJHm>5o TFgwCVfPa+bHRQ@=o(KIOucDNs diff --git a/jupyter_home.png b/jupyter_home.png deleted file mode 100644 index 
770b1aaee338b8935056dc371055289bd8a7becd..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 59843 zcmd41V|Zpww=Npn?iihn;wWdHymCh%c2dPJD#`II)Z#ky~$d@?hn1o_uU=j>P2?7PfgcbOM z;tfY-!%+B36oVw);7czr-#b5#KDjTwGrcB{DmxxlvpSYmJ+_=^0I$6MW1g!E{)6}W`swfL#Z@A>)t;#jyX96 zHUNDo5q}>dp5lgVdjMpwYzPeN0qD9TviTUQ4x?%F(h_|mfB-=23hZ#9JwkI{>zQ3+ zZ(L-DSOf)-5h9H30LHO!?JuECFzR){f(w+~EAK;G9;BufMIbLnf{bG8o z6U8k&^GdHGgf)acLlZA8*Uzjccgr)I9mFhKfwrv|hl?Ec;Z#6lU)DAMe2bsDhE#In zJ0H&KT{V=U?HlMbA}Oyy-&Q^bt@m}$wjSgV`2Y_KyUxGAx?;ZfBykGdTYen7KA&O{ zV1zQxcbMX71mk2DH$v!mIDjY+HM7ib4eq;a)tKVp-14yR+n!NP0zqhB5uRyW9(R}f z<@g+eO0@oJ9MTu6HcpPN*pb#*o?)1uxq}CXm(l9LfvXNSiP^BRrbW_PUF>=#2?SC7 zSPZ6d1H(|gENAefVC+4V+4`XU-tD{y=zS?d-?+X5PZJoRwt8VJkmpA>4cO|79ze7J z3j)luy~d@&5bU{XHUs{h7i_Z|S@bRO?PRBV11W&hikI;z0$&Ee!ua4y&_`dfd5fi|EAZHv`G(<0xN>rGekEf523 zlERe7{5yHxV3W`WSxSe^X8b_04{dro>#_9(= zT>9$E8{qVv6)MXAdkom*PfJ@wl%M*tVAel5z6gCqw&`5K+AL; zDdwAJJw-N01fwM$hq>)Rs-`ao*65zBMwbjS*imE!r~XN?KK|h6Kgu@R3q!^{(CbUvS4SPTFFU zD19JZD?MCuT@y@`Koh-Lz2&eurG=^Gy}7O#tHs0(!}a0j>BjJy@!E6WXm?>3c6;K$ zYA@qxZ=Ycw`lzjUX`-o7!Zm55@<~gnP52q|ZS+n1P3?{9?G}g=$QVcjlpI7DgbXAB z)C*(^+8Kfx8WNffA_JuqF%{7ml^H1ttrEeTh#zGJf$h8ecY6FVR3)@~bSi{%RDn>k zkmrzt(EE_Qe$)Z40qFi$X9MT5-*Bs1^&YA}#rY)&`E&VqGrrg5ml(DiM((4N0g{=N zz?8t1=*&XR0?rc8Rv8@|MHuZGof>tHg^je19ge^aBOcrxM2#$rdi_FAdW`S-{weRr z*CV-;&mZ0!lp9B=RwHkxEXF&cM5be=nj*I%{~UjVj2aQvI7*iQmVl}xv_QMSQL3}> zeg18svTWiv;^tmUk>OOp3$;_+3+nxKN4tmCqxL38Wls^`?2lR3s*$Rb9JNxm3gXg( zA}4{f@Wy&(wb#yv+Ne_L2jvnpFN2F#sU#J zp?X0()rd*4ZE`I$?KN?B5q4>i389fALcbmh^$QK>+!^9iYSao>upIbq@ozDqb@dEm z&FIX@EZv50{7@H1XUV5d{Tm|g<<8)+z|#Dgm#h*k8?5H6bk;(b1J@ds=a$=+lh$ZB zj@i>4FkIs8BkVj6`!Dv-6we|Y8eOQJwAYK8Xgs#!!+%P|jZJ&+yazw>aCLj*Z+G|k zGlGkOU$*LN;%HX2RsNzB70MT4A!_n$a&y6XyuHZcWaO&lH0E01nsSr&DDlX+O}$or zoV?4tpSc>jPPtvU?l^v(n3}GdwprQg{Y?Ed_7&h&!sni5s-_O%9_A^gO6GX7wbco6 z6_p=;B3KYu2|gSl*`JmD;pk&(g+X+qR79cq{Y~(xEu;g{4UvcFoahp_kOSMXnL?bB zoYZOuO7Q+2ndgw?;fb}wM>V<>A#Vkj$I zN7PodSY}^5LL!}2&o-AZlsz<^(Nq51P>-H{g{6f9kx-?pGt 
zwD?C6h6c9WTTXvx{$Ae%(SozrMXu{}Nm4miS6|Yi&%J`;j%Z!J@ zqp?+i;6VcUVQIa_w(?U&bd#11{03dioW07k^#NCp$MfU2iy~!#1-XS=CT^y^2I&Sb zr$HyGd-MB8JW=lE#qOn;$Cwj6ccL7+8oN5ORWF7c$2UpUSH@?95gJDwPFz|{S7uZ0Wfnu0JNsoL*SCV9u@n`+s?JEki1Lmo4}Z5`?BX1$41;XS zcglBG3(dPveJ-t+t($A@K3qH8*JkvN_Fm-Iz}Etkj8l%!5x*t-%0DbGhrIBLxC1@a z(Kzk6kQ4wThBJz{T8?K`wFUF0S|PgWraLQ-Nr}i9uci zRB`1=Zi99Ox`JQ>S%l+-jB%qPp+cP5^&J(tgvpezlf}~)?S8oigG}{TBa_C}N+*8b zk}{J%7sq11f4I9#nL62;qBf??&=}VoSD)a^G8x_Q>ey zGE*W8DhoR2sI{0C*XrSt^{r}+e}lon$kSZr$}e*H>a|h$~v2U3SQlPHtj$9n)0^PI(a@~I}z$d)o=KBP3;@J z1neCg);`~+sp9b{p{KHK40fxdSe@EI0%`e@?a>n$A@>!7*s_Br5O7vw?0s1Xm3otxB^oPs1b1)a3H(P;n(#Y)}TQq zl9tDn|FV*|l36KOC1n?6dtqH+Kf9Q-o!Pqd#J|yPJfo5i*C_I68}gd+tRmWk~!$%__zl=@N?rk>A5D@7oawrkjB z{BSjSI}iBXKaBkPD?;X!B&{$;KDtPY>;CC(IfdpV+$4zsZR&kEed;BhE9v&&Z3?;` zUFqiMjRbTnjTFkHc4q5M4cv?1WYnXFrE4YuW9eLYg*K-mXCYgaQE1XHk!YnBoYw`Wc6IV{;MrcC?H*dO%Q3nQ_~OWHP$!Yd+2gQuIB*e;%=*SMb_eS&9(J@Yy6 zsdI{RzzhR*L)#yEH{rKcyiFO;hhp-V~k@iKBw=Vjl1nPZVfIO z<_3mqpS#(x^nTj8oO+b$QD`EMy16y-%WCUN?eKvp15i1?YGcUbOr@wK1f=Q&U&AU> zI@1Y$(FKj9t)vQND8)yHT4i_z~ymc_LtlSZ=y(^#mm`^51E;ERdMB&$3Xi6)um zs1`TxSaUr<~Qc*xTJyLXn554&WA88i}AXC#aNyn&WyHF4UYFc$6{C zGI`ZIHUv7FI{96|cbL5s_=5+r31<%e9NZ&mDMT}7ty8`QfhLkr9*b6vTufo+ZZvlm zY@>TkgA>7t%t%h(WKTMW7QXmQc!!7Y$C>-HaWS`# zbyRc$|CRjp@T;39Io7{1xgzTnMmeCfP`S-!$ekzJ9<;aR*a_h7TgcV@)8DTYmE~_o;d6> z?OB40r(D`~cl?5WW>qycy*nKm5f3p0l1je0W87Lq+VR zjk|#b{W#;yQh`jqJ8;9u$3)zKnS#7;1FHrf6Ud1okH8D$%!zU@!%1c1qu_~S9r&?_ z8oIp~D<$JG`{(V@yJAF&l;r}?3>>RqxJN+7#Wv^@l#?*St^MrVj<7Saqq+8c0`SBE zjq3HwjnGff6>jENEo3aB%;Q&Rk!wH#4JGTRnQxqj6)pV52W<*JBR$JDb3c7H$73jK zTx_gpWNB1uoT>|F(o?%%4OO#Pn^HGXP3kagD`|^n-(nZ;@VW0co;Gn*Baw+W_Nl&I zsoN^+TOAtG{6o%B+fw$e=RyLhCXQO3d^TeCmVU~pYOG-R^#=xpI5{1e6}gY{H-&SR zv-P>k&Bi8yJPijy3tf!O%S!?wdU2CwZy9oi{7AKP+ENb*f>ubr%0JirU*S)FVfF6!l@Sud zZAEQ5Btx47aZEqS)rf2Il{4GbRyFto}JPsqHgdb0V9gQf~dPj=HjP) z^K*>Sj2rbY^%IUBC#lCy#~j}*F@$5s9TUhh)FPh0jS(mtFPbw`G{XOu!xoG}l9rkj ztAw#Oz%|d5+ideRaeltBfUbt}MDqOMsJtv8sG%qup^e>zMdX*1pO~9e_^W5Ee0Xvg 
zcFcvGQ{FRwzT{2)pbhgL*UYJ`qZ3mv)sPC$N(2X33zf^w%Lo{_&tJG^P(_km@@RsM z@{aO7mV;{Su-&RrEAibZm}JOi66$qVGZSx{8{=?wZ}9TsAokpCt8C3<)?^}jMTfeZ znU?Ta-_~JRgkGZ9TUGF>Atbi1Ue0*tL_~WKVqg$EjrAgfH*GluB{4#ZEhUIM!83@c zgNcZHj!xsKeHFBUvjHf9e}>{XcZe9V0DB2Hu}z6~1Ea#SYr(Isurqu+NF|u`H zKXJBk1@64arPZ+Y)h$qbiXgQEwrOwGy>>%xD-CaAL!;m+_}ket;Mv8Wac1+ z{*l{)&EGb8%X0E&?vI@!&`3}jB2Wi#hoW{LcEhe*J!!pt`2%x(eF>is!N{D*@MKG5 zxd_#fcw~p>u4WkLTBoUJG76HieI^n5MGYwpW{utZum^f4W+&W(CL*=c1R{RLIz(E; zw$C-4p1WwuU{PeNQR-=U|Apj4y3vLn)5^=+RM$$k+ck=w<&4uU$lqKjPPtC?? zggf>!CiO?9olK~Gy<6Dq z_aX<5$Jef|BOkAST`9gXnv?7hz0B3L(CFNC2~VJVa=dfB1z$uT1fGQ-hF2wdCMG5l zC7SrOzWr94Q7cg;t_&)Fs3I#rsPxicQKwPYS*zY?eun-ycsd&-sW)qK^BVii(YCK$ zTLKp>fCVtNbhr&h0BCjqur#!^u0*x6wniZu8N>oe1$O>eTj{hg1swKi3uGSv+xvLO z8}k9&t)0+5b~dX0x$LF*+G-j6< z6#kd}Ys5up>gZ_8PDA73;zI4hKy71hLPN*K#zsR+PeV^n^+iGD;AZWp=SpSmK=fD0 zf8+=nIT+ZR**copSmXbZtEX?{+gt5z-xf)rj37T0MSv!20#?3@e&-ss<|83^K4gDXS zs{hT&$nw8%{*Rge!THA$b{Ts!qpu+S2?aMDC(Zx%>|c6Lnm=*+A94Gelz*OnWrZ7x zljgrN%ne2QnyU=}zzZNI$fxKE_&WpAQ*q(>(FanE!0$UH(hmeQ0SH0nP|DF=AVm~e zQ==Jl6pGc-W1R|o^_`dybut81o$AQ3BD#Bha@SG ze+B`*2naw>fHeMmFsd!+|BSxq=2-w~@bI!;CKYb}!#BMT>FV`WrTe6RWl3fFQrer}H2MCA$^-$y&k0OvMB%UYR8By|WfC7~2!OxZZNUH$ zRZV0QI0OG!!k;~YjNoVWzp!7~3m3vBn7FyIadMTp3e&~|qrkx%Sh-d(4&7gY(4;?H zU3CXSqgT6Ja)W?_8@xTe?wp;i$toq;|8*xj7P*)|V?A#np+8}Zm7GR_LWV!g`0j1n z?Cpx}HTG8bx}ce|lIQb{0SO6-*Mpuyk8*YQzao*~yQW@kWMtIrc>u+~)}(@gA*#y7 znHK9lTXf7G;|wCo1GcY2%tLPgyat5x3YY-~wCOFL+E^iwJ$E9jsTrYM@e_ro(Gsg{ zj`DGEmD?=okC)vE5Y6Yp$Z0G#gCqTd5wG`Ebe6}mN`kkQ09zk8PuIZP=1^+YhLPsX z>{x|hNL$QYzI^b&E^mNFojt7X-{E=)2##m`-_m?3C@J$}Z|xavHrpZVl~aHG$w9Ln z$ok?>tHT3OnOf!*dn4$Ew=?+2t9uf(8G$RAj_oR{XJ*!Ekne7m@5G$hxvFsZ3@B zhe#A}TbgOeQ=7|9ix)HOJ5rYXi)*aY;tdIYGx9c8Hb2EUu>GuW`xq}IEWr^8;F)mV zTh8MS-IJV?c4H~1!hKZ7DGbxhTu^C424%xpse&?IoG997gT+$3koJSV#ZODzp~NgV z)~ux?0Y9frd#XP!+OhyyQ&ekz4+?9>-2(I!-#uMGI%WV2!GLwd0t0cv=P6U_Ar9=O z`PSWX={xu8n>Bp;zE8(wT}3zAOKQ~l0QNwr5l?9>BqTH{ZonUbjARtyb&hH=xsV2O_E)+MH*|F5A6*_@6E~Z2(>w%(|SBu 
z7DUK!KZB&CqzsRa@}DhM%;X4w8`&)t%jMC!-!8i$iA0khL}0PN#m2@y9r1jIUUq%rMswakTy#8Y7rfty{CT;j z6(O6eQhF|Hao;hX&Ut(RyOdsK+y;V3w4a@@kpR-7bPjfw99gB?aJ%ndyW!K!q@5{f z?W0ojOY2ca3DK~y%2bowJ2n+TUO%t;;KK@r2*;xal#rLpRH}BzQy9WxQjliEtgWf# z<>hTp=gIm92FTp1lbwGrBZ`TMv3fqys-}P574hU~u-pR@;^Xr|8r1ARD^BdrQ!Z3> zzCabpWGM^AkCFfQ!7D5b*4(Kn(`j6D5bXE%=2^g=p4j*KveZTHbTV52O(K;VAdy79 zv)$t#Ci;0D?IY;uST7dg^L&6_AZ0R@S)h{nKwiaO!+R@|{!h^=LAKTLqoM$`&ZvaD ztku0KB@sE4RpWeC!gUT?j@yRI^|YTA<1zzk`X;l#>SC!@i(&S&aNsbZ2&fGm+X{B60$AVJ@{?VK5=AYsw+`C_F`q(;~+7I`UQ?1=rw zmM-{~^E?0&nbvBRu_Ek=sbc=)HtRFu>un=fE?)%OO!glO4GrzNKU=!)N0#xe(4tz8 z_e=k0+epO+q~BYgcY7}070)$V(r;MeMAJYsnj~t@6WASHsgb=IT2*_s*d4uoT4zsx zR|{Cl5_(uM$x`Z93SVoq(%^i*n+to&;PKI3LGYgg0Rz(?jv*MGRA{G9&QQkNWb_xR zQmN7oLmk(%+_?!Jn~n0E+#MD9s7|g!9W^rFEnVo6$>I(>7+s`>hj#FsxSpohII1(A z0D@-$2L%o7Hn-jXs(!q`NVYqH#x@2QbBL5JoV3Q>FsLR@bCppmA(fPNl%6eimok~h zx1W{K#3#ewpVN=zz~QiqL(t&ASwtOatBH2KUU1Cy62k528B34t{B~_#)I?w+IHcuX z3UW|OX4|M}+ALLVw2Zw$pvP;$_LfPeQb2Qe7&22;T;V@#XOv@(COZ04BK-HcD_OyTUoUJa zrQj}`dM4I^imj?{O9Y=k7?~ZL3>97lCvo?aNEy7%w~?Q1219Va*IVW|!|W&Bt`MFM z#_m#o&D1pC92KTn$9bhQ{jf**vqfV-HaB&*>ax^|el0hg2J@Q+Djur#Qlc*HK51lu zMoKr=$I7;g-a@Ig_-73?)kgl6Fl=T^4E09KlK26$W=X0Fr)TvP&iOI@l`0(`qk_6; zP5L3doWrZDD_aF`_02_iF!jv8su+!c2&?bsUn#j$8-}ZXCv%`_eb{1K6osNdr#H+A z#vZY*m{on4K3{QCLQ`r~a0ZW6y;b{~_w}V#n?Kv)s=IkkN+WBc8ES_N^v%DCM{x}i zRyHhXb3*=e{`iuf&~=?gABkC?5jd*ZteWmh9mis{t)j79V`|U z7Lus5GBaz2MjZYP9{!f|heq^oRb4i&&vD#v%B)*`Qe1N5oP8lNbxv;kZ@kGckHh8y?ck-GL@4lvM0L&2%~KMEq>#nX zh|pBg$hdDjihrwGKEL#bB0ueh)5Hr2sBj)b=JF z=}iPGP`paJ8wHOY!IHar$E)1bhSX&>>@Ar$Q)(CYA2_@-_8hf8ziVu2{$%Ob^@*Ai zn0ecIvBi{HueeRL_-%u(v4BvQ9rpO>ooKE^@l^?KgKYQGPm?RBqokTG+2=V%lae9#)Qr*ZD^^W}5pU9z!O_@k@+A>y zljy5#U9Eiwx7J_1v~pSPk(~iq%Wn>=Ja0Sju6vOzS6QELBHyeJ-L$mX#<3KxCgmt= zY1*%T#_@d4oc3BF{(Evv1X)+neP+?BN(Y4NfS1$;t(WzgO6a^-sv)+JcxJ*u({T5e7SP^6ScSO=64|$G#~Ob5 zRrwQK5t(I~qYD>P6M4>OAD zU+l3iJ72BC*=<)W4@L)(%zcJ+{z(@bVAE!Ja_6~UQ?4I>>TSCN_w*y+YG6)=+iNLz zS0W_#8l0z;2V&DH9Oq{Rd1orVM!|eQ5R2jVyO&zJt7wx_nQ~!rYW4LQefvAX&<+*2 
zn}vY$@2T8=4!#=EI@4?A5SxzsM8fQFG&!x|YXs_{11UUL)8vfACVRcNKDTPu$qWwk zcCDS>K%&PPCmOA|o`)0Rq?XW~s#C2EpoR#7CHp4gO#<;78 z&g#J_Nu>W>(Wq3>RM7fX+I)@fCXX&i{XVGyta0H2V8N`z@OScc=C5HD1M7Z7Gk+AE zBhbtk!57b$=QvII>BC`gRz4_&LqET_(=leqq_ zVw0+7lAD{YJvR4%v#@KpTV=vy;_)^~7)Y;B$P_tvV+>#A>~x;o;anQUvoO&H_ohkW zdBBFJ_;(xTg;4yb`tOQ66d(9~&+Yoe<7Afd6Iqnu1$qbQmbu$u;?BG~uvv9U<{XtU zjY@cPn@^5D2b+C57b#sRgJgDFv!|@Iyf=gPb@kDCm^`R8QTm!PBw$ zSdHzQ*t5%2Pj2bjHGAtbbJf}L`F!0PCCnef^^k<5qv`u&;B(57Oj#OQrB>R|4z2VM zo`s5*+v4?s`rktmL~~WQJ^V~AiOk0&{5j~`jzz7mS@P2~*bPk~{BvFQPafeR?=a!% zH>AmNngl-Ub3(b$V~Dz+nGpB@A3NSXfx54vJF9IJBf_rO?ykXhAE+@-ncBikQ0Sje?rRskQ*8@6J16rx z#NCU8YW@${&{*LV;pm^Wt<{u2lspxU>#^#D)DjoZg!ISkfcNgjD`3jq;qS8uOYEMY|jn3t* zlQJ4v#h@XTbjQJ}fmS0d3Z_QAsL5r*p@qqb%i}>YF62O6!5&HHpG4CB!9dRo2#wpd zm4-R#APpN*=kWvFl5q6H#}R3#Pem1fLvY-y_fq%fqwRpP-+hd#DI2uIrW8_-=z_s4^>2on0RfxPR9Utul>XV* zA^b?hyFYGLNT7sL|G`6lfO92?pzfKt>?}0kzgq+!FUTSo%Jn~bzM?Iw^0h-d5&t;( z3$%G5mhtg{EP%sY|I-I3rU0xOmwZ`a{ukuoe;ubfkYbh`Pf?2uNkINZ#Kial{r05) zW)Hy@=nK7*@+MIJM`%d*DAyO`KUlsS^2?+Q0HxZ$Mt|7_MDh5~%%4sK>7Tv;km88^ zpJ4uBfhb=G{tC%IqjcUc;~`~(`Sa-0hI*?!kVy$7&FsXGeob>XK+nA}o63~_ceK9xx<^f_J{i~7kLP+-f zpRmvpedQBOK>9za`2U4qdzOPOWuZ>F=){hr<~Eq3KF8nfMjwR*FW?)IpA=qZPe0on zVtgw+mVy6OTgpszh;Zk(S1Tav;OJX9h_Xmc(3W$3n?6l|EyQ<>{#%AR0 zAhLpcU(=>T7vHy2r{y+^ZyJ-wIe8oGeF7S~rx_h|jGTfcvf4LtmYk;=P0UFsIm2sB z*R*vDN(;923HcA+DpxDuWEfAfjO?yU+~{ItAgBJkd5yXbGUr2nz&eC%TsSAIq->}q z-&%f9jdc6rITJL2h(>M%0Dp3UvO77b{G89asV1ud7-17SC9Q%eL$1_+3A?O*)Q_E8 zO~OnIm4a;f;j9h`n{GF0#(8^aH!d?x^!pIuO~(-e$O+WdSpgdjPgI5cm)#wG!?SwE zs;9dLOi%|R_|d&2utOd7Bw*B{*r{X1)Zl5qr6JSg`Pf;2FcJ%=pRiSry9YYJCUUiD zk7HWm$q=55VhE`ROOb2jksV9B4=b7s9K(iGxP!F*YAvH>nAA}6naO zm$9qBExEvN5i(XmC)I~LE{t!hk|h(#b}PpM`cF+>CUg#hB2Vr!w2RCRH5$=BaC zPpDQ~qI>VeD0^M+{!BvHhy{y=j=P>}+U!j#ibrt7V1%WaNP&Q}CmZVPJ)~wu5r`>- z5T1F)qx1FxT4{M=`PIq~YE#ez?2d+;1}&$eTziT|5ycd!98rDGoqerSWPF(!Gul{a*eKcgw)$&Kxws<7)+7EF(Tw<25Hn zDWtCY=kX3@)B`PQ~ETtC{F`lZ9?cd&5jnX z>WxLCBWdKQ=a|}6kH2b=>$}aOH(N&y2@PRs-6W;^UQP7`7A5Hc6%dS47WU6A2g@0~I@T!4s$pHR 
z%~73A6pd0g(FD(ol&xp=NKLFoC~e5KIcE!qYsO9kb9@EKUC;S(D#}o>5;{ssL)z_S zTR!0<-z$OhSfg#%MVvN7^fOXT?qzjLwfeG93N)2*g)h1avMed42X_F8qpWWqh34Q4vdXI5XK zn5m_@kuqpDQ*ixY^MGqOxMAy>3gq+U^AFGH()yWN6!5j9;fe`$8mh>Z-oRX0N3Vwm zRfRP1uva0|id_2r3+5-CYFOj?CY0HzBWUR8*!bS--shR1=Jl@W(Nt(C({RNs{i}(i zE-ht@6myO}u?K%OJ0HtDuKpHnpW6b@>euRqoK#6iFa-dLT>p7s)1%;OvqHuYuXW|# zkRI%s`g{cz$BdhSgPtc{9D7a$)lF4s$5NvuU>v3LGY zuVl&GRFMjr!<_L-oEmfccqC@0UK-U7GW3%C2|sanrymWcuFsp3 zBgmwOqy$PMGeRF2!Ir>p<%;#GrwxsW*1MYOKg$A*An>Nq>8zZFp)CD*)g7IE4B44+ z?uUl;xcv56%L{9dVpU;&3rY+O&w7y(qfHGK5!vt;$9b@-RpN0=+L`U|Fg(=6Rw2!x zez5k|>~kt-00s$)Y@_Z$3{tQ(`C9YQZ+@I|GOy<{?`Irc49XrP!VI+A=wDsPsL(B( zk#py43)NkA4^Q?(XVg$#AqZPdDjW`Ytu+qS?)YJ_7NP@5h{kZt#7KG4NuO2f>Do%D z;D2{A-3P&J>5fz5bEw8*mQm@SZKpZ0*etZ@DNU_`mf>j~BYYdL!G6=5NgMV$!*jLS z!Mnx{c6aI7s~Q(kH4+#iv=Y#+Zf>|(fXw2a2OTi{_Tq}V*I${JoMEe`w$C^#Y$Qp@ ztJf|{R$f8xY>8HZ;jwHH}Y&iDUM(N&_h}?rHsG=F484H+EDY z@-|>~=m_a*w1+C0$`*0>f^GLbS4ITiW>2m5;h8U*oAI*sI47|`g<4{~T-(4|f*G#| zDZD&9UWj9z~yBK8AJi?#Q9M~5r{x+u_))6_~GKquEV zrOsxqF*h2BNC2@ia%%eYnQ%ibBGrd)Of-r=n-7xF-mijqtkdD@`b}@JAC$G_6Cc5i zVOC*kJ2OdHm&8oc`bF%wW=%~0!{zb&wdX=>g^}HTMmZBK+lxE+w8(_pcD^-)jHs|O zw-4L3K7x#Tc3y&dgrX6*Qd=Pr>=@e|@tJGP=CQcxWb7!*XYc$^>-m``2a)yQGSzoC z`aGbQC98o!L8*K zZ^LD32`YB^V2^^@)Ejdj!rDhPX@|NBE&~Q1-(<=?UYucIhy$tFBI%&zi_^orTM2aY zr%42|A-Bv&pB1U?)@&3 ze(nm$h$6aZXH0_TlNkboLnWKT#;7ehAh1@s$A8^#T558k6rqC^JcnNOqg=nqD^dh` zbmrwSN?8F05aLYG6PX&HFDWuYfl;3qI%l57p6kIzy633?{6_@Z(DO6qmK^gWGQC5p z1E<{wlxLe0p&q!l=Orzy!E7~W=-FDC0_|s7*{B6)A`$4rtrKh_obBCZ2x_O45qjv-z%SKi?ljNw<=Wl96c zCHeS3CpEXh0ma2{qYVhfCP38p4a!qidB2C}^I2>yThOS+v#u$2oxVOA(3aGSF#1#t z0UiwS$Mr$pV+YIAea6oJup>s}{`9BI2R3f!#-V29w|q@Nyky3Zg1>^^o29c6L|MFV z4OpomTgH)?xVk00CO!+f1gY&-4M|v%iJ(#x>~&27nWmc`7gxwLo%*^3$JY*~BdTt> z11KEQ?j}{_t|Gh;2zOzB!a`0P?WiHe5sv6)zA`_ul++p0SuNdD>Y$OlFlsL~z3sNc znK5x<$JN3q3V_`yq8X5d@6KPrvD5bw!thsT>|ik+`Q~W@$#0Rmm)XIMi=~Yd=_X^6 z=W2ynh;=KcV)sZ&p^kG1b2sO@HnpVMgXL`-us3Y-qK_MT8YlBBLSQd5nWdzw_Gg>u 
z3_p@&T9b%KpbDB@R)_QfF0LSAB=C2x%hM17$a>LnjK$zX-M$m;wL3F<y6fp>{HY{Uo-}{x%~2i1K!*C0nmD&6)Y-s)!_N|QkL>a z;^!aa_dzrrEXy&}d#|j&+g70zhVB)EweO4QZYNM8(z#I>%n%dL*wpPZ@+OyGK?0J4 zHmsKFV1(S|L3}VXn-FtClKc1$h3a3DNuF^|vr_jh^3-AX4**0CGV_Dsc2yWm_A4bL zivT?2PCc^RX-?sIBBIj8(8*wby5!15!gIrEVIO z&VsHTd<`lt#XQ!(wCxPJ!|L<_#K?bLPkh}gFQ2f~j^G4EW-lp9@767s)h8H0cEGPi zF`HoEzYcB=OUKrJ!6RF!FVtG3n;?0P2mx~$X}0sAFPi)ZX?MRvno^LgcM;D5{Renb z3w|NdKz4yr!oR3CGT0Zwa_>aKEc9P!j_@T5{S>Ao?C+Qn_Y12=7SR+C{|o>4K{VY_ z?Ra^mbXZ*oYT=X%tkxR0CrJAL!fxI@!atCTiF$q(^RG%%aS+3hX0rRndZoX(5HsRS zqkm4~&%ekon+);mhFrXZ$_f?ej|xAqFWk4LTU>zuU;Ki)Y4SD_8h?Rcb`L~IH#3Gg zndo0}sYqXeDx6=NRsNTr_RC<{Aa$s}jRL>IFNYidkE*ApsZJT{?4CU`*0=2P#^AVS zE8lR(2#neu;fz3)Xd;}8ZElG8)#^jUeBojY*-*X=E1l?ONqRV$9+E=wu(h2!pZf&X zY-280mP%?yd0y-`MjVJ>`wfVisygL0iLm~&s)mULhyk6lyFWyclgSCs=Ign zdpn~I89k2t9(gT=BtH*4yron_f*8R=v^z>PNeNI|eGMMxptY0aTiK6F>dingteIW| z!no5ov5tJH-t`aoZD^Rn;M|A%xI702vi?DDT#U8_(Mnp>si`#erq{t1)Jj5^G3WWQ zcO4Q1CVy?aAzX@SYN6P@bP6o11!=~)MQpfqE{)w`S}*sMqSF6=asS~y!L5a z=e%W!NxgX12)Mw&M*HhNNq;~Fvh<8B9gI~Yaxukdu#dB&qu?7XlFP(MVwmJMUFyOB zwU@TT1iZInw(dK8G-gn+K9r^M(hYUB27ha9yjG8sMue|kdpGRYl0z%+{T>1`2_xOV z+~T@-&|+%yw)-PLr@F;}Ynv~zuxm)wl51S&GC}KbrvgL5jx`hfKo2)E*j-BR-B|yq zMiYg^iukMn-C0gK`r)Q~J)%D4$ z`*$lZI9#nFq@s?HTvm>R{y^4tr^*t~=+Jdzhd9)g{0q%6ZDD^HaZq(W@3C^>BB?i+;gTij!sKg7t=>sHG4o zA?W$K@plcn*i*f5B|UOG#!b?3yMf8?;_>;_8RDqiAEq>S9@eB#3NE=iJf{Lgdz-&-}1iJpdJ%L)giQm9VYkS}gU?dhT>H7OgG41b+Q71!LB66-e zX>91e7kg=}d{2r+FzVPS^i?MfX_Zd1|G)4flvAC9eARDkPVYD1rGyjJ`NJ@*&qVV_P!e zNsPMm)e?sOOSzOgE##_rq%PwKb)5ol4;Y5xkL2n=h+?Z9Zp_;oOmhqJf(Aq5D%^FE z^h6#Sg-?>h`GCl80Nhe?x2XbN>A88GYxE)|w*=NUlLQ5LbSA(JGy@1-0#;iQTw?t` z!LxW?I-9(YDq$y`p+~QSt~QeRAD&y5YPp+O7APU1RV5hhJPZ$U4Mum-et=@a) zYZL-|eUGKjUiCo~COh_)js7je;j)rzQ7^o@;L~EwQ&_TK<{yt^L4xiu*$*XFC8Ee(2L5`{*Jl217Mjh{ zP$K7@HO(x=RmVFp2IRQZiT$=+#ng6V7y_bOu!Z2!r>_&PHp+6sa0`KW?c-szZ7lWj z6j6)k+n+Rzv*zn(-ag9lR3=}W+1LF|uRwp`uw<@Yd$o7+&APm&-8g_hH@M+|-cmbw zwJs?5}VG^#e4=sPQHL#8b%4_Vv@Ny3W9gS-f?|FUCNZ#Mv+ixJF9V3 
z!DJ<}@-0ymSuY2N6#Bh08*_={;-du5U-IGctJ2@hCS}TgDid^Ta~%VQ9xg;ylc`7_ zU|l0v>hsNrdabE_weu%EzWP)(6p;(V3bPdL%6tjR=OSja6N^42q6 z^2XqF2I7xENN6>8Kpj-o9xdU|ms(KIBY8g6Oau=9@POg8^_`ukF)O!l0Qp<730Y27GD!YTc@MymA z*_zR%bRh?wD7pax3507pk7`GK8RF4RgY=|B`Woj$NzYjG2dM&3He18US+vBMdtGvc zs_-FjI+{5qw1~nWhYUCl)%9@Uo z)wh25ibq`G(i9bIb0n&`93lODzVxv%I(7vOQwRW^iyXFn2%S0+Bt`1-XwL4}}D0Q|;g-ZV^p|hOyGD6U^PDc#;|Eg4 zr6`8bmB;ix*rbOSF<&A&Z95&? zwrzJhwr$(&*zDN0Z6|a4efzC%W@@HtX6l=n`|s5A)IH}p+-tAB*YSZ3Ry>B`uqwDg zqz^-*BAHG$wK-s@fIN@Zt^UrqZ=R`H)Ci$FD>Rt!(LtYr)^z1sqbnC_Q@s38CS>|g?T19u?2vt&F~eCcIPEipZ?v5Evi5KSfT^mpa=!XU<>)UFShD!^j| zFeE*%a#5a8x;RGpl=RR{7(s^ZMN8D4Iw}_Ug}#6Vr@VPdo)sfQOp)!A$|(w^l{Oax zU*iPA^i=Td6aS#?!0S}=ehx}2Mm zTB_$M?Bh77Q7x?dScg51QWge^B8d~Tsz_>7#SAFP2aE)DhIQOstu(CrsUVIb_Kd|i zo)asQNy|ZMRdg|PW2L=p!zISz>bR;)ESgT0hJ9=+nK!gTTyEdu3V~@H#@`I-Y=Z&& zqj{8BrWpeJw}RC3rO&^YV)KlxmY9MMz~;rJ{Ama4yi;ld&_Gl3R5FfsrT0t2b3)d! zo#o-1!%qhsZJMwr zs!lwvv~n09EhE$wS&NLk1YuHB$`^uBbc)?2ULO{OHsFW7 zCp;^bzmjgW)JRFbGLDOsPqgB3WJ3);tNZW1i>4~8e;F@lIu$ky^{S@4tr+y|+q34b zH4}3-Ki&Ueo*5QhYd{Q=C{@y!(pkznQ->2LeQ~B$8O!NAO3pqq6F@P&X1=DiNhq}dj zB^dUSk8zK=UTZQ_iBfas9;4eCrLa6!q>^utSE;3o`~BheK;(XtzuR=HpN&nK5gT>NGc>w1TdmSyQRTY%B!$5 z1@NuTM5)&Wc1|LGlgR!-GyEtdTckq$(haT!%YWfl6PT7OYV9gvV%UGgD3Gw;KJ7om zH-I_Y0t28QnIDwu0AmdB2LN``FVXWa2^sJM1pu8mfy<02{&@l-$PcD+@!xwOG64Sc z6s#n!@L%8u3BZBBApT?L*!=)}CQ6P((!b5X2QZ=gxc{e>+8vL7JRi2v>UKuV%qSdF z{A~(|jYxMPc+vn1J^fVW4+-P{m_2&nWL!eDF=ZQte_ymy{eyKbpp{SlV}t*1HB_>i zu&Sy96BBbw)xCe2YzqJ@;Bh#YqKR|=ZM+((g6 zj`wa@U%T1LLWki<%GaxbV?G7kY@SrSS*m;sr2*4Q1N6P5>ttlP*4Z*Pcjq?Qn~fnz zC#N&2W~c1X+j((p+Wr=Lhx+_|CNSd-P&AYl+ zk{#v|Qeaf0x!gU8T0i^ivMfgP3z?TgQFrg=frvShOEKDRe*I!5Va)(0hnmoHwX4Dh z>nd@UV7D|>%ROQ73L_mkd3UK|vE9d;``+H((r;3Bc8v8_XCWgaVvPnfeI^;h`!%d& zKi{Du!&rjp&!Vm;&DS93h;3?~Ok60`#eAG&Q!*VCwtRn0EY z{fdtwIgT*0d&QZ3X_Iv|m#tU}9mbCBPja?*X2zrn2hOEVK5(@X{(1CsxnGWvZl}e) z66Rs^x!es-j|B@SkaGRh*m;l%BtDcIhcw08NQH2cJ{KZ4~ut!3GI|>-ORMMR{;LS$Nz8B?1vIeC$HK0TBKj~#5 zrrvs1iWb?_5|c?K%X;yBpL*Ajz~yr=#s&y`35$tA 
zOvi)YWyfx&Xvz4A+CbFST^`7L7SDEy)&8YW2Bj z#E;aBWofM#nfM zJ>9?xS}m#`^kHsCXaqZ+s^sm?+V!dC`NY=S`J3c?L^H5Iv+$<+uBOJrO-)l2m1k5R zySVf%Z*t`w{b+0r=AEm#_r-Qru3soxR8_;KyUT32#qyi46YNdjO~+wljzttp~9U0Vjr3?pPs&Mw72n&3UR9WdhR11$PB9@pukQuM6pyJqT zO{lzXd3*<-o#+QK!%nsrp*ueb2R!z+eiF($ovdS33kT~h2RxkAP(mo}PT)QXhT1<) z;m-G0KrVc7F^xR;Ct*7g0;;QA-36tBM{B?=O?IWw%KUfAcIVkcO zMtytnPP;z4MeZQBfgtI*f)Mi)$^I`ov$ zLCm+c*=Y9Xhq;yx%@0@oz3KCgufBBKzO_m+TiPYpPK;@}s;bVkUETbF!4c{-NGh4{U>ye54!0*N1QVye5`?pQjDq<$-#b%7)Kj{h z4>R_s5dBz57$R1@^M3Azum`n;5M0;}4gDBfET00>$<(QE>LH-7pAO)B*pmXawvzc~ z3WJN)yV#hm9cF$?E=_|VtWOEMYsSWe=soQzQsEXmrl?i(zfuu&&OoWTbRs4VC-l|S z=)bsY3u@pzS*QaxCuTGdX$8IY+S>||CfE;rk)9UM=q_%e_a#3V_x%90b5Hyn7}PVUT*>lNDjvg4ZxX=udw9|e6Gp&j`syvC3{+GR zyc|>-EyW(J!!J6j)vqv>iiWU^Q~e-{VBtg#drw{Z?3Y_kHWtwhFG7FFjv}Z@dFejW zBXRklhcp{lOg8@v18@+70^QfOGlSDQ%{JiSXQ$O&@TQ^V+0A4f*8(b1s8RlD@r9=$ zv)Md3YMwlU3sWn%&;xJcJx4NudsGI=be>J&K7{?+n=9+DGk-x8{c% z94}h~^!q-Zs0WVZ{pY7#m z>F7u1c~wN*R`clAd+jUznF_dfD}fIfmk!oDk4n$pb7k1UiwCG!S+hp5Wj0UA(`OqU z-_q{YlzC2eHs#XaZ@D_)PVZWhv_Wh=DhpWqN`c^j(T}!wB^^r-F5S~u@>*(u+G`UK zI*PTHKMGtWD0ss8fyRe4IHB`1V-x84PU*)|P*q8$W_xih5Z`d56=IiinO@IpV?|1F zpGEdvFiK(d;&XX;wN#w9Sx*da%HVVr4LFcMB7^v?hd$S^`1$#PZfIz>rjFcm{ z*|Al%tRbKBjlXLrgal51e8J=OX$nv-r?QsQS9IgZV5_c4RvGJiJA(U6I?*gY=)8B= z+2wS9ChETU)Jya`=UmzCqbP%E&MZIC{MPw!6HiaDl?8V8@#r*3stkziwN^TRU+C`U zPlr;|;)r^SDwF2k`D^yiBbvp5tQJb5Yqi(`b91L>Pg=@TaW!eFYjITlHGvU9SV*Rb z!51bc5&p!9^0I)FUjJvUvkUU8AaMzb@BA02S3&?_V6Z^qkN(33*aTY(Lj8a|i`$Zm zi;VRz2gEXesh^}DnlKd=SJq4EA`*WyF^8ar5o>oVh>QM%9l-$WnlnOz@IM4&mjN(; z|F0`|ECLan{f1 z=x8oB=@Z(2+cOV9Y?WW+w0|Pt@m4?{zg~7B$jZtlBqtjJ%xt~S?Y`he5=(KazaJw5 zXnZJ6LS|;<)Rdfv)-Mb+^(MW=D(!GU#uFeTaG??${?|V!t%V0LMqe&^d;B5%`4(g< zG%X4LrXJdY)D4e_yhK-OHn}XHp#Du>_@B}(q23tC$RL?A#qFOO+6?VsDZ377Wm*XSUBM$GTqcYd8&dqOWCoM2abkF(mLgv^XH)WQx3 z%TrWb8tqz{=sm%lk_0b_@EX2m>;Vohg1>VM5N--#y=`jOC}Doj;rqee>U>(HJu3h2 z#0&VbiWoy+(0oi;TJrBaYe50vTh|ks((=D$I6(RT+vmxfl(DXk$bWlWuzMX6Fn>o? 
zLN8n-#l8PsZ9o1jjx~t?7S{hyVZ`10t_}`@>50@Z(9nM}6|Ynx04ZLq;ZX4KUoMGJ zF)>5YXf&203@7VbS}-gZDg?Z|xM9D5a&vP}b@}>o-_I#tU0>HGEA{^CP3RtnVpjK) zmXP?VQ$3U;;E(wAt5N+oM@Pr&+etqEU3hHvDHCkZtY8E@m4LhV$Iam4VzSY(v09sT z&YI7w?Y?soXZi|Qs5W?_>d#7ymE}dBv^c`A?yq@iY7e(H-!uSHiV9B(m^=U024q$T zQa*c3Z#0Y;5D-9~R9KnrN3*0-sj)qkE7bI}{C;d=Le$Kxgs=)bRV0aK36MKz@cwd- zc5G?7eiDnA99)qePTzoa%-zBBWp*-PYs+$azQ3U)X&3zk2IJF&F)PKHi_{s$jytpe zpPh(V0Yq9wPf}9yJdEm*EgTnyeyEa;&FxC>YG`HtcPmZ)-uTfjobE{v|LJcltyd5E z{ufKpedISDB#BOu!i~hf{(qBC!TxlTvE_`sYJ5x7fjLg}n4V>Fw}No*IN|j z!ge}JumPu)p_H5>1CS&unV}1>`Fz=8L~0*KWT^7azBj3dvB8>Wh#Ko2$bb{8fk;t^ zutbk7e4XXwU?Um9-nW;4jTT>^F|wFzZ)}}6W=*);+o9GWh{X1B5#XA|QLSnH z&dif(LsUX%p7voaADc&2ORV`QF)5MW39*c)>TzLWi?J@!U9*Li3AY#aCkzliL17! zaZ2B4pZNHAOsYT7O8w5SU#?UV=;%4i;7IQ+f+khS^qpy;=;w`B-O42Y#9BJF9*ie_ z{bxzBkxUuUs5PHf$HBO<1%g?beJzSSDpB(7(V`>nO8rULc>~`AlP}H-&hvmUV~S|A-tEkEvG1M7LIEeCF`@`s&($gb9k2m?u`^%j@Es1rhUuPa5P> zGU@gm0abstRqAXOd@Ge97xMi8{Vw+=zcn!?AfAe*XTFy)0O2S+rHRuByh)xZx$B?Z zkn~U9z&IyIL-}$EN|B_;4bjtv**tAJ)On=ghRo>&w_ZEqrM5gHwg1*QA!uV@QDaHT*eU zcACZ8ti*j*K8>4xUn$zmeyDk;Xlc zJ_#2e^)lz>^?K(zYntY5v%j5*U1WvDqlE^)TmH`DfffJhi75#WMzx=#DE9V5Tw1zw z6KjW$$s21WUtF%E!+k0j6agTJyxwf2Zs~-wEB+j z&akciv7pR`>#>IH*=?+`tN9NW{oDfWoTGN_v}6ot-CcX6(p=U%F4=jxBnVR+kyaN; zd+tDxpej#BDWrnmzai|}o~eG1FWw7#YFCOzIjl~pB(1`3CXYNYWy2fV=_rm>Jps?w z)2Wmq>hl4{V4uVLE;&>NdKjwBip?2Kl^+8kMb9gv(1d3*OCbM0*^3T zhn8B$XHd&JLOm3|fL+H71fU}kOyFPM6z{6OlbMjQEtAb#X*5RV3JN+Nq&*I8Xm31i zxL9v0TaP#-Q|anZmpq#Ls3$pdy&VSj7Ab0kioO-N@LZU};7wj6g0|Jy9F-pUY{(3C z(kZCdS8j-uBp9B|dMlwT+ZrX~{f$HUd{kku&mQzVTMv0mzr+vJ>RCc0=saf^U-W(asACI~; zI2$;F{I93gP6R~S1%3gQOX%dUdG`MxJV{tjyM(EL^4~La%I#0IP*!I`(qC&81`uuJ zoI4Ry_HUVg7m&rX>h_|b@i!idRv2Kfy&j$;5B!%$1Q2HcMD$FogfFdgN}T+)ye>re zgF@EJy9%2At!IlA;3Hly$S+9yTh{qg{-63>>w`y)ZClNng^7u&7T_Gw>hf7PSp#^s zw;C69gl-SUt%Jgg{~9ysexy`2;Nalw&L>~pic2ywu+uxxu(4q`jtChUkpXfGHJC&4 zlYq!AWs0f{F@SskAQO}nCC79SUvvv|s2HiZuc4!(R=w6>=8_I5kD z)dC6u-+(!UBPuC8vX)!8!KZn2$QNM4-6 
ztHT?oTt^%!jJ(Rfmga_fCbPdS8(nWE!p!Na39_zbSdup^Qq23hKVd^BEISQj}1SQ=I?ESfcy8jZcVLcvx+ zTqh&P%LXQofab1Bv0BWLWApiFL4D(JC>Up_-7oF>acxVw8>-%HsL=ewqTB_L+)|Vc z)zh<>SJ_{^Qt+b_C|#kBq3){-IP30_v6X~bV5*{?dXP@ADasA~ZW1AEBdmRn-B8}1 z{5!8Wc^}h)F;|k70bs&RLH;?lX=rE~sjAi%soSlr9oAY~Qc^HTo^5?!H35nf6NKeU zOKKIG&G7H9k3);PzLD3vLz)3?XMtU0f~5{3g)UMO^`oyI)0HZc5uNLK*SmR)^=nqy zFCNorJk=&wweSy@;>VsbwoIon9&IY453{2Ot(?^r9_0;2f7XoRp9b9DIe7s)OYdkr zs_5w80`w)mUPkkMV7%43@bdB=ZFF#b+*kWjICd^<7g@fGSZyXmkeHV#G{RpE?F>N} z>a8N5X?tP>|J3L`gjH7v%;<#)H~a1^0i`$* zPD`pcQsi`|G6Vd0e;ME&@3}QRXj@BPP13}J63Nd(YOu-mqwx!#mIRG3+H>a8SO2|S zfcE7n<6t#icdM!x{_kv}L&M85HDZJo%xy7QLm$#tW5cC4(50pyI|&+1g@4SN0|GsX zbb)(7LI=*pkn!)R+1+)q6}=6#QO~M0*TEk4_M1(sj4kRB9*lKxI;o>VouxNV@Ndce z`S%66EnTO|)H-Q^H1CgLE=3??tKLb7v7*he*!7#5lSr^58>pcjsPzbrLMoriA(32X zErY}e;MQ#D>h1N%53z-&6~6p@PJ@jEaw4vDRFpsD92NuckaAJ1K#7Igk_wY0z)i=a zSq;!YS=c~Z^nSVPNo6vfYIC(6uxNs0*8aL(Z(RIci|2ac?z>g`1`_D)eM-k^1#bvimvd+A2RM-9|p@dGCMHLU#Gs zqf3s^hDU#kR!o@joG8|&fqBRYd$#koqq-HjIVpXUIuQ}-FWe_?5wURBBK`HY=5Qv% zpm)?lo6?g%tn;DfWb4`RK-!}@Y7LAQi7*s&O+YR)HVBBoPP`jX+FZ9s|E?u@<>Vyh zbY0LcIo}=*Fvz$=ErUPiCYEE}in|bab6d!)sudMafW2`a_LGUCPaqSBe$`;Ve%BHT zcIhduFR{@XaI=5c#~wKiiDmON(AC0XoJ$zyN(_^}+nTPoq*^kr=%*VX&7@lx59Aiva7tRf9Nu*Qam!5M_6fO1@jGQ4Ex?fO_!2yvbUmI&{{&ki{) zAi?tcyM$Qpk-xR$Wq0?m4F;z^Sd)k+0TlxdrncvnfJEEz56T@(`_%76E%6s2-BS#? 
zqlos|!14*E19>#3+sg~gqR2~?=ois@J;=Th*b6AUBNS@V_uk%vI2ewaY{Khkk#J0N zFBg&%s^yYsPp!XP4G#jy&6wIVnr&Af)L7eL)R*QartqEcB@@=o*ID+>ZlfdZ(|F(e zu18yRj`*ED4+bD%32;}t#Kq>cxvwgRJ`^)_F{8XhqRG!!_4VlGv}g?JtbAhGT~0$v z7+f(pPGT(T75?~o_zlVQcFT)v*?w)WB&Vd5s1-d(SS?gwzC4_VYqz;<$BNqIVpCG`REObLZTY3zu!h?u#xEX=a2`$p?qd!|)c<*O$&sl_c?N(S}W< z%?5esJ6Mrc$w2kR0thV})F}9jm>co~BK0H{$*?6=do%+}`68^1T*ZMfoz1I9f70#D9LY*Ot^1v@&#Ps^0#zqFN<4Yr##0=QK%H{t9p>{vP%hK*5==$Lb@GXFXGPR*cWkjj(Z5^MfDqK0?NXaQXI1CV`8*_JjexgF#t-YBjP4 z23uPFNq0|A!?UH@7LR*MpZjuMze9s^Cz8YKvx6~yC3I{p<|MT9hyjx<7iRQTN)%Nb zxq#bCIfFjjdlKP$vd>%j(c1V##b!Xo8lz^=4unk6=%@k<*HHXONlpA5C;>y5Pc~)r zc?tmr_5&Ti(aQXDEPmH+ZH0X!U$@47)4ZL&fh6h-kV zD6iByWBBNdiFRiJeMpljLv$x#M%g;|F*g@BE0|ta2k8|ko5A1jV#6wo;dih}KgbNK_-oPXB5UPag3Mk0(l@F!G%Ci=%j5jAhuZ|1Y+go_P#jD1L#KG9&{e+W! zU^bO{bs5m zO*&Zf=TsV+{^nH9f#|f7ljP1_>(o%14Z&rimc9U^l zX~hM8Qoi$Lw4-CewtDB(R%5NR6H*QJsFU+A9+rangTw50K)a8L`m1?dNG7QzH;D>- zwq@BCo+?p#$BG4}4^cy^ikgdwDMzInA4&I3)FW;pLg%w+#D+5!@(lt($txxX=Gv=E zjk;UQ?Jn$3Ge0EsS#McxE40&pGsY+qF;J@qDaytLoF@-dPxYONM|Ys@YSq_Bkwq zqCIEXSYSPWdioFCSklWuCxZq$EX8AXK>v~I3i_?H;;MRNdBN4s6dlNR98^b3D?TA{_u@HPBYTq% z=!Vohs%BDGN1ET*B8d^RA~sFl90Qt+4?0`m*=S> zX-;Sb0Q~jgP63uG{sH&)SkM+qv_Ce;qN?gdI&(X&N8^>hNCHw;c>S_>DEsey>hu`5|?ENgMMwG-jW;zIKvniu>!c zoeS$;qp+wWaLSo~0*E0BrN>$`f7)7bf4nXpQcaX|8lE;{KatsEWiHMg4J+^Hb{N9bbPc@+>M3<{kX#KzU@n%p~*K_CG$}_ZL|$Oxji<~ zyTSK>+rgv>x>oD+s#H*I-Ba$bIsvHN9F{4^d;9}PPqMr74xbz0{d38@Elxk`b;b=( z=*O@Zx+-5L+gE2kj|Aqk#~SZBlGTttGr=t~nh%tpuGM#H!y0UV*TR;cNR&_RK4QVj zMwEZI)d?ON#DFa%N#b~mv!>_fo^kP#d3^^Sd-5t|aDSHf*MGJ|H80-KL271wRw-z5 zH#xj6i9-ty{ugAj$MerHusjkp%rq0M(cN9V0%=(Pj#5R(`^;iprv-5XzxBSOL~MgF zQ(MmBa{qc{>}8u(xuL4M6t2y2*6|5)=>uQ!3Ui+0qSGkI_9+zk$g6(G#aAQ66j*VF z@Br$>zs7b_E*aW?9?4UyBw+UrEG_*fjz1#pwu9sGL^*AFxr zzmU$d8ag+P$X`MIvq`&1dy8hMyXX|m72MfD`9meOCu!A{b;Gh?P0;lJWXd3#*?|;C zlCaJR7VvfCqdo2LnLlFjjUBk0e5rVfr*u+JrtW(Rt6V2nF>xsWsL%*HQ6B%4acgl% zD_K?Qx~oUkw|y7j`LwG%muAKF(1P3Kzg=|yg?jvZ@}NUnm3h+Y;)F=|_ATO2hxJWs 
zH2gnW!3-L#H8M{-(%}2=IN98$O>Z1JT?Mx9BV)D0`D2Mx4VkdNEPsV{*6TNf@8F0rSbIJRYr@HCd-@R$d) zbxyq-L0>-bWg=@OxgdeBmclPotpU}Uq9(tu`|Lfmi*T?pV6MO(I;Rd|pa!oTXvX@V zGbN*|nOY~BeG7j2hvl=u_siWC*BeRu{cCTDwKm9Miwa9{-JX_lQxu9?K7c-qlcW?CyOeL?ksOEyA zq?v_S3DyKvW@Od!Sh%!c*|R#HkOnxR@T!ICSozr2sPR0B?x|p9m9~yvCbSaq6{f9%_MfH%xm0dE-YHY1P;`YHf7C3iVUzmGY)^VRlV_2m`+=Q_s$T zaQh|$7q4!O#(EP2ndJI}JY4B!SEV53&2DBTDW8hHb~2OczN9mwCvk}+97*NBOLZ<4 z=#uS?7!IXA{Z^B_DQ~uT* z4chlUAIItR3SsLtQ+R9Rd9ICSiaivZ-mY-7lF%VLxDYS~xS2@)bAPypwR;^&NA3gF~rO1Xw+p*x1njNF}phbo({LRVPIi z|5E?GOrzV0O+Y|^2CxYJcP>r1L$idnFgc|iaNSE2GBJp? z{7ZmZ^+XD5L^363&wl36;JNB2pXVhg>VD!m%m)ZKc&q|ggNXlEVHklBU}R1`&Yl8L z%+D>46X|c_Ul>i&ez&C^m$)}iPlR{c5IY)5i5|E0izmfEiEnb-iE7qYNs0MS6K+E_ z<>iPZSLO~mRe*1)s%0zgps_jz59V-jkO zhGCgRm2wdPAp(fruz$MJH!_-vc!H0ck#(!`o{_odD&hT2*)Spe87*{tNHX#YFE3e& z&jFAGOv+!iOF`UJ=6lb#BeYGLgJ>?A&422?!c5+`_owxU^xNf zWTH6{E4SGOd&M0#U1PwR?7+5+{}``V{th>Ak12Z)InqjVwg=kE3Z{2q#2WtH3+p)6 zyM&e}G$USAokCwBB{if!6_{R|(=Wr@dzi$QslEcmk}==U`w*#}TV%$%RVn-}Zj4>e z{ezD+V#Up0u)aWCR8fWF$_HjX8%H430N-e zdb)YZASGm`Q9LGlm`UK(9^3^Yz_7jyH9g(CR#>0k$1uCVXPL>dhC!Rg8@)ali4Q%evb+J@xU5?N4^acDLVNFCKp z{pGcy6JP@2+D?~Ege!^`qyvY!h?do{LbW3y-CXqPrF*Qb{4MES&GH< zpZSgX{frga@9oD{3+8Y-&1B*N4-=?-{QH#md<1A?X3Mi0#)yo_4II%1wRuu?eQ&Ll z{$S9K#>lSq=ZB^d;x9r%FcNhnH?};4=T}Sf_k^1ji`tx&VHQ2)hl(hHZ9{$^8ivX} z^JyYSmZ4aIy)yAMKZTjLiD)azC$vKvufuk_g*Bi8YG|>fE%`vc8#EcAnmhzf1zY;D zej;%!=_wW$A&!#lj}qD$iM>{85?GRe5k{s)-4c<;rDm6W1XB$z5(jN;oPfEJ4)V*3 zc;`pf?M6Yu#UJ|F{u7?{84 zEPW{BY%ln?6hIy2G&Oo`2P!bK=KdIFmS+m=d{pSL9PTP3lseSYYyk0$K9RHZU-tWg zvOtqf6q#vz%2MHW$qrjb>eHnQsg>_Z;0t;MlV%1lQZ-_f@L;L;sZP51%EF5~HPJ${ z^vc2t(9y=Nqd>x~DlLue?XL<%{jS9f7g%oYFY_A~ELcV`Zz+9V>KpaZCSxHVQ!H(J zOdKx6y%CnmwzX)0{_V^!xglab?+nPrF9fPumU*=&3|qh-E>H@t zX}qBGl3;C)$&`@il>)Q5I(_RK6U^B6s+JuH@z%n!L~bu#YFWs-xrZSiC`A?p-BfV6 zWv$ikhUVhGQYpU6%1IikrieKI{$fU#S~2j(kp>w8ivE#7X8V2R$?r9Zu;htvG(^jM 
zCh;x9Q=Jya6>`{SvgP#Q{n*aiyJIi4)pC4w^&5b_?CbP?y*eqcZbd3=l%<`4?TsDem-Ex-zI+ubn-I@)oP1R>rfJ3SkD_>48c8qD_}#9975jJrM>?fyU}b>4-+>es=#}WwhpZoV6Yah>wOrFfLPP|r1vcLiK1Ha%t%AWM z@FWw*mLbThAD@_TDHY--qGDwuAtr9I?d7hC@|>LtElR5&t|+GT>0o?LHMUh$^@OsJ z#Hcfj8PsHS5A_n@RDjMwq;hW+Gv6KWtbekWK)ge}pBc;_Fc(I|;<3Ad)>%~!&YGJu|^>59liMTyBo3@OTQbd$Ce)&UE&qM?DkCvY!paPc@0{Dw@V zjHt7B(bxDr3mK~KxPCa7o}Pt-tj%Yl+qU?)Cim^@&YP5*3#q-wF-ItIGmoT^iCOXq zWG`;Q%d4zqo$7r_QKxM&cO>(4&Gnl-rpRTs$e5u4N?ZpSwRCXnN^RjCwZp7XVW({l zM`E{hp6I8#8s2mWHcj9d1z0w>h#qTu!*TpBqRyOO|nsXrK^Xy9{YQXlfUa3C* z)wrq?;8OUlJ9?w$PvEEX)o!rIF3i5Ee!^WaKYPiV9W%j>tYQB+3FG_mGcoO$@))E# z*Eqnh0P*_e{?vXqXlay-J#<{zhFFKxHxF6^;En*vX16qS_y#Py zoD0r6a|o-n9jvB=`P{xs`+%9@pcQxSy>~DP7U1+`q@}XbuQn|BMq+-hmAS^A*cwCZ0bw7 zZR?9&JZmI}bi=n*xm77c!rdfn9?)ONHi9ePY9|AMvKxp_pTCDEOD^;J+t;obTN7Dp z;8`z})oD@I5Ru74(`k-9-_jzL@XCCqooD~(8tDSIX?$P&l;&;VGYivXpfv)+M#5+| zMS|&VbGy!S64rNhZ6>0fX}r<`{}mJ*+&5_(5#aC9M!?Nm*mX9Z0^880;#|%BjgutQ zrRAp+6$HUfl@z4nFTayG8vhe4+T0iB+Dy1<@sm=#9tKA<#U!RlnFY49PL3_cVJ~z{ z(tHe0yk12$@9-DfCNEd{T)wt%#=hJ^bw?aeUV&*RvA7zl&fMbVg?S*D`^|D`kJ#XG zVks9jvP4$KXJOyW`j$&Sfw|v%JL<2wf0dJJSM zTyz)QMz^oQGBmJEZTM=7FB$3(5Ly|d>>~{_N_&Ky!hM^V%Uq@3)k$x5oZ0#Z^ZlTr zod`O#2M@VUYdhv6mfX^WVZXwDuHrnHKfy)Vk2ITuP9OS$b3z*8iQ)MMG_mRC_>Oh} zKQ(`v9)(pGfLwlHrie^7I2eFkfc#klu6~%RI0PW0Q@jb3SfRYaiC=C;Z{ypRY;cI^-g zzW&6!DA=nLet9}V49uGa*p5)5dSH-Oou5cQ2J#|+i0DTl*YceGRd*oR>JIaJ+$S9R zX|L9W%TSJWd*MW?y70jvK5*ElhxSfjB`4Y}!BX@tu>@7RA?ZOLVf}D4Gd^FNeK){D z@?o?|t}#}1UB+~5CN8zb;r4vxA>Z6k)b5b?d6;7+H!=J?Lx51uc);S789*Yswek56 zH4N9YwWyw&D`iG)Kq0!oaC4>Q=nmN_T!$$OYHF{^^!9DdXEVob1RxkXjleFzLE67~ z(#aX&09tM53hAVqFV=npe40DMM+>T0Jf<)->bF@4IxnQ0cjNnxz6Ez<6+AmgKUWJ$ zM2`)&h#1|hHScml|PCF60&sj&(#cloH3XibB z4|uzn=Enj=3oggu#<7ufL^lp0{Q*K+NW>oI5l$41sv+1QE67f4 z#wW}3#?sTN9B1iq7ij1$k|5SVI115K4id&K*lP~(&I2Z}s9pjh`4f@d)95;dZ$#N= zHiX|?plflkB@;;g45~6NMSikNOi51|t%^*aqyeID!)Hl}`wn(WeA)tW) z1ZZH=C5S-o<}t_!fS_;{=m>@D=4;o-w@joSK7CgNVoMAfu%`=b8Y6W&z+bV8ZRzTu ze6C+z%*qp!U(1e 
zAcn|(e4kKH-*D8~HA;=pk)G1?3p4K-L0kj60oHd`7Vg~q{2v{E08d}0ctt9k1#3Sg zA|(Y&DwVROWYUrfkfNp?Q9d#=H~({qOR8Ql4EdNlJjl-378_f1RY&#vWw+!f%#~gD zQB&UvrXLbMy$#gE+Al|XtVp*%GVvHkZ(VLq&d%FIfZZTQT1pBVxZ)wN*>*E&SZmMc zX*>8&rZCFWmOsMO+8S)3LTFSJJmA)?)x=d@r~C&wn$8l2oEK5NpP+Bo4ifA$2j#g9 z=-A1s=Pb(PY!lsO&rDi(U2wr+Cw8|Ca7n}gD6Knt}b9`;;31b`@_&fEPv!&W@PkH*W5?4}k} zQ9X=5NPqtH{=1A!((pxYaK>k!@pg|)8Z%U4;+@=^Hu3-PuCUPj<6X&&8&yzL%p3*E zBp<65^kGB%R@B=HN#azzTDxEFNM6FG_X#-Hvy8Y-IBVCrLm?Ny1Fl5H@{v}}1>;4* zxcGzDOV$>nIu7tFIzS!3Tjd{$4cTMpA+qU|+brIqZZs(=W8`gw?Q-}Qh{5ynf5Ct& zkCRWO+r34CrSq=EzQ|WN;Q-qn!Y|MrB=4|<>U`Xx?D0jC!l4|CY!t^OeNXtfyNT!X z9w3X1s~(q>GVnV7>2bEgmvhq@S>K-3E#fx_M z7nQF^k1@5hL~abYc_|r@ZfH-k{#p#r`zye@Ds4D8Rio8)&DW6ZDgqWzvn5b1*IJuV zAGY`arhimOLN1aN?jK0D-wL9xW#@(cMRg#!>_`YMlMS{vtxqqYMdd(JSa&{wLs9NP8$vf)2o6cScrF@!ofSsX@-u^y=;3W! z1WYl+Wmi>L2W>ZrB+&4E=E%pOX2t69cq$0e)7JZ)f;dh(*u^_|v^(_5`lw&0ln;7O z{`DB^#_b5A@p#I0n8Sh}LfgETl%O)X_dxONeJMI@I_n3hg>Dy(@u_9y1TW@GKwbSl$HW^rSjnn3A4 zWRK@+@hQ0@?OPu&b|(!_=IRRS?`2TzI`fUEEs|Si@(53H*Hm;u_xNe8SkyLsBbfYp z&2QiN^(@Ui9;VNzM7Y%-@AWO5^}_j>DS1t<&UzpYE*sd2jx7JG5r3HDM4}4GUV*YH zN>e9M@C0I8H%WJa@(H6j2f?0Z$@3A+7p3|dCv%v{LQLJy1lWSe*Us|qdfoA!Zqk*1 z{yT4!1RDjr9KHVx2Qy@%3_y|xgV0`|N0GPD#uHzA1|1va1V$qMutvKNpPNqWUUf5q zBSZLZ)_;A?V0($}gn{Wiza?gbcL;yl*sO|@efS?4A&Kjze64_>edx7&k`?;B>cQn^ z@5oCrxLR%Q@CX57RVhCGA}G$dNBeI^j30Q)$?X}_+9HRE9OmRh?$$^Rhz;pED5}+} zg*Fe^j$BQ^zl`AM5REkF{^;Z&#wMf}^yhxd-?FqC8gUcV$G2>wu=ZLRD^$y!(vki3 z%$4#mp+hD&3fuc4Ae?CRk1Lyy+=BNP4f?uw8SakawYa z#CMB9m zwnLjO_(mr1g&B$&9O3So6V?dTS|}9O`*mPP(>3M4(4?Fbc&XxDlsL?sJgD;n!KScw zLj_XN7JVMGIggSCqR?_nPV6S-%^v+h6=#aAkfQ(FIgwOQd-e6E6u??Z6}dTj?~Gmd zURNK;Db4(5tq7kqItSpH|BzNB8~LVb)kMF_;M`RvdYa52ukq({Rla+cj=e@CtH7E4 z1pV2FqwUqHamQv74K5Y>$GNpCja%z|6mz(d~U zO}C#$$e>Uj$h!~jVt#xhB;nF>_~X-KBE`OYyipeTI~=b-ip^sowAeM23%4JRP@ovq zYn%D%VM9;@ekgZ*`TOn`^xZa_;gv2B#p6?TEQRyR8#?_JMyd^=f|d6QE*E6u zht-M{DlQr0ap0lhMF7G@3ynqBL(yOPJW65PMHk<421jC|w-?4%dNC>)Diq#6X>=5H z^8Tq;COs^ZnjrbhtSv;(nJ6t%2!~BPTIfWK^~1c|`#NEn7 
zf--U3^OK&jMYj&*y)NOt1d!`)h);9$iv;NH9m-zHQaCUw#^94UPnF51yo-*%o~B*6 zz1I0W4rNbchoHjEsaU6Xi~}*~IxDD- z5Rg=+zM8~ZK@~k(S%V@-Jn>S)tL63pm9 zngZJ=QZTMG2(=GRw}<;I_pz9p`jAQekYEl>*}d6w!_A6%0zmo-z=mikaqAmqMh4#wVnxncp98MSJxwe#!kv&ce=c=NiCvx2|#nAt*PLoJU7RdfcBm3ZGS9X57gu1W0##+ zQ8Pm$UTd~eme&oxpxW+7%^X75a1Q8lb4DY+O%f9G$j{NZdaJy{YhW=>J`(e>@x@u2 zT!C?93d`1}hgePOa|)unQ$%Z>UGMI)P!zsjZa5`nVZO|@8=GN^y&;ydimG{}XlorC zR&=Z1fjgds9kQ1X)_FHoFrH+Ehf~+trN6?8Zc+O5ZBq*8sV`RA1%EP2Fm_Q0gi7W9 znAEo7b>p`RUAK3e2D3vOA=DBO{FZk5KG#YJNHwUv-tJz>(6-$#cQsmEavlMyc|)6m zvIV=G^#T%}cNnTedJNDix-2s7Qti9cQNrFeeiA>7?LpRdj;w~<{s}TTk+n9hW%;Ga zG<6&QF?s-g?`qp_3)vo&3nN%ogiO1JetyDf(2+IhMXV{E43?TbcU^9OA6PTT+_a+S z6iyaFx=$`!Gz*Y~uSvStT;L#?#J_m*td^$rq|)72*?rtVROw}p_{=S;x4P~N1_p+L z6B8sMATqozjuxyKvS;<{agR5@rNw3P@TGGo@6!zLCeR%6q_0n5dtFSMBeH9)Ka5wI zPxDji(W%hPIaLKks|ZpA9A$&QS|*yn2a%4ZayaRG7A8;Kzy%230BK=d>jZn5&4Qnv zam8yXw3jqLponA-W7Aqvr=xF!w1?z8`J1johGV#PX5Uy2-y|Ki%2!F9I7&{{ zIzc@_f~JDXkiMM0UwrhLY1EEmF&SZJE#=*#?h-cXY$-a+7nb|+CIdW%rnh6P0n9|V zGeFpB-`YG5ihWbvy@=(oS&bB1GsCt$>KeXrv2KI!FnfK6d_*U)>M#ssEH@Z|z$tooU;SiCNFOsFJ}Rrf z*Hdq1ZYkLQjK)3c9yY(^yRpnQ+79r9C0(Q|9ZtzyGNA{)yyyJ4_p*uec{g58aKVml zZVB28W%PM$rx(8pS@w3#C5cnNI5dSpXj}Z$x&MgMOtRg z@O zREQI80pM?Y5OL35ves9kqLT;cy(!3wde0VB{mj`k2^G&O3Dp6Kczi@#klCm}Z$hT` z>w?OP;;=sqyjDe2ztz0q@2v{N;`m%P?`?h`GOZO}k=haD9H%jt)<8&_5gICpXN)Vpx-z7% zS&WSrYwz@2{U#Y~Q4#8UlFwm9)f7QLx6+h$FqO|dV?MLe7Hwg!xWuQfqXJG#KKay8 zgI?|F6v)C2OI$3(*)3@nj~an>F0Go~z%ET^=b}oXm*Z{(X5E2KnaWbVdwm*qBXabS ziT|+xPklo4O96F&hgk=nS1D;7rg2ij_lnbbfoCHKpjaI86s2G@f!vB93w0C&NE9PV z1TEmx`Ihj0NbiY_gL?+)W-FWnA~JGtHbJYmAw}x&>lR7@4=?nb3ssKVBh1RV*u9do zeRypj+W@sF=`c$GaH>uz2y1d>LHoXJvl#b?SmXIkV_at>*5Dz}y)r6jyLIUjr>0KjVFNPp0{|v9rXF6SwFO1+=UqBTJWyY5#7lqnaFeoNyGfyug|>e@fAb}8pD+reAyrCmYiB(Hd9Fp+ z(>1EeBJw@^rF}r7dIoCC9t_)8_|w*BhH^O+6D6gDOYC})HU_CuIBX&O>n%`ILrkJ) z*HdDH`hF1ZcZ>`FQspFqC_44&TJ7o5mY^ax4Rps#A#++aVNsxeCwl_&$dc7pY?jtA z5xj#i8o?E2rG*=bVBQc*%MU4@IWmOAwdSK5-*5NZCZaqYE6lx*HKK*9lJQO7!VwTj 
zJSj;xEzoL-AY@*Rfsu%MtVMP*dKgoS1o?;hv9;k`aGdK&(QW_Cyt;%* zNOh*0c+1`j=Z||&u`+xb@!4NcZ`G{^8R?>Su1;Jw+I!Z8$XY%@(g9Bsv!2->#f&q$ zE)1WPUb-cpF#&oXr!WH34OEG#FD7f}j241#OmL`|g0_WfN)wGHRB>$@1n^Q5R(!3T z0KfRcZyLS^8xpRq)yL560Y3fyqYE3rf8q1^Sn>QG5%GCZ$OqQhDhgh6;Lnh6n=tQt z+-_gh-Tl1?IuLu+veQ{%{F&hsO!67xaP#%>cg8pKJue)F_FvsH6_BQnCSg? zSm%Z)iOU9G19yP`^8si`E_g5`RQGw8+Qz@e0nO#nD}oU~=_7v~Tm9wO|E~u`-1(Dn z!OJYnH#IdGj3&|E?iXigz)5HG;GK5-M8fyF)i2)7_}7jAlIeJVpI=)WuU|d_=vnN< zVG&O*8^k6~MMegNP9|kX#kl$vsN}e(C`h=hNNoxAQN^0TH6Qp&GrjR#r^37 zH$sGDEBo;4-?ad;JTAn6w#XcKMO9UFkGI>=D0M(DW*Ot4c-Mz(>(H-(N5KSMAvrmO z?G^+cXJBdR>A6Z&Q&UsE-@fU!J1k%|%e9|!zS+v=`yy3nS=&y*pnc2fta|sYnR>5R z8(tbCagxf!VD9nlr>V5VGTIH*+!A~6d1U=yOGK^Sm$Uu(Nmy?ctwN_-)WM-n$QhaY zGDdMW5Sr6`np8tr@st?%y0ih5+n94$3Ixft5n*jzJTiAORsGX49P zk05&?e#T7dH61W=;I}Z=i()#{9e0LNOYbG=fj}S}bE$ONqLG4zMhK{tfkeO)0i;>C zJM3XKQ>>)x@W@TWlx4uL#jlpA>U<6*VwUbN>%gj!s52MaP{OB#9je6p|2i_=j z6{r1=dTgt*WLR%|EfkyS1+{XY^ABIbCWyORWxuH75(KScBu!S8*R7LTdCCT#y!U>C z?M8Ta;MKL5bO)h8LZ0{ZaC&Bvh~_@e;~Vv#>7@R<@FJVs`+5=z#j)P`K)d`2pG@zH zz-I=;Cc4N5HBa>~@H(L2Z)I*uP%;6wfD4ILs21E+U0iHIs^FAKhqcP^X2@I6K7aLB z1K@zse#=^hju0OE`1>aR9D@3WXEZ|XYO(4Y70uDtgVj^-*^V^wyxF07HNtFru_E=p*0AU4G@rbj`Owx8gms82*PUsU#bPkao|OnNkfQ50)mq?1=)wlu#|vw)iE}?6p*@ zA|8sLGI>URKBdv5uwaD76S;1Or1)l1Z{>Qf;|=^A(M01(-V zdAVXgD*DA8Z#x4BrtU zElO{o90Av^>FbnQe?JF6S6-xd)$Ix^UcM|kj40UBbqynE;%rZ zPaZsSl3qcP;eGE`-vQk$3knN!J3H~Bxh`P(HIC#;^QDhOFfHQ}+^#6d{iHBAXh~X+ zKhYC9g)P1ko-N~zwxy#$t?D4w=A>4nzAuOroJ0Ke*q#5Swt^zNl(`;lPGWbmt%VR11mv ztOxPNw-!)Ip3LE3V0;#RnC$ZcE3_iQev9<|o1iWBm7^o8WR8BuI)jgj07djb=FuCA zSJW4Dp(v(wMZ~;?`GuF0+*$$+qC5hj&%n{3Z)hWzG+h9Ao43{Hpt`>nu;W^AQnyfu zX%H{b(hL>~TQ3^B_PySi*m|}9RWi?MeVYTi-9{Xr{<3kg=8!-w;|erQax5I+f2cr^ zSZXjk+)HUy`hkSq97({cY{hMxWhtqewRY=?Dxr%9y34J1i=wq^rih=G^N0Tcdob{N zeB%op0?0mUK%DP_sY06g#r3P=f8#o6pmmPH2IBmfL*Qw}UGkNu52v)D)pSj?aZy2p zUj9u^j+3kdb0H{fl{vGewft==q_P8Mpe7JC$4~q?E;`C<2nEOh-!)^Wr_Ehv`0W_{ z1L?!`9ro*ucb97uqFzctTXl(-w+bJuW8$1|ztBpY8fY&h@~G2*Boz+iE0{?h-Zo|k 
zpU&b1gbmpr#d#c;$MMdXq176st0ciF>HorsUJ?HZ`R6u;^bH>NZ{t>pux}8=HKd55 z!i^+_Io%ovpGFN_TO03-@BN6egN=`m#LL?~^?p0bF)gB3#|>&;cfX;l{es`P6RXCe z9%)pS{_}+xq?BVD#^dXd1faQw_c!T2C0uV#{eEl2gFd{pFLQrhxY7{!p7B#w zMl=VQ0dFxd-gOSaUNp3LY2y0BpQs6l&d`9s zih-TR3iMZCRs0fI?UM{&{~cI~16uXaPEC#cU-42_6c8^JM-#5RNVUM<-jls3mGOPO zB7h+M67solfEzxutJ$ZalmB&Gk2>&M!@!gxo0%MvHx!m(EG{@eMrQHWNu<Q^IXA9}+<;>_G931ol)gEAo(o<4?D4DRO{!u%J|5iK8 zuYuO@Ijya4B?{5-@bF-ekeaAJdD_D_K6LpLJ;N4!W#WHG)*ux2P;ILMN0;h^qwwrl z8+LJ#r@xvKg;k|Up?kz@FZVcGN&Kk&B?7|Nz0y{0B?z8^%2HIr1g+=+fod1wB`(bW zPwFMU0|!^He1;w%<^z1T~u_Zf?q#`8d3>4G}I8(#dupE0te@zEoWt2F4zEjdIP8QdJrySF z^yUeNA1RqPYJe~?iFtVNfeK_IA|gTBtv|yx6KJ{Q2q?@Ulm&));m9vc)GvgCeimG! zU)>K!Ki!cP`tjK_8p&68_@Lj^Jltq(5YB)MzJlcL!qE;Fk>j?0pAo~CGjer-{S5UI zvqV+*=K474evfXkt2Dk_)A}XaHola@0GsC zriN|ErT9hamt3qDT7L7!?YZ8*NuFZigiB^=euLt?JR5i#`>hZV3*|w~tfAGR`GL*V3`6k^}EIO4qvC%^} z5b=R?_XDptsOuSoCa4W*T`FOV&eDDJdUooWg4aCez`WNlCVeN+11>s@#qYH;g%%@$ z1wYf^C+ivyfc_K7m%*N*0@R2W3mVNIcO;D@xn z9Tm!2(_{3V9-ZbPDm(C|&Md!{k@fdKo}(V)R1!3Y-P9G>=IJBxcdnYef8nhsT#ogy zwl~~@KTp1Kt7(>8+zfPr-_x6W)aBE>q-}abmgi_4$3>ZE9(BD9cZBq!(3NXe3`FFX zlz=B9ia5roj5Pp_Cfw>G0?mTZf#*sDJ4DdL2gC8pcE(|4wZ+u=|4mRS%scp>gG#ee zGd|Z(1PI@Gn2t#tNYKVdBYM2zBu`j~i-D^AfEa7r#8s?u^A$lz`dUf0PH2C#g~f2jv4t3Kz>Qe`kGEM;YxVi5`9)2ON)MXOU>^?u(N`2_f!k)} zm;0UVLx>i$IOFiRUByrIWAE^vm`JCmacfw04?%Asg{tW3vtOdyA8DND*s+JBZ?O~T zkooPB!EkOuy>-RnLvJi%t5pp|i)h>d5z#?nCAN+NR~?Kr?@V;P*Bs`0U-(9Oikx_X z&by=C4$9_t030pu6swXJ0wZ-(jWo1{!|h_N38@isw(u{ZEM+IUQyJ@Ac+H8kd_Xuj zvg@SWGeG^fO2*C*(2UjOtsI<}hTs2Anorb{yy0(eHyK?hc3MJ9j|L_b5>rY|R1M`= z2HeKHrP}$ry=z7V_~Q#;HT)lj#gwa@6qr-NQBC-_LE{1^r(1G$8vZfE6a|2j+rowD z9XFE2#k601Zw)O8)aaflu}dA?0i{4Js`Qi5kLaftkG4YOtg4B;yJv~DmGG{FHV z3&x}CT*4gpSz$%jT`?uUmlxKwfecPNeKHaG93&zF_8rUGuOop}Yxscbfk`A{9{(w< zT6px@>|qc4{%>nL#8ULZN3@A0r|vr5es-ziEnI=0%#pQw7NpN2Qv;1j6|~O%Nf#C7 zB(hrRlgQOrJK6i(d?r5vdYQ~(i6NTYA5*uvKwl{EvKb@;pO)=gWE%TfSyo-0%%%8= z`~3QPu*3CdjqL_c$NfS1iBMtPB#ilha?(%LZDZklA&w@DIZ5=IF}ZNWU98$n9b#kd 
ze14@JPzueaOgy0+T;u({d+z=ZcCOlPnYj235T;-58MUil&wQK?J1#Yyu$cxUXTOF( zwqne47!2~KeDPJ~wM>_D*&(Yhz7}``QV%2jRbp)|;-?m_x$<-gCo-1lohZ9cO_Mhs z7b!oD#Yfv=@p>5=YZa+bnnZ$;Vk)Kaav7bP62JX0kN~MneJ77~HgO{QwV`>wOg=Z= z2qS;$!i_*Qtx=M|d@F5XStk94{gpk)imx;DDBDk&60RagTIHB}u7t-z$SwnFrnDuL zjIOFC2}hpLzR7NEw;+NrI;;2Fy$yejjlk(V-0$o42Jc1I3(&%)+oZd@y(OfjMf^)g zBAhn}mguF148W2oRi&>)CL2|KIt`N4hVdjrCD(!V*)PMN3m)H+hC8;vhZikVmcYKV z!Ce$y&(TG>{zVkqA#0m*5GC0jQE70(<8%y_ZonP3&Ko|t8$IHx&uF(5$tLOg+w~ou zs!wuyP!7~yOH}OIty;To$BN+)wt7Wet>TTpp(Q85nV(*O8_Wuqi7b&cGI+<5W^G2Z zGv|zH$$Qd&kN8vXAS>k4hAXb7`SfJo1K1tt!}wzbCiy%MkC?JI)KZb9RdETp>lcOZ zv`mdtSj$>uBC1bXZYjAKv7DDL^Ga_m%Lqel~qfAnJwm_G07EaEULZtJEipAB0n3!)}u-SG-!TZD1 z+<`*M1PUZJxsUZ9OA0iiAXWs7($uT-nHSj#0`3IGb;`tu44}`IYo8aOwU6JywL@LG zH{>)9B@GhP)C$EEVYRi}ww{OMi)9x7=Mj)%bx%gE(ke^UoJrq)VhM6`e$&c9R| z`|c1eY%y2H&YSNMOT5{lQ6CdVF?cP<-`%Z`N*o zjshiZd*eX8baLT{5MTiBTae=XSwN|v%Gs2VC2ZqXGJAu_G@+4!Nw&Su3x7z09h+Zh z#~!#CPm2!0`Hqf`#nSf!ljsdu*7dB(5HuEXPQ?L&)I83~pyl?!dzy%Iz%{`$dp89zlI^ zx%yQ>H6V46^p>f4L8&{stXOQ0LF-KjEE`?TVf2c`ynVMEsV$Hmq1O@e$LlRSGEz3- zAkJBa*f^%fhf^&i5tZe;YAQ$@xE|seC2;u1@tx`@92)J9^rg~8Pw*b`k*QpxgblCylT zJ_)MzzPHbdZB5<@x^NJSV)+T=^+XnI7(MYQLA#5qe8p2rYID9QSm1N3G!Bw1;gQC# zOJuC9>sJH}-v!dD^A($3NI2m{AUJLVZFS+MvfPh#fnOq=N2d+{h9&mFnL%~IIxRgQ>M;BU(qq#HV>f_wu*3=S;aV1tPcKF;7ts4D4S} zgYwJO8bwuBu}b;dm5qPg@WV9jtE!}P>hwtmf*+VMZxJ6`;|7Jbqr^VQNXZcErWmzenobWD{O-j{uW#@~oG}d#}nljJr@t4*XP9x~jBt`@*Ii>k1rYUFw ztPdGuz;B8l`NC;+>O}AC7=<3;yZs<4_NU-l6pts%t*e_hzq4Z$_S|F$zns^!Wv^_~ z>jBT;aFU>sl$Lg)Vq}{MS*Ci;VUix>fWdH4m*->8(GX%2CB_&xrQN01mi zNE7bfE+-cnl6)l=7b_@JD29(dO(y;sM@KB3Y~v#6B)z19oO{`FG5dyQzgAXySp~FE z9@B~T$+x0YW`+81tS!^a%rAl~juR#CM&^A!0dw6)bhWeA2eRy5q!tU#LkvWyB-?i4YtMG;Wv|jt! 
zk+Oz5I&Dvm8!jq2p<0IM;e`%m7PV8NKfyGuN5Ykhnh&xzW)91|IV(zUI8HL@_(t8O zez5cWT~)2B_C9Ux_PZf1%PdPi4N=C$E>)TfVpEmeDAwgVfRWdD)H^oUuq2f#Bc670PfCjzQlS@;_f*jTp@%1*mOM^-XNG+StZnU zKVC?NBpY5_T@B<%^MuLrJ}GJZbfO_>IqA=xu&623A$uIK=Wh|$HLL{x$7CA~m~7{1 z;P@qYxA+u=-?1d=`pXO}Qm~z}{*1xb7lPELlgu_II(SvUC7YlOFbQ+I!Z(jI zNT=^=enHa}3I-F)?6)S#|2SSl73eE8*@8Qc|AnV3Q!+w~{QiN{|7Lg%{%v@Du4#TN zER0cN^z;U46mVmwy{yn->^XPB>Ut zFcA?EQ#e0W?J%!kT<7k`^={$K4UW)K?T|Mrs`6+fQ>DQ$=_s}*^s{0QfHa8j2t~l5|W2VjAX?r^7zU6;L=UW+Z$8k zysV^mP+Rp?x476$Opf4zSwYRmb}pMW)d#zxR%lbQU=(MD;q>FcW8Jyjk|Jn(arDWl zW!Op`m)wL1Y0&zKV4ew_=V_taf8hkCY%Mt^^?KWOKgy-CV{}z6)bxJPnRftu763I1 z$ewP1bTs^L(vg2a0K`IFCsu7b?q)**TZLN^3%L9;P#)FM+adv`|2GnKIyHHM+gB)I zLUSim)%RkEt%+hmz$eDc(J3UZjZ;U8z` zB`-SXqfAZ*Lk!vLg(;;4y2=~4I40CsQS%Jw)?!gnJw%S$pFiG#Xfs0(`Z7N^n2W)7 zQm!0cuR(Whopjm53uxTSKqS@wO7o=sDc$hKhUklaemso&sJ=@$Tj1nP$8!Kvs=VpU z&hC@p;)N5sj+Z-TQZDuY1}~wfF-W6KgHqsIxb=?HMhD@+z2UEw5r-1)xQB6%H>_GJk7jyd57_a2?c^Z|U&_I~P#Rt&GqNtDe_iiJMP749e&8$FG zlVHz>>&-b;T_jnVa z!nvv^%_1|M;=c4rb zU78f?mns}mZZ`j;A4N`UA$*jM)B^iyEkF`rVVmx(PO8@4KXgk&K_zd44@NG~aCH}V=&Pa(I%OMa9@|cHn`59i1@dJtKNb=Zi zKV~-S#DAa#s+LknCl$pM!#ruATidw(q7g{u4J>gZ=029LP=DipbjMf;eo;IjWi(MG z7kT37FAJe+TruXtR9O_+(FK}~Fs@3iXNm6#Y3a*~1*O$5$}G~P4@uTQ(Ij{g0& zM?I{?$Y^ogHJy1KNUFV+{pHEHM*`$5u@DlzGI%}RABtJ$z>xHBw#WMVdhG+0iQ%v@ zX`8;jJp11W_$9L#P1#c{vL*0hVmvo445Ek1(8PejqE>%_0gl-MMIuYChrD4>jOfd3 zih0WtpP+QsHvr@|P^$(m`Rag4=OI}P70XjQhG9x5C2v19iAza3UU6iwKIEhaIE%voEb#!hd}P{UdXntr;WXNNSM@Zz1}-udq9xf61?IAU!(ocFD70~=PcL{F za0y_}i!x1<1^_Mj`p7lOa0?uV|H+Bgm|3Ts4>0dQ%dCJ!E)9VMUif-nfuu}W6>pY?3xtR%erAh^r|0RQYK`kEB-^h@fC1mtF^)Lkr z?^#koN&tyw+-_zqc>T+-_5l-Lzd?^>4z;ctt|ky)!X&Pkt|yzv?im-(WfXmTR8h~J5# zuTQ{TV2=DN$9dRSJOBW^aS6j3WhC8&(uaaF`rVs=qmUvxITFG9CBtw+=o`^Rj!m@k z%m72sR!Q%anBN(lP6q{^{Nma%mk=eFDu31H=H}T?^zKPRlGH!y=LQdrMdP@&t1YXP z-=Q+tYazW#m*DKGm3Bz<%%t7%c}}Yf9+g9pC4%T&nzVUa)sj-)0Pf22F!2Cui_#K4 zkvrOi@7(0BOHIatURZ_#9aWdz^Ek7|!DoL9r9U1zH#wG7^LD`} zly=%1B7@E$a_Cl@i3d}DYy*M3b|UB^=$oq#JR9n=(WI%UIJ~AiJ5)A)@^9ifJDXeZ 
zUl1|p3npFXgi)WFzyWhpGQU%T8yG#D@_#rGj)x(r5QiJMC_LyH0Fu_;gKuN{+T#%8 zHv?l*G=jJT{UmpDV81LJnirYtYC}g?52N7$G{c=R?yweO-@x6+FoIHu3R-UYI$cQ zo(QQeBs8=q02V#t3j%9>L&NO5RyaJM-hPYM69@W{9hfouZ+Je)_Oif4CMR_MR0w&t z2b!<1uS{@hc{x1q^DRk$m4QK?V(O;RdY7jr@kgQuK$j(^pct-Nh9>av`TpI6B2mK8 z(UFjiZLwUZLb~w83 zjzAvS;%$dsYLV~%WmeldiK(@4yk*kjlUL`Q{tyr3h2}=k|33GGFBjH7<+COQk=Z-n z2K z-pGo1*_)O5*K0_`g%~p^^a^e!j~9m|=F6V`q+On&_EUFA6S=eQ566-yTqWzc!G>ef z)+rXwS;(uYh1}k{;PQC90U9kk?~c>k$TzWedgK(KDzRf>oBV%aykca4m_YGEW4t1UoT{`U8*Cp%#8+gv(dq@?pszUe|CX29^lwv9o?4&y)3`vcUh_K$ zWI0xL`mH!_uX{Eko2I$jnK-~)aYMP*ch)?WbAaY93F*wiIW#zP=iuoR;%Z?bg^EJc zC@(vc53~iJl`*a>viz;5;EuS<1S{n+F;1berPP`|kR%w3dz-JYRn*S#W4_tGoHHFD z-v`3I8|o1t!vtC0_4Y9{Gb^Ml7TTQ(ACZZ&(Y`|}&x-WZV8lxICg8U$>fOsru0n;} zY3WM8qAvLL0nNT!syB;#JGNQIIkAZ-Q60yaowYIn!CW*P zCxqm)y9W?H8U5*6h3k$r6yX*7GWXgXQl=2qjylV$@a*sx$h5lAPZ-N({BIsUDrzz( z*YZ6QW)remSXmU2mb_Z~7@m$QWJM57;sN8k z@7vxb3w(Rg|L%F>@1H^Ig`U=sx~x5{CYVXFV)g^bae6?Gn(J+sS{C1;_z1znhD?#P zu0=uD2T6_mD(em#N)pWviZNL^K^$aYftVeQ$rUrPUJTDK{@QJnmls_ZfuXjpF1>5f z$@0a2rw68EsCqN&s4z)vidzos>%Vq4NlY^b@tg#VNNq|JlZ{!eygCLIbPMRDy<=8} zK~5G+>6qzdmL+0Pn$MSj5xsckNSJdJKzB>!GJ%wcE6I9ylqsuY*QFeKX%3{C_00bZ z0u2b|z)?$ZKwF@k<81(~(zJFo=)W~YWGI`*z0Yy|mj1*2QYpNczV0=ODA=pnbUSp# z8Y*F5j0Vaw&qC}u(MXu688OQV>4r_oZ>Uytv(&+%mC1S0q4gsJ&7{UcFeAgh_!ngl*Vg{86tRW|riSL`ZJ?)uuUa@Bl~V&MtU(^le7PultY}@!hfyLK zK`hVva74BQy2fn;&jN9*oS4$eY|-@e-?E)7QCXIltUJ0T<{+W_vs8G)Zl5Vxo#_IR zs5|~hd^MOpBZ}~&dEF{Ikm3cdP&|B`dbxK#2Klbn_OP_>E$t3$IV~>0eA!!-X8KrH zlWz)Td7}U5d>^wZQ+*Lex>q6H_{~u@)bRj&Wn@NG)9e&8ZCPw_LUUQI(2EWM+xmA# z7PL)_>b*w`wKUS5Nro+1Wh~|gYoKrwXHNr8`in;b_|5;LRCnX|KQki+OO(BXQ4!Bw z94Hw8GOQ?+N^`YaGbwI`P~vz4&5JipUL!iq86Q=o_#8oCSRiK!^C!Puc{C?Mj-~D` zBBq_zgj8hIjMbmjA!qS;#FMDYAKA`#TJRG&LWCiT zW=gPX3u67`86L;_ooLYxRr4B0?k$VGSog^<+tWw?G>)H7zcA4R;N&4Lu_@6?p!g?= zlc>$8mo*nl!x$6yk(Z*57F#coms^!TnUpS+e${DYl(ZJT6U^X{5N+SEh33dF&(!>` zL)$c%RN8_GiOA_wh@t{E#V1I(^~ALY0YenZKY(`2#YoByG|c+xX0X2ZlYN5S?Wp5)TYrq~iJhyf{kDc82%nI+8)6v&KTLxsGAv_{JCt#?Zn+e2=yJ&zk 
zX4wL?@i65=%1cNNc%&r6#bX8fSDLH^ZRuM@=w9$mztirQL^=t)`p4L=cc_}YrTqHr zPoa5wkb$Hlfr$zba2F%STw*wpNTWA1#4){{w)DxijY~0pc`M4KlOhRWnequ2j9al5 z=6%lglOE{m19ia^NVzG#^?{`q;~H%y?aXIn*Go)dcxcG9kqe13Bu$NO3DZHJ}IgZ-helAD^r#nlh zTc*{fu^5CVV2gIBm>(zIlISA_mLfMB{tRd6BL|Zx<|(70dW(jvsmrUAdF(iz;obLsQ4Nk!x~Cgp3Xtl~C;QbN2YSob@AQOTjHal-%5V36v04xx|U>7lPMHZpQn2wgW=z8G5NqP@f4RiJcp`*Vg z@%s~SRSk;xKssA>w6OeZM9pWnz))%iFyOzyB5!cOFc)jjx0e5&(Lwwf{1Ahzs{iG` zz_NN)5NdiugW;Bpf6X`x`y9}Xtim-GivIVax+L#6A)Pz$6MtC3190W{Kp$IFKtHps z1oX2%EAa^UK+cO|0zS?^d);_BkmljQ{9ogvQS^MTVgn5rQ-2@#b2CiM=lglL+QvUS z`$hun!_z2tQR>ge&uGDEYPuSAtZ4u20%*aGsYaj$!E>;|83`1?$fvH4O`*8s(g{Ra$sK2C54 z$Hbr{j{-Gv0(HIaAoFa1OqNlOb))reZ(X#H@;5*xEYQ02M~Df5=OyY(0*ylHB#=N; zQ&WSwdxCtD`@R3>Jvpu5GFRe*g-$|>cNbD$A?FW8ik$EnA_kL!NFB#>h_cnd&0lk` z%tvD?9-)ykPkIK&C{xOD*&7-+PuY|T`@6flV4_8gjk-V6J|3%)#OX@%t34<;Z*w*~ zaM_e~yz)1|XQ>Ra`|4$tt?3Qm|3N}4_oI*AR%8yMQhHxdQtH4hKcvQ_rBQ{hHJHz6 z(Y2iEnSM?2g~y_6p}ApEY?*^`kc^zl9b(B@d#g;`6c!0-+%1AW{bjo-hK zQ&8{&ib`j&e}qz3S10+RO|Y78<$bRxAWU6iuRSuVg&qZGkBBs|8 zA$W-glG=NP4C!Nk zWUSjQ_z>Pb5DJQqX_hNgw&!)KlR!n>fnXdW^Im72$h3k8yE zmeF0<*xW@Dff9>WKN#6AS^NZfTm=f38%xQUx7Pgc@VesIH|S=2#N1;^A6$?*zDl!Q zZZiM)s8B+Br73e4IL#KFAqdf>St5|+l*V8D@(I1hMupck<3n;pKMbfNi(+D7OCPBj z$L>;*B&@xk&}�Hg0KZ3MH5q6?$vMtqbnddoGmP*of=P|? 
zal24)7iOq@SNXBXi^H?%)ag_k_x{pbYhqy?ryIDz-#h+s zWJBMv@citxcZ50w3kzFP4aiKw;iT2BLYtF4+;4a)P!Kx_?Zwm5%Z&*F5w#$+A#1waPF)|X(=LA*{=TFkCT5&gi#=2ZID)mT~y@RG*7 zKebeh(d*B*GGya8u-*FPU5bA&w4Qlcq@5!os!C3`jus3Oscec7yZmP0|0YprA2;+# zikS(Mk*W<^uW2NYwqJ;(zY=))kyh!8PYrA53X2*dM;tqUPx5kf)RdEOPO=j3kHzwn zlb`WBGIW}4E@n@uDHaPU_e4@Bx}^@>4(>3i%k48sT-3lRqv&4`Z58kTK#%_r4}SJe z_*GTJGN&oOVkaUNZLb&HzztH4U%zrXW?;N#UXA)x@NsbWknl%{X{_7ayt22IN{XZ( ztx8SUKQijJCg=ih4wnzW-4Fd>bOz4* zBy=+KIt~U|RaMn+f4cC*@_OSlkwXCEG+M2BVh6v;LN@6AvuZx6hKeCc*a-iF-p`*q zBQPV8J+Z0EYVs1qRvn=}j36i^*rBY2?8L#z-RUZ`%`uuvuBtmz53d@`hhalt z9@+(qe{2^SOd$D+KIxdGQhc+j?E1ISDVUmQRX*@ET=8w;U<2=buu~li{vm^R5~V|H znkq4@WLBxcPrkf)c?6>14!p-ipEe3ccF- z{FHb!qKhMJU9uKp)VC3p#c%FOpfe)Y|KLnVY(*D52~ACH=kf3}1S87>Aynb?7h(Gs zi2NxUhL5bl^riPF9ec5K)`aQ7AI2ancx(o97HHhCh%DwR*eTrj{dc~yncbg)?<8(1 z5LBg~5W5zgQQ@I2VScwui76O*ur#K>$~?As^GRFqt`+mhHFiEg=6{uTmSIsv-5ysk zz>)5uOJE35N^<{Ua$0%YE?x3$0?NB7 z%m$oCxdvePSVA>>*=C68q%=3&FzRqVp^hgPrK9Z z^seq!-K_S^iN^g_*NWF)SjM&AGACWiC4cC1`|bEae;@2o??Kh0wKzhK2gLe4yR+2 z>>y1;s^dKb{9^%6Xp!fYUL>AmqQ6rL0{2<30;!+}>0Qjv{gmQiC*x?e2(h>f{jOPb zKH-{}uj5(TOS+d7V@yLXw1s?Iv0mhX`rp-yFVeL3=^rxsC5vT_yVL&SFCg^zB^%{v z%Q`svL!jv*bdS9RS9-+A(rvhx5veI{$j+f6I(7P3k_`())R^jU(H~+dLTxA9$ zCZ?12C*0?VJ;%;Q!uym(9%ZM#y7I8zBt}h%n*|aT`&C?iL>JIo!%#gh#H;N_VDPGH zzGcH>=uRz320sE!HOCSxnC%1GTq>Ro*vK4lvEe4#QI%&7bV*$J5tT z81qYIp@T7QRt^phdQ67aDd;*l?NZW_2Xp4eQoleRMhqxU+-DF!rGrG9BnbB41nP7R z#3`l$Whxsf@2%CgFlSwt(@lI$*l^mGr7tw zI9MZ(bNkS;j_kK#p-!1xx_$P~uvvvFWbV-_sB{&jDsq%GUW{(JiU~yvGLLwm=Hy>f z(M0b}RfC z$BC4VE(wVXr>~m5#6j`&ICUDVKQV~f@49+MkRT$sEOTg9q;WVQeJeF=R`})b#jlKv zR}@EX?6I8_#Oe*6#cbizzI6#AI#1SzlX}V{VoM1}FC|jd@Ytac1u_wql$~gIrqKp( z#_1s%j!PzTO^X6n&CkPESgiN2t{^bY>CHsN5k~$U5la^;xQ^hb?~Yp0J9m2u;MzXO z$hDKz^i2x7kr0CGWxal#u|+75d{FHu%cg%EC4(NnSwX1xXG#*|*`$IcS|A#A9~XW{tZ?e|z7_@NOE@6uB`!oIQ1NJC zze>L45avc(baH|-l+$3=6W^T^IPA#oBxM>d2KB@7P!5v=gFj02v`_|)f#;mcQY2C4 z2bNajO3W`Q?03{qZ>k|Fh<_G9fp_L+ab)WwQ+NXXrHflCxy$DeINd|`okx((U6G1w* 
zNg6pD{$_$7Lgv7h3sg;YA0;ZZ?%ej-MBd@&7cjLI>0`EoWr0UD=gv*`nNfQI#v+Rc zFQIJDt&UT1c>{MCX52HObTX=6CE(`W-QCIoB%{(tx`oT3Xsum+#3%Zl9vX-{G!u&Z z?zxfp;Jj<15_Qw7EhW(aI5$FFsh8QqKEKw@1)*T&gVC_goeZ@NS~SHB%jycpI6D?} zAI=N82EyLnV_(?%=A`~E{cK-5W#D?fo?ow53ID{O^Rg^iMr3dTb_Wqk!%SYmBIa_o zD8PVv{VAOVwH#Yj<=8R93yBE=_-}==8Ehcczve`0rp%eOj>W$O{w9XRWtwAle$-;` zFZTj_9ru9%Nn%BdzW0}OQ>}rwKHHC=UH5-PZV8Zb2IX3t?jO)iZDV0>!xHTyPt5w7*%I(H&^`WV2EBE_R=kc*SkjT4a* zGclq2v)MQ0_JmQEam@AO@V4Ws8U!|bZh*PSL7Bdr!XY6cb;f@zE_MeV5_%*gB$!O> zJlMOuIEzn>Y@iP~e&dKKUT+!T~Bsx z3c19^r))At-jk1MzaOb+i{OusSo{4vPK!QbPgJX0;fXx;_T-|zD%uEWDV5y6cQldu zK!#d*jlb8AcD|zzbFNN``X%)RkE?u|=N`%mD9a`n8t7{DUBGN zuo{Hc8i5Yl|BIGgu%v+Uv|+ThBOCsD^E(hTYWfJaz@AU<;Yj%>Up3usV9u!r`18vm%>qyZ7=_&zn{7wq8ND z(9z5QkHts&nwBQQG~KTW?=})lHy$RFgZCg&!VESgDiN|$hs(S!X9U0Iryyj}O_ldQ zk0$k=;p!`RCJFY}JY$j1n(+{~i}sI{-g8fEPOyZAS9EN@V8y?!V3G0}ukizRxv53> zw7gG~o~eG=5ub;rmBW))Rm2pl+{;-s7wJ^|S7Ud-)CesK0spfHH*ap~JR??}aJ{$L z(Gl%&2O5*NX`a^ZHQRi6bVNYc@LF9_QPR&(oSfT`#b=O*Gmg#rxJsEARZ2C&K1qW+ z=8+QwDc&aiJ|S=6#!JgShQ8RP3E{XiJZ3NRD142Ynqp4z?VavL*m)rGAc{-8Uvapj z;j8ym$kG-5u(2N<0FbANIAFeeM%diFoR{P%Zo>s9oIHuiilhA@Uh3jlEt_^u`$d`j zsyS!5Y8H2Y^hc7IFX=ceURsSLSFxWb{3hRO31loA(J0we#$oK+8++1zTY=}v;CPcN z_U3t*soJ|PdG&Yo6nh7D=PG;y=@0#MB6G(xo|<(&taaJ%1g{Ge+N%1rZJ*!NM-n;OyH%Wbi*on7bQKwwgWnfKpY zPcTG%^k+I+zNr)vT3lS5$6GO3bRpytMl<`aO4h-U43yzdZoW)cAX;O`d4L-hPl;{)Pb>0oV8Xt)MWWe#6dD~>N#|u zqmZX4#U>7X6R9~wxVn^q;SI?BYJC$-cu1XMQ!6B>L5sm>gNB$3^O}raYU3G$!8)^j z3n%EE3u!KMg?%T}@?46g5tAPt*@8)=l^=~DH~Ai+g|2VTPE1qv{RW@C*!PVz^crSe zSCcG*zU2|)=rGIG+L>x^mV|sDiXty>wwgLu zi|}PL`%9Q+f5=PP4VSXlq4wFiiLP^ZN%#O}yGo5fQqq18fPS2?ydeaz2ro`wdIKE4C~n$}imGM7OR2^(x?Q0Ujb zwhm-ep1{;KPo^5*nH?M|_r(lf2j7#aBVXi1IxC&E^w2!RKWw+Wz29jXTQ+SIcD~u# zLajNU6E|1Q!rxF`(aSrgn7L2=kmn0(M?rGtMszYbmJR97dt0mJqB2vuTc+>CX!!|& zU0F=pMmZG+W%xo9g_rDSGTMa_yPunjtaO2l%+j@(Ck?ETf- zE6X|50v#}<2Q=pu9K-l$*y z0)YsQLoe>^gpQuzx;W-csd3>)vPTrxc?w$4EjqkZz*>h+C<1F1^ zEPJx)oWUSYyjok5__a;sTk0(>-uThp(GuJCzdHd$F9L`yw*x2b! 
zhe3mIHH^G9c{M}mt)n&6+g4q_MHs)1nH-EZ3XF!5B4{0Z#QT_&7JA#)l3>CO7)^ao zvfJ+sUz~5aPk-TIj=$w~9Fz}vK)lu~^Ye1$=t(BEg6SeR4Z9OErG1w4F=}qE!r8Sz zC{fV1e(q=JdaHK!dMO--m&j^Y`_c1guCM(kX!5BrF>I4PJY0n$lcr{ix_@e6bG(DF zcfyB)mD7z*zmz7bn6>M4rUK{w6)}vfv3m;qM>IU?A@|5~p4?4^7;Bj5{Wz@*n|BnmqcE9i*#)cGZts*u}RV&0r8X#|(&z_V|5Qcg+)`yi$Y4%4u= zXoj)y61PEiXIM61?A6=oj$OQ>Zk%8u)I*Su`T9g>5|8_R7Oe$^k4Jk^_O@%=S1Ia^ z?h%Bgn;PH2IG4?*?h&3}I_`I*GIz^|%H*UZ9MeV6@6zWW^%gFK(MxPf(hfPa9eQu| z!|huRTQ;1jSib#`yK@resrT{qnSpOOlz1WpJ9M(*q>FNOATu%djIu@!k?`pG4Ifrd zBiptMpT4KwKII_)=YXj7lH?Ebgp0`SP|EBSbnGZfZ#(_0V-(LRZ)}|WO$fNZ%z??s z;BAr^B7Uj(ai;Y=U-NMp=uwP072MdqEIaA&an!0>q15hopTm|F4$Wt-kNCXZ#l3Qm zykxw`A=T^E27^-?n!a0FLj70gwEgqsb9)Z7pL|9=)RUg#-)vla&h`33KP1+Kf(e=) zTxV$F;#gWDtJ@`>xM9jiD(@+DSn!(dPHw+&@O3y*Xos5b8_#od3R@- zi_`HrE*ETc$h|CAINibK9Tk?tlGC=C%~RmDlUFK#PLjydjgNx%JV3le>3$hN+b7{S zWz^Ol$mZ)(o8)d*hvu)ynlIcp+CB*sS+}qJr$^qOK}uS?S^~plcP#^n3M8Y_42ojY zo3Z+)wf*Kig5p|S1>p@V;pbJ;|Iqg5W^8qjZ^@i9@h}S Date: Sat, 26 Feb 2022 17:19:18 +0800 Subject: [PATCH 58/81] Enable test_word2vec_stand_alone_script by using sys.executable for python --- gensim/test/test_word2vec.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index 09def2c733..c7b0da6b7d 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -14,6 +14,7 @@ import bz2 import sys import tempfile +import subprocess import numpy as np @@ -27,6 +28,7 @@ from gensim import utils from gensim.models import word2vec, keyedvectors +from gensim.utils import check_output from gensim.test.utils import ( datapath, get_tmpfile, temporary_file, common_texts as sentences, LeeCorpus, lee_corpus_list, @@ -1168,15 +1170,18 @@ def test_path_line_sentences_one_file(self): # endclass TestWord2VecSentenceIterators -# TODO: get correct path to Python binary -# class TestWord2VecScripts(unittest.TestCase): -# def test_word2vec_stand_alone_script(self): -# """Does Word2Vec 
script launch standalone?""" -# cmd = 'python -m gensim.scripts.word2vec_standalone -train ' + datapath('testcorpus.txt') + \ -# ' -output vec.txt -size 200 -sample 1e-4 -binary 0 -iter 3 -min_count 1' -# output = check_output(cmd, stderr=PIPE) -# self.assertEqual(output, '0') -# #endclass TestWord2VecScripts + +class TestWord2VecScripts(unittest.TestCase): + def test_word2vec_stand_alone_script(self): + """Does Word2Vec script launch standalone?""" + cmd = [ + sys.executable, '-m', 'gensim.scripts.word2vec_standalone', + '-train', datapath('testcorpus.txt'), + '-output', 'vec.txt', '-size', '200', '-sample', '1e-4', + '-binary', '0', '-iter', '3', '-min_count', '1', + ] + output = check_output(args=cmd, stderr=subprocess.PIPE) + self.assertEqual(output, b'') if not hasattr(TestWord2VecModel, 'assertLess'): From 168a9efbff98b5ef61bc564506f8483cfb5c8e7e Mon Sep 17 00:00:00 2001 From: Paul Wise Date: Sat, 26 Feb 2022 15:30:12 +0800 Subject: [PATCH 59/81] Use gensim.test.utils datapath() to construct paths to the test data Makes the code more readable and consistent with other tests. 
--- gensim/test/test_lee.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/gensim/test/test_lee.py b/gensim/test/test_lee.py index 1eadd398a9..c8a592d539 100644 --- a/gensim/test/test_lee.py +++ b/gensim/test/test_lee.py @@ -25,7 +25,6 @@ from __future__ import with_statement import logging -import os.path import unittest from functools import partial @@ -34,6 +33,7 @@ from gensim import corpora, models, utils, matutils from gensim.parsing.preprocessing import preprocess_documents, preprocess_string, DEFAULT_FILTERS +from gensim.test.utils import datapath bg_corpus = None corpus = None @@ -45,24 +45,23 @@ def setUp(self): """setup lee test corpora""" global bg_corpus, corpus, human_sim_vector, bg_corpus2, corpus2 - pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - bg_corpus_file = 'lee_background.cor' - corpus_file = 'lee.cor' - sim_file = 'similarities0-1.txt' + bg_corpus_file = datapath('lee_background.cor') + corpus_file = datapath('lee.cor') + sim_file = datapath('similarities0-1.txt') # read in the corpora latin1 = partial(utils.to_unicode, encoding='latin1') - with utils.open(os.path.join(pre_path, bg_corpus_file), 'rb') as f: + with utils.open(bg_corpus_file, 'rb') as f: bg_corpus = preprocess_documents(latin1(line) for line in f) - with utils.open(os.path.join(pre_path, corpus_file), 'rb') as f: + with utils.open(corpus_file, 'rb') as f: corpus = preprocess_documents(latin1(line) for line in f) - with utils.open(os.path.join(pre_path, bg_corpus_file), 'rb') as f: + with utils.open(bg_corpus_file, 'rb') as f: bg_corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f] - with utils.open(os.path.join(pre_path, corpus_file), 'rb') as f: + with utils.open(corpus_file, 'rb') as f: corpus2 = [preprocess_string(latin1(s), filters=DEFAULT_FILTERS[:-1]) for s in f] # read the human similarity data - sim_matrix = np.loadtxt(os.path.join(pre_path, sim_file)) + sim_matrix = 
np.loadtxt(sim_file) sim_m_size = np.shape(sim_matrix)[0] human_sim_vector = sim_matrix[np.triu_indices(sim_m_size, 1)] From dfe740f45529dfa6862b89f334a0fdb95698c5be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 2 Apr 2022 10:28:05 +0200 Subject: [PATCH 60/81] fixes #3315: clean up evaluate_word_pairs --- gensim/models/keyedvectors.py | 36 +++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 0dd043c2df..1cfb564249 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -256,6 +256,9 @@ def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None): self.mapfile_path = mapfile_path + def __str__(self): + return f"{self.__class__.__name__}" + def _load_specials(self, *args, **kwargs): """Handle special requirements of `.load()` protocol, usually up-converting older versions.""" super(KeyedVectors, self)._load_specials(*args, **kwargs) @@ -1471,16 +1474,13 @@ def evaluate_word_pairs( similarity_model = [] oov = 0 - original_key_to_index = self.key_to_index - self.key_to_index = ok_vocab - - with utils.open(pairs, 'rb') as fin: - for line_no, line in enumerate(fin): - line = utils.to_unicode(line) - if line.startswith('#'): - # May be a comment - continue - else: + original_key_to_index, self.key_to_index = self.key_to_index, ok_vocab + try: + with utils.open(pairs, 'rb') as fin: + for line_no, line in enumerate(fin): + line = utils.to_unicode(line) + if not line or line.startswith('#'): # Ignore lines with comments. 
+ continue try: if case_insensitive: a, b, sim = [word.upper() for word in line.split(delimiter)] @@ -1490,19 +1490,27 @@ def evaluate_word_pairs( except (ValueError, TypeError): logger.info('Skipping invalid line #%d in %s', line_no, pairs) continue + if a not in ok_vocab or b not in ok_vocab: oov += 1 if dummy4unknown: logger.debug('Zero similarity for line #%d with OOV words: %s', line_no, line.strip()) similarity_model.append(0.0) similarity_gold.append(sim) - continue else: - logger.debug('Skipping line #%d with OOV words: %s', line_no, line.strip()) - continue + logger.info('Skipping line #%d with OOV words: %s', line_no, line.strip()) + continue similarity_gold.append(sim) # Similarity from the dataset similarity_model.append(self.similarity(a, b)) # Similarity from the model - self.key_to_index = original_key_to_index + finally: + self.key_to_index = original_key_to_index + + assert len(similarity_gold) == len(similarity_model) + if not similarity_gold: + raise ValueError( + f"No valid similarity judgements found in {pairs}: either invalid format or " + f"all are out-of-vocabulary in {self}" + ) spearman = stats.spearmanr(similarity_gold, similarity_model) pearson = stats.pearsonr(similarity_gold, similarity_model) if dummy4unknown: From 4c9671afd3ec95819bfaf141dd48efff6676a7c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sat, 2 Apr 2022 10:34:20 +0200 Subject: [PATCH 61/81] allow non-utf8 encoding in evaluate_word_pairs --- gensim/models/keyedvectors.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 1cfb564249..54fa631778 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1423,7 +1423,8 @@ def log_evaluate_word_pairs(pearson, spearman, oov, pairs): logger.info('Pairs with unknown words ratio: %.1f%%', oov) def evaluate_word_pairs( - self, pairs, delimiter='\t', restrict_vocab=300000, 
case_insensitive=True, dummy4unknown=False, + self, pairs, delimiter='\t', encoding='utf8', + restrict_vocab=300000, case_insensitive=True, dummy4unknown=False, ): """Compute correlation of the model with human similarity judgments. @@ -1476,9 +1477,8 @@ def evaluate_word_pairs( original_key_to_index, self.key_to_index = self.key_to_index, ok_vocab try: - with utils.open(pairs, 'rb') as fin: + with utils.open(pairs, encoding=encoding) as fin: for line_no, line in enumerate(fin): - line = utils.to_unicode(line) if not line or line.startswith('#'): # Ignore lines with comments. continue try: From 998074e5147e5d8e4d09c6334b99fc56e425dfb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 15 Apr 2022 14:21:36 +0200 Subject: [PATCH 62/81] Update keyedvectors.py --- gensim/models/keyedvectors.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index bf1d81ed80..b7fb2fe820 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -1860,7 +1860,10 @@ def _add_bytes_to_kv(kv, counts, chunk, vocab_size, vector_size, datatype, unico return processed_words, chunk[start:] -def _word2vec_read_binary(fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size, encoding="utf-8"): +def _word2vec_read_binary( + fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size, + encoding="utf-8", + ): chunk = b'' tot_processed_words = 0 From 662e38061dc4a972d81c743647f864e37bd7109c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 15 Apr 2022 14:23:04 +0200 Subject: [PATCH 63/81] Update test_translation_matrix.py --- gensim/test/test_translation_matrix.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py index 0cb4682013..2ccd61f597 100644 --- 
a/gensim/test/test_translation_matrix.py +++ b/gensim/test/test_translation_matrix.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # encoding: utf-8 -import sys + from collections import namedtuple import unittest import logging From edaeee9e112bf1a95825ca73bd3a212d1a3ae43d Mon Sep 17 00:00:00 2001 From: Ziang Ren Date: Fri, 15 Apr 2022 08:26:04 -0400 Subject: [PATCH 64/81] Added encoding='utf-8' keyword argument to TextDirectoryCorpus. Used smart_open to replace builtin open. (#3317) --- gensim/corpora/textcorpus.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/gensim/corpora/textcorpus.py b/gensim/corpora/textcorpus.py index c2b8b620bf..b4406c248a 100644 --- a/gensim/corpora/textcorpus.py +++ b/gensim/corpora/textcorpus.py @@ -33,7 +33,6 @@ """ - from __future__ import with_statement import logging @@ -50,6 +49,8 @@ ) from gensim.utils import deaccent, simple_tokenize +from smart_open import open + logger = logging.getLogger(__name__) @@ -399,7 +400,7 @@ class TextDirectoryCorpus(TextCorpus): """ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_depth=None, - pattern=None, exclude_pattern=None, lines_are_documents=False, **kwargs): + pattern=None, exclude_pattern=None, lines_are_documents=False, encoding='utf-8', **kwargs): """ Parameters @@ -423,6 +424,8 @@ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_dept Regex to use for file name exclusion, all files matching this pattern will be ignored. lines_are_documents : bool, optional If True - each line is considered a document, otherwise - each file is one document. + encoding : str, optional + Encoding used to read the specified file or files in the specified directory. kwargs: keyword arguments passed through to the `TextCorpus` constructor. See :meth:`gemsim.corpora.textcorpus.TextCorpus.__init__` docstring for more details on these. 
@@ -432,6 +435,7 @@ def __init__(self, input, dictionary=None, metadata=False, min_depth=0, max_dept self.pattern = pattern self.exclude_pattern = exclude_pattern self.lines_are_documents = lines_are_documents + self.encoding = encoding super(TextDirectoryCorpus, self).__init__(input, dictionary, metadata, **kwargs) @property @@ -510,7 +514,7 @@ def getstream(self): """ num_texts = 0 for path in self.iter_filepaths(): - with open(path, 'rt') as f: + with open(path, 'rt', encoding=self.encoding) as f: if self.lines_are_documents: for line in f: yield line.strip() From d872c02849af37812991ed72d69c8ed5725d1563 Mon Sep 17 00:00:00 2001 From: Radim Rehurek Date: Tue, 19 Apr 2022 08:20:27 +0200 Subject: [PATCH 65/81] retrained nb on Linux --- docs/notebooks/doc2vec-wikipedia.ipynb | 448 ++++++++++++++----------- 1 file changed, 246 insertions(+), 202 deletions(-) diff --git a/docs/notebooks/doc2vec-wikipedia.ipynb b/docs/notebooks/doc2vec-wikipedia.ipynb index 6add0580f9..ca07bc40ec 100644 --- a/docs/notebooks/doc2vec-wikipedia.ipynb +++ b/docs/notebooks/doc2vec-wikipedia.ipynb @@ -58,7 +58,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "First, download the dump of all Wikipedia articles from [here](http://download.wikimedia.org/enwiki/). You want the file named `enwiki-latest-pages-articles.xml.bz2`.\n", + "First, download the dump of all Wikipedia articles from [here](http://download.wikimedia.org/enwiki/latest). You want the file named `enwiki-latest-pages-articles.xml.bz2`.\n", "\n", "Second, convert that Wikipedia article dump from the arcane Wikimedia XML format into a plain text file. 
This will make the subsequent training faster and also allow easy inspection of the data = \"input eyeballing\".\n", "\n", @@ -74,20 +74,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Volumes/work/workspace/gensim/trunk/gensim/utils.py:1332: UserWarning: detected OSX with python3.8+; aliasing chunkize to chunkize_serial\n", - " warnings.warn(\"detected %s; aliasing chunkize to chunkize_serial\" % entity)\n", - "2022-03-17 21:15:32,118 : INFO : processing article #0: 'Anarchism' (6538 tokens)\n", - "2022-03-17 21:30:00,138 : INFO : processing article #500000: 'Spiritual Formation Bible' (54 tokens)\n", - "2022-03-17 21:40:22,219 : INFO : processing article #1000000: 'Adolf von Liebenberg' (52 tokens)\n", - "2022-03-17 21:49:43,825 : INFO : processing article #1500000: 'Small nucleolar RNA U6-53/MBII-28' (123 tokens)\n", - "2022-03-17 21:59:23,620 : INFO : processing article #2000000: 'Xie Fei' (50 tokens)\n", - "2022-03-17 22:09:17,460 : INFO : processing article #2500000: 'Rhein, Saskatchewan' (185 tokens)\n", - "2022-03-17 22:19:39,293 : INFO : processing article #3000000: 'Kunyinsky District' (969 tokens)\n", - "2022-03-17 22:30:41,221 : INFO : processing article #3500000: 'Lake Saint-Charles' (555 tokens)\n", - "2022-03-17 22:41:17,487 : INFO : processing article #4000000: 'Mahāyānasaṃgraha' (612 tokens)\n", - "2022-03-17 22:52:27,834 : INFO : processing article #4500000: 'Liriomyza trifolii' (1493 tokens)\n", - "2022-03-17 23:04:41,464 : INFO : processing article #5000000: 'Daniel O. 
Griffin' (594 tokens)\n", - "2022-03-17 23:08:58,451 : INFO : finished iterating over Wikipedia corpus of 5176019 documents with 2996051328 positions (total 21837336 articles, 3072543084 positions before pruning articles shorter than 50 words)\n" + "2022-04-16 11:23:20,663 : INFO : processing article #0: 'Anarchism' (6540 tokens)\n", + "2022-04-16 11:30:53,798 : INFO : processing article #500000: 'Onward Muslim Soldiers' (517 tokens)\n", + "2022-04-16 11:36:14,662 : INFO : processing article #1000000: 'Push Upstairs' (354 tokens)\n", + "2022-04-16 11:40:59,785 : INFO : processing article #1500000: 'Small nucleolar RNA Z278' (113 tokens)\n", + "2022-04-16 11:45:58,630 : INFO : processing article #2000000: '1925–26 Boston Bruins season' (556 tokens)\n", + "2022-04-16 11:51:03,737 : INFO : processing article #2500000: 'Tessier, Saskatchewan' (119 tokens)\n", + "2022-04-16 11:56:20,254 : INFO : processing article #3000000: 'Sebezhsky District' (908 tokens)\n", + "2022-04-16 12:01:59,089 : INFO : processing article #3500000: 'Niko Peleshi' (248 tokens)\n", + "2022-04-16 12:07:23,184 : INFO : processing article #4000000: 'Kudoa gunterae' (109 tokens)\n", + "2022-04-16 12:13:08,024 : INFO : processing article #4500000: 'Danko (singer)' (699 tokens)\n", + "2022-04-16 12:19:33,734 : INFO : processing article #5000000: 'Lada West Togliatti' (253 tokens)\n", + "2022-04-16 12:22:20,928 : INFO : finished iterating over Wikipedia corpus of 5205168 documents with 3016298486 positions (total 21961341 articles, 3093120544 positions before pruning articles shorter than 50 words)\n" ] } ], @@ -111,9 +109,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The above takes about 2 hours on my 2021 M1 MacbookPro, and creates a new ~5.8 GB file named `wiki.txt.gz`. 
We're compressing the text into `.gz` (GZIP) right away to save on disk space, using the [smart_open](https://github.com/RaRe-Technologies/smart_open) library.\n", + "The above took about 1 hour and created a new ~5.8 GB file named `wiki.txt.gz`. Note the output text was transparently compressed into `.gz` (GZIP) right away, using the [smart_open](https://github.com/RaRe-Technologies/smart_open) library, to save on disk space.\n", "\n", - "Next we'll set up a stream to load the preprocessed articles from `wiki.txt.gz` one by one, in the format expected by Doc2Vec, ready for training. We don't want to load everything into RAM at once, because that would blow up the memory. And it is not necessary – Gensim can handle streamed training data:" + "Next we'll set up a document stream to load the preprocessed articles from `wiki.txt.gz` one by one, in the format expected by Doc2Vec, ready for training. We don't want to load everything into RAM at once, because that would blow up the memory. And it is not necessary – Gensim can handle streamed input training data:" ] }, { @@ -173,7 +171,7 @@ "source": [ "The original paper had a vocabulary size of 915,715 word types, so we'll try to match it by setting `max_final_vocab` to 1,000,000 in the Doc2vec constructor.\n", "\n", - "Other critical parameters were left unspecified in the paper, so we'll go with a default window size of five (a prediction window of 5 tokens to either side), and downsampling of frequent words at 1e-5. It looks like the authors tried vector dimensionality of 100, 300, 1,000 & 10,000 in the paper (with 10k dims performing the best), but I'll only train with 200 dimensions here, to keep RAM in check on my laptop.\n", + "Other critical parameters were left unspecified in the paper, so we'll go with a window size of eight (a prediction window of 8 tokens to either side). 
It looks like the authors tried vector dimensionality of 100, 300, 1,000 & 10,000 in the paper (with 10k dims performing the best), but I'll only train with 200 dimensions here, to keep the RAM in check on my laptop.\n", "\n", "Feel free to tinker with these values yourself if you like:" ] @@ -189,27 +187,34 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-03-23 11:46:26,539 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-23T11:46:26.539501', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n", - "2022-03-23 11:46:26,541 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-03-23T11:46:26.541772', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'created'}\n" + "2022-04-18 12:05:46,344 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-04-18T12:05:46.344471', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'created'}\n", + "2022-04-18 12:05:46,345 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec', 'datetime': '2022-04-18T12:05:46.345716', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'created'}\n" ] } ], "source": [ - "workers = multiprocessing.cpu_count() # train with 10 threads on my 10-core laptop\n", + "workers = 20 # multiprocessing.cpu_count() - 1 # leave one core for the OS & other stuff\n", "\n", "# PV-DBOW: paragraph vector in distributed bag of words mode\n", "model_dbow = Doc2Vec(\n", " dm=0, dbow_words=1, # dbow_words=1 to train word vectors at the same time too, not only DBOW\n", - " 
vector_size=200, window=8, sample=1e-5, epochs=10, workers=workers, max_final_vocab=1000000,\n", + " vector_size=200, window=8, epochs=10, workers=workers, max_final_vocab=1000000,\n", ")\n", "\n", "# PV-DM: paragraph vector in distributed memory mode\n", "model_dm = Doc2Vec(\n", - " dm=1, dm_concat=0, dm_mean=1, # use average of context word vectors to train DM\n", - " vector_size=200, window=8, sample=1e-5, epochs=10, workers=workers, max_final_vocab=1000000,\n", + " dm=1, dm_mean=1, # use average of context word vectors to train DM\n", + " vector_size=200, window=8, epochs=10, workers=workers, max_final_vocab=1000000,\n", ")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Run one pass through the Wikipedia corpus, to collect the 1M vocabulary and initialize the doc2vec models:" + ] + }, { "cell_type": "code", "execution_count": 6, @@ -219,37 +224,37 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-03-23 11:46:26,921 : INFO : collecting all words and their counts\n", - "2022-03-23 11:46:26,926 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags\n", - "2022-03-23 11:48:29,646 : INFO : PROGRESS: at example #500000, processed 654950164 words (5336561 words/s), 3222179 word types, 500000 tags\n", - "2022-03-23 11:49:40,672 : INFO : PROGRESS: at example #1000000, processed 1018611068 words (5119067 words/s), 4480366 word types, 1000000 tags\n", - "2022-03-23 11:50:36,816 : INFO : PROGRESS: at example #1500000, processed 1305140647 words (5103506 words/s), 5420104 word types, 1500000 tags\n", - "2022-03-23 11:51:25,894 : INFO : PROGRESS: at example #2000000, processed 1550245240 words (4994178 words/s), 6188355 word types, 2000000 tags\n", - "2022-03-23 11:52:14,324 : INFO : PROGRESS: at example #2500000, processed 1790661139 words (4964223 words/s), 6941128 word types, 2500000 tags\n", - "2022-03-23 11:53:02,465 : INFO : PROGRESS: at example #3000000, processed 2028261627 words (4935656 words/s), 
7664997 word types, 3000000 tags\n", - "2022-03-23 11:53:52,510 : INFO : PROGRESS: at example #3500000, processed 2264063867 words (4711784 words/s), 8347719 word types, 3500000 tags\n", - "2022-03-23 11:54:39,637 : INFO : PROGRESS: at example #4000000, processed 2488354257 words (4759368 words/s), 8971529 word types, 4000000 tags\n", - "2022-03-23 11:55:25,929 : INFO : PROGRESS: at example #4500000, processed 2703313059 words (4643620 words/s), 9605666 word types, 4500000 tags\n", - "2022-03-23 11:56:13,244 : INFO : PROGRESS: at example #5000000, processed 2925111571 words (4687730 words/s), 10217554 word types, 5000000 tags\n", - "2022-03-23 11:56:43,409 : INFO : collected 10427023 word types and 5176019 unique tags from a corpus of 5176019 examples and 2996051328 words\n", - "2022-03-23 11:56:46,967 : INFO : Doc2Vec lifecycle event {'msg': 'max_final_vocab=1000000 and min_count=5 resulted in calc_min_count=23, effective_min_count=23', 'datetime': '2022-03-23T11:56:46.967882', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-23 11:56:46,968 : INFO : Creating a fresh vocabulary\n", - "2022-03-23 11:56:50,535 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 retains 991887 unique words (9.51% of original 10427023, drops 9435136)', 'datetime': '2022-03-23T11:56:50.535964', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-23 11:56:50,536 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 leaves 2968296495 word corpus (99.07% of original 2996051328, drops 27754833)', 'datetime': '2022-03-23T11:56:50.536397', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 
'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-23 11:56:53,313 : INFO : deleting the raw counts dictionary of 10427023 items\n", - "2022-03-23 11:56:53,376 : INFO : sample=1e-05 downsamples 4155 most-common words\n", - "2022-03-23 11:56:53,376 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 1194754612.050565 word corpus (40.3%% of prior 2968296495)', 'datetime': '2022-03-23T11:56:53.376525', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'prepare_vocab'}\n", - "2022-03-23 11:56:58,203 : INFO : estimated required memory for 991887 words and 200 dimensions: 7258981700 bytes\n", - "2022-03-23 11:56:58,204 : INFO : resetting layer weights\n", - "2022-03-23 11:57:02,030 : INFO : resetting layer weights\n" + "2022-04-18 12:05:47,311 : INFO : collecting all words and their counts\n", + "2022-04-18 12:05:47,313 : INFO : PROGRESS: at example #0, processed 0 words (0 words/s), 0 word types, 0 tags\n", + "2022-04-18 12:07:35,880 : INFO : PROGRESS: at example #500000, processed 656884578 words (6050478 words/s), 3221051 word types, 500000 tags\n", + "2022-04-18 12:08:38,784 : INFO : PROGRESS: at example #1000000, processed 1021477892 words (5796084 words/s), 4478830 word types, 1000000 tags\n", + "2022-04-18 12:09:29,607 : INFO : PROGRESS: at example #1500000, processed 1308608477 words (5649726 words/s), 5419923 word types, 1500000 tags\n", + "2022-04-18 12:10:13,477 : INFO : PROGRESS: at example #2000000, processed 1554211349 words (5598537 words/s), 6190970 word types, 2000000 tags\n", + "2022-04-18 12:10:56,549 : INFO : PROGRESS: at example #2500000, processed 1794853915 words (5587147 words/s), 6943275 word types, 2500000 tags\n", + "2022-04-18 12:11:39,668 : INFO : PROGRESS: at example #3000000, processed 2032520202 words (5511955 words/s), 7668721 word types, 3000000 tags\n", + "2022-04-18 12:12:23,192 
: INFO : PROGRESS: at example #3500000, processed 2268859232 words (5430192 words/s), 8352590 word types, 3500000 tags\n", + "2022-04-18 12:13:02,526 : INFO : PROGRESS: at example #4000000, processed 2493668037 words (5715482 words/s), 8977844 word types, 4000000 tags\n", + "2022-04-18 12:13:42,550 : INFO : PROGRESS: at example #4500000, processed 2709484503 words (5392235 words/s), 9612299 word types, 4500000 tags\n", + "2022-04-18 12:14:21,813 : INFO : PROGRESS: at example #5000000, processed 2932680226 words (5684768 words/s), 10226832 word types, 5000000 tags\n", + "2022-04-18 12:14:51,346 : INFO : collected 10469247 word types and 5205168 unique tags from a corpus of 5205168 examples and 3016298486 words\n", + "2022-04-18 12:14:55,076 : INFO : Doc2Vec lifecycle event {'msg': 'max_final_vocab=1000000 and min_count=5 resulted in calc_min_count=23, effective_min_count=23', 'datetime': '2022-04-18T12:14:55.076153', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", + "2022-04-18 12:14:55,076 : INFO : Creating a fresh vocabulary\n", + "2022-04-18 12:14:58,906 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 retains 996522 unique words (9.52% of original 10469247, drops 9472725)', 'datetime': '2022-04-18T12:14:58.906148', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", + "2022-04-18 12:14:58,906 : INFO : Doc2Vec lifecycle event {'msg': 'effective_min_count=23 leaves 2988436691 word corpus (99.08% of original 3016298486, drops 27861795)', 'datetime': '2022-04-18T12:14:58.906730', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", + "2022-04-18 12:15:01,747 : INFO 
: deleting the raw counts dictionary of 10469247 items\n", + "2022-04-18 12:15:01,860 : INFO : sample=0.001 downsamples 23 most-common words\n", + "2022-04-18 12:15:01,861 : INFO : Doc2Vec lifecycle event {'msg': 'downsampling leaves estimated 2431447874.2898555 word corpus (81.4%% of prior 2988436691)', 'datetime': '2022-04-18T12:15:01.861332', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}\n", + "2022-04-18 12:15:07,001 : INFO : estimated required memory for 996522 words and 200 dimensions: 7297864200 bytes\n", + "2022-04-18 12:15:07,002 : INFO : resetting layer weights\n", + "2022-04-18 12:15:10,247 : INFO : resetting layer weights\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "Doc2Vec\n" + "Doc2Vec\n", + "Doc2Vec\n" ] } ], @@ -267,7 +272,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now we’re ready to train Doc2Vec on the entirety of the English Wikipedia. **Warning!** Training this DBOW model takes ~9 hours, and DM ~4 hours, on my 2021 laptop." + "Now we’re ready to train Doc2Vec on the entirety of the English Wikipedia. **Warning!** Training this DBOW model takes ~14 hours, and DM ~6 hours, on my 2020 Linux machine." 
] }, { @@ -281,43 +286,54 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-03-23 11:57:10,178 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 10 workers on 991887 vocabulary and 200 features, using sg=1 hs=0 sample=1e-05 negative=5 window=8 shrink_windows=True', 'datetime': '2022-03-23T11:57:10.178111', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", - "2022-03-23 11:57:11,217 : INFO : EPOCH 0 - PROGRESS: at 0.00% examples, 2586 words/s, in_qsize 8, out_qsize 0\n", - "2022-03-23 12:27:11,295 : INFO : EPOCH 0 - PROGRESS: at 45.70% examples, 379076 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-23 12:48:46,182 : INFO : EPOCH 0: training on 2996051328 raw words (1198675664 effective words) took 3096.0s, 387174 effective words/s\n", - "2022-03-23 12:48:47,191 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 407128 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-23 13:18:47,205 : INFO : EPOCH 1 - PROGRESS: at 48.78% examples, 396088 words/s, in_qsize 18, out_qsize 1\n", - "2022-03-23 13:39:22,059 : INFO : EPOCH 1: training on 2996051328 raw words (1198698563 effective words) took 3035.8s, 394848 effective words/s\n", - "2022-03-23 13:39:23,077 : INFO : EPOCH 2 - PROGRESS: at 0.00% examples, 382976 words/s, in_qsize 19, out_qsize 1\n", - "2022-03-23 14:09:23,122 : INFO : EPOCH 2 - PROGRESS: at 50.09% examples, 403393 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-23 14:29:15,186 : INFO : EPOCH 2: training on 2996051328 raw words (1198677402 effective words) took 2993.1s, 400483 effective words/s\n", - "2022-03-23 14:29:16,190 : INFO : EPOCH 3 - PROGRESS: at 0.00% examples, 388313 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-23 14:59:16,204 : INFO : EPOCH 3 - PROGRESS: at 51.43% examples, 410626 words/s, in_qsize 19, out_qsize 1\n", - "2022-03-23 15:18:21,724 : INFO : EPOCH 3: training on 2996051328 raw 
words (1198678276 effective words) took 2946.5s, 406814 effective words/s\n", - "2022-03-23 15:18:22,733 : INFO : EPOCH 4 - PROGRESS: at 0.00% examples, 401973 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-23 15:48:22,713 : INFO : EPOCH 4 - PROGRESS: at 51.08% examples, 408593 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-23 16:07:40,817 : INFO : EPOCH 4: training on 2996051328 raw words (1198689651 effective words) took 2959.1s, 405082 effective words/s\n", - "2022-03-23 16:07:41,822 : INFO : EPOCH 5 - PROGRESS: at 0.00% examples, 396007 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-23 16:37:41,889 : INFO : EPOCH 5 - PROGRESS: at 50.83% examples, 407325 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-23 16:57:06,451 : INFO : EPOCH 5: training on 2996051328 raw words (1198721998 effective words) took 2965.6s, 404214 effective words/s\n", - "2022-03-23 16:57:07,456 : INFO : EPOCH 6 - PROGRESS: at 0.00% examples, 385000 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-23 17:27:07,478 : INFO : EPOCH 6 - PROGRESS: at 51.20% examples, 409302 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-23 17:47:19,873 : INFO : EPOCH 6: training on 2996051328 raw words (1198686792 effective words) took 3013.4s, 397785 effective words/s\n", - "2022-03-23 17:47:20,887 : INFO : EPOCH 7 - PROGRESS: at 0.00% examples, 360937 words/s, in_qsize 19, out_qsize 0\n", - "2022-03-23 18:17:20,839 : INFO : EPOCH 7 - PROGRESS: at 43.05% examples, 365059 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-23 18:41:23,903 : INFO : EPOCH 7: training on 2996051328 raw words (1198695760 effective words) took 3244.0s, 369508 effective words/s\n", - "2022-03-23 18:41:24,920 : INFO : EPOCH 8 - PROGRESS: at 0.00% examples, 378286 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-23 19:11:24,921 : INFO : EPOCH 8 - PROGRESS: at 48.73% examples, 395820 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-23 19:33:01,653 : INFO : EPOCH 8: training on 2996051328 raw words (1198722784 effective words) took 
3097.7s, 386971 effective words/s\n", - "2022-03-23 19:33:02,670 : INFO : EPOCH 9 - PROGRESS: at 0.00% examples, 374366 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-23 20:03:02,691 : INFO : EPOCH 9 - PROGRESS: at 47.88% examples, 391016 words/s, in_qsize 20, out_qsize 0\n", - "2022-03-23 20:24:57,297 : INFO : EPOCH 9: training on 2996051328 raw words (1198712861 effective words) took 3115.6s, 384741 effective words/s\n", - "2022-03-23 20:24:57,299 : INFO : Doc2Vec lifecycle event {'msg': 'training on 29960513280 raw words (11986959751 effective words) took 30466.9s, 393442 effective words/s', 'datetime': '2022-03-23T20:24:57.299122', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n" + "2022-04-18 12:15:13,503 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 20 workers on 996522 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=5 window=8 shrink_windows=True', 'datetime': '2022-04-18T12:15:13.503265', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'train'}\n", + "2022-04-18 12:15:14,566 : INFO : EPOCH 0 - PROGRESS: at 0.00% examples, 299399 words/s, in_qsize 38, out_qsize 1\n", + "2022-04-18 12:45:14,574 : INFO : EPOCH 0 - PROGRESS: at 20.47% examples, 469454 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 13:15:14,578 : INFO : EPOCH 0 - PROGRESS: at 61.04% examples, 470927 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 13:40:53,256 : INFO : EPOCH 0: training on 3016298486 raw words (2421756111 effective words) took 5139.7s, 471184 effective words/s\n", + "2022-04-18 13:40:54,274 : INFO : EPOCH 1 - PROGRESS: at 0.00% examples, 401497 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 14:10:54,283 : INFO : EPOCH 1 - PROGRESS: at 21.90% examples, 488616 words/s, in_qsize 39, out_qsize 
0\n", + "2022-04-18 14:40:54,290 : INFO : EPOCH 1 - PROGRESS: at 63.73% examples, 485374 words/s, in_qsize 40, out_qsize 0\n", + "2022-04-18 15:04:11,566 : INFO : EPOCH 1: training on 3016298486 raw words (2421755370 effective words) took 4998.3s, 484515 effective words/s\n", + "2022-04-18 15:04:12,590 : INFO : EPOCH 2 - PROGRESS: at 0.00% examples, 413109 words/s, in_qsize 38, out_qsize 2\n", + "2022-04-18 15:34:12,592 : INFO : EPOCH 2 - PROGRESS: at 21.94% examples, 489186 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 16:04:12,595 : INFO : EPOCH 2 - PROGRESS: at 64.02% examples, 487045 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 16:27:13,124 : INFO : EPOCH 2: training on 3016298486 raw words (2421749843 effective words) took 4981.6s, 486143 effective words/s\n", + "2022-04-18 16:27:14,132 : INFO : EPOCH 3 - PROGRESS: at 0.00% examples, 425720 words/s, in_qsize 37, out_qsize 0\n", + "2022-04-18 16:57:14,170 : INFO : EPOCH 3 - PROGRESS: at 22.16% examples, 492364 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 17:27:14,181 : INFO : EPOCH 3 - PROGRESS: at 64.36% examples, 489039 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 17:49:58,875 : INFO : EPOCH 3: training on 3016298486 raw words (2421759041 effective words) took 4965.7s, 487693 effective words/s\n", + "2022-04-18 17:49:59,888 : INFO : EPOCH 4 - PROGRESS: at 0.00% examples, 405295 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 18:19:59,893 : INFO : EPOCH 4 - PROGRESS: at 21.95% examples, 489379 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 18:49:59,917 : INFO : EPOCH 4 - PROGRESS: at 63.77% examples, 485582 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 19:13:19,358 : INFO : EPOCH 4: training on 3016298486 raw words (2421753794 effective words) took 5000.5s, 484304 effective words/s\n", + "2022-04-18 19:13:20,362 : INFO : EPOCH 5 - PROGRESS: at 0.00% examples, 417569 words/s, in_qsize 38, out_qsize 1\n", + "2022-04-18 19:43:20,366 : INFO : EPOCH 5 - PROGRESS: at 22.18% 
examples, 492529 words/s, in_qsize 40, out_qsize 0\n", + "2022-04-18 20:13:20,367 : INFO : EPOCH 5 - PROGRESS: at 64.36% examples, 489058 words/s, in_qsize 39, out_qsize 1\n", + "2022-04-18 20:36:01,806 : INFO : EPOCH 5: training on 3016298486 raw words (2421774390 effective words) took 4962.4s, 488021 effective words/s\n", + "2022-04-18 20:36:02,845 : INFO : EPOCH 6 - PROGRESS: at 0.00% examples, 376602 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 21:06:02,845 : INFO : EPOCH 6 - PROGRESS: at 21.77% examples, 486989 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 21:36:02,858 : INFO : EPOCH 6 - PROGRESS: at 63.44% examples, 483745 words/s, in_qsize 40, out_qsize 0\n", + "2022-04-18 21:59:40,920 : INFO : EPOCH 6: training on 3016298486 raw words (2421753569 effective words) took 5019.1s, 482507 effective words/s\n", + "2022-04-18 21:59:41,945 : INFO : EPOCH 7 - PROGRESS: at 0.00% examples, 410164 words/s, in_qsize 38, out_qsize 1\n", + "2022-04-18 22:29:41,989 : INFO : EPOCH 7 - PROGRESS: at 22.09% examples, 491334 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 22:59:42,000 : INFO : EPOCH 7 - PROGRESS: at 64.16% examples, 487826 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 23:22:40,504 : INFO : EPOCH 7: training on 3016298486 raw words (2421770259 effective words) took 4979.6s, 486340 effective words/s\n", + "2022-04-18 23:22:41,509 : INFO : EPOCH 8 - PROGRESS: at 0.00% examples, 294981 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-18 23:52:41,532 : INFO : EPOCH 8 - PROGRESS: at 21.64% examples, 485279 words/s, in_qsize 40, out_qsize 0\n", + "2022-04-19 00:22:41,533 : INFO : EPOCH 8 - PROGRESS: at 63.05% examples, 481687 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 00:46:43,879 : INFO : EPOCH 8: training on 3016298486 raw words (2421753439 effective words) took 5043.4s, 480185 effective words/s\n", + "2022-04-19 00:46:44,905 : INFO : EPOCH 9 - PROGRESS: at 0.00% examples, 383709 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 
01:16:44,926 : INFO : EPOCH 9 - PROGRESS: at 21.82% examples, 487579 words/s, in_qsize 40, out_qsize 0\n", + "2022-04-19 01:46:44,928 : INFO : EPOCH 9 - PROGRESS: at 63.44% examples, 483731 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 02:10:25,029 : INFO : EPOCH 9: training on 3016298486 raw words (2421762745 effective words) took 5021.1s, 482313 effective words/s\n", + "2022-04-19 02:10:25,030 : INFO : Doc2Vec lifecycle event {'msg': 'training on 30162984860 raw words (24217588561 effective words) took 50111.5s, 483274 effective words/s', 'datetime': '2022-04-19T02:10:25.030386', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'train'}\n" ] } ], "source": [ - "# Train DBOW doc2vec incl. word vectors\n", + "# Train DBOW doc2vec incl. word vectors.\n", + "# Report progress every ½ hour.\n", "model_dbow.train(documents, total_examples=model_dbow.corpus_count, epochs=model_dbow.epochs, report_delay=30*60)" ] }, @@ -330,32 +346,43 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-03-23 21:06:50,772 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 10 workers on 991887 vocabulary and 200 features, using sg=0 hs=0 sample=1e-05 negative=5 window=8 shrink_windows=True', 'datetime': '2022-03-23T21:06:50.772480', 'gensim': '4.1.3.dev0', 'python': '3.8.9 (default, Oct 26 2021, 07:25:53) \\n[Clang 13.0.0 (clang-1300.0.29.30)]', 'platform': 'macOS-12.2.1-arm64-arm-64bit', 'event': 'train'}\n", - "2022-03-23 21:06:51,779 : INFO : EPOCH 0 - PROGRESS: at 0.01% examples, 774441 words/s, in_qsize 0, out_qsize 0\n", - "2022-03-23 21:28:47,548 : INFO : EPOCH 0: training on 2996051328 raw words (1198677606 effective words) took 1316.7s, 910333 effective words/s\n", - "2022-03-23 21:28:48,576 : INFO : EPOCH 1 - PROGRESS: at 0.02% examples, 1133718 words/s, in_qsize 0, out_qsize 0\n" - ] - }, - { - "ename": "KeyboardInterrupt", - "evalue": "", 
- "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "Input \u001b[0;32mIn [8]\u001b[0m, in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Train DM doc2vec\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[43mmodel_dm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdocuments\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtotal_examples\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_dm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcorpus_count\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmodel_dm\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mepochs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreport_delay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m30\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m60\u001b[39;49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/Volumes/work/workspace/gensim/trunk/gensim/models/doc2vec.py:516\u001b[0m, in \u001b[0;36mDoc2Vec.train\u001b[0;34m(self, corpus_iterable, corpus_file, total_examples, total_words, epochs, start_alpha, end_alpha, word_count, queue_factor, report_delay, callbacks, **kwargs)\u001b[0m\n\u001b[1;32m 513\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124moffsets\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m offsets\n\u001b[1;32m 514\u001b[0m kwargs[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mstart_doctags\u001b[39m\u001b[38;5;124m'\u001b[39m] \u001b[38;5;241m=\u001b[39m start_doctags\n\u001b[0;32m--> 516\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mDoc2Vec\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 517\u001b[0m \u001b[43m \u001b[49m\u001b[43mcorpus_iterable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus_iterable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcorpus_file\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcorpus_file\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 518\u001b[0m \u001b[43m \u001b[49m\u001b[43mtotal_examples\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal_examples\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtotal_words\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal_words\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 519\u001b[0m \u001b[43m \u001b[49m\u001b[43mepochs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mepochs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstart_alpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstart_alpha\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mend_alpha\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mend_alpha\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mword_count\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mword_count\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 520\u001b[0m \u001b[43m \u001b[49m\u001b[43mqueue_factor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mqueue_factor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreport_delay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreport_delay\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/Volumes/work/workspace/gensim/trunk/gensim/models/word2vec.py:1070\u001b[0m, 
in \u001b[0;36mWord2Vec.train\u001b[0;34m(self, corpus_iterable, corpus_file, total_examples, total_words, epochs, start_alpha, end_alpha, word_count, queue_factor, report_delay, compute_loss, callbacks, **kwargs)\u001b[0m\n\u001b[1;32m 1067\u001b[0m callback\u001b[38;5;241m.\u001b[39mon_epoch_begin(\u001b[38;5;28mself\u001b[39m)\n\u001b[1;32m 1069\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m corpus_iterable \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m-> 1070\u001b[0m trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_train_epoch\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1071\u001b[0m \u001b[43m \u001b[49m\u001b[43mcorpus_iterable\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcur_epoch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcur_epoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtotal_examples\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal_examples\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1072\u001b[0m \u001b[43m \u001b[49m\u001b[43mtotal_words\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal_words\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mqueue_factor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mqueue_factor\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mreport_delay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreport_delay\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1073\u001b[0m \u001b[43m \u001b[49m\u001b[43mcallbacks\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallbacks\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1074\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 1075\u001b[0m 
trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_train_epoch_corpusfile(\n\u001b[1;32m 1076\u001b[0m corpus_file, cur_epoch\u001b[38;5;241m=\u001b[39mcur_epoch, total_examples\u001b[38;5;241m=\u001b[39mtotal_examples, total_words\u001b[38;5;241m=\u001b[39mtotal_words,\n\u001b[1;32m 1077\u001b[0m callbacks\u001b[38;5;241m=\u001b[39mcallbacks, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", - "File \u001b[0;32m/Volumes/work/workspace/gensim/trunk/gensim/models/word2vec.py:1431\u001b[0m, in \u001b[0;36mWord2Vec._train_epoch\u001b[0;34m(self, data_iterable, cur_epoch, total_examples, total_words, queue_factor, report_delay, callbacks)\u001b[0m\n\u001b[1;32m 1428\u001b[0m thread\u001b[38;5;241m.\u001b[39mdaemon \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m \u001b[38;5;66;03m# make interrupting the process with ctrl+c easier\u001b[39;00m\n\u001b[1;32m 1429\u001b[0m thread\u001b[38;5;241m.\u001b[39mstart()\n\u001b[0;32m-> 1431\u001b[0m trained_word_count, raw_word_count, job_tally \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_log_epoch_progress\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1432\u001b[0m \u001b[43m \u001b[49m\u001b[43mprogress_queue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mjob_queue\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcur_epoch\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcur_epoch\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtotal_examples\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal_examples\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1433\u001b[0m \u001b[43m \u001b[49m\u001b[43mtotal_words\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtotal_words\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mreport_delay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreport_delay\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_corpus_file_mode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 1434\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1436\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m trained_word_count, raw_word_count, job_tally\n", - "File \u001b[0;32m/Volumes/work/workspace/gensim/trunk/gensim/models/word2vec.py:1286\u001b[0m, in \u001b[0;36mWord2Vec._log_epoch_progress\u001b[0;34m(self, progress_queue, job_queue, cur_epoch, total_examples, total_words, report_delay, is_corpus_file_mode)\u001b[0m\n\u001b[1;32m 1283\u001b[0m unfinished_worker_count \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mworkers\n\u001b[1;32m 1285\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m unfinished_worker_count \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m-> 1286\u001b[0m report \u001b[38;5;241m=\u001b[39m \u001b[43mprogress_queue\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m \u001b[38;5;66;03m# blocks if workers too slow\u001b[39;00m\n\u001b[1;32m 1287\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m report \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m: \u001b[38;5;66;03m# a thread reporting that it finished\u001b[39;00m\n\u001b[1;32m 1288\u001b[0m unfinished_worker_count \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m\n", - "File \u001b[0;32m/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/queue.py:170\u001b[0m, in \u001b[0;36mQueue.get\u001b[0;34m(self, block, timeout)\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m 
\u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 169\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_qsize():\n\u001b[0;32m--> 170\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnot_empty\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m timeout \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mtimeout\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m must be a non-negative number\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m/Library/Developer/CommandLineTools/Library/Frameworks/Python3.framework/Versions/3.8/lib/python3.8/threading.py:302\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m 300\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m: \u001b[38;5;66;03m# restore state no matter what (e.g., KeyboardInterrupt)\u001b[39;00m\n\u001b[1;32m 301\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 302\u001b[0m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 303\u001b[0m gotit \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m 304\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " + "2022-04-19 02:10:25,033 : INFO : Doc2Vec lifecycle event {'msg': 'training model with 20 workers on 996522 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=8 shrink_windows=True', 'datetime': '2022-04-19T02:10:25.033682', 
'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'train'}\n", + "2022-04-19 02:10:26,039 : INFO : EPOCH 0 - PROGRESS: at 0.01% examples, 1154750 words/s, in_qsize 0, out_qsize 2\n", + "2022-04-19 02:40:26,040 : INFO : EPOCH 0 - PROGRESS: at 83.97% examples, 1182619 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 02:44:58,625 : INFO : EPOCH 0: training on 3016298486 raw words (2421749575 effective words) took 2073.6s, 1167903 effective words/s\n", + "2022-04-19 02:44:59,635 : INFO : EPOCH 1 - PROGRESS: at 0.01% examples, 1565065 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 03:14:59,636 : INFO : EPOCH 1 - PROGRESS: at 84.22% examples, 1185115 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 03:19:27,814 : INFO : EPOCH 1: training on 3016298486 raw words (2421738810 effective words) took 2069.2s, 1170383 effective words/s\n", + "2022-04-19 03:19:28,819 : INFO : EPOCH 2 - PROGRESS: at 0.01% examples, 1582102 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 03:49:28,822 : INFO : EPOCH 2 - PROGRESS: at 84.33% examples, 1186338 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 03:53:55,901 : INFO : EPOCH 2: training on 3016298486 raw words (2421754027 effective words) took 2068.1s, 1171014 effective words/s\n", + "2022-04-19 03:53:56,905 : INFO : EPOCH 3 - PROGRESS: at 0.01% examples, 1586215 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 04:23:56,914 : INFO : EPOCH 3 - PROGRESS: at 84.30% examples, 1186028 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 04:28:23,932 : INFO : EPOCH 3: training on 3016298486 raw words (2421734506 effective words) took 2068.0s, 1171036 effective words/s\n", + "2022-04-19 04:28:24,943 : INFO : EPOCH 4 - PROGRESS: at 0.01% examples, 1594202 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 04:58:24,946 : INFO : EPOCH 4 - PROGRESS: at 84.53% examples, 1188348 words/s, in_qsize 39, out_qsize 0\n", + 
"2022-04-19 05:02:49,190 : INFO : EPOCH 4: training on 3016298486 raw words (2421739011 effective words) took 2065.3s, 1172611 effective words/s\n", + "2022-04-19 05:02:50,203 : INFO : EPOCH 5 - PROGRESS: at 0.01% examples, 1590285 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 05:32:50,205 : INFO : EPOCH 5 - PROGRESS: at 84.51% examples, 1188165 words/s, in_qsize 38, out_qsize 0\n", + "2022-04-19 05:37:12,922 : INFO : EPOCH 5: training on 3016298486 raw words (2421759651 effective words) took 2063.7s, 1173488 effective words/s\n", + "2022-04-19 05:37:13,928 : INFO : EPOCH 6 - PROGRESS: at 0.01% examples, 1574494 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 06:07:13,930 : INFO : EPOCH 6 - PROGRESS: at 84.61% examples, 1189231 words/s, in_qsize 40, out_qsize 0\n", + "2022-04-19 06:11:35,588 : INFO : EPOCH 6: training on 3016298486 raw words (2421751669 effective words) took 2062.7s, 1174090 effective words/s\n", + "2022-04-19 06:11:36,605 : INFO : EPOCH 7 - PROGRESS: at 0.01% examples, 1584768 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 06:41:36,617 : INFO : EPOCH 7 - PROGRESS: at 84.50% examples, 1188066 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 06:46:00,286 : INFO : EPOCH 7: training on 3016298486 raw words (2421751802 effective words) took 2064.7s, 1172935 effective words/s\n", + "2022-04-19 06:46:01,290 : INFO : EPOCH 8 - PROGRESS: at 0.01% examples, 1610826 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 07:16:01,295 : INFO : EPOCH 8 - PROGRESS: at 84.71% examples, 1190249 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 07:20:20,193 : INFO : EPOCH 8: training on 3016298486 raw words (2421731383 effective words) took 2059.9s, 1175653 effective words/s\n", + "2022-04-19 07:20:21,198 : INFO : EPOCH 9 - PROGRESS: at 0.01% examples, 1591209 words/s, in_qsize 0, out_qsize 0\n", + "2022-04-19 07:50:21,200 : INFO : EPOCH 9 - PROGRESS: at 84.65% examples, 1189549 words/s, in_qsize 39, out_qsize 0\n", + "2022-04-19 07:54:42,812 : INFO : 
EPOCH 9: training on 3016298486 raw words (2421765551 effective words) took 2062.6s, 1174124 effective words/s\n", + "2022-04-19 07:54:42,813 : INFO : Doc2Vec lifecycle event {'msg': 'training on 30162984860 raw words (24217475985 effective words) took 20657.8s, 1172317 effective words/s', 'datetime': '2022-04-19T07:54:42.813436', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'train'}\n" ] } ], "source": [ - "# Train DM doc2vec\n", + "# Train DM doc2vec.\n", "model_dm.train(documents, total_examples=model_dm.corpus_count, epochs=model_dm.epochs, report_delay=30*60)" ] }, @@ -363,7 +390,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Similarity interface" + "## Finding similar documents" ] }, { @@ -384,48 +411,48 @@ "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "[('Pattern recognition', 0.7641374468803406),\n", - " ('Multi-task learning', 0.7290244698524475),\n", - " ('Supervised learning', 0.7212514877319336),\n", - " ('Incremental learning', 0.7164462208747864),\n", - " ('Deep learning', 0.7093881964683533),\n", - " ('Predictive analytics', 0.7086609601974487),\n", - " ('Semi-supervised learning', 0.7068915367126465),\n", - " ('Outline of machine learning', 0.7035143971443176),\n", - " ('Artificial neural network', 0.6998467445373535),\n", - " ('Ensemble learning', 0.6948938965797424),\n", - " ('Intelligent control', 0.6883038878440857),\n", - " ('Statistical classification', 0.6876234412193298),\n", - " ('Rule induction', 0.6867162585258484),\n", - " ('Boosting (machine learning)', 0.685867190361023),\n", - " ('Feature selection', 0.6836000084877014),\n", - " ('Training, validation, and test sets', 0.6823415160179138),\n", - " ('Support-vector machine', 0.6810059547424316),\n", - " ('Perceptron', 0.6794257760047913),\n", - " ('Multilayer perceptron', 0.6773776412010193),\n", - " ('Neural network', 
0.6765708923339844)]\n", - "Doc2Vec\n", - "[('Pattern recognition', 0.7597464323043823),\n", - " ('Support-vector machine', 0.7284112572669983),\n", - " ('Bayesian network', 0.7256077527999878),\n", - " ('Naive Bayes classifier', 0.7218978404998779),\n", - " ('Hidden Markov model', 0.7194668054580688),\n", - " ('Learning classifier system', 0.7183035016059875),\n", - " ('Boosting (machine learning)', 0.7128430604934692),\n", - " ('Conditional random field', 0.7125300168991089),\n", - " ('Semi-supervised learning', 0.7124624252319336),\n", - " ('Multi-task learning', 0.7108726501464844),\n", - " ('GeneMark', 0.708616316318512),\n", - " ('Deep learning', 0.7016053795814514),\n", - " ('Supervised learning', 0.6973129510879517),\n", - " ('Data analysis techniques for fraud detection', 0.6920328140258789),\n", - " ('Artificial neural network', 0.6897733807563782),\n", - " ('Mixture model', 0.688715398311615),\n", - " ('Symbolic artificial intelligence', 0.6857218742370605),\n", - " ('Meta learning (computer science)', 0.6849099397659302),\n", - " ('Grammar induction', 0.6836742758750916),\n", - " ('Intelligent agent', 0.6833598613739014)]\n" + "Doc2Vec\n", + "[('Supervised learning', 0.7491602301597595),\n", + " ('Pattern recognition', 0.7462332844734192),\n", + " ('Artificial neural network', 0.7142727971076965),\n", + " ('Data mining', 0.6930587887763977),\n", + " ('Computer mathematics', 0.686907947063446),\n", + " ('Deep learning', 0.6868096590042114),\n", + " ('Multi-task learning', 0.6859176158905029),\n", + " ('Outline of computer science', 0.6858125925064087),\n", + " ('Boosting (machine learning)', 0.6807966828346252),\n", + " ('Linear classifier', 0.6807013154029846),\n", + " ('Learning classifier system', 0.679194450378418),\n", + " ('Knowledge retrieval', 0.6765366196632385),\n", + " ('Perceptron', 0.675654947757721),\n", + " ('Incremental learning', 0.6712607741355896),\n", + " ('Support-vector machine', 0.6711161136627197),\n", + " ('Feature selection', 
0.6696343421936035),\n", + " ('Image segmentation', 0.6688867211341858),\n", + " ('Neural network', 0.6670624017715454),\n", + " ('Reinforcement learning', 0.6666402220726013),\n", + " ('Feature extraction', 0.6657401323318481)]\n", + "Doc2Vec\n", + "[('Pattern recognition', 0.7151365280151367),\n", + " ('Supervised learning', 0.7006939053535461),\n", + " ('Multi-task learning', 0.6899284720420837),\n", + " ('Semi-supervised learning', 0.674682080745697),\n", + " ('Statistical classification', 0.6649825572967529),\n", + " ('Deep learning', 0.6647047400474548),\n", + " ('Artificial neural network', 0.66275954246521),\n", + " ('Feature selection', 0.6612880825996399),\n", + " ('Statistical learning theory', 0.6528184413909912),\n", + " ('Naive Bayes classifier', 0.6506016850471497),\n", + " ('Automatic image annotation', 0.6491228342056274),\n", + " ('Regularization (mathematics)', 0.6452057957649231),\n", + " ('Early stopping', 0.6439507007598877),\n", + " ('Support-vector machine', 0.64285808801651),\n", + " ('Meta learning (computer science)', 0.6418778300285339),\n", + " ('Linear classifier', 0.6391816735267639),\n", + " ('Empirical risk minimization', 0.6339778900146484),\n", + " ('Anomaly detection', 0.6328380703926086),\n", + " ('Predictive Model Markup Language', 0.6314322352409363),\n", + " ('Learning classifier system', 0.6307871341705322)]\n" ] } ], @@ -455,28 +482,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "[('Katy Perry', 0.7140653133392334),\n", - " ('Ariana Grande', 0.6990166306495667),\n", - " ('Demi Lovato', 0.6782864332199097),\n", - " ('Miley Cyrus', 0.6620475053787231),\n", - " ('List of awards and nominations received by Lady Gaga', 0.6562342047691345),\n", - " ('Christina Aguilera', 0.6527020335197449),\n", - " ('Taylor Swift', 0.6430284380912781),\n", - " ('Adele', 0.6412620544433594),\n", - " ('Adam Lambert', 0.6401858329772949),\n", - " ('Halsey (singer)', 0.637832760810852)]\n", - "Doc2Vec\n", - "[('Katy 
Perry', 0.6719839572906494),\n", - " ('Ariana Grande', 0.6502904295921326),\n", - " ('Taylor Swift', 0.6452381014823914),\n", - " ('Artpop', 0.6417931914329529),\n", - " ('Christina Aguilera', 0.634290337562561),\n", - " ('Nicki Minaj', 0.6294941902160645),\n", - " ('Adam Lambert', 0.6128465533256531),\n", - " ('Kesha', 0.6105154156684875),\n", - " ('Born This Way (album)', 0.6087599992752075),\n", - " ('Adele', 0.6087093353271484)]\n" + "Doc2Vec\n", + "[('Katy Perry', 0.7450265884399414),\n", + " ('Miley Cyrus', 0.7275323867797852),\n", + " ('Ariana Grande', 0.7223592400550842),\n", + " ('Adele', 0.6982873678207397),\n", + " ('Taylor Swift', 0.6901045441627502),\n", + " ('Demi Lovato', 0.6819911003112793),\n", + " ('Adam Lambert', 0.6552075147628784),\n", + " ('Nicki Minaj', 0.6513625383377075),\n", + " ('Selena Gomez', 0.6427122354507446),\n", + " ('Rihanna', 0.6323978304862976)]\n", + "Doc2Vec\n", + "[('Born This Way (album)', 0.6612793803215027),\n", + " ('Artpop', 0.6428781747817993),\n", + " ('Beautiful, Dirty, Rich', 0.6408763527870178),\n", + " ('Lady Gaga videography', 0.6143141388893127),\n", + " ('Lady Gaga discography', 0.6102882027626038),\n", + " ('Katy Perry', 0.6046711802482605),\n", + " ('Beyoncé', 0.6015700697898865),\n", + " ('List of Lady Gaga live performances', 0.5977909564971924),\n", + " ('Artpop (song)', 0.5930275917053223),\n", + " ('Born This Way (song)', 0.5911758542060852)]\n" ] } ], @@ -494,7 +521,9 @@ "source": [ "The DBOW results are in line with what the paper shows in Table 2a), revealing similar singers in the U.S.\n", "\n", - "Finally, let's do some of the wilder arithmetics that vectors embeddings are famous for. What are the entries most similar to \"Lady Gaga\" - \"American\" + \"Japanese\"? 
Table 2b) in the paper.\n", + "\n", + "Interestingly, the DM results seem to capture more \"facts about Lady Gaga\" (her albums, trivia), whereas DBOW recovered \"similar artists\".\n", + "\n", + "**Finally, let's do some of the wilder arithmetic that vector embeddings are famous for**. What are the entries most similar to \"Lady Gaga\" - \"American\" + \"Japanese\"? Table 2b) in the paper.\n", "\n", "Note that \"American\" and \"Japanese\" are word vectors, but they live in the same space as the document vectors so we can add / subtract them at will, for some interesting results. All word vectors were already lowercased by our tokenizer above, so we look for the lowercased version here:" ] @@ -510,28 +539,28 @@ "name": "stdout", "output_type": "stream", "text": [ - "Doc2Vec\n", - "[('Katy Perry', 0.6580742001533508),\n", - " ('Kōsui (Eito song)', 0.6197645664215088),\n", - " ('2NE1', 0.6172165274620056),\n", - " ('Ariana Grande', 0.608268678188324),\n", - " ('Alex York', 0.5975368618965149),\n", - " ('Thank You, Love (Kana Nishino album)', 0.5951482653617859),\n", - " ('X -Cross-', 0.5949676632881165),\n", - " ('Megitsune', 0.5922212600708008),\n", - " ('7 Spirits', 0.5915307998657227),\n", - " ('Audience (Ayumi Hamasaki song)', 0.5913636088371277)]\n", - "Doc2Vec\n", - "[('Morning Musume', 0.6124014854431152),\n", - " ('Yuko Ando (singer)', 0.6063645482063293),\n", - " ('Yumi Matsutoya', 0.6047919392585754),\n", - " ('J-pop', 0.5908822417259216),\n", - " ('Ayumi Hamasaki', 0.5900821685791016),\n", - " ('E-girls', 0.5884340405464172),\n", - " ('Enka', 0.583469033241272),\n", - " ('Shingo Katori', 0.583054780960083),\n", - " ('Dempagumi.inc', 0.575444221496582),\n", - " ('Shinsei Kamattechan', 0.5742727518081665)]\n" + "Doc2Vec\n", + "[('Ayumi Hamasaki', 0.6339365839958191),\n", + " ('Katy Perry', 0.5903329849243164),\n", + " ('2NE1', 0.5886631608009338),\n", + " (\"Girls' Generation\", 0.5769038796424866),\n", + " ('Flying Easy Loving Crazy', 0.5748921036720276),\n", + " 
('Love Life 2', 0.5738793611526489),\n", + " ('Ariana Grande', 0.5715743899345398),\n", + " ('Game (Perfume album)', 0.569789707660675),\n", + " ('We Are \"Lonely Girl\"', 0.5696560740470886),\n", + " ('H (Ayumi Hamasaki EP)', 0.5691372156143188)]\n", + "Doc2Vec\n", + "[('Radwimps', 0.548571765422821),\n", + " ('Chisato Moritaka', 0.5456540584564209),\n", + " ('Suzuki Ami Around the World: Live House Tour 2005', 0.5375290513038635),\n", + " ('Anna Suda', 0.5338292121887207),\n", + " ('Beautiful, Dirty, Rich', 0.5309030413627625),\n", + " ('Momoiro Clover Z', 0.5304197072982788),\n", + " ('Pink Lady (duo)', 0.5268998742103577),\n", + " ('Reol (singer)', 0.5237400531768799),\n", + " ('Ami Suzuki', 0.5232592225074768),\n", + " ('Kaela Kimura', 0.5219823122024536)]\n" ] } ], @@ -550,13 +579,9 @@ "\n", "> Ayumi Hamasaki is a Japanese singer, songwriter, record producer, actress, model, spokesperson, and entrepreneur.\n", "\n", - "So that sounds like a success. It's also the nr. 1 hit in the paper we're replicating.\n", - "\n", - "Similarly, the DM model thought **Kaela Kimura** is the closest hit:\n", + "So that sounds like a success. It's also the nr. 1 hit in the paper we're replicating – success!\n", "\n", - "> Kaela Kimura is a Japanese pop rock singer, lyricist, fashion model and television presenter.\n", - "\n", - "Also pretty good.\n", + "The DM model results are opaque to me, but seem art & Japan related as well. The score deltas between these DM results are marginal, so it's likely they would change if retrained on a different version of Wikipedia. Or even when simply re-run on the same version – the doc2vec training algorithm is stochastic.\n", "\n", "These results demonstrate that both training modes employed in the original paper are outstanding for calculating similarity between document vectors, word vectors, or a combination of both. The DM mode has the added advantage of being about 2.4x faster to train in this run (20657.8s vs. 50111.5s for DBOW)." 
] @@ -570,9 +595,28 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-19 07:54:48,399 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_dbow.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-04-19T07:54:48.399560', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'saving'}\n", + "2022-04-19 07:54:48,400 : INFO : storing np array 'vectors' to doc2vec_dbow.model.dv.vectors.npy\n", + "2022-04-19 07:54:49,613 : INFO : storing np array 'vectors' to doc2vec_dbow.model.wv.vectors.npy\n", + "2022-04-19 07:54:49,875 : INFO : storing np array 'syn1neg' to doc2vec_dbow.model.syn1neg.npy\n", + "2022-04-19 07:54:50,135 : INFO : not storing attribute cum_table\n", + "2022-04-19 07:54:53,026 : INFO : saved doc2vec_dbow.model\n", + "2022-04-19 07:54:53,027 : INFO : Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_dm.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-04-19T07:54:53.027661', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Nov 26 2021, 20:14:08) \\n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-94-generic-x86_64-with-glibc2.29', 'event': 'saving'}\n", + "2022-04-19 07:54:53,028 : INFO : storing np array 'vectors' to doc2vec_dm.model.dv.vectors.npy\n", + "2022-04-19 07:54:54,556 : INFO : storing np array 'vectors' to doc2vec_dm.model.wv.vectors.npy\n", + "2022-04-19 07:54:54,808 : INFO : storing np array 'syn1neg' to doc2vec_dm.model.syn1neg.npy\n", + "2022-04-19 07:54:55,058 : INFO : not storing attribute cum_table\n", + "2022-04-19 07:54:57,872 : INFO : saved doc2vec_dm.model\n" + ] + } + ], "source": [ "model_dbow.save('doc2vec_dbow.model')\n", "model_dm.save('doc2vec_dm.model')" @@ -602,7 +646,7 @@ "name": 
"python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.9" + "version": "3.8.10" } }, "nbformat": 4, From 2982148972aab7eaa564df87f7a0654d53c31db9 Mon Sep 17 00:00:00 2001 From: Radim Rehurek Date: Tue, 19 Apr 2022 18:30:11 +0200 Subject: [PATCH 66/81] add pytest info to index.html --- docs/src/_templates/indexcontent.html | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/docs/src/_templates/indexcontent.html b/docs/src/_templates/indexcontent.html index 0ca74b7210..396f48b50d 100644 --- a/docs/src/_templates/indexcontent.html +++ b/docs/src/_templates/indexcontent.html @@ -199,6 +199,15 @@

Testing Gensim

+
+

Or, to install and test Gensim locally:

+

+                      pip install -e .  # compile and install Gensim from the current directory
+                    
+

+                      pytest gensim     # run the tests
+                    
+
From 9bbf12c330275351e777b553c145066b7c397f95 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 22 Apr 2022 19:11:47 +0900 Subject: [PATCH 67/81] Check gallery up to date as part of CI (#3329) * Check gallery up to date as part of CI Fix #2916 * tweak check_gallery.py * update CI workflow * update stale doc cache * update stale docs --- .github/workflows/build-wheels.yml | 30 + .github/workflows/tests.yml | 3 + docs/src/auto_examples/index.rst | 16 +- .../src/auto_examples/tutorials/run_lda.ipynb | 714 +++++---------- docs/src/auto_examples/tutorials/run_lda.py | 30 +- .../auto_examples/tutorials/run_lda.py.md5 | 2 +- docs/src/auto_examples/tutorials/run_lda.rst | 850 +++++++++--------- .../tutorials/sg_execution_times.rst | 8 +- docs/src/check_gallery.py | 69 ++ docs/src/gallery/tutorials/run_lda.py | 4 +- 10 files changed, 775 insertions(+), 951 deletions(-) create mode 100644 docs/src/check_gallery.py diff --git a/.github/workflows/build-wheels.yml b/.github/workflows/build-wheels.yml index 42f61bb8b2..cab6a16641 100644 --- a/.github/workflows/build-wheels.yml +++ b/.github/workflows/build-wheels.yml @@ -9,12 +9,42 @@ on: - cron: '0 0 * * sun,wed' jobs: + # + # The linters job duplicates tests.yml, can't think of a way to avoid this right now. 
+ # + linters: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Setup up Python ${{ matrix.python }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python }} + + - name: Update pip + run: python -m pip install -U pip + + - name: Install dependencies + run: python -m pip install flake8 flake8-rst + + - name: Run flake8 linter (source) + run: flake8 --ignore E12,W503 --max-line-length 120 --show-source gensim + + # - name: Run flake8 linter (documentation) + # run: flake8 --ignore E202,E402,E302,E305,F821 --max-line-length 120 --filename '*.py,*.rst' docs + + - name: Check Sphinx Gallery cache + run: python docs/src/check_gallery.py build: timeout-minutes: 30 runs-on: ${{ matrix.os }} defaults: run: shell: bash + + needs: [linters] + strategy: fail-fast: false matrix: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 0b64f2b1b2..3cb54fe8be 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -28,6 +28,9 @@ jobs: # - name: Run flake8 linter (documentation) # run: flake8 --ignore E202,E402,E302,E305,F821 --max-line-length 120 --filename '*.py,*.rst' docs + - name: Check Sphinx Gallery cache + run: python docs/src/check_gallery.py + docs: name: build documentation timeout-minutes: 10 diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index d3dd2291be..a3626768f2 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -71,7 +71,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -92,7 +92,7 @@ Understanding this functionality is vital for using gensim effectively. .. raw:: html -
+
.. only:: html @@ -190,7 +190,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html @@ -211,7 +211,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html @@ -309,7 +309,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html @@ -330,7 +330,7 @@ These **goal-oriented guides** demonstrate how to **solve a specific problem** u .. raw:: html -
+
.. only:: html @@ -447,13 +447,13 @@ Blog posts, tutorial videos, hackathons and other useful Gensim resources, from .. container:: sphx-glr-download sphx-glr-download-python - :download:`Download all examples in Python source code: auto_examples_python.zip ` + :download:`Download all examples in Python source code: auto_examples_python.zip ` .. container:: sphx-glr-download sphx-glr-download-jupyter - :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` + :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` .. only:: html diff --git a/docs/src/auto_examples/tutorials/run_lda.ipynb b/docs/src/auto_examples/tutorials/run_lda.ipynb index 12f3eb1865..b953fe872b 100644 --- a/docs/src/auto_examples/tutorials/run_lda.ipynb +++ b/docs/src/auto_examples/tutorials/run_lda.ipynb @@ -1,477 +1,241 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "%matplotlib inline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "# LDA Model\n", - "\n", - "Introduces Gensim's LDA model and demonstrates its use on the NIPS corpus.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import logging\n", - "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The purpose of this tutorial is to demonstrate how to train and tune an LDA model.\n", - "\n", - "In this tutorial we will:\n", - "\n", - "* Load input data.\n", - "* Pre-process that data.\n", - "* Transform documents into bag-of-words vectors.\n", - "* Train an LDA model.\n", - "\n", - "This tutorial will **not**:\n", - "\n", - "* Explain how Latent Dirichlet Allocation works\n", - "* Explain how the LDA model performs inference\n", - "* Teach you all 
the parameters and options for Gensim's LDA implementation\n", - "\n", - "If you are not familiar with the LDA model or how to use it in Gensim, I (Olavur Mortensen)\n", - "suggest you read up on that before continuing with this tutorial. Basic\n", - "understanding of the LDA model should suffice. Examples:\n", - "\n", - "* `Introduction to Latent Dirichlet Allocation `_\n", - "* Gensim tutorial: `sphx_glr_auto_examples_core_run_topics_and_transformations.py`\n", - "* Gensim's LDA model API docs: :py:class:`gensim.models.LdaModel`\n", - "\n", - "I would also encourage you to consider each step when applying the model to\n", - "your data, instead of just blindly applying my solution. The different steps\n", - "will depend on your data and possibly your goal with the model.\n", - "\n", - "## Data\n", - "\n", - "I have used a corpus of NIPS papers in this tutorial, but if you're following\n", - "this tutorial just to learn about LDA I encourage you to consider picking a\n", - "corpus on a subject that you are familiar with. Qualitatively evaluating the\n", - "output of an LDA model is challenging and can require you to understand the\n", - "subject matter of your corpus (depending on your goal with the model).\n", - "\n", - "NIPS (Neural Information Processing Systems) is a machine learning conference\n", - "so the subject matter should be well suited for most of the target audience\n", - "of this tutorial. You can download the original data from Sam Roweis'\n", - "`website `_. The code below will\n", - "also do that for you.\n", - "\n", - ".. 
Important::\n", - " The corpus contains 1740 documents, and not particularly long ones.\n", - " So keep in mind that this tutorial is not geared towards efficiency, and be\n", - " careful before applying the code to a large dataset.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "import re\n", - "import tarfile\n", - "\n", - "import smart_open\n", - "\n", - "def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):\n", - " with smart_open.open(url, \"rb\") as file:\n", - " with tarfile.open(fileobj=file) as tar:\n", - " for member in tar.getmembers():\n", - " if member.isfile() and re.search(r'nipstxt/nips\\d+/\\d+\\.txt', member.name):\n", - " member_bytes = tar.extractfile(member).read()\n", - " yield member_bytes.decode('utf-8', errors='replace')\n", - "\n", - "docs = list(extract_documents())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So we have a list of 1740 documents, where each document is a Unicode string.\n", - "If you're thinking about using your own corpus, then you need to make sure\n", - "that it's in the same format (list of Unicode strings) before proceeding\n", - "with the rest of this tutorial.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print(len(docs))\n", - "print(docs[0][:500])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pre-process and vectorize the documents\n", - "\n", - "As part of preprocessing, we will:\n", - "\n", - "* Tokenize (split the documents into tokens).\n", - "* Lemmatize the tokens.\n", - "* Compute bigrams.\n", - "* Compute a bag-of-words representation of the data.\n", - "\n", - "First we tokenize the text using a regular expression tokenizer from NLTK. 
We\n", - "remove numeric tokens and tokens that are only a single character, as they\n", - "don't tend to be useful, and the dataset contains a lot of them.\n", - "\n", - ".. Important::\n", - "\n", - " This tutorial uses the nltk library for preprocessing, although you can\n", - " replace it with something else if you want.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Tokenize the documents.\n", - "from nltk.tokenize import RegexpTokenizer\n", - "\n", - "# Split the documents into tokens.\n", - "tokenizer = RegexpTokenizer(r'\\w+')\n", - "for idx in range(len(docs)):\n", - " docs[idx] = docs[idx].lower() # Convert to lowercase.\n", - " docs[idx] = tokenizer.tokenize(docs[idx]) # Split into words.\n", - "\n", - "# Remove numbers, but not words that contain numbers.\n", - "docs = [[token for token in doc if not token.isnumeric()] for doc in docs]\n", - "\n", - "# Remove words that are only one character.\n", - "docs = [[token for token in doc if len(token) > 1] for doc in docs]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We use the WordNet lemmatizer from NLTK. A lemmatizer is preferred over a\n", - "stemmer in this case because it produces more readable words. An output that is\n", - "easy to read is very desirable in topic modelling.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Lemmatize the documents.\n", - "from nltk.stem.wordnet import WordNetLemmatizer\n", - "\n", - "lemmatizer = WordNetLemmatizer()\n", - "docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We find bigrams in the documents. 
Bigrams are sets of two adjacent words.\n", - "Using bigrams we can get phrases like \"machine_learning\" in our output\n", - "(spaces are replaced with underscores); without bigrams we would only get\n", - "\"machine\" and \"learning\".\n", - "\n", - "Note that in the code below, we find bigrams and then add them to the\n", - "original data, because we would like to keep the words \"machine\" and\n", - "\"learning\" as well as the bigram \"machine_learning\".\n", - "\n", - ".. Important::\n", - " Computing n-grams of large dataset can be very computationally\n", - " and memory intensive.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Compute bigrams.\n", - "from gensim.models import Phrases\n", - "\n", - "# Add bigrams to docs (only ones that appear 20 times or more).\n", - "bigram = Phrases(docs, min_count=20)\n", - "for idx in range(len(docs)):\n", - " for token in bigram[docs[idx]]:\n", - " if '_' in token:\n", - " # Token is a bigram, add to document.\n", - " docs[idx].append(token)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We remove rare words and common words based on their *document frequency*.\n", - "Below we remove words that appear in less than 20 documents or in more than\n", - "50% of the documents. 
Consider trying to remove words only based on their\n", - "frequency, or maybe combining that with this approach.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Remove rare and common tokens.\n", - "from gensim.corpora import Dictionary\n", - "\n", - "# Create a dictionary representation of the documents.\n", - "dictionary = Dictionary(docs)\n", - "\n", - "# Filter out words that occur less than 20 documents, or more than 50% of the documents.\n", - "dictionary.filter_extremes(no_below=20, no_above=0.5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Finally, we transform the documents to a vectorized form. We simply compute\n", - "the frequency of each word, including the bigrams.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Bag-of-words representation of the documents.\n", - "corpus = [dictionary.doc2bow(doc) for doc in docs]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's see how many tokens and documents we have to train on.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "print('Number of unique tokens: %d' % len(dictionary))\n", - "print('Number of documents: %d' % len(corpus))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Training\n", - "\n", - "We are ready to train the LDA model. We will first discuss how to set some of the training parameters.\n", - "\n", - "First of all, the elephant in the room: how many topics do I need?\n", - "There is really no easy answer for this. It will depend on both your\n", - "data and your application. 
I have used 10 topics here because I wanted\n", - "to have a few topics that I could interpret and \"label\", and because that\n", - "turned out to give me reasonably good results. On the other hand, you might\n", - "not need to interpret all your topics, so you could use many topics,\n", - "for example, 100.\n", - "\n", - "``chunksize`` controls how many documents are processed at a time in the\n", - "training algorithm. Increasing chunksize will speed up training, at least as\n", - "long as the chunk of documents easily fit into memory. I've set ``chunksize =\n", - "2000``, which is more than the number of documents, so I process all the\n", - "data in one go. However, chunksize can influence the quality of the model, as\n", - "discussed in Hoffman and al. [2], but the difference was not\n", - "substantial in this case.\n", - "\n", - "``passes`` controls how often we train the model on the entire corpus.\n", - "Another word for passes might be \"epochs\". ``iterations`` is somewhat\n", - "technical, but essentially it controls how often we repeat a particular loop\n", - "over each document. It is important to set the number of \"passes\" and\n", - "\"iterations\" high enough.\n", - "\n", - "I suggest the following way to choose iterations and passes. First, enable\n", - "logging (``logging.basicConfig(level=logging.INFO, format='PID:%(process)d:%(threadName)s - %(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s')``), and set ``eval_every = 1``\n", - "in ``LdaModel``. Then, when training the model, look for a line in the log that\n", - "looks something like this::\n", - "\n", - " 2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations\n", - "\n", - "If you set ``passes = 20`` you will see this line 20 times. Make sure that by\n", - "the final passes, most of the documents have converged. 
So you want to choose\n", - "both passes and iterations to be high enough for this to happen.\n", - "\n", - "We set ``alpha = 'auto'`` and ``eta = 'auto'``. Again this is somewhat\n", - "technical, but essentially we are automatically learning two parameters in\n", - "the model that we usually would have to specify explicitly.\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "# Train LDA model.\n", - "from gensim.models import LdaModel\n", - "\n", - "# Set training parameters.\n", - "num_topics = 10\n", - "chunksize = 2000\n", - "passes = 20\n", - "iterations = 400\n", - "eval_every = None # Don't evaluate model perplexity, takes too much time.\n", - "\n", - "# Make an index to word dictionary.\n", - "temp = dictionary[0] # This is only to \"load\" the dictionary.\n", - "id2word = dictionary.id2token\n", - "\n", - "model = LdaModel(\n", - " corpus=corpus,\n", - " id2word=id2word,\n", - " chunksize=chunksize,\n", - " alpha='auto',\n", - " eta='auto',\n", - " iterations=iterations,\n", - " num_topics=num_topics,\n", - " passes=passes,\n", - " eval_every=eval_every\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can compute the topic coherence of each topic. Below we display the\n", - "average topic coherence and print the topics in order of topic coherence.\n", - "\n", - "Note that we use the \"Umass\" topic coherence measure here (see\n", - ":py:func:`gensim.models.ldamodel.LdaModel.top_topics`), Gensim has recently\n", - "obtained an implementation of the \"AKSW\" topic coherence measure (see\n", - "accompanying blog post, http://rare-technologies.com/what-is-topic-coherence/).\n", - "\n", - "If you are familiar with the subject of the articles in this dataset, you can\n", - "see that the topics below make a lot of sense. However, they are not without\n", - "flaws. 
We can see that there is substantial overlap between some topics,\n", - "others are hard to interpret, and most of them have at least some terms that\n", - "seem out of place. If you were able to do better, feel free to share your\n", - "methods on the blog at http://rare-technologies.com/lda-training-tips/ !\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": false - }, - "outputs": [], - "source": [ - "top_topics = model.top_topics(corpus)\n", - "\n", - "# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.\n", - "avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics\n", - "print('Average topic coherence: %.4f.' % avg_topic_coherence)\n", - "\n", - "from pprint import pprint\n", - "pprint(top_topics)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Things to experiment with\n\n* ``no_above`` and ``no_below`` parameters in ``filter_extremes`` method.\n* Adding trigrams or even higher order n-grams.\n* Consider whether using a hold-out set or cross-validation is the way to go for you.\n* Try other datasets.\n\n## Where to go from here\n\n* Check out a RaRe blog post on the AKSW topic coherence measure (http://rare-technologies.com/what-is-topic-coherence/).\n* pyLDAvis (https://pyldavis.readthedocs.io/en/latest/index.html).\n* Read some more Gensim tutorials (https://github.com/RaRe-Technologies/gensim/blob/develop/tutorials.md#tutorials).\n* If you haven't already, read [1] and [2] (see references).\n\n## References\n\n1. \"Latent Dirichlet Allocation\", Blei et al. 2003.\n2. \"Online Learning for Latent Dirichlet Allocation\", Hoffman et al. 
2010.\n\n\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.0" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n# LDA Model\n\nIntroduces Gensim's LDA model and demonstrates its use on the NIPS corpus.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import logging\nlogging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The purpose of this tutorial is to demonstrate how to train and tune an LDA model.\n\nIn this tutorial we will:\n\n* Load input data.\n* Pre-process that data.\n* Transform documents into bag-of-words vectors.\n* Train an LDA model.\n\nThis tutorial will **not**:\n\n* Explain how Latent Dirichlet Allocation works\n* Explain how the LDA model performs inference\n* Teach you all the parameters and options for Gensim's LDA implementation\n\nIf you are not familiar with the LDA model or how to use it in Gensim, I (Olavur Mortensen)\nsuggest you read up on that before continuing with this tutorial. Basic\nunderstanding of the LDA model should suffice. 
Examples:\n\n* `Introduction to Latent Dirichlet Allocation `_\n* Gensim tutorial: `sphx_glr_auto_examples_core_run_topics_and_transformations.py`\n* Gensim's LDA model API docs: :py:class:`gensim.models.LdaModel`\n\nI would also encourage you to consider each step when applying the model to\nyour data, instead of just blindly applying my solution. The different steps\nwill depend on your data and possibly your goal with the model.\n\n## Data\n\nI have used a corpus of NIPS papers in this tutorial, but if you're following\nthis tutorial just to learn about LDA I encourage you to consider picking a\ncorpus on a subject that you are familiar with. Qualitatively evaluating the\noutput of an LDA model is challenging and can require you to understand the\nsubject matter of your corpus (depending on your goal with the model).\n\nNIPS (Neural Information Processing Systems) is a machine learning conference\nso the subject matter should be well suited for most of the target audience\nof this tutorial. You can download the original data from Sam Roweis'\n`website `_. The code below will\nalso do that for you.\n\n.. 
Important::\n The corpus contains 1740 documents, and not particularly long ones.\n So keep in mind that this tutorial is not geared towards efficiency, and be\n careful before applying the code to a large dataset.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import io\nimport os.path\nimport re\nimport tarfile\n\nimport smart_open\n\ndef extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'):\n with smart_open.open(url, \"rb\") as file:\n with tarfile.open(fileobj=file) as tar:\n for member in tar.getmembers():\n if member.isfile() and re.search(r'nipstxt/nips\\d+/\\d+\\.txt', member.name):\n member_bytes = tar.extractfile(member).read()\n yield member_bytes.decode('utf-8', errors='replace')\n\ndocs = list(extract_documents())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So we have a list of 1740 documents, where each document is a Unicode string.\nIf you're thinking about using your own corpus, then you need to make sure\nthat it's in the same format (list of Unicode strings) before proceeding\nwith the rest of this tutorial.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print(len(docs))\nprint(docs[0][:500])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pre-process and vectorize the documents\n\nAs part of preprocessing, we will:\n\n* Tokenize (split the documents into tokens).\n* Lemmatize the tokens.\n* Compute bigrams.\n* Compute a bag-of-words representation of the data.\n\nFirst we tokenize the text using a regular expression tokenizer from NLTK. We\nremove numeric tokens and tokens that are only a single character, as they\ndon't tend to be useful, and the dataset contains a lot of them.\n\n.. 
Important::\n\n This tutorial uses the nltk library for preprocessing, although you can\n replace it with something else if you want.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Tokenize the documents.\nfrom nltk.tokenize import RegexpTokenizer\n\n# Split the documents into tokens.\ntokenizer = RegexpTokenizer(r'\\w+')\nfor idx in range(len(docs)):\n docs[idx] = docs[idx].lower() # Convert to lowercase.\n docs[idx] = tokenizer.tokenize(docs[idx]) # Split into words.\n\n# Remove numbers, but not words that contain numbers.\ndocs = [[token for token in doc if not token.isnumeric()] for doc in docs]\n\n# Remove words that are only one character.\ndocs = [[token for token in doc if len(token) > 1] for doc in docs]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We use the WordNet lemmatizer from NLTK. A lemmatizer is preferred over a\nstemmer in this case because it produces more readable words. Output that is\neasy to read is very desirable in topic modelling.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Lemmatize the documents.\nfrom nltk.stem.wordnet import WordNetLemmatizer\n\nlemmatizer = WordNetLemmatizer()\ndocs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We find bigrams in the documents. Bigrams are sets of two adjacent words.\nUsing bigrams we can get phrases like \"machine_learning\" in our output\n(spaces are replaced with underscores); without bigrams we would only get\n\"machine\" and \"learning\".\n\nNote that in the code below, we find bigrams and then add them to the\noriginal data, because we would like to keep the words \"machine\" and\n\"learning\" as well as the bigram \"machine_learning\".\n\n.. 
Important::\n Computing n-grams of large dataset can be very computationally\n and memory intensive.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Compute bigrams.\nfrom gensim.models import Phrases\n\n# Add bigrams and trigrams to docs (only ones that appear 20 times or more).\nbigram = Phrases(docs, min_count=20)\nfor idx in range(len(docs)):\n for token in bigram[docs[idx]]:\n if '_' in token:\n # Token is a bigram, add to document.\n docs[idx].append(token)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We remove rare words and common words based on their *document frequency*.\nBelow we remove words that appear in less than 20 documents or in more than\n50% of the documents. Consider trying to remove words only based on their\nfrequency, or maybe combining that with this approach.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Remove rare and common tokens.\nfrom gensim.corpora import Dictionary\n\n# Create a dictionary representation of the documents.\ndictionary = Dictionary(docs)\n\n# Filter out words that occur less than 20 documents, or more than 50% of the documents.\ndictionary.filter_extremes(no_below=20, no_above=0.5)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we transform the documents to a vectorized form. 
We simply compute\nthe frequency of each word, including the bigrams.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Bag-of-words representation of the documents.\ncorpus = [dictionary.doc2bow(doc) for doc in docs]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see how many tokens and documents we have to train on.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print('Number of unique tokens: %d' % len(dictionary))\nprint('Number of documents: %d' % len(corpus))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training\n\nWe are ready to train the LDA model. We will first discuss how to set some of\nthe training parameters.\n\nFirst of all, the elephant in the room: how many topics do I need? There is\nreally no easy answer for this, it will depend on both your data and your\napplication. I have used 10 topics here because I wanted to have a few topics\nthat I could interpret and \"label\", and because that turned out to give me\nreasonably good results. You might not need to interpret all your topics, so\nyou could use a large number of topics, for example 100.\n\n``chunksize`` controls how many documents are processed at a time in the\ntraining algorithm. Increasing chunksize will speed up training, at least as\nlong as the chunk of documents easily fit into memory. I've set ``chunksize =\n2000``, which is more than the amount of documents, so I process all the\ndata in one go. Chunksize can however influence the quality of the model, as\ndiscussed in Hoffman and co-authors [2], but the difference was not\nsubstantial in this case.\n\n``passes`` controls how often we train the model on the entire corpus.\nAnother word for passes might be \"epochs\". 
``iterations`` is somewhat\ntechnical, but essentially it controls how often we repeat a particular loop\nover each document. It is important to set the number of \"passes\" and\n\"iterations\" high enough.\n\nI suggest the following way to choose iterations and passes. First, enable\nlogging (as described in many Gensim tutorials), and set ``eval_every = 1``\nin ``LdaModel``. When training the model look for a line in the log that\nlooks something like this::\n\n 2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations\n\nIf you set ``passes = 20`` you will see this line 20 times. Make sure that by\nthe final passes, most of the documents have converged. So you want to choose\nboth passes and iterations to be high enough for this to happen.\n\nWe set ``alpha = 'auto'`` and ``eta = 'auto'``. Again this is somewhat\ntechnical, but essentially we are automatically learning two parameters in\nthe model that we usually would have to specify explicitly.\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Train LDA model.\nfrom gensim.models import LdaModel\n\n# Set training parameters.\nnum_topics = 10\nchunksize = 2000\npasses = 20\niterations = 400\neval_every = None # Don't evaluate model perplexity, takes too much time.\n\n# Make an index to word dictionary.\ntemp = dictionary[0] # This is only to \"load\" the dictionary.\nid2word = dictionary.id2token\n\nmodel = LdaModel(\n corpus=corpus,\n id2word=id2word,\n chunksize=chunksize,\n alpha='auto',\n eta='auto',\n iterations=iterations,\n num_topics=num_topics,\n passes=passes,\n eval_every=eval_every\n)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can compute the topic coherence of each topic. 
Below we display the\naverage topic coherence and print the topics in order of topic coherence.\n\nNote that we use the \"Umass\" topic coherence measure here (see\n:py:func:`gensim.models.ldamodel.LdaModel.top_topics`), Gensim has recently\nobtained an implementation of the \"AKSW\" topic coherence measure (see\naccompanying blog post, http://rare-technologies.com/what-is-topic-coherence/).\n\nIf you are familiar with the subject of the articles in this dataset, you can\nsee that the topics below make a lot of sense. However, they are not without\nflaws. We can see that there is substantial overlap between some topics,\nothers are hard to interpret, and most of them have at least some terms that\nseem out of place. If you were able to do better, feel free to share your\nmethods on the blog at http://rare-technologies.com/lda-training-tips/ !\n\n\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "top_topics = model.top_topics(corpus)\n\n# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.\navg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics\nprint('Average topic coherence: %.4f.' 
% avg_topic_coherence)\n\nfrom pprint import pprint\npprint(top_topics)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Things to experiment with\n\n* ``no_above`` and ``no_below`` parameters in ``filter_extremes`` method.\n* Adding trigrams or even higher order n-grams.\n* Consider whether using a hold-out set or cross-validation is the way to go for you.\n* Try other datasets.\n\n## Where to go from here\n\n* Check out a RaRe blog post on the AKSW topic coherence measure (http://rare-technologies.com/what-is-topic-coherence/).\n* pyLDAvis (https://pyldavis.readthedocs.io/en/latest/index.html).\n* Read some more Gensim tutorials (https://github.com/RaRe-Technologies/gensim/blob/develop/tutorials.md#tutorials).\n* If you haven't already, read [1] and [2] (see references).\n\n## References\n\n1. \"Latent Dirichlet Allocation\", Blei et al. 2003.\n2. \"Online Learning for Latent Dirichlet Allocation\", Hoffman et al. 2010.\n\n\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_lda.py b/docs/src/auto_examples/tutorials/run_lda.py index 00116db20e..7ee6b07cd2 100644 --- a/docs/src/auto_examples/tutorials/run_lda.py +++ b/docs/src/auto_examples/tutorials/run_lda.py @@ -58,6 +58,8 @@ # careful before applying the code to a large dataset. 
# +import io +import os.path import re import tarfile @@ -120,7 +122,7 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' ############################################################################### # We use the WordNet lemmatizer from NLTK. A lemmatizer is preferred over a -# stemmer in this case because it produces more readable words. An output that is +# stemmer in this case because it produces more readable words. Output that is # easy to read is very desirable in topic modelling. # @@ -149,7 +151,7 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # Compute bigrams. from gensim.models import Phrases -# Add bigrams to docs (only ones that appear 20 times or more). +# Add bigrams and trigrams to docs (only ones that appear 20 times or more). bigram = Phrases(docs, min_count=20) for idx in range(len(docs)): for token in bigram[docs[idx]]: @@ -195,20 +197,19 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # We are ready to train the LDA model. We will first discuss how to set some of # the training parameters. # -# First of all, the elephant in the room: how many topics do I need? -# There is really no easy answer for this. It will depend on both your -# data and your application. I have used 10 topics here because I wanted -# to have a few topics that I could interpret and "label", and because that -# turned out to give me reasonably good results. On the other hand, you might -# not need to interpret all your topics, so you could use many topics, -# for example, 100. +# First of all, the elephant in the room: how many topics do I need? There is +# really no easy answer for this, it will depend on both your data and your +# application. I have used 10 topics here because I wanted to have a few topics +# that I could interpret and "label", and because that turned out to give me +# reasonably good results. 
You might not need to interpret all your topics, so +# you could use a large number of topics, for example 100. # # ``chunksize`` controls how many documents are processed at a time in the # training algorithm. Increasing chunksize will speed up training, at least as # long as the chunk of documents easily fit into memory. I've set ``chunksize = -# 2000``, which is more than the number of documents, so I process all the -# data in one go. However, chunksize can influence the quality of the model, as -# discussed in Hoffman and al. [2], but the difference was not +# 2000``, which is more than the amount of documents, so I process all the +# data in one go. Chunksize can however influence the quality of the model, as +# discussed in Hoffman and co-authors [2], but the difference was not # substantial in this case. # # ``passes`` controls how often we train the model on the entire corpus. @@ -218,9 +219,8 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # "iterations" high enough. # # I suggest the following way to choose iterations and passes. First, enable -# logging (``logging.basicConfig(level=logging.INFO, format='PID:%(process)d:%(threadName)s - -# %(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s')``), and set ``eval_every = 1`` -# in ``LdaModel``. Then, when training the model, look for a line in the log that +# logging (as described in many Gensim tutorials), and set ``eval_every = 1`` +# in ``LdaModel``. 
When training the model look for a line in the log that # looks something like this:: # # 2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations diff --git a/docs/src/auto_examples/tutorials/run_lda.py.md5 b/docs/src/auto_examples/tutorials/run_lda.py.md5 index 9d25508c2f..6ce0e72960 100644 --- a/docs/src/auto_examples/tutorials/run_lda.py.md5 +++ b/docs/src/auto_examples/tutorials/run_lda.py.md5 @@ -1 +1 @@ -8e115014ecce36aa58a35f11fb525042 \ No newline at end of file +6733157cebb44ef13ae98ec8f4a533f1 \ No newline at end of file diff --git a/docs/src/auto_examples/tutorials/run_lda.rst b/docs/src/auto_examples/tutorials/run_lda.rst index 80abb74085..28c01ce172 100644 --- a/docs/src/auto_examples/tutorials/run_lda.rst +++ b/docs/src/auto_examples/tutorials/run_lda.rst @@ -93,6 +93,8 @@ also do that for you. .. code-block:: default + import io + import os.path import re import tarfile @@ -202,53 +204,13 @@ don't tend to be useful, and the dataset contains a lot of them. -.. rst-class:: sphx-glr-script-out - - Out: - - .. code-block:: none - - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - dtype=np.int): - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:35: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. 
Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:597: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:836: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, copy_X=True, fit_path=True, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. 
- Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, positive=False): - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1074: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - max_n_alphas=1000, n_jobs=1, eps=np.finfo(np.float).eps, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1306: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - max_n_alphas=1000, n_jobs=1, eps=np.finfo(np.float).eps, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1442: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. 
- Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, copy_X=True, positive=False): - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/randomized_l1.py:152: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - precompute=False, eps=np.finfo(np.float).eps, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/randomized_l1.py:318: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=np.finfo(np.float).eps, random_state=None, - /home/jonaschn/.pyenv/versions/anaconda3-5.3.1/lib/python3.7/site-packages/sklearn/linear_model/randomized_l1.py:575: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here. - Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations - eps=4 * np.finfo(np.float).eps, n_jobs=1, - .. GENERATED FROM PYTHON SOURCE LINES 124-128 We use the WordNet lemmatizer from NLTK. A lemmatizer is preferred over a -stemmer in this case because it produces more readable words. 
An output that is +stemmer in this case because it produces more readable words. Output that is easy to read is very desirable in topic modelling. @@ -295,7 +257,7 @@ original data, because we would like to keep the words "machine" and # Compute bigrams. from gensim.models import Phrases - # Add bigrams to docs (only ones that appear 20 times or more). + # Add bigrams and trigrams to docs (only ones that appear 20 times or more). bigram = Phrases(docs, min_count=20) for idx in range(len(docs)): for token in bigram[docs[idx]]: @@ -313,13 +275,11 @@ original data, because we would like to keep the words "machine" and .. code-block:: none - /home/jonaschn/Projects/gensim/gensim/similarities/__init__.py:11: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning. - "The gensim.similarities.levenshtein submodule is disabled, because the optional " - 2021-03-19 14:09:53,817 : INFO : collecting all words and their counts - 2021-03-19 14:09:53,817 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types - 2021-03-19 14:09:59,172 : INFO : collected 1120198 token types (unigram + bigrams) from a corpus of 4629808 words and 1740 sentences - 2021-03-19 14:09:59,172 : INFO : merged Phrases<1120198 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> - 2021-03-19 14:09:59,190 : INFO : Phrases lifecycle event {'msg': 'built Phrases<1120198 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> in 5.36s', 'datetime': '2021-03-19T14:09:59.189253', 'gensim': '4.0.0.rc1', 'python': '3.7.0 (default, Jun 28 2018, 13:15:42) \n[GCC 7.2.0]', 'platform': 'Linux-4.15.0-136-generic-x86_64-with-debian-buster-sid', 'event': 'created'} + 2022-04-22 17:42:29,962 : INFO : collecting all words and their counts + 2022-04-22 17:42:29,963 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types + 
2022-04-22 17:42:37,368 : INFO : collected 1120198 token types (unigram + bigrams) from a corpus of 4629808 words and 1740 sentences + 2022-04-22 17:42:37,368 : INFO : merged Phrases<1120198 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> + 2022-04-22 17:42:37,426 : INFO : Phrases lifecycle event {'msg': 'built Phrases<1120198 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> in 7.41s', 'datetime': '2022-04-22T17:42:37.369061', 'gensim': '4.1.3.dev0', 'python': '3.9.7 (default, Sep 3 2021, 12:37:55) \n[Clang 12.0.5 (clang-1205.0.22.9)]', 'platform': 'macOS-11.6.5-x86_64-i386-64bit', 'event': 'created'} @@ -356,12 +316,12 @@ frequency, or maybe combining that with this approach. .. code-block:: none - 2021-03-19 14:10:07,280 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2021-03-19 14:10:09,906 : INFO : built Dictionary(79429 unique tokens: ['1ooooo', '1st', '25oo', '2o00', '4ooo']...) from 1740 documents (total 4953968 corpus positions) - 2021-03-19 14:10:09,906 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(79429 unique tokens: ['1ooooo', '1st', '25oo', '2o00', '4ooo']...) from 1740 documents (total 4953968 corpus positions)", 'datetime': '2021-03-19T14:10:09.906597', 'gensim': '4.0.0.rc1', 'python': '3.7.0 (default, Jun 28 2018, 13:15:42) \n[GCC 7.2.0]', 'platform': 'Linux-4.15.0-136-generic-x86_64-with-debian-buster-sid', 'event': 'created'} - 2021-03-19 14:10:10,101 : INFO : discarding 70785 tokens: [('1ooooo', 1), ('25oo', 2), ('2o00', 6), ('4ooo', 2), ('64k', 6), ('a', 1740), ('aaditional', 1), ('above', 1114), ('abstract', 1740), ('acase', 1)]... - 2021-03-19 14:10:10,102 : INFO : keeping 8644 tokens which were in no less than 20 and no more than 870 (=50.0%) documents - 2021-03-19 14:10:10,128 : INFO : resulting dictionary: Dictionary(8644 unique tokens: ['1st', '5oo', '7th', 'a2', 'a_well']...) 
+ 2022-04-22 17:42:50,414 : INFO : adding document #0 to Dictionary<0 unique tokens: []> + 2022-04-22 17:42:54,959 : INFO : built Dictionary<79429 unique tokens: ['1ooooo', '1st', '25oo', '2o00', '4ooo']...> from 1740 documents (total 4953968 corpus positions) + 2022-04-22 17:42:54,960 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<79429 unique tokens: ['1ooooo', '1st', '25oo', '2o00', '4ooo']...> from 1740 documents (total 4953968 corpus positions)", 'datetime': '2022-04-22T17:42:54.960496', 'gensim': '4.1.3.dev0', 'python': '3.9.7 (default, Sep 3 2021, 12:37:55) \n[Clang 12.0.5 (clang-1205.0.22.9)]', 'platform': 'macOS-11.6.5-x86_64-i386-64bit', 'event': 'created'} + 2022-04-22 17:42:55,733 : INFO : discarding 70785 tokens: [('1ooooo', 1), ('25oo', 2), ('2o00', 6), ('4ooo', 2), ('64k', 6), ('a', 1740), ('aaditional', 1), ('above', 1114), ('abstract', 1740), ('acase', 1)]... + 2022-04-22 17:42:55,734 : INFO : keeping 8644 tokens which were in no less than 20 and no more than 870 (=50.0%) documents + 2022-04-22 17:42:55,779 : INFO : resulting dictionary: Dictionary<8644 unique tokens: ['1st', '5oo', '7th', 'a2', 'a_well']...> @@ -424,20 +384,19 @@ Training We are ready to train the LDA model. We will first discuss how to set some of the training parameters. -First of all, the elephant in the room: how many topics do I need? -There is really no easy answer for this. It will depend on both your -data and your application. I have used 10 topics here because I wanted -to have a few topics that I could interpret and "label", and because that -turned out to give me reasonably good results. On the other hand, you might -not need to interpret all your topics, so you could use many topics, -for example, 100. +First of all, the elephant in the room: how many topics do I need? There is +really no easy answer for this, it will depend on both your data and your +application. 
I have used 10 topics here because I wanted to have a few topics +that I could interpret and "label", and because that turned out to give me +reasonably good results. You might not need to interpret all your topics, so +you could use a large number of topics, for example 100. ``chunksize`` controls how many documents are processed at a time in the training algorithm. Increasing chunksize will speed up training, at least as long as the chunk of documents easily fit into memory. I've set ``chunksize = -2000``, which is more than the number of documents, so I process all the -data in one go. However, chunksize can influence the quality of the model, as -discussed in Hoffman and al. [2], but the difference was not +2000``, which is more than the amount of documents, so I process all the +data in one go. Chunksize can however influence the quality of the model, as +discussed in Hoffman and co-authors [2], but the difference was not substantial in this case. ``passes`` controls how often we train the model on the entire corpus. @@ -447,9 +406,8 @@ over each document. It is important to set the number of "passes" and "iterations" high enough. I suggest the following way to choose iterations and passes. First, enable -logging (``logging.basicConfig(level=logging.INFO, format='PID:%(process)d:%(threadName)s - -%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s')``), and set ``eval_every = 1`` -in ``LdaModel``. Then, when training the model, look for a line in the log that +logging (as described in many Gensim tutorials), and set ``eval_every = 1`` +in ``LdaModel``. When training the model look for a line in the log that looks something like this:: 2016-06-21 15:40:06,753 - gensim.models.ldamodel - DEBUG - 68/1566 documents converged within 400 iterations @@ -505,170 +463,170 @@ the model that we usually would have to specify explicitly. .. 
code-block:: none - 2021-03-19 14:10:12,273 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1] - 2021-03-19 14:10:12,278 : INFO : using serial LDA version on this node - 2021-03-19 14:10:12,478 : INFO : running online (multi-pass) LDA training, 10 topics, 20 passes over the supplied corpus of 1740 documents, updating model once every 1740 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000 - 2021-03-19 14:10:12,482 : INFO : PROGRESS: pass 0, at document #1740/1740 - 2021-03-19 14:10:27,000 : INFO : optimized alpha [0.06386429, 0.07352975, 0.10417274, 0.09618805, 0.09326739, 0.07658379, 0.05232423, 0.09257348, 0.05156824, 0.064680815] - 2021-03-19 14:10:27,050 : INFO : topic #8 (0.052): 0.004*"layer" + 0.004*"action" + 0.003*"generalization" + 0.003*"image" + 0.002*"dynamic" + 0.002*"sample" + 0.002*"optimal" + 0.002*"matrix" + 0.002*"net" + 0.002*"classifier" - 2021-03-19 14:10:27,051 : INFO : topic #6 (0.052): 0.006*"image" + 0.005*"hidden" + 0.004*"recognition" + 0.003*"component" + 0.003*"field" + 0.003*"dynamic" + 0.002*"map" + 0.002*"solution" + 0.002*"net" + 0.002*"generalization" - 2021-03-19 14:10:27,051 : INFO : topic #4 (0.093): 0.004*"class" + 0.003*"rule" + 0.003*"hidden" + 0.003*"neuron" + 0.003*"layer" + 0.003*"field" + 0.002*"noise" + 0.002*"net" + 0.002*"image" + 0.002*"node" - 2021-03-19 14:10:27,051 : INFO : topic #3 (0.096): 0.006*"image" + 0.003*"gaussian" + 0.003*"layer" + 0.003*"neuron" + 0.003*"field" + 0.003*"matrix" + 0.003*"circuit" + 0.003*"class" + 0.002*"threshold" + 0.002*"recognition" - 2021-03-19 14:10:27,051 : INFO : topic #2 (0.104): 0.005*"neuron" + 0.004*"image" + 0.004*"control" + 0.004*"layer" + 0.004*"hidden" + 0.003*"recognition" + 0.003*"object" + 0.003*"signal" + 0.003*"response" + 0.003*"class" - 2021-03-19 14:10:27,051 : INFO : topic diff=1.190941, rho=1.000000 - 2021-03-19 14:10:27,063 : INFO : PROGRESS: pass 1, at 
document #1740/1740 - 2021-03-19 14:10:36,200 : INFO : optimized alpha [0.05691391, 0.05848132, 0.0764488, 0.07592632, 0.07411411, 0.06465285, 0.046124753, 0.06826302, 0.043833494, 0.05291034] - 2021-03-19 14:10:36,207 : INFO : topic #8 (0.044): 0.007*"action" + 0.004*"robot" + 0.004*"control" + 0.003*"optimal" + 0.003*"policy" + 0.003*"reinforcement" + 0.003*"generalization" + 0.003*"dynamic" + 0.003*"layer" + 0.003*"trajectory" - 2021-03-19 14:10:36,207 : INFO : topic #6 (0.046): 0.007*"image" + 0.007*"hidden" + 0.005*"recognition" + 0.003*"hidden_unit" + 0.003*"energy" + 0.003*"component" + 0.003*"map" + 0.003*"generalization" + 0.003*"net" + 0.003*"layer" - 2021-03-19 14:10:36,207 : INFO : topic #4 (0.074): 0.005*"class" + 0.004*"rule" + 0.003*"hidden" + 0.003*"layer" + 0.003*"net" + 0.003*"classifier" + 0.002*"node" + 0.002*"word" + 0.002*"context" + 0.002*"architecture" - 2021-03-19 14:10:36,207 : INFO : topic #3 (0.076): 0.007*"image" + 0.004*"circuit" + 0.003*"layer" + 0.003*"field" + 0.003*"analog" + 0.003*"chip" + 0.003*"threshold" + 0.003*"gaussian" + 0.003*"class" + 0.003*"matrix" - 2021-03-19 14:10:36,208 : INFO : topic #2 (0.076): 0.005*"control" + 0.005*"recognition" + 0.005*"image" + 0.005*"object" + 0.004*"speech" + 0.004*"layer" + 0.004*"signal" + 0.004*"neuron" + 0.004*"hidden" + 0.003*"word" - 2021-03-19 14:10:36,208 : INFO : topic diff=0.297702, rho=0.577350 - 2021-03-19 14:10:36,218 : INFO : PROGRESS: pass 2, at document #1740/1740 - 2021-03-19 14:10:43,026 : INFO : optimized alpha [0.05407287, 0.051192053, 0.06480061, 0.06461501, 0.06359977, 0.05890888, 0.042885136, 0.056735355, 0.039943077, 0.04743726] - 2021-03-19 14:10:43,033 : INFO : topic #8 (0.040): 0.008*"action" + 0.006*"control" + 0.005*"robot" + 0.005*"reinforcement" + 0.005*"policy" + 0.004*"optimal" + 0.004*"dynamic" + 0.003*"trajectory" + 0.003*"reinforcement_learning" + 0.003*"controller" - 2021-03-19 14:10:43,033 : INFO : topic #6 (0.043): 0.008*"image" + 0.008*"hidden" + 
0.005*"recognition" + 0.004*"hidden_unit" + 0.003*"energy" + 0.003*"layer" + 0.003*"net" + 0.003*"generalization" + 0.003*"map" + 0.003*"solution" - 2021-03-19 14:10:43,034 : INFO : topic #4 (0.064): 0.005*"class" + 0.004*"rule" + 0.004*"hidden" + 0.004*"layer" + 0.003*"net" + 0.003*"classifier" + 0.003*"node" + 0.003*"word" + 0.003*"context" + 0.002*"architecture" - 2021-03-19 14:10:43,034 : INFO : topic #3 (0.065): 0.008*"image" + 0.004*"circuit" + 0.004*"chip" + 0.004*"analog" + 0.004*"threshold" + 0.004*"layer" + 0.003*"field" + 0.003*"node" + 0.003*"class" + 0.003*"net" - 2021-03-19 14:10:43,034 : INFO : topic #2 (0.065): 0.006*"recognition" + 0.006*"speech" + 0.005*"control" + 0.005*"object" + 0.005*"image" + 0.005*"layer" + 0.005*"signal" + 0.004*"word" + 0.004*"hidden" + 0.003*"classification" - 2021-03-19 14:10:43,034 : INFO : topic diff=0.256329, rho=0.500000 - 2021-03-19 14:10:43,044 : INFO : PROGRESS: pass 3, at document #1740/1740 - 2021-03-19 14:10:48,846 : INFO : optimized alpha [0.053115886, 0.046841364, 0.05838778, 0.05814584, 0.05758646, 0.05547897, 0.040862918, 0.05055692, 0.037515096, 0.044183854] - 2021-03-19 14:10:48,853 : INFO : topic #8 (0.038): 0.010*"action" + 0.008*"control" + 0.006*"reinforcement" + 0.006*"robot" + 0.005*"policy" + 0.005*"optimal" + 0.004*"controller" + 0.004*"dynamic" + 0.004*"reinforcement_learning" + 0.004*"trajectory" - 2021-03-19 14:10:48,853 : INFO : topic #6 (0.041): 0.009*"hidden" + 0.008*"image" + 0.006*"recognition" + 0.004*"hidden_unit" + 0.004*"layer" + 0.004*"energy" + 0.003*"net" + 0.003*"generalization" + 0.003*"field" + 0.003*"map" - 2021-03-19 14:10:48,853 : INFO : topic #4 (0.058): 0.005*"class" + 0.005*"hidden" + 0.004*"rule" + 0.004*"layer" + 0.004*"net" + 0.004*"classifier" + 0.003*"node" + 0.003*"propagation" + 0.003*"architecture" + 0.003*"context" - 2021-03-19 14:10:48,854 : INFO : topic #3 (0.058): 0.009*"image" + 0.005*"chip" + 0.005*"circuit" + 0.005*"analog" + 0.004*"threshold" + 0.004*"layer" 
+ 0.003*"field" + 0.003*"bit" + 0.003*"node" + 0.003*"net" - 2021-03-19 14:10:48,854 : INFO : topic #2 (0.058): 0.007*"recognition" + 0.007*"speech" + 0.006*"object" + 0.006*"image" + 0.005*"word" + 0.005*"layer" + 0.005*"control" + 0.005*"signal" + 0.004*"hidden" + 0.003*"face" - 2021-03-19 14:10:48,854 : INFO : topic diff=0.230126, rho=0.447214 - 2021-03-19 14:10:48,864 : INFO : PROGRESS: pass 4, at document #1740/1740 - 2021-03-19 14:10:54,097 : INFO : optimized alpha [0.052869715, 0.044183813, 0.0546517, 0.054109406, 0.053801704, 0.053375203, 0.0394719, 0.04672288, 0.035995413, 0.04192354] - 2021-03-19 14:10:54,105 : INFO : topic #8 (0.036): 0.010*"action" + 0.010*"control" + 0.007*"reinforcement" + 0.006*"robot" + 0.006*"policy" + 0.005*"optimal" + 0.005*"controller" + 0.005*"dynamic" + 0.004*"reinforcement_learning" + 0.004*"trajectory" - 2021-03-19 14:10:54,105 : INFO : topic #6 (0.039): 0.009*"hidden" + 0.008*"image" + 0.006*"recognition" + 0.005*"hidden_unit" + 0.004*"layer" + 0.004*"energy" + 0.003*"net" + 0.003*"digit" + 0.003*"field" + 0.003*"generalization" - 2021-03-19 14:10:54,105 : INFO : topic #4 (0.054): 0.005*"class" + 0.005*"hidden" + 0.005*"rule" + 0.005*"net" + 0.005*"layer" + 0.004*"classifier" + 0.004*"node" + 0.003*"propagation" + 0.003*"architecture" + 0.003*"sequence" - 2021-03-19 14:10:54,106 : INFO : topic #3 (0.054): 0.009*"image" + 0.006*"chip" + 0.006*"circuit" + 0.006*"analog" + 0.004*"threshold" + 0.004*"layer" + 0.003*"field" + 0.003*"bit" + 0.003*"node" + 0.003*"net" - 2021-03-19 14:10:54,106 : INFO : topic #2 (0.055): 0.008*"recognition" + 0.008*"speech" + 0.007*"object" + 0.006*"word" + 0.006*"image" + 0.005*"layer" + 0.005*"signal" + 0.005*"control" + 0.004*"hidden" + 0.004*"face" - 2021-03-19 14:10:54,106 : INFO : topic diff=0.214075, rho=0.408248 - 2021-03-19 14:10:54,116 : INFO : PROGRESS: pass 5, at document #1740/1740 - 2021-03-19 14:10:59,195 : INFO : optimized alpha [0.05290075, 0.042460088, 0.052235015, 0.051339325, 
0.05138389, 0.05190376, 0.038578223, 0.044312876, 0.035001513, 0.040355477] - 2021-03-19 14:10:59,202 : INFO : topic #8 (0.035): 0.011*"control" + 0.011*"action" + 0.007*"reinforcement" + 0.006*"policy" + 0.006*"robot" + 0.005*"controller" + 0.005*"optimal" + 0.005*"dynamic" + 0.005*"reinforcement_learning" + 0.005*"trajectory" - 2021-03-19 14:10:59,202 : INFO : topic #6 (0.039): 0.010*"hidden" + 0.008*"image" + 0.006*"recognition" + 0.005*"hidden_unit" + 0.005*"layer" + 0.004*"energy" + 0.004*"digit" + 0.004*"character" + 0.004*"net" + 0.003*"field" - 2021-03-19 14:10:59,203 : INFO : topic #5 (0.052): 0.021*"neuron" + 0.012*"cell" + 0.007*"response" + 0.007*"spike" + 0.006*"synaptic" + 0.006*"stimulus" + 0.005*"activity" + 0.005*"firing" + 0.005*"signal" + 0.004*"memory" - 2021-03-19 14:10:59,203 : INFO : topic #2 (0.052): 0.009*"recognition" + 0.008*"speech" + 0.007*"object" + 0.007*"word" + 0.006*"image" + 0.006*"signal" + 0.005*"layer" + 0.004*"hidden" + 0.004*"control" + 0.004*"face" - 2021-03-19 14:10:59,203 : INFO : topic #0 (0.053): 0.005*"gaussian" + 0.005*"noise" + 0.005*"matrix" + 0.005*"hidden" + 0.004*"approximation" + 0.004*"sample" + 0.004*"estimate" + 0.004*"variance" + 0.004*"bayesian" + 0.003*"prior" - 2021-03-19 14:10:59,203 : INFO : topic diff=0.202368, rho=0.377964 - 2021-03-19 14:10:59,214 : INFO : PROGRESS: pass 6, at document #1740/1740 - 2021-03-19 14:11:04,013 : INFO : optimized alpha [0.053310633, 0.041254587, 0.050613035, 0.04936813, 0.049790192, 0.05083673, 0.038025398, 0.042830754, 0.034370847, 0.039269455] - 2021-03-19 14:11:04,020 : INFO : topic #8 (0.034): 0.012*"control" + 0.011*"action" + 0.008*"reinforcement" + 0.007*"policy" + 0.006*"robot" + 0.006*"controller" + 0.005*"optimal" + 0.005*"dynamic" + 0.005*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:04,020 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"recognition" + 0.005*"hidden_unit" + 0.005*"layer" + 0.004*"energy" + 0.004*"character" + 
0.004*"digit" + 0.004*"net" + 0.004*"field" - 2021-03-19 14:11:04,021 : INFO : topic #2 (0.051): 0.010*"recognition" + 0.009*"speech" + 0.007*"word" + 0.007*"object" + 0.007*"image" + 0.006*"signal" + 0.006*"layer" + 0.004*"hidden" + 0.004*"face" + 0.004*"classification" - 2021-03-19 14:11:04,021 : INFO : topic #5 (0.051): 0.021*"neuron" + 0.012*"cell" + 0.007*"response" + 0.007*"spike" + 0.006*"synaptic" + 0.006*"stimulus" + 0.006*"activity" + 0.005*"firing" + 0.005*"signal" + 0.004*"frequency" - 2021-03-19 14:11:04,021 : INFO : topic #0 (0.053): 0.006*"gaussian" + 0.005*"noise" + 0.005*"matrix" + 0.005*"hidden" + 0.004*"approximation" + 0.004*"estimate" + 0.004*"sample" + 0.004*"bayesian" + 0.004*"variance" + 0.004*"prior" - 2021-03-19 14:11:04,021 : INFO : topic diff=0.192693, rho=0.353553 - 2021-03-19 14:11:04,032 : INFO : PROGRESS: pass 7, at document #1740/1740 - 2021-03-19 14:11:08,718 : INFO : optimized alpha [0.053891532, 0.040544394, 0.049499568, 0.047873296, 0.04881682, 0.0500006, 0.037689965, 0.04181969, 0.03393164, 0.038607482] - 2021-03-19 14:11:08,725 : INFO : topic #8 (0.034): 0.013*"control" + 0.012*"action" + 0.008*"reinforcement" + 0.007*"policy" + 0.006*"robot" + 0.006*"controller" + 0.005*"dynamic" + 0.005*"optimal" + 0.005*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:08,725 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"recognition" + 0.005*"layer" + 0.005*"hidden_unit" + 0.005*"character" + 0.004*"energy" + 0.004*"digit" + 0.004*"net" + 0.004*"field" - 2021-03-19 14:11:08,726 : INFO : topic #2 (0.049): 0.011*"recognition" + 0.009*"speech" + 0.008*"word" + 0.007*"object" + 0.007*"image" + 0.006*"signal" + 0.006*"layer" + 0.004*"face" + 0.004*"hidden" + 0.004*"classification" - 2021-03-19 14:11:08,726 : INFO : topic #5 (0.050): 0.022*"neuron" + 0.012*"cell" + 0.007*"response" + 0.007*"spike" + 0.007*"synaptic" + 0.006*"stimulus" + 0.006*"activity" + 0.005*"firing" + 0.005*"signal" + 0.005*"frequency" - 
2021-03-19 14:11:08,726 : INFO : topic #0 (0.054): 0.006*"gaussian" + 0.005*"noise" + 0.005*"matrix" + 0.004*"approximation" + 0.004*"hidden" + 0.004*"estimate" + 0.004*"sample" + 0.004*"bayesian" + 0.004*"variance" + 0.004*"prior" - 2021-03-19 14:11:08,726 : INFO : topic diff=0.183651, rho=0.333333 - 2021-03-19 14:11:08,737 : INFO : PROGRESS: pass 8, at document #1740/1740 - 2021-03-19 14:11:13,510 : INFO : optimized alpha [0.0545965, 0.040113404, 0.048812777, 0.0467447, 0.048271947, 0.049433745, 0.03755086, 0.04124074, 0.033623673, 0.038269136] - 2021-03-19 14:11:13,518 : INFO : topic #8 (0.034): 0.014*"control" + 0.012*"action" + 0.008*"reinforcement" + 0.007*"policy" + 0.006*"controller" + 0.006*"robot" + 0.006*"dynamic" + 0.006*"optimal" + 0.005*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:13,518 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"recognition" + 0.005*"layer" + 0.005*"hidden_unit" + 0.005*"character" + 0.004*"energy" + 0.004*"digit" + 0.004*"net" + 0.004*"field" - 2021-03-19 14:11:13,518 : INFO : topic #2 (0.049): 0.011*"recognition" + 0.009*"speech" + 0.008*"word" + 0.008*"object" + 0.007*"image" + 0.006*"signal" + 0.006*"layer" + 0.004*"face" + 0.004*"classification" + 0.004*"hidden" - 2021-03-19 14:11:13,518 : INFO : topic #5 (0.049): 0.022*"neuron" + 0.013*"cell" + 0.008*"response" + 0.007*"spike" + 0.007*"synaptic" + 0.006*"stimulus" + 0.006*"activity" + 0.006*"firing" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:13,519 : INFO : topic #0 (0.055): 0.006*"gaussian" + 0.005*"noise" + 0.005*"matrix" + 0.004*"approximation" + 0.004*"estimate" + 0.004*"hidden" + 0.004*"sample" + 0.004*"bayesian" + 0.004*"likelihood" + 0.004*"variance" - 2021-03-19 14:11:13,519 : INFO : topic diff=0.175043, rho=0.316228 - 2021-03-19 14:11:13,530 : INFO : PROGRESS: pass 9, at document #1740/1740 - 2021-03-19 14:11:18,487 : INFO : optimized alpha [0.055368014, 0.039957594, 0.048399936, 0.045934383, 0.04802085, 
0.049097233, 0.037513737, 0.040929828, 0.0334422, 0.038141657] - 2021-03-19 14:11:18,495 : INFO : topic #8 (0.033): 0.014*"control" + 0.012*"action" + 0.008*"reinforcement" + 0.007*"policy" + 0.006*"controller" + 0.006*"robot" + 0.006*"dynamic" + 0.006*"optimal" + 0.005*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:18,495 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"layer" + 0.006*"recognition" + 0.005*"character" + 0.005*"hidden_unit" + 0.004*"digit" + 0.004*"energy" + 0.004*"field" + 0.004*"net" - 2021-03-19 14:11:18,496 : INFO : topic #2 (0.048): 0.012*"recognition" + 0.010*"speech" + 0.009*"word" + 0.008*"image" + 0.008*"object" + 0.006*"signal" + 0.006*"layer" + 0.004*"face" + 0.004*"classification" + 0.004*"trained" - 2021-03-19 14:11:18,496 : INFO : topic #5 (0.049): 0.022*"neuron" + 0.013*"cell" + 0.008*"spike" + 0.008*"response" + 0.007*"synaptic" + 0.006*"stimulus" + 0.006*"activity" + 0.006*"firing" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:18,496 : INFO : topic #0 (0.055): 0.006*"gaussian" + 0.005*"noise" + 0.005*"matrix" + 0.005*"estimate" + 0.005*"approximation" + 0.004*"hidden" + 0.004*"sample" + 0.004*"bayesian" + 0.004*"likelihood" + 0.004*"prior" - 2021-03-19 14:11:18,496 : INFO : topic diff=0.166410, rho=0.301511 - 2021-03-19 14:11:18,507 : INFO : PROGRESS: pass 10, at document #1740/1740 - 2021-03-19 14:11:23,641 : INFO : optimized alpha [0.056234606, 0.039904997, 0.04814231, 0.045396697, 0.048054837, 0.048870783, 0.037563145, 0.04080154, 0.03336996, 0.03815883] - 2021-03-19 14:11:23,650 : INFO : topic #8 (0.033): 0.015*"control" + 0.012*"action" + 0.008*"reinforcement" + 0.008*"policy" + 0.007*"controller" + 0.006*"robot" + 0.006*"dynamic" + 0.006*"optimal" + 0.005*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:23,651 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"layer" + 0.006*"character" + 0.005*"recognition" + 0.005*"hidden_unit" + 0.004*"digit" + 
0.004*"energy" + 0.004*"field" + 0.004*"net" - 2021-03-19 14:11:23,651 : INFO : topic #2 (0.048): 0.012*"recognition" + 0.010*"speech" + 0.009*"word" + 0.008*"image" + 0.008*"object" + 0.006*"signal" + 0.006*"layer" + 0.005*"face" + 0.004*"classification" + 0.004*"trained" - 2021-03-19 14:11:23,651 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.013*"cell" + 0.008*"spike" + 0.008*"response" + 0.007*"synaptic" + 0.006*"activity" + 0.006*"stimulus" + 0.006*"firing" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:23,651 : INFO : topic #0 (0.056): 0.006*"gaussian" + 0.005*"noise" + 0.005*"estimate" + 0.005*"matrix" + 0.005*"approximation" + 0.004*"bayesian" + 0.004*"likelihood" + 0.004*"sample" + 0.004*"hidden" + 0.004*"prior" - 2021-03-19 14:11:23,651 : INFO : topic diff=0.157726, rho=0.288675 - 2021-03-19 14:11:23,663 : INFO : PROGRESS: pass 11, at document #1740/1740 - 2021-03-19 14:11:28,247 : INFO : optimized alpha [0.05706192, 0.039978355, 0.04797657, 0.044978894, 0.048209604, 0.048704833, 0.03767563, 0.04074631, 0.033347335, 0.038310345] - 2021-03-19 14:11:28,255 : INFO : topic #8 (0.033): 0.015*"control" + 0.012*"action" + 0.008*"reinforcement" + 0.008*"policy" + 0.007*"controller" + 0.006*"robot" + 0.006*"dynamic" + 0.006*"optimal" + 0.006*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:28,256 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"layer" + 0.006*"character" + 0.005*"recognition" + 0.005*"hidden_unit" + 0.004*"digit" + 0.004*"energy" + 0.004*"field" + 0.004*"net" - 2021-03-19 14:11:28,256 : INFO : topic #4 (0.048): 0.008*"hidden" + 0.007*"net" + 0.006*"layer" + 0.006*"rule" + 0.005*"node" + 0.004*"classifier" + 0.004*"hidden_unit" + 0.004*"class" + 0.004*"propagation" + 0.004*"sequence" - 2021-03-19 14:11:28,256 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.013*"cell" + 0.008*"spike" + 0.008*"response" + 0.007*"synaptic" + 0.006*"activity" + 0.006*"firing" + 0.006*"stimulus" + 0.005*"signal" + 
0.005*"frequency" - 2021-03-19 14:11:28,256 : INFO : topic #0 (0.057): 0.006*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"matrix" + 0.005*"approximation" + 0.004*"likelihood" + 0.004*"bayesian" + 0.004*"prior" + 0.004*"sample" + 0.004*"hidden" - 2021-03-19 14:11:28,256 : INFO : topic diff=0.149091, rho=0.277350 - 2021-03-19 14:11:28,268 : INFO : PROGRESS: pass 12, at document #1740/1740 - 2021-03-19 14:11:32,844 : INFO : optimized alpha [0.057841934, 0.040147286, 0.047984846, 0.04466845, 0.048510514, 0.048608452, 0.037831437, 0.04078982, 0.03338453, 0.038538743] - 2021-03-19 14:11:32,852 : INFO : topic #8 (0.033): 0.015*"control" + 0.012*"action" + 0.008*"reinforcement" + 0.008*"policy" + 0.007*"controller" + 0.006*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:32,852 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"layer" + 0.006*"character" + 0.005*"recognition" + 0.005*"hidden_unit" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"field" - 2021-03-19 14:11:32,853 : INFO : topic #4 (0.049): 0.008*"hidden" + 0.007*"net" + 0.006*"layer" + 0.006*"rule" + 0.005*"node" + 0.004*"hidden_unit" + 0.004*"classifier" + 0.004*"class" + 0.004*"propagation" + 0.004*"sequence" - 2021-03-19 14:11:32,853 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.013*"cell" + 0.008*"spike" + 0.008*"response" + 0.007*"synaptic" + 0.006*"firing" + 0.006*"activity" + 0.006*"stimulus" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:32,853 : INFO : topic #0 (0.058): 0.006*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"approximation" + 0.005*"matrix" + 0.004*"likelihood" + 0.004*"bayesian" + 0.004*"prior" + 0.004*"variance" + 0.004*"sample" - 2021-03-19 14:11:32,853 : INFO : topic diff=0.140596, rho=0.267261 - 2021-03-19 14:11:32,865 : INFO : PROGRESS: pass 13, at document #1740/1740 - 2021-03-19 14:11:37,447 : INFO : optimized alpha [0.058551796, 0.040399875, 0.048106886, 
0.044424307, 0.04896659, 0.04858641, 0.03804483, 0.040931225, 0.03344661, 0.038809597] - 2021-03-19 14:11:37,455 : INFO : topic #8 (0.033): 0.016*"control" + 0.013*"action" + 0.008*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.006*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:37,455 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"layer" + 0.006*"character" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"field" - 2021-03-19 14:11:37,456 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.013*"cell" + 0.008*"spike" + 0.008*"response" + 0.007*"synaptic" + 0.006*"firing" + 0.006*"activity" + 0.006*"stimulus" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:37,456 : INFO : topic #4 (0.049): 0.008*"hidden" + 0.007*"net" + 0.006*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"classifier" + 0.004*"class" + 0.004*"sequence" + 0.004*"propagation" - 2021-03-19 14:11:37,456 : INFO : topic #0 (0.059): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"approximation" + 0.005*"matrix" + 0.005*"likelihood" + 0.004*"bayesian" + 0.004*"prior" + 0.004*"variance" + 0.004*"sample" - 2021-03-19 14:11:37,456 : INFO : topic diff=0.132327, rho=0.258199 - 2021-03-19 14:11:37,467 : INFO : PROGRESS: pass 14, at document #1740/1740 - 2021-03-19 14:11:41,536 : INFO : optimized alpha [0.05925279, 0.040705983, 0.04832607, 0.04427085, 0.049501013, 0.048644915, 0.038285527, 0.04113948, 0.03352695, 0.039150245] - 2021-03-19 14:11:41,544 : INFO : topic #8 (0.034): 0.016*"control" + 0.013*"action" + 0.009*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.006*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:41,544 : INFO : topic #6 (0.038): 0.010*"hidden" + 0.009*"image" + 0.006*"character" + 0.006*"layer" + 0.005*"hidden_unit" + 
0.005*"recognition" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"net" - 2021-03-19 14:11:41,544 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.013*"cell" + 0.008*"spike" + 0.008*"response" + 0.007*"synaptic" + 0.006*"firing" + 0.006*"activity" + 0.006*"stimulus" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:41,545 : INFO : topic #4 (0.050): 0.008*"hidden" + 0.008*"net" + 0.006*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"sequence" + 0.004*"propagation" + 0.004*"architecture" + 0.004*"activation" - 2021-03-19 14:11:41,545 : INFO : topic #0 (0.059): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"approximation" + 0.005*"likelihood" + 0.005*"matrix" + 0.004*"prior" + 0.004*"bayesian" + 0.004*"variance" + 0.004*"density" - 2021-03-19 14:11:41,545 : INFO : topic diff=0.124371, rho=0.250000 - 2021-03-19 14:11:41,556 : INFO : PROGRESS: pass 15, at document #1740/1740 - 2021-03-19 14:11:45,592 : INFO : optimized alpha [0.05994643, 0.041028578, 0.048593685, 0.04419364, 0.05009154, 0.048734292, 0.03856185, 0.041424613, 0.033627965, 0.039535556] - 2021-03-19 14:11:45,600 : INFO : topic #8 (0.034): 0.016*"control" + 0.013*"action" + 0.009*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.007*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.005*"reinforcement_learning" - 2021-03-19 14:11:45,600 : INFO : topic #6 (0.039): 0.010*"hidden" + 0.009*"image" + 0.006*"character" + 0.006*"layer" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"dynamic" - 2021-03-19 14:11:45,600 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.014*"cell" + 0.008*"spike" + 0.008*"response" + 0.008*"synaptic" + 0.006*"firing" + 0.006*"activity" + 0.006*"stimulus" + 0.005*"signal" + 0.005*"frequency" - 2021-03-19 14:11:45,600 : INFO : topic #4 (0.050): 0.008*"hidden" + 0.008*"net" + 0.007*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 
0.004*"sequence" + 0.004*"architecture" + 0.004*"propagation" + 0.004*"activation" - 2021-03-19 14:11:45,601 : INFO : topic #0 (0.060): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"likelihood" + 0.005*"approximation" + 0.005*"matrix" + 0.004*"prior" + 0.004*"bayesian" + 0.004*"variance" + 0.004*"density" - 2021-03-19 14:11:45,601 : INFO : topic diff=0.116794, rho=0.242536 - 2021-03-19 14:11:45,611 : INFO : PROGRESS: pass 16, at document #1740/1740 - 2021-03-19 14:11:49,737 : INFO : optimized alpha [0.06068379, 0.041378528, 0.048856508, 0.0441432, 0.05072476, 0.0488511, 0.038870405, 0.041741073, 0.03375229, 0.039979585] - 2021-03-19 14:11:49,745 : INFO : topic #8 (0.034): 0.016*"control" + 0.013*"action" + 0.009*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.007*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.006*"reinforcement_learning" - 2021-03-19 14:11:49,745 : INFO : topic #6 (0.039): 0.010*"hidden" + 0.009*"image" + 0.006*"character" + 0.006*"layer" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"dynamic" - 2021-03-19 14:11:49,745 : INFO : topic #5 (0.049): 0.023*"neuron" + 0.014*"cell" + 0.008*"spike" + 0.008*"response" + 0.008*"synaptic" + 0.006*"firing" + 0.006*"activity" + 0.006*"stimulus" + 0.006*"signal" + 0.005*"frequency" - 2021-03-19 14:11:49,746 : INFO : topic #4 (0.051): 0.008*"hidden" + 0.008*"net" + 0.007*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"sequence" + 0.004*"architecture" + 0.004*"activation" + 0.004*"propagation" - 2021-03-19 14:11:49,746 : INFO : topic #0 (0.061): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"likelihood" + 0.005*"approximation" + 0.005*"prior" + 0.004*"bayesian" + 0.004*"matrix" + 0.004*"variance" + 0.004*"density" - 2021-03-19 14:11:49,746 : INFO : topic diff=0.109661, rho=0.235702 - 2021-03-19 14:11:49,756 : INFO : PROGRESS: pass 17, at document #1740/1740 - 2021-03-19 
14:11:53,841 : INFO : optimized alpha [0.061406724, 0.04174132, 0.0491224, 0.044116188, 0.05141323, 0.049025778, 0.03920408, 0.04207979, 0.033907466, 0.04045379] - 2021-03-19 14:11:53,850 : INFO : topic #8 (0.034): 0.016*"control" + 0.013*"action" + 0.009*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.007*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.006*"reinforcement_learning" - 2021-03-19 14:11:53,850 : INFO : topic #6 (0.039): 0.010*"hidden" + 0.009*"image" + 0.007*"character" + 0.006*"layer" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"dynamic" - 2021-03-19 14:11:53,850 : INFO : topic #2 (0.049): 0.014*"recognition" + 0.011*"speech" + 0.010*"word" + 0.010*"image" + 0.008*"object" + 0.006*"signal" + 0.005*"layer" + 0.005*"face" + 0.005*"classification" + 0.005*"trained" - 2021-03-19 14:11:53,851 : INFO : topic #4 (0.051): 0.009*"hidden" + 0.008*"net" + 0.007*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"architecture" + 0.004*"sequence" + 0.004*"activation" + 0.004*"propagation" - 2021-03-19 14:11:53,851 : INFO : topic #0 (0.061): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"likelihood" + 0.005*"approximation" + 0.005*"prior" + 0.005*"bayesian" + 0.004*"matrix" + 0.004*"variance" + 0.004*"density" - 2021-03-19 14:11:53,851 : INFO : topic diff=0.102938, rho=0.229416 - 2021-03-19 14:11:53,862 : INFO : PROGRESS: pass 18, at document #1740/1740 - 2021-03-19 14:11:57,816 : INFO : optimized alpha [0.062154472, 0.042110436, 0.04939213, 0.044109803, 0.05212181, 0.049227104, 0.039544087, 0.04246847, 0.03410476, 0.040957462] - 2021-03-19 14:11:57,823 : INFO : topic #8 (0.034): 0.016*"control" + 0.013*"action" + 0.009*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.007*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.006*"reinforcement_learning" - 2021-03-19 14:11:57,824 : INFO : topic #6 (0.040): 0.010*"hidden" 
+ 0.008*"image" + 0.007*"character" + 0.006*"layer" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"energy" + 0.004*"attractor" + 0.004*"dynamic" - 2021-03-19 14:11:57,824 : INFO : topic #2 (0.049): 0.014*"recognition" + 0.011*"speech" + 0.010*"word" + 0.010*"image" + 0.008*"object" + 0.006*"signal" + 0.005*"layer" + 0.005*"face" + 0.005*"classification" + 0.005*"trained" - 2021-03-19 14:11:57,824 : INFO : topic #4 (0.052): 0.009*"hidden" + 0.008*"net" + 0.007*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"architecture" + 0.004*"sequence" + 0.004*"activation" + 0.004*"propagation" - 2021-03-19 14:11:57,824 : INFO : topic #0 (0.062): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"likelihood" + 0.005*"approximation" + 0.005*"prior" + 0.005*"bayesian" + 0.004*"matrix" + 0.004*"density" + 0.004*"variance" - 2021-03-19 14:11:57,825 : INFO : topic diff=0.096678, rho=0.223607 - 2021-03-19 14:11:57,835 : INFO : PROGRESS: pass 19, at document #1740/1740 - 2021-03-19 14:12:01,856 : INFO : optimized alpha [0.06292996, 0.04251684, 0.049703237, 0.044167582, 0.052860808, 0.049467582, 0.039925203, 0.042864826, 0.03433462, 0.0415304] - 2021-03-19 14:12:01,864 : INFO : topic #8 (0.034): 0.016*"control" + 0.013*"action" + 0.009*"policy" + 0.008*"reinforcement" + 0.007*"controller" + 0.007*"dynamic" + 0.006*"robot" + 0.006*"optimal" + 0.006*"trajectory" + 0.006*"reinforcement_learning" - 2021-03-19 14:12:01,864 : INFO : topic #6 (0.040): 0.010*"hidden" + 0.008*"image" + 0.007*"character" + 0.006*"layer" + 0.005*"hidden_unit" + 0.005*"recognition" + 0.005*"digit" + 0.004*"attractor" + 0.004*"energy" + 0.004*"dynamic" - 2021-03-19 14:12:01,864 : INFO : topic #2 (0.050): 0.014*"recognition" + 0.011*"speech" + 0.010*"word" + 0.010*"image" + 0.008*"object" + 0.006*"signal" + 0.005*"layer" + 0.005*"classification" + 0.005*"face" + 0.005*"trained" - 2021-03-19 14:12:01,865 : INFO : topic #4 (0.053): 0.009*"hidden" + 0.008*"net" + 
0.007*"layer" + 0.006*"rule" + 0.006*"node" + 0.005*"hidden_unit" + 0.004*"architecture" + 0.004*"activation" + 0.004*"sequence" + 0.004*"propagation" - 2021-03-19 14:12:01,865 : INFO : topic #0 (0.063): 0.007*"gaussian" + 0.006*"noise" + 0.005*"estimate" + 0.005*"likelihood" + 0.005*"approximation" + 0.005*"prior" + 0.005*"bayesian" + 0.004*"density" + 0.004*"mixture" + 0.004*"variance" - 2021-03-19 14:12:01,865 : INFO : topic diff=0.090853, rho=0.218218 - 2021-03-19 14:12:01,877 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel(num_terms=8644, num_topics=10, decay=0.5, chunksize=2000) in 109.40s', 'datetime': '2021-03-19T14:12:01.877604', 'gensim': '4.0.0.rc1', 'python': '3.7.0 (default, Jun 28 2018, 13:15:42) \n[GCC 7.2.0]', 'platform': 'Linux-4.15.0-136-generic-x86_64-with-debian-buster-sid', 'event': 'created'} + 2022-04-22 17:43:05,111 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1] + 2022-04-22 17:43:05,115 : INFO : using serial LDA version on this node + 2022-04-22 17:43:05,137 : INFO : running online (multi-pass) LDA training, 10 topics, 20 passes over the supplied corpus of 1740 documents, updating model once every 1740 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000 + 2022-04-22 17:43:05,148 : INFO : PROGRESS: pass 0, at document #1740/1740 + 2022-04-22 17:43:21,190 : INFO : optimized alpha [0.0578294, 0.07125457, 0.07889137, 0.09016259, 0.077791244, 0.0792375, 0.097086295, 0.061600033, 0.095310934, 0.060617708] + 2022-04-22 17:43:21,202 : INFO : topic #0 (0.058): 0.007*"hidden" + 0.006*"word" + 0.005*"recognition" + 0.004*"gaussian" + 0.003*"hidden_unit" + 0.003*"rule" + 0.003*"component" + 0.003*"layer" + 0.003*"image" + 0.002*"connection" + 2022-04-22 17:43:21,202 : INFO : topic #9 (0.061): 0.015*"neuron" + 0.007*"cell" + 0.005*"signal" + 0.005*"spike" + 0.004*"layer" + 0.004*"response" + 0.004*"firing" + 0.004*"noise" + 
0.003*"density" + 0.003*"hidden" + 2022-04-22 17:43:21,202 : INFO : topic #3 (0.090): 0.006*"image" + 0.005*"class" + 0.003*"classifier" + 0.003*"classification" + 0.003*"recognition" + 0.003*"component" + 0.003*"kernel" + 0.003*"noise" + 0.003*"sequence" + 0.002*"rule" + 2022-04-22 17:43:21,203 : INFO : topic #8 (0.095): 0.004*"hidden" + 0.003*"signal" + 0.003*"rule" + 0.003*"dynamic" + 0.002*"control" + 0.002*"prediction" + 0.002*"net" + 0.002*"sequence" + 0.002*"speech" + 0.002*"matrix" + 2022-04-22 17:43:21,203 : INFO : topic #6 (0.097): 0.006*"image" + 0.005*"cell" + 0.004*"neuron" + 0.004*"layer" + 0.004*"field" + 0.004*"object" + 0.003*"recognition" + 0.003*"signal" + 0.003*"noise" + 0.003*"class" + 2022-04-22 17:43:21,203 : INFO : topic diff=1.159133, rho=1.000000 + 2022-04-22 17:43:21,212 : INFO : PROGRESS: pass 1, at document #1740/1740 + 2022-04-22 17:43:30,981 : INFO : optimized alpha [0.05010912, 0.057179544, 0.06367695, 0.07760008, 0.061386272, 0.06139503, 0.06987214, 0.050920427, 0.08028384, 0.05094144] + 2022-04-22 17:43:30,987 : INFO : topic #0 (0.050): 0.009*"word" + 0.009*"hidden" + 0.008*"recognition" + 0.005*"gaussian" + 0.005*"speech" + 0.004*"hidden_unit" + 0.004*"mixture" + 0.003*"layer" + 0.003*"component" + 0.003*"likelihood" + 2022-04-22 17:43:30,987 : INFO : topic #9 (0.051): 0.019*"neuron" + 0.009*"cell" + 0.009*"spike" + 0.007*"signal" + 0.006*"response" + 0.005*"firing" + 0.005*"stimulus" + 0.005*"noise" + 0.004*"layer" + 0.004*"visual" + 2022-04-22 17:43:30,987 : INFO : topic #6 (0.070): 0.007*"image" + 0.006*"cell" + 0.005*"object" + 0.005*"field" + 0.004*"motion" + 0.004*"visual" + 0.004*"signal" + 0.004*"direction" + 0.004*"layer" + 0.004*"filter" + 2022-04-22 17:43:30,988 : INFO : topic #3 (0.078): 0.008*"image" + 0.006*"class" + 0.005*"classifier" + 0.004*"classification" + 0.003*"kernel" + 0.003*"recognition" + 0.003*"component" + 0.003*"noise" + 0.003*"estimate" + 0.003*"gaussian" + 2022-04-22 17:43:30,988 : INFO : topic #8 
(0.080): 0.004*"hidden" + 0.004*"rule" + 0.003*"sequence" + 0.003*"prediction" + 0.003*"net" + 0.003*"bound" + 0.003*"optimal" + 0.003*"signal" + 0.003*"dynamic" + 0.002*"hidden_unit" + 2022-04-22 17:43:30,988 : INFO : topic diff=0.292768, rho=0.577350 + 2022-04-22 17:43:30,996 : INFO : PROGRESS: pass 2, at document #1740/1740 + 2022-04-22 17:43:38,324 : INFO : optimized alpha [0.046267115, 0.049782153, 0.055386752, 0.070311576, 0.054385237, 0.052613482, 0.0592381, 0.044921257, 0.07121881, 0.045337107] + 2022-04-22 17:43:38,330 : INFO : topic #7 (0.045): 0.009*"chip" + 0.006*"analog" + 0.006*"neuron" + 0.006*"noise" + 0.006*"memory" + 0.005*"layer" + 0.004*"connection" + 0.004*"signal" + 0.004*"circuit" + 0.004*"image" + 2022-04-22 17:43:38,331 : INFO : topic #9 (0.045): 0.021*"neuron" + 0.011*"spike" + 0.011*"cell" + 0.007*"signal" + 0.007*"response" + 0.007*"stimulus" + 0.006*"firing" + 0.005*"noise" + 0.004*"visual" + 0.004*"layer" + 2022-04-22 17:43:38,331 : INFO : topic #6 (0.059): 0.009*"image" + 0.007*"object" + 0.006*"cell" + 0.006*"visual" + 0.006*"motion" + 0.005*"field" + 0.005*"direction" + 0.004*"filter" + 0.004*"signal" + 0.004*"response" + 2022-04-22 17:43:38,331 : INFO : topic #3 (0.070): 0.007*"image" + 0.007*"class" + 0.005*"classifier" + 0.004*"classification" + 0.003*"kernel" + 0.003*"sample" + 0.003*"estimate" + 0.003*"gaussian" + 0.003*"component" + 0.003*"noise" + 2022-04-22 17:43:38,331 : INFO : topic #8 (0.071): 0.005*"hidden" + 0.005*"rule" + 0.003*"sequence" + 0.003*"net" + 0.003*"bound" + 0.003*"prediction" + 0.003*"optimal" + 0.003*"generalization" + 0.003*"hidden_unit" + 0.002*"tree" + 2022-04-22 17:43:38,331 : INFO : topic diff=0.259048, rho=0.500000 + 2022-04-22 17:43:38,339 : INFO : PROGRESS: pass 3, at document #1740/1740 + 2022-04-22 17:43:44,815 : INFO : optimized alpha [0.04398281, 0.045212083, 0.050260257, 0.066244416, 0.050919566, 0.047668763, 0.053777307, 0.041211806, 0.06501518, 0.041524593] + 2022-04-22 17:43:44,821 : INFO 
: topic #7 (0.041): 0.010*"chip" + 0.007*"analog" + 0.007*"neuron" + 0.006*"memory" + 0.006*"noise" + 0.005*"circuit" + 0.005*"signal" + 0.005*"layer" + 0.004*"voltage" + 0.004*"connection" + 2022-04-22 17:43:44,821 : INFO : topic #9 (0.042): 0.021*"neuron" + 0.012*"spike" + 0.012*"cell" + 0.008*"signal" + 0.008*"stimulus" + 0.008*"response" + 0.007*"firing" + 0.005*"noise" + 0.004*"visual" + 0.004*"activity" + 2022-04-22 17:43:44,821 : INFO : topic #6 (0.054): 0.011*"image" + 0.008*"object" + 0.007*"visual" + 0.007*"motion" + 0.006*"field" + 0.006*"cell" + 0.005*"direction" + 0.005*"filter" + 0.004*"signal" + 0.004*"response" + 2022-04-22 17:43:44,822 : INFO : topic #8 (0.065): 0.005*"rule" + 0.005*"hidden" + 0.003*"sequence" + 0.003*"generalization" + 0.003*"net" + 0.003*"bound" + 0.003*"prediction" + 0.003*"hidden_unit" + 0.003*"optimal" + 0.003*"machine" + 2022-04-22 17:43:44,822 : INFO : topic #3 (0.066): 0.007*"image" + 0.007*"class" + 0.005*"classifier" + 0.005*"classification" + 0.004*"gaussian" + 0.004*"sample" + 0.003*"estimate" + 0.003*"kernel" + 0.003*"noise" + 0.003*"component" + 2022-04-22 17:43:44,822 : INFO : topic diff=0.235399, rho=0.447214 + 2022-04-22 17:43:44,830 : INFO : PROGRESS: pass 4, at document #1740/1740 + 2022-04-22 17:43:50,907 : INFO : optimized alpha [0.042409703, 0.0423433, 0.04680129, 0.06358971, 0.049375836, 0.044652227, 0.0507185, 0.038540646, 0.06110631, 0.038821314] + 2022-04-22 17:43:50,913 : INFO : topic #7 (0.039): 0.011*"chip" + 0.008*"analog" + 0.008*"neuron" + 0.007*"circuit" + 0.007*"memory" + 0.006*"noise" + 0.006*"signal" + 0.005*"voltage" + 0.005*"layer" + 0.004*"vlsi" + 2022-04-22 17:43:50,914 : INFO : topic #9 (0.039): 0.021*"neuron" + 0.013*"spike" + 0.013*"cell" + 0.009*"stimulus" + 0.009*"signal" + 0.009*"response" + 0.007*"firing" + 0.006*"noise" + 0.004*"activity" + 0.004*"visual" + 2022-04-22 17:43:50,914 : INFO : topic #6 (0.051): 0.013*"image" + 0.009*"object" + 0.008*"visual" + 0.007*"motion" + 
0.007*"field" + 0.006*"cell" + 0.006*"direction" + 0.005*"filter" + 0.005*"response" + 0.004*"map" + 2022-04-22 17:43:50,914 : INFO : topic #8 (0.061): 0.006*"rule" + 0.005*"hidden" + 0.004*"generalization" + 0.004*"sequence" + 0.003*"net" + 0.003*"prediction" + 0.003*"hidden_unit" + 0.003*"bound" + 0.003*"machine" + 0.003*"tree" + 2022-04-22 17:43:50,914 : INFO : topic #3 (0.064): 0.007*"class" + 0.006*"image" + 0.005*"classifier" + 0.005*"classification" + 0.004*"gaussian" + 0.004*"sample" + 0.004*"estimate" + 0.003*"kernel" + 0.003*"density" + 0.003*"prior" + 2022-04-22 17:43:50,915 : INFO : topic diff=0.220905, rho=0.408248 + 2022-04-22 17:43:50,922 : INFO : PROGRESS: pass 5, at document #1740/1740 + 2022-04-22 17:43:57,459 : INFO : optimized alpha [0.04136415, 0.040443134, 0.04439863, 0.062082667, 0.048723623, 0.042787064, 0.048876576, 0.036657482, 0.058343116, 0.03701785] + 2022-04-22 17:43:57,465 : INFO : topic #7 (0.037): 0.012*"chip" + 0.009*"analog" + 0.008*"neuron" + 0.008*"circuit" + 0.007*"memory" + 0.006*"signal" + 0.006*"noise" + 0.006*"voltage" + 0.005*"vlsi" + 0.004*"layer" + 2022-04-22 17:43:57,465 : INFO : topic #9 (0.037): 0.022*"neuron" + 0.013*"spike" + 0.013*"cell" + 0.009*"stimulus" + 0.009*"signal" + 0.009*"response" + 0.008*"firing" + 0.006*"noise" + 0.004*"activity" + 0.004*"channel" + 2022-04-22 17:43:57,466 : INFO : topic #6 (0.049): 0.015*"image" + 0.010*"object" + 0.009*"visual" + 0.007*"motion" + 0.007*"field" + 0.006*"direction" + 0.006*"cell" + 0.005*"filter" + 0.005*"map" + 0.005*"response" + 2022-04-22 17:43:57,466 : INFO : topic #8 (0.058): 0.006*"rule" + 0.005*"hidden" + 0.004*"generalization" + 0.004*"sequence" + 0.003*"net" + 0.003*"hidden_unit" + 0.003*"prediction" + 0.003*"bound" + 0.003*"machine" + 0.003*"tree" + 2022-04-22 17:43:57,466 : INFO : topic #3 (0.062): 0.007*"class" + 0.006*"image" + 0.005*"classifier" + 0.005*"classification" + 0.004*"gaussian" + 0.004*"sample" + 0.004*"estimate" + 0.004*"density" + 
0.004*"prior" + 0.003*"bayesian" + 2022-04-22 17:43:57,467 : INFO : topic diff=0.210451, rho=0.377964 + 2022-04-22 17:43:57,477 : INFO : PROGRESS: pass 6, at document #1740/1740 + 2022-04-22 17:44:02,657 : INFO : optimized alpha [0.040722344, 0.039083496, 0.04264549, 0.061218463, 0.048731733, 0.041630186, 0.047772773, 0.03532755, 0.0563227, 0.03579225] + 2022-04-22 17:44:02,663 : INFO : topic #7 (0.035): 0.012*"chip" + 0.010*"analog" + 0.009*"circuit" + 0.009*"neuron" + 0.007*"memory" + 0.007*"signal" + 0.006*"voltage" + 0.006*"noise" + 0.005*"vlsi" + 0.004*"implementation" + 2022-04-22 17:44:02,663 : INFO : topic #9 (0.036): 0.022*"neuron" + 0.014*"spike" + 0.013*"cell" + 0.010*"stimulus" + 0.009*"signal" + 0.009*"response" + 0.008*"firing" + 0.006*"noise" + 0.005*"channel" + 0.005*"activity" + 2022-04-22 17:44:02,664 : INFO : topic #4 (0.049): 0.008*"matrix" + 0.006*"gradient" + 0.005*"solution" + 0.004*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.003*"optimization" + 0.003*"neuron" + 0.003*"eq" + 2022-04-22 17:44:02,664 : INFO : topic #8 (0.056): 0.007*"rule" + 0.005*"hidden" + 0.004*"generalization" + 0.004*"sequence" + 0.004*"net" + 0.003*"hidden_unit" + 0.003*"prediction" + 0.003*"tree" + 0.003*"machine" + 0.003*"bound" + 2022-04-22 17:44:02,664 : INFO : topic #3 (0.061): 0.007*"class" + 0.005*"classifier" + 0.005*"image" + 0.005*"gaussian" + 0.005*"classification" + 0.004*"sample" + 0.004*"estimate" + 0.004*"density" + 0.004*"prior" + 0.004*"bayesian" + 2022-04-22 17:44:02,664 : INFO : topic diff=0.201353, rho=0.353553 + 2022-04-22 17:44:02,673 : INFO : PROGRESS: pass 7, at document #1740/1740 + 2022-04-22 17:44:08,716 : INFO : optimized alpha [0.040365368, 0.038083963, 0.041339714, 0.06076524, 0.04909782, 0.040898465, 0.047129765, 0.034341704, 0.054831598, 0.034885667] + 2022-04-22 17:44:08,722 : INFO : topic #7 (0.034): 0.013*"chip" + 0.010*"circuit" + 0.010*"analog" + 0.009*"neuron" + 0.007*"memory" + 0.007*"signal" + 
0.007*"voltage" + 0.006*"noise" + 0.005*"vlsi" + 0.005*"implementation" + 2022-04-22 17:44:08,723 : INFO : topic #9 (0.035): 0.022*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.010*"stimulus" + 0.010*"signal" + 0.010*"response" + 0.008*"firing" + 0.006*"noise" + 0.005*"channel" + 0.005*"activity" + 2022-04-22 17:44:08,723 : INFO : topic #4 (0.049): 0.009*"matrix" + 0.006*"gradient" + 0.005*"solution" + 0.004*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.003*"optimization" + 0.003*"eq" + 0.003*"neuron" + 2022-04-22 17:44:08,723 : INFO : topic #8 (0.055): 0.007*"rule" + 0.005*"hidden" + 0.005*"generalization" + 0.004*"sequence" + 0.004*"hidden_unit" + 0.004*"net" + 0.003*"prediction" + 0.003*"tree" + 0.003*"machine" + 0.003*"bound" + 2022-04-22 17:44:08,723 : INFO : topic #3 (0.061): 0.007*"class" + 0.005*"classifier" + 0.005*"gaussian" + 0.005*"classification" + 0.005*"sample" + 0.005*"image" + 0.004*"estimate" + 0.004*"density" + 0.004*"prior" + 0.004*"bayesian" + 2022-04-22 17:44:08,724 : INFO : topic diff=0.192330, rho=0.333333 + 2022-04-22 17:44:08,732 : INFO : PROGRESS: pass 8, at document #1740/1740 + 2022-04-22 17:44:13,585 : INFO : optimized alpha [0.040182494, 0.037441313, 0.04036209, 0.060601927, 0.049758103, 0.04055522, 0.046829112, 0.03359148, 0.053864058, 0.03418947] + 2022-04-22 17:44:13,591 : INFO : topic #7 (0.034): 0.013*"chip" + 0.011*"circuit" + 0.011*"analog" + 0.009*"neuron" + 0.007*"signal" + 0.007*"memory" + 0.007*"voltage" + 0.006*"vlsi" + 0.006*"noise" + 0.005*"implementation" + 2022-04-22 17:44:13,592 : INFO : topic #9 (0.034): 0.022*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.010*"stimulus" + 0.010*"signal" + 0.010*"response" + 0.008*"firing" + 0.006*"noise" + 0.005*"channel" + 0.005*"frequency" + 2022-04-22 17:44:13,592 : INFO : topic #4 (0.050): 0.009*"matrix" + 0.006*"gradient" + 0.005*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.003*"optimization" + 0.003*"eq" + 
0.003*"descent" + 2022-04-22 17:44:13,592 : INFO : topic #8 (0.054): 0.007*"rule" + 0.006*"hidden" + 0.005*"generalization" + 0.004*"sequence" + 0.004*"hidden_unit" + 0.004*"net" + 0.003*"prediction" + 0.003*"tree" + 0.003*"machine" + 0.003*"trained" + 2022-04-22 17:44:13,592 : INFO : topic #3 (0.061): 0.007*"class" + 0.005*"classifier" + 0.005*"gaussian" + 0.005*"sample" + 0.005*"classification" + 0.004*"estimate" + 0.004*"density" + 0.004*"image" + 0.004*"prior" + 0.004*"bayesian" + 2022-04-22 17:44:13,593 : INFO : topic diff=0.182985, rho=0.316228 + 2022-04-22 17:44:13,601 : INFO : PROGRESS: pass 9, at document #1740/1740 + 2022-04-22 17:44:19,306 : INFO : optimized alpha [0.040097952, 0.036957335, 0.039702885, 0.060680483, 0.050588053, 0.040437363, 0.046769954, 0.033025023, 0.053330485, 0.033663847] + 2022-04-22 17:44:19,312 : INFO : topic #7 (0.033): 0.013*"chip" + 0.012*"circuit" + 0.011*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.007*"memory" + 0.007*"voltage" + 0.006*"vlsi" + 0.005*"noise" + 0.005*"implementation" + 2022-04-22 17:44:19,312 : INFO : topic #9 (0.034): 0.022*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.010*"stimulus" + 0.010*"signal" + 0.010*"response" + 0.008*"firing" + 0.007*"noise" + 0.006*"channel" + 0.005*"frequency" + 2022-04-22 17:44:19,313 : INFO : topic #4 (0.051): 0.009*"matrix" + 0.006*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.003*"optimization" + 0.003*"descent" + 2022-04-22 17:44:19,313 : INFO : topic #8 (0.053): 0.008*"rule" + 0.006*"hidden" + 0.005*"generalization" + 0.004*"sequence" + 0.004*"hidden_unit" + 0.004*"net" + 0.004*"prediction" + 0.003*"tree" + 0.003*"machine" + 0.003*"trained" + 2022-04-22 17:44:19,313 : INFO : topic #3 (0.061): 0.007*"class" + 0.005*"gaussian" + 0.005*"classifier" + 0.005*"sample" + 0.005*"classification" + 0.005*"estimate" + 0.004*"density" + 0.004*"prior" + 0.004*"bayesian" + 0.004*"mixture" + 2022-04-22 17:44:19,313 
: INFO : topic diff=0.173278, rho=0.301511 + 2022-04-22 17:44:19,321 : INFO : PROGRESS: pass 10, at document #1740/1740 + 2022-04-22 17:44:23,819 : INFO : optimized alpha [0.040098477, 0.036638554, 0.03923829, 0.060877353, 0.051485594, 0.04045682, 0.04686068, 0.032584008, 0.05302629, 0.03327818] + 2022-04-22 17:44:23,825 : INFO : topic #7 (0.033): 0.013*"chip" + 0.012*"circuit" + 0.011*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.007*"memory" + 0.007*"voltage" + 0.006*"vlsi" + 0.005*"noise" + 0.005*"implementation" + 2022-04-22 17:44:23,825 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.011*"stimulus" + 0.010*"signal" + 0.010*"response" + 0.008*"firing" + 0.007*"noise" + 0.006*"channel" + 0.006*"frequency" + 2022-04-22 17:44:23,826 : INFO : topic #4 (0.051): 0.009*"matrix" + 0.006*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.004*"optimization" + 0.003*"descent" + 2022-04-22 17:44:23,826 : INFO : topic #8 (0.053): 0.008*"rule" + 0.006*"hidden" + 0.005*"generalization" + 0.004*"hidden_unit" + 0.004*"sequence" + 0.004*"net" + 0.004*"prediction" + 0.004*"tree" + 0.003*"machine" + 0.003*"trained" + 2022-04-22 17:44:23,826 : INFO : topic #3 (0.061): 0.007*"class" + 0.006*"gaussian" + 0.005*"classifier" + 0.005*"sample" + 0.005*"estimate" + 0.005*"classification" + 0.004*"density" + 0.004*"prior" + 0.004*"mixture" + 0.004*"bayesian" + 2022-04-22 17:44:23,827 : INFO : topic diff=0.163348, rho=0.288675 + 2022-04-22 17:44:23,834 : INFO : PROGRESS: pass 11, at document #1740/1740 + 2022-04-22 17:44:29,135 : INFO : optimized alpha [0.040188633, 0.03646946, 0.038880475, 0.06112813, 0.05245481, 0.04061286, 0.047049697, 0.03229136, 0.05290524, 0.03296597] + 2022-04-22 17:44:29,141 : INFO : topic #7 (0.032): 0.013*"chip" + 0.013*"circuit" + 0.011*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.007*"memory" + 0.007*"voltage" + 0.006*"vlsi" + 0.005*"noise" + 
0.005*"implementation" + 2022-04-22 17:44:29,141 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.011*"signal" + 0.011*"stimulus" + 0.010*"response" + 0.008*"firing" + 0.007*"noise" + 0.006*"frequency" + 0.006*"channel" + 2022-04-22 17:44:29,142 : INFO : topic #4 (0.052): 0.009*"matrix" + 0.006*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.004*"optimization" + 0.003*"optimal" + 2022-04-22 17:44:29,142 : INFO : topic #8 (0.053): 0.008*"rule" + 0.006*"hidden" + 0.005*"generalization" + 0.004*"hidden_unit" + 0.004*"sequence" + 0.004*"prediction" + 0.004*"net" + 0.004*"tree" + 0.003*"machine" + 0.003*"trained" + 2022-04-22 17:44:29,142 : INFO : topic #3 (0.061): 0.007*"class" + 0.006*"gaussian" + 0.005*"classifier" + 0.005*"sample" + 0.005*"estimate" + 0.005*"classification" + 0.005*"density" + 0.004*"prior" + 0.004*"mixture" + 0.004*"bayesian" + 2022-04-22 17:44:29,142 : INFO : topic diff=0.153485, rho=0.277350 + 2022-04-22 17:44:29,150 : INFO : PROGRESS: pass 12, at document #1740/1740 + 2022-04-22 17:44:33,545 : INFO : optimized alpha [0.04036388, 0.03635188, 0.038611963, 0.061483774, 0.05345723, 0.040894084, 0.04736741, 0.03211178, 0.05297828, 0.03274891] + 2022-04-22 17:44:33,551 : INFO : topic #7 (0.032): 0.013*"circuit" + 0.013*"chip" + 0.011*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.007*"voltage" + 0.007*"memory" + 0.006*"vlsi" + 0.005*"implementation" + 0.005*"noise" + 2022-04-22 17:44:33,552 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.011*"signal" + 0.011*"stimulus" + 0.011*"response" + 0.009*"firing" + 0.007*"noise" + 0.006*"frequency" + 0.006*"channel" + 2022-04-22 17:44:33,552 : INFO : topic #8 (0.053): 0.008*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.004*"hidden_unit" + 0.004*"sequence" + 0.004*"prediction" + 0.004*"net" + 0.004*"tree" + 0.003*"machine" + 0.003*"trained" + 2022-04-22 17:44:33,552 : INFO : 
topic #4 (0.053): 0.009*"matrix" + 0.006*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.003*"optimization" + 0.003*"optimal" + 2022-04-22 17:44:33,552 : INFO : topic #3 (0.061): 0.007*"class" + 0.006*"gaussian" + 0.005*"sample" + 0.005*"classifier" + 0.005*"estimate" + 0.005*"density" + 0.005*"classification" + 0.004*"prior" + 0.004*"mixture" + 0.004*"bayesian" + 2022-04-22 17:44:33,553 : INFO : topic diff=0.143831, rho=0.267261 + 2022-04-22 17:44:33,562 : INFO : PROGRESS: pass 13, at document #1740/1740 + 2022-04-22 17:44:39,235 : INFO : optimized alpha [0.040587135, 0.03631959, 0.03839379, 0.061911535, 0.05453887, 0.041285977, 0.047773384, 0.032027513, 0.05315258, 0.03261802] + 2022-04-22 17:44:39,246 : INFO : topic #7 (0.032): 0.014*"circuit" + 0.013*"chip" + 0.011*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.007*"memory" + 0.006*"vlsi" + 0.005*"implementation" + 0.005*"noise" + 2022-04-22 17:44:39,246 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.011*"signal" + 0.011*"stimulus" + 0.011*"response" + 0.009*"firing" + 0.007*"noise" + 0.007*"frequency" + 0.006*"channel" + 2022-04-22 17:44:39,247 : INFO : topic #8 (0.053): 0.008*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.004*"hidden_unit" + 0.004*"sequence" + 0.004*"prediction" + 0.004*"net" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:44:39,247 : INFO : topic #4 (0.055): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.003*"optimization" + 0.003*"optimal" + 2022-04-22 17:44:39,247 : INFO : topic #3 (0.062): 0.007*"class" + 0.006*"gaussian" + 0.005*"sample" + 0.005*"classifier" + 0.005*"estimate" + 0.005*"density" + 0.005*"classification" + 0.004*"prior" + 0.004*"mixture" + 0.004*"bayesian" + 2022-04-22 17:44:39,248 : INFO : topic diff=0.134602, rho=0.258199 + 2022-04-22 
17:44:39,258 : INFO : PROGRESS: pass 14, at document #1740/1740 + 2022-04-22 17:44:46,319 : INFO : optimized alpha [0.040821876, 0.036360793, 0.03824259, 0.062456302, 0.055688635, 0.041737743, 0.048259463, 0.032020763, 0.05343126, 0.03254091] + 2022-04-22 17:44:46,325 : INFO : topic #7 (0.032): 0.014*"circuit" + 0.013*"chip" + 0.012*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.007*"memory" + 0.006*"vlsi" + 0.005*"implementation" + 0.005*"noise" + 2022-04-22 17:44:46,326 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.011*"signal" + 0.011*"response" + 0.011*"stimulus" + 0.009*"firing" + 0.007*"noise" + 0.007*"frequency" + 0.006*"channel" + 2022-04-22 17:44:46,327 : INFO : topic #8 (0.053): 0.008*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.004*"hidden_unit" + 0.004*"sequence" + 0.004*"prediction" + 0.004*"net" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:44:46,327 : INFO : topic #4 (0.056): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.004*"optimal" + 0.003*"optimization" + 2022-04-22 17:44:46,327 : INFO : topic #3 (0.062): 0.007*"class" + 0.006*"gaussian" + 0.005*"sample" + 0.005*"classifier" + 0.005*"estimate" + 0.005*"density" + 0.004*"mixture" + 0.004*"classification" + 0.004*"prior" + 0.004*"bayesian" + 2022-04-22 17:44:46,328 : INFO : topic diff=0.125871, rho=0.250000 + 2022-04-22 17:44:46,338 : INFO : PROGRESS: pass 15, at document #1740/1740 + 2022-04-22 17:44:53,655 : INFO : optimized alpha [0.04109236, 0.036467522, 0.0381424, 0.06306473, 0.056903645, 0.04227092, 0.04874864, 0.032058466, 0.053792715, 0.03251973] + 2022-04-22 17:44:53,666 : INFO : topic #7 (0.032): 0.014*"circuit" + 0.013*"chip" + 0.012*"analog" + 0.010*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.007*"memory" + 0.006*"vlsi" + 0.005*"implementation" + 0.005*"bit" + 2022-04-22 17:44:53,666 : INFO : topic #9 (0.033): 
0.021*"neuron" + 0.014*"spike" + 0.014*"cell" + 0.011*"signal" + 0.011*"response" + 0.011*"stimulus" + 0.009*"firing" + 0.007*"frequency" + 0.007*"noise" + 0.006*"channel" + 2022-04-22 17:44:53,667 : INFO : topic #8 (0.054): 0.008*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.004*"hidden_unit" + 0.004*"prediction" + 0.004*"sequence" + 0.004*"net" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:44:53,667 : INFO : topic #4 (0.057): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.004*"optimal" + 0.003*"optimization" + 2022-04-22 17:44:53,667 : INFO : topic #3 (0.063): 0.007*"class" + 0.006*"gaussian" + 0.005*"sample" + 0.005*"estimate" + 0.005*"classifier" + 0.005*"density" + 0.005*"mixture" + 0.005*"prior" + 0.004*"classification" + 0.004*"bayesian" + 2022-04-22 17:44:53,667 : INFO : topic diff=0.117670, rho=0.242536 + 2022-04-22 17:44:53,679 : INFO : PROGRESS: pass 16, at document #1740/1740 + 2022-04-22 17:45:00,393 : INFO : optimized alpha [0.041376065, 0.03660367, 0.0380804, 0.06374838, 0.058118302, 0.0428449, 0.049285352, 0.03212048, 0.054208644, 0.032528903] + 2022-04-22 17:45:00,403 : INFO : topic #7 (0.032): 0.014*"circuit" + 0.013*"chip" + 0.012*"analog" + 0.011*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.008*"memory" + 0.006*"vlsi" + 0.006*"implementation" + 0.005*"bit" + 2022-04-22 17:45:00,403 : INFO : topic #9 (0.033): 0.021*"neuron" + 0.014*"cell" + 0.014*"spike" + 0.012*"signal" + 0.011*"response" + 0.011*"stimulus" + 0.009*"firing" + 0.007*"frequency" + 0.007*"noise" + 0.007*"channel" + 2022-04-22 17:45:00,404 : INFO : topic #8 (0.054): 0.008*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.004*"hidden_unit" + 0.004*"prediction" + 0.004*"sequence" + 0.004*"net" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:45:00,404 : INFO : topic #4 (0.058): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 
0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.004*"optimal" + 0.003*"optimization" + 2022-04-22 17:45:00,404 : INFO : topic #3 (0.064): 0.007*"class" + 0.006*"gaussian" + 0.006*"sample" + 0.005*"estimate" + 0.005*"classifier" + 0.005*"density" + 0.005*"mixture" + 0.005*"prior" + 0.004*"classification" + 0.004*"bayesian" + 2022-04-22 17:45:00,405 : INFO : topic diff=0.109988, rho=0.235702 + 2022-04-22 17:45:00,416 : INFO : PROGRESS: pass 17, at document #1740/1740 + 2022-04-22 17:45:09,386 : INFO : optimized alpha [0.041690826, 0.036777373, 0.038074017, 0.06447209, 0.059317604, 0.043464534, 0.04985148, 0.032209247, 0.05470903, 0.032565065] + 2022-04-22 17:45:09,400 : INFO : topic #7 (0.032): 0.014*"circuit" + 0.013*"chip" + 0.012*"analog" + 0.011*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.008*"memory" + 0.006*"vlsi" + 0.006*"implementation" + 0.005*"bit" + 2022-04-22 17:45:09,400 : INFO : topic #9 (0.033): 0.020*"neuron" + 0.014*"cell" + 0.014*"spike" + 0.012*"signal" + 0.011*"response" + 0.011*"stimulus" + 0.009*"firing" + 0.007*"frequency" + 0.007*"noise" + 0.007*"channel" + 2022-04-22 17:45:09,401 : INFO : topic #8 (0.055): 0.008*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.004*"hidden_unit" + 0.004*"prediction" + 0.004*"sequence" + 0.004*"net" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:45:09,401 : INFO : topic #4 (0.059): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"let" + 0.004*"minimum" + 0.004*"eq" + 0.004*"optimal" + 0.003*"optimization" + 2022-04-22 17:45:09,402 : INFO : topic #3 (0.064): 0.007*"class" + 0.006*"gaussian" + 0.006*"sample" + 0.005*"estimate" + 0.005*"classifier" + 0.005*"density" + 0.005*"mixture" + 0.005*"prior" + 0.004*"classification" + 0.004*"bayesian" + 2022-04-22 17:45:09,402 : INFO : topic diff=0.102916, rho=0.229416 + 2022-04-22 17:45:09,423 : INFO : PROGRESS: pass 18, at document #1740/1740 + 
2022-04-22 17:45:19,067 : INFO : optimized alpha [0.042022552, 0.037017036, 0.038090236, 0.06523256, 0.06052085, 0.044076443, 0.050475497, 0.03232651, 0.055261094, 0.032642473] + 2022-04-22 17:45:19,077 : INFO : topic #7 (0.032): 0.015*"circuit" + 0.013*"chip" + 0.012*"analog" + 0.011*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.008*"memory" + 0.006*"vlsi" + 0.006*"implementation" + 0.005*"bit" + 2022-04-22 17:45:19,077 : INFO : topic #9 (0.033): 0.020*"neuron" + 0.014*"cell" + 0.014*"spike" + 0.012*"signal" + 0.011*"response" + 0.011*"stimulus" + 0.009*"firing" + 0.008*"frequency" + 0.007*"noise" + 0.007*"channel" + 2022-04-22 17:45:19,078 : INFO : topic #8 (0.055): 0.009*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.005*"hidden_unit" + 0.004*"prediction" + 0.004*"net" + 0.004*"sequence" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:45:19,078 : INFO : topic #4 (0.061): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"minimum" + 0.004*"let" + 0.004*"eq" + 0.004*"optimal" + 0.003*"optimization" + 2022-04-22 17:45:19,078 : INFO : topic #3 (0.065): 0.007*"class" + 0.007*"gaussian" + 0.006*"sample" + 0.005*"estimate" + 0.005*"density" + 0.005*"classifier" + 0.005*"mixture" + 0.005*"prior" + 0.004*"classification" + 0.004*"likelihood" + 2022-04-22 17:45:19,079 : INFO : topic diff=0.096362, rho=0.223607 + 2022-04-22 17:45:19,090 : INFO : PROGRESS: pass 19, at document #1740/1740 + 2022-04-22 17:45:26,202 : INFO : optimized alpha [0.042380035, 0.037280142, 0.03813037, 0.06597655, 0.0617652, 0.044686105, 0.051100377, 0.032451425, 0.05581024, 0.03274816] + 2022-04-22 17:45:26,210 : INFO : topic #7 (0.032): 0.015*"circuit" + 0.013*"chip" + 0.012*"analog" + 0.011*"neuron" + 0.008*"signal" + 0.008*"voltage" + 0.008*"memory" + 0.006*"vlsi" + 0.006*"implementation" + 0.005*"bit" + 2022-04-22 17:45:26,210 : INFO : topic #9 (0.033): 0.020*"neuron" + 0.015*"cell" + 0.014*"spike" + 0.012*"signal" + 
0.011*"response" + 0.011*"stimulus" + 0.009*"firing" + 0.008*"frequency" + 0.007*"noise" + 0.007*"channel" + 2022-04-22 17:45:26,210 : INFO : topic #8 (0.056): 0.009*"rule" + 0.006*"hidden" + 0.006*"generalization" + 0.005*"hidden_unit" + 0.004*"prediction" + 0.004*"net" + 0.004*"sequence" + 0.004*"tree" + 0.004*"machine" + 0.003*"trained" + 2022-04-22 17:45:26,211 : INFO : topic #4 (0.062): 0.009*"matrix" + 0.007*"gradient" + 0.006*"solution" + 0.005*"convergence" + 0.004*"distance" + 0.004*"minimum" + 0.004*"let" + 0.004*"eq" + 0.004*"optimal" + 0.003*"energy" + 2022-04-22 17:45:26,211 : INFO : topic #3 (0.066): 0.007*"class" + 0.007*"gaussian" + 0.006*"sample" + 0.005*"estimate" + 0.005*"density" + 0.005*"mixture" + 0.005*"classifier" + 0.005*"prior" + 0.004*"likelihood" + 0.004*"bayesian" + 2022-04-22 17:45:26,211 : INFO : topic diff=0.090311, rho=0.218218 + 2022-04-22 17:45:26,222 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel in 141.08s', 'datetime': '2022-04-22T17:45:26.222157', 'gensim': '4.1.3.dev0', 'python': '3.9.7 (default, Sep 3 2021, 12:37:55) \n[Clang 12.0.5 (clang-1205.0.22.9)]', 'platform': 'macOS-11.6.5-x86_64-i386-64bit', 'event': 'created'} @@ -715,218 +673,218 @@ methods on the blog at http://rare-technologies.com/lda-training-tips/ ! .. code-block:: none - 2021-03-19 14:12:02,008 : INFO : CorpusAccumulator accumulated stats from 1000 documents - Average topic coherence: -1.1072. 
- [([(0.023360161, 'neuron'), - (0.013864572, 'cell'), - (0.0085508, 'spike'), - (0.007835109, 'response'), - (0.0077002184, 'synaptic'), - (0.006420619, 'firing'), - (0.0063291225, 'activity'), - (0.005894408, 'stimulus'), - (0.005635916, 'signal'), - (0.005319338, 'frequency'), - (0.0044079474, 'potential'), - (0.0042212, 'connection'), - (0.003969707, 'fig'), - (0.0038775448, 'phase'), - (0.0037467096, 'synapsis'), - (0.0035546266, 'channel'), - (0.0035464808, 'dynamic'), - (0.0035111816, 'memory'), - (0.003500412, 'simulation'), - (0.0033668294, 'temporal')], - -0.8843724877515563), - ([(0.007043698, 'gaussian'), - (0.0058810986, 'noise'), - (0.005357382, 'estimate'), - (0.005118217, 'likelihood'), - (0.004725707, 'approximation'), - (0.0047162576, 'prior'), - (0.004589121, 'bayesian'), - (0.0044163894, 'density'), - (0.004383228, 'mixture'), - (0.0043818722, 'variance'), - (0.004343727, 'matrix'), - (0.003920799, 'log'), - (0.0039041233, 'sample'), - (0.0038657538, 'posterior'), - (0.0038494268, 'hidden'), - (0.003747304, 'prediction'), - (0.0035524433, 'generalization'), - (0.003297515, 'em'), - (0.0031830291, 'optimal'), - (0.0029574349, 'estimation')], - -0.9201121458749306), - ([(0.013338742, 'visual'), - (0.011440194, 'cell'), - (0.010699649, 'field'), - (0.009350259, 'image'), - (0.008701173, 'motion'), - (0.008576538, 'map'), - (0.0077895345, 'direction'), - (0.0073878667, 'orientation'), - (0.006964441, 'eye'), - (0.0066007036, 'response'), - (0.0062312516, 'stimulus'), - (0.006194355, 'spatial'), - (0.0055934438, 'receptive'), - (0.005137706, 'receptive_field'), - (0.00512753, 'object'), - (0.004664231, 'layer'), - (0.0046304427, 'activity'), - (0.0045092506, 'position'), - (0.004168487, 'cortex'), - (0.0040872716, 'location')], - -0.9666086669197183), - ([(0.009677556, 'hidden'), - (0.008472348, 'image'), - (0.0066851787, 'character'), - (0.0064806826, 'layer'), - (0.005060741, 'hidden_unit'), - (0.004902215, 'recognition'), - (0.004825573, 'digit'), 
- (0.0043749292, 'attractor'), - (0.0043325345, 'energy'), - (0.00431843, 'dynamic'), - (0.0038877935, 'matrix'), - (0.003805258, 'net'), - (0.003757226, 'field'), - (0.0035065063, 'transformation'), - (0.0034933372, 'dimensional'), - (0.0034391459, 'distance'), - (0.0031490896, 'gradient'), - (0.0031419578, 'solution'), - (0.002954112, 'map'), - (0.0028736237, 'minimum')], - -1.011100924928429), - ([(0.010836434, 'circuit'), - (0.009359381, 'chip'), - (0.008903197, 'analog'), - (0.00655248, 'neuron'), - (0.006147317, 'threshold'), - (0.0050505013, 'image'), - (0.0048734145, 'bit'), - (0.0048433533, 'voltage'), - (0.004609887, 'memory'), - (0.004231914, 'vlsi'), - (0.0042090695, 'implementation'), - (0.004113957, 'net'), - (0.003907882, 'gate'), - (0.0038376434, 'layer'), - (0.0034949183, 'pp'), - (0.003291277, 'element'), - (0.0032199384, 'node'), - (0.0030992834, 'signal'), - (0.0029631325, 'design'), - (0.0028471586, 'processor')], - -1.0450720584710176), - ([(0.008781833, 'hidden'), - (0.008109003, 'net'), - (0.0069496827, 'layer'), - (0.006155399, 'rule'), - (0.005891262, 'node'), - (0.0051560537, 'hidden_unit'), - (0.0041502067, 'architecture'), - (0.0041317134, 'activation'), - (0.0041251457, 'sequence'), - (0.0040346556, 'propagation'), - (0.0036248995, 'back'), - (0.0035959794, 'recurrent'), - (0.0031377305, 'class'), - (0.0030542722, 'trained'), - (0.0030384492, 'code'), - (0.002923781, 'expert'), - (0.0028879363, 'string'), - (0.0027964872, 'learn'), - (0.0027678378, 'table'), - (0.0027654031, 'connection')], - -1.122278491657109), - ([(0.014161764, 'recognition'), - (0.011104057, 'speech'), - (0.010318562, 'word'), - (0.010277273, 'image'), - (0.00809512, 'object'), - (0.0063050594, 'signal'), - (0.0053472514, 'layer'), - (0.005024713, 'classification'), - (0.0050242324, 'face'), - (0.004580911, 'trained'), - (0.004409548, 'human'), - (0.0043301815, 'context'), - (0.0042581595, 'frame'), - (0.0040203724, 'hidden'), - (0.004008649, 'speaker'), - 
(0.0035841789, 'class'), - (0.0033736168, 'sequence'), - (0.0032663026, 'hmm'), - (0.0032505158, 'architecture'), - (0.0031761383, 'view')], - -1.1844643136695376), - ([(0.0071913837, 'matrix'), - (0.006639144, 'gradient'), - (0.0058832015, 'kernel'), - (0.0058791665, 'component'), - (0.0047264574, 'class'), - (0.0042780563, 'density'), - (0.004226884, 'xi'), - (0.004164046, 'convergence'), - (0.0041592806, 'source'), - (0.0040763966, 'loss'), - (0.00392406, 'basis'), - (0.0036241056, 'regression'), - (0.0035536229, 'approximation'), - (0.0033525354, 'independent'), - (0.0032649476, 'bound'), - (0.0031867179, 'mixture'), - (0.0031306876, 'let'), - (0.0030615225, 'signal'), - (0.0030061873, 'support'), - (0.0029361995, 'pca')], - -1.2550214906161075), - ([(0.012204602, 'tree'), - (0.010181904, 'node'), - (0.010171177, 'class'), - (0.007966109, 'classifier'), - (0.0075656017, 'decision'), - (0.005655141, 'rule'), - (0.0056041405, 'classification'), - (0.0054354756, 'sample'), - (0.0050921105, 'distance'), - (0.0046420856, 'bound'), - (0.0035473844, 'let'), - (0.0032015098, 'measure'), - (0.0031701634, 'cluster'), - (0.0030615227, 'clustering'), - (0.0030600468, 'graph'), - (0.003044858, 'neighbor'), - (0.0030077181, 'nearest'), - (0.0029182513, 'call'), - (0.0027482447, 'machine'), - (0.0027105191, 'hypothesis')], - -1.2831209969858721), - ([(0.016391048, 'control'), - (0.013031393, 'action'), - (0.009197483, 'policy'), - (0.008487638, 'reinforcement'), - (0.0068111503, 'controller'), - (0.0067618974, 'dynamic'), - (0.006282514, 'robot'), - (0.0061591244, 'optimal'), - (0.005933612, 'trajectory'), - (0.00556125, 'reinforcement_learning'), - (0.004895806, 'environment'), - (0.0044026882, 'goal'), - (0.0042024464, 'reward'), - (0.0037804258, 'position'), - (0.0037499247, 'arm'), - (0.003601292, 'motor'), - (0.0034139594, 'sutton'), - (0.0031908047, 'movement'), - (0.003142896, 'td'), - (0.0031323545, 'trial')], - -1.4003243935908478)] + 2022-04-22 17:45:28,224 : INFO : 
CorpusAccumulator accumulated stats from 1000 documents + Average topic coherence: -1.2010. + [([(0.009335279, 'matrix'), + (0.006810243, 'gradient'), + (0.0058767716, 'solution'), + (0.0050566536, 'convergence'), + (0.0043554083, 'distance'), + (0.004101262, 'minimum'), + (0.0040506367, 'let'), + (0.0039807004, 'eq'), + (0.0038555989, 'optimal'), + (0.0034886731, 'energy'), + (0.0034828722, 'optimization'), + (0.0034504435, 'condition'), + (0.0033918922, 'approximation'), + (0.0033640305, 'descent'), + (0.0032366295, 'constraint'), + (0.0032220806, 'xi'), + (0.003061566, 'stochastic'), + (0.0029803582, 'component'), + (0.0028803074, 'dynamic'), + (0.00280652, 'graph')], + -1.0314809310847135), + ([(0.006758064, 'class'), + (0.006583767, 'gaussian'), + (0.005633773, 'sample'), + (0.0053001167, 'estimate'), + (0.0049426625, 'density'), + (0.0048573534, 'mixture'), + (0.004835742, 'classifier'), + (0.0046612574, 'prior'), + (0.004377199, 'likelihood'), + (0.004344127, 'bayesian'), + (0.0043293545, 'classification'), + (0.0037983125, 'regression'), + (0.0037747815, 'noise'), + (0.003772593, 'log'), + (0.0037171794, 'kernel'), + (0.003717116, 'approximation'), + (0.0037102823, 'variance'), + (0.0034671598, 'component'), + (0.0032801689, 'posterior'), + (0.003173915, 'em')], + -1.0736087121706135), + ([(0.02519838, 'image'), + (0.013268676, 'object'), + (0.011446378, 'visual'), + (0.009458303, 'field'), + (0.008084482, 'motion'), + (0.006914001, 'direction'), + (0.0060067754, 'map'), + (0.0055346545, 'position'), + (0.004941865, 'pixel'), + (0.004847295, 'spatial'), + (0.0047093197, 'face'), + (0.0046589067, 'eye'), + (0.0046168645, 'location'), + (0.0043804147, 'filter'), + (0.0042905244, 'response'), + (0.0041273055, 'view'), + (0.0040860246, 'orientation'), + (0.0038862277, 'receptive'), + (0.0038229467, 'human'), + (0.0038166828, 'recognition')], + -1.101159857337566), + ([(0.015339, 'layer'), + (0.014894987, 'node'), + (0.010977563, 'net'), + (0.0097472165, 
'hidden'), + (0.0075573265, 'threshold'), + (0.006544599, 'class'), + (0.006098466, 'bound'), + (0.005063979, 'activation'), + (0.0047261445, 'dimension'), + (0.0046081766, 'hidden_unit'), + (0.004463069, 'theorem'), + (0.0043413443, 'region'), + (0.0040992484, 'polynomial'), + (0.003927951, 'propagation'), + (0.003906715, 'hidden_layer'), + (0.003902104, 'back'), + (0.0034719643, 'let'), + (0.0034161368, 'bit'), + (0.0033824549, 'connection'), + (0.003204875, 'back_propagation')], + -1.1578264561349325), + ([(0.020037105, 'neuron'), + (0.01450755, 'cell'), + (0.014472483, 'spike'), + (0.011981914, 'signal'), + (0.011293252, 'response'), + (0.010934215, 'stimulus'), + (0.008777942, 'firing'), + (0.0077151447, 'frequency'), + (0.007196151, 'noise'), + (0.006772501, 'channel'), + (0.004612463, 'temporal'), + (0.0043820725, 'auditory'), + (0.0043365704, 'activity'), + (0.0040383274, 'sound'), + (0.004009629, 'potential'), + (0.0039981017, 'correlation'), + (0.0038944164, 'fig'), + (0.0036725644, 'train'), + (0.0034477867, 'firing_rate'), + (0.0033127973, 'source')], + -1.175461993278655), + ([(0.015848655, 'neuron'), + (0.015059427, 'cell'), + (0.009022958, 'activity'), + (0.008109199, 'connection'), + (0.008041161, 'synaptic'), + (0.0057249856, 'memory'), + (0.0053059673, 'cortex'), + (0.0050525647, 'dynamic'), + (0.0047387453, 'cortical'), + (0.004596282, 'simulation'), + (0.004441938, 'inhibitory'), + (0.004316362, 'phase'), + (0.004202166, 'response'), + (0.004129471, 'excitatory'), + (0.0041026585, 'attractor'), + (0.0036624784, 'synapsis'), + (0.003452054, 'fig'), + (0.003326298, 'interaction'), + (0.003292976, 'layer'), + (0.003188004, 'oscillator')], + -1.224961800422038), + ([(0.014448352, 'control'), + (0.011206106, 'action'), + (0.008610181, 'policy'), + (0.0073960284, 'reinforcement'), + (0.0071460134, 'dynamic'), + (0.006695718, 'trajectory'), + (0.006001844, 'optimal'), + (0.005919467, 'controller'), + (0.005142686, 'robot'), + (0.0049040187, 
'reinforcement_learning'), + (0.004231131, 'environment'), + (0.0038927419, 'reward'), + (0.0036765926, 'goal'), + (0.0032516345, 'forward'), + (0.0029738136, 'arm'), + (0.0029553284, 'adaptive'), + (0.0029314642, 'sutton'), + (0.0029179594, 'position'), + (0.0028270711, 'path'), + (0.002815493, 'motor')], + -1.280662748184417), + ([(0.01465422, 'circuit'), + (0.0134508265, 'chip'), + (0.012013224, 'analog'), + (0.010762642, 'neuron'), + (0.008197728, 'signal'), + (0.007833759, 'voltage'), + (0.0075949323, 'memory'), + (0.0062134205, 'vlsi'), + (0.005665418, 'implementation'), + (0.00510467, 'bit'), + (0.004741555, 'noise'), + (0.004108878, 'processor'), + (0.004068751, 'pulse'), + (0.00402028, 'digital'), + (0.003979967, 'design'), + (0.0037854807, 'hardware'), + (0.0036803125, 'transistor'), + (0.0036066298, 'block'), + (0.0035669305, 'device'), + (0.0035628842, 'synapse')], + -1.2836262379148498), + ([(0.016415589, 'recognition'), + (0.0136875985, 'speech'), + (0.01258169, 'word'), + (0.0104766805, 'hidden'), + (0.0063662766, 'layer'), + (0.0061339615, 'character'), + (0.0056002084, 'trained'), + (0.005490037, 'context'), + (0.0051139165, 'sequence'), + (0.004984547, 'architecture'), + (0.004967922, 'hmm'), + (0.004862166, 'speaker'), + (0.004366162, 'net'), + (0.0042531807, 'digit'), + (0.0039046167, 'classification'), + (0.0037942464, 'class'), + (0.0037750585, 'frame'), + (0.00358875, 'mixture'), + (0.003476494, 'phoneme'), + (0.0034512014, 'letter')], + -1.323380921633785), + ([(0.008542947, 'rule'), + (0.00631226, 'hidden'), + (0.00597873, 'generalization'), + (0.0045754625, 'hidden_unit'), + (0.0043068537, 'prediction'), + (0.0040594153, 'net'), + (0.003990005, 'sequence'), + (0.0038032297, 'tree'), + (0.0035338537, 'machine'), + (0.0034035398, 'trained'), + (0.003242104, 'recurrent'), + (0.0031919426, 'training_set'), + (0.0029770972, 'table'), + (0.0028571628, 'learn'), + (0.0028489903, 'language'), + (0.0028364619, 'target'), + (0.0026097689, 
'architecture'), + (0.0025739158, 'string'), + (0.0025172615, 'symbol'), + (0.0024356844, 'teacher')], + -1.3578438548773115)] @@ -959,9 +917,9 @@ References .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 2 minutes 47.007 seconds) + **Total running time of the script:** ( 4 minutes 13.971 seconds) -**Estimated memory usage:** 658 MB +**Estimated memory usage:** 664 MB .. _sphx_glr_download_auto_examples_tutorials_run_lda.py: diff --git a/docs/src/auto_examples/tutorials/sg_execution_times.rst b/docs/src/auto_examples/tutorials/sg_execution_times.rst index da986968c9..0dfaf2783f 100644 --- a/docs/src/auto_examples/tutorials/sg_execution_times.rst +++ b/docs/src/auto_examples/tutorials/sg_execution_times.rst @@ -5,18 +5,18 @@ Computation times ================= -**08:55.221** total execution time for **auto_examples_tutorials** files: +**04:13.971** total execution time for **auto_examples_tutorials** files: +-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 08:55.221 | 506.6 MB | +| :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` (``run_lda.py``) | 04:13.971 | 664.3 MB | +-------------------------------------------------------------------------------------+-----------+----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_annoy.py` (``run_annoy.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_doc2vec_lee.py` (``run_doc2vec_lee.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` (``run_fasttext.py``) | 00:00.000 | 0.0 MB | +| :ref:`sphx_glr_auto_examples_tutorials_run_ensemblelda.py` (``run_ensemblelda.py``) | 
00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+----------+ -| :ref:`sphx_glr_auto_examples_tutorials_run_lda.py` (``run_lda.py``) | 00:00.000 | 0.0 MB | +| :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py` (``run_fasttext.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+----------+ | :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` (``run_scm.py``) | 00:00.000 | 0.0 MB | +-------------------------------------------------------------------------------------+-----------+----------+ diff --git a/docs/src/check_gallery.py b/docs/src/check_gallery.py new file mode 100644 index 0000000000..d03726dabb --- /dev/null +++ b/docs/src/check_gallery.py @@ -0,0 +1,69 @@ +"""Check that the cached gallery files are up to date. + +If they are stale, then Sphinx will attempt to rebuild them from source. When +running the documentation build on CI, we want to avoid rebuilding the gallery, +because that takes too long. Instead, we use this script to warn the author of +the PR that they need to rebuild the docs themselves. +""" + +import hashlib +import os +import sys + + +def different(path1, path2): + with open(path1) as fin: + f1 = fin.read() + with open(path2) as fin: + f2 = fin.read() + return f1 != f2 + + +curr_dir = os.path.dirname(__file__) +stale = [] +for root, dirs, files in os.walk(os.path.join(curr_dir, 'gallery')): + for f in files: + if f.endswith('.py'): + source_path = os.path.join(root, f) + cache_path = source_path.replace('docs/src/gallery/', 'docs/src/auto_examples/') + + # + # We check two things: + # + # 1) Actual file content + # 2) MD5 checksums + # + # We check 1) because that's the part that matters to the user - + # it's what will appear in the documentation. We check 2) because + # that's what Sphinx Gallery relies on to decide what it needs to + # rebuild. 
In practice, only one of these checks is necessary, + # but we run them both because it's trivial. + # + if different(source_path, cache_path): + stale.append(cache_path) + continue + + actual_md5 = hashlib.md5() + with open(source_path, 'rb') as fin: + actual_md5.update(fin.read()) + + md5_path = cache_path + '.md5' + with open(md5_path) as fin: + expected_md5 = fin.read() + + if actual_md5.hexdigest() != expected_md5: + stale.append(cache_path) + +if stale: + print(f"""The gallery cache appears stale. + +Rebuild the documentation using the following commands from the gensim root subdirectory: + + pip install -e .[docs] + make -C docs/src html + +and then run `git add docs/src/auto_examples` to update the cache. + +Stale files: {stale} +""", file=sys.stderr) + sys.exit(1) diff --git a/docs/src/gallery/tutorials/run_lda.py b/docs/src/gallery/tutorials/run_lda.py index 2ec06a801c..7ee6b07cd2 100644 --- a/docs/src/gallery/tutorials/run_lda.py +++ b/docs/src/gallery/tutorials/run_lda.py @@ -245,7 +245,7 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' iterations = 400 eval_every = None # Don't evaluate model perplexity, takes too much time. -# Make a index to word dictionary. +# Make an index to word dictionary. temp = dictionary[0] # This is only to "load" the dictionary. id2word = dictionary.id2token @@ -278,7 +278,7 @@ def extract_documents(url='https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz' # methods on the blog at http://rare-technologies.com/lda-training-tips/ ! # -top_topics = model.top_topics(corpus) #, num_words=20) +top_topics = model.top_topics(corpus) # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics. 
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics From 93ed2fbe27dab41b459bd09bd5b19ed5d7e01908 Mon Sep 17 00:00:00 2001 From: ahaya3776 <83044786+ahaya3776@users.noreply.github.com> Date: Fri, 22 Apr 2022 20:53:39 +0900 Subject: [PATCH 68/81] Fix broken external link for LDA implementation (#3190) * Fix broken external link * Fix external link to be more permanent * update documentation * rebuild on Linux dev machine Co-authored-by: Michael Penkov --- .../core/run_corpora_and_vector_spaces.ipynb | 4 +- .../core/run_corpora_and_vector_spaces.py | 2 +- .../core/run_corpora_and_vector_spaces.py.md5 | 2 +- .../core/run_corpora_and_vector_spaces.rst | 82 +++++++++---------- .../auto_examples/core/sg_execution_times.rst | 4 +- docs/src/auto_examples/index.rst | 20 ++--- .../core/run_corpora_and_vector_spaces.py | 2 +- 7 files changed, 58 insertions(+), 58 deletions(-) diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb index 875db7b507..80606bfde4 100644 --- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb +++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb @@ -249,7 +249,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Other formats include `Joachim's SVMlight format `_,\n`Blei's LDA-C format `_ and\n`GibbsLDA++ format `_.\n\n" + "Other formats include `Joachim's SVMlight format `_,\n`Blei's LDA-C format `_ and\n`GibbsLDA++ format `_.\n\n" ] }, { @@ -424,7 +424,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.8.10" } }, "nbformat": 4, diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py index 983a9d1235..d02e7d3418 100644 --- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py +++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py @@ -222,7 
+222,7 @@ def __iter__(self): ############################################################################### # Other formats include `Joachim's SVMlight format `_, -# `Blei's LDA-C format `_ and +# `Blei's LDA-C format `_ and # `GibbsLDA++ format `_. corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus) diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 index 174fe2a139..860d4a2586 100644 --- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 +++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 @@ -1 +1 @@ -55a8a886f05e5005c5f66d57569ee79d \ No newline at end of file +986566c5996bfc214bd711c0d2cf54db \ No newline at end of file diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst index 3cc549dd65..f49b214562 100644 --- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst +++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst @@ -178,12 +178,12 @@ between the questions and ids is called a dictionary: .. code-block:: none - 2021-06-01 10:34:56,824 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2021-06-01 10:34:56,824 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions) - 2021-06-01 10:34:56,834 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) 
from 9 documents (total 29 corpus positions)", 'datetime': '2021-06-01T10:34:56.825003', 'gensim': '4.1.0.dev0', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-73-generic-x86_64-with-glibc2.29', 'event': 'created'} - 2021-06-01 10:34:56,834 : INFO : Dictionary lifecycle event {'fname_or_handle': '/tmp/deerwester.dict', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-06-01T10:34:56.834300', 'gensim': '4.1.0.dev0', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-73-generic-x86_64-with-glibc2.29', 'event': 'saving'} - 2021-06-01 10:34:56,834 : INFO : saved /tmp/deerwester.dict - Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) + 2022-04-22 19:16:03,056 : INFO : adding document #0 to Dictionary<0 unique tokens: []> + 2022-04-22 19:16:03,057 : INFO : built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions) + 2022-04-22 19:16:03,068 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> from 9 documents (total 29 corpus positions)", 'datetime': '2022-04-22T19:16:03.057201', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Mar 15 2022, 12:22:08) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-104-generic-x86_64-with-glibc2.29', 'event': 'created'} + 2022-04-22 19:16:03,069 : INFO : Dictionary lifecycle event {'fname_or_handle': '/tmp/deerwester.dict', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-04-22T19:16:03.069013', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Mar 15 2022, 12:22:08) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-104-generic-x86_64-with-glibc2.29', 'event': 'saving'} + 2022-04-22 19:16:03,069 : INFO : saved /tmp/deerwester.dict + Dictionary<12 unique tokens: ['computer', 'human', 
'interface', 'response', 'survey']...> @@ -273,11 +273,11 @@ therefore reads: in the document `"Human computer interaction"`, the words `comp .. code-block:: none - 2021-06-01 10:34:57,074 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm - 2021-06-01 10:34:57,075 : INFO : saving sparse matrix to /tmp/deerwester.mm - 2021-06-01 10:34:57,075 : INFO : PROGRESS: saving document #0 - 2021-06-01 10:34:57,076 : INFO : saved 9x12 matrix, density=25.926% (28/108) - 2021-06-01 10:34:57,076 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index + 2022-04-22 19:16:03,436 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm + 2022-04-22 19:16:03,446 : INFO : saving sparse matrix to /tmp/deerwester.mm + 2022-04-22 19:16:03,447 : INFO : PROGRESS: saving document #0 + 2022-04-22 19:16:03,449 : INFO : saved 9x12 matrix, density=25.926% (28/108) + 2022-04-22 19:16:03,449 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index [[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]] @@ -372,7 +372,7 @@ then convert the tokens via a dictionary to their ids and yield the resulting sp .. code-block:: none - <__main__.MyCorpus object at 0x7f389b5f8520> + <__main__.MyCorpus object at 0x7ff5d5552250> @@ -450,10 +450,10 @@ Similarly, to construct the dictionary without loading all texts into memory: .. code-block:: none - 2021-06-01 10:34:58,466 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2021-06-01 10:34:58,467 : INFO : built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) from 9 documents (total 69 corpus positions) - 2021-06-01 10:34:58,467 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) 
from 9 documents (total 69 corpus positions)", 'datetime': '2021-06-01T10:34:58.467454', 'gensim': '4.1.0.dev0', 'python': '3.8.5 (default, Jan 27 2021, 15:41:15) \n[GCC 9.3.0]', 'platform': 'Linux-5.4.0-73-generic-x86_64-with-glibc2.29', 'event': 'created'} - Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) + 2022-04-22 19:16:05,452 : INFO : adding document #0 to Dictionary<0 unique tokens: []> + 2022-04-22 19:16:05,455 : INFO : built Dictionary<42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...> from 9 documents (total 69 corpus positions) + 2022-04-22 19:16:05,455 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...> from 9 documents (total 69 corpus positions)", 'datetime': '2022-04-22T19:16:05.455728', 'gensim': '4.1.3.dev0', 'python': '3.8.10 (default, Mar 15 2022, 12:22:08) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-104-generic-x86_64-with-glibc2.29', 'event': 'created'} + Dictionary<12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...> @@ -502,11 +502,11 @@ create a toy corpus of 2 documents, as a plain Python list .. 
code-block:: none - 2021-06-01 10:34:58,603 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm - 2021-06-01 10:34:58,604 : INFO : saving sparse matrix to /tmp/corpus.mm - 2021-06-01 10:34:58,604 : INFO : PROGRESS: saving document #0 - 2021-06-01 10:34:58,604 : INFO : saved 2x2 matrix, density=25.000% (1/4) - 2021-06-01 10:34:58,604 : INFO : saving MmCorpus index to /tmp/corpus.mm.index + 2022-04-22 19:16:05,705 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm + 2022-04-22 19:16:05,708 : INFO : saving sparse matrix to /tmp/corpus.mm + 2022-04-22 19:16:05,708 : INFO : PROGRESS: saving document #0 + 2022-04-22 19:16:05,708 : INFO : saved 2x2 matrix, density=25.000% (1/4) + 2022-04-22 19:16:05,709 : INFO : saving MmCorpus index to /tmp/corpus.mm.index @@ -514,7 +514,7 @@ create a toy corpus of 2 documents, as a plain Python list .. GENERATED FROM PYTHON SOURCE LINES 224-227 Other formats include `Joachim's SVMlight format `_, -`Blei's LDA-C format `_ and +`Blei's LDA-C format `_ and `GibbsLDA++ format `_. .. GENERATED FROM PYTHON SOURCE LINES 227-233 @@ -537,16 +537,16 @@ Other formats include `Joachim's SVMlight format .. 
code-block:: none - 2021-06-01 10:34:58,653 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight - 2021-06-01 10:34:58,654 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index - 2021-06-01 10:34:58,654 : INFO : no word id mapping provided; initializing from corpus - 2021-06-01 10:34:58,654 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c - 2021-06-01 10:34:58,654 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab - 2021-06-01 10:34:58,654 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index - 2021-06-01 10:34:58,707 : INFO : no word id mapping provided; initializing from corpus - 2021-06-01 10:34:58,708 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low - 2021-06-01 10:34:58,708 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value - 2021-06-01 10:34:58,708 : INFO : saving LowCorpus index to /tmp/corpus.low.index + 2022-04-22 19:16:05,818 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight + 2022-04-22 19:16:05,820 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index + 2022-04-22 19:16:05,821 : INFO : no word id mapping provided; initializing from corpus + 2022-04-22 19:16:05,821 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c + 2022-04-22 19:16:05,821 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab + 2022-04-22 19:16:05,822 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index + 2022-04-22 19:16:05,934 : INFO : no word id mapping provided; initializing from corpus + 2022-04-22 19:16:05,936 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low + 2022-04-22 19:16:05,937 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value + 2022-04-22 19:16:05,937 : INFO : saving LowCorpus index to /tmp/corpus.low.index @@ -572,9 +572,9 @@ Conversely, to load a 
corpus iterator from a Matrix Market file: .. code-block:: none - 2021-06-01 10:34:58,756 : INFO : loaded corpus index from /tmp/corpus.mm.index - 2021-06-01 10:34:58,757 : INFO : initializing cython corpus reader from /tmp/corpus.mm - 2021-06-01 10:34:58,757 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries + 2022-04-22 19:16:06,046 : INFO : loaded corpus index from /tmp/corpus.mm.index + 2022-04-22 19:16:06,048 : INFO : initializing cython corpus reader from /tmp/corpus.mm + 2022-04-22 19:16:06,048 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries @@ -685,10 +685,10 @@ To save the same Matrix Market document stream in Blei's LDA-C format, .. code-block:: none - 2021-06-01 10:34:59,085 : INFO : no word id mapping provided; initializing from corpus - 2021-06-01 10:34:59,086 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c - 2021-06-01 10:34:59,087 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab - 2021-06-01 10:34:59,087 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index + 2022-04-22 19:16:06,823 : INFO : no word id mapping provided; initializing from corpus + 2022-04-22 19:16:06,825 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c + 2022-04-22 19:16:06,834 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab + 2022-04-22 19:16:06,835 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index @@ -785,9 +785,9 @@ Optimize converting between corpora and NumPy/SciPy arrays?), see the :ref:`apir .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 0 minutes 3.242 seconds) + **Total running time of the script:** ( 0 minutes 5.212 seconds) -**Estimated memory usage:** 48 MB +**Estimated memory usage:** 47 MB .. 
_sphx_glr_download_auto_examples_core_run_corpora_and_vector_spaces.py: diff --git a/docs/src/auto_examples/core/sg_execution_times.rst b/docs/src/auto_examples/core/sg_execution_times.rst index da5c34f485..e206b6d636 100644 --- a/docs/src/auto_examples/core/sg_execution_times.rst +++ b/docs/src/auto_examples/core/sg_execution_times.rst @@ -5,10 +5,10 @@ Computation times ================= -**00:03.242** total execution time for **auto_examples_core** files: +**00:05.212** total execution time for **auto_examples_core** files: +--------------------------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` (``run_corpora_and_vector_spaces.py``) | 00:03.242 | 48.2 MB | +| :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` (``run_corpora_and_vector_spaces.py``) | 00:05.212 | 47.2 MB | +--------------------------------------------------------------------------------------------------------------+-----------+---------+ | :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` (``run_core_concepts.py``) | 00:00.000 | 0.0 MB | +--------------------------------------------------------------------------------------------------------------+-----------+---------+ diff --git a/docs/src/auto_examples/index.rst b/docs/src/auto_examples/index.rst index a3626768f2..47819284b2 100644 --- a/docs/src/auto_examples/index.rst +++ b/docs/src/auto_examples/index.rst @@ -253,14 +253,14 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_scm_thumb.png - :alt: Soft Cosine Measure + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png + :alt: Word Mover's Distance - :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` + :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` .. raw:: html @@ -270,18 +270,18 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_scm + /auto_examples/tutorials/run_wmd .. raw:: html -
+
.. only:: html - .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_wmd_thumb.png - :alt: Word Mover's Distance + .. figure:: /auto_examples/tutorials/images/thumb/sphx_glr_run_scm_thumb.png + :alt: Soft Cosine Measure - :ref:`sphx_glr_auto_examples_tutorials_run_wmd.py` + :ref:`sphx_glr_auto_examples_tutorials_run_scm.py` .. raw:: html @@ -291,7 +291,7 @@ Learning-oriented lessons that introduce a particular gensim feature, e.g. a mod .. toctree:: :hidden: - /auto_examples/tutorials/run_wmd + /auto_examples/tutorials/run_scm .. raw:: html
diff --git a/docs/src/gallery/core/run_corpora_and_vector_spaces.py b/docs/src/gallery/core/run_corpora_and_vector_spaces.py index 983a9d1235..d02e7d3418 100644 --- a/docs/src/gallery/core/run_corpora_and_vector_spaces.py +++ b/docs/src/gallery/core/run_corpora_and_vector_spaces.py @@ -222,7 +222,7 @@ def __iter__(self): ############################################################################### # Other formats include `Joachim's SVMlight format `_, -# `Blei's LDA-C format `_ and +# `Blei's LDA-C format `_ and # `GibbsLDA++ format `_. corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus) From 533da755ec18a8e75c7e2d2012bb259fc4087429 Mon Sep 17 00:00:00 2001 From: code-review-doctor <72647856+code-review-doctor@users.noreply.github.com> Date: Sun, 24 Apr 2022 06:55:03 +0100 Subject: [PATCH 69/81] Fix issue probably-meant-fstring found at https://codereview.doctor (#3332) --- gensim/test/test_word2vec.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index c7b0da6b7d..7e58275208 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -580,9 +580,9 @@ def test_evaluate_word_pairs(self): pearson = correlation[0][0] spearman = correlation[1][0] oov = correlation[2] - self.assertTrue(0.1 < pearson < 1.0, "pearson {pearson} not between 0.1 & 1.0") - self.assertTrue(0.1 < spearman < 1.0, "spearman {spearman} not between 0.1 and 1.0") - self.assertTrue(0.0 <= oov < 90.0, "OOV {oov} not between 0.0 and 90.0") + self.assertTrue(0.1 < pearson < 1.0, f"pearson {pearson} not between 0.1 & 1.0") + self.assertTrue(0.1 < spearman < 1.0, f"spearman {spearman} not between 0.1 and 1.0") + self.assertTrue(0.0 <= oov < 90.0, f"OOV {oov} not between 0.0 and 90.0") def test_evaluate_word_pairs_from_file(self): """Test Spearman and Pearson correlation coefficients give sane results on similarity datasets""" From 995ad8a11bda79b2db8672dc729f1dd660083731 Mon Sep 17 
00:00:00 2001 From: Michael Penkov Date: Tue, 26 Apr 2022 20:41:37 +0900 Subject: [PATCH 70/81] update release/README.md --- release/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/README.md b/release/README.md index 1ec47db5dc..2614252723 100644 --- a/release/README.md +++ b/release/README.md @@ -1,3 +1,3 @@ Scripts to help when making new releases. -For more info, see [our Wiki page](https://github.com/RaRe-Technologies/gensim/wiki/Developer-page#make-a-new-release-for-maintainers). +For more info, see [our Wiki page](https://github.com/RaRe-Technologies/gensim/wiki/Maintainer-page). From 8483502f045210d25a7cf7f7d02193c899e9d014 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 26 Apr 2022 21:25:53 +0900 Subject: [PATCH 71/81] update changelog script --- release/generate_changelog.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/release/generate_changelog.py b/release/generate_changelog.py index 97cc306f62..5479101e36 100644 --- a/release/generate_changelog.py +++ b/release/generate_changelog.py @@ -6,28 +6,41 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """Generate changelog entries for all PRs merged since the last release.""" +import os import re import requests import time + +def throttle_get(*args, seconds=3, **kwargs): + result = requests.get(*args, **kwargs) + result.raise_for_status() + + # Avoid Github API throttling; see https://github.com/RaRe-Technologies/gensim/pull/3203#issuecomment-887453109 + time.sleep(seconds) + + return result + + # # The releases get sorted in reverse chronological order, so the first release # in the list is the most recent. 
# -get = requests.get('https://api.github.com/repos/RaRe-Technologies/gensim/releases') -get.raise_for_status() +get = throttle_get('https://api.github.com/repos/RaRe-Technologies/gensim/releases') most_recent_release = get.json()[0] release_timestamp = most_recent_release['published_at'] +throttle() + def iter_merged_prs(since=release_timestamp): page = 1 while True: - get = requests.get( + get = throttle_get( 'https://api.github.com/repos/RaRe-Technologies/gensim/pulls', params={'state': 'closed', 'page': page}, ) - get.raise_for_status() + pulls = get.json() if not pulls: break @@ -37,18 +50,15 @@ def iter_merged_prs(since=release_timestamp): yield pr page += 1 - # Avoid Github API throttling; see https://github.com/RaRe-Technologies/gensim/pull/3203#issuecomment-887453109 - time.sleep(1) def iter_closed_issues(since=release_timestamp): page = 1 while True: - get = requests.get( + get = throttle_get( 'https://api.github.com/repos/RaRe-Technologies/gensim/issues', params={'state': 'closed', 'page': page, 'since': since}, ) - get.raise_for_status() issues = get.json() if not issues: break @@ -60,9 +70,6 @@ def iter_closed_issues(since=release_timestamp): if 'pull_request' not in issue and issue['closed_at'] > since: yield issue page += 1 - # Avoid Github API throttling; see https://github.com/RaRe-Technologies/gensim/pull/3203#issuecomment-887453109 - time.sleep(1) - fixed_issue_numbers = set() for pr in iter_merged_prs(since=release_timestamp): @@ -74,6 +81,12 @@ def iter_closed_issues(since=release_timestamp): # Unfortunately, the GitHub API doesn't link PRs to issues that they fix, # so we have do it ourselves. # + if pr['body'] is None: + # + # Weird edge case, PR with no body + # + continue + for match in re.finditer(r'fix(es)? 
#(?P\d+)\b', pr['body'], flags=re.IGNORECASE): fixed_issue_numbers.add(int(match.group('number'))) From 742fb188dc6de03a42411510bf5b45e26574b328 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 26 Apr 2022 21:28:12 +0900 Subject: [PATCH 72/81] fixup --- release/generate_changelog.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/release/generate_changelog.py b/release/generate_changelog.py index 5479101e36..73c930fe05 100644 --- a/release/generate_changelog.py +++ b/release/generate_changelog.py @@ -6,7 +6,6 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """Generate changelog entries for all PRs merged since the last release.""" -import os import re import requests import time @@ -30,8 +29,6 @@ def throttle_get(*args, seconds=3, **kwargs): most_recent_release = get.json()[0] release_timestamp = most_recent_release['published_at'] -throttle() - def iter_merged_prs(since=release_timestamp): page = 1 @@ -71,6 +68,7 @@ def iter_closed_issues(since=release_timestamp): yield issue page += 1 + fixed_issue_numbers = set() for pr in iter_merged_prs(since=release_timestamp): pr['user_login'] = pr['user']['login'] From 2f09b7744f0c8c11f6a9e47c52ddf829fdd61ebf Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 26 Apr 2022 22:07:48 +0900 Subject: [PATCH 73/81] started work on CHANGELOG for new release --- CHANGELOG.md | 103 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 96 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a3be62510..25a1f6e41a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,14 +3,103 @@ Changes ## Unreleased -* [#3194](https://github.com/RaRe-Technologies/gensim/pull/3194): Added random_seed parameter to make LsiModel reproducible, by [@parashardhapola](https://github.com/parashardhapola) -* [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by 
[@menshikh-iv](https://github.com/menshikh-iv) +## 5.0.0, 2022-04-29 + +### :+1: New features + +* [#3317](https://github.com/RaRe-Technologies/gensim/pull/3317): Added `encoding` parameter to TextDirectoryCorpus, by [@Sandman-Ren](https://github.com/Sandman-Ren) +* [#3299](https://github.com/RaRe-Technologies/gensim/pull/3299): Enable test_word2vec_stand_alone_script by using sys.executable for python, by [@pabs3](https://github.com/pabs3) + +### :books: Tutorials and docs + +* [#3307](https://github.com/RaRe-Technologies/gensim/pull/3307): Documentation fixes, by [@piskvorky](https://github.com/piskvorky) +* [#3301](https://github.com/RaRe-Technologies/gensim/pull/3301): Remove unused Jupyter screenshots, by [@pabs3](https://github.com/pabs3) +* [#3289](https://github.com/RaRe-Technologies/gensim/pull/3289): Typos, text and code fix in LDA tutorial, by [@davebulaval](https://github.com/davebulaval) +* [#3284](https://github.com/RaRe-Technologies/gensim/pull/3284): Documentation fixes + added CITATION.cff, by [@piskvorky](https://github.com/piskvorky) +* [#3279](https://github.com/RaRe-Technologies/gensim/pull/3279): Add the FastSS and Levenshtein modules to docs, by [@piskvorky](https://github.com/piskvorky) +* [#3257](https://github.com/RaRe-Technologies/gensim/pull/3257): Dictionary doc: ref FAQ entry about filter_extremes corpus migration, by [@zacchiro](https://github.com/zacchiro) +* [#3235](https://github.com/RaRe-Technologies/gensim/pull/3235): Fix TFIDF docs, by [@piskvorky](https://github.com/piskvorky) * [#3227](https://github.com/RaRe-Technologies/gensim/pull/3227): Fix FastText doc-comment example for `build_vocab` and `train` to use correct argument names, by [@HLasse](https://github.com/HLasse) -* [#3247](https://github.com/RaRe-Technologies/gensim/pull/3247): Sparse2Corpus: update __getitem__ to work on slices, lists and ellipsis, by [@PrimozGodec](https://github.com/PrimozGodec) + +### :red_circle: Bug fixes + +* 
[#3332](https://github.com/RaRe-Technologies/gensim/pull/3332): Missing `f` prefix on f-strings fix, by [@code-review-doctor](https://github.com/code-review-doctor) +* [#3309](https://github.com/RaRe-Technologies/gensim/pull/3309): Respect encoding when reading binary keyed vectors, by [@alhoo](https://github.com/alhoo) +* [#3282](https://github.com/RaRe-Technologies/gensim/pull/3282): Fix `str()` method in WmdSimilarity, by [@DingQK](https://github.com/DingQK) +* [#3286](https://github.com/RaRe-Technologies/gensim/pull/3286): Fixes 'not enough arguments for format string' error, by [@gilbertfrancois](https://github.com/gilbertfrancois) * [#3250](https://github.com/RaRe-Technologies/gensim/pull/3250): Make negative ns_exponent work correctly, by [@menshikh-iv](https://github.com/menshikh-iv) -* [#3258](https://github.com/RaRe-Technologies/gensim/pull/3258): Adding another check to _check_corpus_sanity for compressed files, adding test, by [@dchaplinsky](https://github.com/dchaplinsky) + +### :warning: Removed functionality & deprecations + +### 🔮 Testing, CI, housekeeping + +* [#3329](https://github.com/RaRe-Technologies/gensim/pull/3329): Check gallery up to date as part of CI, by [@mpenkov](https://github.com/mpenkov) +* [#3318](https://github.com/RaRe-Technologies/gensim/pull/3318): Clean up evaluate_word_pairs code, by [@piskvorky](https://github.com/piskvorky) +* [#3308](https://github.com/RaRe-Technologies/gensim/pull/3308): get rid of tox, build things via github actions directly, by [@mpenkov](https://github.com/mpenkov) +* [#3303](https://github.com/RaRe-Technologies/gensim/pull/3303): add GitHub URL for PyPi, by [@andriyor](https://github.com/andriyor) +* [#3300](https://github.com/RaRe-Technologies/gensim/pull/3300): Fix code formatting for FT_CMD definition, by [@pabs3](https://github.com/pabs3) +* [#3298](https://github.com/RaRe-Technologies/gensim/pull/3298): test and build wheels for Py3.{7,8,9,10}, by [@mpenkov](https://github.com/mpenkov) * 
[#3274](https://github.com/RaRe-Technologies/gensim/pull/3274): Migrate setup.py from distutils to setuptools, by [@geojacobm6](https://github.com/geojacobm6) -* [#3286](https://github.com/RaRe-Technologies/gensim/pull/3286): Fixes 'not enough arguments for format string' error, by [@gilbertfrancois](https://github.com/gilbertfrancois) +* [#3255](https://github.com/RaRe-Technologies/gensim/pull/3255): Move windows tests from azure to github actions, by [@menshikh-iv](https://github.com/menshikh-iv) +* [#3252](https://github.com/RaRe-Technologies/gensim/pull/3252): Add Codecov to gensim repo, by [@menshikh-iv](https://github.com/menshikh-iv) +* [#3263](https://github.com/RaRe-Technologies/gensim/pull/3263): Remove commented out pytest-rerunfailures test dependency, by [@pabs3](https://github.com/pabs3) + +### Unsorted + +* [#3297](https://github.com/RaRe-Technologies/gensim/pull/3297): Use gensim.test.utils datapath() to construct paths to the test data, by [@pabs3](https://github.com/pabs3) +* [#3281](https://github.com/RaRe-Technologies/gensim/pull/3281): adjust test_parallel bound, by [@austereantelope](https://github.com/austereantelope) +* [#3280](https://github.com/RaRe-Technologies/gensim/pull/3280): tighten test_topic_word, by [@austereantelope](https://github.com/austereantelope) +* [#3278](https://github.com/RaRe-Technologies/gensim/pull/3278): Tighten test_parallel bound, by [@austereantelope](https://github.com/austereantelope) +* [#3271](https://github.com/RaRe-Technologies/gensim/pull/3271): Added new ValueError in place of assertion error for no model data provided in lsi model, by [@mark-todd](https://github.com/mark-todd) +* [#3264](https://github.com/RaRe-Technologies/gensim/pull/3264): Detect when a fasttext executable is available in PATH, by [@pabs3](https://github.com/pabs3) +* [#3258](https://github.com/RaRe-Technologies/gensim/pull/3258): Adding another check to _check_corpus_sanity for compressed files, adding test, by 
[@dchaplinsky](https://github.com/dchaplinsky) +* [#3254](https://github.com/RaRe-Technologies/gensim/pull/3254): Skip blinking test `test_translate_gc` on OSX + py3.9, by [@menshikh-iv](https://github.com/menshikh-iv) +* [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by [@menshikh-iv](https://github.com/menshikh-iv) +* [#3247](https://github.com/RaRe-Technologies/gensim/pull/3247): Sparse2Corpus: update __getitem__ to work on slices, lists and ellipsis, by [@PrimozGodec](https://github.com/PrimozGodec) +* [#3230](https://github.com/RaRe-Technologies/gensim/pull/3230): Adding lifecycle configuration, by [@mpenkov](https://github.com/mpenkov) +* [#3197](https://github.com/RaRe-Technologies/gensim/pull/3197): Fix computation of topic coherence, by [@silviatti](https://github.com/silviatti) +* [#3194](https://github.com/RaRe-Technologies/gensim/pull/3194): Added random_seed parameter to make LsiModel reproducible, by [@parashardhapola](https://github.com/parashardhapola) +* [#3190](https://github.com/RaRe-Technologies/gensim/pull/3190): Fix broken external link for LDA implementation, by [@ahaya3776](https://github.com/ahaya3776) +* [#3188](https://github.com/RaRe-Technologies/gensim/pull/3188): Add get_sentence_vector() to FastText and get_mean_vector() to KeyedVectors, by [@rock420](https://github.com/rock420) +* [#3182](https://github.com/RaRe-Technologies/gensim/pull/3182): Fix error message when Doc2Vec does not receive corpus_file or corpus iterable, by [@blainedietrich](https://github.com/blainedietrich) +* [#3117](https://github.com/RaRe-Technologies/gensim/pull/3117): Ensure next_index available when loading old stored KeyedVectors models, by [@gojomo](https://github.com/gojomo) +* [#2656](https://github.com/RaRe-Technologies/gensim/pull/2656): streamlining most_similar_cosmul and evaluate_word_analogies, by [@n3hrox](https://github.com/n3hrox) + +### :question: Closed issues 
+ +TODO: move each issue to its appropriate section or delete if irrelevant + +* [#3337](https://github.com/RaRe-Technologies/gensim/issues/3337): Problem with numpy=1.21.5 and gensim +* [#3333](https://github.com/RaRe-Technologies/gensim/issues/3333): KeyedVector most_similar() use too much CPU +* [#3325](https://github.com/RaRe-Technologies/gensim/issues/3325): AttributeError: 'KeyedVectors' object has no attribute 'add' +* [#3321](https://github.com/RaRe-Technologies/gensim/issues/3321): Improve models dump +* [#3319](https://github.com/RaRe-Technologies/gensim/issues/3319): PorterStemmer doesn't install +* [#3316](https://github.com/RaRe-Technologies/gensim/issues/3316): corpora.TextDirectoryCorpus fails on utf-8 encoded files on windows +* [#3313](https://github.com/RaRe-Technologies/gensim/issues/3313): Unable to find equivalent of doctag_syn0 in version 3 in version 4 +* [#3306](https://github.com/RaRe-Technologies/gensim/issues/3306): Text8corpuse error output causing OOM +* [#3296](https://github.com/RaRe-Technologies/gensim/issues/3296): LSI add_documents +* [#3288](https://github.com/RaRe-Technologies/gensim/issues/3288): Python 3.10 wheels +* [#3285](https://github.com/RaRe-Technologies/gensim/issues/3285): Infer vectors for each word of a new document +* [#3277](https://github.com/RaRe-Technologies/gensim/issues/3277): All the vocab model files are not saved +* [#3268](https://github.com/RaRe-Technologies/gensim/issues/3268): Can't suppress lifecycle events +* [#3267](https://github.com/RaRe-Technologies/gensim/issues/3267): ImportError : Ensemble LDA +* [#3266](https://github.com/RaRe-Technologies/gensim/issues/3266): Incorrect CBOW implementation in Gensim leads to inferior performance +* [#3249](https://github.com/RaRe-Technologies/gensim/issues/3249): Installing older version of Gensim gives a newer version +* [#3248](https://github.com/RaRe-Technologies/gensim/issues/3248): LdaMallet error returned non-zero exit status 1. 
+* [#3246](https://github.com/RaRe-Technologies/gensim/issues/3246): Partial support of compressed corpora in FastText model +* [#3245](https://github.com/RaRe-Technologies/gensim/issues/3245): Log level control +* [#3243](https://github.com/RaRe-Technologies/gensim/issues/3243): default estimation method of gensim's word2vec skipgram? +* [#3242](https://github.com/RaRe-Technologies/gensim/issues/3242): Computing WmdSimilarity each-with-each +* [#3241](https://github.com/RaRe-Technologies/gensim/issues/3241): I cannot import remove_stopword_tokens +* [#3240](https://github.com/RaRe-Technologies/gensim/issues/3240): Trying to get in touch regarding a security issue +* [#3233](https://github.com/RaRe-Technologies/gensim/issues/3233): Ask travis-ci.com for more credits +* [#3226](https://github.com/RaRe-Technologies/gensim/issues/3226): numpy 1.19.2 incompatible with gensim 4.1.0 +* [#3181](https://github.com/RaRe-Technologies/gensim/issues/3181): Mismatch get_coherence_per_topic and get_coherence for single topic +* [#3162](https://github.com/RaRe-Technologies/gensim/issues/3162): Doc2Vec: when we have string tags, build_vocab with update removes previous index +* [#3036](https://github.com/RaRe-Technologies/gensim/issues/3036): import gensim segmentation fault (macOS Big Sur, Apple M1/Apple Silicon/ARM) +* [#3015](https://github.com/RaRe-Technologies/gensim/issues/3015): Add convenience `get_sentence_vector()`-like methods for FastText, other models +* [#2535](https://github.com/RaRe-Technologies/gensim/issues/2535): streamlining most_similar_cosmul +* [#483](https://github.com/RaRe-Technologies/gensim/issues/483): Doc2Vec.infer_vector: AttributeError: 'Doc2Vec' object has no attribute 'syn1' ## 4.1.2, 2021-09-17 @@ -100,11 +189,11 @@ Plus a large number of smaller improvements and fixes, as usual. 
* [#3142](https://github.com/RaRe-Technologies/gensim/pull/3142): Use more permanent pdf link and update code link, by [@dymil](https://github.com/dymil) * [#3141](https://github.com/RaRe-Technologies/gensim/pull/3141): Update link for online LDA paper, by [@dymil](https://github.com/dymil) * [#3133](https://github.com/RaRe-Technologies/gensim/pull/3133): Update link to Hoffman paper (online VB LDA), by [@jonaschn](https://github.com/jonaschn) -* [#3129](https://github.com/RaRe-Technologies/gensim/pull/3129): [MRG] Add bronze sponsor: TechTarget, by [@piskvorky](https://github.com/piskvorky) +* [#3129](https://github.com/RaRe-Technologies/gensim/pull/3129): Add bronze sponsor: TechTarget, by [@piskvorky](https://github.com/piskvorky) * [#3126](https://github.com/RaRe-Technologies/gensim/pull/3126): Fix typos in make_wiki_online.py and make_wikicorpus.py, by [@nicolasassi](https://github.com/nicolasassi) * [#3125](https://github.com/RaRe-Technologies/gensim/pull/3125): Improve & unify docs for dirichlet priors, by [@jonaschn](https://github.com/jonaschn) * [#3123](https://github.com/RaRe-Technologies/gensim/pull/3123): Fix hyperlink for doc2vec tutorial, by [@AdityaSoni19031997](https://github.com/AdityaSoni19031997) -* [#3121](https://github.com/RaRe-Technologies/gensim/pull/3121): [MRG] Add bronze sponsor: eaccidents.com, by [@piskvorky](https://github.com/piskvorky) +* [#3121](https://github.com/RaRe-Technologies/gensim/pull/3121): Add bronze sponsor: eaccidents.com, by [@piskvorky](https://github.com/piskvorky) * [#3120](https://github.com/RaRe-Technologies/gensim/pull/3120): Fix URL for ldamodel.py, by [@jonaschn](https://github.com/jonaschn) * [#3118](https://github.com/RaRe-Technologies/gensim/pull/3118): Fix URL in doc string, by [@jonaschn](https://github.com/jonaschn) * [#3107](https://github.com/RaRe-Technologies/gensim/pull/3107): Draw attention to sponsoring in README, by [@piskvorky](https://github.com/piskvorky) From 
efd5b778d6381d67063c1ecf831b58efc81f908f Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Tue, 26 Apr 2022 22:07:54 +0900 Subject: [PATCH 74/81] git add release/generate_changelog.py --- release/generate_changelog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/generate_changelog.py b/release/generate_changelog.py index 73c930fe05..62ca7b329b 100644 --- a/release/generate_changelog.py +++ b/release/generate_changelog.py @@ -11,7 +11,7 @@ import time -def throttle_get(*args, seconds=3, **kwargs): +def throttle_get(*args, seconds=10, **kwargs): result = requests.get(*args, **kwargs) result.raise_for_status() From 97f63e06eaf0626e13aa4b8ecc70057f923814cd Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 29 Apr 2022 11:38:41 +0900 Subject: [PATCH 75/81] more work on sorting issues and PRs --- CHANGELOG.md | 144 ++++++++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 66 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 25a1f6e41a..1f1fc9f8dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,100 +7,112 @@ Changes ### :+1: New features -* [#3317](https://github.com/RaRe-Technologies/gensim/pull/3317): Added `encoding` parameter to TextDirectoryCorpus, by [@Sandman-Ren](https://github.com/Sandman-Ren) +* [#3188](https://github.com/RaRe-Technologies/gensim/pull/3188): Add get_sentence_vector() to FastText and get_mean_vector() to KeyedVectors, by [@rock420](https://github.com/rock420) +* [#3194](https://github.com/RaRe-Technologies/gensim/pull/3194): Added random_seed parameter to make LsiModel reproducible, by [@parashardhapola](https://github.com/parashardhapola) +* [#3247](https://github.com/RaRe-Technologies/gensim/pull/3247): Sparse2Corpus: update __getitem__ to work on slices, lists and ellipsis, by [@PrimozGodec](https://github.com/PrimozGodec) +* [#3264](https://github.com/RaRe-Technologies/gensim/pull/3264): Detect when a fasttext executable is available in PATH, by 
[@pabs3](https://github.com/pabs3) +* [#3271](https://github.com/RaRe-Technologies/gensim/pull/3271): Added new ValueError in place of assertion error for no model data provided in lsi model, by [@mark-todd](https://github.com/mark-todd) * [#3299](https://github.com/RaRe-Technologies/gensim/pull/3299): Enable test_word2vec_stand_alone_script by using sys.executable for python, by [@pabs3](https://github.com/pabs3) +* [#3317](https://github.com/RaRe-Technologies/gensim/pull/3317): Added `encoding` parameter to TextDirectoryCorpus, by [@Sandman-Ren](https://github.com/Sandman-Ren) ### :books: Tutorials and docs -* [#3307](https://github.com/RaRe-Technologies/gensim/pull/3307): Documentation fixes, by [@piskvorky](https://github.com/piskvorky) -* [#3301](https://github.com/RaRe-Technologies/gensim/pull/3301): Remove unused Jupyter screenshots, by [@pabs3](https://github.com/pabs3) -* [#3289](https://github.com/RaRe-Technologies/gensim/pull/3289): Typos, text and code fix in LDA tutorial, by [@davebulaval](https://github.com/davebulaval) -* [#3284](https://github.com/RaRe-Technologies/gensim/pull/3284): Documentation fixes + added CITATION.cff, by [@piskvorky](https://github.com/piskvorky) -* [#3279](https://github.com/RaRe-Technologies/gensim/pull/3279): Add the FastSS and Levenshtein modules to docs, by [@piskvorky](https://github.com/piskvorky) -* [#3257](https://github.com/RaRe-Technologies/gensim/pull/3257): Dictionary doc: ref FAQ entry about filter_extremes corpus migration, by [@zacchiro](https://github.com/zacchiro) -* [#3235](https://github.com/RaRe-Technologies/gensim/pull/3235): Fix TFIDF docs, by [@piskvorky](https://github.com/piskvorky) * [#3227](https://github.com/RaRe-Technologies/gensim/pull/3227): Fix FastText doc-comment example for `build_vocab` and `train` to use correct argument names, by [@HLasse](https://github.com/HLasse) +* [#3235](https://github.com/RaRe-Technologies/gensim/pull/3235): Fix TFIDF docs, by 
[@piskvorky](https://github.com/piskvorky) +* [#3257](https://github.com/RaRe-Technologies/gensim/pull/3257): Dictionary doc: ref FAQ entry about filter_extremes corpus migration, by [@zacchiro](https://github.com/zacchiro) +* [#3279](https://github.com/RaRe-Technologies/gensim/pull/3279): Add the FastSS and Levenshtein modules to docs, by [@piskvorky](https://github.com/piskvorky) +* [#3284](https://github.com/RaRe-Technologies/gensim/pull/3284): Documentation fixes + added CITATION.cff, by [@piskvorky](https://github.com/piskvorky) +* [#3289](https://github.com/RaRe-Technologies/gensim/pull/3289): Typos, text and code fix in LDA tutorial, by [@davebulaval](https://github.com/davebulaval) +* [#3301](https://github.com/RaRe-Technologies/gensim/pull/3301): Remove unused Jupyter screenshots, by [@pabs3](https://github.com/pabs3) +* [#3307](https://github.com/RaRe-Technologies/gensim/pull/3307): Documentation fixes, by [@piskvorky](https://github.com/piskvorky) ### :red_circle: Bug fixes -* [#3332](https://github.com/RaRe-Technologies/gensim/pull/3332): Missing `f` prefix on f-strings fix, by [@code-review-doctor](https://github.com/code-review-doctor) -* [#3309](https://github.com/RaRe-Technologies/gensim/pull/3309): Respect encoding when reading binary keyed vectors, by [@alhoo](https://github.com/alhoo) +* [#3117](https://github.com/RaRe-Technologies/gensim/pull/3117): Ensure next_index available when loading old stored KeyedVectors models, by [@gojomo](https://github.com/gojomo) +* [#3182](https://github.com/RaRe-Technologies/gensim/pull/3182): Fix error message when Doc2Vec does not receive corpus_file or corpus iterable, by [@blainedietrich](https://github.com/blainedietrich) +* [#3190](https://github.com/RaRe-Technologies/gensim/pull/3190): Fix broken external link for LDA implementation, by [@ahaya3776](https://github.com/ahaya3776) +* [#3197](https://github.com/RaRe-Technologies/gensim/pull/3197): Fix computation of topic coherence, by 
[@silviatti](https://github.com/silviatti) +* [#3250](https://github.com/RaRe-Technologies/gensim/pull/3250): Make negative ns_exponent work correctly, by [@menshikh-iv](https://github.com/menshikh-iv) * [#3282](https://github.com/RaRe-Technologies/gensim/pull/3282): Fix `str()` method in WmdSimilarity, by [@DingQK](https://github.com/DingQK) * [#3286](https://github.com/RaRe-Technologies/gensim/pull/3286): Fixes 'not enough arguments for format string' error, by [@gilbertfrancois](https://github.com/gilbertfrancois) -* [#3250](https://github.com/RaRe-Technologies/gensim/pull/3250): Make negative ns_exponent work correctly, by [@menshikh-iv](https://github.com/menshikh-iv) +* [#3309](https://github.com/RaRe-Technologies/gensim/pull/3309): Respect encoding when reading binary keyed vectors, by [@alhoo](https://github.com/alhoo) +* [#3332](https://github.com/RaRe-Technologies/gensim/pull/3332): Missing `f` prefix on f-strings fix, by [@code-review-doctor](https://github.com/code-review-doctor) ### :warning: Removed functionality & deprecations ### 🔮 Testing, CI, housekeeping -* [#3329](https://github.com/RaRe-Technologies/gensim/pull/3329): Check gallery up to date as part of CI, by [@mpenkov](https://github.com/mpenkov) -* [#3318](https://github.com/RaRe-Technologies/gensim/pull/3318): Clean up evaluate_word_pairs code, by [@piskvorky](https://github.com/piskvorky) -* [#3308](https://github.com/RaRe-Technologies/gensim/pull/3308): get rid of tox, build things via github actions directly, by [@mpenkov](https://github.com/mpenkov) -* [#3303](https://github.com/RaRe-Technologies/gensim/pull/3303): add GitHub URL for PyPi, by [@andriyor](https://github.com/andriyor) -* [#3300](https://github.com/RaRe-Technologies/gensim/pull/3300): Fix code formatting for FT_CMD definition, by [@pabs3](https://github.com/pabs3) -* [#3298](https://github.com/RaRe-Technologies/gensim/pull/3298): test and build wheels for Py3.{7,8,9,10}, by [@mpenkov](https://github.com/mpenkov) -* 
[#3274](https://github.com/RaRe-Technologies/gensim/pull/3274): Migrate setup.py from distutils to setuptools, by [@geojacobm6](https://github.com/geojacobm6) -* [#3255](https://github.com/RaRe-Technologies/gensim/pull/3255): Move windows tests from azure to github actions, by [@menshikh-iv](https://github.com/menshikh-iv) +* [#3230](https://github.com/RaRe-Technologies/gensim/pull/3230): Adding lifecycle configuration, by [@mpenkov](https://github.com/mpenkov) * [#3252](https://github.com/RaRe-Technologies/gensim/pull/3252): Add Codecov to gensim repo, by [@menshikh-iv](https://github.com/menshikh-iv) +* [#3255](https://github.com/RaRe-Technologies/gensim/pull/3255): Move windows tests from azure to github actions, by [@menshikh-iv](https://github.com/menshikh-iv) * [#3263](https://github.com/RaRe-Technologies/gensim/pull/3263): Remove commented out pytest-rerunfailures test dependency, by [@pabs3](https://github.com/pabs3) +* [#3274](https://github.com/RaRe-Technologies/gensim/pull/3274): Migrate setup.py from distutils to setuptools, by [@geojacobm6](https://github.com/geojacobm6) +* [#3298](https://github.com/RaRe-Technologies/gensim/pull/3298): test and build wheels for Py3.{7,8,9,10}, by [@mpenkov](https://github.com/mpenkov) +* [#3300](https://github.com/RaRe-Technologies/gensim/pull/3300): Fix code formatting for FT_CMD definition, by [@pabs3](https://github.com/pabs3) +* [#3303](https://github.com/RaRe-Technologies/gensim/pull/3303): add GitHub URL for PyPi, by [@andriyor](https://github.com/andriyor) +* [#3308](https://github.com/RaRe-Technologies/gensim/pull/3308): get rid of tox, build things via github actions directly, by [@mpenkov](https://github.com/mpenkov) +* [#3318](https://github.com/RaRe-Technologies/gensim/pull/3318): Clean up evaluate_word_pairs code, by [@piskvorky](https://github.com/piskvorky) +* [#3329](https://github.com/RaRe-Technologies/gensim/pull/3329): Check gallery up to date as part of CI, by [@mpenkov](https://github.com/mpenkov) 
-### Unsorted +### Minor improvements + +**I'm not sure if these belong here in the change log, as such changes aren't really visible to the regular users** -* [#3297](https://github.com/RaRe-Technologies/gensim/pull/3297): Use gensim.test.utils datapath() to construct paths to the test data, by [@pabs3](https://github.com/pabs3) -* [#3281](https://github.com/RaRe-Technologies/gensim/pull/3281): adjust test_parallel bound, by [@austereantelope](https://github.com/austereantelope) -* [#3280](https://github.com/RaRe-Technologies/gensim/pull/3280): tighten test_topic_word, by [@austereantelope](https://github.com/austereantelope) -* [#3278](https://github.com/RaRe-Technologies/gensim/pull/3278): Tighten test_parallel bound, by [@austereantelope](https://github.com/austereantelope) -* [#3271](https://github.com/RaRe-Technologies/gensim/pull/3271): Added new ValueError in place of assertion error for no model data provided in lsi model, by [@mark-todd](https://github.com/mark-todd) -* [#3264](https://github.com/RaRe-Technologies/gensim/pull/3264): Detect when a fasttext executable is available in PATH, by [@pabs3](https://github.com/pabs3) -* [#3258](https://github.com/RaRe-Technologies/gensim/pull/3258): Adding another check to _check_corpus_sanity for compressed files, adding test, by [@dchaplinsky](https://github.com/dchaplinsky) -* [#3254](https://github.com/RaRe-Technologies/gensim/pull/3254): Skip blinking test `test_translate_gc` on OSX + py3.9, by [@menshikh-iv](https://github.com/menshikh-iv) -* [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by [@menshikh-iv](https://github.com/menshikh-iv) -* [#3247](https://github.com/RaRe-Technologies/gensim/pull/3247): Sparse2Corpus: update __getitem__ to work on slices, lists and ellipsis, by [@PrimozGodec](https://github.com/PrimozGodec) -* [#3230](https://github.com/RaRe-Technologies/gensim/pull/3230): Adding lifecycle configuration, by 
[@mpenkov](https://github.com/mpenkov) -* [#3197](https://github.com/RaRe-Technologies/gensim/pull/3197): Fix computation of topic coherence, by [@silviatti](https://github.com/silviatti) -* [#3194](https://github.com/RaRe-Technologies/gensim/pull/3194): Added random_seed parameter to make LsiModel reproducible, by [@parashardhapola](https://github.com/parashardhapola) -* [#3190](https://github.com/RaRe-Technologies/gensim/pull/3190): Fix broken external link for LDA implementation, by [@ahaya3776](https://github.com/ahaya3776) -* [#3188](https://github.com/RaRe-Technologies/gensim/pull/3188): Add get_sentence_vector() to FastText and get_mean_vector() to KeyedVectors, by [@rock420](https://github.com/rock420) -* [#3182](https://github.com/RaRe-Technologies/gensim/pull/3182): Fix error message when Doc2Vec does not receive corpus_file or corpus iterable, by [@blainedietrich](https://github.com/blainedietrich) -* [#3117](https://github.com/RaRe-Technologies/gensim/pull/3117): Ensure next_index available when loading old stored KeyedVectors models, by [@gojomo](https://github.com/gojomo) * [#2656](https://github.com/RaRe-Technologies/gensim/pull/2656): streamlining most_similar_cosmul and evaluate_word_analogies, by [@n3hrox](https://github.com/n3hrox) +* [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by [@menshikh-iv](https://github.com/menshikh-iv) +* [#3254](https://github.com/RaRe-Technologies/gensim/pull/3254): Skip blinking test `test_translate_gc` on OSX + py3.9, by [@menshikh-iv](https://github.com/menshikh-iv) +* [#3258](https://github.com/RaRe-Technologies/gensim/pull/3258): Adding another check to _check_corpus_sanity for compressed files, adding test, by [@dchaplinsky](https://github.com/dchaplinsky) +* [#3278](https://github.com/RaRe-Technologies/gensim/pull/3278): Tighten test_parallel bound, by [@austereantelope](https://github.com/austereantelope) +* 
[#3280](https://github.com/RaRe-Technologies/gensim/pull/3280): tighten test_topic_word, by [@austereantelope](https://github.com/austereantelope) +* [#3281](https://github.com/RaRe-Technologies/gensim/pull/3281): adjust test_parallel bound, by [@austereantelope](https://github.com/austereantelope) +* [#3297](https://github.com/RaRe-Technologies/gensim/pull/3297): Use gensim.test.utils datapath() to construct paths to the test data, by [@pabs3](https://github.com/pabs3) ### :question: Closed issues -TODO: move each issue to its appropriate section or delete if irrelevant +**I've sorted the issues to the best of my ability. +Most of them appear irrelevant to the release. +I think we can get rid of this section altogether.** + +Duplicates: -* [#3337](https://github.com/RaRe-Technologies/gensim/issues/3337): Problem with numpy=1.21.5 and gensim -* [#3333](https://github.com/RaRe-Technologies/gensim/issues/3333): KeyedVector most_similar() use too much CPU -* [#3325](https://github.com/RaRe-Technologies/gensim/issues/3325): AttributeError: 'KeyedVectors' object has no attribute 'add' -* [#3321](https://github.com/RaRe-Technologies/gensim/issues/3321): Improve models dump -* [#3319](https://github.com/RaRe-Technologies/gensim/issues/3319): PorterStemmer doesn't install -* [#3316](https://github.com/RaRe-Technologies/gensim/issues/3316): corpora.TextDirectoryCorpus fails on utf-8 encoded files on windows -* [#3313](https://github.com/RaRe-Technologies/gensim/issues/3313): Unable to find equivalent of doctag_syn0 in version 3 in version 4 -* [#3306](https://github.com/RaRe-Technologies/gensim/issues/3306): Text8corpuse error output causing OOM -* [#3296](https://github.com/RaRe-Technologies/gensim/issues/3296): LSI add_documents -* [#3288](https://github.com/RaRe-Technologies/gensim/issues/3288): Python 3.10 wheels -* [#3285](https://github.com/RaRe-Technologies/gensim/issues/3285): Infer vectors for each word of a new document -* 
[#3277](https://github.com/RaRe-Technologies/gensim/issues/3277): All the vocab model files are not saved -* [#3268](https://github.com/RaRe-Technologies/gensim/issues/3268): Can't suppress lifecycle events -* [#3267](https://github.com/RaRe-Technologies/gensim/issues/3267): ImportError : Ensemble LDA * [#3266](https://github.com/RaRe-Technologies/gensim/issues/3266): Incorrect CBOW implementation in Gensim leads to inferior performance -* [#3249](https://github.com/RaRe-Technologies/gensim/issues/3249): Installing older version of Gensim gives a newer version -* [#3248](https://github.com/RaRe-Technologies/gensim/issues/3248): LdaMallet error returned non-zero exit status 1. -* [#3246](https://github.com/RaRe-Technologies/gensim/issues/3246): Partial support of compressed corpora in FastText model -* [#3245](https://github.com/RaRe-Technologies/gensim/issues/3245): Log level control -* [#3243](https://github.com/RaRe-Technologies/gensim/issues/3243): default estimation method of gensim's word2vec skipgram? 
-* [#3242](https://github.com/RaRe-Technologies/gensim/issues/3242): Computing WmdSimilarity each-with-each -* [#3241](https://github.com/RaRe-Technologies/gensim/issues/3241): I cannot import remove_stopword_tokens -* [#3240](https://github.com/RaRe-Technologies/gensim/issues/3240): Trying to get in touch regarding a security issue -* [#3233](https://github.com/RaRe-Technologies/gensim/issues/3233): Ask travis-ci.com for more credits -* [#3226](https://github.com/RaRe-Technologies/gensim/issues/3226): numpy 1.19.2 incompatible with gensim 4.1.0 -* [#3181](https://github.com/RaRe-Technologies/gensim/issues/3181): Mismatch get_coherence_per_topic and get_coherence for single topic -* [#3162](https://github.com/RaRe-Technologies/gensim/issues/3162): Doc2Vec: when we have string tags, build_vocab with update removes previous index -* [#3036](https://github.com/RaRe-Technologies/gensim/issues/3036): import gensim segmentation fault (macOS Big Sur, Apple M1/Apple Silicon/ARM) + +Junk (not a bug report, cannot reproduce, etc): + * [#3015](https://github.com/RaRe-Technologies/gensim/issues/3015): Add convenience `get_sentence_vector()`-like methods for FastText, other models -* [#2535](https://github.com/RaRe-Technologies/gensim/issues/2535): streamlining most_similar_cosmul +* [#3036](https://github.com/RaRe-Technologies/gensim/issues/3036): import gensim segmentation fault (macOS Big Sur, Apple M1/Apple Silicon/ARM) +* [#3162](https://github.com/RaRe-Technologies/gensim/issues/3162): Doc2Vec: when we have string tags, build_vocab with update removes previous index +* [#3226](https://github.com/RaRe-Technologies/gensim/issues/3226): numpy 1.19.2 incompatible with gensim 4.1.0 +* [#3233](https://github.com/RaRe-Technologies/gensim/issues/3233): Ask travis-ci.com for more credits +* [#3240](https://github.com/RaRe-Technologies/gensim/issues/3240): Trying to get in touch regarding a security issue +* [#3241](https://github.com/RaRe-Technologies/gensim/issues/3241): I cannot 
import remove_stopword_tokens +* [#3242](https://github.com/RaRe-Technologies/gensim/issues/3242): Computing WmdSimilarity each-with-each +* [#3243](https://github.com/RaRe-Technologies/gensim/issues/3243): default estimation method of gensim's word2vec skipgram? +* [#3245](https://github.com/RaRe-Technologies/gensim/issues/3245): Log level control +* [#3248](https://github.com/RaRe-Technologies/gensim/issues/3248): LdaMallet error returned non-zero exit status 1. +* [#3249](https://github.com/RaRe-Technologies/gensim/issues/3249): Installing older version of Gensim gives a newer version +* [#3267](https://github.com/RaRe-Technologies/gensim/issues/3267): ImportError : Ensemble LDA +* [#3268](https://github.com/RaRe-Technologies/gensim/issues/3268): Can't suppress lifecycle events +* [#3277](https://github.com/RaRe-Technologies/gensim/issues/3277): All the vocab model files are not saved +* [#3285](https://github.com/RaRe-Technologies/gensim/issues/3285): Infer vectors for each word of a new document +* [#3296](https://github.com/RaRe-Technologies/gensim/issues/3296): LSI add_documents +* [#3306](https://github.com/RaRe-Technologies/gensim/issues/3306): Text8corpuse error output causing OOM +* [#3313](https://github.com/RaRe-Technologies/gensim/issues/3313): Unable to find equivalent of doctag_syn0 in version 3 in version 4 +* [#3319](https://github.com/RaRe-Technologies/gensim/issues/3319): PorterStemmer doesn't install +* [#3321](https://github.com/RaRe-Technologies/gensim/issues/3321): Improve models dump +* [#3325](https://github.com/RaRe-Technologies/gensim/issues/3325): AttributeError: 'KeyedVectors' object has no attribute 'add' +* [#3333](https://github.com/RaRe-Technologies/gensim/issues/3333): KeyedVector most_similar() use too much CPU +* [#3337](https://github.com/RaRe-Technologies/gensim/issues/3337): Problem with numpy=1.21.5 and gensim * [#483](https://github.com/RaRe-Technologies/gensim/issues/483): Doc2Vec.infer_vector: AttributeError: 'Doc2Vec' 
object has no attribute 'syn1' +Closed by one of the PRs mentioned above: + +* [#2535](https://github.com/RaRe-Technologies/gensim/issues/2535): streamlining most_similar_cosmul +* [#3181](https://github.com/RaRe-Technologies/gensim/issues/3181): Mismatch get_coherence_per_topic and get_coherence for single topic +* [#3246](https://github.com/RaRe-Technologies/gensim/issues/3246): Partial support of compressed corpora in FastText model +* [#3288](https://github.com/RaRe-Technologies/gensim/issues/3288): Python 3.10 wheels +* [#3316](https://github.com/RaRe-Technologies/gensim/issues/3316): corpora.TextDirectoryCorpus fails on utf-8 encoded files on windows + ## 4.1.2, 2021-09-17 This is a bugfix release that addresses left over compatibility issues with older versions of numpy and MacOS. From ed8122ed3b704496178f79ec7e8c32c35eb9b3fa Mon Sep 17 00:00:00 2001 From: MattYoon <57797966+MattYoon@users.noreply.github.com> Date: Fri, 29 Apr 2022 11:54:36 +0900 Subject: [PATCH 76/81] fix FastText Docs (#3339) --- gensim/models/fasttext.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 5ea5077a0c..7c0ec8501b 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -968,11 +968,6 @@ def __init__(self, vector_size, min_n, max_n, bucket, count=0, dtype=REAL): buckets_word : list of np.array For each key (by its index), report bucket slots their subwords map to. - When used in training, FastTextKeyedVectors may be decorated with - extra attributes that closely associate with its core attributes, - such as the experimental vectors_vocab_lockf and vectors_ngrams_lockf - training-update-dampening factors. 
- """ super(FastTextKeyedVectors, self).__init__(vector_size=vector_size, count=count, dtype=dtype) self.min_n = min_n From 7d942b24f7b6c960fdb5df1f6f48902573b615be Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 29 Apr 2022 11:55:34 +0900 Subject: [PATCH 77/81] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1f1fc9f8dd..d01da753e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,6 +25,7 @@ Changes * [#3289](https://github.com/RaRe-Technologies/gensim/pull/3289): Typos, text and code fix in LDA tutorial, by [@davebulaval](https://github.com/davebulaval) * [#3301](https://github.com/RaRe-Technologies/gensim/pull/3301): Remove unused Jupyter screenshots, by [@pabs3](https://github.com/pabs3) * [#3307](https://github.com/RaRe-Technologies/gensim/pull/3307): Documentation fixes, by [@piskvorky](https://github.com/piskvorky) +* [#3339](https://github.com/RaRe-Technologies/gensim/pull/3339): fix parsing error in FastText Docs, by [@MattYoon](https://github.com/MattYoon) ### :red_circle: Bug fixes From f5bc192693738ddb60e85df2f836bb0b35265180 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 29 Apr 2022 14:49:30 +0200 Subject: [PATCH 78/81] get rid of "bad issues" section in CHANGELOG --- CHANGELOG.md | 48 +++--------------------------------------------- 1 file changed, 3 insertions(+), 45 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d01da753e3..ac3d92531e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ Changes ## Unreleased + ## 5.0.0, 2022-04-29 ### :+1: New features @@ -68,60 +69,17 @@ Changes * [#3281](https://github.com/RaRe-Technologies/gensim/pull/3281): adjust test_parallel bound, by [@austereantelope](https://github.com/austereantelope) * [#3297](https://github.com/RaRe-Technologies/gensim/pull/3297): Use gensim.test.utils datapath() to construct paths to the test data, by [@pabs3](https://github.com/pabs3) -### 
:question: Closed issues - -**I've sorted the issues to the best of my ability. -Most of them appear irrelevant to the release. -I think we can get rid of this section altogether.** - -Duplicates: - -* [#3266](https://github.com/RaRe-Technologies/gensim/issues/3266): Incorrect CBOW implementation in Gensim leads to inferior performance - -Junk (not a bug report, cannot reproduce, etc): - -* [#3015](https://github.com/RaRe-Technologies/gensim/issues/3015): Add convenience `get_sentence_vector()`-like methods for FastText, other models -* [#3036](https://github.com/RaRe-Technologies/gensim/issues/3036): import gensim segmentation fault (macOS Big Sur, Apple M1/Apple Silicon/ARM) -* [#3162](https://github.com/RaRe-Technologies/gensim/issues/3162): Doc2Vec: when we have string tags, build_vocab with update removes previous index -* [#3226](https://github.com/RaRe-Technologies/gensim/issues/3226): numpy 1.19.2 incompatible with gensim 4.1.0 -* [#3233](https://github.com/RaRe-Technologies/gensim/issues/3233): Ask travis-ci.com for more credits -* [#3240](https://github.com/RaRe-Technologies/gensim/issues/3240): Trying to get in touch regarding a security issue -* [#3241](https://github.com/RaRe-Technologies/gensim/issues/3241): I cannot import remove_stopword_tokens -* [#3242](https://github.com/RaRe-Technologies/gensim/issues/3242): Computing WmdSimilarity each-with-each -* [#3243](https://github.com/RaRe-Technologies/gensim/issues/3243): default estimation method of gensim's word2vec skipgram? -* [#3245](https://github.com/RaRe-Technologies/gensim/issues/3245): Log level control -* [#3248](https://github.com/RaRe-Technologies/gensim/issues/3248): LdaMallet error returned non-zero exit status 1. 
-* [#3249](https://github.com/RaRe-Technologies/gensim/issues/3249): Installing older version of Gensim gives a newer version -* [#3267](https://github.com/RaRe-Technologies/gensim/issues/3267): ImportError : Ensemble LDA -* [#3268](https://github.com/RaRe-Technologies/gensim/issues/3268): Can't suppress lifecycle events -* [#3277](https://github.com/RaRe-Technologies/gensim/issues/3277): All the vocab model files are not saved -* [#3285](https://github.com/RaRe-Technologies/gensim/issues/3285): Infer vectors for each word of a new document -* [#3296](https://github.com/RaRe-Technologies/gensim/issues/3296): LSI add_documents -* [#3306](https://github.com/RaRe-Technologies/gensim/issues/3306): Text8corpuse error output causing OOM -* [#3313](https://github.com/RaRe-Technologies/gensim/issues/3313): Unable to find equivalent of doctag_syn0 in version 3 in version 4 -* [#3319](https://github.com/RaRe-Technologies/gensim/issues/3319): PorterStemmer doesn't install -* [#3321](https://github.com/RaRe-Technologies/gensim/issues/3321): Improve models dump -* [#3325](https://github.com/RaRe-Technologies/gensim/issues/3325): AttributeError: 'KeyedVectors' object has no attribute 'add' -* [#3333](https://github.com/RaRe-Technologies/gensim/issues/3333): KeyedVector most_similar() use too much CPU -* [#3337](https://github.com/RaRe-Technologies/gensim/issues/3337): Problem with numpy=1.21.5 and gensim -* [#483](https://github.com/RaRe-Technologies/gensim/issues/483): Doc2Vec.infer_vector: AttributeError: 'Doc2Vec' object has no attribute 'syn1' - -Closed by one of the PRs mentioned above: - -* [#2535](https://github.com/RaRe-Technologies/gensim/issues/2535): streamlining most_similar_cosmul -* [#3181](https://github.com/RaRe-Technologies/gensim/issues/3181): Mismatch get_coherence_per_topic and get_coherence for single topic -* [#3246](https://github.com/RaRe-Technologies/gensim/issues/3246): Partial support of compressed corpora in FastText model -* 
[#3288](https://github.com/RaRe-Technologies/gensim/issues/3288): Python 3.10 wheels -* [#3316](https://github.com/RaRe-Technologies/gensim/issues/3316): corpora.TextDirectoryCorpus fails on utf-8 encoded files on windows ## 4.1.2, 2021-09-17 This is a bugfix release that addresses left over compatibility issues with older versions of numpy and MacOS. + ## 4.1.1, 2021-09-14 This is a bugfix release that addresses compatibility issues with older versions of numpy. + ## 4.1.0, 2021-08-15 Gensim 4.1 brings two major new functionalities: From d570cae9d03580672bd8937fafc4db8633249627 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 29 Apr 2022 17:51:41 +0200 Subject: [PATCH 79/81] rename release to 4.2.0 --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac3d92531e..d77ade4832 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ Changes ## Unreleased -## 5.0.0, 2022-04-29 +## 4.2.0, 2022-04-29 ### :+1: New features From a5fd65c68ca17d0956147be7be821a5b6278f242 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 29 Apr 2022 18:04:54 +0200 Subject: [PATCH 80/81] bumped version to 4.2.0 --- docs/src/conf.py | 4 ++-- gensim/__init__.py | 2 +- setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/src/conf.py b/docs/src/conf.py index 669a56a20a..168d4cf58e 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -61,9 +61,9 @@ # built documents. # # The short X.Y version. -version = '4.1' +version = '4.2.0' # The full version, including alpha/beta/rc tags. -release = '4.1.3.dev0' +release = '4.2.0' # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. 
diff --git a/gensim/__init__.py b/gensim/__init__.py index c97e0f74ae..b5f915a3ed 100644 --- a/gensim/__init__.py +++ b/gensim/__init__.py @@ -4,7 +4,7 @@ """ -__version__ = '4.1.3.dev0' +__version__ = '4.2.0' import logging diff --git a/setup.py b/setup.py index 1be3057c3e..e3ee0c3bdb 100644 --- a/setup.py +++ b/setup.py @@ -334,7 +334,7 @@ def run(self): setup( name='gensim', - version='4.1.3.dev0', + version='4.2.0', description='Python framework for fast Vector Space Modelling', long_description=LONG_DESCRIPTION, From 239792ebe20cb8bbfb253790e5cfbaadbc6e7157 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Sun, 1 May 2022 09:56:42 +0200 Subject: [PATCH 81/81] update CHANGELOG --- CHANGELOG.md | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d77ade4832..9718b90b64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,8 @@ Changes * [#3271](https://github.com/RaRe-Technologies/gensim/pull/3271): Added new ValueError in place of assertion error for no model data provided in lsi model, by [@mark-todd](https://github.com/mark-todd) * [#3299](https://github.com/RaRe-Technologies/gensim/pull/3299): Enable test_word2vec_stand_alone_script by using sys.executable for python, by [@pabs3](https://github.com/pabs3) * [#3317](https://github.com/RaRe-Technologies/gensim/pull/3317): Added `encoding` parameter to TextDirectoryCorpus, by [@Sandman-Ren](https://github.com/Sandman-Ren) +* [#2656](https://github.com/RaRe-Technologies/gensim/pull/2656): Streamlining most_similar_cosmul and evaluate_word_analogies, by [@n3hrox](https://github.com/n3hrox) + ### :books: Tutorials and docs @@ -26,7 +28,8 @@ Changes * [#3289](https://github.com/RaRe-Technologies/gensim/pull/3289): Typos, text and code fix in LDA tutorial, by [@davebulaval](https://github.com/davebulaval) * [#3301](https://github.com/RaRe-Technologies/gensim/pull/3301): Remove unused Jupyter screenshots, by 
[@pabs3](https://github.com/pabs3) * [#3307](https://github.com/RaRe-Technologies/gensim/pull/3307): Documentation fixes, by [@piskvorky](https://github.com/piskvorky) -* [#3339](https://github.com/RaRe-Technologies/gensim/pull/3339): fix parsing error in FastText Docs, by [@MattYoon](https://github.com/MattYoon) +* [#3339](https://github.com/RaRe-Technologies/gensim/pull/3339): Fix parsing error in FastText docs, by [@MattYoon](https://github.com/MattYoon) +* [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by [@menshikh-iv](https://github.com/menshikh-iv) ### :red_circle: Bug fixes @@ -55,13 +58,6 @@ Changes * [#3308](https://github.com/RaRe-Technologies/gensim/pull/3308): get rid of tox, build things via github actions directly, by [@mpenkov](https://github.com/mpenkov) * [#3318](https://github.com/RaRe-Technologies/gensim/pull/3318): Clean up evaluate_word_pairs code, by [@piskvorky](https://github.com/piskvorky) * [#3329](https://github.com/RaRe-Technologies/gensim/pull/3329): Check gallery up to date as part of CI, by [@mpenkov](https://github.com/mpenkov) - -### Minor improvements - -**I'm not sure if these belong here in the change log, as such changes aren't really visible to the regular users** - -* [#2656](https://github.com/RaRe-Technologies/gensim/pull/2656): streamlining most_similar_cosmul and evaluate_word_analogies, by [@n3hrox](https://github.com/n3hrox) -* [#3251](https://github.com/RaRe-Technologies/gensim/pull/3251): Apply new convention of delimiting instance params in str function, by [@menshikh-iv](https://github.com/menshikh-iv) * [#3254](https://github.com/RaRe-Technologies/gensim/pull/3254): Skip blinking test `test_translate_gc` on OSX + py3.9, by [@menshikh-iv](https://github.com/menshikh-iv) * [#3258](https://github.com/RaRe-Technologies/gensim/pull/3258): Adding another check to _check_corpus_sanity for compressed files, adding test, by 
[@dchaplinsky](https://github.com/dchaplinsky) * [#3278](https://github.com/RaRe-Technologies/gensim/pull/3278): Tighten test_parallel bound, by [@austereantelope](https://github.com/austereantelope)