Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tokenize spaces when explicitly requested and reduce iterations in occurrence counting loop #6

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 20 additions & 14 deletions src/classifier.js
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ class Classifier {
tokens = this.vectorize(tokens)
}

// Set up an empty entry for the label if it does not exist
// Set up an empty entry for the label if it does not exist
if (typeof this._model.data[label] === 'undefined') {
this._model.data[label] = {}
}
Expand Down Expand Up @@ -148,7 +148,7 @@ class Classifier {

/**
* Split a string into an array of lowercase words, with all non-letter characters removed
*
*
* @param {string} input
* @return {Array}
*/
Expand Down Expand Up @@ -179,7 +179,7 @@ class Classifier {
if (!(words instanceof Array)) {
throw new Error('input must be either a string or Array')
}

if (this._model.nGramMax < this._model.nGramMin) {
throw new Error('Invalid nGramMin/nGramMax combination in model config')
}
Expand All @@ -190,22 +190,28 @@ class Classifier {
// based on the models configured min/max values
words.forEach((word, index) => {
let sequence = ''

words.slice(index).forEach(nextWord => {
let tokenCount = 0
let nextWord

// Create n-gram(s) of between nGramMin and nGramMax words from segment starting at (index)
// Increment the occurrence counter (tokens[sequence]) for each n-gram created
// Stop looping once we have nGramMax words (or reach the end of the segment)
let segment = words.slice(index)
while (tokenCount < this._model.nGramMax && tokenCount < segment.length) {
nextWord = segment[tokenCount]
sequence += sequence ? (' ' + nextWord) : nextWord
let tokenCount = sequence.split(' ').length
tokenCount++
if(tokenCount >= this._model.nGramMin && tokenCount <= this._model.nGramMax) {
if (typeof tokens[sequence] === 'undefined') {
tokens[sequence] = 0
}

if (tokenCount < this._model.nGramMin || tokenCount > this._model.nGramMax) {
return
++tokens[sequence]
}
}
})

if (typeof tokens[sequence] === 'undefined') {
tokens[sequence] = 0
}

++tokens[sequence]
})
})

return tokens
}
Expand Down
16 changes: 13 additions & 3 deletions test/classifier.js
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ describe('Classifier', () => {
const classifier = new Classifier()

classifier.model.nGramMin = 2

expect(() => classifier.tokenize('Hello world!')).to.throw(Error)
})

Expand Down Expand Up @@ -129,6 +129,16 @@ describe('Classifier', () => {
})
})

it('should create a unigrams for the space character from an array of characters including a space', () => {
const classifier = new Classifier()

expect(classifier.tokenize([' ','a','b'])).to.eql({
' ': 1,
'a': 1,
'b': 1
})
})

it('should increment the occurrence of the duplicate tokens', () => {
const classifier = new Classifier()

Expand Down Expand Up @@ -195,7 +205,7 @@ describe('Classifier', () => {

expect(() => classifier.train('test', [])).to.throw(Error)
})

it('should add tokens to the vocabulary (if not configured to false)', () => {
const classifier = new Classifier()

Expand Down Expand Up @@ -254,7 +264,7 @@ describe('Classifier', () => {
expect(classifier.train('hello world', 'test')).to.equal(classifier)
})
})

describe('cosineSimilarity', () => {
it('should throw an error if v1 is not an object literal', () => {
const classifier = new Classifier()
Expand Down