Skip to content

Commit

Permalink
feat: add zh-cn, zh-tw support
Browse files Browse the repository at this point in the history
  • Loading branch information
Cheng Qian committed Jan 19, 2024
1 parent e35c6e9 commit be11a4a
Show file tree
Hide file tree
Showing 9 changed files with 241 additions and 6 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Whether you are new to the process and are building your first AI assistant or y
- Why is the assistant responding incorrectly to this question?
- How do I improve my assistant’s ability to understand questions?

Currently Supported Languages: en, fr, cs, de, es, it, pt, nl
Currently Supported Languages: en, fr, cs, de, es, it, pt, nl, zh-cn, zh-tw

## Usage
If you clone the notebook from this repository locally, please use the steps below. For usage in Watson studio, please refer to the
Expand Down
155 changes: 155 additions & 0 deletions assistant_skill_analysis/resources/zh-cn/stopwords
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
<
>
|
-
,
;
:
!
?
.
''
'
"
(
)
[
]
{
}
*
%
+
<SE>
一会儿
一边
一面
不但
不光
不可
不如
不是
不管
不论
与其
之所以
也不
也许
也许是
他们
你们
便
倘若
即使
只有
只要
可以
可是
可能
哪怕
因为
因此
她们
如果
宁可
它们
尽管
已经
并且
我们
所以
无论
既然
是因为
没有
然后
然而
由于
而且
而是
自己
虽然
认为
还是
通过
那么
51 changes: 51 additions & 0 deletions assistant_skill_analysis/resources/zh-tw/stopwords
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
the
of
is
and
to
in
that
we
for
an
are
by
be
as
on
with
can
if
from
which
you
it
this
then
at
have
all
not
one
has
or
that
一個
沒有
我們
你們
妳們
他們
她們
是否
17 changes: 16 additions & 1 deletion assistant_skill_analysis/utils/lang_utils.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,30 @@
import os
import re
from types import SimpleNamespace
import sys
import jieba
from nltk.stem.snowball import SnowballStemmer
from spacy.tokenizer import Tokenizer
import unicodedata
import assistant_skill_analysis


SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl"]
SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl", "zh-cn", "zh-tw"]
PUNCTUATION = [
"\\" + chr(i)
for i in range(sys.maxunicode)
if unicodedata.category(chr(i)).startswith("P")
]


class _JiebaTokenizerWrapper:
    """Adapter giving jieba the same call interface as the spaCy
    tokenizers used for the other languages in this module: a callable
    that takes a string and yields objects exposing a ``.text``
    attribute. Used for zh-cn and zh-tw.
    """

    def __call__(self, text):
        """Segment *text* with jieba.

        :param text: raw input string to tokenize
        :yields: ``SimpleNamespace`` objects whose ``text`` attribute
            holds one segmented token each
        """
        # Explicit ``text`` parameter instead of ``*args``: the original
        # signature raised an opaque IndexError on a keyword call and
        # silently ignored any extra arguments.
        # jieba.tokenize yields (word, start, end) tuples; only the word
        # itself is needed here.
        for token in jieba.tokenize(text):
            yield SimpleNamespace(text=token[0])


class LanguageUtility:
def __init__(self, language_code):
if language_code not in SUPPORTED_LANGUAGE:
Expand Down Expand Up @@ -96,6 +106,11 @@ def init_resources(self):
self.tokenizer = Tokenizer(Dutch().vocab)
self.stemmer = SnowballStemmer(language="dutch")
self.stop_words = self.load_stop_words(stopwords_path)

elif self.language_code in ["zh-cn", "zh-tw"]:
self.tokenizer = _JiebaTokenizerWrapper()
self.stop_words = self.load_stop_words(stopwords_path)

else:
raise Exception("language code %s is not supported", self.language_code)

Expand Down
2 changes: 1 addition & 1 deletion classic_dialog_skill_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"metadata": {},
"source": [
"Pick the language code correspond to your workspace data: \n",
"*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl**"
"*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion classic_dialog_skill_analysis_cp4d.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@
"metadata": {},
"source": [
"Pick the language code correspond to your workspace data: \n",
"*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl**"
"*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**"
]
},
{
Expand Down
2 changes: 1 addition & 1 deletion new_experience_skill_analysis.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
"### Assistant Settings\n",
"Please set values for the variables in the cell below to configure this notebook.\n",
"\n",
"- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl**\n",
"- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**\n",
"\n",
"- **ASSISTANT_ID:** id of the Watson Assistant service instance\n",
"\n",
Expand Down
2 changes: 1 addition & 1 deletion new_experience_skill_analysis_cp4d.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@
"### Assistant Settings\n",
"Please set values for the variables in the cell below to configure this notebook. The notebook uses CloudPakForDataAuthenticator to authenticate the APIs.\n",
"\n",
"- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl**\n",
"- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**\n",
"\n",
"- **ASSISTANT_ID:** id of the Watson Assistant service instance\n",
"\n",
Expand Down
14 changes: 14 additions & 0 deletions tests/utils/test_lang_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,20 @@ def test_de(self):
sent = util.tokenize(sent)
self.assertEqual(sent, ["autobahn"])

def test_zh_cn(self):
    # Simplified-Chinese pipeline: preprocessing must leave the text
    # unchanged, and tokenization must produce the jieba segmentation.
    utility = LanguageUtility("zh-cn")
    processed = utility.preprocess("不想当兼职")
    self.assertEqual(processed, "不想当兼职")
    tokens = utility.tokenize(processed)
    self.assertEqual(tokens, ["不想", "当", "兼职"])

def test_zh_tw(self):
    # Traditional-Chinese pipeline: preprocessing must leave the text
    # unchanged, and tokenization must produce the jieba segmentation.
    utility = LanguageUtility("zh-tw")
    processed = utility.preprocess("畀到機會我嘗試")
    self.assertEqual(processed, "畀到機會我嘗試")
    tokens = utility.tokenize(processed)
    self.assertEqual(tokens, ["畀", "到", "機會", "我", "嘗試"])

def tearDown(self):
    # Release the workspace fixture opened in setUp, then run the
    # (no-op) base-class teardown.
    self.skill_file.close()
    unittest.TestCase.tearDown(self)
Expand Down

0 comments on commit be11a4a

Please sign in to comment.