From bc00b34d3c469de3185a50b4296dca9b199bd955 Mon Sep 17 00:00:00 2001
From: Cheng Qian <cheng.qian@ibm.com>
Date: Fri, 19 Jan 2024 11:01:02 -0500
Subject: [PATCH] feat: add zh-cn, zh-tw support

---
 README.md                                     |   2 +-
 .../resources/zh-cn/stopwords                 | 155 ++++++++++++++++++
 .../resources/zh-tw/stopwords                 |  51 ++++++
 assistant_skill_analysis/utils/lang_utils.py  |  18 +-
 classic_dialog_skill_analysis.ipynb           |   2 +-
 classic_dialog_skill_analysis_cp4d.ipynb      |   2 +-
 new_experience_skill_analysis.ipynb           |   2 +-
 new_experience_skill_analysis_cp4d.ipynb      |   2 +-
 tests/utils/test_lang_utils.py                |  14 ++
 9 files changed, 242 insertions(+), 6 deletions(-)
 create mode 100644 assistant_skill_analysis/resources/zh-cn/stopwords
 create mode 100644 assistant_skill_analysis/resources/zh-tw/stopwords
diff --git a/README.md b/README.md
index 1f54e52..19fcd35 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ Whether you are new to the process and are building your first AI assistant or y
 - Why is the assistant responding incorrectly to this question?
 - How do I improve my assistant’s ability to understand questions?
 
-Currently Supported Languages: en, fr, cs, de, es, it, pt, nl
+Currently Supported Languages: en, fr, cs, de, es, it, pt, nl, zh-cn, zh-tw
 
 ## Usage
 If you clone the notebook from this repository locally, please use the steps below. For usage in Watson studio, please refer to the 
diff --git a/assistant_skill_analysis/resources/zh-cn/stopwords b/assistant_skill_analysis/resources/zh-cn/stopwords
new file mode 100644
index 0000000..1848efb
--- /dev/null
+++ b/assistant_skill_analysis/resources/zh-cn/stopwords
@@ -0,0 +1,155 @@
+<
+>
+|
+-
+,
+;
+:
+!
+?
+.
+''
+'
+"
+(
+)
+[
+]
+{
+}
+*
+%
++
+。
+<SE>
+一
+一会儿
+一边
+一面
+上
+下
+不
+不但
+不光
+不可
+不如
+不是
+不管
+不论
+与
+与其
+个
+中
+为
+之
+之所以
+也
+也不
+也许
+也许是
+了
+于
+从
+他
+他们
+以
+会
+但
+你们
+便
+倘若
+先
+全
+其
+再
+到
+前
+十
+即使
+却
+又
+及
+只
+只有
+只要
+可
+可以
+可是
+可能
+各
+后
+向
+和
+哪怕
+因为
+因此
+在
+地
+多
+她
+她们
+如果
+宁可
+它
+它们
+对
+将
+小
+就
+尽管
+已
+已经
+并
+并且
+很
+我
+我们
+或
+所
+所以
+才
+把
+据
+无论
+既
+既然
+时
+是
+是因为
+更
+最
+有
+未
+来
+此
+每
+没有
+然后
+然而
+用
+由
+由于
+的
+看
+着
+种
+而
+而且
+而是
+能
+自己
+至
+虽然
+被
+要
+认为
+让
+该
+还
+还是
+这
+通过
+那么
+都
+非
+、
diff --git a/assistant_skill_analysis/resources/zh-tw/stopwords b/assistant_skill_analysis/resources/zh-tw/stopwords
new file mode 100644
index 0000000..1cf8259
--- /dev/null
+++ b/assistant_skill_analysis/resources/zh-tw/stopwords
@@ -0,0 +1,51 @@
+the
+of
+is
+and
+to
+in
+that
+we
+for
+an
+are
+by
+be
+as
+on
+with
+can
+if
+from
+which
+you
+it
+this
+then
+at
+have
+all
+not
+one
+has
+or
+that
+的
+了
+和
+是
+就
+都
+而
+及
+與
+著
+或
+一個
+沒有
+我們
+你們
+妳們
+他們
+她們
+是否
\ No newline at end of file
diff --git a/assistant_skill_analysis/utils/lang_utils.py b/assistant_skill_analysis/utils/lang_utils.py
index 6483136..2b689d5 100644
--- a/assistant_skill_analysis/utils/lang_utils.py
+++ b/assistant_skill_analysis/utils/lang_utils.py
@@ -1,13 +1,15 @@
 import os
 import re
+from types import SimpleNamespace
 import sys
+import jieba
 from nltk.stem.snowball import SnowballStemmer
 from spacy.tokenizer import Tokenizer
 import unicodedata
 import assistant_skill_analysis
 
 
-SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl"]
+SUPPORTED_LANGUAGE = ["en", "fr", "de", "cs", "es", "it", "pt", "nl", "zh-cn", "zh-tw"]
 PUNCTUATION = [
     "\\" + chr(i)
     for i in range(sys.maxunicode)
@@ -15,6 +17,15 @@
 ]
 
 
+class _JiebaTokenizerWrapper:
+    """Callable jieba adapter for zh-cn and zh-tw; yields ``.text`` objects."""
+
+    def __call__(self, *args, **kwargs):
+        text = args[0]  # only the first positional argument is read; kwargs are ignored
+        for token in jieba.tokenize(text):  # per jieba docs: (word, start, end) tuples
+            yield SimpleNamespace(text=token[0])  # keep the word; NOTE(review): presumably mirrors spaCy's token.text - confirm
+
+
 class LanguageUtility:
     def __init__(self, language_code):
         if language_code not in SUPPORTED_LANGUAGE:
@@ -96,6 +107,11 @@ def init_resources(self):
             self.tokenizer = Tokenizer(Dutch().vocab)
             self.stemmer = SnowballStemmer(language="dutch")
             self.stop_words = self.load_stop_words(stopwords_path)
+
+        elif self.language_code in ["zh-cn", "zh-tw"]:
+            self.tokenizer = _JiebaTokenizerWrapper()
+            self.stop_words = self.load_stop_words(stopwords_path)
+
         else:
             raise Exception("language code %s is not supported", self.language_code)
 
diff --git a/classic_dialog_skill_analysis.ipynb b/classic_dialog_skill_analysis.ipynb
index 674f4db..bda8ed1 100644
--- a/classic_dialog_skill_analysis.ipynb
+++ b/classic_dialog_skill_analysis.ipynb
@@ -73,7 +73,7 @@
    "metadata": {},
    "source": [
     "Pick the language code correspond to your workspace data:   \n",
-    "*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl**"
+    "*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**"
    ]
   },
   {
diff --git a/classic_dialog_skill_analysis_cp4d.ipynb b/classic_dialog_skill_analysis_cp4d.ipynb
index 7242d04..412cbc3 100644
--- a/classic_dialog_skill_analysis_cp4d.ipynb
+++ b/classic_dialog_skill_analysis_cp4d.ipynb
@@ -73,7 +73,7 @@
    "metadata": {},
    "source": [
     "Pick the language code correspond to your workspace data:   \n",
-    "*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl**"
+    "*Supported Language codes:* **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**"
    ]
   },
   {
diff --git a/new_experience_skill_analysis.ipynb b/new_experience_skill_analysis.ipynb
index 3874755..904dbae 100644
--- a/new_experience_skill_analysis.ipynb
+++ b/new_experience_skill_analysis.ipynb
@@ -80,7 +80,7 @@
     "### Assistant Settings\n",
     "Please set values for the variables in the cell below to configure this notebook.\n",
     "\n",
-    "- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl**\n",
+    "- **LANGUAGE_CODE:** language code corresponding to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**\n",
     "\n",
     "- **ASSISTANT_ID:** id of the Watson Assistant service instance\n",
     "\n",
diff --git a/new_experience_skill_analysis_cp4d.ipynb b/new_experience_skill_analysis_cp4d.ipynb
index 757f4a7..3a58e48 100644
--- a/new_experience_skill_analysis_cp4d.ipynb
+++ b/new_experience_skill_analysis_cp4d.ipynb
@@ -80,7 +80,7 @@
     "### Assistant Settings\n",
     "Please set values for the variables in the cell below to configure this notebook. The notebook uses CloudPakForDataAuthenticator to authenticate the APIs.\n",
     "\n",
-    "- **LANGUAGE_CODE:** language code correspond to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl**\n",
+    "- **LANGUAGE_CODE:** language code corresponding to your workspace data, supported languages: **en, fr, de, es, cs, it, pt, nl, zh-cn, zh-tw**\n",
     "\n",
     "- **ASSISTANT_ID:** id of the Watson Assistant service instance\n",
     "\n",
diff --git a/tests/utils/test_lang_utils.py b/tests/utils/test_lang_utils.py
index a32edbf..d07aa9e 100644
--- a/tests/utils/test_lang_utils.py
+++ b/tests/utils/test_lang_utils.py
@@ -61,6 +61,20 @@ def test_de(self):
         sent = util.tokenize(sent)
         self.assertEqual(sent, ["autobahn"])
 
+    def test_zh_cn(self):
+        util = LanguageUtility("zh-cn")
+        sent = util.preprocess("不想当兼职")
+        self.assertEqual(sent, "不想当兼职")  # preprocess is expected to return this input unchanged
+        sent = util.tokenize(sent)
+        self.assertEqual(sent, ['不想', '当', '兼职'])  # expected segmentation: three tokens
+
+    def test_zh_tw(self):
+        util = LanguageUtility("zh-tw")
+        sent = util.preprocess("畀到機會我嘗試")
+        self.assertEqual(sent, "畀到機會我嘗試")  # preprocess is expected to return this input unchanged
+        sent = util.tokenize(sent)
+        self.assertEqual(sent, ['畀', '到', '機會', '我', '嘗試'])  # expected segmentation: five tokens
+
     def tearDown(self):
         unittest.TestCase.tearDown(self)
         self.skill_file.close()