Skip to content

Commit

Permalink
Merge pull request #248 from easyops-cn/steve/user-dict
Browse files Browse the repository at this point in the history
feat: support `zhUserDict` and `zhUserDictPath`
  • Loading branch information
weareoutman authored Aug 8, 2022
2 parents 8ef0345 + 33f1ea4 commit 389410c
Show file tree
Hide file tree
Showing 5 changed files with 52 additions and 31 deletions.
44 changes: 23 additions & 21 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,27 +68,29 @@ module.exports = {
## Theme Options

| Name | Type | Default | Description |
| -------------------------------- | ---------------------------------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| indexDocs | boolean | `true` | Whether to index docs. |
| indexBlog | boolean | `true` | Whether to index blog. |
| indexPages | boolean | `false` | Whether to index pages. |
| docsRouteBasePath | string \| string[] | `"/docs"` | Base route path(s) of docs. Slash at beginning is not required. Note: for [docs-only mode](https://docusaurus.io/docs/docs-introduction#docs-only-mode), this needs to be the same as `routeBasePath` in your `@docusaurus/preset-classic` config e.g., `"/"`. |
| blogRouteBasePath | string \| string[] | `"/blog"` | Base route path(s) of blog. Slash at beginning is not required. |
| language | string \| string[] | `"en"` | All [lunr-languages](https://github.com/MihaiValentin/lunr-languages) supported languages, + `zh` 🔥. |
| hashed | boolean \| `"filename"` \| `"query"` | `false` | Whether to add a hashed query when fetching index (based on the content hash of all indexed `*.md` in `docsDir` and `blogDir` if applicable). Setting to `"filename"` will save hash in filename instead of query. |
| docsDir | string \| string[] | `"docs"` | The dir(s) of docs to get the content hash, it's relative to the dir of your project. |
| blogDir | string \| string[] | `"blog"` | Just like the `docsDir` but applied to blog. |
| removeDefaultStopWordFilter | boolean | `false` | Sometimes people (E.g., us) want to keep the English stop words as indexed, since they maybe are relevant in programming docs. |
| removeDefaultStemmer | boolean | `false` | Enable this if you want to be able to search for any partial word at the cost of search performance. |
| highlightSearchTermsOnTargetPage | boolean | `false` | Highlight search terms on target page. |
| searchResultLimits | number | `8` | Limit the search results. |
| searchResultContextMaxLength | number | `50` | Set the max length of characters of each search result to show. |
| explicitSearchResultPath | boolean | `false` | Whether an explicit path to a heading should be presented on a suggestion template. |
| ignoreFiles | string \| RegExp \| (string \| RegExp)[] | `[]` | Set the match rules to ignore some routes. Put a string if you want an exact match, or put a regex if you want a partial match. Note: without the website base url. |
| searchBarShortcut | boolean | `true` | Whether to enable keyboard shortcut to focus in search bar. |
| searchBarShortcutHint | boolean | `true` | Whether to show keyboard shortcut hint in search bar. Disable it if you need to hide the hint while shortcut is still enabled. |
| docsPluginIdForPreferredVersion | string | `undefined` | When you're using multi-instance of docs, set the docs plugin id which you'd like to check the preferred version with, for the search index. |
| Name | Type | Default | Description |
| -------------------------------- | ---------------------------------------- | --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| indexDocs | boolean | `true` | Whether to index docs. |
| indexBlog | boolean | `true` | Whether to index blog. |
| indexPages | boolean | `false` | Whether to index pages. |
| docsRouteBasePath | string \| string[] | `"/docs"` | Base route path(s) of docs. Slash at beginning is not required. Note: for [docs-only mode](https://docusaurus.io/docs/docs-introduction#docs-only-mode), this needs to be the same as `routeBasePath` in your `@docusaurus/preset-classic` config e.g., `"/"`. |
| blogRouteBasePath | string \| string[] | `"/blog"` | Base route path(s) of blog. Slash at beginning is not required. |
| language | string \| string[] | `"en"` | All [lunr-languages](https://github.com/MihaiValentin/lunr-languages) supported languages, + `zh` 🔥. |
| hashed | boolean \| `"filename"` \| `"query"` | `false` | Whether to add a hashed query when fetching index (based on the content hash of all indexed `*.md` in `docsDir` and `blogDir` if applicable). Setting to `"filename"` will save hash in filename instead of query. |
| docsDir | string \| string[] | `"docs"` | The dir(s) of docs to get the content hash, it's relative to the dir of your project. |
| blogDir | string \| string[] | `"blog"` | Just like the `docsDir` but applied to blog. |
| removeDefaultStopWordFilter | boolean | `false` | Sometimes people (e.g., us) want to keep the English stop words indexed, since they may be relevant in programming docs. |
| removeDefaultStemmer | boolean | `false` | Enable this if you want to be able to search for any partial word at the cost of search performance. |
| highlightSearchTermsOnTargetPage | boolean | `false` | Highlight search terms on target page. |
| searchResultLimits | number | `8` | Limit the search results. |
| searchResultContextMaxLength | number | `50` | Set the max length of characters of each search result to show. |
| explicitSearchResultPath | boolean | `false` | Whether an explicit path to a heading should be presented on a suggestion template. |
| ignoreFiles | string \| RegExp \| (string \| RegExp)[] | `[]` | Set the match rules to ignore some routes. Put a string if you want an exact match, or put a regex if you want a partial match. Note: without the website base url. |
| searchBarShortcut | boolean | `true` | Whether to enable keyboard shortcut to focus in search bar. |
| searchBarShortcutHint | boolean | `true` | Whether to show keyboard shortcut hint in search bar. Disable it if you need to hide the hint while shortcut is still enabled. |
| docsPluginIdForPreferredVersion | string | | When you're using multi-instance of docs, set the docs plugin id which you'd like to check the preferred version with, for the search index. |
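| zhUserDict | string | | Provide your custom dictionary for the `zh` language as a string; see the [jieba documentation on loading dictionaries](https://github.com/fxsjy/jieba#%E8%BD%BD%E5%85%A5%E8%AF%8D%E5%85%B8) for the format. Takes precedence over `zhUserDictPath` when both are set. |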
| zhUserDictPath | string | | Provide the file path to your custom dictionary for the `zh` language, e.g., `path.resolve("./src/zh-dict.txt")`. Ignored when `zhUserDict` is set. |

### I18N

Expand Down
16 changes: 6 additions & 10 deletions docusaurus-search-local/src/server/utils/buildIndex.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/* eslint @typescript-eslint/no-var-requires: 0 */
import lunr from "lunr";
import {
ProcessedPluginOptions,
Expand All @@ -11,32 +12,27 @@ export function buildIndex(
language,
removeDefaultStopWordFilter,
removeDefaultStemmer,
zhUserDict,
zhUserDictPath,
}: ProcessedPluginOptions
): Omit<WrappedIndex, "type">[] {
if (language.length > 1 || language.some((item) => item !== "en")) {
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.stemmer.support")(lunr);
}
if (language.includes("ja") || language.includes("jp")) {
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/tinyseg")(lunr);
}
for (const lang of language.filter(
(item) => item !== "en" && item !== "zh"
)) {
// eslint-disable-next-line @typescript-eslint/no-var-requires
require(`lunr-languages/lunr.${lang}`)(lunr);
}
if (language.includes("zh")) {
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("../../shared/lunrLanguageZh").lunrLanguageZh(
lunr,
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("./tokenizer").tokenizer
);
const { tokenizer, loadUserDict } = require("./tokenizer");
loadUserDict(zhUserDict, zhUserDictPath);
require("../../shared/lunrLanguageZh").lunrLanguageZh(lunr, tokenizer);
}
if (language.length > 1) {
// eslint-disable-next-line @typescript-eslint/no-var-requires
require("lunr-languages/lunr.multi")(lunr);
}

Expand Down
18 changes: 18 additions & 0 deletions docusaurus-search-local/src/server/utils/tokenizer.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import fs from "fs";
import lunr from "lunr";
import jieba from "@node-rs/jieba";
import { MatchMetadata } from "../../shared/interfaces";
Expand All @@ -6,6 +7,23 @@ import { cutWordByUnderscore } from "./cutWordByUnderscore";
// https://zhuanlan.zhihu.com/p/33335629
const RegExpConsecutiveWord = /\w+|\p{Unified_Ideograph}+/u;

// Flipped to true once `loadUserDict` has run to completion, so that every
// later call becomes a no-op.
let userDictLoaded = false;

/**
 * Feeds a user-supplied dictionary to jieba through `jieba.loadDict`, at most
 * once for the lifetime of this module.
 *
 * Only the first call that completes without throwing has any effect. That
 * call marks the dictionary as handled even when neither argument is given,
 * so all subsequent calls return immediately regardless of their arguments.
 *
 * @param zhUserDict - Dictionary content as a string. When non-empty it is
 * used and `zhUserDictPath` is ignored. Optional, matching the optional
 * `zhUserDict` plugin option.
 * @param zhUserDictPath - Path to a dictionary file, read synchronously. Only
 * consulted when `zhUserDict` is empty or omitted.
 * @throws Propagates any error raised by `fs.readFileSync` or
 * `jieba.loadDict`; in that case the loaded flag is left unset, so a later
 * call will try again.
 */
export function loadUserDict(
  zhUserDict?: string,
  zhUserDictPath?: string
): void {
  if (userDictLoaded) {
    return;
  }
  if (zhUserDict) {
    // Inline dictionary content wins over the file path.
    jieba.loadDict(Buffer.from(zhUserDict));
  } else if (zhUserDictPath) {
    jieba.loadDict(fs.readFileSync(zhUserDictPath));
  }
  // Set last, so a throw above does not mark the dictionary as loaded.
  userDictLoaded = true;
}

export function tokenizer(
input: string | string[] | null | undefined,
metadata: MatchMetadata
Expand Down
2 changes: 2 additions & 0 deletions docusaurus-search-local/src/server/utils/validateOptions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ const schema = Joi.object<PluginOptions>({
searchBarShortcut: Joi.boolean().default(true),
searchBarShortcutHint: Joi.boolean().default(true),
docsPluginIdForPreferredVersion: Joi.string(),
zhUserDict: Joi.string(),
zhUserDictPath: Joi.string(),
});

export function validateOptions({
Expand Down
3 changes: 3 additions & 0 deletions docusaurus-search-local/src/shared/interfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,9 @@ export interface PluginOptions {
searchBarShortcut?: boolean;
searchBarShortcutHint?: boolean;

zhUserDict?: string;
zhUserDictPath?: string;

// searchInputPlaceholder?: string;
// searchNoResults?: string;
// searchSeeAllResults?: string;
Expand Down

0 comments on commit 389410c

Please sign in to comment.