Add: add more and update test code

go-ego · Jan 1, 2023 · 25aa5e1 · 25aa5e1
1 parent 12ba544
commit 25aa5e1
Show file tree

Hide file tree

Showing 7 changed files with 73 additions and 33 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # gse
 
-Go efficient multilingual NLP and text segmentation; support English, Chinese, Japanese and others. 
+Go efficient multilingual NLP and text segmentation; support English, Chinese, Japanese and others.
 And supports with [elasticsearch](https://github.com/vcaesar/go-gse-elastic) and [bleve](https://github.com/vcaesar/gse-bleve).
 
 <!--<img align="right" src="https://raw.githubusercontent.com/go-ego/ego/master/logo.jpg">-->
@@ -23,23 +23,26 @@ And supports with [elasticsearch](https://github.com/vcaesar/go-gse-elastic) and
 Gse implements jieba in Golang, and tries to add NLP support and more features
 
 ## Feature:
-- Support common, search engine, full mode, precise mode and HMM mode multiple word segmentation modes; 
+
+- Support common, search engine, full mode, precise mode and HMM mode multiple word segmentation modes;
 - Support user and embed dictionary, Part-of-speech/POS tagging, analyze segment info, stop and trim words
 - Support multilingual: English, Chinese, Japanese and others
 - Support Traditional Chinese
 - Support HMM cut text use Viterbi algorithm
 - Support NLP by TensorFlow (in work)
-- Named Entity Recognition (in work) 
+- Named Entity Recognition (in work)
 - Works with [elasticsearch](https://github.com/vcaesar/go-gse-elastic) and bleve
 - run<a href="https://github.com/go-ego/gse/blob/master/tools/server/server.go"> JSON RPC service</a>.
 
 ## Algorithm:
+
 - [Dictionary](https://github.com/go-ego/gse/blob/master/dictionary.go) is implemented with a double-array trie (Double-Array Trie)
 - [Segmenter](https://github.com/go-ego/gse/blob/master/dag.go) algorithm is the shortest path (based on word frequency and dynamic programming), and DAG and HMM algorithm word segmentation.
 
 ## Text Segmentation speed:
+
 - <a href="https://github.com/go-ego/gse/blob/master/tools/benchmark/benchmark.go"> single thread</a> 9.2MB/s
-- <a href="https://github.com/go-ego/gse/blob/master/tools/benchmark/goroutines/goroutines.go">goroutines concurrent</a> 26.8MB/s. 
+- <a href="https://github.com/go-ego/gse/blob/master/tools/benchmark/goroutines/goroutines.go">goroutines concurrent</a> 26.8MB/s.
 - HMM text segmentation single thread 3.2MB/s. (2core 4threads Macbook Pro).
 
 ## Binding:
@@ -49,11 +52,13 @@ Gse is implements jieba by golang, and try add NLP support and more feature
 ## Install / update
 
 With Go module support (Go 1.11+), just import:
+
 ```go
 import "github.com/go-ego/gse"
 ```
 
 Otherwise, to install the gse package, run the command:
+
 ```
 go get -u github.com/go-ego/gse
 ```
@@ -74,7 +79,7 @@ import (
 var (
 	text = "Hello world, Helloworld. Winter is coming! こんにちは世界, 你好世界."
 
-	new, _ = gse.New("zh,testdata/test_dict3.txt", "alpha")
+	new, _ = gse.New("zh,testdata/test_en_dict3.txt", "alpha")
 
 	seg gse.Segmenter
 	posSeg pos.Segmenter
@@ -85,17 +90,17 @@ func main() {
 	seg.LoadDict()
 	// Loading the default dictionary with embed
 	// seg.LoadDictEmbed()
-	// 
+	//
 	// Loading the Simplified Chinese dictionary
 	// seg.LoadDict("zh_s")
 	// seg.LoadDictEmbed("zh_s")
 	//
 	// Loading the Traditional Chinese dictionary
 	// seg.LoadDict("zh_t")
-	// 
+	//
 	// Loading the Japanese dictionary
 	// seg.LoadDict("jp")
-	// 
+	//
 	// Load the dictionary
 	// seg.LoadDict("your gopath"+"/src/github.com/go-ego/gse/data/dict/dictionary.txt")
 
@@ -170,12 +175,12 @@ import (
 	"github.com/go-ego/gse"
 )
 
-//go:embed test_dict3.txt
+//go:embed test_en_dict3.txt
 var testDict string
 
 func main() {
 	// var seg gse.Segmenter
-	// seg.LoadDict("zh, testdata/test_dict.txt, testdata/test_dict1.txt")
+	// seg.LoadDict("zh, testdata/zh/test_dict.txt, testdata/zh/test_dict1.txt")
 	// seg.LoadStop()
 	seg, err := gse.NewEmbed("zh, word 20 n"+testDict, "en")
 	// seg.LoadDictEmbed()
@@ -198,6 +203,7 @@ func main() {
 [Look at a Japanese example](/examples/jp/main.go)
 
 ## Elasticsearch
+
 How to use it with elasticsearch?
 
 [go-gse-elastic](https://github.com/vcaesar/go-gse-elastic)
@@ -209,7 +215,7 @@ How to use it with elasticsearch?
 
 ## License
 
-Gse is primarily distributed under the terms of "both the MIT license and the Apache License (Version 2.0)". 
+Gse is primarily distributed under the terms of "both the MIT license and the Apache License (Version 2.0)".
 See [LICENSE-APACHE](http://www.apache.org/licenses/LICENSE-2.0), [LICENSE-MIT](https://github.com/go-vgo/robotgo/blob/master/LICENSE).
 
 Thanks for [sego](https://github.com/huichen/sego) and [jieba](https://github.com/fxsjy/jieba)([jiebago](https://github.com/wangbin/jiebago)).
diff --git a/README_zh.md b/README_zh.md
@@ -19,6 +19,7 @@ Go 高性能多语言 NLP 和分词, 支持英文、中文、日文等, 支持
 Gse 是结巴分词(jieba)的 golang 实现, 并尝试添加 NLP 功能和更多属性
 
 ## 特征:
+
 - 支持普通、搜索引擎、全模式、精确模式和 HMM 模式多种分词模式
 - 支持自定义词典、embed 词典、词性标注、停用词、整理分析分词
 - 多语言支持: 英文, 中文, 日文等
@@ -28,27 +29,32 @@ Gse 是结巴分词(jieba)的 golang 实现, 并尝试添加 NLP 功能和更多
 - 支持接入 Elasticsearch 和 bleve
 - 可运行<a href="https://github.com/go-ego/gse/blob/master/tools/server/server.go"> JSON RPC 服务</a>
 
-## 算法: 
+## 算法:
+
 - [词典](https://github.com/go-ego/gse/blob/master/dictionary.go)用双数组 trie（Double-Array Trie）实现，
 - [分词器](https://github.com/go-ego/gse/blob/master/segmenter.go)算法为基于词频的最短路径加动态规划, 以及 DAG 和 HMM 算法分词.
 - 支持 HMM 分词, 使用 viterbi 算法.
 
 ## 分词速度:
+
 - <a href="https://github.com/go-ego/gse/blob/master/tools/benchmark/benchmark.go">单线程</a> 9.2MB/s
-- <a href="https://github.com/go-ego/gse/blob/master/tools/benchmark/goroutines/goroutines.go">goroutines 并发</a> 26.8MB/s. 
+- <a href="https://github.com/go-ego/gse/blob/master/tools/benchmark/goroutines/goroutines.go">goroutines 并发</a> 26.8MB/s.
 - HMM 模式单线程分词速度 3.2MB/s.（双核 4 线程 Macbook Pro）。
 
 ## Binding:
 
 [gse-bind](https://github.com/vcaesar/gse-bind): bindings for JavaScript and other languages, to support more languages.
 
 ## 安装/更新
+
 With Go module support (Go 1.11+), just import:
+
 ```go
 import "github.com/go-ego/gse"
 ```
 
 Otherwise, to install the gse package, run the command:
+
 ```
 go get -u github.com/go-ego/gse
 ```
@@ -70,7 +76,7 @@ var (
 	seg gse.Segmenter
 	posSeg pos.Segmenter
 
-	new, _ = gse.New("zh,testdata/test_dict3.txt", "alpha")
+	new, _ = gse.New("zh,testdata/test_en_dict3.txt", "alpha")
 
 	text = "你好世界, Hello world, Helloworld."
 )
@@ -80,17 +86,17 @@ func main() {
 	seg.LoadDict()
 	// 加载默认 embed 词典
 	// seg.LoadDictEmbed()
-	// 
+	//
 	// 加载简体中文词典
 	// seg.LoadDict("zh_s")
 	// seg.LoadDictEmbed("zh_s")
-	// 
+	//
 	// 加载繁体中文词典
 	// seg.LoadDict("zh_t")
-	// 
+	//
 	// 加载日文词典
 	// seg.LoadDict("jp")
-	// 
+	//
 	// 载入词典
 	// seg.LoadDict("your gopath"+"/src/github.com/go-ego/gse/data/dict/dictionary.txt")
 
@@ -174,7 +180,7 @@ import (
 	"github.com/go-ego/gse"
 )
 
-//go:embed test_dict3.txt
+//go:embed test_en_dict3.txt
 var testDict string
 
 func main() {
@@ -199,6 +205,7 @@ func main() {
 [日文分词示例](/examples/jp/main.go)
 
 ## Elasticsearch
+
 How to use it with elasticsearch?
 
 [go-gse-elastic](https://github.com/vcaesar/go-gse-elastic)

diff --git a/dict_1.16.go b/dict_1.16.go
@@ -68,7 +68,7 @@ func (seg *Segmenter) LoadDictEmbed(dict ...string) (err error) {
 			return seg.LoadDictStr(zhT)
 		}
 
-		if strings.Contains(d, ", ") {
+		if strings.Contains(d, ", ") && seg.DictSep != "," {
 			begin := 0
 			s := strings.Split(d, ", ")
 			begin, err = seg.loadZhST(d)

diff --git a/dict_1.16_test.go b/dict_1.16_test.go
@@ -13,6 +13,9 @@ import (
 //go:embed testdata/test_en_dict3.txt
 var testDict string
 
+//go:embed testdata/test_en.txt
+var testEn string
+
 //go:embed testdata/zh/test_zh_dict2.txt
 var testDict2 string
 
@@ -77,3 +80,15 @@ func TestLoadStopEmbed(t *testing.T) {
 	tt.Bool(t, seg1.IsStop("比如"))
 	tt.Bool(t, seg1.IsStop("离开"))
 }
+
+func TestDictSep(t *testing.T) {
+	var seg1 Segmenter
+	seg1.DictSep = ","
+	err := seg1.LoadDictEmbed(testEn)
+	tt.Nil(t, err)
+
+	f, pos, ok := seg1.Find("to be")
+	tt.Bool(t, ok)
+	tt.Equal(t, "x", pos)
+	tt.Equal(t, 10, f)
+}
diff --git a/gse_test.go b/gse_test.go
@@ -15,25 +15,25 @@ func init() {
 func TestLoadDictMap(t *testing.T) {
 	m := []map[string]string{
 		{
-			"text": "一城山水",
+			"text": "to be",
 			"freq": "10",
 			"pos":  "n",
 		},
 		{
-			"text": "山河日月",
+			"text": "or not",
 			"freq": "13",
 		},
 	}
 
 	err := prodSeg.LoadDictMap(m)
 	tt.Nil(t, err)
 
-	f, pos, ok := prodSeg.Find("一城山水")
+	f, pos, ok := prodSeg.Find("to be")
 	tt.Bool(t, ok)
 	tt.Equal(t, "n", pos)
 	tt.Equal(t, 10, f)
 
-	f, _, ok = prodSeg.Find("山河日月")
+	f, _, ok = prodSeg.Find("or not")
 	tt.Bool(t, ok)
 	tt.Equal(t, 13, f)
 }
@@ -207,7 +207,7 @@ func TestLoadST(t *testing.T) {
 	tt.Equal(t, 352275, len(seg.Dict.Tokens))
 	tt.Equal(t, 3.3335153e+07, seg.Dict.totalFreq)
 
-	err = seg.LoadDict("zh_t, ./testdata/test_dict3.txt")
+	err = seg.LoadDict("zh_t, ./testdata/test_en_dict3.txt")
 	tt.Nil(t, err)
 	tt.Equal(t, 587210, len(seg.Dict.Tokens))
 	tt.Equal(t, 5.3226814e+07, seg.Dict.totalFreq)
@@ -221,7 +221,7 @@ func TestStop(t *testing.T) {
 
 	err = seg.LoadStop("testdata/stop.txt")
 	tt.Nil(t, err)
-	tt.Equal(t, 89, len(seg.StopWordMap))
+	tt.Equal(t, 90, len(seg.StopWordMap))
 	tt.Bool(t, seg.IsStop("离开"))
 
 	err = seg.EmptyStop()
@@ -282,7 +282,7 @@ func TestStop(t *testing.T) {
 }
 
 func TestNum(t *testing.T) {
-	seg, err := New("./testdata/test_dict3.txt")
+	seg, err := New("./testdata/test_en_dict3.txt")
 	tt.Nil(t, err)
 
 	seg.Num = true
@@ -303,9 +303,21 @@ func TestNum(t *testing.T) {
 }
 
 func TestUrl(t *testing.T) {
-	seg, err := New("./testdata/test_dict3.txt")
+	seg, err := New("./testdata/test_en_dict3.txt")
 	tt.Nil(t, err)
 
 	s1 := seg.CutUrls("https://www.g.com/search?q=test%m11.42&ie=UTF-8")
 	tt.Equal(t, "https www g com search q test m 11 42 ie utf 8", s1)
 }
+
+func TestLoadDictSep(t *testing.T) {
+	var seg1 Segmenter
+	seg1.DictSep = ","
+	err := seg1.LoadDict("./testdata/test_en.txt")
+	tt.Nil(t, err)
+
+	f, pos, ok := seg1.Find("not to be")
+	tt.Bool(t, ok)
+	tt.Equal(t, "x", pos)
+	tt.Equal(t, 5, f)
+}
diff --git a/segmenter_test.go b/segmenter_test.go
@@ -60,8 +60,8 @@ func TestSplit(t *testing.T) {
 
 func TestSegment(t *testing.T) {
 	var seg Segmenter
-	seg.LoadDict("testdata/test_dict1.txt,testdata/test_dict2.txt")
-	// seg.LoadDict("testdata/test_dict1.txt", "testdata/test_dict2.txt")
+	seg.LoadDict("testdata/zh/test_dict1.txt,testdata/zh/test_dict2.txt")
+	// seg.LoadDict("testdata/zh/test_dict1.txt", "testdata/zh/test_dict2.txt")
 	tt.Expect(t, "16", seg.Dict.NumTokens())
 	// tt.Expect(t, "5", seg.Dict.NumTokens())
 	segments := seg.Segment([]byte("世界有七十亿人口"))
@@ -246,11 +246,11 @@ func TestDictPaths(t *testing.T) {
 func TestInAlphaNum(t *testing.T) {
 	// var seg Segmenter
 	// AlphaNum = true
-	// seg.LoadDict("zh,./testdata/test_dict3.txt")
+	// seg.LoadDict("zh,./testdata/test_en_dict3.txt")
 	//
 	// AlphaNum = true
 	// ToLower = true
-	seg, err := New("zh,./testdata/test_dict3.txt", "alpha")
+	seg, err := New("zh,./testdata/test_en_dict3.txt", "alpha")
 	tt.Nil(t, err)
 
 	freq, _, ok := seg.Find("hello")

diff --git a/testdata/test_en.txt b/testdata/test_en.txt
@@ -1,4 +1,4 @@
 to be, 10, x
 or, 10, c
-no to be, 5, x 
+not to be, 5, x 
 that's the question!, 10, x