jiebago: github.com/wangbin/jiebago Index | Examples | Files | Directories

package jiebago

import "github.com/wangbin/jiebago"

Package jiebago is the Golang implementation of [Jieba](https://github.com/fxsjy/jieba), the Python Chinese text segmentation module.

Code:

// Create a segmenter and load the main dictionary file.
var seg jiebago.Segmenter
seg.LoadDictionary("dict.txt")

// print drains a channel of segments, printing each followed by " /".
print := func(ch <-chan string) {
    for word := range ch {
        fmt.Printf(" %s /", word)
    }
    fmt.Println()
}

// Full mode: emit every possible word found in the sentence.
fmt.Print("【全模式】:")
print(seg.CutAll("我来到北京清华大学"))

// Accurate mode, HMM disabled.
fmt.Print("【精确模式】:")
print(seg.Cut("我来到北京清华大学", false))

// Accurate mode with HMM enabled, allowing recognition of new words.
fmt.Print("【新词识别】:")
print(seg.Cut("他来到了网易杭研大厦", true))

// Search-engine mode: long words are additionally cut into short ones.
fmt.Print("【搜索引擎模式】:")
print(seg.CutForSearch("小明硕士毕业于中国科学院计算所,后在日本京都大学深造", true))

Output:

【全模式】: 我 / 来到 / 北京 / 清华 / 清华大学 / 华大 / 大学 /
【精确模式】: 我 / 来到 / 北京 / 清华大学 /
【新词识别】: 他 / 来到 / 了 / 网易 / 杭研 / 大厦 /
【搜索引擎模式】: 小明 / 硕士 / 毕业 / 于 / 中国 / 科学 / 学院 / 科学院 / 中国科学院 / 计算 / 计算所 / , / 后 / 在 / 日本 / 京都 / 大学 / 日本京都大学 / 深造 /

Code:

// Create a segmenter and load the main dictionary file.
var seg jiebago.Segmenter
seg.LoadDictionary("dict.txt")

// print drains a channel of segments, printing each followed by " /".
print := func(ch <-chan string) {
    for word := range ch {
        fmt.Printf(" %s /", word)
    }
    fmt.Println()
}
sentence := "李小福是创新办主任也是云计算方面的专家"
// Segment before the user dictionary is loaded.
fmt.Print("Before:")
print(seg.Cut(sentence, true))

// Merge user-defined words on top of the main dictionary.
seg.LoadUserDictionary("userdict.txt")

// Segment again: user-dictionary words (创新办, 云计算) now stay whole.
fmt.Print("After:")
print(seg.Cut(sentence, true))

Output:

Before: 李小福 / 是 / 创新 / 办 / 主任 / 也 / 是 / 云 / 计算 / 方面 / 的 / 专家 /
After: 李小福 / 是 / 创新办 / 主任 / 也 / 是 / 云计算 / 方面 / 的 / 专家 /

Code:

package main

import (
    "bufio"
    "fmt"
    "log"
    "os"
    "runtime"
    "strings"
    "time"

    "github.com/wangbin/jiebago"
)

// line carries one line of text together with its original position in
// the input file, so that results arriving out of order from the
// workers can be reassembled in the original order.
type line struct {
    // number is the zero-based index of the line in the input file.
    number int
    // text holds the original line; workers replace it with the
    // segmentation result.
    text   string
}

var (
    // segmenter is shared by all worker goroutines; the underlying
    // Dictionary is documented as thread-safe, so concurrent Cut calls
    // should be fine — NOTE(review): confirm against the package docs.
    segmenter  = jiebago.Segmenter{}
    // numThreads is one worker goroutine per available CPU.
    numThreads = runtime.NumCPU()
    // task carries unsegmented lines from main to the workers.
    task       = make(chan line, numThreads)
    // result carries segmented lines from the workers back to main.
    result     = make(chan line, numThreads)
)

// worker consumes lines from the task channel, segments each line's
// text in accurate mode with HMM enabled, joins the segments with
// " / " and appends a newline, then sends the rewritten line to the
// result channel. It returns when the task channel is closed.
func worker() {
    for job := range task {
        words := make([]string, 0)
        for w := range segmenter.Cut(job.text, true) {
            words = append(words, w)
        }
        job.text = strings.Join(words, " / ") + "\n"
        result <- job
    }
}

// main reads README.md, segments every line in parallel using one
// worker goroutine per CPU, writes the segmented lines to
// parallelCut.log and reports the elapsed time and throughput.
func main() {
    // Use all available CPUs.
    runtime.GOMAXPROCS(numThreads)

    // Load dictionary; abort if it cannot be read, since segmentation
    // without a dictionary is meaningless. (Error was ignored before.)
    if err := segmenter.LoadDictionary("dict.txt"); err != nil {
        log.Fatal(err)
    }

    // Open file for segmentation.
    file, err := os.Open("README.md")
    if err != nil {
        log.Fatal(err)
    }
    defer file.Close()

    // Start worker routines.
    for i := 0; i < numThreads; i++ {
        go worker()
    }

    var size int
    scanner := bufio.NewScanner(file)

    t0 := time.Now()

    lines := make([]string, 0)

    // Read all lines up front so each result can later be written back
    // by its original index.
    for scanner.Scan() {
        t := scanner.Text()
        size += len(t)
        lines = append(lines, t)
    }
    // A scanner can stop on a read error, not just EOF — check it.
    if err := scanner.Err(); err != nil {
        log.Fatal(err)
    }
    length := len(lines)

    // Feed the workers; close task so they terminate when done.
    go func() {
        for i := 0; i < length; i++ {
            task <- line{number: i, text: lines[i]}
        }
        close(task)
    }()

    // Collect exactly one result per input line. Results may arrive out
    // of order, so each is placed back at its original line number.
    for i := 0; i < length; i++ {
        l := <-result
        lines[l.number] = l.text
    }

    t1 := time.Now()

    // Write the segments into a file for verification. O_TRUNC ensures
    // a stale, longer file from a previous run is fully overwritten,
    // and the open error is no longer silently discarded.
    outputFile, err := os.OpenFile("parallelCut.log", os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0600)
    if err != nil {
        log.Fatal(err)
    }
    defer outputFile.Close()
    writer := bufio.NewWriter(outputFile)
    for _, l := range lines {
        writer.WriteString(l)
    }
    // Flush reports any write error accumulated by the buffered writer.
    if err := writer.Flush(); err != nil {
        log.Fatal(err)
    }

    log.Printf("Time consumed: %v", t1.Sub(t0))
    log.Printf("Segmentation speed: %f MB/s", float64(size)/t1.Sub(t0).Seconds()/(1024*1024))
}

Code:

// Create a segmenter and load the main dictionary file.
var seg jiebago.Segmenter
seg.LoadDictionary("dict.txt")

// print drains a channel of segments, printing each followed by " /".
print := func(ch <-chan string) {
    for word := range ch {
        fmt.Printf(" %s /", word)
    }
    fmt.Println()
}
sentence := "超敏C反应蛋白是什么?"
fmt.Print("Before:")
print(seg.Cut(sentence, false))
// Case 1: a word that should NOT be cut further. Asking for the word
// itself yields a frequency high enough to keep it whole.
word := "超敏C反应蛋白"
oldFrequency, _ := seg.Frequency(word)
frequency := seg.SuggestFrequency(word)
fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
seg.AddWord(word, frequency)
fmt.Print("After:")
print(seg.Cut(sentence, false))

sentence = "如果放到post中将出错"
fmt.Print("Before:")
print(seg.Cut(sentence, false))
// Case 2: a word that SHOULD be cut. Passing the parts ("中", "将")
// yields a frequency low enough that the compound no longer wins.
word = "中将"
oldFrequency, _ = seg.Frequency(word)
frequency = seg.SuggestFrequency("中", "将")
fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
seg.AddWord(word, frequency)
fmt.Print("After:")
print(seg.Cut(sentence, false))

sentence = "今天天气不错"
fmt.Print("Before:")
print(seg.Cut(sentence, false))
// Case 3: same idea — demote "今天天气" so it splits into "今天"/"天气".
word = "今天天气"
oldFrequency, _ = seg.Frequency(word)
frequency = seg.SuggestFrequency("今天", "天气")
fmt.Printf("%s current frequency: %f, suggest: %f.\n", word, oldFrequency, frequency)
seg.AddWord(word, frequency)
fmt.Print("After:")
print(seg.Cut(sentence, false))

Output:

Before: 超敏 / C / 反应 / 蛋白 / 是 / 什么 / ? /
超敏C反应蛋白 current frequency: 0.000000, suggest: 1.000000.
After: 超敏C反应蛋白 / 是 / 什么 / ? /
Before: 如果 / 放到 / post / 中将 / 出错 /
中将 current frequency: 763.000000, suggest: 494.000000.
After: 如果 / 放到 / post / 中 / 将 / 出错 /
Before: 今天天气 / 不错 /
今天天气 current frequency: 3.000000, suggest: 0.000000.
After: 今天 / 天气 / 不错 /

Index

Examples

Package Files

dictionary.go jieba.go

type Dictionary Uses

type Dictionary struct {
    sync.RWMutex
    // contains filtered or unexported fields
}

A Dictionary represents a thread-safe dictionary used for word segmentation.

func (*Dictionary) AddToken Uses

func (d *Dictionary) AddToken(token dictionary.Token)

AddToken adds one token

func (*Dictionary) Frequency Uses

func (d *Dictionary) Frequency(key string) (float64, bool)

Frequency returns the frequency and existence of a given word

func (*Dictionary) Load Uses

func (d *Dictionary) Load(ch <-chan dictionary.Token)

Load loads all tokens from given channel

type Segmenter Uses

type Segmenter struct {
    // contains filtered or unexported fields
}

Segmenter is a Chinese words segmentation struct.

func (*Segmenter) AddWord Uses

func (seg *Segmenter) AddWord(word string, frequency float64)

AddWord adds a new word with frequency to dictionary

func (*Segmenter) Cut Uses

func (seg *Segmenter) Cut(sentence string, hmm bool) <-chan string

Cut cuts a sentence into words using accurate mode. Parameter hmm controls whether to use the Hidden Markov Model. Accurate mode attempts to cut the sentence into the most accurate segmentations, which is suitable for text analysis.

func (*Segmenter) CutAll Uses

func (seg *Segmenter) CutAll(sentence string) <-chan string

CutAll cuts a sentence into words using full mode. Full mode gets all the possible words from the sentence. Fast but not accurate.

func (*Segmenter) CutForSearch Uses

func (seg *Segmenter) CutForSearch(sentence string, hmm bool) <-chan string

CutForSearch cuts sentence into words using search engine mode. Search engine mode, based on the accurate mode, attempts to cut long words into several short words, which can raise the recall rate. Suitable for search engines.

func (*Segmenter) DeleteWord Uses

func (seg *Segmenter) DeleteWord(word string)

DeleteWord removes a word from dictionary

func (*Segmenter) Frequency Uses

func (seg *Segmenter) Frequency(word string) (float64, bool)

Frequency returns a word's frequency and existence

func (*Segmenter) LoadDictionary Uses

func (seg *Segmenter) LoadDictionary(fileName string) error

LoadDictionary loads a dictionary from the given file name. Every time LoadDictionary is called, the previously loaded dictionary is cleared.

func (*Segmenter) LoadUserDictionary Uses

func (seg *Segmenter) LoadUserDictionary(fileName string) error

LoadUserDictionary loads a user-specified dictionary. It must be called after LoadDictionary, and it will not clear any previously loaded dictionary; instead it overrides existing entries.

func (*Segmenter) SuggestFrequency Uses

func (seg *Segmenter) SuggestFrequency(words ...string) float64

SuggestFrequency returns a suggested frequency for a word, or for a long word cut into several short words.

This method is useful when a word in the sentence is not cut out correctly.

If a word should not be cut further, for example the word "石墨烯" should not be cut into "石墨" and "烯", SuggestFrequency("石墨烯") will return the maximum frequency for this word.

If a word should be cut further, for example the word "今天天气" should be cut into the two words "今天" and "天气", SuggestFrequency("今天", "天气") will return the minimum frequency for the word "今天天气".

Directories

PathSynopsis
analysePackage analyse is the Golang implementation of Jieba's analyse module.
dictionaryPackage dictionary contains an interface and wraps all io related work.
finalsegPackage finalseg is the Golang implementation of Jieba's finalseg module.
possegPackage posseg is the Golang implementation of Jieba's posseg module.
tokenizers
utilPackage util contains some util functions used by jiebago.

Package jiebago imports 7 packages (graph) and is imported by 16 packages. Updated 2017-08-30. Refresh now. Tools for package owners.