Documentation ¶
Overview ¶
Package tekstus is a library for working with text.
Index ¶
- Constants
- Variables
- func BytesCutUntil(line, token []byte, startAt int, checkEsc bool) (v []byte, p int, found bool)
- func BytesEncapsulate(token, line, leftcap, rightcap []byte) (newline []byte, changed bool)
- func BytesFind(line, token []byte, startat int) (at int)
- func BytesMatchForward(line, token []byte, p int) bool
- func BytesRemoveUntil(line, leftcap, rightcap []byte) (newline []byte, changed bool)
- func BytesSkipUntil(line, token []byte, startAt int, checkEsc bool) (p int, found bool)
- func CountAlnumChar(text string) (n int)
- func CountAlnumDistribution(text string) (chars []rune, values []int)
- func CountCharSequence(text string) (chars []rune, counts []int)
- func CountDigit(text string) (n int)
- func CountNonAlnumChar(text string, withspace bool) (n int)
- func CountUniqChar(text string) (n int)
- func CountUpperLowerChar(text string) (upper, lower int)
- func GetMaxCharSequence(text string) (char rune, count int)
- func InterfacesToStrings(is []interface{}) (vs []string)
- func RatioAlnumChar(text string) float64
- func RatioDigit(text string) float64
- func RatioNonAlnumChar(text string, withspace bool) float64
- func RatioUpper(text string) float64
- func RatioUpperLowerChar(text string) float64
- func RunesContain(s []rune, c rune) (bool, int)
- func RunesDiff(l []rune, r []rune) (diff []rune)
- func RunesFind(line, token []rune, startAt int) (at int)
- func RunesFindSpaces(line []rune, startAt int) (idx int)
- func RunesRemoveUntil(line, leftcap, rightcap []rune) (newline []rune, changed bool)
- func StringCountTokens(text string, tokens []string, sensitive bool) (cnt int)
- func StringFrequenciesOf(text string, tokens []string, sensitive bool) (freq float64)
- func StringMergeSpaces(text string, withline bool) string
- func StringRemoveURI(text string) string
- func StringRemoveWikiMarkup(text string) string
- func StringSplitWords(text string, cleanit bool, uniq bool) (words []string)
- func StringTrimNonAlnum(text string) string
- func StringsIsContain(ss Strings, el string) bool
- func StringsSortByIndex(d *[]string, sortedIds []int)
- func StringsSwap(d []string, x, y int)
- func StringsToFloat64(ss []string) (sv []float64)
- func StringsToInt64(ss []string) (sv []int64)
- func WordsCountMissRate(src []string, target []string) (missrate float64, nmiss, length int)
- func WordsCountToken(words []string, token string, sensitive bool) (cnt int)
- func WordsCountTokens(words []string, tokens []string, sensitive bool) (clsCnt []int)
- func WordsFindLongest(words []string) (slong string, idx int)
- func WordsFrequenciesOf(words, tokens []string, sensitive bool) (sumfreq float64)
- func WordsFrequencyOf(words []string, token string, sensitive bool) float64
- func WordsMaxCountOf(words []string, tokens []string, sensitive bool) string
- func WordsProbabilitiesOf(words, tokens []string, sensitive bool) (probs []float64)
- func WordsUniq(words []string, sensitive bool) (uniques []string)
- type Chunk
- type Chunks
- type Line
- type Lines
- type ListStrings
- type Strings
- type TableStrings
- type WikiMarkup
Constants ¶
const (
// DefEscape character(s).
DefEscape = '\\'
)
Variables ¶
var BadWords = []string{
"666", "da", "dont", "dosent", "whatever", "guy", "hi", "nazi", "sup",
"guise", "loser", "thats", "ugly", "wanna", "whats", "wont", "gotta",
"bloody", "fart", "pot", "prick", "stink", "smells", "smelly", "alot",
"dunno", "gotcha",
}
BadWords contains a list of colloquial words and words that indicate poor writing.
var BiasedWords = []string{
"cutting-edge", "single-handedly", "well-established", "well-known",
"world-class", "absolute", "acclaimed", "amazing", "astonishing",
"authoritative", "beautiful", "best", "boreing", "boring", "brilliant",
"canonical", "cares", "celebrated", "charismatic", "classic",
"coolest", "defining", "definitive", "eminent", "enigma", "ever",
"everyone", "exciting", "extraordinary", "fabulous", "famous",
"fantastic", "fat", "fully", "genius", "global", "great", "greatest",
"hate", "huge", "iconic", "idiotic", "immensely", "impactful",
"incendiary", "indisputable", "infamous", "influential", "innovative",
"inspired", "intriguing", "lame", "leader", "leading", "legendary",
"like", "major", "masterly", "mature", "memorable", "most", "notable",
"outstanding", "pioneer", "popular", "prestigious", "probably",
"really", "remarkable", "renowned", "respected", "seminal",
"significant", "skillful", "solution", "staunch", "strange", "super",
"talented", "top", "total", "totally", "transcendent", "ugly",
"undoubtedly", "unique", "virtually", "virtuoso", "visionary", "weird",
"worst",
}
BiasedWords contains a list of colloquial words with high bias.
var (
// DEBUG debug level, set using environment TEKSTUS_DEBUG
DEBUG = 0
)
var PronounWords = []string{
"i", "me", "mine", "my", "myself", "our", "ours", "ourself",
"ourselves", "selves", "thee", "thine", "thou", "thy", "thyself", "us",
"we", "y'all", "y'all", "y'all's", "yis", "you", "you-uns", "your",
"yours", "yourself", "yourselves", "yourselves", "yous", "yous's",
"youse", "youse",
}
PronounWords contains a list of first- and second-person pronouns, including slang.
var SexWords = []string{
"anal", "breast", "breasts", "buttocks", "dildo", "dildos", "erect",
"nipple", "nipples", "penis", "sex", "sodomized", "sodomy", "vagina",
"vibrator", "vibrators",
}
SexWords contains a list of non-vulgar sex-related words.
var URIPrefixes = []string{
"http://", "https://", "ftp://", "ftps://",
}
URIPrefixes contains a list of common URI prefixes found in the content of web pages.
var VulgarWords = []string{}/* 779 elements not displayed */
VulgarWords contains a list of vulgar and offensive words, including informal language and slang.
var WikiMarkups = []WikiMarkup{
{
"[[Category:",
"]]",
}, {
"[[:Category:",
"]]",
}, {
"[[File:",
"]]",
}, {
"[[Help:",
"]]",
}, {
"[[Image:",
"]]",
}, {
"[[Special:",
"]]",
}, {
"[[Wikipedia:",
"]]",
}, {
"{{DEFAULTSORT:",
"}}",
}, {
"{{Template:",
"}}",
}, {
"<ref",
"/>",
},
}
WikiMarkups contains a list of common markup in Wikimedia software.
Functions ¶
func BytesCutUntil ¶
BytesCutUntil cuts bytes from `line`, starting at `startAt`, until `token` is found.
If `checkEsc` is true, a token that is prefixed with the escape character '\' will be considered a non-matching token.
Returns all bytes before the token and the position of the byte _after_ the token, or false if the token is not found.
func BytesEncapsulate ¶
BytesEncapsulate finds `token` in `line` and encapsulates it with the bytes from `leftcap` and `rightcap`. If no token is found, it returns the same line with a false status.
func BytesFind ¶
BytesFind return the first index of matched token in line. If not found it will return -1.
func BytesMatchForward ¶
BytesMatchForward return true if `line` at index `p` match with `token`, otherwise return false.
func BytesRemoveUntil ¶
BytesRemoveUntil, given a line, removes all bytes inside it, starting from `leftcap` until `rightcap`, and returns the cut line with changed set to true.
If no `leftcap` or `rightcap` is found, the line will be unchanged, and changed will be false.
Example,
line    : "[[ ABC ]] DEF"
leftcap : "[["
rightcap: "]]"
return  : " DEF"
func BytesSkipUntil ¶
BytesSkipUntil skip all bytes until matched token is found.
If `checkEsc` is true, a token that is prefixed with the escape character '\' will be considered a non-matching token.
Returns the index in line of the matched token, or false if the line ends before the token is found.
func CountAlnumChar ¶
CountAlnumChar return number of alpha-numeric character in text.
func CountAlnumDistribution ¶
CountAlnumDistribution count distribution of alpha-numeric characters in text.
Example, given a text "abbcccddddeeeee", it will return [a b c d e] and [1 2 3 4 5].
func CountCharSequence ¶
CountCharSequence, given a string, counts each character that is repeated more than once in sequence, and returns those characters and their counts.
Example, given a text of string
"aaa abcdee ffgf"
it will return
[a e f]
and
[3 2 2]
'a' is not counted as 4 because its sequence is broken by a space; likewise, 'f' is not counted as 3 because its sequence is broken by 'g'.
func CountNonAlnumChar ¶
CountNonAlnumChar returns the number of non-alpha-numeric characters in text. If `withspace` is true, white-space will be counted as non-alpha-numeric; if it is false, white-space will be skipped.
func CountUniqChar ¶
CountUniqChar count number of character in text without duplication.
Example, if text is "aba" then it will count as 2 ("a", "b").
func CountUpperLowerChar ¶
CountUpperLowerChar return number of uppercase and lowercase in text.
func GetMaxCharSequence ¶
GetMaxCharSequence return character which have maximum sequence in `text`.
Example, given a text of string "aaa abcdee ffgf" it will return 'a' and 3.
func InterfacesToStrings ¶
func InterfacesToStrings(is []interface{}) (vs []string)
InterfacesToStrings will convert slice of interface to slice of string.
func RatioAlnumChar ¶
RatioAlnumChar compute and return ratio of alpha-numeric with all character in text.
func RatioDigit ¶
RatioDigit compute and return digit ratio to all characters in text.
func RatioNonAlnumChar ¶
RatioNonAlnumChar return ratio of non-alphanumeric character to all character in text. If `withspace` is true then white-space character will be counted as non-alpha numeric, otherwise it will be skipped.
func RatioUpper ¶
RatioUpper compute and return ratio of uppercase character to all character in text.
func RatioUpperLowerChar ¶
RatioUpperLowerChar compute and return ratio of uppercase with lowercase character in text.
func RunesContain ¶
RunesContain returns true if character `c` is in the slice of runes `s`, together with the index of the character in `s`.
func RunesDiff ¶
RunesDiff return the difference between two slice of rune.
For example, input are
l: [a b c d] r: [b c]
and the output will be `[a d]`
func RunesFind ¶
RunesFind will search token in text starting from index `startAt` and return the matching index.
If no token is found it will return -1.
func RunesFindSpaces ¶
RunesFindSpaces searches for white-space in line, starting from index `startAt`; it returns -1 if none is found.
func RunesRemoveUntil ¶
RunesRemoveUntil, given a line, removes all characters inside it, starting from `leftcap` until `rightcap`, and returns the cut line with changed set to true.
If no `leftcap` or `rightcap` is found, the line will be unchanged, and changed will be false.
Example,
line    : "[[ ABC ]] DEF"
leftcap : "[["
rightcap: "]]"
return  : " DEF"
func StringCountTokens ¶
StringCountTokens given a text, count how many tokens inside of it and return sum of all.
func StringFrequenciesOf ¶
StringFrequenciesOf returns the frequencies of tokens by counting each occurrence of a token and dividing it by the total number of words in text.
func StringMergeSpaces ¶
StringMergeSpaces replaces two or more spaces with a single space. If withline is true, it also replaces two or more new lines with a single new line.
func StringRemoveURI ¶
StringRemoveURI removes links (http, https, ftp, ftps) from text and returns the new text. This function assumes that spaces in a URI are encoded as '%20'.
func StringRemoveWikiMarkup ¶
StringRemoveWikiMarkup removes wiki markup, including:
- [[Category: ... ]]
- [[:Category: ... ]]
- [[File: ... ]]
- [[Help: ... ]]
- [[Image: ... ]]
- [[Special: ... ]]
- [[Wikipedia: ... ]]
- {{DEFAULTSORT: ... }}
- {{Template: ... }}
- <ref ... />
func StringSplitWords ¶
StringSplitWords given a text, return all words in text.
Definition of word is any sequence of character which have length equal or greater than one and separated by space.
If cleanit is true remove any non-alphanumeric in the start and the end of each words.
If uniq is true remove duplicate words.
func StringTrimNonAlnum ¶
StringTrimNonAlnum remove non alpha-numeric character at the beginning and end for `text`.
func StringsIsContain ¶
StringsIsContain returns true if element `el` is in the slice of strings `ss`, otherwise it returns false.
func StringsSortByIndex ¶
StringsSortByIndex will sort the slice of string `d` using sorted index `sortedIds`.
func StringsSwap ¶
StringsSwap swap two indices value of string.
func StringsToFloat64 ¶
StringsToFloat64 convert slice of string to slice of float64. If converted string return error it will set the float value to 0.
func StringsToInt64 ¶
StringsToInt64 convert slice of string to slice of int64. If converted string return error it will set the integer value to 0.
func WordsCountMissRate ¶
WordsCountMissRate, given two slices of strings, counts the number of strings that are not equal to each other, and returns the miss rate, computed as
number of not equal / number of data
together with the missing count and the length of input `src`.
func WordsCountToken ¶
WordsCountToken returns the number of occurrences of token in words.
func WordsCountTokens ¶
WordsCountTokens count number of occurrence of each `tokens` values in words. Return number of each tokens based on their index.
For example, if words is "[A,A,B]" and tokens is "[A,B]", this function will return "[2,1]".
idx cls  count
0 : A -> 2
1 : B -> 1
func WordsFindLongest ¶
WordsFindLongest find the longest word in words and return their value and index.
If words is empty, it returns an empty string with a negative (-1) index.
func WordsFrequenciesOf ¶
WordsFrequenciesOf return total frequency of tokens in words.
func WordsFrequencyOf ¶
WordsFrequencyOf return frequency of token in words using
count-of-token / total-words
func WordsMaxCountOf ¶
WordsMaxCountOf return the string that has highest frequency.
Example, given input
words: [A A B A B C C] tokens: [A B]
it will return A as the majority token in words. If tokens have equal frequency, then the first token in order will be returned.
func WordsProbabilitiesOf ¶
WordsProbabilitiesOf will compute each probability of token in word, and return it as a slice of float.
Example,
words: ["A", "B", "A"] tokens:["A", "B"]
It will return approximately [0.67, 0.33] (that is, 2/3 and 1/3).
Types ¶
type Line ¶
Line represent bytes of string and line number.
type ListStrings ¶
type ListStrings []Strings
ListStrings is for working with a list of sets of strings. Each element of the slice is in the form of [["a"],["b","c"],...]
func (*ListStrings) IsEqual ¶
func (lss *ListStrings) IsEqual(b ListStrings) bool
IsEqual compare two list of slice of string without regard to their order.
{{"a"},{"b"}} == {{"b"},{"a"}} is true.
Return true if both contain the same list, false otherwise.
type Strings ¶
type Strings []string
Strings is for working with element of list with type is string. Each element of slice is in the form of ["a", ..., "n"]
func (*Strings) IsEqual ¶
IsEqual compare elements of two slice of string without regard to their order
{"a","b"} == {"b","a"} is true
Return true if both slices have the same elements, false otherwise.
func (*Strings) Partitioning ¶
func (ss *Strings) Partitioning(k int) (table TableStrings)
Partitioning will group the elements of the set `ss` into non-empty lists, in such a way that every element is included in one and only one of the lists.
Given the list of elements in `ss` and the number of partitions `k`, it returns the set of all groupings of all elements without duplication.
For example, the set {a,b,c} if partitioned into 2 group will result in set
{ {{a,b},{c}}, {{a,c},{b}}, {{a},{b,c}}, }
if partitioned into 3 group (k=3) will result in,
{ {{a},{b},{c}}, }
The number of possible lists can be computed using Stirling numbers of the second kind.
For more information, see https://en.wikipedia.org/wiki/Partition_of_a_set
func (*Strings) SinglePartition ¶
func (ss *Strings) SinglePartition() (table TableStrings)
SinglePartition creates a table from a set of strings, where each element in the set becomes a single set.
Input: [a,b,c] output:
[ [[a],[b],[c]] ]
type TableStrings ¶
type TableStrings []ListStrings
TableStrings is for working with a set of lists of sets of strings. Each element in the set is in the form of
[ [["a"],["b","c"],...], [["x"],["y","z"],...] ]
func (*TableStrings) IsEqual ¶
func (tss *TableStrings) IsEqual(b TableStrings) bool
IsEqual compare two table of string without regard to their order.
{ {{"a"},{"b"}}, {{"c"}} }
is equal to
{ {{"c"}}, {{"b"},{"a"}} }
Return true if both sets contain the same lists, false otherwise.
func (*TableStrings) JoinCombination ¶
func (tss *TableStrings) JoinCombination(s string) (tssout TableStrings)
JoinCombination will append string `s` to each set in list in different index.
For example, given string `s` and input table `[[["a"]["b"]["c"]]]`, the output table will be,
[ [["a","s"]["b"] ["c"]], [["a"] ["b","s"]["c"]], [["a"] ["b"] ["c","s"]] ]
type WikiMarkup ¶
type WikiMarkup struct {
// contains filtered or unexported fields
}
WikiMarkup defines the markup for Wikimedia software.