dbcs

package
v0.33.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Oct 20, 2023 License: GPL-3.0 Imports: 13 Imported by: 0

Documentation

Index

Constants

View Source
const (
	N_FIRST_COMMENTS                = 10
	COMMENT_STEP_DURATION           = 1 * time.Millisecond
	REPLY_STEP_NANO_TS              = 100000    // 0.1 millisecond
	DELETE_STEP_NANO_TS             = 10000     // 0.01 millisecond
	COMMENT_STEP_NANO_TS            = 1000000   // 1 millisecond
	COMMENT_EXCEED_NANO_TS          = 1000      // 0.001 millisecond
	COMMENT_BACKWARD_OFFSET_NANO_TS = 900000000 // 900 millisecond
	COMMENT_DIFF_ALIGN_END_NANO_TS  = 60 * types.TS_TO_NANO_TS
	COMMENT_DIFF2_ALIGN_END_NANO_TS = 86400 * types.TS_TO_NANO_TS

	DEFAULT_LINE_BYTES = 200

	LEN_OLD_RECOMMEND_DATE = 5
	LEN_RECOMMEND_DATE     = 11

	ONE_YEAR_OFFSET_NANO_TS = 365 * 86400 * types.TS_TO_NANO_TS

	N_LINES_PER_CONTENT_BLOCK = 50

	MAX_COMMENT_BYTES = 81
)
View Source
const MATCH_SIGNATURE_INIT_STR = "\n※ 發信站:" //\n※  發信站: (no \n-- in forward (轉))

Variables

View Source
var (
	COMMENT_STEP_DIFF_NANO_TS  types.NanoTS = 2 * 60 * types.TS_TO_NANO_TS    // 2 mins
	COMMENT_STEP_DIFF2_NANO_TS types.NanoTS = 2 * 86400 * types.TS_TO_NANO_TS // 2 days
)
View Source
var (
	MATCH_COMMENT_RECOMMEND_BYTES = []byte{

		0x1b, 0x5b, 0x31, 0x3b, 0x33, 0x37, 0x6d,
		0xb1, 0xc0, 0x20, 0x1b, 0x5b, 0x33, 0x33, 0x6d,
	}

	MATCH_COMMENT_BOO_BYTES = []byte{

		0x1b, 0x5b, 0x31, 0x3b, 0x33, 0x31, 0x6d,
		0xbc, 0x4e, 0x20, 0x1b, 0x5b, 0x33, 0x33, 0x6d,
	}

	MATCH_COMMENT_ARROW_BYTES = []byte{

		0x1b, 0x5b, 0x31, 0x3b, 0x33, 0x31, 0x6d,
		0xa1, 0xf7, 0x20, 0x1b, 0x5b, 0x33, 0x33, 0x6d,
	}

	MATCH_COMMENT_INFIX = []byte("\x1b[m\x1b[33m:")

	//※ 編輯: abcd (1.2.3.4 臺灣), 03/21/2021 03:04:47
	//\xa1\xb0 \xbds\xbf\xe8: abcd (1.2.3.4 \xbbO\xc6W), 03/18/2021 12:07:22
	MATCH_COMMENT_EDIT_BYTES = []byte("\xa1\xb0 \xbds\xbf\xe8: ")

	MATCH_COMMENT_EDIT_FROM_BYTES = []byte("\xa8\xd3\xa6\xdb: ")

	//※ abcde:轉錄至看板 SYSOP
	//\xa1\xb0 \x1b[1;32mabcd\x1b[0;32m:\xc2\xe0\xbf\xfd\xa6\xdc\xac\xdd\xaaO Mavericks\x1b[m                               03/18 12:07
	//※ jasome:轉錄至某隱形看板
	//\xa1\xb0 \x1b[1;32mjasome\x1b[0;32m:\xc2\xe0\xbf\xfd\xa6\xdc\xacY\xc1\xf4\xa7\xce\xac\xdd\xaaO\x1b[m                                         01/29 02:39
	MATCH_COMMENT_FORWARD_BYTES       = []byte("\x1b[0;32m:\xc2\xe0\xbf\xfd\xa6\xdc") //\x1b[0;32m:\xc2
	MATCH_COMMENT_FORWARD_BOARD_BYTES = []byte("\xac\xdd\xaaO ")

	MATCH_COMMENT_FORWARD_PREFIX = []byte("\xa1\xb0 \x1b[1;32m")

	MATCH_COMMENT_FORWARD_HIDDEN_BYTES = []byte("\xacY\xc1\xf4\xa7\xce\xac\xdd\xaaO")

	//(teemocogs 刪除 teemocogs 的推文: 誤植)
	//\x1b[1;30m(teemocogs \xa7R\xb0\xa3 teemocogs \xaa\xba\xb1\xc0\xa4\xe5: \xbb~\xb4\xd3)\x1b[m
	MATCH_COMMENT_DELETED_PREFIX  = []byte("\x1b[1;30m(")
	MATCH_COMMENT_DELETED_INFIX0  = []byte(" \xa7R\xb0\xa3 ")
	MATCH_COMMENT_DELETED_INFIX1  = []byte(" \xaa\xba\xb1\xc0\xa4\xe5: ")
	MATCH_COMMENT_DELETED_POSTFIX = []byte(")\x1b[m")

	MATCH_COMMENT_GREEN_PREFIX = []byte("\xa1\xb0 ") //※
)
View Source
var (
	MATCH_COMMENT_RECOMMEND_STR = "\x1b[1;37m推 \x1b[33m" // 推

	MATCH_COMMENT_BOO_STR = "\x1b[1;31m噓 \x1b[33m" // 噓

	MATCH_COMMENT_ARROW_STR = "\x1b[1;31m→ \x1b[33m" //→

	MATCH_DEFAULT_INFIX_STR = "\x1b[m\x1b[33m"

	//※ 編輯: peter50505      來自: 163.27.69.176        (10/03 14:42)
	//\xa1\xb0 \xbds\xbf\xe8: abcd (1.2.3.4 \xbbO\xc6W), 03/18/2021 12:07:22
	MATCH_COMMENT_EDIT_STR = "※ 編輯: "

	MATCH_COMMENT_EDIT_FROM_STR = "來自: "

	//※ abcde:轉錄至看板 SYSOP
	//※ \x1b[1;32mPttACT\x1b[0;32m:轉錄至看板 OriginalSong\x1b[m                                  01/26 17:19
	//
	//※ jasome:轉錄至某隱形看板
	//\xa1\xb0 \x1b[1;32mjasome\x1b[0;32m:\xc2\xe0\xbf\xfd\xa6\xdc\xacY\xc1\xf4\xa7\xce\xac\xdd\xaaO\x1b[m                                         01/29 02:39
	MATCH_COMMENT_FORWARD_STR        = "\x1b[0;32m:轉錄至"
	MATCH_COMMENT_FORWARD_BOARD_STR  = "看板 "
	MATCH_COMMENT_FORWARD_PREFIX_STR = "※ \x1b[1;32m"

	//(teemocogs 刪除 teemocogs 的推文: 誤植)
	//\x1b[1;30m(teemocogs 刪除 teemocogs 的推文: 誤植)\x1b[m
	MATCH_COMMENT_DELETED_PREFIX_STR  = "\x1b[1;30m("
	MATCH_COMMENT_DELETED_INFIX0_STR  = " 刪除 "
	MATCH_COMMENT_DELETED_INFIX1_STR  = " 的推文: "
	MATCH_COMMENT_DELETED_POSTFIX_STR = ")\x1b[m"

	MATCH_COMMENT_GREEN_PREFIX_STR = "※ " //※
)
View Source
var (
	MATCH_SIGNATURE_FROM = []byte{
		0x29, 0x2c, 0x20, 0xa8, 0xd3, 0xa6, 0xdb, 0x3a, 0x20,
	}

	MATCH_SIGNATURE_FROM_OLD = []byte{
		0xa1, 0xbb, 0x20, 0x46, 0x72, 0x6f, 0x6d, 0x3a, 0x20,
	}

	MATCH_SIGNATURE_FORWARD = []byte{
		0xa1, 0xb0, 0x20, 0xc2, 0xe0, 0xbf, 0xfd, 0xaa, 0xcc, 0x3a, 0x20,
	}

	MATCH_SIGNATURE_URL = []byte{
		0xa1, 0xb0, 0x20, 0xa4, 0xe5, 0xb3, 0xb9, 0xba, 0xf4, 0xa7, 0x7d, 0x3a, 0x20,
	}
)
View Source
var (
	MATCH_SIGNATURE_FROM_STR = "), 來自: " //), 來自:

	MATCH_SIGNATURE_FROM_OLD_STR = "◆ From: " //◆ From:

	MATCH_SIGNATURE_FORWARD_STR = "※ 轉錄者: "

	MATCH_SIGNATURE_URL_STR = "※ 文章網址: "
)
View Source
var MATCH_SIGNATURE_INIT = []byte{
	0x0a, 0xa1, 0xb0, 0x20, 0xb5,
	0x6f, 0xab, 0x48, 0xaf, 0xb8, 0x3a, 0x20,
}

Functions

func CommentUtf8ToDBCS added in v0.27.0

func CommentUtf8ToDBCS(c *schema.Comment)

func InferTimestamp added in v0.15.0

func InferTimestamp(edBlocks []*EDBlock, isForwardOnly bool, isLastAlignEndNanoTS bool, articleCreateTime types.NanoTS) (nBlock int)

func InitConfig added in v0.25.3

func InitConfig() (err error)

func IntegrateComments added in v0.15.0

func IntegrateComments(boardID bbs.BBoardID, articleID bbs.ArticleID, comments []*schema.Comment, articleCreateTime types.NanoTS, articleMTime types.NanoTS, isForwardOnly bool, isLastAlignEndNanoTS bool) (newComments []*schema.Comment, toDeleteComments []*schema.CommentMD5, err error)

func MatchComment added in v0.15.0

func MatchComment(content []byte) int

MatchComment

TODO: record the idxes of each condition, rematch only the condition with the smallest idx.

func MatchCommentStr added in v0.27.0

func MatchCommentStr(content string) int

func MatchCommentType added in v0.15.0

func MatchCommentType(commentDBCS []byte) (theType ptttype.CommentType, nextCommentDBCS []byte)

func MatchCommentTypeStr added in v0.27.0

func MatchCommentTypeStr(commentDBCS string) (theType ptttype.CommentType, nextCommentDBCS string)

func Md5sum added in v0.22.0

func Md5sum(theBytes []byte) string

func ParseComments

func ParseComments(
	ownerID bbs.UUserID,
	commentsDBCS []byte,
	allCommentsDBCS []byte,
) (comments []*schema.Comment)

ParseComments

有可能 reply-edit-info (編輯) 不在 commentsDBCS 裡，但是會在 allCommentsDBCS 裡 (firstComments)。只考慮:

  1. appropriately split comments.
  2. 對於每個 comment 裡的 DBCS Parse 成 Utf8.
  3. type / IP / Host / MD5 / TheDate

不考慮:

  1. boardID / articleID / commentID.
  2. createTime / firstCreateTime / InferredCreateTime / AddCreateTime (除了編輯以外)

steps:

  1. 根據 '\n' 估計 nComments
  2. 找出 pre-comment reply.
  3. 對於每個 comment-leading newline for-loop:
     3.0. parse comment
     3.1. 找下一個 comment
          3.1.1. 如果沒有更多 comment: 假設剩下的 text 都是 reply.
     3.2. 假設下一個 comment 之前的 text 都是 reply.
  4. (outside for-loop): 處理最後一個沒有 '\n' 的 comment.

func ParseCommentsStr added in v0.27.0

func ParseCommentsStr(ownerID bbs.UUserID, commentsDBCS string, allCommentsDBCS string) (comments []*schema.Comment)

func ParseContent

func ParseContent(contentBytes []byte, origContentMD5 string) (content [][]*types.Rune, contentPrefix [][]*types.Rune, contentMD5 string, ip string, host string, bbs string, signatureMD5 string, signatureDBCS []byte, commentsDBCS []byte)

ParseContent

Assume:

  1. the content consists of chars >= 32 plus '\x1b', '\r', and '\n'.
  2. the timestamps of the 1st-comments (around 10 comments, including the last-same-min comments) are within 1 year of the createTime.
  3. the timestamps of the rest of the comments can be reverse-inferred from mtime. compared as stored as nano-ts.
  4. there are no more than 60000 comments (60 x 1000) in 1 minute.

func ParseContentBlocks added in v0.24.0

func ParseContentBlocks(boardID bbs.BBoardID, articleID bbs.ArticleID, content [][]*types.Rune, contentMD5 string, updateNanoTS types.NanoTS) (contentID types.ContentID, contentBlocks []*schema.ContentBlock)

func ParseContentStr added in v0.27.0

func ParseContentStr(contentStr string, origContentMD5 string, isSplit bool) (content [][]*types.Rune, contentPrefix [][]*types.Rune, contentMD5 string, ip string, host string, bbs string, signatureMD5 string, signatureDBCS string, commentsDBCS string)

ParseContentStr

Assume:

  1. the content consists of chars >= 32 plus '\x1b', '\r', and '\n'.
  2. the timestamps of the 1st-comments (around 10 comments, including the last-same-min comments) are within 1 year of the createTime.
  3. the timestamps of the rest of the comments can be reverse-inferred from mtime. compared as stored as nano-ts.
  4. there are no more than 60000 comments (60 x 1000) in 1 minute.

func ParseFirstComments

func ParseFirstComments(
	bboardID bbs.BBoardID,
	articleID bbs.ArticleID,
	ownerID bbs.UUserID,
	articleCreateTime types.NanoTS,
	articleMTime types.NanoTS,
	commentsDBCS []byte,
	origFirstCommentsMD5 string) (

	firstComments []*schema.Comment,
	firstCommentsMD5 string,
	theRestCommentsDBCS []byte,
	err error)

ParseFirstComments

Checks against origFirstCommentsMD5: if it exists, returns nil, which requires getting firstComments and lastTime from the db.

func ParseFirstCommentsStr added in v0.27.0

func ParseFirstCommentsStr(
	bboardID bbs.BBoardID,
	articleID bbs.ArticleID,
	ownerID bbs.UUserID,
	articleCreateTime types.NanoTS,
	articleMTime types.NanoTS,
	commentsDBCS string,
	origFirstCommentsMD5 string) (

	firstComments []*schema.Comment,
	firstCommentsMD5 string,
	theRestCommentsDBCS string,
	err error)

ParseFirstCommentsStr

Checks against origFirstCommentsMD5: if it exists, returns nil, which requires getting firstComments and lastTime from the db.

func Utf8ToDBCS added in v0.15.0

func Utf8ToDBCS(utf8 [][]*types.Rune) (dbcsBytes [][]byte)

Types

type DBCSState

type DBCSState int
const (
	DBCS_STATE_NONE  DBCSState = 0
	DBCS_STATE_LEAD  DBCSState = 1
	DBCS_STATE_TAIL  DBCSState = 2
	DBCS_STATE_COLOR DBCSState = 3
)

func (DBCSState) String added in v0.15.0

func (d DBCSState) String() string

type EDBlock added in v0.15.0

type EDBlock struct {
	NewComments  []*EDInfo
	OrigComments []*EDInfo
	StartNanoTS  types.NanoTS
	EndNanoTS    types.NanoTS
}

func CalcEDBlocks added in v0.15.0

func CalcEDBlocks(newComments []*schema.Comment, origComments []*schema.CommentMD5, articleCreateTime types.NanoTS, articleMTime types.NanoTS) (edBlocks []*EDBlock, err error)

CalcEDBlocks

Must already guarantee that:

  1. articleCreateTime < all origComments.SortTime
  2. articleMTime >= all origComments.SortTime
  3. origComments are sorted by SortTime
  4. newComments are sorted by the line-idx.

func (*EDBlock) AlignEndNanoTS added in v0.15.0

func (ed *EDBlock) AlignEndNanoTS()

func (*EDBlock) BackwardInferTS added in v0.15.0

func (ed *EDBlock) BackwardInferTS(nextIdx int, isAlignEndNanoTS bool)

BackwardInferTS

func (*EDBlock) ForwardInferTS added in v0.15.0

func (ed *EDBlock) ForwardInferTS(articleCreateTime types.NanoTS) (nextIdx int)

ForwardInferTS

func (*EDBlock) InferTimestamp added in v0.15.0

func (ed *EDBlock) InferTimestamp(articleCreateTime types.NanoTS, isForwardOnly bool, isLastAlignEndNanoTS bool)

InferTimestamp

  1. OrigComments are sorted between ed.StartNanoTS and ed.EndNanoTS
  2. It's possible that the newComments have out-of-range times.
  3. It's possible that multiple comments share the same date-str, but we still need some way to make the timestamps unique.
  4. The time from OrigComments should not be moved.

The possibilities that new-comments are in between original-comments:

  1. XXX delete (try to map the corresponding deleted messages). We don't do this, in order to keep the mapping sequence simple.
  2. reply (previous-appearing-message (currentNanoTS in same or newComments) + REPLY_STEP_NANO_TS)
  3. new messages. (sort-time should be after the deleted-messages)
  4. others (the owners accidentally edited something, sort-time should be after the deleted-messages)

type EDInfo added in v0.15.0

type EDInfo struct {
	Op          EDOp
	NewComment  *schema.Comment // SAME/DELETE: origComments, ADD: newComments
	OrigComment *schema.CommentMD5
	SortTime    types.NanoTS
}

func NewEDInfoFromAddComment added in v0.15.0

func NewEDInfoFromAddComment(comment *schema.Comment) (edInfo *EDInfo)

func NewEDInfoFromDeleteComment added in v0.15.0

func NewEDInfoFromDeleteComment(commentMD5 *schema.CommentMD5) (edInfo *EDInfo)

func NewEDInfoFromSameComment added in v0.15.0

func NewEDInfoFromSameComment(newComment *schema.Comment, origCommentMD5 *schema.CommentMD5) (edInfo *EDInfo)

type EDInfoMeta added in v0.15.0

type EDInfoMeta struct {

	// StartNanoTS (not included)
	StartNanoTS types.NanoTS

	// EndNanoTS (not included except the last ed-info)
	EndNanoTS types.NanoTS

	// StartIdx (included)
	StartIdx int

	// EndIdx (not included)
	EndIdx int
}

func (*EDInfoMeta) ToEDBlock added in v0.15.0

func (meta *EDInfoMeta) ToEDBlock(edInfos []*EDInfo) (edBlock *EDBlock)

ToEDBlock

Given the list of edInfos, where NewComments and OrigComments are already separately sorted, construct the corresponding ed-block.

type EDOp added in v0.15.0

type EDOp uint8
const (
	ED_OP_UNKNOWN EDOp = 0
	ED_OP_SAME    EDOp = 1
	ED_OP_DELETE  EDOp = 2
	ED_OP_ADD     EDOp = 3
)

type INFER_TIMESTAMP_TYPE added in v0.15.0

type INFER_TIMESTAMP_TYPE uint8
const (
	INFER_TIMESTAMP_INVALID INFER_TIMESTAMP_TYPE = 0
	INFER_TIMESTAMP_YMDHM   INFER_TIMESTAMP_TYPE = 1
	INFER_TIMESTAMP_YMD     INFER_TIMESTAMP_TYPE = 2
)

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL