fastx

package
v1.0.0 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Nov 6, 2020 License: MIT Imports: 9 Imported by: 0

README

fastx

GoDoc

This package seamlessly parses both FASTA and FASTQ formats.

Examples

Common operation
package main

import (
	"fmt"
	"io"
	"os"

	// "github.com/shenwei356/bio/seq"
	"github.com/shenwei356/bio/seqio/fastx"
	"github.com/shenwei356/xopen"
)

// main reads FASTA/Q records from STDIN and writes each one, formatted,
// to STDOUT through a buffered writer.
func main() {
	// "-" selects STDOUT; the writer is buffered.
	out, err := xopen.Wopen("-")
	checkError(err)
	defer out.Close()

	// Disabling sequence validation can save time when reading large sequences:
	// seq.ValidateSeq = false

	reader, err := fastx.NewDefaultReader("-")
	checkError(err)

	for {
		record, readErr := reader.Read()
		if readErr == io.EOF {
			break
		}
		// Any other non-nil error terminates the program inside checkError.
		checkError(readErr)

		// Printing through fmt is slow here because it is not buffered:
		// fmt.Printf("%s", record.Format(0))
		record.FormatToWriter(out, 0)
	}
}

// checkError prints err to standard error and exits with status 1.
// A nil err is ignored.
func checkError(err error) {
	if err == nil {
		return
	}
	fmt.Fprintln(os.Stderr, err)
	os.Exit(1)
}

Note that, as with the bytes.Buffer.Bytes() method, the current record is overwritten by the next call to Read(). Use record.Clone() to keep a copy.

Asynchronously parsing

ChunkChan asynchronously reads FASTA/Q records and returns a channel of record chunks, from which you can easily access the records. bufferSize is the number of buffered chunks, and chunkSize is the number of records in a chunk.

reader, err := fastx.NewDefaultReader(file)
checkError(err)

// Records arrive in chunks over a channel; each chunk carries its own error.
chunks := reader.ChunkChan(bufferSize, chunkSize)
for chunk := range chunks {
    checkError(chunk.Err)

    for _, record := range chunk.Data {
        fmt.Print(record)
    }
}

Note that there's no need to clone the records with record.Clone() here.

Custom alphabet and identifier regular expression
import (
    "github.com/shenwei356/bio/seq"
    "github.com/shenwei356/bio/seqio/fastx"
)

reader, err := fastx.NewReader(seq.DNA, file, "^([^\s]+)\s?")

Documentation

Overview

Package fastx seamlessly parses FASTA and FASTQ format files.

## Examples

### Common operation

package main

import (
	"fmt"
	"io"
	"os"

	// "github.com/shenwei356/bio/seq"
	"github.com/shenwei356/bio/seqio/fastx"
	"github.com/shenwei356/xopen"
)

// main streams FASTA/Q records from STDIN and writes each one, formatted,
// to STDOUT.
func main() {
	// Buffered output stream; "-" means STDOUT.
	writer, err := xopen.Wopen("-")
	checkError(err)
	defer writer.Close()

	// Sequence validation can be turned off to speed up reading large sequences:
	// seq.ValidateSeq = false

	reader, err := fastx.NewDefaultReader("-")
	checkError(err)

	for {
		record, err := reader.Read()
		if err != nil {
			// io.EOF is the normal end of input; anything else is fatal.
			if err != io.EOF {
				checkError(err)
			}
			break
		}

		// fmt output is unbuffered and therefore slow:
		// fmt.Printf("%s", record.Format(0))
		record.FormatToWriter(writer, 0)
	}
}

// checkError reports a non-nil err on standard error and terminates the
// program with exit status 1. It does nothing when err is nil.
func checkError(err error) {
	if err == nil {
		return
	}

	fmt.Fprintln(os.Stderr, err)
	os.Exit(1)
}

***Note that***, as with the `bytes.Buffer.Bytes()` method, the current record is overwritten by the next call to `Read()`. Use `record.Clone()` to keep a copy.

### Asynchronously parsing

`ChunkChan` asynchronously reads FASTA/Q records and returns a channel of record chunks, from which you can easily access the records. `bufferSize` is the number of buffered chunks, and `chunkSize` is the number of records in a chunk.

reader, err := fastx.NewDefaultReader(file)
checkError(err)

for chunk := range reader.ChunkChan(bufferSize, chunkSize) {
    // A chunk may carry an error instead of usable data.
    checkError(chunk.Err)

    records := chunk.Data
    for _, record := range records {
        fmt.Print(record)
    }
}

***Note that*** there's no need to clone the records with `record.Clone()` here.

### Custom alphabet and identifier regular expression

import (
    "github.com/shenwei356/bio/seq"
    "github.com/shenwei356/bio/seqio/fastx"
)

reader, err := fastx.NewReader(seq.DNA, file, "^([^\s]+)\s?")

Index

Constants

This section is empty.

Variables

View Source
var DefaultIDRegexp = `^(\S+)\s?`

DefaultIDRegexp is the default ID parsing regular expression

View Source
var ErrBadFASTQFormat = errors.New("fastx: bad fastq format")

ErrBadFASTQFormat means bad fastq format

View Source
var ErrNoContent = errors.New("fastx: no content found")

ErrNoContent means nothing in the file or stream

View Source
var ErrNotFASTXFormat = errors.New("fastx: invalid FASTA/Q format")

ErrNotFASTXFormat means that the file is not FASTA/Q

View Source
var ErrUnequalSeqAndQual = errors.New("fastx: unequal sequence and quality")

ErrUnequalSeqAndQual means unequal sequence and quality

View Source
var ForcelyOutputFastq bool

ForcelyOutputFastq means outputting a record as fastq even if it has no quality (zero-length fastq)

Functions

func GetSeqNames

func GetSeqNames(file string) ([]string, error)

GetSeqNames returns the names of a fasta/q file

func GetSeqNumber

func GetSeqNumber(file string) (int, error)

GetSeqNumber returns the number of sequences in a FASTA/Q file

func GetSeqsMap

func GetSeqsMap(file string, alphabet *seq.Alphabet, bufferSize int, chunkSize int, idRegexp string) (map[string]*Record, error)

GetSeqsMap returns all sequences of a FASTA file as a map

func GuessAlphabet

func GuessAlphabet(file string) (*seq.Alphabet, bool, error)

GuessAlphabet guesses the alphabet of the file from the first maxLen bases

func ParseHeadID

func ParseHeadID(idRegexp *regexp.Regexp, head []byte) []byte

ParseHeadID parses the ID from head using idRegexp

Types

type Reader

// Reader reads records from a FASTA or FASTQ file.
type Reader struct {
	IsFastq bool // true if the file is in FASTQ format

	IDRegexp *regexp.Regexp // regular expression used to parse the record ID

	Err error // current error
	// contains filtered or unexported fields
}

Reader seamlessly parses both FASTA and FASTQ formats

func NewDefaultReader

func NewDefaultReader(file string) (*Reader, error)

NewDefaultReader automatically recognizes the sequence type and parses IDs in the default manner

func NewReader

func NewReader(t *seq.Alphabet, file string, idRegexp string) (*Reader, error)

NewReader is constructor of FASTX Reader.

Parameters:

t            sequence alphabet
             if nil is given, it will guess alphabet by the first record
file         file name, "-" for stdin
idRegexp     id parsing regular expression string, must contain "(" and ")" to capture the matched ID
             "" for default value: `^([^\s]+)\s?`
             if the record head does not match idRegexp, the whole name will be the id

func (*Reader) Alphabet

func (fastxReader *Reader) Alphabet() *seq.Alphabet

Alphabet returns Alphabet of the file

func (*Reader) ChunkChan

func (fastxReader *Reader) ChunkChan(bufferSize int, chunkSize int) chan RecordChunk

ChunkChan asynchronously reads FASTA/Q records and returns a channel of record chunks, from which you can easily access the records. bufferSize is the number of buffered chunks, and chunkSize is the number of records in a chunk.

func (*Reader) Close

func (fastxReader *Reader) Close()

Close closes the reader

func (*Reader) Read

func (fastxReader *Reader) Read() (*Record, error)

Read reads and returns one FASTA/Q record. Note that, as with the bytes.Buffer.Bytes() method, the current record is overwritten by the next call to this method, so use record.Clone() to keep a copy.

type Record

// Record holds a single FASTA/Q record.
type Record struct {
	ID   []byte   // record ID
	Name []byte   // full name
	Desc []byte   // description
	Seq  *seq.Seq // sequence
}

Record is a struct for FASTA/Q

func GetSeqs

func GetSeqs(file string, alphabet *seq.Alphabet, bufferSize int, chunkSize int, idRegexp string) ([]*Record, error)

GetSeqs returns the fastx records of a file. When alphabet is nil or seq.Unlimit, the alphabet is detected automatically. When idRegexp is "", the default idRegexp ( ^([^\s]+)\s? ) is used.

func NewRecord

func NewRecord(t *seq.Alphabet, id, name, desc, s []byte) (*Record, error)

NewRecord is constructor of type Record for FASTA

func NewRecordWithQual

func NewRecordWithQual(t *seq.Alphabet, id, name, desc, s, q []byte) (*Record, error)

NewRecordWithQual is constructor of type Record for FASTQ

func NewRecordWithQualWithoutValidation

func NewRecordWithQualWithoutValidation(t *seq.Alphabet, id, name, desc, s, q []byte) (*Record, error)

NewRecordWithQualWithoutValidation is constructor of type Record for FASTQ

func NewRecordWithSeq

func NewRecordWithSeq(id, name, desc []byte, s *seq.Seq) (*Record, error)

NewRecordWithSeq is constructor of type Record for FASTA with an existing seq.Seq object

func NewRecordWithoutValidation

func NewRecordWithoutValidation(t *seq.Alphabet, id, name, desc, s []byte) (*Record, error)

NewRecordWithoutValidation is constructor of type Record for FASTA without validation of the sequence

func (*Record) Clone

func (record *Record) Clone() *Record

Clone of a Record

func (*Record) Format

func (record *Record) Format(width int) []byte

Format returns the formatted sequence record, with the sequence wrapped to a fixed line width

func (*Record) FormatToWriter

func (record *Record) FormatToWriter(outfh *xopen.Writer, width int)

FormatToWriter formats and directly writes to writer

func (*Record) String

func (record *Record) String() string

type RecordChunk

// RecordChunk is one batch of records delivered on the channel returned by
// Reader.ChunkChan.
type RecordChunk struct {
	ID   uint64    // presumably the chunk's sequence number — TODO confirm
	Data []*Record // records contained in this chunk
	Err  error     // error reported for this chunk, if any
}

RecordChunk is chunk for records

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL