fastx

package
v0.0.0-...-b8182e9 Latest Latest
Warning

This package is not in the latest version of its module.

Go to latest
Published: Mar 4, 2021 License: MIT Imports: 9 Imported by: 0

README

fastx

GoDoc

This package seamlessly parses both FASTA and FASTQ formats.

Examples

Common operation
package main

import (
	"fmt"
	"io"
	"os"

	// "github.com/shenwei356/bio/seq"
	"github.com/shenwei356/bio/seqio/fastx"
	"github.com/shenwei356/xopen"
)

// main reads FASTA/Q records from STDIN and writes them, formatted, to STDOUT.
func main() {
	// Output goes through a buffered stream; "-" selects STDOUT.
	outfh, err := xopen.Wopen("-")
	checkError(err)
	defer outfh.Close()

	// Turning sequence validation off could save time on large sequences:
	// seq.ValidateSeq = false

	reader, err := fastx.NewDefaultReader("-")
	checkError(err)

	for {
		record, readErr := reader.Read()
		if readErr == io.EOF {
			// No more records.
			break
		}
		// checkError terminates the program on any non-nil error,
		// so execution only continues past this point with a valid record.
		checkError(readErr)

		// fmt is slow for output because it is not buffered, so avoid:
		// fmt.Printf("%s", record.Format(0))
		record.FormatToWriter(outfh, 0)
	}
}

// checkError prints err to STDERR and exits with status 1.
// It does nothing when err is nil.
func checkError(err error) {
	if err == nil {
		return
	}
	fmt.Fprintln(os.Stderr, err)
	os.Exit(1)
}

Note that, similar to the bytes.Buffer.Bytes() method, the current record will change after the next call to reader.Read(). You may use record.Clone() to make a copy.

Asynchronously parsing

ChunkChan asynchronously reads FASTA/Q records and returns a channel of record chunks, from which you can easily access the records. bufferSize is the number of buffered chunks, and chunkSize is the number of records in a chunk.

reader, err := fastx.NewDefaultReader(file)
checkError(err)

for chunk := range reader.ChunkChan(bufferSize, chunkSize) {
    checkError(chunk.Err)

    for _, record := range chunk.Data {
        fmt.Print(record)
    }
}

Note that there's no need to clone the record with record.Clone() here.

Custom alphabet and identifier regular expression
import (
    "github.com/shenwei356/bio/seq"
    "github.com/shenwei356/bio/seqio/fastx"
)

reader, err := fastx.NewReader(seq.DNA, file, "^([^\s]+)\s?")

Documentation

Overview

Package fastx seamlessly parses FASTA and FASTQ format files. This package seamlessly parses both FASTA and FASTQ formats.

## Examples

### Common operation

package main

import (
	"fmt"
	"io"
	"os"

	// "github.com/shenwei356/bio/seq"
	"github.com/shenwei356/bio/seqio/fastx"
	"github.com/shenwei356/xopen"
)

// main reads FASTA/Q records from STDIN and writes them, formatted, to STDOUT.
func main() {
	// Output goes through a buffered stream; "-" selects STDOUT.
	outfh, err := xopen.Wopen("-")
	checkError(err)
	defer outfh.Close()

	// Turning sequence validation off could save time on large sequences:
	// seq.ValidateSeq = false

	reader, err := fastx.NewDefaultReader("-")
	checkError(err)

	for {
		record, readErr := reader.Read()
		if readErr == io.EOF {
			// No more records.
			break
		}
		// checkError terminates the program on any non-nil error,
		// so execution only continues past this point with a valid record.
		checkError(readErr)

		// fmt is slow for output because it is not buffered, so avoid:
		// fmt.Printf("%s", record.Format(0))
		record.FormatToWriter(outfh, 0)
	}
}

// checkError prints err to STDERR and exits with status 1.
// It does nothing when err is nil.
func checkError(err error) {
	if err == nil {
		return
	}
	fmt.Fprintln(os.Stderr, err)
	os.Exit(1)
}

***Note that***, similar to the `bytes.Buffer.Bytes()` method, the current record will change after the next call to `reader.Read()`. You may use `record.Clone()` to make a copy.

### Asynchronously parsing

`ChunkChan` asynchronously reads FASTA/Q records and returns a channel of record chunks, from which you can easily access the records. `bufferSize` is the number of buffered chunks, and `chunkSize` is the number of records in a chunk.

reader, err := fastx.NewDefaultReader(file)
checkError(err)

for chunk := range reader.ChunkChan(bufferSize, chunkSize) {
    checkError(chunk.Err)

    for _, record := range chunk.Data {
        fmt.Print(record)
    }
}

***Note that*** there's no need to clone the record with `record.Clone()` here.

### Custom alphabet and identifier regular expression

import (
    "github.com/shenwei356/bio/seq"
    "github.com/shenwei356/bio/seqio/fastx"
)

reader, err := fastx.NewReader(seq.DNA, file, "^([^\s]+)\s?")

Index

Constants

This section is empty.

Variables

View Source
var DefaultIDRegexp = `^(\S+)\s?`

DefaultIDRegexp is the default ID parsing regular expression

View Source
var ErrBadFASTQFormat = errors.New("fastx: bad fastq format")

ErrBadFASTQFormat means bad fastq format

View Source
var ErrNoContent = errors.New("fastx: no content found")

ErrNoContent means nothing in the file or stream

View Source
var ErrNotFASTXFormat = errors.New("fastx: invalid FASTA/Q format")

ErrNotFASTXFormat means that the file is not FASTA/Q

View Source
var ErrUnequalSeqAndQual = errors.New("fastx: unequal sequence and quality")

ErrUnequalSeqAndQual means unequal sequence and quality

View Source
var ForcelyOutputFastq bool

ForcelyOutputFastq means outputting the record as fastq even if it has no quality (zero-length fastq)

Functions

func GetSeqNames

func GetSeqNames(file string) ([]string, error)

GetSeqNames returns the names of a fasta/q file

func GetSeqNumber

func GetSeqNumber(file string) (int, error)

GetSeqNumber returns the number of sequences in a FASTA/Q file

func GetSeqsMap

func GetSeqsMap(file string, alphabet *seq.Alphabet, bufferSize int, chunkSize int, idRegexp string) (map[string]*Record, error)

GetSeqsMap returns all seqs as a map for fasta file

func GuessAlphabet

func GuessAlphabet(file string) (*seq.Alphabet, bool, error)

GuessAlphabet guesses the alphabet of the file from the first maxLen bases

func ParseHeadID

func ParseHeadID(idRegexp *regexp.Regexp, head []byte) []byte

ParseHeadID parses the ID from head using idRegexp

Types

type Reader

type Reader struct {
	IsFastq bool // true if the file is in FASTQ format

	IDRegexp *regexp.Regexp // regular expression used for parsing the record ID

	Err error // current error
	// contains filtered or unexported fields
}

Reader seamlessly parses both FASTA and FASTQ formats

func NewDefaultReader

func NewDefaultReader(file string) (*Reader, error)

NewDefaultReader automatically recognizes the sequence type and parses IDs in the default manner

func NewReader

func NewReader(t *seq.Alphabet, file string, idRegexp string) (*Reader, error)

NewReader is constructor of FASTX Reader.

Parameters:

t            sequence alphabet
             if nil is given, it will guess alphabet by the first record
file         file name, "-" for stdin
idRegexp     id parsing regular expression string, must contain "(" and ")" to capture the matched ID
             "" for default value: `^([^\s]+)\s?`
             if the record head does not match idRegexp, the whole name will be the id

func NewReaderFromIO

func NewReaderFromIO(t *seq.Alphabet, ioReader io.Reader, idRegexp string) (*Reader, error)

NewReaderFromIO is constructor of FASTX Reader.

Parameters:

t            sequence alphabet
             if nil is given, it will guess alphabet by the first record
ioReader     an io.Reader
idRegexp     id parsing regular expression string, must contain "(" and ")" to capture the matched ID
             "" for default value: `^([^\s]+)\s?`
             if the record head does not match idRegexp, the whole name will be the id

func (*Reader) Alphabet

func (fastxReader *Reader) Alphabet() *seq.Alphabet

Alphabet returns Alphabet of the file

func (*Reader) ChunkChan

func (fastxReader *Reader) ChunkChan(bufferSize int, chunkSize int) chan RecordChunk

ChunkChan asynchronously reads FASTA/Q records and returns a channel of record chunks, from which you can easily access the records. bufferSize is the number of buffered chunks, and chunkSize is the number of records in a chunk.

func (*Reader) Close

func (fastxReader *Reader) Close()

Close closes the reader

func (*Reader) Read

func (fastxReader *Reader) Read() (*Record, error)

Read reads and returns one FASTA/Q record. Note that, similar to the bytes.Buffer.Bytes() method, the current record will change after the next call to this method, so you can use record.Clone() to make a copy.

type Record

type Record struct {
	ID   []byte   // record ID (NOTE(review): presumably the part of Name captured by the ID regexp — confirm)
	Name []byte   // full name
	Desc []byte   // description
	Seq  *seq.Seq // sequence (NOTE(review): presumably also carries the quality for FASTQ records — confirm)
}

Record is a struct for FASTA/Q

func GetSeqs

func GetSeqs(file string, alphabet *seq.Alphabet, bufferSize int, chunkSize int, idRegexp string) ([]*Record, error)

GetSeqs returns the fastx records of a file. When alphabet is nil or seq.Unlimit, it will automatically detect the alphabet. When idRegexp is "", the default idRegexp ( ^([^\s]+)\s? ) will be used.

func NewRecord

func NewRecord(t *seq.Alphabet, id, name, desc, s []byte) (*Record, error)

NewRecord is constructor of type Record for FASTA

func NewRecordWithQual

func NewRecordWithQual(t *seq.Alphabet, id, name, desc, s, q []byte) (*Record, error)

NewRecordWithQual is constructor of type Record for FASTQ

func NewRecordWithQualWithoutValidation

func NewRecordWithQualWithoutValidation(t *seq.Alphabet, id, name, desc, s, q []byte) (*Record, error)

NewRecordWithQualWithoutValidation is constructor of type Record for FASTQ without validation of the sequence

func NewRecordWithSeq

func NewRecordWithSeq(id, name, desc []byte, s *seq.Seq) (*Record, error)

NewRecordWithSeq is constructor of type Record for FASTA with an existing seq.Seq object

func NewRecordWithoutValidation

func NewRecordWithoutValidation(t *seq.Alphabet, id, name, desc, s []byte) (*Record, error)

NewRecordWithoutValidation is constructor of type Record for FASTA without validation of the sequence

func (*Record) Clone

func (record *Record) Clone() *Record

Clone of a Record

func (*Record) Format

func (record *Record) Format(width int) []byte

Format returns the formatted (wrapped to a fixed line width) sequence record

func (*Record) FormatToWriter

func (record *Record) FormatToWriter(outfh *xopen.Writer, width int)

FormatToWriter formats and directly writes to writer

func (*Record) String

func (record *Record) String() string

type RecordChunk

type RecordChunk struct {
	ID   uint64    // NOTE(review): presumably the sequential number of this chunk — confirm
	Data []*Record // the records in this chunk
	Err  error     // error attached to this chunk; check it before using Data
}

RecordChunk is chunk for records

Jump to

Keyboard shortcuts

? : This menu
/ : Search site
f or F : Jump to
y or Y : Canonical URL