SEQIO -- A Package for Sequence File I/O
BIOSEQ.TXT - An Example BIOSEQ File
#
# bioseq.txt - BIOSEQ entries for various databases (Version 1.2)
#
# This file contains BIOSEQ entries for a number of databases. Currently,
# there are entries for EMBL, ENZYME, GenBank, GenPept (part of GenBank),
# NRFES, NRL3D, PIR, PROSITE, REPBASE, SWISS-PROT and UTR (part of NRFES),
# plus virtual entries for the NID and PID identifier indexes.
#
# To customize it for your installation, go through the entries and for
# each entry:
# 1) Change the root directory from "/databases/..." to the
# directory where you have installed that database's files.
# 2a) If one of the choices listed in the entry matches your database
# file organization, uncomment the file/alias lines (if they
# have been commented out).
# 2b) If your installation differs from the choices listed in the
# entry, comment out any uncommented lines, add the names of
# the database's files, and rewrite the aliases.
# 3) Comment out (or change the name of) any index files you don't
# want to use. Only include the index files for the identifiers
# you want the users to be able to access entries by.
# 4) Run idxseq for each of the databases to create the index files.
#
# In this version, only a few databases are given, but my hope is that
# this file could become a canonical list of the databases, their
# ftp/WWW locations or ordering information, and their file structure.
# So, if you administer, or are familiar with, a database which either
# is not included here, is incorrectly given here, or occurs in an ftp
# site, CD-ROM or floppy disk release with another file structure,
# please send e-mail to knight@cs.ucdavis.edu.
#
#
# Virtual BIOSEQ entries used to create and store index files for
# the NID and PID identifiers (to give random access to entries
# using the "nid" and "pid" identifiers).
#
>NID
>IdPrefix: nid
>Index: /databases/nididx
>PID
>IdPrefix: pid
>Index: /databases/pididx
#
# The EMBL Nucleotide Sequence Database
#
>EMBL: /databases/embl
>Name: EMBL
>IdPrefix: embl
>Index: emblindex
>Format: emblfast
>Alphabet: DNA
#
# EMBL files as found at ftp site ftp.ebi.ac.uk in
# /pub/databases/embl/release.
#
est?.dat, fun.dat, hum?.dat, inv.dat, mam.dat, org.dat, patent.dat,
phg.dat, pln.dat, pri.dat, pro.dat, rod.dat, sts.dat, syn.dat, unc.dat,
vrl.dat, vrt.dat
est:(est?.dat), est1:(est1.dat), est2:(est2.dat), est3:(est3.dat),
est4:(est4.dat), est5:(est5.dat), est6:(est6.dat), fun:(fun.dat),
hum:(hum?.dat), hum1:(hum1.dat), hum2:(hum2.dat), inv:(inv.dat),
mam:(mam.dat), org:(org.dat), pat:(patent.dat), patent:(patent.dat),
phg:(phg.dat), pln:(pln.dat), pri:(pri.dat), pro:(pro.dat),
rod:(rod.dat), sts:(sts.dat), syn:(syn.dat), unc:(unc.dat),
vrl:(vrl.dat), vrt:(vrt.dat)
#
# The ENZYME Database
#
>enzyme: /databases/enzyme
>Name: ENZYME
>IdPrefix: ec
>Index: ecindex
>Format: EMBL
#
# Enzyme files as found at ftp site ncbi.nlm.nih.gov in /repository/enzyme.
#
enzyme.dat
#
# The GenBank Flat-File Database
#
>GenBank,gb: /databases/genbank
>Name: GenBank
>IdPrefix: gb
>Index: gbindex
>Format: gbfast
>Alphabet: DNA
#
# GenBank files as found at ftp site ncbi.nlm.nih.gov in /genbank.
# Uncomment one of the alternatives for the between-release, daily
# files (if you have those files).
#
gbbct.seq, gbest?.seq, gbinv.seq, gbmam.seq, gbpat.seq, gbphg.seq
gbpln.seq, gbpri.seq, gbrna.seq, gbrod.seq, gbsts.seq, gbsyn.seq,
gbuna.seq, gbvrl.seq, gbvrt.seq
bct:(gbbct.seq), est:(gbest?.seq), est1:(gbest1.seq), est2:(gbest2.seq)
est3:(gbest3.seq), est4:(gbest4.seq), est5:(gbest5.seq)
est6:(gbest6.seq), est7:(gbest7.seq), inv:(gbinv.seq), mam:(gbmam.seq)
pat:(gbpat.seq), phg:(gbphg.seq), pln:(gbpln.seq), pri:(gbpri.seq)
rna:(gbrna.seq), rod:(gbrod.seq), sts:(gbsts.seq), syn:(gbsyn.seq)
una:(gbuna.seq), vrl:(gbvrl.seq), vrt:(gbvrt.seq)
~bct:(gbbct.seq), ~est:(gbest?.seq), ~inv:(gbinv.seq), ~mam:(gbmam.seq)
~pat:(gbpat.seq), ~phg:(gbphg.seq), ~pln:(gbpln.seq), ~pri:(gbpri.seq)
~rna:(gbrna.seq), ~rod:(gbrod.seq), ~sts:(gbsts.seq), ~syn:(gbsyn.seq)
~una:(gbuna.seq), ~vrl:(gbvrl.seq), ~vrt:(gbvrt.seq)
# daily/gbcu.flat, daily:(daily/gbcu.flat)
# daily-nc/nc????.flat, daily:(daily-nc/nc????.flat)
#
# The GenPept Protein Translation of GenBank coding sequences.
#
>GenPept: /databases/genbank
>Name: GenPept
>IdPrefix: gp
>Index: genpeptindex
>Format: FASTA
>Alphabet: Protein
#
# GenPept files as found at ftp site ncbi.nlm.nih.gov in /genbank.
#
genpept.fsa, daily/gpcu.fsa
#
# The Non-Redundant Functionally Equivalent Sequences (NRFES) Database.
#
>NRFES: /databases/NRFES
>Name: NRFES
>IdPrefix: gb
>Index: nrfesindex
>Format: NBRF
>Alphabet: DNA
#
# NRFES files as found at ftp site ncbi.nlm.nih.gov in /repository/NRFES.
#
all_v05/(bcta, inva, mama, orga, phga, plna, pria, roda, vrla, vrta, yeaa)
cds_v05/(bctc, invc, mamc, orgc, phgc, plnc, pric, rodc, vrlc, vrtc, yeac)
exo_v05/(inve, mame, orge, plne, prie, rode, vrle, vrte, yeae)
ivs_v05/(invi, mami, orgi, plni, prii, rodi, vrli, vrti, yeai)
~:(bcta, inva, mama, orga, phga, plna, pria, roda, vrla, vrta, yeaa)
all:(bcta, inva, mama, orga, phga, plna, pria, roda, vrla, vrta, yeaa)
cds:(bctc, invc, mamc, orgc, phgc, plnc, pric, rodc, vrlc, vrtc, yeac)
exo:(inve, mame, orge, plne, prie, rode, vrle, vrte, yeae)
ivs:(invi, mami, orgi, plni, prii, rodi, vrli, vrti, yeai)
bct:(bcta), inv:(inva), mam:(mama), org:(orga), phg:(phga), pln:(plna)
pri:(pria), rod:(roda), vrl:(vrla), vrt:(vrta), yea:(yeaa)
#
# The NRL_3D Protein Sequence--Structure Database (a mirror of the PDB
# database in PIR/CODATA file format).
#
>NRL3D: /databases/pir
>Name: NRL3D
>IdPrefix: pdb
>Index: nrl3dindex
>Format: pirfast
>Alphabet: Protein
#
# NRL_3D files as included in the PIR release found at ftp site
# ncbi.nlm.nih.gov in /repository/PIR.
#
nrl3d.dat
#
# The Protein Information Resource (PIR) Database.
#
>PIR: /databases/pir
>Name: PIR
>IdPrefix: pir
>Index: pirindex
>Format: pirfast
>Alphabet: Protein
#
# PIR files as found at ftp site ncbi.nlm.nih.gov in /repository/PIR.
#
pir1.dat, pir2.dat, pir3.dat, pir4.dat
~1:(pir1.dat), ~2:(pir2.dat), ~3:(pir3.dat), ~4:(pir4.dat)
~12:(pir1.dat,pir2.dat), ~13:(pir1.dat,pir3.dat), ~23:(pir2.dat,pir3.dat)
~123:(pir1.dat,pir2.dat,pir3.dat)
#
# The PROSITE Pattern Database.
#
>PROSITE: /databases/prosite
>Name: PROSITE
>IdPrefix: pros
>Index: prositeindex
>Format: EMBL
#
# PROSITE files as found at ftp site ncbi.nlm.nih.gov in
# /repository/prosite.
#
prosite.dat
#
# The REPBASE Repetitive Element Database.
#
>repbase: /databases/repbase
>Name: REPBASE
>IdPrefix: rpb
>Index: repbaseindex
>Format: EMBL
#
# REPBASE files as found at ftp site ncbi.nlm.nih.gov in
# /repository/repbase.
#
MAIN/(B1.rodent, B2.rodent, L1.primate, MIR.mammal, MIR.primate,
MIR.rodent, MIR2.primate, THE.mammal, THE.primate, THE.rodent,
THR.human, alu.galago, alu.human, alu.other)
MER/MER*.pri
REF/(humrep.ref, invrep.ref, mamrep.ref, plnrep.ref, rodrep.ref,
simple.ref, vertrep.ref)
B1:(B1.rodent), B2:(B2.rodent), L1:(L1.primate)
MIR:(MIR.mammal, MIR.primate, MIR.rodent), MIR2:(MIR2.primate)
THE:(THE.mammal, THE.primate, THE.rodent), THR:(THR.human)
alu:(alu.galago, alu.human, alu.other)
rodent:(B1.rodent, B2.rodent, MIR.rodent, THE.rodent),
rod:(B1.rodent, B2.rodent, MIR.rodent, THE.rodent),
primate:(L1.primate, MIR.primate, MIR2.primate, THE.primate)
pri:(L1.primate, MIR.primate, MIR2.primate, THE.primate)
mammal:(MIR.mammal, THE.mammal), mam:(MIR.mammal, THE.mammal)
human:(THR.human, alu.human)
#
# The SWISS-PROT Protein Sequence Data Bank.
#
>SWISS-PROT,swissprot,sprot: /databases/swiss-prot
>Name: SWISS-PROT
>IdPrefix: sp
>Index: sprotindex
>Format: spfast
>Alphabet: Protein
#
# SWISS-PROT files as found at ftp site ncbi.nlm.nih.gov in
# /repository/swiss-prot. The ?? match the release number of
# the database.
#
sprot??.dat
# updates/new-seq.dat
#
# The UTR Database (UnTRanslated regions of protein coding genes).
#
>UTR: /databases/NRFES/utr
>Format: FASTA
>Index: utrindex
>Alphabet: DNA
#
# UTR files as included in the NRFES release found at ftp site
# ncbi.nlm.nih.gov in /repository/NRFES/utr.
#
amp_3p, bird_3p, dro_3p, fish_3p, inse_3p, mam_3p, mollu_3p
mou_3p, nema_3p, prot_3p, rat_3p
amp_5p, bird_5p, dro_5p, fish_5p, inse_5p, mam_5p, mollu_5p
mou_5p, nema_5p, prot_5p, rat_5p
~_3p:(amp_3p, bird_3p, dro_3p, fish_3p, inse_3p, mam_3p, mollu_3p,
mou_3p, nema_3p, prot_3p, rat_3p)
~_5p:(amp_5p, bird_5p, dro_5p, fish_5p, inse_5p, mam_5p, mollu_5p,
mou_5p, nema_5p, prot_5p, rat_5p)
3p:(amp_3p, bird_3p, dro_3p, fish_3p, inse_3p, mam_3p, mollu_3p,
mou_3p, nema_3p, prot_3p, rat_3p)
5p:(amp_5p, bird_5p, dro_5p, fish_5p, inse_5p, mam_5p, mollu_5p,
mou_5p, nema_5p, prot_5p, rat_5p)
amp:(amp_3p,amp_5p), bird:(bird_3p,bird_5p), dro:(dro_3p,dro_5p)
fish:(fish_3p,fish_5p), inse:(inse_3p,inse_5p), mam:(mam_3p,mam_5p)
mollu:(mollu_3p,mollu_5p), mou:(mou_3p,mou_5p), nema:(nema_3p,nema_5p)
prot:(prot_3p,prot_5p), rat:(rat_3p,rat_5p)
James R. Knight,
knight@cs.ucdavis.edu
June 27, 1996