Header file for sequence handling. More...

#include "MathType.h"
#include "SysDefs.h"
#include "pdb.h"
#include "hash.h"
#include "deprecated.h"

Data Structures
struct	SEQINFO

Macros
#define	blMAXPIRLABEL 160

#define	ALLOCSIZE

#define	blPDB2Seq(x) blDoPDB2Seq((x), FALSE, FALSE, FALSE)

#define	blPDB2SeqX(x) blDoPDB2Seq((x), TRUE, FALSE, FALSE)

#define	blPDB2SeqNoX(x) blDoPDB2Seq((x), FALSE, FALSE, TRUE)

#define	blPDB2SeqXNoX(x) blDoPDB2Seq((x), TRUE, FALSE, TRUE)

#define	blPDBProt2Seq(x) blDoPDB2Seq((x), FALSE, TRUE, FALSE)

#define	blPDBProt2SeqX(x) blDoPDB2Seq((x), TRUE, TRUE, FALSE)

#define	blPDBProt2SeqNoX(x) blDoPDB2Seq((x), FALSE, TRUE, TRUE)

#define	blPDBProt2SeqXNoX(x) blDoPDB2Seq((x), TRUE, TRUE, TRUE)

#define	blPDB2SeqByChain(x) blDoPDB2SeqByChain((x), FALSE, FALSE, FALSE)

#define	blPDB2SeqXByChain(x) blDoPDB2SeqByChain((x), TRUE, FALSE, FALSE)

#define	blPDB2SeqNoXByChain(x) blDoPDB2SeqByChain((x), FALSE, FALSE, TRUE)

#define	blPDB2SeqXNoXByChain(x) blDoPDB2SeqByChain((x), TRUE, FALSE, TRUE)

#define	blPDBProt2SeqByChain(x) blDoPDB2SeqByChain((x), FALSE, TRUE, FALSE)

#define	blPDBProt2SeqXByChain(x) blDoPDB2SeqByChain((x), TRUE, TRUE, FALSE)

#define	blPDBProt2SeqNoXByChain(x) blDoPDB2SeqByChain((x), FALSE, TRUE, TRUE)

#define	blPDBProt2SeqXNoXByChain(x) blDoPDB2SeqByChain((x), TRUE, TRUE, TRUE)

#define	_SEQ_H_DEPRECATED

Functions
char	blThrone (char *three)

char	blThronex (char *three)

char *	blOnethr (char one)

char *	blDoPDB2Seq (PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly, BOOL NoX)

HASHTABLE *	blDoPDB2SeqByChain (PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly, BOOL NoX)

int	blSplitSeq (char *LinearSeq, char **seqs)

int	blReadSimplePIR (FILE *fp, int maxres, char **seqs)

int	blReadPIR (FILE *fp, BOOL DoInsert, char **seqs, int maxchain, SEQINFO *seqinfo, BOOL *punct, BOOL *error)

int	blReadRawPIR (FILE *fp, char **seqs, int maxchain, BOOL upcase, SEQINFO *seqinfo, BOOL *error)

int	blAlign (char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, char *align1, char *align2, int *align_len)

int	blAffinealign (char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, char *align1, char *align2, int *align_len)

int	blCalcMDMScore (char resa, char resb)

int	blAffinealignuc (char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, char *align1, char *align2, int *align_len)

int	blCalcMDMScoreUC (char resa, char resb)

BOOL	blReadMDM (char *mdmfile)

int	blZeroMDM (void)

char	blDNAtoAA (char *dna)

int	blTrueSeqLen (char *sequence)

int	blKnownSeqLen (char *sequence)

BOOL	blNumericReadMDM (char *mdmfile)

int	blNumericCalcMDMScore (int resa, int resb)

int	blNumericAffineAlign (int *seq1, int length1, int *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, int *align1, int *align2, int *align_len)

void	blSetMDMScoreWeight (char resa, char resb, REAL weight)

void	blWriteOneStringPIR (FILE *out, char *label, char *title, char *sequence, char **chains, BOOL ByChain, BOOL doFasta)

Variables
BOOL	gBioplibSeqNucleicAcid

Detailed Description

Header file for sequence handling.

Version: V2.16

Date: 30.11.15

Copyright: (c) UCL / Dr. Andrew C. R. Martin 1991-2015

Author: Dr. Andrew C. R. Martin

Address: Institute of Structural & Molecular Biology, University College London, Gower Street, London. WC1E 6BT.

Email: andrew@bioinf.org.uk, andrew.martin@ucl.ac.uk

This code is NOT IN THE PUBLIC DOMAIN, but it may be copied according to the conditions laid out in the accompanying file COPYING.DOC.

The code may be modified as required, but any modifications must be documented so that the person responsible can be identified.

The code may not be sold commercially or included as part of a commercial product except as described in the file COPYING.DOC.

Description:

Usage:

Revision History:

V2.0 11.03.94 Original V2 release
V2.1 11.05.94 Added DNAtoAA() & TrueSeqLen() prototypes
V2.2 13.05.93 Added KnownSeqLen() prototype
V2.3 28.02.95 Added ReadRawPIR()
V2.4 25.07.95 Added the gBioplibSeqNucleicAcid external for throne()
V2.5 11.07.96 Added CalcMDMScore()
V2.6 17.09.95 Added ZeroMDM()
V2.7 26.08.97 Added macro interfaces to new DoPDB2Seq()
V2.8 08.03.00 Added Numeric***() alignment routines
V2.9 02.10.00 Modified DoPDB2Seq()
V2.10 27.02.07 Added CalcMDMScoreUC() and affinealignuc()
V2.11 07.07.14 Use bl prefix for functions. Renamed PDB2Seq macros to blPDB2Seq and added PDB2Seq defines to deprecated.h By: CTP
V2.12 31.07.14 Updated deprecation: Removed deprecated.h, added prototypes for renamed functions and defines for PDB2Seq macros. By: CTP
V2.13 14.08.14 Moved deprecated function prototypes to deprecated.h By: CTP
V2.14 26.08.14 Added blSetMDMScoreWeight()
V2.15 11.06.15 Added blWriteOneStringPIR()
V2.16 30.11.15 Added wrapper macros for blDoPDB2SeqByChain() and prototype

Definition in file seq.h.

Macro Definition Documentation

#define _SEQ_H_DEPRECATED

Definition at line 168 of file seq.h.

#define ALLOCSIZE

Value:

80  /* ReadPIR() uses this as a chunk size for 
                         allocating memory
                      */

Definition at line 86 of file seq.h.

#define blMAXPIRLABEL 160

Definition at line 85 of file seq.h.

#define blPDB2Seq ( x ) blDoPDB2Seq((x), FALSE, FALSE, FALSE)

Definition at line 107 of file seq.h.

#define blPDB2SeqByChain ( x ) blDoPDB2SeqByChain((x), FALSE, FALSE, FALSE)

Definition at line 117 of file seq.h.

#define blPDB2SeqNoX ( x ) blDoPDB2Seq((x), FALSE, FALSE, TRUE)

Definition at line 109 of file seq.h.

#define blPDB2SeqNoXByChain ( x ) blDoPDB2SeqByChain((x), FALSE, FALSE, TRUE)

Definition at line 119 of file seq.h.

#define blPDB2SeqX ( x ) blDoPDB2Seq((x), TRUE, FALSE, FALSE)

Definition at line 108 of file seq.h.

#define blPDB2SeqXByChain ( x ) blDoPDB2SeqByChain((x), TRUE, FALSE, FALSE)

Definition at line 118 of file seq.h.

#define blPDB2SeqXNoX ( x ) blDoPDB2Seq((x), TRUE, FALSE, TRUE)

Definition at line 110 of file seq.h.

#define blPDB2SeqXNoXByChain ( x ) blDoPDB2SeqByChain((x), TRUE, FALSE, TRUE)

Definition at line 120 of file seq.h.

#define blPDBProt2Seq ( x ) blDoPDB2Seq((x), FALSE, TRUE, FALSE)

Definition at line 112 of file seq.h.

#define blPDBProt2SeqByChain ( x ) blDoPDB2SeqByChain((x), FALSE, TRUE, FALSE)

Definition at line 122 of file seq.h.

#define blPDBProt2SeqNoX ( x ) blDoPDB2Seq((x), FALSE, TRUE, TRUE)

Definition at line 114 of file seq.h.

#define blPDBProt2SeqNoXByChain ( x ) blDoPDB2SeqByChain((x), FALSE, TRUE, TRUE)

Definition at line 124 of file seq.h.

#define blPDBProt2SeqX ( x ) blDoPDB2Seq((x), TRUE, TRUE, FALSE)

Definition at line 113 of file seq.h.

#define blPDBProt2SeqXByChain ( x ) blDoPDB2SeqByChain((x), TRUE, TRUE, FALSE)

Definition at line 123 of file seq.h.

#define blPDBProt2SeqXNoX ( x ) blDoPDB2Seq((x), TRUE, TRUE, TRUE)

Definition at line 115 of file seq.h.

#define blPDBProt2SeqXNoXByChain ( x ) blDoPDB2SeqByChain((x), TRUE, TRUE, TRUE)

Definition at line 125 of file seq.h.

Function Documentation

int blAffinealign	(	char *	seq1,
		int	length1,
		char *	seq2,
		int	length2,
		BOOL	verbose,
		BOOL	identity,
		int	penalty,
		int	penext,
		char *	align1,
		char *	align2,
		int *	align_len
	)

Parameters

[in]	*seq1	First sequence
[in]	length1	First sequence length
[in]	*seq2	Second sequence
[in]	length2	Second sequence length
[in]	verbose	Display N&W matrix
[in]	identity	Use identity matrix
[in]	penalty	Gap insertion penalty value
[in]	penext	Extension penalty
[out]	*align1	Sequence 1 aligned
[out]	*align2	Sequence 2 aligned
[out]	*align_len	Alignment length

Returns: Alignment score (0 on error)

Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.

Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).

07.10.92 Adapted from original written while at NIMR
08.10.92 Split into separate routines
09.10.92 Changed best structure to simple integers, moved SearchForBest() into TraceBack()
21.08.95 Was only filling in the bottom right cell at initialisation rather than all the right hand column and bottom row
11.07.96 Changed calls to calcscore() to CalcMDMScore()
06.03.00 Changed name to affinealign() (the routine align() is provided as a backwards compatible wrapper). Added penext parameter. Now supports affine gap penalties with separate opening and extension penalties. The code now maintains the path as it goes.

07.07.14 Use bl prefix for functions By: CTP

****** NOTE: ANY CHANGES SHOULD BE PROPAGATED TO affinealignuc() ******

Definition at line 275 of file align.c.

int blAffinealignuc	(	char *	seq1,
		int	length1,
		char *	seq2,
		int	length2,
		BOOL	verbose,
		BOOL	identity,
		int	penalty,
		int	penext,
		char *	align1,
		char *	align2,
		int *	align_len
	)

Parameters

[in]	*seq1	First sequence
[in]	length1	First sequence length
[in]	*seq2	Second sequence
[in]	length2	Second sequence length
[in]	verbose	Display N&W matrix
[in]	identity	Use identity matrix
[in]	penalty	Gap insertion penalty value
[in]	penext	Extension penalty
[out]	*align1	Sequence 1 aligned
[out]	*align2	Sequence 2 aligned
[out]	*align_len	Alignment length

Returns: Alignment score (0 on error)

Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.

Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).

07.10.92 Adapted from original written while at NIMR
08.10.92 Split into separate routines
09.10.92 Changed best structure to simple integers, moved SearchForBest() into TraceBack()
21.08.95 Was only filling in the bottom right cell at initialisation rather than all the right hand column and bottom row
11.07.96 Changed calls to calcscore() to CalcMDMScore()
06.03.00 Changed name to affinealign() (the routine align() is provided as a backwards compatible wrapper). Added penext parameter. Now supports affine gap penalties with separate opening and extension penalties. The code now maintains the path as it goes.
27.02.07 Exactly as affinealign() but upcases characters before comparison
07.07.14 Use bl prefix for functions By: CTP

****** NOTE: ANY CHANGES SHOULD BE PROPAGATED TO affinealign() ******

Definition at line 583 of file align.c.

int blAlign	(	char *	seq1,
		int	length1,
		char *	seq2,
		int	length2,
		BOOL	verbose,
		BOOL	identity,
		int	penalty,
		char *	align1,
		char *	align2,
		int *	align_len
	)

Parameters

[in]	*seq1	First sequence
[in]	length1	First sequence length
[in]	*seq2	Second sequence
[in]	length2	Second sequence length
[in]	verbose	Display N&W matrix
[in]	identity	Use identity matrix
[in]	penalty	Gap insertion penalty value
[out]	*align1	Sequence 1 aligned
[out]	*align2	Sequence 2 aligned
[out]	*align_len	Alignment length

Returns: Alignment score (0 on error)

Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.

A single gap penalty is used, so gap extension incurs no further penalty.

Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).

06.03.00 Implemented as a wrapper to affinealign() which is the old align() routine, plus support for affine gap penalties, plus new traceback code based on storing the path as we go
07.07.14 Use bl prefix for functions By: CTP

Definition at line 214 of file align.c.

int blCalcMDMScore	(	char	resa,
		char	resb
	)

Parameters

[in]	resa	First residue
[in]	resb	Second residue

Returns: score

Calculate score from static globally stored mutation data matrix

If both residues are set as '\0' it will simply silence all warnings

07.10.92 Adapted from NIMR-written original
24.11.94 Only gives 10 warnings
28.02.95 Modified to use sMDMSize
24.08.95 If a residue was not found was doing an out-of-bounds array reference causing a potential core dump
11.07.96 Name changed from calcscore() and now non-static
07.07.14 Use bl prefix for functions By: CTP
04.01.16 Added special call with both residues set to '\0' to silence warnings. Also warnings now go to stderr

Definition at line 1220 of file align.c.

int blCalcMDMScoreUC	(	char	resa,
		char	resb
	)

Parameters

[in]	resa	First residue
[in]	resb	Second residue

Returns: score

Calculate score from static globally stored mutation data matrix

07.10.92 Adapted from NIMR-written original
24.11.94 Only gives 10 warnings
28.02.95 Modified to use sMDMSize
24.08.95 If a residue was not found was doing an out-of-bounds array reference causing a potential core dump
11.07.96 Name changed from calcscore() and now non-static
27.02.07 As CalcMDMScore() but upcases characters before comparison
07.07.14 Use bl prefix for functions By: CTP
04.01.16 Added special call with both residues set to '\0' to silence warnings. Also warnings now go to stderr

Definition at line 1293 of file align.c.

char blDNAtoAA ( char * dna )

Parameters

[in] *dna DNA/RNA codon

Returns: 1-letter amino acid code (X=termination)

Converts a nucleic acid codon to the 1-letter amino acid equivalent. Termination codons are returned as X. No special action is taken for initiation codons.

18.04.94 Original By: ACRM
07.07.14 Use bl prefix for functions By: CTP

Definition at line 110 of file DNAtoAA.c.

char* blDoPDB2Seq	(	PDB *	pdb,
		BOOL	DoAsxGlx,
		BOOL	ProtOnly,
		BOOL	NoX
	)

Parameters

[in]	*pdb	PDB linked list
[in]	DoAsxGlx	Handle Asx and Glx as B and Z rather than X
[in]	ProtOnly	Don't do DNA/RNA; these simply don't get done rather than being handled as X
[in]	NoX	Skip amino acids which would be assigned as X

Returns: Allocated character array containing sequence

malloc()'s an array containing the 1-letter sequence corresponding to an input PDB linked list. Returns NULL if given a NULL parameter or memory allocation fails. Puts *'s in the sequence for multi-chains.

This routine is normally called via the macro interfaces: blPDB2Seq(pdb), blPDB2SeqX(pdb), blPDBProt2Seq(pdb), blPDBProt2SeqX(pdb). Those with Prot in their names handle protein only; those with X handle Asx/Glx as B/Z rather than as X.

29.09.92 Original By: ACRM
07.06.93 Corrected allocation.
18.06.93 Handles multi-chains and skips NTER and CTER residues
13.05.94 Check for chain change before copy residue (!) (Bug reported by Bob MacCullum)
19.07.95 Added check for ATOM records
24.01.96 Returns blank string (rather than core dumping!) if the linked list contained no ATOM records
26.08.97 Changed to doPDB2Seq with extra parameters (DoAsxGlx & ProtOnly). The old calling forms have now become macros
02.10.00 Added NoX
10.06.05 Changed the initialization of rescount, resnum, etc. so it correctly points to the first residue. This solves a bug with CA-only chains where it was undercounting by 1
04.02.14 Use CHAINMATCH By: CTP
07.07.14 Use bl prefix for functions By: CTP

Definition at line 146 of file PDB2Seq.c.

HASHTABLE* blDoPDB2SeqByChain	(	PDB *	pdb,
		BOOL	DoAsxGlx,
		BOOL	ProtOnly,
		BOOL	NoX
	)

Parameters

[in]	*pdb	PDB linked list
[in]	DoAsxGlx	Handle Asx and Glx as B and Z rather than X
[in]	ProtOnly	Don't do DNA/RNA; these simply don't get done rather than being handled as X
[in]	NoX	Skip amino acids which would be assigned as X

Returns: A hash of 1-letter code sequences indexed by chain label

Reads sequence from ATOM records in 1-letter code, storing the results in a hash indexed by chain label.

This routine is normally called via the macro interfaces: blPDB2SeqByChain(pdb), blPDB2SeqXByChain(pdb), blPDBProt2SeqByChain(pdb), blPDBProt2SeqXByChain(pdb). Those with Prot in their names handle protein only; those with X handle Asx/Glx as B/Z rather than as X.

30.11.15 Original based on blDoPDB2Seq() By: ACRM

Definition at line 294 of file PDB2Seq.c.

int blKnownSeqLen ( char * sequence )

Parameters

[in] *sequence A sequence containing deletions

Returns: Length without deletions

Scans a 1-letter code sequence and calculates the length without '-', ' ' or '?' residues.

13.05.94 Original By: ACRM
07.07.14 Use bl prefix for functions By: CTP

Definition at line 107 of file KnownSeqLen.c.

int blNumericAffineAlign	(	int *	seq1,
		int	length1,
		int *	seq2,
		int	length2,
		BOOL	verbose,
		BOOL	identity,
		int	penalty,
		int	penext,
		int *	align1,
		int *	align2,
		int *	align_len
	)

Parameters

[in]	*seq1	First sequence of tokens
[in]	length1	First sequence length
[in]	*seq2	Second sequence of tokens
[in]	length2	Second sequence length
[in]	verbose	Display N&W matrix
[in]	identity	Use identity matrix
[in]	penalty	Gap insertion penalty value
[in]	penext	Extension penalty
[out]	*align1	Sequence 1 aligned
[out]	*align2	Sequence 2 aligned
[out]	*align_len	Alignment length

Returns: Alignment score (0 on error)

Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.

The sequences come as integer arrays containing numeric tokens

Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).

Identical to align.c/affinealign(), but uses integer arrays

08.03.00 Original based on align.c/affinealign() 06.03.00 By: ACRM
07.07.14 Use bl prefix for functions By: CTP

Definition at line 412 of file NumericAlign.c.

int blNumericCalcMDMScore	(	int	resa,
		int	resb
	)

Parameters

[in]	resa	First token
[in]	resb	Second token

Returns: score

Calculate score from static globally stored mutation data matrix

Identical to align.c/CalcMDMScore(), but uses a different static score array and takes integer parameters. These are used as direct lookups into the score array rather than being searched.

08.03.00 Original based on align.c/CalcMDMScore() 11.07.96 By: ACRM
07.07.14 Use bl prefix for functions By: CTP

Definition at line 342 of file NumericAlign.c.

BOOL blNumericReadMDM ( char * mdmfile )

Parameters

[in] *mdmfile Mutation data matrix filename

Returns: Success?

Read mutation data matrix into static global arrays. The matrix may have comments at the start introduced with a ! in the first column. The matrix must be complete (i.e. a triangular matrix will not work). A line describing the residue types must appear, and may be placed before or after the matrix itself

Identical to align.c/ReadMDM() but reads into a different static 2D array and doesn't read a symbol identifier line from the file as the symbols are numeric and always start from 1 (0 is used as the insert character)

08.03.00 Original based on align.c/ReadMDM() 26.07.95 By: ACRM
06.02.03 Fixed for new version of GetWord()
07.07.14 Use bl prefix for functions By: CTP

Definition at line 258 of file NumericAlign.c.

char* blOnethr ( char one )

Parameters

[in] one One letter code

Returns: Three letter code (padded to 4 chars with a space)

Converts 1-letter code to 3-letter code (actually as 4 chars).

07.06.93 Original By: ACRM
25.07.95 If the gBioplibSeqNucleicAcid flag is set, assumes nucleic acids rather than amino acids
03.02.09 Fixed nucleic search - j was incrementing instead of decrementing!
07.07.14 Use bl prefix for functions By: CTP

Definition at line 223 of file throne.c.

BOOL blReadMDM ( char * mdmfile )

Parameters

[in] *mdmfile Mutation data matrix filename

Returns: Success?

Read mutation data matrix into static global arrays. The matrix may have comments at the start introduced with a ! in the first column. The matrix must be complete (i.e. a triangular matrix will not work). A line describing the residue types must appear, and may be placed before or after the matrix itself

07.10.92 Original
18.03.94 getc() -> fgetc()
24.11.94 Automatically looks in DATAENV if not found in current directory
28.02.95 Modified to read any size MDM and allow comments. Also allows the list of aa types before or after the actual matrix
26.07.95 Removed unused variables
06.02.03 Fixed for new version of GetWord()
07.04.09 Completely re-written to allow it to read BLAST style matrix files as well as the ones used previously. Allow comments introduced with # as well as !. Uses MAXWORD rather than hardcoded 16
07.07.14 Use bl prefix for functions By: CTP

Definition at line 871 of file align.c.

int blReadPIR	(	FILE *	fp,
		BOOL	DoInsert,
		char **	seqs,
		int	maxchain,
		SEQINFO *	seqinfo,
		BOOL *	punct,
		BOOL *	error
	)

Parameters

[in]	*fp	File pointer
[in]	DoInsert	TRUE: read '-' characters into the sequence; FALSE: skip '-' characters
[in]	maxchain	Max number of chains to read. This is the dimension of the seqs array. N.B. THIS SHOULD BE AT LEAST 1 MORE THAN THE EXPECTED MAXIMUM NUMBER OF SEQUENCES
[out]	**seqs	Array of character pointers which will be filled in with sequence information. Memory will be allocated for any sequence length.
[out]	*seqinfo	This structure will be filled in with extra information about the sequence. Header & title information and details of any punctuation.
[out]	*punct	TRUE if any punctuation found.
[out]	*error	TRUE if an error occurred (e.g. memory allocation)

Returns: Number of chains in this sequence. 0 if file ended, or no valid sequence entries found.

This is an all-singing, all-dancing PIR reader which should handle all legal PIR files and some (slightly) incorrect ones. The only requirements of the code are that the PIR file should have 2 title lines per entry, the first line starting with a > sign.

The routine will handle multiple sequence files. Successive calls will return information on the next entry. The routine will return 0 when there are no more entries.

Header line: Must start with >. Will handle files which don't have the proper P1; or F1; parts of the header as well as those which do.

Title line: Will read the name and source fields if correctly separated by a -, otherwise copies all information into the name.

Sequence: May contain allowed punctuation. This will set the punct flag and information on the types found will be placed in seqinfo. White space and line breaks are ignored. Each chain should end with a *, but the routine will accept the last chain of an entry with no *. While the standard requires upper case text, this routine will handle lower case and convert it to upper case. While the routine does pretty well at last chains not terminated with a *, a last chain ending with a / not followed by a * but followed by a text line will be identified as incomplete rather than truncated. If the DoInsert flag is set, - signs in the sequence will be read as part of the sequence, otherwise they will be skipped. This is an addition to the PIR standard.

Text lines: Text lines after an entry (beginning with R;, C;, A;, N; or F;) are ignored.

02.03.94 Original By: ACRM
03.03.94 Added / and = handling, upcasing, strcpy()->strncpy(), header lines without semi-colon, title lines without -
07.03.94 Added sequence insertion handling and DoInsert parameter.
11.05.94 buffer is now 504 characters (V38.0 spec allows 500 chars) Removes leading spaces from entry code and terminates at first space (V39.0 spec allows comments after the code).
28.02.95 Added check that buffer doesn't overflow. Check on nseq changed to >=
06.02.96 Removes trailing spaces from comment line
07.07.14 Use bl prefix for functions By: CTP

Definition at line 180 of file ReadPIR.c.

int blReadRawPIR	(	FILE *	fp,
		char **	seqs,
		int	maxchain,
		BOOL	upcase,
		SEQINFO *	seqinfo,
		BOOL *	error
	)

Parameters

[in]	*fp	File pointer
[in]	maxchain	Max number of chains to read. This is the dimension of the seqs array. N.B. THIS SHOULD BE AT LEAST 1 MORE THAN THE EXPECTED MAXIMUM NUMBER OF SEQUENCES
[in]	upcase	Should lower-case letters be upcased?
[out]	**seqs	Array of character pointers which will be filled in with sequence information. Memory will be allocated for any sequence length.
[out]	*seqinfo	This structure will be filled in with extra information about the sequence. Header & title information and details of any punctuation.
[out]	*error	TRUE if an error occurred (e.g. memory allocation)

Returns: Number of chains in this sequence. 0 if file ended, or no valid sequence entries found.

This is based on ReadPIR(), but reads all characters into the sequence arrays (i.e. all punctuation characters are read as is). This is useful when punctuation has been used to indicate consensus sequence features.

The only requirements of the code are that the PIR file should have 2 title lines per entry, the first line starting with a > sign. The routine will handle multiple sequence files. Successive calls will return information on the next entry. The routine will return 0 when there are no more entries.

Header line: Must start with >. Will handle files which don't have the proper P1; or F1; parts of the header as well as those which do.

Title line: Will read the name and source fields if correctly separated by a -, otherwise copies all information into the name.

White space and line breaks are ignored. Each chain should end with a *, but the routine will accept the last chain of an entry with no *. While the standard requires upper case text, this routine will handle lower case and convert it to upper case. While the routine does pretty well at last chains not terminated with a *, a last chain ending with a / not followed by a * but followed by a text line will be identified as incomplete rather than truncated. If the DoInsert flag is set, - signs in the sequence will be read as part of the sequence, otherwise they will be skipped. This is an addition to the PIR standard.

Text lines: Text lines after an entry (beginning with R;, C;, A;, N; or F;) are ignored.

28.02.95 Original based on ReadPIR() By: ACRM
13.03.95 chpos++ had got moved wrongly when adapting from ReadPIR(). Put it back fixing handling of text lines.
26.07.95 Removed unused variables
06.02.96 Remove any trailing spaces

Definition at line 169 of file ReadRawPIR.c.

int blReadSimplePIR	(	FILE *	fp,
		int	maxres,
		char **	seqs
	)

Parameters

[in]	*fp	File pointer
[in]	maxres	Max number of residues in chain.
[out]	**seqs	Array of pointers to sequences

Returns: Number of chains. 0 if error

Read a PIR file containing multiple chains of up to maxres amino acids. Each chain is returned in seqs[]. The number of chains is returned by the routine. 0 is returned if a memory allocation failed

01.06.91 Original
03.03.94 Added check on case before toupper(). Changed name.
18.03.94 Changed getc() to fgetc()
07.07.14 Use bl prefix for functions By: CTP

Definition at line 121 of file ReadSimplePIR.c.

void blSetMDMScoreWeight	(	char	resa,
		char	resb,
		REAL	weight
	)

Parameters

[in]	resa	First residue
[in]	resb	Second residue
[in]	weight	Weight to apply

Apply a weight to a particular amino acid substitution

26.08.14 Original By: ACRM

Definition at line 1408 of file align.c.

int blSplitSeq	(	char *	LinearSeq,
		char **	seqs
	)

Parameters

[in]	*LinearSeq	Array containing sequence with chains terminated by *'s
[out]	**seqs	Allocated set of character arrays containing one chain per array

Returns: Number of chains found

Splits a sequence stored as a linear array with each chain separated by a * into an array of sequences. Returns the number of chains found.

18.06.93 Original By: ACRM
09.07.93 Cleans up properly if allocation failed
07.09.94 Sequence space was being allocated one too small
07.07.14 Use bl prefix for functions By: CTP

Definition at line 115 of file SplitSeq.c.

char blThrone ( char * three )

Parameters

[in] *three Three letter code

Returns: One letter code

Converts 3-letter code to 1-letter code. Handles ASX and GLX as X

29.09.92 Original By: ACRM
11.03.94 Modified to handle ASX and GLX in the tables
25.07.95 Added handling of gBioplibSeqNucleicAcid
07.07.14 Use bl prefix for functions By: CTP

Definition at line 153 of file throne.c.

char blThronex ( char * three )

Parameters

[in] *three Three letter code

Returns: One letter code

Converts 3-letter code to 1-letter code. Handles ASX and GLX as B and Z.

29.09.92 Original By: ACRM
25.07.95 Added handling of gBioplibSeqNucleicAcid
07.07.14 Use bl prefix for functions By: CTP

Definition at line 188 of file throne.c.

int blTrueSeqLen ( char * sequence )

Parameters

[in] *sequence A sequence containing deletions

Returns: Length without deletions

Scans a 1-letter code sequence and calculates the length without '-' or ' ' residues.

14.04.94 Original By: ACRM
07.07.14 Use bl prefix for functions By: CTP

Definition at line 106 of file TrueSeqLen.c.

void blWriteOneStringPIR	(	FILE *	out,
		char *	label,
		char *	title,
		char *	sequence,
		char **	chainLabels,
		BOOL	ByChain,
		BOOL	doFasta
	)

Parameters

[in]	*out	File pointer
[in]	*label	Sequence label
[in]	*title	Sequence title
[in]	*sequence	Sequence (1-letter code) with chains separated by *
[in]	**chainLabels	Chain labels (may be set to NULL unless ByChain is set)
[in]	ByChain	Print a separate header for each chain
[in]	doFasta	Output FASTA format instead of PIR

Writes a PIR sequence file from a 1-letter code sequence. Multiple chains are split with '*'. If ByChain is set then the chainLabels array must be non-NULL and contain labels for each chain. Adds a terminating * if required.

10.05.94 Original By: ACRM
22.08.97 Can now handle chains separately
26.08.97 If chains are handled separately, don't bother writing out an empty chain
10.08.98 Basically a total rewrite to fix a bug which caused the header not to be printed with -c -p for a chain after one which was non-protein. Much simplified the code by printing the header at the beginning of a chain rather than end of previous chain.
18.10.00 Added code to write FASTA as well
11.06.15 Reset count=0 after a * - tidies up the output!

Definition at line 116 of file WritePIR.c.

int blZeroMDM ( void )

Returns: Maximum value in modified matrix

Modifies all values in the MDM such that the minimum value is 0

17.09.96 Original
07.07.14 Use bl prefix for functions By: CTP

Definition at line 1358 of file align.c.

Variable Documentation

BOOL gBioplibSeqNucleicAcid

Definition at line 130 of file throne.c.

Data Structures

Macros

Functions

Variables

Detailed Description

Description:

Usage:

Revision History:

Macro Definition Documentation

Function Documentation

Variable Documentation