|
#define | blMAXPIRLABEL 160 |
|
#define | ALLOCSIZE |
|
#define | blPDB2Seq(x) blDoPDB2Seq((x), FALSE, FALSE, FALSE) |
|
#define | blPDB2SeqX(x) blDoPDB2Seq((x), TRUE, FALSE, FALSE) |
|
#define | blPDB2SeqNoX(x) blDoPDB2Seq((x), FALSE, FALSE, TRUE) |
|
#define | blPDB2SeqXNoX(x) blDoPDB2Seq((x), TRUE, FALSE, TRUE) |
|
#define | blPDBProt2Seq(x) blDoPDB2Seq((x), FALSE, TRUE, FALSE) |
|
#define | blPDBProt2SeqX(x) blDoPDB2Seq((x), TRUE, TRUE, FALSE) |
|
#define | blPDBProt2SeqNoX(x) blDoPDB2Seq((x), FALSE, TRUE, TRUE) |
|
#define | blPDBProt2SeqXNoX(x) blDoPDB2Seq((x), TRUE, TRUE, TRUE) |
|
#define | blPDB2SeqByChain(x) blDoPDB2SeqByChain((x), FALSE, FALSE, FALSE) |
|
#define | blPDB2SeqXByChain(x) blDoPDB2SeqByChain((x), TRUE, FALSE, FALSE) |
|
#define | blPDB2SeqNoXByChain(x) blDoPDB2SeqByChain((x), FALSE, FALSE, TRUE) |
|
#define | blPDB2SeqXNoXByChain(x) blDoPDB2SeqByChain((x), TRUE, FALSE, TRUE) |
|
#define | blPDBProt2SeqByChain(x) blDoPDB2SeqByChain((x), FALSE, TRUE, FALSE) |
|
#define | blPDBProt2SeqXByChain(x) blDoPDB2SeqByChain((x), TRUE, TRUE, FALSE) |
|
#define | blPDBProt2SeqNoXByChain(x) blDoPDB2SeqByChain((x), FALSE, TRUE, TRUE) |
|
#define | blPDBProt2SeqXNoXByChain(x) blDoPDB2SeqByChain((x), TRUE, TRUE, TRUE) |
|
#define | _SEQ_H_DEPRECATED |
|
|
char | blThrone (char *three) |
|
char | blThronex (char *three) |
|
char * | blOnethr (char one) |
|
char * | blDoPDB2Seq (PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly, BOOL NoX) |
|
HASHTABLE * | blDoPDB2SeqByChain (PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly, BOOL NoX) |
|
int | blSplitSeq (char *LinearSeq, char **seqs) |
|
int | blReadSimplePIR (FILE *fp, int maxres, char **seqs) |
|
int | blReadPIR (FILE *fp, BOOL DoInsert, char **seqs, int maxchain, SEQINFO *seqinfo, BOOL *punct, BOOL *error) |
|
int | blReadRawPIR (FILE *fp, char **seqs, int maxchain, BOOL upcase, SEQINFO *seqinfo, BOOL *error) |
|
int | blAlign (char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, char *align1, char *align2, int *align_len) |
|
int | blAffinealign (char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, char *align1, char *align2, int *align_len) |
|
int | blCalcMDMScore (char resa, char resb) |
|
int | blAffinealignuc (char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, char *align1, char *align2, int *align_len) |
|
int | blCalcMDMScoreUC (char resa, char resb) |
|
BOOL | blReadMDM (char *mdmfile) |
|
int | blZeroMDM (void) |
|
char | blDNAtoAA (char *dna) |
|
int | blTrueSeqLen (char *sequence) |
|
int | blKnownSeqLen (char *sequence) |
|
BOOL | blNumericReadMDM (char *mdmfile) |
|
int | blNumericCalcMDMScore (int resa, int resb) |
|
int | blNumericAffineAlign (int *seq1, int length1, int *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, int *align1, int *align2, int *align_len) |
|
void | blSetMDMScoreWeight (char resa, char resb, REAL weight) |
|
void | blWriteOneStringPIR (FILE *out, char *label, char *title, char *sequence, char **chains, BOOL ByChain, BOOL doFasta) |
|
Header file for sequence handling.
- Version
- V2.16
- Date
- 30.11.15
- Copyright
- (c) UCL / Dr. Andrew C. R. Martin 1991-2015
- Author
- Dr. Andrew C. R. Martin
- Institute of Structural & Molecular Biology, University College London, Gower Street, London. WC1E 6BT.
- andrew@bioinf.org.uk andrew.martin@ucl.ac.uk
This code is NOT IN THE PUBLIC DOMAIN, but it may be copied according to the conditions laid out in the accompanying file COPYING.DOC.
The code may be modified as required, but any modifications must be documented so that the person responsible can be identified.
The code may not be sold commercially or included as part of a commercial product except as described in the file COPYING.DOC.
Description:
Usage:
Revision History:
- V2.0 11.03.94 Original V2 release
- V2.1 11.05.94 Added DNAtoAA() & TrueSeqLen() prototypes
- V2.2 13.05.93 Added KnownSeqLen() prototype
- V2.3 28.02.95 Added ReadRawPIR()
- V2.4 25.07.95 Added the gBioplibSeqNucleicAcid external for throne()
- V2.5 11.07.96 Added CalcMDMScore()
- V2.6 17.09.95 Added ZeroMDM()
- V2.7 26.08.97 Added macro interfaces to new DoPDB2Seq()
- V2.8 08.03.00 Added Numeric***() alignment routines
- V2.9 02.10.00 Modified DoPDB2Seq()
- V2.10 27.02.07 Added CalcMDMScoreUC() and affinealignuc()
- V2.11 07.07.14 Use bl prefix for functions. Renamed PDB2Seq macros to blPDB2Seq and added PDB2Seq defines to deprecated.h By: CTP
- V2.12 31.07.14 Updated deprecation: Removed deprecated.h, added prototypes for renamed functions and defines for PDB2Seq macros. By: CTP
- V2.13 14.08.14 Moved deprecated function prototypes to deprecated.h By: CTP
- V2.14 26.08.14 Added blSetMDMScoreWeight()
- V2.15 11.06.15 Added blWriteOneStringPIR()
- V2.16 30.11.15 Added wrapper macros for blDoPDB2SeqByChain() and prototype
Definition in file seq.h.
int blAffinealignuc |
( |
char * |
seq1, |
|
|
int |
length1, |
|
|
char * |
seq2, |
|
|
int |
length2, |
|
|
BOOL |
verbose, |
|
|
BOOL |
identity, |
|
|
int |
penalty, |
|
|
int |
penext, |
|
|
char * |
align1, |
|
|
char * |
align2, |
|
|
int * |
align_len |
|
) |
| |
- Parameters
-
[in] | *seq1 | First sequence |
[in] | length1 | First sequence length |
[in] | *seq2 | Second sequence |
[in] | length2 | Second sequence length |
[in] | verbose | Display N&W matrix |
[in] | identity | Use identity matrix |
[in] | penalty | Gap insertion penalty value |
[in] | penext | Extension penalty |
[out] | *align1 | Sequence 1 aligned |
[out] | *align2 | Sequence 2 aligned |
[out] | *align_len | Alignment length |
- Returns
- Alignment score (0 on error)
Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.
Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).
- 07.10.92 Adapted from original written while at NIMR
- 08.10.92 Split into separate routines
- 09.10.92 Changed best structure to simple integers, moved SearchForBest() into TraceBack()
- 21.08.95 Was only filling in the bottom right cell at initialisation rather than all the right hand column and bottom row
- 11.07.96 Changed calls to calcscore() to CalcMDMScore()
- 06.03.00 Changed name to affinealign() (the routine align() is provided as a backwards compatible wrapper). Added penext parameter. Now supports affine gap penalties with separate opening and extension penalties. The code now maintains the path as it goes.
- 27.02.07 Exactly as affinealign() but upcases characters before comparison
- 07.07.14 Use bl prefix for functions By: CTP
NOTE: ANY CHANGES SHOULD BE PROPAGATED TO affinealign() ******
Definition at line 583 of file align.c.
int blAlign |
( |
char * |
seq1, |
|
|
int |
length1, |
|
|
char * |
seq2, |
|
|
int |
length2, |
|
|
BOOL |
verbose, |
|
|
BOOL |
identity, |
|
|
int |
penalty, |
|
|
char * |
align1, |
|
|
char * |
align2, |
|
|
int * |
align_len |
|
) |
| |
- Parameters
-
[in] | *seq1 | First sequence |
[in] | length1 | First sequence length |
[in] | *seq2 | Second sequence |
[in] | length2 | Second sequence length |
[in] | verbose | Display N&W matrix |
[in] | identity | Use identity matrix |
[in] | penalty | Gap insertion penalty value |
[out] | *align1 | Sequence 1 aligned |
[out] | *align2 | Sequence 2 aligned |
[out] | *align_len | Alignment length |
- Returns
- Alignment score (0 on error)
Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.
A single gap penalty is used, so gap extension incurs no further penalty.
Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).
- 06.03.00 Implemented as a wrapper to affinealign() which is the old align() routine, plus support for affine gap penalties, plus new traceback code based on storing the path as we go
- 07.07.14 Use bl prefix for functions By: CTP
Definition at line 214 of file align.c.
- Parameters
-
[in] | *pdb | PDB linked list |
[in] | DoAsxGlx | Handle Asx and Glx as B and Z rather than X |
[in] | ProtOnly | Don't do DNA/RNA; these simply don't get done rather than being handled as X |
[in] | NoX | Skip amino acids which would be assigned as X |
- Returns
- Allocated character array containing sequence
malloc()'s an array containing the 1-letter sequence corresponding to an input PDB linked list. Returns NULL if given a NULL parameter or memory allocation fails. Puts *'s in the sequence for multi-chains.
This routine is normally called via the macro interfaces: blPDB2Seq(pdb), blPDB2SeqX(pdb), blPDBProt2Seq(pdb), blPDBProt2SeqX(pdb). Those with Prot in their names handle protein only; those with X handle Asx/Glx as B/Z rather than as X.
- 29.09.92 Original By: ACRM
- 07.06.93 Corrected allocation.
- 18.06.93 Handles multi-chains and skips NTER and CTER residues
- 13.05.94 Check for chain change before copy residue (!) (Bug reported by Bob MacCullum)
- 19.07.95 Added check for ATOM records
- 24.01.96 Returns blank string (rather than core dumping!) if the linked list contained no ATOM records
- 26.08.97 Changed to doPDB2Seq with extra parameters (DoAsxGlx & ProtOnly). The old calling forms have now become macros
- 02.10.00 Added NoX
- 10.06.05 Changed the initialization of rescount, resnum, etc. so it correctly points to the first residue. This solves a bug with CA-only chains where it was undercounting by 1
- 04.02.14 Use CHAINMATCH By: CTP
- 07.07.14 Use bl prefix for functions By: CTP
Definition at line 146 of file PDB2Seq.c.
- Parameters
-
[in] | *pdb | PDB linked list |
[in] | DoAsxGlx | Handle Asx and Glx as B and Z rather than X |
[in] | ProtOnly | Don't do DNA/RNA; these simply don't get done rather than being handled as X |
[in] | NoX | Skip amino acids which would be assigned as X |
- Returns
- A hash of 1-letter code sequences indexed by chain label
Reads sequence from ATOM records in 1-letter code, storing the results in a hash indexed by chain label.
This routine is normally called via the macro interfaces: blPDB2SeqByChain(pdb), blPDB2SeqXByChain(pdb), blPDBProt2SeqByChain(pdb), blPDBProt2SeqXByChain(pdb). Those with Prot in their names handle protein only; those with X handle Asx/Glx as B/Z rather than as X.
Definition at line 294 of file PDB2Seq.c.
int blNumericAffineAlign |
( |
int * |
seq1, |
|
|
int |
length1, |
|
|
int * |
seq2, |
|
|
int |
length2, |
|
|
BOOL |
verbose, |
|
|
BOOL |
identity, |
|
|
int |
penalty, |
|
|
int |
penext, |
|
|
int * |
align1, |
|
|
int * |
align2, |
|
|
int * |
align_len |
|
) |
| |
- Parameters
-
[in] | *seq1 | First sequence of tokens |
[in] | length1 | First sequence length |
[in] | *seq2 | Second sequence of tokens |
[in] | length2 | Second sequence length |
[in] | verbose | Display N&W matrix |
[in] | identity | Use identity matrix |
[in] | penalty | Gap insertion penalty value |
[in] | penext | Extension penalty |
[out] | *align1 | Sequence 1 aligned |
[out] | *align2 | Sequence 2 aligned |
[out] | *align_len | Alignment length |
- Returns
- Alignment score (0 on error)
Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.
The sequences come as integer arrays containing numeric tokens
Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).
Identical to align.c/affinealign(), but uses integer arrays
- 08.03.00 Original based on align.c/affinealign() 06.03.00 By: ACRM
- 07.07.14 Use bl prefix for functions By: CTP
Definition at line 412 of file NumericAlign.c.
BOOL blNumericReadMDM |
( |
char * |
mdmfile | ) |
|
- Parameters
-
[in] | *mdmfile | Mutation data matrix filename |
- Returns
- Success?
Read mutation data matrix into static global arrays. The matrix may have comments at the start introduced with a ! in the first column. The matrix must be complete (i.e. a triangular matrix will not work). A line describing the residue types must appear, and may be placed before or after the matrix itself
Identical to align.c/ReadMDM() but reads into a different static 2D array and doesn't read a symbol identifier line from the file as the symbols are numeric and always start from 1 (0 is used as the insert character)
- 08.03.00 Original based on align.c/ReadMDM() 26.07.95 By: ACRM
- 06.02.03 Fixed for new version of GetWord()
- 07.07.14 Use bl prefix for functions By: CTP
Definition at line 258 of file NumericAlign.c.
int blReadPIR |
( |
FILE * |
fp, |
|
|
BOOL |
DoInsert, |
|
|
char ** |
seqs, |
|
|
int |
maxchain, |
|
|
SEQINFO * |
seqinfo, |
|
|
BOOL * |
punct, |
|
|
BOOL * |
error |
|
) |
| |
- Parameters
-
[in] | *fp | File pointer |
[in] | DoInsert | TRUE: read '-' characters into the sequence; FALSE: skip '-' characters |
[in] | maxchain | Max number of chains to read. This is the dimension of the seqs array. N.B. THIS SHOULD BE AT LEAST 1 MORE THAN THE EXPECTED MAXIMUM NUMBER OF SEQUENCES |
[out] | **seqs | Array of character pointers which will be filled in with sequence information. Memory will be allocated for any sequence length. |
[out] | *seqinfo | This structure will be filled in with extra information about the sequence. Header & title information and details of any punctuation. |
[out] | *punct | TRUE if any punctuation found. |
[out] | *error | TRUE if an error occurred (e.g. memory allocation) |
- Returns
- Number of chains in this sequence. 0 if file ended, or no valid sequence entries found.
This is an all-singing, all-dancing PIR reader which should handle all legal PIR files and some (slightly) incorrect ones. The only requirements of the code are that the PIR file should have 2 title lines per entry, the first line starting with a > sign.
The routine will handle multiple sequence files. Successive calls will return information on the next entry. The routine will return 0 when there are no more entries.
Header line: Must start with >. Will handle files which don't have the proper P1; or F1; parts of the header as well as those which do.
Title line: Will read the name and source fields if correctly separated by a -, otherwise copies all information into the name.
Sequence: May contain allowed punctuation. This will set the punct flag and information on the types found will be placed in seqinfo. White space and line breaks are ignored. Each chain should end with a *, but the routine will accept the last chain of an entry with no *. While the standard requires upper case text, this routine will handle lower case and convert it to upper case. While the routine does pretty well at last chains not terminated with a *, a last chain ending with a / not followed by a * but followed by a text line will be identified as incomplete rather than truncated. If the DoInsert flag is set, - signs in the sequence will be read as part of the sequence, otherwise they will be skipped. This is an addition to the PIR standard.
Text lines: Text lines after an entry (beginning with R;, C;, A;, N; or F;) are ignored.
- 02.03.94 Original By: ACRM
- 03.03.94 Added / and = handling, upcasing, strcpy()->strncpy(), header lines without semi-colon, title lines without -
- 07.03.94 Added sequence insertion handling and DoInsert parameter.
- 11.05.94 buffer is now 504 characters (V38.0 spec allows 500 chars) Removes leading spaces from entry code and terminates at first space (V39.0 spec allows comments after the code).
- 28.02.95 Added check that buffer doesn't overflow. Check on nseq changed to >=
- 06.02.96 Removes trailing spaces from comment line
- 07.07.14 Use bl prefix for functions By: CTP
Definition at line 180 of file ReadPIR.c.
int blReadRawPIR |
( |
FILE * |
fp, |
|
|
char ** |
seqs, |
|
|
int |
maxchain, |
|
|
BOOL |
upcase, |
|
|
SEQINFO * |
seqinfo, |
|
|
BOOL * |
error |
|
) |
| |
- Parameters
-
[in] | *fp | File pointer |
[in] | maxchain | Max number of chains to read. This is the dimension of the seqs array. N.B. THIS SHOULD BE AT LEAST 1 MORE THAN THE EXPECTED MAXIMUM NUMBER OF SEQUENCES |
[in] | upcase | Should lower-case letters be upcased? |
[out] | **seqs | Array of character pointers which will be filled in with sequence information. Memory will be allocated for any sequence length. |
[out] | *seqinfo | This structure will be filled in with extra information about the sequence. Header & title information and details of any punctuation. |
[out] | *error | TRUE if an error occurred (e.g. memory allocation) |
- Returns
- Number of chains in this sequence. 0 if file ended, or no valid sequence entries found.
This is based on ReadPIR(), but reads all characters into the sequence arrays (i.e. all punctuation characters are read as is). This is useful when punctuation has been used to indicate consensus sequence features.
The only requirements of the code are that the PIR file should have 2 title lines per entry, the first line starting with a > sign. The routine will handle multiple sequence files. Successive calls will return information on the next entry. The routine will return 0 when there are no more entries.
Header line: Must start with >. Will handle files which don't have the proper P1; or F1; parts of the header as well as those which do.
Title line: Will read the name and source fields if correctly separated by a -, otherwise copies all information into the name.
White space and line breaks are ignored. Each chain should end with a *, but the routine will accept the last chain of an entry with no *. While the standard requires upper case text, this routine will handle lower case and convert it to upper case if the upcase flag is set. While the routine does pretty well at last chains not terminated with a *, a last chain ending with a / not followed by a * but followed by a text line will be identified as incomplete rather than truncated. If the DoInsert flag is set, - signs in the sequence will be read as part of the sequence, otherwise they will be skipped. This is an addition to the PIR standard.
Text lines: Text lines after an entry (beginning with R;, C;, A;, N; or F;) are ignored.
- 28.02.95 Original based on ReadPIR() By: ACRM
- 13.03.95 chpos++ had got moved wrongly when adapting from ReadPIR(). Put it back fixing handling of text lines.
- 26.07.95 Removed unused variables
- 06.02.96 Remove any trailing spaces
Definition at line 169 of file ReadRawPIR.c.