|
Bioplib
Protein Structure C Library
|
Header file for sequence handling. More...
#include "MathType.h" #include "SysDefs.h" #include "pdb.h" #include "hash.h" #include "deprecated.h" Go to the source code of this file.
Data Structures | |
| struct | SEQINFO |
Functions | |
| char | blThrone (char *three) |
| char | blThronex (char *three) |
| char * | blOnethr (char one) |
| char * | blDoPDB2Seq (PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly, BOOL NoX) |
| HASHTABLE * | blDoPDB2SeqByChain (PDB *pdb, BOOL DoAsxGlx, BOOL ProtOnly, BOOL NoX) |
| int | blSplitSeq (char *LinearSeq, char **seqs) |
| int | blReadSimplePIR (FILE *fp, int maxres, char **seqs) |
| int | blReadPIR (FILE *fp, BOOL DoInsert, char **seqs, int maxchain, SEQINFO *seqinfo, BOOL *punct, BOOL *error) |
| int | blReadRawPIR (FILE *fp, char **seqs, int maxchain, BOOL upcase, SEQINFO *seqinfo, BOOL *error) |
| int | blAlign (char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, char *align1, char *align2, int *align_len) |
| int | blAffinealign (char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, char *align1, char *align2, int *align_len) |
| int | blCalcMDMScore (char resa, char resb) |
| int | blAffinealignuc (char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, char *align1, char *align2, int *align_len) |
| int | blCalcMDMScoreUC (char resa, char resb) |
| BOOL | blReadMDM (char *mdmfile) |
| int | blZeroMDM (void) |
| char | blDNAtoAA (char *dna) |
| int | blTrueSeqLen (char *sequence) |
| int | blKnownSeqLen (char *sequence) |
| BOOL | blNumericReadMDM (char *mdmfile) |
| int | blNumericCalcMDMScore (int resa, int resb) |
| int | blNumericAffineAlign (int *seq1, int length1, int *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, int *align1, int *align2, int *align_len) |
| void | blSetMDMScoreWeight (char resa, char resb, REAL weight) |
| void | blWriteOneStringPIR (FILE *out, char *label, char *title, char *sequence, char **chains, BOOL ByChain, BOOL doFasta) |
Variables | |
| BOOL | gBioplibSeqNucleicAcid |
Header file for sequence handling.
This code is NOT IN THE PUBLIC DOMAIN, but it may be copied according to the conditions laid out in the accompanying file COPYING.DOC.
The code may be modified as required, but any modifications must be documented so that the person responsible can be identified.
The code may not be sold commercially or included as part of a commercial product except as described in the file COPYING.DOC.
Definition in file seq.h.
| #define ALLOCSIZE |
| #define blPDB2SeqByChain | ( | x | ) | blDoPDB2SeqByChain((x), FALSE, FALSE, FALSE) |
| #define blPDB2SeqNoX | ( | x | ) | blDoPDB2Seq((x), FALSE, FALSE, TRUE) |
| #define blPDB2SeqNoXByChain | ( | x | ) | blDoPDB2SeqByChain((x), FALSE, FALSE, TRUE) |
| #define blPDB2SeqXByChain | ( | x | ) | blDoPDB2SeqByChain((x), TRUE, FALSE, FALSE) |
| #define blPDB2SeqXNoX | ( | x | ) | blDoPDB2Seq((x), TRUE, FALSE, TRUE) |
| #define blPDB2SeqXNoXByChain | ( | x | ) | blDoPDB2SeqByChain((x), TRUE, FALSE, TRUE) |
| #define blPDBProt2Seq | ( | x | ) | blDoPDB2Seq((x), FALSE, TRUE, FALSE) |
| #define blPDBProt2SeqByChain | ( | x | ) | blDoPDB2SeqByChain((x), FALSE, TRUE, FALSE) |
| #define blPDBProt2SeqNoX | ( | x | ) | blDoPDB2Seq((x), FALSE, TRUE, TRUE) |
| #define blPDBProt2SeqNoXByChain | ( | x | ) | blDoPDB2SeqByChain((x), FALSE, TRUE, TRUE) |
| #define blPDBProt2SeqX | ( | x | ) | blDoPDB2Seq((x), TRUE, TRUE, FALSE) |
| #define blPDBProt2SeqXByChain | ( | x | ) | blDoPDB2SeqByChain((x), TRUE, TRUE, FALSE) |
| #define blPDBProt2SeqXNoX | ( | x | ) | blDoPDB2Seq((x), TRUE, TRUE, TRUE) |
| #define blPDBProt2SeqXNoXByChain | ( | x | ) | blDoPDB2SeqByChain((x), TRUE, TRUE, TRUE) |
| int blAffinealign | ( | char * | seq1, |
| int | length1, | ||
| char * | seq2, | ||
| int | length2, | ||
| BOOL | verbose, | ||
| BOOL | identity, | ||
| int | penalty, | ||
| int | penext, | ||
| char * | align1, | ||
| char * | align2, | ||
| int * | align_len | ||
| ) |
| [in] | *seq1 | First sequence |
| [in] | length1 | First sequence length |
| [in] | *seq2 | Second sequence |
| [in] | length2 | Second sequence length |
| [in] | verbose | Display N&W matrix |
| [in] | identity | Use identity matrix |
| [in] | penalty | Gap insertion penalty value |
| [in] | penext | Extension penalty |
| [out] | *align1 | Sequence 1 aligned |
| [out] | *align2 | Sequence 2 aligned |
| [out] | *align_len | Alignment length |
Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.
Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).
NOTE: ANY CHANGES SHOULD BE PROPAGATED TO affinealignuc() ******
| int blAffinealignuc | ( | char * | seq1, |
| int | length1, | ||
| char * | seq2, | ||
| int | length2, | ||
| BOOL | verbose, | ||
| BOOL | identity, | ||
| int | penalty, | ||
| int | penext, | ||
| char * | align1, | ||
| char * | align2, | ||
| int * | align_len | ||
| ) |
| [in] | *seq1 | First sequence |
| [in] | length1 | First sequence length |
| [in] | *seq2 | Second sequence |
| [in] | length2 | Second sequence length |
| [in] | verbose | Display N&W matrix |
| [in] | identity | Use identity matrix |
| [in] | penalty | Gap insertion penalty value |
| [in] | penext | Extension penalty |
| [out] | *align1 | Sequence 1 aligned |
| [out] | *align2 | Sequence 2 aligned |
| [out] | *align_len | Alignment length |
Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.
Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).
NOTE: ANY CHANGES SHOULD BE PROPAGATED TO affinealign() ******
| int blAlign | ( | char * | seq1, |
| int | length1, | ||
| char * | seq2, | ||
| int | length2, | ||
| BOOL | verbose, | ||
| BOOL | identity, | ||
| int | penalty, | ||
| char * | align1, | ||
| char * | align2, | ||
| int * | align_len | ||
| ) |
| [in] | *seq1 | First sequence |
| [in] | length1 | First sequence length |
| [in] | *seq2 | Second sequence |
| [in] | length2 | Second sequence length |
| [in] | verbose | Display N&W matrix |
| [in] | identity | Use identity matrix |
| [in] | penalty | Gap insertion penalty value |
| [out] | *align1 | Sequence 1 aligned |
| [out] | *align2 | Sequence 2 aligned |
| [out] | *align_len | Alignment length |
Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.
A single gap penalty is used, so gap extension incurs no further penalty.
Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).
| int blCalcMDMScore | ( | char | resa, |
| char | resb | ||
| ) |
| [in] | resa | First residue |
| [in] | resb | Second residue |
Calculate score from static globally stored mutation data matrix
If both residues are set as '\0' it will simply silence all warnings
| int blCalcMDMScoreUC | ( | char | resa, |
| char | resb | ||
| ) |
| [in] | resa | First residue |
| [in] | resb | Second residue |
Calculate score from static globally stored mutation data matrix
| char blDNAtoAA | ( | char * | dna | ) |
| [in] | *dna | DNA/RNA codon |
Converts a nucleic acid codon to the 1-letter amino acid equivalent. Termination codons are returned as X. No special action is taken for initiation codons.
| [in] | *pdb | PDB linked list |
| [in] | DoAsxGlx | Handle Asx and Glx as B and Z rather than X |
| [in] | ProtOnly | Don't do DNA/RNA; these simply don't get done rather than being handled as X |
| [in] | NoX | Skip amino acids which would be assigned as X |
malloc()'s an array containing the 1-letter sequence corresponding to an input PDB linked list. Returns NULL if given a NULL parameter or memory allocation fails. Puts *'s in the sequence for multi-chains.
This routine is normally called via the macro interfaces: PDB2Seq(pdb), PDB2SeqX(pdb), PDBProt2Seq(pdb), PDBProt2SeqX(pdb). Those with Prot in their names handle protein only; those with X handle Asx/Glx as B/Z rather than as X.
| [in] | *pdb | PDB linked list |
| [in] | DoAsxGlx | Handle Asx and Glx as B and Z rather than X |
| [in] | ProtOnly | Don't do DNA/RNA; these simply don't get done rather than being handled as X |
| [in] | NoX | Skip amino acids which would be assigned as X |
Reads sequence from ATOM records in 1-letter code, storing the results in a hash indexed by chain label.
This routine is normally called via the macro interfaces: PDB2SeqByChain(pdb), PDB2SeqXByChain(pdb), PDBProt2SeqByChain(pdb), PDBProt2SeqXByChain(pdb). Those with Prot in their names handle protein only; those with X handle Asx/Glx as B/Z rather than as X.
| int blKnownSeqLen | ( | char * | sequence | ) |
| [in] | *sequence | A sequence containing deletions |
Scans a 1-letter code sequence and calculates the length without '-', ' ' or '?' residues.
Definition at line 107 of file KnownSeqLen.c.
| int blNumericAffineAlign | ( | int * | seq1, |
| int | length1, | ||
| int * | seq2, | ||
| int | length2, | ||
| BOOL | verbose, | ||
| BOOL | identity, | ||
| int | penalty, | ||
| int | penext, | ||
| int * | align1, | ||
| int * | align2, | ||
| int * | align_len | ||
| ) |
| [in] | *seq1 | First sequence of tokens |
| [in] | length1 | First sequence length |
| [in] | *seq2 | Second sequence of tokens |
| [in] | length2 | Second sequence length |
| [in] | verbose | Display N&W matrix |
| [in] | identity | Use identity matrix |
| [in] | penalty | Gap insertion penalty value |
| [in] | penext | Extension penalty |
| [out] | *align1 | Sequence 1 aligned |
| [out] | *align2 | Sequence 2 aligned |
| [out] | *align_len | Alignment length |
Perform simple N&W alignment of seq1 and seq2. No window is used, so will be slow for long sequences.
The sequences come as integer arrays containing numeric tokens
Note that you must allocate sufficient memory for the aligned sequences. The easy way to do this is to ensure that align1 and align2 are of length (length1+length2).
Identical to align.c/affinealign(), but uses integer arrays
Definition at line 412 of file NumericAlign.c.
| int blNumericCalcMDMScore | ( | int | resa, |
| int | resb | ||
| ) |
| [in] | resa | First token |
| [in] | resb | Second token |
Calculate score from static globally stored mutation data matrix
Identical to align.c/CalcMDMScore(), but uses a different static score array and takes integer parameters. These are used as direct lookups into the score array rather than being searched.
Definition at line 342 of file NumericAlign.c.
| BOOL blNumericReadMDM | ( | char * | mdmfile | ) |
| [in] | *mdmfile | Mutation data matrix filename |
Read mutation data matrix into static global arrays. The matrix may have comments at the start introduced with a ! in the first column. The matrix must be complete (i.e. a triangular matrix will not work). A line describing the residue types must appear, and may be placed before or after the matrix itself
Identical to align.c/ReadMDM() but reads into a different static 2D array and doesn't read a symbol identifier line from the file as the symbols are numeric and always start from 1 (0 is used as the insert character)
Definition at line 258 of file NumericAlign.c.
| char* blOnethr | ( | char | one | ) |
| [in] | one | One letter code |
Converts 1-letter code to 3-letter code (actually as 4 chars).
| BOOL blReadMDM | ( | char * | mdmfile | ) |
| [in] | *mdmfile | Mutation data matrix filename |
Read mutation data matrix into static global arrays. The matrix may have comments at the start introduced with a ! in the first column. The matrix must be complete (i.e. a triangular matrix will not work). A line describing the residue types must appear, and may be placed before or after the matrix itself
| int blReadPIR | ( | FILE * | fp, |
| BOOL | DoInsert, | ||
| char ** | seqs, | ||
| int | maxchain, | ||
| SEQINFO * | seqinfo, | ||
| BOOL * | punct, | ||
| BOOL * | error | ||
| ) |
| [in] | *fp | File pointer |
| [in] | DoInsert | TRUE Read - characters into the sequence FALSE Skip - characters |
| [in] | maxchain | Max number of chains to read. This is the dimension of the seqs array. N.B. THIS SHOULD BE AT LEAST 1 MORE THAN THE EXPECTED MAXIMUM NUMBER OF SEQUENCES |
| [out] | **seqs | Array of character pointers which will be filled in with sequence information. Memory will be allocated for any sequence length. |
| [out] | *seqinfo | This structure will be filled in with extra information about the sequence. Header & title information and details of any punctuation. |
| [out] | *punct | TRUE if any punctuation found. |
| [out] | *error | TRUE if an error occurred (e.g. memory allocation) |
This is an all-singing, all-dancing PIR reader which should handle all legal PIR files and some (slightly) incorrect ones. The only requirements of the code are that the PIR file should have 2 title lines per entry, the first line starting with a > sign.
The routine will handle multiple sequence files. Successive calls will return information on the next entry. The routine will return 0 when there are no more entries.
Header line: Must start with >. Will handle files which don't have the proper P1; or F1; parts of the header as well as those which do.
Title line: Will read the name and source fields if correctly separated by a -, otherwise copies all information into the name.
Sequence: May contain allowed punctuation. This will set the punct flag and information on the types found will be placed in seqinfo. White space and line breaks are ignored. Each chain should end with a *, but the routine will accept the last chain of an entry with no *. While the standard requires upper case text, this routine will handle lower case and convert it to upper case. While the routine does pretty well at last chains not terminated with a *, a last chain ending with a / not followed by a * but followed by a text line will be identified as incomplete rather than truncated. If the DoInsert flag is set, - signs in the sequence will be read as part of the sequence, otherwise they will be skipped. This is an addition to the PIR standard.
Text lines: Text lines after an entry (beginning with R;, C;, A;, N; or F;) are ignored.
| int blReadRawPIR | ( | FILE * | fp, |
| char ** | seqs, | ||
| int | maxchain, | ||
| BOOL | upcase, | ||
| SEQINFO * | seqinfo, | ||
| BOOL * | error | ||
| ) |
| [in] | *fp | File pointer |
| [in] | maxchain | Max number of chains to read. This is the dimension of the seqs array. N.B. THIS SHOULD BE AT LEAST 1 MORE THAN THE EXPECTED MAXIMUM NUMBER OF SEQUENCES |
| [in] | upcase | Should lower-case letters be upcased? |
| [out] | **seqs | Array of character pointers which will be filled in with sequence information. Memory will be allocated for any sequence length. |
| [out] | *seqinfo | This structure will be filled in with extra information about the sequence. Header & title information and details of any punctuation. |
| [out] | *error | TRUE if an error occurred (e.g. memory allocation) |
This is based on ReadPIR(), but reads all characters into the sequence arrays (i.e. all punctuation characters are read as is). This is useful when punctuation has been used to indicate consensus sequence features.
The only requirements of the code are that the PIR file should have 2 title lines per entry, the first line starting with a > sign. The routine will handle multiple sequence files. Successive calls will return information on the next entry. The routine will return 0 when there are no more entries.
Header line: Must start with >. Will handle files which don't have the proper P1; or F1; parts of the header as well as those which do.
Title line: Will read the name and source fields if correctly separated by a -, otherwise copies all information into the name.
White space and line breaks are ignored. Each chain should end with a *, but the routine will accept the last chain of an entry with no *. While the standard requires upper case text, this routine will handle lower case and convert it to upper case. While the routine does pretty well at last chains not terminated with a *, a last chain ending with a / not followed by a * but followed by a text line will be identified as incomplete rather than truncated. If the DoInsert flag is set, - signs in the sequence will be read as part of the sequence, otherwise they will be skipped. This is an addition to the PIR standard.
Text lines: Text lines after an entry (beginning with R;, C;, A;, N; or F;) are ignored.
Definition at line 169 of file ReadRawPIR.c.
| int blReadSimplePIR | ( | FILE * | fp, |
| int | maxres, | ||
| char ** | seqs | ||
| ) |
| [in] | *fp | File pointer |
| [in] | maxres | Max number of residues in chain. |
| [out] | **seqs | Array of pointers to sequences |
Read a PIR file containing multiple chains of up to maxres amino acids. Each chain is returned in seqs[]. The number of chains is returned by the routine. 0 is returned if a memory allocation failed
Definition at line 121 of file ReadSimplePIR.c.
| void blSetMDMScoreWeight | ( | char | resa, |
| char | resb, | ||
| REAL | weight | ||
| ) |
| int blSplitSeq | ( | char * | LinearSeq, |
| char ** | seqs | ||
| ) |
| [in] | *LinearSeq | Array containing sequence with chains terminated by *'s |
| [out] | **seqs | Allocated set of character arrays containing one chain per array |
Splits a sequence stored as a linear array with each chain separated by a * into an array of sequences. Returns the number of chains found.
Definition at line 115 of file SplitSeq.c.
| char blThrone | ( | char * | three | ) |
| [in] | *three | Three letter code |
Converts 3-letter code to 1-letter code. Handles ASX and GLX as X
| char blThronex | ( | char * | three | ) |
| int blTrueSeqLen | ( | char * | sequence | ) |
| [in] | *sequence | A sequence containing deletions |
Scans a 1-letter code sequence and calculates the length without '-' or ' ' residues.
Definition at line 106 of file TrueSeqLen.c.
| void blWriteOneStringPIR | ( | FILE * | out, |
| char * | label, | ||
| char * | title, | ||
| char * | sequence, | ||
| char ** | chainLabels, | ||
| BOOL | ByChain, | ||
| BOOL | doFasta | ||
| ) |
| [in] | *out | File pointer |
| [in] | *label | Sequence label |
| [in] | *title | Sequence title |
| [in] | *sequence | Sequence (1-letter code) with chains separated by * |
| [in] | **chainLabels | Chain labels (may be set to NULL unless ByChain is set) |
| [in] | ByChain | Print a separate header for each chain |
| [in] | doFasta | Output FASTA format instead of PIR |
Writes a PIR sequence file from a 1-letter code sequence. Multiple chains are split with '*'. If ByChain is set, the chainLabels array must be non-NULL and contain labels for each chain. Adds a terminating * if required.
Definition at line 116 of file WritePIR.c.
| int blZeroMDM | ( | void | ) |
1.8.8