The outage for Sunday 24th November has been cancelled.
Bioplib
Protein Structure C Library
 All Data Structures Files Functions Variables Typedefs Macros Pages
align.c
Go to the documentation of this file.
1 /************************************************************************/
2 /**
3 
4  \file align.c
5 
6  \version V3.6
7  \date 04.01.16
8  \brief Perform Needleman & Wunsch sequence alignment
9 
10  \copyright (c) UCL / Dr. Andrew C. R. Martin 1993-2016
11  \author Dr. Andrew C. R. Martin
12  \par
13  Institute of Structural & Molecular Biology,
14  University College London,
15  Gower Street,
16  London.
17  WC1E 6BT.
18  \par
19  andrew@bioinf.org.uk
20  andrew.martin@ucl.ac.uk
21 
22 **************************************************************************
23 
24  This code is NOT IN THE PUBLIC DOMAIN, but it may be copied
25  according to the conditions laid out in the accompanying file
26  COPYING.DOC.
27 
28  The code may be modified as required, but any modifications must be
29  documented so that the person responsible can be identified.
30 
31  The code may not be sold commercially or included as part of a
32  commercial product except as described in the file COPYING.DOC.
33 
34 **************************************************************************
35 
36  Description:
37  ============
38 
39  A simple Needleman & Wunsch Dynamic Programming alignment of 2
40  sequences.
41 
42  A window is not used so the routine may be a bit slow on long
43  sequences.
44 
45 **************************************************************************
46 
47  Usage:
48  ======
49 
50  First call ReadMDM() to read the mutation data matrix, then call
51  align() to align the sequences.
52 
53 **************************************************************************
54 
55  Revision History:
56  =================
57 - V1.0 19.06.90 Original used in NW program
58 - V2.0 07.10.92 Original extracted from old NW program
59 - V2.1 16.06.93 Tidied for book
60 - V2.2 01.03.94 Changed static variable names
61 - V2.3 18.03.94 getc() -> fgetc()
62 - V2.4 24.11.94 ReadMDM() now looks after searching DATADIR
63 - V2.5 28.02.95 ReadMDM() and other code improved to cope with MDMs of
64  any size
65 - V2.6 26.07.95 Removed unused variables
66 - V2.7 21.08.95 Initialisation of matrix was incorrect leading to errors
67  at the end of the alignment.
68 - V2.8 24.08.95 calcscore() was doing an out-of-bounds array reference
69  if a character wasn't found
70 - V2.9 11.07.96 calcscore() changed to CalcMDMScore() and made
71  non-static
72 - V2.10 09.09.96 Improved comments for ReadMDM()
73 - V2.11 17.09.96 Added ZeroMDM()
74 - V3.0 06.03.00 Traceback code rewritten to use a trace matrix created
75  while the main matrix is populated. New affinealign()
76  routine implemented. align() is now a wrapper to that.
77 - V3.1 06.02.03 Fixed for new version of GetWord()
78 - V3.2 27.02.07 Added affinealineuc() and CalcMDMScoreUC()
79 - V3.3 07.04.09 Complete re-write of ReadMDM() so it can read BLAST
80  style matrix files as well as our own
81 - V3.4 07.07.14 Use bl prefix for functions By: CTP
82 - V3.5 26.08.14 Added blSetMDMScoreWeight() By: ACRM
83 - V3.6 04.01.16 Added special calls to blCalcMDMScore() and
84  blCalcMDMScoreUC() to silence warnings. Warnings now
85  go to stderr
86 
87 *************************************************************************/
88 /* Doxygen
89  -------
90  #GROUP Handling Sequence Data
91  #SUBGROUP Alignment
92 
93  #FUNCTION blAlign()
94  Perform simple N&W alignment of seq1 and seq2. A single gap penalty
95  is used so there is no extension penalty
96 
97  #FUNCTION blAffinealign()
98  Perform simple N&W alignment of seq1 and seq2 with separate gap
99  opening and extension penalties
100 
101  #FUNCTION blAffinealignuc()
102  Perform simple N&W alignment of seq1 and seq2 with separate gap
103  opening and extension penalties. Optimized for DNA sequences
104 
105  #FUNCTION blReadMDM()
106  Read mutation data matrix into static global arrays for use by
107  alignment code
108 
109  #FUNCTION blCalcMDMScore()
110  Calculates a score for comparing two amino acids using a mutation
111  data matrix
112 
113  #FUNCTION blCalcMDMScoreUC()
114  As blCalcMDMScore() but upcases the amino acid labels before
115  calculation
116 
117  #FUNCTION blZeroMDM()
118  Modifies all values in the MDM such that the minimum value is 0
119 
120  #FUNCTION blSetMDMScoreWeight()
121  Apply a weight to a particular amino acid substitution. Modifies
122  the scoring matrix read by blReadMDM()
123 
124 */
125 /************************************************************************/
126 /* Includes
127 */
128 #include <stdio.h>
129 #include <stdlib.h>
130 #include <string.h>
131 #include <ctype.h>
132 
133 #include "SysDefs.h"
134 #include "macros.h"
135 #include "array.h"
136 #include "general.h"
137 #include "seq.h"
138 
139 /************************************************************************/
140 /* Defines and macros
141 */
142 #ifndef MAX
143 #define MAX(a,b) ((a) > (b) ? (a) : (b))
144 #endif
145 
146 #define MAX3(c,d,e) (MAX(MAX((c),(d)),(e)))
147 
148 #define DATAENV "DATADIR" /* Environment variable or assign */
149 
150 #define MAXBUFF 400
151 #define MAXWORD 16
152 
153 /* Type definition to store a X,Y coordinate pair in the matrix */
154 typedef struct
155 {
156  int x, y;
157 } XY;
158 
159 
160 /************************************************************************/
161 /* Globals
162 */
163 static int **sMDMScore;
164 static char *sMDM_AAList;
165 static int sMDMSize = 0;
166 
167 /************************************************************************/
168 /* Prototypes
169 */
170 static int SearchForBest(int **matrix, int length1, int length2,
171  int *BestI, int *BestJ, char *seq1, char *seq2,
172  char *align1, char *align2);
173 static int TraceBack(int **matrix, XY **dirn, int length1, int length2,
174  char *seq1, char *seq2, char *align1, char *align2,
175  int *align_len);
176 
177 
178 /************************************************************************/
179 /*>int blAlign(char *seq1, int length1, char *seq2, int length2,
180  BOOL verbose, BOOL identity, int penalty,
181  char *align1, char *align2, int *align_len)
182  -----------------------------------------------------------
183 *//**
184 
185  \param[in] *seq1 First sequence
186  \param[in] length1 First sequence length
187  \param[in] *seq2 Second sequence
188  \param[in] length2 Second sequence length
189  \param[in] verbose Display N&W matrix
190  \param[in] identity Use identity matrix
191  \param[in] penalty Gap insertion penalty value
192  \param[out] *align1 Sequence 1 aligned
193  \param[out] *align2 Sequence 2 aligned
194  \param[out] *align_len Alignment length
195  \return Alignment score (0 on error)
196 
197  Perform simple N&W alignment of seq1 and seq2. No window is used, so
198  will be slow for long sequences.
199 
200  A single gap penalty is used, so gap extension incurrs no further
201  penalty.
202 
203  Note that you must allocate sufficient memory for the aligned
204  sequences.
205  The easy way to do this is to ensure that align1 and align2 are
206  of length (length1+length2).
207 
208 - 06.03.00 Implemented as a wrapper to affinealign() which is the old
209  align() routine, plus support for affine gap penalties,
210  plus new traceback code based on storing the path as we
211  go
212 - 07.07.14 Use bl prefix for functions By: CTP
213 */
214 int blAlign(char *seq1,
215  int length1,
216  char *seq2,
217  int length2,
218  BOOL verbose,
219  BOOL identity,
220  int penalty,
221  char *align1,
222  char *align2,
223  int *align_len)
224 {
225  return(blAffinealign(seq1, length1, seq2, length2, verbose, identity,
226  penalty, 0, align1, align2, align_len));
227 }
228 
229 
230 /************************************************************************/
231 /*>int blAffinealign(char *seq1, int length1, char *seq2, int length2,
232  BOOL verbose, BOOL identity, int penalty, int penext,
233  char *align1, char *align2, int *align_len)
234  ---------------------------------------------------------------------
235 *//**
236 
237  \param[in] *seq1 First sequence
238  \param[in] length1 First sequence length
239  \param[in] *seq2 Second sequence
240  \param[in] length2 Second sequence length
241  \param[in] verbose Display N&W matrix
242  \param[in] identity Use identity matrix
243  \param[in] penalty Gap insertion penalty value
244  \param[in] penext Extension penalty
245  \param[out] *align1 Sequence 1 aligned
246  \param[out] *align2 Sequence 2 aligned
247  \param[out] *align_len Alignment length
248  \return Alignment score (0 on error)
249 
250  Perform simple N&W alignment of seq1 and seq2. No window is used, so
251  will be slow for long sequences.
252 
253  Note that you must allocate sufficient memory for the aligned
254  sequences.
255  The easy way to do this is to ensure that align1 and align2 are
256  of length (length1+length2).
257 
258 - 07.10.92 Adapted from original written while at NIMR
259 - 08.10.92 Split into separate routines
260 - 09.10.92 Changed best structure to simple integers, moved
261  SearchForBest() into TraceBack()
262 - 21.08.95 Was only filling in the bottom right cell at initialisation
263  rather than all the right hand column and bottom row
264 - 11.07.96 Changed calls to calcscore() to CalcMDMScore()
265 - 06.03.00 Changed name to affinealign() (the routine align() is
266  provided as a backwards compatible wrapper). Added penext
267  parameter. Now supports affine gap penalties with separate
268  opening and extension penalties. The code now maintains
269  the path as it goes.
270 - 07.07.14 Use bl prefix for functions By: CTP
271 **************************************************************************
272 ****** NOTE AND CHANGES SHOULD BE PROPAGATED TO affinealignuc() ******
273 **************************************************************************
274 */
275 int blAffinealign(char *seq1,
276  int length1,
277  char *seq2,
278  int length2,
279  BOOL verbose,
280  BOOL identity,
281  int penalty,
282  int penext,
283  char *align1,
284  char *align2,
285  int *align_len)
286 {
287  XY **dirn = NULL;
288  int **matrix = NULL,
289  maxdim,
290  i, j, k, l,
291  i1, j1,
292  dia, right, down,
293  rcell, dcell, maxoff,
294  match = 1,
295  thisscore,
296  gapext,
297  score;
298 
299  maxdim = MAX(length1, length2);
300 
301  /* Initialise the score matrix */
302  if((matrix = (int **)blArray2D(sizeof(int), maxdim, maxdim))==NULL)
303  return(0);
304  if((dirn = (XY **)blArray2D(sizeof(XY), maxdim, maxdim))==NULL)
305  return(0);
306 
307  for(i=0;i<maxdim;i++)
308  {
309  for(j=0;j<maxdim;j++)
310  {
311  matrix[i][j] = 0;
312  dirn[i][j].x = -1;
313  dirn[i][j].y = -1;
314  }
315  }
316 
317  /* Fill in scores up the right hand side of the matrix */
318  for(j=0; j<length2; j++)
319  {
320  if(identity)
321  {
322  if(seq1[length1-1] == seq2[j]) matrix[length1-1][j] = match;
323  }
324  else
325  {
326  matrix[length1-1][j] = blCalcMDMScore(seq1[length1-1], seq2[j]);
327  }
328  }
329 
330  /* Fill in scores along the bottom row of the matrix */
331  for(i=0; i<length1; i++)
332  {
333  if(identity)
334  {
335  if(seq1[i] == seq2[length2-1]) matrix[i][length2-1] = match;
336  }
337  else
338  {
339  matrix[i][length2-1] = blCalcMDMScore(seq1[i], seq2[length2-1]);
340  }
341  }
342 
343  i = length1 - 1;
344  j = length2 - 1;
345 
346  /* Move back along the diagonal */
347  while(i > 0 && j > 0)
348  {
349  i--;
350  j--;
351 
352  /* Fill in the scores along this row */
353  for(i1 = i; i1 > -1; i1--)
354  {
355  dia = matrix[i1+1][j+1];
356 
357  /* Find highest score to right of diagonal */
358  rcell = i1+2;
359  if(i1+2 >= length1) right = 0;
360  else right = matrix[i1+2][j+1] - penalty;
361 
362  gapext = 1;
363  for(k = i1+3; k<length1; k++, gapext++)
364  {
365  thisscore = matrix[k][j+1] - (penalty + gapext*penext);
366 
367  if(thisscore > right)
368  {
369  right = thisscore;
370  rcell = k;
371  }
372  }
373 
374  /* Find highest score below diagonal */
375  dcell = j+2;
376  if(j+2 >= length2) down = 0;
377  else down = matrix[i1+1][j+2] - penalty;
378 
379  gapext = 1;
380  for(l = j+3; l<length2; l++, gapext++)
381  {
382  thisscore = matrix[i1+1][l] - (penalty + gapext*penext);
383 
384  if(thisscore > down)
385  {
386  down = thisscore;
387  dcell = l;
388  }
389  }
390 
391  /* Set score to best of these */
392  maxoff = MAX(right, down);
393  if(dia >= maxoff)
394  {
395  matrix[i1][j] = dia;
396  dirn[i1][j].x = i1+1;
397  dirn[i1][j].y = j+1;
398  }
399  else
400  {
401  if(right > down)
402  {
403  matrix[i1][j] = right;
404  dirn[i1][j].x = rcell;
405  dirn[i1][j].y = j+1;
406  }
407  else
408  {
409  matrix[i1][j] = down;
410  dirn[i1][j].x = i1+1;
411  dirn[i1][j].y = dcell;
412  }
413  }
414 
415  /* Add the score for a match */
416  if(identity)
417  {
418  if(seq1[i1] == seq2[j]) matrix[i1][j] += match;
419  }
420  else
421  {
422  matrix[i1][j] += blCalcMDMScore(seq1[i1],seq2[j]);
423  }
424  }
425 
426  /* Fill in the scores in this column */
427  for(j1 = j; j1 > -1; j1--)
428  {
429  dia = matrix[i+1][j1+1];
430 
431  /* Find highest score to right of diagonal */
432  rcell = i+2;
433  if(i+2 >= length1) right = 0;
434  else right = matrix[i+2][j1+1] - penalty;
435 
436  gapext = 1;
437  for(k = i+3; k<length1; k++, gapext++)
438  {
439  thisscore = matrix[k][j1+1] - (penalty + gapext*penext);
440 
441  if(thisscore > right)
442  {
443  right = thisscore;
444  rcell = k;
445  }
446  }
447 
448  /* Find highest score below diagonal */
449  dcell = j1+2;
450  if(j1+2 >= length2) down = 0;
451  else down = matrix[i+1][j1+2] - penalty;
452 
453  gapext = 1;
454  for(l = j1+3; l<length2; l++, gapext++)
455  {
456  thisscore = matrix[i+1][l] - (penalty + gapext*penext);
457 
458  if(thisscore > down)
459  {
460  down = thisscore;
461  dcell = l;
462  }
463  }
464 
465  /* Set score to best of these */
466  maxoff = MAX(right, down);
467  if(dia >= maxoff)
468  {
469  matrix[i][j1] = dia;
470  dirn[i][j1].x = i+1;
471  dirn[i][j1].y = j1+1;
472  }
473  else
474  {
475  if(right > down)
476  {
477  matrix[i][j1] = right;
478  dirn[i][j1].x = rcell;
479  dirn[i][j1].y = j1+1;
480  }
481  else
482  {
483  matrix[i][j1] = down;
484  dirn[i][j1].x = i+1;
485  dirn[i][j1].y = dcell;
486  }
487  }
488 
489  /* Add the score for a match */
490  if(identity)
491  {
492  if(seq1[i] == seq2[j1]) matrix[i][j1] += match;
493  }
494  else
495  {
496  matrix[i][j1] += blCalcMDMScore(seq1[i],seq2[j1]);
497  }
498  }
499  }
500 
501  score = TraceBack(matrix, dirn, length1, length2,
502  seq1, seq2, align1, align2, align_len);
503 
504  if(verbose)
505  {
506  printf("Matrix:\n-------\n");
507  for(j=0; j<length2;j++)
508  {
509  for(i=0; i<length1; i++)
510  {
511  printf("%3d ",matrix[i][j]);
512  }
513  printf("\n");
514  }
515 
516  printf("Path:\n-----\n");
517  for(j=0; j<length2;j++)
518  {
519  for(i=0; i<length1; i++)
520  {
521  printf("(%3d,%3d) ",dirn[i][j].x,dirn[i][j].y);
522  }
523  printf("\n");
524  }
525  }
526 
527  blFreeArray2D((char **)matrix, maxdim, maxdim);
528  blFreeArray2D((char **)dirn, maxdim, maxdim);
529 
530  return(score);
531 }
532 
533 
534 /************************************************************************/
535 /*>int blAffinealignuc(char *seq1, int length1, char *seq2, int length2,
536  BOOL verbose, BOOL identity, int penalty,
537  int penext, char *align1, char *align2,
538  int *align_len)
539  ---------------------------------------------------------------------
540 *//**
541 
542  \param[in] *seq1 First sequence
543  \param[in] length1 First sequence length
544  \param[in] *seq2 Second sequence
545  \param[in] length2 Second sequence length
546  \param[in] verbose Display N&W matrix
547  \param[in] identity Use identity matrix
548  \param[in] penalty Gap insertion penalty value
549  \param[in] penext Extension penalty
550  \param[out] *align1 Sequence 1 aligned
551  \param[out] *align2 Sequence 2 aligned
552  \param[out] *align_len Alignment length
553  \return Alignment score (0 on error)
554 
555  Perform simple N&W alignment of seq1 and seq2. No window is used, so
556  will be slow for long sequences.
557 
558  Note that you must allocate sufficient memory for the aligned
559  sequences.
560  The easy way to do this is to ensure that align1 and align2 are
561  of length (length1+length2).
562 
563 - 07.10.92 Adapted from original written while at NIMR
564 - 08.10.92 Split into separate routines
565 - 09.10.92 Changed best structure to simple integers, moved
566  SearchForBest() into TraceBack()
567 - 21.08.95 Was only filling in the bottom right cell at initialisation
568  rather than all the right hand column and bottom row
569 - 11.07.96 Changed calls to calcscore() to CalcMDMScore()
570 - 06.03.00 Changed name to affinealign() (the routine align() is
571  provided as a backwards compatible wrapper). Added penext
572  parameter. Now supports affine gap penalties with separate
573  opening and extension penalties. The code now maintains
574  the path as it goes.
575 - 27.02.07 Exactly as affinealign() but upcases characters before
576  comparison
577 - 07.07.14 Use bl prefix for functions By: CTP
578 
579 **************************************************************************
580 ****** NOTE AND CHANGES SHOULD BE PROPAGATED TO affinealign() ******
581 **************************************************************************
582 */
583 int blAffinealignuc(char *seq1,
584  int length1,
585  char *seq2,
586  int length2,
587  BOOL verbose,
588  BOOL identity,
589  int penalty,
590  int penext,
591  char *align1,
592  char *align2,
593  int *align_len)
594 {
595  XY **dirn = NULL;
596  int **matrix = NULL,
597  maxdim,
598  i, j, k, l,
599  i1, j1,
600  dia, right, down,
601  rcell, dcell, maxoff,
602  match = 1,
603  thisscore,
604  gapext,
605  score;
606 
607  maxdim = MAX(length1, length2);
608 
609  /* Initialise the score matrix */
610  if((matrix = (int **)blArray2D(sizeof(int), maxdim, maxdim))==NULL)
611  return(0);
612  if((dirn = (XY **)blArray2D(sizeof(XY), maxdim, maxdim))==NULL)
613  return(0);
614 
615  for(i=0;i<maxdim;i++)
616  {
617  for(j=0;j<maxdim;j++)
618  {
619  matrix[i][j] = 0;
620  dirn[i][j].x = -1;
621  dirn[i][j].y = -1;
622  }
623  }
624 
625  /* Fill in scores up the right hand side of the matrix */
626  for(j=0; j<length2; j++)
627  {
628  if(identity)
629  {
630  if(seq1[length1-1] == seq2[j]) matrix[length1-1][j] = match;
631  }
632  else
633  {
634  matrix[length1-1][j] = blCalcMDMScoreUC(seq1[length1-1],seq2[j]);
635  }
636  }
637 
638  /* Fill in scores along the bottom row of the matrix */
639  for(i=0; i<length1; i++)
640  {
641  if(identity)
642  {
643  if(seq1[i] == seq2[length2-1]) matrix[i][length2-1] = match;
644  }
645  else
646  {
647  matrix[i][length2-1] = blCalcMDMScoreUC(seq1[i],seq2[length2-1]);
648  }
649  }
650 
651  i = length1 - 1;
652  j = length2 - 1;
653 
654  /* Move back along the diagonal */
655  while(i > 0 && j > 0)
656  {
657  i--;
658  j--;
659 
660  /* Fill in the scores along this row */
661  for(i1 = i; i1 > -1; i1--)
662  {
663  dia = matrix[i1+1][j+1];
664 
665  /* Find highest score to right of diagonal */
666  rcell = i1+2;
667  if(i1+2 >= length1) right = 0;
668  else right = matrix[i1+2][j+1] - penalty;
669 
670  gapext = 1;
671  for(k = i1+3; k<length1; k++, gapext++)
672  {
673  thisscore = matrix[k][j+1] - (penalty + gapext*penext);
674 
675  if(thisscore > right)
676  {
677  right = thisscore;
678  rcell = k;
679  }
680  }
681 
682  /* Find highest score below diagonal */
683  dcell = j+2;
684  if(j+2 >= length2) down = 0;
685  else down = matrix[i1+1][j+2] - penalty;
686 
687  gapext = 1;
688  for(l = j+3; l<length2; l++, gapext++)
689  {
690  thisscore = matrix[i1+1][l] - (penalty + gapext*penext);
691 
692  if(thisscore > down)
693  {
694  down = thisscore;
695  dcell = l;
696  }
697  }
698 
699  /* Set score to best of these */
700  maxoff = MAX(right, down);
701  if(dia >= maxoff)
702  {
703  matrix[i1][j] = dia;
704  dirn[i1][j].x = i1+1;
705  dirn[i1][j].y = j+1;
706  }
707  else
708  {
709  if(right > down)
710  {
711  matrix[i1][j] = right;
712  dirn[i1][j].x = rcell;
713  dirn[i1][j].y = j+1;
714  }
715  else
716  {
717  matrix[i1][j] = down;
718  dirn[i1][j].x = i1+1;
719  dirn[i1][j].y = dcell;
720  }
721  }
722 
723  /* Add the score for a match */
724  if(identity)
725  {
726  if(seq1[i1] == seq2[j]) matrix[i1][j] += match;
727  }
728  else
729  {
730  matrix[i1][j] += blCalcMDMScoreUC(seq1[i1],seq2[j]);
731  }
732  }
733 
734  /* Fill in the scores in this column */
735  for(j1 = j; j1 > -1; j1--)
736  {
737  dia = matrix[i+1][j1+1];
738 
739  /* Find highest score to right of diagonal */
740  rcell = i+2;
741  if(i+2 >= length1) right = 0;
742  else right = matrix[i+2][j1+1] - penalty;
743 
744  gapext = 1;
745  for(k = i+3; k<length1; k++, gapext++)
746  {
747  thisscore = matrix[k][j1+1] - (penalty + gapext*penext);
748 
749  if(thisscore > right)
750  {
751  right = thisscore;
752  rcell = k;
753  }
754  }
755 
756  /* Find highest score below diagonal */
757  dcell = j1+2;
758  if(j1+2 >= length2) down = 0;
759  else down = matrix[i+1][j1+2] - penalty;
760 
761  gapext = 1;
762  for(l = j1+3; l<length2; l++, gapext++)
763  {
764  thisscore = matrix[i+1][l] - (penalty + gapext*penext);
765 
766  if(thisscore > down)
767  {
768  down = thisscore;
769  dcell = l;
770  }
771  }
772 
773  /* Set score to best of these */
774  maxoff = MAX(right, down);
775  if(dia >= maxoff)
776  {
777  matrix[i][j1] = dia;
778  dirn[i][j1].x = i+1;
779  dirn[i][j1].y = j1+1;
780  }
781  else
782  {
783  if(right > down)
784  {
785  matrix[i][j1] = right;
786  dirn[i][j1].x = rcell;
787  dirn[i][j1].y = j1+1;
788  }
789  else
790  {
791  matrix[i][j1] = down;
792  dirn[i][j1].x = i+1;
793  dirn[i][j1].y = dcell;
794  }
795  }
796 
797  /* Add the score for a match */
798  if(identity)
799  {
800  if(seq1[i] == seq2[j1]) matrix[i][j1] += match;
801  }
802  else
803  {
804  matrix[i][j1] += blCalcMDMScoreUC(seq1[i],seq2[j1]);
805  }
806  }
807  }
808 
809  score = TraceBack(matrix, dirn, length1, length2,
810  seq1, seq2, align1, align2, align_len);
811 
812  if(verbose)
813  {
814  printf("Matrix:\n-------\n");
815  for(j=0; j<length2;j++)
816  {
817  for(i=0; i<length1; i++)
818  {
819  printf("%3d ",matrix[i][j]);
820  }
821  printf("\n");
822  }
823 
824  printf("Path:\n-----\n");
825  for(j=0; j<length2;j++)
826  {
827  for(i=0; i<length1; i++)
828  {
829  printf("(%3d,%3d) ",dirn[i][j].x,dirn[i][j].y);
830  }
831  printf("\n");
832  }
833  }
834 
835  blFreeArray2D((char **)matrix, maxdim, maxdim);
836  blFreeArray2D((char **)dirn, maxdim, maxdim);
837 
838  return(score);
839 }
840 
841 
842 /************************************************************************/
843 /*>BOOL blReadMDM(char *mdmfile)
844  -----------------------------
845 *//**
846 
847  \param[in] *mdmfile Mutation data matrix filename
848  \return Success?
849 
850  Read mutation data matrix into static global arrays. The matrix may
851  have comments at the start introduced with a ! in the first column.
852  The matrix must be complete (i.e. a triangular matrix will not
853  work). A line describing the residue types must appear, and may
854  be placed before or after the matrix itself
855 
856 - 07.10.92 Original
857 - 18.03.94 getc() -> fgetc()
858 - 24.11.94 Automatically looks in DATAENV if not found in current
859  directory
860 - 28.02.95 Modified to read any size MDM and allow comments
861  Also allows the list of aa types before or after the actual
862  matrix
863 - 26.07.95 Removed unused variables
864 - 06.02.03 Fixed for new version of GetWord()
865 - 07.04.09 Completely re-written to allow it to read BLAST style matrix
866  files as well as the ones used previously
867  Allow comments introduced with # as well as !
868  Uses MAXWORD rather than hardcoded 16
869 - 07.07.14 Use bl prefix for functions By: CTP
870 */
871 BOOL blReadMDM(char *mdmfile)
872 {
873  FILE *mdm = NULL;
874  int i, j, k, row, tmpStoreSize;
875  char buffer[MAXBUFF],
876  word[MAXWORD],
877  *p,
878  **tmpStore;
879  BOOL noenv;
880 
881  if((mdm=blOpenFile(mdmfile, DATAENV, "r", &noenv))==NULL)
882  {
883  return(FALSE);
884  }
885 
886  /* First read the file to determine the dimensions */
887  while(fgets(buffer,MAXBUFF,mdm))
888  {
889  TERMINATE(buffer);
890  KILLLEADSPACES(p,buffer);
891 
892  /* First line which is non-blank and non-comment */
893  if(strlen(p) && p[0] != '!' && p[0] != '#')
894  {
895  sMDMSize = 0;
896  for(p = buffer; p!=NULL;)
897  {
898  p = blGetWord(p, word, MAXWORD);
899  /* Increment counter if this is numeric */
900  if(isdigit(word[0]) ||
901  ((word[0] == '-')&&(isdigit(word[1]))))
902  sMDMSize++;
903  }
904  if(sMDMSize)
905  break;
906  }
907  }
908 
909  /* Allocate memory for the MDM and the AA List */
910  if((sMDMScore = (int **)blArray2D(sizeof(int),sMDMSize,sMDMSize))==NULL)
911  return(FALSE);
912  if((sMDM_AAList = (char *)malloc((sMDMSize+1)*sizeof(char)))==NULL)
913  {
914  blFreeArray2D((char **)sMDMScore, sMDMSize, sMDMSize);
915  return(FALSE);
916  }
917 
918  /* Allocate temporary storage for a row from the matrix */
919  tmpStoreSize = 2*sMDMSize;
920  if((tmpStore = (char **)blArray2D(sizeof(char), tmpStoreSize, MAXWORD))
921  ==NULL)
922  {
923  free(sMDM_AAList);
924  blFreeArray2D((char **)sMDMScore, sMDMSize, sMDMSize);
925  return(FALSE);
926  }
927 
928  /* Fill the matrix with zeros */
929  for(i=0; i<sMDMSize; i++)
930  {
931  for(j=0; j<sMDMSize; j++)
932  {
933  sMDMScore[i][j] = 0;
934  }
935  }
936 
937  /* Rewind the file and read the actual data */
938  rewind(mdm);
939  row = 0;
940  while(fgets(buffer,MAXBUFF,mdm))
941  {
942  int Numeric;
943 
944  TERMINATE(buffer);
945  KILLLEADSPACES(p,buffer);
946 
947  /* Check line is non-blank and non-comment */
948  if(strlen(p) && p[0] != '!' && p[0] != '#')
949  {
950  Numeric = 0;
951  for(p = buffer, i = 0; p!=NULL && i<tmpStoreSize; i++)
952  {
953  p = blGetWord(p, tmpStore[i], MAXWORD);
954  /* Incremement Numeric counter if it's a numeric field */
955  if(isdigit(tmpStore[i][0]) ||
956  ((tmpStore[i][0] == '-')&&(isdigit(tmpStore[i][1]))))
957  {
958  Numeric++;
959  }
960  }
961 
962  /* No numeric fields so it is the amino acid names */
963  if(Numeric == 0)
964  {
965  for(j = 0; j<i && j<sMDMSize; j++)
966  {
967  sMDM_AAList[j] = tmpStore[j][0];
968  }
969  }
970  else
971  {
972  /* There were numeric fields, so copy them into the matrix,
973  skipping any non-numeric fields
974  j counts the input fields
975  k counts the fields in sMDMScore
976  row counts the row in sMDMScore
977  */
978  for(j=0, k=0; j<i && k<sMDMSize; j++)
979  {
980  if(isdigit(tmpStore[j][0]) ||
981  ((tmpStore[j][0] == '-')&&(isdigit(tmpStore[j][1]))))
982  {
983  sscanf(tmpStore[j],"%d",&(sMDMScore[row][k]));
984  k++;
985  }
986  }
987 
988  row++;
989  }
990  }
991  }
992  fclose(mdm);
993  blFreeArray2D((char **)tmpStore, tmpStoreSize, MAXWORD);
994 
995  return(TRUE);
996 }
997 
998 
999 /************************************************************************/
1000 /*>static int SearchForBest(int **matrix, int length1, int length2,
1001  int *BestI, int *BestJ, char *seq1,
1002  char *seq2, char *align1, char *align2)
1003  ----------------------------------------------------------------
1004 *//**
1005 
1006  \param[in] **matrix N&W matrix
1007  \param[in] length1 Length of first sequence
1008  \param[in] length2 Length of second sequence
1009  \param[in] *BestI x position of highest score
1010  \param[in] *BestJ y position of highest score
1011  \param[in] *seq1 First sequence
1012  \param[in] *seq2 Second sequence
1013  \param[out] *align1 First sequence with end aligned correctly
1014  \param[out] *align2 Second sequence with end aligned correctly
1015  \return Alignment length thus far
1016 
1017  Searches the outside of the matrix for the best score and starts the
1018  alignment by putting in any starting - characters.
1019 
1020 - 08.10.92 Original extracted from Align()
1021 */
1022 static int SearchForBest(int **matrix,
1023  int length1,
1024  int length2,
1025  int *BestI,
1026  int *BestJ,
1027  char *seq1,
1028  char *seq2,
1029  char *align1,
1030  char *align2)
1031 {
1032  int ai,
1033  besti, bestj,
1034  i, j;
1035 
1036  /* Now search the outside of the matrix for the highest scoring cell */
1037  ai = 0;
1038  besti = 0;
1039  for(i = 1; i < length1; i++)
1040  {
1041  if(matrix[i][0] > matrix[besti][0]) besti = i;
1042  }
1043  bestj = 0;
1044  for(j = 1; j < length2; j++)
1045  {
1046  if(matrix[0][j] > matrix[0][bestj]) bestj = j;
1047  }
1048  if(matrix[besti][0] > matrix[0][bestj])
1049  {
1050  *BestI = besti;
1051  *BestJ = 0;
1052  for(i=0; i<*BestI; i++)
1053  {
1054  align1[ai] = seq1[i];
1055  align2[ai++] = '-';
1056  }
1057  }
1058  else
1059  {
1060  *BestI = 0;
1061  *BestJ = bestj;
1062  for(j=0; j<*BestJ; j++)
1063  {
1064  align1[ai] = '-';
1065  align2[ai++] = seq2[j];
1066  }
1067  }
1068  return(ai);
1069 }
1070 
1071 
1072 /************************************************************************/
1073 /*>static int TraceBack(int **matrix, XY **dirn,
1074  int length1, int length2,
1075  char *seq1, char *seq2, char *align1,
1076  char *align2, int *align_len)
1077  ----------------------------------------------------------
1078 *//**
1079  \param[in] **matrix N&W matrix
1080  \param[in] **dirn Direction Matrix
1081  \param[in] length1 Length of first sequence
1082  \param[in] length2 Length of second sequence
1083  \param[in] *seq1 First sequence
1084  \param[in] *seq2 Second sequence
1085  \param[out] *align1 First sequence aligned
1086  \param[out] *align2 Second sequence aligned
1087  \param[out] *align_len Aligned sequence length
1088  \return Alignment score
1089 
1090  Does the traceback to find the aligment.
1091 
1092 - 08.10.92 Original extracted from Align(). Rewritten to do tracing
1093  correctly.
1094 - 09.10.92 Changed to call SearchForBest(). Nor returns score rather than
1095  length.
1096 - 06.03.00 Recoded to take the path matrix which is calculated within
1097  the main affinealign() routine rather than calculating the
1098  path as we go. penalty parameter removed as this is no longer
1099  needed. dirn parameter added.
1100 - 28.09.00 Fixed bug where last inserts were printed properly if one chain
1101  ended first
1102 */
1103 static int TraceBack(int **matrix,
1104  XY **dirn,
1105  int length1,
1106  int length2,
1107  char *seq1,
1108  char *seq2,
1109  char *align1,
1110  char *align2,
1111  int *align_len)
1112 {
1113  int i, j,
1114  ai,
1115  BestI,BestJ;
1116  XY nextCell;
1117 
1118  ai = SearchForBest(matrix, length1, length2, &BestI, &BestJ,
1119  seq1, seq2, align1, align2);
1120 
1121  /* Now trace back to find the alignment */
1122  i = BestI;
1123  j = BestJ;
1124  align1[ai] = seq1[i];
1125  align2[ai++] = seq2[j];
1126 
1127  while(i < length1-1 && j < length2-1)
1128  {
1129  nextCell.x = dirn[i][j].x;
1130  nextCell.y = dirn[i][j].y;
1131  if((nextCell.x == i+1) && (nextCell.y == j+1))
1132  {
1133  /* We are inheriting from the diagonal */
1134  i++;
1135  j++;
1136  }
1137  else if(nextCell.y == j+1)
1138  {
1139  /* We are inheriting from the off-diagonal inserting a gap in
1140  the y-sequence (seq2)
1141  */
1142  i++;
1143  j++;
1144  while((i < nextCell.x) && (i < length1-1))
1145  {
1146  align1[ai] = seq1[i++];
1147  align2[ai++] = '-';
1148  }
1149  }
1150  else if(nextCell.x == i+1)
1151  {
1152  /* We are inheriting from the off-diagonal inserting a gap in
1153  the x-sequence (seq1)
1154  */
1155  i++;
1156  j++;
1157  while((j < nextCell.y) && (j < length2-1))
1158  {
1159  align1[ai] = '-';
1160  align2[ai++] = seq2[j++];
1161  }
1162  }
1163  else
1164  {
1165  /* Cockup! */
1166  fprintf(stderr,"align.c/TraceBack() internal error\n");
1167  }
1168 
1169  align1[ai] = seq1[i];
1170  align2[ai++] = seq2[j];
1171  }
1172 
1173  /* If one sequence finished first, fill in the end with insertions */
1174  if(i < length1-1)
1175  {
1176  for(j=i+1; j<length1; j++)
1177  {
1178  align1[ai] = seq1[j];
1179  align2[ai++] = '-';
1180  }
1181  }
1182  else if(j < length2-1)
1183  {
1184  for(i=j+1; i<length2; i++)
1185  {
1186  align1[ai] = '-';
1187  align2[ai++] = seq2[i];
1188  }
1189  }
1190 
1191  *align_len = ai;
1192 
1193  return(matrix[BestI][BestJ]);
1194 }
1195 
1196 
1197 /************************************************************************/
1198 /*>int blCalcMDMScore(char resa, char resb)
1199  ----------------------------------------
1200 *//**
1201 
1202  \param[in] resa First residue
1203  \param[in] resb Second residue
1204  \return score
1205 
1206  Calculate score from static globally stored mutation data matrix
1207 
1208  If both residues are set as '\0' it will simply silence all warnings
1209 
1210 - 07.10.92 Adapted from NIMR-written original
1211 - 24.11.94 Only gives 10 warnings
1212 - 28.02.95 Modified to use sMDMSize
1213 - 24.08.95 If a residue was not found was doing an out-of-bounds array
1214  reference causing a potential core dump
1215 - 11.07.96 Name changed from calcscore() and now non-static
1216 - 07.07.14 Use bl prefix for functions By: CTP
1217 - 04.01.16 Added special call with both residues set to '\0' to silence
1218  warnings. Also warnings now go to stderr
1219 */
1220 int blCalcMDMScore(char resa, char resb)
1221 {
1222  int i,j;
1223  static int NWarn = 0;
1224  BOOL Warned = FALSE;
1225 
1226  if((resa == '\0') && (resb == '\0'))
1227  {
1228  NWarn = 100;
1229  return(0);
1230  }
1231 
1232  for(i=0; i<sMDMSize; i++)
1233  {
1234  if(resa==sMDM_AAList[i]) break;
1235  }
1236 
1237  if(i==sMDMSize)
1238  {
1239  if(NWarn < 10)
1240  fprintf(stderr, "Residue %c not found in matrix\n", resa);
1241  else if(NWarn == 10)
1242  fprintf(stderr, "More residues not found in matrix...\n");
1243 
1244  Warned = TRUE;
1245  }
1246 
1247  for(j=0; j<sMDMSize; j++)
1248  {
1249  if(resb==sMDM_AAList[j]) break;
1250  }
1251 
1252  if(j==sMDMSize)
1253  {
1254  if(NWarn < 10)
1255  fprintf(stderr, "Residue %c not found in matrix\n", resb);
1256  else if(NWarn == 10)
1257  fprintf(stderr, "More residues not found in matrix...\n");
1258 
1259  Warned = TRUE;
1260  }
1261 
1262  if(Warned)
1263  {
1264  NWarn++;
1265  return(0);
1266  }
1267 
1268  return(sMDMScore[i][j]);
1269 }
1270 
1271 /************************************************************************/
1272 /*>int blCalcMDMScoreUC(char resa, char resb)
1273  ------------------------------------------
1274 *//**
1275 
1276  \param[in] resa First residue
1277  \param[in] resb Second residue
1278  \return score
1279 
1280  Calculate score from static globally stored mutation data matrix
1281 
1282 - 07.10.92 Adapted from NIMR-written original
1283 - 24.11.94 Only gives 10 warnings
1284 - 28.02.95 Modified to use sMDMSize
1285 - 24.08.95 If a residue was not found was doing an out-of-bounds array
1286  reference causing a potential core dump
1287 - 11.07.96 Name changed from calcscore() and now non-static
1288 - 27.02.07 As CalcMDMScore() but upcases characters before comparison
1289 - 07.07.14 Use bl prefix for functions By: CTP
1290 - 04.01.16 Added special call with both residues set to '\0' to silence
1291  warnings. Also warnings now go to stderr
1292 */
1293 int blCalcMDMScoreUC(char resa, char resb)
1294 {
1295  int i,j;
1296  static int NWarn = 0;
1297  BOOL Warned = FALSE;
1298 
1299  if((resa == '\0') && (resb == '\0'))
1300  {
1301  NWarn = 10;
1302  return(0);
1303  }
1304 
1305  resa = (islower(resa)?toupper(resa):resa);
1306  resb = (islower(resb)?toupper(resb):resb);
1307 
1308  for(i=0; i<sMDMSize; i++)
1309  {
1310  if(resa==sMDM_AAList[i]) break;
1311  }
1312 
1313  if(i==sMDMSize)
1314  {
1315  if(NWarn < 10)
1316  fprintf(stderr, "Residue %c not found in matrix\n", resa);
1317  else if(NWarn == 10)
1318  fprintf(stderr, "More residues not found in matrix...\n");
1319 
1320  Warned = TRUE;
1321  }
1322 
1323  for(j=0; j<sMDMSize; j++)
1324  {
1325  if(resb==sMDM_AAList[j]) break;
1326  }
1327 
1328  if(j==sMDMSize)
1329  {
1330  if(NWarn < 10)
1331  fprintf(stderr, "Residue %c not found in matrix\n", resb);
1332  else if(NWarn == 10)
1333  fprintf(stderr, "More residues not found in matrix...\n");
1334 
1335  Warned = TRUE;
1336  }
1337 
1338  if(Warned)
1339  {
1340  NWarn++;
1341  return(0);
1342  }
1343 
1344  return(sMDMScore[i][j]);
1345 }
1346 
1347 /************************************************************************/
1348 /*>int blZeroMDM(void)
1349  -------------------
1350 *//**
1351 
1352  \return Maximum value in modified matrix
1353 
1354  Modifies all values in the MDM such that the minimum value is 0
1355 - 17.09.96 Original
1356 - 07.07.14 Use bl prefix for functions By: CTP
1357 */
1358 int blZeroMDM(void)
1359 {
1360  int MinVal = sMDMScore[0][0],
1361  MaxVal = sMDMScore[0][0],
1362  i, j;
1363 
1364  /* Find the minimum and maximum values on the matrix */
1365  for(i=0; i<sMDMSize; i++)
1366  {
1367  for(j=0; j<sMDMSize; j++)
1368  {
1369  if(sMDMScore[i][j] < MinVal)
1370  {
1371  MinVal = sMDMScore[i][j];
1372  }
1373  else if(sMDMScore[i][j] > MaxVal)
1374  {
1375  MaxVal = sMDMScore[i][j];
1376  }
1377  }
1378  }
1379 
1380  /* Now subtract the MinVal from all cells in the matrix so it starts
1381  at zero.
1382  */
1383  for(i=0; i<sMDMSize; i++)
1384  {
1385  for(j=0; j<sMDMSize; j++)
1386  {
1387  sMDMScore[i][j] -= MinVal;
1388  }
1389  }
1390 
1391  /* Return maximum value in modified matrix */
1392  return(MaxVal-MinVal);
1393 }
1394 
1395 /************************************************************************/
1396 /*>void blSetMDMScoreWeight(char resa, char resb, REAL weight)
1397  -----------------------------------------------------------
1398 *//**
1399 
1400  \param[in] resa First residue
1401  \param[in] resb Second residue
1402  \param[in] weight Weight to apply
1403 
1404  Apply a weight to a particular amino acid substitution
1405 
1406 - 26.08.14 Original By: ACRM
1407 */
1408 void blSetMDMScoreWeight(char resa, char resb, REAL weight)
1409 {
1410  int i,j;
1411  static int NWarn = 0;
1412  BOOL Warned = FALSE;
1413 
1414  resa = (islower(resa)?toupper(resa):resa);
1415  resb = (islower(resb)?toupper(resb):resb);
1416 
1417  for(i=0;i<sMDMSize;i++)
1418  {
1419  if(resa==sMDM_AAList[i]) break;
1420  }
1421  if(i==sMDMSize)
1422  {
1423  if(NWarn < 10)
1424  printf("Residue %c not found in matrix\n",resa);
1425  else if(NWarn == 10)
1426  printf("More residues not found in matrix...\n");
1427  Warned = TRUE;
1428  }
1429  for(j=0;j<sMDMSize;j++)
1430  {
1431  if(resb==sMDM_AAList[j]) break;
1432  }
1433  if(j==sMDMSize)
1434  {
1435  if(NWarn < 10)
1436  printf("Residue %c not found in matrix\n",resb);
1437  else if(NWarn == 10)
1438  printf("More residues not found in matrix...\n");
1439  Warned = TRUE;
1440  }
1441 
1442  if(Warned)
1443  {
1444  NWarn++;
1445  return;
1446  }
1447 
1448  sMDMScore[i][j] *= weight;
1449  if(i != j)
1450  {
1451  sMDMScore[j][i] *= weight;
1452  }
1453 }
1454 
1455 
1456 #ifdef DEMO
1457 int main(int argc, char **argv)
1458 {
1459  char seq1[] = "ACTCLMCT",
1460  seq2[] = "ACTCCT",
1461  align1[100],
1462  align2[100];
1463  int score, al_len;
1464  int i, j;
1465 
1466  ReadMDM("pet91.mat");
1467 
1468  for(i=0; i<sMDMSize; i++)
1469  {
1470  printf(" %c", sMDM_AAList[i]);
1471  }
1472  printf("\n");
1473 
1474  for(i=0; i<sMDMSize; i++)
1475  {
1476  for(j=0; j<sMDMSize; j++)
1477  {
1478  printf("%3d", sMDMScore[i][j]);
1479  }
1480  printf("\n");
1481  }
1482 
1483  score = affinealign(seq1, strlen(seq1), seq2, strlen(seq2),
1484  TRUE, FALSE,
1485  10, 1, align1, align2, &al_len);
1486 
1487  align1[al_len] = '\0';
1488  align2[al_len] = '\0';
1489 
1490  printf("%s\n", align1);
1491  printf("%s\n", align2);
1492 
1493  return(0);
1494 }
1495 #endif
#define MAX(a, b)
Definition: macros.h:239
int main(int argc, char **argv)
Definition: test.c:4
int blCalcMDMScoreUC(char resa, char resb)
Definition: align.c:1293
short BOOL
Definition: SysDefs.h:64
#define NULL
Definition: array2.c:99
#define MAXBUFF
Definition: align.c:150
char ** blArray2D(int size, int dim1, int dim2)
Definition: array2.c:130
int x
Definition: align.c:156
#define FALSE
Definition: macros.h:223
void blSetMDMScoreWeight(char resa, char resb, REAL weight)
Definition: align.c:1408
int blZeroMDM(void)
Definition: align.c:1358
Definition: align.c:154
Useful macros.
Include file for 2D/3D array functions.
FILE * blOpenFile(char *filename, char *envvar, char *mode, BOOL *noenv)
Definition: OpenFile.c:146
#define TERMINATE(x)
Definition: macros.h:366
double REAL
Definition: MathType.h:67
Header file for sequence handling.
int blAffinealignuc(char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, char *align1, char *align2, int *align_len)
Definition: align.c:583
void blFreeArray2D(char **array, int dim1, int dim2)
Definition: array2.c:174
#define MAXWORD
Definition: align.c:151
#define TRUE
Definition: macros.h:219
int y
Definition: align.c:156
char * blGetWord(char *buffer, char *word, int maxsize)
Definition: GetWord.c:268
BOOL blReadMDM(char *mdmfile)
Definition: align.c:871
Header file for general purpose routines.
int blAlign(char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, char *align1, char *align2, int *align_len)
Definition: align.c:214
#define KILLLEADSPACES(y, x)
Definition: macros.h:408
#define DATAENV
Definition: align.c:148
System-type variable type definitions.
int blCalcMDMScore(char resa, char resb)
Definition: align.c:1220
int blAffinealign(char *seq1, int length1, char *seq2, int length2, BOOL verbose, BOOL identity, int penalty, int penext, char *align1, char *align2, int *align_len)
Definition: align.c:275