Bioplib
Protein Structure C Library
 All Data Structures Files Functions Variables Typedefs Macros Pages
ReadRawPIR.c
Go to the documentation of this file.
1 /************************************************************************/
2 /**
3 
4  \file ReadRawPIR.c
5 
6  \version V2.8
7  \date 07.07.14
8  \brief
9 
10  \copyright (c) UCL / Dr. Andrew C. R. Martin 1991-2014
11  \author Dr. Andrew C. R. Martin
12  \par
13  Institute of Structural & Molecular Biology,
14  University College London,
15  Gower Street,
16  London.
17  WC1E 6BT.
18  \par
19  andrew@bioinf.org.uk
20  andrew.martin@ucl.ac.uk
21 
22 **************************************************************************
23 
24  This code is NOT IN THE PUBLIC DOMAIN, but it may be copied
25  according to the conditions laid out in the accompanying file
26  COPYING.DOC.
27 
28  The code may be modified as required, but any modifications must be
29  documented so that the person responsible can be identified.
30 
31  The code may not be sold commercially or included as part of a
32  commercial product except as described in the file COPYING.DOC.
33 
34 **************************************************************************
35 
36  Description:
37  ============
38 
39 
40 **************************************************************************
41 
42  Usage:
43  ======
44 
45 \code
46  int blReadRawPIR(FILE *fp, char **seqs, int maxchain, BOOL upcase,
47  SEQINFO *seqinfo, BOOL *error)
48 \endcode
49 
50  As ReadPIR(), but reads punctuation characters without taking any
51  special action. Used when punctuation characters have been used
52  to indicate consensus sequence features.
53 
54 **************************************************************************
55 
56  Revision History:
57  =================
58 - V1.0 01.06.92 Original
59 - V2.0 08.03.94 Changed name of ReadPIR() to ReadSimplePIR()
60  Added new ReadPIR().
61 - V2.1 18.03.94 getc() -> fgetc()
62 - V2.2 11.05.94 Changes to ReadPIR() for better compatibility with
63  PIR V38.0 and V39.0
64 - V2.3 28.02.95 Added ReadRawPIR()
65 - V2.4 13.03.95 Fixed bug in reading text lines in ReadRawPIR()
66 - V2.5 26.07.95 Removed unused variables
67 - V2.6 30.10.95 Cosmetic
68 - V2.7 06.02.96 Removes trailing spaces from comment line
69 - V2.8 07.07.14 Use bl prefix for functions By: CTP
70 
71 *************************************************************************/
72 /* Doxygen
73  -------
74  #GROUP Handling Sequence Data
75  #SUBGROUP File IO
76  #FUNCTION blReadRawPIR()
77  This is based on ReadPIR(), but reads all characters into the
78  sequence arrays (i.e. all punctuation characters are read as is).
79  This is useful when punctuation has been used to indicate
80  consensus sequence features.
81 */
82 /************************************************************************/
83 /* Includes
84 */
85 #include <stdio.h>
86 #include <stdlib.h>
87 
88 #include "SysDefs.h"
89 #include "seq.h"
90 #include "macros.h"
91 
92 /************************************************************************/
93 /* Defines and macros
94 */
95 
96 /************************************************************************/
97 /* Globals
98 */
99 
100 /************************************************************************/
101 /* Prototypes
102 */
103 
104 
105 /************************************************************************/
106 /*>int blReadRawPIR(FILE *fp, char **seqs, int maxchain, BOOL upcase,
107  SEQINFO *seqinfo, BOOL *error)
108  ------------------------------------------------------------------
109 *//**
110 
111  \param[in] *fp File pointer
112  \param[in] maxchain Max number of chains to read. This is the
113  dimension of the seqs array.
114  N.B. THIS SHOULD BE AT LEAST 1 MORE THAN
115  THE EXPECTED MAXIMUM NUMBER OF SEQUENCES
116  \param[in] upcase Should lower-case letters be upcased?
117  \param[out] **seqs Array of character pointers which will
118  be filled in with sequence information.
119  Memory will be allocated for any sequence
120  length.
121  \param[out] *seqinfo This structure will be filled in with
122  extra information about the sequence.
123  Header & title information and details
124  of any punctuation.
125  \param[out] *error TRUE if an error occured (e.g. memory
126  allocation)
127  \return Number of chains in this sequence.
128  0 if file ended, or no valid sequence
129  entries found.
130 
131  This is based on ReadPIR(), but reads all characters into the
132  sequence arrays (i.e. all punctuation characters are read as is).
133  This is useful when punctuation has been used to indicate
134  consensus sequence features.
135 
136  The only requirements of the code are that the PIR file should have
137  2 title lines per entry, the first line starting with a > sign.
138  The routine will handle multiple sequence files. Successive calls
139  will return information on the next entry. The routine will return
140  0 when there are no more entries.
141 
142  Header line: Must start with >. Will handle files which don't have
143  the proper P1; or F1; parts of the header as well as those which
144  do.
145 
146  Title line: Will read the name and source fields if correctly
147  separated by a -, otherwise copies all information into the name.
148 
149  White space and line breaks are ignored. Each chain should end with
150  a *, but the routine will accept the last chain of an entry with no
151  *. While the standard requires upper case text, this routine will
152  handle lower case and convert it to upper case. While the routine
153  does pretty well at last chains not terminated with a *, a last
154  chain ending with a / not followed by a * but followed by a text
155  line will be identified as incomplete rather than truncated.
156  If the DoInsert flag is set, - signs in the sequence will be
157  read as part of the sequence, otherwise they will be skipped. This
158  is an addition to the PIR standard.
159 
160  Text lines: Text lines after an entry (beginning with R;, C;, A;,
161  N; or F;) are ignored.
162 
163 - 28.02.95 Original based on ReadPIR() By: ACRM
164 - 13.03.95 chpos++ had got moved wrongly when adapting from ReadPIR().
165  Put it back fixing handling of text lines.
166 - 26.07.95 Removed unused variables
167 - 06.02.96 Remove any trailing spaces
168 */
169 int blReadRawPIR(FILE *fp, char **seqs, int maxchain, BOOL upcase,
170  SEQINFO *seqinfo, BOOL *error)
171 {
172  int ch,
173  i,
174  chpos,
175  nseq = 0,
176  ArraySize,
177  SeqPos;
178  char buffer[504],
179  *ptr;
180  BOOL GotStar;
181 
182  /* Initialise error and punct outputs */
183  *error = FALSE;
184 
185  /* Initialise seqinfo structure */
186  if(seqinfo != NULL)
187  {
188  seqinfo->code[0] = '\0';
189  seqinfo->name[0] = '\0';
190  seqinfo->source[0] = '\0';
191  seqinfo->fragment = FALSE;
192  seqinfo->paren = FALSE;
193  seqinfo->DotInParen = FALSE;
194  seqinfo->NonExpJoin = FALSE;
195  seqinfo->UnknownPos = FALSE;
196  seqinfo->Incomplete = FALSE;
197  seqinfo->Juxtapose = FALSE;
198  seqinfo->Truncated = FALSE;
199  }
200 
201  /* Skip over any characters until the first > sign */
202  while((ch=fgetc(fp)) != EOF && ch != '>') ;
203 
204  /* Check for end of file */
205  if(ch==EOF) return(0);
206 
207  /* Read the rest of this line into a buffer */
208  i = 0;
209  while((ch=fgetc(fp)) != EOF && ch != '\n' && i<503)
210  buffer[i++] = (char)ch;
211  buffer[i] = '\0';
212 
213  /* Check for end of file */
214  if(ch==EOF) return(0);
215 
216  /* Set information in the seqinfo structure */
217  if(seqinfo != NULL)
218  {
219  /* Fragment flag */
220  if(buffer[2] == ';' && buffer[0] == 'F')
221  seqinfo->fragment = TRUE;
222  else
223  seqinfo->fragment = FALSE;
224 
225  /* Entry code */
226  if(buffer[2] == ';')
227  {
228  KILLLEADSPACES(ptr,(buffer+3));
229  }
230  else
231  {
232  KILLLEADSPACES(ptr,buffer);
233  }
234 
235  strncpy(seqinfo->code, ptr, 16);
236  seqinfo->code[15] = '\0';
237 
238  /* Terminate entry code at first space since comments are allowed
239  after the entry code (V39.0 spec)
240  */
241  for(i=0; seqinfo->code[i]; i++)
242  {
243  if(seqinfo->code[i] == ' ' || seqinfo->code[i] == '\t')
244  {
245  seqinfo->code[i] = '\0';
246  break;
247  }
248  }
249  }
250 
251  /* Now read the title line */
252  if(!fgets(buffer,240,fp))
253  return(0);
254  buffer[240] = '\0';
255  /* 06.02.96 Remove any trailing spaces */
256  KILLTRAILSPACES(buffer);
257 
258  /* Set information in the seqinfo structure */
259  if(seqinfo)
260  {
261  TERMINATE(buffer);
262  /* If it's a fully legal PIR file, there will be a - in the midle
263  of the title line to separate name from source. If we don't
264  find one, we copy the whole line into the name
265  */
266  if((ptr = strstr(buffer," - ")) != NULL)
267  {
268  *ptr = '\0';
269  strncpy(seqinfo->source, ptr+3, 160);
270  seqinfo->source[159] = '\0';
271  }
272  strncpy(seqinfo->name, buffer, 160);
273  seqinfo->name[159] = '\0';
274  /* 06.02.96 Remove any trailing spaces */
275  KILLTRAILSPACES(seqinfo->name);
276  }
277 
278  /* Read the actual sequence info. */
279  chpos = 0;
280  for(;;)
281  {
282  GotStar = FALSE;
283 
284  /* Allocate some space for the sequence */
285  ArraySize = ALLOCSIZE;
286  if((seqs[nseq] = (char *)malloc(ArraySize * sizeof(char)))==NULL)
287  {
288  *error = TRUE;
289  return(0);
290  }
291 
292  SeqPos = 0;
293 
294  /* Read characters, storing sequence and handling any
295  special punctuation (end or start of new sequence)
296  */
297  while((ch = fgetc(fp)) != EOF && ch != '*' && ch != '>')
298  {
299  chpos++;
300 
301  if(ch == '\n')
302  {
303  /* Start of new line, relevant to check on ; */
304  chpos = 0;
305  }
306  else if(ch == ';' && chpos == 2)
307  {
308  /* This is a text line, so the previous character wasn't
309  a sequence item
310  */
311  SeqPos--;
312 
313  /* Ignore the rest of this line and reset chpos */
314  while((ch = fgetc(fp))!=EOF && ch != '\n') ;
315  chpos = 0;
316  }
317  else if(ch != ' ' && ch != '\t')
318  {
319  /* This is a sequence entry (probably!) */
320  seqs[nseq][SeqPos++] =
321  (upcase ? (isupper(ch) ? ch : toupper(ch)) : ch);
322 
323  /* If necessary, expand the sequence array */
324  if(SeqPos >= ArraySize)
325  {
326  ArraySize += ALLOCSIZE;
327  seqs[nseq] = (char *)realloc((void *)(seqs[nseq]),
328  ArraySize);
329  if(seqs[nseq] == NULL)
330  {
331  *error = TRUE;
332  return(0);
333  }
334  }
335  }
336  } /* Reading this sequence */
337 
338  /* Test the exit conditions from the read character loop */
339  if(ch == '*')
340  {
341  /* End of chain */
342  seqs[nseq][SeqPos] = '\0';
343  GotStar = TRUE;
344  if(++nseq >= maxchain)
345  {
346  *error = TRUE;
347  return(nseq);
348  }
349  }
350  else if(ch == '>')
351  {
352  /* Start of new entry */
353  ungetc(ch,fp);
354  break; /* Out of read for this sequence */
355  }
356  else if(ch == EOF)
357  {
358  /* End of file */
359  break; /* Out of read for this sequence */
360  }
361  } /* Loop on with this sequence (next chain) */
362 
363  /* Now tidy up if we have an unfinished sequence */
364  if(!GotStar)
365  {
366  seqs[nseq][SeqPos] = '\0';
367  if(!strlen(seqs[nseq]))
368  free(seqs[nseq]);
369  else
370  nseq++;
371  }
372 
373  return(nseq);
374 }
375 
char code[16]
Definition: seq.h:100
short BOOL
Definition: SysDefs.h:64
#define NULL
Definition: array2.c:99
BOOL UnknownPos
Definition: seq.h:92
#define KILLTRAILSPACES(x)
Definition: macros.h:414
BOOL fragment
Definition: seq.h:92
#define FALSE
Definition: macros.h:223
Definition: seq.h:90
Useful macros.
#define TERMINATE(x)
Definition: macros.h:366
Header file for sequence handling.
BOOL Incomplete
Definition: seq.h:92
BOOL DotInParen
Definition: seq.h:92
BOOL NonExpJoin
Definition: seq.h:92
#define TRUE
Definition: macros.h:219
BOOL paren
Definition: seq.h:92
char source[160]
Definition: seq.h:100
#define KILLLEADSPACES(y, x)
Definition: macros.h:408
System-type variable type definitions.
int blReadRawPIR(FILE *fp, char **seqs, int maxchain, BOOL upcase, SEQINFO *seqinfo, BOOL *error)
Definition: ReadRawPIR.c:169
BOOL Juxtapose
Definition: seq.h:92
char name[160]
Definition: seq.h:100
#define ALLOCSIZE
BOOL Truncated
Definition: seq.h:92