Bioplib
Protein Structure C Library
 All Data Structures Files Functions Variables Typedefs Macros Pages
GetWord.c
Go to the documentation of this file.
1 /************************************************************************/
2 /**
3 
4  \file GetWord.c
5 
6  \version V2.4
7  \date 12.03.15
8  \brief Get a space delimited word from a string
9 
10  \copyright (c) UCL / Dr. Andrew C. R. Martin 1995-2015
11  \author Dr. Andrew C. R. Martin
12  \par
13  Institute of Structural & Molecular Biology,
14  University College London,
15  Gower Street,
16  London.
17  WC1E 6BT.
18  \par
19  andrew@bioinf.org.uk
20  andrew.martin@ucl.ac.uk
21 
22 **************************************************************************
23 
24  This code is NOT IN THE PUBLIC DOMAIN, but it may be copied
25  according to the conditions laid out in the accompanying file
26  COPYING.DOC.
27 
28  The code may be modified as required, but any modifications must be
29  documented so that the person responsible can be identified.
30 
31  The code may not be sold commercially or included as part of a
32  commercial product except as described in the file COPYING.DOC.
33 
34 **************************************************************************
35 
36  Description:
37  ============
38 
39 
40 **************************************************************************
41 
42  Usage:
43  ======
44 
45 **************************************************************************
46 
47  Revision History:
48  =================
49 - V1.0 02.03.99 Original By: ACRM
50 - V2.0 10.06.99 Complete rewrite to allow escaping of characters
51 - V2.1 07.07.14 Use bl prefix for functions By: CTP
52 - V2.2 03.08.14 Made doGetWord() a static function. By: CTP
53 - V2.3 10.03.15 Added blSplitStringOnCommas() By: ACRM
54 - V2.4 12.03.15 Added blSplitStringOnChars()
55 
56 *************************************************************************/
57 /* Doxygen
58  -------
59  #GROUP General Programming
60  #SUBGROUP String handling
61 
62  #FUNCTION blGetWord()
63  Reads a whitespace/comma delimited word out of buffer into word.
64 
65  #FUNCTION blGetWordNC()
66  Reads a whitespace delimited word out of buffer into word. Commas
67  are treated just like normal characters.
68 
69  #FUNCTION blSplitStringOnCommas()
70  Split a comma-separated string into an array of items. Mallocs a
71  2D array
72 
73  #FUNCTION blSplitStringOnChars()
74  Split a string into an array of individual characters, but each
75  stored as a string. Mallocs a 2D array
76 */
77 /************************************************************************/
78 /* Includes
79 */
80 #include <stdio.h>
81 #include <string.h>
82 #include "macros.h"
83 #include "SysDefs.h"
84 #include "array.h"
85 
86 /************************************************************************/
87 /* Defines and macros
88 */
89 
90 /************************************************************************/
91 /* Globals
92 */
93 
94 /************************************************************************/
95 /* Prototypes
96 */
97 static char *doGetWord(char *buffer, char *word, int maxlen, BOOL comma);
98 
99 
100 /************************************************************************/
101 /*>static char *doGetWord(char *buffer, char *word, int maxlen,
102  BOOL comma)
103  ------------------------------------------------------------
104 *//**
105 
106  \param[in] *buffer Input buffer to read words from
107  \param[in] maxlen Max length of output word
108  \param[in] comma Treat commas like white space?
109  \param[out] *word Word read from buffer
110  \return Pointer to start of next word in buffer
111  or NULL
112 
113  This code is designed to be called from GetWord() or GetWordNC()
114 
115  Reads a whitespace delimited word out of buffer into word. If comma is
116  TRUE, then commas are treated just like white space, otherwise they
117  are treated like normal characters.
118 
119  Words containing white space may be wrapped in double inverted commas.
120  A \ is used as an escape character and maybe used to escape *any*
121  following character. In particular:
122  "\\" -> '\' To get a backslash
123  "\ " -> ' ' To get a hard whitespace (alternatively wrap the
124  string in double inverted commas)
125  "\"" -> '"' To get a double inverted comma
126 
127 - 10.06.99 Original By: ACRM (based on code from Bioplib)
128 - 03.08.14 Made static By: CTP
129 */
130 static char *doGetWord(char *buffer, char *word, int maxlen, BOOL comma)
131 {
132  int i, j;
133  BOOL dic = FALSE,
134  escape = FALSE;
135  char *chp;
136 
137  /* Decrement maxlen so we can terminate correctly */
138  maxlen--;
139 
140  /* Check validity of passed pointers */
141  if(word==NULL)
142  return(NULL);
143 
144  word[0] = '\0';
145  if(buffer==NULL)
146  return(NULL);
147 
148  KILLLEADSPACES(chp, buffer);
149 
150  /* Run through each character in the input buffer */
151  for(i=0, j=0; chp[i]; i++)
152  {
153  switch(chp[i])
154  {
155  case '\\':
156  /* Use backslash as an escape character. If we've just had an
157  escape, then simply store it
158  */
159  if(escape)
160  {
161  escape = FALSE;
162  if(j<maxlen)
163  word[j++] = chp[i];
164  }
165  else
166  {
167  escape = TRUE;
168  }
169  break;
170  case '\"':
171  /* Double inverted commas enclose strings containing white space
172  If we've just had an escape then handle as a normal character,
173  otherwise, toggle the dic flag
174  */
175  if(escape)
176  {
177  if(j<maxlen)
178  word[j++] = chp[i];
179  }
180  else
181  {
182  TOGGLE(dic);
183  }
184  escape = FALSE;
185  break;
186  case ',':
187  /* A comma is handled as white space or a normal character,
188  depending on the comma flag
189  */
190  if(!comma) /* Treat as default */
191  {
192  if(j<maxlen)
193  word[j++] = chp[i];
194  escape = FALSE;
195  break;
196  }
197  /* Otherwise, if comma is true, just fall through to treat it
198  like whitespace
199  */
200  case ' ':
201  case '\t':
202  /* If we are in double inverted commas or last char was an escape
203  just handle as a normal character
204  */
205  if(dic || escape)
206  {
207  if(j<maxlen)
208  word[j++] = chp[i];
209  }
210  else
211  {
212  /* Otherwise, this terminates the word, so terminate, move
213  the pointer on and return
214  */
215  word[j] = '\0';
216  chp += i;
217  KILLLEADSPACES(chp, chp);
218  if(comma)
219  {
220  /* If we are handling commas as whitespace, then k
221  the comma if found
222  */
223  if(*chp == ',') chp++;
224  }
225  if(*chp == '\0') chp = NULL;
226  return(chp);
227  }
228  escape = FALSE;
229  break;
230  default:
231  /* A normal character, copy it across */
232  if(j<maxlen)
233  word[j++] = chp[i];
234  escape = FALSE;
235  }
236  }
237 
238  word[j] = '\0';
239  return(NULL);
240 }
241 
242 /************************************************************************/
243 /*>char *blGetWord(char *buffer, char *word, int maxlen)
244  -----------------------------------------------------
245 *//**
246 
247  \param[in] *buffer Input buffer to read words from
248  \param[in] maxlen Max length of output word
249  \param[out] *word Word read from buffer
250  \return Pointer to start of next word in buffer
251  or NULL
252 
253  This code is a wrapper to doGetWord()
254 
255  Reads a whitespace/comma delimited word out of buffer into word.
256 
257  Words containing white space may be wrapped in double inverted commas.
258  A \ is used as an escape character and maybe used to escape *any*
259  following character. In particular:
260  "\\" -> '\' To get a backslash
261  "\ " -> ' ' To get a hard whitespace (alternatively wrap the
262  string in double inverted commas)
263  "\"" -> '"' To get a double inverted comma
264 
265 - 10.06.99 Original By: ACRM
266 - 07.07.14 Use bl prefix for functions By: CTP
267 */
268 char *blGetWord(char *buffer, char *word, int maxlen)
269 {
270  return(doGetWord(buffer, word, maxlen, TRUE));
271 }
272 
273 /************************************************************************/
274 /*>char *blGetWordNC(char *buffer, char *word, int maxlen)
275  -------------------------------------------------------
276 *//**
277 
278  \param[in] *buffer Input buffer to read words from
279  \param[in] maxlen Max length of output word
280  \param[out] *word Word read from buffer
281  \return Pointer to start of next word in buffer
282  or NULL
283 
284  This code is a wrapper to doGetWord()
285 
286  Reads a whitespace delimited word out of buffer into word. Commas
287  are treated just like normal characters.
288 
289  Words containing white space may be wrapped in double inverted commas.
290  A \ is used as an escape character and maybe used to escape *any*
291  following character. In particular:
292  "\\" -> '\' To get a backslash
293  "\ " -> ' ' To get a hard whitespace (alternatively wrap the
294  string in double inverted commas)
295  "\"" -> '"' To get a double inverted comma
296 
297 - 10.06.99 Original By: ACRM
298 - 07.07.14 Use bl prefix for functions By: CTP
299 */
300 char *blGetWordNC(char *buffer, char *word, int maxlen)
301 {
302  return(doGetWord(buffer, word, maxlen, FALSE));
303 }
304 
/************************************************************************/
/*>char **blSplitStringOnCommas(char *string, int minItemLen)
   ----------------------------------------------------------
*//**
   \param[in]     *string      String containing comma-separated items
   \param[in]     minItemLen   Min size to allocate for each item
   \return                     Malloc'd array of strings, or NULL if
                               string is NULL or the allocation failed

   Splits a comma separated list of items malloc'ing a 2D array which
   contains the item strings. The first dimension will be the number
   of items plus one. The second dimension will be the maximum item
   length plus one, or minItemLen if that is larger.

   The last position in the first array will be set to a null string

   Empty items (leading, trailing or adjacent commas) are returned as
   empty strings. The input string is not modified.

   Note that this routine malloc's a 2D array (via blArray2D()) which
   will need to be freed

-  10.03.15 Original   By: ACRM
*/
char **blSplitStringOnCommas(char *string, int minItemLen)
{
   char **items    = NULL;
   char *start,
        *c;
   int  nitems     = 1,               /* One more item than commas     */
        maxItemLen = minItemLen - 1,  /* Longest item, excluding NUL   */
        itemLen    = 0,
        i;

   if(string == NULL)
      return(NULL);

   /* Count the number of comma-separated items in the string. Also record
      the length of the longest item
   */
   for(c=string; *c; c++)
   {
      if(*c == ',')
      {
         if(itemLen > maxItemLen)
            maxItemLen = itemLen;
         nitems++;
         itemLen = 0;
      }
      else
      {
         itemLen++;
      }
   }
   /* The final item is not followed by a comma                        */
   if(itemLen > maxItemLen)
      maxItemLen = itemLen;

   /* Add room for the terminating NUL; this is now always >= 1        */
   maxItemLen++;

   /* Allocate space for the items plus the empty end-of-list marker   */
   if((items = (char **)blArray2D(sizeof(char), nitems+1,
                                  maxItemLen))==NULL)
      return(NULL);

   /* And copy in the data. Each item is copied by length so the input
      does not have to be modified, and is explicitly terminated
   */
   start = string;
   for(i=0; i<nitems; i++)
   {
      char   *end = strchr(start, ',');
      size_t len  = (end != NULL) ? (size_t)(end - start) : strlen(start);

      memcpy(items[i], start, len);
      items[i][len] = '\0';

      /* Only step on when there really is a following item            */
      if(end != NULL)
         start = end + 1;
   }
   items[nitems][0] = '\0';

   return(items);
}
373 
374 
/************************************************************************/
/*>char **blSplitStringOnChars(char *string)
   -----------------------------------------
*//**
   \param[in]     *string    String to be split into single characters
   \return                   Malloc'd array of strings, or NULL if
                             string is NULL or the allocation failed

   Splits a string into a list of items malloc'ing a 2D array which
   contains the item strings, one per character of the input. The first
   dimension will be the number of items plus one. The second dimension
   will be 2 - a single character plus the \0

   The last position in the first array will be set to a null string

   Note that this routine malloc's a 2D array (via blArray2D()) which
   will need to be freed

-  12.03.15 Original   By: ACRM
*/
char **blSplitStringOnChars(char *string)
{
   char **items = NULL;
   int  nitems,
        i;

   if(string == NULL)
      return(NULL);

   /* One item per character in the string                             */
   nitems = (int)strlen(string);

   /* Allocate space for the items plus the empty end-of-list marker   */
   if((items = (char **)blArray2D(sizeof(char), nitems+1, 2))==NULL)
      return(NULL);

   /* And copy in the data: each character becomes a 1-character string */
   for(i=0; i<nitems; i++)
   {
      items[i][0] = string[i];
      items[i][1] = '\0';
   }
   items[nitems][0] = '\0';

   return(items);
}
417 
418 
char * blGetWordNC(char *buffer, char *word, int maxlen)
Definition: GetWord.c:300
short BOOL
Definition: SysDefs.h:64
#define NULL
Definition: array2.c:99
char * blGetWord(char *buffer, char *word, int maxlen)
Definition: GetWord.c:268
char ** blArray2D(int size, int dim1, int dim2)
Definition: array2.c:130
#define FALSE
Definition: macros.h:223
Useful macros.
Include file for 2D/3D array functions.
#define TRUE
Definition: macros.h:219
#define TOGGLE(x)
Definition: macros.h:364
char ** blSplitStringOnChars(char *string)
Definition: GetWord.c:394
char ** blSplitStringOnCommas(char *string, int minItemLen)
Definition: GetWord.c:325
#define KILLLEADSPACES(y, x)
Definition: macros.h:408
System-type variable type definitions.
char * string
Definition: general.h:85