• Main Page
  • Related Pages
  • Namespaces
  • Data Structures
  • Files
  • File List
  • Globals

SimpleTokenizer.h

Go to the documentation of this file.
00001 /** @file
00002 
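00003   SimpleTokenizer: a small delimiter-based string tokenizer with optional
00003   null-field handling, whitespace trimming and delimiter escaping.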
00004 
00005   @section license License
00006 
00007   Licensed to the Apache Software Foundation (ASF) under one
00008   or more contributor license agreements.  See the NOTICE file
00009   distributed with this work for additional information
00010   regarding copyright ownership.  The ASF licenses this file
00011   to you under the Apache License, Version 2.0 (the
00012   "License"); you may not use this file except in compliance
00013   with the License.  You may obtain a copy of the License at
00014 
00015       http://www.apache.org/licenses/LICENSE-2.0
00016 
00017   Unless required by applicable law or agreed to in writing, software
00018   distributed under the License is distributed on an "AS IS" BASIS,
00019   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00020   See the License for the specific language governing permissions and
00021   limitations under the License.
00022  */
00023 
00024 #ifndef _SIMPLE_TOKENIZER_H_
00025 #define _SIMPLE_TOKENIZER_H_
00026 
00027 #include <stdlib.h>
00028 #include <string.h>
00029 #include <ctype.h>
00030 
00031 /*-----------------------------------------------------------------------------
00032   SimpleTokenizer
00033 
00034   This class provides easy token parsing from an input string. It supports:
00035 
00036   1- ignoring (or not) of null fields
00037   2- left whitespace trimming
00038   3- right whitespace trimming
00039   4- escaping the delimiter character with a user defined escape character
00040 
00041   The class has two constructors, one that defines the input string,
00042   and another one that does not. If the latter is used, then the
00043   setString method should be used to set the data string.
00044 
00045   Both constructors set the delimiter, the operation mode (which
00046   defines bullets 1-3 above), and the escape character.
00047 
00048   The available methods are:
00049 
00050   void setString(char *s)
00051   sets the data string to s. The mode specified upon construction of the
00052   tokenizer determines whether s is copied or not.
00053 
00054   char *getNext()
00055   returns the next token, or NULL if there are no more tokens. This method
00056   uses the delimiter specified upon object construction.
00057 
00058   char *getNext(char delimiter)
00059   similar to getNext(), but allows the user to change the delimiter (just for
00060   this call).
00061 
00062   char *getNext(int count)
00063   get the next count tokens as a single token (ignoring the delimiters in
00064   between).
00065 
00066   char *getNext(char delimiter, int count)
00067   this is similar to getNext(int count) but allows user to specify the
00068   delimiter.
00069 
00070   IMPORTANT: the char pointers returned by the SimpleTokenizer are valid
00071   ONLY during the lifetime of the object. The copy of the input string
00072   is destroyed by the object's destructor.
00073 
00074   char *getRest()
00075   returns the rest of the tokens all together. Advances pointer so a
00076   subsequent call to getNext returns NULL;
00077 
00078   char *peekAtRestOfString()
00079   returns the rest of the input string, but DOES NOT advance pointer so a
00080   subsequent call to getNext does return the next token (if there is still
00081   one).
00082 
00083   size_t getNumTokensRemaining()
00084   returns the number of tokens remaining in the string (using the delimiter
00085   specified upon object construction).
00086 
00087   size_t getNumTokensRemaining(char delimiter)
00088   similar to the above, but allows the user to change the delimiter (just for
00089   this call).
00090 
00091   Note that multiple delimiters are not supported (more than one per call).
00092 
00093   examples:
00094 
00095   SimpleTokenizer tok("one    two\\ and\\ three four:   five : six");
00096   tok.getNumTokensRemaining() --> 6     note calculation is done assuming
00097                                         space is the delimiter
00098   tok.getNext() -> "one"
00099   tok.getNext() -> "two and three"
00100   tok.getNext(':') -> "four"
00101   tok.peekAtRestOfString() -> "   five : six"
00102   tok.getNext(':') -> "five"
00103 
00104   SimpleTokenizer tok(",  with null fields ,,,", ',',
00105                       CONSIDER_NULL_FIELDS | KEEP_WHITESPACE);
00106   tok.getNext() -> ""
00107   tok.getNext() -> "  with null fields "
00108   tok.getNumTokensRemaining() -> 3
00109 
00110   ---------------------------------------------------------------------------*/
00111 
00112 class SimpleTokenizer
00113 {
00114 public:
00115 
00116   // by default, null fields are disregarded, whitespace is trimmed left
00117   // and right, and input string is copied (not overwritten)
00118   //
00119   enum
00120   {
00121     CONSIDER_NULL_FIELDS = 1,
00122     KEEP_WHITESPACE_LEFT = 2,
00123     KEEP_WHITESPACE_RIGHT = 4,
00124     KEEP_WHITESPACE = KEEP_WHITESPACE_LEFT + KEEP_WHITESPACE_RIGHT,
00125     OVERWRITE_INPUT_STRING = 8
00126   };
00127 
00128   SimpleTokenizer(char delimiter = ' ', unsigned mode = 0, char escape = '\\')
00129     : _data(0), _delimiter(delimiter), _mode(mode), _escape(escape), _start(0), _length(0)
00130   {  }
00131 
00132   // NOTE: The input strring 's' is overwritten for mode OVERWRITE_INPUT_STRING.
00133   SimpleTokenizer(const char *s, char delimiter = ' ', unsigned mode = 0, char escape = '\\')
00134   : _data(0), _delimiter(delimiter), _mode(mode), _escape(escape)
00135   {
00136     setString(s);
00137   }
00138 
00139   ~SimpleTokenizer() {
00140     _clearData();
00141   }
00142 
00143   void setString(const char *s)
00144   {
00145     _clearData();
00146 
00147     _start = 0;
00148     _length = strlen(s);
00149     _data = (_mode & OVERWRITE_INPUT_STRING ? const_cast<char *>(s) : ats_strdup(s));
00150 
00151     // to handle the case where there is a null field at the end of the
00152     // input string, we replace the null character at the end of the
00153     // string with the delimiter (and consider the string to be one
00154     // character larger).
00155     //
00156     _data[_length++] = _delimiter;
00157   };
00158   char *getNext(int count = 1) {
00159     return _getNext(_delimiter, false, count);
00160   };
00161   char *getNext(char delimiter, int count = 1) {
00162     return _getNext(delimiter, false, count);
00163   }
00164   char *getRest()
00165   {
00166     // there can't be more than _length tokens, so we get the rest
00167     // of the tokens by requesting _length of them
00168     //
00169     return _getNext(_delimiter, false, _length);
00170   }
00171   size_t getNumTokensRemaining()
00172   {
00173     return _getNumTokensRemaining(_delimiter);
00174   };
00175   size_t getNumTokensRemaining(char delimiter)
00176   {
00177     return _getNumTokensRemaining(delimiter);
00178   };
00179   char *peekAtRestOfString()
00180   {
00181     _data[_length - 1] = 0;
00182     return (_start < _length ? &_data[_start] : &_data[_length - 1]);
00183   }
00184 
00185 private:
00186 
00187   char *_data;                  // a pointer to the input data itself,
00188   // or to a copy of it
00189   char _delimiter;              // the token delimiter
00190   unsigned _mode;                    // flags that determine the
00191   // mode of operation
00192   char _escape;                 // the escape character
00193   size_t _start;                // pointer to the start of the next
00194   // token
00195   size_t _length;               // the length of _data
00196 
00197   void _clearData()
00198   {
00199     if (_data && !(_mode & OVERWRITE_INPUT_STRING)) {
00200       ats_free(_data);
00201     }
00202   }
00203 
00204   char *_getNext(char delimiter, bool countOnly = false, int numTokens = 1) {
00205     char *next = NULL;
00206 
00207     if (_start < _length) {
00208       // set start
00209       //
00210       bool hasEsc = false;      // escape character seen
00211       while (_start < _length &&
00212              ((!(_mode & CONSIDER_NULL_FIELDS) &&
00213                (_data[_start] == delimiter &&
00214                 !(_start &&
00215                   (_data[_start - 1] == _escape ? (hasEsc = true) : 0)))) ||
00216               (!(_mode & KEEP_WHITESPACE_LEFT) && isspace(_data[_start])))) {
00217         ++_start;
00218       }
00219 
00220       if (_start < _length)     // data still available
00221       {
00222         // update the extra delimiter just in case the function
00223         // is called with a different delimiter from the previous one
00224         //
00225         _data[_length - 1] = delimiter;
00226 
00227         next = &_data[_start];
00228 
00229         // set end
00230         //
00231         size_t end = _start;
00232         int delimCount = 0;
00233         while (end < _length &&
00234                (_data[end] != delimiter ||
00235                 (end && (_data[end - 1] == _escape ? (hasEsc = true) : 0)) ||
00236                 ((++delimCount < numTokens) && (end < _length - 1)))) {
00237           ++end;
00238         }
00239 
00240         _start = end + 1;
00241 
00242         // there can be delimiters at the end if the number of tokens
00243         // requested is larger than 1, remove them if the
00244         // CONSIDER_NULL_FIELDS flag is not set
00245         //
00246         if (!(_mode & CONSIDER_NULL_FIELDS)) {
00247           while (_data[--end] == delimiter);
00248           ++end;
00249         }
00250 
00251         if (!(_mode & KEEP_WHITESPACE_RIGHT)) {
00252           while (isspace(_data[--end]));
00253           ++end;
00254         }
00255 
00256         if (!countOnly) {
00257           _data[end] = 0;
00258 
00259           // remove escape characters only if the number of
00260           // delimiters is one
00261           //
00262           if (hasEsc && delimCount == 1) {
00263             int numEscape = 0, i = 0;
00264             while (next[i]) {
00265               if (next[i] == _escape) {
00266                 ++numEscape;
00267               } else {
00268                 next[i - numEscape] = next[i];
00269               }
00270               ++i;
00271             }
00272             _data[end - numEscape] = 0;
00273           }
00274         }
00275       }
00276     }
00277     return next;
00278   };
00279 
00280   size_t _getNumTokensRemaining(char delimiter)
00281   {
00282     size_t startSave = _start;  // save current position
00283     size_t count = 0;
00284     while (_getNext(delimiter, true)) {
00285       ++count;
00286     };
00287     _start = startSave;
00288     return count;
00289   };
00290 };
00291 
00292 #endif

Generated by  doxygen 1.7.1