00001 /** @file 00002 00003 A brief file description 00004 00005 @section license License 00006 00007 Licensed to the Apache Software Foundation (ASF) under one 00008 or more contributor license agreements. See the NOTICE file 00009 distributed with this work for additional information 00010 regarding copyright ownership. The ASF licenses this file 00011 to you under the Apache License, Version 2.0 (the 00012 "License"); you may not use this file except in compliance 00013 with the License. You may obtain a copy of the License at 00014 00015 http://www.apache.org/licenses/LICENSE-2.0 00016 00017 Unless required by applicable law or agreed to in writing, software 00018 distributed under the License is distributed on an "AS IS" BASIS, 00019 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00020 See the License for the specific language governing permissions and 00021 limitations under the License. 00022 */ 00023 00024 #ifndef _SIMPLE_TOKENIZER_H_ 00025 #define _SIMPLE_TOKENIZER_H_ 00026 00027 #include <stdlib.h> 00028 #include <string.h> 00029 #include <ctype.h> 00030 00031 /*----------------------------------------------------------------------------- 00032 SimpleTokenizer 00033 00034 This class provides easy token parsing from an input string. It supports: 00035 00036 1- ignoring (or not) of null fields 00037 2- left whitespace trimming 00038 3- right whitespace trimming 00039 4- escaping the delimiter character with a user defined escape character 00040 00041 The class has two constructors, one that defines the input string, 00042 and another one that does not. If the latter is used, then the 00043 setString method should be used to set the data string. 00044 00045 Both constructors set the delimiter, the operation mode (which 00046 defines bullets 1-3 above), and the escape character. 00047 00048 The available methods are: 00049 00050 void setString(char *s) 00051 sets the data string to s. 
The mode specified upon construction of the 00052 tokenizer determines whether s is copied or not. 00053 00054 char *getNext() 00055 returns the next token, or NULL if there are no more tokens. This method 00056 uses the delimiter specified upon object construction. 00057 00058 char *getNext(char delimiter) 00059 similar to getNext(), but allows the user to change the delimiter (just for 00060 this call). 00061 00062 char *getNext(int count) 00063 get the next count tokens as a single token (ignoring the delimiters in 00064 between). 00065 00066 char *getNext(char delimiter, int count) 00067 this is similar to getNext(int count) but allows user to specify the 00068 delimiter. 00069 00070 IMPORTANT: the char pointers returned by the SimpleTokenizer are valid 00071 ONLY during the lifetime of the object. The copy of the input string 00072 is destroyed by the object's destructor. 00073 00074 char *getRest() 00075 returns the rest of the tokens all together. Advances pointer so a 00076 subsequent call to getNext returns NULL; 00077 00078 char *peekAtRestOfString() 00079 returns the rest of the input string, but DOES NOT advance pointer so a 00080 subsequent call to getNext does return the next token (if there is still 00081 one). 00082 00083 size_t getNumTokensRemaining() 00084 returns the number of tokens remaining in the string (using the delimiter 00085 specified upon object construction). 00086 00087 size_t getNumTokensRemaining(char delimiter) 00088 similar to the above, but allows the user to change the delimiter (just for 00089 this call). 00090 00091 Note that multiple delimiters are not supported (more than one per call). 
00092 00093 examples: 00094 00095 SimpleTokenizer tok("one two\\ and\\ three four: five : six"); 00096 tok.getNumTokensRemaining() --> 5 note calculation is done assuming 00097 space is the delimiter 00098 tok.getNext() -> "one" 00099 tok.getNext() -> "two and three" 00100 tok.getNext(':') -> "four" 00101 tok.peekAtRestOfString() -> " five : six" 00102 tok.getNext(':') -> "five" 00103 00104 SimpleTokenizer tok(", with null fields ,,,", ',', 00105 CONSIDER_NULL_FIELDS | KEEP_WHITESPACE); 00106 tok.getNext() -> "" 00107 tok.getNext() -> " with null fields " 00108 tok.getNumTokensRemaining() -> 3 00109 00110 ---------------------------------------------------------------------------*/ 00111 00112 class SimpleTokenizer 00113 { 00114 public: 00115 00116 // by default, null fields are disregarded, whitespace is trimmed left 00117 // and right, and input string is copied (not overwritten) 00118 // 00119 enum 00120 { 00121 CONSIDER_NULL_FIELDS = 1, 00122 KEEP_WHITESPACE_LEFT = 2, 00123 KEEP_WHITESPACE_RIGHT = 4, 00124 KEEP_WHITESPACE = KEEP_WHITESPACE_LEFT + KEEP_WHITESPACE_RIGHT, 00125 OVERWRITE_INPUT_STRING = 8 00126 }; 00127 00128 SimpleTokenizer(char delimiter = ' ', unsigned mode = 0, char escape = '\\') 00129 : _data(0), _delimiter(delimiter), _mode(mode), _escape(escape), _start(0), _length(0) 00130 { } 00131 00132 // NOTE: The input strring 's' is overwritten for mode OVERWRITE_INPUT_STRING. 00133 SimpleTokenizer(const char *s, char delimiter = ' ', unsigned mode = 0, char escape = '\\') 00134 : _data(0), _delimiter(delimiter), _mode(mode), _escape(escape) 00135 { 00136 setString(s); 00137 } 00138 00139 ~SimpleTokenizer() { 00140 _clearData(); 00141 } 00142 00143 void setString(const char *s) 00144 { 00145 _clearData(); 00146 00147 _start = 0; 00148 _length = strlen(s); 00149 _data = (_mode & OVERWRITE_INPUT_STRING ? 
const_cast<char *>(s) : ats_strdup(s)); 00150 00151 // to handle the case where there is a null field at the end of the 00152 // input string, we replace the null character at the end of the 00153 // string with the delimiter (and consider the string to be one 00154 // character larger). 00155 // 00156 _data[_length++] = _delimiter; 00157 }; 00158 char *getNext(int count = 1) { 00159 return _getNext(_delimiter, false, count); 00160 }; 00161 char *getNext(char delimiter, int count = 1) { 00162 return _getNext(delimiter, false, count); 00163 } 00164 char *getRest() 00165 { 00166 // there can't be more than _length tokens, so we get the rest 00167 // of the tokens by requesting _length of them 00168 // 00169 return _getNext(_delimiter, false, _length); 00170 } 00171 size_t getNumTokensRemaining() 00172 { 00173 return _getNumTokensRemaining(_delimiter); 00174 }; 00175 size_t getNumTokensRemaining(char delimiter) 00176 { 00177 return _getNumTokensRemaining(delimiter); 00178 }; 00179 char *peekAtRestOfString() 00180 { 00181 _data[_length - 1] = 0; 00182 return (_start < _length ? 
&_data[_start] : &_data[_length - 1]); 00183 } 00184 00185 private: 00186 00187 char *_data; // a pointer to the input data itself, 00188 // or to a copy of it 00189 char _delimiter; // the token delimiter 00190 unsigned _mode; // flags that determine the 00191 // mode of operation 00192 char _escape; // the escape character 00193 size_t _start; // pointer to the start of the next 00194 // token 00195 size_t _length; // the length of _data 00196 00197 void _clearData() 00198 { 00199 if (_data && !(_mode & OVERWRITE_INPUT_STRING)) { 00200 ats_free(_data); 00201 } 00202 } 00203 00204 char *_getNext(char delimiter, bool countOnly = false, int numTokens = 1) { 00205 char *next = NULL; 00206 00207 if (_start < _length) { 00208 // set start 00209 // 00210 bool hasEsc = false; // escape character seen 00211 while (_start < _length && 00212 ((!(_mode & CONSIDER_NULL_FIELDS) && 00213 (_data[_start] == delimiter && 00214 !(_start && 00215 (_data[_start - 1] == _escape ? (hasEsc = true) : 0)))) || 00216 (!(_mode & KEEP_WHITESPACE_LEFT) && isspace(_data[_start])))) { 00217 ++_start; 00218 } 00219 00220 if (_start < _length) // data still available 00221 { 00222 // update the extra delimiter just in case the function 00223 // is called with a different delimiter from the previous one 00224 // 00225 _data[_length - 1] = delimiter; 00226 00227 next = &_data[_start]; 00228 00229 // set end 00230 // 00231 size_t end = _start; 00232 int delimCount = 0; 00233 while (end < _length && 00234 (_data[end] != delimiter || 00235 (end && (_data[end - 1] == _escape ? 
(hasEsc = true) : 0)) || 00236 ((++delimCount < numTokens) && (end < _length - 1)))) { 00237 ++end; 00238 } 00239 00240 _start = end + 1; 00241 00242 // there can be delimiters at the end if the number of tokens 00243 // requested is larger than 1, remove them if the 00244 // CONSIDER_NULL_FIELDS flag is not set 00245 // 00246 if (!(_mode & CONSIDER_NULL_FIELDS)) { 00247 while (_data[--end] == delimiter); 00248 ++end; 00249 } 00250 00251 if (!(_mode & KEEP_WHITESPACE_RIGHT)) { 00252 while (isspace(_data[--end])); 00253 ++end; 00254 } 00255 00256 if (!countOnly) { 00257 _data[end] = 0; 00258 00259 // remove escape characters only if the number of 00260 // delimiters is one 00261 // 00262 if (hasEsc && delimCount == 1) { 00263 int numEscape = 0, i = 0; 00264 while (next[i]) { 00265 if (next[i] == _escape) { 00266 ++numEscape; 00267 } else { 00268 next[i - numEscape] = next[i]; 00269 } 00270 ++i; 00271 } 00272 _data[end - numEscape] = 0; 00273 } 00274 } 00275 } 00276 } 00277 return next; 00278 }; 00279 00280 size_t _getNumTokensRemaining(char delimiter) 00281 { 00282 size_t startSave = _start; // save current position 00283 size_t count = 0; 00284 while (_getNext(delimiter, true)) { 00285 ++count; 00286 }; 00287 _start = startSave; 00288 return count; 00289 }; 00290 }; 00291 00292 #endif
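// 1.7.1 -- stray version string (presumably left over from the tool that generated this listing); not part of the header.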