• Main Page
  • Related Pages
  • Namespaces
  • Data Structures
  • Files
  • File List
  • Globals

Tokenizer.h

Go to the documentation of this file.
00001 /** @file
00002 
00003   Tokenizer: splits a string on delimiter characters and provides indexed and iterator access to the tokens
00004 
00005   @section license License
00006 
00007   Licensed to the Apache Software Foundation (ASF) under one
00008   or more contributor license agreements.  See the NOTICE file
00009   distributed with this work for additional information
00010   regarding copyright ownership.  The ASF licenses this file
00011   to you under the Apache License, Version 2.0 (the
00012   "License"); you may not use this file except in compliance
00013   with the License.  You may obtain a copy of the License at
00014 
00015       http://www.apache.org/licenses/LICENSE-2.0
00016 
00017   Unless required by applicable law or agreed to in writing, software
00018   distributed under the License is distributed on an "AS IS" BASIS,
00019   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00020   See the License for the specific language governing permissions and
00021   limitations under the License.
00022  */
00023 
00024 /***************************************/
00025 
00026 #ifndef _TOKENIZER_H_
00027 #define _TOKENIZER_H_
00028 
00029 /****************************************************************************
00030  *
00031  *  Tokenizer.h - A string tokenizer
00032  *
00033  *
00034  *
00035  ****************************************************************************/
00036 
00037 /**********************************************************
00038  *  class Tokenizer
00039  *
00040  *  Tokenizes a string, and then allows array like access
00041  *
00042  *  The delimiters are determined by the string passed to
00043  *   the constructor.
00044  *
00045  *  There are three options; the first two select the memory model.
00046  *     SHARE_TOKS - this modifies the original string passed in
00047  *          through Initialize() and shares its space.   NULs
00048  *          are inserted into the string after each token.  Choosing
00049  *          this option means the user is responsible for not
00050  *          deallocating the string storage before deallocating
00051  *          the tokenizer object
00052  *     COPY_TOKS - this option copies the original string and
00053  *          leaves the original unchanged.  The deallocation of the
00054  *          original string and the deallocation of the Tokenizer
00055  *          object are now independent.
00056  *     Note: If neither SHARE_TOKS nor COPY_TOKS is selected, COPY_TOKS
00057  *          is the default
00058  *     ALLOW_EMPTY_TOKS: If multiple delimiters appear next to each
00059  *          other, each delimiter creates a token, some of which
00060  *          will be zero length.  The default is to skip repeated
00061  *          delimiters
00062  *
00063  *  Tokenizer(const char* StrOfDelimit) - a string that contains
00064  *     the delimiters for tokenizing.  This string is copied.
00065  *
00066  *  Initialize(char* str, int opt) - Submits a string
00067  *     to be tokenized according to the memory options listed above
00068  *
00069  *  ReUse() - Allows the object to be reused for a new string
00070  *     After ReUse() is called, Initialize() can be called safely
00071  *     again
00072  *
00073  *  operator[index] - returns a pointer to the token numbered
00074  *     by index.  If index > numTokens-1, NULL is returned.
00075  *     Because of the way tokens are stored, this is an O(n) operation.
00076  *     It is very fast though for the first 16 tokens and
00077  *     is intended to be used on a small number of tokens
00078  *
00079  *  iterFirst(tok_iter_state* state) - Returns the first
00080  *     token and initializes the state argument for subsequent
00081  *     calls to iterNext.  If no tokens exist, NULL is
00082  *     returned
00083  *
00084  *  iterNext(tok_iter_state* state) - Returns the token after
00085  *     the one the state argument yielded last time.   Returns NULL if no
00086  *     more tokens exist.
00087  *
00088  *  Note: To iterate through a list using operator[] takes O(n^2) time
00089  *      Using iterFirst, iterNext the running time is O(n), so use
00090  *      the iteration where possible
00091  *
00092  *  getNumber() - returns the number of tokens
00093  *
00094  *  setMaxTokens() - sets the maximum number of tokens.  Once maxTokens
00095  *                     is reached, delimiters are ignored and the
00096  *                     last token is the rest of the string.  Negative numbers
00097  *                     mean no limit on the number of tokens
00098  *
00099  *  getMaxTokens() - returns maxTokens.  A negative number means no limit
00100  *
00101  *  Print() - Debugging method to print out the tokens
00102  *
00103  *******************************************************************/
00104 
00105 #include "ink_apidefs.h"
00106 // Tokenizer option bits; each value is parenthesized so it expands safely inside larger expressions (e.g. ~SHARE_TOKS)
00107 #define COPY_TOKS         (1 << 0)  // tokenize a copy; the original string is left unchanged (the default)
00108 #define SHARE_TOKS        (1 << 1)  // tokenize in place; NULs are written into the caller's string
00109 #define ALLOW_EMPTY_TOKS  (1 << 2)  // adjacent delimiters each create a token, some of zero length
00110 
00111 #define TOK_NODE_ELEMENTS  16  // number of pointer slots in each tok_node
00112 struct tok_node  // one link in a singly linked chain of fixed-size pointer arrays used by Tokenizer
00113 {
00114   char *el[TOK_NODE_ELEMENTS];  // NOTE(review): presumably one pointer per token start - confirm in Tokenizer.cc
00115   tok_node *next;  // following node in the chain; NOTE(review): presumably NULL on the last node - confirm
00116 };
00117 
00118 struct tok_iter_state  // caller-owned cursor passed to Tokenizer::iterFirst() and Tokenizer::iterNext()
00119 {
00120   tok_node *node;  // NOTE(review): presumably the node the iteration is currently positioned in - confirm
00121   int index;  // NOTE(review): presumably the slot within node->el - confirm in Tokenizer.cc
00122 };
00123 
00124 
00125 
00126 class Tokenizer  // Splits a string on a set of delimiter characters; tokens are read by index or by iterator
00127 {
00128 public:
00129   inkcoreapi Tokenizer(const char *StrOfDelimiters);  // StrOfDelimiters: the delimiter characters (the file comment says this string is copied)
00130     inkcoreapi ~ Tokenizer();
00131   int Initialize(char *str, int opt);  // tokenizes str; opt combines COPY_TOKS / SHARE_TOKS / ALLOW_EMPTY_TOKS. NOTE(review): return value not visible here - presumably the token count
00132   inkcoreapi int Initialize(const char *str);   // Automatically sets option to copy
00133   const char *operator [] (int index);  // token at index, or NULL when index is past the last token; O(n) per the file comment
00134   void setMaxTokens(int max)  // sets the token cap; per the file comment a negative value means no limit
00135   {
00136     maxTokens = max;
00137   };
00138   int getMaxTokens()  // returns the current token cap (negative means no limit)
00139   {
00140     return maxTokens;
00141   };
00142   int getNumber();  // number of tokens, per the file comment
00143   void Print();                 // Debugging print out
00144   inkcoreapi const char *iterFirst(tok_iter_state * state);  // first token, or NULL if there are none; initializes *state for iterNext()
00145   inkcoreapi const char *iterNext(tok_iter_state * state);  // token after the one last returned through *state, or NULL when none remain
00146 private:
00147   Tokenizer & operator=(const Tokenizer &);  // declared private, no definition visible here - presumably to forbid assignment
00148   Tokenizer(const Tokenizer &);  // declared private, no definition visible here - presumably to forbid copying
00149   int isDelimiter(char c);  // NOTE(review): presumably nonzero when c is one of the delimiter characters - confirm
00150   void addToken(char *startAddr, int length);  // NOTE(review): presumably records a token of length chars at startAddr - confirm
00151   void ReUse();  // NOTE(review): the file comment describes ReUse() as user-callable, but it is private here
00152   char *strOfDelimit;  // delimiter characters; NOTE(review): presumably the copy made by the constructor - confirm
00153   tok_node start_node;  // first node of the chain, embedded in the object
00154   int numValidTokens;  // NOTE(review): presumably the count reported by getNumber() - confirm
00155   int maxTokens;  // token cap read and written by getMaxTokens() / setMaxTokens()
00156   int options;  // NOTE(review): presumably the opt mask given to Initialize() - confirm
00157
00158   // State about where to add the next token
00159   tok_node *add_node;
00160   int add_index;
00161 };
00162 
00163 #endif

Generated by  doxygen 1.7.1