Tokenizer.cc

/** @file

  A string tokenizer.

  @section license License

  Licensed to the Apache Software Foundation (ASF) under one
  or more contributor license agreements.  See the NOTICE file
  distributed with this work for additional information
  regarding copyright ownership.  The ASF licenses this file
  to you under the Apache License, Version 2.0 (the
  "License"); you may not use this file except in compliance
  with the License.  You may obtain a copy of the License at

      http://www.apache.org/licenses/LICENSE-2.0

  Unless required by applicable law or agreed to in writing, software
  distributed under the License is distributed on an "AS IS" BASIS,
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License.
 */

/***************************************/
#include "ink_platform.h"
#include "Tokenizer.h"
#include "ink_assert.h"
#include "ink_memory.h"

/****************************************************************************
 *
 *  Tokenizer.cc - A string tokenizer
 *
 ****************************************************************************/        /* MAGIC_EDITING_TAG */
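
/*
  Usage sketch (illustrative only; the interface is declared in Tokenizer.h):

    Tokenizer whitespace(" \t");
    int count = whitespace.Initialize("foo   bar baz");  // defaults to COPY_TOKS
    // count == 3; whitespace[0] is "foo", [1] is "bar", [2] is "baz"
*/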

Tokenizer::Tokenizer(const char *StrOfDelimiters)
{
  int length;

  if (StrOfDelimiters == NULL) {
    strOfDelimit = NULL;
  } else {
    // Copy the delimiter set so the caller's string need not outlive us
    length = (int) (strlen(StrOfDelimiters) + 1);
    strOfDelimit = new char[length];
    memcpy(strOfDelimit, StrOfDelimiters, length);
  }

  memset(&start_node, 0, sizeof(tok_node));

  numValidTokens = 0;
  maxTokens = -1;
  options = 0;

  add_node = &start_node;
  add_index = 0;
}

Tokenizer::~Tokenizer()
{
  bool root = true;
  tok_node *cur = &start_node;
  tok_node *next = NULL;

  if (strOfDelimit != NULL) {
    delete[] strOfDelimit;
  }

  // Free the tokens (when we own copies) and the chained nodes;
  //   the first node is embedded in the object, so it is skipped
  while (cur != NULL) {

    if (options & COPY_TOKS) {
      for (int i = 0; i < TOK_NODE_ELEMENTS; i++)
        ats_free(cur->el[i]);
    }

    next = cur->next;
    if (root == false) {
      ats_free(cur);
    } else {
      root = false;
    }
    cur = next;
  }
}

int
Tokenizer::Initialize(const char *str)
{
  return Initialize((char *) str, COPY_TOKS);
}

inline int
Tokenizer::isDelimiter(char c)
{
  int i = 0;

  while (strOfDelimit[i] != '\0') {
    if (c == strOfDelimit[i]) {
      return 1;
    }
    i++;
  }

  return 0;
}

int
Tokenizer::Initialize(char *str, int opt)
{
  char *strStart;
  int priorCharWasDelimit = 1;
  char *tokStart = NULL;
  int tok_count = 0;
  bool max_limit_hit = false;

  // We can't depend on ReUse() being called, so just do it
  //   if the object needs it
  if (numValidTokens > 0) {
    ReUse();
  }

  strStart = str;

  if (!(opt & (COPY_TOKS | SHARE_TOKS))) {
    opt = opt | COPY_TOKS;
  }
  options = opt;

  // Make sure that both options are not set
  ink_assert(!((opt & COPY_TOKS) && (opt & SHARE_TOKS)));

  str = strStart;
  priorCharWasDelimit = 1;

  tok_count = 0;
  tokStart = str;

  while (*str != '\0') {

    // Check to see if we've run into the maxTokens limit
    if (tok_count + 1 == maxTokens) {
      max_limit_hit = true;
      break;
    }
    // There are two modes for collecting tokens
    //
    //  Mode 1: Every delimiter creates a token,
    //          even if some of those tokens
    //          are zero length
    //
    //  Mode 2: Every token has some data
    //          in it, which means we need
    //          to skip past repeated delimiters
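    //
    //  For example (illustrative), with delimiter ',' on "a,,b":
    //    ALLOW_EMPTY_TOKS yields "a", "", "b" (three tokens);
    //    the default mode yields "a", "b" (two tokens)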
    if (options & ALLOW_EMPTY_TOKS) {
      if (isDelimiter(*str)) {
        addToken(tokStart, (int) (str - tokStart));
        tok_count++;
        tokStart = str + 1;
        priorCharWasDelimit = 1;
      } else {
        priorCharWasDelimit = 0;
      }
      str++;
    } else {
      if (isDelimiter(*str)) {
        if (priorCharWasDelimit == 0) {
          // This is a word end, so add it
          addToken(tokStart, (int) (str - tokStart));
          tok_count++;
        }
        priorCharWasDelimit = 1;
      } else {
        if (priorCharWasDelimit == 1) {
          // This is the start of a new token
          tokStart = str;
        }
        priorCharWasDelimit = 0;
      }
      str++;
    }
  }

  // Check to see if we stopped due to the maxTokens limit
  if (max_limit_hit == true) {

    if (options & ALLOW_EMPTY_TOKS) {

      // Go until either we hit a delimiter or we've
      //   come to the end of the string, then
      //   set for copying
      for (; *str != '\0' && !isDelimiter(*str); str++);
      priorCharWasDelimit = 0;

    } else {

      // First, skip the delimiters
      for (; *str != '\0' && isDelimiter(*str); str++);

      // If there are only delimiters remaining, bail and set
      //   so that we do not add an empty token
      if (*str == '\0') {
        priorCharWasDelimit = 1;
      } else {
        // There is stuff to copy for the last token
        tokStart = str;
        priorCharWasDelimit = 0;

        // Advance until the end of the string
        for (; *str != '\0'; str++);

        // Now back off all trailing delimiters
        for (; isDelimiter(*(str - 1)); str--);
      }
    }
  }
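  // Illustrative: if maxTokens had been set to 2 (through the interface in
  //   Tokenizer.h), then "a b c d" split on ' ' produces two tokens,
  //   "a" and "b c d"; the final token absorbs the rest of the string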
  // Check to see if we got the last token.  We will
  //   only have gotten it if the string ended with a delimiter
  if (priorCharWasDelimit == 0) {
    // We did not get it
    addToken(tokStart, (int) (str - tokStart));
    tok_count++;
  }

  numValidTokens = tok_count;
  return tok_count;
}
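
/*
  Illustrative note: SHARE_TOKS terminates tokens in place, so the caller's
  buffer must stay alive and writable for as long as the tokens are used:

    char buf[] = "x:y:z";
    Tokenizer colon(":");
    colon.Initialize(buf, SHARE_TOKS);   // buf becomes "x\0y\0z"; tokens point into buf
*/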

void
Tokenizer::addToken(char *startAddr, int length)
{
  char *add_ptr;
  if (options & SHARE_TOKS) {
    startAddr[length] = '\0';
    add_ptr = startAddr;
  } else {
    add_ptr = (char *)ats_malloc(length + 1);
    memcpy(add_ptr, startAddr, length);
    add_ptr[length] = '\0';
  }

  add_node->el[add_index] = add_ptr;

  add_index++;

  // Check to see if we are out of elements after
  //   adding this one.  If we are, change add_node
  //   to point to the next tok_node, creating one
  //   if there is not a next one
  if (add_index >= TOK_NODE_ELEMENTS) {
    if (add_node->next == NULL) {
      add_node->next = (tok_node *)ats_malloc(sizeof(tok_node));
      memset(add_node->next, 0, sizeof(tok_node));
    }
    add_node = add_node->next;
    add_index = 0;
  }
}

const char *
Tokenizer::operator[] (int index)
{
  tok_node *cur_node = &start_node;
  int cur_start = 0;

  if (index >= numValidTokens) {
    return NULL;
  } else {
    while (cur_start + TOK_NODE_ELEMENTS <= index) {
      cur_node = cur_node->next;
      ink_assert(cur_node != NULL);
      cur_start += TOK_NODE_ELEMENTS;
    }
    return cur_node->el[index % TOK_NODE_ELEMENTS];
  }
}
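
/*
  Illustrative: lookup walks the node chain in TOK_NODE_ELEMENTS-sized steps.
  If TOK_NODE_ELEMENTS were 16 (the real value lives in Tokenizer.h), tok[20]
  would be found in the second node at slot 20 % 16 == 4.
*/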

int
Tokenizer::getNumber()
{
  return numValidTokens;
}

const char *
Tokenizer::iterFirst(tok_iter_state * state)
{
  state->node = &start_node;
  state->index = -1;
  return iterNext(state);
}

const char *
Tokenizer::iterNext(tok_iter_state * state)
{
  tok_node *node = state->node;
  int index = state->index;

  index++;
  if (index >= TOK_NODE_ELEMENTS) {
    node = node->next;
    if (node == NULL) {
      return NULL;
    } else {
      index = 0;
    }
  }

  if (node->el[index] != NULL) {
    state->node = node;
    state->index = index;
    return node->el[index];
  } else {
    return NULL;
  }
}
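
/*
  Iteration sketch (illustrative):

    Tokenizer lines("\n");
    lines.Initialize("one\ntwo\nthree");
    tok_iter_state state;
    for (const char *tok = lines.iterFirst(&state); tok != NULL;
         tok = lines.iterNext(&state)) {
      printf("%s\n", tok);   // prints one, two, three
    }
*/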

void
Tokenizer::Print()
{
  tok_node *cur_node = &start_node;
  int node_index = 0;
  int count = 0;

  while (cur_node != NULL) {

    if (cur_node->el[node_index] != NULL) {
      printf("Token %d : |%s|\n", count, cur_node->el[node_index]);
      count++;
    } else {
      return;
    }

    node_index++;
    if (node_index >= TOK_NODE_ELEMENTS) {
      cur_node = cur_node->next;
      node_index = 0;
    }
  }
}

void
Tokenizer::ReUse()
{
  tok_node *cur_node = &start_node;

  while (cur_node != NULL) {
    if (options & COPY_TOKS) {
      for (int i = 0; i < TOK_NODE_ELEMENTS; i++)
        ats_free(cur_node->el[i]);
    }
    memset(cur_node->el, 0, sizeof(char *) * TOK_NODE_ELEMENTS);
    cur_node = cur_node->next;
  }

  numValidTokens = 0;
  add_node = &start_node;
  add_index = 0;
}
