gecko/xpcom/ds/nsCharSeparatedTokenizer.h

/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef __nsCharSeparatedTokenizer_h
#define __nsCharSeparatedTokenizer_h

#include "mozilla/RangedPtr.h"

#include "nsDependentSubstring.h"
#include "nsCRT.h"

/**
 * This parses a SeparatorChar-separated string into tokens.
 * Whitespace surrounding tokens is not treated as part of tokens, however
 * whitespace inside a token is. If the final token is the empty string, it is
 * not returned.
 *
 * Some examples, with SeparatorChar = ',':
 *
 * "foo, bar, baz" ->      "foo" "bar" "baz"
 * "foo,bar,baz" ->        "foo" "bar" "baz"
 * "foo , bar hi , baz" -> "foo" "bar hi" "baz"
 * "foo, ,bar,baz" ->      "foo" "" "bar" "baz"
 * "foo,,bar,baz" ->       "foo" "" "bar" "baz"
 * "foo,bar,baz," ->       "foo" "bar" "baz"
 *
 * The function used for whitespace detection is a template argument.
 * By default, it is NS_IsAsciiWhitespace.
 */
template<typename DependentSubstringType, bool IsWhitespace(char16_t)>
class nsTCharSeparatedTokenizer
{
  typedef typename DependentSubstringType::char_type CharType;
  typedef typename DependentSubstringType::substring_type SubstringType;

public:
  // Flags -- only one for now. If we need more, they should be defined to
  // be 1 << 1, 1 << 2, etc. (They're masks, and aFlags is a bitfield.)
  enum
  {
    SEPARATOR_OPTIONAL = 1
  };

  nsTCharSeparatedTokenizer(const SubstringType& aSource,
                            CharType aSeparatorChar,
                            uint32_t aFlags = 0)
    : mIter(aSource.Data(), aSource.Length())
    , mEnd(aSource.Data() + aSource.Length(), aSource.Data(),
           aSource.Length())
    , mSeparatorChar(aSeparatorChar)
    , mWhitespaceBeforeFirstToken(false)
    , mWhitespaceAfterCurrentToken(false)
    , mSeparatorAfterCurrentToken(false)
    , mSeparatorOptional(aFlags & SEPARATOR_OPTIONAL)
  {
    // Skip initial whitespace
    while (mIter < mEnd && IsWhitespace(*mIter)) {
      mWhitespaceBeforeFirstToken = true;
      ++mIter;
    }
  }

  /**
   * Checks if any more tokens are available.
   */
  bool hasMoreTokens() const
  {
    MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter),
               "Should be at beginning of token if there is one");

    return mIter < mEnd;
  }

  /*
   * Returns true if there is whitespace prior to the first token.
   */
  bool whitespaceBeforeFirstToken() const
  {
    return mWhitespaceBeforeFirstToken;
  }

  /*
   * Returns true if there is a separator after the current token.
   * Useful if you want to check whether the last token has a separator
   * after it which may not be valid.
   */
  bool separatorAfterCurrentToken() const
  {
    return mSeparatorAfterCurrentToken;
  }

  /*
   * Returns true if there is any whitespace after the current token.
   */
  bool whitespaceAfterCurrentToken() const
  {
    return mWhitespaceAfterCurrentToken;
  }

  /**
   * Returns the next token.
   */
  const DependentSubstringType nextToken()
  {
    mozilla::RangedPtr<const CharType> tokenStart = mIter;
    mozilla::RangedPtr<const CharType> tokenEnd = mIter;

    MOZ_ASSERT(mIter == mEnd || !IsWhitespace(*mIter),
               "Should be at beginning of token if there is one");

    // Search until we hit separator or end (or whitespace, if a separator
    // isn't required -- see clause with 'break' below).
    while (mIter < mEnd && *mIter != mSeparatorChar) {
      // Skip to end of the current word.
      while (mIter < mEnd &&
             !IsWhitespace(*mIter) && *mIter != mSeparatorChar) {
        ++mIter;
      }
      tokenEnd = mIter;

      // Skip whitespace after the current word.
      mWhitespaceAfterCurrentToken = false;
      while (mIter < mEnd && IsWhitespace(*mIter)) {
        mWhitespaceAfterCurrentToken = true;
        ++mIter;
      }
      if (mSeparatorOptional) {
        // We've hit (and skipped) whitespace, and that's sufficient to end
        // our token, regardless of whether we've reached a SeparatorChar.
        break;
      } // (else, we'll keep looping until we hit mEnd or SeparatorChar)
    }

    mSeparatorAfterCurrentToken = (mIter != mEnd &&
                                   *mIter == mSeparatorChar);
    MOZ_ASSERT(mSeparatorOptional ||
               (mSeparatorAfterCurrentToken == (mIter < mEnd)),
               "If we require a separator and haven't hit the end of "
               "our string, then we shouldn't have left the loop "
               "unless we hit a separator");

    // Skip separator (and any whitespace after it), if we're at one.
    if (mSeparatorAfterCurrentToken) {
      ++mIter;

      while (mIter < mEnd && IsWhitespace(*mIter)) {
        mWhitespaceAfterCurrentToken = true;
        ++mIter;
      }
    }

    return Substring(tokenStart.get(), tokenEnd.get());
  }

private:
  mozilla::RangedPtr<const CharType> mIter;
  const mozilla::RangedPtr<const CharType> mEnd;
  CharType mSeparatorChar;
  bool mWhitespaceBeforeFirstToken;
  bool mWhitespaceAfterCurrentToken;
  bool mSeparatorAfterCurrentToken;
  bool mSeparatorOptional;
};

template<bool IsWhitespace(char16_t) = NS_IsAsciiWhitespace>
class nsCharSeparatedTokenizerTemplate
  : public nsTCharSeparatedTokenizer<nsDependentSubstring, IsWhitespace>
{
public:
  nsCharSeparatedTokenizerTemplate(const nsSubstring& aSource,
                                   char16_t aSeparatorChar,
                                   uint32_t aFlags = 0)
    : nsTCharSeparatedTokenizer<nsDependentSubstring,
                                IsWhitespace>(aSource, aSeparatorChar, aFlags)
  {
  }
};

typedef nsCharSeparatedTokenizerTemplate<> nsCharSeparatedTokenizer;

template<bool IsWhitespace(char16_t) = NS_IsAsciiWhitespace>
class nsCCharSeparatedTokenizerTemplate
  : public nsTCharSeparatedTokenizer<nsDependentCSubstring, IsWhitespace>
{
public:
  nsCCharSeparatedTokenizerTemplate(const nsCSubstring& aSource,
                                    char aSeparatorChar,
                                    uint32_t aFlags = 0)
    : nsTCharSeparatedTokenizer<nsDependentCSubstring,
                                IsWhitespace>(aSource, aSeparatorChar, aFlags)
  {
  }
};

typedef nsCCharSeparatedTokenizerTemplate<> nsCCharSeparatedTokenizer;

#endif /* __nsCharSeparatedTokenizer_h */