gecko/parser/htmlparser/src/nsScannerString.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is Mozilla.
 *
 * The Initial Developer of the Original Code is IBM Corporation.
 * Portions created by IBM Corporation are Copyright (C) 2003
 * IBM Corporation. All Rights Reserved.
 *
 * Contributor(s):
 *   Darin Fisher <darin@meer.net>
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

#include <stdlib.h>
#include "nsScannerString.h"


  /**
   * nsScannerBufferList
   */

nsScannerBufferList::Buffer*
nsScannerBufferList::AllocBufferFromString( const nsAString& aString )
  {
    PRUint32 len = aString.Length();

    Buffer* buf = (Buffer*) malloc(sizeof(Buffer) + (len + 1) * sizeof(PRUnichar));
    if (buf)
      {
        // leave PRCList members of Buffer uninitialized

        buf->mUsageCount = 0;
        buf->mDataEnd = buf->DataStart() + len;

        nsAString::const_iterator source;
        aString.BeginReading(source);
        nsCharTraits<PRUnichar>::copy(buf->DataStart(), source.get(), len);

        // XXX null terminate.  this shouldn't be required, but we do it because
        // nsScanner erroneously thinks it can dereference DataEnd :-(
        *buf->mDataEnd = PRUnichar(0);
      }
    return buf;
  }

nsScannerBufferList::Buffer*
nsScannerBufferList::AllocBuffer( PRUint32 capacity )
  {
    Buffer* buf = (Buffer*) malloc(sizeof(Buffer) + (capacity + 1) * sizeof(PRUnichar));
    if (buf)
      {
        // leave PRCList members of Buffer uninitialized

        buf->mUsageCount = 0;
        buf->mDataEnd = buf->DataStart() + capacity;

        // XXX null terminate.  this shouldn't be required, but we do it because
        // nsScanner erroneously thinks it can dereference DataEnd :-(
        *buf->mDataEnd = PRUnichar(0);
      }
    return buf;
  }

void
nsScannerBufferList::ReleaseAll()
  {
    while (!PR_CLIST_IS_EMPTY(&mBuffers))
      {
        PRCList* node = PR_LIST_HEAD(&mBuffers);
        PR_REMOVE_LINK(node);
        //printf(">>> freeing buffer @%p\n", node);
        free(static_cast<Buffer*>(node));
      }
  }

void
nsScannerBufferList::SplitBuffer( const Position& pos )
  {
    // splitting to the right keeps the work string and any extant token
    // pointing to and holding a reference count on the same buffer.

    Buffer* bufferToSplit = pos.mBuffer;
    NS_ASSERTION(bufferToSplit, "null pointer");

    PRUint32 splitOffset = pos.mPosition - bufferToSplit->DataStart();
    NS_ASSERTION(pos.mPosition >= bufferToSplit->DataStart() &&
                 splitOffset <= bufferToSplit->DataLength(),
                 "split offset is outside buffer");

    PRUint32 len = bufferToSplit->DataLength() - splitOffset;
    Buffer* new_buffer = AllocBuffer(len);
    if (new_buffer)
      {
        nsCharTraits<PRUnichar>::copy(new_buffer->DataStart(),
                                      bufferToSplit->DataStart() + splitOffset,
                                      len);
        InsertAfter(new_buffer, bufferToSplit);
        bufferToSplit->SetDataLength(splitOffset);
      }
  }

void
nsScannerBufferList::DiscardUnreferencedPrefix( Buffer* aBuf )
  {
    if (aBuf == Head())
      {
        while (!PR_CLIST_IS_EMPTY(&mBuffers) && !Head()->IsInUse())
          {
            Buffer* buffer = Head();
            PR_REMOVE_LINK(buffer);
            free(buffer);
          }
      }
  }

size_t
nsScannerBufferList::Position::Distance( const Position& aStart, const Position& aEnd )
  {
    size_t result = 0;
    if (aStart.mBuffer == aEnd.mBuffer)
      {
        result = aEnd.mPosition - aStart.mPosition;
      }
    else
      {
        result = aStart.mBuffer->DataEnd() - aStart.mPosition;
        for (Buffer* b = aStart.mBuffer->Next(); b != aEnd.mBuffer; b = b->Next())
          result += b->DataLength();
        result += aEnd.mPosition - aEnd.mBuffer->DataStart();
      }
    return result;
  }


/**
 * nsScannerSubstring
 */

nsScannerSubstring::nsScannerSubstring()
  : mStart(nsnull, nsnull)
  , mEnd(nsnull, nsnull)
  , mBufferList(nsnull)
  , mLength(0)
  , mIsDirty(PR_TRUE)
  {
  }

nsScannerSubstring::nsScannerSubstring( const nsAString& s )
  : mBufferList(nsnull)
  , mIsDirty(PR_TRUE)
  {
    Rebind(s);
  }

nsScannerSubstring::~nsScannerSubstring()
  {
    release_ownership_of_buffer_list();
  }

PRInt32
nsScannerSubstring::CountChar( PRUnichar c ) const
  {
      /*
        re-write this to use a counting sink
       */

    size_type result = 0;
    size_type lengthToExamine = Length();

    nsScannerIterator iter;
    for ( BeginReading(iter); ; )
      {
        PRInt32 lengthToExamineInThisFragment = iter.size_forward();
        const PRUnichar* fromBegin = iter.get();
        result += size_type(NS_COUNT(fromBegin, fromBegin+lengthToExamineInThisFragment, c));
        if ( !(lengthToExamine -= lengthToExamineInThisFragment) )
          return result;
        iter.advance(lengthToExamineInThisFragment);
      }
      // never reached; quiets warnings
    return 0;
  }

void
nsScannerSubstring::Rebind( const nsScannerSubstring& aString,
                            const nsScannerIterator& aStart,
                            const nsScannerIterator& aEnd )
  {
    // allow for the case where &aString == this

    aString.acquire_ownership_of_buffer_list();
    release_ownership_of_buffer_list();

    mStart      = aStart;
    mEnd        = aEnd;
    mBufferList = aString.mBufferList;
    mLength     = Distance(aStart, aEnd);
    mIsDirty    = PR_TRUE;
  }

void
nsScannerSubstring::Rebind( const nsAString& aString )
  {
    release_ownership_of_buffer_list();

    mBufferList = new nsScannerBufferList(AllocBufferFromString(aString));
    mIsDirty    = PR_TRUE;

    init_range_from_buffer_list();
    acquire_ownership_of_buffer_list();
  }

const nsSubstring&
nsScannerSubstring::AsString() const
  {
    if (mIsDirty)
      {
        nsScannerSubstring* mutable_this = const_cast<nsScannerSubstring*>(this);

        if (mStart.mBuffer == mEnd.mBuffer) {
          // We only have a single fragment to deal with, so just return it
          // as a substring.
          mutable_this->mFlattenedRep.Rebind(mStart.mPosition, mEnd.mPosition);
        } else {
          // Otherwise, we need to copy the data into a flattened buffer.
          nsScannerIterator start, end;
          CopyUnicodeTo(BeginReading(start), EndReading(end), mutable_this->mFlattenedRep);
        }

        mutable_this->mIsDirty = PR_FALSE;
      }

    return mFlattenedRep;
  }

nsScannerIterator&
nsScannerSubstring::BeginReading( nsScannerIterator& iter ) const
  {
    iter.mOwner = this;

    iter.mFragment.mBuffer = mStart.mBuffer;
    iter.mFragment.mFragmentStart = mStart.mPosition;
    if (mStart.mBuffer == mEnd.mBuffer)
      iter.mFragment.mFragmentEnd = mEnd.mPosition;
    else
      iter.mFragment.mFragmentEnd = mStart.mBuffer->DataEnd();

    iter.mPosition = mStart.mPosition;
    iter.normalize_forward();
    return iter;
  }

nsScannerIterator&
nsScannerSubstring::EndReading( nsScannerIterator& iter ) const
  {
    iter.mOwner = this;

    iter.mFragment.mBuffer = mEnd.mBuffer;
    iter.mFragment.mFragmentEnd = mEnd.mPosition;
    if (mStart.mBuffer == mEnd.mBuffer)
      iter.mFragment.mFragmentStart = mStart.mPosition;
    else
      iter.mFragment.mFragmentStart = mEnd.mBuffer->DataStart();

    iter.mPosition = mEnd.mPosition;
    // must not |normalize_backward| as that would likely invalidate tests like |while ( first != last )|
    return iter;
  }

PRBool
nsScannerSubstring::GetNextFragment( nsScannerFragment& frag ) const
  {
    // check to see if we are at the end of the buffer list
    if (frag.mBuffer == mEnd.mBuffer)
      return PR_FALSE;

    frag.mBuffer = static_cast<const Buffer*>(PR_NEXT_LINK(frag.mBuffer));

    if (frag.mBuffer == mStart.mBuffer)
      frag.mFragmentStart = mStart.mPosition;
    else
      frag.mFragmentStart = frag.mBuffer->DataStart();

    if (frag.mBuffer == mEnd.mBuffer)
      frag.mFragmentEnd = mEnd.mPosition;
    else
      frag.mFragmentEnd = frag.mBuffer->DataEnd();

    return PR_TRUE;
  }

PRBool
nsScannerSubstring::GetPrevFragment( nsScannerFragment& frag ) const
  {
    // check to see if we are at the beginning of the buffer list
    if (frag.mBuffer == mStart.mBuffer)
      return PR_FALSE;

    frag.mBuffer = static_cast<const Buffer*>(PR_PREV_LINK(frag.mBuffer));

    if (frag.mBuffer == mStart.mBuffer)
      frag.mFragmentStart = mStart.mPosition;
    else
      frag.mFragmentStart = frag.mBuffer->DataStart();

    if (frag.mBuffer == mEnd.mBuffer)
      frag.mFragmentEnd = mEnd.mPosition;
    else
      frag.mFragmentEnd = frag.mBuffer->DataEnd();

    return PR_TRUE;
  }


  /**
   * nsScannerString
   */

nsScannerString::nsScannerString( Buffer* aBuf )
  {
    mBufferList = new nsScannerBufferList(aBuf);

    init_range_from_buffer_list();
    acquire_ownership_of_buffer_list();
  }

void
nsScannerString::AppendBuffer( Buffer* aBuf )
  {
    mBufferList->Append(aBuf);
    mLength += aBuf->DataLength();

    mEnd.mBuffer = aBuf;
    mEnd.mPosition = aBuf->DataEnd();

    mIsDirty = PR_TRUE;
  }

void
nsScannerString::DiscardPrefix( const nsScannerIterator& aIter )
  {
    Position old_start(mStart);
    mStart = aIter;
    mLength -= Position::Distance(old_start, mStart);

    mStart.mBuffer->IncrementUsageCount();
    old_start.mBuffer->DecrementUsageCount();

    mBufferList->DiscardUnreferencedPrefix(old_start.mBuffer);

    mIsDirty = PR_TRUE;
  }

void
nsScannerString::UngetReadable( const nsAString& aReadable, const nsScannerIterator& aInsertPoint )
    /*
     * Warning: this routine manipulates the shared buffer list in an unexpected way.
     *  The original design did not really allow for insertions, but this call promises
     *  that if called for a point after the end of all extant token strings, that no token string
     *  or the work string will be invalidated.
     *
     *  This routine is protected because it is the responsibility of the derived class to keep those promises.
     */
  {
    Position insertPos(aInsertPoint);

    mBufferList->SplitBuffer(insertPos);
      // splitting to the right keeps the work string and any extant token pointing to and
      //  holding a reference count on the same buffer

    Buffer* new_buffer = AllocBufferFromString(aReadable);
      // make a new buffer with all the data to insert...
      //  BULLSHIT ALERT: we may have empty space to re-use in the split buffer, measure the cost
      //  of this and decide if we should do the work to fill it

    Buffer* buffer_to_split = insertPos.mBuffer;
    mBufferList->InsertAfter(new_buffer, buffer_to_split);
    mLength += aReadable.Length();

    mEnd.mBuffer = mBufferList->Tail();
    mEnd.mPosition = mEnd.mBuffer->DataEnd();

    mIsDirty = PR_TRUE;
  }

void
nsScannerString::ReplaceCharacter(nsScannerIterator& aPosition, PRUnichar aChar)
  {
    // XXX Casting a const to non-const. Unless the base class
    // provides support for writing iterators, this is the best
    // that can be done.
    PRUnichar* pos = const_cast<PRUnichar*>(aPosition.get());
    *pos = aChar;

    mIsDirty = PR_TRUE;
  }


  /**
   * nsScannerSharedSubstring
   */

void
nsScannerSharedSubstring::Rebind(const nsScannerIterator &aStart,
                              const nsScannerIterator &aEnd)
{
  // If the start and end positions are inside the same buffer, we must
  // acquire ownership of the buffer.  If not, we can optimize by not holding
  // onto it.

  Buffer *buffer = const_cast<Buffer*>(aStart.buffer());
  PRBool sameBuffer = buffer == aEnd.buffer();

  nsScannerBufferList *bufferList;

  if (sameBuffer) {
    bufferList = aStart.mOwner->mBufferList;
    bufferList->AddRef();
    buffer->IncrementUsageCount();
  }

  if (mBufferList)
    ReleaseBuffer();

  if (sameBuffer) {
    mBuffer = buffer;
    mBufferList = bufferList;
    mString.Rebind(aStart.mPosition, aEnd.mPosition);
  } else {
    mBuffer = nsnull;
    mBufferList = nsnull;
    CopyUnicodeTo(aStart, aEnd, mString);
  }
}

void
nsScannerSharedSubstring::ReleaseBuffer()
{
  NS_ASSERTION(mBufferList, "Should only be called with non-null mBufferList");
  mBuffer->DecrementUsageCount();
  mBufferList->DiscardUnreferencedPrefix(mBuffer);
  mBufferList->Release();
}

void
nsScannerSharedSubstring::MakeMutable()
{
  nsString temp(mString); // this will force a copy of the data
  mString.Assign(temp);   // mString will now share the just-allocated buffer

  ReleaseBuffer();

  mBuffer = nsnull;
  mBufferList = nsnull;
}

  /**
   * utils -- based on code from nsReadableUtils.cpp
   */

// private helper function
static inline
nsAString::iterator&
copy_multifragment_string( nsScannerIterator& first, const nsScannerIterator& last, nsAString::iterator& result )
  {
    typedef nsCharSourceTraits<nsScannerIterator> source_traits;
    typedef nsCharSinkTraits<nsAString::iterator> sink_traits;

    while ( first != last )
      {
        PRUint32 distance = source_traits::readable_distance(first, last);
        sink_traits::write(result, source_traits::read(first), distance);
        NS_ASSERTION(distance > 0, "|copy_multifragment_string| will never terminate");
        source_traits::advance(first, distance);
      }

    return result;
  }

void
CopyUnicodeTo( const nsScannerIterator& aSrcStart,
               const nsScannerIterator& aSrcEnd,
               nsAString& aDest )
  {
    nsAString::iterator writer;
    if (!EnsureStringLength(aDest, Distance(aSrcStart, aSrcEnd))) {
      aDest.Truncate();
      return; // out of memory
    }
    aDest.BeginWriting(writer);
    nsScannerIterator fromBegin(aSrcStart);

    copy_multifragment_string(fromBegin, aSrcEnd, writer);
  }

void
AppendUnicodeTo( const nsScannerIterator& aSrcStart,
                 const nsScannerIterator& aSrcEnd,
                 nsScannerSharedSubstring& aDest )
  {
    // Check whether we can just create a dependent string.
    if (aDest.str().IsEmpty()) {
      // We can just make |aDest| point to the buffer.
      // This will take care of copying if the buffer spans fragments.
      aDest.Rebind(aSrcStart, aSrcEnd);
    } else {
      // The dest string is not empty, so it can't be a dependent substring.
      AppendUnicodeTo(aSrcStart, aSrcEnd, aDest.writable());
    }
  }

void
AppendUnicodeTo( const nsScannerIterator& aSrcStart,
                 const nsScannerIterator& aSrcEnd,
                 nsAString& aDest )
  {
    nsAString::iterator writer;
    PRUint32 oldLength = aDest.Length();
    if (!EnsureStringLength(aDest, oldLength + Distance(aSrcStart, aSrcEnd)))
      return; // out of memory
    aDest.BeginWriting(writer).advance(oldLength);
    nsScannerIterator fromBegin(aSrcStart);

    copy_multifragment_string(fromBegin, aSrcEnd, writer);
  }

PRBool
FindCharInReadable( PRUnichar aChar,
                    nsScannerIterator& aSearchStart,
                    const nsScannerIterator& aSearchEnd )
  {
    while ( aSearchStart != aSearchEnd )
      {
        PRInt32 fragmentLength;
        if ( SameFragment(aSearchStart, aSearchEnd) )
          fragmentLength = aSearchEnd.get() - aSearchStart.get();
        else
          fragmentLength = aSearchStart.size_forward();

        const PRUnichar* charFoundAt = nsCharTraits<PRUnichar>::find(aSearchStart.get(), fragmentLength, aChar);
        if ( charFoundAt ) {
          aSearchStart.advance( charFoundAt - aSearchStart.get() );
          return PR_TRUE;
        }

        aSearchStart.advance(fragmentLength);
      }

    return PR_FALSE;
  }

PRBool
FindInReadable( const nsAString& aPattern,
                nsScannerIterator& aSearchStart,
                nsScannerIterator& aSearchEnd,
                const nsStringComparator& compare )
  {
    PRBool found_it = PR_FALSE;

      // only bother searching at all if we're given a non-empty range to search
    if ( aSearchStart != aSearchEnd )
      {
        nsAString::const_iterator aPatternStart, aPatternEnd;
        aPattern.BeginReading(aPatternStart);
        aPattern.EndReading(aPatternEnd);

          // outer loop keeps searching till we find it or run out of string to search
        while ( !found_it )
          {
              // fast inner loop (that's what it's called, not what it is) looks for a potential match
            while ( aSearchStart != aSearchEnd &&
                    compare(*aPatternStart, *aSearchStart) )
              ++aSearchStart;

              // if we broke out of the `fast' loop because we're out of string ... we're done: no match
            if ( aSearchStart == aSearchEnd )
              break;

              // otherwise, we're at a potential match, let's see if we really hit one
            nsAString::const_iterator testPattern(aPatternStart);
            nsScannerIterator testSearch(aSearchStart);

              // slow inner loop verifies the potential match (found by the `fast' loop) at the current position
            for(;;)
              {
                  // we already compared the first character in the outer loop,
                  //  so we'll advance before the next comparison
                ++testPattern;
                ++testSearch;

                  // if we verified all the way to the end of the pattern, then we found it!
                if ( testPattern == aPatternEnd )
                  {
                    found_it = PR_TRUE;
                    aSearchEnd = testSearch; // return the exact found range through the parameters
                    break;
                  }

                  // if we got to end of the string we're searching before we hit the end of the
                  //  pattern, we'll never find what we're looking for
                if ( testSearch == aSearchEnd )
                  {
                    aSearchStart = aSearchEnd;
                    break;
                  }

                  // else if we mismatched ... it's time to advance to the next search position
                  //  and get back into the `fast' loop
                if ( compare(*testPattern, *testSearch) )
                  {
                    ++aSearchStart;
                    break;
                  }
              }
          }
      }

    return found_it;
  }

  /**
   * This implementation is simple, but does too much work.
   * It searches the entire string from left to right, and returns the last match found, if any.
   * This implementation will be replaced when I get |reverse_iterator|s working.
   */
PRBool
RFindInReadable( const nsAString& aPattern,
                 nsScannerIterator& aSearchStart,
                 nsScannerIterator& aSearchEnd,
                 const nsStringComparator& aComparator )
  {
    PRBool found_it = PR_FALSE;

    nsScannerIterator savedSearchEnd(aSearchEnd);
    nsScannerIterator searchStart(aSearchStart), searchEnd(aSearchEnd);

    while ( searchStart != searchEnd )
      {
        if ( FindInReadable(aPattern, searchStart, searchEnd, aComparator) )
          {
            found_it = PR_TRUE;

              // this is the best match so far, so remember it
            aSearchStart = searchStart;
            aSearchEnd = searchEnd;

              // ...and get ready to search some more
              //  (it's tempting to set |searchStart=searchEnd| ... but that misses overlapping patterns)
            ++searchStart;
            searchEnd = savedSearchEnd;
          }
      }

      // if we never found it, return an empty range
    if ( !found_it )
      aSearchStart = aSearchEnd;

    return found_it;
  }