Bug 918135 - Parse VBR headers in MP3FrameParser for calculating exact duration r=cpearce

This patch adds support for Xing and VBRI headers to MP3FrameParser so that we can calculate the exact duration of an MP3 file if these headers are present. It is also now more robust against large ID3 tags and discontinuities in input.
2024-09-13 09:24:08 -07:00 · 2013-12-03 10:25:27 +13:00 · 2013-12-03 10:25:27 +13:00 · 6510155080
commit 6510155080
parent 499b236543
4 changed files with 275 additions and 56 deletions
--- a/content/media/MP3FrameParser.cpp
+++ b/content/media/MP3FrameParser.cpp
@ -5,11 +5,16 @@
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

 #include <algorithm>
+
 #include "nsMemory.h"
 #include "MP3FrameParser.h"
 #include "VideoUtils.h"


+// Reads the four bytes starting at |X| as a big-endian uint32_t.
+// Each byte is widened to uint32_t before it is shifted, so a leading byte
+// >= 0x80 is never shifted into the sign bit of a promoted int.
+// |X| is evaluated several times; do not pass an expression with side effects.
+#define FROM_BIG_ENDIAN(X) ((uint32_t)(uint8_t)(X)[0] << 24 | \
+                            (uint32_t)(uint8_t)(X)[1] << 16 | \
+                            (uint32_t)(uint8_t)(X)[2] << 8 | \
+                            (uint32_t)(uint8_t)(X)[3])
+
+
 namespace mozilla {

 /*
@ -161,6 +166,13 @@ MP3Parser::GetSampleRate()
  return mpeg_srates[frame.mVersion][frame.mSampleRate];
 }

+// Returns the mpeg_frame_samples entry selected by the mVersion and mLayer
+// fields of the frame header currently held in mData.
+uint32_t
+MP3Parser::GetSamplesPerFrame()
+{
+  return mpeg_frame_samples[mData.mFrame.mVersion][mData.mFrame.mLayer];
+}
+

 /** ID3Parser methods **/

@ -222,6 +234,67 @@ ID3Parser::GetHeaderLength() const
 }


+/** VBR header helper stuff **/
+
+// Helper function to find a VBR header in an MP3 frame.
+// Based on information from
+// http://www.codeproject.com/Articles/8295/MPEG-Audio-Frame-Header
+
+const uint32_t VBRI_TAG = FROM_BIG_ENDIAN("VBRI"); // Tag compared at VBRI_OFFSET.
+const uint32_t VBRI_OFFSET = 32 - sizeof(MP3Frame); // Tag position in the buffered frame.
+const uint32_t VBRI_FRAME_COUNT_OFFSET = VBRI_OFFSET + 14; // Frame count field.
+const uint32_t VBRI_MIN_FRAME_SIZE = VBRI_OFFSET + 26; // Frame must be longer than this to be checked.
+
+const uint32_t XING_TAG = FROM_BIG_ENDIAN("Xing"); // Tag searched for byte by byte.
+enum XingFlags { // Bits of the 32-bit flags word read 4 bytes after the Xing tag.
+  XING_HAS_NUM_FRAMES = 0x01, // Checked by ParseXing() before reading the frame count.
+  XING_HAS_NUM_BYTES = 0x02,
+  XING_HAS_TOC = 0x04,
+  XING_HAS_VBR_SCALE = 0x08
+};
+
+// Number of bytes ParseXing() reads starting at the tag: the 4-byte "Xing"
+// tag, the 4-byte flags word and the 4-byte frame count.
+const uint32_t XING_MIN_SIZE = 12;
+
+// Reads the frame count out of a Xing header.
+// |aBuffer| must point at the "Xing" tag, and at least XING_MIN_SIZE bytes
+// must be readable from it; the caller is responsible for that check.
+// Returns the frame count, or -1 if the header's flags say it has none.
+static int64_t
+ParseXing(const char *aBuffer)
+{
+  uint32_t flags = FROM_BIG_ENDIAN(aBuffer + 4);
+
+  if (!(flags & XING_HAS_NUM_FRAMES)) {
+    NS_WARNING("VBR file without frame count. Duration estimation likely to "
+               "be totally wrong.");
+    return -1;
+  }
+
+  // The frame count is the word that follows the flags.
+  return FROM_BIG_ENDIAN(aBuffer + 8);
+}
+
+// Looks for a VBRI or Xing header in the buffered first frame |aFrame| and
+// returns the number of frames it advertises, or -1 if no usable header was
+// found.
+static int64_t
+FindNumVBRFrames(const nsAutoCString& aFrame)
+{
+  const char *buffer = aFrame.get();
+  const char *bufferEnd = aFrame.get() + aFrame.Length();
+
+  // VBRI header is nice and well-defined; let's try to find that first.
+  if (aFrame.Length() > VBRI_MIN_FRAME_SIZE &&
+      FROM_BIG_ENDIAN(buffer + VBRI_OFFSET) == VBRI_TAG) {
+    return FROM_BIG_ENDIAN(buffer + VBRI_FRAME_COUNT_OFFSET);
+  }
+
+  // We have to search for the Xing header as its position can change.
+  // Only positions with a complete tag + flags + frame count ahead of them
+  // are considered, so ParseXing() never reads past the end of the frame.
+  for (; (uint32_t)(bufferEnd - buffer) >= XING_MIN_SIZE; buffer++) {
+    if (FROM_BIG_ENDIAN(buffer) == XING_TAG) {
+      return ParseXing(buffer);
+    }
+  }
+
+  return -1;
+}
+
+
 /** MP3FrameParser methods **/

 // Some MP3's have large ID3v2 tags, up to 150KB, so we allow lots of
@ -241,12 +314,14 @@ enum {

 MP3FrameParser::MP3FrameParser(int64_t aLength)
+  // |aLength| is stored in mLength, the total length of the stream in bytes.
 : mLock("MP3FrameParser.mLock"),
+  mTotalID3Size(0),
  mTotalFrameSize(0),
-  mNumFrames(0),
+  mFrameCount(0),
  mOffset(0),
  mLength(aLength),
  mMP3Offset(-1),
+  // -1 means "frame count unknown". GetDuration() and HasExactDuration()
+  // read this before any VBR header has been parsed, so it must start out
+  // initialised.
+  mNumFrames(-1),
-  mSampleRate(0),
+  mSamplesPerSecond(0),
+  // Assigned when the first frame header is found; zeroed so it never holds
+  // an indeterminate value.
+  mSamplesPerFrame(0),
+  mFirstFrameEnd(-1),
  mIsMP3(MAYBE_MP3)
 { }

@ -257,7 +332,6 @@ nsresult MP3FrameParser::ParseBuffer(const uint8_t* aBuffer,
 {
  // Iterate forwards over the buffer, looking for ID3 tag, or MP3
  // Frame headers.
-
  const uint8_t *buffer = aBuffer;
  const uint8_t *bufferEnd = aBuffer + aLength;

@ -271,6 +345,8 @@ nsresult MP3FrameParser::ParseBuffer(const uint8_t* aBuffer,
        buffer = ch + mID3Parser.GetHeaderLength() - (ID3_HEADER_LENGTH - 1);
        ch = buffer;

+        mTotalID3Size += mID3Parser.GetHeaderLength();
+
        // Yes, this is an MP3!
        mIsMP3 = DEFINITELY_MP3;

@ -279,43 +355,95 @@ nsresult MP3FrameParser::ParseBuffer(const uint8_t* aBuffer,
    }
  }

+  // The first MP3 frame in a variable bitrate stream can contain metadata
+  // for duration estimation and seeking, so we buffer that first frame here.
+  if (aStreamOffset < mFirstFrameEnd) {
+    uint64_t copyLen = std::min((int64_t)aLength, mFirstFrameEnd - aStreamOffset);
+    mFirstFrame.Append((const char *)buffer, copyLen);
+    buffer += copyLen;
+  }
+
  while (buffer < bufferEnd) {
    uint16_t frameLen = mMP3Parser.ParseFrameLength(*buffer);

    if (frameLen) {
+      // We've found an MP3 frame!
+      // This is the first frame (and the only one we'll bother parsing), so:
+      // * Mark this stream as MP3;
+      // * Store the offset at which the MP3 data started; and
+      // * Start buffering the frame, as it might contain handy metadata.

-      if (mMP3Offset < 0) {
-        // Found our first frame: mark this stream as MP3 and let the decoder
-        // know where in the stream the MP3 data starts.
-        mIsMP3 = DEFINITELY_MP3;
-        // We're at the last byte of an MP3Frame, so MP3 data started
-        // sizeof - 1 bytes ago.
-        mMP3Offset = aStreamOffset
-          + (buffer - aBuffer)
-          - (sizeof(MP3Frame) - 1);
+      // We're now sure this is an MP3 stream.
+      mIsMP3 = DEFINITELY_MP3;
+
+      // We need to know these to convert the number of frames in the stream
+      // to the length of the stream in seconds.
+      mSamplesPerSecond = mMP3Parser.GetSampleRate();
+      mSamplesPerFrame = mMP3Parser.GetSamplesPerFrame();
+
+      // If the stream has a constant bitrate, we should only need the length
+      // of the first frame and the length (in bytes) of the stream to
+      // estimate the length (in seconds).
+      mTotalFrameSize += frameLen;
+      mFrameCount++;
+
+      // If |mMP3Offset| isn't set then this is the first MP3 frame we have
+      // seen in the stream, which is useful for duration estimation.
+      if (mMP3Offset > -1) {
+        uint16_t skip = frameLen - sizeof(MP3Frame);
+        buffer += skip ? skip : 1;
+        continue;
      }

-      mSampleRate = mMP3Parser.GetSampleRate();
-      mTotalFrameSize += frameLen;
-      mNumFrames++;
+      // Remember the offset of the MP3 stream.
+      // We're at the last byte of an MP3Frame, so MP3 data started
+      // sizeof(MP3Frame) - 1 bytes ago.
+      mMP3Offset = aStreamOffset
+        + (buffer - aBuffer)
+        - (sizeof(MP3Frame) - 1);
+
+      buffer++;
+
+      // If the stream has a variable bitrate, the first frame has metadata
+      // we need for duration estimation and seeking. Start buffering it so we
+      // can parse it later.
+      mFirstFrameEnd = mMP3Offset + frameLen;
+      uint64_t currOffset = buffer - aBuffer + aStreamOffset;
+      uint64_t copyLen = std::min(mFirstFrameEnd - currOffset,
+                                  (uint64_t)(bufferEnd - buffer));
+      mFirstFrame.Append((const char *)buffer, copyLen);
+
+      buffer += copyLen;

-      buffer += frameLen - sizeof(MP3Frame);
    } else {
+      // Nothing to see here. Move along.
      buffer++;
    }
  }

  *aOutBytesRead = buffer - aBuffer;
+
+  if (mFirstFrameEnd > -1 && mFirstFrameEnd <= aStreamOffset + buffer - aBuffer) {
+    // We have our whole first frame. Try to find a VBR header.
+    mNumFrames = FindNumVBRFrames(mFirstFrame);
+    mFirstFrameEnd = -1;
+  }
+
  return NS_OK;
 }

-void MP3FrameParser::Parse(const char* aBuffer, uint32_t aLength, int64_t aOffset)
+void MP3FrameParser::Parse(const char* aBuffer, uint32_t aLength, uint64_t aOffset)
 {
  MutexAutoLock mon(mLock);

+  if (HasExactDuration()) {
+    // We know the duration; nothing to do here.
+    return;
+  }
+
  const uint8_t* buffer = reinterpret_cast<const uint8_t*>(aBuffer);
  int32_t length = aLength;
-  int64_t offset = aOffset;
+  uint64_t offset = aOffset;

  // Got some data we have seen already. Skip forward to what we need.
  if (aOffset < mOffset) {
@ -335,6 +463,12 @@ void MP3FrameParser::Parse(const char* aBuffer, uint32_t aLength, int64_t aOffse
      // Only reset this if it hasn't finished yet.
      mID3Parser.Reset();
    }
+
+    if (mFirstFrameEnd > -1) {
+      NS_WARNING("Discontinuity in input while buffering first frame.");
+      mFirstFrameEnd = -1;
+    }
+
    mMP3Parser.Reset();
  }

@ -352,7 +486,10 @@ void MP3FrameParser::Parse(const char* aBuffer, uint32_t aLength, int64_t aOffse
  mOffset = offset + bytesRead;

  // If we've parsed lots of data and we still have nothing, just give up.
-  if (!mID3Parser.IsParsed() && !mNumFrames && mOffset > MAX_SKIPPED_BYTES) {
+  // We don't count ID3 headers towards that count, as MP3 files can have
+  // massive ID3 sections.
+  if (!mID3Parser.IsParsed() && mMP3Offset < 0 &&
+      mOffset - mTotalID3Size > MAX_SKIPPED_BYTES) {
    mIsMP3 = NOT_MP3;
  }
 }
@ -361,22 +498,25 @@ int64_t MP3FrameParser::GetDuration()
 {
+  // Returns the stream duration in microseconds, or -1 if it cannot be
+  // computed yet. The frame count comes from the VBR header when one was
+  // parsed (mNumFrames >= 0); otherwise it is estimated from the average size
+  // of the frames seen so far and the stream length.
  MutexAutoLock mon(mLock);

-  if (!mNumFrames || !mSampleRate) {
+  if (mMP3Offset < 0) {
    return -1; // Not a single frame decoded yet
  }

+  // The sample rate is a divisor below; refuse to divide by zero.
+  if (!mSamplesPerSecond) {
+    return -1;
+  }
+
-  // Estimate the total number of frames in the file from the average frame
-  // size we've seen so far, and the length of the file.
-  double avgFrameSize = (double)mTotalFrameSize / mNumFrames;
-
-  // Need to cut out the header here. Ignore everything up to the first MP3
-  // frames.
-  double estimatedFrames = (double)(mLength - mMP3Offset) / avgFrameSize;
+  double frames;
+  if (mNumFrames < 0) {
+    // Estimate the number of frames in the stream based on the average frame
+    // size and the length of the MP3 file.
+    double frameSize = (double)mTotalFrameSize / mFrameCount;
+    frames = (double)(mLength - mMP3Offset) / frameSize;
+  } else {
+    // We know the exact number of frames from the VBR header.
+    frames = mNumFrames;
+  }

  // The duration of each frame is constant over a given stream.
-  double usPerFrame = USECS_PER_S * SAMPLES_PER_FRAME / mSampleRate;
+  // Force floating-point division: both sample counts are 16-bit integers, so
+  // an integral USECS_PER_S would otherwise truncate the per-frame duration.
+  double usPerFrame =
+    (double)USECS_PER_S * mSamplesPerFrame / mSamplesPerSecond;

-  return estimatedFrames * usPerFrame;
+  return frames * usPerFrame;
 }

 int64_t MP3FrameParser::GetMP3Offset()
@ -385,4 +525,24 @@ int64_t MP3FrameParser::GetMP3Offset()
  return mMP3Offset;
 }

+bool MP3FrameParser::ParsedHeaders()
+{
+  // The first frame has been located once |mMP3Offset| is set, and we are no
+  // longer buffering it once |mFirstFrameEnd| has gone back to -1.
+  const bool foundFirstFrame = mMP3Offset >= 0;
+  const bool stillBuffering = mFirstFrameEnd >= 0;
+  return foundFirstFrame && !stillBuffering;
+}
+
+bool MP3FrameParser::HasExactDuration()
+{
+  // An exact duration requires a frame count taken from a VBR header, and
+  // that is only looked for after the first frame has been handled.
+  if (!ParsedHeaders()) {
+    return false;
+  }
+  return mNumFrames >= 0;
+}
+
+bool MP3FrameParser::NeedsData()
+{
+  // Streams already ruled out as MP3 never need more data.
+  if (!IsMP3()) {
+    return false;
+  }
+
+  // Without an exact duration we are either still waiting for a VBR header,
+  // or we keep looking at frames to refine the duration estimate.
+  // NOTE(review): this reads fields documented as protected by mLock without
+  // taking it, while HasExactDuration() is also called from Parse() with the
+  // lock already held -- confirm the intended threading.
+  return !HasExactDuration();
+}
+
 }
--- a/content/media/MP3FrameParser.h
+++ b/content/media/MP3FrameParser.h
@ -5,7 +5,9 @@
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

 #include <stdint.h>
+
 #include "mozilla/Mutex.h"
+#include "nsString.h"

 namespace mozilla {

@ -57,6 +59,9 @@ public:
  // Get the sample rate from the current header.
  uint32_t GetSampleRate();

+  // Get the number of samples per frame.
+  uint32_t GetSamplesPerFrame();
+
 private:
  uint32_t mCurrentChar;
  union {
@ -102,7 +107,7 @@ public:
    return mIsMP3 != NOT_MP3;
  }

-  void Parse(const char* aBuffer, uint32_t aLength, int64_t aStreamOffset);
+  void Parse(const char* aBuffer, uint32_t aLength, uint64_t aStreamOffset);

  // Returns the duration, in microseconds. If the entire stream has not
  // been parsed yet, this is an estimate based on the bitrate of the
@ -113,6 +118,18 @@ public:
  // no MP3 frame has been detected yet.
  int64_t GetMP3Offset();

+  // Returns true if we've seen the whole first frame of the MP3 stream, and
+  // therefore can make an estimate on the stream duration.
+  // Otherwise, returns false.
+  bool ParsedHeaders();
+
+  // Returns true if we know the exact duration of the MP3 stream;
+  // false otherwise.
+  bool HasExactDuration();
+
+  // Returns true if the parser needs more data for duration estimation.
+  bool NeedsData();
+
 private:

  // Parses aBuffer, starting at offset 0. Returns the number of bytes
@ -135,24 +152,47 @@ private:
  // MP3 frame header parser.
  MP3Parser mMP3Parser;

+  // If we read |MAX_SKIPPED_BYTES| from the stream without finding any MP3
+  // frames, we give up and report |NOT_MP3|. Here we track the cumulative size
+  // of any ID3 headers we've seen so big ID3 sections aren't counted towards
+  // skipped bytes.
+  uint32_t mTotalID3Size;
+
  // All fields below are protected by mLock
+
+  // We keep stats on the size of all the frames we've seen, as well as how many
+  // so that we can estimate the duration of the rest of the stream.
  uint64_t mTotalFrameSize;
-  uint64_t mNumFrames;
+  uint64_t mFrameCount;

  // Offset of the last data parsed. This is the end offset of the last data
  // block parsed, so it's the start offset we expect to get on the next
  // call to Parse().
-  int64_t  mOffset;
+  uint64_t mOffset;

  // Total length of the stream in bytes.
-  int64_t  mLength;
+  int64_t mLength;

  // Offset of first MP3 frame in the bitstream. Has value -1 until the
  // first MP3 frame is found.
  int64_t mMP3Offset;

-  // Number of audio samples per second. Fixed through the whole file.
-  uint16_t mSampleRate;
+  // The exact number of frames in this stream, if we know it. -1 otherwise.
+  int64_t mNumFrames;
+
+  // Number of audio samples per second and per frame. Fixed through the whole
+  // file. If we know these variables as well as the number of frames in the
+  // file, we can get an exact duration for the stream.
+  uint16_t mSamplesPerSecond;
+  uint16_t mSamplesPerFrame;
+
+  // If the MP3 has a variable bitrate, then there *should* be metadata about
+  // the encoding in the first frame. We buffer the first frame here.
+  nsAutoCString mFirstFrame;
+
+  // While we are buffering the first frame, this is the stream offset one past
+  // that frame's last byte (mMP3Offset + frame length). -1 at all other times.
+  int64_t mFirstFrameEnd;

  enum eIsMP3 {
    MAYBE_MP3, // We're giving the stream the benefit of the doubt...
--- a/content/media/apple/AppleMP3Reader.cpp
+++ b/content/media/apple/AppleMP3Reader.cpp
@ -86,17 +86,12 @@ static void _AudioSampleCallback(void *aThis,
 * put it in |aData|, and return true.
 * Otherwise, put as much data as is left into |aData|, set |aNumBytes| to the
 * amount of data we have left, and return false.
- *
- * This function also passes the read data on to the MP3 frame parser for
- * stream duration estimation.
 */
 nsresult
-AppleMP3Reader::ReadAndNotify(uint32_t *aNumBytes, char *aData)
+AppleMP3Reader::Read(uint32_t *aNumBytes, char *aData)
 {
  MediaResource *resource = mDecoder->GetResource();

-  uint64_t offset = resource->Tell();
-
  // Loop until we have all the data asked for, or we've reached EOS
  uint32_t totalBytes = 0;
  uint32_t numBytes;
@ -111,18 +106,6 @@ AppleMP3Reader::ReadAndNotify(uint32_t *aNumBytes, char *aData)
    }
  } while(totalBytes < *aNumBytes && numBytes);

-  // Pass the buffer to the MP3 frame parser to improve our duration estimate.
-  if (mMP3FrameParser.IsMP3()) {
-    mMP3FrameParser.Parse(aData, totalBytes, offset);
-    uint64_t duration = mMP3FrameParser.GetDuration();
-    if (duration != mDuration) {
-      LOGD("Updating media duration to %lluus\n", duration);
-      mDuration = duration;
-      ReentrantMonitorAutoEnter mon(mDecoder->GetReentrantMonitor());
-      mDecoder->UpdateEstimatedMediaDuration(duration);
-    }
-  }
-
  *aNumBytes = totalBytes;

  // We will have read some data in the last iteration iff we filled the buffer.
@ -286,7 +269,7 @@ AppleMP3Reader::DecodeAudioData()
  char bytes[AUDIO_READ_BYTES];
  uint32_t numBytes = AUDIO_READ_BYTES;

-  nsresult readrv = ReadAndNotify(&numBytes, bytes);
+  nsresult readrv = Read(&numBytes, bytes);

  // This function calls |AudioSampleCallback| above, synchronously, when it
  // finds compressed MP3 frame.
@ -374,16 +357,21 @@ AppleMP3Reader::ReadMetadata(MediaInfo* aInfo,
   */
  OSStatus rv;
  nsresult readrv;
+  uint32_t offset = 0;
  do {
    char bytes[AUDIO_READ_BYTES];
    uint32_t numBytes = AUDIO_READ_BYTES;
-    readrv = ReadAndNotify(&numBytes, bytes);
+    readrv = Read(&numBytes, bytes);

    rv = AudioFileStreamParseBytes(mAudioFileStream,
                                   numBytes,
                                   bytes,
                                   0 /* flags */);

+    mMP3FrameParser.Parse(bytes, numBytes, offset);
+
+    offset += numBytes;
+
    // We have to do our decoder setup from the callback. When it's done it will
    // set mStreamReady.
  } while (!mStreamReady && !rv && NS_SUCCEEDED(readrv));
@ -398,12 +386,18 @@ AppleMP3Reader::ReadMetadata(MediaInfo* aInfo,
    return NS_ERROR_FAILURE;
  }

+  if (!mMP3FrameParser.IsMP3()) {
+    LOGE("Frame parser failed to parse MP3 stream\n");
+    return NS_ERROR_FAILURE;
+  }
+
  aInfo->mAudio.mRate = mAudioSampleRate;
  aInfo->mAudio.mChannels = mAudioChannels;
  aInfo->mAudio.mHasAudio = mStreamReady;

  {
    ReentrantMonitorAutoEnter mon(mDecoder->GetReentrantMonitor());
+    mDuration = mMP3FrameParser.GetDuration();
    mDecoder->SetMediaDuration(mDuration);
  }

@ -515,4 +509,25 @@ AppleMP3Reader::Seek(int64_t aTime,
  return NS_OK;
 }

+// Called when new stream data is available. While the frame parser still
+// lacks an exact duration, feeds it |aLength| bytes from |aBuffer| (located at
+// stream offset |aOffset|) and forwards a changed duration to the decoder.
+void
+AppleMP3Reader::NotifyDataArrived(const char* aBuffer,
+                                  uint32_t aLength,
+                                  int64_t aOffset)
+{
+  MOZ_ASSERT(NS_IsMainThread());
+  if (!mMP3FrameParser.NeedsData()) {
+    return;
+  }
+
+  mMP3FrameParser.Parse(aBuffer, aLength, aOffset);
+
+  // GetDuration() returns -1 until the first MP3 frame has been found. Check
+  // for that while the value is still signed, so the sentinel is never
+  // reported to the decoder as a (huge) duration.
+  int64_t parsedDuration = mMP3FrameParser.GetDuration();
+  if (parsedDuration < 0) {
+    return;
+  }
+
+  uint64_t duration = parsedDuration;
+  if (duration != mDuration) {
+    LOGD("Updating media duration to %lluus\n", duration);
+    mDuration = duration;
+    ReentrantMonitorAutoEnter mon(mDecoder->GetReentrantMonitor());
+    mDecoder->UpdateEstimatedMediaDuration(duration);
+  }
+}
+
 } // namespace mozilla
--- a/content/media/apple/AppleMP3Reader.h
+++ b/content/media/apple/AppleMP3Reader.h
@ -47,9 +47,13 @@ public:
                             AudioFileStreamPropertyID aPropertyID,
                             UInt32 *aFlags);

+  virtual void NotifyDataArrived(const char* aBuffer,
+                                 uint32_t aLength,
+                                 int64_t aOffset) MOZ_OVERRIDE;
+
 private:
  void SetupDecoder();
-  nsresult ReadAndNotify(uint32_t *aNumBytes, char *aData);
+  nsresult Read(uint32_t *aNumBytes, char *aData);

  static OSStatus PassthroughInputDataCallback(AudioConverterRef aAudioConverter,
                                               UInt32 *aNumDataPackets,