gecko/dom/media/webspeech/recognition/SpeechRecognition.cpp

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:set ts=2 sw=2 sts=2 et cindent: */
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "SpeechRecognition.h"
#include "nsCOMPtr.h"
#include "nsCycleCollectionParticipant.h"
#include "mozilla/dom/SpeechRecognitionBinding.h"
#include "mozilla/dom/MediaStreamTrackBinding.h"
#include "mozilla/dom/MediaStreamError.h"
#include "mozilla/MediaManager.h"
#include "mozilla/Services.h"
#include "AudioSegment.h"
#include "endpointer.h"
#include "mozilla/dom/SpeechRecognitionEvent.h"
#include "nsIObserverService.h"
#include "nsServiceManagerUtils.h"
#include <algorithm>
// Undo the windows.h damage
#if defined(XP_WIN) && defined(GetMessage)
#undef GetMessage
#endif
namespace mozilla {
namespace dom {
#define PREFERENCE_DEFAULT_RECOGNITION_SERVICE "media.webspeech.service.default"
#define DEFAULT_RECOGNITION_SERVICE "google"
#define PREFERENCE_ENDPOINTER_SILENCE_LENGTH "media.webspeech.silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH "media.webspeech.long_silence_length"
#define PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH "media.webspeech.long_speech_length"
static const uint32_t kSAMPLE_RATE = 16000;
static const uint32_t kSPEECH_DETECTION_TIMEOUT_MS = 10000;
// number of samples corresponding to 300ms of audio to send to the endpointer
// while it's in environment estimation mode
// kSAMPLE_RATE samples = 1s, kESTIMATION_SAMPLES samples = 300ms
static const uint32_t kESTIMATION_SAMPLES = 300 * kSAMPLE_RATE / 1000;
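// With kSAMPLE_RATE = 16000, this works out to 300 * 16000 / 1000 = 4800 samples.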
#ifdef PR_LOGGING
PRLogModuleInfo*
GetSpeechRecognitionLog()
{
static PRLogModuleInfo* sLog;
if (!sLog) {
sLog = PR_NewLogModule("SpeechRecognition");
}
return sLog;
}
#define SR_LOG(...) PR_LOG(GetSpeechRecognitionLog(), PR_LOG_DEBUG, (__VA_ARGS__))
#else
#define SR_LOG(...)
#endif
already_AddRefed<nsISpeechRecognitionService>
GetSpeechRecognitionService()
{
nsAutoCString speechRecognitionServiceCID;
nsAdoptingCString prefValue =
Preferences::GetCString(PREFERENCE_DEFAULT_RECOGNITION_SERVICE);
nsAutoCString speechRecognitionService;
if (!prefValue.get() || prefValue.IsEmpty()) {
speechRecognitionService = DEFAULT_RECOGNITION_SERVICE;
} else {
speechRecognitionService = prefValue;
}
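// The recognition service is looked up by contract ID: the generic prefix
// plus the service name chosen above, or the "fake" service when the
// fake-recognition-service test pref is set.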
if (!SpeechRecognition::mTestConfig.mFakeRecognitionService) {
speechRecognitionServiceCID =
NS_LITERAL_CSTRING(NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX) +
speechRecognitionService;
} else {
speechRecognitionServiceCID =
NS_SPEECH_RECOGNITION_SERVICE_CONTRACTID_PREFIX "fake";
}
nsresult rv;
nsCOMPtr<nsISpeechRecognitionService> recognitionService;
recognitionService = do_GetService(speechRecognitionServiceCID.get(), &rv);
return recognitionService.forget();
}
NS_INTERFACE_MAP_BEGIN(SpeechRecognition)
NS_INTERFACE_MAP_ENTRY(nsIObserver)
NS_INTERFACE_MAP_END_INHERITING(DOMEventTargetHelper)
NS_IMPL_ADDREF_INHERITED(SpeechRecognition, DOMEventTargetHelper)
NS_IMPL_RELEASE_INHERITED(SpeechRecognition, DOMEventTargetHelper)
struct SpeechRecognition::TestConfig SpeechRecognition::mTestConfig;
SpeechRecognition::SpeechRecognition(nsPIDOMWindow* aOwnerWindow)
: DOMEventTargetHelper(aOwnerWindow)
, mEndpointer(kSAMPLE_RATE)
, mAudioSamplesPerChunk(mEndpointer.FrameSize())
, mSpeechDetectionTimer(do_CreateInstance(NS_TIMER_CONTRACTID))
{
SR_LOG("created SpeechRecognition");
mTestConfig.Init();
if (mTestConfig.mEnableTests) {
nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
obs->AddObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC, false);
obs->AddObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC, false);
}
mEndpointer.set_speech_input_complete_silence_length(
Preferences::GetInt(PREFERENCE_ENDPOINTER_SILENCE_LENGTH, 500000));
mEndpointer.set_long_speech_input_complete_silence_length(
Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SILENCE_LENGTH, 1000000));
mEndpointer.set_long_speech_length(
Preferences::GetInt(PREFERENCE_ENDPOINTER_LONG_SPEECH_LENGTH, 3 * 1000000));
Reset();
}
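// Note: StateBetween relies on the declaration order of FSMState; it returns
// true iff mCurrentState lies in the inclusive range [begin, end].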
bool
SpeechRecognition::StateBetween(FSMState begin, FSMState end)
{
return mCurrentState >= begin && mCurrentState <= end;
}
void
SpeechRecognition::SetState(FSMState state)
{
mCurrentState = state;
SR_LOG("Transitioned to state %s", GetName(mCurrentState));
return;
}
JSObject*
SpeechRecognition::WrapObject(JSContext* aCx)
{
return SpeechRecognitionBinding::Wrap(aCx, this);
}
already_AddRefed<SpeechRecognition>
SpeechRecognition::Constructor(const GlobalObject& aGlobal,
ErrorResult& aRv)
{
nsCOMPtr<nsPIDOMWindow> win = do_QueryInterface(aGlobal.GetAsSupports());
if (!win) {
aRv.Throw(NS_ERROR_FAILURE);
return nullptr;
}
MOZ_ASSERT(win->IsInnerWindow());
nsRefPtr<SpeechRecognition> object = new SpeechRecognition(win);
return object.forget();
}
nsISupports*
SpeechRecognition::GetParentObject() const
{
return GetOwner();
}
void
SpeechRecognition::ProcessEvent(SpeechEvent* aEvent)
{
SR_LOG("Processing %s, current state is %s",
GetName(aEvent),
GetName(mCurrentState));
if (mAborted && aEvent->mType != EVENT_ABORT) {
// ignore all events while aborting
return;
}
Transition(aEvent);
}
void
SpeechRecognition::Transition(SpeechEvent* aEvent)
{
switch (mCurrentState) {
case STATE_IDLE:
switch (aEvent->mType) {
case EVENT_START:
// TODO: may want to time out if we wait too long
// for user to approve
WaitForAudioData(aEvent);
break;
case EVENT_STOP:
case EVENT_ABORT:
case EVENT_AUDIO_DATA:
case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
DoNothing(aEvent);
break;
case EVENT_AUDIO_ERROR:
case EVENT_RECOGNITIONSERVICE_ERROR:
AbortError(aEvent);
break;
case EVENT_COUNT:
MOZ_CRASH("Invalid event EVENT_COUNT");
}
break;
case STATE_STARTING:
switch (aEvent->mType) {
case EVENT_AUDIO_DATA:
StartedAudioCapture(aEvent);
break;
case EVENT_AUDIO_ERROR:
case EVENT_RECOGNITIONSERVICE_ERROR:
AbortError(aEvent);
break;
case EVENT_ABORT:
AbortSilently(aEvent);
break;
case EVENT_STOP:
Reset();
break;
case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
DoNothing(aEvent);
break;
case EVENT_START:
SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
MOZ_CRASH();
case EVENT_COUNT:
MOZ_CRASH("Invalid event EVENT_COUNT");
}
break;
case STATE_ESTIMATING:
switch (aEvent->mType) {
case EVENT_AUDIO_DATA:
WaitForEstimation(aEvent);
break;
case EVENT_STOP:
StopRecordingAndRecognize(aEvent);
break;
case EVENT_ABORT:
AbortSilently(aEvent);
break;
case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
case EVENT_RECOGNITIONSERVICE_ERROR:
DoNothing(aEvent);
break;
case EVENT_AUDIO_ERROR:
AbortError(aEvent);
break;
case EVENT_START:
SR_LOG("STATE_ESTIMATING: Unhandled event %d", aEvent->mType);
MOZ_CRASH();
case EVENT_COUNT:
MOZ_CRASH("Invalid event EVENT_COUNT");
}
break;
case STATE_WAITING_FOR_SPEECH:
switch (aEvent->mType) {
case EVENT_AUDIO_DATA:
DetectSpeech(aEvent);
break;
case EVENT_STOP:
StopRecordingAndRecognize(aEvent);
break;
case EVENT_ABORT:
AbortSilently(aEvent);
break;
case EVENT_AUDIO_ERROR:
AbortError(aEvent);
break;
case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
case EVENT_RECOGNITIONSERVICE_ERROR:
DoNothing(aEvent);
break;
case EVENT_START:
SR_LOG("STATE_STARTING: Unhandled event %s", GetName(aEvent));
MOZ_CRASH();
case EVENT_COUNT:
MOZ_CRASH("Invalid event EVENT_COUNT");
}
break;
case STATE_RECOGNIZING:
switch (aEvent->mType) {
case EVENT_AUDIO_DATA:
WaitForSpeechEnd(aEvent);
break;
case EVENT_STOP:
StopRecordingAndRecognize(aEvent);
break;
case EVENT_AUDIO_ERROR:
case EVENT_RECOGNITIONSERVICE_ERROR:
AbortError(aEvent);
break;
case EVENT_ABORT:
AbortSilently(aEvent);
break;
case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
DoNothing(aEvent);
break;
case EVENT_START:
SR_LOG("STATE_RECOGNIZING: Unhandled aEvent %s", GetName(aEvent));
MOZ_CRASH();
case EVENT_COUNT:
MOZ_CRASH("Invalid event EVENT_COUNT");
}
break;
case STATE_WAITING_FOR_RESULT:
switch (aEvent->mType) {
case EVENT_STOP:
DoNothing(aEvent);
break;
case EVENT_AUDIO_ERROR:
case EVENT_RECOGNITIONSERVICE_ERROR:
AbortError(aEvent);
break;
case EVENT_RECOGNITIONSERVICE_FINAL_RESULT:
NotifyFinalResult(aEvent);
break;
case EVENT_AUDIO_DATA:
DoNothing(aEvent);
break;
case EVENT_ABORT:
AbortSilently(aEvent);
break;
case EVENT_START:
case EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT:
SR_LOG("STATE_WAITING_FOR_RESULT: Unhandled aEvent %s", GetName(aEvent));
MOZ_CRASH();
case EVENT_COUNT:
MOZ_CRASH("Invalid event EVENT_COUNT");
}
break;
case STATE_COUNT:
MOZ_CRASH("Invalid state STATE_COUNT");
}
return;
}
/*
* Handle a segment of recorded audio data.
* Returns the number of samples that were processed.
*/
uint32_t
SpeechRecognition::ProcessAudioSegment(AudioSegment* aSegment, TrackRate aTrackRate)
{
AudioSegment::ChunkIterator iterator(*aSegment);
uint32_t samples = 0;
while (!iterator.IsEnded()) {
float out;
mEndpointer.ProcessAudio(*iterator, &out);
samples += iterator->GetDuration();
iterator.Next();
}
mRecognitionService->ProcessAudioSegment(aSegment, aTrackRate);
return samples;
}
/****************************************************************************
* FSM Transition functions
*
* If a transition function may cause a DOM event to be fired,
* it may also be re-entered, since the event handler may cause the
* event loop to spin and new SpeechEvents to be processed.
*
* Rules:
* 1) These methods should call SetState as soon as possible.
* 2) If these methods dispatch DOM events, or call methods that dispatch
* DOM events, that should be done as late as possible.
* 3) If anything must happen after dispatching a DOM event, make sure
* the state is still what the method expected it to be.
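*
* Example: StartedAudioCapture below follows these rules. It calls
* SetState(STATE_ESTIMATING) first, dispatches "audiostart" as late as
* possible, and only dispatches "start" after re-checking that
* mCurrentState is still STATE_ESTIMATING.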
****************************************************************************/
void
SpeechRecognition::Reset()
{
SetState(STATE_IDLE);
mRecognitionService = nullptr;
mEstimationSamples = 0;
mBufferedSamples = 0;
mSpeechDetectionTimer->Cancel();
mAborted = false;
}
void
SpeechRecognition::ResetAndEnd()
{
Reset();
DispatchTrustedEvent(NS_LITERAL_STRING("end"));
}
void
SpeechRecognition::WaitForAudioData(SpeechEvent* aEvent)
{
SetState(STATE_STARTING);
}
void
SpeechRecognition::StartedAudioCapture(SpeechEvent* aEvent)
{
SetState(STATE_ESTIMATING);
mEndpointer.SetEnvironmentEstimationMode();
mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
DispatchTrustedEvent(NS_LITERAL_STRING("audiostart"));
if (mCurrentState == STATE_ESTIMATING) {
DispatchTrustedEvent(NS_LITERAL_STRING("start"));
}
}
void
SpeechRecognition::StopRecordingAndRecognize(SpeechEvent* aEvent)
{
SetState(STATE_WAITING_FOR_RESULT);
MOZ_ASSERT(mRecognitionService, "Service deleted before recording done");
mRecognitionService->SoundEnd();
StopRecording();
}
void
SpeechRecognition::WaitForEstimation(SpeechEvent* aEvent)
{
SetState(STATE_ESTIMATING);
mEstimationSamples += ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
if (mEstimationSamples > kESTIMATION_SAMPLES) {
mEndpointer.SetUserInputMode();
SetState(STATE_WAITING_FOR_SPEECH);
}
}
void
SpeechRecognition::DetectSpeech(SpeechEvent* aEvent)
{
SetState(STATE_WAITING_FOR_SPEECH);
ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
if (mEndpointer.DidStartReceivingSpeech()) {
mSpeechDetectionTimer->Cancel();
SetState(STATE_RECOGNIZING);
DispatchTrustedEvent(NS_LITERAL_STRING("speechstart"));
}
}
void
SpeechRecognition::WaitForSpeechEnd(SpeechEvent* aEvent)
{
SetState(STATE_RECOGNIZING);
ProcessAudioSegment(aEvent->mAudioSegment, aEvent->mTrackRate);
if (mEndpointer.speech_input_complete()) {
DispatchTrustedEvent(NS_LITERAL_STRING("speechend"));
if (mCurrentState == STATE_RECOGNIZING) {
// FIXME: StopRecordingAndRecognize should only be called for single-shot
// services; for continuous recognition we should just inform the service
StopRecordingAndRecognize(aEvent);
}
}
}
void
SpeechRecognition::NotifyFinalResult(SpeechEvent* aEvent)
{
ResetAndEnd();
SpeechRecognitionEventInit init;
init.mBubbles = true;
init.mCancelable = false;
// init.mResultIndex = 0;
init.mResults = aEvent->mRecognitionResultList;
init.mInterpretation = NS_LITERAL_STRING("NOT_IMPLEMENTED");
// init.mEmma = nullptr;
nsRefPtr<SpeechRecognitionEvent> event =
SpeechRecognitionEvent::Constructor(this, NS_LITERAL_STRING("result"), init);
event->SetTrusted(true);
bool defaultActionEnabled;
this->DispatchEvent(event, &defaultActionEnabled);
}
void
SpeechRecognition::DoNothing(SpeechEvent* aEvent)
{
}
void
SpeechRecognition::AbortSilently(SpeechEvent* aEvent)
{
if (mRecognitionService) {
mRecognitionService->Abort();
}
if (mDOMStream) {
StopRecording();
}
ResetAndEnd();
}
void
SpeechRecognition::AbortError(SpeechEvent* aEvent)
{
AbortSilently(aEvent);
NotifyError(aEvent);
}
void
SpeechRecognition::NotifyError(SpeechEvent* aEvent)
{
aEvent->mError->SetTrusted(true);
bool defaultActionEnabled;
this->DispatchEvent(aEvent->mError, &defaultActionEnabled);
return;
}
/**************************************
* Event triggers and other functions *
**************************************/
NS_IMETHODIMP
SpeechRecognition::StartRecording(DOMMediaStream* aDOMStream)
{
// hold a reference so that the underlying stream
// doesn't get Destroy()'ed
mDOMStream = aDOMStream;
NS_ENSURE_STATE(mDOMStream->GetStream());
mSpeechListener = new SpeechStreamListener(this);
mDOMStream->GetStream()->AddListener(mSpeechListener);
mEndpointer.StartSession();
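// Arm the one-shot no-speech timer; if it fires before speech is detected
// (i.e. while we are still between STATE_IDLE and STATE_WAITING_FOR_SPEECH),
// Observe() dispatches a "no-speech" audio error.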
return mSpeechDetectionTimer->Init(this, kSPEECH_DETECTION_TIMEOUT_MS,
nsITimer::TYPE_ONE_SHOT);
}
NS_IMETHODIMP
SpeechRecognition::StopRecording()
{
// we only really need to remove the listener explicitly when testing,
// as our JS code still holds a reference to mDOMStream, and assigning
// it to nullptr alone isn't guaranteed to free the stream and the listener.
mDOMStream->GetStream()->RemoveListener(mSpeechListener);
mSpeechListener = nullptr;
mDOMStream = nullptr;
mEndpointer.EndSession();
DispatchTrustedEvent(NS_LITERAL_STRING("audioend"));
return NS_OK;
}
NS_IMETHODIMP
SpeechRecognition::Observe(nsISupports* aSubject, const char* aTopic,
const char16_t* aData)
{
MOZ_ASSERT(NS_IsMainThread(), "Observer invoked off the main thread");
if (!strcmp(aTopic, NS_TIMER_CALLBACK_TOPIC) &&
StateBetween(STATE_IDLE, STATE_WAITING_FOR_SPEECH)) {
DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
SpeechRecognitionErrorCode::No_speech,
NS_LITERAL_STRING("No speech detected (timeout)"));
} else if (!strcmp(aTopic, SPEECH_RECOGNITION_TEST_END_TOPIC)) {
nsCOMPtr<nsIObserverService> obs = services::GetObserverService();
obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC);
obs->RemoveObserver(this, SPEECH_RECOGNITION_TEST_END_TOPIC);
} else if (mTestConfig.mFakeFSMEvents &&
!strcmp(aTopic, SPEECH_RECOGNITION_TEST_EVENT_REQUEST_TOPIC)) {
ProcessTestEventRequest(aSubject, nsDependentString(aData));
}
return NS_OK;
}
void
SpeechRecognition::ProcessTestEventRequest(nsISupports* aSubject, const nsAString& aEventName)
{
if (aEventName.EqualsLiteral("EVENT_ABORT")) {
Abort();
} else if (aEventName.EqualsLiteral("EVENT_AUDIO_ERROR")) {
DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR,
SpeechRecognitionErrorCode::Audio_capture, // TODO different codes?
NS_LITERAL_STRING("AUDIO_ERROR test event"));
} else {
NS_ASSERTION(mTestConfig.mFakeRecognitionService,
"Got request for fake recognition service event, but "
TEST_PREFERENCE_FAKE_RECOGNITION_SERVICE " is unset");
// let the fake recognition service handle the request
}
return;
}
already_AddRefed<SpeechGrammarList>
SpeechRecognition::GetGrammars(ErrorResult& aRv) const
{
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return nullptr;
}
void
SpeechRecognition::SetGrammars(SpeechGrammarList& aArg, ErrorResult& aRv)
{
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return;
}
void
SpeechRecognition::GetLang(nsString& aRetVal, ErrorResult& aRv) const
{
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return;
}
void
SpeechRecognition::SetLang(const nsAString& aArg, ErrorResult& aRv)
{
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return;
}
bool
SpeechRecognition::GetContinuous(ErrorResult& aRv) const
{
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return false;
}
void
SpeechRecognition::SetContinuous(bool aArg, ErrorResult& aRv)
{
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return;
}
bool
SpeechRecognition::GetInterimResults(ErrorResult& aRv) const
{
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return false;
}
void
SpeechRecognition::SetInterimResults(bool aArg, ErrorResult& aRv)
{
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return;
}
uint32_t
SpeechRecognition::GetMaxAlternatives(ErrorResult& aRv) const
{
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return 0;
}
void
SpeechRecognition::SetMaxAlternatives(uint32_t aArg, ErrorResult& aRv)
{
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return;
}
void
SpeechRecognition::GetServiceURI(nsString& aRetVal, ErrorResult& aRv) const
{
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return;
}
void
SpeechRecognition::SetServiceURI(const nsAString& aArg, ErrorResult& aRv)
{
aRv.Throw(NS_ERROR_NOT_IMPLEMENTED);
return;
}
void
SpeechRecognition::Start(const Optional<NonNull<DOMMediaStream>>& aStream, ErrorResult& aRv)
{
if (mCurrentState != STATE_IDLE) {
aRv.Throw(NS_ERROR_DOM_INVALID_STATE_ERR);
return;
}
mRecognitionService = GetSpeechRecognitionService();
NS_ENSURE_TRUE_VOID(mRecognitionService);
nsresult rv;
rv = mRecognitionService->Initialize(this);
NS_ENSURE_SUCCESS_VOID(rv);
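// Either record the stream the caller passed in, or request an audio-only
// stream via MediaManager::GetUserMedia(); in the latter case recording
// starts from GetUserMediaSuccessCallback::OnSuccess().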
MediaStreamConstraints constraints;
constraints.mAudio.SetAsBoolean() = true;
if (aStream.WasPassed()) {
StartRecording(&aStream.Value());
} else {
MediaManager* manager = MediaManager::Get();
manager->GetUserMedia(GetOwner(),
constraints,
new GetUserMediaSuccessCallback(this),
new GetUserMediaErrorCallback(this));
}
nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_START);
NS_DispatchToMainThread(event);
}
void
SpeechRecognition::Stop()
{
nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_STOP);
NS_DispatchToMainThread(event);
}
void
SpeechRecognition::Abort()
{
if (mAborted) {
return;
}
mAborted = true;
nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_ABORT);
NS_DispatchToMainThread(event);
}
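// Build a trusted SpeechRecognitionError and route it through the FSM as a
// SpeechEvent, so the state machine decides when to actually fire it at the
// page (see AbortError/NotifyError).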
void
SpeechRecognition::DispatchError(EventType aErrorType,
SpeechRecognitionErrorCode aErrorCode,
const nsAString& aMessage)
{
MOZ_ASSERT(NS_IsMainThread());
MOZ_ASSERT(aErrorType == EVENT_RECOGNITIONSERVICE_ERROR ||
aErrorType == EVENT_AUDIO_ERROR, "Invalid error type!");
nsRefPtr<SpeechRecognitionError> srError =
new SpeechRecognitionError(nullptr, nullptr, nullptr);
ErrorResult err;
srError->InitSpeechRecognitionError(NS_LITERAL_STRING("error"), true, false,
aErrorCode, aMessage, err);
nsRefPtr<SpeechEvent> event = new SpeechEvent(this, aErrorType);
event->mError = srError;
NS_DispatchToMainThread(event);
}
/*
* Buffer audio samples into mAudioSamplesBuffer until it holds a full chunk
* of mAudioSamplesPerChunk samples.
* Updates mBufferedSamples and returns the number of samples that were buffered.
*/
uint32_t
SpeechRecognition::FillSamplesBuffer(const int16_t* aSamples,
uint32_t aSampleCount)
{
MOZ_ASSERT(mBufferedSamples < mAudioSamplesPerChunk);
MOZ_ASSERT(mAudioSamplesBuffer.get());
int16_t* samplesBuffer = static_cast<int16_t*>(mAudioSamplesBuffer->Data());
size_t samplesToCopy = std::min(aSampleCount,
mAudioSamplesPerChunk - mBufferedSamples);
memcpy(samplesBuffer + mBufferedSamples, aSamples,
samplesToCopy * sizeof(int16_t));
mBufferedSamples += samplesToCopy;
return samplesToCopy;
}
/*
* Split a sample buffer of a given size into chunks of
* mAudioSamplesPerChunk samples each. The chunks are stored in the array
* received as argument.
* Returns the offset of the end of the last chunk that was
* created.
*/
uint32_t
SpeechRecognition::SplitSamplesBuffer(const int16_t* aSamplesBuffer,
uint32_t aSampleCount,
nsTArray<nsRefPtr<SharedBuffer>>& aResult)
{
uint32_t chunkStart = 0;
while (chunkStart + mAudioSamplesPerChunk <= aSampleCount) {
nsRefPtr<SharedBuffer> chunk =
SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));
memcpy(chunk->Data(), aSamplesBuffer + chunkStart,
mAudioSamplesPerChunk * sizeof(int16_t));
aResult.AppendElement(chunk.forget());
chunkStart += mAudioSamplesPerChunk;
}
return chunkStart;
}
AudioSegment*
SpeechRecognition::CreateAudioSegment(nsTArray<nsRefPtr<SharedBuffer>>& aChunks)
{
AudioSegment* segment = new AudioSegment();
for (uint32_t i = 0; i < aChunks.Length(); ++i) {
nsRefPtr<SharedBuffer> buffer = aChunks[i];
const int16_t* chunkData = static_cast<const int16_t*>(buffer->Data());
nsAutoTArray<const int16_t*, 1> channels;
channels.AppendElement(chunkData);
segment->AppendFrames(buffer.forget(), channels, mAudioSamplesPerChunk);
}
return segment;
}
void
SpeechRecognition::FeedAudioData(already_AddRefed<SharedBuffer> aSamples,
uint32_t aDuration,
MediaStreamListener* aProvider, TrackRate aTrackRate)
{
NS_ASSERTION(!NS_IsMainThread(),
"FeedAudioData should not be called in the main thread");
// Endpointer expects to receive samples in chunks whose size is a
// multiple of its frame size.
// Since we can't assume we will receive the frames in appropriate-sized
// chunks, we must buffer and split them in chunks of mAudioSamplesPerChunk
// (a multiple of Endpointer's frame size) before feeding to Endpointer.
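// Worked example (hypothetical numbers): if mAudioSamplesPerChunk were 160
// and 100 samples were already buffered, then on receiving 500 new samples we
// would copy 60 of them to complete the buffered chunk, split off two full
// chunks of 160 samples each, and re-buffer the remaining 120 for the next call.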
// ensure aSamples is deleted
nsRefPtr<SharedBuffer> refSamples = aSamples;
uint32_t samplesIndex = 0;
const int16_t* samples = static_cast<int16_t*>(refSamples->Data());
nsAutoTArray<nsRefPtr<SharedBuffer>, 5> chunksToSend;
// fill up our buffer and make a chunk out of it, if possible
if (mBufferedSamples > 0) {
samplesIndex += FillSamplesBuffer(samples, aDuration);
if (mBufferedSamples == mAudioSamplesPerChunk) {
chunksToSend.AppendElement(mAudioSamplesBuffer.forget());
mBufferedSamples = 0;
}
}
// create sample chunks of correct size
if (samplesIndex < aDuration) {
samplesIndex += SplitSamplesBuffer(samples + samplesIndex,
aDuration - samplesIndex,
chunksToSend);
}
// buffer remaining samples
if (samplesIndex < aDuration) {
mBufferedSamples = 0;
mAudioSamplesBuffer =
SharedBuffer::Create(mAudioSamplesPerChunk * sizeof(int16_t));
FillSamplesBuffer(samples + samplesIndex, aDuration - samplesIndex);
}
AudioSegment* segment = CreateAudioSegment(chunksToSend);
nsRefPtr<SpeechEvent> event = new SpeechEvent(this, EVENT_AUDIO_DATA);
event->mAudioSegment = segment;
event->mProvider = aProvider;
event->mTrackRate = aTrackRate;
NS_DispatchToMainThread(event);
return;
}
const char*
SpeechRecognition::GetName(FSMState aId)
{
static const char* names[] = {
"STATE_IDLE",
"STATE_STARTING",
"STATE_ESTIMATING",
"STATE_WAITING_FOR_SPEECH",
"STATE_RECOGNIZING",
"STATE_WAITING_FOR_RESULT",
};
MOZ_ASSERT(aId < STATE_COUNT);
MOZ_ASSERT(ArrayLength(names) == STATE_COUNT);
return names[aId];
}
const char*
SpeechRecognition::GetName(SpeechEvent* aEvent)
{
static const char* names[] = {
"EVENT_START",
"EVENT_STOP",
"EVENT_ABORT",
"EVENT_AUDIO_DATA",
"EVENT_AUDIO_ERROR",
"EVENT_RECOGNITIONSERVICE_INTERMEDIATE_RESULT",
"EVENT_RECOGNITIONSERVICE_FINAL_RESULT",
"EVENT_RECOGNITIONSERVICE_ERROR"
};
MOZ_ASSERT(aEvent->mType < EVENT_COUNT);
MOZ_ASSERT(ArrayLength(names) == EVENT_COUNT);
return names[aEvent->mType];
}
SpeechEvent::~SpeechEvent()
{
delete mAudioSegment;
}
NS_IMETHODIMP
SpeechEvent::Run()
{
mRecognition->ProcessEvent(this);
return NS_OK;
}
NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaSuccessCallback, nsIDOMGetUserMediaSuccessCallback)
NS_IMETHODIMP
SpeechRecognition::GetUserMediaSuccessCallback::OnSuccess(nsISupports* aStream)
{
nsRefPtr<DOMMediaStream> stream = do_QueryObject(aStream);
if (!stream) {
return NS_ERROR_NO_INTERFACE;
}
mRecognition->StartRecording(stream);
return NS_OK;
}
NS_IMPL_ISUPPORTS(SpeechRecognition::GetUserMediaErrorCallback, nsIDOMGetUserMediaErrorCallback)
NS_IMETHODIMP
SpeechRecognition::GetUserMediaErrorCallback::OnError(nsISupports* aError)
{
nsRefPtr<MediaStreamError> error = do_QueryObject(aError);
if (!error) {
return NS_OK;
}
SpeechRecognitionErrorCode errorCode;
nsString name;
error->GetName(name);
if (name.EqualsLiteral("PERMISSION_DENIED")) {
errorCode = SpeechRecognitionErrorCode::Not_allowed;
} else {
errorCode = SpeechRecognitionErrorCode::Audio_capture;
}
nsString message;
error->GetMessage(message);
mRecognition->DispatchError(SpeechRecognition::EVENT_AUDIO_ERROR, errorCode,
message);
return NS_OK;
}
} // namespace dom
} // namespace mozilla