gecko/media/webvtt/cuetext.c
2013-04-16 15:53:00 -07:00

744 lines
23 KiB
C

/**
* Copyright (c) 2013 Mozilla Foundation and Contributors
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* - Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
* A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
* HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
* LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
* THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdlib.h>
#include <string.h>
#include "parser_internal.h"
#include "cuetext_internal.h"
#include "node_internal.h"
#include "cue_internal.h"
#include "string_internal.h"
/**
 * Local minimum helper. Any earlier definition of min is discarded first
 * (undefining a name that is not a macro is harmless). Each argument appears
 * twice in the expansion, so callers must not pass expressions with side
 * effects.
 */
#undef min
#define min(a,b) ( (a) < (b) ? (a) : (b) )
/**
 * ERROR macro used for webvtt_parse_cuetext.
 *
 * When the parser has an error callback installed, the callback is invoked
 * with the parser's userdata, the `line` and `col` variables of the
 * expansion site, and `code`. A negative callback result makes the enclosing
 * function return WEBVTT_PARSE_ERROR. `self`, `line` and `col` must be in
 * scope wherever the macro is expanded.
 */
#undef ERROR
#define ERROR(code) \
  do { \
    if( self->error && self->error( self->userdata, line, col, code ) < 0 ) { \
      return WEBVTT_PARSE_ERROR; \
    } \
  } while(0)
/**
 * Macros for return statuses based on memory operations.
 * This is to avoid many if statements checking for multiple memory operation
 * return statuses in functions.
 *
 * Both macros evaluate their status expression exactly once (a failing call
 * is never repeated) and expand to a single statement, so they are safe in
 * unbraced if/else bodies. Terminate every use with a semicolon.
 *
 * CHECK_MEMORY_OP(status)
 *   Returns the status from the enclosing function when it is not
 *   WEBVTT_SUCCESS.
 *
 * CHECK_MEMORY_OP_JUMP(status_var, returned_status)
 *   When returned_status is not WEBVTT_SUCCESS, stores it in status_var and
 *   jumps to the `dealloc` label, which the enclosing function must define.
 */
#define CHECK_MEMORY_OP(status) \
  do { \
    webvtt_status check_memory_op_status_ = (status); \
    if( check_memory_op_status_ != WEBVTT_SUCCESS ) { \
      return check_memory_op_status_; \
    } \
  } while(0)

#define CHECK_MEMORY_OP_JUMP(status_var, returned_status) \
  do { \
    webvtt_status check_memory_op_status_ = (returned_status); \
    if( check_memory_op_status_ != WEBVTT_SUCCESS ) { \
      (status_var) = check_memory_op_status_; \
      goto dealloc; \
    } \
  } while(0)

/**
 * Allocates a cue text token with webvtt_alloc0 and stamps it with the
 * requested type.
 *
 * token      - receives the new token on success; not written on failure.
 * token_type - value stored in the new token's token_type field.
 *
 * Returns WEBVTT_SUCCESS, or WEBVTT_OUT_OF_MEMORY when the allocation yields
 * a null pointer.
 */
WEBVTT_INTERN webvtt_status
webvtt_create_token( webvtt_cuetext_token **token, webvtt_token_type token_type )
{
  webvtt_cuetext_token *fresh = (webvtt_cuetext_token *)webvtt_alloc0( sizeof(*fresh) );

  if( fresh ) {
    fresh->token_type = token_type;
    *token = fresh;
    return WEBVTT_SUCCESS;
  }

  return WEBVTT_OUT_OF_MEMORY;
}
/**
 * Builds a START_TOKEN carrying a tag name, a class list and an annotation.
 *
 * token       - receives the new token.
 * tag_name    - copied into the token with webvtt_copy_string.
 * css_classes - copied into the token's start data with
 *               webvtt_copy_stringlist.
 * annotation  - copied into the token's start data with webvtt_copy_string.
 *
 * Returns the failure status of webvtt_create_token if the token could not
 * be created, otherwise WEBVTT_SUCCESS.
 */
WEBVTT_INTERN webvtt_status
webvtt_create_start_token( webvtt_cuetext_token **token, webvtt_string *tag_name,
                           webvtt_stringlist *css_classes, webvtt_string *annotation )
{
  webvtt_start_token_data start_data;
  webvtt_status status = webvtt_create_token( token, START_TOKEN );

  if( WEBVTT_FAILED( status ) ) {
    return status;
  }

  webvtt_copy_string( &(*token)->tag_name, tag_name );

  /* Fill the start data locally, then store it in the token in one go. */
  webvtt_copy_stringlist( &start_data.css_classes, css_classes );
  webvtt_copy_string( &start_data.annotations, annotation );
  (*token)->start_token_data = start_data;

  return WEBVTT_SUCCESS;
}
/**
 * Builds an END_TOKEN for the given tag name.
 *
 * token    - receives the new token.
 * tag_name - copied into the token with webvtt_copy_string.
 *
 * Returns the failure status of webvtt_create_token if the token could not
 * be created, otherwise WEBVTT_SUCCESS.
 */
WEBVTT_INTERN webvtt_status
webvtt_create_end_token( webvtt_cuetext_token **token, webvtt_string *tag_name )
{
  webvtt_status status = webvtt_create_token( token, END_TOKEN );

  if( !WEBVTT_FAILED( status ) ) {
    webvtt_copy_string( &(*token)->tag_name, tag_name );
    return WEBVTT_SUCCESS;
  }

  return status;
}
/**
 * Builds a TEXT_TOKEN holding the given text.
 *
 * token - receives the new token.
 * text  - copied into the token with webvtt_copy_string.
 *
 * Returns the failure status of webvtt_create_token if the token could not
 * be created, otherwise WEBVTT_SUCCESS.
 */
WEBVTT_INTERN webvtt_status
webvtt_create_text_token( webvtt_cuetext_token **token, webvtt_string *text )
{
  webvtt_status status = webvtt_create_token( token, TEXT_TOKEN );

  if( !WEBVTT_FAILED( status ) ) {
    webvtt_copy_string( &(*token)->text, text );
    return WEBVTT_SUCCESS;
  }

  return status;
}
/**
 * Builds a TIME_STAMP_TOKEN holding the given time stamp.
 *
 * token      - receives the new token.
 * time_stamp - value stored in the token's time_stamp field.
 *
 * Returns the failure status of webvtt_create_token if the token could not
 * be created, otherwise WEBVTT_SUCCESS.
 */
WEBVTT_INTERN webvtt_status
webvtt_create_timestamp_token( webvtt_cuetext_token **token, webvtt_timestamp time_stamp )
{
  webvtt_status status = webvtt_create_token( token, TIME_STAMP_TOKEN );

  if( !WEBVTT_FAILED( status ) ) {
    (*token)->time_stamp = time_stamp;
    return WEBVTT_SUCCESS;
  }

  return status;
}
/**
 * Releases the data held by a token, frees the token and clears the
 * caller's pointer.
 *
 * token - address of the token pointer. A null address or a null token is
 *         ignored.
 */
WEBVTT_INTERN void
webvtt_delete_token( webvtt_cuetext_token **token )
{
  webvtt_cuetext_token *doomed;
  webvtt_start_token_data start_data;

  if( !token || !*token ) {
    return;
  }
  doomed = *token;

  switch( doomed->token_type ) {
    case START_TOKEN:
      /* The start data is released through a local copy of the struct. */
      start_data = doomed->start_token_data;
      webvtt_release_stringlist( &start_data.css_classes );
      webvtt_release_string( &start_data.annotations );
      webvtt_release_string( &doomed->tag_name );
      break;
    case END_TOKEN:
      webvtt_release_string( &doomed->tag_name );
      break;
    case TEXT_TOKEN:
      webvtt_release_string( &doomed->text );
      break;
    default:
      /**
       * Time stamp tokens do not need to free any internal data because
       * they do not allocate anything.
       */
      break;
  }

  webvtt_free( doomed );
  *token = 0;
}
/**
 * Reports whether a start tag may carry an annotation. Only the tag name
 * "v" qualifies.
 *
 * Returns the result of comparing tag_name against "v" with
 * webvtt_string_is_equal.
 */
WEBVTT_INTERN int
tag_accepts_annotation( webvtt_string *tag_name )
{
  int is_voice_tag = webvtt_string_is_equal( tag_name, ( webvtt_byte * )"v", 1 );

  return is_voice_tag;
}
/**
 * Maps a cue text tag name to the node kind it creates.
 *
 * Recognized names: "b", "i", "u", "c", "v", "ruby" and "rt".
 *
 * tag_name - tag name to look up.
 * kind     - receives the node kind; written only when WEBVTT_SUCCESS is
 *            returned.
 *
 * Returns WEBVTT_INVALID_PARAM if either pointer is null,
 * WEBVTT_INVALID_TAG_NAME for any unrecognized name, otherwise
 * WEBVTT_SUCCESS.
 */
WEBVTT_INTERN webvtt_status
webvtt_node_kind_from_tag_name( webvtt_string *tag_name, webvtt_node_kind *kind )
{
  if( !tag_name || !kind ) {
    return WEBVTT_INVALID_PARAM;
  }

  if( webvtt_string_length( tag_name ) == 1 ) {
    switch( webvtt_string_text( tag_name )[0] ) {
      case 'b':
        *kind = WEBVTT_BOLD;
        break;
      case 'i':
        *kind = WEBVTT_ITALIC;
        break;
      case 'u':
        *kind = WEBVTT_UNDERLINE;
        break;
      case 'c':
        *kind = WEBVTT_CLASS;
        break;
      case 'v':
        *kind = WEBVTT_VOICE;
        break;
      default:
        /**
         * Unknown one-character name: report it rather than returning
         * success with *kind left unwritten.
         */
        return WEBVTT_INVALID_TAG_NAME;
    }
  } else if( webvtt_string_is_equal( tag_name, ( webvtt_byte * )"ruby", 4 ) ) {
    *kind = WEBVTT_RUBY;
  } else if( webvtt_string_is_equal( tag_name, ( webvtt_byte * )"rt", 2 ) ) {
    *kind = WEBVTT_RUBY_TEXT;
  } else {
    return WEBVTT_INVALID_TAG_NAME;
  }

  return WEBVTT_SUCCESS;
}
/**
 * Creates the node that corresponds to a token.
 *
 * token  - token to convert. Text, start and time stamp tokens are
 *          supported.
 * node   - receives the new node; must point at a null pointer on entry.
 * parent - passed as the parent to the node constructor.
 *
 * Returns WEBVTT_INVALID_PARAM if any argument is null, WEBVTT_UNSUCCESSFUL
 * if *node is already set, WEBVTT_INVALID_TOKEN_TYPE for any other token
 * type, the status of the tag name lookup if a start token's name is
 * rejected, and otherwise the status of the node constructor that was
 * called.
 */
WEBVTT_INTERN webvtt_status
webvtt_create_node_from_token( webvtt_cuetext_token *token, webvtt_node **node, webvtt_node *parent )
{
  webvtt_node_kind kind;

  if( !( token && node && parent ) ) {
    return WEBVTT_INVALID_PARAM;
  }

  /**
   * A non-null *node may still be owned by the caller. Overwriting it could
   * leak it, so refuse instead.
   */
  if( *node ) {
    return WEBVTT_UNSUCCESSFUL;
  }

  if( token->token_type == TEXT_TOKEN ) {
    return webvtt_create_text_node( node, parent, &token->text );
  }

  if( token->token_type == TIME_STAMP_TOKEN ) {
    return webvtt_create_timestamp_node( node, parent, token->time_stamp );
  }

  if( token->token_type == START_TOKEN ) {
    CHECK_MEMORY_OP( webvtt_node_kind_from_tag_name( &token->tag_name, &kind ) );
    return webvtt_create_internal_node( node, parent, kind,
                                        token->start_token_data.css_classes,
                                        &token->start_token_data.annotations );
  }

  return WEBVTT_INVALID_TOKEN_TYPE;
}
/**
 * DATA state of the cue text tokenizer: collects plain text into result.
 *
 * position    - cursor into the cue text; advanced past every byte consumed.
 * token_state - current tokenizer state; switched to ESCAPE on '&', and to
 *               TAG on '<' when no text has been collected yet.
 * result      - receives the text bytes.
 *
 * Returns WEBVTT_SUCCESS when the text token is complete (end of input, or
 * a '<' following collected text; the cursor stays on that byte),
 * WEBVTT_UNFINISHED after a state change, or the failure status of the
 * append.
 */
WEBVTT_INTERN webvtt_status
webvtt_data_state( webvtt_byte **position, webvtt_token_state *token_state,
                   webvtt_string *result )
{
  while( *token_state == DATA ) {
    const webvtt_byte current = **position;

    if( current == '\0' ) {
      return WEBVTT_SUCCESS;
    }

    if( current == '&' ) {
      *token_state = ESCAPE;
    } else if( current == '<' ) {
      /* A '<' after collected text ends the text token without consuming it. */
      if( webvtt_string_length( result ) != 0 ) {
        return WEBVTT_SUCCESS;
      }
      *token_state = TAG;
    } else {
      CHECK_MEMORY_OP( webvtt_string_putc( result, current ) );
    }

    (*position)++;
  }

  return WEBVTT_UNFINISHED;
}
/**
 * Definitions for escape sequence replacement strings.
 *
 * Byte sequences that the ESCAPE state appends in place of the "&rlm;",
 * "&lrm;" and "&nbsp;" escapes, with the length of each sequence. The
 * bytes come from the UTF8_* constants.
 */
#define RLM_REPLACE_LENGTH 3
#define LRM_REPLACE_LENGTH 3
#define NBSP_REPLACE_LENGTH 2

webvtt_byte rlm_replace[RLM_REPLACE_LENGTH] = {
  UTF8_RIGHT_TO_LEFT_1,
  UTF8_RIGHT_TO_LEFT_2,
  UTF8_RIGHT_TO_LEFT_3
};

webvtt_byte lrm_replace[LRM_REPLACE_LENGTH] = {
  UTF8_LEFT_TO_RIGHT_1,
  UTF8_LEFT_TO_RIGHT_2,
  UTF8_LEFT_TO_RIGHT_3
};

webvtt_byte nbsp_replace[NBSP_REPLACE_LENGTH] = {
  UTF8_NO_BREAK_SPACE_1,
  UTF8_NO_BREAK_SPACE_2
};
/**
 * ESCAPE state of the cue text tokenizer: interprets an "&...;" sequence.
 *
 * The raw sequence is gathered in a scratch buffer, starting with the '&'
 * that the DATA state consumed. On ';' the buffer is compared against the
 * known escapes (&amp; &lt; &gt; &rlm; &lrm; &nbsp;) and either the
 * replacement or the raw text is appended to result.
 *
 * position    - cursor into the cue text; advanced past every byte consumed.
 * token_state - current tokenizer state; set back to DATA once the sequence
 *               is finished or found to be malformed.
 * result      - text collected so far for the current token.
 *
 * Returns WEBVTT_SUCCESS when the text token ends inside the sequence (end
 * of input or '<'; the cursor stays on that byte), WEBVTT_UNFINISHED after
 * switching back to DATA, or the failure status of a string operation.
 */
WEBVTT_INTERN webvtt_status
webvtt_escape_state( webvtt_byte **position, webvtt_token_state *token_state,
                     webvtt_string *result )
{
  webvtt_string buffer;
  webvtt_status status = WEBVTT_SUCCESS;

  /**
   * Every exit goes through dealloc, which releases buffer. Initialize it
   * first so the release is well defined even if the create below fails.
   */
  webvtt_init_string( &buffer );
  CHECK_MEMORY_OP_JUMP( status, webvtt_create_string( 1, &buffer ) );

  /**
   * Append ampersand here because the algorithm is not able to add it to the
   * buffer when it reads it in the DATA state tokenizer.
   */
  CHECK_MEMORY_OP_JUMP( status, webvtt_string_putc( &buffer, '&' ) );

  for( ; *token_state == ESCAPE; (*position)++ ) {
    /**
     * We have encountered a token termination point.
     * Append buffer to result and return success.
     */
    if( **position == '\0' || **position == '<' ) {
      CHECK_MEMORY_OP_JUMP( status, webvtt_string_append_string( result, &buffer ) );
      goto dealloc;
    }
    /**
     * This means we have encountered a malformed escape character sequence.
     * This means that we need to add that malformed text to the result and
     * recreate the buffer to prepare for a new escape sequence.
     */
    else if( **position == '&' ) {
      CHECK_MEMORY_OP_JUMP( status, webvtt_string_append_string( result, &buffer ) );
      webvtt_release_string( &buffer );
      /* Re-initialize so a failed create cannot lead to a second release. */
      webvtt_init_string( &buffer );
      CHECK_MEMORY_OP_JUMP( status, webvtt_create_string( 1, &buffer ) );
      CHECK_MEMORY_OP_JUMP( status, webvtt_string_putc( &buffer, **position ) );
    }
    /**
     * We've encountered the semicolon which is the end of an escape sequence.
     * Check if buffer contains a valid escape sequence and if it does append
     * the interpretation to result and change the state to DATA.
     */
    else if( **position == ';' ) {
      if( webvtt_string_is_equal( &buffer, ( webvtt_byte * )"&amp", 4 ) ) {
        CHECK_MEMORY_OP_JUMP( status, webvtt_string_putc( result, '&' ) );
      } else if( webvtt_string_is_equal( &buffer, ( webvtt_byte * )"&lt", 3 ) ) {
        CHECK_MEMORY_OP_JUMP( status, webvtt_string_putc( result, '<' ) );
      } else if( webvtt_string_is_equal( &buffer, ( webvtt_byte * )"&gt", 3 ) ) {
        CHECK_MEMORY_OP_JUMP( status, webvtt_string_putc( result, '>' ) );
      } else if( webvtt_string_is_equal( &buffer, ( webvtt_byte * )"&rlm", 4 ) ) {
        CHECK_MEMORY_OP_JUMP( status, webvtt_string_append( result, rlm_replace, RLM_REPLACE_LENGTH ) );
      } else if( webvtt_string_is_equal( &buffer, ( webvtt_byte * )"&lrm", 4 ) ) {
        CHECK_MEMORY_OP_JUMP( status, webvtt_string_append( result, lrm_replace, LRM_REPLACE_LENGTH ) );
      } else if( webvtt_string_is_equal( &buffer, ( webvtt_byte * )"&nbsp", 5 ) ) {
        CHECK_MEMORY_OP_JUMP( status, webvtt_string_append( result, nbsp_replace, NBSP_REPLACE_LENGTH ) );
      } else {
        /* Unknown escape: emit it verbatim, including the ';'. */
        CHECK_MEMORY_OP_JUMP( status, webvtt_string_append_string( result, &buffer ) );
        CHECK_MEMORY_OP_JUMP( status, webvtt_string_putc( result, **position ) );
      }
      *token_state = DATA;
      status = WEBVTT_UNFINISHED;
    }
    /**
     * Character is alphanumeric. This means we are in the body of the escape
     * sequence.
     */
    else if( webvtt_isalphanum( **position ) ) {
      CHECK_MEMORY_OP_JUMP( status, webvtt_string_putc( &buffer, **position ) );
    }
    /**
     * If we have not found an alphanumeric character then we have encountered
     * a malformed escape sequence. Add buffer to result and continue to parse
     * in DATA state.
     */
    else {
      CHECK_MEMORY_OP_JUMP( status, webvtt_string_append_string( result, &buffer ) );
      CHECK_MEMORY_OP_JUMP( status, webvtt_string_putc( result, **position ) );
      status = WEBVTT_UNFINISHED;
      *token_state = DATA;
    }
  }

dealloc:
  webvtt_release_string( &buffer );
  return status;
}
/**
 * TAG state of the cue text tokenizer: looks at the first byte after '<' to
 * decide which kind of tag follows.
 *
 * position    - cursor into the cue text; advanced past every byte consumed.
 * token_state - current tokenizer state. Whitespace selects
 *               START_TAG_ANNOTATION, '.' selects START_TAG_CLASS, '/'
 *               selects END_TAG, a digit selects TIME_STAMP_TAG and any
 *               other byte selects START_TAG.
 * result      - receives the first byte of a tag name or time stamp.
 *
 * Returns WEBVTT_SUCCESS on '>' or end of input (the cursor stays on that
 * byte), WEBVTT_UNFINISHED after a state change, or the failure status of
 * the append.
 */
WEBVTT_INTERN webvtt_status
webvtt_tag_state( webvtt_byte **position, webvtt_token_state *token_state,
                  webvtt_string *result )
{
  for( ; *token_state == TAG; (*position)++ ) {
    switch( **position ) {
      case '\t':
      case '\n':
      case '\r':
      case '\f':
      case ' ':
        *token_state = START_TAG_ANNOTATION;
        break;
      case '.':
        *token_state = START_TAG_CLASS;
        break;
      case '/':
        *token_state = END_TAG;
        break;
      case '>':
      case '\0':
        return WEBVTT_SUCCESS;
      default: {
        /* Classify before appending; the state changes only if the append works. */
        const int starts_time_stamp = webvtt_isdigit( **position ) ? 1 : 0;

        CHECK_MEMORY_OP( webvtt_string_putc( result, **position ) );
        *token_state = starts_time_stamp ? TIME_STAMP_TAG : START_TAG;
        break;
      }
    }
  }

  return WEBVTT_UNFINISHED;
}
/**
 * START_TAG state of the cue text tokenizer: collects the tag name.
 *
 * position    - cursor into the cue text; advanced past every byte consumed.
 * token_state - current tokenizer state. Whitespace selects
 *               START_TAG_ANNOTATION and '.' selects START_TAG_CLASS.
 * result      - receives the tag name bytes.
 *
 * Returns WEBVTT_SUCCESS on '>' or end of input (the cursor stays on that
 * byte), WEBVTT_UNFINISHED after a state change, or the failure status of
 * the append.
 */
WEBVTT_INTERN webvtt_status
webvtt_start_tag_state( webvtt_byte **position, webvtt_token_state *token_state,
                        webvtt_string *result )
{
  for( ; *token_state == START_TAG; (*position)++ ) {
    switch( **position ) {
      case '\t':
      case '\f':
      case ' ':
      case '\n':
      case '\r':
        *token_state = START_TAG_ANNOTATION;
        break;
      case '.':
        *token_state = START_TAG_CLASS;
        break;
      case '>':
      case '\0':
        /**
         * End of input ends the tag just like '>' does, as in the other tag
         * states. Without this case the terminator would be appended to the
         * name and the cursor would run past the end of the cue text.
         */
        return WEBVTT_SUCCESS;
      default:
        CHECK_MEMORY_OP( webvtt_string_putc( result, **position ) );
        break;
    }
  }

  return WEBVTT_UNFINISHED;
}
/**
 * START_TAG_CLASS state of the cue text tokenizer: collects the
 * '.'-separated class names of a start tag.
 *
 * position    - cursor into the cue text; advanced past every byte consumed.
 * token_state - current tokenizer state; set to START_TAG_ANNOTATION when
 *               whitespace ends the class list.
 * css_classes - receives one entry per class name.
 *
 * Returns WEBVTT_SUCCESS on '>' or end of input (the cursor stays on that
 * byte), WEBVTT_UNFINISHED after switching to START_TAG_ANNOTATION (the
 * whitespace is consumed so the annotation starts after it, as in the other
 * tag states), or the failure status of a string or list operation.
 */
WEBVTT_INTERN webvtt_status
webvtt_class_state( webvtt_byte **position, webvtt_token_state *token_state,
                    webvtt_stringlist *css_classes )
{
  webvtt_string buffer;
  webvtt_status status = WEBVTT_SUCCESS;

  CHECK_MEMORY_OP( webvtt_create_string( 1, &buffer ) );

  for( ; *token_state == START_TAG_CLASS; (*position)++ ) {
    if( **position == '\t' || **position == '\f' ||
        **position == ' ' || **position == '\n' ||
        **position == '\r' ) {
      CHECK_MEMORY_OP_JUMP( status, webvtt_stringlist_push( css_classes, &buffer ) );
      /**
       * The tag is not finished: the annotation still has to be read. The
       * loop increment consumes the whitespace, the loop condition then
       * fails, and buffer is released at dealloc.
       */
      *token_state = START_TAG_ANNOTATION;
      status = WEBVTT_UNFINISHED;
    } else if( **position == '>' || **position == '\0' ) {
      CHECK_MEMORY_OP_JUMP( status, webvtt_stringlist_push( css_classes, &buffer ) );
      /* status is still WEBVTT_SUCCESS here. */
      goto dealloc;
    } else if( **position == '.' ) {
      /* Store the finished class name and start a new, empty one. */
      CHECK_MEMORY_OP_JUMP( status, webvtt_stringlist_push( css_classes, &buffer ) );
      webvtt_release_string( &buffer );
      /* buffer is already released, so a failure here returns directly. */
      CHECK_MEMORY_OP( webvtt_create_string( 1, &buffer ) );
    } else {
      CHECK_MEMORY_OP_JUMP( status, webvtt_string_putc( &buffer, **position ) );
    }
  }

dealloc:
  webvtt_release_string( &buffer );
  return status;
}
/**
 * START_TAG_ANNOTATION state of the cue text tokenizer: copies bytes into
 * the annotation until the tag ends.
 *
 * position    - cursor into the cue text; advanced past every byte copied.
 * token_state - current tokenizer state; never modified here.
 * annotation  - receives the annotation bytes.
 *
 * Returns WEBVTT_SUCCESS on '>' or end of input (the cursor stays on that
 * byte), the failure status of the append, or WEBVTT_UNFINISHED if the
 * tokenizer is not in the START_TAG_ANNOTATION state.
 */
WEBVTT_INTERN webvtt_status
webvtt_annotation_state( webvtt_byte **position, webvtt_token_state *token_state,
                         webvtt_string *annotation )
{
  while( *token_state == START_TAG_ANNOTATION ) {
    const webvtt_byte current = **position;

    if( current == '>' || current == '\0' ) {
      return WEBVTT_SUCCESS;
    }

    CHECK_MEMORY_OP( webvtt_string_putc( annotation, current ) );
    (*position)++;
  }

  return WEBVTT_UNFINISHED;
}
/**
 * END_TAG state of the cue text tokenizer: copies the name of an end tag
 * into result.
 *
 * position    - cursor into the cue text; advanced past every byte copied.
 * token_state - current tokenizer state; never modified here.
 * result      - receives the tag name bytes.
 *
 * Returns WEBVTT_SUCCESS on '>' or end of input (the cursor stays on that
 * byte), the failure status of the append, or WEBVTT_UNFINISHED if the
 * tokenizer is not in the END_TAG state.
 */
WEBVTT_INTERN webvtt_status
webvtt_end_tag_state( webvtt_byte **position, webvtt_token_state *token_state,
                      webvtt_string *result )
{
  while( *token_state == END_TAG ) {
    const webvtt_byte current = **position;

    if( current == '\0' || current == '>' ) {
      return WEBVTT_SUCCESS;
    }

    CHECK_MEMORY_OP( webvtt_string_putc( result, current ) );
    (*position)++;
  }

  return WEBVTT_UNFINISHED;
}
/**
 * TIME_STAMP_TAG state of the cue text tokenizer: copies the text of a time
 * stamp tag into result.
 *
 * position    - cursor into the cue text; advanced past every byte copied.
 * token_state - current tokenizer state; never modified here.
 * result      - receives the time stamp bytes.
 *
 * Returns WEBVTT_SUCCESS on '>' or end of input (the cursor stays on that
 * byte), the failure status of the append, or WEBVTT_UNFINISHED if the
 * tokenizer is not in the TIME_STAMP_TAG state.
 */
WEBVTT_INTERN webvtt_status
webvtt_timestamp_state( webvtt_byte **position, webvtt_token_state *token_state,
                        webvtt_string *result )
{
  while( *token_state == TIME_STAMP_TAG ) {
    const webvtt_byte current = **position;

    if( current == '\0' || current == '>' ) {
      return WEBVTT_SUCCESS;
    }

    CHECK_MEMORY_OP( webvtt_string_putc( result, current ) );
    (*position)++;
  }

  return WEBVTT_UNFINISHED;
}
/**
 * Reads one token from the cue text.
 *
 * The tokenizer starts in the DATA state and hands control to the state
 * function that matches the current state until one of them reports
 * something other than WEBVTT_UNFINISHED. The state it stopped in decides
 * which kind of token is created.
 *
 * position - cursor into the NUL-terminated cue text; advanced past the
 *            bytes that were consumed, including a closing '>'.
 * token    - receives the new token on success.
 *
 * Returns WEBVTT_INVALID_PARAM if position, *position or token is null,
 * WEBVTT_OUT_OF_MEMORY or another failure status if a working buffer cannot
 * be created, the failure status of a state function,
 * WEBVTT_INVALID_TOKEN_STATE for an unknown state, and otherwise the status
 * of the token constructor.
 */
WEBVTT_INTERN webvtt_status
webvtt_cuetext_tokenizer( webvtt_byte **position, webvtt_cuetext_token **token )
{
  webvtt_token_state token_state = DATA;
  webvtt_string result, annotation;
  webvtt_stringlist *css_classes = NULL;
  webvtt_timestamp time_stamp = 0;
  webvtt_status status;

  if( !position || !*position || !token ) {
    return WEBVTT_INVALID_PARAM;
  }

  /**
   * Create the working buffers and stop at the first failure. The labels at
   * the end release only what was actually created.
   */
  if( WEBVTT_FAILED( status = webvtt_create_string( 10, &result ) ) ) {
    return status;
  }
  if( WEBVTT_FAILED( status = webvtt_create_string( 10, &annotation ) ) ) {
    goto release_result;
  }
  webvtt_create_stringlist( &css_classes );
  if( !css_classes ) {
    /* A list pointer that is still null means no list was created. */
    status = WEBVTT_OUT_OF_MEMORY;
    goto release_annotation;
  }

  /**
   * Loop while the tokenizer is not finished.
   * Based on the state of the tokenizer enter a function to handle that
   * particular tokenizer state. Those functions will loop until they either
   * change the state of the tokenizer or reach a valid token end point.
   */
  status = WEBVTT_UNFINISHED;
  while( status == WEBVTT_UNFINISHED ) {
    switch( token_state ) {
      case DATA:
        status = webvtt_data_state( position, &token_state, &result );
        break;
      case ESCAPE:
        status = webvtt_escape_state( position, &token_state, &result );
        break;
      case TAG:
        status = webvtt_tag_state( position, &token_state, &result );
        break;
      case START_TAG:
        status = webvtt_start_tag_state( position, &token_state, &result );
        break;
      case START_TAG_CLASS:
        status = webvtt_class_state( position, &token_state, css_classes );
        break;
      case START_TAG_ANNOTATION:
        status = webvtt_annotation_state( position, &token_state, &annotation );
        break;
      case END_TAG:
        status = webvtt_end_tag_state( position, &token_state, &result );
        break;
      case TIME_STAMP_TAG:
        status = webvtt_timestamp_state( position, &token_state, &result );
        break;
      default:
        /* An unknown state would otherwise spin here forever. */
        status = WEBVTT_INVALID_TOKEN_STATE;
        break;
    }
  }

  /* The state functions stop on the closing '>' without consuming it. */
  if( **position == '>' ) {
    (*position)++;
  }

  if( status == WEBVTT_SUCCESS ) {
    /**
     * The state that the tokenizer left off on will tell us what kind of token
     * needs to be made.
     */
    if( token_state == DATA || token_state == ESCAPE ) {
      status = webvtt_create_text_token( token, &result );
    } else if( token_state == TAG || token_state == START_TAG ||
               token_state == START_TAG_CLASS ||
               token_state == START_TAG_ANNOTATION ) {
      /**
       * If the tag does not accept an annotation then release the current
       * annotation and initialize annotation to a safe empty state
       */
      if( !tag_accepts_annotation( &result ) ) {
        webvtt_release_string( &annotation );
        webvtt_init_string( &annotation );
      }
      status = webvtt_create_start_token( token, &result, css_classes, &annotation );
    } else if( token_state == END_TAG ) {
      status = webvtt_create_end_token( token, &result );
    } else if( token_state == TIME_STAMP_TAG ) {
      /**
       * NOTE(review): the result of parse_timestamp is not checked; if it
       * leaves time_stamp untouched on failure the token carries 0. Confirm
       * against parse_timestamp's contract.
       */
      parse_timestamp( webvtt_string_text( &result ), &time_stamp );
      status = webvtt_create_timestamp_token( token, time_stamp );
    } else {
      status = WEBVTT_INVALID_TOKEN_STATE;
    }
  }

  webvtt_release_stringlist( &css_classes );
release_annotation:
  webvtt_release_string( &annotation );
release_result:
  webvtt_release_string( &result );
  return status;
}
/**
 * Parses a cue's payload text into a tree of nodes rooted at
 * cue->node_head.
 *
 * Currently line and len are not being kept track of.
 * Don't think pnode_length is needed as nodes track their list count
 * internally.
 *
 * self     - parser; currently unused.
 * cue      - cue whose node_head receives the new head node.
 * payload  - cue text to parse.
 * finished - currently unused.
 *
 * Returns WEBVTT_INVALID_PARAM if cue or payload is null or the payload has
 * no text, the failure status of webvtt_create_head_node or of the
 * tokenizer, and otherwise WEBVTT_SUCCESS. Tokens that cannot be turned
 * into nodes are skipped.
 */
WEBVTT_INTERN webvtt_status
webvtt_parse_cuetext( webvtt_parser self, webvtt_cue *cue, webvtt_string *payload, int finished )
{
  const webvtt_byte *cue_text;
  webvtt_status status;
  webvtt_byte *position;
  webvtt_node *current_node;
  webvtt_node *temp_node = NULL;
  webvtt_cuetext_token *token = NULL;
  webvtt_node_kind kind;

  /**
   * TODO: Use these parameters! 'finished' isn't really important
   * here, but 'self' certainly is as it lets us report syntax errors.
   *
   * However, for the time being we can trick the compiler into not
   * warning us about unused variables by doing this.
   */
  ( void )self;
  ( void )finished;

  if( !cue || !payload ) {
    return WEBVTT_INVALID_PARAM;
  }

  cue_text = webvtt_string_text( payload );
  if( !cue_text ) {
    return WEBVTT_INVALID_PARAM;
  }

  if( WEBVTT_FAILED( status = webvtt_create_head_node( &cue->node_head ) ) ) {
    return status;
  }

  position = (webvtt_byte *)cue_text;
  current_node = cue->node_head;

  /**
   * Routine taken from the W3C specification
   * http://dev.w3.org/html5/webvtt/#webvtt-cue-text-parsing-rules
   */
  while( *position != '\0' ) {
    webvtt_delete_token( &token );

    /* Step 7. */
    status = webvtt_cuetext_tokenizer( &position, &token );
    if( WEBVTT_FAILED( status ) ) {
      /**
       * The tokenizer may fail before consuming any input, so retrying
       * could loop forever. Give up and report the failure.
       */
      webvtt_delete_token( &token );
      return status;
    }

    if( token->token_type == END_TOKEN ) {
      /**
       * We have encountered an end token but we are at the top of the list
       * and thus have not encountered any start tokens yet, throw away the
       * token.
       */
      if( current_node->kind == WEBVTT_HEAD_NODE ) {
        continue;
      }

      /**
       * Seed kind with a value that cannot match current_node here (it is
       * known not to be the head node), so an unwritten result can never be
       * compared as if it were valid.
       */
      kind = WEBVTT_HEAD_NODE;
      if( webvtt_node_kind_from_tag_name( &token->tag_name, &kind ) != WEBVTT_SUCCESS ) {
        /**
         * We have encountered an end token but it is not in a format that is
         * supported, throw away the token.
         */
        continue;
      }

      /**
       * We have encountered an end token and it matches the start token of
       * the node that we are currently on. Move back up the list of nodes
       * and continue parsing.
       */
      if( current_node->kind == kind ) {
        current_node = current_node->parent;
      }
    } else if( webvtt_create_node_from_token( token, &temp_node, current_node ) == WEBVTT_SUCCESS ) {
      /**
       * The token produced a node: attach it to the current node and, if it
       * is an internal node type, descend into it. Tokens that do not
       * produce a node are skipped.
       */
      webvtt_attach_node( current_node, temp_node );
      if( WEBVTT_IS_VALID_INTERNAL_NODE( temp_node->kind ) ) {
        current_node = temp_node;
      }
      /* Release the node as attach internal node increases the count. */
      webvtt_release_node( &temp_node );
      /* The next node creation requires an empty slot. */
      temp_node = NULL;
    }
  }

  webvtt_delete_token( &token );
  return WEBVTT_SUCCESS;
}