/* markdown.c - generic markdown parser */

/*
 * Copyright (c) 2009, Natacha Porté
 * Copyright (c) 2011, Vicent Marti
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include "markdown.h"
#include "stack.h"

/* string.h: strchr/memmove; ctype.h: isalnum/tolower; stdlib.h: calloc/free */
#include <assert.h>
#include <string.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>

#if defined(_WIN32)
#define strncasecmp _strnicmp
#endif

/* number of buckets in the link reference hash table */
#define REF_TABLE_SIZE 8

/* indices into sd_markdown.work_bufs */
#define BUFFER_BLOCK 0
#define BUFFER_SPAN 1

#define MKD_LI_END 8 /* internal list flag */

#define gperf_case_strncmp( s1, s2, n ) strncasecmp( s1, s2, n )
#define GPERF_DOWNCASE 1
#define GPERF_CASE_STRNCMP 1
#include "html_blocks.h"

/***************
 * LOCAL TYPES *
 ***************/

/* link_ref: reference to a link */
/* entries are chained per bucket through `next`; `id` is the hash of the name */
struct link_ref {
    unsigned int id;

    struct buf* link;
    struct buf* title;

    struct link_ref* next;
};

/* char_trigger: function pointer to render active chars */
/* returns the number of chars taken care of */
/* data is the pointer of the beginning of the span */
/* offset is the number of valid chars before data */
struct sd_markdown;
typedef size_t (* char_trigger)( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size );

static size_t char_emphasis( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size );
static size_t char_linebreak( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size );
static size_t char_codespan( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size );
static size_t char_escape( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size );
static size_t char_entity( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size );
static size_t char_langle_tag( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size );
static size_t char_autolink_url( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size );
static size_t char_autolink_email( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size );
static size_t char_autolink_www( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size );
static size_t char_link( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size );
static size_t char_superscript( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size );

/* action codes stored in sd_markdown.active_char; each value is the index
 * of the matching handler in markdown_char_ptrs (same order, 0 = inactive) */
enum markdown_char_t {
    MD_CHAR_NONE = 0,
    MD_CHAR_EMPHASIS,
    MD_CHAR_CODESPAN,
    MD_CHAR_LINEBREAK,
    MD_CHAR_LINK,
    MD_CHAR_LANGLE,
    MD_CHAR_ESCAPE,
    MD_CHAR_ENTITITY,
    MD_CHAR_AUTOLINK_URL,
    MD_CHAR_AUTOLINK_EMAIL,
    MD_CHAR_AUTOLINK_WWW,
    MD_CHAR_SUPERSCRIPT,
};

/* handler table indexed by enum markdown_char_t */
static char_trigger markdown_char_ptrs[] = {
    NULL,
    &char_emphasis,
    &char_codespan,
    &char_linebreak,
    &char_link,
    &char_langle_tag,
    &char_escape,
    &char_entity,
    &char_autolink_url,
    &char_autolink_email,
    &char_autolink_www,
    &char_superscript,
};

/* render • structure containing one particular render */
struct sd_markdown {
    struct sd_callbacks cb;                 /* rendering callbacks */
    void* opaque;                           /* passed back to every callback */

    struct link_ref* refs[REF_TABLE_SIZE];  /* hash table of link references */
    uint8_t active_char[256];               /* byte -> enum markdown_char_t */
    struct stack work_bufs[2];              /* BUFFER_BLOCK / BUFFER_SPAN pools */
    unsigned int ext_flags;                 /* MKDEXT_* extension bits */
    size_t max_nesting;                     /* limit checked by parse_inline */
    int in_link_body;                       /* non-zero while parsing link text */
};

/***************************
 * HELPER FUNCTIONS *
 ***************************/

/* rndr_newbuf • hands out a working buffer from the pool selected by type */
/* a previously allocated buffer sitting above the current top of the pool is
 * reused (and emptied); otherwise a fresh one is allocated and pushed */
/* the result of bufnew is not checked here */
static inline struct buf* rndr_newbuf( struct sd_markdown* rndr, int type )
{
    static const size_t buf_size[2] = { 256, 64 };
    struct buf* work = NULL;
    struct stack* pool = &rndr->work_bufs[type];

    if( pool->size < pool->asize && pool->item[pool->size] != NULL ) {
        work = pool->item[pool->size++];
        work->size = 0;
    }
    else {
        work = bufnew( buf_size[type] );
        stack_push( pool, work );
    }

    return work;
}

/* rndr_popbuf • gives back the most recent buffer of the given pool */
/* only the pool size is decremented; the buffer itself stays allocated */
static inline void rndr_popbuf( struct sd_markdown* rndr, int type )
{
    rndr->work_bufs[type].size--;
}

/* unscape_text • copies src into ob, dropping each backslash and keeping */
/* the character that follows it; a trailing lone backslash is dropped */
static void unscape_text( struct buf* ob, struct buf* src )
{
    size_t i = 0, org;

    while( i < src->size ) {
        /* copy the run of plain characters up to the next backslash */
        org = i;
        while( i < src->size && src->data[i] != '\\' )
            i++;

        if( i > org )
            bufput( ob, src->data + org, i - org );

        if( i + 1 >= src->size )
            break;

        bufputc( ob, src->data[i + 1] );
        i += 2;
    }
}

/* hash_link_ref • case-insensitive hash of a reference name */
static unsigned int hash_link_ref( const uint8_t* link_ref, size_t length )
{
    size_t i;
    unsigned int hash = 0;

    for( i = 0; i < length; ++i )
        hash = tolower( link_ref[i] ) + (hash << 6) + (hash << 16) - hash;

    return hash;
}

/* add_link_ref • allocates a zeroed link_ref for name and inserts it at the */
/* head of its bucket; returns NULL when the allocation fails */
static struct link_ref* add_link_ref( struct link_ref** references, const uint8_t* name, size_t name_size )
{
    struct link_ref* ref = calloc( 1, sizeof(struct link_ref) );

    if( !ref )
        return NULL;

    ref->id = hash_link_ref( name, name_size );
    ref->next = references[ref->id % REF_TABLE_SIZE];

    references[ref->id % REF_TABLE_SIZE] = ref;
    return ref;
}

/* find_link_ref • looks a reference up by name, or returns NULL */
/* entries are matched on the full hash value only, not on the name bytes */
static struct link_ref* find_link_ref( struct link_ref** references, uint8_t* name, size_t length )
{
    unsigned int hash = hash_link_ref( name, length );
    struct link_ref* ref = NULL;

    ref = references[hash % REF_TABLE_SIZE];

    while( ref != NULL ) {
        if( ref->id == hash )
            return ref;

        ref = ref->next;
    }

    return NULL;
}

/* free_link_refs • releases every entry of the table with its buffers */
/* the bucket heads themselves are left untouched */
static void free_link_refs( struct link_ref** references )
{
    size_t i;

    for( i = 0; i < REF_TABLE_SIZE; ++i ) {
        struct link_ref* r = references[i];
        struct link_ref* next;

        while( r ) {
            next = r->next;
            bufrelease( r->link );
            bufrelease( r->title );
            free( r );
            r = next;
        }
    }
}

/*
 * Check whether a char is a Markdown space.
* * Right now we only consider spaces the actual * space and a newline: tabs and carriage returns * are filtered out during the preprocessing phase. * * If we wanted to actually be UTF-8 compliant, we * should instead extract an Unicode codepoint from * this character and check for space properties. */ static inline int _isspace( int c ) { return c == ' ' || c == '\n'; } /**************************** * INLINE PARSING FUNCTIONS * ****************************/ /* is_mail_autolink • looks for the address part of a mail autolink and '>' */ /* this is less strict than the original markdown e-mail address matching */ static size_t is_mail_autolink( uint8_t* data, size_t size ) { size_t i = 0, nb = 0; /* address is assumed to be: [-@._a-zA-Z0-9]+ with exactly one '@' */ for( i = 0; i < size; ++i ) { if( isalnum( data[i] ) ) continue; switch( data[i] ) { case '@': nb++; case '-': case '.': case '_': break; case '>': return (nb == 1) ? i + 1 : 0; default: return 0; } } return 0; } /* tag_length • returns the length of the given tag, or 0 is it's not valid */ static size_t tag_length( uint8_t* data, size_t size, enum mkd_autolink* autolink ) { size_t i, j; /* a valid tag can't be shorter than 3 chars */ if( size < 3 ) return 0; /* begins with a '<' optionally followed by '/', followed by letter or number */ if( data[0] != '<' ) return 0; i = (data[1] == '/') ? 2 : 1; if( !isalnum( data[i] ) ) return 0; /* scheme test */ *autolink = MKDA_NOT_AUTOLINK; /* try to find the beginning of an URI */ while( i < size && (isalnum( data[i] ) || data[i] == '.' 
|| data[i] == '+' || data[i] == '-') ) i++; if( i > 1 && data[i] == '@' ) { if( ( j = is_mail_autolink( data + i, size - i ) ) != 0 ) { *autolink = MKDA_EMAIL; return i + j; } } if( i > 2 && data[i] == ':' ) { *autolink = MKDA_NORMAL; i++; } /* completing autolink test: no whitespace or ' or " */ if( i >= size ) *autolink = MKDA_NOT_AUTOLINK; else if( *autolink ) { j = i; while( i < size ) { if( data[i] == '\\' ) i += 2; else if( data[i] == '>' || data[i] == '\'' || data[i] == '"' || data[i] == ' ' || data[i] == '\n' ) break; else i++; } if( i >= size ) return 0; if( i > j && data[i] == '>' ) return i + 1; /* one of the forbidden chars has been found */ *autolink = MKDA_NOT_AUTOLINK; } /* looking for sometinhg looking like a tag end */ while( i < size && data[i] != '>' ) i++; if( i >= size ) return 0; return i + 1; } /* parse_inline • parses inline markdown elements */ static void parse_inline( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size ) { size_t i = 0, end = 0; uint8_t action = 0; struct buf work = { 0, 0, 0, 0 }; if( rndr->work_bufs[BUFFER_SPAN].size + rndr->work_bufs[BUFFER_BLOCK].size > rndr->max_nesting ) return; while( i < size ) { /* copying inactive chars into the output */ while( end < size && (action = rndr->active_char[data[end]]) == 0 ) { end++; } if( rndr->cb.normal_text ) { work.data = data + i; work.size = end - i; rndr->cb.normal_text( ob, &work, rndr->opaque ); } else bufput( ob, data + i, end - i ); if( end >= size ) break; i = end; end = markdown_char_ptrs[(int) action]( ob, rndr, data + i, i, size - i ); if( !end ) /* no action from the callback */ end = i + 1; else { i += end; end = i; } } } /* find_emph_char • looks for the next emph uint8_t, skipping other constructs */ static size_t find_emph_char( uint8_t* data, size_t size, uint8_t c ) { size_t i = 1; while( i < size ) { while( i < size && data[i] != c && data[i] != '`' && data[i] != '[' ) i++; if( i == size ) return 0; if( data[i] == c ) return i; /* not 
counting escaped chars */ if( i && data[i - 1] == '\\' ) { i++; continue; } if( data[i] == '`' ) { size_t span_nb = 0, bt; size_t tmp_i = 0; /* counting the number of opening backticks */ while( i < size && data[i] == '`' ) { i++; span_nb++; } if( i >= size ) return 0; /* finding the matching closing sequence */ bt = 0; while( i < size && bt < span_nb ) { if( !tmp_i && data[i] == c ) tmp_i = i; if( data[i] == '`' ) bt++; else bt = 0; i++; } if( i >= size ) return tmp_i; } /* skipping a link */ else if( data[i] == '[' ) { size_t tmp_i = 0; uint8_t cc; i++; while( i < size && data[i] != ']' ) { if( !tmp_i && data[i] == c ) tmp_i = i; i++; } i++; while( i < size && (data[i] == ' ' || data[i] == '\n') ) i++; if( i >= size ) return tmp_i; switch( data[i] ) { case '[': cc = ']'; break; case '(': cc = ')'; break; default: if( tmp_i ) return tmp_i; else continue; } i++; while( i < size && data[i] != cc ) { if( !tmp_i && data[i] == c ) tmp_i = i; i++; } if( i >= size ) return tmp_i; i++; } } return 0; } /* parse_emph1 • parsing single emphase */ /* closed by a symbol not preceded by whitespace and not followed by symbol */ static size_t parse_emph1( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size, uint8_t c ) { size_t i = 0, len; struct buf* work = 0; int r; if( !rndr->cb.emphasis ) return 0; /* skipping one symbol if coming from emph3 */ if( size > 1 && data[0] == c && data[1] == c ) i = 1; while( i < size ) { len = find_emph_char( data + i, size - i, c ); if( !len ) return 0; i += len; if( i >= size ) return 0; if( data[i] == c && !_isspace( data[i - 1] ) ) { if( rndr->ext_flags & MKDEXT_NO_INTRA_EMPHASIS ) { if( i + 1 < size && isalnum( data[i + 1] ) ) continue; } work = rndr_newbuf( rndr, BUFFER_SPAN ); parse_inline( work, rndr, data, i ); r = rndr->cb.emphasis( ob, work, rndr->opaque ); rndr_popbuf( rndr, BUFFER_SPAN ); return r ? 
i + 1 : 0; } } return 0; } /* parse_emph2 • parsing single emphase */ static size_t parse_emph2( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size, uint8_t c ) { int (* render_method)( struct buf* ob, const struct buf* text, void* opaque ); size_t i = 0, len; struct buf* work = 0; int r; render_method = (c == '~') ? rndr->cb.strikethrough : rndr->cb.double_emphasis; if( !render_method ) return 0; while( i < size ) { len = find_emph_char( data + i, size - i, c ); if( !len ) return 0; i += len; if( i + 1 < size && data[i] == c && data[i + 1] == c && i && !_isspace( data[i - 1] ) ) { work = rndr_newbuf( rndr, BUFFER_SPAN ); parse_inline( work, rndr, data, i ); r = render_method( ob, work, rndr->opaque ); rndr_popbuf( rndr, BUFFER_SPAN ); return r ? i + 2 : 0; } i++; } return 0; } /* parse_emph3 • parsing single emphase */ /* finds the first closing tag, and delegates to the other emph */ static size_t parse_emph3( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size, uint8_t c ) { size_t i = 0, len; int r; while( i < size ) { len = find_emph_char( data + i, size - i, c ); if( !len ) return 0; i += len; /* skip whitespace preceded symbols */ if( data[i] != c || _isspace( data[i - 1] ) ) continue; if( i + 2 < size && data[i + 1] == c && data[i + 2] == c && rndr->cb.triple_emphasis ) { /* triple symbol found */ struct buf* work = rndr_newbuf( rndr, BUFFER_SPAN ); parse_inline( work, rndr, data, i ); r = rndr->cb.triple_emphasis( ob, work, rndr->opaque ); rndr_popbuf( rndr, BUFFER_SPAN ); return r ? 
i + 3 : 0; } else if( i + 1 < size && data[i + 1] == c ) { /* double symbol found, handing over to emph1 */ len = parse_emph1( ob, rndr, data - 2, size + 2, c ); if( !len ) return 0; else return len - 2; } else { /* single symbol found, handing over to emph2 */ len = parse_emph2( ob, rndr, data - 1, size + 1, c ); if( !len ) return 0; else return len - 1; } } return 0; } /* char_emphasis • single and double emphasis parsing */ static size_t char_emphasis( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size ) { uint8_t c = data[0]; size_t ret; if( rndr->ext_flags & MKDEXT_NO_INTRA_EMPHASIS ) { if( offset > 0 && !_isspace( data[-1] ) && data[-1] != '>' ) return 0; } if( size > 2 && data[1] != c ) { /* whitespace cannot follow an opening emphasis; * strikethrough only takes two characters '~~' */ if( c == '~' || _isspace( data[1] ) || ( ret = parse_emph1( ob, rndr, data + 1, size - 1, c ) ) == 0 ) return 0; return ret + 1; } if( size > 3 && data[1] == c && data[2] != c ) { if( _isspace( data[2] ) || ( ret = parse_emph2( ob, rndr, data + 2, size - 2, c ) ) == 0 ) return 0; return ret + 2; } if( size > 4 && data[1] == c && data[2] == c && data[3] != c ) { if( c == '~' || _isspace( data[3] ) || ( ret = parse_emph3( ob, rndr, data + 3, size - 3, c ) ) == 0 ) return 0; return ret + 3; } return 0; } /* char_linebreak • '\n' preceded by two spaces (assuming linebreak != 0) */ static size_t char_linebreak( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size ) { if( offset < 2 || data[-1] != ' ' || data[-2] != ' ' ) return 0; /* removing the last space from ob and rendering */ while( ob->size && ob->data[ob->size - 1] == ' ' ) ob->size--; return rndr->cb.linebreak( ob, rndr->opaque ) ? 
1 : 0; } /* char_codespan • '`' parsing a code span (assuming codespan != 0) */ static size_t char_codespan( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size ) { size_t end, nb = 0, i, f_begin, f_end; /* counting the number of backticks in the delimiter */ while( nb < size && data[nb] == '`' ) nb++; /* finding the next delimiter */ i = 0; for( end = nb; end < size && i < nb; end++ ) { if( data[end] == '`' ) i++; else i = 0; } if( i < nb && end >= size ) return 0; /* no matching delimiter */ /* trimming outside whitespaces */ f_begin = nb; while( f_begin < end && data[f_begin] == ' ' ) f_begin++; f_end = end - nb; while( f_end > nb && data[f_end - 1] == ' ' ) f_end--; /* real code span */ if( f_begin < f_end ) { struct buf work = { data + f_begin, f_end - f_begin, 0, 0 }; if( !rndr->cb.codespan( ob, &work, rndr->opaque ) ) end = 0; } else { if( !rndr->cb.codespan( ob, 0, rndr->opaque ) ) end = 0; } return end; } /* char_escape • '\\' backslash escape */ static size_t char_escape( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size ) { static const char* escape_chars = "\\`*_{}[]()#+-.!:|&<>^~"; struct buf work = { 0, 0, 0, 0 }; if( size > 1 ) { if( strchr( escape_chars, data[1] ) == NULL ) return 0; if( rndr->cb.normal_text ) { work.data = data + 1; work.size = 1; rndr->cb.normal_text( ob, &work, rndr->opaque ); } else bufputc( ob, data[1] ); } else if( size == 1 ) { bufputc( ob, data[0] ); } return 2; } /* char_entity • '&' escaped when it doesn't belong to an entity */ /* valid entities are assumed to be anything matching &#?[A-Za-z0-9]+; */ static size_t char_entity( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size ) { size_t end = 1; struct buf work = { 0, 0, 0, 0 }; if( end < size && data[end] == '#' ) end++; while( end < size && isalnum( data[end] ) ) end++; if( end < size && data[end] == ';' ) end++; /* real entity */ else return 0; /* lone '&' */ if( 
rndr->cb.entity ) { work.data = data; work.size = end; rndr->cb.entity( ob, &work, rndr->opaque ); } else bufput( ob, data, end ); return end; } /* char_langle_tag • '<' when tags or autolinks are allowed */ static size_t char_langle_tag( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size ) { enum mkd_autolink altype = MKDA_NOT_AUTOLINK; size_t end = tag_length( data, size, &altype ); struct buf work = { data, end, 0, 0 }; int ret = 0; if( end > 2 ) { if( rndr->cb.autolink && altype != MKDA_NOT_AUTOLINK ) { struct buf* u_link = rndr_newbuf( rndr, BUFFER_SPAN ); work.data = data + 1; work.size = end - 2; unscape_text( u_link, &work ); ret = rndr->cb.autolink( ob, u_link, altype, rndr->opaque ); rndr_popbuf( rndr, BUFFER_SPAN ); } else if( rndr->cb.raw_html_tag ) ret = rndr->cb.raw_html_tag( ob, &work, rndr->opaque ); } if( !ret ) return 0; else return end; } static size_t char_autolink_www( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size ) { struct buf* link, * link_url, * link_text; size_t link_len, rewind; if( !rndr->cb.link || rndr->in_link_body ) return 0; link = rndr_newbuf( rndr, BUFFER_SPAN ); if( ( link_len = sd_autolink__www( &rewind, link, data, offset, size, 0 ) ) > 0 ) { link_url = rndr_newbuf( rndr, BUFFER_SPAN ); BUFPUTSL( link_url, "http://" ); bufput( link_url, link->data, link->size ); ob->size -= rewind; if( rndr->cb.normal_text ) { link_text = rndr_newbuf( rndr, BUFFER_SPAN ); rndr->cb.normal_text( link_text, link, rndr->opaque ); rndr->cb.link( ob, link_url, NULL, link_text, rndr->opaque ); rndr_popbuf( rndr, BUFFER_SPAN ); } else { rndr->cb.link( ob, link_url, NULL, link, rndr->opaque ); } rndr_popbuf( rndr, BUFFER_SPAN ); } rndr_popbuf( rndr, BUFFER_SPAN ); return link_len; } static size_t char_autolink_email( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size ) { struct buf* link; size_t link_len, rewind; if( !rndr->cb.autolink || 
rndr->in_link_body ) return 0; link = rndr_newbuf( rndr, BUFFER_SPAN ); if( ( link_len = sd_autolink__email( &rewind, link, data, offset, size, 0 ) ) > 0 ) { ob->size -= rewind; rndr->cb.autolink( ob, link, MKDA_EMAIL, rndr->opaque ); } rndr_popbuf( rndr, BUFFER_SPAN ); return link_len; } static size_t char_autolink_url( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size ) { struct buf* link; size_t link_len, rewind; if( !rndr->cb.autolink || rndr->in_link_body ) return 0; link = rndr_newbuf( rndr, BUFFER_SPAN ); if( ( link_len = sd_autolink__url( &rewind, link, data, offset, size, 0 ) ) > 0 ) { ob->size -= rewind; rndr->cb.autolink( ob, link, MKDA_NORMAL, rndr->opaque ); } rndr_popbuf( rndr, BUFFER_SPAN ); return link_len; } /* char_link • '[': parsing a link or an image */ static size_t char_link( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size ) { int is_img = (offset && data[-1] == '!'), level; size_t i = 1, txt_e, link_b = 0, link_e = 0, title_b = 0, title_e = 0; struct buf* content = 0; struct buf* link = 0; struct buf* title = 0; struct buf* u_link = 0; size_t org_work_size = rndr->work_bufs[BUFFER_SPAN].size; int text_has_nl = 0, ret = 0; int in_title = 0, qtype = 0; /* checking whether the correct renderer exists */ if( (is_img && !rndr->cb.image) || (!is_img && !rndr->cb.link) ) goto cleanup; /* looking for the matching closing bracket */ for( level = 1; i < size; i++ ) { if( data[i] == '\n' ) text_has_nl = 1; else if( data[i - 1] == '\\' ) continue; else if( data[i] == '[' ) level++; else if( data[i] == ']' ) { level--; if( level <= 0 ) break; } } if( i >= size ) goto cleanup; txt_e = i; i++; /* skip any amount of whitespace or newline */ /* (this is much more laxist than original markdown syntax) */ while( i < size && _isspace( data[i] ) ) i++; /* inline style link */ if( i < size && data[i] == '(' ) { /* skipping initial whitespace */ i++; while( i < size && _isspace( data[i] ) ) i++; 
link_b = i; /* looking for link end: ' " ) */ while( i < size ) { if( data[i] == '\\' ) i += 2; else if( data[i] == ')' ) break; else if( i >= 1 && _isspace( data[i - 1] ) && (data[i] == '\'' || data[i] == '"') ) break; else i++; } if( i >= size ) goto cleanup; link_e = i; /* looking for title end if present */ if( data[i] == '\'' || data[i] == '"' ) { qtype = data[i]; in_title = 1; i++; title_b = i; while( i < size ) { if( data[i] == '\\' ) i += 2; else if( data[i] == qtype ) { in_title = 0; i++; } else if( (data[i] == ')') && !in_title ) break; else i++; } if( i >= size ) goto cleanup; /* skipping whitespaces after title */ title_e = i - 1; while( title_e > title_b && _isspace( data[title_e] ) ) title_e--; /* checking for closing quote presence */ if( data[title_e] != '\'' && data[title_e] != '"' ) { title_b = title_e = 0; link_e = i; } } /* remove whitespace at the end of the link */ while( link_e > link_b && _isspace( data[link_e - 1] ) ) link_e--; /* remove optional angle brackets around the link */ if( data[link_b] == '<' ) link_b++; if( data[link_e - 1] == '>' ) link_e--; /* building escaped link and title */ if( link_e > link_b ) { link = rndr_newbuf( rndr, BUFFER_SPAN ); bufput( link, data + link_b, link_e - link_b ); } if( title_e > title_b ) { title = rndr_newbuf( rndr, BUFFER_SPAN ); bufput( title, data + title_b, title_e - title_b ); } i++; } /* reference style link */ else if( i < size && data[i] == '[' ) { struct buf id = { 0, 0, 0, 0 }; struct link_ref* lr; /* looking for the id */ i++; link_b = i; while( i < size && data[i] != ']' ) i++; if( i >= size ) goto cleanup; link_e = i; /* finding the link_ref */ if( link_b == link_e ) { if( text_has_nl ) { struct buf* b = rndr_newbuf( rndr, BUFFER_SPAN ); size_t j; for( j = 1; j < txt_e; j++ ) { if( data[j] != '\n' ) bufputc( b, data[j] ); else if( data[j - 1] != ' ' ) bufputc( b, ' ' ); } id.data = b->data; id.size = b->size; } else { id.data = data + 1; id.size = txt_e - 1; } } else { id.data = data + 
link_b; id.size = link_e - link_b; } lr = find_link_ref( rndr->refs, id.data, id.size ); if( !lr ) goto cleanup; /* keeping link and title from link_ref */ link = lr->link; title = lr->title; i++; } /* shortcut reference style link */ else { struct buf id = { 0, 0, 0, 0 }; struct link_ref* lr; /* crafting the id */ if( text_has_nl ) { struct buf* b = rndr_newbuf( rndr, BUFFER_SPAN ); size_t j; for( j = 1; j < txt_e; j++ ) { if( data[j] != '\n' ) bufputc( b, data[j] ); else if( data[j - 1] != ' ' ) bufputc( b, ' ' ); } id.data = b->data; id.size = b->size; } else { id.data = data + 1; id.size = txt_e - 1; } /* finding the link_ref */ lr = find_link_ref( rndr->refs, id.data, id.size ); if( !lr ) goto cleanup; /* keeping link and title from link_ref */ link = lr->link; title = lr->title; /* rewinding the whitespace */ i = txt_e + 1; } /* building content: img alt is escaped, link content is parsed */ if( txt_e > 1 ) { content = rndr_newbuf( rndr, BUFFER_SPAN ); if( is_img ) { bufput( content, data + 1, txt_e - 1 ); } else { /* disable autolinking when parsing inline the * content of a link */ rndr->in_link_body = 1; parse_inline( content, rndr, data + 1, txt_e - 1 ); rndr->in_link_body = 0; } } if( link ) { u_link = rndr_newbuf( rndr, BUFFER_SPAN ); unscape_text( u_link, link ); } /* calling the relevant rendering function */ if( is_img ) { if( ob->size && ob->data[ob->size - 1] == '!' ) ob->size -= 1; ret = rndr->cb.image( ob, u_link, title, content, rndr->opaque ); } else { ret = rndr->cb.link( ob, u_link, title, content, rndr->opaque ); } /* cleanup */ cleanup: rndr->work_bufs[BUFFER_SPAN].size = (int) org_work_size; return ret ? 
i : 0; } static size_t char_superscript( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t offset, size_t size ) { size_t sup_start, sup_len; struct buf* sup; if( !rndr->cb.superscript ) return 0; if( size < 2 ) return 0; if( data[1] == '(' ) { sup_start = sup_len = 2; while( sup_len < size && data[sup_len] != ')' && data[sup_len - 1] != '\\' ) sup_len++; if( sup_len == size ) return 0; } else { sup_start = sup_len = 1; while( sup_len < size && !_isspace( data[sup_len] ) ) sup_len++; } if( sup_len - sup_start == 0 ) return (sup_start == 2) ? 3 : 0; sup = rndr_newbuf( rndr, BUFFER_SPAN ); parse_inline( sup, rndr, data + sup_start, sup_len - sup_start ); rndr->cb.superscript( ob, sup, rndr->opaque ); rndr_popbuf( rndr, BUFFER_SPAN ); return (sup_start == 2) ? sup_len + 1 : sup_len; } /********************************* * BLOCK-LEVEL PARSING FUNCTIONS * *********************************/ /* is_empty • returns the line length when it is empty, 0 otherwise */ static size_t is_empty( uint8_t* data, size_t size ) { size_t i; for( i = 0; i < size && data[i] != '\n'; i++ ) if( data[i] != ' ' ) return 0; return i + 1; } /* is_hrule • returns whether a line is a horizontal rule */ static int is_hrule( uint8_t* data, size_t size ) { size_t i = 0, n = 0; uint8_t c; /* skipping initial spaces */ if( size < 3 ) return 0; if( data[0] == ' ' ) { i++; if( data[1] == ' ' ) { i++; if( data[2] == ' ' ) { i++; } } } /* looking at the hrule uint8_t */ if( i + 2 >= size || (data[i] != '*' && data[i] != '-' && data[i] != '_') ) return 0; c = data[i]; /* the whole line must be the char or whitespace */ while( i < size && data[i] != '\n' ) { if( data[i] == c ) n++; else if( data[i] != ' ' ) return 0; i++; } return n >= 3; } /* check if a line begins with a code fence; return the * width of the code fence */ static size_t prefix_codefence( uint8_t* data, size_t size ) { size_t i = 0, n = 0; uint8_t c; /* skipping initial spaces */ if( size < 3 ) return 0; if( data[0] == ' ' ) { 
i++; if( data[1] == ' ' ) { i++; if( data[2] == ' ' ) { i++; } } } /* looking at the hrule uint8_t */ if( i + 2 >= size || !(data[i] == '~' || data[i] == '`') ) return 0; c = data[i]; /* the whole line must be the uint8_t or whitespace */ while( i < size && data[i] == c ) { n++; i++; } if( n < 3 ) return 0; return i; } /* check if a line is a code fence; return its size if it is */ static size_t is_codefence( uint8_t* data, size_t size, struct buf* syntax ) { size_t i = 0, syn_len = 0; uint8_t* syn_start; i = prefix_codefence( data, size ); if( i == 0 ) return 0; while( i < size && data[i] == ' ' ) i++; syn_start = data + i; if( i < size && data[i] == '{' ) { i++; syn_start++; while( i < size && data[i] != '}' && data[i] != '\n' ) { syn_len++; i++; } if( i == size || data[i] != '}' ) return 0; /* strip all whitespace at the beginning and the end * of the {} block */ while( syn_len > 0 && _isspace( syn_start[0] ) ) { syn_start++; syn_len--; } while( syn_len > 0 && _isspace( syn_start[syn_len - 1] ) ) syn_len--; i++; } else { while( i < size && !_isspace( data[i] ) ) { syn_len++; i++; } } if( syntax ) { syntax->data = syn_start; syntax->size = syn_len; } while( i < size && data[i] != '\n' ) { if( !_isspace( data[i] ) ) return 0; i++; } return i + 1; } /* is_atxheader • returns whether the line is a hash-prefixed header */ static int is_atxheader( struct sd_markdown* rndr, uint8_t* data, size_t size ) { if( data[0] != '#' ) return 0; if( rndr->ext_flags & MKDEXT_SPACE_HEADERS ) { size_t level = 0; while( level < size && level < 6 && data[level] == '#' ) level++; if( level < size && data[level] != ' ' ) return 0; } return 1; } /* is_headerline • returns whether the line is a setext-style hdr underline */ static int is_headerline( uint8_t* data, size_t size ) { size_t i = 0; /* test of level 1 header */ if( data[i] == '=' ) { for( i = 1; i < size && data[i] == '='; i++ ) ; while( i < size && data[i] == ' ' ) i++; return (i >= size || data[i] == '\n') ? 
1 : 0; } /* test of level 2 header */ if( data[i] == '-' ) { for( i = 1; i < size && data[i] == '-'; i++ ) ; while( i < size && data[i] == ' ' ) i++; return (i >= size || data[i] == '\n') ? 2 : 0; } return 0; } static int is_next_headerline( uint8_t* data, size_t size ) { size_t i = 0; while( i < size && data[i] != '\n' ) i++; if( ++i >= size ) return 0; return is_headerline( data + i, size - i ); } /* prefix_quote • returns blockquote prefix length */ static size_t prefix_quote( uint8_t* data, size_t size ) { size_t i = 0; if( i < size && data[i] == ' ' ) i++; if( i < size && data[i] == ' ' ) i++; if( i < size && data[i] == ' ' ) i++; if( i < size && data[i] == '>' ) { if( i + 1 < size && data[i + 1] == ' ' ) return i + 2; return i + 1; } return 0; } /* prefix_code • returns prefix length for block code*/ static size_t prefix_code( uint8_t* data, size_t size ) { if( size > 3 && data[0] == ' ' && data[1] == ' ' && data[2] == ' ' && data[3] == ' ' ) return 4; return 0; } /* prefix_oli • returns ordered list item prefix */ static size_t prefix_oli( uint8_t* data, size_t size ) { size_t i = 0; if( i < size && data[i] == ' ' ) i++; if( i < size && data[i] == ' ' ) i++; if( i < size && data[i] == ' ' ) i++; if( i >= size || data[i] < '0' || data[i] > '9' ) return 0; while( i < size && data[i] >= '0' && data[i] <= '9' ) i++; if( i + 1 >= size || data[i] != '.' 
|| data[i + 1] != ' ' ) return 0; if( is_next_headerline( data + i, size - i ) ) return 0; return i + 2; } /* prefix_uli • returns ordered list item prefix */ static size_t prefix_uli( uint8_t* data, size_t size ) { size_t i = 0; if( i < size && data[i] == ' ' ) i++; if( i < size && data[i] == ' ' ) i++; if( i < size && data[i] == ' ' ) i++; if( i + 1 >= size || (data[i] != '*' && data[i] != '+' && data[i] != '-') || data[i + 1] != ' ' ) return 0; if( is_next_headerline( data + i, size - i ) ) return 0; return i + 2; } /* parse_block • parsing of one block, returning next uint8_t to parse */ static void parse_block( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size ); /* parse_blockquote • handles parsing of a blockquote fragment */ static size_t parse_blockquote( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size ) { size_t beg, end = 0, pre, work_size = 0; uint8_t* work_data = 0; struct buf* out = 0; out = rndr_newbuf( rndr, BUFFER_BLOCK ); beg = 0; while( beg < size ) { for( end = beg + 1; end < size && data[end - 1] != '\n'; end++ ) ; pre = prefix_quote( data + beg, end - beg ); if( pre ) beg += pre; /* skipping prefix */ /* empty line followed by non-quote line */ else if( is_empty( data + beg, end - beg ) && ( end >= size || ( prefix_quote( data + end, size - end ) == 0 && !is_empty( data + end, size - end ) ) ) ) break; if( beg < end ) /* copy into the in-place working buffer */ { /* bufput(work, data + beg, end - beg); */ if( !work_data ) work_data = data + beg; else if( data + beg != work_data + work_size ) memmove( work_data + work_size, data + beg, end - beg ); work_size += end - beg; } beg = end; } parse_block( out, rndr, work_data, work_size ); if( rndr->cb.blockquote ) rndr->cb.blockquote( ob, out, rndr->opaque ); rndr_popbuf( rndr, BUFFER_BLOCK ); return end; } static size_t parse_htmlblock( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size, int do_render ); /* parse_blockquote • handles parsing 
of a regular paragraph */
/* Lines are consumed until an empty line, a line accepted by
 * is_headerline() (non-zero level), or the start of another block.
 * With level == 0 the collected text is rendered as one paragraph.
 * With level != 0 the last collected line is rendered through the header
 * callback and any text before it through the paragraph callback.
 * Returns `end`, the offset at which the caller resumes. */
static size_t parse_paragraph( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size )
{
    size_t i = 0, end = 0;
    int level = 0;
    struct buf work = { data, 0, 0, 0 };

    while( i < size )
    {
        /* end: one past the '\n' terminating the line that starts at i */
        for( end = i + 1; end < size && data[end - 1] != '\n'; end++ ) /* empty */ ;

        if( is_empty( data + i, size - i ) )
            break;

        /* a non-zero level stops the scan and selects the header path below;
         * `end` is left past this line, so the line is consumed */
        if( ( level = is_headerline( data + i, size - i ) ) != 0 )
            break;

        /* another block starts here: stop before it (end = i) */
        if( is_atxheader( rndr, data + i, size - i )
            || is_hrule( data + i, size - i )
            || prefix_quote( data + i, size - i ) )
        {
            end = i;
            break;
        }

        /*
         * Early termination of a paragraph with the same logic
         * as Markdown 1.0.0. If this logic is applied, the
         * Markdown 1.0.3 test suite won't pass cleanly
         *
         * :: If the first character in a new line is not a letter,
         * let's check to see if there's some kind of block starting
         * here
         */
        if( (rndr->ext_flags & MKDEXT_LAX_SPACING) && !isalnum( data[i] ) )
        {
            if( prefix_oli( data + i, size - i )
                || prefix_uli( data + i, size - i ) )
            {
                end = i;
                break;
            }

            /* see if an html block starts here
             * (do_render is 0, so this call only probes) */
            if( data[i] == '<' && rndr->cb.blockhtml
                && parse_htmlblock( ob, rndr, data + i, size - i, 0 ) )
            {
                end = i;
                break;
            }

            /* see if a code fence starts here */
            if( (rndr->ext_flags & MKDEXT_FENCED_CODE) != 0
                && is_codefence( data + i, size - i, NULL ) != 0 )
            {
                end = i;
                break;
            }
        }

        i = end;
    }

    /* collected text is data[0..i), minus trailing newlines */
    work.size = i;
    while( work.size && data[work.size - 1] == '\n' )
        work.size--;

    if( !level )
    {
        struct buf* tmp = rndr_newbuf( rndr, BUFFER_BLOCK );
        parse_inline( tmp, rndr, work.data, work.size );
        if( rndr->cb.paragraph )
            rndr->cb.paragraph( ob, tmp, rndr->opaque );
        rndr_popbuf( rndr, BUFFER_BLOCK );
    }
    else
    {
        struct buf* header_work;

        if( work.size )
        {
            size_t beg;
            i = work.size; /* remember the full length of the collected text */
            work.size -= 1;

            /* walk back to the '\n' preceding the last collected line */
            while( work.size && data[work.size] != '\n' )
                work.size -= 1;

            beg = work.size + 1; /* start of the last line (the header text) */
            while( work.size && data[work.size - 1] == '\n' )
                work.size -= 1;

            if( work.size > 0 )
            {
                /* text precedes the header line: emit it as a paragraph first */
                struct buf* tmp = rndr_newbuf( rndr, BUFFER_BLOCK );
                parse_inline( tmp, rndr, work.data, work.size );

                if( rndr->cb.paragraph )
                    rndr->cb.paragraph( ob, tmp, rndr->opaque );

                rndr_popbuf( rndr, BUFFER_BLOCK );
                work.data += beg;
                work.size = i - beg;
            }
            else work.size = i; /* the header line is the only collected text */
        }

        header_work = rndr_newbuf( rndr, BUFFER_SPAN );
        parse_inline( header_work, rndr, work.data, work.size );

        if( rndr->cb.header )
            rndr->cb.header( ob, header_work, (int) level, rndr->opaque );

        rndr_popbuf( rndr, BUFFER_SPAN );
    }

    return end;
}

/* parse_fencedcode • handles parsing of a block-level code fragment.
 * Returns 0 when data does not start with a code fence; otherwise the
 * offset reached after the closing fence (or the end of input).
 * `lang` is filled in by is_codefence() and is passed to the blockcode
 * callback only when its size is non-zero. */
static size_t parse_fencedcode( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size )
{
    size_t beg, end;
    struct buf* work = 0;
    struct buf lang = { 0, 0, 0, 0 };

    beg = is_codefence( data, size, &lang );
    if( beg == 0 )
        return 0;

    work = rndr_newbuf( rndr, BUFFER_BLOCK );

    while( beg < size )
    {
        size_t fence_end;
        struct buf fence_trail = { 0, 0, 0, 0 };

        /* a fence line with nothing trailing it closes the block */
        fence_end = is_codefence( data + beg, size - beg, &fence_trail );
        if( fence_end != 0 && fence_trail.size == 0 )
        {
            beg += fence_end;
            break;
        }

        for( end = beg + 1; end < size && data[end - 1] != '\n'; end++ ) ;

        if( beg < end )
        {
            /* verbatim copy to the working buffer; a line that is_empty()
             * accepts is stored as a single '\n' */
            if( is_empty( data + beg, end - beg ) )
                bufputc( work, '\n' );
            else bufput( work, data + beg, end - beg );
        }
        beg = end;
    }

    /* make sure the collected code ends with a newline */
    if( work->size && work->data[work->size - 1] != '\n' )
        bufputc( work, '\n' );

    if( rndr->cb.blockcode )
        rndr->cb.blockcode( ob, work, lang.size ?
&lang : NULL, rndr->opaque ); rndr_popbuf( rndr, BUFFER_BLOCK ); return beg; } static size_t parse_blockcode( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size ) { size_t beg, end, pre; struct buf* work = 0; work = rndr_newbuf( rndr, BUFFER_BLOCK ); beg = 0; while( beg < size ) { for( end = beg + 1; end < size && data[end - 1] != '\n'; end++ ) { } ; pre = prefix_code( data + beg, end - beg ); if( pre ) beg += pre; /* skipping prefix */ else if( !is_empty( data + beg, end - beg ) ) /* non-empty non-prefixed line breaks the pre */ break; if( beg < end ) { /* verbatim copy to the working buffer, * escaping entities */ if( is_empty( data + beg, end - beg ) ) bufputc( work, '\n' ); else bufput( work, data + beg, end - beg ); } beg = end; } while( work->size && work->data[work->size - 1] == '\n' ) work->size -= 1; bufputc( work, '\n' ); if( rndr->cb.blockcode ) rndr->cb.blockcode( ob, work, NULL, rndr->opaque ); rndr_popbuf( rndr, BUFFER_BLOCK ); return beg; } /* parse_listitem • parsing of a single list item */ /* assuming initial prefix is already removed */ static size_t parse_listitem( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size, int* flags ) { struct buf* work = 0, * inter = 0; size_t beg = 0, end, pre, sublist = 0, orgpre = 0, i; int in_empty = 0, has_inside_empty = 0, in_fence = 0; /* keeping track of the first indentation prefix */ while( orgpre < 3 && orgpre < size && data[orgpre] == ' ' ) orgpre++; beg = prefix_uli( data, size ); if( !beg ) beg = prefix_oli( data, size ); if( !beg ) return 0; /* skipping to the beginning of the following line */ end = beg; while( end < size && data[end - 1] != '\n' ) end++; /* getting working buffers */ work = rndr_newbuf( rndr, BUFFER_SPAN ); inter = rndr_newbuf( rndr, BUFFER_SPAN ); /* putting the first line into the working buffer */ bufput( work, data + beg, end - beg ); beg = end; /* process the following lines */ while( beg < size ) { size_t has_next_uli = 0, has_next_oli = 0; 
end++;

        /* end: one past the '\n' of the line starting at beg */
        while( end < size && data[end - 1] != '\n' )
            end++;

        /* process an empty line */
        if( is_empty( data + beg, end - beg ) )
        {
            in_empty = 1;
            beg = end;
            continue;
        }

        /* calculating the indentation (at most 4 spaces are stripped) */
        i = 0;
        while( i < 4 && beg + i < end && data[beg + i] == ' ' )
            i++;

        pre = i;

        /* every fence line toggles the in-fence state */
        if( rndr->ext_flags & MKDEXT_FENCED_CODE )
        {
            if( is_codefence( data + beg + i, end - beg - i, NULL ) != 0 )
                in_fence = !in_fence;
        }

        /* Only check for new list items if we are **not** inside
         * a fenced code block */
        if( !in_fence )
        {
            has_next_uli = prefix_uli( data + beg + i, end - beg - i );
            has_next_oli = prefix_oli( data + beg + i, end - beg - i );
        }

        /* checking for ul/ol switch */
        if( in_empty && ( ( (*flags & MKD_LIST_ORDERED) && has_next_uli )
            || (!(*flags & MKD_LIST_ORDERED) && has_next_oli) ) )
        {
            *flags |= MKD_LI_END;
            break; /* the following item must have same list type */
        }

        /* checking for a new item */
        if( ( has_next_uli && !is_hrule( data + beg + i, end - beg - i ) ) || has_next_oli )
        {
            if( in_empty )
                has_inside_empty = 1;

            if( pre == orgpre ) /* the following item must have */
                break;          /* the same indentation */

            /* a marker at a different indentation: remember where the
             * nested list begins inside the work buffer */
            if( !sublist )
                sublist = work->size;
        }
        /* joining only indented stuff after empty lines;
         * note that now we only require 1 space of indentation
         * to continue a list */
        else if( in_empty && pre == 0 )
        {
            *flags |= MKD_LI_END;
            break;
        }
        else if( in_empty )
        {
            bufputc( work, '\n' );
            has_inside_empty = 1;
        }

        in_empty = 0;

        /* adding the line without prefix into the working buffer */
        bufput( work, data + beg + i, end - beg - i );
        beg = end;
    }

    /* render of li contents */
    if( has_inside_empty )
        *flags |= MKD_LI_BLOCK;

    if( *flags & MKD_LI_BLOCK )
    {
        /* intermediate render of block li */
        if( sublist && sublist < work->size )
        {
            parse_block( inter, rndr, work->data, sublist );
            parse_block( inter, rndr, work->data + sublist, work->size - sublist );
        }
        else
            parse_block( inter, rndr, work->data, work->size );
    }
    else
    {
        /* intermediate render of inline li */
        if( sublist && sublist < work->size )
        {
parse_inline( inter, rndr, work->data, sublist ); parse_block( inter, rndr, work->data + sublist, work->size - sublist ); } else parse_inline( inter, rndr, work->data, work->size ); } /* render of li itself */ if( rndr->cb.listitem ) rndr->cb.listitem( ob, inter, *flags, rndr->opaque ); rndr_popbuf( rndr, BUFFER_SPAN ); rndr_popbuf( rndr, BUFFER_SPAN ); return beg; } /* parse_list • parsing ordered or unordered list block */ static size_t parse_list( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size, int flags ) { struct buf* work = 0; size_t i = 0, j; work = rndr_newbuf( rndr, BUFFER_BLOCK ); while( i < size ) { j = parse_listitem( work, rndr, data + i, size - i, &flags ); i += j; if( !j || (flags & MKD_LI_END) ) break; } if( rndr->cb.list ) rndr->cb.list( ob, work, flags, rndr->opaque ); rndr_popbuf( rndr, BUFFER_BLOCK ); return i; } /* parse_atxheader • parsing of atx-style headers */ static size_t parse_atxheader( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size ) { size_t level = 0; size_t i, end, skip; while( level < size && level < 6 && data[level] == '#' ) level++; for( i = level; i < size && data[i] == ' '; i++ ) ; for( end = i; end < size && data[end] != '\n'; end++ ) ; skip = end; while( end && data[end - 1] == '#' ) end--; while( end && data[end - 1] == ' ' ) end--; if( end > i ) { struct buf* work = rndr_newbuf( rndr, BUFFER_SPAN ); parse_inline( work, rndr, data + i, end - i ); if( rndr->cb.header ) rndr->cb.header( ob, work, (int) level, rndr->opaque ); rndr_popbuf( rndr, BUFFER_SPAN ); } return skip; } /* htmlblock_end • checking end of HTML block : [ \t]*\n[ \t*]\n */ /* returns the length on match, 0 otherwise */ static size_t htmlblock_end_tag( const char* tag, size_t tag_len, struct sd_markdown* rndr, uint8_t* data, size_t size ) { size_t i, w; /* checking if tag is a match */ if( tag_len + 3 >= size || strncasecmp( (char*) data + 2, tag, tag_len ) != 0 || data[tag_len + 2] != '>' ) return 0; /* checking 
white lines */
    i = tag_len + 3; /* first byte after the '>' of the closing tag */
    w = 0;
    if( i < size && ( w = is_empty( data + i, size - i ) ) == 0 )
        return 0; /* non-blank after tag */

    i += w;
    w = 0;

    /* a second empty line, when present, is included in the length */
    if( i < size )
        w = is_empty( data + i, size - i );

    return i + w;
}

/* htmlblock_end • searches data for the closing tag of `curtag`.
 * With start_of_line set, a candidate after the first line is only
 * accepted when its '<' directly follows a '\n'.
 * Returns the offset just past the match (as measured by
 * htmlblock_end_tag), or 0 when no closing tag is found. */
static size_t htmlblock_end( const char* curtag, struct sd_markdown* rndr, uint8_t* data, size_t size, int start_of_line )
{
    size_t tag_size = strlen( curtag );
    size_t i = 1, end_tag;
    int block_lines = 0; /* newlines passed so far */

    while( i < size )
    {
        i++;

        /* stop with data[i - 1] == '<' and data[i] == '/' */
        while( i < size && !(data[i - 1] == '<' && data[i] == '/') )
        {
            if( data[i] == '\n' )
                block_lines++;

            i++;
        }

        /* If we are only looking for unindented tags, skip the tag
         * if it doesn't follow a newline.
         *
         * The only exception to this is if the tag is still on the
         * initial line; in that case it still counts as a closing
         * tag */
        if( start_of_line && block_lines > 0 && data[i - 2] != '\n' )
            continue;

        /* not enough input left to hold "</tag>" */
        if( i + 2 + tag_size >= size )
            break;

        /* data + i - 1 points at the '<' of the candidate */
        end_tag = htmlblock_end_tag( curtag, tag_size, rndr, data + i - 1, size - i + 1 );
        if( end_tag )
            return i + end_tag - 1;
    }

    return 0;
}

/* parse_htmlblock • parsing of inline HTML block.
 * Returns the length of the HTML block starting at data, or 0 when none is
 * recognised. The blockhtml callback is only invoked when do_render is
 * non-zero, so callers can pass 0 to probe without emitting anything. */
static size_t parse_htmlblock( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size, int do_render )
{
    size_t i, j = 0, tag_end;
    const char* curtag = NULL;
    struct buf work = { data, 0, 0, 0 };

    /* identification of the opening tag */
    if( size < 2 || data[0] != '<' )
        return 0;

    /* the tag name runs up to the first '>' or ' ' */
    i = 1;
    while( i < size && data[i] != '>' && data[i] != ' ' )
        i++;

    if( i < size )
        curtag = find_block_tag( (char*) data + 1, (int) i - 1 );

    /* handling of special cases */
    if( !curtag )
    {
        /* HTML comment, laxist form */
        if( size > 5 && data[1] == '!'
            && data[2] == '-' && data[3] == '-' )
        {
            i = 5;

            /* look for the closing "-->" */
            while( i < size && !(data[i - 2] == '-' && data[i - 1] == '-' && data[i] == '>') )
                i++;

            i++;

            /* the comment only counts when an empty line follows it */
            if( i < size )
                j = is_empty( data + i, size - i );

            if( j )
            {
                work.size = i + j;
                if( do_render && rndr->cb.blockhtml )
                    rndr->cb.blockhtml( ob, &work, rndr->opaque );
                return work.size;
            }
        }

        /* HR, which is the only self-closing block tag considered */
        if( size > 4 && (data[1] == 'h' || data[1] == 'H') && (data[2] == 'r' || data[2] == 'R') )
        {
            i = 3;
            while( i < size && data[i] != '>' )
                i++;

            if( i + 1 < size )
            {
                i++;
                j = is_empty( data + i, size - i );
                if( j )
                {
                    work.size = i + j;
                    if( do_render && rndr->cb.blockhtml )
                        rndr->cb.blockhtml( ob, &work, rndr->opaque );
                    return work.size;
                }
            }
        }

        /* no special case recognised */
        return 0;
    }

    /* looking for an unindented matching closing tag */
    /* followed by a blank line */
    tag_end = htmlblock_end( curtag, rndr, data, size, 1 );

    /* if not found, trying a second pass looking for indented match */
    /* but not if tag is "ins" or "del" (following original Markdown.pl) */
    if( !tag_end && strcmp( curtag, "ins" ) != 0 && strcmp( curtag, "del" ) != 0 )
    {
        tag_end = htmlblock_end( curtag, rndr, data, size, 0 );
    }

    if( !tag_end )
        return 0;

    /* the end of the block has been found */
    work.size = tag_end;
    if( do_render && rndr->cb.blockhtml )
        rndr->cb.blockhtml( ob, &work, rndr->opaque );

    return tag_end;
}

/* parse_table_row • renders one table row.
 * data[0..size) is split on '|' into at most `columns` cells; each cell is
 * trimmed, parsed inline and passed to the table_cell callback with
 * col_data[col] | header_flag. Does nothing unless both the table_cell and
 * table_row callbacks are set. */
static void parse_table_row( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size, size_t columns, int* col_data, int header_flag )
{
    size_t i = 0, col;
    struct buf* row_work = 0;

    if( !rndr->cb.table_cell || !rndr->cb.table_row )
        return;

    row_work = rndr_newbuf( rndr, BUFFER_SPAN );

    /* a leading pipe does not open a cell */
    if( i < size && data[i] == '|' )
        i++;

    for( col = 0; col < columns && i < size; ++col )
    {
        size_t cell_start, cell_end;
        struct buf* cell_work;

        cell_work = rndr_newbuf( rndr, BUFFER_SPAN );

        while( i < size && _isspace( data[i] ) )
            i++;

        cell_start = i;

        while( i < size && data[i] != '|' )
            i++;

        /* cell_end is the index of the last byte of the cell */
        cell_end = i - 1;

        while(
cell_end > cell_start && _isspace( data[cell_end] ) ) cell_end--; parse_inline( cell_work, rndr, data + cell_start, 1 + cell_end - cell_start ); rndr->cb.table_cell( row_work, cell_work, col_data[col] | header_flag, rndr->opaque ); rndr_popbuf( rndr, BUFFER_SPAN ); i++; } for( ; col < columns; ++col ) { struct buf empty_cell = { 0, 0, 0, 0 }; rndr->cb.table_cell( row_work, &empty_cell, col_data[col] | header_flag, rndr->opaque ); } rndr->cb.table_row( ob, row_work, rndr->opaque ); rndr_popbuf( rndr, BUFFER_SPAN ); } static size_t parse_table_header( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size, size_t* columns, int** column_data ) { int pipes; size_t i = 0, col, header_end, under_end; pipes = 0; while( i < size && data[i] != '\n' ) if( data[i++] == '|' ) pipes++; if( i == size || pipes == 0 ) return 0; header_end = i; while( header_end > 0 && _isspace( data[header_end - 1] ) ) header_end--; if( data[0] == '|' ) pipes--; if( header_end && data[header_end - 1] == '|' ) pipes--; *columns = pipes + 1; *column_data = calloc( *columns, sizeof(int) ); /* Parse the header underline */ i++; if( i < size && data[i] == '|' ) i++; under_end = i; while( under_end < size && data[under_end] != '\n' ) under_end++; for( col = 0; col < *columns && i < under_end; ++col ) { size_t dashes = 0; while( i < under_end && data[i] == ' ' ) i++; if( data[i] == ':' ) { i++; (*column_data)[col] |= MKD_TABLE_ALIGN_L; dashes++; } while( i < under_end && data[i] == '-' ) { i++; dashes++; } if( i < under_end && data[i] == ':' ) { i++; (*column_data)[col] |= MKD_TABLE_ALIGN_R; dashes++; } while( i < under_end && data[i] == ' ' ) i++; if( i < under_end && data[i] != '|' ) break; if( dashes < 3 ) break; i++; } if( col < *columns ) return 0; parse_table_row( ob, rndr, data, header_end, *columns, *column_data, MKD_TABLE_HEADER ); return under_end + 1; } static size_t parse_table( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size ) { size_t i; struct buf* 
header_work = 0;
    struct buf* body_work = 0;

    size_t columns;
    int* col_data = NULL; /* allocated by parse_table_header, freed below */

    header_work = rndr_newbuf( rndr, BUFFER_SPAN );
    body_work = rndr_newbuf( rndr, BUFFER_BLOCK );

    i = parse_table_header( header_work, rndr, data, size, &columns, &col_data );
    if( i > 0 )
    {
        while( i < size )
        {
            size_t row_start;
            int pipes = 0;

            row_start = i;

            while( i < size && data[i] != '\n' )
                if( data[i++] == '|' )
                    pipes++;

            /* a line without pipes, or without a terminating newline,
             * ends the table and is left for the caller */
            if( pipes == 0 || i == size )
            {
                i = row_start;
                break;
            }

            parse_table_row( body_work, rndr, data + row_start, i - row_start, columns, col_data, 0 );

            i++;
        }

        if( rndr->cb.table )
            rndr->cb.table( ob, header_work, body_work, rndr->opaque );
    }

    free( col_data );
    rndr_popbuf( rndr, BUFFER_SPAN );
    rndr_popbuf( rndr, BUFFER_BLOCK );
    return i;
}

/* parse_block • parses every block in data[0..size) and renders it to ob.
 * Nothing is returned. The function does nothing when the combined depth
 * of the two work-buffer stacks already exceeds rndr->max_nesting.
 * The block handlers are tried in a fixed order and the first one that
 * matches consumes the input; a paragraph is the fallback. */
static void parse_block( struct buf* ob, struct sd_markdown* rndr, uint8_t* data, size_t size )
{
    size_t beg, end, i;
    uint8_t* txt_data;
    beg = 0;

    if( rndr->work_bufs[BUFFER_SPAN].size
        + rndr->work_bufs[BUFFER_BLOCK].size > rndr->max_nesting )
        return;

    while( beg < size )
    {
        txt_data = data + beg;
        end = size - beg; /* bytes remaining, not an absolute offset */

        if( is_atxheader( rndr, txt_data, end ) )
            beg += parse_atxheader( ob, rndr, txt_data, end );

        else if( data[beg] == '<' && rndr->cb.blockhtml
                 && ( i = parse_htmlblock( ob, rndr, txt_data, end, 1 ) ) != 0 )
            beg += i;

        else if( ( i = is_empty( txt_data, end ) ) != 0 )
            beg += i;

        else if( is_hrule( txt_data, end ) )
        {
            if( rndr->cb.hrule )
                rndr->cb.hrule( ob, rndr->opaque );

            /* skip the rest of the rule line, including its newline */
            while( beg < size && data[beg] != '\n' )
                beg++;

            beg++;
        }

        else if( (rndr->ext_flags & MKDEXT_FENCED_CODE) != 0
                 && ( i = parse_fencedcode( ob, rndr, txt_data, end ) ) != 0 )
            beg += i;

        else if( (rndr->ext_flags & MKDEXT_TABLES) != 0
                 && ( i = parse_table( ob, rndr, txt_data, end ) ) != 0 )
            beg += i;

        else if( prefix_quote( txt_data, end ) )
            beg += parse_blockquote( ob, rndr, txt_data, end );

        else if( prefix_code( txt_data, end ) )
            beg += parse_blockcode( ob, rndr, txt_data, end );

        else if(
prefix_uli( txt_data, end ) )
            beg += parse_list( ob, rndr, txt_data, end, 0 );

        else if( prefix_oli( txt_data, end ) )
            beg += parse_list( ob, rndr, txt_data, end, MKD_LIST_ORDERED );

        else
            beg += parse_paragraph( ob, rndr, txt_data, end );
    }
}

/*********************
 * REFERENCE PARSING *
 *********************/

/* is_ref • returns whether a line is a reference or not.
 * data[beg..end) is examined starting at beg. On a match 1 is returned,
 * *last (when last is non-NULL) receives the offset where the reference
 * ends, and, when refs is non-NULL, a new entry holding the link and the
 * optional title is added to the table. Returns 0 otherwise. */
static int is_ref( const uint8_t* data, size_t beg, size_t end, size_t* last, struct link_ref** refs )
{
    size_t i = 0;
    size_t id_offset, id_end;
    size_t link_offset, link_end;
    size_t title_offset, title_end;
    size_t line_end;

    /* up to 3 optional leading spaces */
    if( beg + 3 >= end )
        return 0;

    if( data[beg] == ' ' )
    {
        i = 1;
        if( data[beg + 1] == ' ' )
        {
            i = 2;
            if( data[beg + 2] == ' ' )
            {
                i = 3;
                /* a fourth space disqualifies the line */
                if( data[beg + 3] == ' ' )
                    return 0;
            }
        }
    }

    i += beg;

    /* id part: anything but a newline between brackets */
    if( data[i] != '[' )
        return 0;

    i++;
    id_offset = i;

    while( i < end && data[i] != '\n' && data[i] != '\r' && data[i] != ']' )
        i++;

    if( i >= end || data[i] != ']' )
        return 0;

    id_end = i;

    /* spacer: colon (space | tab)* newline?
(space | tab)* */
    /* NOTE(review): the grammar comments in this function mention tabs,
     * but the loops below only test for ' ' */
    i++;
    if( i >= end || data[i] != ':' )
        return 0;

    i++;

    while( i < end && data[i] == ' ' )
        i++;

    /* at most one line break is allowed between the colon and the link */
    if( i < end && (data[i] == '\n' || data[i] == '\r') )
    {
        i++;
        if( i < end && data[i] == '\r' && data[i - 1] == '\n' )
            i++;
    }

    while( i < end && data[i] == ' ' )
        i++;

    if( i >= end )
        return 0;

    /* link: whitespace-free sequence, optionally between angle brackets */
    if( data[i] == '<' )
        i++;

    link_offset = i;

    while( i < end && data[i] != ' ' && data[i] != '\n' && data[i] != '\r' )
        i++;

    /* a closing '>' is not part of the link */
    if( data[i - 1] == '>' )
        link_end = i - 1;
    else link_end = i;

    /* optional spacer: (space | tab)* (newline | '\'' | '"' | '(' ) */
    while( i < end && data[i] == ' ' )
        i++;

    if( i < end && data[i] != '\n' && data[i] != '\r'
        && data[i] != '\'' && data[i] != '"' && data[i] != '(' )
        return 0;

    /* line_end stays 0 until an end of line has been seen */
    line_end = 0;

    /* computing end-of-line */
    if( i >= end || data[i] == '\r' || data[i] == '\n' )
        line_end = i;

    if( i + 1 < end && data[i] == '\n' && data[i + 1] == '\r' )
        line_end = i + 1;

    /* optional (space|tab)* spacer after a newline */
    if( line_end )
    {
        i = line_end + 1;

        while( i < end && data[i] == ' ' )
            i++;
    }

    /* optional title: any non-newline sequence enclosed in '"()
     * alone on its line */
    title_offset = title_end = 0;
    if( i + 1 < end && (data[i] == '\'' || data[i] == '"' || data[i] == '(') )
    {
        i++;
        title_offset = i;

        /* looking for EOL */
        while( i < end && data[i] != '\n' && data[i] != '\r' )
            i++;

        if( i + 1 < end && data[i] == '\n' && data[i + 1] == '\r' )
            title_end = i + 1;
        else title_end = i;

        /* stepping back */
        i -= 1;

        while( i > title_offset && data[i] == ' ' )
            i -= 1;

        /* the title is accepted only when a closing delimiter is found;
         * the reference then extends to the end of the title line */
        if( i > title_offset && (data[i] == '\'' || data[i] == '"' || data[i] == ')') )
        {
            line_end = title_end;
            title_end = i;
        }
    }

    if( !line_end || link_end == link_offset )
        return 0; /* garbage after the link, or empty link */

    /* a valid ref has been found, filling-in return structures */
    if( last )
        *last = line_end;

    if( refs )
    {
        struct link_ref* ref;

        ref = add_link_ref( refs, data + id_offset, id_end - id_offset );
        if( !ref )
            return 0;
ref->link = bufnew( link_end - link_offset ); bufput( ref->link, data + link_offset, link_end - link_offset ); if( title_end > title_offset ) { ref->title = bufnew( title_end - title_offset ); bufput( ref->title, data + title_offset, title_end - title_offset ); } } return 1; } static void expand_tabs( struct buf* ob, const uint8_t* line, size_t size ) { size_t i = 0, tab = 0; while( i < size ) { size_t org = i; while( i < size && line[i] != '\t' ) { i++; tab++; } if( i > org ) bufput( ob, line + org, i - org ); if( i >= size ) break; do { bufputc( ob, ' ' ); tab++; } while( tab % 4 ); i++; } } /********************** * EXPORTED FUNCTIONS * **********************/ struct sd_markdown* sd_markdown_new( unsigned int extensions, size_t max_nesting, const struct sd_callbacks* callbacks, void* opaque ) { struct sd_markdown* md = NULL; assert( max_nesting > 0 && callbacks ); md = malloc( sizeof(struct sd_markdown) ); if( !md ) return NULL; memcpy( &md->cb, callbacks, sizeof(struct sd_callbacks) ); stack_init( &md->work_bufs[BUFFER_BLOCK], 4 ); stack_init( &md->work_bufs[BUFFER_SPAN], 8 ); memset( md->active_char, 0x0, 256 ); if( md->cb.emphasis || md->cb.double_emphasis || md->cb.triple_emphasis ) { md->active_char['*'] = MD_CHAR_EMPHASIS; md->active_char['_'] = MD_CHAR_EMPHASIS; if( extensions & MKDEXT_STRIKETHROUGH ) md->active_char['~'] = MD_CHAR_EMPHASIS; } if( md->cb.codespan ) md->active_char['`'] = MD_CHAR_CODESPAN; if( md->cb.linebreak ) md->active_char['\n'] = MD_CHAR_LINEBREAK; if( md->cb.image || md->cb.link ) md->active_char['['] = MD_CHAR_LINK; md->active_char['<'] = MD_CHAR_LANGLE; md->active_char['\\'] = MD_CHAR_ESCAPE; md->active_char['&'] = MD_CHAR_ENTITITY; if( extensions & MKDEXT_AUTOLINK ) { md->active_char[':'] = MD_CHAR_AUTOLINK_URL; md->active_char['@'] = MD_CHAR_AUTOLINK_EMAIL; md->active_char['w'] = MD_CHAR_AUTOLINK_WWW; } if( extensions & MKDEXT_SUPERSCRIPT ) md->active_char['^'] = MD_CHAR_SUPERSCRIPT; /* Extension data */ md->ext_flags = 
extensions; md->opaque = opaque; md->max_nesting = max_nesting; md->in_link_body = 0; return md; } void sd_markdown_render( struct buf* ob, const uint8_t* document, size_t doc_size, struct sd_markdown* md ) { #define MARKDOWN_GROW( x ) ( (x) + ( (x) >> 1 ) ) static const char UTF8_BOM[] = { 0xEF, 0xBB, 0xBF }; struct buf* text; size_t beg, end; text = bufnew( 64 ); if( !text ) return; /* Preallocate enough space for our buffer to avoid expanding while copying */ bufgrow( text, doc_size ); /* reset the references table */ memset( &md->refs, 0x0, REF_TABLE_SIZE * sizeof(void*) ); /* first pass: looking for references, copying everything else */ beg = 0; /* Skip a possible UTF-8 BOM, even though the Unicode standard * discourages having these in UTF-8 documents */ if( doc_size >= 3 && memcmp( document, UTF8_BOM, 3 ) == 0 ) beg += 3; while( beg < doc_size ) /* iterating over lines */ if( is_ref( document, beg, doc_size, &end, md->refs ) ) beg = end; else /* skipping to the next line */ { end = beg; while( end < doc_size && document[end] != '\n' && document[end] != '\r' ) end++; /* adding the line body if present */ if( end > beg ) expand_tabs( text, document + beg, end - beg ); while( end < doc_size && (document[end] == '\n' || document[end] == '\r') ) { /* add one \n per newline */ if( document[end] == '\n' || (end + 1 < doc_size && document[end + 1] != '\n') ) bufputc( text, '\n' ); end++; } beg = end; } /* pre-grow the output buffer to minimize allocations */ bufgrow( ob, MARKDOWN_GROW( text->size ) ); /* second pass: actual rendering */ if( md->cb.doc_header ) md->cb.doc_header( ob, md->opaque ); if( text->size ) { /* adding a final newline if not already present */ if( text->data[text->size - 1] != '\n' && text->data[text->size - 1] != '\r' ) bufputc( text, '\n' ); parse_block( ob, md, text->data, text->size ); } if( md->cb.doc_footer ) md->cb.doc_footer( ob, md->opaque ); /* clean-up */ bufrelease( text ); free_link_refs( md->refs ); assert( 
md->work_bufs[BUFFER_SPAN].size == 0 ); assert( md->work_bufs[BUFFER_BLOCK].size == 0 ); } void sd_markdown_free( struct sd_markdown* md ) { size_t i; for( i = 0; i < (size_t) md->work_bufs[BUFFER_SPAN].asize; ++i ) bufrelease( md->work_bufs[BUFFER_SPAN].item[i] ); for( i = 0; i < (size_t) md->work_bufs[BUFFER_BLOCK].asize; ++i ) bufrelease( md->work_bufs[BUFFER_BLOCK].item[i] ); stack_free( &md->work_bufs[BUFFER_SPAN] ); stack_free( &md->work_bufs[BUFFER_BLOCK] ); free( md ); } void sd_version( int* ver_major, int* ver_minor, int* ver_revision ) { *ver_major = SUNDOWN_VER_MAJOR; *ver_minor = SUNDOWN_VER_MINOR; *ver_revision = SUNDOWN_VER_REVISION; } /* vim: set filetype=c: */