upgrade md4c to 601885f738

This commit is contained in:
Rasmus Andersson 2020-10-16 15:07:19 -07:00
parent 9941275ff7
commit 9e251082ee
7 changed files with 482 additions and 399 deletions

31
markdown.d.ts vendored
View file

@ -6,22 +6,16 @@ export function parse(s :Source, o? :ParseOptions & { asMemoryView? :never|false
export function parse(s :Source, o? :ParseOptions & { asMemoryView :true }) :Uint8Array
/** Markdown source code can be provided as a JavaScript string or UTF8 encoded data */
type Source = string|ArrayLike<number>
type Source = string | ArrayLike<number>
/** Options for the parse function */
export interface ParseOptions {
/**
* Customize parsing.
* If not provided, the following flags are used, equating to github-style parsing:
* COLLAPSE_WHITESPACE
* PERMISSIVE_ATX_HEADERS
* PERMISSIVE_URL_AUTO_LINKS
* STRIKETHROUGH
* TABLES
* TASK_LISTS
*/
/** Customize parsing. Defaults to ParseFlags.DEFAULT */
parseFlags? :ParseFlags
/** Select output format. Defaults to "html" */
format? : "html" | "xhtml"
/**
* asMemoryView=true causes parse() to return a view of heap memory as a Uint8Array,
* instead of a string.
@ -51,7 +45,18 @@ export enum ParseFlags {
/** Enable tables extension. */ TABLES,
/** Enable task list extension. */ TASK_LISTS,
/** Enable wiki links extension. */ WIKI_LINKS,
/** Enable underline extension (disables '_' for emphasis) */ UNDERLINE,
/** Default flags */ DEFAULT,
/** Shorthand for NO_HTML_BLOCKS | NO_HTML_SPANS */ NO_HTML,
/** Default flags are:
* COLLAPSE_WHITESPACE |
* PERMISSIVE_ATX_HEADERS |
* PERMISSIVE_URL_AUTO_LINKS |
* STRIKETHROUGH |
* TABLES |
* TASK_LISTS
*/
DEFAULT,
/** Shorthand for NO_HTML_BLOCKS | NO_HTML_SPANS */
NO_HTML,
}

View file

@ -34,6 +34,7 @@ typedef struct HtmlRenderer_st {
WBuf* outbuf;
int imgnest;
int addanchor;
u32 flags;
} HtmlRenderer;
@ -246,7 +247,7 @@ static void render_close_img_span(HtmlRenderer* r, const MD_SPAN_IMG_DETAIL* det
render_literal(r, "\" title=\"");
render_attribute(r, &det->title);
}
render_literal(r, "\">");
render_literal(r, (r->flags & MD_HTML_FLAG_XHTML) ? "\"/>" : "\">");
r->imgnest--;
}
@ -273,7 +274,7 @@ static int enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
case MD_BLOCK_UL: render_literal(r, "<ul>\n"); break;
case MD_BLOCK_OL: render_open_ol_block(r, (const MD_BLOCK_OL_DETAIL*)detail); break;
case MD_BLOCK_LI: render_open_li_block(r, (const MD_BLOCK_LI_DETAIL*)detail); break;
case MD_BLOCK_HR: render_literal(r, "<hr>\n"); break;
case MD_BLOCK_HR: render_literal(r, (r->flags & MD_HTML_FLAG_XHTML) ? "<hr/>\n" : "<hr>\n"); break;
case MD_BLOCK_H:
{
render_literal(r, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]);
@ -324,14 +325,28 @@ static int enter_span_callback(MD_SPANTYPE type, void* detail, void* userdata) {
HtmlRenderer* r = (HtmlRenderer*) userdata;
if(r->imgnest > 0) {
/* We are inside an image, i.e. rendering the ALT attribute of
* <IMG> tag. */
/* We are inside a Markdown image label. Markdown allows any emphasis
* and other rich content in that context, just as it does in any
* link label.
*
* However, unlike in the case of links (where that contents becomes
* contents of the <a>...</a> tag), in the case of images the contents
* is supposed to fall into the attribute alt: <img alt="...">.
*
* In that context we naturally cannot output nested HTML tags. So let's
* suppress them and only output the plain text (i.e. what falls into
* text() callback).
*
* This make-it-a-plain-text approach is the recommended practice by
* CommonMark specification (for HTML output).
*/
return 0;
}
switch(type) {
case MD_SPAN_EM: render_literal(r, "<em>"); break;
case MD_SPAN_STRONG: render_literal(r, "<b>"); break;
case MD_SPAN_U: render_literal(r, "<u>"); break;
case MD_SPAN_A: render_open_a_span(r, (MD_SPAN_A_DETAIL*) detail); break;
case MD_SPAN_IMG: render_open_img_span(r, (MD_SPAN_IMG_DETAIL*) detail); break;
case MD_SPAN_CODE: render_literal(r, "<code>"); break;
@ -348,8 +363,8 @@ static int leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata) {
HtmlRenderer* r = (HtmlRenderer*) userdata;
if(r->imgnest > 0) {
/* We are inside an image, i.e. rendering the ALT attribute of
* <IMG> tag. */
/* Ditto as in enter_span_callback(), except we have to allow the
* end of the <img> tag. */
if(r->imgnest == 1 && type == MD_SPAN_IMG)
render_close_img_span(r, (MD_SPAN_IMG_DETAIL*) detail);
return 0;
@ -358,6 +373,7 @@ static int leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata) {
switch(type) {
case MD_SPAN_EM: render_literal(r, "</em>"); break;
case MD_SPAN_STRONG: render_literal(r, "</b>"); break;
case MD_SPAN_U: render_literal(r, "</u>"); break;
case MD_SPAN_A: render_literal(r, "</a>"); break;
case MD_SPAN_IMG: /*noop, handled above*/ break;
case MD_SPAN_CODE: render_literal(r, "</code>"); break;
@ -395,7 +411,17 @@ static int text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, vo
switch(type) {
case MD_TEXT_NULLCHAR: render_text(r, ucReplacementUTF8, sizeof(ucReplacementUTF8)); break;
case MD_TEXT_BR: render_literal(r, (r->imgnest == 0 ? "<br>\n" : " ")); break;
case MD_TEXT_BR:
render_literal(
r,
r->imgnest == 0 ?
((r->flags & MD_HTML_FLAG_XHTML) ? "<br/>\n" : "<br>\n") :
" "
);
break;
render_literal(r, (r->flags & MD_HTML_FLAG_XHTML) ? "<hr/>\n" : "<hr>\n"); break;
case MD_TEXT_SOFTBR: render_literal(r, (r->imgnest == 0 ? "\n" : " ")); break;
case MD_TEXT_HTML: render_text(r, text, size); break;
case MD_TEXT_ENTITY: render_text(r, text, size); break;
@ -409,9 +435,14 @@ static int text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, vo
// dlog("MD4C: %s\n", msg);
// }
int
fmt_html(const MD_CHAR* input, MD_SIZE input_size, WBuf* outbuf, unsigned parser_flags) {
HtmlRenderer render = { outbuf, 0, 0 };
int fmt_html(
const MD_CHAR* input,
MD_SIZE input_size,
WBuf* outbuf,
u32 parser_flags,
u32 render_flags
) {
HtmlRenderer render = { outbuf, 0, 0, render_flags };
MD_PARSER parser = {
0,

View file

@ -1,4 +1,6 @@
#pragma once
#include "wbuf.h"
int fmt_html(const char* input, u32 inputlen, WBuf* outbuf, u32 parserFlags);
#define MD_HTML_FLAG_XHTML 0x0008 // instead of e.g. <br>, generate <br/>
int fmt_html(const char* input, u32 inputlen, WBuf* outbuf, u32 parserFlags, u32 renderFlags);

View file

@ -5,13 +5,10 @@
#include "fmt_html.h"
// #include "fmt_json.h"
// #include "md4c.h"
/* If set, debug output from md_parse() is sent to stderr. */
#define MD_RENDER_FLAG_DEBUG 0x0001
#define MD_RENDER_FLAG_VERBATIM_ENTITIES 0x0002
// these should be in sync with "OutputFlags" in md.js
typedef enum OutputFlags {
OutputFlagsHTML = 1 << 0,
OutputFlagHTML = 1 << 0,
OutputFlagXHTML = 1 << 1,
} OutputFlags;
typedef enum ErrorCode {
@ -44,10 +41,15 @@ export size_t parseUTF8(
WBufReset(&outbuf);
if (outflags & OutputFlagsHTML) {
if (outflags & OutputFlagHTML) {
WBufReserve(&outbuf, inbuflen * 2); // approximate output size to minimize reallocations
if (fmt_html(inbufptr, inbuflen, &outbuf, parser_flags) != 0) {
u32 render_flags = 0;
if (outflags & OutputFlagXHTML) {
render_flags |= MD_HTML_FLAG_XHTML;
}
if (fmt_html(inbufptr, inbuflen, &outbuf, parser_flags, render_flags) != 0) {
// fmt_html returns status of md_parse which only fails in extreme cases
// like when out of memory. md4c does not provide error codes or error messages.
WErrSet(ERR_MD_PARSE, "md parser error");

View file

@ -12,7 +12,7 @@ export const ParseFlags = {
PERMISSIVE_ATX_HEADERS: 0x0002, // Do not require space in ATX headers ( ###header )
PERMISSIVE_URL_AUTO_LINKS: 0x0004, // Recognize URLs as links even without <...>
PERMISSIVE_EMAIL_AUTO_LINKS: 0x0008, // Recognize e-mails as links even without <...>
NO_INDENTED_CODE_BLOCKS: 0x0010, // Disable indented code blocks. (Only fenced code works.)
NO_INDENTED_CODE_BLOCKS: 0x0010, // Disable indented code blocks. (Only fenced code works)
NO_HTML_BLOCKS: 0x0020, // Disable raw HTML blocks.
NO_HTML_SPANS: 0x0040, // Disable raw HTML (inline).
TABLES: 0x0100, // Enable tables extension.
@ -21,6 +21,7 @@ export const ParseFlags = {
TASK_LISTS: 0x0800, // Enable task list extension.
LATEX_MATH_SPANS: 0x1000, // Enable $ and $$ containing LaTeX equations.
WIKI_LINKS: 0x2000, // Enable wiki links extension.
UNDERLINE: 0x4000, // Enable underline extension (disables '_' for emphasis)
// Github style default flags
DEFAULT: 0x0001 | 0x0002 | 0x0004 | 0x0200 | 0x0100 | 0x0800,
@ -34,38 +35,46 @@ export const ParseFlags = {
NO_HTML: 0x0020 | 0x0040, // NO_HTML_BLOCKS | NO_HTML_SPANS
}
// these should be in sync with "OutputFlags" in md.c
const OutputFlags = {
HTML: 1 << 0, // Output HTML
}
const defaultOptions = {
parseFlags: ParseFlags.DEFAULT,
// how to format the output
format: "html",
// Return a view of heap memory as a Uint8Array, instead of a string.
//
// The returned Uint8Array is only valid until the next call to parse().
// If you need to keep the returned Uint8Array around, call Uint8Array.slice()
// to make a copy, as each call to parse() reuses the same underlying memory.
asMemoryView: false,
HTML: 1 << 0, // Output HTML
XHTML: 1 << 1, // Output XHTML (only has effect with HTML flag set)
}
export function parse(source, options) {
options = options ? {__proto__:defaultOptions, ...options} : defaultOptions
let outflags = (0
| (options.format == "html" ? OutputFlags.HTML : 0)
options = options || {}
let parseFlags = (
options.parseFlags === undefined ? ParseFlags.DEFAULT :
options.parseFlags
)
let outputFlags = 0
switch (options.format) {
case "xhtml":
outputFlags |= OutputFlags.HTML | OutputFlags.XHTML
break
case "html":
case undefined:
case null:
case "":
outputFlags |= OutputFlags.HTML
break
default:
throw new Error(`invalid format "${options.format}"`)
}
let buf = typeof source == "string" ? utf8.encode(source) : source
let outbuf = withOutPtr(outptr => withTmpBytePtr(buf, (inptr, inlen) =>
_parseUTF8(inptr, inlen, options.parseFlags, outflags, outptr)
_parseUTF8(inptr, inlen, parseFlags, outputFlags, outptr)
))
// check for error and throw if needed
werrCheck()
// DEBUG
// if (outbuf) {
// console.log(utf8.decode(outbuf))
// }

File diff suppressed because it is too large Load diff

View file

@ -2,7 +2,7 @@
* MD4C: Markdown parser for C
* (http://github.com/mity/md4c)
*
* Copyright (c) 2016-2019 Martin Mitas
* Copyright (c) 2016-2020 Martin Mitas
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@ -23,15 +23,15 @@
* IN THE SOFTWARE.
*/
#ifndef MD4C_MARKDOWN_H
#define MD4C_MARKDOWN_H
#ifndef MD4C_H
#define MD4C_H
#ifdef __cplusplus
extern "C" {
#endif
#if defined MD4C_USE_UTF16
/* Magic to support UTF-16. Not that in order to use it, you have to define
/* Magic to support UTF-16. Note that in order to use it, you have to define
* the macro MD4C_USE_UTF16 both when building MD4C as well as when
* including this header in your code. */
#ifdef _WIN32
@ -119,7 +119,7 @@ typedef enum MD_SPANTYPE {
* Detail: Structure MD_SPAN_IMG_DETAIL.
* Note: Image text can contain nested spans and even nested images.
* If rendered into ALT attribute of HTML <IMG> tag, it's responsibility
* of the renderer to deal with it.
* of the renderer (i.e. the application's callbacks) to deal with it.
*/
MD_SPAN_IMG,
@ -140,7 +140,11 @@ typedef enum MD_SPANTYPE {
/* Wiki links
* Note: Recognized only when MD_FLAG_WIKILINKS is enabled.
*/
MD_SPAN_WIKILINK
MD_SPAN_WIKILINK,
/* <u>...</u>
* Note: Recognized only when MD_FLAG_UNDERLINE is enabled. */
MD_SPAN_U
} MD_SPANTYPE;
/* Text is the actual textual contents of span. */
@ -159,7 +163,7 @@ typedef enum MD_TEXTTYPE {
MD_TEXT_SOFTBR, /* '\n' in source text where it is not semantically meaningful (soft break) */
/* Entity.
* (a) Named entity, e.g. &nbsp;
* (a) Named entity, e.g. &nbsp;
* (Note MD4C does not have a list of known entities.
* Anything matching the regexp /&[A-Za-z][A-Za-z0-9]{1,47};/ is
* treated as a named entity.)
@ -167,7 +171,7 @@ typedef enum MD_TEXTTYPE {
* (c) Hexadecimal entity, e.g. &#x12AB;
*
* As MD4C is mostly encoding agnostic, application gets the verbatim
* entity text into the MD_RENDERER::text_callback(). */
* entity text into the MD_PARSER::text_callback(). */
MD_TEXT_ENTITY,
/* Text in a code block (inside MD_BLOCK_CODE) or inlined code (`code`).
@ -202,8 +206,13 @@ typedef enum MD_ALIGN {
* propagated within various detailed structures, but which still may contain
* string portions of different types like e.g. entities.
*
* So, for example, lets consider an image has a title attribute string
* set to "foo &quot; bar". (Note the string size is 14.)
* So, for example, let's consider this image:
*
* ![image alt text](http://example.org/image.png 'foo &quot; bar')
*
* The image alt text is propagated as a normal text via the MD_PARSER::text()
* callback. However, the image title ('foo &quot; bar') is propagated as
* MD_ATTRIBUTE in MD_SPAN_IMG_DETAIL::title.
*
* Then the attribute MD_SPAN_IMG_DETAIL::title shall provide the following:
* -- [0]: "foo " (substr_types[0] == MD_TEXT_NORMAL; substr_offsets[0] == 0)
@ -211,10 +220,12 @@ typedef enum MD_ALIGN {
* -- [2]: " bar" (substr_types[2] == MD_TEXT_NORMAL; substr_offsets[2] == 10)
* -- [3]: (n/a) (n/a ; substr_offsets[3] == 14)
*
* Note that these conditions are guaranteed:
* Note that these invariants are always guaranteed:
* -- substr_offsets[0] == 0
* -- substr_offsets[LAST+1] == size
* -- Only MD_TEXT_NORMAL, MD_TEXT_ENTITY, MD_TEXT_NULLCHAR substrings can appear.
* -- Currently, only MD_TEXT_NORMAL, MD_TEXT_ENTITY, MD_TEXT_NULLCHAR
* substrings can appear. This could change only if the specification
* changes.
*/
typedef struct MD_ATTRIBUTE {
const MD_CHAR* text;
@ -280,7 +291,7 @@ typedef struct MD_SPAN_WIKILINK {
/* Flags specifying extensions/deviations from CommonMark specification.
*
* By default (when MD_RENDERER::flags == 0), we follow CommonMark specification.
* By default (when MD_PARSER::flags == 0), we follow CommonMark specification.
* The following flags may allow some extensions or deviations from it.
*/
#define MD_FLAG_COLLAPSEWHITESPACE 0x0001 /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */
@ -296,6 +307,7 @@ typedef struct MD_SPAN_WIKILINK {
#define MD_FLAG_TASKLISTS 0x0800 /* Enable task list extension. */
#define MD_FLAG_LATEXMATHSPANS 0x1000 /* Enable $ and $$ containing LaTeX equations. */
#define MD_FLAG_WIKILINKS 0x2000 /* Enable wiki links extension. */
#define MD_FLAG_UNDERLINE 0x4000 /* Enable underline extension (and disables '_' for normal emphasis). */
#define MD_FLAG_PERMISSIVEAUTOLINKS (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS)
#define MD_FLAG_NOHTML (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS)
@ -312,7 +324,7 @@ typedef struct MD_SPAN_WIKILINK {
#define MD_DIALECT_COMMONMARK 0
#define MD_DIALECT_GITHUB (MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH | MD_FLAG_TASKLISTS)
/* Renderer structure.
/* Parser structure.
*/
typedef struct MD_PARSER {
/* Reserved. Set to zero.
@ -333,9 +345,10 @@ typedef struct MD_PARSER {
*
* Note any strings provided to the callbacks as their arguments or as
* members of any detail structure are generally not zero-terminated.
* Application has take the respective size information into account.
* Application has to take the respective size information into account.
*
* Callbacks may abort further parsing of the document by returning non-zero.
* Any rendering callback may abort further parsing of the document by
* returning non-zero.
*/
int (*enter_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
int (*leave_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
@ -360,18 +373,19 @@ typedef struct MD_PARSER {
} MD_PARSER;
/* For backward compatibility. Do not use in new code. */
/* For backward compatibility. Do not use in new code.
*/
typedef MD_PARSER MD_RENDERER;
/* Parse the Markdown document stored in the string 'text' of size 'size'.
* The renderer provides callbacks to be called during the parsing so the
* The parser provides callbacks to be called during the parsing so the
* caller can render the document on the screen or convert the Markdown
* to another format.
*
* Zero is returned on success. If a runtime error occurs (e.g. a memory
* allocation fails), -1 is returned. If the processing is aborted due to any callback
* returning non-zero, md_parse() the return value of the callback is returned.
* returning non-zero, the return value of the callback is returned.
*/
int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata);
@ -380,4 +394,4 @@ int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* u
} /* extern "C" { */
#endif
#endif /* MD4C_MARKDOWN_H */
#endif /* MD4C_H */