upgrade md4c to 601885f738

2026-05-21 21:38:37 +00:00 · 2020-10-16 15:07:19 -07:00 · 2020-10-16 15:07:19 -07:00 · 9e251082ee
commit 9e251082ee
parent 9941275ff7
7 changed files with 482 additions and 399 deletions
--- a/markdown.d.ts
+++ b/markdown.d.ts
@ -6,22 +6,16 @@ export function parse(s :Source, o? :ParseOptions & { asMemoryView? :never|false
 export function parse(s :Source, o? :ParseOptions & { asMemoryView :true }) :Uint8Array

 /** Markdown source code can be provided as a JavaScript string or UTF8 encoded data */
-type Source = string|ArrayLike<number>
+type Source = string | ArrayLike<number>

 /** Options for the parse function */
 export interface ParseOptions {
-  /**
-   * Customize parsing.
-   * If not provided, the following flags are used, equating to github-style parsing:
-   *   COLLAPSE_WHITESPACE
-   *   PERMISSIVE_ATX_HEADERS
-   *   PERMISSIVE_URL_AUTO_LINKS
-   *   STRIKETHROUGH
-   *   TABLES
-   *   TASK_LISTS
-   */
+  /** Customize parsing. Defaults to ParseFlags.DEFAULT */
  parseFlags? :ParseFlags

+  /** Select output format. Defaults to "html" */
+  format? : "html" | "xhtml"
+
  /**
   * asMemoryView=true causes parse() to return a view of heap memory as a Uint8Array,
   * instead of a string.
@ -51,7 +45,18 @@ export enum ParseFlags {
  /** Enable tables extension. */                                 TABLES,
  /** Enable task list extension. */                              TASK_LISTS,
  /** Enable wiki links extension. */                             WIKI_LINKS,
+  /** Enable underline extension (disables '_' for emphasis) */   UNDERLINE,

-  /** Default flags */                                            DEFAULT,
-  /** Shorthand for NO_HTML_BLOCKS | NO_HTML_SPANS */             NO_HTML,
+  /** Default flags are:
+   *    COLLAPSE_WHITESPACE |
+   *    PERMISSIVE_ATX_HEADERS |
+   *    PERMISSIVE_URL_AUTO_LINKS |
+   *    STRIKETHROUGH |
+   *    TABLES |
+   *    TASK_LISTS
+   */
+  DEFAULT,
+
+  /** Shorthand for NO_HTML_BLOCKS | NO_HTML_SPANS */
+  NO_HTML,
 }
--- a/src/fmt_html.c
+++ b/src/fmt_html.c
@ -34,6 +34,7 @@ typedef struct HtmlRenderer_st {
  WBuf* outbuf;
  int   imgnest;
  int   addanchor;
+  u32   flags;
 } HtmlRenderer;


@ -246,7 +247,7 @@ static void render_close_img_span(HtmlRenderer* r, const MD_SPAN_IMG_DETAIL* det
    render_literal(r, "\" title=\"");
    render_attribute(r, &det->title);
  }
-  render_literal(r, "\">");
+  render_literal(r, (r->flags & MD_HTML_FLAG_XHTML) ? "\"/>" : "\">");
  r->imgnest--;
 }

@ -273,7 +274,7 @@ static int enter_block_callback(MD_BLOCKTYPE type, void* detail, void* userdata)
    case MD_BLOCK_UL:       render_literal(r, "<ul>\n"); break;
    case MD_BLOCK_OL:       render_open_ol_block(r, (const MD_BLOCK_OL_DETAIL*)detail); break;
    case MD_BLOCK_LI:       render_open_li_block(r, (const MD_BLOCK_LI_DETAIL*)detail); break;
-    case MD_BLOCK_HR:       render_literal(r, "<hr>\n"); break;
+    case MD_BLOCK_HR:       render_literal(r, (r->flags & MD_HTML_FLAG_XHTML) ? "<hr/>\n" : "<hr>\n"); break;
    case MD_BLOCK_H:
    {
      render_literal(r, head[((MD_BLOCK_H_DETAIL*)detail)->level - 1]);
@ -324,14 +325,28 @@ static int enter_span_callback(MD_SPANTYPE type, void* detail, void* userdata) {
  HtmlRenderer* r = (HtmlRenderer*) userdata;

  if(r->imgnest > 0) {
-    /* We are inside an image, i.e. rendering the ALT attribute of
-     * <IMG> tag. */
+    /* We are inside a Markdown image label. Markdown allows to use any
+     * emphasis and other rich contents in that context similarly as in
+     * any link label.
+     *
+     * However, unlike in the case of links (where that contents becomes
+     * contents of the <a>...</a> tag), in the case of images the contents
+     * is supposed to fall into the attribute alt: <img alt="...">.
+     *
+     * In that context we naturally cannot output nested HTML tags. So lets
+     * suppress them and only output the plain text (i.e. what falls into
+     * text() callback).
+     *
+     * This make-it-a-plain-text approach is the recommended practice by
+     * CommonMark specification (for HTML output).
+     */
    return 0;
  }

  switch(type) {
    case MD_SPAN_EM:                render_literal(r, "<em>"); break;
    case MD_SPAN_STRONG:            render_literal(r, "<b>"); break;
+    case MD_SPAN_U:                 render_literal(r, "<u>"); break;
    case MD_SPAN_A:                 render_open_a_span(r, (MD_SPAN_A_DETAIL*) detail); break;
    case MD_SPAN_IMG:               render_open_img_span(r, (MD_SPAN_IMG_DETAIL*) detail); break;
    case MD_SPAN_CODE:              render_literal(r, "<code>"); break;
@ -348,8 +363,8 @@ static int leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata) {
  HtmlRenderer* r = (HtmlRenderer*) userdata;

  if(r->imgnest > 0) {
-    /* We are inside an image, i.e. rendering the ALT attribute of
-     * <IMG> tag. */
+    /* Ditto as in enter_span_callback(), except we have to allow the
+     * end of the <img> tag. */
    if(r->imgnest == 1  &&  type == MD_SPAN_IMG)
      render_close_img_span(r, (MD_SPAN_IMG_DETAIL*) detail);
    return 0;
@ -358,6 +373,7 @@ static int leave_span_callback(MD_SPANTYPE type, void* detail, void* userdata) {
  switch(type) {
    case MD_SPAN_EM:                render_literal(r, "</em>"); break;
    case MD_SPAN_STRONG:            render_literal(r, "</b>"); break;
+    case MD_SPAN_U:                 render_literal(r, "</u>"); break;
    case MD_SPAN_A:                 render_literal(r, "</a>"); break;
    case MD_SPAN_IMG:               /*noop, handled above*/ break;
    case MD_SPAN_CODE:              render_literal(r, "</code>"); break;
@ -395,7 +411,17 @@ static int text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, vo

  switch(type) {
    case MD_TEXT_NULLCHAR:  render_text(r, ucReplacementUTF8, sizeof(ucReplacementUTF8)); break;
-    case MD_TEXT_BR:        render_literal(r, (r->imgnest == 0 ? "<br>\n" : " ")); break;
+    case MD_TEXT_BR:
+      /* Line break: a single space while inside an image label (imgnest != 0),
+       * otherwise <br/> when MD_HTML_FLAG_XHTML is set, else <br>. */
+      render_literal(
+        r,
+        r->imgnest == 0 ?
+          ((r->flags & MD_HTML_FLAG_XHTML) ? "<br/>\n" : "<br>\n") :
+          " "
+      );
+      break;
+
    case MD_TEXT_SOFTBR:    render_literal(r, (r->imgnest == 0 ? "\n" : " ")); break;
    case MD_TEXT_HTML:      render_text(r, text, size); break;
    case MD_TEXT_ENTITY:    render_text(r, text, size); break;
@ -409,9 +435,14 @@ static int text_callback(MD_TEXTTYPE type, const MD_CHAR* text, MD_SIZE size, vo
 //   dlog("MD4C: %s\n", msg);
 // }

-int
-fmt_html(const MD_CHAR* input, MD_SIZE input_size, WBuf* outbuf, unsigned parser_flags) {
-  HtmlRenderer render = { outbuf, 0, 0 };
+int fmt_html(
+  const MD_CHAR* input,
+  MD_SIZE input_size,
+  WBuf* outbuf,
+  u32 parser_flags,
+  u32 render_flags
+) {
+  HtmlRenderer render = { outbuf, 0, 0, render_flags };

  MD_PARSER parser = {
    0,
--- a/src/fmt_html.h
+++ b/src/fmt_html.h
@ -1,4 +1,6 @@
 #pragma once
 #include "wbuf.h"

-int fmt_html(const char* input, u32 inputlen, WBuf* outbuf, u32 parserFlags);
+#define MD_HTML_FLAG_XHTML 0x0008 // instead of e.g. <br>, generate <br/>
+
+int fmt_html(const char* input, u32 inputlen, WBuf* outbuf, u32 parserFlags, u32 renderFlags);
--- a/src/md.c
+++ b/src/md.c
@ -5,13 +5,10 @@
 #include "fmt_html.h"
 // #include "fmt_json.h"

-// #include "md4c.h"
-/* If set, debug output from md_parse() is sent to stderr. */
-#define MD_RENDER_FLAG_DEBUG                0x0001
-#define MD_RENDER_FLAG_VERBATIM_ENTITIES    0x0002
-
+// these should be in sync with "OutputFlags" in md.js
 typedef enum OutputFlags {
-  OutputFlagsHTML = 1 << 0,
+  OutputFlagHTML  = 1 << 0,
+  OutputFlagXHTML = 1 << 1,
 } OutputFlags;

 typedef enum ErrorCode {
@ -44,10 +41,15 @@ export size_t parseUTF8(

  WBufReset(&outbuf);

-  if (outflags & OutputFlagsHTML) {
+  if (outflags & OutputFlagHTML) {
    WBufReserve(&outbuf, inbuflen * 2);  // approximate output size to minimize reallocations

-    if (fmt_html(inbufptr, inbuflen, &outbuf, parser_flags) != 0) {
+    u32 render_flags = 0;
+    if (outflags & OutputFlagXHTML) {
+      render_flags |= MD_HTML_FLAG_XHTML;
+    }
+
+    if (fmt_html(inbufptr, inbuflen, &outbuf, parser_flags, render_flags) != 0) {
      // fmt_html returns status of md_parse which only fails in extreme cases
      // like when out of memory. md4c does not provide error codes or error messages.
      WErrSet(ERR_MD_PARSE, "md parser error");
--- a/src/md.js
+++ b/src/md.js
@ -12,7 +12,7 @@ export const ParseFlags = {
  PERMISSIVE_ATX_HEADERS:      0x0002, // Do not require space in ATX headers ( ###header )
  PERMISSIVE_URL_AUTO_LINKS:   0x0004, // Recognize URLs as links even without <...>
  PERMISSIVE_EMAIL_AUTO_LINKS: 0x0008, // Recognize e-mails as links even without <...>
-  NO_INDENTED_CODE_BLOCKS:     0x0010, // Disable indented code blocks. (Only fenced code works.)
+  NO_INDENTED_CODE_BLOCKS:     0x0010, // Disable indented code blocks. (Only fenced code works)
  NO_HTML_BLOCKS:              0x0020, // Disable raw HTML blocks.
  NO_HTML_SPANS:               0x0040, // Disable raw HTML (inline).
  TABLES:                      0x0100, // Enable tables extension.
@ -21,6 +21,7 @@ export const ParseFlags = {
  TASK_LISTS:                  0x0800, // Enable task list extension.
  LATEX_MATH_SPANS:            0x1000, // Enable $ and $$ containing LaTeX equations.
  WIKI_LINKS:                  0x2000, // Enable wiki links extension.
+  UNDERLINE:                   0x4000, // Enable underline extension (disables '_' for emphasis)

  // Github style default flags
  DEFAULT: 0x0001 | 0x0002 | 0x0004 | 0x0200 | 0x0100 | 0x0800,
@ -34,38 +35,46 @@ export const ParseFlags = {
  NO_HTML: 0x0020 | 0x0040, // NO_HTML_BLOCKS | NO_HTML_SPANS
 }

+// these should be in sync with "OutputFlags" in md.c
 const OutputFlags = {
-  HTML: 1 << 0,  // Output HTML
-}
-
-const defaultOptions = {
-  parseFlags: ParseFlags.DEFAULT,
-
-  // how to format the output
-  format: "html",
-
-  // Return a view of heap memory as a Uint8Array, instead of a string.
-  //
-  // The returned Uint8Array is only valid until the next call to parse().
-  // If you need to keep the returned Uint8Array around, call Uint8Array.slice()
-  // to make a copy, as each call to parse() reuses the same underlying memory.
-  asMemoryView: false,
+  HTML:  1 << 0, // Output HTML
+  XHTML: 1 << 1, // Output XHTML (only has effect with HTML flag set)
 }

 export function parse(source, options) {
-  options = options ? {__proto__:defaultOptions, ...options} : defaultOptions
-  let outflags = (0
-    | (options.format == "html" ? OutputFlags.HTML : 0)
+  options = options || {}
+
+  let parseFlags = (
+    options.parseFlags === undefined ? ParseFlags.DEFAULT :
+    options.parseFlags
  )

+  let outputFlags = 0
+  switch (options.format) {
+    case "xhtml":
+      outputFlags |= OutputFlags.HTML | OutputFlags.XHTML
+      break
+
+    case "html":
+    case undefined:
+    case null:
+    case "":
+      outputFlags |= OutputFlags.HTML
+      break
+
+    default:
+      throw new Error(`invalid format "${options.format}"`)
+  }
+
  let buf = typeof source == "string" ? utf8.encode(source) : source
  let outbuf = withOutPtr(outptr => withTmpBytePtr(buf, (inptr, inlen) =>
-    _parseUTF8(inptr, inlen, options.parseFlags, outflags, outptr)
+    _parseUTF8(inptr, inlen, parseFlags, outputFlags, outptr)
  ))

  // check for error and throw if needed
  werrCheck()

+  // DEBUG
  // if (outbuf) {
  //   console.log(utf8.decode(outbuf))
  // }
--- a/src/md4c.c
+++ b/src/md4c.c
--- a/src/md4c.h
+++ b/src/md4c.h
@ -2,7 +2,7 @@
 * MD4C: Markdown parser for C
 * (http://github.com/mity/md4c)
 *
- * Copyright (c) 2016-2019 Martin Mitas
+ * Copyright (c) 2016-2020 Martin Mitas
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@ -23,15 +23,15 @@
 * IN THE SOFTWARE.
 */

-#ifndef MD4C_MARKDOWN_H
-#define MD4C_MARKDOWN_H
+#ifndef MD4C_H
+#define MD4C_H

 #ifdef __cplusplus
    extern "C" {
 #endif

 #if defined MD4C_USE_UTF16
-    /* Magic to support UTF-16. Not that in order to use it, you have to define
+    /* Magic to support UTF-16. Note that in order to use it, you have to define
     * the macro MD4C_USE_UTF16 both when building MD4C as well as when
     * including this header in your code. */
    #ifdef _WIN32
@ -119,7 +119,7 @@ typedef enum MD_SPANTYPE {
     * Detail: Structure MD_SPAN_IMG_DETAIL.
     * Note: Image text can contain nested spans and even nested images.
     * If rendered into ALT attribute of HTML <IMG> tag, it's responsibility
-     * of the renderer to deal with it.
+     * of the parser to deal with it.
     */
    MD_SPAN_IMG,

@ -140,7 +140,11 @@ typedef enum MD_SPANTYPE {
    /* Wiki links
     * Note: Recognized only when MD_FLAG_WIKILINKS is enabled.
     */
-    MD_SPAN_WIKILINK
+    MD_SPAN_WIKILINK,
+
+    /* <u>...</u>
+     * Note: Recognized only when MD_FLAG_UNDERLINE is enabled. */
+    MD_SPAN_U
 } MD_SPANTYPE;

 /* Text is the actual textual contents of span. */
@ -159,7 +163,7 @@ typedef enum MD_TEXTTYPE {
    MD_TEXT_SOFTBR,     /* '\n' in source text where it is not semantically meaningful (soft break) */

    /* Entity.
-     * (a) Named entity, e.g. &nbsp;
+     * (a) Named entity, e.g. &nbsp; 
     *     (Note MD4C does not have a list of known entities.
     *     Anything matching the regexp /&[A-Za-z][A-Za-z0-9]{1,47};/ is
     *     treated as a named entity.)
@ -167,7 +171,7 @@ typedef enum MD_TEXTTYPE {
     * (c) Hexadecimal entity, e.g. &#x12AB;
     *
     * As MD4C is mostly encoding agnostic, application gets the verbatim
-     * entity text into the MD_RENDERER::text_callback(). */
+     * entity text into the MD_PARSER::text_callback(). */
    MD_TEXT_ENTITY,

    /* Text in a code block (inside MD_BLOCK_CODE) or inlined code (`code`).
@ -202,8 +206,13 @@ typedef enum MD_ALIGN {
 * propagated within various detailed structures, but which still may contain
 * string portions of different types like e.g. entities.
 *
- * So, for example, lets consider an image has a title attribute string
- * set to "foo &quot; bar". (Note the string size is 14.)
+ * So, for example, let's consider this image:
+ *
+ *     ![image alt text](http://example.org/image.png 'foo &quot; bar')
+ *
+ * The image alt text is propagated as a normal text via the MD_PARSER::text()
+ * callback. However, the image title ('foo &quot; bar') is propagated as
+ * MD_ATTRIBUTE in MD_SPAN_IMG_DETAIL::title.
 *
 * Then the attribute MD_SPAN_IMG_DETAIL::title shall provide the following:
 *  -- [0]: "foo "   (substr_types[0] == MD_TEXT_NORMAL; substr_offsets[0] == 0)
@ -211,10 +220,12 @@ typedef enum MD_ALIGN {
 *  -- [2]: " bar"   (substr_types[2] == MD_TEXT_NORMAL; substr_offsets[2] == 10)
 *  -- [3]: (n/a)    (n/a                              ; substr_offsets[3] == 14)
 *
- * Note that these conditions are guaranteed:
+ * Note that these invariants are always guaranteed:
 *  -- substr_offsets[0] == 0
 *  -- substr_offsets[LAST+1] == size
- *  -- Only MD_TEXT_NORMAL, MD_TEXT_ENTITY, MD_TEXT_NULLCHAR substrings can appear.
+ *  -- Currently, only MD_TEXT_NORMAL, MD_TEXT_ENTITY, MD_TEXT_NULLCHAR
+ *     substrings can appear. This could change only if the specification
+ *     changes.
 */
 typedef struct MD_ATTRIBUTE {
    const MD_CHAR* text;
@ -280,7 +291,7 @@ typedef struct MD_SPAN_WIKILINK {

 /* Flags specifying extensions/deviations from CommonMark specification.
 *
- * By default (when MD_RENDERER::flags == 0), we follow CommonMark specification.
+ * By default (when MD_PARSER::flags == 0), we follow CommonMark specification.
 * The following flags may allow some extensions or deviations from it.
 */
 #define MD_FLAG_COLLAPSEWHITESPACE          0x0001  /* In MD_TEXT_NORMAL, collapse non-trivial whitespace into single ' ' */
@ -296,6 +307,7 @@ typedef struct MD_SPAN_WIKILINK {
 #define MD_FLAG_TASKLISTS                   0x0800  /* Enable task list extension. */
 #define MD_FLAG_LATEXMATHSPANS              0x1000  /* Enable $ and $$ containing LaTeX equations. */
 #define MD_FLAG_WIKILINKS                   0x2000  /* Enable wiki links extension. */
+#define MD_FLAG_UNDERLINE                   0x4000  /* Enable underline extension (and disables '_' for normal emphasis). */

 #define MD_FLAG_PERMISSIVEAUTOLINKS         (MD_FLAG_PERMISSIVEEMAILAUTOLINKS | MD_FLAG_PERMISSIVEURLAUTOLINKS | MD_FLAG_PERMISSIVEWWWAUTOLINKS)
 #define MD_FLAG_NOHTML                      (MD_FLAG_NOHTMLBLOCKS | MD_FLAG_NOHTMLSPANS)
@ -312,7 +324,7 @@ typedef struct MD_SPAN_WIKILINK {
 #define MD_DIALECT_COMMONMARK               0
 #define MD_DIALECT_GITHUB                   (MD_FLAG_PERMISSIVEAUTOLINKS | MD_FLAG_TABLES | MD_FLAG_STRIKETHROUGH | MD_FLAG_TASKLISTS)

-/* Renderer structure.
+/* Parser structure.
 */
 typedef struct MD_PARSER {
    /* Reserved. Set to zero.
@ -333,9 +345,10 @@ typedef struct MD_PARSER {
     *
     * Note any strings provided to the callbacks as their arguments or as
     * members of any detail structure are generally not zero-terminated.
-     * Application has take the respective size information into account.
+     * Application has to take the respective size information into account.
     *
-     * Callbacks may abort further parsing of the document by returning non-zero.
+     * Any rendering callback may abort further parsing of the document by
+     * returning non-zero.
     */
    int (*enter_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
    int (*leave_block)(MD_BLOCKTYPE /*type*/, void* /*detail*/, void* /*userdata*/);
@ -360,18 +373,19 @@ typedef struct MD_PARSER {
 } MD_PARSER;


-/* For backward compatibility. Do not use in new code. */
+/* For backward compatibility. Do not use in new code.
+ */
 typedef MD_PARSER MD_RENDERER;


 /* Parse the Markdown document stored in the string 'text' of size 'size'.
- * The renderer provides callbacks to be called during the parsing so the
+ * The parser provides callbacks to be called during the parsing so the
 * caller can render the document on the screen or convert the Markdown
 * to another format.
 *
 * Zero is returned on success. If a runtime error occurs (e.g. a memory
 * allocation fails), -1 is returned. If the processing is aborted due to any callback
- * returning non-zero, md_parse() the return value of the callback is returned.
+ * returning non-zero, the return value of the callback is returned.
 */
 int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* userdata);

@ -380,4 +394,4 @@ int md_parse(const MD_CHAR* text, MD_SIZE size, const MD_PARSER* parser, void* u
    }  /* extern "C" { */
 #endif

-#endif  /* MD4C_MARKDOWN_H */
+#endif  /* MD4C_H */