Path: blob/master/src/packages/frontend/editors/slate/markdown-to-slate/parse-markdown.ts
1697 views
/*
 *  This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
 *  License: MS-RSL – see LICENSE.md for details
 */

/*
This parses markdown using our markdown-it based parser,
but math-enhanced beyond what just markdown provides, by
stripping math first before considering markdown.  This avoids
issues with math formulas that can be mistaken for markdown
syntax, which is a problem with many math markdown plugins.

To quote the markdown-it docs: "Why not AST? Because it's
not needed for our tasks. We follow KISS principle. If you wish -
you can call a parser without a renderer and convert the token
stream to an AST."  That AST is what slate is.
https://github.com/markdown-it/markdown-it/blob/master/docs/architecture.md
*/

import { markdown_it, parseHeader } from "@cocalc/frontend/markdown";

// Use this instead of the above to test with no plugins, which
// can be useful for isolating performance issues.
//import MarkdownIt from "markdown-it";
//const markdown_it = new MarkdownIt();

import type { References, Token } from "./types";

// Before feeding to markdown-it and tokenizing, for
// each line that ends in a single trailing space,
// append the following unused unicode character:
const TRAILING_WHITESPACE_CHR = "\uFE20";
// What a single trailing space is rewritten to: the space itself
// followed by the marker character.
const TRAILING_WHITESPACE_SUB = " " + TRAILING_WHITESPACE_CHR;
// Matches every occurrence of the marker so it can be removed again.
const TRAILING_WHITESPACE_REG = /\uFE20/g;
// Once tokenized, we remove the funny unicode character, leaving the
// single trailing space.
// This is critical to do since markdown-it (and the markdown spec)
// just silently removes a single trailing space from any line,
// but that's often what people type as they are typing. With
// collaborative editing, this is a massive problem, since one
// user removes the other user's trailing space, which results in
// merge conflicts and thus dropped content. Super annoying.
// Note that this sort of problem can still happen when the user
// types *two spaces* temporarily at the end of a line.
// However, that means newline in markdown, and at this point there is little
// that can be done.

/**
 * Protect single trailing spaces from being dropped by the markdown parser.
 *
 * Every line that ends in a non-whitespace character followed by exactly one
 * space gets TRAILING_WHITESPACE_SUB (a space plus the marker character) in
 * place of that space.  Lines ending in two or more spaces, and lines that
 * consist only of a space, are left alone.
 *
 * @param markdown - the markdown source
 * @returns the source with the marker appended after each single trailing space
 */
function replaceSingleTrailingWhitespace(markdown: string): string {
  // This one little regexp does exactly what we want...
  //   (?<=\S) = match a non-whitespace but don't capture it - see https://stackoverflow.com/questions/3926451/how-to-match-but-not-capture-part-of-a-regex
  //   \       = single space
  //   $       = end of line, because of the "m"
  //   gm      = global and m means $ matches end of each line, not whole string.
  //return markdown.replace(/(?<=\S)\ $/gm, TRAILING_WHITESPACE_SUB);
  // Above isn't supported by Safari, but
  // https://stackoverflow.com/questions/51568821/works-in-chrome-but-breaks-in-safari-invalid-regular-expression-invalid-group
  // suggests a slight modification that is UGLIER and slower, but works:
  return markdown.replace(
    /(?:\S)\ $/gm,
    // The match is always two characters (the non-space, then the space);
    // keep the first and swap the space for space + marker.
    (match) => match.charAt(0) + TRAILING_WHITESPACE_SUB,
  );
}

/**
 * Undo replaceSingleTrailingWhitespace on a token stream, in place.
 *
 * Removes every marker character from `content` of each token.  Children are
 * only visited when the parent's own content contained the marker.
 *
 * @param tokens - tokens to clean; they are mutated
 */
function restoreSingleTrailingWhitespace(tokens: Token[]): void {
  for (const token of tokens) {
    if (!token.content || !token.content.includes(TRAILING_WHITESPACE_CHR)) {
      continue;
    }
    token.content = token.content.replace(TRAILING_WHITESPACE_REG, "");
    if (token.children != null) {
      restoreSingleTrailingWhitespace(token.children);
    }
  }
}

/**
 * Tokenize markdown with markdown_it, preserving single trailing spaces.
 *
 * @param markdown - the markdown source
 * @param no_meta - when true, skip parseHeader, so `meta` is undefined and
 *   the whole input is treated as the body
 * @returns
 *   - `tokens`: the token stream, with the trailing-space markers removed
 *   - `meta`: the header returned by parseHeader (undefined if `no_meta`)
 *   - `lines`: the body split on "\n", taken after the code fence cleanup and
 *     header removal but *before* the trailing-space markers are inserted
 *   - `references`: whatever the parser stored under `references` in the
 *     environment object passed to it
 */
export function parse_markdown(
  markdown: string,
  no_meta?: boolean,
): {
  tokens: Token[];
  meta?: string;
  lines: string[];
  references?: References;
} {
  // const t0 = Date.now();
  let meta: undefined | string = undefined;

  markdown = trailingCodeblockWhitespaceHack(markdown);

  if (!no_meta) {
    const x = parseHeader(markdown);
    markdown = x.body;
    meta = x.header;
  }

  // Split before inserting markers, so `lines` never contains them.
  const lines = markdown.split("\n");
  markdown = replaceSingleTrailingWhitespace(markdown);
  // Environment object handed to the parser; `references` is read back
  // from it after parsing.
  const env: { references?: References } = {};
  const tokens: Token[] = markdown_it.parse(markdown, env);
  restoreSingleTrailingWhitespace(tokens);

  // window.parse_markdown = { tokens, meta };
  // console.log("time: parse_markdown", Date.now() - t0, " ms");
  // console.log("tokens", tokens);
  return { tokens, meta, lines, references: env.references };
}

/**
 * Strip trailing whitespace from lines that consist only of a backtick code
 * fence (three or more backticks).
 *
 * Markdown-it leaves in the ending ``` when there happens to be
 * whitespace after it, but otherwise doesn't.  This throws off the
 * rest of the parsing, so we have to strip it.  See
 * https://github.com/sagemathinc/cocalc/issues/6564
 *
 * NOTE(review): a plausible cause is that parse_markdown later runs
 * replaceSingleTrailingWhitespace, which turns a fence line with one trailing
 * space into a line ending in the marker character; such a line presumably no
 * longer counts as a closing fence.  Confirm against markdown-it.
 *
 * @param markdown - the markdown source; falsy input (an undefined value was
 *   seen in production) yields ""
 * @returns the source with whitespace after bare fences removed
 */
function trailingCodeblockWhitespaceHack(markdown: string): string {
  if (!markdown) {
    // some weird situation even resulted being undefined in prod, and
    // this special case also works around that...
    return "";
  }
  // The capture group keeps the fence itself, since one can use more than
  // 3 backticks as a delimiter.  The whitespace class is [^\S\n] ("any
  // whitespace except a newline") rather than \s: \s also matches "\n", so
  // a fence followed by an empty line would match as "```\n" and the empty
  // line would be deleted, shifting line numbers and dropping a blank first
  // line of a code block.
  return markdown.replace(/^(```+)[^\S\n]+$/gm, "$1");
}