Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemathinc
GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/frontend/editors/slate/markdown-to-slate/parse-markdown.ts
1697 views
1
/*
2
* This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
3
* License: MS-RSL – see LICENSE.md for details
4
*/
5
6
/*
This parses markdown using our markdown-it based parser, which is
math-enhanced beyond what plain markdown provides: math is stripped
out first, before the text is considered as markdown. This avoids
issues with math formulas that can be mistaken for markdown
syntax, which is a problem with many math markdown plugins.

To quote the markdown-it docs: "Why not AST? Because it's
not needed for our tasks. We follow KISS principle. If you wish -
you can call a parser without a renderer and convert the token
stream to an AST." That AST is what slate is.
https://github.com/markdown-it/markdown-it/blob/master/docs/architecture.md
*/
19
20
import { markdown_it, parseHeader } from "@cocalc/frontend/markdown";
21
22
// Use this instead of the above to test with no plugins, which
23
// can be useful for isolating performance issues.
24
//import MarkdownIt from "markdown-it";
25
//const markdown_it = new MarkdownIt();
26
27
import type { References, Token } from "./types";
28
29
// markdown-it (and the markdown spec) silently removes a single trailing
// space from any line, but that is often exactly what people have typed
// while they are in the middle of writing. With collaborative editing this
// is a massive problem, since one user removes the other user's trailing
// space, which results in merge conflicts and thus dropped content.
//
// Workaround: before feeding the text to markdown-it, every line that ends
// in a single trailing space gets the following otherwise unused unicode
// character appended, so the space is no longer trailing. Once tokenized,
// the character is removed again, leaving the single trailing space.
//
// Note that the same problem can still happen while the user temporarily
// has *two spaces* at the end of a line. However, that means a line break
// in markdown, and at this point there is little that can be done.
const TRAILING_WHITESPACE_CHR = "\uFE20";
const TRAILING_WHITESPACE_SUB = " " + TRAILING_WHITESPACE_CHR;
const TRAILING_WHITESPACE_REG = /\uFE20/g;

/**
 * Append TRAILING_WHITESPACE_CHR after every single trailing space.
 *
 * A line qualifies when it ends in exactly one space that directly follows
 * a non-whitespace character; lines ending in two or more whitespace
 * characters, and lines consisting only of whitespace, are left untouched.
 *
 * @param markdown - the markdown source, possibly spanning many lines
 * @returns the source with " " + marker in place of each qualifying space
 */
function replaceSingleTrailingWhitespace(markdown: string): string {
  // The direct way to write this is a lookbehind, /(?<=\S) $/gm, but that
  // was not supported by Safari; see
  // https://stackoverflow.com/questions/51568821/works-in-chrome-but-breaks-in-safari-invalid-regular-expression-invalid-group
  // Instead the preceding character is captured and emitted again:
  //   (\S) = the non-whitespace character before the space
  //   " "  = a single space
  //   $    = end of a line (because of "m"), not just end of the string
  //   g    = handle every line
  return markdown.replace(
    /(\S) $/gm,
    (_match: string, before: string) => before + TRAILING_WHITESPACE_SUB,
  );
}
62
63
/**
 * Remove every TRAILING_WHITESPACE_CHR marker from a token tree, undoing
 * replaceSingleTrailingWhitespace once markdown-it has tokenized the text.
 *
 * The tokens are modified in place. Both `content` and `info` are cleaned:
 * markdown-it's fence rule stores the remainder of the opening fence line
 * in `info`, so a line such as "```py " (one trailing space) would
 * otherwise leak the marker into the code block's info string.
 *
 * @param tokens - markdown-it tokens; each may carry `content`, `info`
 *   and a (possibly null) `children` array of further tokens
 */
function restoreSingleTrailingWhitespace(tokens) {
  for (const token of tokens) {
    for (const field of ["content", "info"] as const) {
      const value = token[field];
      // The includes() test avoids allocating a new string for the vast
      // majority of tokens, which never contain the marker.
      if (typeof value === "string" && value.includes(TRAILING_WHITESPACE_CHR)) {
        token[field] = value.replace(TRAILING_WHITESPACE_REG, "");
      }
    }
    // Always descend: whether the children contain the marker must not
    // depend on what the parent's own content happens to look like.
    if (token.children != null) {
      restoreSingleTrailingWhitespace(token.children);
    }
  }
}
73
74
/**
 * Tokenize a markdown document with our markdown-it instance.
 *
 * Steps, in order:
 *  1. strip whitespace after code fence delimiters
 *     (trailingCodeblockWhitespaceHack; also turns a missing input into "");
 *  2. unless `no_meta` is set, split off the header via parseHeader and
 *     continue with the remaining body only;
 *  3. record the body's lines, before any marker characters are inserted;
 *  4. protect single trailing spaces, tokenize, then remove the markers
 *     from the resulting tokens again.
 *
 * @param markdown - the markdown source
 * @param no_meta - when truthy, the header is not parsed and `meta` stays
 *   undefined
 * @returns `tokens` from markdown-it; `meta`, the header reported by
 *   parseHeader (if it was parsed); `lines`, the body split on "\n";
 *   and `references`, whatever markdown-it stored under `references` on
 *   the environment object passed to it
 */
export function parse_markdown(
  markdown: string,
  no_meta?: boolean,
): {
  tokens: Token[];
  meta?: string;
  lines: string[];
  references?: References;
} {
  let meta: undefined | string = undefined;
  let body = trailingCodeblockWhitespaceHack(markdown);

  if (!no_meta) {
    const parsed = parseHeader(body);
    body = parsed.body;
    meta = parsed.header;
  }

  // Split before inserting the trailing-space markers, so that the
  // returned lines are exactly what is in the document body.
  const lines = body.split("\n");

  // markdown-it's environment object; it fills in `references` while
  // parsing, which is why this is created here and read back below.
  const env: any = {};
  const tokens: Token[] = markdown_it.parse(
    replaceSingleTrailingWhitespace(body),
    env,
  );
  restoreSingleTrailingWhitespace(tokens);

  // For debugging, time this function with Date.now() and log `tokens`.
  return { tokens, meta, lines, references: env.references };
}
105
106
/**
 * Strip the whitespace that follows a bare run of three or more backticks
 * on a line of its own (i.e. a code fence delimiter without info string).
 *
 * Without this, a closing ``` followed by whitespace ends up inside the
 * code block's content instead of terminating it, which throws off the
 * code that consumes the tokens. See
 * https://github.com/sagemathinc/cocalc/issues/6564
 * NOTE(review): presumably this happens because a fence followed by a
 * single space gets the trailing-whitespace marker character appended
 * before tokenizing, after which the line no longer is a valid closing
 * fence -- confirm.
 *
 * Only whitespace on the fence line itself is removed. Line terminators
 * are never touched, so blank lines following a fence (e.g. an empty
 * first line of a code block) and the final newline are preserved.
 *
 * @param markdown - the markdown source; a missing or empty value is
 *   tolerated, since undefined has been observed here in production
 * @returns the cleaned source, or "" when there was no input
 */
function trailingCodeblockWhitespaceHack(markdown: string): string {
  if (!markdown) {
    return "";
  }
  // (```+) captures the delimiter, since one can use more than three
  // backticks; [^\S\r\n\u2028\u2029]+ is "whitespace, but not a line
  // terminator". A plain \s+ would also swallow the newline(s) after the
  // fence, deleting blank lines from the document.
  return markdown.replace(/^(```+)[^\S\r\n\u2028\u2029]+$/gm, "$1");
}
126
127