CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
sagemathinc

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/frontend/components/html-ssr.tsx
Views: 687
1
/*
2
React component for rendering an HTML string.
3
4
- suitable for server side rendering (e.g., nextjs)
5
- parses and displays math using KaTeX
6
- sanitizes the HTML for XSS attacks, etc., so it is safe to display to users
7
- optionally transforms links
8
9
TODO: This should eventually completely replace ./html.tsx:
10
- syntax highlighting
11
- searching
12
- opens links in a new tab, or makes clicking anchor tags run a function
13
instead of opening a new tab so can open internal cocalc links inside cocalc.
14
*/
15
16
import React from "react";
17
import htmlReactParser, {
18
attributesToProps,
19
domToReact,
20
} from "html-react-parser";
21
import { Element, Text } from "domhandler";
22
import stripXSS, { safeAttrValue, whiteList } from "xss";
23
import type { IFilterXSSOptions } from "xss";
24
import { useFileContext } from "@cocalc/frontend/lib/file-context";
25
import DefaultMath from "@cocalc/frontend/components/math/ssr";
26
import { MathJaxConfig } from "@cocalc/util/mathjax-config";
27
import { decodeHTML } from "entities";
28
29
// Attribute names (despite the constant's name, these are attributes, not tags)
// whose values are treated as URLs and passed through urlTransform when one is set.
const URL_TAGS = ["src", "href", "data"];
30
31
// Tag names taken from the MathJax tex2jax skipTags config; text nodes that have
// one of these tags as an ancestor are not converted to math.
const MATH_SKIP_TAGS = new Set<string>(MathJaxConfig.tex2jax.skipTags);
32
33
/**
 * Build the options object handed to the `xss` sanitizer.
 *
 * @param urlTransform - optional function called as
 *   `urlTransform(value, tag, name)` for attributes listed in URL_TAGS; when
 *   it returns null/undefined the original attribute value is kept.
 * @returns the sanitizer options (this implementation always returns an object).
 */
function getXSSOptions(urlTransform): IFilterXSSOptions | undefined {
  // SECURITY: no script tags are whitelisted. An earlier attempt to allow
  // mathjax script tags through the sanitizer led to a vulnerability:
  //   https://github.com/sagemathinc/cocalc/security/advisories/GHSA-8w44-hggw-p5rf
  // so that whitelisting was removed completely.
  //
  // iframes would be needed for 3d graphics, but they are not safe without a
  // lot more work; due to
  //   https://github.com/sagemathinc/cocalc/security/advisories/GHSA-jpjc-pwjv-j9mg
  // the former attribute list ["src", "srcdoc", "width", "height"] was replaced
  // by an empty one, so no iframe attribute is permitted.
  const allowedTags = {
    ...whiteList,
    iframe: [],
    html: [],
  };

  const sanitizeAttribute = (
    tag: string,
    name: string,
    value: string,
  ): string => {
    // (Passing iframe srcdoc through unmodified used to be special-cased here;
    // that was disabled since it is not sufficiently secure.)
    const useTransform = urlTransform && URL_TAGS.includes(name);
    if (!useTransform) {
      // Not a URL attribute (or no transform given): builtin sanitizer.
      return safeAttrValue(tag, name, value, false as any);
    }
    // NOTE(review): on this path the value is NOT passed through the builtin
    // safeAttrValue -- confirm that urlTransform results are trusted.
    return urlTransform(value, tag, name) ?? value;
  };

  return {
    // stripIgnoreTagBody - completely get rid of dangerous HTML, since showing
    // nothing is better than showing weird mangled style code.
    stripIgnoreTagBody: true,
    whiteList: allowedTags,
    safeAttrValue: sanitizeAttribute,
  };
}
72
73
/**
 * Render an HTML string as React elements.
 *
 * Unless the file context sets `noSanitize`, the string is first sanitized
 * with `xss`. Text nodes are rendered through the math component (the one from
 * the file context if provided, otherwise the default), except inside tags
 * listed in MATH_SKIP_TAGS.
 *
 * @param value - the HTML source to render
 * @param style - optional style applied to the wrapping element
 * @param inline - if true wrap the result in a <span>, otherwise in a <div>
 */
export default function HTML({
  value,
  style,
  inline,
}: {
  value: string;
  style?: React.CSSProperties;
  inline?: boolean;
}) {
  const { urlTransform, AnchorTagComponent, noSanitize, MathComponent } =
    useFileContext();
  if (!noSanitize) {
    value = stripXSS(value, getXSSOptions(urlTransform));
  }
  // Sage output formulas are wrapped in "<html>", and if we don't strip this
  // then htmlReactParser just deletes the whole document, since html is not a
  // valid tag inside the DOM.
  value = stripHtmlWrapper(value);

  // Render math with the component from the file context when there is one.
  const renderMath = (data: string) => {
    if (MathComponent != null) {
      return <MathComponent data={data} />;
    }
    return <DefaultMath data={data} />;
  };

  const options: any = {};
  options.replace = (domNode) => {
    // Without this check, html input containing a malformed tag makes all of
    // React crash. See
    // https://github.com/remarkablemark/html-react-parser/issues/60#issuecomment-398588573
    // Nodes with no name pass the check, because RegExp.test coerces
    // undefined to the string "undefined", which matches.
    if (!/^[a-zA-Z]+[0-9]?$/.test(domNode.name)) {
      return React.createElement(React.Fragment);
    }
    if (domNode instanceof Text) {
      if (hasAncestor(domNode, MATH_SKIP_TAGS)) {
        // Do NOT convert Text to math inside a pre/code tree environment.
        return;
      }
      return renderMath(decodeHTML(domNode.data));
    }

    if (!(domNode instanceof Element)) return;

    const { name, children, attribs } = domNode;

    if (name == "script") {
      // <script type="math/tex"> is rendered as inline math ($...$), or as
      // display math ($$...$$) when the type mentions "display".
      const type = domNode.attribs?.type?.toLowerCase();
      if (type?.startsWith("math/tex")) {
        const child = domNode.children?.[0];
        if (child instanceof Text && child.data) {
          let data = "$" + decodeHTML(child.data) + "$";
          if (type.includes("display")) {
            data = "$" + data + "$";
          }
          return renderMath(data);
        }
      }
    }

    if (AnchorTagComponent != null && name == "a") {
      return (
        <AnchorTagComponent {...attribs}>
          {domToReact(children, options)}
        </AnchorTagComponent>
      );
    }
    if (name == "iframe") {
      // We sandbox and minimize what we allow.  Don't
      // use {...attribs} due to srcDoc vs srcdoc.
      // We don't allow setting the style, since that leads
      // to a lot of attacks (i.e., making the iframe move in a
      // sneaky way).  We have to allow-same-origin or scripts
      // won't work at all, which is one of the main uses for
      // iframes.  A good test is 3d graphics in Sage kernel
      // Jupyter notebooks.
      // TODO: Except this is a security issue, since
      // combining allow-scripts & allow-same-origin makes it
      // possible to remove a lot of sandboxing.
      return (
        <iframe
          src={attribs.src}
          srcDoc={attribs.srcdoc}
          width={attribs.width}
          height={attribs.height}
          sandbox="allow-forms allow-scripts allow-same-origin"
        />
      );
    }

    if (noSanitize && urlTransform != null && attribs != null) {
      // since we did not sanitize the HTML (which also does urlTransform),
      // we have to do the urlTransform here instead.
      // Only the first URL attribute that gets transformed is rewritten.
      for (const tag of URL_TAGS) {
        if (attribs[tag] != null) {
          const x = urlTransform(attribs[tag]);
          if (x != null) {
            const props = attributesToProps(attribs);
            props[tag] = x;
            return React.createElement(
              name,
              props,
              children && children?.length > 0
                ? domToReact(children, options)
                : undefined,
            );
          }
        }
      }
    }
  };
  if (inline) {
    return <span style={style}>{htmlReactParser(value, options)}</span>;
  } else {
    return <div style={style}>{htmlReactParser(value, options)}</div>;
  }
}

/**
 * Remove a literal "<html>" ... "</html>" wrapper from `value`.
 *
 * If `value` (ignoring surrounding whitespace) does not start with "<html>"
 * it is returned unchanged. Otherwise the trimmed string is returned without
 * the opening tag, and without the closing "</html>" only when it is actually
 * there -- so input lacking the closing tag does not lose its last
 * characters.
 */
function stripHtmlWrapper(value: string): string {
  const openTag = "<html>";
  const closeTag = "</html>";
  const trimmed = value.trim();
  if (!trimmed.startsWith(openTag)) {
    return value;
  }
  const end = trimmed.endsWith(closeTag)
    ? trimmed.length - closeTag.length
    : trimmed.length;
  return trimmed.slice(openTag.length, end);
}
197
198
/**
 * Report whether any ancestor of `domNode` is an Element whose name is in
 * `tags`. The node itself is not examined, and the walk up the parent chain
 * stops at the first parent that is not an Element.
 */
function hasAncestor(domNode, tags: Set<string>): boolean {
  let ancestor = domNode.parent;
  while (ancestor instanceof Element) {
    if (tags.has(ancestor.name)) {
      return true;
    }
    ancestor = ancestor.parent;
  }
  return false;
}
204
205