CoCalc -- html-ssr.tsx

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/packages/frontend/components/html-ssr.tsx
Views: ⁶⁸⁷
/*
React component for rendering an HTML string.

- suitable for server side rendering (e.g., nextjs)
- parses and displays math using KaTeX
- sanitizes the HTML for XSS attacks, etc., so it is safe to display to users
- optionally transforms links

TODO: This should eventually completely replace ./html.tsx:
- syntax highlighting
- searching
- opens links in a new tab, or makes clicking anchor tags run a function
  instead of opening a new tab, so internal cocalc links can open inside cocalc.
*/
15

16
import React from "react";
17
import htmlReactParser, {
18
  attributesToProps,
19
  domToReact,
20
} from "html-react-parser";
21
import { Element, Text } from "domhandler";
22
import stripXSS, { safeAttrValue, whiteList } from "xss";
23
import type { IFilterXSSOptions } from "xss";
24
import { useFileContext } from "@cocalc/frontend/lib/file-context";
25
import DefaultMath from "@cocalc/frontend/components/math/ssr";
26
import { MathJaxConfig } from "@cocalc/util/mathjax-config";
27
import { decodeHTML } from "entities";
28

29
// Attribute names (despite the "TAGS" in the identifier) whose values are
// treated as URLs and handed to the optional urlTransform function.
const URL_TAGS = ["src", "href", "data"];
30

31
// Tag names taken from the shared MathJax configuration; text that has one of
// these tags as an ancestor is left alone instead of being rendered as math.
const MATH_SKIP_TAGS = new Set<string>(MathJaxConfig.tex2jax.skipTags);
32

33
/**
 * Build the options object handed to the `xss` sanitizer.
 *
 * @param urlTransform optional function called as
 *   `urlTransform(value, tag, name)` for every attribute whose name is listed
 *   in URL_TAGS; when it returns null/undefined the original value is kept.
 * @returns the sanitizer options (this function always returns an object).
 */
function getXSSOptions(urlTransform): IFilterXSSOptions | undefined {
  // SECURITY: we once tried to explicitly allow mathjax script tags in
  // sanitized html by whitelisting and scanning.  That did not work properly
  // (perhaps due to some update) and resulted in a security vulnerability:
  //    https://github.com/sagemathinc/cocalc/security/advisories/GHSA-8w44-hggw-p5rf
  // The fix is completely removing any whitelisting of script tags.  MathJax
  // in html is not important enough to support, and too dangerous -- even if
  // it worked, it would probably be an easy attack vector by just making up
  // fake mathjax.
  //
  // Iframes would be needed to support 3d graphics, but allowing their
  // attributes is not safe without a lot more work, so every iframe
  // attribute is stripped.  See
  //    https://github.com/sagemathinc/cocalc/security/advisories/GHSA-jpjc-pwjv-j9mg
  // Previously this was: iframe: ["src", "srcdoc", "width", "height"].
  const allowedTags: IFilterXSSOptions["whiteList"] = {
    ...whiteList,
    iframe: [],
    html: [],
  };

  // Decide what value an attribute ends up with after sanitizing.
  // (Passing iframe srcdoc through unmangled used to be special-cased here;
  // that was removed since it was not sufficiently secure.)
  const filterAttrValue = (
    tag: string,
    name: string,
    value: string,
  ): string => {
    const isUrlAttribute = URL_TAGS.includes(name);
    if (!urlTransform || !isUrlAttribute) {
      // No transform applies: fall back to the builtin version.
      return safeAttrValue(tag, name, value, false as any);
    }
    // NOTE(review): on this path the builtin safeAttrValue is NOT applied, so
    // the transformed value (or the raw value when the transform returns
    // null/undefined) is emitted as-is.  Confirm that every urlTransform
    // implementation copes with hostile values such as "javascript:" URLs.
    return urlTransform(value, tag, name) ?? value;
  };

  return {
    // Completely get rid of the body of dangerous tags; otherwise the user
    // sees weird mangled style code, when seeing nothing would be better.
    stripIgnoreTagBody: true,
    whiteList: allowedTags,
    safeAttrValue: filterAttrValue,
  };
}
72

73
// Tag names that are handed on to React: ASCII letters optionally followed by
// a single digit (e.g. "div", "h1").
const VALID_TAG_NAME = /^[a-zA-Z]+[0-9]?$/;

const HTML_OPEN_TAG = "<html>";
const HTML_CLOSE_TAG = "</html>";

/**
 * Remove a literal "<html>" ... "</html>" wrapper from around `value`.
 *
 * Sage output formulas are wrapped in "<html>" (a design choice dating back
 * to "wiki" formatting in Sage notebooks around 2006).  If the wrapper is not
 * stripped, htmlReactParser deletes the whole document, since html is not a
 * valid tag inside the DOM.  The match is deliberately strict (exact literal
 * tags only) so it is not applied when it shouldn't be.
 *
 * The closing tag is only removed when it is actually present, so content
 * that merely starts with "<html>" does not lose its last characters.
 */
function stripHtmlWrapper(value: string): string {
  if (!value.trimStart().startsWith(HTML_OPEN_TAG)) {
    return value;
  }
  const trimmed = value.trim();
  const end = trimmed.endsWith(HTML_CLOSE_TAG)
    ? trimmed.length - HTML_CLOSE_TAG.length
    : trimmed.length;
  return trimmed.slice(HTML_OPEN_TAG.length, end);
}

/**
 * Render an HTML string as React elements.
 *
 * Unless the file context sets `noSanitize`, the string is first sanitized
 * with `xss` (which also applies the context's `urlTransform`).  Text nodes
 * are rendered through the context's `MathComponent` (or the default math
 * component), except inside tags listed in MATH_SKIP_TAGS.
 *
 * @param value  the HTML source to render
 * @param style  optional CSS applied to the wrapping element
 * @param inline wrap the result in a <span> instead of a <div>
 */
export default function HTML({
  value,
  style,
  inline,
}: {
  value: string;
  style?: React.CSSProperties;
  inline?: boolean;
}) {
  const { urlTransform, AnchorTagComponent, noSanitize, MathComponent } =
    useFileContext();
  if (!noSanitize) {
    value = stripXSS(value, getXSSOptions(urlTransform));
  }
  value = stripHtmlWrapper(value);

  // Render a math-containing string with the context's component if one is
  // provided, otherwise with the default one.
  const renderMath = (data: string) =>
    MathComponent != null ? (
      <MathComponent data={data} />
    ) : (
      <DefaultMath data={data} />
    );

  const options: any = {};
  options.replace = (domNode) => {
    // Without this check, html input containing a malformed tag makes all of
    // React completely crash.  We prefer not to always fully sanitize input,
    // since that can do a lot we don't want and may be expensive.  See
    //   https://github.com/remarkablemark/html-react-parser/issues/60#issuecomment-398588573
    // Nodes without a name (e.g. text) are not subject to this check.
    if (
      typeof domNode.name === "string" &&
      !VALID_TAG_NAME.test(domNode.name)
    ) {
      return React.createElement(React.Fragment);
    }

    if (domNode instanceof Text) {
      if (hasAncestor(domNode, MATH_SKIP_TAGS)) {
        // Do NOT convert Text to math inside a pre/code tree environment.
        return;
      }
      return renderMath(decodeHTML(domNode.data));
    }

    if (!(domNode instanceof Element)) return;

    const { name, children, attribs } = domNode;

    if (name == "script") {
      // MathJax-style <script type="math/tex"> blocks: wrap the TeX source in
      // $...$ (or $$...$$ for display mode) and render it as math.
      const type = attribs?.type?.toLowerCase();
      if (type?.startsWith("math/tex")) {
        const child = children?.[0];
        if (child instanceof Text && child.data) {
          const delimiter = type.includes("display") ? "$$" : "$";
          return renderMath(delimiter + decodeHTML(child.data) + delimiter);
        }
      }
    }

    if (AnchorTagComponent != null && name == "a") {
      return (
        <AnchorTagComponent {...attribs}>
          {domToReact(children, options)}
        </AnchorTagComponent>
      );
    }

    if (name == "iframe") {
      // We sandbox and minimize what we allow.  Don't
      // use {...attribs} due to srcDoc vs srcdoc.
      // We don't allow setting the style, since that leads
      // to a lot of attacks (i.e., making the iframe move in a
      // sneaky way).  We have to allow-same-origin or scripts
      // won't work at all, which is one of the main uses for
      // iframes.  A good test is 3d graphics in Sage kernel
      // Jupyter notebooks.
      // TODO: Except this is a security issue, since
      // combining allow-scripts & allow-same-origin makes it
      // possible to remove a lot of sandboxing.
      return (
        <iframe
          src={attribs.src}
          srcDoc={attribs.srcdoc}
          width={attribs.width}
          height={attribs.height}
          sandbox="allow-forms allow-scripts allow-same-origin"
        />
      );
    }

    if (noSanitize && urlTransform != null && attribs != null) {
      // Since we did not sanitize the HTML (which also does urlTransform),
      // we have to do the urlTransform here instead.  Every URL attribute the
      // transform changes is rewritten, not just the first one.
      let props: ReturnType<typeof attributesToProps> | undefined;
      for (const attr of URL_TAGS) {
        const url = attribs[attr];
        if (url == null) continue;
        const transformed = urlTransform(url);
        if (transformed == null) continue;
        if (props == null) {
          props = attributesToProps(attribs);
        }
        props[attr] = transformed;
      }
      if (props != null) {
        return React.createElement(
          name,
          props,
          children != null && children.length > 0
            ? domToReact(children, options)
            : undefined,
        );
      }
    }
  };

  const content = htmlReactParser(value, options);
  if (inline) {
    return <span style={style}>{content}</span>;
  }
  return <div style={style}>{content}</div>;
}
197

198
/**
 * Report whether any ancestor of `domNode` is an Element whose tag name is in
 * `tags`.  The walk follows `parent` links upward and stops at the first
 * parent that is not an Element.
 */
function hasAncestor(domNode, tags: Set<string>): boolean {
  let ancestor = domNode.parent;
  while (ancestor instanceof Element) {
    if (tags.has(ancestor.name)) {
      return true;
    }
    ancestor = ancestor.parent;
  }
  return false;
}
204

205
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

Product

Resources

Company

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more, all in one place.

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.