Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Path: blob/master/src/packages/frontend/components/html-ssr.tsx
Views: 687
/*
React component for rendering an HTML string.

- suitable for server side rendering (e.g., nextjs)
- parses and displays math using KaTeX
- sanitizes the HTML for XSS attacks, etc., so it is safe to display to users
- optionally transforms links

TODO: This should eventually completely replace ./html.tsx:
- syntax highlighting
- searching
- opens links in a new tab, or makes clicking an anchor tag run a function
  instead of opening a new tab, so internal cocalc links can open inside cocalc.
*/

import React from "react";
import htmlReactParser, {
  attributesToProps,
  domToReact,
} from "html-react-parser";
import { Element, Text } from "domhandler";
import stripXSS, { safeAttrValue, whiteList } from "xss";
import type { IFilterXSSOptions } from "xss";
import { useFileContext } from "@cocalc/frontend/lib/file-context";
import DefaultMath from "@cocalc/frontend/components/math/ssr";
import { MathJaxConfig } from "@cocalc/util/mathjax-config";
import { decodeHTML } from "entities";

// Attributes whose values are URLs and therefore go through urlTransform.
const URL_TAGS = ["src", "href", "data"];

// Text inside any of these tags is never handed to the math renderer.
const MATH_SKIP_TAGS = new Set<string>(MathJaxConfig.tex2jax.skipTags);

// Node names accepted by the parser callback: letters optionally followed by
// a single digit (e.g. "div", "h1").
const VALID_TAG_NAME = /^[a-zA-Z]+[0-9]?$/;

const HTML_OPEN = "<html>";
const HTML_CLOSE = "</html>";

/**
 * Build the options passed to the `xss` sanitizer.
 *
 * @param urlTransform optional function called as
 *   `urlTransform(value, tag, name)` for every attribute listed in URL_TAGS;
 *   when it returns null/undefined the original value is kept.
 * @returns the sanitizer options.
 */
function getXSSOptions(urlTransform): IFilterXSSOptions {
  // - stripIgnoreTagBody - completely get rid of dangerous HTML
  //   (otherwise user sees weird mangled style code, when seeing
  //   nothing would be better).
  // - whiteList - we need iframes to support 3d graphics; unfortunately this
  //   isn't safe without a lot more work, so we do NOT enable them.
  return {
    stripIgnoreTagBody: true,
    // SECURITY: whitelist note -- we had tried to explicitly allow mathjax script tags in sanitized html
    // by whitelisting and scanning. However, this didn't properly work (perhaps due to some update)
    // and resulted in a security vulnerability:
    //    https://github.com/sagemathinc/cocalc/security/advisories/GHSA-8w44-hggw-p5rf
    // The fix is completely removing any whitelisting of any script tags. The feature of
    // mathjax in html is not important enough to support, and too dangerous -- even if it worked,
    // it would probably be an easy attack vector by just making up fake mathjax.
    // Due to https://github.com/sagemathinc/cocalc/security/advisories/GHSA-jpjc-pwjv-j9mg
    // we also no longer allow any attributes on iframes (see the empty list below).
    whiteList: {
      ...whiteList,
      // DISABLED due to https://github.com/sagemathinc/cocalc/security/advisories/GHSA-jpjc-pwjv-j9mg
      // iframe: ["src", "srcdoc", "width", "height"],
      iframe: [],
      html: [],
    },
    safeAttrValue: (tag, name, value) => {
      // disabled since not sufficiently secure.
      // if (tag == "iframe" && name == "srcdoc") {
      //   // important not to mangle this or it won't work.
      //   return value;
      // }
      if (urlTransform && URL_TAGS.includes(name)) {
        // use the url transform
        return urlTransform(value, tag, name) ?? value;
      }
      // fallback to the builtin version
      return safeAttrValue(tag, name, value, false as any);
    },
  };
}

/**
 * Remove a single outer "<html>...</html>" wrapper from `value`.
 *
 * Sage output formulas are wrapped in "<html>" for some stupid reason, which
 * probably originates with a ridiculous design choice that Tom Boothby or I
 * made in 2006 related to "wiki" formatting in Sage notebooks.  If we don't strip
 * this, then htmlReactParser just deletes the whole document, since html is
 * not a valid tag inside the DOM.  We do this in a really minimally flexible way
 * to reduce the chances to 0 that we apply this when we shouldn't.
 *
 * The closing tag is only removed when it is actually present, so input that
 * starts with "<html>" but is not terminated by "</html>" does not lose its
 * last characters.
 *
 * @returns `value` unchanged when it does not start (after leading
 *   whitespace) with "<html>"; otherwise the trimmed content without the
 *   wrapper.
 */
function stripHtmlWrapper(value: string): string {
  const trimmed = value.trim();
  if (!trimmed.startsWith(HTML_OPEN)) {
    return value;
  }
  const inner = trimmed.slice(HTML_OPEN.length);
  return inner.endsWith(HTML_CLOSE)
    ? inner.slice(0, inner.length - HTML_CLOSE.length)
    : inner;
}

/**
 * Render an HTML string as React elements.
 *
 * Behavior is controlled by the surrounding file context:
 * - unless `noSanitize` is set, the HTML is first sanitized with `xss`
 *   (which also applies `urlTransform` to URL attributes);
 * - text nodes outside of MATH_SKIP_TAGS are rendered with `MathComponent`
 *   if given, otherwise with the default math component;
 * - anchors are rendered with `AnchorTagComponent` when it is provided.
 *
 * @param value  the HTML to render
 * @param style  style applied to the wrapping element
 * @param inline wrap the result in a `span` instead of a `div`
 */
export default function HTML({
  value,
  style,
  inline,
}: {
  value: string;
  style?: React.CSSProperties;
  inline?: boolean;
}) {
  const { urlTransform, AnchorTagComponent, noSanitize, MathComponent } =
    useFileContext();
  if (!noSanitize) {
    value = stripXSS(value, getXSSOptions(urlTransform));
  }
  value = stripHtmlWrapper(value);

  // Render math source with the context supplied component when available.
  const renderMath = (data: string) => {
    if (MathComponent != null) {
      return <MathComponent data={data} />;
    }
    return <DefaultMath data={data} />;
  };

  // `any` because the callback below refers back to `options` itself when
  // recursing via domToReact.
  const options: any = {};
  options.replace = (domNode) => {
    // console.log("domNode = ", domNode);
    const nodeName = domNode.name;
    if (nodeName != null && !VALID_TAG_NAME.test(nodeName)) {
      // Without this, if user gives html input that is a malformed tag then all of React
      // completely crashes, which is not desirable for us.  On the other hand, I prefer not
      // to always completely sanitize input, since that can do a lot we don't want to do
      // and may be expensive. See
      //   https://github.com/remarkablemark/html-react-parser/issues/60#issuecomment-398588573
      // Nodes without a name (e.g. text) are deliberately let through.
      return React.createElement(React.Fragment);
    }
    if (domNode instanceof Text) {
      if (hasAncestor(domNode, MATH_SKIP_TAGS)) {
        // Do NOT convert Text to math inside a pre/code tree environment.
        return;
      }
      return renderMath(decodeHTML(domNode.data));
    }

    if (!(domNode instanceof Element)) return;

    const { name, children, attribs } = domNode;

    if (name == "script") {
      // <script type="math/tex..."> holds TeX source: wrap it in $...$
      // (or $$...$$ when the type mentions "display") and render it as math.
      const type = attribs?.type?.toLowerCase();
      if (type?.startsWith("math/tex")) {
        const child = children?.[0];
        if (child instanceof Text && child.data) {
          const delimiter = type.includes("display") ? "$$" : "$";
          return renderMath(delimiter + decodeHTML(child.data) + delimiter);
        }
      }
    }

    if (AnchorTagComponent != null && name == "a") {
      return (
        <AnchorTagComponent {...attribs}>
          {domToReact(children, options)}
        </AnchorTagComponent>
      );
    }
    if (name == "iframe") {
      // We sandbox and minimize what we allow.  Don't
      // use {...attribs} due to srcDoc vs srcdoc.
      // We don't allow setting the style, since that leads
      // to a lot of attacks (i.e., making the iframe move in a
      // sneaky way).  We have to allow-same-origin or scripts
      // won't work at all, which is one of the main uses for
      // iframes.  A good test is 3d graphics in Sage kernel
      // Jupyter notebooks.
      // TODO: Except this is a security issue, since
      // combining allow-scripts & allow-same-origin makes it
      // possible to remove a lot of sandboxing.
      return (
        <iframe
          src={attribs.src}
          srcDoc={attribs.srcdoc}
          width={attribs.width}
          height={attribs.height}
          sandbox="allow-forms allow-scripts allow-same-origin"
        />
      );
    }

    if (noSanitize && urlTransform != null && attribs != null) {
      // since we did not sanitize the HTML (which also does urlTransform),
      // we have to do the urlTransform here instead.  Every URL attribute of
      // the element is transformed, matching what the sanitizer path does.
      // NOTE(review): the sanitizer path calls urlTransform(value, tag, name),
      // while only the url is passed here -- confirm whether the tag should be
      // forwarded as well.
      const transformed: { [attr: string]: string } = {};
      let changed = false;
      for (const attr of URL_TAGS) {
        const url = attribs[attr];
        if (url == null) continue;
        const newUrl = urlTransform(url);
        if (newUrl != null) {
          transformed[attr] = newUrl;
          changed = true;
        }
      }
      if (changed) {
        const props = { ...attributesToProps(attribs), ...transformed };
        return React.createElement(
          name,
          props,
          children != null && children.length > 0
            ? domToReact(children, options)
            : undefined,
        );
      }
    }
  };

  const rendered = htmlReactParser(value, options);
  if (inline) {
    return <span style={style}>{rendered}</span>;
  } else {
    return <div style={style}>{rendered}</div>;
  }
}

/**
 * Report whether any ancestor element of `domNode` has a name in `tags`.
 *
 * Walks up the `parent` chain and stops at the first parent that is not an
 * Element.
 */
function hasAncestor(domNode: { parent?: unknown }, tags: Set<string>): boolean {
  let parent = domNode.parent;
  while (parent instanceof Element) {
    if (tags.has(parent.name)) return true;
    parent = parent.parent;
  }
  return false;
}