Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Path: blob/master/src/packages/jupyter/ipynb/import-from-ipynb.ts
Views: 687
/*
 *  This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
 *  License: MS-RSL – see LICENSE.md for details
 */

/*
Importing from an ipynb object (in-memory version of .ipynb file)
*/

import * as misc from "@cocalc/util/misc";
import { JUPYTER_MIMETYPES } from "@cocalc/jupyter/util/misc";

// Fallback values that _sanity_improvements copies into an ipynb object
// whose cells, metadata or format version numbers are missing.
const DEFAULT_IPYNB = {
  cells: [
    {
      cell_type: "code",
      execution_count: null,
      metadata: {},
      outputs: [],
      source: [],
    },
  ],
  metadata: {
    kernelspec: undefined,
    language_info: undefined,
  },
  nbformat: 4,
  nbformat_minor: 4,
};

// Remove every enumerable key of mesg, then store `data` as its only field.
function replace_with_data(mesg: any, data: object): void {
  for (const key in mesg) {
    delete mesg[key];
  }
  mesg.data = data;
}

/**
 * Imports an in-memory ipynb object.
 *
 * Call import(opts) once; afterwards cells() returns a map from cell id to
 * the imported cell object, kernel() the lower-cased kernelspec name (if any)
 * and metadata() the notebook metadata without its kernelspec entry.
 * close() drops every reference held by the importer.
 */
export class IPynbImporter {
  private _ipynb: any;
  private _new_id: any;
  private _output_handler: any;
  private _process_attachment: any;
  private _existing_ids: any;
  private _cells: any;
  private _kernel: any;
  private _metadata: any;
  // Never assigned in this class; it is only deleted again in close().
  private _language_info: any;

  /**
   * Run the whole import.  Recognised options (see the defaults below):
   * ipynb, new_id, existing_ids, output_handler and process_attachment.
   * The ipynb object is deep-copied first, so the caller's object is not
   * modified.
   */
  import = (opts: any) => {
    opts = misc.defaults(opts, {
      ipynb: {},
      new_id: undefined, // function that returns an unused id given
      // an is_available function; new_id(is_available) = a new id.
      existing_ids: [], // re-use these on loading for efficiency purposes
      output_handler: undefined, // h = output_handler(cell); h.message(...) -- hard to explain
      // process attachments: attachment(base64, mime) --> sha1
      process_attachment: undefined,
    });

    this._ipynb = misc.deep_copy(opts.ipynb);
    this._new_id = opts.new_id;
    this._output_handler = opts.output_handler;
    this._process_attachment = opts.process_attachment;
    this._existing_ids = opts.existing_ids; // option to re-use existing ids

    // must come before sanity checks, as old versions are "insane".
    // -- see https://github.com/sagemathinc/cocalc/issues/1937
    this._handle_old_versions();
    this._sanity_improvements();
    this._import_settings();
    this._import_metadata();
    this._read_in_cells();
  };

  /** Map from cell id to imported cell, as built by the last import(). */
  cells = () => {
    return this._cells;
  };

  /** Lower-cased kernelspec name read from the notebook metadata, if any. */
  kernel = () => {
    return this._kernel;
  };

  /** Notebook metadata minus "kernelspec"; unset when nothing else was present. */
  metadata = () => {
    return this._metadata;
  };

  /** Drop every reference held by this importer. */
  close = () => {
    delete this._cells;
    delete this._kernel;
    delete this._metadata;
    delete this._language_info;
    delete this._ipynb;
    delete this._existing_ids;
    delete this._new_id;
    delete this._output_handler;
    delete this._process_attachment;
  };

  // Everything below is the internal private implementation.

  private _sanity_improvements = () => {
    // Do some basic easy sanity improvements to the ipynb object,
    // in case parts of the object are missing.
    const ipynb = this._ipynb;
    if (ipynb.cells == null || ipynb.cells.length === 0) {
      ipynb.cells = misc.deep_copy(DEFAULT_IPYNB.cells);
    }
    if (ipynb.metadata == null) {
      ipynb.metadata = misc.deep_copy(DEFAULT_IPYNB.metadata);
    }
    if (ipynb.nbformat == null) {
      ipynb.nbformat = DEFAULT_IPYNB.nbformat;
    }
    if (ipynb.nbformat_minor == null) {
      ipynb.nbformat_minor = DEFAULT_IPYNB.nbformat_minor;
    }
  };

  private _handle_old_versions = () => {
    // Update the ipynb file from formats before version 4.
    // There are other changes made when parsing cells.
    const ipynb = this._ipynb;
    if (ipynb.nbformat >= 4) {
      return;
    }
    if (ipynb.cells == null) {
      ipynb.cells = [];
    }
    // Old files keep their cells inside worksheets; flatten them into
    // ipynb.cells while renaming/reshaping deprecated fields.
    for (const worksheet of ipynb.worksheets || []) {
      for (const cell of worksheet.cells || []) {
        if (cell.input != null) {
          cell.source = cell.input;
          delete cell.input;
        }
        if (cell.cell_type === "heading") {
          cell.cell_type = "markdown";
          if (misc.is_array(cell.source)) {
            cell.source = cell.source.join("");
          }
          cell.source = `# ${cell.source}`;
        }
        if (cell.outputs) {
          for (const mesg of cell.outputs) {
            if (mesg.output_type !== "pyout") {
              continue;
            }
            // A field named after the subtype of a known mimetype
            // (e.g. "png" for "image/png") becomes the message's data.
            for (const type of JUPYTER_MIMETYPES) {
              const subtype = type.split("/")[1];
              if (mesg[subtype] != null) {
                replace_with_data(mesg, { [type]: mesg[subtype] });
                break;
              }
            }
            // Only reached with text still present when no mimetype field
            // matched above (otherwise all fields were just removed).
            if (mesg.text != null) {
              // Multi-line strings may be stored either as a string or as a
              // list of strings, so only join when it really is a list.
              const text = misc.is_array(mesg.text)
                ? mesg.text.join("")
                : mesg.text;
              replace_with_data(mesg, { "text/plain": text });
            }
          }
        }
        ipynb.cells.push(cell);
      }
    }
  };

  // Read the kernel name out of metadata.kernelspec.name.
  _import_settings = () => {
    this._kernel = this._ipynb?.metadata?.kernelspec?.name;
    if (this._kernel != null) {
      // kernel names are supposed to be case insensitive
      // https://jupyter-client.readthedocs.io/en/latest/kernels.html
      // We also make them all lower case when reading them in at
      // src/packages/jupyter/kernel/kernel-data.ts
      this._kernel = this._kernel.toLowerCase();
    }
  };

  // Copy all notebook metadata except "kernelspec" into this._metadata;
  // this._metadata is left untouched when nothing remains to copy.
  _import_metadata = () => {
    const source = this._ipynb?.metadata;
    if (source == null) {
      return;
    }
    const metadata: any = {};
    for (const key in source) {
      if (key !== "kernelspec") {
        metadata[key] = source[key];
      }
    }
    if (misc.len(metadata) > 0) {
      this._metadata = metadata;
    }
  };

  // Import every cell of the notebook, in order, into this._cells (keyed by id).
  _read_in_cells = () => {
    const ipynb = this._ipynb;
    this._cells = {};
    if (ipynb?.cells == null) {
      // nothing to do
      return;
    }
    let pos = 0;
    for (const raw of ipynb.cells) {
      const cell = this._import_cell(raw, pos);
      this._cells[cell.id] = cell;
      pos += 1;
    }
  };

  // For notebooks older than nbformat 4, rewrite deprecated output fields.
  // Stream messages are modified in place; for other messages a NEW object
  // may be returned instead of the one passed in.
  _update_output_format = (content: any) => {
    if (this._ipynb?.nbformat >= 4) {
      return content;
    }
    // fix old deprecated fields
    if (content.output_type === "stream") {
      if (misc.is_array(content.text)) {
        content.text = content.text.join("");
      }
      content.name = content.stream;
    } else {
      for (const type of JUPYTER_MIMETYPES) {
        const subtype = type.split("/")[1];
        if (content[subtype] != null) {
          content = { data: { [type]: content[subtype] } };
          break; // at most one data per message.
        }
      }
      if (content.text != null) {
        content = {
          data: { "text/plain": content.text },
          output_type: "stream",
        };
      }
    }
    return content;
  };

  // Replace every array-valued field of obj by the concatenation of its
  // entries; obj is modified in place and also returned.
  _join_array_strings_obj = (obj: any) => {
    if (obj != null) {
      for (const key in obj) {
        const val = obj[key];
        if (misc.is_array(val)) {
          obj[key] = val.join("");
        }
      }
    }
    return obj;
  };

  // Mutate content to be of the format we use internally
  //
  // NOTE(review): when _update_output_format returns a new object (old
  // notebooks, non-stream output), the steps below are applied to that new
  // object and the object the caller passed in is left unchanged.  Nothing is
  // returned, so the caller never sees the converted message -- confirm
  // whether that is intended before relying on it.
  _import_cell_output_content = (content: any): void => {
    content = this._update_output_format(content); // old versions
    this._join_array_strings_obj(content.data); // arrays --> strings
    if (misc.is_array(content.text)) {
      content.text = content.text.join("");
    }
    remove_redundant_reps(content.data); // multiple output formats
    delete content.prompt_number; // redundant; in some files
  };

  // An id is available when no imported cell has it yet and it is not one
  // of the existing ids that were passed to import().
  _id_is_available = (id: any) => {
    const taken_by_cell = this._cells?.[id];
    const reserved = (this._existing_ids ?? []).includes(id);
    return !(taken_by_cell || reserved);
  };

  // Choose an id for a cell: the id stored in the ipynb file when it is
  // available, else whatever the new_id option produces, else the smallest
  // non-negative integer (as a string) that is available.
  _get_new_id = (cell: any) => {
    if (cell?.id && this._id_is_available(cell.id)) {
      // attempt to use id in the ipynb file
      return cell.id;
    }
    if (this._new_id != null) {
      return this._new_id(this._id_is_available);
    }
    for (let n = 0; ; n += 1) {
      const candidate = `${n}`;
      if (this._id_is_available(candidate)) {
        return candidate;
      }
    }
  };

  // execution_count wins over the older prompt_number; null when neither is set.
  _get_exec_count = (execution_count?: number, prompt_number?: number) => {
    return execution_count ?? prompt_number ?? null;
  };

  // Cells that do not state a type are treated as code cells.
  _get_cell_type = (cell_type?: string) => {
    return cell_type ?? "code";
  };

  // Convert a cell's outputs.  Returns null when there are none.  Each
  // message is either handed to the output handler (when the output_handler
  // option was given) or stored in the returned object under its index.
  // An entry of alt_outputs with the same index replaces the original message.
  _get_cell_output = (outputs: any, alt_outputs: any, id: any) => {
    if (outputs == null || outputs.length === 0) {
      return null;
    }
    let handler: any;
    const cell: any = { id, output: {} };
    if (this._output_handler != null) {
      handler = this._output_handler(cell);
    }
    let k: string; // it's perfectly fine that k is a string here.
    for (k in outputs) {
      const content = alt_outputs?.[k] ?? outputs[k];
      this._import_cell_output_content(content);
      if (handler != null) {
        handler.message(content);
      } else {
        cell.output[k] = content;
      }
    }
    if (handler != null && typeof handler.done === "function") {
      handler.done();
    }
    return cell.output;
  };

  // Return the cell source as a single string, or null when it is missing.
  _get_cell_input(source: any) {
    if (source == null) {
      return null;
    }
    // "If you intend to work with notebook files directly, you must allow multi-line
    // string fields to be either a string or list of strings."
    // https://nbformat.readthedocs.io/en/latest/format_description.html#top-level-structure
    return misc.is_array(source) ? source.join("") : source;
  }

  // Build the internal representation of the n-th cell of the notebook.
  _import_cell(cell: any, n: any) {
    // Prefer the n-th existing id (if one was supplied) over a fresh one.
    const id = this._existing_ids?.[n] ?? this._get_new_id(cell);
    const obj: any = {
      type: "cell",
      id,
      pos: n,
      input: this._get_cell_input(cell.source),
      output: this._get_cell_output(
        cell.outputs,
        cell.metadata?.cocalc?.outputs,
        id,
      ),
      cell_type: this._get_cell_type(cell.cell_type),
      exec_count: this._get_exec_count(
        cell.execution_count,
        cell.prompt_number,
      ),
    };

    if (cell.metadata != null) {
      // These flags are only recorded when they are truthy.
      for (const flag of ["collapsed", "scrolled"]) {
        if (cell.metadata[flag]) {
          obj[flag] = true;
        }
      }

      if (cell.metadata.slideshow != null) {
        obj.slide = cell.metadata.slideshow.slide_type;
      }

      if (cell.metadata.tags != null) {
        obj.tags = misc.dict(
          cell.metadata.tags.map((tag: string) => [tag, true]),
        );
      }
      const other = misc.copy_without(cell.metadata, [
        "collapsed",
        "scrolled",
        "slideshow",
        "tags",
        "_root",
        "__ownerID",
        "__hash",
        "__altered",
      ]);
      // See https://github.com/sagemathinc/cocalc/issues/3191 for
      // why the _'d ones above; this is to fix "corrupted" worksheets.
      if (misc.len(other) > 0) {
        obj.metadata = other;
      }
    }

    if (cell.attachments != null) {
      obj.attachments = {};
      for (const name in cell.attachments) {
        const by_mime = cell.attachments[name];
        // Each mimetype overwrites obj.attachments[name], so the last
        // mimetype listed for a name is the one that is kept.
        for (const mime in by_mime) {
          const base64 = by_mime[mime];
          if (this._process_attachment == null) {
            obj.attachments[name] = { type: "base64", value: base64 };
            continue;
          }
          try {
            const sha1 = this._process_attachment(base64, mime);
            obj.attachments[name] = { type: "sha1", value: sha1 };
          } catch (err) {
            // We put this in input, since actually attachments are
            // only for markdown cells (?), and they have no output.
            // Anyway, I'm mainly putting this here to debug this
            // and it should never fail when debugged.
            // Just to be clear again: this should never ever happen.
            const text = `\n${err.stack}\nCoCalc Bug -- ${err}\n`;
            obj.input = (obj.input ?? "") + text;
          }
        }
      }
    }
    return obj;
  }
}

/**
 * Keep only one of the representations listed in JUPYTER_MIMETYPES: the
 * first entry of JUPYTER_MIMETYPES that is present in data.  Representations
 * whose type is not in JUPYTER_MIMETYPES are never removed.
 *
 * data is modified in place and returned; nothing is returned when data is
 * null or undefined.
 */
export function remove_redundant_reps(data?: any) {
  if (data == null) {
    return;
  }
  // We only keep the first representation in types, since it provides the richest
  // representation in the client; there is no need for the others.
  // TODO: probably we should still store all of these types somewhere (in the
  // backend only) for the .ipynb export, but I'm not doing that right now!
  // This means opening and closing an ipynb file may lose information, which
  // no client currently cares about (?) -- maybe nbconvert does.
  let keep: string | undefined;
  for (const type of JUPYTER_MIMETYPES) {
    if (data[type] != null) {
      keep = type;
      break;
    }
  }
  if (keep == null) {
    return data;
  }
  // JUPYTER_MIMETYPES is a list of mimetype strings, so membership has to be
  // tested by value; looking a mimetype up as a property of the list always
  // yields undefined and would never remove anything.
  const known = new Set<string>(JUPYTER_MIMETYPES);
  for (const type of Object.keys(data)) {
    // NOTE: we only remove multiple reps that are both in JUPYTER_MIMETYPES;
    // if there is another rep that is NOT in JUPYTER_MIMETYPES, then it is
    // not removed, e.g., application/vnd.jupyter.widget-view+json and
    // text/plain both are types of representation of a widget.
    if (type !== keep && known.has(type)) {
      delete data[type];
    }
  }
  return data;
}