Path: blob/master/src/packages/jupyter/ipynb/import-from-ipynb.ts
1447 views
/*
 *  This file is part of CoCalc: Copyright © 2020 Sagemath, Inc.
 *  License: MS-RSL – see LICENSE.md for details
 */

/*
Importing from an ipynb object (in-memory version of .ipynb file)
*/

import * as misc from "@cocalc/util/misc";
import { JUPYTER_MIMETYPES } from "@cocalc/jupyter/util/misc";

// Template used by _sanity_improvements to fill in whichever top-level
// pieces (cells, metadata, nbformat, nbformat_minor) are missing from the
// ipynb object being imported.
const DEFAULT_IPYNB = {
  cells: [
    {
      cell_type: "code",
      execution_count: null,
      metadata: {},
      outputs: [],
      source: [],
    },
  ],
  metadata: {
    kernelspec: undefined,
    language_info: undefined,
  },
  nbformat: 4,
  nbformat_minor: 4,
};

// Membership view of JUPYTER_MIMETYPES.  JUPYTER_MIMETYPES is iterated with
// for..of below (so it is an iterable of mime type strings); indexing it by
// a mime type string does not test membership, hence this set.
const KNOWN_MIMETYPES: Set<string> = new Set<string>(JUPYTER_MIMETYPES);

/**
 * Replace every own enumerable property of `mesg` by a single `data` field.
 * The object is mutated in place so that existing references to it see the
 * converted message.
 */
function replace_with_data(mesg: any, data: any): void {
  for (const k in mesg) {
    delete mesg[k];
  }
  mesg.data = data;
}

/**
 * Converts an in-memory ipynb object into the cell/kernel/metadata
 * representation used internally.
 *
 * Usage: call `import(opts)` once, read the results with `cells()`,
 * `kernel()` and `metadata()`, then call `close()` to drop all references.
 */
export class IPynbImporter {
  private _ipynb: any;
  private _new_id: any;
  private _output_handler: any;
  private _existing_ids: any;
  private _cells: any;
  private _kernel: any;
  private _metadata: any;
  private _language_info: any;

  /**
   * Import the given ipynb object.
   *
   * @param opts.ipynb - the notebook object; it is deep copied, so the
   *   caller's object is not modified.
   * @param opts.new_id - optional function that returns an unused id given
   *   an is_available function; new_id(is_available) = a new id.
   * @param opts.existing_ids - ids to re-use (by cell position) on loading,
   *   for efficiency purposes.
   * @param opts.output_handler - optional; h = output_handler(cell), then
   *   h.message(...) is called for each output message and h.done() (if it
   *   is a function) once all messages of the cell were delivered.
   */
  import = (opts: any) => {
    opts = misc.defaults(opts, {
      ipynb: {},
      new_id: undefined, // function that returns an unused id given
      // an is_available function; new_id(is_available) = a new id.
      existing_ids: [], // re-use these on loading for efficiency purposes
      output_handler: undefined, // h = output_handler(cell); h.message(...) -- hard to explain
    }); // process attachments: attachment(base64, mime) --> sha1

    this._ipynb = misc.deep_copy(opts.ipynb);
    this._new_id = opts.new_id;
    this._output_handler = opts.output_handler;
    this._existing_ids = opts.existing_ids; // option to re-use existing ids

    // must come before sanity checks, as old versions are "insane".
    // -- see https://github.com/sagemathinc/cocalc/issues/1937
    this._handle_old_versions();
    this._sanity_improvements();
    this._import_settings();
    this._import_metadata();
    this._read_in_cells();
  };

  /** @returns the imported cells, as a map from cell id to cell object. */
  cells = () => {
    return this._cells;
  };

  /** @returns the lower-cased kernelspec name, if the notebook has one. */
  kernel = () => {
    return this._kernel;
  };

  /**
   * @returns the notebook level metadata without the kernelspec entry;
   *   stays unset when there is nothing besides kernelspec.
   */
  metadata = () => {
    return this._metadata;
  };

  /** Drop all state held by this importer. */
  close = () => {
    delete this._cells;
    delete this._kernel;
    delete this._metadata;
    delete this._language_info;
    delete this._ipynb;
    delete this._existing_ids;
    delete this._new_id;
    delete this._output_handler;
  };

  // Everything below is the internal private implementation.

  /**
   * Do some basic easy sanity improvements to the ipynb object,
   * in case parts of the object are missing.
   */
  private _sanity_improvements = () => {
    const ipynb = this._ipynb;
    if (ipynb.cells == null || ipynb.cells.length === 0) {
      ipynb.cells = misc.deep_copy(DEFAULT_IPYNB.cells);
    }
    if (ipynb.metadata == null) {
      ipynb.metadata = misc.deep_copy(DEFAULT_IPYNB.metadata);
    }
    if (ipynb.nbformat == null) {
      ipynb.nbformat = DEFAULT_IPYNB.nbformat;
    }
    if (ipynb.nbformat_minor == null) {
      ipynb.nbformat_minor = DEFAULT_IPYNB.nbformat_minor;
    }
  };

  /**
   * Update the ipynb file from formats before version 4: the cells of all
   * worksheets are appended to ipynb.cells, `input` is renamed to `source`,
   * heading cells become markdown cells and "pyout" output messages are
   * rewritten to carry a `data` field.
   * There are other changes made when parsing cells.
   */
  private _handle_old_versions = () => {
    const ipynb = this._ipynb;
    if (ipynb.nbformat >= 4) {
      return;
    }
    if (ipynb.cells == null) {
      ipynb.cells = [];
    }
    for (const worksheet of ipynb.worksheets || []) {
      for (const cell of worksheet.cells || []) {
        if (cell.input != null) {
          cell.source = cell.input;
          delete cell.input;
        }
        if (cell.cell_type === "heading") {
          cell.cell_type = "markdown";
          if (misc.is_array(cell.source)) {
            cell.source = cell.source.join("");
          }
          // Old heading cells carry their depth in `level`; fall back to a
          // top level heading when it is missing or not a valid depth.
          const level =
            Number.isInteger(cell.level) && cell.level >= 1 && cell.level <= 6
              ? cell.level
              : 1;
          cell.source = `${"#".repeat(level)} ${cell.source}`;
        }
        for (const mesg of cell.outputs || []) {
          if (mesg.output_type !== "pyout") {
            continue;
          }
          // Old messages store the payload under the mime subtype
          // (the part after the "/"); at most one of them is converted.
          for (const type of JUPYTER_MIMETYPES) {
            const subtype = type.split("/")[1];
            if (mesg[subtype] != null) {
              replace_with_data(mesg, { [type]: mesg[subtype] });
              break;
            }
          }
          // Only still present if no mime subtype field matched above,
          // since a successful conversion removed all other fields.
          if (mesg.text != null) {
            // multi-line strings may be either a string or a list of strings
            const text = misc.is_array(mesg.text)
              ? mesg.text.join("")
              : mesg.text;
            replace_with_data(mesg, { "text/plain": text });
          }
        }
        ipynb.cells.push(cell);
      }
    }
  };

  /** Read the kernel name from metadata.kernelspec.name, if present. */
  _import_settings = () => {
    const ipynb = this._ipynb;
    this._kernel =
      ipynb &&
      ipynb.metadata &&
      ipynb.metadata.kernelspec &&
      ipynb.metadata.kernelspec.name;
    if (this._kernel != null) {
      // kernel names are supposed to be case insensitive
      // https://jupyter-client.readthedocs.io/en/latest/kernels.html
      // We also make them all lower case when reading them in at
      // src/packages/jupyter/kernel/kernel-data.ts
      this._kernel = this._kernel.toLowerCase();
    }
  };

  /**
   * Copy the notebook level metadata, except for kernelspec (the kernel
   * name is read separately by _import_settings).  Nothing is stored when
   * no other key is present.
   */
  _import_metadata = () => {
    const m = this._ipynb?.metadata;
    if (m == null) {
      return;
    }
    const metadata: any = {};
    for (const k in m) {
      if (k !== "kernelspec") {
        metadata[k] = m[k];
      }
    }
    if (misc.len(metadata) > 0) {
      this._metadata = metadata;
    }
  };

  /** Import all cells of the notebook into this._cells, keyed by cell id. */
  _read_in_cells = () => {
    this._cells = {};
    const cells = this._ipynb?.cells;
    if (cells == null) {
      // nothing to do
      return;
    }
    let n = 0;
    for (const raw of cells) {
      const cell = this._import_cell(raw, n);
      this._cells[cell.id] = cell;
      n += 1;
    }
  };

  /**
   * Rewrite deprecated fields of an output message of a notebook with
   * nbformat < 4; messages of newer notebooks are returned unchanged.
   *
   * Stream messages are modified in place.  For any other message a NEW
   * object may be returned, so callers must use the return value.
   */
  _update_output_format = (content: any) => {
    if (this._ipynb?.nbformat >= 4) {
      return content;
    }
    // fix old deprecated fields
    if (content.output_type === "stream") {
      if (misc.is_array(content.text)) {
        content.text = content.text.join("");
      }
      content.name = content.stream;
      return content;
    }
    for (const t of JUPYTER_MIMETYPES) {
      const subtype = t.split("/")[1];
      if (content[subtype] != null) {
        content = { data: { [t]: content[subtype] } };
        break; // at most one data per message.
      }
    }
    // NOTE: if the loop above replaced content, the new object has no text
    // field, so this branch only applies when no mime subtype matched.
    if (content.text != null) {
      content = {
        data: { "text/plain": content.text },
        output_type: "stream",
      };
    }
    return content;
  };

  /**
   * Replace every array valued property of obj by the concatenation of its
   * entries (in place).
   *
   * @returns obj itself (possibly null/undefined).
   */
  _join_array_strings_obj = (obj: any) => {
    if (obj != null) {
      for (const key in obj) {
        const val = obj[key];
        if (misc.is_array(val)) {
          obj[key] = val.join("");
        }
      }
    }
    return obj;
  };

  /**
   * Mutate content to be of the format we use internally.
   *
   * NOTE(review): for notebooks with nbformat < 4, _update_output_format
   * may return a new object; the remaining steps are then applied to that
   * new object, which is not visible to the caller since nothing is
   * returned here.  Behavior is intentionally left as is -- confirm whether
   * the converted object should be propagated to the caller.
   */
  _import_cell_output_content = (content: any): void => {
    content = this._update_output_format(content); // old versions
    this._join_array_strings_obj(content.data); // arrays --> strings
    if (misc.is_array(content.text)) {
      content.text = content.text.join("");
    }
    remove_redundant_reps(content.data); // multiple output formats
    delete content.prompt_number; // redundant; in some files
  };

  /**
   * @returns true if id is neither used by an already imported cell nor
   *   contained in the existing ids passed to import.
   */
  _id_is_available = (id: any) => {
    if (this._cells?.[id]) {
      return false;
    }
    return !(this._existing_ids ?? []).includes(id);
  };

  /**
   * Pick an id for the given cell: the id stored in the ipynb file if it is
   * still available, otherwise one from the new_id function passed to
   * import, otherwise the smallest non-negative integer (as a string) that
   * is available.
   */
  _get_new_id = (cell: any) => {
    if (cell?.id && this._id_is_available(cell.id)) {
      // attempt to use id in the ipynb file
      return cell.id;
    }
    if (this._new_id != null) {
      return this._new_id(this._id_is_available);
    }
    for (let i = 0; ; i += 1) {
      const candidate = `${i}`;
      if (this._id_is_available(candidate)) {
        return candidate;
      }
    }
  };

  /**
   * @returns execution_count if given, else prompt_number (the field that
   *   is read from the cell as fallback), else null.
   */
  _get_exec_count = (execution_count?: number, prompt_number?: number) => {
    return execution_count ?? prompt_number ?? null;
  };

  /** @returns cell_type, defaulting to "code". */
  _get_cell_type = (cell_type?: string) => {
    return cell_type ?? "code";
  };

  /**
   * Convert the outputs of a cell.
   *
   * @param outputs - the output messages of the cell.
   * @param alt_outputs - optional replacements, looked up by the same key as
   *   outputs (taken from cell.metadata.cocalc.outputs by the caller).
   * @param id - id of the cell the outputs belong to.
   * @returns null if there are no outputs; otherwise the `output` object of
   *   a fresh `{ id, output }` cell.  Without an output handler it maps each
   *   key of outputs to the converted message; with a handler the messages
   *   are passed to handler.message instead and nothing is stored here.
   */
  _get_cell_output = (outputs: any, alt_outputs: any, id: any) => {
    if (outputs == null || outputs.length === 0) {
      return null;
    }
    const cell: any = { id, output: {} };
    const handler: any =
      this._output_handler != null ? this._output_handler(cell) : undefined;
    // The keys are the indices as strings ("0", "1", ...) for an array.
    for (const k of Object.keys(outputs)) {
      const content = alt_outputs?.[k] ?? outputs[k];
      this._import_cell_output_content(content);
      if (handler != null) {
        handler.message(content);
      } else {
        cell.output[k] = content;
      }
    }
    if (handler != null && typeof handler.done === "function") {
      handler.done();
    }
    return cell.output;
  };

  /**
   * @returns the source of a cell as a single string (or as given if it is
   *   not an array); null if there is no source.
   */
  _get_cell_input(source: any) {
    if (source == null) {
      return null;
    }
    // "If you intend to work with notebook files directly, you must allow multi-line
    // string fields to be either a string or list of strings."
    // https://nbformat.readthedocs.io/en/latest/format_description.html#top-level-structure
    return misc.is_array(source) ? source.join("") : source;
  }

  /**
   * Convert one ipynb cell into the internal cell object.
   *
   * @param cell - the cell as found in ipynb.cells.
   * @param n - position of the cell; also the index used to look up an id
   *   to re-use in the existing ids passed to import.
   */
  _import_cell(cell: any, n: any) {
    const id = this._existing_ids?.[n] ?? this._get_new_id(cell);
    const obj: any = {
      type: "cell",
      id,
      pos: n,
      input: this._get_cell_input(cell.source),
      output: this._get_cell_output(
        cell.outputs,
        cell.metadata?.cocalc?.outputs,
        id,
      ),
      cell_type: this._get_cell_type(cell.cell_type),
      exec_count: this._get_exec_count(
        cell.execution_count,
        cell.prompt_number,
      ),
    };

    const cell_metadata = cell.metadata;
    if (cell_metadata != null) {
      // these flags are only recorded when they are set
      for (const k of ["collapsed", "scrolled"]) {
        if (cell_metadata[k]) {
          obj[k] = true;
        }
      }

      if (cell_metadata.slideshow != null) {
        obj.slide = cell_metadata.slideshow.slide_type;
      }

      if (cell_metadata.tags != null) {
        obj.tags = misc.dict(
          cell_metadata.tags.map((tag: string) => [tag, true]),
        );
      }
      const other = misc.copy_without(cell_metadata, [
        "collapsed",
        "scrolled",
        "slideshow",
        "tags",
        "_root",
        "__ownerID",
        "__hash",
        "__altered",
      ]);
      // See https://github.com/sagemathinc/cocalc/issues/3191 for
      // why the _'d ones above; this is to fix "corrupted" worksheets.
      if (misc.len(other) > 0) {
        obj.metadata = other;
      }
    }
    if (cell.attachments != null) {
      obj.attachments = {};
      for (const name in cell.attachments) {
        const val = cell.attachments[name];
        // NOTE(review): if an attachment has several mime entries, only the
        // last one iterated is kept, and the mime type itself is not stored.
        for (const mime in val) {
          const base64 = val[mime];
          obj.attachments[name] = { type: "base64", value: base64 };
        }
      }
    }
    return obj;
  }
}

/**
 * Remove redundant representations from the `data` field of an output
 * message (in place).
 *
 * The first type of JUPYTER_MIMETYPES that is present in data is kept; all
 * other types that are in JUPYTER_MIMETYPES are deleted.
 *
 * @param data - map from mime type to representation; may be null/undefined.
 * @returns data itself, or undefined if data is null/undefined.
 */
export function remove_redundant_reps(data?: any) {
  if (data == null) {
    return;
  }
  // We only keep the first representation in types, since it provides the richest
  // representation in the client; there is no need for the others.
  // TODO: probably we should still store all of these types somewhere (in the
  // backend only) for the .ipynb export, but I'm not doing that right now!
  // This means opening and closing an ipynb file may lose information, which
  // no client currently cares about (?) -- maybe nbconvert does.
  let keep: string | undefined;
  for (const type of JUPYTER_MIMETYPES) {
    if (data[type] != null) {
      keep = type;
      break;
    }
  }
  if (keep == null) {
    return data;
  }
  for (const type of Object.keys(data)) {
    // NOTE: we only remove multiple reps that are both in JUPYTER_MIMETYPES;
    // if there is another rep that is NOT in JUPYTER_MIMETYPES, then it is
    // not removed, e.g., application/vnd.jupyter.widget-view+json and
    // text/plain both are types of representation of a widget.
    if (type !== keep && KNOWN_MIMETYPES.has(type)) {
      delete data[type];
    }
  }
  return data;
}