lsst-sqre · athornton · Sep 23, 2024 · Sep 13, 2024 · Sep 6, 2024 · Sep 9, 2024
diff --git a/client/src/rubin/nublado/client/_util.py b/client/src/rubin/nublado/client/_util.py
@@ -1,14 +1,22 @@
 """Utility functions for Nublado client."""
 
 import json
+from enum import Enum
 
 __all__ = [
     "normalize_source",
-    "normalize_source_by_cell",
-    "extract_source_by_cell",
+    "source_string_by_cell",
+    "source_list_by_cell",
+    "notebook_to_disk_form",
+    "notebook_to_api_form",
 ]
 
 
+class NotebookForm(Enum):  # how a cell's "source" is represented
+    DISK = "disk"  # list of strings, as in the notebook file on disk
+    API = "api"  # single string, as returned by the Contents API
+
+
 def normalize_source(notebook: str) -> str:
     """Extract and concatenate all the source cells from a notebook.
 
@@ -41,13 +49,13 @@ def normalize_source(notebook: str) -> str:
     return "\n".join(
         [
             x.rstrip("\n")
-            for x in normalize_source_by_cell(notebook)
+            for x in source_string_by_cell(notebook)
             if x.rstrip("\n")
         ]
     )
 
 
-def normalize_source_by_cell(notebook: str) -> list[str]:
+def source_string_by_cell(notebook: str) -> list[str]:
     """Extract each cell source to a single string.
 
     Parameters
@@ -61,35 +69,101 @@ def normalize_source_by_cell(notebook: str) -> list[str]:
        A list of all non-empty source lines in a cell as a single Python
     string.  Each cell's source lines (with newline as the line separator) will
     be a separate item of the returned list.
+
+    Notes
+    -----
+    This is what the contents API returns, although the text of the notebook
+    on disk will have each source line as its own entry within a list of
+    strings.  So we will convert it to API form first and then return the
+    source item from each cell.
     """
+    notebook = notebook_to_api_form(notebook)
+    obj = json.loads(notebook)
     return [
-        "\n".join(
-            [
-                x.rstrip("\n")
-                for y in extract_source_by_cell(notebook)
-                for x in y
-                if x.rstrip("\n")
-            ]
-        )
+        x["source"]
+        for x in obj["cells"]
+        if x["cell_type"] == "code" and "source" in x and x["source"]
     ]
 
 
-def extract_source_by_cell(notebook: str) -> list[list[str]]:
-    """Extract all non-empty "code" cells' "source" lines as a list of strings.
+def source_list_by_cell(notebook: str) -> list[list[str]]:
+    """Extract all non-empty "code" cells' "source" entry as a list of strings.
 
     Parameters
     ----------
     notebook
-        The text of the notebook file.
+        The notebook text, or the results of the Contents API.
 
     Returns
     -------
-    list[list[str]]
-       Source lines
+    list[list[str]]
+       One list of source strings per non-empty code cell.
+
+    Notes
+    -----
+    In the notebook, "source" is a list of strings.  In the Contents API, it's
+    a single string.  So we will convert the notebook to disk form, and return
+    the list of lists.
     """
+    notebook = notebook_to_disk_form(notebook)
     obj = json.loads(notebook)
     return [
         x["source"]
         for x in obj["cells"]
-        if x["cell_type"] == "code" and x["source"]
+        if x["cell_type"] == "code" and "source" in x and x["source"]
     ]
+
+
+def notebook_to_disk_form(notebook: str) -> str:  # "source" -> list[str]
+    return _transform_notebook(notebook, NotebookForm.DISK)
+
+
+def notebook_to_api_form(notebook: str) -> str:  # "source" -> single str
+    return _transform_notebook(notebook, NotebookForm.API)
+
+
+def _transform_notebook(notebook: str, form: NotebookForm) -> str:
+    """Return notebook JSON with each code cell's source in the given form."""
+    obj = json.loads(notebook)
+    cells = obj["cells"]
+    for cell in cells:
+        if cell["cell_type"] != "code":
+            continue
+        if "source" not in cell or not cell["source"]:
+            continue
+        src = cell["source"]
+        if (isinstance(src, str) and form == NotebookForm.API) or (
+            isinstance(src, list) and form == NotebookForm.DISK
+        ):
+            # Already in the correct form
+            continue
+        if form == NotebookForm.API:
+            # Turn source into a newline-separated string
+            cell["source"] = _list_to_string(src)
+            continue
+        # If we got this far, we need to turn the source into a list, where
+        # all items but the last are meant to end in a single newline.
+        cell["source"] = _string_to_list(src)
+    return json.dumps(obj)
+
+
+def _list_to_string(src: list[str]) -> str:
+    """Join source lines into one newline-separated string.
+
+    Trailing newlines are stripped from each line and empty lines dropped.
+    """
+    stripped = (line.rstrip("\n") for line in src)
+    return "\n".join(line for line in stripped if line)
+
+
+def _string_to_list(src: str) -> list[str]:
+    """Split a source string into disk-form lines.
+
+    Blank lines are dropped; every line but the last ends in a newline.
+    """
+    # split("\n") pieces never contain a newline, so no rstrip is needed.
+    copy_list = [f"{line}\n" for line in src.split("\n") if line]
+    if copy_list:
+        # str.rstrip returns a new string, so the result must be stored.
+        copy_list[-1] = copy_list[-1].rstrip("\n")
+    return copy_list
diff --git a/client/src/rubin/nublado/client/nubladoclient.py b/client/src/rubin/nublado/client/nubladoclient.py
@@ -30,7 +30,7 @@
 from websockets.exceptions import WebSocketException
 
 from ._constants import WEBSOCKET_OPEN_TIMEOUT
-from ._util import extract_source_by_cell
+from ._util import source_list_by_cell
 from .exceptions import (
     CodeExecutionError,
     ExecutionAPIError,
@@ -444,7 +444,7 @@ async def run_notebook(self, notebook: Path) -> list[str]:
         self._logger.debug(f"Getting content from {url}")
         resp = await self._client.get(url)
         notebook = resp.json()["content"]
-        sources = extract_source_by_cell(json.dumps(notebook))
+        sources = source_list_by_cell(json.dumps(notebook))
         self._logger.debug(f"Content: {sources}")
         retlist: list[str] = []
         for cellsrc in sources:

diff --git a/client/src/rubin/nublado/client/testing/_jupyter.py b/client/src/rubin/nublado/client/testing/_jupyter.py
@@ -24,7 +24,7 @@
 from httpx import Request, Response
 from safir.datetime import current_datetime
 
-from .._util import normalize_source
+from .._util import normalize_source, notebook_to_api_form
 from ..models import NotebookExecutionResult
 
 
@@ -373,6 +373,10 @@ def get_content(self, request: Request) -> Response:
 
         This is only enough to provide for the NubladoClient's run_notebook
         functionality.  We don't even use a real timestamp.
+
+        Irritatingly, the real Contents API represents the source of each
+        cell as a single string, while a notebook on disk represents it as
+        a list of strings, so we do need to simulate that.
         """
         user = request.headers.get("X-Auth-Request-User", None)
         if user is None:
@@ -385,7 +389,7 @@ def get_content(self, request: Request) -> Response:
         path = str(request.url)[len(contents_url) :]
         try:
             filename = self._user_dir / path
-            content = json.loads(filename.read_text())
+            content = json.loads(notebook_to_api_form(filename.read_text()))
             fn = filename.name
             tstamp = "2024-09-12T17:55:05.077220Z"
             resp_obj = {