-
Notifications
You must be signed in to change notification settings - Fork 25
/
Copy pathjupyter-zeppelin.py
185 lines (157 loc) · 5.52 KB
/
jupyter-zeppelin.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
import os, sys
import re
import csv
import json
import html
import nbformat
import codecs
from aws.s3 import S3
from StringIO import StringIO
# Paragraph-prefix patterns. Each one requires a whitespace character right
# after the magic name and is applied with .match(), i.e. at the start of text.

# '%md' prefix: markdown paragraphs.
MD = re.compile(r'%md\s')
# '%sql' prefix: SQL paragraphs.
SQL = re.compile(r'%sql\s')
# Any '%name' prefix; convert_parsed consults it only after MD, SQL and HTML fail.
UNKNOWN_MAGIC = re.compile(r'%\w+\s')
# '%html' prefix: used for paragraphs and for individual TABLE cells.
HTML = re.compile(r'%html\s')
def read_io(path):
    """Loads a note from a local file or an S3 object into a StringIO.

    Paths that start with "s3://" are read through an S3 client created with
    env='prod'; any other path is opened as a local file. The buffer is
    rewound to the beginning before it is returned.
    """
    buf = StringIO()
    if not path.startswith("s3://"):
        with open(path) as source:
            for text in source:
                buf.write(text)
    else:
        client = S3(env='prod')
        for text in client.read(path):
            # a newline is appended after every line returned by the client
            buf.write(text)
            buf.write("\n")
    buf.seek(0)
    return buf
def table_cell_to_html(cell):
    """Renders one cell of a Zeppelin TABLE as an HTML fragment.

    A cell that starts with the %html magic is returned unchanged; any other
    cell is HTML-escaped.
    """
    is_raw_html = HTML.match(cell)
    # %html cells already hold markup, so they are passed through as-is
    return cell if is_raw_html else html.escape(cell)
def table_to_html(tsv):
    """Formats the tab-separated content of a Zeppelin TABLE as HTML.

    The first row of tsv supplies the column headers and every following row
    becomes one table row. Header names are HTML-escaped; data cells are
    formatted with table_cell_to_html. Empty input produces a table with an
    empty header row instead of raising.

    Returns the HTML as a single newline-joined string.
    """
    # StringIO is imported as a class at the top of the file, so it is called
    # directly; StringIO.StringIO would raise AttributeError.
    reader = csv.reader(StringIO(tsv), delimiter="\t")
    # The next() builtin works on Python 2 and 3; the default covers empty input.
    fields = next(reader, [])
    column_headers = "".join(
        "<th>" + html.escape(name) + "</th>" for name in fields
    )
    lines = [
        "<table>",
        "<tr>{column_headers}</tr>".format(column_headers=column_headers),
    ]
    for row in reader:
        cells = "".join(
            "<td>" + table_cell_to_html(cell) + "</td>" for cell in row
        )
        lines.append("<tr>" + cells + "</tr>")
    lines.append("</table>")
    return "\n".join(lines)
def convert_json(zeppelin_json):
    """Decodes a Zeppelin note from a JSON stream and converts it.

    zeppelin_json is a readable file-like object; the decoded note is handed
    to convert_parsed and its return value is passed back unchanged.
    """
    parsed_note = json.load(zeppelin_json)
    return convert_parsed(parsed_note)
def convert_parsed(zeppelin_note):
    """Converts a Zeppelin note from parsed JSON to a Jupyter NotebookNode.

    Paragraphs without text are skipped. Every other paragraph becomes one cell:
      * %md paragraphs become markdown cells with the magic removed
      * %sql and %html paragraphs become code cells using the %% cell magic
      * any other %magic becomes a raw text/plain cell
      * everything else becomes a plain code cell
    A code cell whose paragraph carries a result with code SUCCESS gets one
    execute_result output built from the result message (TEXT, HTML or TABLE).

    Returns a (notebook_name, notebook) tuple.
    """
    notebook_name = zeppelin_note['name']
    cells = []
    index = 0
    for paragraph in zeppelin_note['paragraphs']:
        code = paragraph.get('text')
        if not code:
            continue

        code = code.lstrip()
        cell = {}
        if MD.match(code):
            cell['cell_type'] = 'markdown'
            cell['metadata'] = {}
            # Slice the prefix off: str.lstrip('%md') strips a character set,
            # not a prefix, and was only correct because MD guarantees that
            # whitespace follows the magic.
            cell['source'] = code[len('%md'):].lstrip("\n")
        elif SQL.match(code) or HTML.match(code):
            cell['cell_type'] = 'code'
            cell['execution_count'] = index
            cell['metadata'] = {}
            cell['outputs'] = []
            cell['source'] = '%' + code  # add % to convert to cell magic
        elif UNKNOWN_MAGIC.match(code):
            # use raw cells for unknown magic
            cell['cell_type'] = 'raw'
            cell['metadata'] = {'format': 'text/plain'}
            cell['source'] = code
        else:
            cell['cell_type'] = 'code'
            cell['execution_count'] = index
            cell['metadata'] = {'autoscroll': 'auto'}
            cell['outputs'] = []
            cell['source'] = code

        # The dict is appended before outputs are attached; the mutation
        # below is still visible through the list.
        cells.append(cell)

        result = paragraph.get('result')
        # .get('code') keeps a result without a status from raising KeyError.
        if cell['cell_type'] == 'code' and result and result.get('code') == 'SUCCESS':
            result_type = result.get('type')
            output_by_mime_type = {}
            if result_type == 'TEXT':
                output_by_mime_type['text/plain'] = result['msg']
            elif result_type == 'HTML':
                output_by_mime_type['text/html'] = result['msg']
            elif result_type == 'TABLE':
                output_by_mime_type['text/html'] = table_to_html(result['msg'])

            # Other result types still produce an output, with empty data.
            cell['outputs'] = [{
                'output_type': 'execute_result',
                'metadata': {},
                'execution_count': index,
                'data': output_by_mime_type,
            }]

        # Counted once per emitted cell, whatever its type.
        index += 1

    notebook = nbformat.from_dict({
        "metadata": {
            "kernelspec": {
                "display_name": "Spark 2.0.0 - Scala 2.11",
                "language": "scala",
                "name": "spark2-scala",
            },
            "language_info": {
                "codemirror_mode": "text/x-scala",
                "file_extension": ".scala",
                "mimetype": "text/x-scala",
                "name": "scala",
                "pygments_lexer": "scala",
                "version": "2.11.8",
            },
        },
        "nbformat": 4,
        "nbformat_minor": 2,
        "cells": cells,
    })

    return (notebook_name, notebook)
def write_notebook(notebook_name, notebook, path=None):
    """Writes a NotebookNode to a file and returns the file name used.

    If path is given, the notebook is written to exactly that path. If path is
    None (or empty), the file is created in the current directory as
    '<notebook_name>.ipynb'; when that name is taken, '<notebook_name> (1).ipynb'
    through '<notebook_name> (1000).ipynb' are tried in order.

    Raises RuntimeError when every candidate name already exists.
    """
    filename = path
    if not filename:
        filename = notebook_name + '.ipynb'
        if os.path.exists(filename):
            # range() excludes its stop value, so 1001 is needed to reach
            # version 1000 as promised by the error message.
            for i in range(1, 1001):
                filename = notebook_name + ' (' + str(i) + ').ipynb'
                if not os.path.exists(filename):
                    break
            else:
                # The loop ended without finding a free name.
                raise RuntimeError('Cannot write %s: versions 1-1000 already exist.' % (notebook_name,))

    with codecs.open(filename, 'w', encoding='UTF-8') as out:
        nbformat.write(notebook, out)

    return filename
if __name__ == '__main__':
    # Command line: <script> ZEPPELIN_NOTE [TARGET_PATH]
    num_args = len(sys.argv)

    zeppelin_note_path = None
    target_path = None
    # The note path is the first argument whether or not a target follows;
    # previously it was left unset when a target path was supplied.
    if num_args in (2, 3):
        zeppelin_note_path = sys.argv[1]
    if num_args == 3:
        target_path = sys.argv[2]

    if not zeppelin_note_path:
        # Report the problem and exit non-zero instead of stopping silently.
        sys.stderr.write('Usage: %s ZEPPELIN_NOTE [TARGET_PATH]\n' % (sys.argv[0],))
        sys.exit(1)

    name, content = convert_json(read_io(zeppelin_note_path))
    write_notebook(name, content, target_path)