diff --git a/tools/databricks/README.md b/tools/databricks/README.md index 984a579ac..7691e2508 100644 --- a/tools/databricks/README.md +++ b/tools/databricks/README.md @@ -20,4 +20,4 @@ top of the notebook. After that, select *Run all* to execute the tools for the 1. Multiple event logs must be comma-separated. - For example: `/dbfs/path/to/eventlog1,/dbfs/path/to/eventlog2` -**Latest Tools Version Supported** 24.06.1 \ No newline at end of file +**Latest Tools Version Supported** 24.08.0 \ No newline at end of file diff --git a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb index 5221ebd92..503b18ffb 100644 --- a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb +++ b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Profiling Tool Notebook Template.ipynb @@ -4,7 +4,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "df33c614-2ecc-47a0-8600-bc891681997f", "showTitle": false, @@ -50,7 +53,7 @@ }, "outputs": [], "source": [ - "TOOLS_VER = \"24.06.1\"\n", + "TOOLS_VER = \"24.08.0\"\n", "print(f\"Using Tools Version: {TOOLS_VER}\")" ] }, @@ -156,7 +159,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "f83af6c8-5a79-4a46-965b-38a4cb621877", "showTitle": false, @@ -380,7 +386,10 @@ "cell_type": "markdown", "metadata": { "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, "inputWidgets": {}, "nuid": "bbe50fde-0bd6-4281-95fd-6a1ec6f17ab2", "showTitle": false, @@ -455,7 +464,7 @@ "stack": true }, "nuid": "91c1bfb2-695a-4e5c-8a25-848a433108dc", - "origId": 1075819839476955, + "origId": 2173122769183713, "title": "Executive View", "version": "DashboardViewV1", "width": 1600 @@ -469,7 +478,7 @@ "stack": true }, "nuid": "62243296-4562-4f06-90ac-d7a609f19c16", - "origId": 1075819839476956, + "origId": 2173122769183714, "title": "App View", "version": "DashboardViewV1", "width": 1920 @@ -479,7 +488,7 @@ "language": "python", "notebookMetadata": { "mostRecentlyExecutedCommandWithImplicitDF": { - "commandId": 203373918309288, + "commandId": 2173122769183692, "dataframes": [ "_sqldf" ] @@ -507,11 +516,11 @@ "widgetInfo": { "widgetType": "text", "defaultValue": "/dbfs/user1/profiling_logs", - "label": null, + "label": "", "name": "Eventlog Path", "options": { "widgetType": "text", - "autoCreated": null, + "autoCreated": false, "validationRegex": null } } diff --git a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb index 4eab0c475..898a4846f 100644 --- a/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb +++ b/tools/databricks/[RAPIDS Accelerator for Apache Spark] Qualification Tool Notebook Template.ipynb @@ -49,7 +49,7 @@ }, "outputs": [], "source": [ - "TOOLS_VER = \"24.06.1\"\n", + "TOOLS_VER = \"24.08.0\"\n", "print(f\"Using Tools Version: {TOOLS_VER}\")" ] }, @@ -282,6 +282,7 @@ "\n", "try:\n", " output_folder, log_file_location = extract_file_info(CONSOLE_OUTPUT_PATH, OUTPUT_PATH)\n", + " jar_output_folder = os.path.join(output_folder, \"rapids_4_spark_qualification_output\")\n", " print(f\"Output folder detected {output_folder}\")\n", " copy_logs(output_folder, log_file_location, CONSOLE_OUTPUT_PATH, CONSOLE_ERROR_PATH)\n", " print(f\"Logs successfully copied to {output_folder}\")\n", @@ -424,9 +425,110 @@ "outputs": [], "source": [ "summary_output=pd.read_csv(os.path.join(output_folder, \"qualification_summary.csv\"))\n", + "summary_output=summary_output.drop(columns=[\"Unnamed: 0\"]).rename_axis('Index').reset_index()\n", "display(summary_output)" ] }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "73b5e0b0-3a96-4cc6-8e6c-840e4b0d9d43", + "showTitle": false, + "title": "" + } + }, + "source": [ + "\n", + "## Application Status\n", + "\n", + "The report show the status of each eventlog file that was provided\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c9ffbfdb-dbb6-4736-b9cb-2ac457cc6714", + "showTitle": true, + "title": "rapids_4_spark_qualification_output_status.csv" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "status_output=pd.read_csv(os.path.join(jar_output_folder, \"rapids_4_spark_qualification_output_status.csv\"))\n", + "display(status_output)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "09945d39-f9c2-4f4a-8afd-4f309f24f8e0", + "showTitle": false, + "title": "" + } + }, + "source": [ + "\n", + "## Metadata for Migration\n", + "\n", + "The report show the metadata of each app as:\n", + "- Recommended GPU cluster\n", + "- File location of full cluster config recommendations\n", + "- File location of only Gpu specific config recommendations\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "133cf1bd-33b6-4a62-9ae2-5505717092d1", + "showTitle": true, + "title": "app_metadata.json" + }, + "jupyter": { + "source_hidden": true + } + }, + "outputs": [], + "source": [ + "import json\n", + "metadata_file = os.path.join(output_folder, \"app_metadata.json\")\n", + "def camel_to_title(name):\n", + " return re.sub('([a-z])([A-Z])', r'\\1 \\2', name).title()\n", + " \n", + "with open(metadata_file, 'r') as file:\n", + " json_data = json.load(file)\n", + "\n", + "df = pd.DataFrame(json_data)\n", + "df['recommendedGpuCluster'] = df['clusterInfo'].apply(lambda x: x['recommendedCluster'])\n", + "df['sourceCluster'] = df['clusterInfo'].apply(lambda x: x['sourceCluster'])\n", + "df.drop(columns=['clusterInfo'], inplace=True)\n", + "df = df[['appId', 'appName', 'estimatedGpuSpeedupCategory', 'recommendedGpuCluster', 'fullClusterConfigRecommendations', 'gpuConfigRecommendationBreakdown']]\n", + "df.columns = [camel_to_title(col) for col in df.columns]\n", + "display(df)" + ] + }, { "cell_type": "markdown", "metadata": { @@ -474,7 +576,6 @@ }, "outputs": [], "source": [ - "jar_output_folder = os.path.join(output_folder, \"rapids_4_spark_qualification_output\")\n", "stages_output=pd.read_csv(os.path.join(jar_output_folder, \"rapids_4_spark_qualification_output_stages.csv\"))\n", "display(stages_output)" ] @@ -524,7 +625,7 @@ "inputWidgets": {}, "nuid": "998b0c51-0cb6-408e-a01a-d1f5b1a61e1f", "showTitle": true, - "title": "rapids_4_spark_qualification_output_execs" + "title": "rapids_4_spark_qualification_output_execs.csv" }, "jupyter": { "source_hidden": true @@ -549,7 +650,7 @@ "stack": true }, "nuid": "91c1bfb2-695a-4e5c-8a25-848a433108dc", - "origId": 1075819839476974, + "origId": 2173122769183715, "title": "Executive View", "version": "DashboardViewV1", "width": 1600 @@ -563,17 +664,31 @@ "stack": true }, "nuid": "62243296-4562-4f06-90ac-d7a609f19c16", - "origId": 1075819839476975, + "origId": 2173122769183716, "title": "App View", "version": "DashboardViewV1", "width": 1920 + }, + { + "elements": [], + "globalVars": {}, + "guid": "", + "layoutOption": { + "grid": true, + "stack": true + }, + "nuid": "854f9c75-5977-42aa-b3dd-c680b8331f19", + "origId": 2173122769183722, + "title": "Untitled", + "version": "DashboardViewV1", + "width": 1024 } ], "environmentMetadata": null, "language": "python", "notebookMetadata": { "mostRecentlyExecutedCommandWithImplicitDF": { - "commandId": 1075819839476965, + "commandId": 2173122769183704, "dataframes": [ "_sqldf" ]