From 81a249b225a89d2825ef1b67373e8bb0453a3558 Mon Sep 17 00:00:00 2001 From: Anup Kumar Date: Fri, 18 Mar 2022 10:31:04 +0100 Subject: [PATCH] refactor datasets --- ipython_script/analyse_wf_march_22.ipynb | 1822 ++++++++++++++++++++++ 1 file changed, 1822 insertions(+) create mode 100644 ipython_script/analyse_wf_march_22.ipynb diff --git a/ipython_script/analyse_wf_march_22.ipynb b/ipython_script/analyse_wf_march_22.ipynb new file mode 100644 index 0000000..9b97ad5 --- /dev/null +++ b/ipython_script/analyse_wf_march_22.ipynb @@ -0,0 +1,1822 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 46, + "id": "6f18f94d-2489-42d6-a9e7-172c8fadd39f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "ad85fbb3-57aa-49b5-a5d9-8f9d63321309", + "metadata": {}, + "outputs": [], + "source": [ + "wf_path = \"../data/workflow-connections_Feb_22_1.csv\"" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "0d460c01-3cde-4163-9764-220dc5db00ae", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/anupkumar/anaconda3/envs/tool_prediction_gru_wc/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3072: DtypeWarning: Columns (0,2,5) have mixed types.Specify dtype option on import or set low_memory=False.\n", + " interactivity=interactivity, compiler=compiler, result=result)\n" + ] + } + ], + "source": [ + "wf_frame = pd.read_csv(wf_path, sep=\"|\", header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "23807c0e-abaf-4496-99d6-bbfaa784d8f8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345678910
0wf_idwf_updatedin_id...in_tool_vout_id...out_tool_vpublisheddeletedhas_errors
1--------+------------+---------+--------------...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
232013-02-075Grep1 ...1.0.17Remove beginning1 ...1.0.0ftf
332013-02-076Cut1 ...1.0.18addValue ...1.0.0ftf
432013-02-077Remove beginning1 ...1.0.06Cut1 ...1.0.1ftf
....................................
24129411349212022-02-151.9924e+06...1.99235e+06toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom...0.38.1fff
24129421349212022-02-151.99240e+06...1.99235e+06toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom...0.38.1fff
24129431349212022-02-151.99241e+06...1.99235e+06toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom...0.38.1fff
24129441349212022-02-151.99241e+06...1.99236e+06toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/...2.4.2+galaxy0fff
2412945(2412943 rows)NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

2412946 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 \\\n", + "0 wf_id wf_updated \n", + "1 --------+------------+---------+--------------... NaN \n", + "2 3 2013-02-07 \n", + "3 3 2013-02-07 \n", + "4 3 2013-02-07 \n", + "... ... ... \n", + "2412941 134921 2022-02-15 \n", + "2412942 134921 2022-02-15 \n", + "2412943 134921 2022-02-15 \n", + "2412944 134921 2022-02-15 \n", + "2412945 (2412943 rows) NaN \n", + "\n", + " 2 3 \\\n", + "0 in_id ... \n", + "1 NaN NaN \n", + "2 5 Grep1 ... \n", + "3 6 Cut1 ... \n", + "4 7 Remove beginning1 ... \n", + "... ... ... \n", + "2412941 1.9924e+06 ... \n", + "2412942 1.99240e+06 ... \n", + "2412943 1.99241e+06 ... \n", + "2412944 1.99241e+06 ... \n", + "2412945 NaN NaN \n", + "\n", + " 4 5 \\\n", + "0 in_tool_v out_id \n", + "1 NaN NaN \n", + "2 1.0.1 7 \n", + "3 1.0.1 8 \n", + "4 1.0.0 6 \n", + "... ... ... \n", + "2412941 1.99235e+06 \n", + "2412942 1.99235e+06 \n", + "2412943 1.99235e+06 \n", + "2412944 1.99236e+06 \n", + "2412945 NaN NaN \n", + "\n", + " 6 \\\n", + "0 ... \n", + "1 NaN \n", + "2 Remove beginning1 ... \n", + "3 addValue ... \n", + "4 Cut1 ... \n", + "... ... \n", + "2412941 toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... \n", + "2412942 toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... \n", + "2412943 toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... \n", + "2412944 toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/... \n", + "2412945 NaN \n", + "\n", + " 7 8 \\\n", + "0 out_tool_v published \n", + "1 NaN NaN \n", + "2 1.0.0 f \n", + "3 1.0.0 f \n", + "4 1.0.1 f \n", + "... ... ... \n", + "2412941 0.38.1 f \n", + "2412942 0.38.1 f \n", + "2412943 0.38.1 f \n", + "2412944 2.4.2+galaxy0 f \n", + "2412945 NaN NaN \n", + "\n", + " 9 10 \n", + "0 deleted has_errors \n", + "1 NaN NaN \n", + "2 t f \n", + "3 t f \n", + "4 t f \n", + "... ... ... \n", + "2412941 f f \n", + "2412942 f f \n", + "2412943 f f \n", + "2412944 f f \n", + "2412945 NaN NaN \n", + "\n", + "[2412946 rows x 11 columns]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wf_frame" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "9dd25661-cab1-4a78-8570-cd5696b8bb38", + "metadata": {}, + "outputs": [], + "source": [ + "wf_frame_1 = wf_frame[1:len(wf_frame.index)-1]" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "30d18e50-8561-4d8b-869b-2d31ae0cf86a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345678910
232013-02-075Grep1 ...1.0.17Remove beginning1 ...1.0.0ftf
332013-02-076Cut1 ...1.0.18addValue ...1.0.0ftf
432013-02-077Remove beginning1 ...1.0.06Cut1 ...1.0.1ftf
532013-02-077Remove beginning1 ...1.0.09Cut1 ...1.0.1ftf
632013-02-078addValue ...1.0.011Paste1 ...1.0.0ftf
....................................
24129401349212022-02-151.9924e+06...1.99235e+06toolshed.g2.bx.psu.edu/repos/iuc/filter_tabul...2.0.0fff
24129411349212022-02-151.9924e+06...1.99235e+06toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom...0.38.1fff
24129421349212022-02-151.99240e+06...1.99235e+06toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom...0.38.1fff
24129431349212022-02-151.99241e+06...1.99235e+06toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom...0.38.1fff
24129441349212022-02-151.99241e+06...1.99236e+06toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/...2.4.2+galaxy0fff
\n", + "

2412943 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 \\\n", + "2 3 2013-02-07 5 \n", + "3 3 2013-02-07 6 \n", + "4 3 2013-02-07 7 \n", + "5 3 2013-02-07 7 \n", + "6 3 2013-02-07 8 \n", + "... ... ... ... \n", + "2412940 134921 2022-02-15 1.9924e+06 \n", + "2412941 134921 2022-02-15 1.9924e+06 \n", + "2412942 134921 2022-02-15 1.99240e+06 \n", + "2412943 134921 2022-02-15 1.99241e+06 \n", + "2412944 134921 2022-02-15 1.99241e+06 \n", + "\n", + " 3 \\\n", + "2 Grep1 ... \n", + "3 Cut1 ... \n", + "4 Remove beginning1 ... \n", + "5 Remove beginning1 ... \n", + "6 addValue ... \n", + "... ... \n", + "2412940 ... \n", + "2412941 ... \n", + "2412942 ... \n", + "2412943 ... \n", + "2412944 ... \n", + "\n", + " 4 5 \\\n", + "2 1.0.1 7 \n", + "3 1.0.1 8 \n", + "4 1.0.0 6 \n", + "5 1.0.0 9 \n", + "6 1.0.0 11 \n", + "... ... ... \n", + "2412940 1.99235e+06 \n", + "2412941 1.99235e+06 \n", + "2412942 1.99235e+06 \n", + "2412943 1.99235e+06 \n", + "2412944 1.99236e+06 \n", + "\n", + " 6 \\\n", + "2 Remove beginning1 ... \n", + "3 addValue ... \n", + "4 Cut1 ... \n", + "5 Cut1 ... \n", + "6 Paste1 ... \n", + "... ... \n", + "2412940 toolshed.g2.bx.psu.edu/repos/iuc/filter_tabul... \n", + "2412941 toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... \n", + "2412942 toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... \n", + "2412943 toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... \n", + "2412944 toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/... \n", + "\n", + " 7 8 \\\n", + "2 1.0.0 f \n", + "3 1.0.0 f \n", + "4 1.0.1 f \n", + "5 1.0.1 f \n", + "6 1.0.0 f \n", + "... ... ... \n", + "2412940 2.0.0 f \n", + "2412941 0.38.1 f \n", + "2412942 0.38.1 f \n", + "2412943 0.38.1 f \n", + "2412944 2.4.2+galaxy0 f \n", + "\n", + " 9 10 \n", + "2 t f \n", + "3 t f \n", + "4 t f \n", + "5 t f \n", + "6 t f \n", + "... ... .. \n", + "2412940 f f \n", + "2412941 f f \n", + "2412942 f f \n", + "2412943 f f \n", + "2412944 f f \n", + "\n", + "[2412943 rows x 11 columns]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wf_frame_1 = wf_frame_1[1:]\n", + "wf_frame_1" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "13182050-3366-4a3b-b121-5c3d4e8555a3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345678910
1910966724182020-12-041018284toolshed.g2.bx.psu.edu/repos/iuc/anndata_mani...0.6.22.post1+galaxy11018287toolshed.g2.bx.psu.edu/repos/iuc/scanpy_plot/...1.4.4.post1+galaxy1fff
45925919152015-01-1929099cat1 ...1.0.029099cat1 ...1.0.0ftf
108358919152015-01-1929099cat1 ...1.0.029099cat1 ...1.0.0ftf
25264219152015-01-1929099cat1 ...1.0.029099cat1 ...1.0.0ftf
68218872014-03-1415219...15230toolshed.g2.bx.psu.edu/repos/devteam/fastqc/f...ft
....................................
1592784217472018-12-18331828...331801toolshed.g2.bx.psu.edu/repos/iuc/stacks_procr...1.46.0fft
1590874216002018-12-17328839toolshed.g2.bx.psu.edu/repos/galaxyp/openms_m...2.2.0328835toolshed.g2.bx.psu.edu/repos/galaxyp/openms_p...2.2.0fff
147423654882016-07-2981677toolshed.g2.bx.psu.edu/repos/devteam/get_flan...1.0.081707toolshed.g2.bx.psu.edu/repos/devteam/join/gop...1.0.0ftf
1713787396942019-11-21597867toolshed.g2.bx.psu.edu/repos/devteam/bamtools...2.4.1597868toolshed.g2.bx.psu.edu/repos/iuc/umi_tools_co...0.5.3.2fff
21584101058602021-08-141556606...1556612toolshed.g2.bx.psu.edu/repos/devteam/fastqc/f...0.72+galaxy1ff
\n", + "

2412943 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 \\\n", + "1910966 72418 2020-12-04 1018284 \n", + "459259 1915 2015-01-19 29099 \n", + "1083589 1915 2015-01-19 29099 \n", + "252642 1915 2015-01-19 29099 \n", + "6821 887 2014-03-14 15219 \n", + "... ... ... ... \n", + "1592784 21747 2018-12-18 331828 \n", + "1590874 21600 2018-12-17 328839 \n", + "1474236 5488 2016-07-29 81677 \n", + "1713787 39694 2019-11-21 597867 \n", + "2158410 105860 2021-08-14 1556606 \n", + "\n", + " 3 \\\n", + "1910966 toolshed.g2.bx.psu.edu/repos/iuc/anndata_mani... \n", + "459259 cat1 ... \n", + "1083589 cat1 ... \n", + "252642 cat1 ... \n", + "6821 ... \n", + "... ... \n", + "1592784 ... \n", + "1590874 toolshed.g2.bx.psu.edu/repos/galaxyp/openms_m... \n", + "1474236 toolshed.g2.bx.psu.edu/repos/devteam/get_flan... \n", + "1713787 toolshed.g2.bx.psu.edu/repos/devteam/bamtools... \n", + "2158410 ... \n", + "\n", + " 4 5 \\\n", + "1910966 0.6.22.post1+galaxy1 1018287 \n", + "459259 1.0.0 29099 \n", + "1083589 1.0.0 29099 \n", + "252642 1.0.0 29099 \n", + "6821 15230 \n", + "... ... ... \n", + "1592784 331801 \n", + "1590874 2.2.0 328835 \n", + "1474236 1.0.0 81707 \n", + "1713787 2.4.1 597868 \n", + "2158410 1556612 \n", + "\n", + " 6 \\\n", + "1910966 toolshed.g2.bx.psu.edu/repos/iuc/scanpy_plot/... \n", + "459259 cat1 ... \n", + "1083589 cat1 ... \n", + "252642 cat1 ... \n", + "6821 toolshed.g2.bx.psu.edu/repos/devteam/fastqc/f... \n", + "... ... \n", + "1592784 toolshed.g2.bx.psu.edu/repos/iuc/stacks_procr... \n", + "1590874 toolshed.g2.bx.psu.edu/repos/galaxyp/openms_p... \n", + "1474236 toolshed.g2.bx.psu.edu/repos/devteam/join/gop... \n", + "1713787 toolshed.g2.bx.psu.edu/repos/iuc/umi_tools_co... \n", + "2158410 toolshed.g2.bx.psu.edu/repos/devteam/fastqc/f... \n", + "\n", + " 7 8 \\\n", + "1910966 1.4.4.post1+galaxy1 f \n", + "459259 1.0.0 f \n", + "1083589 1.0.0 f \n", + "252642 1.0.0 f \n", + "6821 f \n", + "... ... ... \n", + "1592784 1.46.0 f \n", + "1590874 2.2.0 f \n", + "1474236 1.0.0 f \n", + "1713787 0.5.3.2 f \n", + "2158410 0.72+galaxy1 f \n", + "\n", + " 9 10 \n", + "1910966 f f \n", + "459259 t f \n", + "1083589 t f \n", + "252642 t f \n", + "6821 t \n", + "... ... .. \n", + "1592784 f t \n", + "1590874 f f \n", + "1474236 t f \n", + "1713787 f f \n", + "2158410 f \n", + "\n", + "[2412943 rows x 11 columns]" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wf_frame_1 = wf_frame_1.sample(frac=1)\n", + "wf_frame_1" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "f1b96941-6fa4-4735-a292-4a19e42fe9f7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012345678910
45925919152015-01-1929099cat1 ...1.0.029099cat1 ...1.0.0ftf
108358919152015-01-1929099cat1 ...1.0.029099cat1 ...1.0.0ftf
25264219152015-01-1929099cat1 ...1.0.029099cat1 ...1.0.0ftf
68218872014-03-1415219...15230toolshed.g2.bx.psu.edu/repos/devteam/fastqc/f...ft
122911319152015-01-1929099cat1 ...1.0.029099cat1 ...1.0.0ftf
....................................
29363119152015-01-1929099cat1 ...1.0.029099cat1 ...1.0.0ftf
43123419152015-01-1929099cat1 ...1.0.029099cat1 ...1.0.0ftf
22991841255972022-01-271850173...1850165toolshed.g2.bx.psu.edu/repos/iuc/purge_dups/p...1.2.5+galaxy4ftf
1985563849312021-03-181187845addValue ...1.0.01187846toolshed.g2.bx.psu.edu/repos/bgruening/text_p...0.1.0ft
33102019152015-01-1929099cat1 ...1.0.029099cat1 ...1.0.0ftf
\n", + "

4999 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 \\\n", + "459259 1915 2015-01-19 29099 \n", + "1083589 1915 2015-01-19 29099 \n", + "252642 1915 2015-01-19 29099 \n", + "6821 887 2014-03-14 15219 \n", + "1229113 1915 2015-01-19 29099 \n", + "... ... ... ... \n", + "293631 1915 2015-01-19 29099 \n", + "431234 1915 2015-01-19 29099 \n", + "2299184 125597 2022-01-27 1850173 \n", + "1985563 84931 2021-03-18 1187845 \n", + "331020 1915 2015-01-19 29099 \n", + "\n", + " 3 \\\n", + "459259 cat1 ... \n", + "1083589 cat1 ... \n", + "252642 cat1 ... \n", + "6821 ... \n", + "1229113 cat1 ... \n", + "... ... \n", + "293631 cat1 ... \n", + "431234 cat1 ... \n", + "2299184 ... \n", + "1985563 addValue ... \n", + "331020 cat1 ... \n", + "\n", + " 4 5 \\\n", + "459259 1.0.0 29099 \n", + "1083589 1.0.0 29099 \n", + "252642 1.0.0 29099 \n", + "6821 15230 \n", + "1229113 1.0.0 29099 \n", + "... ... ... \n", + "293631 1.0.0 29099 \n", + "431234 1.0.0 29099 \n", + "2299184 1850165 \n", + "1985563 1.0.0 1187846 \n", + "331020 1.0.0 29099 \n", + "\n", + " 6 \\\n", + "459259 cat1 ... \n", + "1083589 cat1 ... \n", + "252642 cat1 ... \n", + "6821 toolshed.g2.bx.psu.edu/repos/devteam/fastqc/f... \n", + "1229113 cat1 ... \n", + "... ... \n", + "293631 cat1 ... \n", + "431234 cat1 ... \n", + "2299184 toolshed.g2.bx.psu.edu/repos/iuc/purge_dups/p... \n", + "1985563 toolshed.g2.bx.psu.edu/repos/bgruening/text_p... \n", + "331020 cat1 ... \n", + "\n", + " 7 8 \\\n", + "459259 1.0.0 f \n", + "1083589 1.0.0 f \n", + "252642 1.0.0 f \n", + "6821 f \n", + "1229113 1.0.0 f \n", + "... ... ... \n", + "293631 1.0.0 f \n", + "431234 1.0.0 f \n", + "2299184 1.2.5+galaxy4 f \n", + "1985563 0.1.0 f \n", + "331020 1.0.0 f \n", + "\n", + " 9 10 \n", + "459259 t f \n", + "1083589 t f \n", + "252642 t f \n", + "6821 t \n", + "1229113 t f \n", + "... ... .. \n", + "293631 t f \n", + "431234 t f \n", + "2299184 t f \n", + "1985563 t \n", + "331020 t f \n", + "\n", + "[4999 rows x 11 columns]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wf_frame_subset = wf_frame_1[1:5000]\n", + "wf_frame_subset" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "id": "69ae32d2-928b-4c1f-a059-338fa1d4e494", + "metadata": {}, + "outputs": [], + "source": [ + "wf_frame_subset.to_csv(\"../data/wf_frame_subset_march_22_5000.csv\", index=None, header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "cb300cf4-98c1-4eed-85c1-90763128513e", + "metadata": {}, + "outputs": [], + "source": [ + "wf_path_old = \"../data/worflow-connection-20-04.tsv\"\n", + "\n", + "wf_old = pd.read_csv(wf_path_old, sep=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "257c9f14-da19-4b4e-b09c-a08b85cc5f87", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
32013-02-07 16:48:46.7218665Grep11.0.17Remove beginning11.0.0ftf.1
032013-02-07 16:48:46.7218666Cut11.0.18addValue1.0.0ftf
132013-02-07 16:48:46.7218667Remove beginning11.0.09Cut11.0.1ftf
232013-02-07 16:48:46.7218667Remove beginning11.0.06Cut11.0.1ftf
332013-02-07 16:48:46.7218668addValue1.0.011Paste11.0.0ftf
432013-02-07 16:48:46.7218669Cut11.0.111Paste11.0.0ftf
....................................
746133507792020-04-27 14:36:39.197104727112Cut1NaN727113toolshed.g2.bx.psu.edu/repos/devteam/intersect...NaNffNaN
746134507792020-04-27 14:36:39.197104727111toolshed.g2.bx.psu.edu/repos/devteam/column_ma...NaN727112Cut1NaNffNaN
746135507792020-04-27 14:36:39.197104727110toolshed.g2.bx.psu.edu/repos/devteam/column_ma...NaN727111toolshed.g2.bx.psu.edu/repos/devteam/column_ma...NaNffNaN
746136507792020-04-27 14:36:39.197104727108NaNNaN727114toolshed.g2.bx.psu.edu/repos/devteam/intersect...NaNffNaN
746137507792020-04-27 14:36:39.197104727108NaNNaN727113toolshed.g2.bx.psu.edu/repos/devteam/intersect...NaNffNaN
\n", + "

746138 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " 3 2013-02-07 16:48:46.721866 5 \\\n", + "0 3 2013-02-07 16:48:46.721866 6 \n", + "1 3 2013-02-07 16:48:46.721866 7 \n", + "2 3 2013-02-07 16:48:46.721866 7 \n", + "3 3 2013-02-07 16:48:46.721866 8 \n", + "4 3 2013-02-07 16:48:46.721866 9 \n", + "... ... ... ... \n", + "746133 50779 2020-04-27 14:36:39.197104 727112 \n", + "746134 50779 2020-04-27 14:36:39.197104 727111 \n", + "746135 50779 2020-04-27 14:36:39.197104 727110 \n", + "746136 50779 2020-04-27 14:36:39.197104 727108 \n", + "746137 50779 2020-04-27 14:36:39.197104 727108 \n", + "\n", + " Grep1 1.0.1 7 \\\n", + "0 Cut1 1.0.1 8 \n", + "1 Remove beginning1 1.0.0 9 \n", + "2 Remove beginning1 1.0.0 6 \n", + "3 addValue 1.0.0 11 \n", + "4 Cut1 1.0.1 11 \n", + "... ... ... ... \n", + "746133 Cut1 NaN 727113 \n", + "746134 toolshed.g2.bx.psu.edu/repos/devteam/column_ma... NaN 727112 \n", + "746135 toolshed.g2.bx.psu.edu/repos/devteam/column_ma... NaN 727111 \n", + "746136 NaN NaN 727114 \n", + "746137 NaN NaN 727113 \n", + "\n", + " Remove beginning1 1.0.0 f t f.1 \n", + "0 addValue 1.0.0 f t f \n", + "1 Cut1 1.0.1 f t f \n", + "2 Cut1 1.0.1 f t f \n", + "3 Paste1 1.0.0 f t f \n", + "4 Paste1 1.0.0 f t f \n", + "... ... ... .. .. ... \n", + "746133 toolshed.g2.bx.psu.edu/repos/devteam/intersect... NaN f f NaN \n", + "746134 Cut1 NaN f f NaN \n", + "746135 toolshed.g2.bx.psu.edu/repos/devteam/column_ma... NaN f f NaN \n", + "746136 toolshed.g2.bx.psu.edu/repos/devteam/intersect... NaN f f NaN \n", + "746137 toolshed.g2.bx.psu.edu/repos/devteam/intersect... NaN f f NaN \n", + "\n", + "[746138 rows x 11 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wf_old" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "821608ab-3b08-4ff3-99e1-6db5f04d3e05", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
0...monthcount
1----------------------------------------------...NaNNaN
2toolshed.g2.bx.psu.edu/repos/iuc/snpsift/snpS...2022-02-0167151
3toolshed.g2.bx.psu.edu/repos/devteam/column_m...2022-02-0150054
4upload1 ...2022-02-0143169
............
27203toolshed.g2.bx.psu.edu/repos/iuc/hmmer_jackhm...2021-02-011
27204toolshed.g2.bx.psu.edu/repos/galaxyp/openms_h...2021-02-011
27205toolshed.g2.bx.psu.edu/repos/iuc/circos/circo...2021-02-011
27206toolshed.g2.bx.psu.edu/repos/bgruening/chembl...2021-02-011
27207(27205 rows)NaNNaN
\n", + "

27208 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 \\\n", + "0 ... month \n", + "1 ----------------------------------------------... NaN \n", + "2 toolshed.g2.bx.psu.edu/repos/iuc/snpsift/snpS... 2022-02-01 \n", + "3 toolshed.g2.bx.psu.edu/repos/devteam/column_m... 2022-02-01 \n", + "4 upload1 ... 2022-02-01 \n", + "... ... ... \n", + "27203 toolshed.g2.bx.psu.edu/repos/iuc/hmmer_jackhm... 2021-02-01 \n", + "27204 toolshed.g2.bx.psu.edu/repos/galaxyp/openms_h... 2021-02-01 \n", + "27205 toolshed.g2.bx.psu.edu/repos/iuc/circos/circo... 2021-02-01 \n", + "27206 toolshed.g2.bx.psu.edu/repos/bgruening/chembl... 2021-02-01 \n", + "27207 (27205 rows) NaN \n", + "\n", + " 2 \n", + "0 count \n", + "1 NaN \n", + "2 67151 \n", + "3 50054 \n", + "4 43169 \n", + "... ... \n", + "27203 1 \n", + "27204 1 \n", + "27205 1 \n", + "27206 1 \n", + "27207 NaN \n", + "\n", + "[27208 rows x 3 columns]" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tool_popu_path = \"../data/tool-popularity_Feb_22_12.csv\"\n", + "tool_popu = pd.read_csv(tool_popu_path, sep=\"|\", header=None)\n", + "tool_popu" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "b5330f2c-9533-40d6-9731-62edd9d950b0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
012
2toolshed.g2.bx.psu.edu/repos/iuc/snpsift/snpS...2022-02-0167151
3toolshed.g2.bx.psu.edu/repos/devteam/column_m...2022-02-0150054
4upload1 ...2022-02-0143169
5toolshed.g2.bx.psu.edu/repos/bgruening/text_p...2022-02-0139139
6Cut1 ...2022-02-0129046
............
27202toolshed.g2.bx.psu.edu/repos/devteam/emboss_5...2021-02-011
27203toolshed.g2.bx.psu.edu/repos/iuc/hmmer_jackhm...2021-02-011
27204toolshed.g2.bx.psu.edu/repos/galaxyp/openms_h...2021-02-011
27205toolshed.g2.bx.psu.edu/repos/iuc/circos/circo...2021-02-011
27206toolshed.g2.bx.psu.edu/repos/bgruening/chembl...2021-02-011
\n", + "

27205 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 \\\n", + "2 toolshed.g2.bx.psu.edu/repos/iuc/snpsift/snpS... 2022-02-01 \n", + "3 toolshed.g2.bx.psu.edu/repos/devteam/column_m... 2022-02-01 \n", + "4 upload1 ... 2022-02-01 \n", + "5 toolshed.g2.bx.psu.edu/repos/bgruening/text_p... 2022-02-01 \n", + "6 Cut1 ... 2022-02-01 \n", + "... ... ... \n", + "27202 toolshed.g2.bx.psu.edu/repos/devteam/emboss_5... 2021-02-01 \n", + "27203 toolshed.g2.bx.psu.edu/repos/iuc/hmmer_jackhm... 2021-02-01 \n", + "27204 toolshed.g2.bx.psu.edu/repos/galaxyp/openms_h... 2021-02-01 \n", + "27205 toolshed.g2.bx.psu.edu/repos/iuc/circos/circo... 2021-02-01 \n", + "27206 toolshed.g2.bx.psu.edu/repos/bgruening/chembl... 2021-02-01 \n", + "\n", + " 2 \n", + "2 67151 \n", + "3 50054 \n", + "4 43169 \n", + "5 39139 \n", + "6 29046 \n", + "... ... \n", + "27202 1 \n", + "27203 1 \n", + "27204 1 \n", + "27205 1 \n", + "27206 1 \n", + "\n", + "[27205 rows x 3 columns]" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tool_popu_new = tool_popu[2:len(tool_popu.index) - 1]\n", + "tool_popu_new" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "0fcd3329-30c2-4051-a3f1-6538ac9967eb", + "metadata": {}, + "outputs": [], + "source": [ + "tool_popu_new.to_csv(\"../data/tool_popularity_march_22.csv\", index=None, header=None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bb00aa7-56c8-4913-a3e8-74a3cc505064", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}