diff --git a/ipython_script/analyse_wf_march_22.ipynb b/ipython_script/analyse_wf_march_22.ipynb
new file mode 100644
index 0000000..9b97ad5
--- /dev/null
+++ b/ipython_script/analyse_wf_march_22.ipynb
@@ -0,0 +1,1822 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 46,
+ "id": "6f18f94d-2489-42d6-a9e7-172c8fadd39f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "id": "ad85fbb3-57aa-49b5-a5d9-8f9d63321309",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wf_path = \"../data/workflow-connections_Feb_22_1.csv\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "id": "0d460c01-3cde-4163-9764-220dc5db00ae",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/anupkumar/anaconda3/envs/tool_prediction_gru_wc/lib/python3.6/site-packages/IPython/core/interactiveshell.py:3072: DtypeWarning: Columns (0,2,5) have mixed types.Specify dtype option on import or set low_memory=False.\n",
+ " interactivity=interactivity, compiler=compiler, result=result)\n"
+ ]
+ }
+ ],
+ "source": [
+ "wf_frame = pd.read_csv(wf_path, sep=\"|\", header=None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "id": "23807c0e-abaf-4496-99d6-bbfaa784d8f8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " wf_id | \n",
+ " wf_updated | \n",
+ " in_id | \n",
+ " ... | \n",
+ " in_tool_v | \n",
+ " out_id | \n",
+ " ... | \n",
+ " out_tool_v | \n",
+ " published | \n",
+ " deleted | \n",
+ " has_errors | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " --------+------------+---------+--------------... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2013-02-07 | \n",
+ " 5 | \n",
+ " Grep1 ... | \n",
+ " 1.0.1 | \n",
+ " 7 | \n",
+ " Remove beginning1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 2013-02-07 | \n",
+ " 6 | \n",
+ " Cut1 ... | \n",
+ " 1.0.1 | \n",
+ " 8 | \n",
+ " addValue ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3 | \n",
+ " 2013-02-07 | \n",
+ " 7 | \n",
+ " Remove beginning1 ... | \n",
+ " 1.0.0 | \n",
+ " 6 | \n",
+ " Cut1 ... | \n",
+ " 1.0.1 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 2412941 | \n",
+ " 134921 | \n",
+ " 2022-02-15 | \n",
+ " 1.9924e+06 | \n",
+ " ... | \n",
+ " | \n",
+ " 1.99235e+06 | \n",
+ " toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... | \n",
+ " 0.38.1 | \n",
+ " f | \n",
+ " f | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 2412942 | \n",
+ " 134921 | \n",
+ " 2022-02-15 | \n",
+ " 1.99240e+06 | \n",
+ " ... | \n",
+ " | \n",
+ " 1.99235e+06 | \n",
+ " toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... | \n",
+ " 0.38.1 | \n",
+ " f | \n",
+ " f | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 2412943 | \n",
+ " 134921 | \n",
+ " 2022-02-15 | \n",
+ " 1.99241e+06 | \n",
+ " ... | \n",
+ " | \n",
+ " 1.99235e+06 | \n",
+ " toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... | \n",
+ " 0.38.1 | \n",
+ " f | \n",
+ " f | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 2412944 | \n",
+ " 134921 | \n",
+ " 2022-02-15 | \n",
+ " 1.99241e+06 | \n",
+ " ... | \n",
+ " | \n",
+ " 1.99236e+06 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/... | \n",
+ " 2.4.2+galaxy0 | \n",
+ " f | \n",
+ " f | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 2412945 | \n",
+ " (2412943 rows) | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2412946 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 \\\n",
+ "0 wf_id wf_updated \n",
+ "1 --------+------------+---------+--------------... NaN \n",
+ "2 3 2013-02-07 \n",
+ "3 3 2013-02-07 \n",
+ "4 3 2013-02-07 \n",
+ "... ... ... \n",
+ "2412941 134921 2022-02-15 \n",
+ "2412942 134921 2022-02-15 \n",
+ "2412943 134921 2022-02-15 \n",
+ "2412944 134921 2022-02-15 \n",
+ "2412945 (2412943 rows) NaN \n",
+ "\n",
+ " 2 3 \\\n",
+ "0 in_id ... \n",
+ "1 NaN NaN \n",
+ "2 5 Grep1 ... \n",
+ "3 6 Cut1 ... \n",
+ "4 7 Remove beginning1 ... \n",
+ "... ... ... \n",
+ "2412941 1.9924e+06 ... \n",
+ "2412942 1.99240e+06 ... \n",
+ "2412943 1.99241e+06 ... \n",
+ "2412944 1.99241e+06 ... \n",
+ "2412945 NaN NaN \n",
+ "\n",
+ " 4 5 \\\n",
+ "0 in_tool_v out_id \n",
+ "1 NaN NaN \n",
+ "2 1.0.1 7 \n",
+ "3 1.0.1 8 \n",
+ "4 1.0.0 6 \n",
+ "... ... ... \n",
+ "2412941 1.99235e+06 \n",
+ "2412942 1.99235e+06 \n",
+ "2412943 1.99235e+06 \n",
+ "2412944 1.99236e+06 \n",
+ "2412945 NaN NaN \n",
+ "\n",
+ " 6 \\\n",
+ "0 ... \n",
+ "1 NaN \n",
+ "2 Remove beginning1 ... \n",
+ "3 addValue ... \n",
+ "4 Cut1 ... \n",
+ "... ... \n",
+ "2412941 toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... \n",
+ "2412942 toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... \n",
+ "2412943 toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... \n",
+ "2412944 toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/... \n",
+ "2412945 NaN \n",
+ "\n",
+ " 7 8 \\\n",
+ "0 out_tool_v published \n",
+ "1 NaN NaN \n",
+ "2 1.0.0 f \n",
+ "3 1.0.0 f \n",
+ "4 1.0.1 f \n",
+ "... ... ... \n",
+ "2412941 0.38.1 f \n",
+ "2412942 0.38.1 f \n",
+ "2412943 0.38.1 f \n",
+ "2412944 2.4.2+galaxy0 f \n",
+ "2412945 NaN NaN \n",
+ "\n",
+ " 9 10 \n",
+ "0 deleted has_errors \n",
+ "1 NaN NaN \n",
+ "2 t f \n",
+ "3 t f \n",
+ "4 t f \n",
+ "... ... ... \n",
+ "2412941 f f \n",
+ "2412942 f f \n",
+ "2412943 f f \n",
+ "2412944 f f \n",
+ "2412945 NaN NaN \n",
+ "\n",
+ "[2412946 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wf_frame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 50,
+ "id": "9dd25661-cab1-4a78-8570-cd5696b8bb38",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wf_frame_1 = wf_frame[1:len(wf_frame.index)-1]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "id": "30d18e50-8561-4d8b-869b-2d31ae0cf86a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2013-02-07 | \n",
+ " 5 | \n",
+ " Grep1 ... | \n",
+ " 1.0.1 | \n",
+ " 7 | \n",
+ " Remove beginning1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 2013-02-07 | \n",
+ " 6 | \n",
+ " Cut1 ... | \n",
+ " 1.0.1 | \n",
+ " 8 | \n",
+ " addValue ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3 | \n",
+ " 2013-02-07 | \n",
+ " 7 | \n",
+ " Remove beginning1 ... | \n",
+ " 1.0.0 | \n",
+ " 6 | \n",
+ " Cut1 ... | \n",
+ " 1.0.1 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 3 | \n",
+ " 2013-02-07 | \n",
+ " 7 | \n",
+ " Remove beginning1 ... | \n",
+ " 1.0.0 | \n",
+ " 9 | \n",
+ " Cut1 ... | \n",
+ " 1.0.1 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 3 | \n",
+ " 2013-02-07 | \n",
+ " 8 | \n",
+ " addValue ... | \n",
+ " 1.0.0 | \n",
+ " 11 | \n",
+ " Paste1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 2412940 | \n",
+ " 134921 | \n",
+ " 2022-02-15 | \n",
+ " 1.9924e+06 | \n",
+ " ... | \n",
+ " | \n",
+ " 1.99235e+06 | \n",
+ " toolshed.g2.bx.psu.edu/repos/iuc/filter_tabul... | \n",
+ " 2.0.0 | \n",
+ " f | \n",
+ " f | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 2412941 | \n",
+ " 134921 | \n",
+ " 2022-02-15 | \n",
+ " 1.9924e+06 | \n",
+ " ... | \n",
+ " | \n",
+ " 1.99235e+06 | \n",
+ " toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... | \n",
+ " 0.38.1 | \n",
+ " f | \n",
+ " f | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 2412942 | \n",
+ " 134921 | \n",
+ " 2022-02-15 | \n",
+ " 1.99240e+06 | \n",
+ " ... | \n",
+ " | \n",
+ " 1.99235e+06 | \n",
+ " toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... | \n",
+ " 0.38.1 | \n",
+ " f | \n",
+ " f | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 2412943 | \n",
+ " 134921 | \n",
+ " 2022-02-15 | \n",
+ " 1.99241e+06 | \n",
+ " ... | \n",
+ " | \n",
+ " 1.99235e+06 | \n",
+ " toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... | \n",
+ " 0.38.1 | \n",
+ " f | \n",
+ " f | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 2412944 | \n",
+ " 134921 | \n",
+ " 2022-02-15 | \n",
+ " 1.99241e+06 | \n",
+ " ... | \n",
+ " | \n",
+ " 1.99236e+06 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/... | \n",
+ " 2.4.2+galaxy0 | \n",
+ " f | \n",
+ " f | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2412943 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 \\\n",
+ "2 3 2013-02-07 5 \n",
+ "3 3 2013-02-07 6 \n",
+ "4 3 2013-02-07 7 \n",
+ "5 3 2013-02-07 7 \n",
+ "6 3 2013-02-07 8 \n",
+ "... ... ... ... \n",
+ "2412940 134921 2022-02-15 1.9924e+06 \n",
+ "2412941 134921 2022-02-15 1.9924e+06 \n",
+ "2412942 134921 2022-02-15 1.99240e+06 \n",
+ "2412943 134921 2022-02-15 1.99241e+06 \n",
+ "2412944 134921 2022-02-15 1.99241e+06 \n",
+ "\n",
+ " 3 \\\n",
+ "2 Grep1 ... \n",
+ "3 Cut1 ... \n",
+ "4 Remove beginning1 ... \n",
+ "5 Remove beginning1 ... \n",
+ "6 addValue ... \n",
+ "... ... \n",
+ "2412940 ... \n",
+ "2412941 ... \n",
+ "2412942 ... \n",
+ "2412943 ... \n",
+ "2412944 ... \n",
+ "\n",
+ " 4 5 \\\n",
+ "2 1.0.1 7 \n",
+ "3 1.0.1 8 \n",
+ "4 1.0.0 6 \n",
+ "5 1.0.0 9 \n",
+ "6 1.0.0 11 \n",
+ "... ... ... \n",
+ "2412940 1.99235e+06 \n",
+ "2412941 1.99235e+06 \n",
+ "2412942 1.99235e+06 \n",
+ "2412943 1.99235e+06 \n",
+ "2412944 1.99236e+06 \n",
+ "\n",
+ " 6 \\\n",
+ "2 Remove beginning1 ... \n",
+ "3 addValue ... \n",
+ "4 Cut1 ... \n",
+ "5 Cut1 ... \n",
+ "6 Paste1 ... \n",
+ "... ... \n",
+ "2412940 toolshed.g2.bx.psu.edu/repos/iuc/filter_tabul... \n",
+ "2412941 toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... \n",
+ "2412942 toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... \n",
+ "2412943 toolshed.g2.bx.psu.edu/repos/pjbriggs/trimmom... \n",
+ "2412944 toolshed.g2.bx.psu.edu/repos/devteam/bowtie2/... \n",
+ "\n",
+ " 7 8 \\\n",
+ "2 1.0.0 f \n",
+ "3 1.0.0 f \n",
+ "4 1.0.1 f \n",
+ "5 1.0.1 f \n",
+ "6 1.0.0 f \n",
+ "... ... ... \n",
+ "2412940 2.0.0 f \n",
+ "2412941 0.38.1 f \n",
+ "2412942 0.38.1 f \n",
+ "2412943 0.38.1 f \n",
+ "2412944 2.4.2+galaxy0 f \n",
+ "\n",
+ " 9 10 \n",
+ "2 t f \n",
+ "3 t f \n",
+ "4 t f \n",
+ "5 t f \n",
+ "6 t f \n",
+ "... ... .. \n",
+ "2412940 f f \n",
+ "2412941 f f \n",
+ "2412942 f f \n",
+ "2412943 f f \n",
+ "2412944 f f \n",
+ "\n",
+ "[2412943 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 51,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wf_frame_1 = wf_frame_1[1:]\n",
+ "wf_frame_1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "id": "13182050-3366-4a3b-b121-5c3d4e8555a3",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 1910966 | \n",
+ " 72418 | \n",
+ " 2020-12-04 | \n",
+ " 1018284 | \n",
+ " toolshed.g2.bx.psu.edu/repos/iuc/anndata_mani... | \n",
+ " 0.6.22.post1+galaxy1 | \n",
+ " 1018287 | \n",
+ " toolshed.g2.bx.psu.edu/repos/iuc/scanpy_plot/... | \n",
+ " 1.4.4.post1+galaxy1 | \n",
+ " f | \n",
+ " f | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 459259 | \n",
+ " 1915 | \n",
+ " 2015-01-19 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 1083589 | \n",
+ " 1915 | \n",
+ " 2015-01-19 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 252642 | \n",
+ " 1915 | \n",
+ " 2015-01-19 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 6821 | \n",
+ " 887 | \n",
+ " 2014-03-14 | \n",
+ " 15219 | \n",
+ " ... | \n",
+ " | \n",
+ " 15230 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/fastqc/f... | \n",
+ " | \n",
+ " f | \n",
+ " t | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 1592784 | \n",
+ " 21747 | \n",
+ " 2018-12-18 | \n",
+ " 331828 | \n",
+ " ... | \n",
+ " | \n",
+ " 331801 | \n",
+ " toolshed.g2.bx.psu.edu/repos/iuc/stacks_procr... | \n",
+ " 1.46.0 | \n",
+ " f | \n",
+ " f | \n",
+ " t | \n",
+ "
\n",
+ " \n",
+ " 1590874 | \n",
+ " 21600 | \n",
+ " 2018-12-17 | \n",
+ " 328839 | \n",
+ " toolshed.g2.bx.psu.edu/repos/galaxyp/openms_m... | \n",
+ " 2.2.0 | \n",
+ " 328835 | \n",
+ " toolshed.g2.bx.psu.edu/repos/galaxyp/openms_p... | \n",
+ " 2.2.0 | \n",
+ " f | \n",
+ " f | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 1474236 | \n",
+ " 5488 | \n",
+ " 2016-07-29 | \n",
+ " 81677 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/get_flan... | \n",
+ " 1.0.0 | \n",
+ " 81707 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/join/gop... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 1713787 | \n",
+ " 39694 | \n",
+ " 2019-11-21 | \n",
+ " 597867 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/bamtools... | \n",
+ " 2.4.1 | \n",
+ " 597868 | \n",
+ " toolshed.g2.bx.psu.edu/repos/iuc/umi_tools_co... | \n",
+ " 0.5.3.2 | \n",
+ " f | \n",
+ " f | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 2158410 | \n",
+ " 105860 | \n",
+ " 2021-08-14 | \n",
+ " 1556606 | \n",
+ " ... | \n",
+ " | \n",
+ " 1556612 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/fastqc/f... | \n",
+ " 0.72+galaxy1 | \n",
+ " f | \n",
+ " f | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
2412943 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 \\\n",
+ "1910966 72418 2020-12-04 1018284 \n",
+ "459259 1915 2015-01-19 29099 \n",
+ "1083589 1915 2015-01-19 29099 \n",
+ "252642 1915 2015-01-19 29099 \n",
+ "6821 887 2014-03-14 15219 \n",
+ "... ... ... ... \n",
+ "1592784 21747 2018-12-18 331828 \n",
+ "1590874 21600 2018-12-17 328839 \n",
+ "1474236 5488 2016-07-29 81677 \n",
+ "1713787 39694 2019-11-21 597867 \n",
+ "2158410 105860 2021-08-14 1556606 \n",
+ "\n",
+ " 3 \\\n",
+ "1910966 toolshed.g2.bx.psu.edu/repos/iuc/anndata_mani... \n",
+ "459259 cat1 ... \n",
+ "1083589 cat1 ... \n",
+ "252642 cat1 ... \n",
+ "6821 ... \n",
+ "... ... \n",
+ "1592784 ... \n",
+ "1590874 toolshed.g2.bx.psu.edu/repos/galaxyp/openms_m... \n",
+ "1474236 toolshed.g2.bx.psu.edu/repos/devteam/get_flan... \n",
+ "1713787 toolshed.g2.bx.psu.edu/repos/devteam/bamtools... \n",
+ "2158410 ... \n",
+ "\n",
+ " 4 5 \\\n",
+ "1910966 0.6.22.post1+galaxy1 1018287 \n",
+ "459259 1.0.0 29099 \n",
+ "1083589 1.0.0 29099 \n",
+ "252642 1.0.0 29099 \n",
+ "6821 15230 \n",
+ "... ... ... \n",
+ "1592784 331801 \n",
+ "1590874 2.2.0 328835 \n",
+ "1474236 1.0.0 81707 \n",
+ "1713787 2.4.1 597868 \n",
+ "2158410 1556612 \n",
+ "\n",
+ " 6 \\\n",
+ "1910966 toolshed.g2.bx.psu.edu/repos/iuc/scanpy_plot/... \n",
+ "459259 cat1 ... \n",
+ "1083589 cat1 ... \n",
+ "252642 cat1 ... \n",
+ "6821 toolshed.g2.bx.psu.edu/repos/devteam/fastqc/f... \n",
+ "... ... \n",
+ "1592784 toolshed.g2.bx.psu.edu/repos/iuc/stacks_procr... \n",
+ "1590874 toolshed.g2.bx.psu.edu/repos/galaxyp/openms_p... \n",
+ "1474236 toolshed.g2.bx.psu.edu/repos/devteam/join/gop... \n",
+ "1713787 toolshed.g2.bx.psu.edu/repos/iuc/umi_tools_co... \n",
+ "2158410 toolshed.g2.bx.psu.edu/repos/devteam/fastqc/f... \n",
+ "\n",
+ " 7 8 \\\n",
+ "1910966 1.4.4.post1+galaxy1 f \n",
+ "459259 1.0.0 f \n",
+ "1083589 1.0.0 f \n",
+ "252642 1.0.0 f \n",
+ "6821 f \n",
+ "... ... ... \n",
+ "1592784 1.46.0 f \n",
+ "1590874 2.2.0 f \n",
+ "1474236 1.0.0 f \n",
+ "1713787 0.5.3.2 f \n",
+ "2158410 0.72+galaxy1 f \n",
+ "\n",
+ " 9 10 \n",
+ "1910966 f f \n",
+ "459259 t f \n",
+ "1083589 t f \n",
+ "252642 t f \n",
+ "6821 t \n",
+ "... ... .. \n",
+ "1592784 f t \n",
+ "1590874 f f \n",
+ "1474236 t f \n",
+ "1713787 f f \n",
+ "2158410 f \n",
+ "\n",
+ "[2412943 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 53,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wf_frame_1 = wf_frame_1.sample(frac=1)\n",
+ "wf_frame_1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "id": "f1b96941-6fa4-4735-a292-4a19e42fe9f7",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 459259 | \n",
+ " 1915 | \n",
+ " 2015-01-19 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 1083589 | \n",
+ " 1915 | \n",
+ " 2015-01-19 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 252642 | \n",
+ " 1915 | \n",
+ " 2015-01-19 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 6821 | \n",
+ " 887 | \n",
+ " 2014-03-14 | \n",
+ " 15219 | \n",
+ " ... | \n",
+ " | \n",
+ " 15230 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/fastqc/f... | \n",
+ " | \n",
+ " f | \n",
+ " t | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 1229113 | \n",
+ " 1915 | \n",
+ " 2015-01-19 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 293631 | \n",
+ " 1915 | \n",
+ " 2015-01-19 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 431234 | \n",
+ " 1915 | \n",
+ " 2015-01-19 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 2299184 | \n",
+ " 125597 | \n",
+ " 2022-01-27 | \n",
+ " 1850173 | \n",
+ " ... | \n",
+ " | \n",
+ " 1850165 | \n",
+ " toolshed.g2.bx.psu.edu/repos/iuc/purge_dups/p... | \n",
+ " 1.2.5+galaxy4 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 1985563 | \n",
+ " 84931 | \n",
+ " 2021-03-18 | \n",
+ " 1187845 | \n",
+ " addValue ... | \n",
+ " 1.0.0 | \n",
+ " 1187846 | \n",
+ " toolshed.g2.bx.psu.edu/repos/bgruening/text_p... | \n",
+ " 0.1.0 | \n",
+ " f | \n",
+ " t | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " 331020 | \n",
+ " 1915 | \n",
+ " 2015-01-19 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " 29099 | \n",
+ " cat1 ... | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
4999 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 2 \\\n",
+ "459259 1915 2015-01-19 29099 \n",
+ "1083589 1915 2015-01-19 29099 \n",
+ "252642 1915 2015-01-19 29099 \n",
+ "6821 887 2014-03-14 15219 \n",
+ "1229113 1915 2015-01-19 29099 \n",
+ "... ... ... ... \n",
+ "293631 1915 2015-01-19 29099 \n",
+ "431234 1915 2015-01-19 29099 \n",
+ "2299184 125597 2022-01-27 1850173 \n",
+ "1985563 84931 2021-03-18 1187845 \n",
+ "331020 1915 2015-01-19 29099 \n",
+ "\n",
+ " 3 \\\n",
+ "459259 cat1 ... \n",
+ "1083589 cat1 ... \n",
+ "252642 cat1 ... \n",
+ "6821 ... \n",
+ "1229113 cat1 ... \n",
+ "... ... \n",
+ "293631 cat1 ... \n",
+ "431234 cat1 ... \n",
+ "2299184 ... \n",
+ "1985563 addValue ... \n",
+ "331020 cat1 ... \n",
+ "\n",
+ " 4 5 \\\n",
+ "459259 1.0.0 29099 \n",
+ "1083589 1.0.0 29099 \n",
+ "252642 1.0.0 29099 \n",
+ "6821 15230 \n",
+ "1229113 1.0.0 29099 \n",
+ "... ... ... \n",
+ "293631 1.0.0 29099 \n",
+ "431234 1.0.0 29099 \n",
+ "2299184 1850165 \n",
+ "1985563 1.0.0 1187846 \n",
+ "331020 1.0.0 29099 \n",
+ "\n",
+ " 6 \\\n",
+ "459259 cat1 ... \n",
+ "1083589 cat1 ... \n",
+ "252642 cat1 ... \n",
+ "6821 toolshed.g2.bx.psu.edu/repos/devteam/fastqc/f... \n",
+ "1229113 cat1 ... \n",
+ "... ... \n",
+ "293631 cat1 ... \n",
+ "431234 cat1 ... \n",
+ "2299184 toolshed.g2.bx.psu.edu/repos/iuc/purge_dups/p... \n",
+ "1985563 toolshed.g2.bx.psu.edu/repos/bgruening/text_p... \n",
+ "331020 cat1 ... \n",
+ "\n",
+ " 7 8 \\\n",
+ "459259 1.0.0 f \n",
+ "1083589 1.0.0 f \n",
+ "252642 1.0.0 f \n",
+ "6821 f \n",
+ "1229113 1.0.0 f \n",
+ "... ... ... \n",
+ "293631 1.0.0 f \n",
+ "431234 1.0.0 f \n",
+ "2299184 1.2.5+galaxy4 f \n",
+ "1985563 0.1.0 f \n",
+ "331020 1.0.0 f \n",
+ "\n",
+ " 9 10 \n",
+ "459259 t f \n",
+ "1083589 t f \n",
+ "252642 t f \n",
+ "6821 t \n",
+ "1229113 t f \n",
+ "... ... .. \n",
+ "293631 t f \n",
+ "431234 t f \n",
+ "2299184 t f \n",
+ "1985563 t \n",
+ "331020 t f \n",
+ "\n",
+ "[4999 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wf_frame_subset = wf_frame_1[1:5000]\n",
+ "wf_frame_subset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "id": "69ae32d2-928b-4c1f-a059-338fa1d4e494",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wf_frame_subset.to_csv(\"../data/wf_frame_subset_march_22_5000.csv\", index=None, header=None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "cb300cf4-98c1-4eed-85c1-90763128513e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "wf_path_old = \"../data/worflow-connection-20-04.tsv\"\n",
+ "\n",
+ "wf_old = pd.read_csv(wf_path_old, sep=\"\\t\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "id": "257c9f14-da19-4b4e-b09c-a08b85cc5f87",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 3 | \n",
+ " 2013-02-07 16:48:46.721866 | \n",
+ " 5 | \n",
+ " Grep1 | \n",
+ " 1.0.1 | \n",
+ " 7 | \n",
+ " Remove beginning1 | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f.1 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3 | \n",
+ " 2013-02-07 16:48:46.721866 | \n",
+ " 6 | \n",
+ " Cut1 | \n",
+ " 1.0.1 | \n",
+ " 8 | \n",
+ " addValue | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 2013-02-07 16:48:46.721866 | \n",
+ " 7 | \n",
+ " Remove beginning1 | \n",
+ " 1.0.0 | \n",
+ " 9 | \n",
+ " Cut1 | \n",
+ " 1.0.1 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2013-02-07 16:48:46.721866 | \n",
+ " 7 | \n",
+ " Remove beginning1 | \n",
+ " 1.0.0 | \n",
+ " 6 | \n",
+ " Cut1 | \n",
+ " 1.0.1 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 2013-02-07 16:48:46.721866 | \n",
+ " 8 | \n",
+ " addValue | \n",
+ " 1.0.0 | \n",
+ " 11 | \n",
+ " Paste1 | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 3 | \n",
+ " 2013-02-07 16:48:46.721866 | \n",
+ " 9 | \n",
+ " Cut1 | \n",
+ " 1.0.1 | \n",
+ " 11 | \n",
+ " Paste1 | \n",
+ " 1.0.0 | \n",
+ " f | \n",
+ " t | \n",
+ " f | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 746133 | \n",
+ " 50779 | \n",
+ " 2020-04-27 14:36:39.197104 | \n",
+ " 727112 | \n",
+ " Cut1 | \n",
+ " NaN | \n",
+ " 727113 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/intersect... | \n",
+ " NaN | \n",
+ " f | \n",
+ " f | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 746134 | \n",
+ " 50779 | \n",
+ " 2020-04-27 14:36:39.197104 | \n",
+ " 727111 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/column_ma... | \n",
+ " NaN | \n",
+ " 727112 | \n",
+ " Cut1 | \n",
+ " NaN | \n",
+ " f | \n",
+ " f | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 746135 | \n",
+ " 50779 | \n",
+ " 2020-04-27 14:36:39.197104 | \n",
+ " 727110 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/column_ma... | \n",
+ " NaN | \n",
+ " 727111 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/column_ma... | \n",
+ " NaN | \n",
+ " f | \n",
+ " f | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 746136 | \n",
+ " 50779 | \n",
+ " 2020-04-27 14:36:39.197104 | \n",
+ " 727108 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 727114 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/intersect... | \n",
+ " NaN | \n",
+ " f | \n",
+ " f | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 746137 | \n",
+ " 50779 | \n",
+ " 2020-04-27 14:36:39.197104 | \n",
+ " 727108 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 727113 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/intersect... | \n",
+ " NaN | \n",
+ " f | \n",
+ " f | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
746138 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 3 2013-02-07 16:48:46.721866 5 \\\n",
+ "0 3 2013-02-07 16:48:46.721866 6 \n",
+ "1 3 2013-02-07 16:48:46.721866 7 \n",
+ "2 3 2013-02-07 16:48:46.721866 7 \n",
+ "3 3 2013-02-07 16:48:46.721866 8 \n",
+ "4 3 2013-02-07 16:48:46.721866 9 \n",
+ "... ... ... ... \n",
+ "746133 50779 2020-04-27 14:36:39.197104 727112 \n",
+ "746134 50779 2020-04-27 14:36:39.197104 727111 \n",
+ "746135 50779 2020-04-27 14:36:39.197104 727110 \n",
+ "746136 50779 2020-04-27 14:36:39.197104 727108 \n",
+ "746137 50779 2020-04-27 14:36:39.197104 727108 \n",
+ "\n",
+ " Grep1 1.0.1 7 \\\n",
+ "0 Cut1 1.0.1 8 \n",
+ "1 Remove beginning1 1.0.0 9 \n",
+ "2 Remove beginning1 1.0.0 6 \n",
+ "3 addValue 1.0.0 11 \n",
+ "4 Cut1 1.0.1 11 \n",
+ "... ... ... ... \n",
+ "746133 Cut1 NaN 727113 \n",
+ "746134 toolshed.g2.bx.psu.edu/repos/devteam/column_ma... NaN 727112 \n",
+ "746135 toolshed.g2.bx.psu.edu/repos/devteam/column_ma... NaN 727111 \n",
+ "746136 NaN NaN 727114 \n",
+ "746137 NaN NaN 727113 \n",
+ "\n",
+ " Remove beginning1 1.0.0 f t f.1 \n",
+ "0 addValue 1.0.0 f t f \n",
+ "1 Cut1 1.0.1 f t f \n",
+ "2 Cut1 1.0.1 f t f \n",
+ "3 Paste1 1.0.0 f t f \n",
+ "4 Paste1 1.0.0 f t f \n",
+ "... ... ... .. .. ... \n",
+ "746133 toolshed.g2.bx.psu.edu/repos/devteam/intersect... NaN f f NaN \n",
+ "746134 Cut1 NaN f f NaN \n",
+ "746135 toolshed.g2.bx.psu.edu/repos/devteam/column_ma... NaN f f NaN \n",
+ "746136 toolshed.g2.bx.psu.edu/repos/devteam/intersect... NaN f f NaN \n",
+ "746137 toolshed.g2.bx.psu.edu/repos/devteam/intersect... NaN f f NaN \n",
+ "\n",
+ "[746138 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wf_old"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "id": "821608ab-3b08-4ff3-99e1-6db5f04d3e05",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " ... | \n",
+ " month | \n",
+ " count | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " ----------------------------------------------... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " toolshed.g2.bx.psu.edu/repos/iuc/snpsift/snpS... | \n",
+ " 2022-02-01 | \n",
+ " 67151 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/column_m... | \n",
+ " 2022-02-01 | \n",
+ " 50054 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " upload1 ... | \n",
+ " 2022-02-01 | \n",
+ " 43169 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 27203 | \n",
+ " toolshed.g2.bx.psu.edu/repos/iuc/hmmer_jackhm... | \n",
+ " 2021-02-01 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 27204 | \n",
+ " toolshed.g2.bx.psu.edu/repos/galaxyp/openms_h... | \n",
+ " 2021-02-01 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 27205 | \n",
+ " toolshed.g2.bx.psu.edu/repos/iuc/circos/circo... | \n",
+ " 2021-02-01 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 27206 | \n",
+ " toolshed.g2.bx.psu.edu/repos/bgruening/chembl... | \n",
+ " 2021-02-01 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 27207 | \n",
+ " (27205 rows) | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
27208 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 \\\n",
+ "0 ... month \n",
+ "1 ----------------------------------------------... NaN \n",
+ "2 toolshed.g2.bx.psu.edu/repos/iuc/snpsift/snpS... 2022-02-01 \n",
+ "3 toolshed.g2.bx.psu.edu/repos/devteam/column_m... 2022-02-01 \n",
+ "4 upload1 ... 2022-02-01 \n",
+ "... ... ... \n",
+ "27203 toolshed.g2.bx.psu.edu/repos/iuc/hmmer_jackhm... 2021-02-01 \n",
+ "27204 toolshed.g2.bx.psu.edu/repos/galaxyp/openms_h... 2021-02-01 \n",
+ "27205 toolshed.g2.bx.psu.edu/repos/iuc/circos/circo... 2021-02-01 \n",
+ "27206 toolshed.g2.bx.psu.edu/repos/bgruening/chembl... 2021-02-01 \n",
+ "27207 (27205 rows) NaN \n",
+ "\n",
+ " 2 \n",
+ "0 count \n",
+ "1 NaN \n",
+ "2 67151 \n",
+ "3 50054 \n",
+ "4 43169 \n",
+ "... ... \n",
+ "27203 1 \n",
+ "27204 1 \n",
+ "27205 1 \n",
+ "27206 1 \n",
+ "27207 NaN \n",
+ "\n",
+ "[27208 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 61,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tool_popu_path = \"../data/tool-popularity_Feb_22_12.csv\"\n",
+ "tool_popu = pd.read_csv(tool_popu_path, sep=\"|\", header=None)\n",
+ "tool_popu"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "id": "b5330f2c-9533-40d6-9731-62edd9d950b0",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 2 | \n",
+ " toolshed.g2.bx.psu.edu/repos/iuc/snpsift/snpS... | \n",
+ " 2022-02-01 | \n",
+ " 67151 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/column_m... | \n",
+ " 2022-02-01 | \n",
+ " 50054 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " upload1 ... | \n",
+ " 2022-02-01 | \n",
+ " 43169 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " toolshed.g2.bx.psu.edu/repos/bgruening/text_p... | \n",
+ " 2022-02-01 | \n",
+ " 39139 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " Cut1 ... | \n",
+ " 2022-02-01 | \n",
+ " 29046 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 27202 | \n",
+ " toolshed.g2.bx.psu.edu/repos/devteam/emboss_5... | \n",
+ " 2021-02-01 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 27203 | \n",
+ " toolshed.g2.bx.psu.edu/repos/iuc/hmmer_jackhm... | \n",
+ " 2021-02-01 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 27204 | \n",
+ " toolshed.g2.bx.psu.edu/repos/galaxyp/openms_h... | \n",
+ " 2021-02-01 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 27205 | \n",
+ " toolshed.g2.bx.psu.edu/repos/iuc/circos/circo... | \n",
+ " 2021-02-01 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 27206 | \n",
+ " toolshed.g2.bx.psu.edu/repos/bgruening/chembl... | \n",
+ " 2021-02-01 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
27205 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " 0 1 \\\n",
+ "2 toolshed.g2.bx.psu.edu/repos/iuc/snpsift/snpS... 2022-02-01 \n",
+ "3 toolshed.g2.bx.psu.edu/repos/devteam/column_m... 2022-02-01 \n",
+ "4 upload1 ... 2022-02-01 \n",
+ "5 toolshed.g2.bx.psu.edu/repos/bgruening/text_p... 2022-02-01 \n",
+ "6 Cut1 ... 2022-02-01 \n",
+ "... ... ... \n",
+ "27202 toolshed.g2.bx.psu.edu/repos/devteam/emboss_5... 2021-02-01 \n",
+ "27203 toolshed.g2.bx.psu.edu/repos/iuc/hmmer_jackhm... 2021-02-01 \n",
+ "27204 toolshed.g2.bx.psu.edu/repos/galaxyp/openms_h... 2021-02-01 \n",
+ "27205 toolshed.g2.bx.psu.edu/repos/iuc/circos/circo... 2021-02-01 \n",
+ "27206 toolshed.g2.bx.psu.edu/repos/bgruening/chembl... 2021-02-01 \n",
+ "\n",
+ " 2 \n",
+ "2 67151 \n",
+ "3 50054 \n",
+ "4 43169 \n",
+ "5 39139 \n",
+ "6 29046 \n",
+ "... ... \n",
+ "27202 1 \n",
+ "27203 1 \n",
+ "27204 1 \n",
+ "27205 1 \n",
+ "27206 1 \n",
+ "\n",
+ "[27205 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tool_popu_new = tool_popu[2:len(tool_popu.index) - 1]\n",
+ "tool_popu_new"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "id": "0fcd3329-30c2-4051-a3f1-6538ac9967eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "tool_popu_new.to_csv(\"../data/tool_popularity_march_22.csv\", index=None, header=None)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8bb00aa7-56c8-4913-a3e8-74a3cc505064",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.6.13"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}