Quick curation fix: allow revisions via webpage, do not approve curat…

…ed revisions (by users other than insdc_ingest_user), send notification if ingest wants to revise a curated sequence.  resolves #  preview URL: ### Summary  ### Screenshot  ### PR Checklist  - [ ] All necessary documentation has been adapted. - [ ] The implemented feature is covered by an appropriate test.
loculus-project · Oct 30, 2024 · c2a6788 · c2a6788
1 parent 93d9864
commit c2a6788
Show file tree

Hide file tree

Showing 8 changed files with 68,221 additions and 0 deletions.
diff --git a/ingest/CCHF.ipynb b/ingest/CCHF.ipynb
@@ -0,0 +1,156 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "We find 4760 accessions.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from Bio import SeqIO\n",
+    "\n",
+    "input_seq = \"results/sequences.fasta\"\n",
+    "\n",
+    "# Keep only the part of each FASTA record id that precedes the first \".\".\n",
+    "fasta_accessions = set()\n",
+    "with open(input_seq) as fasta_handle:\n",
+    "    for fasta_record in SeqIO.parse(fasta_handle, \"fasta\"):\n",
+    "        fasta_accessions.add(fasta_record.id.split(\".\")[0])\n",
+    "print(f\"We find {len(fasta_accessions)} accessions.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "By decreasing the minSeedCover to 10% we drop from 123 unaligned sequences to 48. By further decreasing the kmer size to 7 we drop to only 34 sequences that do not align. Finally, by adding `--retry-reverse-complement` we drop to only 9 sequences which do not align (i.e. 25 sequences aligned only when using the reverse complement)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "We are able to align 4717 accessions.\n",
+      "This mean a total of 43 sequences failed to align.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Collect the first tab-separated field of every non-blank row after the header, cut at the first \".\".\n",
+    "with open(\"results/nextclade_merged.tsv\", \"r\") as file:\n",
+    "    file.readline()  # skip the header row\n",
+    "    aligned_sequences = {line.split('\\t')[0].split(\".\")[0] for line in file if line.strip()}\n",
+    "\n",
+    "failed_alignment = fasta_accessions - aligned_sequences  # reused below and by the next cell\n",
+    "print(f\"We are able to align {len(aligned_sequences)} accessions.\")\n",
+    "print(f\"This mean a total of {len(failed_alignment)} sequences failed to align.\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A large number of the sequences that have still failed to align are for the envelope glycoprotein precursor and are under 500bp."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Sequences that failed to align:\n",
+      "ON564456.1\n",
+      "OR633377.1\n",
+      "MW424419.1\n",
+      "MZ322095.1\n",
+      "MZ326698.1\n",
+      "MZ361738.1\n",
+      "OK573219.1\n",
+      "OK573222.1\n",
+      "MH557793.1\n",
+      "KF425575.1\n",
+      "KJ545687.1\n",
+      "KJ545688.1\n",
+      "KJ545689.1\n",
+      "KJ545690.1\n",
+      "EF189740.1\n",
+      "EF189741.1\n",
+      "EF189742.1\n",
+      "EF189743.1\n",
+      "EF189744.1\n",
+      "EF189745.1\n",
+      "EF189746.1\n",
+      "EF189747.1\n",
+      "EF189748.1\n",
+      "EF189749.1\n",
+      "EF189750.1\n",
+      "EF189751.1\n",
+      "EF189752.1\n",
+      "EF189753.1\n",
+      "FV537244.1\n",
+      "FV537245.1\n",
+      "FV537248.1\n",
+      "FV537249.1\n",
+      "FV537252.1\n",
+      "FV537253.1\n",
+      "GN358810.1\n",
+      "GN358812.1\n",
+      "GN358814.1\n",
+      "JF807429.1\n",
+      "JF807434.1\n",
+      "AF492422.1\n",
+      "AF492423.1\n",
+      "AF492424.1\n",
+      "AY049081.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Sequences that failed to align:\")\n",
+    "with open(\"unaligned_sequences.fasta\", \"w\", encoding=\"utf-8\") as output_file:\n",
+    "    with open(\"results/sequences.fasta\") as f:\n",
+    "        for record in SeqIO.parse(f, \"fasta\"):\n",
+    "            if record.id.split(\".\")[0] in failed_alignment:\n",
+    "                print(record.description)\n",
+    "                SeqIO.write(record, output_file, \"fasta\")  # save the record so the output file is not left empty"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}