WARNING:Fast5Filter:xxx reads not found! #73

markme123 · 2022-07-18T09:27:15Z

Fast5_subset fast5 separation pass and FAIL, found that the latest R10 FAST5 has a lot of extraction can not be out .
I checked that there were no fast5 incompletions, and this happened on many R10 versions, but not on R9 .
A third of them didn't come out.

| 3108059 of 5017321|######################## | 61% ETA: 0:24:25

3140641 of 5017321|######################## | 62% ETA: 0:23:55
| 3178341 of 5017321|######################## | 63% ETA: 0:23:27
3208333 of 5017321|######################## | 63% ETA: 0:23:03
| 3238381 of 5017321|######################### | 64% ETA: 0:22:36
3273840 of 5017321|######################### | 65% ETA: 0:22:08
| 3303857 of 5017321|######################### | 65% ETA: 0:21:44
333769
\ 3364211 of 5017321|########################## | 67% ETA: 0:20:59
/ 3378623 of 5017321|########################## | 67% ETA: 0:20:53
| 5017321 of 5017321|#######################################|100% Time: 0:43:08
INFO:Fast5Filter:3377354 reads extracted
WARNING:Fast5Filter:1638696 reads not found!

fbrennen · 2022-07-18T09:30:57Z

Hi @markme123 -- thanks for getting in touch. Can you please tell us:

The full command you ran.
Whether or not there is a particular pattern to the reads that were not found -- are they, for example, already in a fail folder somewhere?

markme123 · 2022-07-20T02:16:47Z

fast5_subset_bin = "/opt/miniconda3/bin/fast5_subset"
def fast5_subset(save_path, reads_list, bacth_name, fast5_input=args.fast5_input):
sp.run([fast5_subset_bin, "-i", fast5_input, "-s", save_path, "-l", reads_list, "-t", "25", "-r", "-f", bacth_name])

It's hard to tell if it's a problem with some FAST5 files because there are so many files

fbrennen · 2022-07-20T07:09:14Z

Hi @markme123 -- how are you generating your reads list? Where did you get your input files from? Is there a chance you only have the reads from MinKNOW's "pass" output folder, so the filtering has already been completed?

markme123 · 2022-07-20T09:19:38Z

`#!/usr/bin/env python

-- encoding: utf-8 --

'''
@file : split_f5.py
@time : 2022/03/29 16:09:25
@author : jiangmian
@Version : 1.0
'''

import argparse
from ont_fast5_api.conversion_tools import fast5_subset as fs
import subprocess as sp
import os
import pandas as pd
from Bio import SeqIO
import glob

parser = argparse.ArgumentParser(description="fast5 split, 本程序调用了seqkit")
parser.add_argument('-i', '--fast5_input', dest="fast5_input", required=True, help="fast5 文件所在")
parser.add_argument('-p', '--flow', dest='flow', required=True, help="芯片上机号")
parser.add_argument('-s', '--save_path', dest="save_path", required=True, help="输出目录")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--summary', dest='summary', help="输入summary 文件绝对路径,--summary 与 --fastq 择一选择")
group.add_argument('--fastq', dest='fastq', help="输入pass_fastq,fail_fastq路径, 示例为 /data/fastq_pass,/data/fastq_fail")
parser.add_argument('-b', '--barcode', dest='barcode', action='store_true', help="有barcode")
args = parser.parse_args()

fast5_subset_bin = "/opt/miniconda3/bin/fast5_subset"
def fast5_subset(save_path, reads_list, bacth_name, fast5_input=args.fast5_input):
sp.run([fast5_subset_bin, "-i", fast5_input, "-s", save_path, "-l", reads_list, "-t", "25", "-r", "-f", bacth_name])

def reads_list_out(reads_list, out_name):
with open(out_name, "w") as out:
for line in reads_list:
out.write(line + "\n")

path = os.path.realpath(args.save_path)
if bool(1 - os.path.exists(f"{path}/fast5_pass")):
os.mkdir(f"{path}/fast5_pass")
if bool(1 - os.path.exists(f"{path}/fast5_fail")):
os.mkdir(f"{path}/fast5_fail")

if args.barcode:
if args.summary:
data = pd.read_csv(args.summary, sep="\t")
barcode = set(list(data['barcode_arrangement']))
for i in barcode:
failed = data[(data.passes_filtering == 0) & (data.barcode_arrangement == i)]['read_id']
passed = data[(data.passes_filtering > 0) & (data.barcode_arrangement == i)]['read_id']
reads_list_out(failed, f"{path}/fail_reads_id_list")
reads_list_out(passed, f"{path}/pass_reads_id_list")
fast5_subset(f"{path}/fast5_pass/{i}", f"{path}/pass_reads_id_list", f"{i}_{args.flow}pass")
fast5_subset(f"{path}/fast5_fail/{i}", f"{path}/fail_reads_id_list", f"{i}{args.flow}fail")
print(f"{i} fast5 分离完毕")
elif args.fastq:
pass_fastq = args.fastq.split(",")[0]
fail_fastq = args.fastq.split(",")[1]
barcode = list(map(lambda x : x.split("/")[-1], glob.glob(f"{pass_fastq}/barcode*")))
for i in barcode:
sp.run(["seqkit", "fx2tab", "-i", "-n", f"{pass_fastq}/{i}/*fastq", ">", f"{path}/pass_reads_id_list"], sheel=True)
sp.run(["seqkit", "fx2tab", "-i", "-n", f"{fail_fastq}/{i}/*fastq", ">", f"{path}/fail_reads_id_list"], sheel=True)
fast5_subset(f"{path}/fast5_pass/{i}", f"{path}/pass_reads_id_list", f"{i}{args.flow}pass")
fast5_subset(f"{path}/fast5_fail/{i}", f"{path}/fail_reads_id_list", f"{i}{args.flow}_fail")
print(f"{i} fast5 分离完毕")
else:
print("--summary 与 --fastq 择一选择")
else:
if args.summary:
data = pd.read_csv(args.summary, sep="\t")
failed = data[data.passes_filtering == 0]['read_id']
reads_list_out(failed, f"{path}/fail_reads_id_list")
passed = data[data.passes_filtering > 0]['read_id']
reads_list_out(passed, f"{path}/pass_reads_id_list")
fast5_subset(f"{path}/fast5_pass", f"{path}/pass_reads_id_list", f"{args.flow}_pass")
fast5_subset(f"{path}/fast5_fail", f"{path}/fail_reads_id_list", f"{args.flow}_fail")
elif args.fastq:
pass_fastq = args.fastq.split(",")[0]
fail_fastq = args.fastq.split(",")[1]
sp.run(["seqkit", "fx2tab", "-i", "-n", f"{pass_fastq}/*fastq", ">", f"{path}/pass_reads_id_list"], sheel=True)
sp.run(["seqkit", "fx2tab", "-i", "-n", f"{fail_fastq}/*fastq", ">", f"{path}/fail_reads_id_list"], sheel=True)
fast5_subset(f"{path}/fast5_pass", f"{path}/pass_reads_id_list", f"{args.flow}_pass")
fast5_subset(f"{path}/fast5_fail", f"{path}/fail_reads_id_list", f"{args.flow}_fail")
else:
print("--summary 与 --fastq 择一选择")
print("fast5 分离完毕")
`

markme123 · 2022-07-20T09:21:03Z

The above is all my code. The pass and FAIL parts are extracted separately

fbrennen · 2022-07-20T09:41:10Z

Hi @markme123 -- how about the other questions? Where did you get your input files from? Is there a chance you only have the reads from MinKNOW's "pass" output folder, so the filtering has already been completed?

markme123 · 2022-07-28T06:43:49Z

I have confirmed that FAST5 is all

fbrennen · 2022-07-28T08:46:10Z

Hi @markme123 -- thanks very much for the extra information. I suspect the issue is down to Guppy splitting some of your reads into new ones -- this means that the read_id field in your summary file is not necessary the same as the read_id in your fast5 files. Instead, you need to use the parent_read_id field in the summary file for your call to fast5_subset.

For example, in your code, change these lines:

if args.summary:
data = pd.read_csv(args.summary, sep="\t")
barcode = set(list(data['barcode_arrangement']))
for i in barcode:
failed = data[(data.passes_filtering == 0) & (data.barcode_arrangement == i)]['read_id']
passed = data[(data.passes_filtering > 0) & (data.barcode_arrangement == i)]['read_id']

[...]

if args.summary:
data = pd.read_csv(args.summary, sep="\t")
failed = data[data.passes_filtering == 0]['read_id']
reads_list_out(failed, f"{path}/fail_reads_id_list")
passed = data[data.passes_filtering > 0]['read_id']
reads_list_out(passed, f"{path}/pass_reads_id_list")

To this:

if args.summary:
data = pd.read_csv(args.summary, sep="\t")
barcode = set(list(data['barcode_arrangement']))
for i in barcode:
failed = data[(data.passes_filtering == 0) & (data.barcode_arrangement == i)]['parent_read_id']  # <== changed to parent_read_id
passed = data[(data.passes_filtering > 0) & (data.barcode_arrangement == i)]['parent_read_id']  # <== changed to parent_read_id

[...]

if args.summary:
data = pd.read_csv(args.summary, sep="\t")
failed = data[data.passes_filtering == 0]['parent_read_id']  # <== changed to parent_read_id
reads_list_out(failed, f"{path}/fail_reads_id_list")
passed = data[data.passes_filtering > 0]['parent_read_id']   # <== changed to parent_read_id
reads_list_out(passed, f"{path}/pass_reads_id_list")

Can you try that and see if it works? Note that this method only works with summary files -- it won't work with your seqkit approach and fastqs.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

WARNING:Fast5Filter:xxx reads not found! #73

WARNING:Fast5Filter:xxx reads not found! #73

markme123 commented Jul 18, 2022

fbrennen commented Jul 18, 2022

markme123 commented Jul 20, 2022

fbrennen commented Jul 20, 2022

markme123 commented Jul 20, 2022

markme123 commented Jul 20, 2022

fbrennen commented Jul 20, 2022

markme123 commented Jul 28, 2022

fbrennen commented Jul 28, 2022

WARNING:Fast5Filter:xxx reads not found! #73

WARNING:Fast5Filter:xxx reads not found! #73

Comments

markme123 commented Jul 18, 2022

fbrennen commented Jul 18, 2022

markme123 commented Jul 20, 2022

fbrennen commented Jul 20, 2022

markme123 commented Jul 20, 2022

-- encoding: utf-8 --

markme123 commented Jul 20, 2022

fbrennen commented Jul 20, 2022

markme123 commented Jul 28, 2022

fbrennen commented Jul 28, 2022