Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WARNING:Fast5Filter:xxx reads not found! #73

Open
markme123 opened this issue Jul 18, 2022 · 8 comments
Open

WARNING:Fast5Filter:xxx reads not found! #73

markme123 opened this issue Jul 18, 2022 · 8 comments

Comments

@markme123
Copy link

Fast5_subset fast5 separation pass and FAIL, found that the latest R10 FAST5 has a lot of extraction can not be out .
I checked that there were no fast5 incompletions, and this happened on many R10 versions, but not on R9 .
A third of them didn't come out.

| 3108059 of 5017321|######################## | 61% ETA: 0:24:25

  • 3140641 of 5017321|######################## | 62% ETA: 0:23:55
    | 3178341 of 5017321|######################## | 63% ETA: 0:23:27
  • 3208333 of 5017321|######################## | 63% ETA: 0:23:03
    | 3238381 of 5017321|######################### | 64% ETA: 0:22:36
  • 3273840 of 5017321|######################### | 65% ETA: 0:22:08
    | 3303857 of 5017321|######################### | 65% ETA: 0:21:44
  • 333769
    \ 3364211 of 5017321|########################## | 67% ETA: 0:20:59
    / 3378623 of 5017321|########################## | 67% ETA: 0:20:53
    | 5017321 of 5017321|#######################################|100% Time: 0:43:08
    INFO:Fast5Filter:3377354 reads extracted
    WARNING:Fast5Filter:1638696 reads not found!
@fbrennen
Copy link
Contributor

Hi @markme123 -- thanks for getting in touch. Can you please tell us:

  • The full command you ran.
  • Whether or not there is a particular pattern to the reads that were not found -- are they, for example, already in a fail folder somewhere?

@markme123
Copy link
Author

fast5_subset_bin = "/opt/miniconda3/bin/fast5_subset"
def fast5_subset(save_path, reads_list, bacth_name, fast5_input=args.fast5_input):
sp.run([fast5_subset_bin, "-i", fast5_input, "-s", save_path, "-l", reads_list, "-t", "25", "-r", "-f", bacth_name])

It's hard to tell if it's a problem with some FAST5 files because there are so many files

@fbrennen
Copy link
Contributor

Hi @markme123 -- how are you generating your reads list? Where did you get your input files from? Is there a chance you only have the reads from MinKNOW's "pass" output folder, so the filtering has already been completed?

@markme123
Copy link
Author

`#!/usr/bin/env python

-- encoding: utf-8 --

'''
@file : split_f5.py
@time : 2022/03/29 16:09:25
@author : jiangmian
@Version : 1.0
'''

import argparse
from ont_fast5_api.conversion_tools import fast5_subset as fs
import subprocess as sp
import os
import pandas as pd
from Bio import SeqIO
import glob

parser = argparse.ArgumentParser(description="fast5 split, 本程序调用了seqkit")
parser.add_argument('-i', '--fast5_input', dest="fast5_input", required=True, help="fast5 文件所在")
parser.add_argument('-p', '--flow', dest='flow', required=True, help="芯片上机号")
parser.add_argument('-s', '--save_path', dest="save_path", required=True, help="输出目录")
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--summary', dest='summary', help="输入summary 文件绝对路径,--summary 与 --fastq 择一选择")
group.add_argument('--fastq', dest='fastq', help="输入pass_fastq,fail_fastq路径, 示例为 /data/fastq_pass,/data/fastq_fail")
parser.add_argument('-b', '--barcode', dest='barcode', action='store_true', help="有barcode")
args = parser.parse_args()

fast5_subset_bin = "/opt/miniconda3/bin/fast5_subset"
def fast5_subset(save_path, reads_list, bacth_name, fast5_input=args.fast5_input):
sp.run([fast5_subset_bin, "-i", fast5_input, "-s", save_path, "-l", reads_list, "-t", "25", "-r", "-f", bacth_name])

def reads_list_out(reads_list, out_name):
with open(out_name, "w") as out:
for line in reads_list:
out.write(line + "\n")

path = os.path.realpath(args.save_path)
if bool(1 - os.path.exists(f"{path}/fast5_pass")):
os.mkdir(f"{path}/fast5_pass")
if bool(1 - os.path.exists(f"{path}/fast5_fail")):
os.mkdir(f"{path}/fast5_fail")

if args.barcode:
if args.summary:
data = pd.read_csv(args.summary, sep="\t")
barcode = set(list(data['barcode_arrangement']))
for i in barcode:
failed = data[(data.passes_filtering == 0) & (data.barcode_arrangement == i)]['read_id']
passed = data[(data.passes_filtering > 0) & (data.barcode_arrangement == i)]['read_id']
reads_list_out(failed, f"{path}/fail_reads_id_list")
reads_list_out(passed, f"{path}/pass_reads_id_list")
fast5_subset(f"{path}/fast5_pass/{i}", f"{path}/pass_reads_id_list", f"{i}_{args.flow}pass")
fast5_subset(f"{path}/fast5_fail/{i}", f"{path}/fail_reads_id_list", f"{i}
{args.flow}fail")
print(f"{i} fast5 分离完毕")
elif args.fastq:
pass_fastq = args.fastq.split(",")[0]
fail_fastq = args.fastq.split(",")[1]
barcode = list(map(lambda x : x.split("/")[-1], glob.glob(f"{pass_fastq}/barcode*")))
for i in barcode:
sp.run(["seqkit", "fx2tab", "-i", "-n", f"{pass_fastq}/{i}/*fastq", ">", f"{path}/pass_reads_id_list"], sheel=True)
sp.run(["seqkit", "fx2tab", "-i", "-n", f"{fail_fastq}/{i}/*fastq", ">", f"{path}/fail_reads_id_list"], sheel=True)
fast5_subset(f"{path}/fast5_pass/{i}", f"{path}/pass_reads_id_list", f"{i}
{args.flow}pass")
fast5_subset(f"{path}/fast5_fail/{i}", f"{path}/fail_reads_id_list", f"{i}
{args.flow}_fail")
print(f"{i} fast5 分离完毕")
else:
print("--summary 与 --fastq 择一选择")
else:
if args.summary:
data = pd.read_csv(args.summary, sep="\t")
failed = data[data.passes_filtering == 0]['read_id']
reads_list_out(failed, f"{path}/fail_reads_id_list")
passed = data[data.passes_filtering > 0]['read_id']
reads_list_out(passed, f"{path}/pass_reads_id_list")
fast5_subset(f"{path}/fast5_pass", f"{path}/pass_reads_id_list", f"{args.flow}_pass")
fast5_subset(f"{path}/fast5_fail", f"{path}/fail_reads_id_list", f"{args.flow}_fail")
elif args.fastq:
pass_fastq = args.fastq.split(",")[0]
fail_fastq = args.fastq.split(",")[1]
sp.run(["seqkit", "fx2tab", "-i", "-n", f"{pass_fastq}/*fastq", ">", f"{path}/pass_reads_id_list"], sheel=True)
sp.run(["seqkit", "fx2tab", "-i", "-n", f"{fail_fastq}/*fastq", ">", f"{path}/fail_reads_id_list"], sheel=True)
fast5_subset(f"{path}/fast5_pass", f"{path}/pass_reads_id_list", f"{args.flow}_pass")
fast5_subset(f"{path}/fast5_fail", f"{path}/fail_reads_id_list", f"{args.flow}_fail")
else:
print("--summary 与 --fastq 择一选择")
print("fast5 分离完毕")
`

@markme123
Copy link
Author

The above is all my code. The pass and FAIL parts are extracted separately

@fbrennen
Copy link
Contributor

Hi @markme123 -- how about the other questions? Where did you get your input files from? Is there a chance you only have the reads from MinKNOW's "pass" output folder, so the filtering has already been completed?

@markme123
Copy link
Author

I have confirmed that FAST5 is all

@fbrennen
Copy link
Contributor

Hi @markme123 -- thanks very much for the extra information. I suspect the issue is down to Guppy splitting some of your reads into new ones -- this means that the read_id field in your summary file is not necessary the same as the read_id in your fast5 files. Instead, you need to use the parent_read_id field in the summary file for your call to fast5_subset.

For example, in your code, change these lines:

if args.summary:
data = pd.read_csv(args.summary, sep="\t")
barcode = set(list(data['barcode_arrangement']))
for i in barcode:
failed = data[(data.passes_filtering == 0) & (data.barcode_arrangement == i)]['read_id']
passed = data[(data.passes_filtering > 0) & (data.barcode_arrangement == i)]['read_id']

[...]

if args.summary:
data = pd.read_csv(args.summary, sep="\t")
failed = data[data.passes_filtering == 0]['read_id']
reads_list_out(failed, f"{path}/fail_reads_id_list")
passed = data[data.passes_filtering > 0]['read_id']
reads_list_out(passed, f"{path}/pass_reads_id_list")

To this:

if args.summary:
data = pd.read_csv(args.summary, sep="\t")
barcode = set(list(data['barcode_arrangement']))
for i in barcode:
failed = data[(data.passes_filtering == 0) & (data.barcode_arrangement == i)]['parent_read_id']  # <== changed to parent_read_id
passed = data[(data.passes_filtering > 0) & (data.barcode_arrangement == i)]['parent_read_id']  # <== changed to parent_read_id

[...]

if args.summary:
data = pd.read_csv(args.summary, sep="\t")
failed = data[data.passes_filtering == 0]['parent_read_id']  # <== changed to parent_read_id
reads_list_out(failed, f"{path}/fail_reads_id_list")
passed = data[data.passes_filtering > 0]['parent_read_id']   # <== changed to parent_read_id
reads_list_out(passed, f"{path}/pass_reads_id_list")

Can you try that and see if it works? Note that this method only works with summary files -- it won't work with your seqkit approach and fastqs.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants