Skip to content

Commit

Permalink
replaced gffpandas with just pandas
Browse files Browse the repository at this point in the history
  • Loading branch information
niekwit committed Sep 9, 2024
1 parent 28435a6 commit 2c52f64
Showing 1 changed file with 16 additions and 7 deletions.
23 changes: 16 additions & 7 deletions workflow/scripts/extend_fragments.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
import subprocess
import pysam
import numpy as np
import gffpandas.gffpandas as gffpd
import pandas as pd
#import gffpandas.gffpandas as gffpd

input_bam = snakemake.input["bam"]
fai = snakemake.input["fai"]
Expand All @@ -45,17 +46,25 @@

# Load the GATC-site GFF as a plain table so that, further down, the start
# positions can be collected per chromosome into NumPy arrays.
logging.info("Reading GATC sites from GFF file...")
# NOTE(review): this listing appears to be a commit diff shown without +/-
# markers; the next line looks like the pre-commit gffpandas call that the
# pd.read_csv call below replaces (it is also kept, commented out) -- confirm.
gatc_gff = gffpd.read_gff3(gatc_gff).df
#gatc_gff = gffpd.read_gff3(gatc_gff).df
# header=None makes pandas label the columns with integers (0, 1, 2, ...)
# instead of taking names from the first row; the file is split on tabs.
# NOTE(review): no `comment=` argument is passed, so lines beginning with "#"
# (e.g. a "##gff-version" header) would be parsed as data rows -- confirm the
# input file contains none.
gatc_gff = pd.read_csv(gatc_gff,
header=None, sep="\t",
low_memory=False)

# Map each chromosome name to a sorted NumPy array of GATC positions.
chr_gatc = {}
# NOTE(review): the two lines below look like the pre-commit loop (label-based
# "seq_id" column from gffpandas); the loop header after them is its
# position-based replacement -- confirm against the commit.
for chr in gatc_gff["seq_id"].unique():
chr_gatc[chr] = gatc_gff[gatc_gff["seq_id"] == chr]
# Column 0 is used as the sequence/chromosome identifier. Note that the loop
# variable `chr` shadows the Python builtin of the same name.
for chr in gatc_gff[0].unique():
# The bare string below is an expression statement, not a docstring.
"""
Store the GATC sites for each chromosome in a dictionary.
"""
# Subset GATC sites for the current chromosome
# NOTE(review): after the str() conversion the filter compares column 0 with a
# string; if pandas inferred a numeric dtype for that column (purely numeric
# chromosome names) the comparison would match no rows -- verify the dtype.
chr = str(chr)
tmp = gatc_gff[gatc_gff[0] == chr]

# Only keep the start position
chr_gatc[chr] = chr_gatc[chr]["start"].values + 2 # Correct for DpnI cut site
# Obtain start sites of GATC sites (4th column)
tmp = tmp[3].values + 2 # Correct for DpnI cut site

# Sort the positions and keep them as a NumPy array for speed
chr_gatc[chr] = np.sort(chr_gatc[chr])
chr_gatc[chr] = np.sort(tmp)

# Read the first column of the fai file into a 0-based index,
# which converts a target ID to a reference name
Expand Down

0 comments on commit 2c52f64

Please sign in to comment.