Skip to content

Commit

Permalink
Merge pull request #147 from draeger-lab/GwennyGit-patch-3
Browse files Browse the repository at this point in the history
Updated regexes to raw strings
  • Loading branch information
GwennyGit authored Nov 11, 2024
2 parents 9b86f3c + fcd9616 commit 655261a
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 40 deletions.
24 changes: 12 additions & 12 deletions src/refinegems/classes/gapfill.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def _map_to_mnx(biocyc_reacs: pd.DataFrame) -> Union[tuple[pd.DataFrame, pd.Data

# Create list of EC codes in column ec-code_x,
# Join both ec-code columns into one & Create a set of ec-codes
mnx_reacs['ec-code_x'] = mnx_reacs['ec-code_x'].str.split('\s*;\s*')
mnx_reacs['ec-code_x'] = mnx_reacs['ec-code_x'].str.split(r'\s*;\s*')
mnx_reacs['ec-code_x'] = mnx_reacs['ec-code_x'].fillna(
{i: [] for i in mnx_reacs.index}
).map(set).map(list)
Expand Down Expand Up @@ -479,7 +479,7 @@ def _map_ec_to_reac_mnx(unmapped_reacs:pd.DataFrame) -> pd.DataFrame:
mnx_reac_prop = load_a_table_from_database('mnx_reac_prop',False)
# convert table into one EC-number per row
mnx_reac_prop.drop('is_balanced', inplace=True, axis=1)
mnx_reac_prop['ec-code'] = mnx_reac_prop['ec-code'].apply(lambda x: x.split(';') if isinstance(x,str) else None)
mnx_reac_prop['ec-code'] = mnx_reac_prop['ec-code'].apply(lambda x: x.split(r';') if isinstance(x,str) else None)
# exclude entries without EC-number
mnx_reac_prop = mnx_reac_prop.explode('ec-code').dropna(subset='ec-code')
# merge with unmapped reactions
Expand Down Expand Up @@ -507,7 +507,7 @@ def _map_ec_to_reac_bigg(unmapped_reacs:pd.DataFrame) -> pd.DataFrame:
bigg_reacs = load_a_table_from_database('bigg_reactions',False)
bigg_reacs.dropna(subset='EC Number', inplace=True)
bigg_reacs = bigg_reacs[['id','reaction_string','EC Number']].rename({'reaction_string':'equation','EC Number':'ec-code'}, inplace=False, axis=1)
bigg_reacs['ec-code'] = bigg_reacs['ec-code'].apply(lambda x: x.split(',') if isinstance(x,str) else None)
bigg_reacs['ec-code'] = bigg_reacs['ec-code'].apply(lambda x: x.split(r',') if isinstance(x,str) else None)
bigg_reacs = bigg_reacs.explode('ec-code')
# merge with unmapped reactions
bigg_mapping = unmapped_reacs.merge(bigg_reacs, on=['ec-code'], how='left')
Expand Down Expand Up @@ -827,7 +827,7 @@ def add_gene_reac_associations_from_table(self,model:libModel,
for idx,row in reac_table.iterrows():
# check, if G_+ncbiprotein in model
# if yes, add gpr
geneid = 'G_'+row['ncbiprotein'].replace('.','_').replace(':','_')
geneid = 'G_'+row['ncbiprotein'].replace(r'.',r'_').replace(r':',r'_')
for reacid in row['add_to_GPR']:
current_reacid = 'R_'+reacid
if geneid in model_gene_ids:
Expand Down Expand Up @@ -894,10 +894,10 @@ def add_reactions_from_table(self, model:cobra.Model,
if isinstance(refs, dict):
continue
elif refs[0] == "{":
refs = refs.replace("'", "\"")
refs = refs.replace(r"'", r"\"")
refs = json.loads(refs)
else:
refs = refs.split(":")
refs = refs.split(r":")
refs = {refs[0]:refs[1]}
else:
refs = {}
Expand Down Expand Up @@ -1216,7 +1216,7 @@ def get_model_genes(model: libModel) -> pd.DataFrame:

# Step 4: extract locus tag
# -------------------------
genes_not_in_model['locus_tag'] = genes_not_in_model['orgid:locus'].str.split(':').str[1]
genes_not_in_model['locus_tag'] = genes_not_in_model['orgid:locus'].str.split(r':').str[1]

# Step 5: map to EC via KEGG
# --------------------------
Expand Down Expand Up @@ -1358,7 +1358,7 @@ def biocyc_gene_tbl(self, biocyc_gene_tbl_path: str):
inplace=True)

# Turn empty strings into NaNs
self._biocyc_gene_tbl.replace('', np.nan, inplace=True)
self._biocyc_gene_tbl.replace(r'', np.nan, inplace=True)

# Drop only complete empty rows
self._biocyc_gene_tbl.dropna(how='all', inplace=True)
Expand Down Expand Up @@ -1405,11 +1405,11 @@ def biocyc_rxn_tbl(self, biocyc_reacs_tbl_path: str) -> pd.DataFrame:
)

# Turn empty strings into NaNs
self._biocyc_rxn_tbl.replace('', np.nan, inplace=True)
self._biocyc_rxn_tbl.replace(r'', np.nan, inplace=True)

# Set entries in is_spontaneous to booleans &
# specify empty entries in 'is_spontaneous' as False
self._biocyc_rxn_tbl['is_spontaneous'].replace({'T': True, 'F': False}, inplace=True)
self._biocyc_rxn_tbl['is_spontaneous'].replace({r'T': True, r'F': False}, inplace=True)
self._biocyc_rxn_tbl['is_spontaneous'] = self._biocyc_rxn_tbl['is_spontaneous'].fillna(False)

def find_missing_genes(self, model: libModel):
Expand Down Expand Up @@ -1476,7 +1476,7 @@ def find_missing_reactions(self, model: cobra.Model):

# Expand missing genes result table to merge with Biocyc reactions table
missing_genes = pd.DataFrame(
missing_genes['id'].str.split('//').tolist(),
missing_genes['id'].str.split(r'//').tolist(),
index=missing_genes['ncbiprotein']
).stack()
missing_genes = missing_genes.reset_index(
Expand All @@ -1503,7 +1503,7 @@ def find_missing_reactions(self, model: cobra.Model):
)

    # Turn ec-code entries with '//' into lists, remove prefix 'EC-' & get unique ec-codes
self.missing_reactions['ec-code'] = self.missing_reactions['ec-code'].str.replace('EC-', '').str.split('\s*//\s*')
self.missing_reactions['ec-code'] = self.missing_reactions['ec-code'].str.replace(r'EC-', r'').str.split(r'\s*//\s*')
self.missing_reactions['ec-code'] = self.missing_reactions['ec-code'].fillna(
{i: [] for i in self.missing_reactions.index}
).map(set).map(list)
Expand Down
56 changes: 28 additions & 28 deletions src/refinegems/curation/polish.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,15 +180,15 @@ def cv_notes_metab(species_list: list[Species]):
species.setMetaId('meta_' + species.getId())
notes_list = []
elem_used = []
notes_string = species.getNotesString().split('\n')
notes_string = species.getNotesString().split(r'\n')
for elem in notes_string:
for db in metabol_db_dict.keys():
if '<p>' + db in elem:
elem_used.append(elem)
#print(elem.strip()[:-4].split(': ')[1])
#print(elem.strip()[:-4].split(r': ')[1])
fill_in = re.split(r':\s*', elem.strip()[:-4])[1]
if (';') in fill_in and not re.search(r'inchi', db, re.IGNORECASE):
entries = fill_in.split(';')
entries = fill_in.split(r';')
for entry in entries:
if not re.fullmatch(r'^nan$', entry.strip(), re.IGNORECASE):
add_cv_term_metabolites(entry.strip(), db, species)
Expand Down Expand Up @@ -221,16 +221,16 @@ def cv_notes_reac(reaction_list: list[Reaction]):
reaction.setMetaId('meta_' + reaction.getId())
notes_list = []
elem_used = []
notes_string = reaction.getNotesString().split('\n')
notes_string = reaction.getNotesString().split(r'\n')

for elem in notes_string:
for db in reaction_db_dict.keys():
if '<p>' + db in elem:
elem_used.append(elem)
#print(elem.strip()[:-4].split(': ')[1])
#print(elem.strip()[:-4].split(r': ')[1])
fill_in = re.split(r':\s*', elem.strip()[:-4])[1]
if (';') in fill_in:
entries = fill_in.split(';')
entries = fill_in.split(r';')
for entry in entries:
if not re.fullmatch(r'^nan$', entry.strip(), re.IGNORECASE):
add_cv_term_reactions(entry.strip(), db, reaction)
Expand Down Expand Up @@ -590,10 +590,10 @@ def cv_ncbiprotein(gene_list, email, locus2id: pd.DataFrame, protein_fasta: str,

elif (gene.getId() != 'G_spontaneous') and (gene.getId() != 'G_Unknown'): # Has to be omitted as no additional data can be retrieved neither from NCBI nor the CarveMe input file
if 'prot_' in gene.getId():
id_string = gene.getId().split('prot_')[1].split('_') # All NCBI CDS protein FASTA files have the NCBI protein identifier after 'prot_' in the FASTA identifier
id_string = gene.getId().split(r'prot_')[1].split(r'_') # All NCBI CDS protein FASTA files have the NCBI protein identifier after 'prot_' in the FASTA identifier
ncbi_id = id_string[0] # If identifier contains no '_', this is full identifier
else:
id_string = gene.getId().removeprefix('G_').split('_')
id_string = gene.getId().removeprefix('G_').split(r'_')
if 'peg' in id_string:
genes_missing_annotation.append('_'.join(id_string))
continue
Expand Down Expand Up @@ -662,7 +662,7 @@ def add_gp_id_from_gff(locus2id: pd.DataFrame, gene_list: list[GeneProduct]):
locus = gp.getLabel()

if locus in locus2id.index:
add_cv_term_genes(locus2id.loc[locus, 'ProteinID'].split('.')[0], 'REFSEQ', gp)
add_cv_term_genes(locus2id.loc[locus, 'ProteinID'].split(r'.')[0], 'REFSEQ', gp)


def add_gp_ids_from_KEGG(gene_list: list[GeneProduct], kegg_organism_id: str):
Expand All @@ -687,7 +687,7 @@ def add_gp_ids_from_KEGG(gene_list: list[GeneProduct], kegg_organism_id: str):
uniprot_id = mapping_kegg_uniprot[kegg_gene_id]

add_cv_term_genes(kegg_gene_id, 'KEGG', gp)
add_cv_term_genes(uniprot_id.split('up:')[1], 'UNIPROT', gp)
add_cv_term_genes(uniprot_id.split(r'up:')[1], 'UNIPROT', gp)

except KeyError:
no_valid_kegg.append(gp.getLabel())
Expand Down Expand Up @@ -729,7 +729,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
if curie[0]: # Prefix is valid but to have same result for same databases need to do a bit of own parsing
if re.fullmatch(r'^biocyc$', curie[0], re.IGNORECASE): # Check for biocyc to also add metacyc if possible
                    # Always add META if BioCyc sub-database prefixes are missing
curie = curie if curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else [curie[0], f'META:{curie[1]}']
curie = curie if curie[1].split(r':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else [curie[0], f'META:{curie[1]}']

if 'META' in curie[1]:
if is_valid_identifier(*curie): # Get the valid BioCyc identifier & Add to dictionary
Expand All @@ -742,7 +742,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
invalid_curies.append(f'{curie[0]}:{curie[1]}')

# Add the MetaCyc identifier additionally
curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
curie[1] = curie[1].split(r'META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
if re.search(r'^rxn-|-rxn$', curie[1], re.IGNORECASE):
curie[0] = 'metacyc.reaction'
else:
Expand All @@ -765,7 +765,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
elif not curie[0]: # Need to do own parsing if prefix is not valid
# Get CURIEs irrespective of pattern
if '/' in extracted_curie:
extracted_curie = extracted_curie.split('/')
extracted_curie = extracted_curie.split(r'/')

# Check for NaN identifiers
if re.fullmatch(r'^nan$', extracted_curie[0], re.IGNORECASE) or re.fullmatch(r'^nan$', extracted_curie[1], re.IGNORECASE):
Expand All @@ -780,13 +780,13 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
elif re.fullmatch(r'^inchikey$', extracted_curie[0], re.IGNORECASE):
curie = (extracted_curie[0].lower(), extracted_curie[1])
else:
wrong_prefix = extracted_curie[0].split(':')
wrong_prefix = extracted_curie[0].split(r':')
curie = (wrong_prefix[0], f'{wrong_prefix[1]}/{"/".join(extracted_curie[1:len(extracted_curie)])}')
elif re.fullmatch(r'^brenda$', extracted_curie[0], re.IGNORECASE): # Brenda & EC code is the same
curie = ('eccode', extracted_curie[1])
elif re.fullmatch(r'^biocyc$', extracted_curie[0], re.IGNORECASE): # Check for biocyc to also add metacyc if possible
                        # Always add META if BioCyc sub-database prefixes are missing
extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}'
extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(r':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}'
curie = ['biocyc', extracted_curie[1]]

if 'META' in curie[1]:
Expand All @@ -800,7 +800,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
invalid_curies.append(f'{curie[0]}:{curie[1]}')

                            # Add additionally the MetaCyc identifier
curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
curie[1] = curie[1].split(r'META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
if re.search(r'^rxn-|-rxn$', curie[1], re.IGNORECASE):
curie[0] = 'metacyc.reaction'
else:
Expand All @@ -819,10 +819,10 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
                        # Add BioCyc identifier additionally
curie = ['biocyc', f'META:{curie[1]}'] # Metacyc identifier comes after 'META:' in biocyc identifier
elif re.fullmatch(r'^chebi$', extracted_curie[0], re.IGNORECASE):
new_curie = extracted_curie[1].split(':')
new_curie = extracted_curie[1].split(r':')
curie = (new_curie[0].lower(), new_curie[1])
elif re.search(r'^sbo:', extracted_curie[1], re.IGNORECASE): # Checks for old pattern of SBO term URIs ('MIRIAM/sbo/SBO:identifier')
curie = [extracted_curie[0], extracted_curie[1].split(':')[1]]
curie = [extracted_curie[0], extracted_curie[1].split(r':')[1]]
else:
if re.fullmatch(r'^brenda$', extracted_curie[0], re.IGNORECASE) or re.fullmatch(r'^ec-code$', extracted_curie[0], re.IGNORECASE): # Brenda equals EC code, EC code in URI = ec-code
curie[0] = 'eccode'
Expand All @@ -832,7 +832,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
curie[1] = extracted_curie[1]

elif ':' in extracted_curie:
extracted_curie = extracted_curie.split(':')
extracted_curie = extracted_curie.split(r':')

# Check for NaN identifiers
if re.fullmatch(r'^nan$', extracted_curie[0], re.IGNORECASE) or re.fullmatch(r'^nan$', extracted_curie[1], re.IGNORECASE):
Expand All @@ -842,7 +842,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
continue
elif re.fullmatch(r'^biocyc$', extracted_curie[0], re.IGNORECASE): # Check for biocyc to also add metacyc if possible
                        # Always add META if BioCyc sub-database prefixes are missing
extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}'
extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(r':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}'
curie = ['biocyc', extracted_curie[1]]

if 'META' in curie[1]:
Expand All @@ -856,7 +856,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
invalid_curies.append(f'{curie[0]}:{curie[1]}')

# Add MetaCyc identifier additionally
curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
curie[1] = curie[1].split(r'META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
if re.search(r'^rxn-|-rxn$', curie[1], re.IGNORECASE):
curie[0] = 'metacyc.reaction'
else:
Expand Down Expand Up @@ -891,14 +891,14 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st

if curie[0] == 'eccode':
correct_id = curie[1] # EC number needs to have 4 places if splitted at the dots
while len(correct_id.split('.')) < 4:
while len(correct_id.split(r'.')) < 4:
correct_id = f'{correct_id}.-'
prefix, identifier = normalize_parsed_curie(curie[0], correct_id)
# Add too long EC codes back in model BUT report as invalid CURIEs!
if (len(correct_id.split('.')) > 4): invalid_curies.append(f'{prefix}:{identifier}')
if (len(correct_id.split(r'.')) > 4): invalid_curies.append(f'{prefix}:{identifier}')
# Rhea identifier should only contain 5 numbers but added by CarveMe the Rhea identifier contains '#1'
elif (curie[0] == 'rhea') and ('#' in curie[1]):
prefix, identifier = normalize_parsed_curie(curie[0], curie[1].split('#')[0])
prefix, identifier = normalize_parsed_curie(curie[0], curie[1].split(r'#')[0])
else:
invalid_curies.append(f'{curie[0]}:{curie[1]}')

Expand Down Expand Up @@ -1159,7 +1159,7 @@ def polish_annotations(model: libModel, bioregistry: bool, new_pattern: bool, fi
f'These invalid CURIEs are saved to {curies_filename}')
invalid_curies_df = parse_dict_to_dataframe(all_entity2invalid_curies)
invalid_curies_df.columns = ['entity', 'invalid_curie']
        invalid_curies_df[['prefix', 'identifier']] = invalid_curies_df.invalid_curie.str.split(r':', n=1, expand = True) # Required for identifiers that also contain a ':'
invalid_curies_df[['prefix', 'identifier']] = invalid_curies_df.invalid_curie.str.split(r':', n=1, expand = True) # Required for identifiers that aso contain a ':'
invalid_curies_df = invalid_curies_df.drop('invalid_curie', axis=1)
invalid_curies_df.to_csv(curies_filename, sep='\t')

Expand Down Expand Up @@ -1442,9 +1442,9 @@ def check_direction(model:cobra.Model,data:Union[pd.DataFrame,str]) -> cobra.Mod
# load from a table
data = pd.read_csv(data, sep='\t')
# rewrite the columns into a better comparable/searchable format
data['KEGG reaction'] = data['KEGG reaction'].str.extract('.*>(R\d*)<.*')
data['METANETX'] = data['METANETX'].str.extract('.*>(MNXR\d*)<.*')
data['EC-Number'] = data['EC-Number'].str.extract('EC-(.*)')
data['KEGG reaction'] = data['KEGG reaction'].str.extract(r'.*>(R\d*)<.*')
data['METANETX'] = data['METANETX'].str.extract(r'.*>(MNXR\d*)<.*')
data['EC-Number'] = data['EC-Number'].str.extract(r'EC-(.*)')
case _:
mes = f'Unknown data type for parameter data: {type(data)}'
raise TypeError(mes)
Expand Down

0 comments on commit 655261a

Please sign in to comment.