Skip to content

Commit

Permalink
Merge pull request #147 from draeger-lab/GwennyGit-patch-3
Browse files Browse the repository at this point in the history
Updated regexes to raw strings
  • Loading branch information
GwennyGit authored Nov 11, 2024
2 parents 9b86f3c + fcd9616 commit 655261a
Show file tree
Hide file tree
Showing 2 changed files with 40 additions and 40 deletions.
24 changes: 12 additions & 12 deletions src/refinegems/classes/gapfill.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,7 +259,7 @@ def _map_to_mnx(biocyc_reacs: pd.DataFrame) -> Union[tuple[pd.DataFrame, pd.Data

# Create list of EC codes in column ec-code_x,
# Join both ec-code columns into one & Create a set of ec-codes
mnx_reacs['ec-code_x'] = mnx_reacs['ec-code_x'].str.split('\s*;\s*')
mnx_reacs['ec-code_x'] = mnx_reacs['ec-code_x'].str.split(r'\s*;\s*')
mnx_reacs['ec-code_x'] = mnx_reacs['ec-code_x'].fillna(
{i: [] for i in mnx_reacs.index}
).map(set).map(list)
Expand Down Expand Up @@ -479,7 +479,7 @@ def _map_ec_to_reac_mnx(unmapped_reacs:pd.DataFrame) -> pd.DataFrame:
mnx_reac_prop = load_a_table_from_database('mnx_reac_prop',False)
# convert table into one EC-number per row
mnx_reac_prop.drop('is_balanced', inplace=True, axis=1)
mnx_reac_prop['ec-code'] = mnx_reac_prop['ec-code'].apply(lambda x: x.split(';') if isinstance(x,str) else None)
mnx_reac_prop['ec-code'] = mnx_reac_prop['ec-code'].apply(lambda x: x.split(r';') if isinstance(x,str) else None)
# exclude entries without EC-number
mnx_reac_prop = mnx_reac_prop.explode('ec-code').dropna(subset='ec-code')
# merge with unmapped reactions
Expand Down Expand Up @@ -507,7 +507,7 @@ def _map_ec_to_reac_bigg(unmapped_reacs:pd.DataFrame) -> pd.DataFrame:
bigg_reacs = load_a_table_from_database('bigg_reactions',False)
bigg_reacs.dropna(subset='EC Number', inplace=True)
bigg_reacs = bigg_reacs[['id','reaction_string','EC Number']].rename({'reaction_string':'equation','EC Number':'ec-code'}, inplace=False, axis=1)
bigg_reacs['ec-code'] = bigg_reacs['ec-code'].apply(lambda x: x.split(',') if isinstance(x,str) else None)
bigg_reacs['ec-code'] = bigg_reacs['ec-code'].apply(lambda x: x.split(r',') if isinstance(x,str) else None)
bigg_reacs = bigg_reacs.explode('ec-code')
# merge with unmapped reactions
bigg_mapping = unmapped_reacs.merge(bigg_reacs, on=['ec-code'], how='left')
Expand Down Expand Up @@ -827,7 +827,7 @@ def add_gene_reac_associations_from_table(self,model:libModel,
for idx,row in reac_table.iterrows():
# check, if G_+ncbiprotein in model
# if yes, add gpr
geneid = 'G_'+row['ncbiprotein'].replace('.','_').replace(':','_')
geneid = 'G_'+row['ncbiprotein'].replace(r'.',r'_').replace(r':',r'_')
for reacid in row['add_to_GPR']:
current_reacid = 'R_'+reacid
if geneid in model_gene_ids:
Expand Down Expand Up @@ -894,10 +894,10 @@ def add_reactions_from_table(self, model:cobra.Model,
if isinstance(refs, dict):
continue
elif refs[0] == "{":
refs = refs.replace("'", "\"")
refs = refs.replace(r"'", r"\"")
refs = json.loads(refs)
else:
refs = refs.split(":")
refs = refs.split(r":")
refs = {refs[0]:refs[1]}
else:
refs = {}
Expand Down Expand Up @@ -1216,7 +1216,7 @@ def get_model_genes(model: libModel) -> pd.DataFrame:

# Step 4: extract locus tag
# -------------------------
genes_not_in_model['locus_tag'] = genes_not_in_model['orgid:locus'].str.split(':').str[1]
genes_not_in_model['locus_tag'] = genes_not_in_model['orgid:locus'].str.split(r':').str[1]

# Step 5: map to EC via KEGG
# --------------------------
Expand Down Expand Up @@ -1358,7 +1358,7 @@ def biocyc_gene_tbl(self, biocyc_gene_tbl_path: str):
inplace=True)

# Turn empty strings into NaNs
self._biocyc_gene_tbl.replace('', np.nan, inplace=True)
self._biocyc_gene_tbl.replace(r'', np.nan, inplace=True)

# Drop only complete empty rows
self._biocyc_gene_tbl.dropna(how='all', inplace=True)
Expand Down Expand Up @@ -1405,11 +1405,11 @@ def biocyc_rxn_tbl(self, biocyc_reacs_tbl_path: str) -> pd.DataFrame:
)

# Turn empty strings into NaNs
self._biocyc_rxn_tbl.replace('', np.nan, inplace=True)
self._biocyc_rxn_tbl.replace(r'', np.nan, inplace=True)

# Set entries in is_spontaneous to booleans &
# specify empty entries in 'is_spontaneous' as False
self._biocyc_rxn_tbl['is_spontaneous'].replace({'T': True, 'F': False}, inplace=True)
self._biocyc_rxn_tbl['is_spontaneous'].replace({r'T': True, r'F': False}, inplace=True)
self._biocyc_rxn_tbl['is_spontaneous'] = self._biocyc_rxn_tbl['is_spontaneous'].fillna(False)

def find_missing_genes(self, model: libModel):
Expand Down Expand Up @@ -1476,7 +1476,7 @@ def find_missing_reactions(self, model: cobra.Model):

# Expand missing genes result table to merge with Biocyc reactions table
missing_genes = pd.DataFrame(
missing_genes['id'].str.split('//').tolist(),
missing_genes['id'].str.split(r'//').tolist(),
index=missing_genes['ncbiprotein']
).stack()
missing_genes = missing_genes.reset_index(
Expand All @@ -1503,7 +1503,7 @@ def find_missing_reactions(self, model: cobra.Model):
)

    # Turn ec-code entries with '//' into lists, remove prefix 'EC-' & get unique ec-codes
self.missing_reactions['ec-code'] = self.missing_reactions['ec-code'].str.replace('EC-', '').str.split('\s*//\s*')
self.missing_reactions['ec-code'] = self.missing_reactions['ec-code'].str.replace(r'EC-', r'').str.split(r'\s*//\s*')
self.missing_reactions['ec-code'] = self.missing_reactions['ec-code'].fillna(
{i: [] for i in self.missing_reactions.index}
).map(set).map(list)
Expand Down
56 changes: 28 additions & 28 deletions src/refinegems/curation/polish.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,15 +180,15 @@ def cv_notes_metab(species_list: list[Species]):
species.setMetaId('meta_' + species.getId())
notes_list = []
elem_used = []
notes_string = species.getNotesString().split('\n')
notes_string = species.getNotesString().split(r'\n')
for elem in notes_string:
for db in metabol_db_dict.keys():
if '<p>' + db in elem:
elem_used.append(elem)
#print(elem.strip()[:-4].split(': ')[1])
#print(elem.strip()[:-4].split(r': ')[1])
fill_in = re.split(r':\s*', elem.strip()[:-4])[1]
if (';') in fill_in and not re.search(r'inchi', db, re.IGNORECASE):
entries = fill_in.split(';')
entries = fill_in.split(r';')
for entry in entries:
if not re.fullmatch(r'^nan$', entry.strip(), re.IGNORECASE):
add_cv_term_metabolites(entry.strip(), db, species)
Expand Down Expand Up @@ -221,16 +221,16 @@ def cv_notes_reac(reaction_list: list[Reaction]):
reaction.setMetaId('meta_' + reaction.getId())
notes_list = []
elem_used = []
notes_string = reaction.getNotesString().split('\n')
notes_string = reaction.getNotesString().split(r'\n')

for elem in notes_string:
for db in reaction_db_dict.keys():
if '<p>' + db in elem:
elem_used.append(elem)
#print(elem.strip()[:-4].split(': ')[1])
#print(elem.strip()[:-4].split(r': ')[1])
fill_in = re.split(r':\s*', elem.strip()[:-4])[1]
if (';') in fill_in:
entries = fill_in.split(';')
entries = fill_in.split(r';')
for entry in entries:
if not re.fullmatch(r'^nan$', entry.strip(), re.IGNORECASE):
add_cv_term_reactions(entry.strip(), db, reaction)
Expand Down Expand Up @@ -590,10 +590,10 @@ def cv_ncbiprotein(gene_list, email, locus2id: pd.DataFrame, protein_fasta: str,

elif (gene.getId() != 'G_spontaneous') and (gene.getId() != 'G_Unknown'): # Has to be omitted as no additional data can be retrieved neither from NCBI nor the CarveMe input file
if 'prot_' in gene.getId():
id_string = gene.getId().split('prot_')[1].split('_') # All NCBI CDS protein FASTA files have the NCBI protein identifier after 'prot_' in the FASTA identifier
id_string = gene.getId().split(r'prot_')[1].split(r'_') # All NCBI CDS protein FASTA files have the NCBI protein identifier after 'prot_' in the FASTA identifier
ncbi_id = id_string[0] # If identifier contains no '_', this is full identifier
else:
id_string = gene.getId().removeprefix('G_').split('_')
id_string = gene.getId().removeprefix('G_').split(r'_')
if 'peg' in id_string:
genes_missing_annotation.append('_'.join(id_string))
continue
Expand Down Expand Up @@ -662,7 +662,7 @@ def add_gp_id_from_gff(locus2id: pd.DataFrame, gene_list: list[GeneProduct]):
locus = gp.getLabel()

if locus in locus2id.index:
add_cv_term_genes(locus2id.loc[locus, 'ProteinID'].split('.')[0], 'REFSEQ', gp)
add_cv_term_genes(locus2id.loc[locus, 'ProteinID'].split(r'.')[0], 'REFSEQ', gp)


def add_gp_ids_from_KEGG(gene_list: list[GeneProduct], kegg_organism_id: str):
Expand All @@ -687,7 +687,7 @@ def add_gp_ids_from_KEGG(gene_list: list[GeneProduct], kegg_organism_id: str):
uniprot_id = mapping_kegg_uniprot[kegg_gene_id]

add_cv_term_genes(kegg_gene_id, 'KEGG', gp)
add_cv_term_genes(uniprot_id.split('up:')[1], 'UNIPROT', gp)
add_cv_term_genes(uniprot_id.split(r'up:')[1], 'UNIPROT', gp)

except KeyError:
no_valid_kegg.append(gp.getLabel())
Expand Down Expand Up @@ -729,7 +729,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
if curie[0]: # Prefix is valid but to have same result for same databases need to do a bit of own parsing
if re.fullmatch(r'^biocyc$', curie[0], re.IGNORECASE): # Check for biocyc to also add metacyc if possible
                    # Always add META if BioCyc sub-database prefixes are missing
curie = curie if curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else [curie[0], f'META:{curie[1]}']
curie = curie if curie[1].split(r':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else [curie[0], f'META:{curie[1]}']

if 'META' in curie[1]:
if is_valid_identifier(*curie): # Get the valid BioCyc identifier & Add to dictionary
Expand All @@ -742,7 +742,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
invalid_curies.append(f'{curie[0]}:{curie[1]}')

# Add the MetaCyc identifier additionally
curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
curie[1] = curie[1].split(r'META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
if re.search(r'^rxn-|-rxn$', curie[1], re.IGNORECASE):
curie[0] = 'metacyc.reaction'
else:
Expand All @@ -765,7 +765,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
elif not curie[0]: # Need to do own parsing if prefix is not valid
# Get CURIEs irrespective of pattern
if '/' in extracted_curie:
extracted_curie = extracted_curie.split('/')
extracted_curie = extracted_curie.split(r'/')

# Check for NaN identifiers
if re.fullmatch(r'^nan$', extracted_curie[0], re.IGNORECASE) or re.fullmatch(r'^nan$', extracted_curie[1], re.IGNORECASE):
Expand All @@ -780,13 +780,13 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
elif re.fullmatch(r'^inchikey$', extracted_curie[0], re.IGNORECASE):
curie = (extracted_curie[0].lower(), extracted_curie[1])
else:
wrong_prefix = extracted_curie[0].split(':')
wrong_prefix = extracted_curie[0].split(r':')
curie = (wrong_prefix[0], f'{wrong_prefix[1]}/{"/".join(extracted_curie[1:len(extracted_curie)])}')
elif re.fullmatch(r'^brenda$', extracted_curie[0], re.IGNORECASE): # Brenda & EC code is the same
curie = ('eccode', extracted_curie[1])
elif re.fullmatch(r'^biocyc$', extracted_curie[0], re.IGNORECASE): # Check for biocyc to also add metacyc if possible
                        # Always add META if BioCyc sub-database prefixes are missing
extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}'
extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(r':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}'
curie = ['biocyc', extracted_curie[1]]

if 'META' in curie[1]:
Expand All @@ -800,7 +800,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
invalid_curies.append(f'{curie[0]}:{curie[1]}')

                            # Add additionally the MetaCyc identifier
curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
curie[1] = curie[1].split(r'META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
if re.search(r'^rxn-|-rxn$', curie[1], re.IGNORECASE):
curie[0] = 'metacyc.reaction'
else:
Expand All @@ -819,10 +819,10 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
                        # Add BioCyc identifier additionally
curie = ['biocyc', f'META:{curie[1]}'] # Metacyc identifier comes after 'META:' in biocyc identifier
elif re.fullmatch(r'^chebi$', extracted_curie[0], re.IGNORECASE):
new_curie = extracted_curie[1].split(':')
new_curie = extracted_curie[1].split(r':')
curie = (new_curie[0].lower(), new_curie[1])
elif re.search(r'^sbo:', extracted_curie[1], re.IGNORECASE): # Checks for old pattern of SBO term URIs ('MIRIAM/sbo/SBO:identifier')
curie = [extracted_curie[0], extracted_curie[1].split(':')[1]]
curie = [extracted_curie[0], extracted_curie[1].split(r':')[1]]
else:
if re.fullmatch(r'^brenda$', extracted_curie[0], re.IGNORECASE) or re.fullmatch(r'^ec-code$', extracted_curie[0], re.IGNORECASE): # Brenda equals EC code, EC code in URI = ec-code
curie[0] = 'eccode'
Expand All @@ -832,7 +832,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
curie[1] = extracted_curie[1]

elif ':' in extracted_curie:
extracted_curie = extracted_curie.split(':')
extracted_curie = extracted_curie.split(r':')

# Check for NaN identifiers
if re.fullmatch(r'^nan$', extracted_curie[0], re.IGNORECASE) or re.fullmatch(r'^nan$', extracted_curie[1], re.IGNORECASE):
Expand All @@ -842,7 +842,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
continue
elif re.fullmatch(r'^biocyc$', extracted_curie[0], re.IGNORECASE): # Check for biocyc to also add metacyc if possible
                        # Always add META if BioCyc sub-database prefixes are missing
extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}'
extracted_curie[1] = extracted_curie[1] if extracted_curie[1].split(r':')[0] in BIOCYC_TIER1_DATABASES_PREFIXES else f'META:{extracted_curie[1]}'
curie = ['biocyc', extracted_curie[1]]

if 'META' in curie[1]:
Expand All @@ -856,7 +856,7 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st
invalid_curies.append(f'{curie[0]}:{curie[1]}')

# Add MetaCyc identifier additionally
curie[1] = curie[1].split('META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
curie[1] = curie[1].split(r'META:')[1] # Metacyc identifier comes after 'META:' in biocyc identifier
if re.search(r'^rxn-|-rxn$', curie[1], re.IGNORECASE):
curie[0] = 'metacyc.reaction'
else:
Expand Down Expand Up @@ -891,14 +891,14 @@ def get_set_of_curies(uri_list: list[str]) -> tuple[SortedDict[str: SortedSet[st

if curie[0] == 'eccode':
correct_id = curie[1] # EC number needs to have 4 places if splitted at the dots
while len(correct_id.split('.')) < 4:
while len(correct_id.split(r'.')) < 4:
correct_id = f'{correct_id}.-'
prefix, identifier = normalize_parsed_curie(curie[0], correct_id)
# Add too long EC codes back in model BUT report as invalid CURIEs!
if (len(correct_id.split('.')) > 4): invalid_curies.append(f'{prefix}:{identifier}')
if (len(correct_id.split(r'.')) > 4): invalid_curies.append(f'{prefix}:{identifier}')
# Rhea identifier should only contain 5 numbers but added by CarveMe the Rhea identifier contains '#1'
elif (curie[0] == 'rhea') and ('#' in curie[1]):
prefix, identifier = normalize_parsed_curie(curie[0], curie[1].split('#')[0])
prefix, identifier = normalize_parsed_curie(curie[0], curie[1].split(r'#')[0])
else:
invalid_curies.append(f'{curie[0]}:{curie[1]}')

Expand Down Expand Up @@ -1159,7 +1159,7 @@ def polish_annotations(model: libModel, bioregistry: bool, new_pattern: bool, fi
f'These invalid CURIEs are saved to {curies_filename}')
invalid_curies_df = parse_dict_to_dataframe(all_entity2invalid_curies)
invalid_curies_df.columns = ['entity', 'invalid_curie']
        invalid_curies_df[['prefix', 'identifier']] = invalid_curies_df.invalid_curie.str.split(r':', n=1, expand = True) # Required for identifiers that also contain a ':'
invalid_curies_df[['prefix', 'identifier']] = invalid_curies_df.invalid_curie.str.split(r':', n=1, expand = True) # Required for identifiers that aso contain a ':'
invalid_curies_df = invalid_curies_df.drop('invalid_curie', axis=1)
invalid_curies_df.to_csv(curies_filename, sep='\t')

Expand Down Expand Up @@ -1442,9 +1442,9 @@ def check_direction(model:cobra.Model,data:Union[pd.DataFrame,str]) -> cobra.Mod
# load from a table
data = pd.read_csv(data, sep='\t')
# rewrite the columns into a better comparable/searchable format
data['KEGG reaction'] = data['KEGG reaction'].str.extract('.*>(R\d*)<.*')
data['METANETX'] = data['METANETX'].str.extract('.*>(MNXR\d*)<.*')
data['EC-Number'] = data['EC-Number'].str.extract('EC-(.*)')
data['KEGG reaction'] = data['KEGG reaction'].str.extract(r'.*>(R\d*)<.*')
data['METANETX'] = data['METANETX'].str.extract(r'.*>(MNXR\d*)<.*')
data['EC-Number'] = data['EC-Number'].str.extract(r'EC-(.*)')
case _:
mes = f'Unknown data type for parameter data: {type(data)}'
raise TypeError(mes)
Expand Down

0 comments on commit 655261a

Please sign in to comment.