schema for MCT

NEONScience · Oct 29, 2024 · a076f61
1 parent 69daf58
commit a076f61
Show file tree

Hide file tree

Showing 5 changed files with 42 additions and 12 deletions.
diff --git a/dist/neonutilities-1.0.1-py3-none-any.whl b/dist/neonutilities-1.0.1-py3-none-any.whl
diff --git a/dist/neonutilities-1.0.1.tar.gz b/dist/neonutilities-1.0.1.tar.gz
diff --git a/src/neonutilities/__resources__/frame_file_variables.csv b/src/neonutilities/__resources__/frame_file_variables.csv
@@ -15,6 +15,20 @@ MCC,specificEpithet,The specific epithet (second part of the species name) of th
 MCC,scientificName,"Scientific name, associated with the taxonID. This is the name of the lowest level taxonomic rank that can be determined",string,NA,expanded,asIs
 MCC,individualCount,Number of individuals of the same type,integer,NA,expanded,integer
 MCC,fileName,"Name of file, including file extension",string,NA,expanded,asIs
+MCT,dnaSampleID,Identifier for DNA sample,string,NA,expanded,asIs
+MCT,dnaSampleCode,Barcode of a DNA sample,string,NA,expanded,asIs
+MCT,sequenceName,Name associated with the sequence,string,NA,expanded,asIs
+MCT,taxonSequence,Sequence associated with the taxon,string,NA,expanded,asIs
+MCT,domain,The scientific name of the domain in which the taxon is classified,string,NA,expanded,asIs
+MCT,kingdom,The scientific name of the kingdom in which the taxon is classified,string,NA,expanded,asIs
+MCT,phylum,The scientific name of the phylum or division in which the taxon is classified,string,NA,expanded,asIs
+MCT,class,The scientific name of the class in which the taxon is classified,string,NA,expanded,asIs
+MCT,order,The scientific name of the order in which the taxon is classified,string,NA,expanded,asIs
+MCT,family,The scientific name of the family in which the taxon is classified,string,NA,expanded,asIs
+MCT,genus,The scientific name of the genus in which the organism is classified,string,NA,expanded,asIs
+MCT,specificEpithet,The specific epithet (second part of the species name) of the scientific name applied to the taxon,string,NA,expanded,asIs
+MCT,scientificName,"Scientific name, associated with the taxonID. This is the name of the lowest level taxonomic rank that can be determined",string,NA,expanded,asIs
+MCT,individualCount,Number of individuals of the same type,integer,NA,expanded,integer
 REA,hoboSampleID,Unique identifier for the HOBO conductivity logger file,string,NA,expanded,asIs
 REA,hoboSampleCode,Barcode of the HOBO conductivity logger file,string,NA,expanded,asIs
 REA,measurementNumber,The number of the measurement in a time series,integer,NA,expanded,integer

diff --git a/src/neonutilities/tabular_download.py b/src/neonutilities/tabular_download.py
@@ -416,6 +416,8 @@ def zips_by_product(dpid, site="all", startdate=None, enddate=None,
 
         if not os.path.exists(outpath):
             os.makedirs(outpath)
+        else:
+            logging.info("Warning: Download folder already exists. Check carefully for duplicate files.")
 
         if timeindex != "all" or tabl != "all":
             for f in durls["flpth"]:

diff --git a/src/neonutilities/unzip_and_stack.py b/src/neonutilities/unzip_and_stack.py
@@ -516,7 +516,8 @@ def stack_frame_files(framefiles, dpid,
     #v = pd.concat([v, frame_file_variables], ignore_index=True)
 
     fdict = {"DP1.30012.001":"FSP", "DP1.10081.001":"MCC", "DP1.20086.001":"MCC", 
-             "DP1.20141.001":"MCC", "DP1.20190.001":"REA", "DP1.20193.001":"REA"}
+             "DP1.20141.001":"MCC", "DP1.20190.001":"REA", "DP1.20193.001":"REA",
+             "DP1.10081.002":"MCT", "DP1.20086.002":"MCT", "DP1.20141.002":"MCT"}
 
     fvars = pa.Table.from_pandas(frame_file_variables)
     ftab = fvars.filter(pa.compute.field("table") == fdict[dpid])
@@ -551,7 +552,13 @@ def stack_frame_files(framefiles, dpid,
         nm = f"mcc_benthicPerSampleTaxonomy_{seqtyp}"
     elif dpid=="DP1.20141.001":
         nm = f"mcc_surfaceWaterPerSampleTaxonomy_{seqtyp}"
-
+    elif dpid=="DP1.10081.002":
+        nm = f"mct_soilPerSampleTaxonomy_{seqtyp}"
+    elif dpid=="DP1.20086.002":
+        nm = f"mct_benthicPerSampleTaxonomy_{seqtyp}"
+    elif dpid=="DP1.20141.002":
+        nm = f"mct_surfaceWaterPerSampleTaxonomy_{seqtyp}"
+
     return {"frmdat":fpdat, "frmnm":nm}
 
 
@@ -646,7 +653,8 @@ def stack_data_files_parallel(folder,
     stacklist = {}
 
     # handle per-sample (data frame) tables separately
-    if dpid in ["DP1.30012.001", "DP1.10081.001", "DP1.20086.001","DP1.20141.001", "DP1.20190.001", "DP1.20193.001"] and len([f for f in filenames if not f.startswith("NEON.")]) > 0:
+    if dpid in ["DP1.30012.001", "DP1.10081.001", "DP1.20086.001","DP1.20141.001", "DP1.20190.001", 
+                "DP1.20193.001", "DP1.10081.002", "DP1.20086.002","DP1.20141.002"] and len([f for f in filenames if not f.startswith("NEON.")]) > 0:
         framefiles = [f for f in filepaths if not os.path.basename(f).startswith("NEON.")]
         filepaths = [f for f in filepaths if os.path.basename(f).startswith("NEON.")]
         filenames = [f for f in filenames if os.path.basename(f).startswith("NEON.")]
@@ -657,17 +665,20 @@ def stack_data_files_parallel(folder,
 
         # subset microbe community data by taxonomic group
         # and stack both sets
-        if dpid in ["DP1.10081.001", "DP1.20086.001","DP1.20141.001"]:
+        if dpid in ["DP1.10081.001", "DP1.20086.001","DP1.20141.001",
+                    "DP1.10081.002", "DP1.20086.002","DP1.20141.002"]:
             bacteriafiles = [b for b in framefiles if re.search("[_]16S[_]", b)]
             fungifiles = [b for b in framefiles if re.search("[_]ITS[_]", b)]
 
-            fpdat16 = stack_frame_files(bacteriafiles, dpid=dpid,
-                                        seqtyp="16S", cloud_mode=cloud_mode)
-            fpdatIT = stack_frame_files(fungifiles, dpid=dpid,
-                                        seqtyp="ITS", cloud_mode=cloud_mode)
-
-            stacklist[fpdat16["frmnm"]] = fpdat16["frmdat"]
-            stacklist[fpdatIT["frmnm"]] = fpdatIT["frmdat"]
+            if len(bacteriafiles)>0:
+                fpdat16 = stack_frame_files(bacteriafiles, dpid=dpid,
+                                            seqtyp="16S", cloud_mode=cloud_mode)
+                stacklist[fpdat16["frmnm"]] = fpdat16["frmdat"]
+
+            if len(fungifiles)>0:
+                fpdatIT = stack_frame_files(fungifiles, dpid=dpid,
+                                            seqtyp="ITS", cloud_mode=cloud_mode)
+                stacklist[fpdatIT["frmnm"]] = fpdatIT["frmdat"]
 
         else:
             fpdat = stack_frame_files(framefiles, dpid=dpid, seqtyp=None, 
@@ -923,7 +934,10 @@ def stack_data_files_parallel(folder,
     # get issue log table
     # token omitted here since it's not otherwise used in stacking functions
     # consider a runLocal option, like in R stackEddy()
-    stacklist[f"issueLog_{dpnum}"] = get_issue_log(dpid=dpid, token=None)
+    try:
+        stacklist[f"issueLog_{dpnum}"] = get_issue_log(dpid=dpid, token=None)
+    except Exception:
+        pass
 
     # get relevant citation(s)
     try: