Skip to content

Commit

Permalink
schema for MCT
Browse files Browse the repository at this point in the history
  • Loading branch information
cklunch committed Oct 29, 2024
1 parent 69daf58 commit a076f61
Show file tree
Hide file tree
Showing 5 changed files with 42 additions and 12 deletions.
Binary file modified dist/neonutilities-1.0.1-py3-none-any.whl
Binary file not shown.
Binary file modified dist/neonutilities-1.0.1.tar.gz
Binary file not shown.
14 changes: 14 additions & 0 deletions src/neonutilities/__resources__/frame_file_variables.csv
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,20 @@ MCC,specificEpithet,The specific epithet (second part of the species name) of th
MCC,scientificName,"Scientific name, associated with the taxonID. This is the name of the lowest level taxonomic rank that can be determined",string,NA,expanded,asIs
MCC,individualCount,Number of individuals of the same type,integer,NA,expanded,integer
MCC,fileName,"Name of file, including file extension",string,NA,expanded,asIs
MCT,dnaSampleID,Identifier for DNA sample,string,NA,expanded,asIs
MCT,dnaSampleCode,Barcode of a DNA sample,string,NA,expanded,asIs
MCT,sequenceName,Name associated with the sequence,string,NA,expanded,asIs
MCT,taxonSequence,Sequence associated with the taxon,string,NA,expanded,asIs
MCT,domain,The scientific name of the domain in which the taxon is classified,string,NA,expanded,asIs
MCT,kingdom,The scientific name of the kingdom in which the taxon is classified,string,NA,expanded,asIs
MCT,phylum,The scientific name of the phylum or division in which the taxon is classified,string,NA,expanded,asIs
MCT,class,The scientific name of the class in which the taxon is classified,string,NA,expanded,asIs
MCT,order,The scientific name of the order in which the taxon is classified,string,NA,expanded,asIs
MCT,family,The scientific name of the family in which the taxon is classified,string,NA,expanded,asIs
MCT,genus,The scientific name of the genus in which the organism is classified,string,NA,expanded,asIs
MCT,specificEpithet,The specific epithet (second part of the species name) of the scientific name applied to the taxon,string,NA,expanded,asIs
MCT,scientificName,"Scientific name, associated with the taxonID. This is the name of the lowest level taxonomic rank that can be determined",string,NA,expanded,asIs
MCT,individualCount,Number of individuals of the same type,integer,NA,expanded,integer
REA,hoboSampleID,Unique identifier for the HOBO conductivity logger file,string,NA,expanded,asIs
REA,hoboSampleCode,Barcode of the HOBO conductivity logger file,string,NA,expanded,asIs
REA,measurementNumber,The number of the measurement in a time series,integer,NA,expanded,integer
Expand Down
2 changes: 2 additions & 0 deletions src/neonutilities/tabular_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,8 @@ def zips_by_product(dpid, site="all", startdate=None, enddate=None,

if not os.path.exists(outpath):
os.makedirs(outpath)
else:
logging.info("Warning: Download folder already exists. Check carefully for duplicate files.")

if timeindex != "all" or tabl != "all":
for f in durls["flpth"]:
Expand Down
38 changes: 26 additions & 12 deletions src/neonutilities/unzip_and_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,8 @@ def stack_frame_files(framefiles, dpid,
#v = pd.concat([v, frame_file_variables], ignore_index=True)

fdict = {"DP1.30012.001":"FSP", "DP1.10081.001":"MCC", "DP1.20086.001":"MCC",
"DP1.20141.001":"MCC", "DP1.20190.001":"REA", "DP1.20193.001":"REA"}
"DP1.20141.001":"MCC", "DP1.20190.001":"REA", "DP1.20193.001":"REA",
"DP1.10081.002":"MCT", "DP1.20086.002":"MCT", "DP1.20141.002":"MCT"}

fvars = pa.Table.from_pandas(frame_file_variables)
ftab = fvars.filter(pa.compute.field("table") == fdict[dpid])
Expand Down Expand Up @@ -551,7 +552,13 @@ def stack_frame_files(framefiles, dpid,
nm = f"mcc_benthicPerSampleTaxonomy_{seqtyp}"
elif dpid=="DP1.20141.001":
nm = f"mcc_surfaceWaterPerSampleTaxonomy_{seqtyp}"

elif dpid=="DP1.10081.002":
nm = f"mct_soilPerSampleTaxonomy_{seqtyp}"
elif dpid=="DP1.20086.002":
nm = f"mct_benthicPerSampleTaxonomy_{seqtyp}"
elif dpid=="DP1.20141.002":
nm = f"mct_surfaceWaterPerSampleTaxonomy_{seqtyp}"

return {"frmdat":fpdat, "frmnm":nm}


Expand Down Expand Up @@ -646,7 +653,8 @@ def stack_data_files_parallel(folder,
stacklist = {}

# handle per-sample (data frame) tables separately
if dpid in ["DP1.30012.001", "DP1.10081.001", "DP1.20086.001","DP1.20141.001", "DP1.20190.001", "DP1.20193.001"] and len([f for f in filenames if not f.startswith("NEON.")]) > 0:
if dpid in ["DP1.30012.001", "DP1.10081.001", "DP1.20086.001","DP1.20141.001", "DP1.20190.001",
"DP1.20193.001", "DP1.10081.002", "DP1.20086.002","DP1.20141.002"] and len([f for f in filenames if not f.startswith("NEON.")]) > 0:
framefiles = [f for f in filepaths if not os.path.basename(f).startswith("NEON.")]
filepaths = [f for f in filepaths if os.path.basename(f).startswith("NEON.")]
filenames = [f for f in filenames if os.path.basename(f).startswith("NEON.")]
Expand All @@ -657,17 +665,20 @@ def stack_data_files_parallel(folder,

# subset microbe community data by marker gene (16S and ITS files)
# and stack each set separately, skipping a set that has no files
if dpid in ["DP1.10081.001", "DP1.20086.001","DP1.20141.001"]:
if dpid in ["DP1.10081.001", "DP1.20086.001","DP1.20141.001",
"DP1.10081.002", "DP1.20086.002","DP1.20141.002"]:
bacteriafiles = [b for b in framefiles if re.search("[_]16S[_]", b)]
fungifiles = [b for b in framefiles if re.search("[_]ITS[_]", b)]

fpdat16 = stack_frame_files(bacteriafiles, dpid=dpid,
seqtyp="16S", cloud_mode=cloud_mode)
fpdatIT = stack_frame_files(fungifiles, dpid=dpid,
seqtyp="ITS", cloud_mode=cloud_mode)

stacklist[fpdat16["frmnm"]] = fpdat16["frmdat"]
stacklist[fpdatIT["frmnm"]] = fpdatIT["frmdat"]
if len(bacteriafiles)>0:
fpdat16 = stack_frame_files(bacteriafiles, dpid=dpid,
seqtyp="16S", cloud_mode=cloud_mode)
stacklist[fpdat16["frmnm"]] = fpdat16["frmdat"]

if len(fungifiles)>0:
fpdatIT = stack_frame_files(fungifiles, dpid=dpid,
seqtyp="ITS", cloud_mode=cloud_mode)
stacklist[fpdatIT["frmnm"]] = fpdatIT["frmdat"]

else:
fpdat = stack_frame_files(framefiles, dpid=dpid, seqtyp=None,
Expand Down Expand Up @@ -923,7 +934,10 @@ def stack_data_files_parallel(folder,
# get issue log table (best effort: if retrieval fails, the table is omitted)
# token is omitted here since it is not otherwise used in the stacking functions
# consider adding a runLocal option, like in R stackEddy()
stacklist[f"issueLog_{dpnum}"] = get_issue_log(dpid=dpid, token=None)
try:
stacklist[f"issueLog_{dpnum}"] = get_issue_log(dpid=dpid, token=None)
except Exception:
pass

# get relevant citation(s)
try:
Expand Down

0 comments on commit a076f61

Please sign in to comment.