Commit

update table sorting
cklunch committed Oct 17, 2024
1 parent 3a46458 commit aefec98
Showing 3 changed files with 100 additions and 61 deletions.
Binary file modified dist/neonutilities-1.0.1-py3-none-any.whl
Binary file modified dist/neonutilities-1.0.1.tar.gz
161 changes: 100 additions & 61 deletions src/neonutilities/unzip_and_stack.py
@@ -412,6 +412,78 @@ def align_sp_cols(sptab):
return sptab


def sort_dat(pdata):
    """
    Small helper function to sort data rows by location and date.

    Parameters
    ----------
    pdata: A pandas table of data records

    Return
    ------
    A pandas table of data records sorted by location and date

    Created on 17 Oct 2024

    @author: Claire Lunch
    """

    # sort the table by site, then HOR/VER, then date, all ascending
    # find the date column to sort on, in order of preference
    pcols = pdata.columns.to_list()
    datevar = None
    if 'collectDate' in pcols:
        datevar = 'collectDate'
    elif 'endDate' in pcols:
        datevar = 'endDate'
    elif 'startDate' in pcols:
        datevar = 'startDate'
    elif 'date' in pcols:
        datevar = 'date'
    elif 'endDateTime' in pcols:
        datevar = 'endDateTime'
    elif 'startDateTime' in pcols:
        datevar = 'startDateTime'

    if 'horizontalPosition' not in pcols:
        try:
            if datevar is None:
                pdata.sort_values(by=['siteID'],
                                  ascending=[True],
                                  inplace=True, ignore_index=True)
            else:
                pdata.sort_values(by=['siteID', datevar],
                                  ascending=[True, True],
                                  inplace=True, ignore_index=True)
        except Exception:
            # fall back to sorting by date alone if siteID is absent
            try:
                pdata.sort_values(by=[datevar],
                                  ascending=[True],
                                  inplace=True, ignore_index=True)
            except Exception:
                pass

    else:
        try:
            if datevar is None:
                pdata.sort_values(by=['siteID', 'horizontalPosition', 'verticalPosition'],
                                  ascending=[True, True, True],
                                  inplace=True, ignore_index=True)
            else:
                pdata.sort_values(by=['siteID', 'horizontalPosition', 'verticalPosition', datevar],
                                  ascending=[True, True, True, True],
                                  inplace=True, ignore_index=True)
        except Exception:
            pass

    return pdata
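
A minimal usage sketch of the new helper, with sort_dat defined as above; the toy values below are hypothetical, not NEON data:

import pandas as pd

toy = pd.DataFrame({
    "siteID": ["HARV", "BART", "HARV"],
    "horizontalPosition": ["001", "000", "000"],
    "verticalPosition": ["010", "010", "010"],
    "endDateTime": ["2024-06-02", "2024-06-01", "2024-06-01"],
})
toy = sort_dat(toy)
print(toy["siteID"].tolist())  # ['BART', 'HARV', 'HARV']: site, then HOR/VER, then date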


def format_readme(readmetab,
tables):
"""
@@ -614,13 +686,23 @@ def stack_data_files_parallel(folder,
readmefiles = glob.glob(os.path.join(folder, '**', '*.txt'), recursive=True)
if any(re.search("readme.20", path) for path in readmefiles):
    readmepath = get_recent_publication([path for path in readmefiles if "readme.20" in path])[0]
    rd = None
    if cloud_mode:
        try:
            rd = readme_url(readmepath)
        except Exception:
            pass
    else:
        try:
            rd = pd.read_table(readmepath, delimiter='\t', header=None)
        except Exception:
            pass
    if rd is None:
        pass
    else:
        rd = format_readme(rd, tables)
        # save the readme
        stacklist[f"readme_{dpnum}"] = rd

# stack tables according to types
if progress:
@@ -724,71 +806,28 @@
if j != "sensor_positions":

locr = re.compile("[.][0-9]{3}[.][0-9]{3}[.][0-9]{3}[.][0-9]{3}[.]")
if locr is None:
pass
indtemp = [locr.search(l) for l in pdat["__filename"]]
if None in indtemp:
pdat = sort_dat(pdat)
else:
indxs = [locr.search(l).group(0) for l in pdat["__filename"]]
indxs = [lt.group(0) for lt in indtemp]
hor = [indx[5:8] for indx in indxs]
ver = [indx[9:12] for indx in indxs]
pdat.insert(2, "horizontalPosition", hor)
pdat.insert(3, "verticalPosition", ver)

# sort the table by site, then HOR/VER, then date, all ascending
pcols = pdat.columns.to_list()
datevar = None
if 'endDateTime' in pcols:
datevar = 'endDateTime'
else:
if 'date' in pcols:
datevar = 'date'
try:
if datevar is None:
pdat.sort_values(by=['siteID', 'horizontalPosition', 'verticalPosition'],
ascending=[True, True, True, True],
inplace=True, ignore_index=True)
else:
pdat.sort_values(by=['siteID', 'horizontalPosition', 'verticalPosition', datevar],
ascending=[True, True, True, True],
inplace=True, ignore_index=True)
except Exception:
pass

# append fields to variables file
if f"variables_{dpnum}" in stacklist.keys():
added_fields_IS = added_fields[0:4]
added_fields_IS.insert(0,"table",j)
vlist[j] = pd.concat([added_fields_IS, vlist[j]], ignore_index=True)
# sort table rows
pdat = sort_dat(pdat)

# append fields to variables file
if f"variables_{dpnum}" in stacklist.keys():
added_fields_IS = added_fields[0:4]
added_fields_IS.insert(0,"table",j)
vlist[j] = pd.concat([added_fields_IS, vlist[j]], ignore_index=True)

else:
# for OS tables, sort by site and date
pcols = pdat.columns.to_list()
datevar = None
if 'collectDate' in pcols:
datevar = 'collectDate'
else:
if 'endDate' in pcols:
datevar = 'endDate'
else:
if 'startDate' in pcols:
datevar = 'startDate'
else:
if 'date' in pcols:
datevar = 'date'
else:
if 'startDateTime' in pcols:
datevar = 'startDateTime'
# sort the table by site then date, all ascending
try:
if datevar is None:
pdat.sort_values(by=['siteID'],
ascending=[True],
inplace=True, ignore_index=True)
else:
pdat.sort_values(by=['siteID', datevar],
ascending=[True,True],
inplace=True, ignore_index=True)
except Exception:
pass
pdat = sort_dat(pdat)
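
To illustrate the location-code extraction above: the regex pulls the four dot-separated three-digit fields from the file name, and slices 5:8 and 9:12 of the match are the horizontal and vertical position codes. The file name below is a made-up example following the NEON sensor-file naming pattern:

import re

locr = re.compile("[.][0-9]{3}[.][0-9]{3}[.][0-9]{3}[.][0-9]{3}[.]")
fname = "NEON.D10.CPER.DP1.00041.001.002.506.030.ST_30_minute.2017-04.basic.20171023T220914Z.csv"
m = locr.search(fname).group(0)
print(m)        # '.001.002.506.030.'
print(m[5:8])   # '002' -> horizontalPosition
print(m[9:12])  # '506' -> verticalPosition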

# for SRF files, remove duplicates and modified records
if j == "science_review_flags":
