Skip to content

Commit

Permalink
sort sets metadata and allow serialized overview
Browse files Browse the repository at this point in the history
  • Loading branch information
LoannPeurey committed Feb 19, 2025
1 parent 3f4065e commit bab64e6
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 74 deletions.
8 changes: 5 additions & 3 deletions ChildProject/annotations.py
Original file line number Diff line number Diff line change
Expand Up @@ -593,20 +593,22 @@ def _read_sets_metadata(self, warning: str = 'ignore'):
else:
raise ValueError(f"warning argument must be in ['log','return','ignore']")

def get_sets_metadata(self, format: str = 'dataframe', delimiter=None, escapechar='"', header=True, human=False,
                      sort_by='set', sort_ascending=True):
    """Return metadata about the annotation sets, with a per-set ``duration`` column.

    The sets metadata read by ``_read_sets_metadata`` is left-merged with the
    summed ``range_offset - range_onset`` of ``self.annotations`` for each set,
    then sorted before being returned in the requested format.

    :param format: output format, one of ``'dataframe'``, ``'lslike'`` or ``'csv'``
    :type format: str
    :param delimiter: field separator; when None, ``' '`` is used for ``'lslike'``
        and ``','`` for ``'csv'``
    :type delimiter: str, optional
    :param escapechar: escape character forwarded to ``DataFrame.to_csv`` (csv format only)
    :type escapechar: str, optional
    :param header: forwarded to the lslike printer / ``to_csv`` to include a header
    :type header: bool
    :param human: forwarded to ``get_printable_sets_metadata`` (lslike format only)
    :type human: bool
    :param sort_by: column name(s) or index level name(s) to sort the table by
    :type sort_by: str or list of str
    :param sort_ascending: sort in ascending order when True, descending otherwise
    :type sort_ascending: bool
    :return: a DataFrame for ``'dataframe'``, a string for ``'lslike'`` and ``'csv'``
    :raises ValueError: if ``format`` is not one of the supported formats
    """
    sets = self._read_sets_metadata()

    # Total annotated range per set: sum of (offset - onset) over its annotations.
    annots = self.annotations.copy().set_index('set')
    durations = (annots['range_offset'] - annots['range_onset']).groupby('set').sum()

    # Left merge keeps every set from the metadata; a set that has no row in
    # self.annotations ends up with a missing duration.
    sets = sets.merge(durations.rename('duration'), how='left', on='set')
    sets = sets.sort_values(sort_by, ascending=sort_ascending)

    if format == 'dataframe':
        return sets
    elif format == 'lslike':
        return self.get_printable_sets_metadata(sets, delimiter if delimiter is not None else " ", header, human)
    elif format == 'csv':
        # The default escapechar is the same character as to_csv's default
        # quotechar ('"'). With doublequote enabled that escapechar has no effect
        # on the output, so it is dropped here.
        # NOTE(review): recent Python csv dialect validation reportedly rejects
        # identical escapechar/quotechar with ValueError - confirm against csv docs.
        csv_escapechar = None if escapechar == '"' else escapechar
        return sets.to_csv(None, index=True, sep=delimiter if delimiter is not None else ',',
                           escapechar=csv_escapechar, header=header)
    else:
        raise ValueError(f"format <{format}> is unknown please use one the documented formats")

Expand Down
165 changes: 96 additions & 69 deletions ChildProject/cmdline.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import sys
import random
import logging
import json

# add this to setup.py in the requires section and in requirements.txt
import colorlog
Expand Down Expand Up @@ -248,6 +249,15 @@ def validate(args):
help="convert units to be more human readable",
action='store_true',
),
arg("--sort-by",
help="sort the table by the given column name(s)",
default="set",
nargs='+',
choices=['set', 'duration'] + [f.name for f in AnnotationManager.SETS_COLUMNS]),
arg("--sort-descending",
help="sort the table descending instead of ascending",
action='store_true',
),
]
)
def sets_metadata(args):
Expand All @@ -265,16 +275,17 @@ def sets_metadata(args):
if len(args.source) > 1:
logger.info(f"\033[1m\033[35m### {project.recordings['experiment'].iloc[0] if project.recordings.shape[0] else source} ({source}) ###\033[0m")

sets = am.get_sets_metadata()
if 'method' not in sets:
sets['method'] = 'Undefined'
else:
sets['method'] = sets['method'].fillna('Undefined').replace('', 'Undefined')
# sets = am.get_sets_metadata()
# if 'method' not in sets:
# sets['method'] = 'Undefined'
# else:
# sets['method'] = sets['method'].fillna('Undefined').replace('', 'Undefined')

if args.format == 'csv':
logger.info(sets.to_csv(None, index=True))
if args.format == 'snapshot':
logger.info(am.get_sets_metadata('lslike', human=args.human_readable))
logger.info(am.get_sets_metadata('csv', sort_by=args.sort_by, sort_ascending=not args.sort_descending))
elif args.format == 'snapshot':
logger.info(am.get_sets_metadata('lslike', human=args.human_readable, sort_by=args.sort_by,
sort_ascending=not args.sort_descending))



Expand Down Expand Up @@ -558,9 +569,20 @@ def rename_annotations(args):
)


@subcommand([arg("source", help="source data path", nargs='+')])
@subcommand(
[
arg("source",
help="source data path",
nargs='+'),
arg("--format",
help="format to output to",
default="snapshot",
choices=['snapshot', 'json']),
]
)
def overview(args):
"""prints an overview of the contents of a given dataset"""
dict = {}
for source in args.source:

try:
Expand All @@ -576,66 +598,71 @@ def overview(args):
logger.error(f"{source}: [%s] %s", type(e).__name__, e)
continue

if len(args.source) > 1:
logger.info(f"\033[1m\033[35m### {project.recordings['experiment'].iloc[0] if project.recordings.shape[0] else source} ({source}) ###\033[0m")

available = project.recordings['recording_filename'].apply(lambda recording_filename: 1
if project.get_recording_path(recording_filename).exists() else 0).sum()

# recordings count and total duration
output = "\n\033[1m{} recordings with {} hours {} locally ({} discarded)\033[0m:\n".format(
record['recordings']['count'], format(record['recordings']["duration"] / 3600000, '.2f') if record['recordings']["duration"] is not None else '?',
available, record['recordings']["discarded"])

output += "\033[94mdate range :\033[0m {} to {}\n".format(
record['recordings']['first_date'], record['recordings']["last_date"])

output += "\033[94mdevices :\033[0m"
for device in record['recordings']['devices']:
available = (project.recordings[project.recordings["recording_device_type"] == device]
)['recording_filename'].apply(lambda recording_filename: 1
if project.get_recording_path(recording_filename).exists() else 0).sum()
info = record['recordings']['devices'][device]
output += " {} ({}h {}/{} locally);".format(
device, format(info['duration'] / 3600000, '.2f') if info['duration'] is not None else '?', available, info['count'])
output += "\n"

output += "\n\033[1m{} participants\033[0m:\n".format(
record['children']['count'],)

# switch to age in years old if age > 2 years old
min_age = "{:.1f}mo".format(record['children']['min_age']) if record['children'][
'min_age'] < 24 else "{:.1f}yo".format(record['children']['min_age'] / 12)
max_age = "{:.1f}mo".format(record['children']['max_age']) if record['children'][
'max_age'] < 24 else "{:.1f}yo".format(record['children']['max_age'] / 12)
output += "\033[94mage range :\033[0m {} to {}\n".format(
min_age, max_age)

if record['children']['M'] is not None:
output += "\033[94msex distribution :\033[0m {}M {}F\n".format(
record['children']['M'], record['children']["F"])

output += "\033[94mlanguages :\033[0m"
for language in record['children']['languages']:
output += " {} {};".format(
language, record['children']['languages'][language])
output += "\n"

if record['children']['monolingual'] is not None:
output += "\033[94mmonolinguality :\033[0m {}mono {}multi\n".format(
record['children']['monolingual'], record['children']["multilingual"])

if record['children']['normative'] is not None:
output += "\033[94mnormativity :\033[0m {}norm {}non-norm\n".format(
record['children']['normative'], record['children']["non-normative"])

output += "\n\033[1mannotations:\033[0m\n"
output += am.get_sets_metadata('lslike', human=True)



logger.info(output)

if args.format == 'json':
record['annotations'] = am.get_sets_metadata('dataframe').to_dict('index')
dict[project.recordings['experiment'].iloc[0] if project.recordings.shape[0] else source] = record

elif args.format == 'snapshot':
if len(args.source) > 1:
logger.info(f"\033[1m\033[35m### {project.recordings['experiment'].iloc[0] if project.recordings.shape[0] else source} ({source}) ###\033[0m")

available = project.recordings['recording_filename'].apply(lambda recording_filename: 1
if project.get_recording_path(recording_filename).exists() else 0).sum()

# recordings count and total duration
output = "\n\033[1m{} recordings with {} hours {} locally ({} discarded)\033[0m:\n".format(
record['recordings']['count'], format(record['recordings']["duration"] / 3600000, '.2f') if record['recordings']["duration"] is not None else '?',
available, record['recordings']["discarded"])

output += "\033[94mdate range :\033[0m {} to {}\n".format(
record['recordings']['first_date'], record['recordings']["last_date"])

output += "\033[94mdevices :\033[0m"
for device in record['recordings']['devices']:
available = (project.recordings[project.recordings["recording_device_type"] == device]
)['recording_filename'].apply(lambda recording_filename: 1
if project.get_recording_path(recording_filename).exists() else 0).sum()
info = record['recordings']['devices'][device]
output += " {} ({}h {}/{} locally);".format(
device, format(info['duration'] / 3600000, '.2f') if info['duration'] is not None else '?', available, info['count'])
output += "\n"

output += "\n\033[1m{} participants\033[0m:\n".format(
record['children']['count'],)

# switch to age in years old if age > 2 years old
min_age = "{:.1f}mo".format(record['children']['min_age']) if record['children'][
'min_age'] < 24 else "{:.1f}yo".format(record['children']['min_age'] / 12)
max_age = "{:.1f}mo".format(record['children']['max_age']) if record['children'][
'max_age'] < 24 else "{:.1f}yo".format(record['children']['max_age'] / 12)
output += "\033[94mage range :\033[0m {} to {}\n".format(
min_age, max_age)

if record['children']['M'] is not None:
output += "\033[94msex distribution :\033[0m {}M {}F\n".format(
record['children']['M'], record['children']["F"])

output += "\033[94mlanguages :\033[0m"
for language in record['children']['languages']:
output += " {} {};".format(
language, record['children']['languages'][language])
output += "\n"

if record['children']['monolingual'] is not None:
output += "\033[94mmonolinguality :\033[0m {}mono {}multi\n".format(
record['children']['monolingual'], record['children']["multilingual"])

if record['children']['normative'] is not None:
output += "\033[94mnormativity :\033[0m {}norm {}non-norm\n".format(
record['children']['normative'], record['children']["non-normative"])

output += "\n\033[1mannotations:\033[0m\n"
output += am.get_sets_metadata('lslike', human=True)

logger.info(output)

if args.format == 'json':
logger.info(json.dumps(dict))

@subcommand(
[arg("source", help="source data path"), arg("variable", help="name of the variable")]
Expand Down
4 changes: 2 additions & 2 deletions ChildProject/projects.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,14 +438,14 @@ def dict_summary(self):
record = {
'recordings': {
'count': self.recordings.shape[0],
'duration': self.recordings['duration'].sum() if 'duration' in self.recordings.columns else None,
'duration': int(self.recordings['duration'].sum()) if 'duration' in self.recordings.columns else None,
'first_date': self.recordings[self.recordings['date_iso'] != 'NA']['date_iso'].min(),
'last_date': self.recordings[self.recordings['date_iso'] != 'NA']['date_iso'].max(),
'discarded': self.discarded_recordings.shape[0],
'devices': {
device: {
'count': self.recordings[self.recordings['recording_device_type'] == device].shape[0],
'duration': self.recordings[self.recordings['recording_device_type'] == device]['duration'].sum() if 'duration' in self.recordings.columns else None,
'duration': int(self.recordings[self.recordings['recording_device_type'] == device]['duration'].sum()) if 'duration' in self.recordings.columns else None,
} for device in self.recordings['recording_device_type'].unique()}
},
'children': {
Expand Down

0 comments on commit bab64e6

Please sign in to comment.