Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expand file tokenisation #53

Merged
merged 17 commits into from
Mar 2, 2024
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 59 additions & 63 deletions src/ARCTokenization/TopLevelParsers.fs
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ module internal ISA =

open System.IO

let tryParseMetadataSheetFromToken (rootPath:string) (isaCvTerm: CvTerm) (isaMdsParsingF: string -> IParam list) (refFileToken: IParam) =
let tryParseMetadataSheetFromToken (rootPath:string) (isaCvTerm: CvTerm) (isaMdsParsingF: string -> IParam list) (relFileToken: IParam) =

let cvpStr = Param.getValueAsString refFileToken
let cvpStr = Param.getValueAsString relFileToken
let path = Path.Combine(rootPath, cvpStr)
let containsToken = refFileToken|> (fun x -> x.Name = isaCvTerm.Name)
let containsToken = relFileToken|> (fun x -> x.Name = isaCvTerm.Name)

if containsToken then
try
Expand All @@ -22,12 +22,12 @@ module internal ISA =
None
else None

let parseMetadataSheetsFromTokens (rootPath:string) (isaCvTerm: CvTerm) (isaMdsParsingF: string -> IParam list) (refFileTokens: #IParam seq) =
refFileTokens
let parseMetadataSheetsFromTokens (rootPath:string) (isaCvTerm: CvTerm) (isaMdsParsingF: string -> IParam list) (relFileTokens: #IParam seq) =
relFileTokens
|> Seq.choose (fun token -> tryParseMetadataSheetFromToken rootPath isaCvTerm isaMdsParsingF token)

let parseProcessGraphColumnsFromToken (rootPath:string) (refFileToken: IParam) =
let cvpStr = Param.getValueAsString refFileToken
let parseProcessGraphColumnsFromToken (rootPath:string) (relFileToken: IParam) =
let cvpStr = Param.getValueAsString relFileToken
let path = System.IO.Path.Combine(rootPath, cvpStr)
(FsWorkbook.fromXlsxFile path)
.GetWorksheets()
Expand All @@ -42,16 +42,16 @@ module internal ISA =
)
|> Map.ofSeq

let parseProcessGraphColumnsFromTokens (rootPath:string) (isaCvTerm: CvTerm) (refFileTokens: #IParam seq) =
refFileTokens
|> Seq.choose (fun token ->
match token.Name = isaCvTerm.Name with
| true -> Some (parseProcessGraphColumnsFromToken rootPath token)
| false -> None
let parseProcessGraphColumnsFromTokens (rootPath:string) (isaCvTerm: CvTerm) (relFileTokens: #IParam seq) =
relFileTokens
|> Seq.choose (fun (token :#IParam) ->
match token |> Param.equalsTerm isaCvTerm with
| true -> Some (parseProcessGraphColumnsFromToken rootPath token)
| false -> None
)
|> fun x ->
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these functions should not fail but just return an empty sequence (maybe additionally printing warnings). Imagine if this is used in a validation package. It would lead to the parsing failing, without any validation being performed.

match Seq.length x with
| 0 -> failwith "No token found"
| 0 -> Seq.empty
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

convoluted way of just returning x :D

| _ -> x


Expand Down Expand Up @@ -103,7 +103,7 @@ type FileSystem =

/// <summary>
/// Returns all files in the given rootPath as a list of CvParams containing the annotated relative file paths.
///
/// Uses the ARC file system structure to parse the files to tokens.
/// Note that rootPath must be an absolute path ending with a trailing slash.
/// </summary>
/// <param name="rootPath">absolute path ending with a trailing slash</param>
Expand Down Expand Up @@ -144,10 +144,9 @@ type Investigation =
|> List.concat

/// <summary>
/// Returns a function that returns Some flat IParam list representing the investigation metadata if the given token contains a filepath with the standard investigation file name ("isa.investigation.xlsx") or None otherwise.
/// Returns a function that returns Some flat IParam list representing the investigation metadata if the given token contains an investigation file tied to an filepath with the standard investigation file name ("isa.investigation.xlsx") or None otherwise.
/// </summary>
/// <param name="UseLastSheetOnIncorrectName">Wether or not to try parse the last sheet as metadata sheet when there is no sheet with the correct name ("isa_investigation") in the workbook</param>
/// <param name="FileName">The name of the investigation file, note that this should not be set if the file follows spec (as "isa.investigation.xlsx" is the default)</param>
static member tryParseMetadataSheetFromToken(
?UseLastSheetOnIncorrectName: bool
) =
Expand All @@ -162,13 +161,12 @@ type Investigation =


/// <summary>
/// Returns a function that parses all metadata sheets from all the tokens containing a filepath with the standard investigation file name ("isa.investigation.xlsx")
/// in a given collection of tokens as a 2D list containing the individual Investigation metadata as a flat list of `IParam`s.
///
/// if no tokens contain such a file path, the result will be an empty list.
/// Returns a function that parses all metadata sheets from all the tokens containing an `Investigation File' tied to an filepath with the
/// standard investigation file name ("isa.investigation.xlsx") in a given collection of tokens as a 2D list containing the individual
/// Investigation metadata as a flat list of `IParam`s.
/// If no tokens contain such a file path, the result will be an empty list.
/// </summary>
/// <param name="UseLastSheetOnIncorrectName">Wether or not to try parse the last sheet as metadata sheet when there is no sheet with the correct name ("isa_investigation") in the workbook</param>
/// <param name="FileName">The name of the investigation file, note that this should not be set if the file follows spec (as "isa.investigation.xlsx" is the default)</param>
static member parseMetadataSheetsFromTokens(
?UseLastSheetOnIncorrectName: bool
) =
Expand Down Expand Up @@ -214,10 +212,9 @@ type Study =
|> List.concat

/// <summary>
/// Returns a function that returns Some flat IParam list representing the study metadata if the given token contains a filepath with the standard study file name ("isa.study.xlsx") or None otherwise.
/// Returns a function that returns Some flat IParam list representing the study metadata if the given token contains a 'Study File' tied to an filepath with the the standard study file name ("isa.study.xlsx") or None otherwise.
/// </summary>
/// <param name="UseLastSheetOnIncorrectName">Wether or not to try parse the last sheet as metadata sheet when there is no sheet with the correct name ("isa_study") in the workbook</param>
/// <param name="FileName">The name of the study file, note that this should not be set if the file follows spec (as "isa.study.xlsx" is the default)</param>
static member tryParseMetadataSheetFromToken(
?UseLastSheetOnIncorrectName: bool
) =
Expand All @@ -233,13 +230,12 @@ type Study =


/// <summary>
/// Returns a function that parses all metadata sheets from all the tokens containing a filepath with the standard study file name ("isa.study.xlsx")
/// Returns a function that parses all metadata sheets from all the tokens containing a 'Study File' with the standard study file name ("isa.study.xlsx")
/// in a given collection of tokens as a 2D list containing the individual study metadata as a flat list of `IParam`s.
///
/// if no tokens contain such a file path, the result will be an empty list.
/// </summary>
/// <param name="UseLastSheetOnIncorrectName">Wether or not to try parse the last sheet as metadata sheet when there is no sheet with the correct name ("isa_study") in the workbook</param>
/// <param name="FileName">The name of the study file, note that this should not be set if the file follows spec (as "isa.study.xlsx" is the default)</param>
static member parseMetadataSheetsFromTokens(
?UseLastSheetOnIncorrectName: bool
) =
Expand Down Expand Up @@ -275,27 +271,27 @@ type Study =
|> Map.ofSeq

/// <summary>
/// Returns an annotation tables from an IParam if the given token contains a filepath with the standard study file name ("isa.study.xlsx").
/// Map of string * `IParam` 2D List representing the individual parts parts of the Process graph,
/// where the string is the name of the worksheet that contained the table,
/// and the 2D lists represent a single table in which the inner 1D lists represent a single column.
/// Returns an annotation table from an IParam if the given token is annotated with the term 'Study File'.
/// Returns a map of string * `IParam` 2D list representing the individual parts of the process graph.
/// The string is the name of the worksheet that contained the table, and the 2D lists represent a single table where the inner 1D lists represent a single column.
/// </summary>
/// <param name="rootPath">ARC root path</param>
/// <param name="refFileToken">IParam of the ARC Tokens</param>
static member parseProcessGraphColumnsFromToken (rootPath:string) (refFileToken: IParam) =
ISA.parseProcessGraphColumnsFromToken rootPath refFileToken
/// <param name="rootPath">The root path of the ARC</param>
/// <param name="relFileToken">IParam that may be a relevant token</param>
/// <returns>A map of string * `IParam` 2D list representing the individual parts of the process graph</returns>
static member parseProcessGraphColumnsFromToken (rootPath:string) (relFileToken: IParam) =
ISA.parseProcessGraphColumnsFromToken rootPath relFileToken


/// <summary>
/// Returns a seq of annotation tables from an IParam seq if the given tokens contains a filepath with the standard study file name ("isa.study.xlsx").
/// Map of string * `IParam` 2D List representing the individual parts parts of the Process graph,
/// where the string is the name of the worksheet that contained the table,
/// and the 2D lists represent a single table in which the inner 1D lists represent a single column.
/// Returns a seq of annotation tables from an IParam seq for each contained token that is annotated with the term 'Study File'.
/// Returns a map of string * `IParam` 2D list representing the individual parts of the process graph.
/// The string is the name of the worksheet that contained the table, and the 2D lists represent a single table where the inner 1D lists represent a single column.
/// </summary>
/// <param name="rootPath">ARC root path</param>
/// <param name="refFileToken">IParam seq of the ARC Tokens</param>
static member parseProcessGraphColumnsFromTokens (rootPath:string) (refFileTokens: #IParam seq) =
ISA.parseProcessGraphColumnsFromTokens rootPath (StructuralOntology.AFSO.``Study File``) refFileTokens
/// <param name="rootPath">The root path of the ARC</param>
/// <param name="relFileTokens">A seq of IParams that may contain relevant tokens</param>
/// <returns>A Seq of maps of string * `IParam` 2D list representing the individual parts of the process graph</returns>
static member parseProcessGraphColumnsFromTokens (rootPath:string) (relFileTokens: #IParam seq) =
ISA.parseProcessGraphColumnsFromTokens rootPath (StructuralOntology.AFSO.``Study File``) relFileTokens

type Assay =

Expand Down Expand Up @@ -329,7 +325,7 @@ type Assay =
|> List.concat

/// <summary>
/// Returns a function that returns Some flat IParam list representing the assay metadata if the given token contains a filepath with the standard assay file name ("isa.assay.xlsx") or None otherwise.
/// Returns a function that returns Some flat IParam list representing the assay metadata if the given token contains an 'Assay File' with the standard assay file name ("isa.assay.xlsx") or None otherwise.
/// </summary>
/// <param name="UseLastSheetOnIncorrectName">Wether or not to try parse the last sheet as metadata sheet when there is no sheet with the correct name ("isa_assay") in the workbook</param>
static member tryParseMetadataSheetFromToken(
Expand All @@ -346,7 +342,7 @@ type Assay =


/// <summary>
/// Returns a function that parses all metadata sheets from all the tokens containing a filepath with the standard assay file name ("isa.assay.xlsx")
/// Returns a function that parses all metadata sheets from all the tokens containing an 'Assay File' with the standard assay file name ("isa.assay.xlsx")
/// in a given collection of tokens as a 2D list containing the individual assay metadata as a flat list of `IParam`s.
///
/// if no tokens contain such a file path, the result will be an empty list.
Expand Down Expand Up @@ -387,23 +383,23 @@ type Assay =
|> Map.ofSeq

/// <summary>
/// Returns an annotation tables from an IParam if the given token contains a filepath with the standard assay file name ("isa.assay.xlsx").
/// Map of string * `IParam` 2D List representing the individual parts parts of the Process graph,
/// where the string is the name of the worksheet that contained the table,
/// and the 2D lists represent a single table in which the inner 1D lists represent a single column.
/// </summary>
/// <param name="rootPath">ARC root path</param>
/// <param name="refFileToken">IParam of the ARC Tokens</param>
static member parseProcessGraphColumnsFromToken (rootPath:string) (refFileToken: IParam) =
ISA.parseProcessGraphColumnsFromToken rootPath refFileToken

/// <summary>
/// Returns a seq of annotation tables from an IParam seq if the given tokens contains a filepath with the standard assay file name ("isa.assay.xlsx").
/// Map of string * `IParam` 2D List representing the individual parts parts of the Process graph,
/// where the string is the name of the worksheet that contained the table,
/// and the 2D lists represent a single table in which the inner 1D lists represent a single column.
/// Returns an annotation table from an IParam if the given token is annotated with the term 'Assay File'.
/// Returns a map of string * `IParam` 2D list representing the individual parts of the process graph.
/// The string is the name of the worksheet that contained the table, and the 2D lists represent a single table where the inner 1D lists represent a single column.
/// </summary>
/// <param name="rootPath">ARC root path</param>
/// <param name="refFileToken">IParam seq of the ARC Tokens</param>
static member parseProcessGraphColumnsFromTokens (rootPath:string) (refFileTokens: #IParam seq) =
ISA.parseProcessGraphColumnsFromTokens rootPath (StructuralOntology.AFSO.``Assay File``) refFileTokens
/// <param name="rootPath">The root path of the ARC</param>
/// <param name="relFileToken">IParam that may be a relevant token</param>
/// <returns>A map of string * `IParam` 2D list representing the individual parts of the process graph</returns>
static member parseProcessGraphColumnsFromToken (rootPath:string) (relFileToken: IParam) =
ISA.parseProcessGraphColumnsFromToken rootPath relFileToken

/// <summary>
/// Returns a seq of annotation tables from an IParam seq for each contained token that is annotated with the term 'Assay File'.
/// Returns a map of string * `IParam` 2D list representing the individual parts of the process graph.
/// The string is the name of the worksheet that contained the table, and the 2D lists represent a single table where the inner 1D lists represent a single column.
/// </summary>
/// <param name="rootPath">The root path of the ARC</param>
/// <param name="relFileTokens">A seq of IParams that may contain relevant tokens</param>
/// <returns>A Seq of maps of string * `IParam` 2D list representing the individual parts of the process graph</returns>
static member parseProcessGraphColumnsFromTokens (rootPath:string) (relFileTokens: #IParam seq) =
ISA.parseProcessGraphColumnsFromTokens rootPath (StructuralOntology.AFSO.``Assay File``) relFileTokens
Loading