-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Expand file tokenisation #53
Merged
kMutagene
merged 17 commits into
nfdi4plants:main
from
LibraChris:expand-file-tokenisation
Mar 2, 2024
Merged
Changes from 1 commit
Commits
Show all changes
17 commits
Select commit
Hold shift + click to select a range
5d22ada
Add new file and directory terms to AFSO
LibraChris d4bed3b
Update Tokenization for Directories
LibraChris 2d30573
Update Tokenization for Files
LibraChris 26757d1
Fix typo in workflows
LibraChris ae4af8f
Rename Tokens
LibraChris d952ecf
Rework special tokenisation
LibraChris 16669aa
Add arc-like structure for testing
LibraChris 43ed75d
Add requested changes
LibraChris 1037fb0
Add parseARCFileSystem to TopLevelParsers
LibraChris a384133
Move arcStructure test files
LibraChris d09ff31
Add .gitkeep to arcStructureTests
LibraChris f3df422
Add Test for ARC Tokenisation
LibraChris 6519f26
Update ArcStructure in ARCTokenizationTests
LibraChris 9c643e1
Update parser Functions
LibraChris 14000b8
Update ISA tests by using the new ARCTest structure
LibraChris 0831fd5
Update TopLevelParsers.fs
LibraChris ddce451
Adress requested changes
LibraChris File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,11 +9,11 @@ module internal ISA = | |
|
||
open System.IO | ||
|
||
let tryParseMetadataSheetFromToken (rootPath:string) (isaCvTerm: CvTerm) (isaMdsParsingF: string -> IParam list) (refFileToken: IParam) = | ||
let tryParseMetadataSheetFromToken (rootPath:string) (isaCvTerm: CvTerm) (isaMdsParsingF: string -> IParam list) (relFileToken: IParam) = | ||
|
||
let cvpStr = Param.getValueAsString refFileToken | ||
let cvpStr = Param.getValueAsString relFileToken | ||
let path = Path.Combine(rootPath, cvpStr) | ||
let containsToken = refFileToken|> (fun x -> x.Name = isaCvTerm.Name) | ||
let containsToken = relFileToken|> (fun x -> x.Name = isaCvTerm.Name) | ||
|
||
if containsToken then | ||
try | ||
|
@@ -22,12 +22,12 @@ module internal ISA = | |
None | ||
else None | ||
|
||
let parseMetadataSheetsFromTokens (rootPath:string) (isaCvTerm: CvTerm) (isaMdsParsingF: string -> IParam list) (refFileTokens: #IParam seq) = | ||
refFileTokens | ||
let parseMetadataSheetsFromTokens (rootPath:string) (isaCvTerm: CvTerm) (isaMdsParsingF: string -> IParam list) (relFileTokens: #IParam seq) = | ||
relFileTokens | ||
|> Seq.choose (fun token -> tryParseMetadataSheetFromToken rootPath isaCvTerm isaMdsParsingF token) | ||
|
||
let parseProcessGraphColumnsFromToken (rootPath:string) (refFileToken: IParam) = | ||
let cvpStr = Param.getValueAsString refFileToken | ||
let parseProcessGraphColumnsFromToken (rootPath:string) (relFileToken: IParam) = | ||
let cvpStr = Param.getValueAsString relFileToken | ||
let path = System.IO.Path.Combine(rootPath, cvpStr) | ||
(FsWorkbook.fromXlsxFile path) | ||
.GetWorksheets() | ||
|
@@ -42,16 +42,16 @@ module internal ISA = | |
) | ||
|> Map.ofSeq | ||
|
||
let parseProcessGraphColumnsFromTokens (rootPath:string) (isaCvTerm: CvTerm) (refFileTokens: #IParam seq) = | ||
refFileTokens | ||
|> Seq.choose (fun token -> | ||
match token.Name = isaCvTerm.Name with | ||
| true -> Some (parseProcessGraphColumnsFromToken rootPath token) | ||
| false -> None | ||
let parseProcessGraphColumnsFromTokens (rootPath:string) (isaCvTerm: CvTerm) (relFileTokens: #IParam seq) = | ||
relFileTokens | ||
|> Seq.choose (fun (token :#IParam) -> | ||
match token |> Param.equalsTerm isaCvTerm with | ||
| true -> Some (parseProcessGraphColumnsFromToken rootPath token) | ||
| false -> None | ||
) | ||
|> fun x -> | ||
match Seq.length x with | ||
| 0 -> failwith "No token found" | ||
| 0 -> Seq.empty | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. convoluted way of just returning x :D |
||
| _ -> x | ||
|
||
|
||
|
@@ -103,7 +103,7 @@ type FileSystem = | |
|
||
/// <summary> | ||
/// Returns all files in the given rootPath as a list of CvParams containing the annotated relative file paths. | ||
/// | ||
/// Uses the ARC file system structure to parse the files to tokens. | ||
/// Note that rootPath must be an absolute path ending with a trailing slash. | ||
/// </summary> | ||
/// <param name="rootPath">absolute path ending with a trailing slash</param> | ||
|
@@ -144,10 +144,9 @@ type Investigation = | |
|> List.concat | ||
|
||
/// <summary> | ||
/// Returns a function that returns Some flat IParam list representing the investigation metadata if the given token contains a filepath with the standard investigation file name ("isa.investigation.xlsx") or None otherwise. | ||
/// Returns a function that returns Some flat IParam list representing the investigation metadata if the given token contains an investigation file tied to a filepath with the standard investigation file name ("isa.investigation.xlsx") or None otherwise. | ||
/// </summary> | ||
/// <param name="UseLastSheetOnIncorrectName">Whether or not to try to parse the last sheet as metadata sheet when there is no sheet with the correct name ("isa_investigation") in the workbook</param> | ||
/// <param name="FileName">The name of the investigation file, note that this should not be set if the file follows spec (as "isa.investigation.xlsx" is the default)</param> | ||
static member tryParseMetadataSheetFromToken( | ||
?UseLastSheetOnIncorrectName: bool | ||
) = | ||
|
@@ -162,13 +161,12 @@ type Investigation = | |
|
||
|
||
/// <summary> | ||
/// Returns a function that parses all metadata sheets from all the tokens containing a filepath with the standard investigation file name ("isa.investigation.xlsx") | ||
/// in a given collection of tokens as a 2D list containing the individual Investigation metadata as a flat list of `IParam`s. | ||
/// | ||
/// if no tokens contain such a file path, the result will be an empty list. | ||
/// Returns a function that parses all metadata sheets from all the tokens containing an 'Investigation File' tied to a filepath with the | ||
/// standard investigation file name ("isa.investigation.xlsx") in a given collection of tokens as a 2D list containing the individual | ||
/// Investigation metadata as a flat list of `IParam`s. | ||
/// If no tokens contain such a file path, the result will be an empty list. | ||
/// </summary> | ||
/// <param name="UseLastSheetOnIncorrectName">Whether or not to try to parse the last sheet as metadata sheet when there is no sheet with the correct name ("isa_investigation") in the workbook</param> | ||
/// <param name="FileName">The name of the investigation file, note that this should not be set if the file follows spec (as "isa.investigation.xlsx" is the default)</param> | ||
static member parseMetadataSheetsFromTokens( | ||
?UseLastSheetOnIncorrectName: bool | ||
) = | ||
|
@@ -214,10 +212,9 @@ type Study = | |
|> List.concat | ||
|
||
/// <summary> | ||
/// Returns a function that returns Some flat IParam list representing the study metadata if the given token contains a filepath with the standard study file name ("isa.study.xlsx") or None otherwise. | ||
/// Returns a function that returns Some flat IParam list representing the study metadata if the given token contains a 'Study File' tied to a filepath with the standard study file name ("isa.study.xlsx") or None otherwise. | ||
/// </summary> | ||
/// <param name="UseLastSheetOnIncorrectName">Whether or not to try to parse the last sheet as metadata sheet when there is no sheet with the correct name ("isa_study") in the workbook</param> | ||
/// <param name="FileName">The name of the study file, note that this should not be set if the file follows spec (as "isa.study.xlsx" is the default)</param> | ||
static member tryParseMetadataSheetFromToken( | ||
?UseLastSheetOnIncorrectName: bool | ||
) = | ||
|
@@ -233,13 +230,12 @@ type Study = | |
|
||
|
||
/// <summary> | ||
/// Returns a function that parses all metadata sheets from all the tokens containing a filepath with the standard study file name ("isa.study.xlsx") | ||
/// Returns a function that parses all metadata sheets from all the tokens containing a 'Study File' with the standard study file name ("isa.study.xlsx") | ||
/// in a given collection of tokens as a 2D list containing the individual study metadata as a flat list of `IParam`s. | ||
/// | ||
/// If no tokens contain such a file path, the result will be an empty list. | ||
/// </summary> | ||
/// <param name="UseLastSheetOnIncorrectName">Whether or not to try to parse the last sheet as metadata sheet when there is no sheet with the correct name ("isa_study") in the workbook</param> | ||
/// <param name="FileName">The name of the study file, note that this should not be set if the file follows spec (as "isa.study.xlsx" is the default)</param> | ||
static member parseMetadataSheetsFromTokens( | ||
?UseLastSheetOnIncorrectName: bool | ||
) = | ||
|
@@ -275,27 +271,27 @@ type Study = | |
|> Map.ofSeq | ||
|
||
/// <summary> | ||
/// Returns an annotation tables from an IParam if the given token contains a filepath with the standard study file name ("isa.study.xlsx"). | ||
/// Map of string * `IParam` 2D List representing the individual parts parts of the Process graph, | ||
/// where the string is the name of the worksheet that contained the table, | ||
/// and the 2D lists represent a single table in which the inner 1D lists represent a single column. | ||
/// Returns an annotation table from an IParam if the given token is annotated with the term 'Study File'. | ||
/// Returns a map of string * `IParam` 2D list representing the individual parts of the process graph. | ||
/// The string is the name of the worksheet that contained the table, and the 2D lists represent a single table where the inner 1D lists represent a single column. | ||
/// </summary> | ||
/// <param name="rootPath">ARC root path</param> | ||
/// <param name="refFileToken">IParam of the ARC Tokens</param> | ||
static member parseProcessGraphColumnsFromToken (rootPath:string) (refFileToken: IParam) = | ||
ISA.parseProcessGraphColumnsFromToken rootPath refFileToken | ||
/// <param name="rootPath">The root path of the ARC</param> | ||
/// <param name="relFileToken">IParam that may be a relevant token</param> | ||
/// <returns>A map of string * `IParam` 2D list representing the individual parts of the process graph</returns> | ||
static member parseProcessGraphColumnsFromToken (rootPath:string) (relFileToken: IParam) = | ||
ISA.parseProcessGraphColumnsFromToken rootPath relFileToken | ||
|
||
|
||
/// <summary> | ||
/// Returns a seq of annotation tables from an IParam seq if the given tokens contains a filepath with the standard study file name ("isa.study.xlsx"). | ||
/// Map of string * `IParam` 2D List representing the individual parts parts of the Process graph, | ||
/// where the string is the name of the worksheet that contained the table, | ||
/// and the 2D lists represent a single table in which the inner 1D lists represent a single column. | ||
/// Returns a seq of annotation tables from an IParam seq for each contained token that is annotated with the term 'Study File'. | ||
/// Returns a map of string * `IParam` 2D list representing the individual parts of the process graph. | ||
/// The string is the name of the worksheet that contained the table, and the 2D lists represent a single table where the inner 1D lists represent a single column. | ||
/// </summary> | ||
/// <param name="rootPath">ARC root path</param> | ||
/// <param name="refFileToken">IParam seq of the ARC Tokens</param> | ||
static member parseProcessGraphColumnsFromTokens (rootPath:string) (refFileTokens: #IParam seq) = | ||
ISA.parseProcessGraphColumnsFromTokens rootPath (StructuralOntology.AFSO.``Study File``) refFileTokens | ||
/// <param name="rootPath">The root path of the ARC</param> | ||
/// <param name="relFileTokens">A seq of IParams that may contain relevant tokens</param> | ||
/// <returns>A Seq of maps of string * `IParam` 2D list representing the individual parts of the process graph</returns> | ||
static member parseProcessGraphColumnsFromTokens (rootPath:string) (relFileTokens: #IParam seq) = | ||
ISA.parseProcessGraphColumnsFromTokens rootPath (StructuralOntology.AFSO.``Study File``) relFileTokens | ||
|
||
type Assay = | ||
|
||
|
@@ -329,7 +325,7 @@ type Assay = | |
|> List.concat | ||
|
||
/// <summary> | ||
/// Returns a function that returns Some flat IParam list representing the assay metadata if the given token contains a filepath with the standard assay file name ("isa.assay.xlsx") or None otherwise. | ||
/// Returns a function that returns Some flat IParam list representing the assay metadata if the given token contains an 'Assay File' with the standard assay file name ("isa.assay.xlsx") or None otherwise. | ||
/// </summary> | ||
/// <param name="UseLastSheetOnIncorrectName">Whether or not to try to parse the last sheet as metadata sheet when there is no sheet with the correct name ("isa_assay") in the workbook</param> | ||
static member tryParseMetadataSheetFromToken( | ||
|
@@ -346,7 +342,7 @@ type Assay = | |
|
||
|
||
/// <summary> | ||
/// Returns a function that parses all metadata sheets from all the tokens containing a filepath with the standard assay file name ("isa.assay.xlsx") | ||
/// Returns a function that parses all metadata sheets from all the tokens containing an 'Assay File' with the standard assay file name ("isa.assay.xlsx") | ||
/// in a given collection of tokens as a 2D list containing the individual assay metadata as a flat list of `IParam`s. | ||
/// | ||
/// If no tokens contain such a file path, the result will be an empty list. | ||
|
@@ -387,23 +383,23 @@ type Assay = | |
|> Map.ofSeq | ||
|
||
/// <summary> | ||
/// Returns an annotation tables from an IParam if the given token contains a filepath with the standard assay file name ("isa.assay.xlsx"). | ||
/// Map of string * `IParam` 2D List representing the individual parts parts of the Process graph, | ||
/// where the string is the name of the worksheet that contained the table, | ||
/// and the 2D lists represent a single table in which the inner 1D lists represent a single column. | ||
/// </summary> | ||
/// <param name="rootPath">ARC root path</param> | ||
/// <param name="refFileToken">IParam of the ARC Tokens</param> | ||
static member parseProcessGraphColumnsFromToken (rootPath:string) (refFileToken: IParam) = | ||
ISA.parseProcessGraphColumnsFromToken rootPath refFileToken | ||
|
||
/// <summary> | ||
/// Returns a seq of annotation tables from an IParam seq if the given tokens contains a filepath with the standard assay file name ("isa.assay.xlsx"). | ||
/// Map of string * `IParam` 2D List representing the individual parts parts of the Process graph, | ||
/// where the string is the name of the worksheet that contained the table, | ||
/// and the 2D lists represent a single table in which the inner 1D lists represent a single column. | ||
/// Returns an annotation table from an IParam if the given token is annotated with the term 'Assay File'. | ||
/// Returns a map of string * `IParam` 2D list representing the individual parts of the process graph. | ||
/// The string is the name of the worksheet that contained the table, and the 2D lists represent a single table where the inner 1D lists represent a single column. | ||
/// </summary> | ||
/// <param name="rootPath">ARC root path</param> | ||
/// <param name="refFileToken">IParam seq of the ARC Tokens</param> | ||
static member parseProcessGraphColumnsFromTokens (rootPath:string) (refFileTokens: #IParam seq) = | ||
ISA.parseProcessGraphColumnsFromTokens rootPath (StructuralOntology.AFSO.``Assay File``) refFileTokens | ||
/// <param name="rootPath">The root path of the ARC</param> | ||
/// <param name="relFileToken">IParam that may be a relevant token</param> | ||
/// <returns>A map of string * `IParam` 2D list representing the individual parts of the process graph</returns> | ||
static member parseProcessGraphColumnsFromToken (rootPath:string) (relFileToken: IParam) = | ||
ISA.parseProcessGraphColumnsFromToken rootPath relFileToken | ||
|
||
/// <summary> | ||
/// Returns a seq of annotation tables from an IParam seq for each contained token that is annotated with the term 'Assay File'. | ||
/// Returns a map of string * `IParam` 2D list representing the individual parts of the process graph. | ||
/// The string is the name of the worksheet that contained the table, and the 2D lists represent a single table where the inner 1D lists represent a single column. | ||
/// </summary> | ||
/// <param name="rootPath">The root path of the ARC</param> | ||
/// <param name="relFileTokens">A seq of IParams that may contain relevant tokens</param> | ||
/// <returns>A Seq of maps of string * `IParam` 2D list representing the individual parts of the process graph</returns> | ||
static member parseProcessGraphColumnsFromTokens (rootPath:string) (relFileTokens: #IParam seq) = | ||
ISA.parseProcessGraphColumnsFromTokens rootPath (StructuralOntology.AFSO.``Assay File``) relFileTokens |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
these functions should not fail but just return an empty sequence (maybe additionally printing warnings). Imagine if this is used in a validation package. It would lead to the parsing failing, without any validation being performed.