Add file system tokenization
- Add functions for tokenizing absolute and relative directory and file paths
- Add tests
kMutagene committed Oct 26, 2023
1 parent fe1ede8 commit 57de162
Showing 10 changed files with 232 additions and 11 deletions.
5 changes: 5 additions & 0 deletions src/ARCTokenization/ARCTokenization.fsproj
@@ -12,6 +12,10 @@
    <SymbolPackageFormat>snupkg</SymbolPackageFormat>
  </PropertyGroup>

  <ItemGroup>
    <InternalsVisibleTo Include="ARCTokenization.Tests" />
  </ItemGroup>

  <ItemGroup>
    <None Include="RELEASE_NOTES.md" />
    <Compile Include="structural_ontologies\AFSO.fs" />
@@ -34,6 +38,7 @@
    <Compile Include="Tokenization.fs" />
    <Compile Include="Workbook.fs" />
    <Compile Include="Worksheet.fs" />
    <Compile Include="FileSystem.fs" />
    <Compile Include="TopLevelParsers.fs" />
  </ItemGroup>

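Two details in this hunk matter for the rest of the commit: the InternalsVisibleTo entry is what lets the test project exercise the internal FS module directly, and FileSystem.fs is listed before TopLevelParsers.fs because F# compiles files strictly in the order they appear in the project file, and the new top-level FileSystem parsers call into FS.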
54 changes: 54 additions & 0 deletions src/ARCTokenization/FileSystem.fs
@@ -0,0 +1,54 @@
namespace ARCTokenization

open ControlledVocabulary
open FSharpAux
open FsSpreadsheet
open ARCTokenization.Terms
open ARCTokenization.StructuralOntology

open System.IO
open System

module internal FS =

    /// Yields a CvParam annotated as AFSO ``Directory Path`` for every directory below rootPath,
    /// with the path expressed relative to rootPath (which must be absolute and end with a trailing slash).
    let tokenizeRelativeDirectoryPaths (rootPath:string) =
        let root = System.Uri(rootPath)
        seq {
            for dir in Directory.EnumerateDirectories(rootPath, "*", SearchOption.AllDirectories) do
                let currentUri = System.Uri(dir)
                yield CvParam(
                    cvTerm = AFSO.``Directory Path``,
                    v = root.MakeRelativeUri(currentUri).ToString()
                )
        }

    /// Yields a CvParam annotated as AFSO ``Directory Path`` for every directory below rootPath,
    /// with the absolute path normalized to forward slashes.
    let tokenizeAbsoluteDirectoryPaths (rootPath:string) =
        seq {
            for dir in Directory.EnumerateDirectories(rootPath, "*", SearchOption.AllDirectories) do
                yield CvParam(
                    cvTerm = AFSO.``Directory Path``,
                    v = dir.Replace("\\","/")
                )
        }

    /// Yields a CvParam annotated as AFSO ``File Path`` for every file below rootPath,
    /// with the path expressed relative to rootPath (which must be absolute and end with a trailing slash).
    let tokenizeRelativeFilePaths (rootPath:string) =
        let root = System.Uri(rootPath)
        seq {
            for file in Directory.EnumerateFiles(rootPath, "*", SearchOption.AllDirectories) do
                let currentFileUri = System.Uri(file)
                yield CvParam(
                    cvTerm = AFSO.``File Path``,
                    v = root.MakeRelativeUri(currentFileUri).ToString()
                )
        }

    /// Yields a CvParam annotated as AFSO ``File Path`` for every file below rootPath,
    /// with the absolute path normalized to forward slashes.
    let tokenizeAbsoluteFilePaths (rootPath:string) =
        seq {
            for file in Directory.EnumerateFiles(rootPath, "*", SearchOption.AllDirectories) do
                yield CvParam(
                    cvTerm = AFSO.``File Path``,
                    v = file.Replace("\\","/")
                )
        }
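A minimal usage sketch of the internal module, visible to the tests via the InternalsVisibleTo entry above (the root path is a placeholder, not part of the commit). The trailing slash is load-bearing: System.Uri.MakeRelativeUri only relativizes against the directory itself when the base URI ends with a slash; without it, the last path segment is treated like a file name and leaks into every result.

open ARCTokenization

// Placeholder root; must be absolute and end with a trailing slash.
let root = "C:/Users/me/my-arc/"

// Each yielded token is a CvParam annotated with AFSO ``File Path``.
FS.tokenizeRelativeFilePaths root
|> Seq.iter (printfn "%A")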
2 changes: 1 addition & 1 deletion src/ARCTokenization/Tokenization.fs
@@ -7,7 +7,7 @@ open ARCTokenization.Terms

module Tokenization =

    let convertTokens (keyParser: IParam list -> string -> (ParamValue -> IParam)) (line : FsCell seq) =
    let convertMetadataTokens (keyParser: IParam list -> string -> (ParamValue -> IParam)) (line : FsCell seq) =
        match line |> Seq.toList with
        | [] -> failwith "Cannot convert nothing"
        | key :: [] ->
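The rename from convertTokens to convertMetadataTokens distinguishes metadata-sheet tokenization from the file-system tokenization this commit introduces; the call sites in TopLevelParsers.fs and in the tests are updated to match below.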
52 changes: 49 additions & 3 deletions src/ARCTokenization/TopLevelParsers.fs
@@ -5,6 +5,52 @@ open FSharpAux
open FsSpreadsheet
open FsSpreadsheet.ExcelIO

type FileSystem =

    /// <summary>
    /// Returns all directories in the given rootPath as a sequence of CvParams containing the annotated absolute directory paths.
    ///
    /// Note that rootPath must be an absolute path ending with a trailing slash.
    /// </summary>
    /// <param name="rootPath">absolute path ending with a trailing slash</param>
    static member parseAbsoluteDirectoryPaths(
        rootPath:string
    ) =
        FS.tokenizeAbsoluteDirectoryPaths rootPath

    /// <summary>
    /// Returns all files in the given rootPath as a sequence of CvParams containing the annotated absolute file paths.
    ///
    /// Note that rootPath must be an absolute path ending with a trailing slash.
    /// </summary>
    /// <param name="rootPath">absolute path ending with a trailing slash</param>
    static member parseAbsoluteFilePaths(
        rootPath:string
    ) =
        FS.tokenizeAbsoluteFilePaths rootPath

    /// <summary>
    /// Returns all directories in the given rootPath as a sequence of CvParams containing the annotated relative directory paths.
    ///
    /// Note that rootPath must be an absolute path ending with a trailing slash.
    /// </summary>
    /// <param name="rootPath">absolute path ending with a trailing slash</param>
    static member parseRelativeDirectoryPaths(
        rootPath:string
    ) =
        FS.tokenizeRelativeDirectoryPaths rootPath

    /// <summary>
    /// Returns all files in the given rootPath as a sequence of CvParams containing the annotated relative file paths.
    ///
    /// Note that rootPath must be an absolute path ending with a trailing slash.
    /// </summary>
    /// <param name="rootPath">absolute path ending with a trailing slash</param>
    static member parseRelativeFilePaths(
        rootPath:string
    ) =
        FS.tokenizeRelativeFilePaths rootPath
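A hedged usage sketch for the new public surface (the ARC root below is a placeholder, not part of the commit):

open ARCTokenization

// Placeholder ARC root; per the doc comments above it must be absolute
// and end with a trailing slash.
let relativeFiles = FileSystem.parseRelativeFilePaths "C:/Users/me/my-arc/"
let absoluteDirs = FileSystem.parseAbsoluteDirectoryPaths "C:/Users/me/my-arc/"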

type Investigation =

/// <summary>
@@ -21,7 +67,7 @@ type Investigation =

FsWorkbook.fromXlsxFile path
|> Workbook.getInvestigationMetadataSheet useLastSheetOnIncorrectName
|> Worksheet.parseRowsWith (Tokenization.convertTokens MetadataSheet.parseInvestigationKey)
|> Worksheet.parseRowsWith (Tokenization.convertMetadataTokens MetadataSheet.parseInvestigationKey)

/// <summary>
/// Parses the metadata sheet from an ISA Study XLSX file as a flat list of `IParam`s.
@@ -54,7 +100,7 @@ type Study =

FsWorkbook.fromXlsxFile path
|> Workbook.getStudyMetadataSheet useLastSheetOnIncorrectName
|> Worksheet.parseRowsWith (Tokenization.convertTokens MetadataSheet.parseStudyKey)
|> Worksheet.parseRowsWith (Tokenization.convertMetadataTokens MetadataSheet.parseStudyKey)

/// <summary>
/// Parses the metadata sheet from an ISA Study XLSX file as a flat list of `IParam`s.
@@ -95,7 +141,7 @@ type Assay =

FsWorkbook.fromXlsxFile path
|> Workbook.getAssayMetadataSheet useLastSheetOnIncorrectName
|> Worksheet.parseRowsWith (Tokenization.convertTokens MetadataSheet.parseAssayKey)
|> Worksheet.parseRowsWith (Tokenization.convertMetadataTokens MetadataSheet.parseAssayKey)

/// <summary>
/// Parses the metadata sheet from an ISA Assay XLSX file as a flat list of `IParam`s.
7 changes: 6 additions & 1 deletion tests/ARCTokenization.Tests/ARCTokenization.Tests.fsproj
@@ -7,7 +7,7 @@
</PropertyGroup>

<ItemGroup>
<None Include="Fixtures\**" CopyToOutputDirectory="Always" />
<None Include="Fixtures\**" CopyToOutputDirectory="Always"/>
<Compile Include="TestUtils.fs" />
<Compile Include="TestObjects.fs" />
<Compile Include="ReferenceObjects.fs" />
@@ -36,7 +36,12 @@
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\src\ControlledVocabulary\ControlledVocabulary.fsproj" />
<ProjectReference Include="..\..\src\ARCTokenization\ARCTokenization.fsproj" />
</ItemGroup>

<ItemGroup />

<ItemGroup />

</Project>
Empty file. (tests/ARCTokenization.Tests/Fixtures/testPaths/1/1_1/.gitkeep)
Empty file. (tests/ARCTokenization.Tests/Fixtures/testPaths/2/2_1/.gitkeep)
Empty file. (tests/ARCTokenization.Tests/Fixtures/testPaths/2/2_2/2_2_1/.gitkeep)
66 changes: 64 additions & 2 deletions tests/ARCTokenization.Tests/ReferenceObjects.fs
@@ -96,7 +96,7 @@ module Tokenization =
lmaoooo", ParamValue.Value 1)
]

module ConvertTokens =
module ConvertMetadataTokens =

let referenceTerms = [
CvTerm.create(accession = "1", name = "ReferenceTerm1", ref = "1")
@@ -150,4 +150,66 @@ module Tokenization =
UserParam("fk u lmaooooo", ParamValue.CvValue Terms.StructuralTerms.metadataSectionKey)
UserParam("fk u lmaooooo", ParamValue.Value "some value")
UserParam("fk u lmaooooo", ParamValue.Value "another value")
]

module FileSystem =

    let referenceRelativeDirectoryPaths =
        [
            @"1"
            @"2"
            @"1/1_1"
            @"2/2_1"
            @"2/2_2"
            @"2/2_2/2_2_1"
        ]
        |> List.map (fun v ->
            CvParam(
                cvTerm = CvTerm.create("AFSO:00000010","Directory Path","AFSO"),
                v = v
            )
        )

    let referenceAbsoluteDirectoryPaths(root) =
        [
            @"1"
            @"2"
            @"1/1_1"
            @"2/2_1"
            @"2/2_2"
            @"2/2_2/2_2_1"
        ]
        |> List.map (fun f -> System.IO.Path.Combine(root, f))
        |> List.map (fun v ->
            CvParam(
                cvTerm = CvTerm.create("AFSO:00000010","Directory Path","AFSO"),
                v = v.Replace("\\", "/")
            )
        )

    let referenceRelativeFilePaths =
        [
            @"1/1_1/.gitkeep"
            @"2/2_1/.gitkeep"
            @"2/2_2/2_2_1/.gitkeep"
        ]
        |> List.map (fun v ->
            CvParam(
                cvTerm = CvTerm.create("AFSO:00000009","File Path","AFSO"),
                v = v
            )
        )

    let referenceAbsoluteFilePaths(root) =
        [
            @"1/1_1/.gitkeep"
            @"2/2_1/.gitkeep"
            @"2/2_2/2_2_1/.gitkeep"
        ]
        |> List.map (fun f -> System.IO.Path.Combine(root, f))
        |> List.map (fun v ->
            CvParam(
                cvTerm = CvTerm.create("AFSO:00000009","File Path","AFSO"),
                v = v.Replace("\\", "/")
            )
        )
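For reference, the fixture tree these objects describe (the .gitkeep files exist only so Git tracks the otherwise-empty directories):

Fixtures/testPaths/
├── 1/
│   └── 1_1/
│       └── .gitkeep
└── 2/
    ├── 2_1/
    │   └── .gitkeep
    └── 2_2/
        └── 2_2_1/
            └── .gitkeep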
57 changes: 53 additions & 4 deletions tests/ARCTokenization.Tests/TokenizationTests/ParserFunctions.fs
@@ -4,6 +4,55 @@ open ControlledVocabulary
open ARCTokenization
open Xunit

module FileSystem =

    open ReferenceObjects.Tokenization.FileSystem
    open System.IO

    let parsedRelativeDirectoryPaths = FS.tokenizeRelativeDirectoryPaths (Path.GetFullPath("Fixtures/testPaths/")) |> List.ofSeq

    [<Fact>]
    let ``Relative directory paths are tokenized correctly`` () =
        let actual = parsedRelativeDirectoryPaths
        let expected = referenceRelativeDirectoryPaths
        Assert.All(
            List.zip expected actual,
            fun (e, a) -> Assert.True(e.Equals(a))
        )

    let parsedRelativeFilePaths = FS.tokenizeRelativeFilePaths (Path.GetFullPath("Fixtures/testPaths/")) |> List.ofSeq

    [<Fact>]
    let ``Relative file paths are tokenized correctly`` () =
        let actual = parsedRelativeFilePaths
        let expected = referenceRelativeFilePaths
        Assert.All(
            List.zip expected actual,
            fun (e, a) -> Assert.True(e.Equals(a))
        )

    let parsedAbsoluteDirectoryPaths = FS.tokenizeAbsoluteDirectoryPaths (Path.GetFullPath("Fixtures/testPaths/")) |> List.ofSeq

    [<Fact>]
    let ``Absolute directory paths are tokenized correctly`` () =
        let actual = parsedAbsoluteDirectoryPaths
        let expected = referenceAbsoluteDirectoryPaths(Path.Combine(System.Environment.CurrentDirectory, "Fixtures/testPaths/"))
        Assert.All(
            List.zip expected actual,
            fun (e, a) -> Assert.True(e.Equals(a))
        )

    let parsedAbsoluteFilePaths = FS.tokenizeAbsoluteFilePaths (Path.GetFullPath("Fixtures/testPaths/")) |> List.ofSeq

    [<Fact>]
    let ``Absolute file paths are tokenized correctly`` () =
        let actual = parsedAbsoluteFilePaths
        let expected = referenceAbsoluteFilePaths(Path.Combine(System.Environment.CurrentDirectory, "Fixtures/testPaths/"))
        Assert.All(
            List.zip expected actual,
            fun (e, a) -> Assert.True(e.Equals(a))
        )
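Note that List.zip throws when the two lists differ in length, so a mismatch in token count fails each test before any element comparison runs; the per-element check then relies on the Equals implementation of CvParam.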

module ParseKeyWithTerms =

open ReferenceObjects.Tokenization.KeyParser
@@ -75,12 +124,12 @@
fun (e, a) -> Assert.True(Param.equals e a)
)

module ConvertTokens =
module ConvertMetadataTokens =

open ReferenceObjects.Tokenization.ConvertTokens
open ReferenceObjects.Tokenization.ConvertMetadataTokens
open FsSpreadsheet

let tokenizer : FsCell seq -> IParam list = Tokenization.convertTokens (MetadataSheet.parseKeyWithTerms referenceTerms)
let tokenizer : FsCell seq -> IParam list = Tokenization.convertMetadataTokens (MetadataSheet.parseKeyWithTerms referenceTerms)

let parsedCvParams = tokenizer referenceRow

@@ -148,4 +197,4 @@
let ``UserParam row has metadata section key as value of first token`` () =
let actual = parsedUserParams.[0] |> Param.getValueAsTerm
let expected = Terms.StructuralTerms.metadataSectionKey
Assert.Equal(expected, actual)
