Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RNA Seq normalization methods #136

Merged
merged 7 commits into from
May 14, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions BioFSharp
Submodule BioFSharp added at 92be7c
273 changes: 273 additions & 0 deletions docs/rnaseq_normalization.ipynb

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions src/BioFSharp.Stats/BioFSharp.Stats.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,12 @@
</PropertyGroup>
<ItemGroup>
<Compile Include="OntologyEnrichment.fs" />
<Compile Include="RNASeq.fs" />
<Compile Include="SurprisalAnalysisEmpiricalPermutationTest.fs" />

<None Include="Playground\OntologyEnrichment.fsx" />
<None Include="Playground\SurprisalAnalysisEmpiricalPermutationTest.fsx" />
<None Include="Playground\RNASeq.fsx" />
</ItemGroup>
<ItemGroup>
<PackageReference Include="FSharpAux" Version="1.1.0" />
Expand Down
5 changes: 5 additions & 0 deletions src/BioFSharp.Stats/Playground/RNASeq.fsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#load "../RNASeq.fs"
open BioFSharp
open BioFSharp.Stats
open BioFSharp.Stats.RNASeq

71 changes: 71 additions & 0 deletions src/BioFSharp.Stats/RNASeq.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
namespace BioFSharp.Stats

open System
open System.Collections.Generic

/// Contains types and functions needed for RNA-Seq normalization
module RNASeq =
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At least the public functions and types should have XML documentation, so that readers get context about what they do without having to browse the documentation page.

/// Raw per-gene input record for RNA-Seq normalization.
type RNASeqInput =
    {
        /// Identifier of the gene.
        GeneID : string
        /// Length of the gene (presumably in base pairs, since the normalization code divides it by 1000 - confirm).
        GeneLength : float
        /// Read count recorded for the gene.
        GeneCount : float
    }
    /// Builds an RNASeqInput from a gene id, a gene length and a read count, in that order.
    static member Create id gl gc =
        { GeneID = id; GeneLength = gl; GeneCount = gc }
/// Tags a normalized value with the normalization method that produced it.
type NormalizationMethod =
    /// Reads per kilobase per million mapped reads.
    | RPKM
    /// Transcripts per million.
    | TPM
/// Normalization result for a single gene: its id, the normalized value and the method used.
type NormalizedCounts =
    {
        /// Identifier of the gene the value belongs to.
        GeneID : string
        /// The normalized count.
        NormalizedCount : float
        /// The method that produced NormalizedCount.
        NormalizationMethod : NormalizationMethod
    }
    /// Builds a NormalizedCounts from a gene id, a normalized count and a normalization method, in that order.
    static member Create id nc nm =
        { GeneID = id; NormalizedCount = nc; NormalizationMethod = nm }
/// Reads per million: a read count divided by the total read count expressed in millions.
let private calcRPM sumOfAllReadsPerMil counts =
    let reads = float counts
    reads / sumOfAllReadsPerMil
/// RPKM: a reads-per-million value divided by the gene length expressed in thousands.
let private calcRPKM geneLength rpm =
    let lengthInThousands = float geneLength / 1000.
    float rpm / lengthInThousands
/// Computes RPKM values from three parallel sequences: gene ids, gene lengths and read counts.
/// The read total is summed immediately; the per-gene results are produced lazily and stop
/// as soon as any of the three inputs runs out.
let private rpkmsOf (geneIDs:seq<string>) (length:seq<float>) (counts:seq<float>) =
    // Total number of reads over all genes, expressed in millions.
    let millionsOfReads = Seq.sum counts / 1000000.
    Seq.map3
        (fun geneId geneLength geneCount ->
            { GeneID = geneId
              NormalizedCount = geneCount |> calcRPM millionsOfReads |> calcRPKM geneLength
              NormalizationMethod = RPKM })
        geneIDs
        length
        counts
/// <summary>
/// Normalizes raw gene counts to RPKM: each count is divided by the total count in millions
/// and then by the gene length divided by 1000.
/// </summary>
/// <param name="idLengthAndCounts">Gene id, gene length and read count of every gene in the sample.</param>
/// <returns>One NormalizedCounts per input gene, in input order, tagged with RPKM.</returns>
let rpkms (idLengthAndCounts:seq<RNASeqInput>) =
    // Materialize the input once. Projecting ids, lengths and counts straight off the incoming
    // sequence would enumerate it again for every projection (plus once for the read total),
    // which is wasteful for expensive sequences and inconsistent for non-repeatable ones.
    let genes = Array.ofSeq idLengthAndCounts
    rpkmsOf
        (genes |> Seq.map (fun gene -> gene.GeneID))
        (genes |> Seq.map (fun gene -> gene.GeneLength))
        (genes |> Seq.map (fun gene -> gene.GeneCount))
/// Computes TPM values: reads per kilobase for every gene, divided by the sum of all
/// reads-per-kilobase values expressed in millions.
/// Returns one NormalizedCounts per input gene, in input order, tagged with TPM.
let private tpmsOf (idLengthAndCounts:seq<RNASeqInput>) =
    // Materialize the input once so it is not re-enumerated for the sum, the values and the ids.
    let genes = Array.ofSeq idLengthAndCounts
    // Reads per kilobase: the count divided by the gene length in thousands. The length is
    // scaled (not the quotient), matching how calcRPKM treats gene length. Any constant factor
    // here cancels in the final ratio, so only the intermediate values change.
    let readsPerKilobase =
        genes
        |> Array.map (fun gene -> gene.GeneCount / (gene.GeneLength / 1000.))
    // Sum of all reads-per-kilobase values, expressed in millions.
    let perMillionScale = Seq.sum readsPerKilobase / 1000000.
    Seq.map2
        (fun (gene: RNASeqInput) rpk ->
            { GeneID = gene.GeneID
              NormalizedCount = rpk / perMillionScale
              NormalizationMethod = TPM })
        genes
        readsPerKilobase
/// <summary>Normalizes raw gene counts to TPM.</summary>
/// <param name="idLengthAndCounts">Gene id, gene length and read count of every gene in the sample.</param>
/// <returns>The NormalizedCounts produced for the input genes, tagged with TPM.</returns>
let tpms (idLengthAndCounts:seq<RNASeqInput>) =
    idLengthAndCounts |> tpmsOf

37 changes: 37 additions & 0 deletions tests/BioFSharp.Tests/BioFSharp.Stats/RNASeqTests.fs
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
module RNASeqTests

open BioFSharp.Stats
open Expecto


// Not referenced by the test cases in this module.
let testSeq =
    seq { 1. .. 2. }
    |> Seq.map (fun (i: float) -> ("stringtest"+ i.ToString(),(i,i)))

// Two-gene fixture: ids, lengths and counts as parallel sequences.
let testgeneID = Seq.ofList ["stringtest1"; "stringtest2"]
let testLength = Seq.ofList [1.; 2.]
let testCount = Seq.ofList [1.; 2.]
let testInSeq =
    Seq.zip3 testgeneID testLength testCount
    |> Seq.map (fun (id, gl, gc) -> RNASeq.RNASeqInput.Create id gl gc)

// Expected normalized values per gene id.
let resultRPKM = Seq.ofList [("stringtest1", 333333333.3333333); ("stringtest2", 333333333.3333333)]
let resultTPM = Seq.ofList [("stringtest1", 500000.); ("stringtest2", 500000.)]
let RPKMres =
    resultRPKM
    |> Seq.map (fun (id, rpkm) -> RNASeq.NormalizedCounts.Create id rpkm RNASeq.NormalizationMethod.RPKM)
let TPMres =
    resultTPM
    |> Seq.map (fun (id, tpm) -> RNASeq.NormalizedCounts.Create id tpm RNASeq.NormalizationMethod.TPM)
/// Checks RNASeq.rpkms and RNASeq.tpms against the expected sequences for the two-gene fixture.
[<Tests>]
let RNASeqTests =
    testList "RNASeqTests" [
        testCase "RPKM" <| fun _ ->
            let actual = RNASeq.rpkms testInSeq
            Expect.sequenceEqual actual RPKMres "RPKM did not return correct Sequence"
        testCase "TPM" <| fun _ ->
            let actual = RNASeq.tpms testInSeq
            Expect.sequenceEqual actual TPMres "TPM did not return correct Sequence"
    ]
2 changes: 2 additions & 0 deletions tests/BioFSharp.Tests/BioFSharp.Tests.fsproj
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
<Compile Include="BioFSharp\BioCollections.fs" />
<Compile Include="BioFSharp\BioItemTests.fs" />
<Compile Include="BioFSharp\PhylTreeTests.fs" />
<Compile Include="BioFSharp.Stats\RNASeqTests.fs" />
<Compile Include="Main.fs" />
</ItemGroup>

Expand All @@ -35,5 +36,6 @@
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="16.*" />
<ProjectReference Include="..\..\src\BioFSharp\BioFSharp.fsproj" />
<ProjectReference Include="..\..\src\BioFSharp.IO\BioFSharp.IO.fsproj" />
<ProjectReference Include="..\..\src\BioFSharp.Stats\BioFSharp.Stats.fsproj" />
</ItemGroup>
</Project>
Loading