-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Add a module for generating trigrams from a dictionary
- Loading branch information
1 parent
74cbada
commit a7504b8
Showing
7 changed files
with
146 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
{-# LANGUAGE OverloadedLists #-} | ||
module Data.Gibberish.GenTrigrams | ||
(mapTrigrams) where | ||
|
||
import Data.Gibberish.Types | ||
import Data.Map (Map ()) | ||
import Data.Map qualified as Map | ||
import Data.Text (Text ()) | ||
import Data.Text qualified as Text | ||
|
||
-- | Build the digram-to-frequencies table for a whole list of words.
--
-- Each word is converted with 'mkTrigrams' and the per-word tables are merged:
-- when the same 'Digram' appears in more than one word, the counts stored for
-- each following 'Unigram' are added together. An empty list yields an empty
-- map.
mapTrigrams :: [Text] -> Map Digram Frequencies
-- 'Map.unionsWith' is a strict left fold over the per-word maps, so merging a
-- large dictionary does not nest one pending 'Map.unionWith' per input word.
mapTrigrams = Map.unionsWith combine . map mkTrigrams
  where
    -- Merge two frequency tables by summing the counts of shared unigrams.
    combine (Frequencies f1) (Frequencies f2) = Frequencies $ Map.unionWith (+) f1 f2
|
||
-- | Build the digram-to-frequencies table for a single word.
--
-- Every trigram @(a, b, c)@ produced by 'scanTrigrams' contributes one
-- occurrence of @'Unigram' c@ under the key @'Digram' a b@. Trigrams that occur
-- more than once in the word accumulate by addition.
mkTrigrams :: Text -> Map Digram Frequencies
mkTrigrams word =
  Map.fromListWith
    mergeCounts
    [ (Digram a b, Frequencies (Map.singleton (Unigram c) 1))
    | (a, b, c) <- scanTrigrams word
    ]
  where
    -- Two entries for the same digram: sum the counts per following unigram.
    mergeCounts (Frequencies new) (Frequencies old) =
      Frequencies (Map.unionWith (+) new old)
|
||
-- | List every run of three consecutive characters in a word, left to right.
--
-- A word with fewer than three characters has no trigrams and yields @[]@.
scanTrigrams :: Text -> [(Char, Char, Char)]
scanTrigrams word = zip3 chars (drop 1 chars) (drop 2 chars)
  where
    -- 'zip3' stops at its shortest argument, so only complete windows appear.
    chars = Text.unpack word
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
{-# LANGUAGE OverloadedLists #-} | ||
|
||
module Test.Gibberish.GenTrigramsTest (tests) where | ||
|
||
import Data.Gibberish.GenTrigrams | ||
import Data.Gibberish.Types | ||
import Test.Gibberish.Gen qualified as Gen | ||
|
||
import Data.List qualified as List | ||
import Data.Map (Map ()) | ||
import Data.Map qualified as Map | ||
import Data.Text (Text ()) | ||
import Data.Text qualified as Text | ||
import Hedgehog | ||
import Hedgehog.Gen qualified as Gen hiding (word) | ||
import Hedgehog.Range qualified as Range | ||
import Test.Tasty (TestTree (), testGroup) | ||
import Test.Tasty.Hedgehog (testPropertyNamed) | ||
|
||
-- | All property tests for "Data.Gibberish.GenTrigrams".
tests :: TestTree
tests = testGroup "Test.Gibberish.GenTrigrams" properties
  where
    properties :: [TestTree]
    properties =
      [ testPropertyNamed "length trigrams" "prop_len_trigrams" prop_len_trigrams
      , testPropertyNamed "length frequencies" "prop_len_frequencies" prop_len_frequencies
      , testPropertyNamed "contains all trigrams" "prop_trigrams_all" prop_trigrams_all
      ]
|
||
-- | The table built from a single word never has more keys than the word has
-- trigram positions, i.e. @max 0 (length - 2)@.
prop_len_trigrams :: Property
prop_len_trigrams = property $ do
  word <- forAll Gen.word
  let repeated = hasDuplicates . List.sort $ trigrams word
      positions = max 0 (Text.length word - 2)

  -- Require the generator to produce words both with and without repeated
  -- trigrams.
  cover 10 "with duplicates" repeated
  cover 10 "no duplicates" (not repeated)

  assert $ length (mapTrigrams [word]) <= positions
|
||
-- | Summing every count stored in the table built from a single word gives
-- the number of trigrams in that word.
prop_len_frequencies :: Property
prop_len_frequencies = property $ do
  word <- forAll Gen.word
  let expected = List.sort (trigrams word)
      repeated = hasDuplicates expected
      -- Every individual count, across all digram keys of the table.
      counts =
        [ n
        | Frequencies freqs <- Map.elems (mapTrigrams [word])
        , n <- Map.elems freqs
        ]

  cover 10 "with duplicates" repeated
  cover 10 "no duplicates" (not repeated)

  length expected === fromIntegral (sum counts)
|
||
-- | The distinct trigrams recoverable from the table built by 'mapTrigrams'
-- are exactly the distinct trigrams of the input words.
prop_trigrams_all :: Property
prop_trigrams_all = property $ do
  words' <- forAll $ Gen.list (Range.linear 0 10) Gen.word
  let perWord = map (List.sort . trigrams) words'
      -- Sorted, de-duplicated trigrams over all generated words.
      expected = List.nub . List.sort $ List.concat perWord
      actual = List.sort . allTrigrams $ mapTrigrams words'

  cover 10 "with duplicates" $ List.any hasDuplicates perWord
  cover 10 "no duplicates" $ List.any (not . hasDuplicates) perWord

  expected === actual
|
||
-- | Reference implementation used by the properties: every run of three
-- consecutive characters of the input, left to right.
trigrams :: Text -> [(Char, Char, Char)]
trigrams ts =
  [ (a, b, c)
  | -- Suffixes with fewer than three characters fail the pattern and are skipped.
    [a, b, c] <- map (Text.unpack . Text.take 3) (Text.tails ts)
  ]
|
||
-- | Flatten a digram-to-frequencies table back into trigrams: one
-- @(c1, c2, c3)@ for every unigram key stored under every digram key. The
-- counts themselves are ignored.
allTrigrams :: Map Digram Frequencies -> [(Char, Char, Char)]
allTrigrams tris =
  [ (c1, c2, c3)
  | (Digram c1 c2, Frequencies freqs) <- Map.toList tris
  , Unigram c3 <- Map.keys freqs
  ]
|
||
-- | 'True' when at least one element occurs more than once in the list.
--
-- The list is sorted first so that equal elements become neighbours; a single
-- pass over adjacent pairs then finds any repeat. This uses the 'Ord'
-- constraint and avoids the quadratic cost of 'List.nub'.
hasDuplicates :: Ord a => [a] -> Bool
hasDuplicates ls = or $ zipWith (==) sorted (drop 1 sorted)
  where
    sorted = List.sort ls
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
module Test.Gibberish.TypesTest (tests) where | ||
|
||
import Test.Gibberish.Gen qualified as Gen | ||
|
||
import Data.Aeson (fromJSON, toJSON) | ||
import Hedgehog | ||
import Test.Tasty (TestTree (), testGroup) | ||
import Test.Tasty.Hedgehog (testPropertyNamed) | ||
|
||
-- | All property tests for "Data.Gibberish.Types".
tests :: TestTree
tests = testGroup "Test.Gibberish.Types" [roundtrip]
  where
    roundtrip =
      testPropertyNamed "toJSON roundtrip" "prop_toJSON_roundtrip" prop_toJSON_roundtrip
|
||
-- | Encoding a generated trigram with 'toJSON' and decoding the result with
-- 'fromJSON' gives back the original value.
prop_toJSON_roundtrip :: Property
prop_toJSON_roundtrip =
  property $
    forAll Gen.trigram >>= \trigram ->
      tripping trigram toJSON fromJSON
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters