-
Notifications
You must be signed in to change notification settings - Fork 242
/
Copy pathAllNonAsciiChars.hs
42 lines (35 loc) · 1.44 KB
/
AllNonAsciiChars.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
{-# LANGUAGE OverloadedStrings #-}
-- | This module extracts all the non-ASCII characters used by the
-- library code (along with how many times they are used).
module Main where
import qualified Data.List as List (sortBy, sort)
import qualified Data.List.NonEmpty as List1 (group, head)
import Data.Char (isAscii, ord)
import Data.Function (on)
import Numeric (showHex)
import System.FilePath.Find (find, always, extension, (||?), (==?))
import System.IO (openFile, hSetEncoding, utf8, IOMode(ReadMode))
import qualified Data.Text as T (Text, pack, unpack, concat)
import qualified Data.Text.IO as T (putStrLn, hGetContents)
-- | Open the file at the given path for reading, switch the handle's
-- encoding to UTF-8, and return what 'T.hGetContents' reads from it.
readUTF8File :: FilePath -> IO T.Text
readUTF8File path =
  openFile path ReadMode >>= \handle ->
    -- The encoding has to be set before any contents are read.
    hSetEncoding handle utf8 >> T.hGetContents handle
-- | Entry point.  Collects every @.agda@ and @.lagda@ file found under
-- @src@, counts how often each non-ASCII character occurs across all of
-- them, and prints one line per character in the form
-- @<char> (U+<hex>): <count>@, highest count first.
main :: IO ()
main = do
  sources  <- find always (extension ==? ".agda" ||? extension ==? ".lagda") "src"
  contents <- mapM readUTF8File sources
  let -- Every non-ASCII character of every file, in one flat list.
      nonAscii :: String
      nonAscii = filter (not . isAscii) (T.unpack (T.concat contents))

      -- One (character, occurrences) pair per distinct character: sorting
      -- makes equal characters adjacent so that grouping can count them.
      tally :: [(Char, Int)]
      tally =
        [ (List1.head run, length run)
        | run <- List1.group (List.sort nonAscii)
        ]

      -- Flipping the comparison on the counts yields descending order.
      table :: [(Char, Int)]
      table = List.sortBy (flip compare `on` snd) tally

      -- A single output line; the code point is rendered by 'showHex'.
      render :: (Char, Int) -> T.Text
      render (c, n) =
        T.concat
          [ T.pack [c], " "
          , "(U+", T.pack (showHex (ord c) ""), ")"
          , ": ", T.pack (show n)
          ]
  mapM_ (T.putStrLn . render) table