From 86c1251682774d59f71be9316ea3374f5dcb6244 Mon Sep 17 00:00:00 2001 From: Koichi Akabe Date: Thu, 13 Apr 2023 15:45:58 +0900 Subject: [PATCH] Test examples using doctest (#10) --- .github/workflows/CI.yml | 5 ++- README.md | 65 ++++++++++++++++---------------------- docs/source/examples.rst | 21 +++++------- tests/data/system.dic.zst | Bin 0 -> 1038 bytes 4 files changed, 39 insertions(+), 52 deletions(-) create mode 100644 tests/data/system.dic.zst diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index 97b1db6..315cb64 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -31,10 +31,13 @@ jobs: - name: Test package run: | python -m pip install --upgrade pip - pip install -r requirements-dev.txt + pip install -r requirements-dev.txt zstandard + python -c "import zstandard;zstandard.ZstdDecompressor().copy_stream(open('tests/data/system.dic.zst','rb'),open('tests/data/system.dic','wb'))" pip install vibrato --no-index --find-links target/wheels --force-reinstall mypy --strict tests pytest + python -m doctest README.md + python -m doctest docs/source/examples.rst pack-sdist: needs: [ test ] diff --git a/README.md b/README.md index f9ef9e7..f3d62cd 100644 --- a/README.md +++ b/README.md @@ -40,43 +40,40 @@ To perform tokenization, follow [the document of Vibrato](https://github.com/daa Check the version number as shown below to use compatible models: ```python -import vibrato -vibrato.VIBRATO_VERSION -#=> "0.5.0" +>>> import vibrato +>>> vibrato.VIBRATO_VERSION +'0.5.0' + ``` Examples: ```python -import vibrato +>>> import vibrato + +>>> with open('tests/data/system.dic', 'rb') as fp: +... tokenizer = vibrato.Vibrato(fp.read()) -with open('path/to/system.dic', 'rb') as fp: - dict_data = fp.read() -tokenizer = vibrato.Vibrato(dict_data) +>>> tokens = tokenizer.tokenize('社長は火星猫だ') -tokens = tokenizer.tokenize('社長は火星猫だ') +>>> len(tokens) +5 -len(tokens) -#=> 5 +>>> tokens[0] +Token { surface: "社長", feature: "名詞,普通名詞,一般,*" } -list(tokens) -#=> [Token { surface: "社長", feature: "名詞,一般,*,*,*,*,社長,シャチョウ,シャチョー,," }, -# Token { surface: "は", feature: "助詞,係助詞,*,*,*,*,は,ハ,ワ,," }, -# Token { surface: "火星", feature: "名詞,一般,*,*,*,*,火星,カセイ,カセイ,," }, -# Token { surface: "猫", feature: "名詞,一般,*,*,*,*,猫,ネコ,ネコ,," }, -# Token { surface: "だ", feature: "助動詞,*,*,*,特殊・ダ,基本形,だ,ダ,ダ,," }] +>>> tokens[0].surface() +'社長' -tokens[0].surface() -#=> '社長' +>>> tokens[0].feature() +'名詞,普通名詞,一般,*' -tokens[0].feature() -#=> '名詞,一般,*,*,*,*,社長,シャチョウ,シャチョー,,' +>>> tokens[0].start() +0 -tokens[0].start() -#=> 0 +>>> tokens[0].end() +2 -tokens[0].end() -#=> 2 ``` ## Note for distributed models @@ -85,22 +82,14 @@ The distributed models are compressed in zstd format. If you want to load these you must decompress them outside the API. ```python -import vibrato -import zstandard # zstandard package in PyPI - -dctx = zstandard.ZstdDecompressor() -with open('path/to/system.dic.zst', 'rb') as fp: - dict_reader = dctx.stream_reader(fp) - tokenizer = vibrato.Vibrato(dict_reader.read()) -``` - -## Documentation +>>> import vibrato +>>> import zstandard # zstandard package in PyPI -Use the help function to show the API reference. +>>> dctx = zstandard.ZstdDecompressor() +>>> with open('tests/data/system.dic.zst', 'rb') as fp: +... with dctx.stream_reader(fp) as dict_reader: +... tokenizer = vibrato.Vibrato(dict_reader.read()) -```python -import vibrato -help(vibrato) ``` ## License diff --git a/docs/source/examples.rst b/docs/source/examples.rst index 21aad6a..039cf63 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -19,27 +19,22 @@ Examples: >>> import vibrato - >>> with open('path/to/system.dic', 'rb') as fp: - ... dict_data = fp.read() - >>> tokenizer = vibrato.Vibrato(dict_data) + >>> with open('tests/data/system.dic', 'rb') as fp: + ... tokenizer = vibrato.Vibrato(fp.read()) >>> tokens = tokenizer.tokenize('社長は火星猫だ') >>> len(tokens) 5 - >>> list(tokens) - [Token { surface: "社長", feature: "名詞,一般,*,*,*,*,社長,シャチョウ,シャチョー,," }, - Token { surface: "は", feature: "助詞,係助詞,*,*,*,*,は,ハ,ワ,," }, - Token { surface: "火星", feature: "名詞,一般,*,*,*,*,火星,カセイ,カセイ,," }, - Token { surface: "猫", feature: "名詞,一般,*,*,*,*,猫,ネコ,ネコ,," }, - Token { surface: "だ", feature: "助動詞,*,*,*,特殊・ダ,基本形,だ,ダ,ダ,," }] + >>> tokens[0] + Token { surface: "社長", feature: "名詞,普通名詞,一般,*" } >>> tokens[0].surface() '社長' >>> tokens[0].feature() - '名詞,一般,*,*,*,*,社長,シャチョウ,シャチョー,,' + '名詞,普通名詞,一般,*' >>> tokens[0].start() 0 @@ -56,6 +51,6 @@ you must decompress them outside the API: >>> import zstandard # zstandard package in PyPI >>> dctx = zstandard.ZstdDecompressor() - >>> with open('path/to/system.dic.zst', 'rb') as fp: - ... dict_reader = dctx.stream_reader(fp) - >>> tokenizer = vibrato.Vibrato(dict_reader.read()) + >>> with open('tests/data/system.dic.zst', 'rb') as fp: + ... with dctx.stream_reader(fp) as dict_reader: + ... tokenizer = vibrato.Vibrato(dict_reader.read()) diff --git a/tests/data/system.dic.zst b/tests/data/system.dic.zst new file mode 100644 index 0000000000000000000000000000000000000000..d74da8717287148c03ccd2396107c48a717dc215 GIT binary patch literal 1038 zcmV+p1o8VQwJ-euXp9E{R0LLOVsc@0Z&YtDUZ3E+9?F3*nvh#4n+QpXY;(Y_tf4(#XGLQ+c{aBEz0Ww#Nv z6eg(a>rJ^SM7b`>H=#s{`=As)MyA_macPC#zCXRqOUfrT`p7xpjPJtZ51)99CYWRJ zV<1o8aD0P8`F8OY&Vt8kx?T@+JtLIbaJk~OQNZENeS*Q%>ox~Lhx|aRMeA=ZRWuR+ z%HtCn!J)|NsAgfB-OnItB;=1Q`;58y_MlASVkR1UxJZ3l<3&4hRq$6C4l{ z6e15Q6eJ8LE-Nr7GAT4J1vNe`0)Q$y0{{R2fB*mge>MXDI6eUi0RjUL7z+>u3=t9& z6b1(s2^kF*1Q!|`91a}_9svRZ1c)RA1uYB)6y=Jk=&7D8=AgVPEGp!`f)|jD=9#YP zfSMfSxPa)0tRdx_y5^jToUjk&iLe+oIlxLrXp$tu0Rj>RB31%LgoFqRG2%)faUy|)G64}y3mt|T&}cEkqD933D*Apn zAm1#s_isX&!puFFGL>_jjh8BwH~hH8YbybC$*8m>i?1OP`GfNkD21fPE%27N_fjz?+AnVOxdZPOEbg^J?NF^7AEsEv4mi4K;rP>Mlfas@E}T1OG#;bz;@jo?yAe0s*5mL#3yZpyLb%`#U+feczoxjf z34?TY(UFusgc5`R0009(0M{1S8-5dhqCTP4L9GPRb zNd{9{O+rshPE}1sQb|KWOi)Nc2uMj%K}SJOR6$D&M^Z&aOG8;wNlZ*hLks}|0|6K4 zqnzc6spzPf<&U>4DgpupBB0%SOAB`fR8HswD4UV$$s;3_M2T!10O<6!Dq&+Ml>$@#z<#9;?g!cf@9Cfa I6ONrW0d!rcvH$=8 literal 0 HcmV?d00001