-
Notifications
You must be signed in to change notification settings - Fork 0
/
treetaggerwrapper.py
2853 lines (2444 loc) · 118 KB
/
treetaggerwrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python
# -*- coding: utf-8 -*-
r"""
About treetaggerwrapper
=======================
:author: Laurent Pointal <[email protected]> <[email protected]>
:organization: CNRS - LIMSI
:copyright: CNRS - 2004-2015
:license: GNU-GPL Version 3 or greater
:version: 2.2.2
For language independent part-of-speech tagger TreeTagger,
see `Helmut Schmid TreeTagger site`_.
.. _Helmut Schmid TreeTagger site: http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html
For this module, see `Developer Project page`_ and `Project Source repository`_
.. _Developer Project page: https://perso.limsi.fr/pointal/dev:treetaggerwrapper
.. _Project Source repository: https://sourcesup.renater.fr/projects/ttpw/
You can also retrieve the latest version of this module with the svn command::
svn export https://subversion.renater.fr/ttpw/trunk/treetaggerwrapper.py
This wrapper tool is intended to be used in larger projects, where multiple
chunk of texts must be processed via TreeTagger (else you may simply use the
base TreeTagger installation as an external command).
.. warning:: Incompatible module evolutions with version 2.0 on august 20 2015
See :ref:`important modifications notes` below !
If you use this wrapper, a small email would be welcome to support
module maintenance (where, purpose, funding…).
Send it to [email protected]
Installation
============
Requirements
------------
``treetaggerwrapper`` relies on :mod:`six` module for Python2 and Python3
compatibility. It also uses standard :mod:`io` module for files reading with
decoding / encoding .
Tests have been limited to Python 2.7 and Python 3.4 under Linux and Windows.
It doesn't work with earlier versions of Python as some names are not defined in
their standard libraries.
Automatic
---------
As the module is now registered on `PyPI`_, you can simply install it::
pip install treetaggerwrapper
Or, if you can't (or don't want) to install the module system-wide (and don't
use a `virtual env`_)::
pip install --user treetaggerwrapper
.. _PyPI: https://pypi.python.org/pypi/treetaggerwrapper
.. _virtual env: https://virtualenv.pypa.io/en/latest/
If it is already installed as a package, use pip's install :option:`-U` option
to install the last version (update).
Manual
------
For a complete manual installation, install :mod:`six` module and other
dependencies, and simply put the :file:`treetaggerwrapper.py`
and :file:`treetaggerpoll.py` files in a
directory listed in the Python path (or in your scripts directory).
Configuration
=============
The wrapper searches for the treetagger directory
(the one with :file:`bin`, :file:`lib` and :file:`cmd` subdirectories),
allowing variations in TreeTagger directory name
(example: :file:`treetagger`, :file:`TreeTagger`,
:file:`Tree-Tagger-latest`, :file:`Tree Tagger`, etc),
in different locations from user home directory to host-wide directories.
If the treetagger directory is found, its location is stored in a file
:file:`$HOME/.config/treetagger_wrapper.cfg` (or any place following
:envvar:`XDG_CONFIG_DIR` if it is specified),
and at next start the directory indicated in this file is used if it
still exists.
If you installed TreeTagger in a non-guessable location, you still can set up
an environment variable :envvar:`TAGDIR` to reference the
TreeTagger software installation directory, or give a `TAGDIR` named argument
when building a :class:`TreeTagger` object to provide this information,
or simply put that information into configuration file in section ``[CACHE]``
under key ``tagdir = …``.
Usage
=====
Primary usage is to wrap TreeTagger binary and use it as a functional tool.
You have to build a :class:`TreeTagger` object, specifying the target
language [by its country code!], and possibly some other TreeTagger parameters
(else we use standard files specified in the module for each supported language).
Once this wrapper object created, you can simply call its :any:`tag_text()`
method with the string to tag, and it will return a list of lines corresponding
to the text tagged by TreeTagger.
Example (with Python3, **Unicode strings** by default — with Python2 you
need to use explicit notation ``u"string"``, or if within a script start by a
:code:`from __future__ import unicode_literals` directive)::
>>> import pprint # For proper print of sequences.
>>> import treetaggerwrapper
>>> #1) build a TreeTagger wrapper:
>>> tagger = treetaggerwrapper.TreeTagger(TAGLANG='en')
>>> #2) tag your text.
>>> tags = tagger.tag_text("This is a very short text to tag.")
>>> #3) use the tags list... (list of string output from TreeTagger).
>>> pprint.pprint(tags)
['This\tDT\tthis',
'is\tVBZ\tbe',
'a\tDT\ta',
'very\tRB\tvery',
'short\tJJ\tshort',
'text\tNN\ttext',
'to\tTO\tto',
'tag\tVV\ttag',
'.\tSENT\t.']
>>> # Note: in output strings, fields are separated with tab chars (\t).
You can transform it into a list of named tuple :class:`Tag`
(and possible :class:`NotTag` for unknown tokens) using the helper
:func:`make_tags` function::
>>> tags2 = treetaggerwrapper.make_tags(tags)
>>> pprint.pprint(tags2)
[Tag(word='This', pos='DT', lemma='this'),
Tag(word='is', pos='VBZ', lemma='be'),
Tag(word='a', pos='DT', lemma='a'),
Tag(word='very', pos='RB', lemma='very'),
Tag(word='short', pos='JJ', lemma='short'),
Tag(word='text', pos='NN', lemma='text'),
Tag(word='to', pos='TO', lemma='to'),
Tag(word='tag', pos='VV', lemma='tag'),
Tag(word='.', pos='SENT', lemma='.')]
You can also directly process files using :meth:`TreeTagger.tag_file` and
:meth:`TreeTagger.tag_file_to` methods.
The module itself can be used as a command line tool too, for more information
ask for module help::
python treetaggerwrapper.py --help
If available within :envvar:`PYTHONPATH`, the module can also be called
from anywhere with the :option:`-m` Python option::
python -m treetaggerwrapper --help
.. _important modifications notes:
Important modifications notes
=============================
On august 2015, the module has been reworked deeply, some
modifications imply modifications in users code.
- **Methods renamed** (and functions too) to follow Python rules,
they are now lowercase
with underscore separator between words.
Typically for users, ``tt.TagText()`` becomes ``tt.tag_text()``
(for this method a compatibility method has been written, but
no longer support lists of non-unicode strings).
- Work with Python2 and Python3, with same code.
- Use **Unicode strings** internally (it's no more possible to provide
binary strings and their encoding as separated
parameters - you have to decode the strings yourself before calling
module functions).
- Assume **utf-8** when dealing with *TreeTagger binary*, default to its utf-8
versions of parameter and abbrev files. If you use another encoding,
you must specify these files: in your sources, or via environment
vars, or in the :file:`treetagger_wrapper.cfg` configuration file under
encoding name section (respecting Python encoding names as given by
``codecs.lookup(enc).name``, ie. uses ``utf-8``).
- Default to **utf-8** when reading *user files* (you need to specify latin1
if you use such encoding - previously it was the default).
- **Guess TreeTagger location** — you can still provide :envvar:`TAGDIR` as env var
or as :class:`TreeTagger` parameter, but it's no more necessary.
Found directory is cached in :file:`treetagger_wrapper.cfg` configuration
file to only guess once.
- Documentation has been revised to only export main things for module usage;
internals stay documented via comments in the source.
- **Text chunking** has been revisited and must be more efficient.
And you can now also provide your own external chunking function when
creating the wrapper — which will replace internal chunking in the whole
process.
- XML tags generated have been modified (made shorter and with ``ttpw:`` namespace).
- Can be used in **multithreading** context (pipe communications with TreeTagger
are protected by a Lock, preventing concurrent access).
If you need multiple parallel processing, you can create multiple :class:`TreeTagger`
objects, put them in a poll, and work with them from different threads.
- Support polls of taggers for optimal usage on multi-core computers.
See :class:`TaggerPoll` class for thread poll
and :class:`treetaggerpoll.TaggerProcessPoll` class for process poll.
Processing
==========
This module does two main things
--------------------------------
- Manage preprocessing of text (chunking) in place of external Perl scripts as in
base TreeTagger installation, thus avoid starting Perl each time a piece
of text must be tagged.
- Keep alive a pipe connected to TreeTagger process, and use that pipe
to send data and retrieve tags, thus avoid starting TreeTagger each
time and avoid writing / reading temporary files on disk (direct
communication via the pipe).
Supported languages
^^^^^^^^^^^^^^^^^^^
.. note:: Encoding specification
When specifying language with treetaggerwrapper, we use the two-character
language codes, not the complete language name.
This module support chunking + tagging for languages:
- spanish (es)
- french (fr)
- english (en)
- german (de)
It can be used for tagging only for languages:
- bulgarian (bg)
- dutch (nl)
- estonian (et)
- finnish (fi)
- galician (gl)
- italian (it)
- latin (la)
- mongolian (mn)
- polish (pl)
- russian (ru)
- slovak (sk)
- swahili (sw)
Note: chunking parameters have not been adapted to these language
and their specific features, you may try to chunk with default processing…
with no guaranty.
If you have an external chunker, you can call the tagger with
option ``tagonly`` set to True, you should then provide a simple
string with one token by line (or list of strings with one token
by item).
If your chunker is a callable, you can provide your own chunking function
with :option:`CHUNKERPROC` named parameter when constructing :class:`TreeTagger`
object, and then use it normally (your function is called in place of
standard chunking).
For all these languages, the wrapper use standard filenames for
TreeTagger's parameter and abbreviation files.
You can override these names using :option:`TAGPARFILE` and
:option:`TAGABBREV` parameters, and then use alternate files.
Other things done by this module
--------------------------------
- Can number lines into XML tags (to identify lines after TreeTagger
processing).
- Can mark whitespaces with XML tags.
- By default replace non-talk parts like URLs, emails, IP addresses,
DNS names (can be turned off). Replaced by a 'replaced-xxx' string
followed by an XML tag containing the replaced text as attribute
(the tool was originally used to tag parts of exchanges from technical
mailing lists, containing many of these items).
- Acronyms like U.S.A. are systematically written with a final dot,
even if it is missing in original file.
- Automatic encode/decode files using user specified encoding (default
to utf-8).
In normal mode, all journal outputs are done via Python standard logging system,
standard output is only used if a) you run the module in pipe mode (ie.
results goes to stdout), or b) you set DEBUG or DEBUG_PREPROCESS global
variables and you use the module directly on command line (which make journal
and other traces to be sent to stdout).
For an example of logging use, see :func:`enable_debugging_log` function.
Alternative tool
----------------
You may also take a look at project `treetagger python`_
which wraps TreeTagger command-line tools (simpler than
this module, it may be slower if you have many texts
to tag in your process as it calls and restarts TreeTagger
chunking then tagging tools chain for each text).
.. _treetagger python: https://github.com/miotto/treetagger-python/blob/master/treetagger.py
Hints
=====
Window buffer overflow
----------------------
On windows, if you get the following error about some file manipulation (ex. in an
:func:`osp.abspath` call)::
TypeError: must be (buffer overflow), not str
Check that directories and filenames total length don't exceed 260 chars.
If this is the case, you may try to use UNC names starting by ``\\?\`` (read Microsoft
`Naming Files, Paths, and Namespaces`_ documentation — note: you cannot use ``/``
to separate directories with this notation).
.. _Naming Files, Paths, and Namespaces: https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247.aspx
TreeTagger localization
-----------------------
For your TreeTagger to be automatically found by the script, its **directory**
must follow installation rules below:
Directory naming and content
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Location search function tries to find a directory beginning with ``tree``,
possibly followed by any char (ex. a space, a dash…), followed
by ``tagger``, possibly followed by any sequence of chars (ex. a
version number), and without case distinction.
This match directory names like ``treetagger``, ``TreeTagger``, ``Tree-tagger``,
``Tree Tagger``, ``treetagger-2.0``…
The directory must contain :file:`bin` and :file:`lib` subdirectories
(they are normally created by TreeTagger installation script, or directly
included in TreeTagger Windows zipped archive).
First directory corresponding to these criteria is considered to
be the TreeTagger installation directory.
Searched locations
^^^^^^^^^^^^^^^^^^
TreeTagger directory location is searched from local (user private installation)
to global (system wide installation).
1. Near the :file:`treetaggerwrapper.py` file (TreeTagger being in same directory).
2. Containing the :file:`treetaggerwrapper.py` file (module inside TreeTagger directory).
3. User home directory (ex. :file:`/home/login`, :file:`C:\\Users\\login`).
4. First level directories in user home directory (ex. :file:`/home/login/tools`,
:file:`C:\\Users\\login\\Desktop`).
5. For MacOSX, in :file:`~/Library/Frameworks`.
6. For Windows, in program files directories (ex. :file:`C:\\Program Files`).
7. For Windows, in each existing fixed disk root and its first level directories
(ex. :file:`C:\\`, :file:`C:\\Tools`, :file:`E:\\`, :file:`E:\\Apps`).
8. For Posix (Linux, BSD… MacOSX), in a list of standard directories:
- :file:`/usr/bin`,
- :file:`/usr/lib`,
- :file:`/usr/local/bin`,
- :file:`/usr/local/lib`,
- :file:`/opt`,
- :file:`/opt/bin`,
- :file:`/opt/lib`,
- :file:`/opt/local/bin`,
- :file:`/opt/local/lib`.
9. For MacOSX, in applications standard directories:
- :file:`/Applications`,
- :file:`/Applications/bin`,
- :file:`/Library/Frameworks`.
"""
from __future__ import print_function
# Following import prevent working with Python < 2.6 !
from __future__ import unicode_literals
# To allow use of epydoc documentation generation with reStructuredText markup.
# Note that use of sphinx 1.3 :any: role may broke epydoc (not tested).
__docformat__ = "restructuredtext en"
__version__ = '2.2.2'
# Note: I use re.VERBOSE option everywhere to allow spaces and comments into
# regular expressions (more readable). And (?:...) allow to have
# semantic groups of things in the expression but no submatch group
# corresponding in the match object.
# ==============================================================================
__all__ = ["TreeTaggerError", "TreeTagger", "Tag", "make_tags"]
import codecs
import collections
import copy
from six.moves import configparser
import getopt
import glob
import io
import logging
import multiprocessing
import os
import os.path as osp
import platform
from six.moves import queue
import re
import shlex
import six
import string
import subprocess
import sys
import threading
import time
if six.PY2:
    # Python 2 has no PermissionError builtin: a permission denied error
    # is raised there as an OSError (with errno 13), so the name is
    # aliased to OSError.
    PermissionError = OSError
# Debugging switches, all disabled by default.
# General debugging code (mainly logs).
DEBUG = 0
# Debugging code specific to the preprocessing.
DEBUG_PREPROCESS = 0
# Debugging code specific to multithreading.
DEBUG_MULTITHREAD = 0

# Extension given to result files when the module is used on command-line
# (ttr stands for TreeTagger result).
RESEXT = "ttr"

# Errors and warnings are not printed, they go to Python logging system.
logger = logging.getLogger("TreeTagger")
# The null handler avoids the
# 'No handlers could be found for logger "TreeTagger"' message.
logger.addHandler(logging.NullHandler())

# Tags marking begin/end of a text inside the data flow
# (this avoids restarting the TreeTagger process for each text).
STARTOFTEXT = "<ttpw:start-text />"
ENDOFTEXT = "<ttpw:end-text />"

# Tag template identifying a line number of the source text.
NUMBEROFLINE = '<ttpw:line num="{}" />'

# Tags identifying the location of whitespaces in the source text.
TAGSPACE = "<ttpw:space />"
TAGTAB = "<ttpw:tab />"
TAGLF = "<ttpw:lf />"
TAGCR = "<ttpw:cr />"
TAGVT = "<ttpw:vt />"
TAGFF = "<ttpw:ff />"

# Default input and output encoding for files and strings when no encoding
# is specified.
USER_ENCODING = "utf-8"

# Identify the running platform once.
ON_WINDOWS = platform.system() == "Windows"
ON_MACOSX = platform.system() == "Darwin"
ON_POSIX = os.name == "posix"  # Care: true also under MACOSX.
# Extra configuration storage within a config file.
# SafeConfigParser is only needed under Python 2. Under Python 3 it is a
# deprecated alias of ConfigParser which has been removed in Python 3.12,
# so referencing it there makes the whole module import fail.
if sys.version_info[0] == 2:
    g_config = configparser.SafeConfigParser()
else:
    g_config = configparser.ConfigParser()
# The config file is stored following XDG rules.
CONFIG_FILENAME = "treetagger_wrapper.cfg"
# Timeout used when reading, in case of problem with the tagger process.
TAGGER_TIMEOUT = 30

# SGML tag templates for replaced data (when kept in text), one for each
# kind of replaced item; the {} placeholder stands for the text attribute.
REPLACED_DNS_TAG = '<repdns text="{}" />'
REPLACED_IP_TAG = '<repip text="{}" />'
REPLACED_EMAIL_TAG = '<repemail text="{}" />'
REPLACED_URL_TAG = '<repurl text="{}" />'
# ==============================================================================
# ALONEMARKS:
# Characters which must stay alone: they need spaces around them to become
# tokens (this is done when pre-processing the text to chunks).
# Note: chars from ALONEMARKS may also be in pchar or fchar, to identify
# punctuation after a fchar.
# See Unicode database…
ALONEMARKS = (
    "!?¿;\"«»“”´`¨,*¤@°:%|¦/"
    "()[\\]{}<>«»\u008b\u009b\u0093"
    "&~=±×\226\227"
    "\t\n\r"
    "\u2014\u203E\u0305\u2012\u2013"
    "£¥$€©®"
)

# Regular expression text (one capturing group) for numbers: an optional
# sign, then either digits with an optional decimal part, or a decimal
# separator followed by digits, and an optional exponent in both cases.
NUMBER_EXPRESSION = r"""(
[-+]?[0-9]+(?:[.,][0-9]*)?(?:[eE][-+]?[0-9]+)?
|
[-+]?[.,][0-9]+(?:[eE][-+]?[0-9]+)?
)"""
# Language support.
# Dictionary g_langsupport is indexed by language code (en, fr, de...).
# Each language code has a dictionary as value, with corresponding entries:
#   tagparfile: name of the TreeTagger language file in TreeTagger lib dir.
#   abbrevfile: name of the abbreviations text file in TreeTagger lib dir.
#   encoding: encoding to use with TreeTagger, accordingly to these files.
#   pchar: characters which have to be cut off at the beginning of a word.
#          must be usable into a [] regular expression part.
#   fchar: characters which have to be cut off at the end of a word.
#          must be usable into a [] regular expression part.
#   pclictic: character sequences which have to be cut off at the beginning
#          of a word.
#   fclictic: character sequences which have to be cut off at the end of
#          a word.
#   number: representation of numbers in the language.
#          must be a full regular expression for numbers.
#   dummysentence: a language valid sentence (sent to ensure that TreeTagger
#          push remaining data). Sentence must only contain words and spaces
#          (even spaces between punctuation), as the string is simply split
#          on whitespaces before being sent to TreeTagger.
#   replurlexp: regular expression substitution string for URLs.
#   replemailexp: regular expression substitution string for emails.
#   replipexp: regular expression substitution string for IP addresses.
#   repldnsexp: regular expression substitution string for DNS names.
# The "__base__" entry holds default values for languages without specific
# settings.
g_langsupport = {
    "__base__": {
        "encoding": "utf-8",
        "tagparfile": "",
        "abbrevfile": "",
        "pchar": ALONEMARKS + "'",
        "fchar": ALONEMARKS + "'",
        "pclictic": "",
        "fclictic": "",
        "number": NUMBER_EXPRESSION,
        "dummysentence": " .",  # Just a final sentence dot.
        "replurlexp": 'replaced-url',
        "replemailexp": 'replaced-email',
        "replipexp": 'replaced-ip',
        "repldnsexp": 'replaced-dns'
    },
    "en": {
        "encoding": "utf-8",
        "tagparfile": "english-utf8.par",
        "abbrevfile": "english-abbreviations",
        "pchar": ALONEMARKS + "'",
        "fchar": ALONEMARKS + "'",
        "pclictic": "",
        "fclictic": "'(s|re|ve|d|m|em|ll)|n't",
        "number": NUMBER_EXPRESSION,
        "dummysentence": "This is a dummy sentence to ensure data push .",
        "replurlexp": 'replaced-url',
        "replemailexp": 'replaced-email',
        "replipexp": 'replaced-ip',
        "repldnsexp": 'replaced-dns'
    },
    "fr": {
        "encoding": "utf-8",
        "tagparfile": "french-utf8.par",
        "abbrevfile": "french-abbreviations-utf8",
        "pchar": ALONEMARKS + "'",
        "fchar": ALONEMARKS + "'",
        "pclictic": "[dcjlmnstDCJLNMST]'|[Qq]u'|[Jj]usqu'|[Ll]orsqu'",
        # NOTE(review): the leading apostrophe before -t-elles looks like a
        # leftover quote char; kept as-is, to be confirmed.
        "fclictic": "'-t-elles|-t-ils|-t-on|-ce|-elles|-ils|-je|-la|"
                    "-les|-leur|-lui|-mêmes|-memes|-même|-meme|-m'|-moi|-on|-toi|-tu|-t'|"
                    "-vous|-en|-y|-ci|-là|-la",
        "number": NUMBER_EXPRESSION,
        "dummysentence": "Cela est une phrase inutile pour assurer la "
                         "transmission des données .",
        "replurlexp": 'url-remplacée',
        "replemailexp": 'email-remplacé',
        # Stray trailing '>' removed: replacement tokens are plain words,
        # like in all other entries.
        "replipexp": 'ip-remplacée',
        "repldnsexp": 'dns-remplacé'
    },
    "de": {
        "encoding": "utf-8",
        "tagparfile": "german-utf8.par",
        "abbrevfile": "german-abbreviations-utf8",
        "pchar": ALONEMARKS + "'",
        "fchar": ALONEMARKS + "'",
        "pclictic": "",
        # NOTE(review): same value as the english clitics - to be confirmed.
        "fclictic": "'(s|re|ve|d|m|em|ll)|n't",
        "number": NUMBER_EXPRESSION,
        "dummysentence": "Das ist ein Testsatz um das Stossen der "
                         "daten sicherzustellen .",
        "replurlexp": 'replaced-url',
        "replemailexp": 'replaced-email',
        "replipexp": 'replaced-ip',
        "repldnsexp": 'replaced-dns'
    },
    "es": {
        "encoding": "utf-8",
        "tagparfile": "spanish-utf8.par",
        "abbrevfile": "spanish-abbreviations",
        "pchar": ALONEMARKS + "'",
        "fchar": ALONEMARKS + "'",
        "pclictic": "",
        "fclictic": "",
        "number": NUMBER_EXPRESSION,
        "dummysentence": "Quiero darle las gracias a usted y explicar un "
                         "malentendido .",
        # Stray trailing '>' removed: replacement tokens are plain words,
        # like in all other entries.
        "replurlexp": 'sustituir-url',
        "replemailexp": 'sustituir-email',
        "replipexp": 'sustituir-ip',
        "repldnsexp": 'sustituir-dns'
    },
}
# Other languages: TreeTagger can be called for them, but no specific
# pre-processing (chunking) is currently provided. Each one starts from a
# private copy of the '__base__' settings, then receives its parameter and
# abbreviations file names built from the language name.
for name, lang in (
        ('bulgarian', 'bg'),
        ('dutch', 'nl'),
        ('estonian', 'et'),
        ('finnish', 'fi'),
        ('galician', 'gl'),
        ('italian', 'it'),
        ('latin', 'la'),
        ('mongolian', 'mn'),
        ('polish', 'pl'),
        ('russian', 'ru'),
        ('slovak', 'sk'),
        ('swahili', 'sw')):
    ls = copy.deepcopy(g_langsupport['__base__'])
    if lang not in ('la', 'mn', 'sw'):
        # Files names carry an utf8 mark, encoding stays the base one.
        ls['tagparfile'] = name + '-utf8.par'
        ls['abbrevfile'] = name + '-abbreviations-utf8'
    else:
        # These three languages use latin-1 and unmarked files names.
        ls['encoding'] = 'latin-1'
        ls['tagparfile'] = name + '.par'
        ls['abbrevfile'] = name + '-abbreviations'
    g_langsupport[lang] = ls
# Dummy sentences, "C'est la fin ." (+google translate…) - in case someone
# tries to use the module for chunking an officially unsupported language.
for _code, _sentence in (
        ('bg', 'Това е края .'),
        ('nl', 'Dit is het einde .'),
        ('et', 'See on lõpuks .'),
        ('fi', 'Tämä on loppu .'),
        ('gl', 'Este é o final .'),
        ('it', 'Questa è la fine .'),
        ('la', 'Hoc est finis .'),
        ('mn', 'Энэ нь эцсийн байна .'),
        ('pl', 'To jest koniec .'),
        ('ru', 'Это конец .'),
        ('sk', 'To je koniec .'),
        ('sw', 'Hii ni mwisho .')):
    g_langsupport[_code]['dummysentence'] = _sentence
# Don't leave the loop names in the module namespace.
del _code, _sentence

# Clitics known for some of these languages.
g_langsupport['it']['pclictic'] = (
    "[dD][ae]ll'|[nN]ell'|[Aa]ll'|[lLDd]'|[Ss]ull'|"
    "[Qq]uest'|[Uu]n'|[Ss]enz'|[Tt]utt'"
)
g_langsupport['gl']['fclictic'] = "-la|-las|-lo|-los|-nos"
# We consider following rules to apply whatever be the language.

# ... is an ellipsis, put spaces around before splitting on spaces
# (make it a token)
ellipfind_re = re.compile(r"((?:\.\.\.)|…)")
ellipfind_subst = r" \1 "

# A regexp to put spaces if missing after alone marks.
punct1find_re = re.compile("([" + ALONEMARKS + "])([^ ])",
                           re.IGNORECASE | re.VERBOSE)
punct1find_subst = "\\1 \\2"

# A regexp to put spaces if missing before alone marks.
# Note: the set must be opened by a single [ char. A doubled [[ matches the
# same chars ([ is already within ALONEMARKS) but makes the re module raise
# a "Possible nested set" FutureWarning since Python 3.7.
punct2find_re = re.compile("([^ ])([" + ALONEMARKS + "])",
                           re.IGNORECASE | re.VERBOSE)
punct2find_subst = "\\1 \\2"

# A regexp to identify acronyms like U.S.A. or U.S.A (written to force
# at least two chars in the acronym, and the final dot optional).
# It matches any Unicode alphabetic char, not only a-z and A-Z, to allow
# diacritic marks in the acronym: [^\W\d_] is a word char which is neither
# a digit nor an underscore.
acronymexpr_re = re.compile(r"^[^\W\d_]+(\.[^\W\d_])+\.?$",
                            re.IGNORECASE | re.VERBOSE | re.UNICODE)
# ==============================================================================
class TreeTaggerError(Exception):
    """Exception class for errors generated directly by TreeTagger wrapper."""
# ==============================================================================
Tag = collections.namedtuple("Tag", ["word", "pos", "lemma"])
"""
A named tuple built by :func:`make_tags` to process :meth:`TreeTagger.tag_text`
output and get fields with meaning.
"""
NotTag = collections.namedtuple("NotTag", ["what"])
"""
A named tuple built by :func:`make_tags` when a TreeTagger output cannot
match a Tag.
"""
class FinalPart(object):
    """Wrapper around a final text, to avoid trying to analyze it again.

    :ivar text: the wrapped text, stored as given.
    """

    def __init__(self, text):
        self.text = text

    def __str__(self):
        # The wrapped text itself.
        return self.text

    def __repr__(self):
        # Representation of the wrapped text (not of the wrapper).
        wrapped = self.text
        return repr(wrapped)
# ==============================================================================
def pipe_writer(pipe, text, flushsequence, encoding, errors):
    """Write a text to a pipe and manage pre-post data to ensure flushing.

    For internal use.

    The text is written between the STARTOFTEXT and ENDOFTEXT markers, then
    followed by a dot token and by the flush sequence, and the pipe is
    flushed.

    If text is composed of byte strings, they are written as-is (ie. assume
    ad-hoc encoding is provided by caller). If it is composed of unicode
    strings, then they are converted to the specified encoding.

    Failures are not propagated to the caller: they are logged as errors
    (with the traceback) on the module logger.

    :param pipe: the Popen pipe on what to write the text.
    :type pipe: Popen object (file-like with write and flush methods)
    :param text: the text to write.
    :type text: string or list of strings
    :param flushsequence: lines of tokens to ensure flush by TreeTagger.
    :type flushsequence: string (with \\n between tokens)
    :param encoding: encoding of texts written on the pipe.
    :type encoding: str
    :param errors: how to manage encoding errors: strict/ignore/replace.
    :type errors: str
    """
    try:
        # Warn the user of possible bad usage.
        if not text:
            logger.warning("Requested to tag an empty text.")
            # We continue to unlock the thread waiting for the ENDOFTEXT on
            # TreeTagger output.

        # End of line as bytes, encoded once for all following writes.
        eol = "\n".encode(encoding, errors)

        logger.info("Writing starting part to pipe.")
        pipe.write((STARTOFTEXT + "\n").encode(encoding, errors))

        logger.info("Writing data to pipe.")
        if text:
            # (text, binary) types are the same as six.string_types under
            # Python 2, and also accept already encoded bytes under Python 3.
            if isinstance(text, (six.text_type, six.binary_type)):
                # Typically if called without pre-processing.
                if isinstance(text, six.text_type):
                    text = text.encode(encoding, errors)
                pipe.write(text)
                # text is now made of bytes: indexing bytes gives an int under
                # Python 3, so a text[-1] != '\n' test would always be true
                # and add an empty line after a text already ending with an
                # end of line. Compare bytes with bytes.
                if not text.endswith(eol):
                    pipe.write(eol)
            else:
                assert isinstance(text, list)
                # Typically when we have done pre-processing.
                for line in text:
                    if isinstance(line, six.text_type):
                        line = line.encode(encoding, errors)
                    pipe.write(line)
                    pipe.write(eol)

        logger.info("Writing ending and flushing part to pipe.")
        # The ending marker, a dot token and the flush sequence are sent in
        # one write, using the pipe encoding.
        pipe.write((ENDOFTEXT + "\n.\n" + flushsequence + "\n").encode(encoding, errors))
        pipe.flush()
        logger.info("Finished writing data to pipe. Pipe flushed.")
    except Exception:
        # Best effort: report the failure in the logs only. SystemExit and
        # KeyboardInterrupt are not Exception subclasses and go through.
        logger.error("Failure during pipe writing.", exc_info=True)
# ==============================================================================
class TreeTagger(object):
"""Wrap TreeTagger binary to optimize its usage on multiple texts.
The two main methods you may use are the :meth:`__init__` initializer,
and the :meth:`tag_text` method to process your data and get TreeTagger
output results.
"""
__internals_doc = """
:ivar lang: language to use for tagging.
:type lang: string
:ivar langsupport: dictionnary of language specific values (ref. to
g_langsupport[lang] dictionnary).
:type langsupport: dict
:ivar tagdir: path to directory of installation of TreeTagger.
Set via TAGDIR env. var or construction param, else
guess by :func:`locate_treetagger` function.
:type tagdir: string
:ivar tagbindir: path to binary dir into TreeTagger dir.
:type tagbindir: string
:ivar taglibdir: path to libraries dir into TreeTagger dir.
:type taglibdir: string
:ivar tagbin: path to TreeTagger binary file (used to launch process).
:type tagbin: string
:ivar tagopt: command line options for TreeTagger.
:type tagopt: string
:ivar tagparfile: path to TreeTagger library file.
:type tagparfile: string
:ivar abbrevfile: path to abbreviations file.
:type abbrevfile: string
:ivar taginencoding: encoding to use for TreeTagger input encoding.
:type taginencoding: str
:ivar tagoutencoding: encoding to use for TreeTagger output decoding.
:type tagoutencoding: str
:ivar taginencerr: management of encoding errors for TreeTagger input.
:type taginencerr: str
:ivar tagoutencerr: management of encoding errors for TreeTagger output.
:type tagoutencerr: str
:ivar abbterms: dictionnary of abbreviation terms for fast lookup.
Filled when reading abbreviations file.
:type abbterms: dict [ form ] ==> term
:ivar pchar: characters which have to be cut off at the beginning of
a word.
Filled from g_langsupport dict.
:type pchar: string
:ivar pchar_re: regular expression object to cut-off such chars.
:type pchar_re: SRE_Pattern
:ivar fchar: characters which have to be cut off at the end of a word.
Filled from g_langsupport dict.
:type fchar: string
:ivar fchar_re: regular expression object to cut-off such chars.
:type fchar_re: SRE_Pattern
:ivar pclictic: character sequences which have to be cut off at the
beginning of a word.
Filled from g_langsupport dict.
:type pclictic: string
:ivar pclictic_re: regular expression object to cut-off pclictic
sequences.
:type pclictic_re: SRE_Pattern
:ivar fclictic: character sequences which have to be cut off at the end
of a word.
Filled from g_langsupport dict.
:type fclictic: string
:ivar fclictic_re: regular expression object to cut-off fclictic
sequences.
:type fclictic_re: SRE_Pattern
:ivar number: regular expression of number recognition for the language.
Filled from g_langsupport dict.
:type number: string
:ivar number_re: regular expression object to identify numbers.
:type number_re: SRE_Pattern
:ivar dummysequence: just a small but complete sentence in the language.
Filled from g_langsupport dict.
:type dummysequence: string
:ivar replurlexp: regular expression subtitution string for URLs.
:type replurlexp: string
:ivar replemailexp: regular expression subtitution string for emails.
:type replemailexp: string
:ivar replipexp: regular expression subtitution string for IP addresses.
:type replipexp: string
:ivar repldnsexp: regular expression subtitution string for DNS names.
:type repldnsexp: string
:ivar tagpopen: TreeTagger process control tool.
:type tagpopen: Popen
:ivar taginput: pipe to write to TreeTagger input. Set when opening pipe.
:type taginput: write stream
:ivar tagoutput: pipe to read from TreeTagger input. Set whe opening
pipe.
:type tagoutput: read stream
:ivar taggerlock: synchronization tool for multuthread use of the object.
:type taggerlock: threading.Lock
:ivar chunkerproc: external function for chunking.
:type chunkerproc: fct(tagger, ['text']) => ['chunk']
"""
# --------------------------------------------------------------------------
def __init__(self, **kargs):
    """Construction of a wrapper for a TreeTagger process.

    You can specify several parameters at construction time.
    These parameters can be set via environment variables too
    (except for CHUNKERPROC).
    All of them have standard default values, even TAGLANG
    defaults to tagging english.

    :keyword TAGLANG: language code for texts ('en','fr',...)
                      (default to 'en').
    :type TAGLANG: string
    :keyword TAGDIR: path to TreeTagger installation directory.
    :type TAGDIR: string
    :keyword TAGOPT: options for TreeTagger
                     (default to '-token -lemma -sgml -quiet').
    :type TAGOPT: string
    :keyword TAGPARFILE: parameter file for TreeTagger.
                         (default available for supported languages).
                         Use value None to force use of default if
                         environment variable defines a value you don't
                         want to use.
    :type TAGPARFILE: string
    :keyword TAGABBREV: abbreviation file for preprocessing.
                        (default available for supported languages).
    :type TAGABBREV: string
    :keyword TAGINENC: encoding to use for TreeTagger input, default
                       to utf8.
    :type TAGINENC: str
    :keyword TAGOUTENC: encoding to use for TreeTagger output, default
                        to utf8
    :type TAGOUTENC: str
    :keyword TAGINENCERR: management of encoding errors for TreeTagger
                          input, strict or ignore or replace -
                          default to replace.
    :type TAGINENCERR: str
    :keyword TAGOUTENCERR: management of encoding errors for TreeTagger
                           output, strict or ignore or replace -
                           default to replace.
    :type TAGOUTENCERR: str
    :keyword CHUNKERPROC: function to call for chunking in place of
                          wrapper's chunking - default to None (use
                          standard chunking).
                          Take the TreeTagger object as
                          first parameter and a list of str to chunk as
                          second parameter. Must return a list of chunk str
                          (tokens).
                          Note that normal initialization of chunking
                          parameters is done even with an external chunking
                          function, so these parameters are available
                          for this function.
    :type CHUNKERPROC: fct(tagger, ['text']) => list ['chunk']
    :return: None
    :raises TreeTaggerError: if entries are still present in the keyword
                             parameters once the language, tagger and
                             preprocessor setup steps have run.
    """
    # Get data in different place, setup context for pre-processing and
    # processing.
    logger.debug("Using treetaggerwrapper.py from %s", osp.abspath(__file__))
    self._set_language(kargs)
    self._set_tagger(kargs)
    self._set_preprocessor(kargs)
    # Note: TreeTagger process is started later, when really needed.

    # NOTE(review): presumably each _set_* step removes the entries it
    # handles from kargs, so anything left here is an unrecognized
    # parameter - confirm against get_param.
    if kargs:
        badargs = ", ".join(sorted(kargs))
        logger.error("Unknown TreeTagger() parameters: %s", badargs)
        raise TreeTaggerError("Unknown TreeTagger() parameters: %s" % (badargs,))
# -------------------------------------------------------------------------
def _set_language(self, kargs):
    """Select the tagging language and its language support data.

    Internal use.

    The language code is resolved with ``get_param("TAGLANG", kargs, "en")``
    and stored in ``self.lang``; the matching ``g_langsupport`` entry is
    stored in ``self.langsupport``.

    :param kargs: keyword parameters given to the constructor.
    :type kargs: dict
    :raises TreeTaggerError: if the language code is not a key of
                             g_langsupport.
    """
    # ----- Find language to tag.
    lang = get_param("TAGLANG", kargs, "en")
    self.lang = lang

    if lang in g_langsupport:
        logger.info("lang=%s", lang)
        self.langsupport = g_langsupport[lang]
        return

    # Unknown code: report it along with the list of accepted codes.
    supported = ', '.join(sorted(g_langsupport))
    logger.error("Language %s not supported - allowed: %s",
                 lang, supported)
    message = "Unsupported language code: " + lang + ". allowed: " + supported
    raise TreeTaggerError(message)
# -------------------------------------------------------------------------
def _set_tagger(self, kargs):
"""Set tagger paths, files, and options.
Internal use.
"""
self.taggerlock = threading.Lock()
# ----- Find TreeTagger directory.
self.tagdir = get_param("TAGDIR", kargs, None)
if self.tagdir is None:
founddir = locate_treetagger()
if founddir:
self.tagdir = founddir
else:
logger.error("Can't locate TreeTagger directory (and "
"no TAGDIR specified).")
raise TreeTaggerError("Can't locate TreeTagger directory (and "
"no TAGDIR specified).")
self.tagdir = os.path.abspath(self.tagdir)
if not os.path.isdir(self.tagdir):
logger.error("Bad TreeTagger directory: %s", self.tagdir)
raise TreeTaggerError("Bad TreeTagger directory: " + self.tagdir)
logger.info("tagdir=%s", self.tagdir)
# ----- Set subdirectories.
self.tagbindir = os.path.join(self.tagdir, "bin")
self.taglibdir = os.path.join(self.tagdir, "lib")
# ----- Set binary by platform.
if ON_WINDOWS:
self.tagbin = os.path.join(self.tagbindir, "tree-tagger.exe")
elif ON_MACOSX or ON_POSIX:
self.tagbin = os.path.join(self.tagbindir, "tree-tagger")
else:
logger.error("TreeTagger binary name undefined for platform %s",