diff --git a/.travis.yml b/.travis.yml index 9d453856..7c592a73 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,10 +1,10 @@ sudo: required -dist: trusty +dist: bionic language: python python: - "2.7" - - "3.4" + - "3.7" # install system dependencies here with apt-get. before_install: @@ -13,15 +13,24 @@ before_install: # install python dependencies including this package in the travis # virtualenv install: - - ./provision/python.sh - - pip install . + + - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]]; + then ./provision/python3.sh; + fi + - if [[ $TRAVIS_PYTHON_VERSION == 2.7 ]]; + then ./provision/python2.sh; + fi + - pip install .[pocketsphinx] # commands to run the testing suite. if any of these fail, travic lets us know script: - cd tests && make && cd - - nosetests --with-coverage --cover-package=textract - - pep8 textract/ bin/textract - - cd docs && make html && cd - + - cd tests && pytest && cd - + - pycodestyle textract/ bin/textract + - if [[ $TRAVIS_PYTHON_VERSION == 3.7 ]]; + then cd docs && make html && cd -; + fi # commands to run after the tests successfully complete after_success: diff --git a/README.rst b/README.rst index 7868055b..c12e8217 100644 --- a/README.rst +++ b/README.rst @@ -24,8 +24,8 @@ Extract text from any document. No muss. No fuss. .. |Downloads| image:: https://img.shields.io/pypi/dm/textract.svg :target: https://warehouse.python.org/project/textract/ -.. |Test Coverage| image:: https://coveralls.io/repos/deanmalmgren/textract/badge.png - :target: https://coveralls.io/r/deanmalmgren/textract +.. |Test Coverage| image:: https://coveralls.io/repos/github/deanmalmgren/textract/badge.svg?branch=master + :target: https://coveralls.io/github/deanmalmgren/textract?branch=master .. 
|Documentation Status| image:: https://readthedocs.org/projects/textract/badge/?version=latest :target: https://readthedocs.org/projects/textract/?badge=latest diff --git a/bin/textract b/bin/textract old mode 100755 new mode 100644 index b915e0d3..15ed4d13 --- a/bin/textract +++ b/bin/textract @@ -29,4 +29,5 @@ def main(): else: args.output.write(output) + main() diff --git a/docs/conf.py b/docs/conf.py index 04fd64f4..8bf0238b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -58,7 +58,7 @@ # built documents. # # The short X.Y version. -release = version = "1.6.1" +release = version = "1.6.3" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -132,7 +132,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['.static'] +html_static_path = [] # Add any extra paths that contain custom files (such as robots.txt or # .htaccess) here, relative to this directory. These files are copied diff --git a/docs/index.rst b/docs/index.rst index a160cc7c..156ddd21 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -74,7 +74,7 @@ file types by either mentioning them on the `issue tracker * ``.pptx`` via `python-pptx`_ -* ``.ps`` via `ps2text`_ +* ``.ps`` via `ps2ascii`_ * ``.rtf`` via `unrtf`_ @@ -96,7 +96,7 @@ file types by either mentioning them on the `issue tracker .. _pdfminer.six: https://github.com/goulu/pdfminer .. _pdftotext: http://poppler.freedesktop.org/ .. _pocketsphinx: https://github.com/cmusphinx/pocketsphinx/ -.. _ps2text: http://pages.cs.wisc.edu/~ghost/doc/pstotext.htm +.. _ps2ascii: https://www.ghostscript.com/doc/current/Use.htm .. _python-docx2txt: https://github.com/ankushshah89/python-docx2txt .. _python-pptx: https://python-pptx.readthedocs.org/en/latest/ .. 
_SpeechRecognition: https://pypi.python.org/pypi/SpeechRecognition/ diff --git a/provision/python.sh b/provision/python2.sh similarity index 91% rename from provision/python.sh rename to provision/python2.sh index 978370ca..d960b630 100755 --- a/provision/python.sh +++ b/provision/python2.sh @@ -12,4 +12,4 @@ fi pip install -U pip # Install the requirements for this package as well as this module. -pip install -r requirements/python-dev +pip install -r requirements/python-dev2 diff --git a/provision/python3.sh b/provision/python3.sh new file mode 100755 index 00000000..3c8f913d --- /dev/null +++ b/provision/python3.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +# This needs to work for vagrant, Travis builds, and Docker builds. +# in a python virtualenv. in the virtual machine provisioning, +# we're passing the directory this should be run from. in travis-ci, +# its run from the root of the repository. +if [ "$#" -eq 1 ]; then + cd $1 +fi + +# upgrade pip so we can use wheel downloads +pip install -U pip + +# Install the requirements for this package as well as this module. +pip install -r requirements/python-dev3 +pip install -r requirements/python-doc diff --git a/requirements/debian b/requirements/debian index 5e9c9aa0..b69b6763 100644 --- a/requirements/debian +++ b/requirements/debian @@ -1,6 +1,7 @@ # required packages gcc libpulse-dev +libasound2-dev libjpeg-dev build-essential git @@ -9,6 +10,7 @@ make # these packages are required by python-docx, which depends on lxml # and requires these things python-dev +python-pip libxml2-dev libxslt1-dev @@ -19,7 +21,7 @@ antiword unrtf # parse image files -tesseract-ocr=3.03\* +tesseract-ocr libjpeg-dev # parse pdfs diff --git a/requirements/python b/requirements/python index 300baaf9..01b76109 100644 --- a/requirements/python +++ b/requirements/python @@ -2,7 +2,7 @@ # package in order for it to properly work. 
argcomplete==1.10.0 -beautifulsoup4==4.7.1 +beautifulsoup4==4.8.0 chardet==3.0.4 docx2txt==0.8 EbookLib==0.17.1 diff --git a/requirements/python-dev b/requirements/python-dev2 similarity index 87% rename from requirements/python-dev rename to requirements/python-dev2 index 774ad8eb..8965d435 100644 --- a/requirements/python-dev +++ b/requirements/python-dev2 @@ -3,14 +3,14 @@ # documentation builds (python-doc) -r python --r python-doc # needed for tests/run.py script to read .travis.yml file -coveralls==1.8.1 +coveralls==1.8.2 nose==1.3.7 -pep8==1.7.1 +pycodestyle==2.5.0 PyYAML==5.1.1 requests==2.22.0 +pytest==4.6 # needed for managing versions bumpversion==0.5.3 diff --git a/requirements/python-dev3 b/requirements/python-dev3 new file mode 100644 index 00000000..dd93d283 --- /dev/null +++ b/requirements/python-dev3 @@ -0,0 +1,16 @@ +# This includes all packages that are used in development, including all +# packages that are required by textract itself (python), packages for +# documentation builds (python-doc) + +-r python + +# needed for tests/run.py script to read .travis.yml file +coveralls==1.8.2 +nose==1.3.7 +pycodestyle==2.5.0 +PyYAML==5.1.1 +pytest==5.0.1 +requests==2.22.0 + +# needed for managing versions +bumpversion==0.5.3 diff --git a/setup.cfg b/setup.cfg index 44cb1a5e..2136f59d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.6.2 +current_version = 1.6.3 commit = True tag = True diff --git a/setup.py b/setup.py index dd32a29e..2a8dbb99 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ def parse_requirements(requirements_filename): setup( name=textract.__name__, - version="1.6.2", + version="1.6.3", description="extract text from any document. no muss. 
no fuss.", long_description=long_description, url=github_url, diff --git a/tests/Makefile b/tests/Makefile index 42ea9ec9..1de1daed 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -11,7 +11,8 @@ TARGETS = pdf/ocr_text.txt \ png/raw_text.txt png/standardized_text.txt \ gif/raw_text.txt gif/standardized_text.txt \ jpg/raw_text.txt jpg/standardized_text.txt \ - tiff/raw_text.txt tiff/standardized_text.txt + tiff/raw_text.txt tiff/standardized_text.txt \ + ps/raw_text.txt all: $(TARGETS) @@ -27,6 +28,9 @@ pdf/ocr_text.txt: pdf/ocr_text.pdf cat pdf-ocr-text*.txt > $@ rm -f pdf-ocr-text* +ps/raw_text.txt: ps/raw_text.ps + ps2ascii $< > $@ + # simple pattern rule for creating standard issue tesseract files for different # fileypes. the `g` shell variable is the path to the file without the # extension (e.g. g=png/raw_text) diff --git a/tests/pdf/two_column.txt b/tests/pdf/two_column.txt index b2bf7761..c9933b4c 100644 --- a/tests/pdf/two_column.txt +++ b/tests/pdf/two_column.txt @@ -4,61 +4,61 @@ LETTERS -The role of mentorship in prote´ge´ performance -R. Dean Malmgren1,2, Julio M. Ottino1,3 & Luı´s A. Nunes Amaral1,3,4 +The role of mentorship in protégé performance +R. Dean Malmgren1,2, Julio M. Ottino1,3 & Luı́s A. Nunes Amaral1,3,4 -The role of mentorship in prote´ge´ performance is a matter of import- remains an open question. Indeed, we are unaware of any studies that +The role of mentorship in protégé performance is a matter of import- remains an open question. Indeed, we are unaware of any studies that ance to academic, business and governmental organizations. systematically track mentorship success over the entire career of a -Although the benefits of mentorship for prote´ge´s, mentors and their mentor, so the validity of the rising-star hypothesis has yet to be fully -organizations are apparent1–9, the extent to which prote´ge´s mimic explored. 
Here we investigate whether prote´ge´s acquire the mentor- +Although the benefits of mentorship for protégés, mentors and their mentor, so the validity of the rising-star hypothesis has yet to be fully +organizations are apparent1–9, the extent to which protégés mimic explored. Here we investigate whether protégés acquire the mentor- their mentors’ career choices and acquire their mentorship skills is ship skills of their mentors, by studying mentorship fecundity, that is, -unclear10–16. The importance of a science, technology, engineering the number of prote´ge´s that a mentor trains over the course of their +unclear10–16. The importance of a science, technology, engineering the number of protégés that a mentor trains over the course of their and mathematics workforce to economic growth and the role of career. This measure is advantageous as it directly measures an out- effective mentorship in maintaining a ‘healthy’ such workforce come of the mentorship process that is relevant to sustained mentor- demand the study of the role of mentorship in academia. Here we ship, allowing us to quantify the degree to which mentor fecundity -investigate one aspect of mentor emulation by studying mentorship determines prote´ge´ fecundity. -fecundity—the number of prote´ge´s a mentor trains—using data Scientific mentorship offers a unique opportunity to study this +investigate one aspect of mentor emulation by studying mentorship determines protégé fecundity. +fecundity—the number of protégés a mentor trains—using data Scientific mentorship offers a unique opportunity to study this from the Mathematics Genealogy Project17, which tracks the mentor- question because there is a structured mentorship environment ship record of thousands of mathematicians over several centuries. between advisor and student that is, in principle, readily accessible18,19. 
We demonstrate that fecundity among academic mathematicians is We study a prototypical mentorship network collected from the correlated with other measures of academic success. We also find Mathematics Genealogy Project17, which aggregates the graduation -that the average fecundity of mentors remains stable over 60 years of date, mentor and prote´ge´s of 114,666 mathematicians from as early +that the average fecundity of mentors remains stable over 60 years of date, mentor and protégés of 114,666 mathematicians from as early recorded mentorship. We further discover three significant correla- as 1637. This database is unique in its scope and coverage, tracking the tions in mentorship fecundity. First, mentors with low mentorship career-long mentorship record of a large population of mentors in a -fecundities train prote´ge´s that go on to have mentorship fecundities single discipline (see the MPACT Project (http://ils.unc.edu/mpact/) +fecundities train protégés that go on to have mentorship fecundities single discipline (see the MPACT Project (http://ils.unc.edu/mpact/) 37% higher than expected. Second, in the first third of their careers, for a smaller database of theses on information and library sciences -mentors with high fecundities train prote´ge´s that go on to have and references therein). From this information, we construct a net- +mentors with high fecundities train protégés that go on to have and references therein). From this information, we construct a net- fecundities 29% higher than expected. Finally, in the last third of work in which links are formed from a mentor to each of his k pro- -their careers, mentors with high fecundities train prote´ge´s that go on te´ge´s, where k denotes mentorship fecundity. We focus here on the +their careers, mentors with high fecundities train protégés that go on tégés, where k denotes mentorship fecundity. We focus here on the to have fecundities 31% lower than expected. 
7,259 mathematicians who graduated between 1900 and 1960, because - A large literature supports the hypothesis that prote´ge´s and mentors their mentorship record is the most reliable (Methods). -benefit from the mentoring relationship1,2. Prote´ge´s that receive career Although the mentorship records gathered from the Mathematics + A large literature supports the hypothesis that protégés and mentors their mentorship record is the most reliable (Methods). +benefit from the mentoring relationship1,2. Protégés that receive career Although the mentorship records gathered from the Mathematics coaching and social support, for instance, are reportedly more likely to Genealogy Project provide the most comprehensive data source avail- have high performance ratings, a higher salary and receive promo- able for the study of academic performance throughout a mathemati- tions1,3. In return, mentors receive fulfilment not only by altruistically cian’s career, there are obviously other plausible metrics for evaluating -improving the welfare of their prote´ge´s, but also by improving their own academic performance20–22. We have also compared the mentorship -welfare4,5,10. Organizations benefit as well, because prote´ge´s are more data against a list of publications for 4,447 mathematicians and a list of +improving the welfare of their protégés, but also by improving their own academic performance20–22. We have also compared the mentorship +welfare4,5,10. Organizations benefit as well, because protégés are more data against a list of publications for 4,447 mathematicians and a list of likely to be committed to their organization6,7 and to exhibit organiza- 269 inductees into the US National Academy of Sciences (NAS; tional citizenship behaviour6. These benefits are not obtained only Methods). We find that mentorship fecundity is much larger for -through the traditional dyadic mentor–prote´ge´ relationship, but also NAS members than for non-NAS members (Fig. 1a). 
We further find -through peer relationships that supplement prote´ge´ development8,9. that the number of publications is strongly correlated with fecundity, +through the traditional dyadic mentor–protégé relationship, but also NAS members than for non-NAS members (Fig. 1a). We further find +through peer relationships that supplement protégé development8,9. that the number of publications is strongly correlated with fecundity, The benefits of mentorship underscore the importance of under- regardless of whether or not a mathematician is an NAS member standing how mentors were in turn trained to foster the development (Fig. 1b). These results demonstrate that although fecundity is not a -of outstanding mentors. It might be suspected that prote´ge´s learn typical measure of academic performance, it is closely related to other +of outstanding mentors. It might be suspected that protégés learn typical measure of academic performance, it is closely related to other managerial approaches and motivational techniques from their men- measures of academic success. Thus, even though our investigation -tors and, as a result, emulate their mentorship methodologies; this concerns how fecundity is correlated between mentor and prote´ge´, our +tors and, as a result, emulate their mentorship methodologies; this concerns how fecundity is correlated between mentor and protégé, our suggests that outstanding mentors are trained by other outstanding results also address questions in the academic evaluation literature mentors. This possibility is sometimes formalized as the rising-star concerning the success of a mathematician. 
hypothesis11,12; it postulates that mentors select up-and-coming pro- We first investigate whether it is possible to predict the fecundity of -te´ge´s on the basic of their perceived ability and potential and past a mathematician by modelling the empirical fecundity distribution, +tégés on the basic of their perceived ability and potential and past a mathematician by modelling the empirical fecundity distribution, performance10,13,14, including promotion history and proactive career p(kjt), as a function of graduation year, t. Considering that some -behaviours12. Rising-star prote´ge´s are reportedly more likely to mathematicians remain in academia throughout their careers whereas +behaviours12. Rising-star protégés are reportedly more likely to mathematicians remain in academia throughout their careers whereas intend to mentor, resulting in a ‘perpetual cycle’ of rising-star pro- others spend only a portion of their careers in academia, it might be -te´ge´s that emulate their mentors by seeking other rising stars as their expected that there are two types of individual when it comes to -prote´ge´s15. academic mentorship fecundity—‘haves’ and ‘have-nots’—in the +tégés that emulate their mentors by seeking other rising stars as their expected that there are two types of individual when it comes to +protégés15. academic mentorship fecundity—‘haves’ and ‘have-nots’—in the However, there is conflicting evidence concerning the rising-star sense that mathematicians from these types respectively have or have -hypothesis16, so the extent to which prote´ge´s mimic their mentors not had the opportunity to mentor students throughout their career. +hypothesis16, so the extent to which protégés mimic their mentors not had the opportunity to mentor students throughout their career. 1 Department of Chemical and Biological Engineering, Northwestern University, Evanston, Illinois 60208, USA. 2Datascope Analytics, Evanston, Illinois 60201, USA. 
3Northwestern Institute on Complex Systems, Northwestern University, Evanston, Illinois 60208, USA. 4Howard Hughes Medical Institute, Northwestern University, Evanston, Illinois 60208, USA. @@ -128,9 +128,9 @@ mentorship fecundity, for NAS members (red) and non-NAS members non-NAS members for all fecundity levels. Error bars, 1 s.e. 0.0 1900 1920 1940 1960 1980 2000 -If each mentor chooses to train a new academic prote´ge´ with Graduation year, t +If each mentor chooses to train a new academic protégé with Graduation year, t -probability jh or jhn, and stops training academic prote´ge´s otherwise, Figure 2 | Evolution of the fecundity distribution. a–c, Cumulative +probability jh or jhn, and stops training academic protégés otherwise, Figure 2 | Evolution of the fecundity distribution. a–c, Cumulative depending on whether they are a ‘have’ or, respectively, a ‘have-not’, distribution of the fecundity of mathematicians that graduated during 1910 then we would expect that the resulting fecundity distribution is a (a), 1930 (b) and 1950 (c) (symbols), compared with the best-estimate mixture of two discrete exponential distributions predictions of a mixture of two discrete exponentials (lines). Monte Carlo @@ -151,7 +151,7 @@ rejected as a candidate description of the fecundity distribution p(kjt) Discussion and Supplementary Fig. 1. As might be expected, the probability, ph, that an individual is a generated from uncorrelated branching processes in our investigation ‘have’ experiences drastic changes over time as a result of historical of the mathematician genealogy network. Here graduation date is -events, such as the First and Second World Wars, the beginning of the equivalent to birth date and mentors and prote´ge´s are equivalent to +events, such as the First and Second World Wars, the beginning of the equivalent to birth date and mentors and protégés are equivalent to Cold War and considerable increases in academic funding (Fig. 2d). 
parents and children, respectively. In contrast, the average fecundities of ‘haves’ and ‘have-nots’ do not In a branching process24, a parent p, born at time tp, has kp children. exhibit systematic historical changes, suggesting that these quantities Child c of parent p is born at time tc and subsequently has kc children. @@ -170,8 +170,8 @@ lighted a fundamental property of mentorship among mathematicians, it is not predictive of the behaviour of individual mathematicians in the age difference, tc 2 tp, as parent–child pairs in the empirical network sense that fecundity, according to this model, is a random variable (Fig. 3c). All other attributes of these networks are randomized using a drawn from the distribution in equation (1). We next test whether link-switching algorithm25,26 (Methods), so neither of these random- -prote´ge´s mimic the mentorship fecundity of their mentors, by com- network ensembles introduces correlations between parent fecundity -paring prote´ge´ fecundity with a suitable null model that does not and child fecundity or temporal correlations in fecundity. They there- +protégés mimic the mentorship fecundity of their mentors, by com- network ensembles introduces correlations between parent fecundity +paring protégé fecundity with a suitable null model that does not and child fecundity or temporal correlations in fecundity. They there- introduce correlations in fecundity. As in the study of genealogical fore provide a suitable basis for comparison with the mathematician trees, we perform comparisons of the empirical data with networks genealogy network. 623 @@ -191,7 +191,7 @@ trees, we perform comparisons of the empirical data with networks W. Tollmien the regressions between the Ækcæ z-score (Methods) and tc 2 tp deviate significantly (Fig. 4c and Supplementary Fig. 4c) from this expectation for both random ensembles, to reveal three distinct features. First, - H. D. 
Kloosterman mentors with kp , 3 train prote´ge´s that go on to have mentorship + H. D. Kloosterman mentors with kp , 3 train protégés that go on to have mentorship b fecundities 37% higher than expected throughout their careers. Second, in the first third of their careers, mentors with kp $ 10 train Generating ensemble I @@ -199,13 +199,13 @@ trees, we perform comparisons of the empirical data with networks - prote´ge´s that go on to have fecundities 29% higher than expected. + protégés that go on to have fecundities 29% higher than expected. Finally, in the last third of their careers, mentors with kp $ 10 train - prote´ge´s that go on to have fecundities 31% lower than expected. - The fact that mentors with k , 3 train prote´ge´s with higher-than- + protégés that go on to have fecundities 31% lower than expected. + The fact that mentors with k , 3 train protégés with higher-than- expected fecundities throughout their careers is somewhat counter- intuitive. From the rising-star hypothesis11,12, it might be expected - that prote´ge´s trained by mentors with k , 3 are likely to mimic their + that protégés trained by mentors with k , 3 are likely to mimic their mentors and therefore have lower-than-expected fecundities. Our results demonstrate that this is not the case. One possible explanation c @@ -216,23 +216,23 @@ trees, we perform comparisons of the empirical data with networks - torship experience for their prote´ge´s. An alternative hypothesis is that - mentors with k , 3 select for, or are selected by, prote´ge´s that have a + torship experience for their protégés. An alternative hypothesis is that + mentors with k , 3 select for, or are selected by, protégés that have a greater aptitude for mentorship. The striking temporal correlations for mentors with kp $ 10 are also intriguing. Because mentors with kp $ 10 represent the upper echelon of mentors in mathematics, these mentors were probably ‘rising stars’ early in their academic careers. 
The fact that these men- - tors train prote´ge´s with high fecundities early in their careers sup- + tors train protégés with high fecundities early in their careers sup- 1920 1930 1940 1950 1960 1970 ports the rising-star hypothesis. - Graduation year, t By the end of these mentors’ careers, however, their prote´ge´s have + Graduation year, t By the end of these mentors’ careers, however, their protégés have lower-than-expected fecundities. Perhaps mentors, who ultimately Figure 3 | Branching process null models. a, Subset of the mathematician have high fecundities, spend fewer and fewer resources training each genealogy network. Mentors/parents (black circles) are connected to each of - of their prote´ge´s as their careers progress. Alternatively, prote´ge´s with -their prote´ge´s/children (white circles). The horizontal positions of + of their protégés as their careers progress. Alternatively, protégés with +their protégés/children (white circles). The horizontal positions of mathematicians represent their graduation/birth dates, t. The bottom two high mentorship fecundity aspirations might court prolific mentors -parents were born in 1924, the top two parents were born in 1937, and all early in their mentors’ careers whereas prote´ge´s with low fecundity +parents were born in 1924, the top two parents were born in 1937, and all early in their mentors’ careers whereas protégés with low fecundity four parents have a child born in 1958. From a parent’s perspective, three aspirations might court prolific mentors later in their mentors’ careers. essential features of the empirical network must be preserved in random Our findings therefore reveal interesting nuances to the rising-star networks generated from the two branching process null models: the birth hypothesis. @@ -242,8 +242,8 @@ lines highlight the links in the empirical network whose end points can be randomized. 
Dashed red lines illustrate one of the possible randomization Anecdotally, mathematicians are thought to perform their best work moves after switching the corresponding pair of links. We note that the age at a young age27, a perception that may influence how mentors and -difference between parent and child is not preserved. c, Random networks prote´ge´s choose each other. Perceptions in other domains, however, -from ensemble II preserve the three essential features as well as the age may differ and subsequently influence mentor and prote´ge´ selection in +difference between parent and child is not preserved. c, Random networks protégés choose each other. Perceptions in other domains, however, +from ensemble II preserve the three essential features as well as the age may differ and subsequently influence mentor and protégé selection in difference between parent and child. Solid blue lines of the same colour different ways. As data for other academic disciplines18,19, business and highlight the links in the empirical network whose end points can be the government becomes available, it will be important to determine randomized. Dashed blue lines illustrate one of the possible randomization whether temporal correlations in fecundity are a general consequence @@ -251,17 +251,17 @@ moves after switching the corresponding pair of links. Random networks for each ensemble are generated by attempting 100 switches per link (Methods). mathematicians in academia. To explore the influence of mentor fecundity and age difference on Regardless, our results offer another means of judging academic -prote´ge´ fecundity, we partition prote´ge´s according to the fecundity of impact in science as well as the impact of managers on their employ- -their mentors and the age difference between mentor and prote´ge´, ees, both of which are notoriously complicated and risky affairs. 
+protégé fecundity, we partition protégés according to the fecundity of impact in science as well as the impact of managers on their employ- +their mentors and the age difference between mentor and protégé, ees, both of which are notoriously complicated and risky affairs. tc 2 tp. Given our findings (Supplementary Discussion and Sup- These assessments are multidimensional, metrics and expectations plementary Figs 2 and 3), it is clear that age differences affect fecundity are domain dependent, and placement of creative output, timescales -in a nonrandom manner for prote´ge´s whose mentors have kp , 3. We of impact and recognition vary significantly from field to field. -partition the remaining prote´ge´s, whose mentors have kp $ 3, into two Ultimately, the assessment of individuals for awards and promotion -groups: prote´ge´s whose mentors are below-average ‘haves’ is based on painstaking individual analysis by selection committees -(3 # kp , 10) and prote´ge´s whose mentors are above-average ‘haves’ and peers. Although these committees may have varying goals and -(kp $ 10). We then partition these three groups of prote´ge´s according incentives, it is important that collective arguments—the kind of +in a nonrandom manner for protégés whose mentors have kp , 3. We of impact and recognition vary significantly from field to field. +partition the remaining protégés, whose mentors have kp $ 3, into two Ultimately, the assessment of individuals for awards and promotion +groups: protégés whose mentors are below-average ‘haves’ is based on painstaking individual analysis by selection committees +(3 # kp , 10) and protégés whose mentors are above-average ‘haves’ and peers. Although these committees may have varying goals and +(kp $ 10). We then partition these three groups of protégés according incentives, it is important that collective arguments—the kind of to when they graduated during their mentors’ careers. 
Specifically, arguments we are making here—be based on sound quantitative -we split each group of prote´ge´s into terciles, the most fine-grained analysis. Although the extent to which our findings extrapolate to +we split each group of protégés into terciles, the most fine-grained analysis. Although the extent to which our findings extrapolate to grouping that still gives us sufficient power to examine the statistical other domains may vary, we are confident that the kind of analysis 624 ©2010 Macmillan Publishers Limited. All rights reserved @@ -323,8 +323,8 @@ grouping that still gives us sufficient power to examine the statistical –0.1 0.0 0.1 –0.1 0.0 0.1 –0.1 0.0 0.1 Slope -Figure 4 | Effect of age difference between mentor and prote´ge´, tc 2 tp, on black line; slope and intercept as shown). The regression lines for networks -prote´ge´ fecundity. a, Fecundity distribution of children born during the from our null model (grey lines) vary around the expectation of our null +Figure 4 | Effect of age difference between mentor and protégé, tc 2 tp, on black line; slope and intercept as shown). The regression lines for networks +protégé fecundity. a, Fecundity distribution of children born during the from our null model (grey lines) vary around the expectation of our null 1910s (for which the average fecundity was 1.4) to parents with kp , 3, model (dashed black line). c, Significance of linear regressions in b. We 3 # kp , 10 and kp $ 10, compared with the expectation from ensemble I compare the slope and intercept of the empirical regression (black circle) (grey line). We separate children into terciles (early, middle, late) according with the distribution of the slope and intercept of the same quantities @@ -344,7 +344,7 @@ managerial impact. mixture of discrete exponential distributions that has finite variance. Given a set METHODS SUMMARY of child fecundities, Kc 5 {kc}, we quantify how significantly a subset of these Data acquisition. 
We use data from the Mathematics Genealogy Project17 to child fecundities, Kc* , Kc, deviates from Kc by measuring the z-score of Ækcæ, the -identify the 7,259 prote´ge´ mathematicians that are in the giant component28 and average child fecundity of all nodes within the subset Kc*, compared with Ækcæs, +identify the 7,259 protégé mathematicians that are in the giant component28 and average child fecundity of all nodes within the subset Kc*, compared with Ækcæs, graduated between 1900 and 1960, of which 4,447 have linked publication the average child fecundity computed for children within a subset equivalent to records through the American Mathematical Society’s research database Kc* in the synthetic networks. That is, we compute z 5 (Ækcæ 2 m)/s, where m is MathSciNet. We use a text-matching algorithm29 to semi-automatically match the ensemble average of {Ækcæs} and s is the standard deviation of the ensemble @@ -389,9 +389,9 @@ link in the class25,26. 537–562 (1995). 12. Singh, R., Ragins, B. R. & Tharenou, P. Who gets a mentor? A longitudinal Supplementary Information is linked to the online version of the paper at assessment of the rising star hypothesis. J. Vocat. Behav. 74, 11–17 (2009). www.nature.com/nature. -13. Allen, T. D., Poteet, M. L. & Russell, J. E. A. Prote´ge´ selection by mentors: what - makes the difference? J. Organ. Behav. 21, 271–282 (2000). Acknowledgements We thank R. Guimera`, P. McMullen, A. Pah, M. Sales-Pardo, -14. Allen, T. D. Prote´ge´ selection by mentors: contributing individual and E. N. Sawardecker, D. B. Stouffer and M. J. Stringer for comments and suggestions. +13. Allen, T. D., Poteet, M. L. & Russell, J. E. A. Protégé selection by mentors: what + makes the difference? J. Organ. Behav. 21, 271–282 (2000). Acknowledgements We thank R. Guimerà, P. McMullen, A. Pah, M. Sales-Pardo, +14. Allen, T. D. Protégé selection by mentors: contributing individual and E. N. Sawardecker, D. B. Stouffer and M. J. 
Stringer for comments and suggestions. organizational factors. J. Vocat. Behav. 65, 469–483 (2004). L.A.N.A. gratefully acknowledges the support of US National Science Foundation 15. Ragins, B. R. & Scandura, T. A. Burden or blessing? Expected costs and benefits of awards SBE 0830388 and IIS 0838564. All figures were generated using being a mentor. J. Organ. Behav. 20, 493–509 (1999). PYGRACE (http://pygrace.sourceforge.net) with colour schemes from @@ -420,11 +420,11 @@ Mathematics Genealogy Project data. We study a prototypical mentorship network collected from the Mathematics Genealogy Project17, which aggregates bution, p(kjt). Next, we generate a synthetic fecundity distribution, ps(k), from the graduation dates, mentors and advisees of 114,666 mathematicians from as model M(Ht) using the best-estimate parameters, ht, and we treat the synthetic early as 1637. From this information, we construct a mathematician genealogy data exactly the same as we treated the empirical data: first, we calculate the best- -network in which links are formed from a mentor to each of his or her k prote´ge´s. estimate parameters, Hs, for model M from maximum-likelihood estimation; +network in which links are formed from a mentor to each of his or her k protégés. estimate parameters, Hs, for model M from maximum-likelihood estimation; The data collected by the Mathematics Genealogy Project are self-reported, so second, we compute the test statistic, Ss, between the model M(Hs) and the there is no guarantee that the observed genealogy network is a complete descrip- synthetic fecundity distribution, ps(k). We generate synthetic fecundity distribu- tion of the mentorship network. In fact, 16,147 mathematicians do not have a tions and their corresponding synthetic test statistics until we accumulate an -recorded mentor and, of these, 8,336 do not have any recorded prote´ge´s. To ensemble of 1,000 Monte Carlo test statistics, {Ss}. 
Finally, we calculate a two- +recorded mentor and, of these, 8,336 do not have any recorded protégés. To ensemble of 1,000 Monte Carlo test statistics, {Ss}. Finally, we calculate a two- avoid having these mathematicians distort our analysis, we restrict our analysis tailed P value with a precision of 0.001. As is customary in hypothesis testing, we to the 90,211 mathematicians that comprise the giant component28 of the net- reject the model M at time t if the P value is less than a threshold value. We select a work; that is, we restrict our analysis to the largest set of connected mathema- P-value threshold of 0.05; that is, if less than 5% of the synthetic data sets exhibit @@ -436,9 +436,9 @@ records are representative of the evolution of the network. For example, before Furthermore, because mathematicians often have mentorship careers lasting statistic whereby we bin p(kjt) such that each bin has at least one expected 50 years or more, we are not guaranteed to have complete mentorship records observation according to the model M(Ht). This binning prevents observations for mathematicians who graduated after 1960. We therefore restrict our analysis that are exceptionally rare from dominating our statistical test and skewing our -to the 7,259 prote´ge´ mathematicians who graduated between 1900 and 1960, for results. +to the 7,259 protégé mathematicians who graduated between 1900 and 1960, for results. whom we believe that the graduation and mentorship record is the most reliable. Random-network generation. We use the Markov chain Monte Carlo algo- -MathSciNet data. Of the 7,259 prote´ge´ mathematicians that graduated between rithm25,26 to build random networks from the mathematician genealogy net- +MathSciNet data. Of the 7,259 protégé mathematicians that graduated between rithm25,26 to build random networks from the mathematician genealogy net- 1900 and 1960, 4,447 of them have linked MathSciNet publication records, work. 
The standard version of this algorithm inherently preserves the which are used in our analysis. fecundity of each individual, but it does not preserve the chronology of child US National Academy of Science data. The US National Academy of Science births, {tc}, for each parent. To obtain random networks belonging to ensemble I diff --git a/tests/ps/raw_text.txt b/tests/ps/raw_text.txt deleted file mode 100644 index bce4ddf8..00000000 --- a/tests/ps/raw_text.txt +++ /dev/null @@ -1,3 +0,0 @@ -How exciting! -Narrow Text - diff --git a/tests/rtf/raw_text.txt b/tests/rtf/raw_text.txt index 9d49568b..754b4404 100644 --- a/tests/rtf/raw_text.txt +++ b/tests/rtf/raw_text.txt @@ -1,6 +1,6 @@ I love word documents. They are lovely. They make me so happy I could smile. And that is why I wrote this package. -Sample text is hard. That is where "http://hipsum.co" comes in handy. +Sample text is hard. That is where http://hipsum.co comes in handy. Semiotics church-key VHS, Truffaut cliche actually vegan. Cray Austin pop-up disrupt letterpress, kitsch fixie Cosby sweater cliche craft beer PBR&B. Gentrify cornhole Tonx McSweeney's, Shoreditch keffiyeh ethnic Marfa 90's kogi American Apparel. Shabby chic distillery church-key locavore beard, food truck chillwave sartorial deep v flannel authentic Tumblr narwhal kogi organic. Cred vegan jean shorts Banksy forage Neutra dreamcatcher, hashtag Bushwick polaroid pork belly flannel keytar Portland post-ironic. Cred hoodie vegan, food truck leggings Austin pour-over banjo trust fund before they sold out cray Intelligentsia plaid typewriter. Williamsburg XOXO plaid Carles Austin tofu. Carles Tonx keffiyeh, leggings 90's lo-fi kogi viral semiotics Brooklyn biodiesel tousled bespoke kitsch. Vinyl Tonx art party Thundercats retro, viral asymmetrical artisan bicycle rights bitters master cleanse Kickstarter YOLO. Seitan street art semiotics twee skateboard, PBR&B VHS hashtag meh. 
Thundercats semiotics shabby chic forage single-origin coffee retro, 3 wolf moon iPhone mumblecore 90's trust fund Intelligentsia. Beard gluten-free seitan, VHS sartorial pork belly gastropub meh whatever authentic synth. Beard single-origin coffee irony fixie, before they sold out Pitchfork kitsch readymade. Helvetica butcher wayfarers, lomo artisan hashtag Brooklyn four loko fanny pack 90's mustache 8-bit. diff --git a/tests/test_tiff.py b/tests/test_tiff.py index f1278346..516d8394 100644 --- a/tests/test_tiff.py +++ b/tests/test_tiff.py @@ -4,4 +4,4 @@ class PngTestCase(base.ShellParserTestCase, unittest.TestCase): - extension = 'png' + extension = 'tiff' diff --git a/tests/wav/raw_text.wav b/tests/wav/raw_text.wav index bdaa891f..6c23d4ea 100644 Binary files a/tests/wav/raw_text.wav and b/tests/wav/raw_text.wav differ diff --git a/textract/__init__.py b/textract/__init__.py index 9bde7dcc..d94a579c 100644 --- a/textract/__init__.py +++ b/textract/__init__.py @@ -1,3 +1,3 @@ from .parsers import process -VERSION = "1.6.1" +VERSION = "1.6.3" diff --git a/textract/colors.py b/textract/colors.py index f79a9d98..4c3500ca 100644 --- a/textract/colors.py +++ b/textract/colors.py @@ -12,6 +12,7 @@ def inner(text): return "\033[%sm%s\033[0m" % (c, text) return inner + red = _wrap_with('31') green = _wrap_with('32') yellow = _wrap_with('33') @@ -32,4 +33,4 @@ def inner(text): # regular expression to omit colorcodes def colorless(text): """Remove color from the text""" - return re.sub("\033\[(1;)?[\d]+m", '', text) + return re.sub(r"\033\[(1;)?[\d]+m", '', text) diff --git a/textract/parsers/__init__.py b/textract/parsers/__init__.py index 0642e451..266e07fe 100644 --- a/textract/parsers/__init__.py +++ b/textract/parsers/__init__.py @@ -86,7 +86,7 @@ def _get_available_extensions(): # from filenames parsers_dir = os.path.join(os.path.dirname(__file__)) glob_filename = os.path.join(parsers_dir, "*" + _FILENAME_SUFFIX + ".py") - ext_re = 
re.compile(glob_filename.replace('*', "(?P<ext>\w+)")) + ext_re = re.compile(glob_filename.replace('*', r"(?P<ext>\w+)")) for filename in glob.glob(glob_filename): ext_match = ext_re.match(filename) ext = ext_match.groups()[0] diff --git a/textract/parsers/msg_parser.py b/textract/parsers/msg_parser.py index 212b8b83..a1b7e17a 100644 --- a/textract/parsers/msg_parser.py +++ b/textract/parsers/msg_parser.py @@ -1,6 +1,6 @@ import six -from ExtractMsg import Message +import extract_msg from .utils import BaseParser @@ -23,5 +23,5 @@ class Parser(BaseParser): """ def extract(self, filename, **kwargs): - m = Message(filename) + m = extract_msg.Message(filename) return ensure_bytes(m.subject) + six.b('\n\n') + ensure_bytes(m.body) diff --git a/textract/parsers/ps_parser.py b/textract/parsers/ps_parser.py index b2dd2a6b..5d614357 100644 --- a/textract/parsers/ps_parser.py +++ b/textract/parsers/ps_parser.py @@ -2,9 +2,9 @@ class Parser(ShellParser): - """Extract text from postscript files using pstotext command. + """Extract text from postscript files using ps2ascii command. """ def extract(self, filename, **kwargs): - stdout, _ = self.run(['pstotext', filename]) + stdout, _ = self.run(['ps2ascii', filename]) return stdout