From f1fc5256eb5d30fd6b8d3ea04403d7a88d208f06 Mon Sep 17 00:00:00 2001 From: Peter Thaleikis Date: Sat, 20 Jun 2020 15:50:35 +0400 Subject: [PATCH] Added stopword list for German (de_DE) as pattern and PHP --- README.md | 88 +++++------ lang/de_DE.pattern | 1 + lang/de_DE.php | 367 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 402 insertions(+), 54 deletions(-) create mode 100644 lang/de_DE.pattern create mode 100644 lang/de_DE.php diff --git a/README.md b/README.md index 9606de8..817e5e1 100644 --- a/README.md +++ b/README.md @@ -7,21 +7,16 @@ Yet another PHP implementation of the Rapid Automatic Keyword Extraction algorit ## Why is this package useful? -Keywords describe the main topics expressed in a document/text. Keyword *extraction* in turn allows for the extraction -of important words and phrases from text. This in turn can be used for building a list of tags or to build a keyword -search index or grouping similar content by its topics and much more. This library provides an easy method for PHP -developers to get a list of keywords and phrases from a string of text. +Keywords describe the main topics expressed in a document/text. Keyword *extraction* in turn allows for the extraction of important words and phrases from text. This in turn can be used for building a list of tags or to build a keyword search index or grouping similar content by its topics and much more. This library provides an easy method for PHP developers to get a list of keywords and phrases from a string of text. -This project is based on another project called [RAKE-PHP](https://github.com/Richdark/RAKE-PHP) by Richard Filipčík, -which is a translation from a Python implementation simply called [RAKE](https://github.com/aneesha/RAKE). +This project is based on another project called [RAKE-PHP](https://github.com/Richdark/RAKE-PHP) by Richard Filipčík, which is a translation from a Python implementation simply called [RAKE](https://github.com/aneesha/RAKE). 
-*As described in: Rose, S., Engel, D., Cramer, N., & Cowley, W. (2010). -[Automatic Keyword Extraction from Individual Documents](https://www.researchgate.net/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents). +*As described in: Rose, S., Engel, D., Cramer, N., & Cowley, W. (2010). +[Automatic Keyword Extraction from Individual Documents](https://www.researchgate.net/publication/227988510_Automatic_Keyword_Extraction_from_Individual_Documents). In M. W. Berry & J. Kogan (Eds.), Text Mining: Theory and Applications: John Wiley & Sons.* -This particular package intends to include the following benefits over the original -[RAKE-PHP](https://github.com/Richdark/RAKE-PHP) package: +This particular package intends to include the following benefits over the original [RAKE-PHP](https://github.com/Richdark/RAKE-PHP) package: 1. Add [PSR-2](http://www.php-fig.org/psr/psr-2/) coding standards. 2. Implement [PSR-4](http://www.php-fig.org/psr/psr-4/) in order to be [Composer](https://getcomposer.org) installable. @@ -42,6 +37,7 @@ This particular package intends to include the following benefits over the origi * European Portuguese/português europeu (pt_PT) * Sorani Kurdish/سۆرانی (ckb_IQ) * Arabic (United Arab Emirates)/لإمارات العربية المتحدة (ar_AE) +* German (Germany)/Deutsch (Deutschland) (de_DE) ## Version @@ -54,12 +50,16 @@ v1.0.14 * [Igor Carvalho](https://github.com/Carvlho): Brazilian Portuguese language. * [Khoshbin Ali Ahmed](https://github.com/Xoshbin): Sorani Kurdish and Arabic languages. * [RhaPT](https://github.com/RhaPT): European Portuguese language. +* [Peter Thaleikis](https://github.com/spekulatius): German language. ## Installation ### With Composer -`$ composer require donatello-za/rake-php-plus` +```bash +$ composer require donatello-za/rake-php-plus +``` + ```json { @@ -98,10 +98,9 @@ text is English (US). 
```php - use DonatelloZa\RakePlus\RakePlus; -$text = "Criteria of compatibility of a system of linear Diophantine equations, " . +$text = "Criteria of compatibility of a system of linear Diophantine equations, " . "strict inequations, and nonstrict inequations are considered. Upper bounds " . "for components of a minimal set of solutions and algorithms of construction " . "of minimal generating sets of solutions for all types of systems are given."; @@ -131,7 +130,6 @@ Array [14] => types [15] => systems ) - ``` ## Example 2 @@ -140,10 +138,9 @@ Creates a new instance of RakePlus, extract the phrases in different different o and also shows how to get the phrase scores. ```php - use DonatelloZa\RakePlus\RakePlus; -$text = "Criteria of compatibility of a system of linear Diophantine equations, " . +$text = "Criteria of compatibility of a system of linear Diophantine equations, " . "strict inequations, and nonstrict inequations are considered. Upper bounds " . "for components of a minimal set of solutions and algorithms of construction " . "of minimal generating sets of solutions for all types of systems are given."; @@ -155,6 +152,7 @@ $rake = RakePlus::create($text, 'en_US'); $phrases = $rake->sort('asc')->get(); print_r($phrases); ``` + ``` Array ( @@ -234,8 +232,8 @@ Array ``` ```php -// Extract phrases from a new string on the same RakePlus instance. Using the -// same RakePlus instance is faster than creating a new instance as the +// Extract phrases from a new string on the same RakePlus instance. Using the +// same RakePlus instance is faster than creating a new instance as the // language files do not have to be re-loaded and parsed. $text = "A fast Fourier transform (FFT) algorithm computes..."; @@ -257,10 +255,9 @@ Array Creates a new instance of RakePlus and extract the unique keywords from the phrases. ```php - use DonatelloZa\RakePlus\RakePlus; -$text = "Criteria of compatibility of a system of linear Diophantine equations, " . 
+$text = "Criteria of compatibility of a system of linear Diophantine equations, " . "strict inequations, and nonstrict inequations are considered. Upper bounds " . "for components of a minimal set of solutions and algorithms of construction " . "of minimal generating sets of solutions for all types of systems are given."; @@ -302,10 +299,9 @@ Array Creates a new instance of RakePlus without using the static RakePlus::create method. ```php - use DonatelloZa\RakePlus; -$text = "Criteria of compatibility of a system of linear Diophantine equations, " . +$text = "Criteria of compatibility of a system of linear Diophantine equations, " . "strict inequations, and nonstrict inequations are considered. Upper bounds " . "for components of a minimal set of solutions and algorithms of construction " . "of minimal generating sets of solutions for all types of systems are given."; @@ -332,7 +328,7 @@ $rake = RakePlus::create($text, 'en_US'); // 2: Pass an array containing stopwords $rake = RakePlus::create($text, ['a', 'able', 'about', 'above', ...]); -// 3: Pass the name of a PHP or pattern file, +// 3: Pass the name of a PHP or pattern file, // see lang/en_US.php and lang/en_US.pattern for examples. $rake = RakePlus::create($text, '/path/to/my/stopwords.pattern'); @@ -370,6 +366,7 @@ Array ```php // With a minimum $phrases = RakePlus::create($text, 'en_US', 10)->get(); + print_r($phrases); ``` @@ -408,6 +405,7 @@ print_r($phrases); ```php // Do not filter out numerics $phrases = RakePlus::create($text, 'en_US', 0, false)->get(); + print_r($phrases); ``` @@ -425,34 +423,22 @@ Array **Using the stopwords extractor tool** -The library requires a list of "stopwords" for each language. Stopwords are -common words used in a language such as "and", "are", "or", etc. An example -list of such stopwords can be found -[here (en_US)](http://www.lextek.com/manuals/onix/stopwords2.html). 
You can -also [take a look at this list](https://github.com/Donatello-za/stopwords-json) -which have stopwords for 50 different languages in individual JSON files. +The library requires a list of "stopwords" for each language. Stopwords are common words used in a language such as "and", "are", "or", etc. An example list of such stopwords can be found [here (en_US)](http://www.lextek.com/manuals/onix/stopwords2.html). You can also [take a look at this list](https://github.com/Donatello-za/stopwords-json) which has stopwords for 50 different languages in individual JSON files. -When working with a simple list such as in the first example, you can copy and -paste the text into a text file and use the extractor tool to -convert it into a format that this library can read efficiently. *An example -of such a stopwords file that have been copied from the hyperlink above have -been included for your convenience (console/stopwords_en_US.txt)* +When working with a simple list such as in the first example, you can copy and paste the text into a text file and use the extractor tool to convert it into a format that this library can read efficiently. *An example of such a stopwords file, copied from the hyperlink above, has been included for your convenience (console/stopwords_en_US.txt)* -Alternatively you can extract the stopwords from a JSON file of which an -example have also been supplied, look under `console/stopwords_en_US.json` +Alternatively you can extract the stopwords from a JSON file, of which an example has also been supplied; look under `console/stopwords_en_US.json` -**Note:** Simply replace `en_US` to whatever locale you wish to use in the -examples below. +**Note:** Simply replace `en_US` with whatever locale you wish to use in the examples below. 
-**Important:** Before using the `extractor` tool, make sure to use the following -Linux command to check whether your locale is supported: +**Important:** Before using the `extractor` tool, make sure to use the following Linux command to check whether your locale is supported: ```sh $ locale -a ``` -If you do not see the locale you wish to use in the list you can install it -as follows: (in this case we are installing the French locale): +If you do not see the locale you wish to use in the list you can install it as follows (in this case we are installing the French locale): + ```sh $ sudo locale-gen fr_FR $ sudo locale-gen fr_FR.utf8 @@ -469,26 +455,20 @@ To extract stopwords from a JSON file, run the following from the command line: `$ php extractor.php stopwords_en_US.json --locale=en_US --output=php` -It will output the results to the terminal. You will notice that the results looks -like PHP and in fact it is. You can write the results directly to a PHP file by -piping it: +It will output the results to the terminal. You will notice that the results look like PHP, and in fact they are. 
You can write the results directly to a PHP file by piping it: -`$ php extractor.php stopwords_en_US.txt --locale=en_US --output=php > en_US.php` +`$ php extractor.php stopwords_en_US.txt --locale=en_US --output=php > en_US.php` -Finally, copy the `en_US.php` file to the `lang/` directory and then instantiate - php-rake-plus like so: +Finally, copy the `en_US.php` file to the `lang/` directory and then instantiate php-rake-plus like so: ```php $rake = RakePlus::create($text, 'en_US'); ``` -To improve the initial loading speed of the language file within RakePlus, you -can also set the exporter to produce the results as a regular expression pattern -using the `--output` argument: +To improve the initial loading speed of the language file within RakePlus, you can also set the exporter to produce the results as a regular expression pattern using the `--output` argument: -`$ php extractor.php stopwords_en_US.txt --locale=en_US --output=pattern > en_US.pattern` +`$ php extractor.php stopwords_en_US.txt --locale=en_US --output=pattern > en_US.pattern` -RakePHP will always look for a `.pattern` file first and if not found it will look -for a `.php` file in the `./lang/` directory. +RakePHP will always look for a `.pattern` file first and if not found it will look for a `.php` file in the `./lang/` directory. 
## To run tests diff --git a/lang/de_DE.pattern b/lang/de_DE.pattern new file mode 100644 index 0000000..8d42a62 --- /dev/null +++ b/lang/de_DE.pattern @@ -0,0 +1 @@ +/\bzwischen\b|\bzusammen\b|\bzur\b|\bzum\b|\bzu\b|\bziemlich\b|\bwürdet\b|\bwurdet\b|\bwürdest\b|\bwurdest\b|\bwürden\b|\bwurden\b|\bwürde\b|\bwurde\b|\bwuerdet\b|\bwuerdest\b|\bwuerden\b|\bwuerde\b|\bwohin\b|\bwo\b|\bwirst\b|\bwirklich\b|\bwird\b|\bwir\b|\bwieder\b|\bwie\b|\bwessen\b|\bwerdet\b|\bwerden\b|\bwerde\b|\bwer\b|\bwenn\b|\bwenige\b|\bwen\b|\bwem\b|\bwelches\b|\bwelcher\b|\bwelchen\b|\bwelchem\b|\bwelche\b|\bweiteres\b|\bweiterer\b|\bweiteren\b|\bweiterem\b|\bweitere\b|\bweiter\b|\bweit\b|\bweil\b|\bwegen\b|\bweder\b|\bwas\b|\bwart\b|\bwarst\b|\bwaren\b|\bwar\b|\bwann\b|\bwährend\b|\bwaehrend\b|\bvorueber\b|\bvorüber\b|\bvorher\b|\bvorbei\b|\bvor\b|\bvon\b|\bvielleicht\b|\bviele\b|\bviel\b|\busw\b|\bunterhalb\b|\bunter\b|\bunten\b|\bunseres\b|\bunserer\b|\bunseren\b|\bunserem\b|\bunsere\b|\bunser\b|\buns\b|\bund\b|\bum\b|\bueberll\b|\bueberall\b|\bueber\b|\büberll\b|\büber\b|\btut\b|\btun\b|\btief\b|\btatsaechlich\b|\btatsächlich\b|\bstets\b|\bstatt\b|\bsondern\b|\bsolltet\b|\bsolltest\b|\bsollten\b|\bsollte\b|\bsolches\b|\bsolcher\b|\bsolchen\b|\bsolchem\b|\bsolche\b|\bsogar\b|\bso\b|\bsind\b|\bsie\b|\bsich\b|\bselbst\b|\bseit\b|\bseines\b|\bseiner\b|\bseinen\b|\bseinem\b|\bseine\b|\bsein\b|\bseiet\b|\bseiest\b|\bseien\b|\bseid\b|\bsei\b|\bsehr\b|\bscheinen\b|\bsagtet\b|\bsagtest\b|\bsagten\b|\bsagte\b|\bpro\b|\bohne\b|\boft\b|\boder\b|\bobwohl\b|\boben\b|\bnur\b|\bnirgendwo\b|\bniemandes\b|\bniemanden\b|\bniemandem\b|\bniemand\b|\bnichts\b|\bnein\b|\bnebenan\b|\bnaechste\b|\bnächste\b|\bnachdem\b|\bnach\b|\bmüßt\b|\bmußt\b|\bmusst\b|\bmüssen\b|\bmuß\b|\bmuss\b|\bmuesst\b|\bmuessen\b|\bmit\b|\bmir\b|\bmich\b|\bmeistens\b|\bmeines\b|\bmeiner\b|\bmeinen\b|\bmeinem\b|\bmeine\b|\bmein\b|\bmehrere\b|\bmehr\b|\bmachte\b|\bmacht\b|\bmachen\b|\bkönntet\b|\bkonntet\b|\bkönntest\b|\bkonntest\b|\bkönn
ten\b|\bkonnten\b|\bkönnte\b|\bkonnte\b|\bkönnt\b|\bkönnen\b|\bkoenntet\b|\bkoenntest\b|\bkoennten\b|\bkoennte\b|\bkoennt\b|\bkoennen\b|\bkaum\b|\bkannst\b|\bkann\b|\bjunges\b|\bjunger\b|\bjungen\b|\bjungem\b|\bjunge\b|\bjung\b|\bjene\b|\bjemandes\b|\bjemanden\b|\bjemandem\b|\bjemand\b|\bjemals\b|\bjedoch\b|\bjedes\b|\bjeder\b|\bjeden\b|\bjedem\b|\bjede\b|\bist\b|\birgendwohin\b|\birgendwo\b|\birgendwenn\b|\birgendwelche\b|\birgendetwas\b|\binnerlich\b|\binnerhalb\b|\bindem\b|\bin\b|\bimmer\b|\bihriges\b|\bihrigen\b|\bihrige\b|\bihres\b|\bihrer\b|\bihren\b|\bihrem\b|\bihre\b|\bihr\b|\bihnen\b|\bihm\b|\bich\b|\bhinunter\b|\bhinter\b|\bhintendran\b|\bhindurch\b|\bhier\b|\bhäufig\b|\bhattet\b|\bhattest\b|\bhatten\b|\bhatte\b|\bhat\b|\bhast\b|\bhaeufig\b|\bhabt\b|\bhaben\b|\bhabe\b|\bgründlich\b|\bgruendlich\b|\bgewesen\b|\bgetrennt\b|\bgetan\b|\bgenug\b|\bgemäß\b|\bgemäss\b|\bgemaeß\b|\bgemaess\b|\bgemacht\b|\bgeht\b|\bgehalten\b|\bgegenueber\b|\bgegenüber\b|\bgegen\b|\bfür\b|\bfuer\b|\bfort\b|\bfertig\b|\bfast\b|\betwas\b|\bes\b|\berscheinen\b|\bentweder\b|\bentgegen\b|\beiniges\b|\beiniger\b|\beinige\b|\beines\b|\beiner\b|\beinen\b|\beinem\b|\beine\b|\bein\b|\bdürftet\b|\bdurftet\b|\bdürftest\b|\bdurftest\b|\bdürften\b|\bdurften\b|\bdürfte\b|\bdurfte\b|\bdurch\b|\bduerftet\b|\bduerftest\b|\bduerften\b|\bduerfte\b|\bdort\b|\bdieses\b|\bdieser\b|\bdiesen\b|\bdiesem\b|\bdiese\b|\bdie\b|\bdeshalb\b|\bdes\b|\bder\b|\bden\b|\bdem\b|\bdaß\b|\bdass\b|\bdas\b|\bdarum\b|\bdarueberhinaus\b|\bdarueber\b|\bdarüberhinaus\b|\bdarüber\b|\bdann\b|\bdanach\b|\bdaher\b|\bda\b|\bbitte\b|\bbist\b|\bbis\b|\bbin\b|\bbevor\b|\bbeinahe\b|\bbeides\b|\bbeider\b|\bbeiden\b|\bbeide\b|\bbei\b|\bbehalten\b|\baußerhalb\b|\bausserhalb\b|\baußerdem\b|\bausserdem\b|\baußer\b|\bausser\b|\baußen\b|\baussen\b|\baus\b|\bauf\b|\bauch\b|\banstatt\b|\banderes\b|\banderer\b|\banderenfalls\b|\banderen\b|\bandere\b|\ban\b|\bam\b|\bals\b|\balles\b|\baller\b|\ballein\b|\balle\b|\babgesehen\b|\baber\b|\bab\b/i 
diff --git a/lang/de_DE.php b/lang/de_DE.php new file mode 100644 index 0000000..c19edb5 --- /dev/null +++ b/lang/de_DE.php @@ -0,0 +1,367 @@ +