Skip to content

Commit

Permalink
Various improvements and bug fixes.
Browse files Browse the repository at this point in the history
- Made some code format consistency changes.
- Change: The minimum phrase length filter no longer defaults to 3; it is now disabled by default.
- New Feature: Filtering out numerics is now optional.
- Bug Fix: Regex pattern to split sentences was causing problems especially with sentences that have numbers in them. Changed to a simpler regex pattern.
- Change: Replaced `strlen()` with `mb_strlen()`.
- Change: Removed checks to see if `mb_*` functions exist, as this library is dependent on PHP v5.4.0, which will have those functions available.
  • Loading branch information
Donatello-za committed Sep 15, 2016
1 parent 1d3a1f9 commit 510b12c
Show file tree
Hide file tree
Showing 3 changed files with 201 additions and 47 deletions.
78 changes: 75 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ This particular package intends to include the following benefits over the origi

## Version

1.0.1 Beta
1.0.2 Beta

## Special Thanks

Expand Down Expand Up @@ -123,7 +123,7 @@ and also shows how to get the phrase scores.

```php

use DonatelloZa\RakePlus;
use DonatelloZa\RakePlus\RakePlus;

$text = "Criteria of compatibility of a system of linear Diophantine equations, " .
"strict inequations, and nonstrict inequations are considered. Upper bounds " .
Expand Down Expand Up @@ -250,7 +250,7 @@ You can provide custom stopwords in four different ways:

```php

use DonatelloZa\RakePlus;
use DonatelloZa\RakePlus\RakePlus;

// 1: The standard way (provide a language code)
// RakePlus will first look for ./lang/en_US.pattern, if
Expand All @@ -271,6 +271,78 @@ $rake = RakePlus::create($text, $stopwords);

```

## Example 5

You can specify the minimum number of characters that a phrase/keyword
must have; anything shorter than the minimum will be filtered out. The
default is 0 (no minimum).

```php

use DonatelloZa\RakePlus\RakePlus;

$text = '6462 Little Crest Suite, 413 Lake Carlietown, WA 12643';

// Without a minimum
$phrases = RakePlus::create($text, 'en_US', 0)->get();
print_r($phrases);

Array
(
[0] => crest suite
[1] => 413 lake carlietown
[2] => wa 12643
)

// With a minimum
$phrases = RakePlus::create($text, 'en_US', 10)->get();
print_r($phrases);

Array
(
[0] => crest suite
[1] => 413 lake carlietown
)

```

## Example 6

You can specify whether phrases/keywords that consist only of a number
should be filtered out or not. The default is to filter out
numerics.


```php

use DonatelloZa\RakePlus\RakePlus;

$text = '6462 Little Crest Suite, 413 Lake Carlietown, WA 12643';

// Filter out numerics
$phrases = RakePlus::create($text, 'en_US', 0, true)->get();
print_r($phrases);

Array
(
[0] => crest suite
[1] => 413 lake carlietown
[2] => wa 12643
)

// Do not filter out numerics
$phrases = RakePlus::create($text, 'en_US', 0, false)->get();
print_r($phrases);

Array
(
[0] => 6462
[1] => crest suite
[2] => 413 lake carlietown
[3] => wa 12643
)

```

## The keyword extractor tool

Expand Down
120 changes: 76 additions & 44 deletions src/RakePlus.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,15 @@ class RakePlus

/** @var array */
private $phrase_scores = [];

/** @var int */
private $minLength = 3;

private $min_length = 0;

/** @var bool */
private $filter_numerics = true;

const ORDER_ASC = 'asc';

const ORDER_DESC = 'desc';

/**
Expand All @@ -43,13 +46,15 @@ class RakePlus
* If $stopwords is a derived instance of StopwordAbstract it will simply
* retrieve the stopwords from the instance.
*
* @param string|null $text
* @param AbstractStopwordProvider|string|array $stopwords
* @param int $phraseMinLegth
* @param string|null $text Text to turn into keywords/phrases.
* @param AbstractStopwordProvider|string|array $stopwords Stopwords to use.
* @param int $phrase_min_length Minimum keyword/phrase length.
* @param bool $filter_numerics Filter out numeric numbers.
*/
public function __construct($text = null, $stopwords = 'en_US', $phraseMinLegth = 0)
public function __construct($text = null, $stopwords = 'en_US', $phrase_min_length = 0, $filter_numerics = true)
{
$this->setMinLength($phraseMinLegth);
$this->setMinLength($phrase_min_length);
$this->setFilterNumerics($filter_numerics);
if (!is_null($text)) {
$this->extract($text, $stopwords);
}
Expand All @@ -75,15 +80,16 @@ public function __construct($text = null, $stopwords = 'en_US', $phraseMinLegth
* If $stopwords is a derived instance of StopwordAbstract it will simply
* retrieve the stopwords from the instance.
*
* @param string|null $text
* @param AbstractStopwordProvider|string|array $stopwords
* @param int $phraseMinLegth
*
* @param string|null $text Text to turn into keywords/phrases.
* @param AbstractStopwordProvider|string|array $stopwords Stopwords to use.
* @param int $phrase_min_length Minimum keyword/phrase length.
* @param bool $filter_numerics Filter out numeric numbers.
*
* @return RakePlus
*/
public static function create($text, $stopwords = 'en_US', $phraseMinLegth = 0)
public static function create($text, $stopwords = 'en_US', $phrase_min_length = 0, $filter_numerics = true)
{
return (new self($text, $stopwords, $phraseMinLegth));
return (new self($text, $stopwords, $phrase_min_length, $filter_numerics));
}

/**
Expand Down Expand Up @@ -243,8 +249,12 @@ public function languageFile()
*/
private function splitSentences($text)
{
$text = preg_replace('/\n/', ' ', $text);
return preg_split('/[\/:.\?!,;\-"\'\(\)\\\x{2018}\x{2019}\x{2013}\n\t]+/u', $text);
// This is an alternative pattern but it doesn't
// seem to like numbers:
// '/[\/:.\?!,;\-"\'\(\)\\\x{2018}\x{2019}\x{2013}\n\t]+/u'

return preg_split('/[.!?,;:\t\-\"\(\)\']/',
preg_replace('/\n/', ' ', $text));
}

/**
Expand All @@ -262,18 +272,14 @@ private function getPhrases(array $sentences, $pattern)
foreach ($sentences as $sentence) {
$phrases_temp = preg_replace($pattern, '|', $sentence);
$phrases = explode('|', $phrases_temp);

foreach ($phrases as $phrase) {
$phrase = trim($phrase);
if (function_exists('mb_strtolower')) {
$phrase = mb_strtolower($phrase);
} else {
$phrase = strtolower($phrase);
}
if ($phrase != '' && !is_numeric($phrase)
&& ($this->minLength === 0
|| strlen($phrase) >= $this->minLength)) {
$results[] = $phrase;
$phrase = mb_strtolower(trim($phrase));
if (!empty($phrase)) {
if (!$this->filter_numerics || ($this->filter_numerics && !is_numeric($phrase))) {
if ($this->min_length === 0 || mb_strlen($phrase) >= $this->min_length) {
$results[] = $phrase;
}
}
}
}
}
Expand All @@ -294,7 +300,7 @@ private function calcWordScores($phrases)
$degrees = [];

foreach ($phrases as $phrase) {
$words = $this->splitPhrase($phrase);
$words = $this->splitPhraseIntoWords($phrase);
$words_count = count($words);
$words_degree = $words_count - 1;

Expand Down Expand Up @@ -333,7 +339,7 @@ private function calcPhraseScores($phrases, $scores)

foreach ($phrases as $phrase) {
$keywords[$phrase] = (isset($keywords[$phrase])) ? $keywords[$phrase] : 0;
$words = $this->splitPhrase($phrase);
$words = $this->splitPhraseIntoWords($phrase);
$score = 0;

foreach ($words as $word) {
Expand All @@ -350,11 +356,11 @@ private function calcPhraseScores($phrases, $scores)
* Split a phrase into multiple words and returns them
* as an array.
*
* @param string
* @param string $phrase
*
* @return array
*/
private function splitPhrase($phrase)
private function splitPhraseIntoWords($phrase)
{
$words_temp = str_word_count($phrase, 1, '0123456789');
$words = [];
Expand All @@ -367,29 +373,55 @@ private function splitPhrase($phrase)

return $words;
}

/**
* Returns minimum number of letters each phrase must have.
*
* Returns the minimum number of letters each phrase/keyword must have.
*
* @return int
*/
public function getMinLength() {
return $this->minLength;
public function getMinLength()
{
return $this->min_length;
}

/**
* Set the minimum length of a phrase that will be taken for further analysis.
* Sets the minimum number of letters each phrase/keyword must have.
*
* @param int $minLength
* @param int $min_length
*
* @return \DonatelloZa\RakePlus\RakePlus
* @return RakePlus
*/
public function setMinLength($minLength) {
$minLengthValue = (int)$minLength;
if ($minLengthValue < 0) {
public function setMinLength($min_length)
{
if ((int)$min_length < 0) {
throw new \InvalidArgumentException('Minimum phrase length must be greater than or equal to 0.');
}
$this->minLength = $minLengthValue;

$this->min_length = (int)$min_length;
return $this;
}

/**
 * Sets whether numeric-only phrases/keywords should be filtered
 * out or not.
 *
 * The value is normalised to a strict boolean before it is stored, in the
 * same way setMinLength() normalises its argument to an int, so that the
 * property always matches its declared bool type.
 *
 * @param bool $filter_numerics True to filter out numeric-only phrases,
 *                              false to keep them. Defaults to true.
 *
 * @return RakePlus The current instance, to allow method chaining.
 */
public function setFilterNumerics($filter_numerics = true)
{
    $this->filter_numerics = (bool)$filter_numerics;

    return $this;
}

/**
 * Returns whether numeric-only phrases/keywords will be filtered
 * out or not.
 *
 * @return bool The current value of the filter_numerics property
 *              (declared as bool, true by default).
 */
public function getFilterNumerics()
{
return $this->filter_numerics;
}
}
50 changes: 50 additions & 0 deletions tests/RakePlusTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -504,4 +504,54 @@ public function testStopWordPatternFileInstance()
$this->assertEquals($scores['minimal generating sets'], 8.5);
$this->assertEquals($scores['linear diophantine equations'], 9);
}

/**
 * Extracts phrases with $filter_numerics set to false and checks that the
 * numeric-only phrase "6462" is present in the resulting scores.
 *
 * NOTE: despite its name, this test covers the case where numerics are
 * NOT filtered out (the fourth argument to create() is false).
 */
public function testFilterNumerics()
{
    $text = "6462 Little Crest Suite 413 Lake Carlietown, WA 12643";

    $rake = RakePlus::create($text, 'en_US', 0, false);
    $scores = $rake->scores();

    $this->assertEquals(false, $rake->getFilterNumerics());
    $this->assertCount(3, $scores);

    // PHPUnit's assertEquals() takes the expected value first and the
    // actual value second, so failure messages report them correctly.
    $this->assertEquals(0, $scores['6462']);
    $this->assertEquals(1, $scores['wa 12643']);
    $this->assertEquals(16, $scores['crest suite 413 lake carlietown']);
}

/**
 * Extracts phrases with $filter_numerics set to true and checks that only
 * the two non-numeric phrases remain in the resulting scores.
 *
 * NOTE: despite its name, this test covers the case where numerics ARE
 * filtered out (the fourth argument to create() is true).
 */
public function testDonNotFilterNumerics()
{
    $text = "6462 Little Crest Suite 413 Lake Carlietown, WA 12643";
    $scores = RakePlus::create($text, 'en_US', 0, true)->scores();

    $this->assertCount(2, $scores);

    // PHPUnit's assertEquals() takes the expected value first and the
    // actual value second, so failure messages report them correctly.
    $this->assertEquals(1, $scores['wa 12643']);
    $this->assertEquals(16, $scores['crest suite 413 lake carlietown']);
}

/**
 * Extracts phrases with a minimum phrase length of 10 and checks the
 * number of phrases returned and the score of each one.
 */
public function testMinLengthScores()
{
    $text = "Criteria of compatibility of a system of linear Diophantine equations, " .
        "strict inequations, and nonstrict inequations are considered. Upper bounds " .
        "for components of a minimal set of solutions and algorithms of construction " .
        "of minimal generating sets of solutions for all types of systems are given.";

    $scores = RakePlus::create($text, 'en_US', 10)->sortByScore()->scores();

    $this->assertCount(11, $scores);

    // PHPUnit's assertEquals() takes the expected value first and the
    // actual value second, so failure messages report them correctly.
    $this->assertEquals(1, $scores['compatibility']);
    $this->assertEquals(1, $scores['considered']);
    $this->assertEquals(1, $scores['components']);
    $this->assertEquals(1, $scores['algorithms']);
    $this->assertEquals(1, $scores['construction']);
    $this->assertEquals(4, $scores['strict inequations']);
    $this->assertEquals(4, $scores['nonstrict inequations']);
    $this->assertEquals(4, $scores['upper bounds']);
    $this->assertEquals(4.5, $scores['minimal set']);
    $this->assertEquals(8.5, $scores['minimal generating sets']);
    $this->assertEquals(9, $scores['linear diophantine equations']);
}
}

0 comments on commit 510b12c

Please sign in to comment.