From b58d2c30da0932ff3c0d5c80211de1fd4b3ed6aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9C=D0=B8=D1=85=D0=B0=D0=B8=D0=BB=20=D0=9A=D1=80=D0=B0?= =?UTF-8?q?=D1=81=D0=B8=D0=BB=D1=8C=D0=BD=D0=B8=D0=BA=D0=BE=D0=B2?= Date: Sat, 11 Mar 2017 11:07:19 +0300 Subject: [PATCH] Fix HtmlSource meta tags behaviour The "content" attribute should be treated as user-visible text only for "keywords" and "description" meta tags. --- CHANGELOG.md | 7 ++++++ src/Source/HtmlSource.php | 38 +++++++++++++++++++++++++++------ tests/Source/HtmlSourceTest.php | 15 ++++++++++++- 3 files changed, 52 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cc971b3..0394ae9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## 1.5.1 - 2017-03-11 + +### Fixed + +- HtmlSource: only for "keywords" and "description" meta tags "content" attr should be treated as + user visible text. + ## 1.5 - 2017-03-11 ### Added diff --git a/src/Source/HtmlSource.php b/src/Source/HtmlSource.php index 5e7a9da..29f725f 100644 --- a/src/Source/HtmlSource.php +++ b/src/Source/HtmlSource.php @@ -24,7 +24,6 @@ class HtmlSource implements Source static private $textAttributes = [ 'abbr', 'alt', - 'content', 'label', 'placeholder', 'title' @@ -61,7 +60,7 @@ public function getAsString() $document = new \DOMDocument('1.0'); $document->loadHTML($this->html); - return $this->extractText($document->documentElement); + return $this->extractFromNode($document->documentElement); } /** @@ -71,26 +70,51 @@ public function getAsString() * * @return string */ - private function extractText(\DOMNode $node) + private function extractFromNode(\DOMNode $node) { if ($node instanceof \DOMText) { return trim($node->textContent); } - $text = ''; + $text = []; if ($node instanceof \DOMElement) { foreach ($node->attributes as $attr) { /** @var \DOMAttr $attr */ if (in_array($attr->name, self::$textAttributes, true)) { - $text .= ' ' . 
trim($attr->textContent); + $text[] = trim($attr->textContent); } } + $text[] = $this->extractFromMeta($node); foreach ($node->childNodes as $child) { - $text .= ' ' . $this->extractText($child); + $text[] = $this->extractFromNode($child); } } - return trim($text); + return trim(implode(' ', $text)); + } + + /** + * Extract text from meta tag. + * + * @param \DOMElement $node + * + * @return string + */ + private function extractFromMeta(\DOMElement $node) + { + if (strtolower($node->nodeName) !== 'meta') { + return ''; + } + + if (!($node->hasAttribute('name') && $node->hasAttribute('content'))) { + return ''; + } + + if (!in_array(strtolower($node->getAttribute('name')), ['description', 'keywords'], true)) { + return ''; + } + + return trim($node->getAttribute('content')); } } diff --git a/tests/Source/HtmlSourceTest.php b/tests/Source/HtmlSourceTest.php index dc78e4a..06d497f 100644 --- a/tests/Source/HtmlSourceTest.php +++ b/tests/Source/HtmlSourceTest.php @@ -25,6 +25,19 @@ class HtmlSourceTest extends TestCase public function testBasics() { $source = new HtmlSource('Bar Baz'); - static::assertEquals('Foo Bar Baz', $source->getAsString()); + static::assertEquals('Foo Bar Baz', $source->getAsString()); + } + + /** + * Only for "keywords" and "description" meta tags "content" attr should be treated as string. + */ + public function testMetaContent() + { + $source = new HtmlSource( + '' . + '' . + '' + ); + static::assertEquals('Foo Bar', $source->getAsString()); } }