Skip to content

Commit

Permalink
Adding PHPStan Level 6
Browse files Browse the repository at this point in the history
  • Loading branch information
phpfui authored and edgrosvenor committed Jan 6, 2023
1 parent 395094e commit 83502b6
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 34 deletions.
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
"ext-libxml": "*"
},
"require-dev": {
"phpunit/phpunit": "^7.0|^8.0|^9.0"
"phpunit/phpunit": "^7.0|^8.0|^9.0",
"phpstan/phpstan": "^1.9"
}
}
7 changes: 7 additions & 0 deletions phpstan.neon.dist
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
parameters:
level: 6
errorFormat: raw
editorUrl: '%%file%% %%line%% %%column%%: %%error%%'
paths:
- src
- tests
58 changes: 37 additions & 21 deletions src/Html2Text.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

class Html2Text {

/** @return array<string, bool | string> */
public static function defaultOptions(): array {
return [
'ignore_errors' => false,
Expand All @@ -23,7 +24,7 @@ public static function defaultOptions(): array {
* </ul>
*
* @param string $html the input HTML
* @param boolean|array $options if boolean, Ignore xml parsing errors, else ['ignore_errors' => false, 'drop_links' => false, 'char_set' => 'auto']
* @param boolean|array<string, bool | string> $options if boolean, Ignore xml parsing errors, else ['ignore_errors' => false, 'drop_links' => false, 'char_set' => 'auto']
* @return string the HTML converted, as best as possible, to text
* @throws Html2TextException if the HTML could not be loaded as a {@link \DOMDocument}
*/
Expand All @@ -43,26 +44,26 @@ public static function convert(string $html, $options = []): string {
}
}

$is_office_document = static::isOfficeDocument($html);
$is_office_document = self::isOfficeDocument($html);

if ($is_office_document) {
// remove office namespace
$html = str_replace(["<o:p>", "</o:p>"], "", $html);
}

$html = static::fixNewlines($html);
$html = self::fixNewlines($html);

// use mb_convert_encoding for legacy versions of php
if (PHP_MAJOR_VERSION * 10 + PHP_MINOR_VERSION < 81 && mb_detect_encoding($html, "UTF-8", true)) {
$html = mb_convert_encoding($html, "HTML-ENTITIES", "UTF-8");
}

$doc = static::getDocument($html, $options);
$doc = self::getDocument($html, $options);

$output = static::iterateOverNode($doc, null, false, $is_office_document, $options);
$output = self::iterateOverNode($doc, null, false, $is_office_document, $options);

// process output for whitespace/newlines
$output = static::processWhitespaceNewlines($output);
$output = self::processWhitespaceNewlines($output);

return $output;
}
Expand All @@ -84,13 +85,15 @@ public static function fixNewlines(string $text): string {
return $text;
}

/**
 * Strings that are treated as a non-breaking space.
 *
 * renderText() passes this list to str_replace() and swaps every
 * occurrence for a regular space.
 *
 * @return array<string>
 */
public static function nbspCodes(): array {
return [
// UTF-8 byte sequence for U+00A0 (NO-BREAK SPACE)
"\xc2\xa0",
// NOTE(review): PHP only expands the braced form \u{...} inside double quotes,
// so this entry is the literal six-character text backslash-u-0-0-a-0 rather
// than the code point itself -- confirm that matching the literal is intended.
"\u00a0",
];
}

/** @return array<string> */
public static function zwnjCodes(): array {
return [
"\xe2\x80\x8c",
Expand Down Expand Up @@ -118,7 +121,7 @@ public static function processWhitespaceNewlines(string $text): string {
// convert non-breaking spaces to regular spaces to prevent output issues,
// do it here so they do NOT get removed with other leading spaces, as they
// are sometimes used for indentation
$text = static::renderText($text);
$text = self::renderText($text);

// remove trailing whitespace
$text = rtrim($text);
Expand All @@ -127,7 +130,7 @@ public static function processWhitespaceNewlines(string $text): string {
$text = preg_replace("/[ \t]*\n/im", "\n", $text);

// unarmor pre blocks
$text = static::fixNewLines($text);
$text = self::fixNewLines($text);

// remove unnecessary empty lines
$text = preg_replace("/\n\n\n*/im", "\n\n", $text);
Expand All @@ -143,13 +146,14 @@ public static function isOfficeDocument(string $html): bool {
}

public static function isWhitespace(string $text): bool {
return strlen(trim(static::renderText($text), "\n\r\t ")) === 0;
return strlen(trim(self::renderText($text), "\n\r\t ")) === 0;
}

/**
* Parse HTML into a DOMDocument
*
* @param string $html the input HTML
* @param array<string, bool | string> $options
* @return \DOMDocument the parsed document tree
*/
private static function getDocument(string $html, array $options): \DOMDocument {
Expand Down Expand Up @@ -218,17 +222,17 @@ private static function getDocument(string $html, array $options): \DOMDocument
* by a browser.
*/
private static function renderText(string $text): string {
$text = str_replace(static::nbspCodes(), " ", $text);
$text = str_replace(static::zwnjCodes(), "", $text);
$text = str_replace(self::nbspCodes(), " ", $text);
$text = str_replace(self::zwnjCodes(), "", $text);
return $text;
}

private static function nextChildName($node): ?string {
private static function nextChildName(?\DOMNode $node): ?string {
// get the next child
$nextNode = $node->nextSibling;
while ($nextNode != null) {
if ($nextNode instanceof \DOMText) {
if (!static::isWhitespace($nextNode->wholeText)) {
if (!self::isWhitespace($nextNode->wholeText)) {
break;
}
}
Expand All @@ -248,11 +252,12 @@ private static function nextChildName($node): ?string {
return $nextName;
}

private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is_office_document, $options): string {
/** @param array<string, bool | string> $options */
private static function iterateOverNode(\DOMNode $node, ?string $prevName, bool $in_pre, bool $is_office_document, array $options): string {
if ($node instanceof \DOMText) {
// Replace whitespace characters with a space (equivalent to \s)
if ($in_pre) {
$text = "\n" . trim(static::renderText($node->wholeText), "\n\r\t ") . "\n";
$text = "\n" . trim(self::renderText($node->wholeText), "\n\r\t ") . "\n";

// Remove trailing whitespace only
$text = preg_replace("/[ \t]*\n/im", "\n", $text);
Expand All @@ -261,10 +266,10 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is
return str_replace("\n", "\r", $text);

}
$text = static::renderText($node->wholeText);
$text = self::renderText($node->wholeText);
$text = preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $text);

if (!static::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
if (!self::isWhitespace($text) && ($prevName == 'p' || $prevName == 'div')) {
return "\n" . $text;
}
return $text;
Expand All @@ -276,7 +281,7 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is
}

$name = strtolower($node->nodeName);
$nextName = static::nextChildName($node);
$nextName = self::nextChildName($node);

// start whitespace
switch ($name) {
Expand Down Expand Up @@ -321,6 +326,7 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is
// To fix this, for any p element with a className of `MsoNormal` (the standard
// classname in any Microsoft export or outlook for a paragraph that behaves
// like a line return) we skip the first line returns and set the name to br.
// @phpstan-ignore-next-line
if ($is_office_document && $node->getAttribute('class') == 'MsoNormal') {
$output = "";
$name = 'br';
Expand Down Expand Up @@ -368,12 +374,12 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is

while ($n != null) {

$text = static::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);
$text = self::iterateOverNode($n, $previousSiblingName, $in_pre || $name == 'pre', $is_office_document, $options);

// Pass current node name to next child, as previousSibling does not appear to get populated
if ($n instanceof \DOMDocumentType
|| $n instanceof \DOMProcessingInstruction
|| ($n instanceof \DOMText && static::isWhitespace($text))) {
|| ($n instanceof \DOMText && self::isWhitespace($text))) {
// Keep current previousSiblingName, these are invisible
$trailing_whitespace++;
}
Expand Down Expand Up @@ -430,6 +436,7 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is

case "a":
// links are returned in [text](link) format
// @phpstan-ignore-next-line
$href = $node->getAttribute("href");

$output = trim($output);
Expand All @@ -439,18 +446,23 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is
$output = substr($output, 1, strlen($output) - 2);

// for linking images, the title of the <a> overrides the title of the <img>
// @phpstan-ignore-next-line
if ($node->getAttribute("title")) {
// @phpstan-ignore-next-line
$output = $node->getAttribute("title");
}
}

// if there is no link text, but a title attr
// @phpstan-ignore-next-line
if (!$output && $node->getAttribute("title")) {
// @phpstan-ignore-next-line
$output = $node->getAttribute("title");
}

if ($href == null) {
// it doesn't link anywhere
// @phpstan-ignore-next-line
if ($node->getAttribute("name") != null) {
if ($options['drop_links']) {
$output = "$output";
Expand Down Expand Up @@ -486,9 +498,13 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is
break;

case "img":
// @phpstan-ignore-next-line
if ($node->getAttribute("title")) {
// @phpstan-ignore-next-line
$output = "[" . $node->getAttribute("title") . "]";
// @phpstan-ignore-next-line
} elseif ($node->getAttribute("alt")) {
// @phpstan-ignore-next-line
$output = "[" . $node->getAttribute("alt") . "]";
} else {
$output = "";
Expand All @@ -501,7 +517,7 @@ private static function iterateOverNode($node, $prevName, bool $in_pre, bool $is

case "blockquote":
// process quoted text for whitespace/newlines
$output = static::processWhitespaceNewlines($output);
$output = self::processWhitespaceNewlines($output);

// add leading newline
$output = "\n" . $output;
Expand Down
5 changes: 3 additions & 2 deletions src/Html2TextException.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@

class Html2TextException extends \Exception {

var $more_info;
/** @var string $more_info */
public $more_info;

public function __construct($message = "", $more_info = "") {
public function __construct(string $message = "", string $more_info = "") {
parent::__construct($message);
$this->more_info = $more_info;
}
Expand Down
22 changes: 12 additions & 10 deletions tests/Html2TextTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,12 @@ public static function setUpBeforeClass(): void {
/**
* @dataProvider providerFiles
*/
public function testFile(string $test) {
public function testFile(string $test): void {
$this->doTestWithResults($test, $test, []);
}

function doTestWithResults(string $test, string $result, $options = []) {
/** @param bool | array<string, bool | string> $options */
function doTestWithResults(string $test, string $result, $options = []): void {
$html = __DIR__ . "/html/$test.html";
$txt = __DIR__ . "/txt/$result.txt";
$this->assertTrue(file_exists($html), "File '{$html}' does not exist");
Expand All @@ -36,7 +37,8 @@ function doTestWithResults(string $test, string $result, $options = []) {
$this->assertEquals($expected, $output, "{$html} file failed to convert to {$txt}");
}

public function providerFiles() {
/** @return array<array<string>> */
public function providerFiles(): array {
return [
['basic'],
['anchors'],
Expand All @@ -62,34 +64,34 @@ public function providerFiles() {
];
}

public function testInvalidXML() {
public function testInvalidXML(): void {
$this->expectWarning();
$this->doTestWithResults("invalid", "invalid", ['ignore_errors' => false]);
}

public function testInvalidXMLIgnore() {
public function testInvalidXMLIgnore(): void {
$this->doTestWithResults("invalid", "invalid", ['ignore_errors' => true]);
}

public function testInvalidXMLIgnoreOldSyntax() {
public function testInvalidXMLIgnoreOldSyntax(): void {
// for BC, allow old #convert(text, bool) syntax
$this->doTestWithResults("invalid", "invalid", true);
}

public function testInvalidOption() {
public function testInvalidOption(): void {
$this->expectException(InvalidArgumentException::class);
$this->doTestWithResults("basic", "basic", ['invalid_option' => true]);
}

public function testBasicDropLinks() {
public function testBasicDropLinks(): void {
$this->doTestWithResults("basic", "basic.no-links", ['drop_links' => true]);
}

public function testAnchorsDropLinks() {
public function testAnchorsDropLinks(): void {
$this->doTestWithResults("anchors", "anchors.no-links", ['drop_links' => true]);
}

public function testWindows1252() {
public function testWindows1252(): void {
$this->doTestWithResults("windows-1252-example", "windows-1252-example", ['char_set' => 'windows-1252']);
}
}

0 comments on commit 83502b6

Please sign in to comment.