Skip to content

Commit

Permalink
Merge pull request #952 from thephpleague/fix-mbstring-usage
Browse files Browse the repository at this point in the history
Fix mbstring calls without explicit encoding
  • Loading branch information
colinodell authored Dec 10, 2022
2 parents 8e37e27 + 9f6b98a commit 733b51f
Show file tree
Hide file tree
Showing 16 changed files with 169 additions and 15 deletions.
7 changes: 7 additions & 0 deletions .github/workflows/backwards-compatibility.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@ jobs:
with:
fetch-depth: 0

# User in the container seems to be different than the cloned repo's owner.
# Git doesn't like that as the repo will then be unusable by the owner.
# We don't care about this here since this is only used for running one test.
# See https://github.com/actions/runner/issues/2033
- name: Workaround directory permissions
run: mkdir -p /home/runner/work/_temp/_github_home && printf "[safe]\n\tdirectory = /github/workspace" > /home/runner/work/_temp/_github_home/.gitconfig

- name: "BC Check"
uses: docker://nyholm/roave-bc-check-ga
with:
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@ Updates should follow the [Keep a CHANGELOG](https://keepachangelog.com/) princi

## [Unreleased][unreleased]

### Fixed

- Fixed parsing issues when `mb_internal_encoding()` is set to something other than `UTF-8` (#951)

## [2.3.7] - 2022-11-03

### Fixed
Expand Down
3 changes: 2 additions & 1 deletion composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@
"autoload-dev": {
"psr-4": {
"League\\CommonMark\\Tests\\Unit\\": "tests/unit",
"League\\CommonMark\\Tests\\Functional\\": "tests/functional"
"League\\CommonMark\\Tests\\Functional\\": "tests/functional",
"League\\CommonMark\\Tests\\PHPStan\\": "tests/phpstan"
}
},
"scripts": {
Expand Down
2 changes: 2 additions & 0 deletions phpstan.neon.dist
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ parameters:
message: '#Parameter .+ of class .+Reference constructor expects string, string\|null given#'
- path: src/Util/RegexHelper.php
message: '#Method .+RegexHelper::unescape\(\) should return string but returns string\|null#'
rules:
- 'League\CommonMark\Tests\PHPStan\MbstringFunctionCallRule'
2 changes: 1 addition & 1 deletion src/Extension/Autolink/UrlAutolinkParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ public function parse(InlineParserContext $inlineContext): bool
$url = \substr($url, 0, -$diff);
}

$cursor->advanceBy(\mb_strlen($url));
$cursor->advanceBy(\mb_strlen($url, 'UTF-8'));

// Auto-prefix 'http://' onto 'www' URLs
if (\substr($url, 0, 4) === 'www.') {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ public function render(Node $node, ChildNodeRendererInterface $childRenderer): s

$attrs->append('class', $this->config->get('footnote/backref_class'));
$attrs->set('rev', 'footnote');
$attrs->set('href', \mb_strtolower($node->getReference()->getDestination()));
$attrs->set('href', \mb_strtolower($node->getReference()->getDestination(), 'UTF-8'));
$attrs->set('role', 'doc-backlink');

$symbol = $this->config->get('footnote/backref_symbol');
Expand Down
4 changes: 2 additions & 2 deletions src/Extension/Footnote/Renderer/FootnoteRefRenderer.php
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,15 @@ public function render(Node $node, ChildNodeRendererInterface $childRenderer): \

$attrs = $node->data->getData('attributes');
$attrs->append('class', $this->config->get('footnote/ref_class'));
$attrs->set('href', \mb_strtolower($node->getReference()->getDestination()));
$attrs->set('href', \mb_strtolower($node->getReference()->getDestination(), 'UTF-8'));
$attrs->set('role', 'doc-noteref');

$idPrefix = $this->config->get('footnote/ref_id_prefix');

return new HtmlElement(
'sup',
[
'id' => $idPrefix . \mb_strtolower($node->getReference()->getLabel()),
'id' => $idPrefix . \mb_strtolower($node->getReference()->getLabel(), 'UTF-8'),
],
new HtmlElement(
'a',
Expand Down
2 changes: 1 addition & 1 deletion src/Extension/Footnote/Renderer/FootnoteRenderer.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ public function render(Node $node, ChildNodeRendererInterface $childRenderer): \
$attrs = $node->data->getData('attributes');

$attrs->append('class', $this->config->get('footnote/footnote_class'));
$attrs->set('id', $this->config->get('footnote/footnote_id_prefix') . \mb_strtolower($node->getReference()->getLabel()));
$attrs->set('id', $this->config->get('footnote/footnote_id_prefix') . \mb_strtolower($node->getReference()->getLabel(), 'UTF-8'));
$attrs->set('role', 'doc-endnote');

return new HtmlElement(
Expand Down
4 changes: 2 additions & 2 deletions src/Normalizer/SlugNormalizer.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ public function normalize(string $text, array $context = []): string
// Trim whitespace
$slug = \trim($slug);
// Convert to lowercase
$slug = \mb_strtolower($slug);
$slug = \mb_strtolower($slug, 'UTF-8');
// Try replacing whitespace with a dash
$slug = \preg_replace('/\s+/u', '-', $slug) ?? $slug;
// Try removing characters other than letters, numbers, and marks.
$slug = \preg_replace('/[^\p{L}\p{Nd}\p{Nl}\p{M}-]+/u', '', $slug) ?? $slug;
// Trim to requested length if given
if ($length = $context['length'] ?? $this->defaultMaxLength) {
$slug = \mb_substr($slug, 0, $length);
$slug = \mb_substr($slug, 0, $length, 'UTF-8');
}

return $slug;
Expand Down
2 changes: 1 addition & 1 deletion src/Parser/InlineParserContext.php
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ public function getFullMatch(): string
*/
public function getFullMatchLength(): int
{
return \mb_strlen($this->matches[0]);
return \mb_strlen($this->matches[0], 'UTF-8');
}

/**
Expand Down
4 changes: 2 additions & 2 deletions src/Parser/InlineParserEngine.php
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public function __construct(EnvironmentInterface $environment, ReferenceMapInter
\assert($parser instanceof InlineParserInterface);
$regex = $parser->getMatchDefinition()->getRegex();

$this->parsers[] = [$parser, $regex, \strlen($regex) !== \mb_strlen($regex)];
$this->parsers[] = [$parser, $regex, \strlen($regex) !== \mb_strlen($regex, 'UTF-8')];
}
}

Expand Down Expand Up @@ -134,7 +134,7 @@ private function addPlainText(string $text, AbstractBlock $container): void
private function matchParsers(string $contents): array
{
$contents = \trim($contents);
$isMultibyte = \mb_strlen($contents) !== \strlen($contents);
$isMultibyte = \mb_strlen($contents, 'UTF-8') !== \strlen($contents);

$ret = [];

Expand Down
2 changes: 1 addition & 1 deletion src/Reference/ReferenceParser.php
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ private function parseLabel(Cursor $cursor): bool
$cursor->advance();

// spec: A link label can have at most 999 characters inside the square brackets
if (\mb_strlen($this->label, 'utf-8') > 999) {
if (\mb_strlen($this->label, 'UTF-8') > 999) {
return false;
}

Expand Down
2 changes: 1 addition & 1 deletion src/Util/LinkParserHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ public static function parseLinkLabel(Cursor $cursor): int
return 0;
}

$length = \mb_strlen($match, 'utf-8');
$length = \mb_strlen($match, 'UTF-8');

if ($length > 1001) {
return 0;
Expand Down
4 changes: 2 additions & 2 deletions src/Util/RegexHelper.php
Original file line number Diff line number Diff line change
Expand Up @@ -104,13 +104,13 @@ public static function isLetter(?string $character): bool
public static function matchAt(string $regex, string $string, int $offset = 0): ?int
{
$matches = [];
$string = \mb_substr($string, $offset, null, 'utf-8');
$string = \mb_substr($string, $offset, null, 'UTF-8');
if (! \preg_match($regex, $string, $matches, \PREG_OFFSET_CAPTURE)) {
return null;
}

// PREG_OFFSET_CAPTURE always returns the byte offset, not the char offset, which is annoying
$charPos = \mb_strlen(\mb_strcut($string, 0, $matches[0][1], 'utf-8'), 'utf-8');
$charPos = \mb_strlen(\mb_strcut($string, 0, $matches[0][1], 'UTF-8'), 'UTF-8');

return $offset + $charPos;
}
Expand Down
108 changes: 108 additions & 0 deletions tests/phpstan/MbstringFunctionCallRule.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
<?php

declare(strict_types=1);

/*
* This file is part of the league/commonmark package.
*
* (c) Colin O'Dell <[email protected]>
*
* For the full copyright and license information, please view the LICENSE
* file that was distributed with this source code.
*/

namespace League\CommonMark\Tests\PHPStan;

use PHPStan\Analyser\Scope;
use PHPStan\Rules\Rule;
use PhpParser\Node;

/**
* Custom phpstan rule that:
*
* 1. Disallows the use of certain mbstring functions that could be problematic
* 2. Requires an explicit encoding be provided to all `mb_*()` functions that support it
*/
final class MbstringFunctionCallRule implements Rule
{
private array $disallowedFunctionsThatAlterGlobalSettings = [
'mb_internal_encoding',
'mb_regex_encoding',
'mb_detect_order',
'mb_language',
];

private array $encodingParamPositionCache = [];

public function getNodeType(): string
{
return Node\Expr\FuncCall::class;
}

public function processNode(Node $node, Scope $scope): array
{
if (! $node instanceof Node\Expr\FuncCall) {
return [];
}

if (! $node->name instanceof Node\Name) {
return [];
}

$functionName = $node->name->toString();
if (! str_starts_with($functionName, 'mb_')) {
return [];
}

if (\in_array($functionName, $this->disallowedFunctionsThatAlterGlobalSettings, true)) {
return [\sprintf('Use of %s() is not allowed in this library because it alters global settings', $functionName)];
}

$encodingParamPosition = $this->getEncodingParamPosition($functionName);
if ($encodingParamPosition === null) {
return [];
}

$arg = $node->args[$encodingParamPosition] ?? null;
if ($arg === null) {
return [\sprintf('%s() is missing the $encoding param (should be "UTF-8")', $functionName)];
}

if (! $arg instanceof Node\Arg) {
return [];
}

$encodingArg = $arg->value;
if (! ($encodingArg instanceof Node\Scalar\String_)) {
return [\sprintf('%s() must define the $encoding as "UTF-8"', $functionName)];
}

if ($encodingArg->value !== 'UTF-8') {
return [\sprintf('%s() must define the $encoding as "UTF-8", not "%s"', $functionName, $encodingArg->value)];
}

return [];
}

private function getEncodingParamPosition(string $function): ?int
{
if (isset($this->encodingParamPositionCache[$function])) {
return $this->encodingParamPositionCache[$function];
}

$reflection = new \ReflectionFunction($function);
$params = $reflection->getParameters();

$encodingParamPosition = null;
foreach ($params as $i => $param) {
if ($param->getName() === 'encoding') {
$encodingParamPosition = $i;
break;
}
}

$this->encodingParamPositionCache[$function] = $encodingParamPosition;

return $encodingParamPosition;
}
}
32 changes: 32 additions & 0 deletions tests/unit/Parser/InlineParserEngineTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@
namespace League\CommonMark\Tests\Unit\Parser;

use League\CommonMark\Environment\Environment;
use League\CommonMark\Extension\CommonMark\Node\Inline\Link;
use League\CommonMark\Extension\CommonMark\Parser\Inline\CloseBracketParser;
use League\CommonMark\Extension\CommonMark\Parser\Inline\OpenBracketParser;
use League\CommonMark\Node\Block\Paragraph;
use League\CommonMark\Node\Inline\Text;
use League\CommonMark\Parser\Inline\InlineParserMatch;
Expand Down Expand Up @@ -76,4 +79,33 @@ public function testParseWithNoInlineParsers(): void
$this->assertTrue($child instanceof Text);
$this->assertSame('The quick brown fox jumps over the lazy dog', $child->getLiteral());
}

/**
* @see https://github.com/thephpleague/commonmark/issues/951
*
* @runInSeparateProcess to avoid polluting the global environment
*/
public function testMultibyteDetectionRegressionFromIssue951(): void
{
\mb_internal_encoding('iso-8859-1'); // @phpstan-ignore-line

$environment = new Environment();
$environment->addInlineParser(new CloseBracketParser(), 30);
$environment->addInlineParser(new OpenBracketParser(), 20);

$engine = new InlineParserEngine($environment, new ReferenceMap());
$paragraph = new Paragraph();
$engine->parse('AAA ÀÀ [label](https://url)', $paragraph);

$this->assertCount(2, $paragraph->children());

$text = $paragraph->firstChild();
$this->assertTrue($text instanceof Text);
$this->assertSame('AAA ÀÀ ', $text->getLiteral());

$link = $paragraph->lastChild();
$this->assertTrue($link instanceof Link);
$this->assertSame('label', $link->firstChild()->getLiteral());
$this->assertSame('https://url', $link->getUrl());
}
}

0 comments on commit 733b51f

Please sign in to comment.