From c56bae71a8c2aa708f5399fe8435867e1a228e95 Mon Sep 17 00:00:00 2001 From: Timofey Potapov Date: Sat, 24 Jun 2023 18:16:35 +0200 Subject: [PATCH 1/3] Fixed: Mojo::DOM doesn't recognize end of comment (when it should) #2030. --- lib/Mojo/DOM/HTML.pm | 4 ++-- t/mojo/dom.t | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/lib/Mojo/DOM/HTML.pm b/lib/Mojo/DOM/HTML.pm index e10b1532dd..2ca814e6a0 100644 --- a/lib/Mojo/DOM/HTML.pm +++ b/lib/Mojo/DOM/HTML.pm @@ -29,7 +29,7 @@ my $TOKEN_RE = qr/ (?:\s+\[.+?\])? # Int Subset \s*) | - --(.*?)--\s* # Comment + --(.*?)-*\s* # Comment | \[CDATA\[(.*?)\]\] # CDATA ) @@ -37,7 +37,7 @@ my $TOKEN_RE = qr/ \?(.*?)\? # Processing Instruction | \s*((?:\/\s*)?[^<>\s\/0-9.\-][^<>\s\/]*\s*(?:(?:$ATTR_RE){0,32766})*+) # Tag - )> + ) (?(3)!?) > # Comment tag can end with a bang | (<) # Runaway "<" )?? diff --git a/t/mojo/dom.t b/t/mojo/dom.t index 151c4dfdf6..c017a52e05 100644 --- a/t/mojo/dom.t +++ b/t/mojo/dom.t @@ -2647,6 +2647,28 @@ EOF is $dom->tree->[3][1], ' bad idea -- HTML5 ', 'right comment'; is $dom->tree->[5][1], ' HTML4 ', 'right comment'; is $dom->tree->[7][1], ' bad idea -- HTML4 ', 'right comment'; + + # Issue #2030 + for ( + '', + '', + '', + ) { + my $dom = Mojo::DOM->new( "$_

OK

" ); + my $space = / / ? ' ' : ''; + is $dom->tree->[1][0], 'comment', "$_: have 1st comment node"; + is $dom->tree->[1][1], $space, "$_: have 1st comment string"; + is $dom->tree->[2][0], 'text', "$_: have text node"; + is $dom->tree->[2][1], ' ', "$_: have 1st text string"; + is $dom->tree->[3][0], 'tag', "$_: have 1st tag node"; + is $dom->tree->[3][1], 'p', "$_: have tag string"; + is $dom->tree->[3][4][0], 'text', "$_: have sub text node"; + is $dom->tree->[3][4][1], 'OK', "$_: have sub text string"; + is $dom->tree->[4][0], 'text', "$_: have 2nd text node"; + is $dom->tree->[4][1], ' ', "$_: have 2nd text string"; + is $dom->tree->[5][0], 'comment', "$_: have 2nd comment node"; + is $dom->tree->[5][1], ' ', "$_: have 2nd comment string"; + } }; subtest 'Huge number of attributes' => sub { From afc2433829fdc374b77e7b010f6511d268aa5ae5 Mon Sep 17 00:00:00 2001 From: Timofey Potapov Date: Sat, 24 Jun 2023 19:32:55 +0200 Subject: [PATCH 2/3] Fixed: Mojo::DOM treats "-- >" as end of comment (it shouldn't) #2029 --- lib/Mojo/DOM/HTML.pm | 9 +++++++-- t/mojo/dom.t | 15 +++++++++++---- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/lib/Mojo/DOM/HTML.pm b/lib/Mojo/DOM/HTML.pm index 2ca814e6a0..7f91a1915a 100644 --- a/lib/Mojo/DOM/HTML.pm +++ b/lib/Mojo/DOM/HTML.pm @@ -29,7 +29,12 @@ my $TOKEN_RE = qr/ (?:\s+\[.+?\])? # Int Subset \s*) | - --(.*?)-*\s* # Comment + --(?| # Comment + () -* !? (?=>) # Empty comment + | + (.*?) + --!?(?=>) # Comment end + ) | \[CDATA\[(.*?)\]\] # CDATA ) @@ -37,7 +42,7 @@ my $TOKEN_RE = qr/ \?(.*?)\? # Processing Instruction | \s*((?:\/\s*)?[^<>\s\/0-9.\-][^<>\s\/]*\s*(?:(?:$ATTR_RE){0,32766})*+) # Tag - ) (?(3)!?) > # Comment tag can end with a bang + )> # Comment tag can end with a bang | (<) # Runaway "<" )?? diff --git a/t/mojo/dom.t b/t/mojo/dom.t index c017a52e05..70c05485d0 100644 --- a/t/mojo/dom.t +++ b/t/mojo/dom.t @@ -2643,10 +2643,12 @@ subtest 'Comments' => sub { '); + is $dom->tree->[1][0], 'comment', 'have comment'; + is $dom->tree->[1][1], ' a > -- > b c ', 'entire string found in comment'; }; subtest 'Huge number of attributes' => sub { From 06387197859934b5ec5efec3be39701906cfd089 Mon Sep 17 00:00:00 2001 From: Timofey Potapov Date: Sat, 24 Jun 2023 21:58:21 +0200 Subject: [PATCH 3/3] Cleaned up a bit. --- lib/Mojo/DOM/HTML.pm | 9 ++------- t/mojo/dom.t | 48 +++++++++++++++++++------------------------- 2 files changed, 23 insertions(+), 34 deletions(-) diff --git a/lib/Mojo/DOM/HTML.pm b/lib/Mojo/DOM/HTML.pm index 7f91a1915a..5064c41a5d 100644 --- a/lib/Mojo/DOM/HTML.pm +++ b/lib/Mojo/DOM/HTML.pm @@ -29,12 +29,7 @@ my $TOKEN_RE = qr/ (?:\s+\[.+?\])? # Int Subset \s*) | - --(?| # Comment - () -* !? (?=>) # Empty comment - | - (.*?) - --!?(?=>) # Comment end - ) + --(?|()-*!?(?=>)|(.*?)--!?(?=>)) # Comment | \[CDATA\[(.*?)\]\] # CDATA ) @@ -42,7 +37,7 @@ my $TOKEN_RE = qr/ \?(.*?)\? # Processing Instruction | \s*((?:\/\s*)?[^<>\s\/0-9.\-][^<>\s\/]*\s*(?:(?:$ATTR_RE){0,32766})*+) # Tag - )> # Comment tag can end with a bang + )> | (<) # Runaway "<" )?? diff --git a/t/mojo/dom.t b/t/mojo/dom.t index 70c05485d0..e0e648f1ab 100644 --- a/t/mojo/dom.t +++ b/t/mojo/dom.t @@ -2645,37 +2645,31 @@ subtest 'Comments' => sub { EOF is $dom->tree->[1][1], ' HTML5 ', 'right comment'; is $dom->tree->[3][1], ' bad idea -- HTML5 ', 'right comment'; - is $dom->tree->[5][1], '<', 'not support html style comments'; - is $dom->tree->[6][1], "!-- HTML4 -- >\n", 'not support html style comments'; - is $dom->tree->[7][1], '<', 'not support html style comments'; - is $dom->tree->[8][1], "!-- bad idea -- HTML4 -- >\n", 'not support html style comments'; - - # Issue #2030 - for ( - '', - '', - '', - ) { - my $dom = Mojo::DOM->new( "$_

OK

" ); + is $dom->tree->[5][1], '<', 'wrong comment'; + is $dom->tree->[6][1], "!-- HTML4 -- >\n", 'wrong comment'; + is $dom->tree->[7][1], '<', 'wrong comment'; + is $dom->tree->[8][1], "!-- bad idea -- HTML4 -- >\n", 'wrong comment'; + + for ('', '', '') { + my $dom = Mojo::DOM->new("$_

OK

"); my $space = / / ? ' ' : ''; - is $dom->tree->[1][0], 'comment', "$_: have 1st comment node"; - is $dom->tree->[1][1], $space, "$_: have 1st comment string"; - is $dom->tree->[2][0], 'text', "$_: have text node"; - is $dom->tree->[2][1], ' ', "$_: have 1st text string"; - is $dom->tree->[3][0], 'tag', "$_: have 1st tag node"; - is $dom->tree->[3][1], 'p', "$_: have tag string"; - is $dom->tree->[3][4][0], 'text', "$_: have sub text node"; - is $dom->tree->[3][4][1], 'OK', "$_: have sub text string"; - is $dom->tree->[4][0], 'text', "$_: have 2nd text node"; - is $dom->tree->[4][1], ' ', "$_: have 2nd text string"; - is $dom->tree->[5][0], 'comment', "$_: have 2nd comment node"; - is $dom->tree->[5][1], ' ', "$_: have 2nd comment string"; + is $dom->tree->[1][0], 'comment', "right node"; + is $dom->tree->[1][1], $space, "right text"; + is $dom->tree->[2][0], 'text', "right node"; + is $dom->tree->[2][1], ' ', "right text"; + is $dom->tree->[3][0], 'tag', "right node"; + is $dom->tree->[3][1], 'p', "right text"; + is $dom->tree->[3][4][0], 'text', "right node"; + is $dom->tree->[3][4][1], 'OK', "right text"; + is $dom->tree->[4][0], 'text', "right node"; + is $dom->tree->[4][1], ' ', "right text"; + is $dom->tree->[5][0], 'comment', "right node"; + is $dom->tree->[5][1], ' ', "right text"; } - # Issue #2029 $dom = Mojo::DOM->new(''); - is $dom->tree->[1][0], 'comment', 'have comment'; - is $dom->tree->[1][1], ' a > -- > b c ', 'entire string found in comment'; + is $dom->tree->[1][0], 'comment', 'right node'; + is $dom->tree->[1][1], ' a > -- > b c ', 'right text'; }; subtest 'Huge number of attributes' => sub {