From bdedef6e1e60605fd1e58f504e1b0c760c6d99ba Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Fri, 12 Jan 2018 11:05:44 -0800 Subject: [PATCH 1/8] adds a bunch of broken tests for #52 --- lib/XRay/Formats/Format.php | 15 +-- lib/XRay/Formats/Mf2.php | 34 ++++-- tests/SanitizeTest.php | 100 ++++++++++++++++++ tests/data/sanitize.example/cleverdevil | 47 ++++++++ .../entry-with-img-no-implied-photo | 14 +++ tests/data/sanitize.example/photo-in-content | 14 +++ .../photo-in-content-empty-alt | 14 +++ .../photo-in-content-with-alt | 14 +++ tests/data/sanitize.example/photos-with-alt | 73 +++++++++++++ 9 files changed, 311 insertions(+), 14 deletions(-) create mode 100644 tests/data/sanitize.example/cleverdevil create mode 100644 tests/data/sanitize.example/entry-with-img-no-implied-photo create mode 100644 tests/data/sanitize.example/photo-in-content create mode 100644 tests/data/sanitize.example/photo-in-content-empty-alt create mode 100644 tests/data/sanitize.example/photo-in-content-with-alt create mode 100644 tests/data/sanitize.example/photos-with-alt diff --git a/lib/XRay/Formats/Format.php b/lib/XRay/Formats/Format.php index fc481ea..dff38d4 100644 --- a/lib/XRay/Formats/Format.php +++ b/lib/XRay/Formats/Format.php @@ -34,10 +34,8 @@ protected static function _loadHTML($html) { return [$doc, $xpath]; } - protected static function sanitizeHTML($html) { - $config = HTMLPurifier_Config::createDefault(); - $config->set('Cache.DefinitionImpl', null); - $config->set('HTML.AllowedElements', [ + protected static function sanitizeHTML($html, $allowImg=true) { + $allowed = [ 'a', 'abbr', 'b', @@ -45,7 +43,6 @@ protected static function sanitizeHTML($html) { 'del', 'em', 'i', - 'img', 'q', 'strike', 'strong', @@ -62,7 +59,13 @@ protected static function sanitizeHTML($html) { 'ul', 'li', 'ol' - ]); + ]; + if($allowImg) + $allowed[] = 'img'; + + $config = HTMLPurifier_Config::createDefault(); + $config->set('Cache.DefinitionImpl', null); + $config->set('HTML.AllowedElements', $allowed); $def = $config->getHTMLDefinition(true); $def->addElement( 'time', diff --git a/lib/XRay/Formats/Mf2.php b/lib/XRay/Formats/Mf2.php index 7c41e22..675a328 100644 --- a/lib/XRay/Formats/Mf2.php +++ b/lib/XRay/Formats/Mf2.php @@ -220,21 +220,39 @@ private static function parseHTMLValue($property, $item) { $textContent = $content; } elseif(!is_string($content) && is_array($content) && array_key_exists('value', $content)) { if(array_key_exists('html', $content)) { - $htmlContent = trim(self::sanitizeHTML($content['html'])); - #$textContent = trim(str_replace(" ","\r",strip_tags($htmlContent))); + // Only allow images in the content if there is no photo property set + if(isset($item['properties']['photo'])) + $allowImg = false; + else + $allowImg = true; + + $htmlContent = trim(self::sanitizeHTML($content['html'], $allowImg)); $textContent = trim(str_replace(" ","\r",$content['value'])); } else { $textContent = trim($content['value']); } } - $data = [ - 'text' => $textContent - ]; - if($htmlContent && $textContent != $htmlContent) { - $data['html'] = $htmlContent; + if($textContent || $htmlContent) { + $data = [ + 'text' => $textContent + ]; + // Only add HTML content if there is actual content. + // If the text content ends up empty, then the HTML should be too + // e.g.
+ // should not return content of + // TODO: still need to remove empty tags when there is other text in the content + if($htmlContent && $textContent && $textContent != $htmlContent) { + $data['html'] = $htmlContent; + } + + if(!$data['text']) + return null; + + return $data; + } else { + return null; } - return $data; } // Always return arrays, and may contain plaintext content diff --git a/tests/SanitizeTest.php b/tests/SanitizeTest.php index 54fdcf6..a06a84a 100644 --- a/tests/SanitizeTest.php +++ b/tests/SanitizeTest.php @@ -151,4 +151,104 @@ public function testSanitizeEmailAuthorURL() { $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->author->photo); } + public function testPhotoInContent() { + // https://github.com/aaronpk/XRay/issues/52 + + $url = 'http://sanitize.example/photo-in-content'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + #print_r($data->data); + + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->html); + } + + public function testPhotoInContentEmptyAltAttribute() { + // https://github.com/aaronpk/XRay/issues/52 + + $url = 'http://sanitize.example/photo-in-content-empty-alt'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->html); + } + + public function testPhotoInContentWithAlt() { + // https://github.com/aaronpk/XRay/issues/52 + + $url = 'http://sanitize.example/photo-in-content-with-alt'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->html); + } + + public function testPhotoInContentWithNoText() { + $url = 'http://sanitize.example/cleverdevil'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectHasAttribute('name', $data->data); + $this->assertEquals('Oh, how well they know me! 🥃', $data->data->name); + $this->assertObjectNotHasAttribute('content', $data->data); + $this->assertEquals('https://cleverdevil.io/file/5bf2fa91c3d4c592f9978200923cb56e/thumb.jpg', $data->data->photo[0]); + } + + public function testPhotosWithAlt() { + // https://github.com/microformats/microformats2-parsing/issues/16 + + $url = 'http://sanitize.example/photos-with-alt'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + #print_r($data->data); + + $this->assertEquals('🌆 Made it to the first #NPSF #earlygang of the year, did in-betweeners abs, and 6:30 workout with a brutal burnout that was really its own workout. But wow pretty sunrise. Plus 50+ deg F? I’ll take it. #100PDPD#justshowup #darknesstodawn #wakeupthesun #fromwhereirun #NovemberProject #sunrise #latergram #nofilter', $data->data->content->text); + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('https://igx.4sqi.net/img/general/original/476_g7yruXflacsGr7PyVmECefyTBMB_R99zmPQxW7pftzA.jpg', $data->data->photo[0]); + $this->assertEquals('https://igx.4sqi.net/img/general/original/476_zM3UgU9JHNhom907Ac_1WCEcUhGOJZaNWGlRmev86YA.jpg', $data->data->photo[1]); + } + + public function testEntryWithImgNoImpliedPhoto() { + // See https://github.com/microformats/microformats2-parsing/issues/6#issuecomment-357286985 + // and https://github.com/aaronpk/XRay/issues/52#issuecomment-357269683 + // and https://github.com/microformats/microformats2-parsing/issues/16 + $url = 'http://sanitize.example/entry-with-img-no-implied-photo'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectNotHasAttribute('photo', $data->data); + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed. a photo', $data->data->content->html); + } + } diff --git a/tests/data/sanitize.example/cleverdevil b/tests/data/sanitize.example/cleverdevil new file mode 100644 index 0000000..fe9809d --- /dev/null +++ b/tests/data/sanitize.example/cleverdevil @@ -0,0 +1,47 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +
+
+ +
 
+
+ +
+

Oh, how well they know me! 🥃 +

+ +
+ +
+ Oh, how well they know me! 🥃 +
+ + diff --git a/tests/data/sanitize.example/entry-with-img-no-implied-photo b/tests/data/sanitize.example/entry-with-img-no-implied-photo new file mode 100644 index 0000000..f788e57 --- /dev/null +++ b/tests/data/sanitize.example/entry-with-img-no-implied-photo @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed. a photo

+ + diff --git a/tests/data/sanitize.example/photo-in-content b/tests/data/sanitize.example/photo-in-content new file mode 100644 index 0000000..7e9258d --- /dev/null +++ b/tests/data/sanitize.example/photo-in-content @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content.

+ + diff --git a/tests/data/sanitize.example/photo-in-content-empty-alt b/tests/data/sanitize.example/photo-in-content-empty-alt new file mode 100644 index 0000000..c55581d --- /dev/null +++ b/tests/data/sanitize.example/photo-in-content-empty-alt @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content.

+ + diff --git a/tests/data/sanitize.example/photo-in-content-with-alt b/tests/data/sanitize.example/photo-in-content-with-alt new file mode 100644 index 0000000..8b95892 --- /dev/null +++ b/tests/data/sanitize.example/photo-in-content-with-alt @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content. a photo

+ + diff --git a/tests/data/sanitize.example/photos-with-alt b/tests/data/sanitize.example/photos-with-alt new file mode 100644 index 0000000..41853cb --- /dev/null +++ b/tests/data/sanitize.example/photos-with-alt @@ -0,0 +1,73 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + +
+

tantek.com

+ + + +
+ +
+ +
+ + +

a jpg. a jpg. 🌆 Made it to the first #NPSF #earlygang of the year, did in-betweeners abs, and 6:30 workout with a brutal burnout that was really its own workout. But wow pretty sunrise. Plus 50+ deg F? I’ll take it. #100PDPD

#justshowup #darknesstodawn #wakeupthesun #fromwhereirun #NovemberProject #sunrise #latergram #nofilter

+ + on + + (ttk.me t4sE3) + using BBEdit + + + + + From 44770396f9d1d6f6f6dfb7c59959c62601a6467f Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Fri, 12 Jan 2018 11:06:04 -0800 Subject: [PATCH 2/8] add test to ensure a content property is not returned unless it is defined --- tests/ParseTest.php | 11 +++++++++++ tests/data/source.example.com/h-entry-no-content | 15 +++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 tests/data/source.example.com/h-entry-no-content diff --git a/tests/ParseTest.php b/tests/ParseTest.php index ab01e38..fed9132 100644 --- a/tests/ParseTest.php +++ b/tests/ParseTest.php @@ -247,6 +247,17 @@ public function testSyndicationIsURL() { $this->assertEquals('http://syndicated.example/', $data['data']['syndication'][0]); } + public function testHEntryNoContent() { + $url = 'http://source.example.com/h-entry-no-content'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + $this->assertObjectNotHasAttribute('content', $data->data); + $this->assertEquals('This is a Post', $data->data->name); + } + public function testHEntryIsNotFirstObject() { $url = 'http://source.example.com/h-entry-is-not-first'; $response = $this->parse(['url' => $url]); diff --git a/tests/data/source.example.com/h-entry-no-content b/tests/data/source.example.com/h-entry-no-content new file mode 100644 index 0000000..da52466 --- /dev/null +++ b/tests/data/source.example.com/h-entry-no-content @@ -0,0 +1,15 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a Post

+ permalink + + From 66adfbe2f8ea311d392674ac4067f5487544fe28 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Fri, 12 Jan 2018 12:05:32 -0800 Subject: [PATCH 3/8] run name/content dedupe before munging HTML fix for #53 --- lib/XRay/Formats/Format.php | 10 +++++++++- lib/XRay/Formats/Mf2.php | 29 ++++++++++++++++++++++++----- tests/SanitizeTest.php | 8 ++++++-- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/lib/XRay/Formats/Format.php b/lib/XRay/Formats/Format.php index dff38d4..41637ef 100644 --- a/lib/XRay/Formats/Format.php +++ b/lib/XRay/Formats/Format.php @@ -84,8 +84,16 @@ protected static function sanitizeHTML($html, $allowImg=true) { return trim($sanitized); } + // Return a plaintext version of the input HTML protected static function stripHTML($html) { - return trim(strip_tags($html)); + $config = HTMLPurifier_Config::createDefault(); + $config->set('Cache.DefinitionImpl', null); + $config->set('HTML.AllowedElements', ['br']); + $purifier = new HTMLPurifier($config); + $sanitized = $purifier->purify($html); + $sanitized = str_replace(" ","\r",$sanitized); + $sanitized = html_entity_decode($sanitized); + return trim(str_replace('
',"\n", $sanitized)); } diff --git a/lib/XRay/Formats/Mf2.php b/lib/XRay/Formats/Mf2.php index 675a328..bc25753 100644 --- a/lib/XRay/Formats/Mf2.php +++ b/lib/XRay/Formats/Mf2.php @@ -227,7 +227,8 @@ private static function parseHTMLValue($property, $item) { $allowImg = true; $htmlContent = trim(self::sanitizeHTML($content['html'], $allowImg)); - $textContent = trim(str_replace(" ","\r",$content['value'])); + #$textContent = trim(str_replace(" ","\r",$content['value'])); + $textContent = trim(self::stripHTML($htmlContent)); } else { $textContent = trim($content['value']); } @@ -339,10 +340,13 @@ private static function determineNameAndContent($item, &$data) { $textContent = null; $htmlContent = null; - $content = self::parseHTMLValue('content', $item); - if($content) { + $content = self::getHTMLValue($item, 'content'); + + if(is_string($content)) { + $textContent = $content; + } elseif($content) { $htmlContent = array_key_exists('html', $content) ? $content['html'] : null; - $textContent = array_key_exists('text', $content) ? $content['text'] : null; + $textContent = array_key_exists('value', $content) ? $content['value'] : null; } if($content) { @@ -365,8 +369,9 @@ private static function determineNameAndContent($item, &$data) { // If there is content, always return the plaintext content, and return HTML content if it's different if($content) { + $content = self::parseHTMLValue('content', $item); $data['content']['text'] = $content['text']; - if(array_key_exists('html', $content)) + if(isset($content['html'])) $data['content']['html'] = $content['html']; } } @@ -762,6 +767,20 @@ private static function getPlaintext($mf2, $k, $fallback=null) { return $fallback; } + private static function getHTMLValue($mf2, $k, $fallback=null) { + // Return an array with html and value if the value is html, otherwise return a string + if(!empty($mf2['properties'][$k]) and is_array($mf2['properties'][$k])) { + // $mf2['properties'][$v] will always be an array since the input was from the mf2 parser + $value = $mf2['properties'][$k][0]; + if(is_string($value)) { + return $value; + } elseif(isset($value['html'])) { + return $value; + } + } + return $fallback; + } + private static function getPlaintextValues($mf2, $k, $values=[]) { if(!empty($mf2['properties'][$k]) and is_array($mf2['properties'][$k])) { foreach($mf2['properties'][$k] as $value) { diff --git a/tests/SanitizeTest.php b/tests/SanitizeTest.php index a06a84a..56851a7 100644 --- a/tests/SanitizeTest.php +++ b/tests/SanitizeTest.php @@ -151,7 +151,7 @@ public function testSanitizeEmailAuthorURL() { $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->author->photo); } - public function testPhotoInContent() { + public function testPhotoInContentNoAlt() { // https://github.com/aaronpk/XRay/issues/52 $url = 'http://sanitize.example/photo-in-content'; @@ -161,7 +161,11 @@ public function testPhotoInContent() { $this->assertEquals(200, $response->getStatusCode()); $data = json_decode($body); - #print_r($data->data); + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); + $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->html); + } $this->assertObjectNotHasAttribute('name', $data->data); $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); From 85c2b9b15fa9a72ef00bac04d97fd7436dbeb673 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Fri, 12 Jan 2018 12:07:43 -0800 Subject: [PATCH 4/8] add failing test for `p-content` containing an `u-photo` --- tests/SanitizeTest.php | 12 ++++++++++++ tests/data/sanitize.example/photo-in-text-content | 14 ++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 tests/data/sanitize.example/photo-in-text-content diff --git a/tests/SanitizeTest.php b/tests/SanitizeTest.php index 56851a7..62e499a 100644 --- a/tests/SanitizeTest.php +++ b/tests/SanitizeTest.php @@ -167,6 +167,18 @@ public function testPhotoInContentNoAlt() { $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->html); } + public function testPhotoInTextContentNoAlt() { + // https://github.com/aaronpk/XRay/issues/56 + + $url = 'http://sanitize.example/photo-in-text-content'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + echo json_encode($data, JSON_PRETTY_PRINT|JSON_UNESCAPED_SLASHES); + $this->assertObjectNotHasAttribute('name', $data->data); $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); diff --git a/tests/data/sanitize.example/photo-in-text-content b/tests/data/sanitize.example/photo-in-text-content new file mode 100644 index 0000000..36dd2b2 --- /dev/null +++ b/tests/data/sanitize.example/photo-in-text-content @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

This is a photo post with an img tag inside the content.

+ + From 150683e1a747e1b73bc8905ca084865b244eda67 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Fri, 12 Jan 2018 12:34:56 -0800 Subject: [PATCH 5/8] fix error when mailto links are encountered --- lib/helpers.php | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/helpers.php b/lib/helpers.php index d3a503f..8a90a9c 100644 --- a/lib/helpers.php +++ b/lib/helpers.php @@ -11,7 +11,8 @@ function normalize_url($url) { $parts = parse_url($url); if(empty($parts['path'])) $parts['path'] = '/'; - $parts['host'] = strtolower($parts['host']); + if(isset($parts['host'])) + $parts['host'] = strtolower($parts['host']); return build_url($parts); } From 3ac38f9dbf22da6a25194ce6eff51837c87fd73d Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Fri, 12 Jan 2018 12:35:25 -0800 Subject: [PATCH 6/8] add simple case of Known markup for #57 --- tests/SanitizeTest.php | 18 ++++++++++++++++-- .../sanitize.example/photo-with-dupe-name-alt | 15 +++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 tests/data/sanitize.example/photo-with-dupe-name-alt diff --git a/tests/SanitizeTest.php b/tests/SanitizeTest.php index 62e499a..e26b1bd 100644 --- a/tests/SanitizeTest.php +++ b/tests/SanitizeTest.php @@ -177,8 +177,6 @@ public function testPhotoInTextContentNoAlt() { $this->assertEquals(200, $response->getStatusCode()); $data = json_decode($body); - echo json_encode($data, JSON_PRETTY_PRINT|JSON_UNESCAPED_SLASHES); - $this->assertObjectNotHasAttribute('name', $data->data); $this->assertEquals('http://target.example.com/photo.jpg', $data->data->photo[0]); $this->assertEquals('This is a photo post with an img tag inside the content.', $data->data->content->text); @@ -224,6 +222,22 @@ public function testPhotoInContentWithNoText() { $body = $response->getContent(); $this->assertEquals(200, $response->getStatusCode()); $data = json_decode($body); +print_r($data); + + $this->assertObjectHasAttribute('name', $data->data); + $this->assertEquals('Oh, how well they know me! 🥃', $data->data->name); + $this->assertObjectNotHasAttribute('content', $data->data); + $this->assertEquals('https://cleverdevil.io/file/5bf2fa91c3d4c592f9978200923cb56e/thumb.jpg', $data->data->photo[0]); + } + + public function testPhotoWithDupeNameAndAlt() { + $url = 'http://sanitize.example/photo-with-dupe-name-alt'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); +echo json_encode($data->data, JSON_PRETTY_PRINT+JSON_UNESCAPED_SLASHES); $this->assertObjectHasAttribute('name', $data->data); $this->assertEquals('Oh, how well they know me! 🥃', $data->data->name); diff --git a/tests/data/sanitize.example/photo-with-dupe-name-alt b/tests/data/sanitize.example/photo-with-dupe-name-alt new file mode 100644 index 0000000..85a203c --- /dev/null +++ b/tests/data/sanitize.example/photo-with-dupe-name-alt @@ -0,0 +1,15 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

Photo caption

+

Photo caption

+ + From 4d65b1ca1e97128bcb45819bb703c0f3ab6f5fe9 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Fri, 12 Jan 2018 12:47:02 -0800 Subject: [PATCH 7/8] if removing the img results in empty content, put the name value back closes #57 --- lib/XRay/Formats/Mf2.php | 22 +++++++++++----- tests/SanitizeTest.php | 25 +++++++++++++++---- .../photo-with-dupe-name-alt-2 | 14 +++++++++++ 3 files changed, 50 insertions(+), 11 deletions(-) create mode 100644 tests/data/sanitize.example/photo-with-dupe-name-alt-2 diff --git a/lib/XRay/Formats/Mf2.php b/lib/XRay/Formats/Mf2.php index bc25753..5eab18e 100644 --- a/lib/XRay/Formats/Mf2.php +++ b/lib/XRay/Formats/Mf2.php @@ -349,6 +349,7 @@ private static function determineNameAndContent($item, &$data) { $textContent = array_key_exists('value', $content) ? $content['value'] : null; } + $checkedname = $name; if($content) { // Trim ellipses from the name $name = preg_replace('/ ?(\.\.\.|…)$/', '', $name); @@ -359,20 +360,29 @@ private static function determineNameAndContent($item, &$data) { // Check if the name is a prefix of the content if($contentCompare && $nameCompare && strpos($contentCompare, $nameCompare) === 0) { - $name = null; + $checkedname = null; } } - if($name) { - $data['name'] = $name; + if($checkedname) { + $data['name'] = $checkedname; } // If there is content, always return the plaintext content, and return HTML content if it's different if($content) { $content = self::parseHTMLValue('content', $item); - $data['content']['text'] = $content['text']; - if(isset($content['html'])) - $data['content']['html'] = $content['html']; + if($content['text']) { + $data['content']['text'] = $content['text']; + if(isset($content['html'])) + $data['content']['html'] = $content['html']; + } else { + // If the content text was blank because the img was removed and that was the only content, + // then put the name back as the name if it was previously set. + // See https://github.com/aaronpk/XRay/issues/57 + if($name) { + $data['name'] = $name; + } + } } } diff --git a/tests/SanitizeTest.php b/tests/SanitizeTest.php index e26b1bd..d8e31a1 100644 --- a/tests/SanitizeTest.php +++ b/tests/SanitizeTest.php @@ -222,7 +222,6 @@ public function testPhotoInContentWithNoText() { $body = $response->getContent(); $this->assertEquals(200, $response->getStatusCode()); $data = json_decode($body); -print_r($data); $this->assertObjectHasAttribute('name', $data->data); $this->assertEquals('Oh, how well they know me! 🥃', $data->data->name); @@ -230,19 +229,35 @@ public function testPhotoInContentWithNoText() { $this->assertEquals('https://cleverdevil.io/file/5bf2fa91c3d4c592f9978200923cb56e/thumb.jpg', $data->data->photo[0]); } - public function testPhotoWithDupeNameAndAlt() { + public function testPhotoWithDupeNameAndAlt1() { $url = 'http://sanitize.example/photo-with-dupe-name-alt'; $response = $this->parse(['url' => $url]); $body = $response->getContent(); $this->assertEquals(200, $response->getStatusCode()); $data = json_decode($body); -echo json_encode($data->data, JSON_PRETTY_PRINT+JSON_UNESCAPED_SLASHES); $this->assertObjectHasAttribute('name', $data->data); - $this->assertEquals('Oh, how well they know me! 🥃', $data->data->name); + $this->assertEquals('Photo caption', $data->data->name); $this->assertObjectNotHasAttribute('content', $data->data); - $this->assertEquals('https://cleverdevil.io/file/5bf2fa91c3d4c592f9978200923cb56e/thumb.jpg', $data->data->photo[0]); + $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->photo[0]); + } + + public function testPhotoWithDupeNameAndAlt2() { + // https://github.com/aaronpk/XRay/issues/57 + // This is simliar to adactio's markup + // https://adactio.com/notes/13301 + $url = 'http://sanitize.example/photo-with-dupe-name-alt-2'; + $response = $this->parse(['url' => $url]); + + $body = $response->getContent(); + $this->assertEquals(200, $response->getStatusCode()); + $data = json_decode($body); + + $this->assertObjectHasAttribute('content', $data->data); + $this->assertEquals('Photo caption', $data->data->content->text); + $this->assertObjectNotHasAttribute('name', $data->data); + $this->assertEquals('http://sanitize.example/photo.jpg', $data->data->photo[0]); } public function testPhotosWithAlt() { diff --git a/tests/data/sanitize.example/photo-with-dupe-name-alt-2 b/tests/data/sanitize.example/photo-with-dupe-name-alt-2 new file mode 100644 index 0000000..c5ac1a9 --- /dev/null +++ b/tests/data/sanitize.example/photo-with-dupe-name-alt-2 @@ -0,0 +1,14 @@ +HTTP/1.1 200 OK +Server: Apache +Date: Wed, 09 Dec 2015 03:29:14 GMT +Content-Type: text/html; charset=utf-8 +Connection: keep-alive + + + + Test + + +

Photo caption Photo caption

+ + From 2fd563db0c0f3b40985176537995c4fc34ab08c5 Mon Sep 17 00:00:00 2001 From: Aaron Parecki Date: Fri, 12 Jan 2018 12:48:07 -0800 Subject: [PATCH 8/8] put the comment in the right spot --- tests/SanitizeTest.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/SanitizeTest.php b/tests/SanitizeTest.php index d8e31a1..9d215f3 100644 --- a/tests/SanitizeTest.php +++ b/tests/SanitizeTest.php @@ -230,6 +230,7 @@ public function testPhotoInContentWithNoText() { } public function testPhotoWithDupeNameAndAlt1() { + // https://github.com/aaronpk/XRay/issues/57 $url = 'http://sanitize.example/photo-with-dupe-name-alt'; $response = $this->parse(['url' => $url]); @@ -244,7 +245,6 @@ public function testPhotoWithDupeNameAndAlt1() { } public function testPhotoWithDupeNameAndAlt2() { - // https://github.com/aaronpk/XRay/issues/57 // This is simliar to adactio's markup // https://adactio.com/notes/13301 $url = 'http://sanitize.example/photo-with-dupe-name-alt-2';