Skip to content

Commit

Permalink
Merge branch 'remove-img-from-photo-posts'
Browse files Browse the repository at this point in the history
  • Loading branch information
aaronpk committed Jan 14, 2018
2 parents 2515f61 + 2fd563d commit 71bf274
Show file tree
Hide file tree
Showing 14 changed files with 464 additions and 26 deletions.
25 changes: 18 additions & 7 deletions lib/XRay/Formats/Format.php
Original file line number Diff line number Diff line change
Expand Up @@ -34,18 +34,15 @@ protected static function _loadHTML($html) {
return [$doc, $xpath];
}

protected static function sanitizeHTML($html) {
$config = HTMLPurifier_Config::createDefault();
$config->set('Cache.DefinitionImpl', null);
$config->set('HTML.AllowedElements', [
protected static function sanitizeHTML($html, $allowImg=true) {
$allowed = [
'a',
'abbr',
'b',
'code',
'del',
'em',
'i',
'img',
'q',
'strike',
'strong',
Expand All @@ -62,7 +59,13 @@ protected static function sanitizeHTML($html) {
'ul',
'li',
'ol'
]);
];
if($allowImg)
$allowed[] = 'img';

$config = HTMLPurifier_Config::createDefault();
$config->set('Cache.DefinitionImpl', null);
$config->set('HTML.AllowedElements', $allowed);
$def = $config->getHTMLDefinition(true);
$def->addElement(
'time',
Expand All @@ -81,8 +84,16 @@ protected static function sanitizeHTML($html) {
return trim($sanitized);
}

// Return a plaintext version of the input HTML.
//
// The markup is run through HTMLPurifier configured with <br> as the only
// allowed element. Line breaks are then converted to "\n", the remaining
// entities are decoded, and the result is trimmed.
protected static function stripHTML($html) {
  $config = HTMLPurifier_Config::createDefault();
  $config->set('Cache.DefinitionImpl', null);
  $config->set('HTML.AllowedElements', ['br']);
  $purifier = new HTMLPurifier($config);
  $sanitized = $purifier->purify($html);

  // Turn the carriage-return character reference back into a real "\r",
  // matching how "&#xD;" is handled for content values elsewhere in the parser.
  $sanitized = str_replace("&#xD;", "\r", $sanitized);

  // Convert line-break tags BEFORE decoding entities, so that text which merely
  // spells out "&lt;br&gt;" stays literal text instead of becoming a newline.
  // The pattern accepts both "<br>" and the XHTML-style "<br />" serialization.
  $sanitized = preg_replace('#<br\s*/?>#i', "\n", $sanitized);

  $sanitized = html_entity_decode($sanitized);
  return trim($sanitized);
}


Expand Down
83 changes: 65 additions & 18 deletions lib/XRay/Formats/Mf2.php
Original file line number Diff line number Diff line change
Expand Up @@ -220,21 +220,40 @@ private static function parseHTMLValue($property, $item) {
$textContent = $content;
} elseif(!is_string($content) && is_array($content) && array_key_exists('value', $content)) {
if(array_key_exists('html', $content)) {
$htmlContent = trim(self::sanitizeHTML($content['html']));
#$textContent = trim(str_replace("&#xD;","\r",strip_tags($htmlContent)));
$textContent = trim(str_replace("&#xD;","\r",$content['value']));
// Only allow images in the content if there is no photo property set
if(isset($item['properties']['photo']))
$allowImg = false;
else
$allowImg = true;

$htmlContent = trim(self::sanitizeHTML($content['html'], $allowImg));
#$textContent = trim(str_replace("&#xD;","\r",$content['value']));
$textContent = trim(self::stripHTML($htmlContent));
} else {
$textContent = trim($content['value']);
}
}

$data = [
'text' => $textContent
];
if($htmlContent && $textContent != $htmlContent) {
$data['html'] = $htmlContent;
if($textContent || $htmlContent) {
$data = [
'text' => $textContent
];
// Only add HTML content if there is actual content.
// If the text content ends up empty, then the HTML should be too
// e.g. <div class="e-content"><a href=""><img src="" class="u-photo"></a></div>
// should not return content of <a href=""></a>
// TODO: still need to remove empty <a> tags when there is other text in the content
if($htmlContent && $textContent && $textContent != $htmlContent) {
$data['html'] = $htmlContent;
}

if(!$data['text'])
return null;

return $data;
} else {
return null;
}
return $data;
}

// Always return arrays, and may contain plaintext content
Expand Down Expand Up @@ -321,12 +340,16 @@ private static function determineNameAndContent($item, &$data) {
$textContent = null;
$htmlContent = null;

$content = self::parseHTMLValue('content', $item);
if($content) {
$content = self::getHTMLValue($item, 'content');

if(is_string($content)) {
$textContent = $content;
} elseif($content) {
$htmlContent = array_key_exists('html', $content) ? $content['html'] : null;
$textContent = array_key_exists('text', $content) ? $content['text'] : null;
$textContent = array_key_exists('value', $content) ? $content['value'] : null;
}

$checkedname = $name;
if($content) {
// Trim ellipses from the name
$name = preg_replace('/ ?(\.\.\.|…)$/', '', $name);
Expand All @@ -337,19 +360,29 @@ private static function determineNameAndContent($item, &$data) {

// Check if the name is a prefix of the content
if($contentCompare && $nameCompare && strpos($contentCompare, $nameCompare) === 0) {
$name = null;
$checkedname = null;
}
}

if($name) {
$data['name'] = $name;
if($checkedname) {
$data['name'] = $checkedname;
}

// If there is content, always return the plaintext content, and return HTML content if it's different
if($content) {
$data['content']['text'] = $content['text'];
if(array_key_exists('html', $content))
$data['content']['html'] = $content['html'];
$content = self::parseHTMLValue('content', $item);
if($content['text']) {
$data['content']['text'] = $content['text'];
if(isset($content['html']))
$data['content']['html'] = $content['html'];
} else {
// If the content text was blank because the img was removed and that was the only content,
// then put the name back as the name if it was previously set.
// See https://github.com/aaronpk/XRay/issues/57
if($name) {
$data['name'] = $name;
}
}
}
}

Expand Down Expand Up @@ -744,6 +777,20 @@ private static function getPlaintext($mf2, $k, $fallback=null) {
return $fallback;
}

private static function getHTMLValue($mf2, $k, $fallback=null) {
  // Looks at the first value of property $k and returns it unchanged when it is
  // either a plain string or an array carrying an 'html' key.
  // In every other case (property missing/empty, or a value of another shape)
  // $fallback is returned.
  if(empty($mf2['properties'][$k]) || !is_array($mf2['properties'][$k]))
    return $fallback;

  // $mf2['properties'][$k] is expected to be a list since the input came from the mf2 parser
  $first = $mf2['properties'][$k][0];
  if(is_string($first) || isset($first['html']))
    return $first;

  return $fallback;
}

private static function getPlaintextValues($mf2, $k, $values=[]) {
if(!empty($mf2['properties'][$k]) and is_array($mf2['properties'][$k])) {
foreach($mf2['properties'][$k] as $value) {
Expand Down
3 changes: 2 additions & 1 deletion lib/helpers.php
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ function normalize_url($url) {
$parts = parse_url($url);
if(empty($parts['path']))
$parts['path'] = '/';
$parts['host'] = strtolower($parts['host']);
if(isset($parts['host']))
$parts['host'] = strtolower($parts['host']);
return build_url($parts);
}

Expand Down
145 changes: 145 additions & 0 deletions tests/SanitizeTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -151,4 +151,149 @@ public function testSanitizeEmailAuthorURL() {
$this->assertEquals('http://sanitize.example/photo.jpg', $data->data->author->photo);
}

public function testPhotoInContentNoAlt() {
  // Regression test for https://github.com/aaronpk/XRay/issues/52
  // Expects the photo URL under `photo`, no `name`, and content (text and HTML)
  // that carries no image markup.
  $response = $this->parse(['url' => 'http://sanitize.example/photo-in-content']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($json);
  $entry = $parsed->data;

  $this->assertObjectNotHasAttribute('name', $entry);
  $this->assertEquals('http://target.example.com/photo.jpg', $entry->photo[0]);
  $this->assertEquals('This is a photo post with an img tag inside the content.', $entry->content->text);
  $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $entry->content->html);
}

public function testPhotoInTextContentNoAlt() {
  // Regression test for https://github.com/aaronpk/XRay/issues/56
  // Same expectations as the photo-in-content case, against the
  // photo-in-text-content fixture.
  $response = $this->parse(['url' => 'http://sanitize.example/photo-in-text-content']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($json);
  $entry = $parsed->data;

  $this->assertObjectNotHasAttribute('name', $entry);
  $this->assertEquals('http://target.example.com/photo.jpg', $entry->photo[0]);
  $this->assertEquals('This is a photo post with an img tag inside the content.', $entry->content->text);
  $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $entry->content->html);
}

public function testPhotoInContentEmptyAltAttribute() {
  // Regression test for https://github.com/aaronpk/XRay/issues/52
  // Uses the photo-in-content-empty-alt fixture; the photo must be reported and
  // neither `name` nor any image markup may end up in the output.
  $response = $this->parse(['url' => 'http://sanitize.example/photo-in-content-empty-alt']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($json);
  $entry = $parsed->data;

  $this->assertObjectNotHasAttribute('name', $entry);
  $this->assertEquals('http://target.example.com/photo.jpg', $entry->photo[0]);
  $this->assertEquals('This is a photo post with an img tag inside the content.', $entry->content->text);
  $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $entry->content->html);
}

public function testPhotoInContentWithAlt() {
  // Regression test for https://github.com/aaronpk/XRay/issues/52
  // Uses the photo-in-content-with-alt fixture; the expected text and HTML
  // contain only the sentence itself, with no image markup.
  $response = $this->parse(['url' => 'http://sanitize.example/photo-in-content-with-alt']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($json);
  $entry = $parsed->data;

  $this->assertObjectNotHasAttribute('name', $entry);
  $this->assertEquals('http://target.example.com/photo.jpg', $entry->photo[0]);
  $this->assertEquals('This is a photo post with an img tag inside the content.', $entry->content->text);
  $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content.', $entry->content->html);
}

public function testPhotoInContentWithNoText() {
  // The cleverdevil fixture has an e-content that holds nothing but a linked
  // u-photo image, so the result should keep the name, report the photo,
  // and have no content at all.
  $response = $this->parse(['url' => 'http://sanitize.example/cleverdevil']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($json);
  $entry = $parsed->data;

  $this->assertObjectHasAttribute('name', $entry);
  $this->assertEquals('Oh, how well they know me! 🥃', $entry->name);
  $this->assertObjectNotHasAttribute('content', $entry);
  $this->assertEquals('https://cleverdevil.io/file/5bf2fa91c3d4c592f9978200923cb56e/thumb.jpg', $entry->photo[0]);
}

public function testPhotoWithDupeNameAndAlt1() {
  // Regression test for https://github.com/aaronpk/XRay/issues/57
  // The caption must survive as `name`, with no `content` in the result.
  $response = $this->parse(['url' => 'http://sanitize.example/photo-with-dupe-name-alt']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($json);
  $entry = $parsed->data;

  $this->assertObjectHasAttribute('name', $entry);
  $this->assertEquals('Photo caption', $entry->name);
  $this->assertObjectNotHasAttribute('content', $entry);
  $this->assertEquals('http://sanitize.example/photo.jpg', $entry->photo[0]);
}

public function testPhotoWithDupeNameAndAlt2() {
  // This is similar to adactio's markup
  // https://adactio.com/notes/13301
  // Here the caption is expected as the content text, and `name` must be absent.
  $response = $this->parse(['url' => 'http://sanitize.example/photo-with-dupe-name-alt-2']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($json);
  $entry = $parsed->data;

  $this->assertObjectHasAttribute('content', $entry);
  $this->assertEquals('Photo caption', $entry->content->text);
  $this->assertObjectNotHasAttribute('name', $entry);
  $this->assertEquals('http://sanitize.example/photo.jpg', $entry->photo[0]);
}

public function testPhotosWithAlt() {
  // See https://github.com/microformats/microformats2-parsing/issues/16
  // Two photos are expected, in order, alongside the full plaintext content
  // and no `name`.
  $response = $this->parse(['url' => 'http://sanitize.example/photos-with-alt']);

  $json = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $parsed = json_decode($json);
  $entry = $parsed->data;

  $this->assertEquals('🌆 Made it to the first #NPSF #earlygang of the year, did in-betweeners abs, and 6:30 workout with a brutal burnout that was really its own workout. But wow pretty sunrise. Plus 50+ deg F? I’ll take it. #100PDPD#justshowup #darknesstodawn #wakeupthesun #fromwhereirun #NovemberProject #sunrise #latergram #nofilter', $entry->content->text);
  $this->assertObjectNotHasAttribute('name', $entry);
  $this->assertEquals('https://igx.4sqi.net/img/general/original/476_g7yruXflacsGr7PyVmECefyTBMB_R99zmPQxW7pftzA.jpg', $entry->photo[0]);
  $this->assertEquals('https://igx.4sqi.net/img/general/original/476_zM3UgU9JHNhom907Ac_1WCEcUhGOJZaNWGlRmev86YA.jpg', $entry->photo[1]);
}

public function testEntryWithImgNoImpliedPhoto() {
  // See https://github.com/microformats/microformats2-parsing/issues/6#issuecomment-357286985
  // and https://github.com/aaronpk/XRay/issues/52#issuecomment-357269683
  // and https://github.com/microformats/microformats2-parsing/issues/16
  //
  // The image in this entry's content must not be turned into a `photo`
  // property, so it has to stay in the sanitized HTML content.
  $url = 'http://sanitize.example/entry-with-img-no-implied-photo';
  $response = $this->parse(['url' => $url]);

  $body = $response->getContent();
  $this->assertEquals(200, $response->getStatusCode());
  $data = json_decode($body);

  // No photo property is expected at all, so nothing may read $data->data->photo
  // after this point (a photo[0] assertion here would contradict this check).
  $this->assertObjectNotHasAttribute('photo', $data->data);
  $this->assertObjectNotHasAttribute('name', $data->data);
  $this->assertEquals('This is a photo post with an img tag inside the content, which does not have a u-photo class so should not be removed.', $data->data->content->text);
  $this->assertEquals('This is a photo post with an <code>img</code> tag inside the content, which does not have a u-photo class so should not be removed. <img src="http://target.example.com/photo.jpg" alt="a photo">', $data->data->content->html);
}

}
47 changes: 47 additions & 0 deletions tests/data/sanitize.example/cleverdevil
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
HTTP/1.1 200 OK
Server: Apache
Date: Wed, 09 Dec 2015 03:29:14 GMT
Content-Type: text/html; charset=utf-8
Connection: keep-alive

<html>
<head>
<title>Test</title>
</head>
<body>
<div
class="col-md-8 col-md-offset-2 h-entry idno-photos idno-object idno-content">
<div>
<div class="p-author author h-card vcard">
<a href="https://cleverdevil.io/profile/cleverdevil" class="u-url icon-container"><img class="u-photo"
src="https://cleverdevil.io/file/2fa19f964fb8970faaf20b909c69d6cb/thumb.png"/></a>
<a class="p-name fn u-url url"
href="https://cleverdevil.io/profile/cleverdevil">Jonathan LaCour</a>
<a class="u-url" href="https://cleverdevil.io/profile/cleverdevil">
<!-- This is here to force the hand of your MF2 parser --></a>
</div>
<div class="break">&nbsp;</div>
</div>
<div class="datestamp">
<p>
<a class="u-url url" href="https://cleverdevil.io/2018/oh-how-well-they-know-me" rel="permalink">
<time class="dt-published"
datetime="2018-01-11T23:03:32+00:00">January 11, 2018</time>
</a>
</p>
</div>
<div class="idno-body">
<h2 class="photo-title p-name"><a
href="https://cleverdevil.io/2018/oh-how-well-they-know-me">Oh, how well they know me! 🥃</a>
</h2>

<div class="e-content entry-content">

<div class="photo-view">
<a href="https://cleverdevil.io/file/8ff3c164f54ff736bb5fb64d50e1276c/C66353A6-9AAE-46F2-B9BC-1BA7F5574D54.jpeg"
data-original-img="https://cleverdevil.io/file/8ff3c164f54ff736bb5fb64d50e1276c/C66353A6-9AAE-46F2-B9BC-1BA7F5574D54.jpeg"
data-title="Oh, how well they know me! 🥃"
data-footer=""><img src="https://cleverdevil.io/file/5bf2fa91c3d4c592f9978200923cb56e/thumb.jpg" class="u-photo" alt="Oh, how well they know me! 🥃" /></a>
</div>
</body>
</html>
Loading

0 comments on commit 71bf274

Please sign in to comment.