From 027360a020aee55b8c037976971c3089aee63251 Mon Sep 17 00:00:00 2001 From: Saiya Date: Thu, 26 Nov 2015 15:47:21 +0800 Subject: [PATCH] fixed a issue when handing CJK characters --- lib/truncate.coffee | 4 +++- lib/truncate.js | 7 +++++-- package.json | 2 +- readme.md | 44 ++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 51 insertions(+), 6 deletions(-) diff --git a/lib/truncate.coffee b/lib/truncate.coffee index 2223c6e..70f36d6 100644 --- a/lib/truncate.coffee +++ b/lib/truncate.coffee @@ -45,7 +45,7 @@ truncate = (html, length, options)-> #

Lorem ipsum dolor sit amet, consectetur

# tempor incididunt ut labore # - $ = cheerio.load "
#{html}
" + $ = cheerio.load "
#{html}
", decodeEntities: options.decodeEntities $html = $('div').first() # remove excludes elements @@ -100,6 +100,8 @@ truncate.defaultOptions = stripTags: false # postfix of the string ellipsis: '...' + # decode html entities + decodeEntities: false # excludes: img # length: 0 diff --git a/lib/truncate.js b/lib/truncate.js index 3878b36..1c4927b 100644 --- a/lib/truncate.js +++ b/lib/truncate.js @@ -54,7 +54,9 @@ truncate = function(html, length, options) { if (typeof html === 'object') { html = $(html).html(); } - $ = cheerio.load("
" + html + "
"); + $ = cheerio.load("
" + html + "
", { + decodeEntities: options.decodeEntities + }); $html = $('div').first(); if (options.excludes) { if (!Array.isArray(options.excludes)) { @@ -106,7 +108,8 @@ truncate = function(html, length, options) { truncate.defaultOptions = { stripTags: false, - ellipsis: '...' + ellipsis: '...', + decodeEntities: false }; module.exports = truncate; diff --git a/package.json b/package.json index e3017d6..c91b64a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "truncate-html", - "version": "0.0.5", + "version": "0.0.6", "description": "truncate html and keep tags in safe", "main": "lib/truncate.js", "scripts": { diff --git a/readme.md b/readme.md index dae8d4f..2c68b34 100644 --- a/readme.md +++ b/readme.md @@ -15,6 +15,7 @@ truncate(html, [length], [options]) stripTags: Boolean, whether to remove tags ellipsis: String, custom ellipsis sign, set it to empty string to remove the ellipsis postfix excludes: String or Array, the selectors of the elements you want to ignore + decodeEntities: Boolean, auto decode html entities in the html string } ``` @@ -22,7 +23,8 @@ truncate(html, [length], [options]) ```js truncate.defaultOptions = { stripTags: false, - ellipsis: '...' + ellipsis: '...', + decodeEntities: false }; ``` @@ -35,7 +37,6 @@ npm install truncate-html **Notice** Extra blank spaces in html content will be removed. If the html string content's length is shorter than `options.length`, then no ellipsis will be appended to the final html string. If longer, then the final html content's length will be `options.length` + `options.ellipsis`. - ```js var truncate = require('truncate-html'); @@ -91,6 +92,45 @@ truncate(html, { }); // returns: This is a string for~ + +// handling encoded characters +var html = '

 test for <p> encoded string

' +truncate(html, { + length: 20, + decodeEntities: true +}); +// returns:

test for <p> encode...

+ +// when set decodeEntities false +var html = '

 test for <p> encoded string

' +truncate(html, { + length: 20, + decodeEntities: false // this is the default value +}); +// returns:

 test for <p...

+ + +// and there may be a surprise by setting `decodeEntities` to true when handling CJK characters +var html = '

 test for <p> 中文 string

' +truncate(html, { + length: 20, + decodeEntities: true +}); +// returns:

test for <p> 中文 str...

+// to fix this, see below for instructions + ``` +### Known issues +Known issues about handling CJK characters when the option `decodeEntities` is set to `true`. + +You have seen the option `decodeEntities`, it's really magic! When it's true, encoded html entities will be decoded automatically, so `&amp;` will be treated as a single character. This is probably what we want. But, if there are CJK characters in the html string, they will be replaced by characters like `ö` in the final html you get. That's confusing. + +To fix this, you have two choices: + +- keep the option `decodeEntities` false, but `&amp;` will be treated as five characters. +- modify cheerio's source code: find the function `getInverse` in the file `./node_modules/cheerio/node_modules/entities/lib/decode.js`, comment out the last line `.replace(re_nonASCII, singleCharReplacer);`. + + +