mochijson:decode() doesn't handle surrogate pairs correctly #196

sigsergv · 2017-12-28T11:05:45Z

mochijson:decode doesn't combine surrogate pairs in JSON into proper Unicode characters; each half of the pair is returned as a separate code point.

For example (character 💩):

59> mochijson:decode("\"\\ud83d\\udca9\"").
[56489,55357]

But it should instead return:

59> mochijson:decode("\"\\ud83d\\udca9\"").
[128169]

The text was updated successfully, but these errors were encountered:

sigsergv · 2017-12-28T11:07:37Z

Quick and dirty patch:

diff -uNr ChicagoBoss-orig/deps/mochiweb/src/mochijson.erl ChicagoBoss/deps/mochiweb/src/mochijson.erl
--- ChicagoBoss-orig/deps/mochiweb/src/mochijson.erl    2017-12-28 16:57:09.557338445 +0700
+++ ChicagoBoss/deps/mochiweb/src/mochijson.erl 2017-12-28 17:45:17.673802239 +0700
@@ -306,6 +306,21 @@
             decode_array(Rest, S1#decoder{state=any}, Acc)
     end.
 
+tokenize_string_surrogate_pair(SP1, [$\\, $u, C3, C2, C1, C0 | Rest], S, Acc) ->
+    C = dehex(C0) bor
+        (dehex(C1) bsl 4) bor
+        (dehex(C2) bsl 8) bor 
+        (dehex(C3) bsl 12),
+    if
+        C >= 16#DC00 andalso C =< 16#DFFF ->
+            case catch unicode:characters_to_list(unicode:characters_to_binary(<<SP1:16,C:16>>,utf16,utf8)) of
+                [UnicodeChar] -> tokenize_string(Rest, ?ADV_COL(S, 6), [UnicodeChar | Acc]);
+                _ ->tokenize_string(Rest, ?ADV_COL(S, 6), [SP1, C | Acc])
+            end;
+        true ->
+            tokenize_string(Rest, ?ADV_COL(S, 6), [SP1, C | Acc])
+    end.
+
 tokenize_string(IoList=[C | _], S=#decoder{input_encoding=utf8}, Acc)
   when is_list(C); is_binary(C); C >= 16#7f ->
     List = xmerl_ucs:from_utf8(iolist_to_binary(IoList)),
@@ -334,7 +349,13 @@
         (dehex(C1) bsl 4) bor
         (dehex(C2) bsl 8) bor 
         (dehex(C3) bsl 12),
-    tokenize_string(Rest, ?ADV_COL(S, 6), [C | Acc]);
+    if
+        C >= 16#D800 andalso C =< 16#DBFF ->
+            %% Surrogate pair
+            tokenize_string_surrogate_pair(C, Rest, ?ADV_COL(S, 6), Acc);
+        true ->
+            tokenize_string(Rest, ?ADV_COL(S, 6), [C | Acc])
+    end;
 tokenize_string([C | Rest], S, Acc) when C >= $\s; C < 16#10FFFF ->
     tokenize_string(Rest, ?ADV_COL(S, 1), [C | Acc]).

etrepum · 2017-12-28T15:20:17Z

A pull request with a test would be the preferred method of contribution for this, if you have the time.

sigsergv · 2017-12-28T15:58:07Z

I'm not sure that is actually a proper fix. I don't know Unicode that well.

etrepum · 2017-12-28T21:51:41Z

Have you tried using mochijson2? UTF8 binaries are generally better to work with than lists of code points. mochijson exists only for compatibility reasons.

sigsergv · 2017-12-29T02:09:54Z

We are planning to migrate to mochijson2, but at the moment we depend heavily on Unicode strings (with Unicode characters).

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

mochijson:decode() doesn't handle surrogate pairs correctly #196

mochijson:decode() doesn't handle surrogate pairs correctly #196

sigsergv commented Dec 28, 2017

sigsergv commented Dec 28, 2017 •

edited

Loading

etrepum commented Dec 28, 2017

sigsergv commented Dec 28, 2017

etrepum commented Dec 28, 2017

sigsergv commented Dec 29, 2017

mochijson:decode() doesn't handle surrogate pairs correctly #196

mochijson:decode() doesn't handle surrogate pairs correctly #196

Comments

sigsergv commented Dec 28, 2017

sigsergv commented Dec 28, 2017 • edited Loading

etrepum commented Dec 28, 2017

sigsergv commented Dec 28, 2017

etrepum commented Dec 28, 2017

sigsergv commented Dec 29, 2017

sigsergv commented Dec 28, 2017 •

edited

Loading