-
Notifications
You must be signed in to change notification settings - Fork 474
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
mochijson:decode() doesn't handle surrogate pairs correctly #196
Comments
Quick and dirty patch: diff -uNr ChicagoBoss-orig/deps/mochiweb/src/mochijson.erl ChicagoBoss/deps/mochiweb/src/mochijson.erl
--- ChicagoBoss-orig/deps/mochiweb/src/mochijson.erl 2017-12-28 16:57:09.557338445 +0700
+++ ChicagoBoss/deps/mochiweb/src/mochijson.erl 2017-12-28 17:45:17.673802239 +0700
@@ -306,6 +306,21 @@
decode_array(Rest, S1#decoder{state=any}, Acc)
end.
+tokenize_string_surrogate_pair(SP1, [$\\, $u, C3, C2, C1, C0 | Rest], S, Acc) ->
+ C = dehex(C0) bor
+ (dehex(C1) bsl 4) bor
+ (dehex(C2) bsl 8) bor
+ (dehex(C3) bsl 12),
+ if
+ C >= 16#DC00 andalso C =< 16#DFFF ->
+ case catch unicode:characters_to_list(unicode:characters_to_binary(<<SP1:16,C:16>>,utf16,utf8)) of
+ [UnicodeChar] -> tokenize_string(Rest, ?ADV_COL(S, 6), [UnicodeChar | Acc]);
+ _ ->tokenize_string(Rest, ?ADV_COL(S, 6), [SP1, C | Acc])
+ end;
+ true ->
+ tokenize_string(Rest, ?ADV_COL(S, 6), [SP1, C | Acc])
+ end.
+
tokenize_string(IoList=[C | _], S=#decoder{input_encoding=utf8}, Acc)
when is_list(C); is_binary(C); C >= 16#7f ->
List = xmerl_ucs:from_utf8(iolist_to_binary(IoList)),
@@ -334,7 +349,13 @@
(dehex(C1) bsl 4) bor
(dehex(C2) bsl 8) bor
(dehex(C3) bsl 12),
- tokenize_string(Rest, ?ADV_COL(S, 6), [C | Acc]);
+ if
+ C >= 16#D800 andalso C =< 16#DBFF ->
+ %% Surrogate pair
+ tokenize_string_surrogate_pair(C, Rest, ?ADV_COL(S, 6), Acc);
+ true ->
+ tokenize_string(Rest, ?ADV_COL(S, 6), [C | Acc])
+ end;
tokenize_string([C | Rest], S, Acc) when C >= $\s; C < 16#10FFFF ->
tokenize_string(Rest, ?ADV_COL(S, 1), [C | Acc]). |
A pull request with a test would be the preferred method of contribution for this, if you have the time |
I'm not sure that this is actually a proper fix; I don't know Unicode that well. |
Have you tried using mochijson2? UTF8 binaries are generally better to work with than lists of code points. mochijson exists only for compatibility reasons. |
We are planning to migrate to mochijson2, but at the moment we depend heavily on Unicode strings (lists containing Unicode characters). |
mochijson:decode doesn't convert surrogate pairs in JSON into proper unicode characters.
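For example (character 💩, U+1F4A9, written in JSON as the surrogate pair `\ud83d\udca9`), `mochijson:decode("\"\\ud83d\\udca9\"")` returns the two surrogate halves as separate list elements: `[55357,56489]` (16#D83D, 16#DCA9).
But it should return instead the single combined code point: `[128169]` (16#1F4A9).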
The text was updated successfully, but these errors were encountered: