@@ -115,6 +115,78 @@ def _validate_data_input(
115
115
raise GMTInvalidInput ("data must provide x, y, and z columns." )
116
116
117
117
118
def _check_encoding(
    argstr: str,
) -> Literal[
    "ascii",
    "ISOLatin1+",
    "ISO-8859-1",
    "ISO-8859-2",
    "ISO-8859-3",
    "ISO-8859-4",
    "ISO-8859-5",
    "ISO-8859-6",
    "ISO-8859-7",
    "ISO-8859-8",
    "ISO-8859-9",
    "ISO-8859-10",
    "ISO-8859-11",
    "ISO-8859-13",
    "ISO-8859-14",
    "ISO-8859-15",
    "ISO-8859-16",
]:
    """
    Check the charset encoding of a string.

    All characters in the string must be in the same charset encoding, otherwise the
    default ``ISOLatin1+`` encoding is returned. Characters in the Adobe Symbol and
    ZapfDingbats encodings are always accepted because they are independent of the
    choice of encoding.

    Parameters
    ----------
    argstr
        The string to be checked.

    Returns
    -------
    encoding
        The encoding of the string. ``"ascii"`` is returned when every character has
        a code point between 32 and 126 (this includes the empty string).

    Examples
    --------
    >>> _check_encoding("123ABC+-?!")  # ASCII characters only
    'ascii'
    >>> _check_encoding("12AB±β①②")  # Characters in ISOLatin1+
    'ISOLatin1+'
    >>> _check_encoding("12ABāáâãäåβ①②")  # Characters in ISO-8859-4
    'ISO-8859-4'
    >>> _check_encoding("12ABŒā")  # Mix characters in ISOLatin1+ (Œ) and ISO-8859-4 (ā)
    'ISOLatin1+'
    >>> _check_encoding("123AB中文")  # Characters not in any charset encoding
    'ISOLatin1+'
    """
    # Return "ascii" if the string only contains printable ASCII characters.
    if all(32 <= ord(c) <= 126 for c in argstr):
        return "ascii"

    # Characters from the Symbol and ZapfDingbats charsets are valid under every
    # encoding, so drop them once up front. What is left must be covered entirely by
    # a single encoding's charset.
    adobe_chars = set(charset["Symbol"].values()) | set(
        charset["ZapfDingbats"].values()
    )
    remaining = set(argstr) - adobe_chars

    # ISOLatin1+ is checked first because it is the default and most common encoding.
    # ISO-8859-12 was abandoned, so it is left out of the candidates.
    candidates = ["ISOLatin1+"] + [
        f"ISO-8859-{i}" for i in range(1, 17) if i != 12
    ]
    for encoding in candidates:
        if remaining.issubset(charset[encoding].values()):
            return encoding  # type: ignore[return-value]

    # Return the "ISOLatin1+" encoding if the string contains characters from multiple
    # charset encodings or contains characters that are not in any charset encoding.
    return "ISOLatin1+"
188
+
189
+
118
190
def data_kind (
119
191
data : Any = None , required : bool = True
120
192
) -> Literal ["arg" , "file" , "geojson" , "grid" , "image" , "matrix" , "vectors" ]:
@@ -192,17 +264,41 @@ def data_kind(
192
264
return kind
193
265
194
266
195
- def non_ascii_to_octal (argstr : str ) -> str :
267
+ def non_ascii_to_octal (
268
+ argstr : str ,
269
+ encoding : Literal [
270
+ "ascii" ,
271
+ "ISOLatin1+" ,
272
+ "ISO-8859-1" ,
273
+ "ISO-8859-2" ,
274
+ "ISO-8859-3" ,
275
+ "ISO-8859-4" ,
276
+ "ISO-8859-5" ,
277
+ "ISO-8859-6" ,
278
+ "ISO-8859-7" ,
279
+ "ISO-8859-8" ,
280
+ "ISO-8859-9" ,
281
+ "ISO-8859-10" ,
282
+ "ISO-8859-11" ,
283
+ "ISO-8859-13" ,
284
+ "ISO-8859-14" ,
285
+ "ISO-8859-15" ,
286
+ "ISO-8859-16" ,
287
+ ] = "ISOLatin1+" ,
288
+ ) -> str :
196
289
r"""
197
290
Translate non-ASCII characters to their corresponding octal codes.
198
291
199
- Currently, only characters in the ISOLatin1+ charset and Symbol/ZapfDingbats fonts
200
- are supported.
292
+ Currently, only non-ASCII characters in the Adobe ISOLatin1+, Adobe Symbol, Adobe
293
+ ZapfDingbats, and ISO-8859-x (x can be in 1-11, 13-16) encodings are supported.
294
+ The Adobe Standard encoding is not supported yet.
201
295
202
296
Parameters
203
297
----------
204
298
argstr
205
299
The string to be translated.
300
+ encoding
301
+ The encoding of characters in the string.
206
302
207
303
Returns
208
304
-------
@@ -219,9 +315,11 @@ def non_ascii_to_octal(argstr: str) -> str:
219
315
'@%34%\\041@%%@%34%\\176@%%@%34%\\241@%%@%34%\\376@%%'
220
316
>>> non_ascii_to_octal("ABC ±120° DEF α ♥")
221
317
'ABC \\261120\\260 DEF @~\\141@~ @%34%\\252@%%'
318
+ >>> non_ascii_to_octal("12ABāáâãäåβ①②", encoding="ISO-8859-4")
319
+ '12AB\\340\\341\\342\\343\\344\\345@~\\142@~@%34%\\254@%%@%34%\\255@%%'
222
320
""" # noqa: RUF002
223
- # Return the string if it only contains printable ASCII characters from 32 to 126 .
224
- if all (32 <= ord (c ) <= 126 for c in argstr ):
321
+ # Return the input string if it only contains ASCII characters.
322
+ if encoding == "ascii" or all (32 <= ord (c ) <= 126 for c in argstr ):
225
323
return argstr
226
324
227
325
# Dictionary mapping non-ASCII characters to octal codes
@@ -232,15 +330,15 @@ def non_ascii_to_octal(argstr: str) -> str:
232
330
mapping .update (
233
331
{c : f"@%34%\\ { i :03o} @%%" for i , c in charset ["ZapfDingbats" ].items ()}
234
332
)
235
- # Adobe ISOLatin1+ charset. Put at the end .
236
- mapping .update ({c : f"\\ { i :03o} " for i , c in charset ["ISOLatin1+" ].items ()})
333
+ # ISOLatin1+ or ISO-8859-x charset .
334
+ mapping .update ({c : f"\\ { i :03o} " for i , c in charset [encoding ].items ()})
237
335
238
336
# Remove any printable characters
239
337
mapping = {k : v for k , v in mapping .items () if k not in string .printable }
240
338
return argstr .translate (str .maketrans (mapping ))
241
339
242
340
243
- def build_arg_list (
341
+ def build_arg_list ( # noqa: PLR0912
244
342
kwdict : dict [str , Any ],
245
343
confdict : dict [str , str ] | None = None ,
246
344
infile : str | pathlib .PurePath | Sequence [str | pathlib .PurePath ] | None = None ,
@@ -310,6 +408,10 @@ def build_arg_list(
310
408
... )
311
409
... )
312
410
['f1.txt', 'f2.txt', '-A0', '-B', '--FORMAT_DATE_MAP=o dd', '->out.txt']
411
+ >>> build_arg_list(dict(B="12ABāβ①②"))
412
+ ['-B12AB\\340@~\\142@~@%34%\\254@%%@%34%\\255@%%', '--PS_CHAR_ENCODING=ISO-8859-4']
413
+ >>> build_arg_list(dict(B="12ABāβ①②"), confdict=dict(PS_CHAR_ENCODING="ISO-8859-5"))
414
+ ['-B12AB\\340@~\\142@~@%34%\\254@%%@%34%\\255@%%', '--PS_CHAR_ENCODING=ISO-8859-5']
313
415
>>> print(build_arg_list(dict(R="1/2/3/4", J="X4i", watre=True)))
314
416
Traceback (most recent call last):
315
417
...
@@ -324,11 +426,22 @@ def build_arg_list(
324
426
elif value is True :
325
427
gmt_args .append (f"-{ key } " )
326
428
elif is_nonstr_iter (value ):
327
- gmt_args .extend (non_ascii_to_octal ( f"-{ key } { _value } " ) for _value in value )
429
+ gmt_args .extend (f"-{ key } { _value } " for _value in value )
328
430
else :
329
- gmt_args .append (non_ascii_to_octal (f"-{ key } { value } " ))
431
+ gmt_args .append (f"-{ key } { value } " )
432
+
433
+ # Convert non-ASCII characters (if any) in the arguments to octal codes
434
+ encoding = _check_encoding ("" .join (gmt_args ))
435
+ if encoding != "ascii" :
436
+ gmt_args = [non_ascii_to_octal (arg , encoding = encoding ) for arg in gmt_args ]
330
437
gmt_args = sorted (gmt_args )
331
438
439
+ # Set --PS_CHAR_ENCODING=encoding if necessary
440
+ if encoding not in {"ascii" , "ISOLatin1+" } and not (
441
+ confdict and "PS_CHAR_ENCODING" in confdict
442
+ ):
443
+ gmt_args .append (f"--PS_CHAR_ENCODING={ encoding } " )
444
+
332
445
if confdict :
333
446
gmt_args .extend (f"--{ key } ={ value } " for key , value in confdict .items ())
334
447
0 commit comments