Skip to content

Commit fa53d81

Browse files
authored
Merge pull request #3801 from davidhewitt/encode-utf8
add `PyStringMethods::encode_utf8`
2 parents 45f2b0a + 662eecf commit fa53d81

File tree

2 files changed

+35
-5
lines changed

2 files changed

+35
-5
lines changed

newsfragments/3801.added.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Add `PyStringMethods::encode_utf8`.

src/types/string.rs

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -305,6 +305,9 @@ pub trait PyStringMethods<'py> {
305305
/// replaced with `U+FFFD REPLACEMENT CHARACTER`.
306306
fn to_string_lossy(&self) -> Cow<'_, str>;
307307

308+
/// Encodes this string as a Python `bytes` object, using UTF-8 encoding.
///
/// # Errors
///
/// Returns an error if the string cannot be encoded as UTF-8, for example
/// when it contains a lone surrogate code point such as `'\ud800'`.
309+
fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>;
310+
308311
/// Obtains the raw data backing the Python string.
309312
///
310313
/// If the Python string object was created through legacy APIs, its internal storage format
@@ -337,6 +340,14 @@ impl<'py> PyStringMethods<'py> for Bound<'py, PyString> {
337340
self.as_borrowed().to_string_lossy()
338341
}
339342

343+
fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> {
344+
unsafe {
345+
ffi::PyUnicode_AsUTF8String(self.as_ptr())
346+
.assume_owned_or_err(self.py())
347+
.downcast_into_unchecked::<PyBytes>()
348+
}
349+
}
350+
340351
#[cfg(not(Py_LIMITED_API))]
341352
unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
342353
self.as_borrowed().data()
@@ -371,11 +382,7 @@ impl<'a> Borrowed<'a, '_, PyString> {
371382

372383
#[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
373384
{
374-
let bytes = unsafe {
375-
ffi::PyUnicode_AsUTF8String(self.as_ptr())
376-
.assume_owned_or_err(self.py())?
377-
.downcast_into_unchecked::<PyBytes>()
378-
};
385+
let bytes = self.encode_utf8()?;
379386
Ok(Cow::Owned(
380387
unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(),
381388
))
@@ -535,6 +542,28 @@ mod tests {
535542
})
536543
}
537544

545+
/// Encoding a string containing multi-byte characters must yield exactly the
/// same bytes as Rust's own UTF-8 representation of that string.
#[test]
fn test_encode_utf8_unicode() {
    Python::with_gil(|py| {
        let text = "哈哈🐈";
        let encoded = PyString::new_bound(py, text).encode_utf8().unwrap();
        assert_eq!(text.as_bytes(), encoded.as_bytes());
    })
}
553+
554+
/// A Python string holding a lone surrogate cannot be represented in UTF-8,
/// so `encode_utf8` is expected to report an error rather than succeed.
#[test]
fn test_encode_utf8_surrogate() {
    Python::with_gil(|py| {
        // Evaluate the literal in Python: a lone surrogate cannot be built from a Rust `&str`.
        let obj: PyObject = py.eval(r"'\ud800'", None, None).unwrap().into();
        let bound = obj.bind(py);
        let string = bound.downcast::<PyString>().unwrap();
        let result = string.encode_utf8();
        assert!(result.is_err());
    })
}
566+
538567
#[test]
539568
fn test_to_string_lossy() {
540569
Python::with_gil(|py| {

0 commit comments

Comments
 (0)