Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 32 additions & 1 deletion guide/src/migration.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,37 @@ This should not require migration, nor is there expected to be breakage caused b

Nevertheless, this affects the order of initialization so seemed worth noting in this guide.

### Removed implementations of `From<str::Utf8Error>`, `From<string::FromUtf16Error>`, and `From<char::DecodeUtf16Error>` for `PyErr`

Previously the implementations of `From<string::FromUtf8Error>`, `From<ffi::IntoStringError>`, `From<str::Utf8Error>`, `From<string::FromUtf16Error>`, and `From<char::DecodeUtf16Error>` failed to construct the correct Python exception class, as reported in <https://github.com/PyO3/pyo3/issues/5651>.
The implementations for `string::FromUtf8Error` and `ffi::IntoStringError` were fixed in this release.

For `str::Utf8Error`, the Rust error does not contain the source bytes required to construct the Python exception.
Instead, `PyUnicodeDecodeError::new_err_from_utf8` can be used to convert the error to a `PyErr`.

Before:

```rust,ignore
fn bytes_to_str(bytes: &[u8]) -> PyResult<&str> {
Ok(str::from_utf8(bytes)?)
}
```

After:

```rust
# use pyo3::prelude::*;
use pyo3::exceptions::PyUnicodeDecodeError;

# #[expect(dead_code)]
fn bytes_to_str(bytes: &[u8]) -> PyResult<&str> {
str::from_utf8(bytes).map_err(|e| PyUnicodeDecodeError::new_err_from_utf8(bytes, e))
}
```

For `string::FromUtf16Error` and `char::DecodeUtf16Error` the Rust error types do not contain any of the information required to construct a `UnicodeDecodeError`.
To raise a Python `UnicodeDecodeError` a new error should be manually constructed by calling `PyUnicodeDecodeError::new_err(...)`.

## from 0.26.* to 0.27

### `FromPyObject` reworked for flexibility and efficiency
Expand Down Expand Up @@ -1132,7 +1163,7 @@ let obj: Py<PyList> = bound.unbind();

> [!WARNING]
> Dangling pointer trap 💣
>
>
> Because of the ownership changes, code which uses `.as_ptr()` to convert `&PyAny` and other GIL Refs to a `*mut pyo3_ffi::PyObject` should take care to avoid creating dangling pointers now that `Bound<PyAny>` carries ownership.
>
> For example, the following pattern with `Option<&PyAny>` can easily create a dangling pointer when migrating to the `Bound<PyAny>` smart pointer:
Expand Down
1 change: 1 addition & 0 deletions newsfragments/5668.added.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added new `PyUnicodeDecodeError::new_err_from_utf8` API to create a `PyErr` from a `str::Utf8Error`
1 change: 1 addition & 0 deletions newsfragments/5668.fixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fixed the implementations of `From<string::FromUtf8Error>` and `From<ffi::IntoStringError>` for `PyErr`.
1 change: 1 addition & 0 deletions newsfragments/5668.removed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Removed the failing implementations of `From<str::Utf8Error>`, `From<string::FromUtf16Error>`, and `From<char::DecodeUtf16Error>` for `PyErr`.
17 changes: 10 additions & 7 deletions src/conversions/std/cstring.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::exceptions::PyUnicodeDecodeError;
#[cfg(feature = "experimental-inspect")]
use crate::inspect::TypeHint;
#[cfg(feature = "experimental-inspect")]
Expand All @@ -6,7 +7,6 @@ use crate::types::PyString;
use crate::{Borrowed, Bound, FromPyObject, IntoPyObject, PyAny, PyErr, Python};
use std::borrow::Cow;
use std::ffi::{CStr, CString};
use std::str::Utf8Error;
#[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
use {
crate::{exceptions::PyValueError, ffi},
Expand All @@ -16,21 +16,24 @@ use {
impl<'py> IntoPyObject<'py> for &CStr {
type Target = PyString;
type Output = Bound<'py, Self::Target>;
type Error = Utf8Error;
type Error = PyErr;

#[cfg(feature = "experimental-inspect")]
const OUTPUT_TYPE: TypeHint = <&str>::OUTPUT_TYPE;

#[inline]
fn into_pyobject(self, py: Python<'py>) -> Result<Self::Output, Self::Error> {
self.to_str()?.into_pyobject(py).map_err(|err| match err {})
self.to_str()
.map_err(|e| PyUnicodeDecodeError::new_err_from_utf8(self.to_bytes(), e))?
.into_pyobject(py)
.map_err(|err| match err {})
}
}

impl<'py> IntoPyObject<'py> for CString {
type Target = PyString;
type Output = Bound<'py, Self::Target>;
type Error = Utf8Error;
type Error = PyErr;

#[cfg(feature = "experimental-inspect")]
const OUTPUT_TYPE: TypeHint = <&CStr>::OUTPUT_TYPE;
Expand All @@ -44,7 +47,7 @@ impl<'py> IntoPyObject<'py> for CString {
impl<'py> IntoPyObject<'py> for &CString {
type Target = PyString;
type Output = Bound<'py, Self::Target>;
type Error = Utf8Error;
type Error = PyErr;

#[cfg(feature = "experimental-inspect")]
const OUTPUT_TYPE: TypeHint = <&CStr>::OUTPUT_TYPE;
Expand All @@ -58,7 +61,7 @@ impl<'py> IntoPyObject<'py> for &CString {
impl<'py> IntoPyObject<'py> for Cow<'_, CStr> {
type Target = PyString;
type Output = Bound<'py, Self::Target>;
type Error = Utf8Error;
type Error = PyErr;

#[cfg(feature = "experimental-inspect")]
const OUTPUT_TYPE: TypeHint = <&CStr>::OUTPUT_TYPE;
Expand All @@ -71,7 +74,7 @@ impl<'py> IntoPyObject<'py> for Cow<'_, CStr> {
impl<'py> IntoPyObject<'py> for &Cow<'_, CStr> {
type Target = PyString;
type Output = Bound<'py, Self::Target>;
type Error = Utf8Error;
type Error = PyErr;

#[cfg(feature = "experimental-inspect")]
const OUTPUT_TYPE: TypeHint = <&CStr>::OUTPUT_TYPE;
Expand Down
8 changes: 7 additions & 1 deletion src/conversions/std/osstr.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
use crate::conversion::IntoPyObject;
#[cfg(target_os = "wasi")]
use crate::exceptions::PyUnicodeDecodeError;
use crate::ffi_ptr_ext::FfiPtrExt;
#[cfg(feature = "experimental-inspect")]
use crate::inspect::TypeHint;
Expand Down Expand Up @@ -103,7 +105,11 @@ impl FromPyObject<'_, '_> for OsString {
//
// For WASI: OS strings are UTF-8 by definition.
#[cfg(target_os = "wasi")]
let os_str: &OsStr = OsStr::new(std::str::from_utf8(fs_encoded_bytes.as_bytes())?);
let os_str: &OsStr = OsStr::new(
std::str::from_utf8(fs_encoded_bytes.as_bytes()).map_err(|e| {
PyUnicodeDecodeError::new_err_from_utf8(fs_encoded_bytes.as_bytes(), e)
})?,
);
#[cfg(not(target_os = "wasi"))]
let os_str: &OsStr =
std::os::unix::ffi::OsStrExt::from_bytes(fs_encoded_bytes.as_bytes());
Expand Down
145 changes: 132 additions & 13 deletions src/err/impls.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use crate::{err::PyErrArguments, exceptions, PyErr, Python};
use crate::{err::PyErrArguments, exceptions, types, PyErr, Python};
use crate::{IntoPyObject, Py, PyAny};
use std::io;

Expand Down Expand Up @@ -118,28 +118,82 @@ macro_rules! impl_to_pyerr {
};
}

pub(crate) struct Utf8ErrorWithBytes {
pub(crate) err: std::str::Utf8Error,
pub(crate) bytes: Vec<u8>,
}

impl PyErrArguments for Utf8ErrorWithBytes {
fn arguments(self, py: Python<'_>) -> Py<PyAny> {
let Self { err, bytes } = self;
let start = err.valid_up_to();
let end = err.error_len().map_or(bytes.len(), |l| start + l);

let encoding = types::PyString::new(py, "utf-8").into_any();
let bytes = types::PyBytes::new(py, &bytes).into_any();
let start = types::PyInt::new(py, start).into_any();
let end = types::PyInt::new(py, end).into_any();
let reason = types::PyString::new(py, "invalid utf-8").into_any();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it's hard to do better than this with the current API, we'd have to peek the invalid bytes and deduce the reason for failure?

(Probably not worth doing that, just checking.)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can check for incomplete byte sequences (i.e. unexpected end of input) versus invalid bytes by checking if err.error_len() is None. But the Rust error type does not differentiate between the different kinds of invalid bytes (invalid start byte, invalid continuation, encoding out of range, etc...)


// FIXME(icxolu) remove unwrap
types::PyTuple::new(py, &[encoding, bytes, start, end, reason])
.unwrap()
.into_any()
.unbind()
}
}

impl std::convert::From<Utf8ErrorWithBytes> for PyErr {
fn from(err: Utf8ErrorWithBytes) -> PyErr {
exceptions::PyUnicodeDecodeError::new_err(err)
}
}

impl PyErrArguments for std::string::FromUtf8Error {
fn arguments(self, py: Python<'_>) -> Py<PyAny> {
Utf8ErrorWithBytes {
err: self.utf8_error(),
bytes: self.into_bytes(),
}
.arguments(py)
}
}

impl std::convert::From<std::string::FromUtf8Error> for PyErr {
fn from(err: std::string::FromUtf8Error) -> PyErr {
exceptions::PyUnicodeDecodeError::new_err(err)
}
}

impl PyErrArguments for std::ffi::IntoStringError {
fn arguments(self, py: Python<'_>) -> Py<PyAny> {
Utf8ErrorWithBytes {
err: self.utf8_error(),
bytes: self.into_cstring().into_bytes(),
}
.arguments(py)
}
}

impl std::convert::From<std::ffi::IntoStringError> for PyErr {
fn from(err: std::ffi::IntoStringError) -> PyErr {
exceptions::PyUnicodeDecodeError::new_err(err)
}
}

impl_to_pyerr!(std::array::TryFromSliceError, exceptions::PyValueError);
impl_to_pyerr!(std::num::ParseIntError, exceptions::PyValueError);
impl_to_pyerr!(std::num::ParseFloatError, exceptions::PyValueError);
impl_to_pyerr!(std::num::TryFromIntError, exceptions::PyValueError);
impl_to_pyerr!(std::str::ParseBoolError, exceptions::PyValueError);
impl_to_pyerr!(std::ffi::IntoStringError, exceptions::PyUnicodeDecodeError);
impl_to_pyerr!(std::ffi::NulError, exceptions::PyValueError);
impl_to_pyerr!(std::str::Utf8Error, exceptions::PyUnicodeDecodeError);
impl_to_pyerr!(std::string::FromUtf8Error, exceptions::PyUnicodeDecodeError);
impl_to_pyerr!(
std::string::FromUtf16Error,
exceptions::PyUnicodeDecodeError
);
impl_to_pyerr!(
std::char::DecodeUtf16Error,
exceptions::PyUnicodeDecodeError
);
impl_to_pyerr!(std::net::AddrParseError, exceptions::PyValueError);

#[cfg(test)]
mod tests {
use crate::{PyErr, Python};
use super::*;

use crate::{types::PyAnyMethods, IntoPyObjectExt, PyErr, Python};
use std::io;

#[test]
Expand Down Expand Up @@ -179,4 +233,69 @@ mod tests {
check_err(io::ErrorKind::IsADirectory, "IsADirectoryError");
check_err(io::ErrorKind::NotADirectory, "NotADirectoryError");
}

#[test]
#[allow(invalid_from_utf8)]
fn utf8_errors() {
let bytes = b"abc\xffdef".to_vec();

let check_err = |py_err: PyErr| {
Python::attach(|py| {
let py_err = py_err.into_bound_py_any(py).unwrap();

assert!(py_err.is_instance_of::<exceptions::PyUnicodeDecodeError>());
assert_eq!(
py_err
.getattr("encoding")
.unwrap()
.extract::<String>()
.unwrap(),
"utf-8"
);
assert_eq!(
py_err
.getattr("object")
.unwrap()
.extract::<Vec<u8>>()
.unwrap(),
&*bytes
);
assert_eq!(
py_err.getattr("start").unwrap().extract::<usize>().unwrap(),
3
);
assert_eq!(
py_err.getattr("end").unwrap().extract::<usize>().unwrap(),
4
);
assert_eq!(
py_err
.getattr("reason")
.unwrap()
.extract::<String>()
.unwrap(),
"invalid utf-8"
);
});
};

let utf8_err_with_bytes = Utf8ErrorWithBytes {
err: std::str::from_utf8(&bytes).expect_err("\\xff is invalid utf-8"),
bytes: bytes.clone(),
}
.into();
check_err(utf8_err_with_bytes);

let from_utf8_err = String::from_utf8(bytes.clone())
.expect_err("\\xff is invalid utf-8")
.into();
check_err(from_utf8_err);

let from_utf8_err = std::ffi::CString::new(bytes.clone())
.unwrap()
.into_string()
.expect_err("\\xff is invalid utf-8")
.into();
check_err(from_utf8_err);
}
}
1 change: 1 addition & 0 deletions src/err/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ mod impls;
pub use cast_error::{CastError, CastIntoError};
#[allow(deprecated)]
pub use downcast_error::{DowncastError, DowncastIntoError};
pub(crate) use impls::Utf8ErrorWithBytes;

/// Represents a Python exception.
///
Expand Down
Loading
Loading