From ea89cf9d9e0676d3083273b2da983efd8f631a77 Mon Sep 17 00:00:00 2001 From: deedy5 <65482418+deedy5@users.noreply.github.com> Date: Sat, 15 Feb 2025 23:56:48 +0300 Subject: [PATCH] refactor(response): test1 --- Cargo.lock | 16 ++++-- Cargo.toml | 6 +- src/lib.rs | 36 +++++------- src/response.rs | 147 ++++++++++++++++++++++++++++++++---------------- 4 files changed, 128 insertions(+), 77 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d4b2514..0e33ce4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -331,9 +331,9 @@ dependencies = [ [[package]] name = "equivalent" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" [[package]] name = "flate2" @@ -1084,7 +1084,10 @@ dependencies = [ "encoding_rs", "foldhash", "html2text", + "http", + "http-body-util", "indexmap", + "mime", "pyo3", "pyo3-log", "pythonize", @@ -1093,6 +1096,7 @@ dependencies = [ "tokio", "tokio-util", "tracing", + "url", "webpki-root-certs", ] @@ -1271,9 +1275,9 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rquest" -version = "2.0.5" +version = "2.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebd5ee98103b5cd8852293da850188aa0128e19fc60053f63af846939f055950" +checksum = "696a06e7772945b5da6993324bd7503a7e6fc7b001edabe21956b08d8544b386" dependencies = [ "antidote", "async-compression", @@ -1426,9 +1430,9 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.2" +version = "1.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" [[package]] name = "socket2" diff --git a/Cargo.toml b/Cargo.toml index b16fe24..c46a1cd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,7 @@ pyo3 = { version = "0.23.4", features = ["extension-module", "abi3-py38", "index anyhow = "1.0.95" tracing = { version = "0.1.41", features = ["log-always"] } pyo3-log = "0.12.1" -rquest = { version = "2.0.5", features = [ +rquest = { version = "2.0.6", features = [ "json", "cookies", "socks", @@ -37,6 +37,10 @@ bytes = "1.10.0" pythonize = "0.23.0" serde_json = "1.0.138" webpki-root-certs = "0.26.8" +http-body-util = "0.1.2" +http = "1.2.0" +mime = "0.3.17" +url = "2.5.4" [profile.release] codegen-units = 1 diff --git a/src/lib.rs b/src/lib.rs index e89743e..b37ab15 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ #![allow(clippy::too_many_arguments)] +use std::any; use std::sync::{Arc, LazyLock, Mutex}; use std::time::Duration; @@ -439,35 +440,24 @@ impl RClient { } // Send the request and await the response - let resp = request_builder.send().await?; - - // Response items - let cookies: IndexMapSSR = resp - .cookies() - .map(|cookie| (cookie.name().to_string(), cookie.value().to_string())) - .collect(); - let headers: IndexMapSSR = resp.headers().to_indexmap(); - let status_code = resp.status().as_u16(); - let url = resp.url().to_string(); - let buf = resp.bytes().await?; - - tracing::info!("response: {} {} {}", url, status_code, buf.len()); - Ok((buf, cookies, headers, status_code, url)) + let resp: rquest::Response = request_builder.send().await?; + let url: String = resp.url().to_string(); + + Ok((resp, url)) }; // Execute an async future, releasing the Python GIL for concurrency. // Use Tokio global runtime to block on the future. - let result: Result<(Bytes, IndexMapSSR, IndexMapSSR, u16, String), Error> = + let response: Result<(rquest::Response, String)> = py.allow_threads(|| RUNTIME.block_on(future)); - let (f_buf, f_cookies, f_headers, f_status_code, f_url) = result?; - + let result = response?; + let resp = http::Response::from(result.0); + let url = result.1; Ok(Response { - content: PyBytes::new(py, &f_buf).unbind(), - cookies: f_cookies, - encoding: String::new(), - headers: f_headers, - status_code: f_status_code, - url: f_url, + resp, + _content: None, + _encoding: None, + url, }) } } diff --git a/src/response.rs b/src/response.rs index a574642..5929a7d 100644 --- a/src/response.rs +++ b/src/response.rs @@ -1,32 +1,31 @@ -use crate::utils::{get_encoding_from_content, get_encoding_from_headers}; -use anyhow::{anyhow, Result}; -use encoding_rs::Encoding; -use foldhash::fast::RandomState; +use anyhow::Result; +use encoding_rs::{Encoding, UTF_8}; use html2text::{ from_read, from_read_with_decorator, render::{RichDecorator, TrivialDecorator}, }; -use indexmap::IndexMap; -use pyo3::{prelude::*, types::PyBytes}; +use http::HeaderMap; +use mime::Mime; +use pyo3::{ + prelude::*, + types::{PyBytes, PyDict}, + IntoPyObjectExt, +}; use pythonize::pythonize; use serde_json::from_slice; +use crate::RUNTIME; +use http_body_util::BodyExt; + /// A struct representing an HTTP response. /// /// This struct provides methods to access various parts of an HTTP response, such as headers, cookies, status code, and the response body. /// It also supports decoding the response body as text or JSON, with the ability to specify the character encoding. #[pyclass] pub struct Response { - #[pyo3(get)] - pub content: Py, - #[pyo3(get)] - pub cookies: IndexMap, - #[pyo3(get, set)] - pub encoding: String, - #[pyo3(get)] - pub headers: IndexMap, - #[pyo3(get)] - pub status_code: u16, + pub resp: http::Response, + pub _content: Option>, + pub _encoding: Option, #[pyo3(get)] pub url: String, } @@ -34,57 +33,110 @@ pub struct Response { #[pymethods] impl Response { #[getter] - fn get_encoding(&mut self, py: Python) -> Result<&String> { - if !self.encoding.is_empty() { - return Ok(&self.encoding); + fn get_content<'rs>(&mut self, py: Python<'rs>) -> Result> { + if let Some(content) = &self._content { + let cloned = content.clone_ref(py); + return Ok(cloned.into_bound(py)); } - self.encoding = get_encoding_from_headers(&self.headers) - .or_else(|| get_encoding_from_content(self.content.as_bytes(py))) - .unwrap_or_else(|| "utf-8".to_string()); - Ok(&self.encoding) + + let bytes = py.allow_threads(|| { + RUNTIME.block_on(async { + BodyExt::collect(self.resp.body_mut()) + .await + .map(|buf| buf.to_bytes()) + }) + })?; + + let content = PyBytes::new(py, &bytes); + self._content = Some(content.clone().unbind()); + Ok(content) } #[getter] - fn text(&mut self, py: Python) -> Result { - // If self.encoding is empty, call get_encoding to populate self.encoding - if self.encoding.is_empty() { - self.get_encoding(py)?; + fn encoding<'rs>(&self, py: Python<'rs>) -> Result { + if let Some(encoding) = &self._encoding { + return Ok(encoding.clone()); } + let encoding = py.allow_threads(|| { + let content_type = self + .resp + .headers() + .get(http::header::CONTENT_TYPE) + .and_then(|value| value.to_str().ok()) + .and_then(|value| value.parse::().ok()); - // Convert Py to &[u8] - let raw_bytes = self.content.as_bytes(py); + let encoding = content_type + .as_ref() + .and_then(|mime| mime.get_param("charset").map(|charset| charset.as_str())) + .unwrap_or("utf-8") + .to_string(); + encoding + }); + Ok(encoding) + } - // Release the GIL here because decoding can be CPU-intensive - py.allow_threads(|| { - let encoding = Encoding::for_label(self.encoding.as_bytes()) - .ok_or_else(|| anyhow!("Unsupported charset: {}", self.encoding))?; - let (decoded_str, detected_encoding, _) = encoding.decode(raw_bytes); + #[getter] + fn text(&mut self, py: Python) -> Result { + let content = self.get_content(py)?.unbind(); + let encoding = self.encoding(py)?; + let raw_bytes = content.as_bytes(py); + let text = py.allow_threads(|| { + let encoding = Encoding::for_label(encoding.as_bytes()).unwrap_or(UTF_8); + let (text, _, _) = encoding.decode(raw_bytes); + text + }); + Ok(text.to_string()) + } - // Update self.encoding based on the detected encoding - if self.encoding != detected_encoding.name() { - self.encoding = detected_encoding.name().to_string(); - } + fn json<'rs>(&mut self, py: Python<'rs>) -> Result> { + let content = self.get_content(py)?.unbind(); + let raw_bytes = content.as_bytes(py); + let json_value: serde_json::Value = from_slice(raw_bytes)?; + let result = pythonize(py, &json_value)?; + Ok(result) + } - Ok(decoded_str.to_string()) - }) + #[getter] + fn headers<'rs>(&self, py: Python<'rs>) -> Result> { + let res = PyDict::new(py); + for (key, value) in self.resp.headers() { + res.set_item(key.as_str(), value.to_str()?)?; + } + Ok(res) } - fn json(&mut self, py: Python) -> Result { - let json_value: serde_json::Value = from_slice(self.content.as_bytes(py))?; - let result = pythonize(py, &json_value).unwrap().unbind(); - Ok(result) + #[getter] + fn cookies<'rs>(&self, py: Python<'rs>) -> Result> { + let cookie_dict = PyDict::new(py); + let set_cookies = self.resp.headers().get_all(http::header::SET_COOKIE); + for cookie_header in set_cookies.iter() { + if let Ok(cookie_str) = cookie_header.to_str() { + if let Some((name, value)) = cookie_str.split_once('=') { + cookie_dict.set_item(name.trim(), value.trim())?; + } + } + } + Ok(cookie_dict) + } + + #[getter] + fn status_code<'rs>(&self, py: Python<'rs>) -> Result> { + let status_code = self.resp.status().as_u16().into_bound_py_any(py)?; + Ok(status_code) } #[getter] fn text_markdown(&mut self, py: Python) -> Result { - let raw_bytes = self.content.bind(py).as_bytes(); + let content = self.get_content(py)?.unbind(); + let raw_bytes = content.as_bytes(py); let text = py.allow_threads(|| from_read(raw_bytes, 100))?; Ok(text) } #[getter] fn text_plain(&mut self, py: Python) -> Result { - let raw_bytes = self.content.bind(py).as_bytes(); + let content = self.get_content(py)?.unbind(); + let raw_bytes = content.as_bytes(py); let text = py.allow_threads(|| from_read_with_decorator(raw_bytes, 100, TrivialDecorator::new()))?; Ok(text) @@ -92,7 +144,8 @@ impl Response { #[getter] fn text_rich(&mut self, py: Python) -> Result { - let raw_bytes = self.content.bind(py).as_bytes(); + let content = self.get_content(py)?.unbind(); + let raw_bytes = content.as_bytes(py); let text = py.allow_threads(|| from_read_with_decorator(raw_bytes, 100, RichDecorator::new()))?; Ok(text)