-
Notifications
You must be signed in to change notification settings - Fork 495
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #103 from louis030195/apple-native-ocr
feat: apple native ocr
- Loading branch information
Showing
14 changed files
with
246 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
use std::env; | ||
|
||
/// Build script entry point.
///
/// When the crate is compiled *for* macOS, emits the linker directives needed
/// to find and link the Swift-built `libscreenpipe.dylib`:
///
/// * an `-rpath` entry chosen from the `DESTINATION` environment variable
///   (`"brew"`, `"tauri"`, or anything else for the in-repo default), and
/// * a `rustc-link-lib` directive for the `screenpipe` dynamic library.
///
/// For every other target the script emits nothing.
fn main() {
    // Build scripts are compiled for and run on the *host*, so
    // `#[cfg(target_os = ...)]` would describe the build machine. Cargo exposes
    // the real compilation target through `CARGO_CFG_TARGET_OS`.
    let targets_macos = env::var("CARGO_CFG_TARGET_OS")
        .map(|os| os == "macos")
        .unwrap_or(false);
    if !targets_macos {
        return;
    }

    // The rpath depends on DESTINATION, so ask Cargo to re-run this script
    // whenever that variable changes instead of reusing stale output.
    println!("cargo:rerun-if-env-changed=DESTINATION");

    let destination = env::var("DESTINATION").unwrap_or_default();
    let rpath = match destination.as_str() {
        "brew" => "@executable_path/../lib",
        "tauri" => "@executable_path/../Frameworks",
        _ => "@executable_path/../../screenpipe-vision/lib",
    };

    println!("cargo:rustc-link-arg=-Wl,-rpath,{}", rpath);
    println!("cargo:rustc-link-lib=dylib=screenpipe");
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
use image::DynamicImage; | ||
use std::ffi::CStr; | ||
use std::os::raw::{c_char, c_uchar}; | ||
|
||
// Foreign interface to the Swift OCR routine shipped in the `screenpipe`
// dynamic library (exported from Swift under the C symbol `perform_ocr`).
#[link(name = "screenpipe")]
extern "C" {
    /// Runs text recognition on a raw pixel buffer.
    ///
    /// * `image_data` / `length` - pointer to the pixel bytes and their count.
    /// * `width` / `height` - image dimensions in pixels, passed as 32-bit
    ///   signed integers.
    ///
    /// Returns a C string that the caller owns and must release with `free`.
    /// The pointer may be null.
    ///
    /// NOTE(review): the exported Swift symbol must take `width`/`height` as
    /// 32-bit integers (`Int32`, not `Int`) for this declaration to match.
    fn perform_ocr(
        image_data: *const c_uchar,
        length: usize,
        width: i32,
        height: i32,
    ) -> *mut c_char;
}
|
||
pub fn perform_ocr_apple(image: &DynamicImage) -> String { | ||
let rgba = image.to_rgba8(); | ||
let (width, height) = rgba.dimensions(); | ||
let raw_data = rgba.as_raw(); | ||
|
||
unsafe { | ||
let result_ptr = perform_ocr( | ||
raw_data.as_ptr(), | ||
raw_data.len(), | ||
width as i32, | ||
height as i32, | ||
); | ||
let result = CStr::from_ptr(result_ptr).to_string_lossy().into_owned(); | ||
libc::free(result_ptr as *mut libc::c_void); | ||
result | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,7 @@ | ||
pub mod apple; | ||
pub mod core; | ||
pub mod utils; | ||
pub use core::{continuous_capture, get_monitor, process_ocr_task, CaptureResult}; | ||
#[cfg(target_os = "macos")] | ||
pub use apple::perform_ocr_apple; | ||
pub use core::{continuous_capture, process_ocr_task, CaptureResult, ControlMessage}; | ||
pub use utils::{perform_ocr_tesseract, OcrEngine}; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import CoreGraphics | ||
import Foundation | ||
import Vision | ||
|
||
/// Recognises text in a raw RGBA8 pixel buffer using the Vision framework.
///
/// Exported to C as `perform_ocr`.
///
/// - Parameters:
///   - imageData: Pointer to tightly packed 8-bit RGBA pixels (4 bytes per pixel).
///   - length: Number of readable bytes at `imageData`.
///   - width: Image width in pixels. Declared as `Int32` because the C caller
///     passes a 32-bit integer; Swift's `Int` is 64-bit and would not match.
///   - height: Image height in pixels (32-bit, see `width`).
/// - Returns: A `strdup`-allocated C string that the caller must `free`. It holds
///   the recognised lines separated by `\n`, `"No text found"` when nothing was
///   recognised, or a message starting with `"Error: "` on failure.
@_cdecl("perform_ocr")
public func performOCR(imageData: UnsafePointer<UInt8>, length: Int, width: Int32, height: Int32)
  -> UnsafeMutablePointer<CChar>?
{
  let pixelWidth = Int(width)
  let pixelHeight = Int(height)
  let bytesPerRow = pixelWidth * 4

  // Refuse buffers that cannot hold the advertised image, so CoreGraphics is
  // never asked to read past the end of the copied data.
  guard pixelWidth > 0, pixelHeight > 0, length >= bytesPerRow * pixelHeight else {
    return strdup("Error: Invalid image dimensions or buffer length")
  }

  // `Data(bytes:count:)` copies the pixels, so the image does not depend on
  // the caller's buffer after this point.
  guard let dataProvider = CGDataProvider(data: Data(bytes: imageData, count: length) as CFData)
  else {
    return strdup("Error: Failed to create CGDataProvider")
  }

  guard
    let cgImage = CGImage(
      width: pixelWidth,
      height: pixelHeight,
      bitsPerComponent: 8,
      bitsPerPixel: 32,
      bytesPerRow: bytesPerRow,
      space: CGColorSpaceCreateDeviceRGB(),
      bitmapInfo: CGBitmapInfo(rawValue: CGImageAlphaInfo.premultipliedLast.rawValue),
      provider: dataProvider,
      decode: nil,
      shouldInterpolate: false,
      intent: .defaultIntent
    )
  else {
    return strdup("Error: Failed to create CGImage")
  }

  // Signalled by the completion handler so the result is only read after the
  // handler has run.
  let semaphore = DispatchSemaphore(value: 0)
  var ocrResult = ""

  let request = VNRecognizeTextRequest { finishedRequest, error in
    defer { semaphore.signal() }

    if let error = error {
      ocrResult = "Error: \(error.localizedDescription)"
      return
    }

    guard let observations = finishedRequest.results as? [VNRecognizedTextObservation] else {
      ocrResult = "Error: Failed to process image or no text found"
      return
    }

    // Keep the best candidate of each observation, one per line; observations
    // without any candidate are skipped.
    ocrResult =
      observations
      .compactMap { $0.topCandidates(1).first?.string }
      .map { "\($0)\n" }
      .joined()
  }

  request.recognitionLevel = .accurate

  let handler = VNImageRequestHandler(cgImage: cgImage, options: [:])
  do {
    try handler.perform([request])
  } catch {
    return strdup("Error: Failed to perform OCR - \(error.localizedDescription)")
  }

  semaphore.wait()

  return strdup(ocrResult.isEmpty ? "No text found" : ocrResult)
}
|
||
// Build the dynamic library into the repository's lib directory:
//   swiftc -emit-library -o screenpipe-vision/lib/libscreenpipe.dylib screenpipe-vision/src/ocr.swift
//
// Or build it directly into /usr/local/lib:
//   swiftc -emit-library -o /usr/local/lib/libscreenpipe.dylib screenpipe-vision/src/ocr.swift
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
#[cfg(target_os = "macos")]
#[cfg(test)]
mod tests {
    use screenpipe_vision::perform_ocr_apple;
    use std::path::PathBuf;

    /// Runs the native OCR on the bundled `tests/testing_OCR.png` fixture and
    /// checks that a known snippet of its text is recognised.
    ///
    /// `perform_ocr_apple` is synchronous, so a plain `#[test]` is sufficient;
    /// no async runtime is needed.
    #[test]
    fn test_apple_native_ocr() {
        let path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("tests")
            .join("testing_OCR.png");

        let image = image::open(&path)
            .unwrap_or_else(|err| panic!("Failed to open image {:?}: {}", path, err));

        let result = perform_ocr_apple(&image);

        println!("OCR text: {:?}", result);
        assert!(
            result.contains("ocr_tx.receiver_count"),
            "OCR failed: {:?}",
            result
        );
    }
}