diff --git a/src/lib.rs b/src/lib.rs
index 2baace6..e4cd581 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -11,6 +11,7 @@ use std::{
     path::{Path, PathBuf},
     sync::mpsc,
 };
+use regex::Regex;
 
 pub mod detectors;
 pub mod filters;
@@ -289,6 +290,127 @@ fn filter_candidates(
     }
 }
 
+/// Detects the programming language from the given text content
+///
+/// No filename or extension is available, so every language in `LANGUAGE_INFO` is offered to
+/// the classifier as a candidate. Content that `looks_like_plain_text` flags is rejected
+/// before classification.
+///
+/// If the language cannot be determined, None will be returned.
+///
+/// # Examples
+/// ```
+/// use hyperpolyglot::{detect_from_text, Detection};
+///
+/// let content = r#"
+/// fn main() {
+///     println!("Hello World!");
+/// }
+/// "#;
+/// let language = detect_from_text(content).unwrap();
+/// assert_eq!(Detection::Classifier("Rust"), language);
+/// ```
+pub fn detect_from_text(content: &str) -> Option<Detection> {
+    // Early return if the content looks like plain text
+    if looks_like_plain_text(content) {
+        return None;
+    }
+
+    // Since we don't have filename/extension info, we'll use all supported languages as candidates
+    let candidates: Vec<&'static str> = LANGUAGE_INFO.keys().copied().collect();
+
+    // Truncate content if needed
+    let content = truncate_to_char_boundary(content, MAX_CONTENT_SIZE_BYTES);
+
+    // Use classifier to determine the language
+    Some(Detection::Classifier(detectors::classify(content, &candidates)))
+}
+
+/// Regular expressions used by `looks_like_plain_text`
+struct PlainTextPatterns {
+    /// Common build output and log patterns
+    build_log: [Regex; 4],
+    /// A line that starts with a capital letter and ends with `.`, `!` or `?`
+    sentence: Regex,
+    /// Punctuation and leading keywords that are typical of source code
+    code: [Regex; 3],
+}
+
+/// Returns the shared pattern set, compiling it on first use
+///
+/// Compiling a regex is far more expensive than matching one line against it, so the patterns
+/// are built once instead of on every call.
+fn plain_text_patterns() -> &'static PlainTextPatterns {
+    static PATTERNS: std::sync::OnceLock<PlainTextPatterns> = std::sync::OnceLock::new();
+
+    PATTERNS.get_or_init(|| {
+        let compile =
+            |pattern: &str| Regex::new(pattern).expect("hard-coded pattern is a valid regex");
+
+        PlainTextPatterns {
+            build_log: [
+                compile(r"(built|Building|building) in \d+"),
+                // Time stamps, optionally followed by `.` or `,` and milliseconds
+                compile(r"^\[?\d{1,2}:\d{2}:\d{2}([.,]\d{3})?\]?"),
+                // Common log indicators
+                compile(r"^(✔|✓|->|\[INFO\]|\[ERROR\]|\[WARN\])"),
+                compile(r"(starting|finished|completed|done|ready|listening)"),
+            ],
+            sentence: compile(r"^[A-Z].*[.!?]$"),
+            code: [
+                compile(r"[{}\[\]();]"),
+                compile(r"^(function|def|class|if|for|while|import|package)\b"),
+                compile(r"^\s*(public|private|protected)\b"),
+            ],
+        }
+    })
+}
+
+/// Fraction of `lines` that match at least one of `patterns`
+///
+/// `lines` must not be empty, otherwise the result is NaN.
+fn matching_ratio(lines: &[&str], patterns: &[Regex]) -> f32 {
+    let hits = lines
+        .iter()
+        .filter(|line| patterns.iter().any(|pattern| pattern.is_match(line)))
+        .count();
+    hits as f32 / lines.len() as f32
+}
+
+/// Heuristically decides whether `content` is prose or build/log output rather than code
+///
+/// Lines are trimmed and blank lines are ignored. Returns true when
+/// - there are no non-blank lines,
+/// - more than 20% of the lines match a build/log pattern,
+/// - sentence-like lines outnumber code-like lines and exceed 30% of the lines, or
+/// - fewer than 10% of the lines look like code.
+fn looks_like_plain_text(content: &str) -> bool {
+    let lines: Vec<&str> = content
+        .lines()
+        .map(str::trim)
+        .filter(|line| !line.is_empty())
+        .collect();
+
+    if lines.is_empty() {
+        return true;
+    }
+
+    let patterns = plain_text_patterns();
+
+    // If it looks like build output or logs, it's not code
+    if matching_ratio(&lines, &patterns.build_log) > 0.2 {
+        return true;
+    }
+
+    let code_ratio = matching_ratio(&lines, &patterns.code);
+    let sentence_ratio = matching_ratio(&lines, std::slice::from_ref(&patterns.sentence));
+
+    // Consider it plain text if:
+    // 1. It has more sentence-like lines than code-like lines AND sentence ratio is significant
+    // 2. OR if it has very few code-like patterns
+    (sentence_ratio > code_ratio && sentence_ratio > 0.3) || code_ratio < 0.1
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;