ucbepic · sushruth2003 · Nov 29, 2024 · shreyashankar · Nov 30, 2024 · shreyashankar
diff --git a/docetl/operations/link_resolve.py b/docetl/operations/link_resolve.py
@@ -1,10 +1,11 @@
 import random
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Dict, List, Tuple
-
 import jinja2
 from jinja2 import Template
 from rich.prompt import Confirm
+import re
+from slugify import slugify
 
 from docetl.operations.base import BaseOperation
 from docetl.operations.utils import RichLoopBar, rich_as_completed
@@ -139,6 +140,12 @@ def execute(self, input_data: List[Dict]) -> Tuple[List[Dict], float]:
         return input_data, total_cost
 
     def compare(self, link_idx, id_idx, link_value, id_value, item):
+        # Try basic string matching first
+        if isinstance(link_value, str) and isinstance(id_value, str):
+            if link_value.lower() == id_value.lower() or slugify(link_value) == slugify(id_value):
+                self.replacements[link_value] = id_value
+                return 0.0
+
         prompt = self.prompt_template.render(
             link_value = link_value,
             id_value = id_value,

diff --git a/docetl/operations/resolve.py b/docetl/operations/resolve.py
@@ -6,6 +6,8 @@
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from typing import Any, Dict, List, Tuple, Optional
+import re
+from slugify import slugify
 
 import jinja2
 from jinja2 import Template
@@ -23,6 +25,14 @@ def find_cluster(item, cluster_map):
         item = cluster_map[item]
     return item
 
+def are_strings_similar(str1: Any, str2: Any) -> bool:
+    """Return True if both values are strings equal case-insensitively or by slug."""
+    # Non-string values (None, numbers, lists, dicts) have no .lower() and would
+    # raise AttributeError, so they are never treated as a basic string match.
+    if not isinstance(str1, str) or not isinstance(str2, str):
+        return False
+    return str1.lower() == str2.lower() or slugify(str1) == slugify(str2)
+
 
 class ResolveOperation(BaseOperation):
     class schema(BaseOperation.schema):
@@ -54,7 +64,7 @@ def compare_pair(
         max_retries_per_timeout: int = 2,
     ) -> Tuple[bool, float]:
         """
-        Compares two items using an LLM model to determine if they match.
+        Compares two items using basic string matching first, falling back to LLM for complex cases.
 
         Args:
             comparison_prompt (str): The prompt template for comparison.
@@ -65,6 +75,7 @@ def compare_pair(
         Returns:
             Tuple[bool, float]: A tuple containing a boolean indicating whether the items match and the cost of the comparison.
         """
+        # Check blocking keys first (case-insensitive exact match)
         if blocking_keys:
             if all(
                 key in item1
@@ -74,6 +85,22 @@ def compare_pair(
             ):
                 return True, 0
 
+        # For each key that exists in both items, try basic string matching
+        common_keys = set(item1.keys()) & set(item2.keys())
+        if common_keys:
+            exact_matches = 0
+            for key in common_keys:
+                if are_strings_similar(item1[key], item2[key]):
+                    exact_matches += 1
+
+            # If all common fields match exactly, return True
+            if exact_matches == len(common_keys):
+                return True, 0
+            # If no fields match at all, likely not a match
+            if exact_matches == 0 and len(common_keys) > 1:
+                return False, 0
+
+        # For complex cases, fall back to LLM
         prompt_template = Template(comparison_prompt)
         prompt = prompt_template.render(input1=item1, input2=item2)
         response = self.runner.api.call_llm(

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -38,6 +38,7 @@ rapidfuzz = "^3.10.0"
 fastapi = { version = "^0.115.0", optional = true }
 uvicorn = { version = "^0.31.0", optional = true }
 websockets = "^13.1"
+python-slugify = "^8.0.4"
 
 [tool.poetry.extras]
 parsing = ["python-docx", "openpyxl", "pydub", "python-pptx", "azure-ai-documentintelligence", "paddlepaddle", "paddleocr", "pymupdf"]