refine check_wrong_beginning_letters function

the function now considers abbreviations valid if they are similar to full text while not strictly having the same starting letters as the full names
philcaz · Oct 20, 2024 · 34577b5 · 34577b5
1 parent f2970a4
commit 34577b5
Showing 1 changed file with 102 additions and 23 deletions.
diff --git a/scripts/check_quality.py b/scripts/check_quality.py
@@ -58,34 +58,113 @@ def check_wrong_escape(filepath):
 # Check for wrong beginning letters in journal abbreviations
 def check_wrong_beginning_letters(filepath):
     # Words that are typically ignored when creating abbreviations
-    ignore_words = {'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 'la', 'el', 'le', 'et'}
+    ignore_words = {
+        'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by', 
+        'la', 'el', 'le', 'et', 'der', 'die', 'das', 'dem', 'und', 'für'  # Articles in multiple languages
+    }
 
+    # Special cases for abbreviations
+    special_cases = {
+        'proceedings': ['p', 'proc'],
+        'or': ['or'],
+        'spie': ['spie'],
+        'notes': ['notes']
+    }
+
+    def clean_text(text):
+        # Remove special characters except periods (important for compound abbreviations)
+        # and normalize spaces
+        cleaned = re.sub(r'[^\w\s\.]', ' ', text)
+        return ' '.join(filter(None, cleaned.lower().split()))
+
+    def split_compound_abbrev(abbrev):
+        # Split abbreviation that might contain compound parts (e.g., "Nat.forsch")
+        parts = []
+        for part in abbrev.split():
+            # Split on periods but keep them with the preceding part
+            subparts = [sp for sp in re.split(r'(?<=\.)(?=[^\.])', part) if sp]
+            parts.extend(subparts)
+        return parts
+
+    def get_significant_words(text):
+        # Split text into words and filter out ignore words
+        return [w for w in clean_text(text).split() if w.lower() not in ignore_words]
+
+    def is_compound_word_match(full_word, abbrev_part):
+        # Handle compound word abbreviations (e.g., "Nat.forsch" matching "Naturforschenden")
+        if '.' in abbrev_part:
+            # Split the compound abbreviation
+            abbrev_subparts = abbrev_part.split('.')
+            # Get the first few characters of the full word to match against first part
+            word_start = full_word[:len(abbrev_subparts[0])]
+
+            # For the second part (if exists), try to find it within the remaining word
+            if len(abbrev_subparts) > 1 and abbrev_subparts[1]:
+                remaining_word = full_word[len(abbrev_subparts[0]):]
+                return (word_start.lower() == abbrev_subparts[0].lower() and 
+                       abbrev_subparts[1].lower() in remaining_word.lower())
+
+            return word_start.lower() == abbrev_subparts[0].lower()
+        return False
+
+    def is_valid_abbreviation(full_name, abbrev):
+        # Clean and split both strings
+        full_words = get_significant_words(full_name)
+        abbrev_parts = split_compound_abbrev(clean_text(abbrev))
+
+        # Handle cases where abbreviation is the same as full name
+        if clean_text(full_name) == clean_text(abbrev):
+            return True
+
+        # Handle special cases
+        for special_word, valid_abbrevs in special_cases.items():
+            if special_word in full_words:
+                if any(va in abbrev_parts for va in valid_abbrevs):
+                    return True
+
+        # Track matched parts and their positions
+        matched_parts = 0
+        used_full_words = set()
+
+        for abbrev_part in abbrev_parts:
+            found_match = False
+
+            # Try matching against each full word
+            for i, full_word in enumerate(full_words):
+                if i in used_full_words:
+                    continue
+
+                # Check for compound word match
+                if is_compound_word_match(full_word, abbrev_part):
+                    found_match = True
+                    matched_parts += 1
+                    used_full_words.add(i)
+                    break
+
+                # Check for regular abbreviation patterns
+                elif (full_word.lower().startswith(abbrev_part.lower()) or
+                      (len(abbrev_part) >= 2 and abbrev_part[0].lower() == full_word[0].lower())):
+                    found_match = True
+                    matched_parts += 1
+                    used_full_words.add(i)
+                    break
+
+        # Consider the abbreviation valid if we matched most parts
+        min_required_matches = max(1, len(abbrev_parts) * 0.5)
+        return matched_parts >= min_required_matches
+
     with open(filepath, 'r', encoding='utf-8') as f:
         reader = csv.reader(f)
         for line_number, row in enumerate(reader, start=1):
-            if len(row) < 2:  # Skip if row doesn't have both full name and abbreviation
-                continue
+            if len(row) >= 2:
+                full_name = row[0].strip()
+                abbreviation = row[1].strip()
 
-            full_name = row[0].strip()
-            abbreviation = row[1].strip()
-
-            # Skip empty entries
-            if not full_name or not abbreviation:
-                continue
-
-            # Get significant words from full name (ignoring articles, prepositions, etc.)
-            full_words = [word for word in full_name.split() 
-                         if word.lower() not in ignore_words]
-            abbrev_words = abbreviation.split()
-
-            # Skip if either is empty after filtering
-            if not full_words or not abbrev_words:
-                continue
-
-            # Check if abbreviation starts with the same letter as the first significant word
-            if not abbrev_words[0].lower().startswith(full_words[0][0].lower()):
-                error(f"Wrong beginning letter found in {filepath} at line {line_number} " f"Full: '{full_name}', Abbrev: '{abbreviation}')", 
-                      'ERROR Wrong Starting Letter')
+                if not is_valid_abbreviation(full_name, abbreviation):
+                    error(
+                        f"Abbrev mismatch full name pattern in {filepath} at line {line_number}:" f"Full: '{full_name}', " f"Abbrev: '{abbreviation}'",
+                        'ERROR Wrong Starting Letter'
+                    )
 
 
 # Check for duplicate entries