Merge pull request #2 from philcaz/add-quality-checker
Add quality checker and set up GitHub Action file to run the checker
surluna authored Oct 20, 2024
2 parents 50edbd5 + 34577b5 commit 3142f3c
Showing 2 changed files with 302 additions and 0 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/quality-check.yml
@@ -0,0 +1,35 @@
name: Quality Check

on: [push, pull_request]

jobs:
  quality-check:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'

      - name: Run Quality Check
        id: quality_check
        run: |
          python ./scripts/check_quality.py
        continue-on-error: true  # Continue if there are warnings/errors, so we can log the output

      - name: Upload Quality Check Summary
        if: always()
        uses: actions/upload-artifact@v3
        with:
          name: check-quality-summary
          path: ./check_quality_summary.txt

      - name: Fail on Errors
        if: steps.quality_check.outcome == 'failure'
        run: |
          echo "Quality check failed due to errors."
          exit 1
267 changes: 267 additions & 0 deletions scripts/check_quality.py
@@ -0,0 +1,267 @@
import os
import re
import sys
import itertools
import csv

# Path to the journals folder (change this path accordingly)
JOURNALS_FOLDER_PATH = "./journals/"
SUMMARY_FILE_PATH = "./check_quality_summary.txt"
errors = []
warnings = []

# Error and Warning Counts
error_counts = {
    'ERROR Wrong Escape': 0,
    'ERROR Wrong Starting Letter': 0,
    'ERROR Non-UTF8': 0
}
warning_counts = {
    'WARN Duplicate FullName/Abbreviation': 0,
    'WARN Same Abbreviation as Full Name': 0,
    'WARN Outdated Manage Abbreviation': 0
}


# Error tracking
def error(message, error_type):
    errors.append((error_type, f"ERROR: {message}"))
    error_counts[error_type] += 1


# Warning tracking
def warning(message, warning_type):
    warnings.append((warning_type, f"WARN: {message}"))
    warning_counts[warning_type] += 1

# Check if non-UTF8 characters are present in the file
def check_non_utf8_characters(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            for line_number, line in enumerate(f, start=1):
                try:
                    line.encode('utf-8')
                except UnicodeEncodeError as e:
                    error(f"Non-UTF8 character found in {filepath} at line {line_number}: {e}", 'ERROR Non-UTF8')
    except UnicodeDecodeError as e:
        error(f"File {filepath} contains non-UTF-8 characters: {e}", 'ERROR Non-UTF8')

# Check if there are wrong escape characters in abbreviation entries
def check_wrong_escape(filepath):
    # Allowed escapes, stored as the literal two-character sequences that the regex below returns
    valid_escapes = {r'\\', r'\n', r'\t', r'\r', r'\"'}
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for line_number, row in enumerate(reader, start=1):
            for field in row:
                matches = re.findall(r"\\.", field)
                for match in matches:
                    if match not in valid_escapes:
                        error(f"Wrong escape character found in {filepath} at line {line_number}: {field}", 'ERROR Wrong Escape')

# Check for wrong beginning letters in journal abbreviations
def check_wrong_beginning_letters(filepath):
    # Words that are typically ignored when creating abbreviations
    ignore_words = {
        'a', 'an', 'and', 'the', 'of', 'or', 'in', 'on', 'at', 'to', 'for', 'with', 'by',
        'la', 'el', 'le', 'et', 'der', 'die', 'das', 'dem', 'und', 'für'  # Articles in multiple languages
    }

    # Special cases for abbreviations
    special_cases = {
        'proceedings': ['p', 'proc'],
        'or': ['or'],
        'spie': ['spie'],
        'notes': ['notes']
    }

    def clean_text(text):
        # Remove special characters except periods (important for compound abbreviations)
        # and normalize spaces
        cleaned = re.sub(r'[^\w\s\.]', ' ', text)
        return ' '.join(filter(None, cleaned.lower().split()))

    def split_compound_abbrev(abbrev):
        # Split an abbreviation that might contain compound parts (e.g., "Nat.forsch")
        parts = []
        for part in abbrev.split():
            # Split on periods but keep them with the preceding part
            subparts = [sp for sp in re.split(r'(?<=\.)(?=[^\.])', part) if sp]
            parts.extend(subparts)
        return parts

    def get_significant_words(text):
        # Split text into words and filter out ignore words
        return [w for w in clean_text(text).split() if w.lower() not in ignore_words]

    def is_compound_word_match(full_word, abbrev_part):
        # Handle compound word abbreviations (e.g., "Nat.forsch" matching "Naturforschenden")
        if '.' in abbrev_part:
            # Split the compound abbreviation
            abbrev_subparts = abbrev_part.split('.')
            # Get the first few characters of the full word to match against the first part
            word_start = full_word[:len(abbrev_subparts[0])]

            # For the second part (if it exists), try to find it within the remaining word
            if len(abbrev_subparts) > 1 and abbrev_subparts[1]:
                remaining_word = full_word[len(abbrev_subparts[0]):]
                return (word_start.lower() == abbrev_subparts[0].lower() and
                        abbrev_subparts[1].lower() in remaining_word.lower())

            return word_start.lower() == abbrev_subparts[0].lower()
        return False

    def is_valid_abbreviation(full_name, abbrev):
        # Clean and split both strings
        full_words = get_significant_words(full_name)
        abbrev_parts = split_compound_abbrev(clean_text(abbrev))

        # Handle cases where the abbreviation is the same as the full name
        if clean_text(full_name) == clean_text(abbrev):
            return True

        # Handle special cases
        for special_word, valid_abbrevs in special_cases.items():
            if special_word in full_words:
                if any(va in abbrev_parts for va in valid_abbrevs):
                    return True

        # Track matched parts and their positions
        matched_parts = 0
        used_full_words = set()

        for abbrev_part in abbrev_parts:
            found_match = False

            # Try matching against each full word
            for i, full_word in enumerate(full_words):
                if i in used_full_words:
                    continue

                # Check for compound word match
                if is_compound_word_match(full_word, abbrev_part):
                    found_match = True
                    matched_parts += 1
                    used_full_words.add(i)
                    break

                # Check for regular abbreviation patterns
                elif (full_word.lower().startswith(abbrev_part.lower()) or
                      (len(abbrev_part) >= 2 and abbrev_part[0].lower() == full_word[0].lower())):
                    found_match = True
                    matched_parts += 1
                    used_full_words.add(i)
                    break

        # Consider the abbreviation valid if we matched most parts
        min_required_matches = max(1, len(abbrev_parts) * 0.5)
        return matched_parts >= min_required_matches

    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for line_number, row in enumerate(reader, start=1):
            if len(row) >= 2:
                full_name = row[0].strip()
                abbreviation = row[1].strip()

                if not is_valid_abbreviation(full_name, abbreviation):
                    error(
                        f"Abbreviation does not match the full name pattern in {filepath} at line {line_number}: "
                        f"Full: '{full_name}', Abbrev: '{abbreviation}'",
                        'ERROR Wrong Starting Letter'
                    )


# Check for duplicate entries
def check_duplicates(filepath):
    full_name_entries = {}
    abbreviation_entries = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for line_number, row in enumerate(reader, start=1):
            if len(row) < 2:
                continue

            full_name = row[0].strip()
            abbreviation = row[1].strip()

            # Check for duplicate full names or abbreviations
            if full_name in full_name_entries or abbreviation in abbreviation_entries:
                warning(
                    f"Duplicate found in {filepath} at line {line_number}: "
                    f"Full Name: '{full_name}', Abbreviation: '{abbreviation}', "
                    f"first instance seen at line {full_name_entries.get(full_name) or abbreviation_entries.get(abbreviation)}",
                    'WARN Duplicate FullName/Abbreviation'
                )
            else:
                full_name_entries[full_name] = line_number
                abbreviation_entries[abbreviation] = line_number

# Check if abbreviation and full form are the same
def check_full_form_identical_to_abbreviation(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for line_number, row in enumerate(reader, start=1):
            if len(row) == 2 and row[0].strip() == row[1].strip() and ' ' in row[0].strip():
                warning(f"Abbreviation is the same as full form in {filepath} at line {line_number}: {row[0]}", 'WARN Same Abbreviation as Full Name')

# Check for outdated abbreviations
def check_outdated_abbreviations(filepath):
    with open(filepath, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for line_number, row in enumerate(reader, start=1):
            # Flag any field that still uses the outdated "Manage." form (the current form is "Manag.")
            if any("Manage." in field for field in row):
                warning(f"Outdated abbreviation used in {filepath} at line {line_number}: {','.join(row)}", 'WARN Outdated Manage Abbreviation')

if __name__ == "__main__":
    if not os.path.exists(JOURNALS_FOLDER_PATH):
        print("Journals folder not found. Please make sure the path is correct.")
        sys.exit(1)

    # Iterate through all CSV files in the journals folder
    for filename in os.listdir(JOURNALS_FOLDER_PATH):
        if filename.endswith(".csv"):
            filepath = os.path.join(JOURNALS_FOLDER_PATH, filename)

            # Run the checks
            check_non_utf8_characters(filepath)
            check_wrong_escape(filepath)
            check_wrong_beginning_letters(filepath)
            check_duplicates(filepath)
            check_full_form_identical_to_abbreviation(filepath)
            check_outdated_abbreviations(filepath)

    # Write the summary to a file
    total_issues = sum(error_counts.values()) + sum(warning_counts.values())
    summary_output = []

    summary_output.append("# Quality Check Summary Report\n")
    summary_output.append("| Status | Count |\n")
    summary_output.append("| ------------- | ----- |\n")
    summary_output.append(f"| 🔍 Total Issues | {total_issues} |\n")
    summary_output.append(f"| ❌ Errors Found | {sum(error_counts.values())} |\n")
    summary_output.append(f"| ⚠️ Warnings Found | {sum(warning_counts.values())} |\n\n")

    # Write detailed errors and warnings
    if errors or warnings:
        summary_output.append("## Errors per Input File\n\n")
        files = set([msg.split(' in ')[1].split(' at ')[0] for _, msg in errors + warnings])
        for file in files:
            summary_output.append(f"### Issues in file `{file}`\n")
            file_errors = [msg for err_type, msg in errors if file in msg]
            file_warnings = [msg for warn_type, msg in warnings if file in msg]
            if file_errors:
                summary_output.append("#### Errors:\n")
                for err in file_errors:
                    summary_output.append(f"- {err.split('ERROR: ')[1]}\n")
            if file_warnings:
                summary_output.append("#### Warnings:\n")
                for warn in file_warnings:
                    summary_output.append(f"- {warn.split('WARN: ')[1]}\n")
            summary_output.append("\n")
    else:
        summary_output.append("Quality check completed with no errors or warnings.\n")

    # Write to the summary file
    with open(SUMMARY_FILE_PATH, 'w', encoding='utf-8') as summary_file:
        summary_file.writelines(summary_output)

    # Print the summary to the console
    for line in summary_output:
        print(line, end='')

    # Set exit code based on errors
    if sum(error_counts.values()) > 0:
        sys.exit(1)  # Fail with an exit code if errors are found
    else:
        sys.exit(0)  # Exit successfully if no errors
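
For reference, here is a minimal local smoke test of the checker. It is only a sketch, not part of the commit: the file name journals/example.csv and the sample row are hypothetical, while the layout it assumes comes from the script itself (two columns per row, full journal name then abbreviation; every *.csv under ./journals/ is scanned; the report is written to ./check_quality_summary.txt and the exit code is non-zero when errors are found).

    # Hypothetical local smoke test: create one example journal entry, run the checker,
    # and print the summary it writes. Only standard-library modules are used.
    import csv
    import os
    import subprocess
    import sys

    os.makedirs("journals", exist_ok=True)
    with open("journals/example.csv", "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(["Journal of Environmental Management", "J. Environ. Manag."])

    # The script exits with code 1 when errors are found, so don't raise on a non-zero status.
    subprocess.run([sys.executable, "./scripts/check_quality.py"], check=False)

    with open("./check_quality_summary.txt", encoding="utf-8") as summary:
        print(summary.read())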
