jsh9 · jsh9 · Dec 26, 2024 · Dec 25, 2024 · Dec 26, 2024 · Dec 26, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,32 +1,33 @@
 repos:
   - repo: https://github.com/pycqa/isort
-    rev: 5.12.0
+    rev: 5.13.2
     hooks:
       - id: isort
 
   - repo: https://github.com/jsh9/cercis
-    rev: 0.2.3
+    rev: 0.2.5
     hooks:
       - id: cercis
+        exclude: ^tests/data/
 
   - repo: https://github.com/pre-commit/mirrors-prettier
-    rev: v3.0.3
+    rev: v4.0.0-alpha.8
     hooks:
       - id: prettier
 
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v5.0.0
     hooks:
       - id: end-of-file-fixer
       - id: trailing-whitespace
 
   - repo: https://github.com/pre-commit/pre-commit
-    rev: v3.5.0
+    rev: v4.0.1
     hooks:
       - id: validate_manifest
 
   - repo: https://github.com/jsh9/markdown-toc-creator
-    rev: 0.0.4
+    rev: 0.0.10
     hooks:
       - id: markdown-toc-creator
 

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,18 @@
 # Change Log
 
+## [unpublished]
+
+- Changed
+
+  - Added `DOC002` (syntax error) to handle cases where there are syntax errors
+    in the Python file
+  - Replaced invisible and zero-width characters with empty strings so that
+    Python's AST can correctly parse the files
+  - Added end-to-end test (essentially an integration test)
+
+- Full diff
+  - https://github.com/jsh9/pydoclint/compare/0.5.13...0.5.14
+
 ## [0.5.13] - 2024-12-20
 
 - Fixed

diff --git a/docs/violation_codes.md b/docs/violation_codes.md
@@ -22,6 +22,7 @@
 | Code     | Explanation                              |
 | -------- | ---------------------------------------- |
 | `DOC001` | Potential formatting errors in docstring |
+| `DOC002` | Syntax error in the Python file          |
 
 ## 1. `DOC1xx`: Violations about input arguments
 

diff --git a/pydoclint/main.py b/pydoclint/main.py
@@ -16,6 +16,7 @@
 from pydoclint.parse_config import (
     injectDefaultOptionsFromUserSpecifiedTomlFilePath,
 )
+from pydoclint.utils.invisibleChars import replaceInvisibleChars
 from pydoclint.utils.violation import Violation
 from pydoclint.visitor import Visitor
 
@@ -644,7 +645,23 @@ def _checkFile(
         #       not this may be good enough.
         src: str = ''.join(fp.readlines())
 
-    tree: ast.Module = ast.parse(src)
+    tree: ast.Module
+    try:
+        tree = ast.parse(src)
+    except SyntaxError as e:
+        if str(e).startswith('invalid non-printable character'):
+            src_ = replaceInvisibleChars(src)
+            try:
+                # In case there's another syntax error after fixing
+                # this invalid non-printable character error
+                tree = ast.parse(src_)
+            except SyntaxError as e2:
+                return [Violation(code=2, line=0, msgPostfix=str(e2))]
+        else:  # other syntax errors
+            return [Violation(code=2, line=0, msgPostfix=str(e))]
+    except Exception as e3:  # other non-SyntaxError exceptions
+        raise e3
+
     visitor = Visitor(
         style=style,
         argTypeHintsInSignature=argTypeHintsInSignature,

diff --git a/pydoclint/utils/invisibleChars.py b/pydoclint/utils/invisibleChars.py
@@ -0,0 +1,31 @@
+def replaceInvisibleChars(text: str) -> str:
+    """
+    Replace invisible characters so that AST can correctly parse the code.
+
+    A byte order mark at the very start of the text is dropped rather than
+    turned into a space, because a leading space before the first
+    statement is an indentation error ("unexpected indent").
+
+    Parameters
+    ----------
+    text : str
+        The source code that may contain invisible characters
+
+    Returns
+    -------
+    str
+        The text with every other byte order mark replaced by a space
+        and all other listed invisible characters removed
+    """
+    if text.startswith('\uFEFF'):
+        text = text[1:]
+
+    # Zero width space/non-joiner/joiner, word joiner, Mongolian vowel
+    # separator, Arabic letter mark, directional marks/embeddings/pop/
+    # overrides, invisible math operators, combining grapheme joiner
+    invisibleToEmpty = (
+        '\u200B\u200C\u200D\u2060\u180E\u061C\u200E\u200F\u202A\u202B'
+        '\u202C\u202D\u202E\u2061\u2062\u2063\u2064\u034F'
+    )
+    # A byte order mark elsewhere might act as a separator -> one space
+    return text.translate(str.maketrans('\uFEFF', ' ', invisibleToEmpty))
diff --git a/pydoclint/utils/violation.py b/pydoclint/utils/violation.py
@@ -7,6 +7,7 @@
 
 VIOLATION_CODES = types.MappingProxyType({
     1: 'Potential formatting errors in docstring. Error message:',
+    2: 'Syntax errors; cannot parse this Python file. Error message:',
 
     101: 'Docstring contains fewer arguments than in function signature.',
     102: 'Docstring contains more arguments than in function signature.',

diff --git a/tests/data/edge_cases/20_invisible_zero_width_chars/case.py b/tests/data/edge_cases/20_invisible_zero_width_chars/case.py
@@ -0,0 +1,53 @@
+# This script defines a function to demonstrate the inclusion of various invisible and zero-width Unicode characters.
+
+def demonstrate_invisible_chars():
+    # Each variable below includes an invisible or zero-width Unicode character.
+    zero_width_space = 1  # Zero Width Space
+    zero_width_non_joiner = ‌2  # Zero Width Non-Joiner
+    zero_width_joiner = ‍3  # Zero Width Joiner
+    byte_order_mark = 4  # Zero Width No-Break Space (Byte Order Mark)
+    word_joiner = ⁠5  # Word Joiner
+    mongolian_vowel_separator = ᠎6  # Mongolian Vowel Separator
+    arabic_letter_mark = 7 ؜  # Arabic Letter Mark
+    left_to_right_mark = ‎8  # Left-to-Right Mark
+    right_to_left_mark = ‏9  # Right-to-Left Mark
+    left_to_right_embedding = ‪10  # Left-to-Right Embedding
+    right_to_left_embedding = ‫11  # Right-to-Left Embedding
+    pop_directional_formatting = ‬12  # Pop Directional Formatting
+    left_to_right_override = ‭13  # Left-to-Right Override
+    right_to_left_override = ‮14  # Right-to-Left Override
+    function_application = ⁡15  # Function Application
+    invisible_times = ⁢16  # Invisible Times
+    invisible_separator = ⁣17  # Invisible Separator
+    invisible_plus = ⁤18  # Invisible Plus
+    combining_grapheme_joiner = ͏  19 # Combining Grapheme Joiner
+
+    # The following dictionary maps character descriptions to their corresponding Unicode characters.
+    invisible_chars = {
+        "Zero Width Space": zero_width_space,
+        "Zero Width Non-Joiner": zero_width_non_joiner,
+        "Zero Width Joiner": zero_width_joiner,
+        "Byte Order Mark": byte_order_mark,
+        "Word Joiner": word_joiner,
+        "Mongolian Vowel Separator": mongolian_vowel_separator,
+        "Arabic Letter Mark": arabic_letter_mark,
+        "Left-to-Right Mark": left_to_right_mark,
+        "Right-to-Left Mark": right_to_left_mark,
+        "Left-to-Right Embedding": left_to_right_embedding,
+        "Right-to-Left Embedding": right_to_left_embedding,
+        "Pop Directional Formatting": pop_directional_formatting,
+        "Left-to-Right Override": left_to_right_override,
+        "Right-to-Left Override": right_to_left_override,
+        "Function Application": function_application,
+        "Invisible Times": invisible_times,
+        "Invisible Separator": invisible_separator,
+        "Invisible Plus": invisible_plus,
+        "Combining Grapheme Joiner": combining_grapheme_joiner,
+    }
+
+    return invisible_chars
+
+# Demonstrate the usage
+invisible_chars = demonstrate_invisible_chars()
+for name, char in invisible_chars.items():
+    print(f"{name}: {repr(char)}")
diff --git a/tests/data/edge_cases/21_syntax_error/case_21a.py b/tests/data/edge_cases/21_syntax_error/case_21a.py
@@ -0,0 +1,5 @@
+a =  # this is a comment
+b =
+c +
+e = '
+f
diff --git a/tests/data/edge_cases/21_syntax_error/case_21b.py b/tests/data/edge_cases/21_syntax_error/case_21b.py
@@ -0,0 +1,2 @@
+# Python 2 syntax
+print 'haha'
diff --git a/tests/data/edge_cases/21_syntax_error/case_21c.py b/tests/data/edge_cases/21_syntax_error/case_21c.py
@@ -0,0 +1,2 @@
+# coding: utf-8
+print "BOM BOOM!"
diff --git a/tests/data/lib2to3_test_cases/README.txt b/tests/data/lib2to3_test_cases/README.txt
@@ -0,0 +1,2 @@
+The test cases in this folder come from Python's official lib2to3 test cases:
+https://github.com/python/cpython/tree/11b89094129e4f16705805956e00868e8c924336/Lib/test/test_lib2to3/data
diff --git a/tests/data/lib2to3_test_cases/bom.py b/tests/data/lib2to3_test_cases/bom.py
@@ -0,0 +1,2 @@
+# coding: utf-8
+print "BOM BOOM!"
diff --git a/tests/data/lib2to3_test_cases/crlf.py b/tests/data/lib2to3_test_cases/crlf.py
@@ -0,0 +1,3 @@
+print "hi"
+
+print "Like bad Windows newlines?"
diff --git a/tests/data/lib2to3_test_cases/different_encoding.py b/tests/data/lib2to3_test_cases/different_encoding.py
@@ -0,0 +1,6 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+print u'ßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞ'
+
+def f(x):
+    print '%s\t->  α(%2i):%s  β(%s)'
diff --git a/tests/data/lib2to3_test_cases/false_encoding.py b/tests/data/lib2to3_test_cases/false_encoding.py
@@ -0,0 +1,2 @@
+#!/usr/bin/env python
+print '#coding=0'
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		The test cases in this folder come from Python's official lib2to3 test cases:
		https://github.com/python/cpython/tree/11b89094129e4f16705805956e00868e8c924336/Lib/test/test_lib2to3/data