Merge pull request #593 from carmenbianca/fix-sameline-multiline

Fix sameline multiline
fsfe · Oct 11, 2022 · 3450858
2 parents b99581b + c4b2ba3
commit 3450858
Show file tree

Hide file tree

Showing 5 changed files with 75 additions and 40 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -60,7 +60,8 @@ The versions follow [semantic versioning](https://semver.org).
 - Sanitize xargs input in scripts documentation
 - License identifiers in comments with symmetrical ASCII art frames are now
   properly detected (#560)
-
+- Fixed an error where copyright statements contained within a multi-line
+  comment style on a single line could not be parsed (#593).
 - In PHP files, add header after `<?php` (#543).
 
 ### Security

diff --git a/src/reuse/_comment.py b/src/reuse/_comment.py
@@ -17,7 +17,7 @@
 import logging
 import operator
 from textwrap import dedent
-from typing import List
+from typing import List, NamedTuple
 
 _LOGGER = logging.getLogger(__name__)
 
@@ -30,14 +30,24 @@ class CommentCreateError(Exception):
     """An error occurred during the creation of a comment."""
 
 
+class MultiLineSegments(NamedTuple):
+    """Components that make up a multi-line comment style, e.g. '/*', '*', and
+    '*/'.
+    """
+
+    start: str
+    middle: str
+    end: str
+
+
 class CommentStyle:
     """Base class for comment style."""
 
     SINGLE_LINE = ""
     INDENT_AFTER_SINGLE = ""
     # (start, middle, end)
     # e.g., ("/*", "*", "*/")
-    MULTI_LINE = ("", "", "")
+    MULTI_LINE = MultiLineSegments("", "", "")
     INDENT_BEFORE_MIDDLE = ""
     INDENT_AFTER_MIDDLE = ""
     INDENT_BEFORE_END = ""
@@ -50,7 +60,7 @@ def can_handle_single(cls) -> bool:
     @classmethod
     def can_handle_multi(cls) -> bool:
         """Whether the :class:`CommentStyle` can handle multi-line comments."""
-        return all((cls.MULTI_LINE[0], cls.MULTI_LINE[2]))
+        return all((cls.MULTI_LINE.start, cls.MULTI_LINE.end))
 
     @classmethod
     def create_comment(cls, text: str, force_multi: bool = False) -> str:
@@ -90,19 +100,19 @@ def _create_comment_multi(cls, text: str) -> str:
         if not cls.can_handle_multi():
             raise CommentCreateError(f"{cls} cannot create multi-line comments")
         result = []
-        result.append(cls.MULTI_LINE[0])
+        result.append(cls.MULTI_LINE.start)
         for line in text.split("\n"):
-            if cls.MULTI_LINE[2] in text:
+            if cls.MULTI_LINE.end in text:
                 raise CommentCreateError(
                     f"'{line}' contains a premature comment delimiter"
                 )
             line_result = ""
-            if cls.MULTI_LINE[1]:
-                line_result += cls.INDENT_BEFORE_MIDDLE + cls.MULTI_LINE[1]
+            if cls.MULTI_LINE.middle:
+                line_result += cls.INDENT_BEFORE_MIDDLE + cls.MULTI_LINE.middle
             if line:
                 line_result += cls.INDENT_AFTER_MIDDLE + line
             result.append(line_result)
-        result.append(cls.INDENT_BEFORE_END + cls.MULTI_LINE[2])
+        result.append(cls.INDENT_BEFORE_END + cls.MULTI_LINE.end)
         return "\n".join(result)
 
     @classmethod
@@ -139,9 +149,9 @@ def _parse_comment_single(cls, text: str) -> str:
 
     @classmethod
     def _remove_middle_marker(cls, line: str) -> str:
-        if cls.MULTI_LINE[1]:
+        if cls.MULTI_LINE.middle:
             possible_line = line.lstrip()
-            prefix = cls.MULTI_LINE[1]
+            prefix = cls.MULTI_LINE.middle
             if possible_line.startswith(prefix):
                 line = possible_line.lstrip(prefix)
                 # Note to future self: line.removeprefix would be preferable
@@ -174,11 +184,11 @@ def _parse_comment_multi(cls, text: str) -> str:
             last = None  # Set this later.
             last_is_first = True
 
-        if not first.startswith(cls.MULTI_LINE[0]):
+        if not first.startswith(cls.MULTI_LINE.start):
             raise CommentParseError(
                 f"'{first}' does not start with a comment marker"
             )
-        first = first.lstrip(cls.MULTI_LINE[0])
+        first = first.lstrip(cls.MULTI_LINE.start)
         first = first.lstrip()
 
         for line in lines:
@@ -188,11 +198,11 @@ def _parse_comment_multi(cls, text: str) -> str:
         if last_is_first:
             last = first
             first = ""
-        if not last.endswith(cls.MULTI_LINE[2]):
+        if not last.endswith(cls.MULTI_LINE.end):
             raise CommentParseError(
                 f"'{last}' does not end with a comment delimiter"
             )
-        last = last.rstrip(cls.MULTI_LINE[2])
+        last = last.rstrip(cls.MULTI_LINE.end)
         last = last.rstrip()
         last = cls._remove_middle_marker(last)
 
@@ -223,11 +233,11 @@ def comment_at_first_character(cls, text: str) -> str:
                     break
                 end = i
             return "\n".join(lines[0 : end + 1])
-        if cls.can_handle_multi() and text.startswith(cls.MULTI_LINE[0]):
+        if cls.can_handle_multi() and text.startswith(cls.MULTI_LINE.start):
             end = 0
             for i, line in enumerate(lines):
                 end = i
-                if line.endswith(cls.MULTI_LINE[2]):
+                if line.endswith(cls.MULTI_LINE.end):
                     break
             else:
                 raise CommentParseError("Comment block never delimits")
@@ -245,15 +255,15 @@ class AppleScriptCommentStyle(CommentStyle):
 
     SINGLE_LINE = "--"
     INDENT_AFTER_SINGLE = " "
-    MULTI_LINE = ("(*", "", "*)")
+    MULTI_LINE = MultiLineSegments("(*", "", "*)")
 
 
 class AspxCommentStyle(CommentStyle):
     """ASPX comment style."""
 
     _shorthand = "aspx"
 
-    MULTI_LINE = ("<%--", "", "--%>")
+    MULTI_LINE = MultiLineSegments("<%--", "", "--%>")
 
 
 class BatchFileCommentStyle(CommentStyle):
@@ -270,7 +280,7 @@ class BibTexCommentStyle(CommentStyle):
 
     _shorthand = "bibtex"
 
-    MULTI_LINE = ("@Comment{", "", "}")
+    MULTI_LINE = MultiLineSegments("@Comment{", "", "}")
 
 
 class CCommentStyle(CommentStyle):
@@ -280,7 +290,7 @@ class CCommentStyle(CommentStyle):
 
     SINGLE_LINE = "//"
     INDENT_AFTER_SINGLE = " "
-    MULTI_LINE = ("/*", "*", "*/")
+    MULTI_LINE = MultiLineSegments("/*", "*", "*/")
     INDENT_BEFORE_MIDDLE = " "
     INDENT_AFTER_MIDDLE = " "
     INDENT_BEFORE_END = " "
@@ -291,7 +301,7 @@ class CssCommentStyle(CommentStyle):
 
     _shorthand = "css"
 
-    MULTI_LINE = ("/*", "*", "*/")
+    MULTI_LINE = MultiLineSegments("/*", "*", "*/")
     INDENT_BEFORE_MIDDLE = " "
     INDENT_AFTER_MIDDLE = " "
     INDENT_BEFORE_END = " "
@@ -327,15 +337,15 @@ class FtlCommentStyle(CommentStyle):
 
     _shorthand = "ftl"
 
-    MULTI_LINE = ("<#--", "", "-->")
+    MULTI_LINE = MultiLineSegments("<#--", "", "-->")
 
 
 class HandlebarsCommentStyle(CommentStyle):
     """Handlebars comment style."""
 
     _shorthand = "handlebars"
 
-    MULTI_LINE = ("{{!--", "", "--}}")
+    MULTI_LINE = MultiLineSegments("{{!--", "", "--}}")
 
 
 class HaskellCommentStyle(CommentStyle):
@@ -352,15 +362,15 @@ class HtmlCommentStyle(CommentStyle):
 
     _shorthand = "html"
 
-    MULTI_LINE = ("<!--", "", "-->")
+    MULTI_LINE = MultiLineSegments("<!--", "", "-->")
 
 
 class JinjaCommentStyle(CommentStyle):
     """Jinja2 comment style."""
 
     _shorthand = "jinja"
 
-    MULTI_LINE = ("{#", "", "#}")
+    MULTI_LINE = MultiLineSegments("{#", "", "#}")
 
 
 class LispCommentStyle(CommentStyle):
@@ -386,7 +396,7 @@ class MlCommentStyle(CommentStyle):
 
     _shorthand = "ml"
 
-    MULTI_LINE = ("(*", "*", "*)")
+    MULTI_LINE = MultiLineSegments("(*", "*", "*)")
     INDENT_BEFORE_MIDDLE = " "
     INDENT_AFTER_MIDDLE = " "
     INDENT_BEFORE_END = " "
@@ -399,7 +409,7 @@ class PlantUmlCommentStyle(CommentStyle):
 
     SINGLE_LINE = "'"
     INDENT_AFTER_SINGLE = " "
-    MULTI_LINE = ("/'", "'", "'/")
+    MULTI_LINE = MultiLineSegments("/'", "'", "'/")
     INDENT_BEFORE_MIDDLE = " "
     INDENT_AFTER_MIDDLE = " "
     INDENT_BEFORE_END = " "

diff --git a/src/reuse/_util.py b/src/reuse/_util.py
@@ -46,10 +46,10 @@
     "".join(
         {
             r"(?:{})*".format(  # pylint: disable=consider-using-f-string
-                re.escape(style.MULTI_LINE[2])
+                re.escape(style.MULTI_LINE.end)
             )
             for style in _all_style_classes()
-            if style.MULTI_LINE[2]
+            if style.MULTI_LINE.end
         }
     )
 )
@@ -60,17 +60,17 @@
     re.compile(
         r"(?P<copyright>(?P<prefix>SPDX-FileCopyrightText:)\s+"
         r"((?P<year>\d{4} - \d{4}|\d{4}),?\s+)?"
-        r"(?P<statement>.*)?)" + _END_PATTERN
+        r"(?P<statement>.*?))" + _END_PATTERN
     ),
     re.compile(
         r"(?P<copyright>(?P<prefix>Copyright(\s?\([cC]\))?)\s+"
         r"((?P<year>\d{4} - \d{4}|\d{4}),?\s+)?"
-        r"(?P<statement>.*)?)" + _END_PATTERN
+        r"(?P<statement>.*?))" + _END_PATTERN
     ),
     re.compile(
         r"(?P<copyright>(?P<prefix>©)\s+"
         r"((?P<year>\d{4} - \d{4}|\d{4}),?\s+)?"
-        r"(?P<statement>.*)?)" + _END_PATTERN
+        r"(?P<statement>.*?))" + _END_PATTERN
     ),
 ]
 
@@ -282,7 +282,7 @@ def extract_spdx_info(text: str) -> SpdxInfo:
         for pattern in _COPYRIGHT_PATTERNS:
             match = pattern.search(line)
             if match is not None:
-                copyright_matches.add(match.groupdict()["copyright"])
+                copyright_matches.add(match.groupdict()["copyright"].strip())
                 break
 
     return SpdxInfo(expressions, copyright_matches)

diff --git a/tests/test_comment.py b/tests/test_comment.py
@@ -56,9 +56,9 @@ def test_create_comment_generic_multi(MultiStyle):
     text = "Hello"
     expected = cleandoc(
         f"""
-        {MultiStyle.MULTI_LINE[0]}
-        {MultiStyle.INDENT_BEFORE_MIDDLE}{MultiStyle.MULTI_LINE[1]}{MultiStyle.INDENT_AFTER_MIDDLE}Hello
-        {MultiStyle.INDENT_BEFORE_END}{MultiStyle.MULTI_LINE[2]}
+        {MultiStyle.MULTI_LINE.start}
+        {MultiStyle.INDENT_BEFORE_MIDDLE}{MultiStyle.MULTI_LINE.middle}{MultiStyle.INDENT_AFTER_MIDDLE}Hello
+        {MultiStyle.INDENT_BEFORE_END}{MultiStyle.MULTI_LINE.end}
         """
     )
 
@@ -78,9 +78,23 @@ def test_parse_comment_generic_multi(MultiStyle):
     # pylint: disable=line-too-long
     text = cleandoc(
         f"""
-        {MultiStyle.MULTI_LINE[0]}
-        {MultiStyle.INDENT_BEFORE_MIDDLE}{MultiStyle.MULTI_LINE[1]}{MultiStyle.INDENT_AFTER_MIDDLE}Hello
-        {MultiStyle.INDENT_BEFORE_END}{MultiStyle.MULTI_LINE[2]}
+        {MultiStyle.MULTI_LINE.start}
+        {MultiStyle.INDENT_BEFORE_MIDDLE}{MultiStyle.MULTI_LINE.middle}{MultiStyle.INDENT_AFTER_MIDDLE}Hello
+        {MultiStyle.INDENT_BEFORE_END}{MultiStyle.MULTI_LINE.end}
+        """
+    )
+    expected = "Hello"
+
+    assert MultiStyle.parse_comment(text) == expected
+
+
+def test_parse_comment_sameline_multi(MultiStyle):
+    """If a multi-line comment style is on a single line, it should still be
+    parsed.
+    """
+    text = cleandoc(
+        f"""
+        {MultiStyle.MULTI_LINE.start} Hello {MultiStyle.MULTI_LINE.end}
         """
     )
     expected = "Hello"

diff --git a/tests/test_util.py b/tests/test_util.py
@@ -176,6 +176,16 @@ def test_extract_with_ignore_block():
     assert len(result.spdx_expressions) == 1
 
 
+def test_extract_sameline_multiline():
+    """When a copyright line is in a multi-line style comment on a single line,
+    do not include the comment end pattern as part of the copyright.
+    """
+    text = "<!-- SPDX-FileCopyrightText: Jane Doe -->"
+    result = _util.extract_spdx_info(text)
+    assert len(result.copyright_lines) == 1
+    assert result.copyright_lines == {"SPDX-FileCopyrightText: Jane Doe"}
+
+
 def test_filter_ignore_block_with_comment_style():
     """Test that the ignore block is properly removed if start and end markers
     are in comment style.