Skip to content

Commit

Permalink
add normalization to string comparisons (issue #40)
Browse files Browse the repository at this point in the history
  • Loading branch information
tfeldmann committed Jan 20, 2024
1 parent d93bec9 commit 9c305e3
Show file tree
Hide file tree
Showing 6 changed files with 43 additions and 45 deletions.
2 changes: 2 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,8 @@ Special tags:
## Environment variables

- `ORGANIZE_CONFIG` - The path to the default config file.
- `ORGANIZE_EXIFTOOL_PATH` - Path to the `exiftool` executable (Default: `""`)
- `ORGANIZE_NORMALIZE_UNICODE` - Whether to normalize strings to NFC Unicode form for comparisons (Default: `"1"`)
- `NO_COLOR` - if this is set, the output is not colored.
- `EDITOR` - The editor used to edit the config file.

Expand Down
5 changes: 3 additions & 2 deletions organize/filters/name.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from organize.filter import FilterConfig
from organize.output import Output
from organize.resource import Resource
from organize.utils import normalize_unicode


@dataclass(config=ConfigDict(coerce_numbers_to_str=True, extra="forbid"))
Expand Down Expand Up @@ -72,8 +73,8 @@ def pipeline(self, res: Resource, output: Output) -> bool:
name, ext = res.path.stem, res.path.suffix
if not name:
name = ext
result = self.matches(name)
m = self._matcher.match(name)
result = self.matches(normalize_unicode(name))
m = self._matcher.match(normalize_unicode(name))
if not m:
m = name

Expand Down
3 changes: 2 additions & 1 deletion organize/filters/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from organize.filter import FilterConfig
from organize.output import Output
from organize.resource import Resource
from organize.utils import normalize_unicode


@dataclass(config=ConfigDict(coerce_numbers_to_str=True, extra="forbid"))
Expand Down Expand Up @@ -42,7 +43,7 @@ def matches(self, path: str):

def pipeline(self, res: Resource, output: Output) -> bool:
assert res.path is not None, "Does not support standalone mode"
match = self.matches(res.path.name)
match = self.matches(normalize_unicode(res.path.name))
if match:
res.deep_merge(key=self.filter_config.name, data=match.groupdict())
return True
Expand Down
4 changes: 2 additions & 2 deletions organize/rule.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from .registry import action_by_name, filter_by_name
from .resource import Resource
from .template import render
from .utils import ReportSummary, normalize_unicode
from .utils import ReportSummary
from .validators import FlatList, flatten
from .walker import Walker

Expand Down Expand Up @@ -237,7 +237,7 @@ def walk(self, rule_nr: int = 0):
expanded_path = render(loc_path)
for path in _walk_funcs[self.targets](expanded_path):
yield Resource(
path=Path(normalize_unicode(path)),
path=Path(path),
basedir=Path(expanded_path),
rule=self,
rule_nr=rule_nr,
Expand Down
21 changes: 5 additions & 16 deletions organize/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,26 +4,15 @@
from copy import deepcopy
from dataclasses import dataclass
from pathlib import Path
from typing import Any, TypeVar, Union
from typing import Any, Union

ENV_ORGANIZE_NORMALIZE_UNICODE = os.environ.get("ORGANIZE_NORMALIZE_UNICODE", "1")

def _parse_bool(val: str) -> bool:
return val.lower() in ("1", "true")


ENV_ORGANIZE_NORMALIZE_UNICODE = _parse_bool(
os.environ.get("ORGANIZE_NORMALIZE_UNICODE", "0")
)

T = TypeVar("T", str, Path)


def normalize_unicode(text: T, form: str = "NFC") -> T:
if not ENV_ORGANIZE_NORMALIZE_UNICODE:
return text
if isinstance(text, str):
def normalize_unicode(text: str, form: str = "NFC") -> str:
if ENV_ORGANIZE_NORMALIZE_UNICODE == "1":
return unicodedata.normalize(form, text)
return Path(unicodedata.normalize(form, str(text)))
return text


@dataclass
Expand Down
53 changes: 29 additions & 24 deletions tests/core/test_unicode.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from pathlib import Path

import pytest
from conftest import make_files, read_files

from organize import Config
Expand Down Expand Up @@ -59,45 +60,49 @@ def test_folder_umlauts(fs):
assert read_files("Erträge") == {}


def test_normalize():
formA = b"Ertr\xc3\xa4gnisaufstellung.txt".decode("utf-8") # copied from config
formB = b"Ertra\xcc\x88gnisaufstellung.txt".decode("utf-8") # copied from filename
assert normalize_unicode(formA) == normalize_unicode(formB)
CONFUSABLES = (
(
b"Ertr\xc3\xa4gnisaufstellung".decode("utf-8"),
b"Ertra\xcc\x88gnisaufstellung".decode("utf-8"),
),
(
b"Ertra\xcc\x88gnisaufstellung".decode("utf-8"),
b"Ertr\xc3\xa4gnisaufstellung".decode("utf-8"),
),
)


def test_normalization_regex(fs):
make_files(
{b"Ertra\xcc\x88gnisaufstellung.txt".decode("utf-8"): ""},
"test",
)
config = (
b"""
@pytest.mark.parametrize("a, b", CONFUSABLES)
def test_normalize(a, b):
assert a != b
assert normalize_unicode(a) == normalize_unicode(b)


@pytest.mark.parametrize("a, b", CONFUSABLES)
def test_normalization_regex(fs, a, b):
make_files({f"{a}.txt": ""}, "test")
config = f"""
rules:
- locations: /test
filters:
- regex: 'Ertr\xc3\xa4gnisaufstellung.txt$'
- regex: {b}
actions:
- rename: "found-regex.txt"
"""
).decode("utf-8")
Config.from_string(config).execute(simulate=False)
assert read_files("test") == {"found-regex.txt"}
assert read_files("test") == {"found-regex.txt": ""}


def test_normalization_filename(fs):
make_files(
{b"Ertr\xcc\x88gnisaufstellung.txt".decode("utf-8"): ""},
"test",
)
config = (
b"""
@pytest.mark.parametrize("a, b", CONFUSABLES)
def test_normalization_filename(fs, a, b):
make_files({f"{a}.txt": ""}, "test")
config = f"""
rules:
- locations: /test
filters:
- name: "Ertr\xc3\xa4gnisaufstellung"
- name: {a}
actions:
- rename: "found-regex.txt"
"""
).decode("utf-8")
Config.from_string(config).execute(simulate=False)
assert read_files("test") == {"found-regex.txt"}
assert read_files("test") == {"found-regex.txt": ""}

0 comments on commit 9c305e3

Please sign in to comment.