Skip to content

Commit

Permalink
IO - Change origin attribute when not find on system
Browse files Browse the repository at this point in the history
  • Loading branch information
PrimozGodec committed Aug 28, 2023
1 parent e1bdd82 commit 1ed998e
Show file tree
Hide file tree
Showing 2 changed files with 103 additions and 2 deletions.
44 changes: 43 additions & 1 deletion Orange/data/io_util.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os.path
import subprocess
from collections import defaultdict

Expand All @@ -6,7 +7,7 @@

from Orange.data import (
is_discrete_values, MISSING_VALUES, Variable,
DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable,
DiscreteVariable, StringVariable, ContinuousVariable, TimeVariable, Table,
)
from Orange.misc.collections import natural_sorted

Expand Down Expand Up @@ -207,3 +208,44 @@ def mapvalues(arr):
values = [_var.parse(i) for i in orig_values]

return values, var


def _extract_new_origin(attr, table, lookup_dirs):
    """
    Find a usable replacement for a variable's "origin" directory.

    Strategies are tried in this order:

    1. The stored origin still exists on this system - keep it.
    2. The last path component of the stored origin exists inside one of
       the lookup dirs - use that directory.
    3. Every path stored in the variable's column exists relative to one
       of the lookup dirs - use that lookup dir.

    Parameters
    ----------
    attr
        Variable whose ``attributes["origin"]`` is examined.
    table
        Table providing the column values via ``table.get_column(attr)``.
    lookup_dirs
        Iterable of candidate directories, in order of priority.

    Returns
    -------
    str or None
        The origin to use, or ``None`` when no strategy finds one.
    """
    origin = attr.attributes["origin"]

    # 1. origin exists, nothing to repair
    if os.path.exists(origin):
        return origin

    # 2. last dir of origin found in one of the lookup dirs
    dir_ = os.path.basename(os.path.normpath(origin))
    for ld in lookup_dirs:
        new_dir = os.path.join(ld, dir_)
        if os.path.exists(new_dir):
            return new_dir

    # 3. all column paths found relative to one of the lookup dirs.
    # Empty and non-string cells (e.g. missing values) carry no path and
    # are ignored; with no paths at all there is no evidence for any dir.
    rel_paths = [v for v in table.get_column(attr) if isinstance(v, str) and v]
    if rel_paths:
        for ld in lookup_dirs:
            # The existence check is essential: a joined path string alone
            # is always truthy and would match every lookup dir.
            if all(os.path.exists(os.path.join(ld, p)) for p in rel_paths):
                return ld

    return None

Check warning on line 228 in Orange/data/io_util.py

View check run for this annotation

Codecov / codecov/patch

Orange/data/io_util.py#L226-L228

Added lines #L226 - L228 were not covered by tests


def update_origin(table: Table, file_path: str) -> Table:
    """
    Repair "origin" attributes that point to a non-existing location.

    When a dataset is moved to another system, the directory stored in a
    variable's ``attributes["origin"]`` may no longer exist. For every
    variable carrying an "origin" attribute a replacement is searched for
    and, if found, stored back into the attribute (in place).

    Lookup dirs, in order of priority:
        1. Dir containing the data file
        2. Its parent dir (data file in a subdir of the parent dir)
    Possible situations:
        1. Last dir of origin exists in one of the lookup dirs
        2. All paths from the column exist in one of the lookup dirs

    Parameters
    ----------
    table
        Table whose variables are inspected; modified in place.
    file_path
        Path of the file the table was loaded from.

    Returns
    -------
    Table
        The same ``table`` object.
    """
    file_dir = os.path.dirname(file_path)
    parent_dir = os.path.dirname(file_dir)
    # If file_dir is already the root, file_dir == parent_dir. dict keys
    # drop that duplicate while keeping insertion order, so the file's own
    # dir is always tried before its parent (a set has arbitrary order).
    lookup_dirs = tuple(dict.fromkeys((file_dir, parent_dir)))

    domain = table.domain
    # NOTE: iterating a Domain directly yields only attributes and class
    # variables in Orange; file-path columns usually live in metas, so both
    # groups are walked explicitly.
    for attr in (*domain.variables, *domain.metas):
        if "origin" in attr.attributes:
            new_orig = _extract_new_origin(attr, table, lookup_dirs)
            if new_orig:
                attr.attributes["origin"] = new_orig
    return table

61 changes: 60 additions & 1 deletion Orange/data/tests/test_io_util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
import os.path
import unittest
from tempfile import TemporaryDirectory

from Orange.data import ContinuousVariable, guess_data_type
import numpy as np

from Orange.data import ContinuousVariable, guess_data_type, Table, Domain, \
StringVariable
from Orange.data.io_util import update_origin


class TestIoUtil(unittest.TestCase):
Expand All @@ -10,5 +16,58 @@ def test_guess_continuous_w_nans(self):
ContinuousVariable)


class TestUpdateOrigin(unittest.TestCase):
    """Tests for ``update_origin`` using a temporary directory of files."""

    FILE_NAMES = [["file1.txt"], ["file2.txt"], ["file3.text"]]

    def setUp(self) -> None:
        # Temporary directory with one empty file per row of FILE_NAMES.
        self.alt_dir = TemporaryDirectory()
        for (file_name,) in self.FILE_NAMES:
            with open(os.path.join(self.alt_dir.name, file_name), 'w'):
                pass

    def tearDown(self) -> None:
        self.alt_dir.cleanup()

    def _files_table(self, origin):
        """Return a (variable, table) pair with a "Files" meta column
        holding FILE_NAMES and the given origin attribute."""
        files_var = StringVariable("Files")
        files_var.attributes["origin"] = origin
        domain = Domain([], metas=[files_var])
        return files_var, Table.from_list(domain, self.FILE_NAMES)

    def test_origin_not_changed(self):
        """
        Origin exists; it is kept unchanged, even though the dataset path
        also includes the files from the column.
        """
        with TemporaryDirectory() as existing_dir:
            var, table = self._files_table(existing_dir)
            update_origin(table, self.alt_dir.name)
            self.assertEqual(
                table.domain[var].attributes["origin"], existing_dir
            )

    def test_origin_subdir(self):
        """Origin is wrong but its last dir exists in the dataset file's path"""
        stale_origin = f"/a/b/{os.path.basename(self.alt_dir.name)}"
        var, table = self._files_table(stale_origin)
        update_origin(table, self.alt_dir.name)
        self.assertEqual(
            table.domain[var].attributes["origin"], self.alt_dir.name
        )

    def test_origin_parents_subdir(self):
        """
        Origin is wrong but its last dir exists in the path of the dataset
        file's parent
        """
        # path of the dir where the dataset would be placed
        dataset_dir = os.path.join(self.alt_dir.name, "subdir")

        stale_origin = f"/a/b/{os.path.basename(self.alt_dir.name)}"
        var, table = self._files_table(stale_origin)
        update_origin(table, dataset_dir)
        self.assertEqual(
            table.domain[var].attributes["origin"], self.alt_dir.name
        )

    def test_column_paths_subdir(self):
        # Placeholder: not implemented yet.
        pass

    def test_column_paths_parents_subdir(self):
        # Placeholder: not implemented yet.
        pass


# Run the test suite when this module is executed as a script.
if __name__ == '__main__':
    unittest.main()

0 comments on commit 1ed998e

Please sign in to comment.