parse_csv also takes file-likes

johnmartins · Jun 24, 2024 · 5f84668
2 parents 058ad35 + 1b6f9c7
commit 5f84668
Show file tree

Hide file tree

Showing 2 changed files with 58 additions and 24 deletions.
diff --git a/cpm/parse.py b/cpm/parse.py
@@ -1,3 +1,4 @@
+from typing import TextIO, Union
 from cpm.exceptions import *
 from cpm.models import DSM
 from os import listdir
@@ -31,54 +32,81 @@ def parse_csv_dir(dir_path: str, pattern: str =  None,  delimiter: str = 'auto',
     return dsm_array
 
 
-def parse_csv(filepath: str, delimiter: str = 'auto', encoding: str = 'utf-8', instigator: str = 'column'):
+def parse_csv(file: Union[str, TextIO], delimiter: str = 'auto', encoding: str = 'utf-8', instigator: str = 'column'):
     """
     Parse CSV to DSM
-    :param filepath: Targeted CSV file
+    :param file: Targeted CSV file or file-like object
     :param delimiter: CSV delimiter. Defaults to auto-detection.
     :param encoding: text-encoding. Defaults to utf-8
     :param instigator: Determines directionality of DSM. Defaults to columns instigating rows.
     :return: DSM
     """
-
+
+    content = _read_file(file, encoding)
+
     if delimiter == 'auto':
-        with open(filepath, 'r', encoding=encoding) as file:
-            delimiter = detect_delimiter(file.read())
+        delimiter = detect_delimiter(content)
 
     # Identify number of rows, and separate header row
     num_cols = 0
     column_names = []
-    with open(filepath, 'r') as file:
-        for line in file:
-            column_names.append(line.split(delimiter)[0])
-            num_cols += 1
+    lines = _get_file_lines(file, encoding)
+    for line in lines:
+        column_names.append(line.split(delimiter)[0])
+        num_cols += 1
 
     # We do not want the first column in the header
     column_names.pop(0)
 
     data = []
 
-    with open(filepath, 'r') as file:
-        for i, line in enumerate(file):
-            if i == 0:
+    for i, line in enumerate(lines):
+        if i == 0:
+            continue
+        data.append([])
+        for j, col in enumerate(line.split(delimiter)):
+            if j == 0:
                 continue
-            data.append([])
-            for j, col in enumerate(line.split(delimiter)):
-                if j == 0:
-                    continue
-                if col == "":
+            if col == "":
+                data[i-1].append(None)
+            else:
+                try:
+                    data[i-1].append(float(col))
+                except ValueError:
                     data[i-1].append(None)
-                else:
-                    try:
-                        data[i-1].append(float(col))
-                    except ValueError:
-                        data[i - 1].append(None)
 
     dsm = DSM(matrix=data, columns=column_names, instigator=instigator)
 
     return dsm
 
 
+def _read_file(file, encoding):
+    if isinstance(file, str):
+        with open(file, 'r', encoding=encoding) as f:
+            return f.read()
+    elif hasattr(file, 'read'):
+        position = file.tell()
+        content = file.read()
+        file.seek(position)
+        return content
+    else:
+        raise ValueError("Invalid file input. Must be a filepath or a file-like object.")
+
+
+def _get_file_lines(file, encoding):
+    if isinstance(file, str):
+        with open(file, 'r', encoding=encoding) as f:
+            return f.readlines()
+    elif hasattr(file, 'read'):
+        position = file.tell()
+        file.seek(0)
+        lines = file.readlines()
+        file.seek(position)
+        return lines
+    else:
+        raise ValueError("Invalid file input. Must be a filepath or a file-like object.")
+
+
 def detect_delimiter(text, look_ahead=1000):
     """
     Attempts to determine CSV delmiter based on a certain amount of sample characters
@@ -114,4 +142,3 @@ def detect_delimiter(text, look_ahead=1000):
         raise AutoDelimiterError('None of the default delimiters matched the file. Is the file empty?')
 
     return best_delimiter
-
diff --git a/tests/test_parser.py b/tests/test_parser.py
@@ -1,4 +1,3 @@
-import pytest
 from cpm.parse import parse_csv
 
 
@@ -58,3 +57,11 @@ def test_parse_dsm_network_instigator_row():
     assert len(a_neighbours) == 1
     assert a_neighbours[0] == 3
 
+
+def test_parse_file_object():
+    path = './tests/test-assets/dsm-network-test.csv'
+    with open(path) as file:
+        dsm = parse_csv(file)
+
+        for col in ['A', 'B', 'C', 'D']:
+            assert col in dsm.columns