Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix corpus count methods #46

Merged
merged 1 commit into from
Oct 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 0 additions & 8 deletions machine/corpora/alignment_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,6 @@ def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[Align
with tac.get_rows() as rows:
yield from rows

@property
def missing_rows_allowed(self) -> bool:
return any(ac.missing_rows_allowed for ac in self.alignment_collections)

def count(self, include_empty: bool = True) -> int:
return sum(ac.count(include_empty) for ac in self.alignment_collections)

Expand Down Expand Up @@ -64,10 +60,6 @@ def __init__(self, corpus: AlignmentCorpus, transform: Callable[[AlignmentRow],
def alignment_collections(self) -> Iterable[AlignmentCollection]:
return self._corpus.alignment_collections

@property
def missing_rows_allowed(self) -> bool:
return self._corpus.missing_rows_allowed

def count(self, include_empty: bool = True) -> int:
return self._corpus.count(include_empty)

Expand Down
4 changes: 0 additions & 4 deletions machine/corpora/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,6 @@ def _get_rows(self) -> Generator[Row, None, None]:
def __iter__(self) -> ContextManagedGenerator[Row, None, None]:
return self.get_rows()

@property
def missing_rows_allowed(self) -> bool:
return True

def count(self, include_empty: bool = True) -> int:
with self.get_rows() as rows:
return sum(1 for row in rows if include_empty or not row.is_empty)
Expand Down
12 changes: 0 additions & 12 deletions machine/corpora/flatten.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,6 @@ def texts(self) -> Iterable[Text]:
def is_tokenized(self) -> bool:
return all(c.is_tokenized for c in self._corpora)

@property
def missing_rows_allowed(self) -> bool:
return any(c.missing_rows_allowed for c in self._corpora)

def count(self, include_empty: bool = True) -> int:
return sum(c.count(include_empty) for c in self._corpora)

Expand All @@ -78,10 +74,6 @@ def __init__(self, corpora: List[AlignmentCorpus]) -> None:
def alignment_collections(self) -> Iterable[AlignmentCollection]:
return chain.from_iterable(c.alignment_collections for c in self._corpora)

@property
def missing_rows_allowed(self) -> bool:
return any(c.missing_rows_allowed for c in self._corpora)

def count(self, include_empty: bool = True) -> int:
return sum(c.count(include_empty) for c in self._corpora)

Expand All @@ -103,10 +95,6 @@ def is_source_tokenized(self) -> bool:
def is_target_tokenized(self) -> bool:
return all(c.is_target_tokenized for c in self._corpora)

@property
def missing_rows_allowed(self) -> bool:
return any(c.missing_rows_allowed for c in self._corpora)

def count(self, include_empty: bool = True) -> int:
return sum(c.count(include_empty) for c in self._corpora)

Expand Down
12 changes: 0 additions & 12 deletions machine/corpora/parallel_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,10 +495,6 @@ def is_source_tokenized(self) -> bool:
def is_target_tokenized(self) -> bool:
return self._is_target_tokenized

@property
def missing_rows_allowed(self) -> bool:
return self._corpus.missing_rows_allowed

def count(self, include_empty: bool = True) -> int:
return self._corpus.count(include_empty)

Expand Down Expand Up @@ -572,10 +568,6 @@ def is_source_tokenized(self) -> bool:
def is_target_tokenized(self) -> bool:
return False

@property
def missing_rows_allowed(self) -> bool:
return False

def count(self, include_empty: bool = True) -> int:
if include_empty:
return len(self._df)
Expand Down Expand Up @@ -637,10 +629,6 @@ def is_source_tokenized(self) -> bool:
def is_target_tokenized(self) -> bool:
return False

@property
def missing_rows_allowed(self) -> bool:
return False

def count(self, include_empty: bool = True) -> int:
try:
from datasets.arrow_dataset import Dataset
Expand Down
15 changes: 0 additions & 15 deletions machine/corpora/standard_parallel_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,21 +61,6 @@ def all_source_rows(self) -> bool:
def all_target_rows(self) -> bool:
return self._all_target_rows

@property
def missing_rows_allowed(self) -> bool:
if self._source_corpus.missing_rows_allowed or self._target_corpus.missing_rows_allowed:
return True
source_text_ids = {t.id for t in self._source_corpus.texts}
target_text_ids = {t.id for t in self._target_corpus.texts}
return source_text_ids != target_text_ids

def count(self, include_empty: bool = True) -> int:
if self.missing_rows_allowed:
return super().count(include_empty)
if include_empty:
return self._source_corpus.count(include_empty)
return min(self._source_corpus.count(include_empty), self._target_corpus.count(include_empty))

def _get_rows(self) -> Generator[ParallelTextRow, None, None]:
source_text_ids = {t.id for t in self._source_corpus.texts}
target_text_ids = {t.id for t in self._target_corpus.texts}
Expand Down
8 changes: 0 additions & 8 deletions machine/corpora/text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,10 +37,6 @@ def _get_rows(self, text_ids: Optional[Iterable[str]] = None) -> Generator[TextR
with text.get_rows() as rows:
yield from rows

@property
def missing_rows_allowed(self) -> bool:
return any(t.missing_rows_allowed for t in self.texts)

def count(self, include_empty: bool = True) -> int:
return sum(t.count(include_empty) for t in self.texts)

Expand Down Expand Up @@ -163,10 +159,6 @@ def texts(self) -> Iterable[Text]:
def is_tokenized(self) -> bool:
return self._is_tokenized

@property
def missing_rows_allowed(self) -> bool:
return self._corpus.missing_rows_allowed

def count(self, include_empty: bool = True) -> int:
return self._corpus.count(include_empty)

Expand Down
20 changes: 14 additions & 6 deletions machine/corpora/text_file_alignment_collection.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,18 @@ def _get_rows(self) -> Generator[AlignmentRow, None, None]:
yield AlignmentRow(self.id, row_ref, AlignedWordPair.from_string(line))
line_num += 1

@property
def missing_rows_allowed(self) -> bool:
return False

def count(self, include_empty: bool = True) -> int:
    """Count the rows stored in the file at ``self._filename``.

    Args:
        include_empty: When True, every physical line counts as a row.
            When False, a line counts only if its content contains at
            least one non-whitespace character. The content is the text
            after the first tab, or the whole line when there is no tab.

    Returns:
        The number of lines that satisfy the criterion above.
    """
    if include_empty:
        # Only line boundaries matter here, so text decoding is skipped.
        with open(self._filename, mode="rb") as file:
            return sum(1 for _ in file)

    # "utf-8-sig" drops a leading BOM so it is not mistaken for content.
    with open(self._filename, mode="r", encoding="utf-8-sig") as file:
        count = 0
        for line in file:
            head, tab, tail = line.rstrip("\r\n").partition("\t")
            # Text before the first tab is presumably a row reference
            # (verify against _get_rows); it must not make an otherwise
            # empty row count as non-empty.
            content = tail if tab else head
            if content.strip():
                count += 1
        return count
21 changes: 15 additions & 6 deletions machine/corpora/text_file_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,10 +48,19 @@ def _get_rows(self) -> Generator[TextRow, None, None]:
yield self._create_row(line, row_ref, flags)
line_num += 1

@property
def missing_rows_allowed(self) -> bool:
return False

def count(self, include_empty: bool = True) -> int:
    """Count the rows stored in the file at ``self._filename``.

    Args:
        include_empty: When True, every physical line counts as a row.
            When False, a line counts only if its text contains at least
            one non-whitespace character. For a tab-separated line the
            text is the second column; otherwise it is the whole line.

    Returns:
        The number of lines that satisfy the criterion above.
    """
    if include_empty:
        # Only line boundaries matter here, so text decoding is skipped.
        with open(self._filename, mode="rb") as file:
            return sum(1 for _ in file)

    # "utf-8-sig" drops a leading BOM so it is not mistaken for content.
    with open(self._filename, mode="r", encoding="utf-8-sig") as file:
        count = 0
        for line in file:
            columns = line.rstrip("\r\n").split("\t")
            # With several columns the first one is presumably the row
            # reference (verify against _get_rows); only the second column
            # decides whether the row is empty.
            text = columns[1] if len(columns) > 1 else columns[0]
            if text.strip():
                count += 1
        return count
71 changes: 70 additions & 1 deletion tests/corpora/test_parallel_text_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from machine.scripture import ENGLISH_VERSIFICATION, ORIGINAL_VERSIFICATION, VerseRef, Versification


def test_get_rows_no_segments() -> None:
def test_get_rows_no_rows() -> None:
source_corpus = DictionaryTextCorpus()
target_corpus = DictionaryTextCorpus()
parallel_corpus = StandardParallelTextCorpus(source_corpus, target_corpus)
Expand Down Expand Up @@ -1162,6 +1162,75 @@ def test_from_hf_dataset() -> None:
assert set_equals(rows[2].aligned_word_pairs, [AlignedWordPair(2, 2)])


def test_count_no_rows() -> None:
    """A parallel corpus built from two empty corpora counts zero rows."""
    empty_parallel = StandardParallelTextCorpus(DictionaryTextCorpus(), DictionaryTextCorpus())

    # The result must be zero whether or not empty rows are included.
    for include_empty in (True, False):
        assert empty_parallel.count(include_empty=include_empty) == 0


def test_count_missing_row() -> None:
    """Check counts when the source side has no row 2 but the target does."""
    source_rows = [
        text_row("text1", 1, "source segment 1 ."),
        text_row("text1", 3, "source segment 3 ."),
    ]
    target_rows = [
        text_row("text1", 1, "target segment 1 ."),
        text_row("text1", 2, "target segment 2 ."),
        text_row("text1", 3, "target segment 3 ."),
    ]
    source_corpus = DictionaryTextCorpus(MemoryText("text1", source_rows))
    target_corpus = DictionaryTextCorpus(MemoryText("text1", target_rows))

    # Default construction: only rows 1 and 3 are expected to be counted.
    default_parallel = StandardParallelTextCorpus(source_corpus, target_corpus)
    assert default_parallel.count(include_empty=True) == 2
    assert default_parallel.count(include_empty=False) == 2

    # With all_target_rows=True a third row is counted only when empty rows are included.
    all_target_parallel = StandardParallelTextCorpus(source_corpus, target_corpus, all_target_rows=True)
    assert all_target_parallel.count(include_empty=True) == 3
    assert all_target_parallel.count(include_empty=False) == 2


def test_count_empty_row() -> None:
    """Check counts when target row 2 exists but carries no text."""
    source_rows = [
        text_row("text1", 1, "source segment 1 ."),
        text_row("text1", 2, "source segment 2 ."),
        text_row("text1", 3, "source segment 3 ."),
    ]
    target_rows = [
        text_row("text1", 1, "target segment 1 ."),
        text_row("text1", 2),  # no text supplied, so the row has no segment
        text_row("text1", 3, "target segment 3 ."),
    ]
    aligned = StandardParallelTextCorpus(
        DictionaryTextCorpus(MemoryText("text1", source_rows)),
        DictionaryTextCorpus(MemoryText("text1", target_rows)),
    )

    assert aligned.count(include_empty=True) == 3
    assert aligned.count(include_empty=False) == 2


def text_row(text_id: str, ref: Any, text: str = "", flags: TextRowFlags = TextRowFlags.SENTENCE_START) -> TextRow:
    """Build a TextRow for tests, splitting ``text`` on whitespace into its segment.

    An empty ``text`` produces a row with an empty segment list.
    """
    segment = text.split() if text else []
    return TextRow(text_id, ref, segment, flags)

Expand Down
32 changes: 31 additions & 1 deletion tests/corpora/test_text_file_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def test_get_rows_nonempty_text_no_refs() -> None:
assert text is not None
rows = list(text.get_rows())

assert len(rows) == 3
assert len(rows) == 4

assert rows[0].ref == MultiKeyRef("Test3", [1])
assert rows[0].text == "Line one."
Expand All @@ -60,3 +60,33 @@ def test_get_rows_empty_text() -> None:
rows = list(text.get_rows())

assert len(rows) == 0


def test_count_nonempty_text_refs() -> None:
    """Test1 is expected to hold five rows, four of them non-empty."""
    test1 = TextFileTextCorpus(TEXT_TEST_PROJECT_PATH).get_text("Test1")
    assert test1 is not None

    total_rows = test1.count(include_empty=True)
    nonempty_rows = test1.count(include_empty=False)
    assert total_rows == 5
    assert nonempty_rows == 4


def test_count_nonempty_text_no_refs() -> None:
    """Test3 is expected to hold four rows, three of them non-empty."""
    test3 = TextFileTextCorpus(TEXT_TEST_PROJECT_PATH).get_text("Test3")
    assert test3 is not None

    total_rows = test3.count(include_empty=True)
    nonempty_rows = test3.count(include_empty=False)
    assert total_rows == 4
    assert nonempty_rows == 3


def test_count_empty_text() -> None:
    """Test2 is expected to report zero rows under either counting mode."""
    test2 = TextFileTextCorpus(TEXT_TEST_PROJECT_PATH).get_text("Test2")
    assert test2 is not None

    for include_empty in (True, False):
        assert test2.count(include_empty=include_empty) == 0
1 change: 1 addition & 0 deletions tests/testutils/data/txt/Test3.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
Line one.
Line two.
Line three.