Skip to content

Commit

Permalink
Count Bytes and Docs (#186)
Browse files Browse the repository at this point in the history
* added option to count bytes

* version

* added document counter
  • Loading branch information
soldni authored Aug 17, 2024
1 parent 4cf2d17 commit f365470
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 1 deletion.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dolma"
version = "1.0.9"
version = "1.0.10"
description = "Data filters"
license = { text = "Apache-2.0" }
readme = "README.md"
Expand Down
13 changes: 13 additions & 0 deletions python/dolma/taggers/length.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,19 @@
from ..core.utils import split_paragraphs


@TaggerRegistry.add("bytes_length_v1")
class BytesLengthV1(BaseTagger):
    """Tagger that scores a document by the size of its UTF-8 encoded text."""

    def predict(self, doc: Document) -> DocResult:
        """Tag the whole document with its UTF-8 byte count.

        Args:
            doc: Document whose ``text`` attribute is measured.

        Returns:
            A ``DocResult`` holding exactly one ``Span`` of type ``"bytes"``.
            The span runs from 0 to ``len(doc.text)`` and its score is the
            number of bytes produced by encoding ``doc.text`` as UTF-8.
        """
        text = doc.text
        # Span bounds are expressed via len(text), while the score is the
        # encoded byte length; the two differ for non-ASCII text.
        byte_count = len(text.encode("utf-8"))
        whole_doc = Span(start=0, end=len(text), type="bytes", score=byte_count)
        return DocResult(doc=doc, spans=[whole_doc])


@TaggerRegistry.add("doc_count_v1")
class DocCountLengthV1(BaseTagger):
    """Tagger that marks every document with a constant score of 1."""

    def predict(self, doc: Document) -> DocResult:
        """Tag the whole document with a fixed score of 1.

        Args:
            doc: Document to tag; only the length of its ``text`` is read.

        Returns:
            A ``DocResult`` holding exactly one ``Span`` of type ``"docs"``
            that runs from 0 to ``len(doc.text)`` with ``score=1``.
        """
        # The score is constant regardless of content; presumably it is
        # summed downstream to count documents -- confirm against consumers.
        whole_doc = Span(start=0, end=len(doc.text), type="docs", score=1)
        return DocResult(doc=doc, spans=[whole_doc])


@TaggerRegistry.add("char_length_v1")
class CharLengthV1(BaseTagger):
def predict(self, doc: Document) -> DocResult:
Expand Down

0 comments on commit f365470

Please sign in to comment.