Skip to content

Commit

Permalink
V2 of Gopher tagger (#181)
Browse files Browse the repository at this point in the history
* added new version of gopher tagger

* incrementing version
  • Loading branch information
soldni authored Aug 1, 2024
1 parent a01a222 commit a72c76b
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 3 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[project]
name = "dolma"
version = "1.0.5"
version = "1.0.6"
description = "Data filters"
license = { text = "Apache-2.0" }
readme = "README.md"
Expand Down
17 changes: 15 additions & 2 deletions python/dolma/taggers/gopher.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import logging
import re
from collections import Counter
from dataclasses import dataclass
from statistics import median
Expand Down Expand Up @@ -135,7 +136,7 @@ def as_spans(self) -> List[Span]:
return spans


def get_attributes(text: str) -> GopherAttributes:
def get_attributes(text: str, ignore_empty_lines: bool = False) -> GopherAttributes:
attrs = GopherAttributes([], [])
attrs.character_count = len(text)
if attrs.character_count == 0:
Expand Down Expand Up @@ -173,7 +174,11 @@ def get_attributes(text: str) -> GopherAttributes:
) / max(ng_char_count, 1)
attrs.fraction_of_characters_in_duplicate_ngrams.append((n, value))

lines = text.split("\n")
if ignore_empty_lines:
lines = re.split(r"\n+", text)
else:
lines = text.split("\n")

line_count = len(lines)
for line in lines:
if any(line.startswith(s) for s in BULLET_POINTS):
Expand Down Expand Up @@ -218,3 +223,11 @@ def predict(self, doc: Document) -> DocResult:
attrs = get_attributes(doc.text)
result = DocResult(doc=doc, spans=attrs.as_spans())
return result


@TaggerRegistry.add("gopher_v2")
class GopherTaggerV2(GopherTagger):
    """Variant of the Gopher tagger registered under the name ``gopher_v2``.

    The only difference from the parent tagger is that attributes are
    computed with ``ignore_empty_lines=True``, so runs of consecutive
    newlines are treated as a single line break when the text is split
    into lines.
    """

    def predict(self, doc: Document) -> DocResult:
        """Compute Gopher attributes for ``doc.text`` and return them as spans."""
        spans = get_attributes(doc.text, ignore_empty_lines=True).as_spans()
        return DocResult(doc=doc, spans=spans)

0 comments on commit a72c76b

Please sign in to comment.