
Commit 961dbd2

raise error if token count exceeds 1024 instead of attempting to re-chunk
Signed-off-by: Khaled Sulayman <[email protected]>
Parent: 32e1661


notebooks/instructlab-knowledge/utils/create_seed_dataset.py

Lines changed: 7 additions & 15 deletions
@@ -147,24 +147,16 @@ def add_icls(qna_yaml: Dict[str, str], chunked_document: Dataset) -> Dataset:
             )
         )
     chunked_document_all_icl = safe_concatenate_datasets(chunked_document_all_icl)
-    chunked_document_all_icl = chunked_document_all_icl.map(
-        lambda x: {
-            "chunks": chunk_document(
-                [x["document"]], server_ctx_size=4096, chunk_word_count=1024
-            )
-            if get_token_count(x["document"], tokenizer) > 1024
-            else [x["document"]]
-        }
-    )
+    for c in chunked_document_all_icl:
+        if get_token_count(c["document"], tokenizer) > 1024:
+            raise ValueError("Chunk exceeds token count of 1024")
+
+
     df = chunked_document_all_icl.to_pandas()
-    df_exploded = df.explode("chunks").reset_index(drop=True)
-    new_ds = Dataset.from_pandas(df_exploded)
-    new_ds = new_ds.remove_columns("document").rename_columns(
-        {"chunks": "document"}
-    )
+    new_ds = Dataset.from_pandas(df)

     # Only keep document greater than 100 tokens
     new_ds = new_ds.filter(
-        lambda x: get_token_count(x["document"], tokenizer) > 100
+        lambda c: get_token_count(c["document"], tokenizer) > 100
     )
     return new_ds
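
For context, here is a minimal, self-contained sketch of the check this commit introduces: any chunk in the concatenated ICL dataset whose token count exceeds 1024 now raises a ValueError up front instead of being re-chunked with chunk_document(). The get_token_count() stand-in and the "gpt2" tokenizer below are assumptions for illustration only; the real helper and tokenizer live elsewhere in the notebook utilities and are not shown in this diff.

    # Sketch of the post-commit behavior, under the assumptions stated above.
    from datasets import Dataset
    from transformers import AutoTokenizer


    def get_token_count(text: str, tokenizer) -> int:
        # Assumed behavior: number of tokens the tokenizer produces for this text.
        return len(tokenizer.tokenize(text))


    tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder tokenizer

    # Stand-in for the dataset built by safe_concatenate_datasets() in add_icls().
    chunked_document_all_icl = Dataset.from_dict(
        {"document": ["a short example chunk", "another short example chunk"]}
    )

    # New behavior: fail fast on oversized chunks rather than splitting them again.
    for c in chunked_document_all_icl:
        if get_token_count(c["document"], tokenizer) > 1024:
            raise ValueError("Chunk exceeds token count of 1024")

Because oversized chunks now surface as an error, the intermediate "chunks" column and the explode/rename round-trip through pandas are no longer needed, which is why the diff also replaces df_exploded with a direct Dataset.from_pandas(df) call.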
