Commit fe5cdcb

raise error if token count exceeds 1024 instead of attempting to re-chunk
Signed-off-by: Khaled Sulayman <[email protected]>
1 parent d96f286 commit fe5cdcb

File tree

1 file changed: +9 −10 lines


notebooks/instructlab-knowledge/utils/create_seed_dataset.py

Lines changed: 9 additions & 10 deletions
@@ -158,16 +158,15 @@ def add_icls(qna_yaml: Dict[str, str], chunked_document: Dataset) -> Dataset:
             }
         )
     )
-    chunked_document_all_icl = safe_concatenate_datasets(chunked_document_all_icl)
-    chunked_document_all_icl = chunked_document_all_icl.map(
-        lambda x: {
-            "chunks": chunk_document(
-                [x["document"]], server_ctx_size=4096, chunk_word_count=1024
-            )
-            if get_token_count(x["document"], tokenizer) > 1024
-            else [x["document"]]
-        }
-    )
+    chunked_document_all_icl = []
+    for c in safe_concatenate_datasets(chunked_document_all_icl):
+        if get_token_count(c["document"], tokenizer) > 1024:
+            raise ValueError("Chunk exceeds token count of 1024")
+
+        chunked_document_all_icl.append({
+            "chunks": [c["document"]]
+        })
+
     df = chunked_document_all_icl.to_pandas()
     df_exploded = df.explode("chunks").reset_index(drop=True)
     new_ds = Dataset.from_pandas(df_exploded)
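For context, the change amounts to a fail-fast check: any chunk whose token count exceeds 1024 now raises a ValueError instead of being re-chunked with chunk_document. Below is a minimal, standalone sketch of that behavior. It assumes a Hugging Face tokenizer, and the names MAX_CHUNK_TOKENS and validate_chunks, as well as this re-implementation of get_token_count, are illustrative only, not the repo's actual code.

from typing import Dict, List

from transformers import AutoTokenizer  # assumption: the tokenizer comes from transformers

MAX_CHUNK_TOKENS = 1024  # limit used in the diff above


def get_token_count(text: str, tokenizer) -> int:
    # Illustrative token counter: tokenize the text and count the pieces.
    return len(tokenizer.tokenize(text))


def validate_chunks(documents: List[str], tokenizer) -> List[Dict[str, List[str]]]:
    # Fail fast on oversized chunks rather than silently re-chunking them,
    # mirroring the loop the commit adds to add_icls().
    validated = []
    for doc in documents:
        if get_token_count(doc, tokenizer) > MAX_CHUNK_TOKENS:
            raise ValueError(f"Chunk exceeds token count of {MAX_CHUNK_TOKENS}")
        validated.append({"chunks": [doc]})
    return validated


if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works for the demo
    try:
        rows = validate_chunks(["a short document", "another short one"], tok)
        print(f"{len(rows)} chunks within the {MAX_CHUNK_TOKENS}-token limit")
    except ValueError as err:
        print(f"seed dataset creation aborted: {err}")

With this behavior, a caller that previously relied on automatic re-chunking must instead shrink its chunks upstream (or catch the ValueError) before building the seed dataset.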
