Skip to content

Commit 3cd988f

Browse files
precommit
1 parent ef9faca commit 3cd988f

File tree

1 file changed

+70
-61
lines changed

1 file changed

+70
-61
lines changed

notebooks/integrations/hugging-face/huggingface-integration-millions-of-documents-with-cohere-reranking.ipynb

Lines changed: 70 additions & 61 deletions
Original file line number | Diff line number | Diff line change
@@ -173,9 +173,9 @@
173173
" # hosts=[\"http://localhost:9200\"]\n",
174174
" cloud_id=ELASTIC_CLOUD_ID,\n",
175175
" api_key=ELASTIC_API_KEY,\n",
176-
" request_timeout=120, \n",
176+
" request_timeout=120,\n",
177177
" max_retries=10,\n",
178-
" retry_on_timeout=True\n",
178+
" retry_on_timeout=True,\n",
179179
")"
180180
]
181181
},
@@ -205,6 +205,7 @@
205205
"source": [
206206
"print(client.info())\n",
207207
"\n",
208+
"\n",
208209
"# define this now so we can use it later\n",
209210
"def pretty_search_response(response):\n",
210211
" if len(response[\"hits\"][\"hits\"]) == 0:\n",
@@ -265,15 +266,16 @@
265266
"source": [
266267
"API_KEY = getpass(\"Huggingface API key: \")\n",
267268
"client.inference.put_model(\n",
268-
" inference_id='my_hf_endpoint_object',\n",
269+
" inference_id=\"my_hf_endpoint_object\",\n",
269270
" body={\n",
270271
" \"service\": \"hugging_face\",\n",
271-
" \"service_settings\": {\"api_key\": API_KEY, \n",
272-
" \"url\": \"https://yb0j0ol2xzvro0oc.us-east-1.aws.endpoints.huggingface.cloud\",\n",
273-
" \"similarity\": \"dot_product\"\n",
274-
" },\n",
272+
" \"service_settings\": {\n",
273+
" \"api_key\": API_KEY,\n",
274+
" \"url\": \"https://yb0j0ol2xzvro0oc.us-east-1.aws.endpoints.huggingface.cloud\",\n",
275+
" \"similarity\": \"dot_product\",\n",
276+
" },\n",
275277
" },\n",
276-
" task_type=\"text_embedding\"\n",
278+
" task_type=\"text_embedding\",\n",
277279
")"
278280
]
279281
},
@@ -296,9 +298,8 @@
296298
],
297299
"source": [
298300
"client.inference.inference(\n",
299-
" inference_id='my_hf_endpoint_object',\n",
300-
" input=\"this is the raw text of my document!\"\n",
301-
" )"
301+
" inference_id=\"my_hf_endpoint_object\", input=\"this is the raw text of my document!\"\n",
302+
")"
302303
]
303304
},
304305
{
@@ -407,12 +408,12 @@
407408
"source": [
408409
"client.indices.create(\n",
409410
" index=\"hf-endpoint-index\",\n",
410-
" settings = {\n",
411+
" settings={\n",
411412
" \"index\": {\n",
412413
" \"default_pipeline\": \"hf_pipeline\",\n",
413414
" }\n",
414415
" },\n",
415-
" mappings = {\n",
416+
" mappings={\n",
416417
" \"properties\": {\n",
417418
" \"text\": {\"type\": \"text\"},\n",
418419
" \"text_embedding\": {\n",
@@ -421,7 +422,7 @@
421422
" \"similarity\": \"dot_product\",\n",
422423
" },\n",
423424
" }\n",
424-
" }\n",
425+
" },\n",
425426
")"
426427
]
427428
},
@@ -455,19 +456,16 @@
455456
],
456457
"source": [
457458
"client.indices.create(\n",
458-
" index=\"hf-semantic-text-index\",\n",
459-
" mappings={\n",
459+
" index=\"hf-semantic-text-index\",\n",
460+
" mappings={\n",
460461
" \"properties\": {\n",
461462
" \"infer_field\": {\n",
462463
" \"type\": \"semantic_text\",\n",
463-
" \"inference_id\": \"my_hf_endpoint_object\"\n",
464+
" \"inference_id\": \"my_hf_endpoint_object\",\n",
464465
" },\n",
465-
" \"text_field\": {\n",
466-
" \"type\": \"text\",\n",
467-
" \"copy_to\": \"infer_field\"\n",
468-
" }\n",
466+
" \"text_field\": {\"type\": \"text\", \"copy_to\": \"infer_field\"},\n",
469467
" }\n",
470-
" }\n",
468+
" },\n",
471469
")"
472470
]
473471
},
@@ -488,10 +486,29 @@
488486
"metadata": {},
489487
"outputs": [],
490488
"source": [
491-
"langs = ['ar', 'bn', 'en', 'es', 'fa', 'fi', 'fr', 'hi', 'id', 'ja', 'ko', 'ru', 'sw', 'te', 'th', 'zh']\n",
489+
"langs = [\n",
490+
" \"ar\",\n",
491+
" \"bn\",\n",
492+
" \"en\",\n",
493+
" \"es\",\n",
494+
" \"fa\",\n",
495+
" \"fi\",\n",
496+
" \"fr\",\n",
497+
" \"hi\",\n",
498+
" \"id\",\n",
499+
" \"ja\",\n",
500+
" \"ko\",\n",
501+
" \"ru\",\n",
502+
" \"sw\",\n",
503+
" \"te\",\n",
504+
" \"th\",\n",
505+
" \"zh\",\n",
506+
"]\n",
492507
"\n",
493508
"\n",
494-
"all_langs_datasets = [iter(datasets.load_dataset('miracl/miracl-corpus', lang)['train']) for lang in langs]"
509+
"all_langs_datasets = [\n",
510+
" iter(datasets.load_dataset(\"miracl/miracl-corpus\", lang)[\"train\"]) for lang in langs\n",
511+
"]"
495512
]
496513
},
497514
{
@@ -665,11 +682,13 @@
665682
" for ds in all_langs_datasets:\n",
666683
" text = next(ds, sentinel)\n",
667684
" if text is not sentinel:\n",
668-
" documents.append({\n",
669-
" \"_index\": \"hf-semantic-text-index\",\n",
670-
" \"_source\": {\"text_field\": text['text']},\n",
671-
" })\n",
672-
" # if you are using an ingest pipeline instead of a \n",
685+
" documents.append(\n",
686+
" {\n",
687+
" \"_index\": \"hf-semantic-text-index\",\n",
688+
" \"_source\": {\"text_field\": text[\"text\"]},\n",
689+
" }\n",
690+
" )\n",
691+
" # if you are using an ingest pipeline instead of a\n",
673692
" # semantic text field, use this instead:\n",
674693
" # documents.append(\n",
675694
" # {\n",
@@ -680,7 +699,7 @@
680699
"\n",
681700
" try:\n",
682701
" response = helpers.bulk(client, documents, raise_on_error=False, timeout=\"60s\")\n",
683-
" print(\"Docs uplaoded:\", (j+1)*MAX_BULK_SIZE)\n",
702+
" print(\"Docs uplaoded:\", (j + 1) * MAX_BULK_SIZE)\n",
684703
"\n",
685704
" except Exception as e:\n",
686705
" print(\"exception:\", str(e))"
@@ -705,11 +724,9 @@
705724
"source": [
706725
"query = \"English speaking countries\"\n",
707726
"semantic_search_results = client.search(\n",
708-
" index=\"hf-semantic-text-index\",\n",
709-
" query={\"semantic\": {\"field\": \"infer_field\", \"query\": query}},\n",
710-
")\n",
711-
"\n",
712-
" "
727+
" index=\"hf-semantic-text-index\",\n",
728+
" query={\"semantic\": {\"field\": \"infer_field\", \"query\": query}},\n",
729+
")"
713730
]
714731
},
715732
{
@@ -795,17 +812,14 @@
795812
" task_type=\"rerank\",\n",
796813
" inference_id=\"my_cohere_rerank_endpoint\",\n",
797814
" body={\n",
798-
" \"service\": \"cohere\",\n",
799-
" \"service_settings\": {\n",
815+
" \"service\": \"cohere\",\n",
816+
" \"service_settings\": {\n",
800817
" \"api_key\": \"h2OzeuORCdvJ8eidGYbHmjfeWcecRQN8MYGDHxK1\",\n",
801-
" \"model_id\": \"rerank-english-v3.0\"\n",
802-
" },\n",
803-
" \"task_settings\": {\n",
804-
" \"top_n\": 100,\n",
805-
" \"return_documents\": True\n",
806-
" }\n",
807-
" }\n",
808-
")\n"
818+
" \"model_id\": \"rerank-english-v3.0\",\n",
819+
" },\n",
820+
" \"task_settings\": {\"top_n\": 100, \"return_documents\": True},\n",
821+
" },\n",
822+
")"
809823
]
810824
},
811825
{
@@ -817,25 +831,20 @@
817831
"source": [
818832
"reranked_search_results = client.search(\n",
819833
" index=\"hf-semantic-text-index\",\n",
820-
" retriever= {\n",
834+
" retriever={\n",
821835
" \"text_similarity_reranker\": {\n",
822-
" \"retriever\": {\n",
823-
" \"standard\": {\n",
824-
" \"query\": {\n",
825-
" \"semantic\": {\n",
826-
" \"field\": \"infer_field\",\n",
827-
" \"query\": query \n",
828-
" }\n",
836+
" \"retriever\": {\n",
837+
" \"standard\": {\n",
838+
" \"query\": {\"semantic\": {\"field\": \"infer_field\", \"query\": query}}\n",
829839
" }\n",
830-
" }\n",
831-
" },\n",
832-
" \"field\": \"text_field\",\n",
833-
" \"inference_id\": \"my_cohere_rerank_endpoint\",\n",
834-
" \"inference_text\": query,\n",
835-
" \"rank_window_size\": 100,\n",
840+
" },\n",
841+
" \"field\": \"text_field\",\n",
842+
" \"inference_id\": \"my_cohere_rerank_endpoint\",\n",
843+
" \"inference_text\": query,\n",
844+
" \"rank_window_size\": 100,\n",
836845
" }\n",
837-
" }\n",
838-
")\n"
846+
" },\n",
847+
")"
839848
]
840849
},
841850
{

0 commit comments

Comments
 (0)