|
173 | 173 | " # hosts=[\"http://localhost:9200\"]\n",
|
174 | 174 | " cloud_id=ELASTIC_CLOUD_ID,\n",
|
175 | 175 | " api_key=ELASTIC_API_KEY,\n",
|
176 |
| - " request_timeout=120, \n", |
| 176 | + " request_timeout=120,\n", |
177 | 177 | " max_retries=10,\n",
|
178 |
| - " retry_on_timeout=True\n", |
| 178 | + " retry_on_timeout=True,\n", |
179 | 179 | ")"
|
180 | 180 | ]
|
181 | 181 | },
|
|
205 | 205 | "source": [
|
206 | 206 | "print(client.info())\n",
|
207 | 207 | "\n",
|
| 208 | + "\n", |
208 | 209 | "# define this now so we can use it later\n",
|
209 | 210 | "def pretty_search_response(response):\n",
|
210 | 211 | " if len(response[\"hits\"][\"hits\"]) == 0:\n",
|
|
265 | 266 | "source": [
|
266 | 267 | "API_KEY = getpass(\"Huggingface API key: \")\n",
|
267 | 268 | "client.inference.put_model(\n",
|
268 |
| - " inference_id='my_hf_endpoint_object',\n", |
| 269 | + " inference_id=\"my_hf_endpoint_object\",\n", |
269 | 270 | " body={\n",
|
270 | 271 | " \"service\": \"hugging_face\",\n",
|
271 |
| - " \"service_settings\": {\"api_key\": API_KEY, \n", |
272 |
| - " \"url\": \"https://yb0j0ol2xzvro0oc.us-east-1.aws.endpoints.huggingface.cloud\",\n", |
273 |
| - " \"similarity\": \"dot_product\"\n", |
274 |
| - " },\n", |
| 272 | + " \"service_settings\": {\n", |
| 273 | + " \"api_key\": API_KEY,\n", |
| 274 | + " \"url\": \"https://yb0j0ol2xzvro0oc.us-east-1.aws.endpoints.huggingface.cloud\",\n", |
| 275 | + " \"similarity\": \"dot_product\",\n", |
| 276 | + " },\n", |
275 | 277 | " },\n",
|
276 |
| - " task_type=\"text_embedding\"\n", |
| 278 | + " task_type=\"text_embedding\",\n", |
277 | 279 | ")"
|
278 | 280 | ]
|
279 | 281 | },
|
|
296 | 298 | ],
|
297 | 299 | "source": [
|
298 | 300 | "client.inference.inference(\n",
|
299 |
| - " inference_id='my_hf_endpoint_object',\n", |
300 |
| - " input=\"this is the raw text of my document!\"\n", |
301 |
| - " )" |
| 301 | + " inference_id=\"my_hf_endpoint_object\", input=\"this is the raw text of my document!\"\n", |
| 302 | + ")" |
302 | 303 | ]
|
303 | 304 | },
|
304 | 305 | {
|
|
407 | 408 | "source": [
|
408 | 409 | "client.indices.create(\n",
|
409 | 410 | " index=\"hf-endpoint-index\",\n",
|
410 |
| - " settings = {\n", |
| 411 | + " settings={\n", |
411 | 412 | " \"index\": {\n",
|
412 | 413 | " \"default_pipeline\": \"hf_pipeline\",\n",
|
413 | 414 | " }\n",
|
414 | 415 | " },\n",
|
415 |
| - " mappings = {\n", |
| 416 | + " mappings={\n", |
416 | 417 | " \"properties\": {\n",
|
417 | 418 | " \"text\": {\"type\": \"text\"},\n",
|
418 | 419 | " \"text_embedding\": {\n",
|
|
421 | 422 | " \"similarity\": \"dot_product\",\n",
|
422 | 423 | " },\n",
|
423 | 424 | " }\n",
|
424 |
| - " }\n", |
| 425 | + " },\n", |
425 | 426 | ")"
|
426 | 427 | ]
|
427 | 428 | },
|
|
455 | 456 | ],
|
456 | 457 | "source": [
|
457 | 458 | "client.indices.create(\n",
|
458 |
| - " index=\"hf-semantic-text-index\",\n", |
459 |
| - " mappings={\n", |
| 459 | + " index=\"hf-semantic-text-index\",\n", |
| 460 | + " mappings={\n", |
460 | 461 | " \"properties\": {\n",
|
461 | 462 | " \"infer_field\": {\n",
|
462 | 463 | " \"type\": \"semantic_text\",\n",
|
463 |
| - " \"inference_id\": \"my_hf_endpoint_object\"\n", |
| 464 | + " \"inference_id\": \"my_hf_endpoint_object\",\n", |
464 | 465 | " },\n",
|
465 |
| - " \"text_field\": {\n", |
466 |
| - " \"type\": \"text\",\n", |
467 |
| - " \"copy_to\": \"infer_field\"\n", |
468 |
| - " }\n", |
| 466 | + " \"text_field\": {\"type\": \"text\", \"copy_to\": \"infer_field\"},\n", |
469 | 467 | " }\n",
|
470 |
| - " }\n", |
| 468 | + " },\n", |
471 | 469 | ")"
|
472 | 470 | ]
|
473 | 471 | },
|
|
488 | 486 | "metadata": {},
|
489 | 487 | "outputs": [],
|
490 | 488 | "source": [
|
491 |
| - "langs = ['ar', 'bn', 'en', 'es', 'fa', 'fi', 'fr', 'hi', 'id', 'ja', 'ko', 'ru', 'sw', 'te', 'th', 'zh']\n", |
| 489 | + "langs = [\n", |
| 490 | + " \"ar\",\n", |
| 491 | + " \"bn\",\n", |
| 492 | + " \"en\",\n", |
| 493 | + " \"es\",\n", |
| 494 | + " \"fa\",\n", |
| 495 | + " \"fi\",\n", |
| 496 | + " \"fr\",\n", |
| 497 | + " \"hi\",\n", |
| 498 | + " \"id\",\n", |
| 499 | + " \"ja\",\n", |
| 500 | + " \"ko\",\n", |
| 501 | + " \"ru\",\n", |
| 502 | + " \"sw\",\n", |
| 503 | + " \"te\",\n", |
| 504 | + " \"th\",\n", |
| 505 | + " \"zh\",\n", |
| 506 | + "]\n", |
492 | 507 | "\n",
|
493 | 508 | "\n",
|
494 |
| - "all_langs_datasets = [iter(datasets.load_dataset('miracl/miracl-corpus', lang)['train']) for lang in langs]" |
| 509 | + "all_langs_datasets = [\n", |
| 510 | + " iter(datasets.load_dataset(\"miracl/miracl-corpus\", lang)[\"train\"]) for lang in langs\n", |
| 511 | + "]" |
495 | 512 | ]
|
496 | 513 | },
|
497 | 514 | {
|
|
665 | 682 | " for ds in all_langs_datasets:\n",
|
666 | 683 | " text = next(ds, sentinel)\n",
|
667 | 684 | " if text is not sentinel:\n",
|
668 |
| - " documents.append({\n", |
669 |
| - " \"_index\": \"hf-semantic-text-index\",\n", |
670 |
| - " \"_source\": {\"text_field\": text['text']},\n", |
671 |
| - " })\n", |
672 |
| - " # if you are using an ingest pipeline instead of a \n", |
| 685 | + " documents.append(\n", |
| 686 | + " {\n", |
| 687 | + " \"_index\": \"hf-semantic-text-index\",\n", |
| 688 | + " \"_source\": {\"text_field\": text[\"text\"]},\n", |
| 689 | + " }\n", |
| 690 | + " )\n", |
| 691 | + " # if you are using an ingest pipeline instead of a\n", |
673 | 692 | " # semantic text field, use this instead:\n",
|
674 | 693 | " # documents.append(\n",
|
675 | 694 | " # {\n",
|
|
680 | 699 | "\n",
|
681 | 700 | " try:\n",
|
682 | 701 | " response = helpers.bulk(client, documents, raise_on_error=False, timeout=\"60s\")\n",
|
683 |
| - " print(\"Docs uplaoded:\", (j+1)*MAX_BULK_SIZE)\n", |
| 702 | + "        print(\"Docs uploaded:\", (j + 1) * MAX_BULK_SIZE)\n", |
684 | 703 | "\n",
|
685 | 704 | " except Exception as e:\n",
|
686 | 705 | " print(\"exception:\", str(e))"
|
|
705 | 724 | "source": [
|
706 | 725 | "query = \"English speaking countries\"\n",
|
707 | 726 | "semantic_search_results = client.search(\n",
|
708 |
| - " index=\"hf-semantic-text-index\",\n", |
709 |
| - " query={\"semantic\": {\"field\": \"infer_field\", \"query\": query}},\n", |
710 |
| - ")\n", |
711 |
| - "\n", |
712 |
| - " " |
| 727 | + " index=\"hf-semantic-text-index\",\n", |
| 728 | + " query={\"semantic\": {\"field\": \"infer_field\", \"query\": query}},\n", |
| 729 | + ")" |
713 | 730 | ]
|
714 | 731 | },
|
715 | 732 | {
|
|
795 | 812 | " task_type=\"rerank\",\n",
|
796 | 813 | " inference_id=\"my_cohere_rerank_endpoint\",\n",
|
797 | 814 | " body={\n",
|
798 |
| - " \"service\": \"cohere\",\n", |
799 |
| - " \"service_settings\": {\n", |
| 815 | + " \"service\": \"cohere\",\n", |
| 816 | + " \"service_settings\": {\n", |
800 | 817 | " \"api_key\": \"h2OzeuORCdvJ8eidGYbHmjfeWcecRQN8MYGDHxK1\",\n",
|
801 |
| - " \"model_id\": \"rerank-english-v3.0\"\n", |
802 |
| - " },\n", |
803 |
| - " \"task_settings\": {\n", |
804 |
| - " \"top_n\": 100,\n", |
805 |
| - " \"return_documents\": True\n", |
806 |
| - " }\n", |
807 |
| - " }\n", |
808 |
| - ")\n" |
| 818 | + " \"model_id\": \"rerank-english-v3.0\",\n", |
| 819 | + " },\n", |
| 820 | + " \"task_settings\": {\"top_n\": 100, \"return_documents\": True},\n", |
| 821 | + " },\n", |
| 822 | + ")" |
809 | 823 | ]
|
810 | 824 | },
|
811 | 825 | {
|
|
817 | 831 | "source": [
|
818 | 832 | "reranked_search_results = client.search(\n",
|
819 | 833 | " index=\"hf-semantic-text-index\",\n",
|
820 |
| - " retriever= {\n", |
| 834 | + " retriever={\n", |
821 | 835 | " \"text_similarity_reranker\": {\n",
|
822 |
| - " \"retriever\": {\n", |
823 |
| - " \"standard\": {\n", |
824 |
| - " \"query\": {\n", |
825 |
| - " \"semantic\": {\n", |
826 |
| - " \"field\": \"infer_field\",\n", |
827 |
| - " \"query\": query \n", |
828 |
| - " }\n", |
| 836 | + " \"retriever\": {\n", |
| 837 | + " \"standard\": {\n", |
| 838 | + " \"query\": {\"semantic\": {\"field\": \"infer_field\", \"query\": query}}\n", |
829 | 839 | " }\n",
|
830 |
| - " }\n", |
831 |
| - " },\n", |
832 |
| - " \"field\": \"text_field\",\n", |
833 |
| - " \"inference_id\": \"my_cohere_rerank_endpoint\",\n", |
834 |
| - " \"inference_text\": query,\n", |
835 |
| - " \"rank_window_size\": 100,\n", |
| 840 | + " },\n", |
| 841 | + " \"field\": \"text_field\",\n", |
| 842 | + " \"inference_id\": \"my_cohere_rerank_endpoint\",\n", |
| 843 | + " \"inference_text\": query,\n", |
| 844 | + " \"rank_window_size\": 100,\n", |
836 | 845 | " }\n",
|
837 |
| - " }\n", |
838 |
| - ")\n" |
| 846 | + " },\n", |
| 847 | + ")" |
839 | 848 | ]
|
840 | 849 | },
|
841 | 850 | {
|
|