diff --git a/VERSION.txt b/VERSION.txt index 4e1d30d969..bd8bf882d0 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -1.6.1rc0 +1.7.0 diff --git a/docs/_src/api/openapi/openapi-1.7.0.json b/docs/_src/api/openapi/openapi-1.7.0.json new file mode 100644 index 0000000000..209a1723ad --- /dev/null +++ b/docs/_src/api/openapi/openapi-1.7.0.json @@ -0,0 +1,886 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.7.0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": 
"Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Feedback Feedback Get", + "type": "array", + "items": { + "$ref": "#/components/schemas/Label" + } + } + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } 
+ } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + 
"description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to retrieve by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents 
Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "Answer": { + "title": "Answer", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "offsets_in_document": { + 
"title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + 
}, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "Document": { + "title": "Document", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image", + "audio" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "string" + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + } + ] + } + } + }, + "additionalProperties": false + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } 
+ }, + "Label": { + "title": "Label", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/Answer" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + 
"required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/_src/api/openapi/openapi.json b/docs/_src/api/openapi/openapi.json index fcc336df71..209a1723ad 100644 --- a/docs/_src/api/openapi/openapi.json +++ b/docs/_src/api/openapi/openapi.json @@ -2,7 +2,7 @@ "openapi": "3.0.2", "info": { "title": "Haystack REST API", - "version": "1.6.1rc0" + "version": "1.7.0" }, "paths": { "/initialized": { diff --git a/docs/v1.7.0/Makefile b/docs/v1.7.0/Makefile new file mode 100644 index 0000000000..8634435d76 --- /dev/null +++ b/docs/v1.7.0/Makefile @@ -0,0 +1,25 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. + +SPHINXBUILD := sphinx-build +MAKEINFO := makeinfo + +BUILDDIR := build +SOURCE := _src/ +# SPHINXFLAGS := -a -W -n -A local=1 -d $(BUILDDIR)/doctree +SPHINXFLAGS := -A local=1 -d $(BUILDDIR)/doctree +SPHINXOPTS := $(SPHINXFLAGS) $(SOURCE) + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + $(SPHINXBUILD) -M $@ $(SPHINXOPTS) $(BUILDDIR)/$@ diff --git a/docs/v1.7.0/_src/api/Makefile b/docs/v1.7.0/_src/api/Makefile new file mode 100644 index 0000000000..d4bb2cbb9e --- /dev/null +++ b/docs/v1.7.0/_src/api/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/v1.7.0/_src/api/_static/floating_sidebar.css b/docs/v1.7.0/_src/api/_static/floating_sidebar.css new file mode 100644 index 0000000000..e59adc6722 --- /dev/null +++ b/docs/v1.7.0/_src/api/_static/floating_sidebar.css @@ -0,0 +1,29 @@ +div.sphinxsidebarwrapper { + position: relative; + top: 0px; + padding: 0; +} + +div.sphinxsidebar { + margin: 0; + padding: 0 15px 0 15px; + width: 210px; + float: left; + font-size: 1em; + text-align: left; +} + +div.sphinxsidebar .logo { + font-size: 1.8em; + color: #0A507A; + font-weight: 300; + text-align: center; +} + +div.sphinxsidebar .logo img { + vertical-align: middle; +} + +div.sphinxsidebar .download a img { + vertical-align: middle; +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/_templates/xxlayout.html b/docs/v1.7.0/_src/api/_templates/xxlayout.html new file mode 100644 index 0000000000..de71588332 --- /dev/null +++ b/docs/v1.7.0/_src/api/_templates/xxlayout.html @@ -0,0 +1,46 @@ +{# put the sidebar before the body #} +{% block sidebar1 %}{{ sidebar() }}{% endblock %} +{% block sidebar2 %}{% endblock %} + 
+{% block extrahead %} + +{{ super() }} +{#- if not embedded #} + + +{#- endif #} +{% endblock %} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/api/crawler.md b/docs/v1.7.0/_src/api/api/crawler.md new file mode 100644 index 0000000000..494f97f1ab --- /dev/null +++ b/docs/v1.7.0/_src/api/api/crawler.md @@ -0,0 +1,144 @@ + + +# Module crawler + + + +## Crawler + +```python +class Crawler(BaseComponent) +``` + +Crawl texts from a website so that we can use them later in Haystack as a corpus for search / question answering etc. + +**Example:** +```python +| from haystack.nodes.connector import Crawler +| +| crawler = Crawler(output_dir="crawled_files") +| # crawl Haystack docs, i.e. all pages that include haystack.deepset.ai/overview/ +| docs = crawler.crawl(urls=["https://haystack.deepset.ai/overview/get-started"], +| filter_urls= ["haystack.deepset.ai/overview/"]) +``` + + + +#### Crawler.\_\_init\_\_ + +```python +def __init__(output_dir: str, urls: Optional[List[str]] = None, crawler_depth: int = 1, filter_urls: Optional[List] = None, overwrite_existing_files=True, id_hash_keys: Optional[List[str]] = None, extract_hidden_text=True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) +``` + +Init object with basic params for crawling (can be overwritten later). + +**Arguments**: + +- `output_dir`: Path for the directory to store files +- `urls`: List of http(s) address(es) (can also be supplied later when calling crawl()) +- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options: +0: Only initial list of urls +1: Follow links found on the initial URLs (but no further) +- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with. +All URLs not matching at least one of the regular expressions will be dropped. 
+- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. +- `extract_hidden_text`: Whether to extract the hidden text contained in page. +E.g. the text can be inside a span with style="display: none" +- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on +dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted. +E.g. 2: Crawler will wait 2 seconds before scraping page +- `crawler_naming_function`: A function mapping the crawled page to a file name. +By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url. +E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", link) + This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores. + 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest() + This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content. 
+ + + +#### Crawler.crawl + +```python +def crawl(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = None, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> List[Path] +``` + +Craw URL(s), extract the text from the HTML, create a Haystack Document object out of it and save it (one JSON + +file per URL, including text and basic meta data). +You can optionally specify via `filter_urls` to only crawl URLs that match a certain pattern. +All parameters are optional here and only meant to overwrite instance attributes at runtime. +If no parameters are provided to this method, the instance attributes that were passed during __init__ will be used. + +**Arguments**: + +- `output_dir`: Path for the directory to store files +- `urls`: List of http addresses or single http address +- `crawler_depth`: How many sublinks to follow from the initial list of URLs. Current options: +0: Only initial list of urls +1: Follow links found on the initial URLs (but no further) +- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with. +All URLs not matching at least one of the regular expressions will be dropped. +- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. 
+- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on +dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted. +E.g. 2: Crawler will wait 2 seconds before scraping page +- `crawler_naming_function`: A function mapping the crawled page to a file name. +By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url. +E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", link) + This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores. + 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest() + This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content. + +**Returns**: + +List of paths where the crawled webpages got stored + + + +#### Crawler.run + +```python +def run(output_dir: Union[str, Path, None] = None, urls: Optional[List[str]] = None, crawler_depth: Optional[int] = None, filter_urls: Optional[List] = None, overwrite_existing_files: Optional[bool] = None, return_documents: Optional[bool] = False, id_hash_keys: Optional[List[str]] = None, extract_hidden_text: Optional[bool] = True, loading_wait_time: Optional[int] = None, crawler_naming_function: Optional[Callable[[str, str], str]] = None) -> Tuple[Dict[str, Union[List[Document], List[Path]]], str] +``` + +Method to be executed when the Crawler is used as a Node within a Haystack pipeline. + +**Arguments**: + +- `output_dir`: Path for the directory to store files +- `urls`: List of http addresses or single http address +- `crawler_depth`: How many sublinks to follow from the initial list of URLs. 
Current options: +0: Only initial list of urls +1: Follow links found on the initial URLs (but no further) +- `filter_urls`: Optional list of regular expressions that the crawled URLs must comply with. +All URLs not matching at least one of the regular expressions will be dropped. +- `overwrite_existing_files`: Whether to overwrite existing files in output_dir with new content +- `return_documents`: Return json files content +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. +- `extract_hidden_text`: Whether to extract the hidden text contained in page. +E.g. the text can be inside a span with style="display: none" +- `loading_wait_time`: Seconds to wait for page loading before scraping. Recommended when page relies on +dynamic DOM manipulations. Use carefully and only when needed. Crawler will have scraping speed impacted. +E.g. 2: Crawler will wait 2 seconds before scraping page +- `crawler_naming_function`: A function mapping the crawled page to a file name. +By default, the file name is generated from the processed page url (string compatible with Mac, Unix and Windows paths) and the last 6 digits of the MD5 sum of this unprocessed page url. +E.g. 1) crawler_naming_function=lambda url, page_content: re.sub("[<>:'/\\|?*\0 ]", "_", link) + This example will generate a file name from the url by replacing all characters that are not allowed in file names with underscores. 
+ 2) crawler_naming_function=lambda url, page_content: hashlib.md5(f"{url}{page_content}".encode("utf-8")).hexdigest() + This example will generate a file name from the url and the page content by using the MD5 hash of the concatenation of the url and the page content. + +**Returns**: + +Tuple({"paths": List of filepaths, ...}, Name of output edge) + diff --git a/docs/v1.7.0/_src/api/api/document_classifier.md b/docs/v1.7.0/_src/api/api/document_classifier.md new file mode 100644 index 0000000000..1aa84a58f0 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/document_classifier.md @@ -0,0 +1,162 @@ + + +# Module base + + + +## BaseDocumentClassifier + +```python +class BaseDocumentClassifier(BaseComponent) +``` + + + +#### BaseDocumentClassifier.timing + +```python +def timing(fn, attr_name) +``` + +Wrapper method used to time functions. + + + +# Module transformers + + + +## TransformersDocumentClassifier + +```python +class TransformersDocumentClassifier(BaseDocumentClassifier) +``` + +Transformer based model for document classification using the HuggingFace's transformers framework +(https://github.com/huggingface/transformers). +While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same. +This node classifies documents and adds the output from the classification step to the document's meta data. +The meta field of the document is a dictionary with the following format: +``'meta': {'name': '450_Baelor.txt', 'classification': {'label': 'neutral', 'probability' = 0.9997646, ...} }`` + +Classification is run on document's content field by default. If you want it to run on another field, +set the `classification_field` to one of document's meta fields. + +With this document_classifier, you can directly get predictions via predict() + + **Usage example at query time:** + ```python +| ... 
+| retriever = BM25Retriever(document_store=document_store) +| document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion") +| p = Pipeline() +| p.add_node(component=retriever, name="Retriever", inputs=["Query"]) +| p.add_node(component=document_classifier, name="Classifier", inputs=["Retriever"]) +| res = p.run( +| query="Who is the father of Arya Stark?", +| params={"Retriever": {"top_k": 10}} +| ) +| +| # print the classification results +| print_documents(res, max_text_len=100, print_meta=True) +| # or access the predicted class label directly +| res["documents"][0].to_dict()["meta"]["classification"]["label"] + ``` + +**Usage example at index time:** + ```python +| ... +| converter = TextConverter() +| preprocessor = Preprocessor() +| document_store = ElasticsearchDocumentStore() +| document_classifier = TransformersDocumentClassifier(model_name_or_path="bhadresh-savani/distilbert-base-uncased-emotion", +| batch_size=16) +| p = Pipeline() +| p.add_node(component=converter, name="TextConverter", inputs=["File"]) +| p.add_node(component=preprocessor, name="Preprocessor", inputs=["TextConverter"]) +| p.add_node(component=document_classifier, name="DocumentClassifier", inputs=["Preprocessor"]) +| p.add_node(component=document_store, name="DocumentStore", inputs=["DocumentClassifier"]) +| p.run(file_paths=file_paths) + ``` + + + +#### TransformersDocumentClassifier.\_\_init\_\_ + +```python +def __init__(model_name_or_path: str = "bhadresh-savani/distilbert-base-uncased-emotion", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, return_all_scores: bool = False, task: str = "text-classification", labels: Optional[List[str]] = None, batch_size: int = 16, classification_field: str = None, progress_bar: bool = True) +``` + +Load a text classification model from Transformers. 
+ +Available models for the task of text-classification include: +- ``'bhadresh-savani/distilbert-base-uncased-emotion'`` +- ``'Hate-speech-CNERG/dehatebert-mono-english'`` + +Available models for the task of zero-shot-classification include: +- ``'valhalla/distilbart-mnli-12-3'`` +- ``'cross-encoder/nli-distilroberta-base'`` + +See https://huggingface.co/models for full list of available models. +Filter for text classification models: https://huggingface.co/models?pipeline_tag=text-classification&sort=downloads +Filter for zero-shot classification models (NLI): https://huggingface.co/models?pipeline_tag=zero-shot-classification&sort=downloads&search=nli + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bhadresh-savani/distilbert-base-uncased-emotion'. +See https://huggingface.co/models for full list of available models. +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `tokenizer`: Name of the tokenizer (usually the same as model) +- `use_gpu`: Whether to use GPU (if available). +- `return_all_scores`: Whether to return all prediction scores or just the one of the predicted class. Only used for task 'text-classification'. +- `task`: 'text-classification' or 'zero-shot-classification' +- `labels`: Only used for task 'zero-shot-classification'. List of string defining class labels, e.g., +["positive", "negative"] otherwise None. Given a LABEL, the sequence fed to the model is " sequence to +classify This example is LABEL . " and the model predicts whether that sequence is a contradiction +or an entailment. +- `batch_size`: Number of Documents to be processed at a time. +- `classification_field`: Name of Document's meta field to be used for classification. If left unset, Document.content is used by default. +- `progress_bar`: Whether to show a progress bar while processing. 
+ + + +#### TransformersDocumentClassifier.predict + +```python +def predict(documents: List[Document], batch_size: Optional[int] = None) -> List[Document] +``` + +Returns documents containing classification result in a meta field. + +Documents are updated in place. + +**Arguments**: + +- `documents`: A list of Documents to classify. +- `batch_size`: The number of Documents to classify at a time. + +**Returns**: + +A list of Documents enriched with meta information. + + + +#### TransformersDocumentClassifier.predict\_batch + +```python +def predict_batch(documents: Union[List[Document], List[List[Document]]], batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]] +``` + +Returns documents containing classification result in meta field. + +Documents are updated in place. + +**Arguments**: + +- `documents`: List of Documents or list of lists of Documents to classify. +- `batch_size`: Number of Documents to classify at a time. + +**Returns**: + +List of Documents or list of lists of Documents enriched with meta information. + diff --git a/docs/v1.7.0/_src/api/api/document_store.md b/docs/v1.7.0/_src/api/api/document_store.md new file mode 100644 index 0000000000..073837da5b --- /dev/null +++ b/docs/v1.7.0/_src/api/api/document_store.md @@ -0,0 +1,5118 @@ + + +# Module base + + + +## BaseKnowledgeGraph + +```python +class BaseKnowledgeGraph(BaseComponent) +``` + +Base class for implementing Knowledge Graphs. + + + +## BaseDocumentStore + +```python +class BaseDocumentStore(BaseComponent) +``` + +Base class for implementing Document Stores. + + + +#### BaseDocumentStore.write\_documents + +```python +@abstractmethod +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Indexes documents for later queries. 
+ +**Arguments**: + +- `documents`: a list of Python dictionaries or a list of Haystack Document objects. +For documents as dictionaries, the format is {"text": ""}. +Optionally: Include meta data via {"text": "", +"meta":{"name": ", "author": "somebody", ...}} +It can be used for filtering and is accessible in the responses of the Finder. +- `index`: Optional name of index where the documents shall be written to. +If None, the DocumentStore's default index (self.index) will be used. +- `batch_size`: Number of documents that are passed to bulk function at a time. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + +**Returns**: + +None + + + +#### BaseDocumentStore.get\_all\_documents + +```python +@abstractmethod +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get documents from the document store. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. 
+Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: Number of documents that are passed to bulk function at a time. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +#### BaseDocumentStore.get\_all\_documents\_generator + +```python +@abstractmethod +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. 
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + +__Example__: +```python +filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } +} +``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +#### BaseDocumentStore.get\_all\_labels\_aggregated + +```python +def get_all_labels_aggregated(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, open_domain: bool = True, drop_negative_labels: bool = False, drop_no_answers: bool = False, aggregate_by_meta: Optional[Union[str, list]] = None, headers: Optional[Dict[str, str]] = None) -> List[MultiLabel] +``` + +Return all labels in the DocumentStore, aggregated into MultiLabel objects. + +This aggregation step helps, for example, if you collected multiple possible answers for one question and you +want now all answers bundled together in one place for evaluation. 
+How they are aggregated is defined by the open_domain and aggregate_by_meta parameters. +If the questions are being asked to a single document (i.e. SQuAD style), you should set open_domain=False to aggregate by question and document. +If the questions are being asked to your full collection of documents, you should set open_domain=True to aggregate just by question. +If the questions are being asked to a subslice of your document set (e.g. product review use cases), +you should set open_domain=True and populate aggregate_by_meta with the names of Label meta fields to aggregate by question and your custom meta fields. +For example, in a product review use case, you might set aggregate_by_meta=["product_id"] so that Labels +with the same question but different answers from different documents are aggregated into the one MultiLabel +object, provided that they have the same product_id (to be found in Label.meta["product_id"]) + +**Arguments**: + +- `index`: Name of the index to get the labels from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `open_domain`: When True, labels are aggregated purely based on the question text alone. +When False, labels are aggregated in a closed domain fashion based on the question text +and also the id of the document that the label is tied to. In this setting, this function +might return multiple MultiLabel objects with the same question string. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) +- `aggregate_by_meta`: The names of the Label meta fields by which to aggregate. For example: ["product_id"] +TODO drop params + + + +#### BaseDocumentStore.normalize\_embedding + +```python +def normalize_embedding(emb: np.ndarray) -> None +``` + +Performs L2 normalization of embeddings vector inplace. Input can be a single vector (1D array) or a matrix +(2D array). + + + +#### BaseDocumentStore.add\_eval\_data + +```python +def add_eval_data(filename: str, doc_index: str = "eval_document", label_index: str = "label", batch_size: Optional[int] = None, preprocessor: Optional[PreProcessor] = None, max_docs: Union[int, bool] = None, open_domain: bool = False, headers: Optional[Dict[str, str]] = None) +``` + +Adds a SQuAD-formatted file to the DocumentStore in order to be able to perform evaluation on it. + +If a jsonl file and a batch_size is passed to the function, documents are loaded batchwise +from disk and also indexed batchwise to the DocumentStore in order to prevent out of memory errors. 
+ +**Arguments**: + +- `filename`: Name of the file containing evaluation data (json or jsonl) +- `doc_index`: Elasticsearch index where evaluation documents should be stored +- `label_index`: Elasticsearch index where labeled questions should be stored +- `batch_size`: Optional number of documents that are loaded and processed at a time. +When set to None (default) all documents are processed at once. +- `preprocessor`: Optional PreProcessor to preprocess evaluation documents. +It can be used for splitting documents into passages (and assigning labels to corresponding passages). +Currently the PreProcessor does not support split_by sentence, cleaning nor split_overlap != 0. +When set to None (default) preprocessing is disabled. +- `max_docs`: Optional number of documents that will be loaded. +When set to None (default) all available eval documents are used. +- `open_domain`: Set this to True if your file is an open domain dataset where two different answers to the +same question might be found in different contexts. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +#### BaseDocumentStore.delete\_index + +```python +@abstractmethod +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### BaseDocumentStore.run + +```python +def run(documents: List[Union[dict, Document]], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, id_hash_keys: Optional[List[str]] = None) +``` + +Run requests of document stores + +Comment: We will gradually introduce the primitives. The doument stores also accept dicts and parse them to documents. +In the future, however, only documents themselves will be accepted. 
Parsing the dictionaries in the run function +is therefore only an interim solution until the run function also accepts documents. + +**Arguments**: + +- `documents`: A list of dicts that are documents. +- `headers`: A list of headers. +- `index`: Optional name of index where the documents shall be written to. +If None, the DocumentStore's default index (self.index) will be used. +- `id_hash_keys`: List of the fields that the hashes of the ids are generated from. + + + +#### BaseDocumentStore.describe\_documents + +```python +def describe_documents(index=None) +``` + +Return a summary of the documents in the document store + + + +## KeywordDocumentStore + +```python +class KeywordDocumentStore(BaseDocumentStore) +``` + +Base class for implementing Document Stores that support keyword searches. + + + +#### KeywordDocumentStore.query + +```python +@abstractmethod +def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query as defined by keyword matching algorithms like BM25. + +**Arguments**: + +- `query`: The query +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. 
Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `custom_query`: Custom query to be executed. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) +- `all_terms_must_match`: Whether all terms of the query must match the document. +If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). 
+Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). +Defaults to False. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### KeywordDocumentStore.query\_batch + +```python +@abstractmethod +def query_batch(queries: List[str], filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[List[Document]] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the provided queries as defined by keyword matching algorithms like BM25. + +This method lets you find relevant documents for a single query string (output: List of Documents), or a +a list of query strings (output: List of Lists of Documents). + +**Arguments**: + +- `queries`: Single query or list of queries. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. 
Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `custom_query`: Custom query to be executed. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) +- `all_terms_must_match`: Whether all terms of the query must match the document. +If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). 
+Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). +Defaults to False. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### get\_batches\_from\_generator + +```python +def get_batches_from_generator(iterable, n) +``` + +Batch elements of an iterable into fixed-length chunks or blocks. + + + +# Module elasticsearch + + + +#### prepare\_hosts + +```python +def prepare_hosts(host, port) +``` + +Create a list of host(s) + port(s) to allow direct client connections to multiple elasticsearch nodes, +in the format expected by the client. + + + +## BaseElasticsearchDocumentStore + +```python +class BaseElasticsearchDocumentStore(KeywordDocumentStore) +``` + +Base class implementing the common logic for Elasticsearch and Opensearch + + + +#### BaseElasticsearchDocumentStore.get\_document\_by\_id + +```python +def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +``` + +Fetch a document by specifying its text id string + + + +#### BaseElasticsearchDocumentStore.get\_documents\_by\_id + +```python +def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Fetch documents by specifying a list of text id strings. Be aware that passing a large number of ids might lead +to performance issues. Note that Elasticsearch limits the number of results to 10,000 documents by default. 
+ + + +#### BaseElasticsearchDocumentStore.get\_metadata\_values\_by\_key + +```python +def get_metadata_values_by_key(key: str, query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> List[dict] +``` + +Get values associated with a metadata key. The output is in the format: + +[{"value": "my-value-1", "count": 23}, {"value": "my-value-2", "count": 12}, ... ] + +**Arguments**: + +- `key`: the meta key name to get the values for. +- `query`: narrow down the scope to documents matching the query string. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `index`: Elasticsearch index where the meta values should be searched. If not supplied, +self.index will be used. +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. 
{'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + + + +#### BaseElasticsearchDocumentStore.write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Indexes documents for later queries in Elasticsearch. + +Behaviour if a document with the same ID already exists in ElasticSearch: +a) (Default) Throw Elastic's standard error message for duplicate IDs. +b) If `self.update_existing_documents=True` for DocumentStore: Overwrite existing documents. +(This is only relevant if you pass your own ID when initializing a `Document`. +If don't set custom IDs for your Documents or just pass a list of dictionaries here, +they will automatically get UUIDs assigned. See the `Document` class for details) + +**Arguments**: + +- `documents`: a list of Python dictionaries or a list of Haystack Document objects. +For documents as dictionaries, the format is {"content": ""}. +Optionally: Include meta data via {"content": "", +"meta":{"name": ", "author": "somebody", ...}} +It can be used for filtering and is accessible in the responses of the Finder. +Advanced: If you are using your own Elasticsearch mapping, the key names in the dictionary +should be changed to what you have set for self.content_field and self.name_field. +- `index`: Elasticsearch index where the documents should be indexed. If not supplied, self.index will be used. +- `batch_size`: Number of documents that are passed to Elasticsearch's bulk function at a time. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. 
+fail: an error is raised if the document ID of the document being added already +exists. +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + +**Raises**: + +- `DuplicateDocumentError`: Exception trigger on duplicate document + +**Returns**: + +None + + + +#### BaseElasticsearchDocumentStore.write\_labels + +```python +def write_labels(labels: Union[List[Label], List[dict]], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000) +``` + +Write annotation labels into document store. + +**Arguments**: + +- `labels`: A list of Python dictionaries or a list of Haystack Label objects. +- `index`: Elasticsearch index where the labels should be stored. If not supplied, self.label_index will be used. +- `batch_size`: Number of labels that are passed to Elasticsearch's bulk function at a time. +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + + + +#### BaseElasticsearchDocumentStore.update\_document\_meta + +```python +def update_document_meta(id: str, meta: Dict[str, str], index: str = None, headers: Optional[Dict[str, str]] = None) +``` + +Update the metadata dictionary of a document by specifying its string id + + + +#### BaseElasticsearchDocumentStore.get\_document\_count + +```python +def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of documents in the document store. 
+ + + +#### BaseElasticsearchDocumentStore.get\_label\_count + +```python +def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of labels in the document store + + + +#### BaseElasticsearchDocumentStore.get\_embedding\_count + +```python +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the count of embeddings in the document store. + + + +#### BaseElasticsearchDocumentStore.get\_all\_documents + +```python +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get documents from the document store. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + + + +#### BaseElasticsearchDocumentStore.get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. 
Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + + + +#### BaseElasticsearchDocumentStore.get\_all\_labels + +```python +def get_all_labels(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000) -> List[Label] +``` + +Return all labels in the document store + + + +#### BaseElasticsearchDocumentStore.query + +```python +def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query as defined by the BM25 algorithm. 
+ +**Arguments**: + +- `query`: The query +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `custom_query`: query string as per Elasticsearch DSL with a mandatory query placeholder(query). 
+Optionally, ES `filter` clause can be added where the values of `terms` are placeholders +that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..) +names must match with the filters dict supplied in self.retrieve(). +:: + + **An example custom_query:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | "filter": [ // optional custom filters + | {"terms": {"year": ${years}}}, + | {"terms": {"quarter": ${quarters}}}, + | {"range": {"date": {"gte": ${date}}}} + | ], + | } + | }, + | } + ``` + + **For this custom_query, a sample retrieve() could be:** + ```python + | self.retrieve(query="Why did the revenue increase?", + | filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) + ``` + +Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings. +See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html. +You will find the highlighted output in the returned Document's meta field by key "highlighted". 
+:: + + **Example custom_query with highlighting:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | } + | }, + | "highlight": { // enable highlighting + | "fields": { // for fields content and title + | "content": {}, + | "title": {} + | } + | }, + | } + ``` + + **For this custom_query, highlighting info can be accessed by:** + ```python + | docs = self.retrieve(query="Why did the revenue increase?") + | highlighted_content = docs[0].meta["highlighted"]["content"] + | highlighted_title = docs[0].meta["highlighted"]["title"] + ``` +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. +- `all_terms_must_match`: Whether all terms of the query must match the document. +If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). +Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). +Defaults to false. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. 
+ + + +#### BaseElasticsearchDocumentStore.query\_batch + +```python +def query_batch(queries: List[str], filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[List[Document]] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the provided queries as defined by keyword matching algorithms like BM25. + +This method lets you find relevant documents for list of query strings (output: List of Lists of Documents). + +**Arguments**: + +- `queries`: List of query strings. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. Can be a single filter that will be applied to each query or a list of filters +(one filter per query). + +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `custom_query`: Custom query to be executed. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) +- `all_terms_must_match`: Whether all terms of the query must match the document. +If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). +Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). +Defaults to False. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. 
cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### BaseElasticsearchDocumentStore.query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return +- `index`: Index name for storing the docs and metadata +- `return_embedding`: To return document embedding +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. 
+
+
+
+#### BaseElasticsearchDocumentStore.update\_embeddings
+
+```python
+def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None)
+```
+
+Updates the embeddings in the document store using the encoding model specified in the retriever.
+
+This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).
+
+**Arguments**:
+
+- `retriever`: Retriever to use to update the embeddings.
+- `index`: Index name to update
+- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False,
+only documents without embeddings are processed. This mode can be used for
+incremental updating of embeddings, wherein, only newly indexed documents
+get processed.
+- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical
+operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`,
+`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name.
+Logical operator keys take a dictionary of metadata field names and/or logical operators as
+value. Metadata field names take a dictionary of comparison operators as value. Comparison
+operator keys take a single value or (in case of `"$in"`) a list of values as value.
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison
+operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default
+operation.
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + +**Returns**: + +None + + + +#### BaseElasticsearchDocumentStore.delete\_all\_documents + +```python +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the documents to be deleted. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + +**Returns**: + +None + + + +#### BaseElasticsearchDocumentStore.delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the documents from. If None, the +DocumentStore's default index (self.index) will be used +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the documents to be deleted. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + + If filters are provided along with a list of IDs, this method deletes the + intersection of the two query results (documents that match the filters and + have their ID in the list). +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + +**Returns**: + +None + + + +#### BaseElasticsearchDocumentStore.delete\_labels + +```python +def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete labels in an index. All labels are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the labels from. If None, the +DocumentStore's default label index (self.label_index) will be used +- `ids`: Optional list of IDs to narrow down the labels to be deleted. +- `filters`: Optional filters to narrow down the labels to be deleted. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. 
If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. + +**Returns**: + +None + + + +#### BaseElasticsearchDocumentStore.delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing elasticsearch index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +## ElasticsearchDocumentStore + +```python +class ElasticsearchDocumentStore(BaseElasticsearchDocumentStore) +``` + + + +#### ElasticsearchDocumentStore.\_\_init\_\_ + +```python +def __init__(host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, username: str = "", password: str = "", api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", scheme: str = "http", ca_certs: Optional[str] = None, verify_certs: bool = True, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity: str = "dot_product", timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = 
"1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, synonym_type: str = "synonym", use_system_proxy: bool = False) +``` + +A DocumentStore using Elasticsearch to store and query the documents for our search. + +* Keeps all the logic to store and query documents from Elastic, incl. mapping of fields, adding filters or boosts to your queries, and storing embeddings + * You can either use an existing Elasticsearch index or create a new one via haystack + * Retrievers operate on top of this DocumentStore to find the relevant documents for a query + +**Arguments**: + +- `host`: url(s) of elasticsearch nodes +- `port`: port(s) of elasticsearch nodes +- `username`: username (standard authentication via http_auth) +- `password`: password (standard authentication via http_auth) +- `api_key_id`: ID of the API key (altenative authentication mode to the above http_auth) +- `api_key`: Secret value of the API key (altenative authentication mode to the above http_auth) +- `aws4auth`: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package) +- `index`: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one. +- `label_index`: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one. +- `search_fields`: Name of fields used by BM25Retriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"] +- `content_field`: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text"). +If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned. +- `name_field`: Name of field that contains the title of the the doc +- `embedding_field`: Name of field containing an embedding vector (Only needed when using a dense retriever (e.g. 
DensePassageRetriever, EmbeddingRetriever) on top) +- `embedding_dim`: Dimensionality of embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top) +- `custom_mapping`: If you want to use your own custom mapping for creating a new index in Elasticsearch, you can supply it here as a dictionary. +- `analyzer`: Specify the default analyzer from one of the built-ins when creating a new Elasticsearch Index. +Elasticsearch also has built-in analyzers for different languages (e.g. impacting tokenization). More info at: +https://www.elastic.co/guide/en/elasticsearch/reference/7.9/analysis-analyzers.html +- `excluded_meta_data`: Name of fields in Elasticsearch that should not be returned (e.g. [field_one, field_two]). +Helpful if you have fields with long, irrelevant content that you don't want to display in results (e.g. embedding vectors). +- `scheme`: 'https' or 'http', protocol used to connect to your elasticsearch instance +- `ca_certs`: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine. +- `verify_certs`: Whether to be strict about ca certificates +- `recreate_index`: If set to True, an existing elasticsearch index will be deleted and a new one will be +created using the config you are using for initialization. Be aware that all data in the old index will be +lost if you choose to recreate the index. Be aware that both the document_index and the label_index will +be recreated. +- `create_index`: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case) +..deprecated:: 2.0 +This param is deprecated. In the next major version we will always try to create an index if there is no +existing index (the current behaviour when create_index=True). 
If you are looking to recreate an
+existing index by deleting it first if it already exists use param recreate_index.
+- `refresh_type`: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
+If set to 'wait_for', continue only after changes are visible (slow, but safe).
+If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion).
+More info at https://www.elastic.co/guide/en/elasticsearch/reference/6.8/docs-refresh.html
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is
+more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
+- `timeout`: Number of seconds after which an ElasticSearch request times out.
+- `return_embedding`: To return document embedding
+- `duplicate_documents`: Handle duplicate documents based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore duplicate documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `index_type`: The type of index to be created. Choose from 'flat' and 'hnsw'. Currently the
+ElasticsearchDocumentStore does not support HNSW but OpenDistroElasticsearchDocumentStore does.
+- `scroll`: Determines how long the current index is fixed, e.g. during updating all documents with embeddings.
+Defaults to "1d" and should not be larger than this. Can also be in minutes "5m" or hours "15h"
+For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html
+- `skip_missing_embeddings`: Parameter to control queries based on vector similarity when indexed documents miss embeddings.
+Parameter options: (True, False) +False: Raises exception if one or more documents do not have embeddings at query time +True: Query will ignore all documents without embeddings (recommended if you concurrently index and query) +- `synonyms`: List of synonyms can be passed while elasticsearch initialization. +For example: [ "foo, bar => baz", + "foozball , foosball" ] +More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html +- `synonym_type`: Synonym filter type can be passed. +Synonym or Synonym_graph to handle synonyms, including multi-word synonyms correctly during the analysis process. +More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html +- `use_system_proxy`: Whether to use system proxy. + + + +# Module opensearch + + + +## OpenSearchDocumentStore + +```python +class OpenSearchDocumentStore(BaseElasticsearchDocumentStore) +``` + + + +#### OpenSearchDocumentStore.\_\_init\_\_ + +```python +def __init__(scheme: str = "https", username: str = "admin", password: str = "admin", host: Union[str, List[str]] = "localhost", port: Union[int, List[int]] = 9200, api_key_id: Optional[str] = None, api_key: Optional[str] = None, aws4auth=None, index: str = "document", label_index: str = "label", search_fields: Union[str, list] = "content", content_field: str = "content", name_field: str = "name", embedding_field: str = "embedding", embedding_dim: int = 768, custom_mapping: Optional[dict] = None, excluded_meta_data: Optional[list] = None, analyzer: str = "standard", ca_certs: Optional[str] = None, verify_certs: bool = False, recreate_index: bool = False, create_index: bool = True, refresh_type: str = "wait_for", similarity: str = "dot_product", timeout: int = 30, return_embedding: bool = False, duplicate_documents: str = "overwrite", index_type: str = "flat", scroll: str = "1d", skip_missing_embeddings: bool = True, synonyms: Optional[List] = None, 
synonym_type: str = "synonym", use_system_proxy: bool = False)
+```
+
+Document Store using OpenSearch (https://opensearch.org/). It is compatible with the AWS Elasticsearch Service.
+
+In addition to native Elasticsearch query & filtering, it provides efficient vector similarity search using
+the KNN plugin that can scale to a large number of documents.
+
+**Arguments**:
+
+- `host`: url(s) of elasticsearch nodes
+- `port`: port(s) of elasticsearch nodes
+- `username`: username (standard authentication via http_auth)
+- `password`: password (standard authentication via http_auth)
+- `api_key_id`: ID of the API key (alternative authentication mode to the above http_auth)
+- `api_key`: Secret value of the API key (alternative authentication mode to the above http_auth)
+- `aws4auth`: Authentication for usage with aws elasticsearch (can be generated with the requests-aws4auth package)
+- `index`: Name of index in elasticsearch to use for storing the documents that we want to search. If not existing yet, we will create one.
+- `label_index`: Name of index in elasticsearch to use for storing labels. If not existing yet, we will create one.
+- `search_fields`: Name of fields used by BM25Retriever to find matches in the docs to our incoming query (using elastic's multi_match query), e.g. ["title", "full_text"]
+- `content_field`: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text").
+If no Reader is used (e.g. in FAQ-Style QA) the plain content of this field will just be returned.
+- `name_field`: Name of field that contains the title of the doc
+- `embedding_field`: Name of field containing an embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top)
+Note, that in OpenSearch the similarity type for efficient approximate vector similarity calculations is tied to the embedding field's data type which cannot be changed after creation.
+- `embedding_dim`: Dimensionality of embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top)
+- `custom_mapping`: If you want to use your own custom mapping for creating a new index in Elasticsearch, you can supply it here as a dictionary.
+- `analyzer`: Specify the default analyzer from one of the built-ins when creating a new Elasticsearch Index.
+Elasticsearch also has built-in analyzers for different languages (e.g. impacting tokenization). More info at:
+https://www.elastic.co/guide/en/elasticsearch/reference/7.9/analysis-analyzers.html
+- `excluded_meta_data`: Name of fields in Elasticsearch that should not be returned (e.g. [field_one, field_two]).
+Helpful if you have fields with long, irrelevant content that you don't want to display in results (e.g. embedding vectors).
+- `scheme`: 'https' or 'http', protocol used to connect to your elasticsearch instance
+- `ca_certs`: Root certificates for SSL: it is a path to certificate authority (CA) certs on disk. You can use certifi package with certifi.where() to find where the CA certs file is located in your machine.
+- `verify_certs`: Whether to be strict about ca certificates
+- `create_index`: Whether to try creating a new index (If the index of that name is already existing, we will just continue in any case)
+- `refresh_type`: Type of ES refresh used to control when changes made by a request (e.g. bulk) are made visible to search.
+If set to 'wait_for', continue only after changes are visible (slow, but safe).
+If set to 'false', continue directly (fast, but sometimes unintuitive behaviour when docs are not immediately available after ingestion).
+More info at https://www.elastic.co/guide/en/elasticsearch/reference/6.8/docs-refresh.html
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is
+more performant with DPR embeddings.
'cosine' is recommended if you are using a Sentence BERT model. +Note, that the use of efficient approximate vector calculations in OpenSearch is tied to embedding_field's data type which cannot be changed after creation. +You won't be able to use approximate vector calculations on an embedding_field which was created with a different similarity value. +In such cases a fallback to exact but slow vector calculations will happen and a warning will be displayed. +- `timeout`: Number of seconds after which an ElasticSearch request times out. +- `return_embedding`: To return document embedding +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `index_type`: The type of index to be created. Choose from 'flat' and 'hnsw'. +As OpenSearch currently does not support all similarity functions (e.g. dot_product) in exact vector similarity calculations, +we don't make use of exact vector similarity when index_type='flat'. Instead we use the same approximate vector similarity calculations like in 'hnsw', but further optimized for accuracy. +Exact vector similarity is only used as fallback when there's a mismatch between certain requested and indexed similarity types. +In these cases however, a warning will be displayed. See similarity param for more information. +- `scroll`: Determines how long the current index is fixed, e.g. during updating all documents with embeddings. +Defaults to "1d" and should not be larger than this. Can also be in minutes "5m" or hours "15h" +For details, see https://www.elastic.co/guide/en/elasticsearch/reference/current/scroll-api.html +- `skip_missing_embeddings`: Parameter to control queries based on vector similarity when indexed documents miss embeddings. 
+Parameter options: (True, False) +False: Raises exception if one or more documents do not have embeddings at query time +True: Query will ignore all documents without embeddings (recommended if you concurrently index and query) +- `synonyms`: List of synonyms can be passed while elasticsearch initialization. +For example: [ "foo, bar => baz", + "foozball , foosball" ] +More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-tokenfilter.html +- `synonym_type`: Synonym filter type can be passed. +Synonym or Synonym_graph to handle synonyms, including multi-word synonyms correctly during the analysis process. +More info at https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-synonym-graph-tokenfilter.html + + + +#### OpenSearchDocumentStore.query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. 
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return +- `index`: Index name for storing the docs and metadata +- `return_embedding`: To return document embedding +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. 
+
+
+
+## OpenDistroElasticsearchDocumentStore
+
+```python
+class OpenDistroElasticsearchDocumentStore(OpenSearchDocumentStore)
+```
+
+A DocumentStore which has an Open Distro for Elasticsearch service behind it.
+
+
+
+# Module memory
+
+
+
+## InMemoryDocumentStore
+
+```python
+class InMemoryDocumentStore(BaseDocumentStore)
+```
+
+In-memory document store
+
+
+
+#### InMemoryDocumentStore.\_\_init\_\_
+
+```python
+def __init__(index: str = "document", label_index: str = "label", embedding_field: Optional[str] = "embedding", embedding_dim: int = 768, return_embedding: bool = False, similarity: str = "dot_product", progress_bar: bool = True, duplicate_documents: str = "overwrite", use_gpu: bool = True, scoring_batch_size: int = 500000)
+```
+
+**Arguments**:
+
+- `index`: The documents are scoped to an index attribute that can be used when writing, querying,
+or deleting documents. This parameter sets the default value for document index.
+- `label_index`: The default value of index attribute for the labels.
+- `embedding_field`: Name of field containing an embedding vector (Only needed when using a dense retriever (e.g. DensePassageRetriever, EmbeddingRetriever) on top)
+- `embedding_dim`: The size of the embedding vector.
+- `return_embedding`: To return document embedding
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is
+more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence BERT model.
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicates documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `use_gpu`: Whether to use a GPU or the CPU for calculating embedding similarity.
+Falls back to CPU if no GPU is available.
+- `scoring_batch_size`: Batch size of documents to calculate similarity for. Very small batch sizes are inefficient.
+Very large batch sizes can overrun GPU memory. In general you want to make sure
+you have at least `embedding_dim`*`scoring_batch_size`*4 bytes available in GPU memory.
+Since the data is originally stored in CPU memory there is little risk of overrunning memory
+when running on CPU.
+
+
+
+#### InMemoryDocumentStore.write\_documents
+
+```python
+def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None)
+```
+
+Indexes documents for later queries.
+
+**Arguments**:
+
+- `documents`: a list of Python dictionaries or a list of Haystack Document objects.
+For documents as dictionaries, the format is {"content": ""}.
+ Optionally: Include meta data via {"content": "",
+ "meta": {"name": ", "author": "somebody", ...}}
+ It can be used for filtering and is accessible in the responses of the Finder.
+:param index: write documents to a custom namespace. For instance, documents for evaluation can be indexed in a
+ separate index than the documents for search.
+:param duplicate_documents: Handle duplicates document based on parameter options.
+ Parameter options : ( 'skip','overwrite','fail')
+ skip: Ignore the duplicates documents
+ overwrite: Update any existing documents with the same ID when adding documents.
+ fail: an error is raised if the document ID of the document being added already
+ exists.
+:raises DuplicateDocumentError: Exception trigger on duplicate document +:return: None + + + +#### InMemoryDocumentStore.write\_labels + +```python +def write_labels(labels: Union[List[dict], List[Label]], index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Write annotation labels into document store. + + + +#### InMemoryDocumentStore.get\_document\_by\_id + +```python +def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +``` + +Fetch a document by specifying its text id string. + + + +#### InMemoryDocumentStore.get\_documents\_by\_id + +```python +def get_documents_by_id(ids: List[str], index: Optional[str] = None) -> List[Document] +``` + +Fetch documents by specifying a list of text id strings. + + + +#### InMemoryDocumentStore.get\_scores\_torch + +```python +def get_scores_torch(query_emb: np.ndarray, document_to_search: List[Document]) -> List[float] +``` + +Calculate similarity scores between query embedding and a list of documents using torch. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `document_to_search`: List of documents to compare `query_emb` against. + + + +#### InMemoryDocumentStore.get\_scores\_numpy + +```python +def get_scores_numpy(query_emb: np.ndarray, document_to_search: List[Document]) -> List[float] +``` + +Calculate similarity scores between query embedding and a list of documents using numpy. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `document_to_search`: List of documents to compare `query_emb` against. 
+ + + +#### InMemoryDocumentStore.query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. +Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` +To use the same logical operator multiple times on the same level, logical operators take +optionally a list of dictionaries as value. 
+Example: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return +- `index`: Index name for storing the docs and metadata +- `return_embedding`: To return document embedding +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### InMemoryDocumentStore.update\_embeddings + +```python +def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000) +``` + +Updates the embeddings in the the document store using the encoding model specified in the retriever. + +This can be useful if want to add or change the embeddings for your documents (e.g. after changing the retriever config). + +**Arguments**: + +- `retriever`: Retriever to use to get embeddings for text +- `index`: Index name for which embeddings are to be updated. If set to None, the default self.index is used. +- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False, +only documents without embeddings are processed. This mode can be used for +incremental updating of embeddings, wherein, only newly indexed documents +get processed. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. +Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + +**Returns**: + +None + + + +#### InMemoryDocumentStore.get\_document\_count + +```python +def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of documents in the document store. + + + +#### InMemoryDocumentStore.update\_document\_meta + +```python +def update_document_meta(id: str, meta: Dict[str, Any], index: str = None) +``` + +Update the metadata dictionary of a document by specifying its string id. + +**Arguments**: + +- `id`: The ID of the Document whose metadata is being updated. +- `meta`: A dictionary with key-value pairs that should be added / changed for the provided Document ID. +- `index`: Name of the index the Document is located at. 
+ + + +#### InMemoryDocumentStore.get\_embedding\_count + +```python +def get_embedding_count(filters: Optional[Dict[str, List[str]]] = None, index: Optional[str] = None) -> int +``` + +Return the count of embeddings in the document store. + + + +#### InMemoryDocumentStore.get\_label\_count + +```python +def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of labels in the document store. + + + +#### InMemoryDocumentStore.get\_all\_documents + +```python +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get all documents from the document store as a list. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. + + + +#### InMemoryDocumentStore.get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get all documents from the document store. The methods returns a Python Generator that yields individual + +documents. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. + + + +#### InMemoryDocumentStore.get\_all\_labels + +```python +def get_all_labels(index: str = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) -> List[Label] +``` + +Return all labels in the document store. + + + +#### InMemoryDocumentStore.delete\_all\_documents + +```python +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + +**Returns**: + +None + + + +#### InMemoryDocumentStore.delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + +**Returns**: + +None + + + +#### InMemoryDocumentStore.delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### InMemoryDocumentStore.delete\_labels + +```python +def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete labels in an index. All labels are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the labels from. If None, the +DocumentStore's default label index (self.label_index) will be used. +- `ids`: Optional list of IDs to narrow down the labels to be deleted. +- `filters`: Narrow down the scope to documents that match the given filters. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+Example: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + +**Returns**: + +None + + + +# Module sql + + + +## SQLDocumentStore + +```python +class SQLDocumentStore(BaseDocumentStore) +``` + + + +#### SQLDocumentStore.\_\_init\_\_ + +```python +def __init__(url: str = "sqlite://", index: str = "document", label_index: str = "label", duplicate_documents: str = "overwrite", check_same_thread: bool = False, isolation_level: str = None) +``` + +An SQL backed DocumentStore. Currently supports SQLite, PostgreSQL and MySQL backends. + +**Arguments**: + +- `url`: URL for SQL database as expected by SQLAlchemy. More info here: https://docs.sqlalchemy.org/en/13/core/engines.html#database-urls +- `index`: The documents are scoped to an index attribute that can be used when writing, querying, or deleting documents. +This parameter sets the default value for document index. +- `label_index`: The default value of index attribute for the labels. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. 
+- `check_same_thread`: Set to False to mitigate multithreading issues in older SQLite versions (see https://docs.sqlalchemy.org/en/14/dialects/sqlite.html?highlight=check_same_thread#threading-pooling-behavior) +- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) + + + +#### SQLDocumentStore.get\_document\_by\_id + +```python +def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +``` + +Fetch a document by specifying its text id string + + + +#### SQLDocumentStore.get\_documents\_by\_id + +```python +def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Fetch documents by specifying a list of text id strings + + + +#### SQLDocumentStore.get\_documents\_by\_vector\_ids + +```python +def get_documents_by_vector_ids(vector_ids: List[str], index: Optional[str] = None, batch_size: int = 10_000) +``` + +Fetch documents by specifying a list of text vector id strings + + + +#### SQLDocumentStore.get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. 
+- `filters`: Optional filters to narrow down the documents to return. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### SQLDocumentStore.get\_all\_labels + +```python +def get_all_labels(index=None, filters: Optional[dict] = None, headers: Optional[Dict[str, str]] = None) +``` + +Return all labels in the document store + + + +#### SQLDocumentStore.write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> None +``` + +Indexes documents for later queries. + +**Arguments**: + +- `documents`: a list of Python dictionaries or a list of Haystack Document objects. +For documents as dictionaries, the format is {"text": ""}. +Optionally: Include meta data via {"text": "", +"meta":{"name": ", "author": "somebody", ...}} +It can be used for filtering and is accessible in the responses of the Finder. +- `index`: add an optional index attribute to documents. It can be later used for filtering. For instance, +documents for evaluation can be indexed in a separate index than the documents for search. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents +but is considerably slower (default). +fail: an error is raised if the document ID of the document being added already +exists. 
+ +**Returns**: + +None + + + +#### SQLDocumentStore.write\_labels + +```python +def write_labels(labels, index=None, headers: Optional[Dict[str, str]] = None) +``` + +Write annotation labels into document store. + + + +#### SQLDocumentStore.update\_vector\_ids + +```python +def update_vector_ids(vector_id_map: Dict[str, str], index: Optional[str] = None, batch_size: int = 10_000) +``` + +Update vector_ids for given document_ids. + +**Arguments**: + +- `vector_id_map`: dict containing mapping of document_id -> vector_id. +- `index`: filter documents by the optional index attribute for documents in database. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### SQLDocumentStore.reset\_vector\_ids + +```python +def reset_vector_ids(index: Optional[str] = None) +``` + +Set vector IDs for all documents as None + + + +#### SQLDocumentStore.update\_document\_meta + +```python +def update_document_meta(id: str, meta: Dict[str, str], index: str = None) +``` + +Update the metadata dictionary of a document by specifying its string id + + + +#### SQLDocumentStore.get\_document\_count + +```python +def get_document_count(filters: Optional[Dict[str, Any]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of documents in the document store. + + + +#### SQLDocumentStore.get\_label\_count + +```python +def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of labels in the document store + + + +#### SQLDocumentStore.delete\_all\_documents + +```python +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. 
+ +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the documents to be deleted. + +**Returns**: + +None + + + +#### SQLDocumentStore.delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. If None, the +DocumentStore's default index (self.index) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the documents to be deleted. +Example filters: {"name": ["some", "more"], "category": ["only_one"]}. +If filters are provided along with a list of IDs, this method deletes the +intersection of the two query results (documents that match the filters and +have their ID in the list). + +**Returns**: + +None + + + +#### SQLDocumentStore.delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### SQLDocumentStore.delete\_labels + +```python +def delete_labels(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete labels from the document store. All labels are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the labels from. If None, the +DocumentStore's default label index (self.label_index) will be used. +- `ids`: Optional list of IDs to narrow down the labels to be deleted. +- `filters`: Optional filters to narrow down the labels to be deleted. 
+Example filters: {"id": ["9a196e41-f7b5-45b4-bd19-5feb7501c159", "9a196e41-f7b5-45b4-bd19-5feb7501c159"]} or {"query": ["question2"]} + +**Returns**: + +None + + + +# Module faiss + + + +## FAISSDocumentStore + +```python +class FAISSDocumentStore(SQLDocumentStore) +``` + +Document store for very large scale embedding based dense retrievers like the DPR. + +It implements the FAISS library(https://github.com/facebookresearch/faiss) +to perform similarity search on vectors. + +The document text and meta-data (for filtering) are stored using the SQLDocumentStore, while +the vector embeddings are indexed in a FAISS Index. + + + +#### FAISSDocumentStore.\_\_init\_\_ + +```python +def __init__(sql_url: str = "sqlite:///faiss_document_store.db", vector_dim: int = None, embedding_dim: int = 768, faiss_index_factory_str: str = "Flat", faiss_index: Optional[faiss.swigfaiss.Index] = None, return_embedding: bool = False, index: str = "document", similarity: str = "dot_product", embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", faiss_index_path: Union[str, Path] = None, faiss_config_path: Union[str, Path] = None, isolation_level: str = None, n_links: int = 64, ef_search: int = 20, ef_construction: int = 80, validate_index_sync: bool = True) +``` + +**Arguments**: + +- `sql_url`: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale +deployment, Postgres is recommended. +- `vector_dim`: Deprecated. Use embedding_dim instead. +- `embedding_dim`: The embedding vector size. Default: 768. +- `faiss_index_factory_str`: Create a new FAISS index of the specified type. +The type is determined from the given string following the conventions +of the original FAISS index factory. +Recommended options: +- "Flat" (default): Best accuracy (= exact). Becomes slow and RAM intense for > 1 Mio docs. +- "HNSW": Graph-based heuristic. 
If not further specified,
+ we use the following config:
+ HNSW64, efConstruction=80 and efSearch=20
+- "IVFx,Flat": Inverted Index. Replace x with the number of centroids aka nlist.
+ Rule of thumb: nlist = 10 * sqrt (num_docs) is a good starting point.
+For more details see:
+- Overview of indices https://github.com/facebookresearch/faiss/wiki/Faiss-indexes
+- Guideline for choosing an index https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index
+- FAISS Index factory https://github.com/facebookresearch/faiss/wiki/The-index-factory
+Benchmarks: XXX
+- `faiss_index`: Pass an existing FAISS Index, i.e. an empty one that you configured manually
+or one with docs that you used in Haystack before and want to load again.
+- `return_embedding`: To return document embedding. Unlike other document stores, FAISS will return normalized embeddings
+- `index`: Name of index in document store to use.
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is
+more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence-Transformer model.
+In both cases, the returned values in Document.score are normalized to be in range [0,1]:
+For `dot_product`: expit(np.asarray(raw_score / 100))
+For `cosine`: (raw_score + 1) / 2
+- `embedding_field`: Name of field containing an embedding vector.
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicates documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `faiss_index_path`: Stored FAISS index file. Can be created via calling `save()`.
+If specified no other params besides faiss_config_path must be specified. +- `faiss_config_path`: Stored FAISS initial configuration parameters. +Can be created via calling `save()` +- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) +- `n_links`: used only if index_factory == "HNSW" +- `ef_search`: used only if index_factory == "HNSW" +- `ef_construction`: used only if index_factory == "HNSW" +- `validate_index_sync`: Whether to check that the document count equals the embedding count at initialization time + + + +#### FAISSDocumentStore.write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> None +``` + +Add new documents to the DocumentStore. + +**Arguments**: + +- `documents`: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index +them right away in FAISS. If not, you can later call update_embeddings() to create & index them. +- `index`: (SQL) index name for storing the docs and metadata +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. 
+
+**Raises**:
+
+- `DuplicateDocumentError`: Exception triggered on duplicate document
+
+**Returns**:
+
+None
+
+
+
+#### FAISSDocumentStore.update\_embeddings
+
+```python
+def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None, batch_size: int = 10_000)
+```
+
+Updates the embeddings in the document store using the encoding model specified in the retriever.
+
+This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).
+
+**Arguments**:
+
+- `retriever`: Retriever to use to get embeddings for text
+- `index`: Index name for which embeddings are to be updated. If set to None, the default self.index is used.
+- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False,
+only documents without embeddings are processed. This mode can be used for
+incremental updating of embeddings, wherein, only newly indexed documents
+get processed.
+- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
+Example: {"name": ["some", "more"], "category": ["only_one"]}
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+
+**Returns**:
+
+None
+
+
+
+#### FAISSDocumentStore.get\_all\_documents\_generator
+
+```python
+def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None]
+```
+
+Get all documents from the document store. Under-the-hood, documents are fetched in batches from the
+
+document store and yielded as individual documents. This method can be used to iteratively process
+a large number of documents without having to load all documents in memory.
+ +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `return_embedding`: Whether to return the document embeddings. Unlike other document stores, FAISS will return normalized embeddings +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### FAISSDocumentStore.get\_embedding\_count + +```python +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int +``` + +Return the count of embeddings in the document store. + + + +#### FAISSDocumentStore.train\_index + +```python +def train_index(documents: Optional[Union[List[dict], List[Document]]], embeddings: Optional[np.ndarray] = None, index: Optional[str] = None) +``` + +Some FAISS indices (e.g. IVF) require initial "training" on a sample of vectors before you can add your final vectors. + +The train vectors should come from the same distribution as your final ones. +You can pass either documents (incl. embeddings) or just the plain embeddings that the index shall be trained on. + +**Arguments**: + +- `documents`: Documents (incl. the embeddings) +- `embeddings`: Plain embeddings +- `index`: Name of the index to train. If None, the DocumentStore's default index (self.index) will be used. + +**Returns**: + +None + + + +#### FAISSDocumentStore.delete\_all\_documents + +```python +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete all documents from the document store. 
+ + + +#### FAISSDocumentStore.delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents from the document store. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the documents to be deleted. +Example filters: {"name": ["some", "more"], "category": ["only_one"]}. +If filters are provided along with a list of IDs, this method deletes the +intersection of the two query results (documents that match the filters and +have their ID in the list). + +**Returns**: + +None + + + +#### FAISSDocumentStore.delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### FAISSDocumentStore.query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `top_k`: How many documents to return +- `index`: Index name to query the document from. +- `return_embedding`: To return document embedding. 
Unlike other document stores, FAISS will return normalized embeddings +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### FAISSDocumentStore.save + +```python +def save(index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None) +``` + +Save FAISS Index to the specified file. + +**Arguments**: + +- `index_path`: Path to save the FAISS index to. +- `config_path`: Path to save the initial configuration parameters to. +Defaults to the same as the file path, save the extension (.json). +This file contains all the parameters passed to FAISSDocumentStore() +at creation time (for example the SQL path, embedding_dim, etc), and will be +used by the `load` method to restore the index with the appropriate configuration. + +**Returns**: + +None + + + +#### FAISSDocumentStore.load + +```python +@classmethod +def load(cls, index_path: Union[str, Path], config_path: Optional[Union[str, Path]] = None) +``` + +Load a saved FAISS index from a file and connect to the SQL database. + +Note: In order to have a correct mapping from FAISS to SQL, + make sure to use the same SQL DB that you used when calling `save()`. + +**Arguments**: + +- `index_path`: Stored FAISS index file. Can be created via calling `save()` +- `config_path`: Stored FAISS initial configuration parameters. +Can be created via calling `save()` + + + +# Module milvus1 + + + +## Milvus1DocumentStore + +```python +class Milvus1DocumentStore(SQLDocumentStore) +``` + +Milvus (https://milvus.io/) is a highly reliable, scalable Document Store specialized on storing and processing vectors. +Therefore, it is particularly suited for Haystack users that work with dense retrieval methods (like DPR). 
+In contrast to FAISS, Milvus ... + - runs as a separate service (e.g. a Docker container) and can scale easily in a distributed environment + - allows dynamic data management (i.e. you can insert/delete vectors without recreating the whole index) + - encapsulates multiple ANN libraries (FAISS, ANNOY ...) + +This class uses Milvus for all vector related storage, processing and querying. +The meta-data (e.g. for filtering) and the document text are however stored in a separate SQL Database as Milvus +does not allow these data types (yet). + +Usage: +1. Start a Milvus server (see https://milvus.io/docs/v1.0.0/install_milvus.md) +2. Run pip install farm-haystack[milvus1] +3. Init a MilvusDocumentStore in Haystack + + + +#### Milvus1DocumentStore.\_\_init\_\_ + +```python +def __init__(sql_url: str = "sqlite:///", milvus_url: str = "tcp://localhost:19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = None, embedding_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: IndexType = IndexType.FLAT, index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None) +``` + +**WARNING:** Milvus1DocumentStore is deprecated and will be removed in a future version. Please switch to Milvus2 + +or consider using another DocumentStore. + +**Arguments**: + +- `sql_url`: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale +deployment, Postgres is recommended. If using MySQL then same server can also be used for +Milvus metadata. For more details see https://milvus.io/docs/v1.0.0/data_manage.md. +- `milvus_url`: Milvus server connection URL for storing and processing vectors. +Protocol, host and port will automatically be inferred from the URL. 
+See https://milvus.io/docs/v1.0.0/install_milvus.md for instructions to start a Milvus instance.
+- `connection_pool`: Connection pool type to connect with Milvus server. Default: "SingletonThread".
+- `index`: Index name for text, embedding and metadata (in Milvus terms, this is the "collection name").
+- `vector_dim`: Deprecated. Use embedding_dim instead.
+- `embedding_dim`: The embedding vector size. Default: 768.
+- `index_file_size`: Specifies the size of each segment file that is stored by Milvus and its default value is 1024 MB.
+When the size of newly inserted vectors reaches the specified volume, Milvus packs these vectors into a new segment.
+Milvus creates one index file for each segment. When conducting a vector search, Milvus searches all index files one by one.
+As a rule of thumb, we would see a 30% ~ 50% increase in the search performance after changing the value of index_file_size from 1024 to 2048.
+Note that an overly large index_file_size value may cause failure to load a segment into the memory or graphics memory.
+(From https://milvus.io/docs/v1.0.0/performance_faq.md#How-can-I-get-the-best-performance-from-Milvus-through-setting-index_file_size)
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default and recommended for DPR embeddings.
+'cosine' is recommended for Sentence Transformers.
+- `index_type`: Type of approximate nearest neighbour (ANN) index used. The choice here determines your tradeoff between speed and accuracy.
+Some popular options:
+- FLAT (default): Exact method, slow
+- IVF_FLAT, inverted file based heuristic, fast
+- HNSW: Graph based, fast
+- ANNOY: Tree based, fast
+See: https://milvus.io/docs/v1.0.0/index.md
+- `index_param`: Configuration parameters for the chosen index_type needed at indexing time.
+For example: {"nlist": 16384} as the number of cluster units to create for index_type IVF_FLAT.
+See https://milvus.io/docs/v1.0.0/index.md
+- `search_param`: Configuration parameters for the chosen index_type needed at query time.
+For example: {"nprobe": 10} as the number of cluster units to query for index_type IVF_FLAT.
+See https://milvus.io/docs/v1.0.0/index.md
+- `return_embedding`: To return document embedding.
+- `embedding_field`: Name of field containing an embedding vector.
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicates documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level)
+
+
+
+#### Milvus1DocumentStore.write\_documents
+
+```python
+def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None, index_param: Optional[Dict[str, Any]] = None)
+```
+
+Add new documents to the DocumentStore.
+
+**Arguments**:
+
+- `documents`: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index
+them right away in Milvus. If not, you can later call update_embeddings() to create & index them.
+- `index`: (SQL) index name for storing the docs and metadata
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicates documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+
+**Raises**:
+
+- `DuplicateDocumentError`: Exception triggered on duplicate document
+
+**Returns**:
+
+None
+
+
+
+#### Milvus1DocumentStore.update\_embeddings
+
+```python
+def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None)
+```
+
+Updates the embeddings in the document store using the encoding model specified in the retriever.
+
+This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).
+
+**Arguments**:
+
+- `retriever`: Retriever to use to get embeddings for text
+- `index`: (SQL) index name for storing the docs and metadata
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False,
+only documents without embeddings are processed. This mode can be used for
+incremental updating of embeddings, wherein, only newly indexed documents
+get processed.
+- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
+Example: {"name": ["some", "more"], "category": ["only_one"]} + +**Returns**: + +None + + + +#### Milvus1DocumentStore.query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `top_k`: How many documents to return +- `index`: (SQL) index name for storing the docs and metadata +- `return_embedding`: To return document embedding +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + +**Returns**: + +list of Documents that are the most similar to `query_emb` + + + +#### Milvus1DocumentStore.delete\_all\_documents + +```python +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete all documents (from SQL AND Milvus). + +**Arguments**: + +- `index`: (SQL) index name for storing the docs and metadata +- `filters`: Optional filters to narrow down the search space. 
+Example: {"name": ["some", "more"], "category": ["only_one"]} + +**Returns**: + +None + + + +#### Milvus1DocumentStore.delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. If None, the +DocumentStore's default index (self.index) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the documents to be deleted. +Example filters: {"name": ["some", "more"], "category": ["only_one"]}. +If filters are provided along with a list of IDs, this method deletes the +intersection of the two query results (documents that match the filters and +have their ID in the list). + +**Returns**: + +None + + + +#### Milvus1DocumentStore.delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### Milvus1DocumentStore.get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get all documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. 
+- `filters`: Optional filters to narrow down the documents to return. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### Milvus1DocumentStore.get\_all\_documents + +```python +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get documents from the document store (optionally using filter criteria). + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### Milvus1DocumentStore.get\_document\_by\_id + +```python +def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +``` + +Fetch a document by specifying its text id string + +**Arguments**: + +- `id`: ID of the document +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. + + + +#### Milvus1DocumentStore.get\_documents\_by\_id + +```python +def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Fetch multiple documents by specifying their IDs (strings) + +**Arguments**: + +- `ids`: List of IDs of the documents +- `index`: Name of the index to get the documents from. 
If None, the +DocumentStore's default index (self.index) will be used. +- `batch_size`: is currently not used + + + +#### Milvus1DocumentStore.get\_all\_vectors + +```python +def get_all_vectors(index: Optional[str] = None) -> List[np.ndarray] +``` + +Helper function to dump all vectors stored in Milvus server. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. + +**Returns**: + +List[np.array]: List of vectors. + + + +#### Milvus1DocumentStore.get\_embedding\_count + +```python +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None) -> int +``` + +Return the count of embeddings in the document store. + + + +# Module milvus2 + + + +## Milvus2DocumentStore + +```python +class Milvus2DocumentStore(SQLDocumentStore) +``` + +Limitations: +Milvus 2.0 so far doesn't support the deletion of documents (https://github.com/milvus-io/milvus/issues/7130). +Therefore, delete_documents() and update_embeddings() won't work yet. + +Differences to 1.x: +Besides big architectural changes that impact performance and reliability 2.0 supports the filtering by scalar data types. +For Haystack users this means you can now run a query using vector similarity and filter for some meta data at the same time! +(See https://milvus.io/docs/v2.0.x/comparison.md for more details) + +Usage: +1. Start a Milvus service via docker (see https://milvus.io/docs/v2.0.x/install_standalone-docker.md) +2. Run pip install farm-haystack[milvus] +3. Init a MilvusDocumentStore() in Haystack + +Overview: +Milvus (https://milvus.io/) is a highly reliable, scalable Document Store specialized on storing and processing vectors. +Therefore, it is particularly suited for Haystack users that work with dense retrieval methods (like DPR). + +In contrast to FAISS, Milvus ... + - runs as a separate service (e.g. 
a Docker container) and can scale easily in a distributed environment + - allows dynamic data management (i.e. you can insert/delete vectors without recreating the whole index) + - encapsulates multiple ANN libraries (FAISS, ANNOY ...) + +This class uses Milvus for all vector related storage, processing and querying. +The meta-data (e.g. for filtering) and the document text are however stored in a separate SQL Database as Milvus +does not allow these data types (yet). + + + +#### Milvus2DocumentStore.\_\_init\_\_ + +```python +def __init__(sql_url: str = "sqlite:///", host: str = "localhost", port: str = "19530", connection_pool: str = "SingletonThread", index: str = "document", vector_dim: int = None, embedding_dim: int = 768, index_file_size: int = 1024, similarity: str = "dot_product", index_type: str = "IVF_FLAT", index_param: Optional[Dict[str, Any]] = None, search_param: Optional[Dict[str, Any]] = None, return_embedding: bool = False, embedding_field: str = "embedding", id_field: str = "id", custom_fields: Optional[List[Any]] = None, progress_bar: bool = True, duplicate_documents: str = "overwrite", isolation_level: str = None, consistency_level: int = 0, recreate_index: bool = False) +``` + +**Arguments**: + +- `sql_url`: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale +deployment, Postgres is recommended. If using MySQL then same server can also be used for +Milvus metadata. For more details see https://milvus.io/docs/v1.1.0/data_manage.md. +- `milvus_url`: Milvus server connection URL for storing and processing vectors. +Protocol, host and port will automatically be inferred from the URL. +See https://milvus.io/docs/v2.0.x/install_standalone-docker.md for instructions to start a Milvus instance. +- `connection_pool`: Connection pool type to connect with Milvus server. Default: "SingletonThread". 
+- `index`: Index name for text, embedding and metadata (in Milvus terms, this is the "collection name").
+- `vector_dim`: Deprecated. Use embedding_dim instead.
+- `embedding_dim`: The embedding vector size. Default: 768.
+- `index_file_size`: Specifies the size of each segment file that is stored by Milvus and its default value is 1024 MB.
+When the size of newly inserted vectors reaches the specified volume, Milvus packs these vectors into a new segment.
+Milvus creates one index file for each segment. When conducting a vector search, Milvus searches all index files one by one.
+As a rule of thumb, we would see a 30% ~ 50% increase in the search performance after changing the value of index_file_size from 1024 to 2048.
+Note that an overly large index_file_size value may cause failure to load a segment into the memory or graphics memory.
+(From https://milvus.io/docs/v2.0.x/performance_faq.md)
+- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default and recommended for DPR embeddings.
+'cosine' is recommended for Sentence Transformers, but is not directly supported by Milvus.
+However, you can normalize your embeddings and use `dot_product` to get the same results.
+See https://milvus.io/docs/v2.0.x/metric.md.
+- `index_type`: Type of approximate nearest neighbour (ANN) index used. The choice here determines your tradeoff between speed and accuracy.
+Some popular options:
+- FLAT (default): Exact method, slow
+- IVF_FLAT, inverted file based heuristic, fast
+- HNSW: Graph based, fast
+- ANNOY: Tree based, fast
+See: https://milvus.io/docs/v2.0.x/index.md
+- `index_param`: Configuration parameters for the chosen index_type needed at indexing time.
+For example: {"nlist": 16384} as the number of cluster units to create for index_type IVF_FLAT.
+See https://milvus.io/docs/v2.0.x/index.md +- `search_param`: Configuration parameters for the chose index_type needed at query time +For example: {"nprobe": 10} as the number of cluster units to query for index_type IVF_FLAT. +See https://milvus.io/docs/v2.0.x/index.md +- `return_embedding`: To return document embedding. +- `embedding_field`: Name of field containing an embedding vector. +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `isolation_level`: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) +- `recreate_index`: If set to True, an existing Milvus index will be deleted and a new one will be +created using the config you are using for initialization. Be aware that all data in the old index will be +lost if you choose to recreate the index. Be aware that both the document_index and the label_index will +be recreated. + + + +#### Milvus2DocumentStore.write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None, index_param: Optional[Dict[str, Any]] = None) +``` + +Add new documents to the DocumentStore. + +**Arguments**: + +- `documents`: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index +them right away in Milvus. If not, you can later call `update_embeddings()` to create & index them. 
+- `index`: (SQL) index name for storing the docs and metadata
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicates documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already
+exists.
+
+**Raises**:
+
+- `DuplicateDocumentError`: Exception triggered on duplicate document
+
+
+
+#### Milvus2DocumentStore.update\_embeddings
+
+```python
+def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, batch_size: int = 10_000, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Any]] = None)
+```
+
+Updates the embeddings in the document store using the encoding model specified in the retriever.
+
+This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the retriever config).
+
+**Arguments**:
+
+- `retriever`: Retriever to use to get embeddings for text
+- `index`: (SQL) index name for storing the docs and metadata
+- `batch_size`: When working with large number of documents, batching can help reduce memory footprint.
+- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to False,
+only documents without embeddings are processed. This mode can be used for
+incremental updating of embeddings, wherein, only newly indexed documents
+get processed.
+- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated.
+Example: {"name": ["some", "more"], "category": ["only_one"]} + +**Returns**: + +None + + + +#### Milvus2DocumentStore.query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Any]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `top_k`: How many documents to return +- `index`: (SQL) index name for storing the docs and metadata +- `return_embedding`: To return document embedding +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### Milvus2DocumentStore.delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Any]] = None, headers: Optional[Dict[str, str]] = None, batch_size: int = 10_000) +``` + +Delete all documents (from SQL AND Milvus). + +**Arguments**: + +- `index`: (SQL) index name for storing the docs and metadata +- `filters`: Optional filters to narrow down the search space. +Example: {"name": ["some", "more"], "category": ["only_one"]} + +**Returns**: + +None + + + +#### Milvus2DocumentStore.delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. 
+ +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### Milvus2DocumentStore.get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get all documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### Milvus2DocumentStore.get\_all\_documents + +```python +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Any]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get documents from the document store (optionally using filter criteria). + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents to return. +Example: {"name": ["some", "more"], "category": ["only_one"]} +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. 
+ + + +#### Milvus2DocumentStore.get\_document\_by\_id + +```python +def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +``` + +Fetch a document by specifying its text id string + +**Arguments**: + +- `id`: ID of the document +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. + + + +#### Milvus2DocumentStore.get\_documents\_by\_id + +```python +def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Fetch multiple documents by specifying their IDs (strings) + +**Arguments**: + +- `ids`: List of IDs of the documents +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### Milvus2DocumentStore.get\_embedding\_count + +```python +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, List[str]]] = None) -> int +``` + +Return the count of embeddings in the document store. + + + +# Module weaviate + + + +## WeaviateDocumentStore + +```python +class WeaviateDocumentStore(BaseDocumentStore) +``` + +Weaviate is a cloud-native, modular, real-time vector search engine built to scale your machine learning models. +(See https://weaviate.io/developers/weaviate/current/index.html#what-is-weaviate) + +Some of the key differences in contrast to FAISS & Milvus: +1. Stores everything in one place: documents, meta data and vectors - so less network overhead when scaling this up +2. Allows combination of vector search and scalar filtering, i.e. you can filter for a certain tag and do dense retrieval on that subset +3. Has less variety of ANN algorithms, as of now only HNSW. +4. Requires document ids to be in uuid-format. 
If wrongly formatted ids are provided at indexing time they will be replaced with uuids automatically. + +Weaviate python client is used to connect to the server, more details are here +https://weaviate-python-client.readthedocs.io/en/docs/weaviate.html + +Usage: +1. Start a Weaviate server (see https://weaviate.io/developers/weaviate/current/getting-started/installation.html) +2. Init a WeaviateDocumentStore in Haystack + +Limitations: +The current implementation is not supporting the storage of labels, so you cannot run any evaluation workflows. + + + +#### WeaviateDocumentStore.\_\_init\_\_ + +```python +def __init__(host: Union[str, List[str]] = "http://localhost", port: Union[int, List[int]] = 8080, timeout_config: tuple = (5, 15), username: str = None, password: str = None, index: str = "Document", embedding_dim: int = 768, content_field: str = "content", name_field: str = "name", similarity: str = "cosine", index_type: str = "hnsw", custom_schema: Optional[dict] = None, return_embedding: bool = False, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False) +``` + +**Arguments**: + +- `host`: Weaviate server connection URL for storing and processing documents and vectors. +For more details, refer "https://weaviate.io/developers/weaviate/current/getting-started/installation.html" +- `port`: port of Weaviate instance +- `timeout_config`: Weaviate Timeout config as a tuple of (retries, time out seconds). +- `username`: username (standard authentication via http_auth) +- `password`: password (standard authentication via http_auth) +- `index`: Index name for document text, embedding and metadata (in Weaviate terminology, this is a "Class" in Weaviate schema). +- `embedding_dim`: The embedding vector size. Default: 768. +- `content_field`: Name of field that might contain the answer and will therefore be passed to the Reader Model (e.g. "full_text"). +If no Reader is used (e.g. 
in FAQ-Style QA) the plain content of this field will just be returned.
+- `name_field`: Name of field that contains the title of the doc
+- `similarity`: The similarity function used to compare document vectors. Available options are 'cosine' (default), 'dot_product' and 'l2'.
+'cosine' is recommended for Sentence Transformers.
+- `index_type`: Index type of any vector object defined in weaviate schema. The vector index type is pluggable.
+Currently, HNSW is only supported.
+See: https://weaviate.io/developers/weaviate/current/more-resources/performance.html
+- `custom_schema`: Allows to create custom schema in Weaviate, for more details
+See https://weaviate.io/developers/weaviate/current/data-schema/schema-configuration.html
+- `module_name`: Vectorization module to convert data into vectors. Default is "text2vec-transformers"
+For more details, See https://weaviate.io/developers/weaviate/current/modules/
+- `return_embedding`: To return document embedding.
+- `embedding_field`: Name of field containing an embedding vector.
+- `progress_bar`: Whether to show a tqdm progress bar or not.
+Can be helpful to disable in production deployments to keep the logs clean.
+- `duplicate_documents`: Handle duplicates document based on parameter options.
+Parameter options : ( 'skip','overwrite','fail')
+skip: Ignore the duplicates documents
+overwrite: Update any existing documents with the same ID when adding documents.
+fail: an error is raised if the document ID of the document being added already exists.
+- `recreate_index`: If set to True, an existing Weaviate index will be deleted and a new one will be
+created using the config you are using for initialization. Be aware that all data in the old index will be
+lost if you choose to recreate the index.
+ + + +#### WeaviateDocumentStore.get\_document\_by\_id + +```python +def get_document_by_id(id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> Optional[Document] +``` + +Fetch a document by specifying its uuid string + + + +#### WeaviateDocumentStore.get\_documents\_by\_id + +```python +def get_documents_by_id(ids: List[str], index: Optional[str] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Fetch documents by specifying a list of uuid strings. + + + +#### WeaviateDocumentStore.write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Add new documents to the DocumentStore. + +**Arguments**: + +- `documents`: List of `Dicts` or List of `Documents`. A dummy embedding vector for each document is automatically generated if it is not provided. The document id needs to be in uuid format. Otherwise a correctly formatted uuid will be automatically generated based on the provided id. +- `index`: index name for storing the docs and metadata +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. 
+ +**Raises**: + +- `DuplicateDocumentError`: Exception trigger on duplicate document + +**Returns**: + +None + + + +#### WeaviateDocumentStore.update\_document\_meta + +```python +def update_document_meta(id: str, meta: Dict[str, Union[List, str, int, float, bool]], index: str = None) +``` + +Update the metadata dictionary of a document by specifying its string id. +Overwrites only the specified fields, the unspecified ones remain unchanged. + + + +#### WeaviateDocumentStore.get\_embedding\_count + +```python +def get_embedding_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None) -> int +``` + +Return the number of embeddings in the document store, which is the same as the number of documents since +every document has a default embedding. + + + +#### WeaviateDocumentStore.get\_document\_count + +```python +def get_document_count(filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, index: Optional[str] = None, only_documents_without_embedding: bool = False, headers: Optional[Dict[str, str]] = None) -> int +``` + +Return the number of documents in the document store. + + + +#### WeaviateDocumentStore.get\_all\_documents + +```python +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get documents from the document store. + +Note this limitation from the changelog of Weaviate 1.8.0: + +.. quote:: + Due to the increasing cost of each page outlined above, there is a limit to + how many objects can be retrieved using pagination. By default setting the sum + of offset and limit to higher than 10,000 objects, will lead to an error. + If you must retrieve more than 10,000 objects, you can increase this limit by + setting the environment variable `QUERY_MAXIMUM_RESULTS=`. 
+ + Warning: Setting this to arbitrarily high values can make the memory consumption + of a single query explode and single queries can slow down the entire cluster. + We recommend setting this value to the lowest possible value that does not + interfere with your users' expectations. + +(https://github.com/semi-technologies/weaviate/releases/tag/v1.8.0) + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. 
+ + + +#### WeaviateDocumentStore.get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +Note this limitation from the changelog of Weaviate 1.8.0: + +.. quote:: + Due to the increasing cost of each page outlined above, there is a limit to + how many objects can be retrieved using pagination. By default setting the sum + of offset and limit to higher than 10,000 objects, will lead to an error. + If you must retrieve more than 10,000 objects, you can increase this limit by + setting the environment variable `QUERY_MAXIMUM_RESULTS=`. + + Warning: Setting this to arbitrarily high values can make the memory consumption + of a single query explode and single queries can slow down the entire cluster. + We recommend setting this value to the lowest possible value that does not + interfere with your users' expectations. + +(https://github.com/semi-technologies/weaviate/releases/tag/v1.8.0) + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. 
+Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + + + +#### WeaviateDocumentStore.query + +```python +def query(query: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, all_terms_must_match: bool = False, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query as defined by Weaviate semantic search. + +**Arguments**: + +- `query`: The query +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. 
Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `all_terms_must_match`: Not used in Weaviate. +- `custom_query`: Custom query that will executed using query.raw method, for more details refer +https://weaviate.io/developers/weaviate/current/graphql-references/filters.html +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Not used in Weaviate. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. 
cosine or dot_product) will be used. + + + +#### WeaviateDocumentStore.query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return +- `index`: index name for storing the docs and metadata +- `return_embedding`: To return document embedding +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### WeaviateDocumentStore.update\_embeddings + +```python +def update_embeddings(retriever, index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, update_existing_embeddings: bool = True, batch_size: int = 10_000) +``` + +Updates the embeddings in the the document store using the encoding model specified in the retriever. + +This can be useful if want to change the embeddings for your documents (e.g. after changing the retriever config). + +**Arguments**: + +- `retriever`: Retriever to use to update the embeddings. 
+- `index`: Index name to update +- `update_existing_embeddings`: Weaviate mandates an embedding while creating the document itself. +This option must be always true for weaviate and it will update the embeddings for all the documents. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. + +**Returns**: + +None + + + +#### WeaviateDocumentStore.delete\_all\_documents + +```python +def delete_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. 
+Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + +**Returns**: + +None + + + +#### WeaviateDocumentStore.delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents in an index. All documents are deleted if no filters are passed. + +**Arguments**: + +- `index`: Index name to delete the document from. If None, the +DocumentStore's default index (self.index) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. 
+Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` + If filters are provided along with a list of IDs, this method deletes the + intersection of the two query results (documents that match the filters and + have their ID in the list). + +**Returns**: + +None + + + +#### WeaviateDocumentStore.delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### WeaviateDocumentStore.delete\_labels + +```python +def delete_labels() +``` + +Implemented to respect BaseDocumentStore's contract. + +Weaviate does not support labels (yet). + + + +#### WeaviateDocumentStore.get\_all\_labels + +```python +def get_all_labels() +``` + +Implemented to respect BaseDocumentStore's contract. + +Weaviate does not support labels (yet). + + + +#### WeaviateDocumentStore.get\_label\_count + +```python +def get_label_count() +``` + +Implemented to respect BaseDocumentStore's contract. + +Weaviate does not support labels (yet). + + + +#### WeaviateDocumentStore.write\_labels + +```python +def write_labels() +``` + +Implemented to respect BaseDocumentStore's contract. + +Weaviate does not support labels (yet). 
+ + + +# Module graphdb + + + +## GraphDBKnowledgeGraph + +```python +class GraphDBKnowledgeGraph(BaseKnowledgeGraph) +``` + +Knowledge graph store that runs on a GraphDB instance. + + + +#### GraphDBKnowledgeGraph.\_\_init\_\_ + +```python +def __init__(host: str = "localhost", port: int = 7200, username: str = "", password: str = "", index: Optional[str] = None, prefixes: str = "") +``` + +Init the knowledge graph by defining the settings to connect with a GraphDB instance + +**Arguments**: + +- `host`: address of server where the GraphDB instance is running +- `port`: port where the GraphDB instance is running +- `username`: username to login to the GraphDB instance (if any) +- `password`: password to login to the GraphDB instance (if any) +- `index`: name of the index (also called repository) stored in the GraphDB instance +- `prefixes`: definitions of namespaces with a new line after each namespace, e.g., PREFIX hp: + + + +#### GraphDBKnowledgeGraph.create\_index + +```python +def create_index(config_path: Path, headers: Optional[Dict[str, str]] = None) +``` + +Create a new index (also called repository) stored in the GraphDB instance + +**Arguments**: + +- `config_path`: path to a .ttl file with configuration settings, details: +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +https://graphdb.ontotext.com/documentation/free/configuring-a-repository.html#configure-a-repository-programmatically + + + +#### GraphDBKnowledgeGraph.delete\_index + +```python +def delete_index(headers: Optional[Dict[str, str]] = None) +``` + +Delete the index that GraphDBKnowledgeGraph is connected to. This method deletes all data stored in the index. + +**Arguments**: + +- `headers`: Custom HTTP headers to pass to http client (e.g. 
{'Authorization': 'Basic YWRtaW46cm9vdA=='}) + + + +#### GraphDBKnowledgeGraph.import\_from\_ttl\_file + +```python +def import_from_ttl_file(index: str, path: Path, headers: Optional[Dict[str, str]] = None) +``` + +Load an existing knowledge graph represented in the form of triples of subject, predicate, and object from a .ttl file into an index of GraphDB + +**Arguments**: + +- `index`: name of the index (also called repository) in the GraphDB instance where the imported triples shall be stored +- `path`: path to a .ttl file containing a knowledge graph +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + + + +#### GraphDBKnowledgeGraph.get\_all\_triples + +```python +def get_all_triples(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Query the given index in the GraphDB instance for all its stored triples. Duplicates are not filtered. + +**Arguments**: + +- `index`: name of the index (also called repository) in the GraphDB instance +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + +**Returns**: + +all triples stored in the index + + + +#### GraphDBKnowledgeGraph.get\_all\_subjects + +```python +def get_all_subjects(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Query the given index in the GraphDB instance for all its stored subjects. Duplicates are not filtered. + +**Arguments**: + +- `index`: name of the index (also called repository) in the GraphDB instance +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + +**Returns**: + +all subjects stored in the index + + + +#### GraphDBKnowledgeGraph.get\_all\_predicates + +```python +def get_all_predicates(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Query the given index in the GraphDB instance for all its stored predicates. Duplicates are not filtered. 
+ +**Arguments**: + +- `index`: name of the index (also called repository) in the GraphDB instance +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + +**Returns**: + +all predicates stored in the index + + + +#### GraphDBKnowledgeGraph.get\_all\_objects + +```python +def get_all_objects(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Query the given index in the GraphDB instance for all its stored objects. Duplicates are not filtered. + +**Arguments**: + +- `index`: name of the index (also called repository) in the GraphDB instance +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + +**Returns**: + +all objects stored in the index + + + +#### GraphDBKnowledgeGraph.query + +```python +def query(sparql_query: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Execute a SPARQL query on the given index in the GraphDB instance + +**Arguments**: + +- `sparql_query`: SPARQL query that shall be executed +- `index`: name of the index (also called repository) in the GraphDB instance +- `headers`: Custom HTTP headers to pass to http client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) + +**Returns**: + +query result + + + +# Module deepsetcloud + + + +#### disable\_and\_log + +```python +def disable_and_log(func) +``` + +Decorator to disable write operation, shows warning and inputs instead. 
+ + + +## DeepsetCloudDocumentStore + +```python +class DeepsetCloudDocumentStore(KeywordDocumentStore) +``` + + + +#### DeepsetCloudDocumentStore.\_\_init\_\_ + +```python +def __init__(api_key: str = None, workspace: str = "default", index: Optional[str] = None, duplicate_documents: str = "overwrite", api_endpoint: Optional[str] = None, similarity: str = "dot_product", return_embedding: bool = False, label_index: str = "default", embedding_dim: int = 768) +``` + +A DocumentStore facade enabling you to interact with the documents stored in deepset Cloud. + +Thus you can run experiments like trying new nodes, pipelines, etc. without having to index your data again. + +You can also use this DocumentStore to create new pipelines on deepset Cloud. To do that, take the following +steps: + +- create a new DeepsetCloudDocumentStore without an index (e.g. `DeepsetCloudDocumentStore()`) +- create query and indexing pipelines using this DocumentStore +- call `Pipeline.save_to_deepset_cloud()` passing the pipelines and a `pipeline_config_name` +- call `Pipeline.deploy_on_deepset_cloud()` passing the `pipeline_config_name` + +DeepsetCloudDocumentStore is not intended for use in production-like scenarios. +See [https://haystack.deepset.ai/components/document-store](https://haystack.deepset.ai/components/document-store) +for more information. + +**Arguments**: + +- `api_key`: Secret value of the API key. +If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable. +See docs on how to generate an API key for your workspace: https://docs.cloud.deepset.ai/docs/connect-deepset-cloud-to-your-application +- `workspace`: workspace name in deepset Cloud +- `index`: name of the index to access within the deepset Cloud workspace. This equals typically the name of +your pipeline. You can run Pipeline.list_pipelines_on_deepset_cloud() to see all available ones. +If you set index to `None`, this DocumentStore will always return empty results. 
+This is especially useful if you want to create a new Pipeline within deepset Cloud +(see `Pipeline.save_to_deepset_cloud()` and `Pipeline.deploy_on_deepset_cloud()`). +- `duplicate_documents`: Handle duplicate documents based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicate documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `api_endpoint`: The URL of the deepset Cloud API. +If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable. +If DEEPSET_CLOUD_API_ENDPOINT environment variable is not specified either, defaults to "https://api.cloud.deepset.ai/api/v1". +- `similarity`: The similarity function used to compare document vectors. 'dot_product' is the default since it is +more performant with DPR embeddings. 'cosine' is recommended if you are using a Sentence Transformer model. +- `label_index`: index for the evaluation set interface +- `return_embedding`: To return document embedding. +- `embedding_dim`: Specifies the dimensionality of the embedding vector (only needed when using a dense retriever, for example, DensePassageRetriever or EmbeddingRetriever, on top). + + + +#### DeepsetCloudDocumentStore.get\_all\_documents + +```python +def get_all_documents(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> List[Document] +``` + +Get documents from the document store. + +**Arguments**: + +- `index`: Name of the index to get the documents from. If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. 
The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: Number of documents that are passed to bulk function at a time. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +#### DeepsetCloudDocumentStore.get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 10_000, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. 
If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +#### DeepsetCloudDocumentStore.query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. 
+ +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR) +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
+ + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return +- `index`: Index name for storing the docs and metadata +- `return_embedding`: To return document embedding +- `headers`: Custom HTTP headers to pass to requests +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### DeepsetCloudDocumentStore.query + +```python +def query(query: Optional[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, custom_query: Optional[str] = None, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None, all_terms_must_match: bool = False, scale_score: bool = True) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number of documents + +that are most relevant to the query as defined by the BM25 algorithm. + +**Arguments**: + +- `query`: The query +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. 
+If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `custom_query`: Custom query to be executed. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to requests +- `all_terms_must_match`: Whether all terms of the query must match the document. +If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). +Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). +Defaults to False. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. 
cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### DeepsetCloudDocumentStore.write\_documents + +```python +@disable_and_log +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 10_000, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Indexes documents for later queries. + +**Arguments**: + +- `documents`: a list of Python dictionaries or a list of Haystack Document objects. +For documents as dictionaries, the format is {"text": ""}. +Optionally: Include meta data via {"text": "", +"meta":{"name": ", "author": "somebody", ...}} +It can be used for filtering and is accessible in the responses of the Finder. +- `index`: Optional name of index where the documents shall be written to. +If None, the DocumentStore's default index (self.index) will be used. +- `batch_size`: Number of documents that are passed to bulk function at a time. +- `duplicate_documents`: Handle duplicates document based on parameter options. +Parameter options : ( 'skip','overwrite','fail') +skip: Ignore the duplicates documents +overwrite: Update any existing documents with the same ID when adding documents. +fail: an error is raised if the document ID of the document being added already +exists. +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + +**Returns**: + +None + + + +#### DeepsetCloudDocumentStore.update\_document\_meta + +```python +@disable_and_log +def update_document_meta(id: str, meta: Dict[str, Any], index: str = None) +``` + +Update the metadata dictionary of a document by specifying its string id. + +**Arguments**: + +- `id`: The ID of the Document whose metadata is being updated. 
+- `meta`: A dictionary with key-value pairs that should be added / changed for the provided Document ID. +- `index`: Name of the index the Document is located at. + + + +#### DeepsetCloudDocumentStore.get\_evaluation\_sets + +```python +def get_evaluation_sets() -> List[dict] +``` + +Returns a list of uploaded evaluation sets to deepset cloud. + +**Returns**: + +list of evaluation sets as dicts +These contain ("name", "evaluation_set_id", "created_at", "matched_labels", "total_labels") as fields. + + + +#### DeepsetCloudDocumentStore.get\_all\_labels + +```python +def get_all_labels(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) -> List[Label] +``` + +Returns a list of labels for the given index name. + +**Arguments**: + +- `index`: Optional name of evaluation set for which labels should be searched. +If None, the DocumentStore's default label_index (self.label_index) will be used. +- `headers`: Not supported. + +**Returns**: + +list of Labels. + + + +#### DeepsetCloudDocumentStore.get\_label\_count + +```python +def get_label_count(index: Optional[str] = None, headers: Optional[Dict[str, str]] = None) -> int +``` + +Counts the number of labels for the given index and returns the value. + +**Arguments**: + +- `index`: Optional evaluation set name for which the labels should be counted. +If None, the DocumentStore's default label_index (self.label_index) will be used. +- `headers`: Not supported. + +**Returns**: + +number of labels for the given index + + + +# Module pinecone + + + +## PineconeDocumentStore + +```python +class PineconeDocumentStore(SQLDocumentStore) +``` + +Document store for very large scale embedding based dense retrievers like the DPR. This is a hosted document store, +this means that your vectors will not be stored locally but in the cloud. This means that the similarity +search will be run on the cloud as well. 
+ +It implements the Pinecone vector database ([https://www.pinecone.io](https://www.pinecone.io)) +to perform similarity search on vectors. In order to use this document store, you need an API key that you can +obtain by creating an account on the [Pinecone website](https://www.pinecone.io). + +The document text is stored using the SQLDocumentStore, while +the vector embeddings and metadata (for filtering) are indexed in a Pinecone Index. + + + +#### PineconeDocumentStore.\_\_init\_\_ + +```python +def __init__(api_key: str, environment: str = "us-west1-gcp", sql_url: str = "sqlite:///pinecone_document_store.db", pinecone_index: Optional[pinecone.Index] = None, embedding_dim: int = 768, return_embedding: bool = False, index: str = "document", similarity: str = "cosine", replicas: int = 1, shards: int = 1, embedding_field: str = "embedding", progress_bar: bool = True, duplicate_documents: str = "overwrite", recreate_index: bool = False, metadata_config: dict = {"indexed": []}, validate_index_sync: bool = True) +``` + +**Arguments**: + +- `api_key`: Pinecone vector database API key ([https://app.pinecone.io](https://app.pinecone.io)). +- `environment`: Pinecone cloud environment uses `"us-west1-gcp"` by default. Other GCP and AWS regions are +supported, contact Pinecone [here](https://www.pinecone.io/contact/) if required. +- `sql_url`: SQL connection URL for database. It defaults to local file based SQLite DB. For large scale +deployment, Postgres is recommended. +- `pinecone_index`: pinecone-client Index object, an index will be initialized or loaded if not specified. +- `embedding_dim`: The embedding vector size. +- `return_embedding`: Whether to return document embeddings. +- `index`: Name of index in document store to use. +- `similarity`: The similarity function used to compare document vectors. `"cosine"` is the default +and is recommended if you are using a Sentence-Transformer model. `"dot_product"` is more performant +with DPR embeddings. 
+In both cases, the returned values in Document.score are normalized to be in range [0,1]: + - For `"dot_product"`: `expit(np.asarray(raw_score / 100))` + - For `"cosine"`: `(raw_score + 1) / 2` +- `replicas`: The number of replicas. Replicas duplicate the index. They provide higher availability and +throughput. +- `shards`: The number of shards to be used in the index. We recommend to use 1 shard per 1GB of data. +- `embedding_field`: Name of field containing an embedding vector. +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_documents`: Handle duplicate documents based on parameter options.\ +Parameter options: + - `"skip"`: Ignore the duplicate documents. + - `"overwrite"`: Update any existing documents with the same ID when adding documents. + - `"fail"`: An error is raised if the document ID of the document being added already exists. +- `recreate_index`: If set to True, an existing Pinecone index will be deleted and a new one will be +created using the config you are using for initialization. Be aware that all data in the old index will be +lost if you choose to recreate the index. Be aware that both the document_index and the label_index will +be recreated. +- `metadata_config`: Which metadata fields should be indexed. Should be in the format +`{"indexed": ["metadata-field-1", "metadata-field-2", "metadata-field-n"]}`. +Indexing metadata fields is a prerequisite to allow filtering of documents by metadata values. +- `validate_index_sync`: Whether to check that the document count equals the embedding count at initialization time + + + +#### PineconeDocumentStore.write\_documents + +```python +def write_documents(documents: Union[List[dict], List[Document]], index: Optional[str] = None, batch_size: int = 32, duplicate_documents: Optional[str] = None, headers: Optional[Dict[str, str]] = None) +``` + +Add new documents to the DocumentStore. 
+ +**Arguments**: + +- `documents`: List of `Dicts` or list of `Documents`. If they already contain embeddings, we'll index them +right away in Pinecone. If not, you can later call `update_embeddings()` to create & index them. +- `index`: Index name for storing the docs and metadata. +- `batch_size`: Number of documents to process at a time. When working with large number of documents, +batching can help to reduce the memory footprint. +- `duplicate_documents`: handle duplicate documents based on parameter options. +Parameter options: + - `"skip"`: Ignore the duplicate documents. + - `"overwrite"`: Update any existing documents with the same ID when adding documents. + - `"fail"`: An error is raised if the document ID of the document being added already exists. +- `headers`: PineconeDocumentStore does not support headers. + +**Raises**: + +- `DuplicateDocumentError`: Exception trigger on duplicate document. + + + +#### PineconeDocumentStore.update\_embeddings + +```python +def update_embeddings(retriever: "BaseRetriever", index: Optional[str] = None, update_existing_embeddings: bool = True, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, batch_size: int = 32) +``` + +Updates the embeddings in the document store using the encoding model specified in the retriever. + +This can be useful if you want to add or change the embeddings for your documents (e.g. after changing the +retriever config). + +**Arguments**: + +- `retriever`: Retriever to use to get embeddings for text. +- `index`: Index name for which embeddings are to be updated. If set to `None`, the default `self.index` is +used. +- `update_existing_embeddings`: Whether to update existing embeddings of the documents. If set to `False`, +only documents without embeddings are processed. This mode can be used for incremental updating of +embeddings, wherein, only newly indexed documents get processed. 
+- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `batch_size`: Number of documents to process at a time. When working with large number of documents, +batching can help reduce memory footprint. + + + +#### PineconeDocumentStore.get\_all\_documents\_generator + +```python +def get_all_documents_generator(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, return_embedding: Optional[bool] = None, batch_size: int = 32, headers: Optional[Dict[str, str]] = None) -> Generator[Document, None, None] +``` + +Get all documents from the document store. Under-the-hood, documents are fetched in batches from the + +document store and yielded as individual documents. This method can be used to iteratively process +a large number of documents without having to load all documents in memory. + +**Arguments**: + +- `index`: Name of the index to get the documents from. 
If None, the +DocumentStore's default index (self.index) will be used. +- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `return_embedding`: Whether to return the document embeddings. +- `batch_size`: When working with large number of documents, batching can help reduce memory footprint. +- `headers`: PineconeDocumentStore does not support headers. + + + +#### PineconeDocumentStore.get\_embedding\_count + +```python +def get_embedding_count(index: Optional[str] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None) -> int +``` + +Return the count of embeddings in the document store. 
+ + + +#### PineconeDocumentStore.update\_document\_meta + +```python +def update_document_meta(id: str, meta: Dict[str, str], index: str = None) +``` + +Update the metadata dictionary of a document by specifying its string id + + + +#### PineconeDocumentStore.delete\_documents + +```python +def delete_documents(index: Optional[str] = None, ids: Optional[List[str]] = None, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, headers: Optional[Dict[str, str]] = None) +``` + +Delete documents from the document store. + +**Arguments**: + +- `index`: Index name to delete the documents from. If `None`, the DocumentStore's default index +(`self.index`) will be used. +- `ids`: Optional list of IDs to narrow down the documents to be deleted. +- `filters`: Optional filters to narrow down the documents for which embeddings are to be updated. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + ``` +- `headers`: PineconeDocumentStore does not support headers. 
+ + + +#### PineconeDocumentStore.delete\_index + +```python +def delete_index(index: str) +``` + +Delete an existing index. The index including all data will be removed. + +**Arguments**: + +- `index`: The name of the index to delete. + +**Returns**: + +None + + + +#### PineconeDocumentStore.query\_by\_embedding + +```python +def query_by_embedding(query_emb: np.ndarray, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: int = 10, index: Optional[str] = None, return_embedding: Optional[bool] = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = True) -> List[Document] +``` + +Find the document that is most similar to the provided `query_emb` by using a vector similarity metric. + +**Arguments**: + +- `query_emb`: Embedding of the query (e.g. gathered from DPR). +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return. +- `index`: The name of the index from which to retrieve documents. +- `return_embedding`: Whether to return document embedding. +- `headers`: PineconeDocumentStore does not support headers. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### PineconeDocumentStore.load + +```python +@classmethod +def load(cls) +``` + +Default class method used for loading indexes. Not applicable to the PineconeDocumentStore. + + + +# Module utils + + + +#### eval\_data\_from\_json + +```python +def eval_data_from_json(filename: str, max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None, open_domain: bool = False) -> Tuple[List[Document], List[Label]] +``` + +Read Documents + Labels from a SQuAD-style file. 
+ +Document and Labels can then be indexed to the DocumentStore and be used for evaluation. + +**Arguments**: + +- `filename`: Path to file in SQuAD format +- `max_docs`: This sets the number of documents that will be loaded. By default, this is set to None, thus reading in all available eval documents. +- `open_domain`: Set this to True if your file is an open domain dataset where two different answers to the same question might be found in different contexts. + + + +#### eval\_data\_from\_jsonl + +```python +def eval_data_from_jsonl(filename: str, batch_size: Optional[int] = None, max_docs: Union[int, bool] = None, preprocessor: PreProcessor = None, open_domain: bool = False) -> Generator[Tuple[List[Document], List[Label]], None, None] +``` + +Read Documents + Labels from a SQuAD-style file in jsonl format, i.e. one document per line. + +Document and Labels can then be indexed to the DocumentStore and be used for evaluation. + +This is a generator which will yield one tuple per iteration containing a list +of batch_size documents and a list with the documents' labels. +If batch_size is set to None, this method will yield all documents and labels. + +**Arguments**: + +- `filename`: Path to file in SQuAD format +- `max_docs`: This sets the number of documents that will be loaded. By default, this is set to None, thus reading in all available eval documents. +- `open_domain`: Set this to True if your file is an open domain dataset where two different answers to the same question might be found in different contexts. + + + +#### squad\_json\_to\_jsonl + +```python +def squad_json_to_jsonl(squad_file: str, output_file: str) +``` + +Converts a SQuAD-json-file into jsonl format with one document per line. + +**Arguments**: + +- `squad_file`: SQuAD-file in json format. 
+- `output_file`: Name of output file (SQuAD in jsonl format) + + + +#### convert\_date\_to\_rfc3339 + +```python +def convert_date_to_rfc3339(date: str) -> str +``` + +Converts a date to RFC3339 format, as Weaviate requires dates to be in RFC3339 format including the time and +timezone. + +If the provided date string does not contain a time and/or timezone, we use 00:00 as default time +and UTC as default time zone. + +This method cannot be part of WeaviateDocumentStore, as this would result in a circular import between weaviate.py +and filter_utils.py. + diff --git a/docs/v1.7.0/_src/api/api/evaluation.md b/docs/v1.7.0/_src/api/api/evaluation.md new file mode 100644 index 0000000000..35d4bebe14 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/evaluation.md @@ -0,0 +1,148 @@ + + +# Module evaluator + + + +## EvalDocuments + +```python +class EvalDocuments(BaseComponent) +``` + +This is a pipeline node that should be placed after a node that returns a List of Document, e.g., Retriever or +Ranker, in order to assess its performance. Performance metrics are stored in this class and updated as each +sample passes through it. To view the results of the evaluation, call EvalDocuments.print(). Note that results +from this Node may differ from that when calling Retriever.eval() since that is a closed domain evaluation. Have +a look at our evaluation tutorial for more info about open vs closed domain eval ( +https://haystack.deepset.ai/tutorials/evaluation). + +EvalDocuments node is deprecated and will be removed in a future version. +Please use pipeline.eval() instead. + + + +#### EvalDocuments.\_\_init\_\_ + +```python +def __init__(debug: bool = False, open_domain: bool = True, top_k: int = 10) +``` + +**Arguments**: + +- `open_domain`: When True, a document is considered correctly retrieved so long as the answer string can be found within it. +When False, correct retrieval is evaluated based on document_id. 
+- `debug`: When True, a record of each sample and its evaluation will be stored in EvalDocuments.log +- `top_k`: calculate eval metrics for top k results, e.g., recall@k + + + +#### EvalDocuments.run + +```python +def run(documents: List[Document], labels: List[Label], top_k: Optional[int] = None) +``` + +Run this node on one sample and its labels + + + +#### EvalDocuments.print + +```python +def print() +``` + +Print the evaluation results + + + +## EvalAnswers + +```python +class EvalAnswers(BaseComponent) +``` + +This is a pipeline node that should be placed after a Reader in order to assess the performance of the Reader +individually or to assess the extractive QA performance of the whole pipeline. Performance metrics are stored in +this class and updated as each sample passes through it. To view the results of the evaluation, call EvalAnswers.print(). +Note that results from this Node may differ from that when calling Reader.eval() +since that is a closed domain evaluation. Have a look at our evaluation tutorial for more info about +open vs closed domain eval (https://haystack.deepset.ai/tutorials/evaluation). + +EvalAnswers node is deprecated and will be removed in a future version. +Please use pipeline.eval() instead. + + + +#### EvalAnswers.\_\_init\_\_ + +```python +def __init__(skip_incorrect_retrieval: bool = True, open_domain: bool = True, sas_model: str = None, debug: bool = False) +``` + +**Arguments**: + +- `skip_incorrect_retrieval`: When set to True, this eval will ignore the cases where the retriever returned no correct documents +- `open_domain`: When True, extracted answers are evaluated purely on string similarity rather than the position of the extracted answer +- `sas_model`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric. 
+The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps. +Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture. +More info in the paper: https://arxiv.org/abs/2108.06130 +Models: +- You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data. + Not all cross encoders can be used because of different return types. + If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class +- Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" +- Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large" +- Large model for German only: "deepset/gbert-large-sts" +- `debug`: When True, a record of each sample and its evaluation will be stored in EvalAnswers.log + + + +#### EvalAnswers.run + +```python +def run(labels: List[Label], answers: List[Answer], correct_retrieval: bool) +``` + +Run this node on one sample and its labels + + + +#### EvalAnswers.print + +```python +def print(mode) +``` + +Print the evaluation results + + + +#### semantic\_answer\_similarity + +```python +def semantic_answer_similarity(predictions: List[List[str]], gold_labels: List[List[str]], sas_model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2", batch_size: int = 32, use_gpu: bool = True) -> Tuple[List[float], List[float], List[List[float]]] +``` + +Computes Transformer-based similarity of predicted answer to gold labels to derive a more meaningful metric than EM or F1. 
+
+Returns per QA pair a) the similarity of the most likely prediction (top 1) to all available gold labels
+ b) the highest similarity of all predictions to gold labels
+ c) a matrix consisting of the similarities of all the predictions compared to all gold labels
+
+**Arguments**:
+
+- `predictions`: Predicted answers as list of multiple preds per question
+- `gold_labels`: Labels as list of multiple possible answers per question
+- `sas_model_name_or_path`: SentenceTransformers semantic textual similarity model, should be path or string
+pointing to downloadable models.
+- `batch_size`: Number of prediction label pairs to encode at once.
+- `use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity.
+Falls back to CPU if no GPU is available.
+
+**Returns**:
+
+top_1_sas, top_k_sas, pred_label_matrix
+
diff --git a/docs/v1.7.0/_src/api/api/extractor.md b/docs/v1.7.0/_src/api/api/extractor.md
new file mode 100644
index 0000000000..7828b5f12a
--- /dev/null
+++ b/docs/v1.7.0/_src/api/api/extractor.md
@@ -0,0 +1,81 @@
+
+
+# Module entity
+
+
+
+## EntityExtractor
+
+```python
+class EntityExtractor(BaseComponent)
+```
+
+This node is used to extract entities out of documents.
+
+The most common use case for this would be as a named entity extractor.
+The default model used is dslim/bert-base-NER.
+This node can be placed in a querying pipeline to perform entity extraction on retrieved documents only,
+or it can be placed in an indexing pipeline so that all documents in the document store have extracted entities.
+The entities extracted by this Node will populate Document.entities
+
+**Arguments**:
+
+- `model_name_or_path`: The name of the model to use for entity extraction.
+- `use_gpu`: Whether to use the GPU or not.
+- `batch_size`: The batch size to use for entity extraction.
+- `progress_bar`: Whether to show a progress bar or not.
+ + + +#### EntityExtractor.run + +```python +def run(documents: Optional[Union[List[Document], List[dict]]] = None) -> Tuple[Dict, str] +``` + +This is the method called when this node is used in a pipeline + + + +#### EntityExtractor.extract + +```python +def extract(text) +``` + +This function can be called to perform entity extraction when using the node in isolation. + + + +#### EntityExtractor.extract\_batch + +```python +def extract_batch(texts: Union[List[str], List[List[str]]], batch_size: Optional[int] = None) +``` + +This function allows to extract entities out of a list of strings or a list of lists of strings. + +**Arguments**: + +- `texts`: List of str or list of lists of str to extract entities from. +- `batch_size`: Number of texts to make predictions on at a time. + + + +#### simplify\_ner\_for\_qa + +```python +def simplify_ner_for_qa(output) +``` + +Returns a simplified version of the output dictionary +with the following structure: +[ + { + answer: { ... } + entities: [ { ... }, {} ] + } +] +The entities included are only the ones that overlap with +the answer itself. + diff --git a/docs/v1.7.0/_src/api/api/file_classifier.md b/docs/v1.7.0/_src/api/api/file_classifier.md new file mode 100644 index 0000000000..8ffe4963b6 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/file_classifier.md @@ -0,0 +1,44 @@ + + +# Module file\_type + + + +## FileTypeClassifier + +```python +class FileTypeClassifier(BaseComponent) +``` + +Route files in an Indexing Pipeline to corresponding file converters. + + + +#### FileTypeClassifier.\_\_init\_\_ + +```python +def __init__(supported_types: List[str] = DEFAULT_TYPES) +``` + +Node that sends out files on a different output edge depending on their extension. + +**Arguments**: + +- `supported_types`: The file types that this node can distinguish between. +The default values are: `txt`, `pdf`, `md`, `docx`, and `html`. +Lists with duplicate elements are not allowed. 
+
+
+
+#### FileTypeClassifier.run
+
+```python
+def run(file_paths: Union[Path, List[Path], str, List[str], List[Union[Path, str]]])
+```
+
+Sends out files on a different output edge depending on their extension.
+
+**Arguments**:
+
+- `file_paths`: paths to route on different edges.
+
diff --git a/docs/v1.7.0/_src/api/api/file_converter.md b/docs/v1.7.0/_src/api/api/file_converter.md
new file mode 100644
index 0000000000..c1c4710178
--- /dev/null
+++ b/docs/v1.7.0/_src/api/api/file_converter.md
@@ -0,0 +1,730 @@
+
+
+# Module base
+
+
+
+## BaseConverter
+
+```python
+class BaseConverter(BaseComponent)
+```
+
+Base class for implementing file converters to transform input documents to text format for ingestion in DocumentStore.
+
+
+
+#### BaseConverter.\_\_init\_\_
+
+```python
+def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True)
+```
+
+**Arguments**:
+
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could possible candidate for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be encoding error resulting
+in garbled text.
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. 
[`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. +- `progress_bar`: Show a progress bar for the conversion. + + + +#### BaseConverter.convert + +```python +@abstractmethod +def convert(file_path: Path, meta: Optional[Dict[str, Any]], remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) -> List[Document] +``` + +Convert a file to a dictionary containing the text and any associated meta data. + +File converters may extract file meta like name or size. In addition to it, user +supplied meta data like author, url, external IDs can be supplied as a dictionary. + +**Arguments**: + +- `file_path`: path of the file to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Select the file encoding (default is `UTF-8`) +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). 
+In this case the id will be generated by using the content and the defined metadata. + + + +#### BaseConverter.validate\_language + +```python +def validate_language(text: str, valid_languages: Optional[List[str]] = None) -> bool +``` + +Validate if the language of the text is one of valid languages. + + + +#### BaseConverter.run + +```python +def run(file_paths: Union[Path, List[Path]], meta: Optional[Union[Dict[str, str], List[Optional[Dict[str, str]]]]] = None, remove_numeric_tables: Optional[bool] = None, known_ligatures: Dict[str, str] = KNOWN_LIGATURES, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8", id_hash_keys: Optional[List[str]] = None) +``` + +Extract text from a file. + +**Arguments**: + +- `file_paths`: Path to the files you want to convert +- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents. +Can be any custom keys and values. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `known_ligatures`: Some converters tends to recognize clusters of letters as ligatures, such as "ff" (double f). +Such ligatures however make text hard to compare with the content of other files, +which are generally ligature free. Therefore we automatically find and replace the most +common ligatures with their split counterparts. The default mapping is in +`haystack.nodes.file_converter.base.KNOWN_LIGATURES`: it is rather biased towards Latin alphabeths +but excludes all ligatures that are known to be used in IPA. +You can use this parameter to provide your own set of ligatures to clean up from the documents. 
+- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Select the file encoding (default is `UTF-8`) +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. + + + +# Module docx + + + +## DocxToTextConverter + +```python +class DocxToTextConverter(BaseConverter) +``` + + + +#### DocxToTextConverter.convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] +``` + +Extract text from a .docx file. + +Note: As docx doesn't contain "page" information, we actually extract and return a list of paragraphs here. +For compliance with other converters we nevertheless opted for keeping the methods name. + +**Arguments**: + +- `file_path`: Path to the .docx file you want to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. 
+The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Not applicable +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. + + + +# Module image + + + +## ImageToTextConverter + +```python +class ImageToTextConverter(BaseConverter) +``` + + + +#### ImageToTextConverter.\_\_init\_\_ + +```python +def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None) +``` + +**Arguments**: + +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified here +(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html) +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. 
Run the following line of code to check available language packs:
+# List of available languages
+print(pytesseract.get_languages(config=''))
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+<a id="image.ImageToTextConverter.convert"></a>
+
+#### ImageToTextConverter.convert
+
+```python
+def convert(file_path: Union[Path, str], meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract)
+
+**Arguments**:
+
+- `file_path`: path to image file
+- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents.
+Can be any custom keys and values.
+- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables.
+The tabular structures in documents might be noise for the reader model if it
+does not have table parsing capability for finding answers. However, tables
+may also have long strings that could be possible candidates for searching answers.
+The rows containing strings are thus retained in this option.
+- `valid_languages`: validate languages from a list of languages supported by tesseract
+(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html).
+This option can be used to add a test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be an encoding error resulting
+in garbled text.
+- `encoding`: Not applicable
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+
+<a id="markdown"></a>
+
+# Module markdown
+
+<a id="markdown.MarkdownConverter"></a>
+
+## MarkdownConverter
+
+```python
+class MarkdownConverter(BaseConverter)
+```
+
+<a id="markdown.MarkdownConverter.convert"></a>
+
+#### MarkdownConverter.convert
+
+```python
+def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Reads text from a markdown file and executes optional preprocessing steps.
+
+**Arguments**:
+
+- `file_path`: path of the file to convert
+- `meta`: dictionary of meta data key-value pairs to append in the returned document.
+- `encoding`: Select the file encoding (default is `utf-8`)
+- `remove_numeric_tables`: Not applicable
+- `valid_languages`: Not applicable
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+ + + +#### MarkdownConverter.markdown\_to\_text + +```python +@staticmethod +def markdown_to_text(markdown_string: str) -> str +``` + +Converts a markdown string to plaintext + +**Arguments**: + +- `markdown_string`: String in markdown format + + + +# Module pdf + + + +## PDFToTextConverter + +```python +class PDFToTextConverter(BaseConverter) +``` + + + +#### PDFToTextConverter.\_\_init\_\_ + +```python +def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, encoding: Optional[str] = "UTF-8") +``` + +**Arguments**: + +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. +- `encoding`: Encoding that will be passed as `-enc` parameter to `pdftotext`. +Defaults to "UTF-8" in order to support special characters (e.g. German Umlauts, Cyrillic ...). 
+(See list of available encodings, such as "Latin1", by running `pdftotext -listenc` in the terminal) + + + +#### PDFToTextConverter.convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] +``` + +Extract text from a .pdf file using the pdftotext library (https://www.xpdfreader.com/pdftotext-man.html) + +**Arguments**: + +- `file_path`: Path to the .pdf file you want to convert +- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents. +Can be any custom keys and values. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Encoding that overwrites self.encoding and will be passed as `-enc` parameter to `pdftotext`. +(See list of available encodings by running `pdftotext -listenc` in the terminal) +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). 
+In this case the id will be generated by using the content and the defined metadata. + + + +## PDFToTextOCRConverter + +```python +class PDFToTextOCRConverter(BaseConverter) +``` + + + +#### PDFToTextOCRConverter.\_\_init\_\_ + +```python +def __init__(remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = ["eng"], id_hash_keys: Optional[List[str]] = None) +``` + +Extract text from image file using the pytesseract library (https://github.com/madmaze/pytesseract) + +**Arguments**: + +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages supported by tessarect +(https://tesseract-ocr.github.io/tessdoc/Data-Files-in-different-versions.html). +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. 
+ + + +#### PDFToTextOCRConverter.convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] +``` + +Convert a file to a dictionary containing the text and any associated meta data. + +File converters may extract file meta like name or size. In addition to it, user +supplied meta data like author, url, external IDs can be supplied as a dictionary. + +**Arguments**: + +- `file_path`: path of the file to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Not applicable +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. 
+
+<a id="parsr"></a>
+
+# Module parsr
+
+<a id="parsr.ParsrConverter"></a>
+
+## ParsrConverter
+
+```python
+class ParsrConverter(BaseConverter)
+```
+
+File converter that makes use of the open-source Parsr tool by axa-group.
+(https://github.com/axa-group/Parsr).
+This Converter extracts both text and tables.
+Supported file formats are: PDF, DOCX
+
+<a id="parsr.ParsrConverter.__init__"></a>
+
+#### ParsrConverter.\_\_init\_\_
+
+```python
+def __init__(parsr_url: str = "http://localhost:3001", extractor: Literal["pdfminer", "pdfjs"] = "pdfminer", table_detection_mode: Literal["lattice", "stream"] = "lattice", preceding_context_len: int = 3, following_context_len: int = 3, remove_page_headers: bool = False, remove_page_footers: bool = False, remove_table_of_contents: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True)
+```
+
+**Arguments**:
+
+- `parsr_url`: URL endpoint to Parsr's REST API.
+- `extractor`: Backend used to extract textual structures from PDFs. ("pdfminer" or "pdfjs")
+- `table_detection_mode`: Parsing method used to detect tables and their cells.
+"lattice" detects tables and their cells by demarcated lines between cells.
+"stream" detects tables and their cells by looking at whitespace between cells.
+- `preceding_context_len`: Number of lines before a table to extract as preceding context
+(will be returned as part of meta data).
+- `following_context_len`: Number of lines after a table to extract as following context
+(will be returned as part of meta data).
+- `remove_page_headers`: Whether to remove text that Parsr detected as a page header.
+- `remove_page_footers`: Whether to remove text that Parsr detected as a page footer.
+- `remove_table_of_contents`: Whether to remove text that Parsr detected as a table of contents.
+- `valid_languages`: Validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add a test for encoding errors. 
If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. +- `add_page_number`: Adds the number of the page a table occurs in to the Document's meta field +`"page"`. + + + +#### ParsrConverter.convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document] +``` + +Extract text and tables from a PDF or DOCX using the open-source Parsr tool. + +**Arguments**: + +- `file_path`: Path to the file you want to convert. +- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents. +Can be any custom keys and values. +- `remove_numeric_tables`: Not applicable. +- `valid_languages`: Validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Not applicable. +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). 
+In this case the id will be generated by using the content and the defined metadata. + + + +# Module azure + + + +## AzureConverter + +```python +class AzureConverter(BaseConverter) +``` + +File converter that makes use of Microsoft Azure's Form Recognizer service +(https://azure.microsoft.com/en-us/services/form-recognizer/). +This Converter extracts both text and tables. +Supported file formats are: PDF, JPEG, PNG, BMP and TIFF. + +In order to be able to use this Converter, you need an active Azure account +and a Form Recognizer or Cognitive Services resource. +(Here you can find information on how to set this up: +https://docs.microsoft.com/en-us/azure/applied-ai-services/form-recognizer/quickstarts/try-v3-python-sdk#prerequisites) + + + +#### AzureConverter.\_\_init\_\_ + +```python +def __init__(endpoint: str, credential_key: str, model_id: str = "prebuilt-document", valid_languages: Optional[List[str]] = None, save_json: bool = False, preceding_context_len: int = 3, following_context_len: int = 3, merge_multiple_column_headers: bool = True, id_hash_keys: Optional[List[str]] = None, add_page_number: bool = True) +``` + +**Arguments**: + +- `endpoint`: Your Form Recognizer or Cognitive Services resource's endpoint. +- `credential_key`: Your Form Recognizer or Cognitive Services resource's subscription key. +- `model_id`: The identifier of the model you want to use to extract information out of your file. +Default: "prebuilt-document". General purpose models are "prebuilt-document" +and "prebuilt-layout". +List of available prebuilt models: +https://azuresdkdocs.blob.core.windows.net/$web/python/azure-ai-formrecognizer/3.2.0b1/index.html#documentanalysisclient +- `valid_languages`: Validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. 
If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `save_json`: Whether to save the output of the Form Recognizer to a JSON file. +- `preceding_context_len`: Number of lines before a table to extract as preceding context (will be returned as part of meta data). +- `following_context_len`: Number of lines after a table to extract as subsequent context (will be returned as part of meta data). +- `merge_multiple_column_headers`: Some tables contain more than one row as a column header (i.e., column description). +This parameter lets you choose, whether to merge multiple column header +rows to a single row. +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. +- `add_page_number`: Adds the number of the page a table occurs in to the Document's meta field +`"page"`. + + + +#### AzureConverter.convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, Any]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None, pages: Optional[str] = None, known_language: Optional[str] = None) -> List[Document] +``` + +Extract text and tables from a PDF, JPEG, PNG, BMP or TIFF file using Azure's Form Recognizer service. + +**Arguments**: + +- `file_path`: Path to the file you want to convert. +- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents. +Can be any custom keys and values. +- `remove_numeric_tables`: Not applicable. 
+- `valid_languages`: Validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add a test for encoding errors. If the extracted text is
+not one of the valid languages, then it might likely be an encoding error resulting
+in garbled text.
+- `encoding`: Not applicable.
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+- `pages`: Custom page numbers for multi-page documents (PDF/TIFF). Input the page numbers and/or ranges
+of pages you want to get in the result. For a range of pages, use a hyphen,
+like pages="1-3, 5-6". Separate each page number or range with a comma.
+- `known_language`: Locale hint of the input document.
+See supported locales here: https://aka.ms/azsdk/formrecognizer/supportedlocales.
+
+<a id="azure.AzureConverter.convert_azure_json"></a>
+
+#### AzureConverter.convert\_azure\_json
+
+```python
+def convert_azure_json(file_path: Path, meta: Optional[Dict[str, Any]] = None, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Extract text and tables from the JSON output of Azure's Form Recognizer service.
+
+**Arguments**:
+
+- `file_path`: Path to the JSON-file you want to convert.
+- `meta`: Optional dictionary with metadata that shall be attached to all resulting documents.
+Can be any custom keys and values.
+- `valid_languages`: Validate languages from a list of languages specified in the ISO 639-1
+(https://en.wikipedia.org/wiki/ISO_639-1) format.
+This option can be used to add a test for encoding errors. 
If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. + + + +# Module tika + + + +## TikaConverter + +```python +class TikaConverter(BaseConverter) +``` + + + +#### TikaConverter.\_\_init\_\_ + +```python +def __init__(tika_url: str = "http://localhost:9998/tika", remove_numeric_tables: bool = False, valid_languages: Optional[List[str]] = None, id_hash_keys: Optional[List[str]] = None) +``` + +**Arguments**: + +- `tika_url`: URL of the Tika server +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). 
+In this case the id will be generated by using the content and the defined metadata. + + + +#### TikaConverter.convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document] +``` + +**Arguments**: + +- `file_path`: path of the file to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Not applicable +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. + +**Returns**: + +A list of pages and the extracted meta data of the file. 
+ + + +# Module txt + + + +## TextConverter + +```python +class TextConverter(BaseConverter) +``` + + + +#### TextConverter.convert + +```python +def convert(file_path: Path, meta: Optional[Dict[str, str]] = None, remove_numeric_tables: Optional[bool] = None, valid_languages: Optional[List[str]] = None, encoding: Optional[str] = "utf-8", id_hash_keys: Optional[List[str]] = None) -> List[Document] +``` + +Reads text from a txt file and executes optional preprocessing steps. + +**Arguments**: + +- `file_path`: path of the file to convert +- `meta`: dictionary of meta data key-value pairs to append in the returned document. +- `remove_numeric_tables`: This option uses heuristics to remove numeric rows from the tables. +The tabular structures in documents might be noise for the reader model if it +does not have table parsing capability for finding answers. However, tables +may also have long strings that could possible candidate for searching answers. +The rows containing strings are thus retained in this option. +- `valid_languages`: validate languages from a list of languages specified in the ISO 639-1 +(https://en.wikipedia.org/wiki/ISO_639-1) format. +This option can be used to add test for encoding errors. If the extracted text is +not one of the valid languages, then it might likely be encoding error resulting +in garbled text. +- `encoding`: Select the file encoding (default is `utf-8`) +- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's +attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are +not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]). +In this case the id will be generated by using the content and the defined metadata. 
+ diff --git a/docs/v1.7.0/_src/api/api/generator.md b/docs/v1.7.0/_src/api/api/generator.md new file mode 100644 index 0000000000..645be7383f --- /dev/null +++ b/docs/v1.7.0/_src/api/api/generator.md @@ -0,0 +1,297 @@ + + +# Module base + + + +## BaseGenerator + +```python +class BaseGenerator(BaseComponent) +``` + +Abstract class for Generators + + + +#### BaseGenerator.predict + +```python +@abstractmethod +def predict(query: str, documents: List[Document], top_k: Optional[int]) -> Dict +``` + +Abstract method to generate answers. + +**Arguments**: + +- `query`: Query +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. +- `top_k`: Number of returned answers + +**Returns**: + +Generated answers plus additional infos in a dict + + + +#### BaseGenerator.predict\_batch + +```python +def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) +``` + +Generate the answer to the input queries. The generation will be conditioned on the supplied documents. + +These documents can for example be retrieved via the Retriever. + +- If you provide a list containing a single query... + + - ... and a single list of Documents, the query will be applied to each Document individually. + - ... and a list of lists of Documents, the query will be applied to each list of Documents and the Answers + will be aggregated per Document list. + +- If you provide a list of multiple queries... + + - ... and a single list of Documents, each query will be applied to each Document individually. + - ... and a list of lists of Documents, each query will be applied to its corresponding list of Documents + and the Answers will be aggregated per query-Document pair. + +**Arguments**: + +- `queries`: List of queries. +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. 
+Can be a single list of Documents or a list of lists of Documents. +- `top_k`: Number of returned answers per query. +- `batch_size`: Not applicable. + +**Returns**: + +Generated answers plus additional infos in a dict like this: +```python +| {'queries': 'who got the first nobel prize in physics', +| 'answers': +| [{'query': 'who got the first nobel prize in physics', +| 'answer': ' albert einstein', +| 'meta': { 'doc_ids': [...], +| 'doc_scores': [80.42758 ...], +| 'doc_probabilities': [40.71379089355469, ... +| 'content': ['Albert Einstein was a ...] +| 'titles': ['"Albert Einstein"', ...] +| }}]} +``` + + + +# Module transformers + + + +## RAGenerator + +```python +class RAGenerator(BaseGenerator) +``` + +Implementation of Facebook's Retrieval-Augmented Generator (https://arxiv.org/abs/2005.11401) based on +HuggingFace's transformers (https://huggingface.co/transformers/model_doc/rag.html). + +Instead of "finding" the answer within a document, these models **generate** the answer. +In that sense, RAG follows a similar approach as GPT-3 but it comes with two huge advantages +for real-world applications: +a) it has a manageable model size +b) the answer generation is conditioned on retrieved documents, +i.e. the model can easily adjust to domain documents even after training has finished +(in contrast: GPT-3 relies on the web data seen during training) + +**Example** + +```python +| query = "who got the first nobel prize in physics?" 
+| +| # Retrieve related documents from retriever +| retrieved_docs = retriever.retrieve(query=query) +| +| # Now generate answer from query and retrieved documents +| generator.predict( +| query=query, +| documents=retrieved_docs, +| top_k=1 +| ) +| +| # Answer +| +| {'query': 'who got the first nobel prize in physics', +| 'answers': +| [{'query': 'who got the first nobel prize in physics', +| 'answer': ' albert einstein', +| 'meta': { 'doc_ids': [...], +| 'doc_scores': [80.42758 ...], +| 'doc_probabilities': [40.71379089355469, ... +| 'content': ['Albert Einstein was a ...] +| 'titles': ['"Albert Einstein"', ...] +| }}]} +``` + + + +#### RAGenerator.\_\_init\_\_ + +```python +def __init__(model_name_or_path: str = "facebook/rag-token-nq", model_version: Optional[str] = None, retriever: Optional[DensePassageRetriever] = None, generator_type: str = "token", top_k: int = 2, max_length: int = 200, min_length: int = 2, num_beams: int = 2, embed_title: bool = True, prefix: Optional[str] = None, use_gpu: bool = True, progress_bar: bool = True) +``` + +Load a RAG model from Transformers along with passage_embedding_model. + +See https://huggingface.co/transformers/model_doc/rag.html for more details + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. +'facebook/rag-token-nq', 'facebook/rag-sequence-nq'. +See https://huggingface.co/models for full list of available models. +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `retriever`: `DensePassageRetriever` used to embedded passages for the docs passed to `predict()`. This is optional and is only needed if the docs you pass don't already contain embeddings in `Document.embedding`. 
+- `generator_type`: Which RAG generator implementation to use ("token" or "sequence") +- `top_k`: Number of independently generated text to return +- `max_length`: Maximum length of generated text +- `min_length`: Minimum length of generated text +- `num_beams`: Number of beams for beam search. 1 means no beam search. +- `embed_title`: Embedded the title of passage while generating embedding +- `prefix`: The prefix used by the generator's tokenizer. +- `use_gpu`: Whether to use GPU. Falls back on CPU if no GPU is available. + + + +#### RAGenerator.predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict +``` + +Generate the answer to the input query. The generation will be conditioned on the supplied documents. + +These documents can for example be retrieved via the Retriever. + +**Arguments**: + +- `query`: Query +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. +- `top_k`: Number of returned answers + +**Returns**: + +Generated answers plus additional infos in a dict like this: +```python +| {'query': 'who got the first nobel prize in physics', +| 'answers': +| [{'query': 'who got the first nobel prize in physics', +| 'answer': ' albert einstein', +| 'meta': { 'doc_ids': [...], +| 'doc_scores': [80.42758 ...], +| 'doc_probabilities': [40.71379089355469, ... +| 'content': ['Albert Einstein was a ...] +| 'titles': ['"Albert Einstein"', ...] +| }}]} +``` + + + +## Seq2SeqGenerator + +```python +class Seq2SeqGenerator(BaseGenerator) +``` + +A generic sequence-to-sequence generator based on HuggingFace's transformers. + +This generator supports all [Text2Text](https://huggingface.co/models?pipeline_tag=text2text-generation) models +from the Hugging Face hub. If the primary interface for the model specified by `model_name_or_path` constructor +parameter is AutoModelForSeq2SeqLM from Hugging Face, then you can use it in this Generator. 
+ +Moreover, as language models prepare model input in their specific encoding, each model +specified with model_name_or_path parameter in this Seq2SeqGenerator should have an +accompanying model input converter that takes care of prefixes, separator tokens etc. +By default, we provide model input converters for a few well-known seq2seq language models (e.g. ELI5). +It is the responsibility of Seq2SeqGenerator user to ensure an appropriate model input converter +is either already registered or specified on a per-model basis in the Seq2SeqGenerator constructor. + +For mode details on custom model input converters refer to _BartEli5Converter + +For a list of all text2text-generation models, see +the [Hugging Face Model Hub](https://huggingface.co/models?pipeline_tag=text2text-generation) + + +**Example** + +```python +| query = "Why is Dothraki language important?" +| +| # Retrieve related documents from retriever +| retrieved_docs = retriever.retrieve(query=query) +| +| # Now generate answer from query and retrieved documents +| generator.predict( +| query=query, +| documents=retrieved_docs, +| top_k=1 +| ) +| +| # Answer +| +| {'query': 'who got the first nobel prize in physics', +| 'answers': +| [{'query': 'who got the first nobel prize in physics', +| 'answer': ' albert einstein', +| 'meta': { 'doc_ids': [...], +| 'doc_scores': [80.42758 ...], +| 'doc_probabilities': [40.71379089355469, ... +| 'content': ['Albert Einstein was a ...] +| 'titles': ['"Albert Einstein"', ...] 
+| }}]} +``` + + + +#### Seq2SeqGenerator.\_\_init\_\_ + +```python +def __init__(model_name_or_path: str, input_converter: Optional[Callable] = None, top_k: int = 1, max_length: int = 200, min_length: int = 2, num_beams: int = 8, use_gpu: bool = True, progress_bar: bool = True) +``` + +**Arguments**: + +- `model_name_or_path`: a HF model name for auto-regressive language model like GPT2, XLNet, XLM, Bart, T5 etc +- `input_converter`: an optional Callable to prepare model input for the underlying language model +specified in model_name_or_path parameter. The required __call__ method signature for +the Callable is: +__call__(tokenizer: PreTrainedTokenizer, query: str, documents: List[Document], +top_k: Optional[int] = None) -> BatchEncoding: +- `top_k`: Number of independently generated text to return +- `max_length`: Maximum length of generated text +- `min_length`: Minimum length of generated text +- `num_beams`: Number of beams for beam search. 1 means no beam search. +- `use_gpu`: Whether to use GPU or the CPU. Falls back on CPU if no GPU is available. + + + +#### Seq2SeqGenerator.predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict +``` + +Generate the answer to the input query. The generation will be conditioned on the supplied documents. + +These document can be retrieved via the Retriever or supplied directly via predict method. + +**Arguments**: + +- `query`: Query +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. 
+- `top_k`: Number of returned answers + +**Returns**: + +Generated answers + diff --git a/docs/v1.7.0/_src/api/api/other_nodes.md b/docs/v1.7.0/_src/api/api/other_nodes.md new file mode 100644 index 0000000000..e638a57014 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/other_nodes.md @@ -0,0 +1,132 @@ + + +# Module docs2answers + + + +## Docs2Answers + +```python +class Docs2Answers(BaseComponent) +``` + +This Node is used to convert retrieved documents into predicted answers format. + +It is useful for situations where you are calling a Retriever only pipeline via REST API. +This ensures that your output is in a compatible format. + +**Arguments**: + +- `progress_bar`: Whether to show a progress bar + + + +# Module join\_docs + + + +## JoinDocuments + +```python +class JoinDocuments(JoinNode) +``` + +A node to join documents outputted by multiple retriever nodes. + +The node allows multiple join modes: +* concatenate: combine the documents from multiple nodes. Any duplicate documents are discarded. + The score is only determined by the last node that outputs the document. +* merge: merge scores of documents from multiple nodes. Optionally, each input score can be given a different + `weight` & a `top_k` limit can be set. This mode can also be used for "reranking" retrieved documents. +* reciprocal_rank_fusion: combines the documents based on their rank in multiple nodes. + + + +#### JoinDocuments.\_\_init\_\_ + +```python +def __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None, sort_by_score: bool = True) +``` + +**Arguments**: + +- `join_mode`: `concatenate` to combine documents from multiple retrievers `merge` to aggregate scores of +individual documents, `reciprocal_rank_fusion` to apply rank based scoring. +- `weights`: A node-wise list(length of list must be equal to the number of input nodes) of weights for +adjusting document scores when using the `merge` join_mode. 
By default, equal weight is given +to each retriever score. This param is not compatible with the `concatenate` join_mode. +- `top_k_join`: Limit documents to top_k based on the resulting scores of the join. +- `sort_by_score`: Whether to sort the incoming documents by their score. Set this to True if all your +Documents are coming with `score` values. Set to False if any of the Documents come +from sources where the `score` is set to `None`, like `TfidfRetriever` on Elasticsearch. + + + +# Module join\_answers + + + +## JoinAnswers + +```python +class JoinAnswers(JoinNode) +``` + +A node to join `Answer`s produced by multiple `Reader` nodes. + + + +#### JoinAnswers.\_\_init\_\_ + +```python +def __init__(join_mode: str = "concatenate", weights: Optional[List[float]] = None, top_k_join: Optional[int] = None, sort_by_score: bool = True) +``` + +**Arguments**: + +- `join_mode`: `"concatenate"` to combine documents from multiple `Reader`s. `"merge"` to aggregate scores +of individual `Answer`s. +- `weights`: A node-wise list (length of list must be equal to the number of input nodes) of weights for +adjusting `Answer` scores when using the `"merge"` join_mode. By default, equal weight is assigned to each +`Reader` score. This parameter is not compatible with the `"concatenate"` join_mode. +- `top_k_join`: Limit `Answer`s to top_k based on the resulting scored of the join. +- `sort_by_score`: Whether to sort the incoming answers by their score. Set this to True if your Answers +are coming from a Reader or TableReader. Set to False if any Answers come from a Generator since this assigns +None as a score to each. + + + +# Module route\_documents + + + +## RouteDocuments + +```python +class RouteDocuments(BaseComponent) +``` + +A node to split a list of `Document`s by `content_type` or by the values of a metadata field and route them to +different nodes. 
+ + + +#### RouteDocuments.\_\_init\_\_ + +```python +def __init__(split_by: str = "content_type", metadata_values: Optional[List[str]] = None) +``` + +**Arguments**: + +- `split_by`: Field to split the documents by, either `"content_type"` or a metadata field name. +If this parameter is set to `"content_type"`, the list of `Document`s will be split into a list containing +only `Document`s of type `"text"` (will be routed to `"output_1"`) and a list containing only `Document`s of +type `"table"` (will be routed to `"output_2"`). +If this parameter is set to a metadata field name, you need to specify the parameter `metadata_values` as +well. +- `metadata_values`: If the parameter `split_by` is set to a metadata field name, you need to provide a list +of values to group the `Document`s to. `Document`s whose metadata field is equal to the first value of the +provided list will be routed to `"output_1"`, `Document`s whose metadata field is equal to the second +value of the provided list will be routed to `"output_2"`, etc. + diff --git a/docs/v1.7.0/_src/api/api/pipelines.md b/docs/v1.7.0/_src/api/api/pipelines.md new file mode 100644 index 0000000000..72b53ea4a3 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/pipelines.md @@ -0,0 +1,1657 @@ + + +# Module base + + + +## Pipeline + +```python +class Pipeline() +``` + +Pipeline brings together building blocks to build a complex search pipeline with Haystack and user-defined components. + +Under the hood, a Pipeline is represented as a directed acyclic graph of component nodes. You can use it for custom query flows with the option to branch queries (for example, extractive question answering and keyword match query), merge candidate documents for a Reader from multiple Retrievers, or re-ranking of candidate documents. + + + +#### Pipeline.root\_node + +```python +@property +def root_node() -> Optional[str] +``` + +Returns the root node of the pipeline's graph. 
+ + + +#### Pipeline.components + +```python +@property +def components() -> Dict[str, BaseComponent] +``` + +Returns all components used by this pipeline. +Note that this also includes such components that are being utilized by other components only and are not being used as a pipeline node directly. + + + +#### Pipeline.to\_code + +```python +def to_code(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = False) -> str +``` + +Returns the code to create this pipeline as string. + +**Arguments**: + +- `pipeline_variable_name`: The variable name of the generated pipeline. +Default value is 'pipeline'. +- `generate_imports`: Whether to include the required import statements into the code. +Default value is True. +- `add_comment`: Whether to add a preceding comment that this code has been generated. +Default value is False. + + + +#### Pipeline.to\_notebook\_cell + +```python +def to_notebook_cell(pipeline_variable_name: str = "pipeline", generate_imports: bool = True, add_comment: bool = True) +``` + +Creates a new notebook cell with the code to create this pipeline. + +**Arguments**: + +- `pipeline_variable_name`: The variable name of the generated pipeline. +Default value is 'pipeline'. +- `generate_imports`: Whether to include the required import statements into the code. +Default value is True. +- `add_comment`: Whether to add a preceding comment that this code has been generated. +Default value is True. + + + +#### Pipeline.load\_from\_deepset\_cloud + +```python +@classmethod +def load_from_deepset_cloud(cls, pipeline_config_name: str, pipeline_name: str = "query", workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, overwrite_with_env_variables: bool = False) +``` + +Load Pipeline from Deepset Cloud defining the individual components and how they're tied together to form + +a Pipeline. 
A single config can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed. + +In order to get a list of all available pipeline_config_names, call `list_pipelines_on_deepset_cloud()`. +Use the returned `name` as `pipeline_config_name`. + +**Arguments**: + +- `pipeline_config_name`: name of the config file inside the Deepset Cloud workspace. +To get a list of all available pipeline_config_names, call `list_pipelines_on_deepset_cloud()`. +- `pipeline_name`: specifies which pipeline to load from config. +Deepset Cloud typically provides a 'query' and a 'index' pipeline per config. +- `workspace`: workspace in Deepset Cloud +- `api_key`: Secret value of the API key. +If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable. +- `api_endpoint`: The URL of the Deepset Cloud API. +If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable. +- `overwrite_with_env_variables`: Overwrite the config with environment variables. For example, +to change return_no_answer param for a FARMReader, an env +variable 'READER_PARAMS_RETURN_NO_ANSWER=False' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. + + + +#### Pipeline.list\_pipelines\_on\_deepset\_cloud + +```python +@classmethod +def list_pipelines_on_deepset_cloud(cls, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None) -> List[dict] +``` + +Lists all pipeline configs available on Deepset Cloud. + +**Arguments**: + +- `workspace`: workspace in Deepset Cloud +- `api_key`: Secret value of the API key. +If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable. +- `api_endpoint`: The URL of the Deepset Cloud API. +If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable. 
+ +Returns: + list of dictionaries: List[dict] + each dictionary: { + "name": str -> `pipeline_config_name` to be used in `load_from_deepset_cloud()`, + "..." -> additional pipeline meta information + } + example: + [{'name': 'my_super_nice_pipeline_config', + 'pipeline_id': '2184e0c1-c6ec-40a1-9b28-5d2768e5efa2', + 'status': 'DEPLOYED', + 'created_at': '2022-02-01T09:57:03.803991+00:00', + 'deleted': False, + 'is_default': False, + 'indexing': {'status': 'IN_PROGRESS', + 'pending_file_count': 3, + 'total_file_count': 31}}] + + + +#### Pipeline.save\_to\_deepset\_cloud + +```python +@classmethod +def save_to_deepset_cloud(cls, query_pipeline: Pipeline, index_pipeline: Pipeline, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, overwrite: bool = False) +``` + +Saves a Pipeline config to Deepset Cloud defining the individual components and how they're tied together to form + +a Pipeline. A single config must declare a query pipeline and a index pipeline. + +**Arguments**: + +- `query_pipeline`: the query pipeline to save. +- `index_pipeline`: the index pipeline to save. +- `pipeline_config_name`: name of the config file inside the Deepset Cloud workspace. +- `workspace`: workspace in Deepset Cloud +- `api_key`: Secret value of the API key. +If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable. +- `api_endpoint`: The URL of the Deepset Cloud API. +If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable. +- `overwrite`: Whether to overwrite the config if it already exists. Otherwise an error is being raised. 
+ + + +#### Pipeline.deploy\_on\_deepset\_cloud + +```python +@classmethod +def deploy_on_deepset_cloud(cls, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, timeout: int = 60, show_curl_message: bool = True) +``` + +Deploys the pipelines of a pipeline config on Deepset Cloud. + +Blocks until pipelines are successfully deployed, deployment failed or timeout exceeds. +If pipelines are already deployed no action will be taken and an info will be logged. +If timeout exceeds a TimeoutError will be raised. +If deployment fails a DeepsetCloudError will be raised. + +Pipeline config must be present on Deepset Cloud. See save_to_deepset_cloud() for more information. + +**Arguments**: + +- `pipeline_config_name`: name of the config file inside the Deepset Cloud workspace. +- `workspace`: workspace in Deepset Cloud +- `api_key`: Secret value of the API key. +If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable. +- `api_endpoint`: The URL of the Deepset Cloud API. +If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable. +- `timeout`: The time in seconds to wait until deployment completes. +If the timeout is exceeded an error will be raised. +- `show_curl_message`: Whether to print an additional message after successful deployment showing how to query the pipeline using curl. + + + +#### Pipeline.undeploy\_on\_deepset\_cloud + +```python +@classmethod +def undeploy_on_deepset_cloud(cls, pipeline_config_name: str, workspace: str = "default", api_key: Optional[str] = None, api_endpoint: Optional[str] = None, timeout: int = 60) +``` + +Undeploys the pipelines of a pipeline config on Deepset Cloud. + +Blocks until pipelines are successfully undeployed, undeployment failed or timeout exceeds. +If pipelines are already undeployed no action will be taken and an info will be logged. +If timeout exceeds a TimeoutError will be raised. 
+If deployment fails a DeepsetCloudError will be raised. + +Pipeline config must be present on Deepset Cloud. See save_to_deepset_cloud() for more information. + +**Arguments**: + +- `pipeline_config_name`: name of the config file inside the Deepset Cloud workspace. +- `workspace`: workspace in Deepset Cloud +- `api_key`: Secret value of the API key. +If not specified, will be read from DEEPSET_CLOUD_API_KEY environment variable. +- `api_endpoint`: The URL of the Deepset Cloud API. +If not specified, will be read from DEEPSET_CLOUD_API_ENDPOINT environment variable. +- `timeout`: The time in seconds to wait until undeployment completes. +If the timeout is exceeded an error will be raised. + + + +#### Pipeline.add\_node + +```python +def add_node(component: BaseComponent, name: str, inputs: List[str]) +``` + +Add a new node to the pipeline. + +**Arguments**: + +- `component`: The object to be called when the data is passed to the node. It can be a Haystack component +(like Retriever, Reader, or Generator) or a user-defined object that implements a run() +method to process incoming data from predecessor node. +- `name`: The name for the node. It must not contain any dots. +- `inputs`: A list of inputs to the node. If the predecessor node has a single outgoing edge, just the name +of node is sufficient. For instance, a 'BM25Retriever' node would always output a single +edge with a list of documents. It can be represented as ["BM25Retriever"]. + +In cases when the predecessor node has multiple outputs, e.g., a "QueryClassifier", the output +must be specified explicitly as "QueryClassifier.output_2". + + + +#### Pipeline.get\_node + +```python +def get_node(name: str) -> Optional[BaseComponent] +``` + +Get a node from the Pipeline. + +**Arguments**: + +- `name`: The name of the node. + + + +#### Pipeline.set\_node + +```python +def set_node(name: str, component) +``` + +Set the component for a node in the Pipeline. + +**Arguments**: + +- `name`: The name of the node. 
+- `component`: The component object to be set at the node. + + + +#### Pipeline.run + +```python +def run(query: Optional[str] = None, file_paths: Optional[List[str]] = None, labels: Optional[MultiLabel] = None, documents: Optional[List[Document]] = None, meta: Optional[Union[dict, List[dict]]] = None, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +Runs the Pipeline, one node at a time. + +**Arguments**: + +- `query`: The search query (for query pipelines only). +- `file_paths`: The files to index (for indexing pipelines only). +- `labels`: Ground-truth labels that you can use to perform an isolated evaluation of pipelines. These labels are input to nodes in the pipeline. +- `documents`: A list of Document objects to be processed by the Pipeline Nodes. +- `meta`: Files' metadata. Used in indexing pipelines in combination with `file_paths`. +- `params`: Dictionary of parameters to be dispatched to the nodes. +To pass a parameter to all Nodes, use: `{"top_k": 10}`. +To pass a parameter to targeted Nodes, run: + `{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}}` +- `debug`: Specifies whether the Pipeline should instruct Nodes to collect debug information +about their execution. By default, this information includes the input parameters +the Nodes received and the output they generated. You can then find all debug information in the dictionary returned by this method under the key `_debug`. + + + +#### Pipeline.run\_batch + +```python +def run_batch(queries: List[str] = None, file_paths: Optional[List[str]] = None, labels: Optional[Union[MultiLabel, List[MultiLabel]]] = None, documents: Optional[Union[List[Document], List[List[Document]]]] = None, meta: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +Runs the Pipeline in a batch mode, one node at a time. The batch mode means that the Pipeline can take more than one query as input. 
You can use this method for query pipelines only. When used with an indexing pipeline, it calls the pipeline `run()` method. + +Here's what this method returns for Retriever-Reader pipelines: +- Single query: Retrieves top-k relevant Documents and returns a list of answers for each retrieved Document. +- A list of queries: Retrieves top-k relevant Documents for each query and returns a list of answers for each query. + +Here's what this method returns for Reader-only pipelines: +- Single query + a list of Documents: Applies the query to each Document individually and returns answers for each single Document. +- Single query + a list of lists of Documents: Applies the query to each list of Documents and returns aggregated answers for each list of Documents. +- A list of queries + a list of Documents: Applies each query to each Document individually and returns answers for each query-document pair. +- A list of queries + a list of lists of Documents: Applies each query to its corresponding Document list and aggregates answers for each list of Documents. + +**Arguments**: + +- `queries`: List of search queries (for query pipelines only). +- `file_paths`: The files to index (for indexing pipelines only). If you provide `file_paths` the Pipeline's `run` method instead of `run_batch` is called. +- `labels`: Ground-truth labels that you can use to perform an isolated evaluation of pipelines. These labels are input to nodes in the pipeline. +- `documents`: A list of Document objects or a list of lists of Document objects to be processed by the Pipeline Nodes. +- `meta`: Files' metadata. Used in indexing pipelines in combination with `file_paths`. +- `params`: Dictionary of parameters to be dispatched to the nodes. +To pass a parameter to all Nodes, use: `{"top_k": 10}`. 
+To pass a parameter to targeted Nodes, run: + `{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}}` +- `debug`: Specifies whether the Pipeline should instruct Nodes to collect debug information +about their execution. By default, this information includes the input parameters +the Nodes received and the output they generated. You can then find all debug information in the dictionary returned by this method under the key `_debug`. + + + +#### Pipeline.eval\_beir + +```python +@classmethod +def eval_beir(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict = {}, query_params: dict = {}, dataset: str = "scifact", dataset_dir: Path = Path("."), top_k_values: List[int] = [1, 3, 5, 10, 100, 1000], keep_index: bool = False) -> Tuple[Dict[str, float], Dict[str, float], Dict[str, float], Dict[str, float]] +``` + +Runs information retrieval evaluation of a pipeline using BEIR on a specified BEIR dataset. + +See https://github.com/beir-cellar/beir for more information. + +**Arguments**: + +- `index_pipeline`: The indexing pipeline to use. +- `query_pipeline`: The query pipeline to evaluate. +- `index_params`: The params to use during indexing (see pipeline.run's params). +- `query_params`: The params to use during querying (see pipeline.run's params). +- `dataset`: The BEIR dataset to use. +- `dataset_dir`: The directory to store the dataset to. +- `top_k_values`: The top_k values each metric will be calculated for. +- `keep_index`: Whether to keep the index after evaluation. +If True the index will be kept after beir evaluation. Otherwise it will be deleted immediately afterwards. + Defaults to False. + +Returns a tuple containing the ncdg, map, recall and precision scores. +Each metric is represented by a dictionary containing the scores for each top_k value. 
+ + + +#### Pipeline.execute\_eval\_run + +```python +@classmethod +def execute_eval_run(cls, index_pipeline: Pipeline, query_pipeline: Pipeline, evaluation_set_labels: List[MultiLabel], corpus_file_paths: List[str], experiment_name: str, experiment_run_name: str, experiment_tracking_tool: Literal["mlflow", None] = None, experiment_tracking_uri: Optional[str] = None, corpus_file_metas: List[Dict[str, Any]] = None, corpus_meta: Dict[str, Any] = {}, evaluation_set_meta: Dict[str, Any] = {}, pipeline_meta: Dict[str, Any] = {}, index_params: dict = {}, query_params: dict = {}, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, reuse_index: bool = False, custom_document_id_field: Optional[str] = None, document_scope: Literal[ + "document_id", + "context", + "document_id_and_context", + "document_id_or_context", + "answer", + "document_id_or_answer", + ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult +``` + +Starts an experiment run that first indexes the specified files (forming a corpus) using the index pipeline + +and subsequently evaluates the query pipeline on the provided labels (forming an evaluation set) using pipeline.eval(). +Parameters and results (metrics and predictions) of the run are tracked by an experiment tracking tool for further analysis. +You can specify the experiment tracking tool by setting the params `experiment_tracking_tool` and `experiment_tracking_uri` +or by passing a (custom) tracking head to Tracker.set_tracking_head(). +Note, that `experiment_tracking_tool` only supports `mlflow` currently. 
+ +For easier comparison you can pass additional metadata regarding corpus (corpus_meta), evaluation set (evaluation_set_meta) and pipelines (pipeline_meta). +E.g. you can give them names or ids to identify them across experiment runs. + +This method executes an experiment run. Each experiment run is part of at least one experiment. +An experiment typically consists of multiple runs to be compared (e.g. using different retrievers in query pipeline). +Experiment tracking tools usually share the same concepts of experiments and provide additional functionality to easily compare runs across experiments. + +E.g. you can call execute_eval_run() multiple times with different retrievers in your query pipeline and compare the runs in mlflow: + +```python + | for retriever_type, query_pipeline in zip(["sparse", "dpr", "embedding"], [sparse_pipe, dpr_pipe, embedding_pipe]): + | eval_result = Pipeline.execute_eval_run( + | index_pipeline=index_pipeline, + | query_pipeline=query_pipeline, + | evaluation_set_labels=labels, + | corpus_file_paths=file_paths, + | corpus_file_metas=file_metas, + | experiment_tracking_tool="mlflow", + | experiment_tracking_uri="http://localhost:5000", + | experiment_name="my-retriever-experiment", + | experiment_run_name=f"run_{retriever_type}", + | pipeline_meta={"name": f"my-pipeline-{retriever_type}"}, + | evaluation_set_meta={"name": "my-evalset"}, + | corpus_meta={"name": "my-corpus"}. + | reuse_index=False + | ) +``` + +**Arguments**: + +- `index_pipeline`: The indexing pipeline to use. +- `query_pipeline`: The query pipeline to evaluate. +- `evaluation_set_labels`: The labels to evaluate on forming an evaluation set. +- `corpus_file_paths`: The files to be indexed and searched during evaluation forming a corpus. +- `experiment_name`: The name of the experiment +- `experiment_run_name`: The name of the experiment run +- `experiment_tracking_tool`: The experiment tracking tool to be used. Currently we only support "mlflow". 
+If left unset the current TrackingHead specified by Tracker.set_tracking_head() will be used.
+- `experiment_tracking_uri`: The uri of the experiment tracking server to be used. Must be specified if experiment_tracking_tool is set.
+You can use deepset's public mlflow server via https://public-mlflow.deepset.ai/.
+Note that artifact logging (e.g. Pipeline YAML or evaluation result CSVs) is currently not allowed on deepset's public mlflow server as this might expose sensitive data.
+- `corpus_file_metas`: The optional metadata to be stored for each corpus file (e.g. title).
+- `corpus_meta`: Metadata about the corpus to track (e.g. name, date, author, version).
+- `evaluation_set_meta`: Metadata about the evalset to track (e.g. name, date, author, version).
+- `pipeline_meta`: Metadata about the pipelines to track (e.g. name, author, version).
+- `index_params`: The params to use during indexing (see pipeline.run's params).
+- `query_params`: The params to use during querying (see pipeline.run's params).
+- `sas_model_name_or_path`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric.
+The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps.
+Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture.
+More info in the paper: https://arxiv.org/abs/2108.06130
+Models:
+- You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data.
+Not all cross encoders can be used because of different return types.
+If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class +- Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" +- Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large" +- Large model for German only: "deepset/gbert-large-sts" +- `sas_batch_size`: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS. +- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity. +Falls back to CPU if no GPU is available. +- `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode. +This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node. +If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance. +If a node's performance is similar in both modes, this node itself needs to be optimized to improve the pipeline's performance. +The isolated evaluation calculates the upper bound of each node's evaluation metrics under the assumption that it received perfect inputs from the previous node. +To this end, labels are used as input to the node instead of the output of the previous node in the pipeline. +The generated dataframes in the EvaluationResult then contain additional rows, which can be distinguished from the integrated evaluation results based on the +values "integrated" or "isolated" in the column "eval_mode" and the evaluation report then additionally lists the upper bound of each node's evaluation metrics. +- `reuse_index`: Whether to reuse existing non-empty index and to keep the index after evaluation. 
+If True the index will be kept after evaluation and no indexing will take place if index has already documents. Otherwise it will be deleted immediately afterwards. +Defaults to False. +- `custom_document_id_field`: Custom field name within `Document`'s `meta` which identifies the document and is being used as criterion for matching documents to labels during evaluation. +This is especially useful if you want to match documents on other criteria (e.g. file names) than the default document ids as these could be heavily influenced by preprocessing. +If not set (default) the `Document`'s `id` is being used as criterion for matching documents to labels. +- `document_scope`: A criterion for deciding whether documents are relevant or not. +You can select between: +- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param. + A typical use case is Document Retrieval. +- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `context_matching_...` params). + A typical use case is Document-Independent Passage Retrieval. +- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match. + A typical use case is Document-Specific Passage Retrieval. +- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match. + A typical use case is Document Retrieval having sparse context labels. +- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically. + A typical use case is Question Answering. +- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match. + This is intended to be a proper default value in order to support both main use cases: + - Document Retrieval + - Question Answering +The default value is 'document_id_or_answer'. 
+- `answer_scope`: Specifies the scope in which a matching answer is considered correct. +You can select between: +- 'any' (default): Any matching answer is considered correct. +- 'context': The answer is only considered correct if its context matches as well. + Uses fuzzy matching (see `context_matching_...` params). +- 'document_id': The answer is only considered correct if its document ID matches as well. + You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param. +- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well. +The default value is 'any'. +In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'. +- `context_matching_min_length`: The minimum string length context and candidate need to have in order to be scored. +Returns 0.0 otherwise. +- `context_matching_boost_split_overlaps`: Whether to boost split overlaps (e.g. [AB] <-> [BC]) that result from different preprocessing params. +If we detect that the score is near a half match and the matching part of the candidate is at its boundaries +we cut the context on the same side, recalculate the score and take the mean of both. +Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total. +- `context_matching_threshold`: Score threshold that candidates must surpass to be included into the result list. 
Range: [0,100] + + + +#### Pipeline.eval + +```python +@send_event +def eval(labels: List[MultiLabel], documents: Optional[List[List[Document]]] = None, params: Optional[dict] = None, sas_model_name_or_path: str = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult +``` + +Evaluates the pipeline by running the pipeline once per query in debug mode + +and putting together all data that is needed for evaluation, e.g. calculating metrics. + +If you want to calculate SAS (Semantic Answer Similarity) metrics, you have to specify `sas_model_name_or_path`. + +You will be able to control the scope within which an answer or a document is considered correct afterwards (See `document_scope` and `answer_scope` params in `EvaluationResult.calculate_metrics()`). +Some of these scopes require additional information that already needs to be specified during `eval()`: +- `custom_document_id_field` param to select a custom document ID from document's meta data for ID matching (only affects 'document_id' scopes) +- `context_matching_...` param to fine-tune the fuzzy matching mechanism that determines whether some text contexts match each other (only affects 'context' scopes, default values should work most of the time) + +**Arguments**: + +- `labels`: The labels to evaluate on +- `documents`: List of List of Document that the first node in the pipeline should get as input per multilabel. Can be used to evaluate a pipeline that consists of a reader without a retriever. +- `params`: Dictionary of parameters to be dispatched to the nodes. 
+If you want to pass a param to all nodes, you can just use: {"top_k":10} +If you want to pass it to targeted nodes, you can do: +{"Retriever": {"top_k": 10}, "Reader": {"top_k": 3, "debug": True}} +- `sas_model_name_or_path`: Name or path of "Semantic Answer Similarity (SAS) model". When set, the model will be used to calculate similarity between predictions and labels and generate the SAS metric. +The SAS metric correlates better with human judgement of correct answers as it does not rely on string overlaps. +Example: Prediction = "30%", Label = "thirty percent", EM and F1 would be overly pessimistic with both being 0, while SAS paints a more realistic picture. +More info in the paper: https://arxiv.org/abs/2108.06130 +Models: +- You can use Bi Encoders (sentence transformers) or cross encoders trained on Semantic Textual Similarity (STS) data. +Not all cross encoders can be used because of different return types. +If you use custom cross encoders please make sure they work with sentence_transformers.CrossEncoder class +- Good default for multiple languages: "sentence-transformers/paraphrase-multilingual-mpnet-base-v2" +- Large, powerful, but slow model for English only: "cross-encoder/stsb-roberta-large" +- Large model for German only: "deepset/gbert-large-sts" +- `sas_batch_size`: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS. +- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity. +Falls back to CPU if no GPU is available. +- `add_isolated_node_eval`: If set to True, in addition to the integrated evaluation of the pipeline, each node is evaluated in isolated evaluation mode. +This mode helps to understand the bottlenecks of a pipeline in terms of output quality of each individual node. +If a node performs much better in the isolated evaluation than in the integrated evaluation, the previous node needs to be optimized to improve the pipeline's performance. 
+If a node's performance is similar in both modes, this node itself needs to be optimized to improve the pipeline's performance. +The isolated evaluation calculates the upper bound of each node's evaluation metrics under the assumption that it received perfect inputs from the previous node. +To this end, labels are used as input to the node instead of the output of the previous node in the pipeline. +The generated dataframes in the EvaluationResult then contain additional rows, which can be distinguished from the integrated evaluation results based on the +values "integrated" or "isolated" in the column "eval_mode" and the evaluation report then additionally lists the upper bound of each node's evaluation metrics. +- `custom_document_id_field`: Custom field name within `Document`'s `meta` which identifies the document and is being used as criterion for matching documents to labels during evaluation. +This is especially useful if you want to match documents on other criteria (e.g. file names) than the default document ids as these could be heavily influenced by preprocessing. +If not set (default) the `Document`'s `id` is being used as criterion for matching documents to labels. +- `context_matching_min_length`: The minimum string length context and candidate need to have in order to be scored. +Returns 0.0 otherwise. +- `context_matching_boost_split_overlaps`: Whether to boost split overlaps (e.g. [AB] <-> [BC]) that result from different preprocessing params. +If we detect that the score is near a half match and the matching part of the candidate is at its boundaries +we cut the context on the same side, recalculate the score and take the mean of both. +Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total. +- `context_matching_threshold`: Score threshold that candidates must surpass to be included into the result list. 
Range: [0,100]
+
+
+
+#### Pipeline.get\_nodes\_by\_class
+
+```python
+def get_nodes_by_class(class_type) -> List[Any]
+```
+
+Gets all nodes in the pipeline that are an instance of a certain class (incl. subclasses).
+
+This is for example helpful if you loaded a pipeline and then want to interact directly with the document store.
+Example:
+| from haystack.document_stores.base import BaseDocumentStore
+| INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME)
+| res = INDEXING_PIPELINE.get_nodes_by_class(class_type=BaseDocumentStore)
+
+**Returns**:
+
+List of components that are an instance of the requested class
+
+
+
+#### Pipeline.get\_document\_store
+
+```python
+def get_document_store() -> Optional[BaseDocumentStore]
+```
+
+Return the document store object used in the current pipeline.
+
+**Returns**:
+
+Instance of DocumentStore or None
+
+
+
+#### Pipeline.draw
+
+```python
+def draw(path: Path = Path("pipeline.png"))
+```
+
+Create a Graphviz visualization of the pipeline.
+
+**Arguments**:
+
+- `path`: the path to save the image.
+
+
+
+#### Pipeline.load\_from\_yaml
+
+```python
+@classmethod
+def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, strict_version_check: bool = False)
+```
+
+Load Pipeline from a YAML file defining the individual components and how they're tied together to form
+
+a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must
+be passed.
+ +Here's a sample configuration: + + ```yaml + | version: '1.0.0' + | + | components: # define all the building-blocks for Pipeline + | - name: MyReader # custom-name for the component; helpful for visualization & debugging + | type: FARMReader # Haystack Class name for the component + | params: + | no_ans_boost: -10 + | model_name_or_path: deepset/roberta-base-squad2 + | - name: MyESRetriever + | type: BM25Retriever + | params: + | document_store: MyDocumentStore # params can reference other components defined in the YAML + | custom_query: null + | - name: MyDocumentStore + | type: ElasticsearchDocumentStore + | params: + | index: haystack_test + | + | pipelines: # multiple Pipelines can be defined using the components from above + | - name: my_query_pipeline # a simple extractive-qa Pipeline + | nodes: + | - name: MyESRetriever + | inputs: [Query] + | - name: MyReader + | inputs: [MyESRetriever] + ``` + +Note that, in case of a mismatch in version between Haystack and the YAML, a warning will be printed. +If the pipeline loads correctly regardless, save again the pipeline using `Pipeline.save_to_yaml()` to remove the warning. + +**Arguments**: + +- `path`: path of the YAML file. +- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set. +- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example, +to change index name param for an ElasticsearchDocumentStore, an env +variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. 
+- `strict_version_check`: whether to fail in case of a version mismatch (throws a warning otherwise) + + + +#### Pipeline.load\_from\_config + +```python +@classmethod +def load_from_config(cls, pipeline_config: Dict, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, strict_version_check: bool = False) +``` + +Load Pipeline from a config dict defining the individual components and how they're tied together to form + +a Pipeline. A single config can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed. + +Here's a sample configuration: + + ```python + | { + | "version": "ignore", + | "components": [ + | { # define all the building-blocks for Pipeline + | "name": "MyReader", # custom-name for the component; helpful for visualization & debugging + | "type": "FARMReader", # Haystack Class name for the component + | "params": {"no_ans_boost": -10, "model_name_or_path": "deepset/roberta-base-squad2"}, + | }, + | { + | "name": "MyESRetriever", + | "type": "BM25Retriever", + | "params": { + | "document_store": "MyDocumentStore", # params can reference other components defined in the YAML + | "custom_query": None, + | }, + | }, + | {"name": "MyDocumentStore", "type": "ElasticsearchDocumentStore", "params": {"index": "haystack_test"}}, + | ], + | "pipelines": [ + | { # multiple Pipelines can be defined using the components from above + | "name": "my_query_pipeline", # a simple extractive-qa Pipeline + | "nodes": [ + | {"name": "MyESRetriever", "inputs": ["Query"]}, + | {"name": "MyReader", "inputs": ["MyESRetriever"]}, + | ], + | } + | ], + | } + ``` + +**Arguments**: + +- `pipeline_config`: the pipeline config as dict +- `pipeline_name`: if the config contains multiple pipelines, the pipeline_name to load must be set. +- `overwrite_with_env_variables`: Overwrite the configuration with environment variables. 
For example, +to change index name param for an ElasticsearchDocumentStore, an env +variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. +- `strict_version_check`: whether to fail in case of a version mismatch (throws a warning otherwise). + + + +#### Pipeline.save\_to\_yaml + +```python +def save_to_yaml(path: Path, return_defaults: bool = False) +``` + +Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_from_yaml()`. + +**Arguments**: + +- `path`: path of the output YAML file. +- `return_defaults`: whether to output parameters that have the default values. + + + +#### Pipeline.get\_config + +```python +def get_config(return_defaults: bool = False) -> dict +``` + +Returns a configuration for the Pipeline that can be used with `Pipeline.load_from_config()`. + +**Arguments**: + +- `return_defaults`: whether to output parameters that have the default values. + + + +#### Pipeline.print\_eval\_report + +```python +def print_eval_report(eval_result: EvaluationResult, n_wrong_examples: int = 3, metrics_filter: Optional[Dict[str, List[str]]] = None, document_scope: Literal[ + "document_id", + "context", + "document_id_and_context", + "document_id_or_context", + "answer", + "document_id_or_answer", + ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", wrong_examples_fields: List[str] = ["answer", "context", "document_id"], max_characters_per_field: int = 150) +``` + +Prints evaluation report containing a metrics funnel and worst queries for further analysis. + +**Arguments**: + +- `eval_result`: The evaluation result, can be obtained by running eval(). +- `n_wrong_examples`: The number of worst queries to show. +- `metrics_filter`: The metrics to show per node. If None all metrics will be shown. +- `document_scope`: A criterion for deciding whether documents are relevant or not. 
+You can select between: +- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param. + A typical use case is Document Retrieval. +- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params). + A typical use case is Document-Independent Passage Retrieval. +- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match. + A typical use case is Document-Specific Passage Retrieval. +- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match. + A typical use case is Document Retrieval having sparse context labels. +- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically. + A typical use case is Question Answering. +- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match. + This is intended to be a proper default value in order to support both main use cases: + - Document Retrieval + - Question Answering +The default value is 'document_id_or_answer'. +- `answer_scope`: Specifies the scope in which a matching answer is considered correct. +You can select between: + - 'any' (default): Any matching answer is considered correct. + - 'context': The answer is only considered correct if its context matches as well. + Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params). + - 'document_id': The answer is only considered correct if its document ID matches as well. + You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param. + - 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well. + The default value is 'any'. 
+ In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'.
+- `wrong_examples_fields`: A list of fields to include in the worst samples.
+- `max_characters_per_field`: The maximum number of characters to include in the worst samples report (per field).
+
+
+
+## \_HaystackBeirRetrieverAdapter
+
+```python
+class _HaystackBeirRetrieverAdapter()
+```
+
+
+
+#### \_HaystackBeirRetrieverAdapter.\_\_init\_\_
+
+```python
+def __init__(index_pipeline: Pipeline, query_pipeline: Pipeline, index_params: dict, query_params: dict)
+```
+
+Adapter mimicking a BEIR retriever used by BEIR's EvaluateRetrieval class to run BEIR evaluations on Haystack Pipelines.
+
+This has nothing to do with Haystack's retriever classes.
+See https://github.com/beir-cellar/beir/blob/main/beir/retrieval/evaluation.py.
+
+**Arguments**:
+
+- `index_pipeline`: The indexing pipeline to use.
+- `query_pipeline`: The query pipeline to evaluate.
+- `index_params`: The params to use during indexing (see pipeline.run's params).
+- `query_params`: The params to use during querying (see pipeline.run's params).
+
+
+
+# Module ray
+
+
+
+## RayPipeline
+
+```python
+class RayPipeline(Pipeline)
+```
+
+[Ray](https://ray.io) is a framework for distributed computing.
+
+With Ray, you can distribute a Pipeline's components across a cluster of machines. The individual components of a
+Pipeline can be independently scaled. For instance, an extractive QA Pipeline deployment can have three replicas
+of the Reader and a single replica for the Retriever. This way, you can use your resources more efficiently by horizontally scaling Components.
+
+To set the number of replicas, add `num_replicas` in the YAML configuration for the node in a pipeline:
+
+ ```yaml
+ | components:
+ | ...
+
+ |
+ | pipelines:
+ | - name: ray_query_pipeline
+ | type: RayPipeline
+ | nodes:
+ | - name: ESRetriever
+ | inputs: [ Query ]
+ | serve_deployment_kwargs:
+ | num_replicas: 2 # number of replicas to create on the Ray cluster
+ ```
+
+A Ray Pipeline can only be created with a YAML Pipeline configuration.
+
+```python
+from haystack.pipeline import RayPipeline
+pipeline = RayPipeline.load_from_yaml(path="my_pipelines.yaml", pipeline_name="my_query_pipeline")
+pipeline.run(query="What is the capital of Germany?")
+```
+
+By default, RayPipelines create an instance of RayServe locally. To connect to an existing Ray instance,
+set the `address` parameter when creating the RayPipeline instance.
+
+YAML definitions of Ray pipelines are validated at load. For more information, see [YAML File Definitions](https://haystack-website-git-fork-fstau-dev-287-search-deepset-overnice.vercel.app/components/pipelines#yaml-file-definitions).
+
+
+
+#### RayPipeline.\_\_init\_\_
+
+```python
+def __init__(address: str = None, ray_args: Optional[Dict[str, Any]] = None, serve_args: Optional[Dict[str, Any]] = None)
+```
+
+**Arguments**:
+
+- `address`: The IP address for the Ray cluster. If set to `None`, a local Ray instance is started.
+- `ray_args`: Optional parameters for initializing Ray.
+- `serve_args`: Optional parameters for initializing Ray Serve.
+
+
+
+#### RayPipeline.load\_from\_yaml
+
+```python
+@classmethod
+def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True, address: Optional[str] = None, strict_version_check: bool = False, ray_args: Optional[Dict[str, Any]] = None, serve_args: Optional[Dict[str, Any]] = None)
+```
+
+Load Pipeline from a YAML file defining the individual components and how they're tied together to form
+
+a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must
+be passed.
+ +Here's a sample configuration: + + ```yaml + | version: '1.0.0' + | + | components: # define all the building-blocks for Pipeline + | - name: MyReader # custom-name for the component; helpful for visualization & debugging + | type: FARMReader # Haystack Class name for the component + | params: + | no_ans_boost: -10 + | model_name_or_path: deepset/roberta-base-squad2 + | - name: MyESRetriever + | type: ElasticsearchRetriever + | params: + | document_store: MyDocumentStore # params can reference other components defined in the YAML + | custom_query: null + | - name: MyDocumentStore + | type: ElasticsearchDocumentStore + | params: + | index: haystack_test + | + | pipelines: # multiple Pipelines can be defined using the components from above + | - name: my_query_pipeline # a simple extractive-qa Pipeline + | type: RayPipeline + | nodes: + | - name: MyESRetriever + | inputs: [Query] + | serve_deployment_kwargs: + | num_replicas: 2 # number of replicas to create on the Ray cluster + | - name: MyReader + | inputs: [MyESRetriever] + ``` + + +Note that, in case of a mismatch in version between Haystack and the YAML, a warning will be printed. +If the pipeline loads correctly regardless, save again the pipeline using `RayPipeline.save_to_yaml()` to remove the warning. + +**Arguments**: + +- `path`: path of the YAML file. +- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set. +- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example, +to change index name param for an ElasticsearchDocumentStore, an env +variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. +- `address`: The IP address for the Ray cluster. If set to None, a local Ray instance is started. +- `serve_args`: Optional parameters for initializing Ray Serve. 
+ + + +## \_RayDeploymentWrapper + +```python +class _RayDeploymentWrapper() +``` + +Ray Serve supports calling of __init__ methods on the Classes to create "deployment" instances. + +In case of Haystack, some Components like Retrievers have complex init methods that needs objects +like Document Stores. + +This wrapper class encapsulates the initialization of Components. Given a Component Class +name, it creates an instance using the YAML Pipeline config. + + + +#### \_RayDeploymentWrapper.\_\_init\_\_ + +```python +def __init__(pipeline_config: dict, component_name: str) +``` + +Create an instance of Component. + +**Arguments**: + +- `pipeline_config`: Pipeline YAML parsed as a dict. +- `component_name`: Component Class name. + + + +#### \_RayDeploymentWrapper.\_\_call\_\_ + +```python +def __call__(*args, **kwargs) +``` + +Ray calls this method which is then re-directed to the corresponding component's run(). + + + +#### \_RayDeploymentWrapper.load\_from\_pipeline\_config + +```python +@staticmethod +def load_from_pipeline_config(pipeline_config: dict, component_name: str) +``` + +Load an individual component from a YAML config for Pipelines. + +**Arguments**: + +- `pipeline_config`: the Pipelines YAML config parsed as a dict. +- `component_name`: the name of the component to load. + + + +# Module standard\_pipelines + + + +## BaseStandardPipeline + +```python +class BaseStandardPipeline(ABC) +``` + +Base class for pre-made standard Haystack pipelines. +This class does not inherit from Pipeline. + + + +#### BaseStandardPipeline.add\_node + +```python +def add_node(component, name: str, inputs: List[str]) +``` + +Add a new node to the pipeline. + +**Arguments**: + +- `component`: The object to be called when the data is passed to the node. It can be a Haystack component +(like Retriever, Reader, or Generator) or a user-defined object that implements a run() +method to process incoming data from predecessor node. +- `name`: The name for the node. 
It must not contain any dots. +- `inputs`: A list of inputs to the node. If the predecessor node has a single outgoing edge, just the name +of node is sufficient. For instance, a 'BM25Retriever' node would always output a single +edge with a list of documents. It can be represented as ["BM25Retriever"]. + +In cases when the predecessor node has multiple outputs, e.g., a "QueryClassifier", the output +must be specified explicitly as "QueryClassifier.output_2". + + + +#### BaseStandardPipeline.get\_node + +```python +def get_node(name: str) +``` + +Get a node from the Pipeline. + +**Arguments**: + +- `name`: The name of the node. + + + +#### BaseStandardPipeline.set\_node + +```python +def set_node(name: str, component) +``` + +Set the component for a node in the Pipeline. + +**Arguments**: + +- `name`: The name of the node. +- `component`: The component object to be set at the node. + + + +#### BaseStandardPipeline.draw + +```python +def draw(path: Path = Path("pipeline.png")) +``` + +Create a Graphviz visualization of the pipeline. + +**Arguments**: + +- `path`: the path to save the image. + + + +#### BaseStandardPipeline.save\_to\_yaml + +```python +def save_to_yaml(path: Path, return_defaults: bool = False) +``` + +Save a YAML configuration for the Pipeline that can be used with `Pipeline.load_from_yaml()`. + +**Arguments**: + +- `path`: path of the output YAML file. +- `return_defaults`: whether to output parameters that have the default values. + + + +#### BaseStandardPipeline.load\_from\_yaml + +```python +@classmethod +def load_from_yaml(cls, path: Path, pipeline_name: Optional[str] = None, overwrite_with_env_variables: bool = True) +``` + +Load Pipeline from a YAML file defining the individual components and how they're tied together to form + +a Pipeline. A single YAML can declare multiple Pipelines, in which case an explicit `pipeline_name` must +be passed. 
+ +Here's a sample configuration: + + ```yaml + | version: '1.0.0' + | + | components: # define all the building-blocks for Pipeline + | - name: MyReader # custom-name for the component; helpful for visualization & debugging + | type: FARMReader # Haystack Class name for the component + | params: + | no_ans_boost: -10 + | model_name_or_path: deepset/roberta-base-squad2 + | - name: MyESRetriever + | type: BM25Retriever + | params: + | document_store: MyDocumentStore # params can reference other components defined in the YAML + | custom_query: null + | - name: MyDocumentStore + | type: ElasticsearchDocumentStore + | params: + | index: haystack_test + | + | pipelines: # multiple Pipelines can be defined using the components from above + | - name: my_query_pipeline # a simple extractive-qa Pipeline + | nodes: + | - name: MyESRetriever + | inputs: [Query] + | - name: MyReader + | inputs: [MyESRetriever] + ``` + +**Arguments**: + +- `path`: path of the YAML file. +- `pipeline_name`: if the YAML contains multiple pipelines, the pipeline_name to load must be set. +- `overwrite_with_env_variables`: Overwrite the YAML configuration with environment variables. For example, +to change index name param for an ElasticsearchDocumentStore, an env +variable 'MYDOCSTORE_PARAMS_INDEX=documents-2021' can be set. Note that an +`_` sign must be used to specify nested hierarchical properties. + + + +#### BaseStandardPipeline.get\_nodes\_by\_class + +```python +def get_nodes_by_class(class_type) -> List[Any] +``` + +Gets all nodes in the pipeline that are an instance of a certain class (incl. subclasses). + +This is for example helpful if you loaded a pipeline and then want to interact directly with the document store. 
+Example: +```python +| from haystack.document_stores.base import BaseDocumentStore +| INDEXING_PIPELINE = Pipeline.load_from_yaml(Path(PIPELINE_YAML_PATH), pipeline_name=INDEXING_PIPELINE_NAME) +| res = INDEXING_PIPELINE.get_nodes_by_class(class_type=BaseDocumentStore) +``` + +**Returns**: + +List of components that are an instance of the requested class + + + +#### BaseStandardPipeline.get\_document\_store + +```python +def get_document_store() -> Optional[BaseDocumentStore] +``` + +Return the document store object used in the current pipeline. + +**Returns**: + +Instance of DocumentStore or None + + + +#### BaseStandardPipeline.eval + +```python +def eval(labels: List[MultiLabel], params: Optional[dict] = None, sas_model_name_or_path: Optional[str] = None, sas_batch_size: int = 32, sas_use_gpu: bool = True, add_isolated_node_eval: bool = False, custom_document_id_field: Optional[str] = None, context_matching_min_length: int = 100, context_matching_boost_split_overlaps: bool = True, context_matching_threshold: float = 65.0) -> EvaluationResult +``` + +Evaluates the pipeline by running the pipeline once per query in debug mode + +and putting together all data that is needed for evaluation, e.g. calculating metrics. + +If you want to calculate SAS (Semantic Answer Similarity) metrics, you have to specify `sas_model_name_or_path`. + +You will be able to control the scope within which an answer or a document is considered correct afterwards (See `document_scope` and `answer_scope` params in `EvaluationResult.calculate_metrics()`). 
+Some of these scopes require additional information that already needs to be specified during `eval()`: +- `custom_document_id_field` param to select a custom document ID from document's meta data for ID matching (only affects 'document_id' scopes) +- `context_matching_...` param to fine-tune the fuzzy matching mechanism that determines whether some text contexts match each other (only affects 'context' scopes, default values should work most of the time) + +**Arguments**: + +- `labels`: The labels to evaluate on +- `params`: Params for the `retriever` and `reader`. For instance, +params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +- `sas_model_name_or_path`: SentenceTransformers semantic textual similarity model to be used for sas value calculation, +should be path or string pointing to downloadable models. +- `sas_batch_size`: Number of prediction label pairs to encode at once by CrossEncoder or SentenceTransformer while calculating SAS. +- `sas_use_gpu`: Whether to use a GPU or the CPU for calculating semantic answer similarity. +Falls back to CPU if no GPU is available. +- `add_isolated_node_eval`: Whether to additionally evaluate the reader based on labels as input instead of output of previous node in pipeline +- `custom_document_id_field`: Custom field name within `Document`'s `meta` which identifies the document and is being used as criterion for matching documents to labels during evaluation. +This is especially useful if you want to match documents on other criteria (e.g. file names) than the default document ids as these could be heavily influenced by preprocessing. +If not set (default) the `Document`'s `id` is being used as criterion for matching documents to labels. +- `context_matching_min_length`: The minimum string length context and candidate need to have in order to be scored. +Returns 0.0 otherwise. +- `context_matching_boost_split_overlaps`: Whether to boost split overlaps (e.g. 
[AB] <-> [BC]) that result from different preprocessing params. +If we detect that the score is near a half match and the matching part of the candidate is at its boundaries +we cut the context on the same side, recalculate the score and take the mean of both. +Thus [AB] <-> [BC] (score ~50) gets recalculated with B <-> B (score ~100) scoring ~75 in total. +- `context_matching_threshold`: Score threshold that candidates must surpass to be included into the result list. Range: [0,100] + + + +#### BaseStandardPipeline.print\_eval\_report + +```python +def print_eval_report(eval_result: EvaluationResult, n_wrong_examples: int = 3, metrics_filter: Optional[Dict[str, List[str]]] = None, document_scope: Literal[ + "document_id", + "context", + "document_id_and_context", + "document_id_or_context", + "answer", + "document_id_or_answer", + ] = "document_id_or_answer", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any", wrong_examples_fields: List[str] = ["answer", "context", "document_id"], max_characters_per_field: int = 150) +``` + +Prints evaluation report containing a metrics funnel and worst queries for further analysis. + +**Arguments**: + +- `eval_result`: The evaluation result, can be obtained by running eval(). +- `n_wrong_examples`: The number of worst queries to show. +- `metrics_filter`: The metrics to show per node. If None all metrics will be shown. +- `document_scope`: A criterion for deciding whether documents are relevant or not. +You can select between: +- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param. + A typical use case is Document Retrieval. +- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params). + A typical use case is Document-Independent Passage Retrieval. 
+- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match. + A typical use case is Document-Specific Passage Retrieval. +- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match. + A typical use case is Document Retrieval having sparse context labels. +- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically. + A typical use case is Question Answering. +- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match. + This is intended to be a proper default value in order to support both main use cases: + - Document Retrieval + - Question Answering +The default value is 'document_id_or_answer'. +- `answer_scope`: Specifies the scope in which a matching answer is considered correct. +You can select between: +- 'any' (default): Any matching answer is considered correct. +- 'context': The answer is only considered correct if its context matches as well. + Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params). +- 'document_id': The answer is only considered correct if its document ID matches as well. + You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param. +- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well. +The default value is 'any'. +In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'. +- `wrong_examples_fields`: A list of field names to include in the worst samples. +- `max_characters_per_field`: The maximum number of characters per wrong example to show (per field). 
+ + + +#### BaseStandardPipeline.run\_batch + +```python +def run_batch(queries: List[str], params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +Run a batch of queries through the pipeline. + +**Arguments**: + +- `queries`: List of query strings. +- `params`: Parameters for the individual nodes of the pipeline. For instance, +`params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}}` +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. By default these include the input parameters +they received and the output they generated. +All debug information can then be found in the dict returned +by this method under the key "_debug" + + + +## ExtractiveQAPipeline + +```python +class ExtractiveQAPipeline(BaseStandardPipeline) +``` + +Pipeline for Extractive Question Answering. + + + +#### ExtractiveQAPipeline.\_\_init\_\_ + +```python +def __init__(reader: BaseReader, retriever: BaseRetriever) +``` + +**Arguments**: + +- `reader`: Reader instance +- `retriever`: Retriever instance + + + +#### ExtractiveQAPipeline.run + +```python +def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +**Arguments**: + +- `query`: The search query string. +- `params`: Params for the `retriever` and `reader`. For instance, +params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. By default these include the input parameters +they received and the output they generated. +All debug information can then be found in the dict returned +by this method under the key "_debug" + + + +## DocumentSearchPipeline + +```python +class DocumentSearchPipeline(BaseStandardPipeline) +``` + +Pipeline for semantic document search. 
+ + + +#### DocumentSearchPipeline.\_\_init\_\_ + +```python +def __init__(retriever: BaseRetriever) +``` + +**Arguments**: + +- `retriever`: Retriever instance + + + +#### DocumentSearchPipeline.run + +```python +def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +**Arguments**: + +- `query`: the query string. +- `params`: params for the `retriever` and `reader`. For instance, params={"Retriever": {"top_k": 10}} +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. By default these include the input parameters +they received and the output they generated. +All debug information can then be found in the dict returned +by this method under the key "_debug" + + + +## GenerativeQAPipeline + +```python +class GenerativeQAPipeline(BaseStandardPipeline) +``` + +Pipeline for Generative Question Answering. + + + +#### GenerativeQAPipeline.\_\_init\_\_ + +```python +def __init__(generator: BaseGenerator, retriever: BaseRetriever) +``` + +**Arguments**: + +- `generator`: Generator instance +- `retriever`: Retriever instance + + + +#### GenerativeQAPipeline.run + +```python +def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +**Arguments**: + +- `query`: the query string. +- `params`: params for the `retriever` and `generator`. For instance, +params={"Retriever": {"top_k": 10}, "Generator": {"top_k": 5}} +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. By default these include the input parameters +they received and the output they generated. +All debug information can then be found in the dict returned +by this method under the key "_debug" + + + +## SearchSummarizationPipeline + +```python +class SearchSummarizationPipeline(BaseStandardPipeline) +``` + +Pipeline that retrieves documents for a query and then summarizes those documents. 
+ + + +#### SearchSummarizationPipeline.\_\_init\_\_ + +```python +def __init__(summarizer: BaseSummarizer, retriever: BaseRetriever, return_in_answer_format: bool = False) +``` + +**Arguments**: + +- `summarizer`: Summarizer instance +- `retriever`: Retriever instance +- `return_in_answer_format`: Whether the results should be returned as documents (False) or in the answer +format used in other QA pipelines (True). With the latter, you can use this +pipeline as a "drop-in replacement" for other QA pipelines. + + + +#### SearchSummarizationPipeline.run + +```python +def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +**Arguments**: + +- `query`: the query string. +- `params`: params for the `retriever` and `summarizer`. For instance, +params={"Retriever": {"top_k": 10}, "Summarizer": {"generate_single_summary": True}} +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. By default these include the input parameters +they received and the output they generated. +All debug information can then be found in the dict returned +by this method under the key "_debug" + + + +#### SearchSummarizationPipeline.run\_batch + +```python +def run_batch(queries: List[str], params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +Run a batch of queries through the pipeline. + +**Arguments**: + +- `queries`: List of query strings. +- `params`: Parameters for the individual nodes of the pipeline. For instance, +`params={"Retriever": {"top_k": 10}, "Summarizer": {"generate_single_summary": True}}` +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. By default these include the input parameters +they received and the output they generated. 
+All debug information can then be found in the dict returned +by this method under the key "_debug" + + + +## FAQPipeline + +```python +class FAQPipeline(BaseStandardPipeline) +``` + +Pipeline for finding similar FAQs using semantic document search. + + + +#### FAQPipeline.\_\_init\_\_ + +```python +def __init__(retriever: BaseRetriever) +``` + +**Arguments**: + +- `retriever`: Retriever instance + + + +#### FAQPipeline.run + +```python +def run(query: str, params: Optional[dict] = None, debug: Optional[bool] = None) +``` + +**Arguments**: + +- `query`: the query string. +- `params`: params for the `retriever`. For instance, params={"Retriever": {"top_k": 10}} +- `debug`: Whether the pipeline should instruct nodes to collect debug information +about their execution. By default these include the input parameters +they received and the output they generated. +All debug information can then be found in the dict returned +by this method under the key "_debug" + + + +## TranslationWrapperPipeline + +```python +class TranslationWrapperPipeline(BaseStandardPipeline) +``` + +Takes an existing search pipeline and adds one "input translation node" after the Query and one +"output translation" node just before returning the results + + + +#### TranslationWrapperPipeline.\_\_init\_\_ + +```python +def __init__(input_translator: BaseTranslator, output_translator: BaseTranslator, pipeline: BaseStandardPipeline) +``` + +Wrap a given `pipeline` with the `input_translator` and `output_translator`. + +**Arguments**: + +- `input_translator`: A Translator node that shall translate the input query from language A to B +- `output_translator`: A Translator node that shall translate the pipeline results from language B to A +- `pipeline`: The pipeline object (e.g. ExtractiveQAPipeline) you want to "wrap". +Note that pipelines with split or merge nodes are currently not supported. 
+
+
+
+## QuestionGenerationPipeline
+
+```python
+class QuestionGenerationPipeline(BaseStandardPipeline)
+```
+
+A simple pipeline that takes documents as input and generates
+questions that it thinks can be answered by the documents.
+
+
+
+## RetrieverQuestionGenerationPipeline
+
+```python
+class RetrieverQuestionGenerationPipeline(BaseStandardPipeline)
+```
+
+A simple pipeline that takes a query as input, performs retrieval, and then generates
+questions that it thinks can be answered by the retrieved documents.
+
+
+
+## QuestionAnswerGenerationPipeline
+
+```python
+class QuestionAnswerGenerationPipeline(BaseStandardPipeline)
+```
+
+This is a pipeline which takes a document as input, generates questions that the model thinks can be answered by
+this document, and then performs question answering on these questions using that single document.
+
+
+
+## MostSimilarDocumentsPipeline
+
+```python
+class MostSimilarDocumentsPipeline(BaseStandardPipeline)
+```
+
+
+
+#### MostSimilarDocumentsPipeline.\_\_init\_\_
+
+```python
+def __init__(document_store: BaseDocumentStore)
+```
+
+Initialize a Pipeline for finding the most similar documents to a given document.
+
+This pipeline can be helpful if you already show a relevant document to your end users and they want to search for just similar ones.
+
+**Arguments**:
+
+- `document_store`: Document Store instance with already stored embeddings.
+
+
+
+#### MostSimilarDocumentsPipeline.run
+
+```python
+def run(document_ids: List[str], top_k: int = 5)
+```
+
+**Arguments**:
+
+- `document_ids`: document ids
+- `top_k`: How many similar documents to return for each input document
+
+
+
+#### MostSimilarDocumentsPipeline.run\_batch
+
+```python
+def run_batch(document_ids: List[str], top_k: int = 5)
+```
+
+**Arguments**:
+
+- `document_ids`: document ids
+- `top_k`: How many similar documents to return for each input document
+
diff --git a/docs/v1.7.0/_src/api/api/preprocessor.md b/docs/v1.7.0/_src/api/api/preprocessor.md
new file mode 100644
index 0000000000..bcdd935344
--- /dev/null
+++ b/docs/v1.7.0/_src/api/api/preprocessor.md
@@ -0,0 +1,110 @@
+
+
+# Module base
+
+
+
+## BasePreProcessor
+
+```python
+class BasePreProcessor(BaseComponent)
+```
+
+
+
+#### BasePreProcessor.process
+
+```python
+@abstractmethod
+def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = True, clean_header_footer: Optional[bool] = False, clean_empty_lines: Optional[bool] = True, remove_substrings: List[str] = [], split_by: Optional[str] = "word", split_length: Optional[int] = 1000, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = True, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Perform document cleaning and splitting. Takes a single Document or a List of Documents as input and returns a
+list of Documents.
+
+
+
+# Module preprocessor
+
+
+
+## PreProcessor
+
+```python
+class PreProcessor(BasePreProcessor)
+```
+
+
+
+#### PreProcessor.\_\_init\_\_
+
+```python
+def __init__(clean_whitespace: bool = True, clean_header_footer: bool = False, clean_empty_lines: bool = True, remove_substrings: List[str] = [], split_by: str = "word", split_length: int = 200, split_overlap: int = 0, split_respect_sentence_boundary: bool = True, tokenizer_model_folder: Optional[Union[str, Path]] = None, language: str = "en", id_hash_keys: Optional[List[str]] = None, progress_bar: bool = True, add_page_number: bool = False)
+```
+
+**Arguments**:
+
+- `clean_header_footer`: Use heuristic to remove footers and headers across different pages by searching
+for the longest common string. This heuristic uses exact matches and therefore
+works well for footers like "Copyright 2019 by XXX", but won't detect "Page 3 of 4"
+or similar.
+- `clean_whitespace`: Strip whitespaces before or after each line in the text.
+- `clean_empty_lines`: Remove more than two empty lines in the text.
+- `remove_substrings`: Remove specified substrings from the text.
+- `split_by`: Unit for splitting the document. Can be "word", "sentence", or "passage". Set to None to disable splitting.
+- `split_length`: Max. number of the above split unit (e.g. words) that are allowed in one document. For instance, if n -> 10 & split_by ->
+"sentence", then each output document will have 10 sentences.
+- `split_overlap`: Word overlap between two adjacent documents after a split.
+Setting this to a positive number essentially enables the sliding window approach.
+For example, if split_by -> `word`,
+split_length -> 5 & split_overlap -> 2, then the splits would be like:
+[w1 w2 w3 w4 w5, w4 w5 w6 w7 w8, w7 w8 w9 w10 w11].
+Set the value to 0 to ensure there is no overlap among the documents after splitting.
+- `split_respect_sentence_boundary`: Whether to split in partial sentences if split_by -> `word`. 
If set
+to True, the individual split will always have complete sentences &
+the number of words will be <= split_length.
+- `language`: The language used by "nltk.tokenize.sent_tokenize" in iso639 format. Available options: "en", "es", "de", "fr" & many more.
+- `tokenizer_model_folder`: Path to the folder containing the NLTK PunktSentenceTokenizer models, if loading a model from a local path. Leave empty otherwise.
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's
+attributes. If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. `"meta"` to this field (e.g. [`"content"`, `"meta"`]).
+In this case the id will be generated by using the content and the defined metadata.
+- `progress_bar`: Whether to show a progress bar.
+- `add_page_number`: Add the number of the page a paragraph occurs in to the Document's meta
+field `"page"`. Page boundaries are determined by `"\f"` character which is added
+in between pages by `PDFToTextConverter`, `TikaConverter`, `ParsrConverter` and
+`AzureConverter`.
+
+
+
+#### PreProcessor.process
+
+```python
+def process(documents: Union[dict, Document, List[Union[dict, Document]]], clean_whitespace: Optional[bool] = None, clean_header_footer: Optional[bool] = None, clean_empty_lines: Optional[bool] = None, remove_substrings: List[str] = [], split_by: Optional[str] = None, split_length: Optional[int] = None, split_overlap: Optional[int] = None, split_respect_sentence_boundary: Optional[bool] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Perform document cleaning and splitting. Can take a single document or a list of documents as input and returns a list of documents.
+ + + +#### PreProcessor.clean + +```python +def clean(document: Union[dict, Document], clean_whitespace: bool, clean_header_footer: bool, clean_empty_lines: bool, remove_substrings: List[str], id_hash_keys: Optional[List[str]] = None) -> Document +``` + +Perform document cleaning on a single document and return a single document. This method will deal with whitespaces, headers, footers +and empty lines. Its exact functionality is defined by the parameters passed into PreProcessor.__init__(). + + + +#### PreProcessor.split + +```python +def split(document: Union[dict, Document], split_by: str, split_length: int, split_overlap: int, split_respect_sentence_boundary: bool, id_hash_keys: Optional[List[str]] = None) -> List[Document] +``` + +Perform document splitting on a single document. This method can split on different units, at different lengths, +with different strides. It can also respect sentence boundaries. Its exact functionality is defined by +the parameters passed into PreProcessor.__init__(). Takes a single document as input and returns a list of documents. + diff --git a/docs/v1.7.0/_src/api/api/primitives.md b/docs/v1.7.0/_src/api/api/primitives.md new file mode 100644 index 0000000000..8be8ee27e3 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/primitives.md @@ -0,0 +1,563 @@ + + +# Module schema + + + +## Document + +```python +@dataclass +class Document() +``` + + + +#### Document.\_\_init\_\_ + +```python +def __init__(content: Union[str, pd.DataFrame], content_type: Literal["text", "table", "image", "audio"] = "text", id: Optional[str] = None, score: Optional[float] = None, meta: Dict[str, Any] = None, embedding: Optional[np.ndarray] = None, id_hash_keys: Optional[List[str]] = None) +``` + +One of the core data classes in Haystack. It's used to represent documents / passages in a standardized way within Haystack. 
+
+Documents are stored in DocumentStores, are returned by Retrievers, are the input for Readers and are used in
+many other places that manipulate or interact with document-level data.
+
+Note: There can be multiple Documents originating from one file (e.g. PDF), if you split the text
+into smaller passages. We'll have one Document per passage in this case.
+
+Each document has a unique ID. This can be supplied by the user or generated automatically.
+It's particularly helpful for handling of duplicates and referencing documents in other objects (e.g. Labels)
+
+There's an easy option to convert from/to dicts via `from_dict()` and `to_dict`.
+
+**Arguments**:
+
+- `content`: Content of the document. For most cases, this will be text, but it can be a table or image.
+- `content_type`: One of "text", "table" or "image". Haystack components can use this to adjust their
+handling of Documents and check compatibility.
+- `id`: Unique ID for the document. If not supplied by the user, we'll generate one automatically by
+creating a hash from the supplied text. This behaviour can be further adjusted by `id_hash_keys`.
+- `score`: The relevance score of the Document determined by a model (e.g. Retriever or Re-Ranker).
+If model's `scale_score` was set to True (default) score is in the unit interval (range of [0,1]), where 1 means extremely relevant.
+- `meta`: Meta fields for a document like name, url, or author in the form of a custom dict (any keys and values allowed).
+- `embedding`: Vector encoding of the text
+- `id_hash_keys`: Generate the document id from a custom list of strings that refer to the document's attributes.
+If you want to ensure you don't have duplicate documents in your DocumentStore but texts are
+not unique, you can modify the metadata and pass e.g. "meta" to this field (e.g. ["content", "meta"]).
+In this case the id will be generated by using the content and the defined metadata.
+
+
+
+#### Document.to\_dict
+
+```python
+def to_dict(field_map={}) -> Dict
+```
+
+Convert Document to dict. An optional field_map can be supplied to change the names of the keys in the
+
+resulting dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
+they are serialized / stored in other places (e.g. elasticsearch)
+Example:
+| doc = Document(content="some text", content_type="text")
+| doc.to_dict(field_map={"custom_content_field": "content"})
+| >>> {"custom_content_field": "some text", "content_type": "text"}
+
+**Arguments**:
+
+- `field_map`: Dict with keys being the custom target keys and values being the standard Document attributes
+
+**Returns**:
+
+dict with content of the Document
+
+
+
+#### Document.from\_dict
+
+```python
+@classmethod
+def from_dict(cls, dict: Dict[str, Any], field_map: Dict[str, Any] = {}, id_hash_keys: Optional[List[str]] = None) -> Document
+```
+
+Create Document from dict. An optional field_map can be supplied to adjust for custom names of the keys in the
+
+input dict. This way you can work with standardized Document objects in Haystack, but adjust the format that
+they are serialized / stored in other places (e.g. elasticsearch)
+Example:
+| my_dict = {"custom_content_field": "some text", "content_type": "text"}
+| Document.from_dict(my_dict, field_map={"custom_content_field": "content"})
+
+**Arguments**:
+
+- `field_map`: Dict with keys being the custom target keys and values being the standard Document attributes
+
+**Returns**:
+
+dict with content of the Document
+
+
+
+#### Document.\_\_lt\_\_
+
+```python
+def __lt__(other)
+```
+
+Enable sorting of Documents by score
+
+
+
+## SpeechDocument
+
+```python
+@dataclass
+class SpeechDocument(Document)
+```
+
+Text-based document that also contains some accessory audio information
+(either generated from the text with text to speech nodes, or extracted
+from an audio source containing spoken words).
+
+Note: for documents of this type the primary information source is *text*,
+so this is _not_ an audio document. The embeddings are computed on the textual
+representation and will work with regular, text-based nodes and pipelines.
+
+
+
+## Span
+
+```python
+@dataclass
+class Span()
+```
+
+
+
+#### end
+
+Defining a sequence of characters (Text span) or cells (Table span) via start and end index.
+
+For extractive QA: Character where answer starts/ends
+For TableQA: Cell where the answer starts/ends (counted from top left to bottom right of table)
+
+**Arguments**:
+
+- `start`: Position where the span starts
+- `end`: Position where the span ends
+
+
+
+## Answer
+
+```python
+@dataclass
+class Answer()
+```
+
+
+
+#### meta
+
+The fundamental object in Haystack to represent any type of Answers (e.g. extractive QA, generative QA or TableQA).
+
+For example, it's used within some Nodes like the Reader, but also in the REST API.
+
+**Arguments**:
+
+- `answer`: The answer string. If there's no possible answer (aka "no_answer" or "is_impossible") this will be an empty string.
+- `type`: One of ("generative", "extractive", "other"): Whether this answer comes from an extractive model
+(i.e. we can locate an exact answer string in one of the documents) or from a generative model
+(i.e. no pointer to a specific document, no offsets ...).
+- `score`: The relevance score of the Answer determined by a model (e.g. Reader or Generator).
+In the range of [0,1], where 1 means extremely relevant.
+- `context`: The related content that was used to create the answer (i.e. a text passage, part of a table, image ...)
+- `offsets_in_document`: List of `Span` objects with start and end positions of the answer **in the
+document** (as stored in the document store). 
+For extractive QA: Character where answer starts => `Answer.offsets_in_document[0].start`
+For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_document[0].start`
+(Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
+- `offsets_in_context`: List of `Span` objects with start and end positions of the answer **in the
+context** (i.e. the surrounding text/table of a certain window size).
+For extractive QA: Character where answer starts => `Answer.offsets_in_context[0].start`
+For TableQA: Cell where the answer starts (counted from top left to bottom right of table) => `Answer.offsets_in_context[0].start`
+(Note that in TableQA there can be multiple cell ranges that are relevant for the answer, thus there can be multiple `Spans` here)
+- `document_id`: ID of the document that the answer was located in (if any)
+- `meta`: Dict that can be used to associate any kind of custom meta data with the answer.
+In extractive QA, this will carry the meta data of the document where the answer was found.
+
+
+
+#### Answer.\_\_lt\_\_
+
+```python
+def __lt__(other)
+```
+
+Enable sorting of Answers by score
+
+
+
+## SpeechAnswer
+
+```python
+@dataclass
+class SpeechAnswer(Answer)
+```
+
+Text-based answer that also contains some accessory audio information
+(either generated from the text with text to speech nodes, or extracted
+from an audio source containing spoken words).
+
+Note: for answer of this type the primary information source is *text*,
+so this is _not_ an audio document. The embeddings are computed on the textual
+representation and will work with regular, text-based nodes and pipelines.
+
+
+
+## Label
+
+```python
+@dataclass
+class Label()
+```
+
+
+
+#### Label.\_\_init\_\_
+
+```python
+def __init__(query: str, document: Document, is_correct_answer: bool, is_correct_document: bool, origin: Literal["user-feedback", "gold-label"], answer: Optional[Answer], id: Optional[str] = None, no_answer: Optional[bool] = None, pipeline_id: Optional[str] = None, created_at: Optional[str] = None, updated_at: Optional[str] = None, meta: Optional[dict] = None, filters: Optional[dict] = None)
+```
+
+Object used to represent label/feedback in a standardized way within Haystack.
+
+This includes labels from datasets like SQuAD, annotations from labeling tools,
+or user feedback from the Haystack REST API.
+
+**Arguments**:
+
+- `query`: the question (or query) for finding answers.
+- `document`: 
+- `answer`: the answer object.
+- `is_correct_answer`: whether the sample is positive or negative.
+- `is_correct_document`: in case of negative sample(is_correct_answer is False), there could be two cases;
+incorrect answer but correct document & incorrect document. This flag denotes if
+the returned document was correct.
+- `origin`: the source for the labels. It can be used later for filtering.
+- `id`: Unique ID used within the DocumentStore. If not supplied, a uuid will be generated automatically.
+- `no_answer`: whether the question is unanswerable.
+- `pipeline_id`: pipeline identifier (any str) that was involved for generating this label (in-case of user feedback).
+- `created_at`: Timestamp of creation with format yyyy-MM-dd HH:mm:ss.
+Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S").
+- `updated_at`: Timestamp of update with format yyyy-MM-dd HH:mm:ss.
+Generate in Python via time.strftime("%Y-%m-%d %H:%M:%S")
+- `meta`: Meta fields like "annotator_name" in the form of a custom dict (any keys and values allowed).
+- `filters`: filters that should be applied to the query to rule out non-relevant documents. 
For example, if there are different correct answers
+in a DocumentStore depending on the retrieved document and the answer in this label is correct only on condition of the filters.
+
+
+
+## MultiLabel
+
+```python
+@dataclass
+class MultiLabel()
+```
+
+
+
+#### MultiLabel.\_\_init\_\_
+
+```python
+def __init__(labels: List[Label], drop_negative_labels=False, drop_no_answers=False, **kwargs)
+```
+
+There are often multiple `Labels` associated with a single query. For example, there can be multiple annotated
+
+answers for one question or multiple documents contain the information you want for a query.
+This class is "syntactic sugar" that simplifies the work with such a list of related Labels.
+It stores the original labels in MultiLabel.labels and provides additional aggregated attributes that are
+automatically created at init time. For example, MultiLabel.no_answer allows you to easily access if any of the
+underlying Labels provided a text answer and therefore demonstrates that there is indeed a possible answer.
+
+**Arguments**:
+
+- `labels`: A list of labels that belong to a similar query and shall be "grouped" together
+- `drop_negative_labels`: Whether to drop negative labels from that group (e.g. thumbs down feedback from UI)
+- `drop_no_answers`: Whether to drop labels that specify the answer is impossible
+- `kwargs`: All additional attributes are ignored. This is just a workaround to enable smooth `to_dict()`-`from_dict()`-(de)serialization.
+
+
+
+## EvaluationResult
+
+```python
+class EvaluationResult()
+```
+
+
+
+#### EvaluationResult.\_\_init\_\_
+
+```python
+def __init__(node_results: Dict[str, pd.DataFrame] = None) -> None
+```
+
+A convenience class to store, pass, and interact with results of a pipeline evaluation run (for example `pipeline.eval()`).
+
+Detailed results are stored as one dataframe per node. This class makes them more accessible and provides
+convenience methods to work with them.
+For example, you can calculate eval metrics, get detailed reports, or simulate different top_k settings: + +```python +| eval_results = pipeline.eval(...) +| +| # derive detailed metrics +| eval_results.calculate_metrics() +| +| # show summary of incorrect queries +| eval_results.wrong_examples() +``` + +Each row of the underlying DataFrames contains either an answer or a document that has been retrieved during evaluation. +Rows are enriched with basic information like rank, query, type, or node. +Additional answer or document-specific evaluation information, like gold labels +and metrics showing whether the row matches the gold labels, are included, too. +The DataFrames have the following schema: +- multilabel_id: The ID of the multilabel, which is unique for the pair of query and filters. +- query: The actual query string. +- filters: The filters used with the query. +- gold_answers (answers only): The expected answers. +- answer (answers only): The actual answer. +- context: The content of the document (the surrounding context of the answer for QA). +- exact_match (answers only): A metric showing if the answer exactly matches the gold label. +- f1 (answers only): A metric showing how well the answer overlaps with the gold label on a token basis. +- sas (answers only, optional): A metric showing how well the answer matches the gold label on a semantic basis. +- exact_match_context_scope (answers only): exact_match with enforced context match. +- f1_context_scope (answers only): f1 with enforced context scope match. +- sas_context_scope (answers only): sas with enforced context scope match. +- exact_match_document_scope (answers only): exact_match with enforced document scope match. +- f1_document_scope (answers only): f1 with enforced document scope match. +- sas_document_scope (answers only): sas with enforced document scope match. +- exact_match_document_id_and_context_scope: (answers only): exact_match with enforced document and context scope match. 
+- f1_document_id_and_context_scope (answers only): f1 with enforced document and context scope match. +- sas_document_id_and_context_scope (answers only): sas with enforced document and context scope match. +- gold_contexts: The contents of the gold documents. +- gold_id_match (documents only): A metric showing whether one of the gold document IDs matches the document. +- context_match (documents only): A metric showing whether one of the gold contexts matches the document content. +- answer_match (documents only): A metric showing whether the document contains the answer. +- gold_id_or_answer_match (documents only): A Boolean operation specifying that there should be either `'gold_id_match' OR 'answer_match'`. +- gold_id_and_answer_match (documents only): A Boolean operation specifying that there should be both `'gold_id_match' AND 'answer_match'`. +- gold_id_or_context_match (documents only): A Boolean operation specifying that there should be either `'gold_id_match' OR 'context_match'`. +- gold_id_and_context_match (documents only): A Boolean operation specifying that there should be both `'gold_id_match' AND 'context_match'`. +- gold_id_and_context_and_answer_match (documents only): A Boolean operation specifying that there should be `'gold_id_match' AND 'context_match' AND 'answer_match'`. +- context_and_answer_match (documents only): A Boolean operation specifying that there should be both `'context_match' AND 'answer_match'`. +- rank: A rank or 1-based-position in the result list. +- document_id: The ID of the document that has been retrieved or that contained the answer. +- gold_document_ids: The IDs of the documents to be retrieved. +- custom_document_id: The custom ID of the document (specified by `custom_document_id_field`) that has been retrieved or that contained the answer. +- gold_custom_document_ids: The custom documents IDs (specified by `custom_document_id_field`) to be retrieved. 
+- offsets_in_document (answers only): The position or offsets within the document where the answer was found. +- gold_offsets_in_documents (answers only): The position or offsets of the gold answer within the document. +- gold_answers_exact_match (answers only): exact_match values per gold_answer. +- gold_answers_f1 (answers only): f1 values per gold_answer. +- gold_answers_sas (answers only): sas values per gold answer. +- gold_documents_id_match: The document ID match per gold label (if `custom_document_id_field` has been specified, custom IDs are used). +- gold_contexts_similarity: Context similarity per gold label. +- gold_answers_match (documents only): Specifies whether the document contains an answer per gold label. +- type: Possible values: 'answer' or 'document'. +- node: The node name +- eval_mode: Specifies whether the evaluation was executed in integrated or isolated mode. + Check pipeline.eval()'s add_isolated_node_eval parameter for more information. + +**Arguments**: + +- `node_results`: The evaluation Dataframes per pipeline node. + + + +#### EvaluationResult.calculate\_metrics + +```python +def calculate_metrics(simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, document_scope: Literal[ + "document_id", + "context", + "document_id_and_context", + "document_id_or_context", + "answer", + "document_id_or_answer", + ] = "document_id_or_answer", eval_mode: Literal["integrated", "isolated"] = "integrated", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any") -> Dict[str, Dict[str, float]] +``` + +Calculates proper metrics for each node. + +For Nodes that return Documents, the default metrics are: +- mrr (`Mean Reciprocal Rank `_) +- map (`Mean Average Precision `_) +- ndcg (`Normalized Discounted Cumulative Gain `_) +- precision (Precision: How many of the returned documents were relevant?) 
+- recall_multi_hit (Recall according to Information Retrieval definition: How many of the relevant documents were retrieved per query?) +- recall_single_hit (Recall for Question Answering: How many of the queries returned at least one relevant document?) + +For Nodes that return answers, the default metrics are: +- exact_match (How many of the queries returned the exact answer?) +- f1 (How well do the returned results overlap with any gold answer on a token basis?) +- sas, if a SAS model has been provided when calling `pipeline.eval()` (How semantically similar is the prediction to the gold answers?) + +During the eval run, you can simulate lower top_k values for Reader and Retriever than the actual values. +For example, you can calculate `top_1_f1` for Reader nodes by setting `simulated_top_k_reader=1`. + +If you applied `simulated_top_k_retriever` to a Reader node, you should treat the results with caution as they can differ from an actual eval run with a corresponding `top_k_retriever` heavily. + +**Arguments**: + +- `simulated_top_k_reader`: Simulates the `top_k` parameter of the Reader. +- `simulated_top_k_retriever`: Simulates the `top_k` parameter of the Retriever. +Note: There might be a discrepancy between simulated Reader metrics and an actual Pipeline run with Retriever `top_k`. +- `eval_mode`: The input the Node was evaluated on. +Usually a Node gets evaluated on the prediction provided by its predecessor Nodes in the Pipeline (`value='integrated'`). +However, as the quality of the Node can heavily depend on the Node's input and thus the predecessor's quality, +you might want to simulate a perfect predecessor in order to get an independent upper bound of the quality of your Node. +For example, when evaluating the Reader, use `value='isolated'` to simulate a perfect Retriever in an ExtractiveQAPipeline. +Possible values are: `integrated`, `isolated`. +The default value is `integrated`. 
+- `document_scope`: A criterion for deciding whether documents are relevant or not. +You can select between: +- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param. + A typical use case is Document Retrieval. +- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params). + A typical use case is Document-Independent Passage Retrieval. +- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match. + A typical use case is Document-Specific Passage Retrieval. +- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match. + A typical use case is Document Retrieval having sparse context labels. +- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically. + A typical use case is Question Answering. +- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match. + This is intended to be a proper default value in order to support both main use cases: + - Document Retrieval + - Question Answering +The default value is 'document_id_or_answer'. +- `answer_scope`: Specifies the scope in which a matching answer is considered correct. +You can select between: +- 'any' (default): Any matching answer is considered correct. +- 'context': The answer is only considered correct if its context matches as well. + Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params). +- 'document_id': The answer is only considered correct if its document ID matches as well. + You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param. 
+- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well. +The default value is 'any'. +In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'. + + + +#### EvaluationResult.wrong\_examples + +```python +def wrong_examples(node: str, n: int = 3, simulated_top_k_reader: int = -1, simulated_top_k_retriever: int = -1, document_scope: Literal[ + "document_id", + "context", + "document_id_and_context", + "document_id_or_context", + "answer", + "document_id_or_answer", + ] = "document_id_or_answer", document_metric: str = "recall_single_hit", answer_metric: str = "f1", document_metric_threshold: float = 0.5, answer_metric_threshold: float = 0.5, eval_mode: Literal["integrated", "isolated"] = "integrated", answer_scope: Literal["any", "context", "document_id", "document_id_and_context"] = "any") -> List[Dict] +``` + +Returns the worst performing queries. + +Worst performing queries are calculated based on the metric +that is either a document metric or an answer metric according to the node type. + +Lower top_k values for reader and retriever than the actual values during the eval run can be simulated. +See calculate_metrics() for more information. + +**Arguments**: + +- `simulated_top_k_reader`: simulates top_k param of reader +- `simulated_top_k_retriever`: simulates top_k param of retriever. +remarks: there might be a discrepancy between simulated reader metrics and an actual pipeline run with retriever top_k +- `document_metric`: the document metric worst queries are calculated with. +values can be: 'recall_single_hit', 'recall_multi_hit', 'mrr', 'map', 'precision' +- `answer_metric`: the answer metric worst queries are calculated with. +values can be: 'f1', 'exact_match' and 'sas' if the evaluation was made using a SAS model. 
+- `document_metric_threshold`: the threshold for the document metric (only samples below selected metric
+threshold will be considered)
+- `answer_metric_threshold`: the threshold for the answer metric (only samples below selected metric
+threshold will be considered)
+- `eval_mode`: the input on which the node was evaluated.
+Usually nodes get evaluated on the prediction provided by their predecessor nodes in the pipeline (value='integrated').
+However, as the quality of the node itself can heavily depend on the node's input and thus the predecessor's quality,
+you might want to simulate a perfect predecessor in order to get an independent upper bound of the quality of your node.
+For example when evaluating the reader use value='isolated' to simulate a perfect retriever in an ExtractiveQAPipeline.
+Values can be 'integrated', 'isolated'.
+Default value is 'integrated'.
+- `document_scope`: A criterion for deciding whether documents are relevant or not.
+You can select between:
+- 'document_id': Specifies that the document ID must match. You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param.
+ A typical use case is Document Retrieval.
+- 'context': Specifies that the content of the document must match. Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params).
+ A typical use case is Document-Independent Passage Retrieval.
+- 'document_id_and_context': A Boolean operation specifying that both `'document_id' AND 'context'` must match.
+ A typical use case is Document-Specific Passage Retrieval.
+- 'document_id_or_context': A Boolean operation specifying that either `'document_id' OR 'context'` must match.
+ A typical use case is Document Retrieval having sparse context labels.
+- 'answer': Specifies that the document contents must include the answer. The selected `answer_scope` is enforced automatically.
+ A typical use case is Question Answering.
+- 'document_id_or_answer' (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match. + This is intended to be a proper default value in order to support both main use cases: + - Document Retrieval + - Question Answering +The default value is 'document_id_or_answer'. +- `answer_scope`: Specifies the scope in which a matching answer is considered correct. +You can select between: +- 'any' (default): Any matching answer is considered correct. +- 'context': The answer is only considered correct if its context matches as well. + Uses fuzzy matching (see `pipeline.eval()`'s `context_matching_...` params). +- 'document_id': The answer is only considered correct if its document ID matches as well. + You can specify a custom document ID through `pipeline.eval()`'s `custom_document_id_field` param. +- 'document_id_and_context': The answer is only considered correct if its document ID and its context match as well. +The default value is 'any'. +In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to 'answer' or 'document_id_or_answer'. + + + +#### EvaluationResult.save + +```python +def save(out_dir: Union[str, Path], **to_csv_kwargs) +``` + +Saves the evaluation result. + +The result of each node is saved in a separate csv with file name {node_name}.csv to the out_dir folder. + +**Arguments**: + +- `out_dir`: Path to the target folder the csvs will be saved. +- `to_csv_kwargs`: kwargs to be passed to pd.DataFrame.to_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_csv.html. +This method uses different default values than pd.DataFrame.to_csv() for the following parameters: +index=False, quoting=csv.QUOTE_NONNUMERIC (to avoid problems with \r chars) + + + +#### EvaluationResult.load + +```python +@classmethod +def load(cls, load_dir: Union[str, Path], **read_csv_kwargs) +``` + +Loads the evaluation result from disk. 
Expects one csv file per node. See save() for further information. + +**Arguments**: + +- `load_dir`: The directory containing the csv files. +- `read_csv_kwargs`: kwargs to be passed to pd.read_csv(). See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html. +This method uses different default values than pd.read_csv() for the following parameters: +header=0, converters=CONVERTERS +where CONVERTERS is a dictionary mapping all array typed columns to ast.literal_eval. + diff --git a/docs/v1.7.0/_src/api/api/pseudo_label_generator.md b/docs/v1.7.0/_src/api/api/pseudo_label_generator.md new file mode 100644 index 0000000000..53cba941b9 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/pseudo_label_generator.md @@ -0,0 +1,172 @@ + + +# Module pseudo\_label\_generator + + + +## PseudoLabelGenerator + +```python +class PseudoLabelGenerator(BaseComponent) +``` + +PseudoLabelGenerator is a component that creates Generative Pseudo Labeling (GPL) training data for the +training of dense retrievers. + +GPL is an unsupervised domain adaptation method for the training of dense retrievers. It is based on question +generation and pseudo labelling with powerful cross-encoders. To train a domain-adapted model, it needs access +to an unlabeled target corpus, usually through DocumentStore and a Retriever to mine for negatives. + +For more details, see [GPL](https://github.com/UKPLab/gpl). + +For example: + + +```python +| document_store = DocumentStore(...) +| retriever = Retriever(...) 
+| qg = QuestionGenerator(model_name_or_path="doc2query/msmarco-t5-base-v1")
+| plg = PseudoLabelGenerator(qg, retriever)
+| output, output_id = plg.run(documents=document_store.get_all_documents())
+|
+```
+
+**Notes**:
+
+
+ While the NLP researchers trained the default question
+ [generation](https://huggingface.co/doc2query/msmarco-t5-base-v1) and the cross
+ [encoder](https://huggingface.co/cross-encoder/ms-marco-MiniLM-L-6-v2) models on
+ the English language corpus, we can also use the language-specific question generation and
+ cross-encoder models in the target language of our choice to apply GPL to documents in languages
+ other than English.
+
+ As of this writing, the German language question
+ [generation](https://huggingface.co/ml6team/mt5-small-german-query-generation) and the cross
+ [encoder](https://huggingface.co/ml6team/cross-encoder-mmarco-german-distilbert-base) models are
+ already available, as well as question [generation](https://huggingface.co/doc2query/msmarco-14langs-mt5-base-v1)
+ and the cross [encoder](https://huggingface.co/cross-encoder/mmarco-mMiniLMv2-L12-H384-v1)
+ models trained on fourteen languages.
+
+
+
+#### PseudoLabelGenerator.\_\_init\_\_
+
+```python
+def __init__(question_producer: Union[QuestionGenerator, List[Dict[str, str]]], retriever: BaseRetriever, cross_encoder_model_name_or_path: str = "cross-encoder/ms-marco-MiniLM-L-6-v2", max_questions_per_document: int = 3, top_k: int = 50, batch_size: int = 16, progress_bar: bool = True)
+```
+
+Loads the cross-encoder model and prepares PseudoLabelGenerator.
+
+**Arguments**:
+
+- `question_producer` (`Union[QuestionGenerator, List[Dict[str, str]]]`): The question producer used to generate questions or a list of already produced
+questions/document pairs in a Dictionary format {"question": "question text ...", "document": "document text ..."}.
+- `retriever` (`BaseRetriever`): The Retriever used to query document stores.
+- `cross_encoder_model_name_or_path` (`str (optional)`): The path to the cross encoder model, defaults to +`cross-encoder/ms-marco-MiniLM-L-6-v2`. +- `max_questions_per_document` (`int`): The max number of questions generated per document, defaults to 3. +- `top_k` (`int (optional)`): The number of answers retrieved for each question, defaults to 50. +- `batch_size` (`int (optional)`): The number of documents to process at a time. +- `progress_bar` (`bool (optional)`): Whether to show a progress bar, defaults to True. + + + +#### PseudoLabelGenerator.generate\_questions + +```python +def generate_questions(documents: List[Document], batch_size: Optional[int] = None) -> List[Dict[str, str]] +``` + +It takes a list of documents and generates a list of question-document pairs. + +**Arguments**: + +- `documents` (`List[Document]`): A list of documents to generate questions from. +- `batch_size` (`Optional[int]`): The number of documents to process at a time. + +**Returns**: + +A list of question-document pairs. + + + +#### PseudoLabelGenerator.mine\_negatives + +```python +def mine_negatives(question_doc_pairs: List[Dict[str, str]], batch_size: Optional[int] = None) -> List[Dict[str, str]] +``` + +Given a list of question and positive document pairs, this function returns a list of question/positive document/negative document + +dictionaries. + +**Arguments**: + +- `question_doc_pairs` (`List[Dict[str, str]]`): A list of question/positive document pairs. +- `batch_size` (`int (optional)`): The number of queries to run in a batch. + +**Returns**: + +A list of dictionaries, where each dictionary contains the question, positive document, +and negative document. 
+ + + +#### PseudoLabelGenerator.generate\_margin\_scores + +```python +def generate_margin_scores(mined_negatives: List[Dict[str, str]], batch_size: Optional[int] = None) -> List[Dict] +``` + +Given a list of mined negatives, this function predicts the score margin between the positive and negative document using + +the cross-encoder. + +The function returns a list of examples, where each example is a dictionary with the following keys: + +* question: The question string. +* pos_doc: Positive document string (the document containing the answer). +* neg_doc: Negative document string (the document that doesn't contain the answer). +* score: The margin between the score for question-positive document pair and the score for question-negative document pair. + +**Arguments**: + +- `mined_negatives` (`List[Dict[str, str]]`): The list of mined negatives. +- `batch_size` (`int (optional)`): The number of mined negative lists to run in a batch. + +**Returns**: + +A list of dictionaries, each of which has the following keys: +- question: The question string +- pos_doc: Positive document string +- neg_doc: Negative document string +- score: The score margin + + + +#### PseudoLabelGenerator.generate\_pseudo\_labels + +```python +def generate_pseudo_labels(documents: List[Document], batch_size: Optional[int] = None) -> Tuple[dict, str] +``` + +Given a list of documents, this function generates a list of question-document pairs, mines for negatives, and + +scores a positive/negative margin with cross-encoder. The output is the training data for the +adaptation of dense retriever models. + +**Arguments**: + +- `documents` (`List[Document]`): List[Document] = The list of documents to mine negatives from. +- `batch_size` (`Optional[int]`): The number of documents to process in a batch. + +**Returns**: + +A dictionary with a single key 'gpl_labels' representing a list of dictionaries, where each +dictionary contains the following keys: +- question: The question string. 
+- pos_doc: Positive document for the given question. +- neg_doc: Negative document for the given question. +- score: The margin between the score for question-positive document pair and the score for question-negative document pair. + diff --git a/docs/v1.7.0/_src/api/api/query_classifier.md b/docs/v1.7.0/_src/api/api/query_classifier.md new file mode 100644 index 0000000000..b92d8fa7cf --- /dev/null +++ b/docs/v1.7.0/_src/api/api/query_classifier.md @@ -0,0 +1,163 @@ + + +# Module base + + + +## BaseQueryClassifier + +```python +class BaseQueryClassifier(BaseComponent) +``` + +Abstract class for Query Classifiers + + + +# Module sklearn + + + +## SklearnQueryClassifier + +```python +class SklearnQueryClassifier(BaseQueryClassifier) +``` + +A node to classify an incoming query into one of two categories using a lightweight sklearn model. Depending on the result, the query flows to a different branch in your pipeline +and the further processing can be customized. You can define this by connecting the further pipeline to either `output_1` or `output_2` from this node. + +**Example**: + + ```python + |{ + |pipe = Pipeline() + |pipe.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"]) + |pipe.add_node(component=elastic_retriever, name="ElasticRetriever", inputs=["QueryClassifier.output_2"]) + |pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) + + |# Keyword queries will use the ElasticRetriever + |pipe.run("kubernetes aws") + + |# Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever + |pipe.run("How to manage kubernetes on aws") + + ``` + + Models: + + Pass your own `Sklearn` binary classification model or use one of the following pretrained ones: + 1) Keywords vs. 
Questions/Statements (Default) + query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle) + query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle) + output_1 => question/statement + output_2 => keyword query + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) + + + 2) Questions vs. Statements + query_classifier can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle) + query_vectorizer can be found [here](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle) + output_1 => question + output_2 => statement + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt) + + See also the [tutorial](https://haystack.deepset.ai/tutorials/pipelines) on pipelines. + + + +#### SklearnQueryClassifier.\_\_init\_\_ + +```python +def __init__(model_name_or_path: Union[ + str, Any + ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", vectorizer_name_or_path: Union[ + str, Any + ] = "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle", batch_size: Optional[int] = None, progress_bar: bool = True) +``` + +**Arguments**: + +- `model_name_or_path`: Gradient boosting based binary classifier to classify between keyword vs statement/question +queries or statement vs question queries. +- `vectorizer_name_or_path`: A ngram based Tfidf vectorizer for extracting features from query. +- `batch_size`: Number of queries to process at a time. +- `progress_bar`: Whether to show a progress bar. 
+ + + +# Module transformers + + + +## TransformersQueryClassifier + +```python +class TransformersQueryClassifier(BaseQueryClassifier) +``` + +A node to classify an incoming query into categories using a transformer model. +Depending on the result, the query flows to a different branch in your pipeline and the further processing +can be customized. You can define this by connecting the further pipeline to `output_1`, `output_2`, ..., `output_n` +from this node. +This node also supports zero-shot-classification. + +**Example**: + + ```python + |{ + |pipe = Pipeline() + |pipe.add_node(component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"]) + |pipe.add_node(component=elastic_retriever, name="ElasticRetriever", inputs=["QueryClassifier.output_2"]) + |pipe.add_node(component=dpr_retriever, name="DPRRetriever", inputs=["QueryClassifier.output_1"]) + + |# Keyword queries will use the ElasticRetriever + |pipe.run("kubernetes aws") + + |# Semantic queries (questions, statements, sentences ...) will leverage the DPR retriever + |pipe.run("How to manage kubernetes on aws") + + ``` + + Models: + + Pass your own `Transformer` classification/zero-shot-classification model from file/huggingface or use one of the following + pretrained ones hosted on Huggingface: + 1) Keywords vs. Questions/Statements (Default) + model_name_or_path="shahrukhx01/bert-mini-finetune-question-detection" + output_1 => question/statement + output_2 => keyword query + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/readme.txt) + + + 2) Questions vs. Statements + `model_name_or_path`="shahrukhx01/question-vs-statement-classifier" + output_1 => question + output_2 => statement + [Readme](https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/readme.txt) + + + See also the [tutorial](https://haystack.deepset.ai/tutorials/pipelines) on pipelines. 
+ + + +#### TransformersQueryClassifier.\_\_init\_\_ + +```python +def __init__(model_name_or_path: Union[Path, str] = "shahrukhx01/bert-mini-finetune-question-detection", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, task: str = "text-classification", labels: List[str] = DEFAULT_LABELS, batch_size: int = 16, progress_bar: bool = True) +``` + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model, for example 'shahrukhx01/bert-mini-finetune-question-detection'. +See [Hugging Face models](https://huggingface.co/models) for a full list of available models. +- `model_version`: The version of the model to use from the Hugging Face model hub. This can be a tag name, a branch name, or a commit hash. +- `tokenizer`: The name of the tokenizer (usually the same as model). +- `use_gpu`: Whether to use GPU (if available). +- `task`: Specifies the type of classification. Possible values: 'text-classification' or 'zero-shot-classification'. +- `labels`: If the task is 'text-classification' and an ordered list of labels is provided, the first label corresponds to output_1, +the second label to output_2, and so on. The labels must match the model labels; only the order can differ. +If the task is 'zero-shot-classification', these are the candidate labels. +- `batch_size`: The number of queries to be processed at a time. +- `progress_bar`: Whether to show a progress bar. + diff --git a/docs/v1.7.0/_src/api/api/question_generator.md b/docs/v1.7.0/_src/api/api/question_generator.md new file mode 100644 index 0000000000..3e00a211d3 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/question_generator.md @@ -0,0 +1,57 @@ + + +# Module question\_generator + + + +## QuestionGenerator + +```python +class QuestionGenerator(BaseComponent) +``` + +The Question Generator takes only a document as input and outputs questions that it thinks can be +answered by this document. 
In our current implementation, input texts are split into chunks of 50 words +with a 10 word overlap. This is because the default model `valhalla/t5-base-e2e-qg` seems to generate only +about 3 questions per passage regardless of length. Our approach prioritizes the creation of more questions +over processing efficiency (T5 is able to digest much more than 50 words at once). The returned questions +generally come in an order dictated by the order of their answers i.e. early questions in the list generally +come from earlier in the document. + + + +#### QuestionGenerator.\_\_init\_\_ + +```python +def __init__(model_name_or_path="valhalla/t5-base-e2e-qg", model_version=None, num_beams=4, max_length=256, no_repeat_ngram_size=3, length_penalty=1.5, early_stopping=True, split_length=50, split_overlap=10, use_gpu=True, prompt="generate questions:", num_queries_per_doc=1, sep_token: str = "", batch_size: int = 16, progress_bar: bool = True) +``` + +Uses the valhalla/t5-base-e2e-qg model by default. This class supports any question generation model that is + +implemented as a Seq2SeqLM in HuggingFace Transformers. Note that this style of question generation (where the only input +is a document) is sometimes referred to as end-to-end question generation. Answer-supervised question +generation is not currently supported. + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. "valhalla/t5-base-e2e-qg". +See https://huggingface.co/models for full list of available models. +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `use_gpu`: Whether to use GPU or the CPU. Falls back on CPU if no GPU is available. +- `batch_size`: Number of documents to process at a time. 
+ + + +#### QuestionGenerator.generate\_batch + +```python +def generate_batch(texts: Union[List[str], List[List[str]]], batch_size: Optional[int] = None) -> Union[List[List[str]], List[List[List[str]]]] +``` + +Generates questions for a list of strings or a list of lists of strings. + +**Arguments**: + +- `texts`: List of str or list of list of str. +- `batch_size`: Number of texts to process at a time. + diff --git a/docs/v1.7.0/_src/api/api/ranker.md b/docs/v1.7.0/_src/api/api/ranker.md new file mode 100644 index 0000000000..afb6744612 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/ranker.md @@ -0,0 +1,172 @@ + + +# Module base + + + +## BaseRanker + +```python +class BaseRanker(BaseComponent) +``` + + + +#### BaseRanker.timing + +```python +def timing(fn, attr_name) +``` + +Wrapper method used to time functions. + + + +#### BaseRanker.eval + +```python +def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False) -> dict +``` + +Performs evaluation of the Ranker. + +Ranker is evaluated in the same way as a Retriever based on whether it finds the correct document given the query string and at which +position in the ranking of documents the correct document is. + +| Returns a dict containing the following metrics: + + - "recall": Proportion of questions for which correct document is among retrieved documents + - "mrr": Mean of reciprocal rank. Rewards retrievers that give relevant documents a higher rank. + Only considers the highest ranked relevant document. + - "map": Mean of average precision for each question. Rewards retrievers that give relevant + documents a higher rank. Considers all retrieved relevant documents. If ``open_domain=True``, + average precision is normalized by the number of retrieved relevant documents per query. + If ``open_domain=False``, average precision is normalized by the number of all relevant documents + per query. 
+ +**Arguments**: + +- `label_index`: Index/Table in DocumentStore where labeled questions are stored +- `doc_index`: Index/Table in DocumentStore where documents that are used for evaluation are stored +- `top_k`: How many documents to return per query +- `open_domain`: If ``True``, retrieval will be evaluated by checking if the answer string to a question is +contained in the retrieved docs (common approach in open-domain QA). +If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids +are within ids explicitly stated in the labels. +- `return_preds`: Whether to add predictions in the returned dictionary. If True, the returned dictionary +contains the keys "predictions" and "metrics". + + + +# Module sentence\_transformers + + + +## SentenceTransformersRanker + +```python +class SentenceTransformersRanker(BaseRanker) +``` + +Sentence Transformer based pre-trained Cross-Encoder model for Document Re-ranking (https://huggingface.co/cross-encoder). +Re-Ranking can be used on top of a retriever to boost the performance for document search. This is particularly useful if the retriever has a high recall but is bad in sorting the documents by relevance. + +SentenceTransformerRanker handles Cross-Encoder models + - use a single logit as similarity score e.g. cross-encoder/ms-marco-MiniLM-L-12-v2 + - use two output logits (no_answer, has_answer) e.g. 
deepset/gbert-base-germandpr-reranking +https://www.sbert.net/docs/pretrained-models/ce-msmarco.html#usage-with-transformers + +| With a SentenceTransformersRanker, you can: + - directly get predictions via predict() + +Usage example: + +```python +| retriever = BM25Retriever(document_store=document_store) +| ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2") +| p = Pipeline() +| p.add_node(component=retriever, name="ESRetriever", inputs=["Query"]) +| p.add_node(component=ranker, name="Ranker", inputs=["ESRetriever"]) +``` + + + +#### SentenceTransformersRanker.\_\_init\_\_ + +```python +def __init__(model_name_or_path: Union[str, Path], model_version: Optional[str] = None, top_k: int = 10, use_gpu: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, batch_size: int = 16, scale_score: bool = True, progress_bar: bool = True) +``` + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. +'cross-encoder/ms-marco-MiniLM-L-12-v2'. +See https://huggingface.co/cross-encoder for full list of available models +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `top_k`: The maximum number of documents to return +- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. +- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +The strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). +- `batch_size`: Number of documents to process at a time. +- `scale_score`: The raw predictions will be transformed using a Sigmoid activation function in case the model +only predicts a single label. For multi-label predictions, no scaling is applied. 
Set this +to False if you do not want any scaling of the raw predictions. +- `progress_bar`: Whether to show a progress bar while processing the documents. + + + +#### SentenceTransformersRanker.predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> List[Document] +``` + +Use loaded ranker model to re-rank the supplied list of Document. + +Returns list of Document sorted by (desc.) similarity with the query. + +**Arguments**: + +- `query`: Query string +- `documents`: List of Document to be re-ranked +- `top_k`: The maximum number of documents to return + +**Returns**: + +List of Document + + + +#### SentenceTransformersRanker.predict\_batch + +```python +def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]] +``` + +Use loaded ranker model to re-rank the supplied lists of Documents. + +Returns lists of Documents sorted by (desc.) similarity with the corresponding queries. + + +- If you provide a list containing a single query... + + - ... and a single list of Documents, the single list of Documents will be re-ranked based on the + supplied query. + - ... and a list of lists of Documents, each list of Documents will be re-ranked individually based on the + supplied query. + + +- If you provide a list of multiple queries... + + - ... you need to provide a list of lists of Documents. Each list of Documents will be re-ranked based on + its corresponding query. + +**Arguments**: + +- `queries`: Single query string or list of queries +- `documents`: Single list of Documents or list of lists of Documents to be reranked. +- `top_k`: The maximum number of documents to return per Document list. +- `batch_size`: Number of Documents to process at a time. 
+ diff --git a/docs/v1.7.0/_src/api/api/reader.md b/docs/v1.7.0/_src/api/api/reader.md new file mode 100644 index 0000000000..7627affa30 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/reader.md @@ -0,0 +1,914 @@ + + +# Module base + + + +## BaseReader + +```python +class BaseReader(BaseComponent) +``` + + + +#### BaseReader.timing + +```python +def timing(fn, attr_name) +``` + +Wrapper method used to time functions. + + + +# Module farm + + + +## FARMReader + +```python +class FARMReader(BaseReader) +``` + +Transformer based model for extractive Question Answering using the FARM framework (https://github.com/deepset-ai/FARM). +While the underlying model can vary (BERT, Roberta, DistilBERT, ...), the interface remains the same. + +| With a FARMReader, you can: + + - directly get predictions via predict() + - fine-tune the model on QA data via train() + + + +#### FARMReader.\_\_init\_\_ + +```python +def __init__(model_name_or_path: str, model_version: Optional[str] = None, context_window_size: int = 150, batch_size: int = 50, use_gpu: bool = True, devices: List[torch.device] = [], no_ans_boost: float = 0.0, return_no_answer: bool = False, top_k: int = 10, top_k_per_candidate: int = 3, top_k_per_sample: int = 1, num_processes: Optional[int] = None, max_seq_len: int = 256, doc_stride: int = 128, progress_bar: bool = True, duplicate_filtering: int = 0, use_confidence_scores: bool = True, confidence_threshold: Optional[float] = None, proxies: Optional[Dict[str, str]] = None, local_files_only=False, force_download=False, use_auth_token: Optional[Union[str, bool]] = None) +``` + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bert-base-cased', +'deepset/bert-base-cased-squad2', 'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'. +See https://huggingface.co/models for full list of available models. +- `model_version`: The version of model to use from the HuggingFace model hub. 
Can be tag name, branch name, or commit hash. +- `context_window_size`: The size, in characters, of the window around the answer span that is used when +displaying the context around the answer. +- `batch_size`: Number of samples the model receives in one batch for inference. +Memory consumption is much lower in inference mode. Recommendation: Increase the batch size +to a value so only a single batch is used. +- `use_gpu`: Whether to use GPUs or the CPU. Falls back on CPU if no GPU is available. +- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]). +Unused if `use_gpu` is False. +- `no_ans_boost`: How much the no_answer logit is boosted/increased. +If set to 0 (default), the no_answer logit is not changed. +If a negative number, there is a lower chance of "no_answer" being predicted. +If a positive number, there is an increased chance of "no_answer" +- `return_no_answer`: Whether to include no_answer predictions in the results. +- `top_k`: The maximum number of answers to return +- `top_k_per_candidate`: How many answers to extract for each candidate doc that is coming from the retriever (might be a long text). +Note that this is not the number of "final answers" you will receive +(see `top_k` in FARMReader.predict() or Finder.get_answers() for that) +and that FARM includes no_answer in the sorted list of predictions. +- `top_k_per_sample`: How many answers to extract from each small text passage that the model can process at once +(one "candidate doc" is usually split into many smaller "passages"). +You usually want a very small value here, as it slows down inference +and you don't gain much of quality by having multiple answers from one passage. +Note that this is not the number of "final answers" you will receive +(see `top_k` in FARMReader.predict() or Finder.get_answers() for that) +and that FARM includes no_answer in the sorted list of predictions. 
+- `num_processes`: The number of processes for `multiprocessing.Pool`. Set to value of 0 to disable +multiprocessing. Set to None to let Inferencer determine optimum number. If you +want to debug the Language Model, you might need to disable multiprocessing! +- `max_seq_len`: Max sequence length of one input text for the model +- `doc_stride`: Length of striding window for splitting long texts (used if ``len(text) > max_seq_len``) +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `duplicate_filtering`: Answers are filtered based on their position. Both start and end position of the answers are considered. +The higher the value, answers that are more apart are filtered out. 0 corresponds to exact duplicates. -1 turns off duplicate removal. +- `use_confidence_scores`: Determines the type of score that is used for ranking a predicted answer. +`True` => a scaled confidence / relevance score between [0, 1]. +This score can also be further calibrated on your dataset via self.eval() +(see https://haystack.deepset.ai/components/reader#confidence-scores). +`False` => an unscaled, raw score [-inf, +inf] which is the sum of start and end logit +from the model for the predicted span. +Using confidence scores can change the ranking of no_answer compared to using the +unscaled raw scores. +- `confidence_threshold`: Filters out predictions below confidence_threshold. Value should be between 0 and 1. Disabled by default. +- `proxies`: Dict of proxy servers to use for downloading external models. Example: {'http': 'some.proxy:1234', 'http://hostname': 'my.proxy:3111'} +- `local_files_only`: Whether to force checking for local files only (and forbid downloads) +- `force_download`: Whether fo force a (re-)download even if the model exists locally in the cache. +- `use_auth_token`: API token used to download private models from Huggingface. 
If this parameter is set to `True`, +the local token will be used, which must be previously created via `transformer-cli login`. +Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained + + + +#### FARMReader.train + +```python +def train(data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 2, learning_rate: float = 1e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), grad_acc_steps: int = 1) +``` + +Fine-tune a model on a QA dataset. Options: + +- Take a plain language model (e.g. `bert-base-cased`) and train it for QA (e.g. on SQuAD data) +- Take a QA model (e.g. `deepset/bert-base-cased-squad2`) and fine-tune it for your domain (e.g. using your labels collected via the haystack annotation tool) + +Checkpoints can be stored via setting `checkpoint_every` to a custom number of steps. +If any checkpoints are stored, a subsequent run of train() will resume training from the latest available checkpoint. + +**Arguments**: + +- `data_dir`: Path to directory containing your training data in SQuAD style +- `train_filename`: Filename of training data +- `dev_filename`: Filename of dev / eval data +- `test_filename`: Filename of test data +- `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here +that gets split off from training data for eval. 
+- `use_gpu`: Whether to use GPU (if available) +- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]). +Unused if `use_gpu` is False. +- `batch_size`: Number of samples the model receives in one batch for training +- `n_epochs`: Number of iterations on the whole training data set +- `learning_rate`: Learning rate of the optimizer +- `max_seq_len`: Maximum text length (in tokens). Everything longer gets cut down. +- `warmup_proportion`: Proportion of training steps until maximum learning rate is reached. +Until that point LR is increasing linearly. After that it's decreasing again linearly. +Options for different schedules are available in FARM. +- `evaluate_every`: Evaluate the model every X steps on the hold-out eval dataset +- `save_dir`: Path to store the final model +- `num_processes`: The number of processes for `multiprocessing.Pool` during preprocessing. +Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from train set. +Set to None to use all CPU cores minus one. +- `use_amp`: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster the model. +Available options: +None (Don't use AMP) +"O0" (Normal FP32 training) +"O1" (Mixed Precision => Recommended) +"O2" (Almost FP16) +"O3" (Pure FP16). +See details on: https://nvidia.github.io/apex/amp.html +- `checkpoint_root_dir`: the Path of directory where all train checkpoints are saved. For each individual +checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created. +- `checkpoint_every`: save a train checkpoint after this many steps of training. +- `checkpoints_to_keep`: maximum number of train checkpoints to save. +- `caching`: whether or not to use caching for preprocessed dataset +- `cache_path`: Path to cache the preprocessed dataset +- `processor`: The processor to use for preprocessing. 
If None, the default SquadProcessor is used. +- `grad_acc_steps`: The number of steps to accumulate gradients for before performing a backward pass. + +**Returns**: + +None + + + +#### FARMReader.distil\_prediction\_layer\_from + +```python +def distil_prediction_layer_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], student_batch_size: int = 10, teacher_batch_size: Optional[int] = None, n_epochs: int = 2, learning_rate: float = 3e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss_weight: float = 0.5, distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "kl_div", temperature: float = 1.0, grad_acc_steps: int = 1) +``` + +Fine-tune a model on a QA dataset using logit-based distillation. You need to provide a teacher model that is already finetuned on the dataset + +and a student model that will be trained using the teacher's logits. The idea of this is to increase the accuracy of a lightweight student model. +using a more complex teacher. 
+Originally proposed in: https://arxiv.org/pdf/1503.02531.pdf +This can also be considered as the second stage of distillation finetuning as described in the TinyBERT paper: +https://arxiv.org/pdf/1909.10351.pdf +**Example** +```python +student = FARMReader(model_name_or_path="prajjwal1/bert-medium") +teacher = FARMReader(model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2") +student.distil_prediction_layer_from(teacher, data_dir="squad2", train_filename="train.json", test_filename="dev.json", + learning_rate=3e-5, distillation_loss_weight=1.0, temperature=5) +``` + +Checkpoints can be stored via setting `checkpoint_every` to a custom number of steps. +If any checkpoints are stored, a subsequent run of train() will resume training from the latest available checkpoint. + +**Arguments**: + +- `teacher_model`: Model whose logits will be used to improve accuracy +- `data_dir`: Path to directory containing your training data in SQuAD style +- `train_filename`: Filename of training data +- `dev_filename`: Filename of dev / eval data +- `test_filename`: Filename of test data +- `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here +that gets split off from training data for eval. +- `use_gpu`: Whether to use GPU (if available) +- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]). +Unused if `use_gpu` is False. +- `student_batch_size`: Number of samples the student model receives in one batch for training +- `student_batch_size`: Number of samples the teacher model receives in one batch for distillation +- `n_epochs`: Number of iterations on the whole training data set +- `learning_rate`: Learning rate of the optimizer +- `max_seq_len`: Maximum text length (in tokens). Everything longer gets cut down. +- `warmup_proportion`: Proportion of training steps until maximum learning rate is reached. 
+Until that point LR is increasing linearly. After that it's decreasing again linearly. +Options for different schedules are available in FARM. +- `evaluate_every`: Evaluate the model every X steps on the hold-out eval dataset +- `save_dir`: Path to store the final model +- `num_processes`: The number of processes for `multiprocessing.Pool` during preprocessing. +Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from train set. +Set to None to use all CPU cores minus one. +- `use_amp`: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster the model. +Available options: +None (Don't use AMP) +"O0" (Normal FP32 training) +"O1" (Mixed Precision => Recommended) +"O2" (Almost FP16) +"O3" (Pure FP16). +See details on: https://nvidia.github.io/apex/amp.html +- `checkpoint_root_dir`: the Path of directory where all train checkpoints are saved. For each individual +checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created. +- `checkpoint_every`: save a train checkpoint after this many steps of training. +- `checkpoints_to_keep`: maximum number of train checkpoints to save. +- `caching`: whether or not to use caching for preprocessed dataset and teacher logits +- `cache_path`: Path to cache the preprocessed dataset and teacher logits +- `distillation_loss_weight`: The weight of the distillation loss. A higher weight means the teacher outputs are more important. +- `distillation_loss`: Specifies how teacher and model logits should be compared. Can either be a string ("mse" for mean squared error or "kl_div" for kl divergence loss) or a callable loss function (needs to have named parameters student_logits and teacher_logits) +- `temperature`: The temperature for distillation. A higher temperature will result in less certainty of teacher outputs. A lower temperature means more certainty. A temperature of 1.0 does not change the certainty of the model. 
+- `tinybert_loss`: Whether to use the TinyBERT loss function for distillation. This requires the student to be a TinyBERT model and the teacher to be a finetuned version of bert-base-uncased. +- `tinybert_epochs`: Number of epochs to train the student model with the TinyBERT loss function. After this many epochs, the student model is trained with the regular distillation loss function. +- `tinybert_learning_rate`: Learning rate to use when training the student model with the TinyBERT loss function. +- `tinybert_train_filename`: Filename of training data to use when training the student model with the TinyBERT loss function. To best follow the original paper, this should be an augmented version of the training data created using the augment_squad.py script. If not specified, the training data from the original training is used. +- `processor`: The processor to use for preprocessing. If None, the default SquadProcessor is used. +- `grad_acc_steps`: The number of steps to accumulate gradients for before performing a backward pass. 
+ +**Returns**: + +None + + + +#### FARMReader.distil\_intermediate\_layers\_from + +```python +def distil_intermediate_layers_from(teacher_model: "FARMReader", data_dir: str, train_filename: str, dev_filename: Optional[str] = None, test_filename: Optional[str] = None, use_gpu: Optional[bool] = None, devices: List[torch.device] = [], batch_size: int = 10, n_epochs: int = 5, learning_rate: float = 5e-5, max_seq_len: Optional[int] = None, warmup_proportion: float = 0.2, dev_split: float = 0, evaluate_every: int = 300, save_dir: Optional[str] = None, num_processes: Optional[int] = None, use_amp: str = None, checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3, caching: bool = False, cache_path: Path = Path("cache/data_silo"), distillation_loss: Union[str, Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = "mse", temperature: float = 1.0, processor: Optional[Processor] = None, grad_acc_steps: int = 1) +``` + +The first stage of distillation finetuning as described in the TinyBERT paper: + +https://arxiv.org/pdf/1909.10351.pdf +**Example** +```python +student = FARMReader(model_name_or_path="prajjwal1/bert-medium") +teacher = FARMReader(model_name_or_path="huawei-noah/TinyBERT_General_6L_768D") +student.distil_intermediate_layers_from(teacher, data_dir="squad2", train_filename="train.json", test_filename="dev.json", + learning_rate=3e-5, distillation_loss_weight=1.0, temperature=5) +``` + +Checkpoints can be stored via setting `checkpoint_every` to a custom number of steps. +If any checkpoints are stored, a subsequent run of train() will resume training from the latest available checkpoint. + +**Arguments**: + +- `teacher_model`: Model whose logits will be used to improve accuracy +- `data_dir`: Path to directory containing your training data in SQuAD style +- `train_filename`: Filename of training data. 
To best follow the original paper, this should be an augmented version of the training data created using the augment_squad.py script +- `dev_filename`: Filename of dev / eval data +- `test_filename`: Filename of test data +- `dev_split`: Instead of specifying a dev_filename, you can also specify a ratio (e.g. 0.1) here +that gets split off from training data for eval. +- `use_gpu`: Whether to use GPU (if available) +- `devices`: List of GPU devices to limit inference to certain GPUs and not use all available ones (e.g. [torch.device('cuda:0')]). +Unused if `use_gpu` is False. +- `student_batch_size`: Number of samples the student model receives in one batch for training +- `student_batch_size`: Number of samples the teacher model receives in one batch for distillation +- `n_epochs`: Number of iterations on the whole training data set +- `learning_rate`: Learning rate of the optimizer +- `max_seq_len`: Maximum text length (in tokens). Everything longer gets cut down. +- `warmup_proportion`: Proportion of training steps until maximum learning rate is reached. +Until that point LR is increasing linearly. After that it's decreasing again linearly. +Options for different schedules are available in FARM. +- `evaluate_every`: Evaluate the model every X steps on the hold-out eval dataset +- `save_dir`: Path to store the final model +- `num_processes`: The number of processes for `multiprocessing.Pool` during preprocessing. +Set to value of 1 to disable multiprocessing. When set to 1, you cannot split away a dev set from train set. +Set to None to use all CPU cores minus one. +- `use_amp`: Optimization level of NVIDIA's automatic mixed precision (AMP). The higher the level, the faster the model. +Available options: +None (Don't use AMP) +"O0" (Normal FP32 training) +"O1" (Mixed Precision => Recommended) +"O2" (Almost FP16) +"O3" (Pure FP16). 
+See details on: https://nvidia.github.io/apex/amp.html +- `checkpoint_root_dir`: the Path of directory where all train checkpoints are saved. For each individual +checkpoint, a subdirectory with the name epoch_{epoch_num}_step_{step_num} is created. +- `checkpoint_every`: save a train checkpoint after this many steps of training. +- `checkpoints_to_keep`: maximum number of train checkpoints to save. +- `caching`: whether or not to use caching for preprocessed dataset and teacher logits +- `cache_path`: Path to cache the preprocessed dataset and teacher logits +- `distillation_loss_weight`: The weight of the distillation loss. A higher weight means the teacher outputs are more important. +- `distillation_loss`: Specifies how teacher and model logits should be compared. Can either be a string ("mse" for mean squared error or "kl_div" for kl divergence loss) or a callable loss function (needs to have named parameters student_logits and teacher_logits) +- `temperature`: The temperature for distillation. A higher temperature will result in less certainty of teacher outputs. A lower temperature means more certainty. A temperature of 1.0 does not change the certainty of the model. +- `processor`: The processor to use for preprocessing. If None, the default SquadProcessor is used. +- `grad_acc_steps`: The number of steps to accumulate gradients for before performing a backward pass. + +**Returns**: + +None + + + +#### FARMReader.update\_parameters + +```python +def update_parameters(context_window_size: Optional[int] = None, no_ans_boost: Optional[float] = None, return_no_answer: Optional[bool] = None, max_seq_len: Optional[int] = None, doc_stride: Optional[int] = None) +``` + +Hot update parameters of a loaded Reader. It may not to be safe when processing concurrent requests. + + + +#### FARMReader.save + +```python +def save(directory: Path) +``` + +Saves the Reader model so that it can be reused at a later point in time. 
+ +**Arguments**: + +- `directory`: Directory where the Reader model should be saved + + + +#### FARMReader.save\_to\_remote + +```python +def save_to_remote(repo_id: str, private: Optional[bool] = None, commit_message: str = "Add new model to Hugging Face.") +``` + +Saves the Reader model to Hugging Face Model Hub with the given model_name. For this to work: + +- Be logged in to Hugging Face on your machine via transformers-cli +- Have git lfs installed (https://packagecloud.io/github/git-lfs/install), you can test it by git lfs --version + +**Arguments**: + +- `repo_id`: A namespace (user or an organization) and a repo name separated by a '/' of the model you want to save to Hugging Face +- `private`: Set to true to make the model repository private +- `commit_message`: Commit message while saving to Hugging Face + + + +#### FARMReader.predict\_batch + +```python +def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) +``` + +Use loaded QA model to find answers for the queries in the Documents. + +- If you provide a list containing a single query... + + - ... and a single list of Documents, the query will be applied to each Document individually. + - ... and a list of lists of Documents, the query will be applied to each list of Documents and the Answers + will be aggregated per Document list. + +- If you provide a list of multiple queries... + + - ... and a single list of Documents, each query will be applied to each Document individually. + - ... and a list of lists of Documents, each query will be applied to its corresponding list of Documents + and the Answers will be aggregated per query-Document pair. + +**Arguments**: + +- `queries`: Single query or list of queries. +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. +Can be a single list of Documents or a list of lists of Documents. 
+- `top_k`: Number of returned answers per query. +- `batch_size`: Number of query-document pairs to be processed at a time. + + + +#### FARMReader.predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) +``` + +Use loaded QA model to find answers for a query in the supplied list of Document. + +Returns dictionaries containing answers sorted by (desc.) score. +Example: + ```python + |{ + | 'query': 'Who is the father of Arya Stark?', + | 'answers':[Answer( + | 'answer': 'Eddard,', + | 'context': "She travels with her father, Eddard, to King's Landing when he is", + | 'score': 0.9787139466668613, + | 'offsets_in_context': [Span(start=29, end=35], + | 'offsets_in_context': [Span(start=347, end=353], + | 'document_id': '88d1ed769d003939d3a0d28034464ab2' + | ),... + | ] + |} + ``` + +**Arguments**: + +- `query`: Query string +- `documents`: List of Document in which to search for the answer +- `top_k`: The maximum number of answers to return + +**Returns**: + +Dict containing query and answers + + + +#### FARMReader.eval\_on\_file + +```python +def eval_on_file(data_dir: Union[Path, str], test_filename: str, device: Optional[Union[str, torch.device]] = None, calibrate_conf_scores: bool = False) +``` + +Performs evaluation on a SQuAD-formatted file. + +Returns a dict containing the following metrics: + - "EM": exact match score + - "f1": F1-Score + - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer + +**Arguments**: + +- `data_dir`: The directory in which the test set can be found +- `test_filename`: The name of the file containing the test data in SQuAD format. +- `device`: The device on which the tensors should be processed. +Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") +or use the Reader's device by default. +- `calibrate_conf_scores`: Whether to calibrate the temperature for scaling of the confidence scores. 
+ + + +#### FARMReader.eval + +```python +def eval(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", calibrate_conf_scores: bool = False) +``` + +Performs evaluation on evaluation documents in the DocumentStore. + +Returns a dict containing the following metrics: + - "EM": Proportion of exact matches of predicted answers with their corresponding correct answers + - "f1": Average overlap between predicted answers and their corresponding correct answers + - "top_n_accuracy": Proportion of predicted answers that overlap with correct answer + +**Arguments**: + +- `document_store`: DocumentStore containing the evaluation documents +- `device`: The device on which the tensors should be processed. +Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") +or use the Reader's device by default. +- `label_index`: Index/Table name where labeled questions are stored +- `doc_index`: Index/Table name where documents that are used for evaluation are stored +- `label_origin`: Field name where the gold labels are stored +- `calibrate_conf_scores`: Whether to calibrate the temperature for scaling of the confidence scores. + + + +#### FARMReader.calibrate\_confidence\_scores + +```python +def calibrate_confidence_scores(document_store: BaseDocumentStore, device: Optional[Union[str, torch.device]] = None, label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold_label") +``` + +Calibrates confidence scores on evaluation documents in the DocumentStore. + +**Arguments**: + +- `document_store`: DocumentStore containing the evaluation documents +- `device`: The device on which the tensors should be processed. +Choose from torch.device("cpu") and torch.device("cuda") (or simply "cpu" or "cuda") +or use the Reader's device by default. 
+- `label_index`: Index/Table name where labeled questions are stored +- `doc_index`: Index/Table name where documents that are used for evaluation are stored +- `label_origin`: Field name where the gold labels are stored + + + +#### FARMReader.predict\_on\_texts + +```python +def predict_on_texts(question: str, texts: List[str], top_k: Optional[int] = None) +``` + +Use loaded QA model to find answers for a question in the supplied list of Document. + +Returns dictionaries containing answers sorted by (desc.) score. +Example: + ```python + |{ + | 'question': 'Who is the father of Arya Stark?', + | 'answers':[ + | {'answer': 'Eddard,', + | 'context': " She travels with her father, Eddard, to King's Landing when he is ", + | 'offset_answer_start': 147, + | 'offset_answer_end': 154, + | 'score': 0.9787139466668613, + | 'document_id': '1337' + | },... + | ] + |} + ``` + +**Arguments**: + +- `question`: Question string +- `documents`: List of documents as string type +- `top_k`: The maximum number of answers to return + +**Returns**: + +Dict containing question and answers + + + +#### FARMReader.convert\_to\_onnx + +```python +@classmethod +def convert_to_onnx(cls, model_name: str, output_path: Path, convert_to_float16: bool = False, quantize: bool = False, task_type: str = "question_answering", opset_version: int = 11) +``` + +Convert a PyTorch BERT model to ONNX format and write to ./onnx-export dir. The converted ONNX model + +can be loaded with in the `FARMReader` using the export path as `model_name_or_path` param. 
+ +Usage: + + `from haystack.reader.farm import FARMReader + from pathlib import Path + onnx_model_path = Path("roberta-onnx-model") + FARMReader.convert_to_onnx(model_name="deepset/bert-base-cased-squad2", output_path=onnx_model_path) + reader = FARMReader(onnx_model_path)` + +**Arguments**: + +- `model_name`: transformers model name +- `output_path`: Path to output the converted model +- `convert_to_float16`: Many models use float32 precision by default. With the half precision of float16, +inference is faster on Nvidia GPUs with Tensor core like T4 or V100. On older GPUs, +float32 could still be be more performant. +- `quantize`: convert floating point number to integers +- `task_type`: Type of task for the model. Available options: "question_answering" or "embeddings". +- `opset_version`: ONNX opset version + + + +# Module transformers + + + +## TransformersReader + +```python +class TransformersReader(BaseReader) +``` + +Transformer based model for extractive Question Answering using the HuggingFace's transformers framework +(https://github.com/huggingface/transformers). +While the underlying model can vary (BERT, Roberta, DistilBERT ...), the interface remains the same. +With this reader, you can directly get predictions via predict() + + + +#### TransformersReader.\_\_init\_\_ + +```python +def __init__(model_name_or_path: str = "distilbert-base-uncased-distilled-squad", model_version: Optional[str] = None, tokenizer: Optional[str] = None, context_window_size: int = 70, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answers: bool = False, max_seq_len: int = 256, doc_stride: int = 128, batch_size: int = 16) +``` + +Load a QA model from Transformers. 
+ +Available models include: + +- ``'distilbert-base-uncased-distilled-squad'`` +- ``'bert-large-cased-whole-word-masking-finetuned-squad'`` +- ``'bert-large-uncased-whole-word-masking-finetuned-squad'`` + +See https://huggingface.co/models for full list of available QA models + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. 'bert-base-cased', +'deepset/bert-base-cased-squad2', 'distilbert-base-uncased-distilled-squad'. +See https://huggingface.co/models for full list of available models. +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `tokenizer`: Name of the tokenizer (usually the same as model) +- `context_window_size`: Num of chars (before and after the answer) to return as "context" for each answer. +The context usually helps users to understand if the answer really makes sense. +- `use_gpu`: Whether to use GPU (if available). +- `top_k`: The maximum number of answers to return +- `top_k_per_candidate`: How many answers to extract for each candidate doc that is coming from the retriever (might be a long text). +Note that this is not the number of "final answers" you will receive +(see `top_k` in TransformersReader.predict() or Finder.get_answers() for that) +and that no_answer can be included in the sorted list of predictions. +- `return_no_answers`: If True, the HuggingFace Transformers model could return a "no_answer" (i.e. when there is an unanswerable question) +If False, it cannot return a "no_answer". Note that `no_answer_boost` is unfortunately not available with TransformersReader. +If you would like to set no_answer_boost, use a `FARMReader`. +- `max_seq_len`: max sequence length of one input text for the model +- `doc_stride`: length of striding window for splitting long texts (used if len(text) > max_seq_len) +- `batch_size`: Number of documents to process at a time.
+ + + +#### TransformersReader.predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) +``` + +Use loaded QA model to find answers for a query in the supplied list of Document. + +Returns dictionaries containing answers sorted by (desc.) score. +Example: + + ```python + |{ + | 'query': 'Who is the father of Arya Stark?', + | 'answers':[ + | {'answer': 'Eddard,', + | 'context': " She travels with her father, Eddard, to King's Landing when he is ", + | 'offset_answer_start': 147, + | 'offset_answer_end': 154, + | 'score': 0.9787139466668613, + | 'document_id': '1337' + | },... + | ] + |} + ``` + +**Arguments**: + +- `query`: Query string +- `documents`: List of Document in which to search for the answer +- `top_k`: The maximum number of answers to return + +**Returns**: + +Dict containing query and answers + + + +#### TransformersReader.predict\_batch + +```python +def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) +``` + +Use loaded QA model to find answers for the queries in the Documents. + +- If you provide a list containing a single query... + + - ... and a single list of Documents, the query will be applied to each Document individually. + - ... and a list of lists of Documents, the query will be applied to each list of Documents and the Answers + will be aggregated per Document list. + +- If you provide a list of multiple queries... + + - ... and a single list of Documents, each query will be applied to each Document individually. + - ... and a list of lists of Documents, each query will be applied to its corresponding list of Documents + and the Answers will be aggregated per query-Document pair. + +**Arguments**: + +- `queries`: Single query or list of queries. +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. 
+Can be a single list of Documents or a list of lists of Documents. +- `top_k`: Number of returned answers per query. +- `batch_size`: Number of query-document pairs to be processed at a time. + + + +# Module table + + + +## TableReader + +```python +class TableReader(BaseReader) +``` + +Transformer-based model for extractive Question Answering on Tables with TaPas +using HuggingFace's transformers framework (https://github.com/huggingface/transformers). +With this reader, you can directly get predictions via predict() + +**Example**: + +```python +from haystack import Document +from haystack.reader import TableReader +import pandas as pd + +table_reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq") +data = { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["57", "46", "60"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], +} +table = pd.DataFrame(data) +document = Document(content=table, content_type="table") +query = "When was DiCaprio born?" +prediction = table_reader.predict(query=query, documents=[document]) +answer = prediction["answers"][0].answer # "10 june 1996" +``` + + + +#### TableReader.\_\_init\_\_ + +```python +def __init__(model_name_or_path: str = "google/tapas-base-finetuned-wtq", model_version: Optional[str] = None, tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, top_k_per_candidate: int = 3, return_no_answer: bool = False, max_seq_len: int = 256) +``` + +Load a TableQA model from Transformers. + +Available models include: + +- ``'google/tapas-base-finetuned-wtq'`` +- ``'google/tapas-base-finetuned-wikisql-supervised'`` +- ``'deepset/tapas-large-nq-hn-reader'`` +- ``'deepset/tapas-large-nq-reader'`` + +See https://huggingface.co/models?pipeline_tag=table-question-answering +for full list of available TableQA models.
+ +The nq-reader models are able to provide confidence scores, but cannot handle questions that need aggregation +over multiple cells. The returned answers are sorted first by a general table score and then by answer span +scores. +All the other models can handle aggregation questions, but don't provide reasonable confidence scores. + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. +See https://huggingface.co/models?pipeline_tag=table-question-answering for full list of available models. +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, +or commit hash. +- `tokenizer`: Name of the tokenizer (usually the same as model) +- `use_gpu`: Whether to use GPU or CPU. Falls back on CPU if no GPU is available. +- `top_k`: The maximum number of answers to return +- `top_k_per_candidate`: How many answers to extract for each candidate table that is coming from +the retriever. +- `return_no_answer`: Whether to include no_answer predictions in the results. +(Only applicable with nq-reader models.) +- `max_seq_len`: Max sequence length of one input table for the model. If the number of tokens of +query + table exceed max_seq_len, the table will be truncated by removing rows until the +input size fits the model. + + + +#### TableReader.predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict +``` + +Use loaded TableQA model to find answers for a query in the supplied list of Documents + +of content_type ``'table'``. + +Returns dictionary containing query and list of Answer objects sorted by (desc.) score. +WARNING: The answer scores are not reliable, as they are always extremely high, even if + a question cannot be answered by a given table. + +**Arguments**: + +- `query`: Query string +- `documents`: List of Document in which to search for the answer. Documents should be +of content_type ``'table'``. 
+- `top_k`: The maximum number of answers to return + +**Returns**: + +Dict containing query and answers + + + +#### TableReader.predict\_batch + +```python +def predict_batch(queries: List[str], documents: Union[List[Document], List[List[Document]]], top_k: Optional[int] = None, batch_size: Optional[int] = None) +``` + +Use loaded TableQA model to find answers for the supplied queries in the supplied Documents + +of content_type ``'table'``. + +Returns dictionary containing query and list of Answer objects sorted by (desc.) score. + +WARNING: The answer scores are not reliable, as they are always extremely high, even if +a question cannot be answered by a given table. + +- If you provide a list containing a single query... + + - ... and a single list of Documents, the query will be applied to each Document individually. + - ... and a list of lists of Documents, the query will be applied to each list of Documents and the Answers + will be aggregated per Document list. + +- If you provide a list of multiple queries... + + - ... and a single list of Documents, each query will be applied to each Document individually. + - ... and a list of lists of Documents, each query will be applied to its corresponding list of Documents + and the Answers will be aggregated per query-Document pair. + +**Arguments**: + +- `queries`: Single query string or list of queries. +- `documents`: Single list of Documents or list of lists of Documents in which to search for the answers. +Documents should be of content_type ``'table'``. +- `top_k`: The maximum number of answers to return per query. +- `batch_size`: Not applicable. + + + +## RCIReader + +```python +class RCIReader(BaseReader) +``` + +Table Reader model based on Glass et al. (2021)'s Row-Column-Intersection model. +See the original paper for more details: +Glass, Michael, et al. 
(2021): "Capturing Row and Column Semantics in Transformer Based Question Answering over Tables" +(https://aclanthology.org/2021.naacl-main.96/) + +Each row and each column is given a score with regard to the query by two separate models. The score of each cell +is then calculated as the sum of the corresponding row score and column score. Accordingly, the predicted answer is +the cell with the highest score. + +Pros and Cons of RCIReader compared to TableReader: ++ Provides meaningful confidence scores ++ Allows larger tables as input +- Does not support aggregation over table cells +- Slower + + + +#### RCIReader.\_\_init\_\_ + +```python +def __init__(row_model_name_or_path: str = "michaelrglass/albert-base-rci-wikisql-row", column_model_name_or_path: str = "michaelrglass/albert-base-rci-wikisql-col", row_model_version: Optional[str] = None, column_model_version: Optional[str] = None, row_tokenizer: Optional[str] = None, column_tokenizer: Optional[str] = None, use_gpu: bool = True, top_k: int = 10, max_seq_len: int = 256) +``` + +Load an RCI model from Transformers. + +Available models include: + +- ``'michaelrglass/albert-base-rci-wikisql-row'`` + ``'michaelrglass/albert-base-rci-wikisql-col'`` +- ``'michaelrglass/albert-base-rci-wtq-row'`` + ``'michaelrglass/albert-base-rci-wtq-col'`` + +**Arguments**: + +- `row_model_name_or_path`: Directory of a saved row scoring model or the name of a public model +- `column_model_name_or_path`: Directory of a saved column scoring model or the name of a public model +- `row_model_version`: The version of row model to use from the HuggingFace model hub. +Can be tag name, branch name, or commit hash. +- `column_model_version`: The version of column model to use from the HuggingFace model hub. +Can be tag name, branch name, or commit hash. 
+- `row_tokenizer`: Name of the tokenizer for the row model (usually the same as model) +- `column_tokenizer`: Name of the tokenizer for the column model (usually the same as model) +- `use_gpu`: Whether to use GPU or CPU. Falls back on CPU if no GPU is available. +- `top_k`: The maximum number of answers to return +- `max_seq_len`: Max sequence length of one input table for the model. If the number of tokens of +query + table exceed max_seq_len, the table will be truncated by removing rows until the +input size fits the model. + + + +#### RCIReader.predict + +```python +def predict(query: str, documents: List[Document], top_k: Optional[int] = None) -> Dict +``` + +Use loaded RCI models to find answers for a query in the supplied list of Documents + +of content_type ``'table'``. + +Returns dictionary containing query and list of Answer objects sorted by (desc.) score. +The existing RCI models on the HF model hub don't allow aggregation, therefore, the answer will always be +composed of a single cell. + +**Arguments**: + +- `query`: Query string +- `documents`: List of Document in which to search for the answer. Documents should be +of content_type ``'table'``. +- `top_k`: The maximum number of answers to return + +**Returns**: + +Dict containing query and answers + diff --git a/docs/v1.7.0/_src/api/api/retriever.md b/docs/v1.7.0/_src/api/api/retriever.md new file mode 100644 index 0000000000..715c2e2156 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/retriever.md @@ -0,0 +1,1797 @@ + + +# Module base + + + +## BaseGraphRetriever + +```python +class BaseGraphRetriever(BaseComponent) +``` + +Base class for knowledge graph retrievers. + + + +## BaseRetriever + +```python +class BaseRetriever(BaseComponent) +``` + +Base class for regular retrievers.
+ + + +#### BaseRetriever.retrieve + +```python +@abstractmethod +def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query. + +**Arguments**: + +- `query`: The query +- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### BaseRetriever.timing + +```python +def timing(fn, attr_name) +``` + +Wrapper method used to time functions. + + + +#### BaseRetriever.eval + +```python +def eval(label_index: str = "label", doc_index: str = "eval_document", label_origin: str = "gold-label", top_k: int = 10, open_domain: bool = False, return_preds: bool = False, headers: Optional[Dict[str, str]] = None) -> dict +``` + +Performs evaluation on the Retriever. + +Retriever is evaluated based on whether it finds the correct document given the query string and at which +position in the ranking of documents the correct document is. 
+ +| Returns a dict containing the following metrics: + + - "recall": Proportion of questions for which correct document is among retrieved documents + - "mrr": Mean of reciprocal rank. Rewards retrievers that give relevant documents a higher rank. + Only considers the highest ranked relevant document. + - "map": Mean of average precision for each question. Rewards retrievers that give relevant + documents a higher rank. Considers all retrieved relevant documents. If ``open_domain=True``, + average precision is normalized by the number of retrieved relevant documents per query. + If ``open_domain=False``, average precision is normalized by the number of all relevant documents + per query. + +**Arguments**: + +- `label_index`: Index/Table in DocumentStore where labeled questions are stored +- `doc_index`: Index/Table in DocumentStore where documents that are used for evaluation are stored +- `top_k`: How many documents to return per query +- `open_domain`: If ``True``, retrieval will be evaluated by checking if the answer string to a question is +contained in the retrieved docs (common approach in open-domain QA). +If ``False``, retrieval uses a stricter evaluation that checks if the retrieved document ids +are within ids explicitly stated in the labels. +- `return_preds`: Whether to add predictions in the returned dictionary. If True, the returned dictionary +contains the keys "predictions" and "metrics". +- `headers`: Custom HTTP headers to pass to document store client if supported (e.g. 
{'Authorization': 'Basic YWRtaW46cm9vdA=='} for basic authentication) + + + +# Module sparse + + + +## BM25Retriever + +```python +class BM25Retriever(BaseRetriever) +``` + + + +#### BM25Retriever.\_\_init\_\_ + +```python +def __init__(document_store: KeywordDocumentStore, top_k: int = 10, all_terms_must_match: bool = False, custom_query: Optional[str] = None, scale_score: bool = True) +``` + +**Arguments**: + +- `document_store`: an instance of one of the following DocumentStores to retrieve from: ElasticsearchDocumentStore, OpenSearchDocumentStore and OpenDistroElasticsearchDocumentStore +- `all_terms_must_match`: Whether all terms of the query must match the document. +If true all query terms must be present in a document in order to be retrieved (i.e the AND operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy AND fish AND restaurant"). +Otherwise at least one query term must be present in a document in order to be retrieved (i.e the OR operator is being used implicitly between query terms: "cozy fish restaurant" -> "cozy OR fish OR restaurant"). +Defaults to False. +- `custom_query`: query string as per Elasticsearch DSL with a mandatory query placeholder(query). + Optionally, ES `filter` clause can be added where the values of `terms` are placeholders + that get substituted during runtime. The placeholder(${filter_name_1}, ${filter_name_2}..) + names must match with the filters dict supplied in self.retrieve(). 
+ :: + + **An example custom_query:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | "filter": [ // optional custom filters + | {"terms": {"year": ${years}}}, + | {"terms": {"quarter": ${quarters}}}, + | {"range": {"date": {"gte": ${date}}}} + | ], + | } + | }, + | } + ``` + + **For this custom_query, a sample retrieve() could be:** + ```python +| self.retrieve(query="Why did the revenue increase?", +| filters={"years": ["2019"], "quarters": ["Q1", "Q2"]}) +``` + + Optionally, highlighting can be defined by specifying Elasticsearch's highlight settings. + See https://www.elastic.co/guide/en/elasticsearch/reference/current/highlighting.html. + You will find the highlighted output in the returned Document's meta field by key "highlighted". + :: + + **Example custom_query with highlighting:** + ```python + | { + | "size": 10, + | "query": { + | "bool": { + | "should": [{"multi_match": { + | "query": ${query}, // mandatory query placeholder + | "type": "most_fields", + | "fields": ["content", "title"]}}], + | } + | }, + | "highlight": { // enable highlighting + | "fields": { // for fields content and title + | "content": {}, + | "title": {} + | } + | }, + | } + ``` + + **For this custom_query, highlighting info can be accessed by:** + ```python + | docs = self.retrieve(query="Why did the revenue increase?") + | highlighted_content = docs[0].meta["highlighted"]["content"] + | highlighted_title = docs[0].meta["highlighted"]["title"] + ``` +- `top_k`: How many documents to return per query. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. 
cosine or dot_product) will be used. + + + +#### BM25Retriever.retrieve + +```python +def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query. + +**Arguments**: + +- `query`: The query +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
+ + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### BM25Retriever.retrieve\_batch + +```python +def retrieve_batch(queries: List[str], filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the supplied queries. + +Returns a list of lists of Documents (one per query). + +**Arguments**: + +- `queries`: List of query strings. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. 
+Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. +- `batch_size`: Not applicable. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true similarity scores (e.g. 
cosine or dot_product) which naturally have a different +value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +## FilterRetriever + +```python +class FilterRetriever(BM25Retriever) +``` + +Naive "Retriever" that returns all documents that match the given filters. No impact of query at all. +Helpful for benchmarking, testing and if you want to do QA on small documents without an "active" retriever. + + + +#### FilterRetriever.retrieve + +```python +def retrieve(query: str, filters: dict = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query. + +**Arguments**: + +- `query`: Has no effect, can pass in empty string +- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `top_k`: Has no effect, pass in any int or None +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `headers`: Custom HTTP headers to pass to elasticsearch client (e.g. {'Authorization': 'Basic YWRtaW46cm9vdA=='}) +Check out https://www.elastic.co/guide/en/elasticsearch/reference/current/http-clients.html for more information. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +## TfidfRetriever + +```python +class TfidfRetriever(BaseRetriever) +``` + +Read all documents from a SQL backend. 
+ +Split documents into smaller units (eg, paragraphs or pages) to reduce the +computations when text is passed on to a Reader for QA. + +It uses sklearn's TfidfVectorizer to compute a tf-idf matrix. + + + +#### TfidfRetriever.\_\_init\_\_ + +```python +def __init__(document_store: BaseDocumentStore, top_k: int = 10, auto_fit=True) +``` + +**Arguments**: + +- `document_store`: an instance of a DocumentStore to retrieve documents from. +- `top_k`: How many documents to return per query. +- `auto_fit`: Whether to automatically update tf-idf matrix by calling fit() after new documents have been added + + + +#### TfidfRetriever.retrieve + +```python +def retrieve(query: str, filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query. + +**Arguments**: + +- `query`: The query +- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. 
+ + + +#### TfidfRetriever.retrieve\_batch + +```python +def retrieve_batch(queries: List[str], filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the supplied queries. + +Returns a list of lists of Documents (one per query). + +**Arguments**: + +- `queries`: Single query string or list of queries. +- `filters`: A dictionary where the keys specify a metadata field and the value is a list of accepted values for that field +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `batch_size`: Not applicable. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true similarity scores (e.g. cosine or dot_product) which naturally have a different +value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### TfidfRetriever.fit + +```python +def fit() +``` + +Performing training on this class according to the TF-IDF algorithm. + + + +# Module dense + + + +## DensePassageRetriever + +```python +class DensePassageRetriever(BaseRetriever) +``` + +Retriever that uses a bi-encoder (one transformer for query, one transformer for passage). +See the original paper for more details: +Karpukhin, Vladimir, et al. (2020): "Dense Passage Retrieval for Open-Domain Question Answering." +(https://arxiv.org/abs/2004.04906). 
+ + + +#### DensePassageRetriever.\_\_init\_\_ + +```python +def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "facebook/dpr-question_encoder-single-nq-base", passage_embedding_model: Union[Path, str] = "facebook/dpr-ctx_encoder-single-nq-base", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True) +``` + +Init the Retriever incl. the two encoder models from a local or remote model checkpoint. + +The checkpoint format matches huggingface transformers' model format + +**Example:** + + ```python + | # remote model from FAIR + | DensePassageRetriever(document_store=your_doc_store, + | query_embedding_model="facebook/dpr-question_encoder-single-nq-base", + | passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base") + | # or from local path + | DensePassageRetriever(document_store=your_doc_store, + | query_embedding_model="model_directory/question-encoder", + | passage_embedding_model="model_directory/context-encoder") + ``` + +**Arguments**: + +- `document_store`: An instance of DocumentStore from which to retrieve documents. +- `query_embedding_model`: Local path or remote name of question encoder checkpoint. The format equals the +one used by hugging-face transformers' modelhub models +Currently available remote names: ``"facebook/dpr-question_encoder-single-nq-base"`` +- `passage_embedding_model`: Local path or remote name of passage encoder checkpoint. 
The format equals the +one used by hugging-face transformers' modelhub models +Currently available remote names: ``"facebook/dpr-ctx_encoder-single-nq-base"`` +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `max_seq_len_query`: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down." +- `max_seq_len_passage`: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down." +- `top_k`: How many documents to return per query. +- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. +- `batch_size`: Number of questions or passages to encode at once. In case of multiple gpus, this will be the total batch size. +- `embed_title`: Whether to concatenate title and passage to a text pair that is then used to create the embedding. +This is the approach used in the original paper and is likely to improve performance if your +titles contain meaningful information for retrieval (topic, entities etc.) . +The title is expected to be present in doc.meta["name"] and can be supplied in the documents +before writing them to the DocumentStore like this: +{"text": "my text", "meta": {"name": "my title"}}. +- `use_fast_tokenizers`: Whether to use fast Rust tokenizers +- `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training. +Options: `dot_product` (Default) or `cosine` +- `global_loss_buffer_size`: Buffer size for all_gather() in DDP. +Increase if errors like "encoded data exceeds max_size ..." come up +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. 
+- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +These strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for DPR, training +will only use the first device provided in this list. +- `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`, +the local token will be used, which must be previously created via `transformer-cli login`. +Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### DensePassageRetriever.retrieve + +```python +def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query. + +**Arguments**: + +- `query`: The query +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. 
+Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. 
+ + + +#### DensePassageRetriever.retrieve\_batch + +```python +def retrieve_batch(queries: List[str], filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the supplied queries. + +Returns a list of lists of Documents (one per query). + +**Arguments**: + +- `queries`: List of query strings. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. Can be a single filter that will be applied to each query or a list of filters +(one filter per query). + +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `batch_size`: Number of queries to embed at a time. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true similarity scores (e.g. cosine or dot_product) which naturally have a different +value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. 
+ + + +#### DensePassageRetriever.embed\_queries + +```python +def embed_queries(texts: List[str]) -> List[np.ndarray] +``` + +Create embeddings for a list of queries using the query encoder + +**Arguments**: + +- `texts`: Queries to embed + +**Returns**: + +Embeddings, one per input queries + + + +#### DensePassageRetriever.embed\_documents + +```python +def embed_documents(docs: List[Document]) -> List[np.ndarray] +``` + +Create embeddings for a list of documents using the passage encoder + +**Arguments**: + +- `docs`: List of Document objects used to represent documents / passages in a standardized way within Haystack. + +**Returns**: + +Embeddings of documents / passages shape (batch_size, embedding_dim) + + + +#### DensePassageRetriever.train + +```python +def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, multiprocessing_strategy: Optional[str] = None, dev_split: float = 0, batch_size: int = 2, embed_title: bool = True, num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/dpr", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3) +``` + +train a DensePassageRetrieval model + +**Arguments**: + +- `data_dir`: Directory where training file, dev file and test file are present +- `train_filename`: training filename +- `dev_filename`: development set filename, file to be used by model in eval step of training +- `test_filename`: test set filename, file to be used by model in test step 
+ after training +- `max_samples`: maximum number of input samples to convert. Can be used for debugging a smaller dataset. +- `max_processes`: the maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo. +It can be set to 1 to disable the use of multiprocessing or make debugging easier. +- `multiprocessing_strategy`: Set the multiprocessing sharing strategy, this can be one of file_descriptor/file_system depending on your OS. +If your system has low limits for the number of open file descriptors, and you can’t raise them, +you should use the file_system strategy. +- `dev_split`: The proportion of the train set that will be sliced. Only works if dev_filename is set to None +- `batch_size`: total number of samples in 1 batch of data +- `embed_title`: whether to concatenate passage title with each passage. The default setting in official DPR embeds passage title with the corresponding passage +- `num_hard_negatives`: number of hard negative passages (passages which are very similar (high score by BM25) to the query but do not contain the answer) +- `num_positives`: number of positive passages +- `n_epochs`: number of epochs to train the model on +- `evaluate_every`: number of training steps after which evaluation is run +- `n_gpu`: number of gpus to train on +- `learning_rate`: learning rate of optimizer +- `epsilon`: epsilon parameter of optimizer +- `weight_decay`: weight decay parameter of optimizer +- `grad_acc_steps`: number of steps to accumulate gradient over before back-propagation is done +- `use_amp`: Whether to use automatic mixed precision (AMP) or not. The options are: +"O0" (FP32) +"O1" (Mixed Precision) +"O2" (Almost FP16) +"O3" (Pure FP16). 
+For more information, refer to: https://nvidia.github.io/apex/amp.html +- `optimizer_name`: what optimizer to use (default: AdamW) +- `num_warmup_steps`: number of warmup steps +- `optimizer_correct_bias`: Whether to correct bias in optimizer +- `save_dir`: directory where models are saved +- `query_encoder_save_dir`: directory inside save_dir where query_encoder model files are saved +- `passage_encoder_save_dir`: directory inside save_dir where passage_encoder model files are saved +Checkpoints can be stored via setting `checkpoint_every` to a custom number of steps. +If any checkpoints are stored, a subsequent run of train() will resume training from the latest available checkpoint. + + + +#### DensePassageRetriever.save + +```python +def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder") +``` + +Save DensePassageRetriever to the specified directory. + +**Arguments**: + +- `save_dir`: Directory to save to. +- `query_encoder_dir`: Directory in save_dir that contains query encoder model. +- `passage_encoder_dir`: Directory in save_dir that contains passage encoder model. + +**Returns**: + +None + + + +#### DensePassageRetriever.load + +```python +@classmethod +def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_title: bool = True, use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder") +``` + +Load DensePassageRetriever from the specified directory. + + + +## TableTextRetriever + +```python +class TableTextRetriever(BaseRetriever) +``` + +Retriever that uses a tri-encoder to jointly retrieve among a database consisting of text passages and tables +(one transformer for query, one transformer for text passages, one transformer for tables). 
+See the original paper for more details: +Kostić, Bogdan, et al. (2021): "Multi-modal Retrieval of Tables and Texts Using Tri-encoder Models" +(https://arxiv.org/abs/2108.04049). + + + +#### TableTextRetriever.\_\_init\_\_ + +```python +def __init__(document_store: BaseDocumentStore, query_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-question_encoder", passage_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-passage_encoder", table_embedding_model: Union[Path, str] = "deepset/bert-small-mm_retrieval-table_encoder", model_version: Optional[str] = None, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, top_k: int = 10, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", global_loss_buffer_size: int = 150000, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, use_fast: bool = True) +``` + +Init the Retriever incl. the three encoder models from a local or remote model checkpoint. + +The checkpoint format matches huggingface transformers' model format + +**Arguments**: + +- `document_store`: An instance of DocumentStore from which to retrieve documents. +- `query_embedding_model`: Local path or remote name of question encoder checkpoint. The format equals the +one used by hugging-face transformers' modelhub models. +- `passage_embedding_model`: Local path or remote name of passage encoder checkpoint. The format equals the +one used by hugging-face transformers' modelhub models. +- `table_embedding_model`: Local path or remote name of table encoder checkpoint. The format equals the +one used by hugging-face transformers' modelhub models. +- `model_version`: The version of model to use from the HuggingFace model hub. 
Can be tag name, branch name, or commit hash. +- `max_seq_len_query`: Longest length of each query sequence. Maximum number of tokens for the query text. Longer ones will be cut down." +- `max_seq_len_passage`: Longest length of each passage/context sequence. Maximum number of tokens for the passage text. Longer ones will be cut down." +- `top_k`: How many documents to return per query. +- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. +- `batch_size`: Number of questions or passages to encode at once. In case of multiple gpus, this will be the total batch size. +- `embed_meta_fields`: Concatenate the provided meta fields and text passage / table to a text pair that is +then used to create the embedding. +This is the approach used in the original paper and is likely to improve +performance if your titles contain meaningful information for retrieval +(topic, entities etc.). +- `use_fast_tokenizers`: Whether to use fast Rust tokenizers +- `similarity_function`: Which function to apply for calculating the similarity of query and passage embeddings during training. +Options: `dot_product` (Default) or `cosine` +- `global_loss_buffer_size`: Buffer size for all_gather() in DDP. +Increase if errors like "encoded data exceeds max_size ..." come up +- `progress_bar`: Whether to show a tqdm progress bar or not. +Can be helpful to disable in production deployments to keep the logs clean. +- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +These strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). Note: as multi-GPU training is currently not implemented for TableTextRetriever, +training will only use the first device provided in this list. 
+- `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`, +the local token will be used, which must be previously created via `transformer-cli login`. +Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. +- `use_fast`: Whether to use the fast version of DPR tokenizers or fallback to the standard version. Defaults to True. + + + +#### TableTextRetriever.retrieve\_batch + +```python +def retrieve_batch(queries: List[str], filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the supplied queries. + +Returns a list of lists of Documents (one per query). + +**Arguments**: + +- `queries`: List of query strings. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. Can be a single filter that will be applied to each query or a list of filters +(one filter per query). + +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. 
+Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `batch_size`: Number of queries to embed at a time. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true similarity scores (e.g. cosine or dot_product) which naturally have a different +value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. 
+ + + +#### TableTextRetriever.embed\_queries + +```python +def embed_queries(texts: List[str]) -> List[np.ndarray] +``` + +Create embeddings for a list of queries using the query encoder + +**Arguments**: + +- `texts`: Queries to embed + +**Returns**: + +Embeddings, one per input queries + + + +#### TableTextRetriever.embed\_documents + +```python +def embed_documents(docs: List[Document]) -> List[np.ndarray] +``` + +Create embeddings for a list of text documents and / or tables using the text passage encoder and + +the table encoder. + +**Arguments**: + +- `docs`: List of Document objects used to represent documents / passages in +a standardized way within Haystack. + +**Returns**: + +Embeddings of documents / passages. Shape: (batch_size, embedding_dim) + + + +#### TableTextRetriever.train + +```python +def train(data_dir: str, train_filename: str, dev_filename: str = None, test_filename: str = None, max_samples: int = None, max_processes: int = 128, dev_split: float = 0, batch_size: int = 2, embed_meta_fields: List[str] = ["page_title", "section_title", "caption"], num_hard_negatives: int = 1, num_positives: int = 1, n_epochs: int = 3, evaluate_every: int = 1000, n_gpu: int = 1, learning_rate: float = 1e-5, epsilon: float = 1e-08, weight_decay: float = 0.0, num_warmup_steps: int = 100, grad_acc_steps: int = 1, use_amp: str = None, optimizer_name: str = "AdamW", optimizer_correct_bias: bool = True, save_dir: str = "../saved_models/mm_retrieval", query_encoder_save_dir: str = "query_encoder", passage_encoder_save_dir: str = "passage_encoder", table_encoder_save_dir: str = "table_encoder", checkpoint_root_dir: Path = Path("model_checkpoints"), checkpoint_every: Optional[int] = None, checkpoints_to_keep: int = 3) +``` + +Train a TableTextRetrieval model. + +**Arguments**: + +- `data_dir`: Directory where training file, dev file and test file are present. +- `train_filename`: Training filename. 
+- `dev_filename`: Development set filename, file to be used by model in eval step of training. +- `test_filename`: Test set filename, file to be used by model in test step after training. +- `max_samples`: Maximum number of input samples to convert. Can be used for debugging a smaller dataset. +- `max_processes`: The maximum number of processes to spawn in the multiprocessing.Pool used in DataSilo. +It can be set to 1 to disable the use of multiprocessing or make debugging easier. +- `dev_split`: The proportion of the train set that will be sliced. Only works if dev_filename is set to None. +- `batch_size`: Total number of samples in 1 batch of data. +- `embed_meta_fields`: Concatenate meta fields with each passage and table. +The default setting in official MMRetrieval embeds page title, +section title and caption with the corresponding table and title with +corresponding text passage. +- `num_hard_negatives`: Number of hard negative passages (passages which are +very similar (high score by BM25) to the query but do not contain the answer). +- `num_positives`: Number of positive passages. +- `n_epochs`: Number of epochs to train the model on. +- `evaluate_every`: Number of training steps after which evaluation is run. +- `n_gpu`: Number of gpus to train on. +- `learning_rate`: Learning rate of optimizer. +- `epsilon`: Epsilon parameter of optimizer. +- `weight_decay`: Weight decay parameter of optimizer. +- `grad_acc_steps`: Number of steps to accumulate gradient over before back-propagation is done. +- `use_amp`: Whether to use automatic mixed precision (AMP) or not. The options are: +"O0" (FP32) +"O1" (Mixed Precision) +"O2" (Almost FP16) +"O3" (Pure FP16). +For more information, refer to: https://nvidia.github.io/apex/amp.html +- `optimizer_name`: What optimizer to use (default: AdamW). +- `num_warmup_steps`: Number of warmup steps. +- `optimizer_correct_bias`: Whether to correct bias in optimizer. +- `save_dir`: Directory where models are saved. 
+- `query_encoder_save_dir`: Directory inside save_dir where query_encoder model files are saved. +- `passage_encoder_save_dir`: Directory inside save_dir where passage_encoder model files are saved. +- `table_encoder_save_dir`: Directory inside save_dir where table_encoder model files are saved. + + + +#### TableTextRetriever.save + +```python +def save(save_dir: Union[Path, str], query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder") +``` + +Save TableTextRetriever to the specified directory. + +**Arguments**: + +- `save_dir`: Directory to save to. +- `query_encoder_dir`: Directory in save_dir that contains query encoder model. +- `passage_encoder_dir`: Directory in save_dir that contains passage encoder model. +- `table_encoder_dir`: Directory in save_dir that contains table encoder model. + +**Returns**: + +None + + + +#### TableTextRetriever.load + +```python +@classmethod +def load(cls, load_dir: Union[Path, str], document_store: BaseDocumentStore, max_seq_len_query: int = 64, max_seq_len_passage: int = 256, max_seq_len_table: int = 256, use_gpu: bool = True, batch_size: int = 16, embed_meta_fields: List[str] = ["name", "section_title", "caption"], use_fast_tokenizers: bool = True, similarity_function: str = "dot_product", query_encoder_dir: str = "query_encoder", passage_encoder_dir: str = "passage_encoder", table_encoder_dir: str = "table_encoder") +``` + +Load TableTextRetriever from the specified directory. 
+ + + +## EmbeddingRetriever + +```python +class EmbeddingRetriever(BaseRetriever) +``` + + + +#### EmbeddingRetriever.\_\_init\_\_ + +```python +def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: Optional[str] = None, pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, embed_meta_fields: List[str] = []) +``` + +**Arguments**: + +- `document_store`: An instance of DocumentStore from which to retrieve documents. +- `embedding_model`: Local path or name of model in Hugging Face's model hub such as ``'sentence-transformers/all-MiniLM-L6-v2'`` +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. +- `batch_size`: Number of documents to encode at once. +- `max_seq_len`: Longest length of each document sequence. Maximum number of tokens for the document text. Longer ones will be cut down. +- `model_format`: Name of framework that was used for saving the model or model type. If no model_format is +provided, it will be inferred automatically from the model configuration files. +Options: + +- ``'farm'`` (will use `_DefaultEmbeddingEncoder` as embedding encoder) +- ``'transformers'`` (will use `_DefaultEmbeddingEncoder` as embedding encoder) +- ``'sentence_transformers'`` (will use `_SentenceTransformersEmbeddingEncoder` as embedding encoder) +- ``'retribert'`` (will use `_RetribertEmbeddingEncoder` as embedding encoder) +- `pooling_strategy`: Strategy for combining the embeddings from the model (for farm / transformers models only). 
+Options: + +- ``'cls_token'`` (sentence vector) +- ``'reduce_mean'`` (sentence vector) +- ``'reduce_max'`` (sentence vector) +- ``'per_token'`` (individual token vectors) +- `emb_extraction_layer`: Number of layer from which the embeddings shall be extracted (for farm / transformers models only). +Default: -1 (very last layer). +- `top_k`: How many documents to return per query. +- `progress_bar`: If true displays progress bar during embedding. +- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +These strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, +training will only use the first device provided in this list. +- `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`, +the local token will be used, which must be previously created via `transformer-cli login`. +Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. +- `embed_meta_fields`: Concatenate the provided meta fields and text passage / table to a text pair that is +then used to create the embedding. +This approach is also used in the TableTextRetriever paper and is likely to improve +performance if your titles contain meaningful information for retrieval +(topic, entities etc.). 
+ + + +#### EmbeddingRetriever.retrieve + +```python +def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query. + +**Arguments**: + +- `query`: The query +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
+ + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### EmbeddingRetriever.retrieve\_batch + +```python +def retrieve_batch(queries: List[str], filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the supplied queries. + +Returns a list of lists of Documents (one per query). + +**Arguments**: + +- `queries`: List of query strings. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. Can be a single filter that will be applied to each query or a list of filters +(one filter per query). + +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. 
Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `batch_size`: Number of queries to embed at a time. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true similarity scores (e.g. cosine or dot_product) which naturally have a different +value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### EmbeddingRetriever.embed\_queries + +```python +def embed_queries(texts: List[str]) -> List[np.ndarray] +``` + +Create embeddings for a list of queries. 
+
+**Arguments**:
+
+- `texts`: Queries to embed
+
+**Returns**:
+
+Embeddings, one per input query
+
+
+
+#### EmbeddingRetriever.embed\_documents
+
+```python
+def embed_documents(docs: List[Document]) -> List[np.ndarray]
+```
+
+Create embeddings for a list of documents.
+
+**Arguments**:
+
+- `docs`: List of documents to embed
+
+**Returns**:
+
+Embeddings, one per input document
+
+
+
+#### EmbeddingRetriever.train
+
+```python
+def train(training_data: List[Dict[str, Any]], learning_rate: float = 2e-5, n_epochs: int = 1, num_warmup_steps: int = None, batch_size: int = 16) -> None
+```
+
+Trains/adapts the underlying embedding model.
+
+Each training data example is a dictionary with the following keys:
+
+* question: the question string
+* pos_doc: the positive document string
+* neg_doc: the negative document string
+* score: the score margin
+
+**Arguments**:
+
+- `training_data` (`List[Dict[str, Any]]`): The training data
+- `learning_rate` (`float`): The learning rate
+- `n_epochs` (`int`): The number of epochs
+- `num_warmup_steps` (`int`): The number of warmup steps
+- `batch_size` (`int (optional)`): The batch size to use for the training, defaults to 16
+
+
+
+#### EmbeddingRetriever.save
+
+```python
+def save(save_dir: Union[Path, str]) -> None
+```
+
+Save the model to the given directory
+
+**Arguments**:
+
+- `save_dir` (`Union[Path, str]`): The directory where the model will be saved
+
+
+
+## MultihopEmbeddingRetriever
+
+```python
+class MultihopEmbeddingRetriever(EmbeddingRetriever)
+```
+
+Retriever that applies iterative retrieval using a shared encoder for query and passage.
+See original paper for more details:
+
+Xiong, Wenhan, et. al.
(2020): "Answering complex open-domain questions with multi-hop dense retrieval" +(https://arxiv.org/abs/2009.12756) + + + +#### MultihopEmbeddingRetriever.\_\_init\_\_ + +```python +def __init__(document_store: BaseDocumentStore, embedding_model: str, model_version: Optional[str] = None, num_iterations: int = 2, use_gpu: bool = True, batch_size: int = 32, max_seq_len: int = 512, model_format: str = "farm", pooling_strategy: str = "reduce_mean", emb_extraction_layer: int = -1, top_k: int = 10, progress_bar: bool = True, devices: Optional[List[Union[str, torch.device]]] = None, use_auth_token: Optional[Union[str, bool]] = None, scale_score: bool = True, embed_meta_fields: List[str] = []) +``` + +**Arguments**: + +- `document_store`: An instance of DocumentStore from which to retrieve documents. +- `embedding_model`: Local path or name of model in Hugging Face's model hub such as ``'sentence-transformers/all-MiniLM-L6-v2'`` +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `num_iterations`: The number of times passages are retrieved, i.e., the number of hops (Defaults to 2.) +- `use_gpu`: Whether to use all available GPUs or the CPU. Falls back on CPU if no GPU is available. +- `batch_size`: Number of documents to encode at once. +- `max_seq_len`: Longest length of each document sequence. Maximum number of tokens for the document text. Longer ones will be cut down. +- `model_format`: Name of framework that was used for saving the model or model type. If no model_format is +provided, it will be inferred automatically from the model configuration files. 
+Options: + +- ``'farm'`` (will use `_DefaultEmbeddingEncoder` as embedding encoder) +- ``'transformers'`` (will use `_DefaultEmbeddingEncoder` as embedding encoder) +- ``'sentence_transformers'`` (will use `_SentenceTransformersEmbeddingEncoder` as embedding encoder) +- ``'retribert'`` (will use `_RetribertEmbeddingEncoder` as embedding encoder) +- `pooling_strategy`: Strategy for combining the embeddings from the model (for farm / transformers models only). +Options: + +- ``'cls_token'`` (sentence vector) +- ``'reduce_mean'`` (sentence vector) +- ``'reduce_max'`` (sentence vector) +- ``'per_token'`` (individual token vectors) +- `emb_extraction_layer`: Number of layer from which the embeddings shall be extracted (for farm / transformers models only). +Default: -1 (very last layer). +- `top_k`: How many documents to return per query. +- `progress_bar`: If true displays progress bar during embedding. +- `devices`: List of GPU (or CPU) devices, to limit inference to certain GPUs and not use all available ones +These strings will be converted into pytorch devices, so use the string notation described here: +https://pytorch.org/docs/stable/tensor_attributes.html?highlight=torch%20device#torch.torch.device +(e.g. ["cuda:0"]). Note: As multi-GPU training is currently not implemented for EmbeddingRetriever, +training will only use the first device provided in this list. +- `use_auth_token`: API token used to download private models from Huggingface. If this parameter is set to `True`, +the local token will be used, which must be previously created via `transformer-cli login`. +Additional information can be found here https://huggingface.co/transformers/main_classes/model.html#transformers.PreTrainedModel.from_pretrained +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true (default) similarity scores (e.g. 
cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. +- `embed_meta_fields`: Concatenate the provided meta fields and text passage / table to a text pair that is +then used to create the embedding. +This approach is also used in the TableTextRetriever paper and is likely to improve +performance if your titles contain meaningful information for retrieval +(topic, entities etc.). + + + +#### MultihopEmbeddingRetriever.retrieve + +```python +def retrieve(query: str, filters: Optional[Dict[str, Union[Dict, List, str, int, float, bool]]] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, scale_score: bool = None) -> List[Document] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the query. + +**Arguments**: + +- `query`: The query +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. 
+ + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. + + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +#### MultihopEmbeddingRetriever.retrieve\_batch + +```python +def retrieve_batch(queries: List[str], filters: Optional[ + Union[ + Dict[str, Union[Dict, List, str, int, float, bool]], + List[Dict[str, Union[Dict, List, str, int, float, bool]]], + ] + ] = None, top_k: Optional[int] = None, index: str = None, headers: Optional[Dict[str, str]] = None, batch_size: Optional[int] = None, scale_score: bool = None) -> List[List[Document]] +``` + +Scan through documents in DocumentStore and return a small number documents + +that are most relevant to the supplied queries. 
+ +If you supply a single query, a single list of Documents is returned. If you supply a list of queries, a list of +lists of Documents (one per query) is returned. + +**Arguments**: + +- `queries`: Single query string or list of queries. +- `filters`: Optional filters to narrow down the search space to documents whose metadata fulfill certain +conditions. Can be a single filter that will be applied to each query or a list of filters +(one filter per query). + +Filters are defined as nested dictionaries. The keys of the dictionaries can be a logical +operator (`"$and"`, `"$or"`, `"$not"`), a comparison operator (`"$eq"`, `"$in"`, `"$gt"`, +`"$gte"`, `"$lt"`, `"$lte"`) or a metadata field name. +Logical operator keys take a dictionary of metadata field names and/or logical operators as +value. Metadata field names take a dictionary of comparison operators as value. Comparison +operator keys take a single value or (in case of `"$in"`) a list of values as value. +If no logical operator is provided, `"$and"` is used as default operation. If no comparison +operator is provided, `"$eq"` (or `"$in"` if the comparison value is a list) is used as default +operation. + + __Example__: + ```python + filters = { + "$and": { + "type": {"$eq": "article"}, + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": {"$in": ["economy", "politics"]}, + "publisher": {"$eq": "nytimes"} + } + } + } + # or simpler using default operators + filters = { + "type": "article", + "date": {"$gte": "2015-01-01", "$lt": "2021-01-01"}, + "rating": {"$gte": 3}, + "$or": { + "genre": ["economy", "politics"], + "publisher": "nytimes" + } + } + ``` + + To use the same logical operator multiple times on the same level, logical operators take + optionally a list of dictionaries as value. 
+ + __Example__: + ```python + filters = { + "$or": [ + { + "$and": { + "Type": "News Paper", + "Date": { + "$lt": "2019-01-01" + } + } + }, + { + "$and": { + "Type": "Blog Post", + "Date": { + "$gte": "2019-01-01" + } + } + } + ] + } + ``` +- `top_k`: How many documents to return per query. +- `index`: The name of the index in the DocumentStore from which to retrieve documents +- `batch_size`: Number of queries to embed at a time. +- `scale_score`: Whether to scale the similarity score to the unit interval (range of [0,1]). +If true similarity scores (e.g. cosine or dot_product) which naturally have a different +value range will be scaled to a range of [0,1], where 1 means extremely relevant. +Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. + + + +# Module text2sparql + + + +## Text2SparqlRetriever + +```python +class Text2SparqlRetriever(BaseGraphRetriever) +``` + +Graph retriever that uses a pre-trained Bart model to translate natural language questions +given in text form to queries in SPARQL format. +The generated SPARQL query is executed on a knowledge graph. + + + +#### Text2SparqlRetriever.\_\_init\_\_ + +```python +def __init__(knowledge_graph, model_name_or_path, top_k: int = 1) +``` + +Init the Retriever by providing a knowledge graph and a pre-trained BART model + +**Arguments**: + +- `knowledge_graph`: An instance of BaseKnowledgeGraph on which to execute SPARQL queries. +- `model_name_or_path`: Name of or path to a pre-trained BartForConditionalGeneration model. +- `top_k`: How many SPARQL queries to generate per text query. + + + +#### Text2SparqlRetriever.retrieve + +```python +def retrieve(query: str, top_k: Optional[int] = None) +``` + +Translate a text query to SPARQL and execute it on the knowledge graph to retrieve a list of answers + +**Arguments**: + +- `query`: Text query that shall be translated to SPARQL and then executed on the knowledge graph +- `top_k`: How many SPARQL queries to generate per text query. 
+ + + +#### Text2SparqlRetriever.retrieve\_batch + +```python +def retrieve_batch(queries: List[str], top_k: Optional[int] = None) +``` + +Translate a list of queries to SPARQL and execute it on the knowledge graph to retrieve + +a list of lists of answers (one per query). + +**Arguments**: + +- `queries`: List of queries that shall be translated to SPARQL and then executed on the +knowledge graph. +- `top_k`: How many SPARQL queries to generate per text query. + + + +#### Text2SparqlRetriever.format\_result + +```python +def format_result(result) +``` + +Generate formatted dictionary output with text answer and additional info + +**Arguments**: + +- `result`: The result of a SPARQL query as retrieved from the knowledge graph + diff --git a/docs/v1.7.0/_src/api/api/summarizer.md b/docs/v1.7.0/_src/api/api/summarizer.md new file mode 100644 index 0000000000..134f6dce29 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/summarizer.md @@ -0,0 +1,165 @@ + + +# Module base + + + +## BaseSummarizer + +```python +class BaseSummarizer(BaseComponent) +``` + +Abstract class for Summarizer + + + +#### BaseSummarizer.predict + +```python +@abstractmethod +def predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document] +``` + +Abstract method for creating a summary. + +**Arguments**: + +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. +- `generate_single_summary`: Whether to generate a single summary for all documents or one summary per document. +If set to "True", all docs will be joined to a single string that will then +be summarized. +Important: The summary will depend on the order of the supplied documents! 
+ +**Returns**: + +List of Documents, where Document.content contains the summarization and Document.meta["context"] +the original, not summarized text + + + +# Module transformers + + + +## TransformersSummarizer + +```python +class TransformersSummarizer(BaseSummarizer) +``` + +Transformer based model to summarize the documents using the HuggingFace's transformers framework + +You can use any model that has been fine-tuned on a summarization task. For example: +'`bart-large-cnn`', '`t5-small`', '`t5-base`', '`t5-large`', '`t5-3b`', '`t5-11b`'. +See the up-to-date list of available models on +`huggingface.co/models `__ + +**Example** + +```python +| docs = [Document(text="PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions." +| "The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by" +| "the shutoffs which were expected to last through at least midday tomorrow.")] +| +| # Summarize +| summary = summarizer.predict( +| documents=docs, +| generate_single_summary=True +| ) +| +| # Show results (List of Documents, containing summary and original text) +| print(summary) +| +| [ +| { +| "text": "California's largest electricity provider has turned off power to hundreds of thousands of customers.", +| ... +| "meta": { +| "context": "PGE stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. ..." +| }, +| ... +| }, +``` + + + +#### TransformersSummarizer.\_\_init\_\_ + +```python +def __init__(model_name_or_path: str = "google/pegasus-xsum", model_version: Optional[str] = None, tokenizer: Optional[str] = None, max_length: int = 200, min_length: int = 5, use_gpu: bool = True, clean_up_tokenization_spaces: bool = True, separator_for_single_summary: str = " ", generate_single_summary: bool = False, batch_size: int = 16, progress_bar: bool = True) +``` + +Load a Summarization model from Transformers. 
+ +See the up-to-date list of available models at +https://huggingface.co/models?filter=summarization + +**Arguments**: + +- `model_name_or_path`: Directory of a saved model or the name of a public model e.g. +'facebook/rag-token-nq', 'facebook/rag-sequence-nq'. +See https://huggingface.co/models?filter=summarization for full list of available models. +- `model_version`: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash. +- `tokenizer`: Name of the tokenizer (usually the same as model) +- `max_length`: Maximum length of summarized text +- `min_length`: Minimum length of summarized text +- `use_gpu`: Whether to use GPU (if available). +- `clean_up_tokenization_spaces`: Whether or not to clean up the potential extra spaces in the text output +- `separator_for_single_summary`: If `generate_single_summary=True` in `predict()`, we need to join all docs +into a single text. This separator appears between those subsequent docs. +- `generate_single_summary`: Whether to generate a single summary for all documents or one summary per document. +If set to "True", all docs will be joined to a single string that will then +be summarized. +Important: The summary will depend on the order of the supplied documents! +- `batch_size`: Number of documents to process at a time. +- `progress_bar`: Whether to show a progress bar. + + + +#### TransformersSummarizer.predict + +```python +def predict(documents: List[Document], generate_single_summary: Optional[bool] = None) -> List[Document] +``` + +Produce the summarization from the supplied documents. + +These document can for example be retrieved via the Retriever. + +**Arguments**: + +- `documents`: Related documents (e.g. coming from a retriever) that the answer shall be conditioned on. +- `generate_single_summary`: Whether to generate a single summary for all documents or one summary per document. 
+If set to "True", all docs will be joined to a single string that will then +be summarized. +Important: The summary will depend on the order of the supplied documents! + +**Returns**: + +List of Documents, where Document.text contains the summarization and Document.meta["context"] +the original, not summarized text + + + +#### TransformersSummarizer.predict\_batch + +```python +def predict_batch(documents: Union[List[Document], List[List[Document]]], generate_single_summary: Optional[bool] = None, batch_size: Optional[int] = None) -> Union[List[Document], List[List[Document]]] +``` + +Produce the summarization from the supplied documents. + +These documents can for example be retrieved via the Retriever. + +**Arguments**: + +- `documents`: Single list of related documents or list of lists of related documents +(e.g. coming from a retriever) that the answer shall be conditioned on. +- `generate_single_summary`: Whether to generate a single summary for each provided document list or +one summary per document. +If set to "True", all docs of a document list will be joined to a single string +that will then be summarized. +Important: The summary will depend on the order of the supplied documents! +- `batch_size`: Number of Documents to process at a time. + diff --git a/docs/v1.7.0/_src/api/api/translator.md b/docs/v1.7.0/_src/api/api/translator.md new file mode 100644 index 0000000000..159a6b987e --- /dev/null +++ b/docs/v1.7.0/_src/api/api/translator.md @@ -0,0 +1,130 @@ + + +# Module base + + + +## BaseTranslator + +```python +class BaseTranslator(BaseComponent) +``` + +Abstract class for a Translator component that translates either a query or a doc from language A to language B. 
+ + + +#### BaseTranslator.translate + +```python +@abstractmethod +def translate(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]] +``` + +Translate the passed query or a list of documents from language A to B. + + + +#### BaseTranslator.run + +```python +def run(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, answers: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) +``` + +Method that gets executed when this class is used as a Node in a Haystack Pipeline + + + +# Module transformers + + + +## TransformersTranslator + +```python +class TransformersTranslator(BaseTranslator) +``` + +Translator component based on Seq2Seq models from Huggingface's transformers library. +Exemplary use cases: +- Translate a query from Language A to B (e.g. if you only have good models + documents in language B) +- Translate a document from Language A to B (e.g. 
if you want to return results in the native language of the user) + +We currently recommend using OPUS models (see __init__() for details) + +**Example:** + +```python +| DOCS = [ +| Document(content="Heinz von Foerster was an Austrian American scientist combining physics and philosophy, +| and widely attributed as the originator of Second-order cybernetics.") +| ] +| translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de") +| res = translator.translate(documents=DOCS, query=None) +``` + + + +#### TransformersTranslator.\_\_init\_\_ + +```python +def __init__(model_name_or_path: str, tokenizer_name: Optional[str] = None, max_seq_len: Optional[int] = None, clean_up_tokenization_spaces: Optional[bool] = True, use_gpu: bool = True, progress_bar: bool = True) +``` + +Initialize the translator with a model that fits your targeted languages. While we support all seq2seq + +models from Hugging Face's model hub, we recommend using the OPUS models from Helsinki NLP. They provide plenty +of different models, usually one model per language pair and translation direction. +They have a pretty standardized naming that should help you find the right model: +- "Helsinki-NLP/opus-mt-en-de" => translating from English to German +- "Helsinki-NLP/opus-mt-de-en" => translating from German to English +- "Helsinki-NLP/opus-mt-fr-en" => translating from French to English +- "Helsinki-NLP/opus-mt-hi-en"=> translating from Hindi to English +... + +They also have a few multilingual models that support multiple languages at once. + +**Arguments**: + +- `model_name_or_path`: Name of the seq2seq model that shall be used for translation. +Can be a remote name from Huggingface's modelhub or a local path. +- `tokenizer_name`: Optional tokenizer name. If not supplied, `model_name_or_path` will also be used for the +tokenizer. +- `max_seq_len`: The maximum sentence length the model accepts. 
(Optional)
+- `clean_up_tokenization_spaces`: Whether or not to clean up the tokenization spaces. (default True)
+- `use_gpu`: Whether to use GPU or the CPU. Falls back on CPU if no GPU is available.
+- `progress_bar`: Whether to show a progress bar.
+
+
+
+#### TransformersTranslator.translate
+
+```python
+def translate(results: List[Dict[str, Any]] = None, query: Optional[str] = None, documents: Optional[Union[List[Document], List[Answer], List[str], List[Dict[str, Any]]]] = None, dict_key: Optional[str] = None) -> Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]
+```
+
+Run the actual translation. You can supply a query or a list of documents. Whatever is supplied will be translated.
+
+**Arguments**:
+
+- `results`: Generated QA pairs to translate
+- `query`: The query string to translate
+- `documents`: The documents to translate
+- `dict_key`: If you pass a dictionary in `documents`, you can specify here the field which shall be translated.
+
+
+
+#### TransformersTranslator.translate\_batch
+
+```python
+def translate_batch(queries: Optional[List[str]] = None, documents: Optional[Union[List[Document], List[Answer], List[List[Document]], List[List[Answer]]]] = None, batch_size: Optional[int] = None) -> List[Union[str, List[Document], List[Answer], List[str], List[Dict[str, Any]]]]
+```
+
+Run the actual translation. You can supply a single query, a list of queries or a list (of lists) of documents.
+
+**Arguments**:
+
+- `queries`: Single query or list of queries.
+- `documents`: List of documents or list of lists of documents.
+- `batch_size`: Not applicable.
+ diff --git a/docs/v1.7.0/_src/api/api/utils.md b/docs/v1.7.0/_src/api/api/utils.md new file mode 100644 index 0000000000..d8c3583d47 --- /dev/null +++ b/docs/v1.7.0/_src/api/api/utils.md @@ -0,0 +1,301 @@ + + +# Module export\_utils + + + +#### print\_answers + +```python +def print_answers(results: dict, details: str = "all", max_text_len: Optional[int] = None) +``` + +Utility function to print results of Haystack pipelines + +**Arguments**: + +- `results`: Results that the pipeline returned. +- `details`: Defines the level of details to print. Possible values: minimum, medium, all. +- `max_text_len`: Specifies the maximum allowed length for a text field. If you don't want to shorten the text, set this value to None. + +**Returns**: + +None + + + +#### print\_documents + +```python +def print_documents(results: dict, max_text_len: Optional[int] = None, print_name: bool = True, print_meta: bool = False) +``` + +Utility that prints a compressed representation of the documents returned by a pipeline. + +**Arguments**: + +- `max_text_len`: Shorten the document's content to a maximum number of characters. When set to `None`, the document is not shortened. +- `print_name`: Whether to print the document's name from the metadata. +- `print_meta`: Whether to print the document's metadata. + + + +#### print\_questions + +```python +def print_questions(results: dict) +``` + +Utility to print the output of a question generating pipeline in a readable format. + + + +#### export\_answers\_to\_csv + +```python +def export_answers_to_csv(agg_results: list, output_file) +``` + +Exports answers coming from finder.get_answers() to a CSV file. + +**Arguments**: + +- `agg_results`: A list of predictions coming from finder.get_answers(). +- `output_file`: The name of the output file. 
+
+**Returns**:
+
+None
+
+
+
+#### convert\_labels\_to\_squad
+
+```python
+def convert_labels_to_squad(labels_file: str)
+```
+
+Convert the export from the labeling UI to the SQuAD format for training.
+
+**Arguments**:
+
+- `labels_file`: The path to the file containing labels.
+
+
+
+# Module preprocessing
+
+
+
+#### convert\_files\_to\_docs
+
+```python
+def convert_files_to_docs(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, encoding: Optional[str] = None, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Convert all files (.txt, .pdf, .docx) in the sub-directories of the given path to Documents that can be written to a
+
+Document Store.
+
+**Arguments**:
+
+- `dir_path`: The path of the directory containing the Files.
+- `clean_func`: A custom cleaning function that gets applied to each Document (input: str, output: str).
+- `split_paragraphs`: Whether to split text by paragraph.
+- `encoding`: Character encoding to use when converting pdf documents.
+- `id_hash_keys`: A list of Document attribute names from which the Document ID should be hashed from.
+Useful for generating unique IDs even if the Document contents are identical.
+To ensure you don't have duplicate Documents in your Document Store if texts are
+not unique, you can modify the metadata and pass [`"content"`, `"meta"`] to this field.
+If you do this, the Document ID will be generated by using the content and the defined metadata.
+
+
+
+#### tika\_convert\_files\_to\_docs
+
+```python
+def tika_convert_files_to_docs(dir_path: str, clean_func: Optional[Callable] = None, split_paragraphs: bool = False, merge_short: bool = True, merge_lowercase: bool = True, id_hash_keys: Optional[List[str]] = None) -> List[Document]
+```
+
+Convert all files (.txt, .pdf) in the sub-directories of the given path to Documents that can be written to a
+
+Document Store.
+ +**Arguments**: + +- `merge_lowercase`: Whether to convert merged paragraphs to lowercase. +- `merge_short`: Whether to allow merging of short paragraphs +- `dir_path`: The path to the directory containing the files. +- `clean_func`: A custom cleaning function that gets applied to each doc (input: str, output:str). +- `split_paragraphs`: Whether to split text by paragraphs. +- `id_hash_keys`: A list of Document attribute names from which the Document ID should be hashed from. +Useful for generating unique IDs even if the Document contents are identical. +To ensure you don't have duplicate Documents in your Document Store if texts are +not unique, you can modify the metadata and pass [`"content"`, `"meta"`] to this field. +If you do this, the Document ID will be generated by using the content and the defined metadata. + + + +# Module squad\_data + + + +## SquadData + +```python +class SquadData() +``` + +This class is designed to manipulate data that is in SQuAD format + + + +#### SquadData.\_\_init\_\_ + +```python +def __init__(squad_data) +``` + +**Arguments**: + +- `squad_data`: SQuAD format data, either as a dictionary with a `data` key, or just a list of SQuAD documents. + + + +#### SquadData.merge\_from\_file + +```python +def merge_from_file(filename: str) +``` + +Merge the contents of a JSON file in the SQuAD format with the data stored in this object. + + + +#### SquadData.merge + +```python +def merge(new_data: List) +``` + +Merge data in SQuAD format with the data stored in this object. + +**Arguments**: + +- `new_data`: A list of SQuAD document data. + + + +#### SquadData.from\_file + +```python +@classmethod +def from_file(cls, filename: str) +``` + +Create a SquadData object by providing the name of a JSON file in the SQuAD format. + + + +#### SquadData.save + +```python +def save(filename: str) +``` + +Write the data stored in this object to a JSON file. 
+ + + +#### SquadData.to\_document\_objs + +```python +def to_document_objs() +``` + +Export all paragraphs stored in this object to haystack.Document objects. + + + +#### SquadData.to\_label\_objs + +```python +def to_label_objs() +``` + +Export all labels stored in this object to haystack.Label objects. + + + +#### SquadData.to\_df + +```python +@staticmethod +def to_df(data) +``` + +Convert a list of SQuAD document dictionaries into a pandas dataframe (each row is one annotation). + + + +#### SquadData.count + +```python +def count(unit="questions") +``` + +Count the samples in the data. Choose a unit: "paragraphs", "questions", "answers", "no_answers", "span_answers". + + + +#### SquadData.df\_to\_data + +```python +@classmethod +def df_to_data(cls, df) +``` + +Convert a data frame into the SQuAD format data (list of SQuAD document dictionaries). + + + +#### SquadData.sample\_questions + +```python +def sample_questions(n) +``` + +Return a sample of n questions in the SQuAD format (a list of SQuAD document dictionaries). +Note that if the same question is asked on multiple different passages, this function treats that +as a single question. + + + +#### SquadData.get\_all\_paragraphs + +```python +def get_all_paragraphs() +``` + +Return all paragraph strings. + + + +#### SquadData.get\_all\_questions + +```python +def get_all_questions() +``` + +Return all question strings. Note that if the same question appears for different paragraphs, this function returns it multiple times. + + + +#### SquadData.get\_all\_document\_titles + +```python +def get_all_document_titles() +``` + +Return all document title strings. + diff --git a/docs/v1.7.0/_src/api/conf.py b/docs/v1.7.0/_src/api/conf.py new file mode 100644 index 0000000000..46046eccc0 --- /dev/null +++ b/docs/v1.7.0/_src/api/conf.py @@ -0,0 +1,52 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import sphinx_rtd_theme +import os +import sys + +sys.path.append("/Users/deepset/deepset/haystack") + + +# -- Project information ----------------------------------------------------- + +project = "Haystack" +copyright = "2020, deepset" +author = "deepset" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ["sphinx.ext.autodoc", "sphinx_rtd_theme"] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "sphinx_rtd_theme" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ["_static"] diff --git a/docs/v1.7.0/_src/api/img/annotation_tool.png b/docs/v1.7.0/_src/api/img/annotation_tool.png new file mode 100644 index 0000000000..eb2c601d9e Binary files /dev/null and b/docs/v1.7.0/_src/api/img/annotation_tool.png differ diff --git a/docs/v1.7.0/_src/api/img/code_snippet_usage.png b/docs/v1.7.0/_src/api/img/code_snippet_usage.png new file mode 100644 index 0000000000..e7d836bd9c Binary files /dev/null and b/docs/v1.7.0/_src/api/img/code_snippet_usage.png differ diff --git a/docs/v1.7.0/_src/api/img/colab_gpu_runtime.jpg b/docs/v1.7.0/_src/api/img/colab_gpu_runtime.jpg new file mode 100644 index 0000000000..883180b97e Binary files /dev/null and b/docs/v1.7.0/_src/api/img/colab_gpu_runtime.jpg differ diff --git a/docs/v1.7.0/_src/api/img/sketched_concepts_white.png b/docs/v1.7.0/_src/api/img/sketched_concepts_white.png new file mode 100644 index 0000000000..9fe5fd5c94 Binary files /dev/null and b/docs/v1.7.0/_src/api/img/sketched_concepts_white.png differ diff --git a/docs/v1.7.0/_src/api/index.rst b/docs/v1.7.0/_src/api/index.rst new file mode 100644 index 0000000000..42ff660913 --- /dev/null +++ b/docs/v1.7.0/_src/api/index.rst @@ -0,0 +1,16 @@ +.. Haystack documentation master file, created by + sphinx-quickstart on Tue Jul 28 14:14:55 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + api/database + api/retriever + api/reader + api/indexing + api/rest_api + api/file_converters + api/finder diff --git a/docs/v1.7.0/_src/api/make.bat b/docs/v1.7.0/_src/api/make.bat new file mode 100644 index 0000000000..2119f51099 --- /dev/null +++ b/docs/v1.7.0/_src/api/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. 
+set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/v1.7.0/_src/api/openapi/openapi-1.2.0.json b/docs/v1.7.0/_src/api/openapi/openapi-1.2.0.json new file mode 100644 index 0000000000..36971bd89f --- /dev/null +++ b/docs/v1.7.0/_src/api/openapi/openapi-1.2.0.json @@ -0,0 +1,834 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.2.0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status_initialized_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version_hs_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": 
"This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query_query_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback_feedback_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/LabelSerialized" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, 
+ "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback_feedback_delete", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics_eval_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large 
documents.", + "operationId": "export_feedback_export_feedback_get", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file_file_upload_post", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata 
(like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents_documents_get_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents_documents_delete_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + 
"description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "AnswerSerialized": { + "title": "AnswerSerialized", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "type": "string" + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + 
"title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "DocumentSerialized": { + "title": "DocumentSerialized", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "type": "string" + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "array", + "items": { + "type": "number" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", 
+ "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "LabelSerialized": { + "title": "LabelSerialized", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + 
"title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "type": "string" + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/openapi/openapi-1.2.1rc0.json b/docs/v1.7.0/_src/api/openapi/openapi-1.2.1rc0.json new file mode 100644 index 0000000000..5958d6a11f --- /dev/null +++ b/docs/v1.7.0/_src/api/openapi/openapi-1.2.1rc0.json @@ -0,0 +1,827 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.2.1rc0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status_initialized_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + 
"tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version_hs_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query_query_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback_feedback_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback_feedback_post", 
+ "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/LabelSerialized" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback_feedback_delete", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics_eval_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": 
"#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback_export_feedback_get", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file_file_upload_post", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { 
+ "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents_documents_get_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": 
"delete_documents_documents_delete_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "AnswerSerialized": { + "title": "AnswerSerialized", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "type": "string" + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + 
"clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "DocumentSerialized": { + "title": "DocumentSerialized", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "type": "string" + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + 
"title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "array", + "items": { + "type": "number" + } + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "LabelSerialized": { + "title": "LabelSerialized", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + 
"title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "type": "string" + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/openapi/openapi-1.3.0.json b/docs/v1.7.0/_src/api/openapi/openapi-1.3.0.json new file mode 100644 index 0000000000..cd388129fb --- /dev/null +++ b/docs/v1.7.0/_src/api/openapi/openapi-1.3.0.json @@ -0,0 +1,834 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.3.0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is 
to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status_initialized_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version_hs_version_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query_query_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback_feedback_get", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API 
user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/LabelSerialized" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback_feedback_delete", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics_eval_feedback_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + 
"$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback_export_feedback_get", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file_file_upload_post", + "requestBody": 
{ + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents_documents_get_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's 
name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents_documents_delete_by_filters_post", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "AnswerSerialized": { + "title": "AnswerSerialized", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "type": "string" + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": 
"array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "DocumentSerialized": { + "title": "DocumentSerialized", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + 
"type": "object", + "properties": { + "content": { + "title": "Content", + "type": "string" + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "array", + "items": { + "type": "number" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "LabelSerialized": { + "title": "LabelSerialized", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + 
"created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/AnswerSerialized" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/DocumentSerialized" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "type": "string" + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/openapi/openapi-1.3.1rc0.json b/docs/v1.7.0/_src/api/openapi/openapi-1.3.1rc0.json new file mode 100644 index 0000000000..8fff7c9626 --- /dev/null +++ 
b/docs/v1.7.0/_src/api/openapi/openapi-1.3.1rc0.json @@ -0,0 +1,892 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.3.1rc0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to 
retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Feedback Feedback Get", + "type": "array", + "items": { + "$ref": "#/components/schemas/Label" + } + } + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/Label" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user 
feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + 
"schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + 
"application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "Answer": { + "title": "Answer", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": 
"array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", 
+ "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "Document": { + "title": "Document", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "string" + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "Label": { + "title": "Label", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + 
"type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/Answer" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + 
"title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/openapi/openapi-1.4.0.json b/docs/v1.7.0/_src/api/openapi/openapi-1.4.0.json new file mode 100644 index 0000000000..9dd0363856 --- /dev/null +++ b/docs/v1.7.0/_src/api/openapi/openapi-1.4.0.json @@ -0,0 +1,892 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.4.0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query", + "requestBody": { + "content": { + "application/json": { + 
"schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Feedback Feedback Get", + "type": "array", + "items": { + "$ref": "#/components/schemas/Label" + } + } + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/Label" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } 
+ } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been submitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + 
"in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an 
empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "Answer": { + "title": "Answer", + "required": [ + "answer" + ], + "type": "object", + "properties": { + 
"answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, 
+ "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "Document": { + "title": "Document", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "string" + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": 
"string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "Label": { + "title": "Label", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + 
"title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/Answer" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/openapi/openapi-1.4.1rc0.json b/docs/v1.7.0/_src/api/openapi/openapi-1.4.1rc0.json new file mode 100644 index 0000000000..1f61a21ec4 --- /dev/null +++ b/docs/v1.7.0/_src/api/openapi/openapi-1.4.1rc0.json @@ -0,0 +1,893 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.4.1rc0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": 
"Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Feedback Feedback Get", + "type": "array", + "items": { + "$ref": "#/components/schemas/Label" + } + } + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA 
model.", + "operationId": "post_feedback", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/Label" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been submitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": 
"#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", 
+ "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": 
"#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "Answer": { + "title": "Answer", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty 
Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "Document": { + "title": "Document", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image", + "audio" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + 
"type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "string" + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "Label": { + "title": "Label", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": 
"Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/Answer" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/openapi/openapi-1.5.0.json b/docs/v1.7.0/_src/api/openapi/openapi-1.5.0.json new file mode 100644 index 0000000000..fd07ddd0d5 --- /dev/null +++ b/docs/v1.7.0/_src/api/openapi/openapi-1.5.0.json @@ -0,0 +1,892 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.5.0" + }, + 
"paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback", + "responses": { + "200": { + "description": 
"Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Feedback Feedback Get", + "type": "array", + "items": { + "$ref": "#/components/schemas/Label" + } + } + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/Label" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request 
POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": 
"Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + 
"document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "Answer": { + "title": "Answer", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": 
"object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": 
"Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "Document": { + "title": "Document", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "string" + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "Label": { + "title": "Label", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": 
"Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/Answer" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + 
"anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/openapi/openapi-1.5.1rc0.json b/docs/v1.7.0/_src/api/openapi/openapi-1.5.1rc0.json new file mode 100644 index 0000000000..9ff281d5a1 --- /dev/null +++ b/docs/v1.7.0/_src/api/openapi/openapi-1.5.1rc0.json @@ -0,0 +1,893 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.5.1rc0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": 
{ + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Feedback Feedback Get", + "type": "array", + "items": { + "$ref": "#/components/schemas/Label" + } + } + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + "$ref": "#/components/schemas/Label" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback 
that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", 
+ "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": 
"#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "Answer": { + "title": "Answer", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": 
"string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + 
"is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "Document": { + "title": "Document", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image", + "audio" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "string" + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + } + ] + } + } + } + }, + 
"HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "Label": { + "title": "Label", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/Answer" + }, + "default": [] + }, + "documents": { + 
"title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/openapi/openapi-1.6.0.json b/docs/v1.7.0/_src/api/openapi/openapi-1.6.0.json new file mode 100644 index 0000000000..5872045250 --- /dev/null +++ b/docs/v1.7.0/_src/api/openapi/openapi-1.6.0.json @@ -0,0 +1,893 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.6.0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version", + "responses": { + "200": { + "description": 
"Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Feedback Feedback Get", + "type": "array", + "items": { + "$ref": "#/components/schemas/Label" + } + } + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback", + "requestBody": { + "content": { + "application/json": { + "schema": { + "title": "Feedback", + "anyOf": [ + { + 
"$ref": "#/components/schemas/Label" + }, + { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + ] + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This 
endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ 
+ "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": 
{ + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "Answer": { + "title": "Answer", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + 
"title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "Document": { + "title": "Document", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image", + "audio" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "string" + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + 
"title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + } + ] + } + } + } + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "Label": { + "title": "Label", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + 
"properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/Answer" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/openapi/openapi-1.6.1rc0.json b/docs/v1.7.0/_src/api/openapi/openapi-1.6.1rc0.json new file mode 100644 index 0000000000..fcc336df71 --- /dev/null +++ b/docs/v1.7.0/_src/api/openapi/openapi-1.6.1rc0.json @@ -0,0 +1,886 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.6.1rc0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is 
ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Feedback Feedback Get", + "type": "array", + "items": { + "$ref": 
"#/components/schemas/Label" + } + } + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics", + "requestBody": { + "content": { + 
"application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file", + "requestBody": 
{ + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to retrieve by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON 
object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "Answer": { + "title": "Answer", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": 
"string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "Document": { + "title": "Document", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + 
"anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image", + "audio" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "string" + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + } + ] + } + } + }, + "additionalProperties": false + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "Label": { + "title": "Label", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], 
+ "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/Answer" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ 
No newline at end of file diff --git a/docs/v1.7.0/_src/api/openapi/openapi-1.7.0.json b/docs/v1.7.0/_src/api/openapi/openapi-1.7.0.json new file mode 100644 index 0000000000..6934f44969 --- /dev/null +++ b/docs/v1.7.0/_src/api/openapi/openapi-1.7.0.json @@ -0,0 +1,886 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.7.0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters that will be passed on to the Haystack pipeline.", + "operationId": "query", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": 
"#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Feedback Feedback Get", + "type": "array", + "items": { + "$ref": "#/components/schemas/Label" + } + } + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + 
"summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + "default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + 
"schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": 
"#/components/schemas/Document" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "Answer": { + "title": "Answer", + "required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + 
"$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + 
}, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "Document": { + "title": "Document", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image", + "audio" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "string" + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + }, + { + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + } + ] + } + } + }, + "additionalProperties": false + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "Label": { + "title": "Label", + "required": [ + "id", + 
"query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/Answer" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + 
"properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/openapi/openapi.json b/docs/v1.7.0/_src/api/openapi/openapi.json new file mode 100644 index 0000000000..209a1723ad --- /dev/null +++ b/docs/v1.7.0/_src/api/openapi/openapi.json @@ -0,0 +1,886 @@ +{ + "openapi": "3.0.2", + "info": { + "title": "Haystack REST API", + "version": "1.7.0" + }, + "paths": { + "/initialized": { + "get": { + "tags": [ + "search" + ], + "summary": "Check Status", + "description": "This endpoint can be used during startup to understand if the\nserver is ready to take any requests, or is still loading.\n\nThe recommended approach is to call this endpoint with a short timeout,\nlike 500ms, and in case of no reply, consider the server busy.", + "operationId": "check_status", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/hs_version": { + "get": { + "tags": [ + "search" + ], + "summary": "Haystack Version", + "description": "Get the running Haystack version.", + "operationId": "haystack_version", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/query": { + "post": { + "tags": [ + "search" + ], + "summary": "Query", + "description": "This endpoint receives the question as a string and allows the requester to set\nadditional parameters 
that will be passed on to the Haystack pipeline.", + "operationId": "query", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueryResponse" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback", + "description": "This endpoint allows the API user to retrieve all the feedback that has been submitted\nthrough the `POST /feedback` endpoint.", + "operationId": "get_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Feedback Feedback Get", + "type": "array", + "items": { + "$ref": "#/components/schemas/Label" + } + } + } + } + } + } + }, + "post": { + "tags": [ + "feedback" + ], + "summary": "Post Feedback", + "description": "This endpoint allows the API user to submit feedback on an answer for a particular query.\n\nFor example, the user can send feedback on whether the answer was correct and\nwhether the right snippet was identified as the answer.\n\nInformation submitted through this endpoint is used to train the underlying QA model.", + "operationId": "post_feedback", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/CreateLabelSerialized" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": 
"#/components/schemas/HTTPValidationError" + } + } + } + } + } + }, + "delete": { + "tags": [ + "feedback" + ], + "summary": "Delete Feedback", + "description": "This endpoint allows the API user to delete all the\nfeedback that has been sumbitted through the\n`POST /feedback` endpoint", + "operationId": "delete_feedback", + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + } + } + } + }, + "/eval-feedback": { + "post": { + "tags": [ + "feedback" + ], + "summary": "Get Feedback Metrics", + "description": "This endpoint returns basic accuracy metrics based on user feedback,\ne.g., the ratio of correct answers or correctly identified documents.\nYou can filter the output by document or label.\n\nExample:\n\n`curl --location --request POST 'http://127.0.0.1:8000/eval-doc-qa-feedback' --header 'Content-Type: application/json' --data-raw '{ \"filters\": {\"document_id\": [\"XRR3xnEBCYVTkbTystOB\"]} }'`", + "operationId": "get_feedback_metrics", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + } + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/export-feedback": { + "get": { + "tags": [ + "feedback" + ], + "summary": "Export Feedback", + "description": "This endpoint returns JSON output in the SQuAD format for question/answer pairs\nthat were marked as \"relevant\" by user feedback through the `POST /feedback` endpoint.\n\nThe context_size param can be used to limit response size for large documents.", + "operationId": "export_feedback", + "parameters": [ + { + "required": false, + "schema": { + "title": "Context Size", + "type": "integer", + 
"default": 100000 + }, + "name": "context_size", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Full Document Context", + "type": "boolean", + "default": true + }, + "name": "full_document_context", + "in": "query" + }, + { + "required": false, + "schema": { + "title": "Only Positive Labels", + "type": "boolean", + "default": false + }, + "name": "only_positive_labels", + "in": "query" + } + ], + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/file-upload": { + "post": { + "tags": [ + "file-upload" + ], + "summary": "Upload File", + "description": "You can use this endpoint to upload a file for indexing\n(see https://haystack.deepset.ai/guides/rest-api#indexing-documents-in-the-haystack-rest-api-document-store).", + "operationId": "upload_file", + "requestBody": { + "content": { + "multipart/form-data": { + "schema": { + "$ref": "#/components/schemas/Body_upload_file_file_upload_post" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": {} + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/get_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Get Documents", + "description": "This endpoint allows you to retrieve documents contained in your document store.\nYou can filter the documents to retrieve by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": 
[\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "get_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Get Documents Documents Get By Filters Post", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + } + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + }, + "/documents/delete_by_filters": { + "post": { + "tags": [ + "document" + ], + "summary": "Delete Documents", + "description": "This endpoint allows you to delete documents contained in your document store.\nYou can filter the documents to delete by metadata (like the document's name),\nor provide an empty JSON object to clear the document store.\n\nExample of filters:\n`'{\"filters\": {{\"name\": [\"some\", \"more\"], \"category\": [\"only_one\"]}}'`\n\nTo get all documents you should provide an empty dict, like:\n`'{\"filters\": {}}'`", + "operationId": "delete_documents", + "requestBody": { + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/FilterRequest" + } + } + }, + "required": true + }, + "responses": { + "200": { + "description": "Successful Response", + "content": { + "application/json": { + "schema": { + "title": "Response Delete Documents Documents Delete By Filters Post", + "type": "boolean" + } + } + } + }, + "422": { + "description": "Validation Error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/HTTPValidationError" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "Answer": { + "title": "Answer", + 
"required": [ + "answer" + ], + "type": "object", + "properties": { + "answer": { + "title": "Answer", + "type": "string" + }, + "type": { + "title": "Type", + "enum": [ + "generative", + "extractive", + "other" + ], + "type": "string", + "default": "extractive" + }, + "score": { + "title": "Score", + "type": "number" + }, + "context": { + "title": "Context", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "offsets_in_document": { + "title": "Offsets In Document", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "offsets_in_context": { + "title": "Offsets In Context", + "type": "array", + "items": { + "$ref": "#/components/schemas/Span" + } + }, + "document_id": { + "title": "Document Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + } + } + }, + "Body_upload_file_file_upload_post": { + "title": "Body_upload_file_file_upload_post", + "required": [ + "files" + ], + "type": "object", + "properties": { + "files": { + "title": "Files", + "type": "array", + "items": { + "type": "string", + "format": "binary" + } + }, + "meta": { + "title": "Meta", + "type": "string", + "default": "null" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables" + }, + "valid_languages": { + "title": "Valid Languages" + }, + "clean_whitespace": { + "title": "Clean Whitespace" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines" + }, + "clean_header_footer": { + "title": "Clean Header Footer" + }, + "split_by": { + "title": "Split By" + }, + "split_length": { + "title": "Split Length" + }, + "split_overlap": { + "title": "Split Overlap" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary" + } + } + }, + "CreateLabelSerialized": { + "title": "CreateLabelSerialized", + "required": [ + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": 
"string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + }, + "additionalProperties": false + }, + "Document": { + "title": "Document", + "required": [ + "content", + "content_type", + "id", + "meta" + ], + "type": "object", + "properties": { + "content": { + "title": "Content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + }, + "content_type": { + "title": "Content Type", + "enum": [ + "text", + "table", + "image", + "audio" + ], + "type": "string" + }, + "id": { + "title": "Id", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "score": { + "title": "Score", + "type": "number" + }, + "embedding": { + "title": "Embedding", + "type": "string" + } + } + }, + "FilterRequest": { + "title": "FilterRequest", + "type": "object", + "properties": { + "filters": { + "title": "Filters", + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + }, + { + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + }, + 
{ + "type": "object", + "additionalProperties": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + }, + { + "type": "number" + }, + { + "type": "boolean" + } + ] + } + } + ] + } + } + }, + "additionalProperties": false + }, + "HTTPValidationError": { + "title": "HTTPValidationError", + "type": "object", + "properties": { + "detail": { + "title": "Detail", + "type": "array", + "items": { + "$ref": "#/components/schemas/ValidationError" + } + } + } + }, + "Label": { + "title": "Label", + "required": [ + "id", + "query", + "document", + "is_correct_answer", + "is_correct_document", + "origin" + ], + "type": "object", + "properties": { + "id": { + "title": "Id", + "type": "string" + }, + "query": { + "title": "Query", + "type": "string" + }, + "document": { + "$ref": "#/components/schemas/Document" + }, + "is_correct_answer": { + "title": "Is Correct Answer", + "type": "boolean" + }, + "is_correct_document": { + "title": "Is Correct Document", + "type": "boolean" + }, + "origin": { + "title": "Origin", + "enum": [ + "user-feedback", + "gold-label" + ], + "type": "string" + }, + "answer": { + "$ref": "#/components/schemas/Answer" + }, + "no_answer": { + "title": "No Answer", + "type": "boolean" + }, + "pipeline_id": { + "title": "Pipeline Id", + "type": "string" + }, + "created_at": { + "title": "Created At", + "type": "string" + }, + "updated_at": { + "title": "Updated At", + "type": "string" + }, + "meta": { + "title": "Meta", + "type": "object" + }, + "filters": { + "title": "Filters", + "type": "object" + } + } + }, + "QueryRequest": { + "title": "QueryRequest", + "required": [ + "query" + ], + "type": "object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "params": { + "title": "Params", + "type": "object" + }, + "debug": { + "title": "Debug", + "type": "boolean", + "default": false + } + }, + "additionalProperties": false + }, + "QueryResponse": { + "title": "QueryResponse", + "required": [ + "query" + ], + "type": 
"object", + "properties": { + "query": { + "title": "Query", + "type": "string" + }, + "answers": { + "title": "Answers", + "type": "array", + "items": { + "$ref": "#/components/schemas/Answer" + }, + "default": [] + }, + "documents": { + "title": "Documents", + "type": "array", + "items": { + "$ref": "#/components/schemas/Document" + }, + "default": [] + }, + "_debug": { + "title": " Debug", + "type": "object" + } + } + }, + "Span": { + "title": "Span", + "required": [ + "start", + "end" + ], + "type": "object", + "properties": { + "start": { + "title": "Start", + "type": "integer" + }, + "end": { + "title": "End", + "type": "integer" + } + } + }, + "ValidationError": { + "title": "ValidationError", + "required": [ + "loc", + "msg", + "type" + ], + "type": "object", + "properties": { + "loc": { + "title": "Location", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "integer" + } + ] + } + }, + "msg": { + "title": "Message", + "type": "string" + }, + "type": { + "title": "Error Type", + "type": "string" + } + } + } + } + } +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/api/pydoc/answer-generator.yml b/docs/v1.7.0/_src/api/pydoc/answer-generator.yml new file mode 100644 index 0000000000..c4a4ca5b9b --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/answer-generator.yml @@ -0,0 +1,21 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/answer_generator] + modules: ['base', 'transformers'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: generator.md + diff --git a/docs/v1.7.0/_src/api/pydoc/crawler.yml b/docs/v1.7.0/_src/api/pydoc/crawler.yml new file mode 100644 index 
0000000000..b9489d589f --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/crawler.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/connector] + modules: ['crawler'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: crawler.md diff --git a/docs/v1.7.0/_src/api/pydoc/document-classifier.yml b/docs/v1.7.0/_src/api/pydoc/document-classifier.yml new file mode 100644 index 0000000000..4070d36773 --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/document-classifier.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/document_classifier] + modules: ['base', 'transformers'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: document_classifier.md diff --git a/docs/v1.7.0/_src/api/pydoc/document-store.yml b/docs/v1.7.0/_src/api/pydoc/document-store.yml new file mode 100644 index 0000000000..33181e3071 --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/document-store.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/document_stores] + modules: ['base', 'elasticsearch', 'opensearch', 'memory', 'sql', 'faiss', 'milvus1', 'milvus2', 'weaviate', 'graphdb', 'deepsetcloud', 'pinecone', 'utils'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + 
skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: document_store.md diff --git a/docs/v1.7.0/_src/api/pydoc/evaluation.yml b/docs/v1.7.0/_src/api/pydoc/evaluation.yml new file mode 100644 index 0000000000..cfff806d3f --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/evaluation.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/evaluator] + modules: ['evaluator'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: evaluation.md diff --git a/docs/v1.7.0/_src/api/pydoc/extractor.yml b/docs/v1.7.0/_src/api/pydoc/extractor.yml new file mode 100644 index 0000000000..5dd3add5eb --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/extractor.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/extractor] + modules: ['entity'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: extractor.md diff --git a/docs/v1.7.0/_src/api/pydoc/file-classifier.yml b/docs/v1.7.0/_src/api/pydoc/file-classifier.yml new file mode 100644 index 0000000000..b6fa4c94a2 --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/file-classifier.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: 
[../../../../haystack/nodes/file_classifier] + modules: ['file_type'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: file_classifier.md diff --git a/docs/v1.7.0/_src/api/pydoc/file-converters.yml b/docs/v1.7.0/_src/api/pydoc/file-converters.yml new file mode 100644 index 0000000000..14b17e26f6 --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/file-converters.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/file_converter] + modules: ['base', 'docx', 'image', 'markdown', 'pdf', 'parsr', 'azure', 'tika', 'txt'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: file_converter.md diff --git a/docs/v1.7.0/_src/api/pydoc/other-nodes.yml b/docs/v1.7.0/_src/api/pydoc/other-nodes.yml new file mode 100644 index 0000000000..0d3c4483e5 --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/other-nodes.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/other] + modules: ['docs2answers', 'join_docs', 'join_answers', 'route_documents'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + 
add_member_class_prefix: false + filename: other_nodes.md diff --git a/docs/v1.7.0/_src/api/pydoc/pipelines.yml b/docs/v1.7.0/_src/api/pydoc/pipelines.yml new file mode 100644 index 0000000000..705643ccac --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/pipelines.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/pipelines] + modules: ['base', 'ray', 'standard_pipelines'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: pipelines.md diff --git a/docs/v1.7.0/_src/api/pydoc/preprocessor.yml b/docs/v1.7.0/_src/api/pydoc/preprocessor.yml new file mode 100644 index 0000000000..5481f76c15 --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/preprocessor.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/preprocessor] + modules: ['base', 'preprocessor'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: preprocessor.md diff --git a/docs/v1.7.0/_src/api/pydoc/primitives.yml b/docs/v1.7.0/_src/api/pydoc/primitives.yml new file mode 100644 index 0000000000..6a3e8f9d41 --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/primitives.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/] + modules: ['schema'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + 
skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: primitives.md diff --git a/docs/v1.7.0/_src/api/pydoc/pseudo-label-generator.yml b/docs/v1.7.0/_src/api/pydoc/pseudo-label-generator.yml new file mode 100644 index 0000000000..f21f5a388c --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/pseudo-label-generator.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/label_generator] + modules: ['pseudo_label_generator'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: pseudo_label_generator.md diff --git a/docs/v1.7.0/_src/api/pydoc/query-classifier.yml b/docs/v1.7.0/_src/api/pydoc/query-classifier.yml new file mode 100644 index 0000000000..5be82ee872 --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/query-classifier.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/query_classifier] + modules: ['base', 'sklearn', 'transformers'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: query_classifier.md diff --git a/docs/v1.7.0/_src/api/pydoc/question-generator.yml b/docs/v1.7.0/_src/api/pydoc/question-generator.yml new file mode 100644 index 0000000000..4d52568635 --- /dev/null +++ 
b/docs/v1.7.0/_src/api/pydoc/question-generator.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/question_generator] + modules: ['question_generator'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: question_generator.md diff --git a/docs/v1.7.0/_src/api/pydoc/ranker.yml b/docs/v1.7.0/_src/api/pydoc/ranker.yml new file mode 100644 index 0000000000..dfbce80af6 --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/ranker.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/ranker] + modules: ['base', 'sentence_transformers'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: ranker.md diff --git a/docs/v1.7.0/_src/api/pydoc/reader.yml b/docs/v1.7.0/_src/api/pydoc/reader.yml new file mode 100644 index 0000000000..1910d36e7e --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/reader.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/reader] + modules: ['base', 'farm', 'transformers', 'table'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + 
add_member_class_prefix: false + filename: reader.md diff --git a/docs/v1.7.0/_src/api/pydoc/retriever.yml b/docs/v1.7.0/_src/api/pydoc/retriever.yml new file mode 100644 index 0000000000..ee64cdfa04 --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/retriever.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/retriever] + modules: ['base', 'sparse', 'dense', 'text2sparql'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: retriever.md diff --git a/docs/v1.7.0/_src/api/pydoc/summarizer.yml b/docs/v1.7.0/_src/api/pydoc/summarizer.yml new file mode 100644 index 0000000000..d6c53bc25f --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/summarizer.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/summarizer] + modules: ['base', 'transformers'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: summarizer.md diff --git a/docs/v1.7.0/_src/api/pydoc/translator.yml b/docs/v1.7.0/_src/api/pydoc/translator.yml new file mode 100644 index 0000000000..36038321e8 --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/translator.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/nodes/translator] + modules: ['base', 'transformers'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + 
do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: translator.md diff --git a/docs/v1.7.0/_src/api/pydoc/utils.yml b/docs/v1.7.0/_src/api/pydoc/utils.yml new file mode 100644 index 0000000000..ff6fca5c08 --- /dev/null +++ b/docs/v1.7.0/_src/api/pydoc/utils.yml @@ -0,0 +1,20 @@ +loaders: + - type: python + search_path: [../../../../haystack/utils] + modules: ['doc_store', 'export_utils', 'preprocessing', 'squad_data'] + ignore_when_discovered: ['__init__'] +processors: + - type: filter + expression: + documented_only: true + do_not_filter_modules: false + skip_empty_modules: true + - type: smart + - type: crossref +renderer: + type: markdown + descriptive_class_title: false + descriptive_module_title: true + add_method_class_prefix: true + add_member_class_prefix: false + filename: utils.md diff --git a/docs/v1.7.0/_src/benchmarks/farm_per_component.html b/docs/v1.7.0/_src/benchmarks/farm_per_component.html new file mode 100644 index 0000000000..6a9d3d5cea --- /dev/null +++ b/docs/v1.7.0/_src/benchmarks/farm_per_component.html @@ -0,0 +1,48 @@ + + + + + + + +
+ + + diff --git a/docs/v1.7.0/_src/benchmarks/reader_performance.json b/docs/v1.7.0/_src/benchmarks/reader_performance.json new file mode 100644 index 0000000000..be935fe271 --- /dev/null +++ b/docs/v1.7.0/_src/benchmarks/reader_performance.json @@ -0,0 +1,44 @@ +{ + "chart_type": "BarChart", + "title": "Reader Performance", + "subtitle": "Time and Accuracy Benchmarks", + "description": "Performance benchmarks of different Readers that can be used off-the-shelf in Haystack. Some models are geared towards speed, while others are more performance-focused. Accuracy is measured as F1 score and speed as passages/sec (with passages of 384 tokens). Each Reader is benchmarked using the SQuAD v2.0 development set, which contains 11866 question answer pairs. When tokenized using the BERT tokenizer and split using a sliding window approach, these become 12350 passages that are passed into the model. We set max_seq_len=384 and doc_stride=128. These benchmarking tests are run using an AWS p3.2xlarge instance with a Nvidia V100 GPU with this script. Please note that we are using the FARMReader class rather than the TransformersReader class. 
Also, the F1 measure that is reported here is in fact calculated on token level, rather than word level as is done in the official SQuAD script.", + "bars": "horizontal", + "columns": [ + "Model", + "F1", + "Speed (passages/sec)" + ], + "data": [ + { + "F1": 82.58860575299658, + "Speed": 125.81040525892848, + "Model": "RoBERTa" + }, + { + "F1": 78.87858491007042, + "Speed": 260.6443097981493, + "Model": "MiniLM" + }, + { + "F1": 74.31182400443286, + "Speed": 121.08066567525722, + "Model": "BERT base" + }, + { + "F1": 83.26306774734308, + "Speed": 42.21949937744112, + "Model": "BERT large" + }, + { + "F1": 84.50422699207468, + "Speed": 42.07400844838985, + "Model": "XLM-RoBERTa" + }, + { + "F1": 42.31925844723574, + "Speed": 222.91207128366702, + "Model": "DistilBERT" + } + ] +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/benchmarks/retriever_map.json b/docs/v1.7.0/_src/benchmarks/retriever_map.json new file mode 100644 index 0000000000..51e0687cf3 --- /dev/null +++ b/docs/v1.7.0/_src/benchmarks/retriever_map.json @@ -0,0 +1,204 @@ +{ + "chart_type": "LineChart", + "title": "Retriever Accuracy", + "subtitle": "mAP at different number of docs", + "description": "Here you can see how the mean avg. precision (mAP) of the retriever decays as the number of documents increases. 
The set up is the same as the above querying benchmark except that a varying number of negative documents are used to fill the document store.", + "columns": [ + "n_docs", + "BM25 / Elasticsearch", + "DPR / Elasticsearch", + "DPR / FAISS (flat)", + "DPR / FAISS (HNSW)", + "DPR / Milvus (flat)", + "DPR / Milvus (HNSW)", + "Sentence Transformers / Elasticsearch" + ], + "axis": [ + { + "x": "Number of docs", + "y": "mAP" + } + ], + "data": [ + { + "model": "DPR / Elasticsearch", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 10000, + "map": 66.26543444531747 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 1000, + "map": 90.06638620360428 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 10000, + "map": 87.11255142468549 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 10000, + "map": 89.51337675393017 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 10000, + "map": 89.87097014904354 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 10000, + "map": 88.24421129104469 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 100000, + "map": 86.54606328368976 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 500000, + "map": 80.86137228234091 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 100000, + "map": 56.25299537353825 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 500000, + "map": 45.595090262466535 + }, + { + 
"model": "Sentence Transformers / Elasticsearch", + "n_docs": 100000, + "map": 82.74686664920836 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 500000, + "map": 76.49564526892904 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "map": 86.54606328368973 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 500000, + "map": 80.86137228234091 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 100000, + "map": 84.33419639513305 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 500000, + "map": 75.73062475537202 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 100000, + "map": 86.54606328368973 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 500000, + "map": 80.86137228234091 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 100000, + "map": 81.63864883662649 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 500000, + "map": 73.57986207906387 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 1000, + "map": 74.20444712972909 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 1000, + "map": 92.95105322830891 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 10000, + "map": 89.8709701490436 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 100000, + "map": 86.54014997282701 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 1000, + "map": 92.76308330349686 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 10000, + "map": 89.00403653862938 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 100000, + "map": 85.7342431384476 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 500000, + "map": 80.85588135082547 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 500000, + "map": 77.5426462347698 + } + ] +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/benchmarks/retriever_performance.json b/docs/v1.7.0/_src/benchmarks/retriever_performance.json new file mode 100644 index 0000000000..dbb9340481 --- /dev/null +++ 
b/docs/v1.7.0/_src/benchmarks/retriever_performance.json @@ -0,0 +1,88 @@ +{ + "chart_type": "BarChart", + "title": "Retriever Performance", + "subtitle": "Time and Accuracy Benchmarks", + "description": "Comparison of the speed and accuracy of different DocumentStore / Retriever combinations on 100k documents. Indexing speed (in docs/sec) refers to how quickly Documents can be inserted into a DocumentStore. Querying speed (in queries/sec) refers to the speed at which the system returns relevant Documents when presented with a query.\n\nThe dataset used is Wikipedia, split into 100 word passages (from here)). \n\nFor querying, we use the Natural Questions development set in combination with the wiki passages. The Document Store is populated with the 100 word passages in which the answer spans occur (i.e. gold passages) as well as a random selection of 100 word passages in which the answer spans do not occur (i.e. negative passages). We take a total of 100k gold and negative passages. Query and document embedding are generated by the \"facebook/dpr-question_encoder-single-nq-base\" and \"facebook/dpr-ctx_encoder-single-nq-base\" models. The retriever returns 10 candidates and both the recall and mAP scores are calculated on these 10.\n\nFor FAISS HNSW, we use n_links=128, efSearch=20 and efConstruction=80. We use a cosine similarity function with BM25 retrievers, and dot product with DPR. 
Both index and query benchmarks are performed on an AWS P3.2xlarge instance which is accelerated by an Nvidia V100 GPU.", + "bars": "horizontal", + "columns": [ + "Model", + "mAP", + "Index Speed (docs/sec)", + "Query Speed (queries/sec)" + ], + "series": { + "s0": "map", + "s1": "time", + "s2": "time" + }, + "axes": { + "label": "map", + "time_side": "top", + "time_label": "seconds" + }, + "data": [ + { + "model": "DPR / Elasticsearch", + "n_docs": 100000, + "index_speed": 71.36964873196698, + "query_speed": 5.192368815242574, + "map": 86.54606328368976 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 100000, + "index_speed": 485.5602670200369, + "query_speed": 103.0884393334727, + "map": 56.25299537353825 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 100000, + "index_speed": 119.52937722555107, + "query_speed": 6.385621466857457, + "map": 82.74686664920836 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "index_speed": 100.01184910084558, + "query_speed": 6.6270933964840415, + "map": 86.54606328368973 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 100000, + "index_speed": 89.90389306648805, + "query_speed": 39.7839528511866, + "map": 84.33419639513305 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 100000, + "index_speed": 116.00982709720004, + "query_speed": 28.57264344960955, + "map": 86.54606328368973 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 100000, + "index_speed": 115.61076852516383, + "query_speed": 38.80526238789059, + "map": 81.63864883662649 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 100000, + "index_speed": 70.05381128388427, + "query_speed": 15.306895223372484, + "map": 86.54014997282701 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 100000, + "index_speed": 70.31004397719536, + "query_speed": 24.95733865947408, + "map": 85.7342431384476 + } + ] +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/benchmarks/retriever_speed.json 
b/docs/v1.7.0/_src/benchmarks/retriever_speed.json new file mode 100644 index 0000000000..7877d2a358 --- /dev/null +++ b/docs/v1.7.0/_src/benchmarks/retriever_speed.json @@ -0,0 +1,204 @@ +{ + "chart_type": "LineChart", + "title": "Retriever Speed", + "subtitle": "Query Speed at different number of docs", + "description": "Here you can see how the query speed of different Retriever / DocumentStore combinations scale as the number of documents increases. The set up is the same as the above querying benchmark except that a varying number of negative documents are used to fill the document store.", + "columns": [ + "n_docs", + "BM25 / Elasticsearch", + "DPR / Elasticsearch", + "DPR / FAISS (flat)", + "DPR / FAISS (HNSW)", + "DPR / Milvus (flat)", + "DPR / Milvus (HNSW)", + "Sentence Transformers / Elasticsearch" + ], + "axis": [ + { + "x": "Number of docs", + "y": "Queries/sec" + } + ], + "data": [ + { + "model": "DPR / Elasticsearch", + "n_docs": 1000, + "query_speed": 34.22768858415144 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 10000, + "query_speed": 22.197089725786853 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 10000, + "query_speed": 127.11481826852273 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 1000, + "query_speed": 47.51341215808855 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 10000, + "query_speed": 29.74515869340777 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 1000, + "query_speed": 42.49634272581313 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 10000, + "query_speed": 27.684040507849826 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 1000, + "query_speed": 43.36685860983961 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 10000, + "query_speed": 41.819147130090286 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 1000, + "query_speed": 41.12204778755844 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 10000, + "query_speed": 
37.86882443918513 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 1000, + "query_speed": 41.14803671045185 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 10000, + "query_speed": 40.072871546542935 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 100000, + "query_speed": 5.192368815242574 + }, + { + "model": "DPR / Elasticsearch", + "n_docs": 500000, + "query_speed": 1.0337466563959614 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 100000, + "query_speed": 103.0884393334727 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 500000, + "query_speed": 78.95037031647355 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 100000, + "query_speed": 6.385621466857457 + }, + { + "model": "Sentence Transformers / Elasticsearch", + "n_docs": 500000, + "query_speed": 1.4175454254854258 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 100000, + "query_speed": 6.6270933964840415 + }, + { + "model": "DPR / FAISS (flat)", + "n_docs": 500000, + "query_speed": 1.5394964631878052 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 100000, + "query_speed": 39.7839528511866 + }, + { + "model": "DPR / FAISS (HNSW)", + "n_docs": 500000, + "query_speed": 39.84177061191119 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 100000, + "query_speed": 28.57264344960955 + }, + { + "model": "DPR / Milvus (flat)", + "n_docs": 500000, + "query_speed": 15.645867393099733 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 100000, + "query_speed": 38.80526238789059 + }, + { + "model": "DPR / Milvus (HNSW)", + "n_docs": 500000, + "query_speed": 37.15717318924075 + }, + { + "model": "BM25 / Elasticsearch", + "n_docs": 1000, + "query_speed": 282.95914917837337 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 1000, + "query_speed": 29.061163356184426 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 10000, + "query_speed": 24.834414667596725 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 100000, + 
"query_speed": 15.306895223372484 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 1000, + "query_speed": 29.10621389658101 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 10000, + "query_speed": 26.92417300437131 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 100000, + "query_speed": 24.95733865947408 + }, + { + "model": "DPR / OpenSearch (flat)", + "n_docs": 500000, + "query_speed": 11.33271222977541 + }, + { + "model": "DPR / OpenSearch (HNSW)", + "n_docs": 500000, + "query_speed": 24.13921492357397 + } + ] +} \ No newline at end of file diff --git a/docs/v1.7.0/_src/tutorials/Makefile b/docs/v1.7.0/_src/tutorials/Makefile new file mode 100644 index 0000000000..d4bb2cbb9e --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/v1.7.0/_src/tutorials/conf.py b/docs/v1.7.0/_src/tutorials/conf.py new file mode 100644 index 0000000000..4511b84159 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/conf.py @@ -0,0 +1,51 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. 
For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = "Tutorials" +copyright = "2020, deepset" +author = "deepset" + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = ["IPython.sphinxext.ipython_console_highlighting"] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ["_templates"] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = "alabaster" + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". 
+html_static_path = ["_static"] diff --git a/docs/v1.7.0/_src/tutorials/index.rst b/docs/v1.7.0/_src/tutorials/index.rst new file mode 100644 index 0000000000..4351a5f784 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/index.rst @@ -0,0 +1,13 @@ +Tutorials +==================================== + +.. toctree:: + :maxdepth: 4 + :caption: Contents: + + 1) Using Haystack to search through your own documents + 2) Make Haystack understand your jargon + 3) Connect Haystack to your Datastore of choice + 4) Answer incoming questions using FAQ pages + 5) Benchmark the different components of Haystack + 6) SoTA: Powerup Haystack with DPR diff --git a/docs/v1.7.0/_src/tutorials/make.bat b/docs/v1.7.0/_src/tutorials/make.bat new file mode 100644 index 0000000000..2119f51099 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/v1.7.0/_src/tutorials/tutorials/1.md b/docs/v1.7.0/_src/tutorials/tutorials/1.md new file mode 100644 index 0000000000..e792b2cb1e --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/1.md @@ -0,0 +1,301 @@ + + +# Build Your First QA System + + + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb) + +Question Answering can be used in a variety of use cases. A very common one: Using it to navigate through complex knowledge bases or long documents ("search setting"). + +A "knowledge base" could for example be your website, an internal wiki or a collection of financial reports. +In this tutorial we will work on a slightly different domain: "Game of Thrones". + +Let's see how we can use a bunch of Wikipedia articles to answer a variety of questions about the +marvellous seven kingdoms. + + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. 
+Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + + +```python +from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers +from haystack.nodes import FARMReader, TransformersReader +``` + +## Document Store + +Haystack finds answers to queries within the documents stored in a `DocumentStore`. The current implementations of `DocumentStore` include `ElasticsearchDocumentStore`, `FAISSDocumentStore`, `SQLDocumentStore`, and `InMemoryDocumentStore`. + +**Here:** We recommend Elasticsearch as it comes preloaded with features like [full-text queries](https://www.elastic.co/guide/en/elasticsearch/reference/current/full-text-queries.html), [BM25 retrieval](https://www.elastic.co/elasticon/conf/2016/sf/improved-text-scoring-with-bm25), and [vector storage for text embeddings](https://www.elastic.co/guide/en/elasticsearch/reference/7.6/dense-vector.html). + +**Alternatives:** If you are unable to set up an Elasticsearch instance, then follow the [Tutorial 3](https://github.com/deepset-ai/haystack/blob/master/tutorials/Tutorial3_Basic_QA_Pipeline_without_Elasticsearch.ipynb) for using SQL/InMemory document stores. + +**Hint**: This tutorial creates a new document store instance with Wikipedia articles on Game of Thrones. However, you can configure Haystack to work with your existing document stores. + +### Start an Elasticsearch server +You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in your environment (e.g. in Colab notebooks), then you can manually download and execute Elasticsearch from source.
+ + +```python +# Recommended: Start Elasticsearch using Docker via the Haystack utility function +from haystack.utils import launch_es + +launch_es() +``` + + +```python +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! sleep 30 +``` + + +```python +# Connect to Elasticsearch + +from haystack.document_stores import ElasticsearchDocumentStore + +document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document") +``` + +## Preprocessing of documents + +Haystack provides a customizable pipeline for: + - converting files into texts + - cleaning texts + - splitting texts + - writing them to a Document Store + +In this tutorial, we download Wikipedia articles about Game of Thrones, apply a basic cleaning function, and index them in Elasticsearch. + + +```python +# Let's first fetch some documents that we want to query +# Here: 517 Wikipedia articles for Game of Thrones +doc_dir = "data/tutorial1" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# Convert files to dicts +# You can optionally supply a cleaning function that is applied to each doc (e.g. to remove footers) +# It must take a str as input, and return a str. +docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) + +# We now have a list of dictionaries that we can write to our document store. +# If your texts come from a different source (e.g. 
a DB), you can of course skip convert_files_to_docs() and create the dictionaries yourself. +# The default format here is: +# { +# 'content': "", +# 'meta': {'name': "", ...} +# } +# (Optionally: you can also add more key-value-pairs here, that will be indexed as fields in Elasticsearch and +# can be accessed later for filtering or shown in the responses of the Pipeline) + +# Let's have a look at the first 3 entries: +print(docs[:3]) + +# Now, let's write the dicts containing documents to our DB. +document_store.write_documents(docs) +``` + +## Initialize Retriever, Reader & Pipeline + +### Retriever + +Retrievers help narrow down the scope for the Reader to smaller units of text where a given question could be answered. +They use simple but fast algorithms. + +**Here:** We use Elasticsearch's default BM25 algorithm + +**Alternatives:** + +- Customize the `BM25Retriever` with custom queries (e.g. boosting) and filters +- Use `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging +- Use `EmbeddingRetriever` to find candidate documents based on the similarity of embeddings (e.g. created via Sentence-BERT) +- Use `DensePassageRetriever` to use different embedding models for passage and query (see Tutorial 6) + + +```python +from haystack.nodes import BM25Retriever + +retriever = BM25Retriever(document_store=document_store) +``` + + +```python +# Alternative: An in-memory TfidfRetriever based on Pandas dataframes for building quick-prototypes with SQLite document store. + +# from haystack.nodes import TfidfRetriever +# retriever = TfidfRetriever(document_store=document_store) +``` + +### Reader + +A Reader scans the texts returned by retrievers in detail and extracts the k best answers. They are based +on powerful, but slower deep learning models. + +Haystack currently supports Readers based on the frameworks FARM and Transformers.
+With both you can either load a local model or one from Hugging Face's model hub (https://huggingface.co/models). + +**Here:** a medium sized RoBERTa QA model using a Reader based on FARM (https://huggingface.co/deepset/roberta-base-squad2) + +**Alternatives (Reader):** TransformersReader (leveraging the `pipeline` of the Transformers package) + +**Alternatives (Models):** e.g. "distilbert-base-uncased-distilled-squad" (fast) or "deepset/bert-large-uncased-whole-word-masking-squad2" (good accuracy) + +**Hint:** You can adjust the model to return "no answer possible" with the no_ans_boost. Higher values mean the model prefers "no answer possible" + +#### FARMReader + + +```python +# Load a local model or any of the QA models on +# Hugging Face's model hub (https://huggingface.co/models) + +reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) +``` + +#### TransformersReader + + +```python +# Alternative: +# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1) +``` + +### Pipeline + +With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. +Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. +To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions. +You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd). + + +```python +from haystack.pipelines import ExtractiveQAPipeline + +pipe = ExtractiveQAPipeline(reader, retriever) +``` + +## Voilà! Ask a question! + + +```python +# You can configure how many candidates the Reader and Retriever shall return +# The higher top_k_retriever, the better (but also the slower) your answers. 
+prediction = pipe.run( + query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +) +``` + + +```python +# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}}) +# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}}) +``` + + +```python +# Now you can either print the object directly... +from pprint import pprint + +pprint(prediction) + +# Sample output: +# { +# 'answers': [ , +# , +# ... +# ] +# 'documents': [ , +# , +# ... +# ], +# 'no_ans_gap': 11.688868522644043, +# 'node_id': 'Reader', +# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}}, +# 'query': 'Who is the father of Arya Stark?', +# 'root_node': 'Query' +# } +``` + + +```python +# ...or use a util to simplify the output +# Change `minimum` to `medium` or `all` to raise the level of detail +print_answers(prediction, details="minimum") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) + diff --git a/docs/v1.7.0/_src/tutorials/tutorials/10.md b/docs/v1.7.0/_src/tutorials/tutorials/10.md new file mode 100644 index 0000000000..98d5fcb8b7 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/10.md @@ -0,0 +1,212 @@ + + +# Question Answering on a Knowledge Graph + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial10_Knowledge_Graph.ipynb) + +Haystack allows storing and querying knowledge graphs with the help of pre-trained models that translate text queries to SPARQL queries. +This tutorial demonstrates how to load an existing knowledge graph into haystack, load a pre-trained retriever, and execute text queries on the knowledge graph. +The training of models that translate text queries into SPARQL queries is currently not supported. + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,inmemorygraph] +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. 
+Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + + +```python +# Here are some imports that we'll need + +import subprocess +import time +from pathlib import Path + +from haystack.nodes import Text2SparqlRetriever +from haystack.document_stores import InMemoryKnowledgeGraph +from haystack.utils import fetch_archive_from_http +``` + +## Downloading Knowledge Graph and Model + + +```python +# Let's first fetch some triples that we want to store in our knowledge graph +# Here: exemplary triples from the wizarding world +graph_dir = "data/tutorial10" +s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/triples_and_config.zip" +fetch_archive_from_http(url=s3_url, output_dir=graph_dir) + +# Fetch a pre-trained BART model that translates text queries to SPARQL queries +model_dir = "../saved_models/tutorial10_knowledge_graph/" +s3_url = "https://fandom-qa.s3-eu-west-1.amazonaws.com/saved_models/hp_v3.4.zip" +fetch_archive_from_http(url=s3_url, output_dir=model_dir) +``` + +## Initialize a knowledge graph and load data + +Currently, Haystack supports two alternative implementations for knowledge graphs: +* simple InMemoryKnowledgeGraph (based on RDFLib in-memory store) +* GraphDBKnowledgeGraph, which runs on GraphDB. 
+ +### InMemoryKnowledgeGraph + + +```python +# Initialize a in memory knowledge graph and use "tutorial_10_index" as the name of the index +kg = InMemoryKnowledgeGraph(index="tutorial_10_index") + +# Delete the index as it might have been already created in previous runs +kg.delete_index() + +# Create the index +kg.create_index() + +# Import triples of subject, predicate, and object statements from a ttl file +kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir) / "triples.ttl") +print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}") +print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.") +``` + +### GraphDBKnowledgeGraph (alternative) + +#### Launching a GraphDB instance + + +```python +# # Unfortunately, there seems to be no good way to run GraphDB in colab environments +# # In your local environment, you could start a GraphDB server with docker +# # Feel free to check GraphDB's website for the free version https://www.ontotext.com/products/graphdb/graphdb-free/ +# import os + +# LAUNCH_GRAPHDB = os.environ.get("LAUNCH_GRAPHDB", False) + +# if LAUNCH_GRAPHDB: +# print("Starting GraphDB ...") +# status = subprocess.run( +# [ +# "docker run -d -p 7200:7200 --name graphdb-instance-tutorial docker-registry.ontotext.com/graphdb-free:9.4.1-adoptopenjdk11" +# ], +# shell=True, +# ) +# if status.returncode: +# raise Exception( +# "Failed to launch GraphDB. Maybe it is already running or you already have a container with that name that you could start?" 
+# ) +# time.sleep(5) +``` + +#### Creating a new GraphDB repository (also known as index in haystack's document stores) + + +```python +# from haystack.document_stores import GraphDBKnowledgeGraph + +# # Initialize a knowledge graph connected to GraphDB and use "tutorial_10_index" as the name of the index +# kg = GraphDBKnowledgeGraph(index="tutorial_10_index") + +# # Delete the index as it might have been already created in previous runs +# kg.delete_index() + +# # Create the index based on a configuration file +# kg.create_index(config_path=Path(graph_dir) / "repo-config.ttl") + +# # Import triples of subject, predicate, and object statements from a ttl file +# kg.import_from_ttl_file(index="tutorial_10_index", path=Path(graph_dir) / "triples.ttl") +# print(f"The last triple stored in the knowledge graph is: {kg.get_all_triples()[-1]}") +# print(f"There are {len(kg.get_all_triples())} triples stored in the knowledge graph.") +``` + + +```python +# # Define prefixes for names of resources so that we can use shorter resource names in queries +# prefixes = """PREFIX rdf: +# PREFIX xsd: +# PREFIX hp: +# """ +# kg.prefixes = prefixes +``` + +## Load the pre-trained retriever + + +```python +# Load a pre-trained model that translates text queries to SPARQL queries +kgqa_retriever = Text2SparqlRetriever(knowledge_graph=kg, model_name_or_path=Path(model_dir) / "hp_v3.4") +``` + +## Query Execution + +We can now ask questions that will be answered by our knowledge graph! +One limitation though: our pre-trained model can only generate questions about resources it has seen during training. +Otherwise, it cannot translate the name of the resource to the identifier used in the knowledge graph. +E.g. "Harry" -> "hp:Harry_potter" + + +```python +query = "In which house is Harry Potter?" 
+print(f'Translating the text query "{query}" to a SPARQL query and executing it on the knowledge graph...') +result = kgqa_retriever.retrieve(query=query) +print(result) +# Correct SPARQL query: select ?a { hp:Harry_potter hp:house ?a . } +# Correct answer: Gryffindor + +print("Executing a SPARQL query with prefixed names of resources...") +result = kgqa_retriever._query_kg( + sparql_query="select distinct ?sbj where { ?sbj hp:job hp:Keeper_of_keys_and_grounds . }" +) +print(result) +# Paraphrased question: Who is the keeper of keys and grounds? +# Correct answer: Rubeus Hagrid + +print("Executing a SPARQL query with full names of resources...") +result = kgqa_retriever._query_kg( + sparql_query="select distinct ?obj where { ?obj . }" +) +print(result) +# Paraphrased question: What is the patronus of Hermione? +# Correct answer: Otter +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.7.0/_src/tutorials/tutorials/11.md b/docs/v1.7.0/_src/tutorials/tutorials/11.md new file mode 100644 index 0000000000..5a75af1bc6 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/11.md @@ -0,0 +1,446 @@ + + +# Pipelines Tutorial + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial11_Pipelines.ipynb) + +In this tutorial, you will learn how the `Pipeline` class acts as a connector between all the different +building blocks that are found in FARM. Whether you are using a Reader, Generator, Summarizer +or Retriever (or 2), the `Pipeline` class will help you build a Directed Acyclic Graph (DAG) that +determines how to route the output of one component into the input of another. + + + + +## Setting Up the Environment + +Let's start by ensuring we have a GPU running to ensure decent speed in this tutorial. +In Google colab, you can change to a GPU runtime in the menu: +- **Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + +These lines are to install Haystack through pip + + +```python +# Install the latest release of Haystack in your own environment +#! 
pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] + +# Install pygraphviz +!apt install libgraphviz-dev +!pip install pygraphviz +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. +Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + +If running from Colab or a no Docker environment, you will want to start Elasticsearch from source + + +```python +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! 
sleep 30 +``` + +## Initialization + +Then let's fetch some data (in this case, pages from the Game of Thrones wiki) and prepare it so that it can +be used indexed into our `DocumentStore` + + +```python +from haystack.utils import ( + print_answers, + print_documents, + fetch_archive_from_http, + convert_files_to_docs, + clean_wiki_text, +) + +# Download and prepare data - 517 Wikipedia articles for Game of Thrones +doc_dir = "data/tutorial11" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt11.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# convert files to dicts containing documents that can be indexed to our datastore +got_docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) +``` + +Here we initialize the core components that we will be gluing together using the `Pipeline` class. +We have a `DocumentStore`, an `BM25Retriever` and a `FARMReader`. +These can be combined to create a classic Retriever-Reader pipeline that is designed +to perform Open Domain Question Answering. 
+ + +```python +from haystack import Pipeline +from haystack.utils import launch_es +from haystack.document_stores import ElasticsearchDocumentStore +from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader + + +# Initialize DocumentStore and index documents +launch_es() +document_store = ElasticsearchDocumentStore() +document_store.delete_documents() +document_store.write_documents(got_docs) + +# Initialize Sparse retriever +bm25_retriever = BM25Retriever(document_store=document_store) + +# Initialize dense retriever +embedding_retriever = EmbeddingRetriever( + document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1" +) +document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False) + +# Initialize reader +reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2") +``` + +## Prebuilt Pipelines + +Haystack features many prebuilt pipelines that cover common tasks. +Here we have an `ExtractiveQAPipeline` (the successor to the now deprecated `Finder` class). 
+ + +```python +from haystack.pipelines import ExtractiveQAPipeline + +# Prebuilt pipeline +p_extractive_premade = ExtractiveQAPipeline(reader=reader, retriever=bm25_retriever) +res = p_extractive_premade.run( + query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +) +print_answers(res, details="minimum") +``` + +If you want to just do the retrieval step, you can use a `DocumentSearchPipeline` + + +```python +from haystack.pipelines import DocumentSearchPipeline + +p_retrieval = DocumentSearchPipeline(bm25_retriever) +res = p_retrieval.run(query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}}) +print_documents(res, max_text_len=200) +``` + +Or if you want to use a `Generator` instead of a `Reader`, +you can initialize a `GenerativeQAPipeline` like this: + + +```python +from haystack.pipelines import GenerativeQAPipeline, FAQPipeline +from haystack.nodes import RAGenerator + +# We set this to True so that the document store returns document embeddings with each document +# This is needed by the Generator +document_store.return_embedding = True + +# Initialize generator +rag_generator = RAGenerator() + +# Generative QA +p_generator = GenerativeQAPipeline(generator=rag_generator, retriever=embedding_retriever) +res = p_generator.run(query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}}) +print_answers(res, details="minimum") + +# We are setting this to False so that in later pipelines, +# we get a cleaner printout +document_store.return_embedding = False +``` + +Haystack features prebuilt pipelines to do: +- just document search (DocumentSearchPipeline), +- document search with summarization (SearchSummarizationPipeline) +- generative QA (GenerativeQAPipeline) +- FAQ style QA (FAQPipeline) +- translated search (TranslationWrapperPipeline) +To find out more about these pipelines, have a look at our [documentation](https://haystack.deepset.ai/docs/latest/pipelinesmd) + + 
+With any Pipeline, whether prebuilt or custom constructed, +you can save a diagram showing how all the components are connected. + +![image](https://github.com/deepset-ai/haystack/blob/master/docs/img/retriever-reader-pipeline.png?raw=true) + + +```python +p_extractive_premade.draw("pipeline_extractive_premade.png") +p_retrieval.draw("pipeline_retrieval.png") +p_generator.draw("pipeline_generator.png") +``` + +## Custom Pipelines + +Now we are going to rebuild the `ExtractiveQAPipelines` using the generic Pipeline class. +We do this by adding the building blocks that we initialized as nodes in the graph. + + +```python +# Custom built extractive QA pipeline +p_extractive = Pipeline() +p_extractive.add_node(component=bm25_retriever, name="Retriever", inputs=["Query"]) +p_extractive.add_node(component=reader, name="Reader", inputs=["Retriever"]) + +# Now we can run it +res = p_extractive.run( + query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +) +print_answers(res, details="minimum") +p_extractive.draw("pipeline_extractive.png") +``` + +Pipelines offer a very simple way to ensemble together different components. +In this example, we are going to combine the power of an `EmbeddingRetriever` +with the keyword based `BM25Retriever`. +See our [documentation](https://haystack.deepset.ai/docs/latest/retrievermd) to understand why +we might want to combine a dense and sparse retriever. + +![image](https://github.com/deepset-ai/haystack/blob/master/docs/img/tutorial11_custompipelines_pipeline_ensemble.png?raw=true) + +Here we use a `JoinDocuments` node so that the predictions from each retriever can be merged together. 
+
+
+```python
+from haystack.nodes import JoinDocuments
+
+# Create ensembled pipeline
+p_ensemble = Pipeline()
+p_ensemble.add_node(component=bm25_retriever, name="ESRetriever", inputs=["Query"])
+p_ensemble.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"])
+p_ensemble.add_node(
+    component=JoinDocuments(join_mode="concatenate"), name="JoinResults", inputs=["ESRetriever", "EmbeddingRetriever"]
+)
+p_ensemble.add_node(component=reader, name="Reader", inputs=["JoinResults"])
+p_ensemble.draw("pipeline_ensemble.png")
+
+# Run pipeline
+res = p_ensemble.run(
+    query="Who is the father of Arya Stark?", params={"EmbeddingRetriever": {"top_k": 5}, "ESRetriever": {"top_k": 5}}
+)
+print_answers(res, details="minimum")
+```
+
+## Custom Nodes
+
+Nodes are relatively simple objects
+and we encourage our users to design their own if they don't see one that fits their use case
+
+The only requirements are:
+- Create a class that inherits `BaseComponent`.
+- Add a method run() to your class. Add the mandatory and optional arguments it needs to process. These arguments must be passed as input to the pipeline, inside `params`, or output by preceding nodes.
+- Add processing logic inside the run() (e.g. reformatting the query).
+- Return a tuple that contains your output data (for the next node)
+and the name of the outgoing edge (by default "output_1" for nodes that have one output)
+- Add a class attribute outgoing_edges = 1 that defines the number of output options from your node. You only need a higher number here if you have a decision node (see below).
+ +Here we have a template for a Node: + + +```python +from haystack import BaseComponent +from typing import Optional, List + + +class CustomNode(BaseComponent): + outgoing_edges = 1 + + def run(self, query: str, my_optional_param: Optional[int]): + # process the inputs + output = {"my_output": ...} + return output, "output_1" + + def run_batch(self, queries: List[str], my_optional_param: Optional[int]): + # process the inputs + output = {"my_output": ...} + return output, "output_1" +``` + +## Decision Nodes + +Decision Nodes help you route your data so that only certain branches of your `Pipeline` are run. +One popular use case for such query classifiers is routing keyword queries to Elasticsearch and questions to EmbeddingRetriever + Reader. +With this approach you keep optimal speed and simplicity for keywords while going deep with transformers when it's most helpful. + +![image](https://github.com/deepset-ai/haystack/blob/master/docs/img/tutorial11_decision_nodes_pipeline_classifier.png?raw=true) + +Though this looks very similar to the ensembled pipeline shown above, +the key difference is that only one of the retrievers is run for each request. +By contrast both retrievers are always run in the ensembled approach. + +Below, we define a very naive `QueryClassifier` and show how to use it: + + +```python +class CustomQueryClassifier(BaseComponent): + outgoing_edges = 2 + + def run(self, query: str): + if "?" in query: + return {}, "output_2" + else: + return {}, "output_1" + + def run_batch(self, queries: List[str]): + split = {"output_1": {"queries": []}, "output_2": {"queries": []}} + for query in queries: + if "?" 
in query: + split["output_2"]["queries"].append(query) + else: + split["output_1"]["queries"].append(query) + + return split, "split" + + +# Here we build the pipeline +p_classifier = Pipeline() +p_classifier.add_node(component=CustomQueryClassifier(), name="QueryClassifier", inputs=["Query"]) +p_classifier.add_node(component=bm25_retriever, name="ESRetriever", inputs=["QueryClassifier.output_1"]) +p_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_2"]) +p_classifier.add_node(component=reader, name="QAReader", inputs=["ESRetriever", "EmbeddingRetriever"]) +p_classifier.draw("pipeline_classifier.png") + +# Run only the dense retriever on the full sentence query +res_1 = p_classifier.run(query="Who is the father of Arya Stark?") +print("Embedding Retriever Results" + "\n" + "=" * 15) +print_answers(res_1) + +# Run only the sparse retriever on a keyword based query +res_2 = p_classifier.run(query="Arya Stark father") +print("ES Results" + "\n" + "=" * 15) +print_answers(res_2) +``` + +## Evaluation Nodes + +We have also designed a set of nodes that can be used to evaluate the performance of a system. +Have a look at our [tutorial](https://haystack.deepset.ai/docs/latest/tutorial5md) to get hands on with the code and learn more about Evaluation Nodes! + +## Debugging Pipelines + +You can print out debug information from nodes in your pipelines in a few different ways. + + +```python +# 1) You can set the `debug` attribute of a given node. 
+bm25_retriever.debug = True
+
+# 2) You can provide `debug` as a parameter when running your pipeline
+result = p_classifier.run(query="Who is the father of Arya Stark?", params={"ESRetriever": {"debug": True}})
+
+# 3) You can provide the `debug` parameter to all nodes in your pipeline
+result = p_classifier.run(query="Who is the father of Arya Stark?", params={"debug": True})
+
+result["_debug"]
+```
+
+## YAML Configs
+
+A full `Pipeline` can be defined in a YAML file and simply loaded.
+Having your pipeline available in a YAML is particularly useful
+when you move between experimentation and production environments.
+Just export the YAML from your notebook / IDE and import it into your production environment.
+It also helps with version control of pipelines,
+allows you to share your pipeline easily with colleagues,
+and simplifies the configuration of pipeline parameters in production.
+
+It consists of two main sections: you define all objects (e.g. a reader) in components
+and then stick them together to a pipeline in pipelines.
+You can also set one component to be multiple nodes of a pipeline or to be a node across multiple pipelines.
+It will be loaded just once in memory and therefore doesn't hurt your resources more than actually needed.
+
+The contents of a YAML file should look something like this:
+
+```yaml
+version: '0.7'
+components: # define all the building-blocks for Pipeline
+- name: MyReader # custom-name for the component; helpful for visualization & debugging
+  type: FARMReader # Haystack Class name for the component
+  params:
+    no_ans_boost: -10
+    model_name_or_path: deepset/roberta-base-squad2
+- name: MyESRetriever
+  type: BM25Retriever
+  params:
+    document_store: MyDocumentStore # params can reference other components defined in the YAML
+    custom_query: null
+- name: MyDocumentStore
+  type: ElasticsearchDocumentStore
+  params:
+    index: haystack_test
+pipelines: # multiple Pipelines can be defined using the components from above
+- name: my_query_pipeline # a simple extractive-qa Pipeline
+  nodes:
+  - name: MyESRetriever
+    inputs: [Query]
+  - name: MyReader
+    inputs: [MyESRetriever]
+```
+
+To load, simply call:
+``` python
+yaml_pipeline = Pipeline.load_from_yaml(Path("sample.yaml"))
+```
+
+## Conclusion
+
+The possibilities are endless with the `Pipeline` class and we hope that this tutorial will inspire you
+to build custom pipelines that really work for your use case!
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+
+Some of our other work:
+- [German BERT](https://deepset.ai/german-bert)
+- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad)
+- [FARM](https://github.com/deepset-ai/FARM)
+
+Get in touch:
+[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai)
+
+By the way: [we're hiring!](https://www.deepset.ai/jobs)
diff --git a/docs/v1.7.0/_src/tutorials/tutorials/12.md b/docs/v1.7.0/_src/tutorials/tutorials/12.md
new file mode 100644
index 0000000000..5692a4caa4
--- /dev/null
+++ b/docs/v1.7.0/_src/tutorials/tutorials/12.md
@@ -0,0 +1,185 @@
+
+
+# Long-Form Question Answering
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial12_LFQA.ipynb)
+
+Follow this tutorial to learn how to build and use a pipeline for Long-Form Question Answering (LFQA). LFQA is a variety of the generative question answering task. LFQA systems query large document stores for relevant information and then use this information to generate accurate, multi-sentence answers. In a regular question answering system, the retrieved documents related to the query (context passages) act as source tokens for extracted answers. In an LFQA system, context passages provide the context the system uses to generate original, abstractive, long-form answers.
+
+### Prepare environment
+
+#### Colab: Enable the GPU runtime
+Make sure you enable the GPU runtime to experience decent speed in this tutorial.
+**Runtime -> Change Runtime type -> Hardware accelerator -> GPU**
+
+
+
+
+```python
+# Make sure you have a GPU running
+!nvidia-smi
+```
+
+
+```python
+# Install the latest release of Haystack in your own environment
+#!
 pip install farm-haystack
+
+# Install the latest master of Haystack
+!pip install --upgrade pip
+!pip install -q git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss]
+```
+
+## Logging
+
+We configure how logging messages should be displayed and which log level should be used before importing Haystack.
+Example log message:
+INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt
+Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily:
+
+
+```python
+import logging
+
+logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING)
+logging.getLogger("haystack").setLevel(logging.INFO)
+```
+
+
+```python
+from haystack.utils import convert_files_to_docs, fetch_archive_from_http, clean_wiki_text
+from haystack.nodes import Seq2SeqGenerator
+```
+
+### Document Store
+
+FAISS is a library for efficient similarity search on a cluster of dense vectors.
+The `FAISSDocumentStore` uses a SQL (SQLite in-memory by default) database under-the-hood
+to store the document text and other meta data. The vector embeddings of the text are
+indexed on a FAISS Index that later is queried for searching answers.
+The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for
+faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor.
+For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index + + +```python +from haystack.document_stores import FAISSDocumentStore + +document_store = FAISSDocumentStore(embedding_dim=128, faiss_index_factory_str="Flat") +``` + +### Cleaning & indexing documents + +Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore + + +```python +# Let's first get some files that we want to use +doc_dir = "data/tutorial12" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt12.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# Convert files to dicts +docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) + +# Now, let's write the dicts containing documents to our DB. +document_store.write_documents(docs) +``` + +### Initialize Retriever and Reader/Generator + +#### Retriever + +We use a `DensePassageRetriever` and we invoke `update_embeddings` to index the embeddings of documents in the `FAISSDocumentStore` + + + + +```python +from haystack.nodes import DensePassageRetriever + +retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model="vblagoje/dpr-question_encoder-single-lfqa-wiki", + passage_embedding_model="vblagoje/dpr-ctx_encoder-single-lfqa-wiki", +) + +document_store.update_embeddings(retriever) +``` + +Before we blindly use the `DensePassageRetriever` let's empirically test it to make sure a simple search indeed finds the relevant documents. 
+
+
+```python
+from haystack.utils import print_documents
+from haystack.pipelines import DocumentSearchPipeline
+
+p_retrieval = DocumentSearchPipeline(retriever)
+res = p_retrieval.run(query="Tell me something about Arya Stark?", params={"Retriever": {"top_k": 10}})
+print_documents(res, max_text_len=512)
+```
+
+#### Reader/Generator
+
+Similar to previous Tutorials we now initialize our reader/generator.
+
+Here we use a `Seq2SeqGenerator` with the *vblagoje/bart_lfqa* model (see: https://huggingface.co/vblagoje/bart_lfqa)
+
+
+
+
+```python
+generator = Seq2SeqGenerator(model_name_or_path="vblagoje/bart_lfqa")
+```
+
+### Pipeline
+
+With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline.
+Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases.
+To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `GenerativeQAPipeline` that combines a retriever and a reader/generator to answer our questions.
+You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd).
+
+
+```python
+from haystack.pipelines import GenerativeQAPipeline
+
+pipe = GenerativeQAPipeline(generator, retriever)
+```
+
+## Voilà! Ask a question!
+
+
+```python
+pipe.run(
+    query="How did Arya Stark's character get portrayed in a television adaptation?", params={"Retriever": {"top_k": 3}}
+)
+```
+
+
+```python
+pipe.run(query="Why is Arya Stark an unusual character?", params={"Retriever": {"top_k": 3}})
+```
+
+## About us
+
+This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany
+
+We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems.
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.7.0/_src/tutorials/tutorials/13.md b/docs/v1.7.0/_src/tutorials/tutorials/13.md new file mode 100644 index 0000000000..42c39c60d6 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/13.md @@ -0,0 +1,202 @@ + + +# Question Generation + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial13_Question_generation.ipynb) + +This is a bare bones tutorial showing what is possible with the QuestionGenerator Nodes and Pipelines which automatically +generate questions which the question generation model thinks can be answered by a given document. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. 
+Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + + +```python +# Imports needed to run this notebook + +from pprint import pprint +from tqdm import tqdm +from haystack.nodes import QuestionGenerator, BM25Retriever, FARMReader +from haystack.document_stores import ElasticsearchDocumentStore +from haystack.pipelines import ( + QuestionGenerationPipeline, + RetrieverQuestionGenerationPipeline, + QuestionAnswerGenerationPipeline, +) +from haystack.utils import launch_es, print_questions +``` + +Let's start an Elasticsearch instance with one of the options below: + + +```python +# Option 1: Start Elasticsearch service via Docker +launch_es() +``` + + +```python +# Option 2: In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! sleep 30 +``` + +Let's initialize some core components + + +```python +text1 = "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum and first released in 1991, Python's design philosophy emphasizes code readability with its notable use of significant whitespace." +text2 = "Princess Arya Stark is the third child and second daughter of Lord Eddard Stark and his wife, Lady Catelyn Stark. 
She is the sister of the incumbent Westerosi monarchs, Sansa, Queen in the North, and Brandon, King of the Andals and the First Men. After narrowly escaping the persecution of House Stark by House Lannister, Arya is trained as a Faceless Man at the House of Black and White in Braavos, using her abilities to avenge her family. Upon her return to Westeros, she exacts retribution for the Red Wedding by exterminating the Frey male line." +text3 = "Dry Cleaning are an English post-punk band who formed in South London in 2018.[3] The band is composed of vocalist Florence Shaw, guitarist Tom Dowse, bassist Lewis Maynard and drummer Nick Buxton. They are noted for their use of spoken word primarily in lieu of sung vocals, as well as their unconventional lyrics. Their musical stylings have been compared to Wire, Magazine and Joy Division.[4] The band released their debut single, 'Magic of Meghan' in 2019. Shaw wrote the song after going through a break-up and moving out of her former partner's apartment the same day that Meghan Markle and Prince Harry announced they were engaged.[5] This was followed by the release of two EPs that year: Sweet Princess in August and Boundary Road Snacks and Drinks in October. The band were included as part of the NME 100 of 2020,[6] as well as DIY magazine's Class of 2020.[7] The band signed to 4AD in late 2020 and shared a new single, 'Scratchcard Lanyard'.[8] In February 2021, the band shared details of their debut studio album, New Long Leg. 
 They also shared the single 'Strong Feelings'.[9] The album, which was produced by John Parish, was released on 2 April 2021.[10]"
+
+docs = [{"content": text1}, {"content": text2}, {"content": text3}]
+
+# Initialize document store and write in the documents
+document_store = ElasticsearchDocumentStore()
+document_store.write_documents(docs)
+
+# Initialize Question Generator
+question_generator = QuestionGenerator()
+```
+
+## Question Generation Pipeline
+
+The most basic version of a question generator pipeline takes a document as input and outputs generated questions
+which the document can answer.
+
+
+```python
+question_generation_pipeline = QuestionGenerationPipeline(question_generator)
+for idx, document in enumerate(document_store):
+
+    print(f"\n * Generating questions for document {idx}: {document.content[:100]}...\n")
+    result = question_generation_pipeline.run(documents=[document])
+    print_questions(result)
+```
+
+## Retriever Question Generation Pipeline
+
+This pipeline takes a query as input. It retrieves relevant documents and then generates questions based on these.
+ + +```python +retriever = BM25Retriever(document_store=document_store) +rqg_pipeline = RetrieverQuestionGenerationPipeline(retriever, question_generator) + +print(f"\n * Generating questions for documents matching the query 'Arya Stark'\n") +result = rqg_pipeline.run(query="Arya Stark") +print_questions(result) +``` + +## Question Answer Generation Pipeline + +This pipeline takes a document as input, generates questions on it, and attempts to answer these questions using +a Reader model + + +```python +reader = FARMReader("deepset/roberta-base-squad2") +qag_pipeline = QuestionAnswerGenerationPipeline(question_generator, reader) +for idx, document in enumerate(tqdm(document_store)): + + print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n") + result = qag_pipeline.run(documents=[document]) + print_questions(result) +``` + +## Translated Question Answer Generation Pipeline +Trained models for Question Answer Generation are not available in many languages other than English. Haystack +provides a workaround for that issue by machine-translating a pipeline's inputs and outputs with the +TranslationWrapperPipeline. The following example generates German questions and answers on a German text +document - by using an English model for Question Answer Generation. + + +```python +# Fill the document store with a German document. +text1 = "Python ist eine interpretierte Hochsprachenprogrammiersprache für allgemeine Zwecke. Sie wurde von Guido van Rossum entwickelt und 1991 erstmals veröffentlicht. Die Design-Philosophie von Python legt den Schwerpunkt auf die Lesbarkeit des Codes und die Verwendung von viel Leerraum (Whitespace)." 
+docs = [{"content": text1}] +document_store.delete_documents() +document_store.write_documents(docs) + +# Load machine translation models +from haystack.nodes import TransformersTranslator + +in_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-de-en") +out_translator = TransformersTranslator(model_name_or_path="Helsinki-NLP/opus-mt-en-de") + +# Wrap the previously defined QuestionAnswerGenerationPipeline +from haystack.pipelines import TranslationWrapperPipeline + +pipeline_with_translation = TranslationWrapperPipeline( + input_translator=in_translator, output_translator=out_translator, pipeline=qag_pipeline +) + +for idx, document in enumerate(tqdm(document_store)): + print(f"\n * Generating questions and answers for document {idx}: {document.content[:100]}...\n") + result = pipeline_with_translation.run(documents=[document]) + print_questions(result) +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.7.0/_src/tutorials/tutorials/14.md b/docs/v1.7.0/_src/tutorials/tutorials/14.md new file mode 100644 index 0000000000..f0849c84ef --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/14.md @@ -0,0 +1,460 @@ + + +# Query Classifier Tutorial +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial14_Query_Classifier.ipynb) + +One of the great benefits of using state-of-the-art NLP models like those available in Haystack is that it allows users to state their queries as *plain natural language questions*: rather than trying to come up with just the right set of keywords to find the answer to their question, users can simply ask their question in much the same way that they would ask it of a (very knowledgeable!) person. + +But just because users *can* ask their questions in "plain English" (or "plain German", etc.), that doesn't mean they always *will*. For instance, a user might input a few keywords rather than a complete question because they don't understand the pipeline's full capabilities, or because they are so accustomed to keyword search. 
While a standard Haystack pipeline might handle such queries with reasonable accuracy, for a variety of reasons we still might prefer that our pipeline be sensitive to the type of query it is receiving, so that it behaves differently when a user inputs, say, a collection of keywords instead of a question. + +For this reason, Haystack comes with built-in capabilities to distinguish between three types of queries: **keyword queries**, **interrogative queries**, and **statement queries**, described below. + +1. **Keyword queries** can be thought of more or less as lists of words, such as "Alaska cruises summer". While the meanings of individual words may matter in a keyword query, the linguistic connections *between* words do not. Hence, in a keyword query the order of words is largely irrelevant: "Alaska cruises summer", "summer Alaska cruises", and "summer cruises Alaska" are functionally the same. + +2. **Interrogative queries** (or **question queries**) are queries phrased as natural language questions, such as "Who was the father of Eddard Stark?". Unlike with keyword queries, word order very much matters here: "Who was the father of Eddard Stark?" and "Who was Eddard Stark the father of?" are very different questions, despite having exactly the same words. (Note that while we often write questions with question marks, Haystack can find interrogative queries without such a dead giveaway!) + +3. **Statement queries** are just declarative sentences, such as "Daenerys loved Jon". These are like interrogative queries in that word order matters—again, "Daenerys loved Jon" and "Jon loved Daenerys" mean very different things—but they are statements instead of questions. + +In this tutorial you will learn how to use **query classifiers** to branch your Haystack pipeline based on the type of query it receives. Haystack comes with two out-of-the-box query classification schemas, each of which routes a given query into one of two branches: + +1. **Keyword vs. 
Question/Statement** — routes a query into one of two branches depending on whether it is a full question/statement or a collection of keywords. + +2. **Question vs. Statement** — routes a natural language query into one of two branches depending on whether it is a question or a statement. + +Furthermore, for each classification schema there are two types of nodes capable of performing this classification: a **`TransformersQueryClassifier`** that uses a transformer model, and an **`SklearnQueryClassifier`** that uses a more lightweight model built in `sklearn`. + +With all of that explanation out of the way, let's dive in! + +### Prepare the Environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + +Next we make sure the latest version of Haystack is installed: + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack (Colab) +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] + +# Install these to allow pipeline visualization +!apt install libgraphviz-dev +!pip install pygraphviz +``` + +### Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. 
+Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + +### Trying Some Query Classifiers on their Own + +Before integrating query classifiers into our pipelines, let's test them out on their own and see what they actually do. First we initiate a simple, out-of-the-box **keyword vs. question/statement** `SklearnQueryClassifier`: + + +```python +# Here we create the keyword vs question/statement query classifier +from haystack.nodes import SklearnQueryClassifier + +keyword_classifier = SklearnQueryClassifier() +``` + +Now let's feed some queries into this query classifier. We'll test with one keyword query, one interrogative query, and one statement query. Note that we don't need to use any punctuation, such as question marks, for the query classifier to make the right decision. + + +```python +queries = [ + "Arya Stark father", # Keyword Query + "Who was the father of Arya Stark", # Interrogative Query + "Lord Eddard was the father of Arya Stark", # Statement Query +] +``` + +Below, you can see what the classifier does with these queries: it correctly determines that "Arya Stark father" is a keyword query and sends it to branch 2. It also correctly classifies both the interrogative query "Who was the father of Arya Stark" and the statement query "Lord Eddard was the father of Arya Stark" as non-keyword queries, and sends them to branch 1. 
+ + +```python +import pandas as pd + +k_vs_qs_results = {"Query": [], "Output Branch": [], "Class": []} + +for query in queries: + result = keyword_classifier.run(query=query) + k_vs_qs_results["Query"].append(query) + k_vs_qs_results["Output Branch"].append(result[1]) + k_vs_qs_results["Class"].append("Question/Statement" if result[1] == "output_1" else "Keyword") + +pd.DataFrame.from_dict(k_vs_qs_results) +``` + +Next, we will illustrate a **question vs. statement** `SklearnQueryClassifier`. We define our classifier below. Note that this time we have to explicitly specify the model and vectorizer since the default for a `SklearnQueryClassifier` (and a `TransformersQueryClassifier`) is keyword vs. question/statement classification. + + +```python +# Here we create the question vs statement query classifier +model_url = ( + "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/model.pickle" +) +vectorizer_url = ( + "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier_statements/vectorizer.pickle" +) + +question_classifier = SklearnQueryClassifier(model_name_or_path=model_url, vectorizer_name_or_path=vectorizer_url) +``` + +We will test this classifier on the two question/statement queries from the last go-round: + + +```python +queries = [ + "Who was the father of Arya Stark", # Interrogative Query + "Lord Eddard was the father of Arya Stark", # Statement Query +] + +q_vs_s_results = {"Query": [], "Output Branch": [], "Class": []} + +for query in queries: + result = question_classifier.run(query=query) + q_vs_s_results["Query"].append(query) + q_vs_s_results["Output Branch"].append(result[1]) + q_vs_s_results["Class"].append("Question" if result[1] == "output_1" else "Statement") + +pd.DataFrame.from_dict(q_vs_s_results) +``` + +And as we see, the question "Who was the father of Arya Stark" is sent to branch 1, while the statement "Lord Eddard was the father of Arya Stark" is sent to 
branch 2. This means we can have our pipeline treat statements and questions differently. + +### Using Query Classifiers in a Pipeline + +Now let's see how we can use query classifiers in a question-answering (QA) pipeline. We start by initiating Elasticsearch: + + +```python +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! sleep 30 +``` + +Next we fetch some data—for our example we'll use pages from the Game of Thrones wiki—and index it in our `DocumentStore`: + + +```python +from haystack.utils import ( + print_answers, + print_documents, + fetch_archive_from_http, + convert_files_to_docs, + clean_wiki_text, + launch_es, +) +from haystack.pipelines import Pipeline +from haystack.document_stores import ElasticsearchDocumentStore +from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader, TransformersQueryClassifier + +# Download and prepare data - 517 Wikipedia articles for Game of Thrones +doc_dir = "data/tutorial14" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt14.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# convert files to dicts containing documents that can be indexed to our datastore +got_docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) + +# Initialize DocumentStore and index documents +# launch_es() # Uncomment this line for local Elasticsearch +document_store = ElasticsearchDocumentStore() +document_store.delete_documents() +document_store.write_documents(got_docs) 
+``` + +#### Pipelines with Keyword vs. Question/Statement Classification + +Our first illustration will be a simple retriever-reader QA pipeline, but the choice of which retriever we use will depend on the type of query received: **keyword** queries will use a sparse **`BM25Retriever`**, while **question/statement** queries will use the more accurate but also more computationally expensive **`EmbeddingRetriever`**. + +We start by initializing our retrievers and reader: + + +```python +# Initialize sparse retriever for keyword queries +bm25_retriever = BM25Retriever(document_store=document_store) + +# Initialize dense retriever for question/statement queries +embedding_retriever = EmbeddingRetriever( + document_store=document_store, embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1" +) +document_store.update_embeddings(embedding_retriever, update_existing_embeddings=False) + +reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2") +``` + +Now we define our pipeline. As promised, the question/statement branch `output_1` from the query classifier is fed into an `EmbeddingRetriever`, while the keyword branch `output_2` from the same classifier is fed into a `BM25Retriever`. Both of these retrievers are then fed into our reader. Our pipeline can thus be thought of as having something of a diamond shape: all queries are sent into the classifier, which splits those queries into two different retrievers, and those retrievers feed their outputs to the same reader. 
+ + +```python +# Here we build the pipeline +sklearn_keyword_classifier = Pipeline() +sklearn_keyword_classifier.add_node(component=SklearnQueryClassifier(), name="QueryClassifier", inputs=["Query"]) +sklearn_keyword_classifier.add_node( + component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_1"] +) +sklearn_keyword_classifier.add_node(component=bm25_retriever, name="BM25Retriever", inputs=["QueryClassifier.output_2"]) +sklearn_keyword_classifier.add_node(component=reader, name="QAReader", inputs=["BM25Retriever", "EmbeddingRetriever"]) + +# Visualization of the pipeline +sklearn_keyword_classifier.draw("sklearn_keyword_classifier.png") +``` + +Below, we can see how this choice affects the branching structure: the keyword query "arya stark father" and the question query "Who is the father of Arya Stark?" generate noticeably different results, a distinction that is likely due to the use of different retrievers for keyword vs. question/statement queries. + + +```python +# Useful for framing headers +equal_line = "=" * 30 + +# Run only the dense retriever on the full sentence query +res_1 = sklearn_keyword_classifier.run(query="Who is the father of Arya Stark?") +print(f"\n\n{equal_line}\nQUESTION QUERY RESULTS\n{equal_line}") +print_answers(res_1, details="minimum") +print("\n\n") + +# Run only the sparse retriever on a keyword based query +res_2 = sklearn_keyword_classifier.run(query="arya stark father") +print(f"\n\n{equal_line}\nKEYWORD QUERY RESULTS\n{equal_line}") +print_answers(res_2, details="minimum") +``` + +The above example uses an `SklearnQueryClassifier`, but of course we can do precisely the same thing with a `TransformersQueryClassifier`. This is illustrated below, where we have constructed the same diamond-shaped pipeline. 
+ + +```python +# Here we build the pipeline +transformer_keyword_classifier = Pipeline() +transformer_keyword_classifier.add_node( + component=TransformersQueryClassifier(), name="QueryClassifier", inputs=["Query"] +) +transformer_keyword_classifier.add_node( + component=embedding_retriever, name="EmbeddingRetriever", inputs=["QueryClassifier.output_1"] +) +transformer_keyword_classifier.add_node( + component=bm25_retriever, name="BM25Retriever", inputs=["QueryClassifier.output_2"] +) +transformer_keyword_classifier.add_node( + component=reader, name="QAReader", inputs=["BM25Retriever", "EmbeddingRetriever"] +) + + +# Useful for framing headers +equal_line = "=" * 30 + +# Run only the dense retriever on the full sentence query +res_1 = transformer_keyword_classifier.run(query="Who is the father of Arya Stark?") +print(f"\n\n{equal_line}\nQUESTION QUERY RESULTS\n{equal_line}") +print_answers(res_1, details="minimum") +print("\n\n") + +# Run only the sparse retriever on a keyword based query +res_2 = transformer_keyword_classifier.run(query="arya stark father") +print(f"\n\n{equal_line}\nKEYWORD QUERY RESULTS\n{equal_line}") +print_answers(res_2, details="minimum") +``` + +#### Pipeline with Question vs. Statement Classification + +Above we saw a potential use for keyword vs. question/statement classification: we might choose to use a less resource-intensive retriever for keyword queries than for question/statement queries. But what about question vs. statement classification? + +To illustrate one potential use for question vs. statement classification, we will build a pipeline that looks as follows: + +1. The pipeline will start with a retriever that **every query** will go through. +2. The pipeline will end with a reader that **only question queries** will go through. 
+ +In other words, our pipeline will be a **retriever-only pipeline for statement queries**—given the statement "Arya Stark was the daughter of a Lord", all we will get back are the most relevant documents—but it will be a **retriever-reader pipeline for question queries**. + +To make things more concrete, our pipeline will start with a retriever, which is then fed into a `TransformersQueryClassifier` that is set to do question vs. statement classification. Note that this means we need to explicitly choose the model, since as mentioned previously a default `TransformersQueryClassifier` performs keyword vs. question/statement classification. The classifier's first branch, which handles question queries, will then be sent to the reader, while the second branch will not be connected to any other nodes. As a result, the last node of the pipeline depends on the type of query: questions go all the way through the reader, while statements only go through the retriever. This pipeline is illustrated below: + + +```python +# Here we build the pipeline +transformer_question_classifier = Pipeline() +transformer_question_classifier.add_node(component=embedding_retriever, name="EmbeddingRetriever", inputs=["Query"]) +transformer_question_classifier.add_node( + component=TransformersQueryClassifier(model_name_or_path="shahrukhx01/question-vs-statement-classifier"), + name="QueryClassifier", + inputs=["EmbeddingRetriever"], +) +transformer_question_classifier.add_node(component=reader, name="QAReader", inputs=["QueryClassifier.output_1"]) + +# Visualization of the pipeline +transformer_question_classifier.draw("transformer_question_classifier.png") +``` + +And here are the results of this pipeline: with a question query like "Who is the father of Arya Stark?", we obtain answers from a reader, and with a statement query like "Arya Stark was the daughter of a Lord", we just obtain documents from a retriever. 
+
+
+```python
+# Useful for framing headers
+equal_line = "=" * 30
+
+# Run the retriever + reader on the question query
+res_1 = transformer_question_classifier.run(query="Who is the father of Arya Stark?")
+print(f"\n\n{equal_line}\nQUESTION QUERY RESULTS\n{equal_line}")
+print_answers(res_1, details="minimum")
+print("\n\n")
+
+# Run only the retriever on the statement query
+res_2 = transformer_question_classifier.run(query="Arya Stark was the daughter of a Lord.")
+print(f"\n\n{equal_line}\nSTATEMENT QUERY RESULTS\n{equal_line}")
+print_documents(res_2)
+```
+
+### Other use cases for Query Classifiers: custom classification models and zero-shot classification.
+
+`TransformersQueryClassifier` is very flexible and also supports other options for classifying queries.
+For example, we may be interested in detecting the sentiment or classifying the topics. We can do this by loading a custom classification model from the Hugging Face Hub or by using zero-shot classification.
+
+#### Custom classification model vs zero-shot classification
+- Traditional text classification models are trained to predict one of a few "hard-coded" classes and require a dedicated training dataset. In the Hugging Face Hub, you can find many pre-trained models, maybe even related to your domain of interest.
+- Zero-shot classification is very versatile: by choosing a suitable base transformer, you can classify the text without any training dataset. You just have to provide the candidate categories.
+
+#### Using custom classification models
+We can use a public model, available in the Hugging Face Hub. For example, if we want to classify the sentiment of the queries, we can choose an appropriate model, such as https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment.
+
+*In this case, the `labels` parameter must contain a list with the exact model labels.
+The first label we provide corresponds to output_1, the second label to output_2, and so on.* + + +```python +from haystack.nodes import TransformersQueryClassifier + +# Remember to compile a list with the exact model labels +# The first label you provide corresponds to output_1, the second label to output_2, and so on. +labels = ["LABEL_0", "LABEL_1", "LABEL_2"] + +sentiment_query_classifier = TransformersQueryClassifier( + model_name_or_path="cardiffnlp/twitter-roberta-base-sentiment", + use_gpu=True, + task="text-classification", + labels=labels, +) +``` + + +```python +queries = [ + "What's the answer?", # neutral query + "Would you be so lovely to tell me the answer?", # positive query + "Can you give me the damn right answer for once??", # negative query +] +``` + + +```python +import pandas as pd + +sent_results = {"Query": [], "Output Branch": [], "Class": []} + +for query in queries: + result = sentiment_query_classifier.run(query=query) + sent_results["Query"].append(query) + sent_results["Output Branch"].append(result[1]) + if result[1] == "output_1": + sent_results["Class"].append("negative") + elif result[1] == "output_2": + sent_results["Class"].append("neutral") + elif result[1] == "output_3": + sent_results["Class"].append("positive") + +pd.DataFrame.from_dict(sent_results) +``` + +#### Using zero-shot classification +You can also perform zero-shot classification by providing a suitable base transformer model and **choosing** the classes the model should predict. +For example, we may be interested in whether the user query is related to music or cinema. 
+ +*In this case, the `labels` parameter is a list containing the candidate classes.* + + +```python +# In zero-shot-classification, you can choose the labels +labels = ["music", "cinema"] + +query_classifier = TransformersQueryClassifier( + model_name_or_path="typeform/distilbert-base-uncased-mnli", + use_gpu=True, + task="zero-shot-classification", + labels=labels, +) +``` + + +```python +queries = [ + "In which films does John Travolta appear?", # query about cinema + "What is the Rolling Stones first album?", # query about music + "Who was Sergio Leone?", # query about cinema +] +``` + + +```python +import pandas as pd + +query_classification_results = {"Query": [], "Output Branch": [], "Class": []} + +for query in queries: + result = query_classifier.run(query=query) + query_classification_results["Query"].append(query) + query_classification_results["Output Branch"].append(result[1]) + query_classification_results["Class"].append("music" if result[1] == "output_1" else "cinema") + +pd.DataFrame.from_dict(query_classification_results) +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.7.0/_src/tutorials/tutorials/15.md b/docs/v1.7.0/_src/tutorials/tutorials/15.md new file mode 100644 index 0000000000..c06ac36101 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/15.md @@ -0,0 +1,431 @@ + + +# Open-Domain QA on Tables +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial15_TableQA.ipynb) + +This tutorial shows you how to perform question-answering on tables using the `EmbeddingRetriever` or `BM25Retriever` as retriever node and the `TableReader` as reader node. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. 
+Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] + +# The TaPAs-based TableReader requires the torch-scatter library +import torch + +version = torch.__version__ +!pip install torch-scatter -f https://data.pyg.org/whl/torch-{version}.html + +# Install pygraphviz for visualization of Pipelines +!apt install libgraphviz-dev +!pip install pygraphviz +``` + +### Start an Elasticsearch server +You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in your environment (e.g. in Colab notebooks), then you can manually download and execute Elasticsearch from source. + + +```python +# Recommended: Start Elasticsearch using Docker via the Haystack utility function +from haystack.utils import launch_es + +launch_es() +``` + + +```python +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! 
sleep 30 +``` + + +```python +# Connect to Elasticsearch +from haystack.document_stores import ElasticsearchDocumentStore + +document_index = "document" +document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index=document_index) +``` + +## Add Tables to DocumentStore +To quickly demonstrate the capabilities of the `EmbeddingRetriever` and the `TableReader` we use a subset of 1000 tables and text documents from a dataset we have published in [this paper](https://arxiv.org/abs/2108.04049). + +Just as text passages, tables are represented as `Document` objects in Haystack. The content field, though, is a pandas DataFrame instead of a string. + + +```python +# Let's first fetch some tables that we want to query +# Here: 1000 tables from OTT-QA +from haystack.utils import fetch_archive_from_http + +doc_dir = "data/tutorial15" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/table_text_dataset.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) +``` + + +```python +# Add the tables to the DocumentStore + +import json +from haystack import Document +import pandas as pd + + +def read_tables(filename): + processed_tables = [] + with open(filename) as tables: + tables = json.load(tables) + for key, table in tables.items(): + current_columns = table["header"] + current_rows = table["data"] + current_df = pd.DataFrame(columns=current_columns, data=current_rows) + document = Document(content=current_df, content_type="table", id=key) + processed_tables.append(document) + + return processed_tables + + +tables = read_tables(f"{doc_dir}/tables.json") +document_store.write_documents(tables, index=document_index) + +# Showing content field and meta field of one of the Documents of content_type 'table' +print(tables[0].content) +print(tables[0].meta) +``` + +## Initialize Retriever, Reader & Pipeline + +### Retriever + +Retrievers help narrowing down the scope for the Reader to a subset of tables where 
a given question could be answered.
+They use simple but fast algorithms.
+
+**Here:** We specify an embedding model that is finetuned so it can also generate embeddings for tables (instead of just text).
+
+**Alternatives:**
+
+- `BM25Retriever` that uses the BM25 algorithm
+
+
+
+```python
+from haystack.nodes.retriever import EmbeddingRetriever
+
+retriever = EmbeddingRetriever(document_store=document_store, embedding_model="deepset/all-mpnet-base-v2-table")
+```
+
+
+```python
+# Add table embeddings to the tables in DocumentStore
+document_store.update_embeddings(retriever=retriever)
+```
+
+
+```python
+## Alternative: BM25Retriever
+# from haystack.nodes.retriever import BM25Retriever
+# retriever = BM25Retriever(document_store=document_store)
+```
+
+
+```python
+# Try the Retriever
+from haystack.utils import print_documents
+
+retrieved_tables = retriever.retrieve("Who won the Super Bowl?", top_k=5)
+# Get highest scored table
+print(retrieved_tables[0].content)
+```
+
+### Reader
+The `TableReader` is based on TaPas, a transformer-based language model capable of grasping the two-dimensional structure of a table. It scans the tables returned by the retriever and extracts the answer. The available TableReader models can be found [here](https://huggingface.co/models?pipeline_tag=table-question-answering&sort=downloads).
+
+**Notice**: The `TableReader` will return an answer for each table, even if the query cannot be answered by the table. Furthermore, the confidence scores are not useful as of now, given that they will *always* be very high (i.e. 1 or close to 1).
+
+
+```python
+from haystack.nodes import TableReader
+
+reader = TableReader(model_name_or_path="google/tapas-base-finetuned-wtq", max_seq_len=512)
+```
+
+
+```python
+# Try the TableReader on one Table
+
+table_doc = document_store.get_document_by_id("36964e90-3735-4ba1-8e6a-bec236e88bb2")
+print(table_doc.content)
+```
+
+
+```python
+from haystack.utils import print_answers
+
+prediction = reader.predict(query="Who played Gregory House in the series House?", documents=[table_doc])
+print_answers(prediction, details="all")
+```
+
+The offsets in the `offsets_in_document` and `offsets_in_context` field indicate the table cells that the model predicts to be part of the answer. They need to be interpreted on the linearized table, i.e., a flat list containing all of the table cells.
+
+
+```python
+print(f"Predicted answer: {prediction['answers'][0].answer}")
+print(f"Meta field: {prediction['answers'][0].meta}")
+```
+
+### Pipeline
+The Retriever and the Reader can be stuck together into a pipeline in order to first retrieve relevant tables and then extract the answer.
+
+**Notice**: Given that the `TableReader` does not provide useful confidence scores and returns an answer for each of the tables, the sorting of the answers might not be helpful.
+
+
+```python
+# Initialize pipeline
+from haystack import Pipeline
+
+table_qa_pipeline = Pipeline()
+table_qa_pipeline.add_node(component=retriever, name="EmbeddingRetriever", inputs=["Query"])
+table_qa_pipeline.add_node(component=reader, name="TableReader", inputs=["EmbeddingRetriever"])
+```
+
+
+```python
+prediction = table_qa_pipeline.run("When was Guilty Gear Xrd : Sign released?", params={"top_k": 30})
+print_answers(prediction, details="minimum")
+```
+
+
+```python
+# Add 500 text passages to our document store.
+ + +def read_texts(filename): + processed_passages = [] + with open(filename) as passages: + passages = json.load(passages) + for key, content in passages.items(): + document = Document(content=content, content_type="text", id=key) + processed_passages.append(document) + + return processed_passages + + +passages = read_texts(f"{doc_dir}/texts.json") +document_store.write_documents(passages, index=document_index) +``` + + +```python +document_store.update_embeddings(retriever=retriever, update_existing_embeddings=False) +``` + +## Pipeline for QA on Combination of Text and Tables +We are using one node for retrieving both texts and tables, the `EmbeddingRetriever`. In order to do question-answering on the Documents coming from the `EmbeddingRetriever`, we need to route Documents of type `"text"` to a `FARMReader` (or alternatively `TransformersReader`) and Documents of type `"table"` to a `TableReader`. + +To achieve this, we make use of two additional nodes: +- `RouteDocuments`: Splits the List of Documents retrieved by the `EmbeddingRetriever` into two lists containing only Documents of type `"text"` or `"table"`, respectively. +- `JoinAnswers`: Takes Answers coming from two different Readers (in this case `FARMReader` and `TableReader`) and joins them to a single list of Answers. + + +```python +from haystack.nodes import FARMReader, RouteDocuments, JoinAnswers + +text_reader = FARMReader("deepset/roberta-base-squad2") +# In order to get meaningful scores from the TableReader, use "deepset/tapas-large-nq-hn-reader" or +# "deepset/tapas-large-nq-reader" as TableReader models. The disadvantage of these models is, however, +# that they are not capable of doing aggregations over multiple table cells. 
+table_reader = TableReader("deepset/tapas-large-nq-hn-reader")
+route_documents = RouteDocuments()
+join_answers = JoinAnswers()
+```
+
+
+```python
+text_table_qa_pipeline = Pipeline()
+text_table_qa_pipeline.add_node(component=retriever, name="EmbeddingRetriever", inputs=["Query"])
+text_table_qa_pipeline.add_node(component=route_documents, name="RouteDocuments", inputs=["EmbeddingRetriever"])
+text_table_qa_pipeline.add_node(component=text_reader, name="TextReader", inputs=["RouteDocuments.output_1"])
+text_table_qa_pipeline.add_node(component=table_reader, name="TableReader", inputs=["RouteDocuments.output_2"])
+text_table_qa_pipeline.add_node(component=join_answers, name="JoinAnswers", inputs=["TextReader", "TableReader"])
+```
+
+
+```python
+# Let's have a look at the structure of the combined Table and Text QA pipeline.
+from IPython import display
+
+text_table_qa_pipeline.draw()
+display.Image("pipeline.png")
+```
+
+
+```python
+# Example query whose answer resides in a text passage
+predictions = text_table_qa_pipeline.run(query="Who was Thomas Alva Edison?")
+```
+
+
+```python
+# We can see both text passages and tables as contexts of the predicted answers.
+print_answers(predictions, details="minimum")
+```
+
+
+```python
+# Example query whose answer resides in a table
+predictions = text_table_qa_pipeline.run(query="Which country does the film Macaroni come from?")
+```
+
+
+```python
+# We can see both text passages and tables as contexts of the predicted answers.
+print_answers(predictions, details="minimum")
+```
+
+## Evaluation
+To evaluate our pipeline, we can use haystack's evaluation feature. We just need to convert our labels into `MultiLabel` objects and the `eval` method will do the rest.
+ + +```python +from haystack import Label, MultiLabel, Answer + + +def read_labels(filename, tables): + processed_labels = [] + with open(filename) as labels: + labels = json.load(labels) + for table in tables: + if table.id not in labels: + continue + label = labels[table.id] + label = Label( + query=label["query"], + document=table, + is_correct_answer=True, + is_correct_document=True, + answer=Answer(answer=label["answer"]), + origin="gold-label", + ) + processed_labels.append(MultiLabel(labels=[label])) + return processed_labels + + +table_labels = read_labels(f"{doc_dir}/labels.json", tables) +passage_labels = read_labels(f"{doc_dir}/labels.json", passages) +``` + + +```python +eval_results = text_table_qa_pipeline.eval(table_labels + passage_labels, params={"top_k": 10}) +``` + + +```python +# Calculating and printing the evaluation metrics +print(eval_results.calculate_metrics()) +``` + +## Adding tables from PDFs +It can sometimes be hard to provide your data in form of a pandas DataFrame. For this case, we provide the `ParsrConverter` wrapper that can help you to convert, for example, a PDF file into a document that you can index. + +**Attention: `parsr` needs a docker environment for execution, but Colab doesn't support docker.** +**If you have a local docker environment, you can uncomment and run the following cells.** + + +```python +# import time + +# !docker run -d -p 3001:3001 axarev/parsr +# time.sleep(30) +``` + + +```python +# !wget https://www.w3.org/WAI/WCAG21/working-examples/pdf-table/table.pdf +``` + + +```python +# from haystack.nodes import ParsrConverter + +# converter = ParsrConverter() + +# docs = converter.convert("table.pdf") + +# tables = [doc for doc in docs if doc.content_type == "table"] +``` + + +```python +# print(tables) +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! 
+Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) + diff --git a/docs/v1.7.0/_src/tutorials/tutorials/16.md b/docs/v1.7.0/_src/tutorials/tutorials/16.md new file mode 100644 index 0000000000..f13c392a31 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/16.md @@ -0,0 +1,275 @@ + + +# Extending your Metadata using DocumentClassifiers at Index Time + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial16_Document_Classifier_at_Index_Time.ipynb) + +With DocumentClassifier it's possible to automatically enrich your documents with categories, sentiments, topics or whatever metadata you like. This metadata could be used for efficient filtering or further processing. Say you have some categories your users typically filter on. If the documents are tagged manually with these categories, you could automate this process by training a model. Or you can leverage the full power and flexibility of zero shot classification. All you need to do is pass your categories to the classifier, no labels required. This tutorial shows how to integrate it in your indexing pipeline. + +DocumentClassifier adds the classification result (label and score) to Document's meta property. +Hence, we can use it to classify documents at index time. 
\ +The result can be accessed at query time: for example by applying a filter for "classification.label". + +This tutorial will show you how to integrate a classification model into your preprocessing steps and how you can filter for this additional metadata at query time. In the last section we show how to put it all together and create an indexing pipeline. + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr] + +!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz +!tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin + +# Install pygraphviz +!apt install libgraphviz-dev +!pip install pygraphviz +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. 
+Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + + +```python +# Here are the imports we need +from haystack.document_stores.elasticsearch import ElasticsearchDocumentStore +from haystack.nodes import PreProcessor, TransformersDocumentClassifier, FARMReader, BM25Retriever +from haystack.schema import Document +from haystack.utils import convert_files_to_docs, fetch_archive_from_http, print_answers +``` + + +```python +# This fetches some sample files to work with + +doc_dir = "data/tutorial16" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial16.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) +``` + +## Read and preprocess documents + + + +```python +# note that you can also use the document classifier before applying the PreProcessor, e.g. before splitting your documents + +all_docs = convert_files_to_docs(dir_path=doc_dir) +preprocessor_sliding_window = PreProcessor(split_overlap=3, split_length=10, split_respect_sentence_boundary=False) +docs_sliding_window = preprocessor_sliding_window.process(all_docs) +``` + +## Apply DocumentClassifier + +We can enrich the document metadata at index time using any transformers document classifier model. While traditional classification models are trained to predict one of a few "hard-coded" classes and required a dedicated training dataset, zero-shot classification is super flexible and you can easily switch the classes the model should predict on the fly. Just supply them via the labels param. 
+Here we use a zero shot model that is supposed to classify our documents in 'music', 'natural language processing' and 'history'. Feel free to change them for whatever you like to classify. \ +These classes can later on be accessed at query time. + + +```python +doc_classifier = TransformersDocumentClassifier( + model_name_or_path="cross-encoder/nli-distilroberta-base", + task="zero-shot-classification", + labels=["music", "natural language processing", "history"], + batch_size=16, +) +``` + + +```python +# we can also use any other transformers model besides zero shot classification + +# doc_classifier_model = 'bhadresh-savani/distilbert-base-uncased-emotion' +# doc_classifier = TransformersDocumentClassifier(model_name_or_path=doc_classifier_model, batch_size=16, use_gpu=-1) +``` + + +```python +# we could also specify a different field we want to run the classification on + +# doc_classifier = TransformersDocumentClassifier(model_name_or_path="cross-encoder/nli-distilroberta-base", +# task="zero-shot-classification", +# labels=["music", "natural language processing", "history"], +# batch_size=16, use_gpu=-1, +# classification_field="description") +``` + + +```python +# classify using gpu, batch_size makes sure we do not run out of memory +classified_docs = doc_classifier.predict(docs_sliding_window) +``` + + +```python +# let's see how it looks: there should be a classification result in the meta entry containing labels and scores. +print(classified_docs[0].to_dict()) +``` + +## Indexing + + +```python +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +!
chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! sleep 30 +``` + + +```python +# Connect to Elasticsearch +document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document") +``` + + +```python +# Now, let's write the docs to our DB. +document_store.delete_all_documents() +document_store.write_documents(classified_docs) +``` + + +```python +# check if indexed docs contain classification results +test_doc = document_store.get_all_documents()[0] +print( + f'document {test_doc.id} with content \n\n{test_doc.content}\n\nhas label {test_doc.meta["classification"]["label"]}' +) +``` + +## Querying the data + +All we have to do to filter for one of our classes is to set a filter on "classification.label". + + +```python +# Initialize QA-Pipeline +from haystack.pipelines import ExtractiveQAPipeline + +retriever = BM25Retriever(document_store=document_store) +reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) +pipe = ExtractiveQAPipeline(reader, retriever) +``` + + +```python +## Voilà! 
Ask a question while filtering for "music"-only documents +prediction = pipe.run( + query="What is heavy metal?", + params={"Retriever": {"top_k": 10, "filters": {"classification.label": ["music"]}}, "Reader": {"top_k": 5}}, +) +``` + + +```python +print_answers(prediction, details="high") +``` + +## Wrapping it up in an indexing pipeline + + +```python +from pathlib import Path +from haystack.pipelines import Pipeline +from haystack.nodes import TextConverter, PreProcessor, FileTypeClassifier, PDFToTextConverter, DocxToTextConverter +``` + + +```python +file_type_classifier = FileTypeClassifier() +text_converter = TextConverter() +pdf_converter = PDFToTextConverter() +docx_converter = DocxToTextConverter() + +indexing_pipeline_with_classification = Pipeline() +indexing_pipeline_with_classification.add_node( + component=file_type_classifier, name="FileTypeClassifier", inputs=["File"] +) +indexing_pipeline_with_classification.add_node( + component=text_converter, name="TextConverter", inputs=["FileTypeClassifier.output_1"] +) +indexing_pipeline_with_classification.add_node( + component=pdf_converter, name="PdfConverter", inputs=["FileTypeClassifier.output_2"] +) +indexing_pipeline_with_classification.add_node( + component=docx_converter, name="DocxConverter", inputs=["FileTypeClassifier.output_4"] +) +indexing_pipeline_with_classification.add_node( + component=preprocessor_sliding_window, + name="Preprocessor", + inputs=["TextConverter", "PdfConverter", "DocxConverter"], +) +indexing_pipeline_with_classification.add_node( + component=doc_classifier, name="DocumentClassifier", inputs=["Preprocessor"] +) +indexing_pipeline_with_classification.add_node( + component=document_store, name="DocumentStore", inputs=["DocumentClassifier"] +) +indexing_pipeline_with_classification.draw("index_time_document_classifier.png") + +document_store.delete_documents() +txt_files = [f for f in Path(doc_dir).iterdir() if f.suffix == ".txt"] +pdf_files = [f for f in 
Path(doc_dir).iterdir() if f.suffix == ".pdf"] +docx_files = [f for f in Path(doc_dir).iterdir() if f.suffix == ".docx"] +indexing_pipeline_with_classification.run(file_paths=txt_files) +indexing_pipeline_with_classification.run(file_paths=pdf_files) +indexing_pipeline_with_classification.run(file_paths=docx_files) + +document_store.get_all_documents()[0] +``` + + +```python +# we can store this pipeline and use it from the REST-API +indexing_pipeline_with_classification.save_to_yaml("indexing_pipeline_with_classification.yaml") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) + diff --git a/docs/v1.7.0/_src/tutorials/tutorials/17.md b/docs/v1.7.0/_src/tutorials/tutorials/17.md new file mode 100644 index 0000000000..d5aa1e0790 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/17.md @@ -0,0 +1,378 @@ + + +# Make Your QA Pipelines Talk! + + + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial17_Audio.ipynb) + +Question answering works primarily on text, but Haystack provides some features for audio files that contain speech as well. 
+ +In this tutorial, we're going to see how to use `AnswerToSpeech` to convert answers into audio files. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,audio] +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. +Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + +### Setup Elasticsearch + + + +```python +# Recommended: Start Elasticsearch using Docker via the Haystack utility function +from haystack.utils import launch_es + +launch_es() +``` + + +```python +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! 
sleep 30 +``` + +### Populate the document store with `SpeechDocuments` + +First of all, we will populate the document store with a simple indexing pipeline. See [Tutorial 1](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial1_Basic_QA_Pipeline.ipynb) for more details about these steps. + +To the basic version, we can add here a DocumentToSpeech node that also generates an audio file for each of the indexed documents. This will make it possible, during querying, to access the audio version of the documents the answers were extracted from without having to generate it on the fly. + +**Note**: this additional step can slow down your indexing quite a lot if you are not running on GPU. Experiment with very small corpora to start. + + +```python +from haystack.document_stores import ElasticsearchDocumentStore +from haystack.utils import fetch_archive_from_http, launch_es +from pathlib import Path +from haystack import Pipeline +from haystack.nodes import FileTypeClassifier, TextConverter, PreProcessor, DocumentToSpeech + +document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document") + +# Get the documents +documents_path = "data/tutorial17" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt17.zip" +fetch_archive_from_http(url=s3_url, output_dir=documents_path) + +# List all the paths +file_paths = [p for p in Path(documents_path).glob("**/*")] + +# NOTE: In this example we're going to use only one text file from the wiki, as the DocumentToSpeech node is quite slow +# on CPU machines. Comment out this line to use all documents from the dataset if your machine is powerful enough.
+file_paths = [p for p in file_paths if "Stormborn" in p.name] + +# Prepare some basic metadata for the files +files_metadata = [{"name": path.name} for path in file_paths] + +# Here we create a basic indexing pipeline +indexing_pipeline = Pipeline() + +# - Makes sure the file is a TXT file (FileTypeClassifier node) +classifier = FileTypeClassifier() +indexing_pipeline.add_node(classifier, name="classifier", inputs=["File"]) + +# - Converts a file into text and performs basic cleaning (TextConverter node) +text_converter = TextConverter(remove_numeric_tables=True) +indexing_pipeline.add_node(text_converter, name="text_converter", inputs=["classifier.output_1"]) + +# - Pre-processes the text by performing splits and adding metadata to the text (Preprocessor node) +preprocessor = PreProcessor( +    clean_whitespace=True, +    clean_empty_lines=True, +    split_length=100, +    split_overlap=50, +    split_respect_sentence_boundary=True, +) +indexing_pipeline.add_node(preprocessor, name="preprocessor", inputs=["text_converter"]) + +# +# DocumentToSpeech +# +# Here is where we convert all documents to be indexed into SpeechDocuments, that will hold not only +# the text content, but also their audio version. +# +# Note that DocumentToSpeech implements a light caching, so if a document's audio has already +# been generated in a previous pass in the same folder, it will reuse the existing file instead +# of generating it again.
+doc2speech = DocumentToSpeech( + model_name_or_path="espnet/kan-bayashi_ljspeech_vits", generated_audio_dir=Path("./generated_audio_documents") +) +indexing_pipeline.add_node(doc2speech, name="doc2speech", inputs=["preprocessor"]) + +# - Writes the resulting documents into the document store (ElasticsearchDocumentStore node from the previous cell) +indexing_pipeline.add_node(document_store, name="document_store", inputs=["doc2speech"]) + +# Then we run it with the documents and their metadata as input +output = indexing_pipeline.run(file_paths=file_paths, meta=files_metadata) +``` + + +```python +from pprint import pprint + +# You can now check the document store and verify that documents have been enriched with a path +# to the generated audio file +document = next(document_store.get_all_documents_generator()) +pprint(document) + +# Sample output: +# +# +``` + +### Querying + +Now we will create a pipeline very similar to the basic `ExtractiveQAPipeline` of Tutorial 1, +with the addition of a node that converts our answers into audio files! Once the answer is retrieved, we can also listen to the audio version of the document where the answer came from. + + +```python +from pathlib import Path +from haystack import Pipeline +from haystack.nodes import BM25Retriever, FARMReader, AnswerToSpeech + +retriever = BM25Retriever(document_store=document_store) +reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2-distilled", use_gpu=True) +answer2speech = AnswerToSpeech( + model_name_or_path="espnet/kan-bayashi_ljspeech_vits", generated_audio_dir=Path("./audio_answers") +) + +audio_pipeline = Pipeline() +audio_pipeline.add_node(retriever, name="Retriever", inputs=["Query"]) +audio_pipeline.add_node(reader, name="Reader", inputs=["Retriever"]) +audio_pipeline.add_node(answer2speech, name="AnswerToSpeech", inputs=["Reader"]) +``` + +## Ask a question! 
+ + +```python +# You can configure how many candidates the Reader and Retriever shall return +# The higher top_k_retriever, the better (but also the slower) your answers. +prediction = audio_pipeline.run( + query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +) +``` + + +```python +# Now you can either print the object directly... +from pprint import pprint + +pprint(prediction) + +# Sample output: +# { +# 'answers': [ , +# , +# ... +# ] +# 'documents': [ , +# , +# ... +# ], +# 'no_ans_gap': 11.688868522644043, +# 'node_id': 'Reader', +# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}}, +# 'query': 'Who was born during a storm?', +# 'root_node': 'Query' +# } +``` + + +```python +from haystack.utils import print_answers + +# ...or use a util to simplify the output +# Change `minimum` to `medium` or `all` to raise the level of detail +print_answers(prediction, details="minimum") + +# Sample output: +# +# Query: Who was born during a storm +# Answers: +# [ { 'answer_audio': PosixPath('generated_audio_answers/07d6265486b22356362387c5a098ba7d.wav'), +# 'answer': 'Daenerys Targaryen', +# 'context_transcript': PosixPath('generated_audio_answers/3f1ca228d6c4cfb633e55f89e97de7ac.wav'), +# 'context': ' refers to both Daenerys Targaryen, who was born during a terrible storm, and '}, +# { 'answer_audio': PosixPath('generated_audio_answers/83c3a02141cac4caffe0718cfd6c405c.wav'), +# 'answer': 'Daenerys', +# 'context_audio': PosixPath('generated_audio_answers/8c562ebd7e7f41e1f9208384957df173.wav'), +# 'context': 'The title of the episode refers to both Daenerys Targaryen, who was born during a terrible storm'}, +# ... +``` + + +```python +# The document the first answer was extracted from +original_document = [doc for doc in prediction["documents"] if doc.id == prediction["answers"][0].document_id][0] +pprint(original_document) + +# Sample output +# +# +``` + +### Hear them out! 
+ + +```python +from IPython.display import display, Audio +import soundfile as sf +``` + + +```python +# The first answer in isolation + +print("Answer: ", prediction["answers"][0].answer) + +speech, _ = sf.read(prediction["answers"][0].answer_audio) +display(Audio(speech, rate=24000)) +``` + + +```python +# The context of the first answer + +print("Context: ", prediction["answers"][0].context) + +speech, _ = sf.read(prediction["answers"][0].context_audio) +display(Audio(speech, rate=24000)) +``` + + +```python +# The document the first answer was extracted from + +document = [doc for doc in prediction["documents"] if doc.id == prediction["answers"][0].document_id][0] + +print("Document: ", document.content) + +speech, _ = sf.read(document.meta["content_audio"]) +display(Audio(speech, rate=24000)) +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) + diff --git a/docs/v1.7.0/_src/tutorials/tutorials/18.md b/docs/v1.7.0/_src/tutorials/tutorials/18.md new file mode 100644 index 0000000000..aeb16a5cc6 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/18.md @@ -0,0 +1,298 @@ + + +# Generative Pseudo Labeling for Domain Adaptation of Dense Retrievals + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial18_GPL.ipynb) + +*Note: Adapted to Haystack from Nils Riemers' original [notebook](https://colab.research.google.com/gist/jamescalam/d2c888775c87f9882bb7c379a96adbc8/gpl-domain-adaptation.ipynb#scrollTo=183ff7ab) + +The NLP models we use every day were trained on a corpus of data that reflects the world from the past. In the meantime, we've experienced world-changing events, like the COVID pandemics, and we'd like our models to know about them. Training a model from scratch is tedious work but what if we could just update the models with new data? Generative Pseudo Labeling comes to the rescue. + +The example below shows you how to use GPL to fine-tune a model so that it can answer the query: "How is COVID-19 transmitted?". + +We're using TAS-B: A DistilBERT model that achieves state-of-the-art performance on MS MARCO (500k queries from Bing Search Engine). 
Both DistilBERT and MS MARCO were created with data from 2018 and before, hence, the model lacks knowledge of any COVID-related information. + +For this example, we're using just four documents. When you ask the model "How is COVID-19 transmitted?", here are the answers that you get (dot-score and document): +- 94.84 Ebola is transmitted via direct contact with blood +- 92.87 HIV is transmitted via sex or sharing needles +- 92.31 Corona is transmitted via the air +- 91.54 Polio is transmitted via contaminated water or food + + +You can see that the correct document is only third, outranked by Ebola and HIV information. Let's see how we can make this better. + +## Efficient Domain Adaptation with GPL +This notebook demonstrates [Generative Pseudo Labeling (GPL)](https://arxiv.org/abs/2112.07577), an efficient approach to adapt existing dense retrieval models to new domains and data. + +We get a collection of 10k scientific papers on COVID-19 and then fine-tune the model within 15-60 minutes (depending on your GPU) so that it includes the COVID knowledge. + +If we search again with the updated model, we get the search results we would expect: +- Query: How is COVID-19 transmitted +- 97.70 Corona is transmitted via the air +- 96.71 Ebola is transmitted via direct contact with blood +- 95.14 Polio is transmitted via contaminated water or food +- 94.13 HIV is transmitted via sex or sharing needles + +### Prepare the Environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + + +```python +!nvidia-smi +``` + + +```python +!pip install -q datasets +!pip install "faiss-gpu>=1.6.3,<2" +!pip install -q git+https://github.com/deepset-ai/haystack.git +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack.
+Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + + +```python +from sentence_transformers import SentenceTransformer, util +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +from datasets import load_dataset +``` + + +```python +# We load the TAS-B model, a state-of-the-art model trained on MS MARCO +max_seq_length = 200 +model_name = "msmarco-distilbert-base-tas-b" + +org_model = SentenceTransformer(model_name) +org_model.max_seq_length = max_seq_length +``` + + +```python +# We define a simple query and some documents about how diseases are transmitted +# As TAS-B was trained on rather out-dated data (2018 and older), it has no idea about COVID-19 +# So in the below example, it fails to recognize the relationship between COVID-19 and Corona + + +def show_examples(model): +    query = "How is COVID-19 transmitted" +    docs = [ +        "Corona is transmitted via the air", +        "Ebola is transmitted via direct contact with blood", +        "HIV is transmitted via sex or sharing needles", +        "Polio is transmitted via contaminated water or food", +    ] + +    query_emb = model.encode(query) +    docs_emb = model.encode(docs) +    scores = util.dot_score(query_emb, docs_emb)[0] +    doc_scores = sorted(zip(docs, scores), key=lambda x: x[1], reverse=True) + +    print("Query:", query) +    for doc, score in doc_scores: +        # print(doc, score) +        print(f"{score:0.02f}\t{doc}") + + +print("Original Model") +show_examples(org_model) +``` + +# Get Some Data on COVID-19 +We select 10k scientific publications (title + abstract) that are connected to COVID-19. As a dataset, we use [TREC-COVID-19](https://huggingface.co/datasets/nreimers/trec-covid).
+ + +```python +dataset = load_dataset("nreimers/trec-covid", split="train") +num_documents = 10000 +corpus = [] +for row in dataset: + if len(row["title"]) > 20 and len(row["text"]) > 100: + text = row["title"] + " " + row["text"] + + text_lower = text.lower() + + # The dataset also contains many papers on other diseases. To make the training in this demo + # more efficient, we focus on papers that talk about COVID. + if "covid" in text_lower or "corona" in text_lower or "sars-cov-2" in text_lower: + corpus.append(text) + + if len(corpus) >= num_documents: + break + +print("Len Corpus:", len(corpus)) +``` + +# Initialize Haystack Retriever and DocumentStore + +Let's add corpus documents to `FAISSDocumentStore` and update corpus embeddings via `EmbeddingRetriever` + + +```python +from haystack.nodes.retriever import EmbeddingRetriever +from haystack.document_stores import FAISSDocumentStore + +document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", similarity="cosine") +document_store.write_documents([{"content": t} for t in corpus]) + + +retriever = EmbeddingRetriever( + document_store=document_store, + embedding_model="sentence-transformers/msmarco-distilbert-base-tas-b", + model_format="sentence_transformers", + max_seq_len=max_seq_length, + progress_bar=False, +) +document_store.update_embeddings(retriever) +``` + +## (Optional) Download Pre-Generated Questions or Generate Them Outside of Haystack + +The first step of the GPL algorithm requires us to generate questions for a given text passage. Even though our pre-COVID trained model hasn't seen any COVID-related content, it can still produce sensible queries by copying words from the input text. As generating questions from 10k documents is a bit slow (depending on the GPU used), we'll download question/document pairs directly from the Hugging Face hub. 
+ + +```python +from tqdm.auto import tqdm + +query_doc_pairs = [] + +load_queries_from_hub = True + +# Generation of the queries is quite slow in Colab due to the old GPU and the limited CPU +# I pre-computed the queries and uploaded these to the HF dataset hub. Here we just download them +if load_queries_from_hub: +    generated_queries = load_dataset("nreimers/trec-covid-generated-queries", split="train") +    for row in generated_queries: +        query_doc_pairs.append({"question": row["query"], "document": row["doc"]}) +else: +    # Load doc2query model +    t5_name = "doc2query/msmarco-t5-base-v1" +    t5_tokenizer = AutoTokenizer.from_pretrained(t5_name) +    t5_model = AutoModelForSeq2SeqLM.from_pretrained(t5_name).cuda() + +    batch_size = 32 +    queries_per_doc = 3 + +    for start_idx in tqdm(range(0, len(corpus), batch_size)): +        corpus_batch = corpus[start_idx : start_idx + batch_size] +        enc_inp = t5_tokenizer( +            corpus_batch, max_length=max_seq_length, truncation=True, padding=True, return_tensors="pt" +        ) + +        outputs = t5_model.generate( +            input_ids=enc_inp["input_ids"].cuda(), +            attention_mask=enc_inp["attention_mask"].cuda(), +            max_length=64, +            do_sample=True, +            top_p=0.95, +            num_return_sequences=queries_per_doc, +        ) + +        decoded_output = t5_tokenizer.batch_decode(outputs, skip_special_tokens=True) + +        for idx, query in enumerate(decoded_output): +            corpus_id = int(idx / queries_per_doc) +            query_doc_pairs.append({"question": query, "document": corpus_batch[corpus_id]}) + + +print("Generated queries:", len(query_doc_pairs)) +``` + +# Use PseudoLabelGenerator to Generate Retriever Adaptation Training Data + +A `PseudoLabelGenerator` run will execute all three steps of the GPL [algorithm](https://github.com/UKPLab/gpl#how-does-gpl-work): + 1. Question generation - optional step + 2. Negative mining + 3. Pseudo labeling (margin scoring) + +The output of the `PseudoLabelGenerator` is the training data we'll use to adapt our `EmbeddingRetriever`.
+ + + +```python +from haystack.nodes.question_generator import QuestionGenerator +from haystack.nodes.label_generator import PseudoLabelGenerator + +use_question_generator = False + + +if use_question_generator: + questions_producer = QuestionGenerator( + model_name_or_path="doc2query/msmarco-t5-base-v1", + max_length=64, + split_length=128, + batch_size=32, + num_queries_per_doc=3, + ) + +else: + questions_producer = query_doc_pairs + +# We can use either QuestionGenerator or already generated questions in PseudoLabelGenerator +psg = PseudoLabelGenerator(questions_producer, retriever, max_questions_per_document=10, batch_size=32, top_k=10) +output, pipe_id = psg.run(documents=document_store.get_all_documents()) +``` + +# Update the Retriever + +Now that we have the generated training data produced by `PseudoLabelGenerator`, we'll update the `EmbeddingRetriever`. Let's take a peek at the training data. + + +```python +output["gpl_labels"][0] +``` + + +```python +len(output["gpl_labels"]) +``` + + +```python +retriever.train(output["gpl_labels"]) +``` + +## Verify that EmbeddingRetriever Is Adapted and Save It For Future Use + +Let's repeat our query to see if the Retriever learned about COVID and can now rank it as #1 among the answers. 
+ + +```python +print("Original Model") +show_examples(org_model) + +print("\n\nAdapted Model") +show_examples(retriever.embedding_encoder.embedding_model) +``` + + +```python +retriever.save("adapted_retriever") +``` diff --git a/docs/v1.7.0/_src/tutorials/tutorials/2.md b/docs/v1.7.0/_src/tutorials/tutorials/2.md new file mode 100644 index 0000000000..54a3611d32 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/2.md @@ -0,0 +1,172 @@ + + +# Fine-tuning a Model on Your Own Data + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial2_Finetune_a_model_on_your_data.ipynb) + +For many use cases it is sufficient to just use one of the existing public models that were trained on SQuAD or other public QA datasets (e.g. Natural Questions). +However, if you have domain-specific questions, fine-tuning your model on custom examples will very likely boost your performance. +While this varies by domain, we saw that ~ 2000 examples can easily increase performance by +5-20%. + +This tutorial shows you how to fine-tune a pretrained model on your own dataset. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. 
+Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + + +```python +from haystack.nodes import FARMReader +from haystack.utils import fetch_archive_from_http +``` + + +## Create Training Data + +There are two ways to generate training data + +1. **Annotation**: You can use the [annotation tool](https://haystack.deepset.ai/guides/annotation) to label your data, i.e. highlighting answers to your questions in a document. The tool supports structuring your workflow with organizations, projects, and users. The labels can be exported in SQuAD format that is compatible for training with Haystack. + +![Snapshot of the annotation tool](https://raw.githubusercontent.com/deepset-ai/haystack/master/docs/img/annotation_tool.png) + +2. **Feedback**: For production systems, you can collect training data from direct user feedback via Haystack's [REST API interface](https://github.com/deepset-ai/haystack#rest-api). This includes a customizable user feedback API for providing feedback on the answer returned by the API. The API provides a feedback export endpoint to obtain the feedback data for fine-tuning your model further. + + +## Fine-tune your model + +Once you have collected training data, you can fine-tune your base models. +We initialize a reader as a base model and fine-tune it on our own custom dataset (should be in SQuAD-like format). +We recommend using a base model that was trained on SQuAD or a similar QA dataset before to benefit from Transfer Learning effects. + +**Recommendation**: Run training on a GPU. +If you are using Colab: Enable this in the menu "Runtime" > "Change Runtime type" > Select "GPU" in dropdown. 
+Then change the `use_gpu` arguments below to `True` + + +```python +reader = FARMReader(model_name_or_path="distilbert-base-uncased-distilled-squad", use_gpu=True) +data_dir = "data/squad20" +# data_dir = "PATH/TO_YOUR/TRAIN_DATA" +reader.train(data_dir=data_dir, train_filename="dev-v2.0.json", use_gpu=True, n_epochs=1, save_dir="my_model") +``` + + +```python +# Saving the model happens automatically at the end of training into the `save_dir` you specified +# However, you could also save a reader manually again via: +reader.save(directory="my_model") +``` + + +```python +# If you want to load it at a later point, just do: +new_reader = FARMReader(model_name_or_path="my_model") +``` + +## Distill your model +In this case, we have used "distilbert-base-uncased" as our base model. This model was trained using a process called distillation. In this process, a bigger model is trained first and is used to train a smaller model which increases its accuracy. This is why "distilbert-base-uncased" can achieve quite competitive performance while being very small. + +Sometimes, however, you can't use an already distilled model and have to distil it yourself. For this case, haystack has implemented [distillation features](https://haystack.deepset.ai/guides/model-distillation). + +### Augmenting your training data +To get the most out of model distillation, we recommend increasing the size of your training data by using data augmentation. 
You can do this by running the [`augment_squad.py` script](https://github.com/deepset-ai/haystack/blob/master/haystack/utils/augment_squad.py): + + +```python +# Downloading script +!wget https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/utils/augment_squad.py + +doc_dir = "data/tutorial2" + +# Downloading smaller glove vector file (only for demonstration purposes) +glove_url = "https://nlp.stanford.edu/data/glove.6B.zip" +fetch_archive_from_http(url=glove_url, output_dir=doc_dir) + +# Downloading very small dataset to make tutorial faster (please use a bigger dataset for real use cases) +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/squad_small.json.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# Just replace the path with your dataset and adjust the output (also please remove glove path to use bigger glove vector file) +!python augment_squad.py --squad_path squad_small.json --output_path augmented_dataset.json --multiplication_factor 2 --glove_path glove.6B.300d.txt +``` + +In this case, we use a multiplication factor of 2 to keep this example lightweight. Usually you would use a factor like 20 depending on the size of your training data. Augmenting this small dataset with a multiplication factor of 2, should take about 5 to 10 minutes to run on one V100 GPU. + +### Running distillation +Distillation in haystack is done in two steps: First, you run intermediate layer distillation on the augmented dataset to ensure the two models behave similarly. After that, you run the prediction layer distillation on the non-augmented dataset to optimize the model for your specific task. + +If you want, you can leave out the intermediate layer distillation step and only run the prediction layer distillation. This way you also do not need to perform data augmentation. However, this will make the model significantly less accurate. + + +```python +# Loading a fine-tuned model as teacher e.g. 
"deepset/​bert-​base-​uncased-​squad2" +teacher = FARMReader(model_name_or_path="my_model", use_gpu=True) + +# You can use any pre-trained language model as teacher that uses the same tokenizer as the teacher model. +# The number of the layers in the teacher model also needs to be a multiple of the number of the layers in the student. +student = FARMReader(model_name_or_path="huawei-noah/TinyBERT_General_6L_768D", use_gpu=True) + +student.distil_intermediate_layers_from(teacher, data_dir=".", train_filename="augmented_dataset.json", use_gpu=True) +student.distil_prediction_layer_from(teacher, data_dir="data/squad20", train_filename="dev-v2.0.json", use_gpu=True) + +student.save(directory="my_distilled_model") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.7.0/_src/tutorials/tutorials/3.md b/docs/v1.7.0/_src/tutorials/tutorials/3.md new file mode 100644 index 0000000000..f73e08c8d9 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/3.md @@ -0,0 +1,245 @@ + + +# Build a QA System Without Elasticsearch + +[![Open In 
+If you are interested in the more feature-rich Elasticsearch setup, please refer to Tutorial 1.
+In this tutorial, we download Wikipedia articles on Game of Thrones, apply a basic cleaning function, and index them in our document store.
+# We now have a list of dictionaries that we can write to our document store.
+# If your texts come from a different source (e.g. a DB), you can of course skip convert_files_to_docs() and create the dictionaries yourself.
+# The default format here is: {"name": "<some-document-name>", "text": "<the-document-text>"}
Higher values mean the model prefers "no answer possible" + +#### FARMReader + + +```python +# Load a local model or any of the QA models on +# Hugging Face's model hub (https://huggingface.co/models) + +reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) +``` + +#### TransformersReader + + +```python +# Alternative: +# reader = TransformersReader(model_name_or_path="distilbert-base-uncased-distilled-squad", tokenizer="distilbert-base-uncased", use_gpu=-1) +``` + +### Pipeline + +With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. +Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. +To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions. +You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd). + + +```python +from haystack.pipelines import ExtractiveQAPipeline + +pipe = ExtractiveQAPipeline(reader, retriever) +``` + +## Voilà! Ask a question! + + +```python +# You can configure how many candidates the reader and retriever shall return +# The higher top_k for retriever, the better (but also the slower) your answers. +prediction = pipe.run( + query="Who is the father of Arya Stark?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +) +``` + + +```python +# prediction = pipe.run(query="Who created the Dothraki vocabulary?", params={"Reader": {"top_k": 5}}) +# prediction = pipe.run(query="Who is the sister of Sansa?", params={"Reader": {"top_k": 5}}) +``` + + +```python +# Now you can either print the object directly... +from pprint import pprint + +pprint(prediction) + +# Sample output: +# { +# 'answers': [ , +# , +# ... +# ] +# 'documents': [ , +# , +# ... 
+# ], +# 'no_ans_gap': 11.688868522644043, +# 'node_id': 'Reader', +# 'params': {'Reader': {'top_k': 5}, 'Retriever': {'top_k': 5}}, +# 'query': 'Who is the father of Arya Stark?', +# 'root_node': 'Query' +# } +``` + + +```python +# ...or use a util to simplify the output +# Change `minimum` to `medium` or `all` to raise the level of detail +print_answers(prediction, details="minimum") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.7.0/_src/tutorials/tutorials/4.md b/docs/v1.7.0/_src/tutorials/tutorials/4.md new file mode 100644 index 0000000000..3ab2e87c23 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/4.md @@ -0,0 +1,203 @@ + + +# Utilizing existing FAQs for Question Answering + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial4_FAQ_style_QA.ipynb) + +While *extractive Question Answering* works on pure texts and is therefore more generalizable, there's also a common alternative that utilizes existing FAQ data. 
+ +**Pros**: + +- Very fast at inference time +- Utilize existing FAQ data +- Quite good control over answers + +**Cons**: + +- Generalizability: We can only answer questions that are similar to existing ones in FAQ + +In some use cases, a combination of extractive QA and FAQ-style can also be an interesting option. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. +Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + + +```python +from haystack.document_stores import ElasticsearchDocumentStore + +from haystack.nodes import EmbeddingRetriever +import pandas as pd +``` + +### Start an Elasticsearch server +You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source. 
+ + +```python +# Recommended: Start Elasticsearch using Docker via the Haystack utility function +from haystack.utils import launch_es + +launch_es() +``` + + +```python +# In Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! sleep 30 +``` + +### Init the DocumentStore +In contrast to Tutorial 1 (extractive QA), we: + +* specify the name of our `text_field` in Elasticsearch that we want to return as an answer +* specify the name of our `embedding_field` in Elasticsearch where we'll store the embedding of our question and that is used later for calculating our similarity to the incoming user question +* set `excluded_meta_data=["question_emb"]` so that we don't return the huge embedding vectors in our search results + + +```python +from haystack.document_stores import ElasticsearchDocumentStore + +document_store = ElasticsearchDocumentStore( + host="localhost", + username="", + password="", + index="document", + embedding_field="question_emb", + embedding_dim=384, + excluded_meta_data=["question_emb"], + similarity="cosine", +) +``` + +### Create a Retriever using embeddings +Instead of retrieving via Elasticsearch's plain BM25, we want to use vector similarity of the questions (user question vs. FAQ ones). +We can use the `EmbeddingRetriever` for this purpose and specify a model that we use for the embeddings. 
+ + +```python +retriever = EmbeddingRetriever( + document_store=document_store, + embedding_model="sentence-transformers/all-MiniLM-L6-v2", + use_gpu=True, + scale_score=False, +) +``` + +### Prepare & Index FAQ data +We create a pandas dataframe containing some FAQ data (i.e curated pairs of question + answer) and index those in elasticsearch. +Here: We download some question-answer pairs related to COVID-19 + + +```python +from haystack.utils import fetch_archive_from_http + +# Download +doc_dir = "data/tutorial4" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/small_faq_covid.csv.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# Get dataframe with columns "question", "answer" and some custom metadata +df = pd.read_csv(f"{doc_dir}/small_faq_covid.csv") +# Minimal cleaning +df.fillna(value="", inplace=True) +df["question"] = df["question"].apply(lambda x: x.strip()) +print(df.head()) + +# Get embeddings for our questions from the FAQs +questions = list(df["question"].values) +df["question_emb"] = retriever.embed_queries(texts=questions) +df = df.rename(columns={"question": "content"}) + +# Convert Dataframe to list of dicts and index them in our DocumentStore +docs_to_index = df.to_dict(orient="records") +document_store.write_documents(docs_to_index) +``` + +### Ask questions +Initialize a Pipeline (this time without a reader) and ask questions + + +```python +from haystack.pipelines import FAQPipeline + +pipe = FAQPipeline(retriever=retriever) +``` + + +```python +from haystack.utils import print_answers + +prediction = pipe.run(query="How is the virus spreading?", params={"Retriever": {"top_k": 10}}) +print_answers(prediction, details="medium") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! 
+Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.7.0/_src/tutorials/tutorials/5.md b/docs/v1.7.0/_src/tutorials/tutorials/5.md new file mode 100644 index 0000000000..89944b01c7 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/5.md @@ -0,0 +1,636 @@ + + +# Evaluation of a Pipeline and its Components + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial5_Evaluation.ipynb) + +To be able to make a statement about the quality of results a question-answering pipeline or any other pipeline in haystack produces, it is important to evaluate it. Furthermore, evaluation allows determining which components of the pipeline can be improved. +The results of the evaluation can be saved as CSV files, which contain all the information to calculate additional metrics later on or inspect individual predictions. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! 
pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. +Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + +## Start an Elasticsearch server +You can start Elasticsearch on your local machine instance using Docker. If Docker is not readily available in your environment (eg., in Colab notebooks), then you can manually download and execute Elasticsearch from source. + + +```python +# If Docker is available: Start Elasticsearch as docker container +# from haystack.utils import launch_es +# launch_es() + +# Alternative in Colab / No Docker environments: Start Elasticsearch from source +! wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.9.2-linux-x86_64.tar.gz -q +! tar -xzf elasticsearch-7.9.2-linux-x86_64.tar.gz +! chown -R daemon:daemon elasticsearch-7.9.2 + +import os +from subprocess import Popen, PIPE, STDOUT + +es_server = Popen( + ["elasticsearch-7.9.2/bin/elasticsearch"], stdout=PIPE, stderr=STDOUT, preexec_fn=lambda: os.setuid(1) # as daemon +) +# wait until ES has started +! 
sleep 30 +``` + +## Fetch, Store And Preprocess the Evaluation Dataset + + +```python +from haystack.utils import fetch_archive_from_http + +# Download evaluation data, which is a subset of Natural Questions development set containing 50 documents with one question per document and multiple annotated answers +doc_dir = "data/tutorial5" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/nq_dev_subset_v2.json.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) +``` + + +```python +# make sure these indices do not collide with existing ones, the indices will be wiped clean before data is inserted +doc_index = "tutorial5_docs" +label_index = "tutorial5_labels" +``` + + +```python +# Connect to Elasticsearch +from haystack.document_stores import ElasticsearchDocumentStore + +# Connect to Elasticsearch +document_store = ElasticsearchDocumentStore( + host="localhost", + username="", + password="", + index=doc_index, + label_index=label_index, + embedding_field="emb", + embedding_dim=768, + excluded_meta_data=["emb"], +) +``` + + +```python +from haystack.nodes import PreProcessor + +# Add evaluation data to Elasticsearch Document Store +# We first delete the custom tutorial indices to not have duplicate elements +# and also split our documents into shorter passages using the PreProcessor +preprocessor = PreProcessor( + split_by="word", + split_length=200, + split_overlap=0, + split_respect_sentence_boundary=False, + clean_empty_lines=False, + clean_whitespace=False, +) +document_store.delete_documents(index=doc_index) +document_store.delete_documents(index=label_index) + +# The add_eval_data() method converts the given dataset in json format into Haystack document and label objects. Those objects are then indexed in their respective document and label index in the document store. The method can be used with any dataset in SQuAD format. 
+document_store.add_eval_data( + filename="data/tutorial5/nq_dev_subset_v2.json", + doc_index=doc_index, + label_index=label_index, + preprocessor=preprocessor, +) +``` + +## Initialize the Two Components of an ExtractiveQAPipeline: Retriever and Reader + + +```python +# Initialize Retriever +from haystack.nodes import BM25Retriever + +retriever = BM25Retriever(document_store=document_store) + +# Alternative: Evaluate dense retrievers (EmbeddingRetriever or DensePassageRetriever) +# The EmbeddingRetriever uses a single transformer based encoder model for query and document. +# In contrast, DensePassageRetriever uses two separate encoders for both. + +# Please make sure the "embedding_dim" parameter in the DocumentStore above matches the output dimension of your models! +# Please also take care that the PreProcessor splits your files into chunks that can be completely converted with +# the max_seq_len limitations of Transformers +# The SentenceTransformer model "sentence-transformers/multi-qa-mpnet-base-dot-v1" generally works well with the EmbeddingRetriever on any kind of English text. 
+# For more information and suggestions on different models check out the documentation at: https://www.sbert.net/docs/pretrained_models.html + +# from haystack.retriever import EmbeddingRetriever, DensePassageRetriever +# retriever = EmbeddingRetriever(document_store=document_store, +# embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1") +# retriever = DensePassageRetriever(document_store=document_store, +# query_embedding_model="facebook/dpr-question_encoder-single-nq-base", +# passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", +# use_gpu=True, +# max_seq_len_passage=256, +# embed_title=True) +# document_store.update_embeddings(retriever, index=doc_index) +``` + + +```python +# Initialize Reader +from haystack.nodes import FARMReader + +reader = FARMReader("deepset/roberta-base-squad2", top_k=4, return_no_answer=True) + +# Define a pipeline consisting of the initialized retriever and reader +from haystack.pipelines import ExtractiveQAPipeline + +pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever) + +# The evaluation also works with any other pipeline. +# For example you could use a DocumentSearchPipeline as an alternative: + +# from haystack.pipelines import DocumentSearchPipeline +# pipeline = DocumentSearchPipeline(retriever=retriever) +``` + +## Evaluation of an ExtractiveQAPipeline +Here we evaluate retriever and reader in open domain fashion on the full corpus of documents i.e. a document is considered +correctly retrieved if it contains the gold answer string within it. The reader is evaluated based purely on the +predicted answer string, regardless of which document this came from and the position of the extracted span. + +The generation of predictions is separated from the calculation of metrics. This allows you to run the computation-heavy model predictions only once and then iterate flexibly on the metrics or reports you want to generate. 
+ + + +```python +from haystack.schema import EvaluationResult, MultiLabel + +# We can load evaluation labels from the document store +# We are also opting to filter out no_answer samples +eval_labels = document_store.get_all_labels_aggregated(drop_negative_labels=True, drop_no_answers=True) + +## Alternative: Define queries and labels directly + +# eval_labels = [ +# MultiLabel( +# labels=[ +# Label( +# query="who is written in the book of life", +# answer=Answer( +# answer="every person who is destined for Heaven or the World to Come", +# offsets_in_context=[Span(374, 434)] +# ), +# document=Document( +# id='1b090aec7dbd1af6739c4c80f8995877-0', +# content_type="text", +# content='Book of Life - wikipedia Book of Life Jump to: navigation, search This article is +# about the book mentioned in Christian and Jewish religious teachings...' +# ), +# is_correct_answer=True, +# is_correct_document=True, +# origin="gold-label" +# ) +# ] +# ) +# ] + +# Similar to pipeline.run() we can execute pipeline.eval() +eval_result = pipeline.eval(labels=eval_labels, params={"Retriever": {"top_k": 5}}) +``` + + +```python +# The EvaluationResult contains a pandas dataframe for each pipeline node. +# That's why there are two dataframes in the EvaluationResult of an ExtractiveQAPipeline. + +retriever_result = eval_result["Retriever"] +retriever_result.head() +``` + + +```python +reader_result = eval_result["Reader"] +reader_result.head() +``` + + +```python +# We can filter for all documents retrieved for a given query +query = "who is written in the book of life" +retriever_book_of_life = retriever_result[retriever_result["query"] == query] +``` + + +```python +# We can also filter for all answers predicted for a given query +reader_book_of_life = reader_result[reader_result["query"] == query] +``` + + +```python +# Save the evaluation result so that we can reload it later and calculate evaluation metrics without running the pipeline again. 
+eval_result.save("../") +``` + +## Calculating Evaluation Metrics +Load an EvaluationResult to quickly calculate standard evaluation metrics for all predictions, +such as F1-score of each individual prediction of the Reader node or recall of the retriever. +To learn more about the metrics, see [Evaluation Metrics](https://haystack.deepset.ai/guides/evaluation#metrics-retrieval) + + +```python +saved_eval_result = EvaluationResult.load("../") +metrics = saved_eval_result.calculate_metrics() +print(f'Retriever - Recall (single relevant document): {metrics["Retriever"]["recall_single_hit"]}') +print(f'Retriever - Recall (multiple relevant documents): {metrics["Retriever"]["recall_multi_hit"]}') +print(f'Retriever - Mean Reciprocal Rank: {metrics["Retriever"]["mrr"]}') +print(f'Retriever - Precision: {metrics["Retriever"]["precision"]}') +print(f'Retriever - Mean Average Precision: {metrics["Retriever"]["map"]}') + +print(f'Reader - F1-Score: {metrics["Reader"]["f1"]}') +print(f'Reader - Exact Match: {metrics["Reader"]["exact_match"]}') +``` + +## Generating an Evaluation Report +A summary of the evaluation results can be printed to get a quick overview. It includes some aggregated metrics and also shows a few wrongly predicted examples. + + +```python +pipeline.print_eval_report(saved_eval_result) +``` + +## Advanced Evaluation Metrics +As an advanced evaluation metric, semantic answer similarity (SAS) can be calculated. This metric takes into account whether the meaning of a predicted answer is similar to the annotated gold answer rather than just doing string comparison. +To this end SAS relies on pre-trained models. For English, we recommend "cross-encoder/stsb-roberta-large", whereas for German we recommend "deepset/gbert-large-sts". A good multilingual model is "sentence-transformers/paraphrase-multilingual-mpnet-base-v2". 
+The isolated node evaluation uses labels as input to the Reader node instead of the output of the preceding Retriever node.
Some others might not: they don't relate to the question at all but still contain the answer string. In that case, you might want to ensure that only answers that stem from the correct context are considered correct. To do that, specify `answer_scope="context"` in `calculate_metrics()`. + +`answer_scope` takes the following values: +- `any` (default): Any matching answer is considered correct. +- `context`: The answer is only considered correct if its context matches as well. It uses fuzzy matching (see `context_matching` parameters of `pipeline.eval()`). +- `document_id`: The answer is only considered correct if its document ID matches as well. You can specify a custom document ID through the `custom_document_id_field` parameter of `pipeline.eval()`. +- `document_id_and_context`: The answer is only considered correct if its document ID and its context match as well. + +In Question Answering, to enforce that the retrieved document is considered correct whenever the answer is correct, set `document_scope` to `answer` or `document_id_or_answer`. + +`document_scope` takes the following values: +- `document_id`: Specifies that the document ID must match. You can specify a custom document ID through the `custom_document_id_field` parameter of `pipeline.eval()`. +- `context`: Specifies that the content of the document must match. It uses fuzzy matching (see the `context_matching` parameters of `pipeline.eval()`). +- `document_id_and_context`: A Boolean operation specifying that both `'document_id' AND 'context'` must match. +- `document_id_or_context`: A Boolean operation specifying that either `'document_id' OR 'context'` must match. +- `answer`: Specifies that the document contents must include the answer. The selected `answer_scope` is enforced. +- `document_id_or_answer` (default): A Boolean operation specifying that either `'document_id' OR 'answer'` must match. 
+ + +```python +metrics = saved_eval_result.calculate_metrics(answer_scope="context") +print(f'Retriever - Recall (single relevant document): {metrics["Retriever"]["recall_single_hit"]}') +print(f'Retriever - Recall (multiple relevant documents): {metrics["Retriever"]["recall_multi_hit"]}') +print(f'Retriever - Mean Reciprocal Rank: {metrics["Retriever"]["mrr"]}') +print(f'Retriever - Precision: {metrics["Retriever"]["precision"]}') +print(f'Retriever - Mean Average Precision: {metrics["Retriever"]["map"]}') + +print(f'Reader - F1-Score: {metrics["Reader"]["f1"]}') +print(f'Reader - Exact Match: {metrics["Reader"]["exact_match"]}') +``` + + +```python +document_store.get_all_documents()[0] +``` + + +```python +# Let's try Document Retrieval on a file level (it's sufficient if the correct file identified by its name (for example, 'Book of Life') was retrieved). +eval_result_custom_doc_id = pipeline.eval( + labels=eval_labels, params={"Retriever": {"top_k": 5}}, custom_document_id_field="name" +) +metrics = eval_result_custom_doc_id.calculate_metrics(document_scope="document_id") +print(f'Retriever - Recall (single relevant document): {metrics["Retriever"]["recall_single_hit"]}') +print(f'Retriever - Recall (multiple relevant documents): {metrics["Retriever"]["recall_multi_hit"]}') +print(f'Retriever - Mean Reciprocal Rank: {metrics["Retriever"]["mrr"]}') +print(f'Retriever - Precision: {metrics["Retriever"]["precision"]}') +print(f'Retriever - Mean Average Precision: {metrics["Retriever"]["map"]}') +``` + + +```python +# Let's enforce the context again: +metrics = eval_result_custom_doc_id.calculate_metrics(document_scope="document_id_and_context") +print(f'Retriever - Recall (single relevant document): {metrics["Retriever"]["recall_single_hit"]}') +print(f'Retriever - Recall (multiple relevant documents): {metrics["Retriever"]["recall_multi_hit"]}') +print(f'Retriever - Mean Reciprocal Rank: {metrics["Retriever"]["mrr"]}') +print(f'Retriever - Precision: 
{metrics["Retriever"]["precision"]}') +print(f'Retriever - Mean Average Precision: {metrics["Retriever"]["map"]}') +``` + +## Storing results in MLflow +Storing evaluation results in CSVs is fine but not enough if you want to compare and track multiple evaluation runs. MLflow is a handy tool when it comes to tracking experiments. So we decided to use it to track all of `Pipeline.eval()` with reproducibility of your experiments in mind. + +### Host your own MLflow or use deepset's public MLflow + +If you don't want to use deepset's public MLflow instance under https://public-mlflow.deepset.ai, you can easily host it yourself. + + +```python +# !pip install mlflow +# !mlflow server --serve-artifacts +``` + +### Preprocessing the dataset +Preprocessing the dataset works a bit differently than before. Instead of directly generating documents (and labels) out of a SQuAD file, we first save them to disk. This is necessary to experiment with different indexing pipelines. + + +```python +import tempfile +from pathlib import Path +from haystack.nodes import PreProcessor +from haystack.document_stores import InMemoryDocumentStore + +document_store = InMemoryDocumentStore() + +label_preprocessor = PreProcessor( + split_length=200, + split_overlap=0, + split_respect_sentence_boundary=False, + clean_empty_lines=False, + clean_whitespace=False, +) + +# The add_eval_data() method converts the given dataset in json format into Haystack document and label objects. +# Those objects are then indexed in their respective document and label index in the document store. +# The method can be used with any dataset in SQuAD format. +# We only use it to get the evaluation set labels and the corpus files.
+document_store.add_eval_data( + filename="data/tutorial5/nq_dev_subset_v2.json", + doc_index=document_store.index, + label_index=document_store.label_index, + preprocessor=label_preprocessor, +) + +# the evaluation set to evaluate the pipelines on +evaluation_set_labels = document_store.get_all_labels_aggregated(drop_negative_labels=True, drop_no_answers=True) + +# Pipelines need files as input to be able to test different preprocessors. +# Even though this looks a bit cumbersome to write the documents back to files we gain a lot of evaluation potential and reproducibility. +docs = document_store.get_all_documents() +temp_dir = tempfile.TemporaryDirectory() +file_paths = [] +for doc in docs: + file_name = doc.id + ".txt" + file_path = Path(temp_dir.name) / file_name + file_paths.append(file_path) + with open(file_path, "w") as f: + f.write(doc.content) +file_metas = [d.meta for d in docs] +``` + +### Run experiments +In this experiment we evaluate extractive QA pipelines with two different retrievers on the evaluation set given the corpus: +**ElasticsearchRetriever vs. 
EmbeddingRetriever** + + +```python +from haystack.nodes import BM25Retriever, EmbeddingRetriever, FARMReader, TextConverter +from haystack.pipelines import Pipeline +from haystack.document_stores import ElasticsearchDocumentStore +``` + + +```python +# helper function to create query and index pipeline +def create_pipelines(document_store, preprocessor, retriever, reader): + query_pipeline = Pipeline() + query_pipeline.add_node(component=retriever, inputs=["Query"], name="Retriever") + query_pipeline.add_node(component=reader, inputs=["Retriever"], name="Reader") + index_pipeline = Pipeline() + index_pipeline.add_node(component=TextConverter(), inputs=["File"], name="TextConverter") + index_pipeline.add_node(component=preprocessor, inputs=["TextConverter"], name="Preprocessor") + index_pipeline.add_node(component=retriever, inputs=["Preprocessor"], name="Retriever") + index_pipeline.add_node(component=document_store, inputs=["Retriever"], name="DocumentStore") + return query_pipeline, index_pipeline +``` + + +```python +# Name of the experiment in MLflow +EXPERIMENT_NAME = "haystack-tutorial-5" +``` + +#### Run using BM25Retriever + + +```python +document_store = ElasticsearchDocumentStore(index="sparse_index", recreate_index=True) +preprocessor = PreProcessor( + split_length=200, + split_overlap=0, + split_respect_sentence_boundary=False, + clean_empty_lines=False, + clean_whitespace=False, +) +es_retriever = BM25Retriever(document_store=document_store) +reader = FARMReader("deepset/roberta-base-squad2", top_k=3, return_no_answer=True, batch_size=8) +query_pipeline, index_pipeline = create_pipelines(document_store, preprocessor, es_retriever, reader) + +sparse_eval_result = Pipeline.execute_eval_run( + index_pipeline=index_pipeline, + query_pipeline=query_pipeline, + evaluation_set_labels=evaluation_set_labels, + corpus_file_paths=file_paths, + corpus_file_metas=file_metas, + experiment_name=EXPERIMENT_NAME, + experiment_run_name="sparse", + corpus_meta={"name": 
"nq_dev_subset_v2.json"}, + evaluation_set_meta={"name": "nq_dev_subset_v2.json"}, + pipeline_meta={"name": "sparse-pipeline"}, + add_isolated_node_eval=True, + experiment_tracking_tool="mlflow", + experiment_tracking_uri="https://public-mlflow.deepset.ai", + reuse_index=True, +) +``` + +#### Run using EmbeddingRetriever + + +```python +document_store = ElasticsearchDocumentStore(index="dense_index", recreate_index=True) +emb_retriever = EmbeddingRetriever( + document_store=document_store, + model_format="sentence_transformers", + embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1", + batch_size=8, +) +query_pipeline, index_pipeline = create_pipelines(document_store, preprocessor, emb_retriever, reader) + +dense_eval_result = Pipeline.execute_eval_run( + index_pipeline=index_pipeline, + query_pipeline=query_pipeline, + evaluation_set_labels=evaluation_set_labels, + corpus_file_paths=file_paths, + corpus_file_metas=file_metas, + experiment_name=EXPERIMENT_NAME, + experiment_run_name="embedding", + corpus_meta={"name": "nq_dev_subset_v2.json"}, + evaluation_set_meta={"name": "nq_dev_subset_v2.json"}, + pipeline_meta={"name": "embedding-pipeline"}, + add_isolated_node_eval=True, + experiment_tracking_tool="mlflow", + experiment_tracking_uri="https://public-mlflow.deepset.ai", + reuse_index=True, + answer_scope="context", +) +``` + +You can now open MLflow (e.g. https://public-mlflow.deepset.ai/ if you used the public one hosted by deepset) and look for the haystack-eval-experiment experiment. Try out mlflow's compare function and have fun... + +Note that on our public mlflow instance we are not able to log artifacts like the evaluation results or the piplines.yaml file. + +## Evaluation of Individual Components: Retriever +Sometimes you might want to evaluate individual components, for example, if you don't have a pipeline but only a retriever or a reader with a model that you trained yourself. 
+Here we evaluate only the retriever, based on whether the gold_label document is retrieved. + + +```python +## Evaluate Retriever on its own +# Note that no_answer samples are omitted when evaluation is performed with this method +retriever_eval_results = retriever.eval(top_k=5, label_index=label_index, doc_index=doc_index) +# Retriever Recall is the proportion of questions for which the correct document containing the answer is +# among the correct documents +print("Retriever Recall:", retriever_eval_results["recall"]) +# Retriever Mean Avg Precision rewards retrievers that give relevant documents a higher rank +print("Retriever Mean Avg Precision:", retriever_eval_results["map"]) +``` + +Just as a sanity check, we can compare the recall from `retriever.eval()` with the multi hit recall from `pipeline.eval(add_isolated_node_eval=True)`. +These two recall metrics are only comparable since we chose to filter out no_answer samples when generating eval_labels and setting document_scope to `"document_id"`. Per default `calculate_metrics()` has document_scope set to `"document_id_or_answer"` which interprets documents as relevant if they either match the gold document ID or contain the answer. + + +```python +metrics = eval_result_with_upper_bounds.calculate_metrics(document_scope="document_id") +print(metrics["Retriever"]["recall_multi_hit"]) +``` + +## Evaluation of Individual Components: Reader +Here we evaluate only the reader in a closed domain fashion i.e. the reader is given one query +and its corresponding relevant document and metrics are calculated on whether the right position in this text is selected by +the model as the answer span (i.e. 
SQuAD style) + + +```python +# Evaluate Reader on its own +reader_eval_results = reader.eval(document_store=document_store, label_index=label_index, doc_index=doc_index) +top_n = reader_eval_results["top_n"] +# Evaluation of Reader can also be done directly on a SQuAD-formatted file without passing the data to Elasticsearch +# reader_eval_results = reader.eval_on_file("../data/nq", "nq_dev_subset_v2.json", device=device) + +# Reader Top-N-Accuracy is the proportion of predicted answers that match with their corresponding correct answer including no_answers +print(f"Reader Top-{top_n}-Accuracy:", reader_eval_results["top_n_accuracy"]) +# Reader Top-1-Exact Match is the proportion of questions where the first predicted answer is exactly the same as the correct answer including no_answers +print("Reader Top-1-Exact Match:", reader_eval_results["EM"]) +# Reader Top-1-F1-Score is the average overlap between the first predicted answers and the correct answers including no_answers +print("Reader Top-1-F1-Score:", reader_eval_results["f1"]) +# Reader Top-N-Accuracy is the proportion of predicted answers that match with their corresponding correct answer excluding no_answers +print(f"Reader Top-{top_n}-Accuracy (without no_answers):", reader_eval_results["top_n_accuracy_text_answer"]) +# Reader Top-N-Exact Match is the proportion of questions where the predicted answer within the first n results is exactly the same as the correct answer excluding no_answers (no_answers are always present within top n). +print(f"Reader Top-{top_n}-Exact Match (without no_answers):", reader_eval_results["top_n_EM_text_answer"]) +# Reader Top-N-F1-Score is the average overlap between the top n predicted answers and the correct answers excluding no_answers (no_answers are always present within top n). 
+print(f"Reader Top-{top_n}-F1-Score (without no_answers):", reader_eval_results["top_n_f1_text_answer"]) +``` + +Just as a sanity check, we can compare the top-n exact_match and f1 metrics from `reader.eval()` with the exact_match and f1 from `pipeline.eval(add_isolated_node_eval=True)`. +These two approaches return the same values because pipeline.eval() calculates top-n metrics per default. Small discrepancies might occur due to string normalization in pipeline.eval()'s answer-to-label comparison. reader.eval() does not use string normalization. + + +```python +metrics = eval_result_with_upper_bounds.calculate_metrics(eval_mode="isolated") +print(metrics["Reader"]["exact_match"]) +print(metrics["Reader"]["f1"]) +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.7.0/_src/tutorials/tutorials/6.md b/docs/v1.7.0/_src/tutorials/tutorials/6.md new file mode 100644 index 0000000000..85ef9a53fe --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/6.md @@ -0,0 +1,251 @@ + + +# Better Retrieval via "Embedding Retrieval" + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_Embedding_Retrieval.ipynb) + +### Importance of Retrievers + +The Retriever has a huge impact on the performance of our overall search pipeline. + + +### Different types of Retrievers +#### Sparse +Family of algorithms based on counting the occurrences of words (bag-of-words) resulting in very sparse vectors with length = vocab size. + +**Examples**: BM25, TF-IDF + +**Pros**: Simple, fast, well explainable + +**Cons**: Relies on exact keyword matches between query and text + + +#### Dense +These retrievers use neural network models to create "dense" embedding vectors. Within this family, there are two different approaches: + +a) Single encoder: Use a **single model** to embed both the query and the passage. +b) Dual-encoder: Use **two models**, one to embed the query and one to embed the passage. + +**Examples**: REALM, DPR, Sentence-Transformers + +**Pros**: Captures semantic similarity instead of "word matches" (for example, synonyms, related topics). 
+ +**Cons**: Computationally more heavy to use, initial training of the model (though this is less of an issue nowadays as many pre-trained models are available and most of the time, it's not needed to train the model). + + +### Embedding Retrieval + +In this Tutorial, we use an `EmbeddingRetriever` with [Sentence Transformers](https://www.sbert.net/index.html) models. + +These models are trained to embed similar sentences close to each other in a shared embedding space. + +Some models have been fine-tuned on massive Information Retrieval data and can be used to retrieve documents based on a short query (for example, `multi-qa-mpnet-base-dot-v1`). There are others that are more suited to semantic similarity tasks where you are trying to find the most similar documents to a given document (for example, `all-mpnet-base-v2`). There are even models that are multilingual (for example, `paraphrase-multilingual-mpnet-base-v2`). For a good overview of different models with their evaluation metrics, see the [Pretrained Models](https://www.sbert.net/docs/pretrained_models.html#) in the Sentence Transformers documentation. + +*Use this* [link](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial6_Better_Retrieval_via_Embedding_Retrieval.ipynb) *to open the notebook in Google Colab.* + + +### Prepare the Environment + +#### Colab: Enable the GPU Runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + + +```python +# Install the latest release of Haystack in your own environment +#! 
pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss] +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. +Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + + +```python +from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers +from haystack.nodes import FARMReader, TransformersReader
```` + +### Document Store + +#### Option 1: FAISS + +FAISS is a library for efficient similarity search on a cluster of dense vectors. +The `FAISSDocumentStore` uses a SQL (SQLite in-memory by default) database under-the-hood +to store the document text and other metadata. The vector embeddings of the text are +indexed on a FAISS Index that later is queried for searching answers. +The default flavour of FAISSDocumentStore is "Flat" but can also be set to "HNSW" for +faster search at the expense of some accuracy. Just set the faiss_index_factory_str argument in the constructor. +For more info on which suits your use case: https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index + + +```python +from haystack.document_stores import FAISSDocumentStore + +document_store = FAISSDocumentStore(faiss_index_factory_str="Flat") +``` + +#### Option 2: Milvus + +Milvus is an open source database library that is also optimized for vector similarity searches like FAISS.
+Like FAISS it has both a "Flat" and "HNSW" mode but it outperforms FAISS when it comes to dynamic data management. +It does require a little more setup, however, as it is run through Docker and requires the setup of some config files. +See [their docs](https://milvus.io/docs/v1.0.0/milvus_docker-cpu.md) for more details. + + +```python +# Milvus cannot be run on COlab, so this cell is commented out. +# To run Milvus you need Docker (versions below 2.0.0) or a docker-compose (versions >= 2.0.0), neither of which is available on Colab. +# See Milvus' documentation for more details: https://milvus.io/docs/install_standalone-docker.md + +# !pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[milvus] + +# from haystack.utils import launch_milvus +# from haystack.document_stores import MilvusDocumentStore + +# launch_milvus() +# document_store = MilvusDocumentStore() +``` + +### Cleaning & indexing documents + +Similarly to the previous tutorials, we download, convert and index some Game of Thrones articles to our DocumentStore + + +```python +# Let's first get some files that we want to use +doc_dir = "data/tutorial6" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# Convert files to dicts +docs = convert_files_to_docs(dir_path=doc_dir, clean_func=clean_wiki_text, split_paragraphs=True) + +# Now, let's write the dicts containing documents to our DB. +document_store.write_documents(docs) +``` + +### Initialize Retriever, Reader & Pipeline + +#### Retriever + +**Here:** We use an `EmbeddingRetriever`. 
+ +**Alternatives:** + +- `BM25Retriever` with custom queries (for example, boosting) and filters +- `DensePassageRetriever` which uses two encoder models, one to embed the query and one to embed the passage, and then compares the embedding for retrieval +- `TfidfRetriever` in combination with a SQL or InMemory Document store for simple prototyping and debugging + + +```python +from haystack.nodes import EmbeddingRetriever + +retriever = EmbeddingRetriever( + document_store=document_store, + embedding_model="sentence-transformers/multi-qa-mpnet-base-dot-v1", + model_format="sentence_transformers", +) +# Important: +# Now that we initialized the Retriever, we need to call update_embeddings() to iterate over all +# previously indexed documents and update their embedding representation. +# While this can be a time consuming operation (depending on the corpus size), it only needs to be done once. +# At query time, we only need to embed the query and compare it to the existing document embeddings, which is very fast. +document_store.update_embeddings(retriever) +``` + +#### Reader + +Similar to previous Tutorials we now initialize our reader. + +Here we use a FARMReader with the *deepset/roberta-base-squad2* model (see: https://huggingface.co/deepset/roberta-base-squad2) + + + +##### FARMReader + + +```python +# Load a local model or any of the QA models on +# Hugging Face's model hub (https://huggingface.co/models) + +reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True) +``` + +### Pipeline + +With a Haystack `Pipeline` you can stick together your building blocks to a search pipeline. +Under the hood, `Pipelines` are Directed Acyclic Graphs (DAGs) that you can easily customize for your own use cases. +To speed things up, Haystack also comes with a few predefined Pipelines. One of them is the `ExtractiveQAPipeline` that combines a retriever and a reader to answer our questions.
+You can learn more about `Pipelines` in the [docs](https://haystack.deepset.ai/docs/latest/pipelinesmd). + + +```python +from haystack.pipelines import ExtractiveQAPipeline + +pipe = ExtractiveQAPipeline(reader, retriever) +``` + +## Voilà! Ask a question! + + +```python +# You can configure how many candidates the reader and retriever shall return +# The higher top_k for retriever, the better (but also the slower) your answers. +prediction = pipe.run( + query="Who created the Dothraki vocabulary?", params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}} +) +``` + + +```python +print_answers(prediction, details="minimum") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! + +Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.7.0/_src/tutorials/tutorials/7.md b/docs/v1.7.0/_src/tutorials/tutorials/7.md new file mode 100644 index 0000000000..c22882b00d --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/7.md @@ -0,0 +1,204 @@ + + +# Generative QA with "Retrieval-Augmented Generation" + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial7_RAG_Generator.ipynb) + +While extractive QA highlights the span of text that answers a query, 
+generative QA can return a novel text answer that it has composed. +In this tutorial, you will learn how to set up a generative system using the +[RAG model](https://arxiv.org/abs/2005.11401) which conditions the +answer generator on a set of retrieved documents. + +### Prepare environment + +#### Colab: Enable the GPU runtime +Make sure you enable the GPU runtime to experience decent speed in this tutorial. +**Runtime -> Change Runtime type -> Hardware accelerator -> GPU** + + + + +```python +# Make sure you have a GPU running +!nvidia-smi +``` + +Here are the packages and imports that we'll need: + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,faiss] +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. +Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + + +```python +from typing import List +import requests +import pandas as pd +from haystack import Document +from haystack.document_stores import FAISSDocumentStore +from haystack.nodes import RAGenerator, DensePassageRetriever +from haystack.utils import fetch_archive_from_http +``` + +Let's download a csv containing some sample text and preprocess the data. 
+ + + +```python +# Download sample +doc_dir = "data/tutorial7/" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/small_generator_dataset.csv.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) + +# Create dataframe with columns "title" and "text" +df = pd.read_csv(f"{doc_dir}/small_generator_dataset.csv", sep=",") +# Minimal cleaning +df.fillna(value="", inplace=True) + +print(df.head()) +``` + +We can cast our data into Haystack Document objects. +Alternatively, we can also just use dictionaries with "text" and "meta" fields + + +```python +# Use data to initialize Document objects +titles = list(df["title"].values) +texts = list(df["text"].values) +documents: List[Document] = [] +for title, text in zip(titles, texts): + documents.append(Document(content=text, meta={"name": title or ""})) +``` + +Here we initialize the FAISSDocumentStore, DensePassageRetriever and RAGenerator. +FAISS is chosen here since it is optimized vector storage. + + +```python +# Initialize FAISS document store. +# Set `return_embedding` to `True`, so generator doesn't have to perform re-embedding +document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True) + +# Initialize DPR Retriever to encode documents, encode question and query documents +retriever = DensePassageRetriever( + document_store=document_store, + query_embedding_model="facebook/dpr-question_encoder-single-nq-base", + passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base", + use_gpu=True, + embed_title=True, +) + +# Initialize RAG Generator +generator = RAGenerator( + model_name_or_path="facebook/rag-token-nq", + use_gpu=True, + top_k=1, + max_length=200, + min_length=2, + embed_title=True, + num_beams=2, +) +``` + +We write documents to the DocumentStore, first by deleting any remaining documents then calling `write_documents()`. +The `update_embeddings()` method uses the retriever to create an embedding for each document. 
+ + +```python +# Delete existing documents in documents store +document_store.delete_documents() + +# Write documents to document store +document_store.write_documents(documents) + +# Add documents embeddings to index +document_store.update_embeddings(retriever=retriever) +``` + +Here are our questions: + + +```python +QUESTIONS = [ + "who got the first nobel prize in physics", + "when is the next deadpool movie being released", + "which mode is used for short wave broadcast service", + "who is the owner of reading football club", + "when is the next scandal episode coming out", + "when is the last time the philadelphia won the superbowl", + "what is the most current adobe flash player version", + "how many episodes are there in dragon ball z", + "what is the first step in the evolution of the eye", + "where is gall bladder situated in human body", + "what is the main mineral in lithium batteries", + "who is the president of usa right now", + "where do the greasers live in the outsiders", + "panda is a national animal of which country", + "what is the name of manchester united stadium", +] +``` + +Now let's run our system! +The retriever will pick out a small subset of documents that it finds relevant. +These are used to condition the generator as it generates the answer. +What it should return then are novel text spans that form an answer to your question! + + +```python +# Or alternatively use the Pipeline class +from haystack.pipelines import GenerativeQAPipeline +from haystack.utils import print_answers + +pipe = GenerativeQAPipeline(generator=generator, retriever=retriever) +for question in QUESTIONS: + res = pipe.run(query=question, params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 5}}) + print_answers(res, details="minimum") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source!
+Our focus: Industry specific language models & large scale QA systems. + +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.7.0/_src/tutorials/tutorials/8.md b/docs/v1.7.0/_src/tutorials/tutorials/8.md new file mode 100644 index 0000000000..cb78d48b11 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/8.md @@ -0,0 +1,224 @@ + + +# Preprocessing + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial8_Preprocessing.ipynb) + +Haystack includes a suite of tools to extract text from different file types, normalize white space +and split text into smaller pieces to optimize retrieval. +These data preprocessing steps can have a big impact on the system's performance and effective handling of data is key to getting the most out of Haystack. + +Ultimately, Haystack expects data to be provided as a list of documents in the following dictionary format: +``` python +docs = [ + { + 'content': DOCUMENT_TEXT_HERE, + 'meta': {'name': DOCUMENT_NAME, ...} + }, ... +] +``` + +This tutorial will show you all the tools that Haystack provides to help you cast your data into this format. + + +```python +# Install the latest release of Haystack in your own environment +#!
pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab,ocr] + +# For Colab/linux based machines +!wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-linux-4.04.tar.gz +!tar -xvf xpdf-tools-linux-4.04.tar.gz && sudo cp xpdf-tools-linux-4.04/bin64/pdftotext /usr/local/bin + +# For Macos machines +# !wget --no-check-certificate https://dl.xpdfreader.com/xpdf-tools-mac-4.03.tar.gz +# !tar -xvf xpdf-tools-mac-4.03.tar.gz && sudo cp xpdf-tools-mac-4.03/bin64/pdftotext /usr/local/bin +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. +Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + + +```python +# Here are the imports we need +from haystack.nodes import TextConverter, PDFToTextConverter, DocxToTextConverter, PreProcessor +from haystack.utils import convert_files_to_docs, fetch_archive_from_http +``` + + +```python +# This fetches some sample files to work with + +doc_dir = "data/tutorial8" +s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/preprocessing_tutorial8.zip" +fetch_archive_from_http(url=s3_url, output_dir=doc_dir) +``` + +## Converters + +Haystack's converter classes are designed to help you turn files on your computer into the documents +that can be processed by the Haystack pipeline. +There are file converters for txt, pdf, docx files as well as a converter that is powered by Apache Tika. 
+The parameter `valid_languages` does not convert files to the target language, but checks if the conversion worked as expected. + + +```python +# Here are some examples of how you would use file converters + +converter = TextConverter(remove_numeric_tables=True, valid_languages=["en"]) +doc_txt = converter.convert(file_path="data/tutorial8/classics.txt", meta=None)[0] + +converter = PDFToTextConverter(remove_numeric_tables=True, valid_languages=["en"]) +doc_pdf = converter.convert(file_path="data/tutorial8/bert.pdf", meta=None)[0] + +converter = DocxToTextConverter(remove_numeric_tables=False, valid_languages=["en"]) +doc_docx = converter.convert(file_path="data/tutorial8/heavy_metal.docx", meta=None)[0] +``` + + +```python +# Haystack also has a convenience function that will automatically apply the right converter to each file in a directory. + +all_docs = convert_files_to_docs(dir_path=doc_dir) +``` + +## PreProcessor + +The PreProcessor class is designed to help you clean text and split text into sensible units. +File splitting can have a very significant impact on the system's performance and is absolutely mandatory for Dense Passage Retrieval models. +In general, we recommend you split the text from your files into small documents of around 100 words for dense retrieval methods +and no more than 10,000 words for sparse methods. +Have a look at the [Preprocessing](https://haystack.deepset.ai/docs/latest/preprocessingmd) +and [Optimization](https://haystack.deepset.ai/docs/latest/optimizationmd) pages on our website for more details. + + +```python +# This is a default usage of the PreProcessor. +# Here, it performs cleaning of consecutive whitespaces +# and splits a single large document into smaller documents. 
+# Each document is up to 100 words long and document breaks cannot fall in the middle of sentences +# Note how the single document passed into the preprocessor gets split into 5 smaller documents + +preprocessor = PreProcessor( + clean_empty_lines=True, + clean_whitespace=True, + clean_header_footer=False, + split_by="word", + split_length=100, + split_respect_sentence_boundary=True, +) +docs_default = preprocessor.process([doc_txt]) +print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}") +``` + +## Cleaning + +- `clean_empty_lines` will normalize 3 or more consecutive empty lines to be just two empty lines +- `clean_whitespace` will remove any whitespace at the beginning or end of each line in the text +- `clean_header_footer` will remove any long header or footer texts that are repeated on each page + +## Splitting +By default, the PreProcessor will respect sentence boundaries, meaning that documents will not start or end +midway through a sentence. +This will help reduce the possibility of answer phrases being split between two documents. +This feature can be turned off by setting `split_respect_sentence_boundary=False`. + + +```python +# Not respecting sentence boundary vs respecting sentence boundary

preprocessor_nrsb = PreProcessor(split_respect_sentence_boundary=False) +docs_nrsb = preprocessor_nrsb.process([doc_txt]) + +print("RESPECTING SENTENCE BOUNDARY") +end_text = docs_default[0].content[-50:] +print('End of document: "...' + end_text + '"') +print() +print("NOT RESPECTING SENTENCE BOUNDARY") +end_text_nrsb = docs_nrsb[0].content[-50:] +print('End of document: "...' + end_text_nrsb + '"') +``` + +A commonly used strategy to split long documents, especially in the field of Question Answering, +is the sliding window approach. If `split_length=10` and `split_overlap=3`, your documents will look like this: + +- doc1 = words[0:10] +- doc2 = words[7:17] +- doc3 = words[14:24] +- ... + +You can use this strategy by following the code below.
+ + +```python +# Sliding window approach + +preprocessor_sliding_window = PreProcessor(split_overlap=3, split_length=10, split_respect_sentence_boundary=False) +docs_sliding_window = preprocessor_sliding_window.process([doc_txt]) + +doc1 = docs_sliding_window[0].content[:200] +doc2 = docs_sliding_window[1].content[:100] +doc3 = docs_sliding_window[2].content[:100] + +print('Document 1: "' + doc1 + '..."') +print('Document 2: "' + doc2 + '..."') +print('Document 3: "' + doc3 + '..."') +``` + +## Bringing it all together + + +```python +all_docs = convert_files_to_docs(dir_path=doc_dir) +preprocessor = PreProcessor( + clean_empty_lines=True, + clean_whitespace=True, + clean_header_footer=False, + split_by="word", + split_length=100, + split_respect_sentence_boundary=True, +) +docs = preprocessor.process(all_docs) + +print(f"n_files_input: {len(all_docs)}\nn_docs_output: {len(docs)}") +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) + diff --git a/docs/v1.7.0/_src/tutorials/tutorials/9.md b/docs/v1.7.0/_src/tutorials/tutorials/9.md new file mode 100644 index 0000000000..d77e5da173 --- /dev/null +++ b/docs/v1.7.0/_src/tutorials/tutorials/9.md @@ -0,0 +1,263 @@ + + +# Training Your Own "Dense Passage Retrieval" Model + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/deepset-ai/haystack/blob/master/tutorials/Tutorial9_DPR_training.ipynb) + +Haystack contains all the tools needed to train your own Dense Passage Retrieval model. +This tutorial will guide you through the steps required to create a retriever that is specifically tailored to your domain. + + +```python +# Install the latest release of Haystack in your own environment +#! pip install farm-haystack + +# Install the latest master of Haystack +!pip install --upgrade pip +!pip install git+https://github.com/deepset-ai/haystack.git#egg=farm-haystack[colab] +``` + +## Logging + +We configure how logging messages should be displayed and which log level should be used before importing Haystack. 
+Example log message: +INFO - haystack.utils.preprocessing - Converting data/tutorial1/218_Olenna_Tyrell.txt +Default log level in basicConfig is WARNING so the explicit parameter is not necessary but can be changed easily: + + +```python +import logging + +logging.basicConfig(format="%(levelname)s - %(name)s - %(message)s", level=logging.WARNING) +logging.getLogger("haystack").setLevel(logging.INFO) +``` + + +```python +# Here are some imports that we'll need + +from haystack.nodes import DensePassageRetriever +from haystack.utils import fetch_archive_from_http +from haystack.document_stores import InMemoryDocumentStore +``` + +## Training Data + +DPR training performed using Information Retrieval data. +More specifically, you want to feed in pairs of queries and relevant documents. + +To train a model, we will need a dataset that has the same format as the original DPR training data. +Each data point in the dataset should have the following dictionary structure. + +``` python + { + "dataset": str, + "question": str, + "answers": list of str + "positive_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str} + "negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str} + "hard_negative_ctxs": list of dictionaries of format {'title': str, 'text': str, 'score': int, 'title_score': int, 'passage_id': str} + } +``` + +`positive_ctxs` are context passages which are relevant to the query. +In some datasets, queries might have more than one positive context +in which case you can set the `num_positives` parameter to be higher than the default 1. +Note that `num_positives` needs to be lower or equal to the minimum number of `positive_ctxs` for queries in your data. +If you have an unequal number of positive contexts per example, +you might want to generate some soft labels by retrieving similar contexts which contain the answer. 
+ +DPR is standardly trained using a method known as in-batch negatives. +This means that positive contexts for a given query are treated as negative contexts for the other queries in the batch. +Doing so allows for a high degree of computational efficiency, thus allowing the model to be trained on large amounts of data. + +`negative_ctxs` is not actually used in Haystack's DPR training so we recommend you set it to an empty list. +They were used by the original DPR authors in an experiment to compare it against the in-batch negatives method. + +`hard_negative_ctxs` are passages that are not relevant to the query. +In the original DPR paper, these are fetched using a retriever to find the most relevant passages to the query. +Passages which contain the answer text are filtered out. + +If you'd like to convert your SQuAD format data into something that can train a DPR model, +check out the utility script at [`haystack/utils/squad_to_dpr.py`](https://github.com/deepset-ai/haystack/blob/master/haystack/utils/squad_to_dpr.py) + +## Using Question Answering Data + +Question Answering datasets can sometimes be used as training data. +Google's Natural Questions dataset, is sufficiently large +and contains enough unique passages, that it can be converted into a DPR training set. +This is done simply by considering answer containing passages as relevant documents to the query. + +The SQuAD dataset, however, is not as suited to this use case since its question and answer pairs +are created on only a very small slice of wikipedia documents. + +## Download Original DPR Training Data + +WARNING: These files are large! The train set is 7.4GB and the dev set is 800MB + +We can download the original DPR training data with the following cell. +Note that this data is probably only useful if you are trying to train from scratch. 
+ + +```python +# Download original DPR data +# WARNING: the train set is 7.4GB and the dev set is 800MB + +doc_dir = "data/tutorial9" + +s3_url_train = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-train.json.gz" +s3_url_dev = "https://dl.fbaipublicfiles.com/dpr/data/retriever/biencoder-nq-dev.json.gz" + +fetch_archive_from_http(s3_url_train, output_dir=doc_dir + "/train") +fetch_archive_from_http(s3_url_dev, output_dir=doc_dir + "/dev") +``` + +## Option 1: Training DPR from Scratch + +The default variables that we provide below are chosen to train a DPR model from scratch. +Here, both passage and query embedding models are initialized using BERT base +and the model is trained using Google's Natural Questions dataset (in a format specialised for DPR). + +If you are working in a language other than English, +you will want to initialize the passage and query embedding models with a language model that supports your language +and also provide a dataset in your language. + + +```python +# Here are the variables to specify our training data, the models that we use to initialize DPR +# and the directory where we'll be saving the model + +train_filename = "train/biencoder-nq-train.json" +dev_filename = "dev/biencoder-nq-dev.json" + +query_model = "bert-base-uncased" +passage_model = "bert-base-uncased" + +save_dir = "../saved_models/dpr" +``` + +## Option 2: Finetuning DPR + +If you have your own domain specific question answering or information retrieval dataset, +you might instead be interested in finetuning a pretrained DPR model. +In this case, you would initialize both query and passage models using the original pretrained model. 
+You will want to load something like this set of variables instead of the ones above + + +```python +# Here are the variables you might want to use instead of the set above +# in order to perform pretraining + +doc_dir = "PATH_TO_YOUR_DATA_DIR" +train_filename = "TRAIN_FILENAME" +dev_filename = "DEV_FILENAME" + +query_model = "facebook/dpr-question_encoder-single-nq-base" +passage_model = "facebook/dpr-ctx_encoder-single-nq-base" + +save_dir = "../saved_models/dpr" +``` + +## Initialization + +Here we want to initialize our model either with plain language model weights for training from scratch +or else with pretrained DPR weights for finetuning. +We follow the [original DPR parameters](https://github.com/facebookresearch/DPR#best-hyperparameter-settings) +for their max passage length but set max query length to 64 since queries are very rarely longer. + + +```python +## Initialize DPR model + +retriever = DensePassageRetriever( + document_store=InMemoryDocumentStore(), + query_embedding_model=query_model, + passage_embedding_model=passage_model, + max_seq_len_query=64, + max_seq_len_passage=256, +) +``` + +## Training + +Let's start training and save our trained model! + +On a V100 GPU, you can fit up to batch size 16 so we set gradient accumulation steps to 8 in order +to simulate the batch size 128 of the original DPR experiment. + +When `embed_title=True`, the document title is prepended to the input text sequence with a `[SEP]` token +between it and document text. 
+ +When training from scratch with the above variables, 1 epoch takes around an hour and we reached the following performance: + +``` +loss: 0.046580662854042276 +task_name: text_similarity +acc: 0.992524064068483 +f1: 0.8804297774366846 +acc_and_f1: 0.9364769207525838 +average_rank: 0.19631619339984652 +report: + precision recall f1-score support + +hard_negative 0.9961 0.9961 0.9961 201887 + positive 0.8804 0.8804 0.8804 6515 + + accuracy 0.9925 208402 + macro avg 0.9383 0.9383 0.9383 208402 + weighted avg 0.9925 0.9925 0.9925 208402 + +``` + + +```python +# Start training our model and save it when it is finished + +retriever.train( + data_dir=doc_dir, + train_filename=train_filename, + dev_filename=dev_filename, + test_filename=dev_filename, + n_epochs=1, + batch_size=16, + grad_acc_steps=8, + save_dir=save_dir, + evaluate_every=3000, + embed_title=True, + num_positives=1, + num_hard_negatives=1, +) +``` + +## Loading + +Loading our newly trained model is simple! + + +```python +reloaded_retriever = DensePassageRetriever.load(load_dir=save_dir, document_store=None) +``` + +## About us + +This [Haystack](https://github.com/deepset-ai/haystack/) notebook was made with love by [deepset](https://deepset.ai/) in Berlin, Germany + +We bring NLP to the industry via open source! +Our focus: Industry specific language models & large scale QA systems. 
+ +Some of our other work: +- [German BERT](https://deepset.ai/german-bert) +- [GermanQuAD and GermanDPR](https://deepset.ai/germanquad) +- [FARM](https://github.com/deepset-ai/FARM) + +Get in touch: +[Twitter](https://twitter.com/deepset_ai) | [LinkedIn](https://www.linkedin.com/company/deepset-ai/) | [Slack](https://haystack.deepset.ai/community/join) | [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | [Website](https://deepset.ai) + +By the way: [we're hiring!](https://www.deepset.ai/jobs) diff --git a/docs/v1.7.0/make.bat b/docs/v1.7.0/make.bat new file mode 100644 index 0000000000..7d79440912 --- /dev/null +++ b/docs/v1.7.0/make.bat @@ -0,0 +1,38 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=_src/ +set BUILDDIR=build +set SPHINXFLAGS=-a -n -A local=1 +set SPHINXOPTS=%SPHINXFLAGS% %SOURCE% +set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -b %1 %ALLSPHINXOPTS% %BUILDDIR%/%1 +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/haystack/json-schemas/haystack-pipeline-1.7.0.schema.json b/haystack/json-schemas/haystack-pipeline-1.7.0.schema.json new file mode 100644 index 0000000000..b80b7f87c5 --- /dev/null +++ b/haystack/json-schemas/haystack-pipeline-1.7.0.schema.json @@ -0,0 +1,5103 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema", + "$id": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.7.0.schema.json", + "title": "Haystack Pipeline", + "description": "Haystack Pipeline YAML file describing the nodes of the pipelines. For more info read the docs at: https://haystack.deepset.ai/components/pipelines#yaml-file-definitions", + "type": "object", + "properties": { + "version": { + "title": "Version", + "description": "Version of the Haystack Pipeline file.", + "type": "string", + "const": "1.7.0" + }, + "extras": { + "title": "Additional properties group", + "description": "To be specified only if contains special pipelines (for example, if this is a Ray pipeline)", + "type": "string", + "enum": [ + "ray" + ] + }, + "components": { + "title": "Components", + "description": "Component nodes and their configurations, to later be used in the pipelines section.
Define here all the building blocks for the pipelines.", + "type": "array", + "items": { + "anyOf": [ + { + "$ref": "#/definitions/DeepsetCloudDocumentStoreComponent" + }, + { + "$ref": "#/definitions/ElasticsearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/FAISSDocumentStoreComponent" + }, + { + "$ref": "#/definitions/GraphDBKnowledgeGraphComponent" + }, + { + "$ref": "#/definitions/InMemoryDocumentStoreComponent" + }, + { + "$ref": "#/definitions/InMemoryKnowledgeGraphComponent" + }, + { + "$ref": "#/definitions/Milvus2DocumentStoreComponent" + }, + { + "$ref": "#/definitions/OpenDistroElasticsearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/OpenSearchDocumentStoreComponent" + }, + { + "$ref": "#/definitions/PineconeDocumentStoreComponent" + }, + { + "$ref": "#/definitions/SQLDocumentStoreComponent" + }, + { + "$ref": "#/definitions/WeaviateDocumentStoreComponent" + }, + { + "$ref": "#/definitions/AnswerToSpeechComponent" + }, + { + "$ref": "#/definitions/AzureConverterComponent" + }, + { + "$ref": "#/definitions/BM25RetrieverComponent" + }, + { + "$ref": "#/definitions/CrawlerComponent" + }, + { + "$ref": "#/definitions/DensePassageRetrieverComponent" + }, + { + "$ref": "#/definitions/Docs2AnswersComponent" + }, + { + "$ref": "#/definitions/DocumentToSpeechComponent" + }, + { + "$ref": "#/definitions/DocxToTextConverterComponent" + }, + { + "$ref": "#/definitions/ElasticsearchFilterOnlyRetrieverComponent" + }, + { + "$ref": "#/definitions/ElasticsearchRetrieverComponent" + }, + { + "$ref": "#/definitions/EmbeddingRetrieverComponent" + }, + { + "$ref": "#/definitions/EntityExtractorComponent" + }, + { + "$ref": "#/definitions/EvalAnswersComponent" + }, + { + "$ref": "#/definitions/EvalDocumentsComponent" + }, + { + "$ref": "#/definitions/FARMReaderComponent" + }, + { + "$ref": "#/definitions/FileTypeClassifierComponent" + }, + { + "$ref": "#/definitions/FilterRetrieverComponent" + }, + { + "$ref": 
"#/definitions/ImageToTextConverterComponent" + }, + { + "$ref": "#/definitions/JoinAnswersComponent" + }, + { + "$ref": "#/definitions/JoinDocumentsComponent" + }, + { + "$ref": "#/definitions/MarkdownConverterComponent" + }, + { + "$ref": "#/definitions/MultihopEmbeddingRetrieverComponent" + }, + { + "$ref": "#/definitions/OpenAIAnswerGeneratorComponent" + }, + { + "$ref": "#/definitions/PDFToTextConverterComponent" + }, + { + "$ref": "#/definitions/PDFToTextOCRConverterComponent" + }, + { + "$ref": "#/definitions/ParsrConverterComponent" + }, + { + "$ref": "#/definitions/PreProcessorComponent" + }, + { + "$ref": "#/definitions/PseudoLabelGeneratorComponent" + }, + { + "$ref": "#/definitions/QuestionGeneratorComponent" + }, + { + "$ref": "#/definitions/RAGeneratorComponent" + }, + { + "$ref": "#/definitions/RCIReaderComponent" + }, + { + "$ref": "#/definitions/RouteDocumentsComponent" + }, + { + "$ref": "#/definitions/SentenceTransformersRankerComponent" + }, + { + "$ref": "#/definitions/Seq2SeqGeneratorComponent" + }, + { + "$ref": "#/definitions/SklearnQueryClassifierComponent" + }, + { + "$ref": "#/definitions/TableReaderComponent" + }, + { + "$ref": "#/definitions/TableTextRetrieverComponent" + }, + { + "$ref": "#/definitions/Text2SparqlRetrieverComponent" + }, + { + "$ref": "#/definitions/TextConverterComponent" + }, + { + "$ref": "#/definitions/TfidfRetrieverComponent" + }, + { + "$ref": "#/definitions/TikaConverterComponent" + }, + { + "$ref": "#/definitions/TransformersDocumentClassifierComponent" + }, + { + "$ref": "#/definitions/TransformersQueryClassifierComponent" + }, + { + "$ref": "#/definitions/TransformersReaderComponent" + }, + { + "$ref": "#/definitions/TransformersSummarizerComponent" + }, + { + "$ref": "#/definitions/TransformersTranslatorComponent" + } + ] + }, + "required": [ + "type", + "name" + ], + "additionalProperties": true + }, + "pipelines": { + "title": "Pipelines", + "description": "Multiple pipelines can be defined using the 
components from the same YAML file.", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Name of the pipeline.", + "type": "string" + }, + "nodes": { + "title": "Nodes", + "description": "Nodes to be used by this particular pipeline", + "type": "array", + "items": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "The name of this particular node in the pipeline. This should be one of the names from the components defined in the same file.", + "type": "string" + }, + "inputs": { + "title": "Inputs", + "description": "Input parameters for this node.", + "type": "array", + "items": { + "type": "string" + } + }, + "serve_deployment_kwargs": { + "title": "serve_deployment_kwargs", + "description": "Arguments to be passed to the Ray Serve `deployment()` method (only for Ray pipelines)", + "type": "object", + "properties": { + "num_replicas": { + "description": "How many replicas Ray should create for this node (only for Ray pipelines)", + "type": "integer" + }, + "version": { + "type": "string" + }, + "prev_version": { + "type": "string" + }, + "init_args": { + "type": "array" + }, + "init_kwargs": { + "type": "object" + }, + "router_prefix": { + "type": "string" + }, + "ray_actor_options": { + "type": "object" + }, + "user_config": { + "type": {} + }, + "max_concurrent_queries": { + "type": "integer" + } + }, + "additionalProperties": true + } + }, + "required": [ + "name", + "inputs" + ], + "additionalProperties": false + }, + "required": [ + "name", + "nodes" + ], + "additionalProperties": false + }, + "additionalProperties": false + }, + "additionalProperties": false + } + } + }, + "required": [ + "version", + "components", + "pipelines" + ], + "additionalProperties": false, + "oneOf": [ + { + "not": { + "required": [ + "extras" + ] + }, + "properties": { + "pipelines": { + "title": "Pipelines", + "items": { + "properties": { + "nodes": { + "items": { + "not": { 
+ "required": [ + "serve_deployment_kwargs" + ] + } + } + } + } + } + } + } + }, + { + "properties": { + "extras": { + "enum": [ + "ray" + ] + } + }, + "required": [ + "extras" + ] + } + ], + "definitions": { + "DeepsetCloudDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "DeepsetCloudDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "api_key": { + "title": "Api Key", + "type": "string" + }, + "workspace": { + "title": "Workspace", + "default": "default", + "type": "string" + }, + "index": { + "title": "Index", + "type": "string" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "api_endpoint": { + "title": "Api Endpoint", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "label_index": { + "title": "Label Index", + "default": "default", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ElasticsearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ElasticsearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 9200, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "username": { + "title": "Username", + "default": "", + "type": "string" + }, + "password": { + "title": "Password", + "default": "", + "type": "string" + }, + "api_key_id": { + "title": "Api Key Id", + "type": "string" + }, + "api_key": { + "title": "Api Key", + "type": "string" + }, + "aws4auth": { + "title": "Aws4Auth" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "search_fields": { + "title": "Search Fields", + "default": "content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": {} + } + ] + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "custom_mapping": { + "title": "Custom Mapping", + "type": "object" + }, + "excluded_meta_data": { + "title": "Excluded Meta Data", + "type": "array", + "items": {} + }, + "analyzer": { + "title": "Analyzer", + "default": "standard", + "type": "string" + }, + "scheme": { + "title": "Scheme", + "default": "http", + 
"type": "string" + }, + "ca_certs": { + "title": "Ca Certs", + "type": "string" + }, + "verify_certs": { + "title": "Verify Certs", + "default": true, + "type": "boolean" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + }, + "create_index": { + "title": "Create Index", + "default": true, + "type": "boolean" + }, + "refresh_type": { + "title": "Refresh Type", + "default": "wait_for", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "timeout": { + "title": "Timeout", + "default": 30, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "flat", + "type": "string" + }, + "scroll": { + "title": "Scroll", + "default": "1d", + "type": "string" + }, + "skip_missing_embeddings": { + "title": "Skip Missing Embeddings", + "default": true, + "type": "boolean" + }, + "synonyms": { + "title": "Synonyms", + "type": "array", + "items": {} + }, + "synonym_type": { + "title": "Synonym Type", + "default": "synonym", + "type": "string" + }, + "use_system_proxy": { + "title": "Use System Proxy", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "FAISSDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "FAISSDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///faiss_document_store.db", + "type": "string" + }, + "vector_dim": { + "title": "Vector Dim", + "type": "integer" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "faiss_index_factory_str": { + "title": "Faiss Index Factory Str", + "default": "Flat", + "type": "string" + }, + "faiss_index": { + "title": "Faiss Index", + "type": "string", + "default": null + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "faiss_index_path": { + "title": "Faiss Index Path", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "faiss_config_path": { + "title": "Faiss Config Path", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + }, + "n_links": { + "title": "N Links", + "default": 64, + "type": "integer" + }, + "ef_search": { + "title": "Ef Search", + "default": 20, + "type": "integer" + }, + "ef_construction": { + "title": "Ef Construction", + "default": 80, + "type": "integer" + }, + 
"validate_index_sync": { + "title": "Validate Index Sync", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "GraphDBKnowledgeGraphComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "GraphDBKnowledgeGraph" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "localhost", + "type": "string" + }, + "port": { + "title": "Port", + "default": 7200, + "type": "integer" + }, + "username": { + "title": "Username", + "default": "", + "type": "string" + }, + "password": { + "title": "Password", + "default": "", + "type": "string" + }, + "index": { + "title": "Index", + "type": "string" + }, + "prefixes": { + "title": "Prefixes", + "default": "", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "InMemoryDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "InMemoryDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "scoring_batch_size": { + "title": "Scoring Batch Size", + "default": 500000, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "InMemoryKnowledgeGraphComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "InMemoryKnowledgeGraph" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "index": { + "title": "Index", + "default": "document", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "Milvus2DocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Milvus2DocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///", + "type": "string" + }, + "host": { + "title": "Host", + "default": "localhost", + "type": "string" + }, + "port": { + "title": "Port", + "default": "19530", + "type": "string" + }, + "connection_pool": { + "title": "Connection Pool", + "default": "SingletonThread", + "type": "string" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "vector_dim": { + "title": "Vector Dim", + "type": "integer" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "index_file_size": { + "title": "Index File Size", + "default": 1024, + "type": "integer" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "IVF_FLAT", + "type": "string" + }, + "index_param": { + "title": "Index Param", + "type": "object" + }, + 
"search_param": { + "title": "Search Param", + "type": "object" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "id_field": { + "title": "Id Field", + "default": "id", + "type": "string" + }, + "custom_fields": { + "title": "Custom Fields", + "type": "array", + "items": {} + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + }, + "consistency_level": { + "title": "Consistency Level", + "default": 0, + "type": "integer" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "OpenDistroElasticsearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "OpenDistroElasticsearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "scheme": { + "title": "Scheme", + "default": "https", + "type": "string" + }, + "username": { + "title": "Username", + "default": "admin", + "type": "string" + }, + "password": { + "title": "Password", + "default": "admin", + "type": "string" + }, + "host": { + "title": "Host", + "default": "localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 9200, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "api_key_id": { + "title": "Api Key Id", + "type": "string" + }, + "api_key": { + "title": "Api Key", + "type": "string" + }, + "aws4auth": { + "title": "Aws4Auth" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "search_fields": { + "title": "Search Fields", + "default": "content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": {} + } + ] + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "custom_mapping": { + "title": "Custom Mapping", + "type": "object" + }, + "excluded_meta_data": { + "title": "Excluded Meta Data", + "type": "array", + "items": {} + }, + "analyzer": { + "title": "Analyzer", + 
"default": "standard", + "type": "string" + }, + "ca_certs": { + "title": "Ca Certs", + "type": "string" + }, + "verify_certs": { + "title": "Verify Certs", + "default": false, + "type": "boolean" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + }, + "create_index": { + "title": "Create Index", + "default": true, + "type": "boolean" + }, + "refresh_type": { + "title": "Refresh Type", + "default": "wait_for", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "cosine", + "type": "string" + }, + "timeout": { + "title": "Timeout", + "default": 30, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "flat", + "type": "string" + }, + "scroll": { + "title": "Scroll", + "default": "1d", + "type": "string" + }, + "skip_missing_embeddings": { + "title": "Skip Missing Embeddings", + "default": true, + "type": "boolean" + }, + "synonyms": { + "title": "Synonyms", + "type": "array", + "items": {} + }, + "synonym_type": { + "title": "Synonym Type", + "default": "synonym", + "type": "string" + }, + "use_system_proxy": { + "title": "Use System Proxy", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "OpenSearchDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "OpenSearchDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "scheme": { + "title": "Scheme", + "default": "https", + "type": "string" + }, + "username": { + "title": "Username", + "default": "admin", + "type": "string" + }, + "password": { + "title": "Password", + "default": "admin", + "type": "string" + }, + "host": { + "title": "Host", + "default": "localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 9200, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "api_key_id": { + "title": "Api Key Id", + "type": "string" + }, + "api_key": { + "title": "Api Key", + "type": "string" + }, + "aws4auth": { + "title": "Aws4Auth" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "search_fields": { + "title": "Search Fields", + "default": "content", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": {} + } + ] + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "custom_mapping": { + "title": "Custom Mapping", + "type": "object" + }, + "excluded_meta_data": { + "title": "Excluded Meta Data", + "type": "array", + "items": {} + }, + "analyzer": { + "title": "Analyzer", + "default": 
"standard", + "type": "string" + }, + "ca_certs": { + "title": "Ca Certs", + "type": "string" + }, + "verify_certs": { + "title": "Verify Certs", + "default": false, + "type": "boolean" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + }, + "create_index": { + "title": "Create Index", + "default": true, + "type": "boolean" + }, + "refresh_type": { + "title": "Refresh Type", + "default": "wait_for", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "dot_product", + "type": "string" + }, + "timeout": { + "title": "Timeout", + "default": 30, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "flat", + "type": "string" + }, + "scroll": { + "title": "Scroll", + "default": "1d", + "type": "string" + }, + "skip_missing_embeddings": { + "title": "Skip Missing Embeddings", + "default": true, + "type": "boolean" + }, + "synonyms": { + "title": "Synonyms", + "type": "array", + "items": {} + }, + "synonym_type": { + "title": "Synonym Type", + "default": "synonym", + "type": "string" + }, + "use_system_proxy": { + "title": "Use System Proxy", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "PineconeDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "PineconeDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "api_key": { + "title": "Api Key", + "type": "string" + }, + "environment": { + "title": "Environment", + "default": "us-west1-gcp", + "type": "string" + }, + "sql_url": { + "title": "Sql Url", + "default": "sqlite:///pinecone_document_store.db", + "type": "string" + }, + "pinecone_index": { + "title": "Pinecone Index", + "type": "string", + "default": null + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "cosine", + "type": "string" + }, + "replicas": { + "title": "Replicas", + "default": 1, + "type": "integer" + }, + "shards": { + "title": "Shards", + "default": 1, + "type": "integer" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + }, + "metadata_config": { + "title": "Metadata Config", + "default": { + "indexed": [] + }, + "type": "object" + }, + "validate_index_sync": { + "title": "Validate Index Sync", + "default": true, + "type": "boolean" + } + }, + "required": [ + "api_key" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "SQLDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "SQLDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "url": { + "title": "Url", + "default": "sqlite://", + "type": "string" + }, + "index": { + "title": "Index", + "default": "document", + "type": "string" + }, + "label_index": { + "title": "Label Index", + "default": "label", + "type": "string" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "check_same_thread": { + "title": "Check Same Thread", + "default": false, + "type": "boolean" + }, + "isolation_level": { + "title": "Isolation Level", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "WeaviateDocumentStoreComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "WeaviateDocumentStore" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "host": { + "title": "Host", + "default": "http://localhost", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "port": { + "title": "Port", + "default": 8080, + "anyOf": [ + { + "type": "integer" + }, + { + "type": "array", + "items": { + "type": "integer" + } + } + ] + }, + "timeout_config": { + "title": "Timeout Config", + "default": [ + 5, + 15 + ], + "type": "array", + "items": {} + }, + "username": { + "title": "Username", + "type": "string" + }, + "password": { + "title": "Password", + "type": "string" + }, + "index": { + "title": "Index", + "default": "Document", + "type": "string" + }, + "embedding_dim": { + "title": "Embedding Dim", + "default": 768, + "type": "integer" + }, + "content_field": { + "title": "Content Field", + "default": "content", + "type": "string" + }, + "name_field": { + "title": "Name Field", + "default": "name", + "type": "string" + }, + "similarity": { + "title": "Similarity", + "default": "cosine", + "type": "string" + }, + "index_type": { + "title": "Index Type", + "default": "hnsw", + "type": "string" + }, + "custom_schema": { + "title": "Custom Schema", + "type": "object" + }, + "return_embedding": { + "title": "Return Embedding", + "default": false, + "type": "boolean" + }, + "embedding_field": { + "title": "Embedding Field", + "default": "embedding", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_documents": { + "title": "Duplicate Documents", + "default": "overwrite", + "type": "string" + }, + "recreate_index": { + "title": "Recreate Index", + "default": false, + "type": "boolean" + } + }, + 
"additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "AnswerToSpeechComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "AnswerToSpeech" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "espnet/kan-bayashi_ljspeech_vits", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "generated_audio_dir": { + "title": "Generated Audio Dir", + "default": "generated_audio_answers", + "type": "string", + "format": "path" + }, + "audio_params": { + "title": "Audio Params", + "type": "object" + }, + "transformers_params": { + "title": "Transformers Params", + "type": "object" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "AzureConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "AzureConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "endpoint": { + "title": "Endpoint", + "type": "string" + }, + "credential_key": { + "title": "Credential Key", + "type": "string" + }, + "model_id": { + "title": "Model Id", + "default": "prebuilt-document", + "type": "string" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "save_json": { + "title": "Save Json", + "default": false, + "type": "boolean" + }, + "preceding_context_len": { + "title": "Preceding Context Len", + "default": 3, + "type": "integer" + }, + "following_context_len": { + "title": "Following Context Len", + "default": 3, + "type": "integer" + }, + "merge_multiple_column_headers": { + "title": "Merge Multiple Column Headers", + "default": true, + "type": "boolean" + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + }, + "add_page_number": { + "title": "Add Page Number", + "default": true, + "type": "boolean" + } + }, + "required": [ + "endpoint", + "credential_key" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "BM25RetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "BM25Retriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "all_terms_must_match": { + "title": "All Terms Must Match", + "default": false, + "type": "boolean" + }, + "custom_query": { + "title": "Custom Query", + "type": "string" + }, + "scale_score": { + "title": "Scale Score", + "default": true, + "type": "boolean" + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "CrawlerComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Crawler" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "output_dir": { + "title": "Output Dir", + "type": "string" + }, + "urls": { + "title": "Urls", + "type": "array", + "items": { + "type": "string" + } + }, + "crawler_depth": { + "title": "Crawler Depth", + "default": 1, + "type": "integer" + }, + "filter_urls": { + "title": "Filter Urls", + "type": "array", + "items": {} + }, + "overwrite_existing_files": { + "title": "Overwrite Existing Files", + "default": true + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + }, + "extract_hidden_text": { + "title": "Extract Hidden Text", + "default": true + }, + "loading_wait_time": { + "title": "Loading Wait Time", + "type": "integer" + }, + "crawler_naming_function": { + "title": "Crawler Naming Function", + "type": "string", + "default": null + } + }, + "required": [ + "output_dir" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "DensePassageRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "DensePassageRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "query_embedding_model": { + "title": "Query Embedding Model", + "default": "facebook/dpr-question_encoder-single-nq-base", + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "string" + } + ] + }, + "passage_embedding_model": { + "title": "Passage Embedding Model", + "default": "facebook/dpr-ctx_encoder-single-nq-base", + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "string" + } + ] + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "max_seq_len_query": { + "title": "Max Seq Len Query", + "default": 64, + "type": "integer" + }, + "max_seq_len_passage": { + "title": "Max Seq Len Passage", + "default": 256, + "type": "integer" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "batch_size": { + "title": "Batch Size", + "default": 16, + "type": "integer" + }, + "embed_title": { + "title": "Embed Title", + "default": true, + "type": "boolean" + }, + "use_fast_tokenizers": { + "title": "Use Fast Tokenizers", + "default": true, + "type": "boolean" + }, + "similarity_function": { + "title": "Similarity Function", + "default": "dot_product", + "type": "string" + }, + "global_loss_buffer_size": { + "title": "Global Loss Buffer Size", + "default": 150000, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "devices": { + "title": "Devices", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + 
"use_auth_token": { + "title": "Use Auth Token", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "string" + } + ] + }, + "scale_score": { + "title": "Scale Score", + "default": true, + "type": "boolean" + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "Docs2AnswersComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Docs2Answers" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "DocumentToSpeechComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "DocumentToSpeech" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "espnet/kan-bayashi_ljspeech_vits", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "generated_audio_dir": { + "title": "Generated Audio Dir", + "default": "generated_audio_documents", + "type": "string", + "format": "path" + }, + "audio_params": { + "title": "Audio Params", + "type": "object" + }, + "transformers_params": { + "title": "Transformers Params", + "type": "object" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "DocxToTextConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "DocxToTextConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ElasticsearchFilterOnlyRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ElasticsearchFilterOnlyRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "all_terms_must_match": { + "title": "All Terms Must Match", + "default": false, + "type": "boolean" + }, + "custom_query": { + "title": "Custom Query", + "type": "string" + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ElasticsearchRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ElasticsearchRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "all_terms_must_match": { + "title": "All Terms Must Match", + "default": false, + "type": "boolean" + }, + "custom_query": { + "title": "Custom Query", + "type": "string" + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "EmbeddingRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "EmbeddingRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "embedding_model": { + "title": "Embedding Model", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "batch_size": { + "title": "Batch Size", + "default": 32, + "type": "integer" + }, + "max_seq_len": { + "title": "Max Seq Len", + "default": 512, + "type": "integer" + }, + "model_format": { + "title": "Model Format", + "type": "string" + }, + "pooling_strategy": { + "title": "Pooling Strategy", + "default": "reduce_mean", + "type": "string" + }, + "emb_extraction_layer": { + "title": "Emb Extraction Layer", + "default": -1, + "type": "integer" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "devices": { + "title": "Devices", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + "use_auth_token": { + "title": "Use Auth Token", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "string" + } + ] + }, + "scale_score": { + "title": "Scale Score", + "default": true, + "type": "boolean" + }, + "embed_meta_fields": { + "title": "Embed Meta Fields", + "default": [], + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "document_store", + "embedding_model" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "EntityExtractorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "EntityExtractor" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "dslim/bert-base-NER", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "batch_size": { + "title": "Batch Size", + "default": 16, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "EvalAnswersComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "EvalAnswers" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "skip_incorrect_retrieval": { + "title": "Skip Incorrect Retrieval", + "default": true, + "type": "boolean" + }, + "open_domain": { + "title": "Open Domain", + "default": true, + "type": "boolean" + }, + "sas_model": { + "title": "Sas Model", + "type": "string" + }, + "debug": { + "title": "Debug", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "EvalDocumentsComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "EvalDocuments" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "debug": { + "title": "Debug", + "default": false, + "type": "boolean" + }, + "open_domain": { + "title": "Open Domain", + "default": true, + "type": "boolean" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "FARMReaderComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "FARMReader" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "context_window_size": { + "title": "Context Window Size", + "default": 150, + "type": "integer" + }, + "batch_size": { + "title": "Batch Size", + "default": 50, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "devices": { + "title": "Devices", + "default": [], + "type": "array", + "items": { + "type": "string" + } + }, + "no_ans_boost": { + "title": "No Ans Boost", + "default": 0.0, + "type": "number" + }, + "return_no_answer": { + "title": "Return No Answer", + "default": false, + "type": "boolean" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "top_k_per_candidate": { + "title": "Top K Per Candidate", + "default": 3, + "type": "integer" + }, + "top_k_per_sample": { + "title": "Top K Per Sample", + "default": 1, + "type": "integer" + }, + "num_processes": { + "title": "Num Processes", + "type": "integer" + }, + "max_seq_len": { + "title": "Max Seq Len", + "default": 256, + "type": "integer" + }, + "doc_stride": { + "title": "Doc Stride", + "default": 128, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "duplicate_filtering": { + "title": "Duplicate Filtering", + "default": 0, + "type": "integer" + }, + "use_confidence_scores": { + "title": "Use Confidence Scores", + "default": true, + "type": "boolean" + }, + "confidence_threshold": { + "title": "Confidence Threshold", + "type": "number" + }, + "proxies": { + "title": "Proxies", + "type": "object", + 
"additionalProperties": { + "type": "string" + } + }, + "local_files_only": { + "title": "Local Files Only", + "default": false + }, + "force_download": { + "title": "Force Download", + "default": false + }, + "use_auth_token": { + "title": "Use Auth Token", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "string" + } + ] + } + }, + "required": [ + "model_name_or_path" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "FileTypeClassifierComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "FileTypeClassifier" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "supported_types": { + "title": "Supported Types", + "default": [ + "txt", + "pdf", + "md", + "docx", + "html" + ], + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "FilterRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "FilterRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "all_terms_must_match": { + "title": "All Terms Must Match", + "default": false, + "type": "boolean" + }, + "custom_query": { + "title": "Custom Query", + "type": "string" + }, + "scale_score": { + "title": "Scale Score", + "default": true, + "type": "boolean" + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ImageToTextConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ImageToTextConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "default": [ + "eng" + ], + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "JoinAnswersComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "JoinAnswers" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "join_mode": { + "title": "Join Mode", + "default": "concatenate", + "type": "string" + }, + "weights": { + "title": "Weights", + "type": "array", + "items": { + "type": "number" + } + }, + "top_k_join": { + "title": "Top K Join", + "type": "integer" + }, + "sort_by_score": { + "title": "Sort By Score", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "JoinDocumentsComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "JoinDocuments" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "join_mode": { + "title": "Join Mode", + "default": "concatenate", + "type": "string" + }, + "weights": { + "title": "Weights", + "type": "array", + "items": { + "type": "number" + } + }, + "top_k_join": { + "title": "Top K Join", + "type": "integer" + }, + "sort_by_score": { + "title": "Sort By Score", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "MarkdownConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "MarkdownConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "MultihopEmbeddingRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "MultihopEmbeddingRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "embedding_model": { + "title": "Embedding Model", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "num_iterations": { + "title": "Num Iterations", + "default": 2, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "batch_size": { + "title": "Batch Size", + "default": 32, + "type": "integer" + }, + "max_seq_len": { + "title": "Max Seq Len", + "default": 512, + "type": "integer" + }, + "model_format": { + "title": "Model Format", + "default": "farm", + "type": "string" + }, + "pooling_strategy": { + "title": "Pooling Strategy", + "default": "reduce_mean", + "type": "string" + }, + "emb_extraction_layer": { + "title": "Emb Extraction Layer", + "default": -1, + "type": "integer" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "devices": { + "title": "Devices", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + "use_auth_token": { + "title": "Use Auth Token", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "string" + } + ] + }, + "scale_score": { + "title": "Scale Score", + "default": true, + "type": "boolean" + }, + "embed_meta_fields": { + "title": "Embed 
Meta Fields", + "default": [], + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "document_store", + "embedding_model" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "OpenAIAnswerGeneratorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "OpenAIAnswerGenerator" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "api_key": { + "title": "Api Key", + "type": "string" + }, + "model": { + "title": "Model", + "default": "text-curie-001", + "type": "string" + }, + "max_tokens": { + "title": "Max Tokens", + "default": 7, + "type": "integer" + }, + "top_k": { + "title": "Top K", + "default": 5, + "type": "integer" + }, + "temperature": { + "title": "Temperature", + "default": 0, + "type": "integer" + }, + "presence_penalty": { + "title": "Presence Penalty", + "default": -2.0, + "type": "number" + }, + "frequency_penalty": { + "title": "Frequency Penalty", + "default": -2.0, + "type": "number" + }, + "examples_context": { + "title": "Examples Context", + "type": "string" + }, + "examples": { + "title": "Examples", + "type": "array", + "items": {} + }, + "stop_words": { + "title": "Stop Words", + "type": "array", + "items": {} + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "required": [ + "api_key" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "PDFToTextConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "PDFToTextConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + }, + "encoding": { + "title": "Encoding", + "default": "UTF-8", + "type": "string" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "PDFToTextOCRConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "PDFToTextOCRConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "default": [ + "eng" + ], + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "ParsrConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "ParsrConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "parsr_url": { + "title": "Parsr Url", + "default": "http://localhost:3001", + "type": "string" + }, + "extractor": { + "title": "Extractor", + "default": "pdfminer", + "enum": [ + "pdfminer", + "pdfjs" + ], + "type": "string" + }, + "table_detection_mode": { + "title": "Table Detection Mode", + "default": "lattice", + "enum": [ + "lattice", + "stream" + ], + "type": "string" + }, + "preceding_context_len": { + "title": "Preceding Context Len", + "default": 3, + "type": "integer" + }, + "following_context_len": { + "title": "Following Context Len", + "default": 3, + "type": "integer" + }, + "remove_page_headers": { + "title": "Remove Page Headers", + "default": false, + "type": "boolean" + }, + "remove_page_footers": { + "title": "Remove Page Footers", + "default": false, + "type": "boolean" + }, + "remove_table_of_contents": { + "title": "Remove Table Of Contents", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + }, + "add_page_number": { + "title": "Add Page Number", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "PreProcessorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "PreProcessor" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "clean_whitespace": { + "title": "Clean Whitespace", + "default": true, + "type": "boolean" + }, + "clean_header_footer": { + "title": "Clean Header Footer", + "default": false, + "type": "boolean" + }, + "clean_empty_lines": { + "title": "Clean Empty Lines", + "default": true, + "type": "boolean" + }, + "remove_substrings": { + "title": "Remove Substrings", + "default": [], + "type": "array", + "items": { + "type": "string" + } + }, + "split_by": { + "title": "Split By", + "default": "word", + "type": "string" + }, + "split_length": { + "title": "Split Length", + "default": 200, + "type": "integer" + }, + "split_overlap": { + "title": "Split Overlap", + "default": 0, + "type": "integer" + }, + "split_respect_sentence_boundary": { + "title": "Split Respect Sentence Boundary", + "default": true, + "type": "boolean" + }, + "tokenizer_model_folder": { + "title": "Tokenizer Model Folder", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "language": { + "title": "Language", + "default": "en", + "type": "string" + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "add_page_number": { + "title": "Add Page Number", + "default": false, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "PseudoLabelGeneratorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "PseudoLabelGenerator" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "question_producer": { + "title": "Question Producer", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "additionalProperties": { + "type": "string" + } + } + } + ] + }, + "retriever": { + "title": "Retriever", + "type": "string" + }, + "cross_encoder_model_name_or_path": { + "title": "Cross Encoder Model Name Or Path", + "default": "cross-encoder/ms-marco-MiniLM-L-6-v2", + "type": "string" + }, + "max_questions_per_document": { + "title": "Max Questions Per Document", + "default": 3, + "type": "integer" + }, + "top_k": { + "title": "Top K", + "default": 50, + "type": "integer" + }, + "batch_size": { + "title": "Batch Size", + "default": 16, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "required": [ + "question_producer", + "retriever" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "QuestionGeneratorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "QuestionGenerator" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "valhalla/t5-base-e2e-qg" + }, + "model_version": { + "title": "Model Version" + }, + "num_beams": { + "title": "Num Beams", + "default": 4 + }, + "max_length": { + "title": "Max Length", + "default": 256 + }, + "no_repeat_ngram_size": { + "title": "No Repeat Ngram Size", + "default": 3 + }, + "length_penalty": { + "title": "Length Penalty", + "default": 1.5 + }, + "early_stopping": { + "title": "Early Stopping", + "default": true + }, + "split_length": { + "title": "Split Length", + "default": 50 + }, + "split_overlap": { + "title": "Split Overlap", + "default": 10 + }, + "use_gpu": { + "title": "Use Gpu", + "default": true + }, + "prompt": { + "title": "Prompt", + "default": "generate questions:" + }, + "num_queries_per_doc": { + "title": "Num Queries Per Doc", + "default": 1 + }, + "sep_token": { + "title": "Sep Token", + "default": "", + "type": "string" + }, + "batch_size": { + "title": "Batch Size", + "default": 16, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "RAGeneratorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "RAGenerator" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "facebook/rag-token-nq", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "retriever": { + "title": "Retriever", + "type": "string", + "default": null + }, + "generator_type": { + "title": "Generator Type", + "default": "token", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 2, + "type": "integer" + }, + "max_length": { + "title": "Max Length", + "default": 200, + "type": "integer" + }, + "min_length": { + "title": "Min Length", + "default": 2, + "type": "integer" + }, + "num_beams": { + "title": "Num Beams", + "default": 2, + "type": "integer" + }, + "embed_title": { + "title": "Embed Title", + "default": true, + "type": "boolean" + }, + "prefix": { + "title": "Prefix", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "RCIReaderComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "RCIReader" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "row_model_name_or_path": { + "title": "Row Model Name Or Path", + "default": "michaelrglass/albert-base-rci-wikisql-row", + "type": "string" + }, + "column_model_name_or_path": { + "title": "Column Model Name Or Path", + "default": "michaelrglass/albert-base-rci-wikisql-col", + "type": "string" + }, + "row_model_version": { + "title": "Row Model Version", + "type": "string" + }, + "column_model_version": { + "title": "Column Model Version", + "type": "string" + }, + "row_tokenizer": { + "title": "Row Tokenizer", + "type": "string" + }, + "column_tokenizer": { + "title": "Column Tokenizer", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "max_seq_len": { + "title": "Max Seq Len", + "default": 256, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "RouteDocumentsComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "RouteDocuments" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "split_by": { + "title": "Split By", + "default": "content_type", + "type": "string" + }, + "metadata_values": { + "title": "Metadata Values", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "SentenceTransformersRankerComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "SentenceTransformersRanker" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "anyOf": [ + { + "type": "string" + }, + { + "type": "string", + "format": "path" + } + ] + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "devices": { + "title": "Devices", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + "batch_size": { + "title": "Batch Size", + "default": 16, + "type": "integer" + }, + "scale_score": { + "title": "Scale Score", + "default": true, + "type": "boolean" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "required": [ + 
"model_name_or_path" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "Seq2SeqGeneratorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Seq2SeqGenerator" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "type": "string" + }, + "input_converter": { + "title": "Input Converter", + "type": "string", + "default": null + }, + "top_k": { + "title": "Top K", + "default": 1, + "type": "integer" + }, + "max_length": { + "title": "Max Length", + "default": 200, + "type": "integer" + }, + "min_length": { + "title": "Min Length", + "default": 2, + "type": "integer" + }, + "num_beams": { + "title": "Num Beams", + "default": 8, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "required": [ + "model_name_or_path" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "SklearnQueryClassifierComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "SklearnQueryClassifier" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/model.pickle", + "anyOf": [ + { + "type": "string" + }, + {} + ] + }, + "vectorizer_name_or_path": { + "title": "Vectorizer Name Or Path", + "default": "https://ext-models-haystack.s3.eu-central-1.amazonaws.com/gradboost_query_classifier/vectorizer.pickle", + "anyOf": [ + { + "type": "string" + }, + {} + ] + }, + "batch_size": { + "title": "Batch Size", + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TableReaderComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TableReader" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "google/tapas-base-finetuned-wtq", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "tokenizer": { + "title": "Tokenizer", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "top_k_per_candidate": { + "title": "Top K Per Candidate", + "default": 3, + "type": "integer" + }, + "return_no_answer": { + "title": "Return No Answer", + "default": false, + "type": "boolean" + }, + "max_seq_len": { + "title": "Max Seq Len", + "default": 256, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TableTextRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TableTextRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "query_embedding_model": { + "title": "Query Embedding Model", + "default": "deepset/bert-small-mm_retrieval-question_encoder", + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "string" + } + ] + }, + "passage_embedding_model": { + "title": "Passage Embedding Model", + "default": "deepset/bert-small-mm_retrieval-passage_encoder", + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "string" + } + ] + }, + "table_embedding_model": { + "title": "Table Embedding Model", + "default": "deepset/bert-small-mm_retrieval-table_encoder", + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "string" + } + ] + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "max_seq_len_query": { + "title": "Max Seq Len Query", + "default": 64, + "type": "integer" + }, + "max_seq_len_passage": { + "title": "Max Seq Len Passage", + "default": 256, + "type": "integer" + }, + "max_seq_len_table": { + "title": "Max Seq Len Table", + "default": 256, + "type": "integer" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "batch_size": { + "title": "Batch Size", + "default": 16, + "type": "integer" + }, + "embed_meta_fields": { + "title": "Embed Meta Fields", + "default": [ + "name", + "section_title", + "caption" + ], + "type": "array", + "items": { + "type": "string" + } + }, + "use_fast_tokenizers": { + "title": "Use Fast Tokenizers", + "default": true, + "type": "boolean" + }, + "similarity_function": { + "title": "Similarity 
Function", + "default": "dot_product", + "type": "string" + }, + "global_loss_buffer_size": { + "title": "Global Loss Buffer Size", + "default": 150000, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + }, + "devices": { + "title": "Devices", + "type": "array", + "items": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "string" + } + ] + } + }, + "use_auth_token": { + "title": "Use Auth Token", + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "string" + } + ] + }, + "scale_score": { + "title": "Scale Score", + "default": true, + "type": "boolean" + }, + "use_fast": { + "title": "Use Fast", + "default": true, + "type": "boolean" + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "Text2SparqlRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "Text2SparqlRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "knowledge_graph": { + "title": "Knowledge Graph" + }, + "model_name_or_path": { + "title": "Model Name Or Path" + }, + "top_k": { + "title": "Top K", + "default": 1, + "type": "integer" + } + }, + "required": [ + "knowledge_graph", + "model_name_or_path" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TextConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TextConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TfidfRetrieverComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TfidfRetriever" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "document_store": { + "title": "Document Store", + "type": "string" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "auto_fit": { + "title": "Auto Fit", + "default": true + } + }, + "required": [ + "document_store" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." 
+ } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TikaConverterComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TikaConverter" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "tika_url": { + "title": "Tika Url", + "default": "http://localhost:9998/tika", + "type": "string" + }, + "remove_numeric_tables": { + "title": "Remove Numeric Tables", + "default": false, + "type": "boolean" + }, + "valid_languages": { + "title": "Valid Languages", + "type": "array", + "items": { + "type": "string" + } + }, + "id_hash_keys": { + "title": "Id Hash Keys", + "type": "array", + "items": { + "type": "string" + } + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TransformersDocumentClassifierComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TransformersDocumentClassifier" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "bhadresh-savani/distilbert-base-uncased-emotion", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "tokenizer": { + "title": "Tokenizer", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "return_all_scores": { + "title": "Return All Scores", + "default": false, + "type": "boolean" + }, + "task": { + "title": "Task", + "default": "text-classification", + "type": "string" + }, + "labels": { + "title": "Labels", + "type": "array", + "items": { + "type": "string" + } + }, + "batch_size": { + "title": "Batch Size", + "default": 16, + "type": "integer" + }, + "classification_field": { + "title": "Classification Field", + "type": "string" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TransformersQueryClassifierComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TransformersQueryClassifier" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "shahrukhx01/bert-mini-finetune-question-detection", + "anyOf": [ + { + "type": "string", + "format": "path" + }, + { + "type": "string" + } + ] + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "tokenizer": { + "title": "Tokenizer", + "type": "string" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "task": { + "title": "Task", + "default": "text-classification", + "type": "string" + }, + "labels": { + "title": "Labels", + "default": [ + "LABEL_1", + "LABEL_0" + ], + "type": "array", + "items": { + "type": "string" + } + }, + "batch_size": { + "title": "Batch Size", + "default": 16, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TransformersReaderComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TransformersReader" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "distilbert-base-uncased-distilled-squad", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "tokenizer": { + "title": "Tokenizer", + "type": "string" + }, + "context_window_size": { + "title": "Context Window Size", + "default": 70, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "top_k": { + "title": "Top K", + "default": 10, + "type": "integer" + }, + "top_k_per_candidate": { + "title": "Top K Per Candidate", + "default": 3, + "type": "integer" + }, + "return_no_answers": { + "title": "Return No Answers", + "default": false, + "type": "boolean" + }, + "max_seq_len": { + "title": "Max Seq Len", + "default": 256, + "type": "integer" + }, + "doc_stride": { + "title": "Doc Stride", + "default": 128, + "type": "integer" + }, + "batch_size": { + "title": "Batch Size", + "default": 16, + "type": "integer" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TransformersSummarizerComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TransformersSummarizer" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "default": "google/pegasus-xsum", + "type": "string" + }, + "model_version": { + "title": "Model Version", + "type": "string" + }, + "tokenizer": { + "title": "Tokenizer", + "type": "string" + }, + "max_length": { + "title": "Max Length", + "default": 200, + "type": "integer" + }, + "min_length": { + "title": "Min Length", + "default": 5, + "type": "integer" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "clean_up_tokenization_spaces": { + "title": "Clean Up Tokenization Spaces", + "default": true, + "type": "boolean" + }, + "separator_for_single_summary": { + "title": "Separator For Single Summary", + "default": " ", + "type": "string" + }, + "generate_single_summary": { + "title": "Generate Single Summary", + "default": false, + "type": "boolean" + }, + "batch_size": { + "title": "Batch Size", + "default": 16, + "type": "integer" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + }, + "TransformersTranslatorComponent": { + "type": "object", + "properties": { + "name": { + "title": "Name", + "description": "Custom name for the component. 
Helpful for visualization and debugging.", + "type": "string" + }, + "type": { + "title": "Type", + "description": "Haystack Class name for the component.", + "type": "string", + "const": "TransformersTranslator" + }, + "params": { + "title": "Parameters", + "type": "object", + "properties": { + "model_name_or_path": { + "title": "Model Name Or Path", + "type": "string" + }, + "tokenizer_name": { + "title": "Tokenizer Name", + "type": "string" + }, + "max_seq_len": { + "title": "Max Seq Len", + "type": "integer" + }, + "clean_up_tokenization_spaces": { + "title": "Clean Up Tokenization Spaces", + "default": true, + "type": "boolean" + }, + "use_gpu": { + "title": "Use Gpu", + "default": true, + "type": "boolean" + }, + "progress_bar": { + "title": "Progress Bar", + "default": true, + "type": "boolean" + } + }, + "required": [ + "model_name_or_path" + ], + "additionalProperties": false, + "description": "Each parameter can reference other components defined in the same YAML file." + } + }, + "required": [ + "type", + "name" + ], + "additionalProperties": false + } + } +} \ No newline at end of file diff --git a/haystack/json-schemas/haystack-pipeline.schema.json b/haystack/json-schemas/haystack-pipeline.schema.json index 020d9acc38..5e8a33a48b 100644 --- a/haystack/json-schemas/haystack-pipeline.schema.json +++ b/haystack/json-schemas/haystack-pipeline.schema.json @@ -158,6 +158,20 @@ "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.6.0.schema.json" } ] + }, + { + "allOf": [ + { + "properties": { + "version": { + "const": "1.7.0" + } + } + }, + { + "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/haystack-pipeline-1.7.0.schema.json" + } + ] } ] } \ No newline at end of file diff --git a/haystack/nodes/_json_schema.py b/haystack/nodes/_json_schema.py index f16ceaf0c9..87c798b8f7 100644 --- a/haystack/nodes/_json_schema.py +++ b/haystack/nodes/_json_schema.py @@ 
-386,16 +386,16 @@ def update_json_schema(destination_path: Path = JSON_SCHEMAS_PATH): index_name = "haystack-pipeline.schema.json" with open(destination_path / index_name, "r") as json_file: index = json.load(json_file) - index["oneOf"].append( - { - "allOf": [ - {"properties": {"version": {"const": haystack_version}}}, - { - "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/" - f"haystack-pipeline-{haystack_version}.schema.json" - }, - ] - } - ) + new_entry = { + "allOf": [ + {"properties": {"version": {"const": haystack_version}}}, + { + "$ref": "https://raw.githubusercontent.com/deepset-ai/haystack/master/haystack/json-schemas/" + f"haystack-pipeline-{haystack_version}.schema.json" + }, + ] + } + if new_entry not in index["oneOf"]: + index["oneOf"].append(new_entry) with open(destination_path / index_name, "w") as json_file: json.dump(index, json_file, indent=2)