diff --git a/examples/pipelines/slides_search/.env.example b/examples/pipelines/slides_search/.env.example new file mode 100644 index 0000000..8db6c24 --- /dev/null +++ b/examples/pipelines/slides_search/.env.example @@ -0,0 +1,14 @@ +OPENAI_API_KEY= + +PATHWAY_HOST="0.0.0.0" + +PATHWAY_NW_HOST=pathway_app + +FILE_SERVER_URL=http://localhost:8080/ + +SEARCH_TOPK=6 + +SCHEMA_FILE_PATH="parse_schema.yaml" +# to disable schema parsing, simply comment out the above line or set `SCHEMA_FILE_PATH=""`. + +PATHWAY_LICENSE_KEY="YOUR PATHWAY KEY" # can be obtained here: https://pathway.com/user/license diff --git a/examples/pipelines/slides_search/Dockerfile.app b/examples/pipelines/slides_search/Dockerfile.app new file mode 100644 index 0000000..54c2a1e --- /dev/null +++ b/examples/pipelines/slides_search/Dockerfile.app @@ -0,0 +1,17 @@ +FROM pathwaycom/pathway:0.13.2 + +ENV DOCKER_BUILDKIT=1 +ENV PYTHONUNBUFFERED=1 + +RUN apt-get update && apt-get install -y \ + poppler-utils \ + libreoffice \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /app + +COPY . . + +EXPOSE 8000 + +CMD ["python", "app.py"] diff --git a/examples/pipelines/slides_search/Dockerfile.nginx b/examples/pipelines/slides_search/Dockerfile.nginx new file mode 100644 index 0000000..8a453d2 --- /dev/null +++ b/examples/pipelines/slides_search/Dockerfile.nginx @@ -0,0 +1,7 @@ +FROM nginx:latest + +COPY nginx.conf /etc/nginx/conf.d/default.conf + +RUN mkdir -p /app/pw_dump_images /app/pw_dump_files + +EXPOSE 8080 8443 diff --git a/examples/pipelines/slides_search/Dockerfile.ui b/examples/pipelines/slides_search/Dockerfile.ui new file mode 100644 index 0000000..7d4b54e --- /dev/null +++ b/examples/pipelines/slides_search/Dockerfile.ui @@ -0,0 +1,15 @@ +FROM pathwaycom/pathway:0.13.2 + +ENV PYTHONUNBUFFERED=1 + +WORKDIR /ui + +COPY ui/requirements-ui.txt requirements-ui.txt + +RUN pip install -U --no-cache-dir -r requirements-ui.txt + +COPY ui/ . + +EXPOSE 8501 + +CMD [ "streamlit", "run", "ui.py" ] diff --git a/examples/pipelines/slides_search/README.md b/examples/pipelines/slides_search/README.md new file mode 100644 index 0000000..044da13 --- /dev/null +++ b/examples/pipelines/slides_search/README.md @@ -0,0 +1,240 @@ +# **Slide AI Search App** + +## **Overview** + +This app template helps you build a multi-modal search service using `GPT-4o`, combining metadata extraction with a vector index. It uses [Pathway](https://github.com/pathwaycom/llm-app) for indexing and retrieving slides from PowerPoint and PDF presentations. + +How is this different? + +* Build highly accurate RAG pipelines powered by indexes that are updated in real time. +* Pathway uses vision language models to understand and index your presentations and PDFs, automatically updating as changes are made. +* Get started with a minimalistic and production-ready approach. + +Boost productivity with accurate search across your PowerPoint, PDF, and Google Slides decks, all within your work environment. Try out the [demo](https://sales-rag-chat.demo.pathway.com/#search-your-slide-decks). + + +## Quickstart + +Check `.env.example`, create a new `.env` file, and fill in the template. +For a quick start, you only need to set the following fields: +- `PATHWAY_LICENSE_KEY` +- `OPENAI_API_KEY` + +This app template is available for free via [Pathway Scale](https://pathway.com/features). Get your [license key here](https://pathway.com/user/license) and set `PATHWAY_LICENSE_KEY` in the `.env` file.
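+ +For reference, a minimal `.env` for the quickstart could look like this (both values below are placeholders for your own keys): + +``` +OPENAI_API_KEY=sk-... +PATHWAY_LICENSE_KEY="YOUR PATHWAY KEY" +```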
+ +## How it Helps + +**1) Improved Efficiency:** + +* **Save Effort:** You no longer need to manually sift through countless presentations. +* **Faster Information Retrieval:** Instantly find specific information with a few keywords or descriptive prompts, saving you time when preparing for presentations or reviewing past projects. + +**2) Enhanced Organization** + +* **Automated Categorization:** You can organize your slide library by topic, project, or other criteria. Configure the schema file to customize the parsed fields. + +**3) Enhanced Reliability** + +* **Automatic Updates:** Hybrid indexes update automatically whenever a slide is added or removed, ensuring your information is always current and accurate. + +**4) Automated Slide Parsing:** + +* Process PPTX and PDF slide decks with vision language models to extract their content. + +**5) Flexible Data Sources:** + +* Compatible with local directories, SharePoint, Google Drive, and other Pathway connectors, supporting a wide range of application scenarios. + +By automating the extraction and retrieval of slide information, this app addresses the critical pain point of managing and utilizing extensive slide decks efficiently, enhancing productivity and information accuracy for sales teams. + + +## Architecture + +The architecture of the Slide AI Search App is designed to connect various local or cloud repositories, transforming and indexing slides for efficient querying. It supports integration with closed- and open-source LLMs for enhanced search capabilities. + +![Architecture](ai-slides-diagram.svg) + +This demo consists of three parts: +* `app.py`: Pathway app that handles parsing, indexing, and the backend. +* `nginx`: File server that hosts images to be consumed by the UI. +* `UI`: A Streamlit UI for interacting with the app. + + +## How it works + +### **Data Ingestion** + +1. **Data Sources**: + * The application reads slide files (PPTX and PDF) from a specified directory. The directory is set to `./data/` in the `app.py` file. + * In the default app setup, the connected folder is a local file folder. You can add more folders and file sources, such as [Google Drive](https://pathway.com/developers/user-guide/connectors/gdrive-connector/#google-drive-connector) or [Sharepoint](https://pathway.com/developers/user-guide/connecting-to-data/connectors/#tutorials), by adding a line of code to the template. + * More inputs can be added by configuring the `sources` list in `app.py`. + + +### **Slide Parsing and Indexing** + + +1. **Parsing**: + * The [`SlideParser`](https://pathway.com/developers/api-docs/pathway-xpacks-llm/parsers#pathway.xpacks.llm.parsers.SlideParser) from Pathway is used to parse the slides. The parser is configured to produce a text description of each slide and to extract the fields of the schema defined in `parse_schema.yaml`. + * Our example schema includes fields such as `category`, `tags`, `title`, `main_color`, `language`, and `has_images`. This can be modified for specific use cases. + * Note that the UI relies on two of the extracted fields, `category` and `language`; these need to be kept for the UI to work. However, the app can still be used without the UI, with a different schema or no parsed schema at all. +2. **Embedding**: + * Parsed slide content is embedded with OpenAI's `text-embedding-ada-002` embedder. + * The embeddings are then stored in Pathway's vector store using the `SlidesVectorStoreServer`. +3. 
**Metadata Handling**: + * Images and files are dumped into local directories (`storage/pw_dump_images` and `storage/pw_dump_files`). + * Each slide gets a unique ID via the `add_slide_id` function in `app.py`. This helps the UI open the corresponding files and images. + + +### **Query Handling** + +1. **Retrieval Augmented Generation (RAG)**: + * The `DeckRetriever` class builds the backend, handling all steps of the application from parsing files to serving the endpoints. Refer to the [API docs](https://pathway.com/developers/api-docs/pathway-xpacks-llm/question_answering#pathway.xpacks.llm.question_answering.DeckRetriever) for more information. + +## Pipeline Organization + +This folder contains several components necessary for setting up and running the Slide AI Search application: + + +1. **app.py**: + * The main application that sets up the slide search functionality. It configures the OpenAI chat model, the slide parser, and the vector store, and initializes the `DeckRetriever` for handling queries. +2. **parse_schema.yaml**: + * Defines the schema used when parsing the slides, including fields such as `category`, `tags`, `title`, `main_color`, `language`, and `has_images`. + * These fields are appended to the `metadata`; if you prefer to also add them to the `text` field, set `include_schema_in_text` of `SlideParser` to `True`. +3. **.env**: + * Config file for the environment variables, such as the OpenAI API key and the Pathway license key. + + +## **Prerequisites/Configuration** + +### **Environment Setup** + +1. **OpenAI API Key**: + * Get an API key from the [OpenAI API Key Management page](https://platform.openai.com/account/api-keys). Keep this API key secure. + * Configure your key in the `.env` file. + * You can refer to the stub file `.env.example` in this repository. + * Note: this is only needed for OpenAI LLMs and embedders. It is also possible to use other multi-modal or local LLMs and embedders. + +2. **Pathway's License Key**: + * This app template is available for free via [Pathway Scale](https://pathway.com/features). + * Get your [license key here](https://pathway.com/user/license). + +3. **SCHEMA_FILE_PATH**: + * Path to the file that defines the schema to be parsed. It can be kept as the default, and `parse_schema.yaml` can be configured instead. + +4. **SEARCH_TOPK**: + * The number of elements retrieved from the index by default. + +## How to run the project + +Make sure you are in the right directory: +```bash +cd examples/pipelines/slides_search +``` + +### Locally +Running the whole demo without Docker is not recommended, as it consists of three components. + +1. **Download and Install LibreOffice:** + * Download LibreOffice from the [LibreOffice website](https://www.libreoffice.org/download/download-libreoffice). + * Follow the installation instructions specific to your operating system. + +2. **Verify LibreOffice Installation:** + * Open a terminal or command prompt and run the version check shown below. + * You should see the LibreOffice version information, indicating LibreOffice is installed correctly. + + **Purpose:** LibreOffice helps with converting PPTX files into PDFs, which is essential for the document processing workflow in the Slide AI Search App. + +If you are on Windows, please refer to the [running with Docker](#running-with-docker) section below.
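+ +To verify the LibreOffice installation mentioned in step 2, run the command below. Note that on some systems the binary is named `libreoffice` rather than `soffice`: + +```bash +soffice --version +```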
+ +To run the Pathway app without the UI: + +```bash +python app.py +``` + +### Running with Docker + +Build the Docker images with: + +```bash +docker-compose build +``` + +Then, run with: + +```bash +docker-compose up +``` + +This will start all three components of the demo. + +## Using the app + +Once the containers are running, you will see a stream of logs as your files are parsed. + +### Accessing the UI + +On your browser, visit [`http://localhost:8501`](http://localhost:8501/) to access the UI. + +Here, you will see a search bar, some filters, and information about the indexed documents on the left side. + + +### Sending requests to the server + +#### With cURL + +The UI is not a necessary component, especially for developers. If you are interested in building your own app, check out the following ways to use the app: + +First, let's check the indexed files: +```bash +curl -X 'POST' 'http://0.0.0.0:8000/v1/pw_list_documents' -H 'accept: */*' -H 'Content-Type: application/json' +``` + +This will return a list of metadata from the indexed files. + +Now, let's search through our slides: + + +```bash +curl -X 'POST' 'http://0.0.0.0:8000/v1/pw_ai_answer' -H 'accept: */*' -H 'Content-Type: application/json' -d '{ + "prompt": "diagrams that contain value propositions" +}' +``` + +This will search through our files and return the parsed slides with their `text`, `slide_id`, and other `metadata` (also including the parsed schema fields). + +#### With the Pathway RAG Client + +Import the RAGClient with: + +```python +from pathway.xpacks.llm.question_answering import RAGClient +``` + +Initialize the client: + +```python +# conn = RAGClient(url=f"http://{PATHWAY_HOST}:{PATHWAY_PORT}") + +# with the default config +conn = RAGClient(url="http://localhost:8000") +``` + +List the indexed files: +```python +conn.pw_list_documents() +``` +> `[{'path': 'data/slide.pdf'}, ...` + +Query the app: + +```python +conn.pw_ai_answer("introduction slide") +``` +> `[{'dist': 0.47761982679367065, 'metadata': ...` + + +## Not sure how to get started? + +Let's discuss how we can help you build a powerful, customized RAG application. [Reach us here to talk or request a demo!](https://pathway.com/solutions/enterprise-generative-ai?modal=requestdemo) diff --git a/examples/pipelines/slides_search/ai-slides-diagram.svg b/examples/pipelines/slides_search/ai-slides-diagram.svg new file mode 100644 index 0000000..c5bd3ff --- /dev/null +++ b/examples/pipelines/slides_search/ai-slides-diagram.svg @@ -0,0 +1,3 @@ + + +

[SVG text labels: connected data sources (PowerPoint, PDF, Google Slides) flow through Pathway connectors into the slide indexing pipeline (containerized solution), where GPT-4o slide parsing produces slide descriptions, parsed text, and metadata for the in-memory slide index (vector/hybrid index); user requests reach the REST API service, the slide query is embedded, and retrieval returns the relevant slides.]
\ No newline at end of file diff --git a/examples/pipelines/slides_search/app.py b/examples/pipelines/slides_search/app.py new file mode 100644 index 0000000..edcf4c2 --- /dev/null +++ b/examples/pipelines/slides_search/app.py @@ -0,0 +1,141 @@ +import base64 +import logging +import os + +import pathway as pw +from dotenv import load_dotenv +from pathway.udfs import DiskCache, ExponentialBackoffRetryStrategy +from pathway.xpacks.llm import embedders, llms +from pathway.xpacks.llm.parsers import SlideParser +from pathway.xpacks.llm.question_answering import DeckRetriever +from pathway.xpacks.llm.vector_store import SlidesVectorStoreServer +from pydantic import BaseModel +from utils import get_model_from_file + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + force=True, # the instructor library overrides `basicConfig`; `force=True` fixes logging +) + +IMAGE_DUMP_FOLDER = "storage/pw_dump_images" +FILE_DUMP_FOLDER = "storage/pw_dump_files" + + +def encode_str(original_string: str) -> str: + return base64.urlsafe_b64encode(original_string.encode("utf-8")).decode("utf-8") + + +def add_slide_id(text: str, metadata: dict) -> tuple[str, dict]: + # encode a URL-safe filename; remove `_` chars so that `_` can be used as the page separator + encoded_name = encode_str(metadata["path"].split("/")[-1]).replace("_", "-") + + page = metadata["image_page"] + page_count = metadata["tot_pages"] + + slide_id = f"{encoded_name}_{page}_{page_count}.png" + + logging.info(f"`add_slide_id` for {slide_id}...") + + metadata["slide_id"] = slide_id + + return (text, metadata) + + +def dump_img_callback(key, row, time, is_addition): + # save slide images parsed by Pathway so that the file server can host them + metadata = row["data"] + with open(f"{IMAGE_DUMP_FOLDER}/{metadata['slide_id'].value}", "wb") as f: + f.write(base64.b64decode(metadata["b64_image"].value)) + + +def dump_file_callback(key, row, time, is_addition): + # save the parsed source files so that the file server can host them + file_name = row["path"].value.split("/")[-1] + with open(f"{FILE_DUMP_FOLDER}/{file_name}", "wb") as f: + f.write(row["data"]) + + +if __name__ == "__main__": + load_dotenv() + + PATHWAY_HOST = os.environ.get("PATHWAY_HOST", "0.0.0.0") + PATHWAY_PORT = int(os.environ.get("PATHWAY_PORT", 8000)) + + SEARCH_TOPK = int(os.environ.get("SEARCH_TOPK", 6)) + + os.makedirs(IMAGE_DUMP_FOLDER, exist_ok=True) + os.makedirs(FILE_DUMP_FOLDER, exist_ok=True) + + pydantic_schema: type[BaseModel] | None = None + schema_file = os.environ.get("SCHEMA_FILE_PATH") + + if schema_file: + pydantic_schema = get_model_from_file(schema_file) + + path = "./data/" + + folder = pw.io.fs.read( + path=path, + format="binary", + with_metadata=True, + ) + + # folder = pw.io.gdrive.read( + # object_id=, + # with_metadata=True, + # service_user_credentials_file="secrets.json", + # refresh_interval=30, + # object_size_limit=None, + # name_pattern="*.pdf", + # ) + + sources = [ + folder, + ] # list of input sources + + chat = llms.OpenAIChat( + model="gpt-4o", + retry_strategy=ExponentialBackoffRetryStrategy(max_retries=6), + cache_strategy=DiskCache(), + temperature=0.0, + ) + + parser = SlideParser( + detail_parse_schema=pydantic_schema, + run_mode="parallel", + include_schema_in_text=False, + ) + embedder = embedders.OpenAIEmbedder(cache_strategy=DiskCache()) + + doc_store = SlidesVectorStoreServer( + *sources, + embedder=embedder, + splitter=None, + parser=parser, + doc_post_processors=[add_slide_id], + ) + + app = DeckRetriever( + llm=chat, + indexer=doc_store, + search_topk=SEARCH_TOPK, 
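+ # `DeckRetriever` wires together parsing, indexing, and the REST endpoints; + # `search_topk` controls how many slides a query returns by default.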
+ ) + + # subscribe to the parsed slides and the source files to dump them for the file server + chunked_docs = app.indexer._graph["chunked_docs"] + + m_table = chunked_docs.select( + data=pw.this.data["metadata"], + ) + + pw.io.subscribe(m_table, on_change=dump_img_callback) + + docs = app.indexer._graph["docs"] + + t = docs.select(data=docs.data, path=docs._metadata["path"]) + pw.io.subscribe(t, on_change=dump_file_callback) + + app.build_server(host=PATHWAY_HOST, port=PATHWAY_PORT) + + app.run_server(with_cache=True, terminate_on_error=False) diff --git a/examples/pipelines/slides_search/data/us-state-of-gen-ai-report.pdf b/examples/pipelines/slides_search/data/us-state-of-gen-ai-report.pdf new file mode 100644 index 0000000..4a67641 Binary files /dev/null and b/examples/pipelines/slides_search/data/us-state-of-gen-ai-report.pdf differ diff --git a/examples/pipelines/slides_search/docker-compose.yml b/examples/pipelines/slides_search/docker-compose.yml new file mode 100644 index 0000000..c8e9859 --- /dev/null +++ b/examples/pipelines/slides_search/docker-compose.yml @@ -0,0 +1,49 @@ +version: '3.8' + +services: + app: + container_name: pathway_app + build: + context: . + dockerfile: Dockerfile.app + ports: + - "8000:8000" + env_file: + - .env + networks: + - my_network + volumes: + - ./data:/app/data + - ./storage/pw_dump_files:/app/storage/pw_dump_files + - ./storage/pw_dump_images:/app/storage/pw_dump_images + + nginx: + container_name: file_server + build: + context: . + dockerfile: Dockerfile.nginx + ports: + - "8080:8080" + - "8443:8443" + env_file: + - .env + networks: + - my_network + volumes: + - ./storage/pw_dump_files:/app/pw_dump_files + - ./storage/pw_dump_images:/app/pw_dump_images + + ui: + build: + context: . + dockerfile: Dockerfile.ui + ports: + - "8501:8501" + env_file: + - .env + networks: + - my_network + +networks: + my_network: + driver: bridge diff --git a/examples/pipelines/slides_search/nginx.conf b/examples/pipelines/slides_search/nginx.conf new file mode 100644 index 0000000..15fca8e --- /dev/null +++ b/examples/pipelines/slides_search/nginx.conf @@ -0,0 +1,17 @@ +server { + listen 8080; + listen 8443; + location = / { + deny all; + } + location /images { + alias /app/pw_dump_images; + autoindex on; + } + location /documents { + alias /app/pw_dump_files; + default_type application/pdf; + add_header Content-Disposition 'inline'; + autoindex on; + } +} diff --git a/examples/pipelines/slides_search/parse_schema.yaml b/examples/pipelines/slides_search/parse_schema.yaml new file mode 100644 index 0000000..ef2ab60 --- /dev/null +++ b/examples/pipelines/slides_search/parse_schema.yaml @@ -0,0 +1,35 @@ +fields: + category: + type: option + values: + - "Problem Statement" + - "Solution Overview" + - "Product/Service Demo" + - "Benefits and Value Proposition" + - "Competitive Landscape" + - "Case Studies and Testimonials" + - "Features and Specifications" + - "How it Works" + - "Return on Investment (ROI) and Cost Savings" + - "Call to Action (CTA) and Next Steps" + - "Company Overview and Credentials" + - "Market Opportunity and Trends" + - "Customer Success Stories and Use Cases" + - "Pricing and Packaging" + - "Implementation and Support" + tags: + type: list[str] + description: "Tags associated with the slide. Such as [`price`, `shopping`, `consumer`] or [`beverage`, `cola`, `manufacturing`], etc." + title: + type: str + description: "Title of the slide." + main_color: + type: str + title: "color" + description: "Most common color. Such as `black`, `blue`, `yellow`, etc." + language: + type: str + description: "Language of the slide. 
example: `fr`, `en`, ..." + has_images: + type: bool + description: "Whether the slide contains photographs." diff --git a/examples/pipelines/slides_search/ui/.streamlit/config.toml b/examples/pipelines/slides_search/ui/.streamlit/config.toml new file mode 100644 index 0000000..23f772f --- /dev/null +++ b/examples/pipelines/slides_search/ui/.streamlit/config.toml @@ -0,0 +1,5 @@ +[server] +enableStaticServing = true + +[theme] +base = "light" diff --git a/examples/pipelines/slides_search/ui/requirements-ui.txt b/examples/pipelines/slides_search/ui/requirements-ui.txt new file mode 100644 index 0000000..982f002 --- /dev/null +++ b/examples/pipelines/slides_search/ui/requirements-ui.txt @@ -0,0 +1,5 @@ +streamlit==1.35.0 +load_dotenv==0.1.0 +nest_asyncio==1.6.0 +aiohttp==3.9.5 +beautifulsoup4==4.12.3 diff --git a/examples/pipelines/slides_search/ui/ui.py b/examples/pipelines/slides_search/ui/ui.py new file mode 100644 index 0000000..217aeb9 --- /dev/null +++ b/examples/pipelines/slides_search/ui/ui.py @@ -0,0 +1,460 @@ +import logging +import os +import urllib.parse +from itertools import cycle + +import requests +import streamlit as st +from bs4 import BeautifulSoup +from dotenv import load_dotenv +from pathway.xpacks.llm.question_answering import RAGClient + +load_dotenv() + +PATHWAY_HOST = os.environ.get( + "PATHWAY_NW_HOST", "pathway_app" +) # set in the network settings of docker-compose +PATHWAY_PORT = os.environ.get("PATHWAY_PORT", 8000) + +FILE_SERVER_BASE_URL = os.environ.get("FILE_SERVER_URL", "http://file_server:8080/") +DOCKER_FILE_SV_BASE_URL = "http://file_server:8080/documents" # for internal requests + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(name)s %(levelname)s %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", + force=True, +) + +st.set_page_config(page_title="Find the right slide") # ,page_icon="favicon.ico" + + +logger = logging.getLogger("streamlit") +logger.setLevel(logging.INFO) + +conn = RAGClient(url=f"http://{PATHWAY_HOST}:{PATHWAY_PORT}") + + +file_server_image_base_url = f"{FILE_SERVER_BASE_URL}images" + +file_server_pdf_base_url = f"{FILE_SERVER_BASE_URL}documents" + + +note = """ +

Search your slide decks""" +st.markdown(note, unsafe_allow_html=True) + +question = st.text_input(label="", placeholder="Why buy") + + +def get_options_list(metadata_list: list[dict], opt_key: str) -> list: + """Get all available options in a specific metadata key.""" + options = set(map(lambda x: x[opt_key], metadata_list)) + return list(options) + + +def get_image_serve_url(metadata: dict) -> str: + slide_name_enc, page, tot_pages = ( + metadata["slide_id"].replace(".png", "").split("_") + ) + + page, tot_pages = int(page), int(tot_pages) + name = f"{slide_name_enc}_{page}_{tot_pages}.png" + base_url: str = file_server_image_base_url + return f"{base_url}/{name}" + + +def get_adjacent_image_urls(metadata: dict) -> list[str]: + logger.info( + {"_type": "create_adjacent_image_urls", "slide_id": metadata["slide_id"]} + ) + slide_name_enc, page, tot_pages = ( + metadata["slide_id"].replace(".png", "").split("_") + ) + base_url: str = file_server_image_base_url + + page, tot_pages = int(page), int(tot_pages) + + ret_images = [] + + if page > 1: + prev_img_name = f"{base_url}/{slide_name_enc}_{page - 2}_{tot_pages}.png" + ret_images.append(prev_img_name) + + if page > 0: + prev_img_name = f"{base_url}/{slide_name_enc}_{page - 1}_{tot_pages}.png" + ret_images.append(prev_img_name) + + cur_img_name = f"{base_url}/{slide_name_enc}_{page}_{tot_pages}.png" + ret_images.append(cur_img_name) + + if page + 1 < tot_pages: + next_img_name = f"{base_url}/{slide_name_enc}_{page + 1}_{tot_pages}.png" + ret_images.append(next_img_name) + + if page + 2 < tot_pages: + next_img_name = f"{base_url}/{slide_name_enc}_{page + 2}_{tot_pages}.png" + ret_images.append(next_img_name) + + return ret_images + + +st.session_state["available_categories"] = None +st.session_state["available_languages"] = None + +logger.info("Requesting pw_list_documents...") +document_meta_list = conn.pw_list_documents(keys=None) +logger.info("Received response pw_list_documents") + +st.session_state["document_meta_list"] = document_meta_list + + +available_categories = get_options_list(document_meta_list, "category") +st.session_state["available_categories"] = available_categories + +available_languages = get_options_list(document_meta_list, "language") +st.session_state["available_languages"] = available_languages + + +available_files = get_options_list(st.session_state["document_meta_list"], "path") + + +def get_slide_link(file_name, page_num=None) -> str: + filename_encoded = urllib.parse.quote(file_name) + image_url = f"{file_server_pdf_base_url}/{filename_encoded}" + if page_num is not None: + image_url += f"#page={page_num}" + return image_url + + +def get_all_drive_files() -> list[str]: + logger.info("request get_all_drive_files") + response = requests.get(DOCKER_FILE_SV_BASE_URL) + logger.info("response get_all_drive_files") + + if response.status_code == 200: + soup = BeautifulSoup(response.content, "html.parser") + + file_links = [a["href"] for a in soup.find_all("a", href=True)] + + file_links = [link for link in file_links if not link.endswith("/")] + else: + file_links = [] + return file_links + + +# DRIVE_ID = os.environ.get("DRIVE_ID", "foo") +# DRIVE_URL = f"https://drive.google.com/drive/folders/{DRIVE_ID}" +# drive_htm = f""" +#
 Google Drive Logo +# Connected Folder ⚡ +#
+ # """ + +with st.sidebar: + st.info( + """This demo app only allows `PDF` and `PPTX` documents. + For other file types, convert to `PDF` or contact **Pathway**.""" + ) + # st.markdown(drive_htm, unsafe_allow_html=True) + file_names = [i.split("/")[-1] for i in available_files] + links = [get_slide_link(i) for i in file_names] + + markdown_table = "| Slides Ready for Search |\n| --- |\n" + for file_name, link in zip(file_names, links): + markdown_table += f"| [{file_name}]({link}) |\n" + st.markdown(markdown_table, unsafe_allow_html=True) + + all_drive_files = get_all_drive_files() + all_drive_files = [urllib.parse.unquote(i) for i in all_drive_files] + all_drive_files = [i for i in all_drive_files if i.endswith(".pdf")] + logger.info(f"All drive files: {all_drive_files}\nIndexed files: {file_names}") + currently_processing_files = set(all_drive_files) - set(file_names) + + st.markdown("\n\n", unsafe_allow_html=True) + + if currently_processing_files: + markdown_table = "| Indexing in Progress |\n| --- |\n" + + links = [get_slide_link(i) for i in currently_processing_files] + for file_name, link in zip(currently_processing_files, links): + markdown_table += f"| [{file_name}]({link}) |\n" + st.markdown(markdown_table, unsafe_allow_html=True) + else: + st.markdown("## No new files detected.") + + +category_options = st.session_state["available_categories"] + +lang_options = st.session_state["available_languages"] + +cols = cycle(st.columns(2)) + +with next(cols): + cat_options = st.multiselect( + "Filtered Categories", + category_options or [], + [], + key="cat_selection", + label_visibility="hidden", + placeholder="Filtered Categories", + ) + +with next(cols): + language_options = st.multiselect( + "Languages", + lang_options or [], + [], + key="lang_selection", + label_visibility="hidden", + placeholder="Filtered Languages", + ) + +with st.sidebar: + logger.info("All category options: %s", category_options) + + # an empty multiselect means "no filter", so fall back to all available options + selected_categories = category_options if len(cat_options) == 0 else cat_options + + logger.info("Selected categories: %s", selected_categories) + + selected_languages = ( + lang_options if len(language_options) == 0 else language_options + ) + + st.session_state.category_filter = selected_categories + st.session_state.language_filter = selected_languages + + +def get_category_filter(category: list) -> str: + return f"contains({str(category)}, category)" + + +# TODO: merge these +def get_language_filter(lang: list) -> str: + return f"contains({str(lang)}, language)" + + +def combine_filters(*args: str | None) -> str: + """Construct a single jmespath filter by joining the given filters with `&&`.""" + return " && ".join([arg for arg in args if arg is not None]) + + +def get_ext_img_with_href(url, target_url, *args) -> str: + width: int = 600 + margin = 20 + + def get_img_html(dc): + # render a single slide image; `dc` holds the image URL + return f""" +
 <img src="{dc['url']}" width="{width}"/> +
""" # TODO: add href + + slider_images = "\n".join([get_img_html(dc) for dc in args]) + + html_code = f""" + + + + +
+ +
+ + {slider_images} + +
+
+ """ # noqa: E501 + return html_code + + +st.markdown(css, unsafe_allow_html=True) + + +def log_rate_answer(event, idx, kwargs): + logger.info({"_type": "rate_event", "rating": event, "rank": idx, **kwargs}) + + +if question: + select_cat = st.session_state.category_filter + select_lang = st.session_state.language_filter + + filter_ls = [get_category_filter(select_cat), get_language_filter(select_lang)] + + combined_query_filter = combine_filters(*filter_ls) + + logger.info( + { + "_type": "search_request_event", + "filter": combined_query_filter, + "query": question, + } + ) + + response = conn.pw_ai_answer(question, filters=combined_query_filter) + + logger.info( + { + "_type": "search_response_event", + "filter": combined_query_filter, + "query": question, + "response": type(response), + } + ) + + if response: + logger.info(type(response[0])) + + text_responses = [r["text"] for r in response] + + image_metadatas = [r["metadata"] for r in response] + + for m in image_metadatas: + logger.info("Retrieved metadatas: %s || %s", m["language"], m["category"]) + + st.markdown(f"**Searched for:** {question}") + + for idx, cur_metadata in enumerate(image_metadatas): + file_name = cur_metadata["path"].split("/")[-1] + + select_page = cur_metadata["image_page"] + 1 + + adjacent_urls = get_adjacent_image_urls(cur_metadata) + + args = [{"url": i} for i in adjacent_urls] + + image_html = get_ext_img_with_href( + get_image_serve_url(cur_metadata), + get_slide_link(file_name, select_page), + *args, + ) + + image_url = get_slide_link(file_name, select_page) + + slide_id = cur_metadata["slide_id"] + + st.markdown(f"Page `{select_page}` of [`{file_name}`]({image_url})") + + st.markdown(image_html, unsafe_allow_html=True) + + log_args = ( + idx, + { + "slide_id": slide_id, + "filter": combined_query_filter, + "query": question, + "file_name": file_name, + "selected_cat": select_cat, + "selected_lang": select_lang, + }, + ) + col1, col2, col3 = st.columns([12, 1, 1]) + with col2: + st.button( + "👍", + on_click=log_rate_answer, + type="primary", + key=slide_id + "_up", + args=("like", *log_args), + ) + with col3: + st.button( + "👎", + on_click=log_rate_answer, + type="secondary", + key=slide_id + "_down", + args=("dislike", *log_args), + ) + + else: + st.markdown( + f"""No results were found for search query: `{question}` + and filter criteria: `{combined_query_filter}`""" + ) diff --git a/examples/pipelines/slides_search/utils.py b/examples/pipelines/slides_search/utils.py new file mode 100644 index 0000000..4407242 --- /dev/null +++ b/examples/pipelines/slides_search/utils.py @@ -0,0 +1,39 @@ +import typing +from pathlib import Path + +import yaml +from pydantic import BaseModel, Field, create_model + +CUSTOM_FIELDS = {"option": typing.Literal} + + +def get_model_from_file(file_path: str | Path) -> type[BaseModel]: + """ + Return Pydantic schema from a YAML file. + + Replaces types of `CUSTOM_FIELDS` with the definitions, other types are evaluated + as primitive Python type. + + Args: + - file_path: Path of the YAML file. 
+ """ + with open(file_path, "r") as file: + schema = yaml.safe_load(file) + + fields: dict[str, typing.Any] = {} + for field_name, field_info in schema["fields"].items(): + f_type_raw = field_info.pop("type") + f_type = CUSTOM_FIELDS.get(f_type_raw) + + if f_type is None: # not custom definition, can be evaluated as primitive type + field_type = eval(f_type_raw) + else: + field_type = f_type + + if field_type == typing.Literal: + field_type = typing.Literal[tuple(field_info["values"])] + + fields[field_name] = (field_type, Field(**field_info)) + + PydanticSchema = create_model("ParsePydanticSchema", **fields) + return PydanticSchema