diff --git a/backend/bin/docker-entrypoint.sh b/backend/bin/docker-entrypoint.sh new file mode 100755 index 0000000..8708402 --- /dev/null +++ b/backend/bin/docker-entrypoint.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +echo "Running docker-entrypoint initialization script" + +# run migrations on sqlite database +alembic upgrade head + +# create, if needed, the admin user +voilib-management --create-admin + +# run the CMD passed as command-line arguments +exec "$@" diff --git a/backend/dockerfile b/backend/dockerfile index 2757cd7..2d5940b 100644 --- a/backend/dockerfile +++ b/backend/dockerfile @@ -24,3 +24,7 @@ COPY . . # allow installing development dependencies to run tests ARG INSTALL_DEV=false RUN bash -c "if [ $INSTALL_DEV == 'true' ] ; then pip install -e .[dev] ; else pip install . ; fi" + +RUN chmod +x "/backend/bin/docker-entrypoint.sh" +ENTRYPOINT ["/backend/bin/docker-entrypoint.sh"] +CMD ["uvicorn", "src.voilib.main:app", "--proxy-headers", "--host", "0.0.0.0", "--port", "80"] diff --git a/backend/requirements.txt b/backend/requirements.txt index 4f4bed7..c548eee 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,9 +1,13 @@ aiosqlite==0.19.0 alembic==1.12.0 +altair==5.1.1 anyio==4.0.0 async-timeout==4.0.3 +attrs==23.1.0 av==10.0.0 bcrypt==4.0.1 +blinker==1.6.2 +cachetools==5.3.1 certifi==2023.7.22 cffi==1.15.1 charset-normalizer==3.2.0 @@ -23,6 +27,8 @@ faster-whisper==0.7.1 filelock==3.12.3 flatbuffers==23.5.26 fsspec==2023.9.0 +gitdb==4.0.10 +GitPython==3.1.34 greenlet==2.0.2 grpcio==1.57.0 grpcio-tools==1.57.0 @@ -36,12 +42,17 @@ huggingface-hub==0.16.4 humanfriendly==10.0 hyperframe==6.0.1 idna==3.4 +importlib-metadata==6.8.0 itsdangerous==2.1.2 Jinja2==3.1.2 joblib==1.3.2 +jsonschema==4.19.0 +jsonschema-specifications==2023.7.1 lit==16.0.6 Mako==1.2.4 +markdown-it-py==3.0.0 MarkupSafe==2.1.3 +mdurl==0.1.2 mpmath==1.3.0 networkx==3.1 nltk==3.8.1 @@ -61,22 +72,32 @@ onnxruntime==1.15.1 orjson==3.9.5 ormar==0.12.2 
packaging==23.1 +pandas==2.1.0 passlib==1.7.4 -Pillow==10.0.0 +Pillow==9.5.0 portalocker==2.7.0 protobuf==4.24.2 +pyarrow==13.0.0 pyasn1==0.5.0 pycparser==2.21 pydantic==1.10.8 +pydeck==0.8.0 +Pygments==2.16.1 +Pympler==1.0.1 python-dateutil==2.8.2 python-dotenv==1.0.0 python-jose==3.3.0 python-multipart==0.0.6 +pytz==2023.3 +pytz-deprecation-shim==0.1.0.post0 PyYAML==6.0.1 qdrant-client==1.4.0 redis==5.0.0 +referencing==0.30.2 regex==2023.8.8 requests==2.31.0 +rich==13.5.2 +rpds-py==0.10.0 rq==1.15.1 rsa==4.9 safetensors==0.3.3 @@ -85,22 +106,33 @@ scipy==1.11.2 sentence-transformers==2.2.2 sentencepiece==0.1.99 six==1.16.0 +smmap==5.0.0 sniffio==1.3.0 SQLAlchemy==1.4.41 starlette==0.27.0 +streamlit==1.26.0 sympy==1.12 +tenacity==8.2.3 threadpoolctl==3.2.0 tokenizers==0.13.3 +toml==0.10.2 +toolz==0.12.0 torch==2.0.1 torchvision==0.15.2 +tornado==6.3.3 tqdm==4.66.1 transformers==4.32.1 triton==2.0.0 typing_extensions==4.7.1 +tzdata==2023.3 +tzlocal==4.3.1 ujson==5.8.0 urllib3==1.26.16 uvicorn==0.23.2 uvloop==0.17.0 +validators==0.22.0 +watchdog==3.0.0 watchfiles==0.20.0 websockets==11.0.3 xmltodict==0.13.0 +zipp==3.16.2 diff --git a/backend/setup.cfg b/backend/setup.cfg index 378c297..bf716ca 100644 --- a/backend/setup.cfg +++ b/backend/setup.cfg @@ -30,6 +30,7 @@ install_requires = sentence-transformers>=2.2.2,<3 qdrant-client>=1.1.6,<2 faster-whisper>=0.7.1,<1 + streamlit>=1.26.0,<2 [options.packages.find] where=src diff --git a/backend/src/voilib/management/__init__.py b/backend/src/voilib/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git "a/backend/src/voilib/management/pages/1_\360\237\224\221-Login.py" "b/backend/src/voilib/management/pages/1_\360\237\224\221-Login.py" new file mode 100644 index 0000000..c4a81e3 --- /dev/null +++ "b/backend/src/voilib/management/pages/1_\360\237\224\221-Login.py" @@ -0,0 +1,60 @@ +# Copyright (c) 2022-2023 Pablo González Carrizo (unmonoqueteclea) +# All rights reserved. 
+ +import asyncio +import typing +from datetime import timedelta + +import streamlit as st + +from voilib import auth + +st.set_page_config(page_title="Voilib", page_icon="🎧") +st.title("🔑 Login") + +SHOW_LOGIN_FORM = True +USERNAME_KEY = "logged_user_username" +TOKEN_KEY = "logged_user_token" + + +async def _login(username: str, password: str) -> typing.Optional[str]: + user = await auth.authenticate_user(username, password) + if user: + return auth.create_access_token( + data={"sub": user.username}, # type: ignore + expires_delta=timedelta(minutes=auth.ACCESS_TOKEN_EXPIRE_MINUTES), + ) + + +async def main(): + global SHOW_LOGIN_FORM + if USERNAME_KEY in st.session_state and TOKEN_KEY in st.session_state: + SHOW_LOGIN_FORM = False + if SHOW_LOGIN_FORM: + with st.form(key="login"): + username = st.text_input("Username") + password = st.text_input("Password", type="password") + clicked = st.form_submit_button("Login", use_container_width=True) + if clicked: + token = await _login(username, password) + if token: + st.session_state[USERNAME_KEY] = username + st.session_state[TOKEN_KEY] = token + SHOW_LOGIN_FORM = False + st.experimental_rerun() + else: + st.error("Invalid credentials. 
Please, try again") + else: + username = st.session_state[USERNAME_KEY] + st.info(f"""👤 Already logged in as **{username}**""") + logout = st.button("Logout", use_container_width=True) + if logout: + del st.session_state[USERNAME_KEY] + del st.session_state[TOKEN_KEY] + SHOW_LOGIN_FORM = True + st.experimental_rerun() + + +if __name__ == "__main__": + loop = asyncio.new_event_loop() + loop.run_until_complete(main()) diff --git "a/backend/src/voilib/management/pages/2_\360\237\223\210-Stats.py" "b/backend/src/voilib/management/pages/2_\360\237\223\210-Stats.py" new file mode 100644 index 0000000..3c496d4 --- /dev/null +++ "b/backend/src/voilib/management/pages/2_\360\237\223\210-Stats.py" @@ -0,0 +1,47 @@ +# Copyright (c) 2022-2023 Pablo González Carrizo (unmonoqueteclea) +# All rights reserved. + +import asyncio + +import pandas as pd +import streamlit as st + +from voilib.management import utils +from voilib.models import analytics + + +async def main(): + st.set_page_config(page_title="Voilib", page_icon="🎧") + st.title("📈 Stats") + authenticated = utils.login_message(st.session_state) + + if authenticated: + tab_last, tab_graphs = st.tabs(["Last queries", "Queries per day"]) + with tab_last: + st.write("Last 20 queries performed by Voilib users") + qs = await analytics.Query.objects.order_by("-created_at").limit(20).all() + markdown_queries = "" + for query in qs: + date = query.created_at.strftime("%Y-%m-%d, %H:%M:%S") # type: ignore + markdown_queries += f"\n - `{date}` {query.text}" + if len(qs) == 0: + st.write("⚠️ No queries yet!") + st.markdown(markdown_queries) + with tab_graphs: + qs = await analytics.Query.objects.order_by("-created_at").values( + fields=["created_at", "text"] + ) + df = pd.DataFrame(qs) + if df.shape[0] == 0: + st.write("⚠️ No queries yet!") + else: + df["created_at"] = pd.to_datetime(df["created_at"]).dt.date + st.bar_chart(data=df.created_at.value_counts()) + refresh = st.button("Refresh", use_container_width=True) + if refresh: + 
st.experimental_rerun() + + +if __name__ == "__main__": + loop = asyncio.new_event_loop() + loop.run_until_complete(main()) diff --git "a/backend/src/voilib/management/pages/3_\360\237\224\210-Media.py" "b/backend/src/voilib/management/pages/3_\360\237\224\210-Media.py" new file mode 100644 index 0000000..c81bde7 --- /dev/null +++ "b/backend/src/voilib/management/pages/3_\360\237\224\210-Media.py" @@ -0,0 +1,72 @@ +# Copyright (c) 2022-2023 Pablo González Carrizo (unmonoqueteclea) +# All rights reserved. + +import asyncio + +import streamlit as st +from voilib import collection, models, routers, settings +from voilib.management import utils as m_utils + + +async def add_channel(): + st.header("Add new podcast") + with st.form("my_form"): + st.markdown( + """Write below the RSS feed url from a podcast and click `ADD` + to include it in the database. """ + ) + st.markdown( + """After adding a new channel, you should + + """ + ) + channel_url = st.text_input("Channel RSS feed url") + add_click = st.form_submit_button("Add channel", use_container_width=True) + if add_click: + with st.spinner("⌛ Adding new channel... Please, wait."): + _, ch = await collection.get_or_create_channel(channel_url) + settings.queue.enqueue( + collection.update_channel, ch, job_timeout="600m" + ) + st.success( + f"""Channel "{ch.title}" correctly added to the + database. Its episodes are being updated in a + background task. This process can take a few minutes.""" + ) + + +async def podcasts_and_episodes(): + st.header("Podcasts and episodes") + + col1, col2, col3 = st.columns(3) + col1.metric("Channels", await models.Channel.objects.count()) + col2.metric( + "Transcribed episodes", + await models.Episode.objects.filter(transcribed=True).count(), + ) + col3.metric( + "Indexed episodes", + await models.Episode.objects.filter(embeddings=True).count(), + ) + with st.spinner("⌛ Loading channels..."): + for ch in (await routers.analytics._media()).channels: + with st.expander( + f"**{ch.title}**. 
Indexed {ch.available_episodes}/{ch.total_episodes}" + ): + st.image(ch.image) + st.markdown(ch.description) + + +async def main(): + st.set_page_config(page_title="Voilib", page_icon="🎧") + st.title("📻 Media") + authenticated = m_utils.login_message(st.session_state) + if authenticated: + await add_channel() + st.divider() + await podcasts_and_episodes() + + +if __name__ == "__main__": + loop = asyncio.new_event_loop() + loop.run_until_complete(main()) diff --git "a/backend/src/voilib/management/pages/4_\342\232\231\357\270\217-Tasks.py" "b/backend/src/voilib/management/pages/4_\342\232\231\357\270\217-Tasks.py" new file mode 100644 index 0000000..35d16e8 --- /dev/null +++ "b/backend/src/voilib/management/pages/4_\342\232\231\357\270\217-Tasks.py" @@ -0,0 +1,105 @@ +# Copyright (c) 2022-2023 Pablo González Carrizo (unmonoqueteclea) +# All rights reserved. + +import asyncio +import datetime + +import streamlit as st +from voilib import collection, settings, tasks, utils +from voilib.management import utils as m_utils + + +async def load_default_channels(): + st.header("1. Load default channel list (only once)") + st.markdown( + """Voilib comes with a predefined list of podcasts `RSS` + feeds. If you import them, the system will transcribe and + index them. This is, usually, the first task that is performed + in a new installation. Alternatively, you can also provide + your own urls for `RSS` feeds. **You should run this task only + once** """ + ) + st.info("This action may take up to 4 minutes.") + with st.expander("Show the list of channels"): + lines = [f"- {item['name']}" for item in collection.default_channels()] + st.markdown("\n".join(lines)) + import_default = st.button("⚙️ Import default channels", use_container_width=True) + if import_default: + with st.spinner("⌛ Loading default channels... Please, wait."): + await collection.add_default_channels() + st.success("Default list of channels correctly added") + + +async def update_channels(): + st.header("2. 
Update channel episodes") + st.markdown( + """After loading channels to the system, you will need to + **update the list of episodes of each one**. This task crawls the + list of imported feeds to find new episodes (that will be + transcribed and indexed when requested). """ + ) + if last_execution := utils.get_event("event_update_start"): + last_execution_time = float(last_execution["time"]) + date = datetime.datetime.fromtimestamp(last_execution_time).strftime("%c") + st.markdown(f"**Last execution**: `{date}`") + update_channels = st.button("⚙️ Update channels", use_container_width=True) + if update_channels: + settings.queue.enqueue(tasks.update_channels) + st.success("Channels started to update in the background") + + +async def transcribe_pending(): + st.header("3. Transcribe episodes") + st.markdown( + """Trigger the **transcription process** that will take + episodes from the last `number of days` and transcribe them in + random order. When transcriptions finish, the episodes won't + be ready yet for queries, you should **index** them first (see + next tasks). """ + ) + if last_execution := utils.get_event("event_transcription_start"): + last_execution_time = float(last_execution["time"]) + date = datetime.datetime.fromtimestamp(last_execution_time).strftime("%c") + st.markdown(f"**Last execution**: `{date}`: {last_execution['info']}") + + days = st.number_input("Number of days", min_value=1, step=1) + start = st.button("🎧 Start transcription process", use_container_width=True) + if start: + total = await tasks.transcribe_episodes(days) # type: ignore + st.success(f"Started transcription of {total} episodes in a background process") + + +async def store_pending(): + st.header("4. 
Index episodes") + st.markdown( + """Trigger the process that will index all finished + transcriptions so that users can query them """ + ) + if last_execution := utils.get_event("event_store_start"): + last_execution_time = float(last_execution["time"]) + date = datetime.datetime.fromtimestamp(last_execution_time).strftime("%c") + st.markdown(f"**Last execution**: `{date}`: {last_execution['info']}") + + start = st.button("💾 Start indexing process", use_container_width=True) + if start: + settings.queue.enqueue(tasks.store_episodes_embeddings) + st.success("Started indexing in a background process") + + +async def main(): + st.set_page_config(page_title="Voilib", page_icon="🎧") + st.title("⚙️ Tasks") + authenticated = m_utils.login_message(st.session_state) + if authenticated: + await load_default_channels() + st.divider() + await update_channels() + st.divider() + await transcribe_pending() + st.divider() + await store_pending() + + +if __name__ == "__main__": + loop = asyncio.new_event_loop() + loop.run_until_complete(main()) diff --git a/backend/src/voilib/management/pages/__init__.py b/backend/src/voilib/management/pages/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/src/voilib/management/utils.py b/backend/src/voilib/management/utils.py new file mode 100644 index 0000000..fb37a2f --- /dev/null +++ b/backend/src/voilib/management/utils.py @@ -0,0 +1,17 @@ +# Copyright (c) 2022-2023 Pablo González Carrizo (unmonoqueteclea) +# All rights reserved. 
+ +import streamlit as st + +USERNAME_KEY = "logged_user_username" +TOKEN_KEY = "logged_user_token" + + +def login_message(session_state) -> bool: + username = session_state.get(USERNAME_KEY) + token = session_state.get(TOKEN_KEY) + if not username or not token: + st.error("👤 Unauthenticated user, please login first.") + return False + st.info(f"👋 Hello, {username}") + return True diff --git "a/backend/src/voilib/management/\360\237\217\240-Home.py" "b/backend/src/voilib/management/\360\237\217\240-Home.py" new file mode 100644 index 0000000..0508119 --- /dev/null +++ "b/backend/src/voilib/management/\360\237\217\240-Home.py" @@ -0,0 +1,26 @@ +# Copyright (c) 2022-2023 Pablo González Carrizo (unmonoqueteclea) +# All rights reserved. + +import streamlit as st + +from voilib import __version__ +from voilib.management import utils + +st.set_page_config(page_title="Voilib", page_icon="🎧") +st.title("🎧 Voilib Management Dashboard") +authenticated = utils.login_message(st.session_state) + +st.markdown( + f"""**Management tools for Voilib deployments.** + +- Voilib version: `{__version__}` + +Select one menu option from the sidebar. You will need an **admin** user +to retrieve the info. 
+ +![GitHub](https://img.shields.io/badge/github-%23121011.svg?style=for-the-badge&logo=github&logoColor=white) + You will find more info in the [official repository](https://github.com/unmonoqueteclea/voilib) + + +""" +) diff --git a/backend/src/voilib/routers/analytics.py b/backend/src/voilib/routers/analytics.py index 4ac393a..430bd5b 100644 --- a/backend/src/voilib/routers/analytics.py +++ b/backend/src/voilib/routers/analytics.py @@ -6,7 +6,6 @@ from fastapi import APIRouter, Depends from fastapi_pagination import Page from fastapi_pagination.ext.ormar import paginate - from voilib import auth from voilib.models import analytics, media, users from voilib.schemas import analytics as analytics_schemas @@ -40,6 +39,7 @@ async def _media(): channels.append( analytics_schemas.ChannelAnalytics( title=channel.title, + description=channel.description, # type: ignore image=channel.image, url=channel.url, total_episodes=await eps.count(), diff --git a/backend/src/voilib/schemas/analytics.py b/backend/src/voilib/schemas/analytics.py index 23de2ff..876c8a7 100644 --- a/backend/src/voilib/schemas/analytics.py +++ b/backend/src/voilib/schemas/analytics.py @@ -5,7 +5,6 @@ """ from pydantic import BaseModel - from voilib.models.analytics import Query QueryOut = Query.get_pydantic(exclude={"pk"}) @@ -15,6 +14,7 @@ class ChannelAnalytics(BaseModel): """Schema to show some analytics about a channel.""" title: str + description: str total_episodes: int image: str url: str diff --git a/backend/src/voilib/settings.py b/backend/src/voilib/settings.py index a695689..9a5fd5c 100644 --- a/backend/src/voilib/settings.py +++ b/backend/src/voilib/settings.py @@ -63,7 +63,9 @@ def qdrant_use_file(self) -> bool: @property def redis_cache(self) -> redis.Redis: - return redis.Redis(host=self.redis_host, db=REDIS_CACHE_DB_NUMBER) + return redis.Redis( + host=self.redis_host, db=REDIS_CACHE_DB_NUMBER, decode_responses=True + ) def create_queue(settings: Settings) -> Queue: diff --git 
a/backend/src/voilib/tasks.py b/backend/src/voilib/tasks.py index 85c943f..f3db900 100644 --- a/backend/src/voilib/tasks.py +++ b/backend/src/voilib/tasks.py @@ -23,6 +23,7 @@ async def update_channels() -> int: """ logger.info("updating all channels") + utils.log_event("event_update_start", "") total = 0 for ch in await models.Channel.objects.all(): ch_info = f"channel {ch.id}-{ch.title}" @@ -33,6 +34,7 @@ async def update_channels() -> int: total += added except Exception: logger.error(f"error while reading channel {ch_info}", exc_info=True) + utils.log_event("event_update_end", "") logger.info(f"finished channels update after creating {total}") return total diff --git a/backend/src/voilib/utils.py b/backend/src/voilib/utils.py index 86164fc..decf319 100644 --- a/backend/src/voilib/utils.py +++ b/backend/src/voilib/utils.py @@ -7,6 +7,7 @@ small as possible. """ +import typing import re import time import unicodedata @@ -25,3 +26,14 @@ def slugify(value: str) -> str: def log_event(key: str, info: str) -> None: redis = settings.settings.redis_cache redis.set(key, f"{time.time()}|{info}") + + +def get_event(key: str) -> typing.Optional[dict]: + """Return the event stored under `key` as {"time": str, "info": str}, or None if unset.""" + redis = settings.settings.redis_cache + event = redis.get(key) + if event: + # log_event stores "<timestamp>|<info>"; split on the first "|" only so info may contain "|" + timestamp, _, info = event.partition("|") + return {"time": timestamp, "info": info} + return None diff --git a/infra/development/compose.yml b/infra/development/compose.yml index 4b57697..596740b 100644 --- a/infra/development/compose.yml +++ b/infra/development/compose.yml @@ -94,6 +94,23 @@ services: - ../../backend/:/backend/ - ../../data/:/data/ + management: + <<: *defaults + image: voilib-worker:latest + env_file: + - .env.dev + build: + context: ../../backend + dockerfile: dockerfile + args: + INSTALL_DEV: true + ports: + - 8501:8501 + command: streamlit run src/voilib/management/🏠-Home.py + volumes: + - ../../backend/:/backend/ + - ../../data/:/data/ + frontend: <<: *defaults image: voilib-ui:latest